LU-15913 mdt: disable parallel rename for striped dirs

[fs/lustre-release.git] / lustre / mdt / mdt_reint.c
diff --git a/lustre/mdt/mdt_reint.c b/lustre/mdt/mdt_reint.c

index 452a006..a60f1d1 100644 (file)
--- a/lustre/mdt/mdt_reint.c
+++ b/lustre/mdt/mdt_reint.c
@@ -27,7 +27,6 @@
   */
  /*
   * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
   *
   * lustre/mdt/mdt_reint.c
   *
@@ -45,12 +44,13 @@
  #include <lprocfs_status.h>
  #include "mdt_internal.h"
  #include <lustre_lmv.h>
+#include <lustre_crypto.h>
  
  static inline void mdt_reint_init_ma(struct mdt_thread_info *info,
-                                     struct md_attr *ma)
+                                    struct md_attr *ma)
  {
         ma->ma_need = MA_INODE;
-        ma->ma_valid = 0;
+       ma->ma_valid = 0;
  }
  
  /**
@@ -59,7 +59,7 @@ static inline void mdt_reint_init_ma(struct mdt_thread_info *info,
   * Return real version or ENOENT_VERSION if object doesn't exist
   */
  static void mdt_obj_version_get(struct mdt_thread_info *info,
-                                struct mdt_object *o, __u64 *version)
+                               struct mdt_object *o, __u64 *version)
  {
         LASSERT(o);
  
@@ -78,20 +78,20 @@ static void mdt_obj_version_get(struct mdt_thread_info *info,
   * Should be called only during replay.
   */
  static int mdt_version_check(struct ptlrpc_request *req,
-                             __u64 version, int idx)
+                            __u64 version, int idx)
  {
-        __u64 *pre_ver = lustre_msg_get_versions(req->rq_reqmsg);
-        ENTRY;
-
-        if (!exp_connect_vbr(req->rq_export))
-                RETURN(0);
-
-        LASSERT(req_is_replay(req));
-        /** VBR: version is checked always because costs nothing */
-        LASSERT(idx < PTLRPC_NUM_VERSIONS);
-        /** Sanity check for malformed buffers */
-        if (pre_ver == NULL) {
-                CERROR("No versions in request buffer\n");
+       __u64 *pre_ver = lustre_msg_get_versions(req->rq_reqmsg);
+
+       ENTRY;
+       if (!exp_connect_vbr(req->rq_export))
+               RETURN(0);
+
+       LASSERT(req_is_replay(req));
+       /** VBR: version is checked always because costs nothing */
+       LASSERT(idx < PTLRPC_NUM_VERSIONS);
+       /** Sanity check for malformed buffers */
+       if (pre_ver == NULL) {
+               CERROR("No versions in request buffer\n");
                 spin_lock(&req->rq_export->exp_lock);
                 req->rq_export->exp_vbr_failed = 1;
                 spin_unlock(&req->rq_export->exp_lock);
@@ -111,18 +111,18 @@ static int mdt_version_check(struct ptlrpc_request *req,
   * Save pre-versions in reply.
   */
  static void mdt_version_save(struct ptlrpc_request *req, __u64 version,
-                             int idx)
+                            int idx)
  {
-        __u64 *reply_ver;
+       __u64 *reply_ver;
  
-        if (!exp_connect_vbr(req->rq_export))
-                return;
+       if (!exp_connect_vbr(req->rq_export))
+               return;
  
-        LASSERT(!req_is_replay(req));
-        LASSERT(req->rq_repmsg != NULL);
-        reply_ver = lustre_msg_get_versions(req->rq_repmsg);
-        if (reply_ver)
-                reply_ver[idx] = version;
+       LASSERT(!req_is_replay(req));
+       LASSERT(req->rq_repmsg != NULL);
+       reply_ver = lustre_msg_get_versions(req->rq_repmsg);
+       if (reply_ver)
+               reply_ver[idx] = version;
  }
  
  /**
@@ -131,11 +131,11 @@ static void mdt_version_save(struct ptlrpc_request *req, __u64 version,
   */
  static void mdt_enoent_version_save(struct mdt_thread_info *info, int idx)
  {
-        /* save version of file name for replay, it must be ENOENT here */
-        if (!req_is_replay(mdt_info_req(info))) {
-                info->mti_ver[idx] = ENOENT_VERSION;
-                mdt_version_save(mdt_info_req(info), info->mti_ver[idx], idx);
-        }
+       /* save version of file name for replay, it must be ENOENT here */
+       if (!req_is_replay(mdt_info_req(info))) {
+               info->mti_ver[idx] = ENOENT_VERSION;
+               mdt_version_save(mdt_info_req(info), info->mti_ver[idx], idx);
+       }
  }
  
  /**
@@ -144,44 +144,44 @@ static void mdt_enoent_version_save(struct mdt_thread_info *info, int idx)
   * Versions are saved in reply only during normal operations not replays.
   */
  void mdt_version_get_save(struct mdt_thread_info *info,
-                          struct mdt_object *mto, int idx)
+                         struct mdt_object *mto, int idx)
  {
-        /* don't save versions during replay */
-        if (!req_is_replay(mdt_info_req(info))) {
-                mdt_obj_version_get(info, mto, &info->mti_ver[idx]);
-                mdt_version_save(mdt_info_req(info), info->mti_ver[idx], idx);
-        }
+       /* don't save versions during replay */
+       if (!req_is_replay(mdt_info_req(info))) {
+               mdt_obj_version_get(info, mto, &info->mti_ver[idx]);
+               mdt_version_save(mdt_info_req(info), info->mti_ver[idx], idx);
+       }
  }
  
  /**
   * Get version from disk and check it, no save in reply.
   */
  int mdt_version_get_check(struct mdt_thread_info *info,
-                          struct mdt_object *mto, int idx)
+                         struct mdt_object *mto, int idx)
  {
-        /* only check versions during replay */
-        if (!req_is_replay(mdt_info_req(info)))
-                return 0;
+       /* only check versions during replay */
+       if (!req_is_replay(mdt_info_req(info)))
+               return 0;
  
-        mdt_obj_version_get(info, mto, &info->mti_ver[idx]);
-        return mdt_version_check(mdt_info_req(info), info->mti_ver[idx], idx);
+       mdt_obj_version_get(info, mto, &info->mti_ver[idx]);
+       return mdt_version_check(mdt_info_req(info), info->mti_ver[idx], idx);
  }
  
  /**
   * Get version from disk and check if recovery or just save.
   */
  int mdt_version_get_check_save(struct mdt_thread_info *info,
-                               struct mdt_object *mto, int idx)
+                              struct mdt_object *mto, int idx)
  {
-        int rc = 0;
-
-        mdt_obj_version_get(info, mto, &info->mti_ver[idx]);
-        if (req_is_replay(mdt_info_req(info)))
-                rc = mdt_version_check(mdt_info_req(info), info->mti_ver[idx],
-                                       idx);
-        else
-                mdt_version_save(mdt_info_req(info), info->mti_ver[idx], idx);
-        return rc;
+       int rc = 0;
+
+       mdt_obj_version_get(info, mto, &info->mti_ver[idx]);
+       if (req_is_replay(mdt_info_req(info)))
+               rc = mdt_version_check(mdt_info_req(info), info->mti_ver[idx],
+                                      idx);
+       else
+               mdt_version_save(mdt_info_req(info), info->mti_ver[idx], idx);
+       return rc;
  }
  
  /**
@@ -190,30 +190,31 @@ int mdt_version_get_check_save(struct mdt_thread_info *info,
   * This checks version of 'name'. Many reint functions uses 'name' for child not
   * FID, therefore we need to get object by name and check its version.
   */
-static int mdt_lookup_version_check(struct mdt_thread_info *info,
-                                   struct mdt_object *p,
-                                   const struct lu_name *lname,
-                                   struct lu_fid *fid, int idx)
+int mdt_lookup_version_check(struct mdt_thread_info *info,
+                            struct mdt_object *p,
+                            const struct lu_name *lname,
+                            struct lu_fid *fid, int idx)
  {
-        int rc, vbrc;
-
-        rc = mdo_lookup(info->mti_env, mdt_object_child(p), lname, fid,
-                        &info->mti_spec);
-        /* Check version only during replay */
-        if (!req_is_replay(mdt_info_req(info)))
-                return rc;
-
-        info->mti_ver[idx] = ENOENT_VERSION;
-        if (rc == 0) {
-                struct mdt_object *child;
-                child = mdt_object_find(info->mti_env, info->mti_mdt, fid);
-                if (likely(!IS_ERR(child))) {
-                        mdt_obj_version_get(info, child, &info->mti_ver[idx]);
-                        mdt_object_put(info->mti_env, child);
-                }
-        }
-        vbrc = mdt_version_check(mdt_info_req(info), info->mti_ver[idx], idx);
-        return vbrc ? vbrc : rc;
+       int rc, vbrc;
+
+       rc = mdo_lookup(info->mti_env, mdt_object_child(p), lname, fid,
+                       &info->mti_spec);
+       /* Check version only during replay */
+       if (!req_is_replay(mdt_info_req(info)))
+               return rc;
+
+       info->mti_ver[idx] = ENOENT_VERSION;
+       if (rc == 0) {
+               struct mdt_object *child;
+
+               child = mdt_object_find(info->mti_env, info->mti_mdt, fid);
+               if (likely(!IS_ERR(child))) {
+                       mdt_obj_version_get(info, child, &info->mti_ver[idx]);
+                       mdt_object_put(info->mti_env, child);
+               }
+       }
+       vbrc = mdt_version_check(mdt_info_req(info), info->mti_ver[idx], idx);
+       return vbrc ? vbrc : rc;
  
  }
  
@@ -291,6 +292,7 @@ static int mdt_lock_slaves(struct mdt_thread_info *mti, struct mdt_object *obj,
         einfo->ei_enq_slave = 1;
         einfo->ei_namespace = mti->mti_mdt->mdt_namespace;
         einfo->ei_inodebits = ibits;
+       einfo->ei_req_slot = 1;
         memset(policy, 0, sizeof(*policy));
         policy->l_inodebits.bits = ibits;
  
@@ -342,6 +344,149 @@ void mdt_reint_striped_unlock(struct mdt_thread_info *info,
         mdt_object_unlock(info, o, lh, decref);
  }
  
+/*
+ * Look up directory \a lname under \a parent, lock it and hand it to
+ * mdt_restripe_internal(); on success pack attributes and \a tfid in reply.
+ * Return 0 on success, negative errno on failure (e.g. -EBUSY on a race).
+ */
+static int mdt_restripe(struct mdt_thread_info *info,
+                       struct mdt_object *parent, const struct lu_name *lname,
+                       const struct lu_fid *tfid, struct md_op_spec *spec,
+                       struct md_attr *ma)
+{
+       struct mdt_device *mdt = info->mti_mdt;
+       struct lu_fid *fid = &info->mti_tmp_fid2;
+       struct ldlm_enqueue_info *einfo = &info->mti_einfo[0];
+       struct lmv_user_md *lum = spec->u.sp_ea.eadata;
+       struct lmv_mds_md_v1 *lmv;
+       struct mdt_object *child;
+       struct mdt_lock_handle *lhp, *lhc;
+       struct mdt_body *repbody;
+       int rc;
+
+       ENTRY;
+       if (!mdt->mdt_enable_dir_restripe)
+               RETURN(-EPERM);
+
+       LASSERT(lum);
+       lum->lum_hash_type |= cpu_to_le32(LMV_HASH_FLAG_FIXED);
+
+       rc = mdt_version_get_check_save(info, parent, 0);
+       if (rc)
+               RETURN(rc);
+
+       lhp = &info->mti_lh[MDT_LH_PARENT];
+       mdt_lock_pdo_init(lhp, LCK_PW, lname);
+       rc = mdt_reint_object_lock(info, parent, lhp, MDS_INODELOCK_UPDATE,
+                                  true);
+       if (rc)
+               RETURN(rc);
+
+       rc = mdt_stripe_get(info, parent, ma, XATTR_NAME_LMV);
+       if (rc)
+               GOTO(unlock_parent, rc);
+
+       if (ma->ma_valid & MA_LMV) {
+               /* don't allow restripe if parent dir layout is changing */
+               lmv = &ma->ma_lmv->lmv_md_v1;
+               if (!lmv_is_sane2(lmv))
+                       GOTO(unlock_parent, rc = -EBADF);
+
+               if (lmv_is_layout_changing(lmv))
+                       GOTO(unlock_parent, rc = -EBUSY);
+       }
+
+       fid_zero(fid);
+       rc = mdt_lookup_version_check(info, parent, lname, fid, 1);
+       if (rc)
+               GOTO(unlock_parent, rc);
+
+       child = mdt_object_find(info->mti_env, mdt, fid);
+       if (IS_ERR(child))
+               GOTO(unlock_parent, rc = PTR_ERR(child));
+
+       if (!mdt_object_exists(child))
+               GOTO(out_child, rc = -ENOENT);
+
+       if (mdt_object_remote(child)) {
+               repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
+               if (!repbody)
+                       GOTO(out_child, rc = -EPROTO);
+
+               repbody->mbo_fid1 = *fid;
+               repbody->mbo_valid |= (OBD_MD_FLID | OBD_MD_MDS);
+               GOTO(out_child, rc = -EREMOTE);
+       }
+
+       if (!S_ISDIR(lu_object_attr(&child->mot_obj)))
+               GOTO(out_child, rc = -ENOTDIR);
+
+       rc = mdt_stripe_get(info, child, ma, XATTR_NAME_LMV);
+       if (rc)
+               GOTO(out_child, rc);
+
+       /* race with migrate? */
+       if ((ma->ma_valid & MA_LMV) &&
+            lmv_is_migrating(&ma->ma_lmv->lmv_md_v1))
+               GOTO(out_child, rc = -EBUSY);
+
+       /* lock object */
+       lhc = &info->mti_lh[MDT_LH_CHILD];
+       mdt_lock_reg_init(lhc, LCK_EX);
+
+       /* enqueue object remote LOOKUP lock */
+       if (mdt_object_remote(parent)) {
+               rc = mdt_remote_object_lock(info, parent, fid,
+                                           &lhc->mlh_rreg_lh,
+                                           lhc->mlh_rreg_mode,
+                                           MDS_INODELOCK_LOOKUP, false);
+               if (rc != ELDLM_OK)
+                       GOTO(out_child, rc);
+       }
+
+       rc = mdt_reint_striped_lock(info, child, lhc, MDS_INODELOCK_FULL, einfo,
+                                   true);
+       if (rc)
+               GOTO(unlock_child, rc);
+
+       tgt_vbr_obj_set(info->mti_env, mdt_obj2dt(child));
+       rc = mdt_version_get_check_save(info, child, 1);
+       if (rc)
+               GOTO(unlock_child, rc);
+
+       spin_lock(&mdt->mdt_restriper.mdr_lock);
+       if (child->mot_restriping) {
+               /* race? */
+               spin_unlock(&mdt->mdt_restriper.mdr_lock);
+               GOTO(unlock_child, rc = -EBUSY);
+       }
+       child->mot_restriping = 1;
+       spin_unlock(&mdt->mdt_restriper.mdr_lock);
+
+       *fid = *tfid;
+       rc = mdt_restripe_internal(info, parent, child, lname, fid, spec, ma);
+       if (rc)
+               GOTO(restriping_clear, rc);
+
+       repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
+       if (!repbody)
+               GOTO(restriping_clear, rc = -EPROTO);
+
+       mdt_pack_attr2body(info, repbody, &ma->ma_attr, fid);
+       EXIT;
+
+restriping_clear:
+       child->mot_restriping = 0;
+unlock_child:
+       mdt_reint_striped_unlock(info, child, lhc, einfo, rc);
+out_child:
+       mdt_object_put(info->mti_env, child);
+unlock_parent:
+       mdt_object_unlock(info, parent, lhp, rc);
+
+       return rc;
+}
+
  /*
   * VBR: we save three versions in reply:
   * 0 - parent. Check that parent version is the same during replay.
@@ -352,19 +497,20 @@ void mdt_reint_striped_unlock(struct mdt_thread_info *info,
   */
  static int mdt_create(struct mdt_thread_info *info)
  {
-       struct mdt_device       *mdt = info->mti_mdt;
-       struct mdt_object       *parent;
-       struct mdt_object       *child;
-       struct mdt_lock_handle  *lh;
-       struct mdt_body         *repbody;
-       struct md_attr          *ma = &info->mti_attr;
+       struct mdt_device *mdt = info->mti_mdt;
+       struct mdt_object *parent;
+       struct mdt_object *child;
+       struct mdt_lock_handle *lh;
+       struct mdt_body *repbody;
+       struct md_attr *ma = &info->mti_attr;
         struct mdt_reint_record *rr = &info->mti_rr;
-       struct md_op_spec       *spec = &info->mti_spec;
+       struct md_op_spec *spec = &info->mti_spec;
+       bool restripe = false;
         int rc;
-       ENTRY;
  
-       DEBUG_REQ(D_INODE, mdt_info_req(info), "Create  ("DNAME"->"DFID") "
-                 "in "DFID,
+       ENTRY;
+       DEBUG_REQ(D_INODE, mdt_info_req(info),
+                 "Create ("DNAME"->"DFID") in "DFID,
                   PNAME(&rr->rr_name), PFID(rr->rr_fid2), PFID(rr->rr_fid1));
  
         if (!fid_is_md_operative(rr->rr_fid1))
@@ -377,7 +523,8 @@ static int mdt_create(struct mdt_thread_info *info)
                 struct obd_export *exp = mdt_info_req(info)->rq_export;
  
                 /* Only new clients can create remote dir( >= 2.4) and
-                * striped dir(>= 2.6), old client will return -ENOTSUPP */
+                * striped dir(>= 2.6), old client will return -ENOTSUPP
+                */
                 if (!mdt_is_dne_client(exp))
                         RETURN(-ENOTSUPP);
  
@@ -391,10 +538,23 @@ static int mdt_create(struct mdt_thread_info *info)
                         RETURN(-EPERM);
                 }
  
-               if (!md_capable(uc, CFS_CAP_SYS_ADMIN) &&
+               if ((!(exp_connect_flags2(exp) & OBD_CONNECT2_CRUSH)) &&
+                   (le32_to_cpu(lum->lum_hash_type) & LMV_HASH_TYPE_MASK) >=
+                   LMV_HASH_TYPE_CRUSH)
+                       RETURN(-EPROTO);
+
+               if (!cap_raised(uc->uc_cap, CAP_SYS_ADMIN) &&
                     uc->uc_gid != mdt->mdt_enable_remote_dir_gid &&
                     mdt->mdt_enable_remote_dir_gid != -1)
                         RETURN(-EPERM);
+
+               /* restripe if later found dir exists, MDS_OPEN_CREAT means
+                * this is create only, don't try restripe.
+                */
+               if (mdt->mdt_enable_dir_restripe &&
+                   le32_to_cpu(lum->lum_stripe_offset) == LMV_OFFSET_DEFAULT &&
+                   !(spec->sp_cr_flags & MDS_OPEN_CREAT))
+                       restripe = true;
         }
  
         repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
@@ -406,34 +566,45 @@ static int mdt_create(struct mdt_thread_info *info)
         if (!mdt_object_exists(parent))
                 GOTO(put_parent, rc = -ENOENT);
  
-       lh = &info->mti_lh[MDT_LH_PARENT];
-       mdt_lock_pdo_init(lh, LCK_PW, &rr->rr_name);
-       rc = mdt_object_lock(info, parent, lh, MDS_INODELOCK_UPDATE);
+       rc = mdt_check_enc(info, parent);
         if (rc)
                 GOTO(put_parent, rc);
  
-       if (!mdt_object_remote(parent)) {
-               rc = mdt_version_get_check_save(info, parent, 0);
-               if (rc)
-                       GOTO(unlock_parent, rc);
-       }
-
         /*
-        * Check child name version during replay.
-        * During create replay a file may exist with same name.
+        * LU-10235: check if name exists locklessly first to avoid massive
+        * lock recalls on existing directories.
          */
         rc = mdt_lookup_version_check(info, parent, &rr->rr_name,
                                       &info->mti_tmp_fid1, 1);
-       if (rc == 0)
-               GOTO(unlock_parent, rc = -EEXIST);
+       if (rc == 0) {
+               if (!restripe)
+                       GOTO(put_parent, rc = -EEXIST);
+
+               rc = mdt_restripe(info, parent, &rr->rr_name, rr->rr_fid2, spec,
+                                 ma);
+       }
  
         /* -ENOENT is expected here */
         if (rc != -ENOENT)
-               GOTO(unlock_parent, rc);
+               GOTO(put_parent, rc);
  
         /* save version of file name for replay, it must be ENOENT here */
         mdt_enoent_version_save(info, 1);
  
+       OBD_RACE(OBD_FAIL_MDS_CREATE_RACE);
+
+       lh = &info->mti_lh[MDT_LH_PARENT];
+       mdt_lock_pdo_init(lh, LCK_PW, &rr->rr_name);
+       rc = mdt_object_lock(info, parent, lh, MDS_INODELOCK_UPDATE);
+       if (rc)
+               GOTO(put_parent, rc);
+
+       if (!mdt_object_remote(parent)) {
+               rc = mdt_version_get_check_save(info, parent, 0);
+               if (rc)
+                       GOTO(unlock_parent, rc);
+       }
+
         child = mdt_object_new(info->mti_env, mdt, rr->rr_fid2);
         if (unlikely(IS_ERR(child)))
                 GOTO(unlock_parent, rc = PTR_ERR(child));
@@ -450,14 +621,13 @@ static int mdt_create(struct mdt_thread_info *info)
         if (rc)
                 GOTO(put_child, rc);
  
-       /* Let lower layer know current lock mode. */
-       info->mti_spec.sp_cr_mode = mdt_dlm_mode2mdl_mode(lh->mlh_pdo_mode);
-
         /*
          * Do not perform lookup sanity check. We know that name does
          * not exist.
          */
         info->mti_spec.sp_cr_lookup = 0;
+       if (mdt_object_remote(parent))
+               info->mti_spec.sp_cr_lookup = 1;
         info->mti_spec.sp_feat = &dt_directory_features;
  
         rc = mdo_create(info->mti_env, mdt_object_child(parent), &rr->rr_name,
@@ -515,13 +685,14 @@ static int mdt_create(struct mdt_thread_info *info)
         if (ma->ma_valid & MA_INODE)
                 mdt_pack_attr2body(info, repbody, &ma->ma_attr,
                                    mdt_object_fid(child));
+       EXIT;
  put_child:
         mdt_object_put(info->mti_env, child);
  unlock_parent:
         mdt_object_unlock(info, parent, lh, rc);
  put_parent:
         mdt_object_put(info->mti_env, parent);
-       RETURN(rc);
+       return rc;
  }
  
  static int mdt_attr_set(struct mdt_thread_info *info, struct mdt_object *mo,
@@ -534,8 +705,8 @@ static int mdt_attr_set(struct mdt_thread_info *info, struct mdt_object *mo,
         struct ldlm_enqueue_info *einfo = &info->mti_einfo[0];
         bool cos_incompat;
         int rc;
-       ENTRY;
  
+       ENTRY;
         rc = mdt_object_striped(info, mo);
         if (rc < 0)
                 RETURN(rc);
@@ -548,9 +719,15 @@ static int mdt_attr_set(struct mdt_thread_info *info, struct mdt_object *mo,
         /* Even though the new MDT will grant PERM lock to the old
          * client, but the old client will almost ignore that during
          * So it needs to revoke both LOOKUP and PERM lock here, so
-        * both new and old client can cancel the dcache */
+        * both new and old client can cancel the dcache
+        */
         if (ma->ma_attr.la_valid & (LA_MODE|LA_UID|LA_GID))
                 lockpart |= MDS_INODELOCK_LOOKUP | MDS_INODELOCK_PERM;
+       /* Clear xattr cache on clients, so the virtual project ID xattr
+        * can get the new project ID
+        */
+       if (ma->ma_attr.la_valid & LA_PROJID)
+               lockpart |= MDS_INODELOCK_XATTR;
  
         rc = mdt_reint_striped_lock(info, mo, lh, lockpart, einfo,
                                     cos_incompat);
@@ -582,7 +759,7 @@ static int mdt_attr_set(struct mdt_thread_info *info, struct mdt_object *mo,
  
         if (rc != 0)
                 GOTO(out_unlock, rc);
-       mdt_dom_obj_lvb_update(info->mti_env, mo, false);
+       mdt_dom_obj_lvb_update(info->mti_env, mo, NULL, false);
         EXIT;
  out_unlock:
         mdt_reint_striped_unlock(info, mo, lh, einfo, rc);
@@ -599,10 +776,10 @@ int mdt_add_dirty_flag(struct mdt_thread_info *info, struct mdt_object *mo,
                         struct md_attr *ma)
  {
         struct lu_ucred *uc = mdt_ucred(info);
-       cfs_cap_t cap_saved;
+       kernel_cap_t cap_saved;
         int rc;
-       ENTRY;
  
+       ENTRY;
         /* If the file was modified, add the dirty flag */
         ma->ma_need = MA_HSM;
         rc = mdt_attr_get_complex(info, mo, ma);
@@ -618,9 +795,10 @@ int mdt_add_dirty_flag(struct mdt_thread_info *info, struct mdt_object *mo,
                 ma->ma_hsm.mh_flags |= HS_DIRTY;
  
                 /* Bump cap so that closes from non-owner writers can
-                * set the HSM state to dirty. */
+                * set the HSM state to dirty.
+                */
                 cap_saved = uc->uc_cap;
-               uc->uc_cap |= MD_CAP_TO_MASK(CFS_CAP_FOWNER);
+               cap_raise(uc->uc_cap, CAP_FOWNER);
                 rc = mdt_hsm_attr_set(info, mo, &ma->ma_hsm);
                 uc->uc_cap = cap_saved;
                 if (rc)
@@ -640,15 +818,18 @@ static int mdt_reint_setattr(struct mdt_thread_info *info,
         struct ptlrpc_request *req = mdt_info_req(info);
         struct mdt_object *mo;
         struct mdt_body *repbody;
+       ktime_t kstart = ktime_get();
         int rc, rc2;
-       ENTRY;
  
+       ENTRY;
         DEBUG_REQ(D_INODE, req, "setattr "DFID" %x", PFID(rr->rr_fid1),
                   (unsigned int)ma->ma_attr.la_valid);
  
         if (info->mti_dlm_req)
                 ldlm_request_cancel(req, info->mti_dlm_req, 0, LATF_SKIP);
  
+       OBD_RACE(OBD_FAIL_PTLRPC_RESEND_RACE);
+
         repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
         mo = mdt_object_find(info->mti_env, mdt, rr->rr_fid1);
         if (IS_ERR(mo))
@@ -660,6 +841,7 @@ static int mdt_reint_setattr(struct mdt_thread_info *info,
         if (mdt_object_remote(mo))
                 GOTO(out_put, rc = -EREMOTE);
  
+       ma->ma_enable_chprojid_gid = mdt->mdt_enable_chprojid_gid;
         /* revoke lease lock if size is going to be changed */
         if (unlikely(ma->ma_attr.la_valid & LA_SIZE &&
                      !(ma->ma_attr_flags & MDS_TRUNC_KEEP_LEASE) &&
@@ -688,7 +870,8 @@ static int mdt_reint_setattr(struct mdt_thread_info *info,
                         GOTO(out_put, rc = -ETXTBSY);
  
                 /* LU-10286: compatibility check for FLR.
-                * Please check the comment in mdt_finish_open() for details */
+                * Please check the comment in mdt_finish_open() for details
+                */
                 if (!exp_connect_flr(info->mti_exp) ||
                     !exp_connect_overstriping(info->mti_exp)) {
                         rc = mdt_big_xattr_get(info, mo, XATTR_NAME_LOV);
@@ -724,10 +907,17 @@ static int mdt_reint_setattr(struct mdt_thread_info *info,
  
                 /* MDT supports FMD for regular files due to Data-on-MDT */
                 if (S_ISREG(lu_object_attr(&mo->mot_obj)) &&
-                   ma->ma_attr.la_valid & (LA_ATIME | LA_MTIME | LA_CTIME))
+                   ma->ma_attr.la_valid & (LA_ATIME | LA_MTIME | LA_CTIME)) {
                         tgt_fmd_update(info->mti_exp, mdt_object_fid(mo),
                                        req->rq_xid);
  
+                       if (ma->ma_attr.la_valid & LA_MTIME) {
+                               rc = mdt_attr_get_pfid(info, mo, &ma->ma_pfid);
+                               if (!rc)
+                                       ma->ma_valid |= MA_PFID;
+                       }
+               }
+
                 rc = mdt_attr_set(info, mo, ma);
                 if (rc)
                         GOTO(out_put, rc);
@@ -745,7 +935,7 @@ static int mdt_reint_setattr(struct mdt_thread_info *info,
                             !mdt->mdt_enable_striped_dir)
                                 GOTO(out_put, rc = -EPERM);
  
-                       if (!md_capable(uc, CFS_CAP_SYS_ADMIN) &&
+                       if (!cap_raised(uc->uc_cap, CAP_SYS_ADMIN) &&
                             uc->uc_gid != mdt->mdt_enable_remote_dir_gid &&
                             mdt->mdt_enable_remote_dir_gid != -1)
                                 GOTO(out_put, rc = -EPERM);
@@ -757,35 +947,56 @@ static int mdt_reint_setattr(struct mdt_thread_info *info,
                 if (ma->ma_attr.la_valid != 0)
                         GOTO(out_put, rc = -EPROTO);
  
+               lh = &info->mti_lh[MDT_LH_PARENT];
+               mdt_lock_reg_init(lh, LCK_PW);
+
                 if (ma->ma_valid & MA_LOV) {
                         buf->lb_buf = ma->ma_lmm;
                         buf->lb_len = ma->ma_lmm_size;
                         name = XATTR_NAME_LOV;
                 } else {
                         struct lmv_user_md *lmu = &ma->ma_lmv->lmv_user_md;
+                       struct lu_fid *pfid = &info->mti_tmp_fid1;
+                       struct lu_name *pname = &info->mti_name;
+                       const char dotdot[] = "..";
+                       struct mdt_object *pobj;
  
                         buf->lb_buf = lmu;
                         buf->lb_len = ma->ma_lmv_size;
+                       name = XATTR_NAME_DEFAULT_LMV;
  
-                       if (le32_to_cpu(lmu->lum_hash_type) &
-                           LMV_HASH_FLAG_SPACE) {
-                               /*
-                                * only allow setting "space" hash flag on
-                                * plain directory.
-                                */
-                               rc = mdt_object_striped(info, mo);
+                       if (fid_is_root(rr->rr_fid1)) {
+                               lockpart |= MDS_INODELOCK_LOOKUP;
+                       } else {
+                               /* force client to update dir default layout */
+                               fid_zero(pfid);
+                               pname->ln_name = dotdot;
+                               pname->ln_namelen = sizeof(dotdot);
+                               rc = mdo_lookup(info->mti_env,
+                                               mdt_object_child(mo), pname,
+                                               pfid, NULL);
                                 if (rc)
-                                       GOTO(out_put,
-                                            rc = (rc == 1) ? -EPERM : rc);
-                       }
+                                       GOTO(out_put, rc);
  
-                       name = XATTR_NAME_DEFAULT_LMV;
-                       /* force client to update dir default layout */
-                       lockpart |= MDS_INODELOCK_LOOKUP;
-               }
+                               pobj = mdt_object_find(info->mti_env, mdt,
+                                                      pfid);
+                               if (IS_ERR(pobj))
+                                       GOTO(out_put, rc = PTR_ERR(pobj));
  
-               lh = &info->mti_lh[MDT_LH_PARENT];
-               mdt_lock_reg_init(lh, LCK_PW);
+                               if (mdt_object_remote(pobj))
+                                       rc = mdt_remote_object_lock(info, pobj,
+                                               mdt_object_fid(mo),
+                                               &lh->mlh_rreg_lh, LCK_EX,
+                                               MDS_INODELOCK_LOOKUP, false);
+                               else
+                                       lockpart |= MDS_INODELOCK_LOOKUP;
+
+                               mdt_object_put(info->mti_env, pobj);
+
+                               if (rc)
+                                       GOTO(out_put, rc);
+                       }
+               }
  
                 rc = mdt_object_lock(info, mo, lh, lockpart);
                 if (rc != 0)
@@ -805,8 +1016,8 @@ static int mdt_reint_setattr(struct mdt_thread_info *info,
         if (ma->ma_attr_flags & MDS_DATA_MODIFIED)
                 rc = mdt_add_dirty_flag(info, mo, ma);
  
-        ma->ma_need = MA_INODE;
-        ma->ma_valid = 0;
+       ma->ma_need = MA_INODE;
+       ma->ma_valid = 0;
         rc = mdt_attr_get_complex(info, mo, ma);
         if (rc != 0)
                 GOTO(out_put, rc);
@@ -818,24 +1029,26 @@ out_put:
         mdt_object_put(info->mti_env, mo);
  out:
         if (rc == 0)
-               mdt_counter_incr(req, LPROC_MDT_SETATTR);
+               mdt_counter_incr(req, LPROC_MDT_SETATTR,
+                                ktime_us_delta(ktime_get(), kstart));
  
-        mdt_client_compatibility(info);
-        rc2 = mdt_fix_reply(info);
-        if (rc == 0)
-                rc = rc2;
-        return rc;
+       mdt_client_compatibility(info);
+       rc2 = mdt_fix_reply(info);
+       if (rc == 0)
+               rc = rc2;
+       return rc;
  }
  
  static int mdt_reint_create(struct mdt_thread_info *info,
-                            struct mdt_lock_handle *lhc)
+                           struct mdt_lock_handle *lhc)
  {
-        struct ptlrpc_request   *req = mdt_info_req(info);
-        int                     rc;
-        ENTRY;
+       struct ptlrpc_request   *req = mdt_info_req(info);
+       ktime_t                 kstart = ktime_get();
+       int                     rc;
  
-        if (OBD_FAIL_CHECK(OBD_FAIL_MDS_REINT_CREATE))
-                RETURN(err_serious(-ESTALE));
+       ENTRY;
+       if (OBD_FAIL_CHECK(OBD_FAIL_MDS_REINT_CREATE))
+               RETURN(err_serious(-ESTALE));
  
         if (info->mti_dlm_req)
                 ldlm_request_cancel(mdt_info_req(info),
@@ -846,16 +1059,12 @@ static int mdt_reint_create(struct mdt_thread_info *info,
  
         switch (info->mti_attr.ma_attr.la_mode & S_IFMT) {
         case S_IFDIR:
-               mdt_counter_incr(req, LPROC_MDT_MKDIR);
-               break;
-        case S_IFREG:
-        case S_IFLNK:
-        case S_IFCHR:
-        case S_IFBLK:
-        case S_IFIFO:
+       case S_IFREG:
+       case S_IFLNK:
+       case S_IFCHR:
+       case S_IFBLK:
+       case S_IFIFO:
         case S_IFSOCK:
-               /* Special file should stay on the same node as parent. */
-               mdt_counter_incr(req, LPROC_MDT_MKNOD);
                 break;
         default:
                 CERROR("%s: Unsupported mode %o\n",
@@ -865,6 +1074,16 @@ static int mdt_reint_create(struct mdt_thread_info *info,
         }
  
         rc = mdt_create(info);
+       if (rc == 0) {
+               if ((info->mti_attr.ma_attr.la_mode & S_IFMT) == S_IFDIR)
+                       mdt_counter_incr(req, LPROC_MDT_MKDIR,
+                                        ktime_us_delta(ktime_get(), kstart));
+               else
+                       /* Special file should stay on the same node as parent*/
+                       mdt_counter_incr(req, LPROC_MDT_MKNOD,
+                                        ktime_us_delta(ktime_get(), kstart));
+       }
+
         RETURN(rc);
  }
  
@@ -887,10 +1106,10 @@ static int mdt_reint_unlink(struct mdt_thread_info *info,
         __u64 lock_ibits;
         bool cos_incompat = false;
         int no_name = 0;
+       ktime_t kstart = ktime_get();
         int rc;
  
         ENTRY;
-
         DEBUG_REQ(D_INODE, req, "unlink "DFID"/"DNAME"", PFID(rr->rr_fid1),
                   PNAME(&rr->rr_name));
  
@@ -915,6 +1134,8 @@ static int mdt_reint_unlink(struct mdt_thread_info *info,
                         GOTO(put_parent, rc);
         }
  
+       OBD_RACE(OBD_FAIL_MDS_REINT_OPEN);
+       OBD_RACE(OBD_FAIL_MDS_REINT_OPEN2);
  relock:
         parent_lh = &info->mti_lh[MDT_LH_PARENT];
         mdt_lock_pdo_init(parent_lh, LCK_PW, &rr->rr_name);
@@ -923,32 +1144,37 @@ relock:
         if (rc != 0)
                 GOTO(put_parent, rc);
  
-       /* lookup child object along with version checking */
-       fid_zero(child_fid);
-       rc = mdt_lookup_version_check(info, mp, &rr->rr_name, child_fid, 1);
-       if (rc != 0) {
-               /* Name might not be able to find during resend of
-                * remote unlink, considering following case.
-                * dir_A is a remote directory, the name entry of
-                * dir_A is on MDT0, the directory is on MDT1,
-                *
-                * 1. client sends unlink req to MDT1.
-                * 2. MDT1 sends name delete update to MDT0.
-                * 3. name entry is being deleted in MDT0 synchronously.
-                * 4. MDT1 is restarted.
-                * 5. client resends unlink req to MDT1. So it can not
-                *    find the name entry on MDT0 anymore.
-                * In this case, MDT1 only needs to destory the local
-                * directory.
-                * */
-               if (mdt_object_remote(mp) && rc == -ENOENT &&
-                   !fid_is_zero(rr->rr_fid2) &&
-                   lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) {
-                       no_name = 1;
-                       *child_fid = *rr->rr_fid2;
-                } else {
-                       GOTO(unlock_parent, rc);
-                }
+       if (info->mti_spec.sp_cr_flags & MDS_OP_WITH_FID) {
+               *child_fid = *rr->rr_fid2;
+       } else {
+               /* lookup child object along with version checking */
+               fid_zero(child_fid);
+               rc = mdt_lookup_version_check(info, mp, &rr->rr_name, child_fid,
+                                             1);
+               if (rc != 0) {
+                       /* Name might not be found during resend of
+                        * remote unlink, consider the following case.
+                        * dir_A is a remote directory, the name entry of
+                        * dir_A is on MDT0, the directory is on MDT1,
+                        *
+                        * 1. client sends unlink req to MDT1.
+                        * 2. MDT1 sends name delete update to MDT0.
+                        * 3. name entry is being deleted in MDT0 synchronously.
+                        * 4. MDT1 is restarted.
+                        * 5. client resends unlink req to MDT1. So it can not
+                        *    find the name entry on MDT0 anymore.
+                        * In this case, MDT1 only needs to destroy the local
+                        * directory.
+                        */
+                       if (mdt_object_remote(mp) && rc == -ENOENT &&
+                           !fid_is_zero(rr->rr_fid2) &&
+                           lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) {
+                               no_name = 1;
+                               *child_fid = *rr->rr_fid2;
+                       } else {
+                               GOTO(unlock_parent, rc);
+                       }
+               }
         }
  
         if (!fid_is_md_operative(child_fid))
@@ -959,10 +1185,20 @@ relock:
         if (IS_ERR(mc))
                 GOTO(unlock_parent, rc = PTR_ERR(mc));
  
+       if (info->mti_spec.sp_cr_flags & MDS_OP_WITH_FID) {
+               /* In this case, child fid is embedded in the request, and we do
+                * not have a proper name as rr_name contains an encoded
+                * hash. So find name that matches provided hash.
+                */
+               if (!find_name_matching_hash(info, &rr->rr_name,
+                                            NULL, mc))
+                       GOTO(put_child, rc = -ENOENT);
+       }
+
         if (!cos_incompat) {
                 rc = mdt_object_striped(info, mc);
                 if (rc < 0)
-                       GOTO(unlock_parent, rc);
+                       GOTO(put_child, rc);
  
                 cos_incompat = rc;
                 if (cos_incompat) {
@@ -981,7 +1217,7 @@ relock:
                         /* Return -ENOTSUPP for old client */
                         GOTO(put_child, rc = -ENOTSUPP);
  
-               if (!md_capable(uc, CFS_CAP_SYS_ADMIN))
+               if (!cap_raised(uc->uc_cap, CAP_SYS_ADMIN))
                         GOTO(put_child, rc = -EPERM);
  
                 ma->ma_need = MA_INODE;
@@ -1012,7 +1248,8 @@ relock:
                  * this MDT. Since the unlink will happen on another MDT,
                  * it will release the LOOKUP lock right away. Then What
                  * would happen if another client try to grab the LOOKUP
-                * lock at the same time with unlink XXX */
+                * lock at the same time with unlink XXX
+                */
                 mdt_object_lock(info, mc, child_lh, MDS_INODELOCK_LOOKUP);
                 repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
                 LASSERT(repbody != NULL);
@@ -1022,7 +1259,8 @@ relock:
         }
         /* We used to acquire MDS_INODELOCK_FULL here but we can't do
          * this now because a running HSM restore on the child (unlink
-        * victim) will hold the layout lock. See LU-4002. */
+        * victim) will hold the layout lock. See LU-4002.
+        */
         lock_ibits = MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE;
         if (mdt_object_remote(mp)) {
                 /* Enqueue lookup lock from parent MDT */
@@ -1075,7 +1313,8 @@ out_stat:
         if (ma->ma_valid & MA_INODE) {
                 switch (ma->ma_attr.la_mode & S_IFMT) {
                 case S_IFDIR:
-                       mdt_counter_incr(req, LPROC_MDT_RMDIR);
+                       mdt_counter_incr(req, LPROC_MDT_RMDIR,
+                                        ktime_us_delta(ktime_get(), kstart));
                         break;
                 case S_IFREG:
                 case S_IFLNK:
@@ -1083,7 +1322,8 @@ out_stat:
                 case S_IFBLK:
                 case S_IFIFO:
                 case S_IFSOCK:
-                       mdt_counter_incr(req, LPROC_MDT_UNLINK);
+                       mdt_counter_incr(req, LPROC_MDT_UNLINK,
+                                        ktime_us_delta(ktime_get(), kstart));
                         break;
                 default:
                         LASSERTF(0, "bad file type %o unlinking\n",
@@ -1096,12 +1336,16 @@ out_stat:
  unlock_child:
         mdt_reint_striped_unlock(info, mc, child_lh, einfo, rc);
  put_child:
+       if (info->mti_spec.sp_cr_flags & MDS_OP_WITH_FID &&
+           info->mti_big_buf.lb_buf)
+               lu_buf_free(&info->mti_big_buf);
         mdt_object_put(info->mti_env, mc);
  unlock_parent:
         mdt_object_unlock(info, mp, parent_lh, rc);
  put_parent:
         mdt_object_put(info->mti_env, mp);
-        return rc;
+       CFS_RACE_WAKEUP(OBD_FAIL_OBD_ZERO_NLINK_RACE);
+       return rc;
  }
  
  /*
@@ -1118,21 +1362,29 @@ static int mdt_reint_link(struct mdt_thread_info *info,
         struct mdt_object       *mp;
         struct mdt_lock_handle  *lhs;
         struct mdt_lock_handle  *lhp;
+       ktime_t kstart = ktime_get();
         bool cos_incompat;
         int rc;
-       ENTRY;
  
+       ENTRY;
         DEBUG_REQ(D_INODE, req, "link "DFID" to "DFID"/"DNAME,
                   PFID(rr->rr_fid1), PFID(rr->rr_fid2), PNAME(&rr->rr_name));
  
         if (OBD_FAIL_CHECK(OBD_FAIL_MDS_REINT_LINK))
                 RETURN(err_serious(-ENOENT));
  
+       if (OBD_FAIL_PRECHECK(OBD_FAIL_PTLRPC_RESEND_RACE) ||
+           OBD_FAIL_PRECHECK(OBD_FAIL_PTLRPC_ENQ_RESEND)) {
+               req->rq_no_reply = 1;
+               RETURN(err_serious(-ENOENT));
+       }
+
         if (info->mti_dlm_req)
                 ldlm_request_cancel(req, info->mti_dlm_req, 0, LATF_SKIP);
  
         /* Invalid case so return error immediately instead of
-        * processing it */
+        * processing it
+        */
         if (lu_fid_eq(rr->rr_fid1, rr->rr_fid2))
                 RETURN(-EPERM);
  
@@ -1149,6 +1401,10 @@ static int mdt_reint_link(struct mdt_thread_info *info,
         if (rc)
                 GOTO(put_parent, rc);
  
+       rc = mdt_check_enc(info, mp);
+       if (rc)
+               GOTO(put_parent, rc);
+
         /* step 2: find source */
         ms = mdt_object_find(info->mti_env, info->mti_mdt, rr->rr_fid1);
         if (IS_ERR(ms))
@@ -1162,6 +1418,8 @@ static int mdt_reint_link(struct mdt_thread_info *info,
  
         cos_incompat = (mdt_object_remote(mp) || mdt_object_remote(ms));
  
+       OBD_RACE(OBD_FAIL_MDS_LINK_RENAME_RACE);
+
         lhp = &info->mti_lh[MDT_LH_PARENT];
         mdt_lock_pdo_init(lhp, LCK_PW, &rr->rr_name);
         rc = mdt_reint_object_lock(info, mp, lhp, MDS_INODELOCK_UPDATE,
@@ -1208,7 +1466,8 @@ static int mdt_reint_link(struct mdt_thread_info *info,
                       mdt_object_child(ms), &rr->rr_name, ma);
  
         if (rc == 0)
-               mdt_counter_incr(req, LPROC_MDT_LINK);
+               mdt_counter_incr(req, LPROC_MDT_LINK,
+                                ktime_us_delta(ktime_get(), kstart));
  
         EXIT;
  unlock_source:
@@ -1265,15 +1524,16 @@ static int mdt_rename_lock(struct mdt_thread_info *info,
                            struct lustre_handle *lh)
  {
         int     rc;
-       ENTRY;
  
+       ENTRY;
         if (mdt_seq_site(info->mti_mdt)->ss_node_id != 0) {
                 struct lu_fid *fid = &info->mti_tmp_fid1;
                 struct mdt_object *obj;
  
                 /* XXX, right now, it has to use object API to
                  * enqueue lock cross MDT, so it will enqueue
-                * rename lock(with LUSTRE_BFL_FID) by root object */
+                * rename lock(with LUSTRE_BFL_FID) by root object
+                */
                 lu_root_fid(fid);
                 obj = mdt_object_find(info->mti_env, info->mti_mdt, fid);
                 if (IS_ERR(obj))
@@ -1291,7 +1551,7 @@ static int mdt_rename_lock(struct mdt_thread_info *info,
                 __u64 flags = 0;
  
                 fid_build_reg_res_name(&LUSTRE_BFL_FID, res_id);
-               memset(policy, 0, sizeof *policy);
+               memset(policy, 0, sizeof(*policy));
                 policy->l_inodebits.bits = MDS_INODELOCK_UPDATE;
                 flags = LDLM_FL_LOCAL_ONLY | LDLM_FL_ATOMIC_CB;
                 rc = ldlm_cli_enqueue_local(info->mti_env, ns, res_id,
@@ -1323,7 +1583,6 @@ static struct mdt_object *mdt_parent_find_check(struct mdt_thread_info *info,
         int rc;
  
         ENTRY;
-
         dir = mdt_object_find(info->mti_env, info->mti_mdt, fid);
         if (IS_ERR(dir))
                 RETURN(dir);
@@ -1398,9 +1657,9 @@ int mdt_revoke_remote_lookup_lock(struct mdt_thread_info *info,
   * different list.
   */
  struct mdt_sub_lock {
-       struct mdt_object      *msl_obj;
-       struct mdt_lock_handle  msl_lh;
-       struct list_head        msl_linkage;
+       struct mdt_object *msl_obj;
+       struct mdt_lock_handle msl_lh;
+       struct list_head msl_linkage;
  };
  
  static void mdt_unlock_list(struct mdt_thread_info *info,
@@ -1457,7 +1716,6 @@ static int mdt_link_parents_lock(struct mdt_thread_info *info,
         int rc;
  
         ENTRY;
-
         if (S_ISDIR(lu_object_attr(&obj->mot_obj)))
                 RETURN(0);
  
@@ -1633,15 +1891,20 @@ static int mdt_link_parents_lock(struct mdt_thread_info *info,
  
         EXIT;
  out:
-       if (rc)
+       if (rc) {
                 mdt_unlock_list(info, link_locks, rc);
-       else if (local_lnkp_cnt > RS_MAX_LOCKS - 6)
+       } else if (local_lnkp_cnt > RS_MAX_LOCKS - 5) {
+               CDEBUG(D_INFO, "Too many links (%d), sync operations\n",
+                      local_lnkp_cnt);
                 /*
                  * parent may have 3 local objects: master object and 2 stripes
-                * (if it's being migrated too); source may have 2 local
-                * objects: master and 1 stripe; target has 1 local object.
+                * (if it's being migrated too); source may have 1 local object
+                * as a regular file; target has 1 local object.
+                * Note, source may have 2 local locks if it is a directory, but
+                * then it can't have hardlinks, so it is not considered here.
                  */
                 rc = 1;
+       }
         return rc;
  }
  
@@ -1659,17 +1922,13 @@ static int mdt_lock_remote_slaves(struct mdt_thread_info *info,
         int rc;
  
         ENTRY;
-
         LASSERT(mdt_object_remote(obj));
         LASSERT(ma->ma_valid & MA_LMV);
         LASSERT(lmv);
  
-       if (le32_to_cpu(lmv->lmv_magic) != LMV_MAGIC_V1)
+       if (!lmv_is_sane(lmv))
                 RETURN(-EINVAL);
  
-       if (le32_to_cpu(lmv->lmv_stripe_count) < 1)
-               RETURN(0);
-
         for (i = 0; i < le32_to_cpu(lmv->lmv_stripe_count); i++) {
                 fid_le_to_cpu(fid, &lmv->lmv_stripe_fids[i]);
  
@@ -1773,9 +2032,6 @@ static int mdt_migrate_object_lock(struct mdt_thread_info *info,
                 if (S_ISDIR(lu_object_attr(&obj->mot_obj))) {
                         struct md_attr *ma = &info->mti_attr;
  
-                       ma->ma_lmv = info->mti_big_lmm;
-                       ma->ma_lmv_size = info->mti_big_lmmsize;
-                       ma->ma_valid = 0;
                         rc = mdt_stripe_get(info, obj, ma, XATTR_NAME_LMV);
                         if (rc) {
                                 mdt_object_unlock(info, obj, lh, rc);
@@ -1825,25 +2081,15 @@ static int mdt_migrate_lookup(struct mdt_thread_info *info,
         if (ma->ma_valid & MA_LMV) {
                 /* if parent is striped, lookup on corresponding stripe */
                 struct lmv_mds_md_v1 *lmv = &ma->ma_lmv->lmv_md_v1;
-               __u32 hash_type = le32_to_cpu(lmv->lmv_hash_type);
-               __u32 stripe_count = le32_to_cpu(lmv->lmv_stripe_count);
-               bool is_migrating = le32_to_cpu(lmv->lmv_hash_type) &
-                                   LMV_HASH_FLAG_MIGRATION;
-
-               if (is_migrating) {
-                       hash_type = le32_to_cpu(lmv->lmv_migrate_hash);
-                       stripe_count -= le32_to_cpu(lmv->lmv_migrate_offset);
-               }
  
-               rc = lmv_name_to_stripe_index(hash_type, stripe_count,
-                                             lname->ln_name,
-                                             lname->ln_namelen);
+               if (!lmv_is_sane(lmv))
+                       return -EBADF;
+
+               rc = lmv_name_to_stripe_index_old(lmv, lname->ln_name,
+                                                 lname->ln_namelen);
                 if (rc < 0)
                         return rc;
  
-               if (le32_to_cpu(lmv->lmv_hash_type) & LMV_HASH_FLAG_MIGRATION)
-                       rc += le32_to_cpu(lmv->lmv_migrate_offset);
-
                 fid_le_to_cpu(fid, &lmv->lmv_stripe_fids[rc]);
  
                 stripe = mdt_object_find(env, info->mti_mdt, fid);
@@ -1853,20 +2099,17 @@ static int mdt_migrate_lookup(struct mdt_thread_info *info,
                 fid_zero(fid);
                 rc = mdo_lookup(env, mdt_object_child(stripe), lname, fid,
                                 &info->mti_spec);
-               if (rc == -ENOENT && is_migrating) {
+               if (rc == -ENOENT && lmv_is_layout_changing(lmv)) {
                         /*
-                        * if parent is migrating, and lookup child failed on
-                        * source stripe, lookup again on target stripe, if it
-                        * exists, it means previous migration was interrupted,
-                        * and current file was migrated already.
+                        * if parent layout is changing, and lookup child
+                        * failed on source stripe, lookup again on target
+                        * stripe, if it exists, it means previous migration
+                        * was interrupted, and current file was migrated
+                        * already.
                          */
                         mdt_object_put(env, stripe);
  
-                       hash_type = le32_to_cpu(lmv->lmv_hash_type);
-                       stripe_count = le32_to_cpu(lmv->lmv_migrate_offset);
-
-                       rc = lmv_name_to_stripe_index(hash_type, stripe_count,
-                                                     lname->ln_name,
+                       rc = lmv_name_to_stripe_index(lmv, lname->ln_name,
                                                       lname->ln_namelen);
                         if (rc < 0)
                                 return rc;
@@ -1952,7 +2195,8 @@ static int mdd_migrate_close(struct mdt_thread_info *info,
          * cancelled, it's okay to cancel it now as we've held mot_open_sem.
          */
         ldlm_lock_cancel(lease);
-       ldlm_reprocess_all(lease->l_resource);
+       ldlm_reprocess_all(lease->l_resource,
+                          lease->l_policy_data.l_inodebits.bits);
         LDLM_LOCK_PUT(lease);
  
  close:
@@ -1976,8 +2220,8 @@ close:
   *  9. unlock above locks
   * 10. sync device if source has links
   */
-static int mdt_reint_migrate(struct mdt_thread_info *info,
-                            struct mdt_lock_handle *unused)
+int mdt_reint_migrate(struct mdt_thread_info *info,
+                     struct mdt_lock_handle *unused)
  {
         const struct lu_env *env = info->mti_env;
         struct mdt_device *mdt = info->mti_mdt;
@@ -2002,8 +2246,8 @@ static int mdt_reint_migrate(struct mdt_thread_info *info,
         bool open_sem_locked = false;
         bool do_sync = false;
         int rc;
-       ENTRY;
  
+       ENTRY;
         CDEBUG(D_INODE, "migrate "DFID"/"DNAME" to "DFID"\n", PFID(rr->rr_fid1),
                PNAME(&rr->rr_name), PFID(rr->rr_fid2));
  
@@ -2021,7 +2265,7 @@ static int mdt_reint_migrate(struct mdt_thread_info *info,
         if (!mdt->mdt_enable_remote_dir || !mdt->mdt_enable_dir_migration)
                 RETURN(-EPERM);
  
-       if (!md_capable(uc, CFS_CAP_SYS_ADMIN) &&
+       if (uc && !cap_raised(uc->uc_cap, CAP_SYS_ADMIN) &&
             uc->uc_gid != mdt->mdt_enable_remote_dir_gid &&
             mdt->mdt_enable_remote_dir_gid != -1)
                 RETURN(-EPERM);
@@ -2031,8 +2275,10 @@ static int mdt_reint_migrate(struct mdt_thread_info *info,
          * if other MDT holds rename lock, but being blocked to wait for
          * this MDT to finish its recovery, and the failover MDT can not
          * get rename lock, which will cause deadlock.
+        *
+        * req is NULL if this is called by directory auto-split.
          */
-       if (!req_is_replay(req)) {
+       if (req && !req_is_replay(req)) {
                 rc = mdt_rename_lock(info, &rename_lh);
                 if (rc != 0) {
                         CERROR("%s: can't lock FS for rename: rc = %d\n",
@@ -2042,20 +2288,26 @@ static int mdt_reint_migrate(struct mdt_thread_info *info,
         }
  
         /* pobj is master object of parent */
-       pobj = mdt_parent_find_check(info, rr->rr_fid1, 0);
+       pobj = mdt_object_find(env, mdt, rr->rr_fid1);
         if (IS_ERR(pobj))
                 GOTO(unlock_rename, rc = PTR_ERR(pobj));
  
-       if (unlikely(!info->mti_big_lmm)) {
-               info->mti_big_lmmsize = lmv_mds_md_size(64, LMV_MAGIC);
-               OBD_ALLOC(info->mti_big_lmm, info->mti_big_lmmsize);
-               if (!info->mti_big_lmm)
-                       GOTO(put_parent, rc = -ENOMEM);
+       if (req) {
+               rc = mdt_version_get_check(info, pobj, 0);
+               if (rc)
+                       GOTO(put_parent, rc);
         }
  
-       ma->ma_lmv = info->mti_big_lmm;
-       ma->ma_lmv_size = info->mti_big_lmmsize;
-       ma->ma_valid = 0;
+       if (!mdt_object_exists(pobj))
+               GOTO(put_parent, rc = -ENOENT);
+
+       if (!S_ISDIR(lu_object_attr(&pobj->mot_obj)))
+               GOTO(put_parent, rc = -ENOTDIR);
+
+       rc = mdt_check_enc(info, pobj);
+       if (rc)
+               GOTO(put_parent, rc);
+
         rc = mdt_stripe_get(info, pobj, ma, XATTR_NAME_LMV);
         if (rc)
                 GOTO(put_parent, rc);
@@ -2097,18 +2349,23 @@ lock_parent:
          */
         do_sync = rc;
  
-       /* TODO: DoM migration is not supported yet */
+       /* TODO: DoM migration is not supported, migrate dirent only */
         if (S_ISREG(lu_object_attr(&sobj->mot_obj))) {
-               ma->ma_lmm = info->mti_big_lmm;
-               ma->ma_lmm_size = info->mti_big_lmmsize;
-               ma->ma_valid = 0;
                 rc = mdt_stripe_get(info, sobj, ma, XATTR_NAME_LOV);
                 if (rc)
-                       GOTO(put_source, rc);
+                       GOTO(unlock_links, rc);
  
-               if (ma->ma_valid & MA_LOV &&
-                   mdt_lmm_dom_entry(ma->ma_lmm) != LMM_NO_DOM)
-                       GOTO(put_source, rc = -EOPNOTSUPP);
+               if (ma->ma_valid & MA_LOV && mdt_lmm_dom_stripesize(ma->ma_lmm))
+                       info->mti_spec.sp_migrate_nsonly = 1;
+       } else if (S_ISDIR(lu_object_attr(&sobj->mot_obj))) {
+               rc = mdt_stripe_get(info, sobj, ma, XATTR_NAME_LMV);
+               if (rc)
+                       GOTO(unlock_links, rc);
+
+               /* race with restripe/auto-split? */
+               if ((ma->ma_valid & MA_LMV) &&
+                   lmv_is_restriping(&ma->ma_lmv->lmv_md_v1))
+                       GOTO(unlock_links, rc = -EBUSY);
         }
  
         /* if migration HSM is allowed */
@@ -2163,7 +2420,11 @@ lock_parent:
  
         rc = mdo_migrate(env, mdt_object_child(pobj),
                          mdt_object_child(sobj), &rr->rr_name,
-                        mdt_object_child(tobj), &info->mti_spec, ma);
+                        mdt_object_child(tobj),
+                        &info->mti_spec, ma);
+       if (!rc)
+               lprocfs_counter_incr(mdt->mdt_lu_dev.ld_obd->obd_md_stats,
+                                    LPROC_MDT_MIGRATE + LPROC_MD_LAST_OPC);
         EXIT;
  
         mdt_object_unlock(info, tobj, lht, rc);
@@ -2176,6 +2437,11 @@ unlock_open_sem:
         if (open_sem_locked)
                 up_write(&sobj->mot_open_sem);
  unlock_links:
+       /* if we've got too many locks to save into RPC,
+        * then just commit before the locks are released
+        */
+       if (!rc && do_sync)
+               mdt_device_sync(env, mdt);
         mdt_unlock_list(info, &link_locks, do_sync ? 1 : rc);
  put_source:
         mdt_object_put(env, sobj);
@@ -2189,9 +2455,6 @@ unlock_rename:
         if (lustre_handle_is_used(&rename_lh))
                 mdt_rename_unlock(&rename_lh);
  
-       if (!rc && do_sync)
-               mdt_device_sync(env, mdt);
-
         return rc;
  }
  
@@ -2269,10 +2532,6 @@ static int mdt_rename_determine_lock_order(struct mdt_thread_info *info,
                 return 0;
  
         /* check whether sobj and tobj are sibling stripes */
-       ma->ma_need = MA_LMV;
-       ma->ma_valid = 0;
-       ma->ma_lmv = (union lmv_mds_md *)info->mti_xattr_buf;
-       ma->ma_lmv_size = sizeof(info->mti_xattr_buf);
         rc = mdt_stripe_get(info, sobj, ma, XATTR_NAME_LMV);
         if (rc)
                 return rc;
@@ -2306,6 +2565,108 @@ static int mdt_rename_determine_lock_order(struct mdt_thread_info *info,
  }
  
  /*
+ * lock rename source object.
+ *
+ * Both source and source parent may be remote, and source may be a remote
+ * object on source parent, to avoid overriding lock handle, store remote
+ * LOOKUP lock separately in @lhr.
+ *
+ * \retval     0 on success
+ * \retval     -ev negative errno upon error
+ */
+static int mdt_rename_source_lock(struct mdt_thread_info *info,
+                                 struct mdt_object *parent,
+                                 struct mdt_object *child,
+                                 struct mdt_lock_handle *lhc,
+                                 struct mdt_lock_handle *lhr,
+                                 __u64 ibits, bool cos_incompat)
+{
+       int rc;
+
+       rc = mdt_is_remote_object(info, parent, child);
+       if (rc < 0)
+               return rc;
+
+       if (rc) {
+               /* enqueue remote LOOKUP lock from the parent MDT */
+               __u64 rmt_ibits = MDS_INODELOCK_LOOKUP;
+
+               if (mdt_object_remote(parent)) {
+                       rc = mdt_remote_object_lock(info, parent,
+                                                   mdt_object_fid(child),
+                                                   &lhr->mlh_rreg_lh,
+                                                   lhr->mlh_rreg_mode,
+                                                   rmt_ibits, false);
+                       if (rc != ELDLM_OK)
+                               return rc;
+               } else {
+                       LASSERT(mdt_object_remote(child));
+                       rc = mdt_object_local_lock(info, child, lhr,
+                                                  &rmt_ibits, 0, true);
+                       if (rc < 0)
+                               return rc;
+               }
+
+               ibits &= ~MDS_INODELOCK_LOOKUP;
+       }
+
+       if (mdt_object_remote(child)) {
+               rc = mdt_remote_object_lock(info, child, mdt_object_fid(child),
+                                           &lhc->mlh_rreg_lh,
+                                           lhc->mlh_rreg_mode,
+                                           ibits, false);
+               if (rc == ELDLM_OK)
+                       rc = 0;
+       } else {
+               rc = mdt_reint_object_lock(info, child, lhc, ibits,
+                                          cos_incompat);
+       }
+
+       /* on failure drop the LOOKUP lock saved in @lhr; keep it on success */
+       if (rc)
+               mdt_object_unlock(info, child, lhr, rc);
+
+       return rc;
+}
+
+/* Helper for mdt_reint_rename: lock @mfirstdir, then @mseconddir, so callers
+ * need not open-code both lock orders; @mfirstdir is unlocked if step 2 fails
+ */
+static int mdt_lock_two_dirs(struct mdt_thread_info *info,
+                            struct mdt_object *mfirstdir,
+                            struct mdt_lock_handle *lh_firstdirp,
+                            struct mdt_object *mseconddir,
+                            struct mdt_lock_handle *lh_seconddirp,
+                            bool cos_incompat)
+{
+       int rc;
+
+       rc = mdt_object_lock_save(info, mfirstdir, lh_firstdirp, 0,
+                                 cos_incompat);
+       if (rc)
+               return rc;
+
+       OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_RENAME, 5);
+       /* lock other dir, or the same local dir's second, differing hash */
+       if (mfirstdir != mseconddir) {
+               rc = mdt_object_lock_save(info, mseconddir, lh_seconddirp, 1,
+                                         cos_incompat);
+       } else if (!mdt_object_remote(mseconddir) &&
+                  lh_firstdirp->mlh_pdo_hash !=
+                  lh_seconddirp->mlh_pdo_hash) {
+               rc = mdt_pdir_hash_lock(info, lh_seconddirp, mseconddir,
+                                       MDS_INODELOCK_UPDATE,
+                                       cos_incompat);
+               OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_PDO_LOCK2, 10);
+       }
+
+       if (rc != 0)
+               mdt_object_unlock(info, mfirstdir, lh_firstdirp, rc);
+
+       return rc;
+}
+
+/*
   * VBR: rename versions in reply: 0 - srcdir parent; 1 - tgtdir parent;
   * 2 - srcdir child; 3 - tgtdir child.
   * Update on disk version of srcdir child.
@@ -2325,15 +2686,18 @@ static int mdt_reint_rename(struct mdt_thread_info *info,
         struct mdt_lock_handle *lh_srcdirp;
         struct mdt_lock_handle *lh_tgtdirp;
         struct mdt_lock_handle *lh_oldp = NULL;
+       struct mdt_lock_handle *lh_rmt = NULL;
         struct mdt_lock_handle *lh_newp = NULL;
         struct lu_fid *old_fid = &info->mti_tmp_fid1;
         struct lu_fid *new_fid = &info->mti_tmp_fid2;
         __u64 lock_ibits;
         bool reverse = false, discard = false;
         bool cos_incompat;
+       ktime_t kstart = ktime_get();
+       enum mdt_stat_idx msi = 0;
         int rc;
-       ENTRY;
  
+       ENTRY;
         DEBUG_REQ(D_INODE, req, "rename "DFID"/"DNAME" to "DFID"/"DNAME,
                   PFID(rr->rr_fid1), PNAME(&rr->rr_name),
                   PFID(rr->rr_fid2), PNAME(&rr->rr_tgt_name));
@@ -2350,6 +2714,10 @@ static int mdt_reint_rename(struct mdt_thread_info *info,
         if (IS_ERR(msrcdir))
                 RETURN(PTR_ERR(msrcdir));
  
+       rc = mdt_check_enc(info, msrcdir);
+       if (rc)
+               GOTO(out_put_srcdir, rc);
+
         OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_RENAME3, 5);
  
         if (lu_fid_eq(rr->rr_fid1, rr->rr_fid2)) {
@@ -2361,6 +2729,10 @@ static int mdt_reint_rename(struct mdt_thread_info *info,
                         GOTO(out_put_srcdir, rc = PTR_ERR(mtgtdir));
         }
  
+       rc = mdt_check_enc(info, mtgtdir);
+       if (rc)
+               GOTO(out_put_tgtdir, rc);
+
         /*
          * Note: do not enqueue rename lock for replay request, because
          * if other MDT holds rename lock, but being blocked to wait for
@@ -2368,6 +2740,8 @@ static int mdt_reint_rename(struct mdt_thread_info *info,
          * get rename lock, which will cause deadlock.
          */
         if (!req_is_replay(req)) {
+               bool remote = mdt_object_remote(msrcdir);
+
                 /*
                  * Normally rename RPC is handled on the MDT with the target
                  * directory (if target exists, it's on the MDT with the
@@ -2376,28 +2750,50 @@ static int mdt_reint_rename(struct mdt_thread_info *info,
                  * cause any issue), return -EXDEV early to avoid taking
                  * rename_lock.
                  */
-               if (!mdt->mdt_enable_remote_rename &&
-                   mdt_object_remote(msrcdir))
+               if (!mdt->mdt_enable_remote_rename && remote)
                         GOTO(out_put_tgtdir, rc = -EXDEV);
  
-               rc = mdt_rename_lock(info, &rename_lh);
-               if (rc != 0) {
-                       CERROR("%s: can't lock FS for rename: rc = %d\n",
-                              mdt_obd_name(mdt), rc);
-                       GOTO(out_put_tgtdir, rc);
+               /* This might be further relaxed in the future for regular file
+                * renames in different source and target parents. Start with
+                * only same-directory renames for simplicity and because this
+                * is by far the most common use case.
+                *
+                * Striped directories should be considered "remote".
+                */
+               if (msrcdir != mtgtdir || remote ||
+                   (S_ISDIR(ma->ma_attr.la_mode) &&
+                    !mdt->mdt_enable_parallel_rename_dir) ||
+                   (!S_ISDIR(ma->ma_attr.la_mode) &&
+                    !mdt->mdt_enable_parallel_rename_file)) {
+                       rc = mdt_rename_lock(info, &rename_lh);
+                       if (rc != 0) {
+                               CERROR("%s: cannot lock for rename: rc = %d\n",
+                                      mdt_obd_name(mdt), rc);
+                               GOTO(out_put_tgtdir, rc);
+                       }
+               } else {
+                       if (S_ISDIR(ma->ma_attr.la_mode))
+                               msi = LPROC_MDT_RENAME_PAR_DIR;
+                       else
+                               msi = LPROC_MDT_RENAME_PAR_FILE;
+
+                       CDEBUG(D_INFO,
+                              "%s: samedir parallel rename "DFID"/"DNAME"\n",
+                              mdt_obd_name(mdt), PFID(rr->rr_fid1),
+                              PNAME(&rr->rr_name));
                 }
         }
  
         rc = mdt_rename_determine_lock_order(info, msrcdir, mtgtdir);
         if (rc < 0)
                 GOTO(out_unlock_rename, rc);
-
         reverse = rc;
  
         /* source needs to be looked up after locking source parent, otherwise
          * this rename may race with unlink source, and cause rename hang, see
          * sanityn.sh 55b, so check parents first, if later we found source is
-        * remote, relock parents. */
+        * remote, relock parents.
+        */
         cos_incompat = (mdt_object_remote(msrcdir) ||
                         mdt_object_remote(mtgtdir));
  
@@ -2407,48 +2803,30 @@ static int mdt_reint_rename(struct mdt_thread_info *info,
         lh_srcdirp = &info->mti_lh[MDT_LH_PARENT];
         lh_tgtdirp = &info->mti_lh[MDT_LH_CHILD];
  
+       OBD_RACE(OBD_FAIL_MDS_REINT_OPEN);
+       OBD_RACE(OBD_FAIL_MDS_REINT_OPEN2);
  relock:
         mdt_lock_pdo_init(lh_srcdirp, LCK_PW, &rr->rr_name);
         mdt_lock_pdo_init(lh_tgtdirp, LCK_PW, &rr->rr_tgt_name);
  
-       if (reverse) {
-               rc = mdt_object_lock_save(info, mtgtdir, lh_tgtdirp, 1,
-                                         cos_incompat);
-               if (rc)
-                       GOTO(out_unlock_rename, rc);
-
-               OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_RENAME, 5);
-
-               rc = mdt_object_lock_save(info, msrcdir, lh_srcdirp, 0,
-                                         cos_incompat);
-               if (rc != 0) {
-                       mdt_object_unlock(info, mtgtdir, lh_tgtdirp, rc);
-                       GOTO(out_unlock_rename, rc);
-               }
-       } else {
-               rc = mdt_object_lock_save(info, msrcdir, lh_srcdirp, 0,
-                                         cos_incompat);
-               if (rc)
-                       GOTO(out_unlock_rename, rc);
-
-               OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_RENAME, 5);
+       /* In case of same dir local rename we must sort by the hash,
+        * otherwise a lock deadlock is possible when renaming
+        * a to b and b to a at the same time LU-15285
+        */
+       if (!mdt_object_remote(mtgtdir) && mtgtdir == msrcdir)
+               reverse = lh_srcdirp->mlh_pdo_hash > lh_tgtdirp->mlh_pdo_hash;
+       if (unlikely(OBD_FAIL_PRECHECK(OBD_FAIL_MDS_PDO_LOCK)))
+               reverse = 0;
+
+       if (reverse)
+               rc = mdt_lock_two_dirs(info, mtgtdir, lh_tgtdirp, msrcdir,
+                                      lh_srcdirp, cos_incompat);
+       else
+               rc = mdt_lock_two_dirs(info, msrcdir, lh_srcdirp, mtgtdir,
+                                      lh_tgtdirp, cos_incompat);
  
-               if (mtgtdir != msrcdir) {
-                       rc = mdt_object_lock_save(info, mtgtdir, lh_tgtdirp, 1,
-                                                 cos_incompat);
-               } else if (!mdt_object_remote(mtgtdir) &&
-                          lh_srcdirp->mlh_pdo_hash !=
-                          lh_tgtdirp->mlh_pdo_hash) {
-                       rc = mdt_pdir_hash_lock(info, lh_tgtdirp, mtgtdir,
-                                               MDS_INODELOCK_UPDATE,
-                                               cos_incompat);
-                       OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_PDO_LOCK2, 10);
-               }
-               if (rc != 0) {
-                       mdt_object_unlock(info, msrcdir, lh_srcdirp, rc);
-                       GOTO(out_unlock_rename, rc);
-               }
-       }
+       if (rc != 0)
+               GOTO(out_unlock_rename, rc);
  
         OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_RENAME4, 5);
         OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_RENAME2, 5);
@@ -2480,7 +2858,8 @@ relock:
                 GOTO(out_put_old, rc = -EXDEV);
  
         /* Check if @mtgtdir is subdir of @mold, before locking child
-        * to avoid reverse locking. */
+        * to avoid reverse locking.
+        */
         if (mtgtdir != msrcdir) {
                 rc = mdo_is_subdir(info->mti_env, mdt_object_child(mtgtdir),
                                    old_fid);
@@ -2505,7 +2884,8 @@ relock:
  
         /* find mnew object:
          * mnew target object may not exist now
-        * lookup with version checking */
+        * lookup with version checking
+        */
         fid_zero(new_fid);
         rc = mdt_lookup_version_check(info, mtgtdir, &rr->rr_tgt_name, new_fid,
                                       3);
@@ -2546,35 +2926,25 @@ relock:
                 /* Before locking the target dir, check we do not replace
                  * a dir with a non-dir, otherwise it may deadlock with
                  * link op which tries to create a link in this dir
-                * back to this non-dir. */
+                * back to this non-dir.
+                */
                 if (S_ISDIR(lu_object_attr(&mnew->mot_obj)) &&
                     !S_ISDIR(lu_object_attr(&mold->mot_obj)))
                         GOTO(out_put_new, rc = -EISDIR);
  
                 lh_oldp = &info->mti_lh[MDT_LH_OLD];
+               lh_rmt = &info->mti_lh[MDT_LH_RMT];
                 mdt_lock_reg_init(lh_oldp, LCK_EX);
+               mdt_lock_reg_init(lh_rmt, LCK_EX);
                 lock_ibits = MDS_INODELOCK_LOOKUP | MDS_INODELOCK_XATTR;
-               if (mdt_object_remote(msrcdir)) {
-                       /* Enqueue lookup lock from the parent MDT */
-                       rc = mdt_remote_object_lock(info, msrcdir,
-                                                   mdt_object_fid(mold),
-                                                   &lh_oldp->mlh_rreg_lh,
-                                                   lh_oldp->mlh_rreg_mode,
-                                                   MDS_INODELOCK_LOOKUP,
-                                                   false);
-                       if (rc != ELDLM_OK)
-                               GOTO(out_put_new, rc);
-
-                       lock_ibits &= ~MDS_INODELOCK_LOOKUP;
-               }
-
-               rc = mdt_reint_object_lock(info, mold, lh_oldp, lock_ibits,
-                                          cos_incompat);
-               if (rc != 0)
-                       GOTO(out_unlock_old, rc);
+               rc = mdt_rename_source_lock(info, msrcdir, mold, lh_oldp,
+                                           lh_rmt, lock_ibits, cos_incompat);
+               if (rc < 0)
+                       GOTO(out_put_new, rc);
  
                 /* Check if @msrcdir is subdir of @mnew, before locking child
-                * to avoid reverse locking. */
+                * to avoid reverse locking.
+                */
                 if (mtgtdir != msrcdir) {
                         rc = mdo_is_subdir(info->mti_env,
                                            mdt_object_child(msrcdir), new_fid);
@@ -2588,43 +2958,43 @@ relock:
                 /* We used to acquire MDS_INODELOCK_FULL here but we
                  * can't do this now because a running HSM restore on
                  * the rename onto victim will hold the layout
-                * lock. See LU-4002. */
+                * lock. See LU-4002.
+                */
  
                 lh_newp = &info->mti_lh[MDT_LH_NEW];
                 mdt_lock_reg_init(lh_newp, LCK_EX);
-               rc = mdt_reint_object_lock(info, mnew, lh_newp,
-                                          MDS_INODELOCK_LOOKUP |
-                                          MDS_INODELOCK_UPDATE,
+               lock_ibits = MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE;
+               if (mdt_object_remote(mtgtdir)) {
+                       rc = mdt_remote_object_lock(info, mtgtdir,
+                                                   mdt_object_fid(mnew),
+                                                   &lh_newp->mlh_rreg_lh,
+                                                   lh_newp->mlh_rreg_mode,
+                                                   MDS_INODELOCK_LOOKUP,
+                                                   false);
+                       if (rc != ELDLM_OK)
+                               GOTO(out_unlock_old, rc);
+
+                       lock_ibits &= ~MDS_INODELOCK_LOOKUP;
+               }
+               rc = mdt_reint_object_lock(info, mnew, lh_newp, lock_ibits,
                                            cos_incompat);
                 if (rc != 0)
-                       GOTO(out_unlock_old, rc);
+                       GOTO(out_unlock_new, rc);
  
                 /* get and save version after locking */
                 mdt_version_get_save(info, mnew, 3);
-       } else if (rc != -EREMOTE && rc != -ENOENT) {
+       } else if (rc != -ENOENT) {
                 GOTO(out_put_old, rc);
         } else {
                 lh_oldp = &info->mti_lh[MDT_LH_OLD];
+               lh_rmt = &info->mti_lh[MDT_LH_RMT];
                 mdt_lock_reg_init(lh_oldp, LCK_EX);
+               mdt_lock_reg_init(lh_rmt, LCK_EX);
                 lock_ibits = MDS_INODELOCK_LOOKUP | MDS_INODELOCK_XATTR;
-               if (mdt_object_remote(msrcdir)) {
-                       /* Enqueue lookup lock from the parent MDT */
-                       rc = mdt_remote_object_lock(info, msrcdir,
-                                                   mdt_object_fid(mold),
-                                                   &lh_oldp->mlh_rreg_lh,
-                                                   lh_oldp->mlh_rreg_mode,
-                                                   MDS_INODELOCK_LOOKUP,
-                                                   false);
-                       if (rc != ELDLM_OK)
-                               GOTO(out_put_old, rc);
-
-                       lock_ibits &= ~MDS_INODELOCK_LOOKUP;
-               }
-
-               rc = mdt_reint_object_lock(info, mold, lh_oldp, lock_ibits,
-                                          cos_incompat);
+               rc = mdt_rename_source_lock(info, msrcdir, mold, lh_oldp,
+                                           lh_rmt, lock_ibits, cos_incompat);
                 if (rc != 0)
-                       GOTO(out_unlock_old, rc);
+                       GOTO(out_put_old, rc);
  
                 mdt_enoent_version_save(info, 3);
         }
@@ -2648,19 +3018,21 @@ relock:
  
         /* handle last link of tgt object */
         if (rc == 0) {
-               mdt_counter_incr(req, LPROC_MDT_RENAME);
                 if (mnew) {
                         mdt_handle_last_unlink(info, mnew, ma);
                         discard = mdt_dom_check_for_discard(info, mnew);
                 }
                 mdt_rename_counter_tally(info, info->mti_mdt, req,
-                                        msrcdir, mtgtdir);
+                                        msrcdir, mtgtdir, msi,
+                                        ktime_us_delta(ktime_get(), kstart));
         }
  
         EXIT;
+out_unlock_new:
         if (mnew != NULL)
                 mdt_object_unlock(info, mnew, lh_newp, rc);
  out_unlock_old:
+       mdt_object_unlock(info, NULL, lh_rmt, rc);
         mdt_object_unlock(info, mold, lh_oldp, rc);
  out_put_new:
         if (mnew && !discard)
@@ -2687,6 +3059,7 @@ out_put_srcdir:
                 mdt_dom_discard_data(info, mnew);
                 mdt_object_put(info->mti_env, mnew);
         }
+       OBD_RACE(OBD_FAIL_MDS_LINK_RENAME_RACE);
         return rc;
  }
  
@@ -2694,17 +3067,17 @@ static int mdt_reint_resync(struct mdt_thread_info *info,
                             struct mdt_lock_handle *lhc)
  {
         struct mdt_reint_record *rr = &info->mti_rr;
-       struct ptlrpc_request   *req = mdt_info_req(info);
-       struct md_attr          *ma = &info->mti_attr;
-       struct mdt_object       *mo;
-       struct ldlm_lock        *lease;
-       struct mdt_body         *repbody;
-       struct md_layout_change  layout = { .mlc_mirror_id = rr->rr_mirror_id };
-       bool                     lease_broken;
-       int                      rc, rc2;
-       ENTRY;
+       struct ptlrpc_request *req = mdt_info_req(info);
+       struct md_attr *ma = &info->mti_attr;
+       struct mdt_object *mo;
+       struct ldlm_lock *lease;
+       struct mdt_body *repbody;
+       struct md_layout_change layout = { .mlc_mirror_id = rr->rr_mirror_id };
+       bool lease_broken;
+       int rc, rc2;
  
-       DEBUG_REQ(D_INODE, req, DFID": FLR file resync\n", PFID(rr->rr_fid1));
+       ENTRY;
+       DEBUG_REQ(D_INODE, req, DFID", FLR file resync", PFID(rr->rr_fid1));
  
         if (info->mti_dlm_req)
                 ldlm_request_cancel(req, info->mti_dlm_req, 0, LATF_SKIP);
@@ -2729,7 +3102,8 @@ static int mdt_reint_resync(struct mdt_thread_info *info,
         /* It's really necessary to grab open_sem and check if the lease lock
          * has been lost. There would exist a concurrent writer coming in and
          * generating some dirty data in memory cache, the writeback would fail
-        * after the layout version is increased by MDS_REINT_RESYNC RPC. */
+        * after the layout version is increased by MDS_REINT_RESYNC RPC.
+        */
         if (!down_write_trylock(&mo->mot_open_sem))
                 GOTO(out_put_lease, rc = -EBUSY);
  
@@ -2741,10 +3115,13 @@ static int mdt_reint_resync(struct mdt_thread_info *info,
  
         /* the file has yet opened by anyone else after we took the lease. */
         layout.mlc_opc = MD_LAYOUT_RESYNC;
-       rc = mdt_layout_change(info, mo, &layout);
+       lhc = &info->mti_lh[MDT_LH_LOCAL];
+       rc = mdt_layout_change(info, mo, lhc, &layout);
         if (rc)
                 GOTO(out_unlock, rc);
  
+       mdt_object_unlock(info, mo, lhc, 0);
+
         ma->ma_need = MA_INODE;
         ma->ma_valid = 0;
         rc = mdt_attr_get_complex(info, mo, ma);
@@ -2822,8 +3199,8 @@ int mdt_reint_rec(struct mdt_thread_info *info,
  {
         const struct mdt_reinter *mr;
         int rc;
-       ENTRY;
  
+       ENTRY;
         if (!(info->mti_rr.rr_opcode < ARRAY_SIZE(mdt_reinters)))
                 RETURN(-EPROTO);