Whamcloud - gitweb
LU-2782 mdt: add macros for fid string len
[fs/lustre-release.git] / lustre / mdt / mdt_handler.c
index bdb13e1..acf394d 100644 (file)
@@ -68,6 +68,7 @@
 #include <lustre_acl.h>
 #include <lustre_param.h>
 #include <lustre_quota.h>
+#include <lustre_linkea.h>
 
 mdl_mode_t mdt_mdl_lock_modes[] = {
         [LCK_MINMODE] = MDL_MINMODE,
@@ -91,11 +92,8 @@ ldlm_mode_t mdt_dlm_lock_modes[] = {
         [MDL_GROUP]   = LCK_GROUP
 };
 
-
 static struct mdt_device *mdt_dev(struct lu_device *d);
 static int mdt_unpack_req_pack_rep(struct mdt_thread_info *info, __u32 flags);
-static int mdt_fid2path(const struct lu_env *env, struct mdt_device *mdt,
-                        struct getinfo_fid2path *fp);
 
 static const struct lu_object_operations mdt_obj_ops;
 
@@ -142,6 +140,7 @@ void mdt_lock_reg_init(struct mdt_lock_handle *lh, ldlm_mode_t lm)
 {
         lh->mlh_pdo_hash = 0;
         lh->mlh_reg_mode = lm;
+       lh->mlh_rreg_mode = lm;
         lh->mlh_type = MDT_REG_LOCK;
 }
 
@@ -149,11 +148,19 @@ void mdt_lock_pdo_init(struct mdt_lock_handle *lh, ldlm_mode_t lm,
                        const char *name, int namelen)
 {
         lh->mlh_reg_mode = lm;
+       lh->mlh_rreg_mode = lm;
         lh->mlh_type = MDT_PDO_LOCK;
 
         if (name != NULL && (name[0] != '\0')) {
                 LASSERT(namelen > 0);
                 lh->mlh_pdo_hash = full_name_hash(name, namelen);
+               /* XXX Workaround for LU-2856
+                * Zero is a valid return value of full_name_hash, but several
+                * users of mlh_pdo_hash assume a non-zero hash value. We
+                * therefore map zero onto an arbitrary, but consistent
+                * value (1) to avoid problems further down the road. */
+               if (unlikely(!lh->mlh_pdo_hash))
+                       lh->mlh_pdo_hash = 1;
         } else {
                 LASSERT(namelen == 0);
                 lh->mlh_pdo_hash = 0ull;
@@ -237,12 +244,10 @@ static void mdt_lock_pdo_mode(struct mdt_thread_info *info, struct mdt_object *o
 
 int mdt_getstatus(struct mdt_thread_info *info)
 {
-        struct mdt_device *mdt  = info->mti_mdt;
-        struct md_device  *next = mdt->mdt_child;
-        struct mdt_body   *repbody;
-        int                rc;
-
-        ENTRY;
+       struct mdt_device       *mdt  = info->mti_mdt;
+       struct mdt_body         *repbody;
+       int                     rc;
+       ENTRY;
 
         rc = mdt_check_ucred(info);
         if (rc)
@@ -251,11 +256,8 @@ int mdt_getstatus(struct mdt_thread_info *info)
         if (OBD_FAIL_CHECK(OBD_FAIL_MDS_GETSTATUS_PACK))
                 RETURN(err_serious(-ENOMEM));
 
-        repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
-        rc = next->md_ops->mdo_root_get(info->mti_env, next, &repbody->fid1);
-        if (rc != 0)
-                RETURN(rc);
-
+       repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
+       repbody->fid1 = mdt->mdt_md_root_fid;
         repbody->valid |= OBD_MD_FLID;
 
         if (mdt->mdt_opts.mo_mds_capa &&
@@ -426,7 +428,7 @@ void mdt_client_compatibility(struct mdt_thread_info *info)
         struct lu_attr        *la = &ma->ma_attr;
         ENTRY;
 
-       if (exp_connect_flags(exp) & OBD_CONNECT_LAYOUTLOCK)
+       if (exp_connect_layout(exp))
                /* the client can deal with 16-bit lmm_stripe_count */
                RETURN_EXIT;
 
@@ -683,13 +685,16 @@ static int mdt_getattr_internal(struct mdt_thread_info *info,
 
         ma->ma_valid = 0;
 
-        rc = mdt_object_exists(o);
-        if (rc < 0) {
-                /* This object is located on remote node.*/
-                repbody->fid1 = *mdt_object_fid(o);
-                repbody->valid = OBD_MD_FLID | OBD_MD_MDS;
-                GOTO(out, rc = 0);
-        }
+       if (mdt_object_remote(o)) {
+               /* This object is located on remote node.*/
+               /* Return -EIO for old client */
+               if (!mdt_is_dne_client(req->rq_export))
+                       GOTO(out, rc = -EIO);
+
+               repbody->fid1 = *mdt_object_fid(o);
+               repbody->valid = OBD_MD_FLID | OBD_MD_MDS;
+               GOTO(out, rc = 0);
+       }
 
        buffer->lb_len = reqbody->eadatasize;
        if (buffer->lb_len > 0)
@@ -721,11 +726,12 @@ static int mdt_getattr_internal(struct mdt_thread_info *info,
                 ma->ma_som = &info->mti_u.som.data;
 
        rc = mdt_attr_get_complex(info, o, ma);
-        if (unlikely(rc)) {
-                CERROR("getattr error for "DFID": %d\n",
-                        PFID(mdt_object_fid(o)), rc);
-                RETURN(rc);
-        }
+       if (unlikely(rc)) {
+               CERROR("%s: getattr error for "DFID": rc = %d\n",
+                      mdt_obd_name(info->mti_mdt),
+                      PFID(mdt_object_fid(o)), rc);
+               RETURN(rc);
+       }
 
        is_root = lu_fid_eq(mdt_object_fid(o), &info->mti_mdt->mdt_md_root_fid);
 
@@ -746,8 +752,9 @@ static int mdt_getattr_internal(struct mdt_thread_info *info,
                rc = mdt_attr_get_lov(info, root, ma);
                mdt_object_put(info->mti_env, root);
                if (unlikely(rc)) {
-                       CERROR("getattr error for "DFID": %d\n",
-                                       PFID(mdt_object_fid(o)), rc);
+                       CERROR("%s: getattr error for "DFID": rc = %d\n",
+                              mdt_obd_name(info->mti_mdt),
+                              PFID(mdt_object_fid(o)), rc);
                        RETURN(rc);
                }
        }
@@ -772,17 +779,19 @@ static int mdt_getattr_internal(struct mdt_thread_info *info,
                         repbody->eadatasize = ma->ma_lmv_size;
                         repbody->valid |= (OBD_MD_FLDIREA|OBD_MD_MEA);
                 }
-        } else if (S_ISLNK(la->la_mode) &&
-                   reqbody->valid & OBD_MD_LINKNAME) {
-                buffer->lb_buf = ma->ma_lmm;
-                /* eadatasize from client includes NULL-terminator, so
-                 * there is no need to read it */
-                buffer->lb_len = reqbody->eadatasize - 1;
-                rc = mo_readlink(env, next, buffer);
-                if (unlikely(rc <= 0)) {
-                        CERROR("readlink failed: %d\n", rc);
-                        rc = -EFAULT;
-                } else {
+       } else if (S_ISLNK(la->la_mode) &&
+                  reqbody->valid & OBD_MD_LINKNAME) {
+               buffer->lb_buf = ma->ma_lmm;
+               /* eadatasize from client includes NULL-terminator, so
+                * there is no need to read it */
+               buffer->lb_len = reqbody->eadatasize - 1;
+               rc = mo_readlink(env, next, buffer);
+               if (unlikely(rc <= 0)) {
+                       CERROR("%s: readlink failed for "DFID": rc = %d\n",
+                              mdt_obd_name(info->mti_mdt),
+                              PFID(mdt_object_fid(o)), rc);
+                       rc = -EFAULT;
+               } else {
                        int print_limit = min_t(int, CFS_PAGE_SIZE - 128, rc);
 
                        if (OBD_FAIL_CHECK(OBD_FAIL_MDS_READLINK_EPROTO))
@@ -792,8 +801,11 @@ static int mdt_getattr_internal(struct mdt_thread_info *info,
                         * because client expects that */
                        repbody->eadatasize = rc + 1;
                        if (repbody->eadatasize != reqbody->eadatasize)
-                               CERROR("Read shorter symlink %d, expected %d\n",
-                                      rc, reqbody->eadatasize - 1);
+                               CDEBUG(D_INODE, "%s: Read shorter symlink %d "
+                                      "on "DFID ", expected %d\n",
+                                      mdt_obd_name(info->mti_mdt),
+                                      rc, PFID(mdt_object_fid(o)),
+                                      reqbody->eadatasize - 1);
                        /* NULL terminate */
                        ((char *)ma->ma_lmm)[rc] = 0;
 
@@ -847,9 +859,12 @@ static int mdt_getattr_internal(struct mdt_thread_info *info,
                                         rc = 0;
                                 } else if (rc == -EOPNOTSUPP) {
                                         rc = 0;
-                                } else {
-                                        CERROR("got acl size: %d\n", rc);
-                                }
+                               } else {
+                                       CERROR("%s: unable to read "DFID
+                                              " ACL: rc = %d\n",
+                                              mdt_obd_name(info->mti_mdt),
+                                              PFID(mdt_object_fid(o)), rc);
+                               }
                         } else {
                                 repbody->aclsize = rc;
                                 repbody->valid |= OBD_MD_FLACL;
@@ -992,18 +1007,108 @@ int mdt_is_subdir(struct mdt_thread_info *info)
 
         repbody = req_capsule_server_get(pill, &RMF_MDT_BODY);
 
-        /*
-         * We save last checked parent fid to @repbody->fid1 for remote
-         * directory case.
-         */
-        LASSERT(fid_is_sane(&body->fid2));
-        LASSERT(mdt_object_exists(o) > 0);
-        rc = mdo_is_subdir(info->mti_env, mdt_object_child(o),
-                           &body->fid2, &repbody->fid1);
-        if (rc == 0 || rc == -EREMOTE)
-                repbody->valid |= OBD_MD_FLID;
+       /*
+        * We save last checked parent fid to @repbody->fid1 for remote
+        * directory case.
+        */
+       LASSERT(fid_is_sane(&body->fid2));
+       LASSERT(mdt_object_exists(o) && !mdt_object_remote(o));
+       rc = mdo_is_subdir(info->mti_env, mdt_object_child(o),
+                          &body->fid2, &repbody->fid1);
+       if (rc == 0 || rc == -EREMOTE)
+               repbody->valid |= OBD_MD_FLID;
 
-        RETURN(rc);
+       RETURN(rc);
+}
+
+int mdt_swap_layouts(struct mdt_thread_info *info)
+{
+       struct ptlrpc_request   *req = mdt_info_req(info);
+       struct obd_export       *exp = req->rq_export;
+       struct mdt_object       *o1, *o2, *o;
+       struct mdt_lock_handle  *lh1, *lh2;
+       struct mdc_swap_layouts *msl;
+       int                      rc;
+       ENTRY;
+
+       /* If the client does not support layout lock, layout swapping
+        * is disabled.
+        * FIXME: there is a problem for old clients which don't support
+        * layout lock yet. If those clients have already opened the file
+        * they won't be notified at all so that old layout may still be
+        * used to do IO. This can be fixed after file release is landed by
+        * doing exclusive open and taking full EX ibits lock. - Jinshan */
+       if (!exp_connect_layout(exp))
+               RETURN(-EOPNOTSUPP);
+
+       if (req_capsule_get_size(info->mti_pill, &RMF_CAPA1, RCL_CLIENT))
+               mdt_set_capainfo(info, 0, &info->mti_body->fid1,
+                                req_capsule_client_get(info->mti_pill,
+                                                       &RMF_CAPA1));
+
+       if (req_capsule_get_size(info->mti_pill, &RMF_CAPA2, RCL_CLIENT))
+               mdt_set_capainfo(info, 1, &info->mti_body->fid2,
+                                req_capsule_client_get(info->mti_pill,
+                                                       &RMF_CAPA2));
+
+       o1 = info->mti_object;
+       o = o2 = mdt_object_find(info->mti_env, info->mti_mdt,
+                               &info->mti_body->fid2);
+       if (IS_ERR(o))
+               GOTO(out, rc = PTR_ERR(o));
+
+       if (mdt_object_remote(o) || !mdt_object_exists(o)) /* remote or missing */
+               GOTO(put, rc = -ENOENT);
+
+       rc = lu_fid_cmp(&info->mti_body->fid1, &info->mti_body->fid2);
+       if (unlikely(rc == 0)) /* same file, you kidding me? no-op. */
+               GOTO(put, rc);
+
+       if (rc < 0)
+               swap(o1, o2);
+
+       /* Permission check. Make sure the calling process has permission
+        * to write both files. */
+       rc = mo_permission(info->mti_env, NULL, mdt_object_child(o1), NULL,
+                               MAY_WRITE);
+       if (rc < 0)
+               GOTO(put, rc);
+
+       rc = mo_permission(info->mti_env, NULL, mdt_object_child(o2), NULL,
+                               MAY_WRITE);
+       if (rc < 0)
+               GOTO(put, rc);
+
+       msl = req_capsule_client_get(info->mti_pill, &RMF_SWAP_LAYOUTS);
+       if (msl == NULL)
+               GOTO(put, rc = -EPROTO);
+
+       lh1 = &info->mti_lh[MDT_LH_NEW];
+       mdt_lock_reg_init(lh1, LCK_EX);
+       lh2 = &info->mti_lh[MDT_LH_OLD];
+       mdt_lock_reg_init(lh2, LCK_EX);
+
+       rc = mdt_object_lock(info, o1, lh1, MDS_INODELOCK_LAYOUT,
+                            MDT_LOCAL_LOCK);
+       if (rc < 0)
+               GOTO(put, rc);
+
+       rc = mdt_object_lock(info, o2, lh2, MDS_INODELOCK_LAYOUT,
+                            MDT_LOCAL_LOCK);
+       if (rc < 0)
+               GOTO(unlock1, rc);
+
+       rc = mo_swap_layouts(info->mti_env, mdt_object_child(o1),
+                            mdt_object_child(o2), msl->msl_flags);
+       GOTO(unlock2, rc);
+unlock2:
+       mdt_object_unlock(info, o2, lh2, rc);
+unlock1:
+       mdt_object_unlock(info, o1, lh1, rc);
+put:
+       mdt_object_put(info->mti_env, o);
+out:
+       RETURN(rc);
 }
 
 static int mdt_raw_lookup(struct mdt_thread_info *info,
@@ -1116,14 +1221,14 @@ static int mdt_getattr_name_lock(struct mdt_thread_info *info,
         }
         mdt_set_disposition(info, ldlm_rep, DISP_LOOKUP_EXECD);
 
-       rc = mdt_object_exists(parent);
-       if (unlikely(rc == 0)) {
+       if (unlikely(!mdt_object_exists(parent))) {
                LU_OBJECT_DEBUG(D_INODE, info->mti_env,
                                &parent->mot_obj.mo_lu,
                                "Parent doesn't exist!\n");
                RETURN(-ESTALE);
        } else if (!info->mti_cross_ref) {
-               LASSERTF(rc > 0, "Parent "DFID" is on remote server\n",
+               LASSERTF(!mdt_object_remote(parent),
+                        "Parent "DFID" is on remote server\n",
                         PFID(mdt_object_fid(parent)));
        }
         if (lname) {
@@ -1160,9 +1265,9 @@ static int mdt_getattr_name_lock(struct mdt_thread_info *info,
                          * needed here but update is.
                          */
                         child_bits &= ~MDS_INODELOCK_LOOKUP;
-                        child_bits |= MDS_INODELOCK_UPDATE;
+                       child_bits |= MDS_INODELOCK_PERM | MDS_INODELOCK_UPDATE;
 
-                        rc = mdt_object_lock(info, child, lhc, child_bits,
+                       rc = mdt_object_lock(info, child, lhc, child_bits,
                                              MDT_LOCAL_LOCK);
                 }
                 if (rc == 0) {
@@ -1247,14 +1352,15 @@ relock:
                 mdt_lock_handle_init(lhc);
                mdt_lock_reg_init(lhc, LCK_PR);
 
-                if (mdt_object_exists(child) == 0) {
-                        LU_OBJECT_DEBUG(D_INODE, info->mti_env,
-                                        &child->mot_obj.mo_lu,
-                                        "Object doesn't exist!\n");
-                        GOTO(out_child, rc = -ENOENT);
-                }
+               if (!mdt_object_exists(child)) {
+                       LU_OBJECT_DEBUG(D_INODE, info->mti_env,
+                                       &child->mot_obj.mo_lu,
+                                       "Object doesn't exist!\n");
+                       GOTO(out_child, rc = -ENOENT);
+               }
 
-                if (!(child_bits & MDS_INODELOCK_UPDATE)) {
+               if (!(child_bits & MDS_INODELOCK_UPDATE) &&
+                     mdt_object_exists(child) && !mdt_object_remote(child)) {
                         struct md_attr *ma = &info->mti_attr;
 
                         ma->ma_valid = 0;
@@ -1331,7 +1437,8 @@ relock:
                          (unsigned long)res_id->name[1],
                          (unsigned long)res_id->name[2],
                          PFID(mdt_object_fid(child)));
-                mdt_pack_size2body(info, child);
+               if (mdt_object_exists(child) && !mdt_object_remote(child))
+                       mdt_pack_size2body(info, child);
         }
         if (lock)
                 LDLM_LOCK_PUT(lock);
@@ -1514,10 +1621,10 @@ static int mdt_sendpage(struct mdt_thread_info *info,
         int                      rc;
         ENTRY;
 
-        desc = ptlrpc_prep_bulk_exp(req, rdpg->rp_npages, BULK_PUT_SOURCE,
-                                    MDS_BULK_PORTAL);
-        if (desc == NULL)
-                RETURN(-ENOMEM);
+       desc = ptlrpc_prep_bulk_exp(req, rdpg->rp_npages, 1, BULK_PUT_SOURCE,
+                                   MDS_BULK_PORTAL);
+       if (desc == NULL)
+               RETURN(-ENOMEM);
 
        if (!(exp_connect_flags(exp) & OBD_CONNECT_BRW_SIZE))
                /* old client requires reply size in it's PAGE_SIZE,
@@ -1570,7 +1677,7 @@ int mdt_readpage(struct mdt_thread_info *info)
        if (exp_connect_flags(info->mti_exp) & OBD_CONNECT_64BITHASH)
                rdpg->rp_attrs |= LUDA_64BITHASH;
        rdpg->rp_count  = min_t(unsigned int, reqbody->nlink,
-                               exp_brw_size(info->mti_exp));
+                               exp_max_brw_size(info->mti_exp));
         rdpg->rp_npages = (rdpg->rp_count + CFS_PAGE_SIZE - 1) >>
                           CFS_PAGE_SHIFT;
         OBD_ALLOC(rdpg->rp_pages, rdpg->rp_npages * sizeof rdpg->rp_pages[0]);
@@ -1675,40 +1782,45 @@ out_shrink:
 }
 
 static long mdt_reint_opcode(struct mdt_thread_info *info,
-                             const struct req_format **fmt)
-{
-        struct mdt_rec_reint *rec;
-        long opc;
-
-        opc = err_serious(-EFAULT);
-        rec = req_capsule_client_get(info->mti_pill, &RMF_REC_REINT);
-        if (rec != NULL) {
-                opc = rec->rr_opcode;
-                DEBUG_REQ(D_INODE, mdt_info_req(info), "reint opt = %ld", opc);
-                if (opc < REINT_MAX && fmt[opc] != NULL)
-                        req_capsule_extend(info->mti_pill, fmt[opc]);
-                else {
-                        CERROR("Unsupported opc: %ld\n", opc);
-                        opc = err_serious(opc);
-                }
-        }
-        return opc;
+                            const struct req_format **fmt)
+{
+       struct mdt_rec_reint *rec;
+       long opc;
+
+       rec = req_capsule_client_get(info->mti_pill, &RMF_REC_REINT);
+       if (rec != NULL) {
+               opc = rec->rr_opcode;
+               DEBUG_REQ(D_INODE, mdt_info_req(info), "reint opt = %ld", opc);
+               if (opc < REINT_MAX && fmt[opc] != NULL)
+                       req_capsule_extend(info->mti_pill, fmt[opc]);
+               else {
+                       CERROR("%s: Unsupported opcode '%ld' from client '%s': "
+                              "rc = %d\n", mdt_obd_name(info->mti_mdt), opc,
+                              info->mti_mdt->mdt_ldlm_client->cli_name,
+                              -EFAULT);
+                       opc = err_serious(-EFAULT);
+               }
+       } else {
+               opc = err_serious(-EFAULT);
+       }
+       return opc;
 }
 
 int mdt_reint(struct mdt_thread_info *info)
 {
-        long opc;
-        int  rc;
-
-        static const struct req_format *reint_fmts[REINT_MAX] = {
-                [REINT_SETATTR]  = &RQF_MDS_REINT_SETATTR,
-                [REINT_CREATE]   = &RQF_MDS_REINT_CREATE,
-                [REINT_LINK]     = &RQF_MDS_REINT_LINK,
-                [REINT_UNLINK]   = &RQF_MDS_REINT_UNLINK,
-                [REINT_RENAME]   = &RQF_MDS_REINT_RENAME,
-                [REINT_OPEN]     = &RQF_MDS_REINT_OPEN,
-                [REINT_SETXATTR] = &RQF_MDS_REINT_SETXATTR
-        };
+       long opc;
+       int  rc;
+
+       static const struct req_format *reint_fmts[REINT_MAX] = {
+               [REINT_SETATTR]  = &RQF_MDS_REINT_SETATTR,
+               [REINT_CREATE]   = &RQF_MDS_REINT_CREATE,
+               [REINT_LINK]     = &RQF_MDS_REINT_LINK,
+               [REINT_UNLINK]   = &RQF_MDS_REINT_UNLINK,
+               [REINT_RENAME]   = &RQF_MDS_REINT_RENAME,
+               [REINT_OPEN]     = &RQF_MDS_REINT_OPEN,
+               [REINT_SETXATTR] = &RQF_MDS_REINT_SETXATTR,
+               [REINT_RMENTRY] = &RQF_MDS_REINT_UNLINK
+       };
 
         ENTRY;
 
@@ -2003,7 +2115,7 @@ int mdt_obd_idx_read(struct mdt_thread_info *info)
        if (req_ii->ii_count <= 0)
                GOTO(out, rc = -EFAULT);
        rdpg->rp_count = min_t(unsigned int, req_ii->ii_count << LU_PAGE_SHIFT,
-                              exp_brw_size(info->mti_exp));
+                              exp_max_brw_size(info->mti_exp));
        rdpg->rp_npages = (rdpg->rp_count + CFS_PAGE_SIZE -1) >> CFS_PAGE_SHIFT;
 
        /* allocate pages to store the containers */
@@ -2378,6 +2490,57 @@ int mdt_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
         RETURN(rc);
 }
 
+int mdt_md_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
+                       void *data, int flag)
+{
+       struct lustre_handle lockh;
+       int               rc;
+
+       switch (flag) {
+       case LDLM_CB_BLOCKING:
+               ldlm_lock2handle(lock, &lockh);
+               rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
+               if (rc < 0) {
+                       CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
+                       RETURN(rc);
+               }
+               break;
+       case LDLM_CB_CANCELING:
+               LDLM_DEBUG(lock, "Revoke remote lock\n");
+               break;
+       default:
+               LBUG();
+       }
+       RETURN(0);
+}
+
+int mdt_remote_object_lock(struct mdt_thread_info *mti,
+                          struct mdt_object *o, struct lustre_handle *lh,
+                          ldlm_mode_t mode, __u64 ibits)
+{
+       struct ldlm_enqueue_info *einfo = &mti->mti_einfo;
+       ldlm_policy_data_t *policy = &mti->mti_policy;
+       int rc = 0;
+       ENTRY;
+
+       LASSERT(mdt_object_remote(o));
+
+       LASSERT((ibits & MDS_INODELOCK_UPDATE));
+
+       memset(einfo, 0, sizeof(*einfo));
+       einfo->ei_type = LDLM_IBITS;
+       einfo->ei_mode = mode;
+       einfo->ei_cb_bl = mdt_md_blocking_ast;
+       einfo->ei_cb_cp = ldlm_completion_ast;
+
+       memset(policy, 0, sizeof(*policy));
+       policy->l_inodebits.bits = ibits;
+
+       rc = mo_object_lock(mti->mti_env, mdt_object_child(o), lh, einfo,
+                           policy);
+       RETURN(rc);
+}
+
 static int mdt_object_lock0(struct mdt_thread_info *info, struct mdt_object *o,
                            struct mdt_lock_handle *lh, __u64 ibits,
                            bool nonblock, int locality)
@@ -2394,20 +2557,23 @@ static int mdt_object_lock0(struct mdt_thread_info *info, struct mdt_object *o,
         LASSERT(lh->mlh_reg_mode != LCK_MINMODE);
         LASSERT(lh->mlh_type != MDT_NUL_LOCK);
 
-        if (mdt_object_exists(o) < 0) {
+       if (mdt_object_remote(o)) {
                 if (locality == MDT_CROSS_LOCK) {
-                        /* cross-ref object fix */
-                        ibits &= ~MDS_INODELOCK_UPDATE;
+                       ibits &= ~(MDS_INODELOCK_UPDATE | MDS_INODELOCK_PERM);
                         ibits |= MDS_INODELOCK_LOOKUP;
                 } else {
-                        LASSERT(!(ibits & MDS_INODELOCK_UPDATE));
+                       LASSERTF(!(ibits &
+                                 (MDS_INODELOCK_UPDATE | MDS_INODELOCK_PERM)),
+                               "%s: wrong bit "LPX64" for remote obj "DFID"\n",
+                               mdt_obd_name(info->mti_mdt), ibits,
+                               PFID(mdt_object_fid(o)));
                         LASSERT(ibits & MDS_INODELOCK_LOOKUP);
                 }
                 /* No PDO lock on remote object */
                 LASSERT(lh->mlh_type != MDT_PDO_LOCK);
         }
 
-        if (lh->mlh_type == MDT_PDO_LOCK) {
+       if (lh->mlh_type == MDT_PDO_LOCK) {
                 /* check for exists after object is locked */
                 if (mdt_object_exists(o) == 0) {
                         /* Non-existent object shouldn't have PDO lock */
@@ -2568,6 +2734,9 @@ void mdt_object_unlock(struct mdt_thread_info *info, struct mdt_object *o,
         mdt_save_lock(info, &lh->mlh_pdo_lh, lh->mlh_pdo_mode, decref);
         mdt_save_lock(info, &lh->mlh_reg_lh, lh->mlh_reg_mode, decref);
 
+       if (lustre_handle_is_used(&lh->mlh_rreg_lh))
+               ldlm_lock_decref(&lh->mlh_rreg_lh, lh->mlh_rreg_mode);
+
         EXIT;
 }
 
@@ -2882,6 +3051,8 @@ void mdt_lock_handle_init(struct mdt_lock_handle *lh)
         lh->mlh_reg_mode = LCK_MINMODE;
         lh->mlh_pdo_lh.cookie = 0ull;
         lh->mlh_pdo_mode = LCK_MINMODE;
+       lh->mlh_rreg_lh.cookie = 0ull;
+       lh->mlh_rreg_mode = LCK_MINMODE;
 }
 
 void mdt_lock_handle_fini(struct mdt_lock_handle *lh)
@@ -2919,7 +3090,8 @@ static void mdt_thread_info_init(struct ptlrpc_request *req,
         info->mti_mos = NULL;
 
         memset(&info->mti_attr, 0, sizeof(info->mti_attr));
-        info->mti_body = NULL;
+       info->mti_big_buf = LU_BUF_NULL;
+       info->mti_body = NULL;
         info->mti_object = NULL;
         info->mti_dlm_req = NULL;
         info->mti_has_trans = 0;
@@ -2929,20 +3101,25 @@ static void mdt_thread_info_init(struct ptlrpc_request *req,
 
         /* To not check for split by default. */
         info->mti_spec.no_create = 0;
+       info->mti_spec.sp_rm_entry = 0;
 }
 
 static void mdt_thread_info_fini(struct mdt_thread_info *info)
 {
-        int i;
+       int i;
 
-        req_capsule_fini(info->mti_pill);
-        if (info->mti_object != NULL) {
-                mdt_object_put(info->mti_env, info->mti_object);
-                info->mti_object = NULL;
-        }
-        for (i = 0; i < ARRAY_SIZE(info->mti_lh); i++)
-                mdt_lock_handle_fini(&info->mti_lh[i]);
-        info->mti_env = NULL;
+       req_capsule_fini(info->mti_pill);
+       if (info->mti_object != NULL) {
+               mdt_object_put(info->mti_env, info->mti_object);
+               info->mti_object = NULL;
+       }
+
+       for (i = 0; i < ARRAY_SIZE(info->mti_lh); i++)
+               mdt_lock_handle_fini(&info->mti_lh[i]);
+       info->mti_env = NULL;
+
+       if (unlikely(info->mti_big_buf.lb_buf != NULL))
+               lu_buf_free(&info->mti_big_buf);
 }
 
 static int mdt_filter_recovery_request(struct ptlrpc_request *req,
@@ -2962,6 +3139,7 @@ static int mdt_filter_recovery_request(struct ptlrpc_request *req,
         case MDS_SYNC: /* used in unmounting */
         case OBD_PING:
         case MDS_REINT:
+       case UPDATE_OBJ:
         case SEQ_QUERY:
         case FLD_QUERY:
         case LDLM_ENQUEUE:
@@ -3105,6 +3283,7 @@ static int mdt_msg_check_version(struct lustre_msg *msg)
         case MDS_QUOTACHECK:
         case MDS_QUOTACTL:
        case UPDATE_OBJ:
+       case MDS_SWAP_LAYOUTS:
         case QUOTA_DQACQ:
         case QUOTA_DQREL:
         case SEQ_QUERY:
@@ -3507,10 +3686,11 @@ static int mdt_intent_getattr(enum mdt_it_code opcode,
 
         switch (opcode) {
         case MDT_IT_LOOKUP:
-                child_bits = MDS_INODELOCK_LOOKUP;
+               child_bits = MDS_INODELOCK_LOOKUP | MDS_INODELOCK_PERM;
                 break;
         case MDT_IT_GETATTR:
-                child_bits = MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE;
+               child_bits = MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE |
+                            MDS_INODELOCK_PERM;
                 break;
         default:
                 CERROR("Unsupported intent (%d)\n", opcode);
@@ -3528,8 +3708,8 @@ static int mdt_intent_getattr(enum mdt_it_code opcode,
         /* Get lock from request for possible resent case. */
         mdt_intent_fixup_resent(info, *lockp, &new_lock, lhc);
 
-        ldlm_rep->lock_policy_res2 =
-                mdt_getattr_name_lock(info, lhc, child_bits, ldlm_rep);
+       rc = mdt_getattr_name_lock(info, lhc, child_bits, ldlm_rep);
+       ldlm_rep->lock_policy_res2 = clear_serious(rc);
 
         if (mdt_get_disposition(ldlm_rep, DISP_LOOKUP_NEG))
                 ldlm_rep->lock_policy_res2 = 0;
@@ -3557,15 +3737,38 @@ static int mdt_intent_layout(enum mdt_it_code opcode,
                             __u64 flags)
 {
        struct layout_intent *layout;
+       struct lu_fid *fid;
+       struct mdt_object *obj = NULL;
+       struct md_object *child = NULL;
        int rc;
        ENTRY;
 
        if (opcode != MDT_IT_LAYOUT) {
-               CERROR("%s: Unknown intent (%d)\n",
-                       info->mti_exp->exp_obd->obd_name, opcode);
+               CERROR("%s: Unknown intent (%d)\n", mdt_obd_name(info->mti_mdt),
+                       opcode);
                RETURN(-EINVAL);
        }
 
+       fid = &info->mti_tmp_fid2;
+       fid_build_from_res_name(fid, &(*lockp)->l_resource->lr_name);
+
+       obj = mdt_object_find(info->mti_env, info->mti_mdt, fid);
+       if (IS_ERR(obj))
+               RETURN(PTR_ERR(obj));
+
+       if (mdt_object_exists(obj) && !mdt_object_remote(obj)) {
+               child = mdt_object_child(obj);
+
+               /* get the length of lsm */
+               rc = mo_xattr_get(info->mti_env, child, &LU_BUF_NULL,
+                                 XATTR_NAME_LOV);
+
+               if (rc > info->mti_mdt->mdt_max_mdsize)
+                       info->mti_mdt->mdt_max_mdsize = rc;
+       }
+
+       mdt_object_put(info->mti_env, obj);
+
        (*lockp)->l_lvb_type = LVB_T_LAYOUT;
        req_capsule_set_size(info->mti_pill, &RMF_DLM_LVB, RCL_SERVER,
                        ldlm_lvbo_size(*lockp));
@@ -3580,7 +3783,7 @@ static int mdt_intent_layout(enum mdt_it_code opcode,
                RETURN(0);
 
        CERROR("%s: Unsupported layout intent (%d)\n",
-               info->mti_exp->exp_obd->obd_name, layout->li_opc);
+               mdt_obd_name(info->mti_mdt), layout->li_opc);
        RETURN(-EINVAL);
 }
 
@@ -3626,14 +3829,16 @@ static int mdt_intent_reint(enum mdt_it_code opcode,
         if (rc != 0)
                 mdt_set_disposition(info, rep, DISP_LOOKUP_EXECD);
 
-        /* Cross-ref case, the lock should be returned to the client */
-        if (rc == -EREMOTE) {
-                LASSERT(lustre_handle_is_used(&lhc->mlh_reg_lh));
-                rep->lock_policy_res2 = 0;
-                rc = mdt_intent_lock_replace(info, lockp, NULL, lhc, flags);
-                RETURN(rc);
-        }
-        rep->lock_policy_res2 = clear_serious(rc);
+       /* the open lock or the lock for cross-ref object should be
+        * returned to the client */
+       if (rc == -EREMOTE || mdt_get_disposition(rep, DISP_OPEN_LOCK)) {
+               LASSERT(lustre_handle_is_used(&lhc->mlh_reg_lh));
+               rep->lock_policy_res2 = 0;
+               rc = mdt_intent_lock_replace(info, lockp, NULL, lhc, flags);
+               RETURN(rc);
+       }
+
+       rep->lock_policy_res2 = clear_serious(rc);
 
         if (rep->lock_policy_res2 == -ENOENT &&
             mdt_get_disposition(rep, DISP_LOOKUP_NEG))
@@ -3759,8 +3964,14 @@ static int mdt_intent_opc(long itopc, struct mdt_thread_info *info,
                        RETURN(-EROFS);
         }
         if (rc == 0 && flv->it_act != NULL) {
-                /* execute policy */
-                rc = flv->it_act(opc, info, lockp, flags);
+               struct ldlm_reply *rep;
+
+               /* execute policy */
+               rc = flv->it_act(opc, info, lockp, flags);
+
+               rep = req_capsule_server_get(pill, &RMF_DLM_REP);
+               rep->lock_policy_res2 =
+                       ptlrpc_status_hton(rep->lock_policy_res2);
         } else {
                 rc = -EOPNOTSUPP;
         }
@@ -3980,7 +4191,7 @@ static void mdt_stack_pre_fini(const struct lu_env *env,
         * objects (some of them are pinned by osd, for example *
         * the proper solution should be a model where object used
         * by osd only doesn't have mdt/mdd slices -bzzz */
-       lustre_cfg_bufs_reset(bufs, obd->obd_name);
+       lustre_cfg_bufs_reset(bufs, mdt_obd_name(m));
        lustre_cfg_bufs_set_string(bufs, 1, NULL);
        lcfg = lustre_cfg_new(LCFG_PRE_CLEANUP, bufs);
        if (!lcfg) {
@@ -3993,44 +4204,44 @@ static void mdt_stack_pre_fini(const struct lu_env *env,
 }
 
 static void mdt_stack_fini(const struct lu_env *env,
-                           struct mdt_device *m, struct lu_device *top)
+                          struct mdt_device *m, struct lu_device *top)
 {
-        struct obd_device       *obd = mdt2obd_dev(m);
-        struct lustre_cfg_bufs  *bufs;
-        struct lustre_cfg       *lcfg;
-        struct mdt_thread_info  *info;
-        char flags[3]="";
-        ENTRY;
+       struct obd_device       *obd = mdt2obd_dev(m);
+       struct lustre_cfg_bufs  *bufs;
+       struct lustre_cfg       *lcfg;
+       struct mdt_thread_info  *info;
+       char                     flags[3] = "";
+       ENTRY;
 
-        info = lu_context_key_get(&env->le_ctx, &mdt_thread_key);
-        LASSERT(info != NULL);
+       info = lu_context_key_get(&env->le_ctx, &mdt_thread_key);
+       LASSERT(info != NULL);
 
        lu_dev_del_linkage(top->ld_site, top);
 
        lu_site_purge(env, top->ld_site, -1);
 
-        bufs = &info->mti_u.bufs;
-        /* process cleanup, pass mdt obd name to get obd umount flags */
+       bufs = &info->mti_u.bufs;
+       /* process cleanup, pass mdt obd name to get obd umount flags */
        /* another purpose is to let all layers to release their objects */
-        lustre_cfg_bufs_reset(bufs, obd->obd_name);
-        if (obd->obd_force)
-                strcat(flags, "F");
-        if (obd->obd_fail)
-                strcat(flags, "A");
-        lustre_cfg_bufs_set_string(bufs, 1, flags);
-        lcfg = lustre_cfg_new(LCFG_CLEANUP, bufs);
-        if (!lcfg) {
-                CERROR("Cannot alloc lcfg!\n");
-                return;
-        }
-        LASSERT(top);
-        top->ld_ops->ldo_process_config(env, top, lcfg);
-        lustre_cfg_free(lcfg);
+       lustre_cfg_bufs_reset(bufs, mdt_obd_name(m));
+       if (obd->obd_force)
+               strcat(flags, "F");
+       if (obd->obd_fail)
+               strcat(flags, "A");
+       lustre_cfg_bufs_set_string(bufs, 1, flags);
+       lcfg = lustre_cfg_new(LCFG_CLEANUP, bufs);
+       if (!lcfg) {
+               CERROR("Cannot alloc lcfg!\n");
+               return;
+       }
+       LASSERT(top);
+       top->ld_ops->ldo_process_config(env, top, lcfg);
+       lustre_cfg_free(lcfg);
 
        lu_site_purge(env, top->ld_site, -1);
 
-        m->mdt_child = NULL;
-        m->mdt_bottom = NULL;
+       m->mdt_child = NULL;
+       m->mdt_bottom = NULL;
 
        obd_disconnect(m->mdt_child_exp);
        m->mdt_child_exp = NULL;
@@ -4054,7 +4265,7 @@ static int mdt_connect_to_next(const struct lu_env *env, struct mdt_device *m,
        obd = class_name2obd(next);
        if (obd == NULL) {
                CERROR("%s: can't locate next device: %s\n",
-                      m->mdt_md_dev.md_lu_dev.ld_obd->obd_name, next);
+                      mdt_obd_name(m), next);
                GOTO(out, rc = -ENOTCONN);
        }
 
@@ -4064,7 +4275,7 @@ static int mdt_connect_to_next(const struct lu_env *env, struct mdt_device *m,
        rc = obd_connect(NULL, exp, obd, &obd->obd_uuid, data, NULL);
        if (rc) {
                CERROR("%s: cannot connect to next dev %s (%d)\n",
-                      m->mdt_md_dev.md_lu_dev.ld_obd->obd_name, next, rc);
+                      mdt_obd_name(m), next, rc);
                GOTO(out, rc);
        }
 
@@ -4180,7 +4391,7 @@ static int mdt_stack_init(const struct lu_env *env, struct mdt_device *mdt,
 
        site = mdt->mdt_child_exp->exp_obd->obd_lu_dev->ld_site;
        LASSERT(site);
-       LASSERT(mdt->mdt_md_dev.md_lu_dev.ld_site == NULL);
+       LASSERT(mdt_lu_site(mdt) == NULL);
        mdt->mdt_md_dev.md_lu_dev.ld_site = site;
        site->ls_top_dev = &mdt->mdt_md_dev.md_lu_dev;
        mdt->mdt_child = lu2md_dev(mdt->mdt_child_exp->exp_obd->obd_lu_dev);
@@ -4404,6 +4615,8 @@ static void mdt_fini(const struct lu_env *env, struct mdt_device *m)
                 m->mdt_nosquash_strlen = 0;
         }
 
+       next->md_ops->mdo_iocontrol(env, next, OBD_IOC_PAUSE_LFSCK,
+                                   0, NULL);
         mdt_seq_fini(env, m);
         mdt_fld_fini(env, m);
         sptlrpc_rule_set_free(&m->mdt_sptlrpc_rset);
@@ -4412,40 +4625,40 @@ static void mdt_fini(const struct lu_env *env, struct mdt_device *m)
         cfs_timer_disarm(&m->mdt_ck_timer);
         mdt_ck_thread_stop(m);
 
-        /*
-         * Finish the stack
-         */
-        mdt_stack_fini(env, m, md2lu_dev(m->mdt_child));
+       /*
+        * Finish the stack
+        */
+       mdt_stack_fini(env, m, md2lu_dev(m->mdt_child));
 
-        LASSERT(cfs_atomic_read(&d->ld_ref) == 0);
+       LASSERT(cfs_atomic_read(&d->ld_ref) == 0);
 
-       server_put_mount(mdt2obd_dev(m)->obd_name, NULL);
+       server_put_mount(mdt_obd_name(m), NULL);
 
-        EXIT;
+       EXIT;
 }
 
+/**
+ * Reload the sptlrpc rule set of the MDT that sits behind \a obd.
+ *
+ * The rules are first fetched into a temporary set with
+ * sptlrpc_conf_target_get_rules(), then the export flavors are updated
+ * from it, and finally the temporary set replaces m->mdt_sptlrpc_rset
+ * (the previous set is freed) while mdt_sptlrpc_lock is write-locked.
+ *
+ * \param obd      the MDT obd device
+ * \param initial  passed through to sptlrpc_conf_target_get_rules()
+ *
+ * \retval 0         on success
+ * \retval non-zero  the error returned by sptlrpc_conf_target_get_rules()
+ */
 static int mdt_adapt_sptlrpc_conf(struct obd_device *obd, int initial)
 {
-        struct mdt_device       *m = mdt_dev(obd->obd_lu_dev);
-        struct sptlrpc_rule_set  tmp_rset;
-        int                      rc;
+       struct mdt_device       *m = mdt_dev(obd->obd_lu_dev);
+       struct sptlrpc_rule_set  tmp_rset;
+       int                      rc;
 
-        sptlrpc_rule_set_init(&tmp_rset);
-        rc = sptlrpc_conf_target_get_rules(obd, &tmp_rset, initial);
-        if (rc) {
-                CERROR("mdt %s: failed get sptlrpc rules: %d\n",
-                       obd->obd_name, rc);
-                return rc;
-        }
+       sptlrpc_rule_set_init(&tmp_rset);
+       rc = sptlrpc_conf_target_get_rules(obd, &tmp_rset, initial);
+       if (rc) {
+               CERROR("mdt %s: failed get sptlrpc rules: %d\n",
+                      mdt_obd_name(m), rc);
+               return rc;
+       }
 
-        sptlrpc_target_update_exp_flavor(obd, &tmp_rset);
+       sptlrpc_target_update_exp_flavor(obd, &tmp_rset);
 
+       /* swap in the new rules; the struct copy hands tmp_rset's contents
+        * over to the device, so tmp_rset itself is not freed here */
        write_lock(&m->mdt_sptlrpc_lock);
-        sptlrpc_rule_set_free(&m->mdt_sptlrpc_rset);
-        m->mdt_sptlrpc_rset = tmp_rset;
+       sptlrpc_rule_set_free(&m->mdt_sptlrpc_rset);
+       m->mdt_sptlrpc_rset = tmp_rset;
        write_unlock(&m->mdt_sptlrpc_lock);
 
-        return 0;
+       return 0;
 }
 
 int mdt_postrecov(const struct lu_env *, struct mdt_device *);
@@ -4523,6 +4736,7 @@ static int mdt_init0(const struct lu_env *env, struct mdt_device *m,
        spin_lock_init(&m->mdt_osfs_lock);
        m->mdt_osfs_age = cfs_time_shift_64(-1000);
        m->mdt_enable_remote_dir = 0;
+       m->mdt_enable_remote_dir_gid = 0;
 
         m->mdt_md_dev.md_lu_dev.ld_ops = &mdt_lu_ops;
         m->mdt_md_dev.md_lu_dev.ld_obd = obd;
@@ -4532,12 +4746,13 @@ static int mdt_init0(const struct lu_env *env, struct mdt_device *m,
        /* init the stack */
        rc = mdt_stack_init((struct lu_env *)env, m, cfg);
        if (rc) {
-               CERROR("Can't init device stack, rc %d\n", rc);
+               CERROR("%s: Can't init device stack, rc %d\n",
+                      mdt_obd_name(m), rc);
                GOTO(err_lmi, rc);
        }
 
-       s = m->mdt_md_dev.md_lu_dev.ld_site;
-       ss_site = &m->mdt_seq_site;
+       s = mdt_lu_site(m);
+       ss_site = mdt_seq_site(m);
        s->ld_seq_site = ss_site;
        ss_site->ss_lu = s;
 
@@ -4552,28 +4767,28 @@ static int mdt_init0(const struct lu_env *env, struct mdt_device *m,
         /* No connection accepted until configurations will finish */
         obd->obd_no_conn = 1;
 
-        if (cfg->lcfg_bufcount > 4 && LUSTRE_CFG_BUFLEN(cfg, 4) > 0) {
-                char *str = lustre_cfg_string(cfg, 4);
-                if (strchr(str, 'n')) {
-                        CWARN("%s: recovery disabled\n", obd->obd_name);
-                        obd->obd_replayable = 0;
-                }
-        }
+       if (cfg->lcfg_bufcount > 4 && LUSTRE_CFG_BUFLEN(cfg, 4) > 0) {
+               char *str = lustre_cfg_string(cfg, 4);
+               if (strchr(str, 'n')) {
+                       CWARN("%s: recovery disabled\n", mdt_obd_name(m));
+                       obd->obd_replayable = 0;
+               }
+       }
 
         rc = tgt_init(env, &m->mdt_lut, obd, m->mdt_bottom);
         if (rc)
                 GOTO(err_fini_stack, rc);
 
-       rc = mdt_fld_init(env, obd->obd_name, m);
+       rc = mdt_fld_init(env, mdt_obd_name(m), m);
        if (rc)
                GOTO(err_lut, rc);
 
-       rc = mdt_seq_init(env, obd->obd_name, m);
+       rc = mdt_seq_init(env, mdt_obd_name(m), m);
        if (rc)
                GOTO(err_fini_fld, rc);
 
-        snprintf(info->mti_u.ns_name, sizeof info->mti_u.ns_name,
-                 LUSTRE_MDT_NAME"-%p", m);
+       snprintf(info->mti_u.ns_name, sizeof(info->mti_u.ns_name), "%s-%s",
+                LUSTRE_MDT_NAME, obd->obd_uuid.uuid);
         m->mdt_namespace = ldlm_namespace_new(obd, info->mti_u.ns_name,
                                               LDLM_NAMESPACE_SERVER,
                                               LDLM_NAMESPACE_GREEDY,
@@ -4616,18 +4831,19 @@ static int mdt_init0(const struct lu_env *env, struct mdt_device *m,
         else
                 m->mdt_opts.mo_acl = 0;
 
-        /* XXX: to support suppgid for ACL, we enable identity_upcall
-         * by default, otherwise, maybe got unexpected -EACCESS. */
-        if (m->mdt_opts.mo_acl)
-                identity_upcall = MDT_IDENTITY_UPCALL_PATH;
-
-        m->mdt_identity_cache = upcall_cache_init(obd->obd_name,identity_upcall,
-                                                &mdt_identity_upcall_cache_ops);
-        if (IS_ERR(m->mdt_identity_cache)) {
-                rc = PTR_ERR(m->mdt_identity_cache);
-                m->mdt_identity_cache = NULL;
+       /* XXX: to support suppgid for ACL, we enable identity_upcall
+        * by default, otherwise, maybe got unexpected -EACCESS. */
+       if (m->mdt_opts.mo_acl)
+               identity_upcall = MDT_IDENTITY_UPCALL_PATH;
+
+       m->mdt_identity_cache = upcall_cache_init(mdt_obd_name(m),
+                                               identity_upcall,
+                                               &mdt_identity_upcall_cache_ops);
+       if (IS_ERR(m->mdt_identity_cache)) {
+               rc = PTR_ERR(m->mdt_identity_cache);
+               m->mdt_identity_cache = NULL;
                GOTO(err_llog_cleanup, rc);
-        }
+       }
 
         rc = mdt_procfs_init(m, dev);
         if (rc) {
@@ -4863,11 +5079,21 @@ static int mdt_prepare(const struct lu_env *env,
        if (rc)
                RETURN(rc);
 
-       rc = mdt->mdt_child->md_ops->mdo_root_get(env, mdt->mdt_child,
-                                                 &mdt->mdt_md_root_fid);
-       if (rc)
-               RETURN(rc);
+       rc = mdt->mdt_child->md_ops->mdo_iocontrol(env, mdt->mdt_child,
+                                                  OBD_IOC_START_LFSCK,
+                                                  0, NULL);
+       if (rc != 0) {
+               CWARN("%s: auto trigger paused LFSCK failed: rc = %d\n",
+                     mdt_obd_name(mdt), rc);
+               rc = 0;
+       }
 
+       if (mdt->mdt_seq_site.ss_node_id == 0) {
+               rc = mdt->mdt_child->md_ops->mdo_root_get(env, mdt->mdt_child,
+                                                        &mdt->mdt_md_root_fid);
+               if (rc)
+                       RETURN(rc);
+       }
        LASSERT(!test_bit(MDT_FL_CFGLOG, &mdt->mdt_state));
        target_recovery_init(&mdt->mdt_lut, mdt_recovery_handle);
        set_bit(MDT_FL_CFGLOG, &mdt->mdt_state);
@@ -4970,7 +5196,7 @@ static int mdt_connect_internal(struct obd_export *exp,
                               "unexpectedly zero, network data "
                               "corruption? Refusing connection of this"
                               " client\n",
-                              exp->exp_obd->obd_name,
+                              mdt_obd_name(mdt),
                               exp->exp_client_uuid.uuid,
                               exp, data->ocd_connect_flags, data->ocd_version,
                               data->ocd_grant, data->ocd_index);
@@ -4995,7 +5221,7 @@ static int mdt_connect_internal(struct obd_export *exp,
 
        if ((data->ocd_connect_flags & OBD_CONNECT_FID) == 0) {
                CWARN("%s: MDS requires FID support, but client not\n",
-                     mdt->mdt_md_dev.md_lu_dev.ld_obd->obd_name);
+                     mdt_obd_name(mdt));
                return -EBADE;
        }
 
@@ -5004,10 +5230,20 @@ static int mdt_connect_internal(struct obd_export *exp,
                                         OBD_CONNECT_MDS_MDS |
                                         OBD_CONNECT_SOM))) {
                CWARN("%s: MDS has SOM enabled, but client does not support "
-                     "it\n", mdt->mdt_md_dev.md_lu_dev.ld_obd->obd_name);
+                     "it\n", mdt_obd_name(mdt));
                return -EBADE;
        }
 
+       if (OCD_HAS_FLAG(data, PINGLESS)) {
+               if (ptlrpc_pinger_suppress_pings()) {
+                       spin_lock(&exp->exp_obd->obd_dev_lock);
+                       list_del_init(&exp->exp_obd_chain_timed);
+                       spin_unlock(&exp->exp_obd->obd_dev_lock);
+               } else {
+                       data->ocd_connect_flags &= ~OBD_CONNECT_PINGLESS;
+               }
+       }
+
        return 0;
 }
 
@@ -5298,32 +5534,201 @@ static int mdt_destroy_export(struct obd_export *exp)
         RETURN(0);
 }
 
-static int mdt_rpc_fid2path(struct mdt_thread_info *info, void *key,
-                            void *val, int vallen)
+/** The maximum depth that fid2path() will search.
+ * This is limited only because we want to store the fids for
+ * historical path lookup purposes.
+ */
+#define MAX_PATH_DEPTH 100
+
+/** mdt_path() lookup structure. */
+struct path_lookup_info {
+       __u64                   pli_recno;      /**< history point */
+       __u64                   pli_currec;     /**< current record */
+       /** NOTE(review): not referenced by mdt_path()/mdt_path_current() */
+       struct lu_fid           pli_fid;
+       struct lu_fid           pli_fids[MAX_PATH_DEPTH]; /**< path, in fids */
+       /** object whose path is being resolved; its fid seeds pli_fids[0] */
+       struct mdt_object       *pli_mdt_obj;
+       char                    *pli_path;      /**< full path */
+       /** size of the \a pli_path buffer; the last byte holds the NUL */
+       int                     pli_pathlen;
+       int                     pli_linkno;     /**< which hardlink to follow */
+       int                     pli_fidcount;   /**< number of \a pli_fids */
+};
+
+/**
+ * Read the link xattr (XATTR_NAME_LINK) of \a mdt_obj into \a ldata->ld_buf.
+ *
+ * When the first mo_xattr_get() reports -ERANGE the buffer is freed, the
+ * xattr is queried again with the emptied buffer and the value returned
+ * by that call is used as the new buffer size for
+ * lu_buf_check_and_alloc(); the read is then repeated.  On success
+ * \a ldata is set up with linkea_init().
+ *
+ * The caller must pass an \a ldata whose buffer is already allocated.
+ *
+ * \retval 0         on success
+ * \retval -ENODATA  \a mdt_obj does not exist
+ * \retval -ENOMEM   the buffer could not be re-allocated
+ * \retval negative  any other error returned by mo_xattr_get()
+ */
+static int mdt_links_read(struct mdt_thread_info *info,
+                         struct mdt_object *mdt_obj, struct linkea_data *ldata)
 {
-        struct mdt_device *mdt = mdt_dev(info->mti_exp->exp_obd->obd_lu_dev);
-        struct getinfo_fid2path *fpout, *fpin;
-        int rc = 0;
+       int rc;
 
-        fpin = key + cfs_size_round(sizeof(KEY_FID2PATH));
-        fpout = val;
+       LASSERT(ldata->ld_buf->lb_buf != NULL);
+
+       if (!mdt_object_exists(mdt_obj))
+               return -ENODATA;
+
+       rc = mo_xattr_get(info->mti_env, mdt_object_child(mdt_obj),
+                         ldata->ld_buf, XATTR_NAME_LINK);
+       if (rc == -ERANGE) {
+               /* Buf was too small, figure out what we need. */
+               lu_buf_free(ldata->ld_buf);
+               rc = mo_xattr_get(info->mti_env, mdt_object_child(mdt_obj),
+                                 ldata->ld_buf, XATTR_NAME_LINK);
+               if (rc < 0)
+                       return rc;
+               ldata->ld_buf = lu_buf_check_and_alloc(ldata->ld_buf, rc);
+               if (ldata->ld_buf->lb_buf == NULL)
+                       return -ENOMEM;
+               rc = mo_xattr_get(info->mti_env, mdt_object_child(mdt_obj),
+                                 ldata->ld_buf, XATTR_NAME_LINK);
+       }
+       if (rc < 0)
+               return rc;
 
-        if (ptlrpc_req_need_swab(info->mti_pill->rc_req))
-                lustre_swab_fid2path(fpin);
+       /* NOTE(review): any result of linkea_init() is not checked here;
+        * confirm whether it can fail on a malformed link EA */
+       linkea_init(ldata);
 
-        memcpy(fpout, fpin, sizeof(*fpin));
-        if (fpout->gf_pathlen != vallen - sizeof(*fpin))
-                RETURN(-EINVAL);
+       return 0;
+}
 
-        rc = mdt_fid2path(info->mti_env, mdt, fpout);
-        RETURN(rc);
+/**
+ * Build the present-day path of \a pli->pli_mdt_obj from link EAs.
+ *
+ * Starting at the object's own fid, each iteration reads the object's
+ * link EA, takes one (name, parent fid) record from it, prepends "/name"
+ * to the text being assembled at the tail of \a pli->pli_path and goes on
+ * with the parent fid, until the MDT root fid is reached.  The fids
+ * visited are recorded in \a pli->pli_fids, \a pli->pli_fidcount being the
+ * index of the last one stored.  Finally the assembled string is moved to
+ * the start of \a pli->pli_path without its leading '/'.
+ *
+ * Link number \a pli->pli_linkno is honoured only for the object itself
+ * (the final path element); it is advanced when further links exist.
+ *
+ * \retval 0           the root was reached
+ * \retval -EREMOTE    a remote object was met; the partial path is still
+ *                     moved to the front of \a pli->pli_path
+ * \retval -ENOENT     an object on the way does not exist
+ * \retval -EOVERFLOW  the path buffer or MAX_PATH_DEPTH is exhausted
+ * \retval -ENOMEM     the temporary link EA buffer cannot be allocated
+ * \retval negative    error from mdt_object_find() or mdt_links_read()
+ */
+static int mdt_path_current(struct mdt_thread_info *info,
+                           struct path_lookup_info *pli)
+{
+       struct mdt_device       *mdt = info->mti_mdt;
+       struct mdt_object       *mdt_obj;
+       struct link_ea_header   *leh;
+       struct link_ea_entry    *lee;
+       struct lu_name          *tmpname = &info->mti_name;
+       struct lu_fid           *tmpfid = &info->mti_tmp_fid1;
+       struct lu_buf           *buf = &info->mti_big_buf;
+       char                    *ptr;
+       int                     reclen;
+       struct linkea_data      ldata = { 0 };
+       int                     rc = 0;
+       ENTRY;
+
+       /* temp buffer for path element, the buffer will be finally freed
+        * in mdt_thread_info_fini */
+       buf = lu_buf_check_and_alloc(buf, PATH_MAX);
+       if (buf->lb_buf == NULL)
+               RETURN(-ENOMEM);
+
+       ldata.ld_buf = buf;
+       /* the path is assembled backwards, starting from the terminating
+        * NUL in the last byte of the caller's buffer */
+       ptr = pli->pli_path + pli->pli_pathlen - 1;
+       *ptr = 0;
+       --ptr;
+       pli->pli_fidcount = 0;
+       pli->pli_fids[0] = *(struct lu_fid *)mdt_object_fid(pli->pli_mdt_obj);
+
+       /* root FID only exists on MDT0, and fid2path should also ends at MDT0,
+        * so checking root_fid can only happen on MDT0. */
+       while (!lu_fid_eq(&mdt->mdt_md_root_fid,
+                         &pli->pli_fids[pli->pli_fidcount])) {
+               mdt_obj = mdt_object_find(info->mti_env, mdt,
+                                         &pli->pli_fids[pli->pli_fidcount]);
+               if (IS_ERR(mdt_obj))
+                       GOTO(out, rc = PTR_ERR(mdt_obj));
+               if (mdt_object_remote(mdt_obj)) {
+                       mdt_object_put(info->mti_env, mdt_obj);
+                       GOTO(remote_out, rc = -EREMOTE);
+               }
+               if (!mdt_object_exists(mdt_obj)) {
+                       mdt_object_put(info->mti_env, mdt_obj);
+                       GOTO(out, rc = -ENOENT);
+               }
+
+               rc = mdt_links_read(info, mdt_obj, &ldata);
+               mdt_object_put(info->mti_env, mdt_obj);
+               /* propagate the real failure; buf is a valid pointer, so
+                * PTR_ERR(buf) would yield a meaningless value */
+               if (rc != 0)
+                       GOTO(out, rc);
+
+               leh = buf->lb_buf;
+               lee = (struct link_ea_entry *)(leh + 1); /* link #0 */
+               linkea_entry_unpack(lee, &reclen, tmpname, tmpfid);
+               /* If set, use link #linkno for path lookup, otherwise use
+                  link #0.  Only do this for the final path element. */
+               if ((pli->pli_fidcount == 0) &&
+                   (pli->pli_linkno < leh->leh_reccount)) {
+                       int count;
+                       for (count = 0; count < pli->pli_linkno; count++) {
+                               lee = (struct link_ea_entry *)
+                                    ((char *)lee + reclen);
+                               linkea_entry_unpack(lee, &reclen, tmpname,
+                                                   tmpfid);
+                       }
+                       if (pli->pli_linkno < leh->leh_reccount - 1)
+                               /* indicate to user there are more links */
+                               pli->pli_linkno++;
+               }
+
+               /* Pack the name in the end of the buffer */
+               ptr -= tmpname->ln_namelen;
+               if (ptr - 1 <= pli->pli_path)
+                       GOTO(out, rc = -EOVERFLOW);
+               /* no NUL wanted here: the bytes after the name are already
+                * part of the path assembled so far */
+               strncpy(ptr, tmpname->ln_name, tmpname->ln_namelen);
+               *(--ptr) = '/';
+
+               /* Store the parent fid for historic lookup.  The depth is
+                * checked before advancing, so that pli_fidcount keeps
+                * indexing a valid pli_fids[] slot for the caller even
+                * when the lookup is abandoned. */
+               if (pli->pli_fidcount + 1 >= MAX_PATH_DEPTH)
+                       GOTO(out, rc = -EOVERFLOW);
+               pli->pli_fids[++pli->pli_fidcount] = *tmpfid;
+       }
+
+remote_out:
+       ptr++; /* skip leading / */
+       memmove(pli->pli_path, ptr, pli->pli_path + pli->pli_pathlen - ptr);
+
+       EXIT;
+out:
+       return rc;
 }
 
-static int mdt_fid2path(const struct lu_env *env, struct mdt_device *mdt,
-               struct getinfo_fid2path *fp)
+/**
+ * Resolve the full path of \a obj into \a path.
+ *
+ * The MDT root itself yields an empty string.  For any other object a
+ * path_lookup_info is allocated, filled from the arguments and handed to
+ * mdt_path_current(), which is called again (three calls at most) for as
+ * long as it answers -EAGAIN.
+ *
+ * \param[in,out] recno   in: stored as the requested history point;
+ *                        out: pli_currec of the lookup
+ * \param[in,out] linkno  in: which hardlink to follow;
+ *                        out: the link index left by the lookup
+ * \param[out]    fid     the last fid the lookup recorded
+ *
+ * \retval -EOVERFLOW  \a pathlen is smaller than 3
+ * \retval -ENOMEM     the lookup structure cannot be allocated
+ * \retval other       the result of the last mdt_path_current() call
+ */
+static int mdt_path(struct mdt_thread_info *info, struct mdt_object *obj,
+                   char *path, int pathlen, __u64 *recno, int *linkno,
+                   struct lu_fid *fid)
 {
+       struct mdt_device       *mdt = info->mti_mdt;
+       struct path_lookup_info *pli;
+       int                     attempt;
+       int                     rc = -EAGAIN;
+       ENTRY;
+
+       if (pathlen < 3)
+               RETURN(-EOVERFLOW);
+
+       /* the root has no path components to collect */
+       if (lu_fid_eq(&mdt->mdt_md_root_fid, mdt_object_fid(obj))) {
+               path[0] = '\0';
+               RETURN(0);
+       }
+
+       OBD_ALLOC_PTR(pli);
+       if (pli == NULL)
+               RETURN(-ENOMEM);
+
+       pli->pli_path = path;
+       pli->pli_pathlen = pathlen;
+       pli->pli_mdt_obj = obj;
+       pli->pli_linkno = *linkno;
+       pli->pli_recno = *recno;
+
+       /* the file may be moved while we walk, hence the bounded retry */
+       for (attempt = 0; attempt < 3 && rc == -EAGAIN; attempt++)
+               rc = mdt_path_current(info, pli);
+
+       /* hand the last resolved fid back, so that for a remote object the
+        * client can build the remaining path on another MDT */
+       *fid = pli->pli_fids[pli->pli_fidcount];
+
+       *recno = pli->pli_currec;
+       /* tell the caller which link index comes next */
+       *linkno = pli->pli_linkno;
+
+       OBD_FREE_PTR(pli);
+
+       RETURN(rc);
+}
+
+
+static int mdt_fid2path(struct mdt_thread_info *info,
+                       struct getinfo_fid2path *fp)
+{
+       struct mdt_device *mdt = info->mti_mdt;
        struct mdt_object *obj;
-       struct obd_device *obd = mdt2obd_dev(mdt);
        int    rc;
        ENTRY;
 
@@ -5333,39 +5738,65 @@ static int mdt_fid2path(const struct lu_env *env, struct mdt_device *mdt,
        if (!fid_is_sane(&fp->gf_fid))
                RETURN(-EINVAL);
 
-       if (!fid_is_client_mdt_visible(&fp->gf_fid)) {
+       if (!fid_is_namespace_visible(&fp->gf_fid)) {
                CWARN("%s: "DFID" is invalid, sequence should be "
-                       ">= "LPX64"\n", obd->obd_name,
-                       PFID(&fp->gf_fid), (__u64)FID_SEQ_NORMAL);
+                     ">= "LPX64"\n", mdt_obd_name(mdt),
+                     PFID(&fp->gf_fid), (__u64)FID_SEQ_NORMAL);
                RETURN(-EINVAL);
        }
 
-       obj = mdt_object_find(env, mdt, &fp->gf_fid);
+       obj = mdt_object_find(info->mti_env, mdt, &fp->gf_fid);
        if (obj == NULL || IS_ERR(obj)) {
                CDEBUG(D_IOCTL, "no object "DFID": %ld\n", PFID(&fp->gf_fid),
-                       PTR_ERR(obj));
+                      PTR_ERR(obj));
                RETURN(-EINVAL);
        }
 
-       rc = lu_object_exists(&obj->mot_obj.mo_lu);
-       if (rc <= 0) {
-               if (rc == -1)
-                       rc = -EREMOTE;
-               else
-                       rc = -ENOENT;
-               mdt_object_put(env, obj);
+       if (mdt_object_remote(obj))
+               rc = -EREMOTE;
+       else if (!mdt_object_exists(obj))
+               rc = -ENOENT;
+       else
+               rc = 0;
+
+       if (rc < 0) {
+               mdt_object_put(info->mti_env, obj);
                CDEBUG(D_IOCTL, "nonlocal object "DFID": %d\n",
-                       PFID(&fp->gf_fid), rc);
+                      PFID(&fp->gf_fid), rc);
                RETURN(rc);
        }
 
-       rc = mo_path(env, md_object_next(&obj->mot_obj), fp->gf_path,
-                       fp->gf_pathlen, &fp->gf_recno, &fp->gf_linkno);
-       mdt_object_put(env, obj);
+       rc = mdt_path(info, obj, fp->gf_path, fp->gf_pathlen, &fp->gf_recno,
+                     &fp->gf_linkno, &fp->gf_fid);
+
+       CDEBUG(D_INFO, "fid "DFID", path %s recno "LPX64" linkno %u\n",
+              PFID(&fp->gf_fid), fp->gf_path, fp->gf_recno, fp->gf_linkno);
+
+       mdt_object_put(info->mti_env, obj);
 
        RETURN(rc);
 }
 
+/**
+ * Serve a KEY_FID2PATH get_info request.
+ *
+ * The request header (struct getinfo_fid2path) follows the rounded-up key
+ * name inside \a key; it is byte-swapped if the request needs it, copied
+ * to the reply buffer \a val and then completed by mdt_fid2path().
+ *
+ * \param info    thread info of the request being handled
+ * \param key     key buffer: KEY_FID2PATH followed by the request header
+ * \param val     reply buffer: header followed by room for the path
+ * \param vallen  size of \a val in bytes
+ *
+ * \retval -EINVAL  \a vallen cannot hold the header, or the path length
+ *                  in the header disagrees with the room left in \a val
+ * \retval other    the result of mdt_fid2path()
+ */
+static int mdt_rpc_fid2path(struct mdt_thread_info *info, void *key,
+                           void *val, int vallen)
+{
+       struct getinfo_fid2path *fpout, *fpin;
+       int rc = 0;
+
+       /* validate the reply size first: copying the header into a shorter
+        * buffer would write past the end of \a val */
+       if (vallen < (int)sizeof(*fpin))
+               RETURN(-EINVAL);
+
+       fpin = key + cfs_size_round(sizeof(KEY_FID2PATH));
+       fpout = val;
+
+       if (ptlrpc_req_need_swab(info->mti_pill->rc_req))
+               lustre_swab_fid2path(fpin);
+
+       memcpy(fpout, fpin, sizeof(*fpin));
+       /* the client-supplied path length must match the space available */
+       if (fpout->gf_pathlen != vallen - sizeof(*fpin))
+               RETURN(-EINVAL);
+
+       rc = mdt_fid2path(info, fpout);
+       RETURN(rc);
+}
+
+
 int mdt_get_info(struct mdt_thread_info *info)
 {
         struct ptlrpc_request *req = mdt_info_req(info);
@@ -5435,17 +5866,24 @@ static int mdt_ioc_child(struct lu_env *env, struct mdt_device *mdt,
 
 static int mdt_ioc_version_get(struct mdt_thread_info *mti, void *karg)
 {
-        struct obd_ioctl_data *data = karg;
-        struct lu_fid *fid = (struct lu_fid *)data->ioc_inlbuf1;
-        __u64 version;
-        struct mdt_object *obj;
-        struct mdt_lock_handle  *lh;
-        int rc;
-        ENTRY;
+       struct obd_ioctl_data *data = karg;
+       struct lu_fid *fid;
+       __u64 version;
+       struct mdt_object *obj;
+       struct mdt_lock_handle  *lh;
+       int rc;
+       ENTRY;
 
-        CDEBUG(D_IOCTL, "getting version for "DFID"\n", PFID(fid));
-        if (!fid_is_sane(fid))
-                RETURN(-EINVAL);
+       if (data->ioc_inlbuf1 == NULL || data->ioc_inllen1 != sizeof(*fid) ||
+           data->ioc_inlbuf2 == NULL || data->ioc_inllen2 != sizeof(version))
+               RETURN(-EINVAL);
+
+       fid = (struct lu_fid *)data->ioc_inlbuf1;
+
+       if (!fid_is_sane(fid))
+               RETURN(-EINVAL);
+
+       CDEBUG(D_IOCTL, "getting version for "DFID"\n", PFID(fid));
 
         lh = &mti->mti_lh[MDT_LH_PARENT];
         mdt_lock_reg_init(lh, LCK_CR);
@@ -5454,24 +5892,23 @@ static int mdt_ioc_version_get(struct mdt_thread_info *mti, void *karg)
         if (IS_ERR(obj))
                 RETURN(PTR_ERR(obj));
 
-        rc = mdt_object_exists(obj);
-        if (rc < 0) {
-                rc = -EREMOTE;
-                /**
-                 * before calling version get the correct MDS should be
-                 * fid, this is error to find remote object here
-                 */
-                CERROR("nonlocal object "DFID"\n", PFID(fid));
-        } else if (rc == 0) {
-                *(__u64 *)data->ioc_inlbuf2 = ENOENT_VERSION;
-                rc = -ENOENT;
-        } else {
-                version = dt_version_get(mti->mti_env, mdt_obj2dt(obj));
-               *(__u64 *)data->ioc_inlbuf2 = version;
-                rc = 0;
-        }
-        mdt_object_unlock_put(mti, obj, lh, 1);
-        RETURN(rc);
+       if (mdt_object_remote(obj)) {
+               rc = -EREMOTE;
+               /**
+                * before calling version get the correct MDS should be
+                * fid, this is error to find remote object here
+                */
+               CERROR("nonlocal object "DFID"\n", PFID(fid));
+       } else if (!mdt_object_exists(obj)) {
+               *(__u64 *)data->ioc_inlbuf2 = ENOENT_VERSION;
+               rc = -ENOENT;
+       } else {
+               version = dt_version_get(mti->mti_env, mdt_obj2dt(obj));
+              *(__u64 *)data->ioc_inlbuf2 = version;
+               rc = 0;
+       }
+       mdt_object_unlock_put(mti, obj, lh, 1);
+       RETURN(rc);
 }
 
 /* ioctls on obd dev */
@@ -5497,11 +5934,11 @@ static int mdt_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
         case OBD_IOC_SET_READONLY:
                 rc = dt->dd_ops->dt_ro(&env, dt);
                 break;
-        case OBD_IOC_ABORT_RECOVERY:
-                CERROR("Aborting recovery for device %s\n", obd->obd_name);
-                target_stop_recovery_thread(obd);
-                rc = 0;
-                break;
+       case OBD_IOC_ABORT_RECOVERY:
+               CERROR("%s: Aborting recovery for device\n", mdt_obd_name(mdt));
+               target_stop_recovery_thread(obd);
+               rc = 0;
+               break;
         case OBD_IOC_CHANGELOG_REG:
         case OBD_IOC_CHANGELOG_DEREG:
         case OBD_IOC_CHANGELOG_CLEAR:
@@ -5533,11 +5970,11 @@ static int mdt_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
                 rc = mdt_ioc_version_get(mti, karg);
                 break;
         }
-        default:
-                CERROR("Not supported cmd = %d for device %s\n",
-                       cmd, obd->obd_name);
-                rc = -EOPNOTSUPP;
-        }
+       default:
+               rc = -EOPNOTSUPP;
+               CERROR("%s: Not supported cmd = %d, rc = %d\n",
+                       mdt_obd_name(mdt), cmd, rc);
+       }
 
         lu_env_fini(&env);
         RETURN(rc);
@@ -5705,6 +6142,10 @@ static int __init mdt_mod_init(void)
        struct lprocfs_static_vars lvars;
        int rc;
 
+       CLASSERT(sizeof("0x0123456789ABCDEF:0x01234567:0x01234567") ==
+                FID_NOBRACE_LEN + 1);
+       CLASSERT(sizeof("[0x0123456789ABCDEF:0x01234567:0x01234567]") ==
+                FID_LEN + 1);
        rc = lu_kmem_init(mdt_caches);
        if (rc)
                return rc;