LU-3126 osd: remove fld lookup during configuration

[fs/lustre-release.git] / lustre / osd-ldiskfs / osd_handler.c
diff --git a/lustre/osd-ldiskfs/osd_handler.c b/lustre/osd-ldiskfs/osd_handler.c

index 6d86a0c..d9f0a40 100644 (file)
--- a/lustre/osd-ldiskfs/osd_handler.c
+++ b/lustre/osd-ldiskfs/osd_handler.c
@@ -27,7 +27,7 @@
   * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
   * Use is subject to license terms.
   *
- * Copyright (c) 2011, 2012, Intel Corporation.
+ * Copyright (c) 2011, 2013, Intel Corporation.
   */
  /*
   * This file is part of Lustre, http://www.lustre.org/
@@ -73,6 +73,10 @@ int ldiskfs_pdo = 1;
  CFS_MODULE_PARM(ldiskfs_pdo, "i", int, 0644,
                  "ldiskfs with parallel directory operations");
  
+int ldiskfs_track_declares_assert;
+CFS_MODULE_PARM(ldiskfs_track_declares_assert, "i", int, 0644,
+               "LBUG during tracking of declares");
+
  static const char dot[] = ".";
  static const char dotdot[] = "..";
  static const char remote_obj_dir[] = "REM_OBJ_DIR";
@@ -84,7 +88,6 @@ static const struct dt_object_operations      osd_obj_otable_it_ops;
  static const struct dt_index_operations       osd_index_iam_ops;
  static const struct dt_index_operations       osd_index_ea_ops;
  
-#ifdef OSD_TRACK_DECLARES
  int osd_trans_declare_op2rb[] = {
         [OSD_OT_ATTR_SET]       = OSD_OT_ATTR_SET,
         [OSD_OT_PUNCH]          = OSD_OT_MAX,
@@ -96,9 +99,9 @@ int osd_trans_declare_op2rb[] = {
         [OSD_OT_WRITE]          = OSD_OT_WRITE,
         [OSD_OT_INSERT]         = OSD_OT_DELETE,
         [OSD_OT_DELETE]         = OSD_OT_INSERT,
+       [OSD_OT_UPDATE]         = OSD_OT_MAX,
         [OSD_OT_QUOTA]          = OSD_OT_MAX,
  };
-#endif
  
  static int osd_has_index(const struct osd_object *obj)
  {
@@ -168,41 +171,27 @@ static struct lu_object *osd_object_alloc(const struct lu_env *env,
          }
  }
  
-static inline int __osd_xattr_get(struct inode *inode, struct dentry *dentry,
-                                 const char *name, void *buf, int len)
-{
-       dentry->d_inode = inode;
-       dentry->d_sb = inode->i_sb;
-       return inode->i_op->getxattr(dentry, name, buf, len);
-}
-
  int osd_get_lma(struct osd_thread_info *info, struct inode *inode,
                 struct dentry *dentry, struct lustre_mdt_attrs *lma)
  {
         int rc;
  
-       rc = __osd_xattr_get(inode, dentry, XATTR_NAME_LMA, (void *)lma,
-                            sizeof(*lma));
-       if (rc == -ERANGE) {
-               /* try with old lma size */
-               rc = inode->i_op->getxattr(dentry, XATTR_NAME_LMA,
-                                          info->oti_mdt_attrs_old,
-                                          LMA_OLD_SIZE);
-               if (rc > 0)
-                       memcpy(lma, info->oti_mdt_attrs_old, sizeof(*lma));
-       }
+       CLASSERT(LMA_OLD_SIZE >= sizeof(*lma));
+       rc = __osd_xattr_get(inode, dentry, XATTR_NAME_LMA,
+                            info->oti_mdt_attrs_old, LMA_OLD_SIZE);
         if (rc > 0) {
+               if ((void *)lma != (void *)info->oti_mdt_attrs_old)
+                       memcpy(lma, info->oti_mdt_attrs_old, sizeof(*lma));
+               rc = 0;
+               lustre_lma_swab(lma);
                 /* Check LMA compatibility */
-               if (lma->lma_incompat & ~cpu_to_le32(LMA_INCOMPAT_SUPP)) {
-                       CWARN("%.16s: unsupported incompat LMA feature(s) "
-                             "%lu/%#x\n",
+               if (lma->lma_incompat & ~LMA_INCOMPAT_SUPP) {
+                       CWARN("%.16s: unsupported incompat LMA feature(s) %#x "
+                             "for fid = "DFID", ino = %lu\n",
                               LDISKFS_SB(inode->i_sb)->s_es->s_volume_name,
-                             inode->i_ino, le32_to_cpu(lma->lma_incompat) &
-                                                       ~LMA_INCOMPAT_SUPP);
-                       rc = -ENOSYS;
-               } else {
-                       lustre_lma_swab(lma);
-                       rc = 0;
+                             lma->lma_incompat & ~LMA_INCOMPAT_SUPP,
+                             PFID(&lma->lma_self_fid), inode->i_ino);
+                       rc = -EOPNOTSUPP;
                 }
         } else if (rc == 0) {
                 rc = -ENODATA;
@@ -284,35 +273,91 @@ osd_iget_fid(struct osd_thread_info *info, struct osd_device *dev,
         return inode;
  }
  
-static struct inode *
-osd_iget_verify(struct osd_thread_info *info, struct osd_device *dev,
-               struct osd_inode_id *id, const struct lu_fid *fid)
+/**
+ * \retval +v: new filter_fid, does not contain self-fid
+ * \retval 0:  filter_fid_old, contains self-fid
+ * \retval -v: other failure cases
+ */
+int osd_get_idif(struct osd_thread_info *info, struct inode *inode,
+                struct dentry *dentry, struct lu_fid *fid)
  {
-       struct lustre_mdt_attrs *lma   = &info->oti_mdt_attrs;
-       struct inode            *inode;
+       struct filter_fid_old   *ff     = &info->oti_ff;
+       struct ost_id           *ostid  = &info->oti_ostid;
         int                      rc;
  
-       inode = osd_iget(info, dev, id);
-       if (IS_ERR(inode))
-               return inode;
+       rc = __osd_xattr_get(inode, dentry, XATTR_NAME_FID, ff, sizeof(*ff));
+       if (rc == sizeof(*ff)) {
+               rc = 0;
+               ostid_set_seq(ostid, le64_to_cpu(ff->ff_seq));
+               ostid_set_id(ostid, le64_to_cpu(ff->ff_objid));
+               /* XXX: should use real OST index in the future. LU-3569 */
+               ostid_to_fid(fid, ostid, 0);
+       } else if (rc == sizeof(struct filter_fid)) {
+               rc = 1;
+       } else if (rc >= 0) {
+               rc = -EINVAL;
+       }
  
-       rc = osd_get_lma(info, inode, &info->oti_obj_dentry, lma);
-       if (rc == -ENODATA)
-               return inode;
+       return rc;
+}
  
-       if (rc != 0) {
-               iput(inode);
-               return ERR_PTR(rc);
+static int osd_check_lma(const struct lu_env *env, struct osd_object *obj)
+{
+       struct osd_thread_info  *info   = osd_oti_get(env);
+       struct lustre_mdt_attrs *lma    = &info->oti_mdt_attrs;
+       struct inode            *inode  = obj->oo_inode;
+       struct dentry           *dentry = &info->oti_obj_dentry;
+       struct lu_fid           *fid    = NULL;
+       int                      rc;
+       ENTRY;
+
+       if (OBD_FAIL_CHECK(OBD_FAIL_OSD_COMPAT_INVALID_ENTRY))
+               RETURN(0);
+
+       CLASSERT(LMA_OLD_SIZE >= sizeof(*lma));
+       rc = __osd_xattr_get(inode, dentry, XATTR_NAME_LMA,
+                            info->oti_mdt_attrs_old, LMA_OLD_SIZE);
+       if (rc == -ENODATA && !fid_is_igif(lu_object_fid(&obj->oo_dt.do_lu)) &&
+           osd_obj2dev(obj)->od_check_ff) {
+               fid = &lma->lma_self_fid;
+               rc = osd_get_idif(info, inode, dentry, fid);
+               if (rc > 0)
+                       RETURN(0);
         }
  
-       if (!lu_fid_eq(fid, &lma->lma_self_fid)) {
-               CDEBUG(D_LFSCK, "inconsistent obj: "DFID", %lu, "DFID"\n",
-                      PFID(&lma->lma_self_fid), inode->i_ino, PFID(fid));
-               iput(inode);
-               return ERR_PTR(-EREMCHG);
+       if (unlikely(rc == -ENODATA))
+               RETURN(0);
+
+       if (rc < 0)
+               RETURN(rc);
+
+       if (rc > 0) {
+               rc = 0;
+               lustre_lma_swab(lma);
+               if (unlikely((lma->lma_incompat & ~LMA_INCOMPAT_SUPP) ||
+                            CFS_FAIL_CHECK(OBD_FAIL_OSD_LMA_INCOMPAT))) {
+                       CWARN("%s: unsupported incompat LMA feature(s) %#x for "
+                             "fid = "DFID", ino = %lu\n",
+                             osd_obj2dev(obj)->od_svname,
+                             lma->lma_incompat & ~LMA_INCOMPAT_SUPP,
+                             PFID(lu_object_fid(&obj->oo_dt.do_lu)),
+                             inode->i_ino);
+                       rc = -EOPNOTSUPP;
+               } else if (!(lma->lma_compat & LMAC_NOT_IN_OI)) {
+                       fid = &lma->lma_self_fid;
+               }
         }
  
-       return inode;
+       if (fid != NULL &&
+           unlikely(!lu_fid_eq(lu_object_fid(&obj->oo_dt.do_lu), fid))) {
+               CDEBUG(D_INODE, "%s: FID "DFID" != self_fid "DFID"\n",
+                      osd_obj2dev(obj)->od_svname,
+                      PFID(lu_object_fid(&obj->oo_dt.do_lu)),
+                      PFID(&lma->lma_self_fid));
+               rc = -EREMCHG;
+       }
+
+       RETURN(rc);
  }
  
  static int osd_fid_lookup(const struct lu_env *env, struct osd_object *obj,
@@ -328,7 +373,9 @@ static int osd_fid_lookup(const struct lu_env *env, struct osd_object *obj,
         struct osd_scrub       *scrub;
         struct scrub_file      *sf;
         int                     result;
-       int                     verify = 0;
+       int                     saved  = 0;
+       bool                    in_oi  = false;
+       bool                    triggered = false;
         ENTRY;
  
         LINVRNT(osd_invariant(obj));
@@ -346,7 +393,8 @@ static int osd_fid_lookup(const struct lu_env *env, struct osd_object *obj,
                 RETURN(-ENOENT);
  
         /* Search order: 1. per-thread cache. */
-       if (lu_fid_eq(fid, &oic->oic_fid)) {
+       if (lu_fid_eq(fid, &oic->oic_fid) &&
+           likely(oic->oic_dev == dev)) {
                 id = &oic->oic_lid;
                 goto iget;
         }
@@ -359,9 +407,6 @@ static int osd_fid_lookup(const struct lu_env *env, struct osd_object *obj,
                         goto iget;
         }
  
-       if (sf->sf_flags & SF_INCONSISTENT)
-               verify = 1;
-
         /*
          * Objects are created as locking anchors or place holders for objects
          * yet to be created. No need to osd_oi_lookup() at here because FID
@@ -373,9 +418,10 @@ static int osd_fid_lookup(const struct lu_env *env, struct osd_object *obj,
                 GOTO(out, result = 0);
  
         /* Search order: 3. OI files. */
-       result = osd_oi_lookup(info, dev, fid, id, true);
+       result = osd_oi_lookup(info, dev, fid, id, OI_CHECK_FLD);
         if (result == -ENOENT) {
-               if (!fid_is_norm(fid) || fid_is_on_ost(info, dev, fid) ||
+               if (!fid_is_norm(fid) ||
+                   fid_is_on_ost(info, dev, fid, OI_CHECK_FLD) ||
                     !ldiskfs_test_bit(osd_oi_fid2idx(dev,fid),
                                       sf->sf_oi_bitmap))
                         GOTO(out, result = 0);
@@ -386,19 +432,44 @@ static int osd_fid_lookup(const struct lu_env *env, struct osd_object *obj,
         if (result != 0)
                 GOTO(out, result);
  
+       in_oi = true;
+
  iget:
-       if (verify == 0)
-               inode = osd_iget(info, dev, id);
-       else
-               inode = osd_iget_verify(info, dev, id, fid);
+       inode = osd_iget(info, dev, id);
         if (IS_ERR(inode)) {
                 result = PTR_ERR(inode);
                 if (result == -ENOENT || result == -ESTALE) {
-                       fid_zero(&oic->oic_fid);
-                       result = 0;
+                       if (!in_oi) {
+                               fid_zero(&oic->oic_fid);
+                               GOTO(out, result = 0);
+                       }
+
+                       /* XXX: There are three possible cases:
+                        *      1. Backup/restore caused the OI invalid.
+                        *      2. Someone unlinked the object but NOT removed
+                        *         the OI mapping, such as mount target device
+                        *         as ldiskfs, and modify something directly.
+                        *      3. Someone just removed the object between the
+                        *         former oi_lookup and the iget. It is normal.
+                        *
+                        *      It is difficult to distinguish the 2nd from the
+                        *      1st case. Relatively speaking, the 1st case is
+                        *      more common than the 2nd, so trigger OI scrub. */
+                       result = osd_oi_lookup(info, dev, fid, id, true);
+                       if (result == 0)
+                               /* It is the case 1 or 2. */
+                               goto trigger;
+
+                       if (result == -ENOENT)
+                               /* It is the case 3. */
+                               result = 0;
                 } else if (result == -EREMCHG) {
  
  trigger:
+                       if (unlikely(triggered))
+                               GOTO(out, result = saved);
+
+                       triggered = true;
                         if (thread_is_running(&scrub->os_thread)) {
                                 result = -EINPROGRESS;
                         } else if (!dev->od_noscrub) {
@@ -412,6 +483,31 @@ trigger:
                                 else
                                         result = -EREMCHG;
                         }
+
+                       /* We still have a chance to get a valid inode: for the
+                        * object which is referenced by remote name entry, the
+                        * object on the local MDT will be linked under the dir
+                        * of "/REMOTE_PARENT_DIR" with its FID string as name.
+                        *
+                        * We do not know whether the object for the given FID
+                        * is referenced by some remote name entry or not, and
+                        * especially for DNE II, a multiple-linked object may
+                        * have many name entries residing on many MDTs.
+                        *
+                        * To simplify the operation, OSD will not distinguish
+                        * more, just lookup "/REMOTE_PARENT_DIR". Usually, it
+                        * only happened for the RPC from other MDT during the
+                        * OI scrub, or for the client side RPC with FID only,
+                        * such as FID to path, or from old connected client. */
+                       saved = result;
+                       result = osd_lookup_in_remote_parent(info, dev,
+                                                            fid, id);
+                       if (result == 0) {
+                               in_oi = false;
+                               goto iget;
+                       }
+
+                       result = saved;
                 }
  
                  GOTO(out, result);
@@ -420,6 +516,16 @@ trigger:
          obj->oo_inode = inode;
          LASSERT(obj->oo_inode->i_sb == osd_sb(dev));
  
+       result = osd_check_lma(env, obj);
+       if (result != 0) {
+               iput(inode);
+               obj->oo_inode = NULL;
+               if (result == -EREMCHG)
+                       goto trigger;
+
+               GOTO(out, result);
+       }
+
         obj->oo_compat_dot_created = 1;
         obj->oo_compat_dotdot_created = 1;
  
@@ -451,35 +557,6 @@ static void osd_object_init0(struct osd_object *obj)
                  (LOHA_EXISTS | (obj->oo_inode->i_mode & S_IFMT));
  }
  
-static int osd_check_lma(const struct lu_env *env, struct osd_object *obj)
-{
-       struct osd_thread_info  *info   = osd_oti_get(env);
-       struct lustre_mdt_attrs *lma    = &info->oti_mdt_attrs;
-       int                     rc;
-       ENTRY;
-
-       rc = __osd_xattr_get(obj->oo_inode, &info->oti_obj_dentry,
-                            XATTR_NAME_LMA, (void *)lma, sizeof(*lma));
-       if (rc > 0) {
-               rc = 0;
-               if (unlikely((le32_to_cpu(lma->lma_incompat) &
-                             ~LMA_INCOMPAT_SUPP) ||
-                            CFS_FAIL_CHECK(OBD_FAIL_OSD_LMA_INCOMPAT))) {
-                       CWARN("%s: unsupported incompat LMA feature(s) %#x for "
-                             DFID"\n", osd_obj2dev(obj)->od_svname,
-                             le32_to_cpu(lma->lma_incompat) &
-                             ~LMA_INCOMPAT_SUPP,
-                             PFID(lu_object_fid(&obj->oo_dt.do_lu)));
-                       rc = -EOPNOTSUPP;
-               }
-       } else if (rc == -ENODATA) {
-               /* haven't initialize LMA xattr */
-               rc = 0;
-       }
-
-       RETURN(rc);
-}
-
  /*
   * Concurrency: no concurrent access is possible that early in object
   * life-cycle.
@@ -500,10 +577,8 @@ static int osd_object_init(const struct lu_env *env, struct lu_object *l,
  
         result = osd_fid_lookup(env, obj, lu_object_fid(l), conf);
         obj->oo_dt.do_body_ops = &osd_body_ops_new;
-       if (result == 0 && obj->oo_inode != NULL) {
+       if (result == 0 && obj->oo_inode != NULL)
                 osd_object_init0(obj);
-               result = osd_check_lma(env, obj);
-       }
  
         LINVRNT(osd_invariant(obj));
         return result;
@@ -677,7 +752,7 @@ static void osd_trans_commit_cb(struct super_block *sb,
                 dcb->dcb_func(NULL, th, dcb, error);
         }
  
-        lu_ref_del_at(&lud->ld_reference, oh->ot_dev_link, "osd-tx", th);
+       lu_ref_del_at(&lud->ld_reference, &oh->ot_dev_link, "osd-tx", th);
          lu_device_put(lud);
          th->th_dev = NULL;
  
@@ -699,7 +774,7 @@ static struct thandle *osd_trans_create(const struct lu_env *env,
          LASSERT(cfs_atomic_read(&iobuf->dr_numreqs) == 0);
  
          th = ERR_PTR(-ENOMEM);
-        OBD_ALLOC_GFP(oh, sizeof *oh, CFS_ALLOC_IO);
+       OBD_ALLOC_GFP(oh, sizeof *oh, __GFP_IO);
          if (oh != NULL) {
                 oh->ot_quota_trans = &oti->oti_quota_trans;
                 memset(oh->ot_quota_trans, 0, sizeof(*oh->ot_quota_trans));
@@ -712,7 +787,6 @@ static struct thandle *osd_trans_create(const struct lu_env *env,
                  CFS_INIT_LIST_HEAD(&oh->ot_dcb_list);
                  osd_th_alloced(oh);
  
-#ifdef OSD_TRACK_DECLARES
                 memset(oti->oti_declare_ops, 0,
                                         sizeof(oti->oti_declare_ops));
                 memset(oti->oti_declare_ops_rb, 0,
@@ -720,7 +794,6 @@ static struct thandle *osd_trans_create(const struct lu_env *env,
                 memset(oti->oti_declare_ops_cred, 0,
                                         sizeof(oti->oti_declare_ops_cred));
                 oti->oti_rollback = false;
-#endif
          }
          RETURN(th);
  }
@@ -750,16 +823,13 @@ int osd_trans_start(const struct lu_env *env, struct dt_device *d,
                  GOTO(out, rc);
  
         if (unlikely(osd_param_is_not_sane(dev, th))) {
-#ifdef OSD_TRACK_DECLARES
                 static unsigned long last_printed;
                 static int last_credits;
-#endif
  
                 CWARN("%.16s: too many transaction credits (%d > %d)\n",
                       LDISKFS_SB(osd_sb(dev))->s_es->s_volume_name,
                       oh->ot_credits,
                       osd_journal(dev)->j_max_transaction_buffers);
-#ifdef OSD_TRACK_DECLARES
                 CWARN("  create: %u/%u, delete: %u/%u, destroy: %u/%u\n",
                       oti->oti_declare_ops[OSD_OT_CREATE],
                       oti->oti_declare_ops_cred[OSD_OT_CREATE],
@@ -796,7 +866,6 @@ int osd_trans_start(const struct lu_env *env, struct dt_device *d,
                         last_credits = oh->ot_credits;
                         last_printed = jiffies;
                 }
-#endif
                 /* XXX Limit the credits to 'max_transaction_buffers', and
                  *     let the underlying filesystem to catch the error if
                  *     we really need so many credits.
@@ -819,8 +888,8 @@ int osd_trans_start(const struct lu_env *env, struct dt_device *d,
                  lu_context_enter(&th->th_ctx);
  
                  lu_device_get(&d->dd_lu_dev);
-                oh->ot_dev_link = lu_ref_add(&d->dd_lu_dev.ld_reference,
-                                             "osd-tx", th);
+               lu_ref_add_at(&d->dd_lu_dev.ld_reference, &oh->ot_dev_link,
+                             "osd-tx", th);
                  oti->oti_txns++;
                  rc = 0;
          } else {
@@ -1351,48 +1420,50 @@ static int capa_is_sane(const struct lu_env *env,
  }
  
  int osd_object_auth(const struct lu_env *env, struct dt_object *dt,
-                    struct lustre_capa *capa, __u64 opc)
+                   struct lustre_capa *capa, __u64 opc)
  {
-        const struct lu_fid *fid = lu_object_fid(&dt->do_lu);
-        struct osd_device *dev = osd_dev(dt->do_lu.lo_dev);
-        struct md_capainfo *ci;
-        int rc;
+       const struct lu_fid *fid = lu_object_fid(&dt->do_lu);
+       struct osd_device *osd = osd_dev(dt->do_lu.lo_dev);
+       struct lu_capainfo *lci;
+       int rc;
  
-        if (!dev->od_fl_capa)
-                return 0;
+       if (!osd->od_fl_capa)
+               return 0;
  
-        if (capa == BYPASS_CAPA)
-                return 0;
+       if (capa == BYPASS_CAPA)
+               return 0;
  
-        ci = md_capainfo(env);
-        if (unlikely(!ci))
-                return 0;
+       lci = lu_capainfo_get(env);
+       if (unlikely(lci == NULL))
+               return 0;
  
-        if (ci->mc_auth == LC_ID_NONE)
-                return 0;
+       if (lci->lci_auth == LC_ID_NONE)
+               return 0;
  
-        if (!capa) {
-                CERROR("no capability is provided for fid "DFID"\n", PFID(fid));
-                return -EACCES;
-        }
+       if (capa == NULL) {
+               CERROR("%s: no capability provided for FID "DFID": rc = %d\n",
+                      osd_name(osd), PFID(fid), -EACCES);
+               return -EACCES;
+       }
  
-        if (!lu_fid_eq(fid, &capa->lc_fid)) {
-                DEBUG_CAPA(D_ERROR, capa, "fid "DFID" mismatch with",
-                           PFID(fid));
-                return -EACCES;
-        }
+       if (!lu_fid_eq(fid, &capa->lc_fid)) {
+               DEBUG_CAPA(D_ERROR, capa, "fid "DFID" mismatch with",
+                          PFID(fid));
+               return -EACCES;
+       }
  
-        if (!capa_opc_supported(capa, opc)) {
-                DEBUG_CAPA(D_ERROR, capa, "opc "LPX64" not supported by", opc);
-                return -EACCES;
-        }
+       if (!capa_opc_supported(capa, opc)) {
+               DEBUG_CAPA(D_ERROR, capa, "opc "LPX64" not supported by", opc);
+               return -EACCES;
+       }
  
-        if ((rc = capa_is_sane(env, dev, capa, dev->od_capa_keys))) {
-                DEBUG_CAPA(D_ERROR, capa, "insane (rc %d)", rc);
-                return -EACCES;
-        }
+       rc = capa_is_sane(env, osd, capa, osd->od_capa_keys);
+       if (rc != 0) {
+               DEBUG_CAPA(D_ERROR, capa, "insane: rc = %d", rc);
+               return -EACCES;
+       }
  
-        return 0;
+       return 0;
  }
  
  static struct timespec *osd_inode_time(const struct lu_env *env,
@@ -1728,9 +1799,8 @@ struct dentry *osd_child_dentry_get(const struct lu_env *env,
  }
  
  static int osd_mkfile(struct osd_thread_info *info, struct osd_object *obj,
-                      cfs_umode_t mode,
-                      struct dt_allocation_hint *hint,
-                      struct thandle *th)
+                     umode_t mode, struct dt_allocation_hint *hint,
+                     struct thandle *th)
  {
          int result;
          struct osd_device  *osd = osd_obj2dev(obj);
@@ -1863,16 +1933,16 @@ static int osd_mksym(struct osd_thread_info *info, struct osd_object *obj,
  }
  
  static int osd_mknod(struct osd_thread_info *info, struct osd_object *obj,
-                     struct lu_attr *attr,
-                     struct dt_allocation_hint *hint,
-                     struct dt_object_format *dof,
-                     struct thandle *th)
+                    struct lu_attr *attr,
+                    struct dt_allocation_hint *hint,
+                    struct dt_object_format *dof,
+                    struct thandle *th)
  {
-        cfs_umode_t mode = attr->la_mode & (S_IFMT | S_IALLUGO | S_ISVTX);
-        int result;
+       umode_t mode = attr->la_mode & (S_IFMT | S_IALLUGO | S_ISVTX);
+       int result;
  
-        LINVRNT(osd_invariant(obj));
-        LASSERT(obj->oo_inode == NULL);
+       LINVRNT(osd_invariant(obj));
+       LASSERT(obj->oo_inode == NULL);
          LASSERT(S_ISCHR(mode) || S_ISBLK(mode) ||
                  S_ISFIFO(mode) || S_ISSOCK(mode));
  
@@ -1927,7 +1997,7 @@ static osd_obj_type_f osd_create_type_f(enum dt_format_type type)
  
  static void osd_ah_init(const struct lu_env *env, struct dt_allocation_hint *ah,
                         struct dt_object *parent, struct dt_object *child,
-                       cfs_umode_t child_mode)
+                       umode_t child_mode)
  {
          LASSERT(ah);
  
@@ -2023,22 +2093,22 @@ static int __osd_oi_insert(const struct lu_env *env, struct osd_object *obj,
          LASSERT(obj->oo_inode != NULL);
  
         osd_id_gen(id, obj->oo_inode->i_ino, obj->oo_inode->i_generation);
-       return osd_oi_insert(info, osd, fid, id, th);
+       return osd_oi_insert(info, osd, fid, id, th, OI_CHECK_FLD);
  }
  
  int osd_fld_lookup(const struct lu_env *env, struct osd_device *osd,
-                  const struct lu_fid *fid, struct lu_seq_range *range)
+                  obd_seq seq, struct lu_seq_range *range)
  {
         struct seq_server_site  *ss = osd_seq_site(osd);
         int                     rc;
  
-       if (fid_is_idif(fid)) {
+       if (fid_seq_is_idif(seq)) {
                 fld_range_set_ost(range);
-               range->lsr_index = fid_idif_ost_idx(fid);
+               range->lsr_index = idif_ost_idx(seq);
                 return 0;
         }
  
-       if (!fid_seq_in_fldb(fid_seq(fid))) {
+       if (!fid_seq_in_fldb(seq)) {
                 fld_range_set_mdt(range);
                 if (ss != NULL)
                         /* FIXME: If ss is NULL, it suppose not get lsr_index
@@ -2049,10 +2119,10 @@ int osd_fld_lookup(const struct lu_env *env, struct osd_device *osd,
  
         LASSERT(ss != NULL);
         fld_range_set_any(range);
-       rc = fld_server_lookup(env, ss->ss_server_fld, fid_seq(fid), range);
+       rc = fld_server_lookup(env, ss->ss_server_fld, seq, range);
         if (rc != 0) {
-               CERROR("%s: cannot find FLD range for "DFID": rc = %d\n",
-                      osd_name(osd), PFID(fid), rc);
+               CERROR("%s: cannot find FLD range for "LPX64": rc = %d\n",
+                      osd_name(osd), seq, rc);
         }
         return rc;
  }
@@ -2080,7 +2150,7 @@ static int osd_declare_object_create(const struct lu_env *env,
         osd_trans_declare_op(env, oh, OSD_OT_CREATE,
                              osd_dto_credits_noquota[DTO_OBJECT_CREATE]);
         if (!fid_is_on_ost(osd_oti_get(env), osd_dt_dev(handle->th_dev),
-                          lu_object_fid(&dt->do_lu)))
+                          lu_object_fid(&dt->do_lu), OI_CHECK_FLD))
                 /* Reuse idle OI block may cause additional one OI block
                  * to be changed. */
                 osd_trans_declare_op(env, oh, OSD_OT_INSERT,
@@ -2112,7 +2182,7 @@ static int osd_declare_object_create(const struct lu_env *env,
         if (fid_is_norm(lu_object_fid(&dt->do_lu)) &&
                 !fid_is_last_id(lu_object_fid(&dt->do_lu)))
                 osd_fld_lookup(env, osd_dt_dev(handle->th_dev),
-                              lu_object_fid(&dt->do_lu), range);
+                              fid_seq(lu_object_fid(&dt->do_lu)), range);
  
  
         RETURN(rc);
@@ -2211,9 +2281,6 @@ static int osd_object_destroy(const struct lu_env *env,
         if (unlikely(fid_is_acct(fid)))
                 RETURN(-EPERM);
  
-       /* Parallel control for OI scrub. For most of cases, there is no
-        * lock contention. So it will not affect unlink performance. */
-       mutex_lock(&inode->i_mutex);
         if (S_ISDIR(inode->i_mode)) {
                 LASSERT(osd_inode_unlinked(inode) || inode->i_nlink == 1);
                 /* it will check/delete the inode from remote parent,
@@ -2231,8 +2298,7 @@ static int osd_object_destroy(const struct lu_env *env,
  
         osd_trans_exec_op(env, th, OSD_OT_DESTROY);
  
-        result = osd_oi_delete(osd_oti_get(env), osd, fid, th);
-       mutex_unlock(&inode->i_mutex);
+        result = osd_oi_delete(osd_oti_get(env), osd, fid, th, OI_CHECK_FLD);
  
          /* XXX: add to ext3 orphan list */
          /* rc = ext3_orphan_add(handle_t *handle, struct inode *inode) */
@@ -2254,23 +2320,45 @@ static int osd_object_destroy(const struct lu_env *env,
   * FIXME: It is good to have/use ldiskfs_xattr_set_handle() here
   */
  int osd_ea_fid_set(struct osd_thread_info *info, struct inode *inode,
-                  const struct lu_fid *fid, __u64 flags)
+                  const struct lu_fid *fid, __u32 compat, __u32 incompat)
  {
         struct lustre_mdt_attrs *lma = &info->oti_mdt_attrs;
         int                      rc;
+       ENTRY;
  
         if (OBD_FAIL_CHECK(OBD_FAIL_FID_INLMA))
-               return 0;
+               RETURN(0);
  
-       lustre_lma_init(lma, fid, flags);
+       lustre_lma_init(lma, fid, compat, incompat);
         lustre_lma_swab(lma);
  
         rc = __osd_xattr_set(info, inode, XATTR_NAME_LMA, lma, sizeof(*lma),
                              XATTR_CREATE);
-       /* Someone may created the EA by race. */
-       if (unlikely(rc == -EEXIST))
-               rc = 0;
-       return rc;
+       /* LMA may already exist, but we need to check that all the
+        * desired compat/incompat flags have been added. */
+       if (unlikely(rc == -EEXIST)) {
+               if (compat == 0 && incompat == 0)
+                       RETURN(0);
+
+               rc = __osd_xattr_get(inode, &info->oti_obj_dentry,
+                                    XATTR_NAME_LMA, info->oti_mdt_attrs_old,
+                                    LMA_OLD_SIZE);
+               if (rc <= 0)
+                       RETURN(-EINVAL);
+
+               lustre_lma_swab(lma);
+               if (!(~lma->lma_compat & compat) &&
+                   !(~lma->lma_incompat & incompat))
+                       RETURN(0);
+
+               lma->lma_compat |= compat;
+               lma->lma_incompat |= incompat;
+               lustre_lma_swab(lma);
+               rc = __osd_xattr_set(info, inode, XATTR_NAME_LMA, lma,
+                                    sizeof(*lma), XATTR_REPLACE);
+       }
+
+       RETURN(rc);
  }
  
  /**
@@ -2367,7 +2455,7 @@ static struct inode *osd_create_local_agent_inode(const struct lu_env *env,
         }
  
         /* Set special LMA flag for local agent inode */
-       rc = osd_ea_fid_set(info, local, fid, LMAI_AGENT);
+       rc = osd_ea_fid_set(info, local, fid, 0, LMAI_AGENT);
         if (rc != 0) {
                 CERROR("%s: set LMA for "DFID" remote inode failed: rc = %d\n",
                        osd_name(osd), PFID(fid), rc);
@@ -2452,7 +2540,10 @@ static int osd_object_ea_create(const struct lu_env *env, struct dt_object *dt,
  
          result = __osd_object_create(info, obj, attr, hint, dof, th);
         if (result == 0)
-               result = osd_ea_fid_set(info, obj->oo_inode, fid, 0);
+               result = osd_ea_fid_set(info, obj->oo_inode, fid,
+                               fid_is_on_ost(info, osd_obj2dev(obj),
+                                             fid, OI_CHECK_FLD) ?
+                               LMAC_FID_ON_OST : 0, 0);
  
         if (result == 0)
                 result = __osd_oi_insert(env, obj, fid, th);
@@ -2487,8 +2578,10 @@ static int osd_declare_object_ref_add(const struct lu_env *env,
  static int osd_object_ref_add(const struct lu_env *env,
                                struct dt_object *dt, struct thandle *th)
  {
-        struct osd_object *obj = osd_dt_obj(dt);
-        struct inode      *inode = obj->oo_inode;
+       struct osd_object *obj = osd_dt_obj(dt);
+       struct inode      *inode = obj->oo_inode;
+       bool               need_dirty = false;
+       int                rc = 0;
  
          LINVRNT(osd_invariant(obj));
         LASSERT(dt_object_exists(dt) && !dt_object_remote(dt));
@@ -2497,33 +2590,44 @@ static int osd_object_ref_add(const struct lu_env *env,
  
         osd_trans_exec_op(env, th, OSD_OT_REF_ADD);
  
-       /*
-        * DIR_NLINK feature is set for compatibility reasons if:
-        * 1) nlinks > LDISKFS_LINK_MAX, or
-        * 2) nlinks == 2, since this indicates i_nlink was previously 1.
+       /* This is based on ldiskfs_inc_count(), which is not exported.
+        *
+        * The DIR_NLINK feature allows directories to exceed LDISKFS_LINK_MAX
+        * (65000) subdirectories by storing "1" in i_nlink if the link count
+        * would otherwise overflow. Directory traversal tools understand
+        * that (st_nlink == 1) indicates that the filesystem does not track
+        * hard link counts on the directory, and will not abort subdirectory
+        * scanning early once (st_nlink - 2) subdirs have been found.
          *
-        * It is easier to always set this flag (rather than check and set),
-        * since it has less overhead, and the superblock will be dirtied
-        * at some point. Both e2fsprogs and any Lustre-supported ldiskfs
-        * do not actually care whether this flag is set or not.
+        * This also has to properly handle the case of inodes with nlink == 0
+        * in case they are being linked into the PENDING directory
          */
         spin_lock(&obj->oo_guard);
-       /* inc_nlink from 0 may cause WARN_ON */
-       if(inode->i_nlink == 0)
+       if (unlikely(!S_ISDIR(inode->i_mode) &&
+                    inode->i_nlink >= LDISKFS_LINK_MAX)) {
+               /* MDD should have checked this, but good to be safe */
+               rc = -EMLINK;
+       } else if (unlikely(inode->i_nlink == 0 ||
+                           (S_ISDIR(inode->i_mode) &&
+                            inode->i_nlink >= LDISKFS_LINK_MAX))) {
+               /* inc_nlink from 0 may cause WARN_ON */
                 set_nlink(inode, 1);
-       else
+               need_dirty = true;
+       } else if (!S_ISDIR(inode->i_mode) ||
+                  (S_ISDIR(inode->i_mode) && inode->i_nlink >= 2)) {
                 inc_nlink(inode);
-       if (S_ISDIR(inode->i_mode) && inode->i_nlink > 1) {
-               if (inode->i_nlink >= LDISKFS_LINK_MAX ||
-                   inode->i_nlink == 2)
-                       set_nlink(inode, 1);
-       }
+               need_dirty = true;
+       } /* else (S_ISDIR(inode->i_mode) && inode->i_nlink == 1) { ; } */
+
         LASSERT(inode->i_nlink <= LDISKFS_LINK_MAX);
         spin_unlock(&obj->oo_guard);
-       ll_dirty_inode(inode, I_DIRTY_DATASYNC);
+
+       if (need_dirty)
+               ll_dirty_inode(inode, I_DIRTY_DATASYNC);
+
         LINVRNT(osd_invariant(obj));
  
-       return 0;
+       return rc;
  }
  
  static int osd_declare_object_ref_del(const struct lu_env *env,
@@ -2562,15 +2666,24 @@ static int osd_object_ref_del(const struct lu_env *env, struct dt_object *dt,
  
         spin_lock(&obj->oo_guard);
         LASSERT(inode->i_nlink > 0);
-       drop_nlink(inode);
-       /* If this is/was a many-subdir directory (nlink > LDISKFS_LINK_MAX)
-        * then the nlink count is 1. Don't let it be set to 0 or the directory
-        * inode will be deleted incorrectly. */
-       if (S_ISDIR(inode->i_mode) && inode->i_nlink == 0)
-               set_nlink(inode, 1);
-       spin_unlock(&obj->oo_guard);
-       ll_dirty_inode(inode, I_DIRTY_DATASYNC);
-       LINVRNT(osd_invariant(obj));
+
+       /* This is based on ldiskfs_dec_count(), which is not exported.
+        *
+        * If a directory already has nlink == 1, then do not drop the nlink
+        * count to 0, even temporarily, to avoid race conditions with other
+        * threads not holding oo_guard seeing i_nlink == 0 in rare cases.
+        *
+        * nlink == 1 means the directory has/had > EXT4_LINK_MAX subdirs.
+        * */
+       if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 1) {
+               drop_nlink(inode);
+
+               spin_unlock(&obj->oo_guard);
+               ll_dirty_inode(inode, I_DIRTY_DATASYNC);
+               LINVRNT(osd_invariant(obj));
+       } else {
+               spin_unlock(&obj->oo_guard);
+       }
  
         return 0;
  }
@@ -2767,90 +2880,89 @@ static int osd_xattr_del(const struct lu_env *env, struct dt_object *dt,
  }
  
  static struct obd_capa *osd_capa_get(const struct lu_env *env,
-                                     struct dt_object *dt,
-                                     struct lustre_capa *old,
-                                     __u64 opc)
+                                    struct dt_object *dt,
+                                    struct lustre_capa *old, __u64 opc)
  {
-        struct osd_thread_info *info = osd_oti_get(env);
-        const struct lu_fid *fid = lu_object_fid(&dt->do_lu);
-        struct osd_object *obj = osd_dt_obj(dt);
-        struct osd_device *dev = osd_obj2dev(obj);
-        struct lustre_capa_key *key = &info->oti_capa_key;
-        struct lustre_capa *capa = &info->oti_capa;
-        struct obd_capa *oc;
-        struct md_capainfo *ci;
-        int rc;
-        ENTRY;
+       struct osd_thread_info *info = osd_oti_get(env);
+       const struct lu_fid *fid = lu_object_fid(&dt->do_lu);
+       struct osd_object *obj = osd_dt_obj(dt);
+       struct osd_device *osd = osd_obj2dev(obj);
+       struct lustre_capa_key *key = &info->oti_capa_key;
+       struct lustre_capa *capa = &info->oti_capa;
+       struct obd_capa *oc;
+       struct lu_capainfo *lci;
+       int rc;
+       ENTRY;
  
-        if (!dev->od_fl_capa)
-                RETURN(ERR_PTR(-ENOENT));
+       if (!osd->od_fl_capa)
+               RETURN(ERR_PTR(-ENOENT));
  
         LASSERT(dt_object_exists(dt) && !dt_object_remote(dt));
-        LINVRNT(osd_invariant(obj));
+       LINVRNT(osd_invariant(obj));
  
-        /* renewal sanity check */
-        if (old && osd_object_auth(env, dt, old, opc))
-                RETURN(ERR_PTR(-EACCES));
-
-        ci = md_capainfo(env);
-        if (unlikely(!ci))
-                RETURN(ERR_PTR(-ENOENT));
-
-        switch (ci->mc_auth) {
-        case LC_ID_NONE:
-                RETURN(NULL);
-        case LC_ID_PLAIN:
-                capa->lc_uid = obj->oo_inode->i_uid;
-                capa->lc_gid = obj->oo_inode->i_gid;
-                capa->lc_flags = LC_ID_PLAIN;
-                break;
-        case LC_ID_CONVERT: {
-                __u32 d[4], s[4];
-
-                s[0] = obj->oo_inode->i_uid;
-                cfs_get_random_bytes(&(s[1]), sizeof(__u32));
-                s[2] = obj->oo_inode->i_gid;
-                cfs_get_random_bytes(&(s[3]), sizeof(__u32));
-                rc = capa_encrypt_id(d, s, key->lk_key, CAPA_HMAC_KEY_MAX_LEN);
-                if (unlikely(rc))
-                        RETURN(ERR_PTR(rc));
-
-                capa->lc_uid   = ((__u64)d[1] << 32) | d[0];
-                capa->lc_gid   = ((__u64)d[3] << 32) | d[2];
-                capa->lc_flags = LC_ID_CONVERT;
-                break;
-        }
-        default:
-                RETURN(ERR_PTR(-EINVAL));
+       /* renewal sanity check */
+       if (old && osd_object_auth(env, dt, old, opc))
+               RETURN(ERR_PTR(-EACCES));
+
+       lci = lu_capainfo_get(env);
+       if (unlikely(lci == NULL))
+               RETURN(ERR_PTR(-ENOENT));
+
+       switch (lci->lci_auth) {
+       case LC_ID_NONE:
+               RETURN(NULL);
+       case LC_ID_PLAIN:
+               capa->lc_uid = obj->oo_inode->i_uid;
+               capa->lc_gid = obj->oo_inode->i_gid;
+               capa->lc_flags = LC_ID_PLAIN;
+               break;
+       case LC_ID_CONVERT: {
+               __u32 d[4], s[4];
+
+               s[0] = obj->oo_inode->i_uid;
+               cfs_get_random_bytes(&(s[1]), sizeof(__u32));
+               s[2] = obj->oo_inode->i_gid;
+               cfs_get_random_bytes(&(s[3]), sizeof(__u32));
+               rc = capa_encrypt_id(d, s, key->lk_key, CAPA_HMAC_KEY_MAX_LEN);
+               if (unlikely(rc))
+                       RETURN(ERR_PTR(rc));
+
+               capa->lc_uid   = ((__u64)d[1] << 32) | d[0];
+               capa->lc_gid   = ((__u64)d[3] << 32) | d[2];
+               capa->lc_flags = LC_ID_CONVERT;
+               break;
          }
+       default:
+               RETURN(ERR_PTR(-EINVAL));
+       }
  
-        capa->lc_fid = *fid;
-        capa->lc_opc = opc;
-        capa->lc_flags |= dev->od_capa_alg << 24;
-        capa->lc_timeout = dev->od_capa_timeout;
-        capa->lc_expiry = 0;
+       capa->lc_fid = *fid;
+       capa->lc_opc = opc;
+       capa->lc_flags |= osd->od_capa_alg << 24;
+       capa->lc_timeout = osd->od_capa_timeout;
+       capa->lc_expiry = 0;
  
-        oc = capa_lookup(dev->od_capa_hash, capa, 1);
-        if (oc) {
-                LASSERT(!capa_is_expired(oc));
-                RETURN(oc);
-        }
+       oc = capa_lookup(osd->od_capa_hash, capa, 1);
+       if (oc) {
+               LASSERT(!capa_is_expired(oc));
+               RETURN(oc);
+       }
  
         spin_lock(&capa_lock);
-       *key = dev->od_capa_keys[1];
+       *key = osd->od_capa_keys[1];
         spin_unlock(&capa_lock);
  
-        capa->lc_keyid = key->lk_keyid;
-        capa->lc_expiry = cfs_time_current_sec() + dev->od_capa_timeout;
+       capa->lc_keyid = key->lk_keyid;
+       capa->lc_expiry = cfs_time_current_sec() + osd->od_capa_timeout;
  
-        rc = capa_hmac(capa->lc_hmac, capa, key->lk_key);
-        if (rc) {
-                DEBUG_CAPA(D_ERROR, capa, "HMAC failed: %d for", rc);
-                RETURN(ERR_PTR(rc));
-        }
+       rc = capa_hmac(capa->lc_hmac, capa, key->lk_key);
+       if (rc) {
+               DEBUG_CAPA(D_ERROR, capa, "HMAC failed: %d for", rc);
+               RETURN(ERR_PTR(rc));
+       }
  
-        oc = capa_add(dev->od_capa_hash, capa);
-        RETURN(oc);
+       oc = capa_add(osd->od_capa_hash, capa);
+       RETURN(oc);
  }
  
  static int osd_object_sync(const struct lu_env *env, struct dt_object *dt)
@@ -3211,26 +3323,45 @@ static inline int osd_get_fid_from_dentry(struct ldiskfs_dir_entry_2 *de,
         return rc;
  }
  
-static int osd_remote_fid(const struct lu_env *env, struct osd_device *osd,
-                         struct lu_fid *fid)
+static int osd_mdt_seq_exists(const struct lu_env *env,
+                             struct osd_device *osd, obd_seq seq)
  {
         struct lu_seq_range     *range = &osd_oti_get(env)->oti_seq_range;
         struct seq_server_site  *ss = osd_seq_site(osd);
         int                     rc;
         ENTRY;
  
-       /* Those FID seqs, which are not in FLDB, must be local seq */
-       if (unlikely(!fid_seq_in_fldb(fid_seq(fid)) || ss == NULL))
-               RETURN(0);
+       if (ss == NULL)
+               RETURN(1);
  
-       rc = osd_fld_lookup(env, osd, fid, range);
+       /* XXX: currently, each MDT only stores available sequences on disk,
+        * not the allocated sequence information, so we have to look up FLDB,
+        * but it probably makes more sense to also store allocated sequences
+        * locally, so we do not need to do a remote FLDB lookup in OSD */
+       rc = osd_fld_lookup(env, osd, seq, range);
         if (rc != 0) {
-               CERROR("%s: Can not lookup fld for "DFID"\n",
-                      osd_name(osd), PFID(fid));
-               RETURN(rc);
+               CERROR("%s: Can not lookup fld for "LPX64"\n",
+                      osd_name(osd), seq);
+               RETURN(0);
         }
  
-       RETURN(ss->ss_node_id != range->lsr_index);
+       RETURN(ss->ss_node_id == range->lsr_index);
+}
+
+static int osd_remote_fid(const struct lu_env *env, struct osd_device *osd,
+                         struct lu_fid *fid)
+{
+       ENTRY;
+
+       /* FID seqs not in FLDB, must be local seq */
+       if (unlikely(!fid_seq_in_fldb(fid_seq(fid))))
+               RETURN(0);
+
+       /* Currently only check this for FID on MDT */
+       if (osd_mdt_seq_exists(env, osd, fid_seq(fid)))
+               RETURN(0);
+
+       RETURN(1);
  }
  
  /**
@@ -3703,7 +3834,7 @@ osd_consistency_check(struct osd_thread_info *oti, struct osd_device *dev,
                 RETURN_EXIT;
  
  again:
-       rc = osd_oi_lookup(oti, dev, fid, id, true);
+       rc = osd_oi_lookup(oti, dev, fid, id, OI_CHECK_FLD);
         if (rc != 0 && rc != -ENOENT)
                 RETURN_EXIT;
  
@@ -3762,6 +3893,18 @@ static int osd_fail_fid_lookup(struct osd_thread_info *oti,
         return rc;
  }
  
+int osd_add_oi_cache(struct osd_thread_info *info, struct osd_device *osd,
+                    struct osd_inode_id *id, const struct lu_fid *fid)
+{
+       CDEBUG(D_INODE, "add "DFID" %u:%u to info %p\n", PFID(fid),
+              id->oii_ino, id->oii_gen, info);
+       info->oti_cache.oic_lid = *id;
+       info->oti_cache.oic_fid = *fid;
+       info->oti_cache.oic_dev = osd;
+
+       return 0;
+}
+
  /**
   * Calls ->lookup() to find dentry. From dentry get inode and
   * read inode's ea to get fid. This is required for  interoperability
@@ -3820,13 +3963,15 @@ static int osd_ea_lookup_rec(const struct lu_env *env, struct osd_object *obj,
                         rc = osd_ea_fid_get(env, obj, ino, fid, id);
                 else
                         osd_id_gen(id, ino, OSD_OII_NOGEN);
-               if (rc != 0 || osd_remote_fid(env, dev, fid)) {
+               if (rc != 0) {
                         fid_zero(&oic->oic_fid);
                         GOTO(out, rc);
                 }
  
-               oic->oic_lid = *id;
-               oic->oic_fid = *fid;
+               rc = osd_add_oi_cache(osd_oti_get(env), osd_obj2dev(obj), id,
+                                     fid);
+               if (rc != 0)
+                       GOTO(out, rc);
                 if ((scrub->os_pos_current <= ino) &&
                     ((sf->sf_flags & SF_INCONSISTENT) ||
                      (sf->sf_flags & SF_UPGRADE && fid_is_igif(fid)) ||
@@ -4355,6 +4500,7 @@ static struct dt_it *osd_it_ea_init(const struct lu_env *env,
          struct osd_object       *obj  = osd_dt_obj(dt);
          struct osd_thread_info  *info = osd_oti_get(env);
          struct osd_it_ea        *it   = &info->oti_it_ea;
+       struct file             *file = &it->oie_file;
          struct lu_object        *lo   = &dt->do_lu;
          struct dentry           *obj_dentry = &info->oti_it_dentry;
          ENTRY;
@@ -4369,17 +4515,20 @@ static struct dt_it *osd_it_ea_init(const struct lu_env *env,
          it->oie_dirent          = NULL;
          it->oie_buf             = info->oti_it_ea_buf;
          it->oie_obj             = obj;
-        it->oie_file.f_pos      = 0;
-        it->oie_file.f_dentry   = obj_dentry;
-        if (attr & LUDA_64BITHASH)
-               it->oie_file.f_mode |= FMODE_64BITHASH;
-        else
-               it->oie_file.f_mode |= FMODE_32BITHASH;
-        it->oie_file.f_mapping    = obj->oo_inode->i_mapping;
-        it->oie_file.f_op         = obj->oo_inode->i_fop;
-        it->oie_file.private_data = NULL;
-        lu_object_get(lo);
-        RETURN((struct dt_it *) it);
+
+       /* Reset the "file" totally to avoid reusing any old value from
+        * former readdir handling; the "file->f_pos" should be zero. */
+       memset(file, 0, sizeof(*file));
+       /* Only FMODE_64BITHASH or FMODE_32BITHASH should be set, NOT both. */
+       if (attr & LUDA_64BITHASH)
+               file->f_mode    = FMODE_64BITHASH;
+       else
+               file->f_mode    = FMODE_32BITHASH;
+       file->f_dentry          = obj_dentry;
+       file->f_mapping         = obj->oo_inode->i_mapping;
+       file->f_op              = obj->oo_inode->i_fop;
+       lu_object_get(lo);
+       RETURN((struct dt_it *) it);
  }
  
  /**
@@ -4841,8 +4990,14 @@ again:
                 GOTO(out_journal, rc);
         }
  
+       /* skip the REMOTE_PARENT_DIR. */
+       if (inode == dev->od_mdt_map->omm_remote_parent->d_inode)
+               GOTO(out_inode, rc = 0);
+
         rc = osd_get_lma(info, inode, &info->oti_obj_dentry, lma);
         if (rc == 0) {
+               LASSERT(!(lma->lma_compat & LMAC_NOT_IN_OI));
+
                 if (fid_is_sane(fid)) {
                         /* FID-in-dirent is valid. */
                         if (lu_fid_eq(fid, &lma->lma_self_fid))
@@ -4926,7 +5081,7 @@ again:
                 if (unlikely(fid_is_sane(fid))) {
                         /* FID-in-dirent exists, but FID-in-LMA is lost.
                          * Trust the FID-in-dirent, and add FID-in-LMA. */
-                       rc = osd_ea_fid_set(info, inode, fid, 0);
+                       rc = osd_ea_fid_set(info, inode, fid, 0, 0);
                         if (rc == 0)
                                 *attr |= LUDA_REPAIR;
                 } else {
@@ -4995,10 +5150,10 @@ static inline int osd_it_ea_rec(const struct lu_env *env,
                 if (unlikely(ino == osd_sb(dev)->s_root->d_inode->i_ino)) {
                         attr |= LUDA_IGNORE;
                         rc = 0;
-                       goto pack;
+               } else {
+                       rc = osd_dirent_check_repair(env, obj, it, fid, id,
+                                                    &attr);
                 }
-
-               rc = osd_dirent_check_repair(env, obj, it, fid, id, &attr);
         } else {
                 attr &= ~LU_DIRENT_ATTRS_MASK;
                 if (!fid_is_sane(fid)) {
@@ -5011,22 +5166,20 @@ static inline int osd_it_ea_rec(const struct lu_env *env,
                 }
         }
  
-       if (rc < 0)
-               RETURN(rc);
-
-pack:
+       /* Pack the entry anyway, at least the offset is right. */
         osd_it_pack_dirent(lde, fid, it->oie_dirent->oied_off,
                            it->oie_dirent->oied_name,
                            it->oie_dirent->oied_namelen,
                            it->oie_dirent->oied_type, attr);
  
+       if (rc < 0)
+               RETURN(rc);
+
         if (osd_remote_fid(env, dev, fid))
                 RETURN(0);
  
-       if (likely(!(attr & LUDA_IGNORE))) {
-               oic->oic_lid = *id;
-               oic->oic_fid = *fid;
-       }
+       if (likely(!(attr & LUDA_IGNORE)))
+               rc = osd_add_oi_cache(oti, dev, id, fid);
  
         if (!(attr & LUDA_VERIFY) &&
             (scrub->os_pos_current <= ino) &&
@@ -5209,20 +5362,33 @@ static int osd_shutdown(const struct lu_env *env, struct osd_device *o)
  {
         ENTRY;
  
-       osd_scrub_cleanup(env, o);
+       /* shutdown quota slave instance associated with the device */
+       if (o->od_quota_slave != NULL) {
+               qsd_fini(env, o->od_quota_slave);
+               o->od_quota_slave = NULL;
+       }
+
+       RETURN(0);
+}
+
+static void osd_umount(const struct lu_env *env, struct osd_device *o)
+{
+       ENTRY;
  
         if (o->od_fsops) {
                 fsfilt_put_ops(o->od_fsops);
                 o->od_fsops = NULL;
         }
  
-       /* shutdown quota slave instance associated with the device */
-       if (o->od_quota_slave != NULL) {
-               qsd_fini(env, o->od_quota_slave);
-               o->od_quota_slave = NULL;
+       if (o->od_mnt != NULL) {
+               shrink_dcache_sb(osd_sb(o));
+               osd_sync(env, &o->od_dt_dev);
+
+               mntput(o->od_mnt);
+               o->od_mnt = NULL;
         }
  
-       RETURN(0);
+       EXIT;
  }
  
  static int osd_mount(const struct lu_env *env,
@@ -5236,7 +5402,10 @@ static int osd_mount(const struct lu_env *env,
         struct file_system_type *type;
         char                    *options = NULL;
         char                    *str;
-       int                       rc = 0;
+       struct osd_thread_info  *info = osd_oti_get(env);
+       struct lu_fid           *fid = &info->oti_fid;
+       struct inode            *inode;
+       int                      rc = 0;
          ENTRY;
  
         if (o->od_mnt != NULL)
@@ -5246,13 +5415,14 @@ static int osd_mount(const struct lu_env *env,
                 RETURN(-E2BIG);
         strcpy(o->od_mntdev, dev);
  
-        o->od_fsops = fsfilt_get_ops(mt_str(LDD_MT_LDISKFS));
-        if (o->od_fsops == NULL) {
-                CERROR("Can't find fsfilt_ldiskfs\n");
-                RETURN(-ENOTSUPP);
-        }
+       o->od_fsops = fsfilt_get_ops(mt_str(LDD_MT_LDISKFS));
+       if (IS_ERR(o->od_fsops)) {
+               CERROR("%s: Can't find fsfilt_ldiskfs\n", name);
+               o->od_fsops = NULL;
+               RETURN(-ENOTSUPP);
+       }
  
-       OBD_PAGE_ALLOC(__page, CFS_ALLOC_STD);
+       OBD_PAGE_ALLOC(__page, GFP_IOFS);
         if (__page == NULL)
                 GOTO(out, rc = -ENOMEM);
  
@@ -5262,7 +5432,7 @@ static int osd_mount(const struct lu_env *env,
         if (str)
                 lmd_flags = simple_strtoul(str + 1, NULL, 0);
         opts = lustre_cfg_string(cfg, 3);
-       page = (unsigned long)cfs_page_address(__page);
+       page = (unsigned long)page_address(__page);
         options = (char *)page;
         *options = '\0';
         if (opts == NULL)
@@ -5273,7 +5443,7 @@ static int osd_mount(const struct lu_env *env,
         /* Glom up mount options */
         if (*options != '\0')
                 strcat(options, ",");
-       strlcat(options, "no_mbcache", CFS_PAGE_SIZE);
+       strlcat(options, "no_mbcache", PAGE_CACHE_SIZE);
  
         type = get_fs_type("ldiskfs");
         if (!type) {
@@ -5286,8 +5456,8 @@ static int osd_mount(const struct lu_env *env,
  
         if (IS_ERR(o->od_mnt)) {
                 rc = PTR_ERR(o->od_mnt);
-               CERROR("%s: can't mount %s: %d\n", name, dev, rc);
                 o->od_mnt = NULL;
+               CERROR("%s: can't mount %s: %d\n", name, dev, rc);
                 GOTO(out, rc);
         }
  
@@ -5295,59 +5465,56 @@ static int osd_mount(const struct lu_env *env,
         if (dev_check_rdonly(o->od_mnt->mnt_sb->s_bdev)) {
                 CERROR("%s: underlying device %s is marked as read-only. "
                        "Setup failed\n", name, dev);
-               mntput(o->od_mnt);
-               o->od_mnt = NULL;
-               GOTO(out, rc = -EROFS);
+               GOTO(out_mnt, rc = -EROFS);
         }
  #endif
  
         if (!LDISKFS_HAS_COMPAT_FEATURE(o->od_mnt->mnt_sb,
             LDISKFS_FEATURE_COMPAT_HAS_JOURNAL)) {
                 CERROR("%s: device %s is mounted w/o journal\n", name, dev);
-               mntput(o->od_mnt);
-               o->od_mnt = NULL;
-               GOTO(out, rc = -EINVAL);
+               GOTO(out_mnt, rc = -EINVAL);
+       }
+
+       inode = osd_sb(o)->s_root->d_inode;
+       ldiskfs_set_inode_state(inode, LDISKFS_STATE_LUSTRE_NO_OI);
+       lu_local_obj_fid(fid, OSD_FS_ROOT_OID);
+       rc = osd_ea_fid_set(info, inode, fid, LMAC_NOT_IN_OI, 0);
+       if (rc != 0) {
+               CERROR("%s: failed to set lma on %s root inode\n", name, dev);
+               GOTO(out_mnt, rc);
         }
  
-       ldiskfs_set_inode_state(osd_sb(o)->s_root->d_inode,
-                               LDISKFS_STATE_LUSTRE_NO_OI);
         if (lmd_flags & LMD_FLG_NOSCRUB)
                 o->od_noscrub = 1;
  
+       GOTO(out, rc = 0);
+
+out_mnt:
+       mntput(o->od_mnt);
+       o->od_mnt = NULL;
+
  out:
         if (__page)
                 OBD_PAGE_FREE(__page);
         if (rc)
                 fsfilt_put_ops(o->od_fsops);
  
-        RETURN(rc);
+       return rc;
  }
  
  static struct lu_device *osd_device_fini(const struct lu_env *env,
-                                         struct lu_device *d)
+                                        struct lu_device *d)
  {
-        int rc;
-        ENTRY;
-
-       rc = osd_shutdown(env, osd_dev(d));
-
-       osd_obj_map_fini(osd_dev(d));
-
-        shrink_dcache_sb(osd_sb(osd_dev(d)));
-        osd_sync(env, lu2dt_dev(d));
-
-        rc = osd_procfs_fini(osd_dev(d));
-        if (rc) {
-                CERROR("proc fini error %d \n", rc);
-                RETURN (ERR_PTR(rc));
-        }
+       struct osd_device *o = osd_dev(d);
+       ENTRY;
  
-       if (osd_dev(d)->od_mnt) {
-               mntput(osd_dev(d)->od_mnt);
-               osd_dev(d)->od_mnt = NULL;
-       }
+       osd_procfs_fini(o);
+       osd_shutdown(env, o);
+       osd_scrub_cleanup(env, o);
+       osd_obj_map_fini(o);
+       osd_umount(env, o);
  
-        RETURN(NULL);
+       RETURN(NULL);
  }
  
  static int osd_device_init0(const struct lu_env *env,
@@ -5385,12 +5552,6 @@ static int osd_device_init0(const struct lu_env *env,
         if (rc)
                 GOTO(out_capa, rc);
  
-       CFS_INIT_LIST_HEAD(&o->od_ios_list);
-       /* setup scrub, including OI files initialization */
-       rc = osd_scrub_setup(env, o);
-       if (rc < 0)
-               GOTO(out_mnt, rc);
-
         cplen = strlcpy(o->od_svname, lustre_cfg_string(cfg, 4),
                         sizeof(o->od_svname));
         if (cplen >= sizeof(o->od_svname)) {
@@ -5398,24 +5559,33 @@ static int osd_device_init0(const struct lu_env *env,
                 GOTO(out_mnt, rc);
         }
  
+       if (server_name_is_ost(o->od_svname))
+               o->od_is_ost = 1;
+
         rc = osd_obj_map_init(env, o);
         if (rc != 0)
-               GOTO(out_scrub, rc);
+               GOTO(out_mnt, rc);
  
         rc = lu_site_init(&o->od_site, l);
-       if (rc)
+       if (rc != 0)
                 GOTO(out_compat, rc);
         o->od_site.ls_bottom_dev = l;
  
         rc = lu_site_init_finish(&o->od_site);
-       if (rc)
+       if (rc != 0)
+               GOTO(out_site, rc);
+
+       CFS_INIT_LIST_HEAD(&o->od_ios_list);
+       /* setup scrub, including OI files initialization */
+       rc = osd_scrub_setup(env, o);
+       if (rc < 0)
                 GOTO(out_site, rc);
  
         rc = osd_procfs_init(o, o->od_svname);
         if (rc != 0) {
                 CERROR("%s: can't initialize procfs: rc = %d\n",
                        o->od_svname, rc);
-               GOTO(out_site, rc);
+               GOTO(out_scrub, rc);
         }
  
         LASSERT(l->ld_site->ls_linkage.next && l->ld_site->ls_linkage.prev);
@@ -5430,23 +5600,21 @@ static int osd_device_init0(const struct lu_env *env,
         }
  
         RETURN(0);
+
  out_procfs:
         osd_procfs_fini(o);
+out_scrub:
+       osd_scrub_cleanup(env, o);
  out_site:
         lu_site_fini(&o->od_site);
  out_compat:
         osd_obj_map_fini(o);
-out_scrub:
-       osd_scrub_cleanup(env, o);
  out_mnt:
-       osd_oi_fini(info, o);
-       osd_shutdown(env, o);
-       mntput(o->od_mnt);
-       o->od_mnt = NULL;
+       osd_umount(env, o);
  out_capa:
         cleanup_capa_hash(o->od_capa_hash);
  out:
-       RETURN(rc);
+       return rc;
  }
  
  static struct lu_device *osd_device_alloc(const struct lu_env *env,
@@ -5593,15 +5761,6 @@ static int osd_prepare(const struct lu_env *env, struct lu_device *pdev,
         int                result = 0;
         ENTRY;
  
-       if (dev->ld_site && lu_device_is_md(dev->ld_site->ls_top_dev)) {
-               /* MDT/MDD still use old infrastructure to create
-                * special files */
-               result = llo_local_objects_setup(env, lu2md_dev(pdev),
-                                                lu2dt_dev(dev));
-               if (result)
-                       RETURN(result);
-       }
-
         if (osd->od_quota_slave != NULL)
                 /* set up quota slave objects */
                 result = qsd_prepare(env, osd->od_quota_slave);