X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Fosd-ldiskfs%2Fosd_handler.c;h=bc52d58d3ef89cc041eb44e4a95a9d7522a41cd6;hb=ef1b815d77ae717f3ee701e2392fd3fe6c71906d;hp=2c4ec7ff3d819b3a8972ffb37092f572c06a43b8;hpb=4e7541ab2328da4d57f60b3b4d6514990f996858;p=fs%2Flustre-release.git

diff --git a/lustre/osd-ldiskfs/osd_handler.c b/lustre/osd-ldiskfs/osd_handler.c
index 2c4ec7f..bc52d58 100644
--- a/lustre/osd-ldiskfs/osd_handler.c
+++ b/lustre/osd-ldiskfs/osd_handler.c
@@ -27,7 +27,7 @@
  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
  * Use is subject to license terms.
  *
- * Copyright (c) 2011, 2012, Intel Corporation.
+ * Copyright (c) 2011, 2013, Intel Corporation.
  */
 /*
  * This file is part of Lustre, http://www.lustre.org/
@@ -73,6 +73,10 @@ int ldiskfs_pdo = 1;
 CFS_MODULE_PARM(ldiskfs_pdo, "i", int, 0644,
                 "ldiskfs with parallel directory operations");
 
+int ldiskfs_track_declares_assert;
+CFS_MODULE_PARM(ldiskfs_track_declares_assert, "i", int, 0644,
+		"LBUG during tracking of declares");
+
 static const char dot[] = ".";
 static const char dotdot[] = "..";
 static const char remote_obj_dir[] = "REM_OBJ_DIR";
@@ -84,7 +88,6 @@ static const struct dt_object_operations      osd_obj_otable_it_ops;
 static const struct dt_index_operations       osd_index_iam_ops;
 static const struct dt_index_operations       osd_index_ea_ops;
 
-#ifdef OSD_TRACK_DECLARES
 int osd_trans_declare_op2rb[] = {
 	[OSD_OT_ATTR_SET]	= OSD_OT_ATTR_SET,
 	[OSD_OT_PUNCH]		= OSD_OT_MAX,
@@ -98,7 +101,6 @@ int osd_trans_declare_op2rb[] = {
 	[OSD_OT_DELETE]		= OSD_OT_INSERT,
 	[OSD_OT_QUOTA]		= OSD_OT_MAX,
 };
-#endif
 
 static int osd_has_index(const struct osd_object *obj)
 {
@@ -172,6 +174,7 @@ static inline int __osd_xattr_get(struct inode *inode, struct dentry *dentry,
 				  const char *name, void *buf, int len)
 {
 	dentry->d_inode = inode;
+	dentry->d_sb = inode->i_sb;
 	return inode->i_op->getxattr(dentry, name, buf, len);
 }
 
@@ -180,28 +183,22 @@ int osd_get_lma(struct osd_thread_info *info, struct inode *inode,
 {
 	int rc;
 
-	rc = __osd_xattr_get(inode, dentry, XATTR_NAME_LMA, (void *)lma,
-			     sizeof(*lma));
-	if (rc == -ERANGE) {
-		/* try with old lma size */
-		rc = inode->i_op->getxattr(dentry, XATTR_NAME_LMA,
-					   info->oti_mdt_attrs_old,
-					   LMA_OLD_SIZE);
-		if (rc > 0)
-			memcpy(lma, info->oti_mdt_attrs_old, sizeof(*lma));
-	}
+	CLASSERT(LMA_OLD_SIZE >= sizeof(*lma));
+	rc = __osd_xattr_get(inode, dentry, XATTR_NAME_LMA,
+			     info->oti_mdt_attrs_old, LMA_OLD_SIZE);
 	if (rc > 0) {
+		if ((void *)lma != (void *)info->oti_mdt_attrs_old)
+			memcpy(lma, info->oti_mdt_attrs_old, sizeof(*lma));
+		rc = 0;
+		lustre_lma_swab(lma);
 		/* Check LMA compatibility */
-		if (lma->lma_incompat & ~cpu_to_le32(LMA_INCOMPAT_SUPP)) {
-			CWARN("%.16s: unsupported incompat LMA feature(s) "
-			      "%lx/%#x\n",
+		if (lma->lma_incompat & ~LMA_INCOMPAT_SUPP) {
+			CWARN("%.16s: unsupported incompat LMA feature(s) %#x "
+			      "for fid = "DFID", ino = %lu\n",
 			      LDISKFS_SB(inode->i_sb)->s_es->s_volume_name,
-			      inode->i_ino, le32_to_cpu(lma->lma_incompat) &
-							~LMA_INCOMPAT_SUPP);
-			rc = -ENOSYS;
-		} else {
-			lustre_lma_swab(lma);
-			rc = 0;
+			      lma->lma_incompat & ~LMA_INCOMPAT_SUPP,
+			      PFID(&lma->lma_self_fid), inode->i_ino);
+			rc = -EOPNOTSUPP;
 		}
 	} else if (rc == 0) {
 		rc = -ENODATA;
@@ -450,6 +447,50 @@ static void osd_object_init0(struct osd_object *obj)
                 (LOHA_EXISTS | (obj->oo_inode->i_mode & S_IFMT));
 }
 
+static int osd_check_lma(const struct lu_env *env, struct osd_object *obj)
+{
+	struct osd_thread_info	*info	= osd_oti_get(env);
+	struct lustre_mdt_attrs	*lma	= &info->oti_mdt_attrs;
+	int			rc;
+	ENTRY;
+
+	CLASSERT(LMA_OLD_SIZE >= sizeof(*lma));
+	rc = __osd_xattr_get(obj->oo_inode, &info->oti_obj_dentry,
+			     XATTR_NAME_LMA, info->oti_mdt_attrs_old,
+			     LMA_OLD_SIZE);
+	if (rc > 0) {
+		rc = 0;
+		lustre_lma_swab(lma);
+		if (unlikely((lma->lma_incompat & ~LMA_INCOMPAT_SUPP) ||
+			     CFS_FAIL_CHECK(OBD_FAIL_OSD_LMA_INCOMPAT))) {
+			rc = -EOPNOTSUPP;
+			CWARN("%s: unsupported incompat LMA feature(s) %#x for "
+			      "fid = "DFID", ino = %lu: rc = %d\n",
+			      osd_obj2dev(obj)->od_svname,
+			      lma->lma_incompat & ~LMA_INCOMPAT_SUPP,
+			      PFID(lu_object_fid(&obj->oo_dt.do_lu)),
+			      obj->oo_inode->i_ino, rc);
+		}
+		if (unlikely(!lu_fid_eq(lu_object_fid(&obj->oo_dt.do_lu),
+					&lma->lma_self_fid))) {
+			CDEBUG(D_INODE, "%s: FID "DFID" != self_fid "DFID"\n",
+			       osd_obj2dev(obj)->od_svname,
+			       PFID(lu_object_fid(&obj->oo_dt.do_lu)),
+			       PFID(&lma->lma_self_fid));
+			if (obj->oo_inode != NULL) {
+				iput(obj->oo_inode);
+				obj->oo_inode = NULL;
+			}
+			rc = -ESTALE;
+		}
+	} else if (rc == -ENODATA) {
+		/* LMA xattr has not been initialized yet */
+		rc = 0;
+	}
+
+	RETURN(rc);
+}
+
 /*
  * Concurrency: no concurrent access is possible that early in object
  * life-cycle.
@@ -470,8 +511,13 @@ static int osd_object_init(const struct lu_env *env, struct lu_object *l,
 
 	result = osd_fid_lookup(env, obj, lu_object_fid(l), conf);
 	obj->oo_dt.do_body_ops = &osd_body_ops_new;
-	if (result == 0 && obj->oo_inode != NULL)
+	if (result == 0 && obj->oo_inode != NULL) {
+		result = osd_check_lma(env, obj);
+		if (result != 0)
+			return result;
+
 		osd_object_init0(obj);
+	}
 
 	LINVRNT(osd_invariant(obj));
 	return result;
@@ -680,9 +726,12 @@ static struct thandle *osd_trans_create(const struct lu_env *env,
                 CFS_INIT_LIST_HEAD(&oh->ot_dcb_list);
                 osd_th_alloced(oh);
 
-		memset(oti->oti_declare_ops, 0, OSD_OT_MAX);
-		memset(oti->oti_declare_ops_rb, 0, OSD_OT_MAX);
-		memset(oti->oti_declare_ops_cred, 0, OSD_OT_MAX);
+		memset(oti->oti_declare_ops, 0,
+					sizeof(oti->oti_declare_ops));
+		memset(oti->oti_declare_ops_rb, 0,
+					sizeof(oti->oti_declare_ops_rb));
+		memset(oti->oti_declare_ops_cred, 0,
+					sizeof(oti->oti_declare_ops_cred));
 		oti->oti_rollback = false;
         }
         RETURN(th);
@@ -720,7 +769,6 @@ int osd_trans_start(const struct lu_env *env, struct dt_device *d,
 		      LDISKFS_SB(osd_sb(dev))->s_es->s_volume_name,
 		      oh->ot_credits,
 		      osd_journal(dev)->j_max_transaction_buffers);
-#ifdef OSD_TRACK_DECLARES
 		CWARN("  create: %u/%u, delete: %u/%u, destroy: %u/%u\n",
 		      oti->oti_declare_ops[OSD_OT_CREATE],
 		      oti->oti_declare_ops_cred[OSD_OT_CREATE],
@@ -757,7 +805,6 @@ int osd_trans_start(const struct lu_env *env, struct dt_device *d,
 			last_credits = oh->ot_credits;
 			last_printed = jiffies;
 		}
-#endif
 		/* XXX Limit the credits to 'max_transaction_buffers', and
 		 *     let the underlying filesystem to catch the error if
 		 *     we really need so many credits.
@@ -1994,13 +2041,13 @@ int osd_fld_lookup(const struct lu_env *env, struct osd_device *osd,
 	int			rc;
 
 	if (fid_is_idif(fid)) {
-		range->lsr_flags = LU_SEQ_RANGE_OST;
+		fld_range_set_ost(range);
 		range->lsr_index = fid_idif_ost_idx(fid);
 		return 0;
 	}
 
 	if (!fid_seq_in_fldb(fid_seq(fid))) {
-		range->lsr_flags = LU_SEQ_RANGE_MDT;
+		fld_range_set_mdt(range);
 		if (ss != NULL)
 			/* FIXME: If ss is NULL, it suppose not get lsr_index
 			 * at all */
@@ -2009,10 +2056,10 @@ int osd_fld_lookup(const struct lu_env *env, struct osd_device *osd,
 	}
 
 	LASSERT(ss != NULL);
-	range->lsr_flags = -1;
+	fld_range_set_any(range);
 	rc = fld_server_lookup(env, ss->ss_server_fld, fid_seq(fid), range);
 	if (rc != 0) {
-		CERROR("%s can not find "DFID": rc = %d\n",
+		CERROR("%s: cannot find FLD range for "DFID": rc = %d\n",
 		       osd_name(osd), PFID(fid), rc);
 	}
 	return rc;
@@ -2135,13 +2182,12 @@ static int osd_declare_object_destroy(const struct lu_env *env,
 	LASSERT(oh->ot_handle == NULL);
 	LASSERT(inode);
 
-	osd_trans_declare_op(env, oh, OSD_OT_DELETE,
+	osd_trans_declare_op(env, oh, OSD_OT_DESTROY,
 			     osd_dto_credits_noquota[DTO_OBJECT_DELETE]);
 	/* Recycle idle OI leaf may cause additional three OI blocks
 	 * to be changed. */
-	osd_trans_declare_op(env, oh, OSD_OT_DESTROY,
+	osd_trans_declare_op(env, oh, OSD_OT_DELETE,
 			     osd_dto_credits_noquota[DTO_INDEX_DELETE] + 3);
-
 	/* one less inode */
 	rc = osd_declare_inode_qid(env, inode->i_uid, inode->i_gid, -1, oh,
 				   false, true, NULL, false);
@@ -2173,9 +2219,6 @@ static int osd_object_destroy(const struct lu_env *env,
 	if (unlikely(fid_is_acct(fid)))
 		RETURN(-EPERM);
 
-	/* Parallel control for OI scrub. For most of cases, there is no
-	 * lock contention. So it will not affect unlink performance. */
-	mutex_lock(&inode->i_mutex);
 	if (S_ISDIR(inode->i_mode)) {
 		LASSERT(osd_inode_unlinked(inode) || inode->i_nlink == 1);
 		/* it will check/delete the inode from remote parent,
@@ -2194,7 +2237,6 @@ static int osd_object_destroy(const struct lu_env *env,
 	osd_trans_exec_op(env, th, OSD_OT_DESTROY);
 
         result = osd_oi_delete(osd_oti_get(env), osd, fid, th);
-	mutex_unlock(&inode->i_mutex);
 
         /* XXX: add to ext3 orphan list */
         /* rc = ext3_orphan_add(handle_t *handle, struct inode *inode) */
@@ -2224,9 +2266,6 @@ int osd_ea_fid_set(struct osd_thread_info *info, struct inode *inode,
 	if (OBD_FAIL_CHECK(OBD_FAIL_FID_INLMA))
 		return 0;
 
-	if (OBD_FAIL_CHECK(OBD_FAIL_FID_IGIF) && fid_is_client_visible(fid))
-		return 0;
-
 	lustre_lma_init(lma, fid, flags);
 	lustre_lma_swab(lma);
 
@@ -2249,7 +2288,8 @@ int osd_ea_fid_set(struct osd_thread_info *info, struct inode *inode,
 void osd_get_ldiskfs_dirent_param(struct ldiskfs_dentry_param *param,
 				  const struct dt_rec *fid)
 {
-	if (!fid_is_client_mdt_visible((const struct lu_fid *)fid)) {
+	if (!fid_is_namespace_visible((const struct lu_fid *)fid) ||
+	    OBD_FAIL_CHECK(OBD_FAIL_FID_IGIF)) {
 		param->edp_magic = 0;
 		return;
 	}
@@ -2351,12 +2391,12 @@ static struct inode *osd_create_local_agent_inode(const struct lu_env *env,
 }
 
 /**
- * Delete local inode for remote entry
+ * Delete local agent inode for remote entry
  */
-static int osd_delete_remote_inode(const struct lu_env *env,
-				   struct osd_device *osd,
-				   const struct lu_fid *fid,
-				    __u32 ino, struct osd_thandle *oh)
+static int osd_delete_local_agent_inode(const struct lu_env *env,
+					struct osd_device *osd,
+					const struct lu_fid *fid,
+					__u32 ino, struct osd_thandle *oh)
 {
 	struct osd_thread_info	*oti = osd_oti_get(env);
 	struct osd_inode_id	*id = &oti->oti_id;
@@ -2415,13 +2455,11 @@ static int osd_object_ea_create(const struct lu_env *env, struct dt_object *dt,
 	osd_trans_declare_rb(env, th, OSD_OT_REF_ADD);
 
         result = __osd_object_create(info, obj, attr, hint, dof, th);
-	if ((result == 0) &&
-	    (fid_is_last_id(fid) ||
-	     !fid_is_on_ost(info, osd_dt_dev(th->th_dev), fid)))
+	if (result == 0)
 		result = osd_ea_fid_set(info, obj->oo_inode, fid, 0);
 
-        if (result == 0)
-                result = __osd_oi_insert(env, obj, fid, th);
+	if (result == 0)
+		result = __osd_oi_insert(env, obj, fid, th);
 
 	LASSERT(ergo(result == 0,
 		     dt_object_exists(dt) && !dt_object_remote(dt)));
@@ -2453,8 +2491,10 @@ static int osd_declare_object_ref_add(const struct lu_env *env,
 static int osd_object_ref_add(const struct lu_env *env,
                               struct dt_object *dt, struct thandle *th)
 {
-        struct osd_object *obj = osd_dt_obj(dt);
-        struct inode      *inode = obj->oo_inode;
+	struct osd_object *obj = osd_dt_obj(dt);
+	struct inode      *inode = obj->oo_inode;
+	bool		   need_dirty = false;
+	int		   rc = 0;
 
         LINVRNT(osd_invariant(obj));
 	LASSERT(dt_object_exists(dt) && !dt_object_remote(dt));
@@ -2463,33 +2503,44 @@ static int osd_object_ref_add(const struct lu_env *env,
 
 	osd_trans_exec_op(env, th, OSD_OT_REF_ADD);
 
-	/*
-	 * DIR_NLINK feature is set for compatibility reasons if:
-	 * 1) nlinks > LDISKFS_LINK_MAX, or
-	 * 2) nlinks == 2, since this indicates i_nlink was previously 1.
+	/* This is based on ldiskfs_inc_count(), which is not exported.
+	 *
+	 * The DIR_NLINK feature allows directories to exceed LDISKFS_LINK_MAX
+	 * (65000) subdirectories by storing "1" in i_nlink if the link count
+	 * would otherwise overflow. Directory traversal tools understand
+	 * that (st_nlink == 1) indicates that the filesystem does not track
+	 * hard link count on the directory, and will not abort subdirectory
+	 * scanning early once (st_nlink - 2) subdirs have been found.
 	 *
-	 * It is easier to always set this flag (rather than check and set),
-	 * since it has less overhead, and the superblock will be dirtied
-	 * at some point. Both e2fsprogs and any Lustre-supported ldiskfs
-	 * do not actually care whether this flag is set or not.
+	 * This also has to properly handle the case of inodes with nlink == 0
+	 * in case they are being linked into the PENDING directory
 	 */
 	spin_lock(&obj->oo_guard);
-	/* inc_nlink from 0 may cause WARN_ON */
-	if(inode->i_nlink == 0)
+	if (unlikely(!S_ISDIR(inode->i_mode) &&
+		     inode->i_nlink >= LDISKFS_LINK_MAX)) {
+		/* MDD should have checked this, but good to be safe */
+		rc = -EMLINK;
+	} else if (unlikely(inode->i_nlink == 0 ||
+			    (S_ISDIR(inode->i_mode) &&
+			     inode->i_nlink >= LDISKFS_LINK_MAX))) {
+		/* inc_nlink from 0 may cause WARN_ON */
 		set_nlink(inode, 1);
-	else
+		need_dirty = true;
+	} else if (!S_ISDIR(inode->i_mode) ||
+		   (S_ISDIR(inode->i_mode) && inode->i_nlink >= 2)) {
 		inc_nlink(inode);
-	if (S_ISDIR(inode->i_mode) && inode->i_nlink > 1) {
-		if (inode->i_nlink >= LDISKFS_LINK_MAX ||
-		    inode->i_nlink == 2)
-			set_nlink(inode, 1);
-	}
+		need_dirty = true;
+	} /* else (S_ISDIR(inode->i_mode) && inode->i_nlink == 1) { ; } */
+
 	LASSERT(inode->i_nlink <= LDISKFS_LINK_MAX);
 	spin_unlock(&obj->oo_guard);
-	ll_dirty_inode(inode, I_DIRTY_DATASYNC);
+
+	if (need_dirty)
+		ll_dirty_inode(inode, I_DIRTY_DATASYNC);
+
 	LINVRNT(osd_invariant(obj));
 
-	return 0;
+	return rc;
 }
 
 static int osd_declare_object_ref_del(const struct lu_env *env,
@@ -2528,15 +2579,24 @@ static int osd_object_ref_del(const struct lu_env *env, struct dt_object *dt,
 
 	spin_lock(&obj->oo_guard);
 	LASSERT(inode->i_nlink > 0);
-	drop_nlink(inode);
-	/* If this is/was a many-subdir directory (nlink > LDISKFS_LINK_MAX)
-	 * then the nlink count is 1. Don't let it be set to 0 or the directory
-	 * inode will be deleted incorrectly. */
-	if (S_ISDIR(inode->i_mode) && inode->i_nlink == 0)
-		set_nlink(inode, 1);
-	spin_unlock(&obj->oo_guard);
-	ll_dirty_inode(inode, I_DIRTY_DATASYNC);
-	LINVRNT(osd_invariant(obj));
+
+	/* This is based on ldiskfs_dec_count(), which is not exported.
+	 *
+	 * If a directory already has nlink == 1, then do not drop the nlink
+	 * count to 0, even temporarily, to avoid race conditions with other
+	 * threads not holding oo_guard seeing i_nlink == 0 in rare cases.
+	 *
+	 * nlink == 1 means the directory has/had > LDISKFS_LINK_MAX subdirs.
+	 * */
+	if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 1) {
+		drop_nlink(inode);
+
+		spin_unlock(&obj->oo_guard);
+		ll_dirty_inode(inode, I_DIRTY_DATASYNC);
+		LINVRNT(osd_invariant(obj));
+	} else {
+		spin_unlock(&obj->oo_guard);
+	}
 
 	return 0;
 }
@@ -2681,6 +2741,7 @@ static int osd_xattr_list(const struct lu_env *env, struct dt_object *dt,
                 return -EACCES;
 
         dentry->d_inode = inode;
+	dentry->d_sb = inode->i_sb;
         return inode->i_op->listxattr(dentry, buf->lb_buf, buf->lb_len);
 }
 
@@ -2726,6 +2787,7 @@ static int osd_xattr_del(const struct lu_env *env, struct dt_object *dt,
 
 	ll_vfs_dq_init(inode);
         dentry->d_inode = inode;
+	dentry->d_sb = inode->i_sb;
         rc = inode->i_op->removexattr(dentry, name);
         return rc;
 }
@@ -2829,6 +2891,7 @@ static int osd_object_sync(const struct lu_env *env, struct dt_object *dt)
 	ENTRY;
 
 	dentry->d_inode = inode;
+	dentry->d_sb = inode->i_sb;
 	file->f_dentry = dentry;
 	file->f_mapping = inode->i_mapping;
 	file->f_op = inode->i_fop;
@@ -3248,11 +3311,77 @@ static int osd_index_ea_delete(const struct lu_env *env, struct dt_object *dt,
 		down_write(&obj->oo_ext_idx_sem);
         }
 
-        bh = osd_ldiskfs_find_entry(dir, dentry, &de, hlock);
+        bh = ldiskfs_find_entry(dir, &dentry->d_name, &de, hlock);
         if (bh) {
-                rc = ldiskfs_delete_entry(oh->ot_handle,
-                                          dir, de, bh);
+		__u32 ino = 0;
+
+		/* If this is not the ".." entry, it might be a remote DNE
+		 * entry and we need to check if the FID is for a remote
+		 * MDT.  If the FID is not in the directory entry (e.g.
+		 * upgraded 1.8 filesystem without dirdata enabled) then
+		 * we need to get the FID from the LMA. For a remote directory
+		 * there HAS to be an LMA, it cannot be an IGIF inode in this
+		 * case.
+		 *
+		 * Delete the entry before the agent inode in order to
+		 * simplify error handling.  At worst an error after deleting
+		 * the entry first might leak the agent inode afterward. The
+		 * reverse would need filesystem abort in case of error deleting
+		 * the entry after the agent had been removed, or leave a
+		 * dangling entry pointing at a random inode. */
+		if (strcmp((char *)key, dotdot) != 0) {
+			LASSERT(de != NULL);
+			rc = osd_get_fid_from_dentry(de, (struct dt_rec *)fid);
+			/* If Fid is not in dentry, try to get it from LMA */
+			if (rc == -ENODATA) {
+				struct osd_inode_id *id;
+				struct inode *inode;
+
+				/* Before trying to get fid from the inode,
+				 * check whether the inode is valid.
+				 *
+				 * If the inode has been deleted, do not go
+				 * ahead to do osd_ea_fid_get, which will set
+				 * the inode to bad inode, which might cause
+				 * the inode to be deleted incorrectly */
+				inode = ldiskfs_iget(osd_sb(osd),
+						     le32_to_cpu(de->inode));
+				if (IS_ERR(inode)) {
+					CDEBUG(D_INODE, "%s: "DFID"get inode"
+					       "error.\n", osd_name(osd),
+					       PFID(fid));
+					rc = PTR_ERR(inode);
+				} else {
+					if (likely(inode->i_nlink != 0)) {
+						id = &osd_oti_get(env)->oti_id;
+						rc = osd_ea_fid_get(env, obj,
+						        le32_to_cpu(de->inode),
+								    fid, id);
+					} else {
+						CDEBUG(D_INFO, "%s: %u "DFID
+						       "deleted.\n",
+						       osd_name(osd),
+						       le32_to_cpu(de->inode),
+						       PFID(fid));
+						rc = -ESTALE;
+					}
+					iput(inode);
+				}
+			}
+			if (rc == 0 &&
+			    unlikely(osd_remote_fid(env, osd, fid)))
+				/* Need to delete agent inode */
+				ino = le32_to_cpu(de->inode);
+		}
+                rc = ldiskfs_delete_entry(oh->ot_handle, dir, de, bh);
                 brelse(bh);
+		if (rc == 0 && unlikely(ino != 0)) {
+			rc = osd_delete_local_agent_inode(env, osd, fid, ino,
+							  oh);
+			if (rc != 0)
+				CERROR("%s: del local inode "DFID": rc = %d\n",
+				       osd_name(osd), PFID(fid), rc);
+		}
         } else {
                 rc = -ENOENT;
         }
@@ -3265,23 +3394,20 @@ static int osd_index_ea_delete(const struct lu_env *env, struct dt_object *dt,
 		GOTO(out, rc);
 
 	/* For inode on the remote MDT, .. will point to
-	 * /Agent directory. So do not try to lookup/delete
-	 * remote inode for .. */
-	if (strcmp((char *)key, dotdot) == 0)
-		GOTO(out, rc = 0);
-
-	LASSERT(de != NULL);
-	rc = osd_get_fid_from_dentry(de, (struct dt_rec *)fid);
-	if (rc == 0 && osd_remote_fid(env, osd, fid)) {
-		__u32 ino = le32_to_cpu(de->inode);
+	 * /Agent directory. Check whether it needs to be deleted
+	 * from the agent directory. */
+	if (unlikely(strcmp((char *)key, dotdot) == 0)) {
+		rc = osd_delete_from_remote_parent(env, osd_obj2dev(obj), obj,
+						   oh);
+		if (rc != 0 && rc != -ENOENT) {
+			CERROR("%s: delete agent inode "DFID": rc = %d\n",
+			       osd_name(osd), PFID(fid), rc);
+		}
 
-		rc = osd_delete_remote_inode(env, osd, fid, ino, oh);
-		if (rc != 0)
-			CERROR("%s: del local inode "DFID": rc = %d\n",
-				osd_name(osd), PFID(fid), rc);
-	} else {
-		if (rc == -ENODATA)
+		if (rc == -ENOENT)
 			rc = 0;
+
+		GOTO(out, rc);
 	}
 out:
 
@@ -3662,6 +3788,19 @@ static int osd_fail_fid_lookup(struct osd_thread_info *oti,
 	return rc;
 }
 
+static int osd_add_oi_cache(struct osd_thread_info *info,
+			    struct osd_device *osd,
+			    struct osd_inode_id *id,
+			    struct lu_fid *fid)
+{
+	CDEBUG(D_INODE, "add "DFID" %u:%u to info %p\n", PFID(fid),
+	       id->oii_ino, id->oii_gen, info);
+	info->oti_cache.oic_lid = *id;
+	info->oti_cache.oic_fid = *fid;
+
+	return 0;
+}
+
 /**
  * Calls ->lookup() to find dentry. From dentry get inode and
  * read inode's ea to get fid. This is required for  interoperability
@@ -3725,8 +3864,10 @@ static int osd_ea_lookup_rec(const struct lu_env *env, struct osd_object *obj,
 			GOTO(out, rc);
 		}
 
-		oic->oic_lid = *id;
-		oic->oic_fid = *fid;
+		rc = osd_add_oi_cache(osd_oti_get(env), osd_obj2dev(obj), id,
+				      fid);
+		if (rc != 0)
+			GOTO(out, rc);
 		if ((scrub->os_pos_current <= ino) &&
 		    ((sf->sf_flags & SF_INCONSISTENT) ||
 		     (sf->sf_flags & SF_UPGRADE && fid_is_igif(fid)) ||
@@ -4255,6 +4396,7 @@ static struct dt_it *osd_it_ea_init(const struct lu_env *env,
         struct osd_object       *obj  = osd_dt_obj(dt);
         struct osd_thread_info  *info = osd_oti_get(env);
         struct osd_it_ea        *it   = &info->oti_it_ea;
+	struct file		*file = &it->oie_file;
         struct lu_object        *lo   = &dt->do_lu;
         struct dentry           *obj_dentry = &info->oti_it_dentry;
         ENTRY;
@@ -4269,17 +4411,20 @@ static struct dt_it *osd_it_ea_init(const struct lu_env *env,
         it->oie_dirent          = NULL;
         it->oie_buf             = info->oti_it_ea_buf;
         it->oie_obj             = obj;
-        it->oie_file.f_pos      = 0;
-        it->oie_file.f_dentry   = obj_dentry;
-        if (attr & LUDA_64BITHASH)
-		it->oie_file.f_mode |= FMODE_64BITHASH;
-        else
-		it->oie_file.f_mode |= FMODE_32BITHASH;
-        it->oie_file.f_mapping    = obj->oo_inode->i_mapping;
-        it->oie_file.f_op         = obj->oo_inode->i_fop;
-        it->oie_file.private_data = NULL;
-        lu_object_get(lo);
-        RETURN((struct dt_it *) it);
+
+	/* Reset the "file" completely to avoid reusing any old value from
+	 * former readdir handling; the "file->f_pos" should be zero. */
+	memset(file, 0, sizeof(*file));
+	/* Only FMODE_64BITHASH or FMODE_32BITHASH should be set, NOT both. */
+	if (attr & LUDA_64BITHASH)
+		file->f_mode	= FMODE_64BITHASH;
+	else
+		file->f_mode	= FMODE_32BITHASH;
+	file->f_dentry		= obj_dentry;
+	file->f_mapping 	= obj->oo_inode->i_mapping;
+	file->f_op		= obj->oo_inode->i_fop;
+	lu_object_get(lo);
+	RETURN((struct dt_it *) it);
 }
 
 /**
@@ -4548,6 +4693,19 @@ osd_dirent_has_space(__u16 reclen, __u16 namelen, unsigned blocksize)
 		return 0;
 }
 
+static inline int
+osd_dot_dotdot_has_space(struct ldiskfs_dir_entry_2 *de, int dot_dotdot)
+{
+	LASSERTF(dot_dotdot == 1 || dot_dotdot == 2,
+		 "dot_dotdot = %d\n", dot_dotdot);
+
+	if (LDISKFS_DIR_REC_LEN(de) >=
+	    __LDISKFS_DIR_REC_LEN(dot_dotdot + 1 + sizeof(struct osd_fid_pack)))
+		return 1;
+	else
+		return 0;
+}
+
 static int
 osd_dirent_reinsert(const struct lu_env *env, handle_t *jh,
 		    struct inode *dir, struct inode *inode,
@@ -4641,18 +4799,15 @@ osd_dirent_check_repair(const struct lu_env *env, struct osd_object *obj,
 	struct inode		   *inode;
 	int			    credits;
 	int			    rc;
+	int			    dot_dotdot	= 0;
 	bool			    dirty	= false;
-	bool			    is_dotdot	= false;
 	ENTRY;
 
 	if (ent->oied_name[0] == '.') {
-		/* Skip dot entry, even if it has stale FID-in-dirent, because
-		 * we do not use such FID-in-dirent anymore, it is harmless. */
 		if (ent->oied_namelen == 1)
-			RETURN(0);
-
-		if (ent->oied_namelen == 2 && ent->oied_name[1] == '.')
-			is_dotdot = true;
+			dot_dotdot = 1;
+		else if (ent->oied_namelen == 2 && ent->oied_name[1] == '.')
+			dot_dotdot = 2;
 	}
 
 	dentry = osd_child_dentry_get(env, obj, ent->oied_name,
@@ -4685,26 +4840,36 @@ again:
 			       ent->oied_name, rc);
 			RETURN(rc);
 		}
-	}
 
-	if (obj->oo_hl_head != NULL) {
-		hlock = osd_oti_get(env)->oti_hlock;
-		ldiskfs_htree_lock(hlock, obj->oo_hl_head, dir,
-				   LDISKFS_HLOCK_DEL);
+		if (obj->oo_hl_head != NULL) {
+			hlock = osd_oti_get(env)->oti_hlock;
+			/* "0" means exclusive lock for the whole directory.
+			 * We need to prevent others from accessing this name
+			 * entry during the delete + insert. Neither HLOCK_ADD
+			 * nor HLOCK_DEL can guarantee the atomicity. */
+			ldiskfs_htree_lock(hlock, obj->oo_hl_head, dir, 0);
+		} else {
+			down_write(&obj->oo_ext_idx_sem);
+		}
 	} else {
-		down_write(&obj->oo_ext_idx_sem);
+		if (obj->oo_hl_head != NULL) {
+			hlock = osd_oti_get(env)->oti_hlock;
+			ldiskfs_htree_lock(hlock, obj->oo_hl_head, dir,
+					   LDISKFS_HLOCK_LOOKUP);
+		} else {
+			down_read(&obj->oo_ext_idx_sem);
+		}
 	}
 
 	bh = osd_ldiskfs_find_entry(dir, dentry, &de, hlock);
-	/* For dotdot entry, if there is not enough space to hold FID-in-dirent,
-	 * just keep it there. It only happens when the device upgraded from 1.8
-	 * or restored from MDT file-level backup. For the whole directory, only
-	 * dotdot entry has no FID-in-dirent and needs to get FID from LMA when
-	 * readdir, it will not affect the performance much. */
+	/* For dot/dotdot entry, if there is not enough space to hold the
+	 * FID-in-dirent, just keep them there. It only happens when the
+	 * device was upgraded from 1.8 or restored from MDT file-level backup.
+	 * For the whole directory, only dot/dotdot entries lack FID-in-dirent
+	 * and need to get FID from LMA during readdir; it will not affect the
+	 * performance much. */
 	if ((bh == NULL) || (le32_to_cpu(de->inode) != ent->oied_ino) ||
-	    (is_dotdot && !osd_dirent_has_space(de->rec_len,
-						ent->oied_namelen,
-						sb->s_blocksize))) {
+	    (dot_dotdot != 0 && !osd_dot_dotdot_has_space(de, dot_dotdot))) {
 		*attr |= LUDA_IGNORE;
 		GOTO(out_journal, rc = 0);
 	}
@@ -4721,6 +4886,10 @@ again:
 		GOTO(out_journal, rc);
 	}
 
+	/* skip the REMOTE_PARENT_DIR. */
+	if (inode == dev->od_mdt_map->omm_remote_parent->d_inode)
+		GOTO(out_inode, rc = 0);
+
 	rc = osd_get_lma(info, inode, &info->oti_obj_dentry, lma);
 	if (rc == 0) {
 		if (fid_is_sane(fid)) {
@@ -4740,7 +4909,7 @@ again:
 				if (hlock != NULL)
 					ldiskfs_htree_unlock(hlock);
 				else
-					up_write(&obj->oo_ext_idx_sem);
+					up_read(&obj->oo_ext_idx_sem);
 				dev->od_dirent_journal = 1;
 				goto again;
 			}
@@ -4754,6 +4923,7 @@ again:
 		} else {
 			/* Do not repair under dryrun mode. */
 			if (*attr & LUDA_VERIFY_DRYRUN) {
+				*fid = lma->lma_self_fid;
 				*attr |= LUDA_REPAIR;
 				GOTO(out_inode, rc = 0);
 			}
@@ -4764,7 +4934,7 @@ again:
 				if (hlock != NULL)
 					ldiskfs_htree_unlock(hlock);
 				else
-					up_write(&obj->oo_ext_idx_sem);
+					up_read(&obj->oo_ext_idx_sem);
 				dev->od_dirent_journal = 1;
 				goto again;
 			}
@@ -4780,10 +4950,13 @@ again:
 	} else if (rc == -ENODATA) {
 		/* Do not repair under dryrun mode. */
 		if (*attr & LUDA_VERIFY_DRYRUN) {
-			if (fid_is_sane(fid))
+			if (fid_is_sane(fid)) {
 				*attr |= LUDA_REPAIR;
-			else
+			} else {
+				lu_igif_build(fid, inode->i_ino,
+					      inode->i_generation);
 				*attr |= LUDA_UPGRADE;
+			}
 			GOTO(out_inode, rc = 0);
 		}
 
@@ -4793,7 +4966,7 @@ again:
 			if (hlock != NULL)
 				ldiskfs_htree_unlock(hlock);
 			else
-				up_write(&obj->oo_ext_idx_sem);
+				up_read(&obj->oo_ext_idx_sem);
 			dev->od_dirent_journal = 1;
 			goto again;
 		}
@@ -4823,10 +4996,14 @@ out_inode:
 
 out_journal:
 	brelse(bh);
-	if (hlock != NULL)
+	if (hlock != NULL) {
 		ldiskfs_htree_unlock(hlock);
-	else
-		up_write(&obj->oo_ext_idx_sem);
+	} else {
+		if (dev->od_dirent_journal)
+			up_write(&obj->oo_ext_idx_sem);
+		else
+			up_read(&obj->oo_ext_idx_sem);
+	}
 	if (jh != NULL)
 		ldiskfs_journal_stop(jh);
 	if (rc >= 0 && !dirty)
@@ -4895,10 +5072,8 @@ pack:
 	if (osd_remote_fid(env, dev, fid))
 		RETURN(0);
 
-	if (likely(!(attr & LUDA_IGNORE))) {
-		oic->oic_lid = *id;
-		oic->oic_fid = *fid;
-	}
+	if (likely(!(attr & LUDA_IGNORE)))
+		rc = osd_add_oi_cache(oti, dev, id, fid);
 
 	if (!(attr & LUDA_VERIFY) &&
 	    (scrub->os_pos_current <= ino) &&
@@ -5081,20 +5256,33 @@ static int osd_shutdown(const struct lu_env *env, struct osd_device *o)
 {
 	ENTRY;
 
-	osd_scrub_cleanup(env, o);
+	/* shutdown quota slave instance associated with the device */
+	if (o->od_quota_slave != NULL) {
+		qsd_fini(env, o->od_quota_slave);
+		o->od_quota_slave = NULL;
+	}
+
+	RETURN(0);
+}
+
+static void osd_umount(const struct lu_env *env, struct osd_device *o)
+{
+	ENTRY;
 
 	if (o->od_fsops) {
 		fsfilt_put_ops(o->od_fsops);
 		o->od_fsops = NULL;
 	}
 
-	/* shutdown quota slave instance associated with the device */
-	if (o->od_quota_slave != NULL) {
-		qsd_fini(env, o->od_quota_slave);
-		o->od_quota_slave = NULL;
+	if (o->od_mnt != NULL) {
+		shrink_dcache_sb(osd_sb(o));
+		osd_sync(env, &o->od_dt_dev);
+
+		mntput(o->od_mnt);
+		o->od_mnt = NULL;
 	}
 
-	RETURN(0);
+	EXIT;
 }
 
 static int osd_mount(const struct lu_env *env,
@@ -5196,30 +5384,18 @@ out:
 }
 
 static struct lu_device *osd_device_fini(const struct lu_env *env,
-                                         struct lu_device *d)
+					 struct lu_device *d)
 {
-        int rc;
-        ENTRY;
-
-	rc = osd_shutdown(env, osd_dev(d));
-
-	osd_obj_map_fini(osd_dev(d));
-
-        shrink_dcache_sb(osd_sb(osd_dev(d)));
-        osd_sync(env, lu2dt_dev(d));
-
-        rc = osd_procfs_fini(osd_dev(d));
-        if (rc) {
-                CERROR("proc fini error %d \n", rc);
-                RETURN (ERR_PTR(rc));
-        }
+	struct osd_device *o = osd_dev(d);
+	ENTRY;
 
-	if (osd_dev(d)->od_mnt) {
-		mntput(osd_dev(d)->od_mnt);
-		osd_dev(d)->od_mnt = NULL;
-	}
+	osd_procfs_fini(o);
+	osd_shutdown(env, o);
+	osd_scrub_cleanup(env, o);
+	osd_obj_map_fini(o);
+	osd_umount(env, o);
 
-        RETURN(NULL);
+	RETURN(NULL);
 }
 
 static int osd_device_init0(const struct lu_env *env,
@@ -5257,12 +5433,6 @@ static int osd_device_init0(const struct lu_env *env,
 	if (rc)
 		GOTO(out_capa, rc);
 
-	CFS_INIT_LIST_HEAD(&o->od_ios_list);
-	/* setup scrub, including OI files initialization */
-	rc = osd_scrub_setup(env, o);
-	if (rc < 0)
-		GOTO(out_mnt, rc);
-
 	cplen = strlcpy(o->od_svname, lustre_cfg_string(cfg, 4),
 			sizeof(o->od_svname));
 	if (cplen >= sizeof(o->od_svname)) {
@@ -5272,22 +5442,28 @@ static int osd_device_init0(const struct lu_env *env,
 
 	rc = osd_obj_map_init(env, o);
 	if (rc != 0)
-		GOTO(out_scrub, rc);
+		GOTO(out_mnt, rc);
 
 	rc = lu_site_init(&o->od_site, l);
-	if (rc)
+	if (rc != 0)
 		GOTO(out_compat, rc);
 	o->od_site.ls_bottom_dev = l;
 
 	rc = lu_site_init_finish(&o->od_site);
-	if (rc)
+	if (rc != 0)
+		GOTO(out_site, rc);
+
+	CFS_INIT_LIST_HEAD(&o->od_ios_list);
+	/* setup scrub, including OI files initialization */
+	rc = osd_scrub_setup(env, o);
+	if (rc < 0)
 		GOTO(out_site, rc);
 
 	rc = osd_procfs_init(o, o->od_svname);
 	if (rc != 0) {
 		CERROR("%s: can't initialize procfs: rc = %d\n",
 		       o->od_svname, rc);
-		GOTO(out_site, rc);
+		GOTO(out_scrub, rc);
 	}
 
 	LASSERT(l->ld_site->ls_linkage.next && l->ld_site->ls_linkage.prev);
@@ -5302,23 +5478,21 @@ static int osd_device_init0(const struct lu_env *env,
 	}
 
 	RETURN(0);
+
 out_procfs:
 	osd_procfs_fini(o);
+out_scrub:
+	osd_scrub_cleanup(env, o);
 out_site:
 	lu_site_fini(&o->od_site);
 out_compat:
 	osd_obj_map_fini(o);
-out_scrub:
-	osd_scrub_cleanup(env, o);
 out_mnt:
-	osd_oi_fini(info, o);
-	osd_shutdown(env, o);
-	mntput(o->od_mnt);
-	o->od_mnt = NULL;
+	osd_umount(env, o);
 out_capa:
 	cleanup_capa_hash(o->od_capa_hash);
 out:
-	RETURN(rc);
+	return rc;
 }
 
 static struct lu_device *osd_device_alloc(const struct lu_env *env,
@@ -5465,15 +5639,6 @@ static int osd_prepare(const struct lu_env *env, struct lu_device *pdev,
 	int		   result = 0;
 	ENTRY;
 
-	if (dev->ld_site && lu_device_is_md(dev->ld_site->ls_top_dev)) {
-		/* MDT/MDD still use old infrastructure to create
-		 * special files */
-		result = llo_local_objects_setup(env, lu2md_dev(pdev),
-						 lu2dt_dev(dev));
-		if (result)
-			RETURN(result);
-	}
-
 	if (osd->od_quota_slave != NULL)
 		/* set up quota slave objects */
 		result = qsd_prepare(env, osd->od_quota_slave);