From 89ead218ebe99a955afc0bc7f6aba83ef35019fb Mon Sep 17 00:00:00 2001
From: Fan Yong <fan.yong@intel.com>
Date: Thu, 18 Jan 2018 09:34:50 +0800
Subject: [PATCH] LU-7585 zfs: OI scrub for ZFS

The ZFS backend OI scrub is used for verifying OI mappings
consistency. ZFS has mechanisms to maintain data
integrity, but data corruption is still possible,
especially after data migration from another backend,
such as ldiskfs, or server side data backup and restore.
The OI scrub can check OI mappings consistency and rebuild
them when needed.

The ZFS backend OI scrub shares the same control interface
as ldiskfs backend. It can be triggered manually via the
lctl command:
lctl lfsck_start -M $device -t scrub

It can also be triggered automatically when an inconsistency
is detected, unless 'auto_scrub' is disabled; that can be
controlled via:
lctl set_param -n osd-zfs.*.auto_scrub=xxx

You can check the OI scrub status in the same way as for
the ldiskfs backend:
lctl get_param -n osd-zfs.*.oi_scrub

Test-Parameters: envdefinitions=SLOW=yes testlist=sanity-scrub mdtfilesystemtype=zfs ostfilesystemtype=zfs mdscount=2 mdtcount=4
Signed-off-by: Fan Yong <fan.yong@intel.com>
Change-Id: I59ae3142ecd7b27f48b14f2a2d1d110d9c8296e3
Reviewed-on: https://review.whamcloud.com/30909
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Alex Zhuravlev <alexey.zhuravlev@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
---
 lustre/include/obd_support.h     |    1 +
 lustre/lfsck/lfsck_lib.c         |    3 +-
 lustre/mgs/mgs_handler.c         |    4 +-
 lustre/ofd/ofd_dev.c             |    2 +-
 lustre/osd-ldiskfs/osd_handler.c |    2 +-
 lustre/osd-zfs/Makefile.in       |    1 +
 lustre/osd-zfs/osd_handler.c     |   22 +-
 lustre/osd-zfs/osd_index.c       |  478 ++++++-----
 lustre/osd-zfs/osd_internal.h    |   99 ++-
 lustre/osd-zfs/osd_lproc.c       |   53 ++
 lustre/osd-zfs/osd_object.c      |  237 +++++-
 lustre/osd-zfs/osd_oi.c          |  276 ++++--
 lustre/osd-zfs/osd_scrub.c       | 1743 ++++++++++++++++++++++++++++++++++++++
 lustre/tests/sanity-scrub.sh     |  311 +++----
 14 files changed, 2723 insertions(+), 509 deletions(-)
 create mode 100644 lustre/osd-zfs/osd_scrub.c

diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h
index 063821f..af090df 100644
--- a/lustre/include/obd_support.h
+++ b/lustre/include/obd_support.h
@@ -272,6 +272,7 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_OSD_COMPAT_INVALID_ENTRY		0x195
 #define OBD_FAIL_OSD_COMPAT_NO_ENTRY			0x196
 #define OBD_FAIL_OSD_OST_EA_FID_SET			0x197
+#define OBD_FAIL_OSD_NO_OI_ENTRY			0x198
 
 #define OBD_FAIL_OST                     0x200
 #define OBD_FAIL_OST_CONNECT_NET         0x201
diff --git a/lustre/lfsck/lfsck_lib.c b/lustre/lfsck/lfsck_lib.c
index be9e4d3..a99434f 100644
--- a/lustre/lfsck/lfsck_lib.c
+++ b/lustre/lfsck/lfsck_lib.c
@@ -1809,7 +1809,8 @@ void lfsck_pos_fill(const struct lu_env *env, struct lfsck_instance *lfsck,
 	if (!lfsck->li_current_oit_processed && !init)
 		pos->lp_oit_cookie--;
 
-	LASSERT(pos->lp_oit_cookie > 0);
+	if (unlikely(pos->lp_oit_cookie == 0))
+		pos->lp_oit_cookie = 1;
 
 	if (lfsck->li_di_dir != NULL) {
 		struct dt_object *dto = lfsck->li_obj_dir;
diff --git a/lustre/mgs/mgs_handler.c b/lustre/mgs/mgs_handler.c
index 6db1b45..75b9c1b 100644
--- a/lustre/mgs/mgs_handler.c
+++ b/lustre/mgs/mgs_handler.c
@@ -1400,7 +1400,7 @@ err_ns:
 err_ops:
 	lu_site_purge(env, mgs2lu_dev(mgs)->ld_site, ~0);
 	if (!cfs_hash_is_empty(mgs2lu_dev(mgs)->ld_site->ls_obj_hash)) {
-		LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_ERROR, NULL);
+		LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_OTHER, NULL);
 		lu_site_print(env, mgs2lu_dev(mgs)->ld_site, &msgdata,
 				lu_cdebug_printer);
 	}
@@ -1576,7 +1576,7 @@ static struct lu_device *mgs_device_fini(const struct lu_env *env,
 
 	lu_site_purge(env, d->ld_site, ~0);
 	if (!cfs_hash_is_empty(d->ld_site->ls_obj_hash)) {
-		LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_ERROR, NULL);
+		LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_OTHER, NULL);
 		lu_site_print(env, d->ld_site, &msgdata, lu_cdebug_printer);
 	}
 
diff --git a/lustre/ofd/ofd_dev.c b/lustre/ofd/ofd_dev.c
index 3ef00be..da8b008 100644
--- a/lustre/ofd/ofd_dev.c
+++ b/lustre/ofd/ofd_dev.c
@@ -247,7 +247,7 @@ static void ofd_stack_fini(const struct lu_env *env, struct ofd_device *m,
 
 	lu_site_purge(env, top->ld_site, ~0);
 	if (!cfs_hash_is_empty(top->ld_site->ls_obj_hash)) {
-		LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_ERROR, NULL);
+		LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_OTHER, NULL);
 		lu_site_print(env, top->ld_site, &msgdata, lu_cdebug_printer);
 	}
 
diff --git a/lustre/osd-ldiskfs/osd_handler.c b/lustre/osd-ldiskfs/osd_handler.c
index 9610d3a..2b699a0 100644
--- a/lustre/osd-ldiskfs/osd_handler.c
+++ b/lustre/osd-ldiskfs/osd_handler.c
@@ -3669,7 +3669,7 @@ static int osd_create(const struct lu_env *env, struct dt_object *dt,
 			obj->oo_dt.do_body_ops = &osd_body_ops;
 	}
 
-	if (result == 0)
+	if (!result && !CFS_FAIL_CHECK(OBD_FAIL_OSD_NO_OI_ENTRY))
 		result = __osd_oi_insert(env, obj, fid, th);
 
 	/* a small optimization - dt_insert() isn't usually applied
diff --git a/lustre/osd-zfs/Makefile.in b/lustre/osd-zfs/Makefile.in
index 6ffa654..4483348 100644
--- a/lustre/osd-zfs/Makefile.in
+++ b/lustre/osd-zfs/Makefile.in
@@ -1,6 +1,7 @@
 MODULES := osd_zfs
 osd_zfs-objs := osd_handler.o osd_lproc.o osd_quota.o
 osd_zfs-objs += osd_object.o osd_io.o osd_oi.o osd_xattr.o osd_index.o
+osd_zfs-objs += osd_scrub.o
 
 EXTRA_PRE_CFLAGS += -include @SPL_OBJ@/spl_config.h
 EXTRA_PRE_CFLAGS += -include @ZFS_OBJ@/zfs_config.h
diff --git a/lustre/osd-zfs/osd_handler.c b/lustre/osd-zfs/osd_handler.c
index 74db348..b55d329 100644
--- a/lustre/osd-zfs/osd_handler.c
+++ b/lustre/osd-zfs/osd_handler.c
@@ -1058,6 +1058,8 @@ static int osd_mount(const struct lu_env *env,
 	if (rc >= sizeof(o->od_svname))
 		RETURN(-E2BIG);
 
+	o->od_index = -1; /* -1 means index is invalid */
+	rc = server_name2index(o->od_svname, &o->od_index, NULL);
 	str = strstr(str, ":");
 	if (str) {
 		unsigned long flags;
@@ -1071,6 +1073,9 @@ static int osd_mount(const struct lu_env *env,
 			LCONSOLE_WARN("%s: set dev_rdonly on this device\n",
 				      svname);
 		}
+
+		if (flags & LMD_FLG_NOSCRUB)
+			o->od_auto_scrub_interval = AS_NEVER;
 	}
 
 	if (server_name_is_ost(o->od_svname))
@@ -1108,11 +1113,6 @@ static int osd_mount(const struct lu_env *env,
 	}
 #endif
 
-	/* 1. initialize oi before any file create or file open */
-	rc = osd_oi_init(env, o);
-	if (rc)
-		GOTO(err, rc);
-
 	rc = lu_site_init(&o->od_site, osd2lu_dev(o));
 	if (rc)
 		GOTO(err, rc);
@@ -1126,6 +1126,12 @@ static int osd_mount(const struct lu_env *env,
 	if (rc)
 		GOTO(err, rc);
 
+	o->od_in_init = 1;
+	rc = osd_scrub_setup(env, o);
+	o->od_in_init = 0;
+	if (rc)
+		GOTO(err, rc);
+
 	rc = osd_procfs_init(o, o->od_svname);
 	if (rc)
 		GOTO(err, rc);
@@ -1222,6 +1228,9 @@ static int osd_device_init0(const struct lu_env *env,
 
 	l->ld_ops = &osd_lu_ops;
 	o->od_dt_dev.dd_ops = &osd_dt_ops;
+	sema_init(&o->od_otable_sem, 1);
+	INIT_LIST_HEAD(&o->od_ios_list);
+	o->od_auto_scrub_interval = AS_DEFAULT;
 
 out:
 	RETURN(rc);
@@ -1304,7 +1313,7 @@ static struct lu_device *osd_device_fini(const struct lu_env *env,
 
 	/* now with all the callbacks completed we can cleanup the remainings */
 	osd_shutdown(env, o);
-	osd_oi_fini(env, o);
+	osd_scrub_cleanup(env, o);
 
 	rc = osd_procfs_fini(o);
 	if (rc) {
@@ -1552,7 +1561,6 @@ static void __exit osd_exit(void)
 	lu_kmem_fini(osd_caches);
 }
 
-extern unsigned int osd_oi_count;
 module_param(osd_oi_count, int, 0444);
 MODULE_PARM_DESC(osd_oi_count, "Number of Object Index containers to be created, it's only valid for new filesystem.");
 
diff --git a/lustre/osd-zfs/osd_index.c b/lustre/osd-zfs/osd_index.c
index 09c3057..20f86f2 100644
--- a/lustre/osd-zfs/osd_index.c
+++ b/lustre/osd-zfs/osd_index.c
@@ -331,7 +331,7 @@ out:
  */
 static int osd_find_parent_by_dnode(const struct lu_env *env,
 				    struct dt_object *o,
-				    struct lu_fid *fid)
+				    struct lu_fid *fid, uint64_t *oid)
 {
 	struct osd_object	*obj = osd_dt_obj(o);
 	struct osd_device	*osd = osd_obj2dev(obj);
@@ -344,14 +344,17 @@ static int osd_find_parent_by_dnode(const struct lu_env *env,
 	if (rc != 0)
 		RETURN(rc);
 	rc = -sa_lookup(obj->oo_sa_hdl, SA_ZPL_PARENT(osd), &dnode, 8);
-	if (rc == 0)
+	if (!rc) {
+		if (oid)
+			*oid = dnode;
 		rc = osd_get_fid_by_oid(env, osd, dnode, fid);
+	}
 
 	RETURN(rc);
 }
 
 static int osd_find_parent_fid(const struct lu_env *env, struct dt_object *o,
-			       struct lu_fid *fid)
+			       struct lu_fid *fid, uint64_t *oid)
 {
 	struct link_ea_header  *leh;
 	struct link_ea_entry   *lee;
@@ -404,7 +407,7 @@ out:
 	if (rc == 0) {
 		struct lu_fid fid2;
 		int rc2;
-		rc2 = osd_find_parent_by_dnode(env, o, &fid2);
+		rc2 = osd_find_parent_by_dnode(env, o, &fid2, oid);
 		if (rc2 == 0)
 			if (lu_fid_eq(fid, &fid2) == 0)
 				CERROR("wrong parent: "DFID" != "DFID"\n",
@@ -414,19 +417,241 @@ out:
 
 	/* no LinkEA is found, let's try to find the fid in parent's LMA */
 	if (unlikely(rc != 0))
-		rc = osd_find_parent_by_dnode(env, o, fid);
+		rc = osd_find_parent_by_dnode(env, o, fid, oid);
 
 	RETURN(rc);
 }
 
+/*
+ * When lookup item under striped directory, we need to locate the master
+ * MDT-object of the striped directory firstly, then the client will send
+ * lookup (getattr_by_name) RPC to the MDT with some slave MDT-object's FID
+ * and the item's name. If the system is restored from MDT file level backup,
+ * then before the OI scrub completely built the OI files, the OI mappings of
+ * the master MDT-object and slave MDT-object may be invalid. Usually, it is
+ * not a problem for the master MDT-object. Because when locate the master
+ * MDT-object, we will do name based lookup (for the striped directory itself)
+ * firstly, during such process we can setup the correct OI mapping for the
+ * master MDT-object. But it will be trouble for the slave MDT-object. Because
+ * the client will not trigger name based lookup on the MDT to locate the slave
+ * MDT-object before locating item under the striped directory, then when
+ * osd_fid_lookup(), it will find that the OI mapping for the slave MDT-object
+ * is invalid and does not know what the right OI mapping is, then the MDT has
+ * to return -EINPROGRESS to the client to notify that the OI scrub is
+ * rebuilding the OI file, related OI mapping is unknown yet, please try again
+ * later. And then client will re-try the RPC again and again until related OI
+ * mapping has been updated. That is quite inefficient.
+ *
+ * To resolve above trouble, we will handle it as the following two cases:
+ *
+ * 1) The slave MDT-object and the master MDT-object are on different MDTs.
+ *    It is relative easy. Be as one of remote MDT-objects, the slave MDT-object
+ *    is linked under /REMOTE_PARENT_DIR with the name of its FID string.
+ *    We can locate the slave MDT-object via lookup the /REMOTE_PARENT_DIR
+ *    directly. Please check osd_fid_lookup().
+ *
+ * 2) The slave MDT-object and the master MDT-object reside on the same MDT.
+ *    Under such case, during lookup the master MDT-object, we will lookup the
+ *    slave MDT-object via readdir against the master MDT-object, because the
+ *    slave MDT-objects information are stored as sub-directories with the name
+ *    "${FID}:${index}". Then when find the local slave MDT-object, its OI
+ *    mapping will be recorded. Then subsequent osd_fid_lookup() will know
+ *    the correct OI mapping for the slave MDT-object.
+ */
+static int osd_check_lmv(const struct lu_env *env, struct osd_device *osd,
+			 uint64_t oid, const struct lu_fid *fid)
+{
+	struct osd_thread_info *info = osd_oti_get(env);
+	struct luz_direntry *zde = &info->oti_zde;
+	zap_attribute_t *za = &info->oti_za;
+	zap_cursor_t *zc = &info->oti_zc;
+	struct lu_fid *tfid = &info->oti_fid;
+	nvlist_t *nvbuf = NULL;
+	struct lmv_mds_md_v1 *lmv = NULL;
+	int size;
+	int rc;
+	ENTRY;
+
+	rc = __osd_xattr_load_by_oid(osd, oid, &nvbuf);
+	if (rc == -ENOENT || rc == -EEXIST || rc == -ENODATA)
+		RETURN(0);
+
+	if (rc)
+		RETURN(rc);
+
+	rc = -nvlist_lookup_byte_array(nvbuf, XATTR_NAME_LMV,
+				       (uchar_t **)&lmv, &size);
+	if (rc == -ENOENT || rc == -EEXIST || rc == -ENODATA)
+		GOTO(out_nvbuf, rc = 0);
+
+	if (rc || le32_to_cpu(lmv->lmv_magic) != LMV_MAGIC_V1)
+		GOTO(out_nvbuf, rc);
+
+	zap_cursor_init_serialized(zc, osd->od_os, oid, 0);
+	rc = -zap_cursor_retrieve(zc, za);
+	if (rc == -ENOENT) {
+		zap_cursor_advance(zc);
+	} else if (rc) {
+		CERROR("%s: fail to init for check LMV "DFID"(%llu): rc = %d\n",
+		       osd_name(osd), PFID(fid), oid, rc);
+		GOTO(out_zc, rc);
+	}
+
+	while (1) {
+		rc = -zap_cursor_retrieve(zc, za);
+		if (rc == -ENOENT)
+			GOTO(out_zc, rc = 0);
+
+		if (rc) {
+			CERROR("%s: fail to locate next for check LMV "
+			       DFID"(%llu): rc = %d\n",
+			       osd_name(osd), PFID(fid), oid, rc);
+			GOTO(out_zc, rc);
+		}
+
+		fid_zero(tfid);
+		sscanf(za->za_name + 1, SFID, RFID(tfid));
+		if (fid_is_sane(tfid) && !osd_remote_fid(env, osd, tfid)) {
+			rc = osd_zap_lookup(osd, oid, NULL, za->za_name,
+					za->za_integer_length,
+					sizeof(*zde) / za->za_integer_length,
+					(void *)zde);
+			if (rc) {
+				CERROR("%s: fail to lookup for check LMV "
+				       DFID"(%llu): rc = %d\n",
+				       osd_name(osd), PFID(fid), oid, rc);
+				GOTO(out_zc, rc);
+			}
+
+			rc = osd_oii_insert(env, osd, tfid,
+					    zde->lzd_reg.zde_dnode, false);
+			GOTO(out_zc, rc);
+		}
+
+		zap_cursor_advance(zc);
+	}
+
+out_zc:
+	zap_cursor_fini(zc);
+out_nvbuf:
+	nvlist_free(nvbuf);
+
+	return rc;
+}
+
+static int
+osd_consistency_check(const struct lu_env *env, struct osd_device *osd,
+		      struct osd_object *obj, const struct lu_fid *fid,
+		      uint64_t oid, bool is_dir)
+{
+	struct lustre_scrub *scrub = &osd->od_scrub;
+	dnode_t *dn = NULL;
+	uint64_t oid2;
+	int once = 0;
+	bool insert;
+	int rc;
+	ENTRY;
+
+	if (!fid_is_norm(fid) && !fid_is_igif(fid))
+		RETURN(0);
+
+	/* oid == ZFS_NO_OBJECT must be for lookup ".." case */
+	if (oid == ZFS_NO_OBJECT) {
+		rc = osd_sa_handle_get(obj);
+		if (rc)
+			RETURN(rc);
+
+		rc = -sa_lookup(obj->oo_sa_hdl, SA_ZPL_PARENT(osd), &oid, 8);
+		if (rc)
+			RETURN(rc);
+	}
+
+	if (thread_is_running(&scrub->os_thread)) {
+		if (scrub->os_pos_current > oid)
+			RETURN(0);
+	} else if (osd->od_auto_scrub_interval == AS_NEVER) {
+		RETURN(0);
+	} else {
+		if (cfs_time_before(cfs_time_current_sec(),
+				    scrub->os_file.sf_time_last_complete +
+				    osd->od_auto_scrub_interval))
+			RETURN(0);
+	}
+
+again:
+	rc = osd_fid_lookup(env, osd, fid, &oid2);
+	if (rc == -ENOENT) {
+		insert = true;
+		if (dn)
+			goto trigger;
+
+		rc = __osd_obj2dnode(osd->od_os, oid, &dn);
+		/* The object has been removed (by race maybe). */
+		if (rc)
+			RETURN(rc = (rc == -EEXIST ? -ENOENT : rc));
+
+		goto trigger;
+	} else if (rc || oid == oid2) {
+		GOTO(out, rc);
+	}
+
+	insert = false;
+
+trigger:
+	if (thread_is_running(&scrub->os_thread)) {
+		if (!dn) {
+			rc = __osd_obj2dnode(osd->od_os, oid, &dn);
+			/* The object has been removed (by race maybe). */
+			if (rc)
+				RETURN(rc = (rc == -EEXIST ? -ENOENT : rc));
+		}
+
+		rc = osd_oii_insert(env, osd, fid, oid, insert);
+		/* There is race condition between osd_oi_lookup and OI scrub.
+		 * The OI scrub finished just after osd_oi_lookup() failure.
+		 * Under such case, it is unnecessary to trigger OI scrub again,
+		 * but try to call osd_oi_lookup() again. */
+		if (unlikely(rc == -EAGAIN))
+			goto again;
+
+		if (is_dir)
+			rc = osd_check_lmv(env, osd, oid, fid);
+		else
+			rc = 0;
+
+		GOTO(out, rc);
+	}
+
+	if (osd->od_auto_scrub_interval != AS_NEVER && ++once == 1) {
+		rc = osd_scrub_start(env, osd, SS_AUTO_FULL |
+				     SS_CLEAR_DRYRUN | SS_CLEAR_FAILOUT);
+		CDEBUG(D_LFSCK | D_CONSOLE | D_WARNING,
+		       "%s: trigger partial OI scrub for RPC inconsistency "
+		       "checking FID "DFID": rc = %d\n",
+		       osd_name(osd), PFID(fid), rc);
+		if (!rc)
+			goto again;
+	}
+
+	GOTO(out, rc);
+
+out:
+	if (dn)
+		osd_dnode_rele(dn);
+
+	return rc;
+}
+
 static int osd_dir_lookup(const struct lu_env *env, struct dt_object *dt,
 			  struct dt_rec *rec, const struct dt_key *key)
 {
 	struct osd_thread_info *oti = osd_oti_get(env);
-	struct osd_object  *obj = osd_dt_obj(dt);
-	struct osd_device  *osd = osd_obj2dev(obj);
-	char		   *name = (char *)key;
-	int                 rc;
+	struct osd_object *obj = osd_dt_obj(dt);
+	struct osd_device *osd = osd_obj2dev(obj);
+	struct lu_fid *fid = (struct lu_fid *)rec;
+	char *name = (char *)key;
+	uint64_t oid = ZFS_NO_OBJECT;
+	int rc;
 	ENTRY;
 
 	if (name[0] == '.') {
@@ -435,8 +660,8 @@ static int osd_dir_lookup(const struct lu_env *env, struct dt_object *dt,
 			memcpy(rec, f, sizeof(*f));
 			RETURN(1);
 		} else if (name[1] == '.' && name[2] == 0) {
-			rc = osd_find_parent_fid(env, dt, (struct lu_fid *)rec);
-			RETURN(rc == 0 ? 1 : rc);
+			rc = osd_find_parent_fid(env, dt, fid, &oid);
+			GOTO(out, rc);
 		}
 	}
 
@@ -447,15 +672,26 @@ static int osd_dir_lookup(const struct lu_env *env, struct dt_object *dt,
 	if (rc != 0)
 		RETURN(rc);
 
+	oid = oti->oti_zde.lzd_reg.zde_dnode;
 	if (likely(fid_is_sane(&oti->oti_zde.lzd_fid))) {
 		memcpy(rec, &oti->oti_zde.lzd_fid, sizeof(struct lu_fid));
-		RETURN(1);
+		GOTO(out, rc = 0);
 	}
 
-	rc = osd_get_fid_by_oid(env, osd, oti->oti_zde.lzd_reg.zde_dnode,
-				(struct lu_fid *)rec);
+	rc = osd_get_fid_by_oid(env, osd, oti->oti_zde.lzd_reg.zde_dnode, fid);
+
+	GOTO(out, rc);
+
+out:
+	if (!rc && !osd_remote_fid(env, osd, fid)) {
+		rc = osd_consistency_check(env, osd, obj, fid, oid,
+				S_ISDIR(DTTOIF(oti->oti_zde.lzd_reg.zde_type)));
+		/* Only -ENOENT error will affect the lookup result. */
+		if (rc != -ENOENT)
+			rc = 0;
+	}
 
-	RETURN(rc == 0 ? 1 : (rc == -ENOENT ? -ENODATA : rc));
+	return rc == 0 ? 1 : (rc == -ENOENT ? -ENODATA : rc);
 }
 
 /*
@@ -1266,7 +1502,7 @@ static int osd_dir_it_rec(const struct lu_env *env, const struct dt_it *di,
 		lde->lde_hash = cpu_to_le64(2);
 		strcpy(lde->lde_name, "..");
 		lde->lde_namelen = cpu_to_le16(2);
-		rc = osd_find_parent_fid(env, &it->ozi_obj->oo_dt, fid);
+		rc = osd_find_parent_fid(env, &it->ozi_obj->oo_dt, fid, NULL);
 		if (!rc) {
 			fid_cpu_to_le(&lde->lde_fid, fid);
 			lde->lde_attrs = LUDA_FID;
@@ -1772,214 +2008,6 @@ static struct dt_index_operations osd_index_ops = {
 	}
 };
 
-struct osd_metadnode_it {
-	struct osd_device       *mit_dev;
-	__u64			 mit_pos;
-	struct lu_fid		 mit_fid;
-	int			 mit_prefetched;
-	__u64			 mit_prefetched_dnode;
-};
-
-static struct dt_it *osd_zfs_otable_it_init(const struct lu_env *env,
-					    struct dt_object *dt, __u32 attr)
-{
-	struct osd_device	*dev   = osd_dev(dt->do_lu.lo_dev);
-	struct osd_metadnode_it *it;
-	ENTRY;
-
-	OBD_ALLOC_PTR(it);
-	if (unlikely(it == NULL))
-		RETURN(ERR_PTR(-ENOMEM));
-
-	it->mit_dev = dev;
-
-	/* XXX: dmu_object_next() does NOT find dnodes allocated
-	 *	in the current non-committed txg, so we force txg
-	 *	commit to find all existing dnodes ... */
-	if (!dev->od_dt_dev.dd_rdonly)
-		txg_wait_synced(dmu_objset_pool(dev->od_os), 0ULL);
-
-	RETURN((struct dt_it *)it);
-}
-
-static void osd_zfs_otable_it_fini(const struct lu_env *env, struct dt_it *di)
-{
-	struct osd_metadnode_it *it  = (struct osd_metadnode_it *)di;
-
-	OBD_FREE_PTR(it);
-}
-
-static int osd_zfs_otable_it_get(const struct lu_env *env,
-				 struct dt_it *di, const struct dt_key *key)
-{
-	return 0;
-}
-
-static void osd_zfs_otable_it_put(const struct lu_env *env, struct dt_it *di)
-{
-}
-
-#define OTABLE_PREFETCH		256
-
-static void osd_zfs_otable_prefetch(const struct lu_env *env,
-				    struct osd_metadnode_it *it)
-{
-	struct osd_device	*dev = it->mit_dev;
-	int			 rc;
-
-	/* can go negative on the very first access to the iterator
-	 * or if some non-Lustre objects were found */
-	if (unlikely(it->mit_prefetched < 0))
-		it->mit_prefetched = 0;
-
-	if (it->mit_prefetched >= (OTABLE_PREFETCH >> 1))
-		return;
-
-	if (it->mit_prefetched_dnode == 0)
-		it->mit_prefetched_dnode = it->mit_pos;
-
-	while (it->mit_prefetched < OTABLE_PREFETCH) {
-		rc = -dmu_object_next(dev->od_os, &it->mit_prefetched_dnode,
-				      B_FALSE, 0);
-		if (unlikely(rc != 0))
-			break;
-
-		osd_dmu_prefetch(dev->od_os, it->mit_prefetched_dnode,
-				 0, 0, 0, ZIO_PRIORITY_ASYNC_READ);
-
-		it->mit_prefetched++;
-	}
-}
-
-static int osd_zfs_otable_it_next(const struct lu_env *env, struct dt_it *di)
-{
-	struct osd_metadnode_it *it  = (struct osd_metadnode_it *)di;
-	struct lustre_mdt_attrs *lma;
-	struct osd_device	*dev = it->mit_dev;
-	nvlist_t		*nvbuf = NULL;
-	uchar_t			*v;
-	__u64			 dnode;
-	int			 rc, s;
-
-	memset(&it->mit_fid, 0, sizeof(it->mit_fid));
-
-	dnode = it->mit_pos;
-	do {
-		rc = -dmu_object_next(dev->od_os, &it->mit_pos, B_FALSE, 0);
-		if (unlikely(rc != 0))
-			GOTO(out, rc = 1);
-		it->mit_prefetched--;
-
-		/* LMA is required for this to be a Lustre object.
-		 * If there is no xattr skip it. */
-		rc = __osd_xattr_load_by_oid(dev, it->mit_pos, &nvbuf);
-		if (unlikely(rc != 0))
-			continue;
-
-		LASSERT(nvbuf != NULL);
-		rc = -nvlist_lookup_byte_array(nvbuf, XATTR_NAME_LMA, &v, &s);
-		if (likely(rc == 0)) {
-			/* Lustre object */
-			lma = (struct lustre_mdt_attrs *)v;
-			lustre_lma_swab(lma);
-			if (likely(!(lma->lma_compat & LMAC_NOT_IN_OI) &&
-				   !(lma->lma_incompat & LMAI_AGENT))) {
-				it->mit_fid = lma->lma_self_fid;
-				nvlist_free(nvbuf);
-				break;
-			}
-		}
-
-		/* not a Lustre visible object, try next one */
-		nvlist_free(nvbuf);
-	} while (1);
-
-
-	/* we aren't prefetching in the above loop because the number of
-	 * non-Lustre objects is very small and we will be repeating very
-	 * rare. in case we want to use this to iterate over non-Lustre
-	 * objects (i.e. when we convert regular ZFS in Lustre) it makes
-	 * sense to initiate prefetching in the loop */
-
-	/* 0 - there are more items, +1 - the end */
-	if (likely(rc == 0))
-		osd_zfs_otable_prefetch(env, it);
-
-	CDEBUG(D_OTHER, "advance: %llu -> %llu "DFID": %d\n", dnode,
-	       it->mit_pos, PFID(&it->mit_fid), rc);
-
-out:
-	return rc;
-}
-
-static struct dt_key *osd_zfs_otable_it_key(const struct lu_env *env,
-					    const struct dt_it *di)
-{
-	return NULL;
-}
-
-static int osd_zfs_otable_it_key_size(const struct lu_env *env,
-				      const struct dt_it *di)
-{
-	return sizeof(__u64);
-}
-
-static int osd_zfs_otable_it_rec(const struct lu_env *env,
-				 const struct dt_it *di,
-				 struct dt_rec *rec, __u32 attr)
-{
-	struct osd_metadnode_it *it  = (struct osd_metadnode_it *)di;
-	struct lu_fid *fid = (struct lu_fid *)rec;
-	ENTRY;
-
-	*fid = it->mit_fid;
-
-	RETURN(0);
-}
-
-
-static __u64 osd_zfs_otable_it_store(const struct lu_env *env,
-				     const struct dt_it *di)
-{
-	struct osd_metadnode_it *it  = (struct osd_metadnode_it *)di;
-
-	return it->mit_pos;
-}
-
-static int osd_zfs_otable_it_load(const struct lu_env *env,
-				  const struct dt_it *di, __u64 hash)
-{
-	struct osd_metadnode_it *it  = (struct osd_metadnode_it *)di;
-
-	it->mit_pos = hash;
-	it->mit_prefetched = 0;
-	it->mit_prefetched_dnode = 0;
-
-	return osd_zfs_otable_it_next(env, (struct dt_it *)di);
-}
-
-static int osd_zfs_otable_it_key_rec(const struct lu_env *env,
-				     const struct dt_it *di, void *key_rec)
-{
-	return 0;
-}
-
-const struct dt_index_operations osd_zfs_otable_ops = {
-	.dio_it = {
-		.init     = osd_zfs_otable_it_init,
-		.fini     = osd_zfs_otable_it_fini,
-		.get      = osd_zfs_otable_it_get,
-		.put	  = osd_zfs_otable_it_put,
-		.next     = osd_zfs_otable_it_next,
-		.key	  = osd_zfs_otable_it_key,
-		.key_size = osd_zfs_otable_it_key_size,
-		.rec      = osd_zfs_otable_it_rec,
-		.store    = osd_zfs_otable_it_store,
-		.load     = osd_zfs_otable_it_load,
-		.key_rec  = osd_zfs_otable_it_key_rec,
-	}
-};
-
 int osd_index_try(const struct lu_env *env, struct dt_object *dt,
 		const struct dt_index_features *feat)
 {
@@ -1997,7 +2025,7 @@ int osd_index_try(const struct lu_env *env, struct dt_object *dt,
 		GOTO(out, rc = -ERANGE);
 
 	if (unlikely(feat == &dt_otable_features)) {
-		dt->do_index_ops = &osd_zfs_otable_ops;
+		dt->do_index_ops = &osd_otable_ops;
 		GOTO(out, rc = 0);
 	}
 
diff --git a/lustre/osd-zfs/osd_internal.h b/lustre/osd-zfs/osd_internal.h
index 8d5996a..37f24ea 100644
--- a/lustre/osd-zfs/osd_internal.h
+++ b/lustre/osd-zfs/osd_internal.h
@@ -43,6 +43,8 @@
 #include <dt_object.h>
 #include <md_object.h>
 #include <lustre_quota.h>
+#include <lustre_scrub.h>
+#include <obd.h>
 #ifdef SHRINK_STOP
 #undef SHRINK_STOP
 #endif
@@ -179,6 +181,38 @@ struct osd_idmap_cache {
 				oic_remote:1;      /* FID isn't local */
 };
 
+struct osd_inconsistent_item {
+	/* link into lustre_scrub::os_inconsistent_items,
+	 * protected by lustre_scrub::os_lock. */
+	struct list_head       oii_list;
+
+	/* The right FID <=> oid mapping. */
+	struct osd_idmap_cache oii_cache;
+
+	unsigned int	       oii_insert:1; /* insert or update mapping. */
+};
+
+struct osd_otable_it {
+	struct osd_device       *ooi_dev;
+	struct lu_fid		 ooi_fid;
+	__u64			 ooi_pos;
+	__u64			 ooi_prefetched_dnode;
+	int			 ooi_prefetched;
+
+	/* The following bits can be updated/checked w/o lock protection.
+	 * If more bits will be introduced in the future and need lock to
+	 * protect, please add comment. */
+	unsigned int		 ooi_used_outside:1, /* Some user out of OSD
+						      * uses the iteration. */
+				 ooi_all_cached:1, /* No more entries can be
+						    * filled into cache. */
+				 ooi_user_ready:1, /* The user out of OSD is
+						    * ready to iterate. */
+				 ooi_waiting:1; /* it::next is waiting. */
+};
+
+extern const struct dt_index_operations osd_otable_ops;
+
 /* max.number of regular attributes the callers may ask for */
 # define OSD_MAX_IN_BULK (sizeof(struct osa_attr)/sizeof(uint64_t))
 
@@ -218,6 +252,7 @@ struct osd_thread_info {
 	int		       oti_ins_cache_size;
 	int		       oti_ins_cache_used;
 	struct lu_buf	       oti_xattr_lbuf;
+	zap_cursor_t	       oti_zc;
 };
 
 extern struct lu_context_key osd_key;
@@ -295,13 +330,17 @@ struct osd_device {
 				 od_prop_rdonly:1,  /**< ZFS property readonly */
 				 od_xattr_in_sa:1,
 				 od_is_ost:1,
+				 od_in_init:1,
 				 od_posix_acl:1;
 	unsigned int		 od_dnsize;
 
 	char			 od_mntdev[128];
 	char			 od_svname[128];
+	char			 od_uuid[16];
 
 	int			 od_connects;
+	int			 od_index;
+	__s64			 od_auto_scrub_interval;
 	struct lu_site		 od_site;
 
 	dnode_t			*od_groupused_dn;
@@ -328,6 +367,11 @@ struct osd_device {
 
 	/* osd seq instance */
 	struct lu_client_seq	*od_cl_seq;
+
+	struct semaphore	 od_otable_sem;
+	struct osd_otable_it	*od_otable_it;
+	struct lustre_scrub	 od_scrub;
+	struct list_head	 od_ios_list;
 };
 
 enum osd_destroy_type {
@@ -388,6 +432,7 @@ struct osd_object {
 		};
 		uint64_t	oo_parent; /* used only at object creation */
 	};
+	struct lu_object_header *oo_header;
 };
 
 int osd_statfs(const struct lu_env *, struct dt_device *, struct obd_statfs *);
@@ -477,7 +522,33 @@ static inline struct seq_server_site *osd_seq_site(struct osd_device *osd)
 
 static inline char *osd_name(struct osd_device *osd)
 {
-	return osd->od_dt_dev.dd_lu_dev.ld_obd->obd_name;
+	return osd->od_svname;
+}
+
+static inline void zfs_set_bit(int nr, __u8 *addr)
+{
+	set_bit(nr, (unsigned long *)addr);
+}
+
+static inline int zfs_test_bit(int nr, __u8 *addr)
+{
+	return test_bit(nr, (const unsigned long *)addr);
+}
+
+static inline int osd_oi_fid2idx(struct osd_device *dev,
+				 const struct lu_fid *fid)
+{
+	return fid->f_seq & (dev->od_oi_count - 1);
+}
+
+static inline struct osd_oi *osd_fid2oi(struct osd_device *osd,
+					const struct lu_fid *fid)
+{
+	LASSERTF(osd->od_oi_table && osd->od_oi_count >= 1,
+		 "%s: "DFID", oi_count %d\n",
+		 osd_name(osd), PFID(fid), osd->od_oi_count);
+
+	return osd->od_oi_table[osd_oi_fid2idx(osd, fid)];
 }
 
 #ifdef CONFIG_PROC_FS
@@ -523,6 +594,9 @@ int __osd_object_create(const struct lu_env *env, struct osd_device *osd,
 int __osd_attr_init(const struct lu_env *env, struct osd_device *osd,
 		    struct osd_object *obj, sa_handle_t *sa_hdl, dmu_tx_t *tx,
 		    struct lu_attr *la, uint64_t parent, nvlist_t *);
+int osd_find_new_dnode(const struct lu_env *env, dmu_tx_t *tx,
+		       uint64_t oid, dnode_t **dnp);
+int osd_object_init0(const struct lu_env *env, struct osd_object *obj);
 
 /* osd_oi.c */
 int osd_oi_init(const struct lu_env *env, struct osd_device *o);
@@ -543,6 +617,17 @@ struct osd_idmap_cache *osd_idc_find_or_init(const struct lu_env *env,
 struct osd_idmap_cache *osd_idc_find(const struct lu_env *env,
 				     struct osd_device *osd,
 				     const struct lu_fid *fid);
+int osd_idc_find_and_init_with_oid(const struct lu_env *env,
+				   struct osd_device *osd,
+				   const struct lu_fid *fid,
+				   uint64_t oid);
+int fid_is_on_ost(const struct lu_env *env, struct osd_device *osd,
+		  const struct lu_fid *fid);
+int osd_obj_find_or_create(const struct lu_env *env, struct osd_device *o,
+			   uint64_t parent, const char *name, uint64_t *child,
+			   const struct lu_fid *fid, bool isdir);
+
+extern unsigned int osd_oi_count;
 
 /* osd_index.c */
 int osd_index_try(const struct lu_env *env, struct dt_object *dt,
@@ -565,6 +650,18 @@ int osd_delete_from_remote_parent(const struct lu_env *env,
 				  struct osd_device *osd,
 				  struct osd_object *obj,
 				  struct osd_thandle *oh, bool destroy);
+int __osd_xattr_load_by_oid(struct osd_device *osd, uint64_t oid,
+			    nvlist_t **sa);
+
+/* osd_scrub.c */
+int osd_scrub_setup(const struct lu_env *env, struct osd_device *dev);
+void osd_scrub_cleanup(const struct lu_env *env, struct osd_device *dev);
+int osd_scrub_start(const struct lu_env *env, struct osd_device *dev,
+		    __u32 flags);
+int osd_oii_insert(const struct lu_env *env, struct osd_device *dev,
+		   const struct lu_fid *fid, uint64_t oid, bool insert);
+int osd_oii_lookup(struct osd_device *dev, const struct lu_fid *fid,
+		   uint64_t *oid);
 
 /* osd_xattr.c */
 int __osd_sa_xattr_schedule_update(const struct lu_env *env,
diff --git a/lustre/osd-zfs/osd_lproc.c b/lustre/osd-zfs/osd_lproc.c
index 00bf845..39b1848 100644
--- a/lustre/osd-zfs/osd_lproc.c
+++ b/lustre/osd-zfs/osd_lproc.c
@@ -40,6 +40,7 @@
 #include <obd.h>
 #include <obd_class.h>
 #include <lprocfs_status.h>
+#include <lustre_scrub.h>
 
 #include "osd_internal.h"
 
@@ -210,6 +211,54 @@ out:
 	RETURN(result);
 }
 
+static int zfs_osd_auto_scrub_seq_show(struct seq_file *m, void *data)
+{
+	struct osd_device *dev = osd_dt_dev((struct dt_device *)m->private);
+
+	LASSERT(dev != NULL);
+	if (unlikely(!dev->od_os))
+		return -EINPROGRESS;
+
+	seq_printf(m, "%lld\n", dev->od_auto_scrub_interval);
+	return 0;
+}
+
+static ssize_t
+zfs_osd_auto_scrub_seq_write(struct file *file, const char __user *buffer,
+			     size_t count, loff_t *off)
+{
+	struct seq_file *m = file->private_data;
+	struct dt_device *dt = m->private;
+	struct osd_device *dev = osd_dt_dev(dt);
+	int rc;
+	__s64 val;
+
+	LASSERT(dev != NULL);
+	if (unlikely(!dev->od_os))
+		return -EINPROGRESS;
+
+	rc = lprocfs_str_to_s64(buffer, count, &val);
+	if (rc)
+		return rc;
+
+	dev->od_auto_scrub_interval = val;
+	return count;
+}
+LPROC_SEQ_FOPS(zfs_osd_auto_scrub);
+
+static int zfs_osd_oi_scrub_seq_show(struct seq_file *m, void *data)
+{
+	struct osd_device *dev = osd_dt_dev((struct dt_device *)m->private);
+
+	LASSERT(dev != NULL);
+	if (unlikely(!dev->od_os))
+		return -EINPROGRESS;
+
+	scrub_dump(m, &dev->od_scrub);
+	return 0;
+}
+LPROC_SEQ_FOPS_RO(zfs_osd_oi_scrub);
+
 static int zfs_osd_fstype_seq_show(struct seq_file *m, void *data)
 {
 	seq_puts(m, "zfs\n");
@@ -266,6 +315,10 @@ struct lprocfs_vars lprocfs_osd_obd_vars[] = {
 	  .fops	=	&zfs_dt_filestotal_fops		},
 	{ .name	=	"filesfree",
 	  .fops	=	&zfs_dt_filesfree_fops		},
+	{ .name	=	"auto_scrub",
+	  .fops	=	&zfs_osd_auto_scrub_fops	},
+	{ .name	=	"oi_scrub",
+	  .fops	=	&zfs_osd_oi_scrub_fops		},
 	{ .name	=	"fstype",
 	  .fops	=	&zfs_osd_fstype_fops		},
 	{ .name	=	"mntdev",
diff --git a/lustre/osd-zfs/osd_object.c b/lustre/osd-zfs/osd_object.c
index 00b3f4c..4a3ac47 100644
--- a/lustre/osd-zfs/osd_object.c
+++ b/lustre/osd-zfs/osd_object.c
@@ -309,9 +309,26 @@ struct lu_object *osd_object_alloc(const struct lu_env *env,
 	OBD_SLAB_ALLOC_PTR_GFP(mo, osd_object_kmem, GFP_NOFS);
 	if (mo != NULL) {
 		struct lu_object *l;
+		struct lu_object_header *h;
+		struct osd_device *o = osd_dev(d);
 
 		l = &mo->oo_dt.do_lu;
-		dt_object_init(&mo->oo_dt, NULL, d);
+		if (unlikely(o->od_in_init)) {
+			OBD_ALLOC_PTR(h);
+			if (!h) {
+				/* mo is a slab object, not kmalloc'ed */
+				OBD_SLAB_FREE_PTR(mo, osd_object_kmem);
+				return NULL;
+			}
+			lu_object_header_init(h);
+			lu_object_init(l, h, d);
+			lu_object_add_top(h, l);
+			mo->oo_header = h;
+		} else {
+			dt_object_init(&mo->oo_dt, NULL, d);
+			mo->oo_header = NULL;
+		}
+
 		mo->oo_dt.do_ops = &osd_obj_ops;
 		l->lo_ops = &osd_lu_obj_ops;
 		INIT_LIST_HEAD(&mo->oo_sa_linkage);
@@ -437,6 +454,7 @@ static int osd_check_lma(const struct lu_env *env, struct osd_object *obj)
 	struct lu_buf		buf;
 	int			rc;
 	struct lustre_mdt_attrs	*lma;
+	const struct lu_fid *rfid = lu_object_fid(&obj->oo_dt.do_lu);
 	ENTRY;
 
 	CLASSERT(sizeof(info->oti_buf) >= sizeof(*lma));
@@ -453,8 +471,14 @@ static int osd_check_lma(const struct lu_env *env, struct osd_object *obj)
 			CWARN("%s: unsupported incompat LMA feature(s) %#x for "
 			      "fid = "DFID"\n", osd_obj2dev(obj)->od_svname,
 			      lma->lma_incompat & ~LMA_INCOMPAT_SUPP,
-			      PFID(lu_object_fid(&obj->oo_dt.do_lu)));
+			      PFID(rfid));
 			rc = -EOPNOTSUPP;
+		} else if (unlikely(!lu_fid_eq(rfid, &lma->lma_self_fid))) {
+			CERROR("%s: FID-in-LMA "DFID" does not match the "
+			      "object self-fid "DFID"\n",
+			      osd_obj2dev(obj)->od_svname,
+			      PFID(&lma->lma_self_fid), PFID(rfid));
+			rc = -EREMCHG;
 		} else {
 			struct osd_device *osd = osd_obj2dev(obj);
 
@@ -512,8 +536,15 @@ static int osd_object_init(const struct lu_env *env, struct lu_object *l,
 	struct osd_object *obj = osd_obj(l);
 	struct osd_device *osd = osd_obj2dev(obj);
 	const struct lu_fid *fid = lu_object_fid(l);
+	struct lustre_scrub *scrub = &osd->od_scrub;
+	struct osd_thread_info *info = osd_oti_get(env);
+	struct luz_direntry *zde = &info->oti_zde;
+	struct osd_idmap_cache *idc;
+	char *name = info->oti_str;
 	uint64_t oid;
 	int rc = 0;
+	int rc1;
+	bool remote = false;
 	ENTRY;
 
 	LASSERT(osd_invariant(obj));
@@ -521,10 +552,11 @@ static int osd_object_init(const struct lu_env *env, struct lu_object *l,
 	if (fid_is_otable_it(&l->lo_header->loh_fid)) {
 		obj->oo_dt.do_ops = &osd_obj_otable_it_ops;
 		l->lo_header->loh_attr |= LOHA_EXISTS;
-		RETURN(0);
+
+		GOTO(out, rc = 0);
 	}
 
-	if (conf != NULL && conf->loc_flags & LOC_F_NEW)
+	if (conf && conf->loc_flags & LOC_F_NEW)
 		GOTO(out, rc = 0);
 
 	if (unlikely(fid_is_acct(fid))) {
@@ -537,31 +569,117 @@ static int osd_object_init(const struct lu_env *env, struct lu_object *l,
 		GOTO(out, rc = 0);
 	}
 
-	rc = osd_fid_lookup(env, osd, fid, &oid);
-	if (rc == 0) {
-		LASSERT(obj->oo_dn == NULL);
-		rc = __osd_obj2dnode(osd->od_os, oid, &obj->oo_dn);
-		/* EEXIST will be returned if object is being deleted in ZFS */
-		if (rc == -EEXIST) {
-			rc = 0;
-			GOTO(out, rc);
+	idc = osd_idc_find(env, osd, fid);
+	if (idc && !idc->oic_remote && idc->oic_dnode != ZFS_NO_OBJECT) {
+		oid = idc->oic_dnode;
+		goto zget;
+	}
+
+	rc = -ENOENT;
+	if (!list_empty(&osd->od_scrub.os_inconsistent_items))
+		rc = osd_oii_lookup(osd, fid, &oid);
+
+	if (rc)
+		rc = osd_fid_lookup(env, osd, fid, &oid);
+
+	if (rc == -ENOENT) {
+		if (likely(!(fid_is_norm(fid) || fid_is_igif(fid)) ||
+			   fid_is_on_ost(env, osd, fid) ||
+			   !zfs_test_bit(osd_oi_fid2idx(osd, fid),
+					 scrub->os_file.sf_oi_bitmap)))
+			GOTO(out, rc = 0);
+
+		rc = -EREMCHG;
+		goto trigger;
+	}
+
+	if (rc)
+		GOTO(out, rc);
+
+zget:
+	LASSERT(obj->oo_dn == NULL);
+
+	rc = __osd_obj2dnode(osd->od_os, oid, &obj->oo_dn);
+	/* EEXIST will be returned if object is being deleted in ZFS */
+	if (rc == -EEXIST)
+		GOTO(out, rc = 0);
+
+	if (rc) {
+		CERROR("%s: lookup "DFID"/%#llx failed: rc = %d\n",
+		       osd->od_svname, PFID(lu_object_fid(l)), oid, rc);
+		GOTO(out, rc);
+	}
+
+	rc = osd_object_init0(env, obj);
+	if (rc)
+		GOTO(out, rc);
+
+	if (unlikely(obj->oo_header))
+		GOTO(out, rc = 0);
+
+	rc = osd_check_lma(env, obj);
+	if ((!rc && !remote) || (rc != -EREMCHG))
+		GOTO(out, rc);
+
+trigger:
+	/* We still have a chance to get the valid dnode: an object that is
+	 * referenced by a remote name entry is linked on the local MDT under
+	 * the dir /REMOTE_PARENT_DIR with its FID string as the name.
+	 *
+	 * During the OI scrub, if we cannot find the OI mapping, we may still
+	 * have a chance to map the FID to the local OID by looking it up in
+	 * the dir /REMOTE_PARENT_DIR. */
+	if (!remote && !fid_is_on_ost(env, osd, fid)) {
+		osd_fid2str(name, fid, sizeof(info->oti_str));
+		rc = osd_zap_lookup(osd, osd->od_remote_parent_dir,
+				    NULL, name, 8, 3, (void *)zde);
+		if (!rc) {
+			oid = zde->lzd_reg.zde_dnode;
+			osd_dnode_rele(obj->oo_dn);
+			obj->oo_dn = NULL;
+			remote = true;
+			goto zget;
 		}
-		if (rc != 0) {
-			CERROR("%s: lookup "DFID"/%#llx failed: rc = %d\n",
-			       osd->od_svname, PFID(lu_object_fid(l)), oid, rc);
-			GOTO(out, rc);
+	}
+
+	/* The case someone triggered the OI scrub already. */
+	if (thread_is_running(&scrub->os_thread)) {
+		if (!rc) {
+			LASSERT(remote);
+
+			lu_object_set_agent_entry(l);
+			osd_oii_insert(env, osd, fid, oid, false);
+		} else {
+			rc = -EINPROGRESS;
 		}
-		rc = osd_object_init0(env, obj);
-		if (rc != 0)
-			GOTO(out, rc);
 
-		rc = osd_check_lma(env, obj);
-		if (rc != 0)
-			GOTO(out, rc);
-	} else if (rc == -ENOENT) {
-		rc = 0;
+		GOTO(out, rc);
 	}
-	LASSERT(osd_invariant(obj));
+
+	/* The case NOT allow to trigger OI scrub automatically. */
+	if (osd->od_auto_scrub_interval == AS_NEVER)
+		GOTO(out, rc);
+
+	/* It is me to trigger the OI scrub. */
+	rc1 = osd_scrub_start(env, osd, SS_CLEAR_DRYRUN |
+			      SS_CLEAR_FAILOUT | SS_AUTO_FULL);
+	LCONSOLE_WARN("%s: trigger OI scrub by RPC for the "DFID": rc = %d\n",
+		      osd_name(osd), PFID(fid), rc1);
+	if (!rc) {
+		LASSERT(remote);
+
+		lu_object_set_agent_entry(l);
+		if (!rc1)
+			osd_oii_insert(env, osd, fid, oid, false);
+	} else {
+		if (!rc1)
+			rc = -EINPROGRESS;
+		else
+			rc = -EREMCHG;
+	}
+
+	GOTO(out, rc);
+
 out:
 	RETURN(rc);
 }
@@ -573,11 +691,16 @@ out:
 static void osd_object_free(const struct lu_env *env, struct lu_object *l)
 {
 	struct osd_object *obj = osd_obj(l);
+	struct lu_object_header *h = obj->oo_header;
 
 	LASSERT(osd_invariant(obj));
 
 	dt_object_fini(&obj->oo_dt);
 	OBD_SLAB_FREE_PTR(obj, osd_object_kmem);
+	if (unlikely(h)) { /* h was read before obj was freed above */
+		lu_object_header_fini(h);
+		OBD_FREE_PTR(h);
+	}
 }
 
 static int
@@ -707,13 +830,6 @@ static int osd_destroy(const struct lu_env *env, struct dt_object *dt,
 	/* remove obj ref from index dir (it depends) */
 	zapid = osd_get_name_n_idx(env, osd, fid, buf,
 				   sizeof(info->oti_str), &zdn);
-	rc = osd_zap_remove(osd, zapid, zdn, buf, oh->ot_tx);
-	if (rc) {
-		CERROR("%s: zap_remove(%s) failed: rc = %d\n",
-		       osd->od_svname, buf, rc);
-		GOTO(out, rc);
-	}
-
 	rc = osd_xattrs_destroy(env, obj, oh);
 	if (rc) {
 		CERROR("%s: cannot destroy xattrs for %s: rc = %d\n",
@@ -758,6 +874,17 @@ static int osd_destroy(const struct lu_env *env, struct dt_object *dt,
 			       osd->od_svname, buf, oid, rc);
 	}
 
+	/* Remove the OI mapping after the destroy to handle the race with
+	 * OI scrub that may insert missed OI mapping during the interval. */
+	rc = osd_zap_remove(osd, zapid, zdn, buf, oh->ot_tx);
+	if (unlikely(rc == -ENOENT))
+		rc = 0;
+	if (rc)
+		CERROR("%s: zap_remove(%s) failed: rc = %d\n",
+		       osd->od_svname, buf, rc);
+
+	GOTO(out, rc);
+
 out:
 	/* not needed in the cache anymore */
 	set_bit(LU_OBJECT_HEARD_BANSHEE, &dt->do_lu.lo_header->loh_flags);
@@ -1116,6 +1243,26 @@ static int osd_attr_set(const struct lu_env *env, struct dt_object *dt,
 	   transaction group. */
 	LASSERT(oh->ot_tx->tx_txg != 0);
 
+	if (OBD_FAIL_CHECK(OBD_FAIL_OSD_FID_MAPPING) && !osd->od_is_ost) {
+		struct zpl_direntry *zde = &info->oti_zde.lzd_reg;
+		char *buf = info->oti_str;
+		dnode_t *zdn = NULL;
+		uint64_t zapid;
+
+		zapid = osd_get_name_n_idx(env, osd, lu_object_fid(&dt->do_lu),
+					   buf, sizeof(info->oti_str), &zdn);
+		rc = osd_zap_lookup(osd, zapid, zdn, buf, 8,
+				    sizeof(*zde) / 8, zde);
+		if (!rc) {
+			zde->zde_dnode -= 1;
+			rc = -zap_update(osd->od_os, zapid, buf, 8,
+					 sizeof(*zde) / 8, zde, oh->ot_tx);
+		}
+		up_read(&obj->oo_guard);
+
+		RETURN(rc > 0 ? 0 : rc);
+	}
+
 	/* Only allow set size for regular file */
 	if (!S_ISREG(dt->do_lu.lo_header->loh_attr))
 		valid &= ~(LA_SIZE | LA_BLOCKS);
@@ -1451,8 +1598,8 @@ int __osd_attr_init(const struct lu_env *env, struct osd_device *osd,
 	return rc;
 }
 
-static int osd_find_new_dnode(const struct lu_env *env, dmu_tx_t *tx,
-			      uint64_t oid, dnode_t **dnp)
+int osd_find_new_dnode(const struct lu_env *env, dmu_tx_t *tx,
+		       uint64_t oid, dnode_t **dnp)
 {
 	dmu_tx_hold_t *txh;
 	int rc = 0;
@@ -1737,6 +1884,7 @@ static int osd_create(const struct lu_env *env, struct dt_object *dt,
 	dnode_t *dn = NULL, *zdn = NULL;
 	uint64_t		 zapid, parent = 0;
 	int			 rc;
+	__u32 compat = 0;
 
 	ENTRY;
 
@@ -1789,9 +1937,20 @@ static int osd_create(const struct lu_env *env, struct dt_object *dt,
 
 	zapid = osd_get_name_n_idx(env, osd, fid, buf,
 				   sizeof(info->oti_str), &zdn);
-	rc = osd_zap_add(osd, zapid, zdn, buf, 8, 1, zde, oh->ot_tx);
-	if (rc)
-		GOTO(out, rc);
+	if (!CFS_FAIL_CHECK(OBD_FAIL_OSD_NO_OI_ENTRY)) {
+		if (osd->od_is_ost &&
+		    OBD_FAIL_CHECK(OBD_FAIL_OSD_COMPAT_INVALID_ENTRY))
+			zde->zde_dnode++;
+
+		if (!osd->od_is_ost ||
+		    !OBD_FAIL_CHECK(OBD_FAIL_OSD_COMPAT_NO_ENTRY)) {
+			rc = osd_zap_add(osd, zapid, zdn, buf, 8, 1,
+					 zde, oh->ot_tx);
+			if (rc)
+				GOTO(out, rc);
+		}
+	}
+
 	obj->oo_dn = dn;
 	/* Now add in all of the "SA" attributes */
 	rc = osd_sa_handle_get(obj);
@@ -1803,7 +1962,9 @@ static int osd_create(const struct lu_env *env, struct dt_object *dt,
 		GOTO(out, rc);
 
 	/* initialize LMA */
-	lustre_lma_init(lma, fid, 0, 0);
+	if (fid_is_idif(fid) || (fid_is_norm(fid) && osd->od_is_ost))
+		compat |= LMAC_FID_ON_OST;
+	lustre_lma_init(lma, fid, compat, 0);
 	lustre_lma_swab(lma);
 	rc = -nvlist_add_byte_array(obj->oo_sa_xattr, XATTR_NAME_LMA,
 				    (uchar_t *)lma, sizeof(*lma));
diff --git a/lustre/osd-zfs/osd_oi.c b/lustre/osd-zfs/osd_oi.c
index b00c760..c8d1fcc 100644
--- a/lustre/osd-zfs/osd_oi.c
+++ b/lustre/osd-zfs/osd_oi.c
@@ -140,75 +140,91 @@ osd_oi_lookup(const struct lu_env *env, struct osd_device *o,
 	return 0;
 }
 
-/**
- * Create a new OI with the given name.
- */
-static int
-osd_oi_create(const struct lu_env *env, struct osd_device *o,
-	      uint64_t parent, const char *name, uint64_t *child)
+static int osd_obj_create(const struct lu_env *env, struct osd_device *o,
+			  uint64_t parent, const char *name, uint64_t *child,
+			  const struct lu_fid *fid, bool isdir)
 {
-	struct zpl_direntry	*zde = &osd_oti_get(env)->oti_zde.lzd_reg;
-	struct lu_attr		*la = &osd_oti_get(env)->oti_la;
-	sa_handle_t		*sa_hdl = NULL;
-	dmu_tx_t		*tx;
-	uint64_t		 oid;
-	int			 rc;
-
-	/* verify it doesn't already exist */
-	rc = -zap_lookup(o->od_os, parent, name, 8, 1, (void *)zde);
-	if (rc == 0)
-		return -EEXIST;
+	struct osd_thread_info *info = osd_oti_get(env);
+	struct zpl_direntry *zde = &info->oti_zde.lzd_reg;
+	struct lustre_mdt_attrs *lma = &info->oti_mdt_attrs;
+	struct lu_attr *la = &info->oti_la;
+	sa_handle_t *sa_hdl = NULL;
+	nvlist_t *nvbuf = NULL;
+	dmu_tx_t *tx;
+	uint64_t oid;
+	__u32 compat = LMAC_NOT_IN_OI;
+	int rc;
+	ENTRY;
 
 	if (o->od_dt_dev.dd_rdonly)
-		return -EROFS;
+		RETURN(-EROFS);
+
+	memset(la, 0, sizeof(*la));
+	la->la_valid = LA_MODE | LA_UID | LA_GID;
+	la->la_mode = S_IRUGO | S_IWUSR | (isdir ? S_IXUGO | S_IFDIR : S_IFREG);
+
+	if (fid) {
+		rc = -nvlist_alloc(&nvbuf, NV_UNIQUE_NAME, KM_SLEEP);
+		if (rc)
+			RETURN(rc);
+
+		if (o->od_is_ost)
+			compat |= LMAC_FID_ON_OST;
+		lustre_lma_init(lma, fid, compat, 0);
+		lustre_lma_swab(lma);
+		rc = -nvlist_add_byte_array(nvbuf, XATTR_NAME_LMA,
+					    (uchar_t *)lma, sizeof(*lma));
+		if (rc)
+			GOTO(out, rc);
+	}
 
 	/* create fid-to-dnode index */
 	tx = dmu_tx_create(o->od_os);
-	if (tx == NULL)
-		return -ENOMEM;
+	if (!tx)
+		GOTO(out, rc = -ENOMEM);
 
-	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, 1, NULL);
+	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
 	dmu_tx_hold_bonus(tx, parent);
 	dmu_tx_hold_zap(tx, parent, TRUE, name);
 	dmu_tx_hold_sa_create(tx, ZFS_SA_BASE_ATTR_SIZE);
-
 	rc = -dmu_tx_assign(tx, TXG_WAIT);
 	if (rc) {
 		dmu_tx_abort(tx);
-		return rc;
+		GOTO(out, rc);
 	}
 
-	oid = osd_zap_create_flags(o->od_os, 0, ZAP_FLAG_HASH64,
-				   DMU_OT_DIRECTORY_CONTENTS,
-				   14, /* == ZFS fzap_default_block_shift */
-				   DN_MAX_INDBLKSHIFT,
-				   0, tx);
-
+	if (isdir)
+		oid = osd_zap_create_flags(o->od_os, 0, ZAP_FLAG_HASH64,
+					   DMU_OT_DIRECTORY_CONTENTS,
+					   14, DN_MAX_INDBLKSHIFT, 0, tx);
+	else
+		oid = osd_dmu_object_alloc(o->od_os, DMU_OTN_UINT8_METADATA,
+					   0, 0, tx);
 	rc = -sa_handle_get(o->od_os, oid, NULL, SA_HDL_PRIVATE, &sa_hdl);
 	if (rc)
-		goto commit;
-	memset(la, 0, sizeof(*la));
-	la->la_valid = LA_MODE | LA_UID | LA_GID;
-	la->la_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
-	rc = __osd_attr_init(env, o, NULL, sa_hdl, tx, la, parent, NULL);
+		GOTO(commit, rc);
+
+	rc = __osd_attr_init(env, o, NULL, sa_hdl, tx, la, parent, nvbuf);
 	sa_handle_destroy(sa_hdl);
 	if (rc)
-		goto commit;
+		GOTO(commit, rc);
 
 	zde->zde_dnode = oid;
 	zde->zde_pad = 0;
-	zde->zde_type = IFTODT(S_IFDIR);
-
+	zde->zde_type = IFTODT(isdir ? S_IFDIR : S_IFREG);
 	rc = -zap_add(o->od_os, parent, name, 8, 1, (void *)zde, tx);
 
+	GOTO(commit, rc);
+
 commit:
 	if (rc)
 		dmu_object_free(o->od_os, oid, tx);
-	dmu_tx_commit(tx);
-
-	if (rc == 0)
+	else
 		*child = oid;
-
+	dmu_tx_commit(tx);
+out:
+	if (nvbuf)
+		nvlist_free(nvbuf);
 	return rc;
 }
 
@@ -223,7 +239,23 @@ osd_oi_find_or_create(const struct lu_env *env, struct osd_device *o,
 	if (rc == 0)
 		*child = oi.oi_zapid;
 	else if (rc == -ENOENT)
-		rc = osd_oi_create(env, o, parent, name, child);
+		rc = osd_obj_create(env, o, parent, name, child, NULL, true);
+
+	return rc;
+}
+
+int osd_obj_find_or_create(const struct lu_env *env, struct osd_device *o,
+			   uint64_t parent, const char *name, uint64_t *child,
+			   const struct lu_fid *fid, bool isdir)
+{
+	struct osd_oi found;
+	int rc = osd_oi_lookup(env, o, parent, name, &found);
+
+	/* Create only when the name is absent; pass other errors up. */
+	if (rc == -ENOENT)
+		return osd_obj_create(env, o, parent, name, child, fid, isdir);
+
+	if (rc == 0)
+		*child = found.oi_zapid;
+	return rc;
+}
@@ -252,7 +284,11 @@ int osd_fld_lookup(const struct lu_env *env, struct osd_device *osd,
 		return 0;
 	}
 
-	LASSERT(ss != NULL);
+	/* The seq_server_site may be NOT ready during initial OI scrub */
+	if (unlikely(!ss || !ss->ss_server_fld ||
+		     !ss->ss_server_fld->lsf_cache))
+		return -ENOENT;
+
 	fld_range_set_any(range);
 	/* OSD will only do local fld lookup */
 	return fld_local_lookup(env, ss->ss_server_fld, seq, range);
@@ -269,7 +305,8 @@ int fid_is_on_ost(const struct lu_env *env, struct osd_device *osd,
 		RETURN(1);
 
 	if (unlikely(fid_is_local_file(fid) || fid_is_llog(fid)) ||
-		     fid_is_name_llog(fid) || fid_is_quota(fid))
+		     fid_is_name_llog(fid) || fid_is_quota(fid) ||
+		     fid_is_igif(fid))
 		RETURN(0);
 
 	rc = osd_fld_lookup(env, osd, fid_seq(fid), range);
@@ -479,8 +516,7 @@ osd_get_idx_for_fid(struct osd_device *osd, const struct lu_fid *fid,
 {
 	struct osd_oi *oi;
 
-	LASSERT(osd->od_oi_table != NULL);
-	oi = osd->od_oi_table[fid_seq(fid) & (osd->od_oi_count - 1)];
+	oi = osd_fid2oi(osd, fid);
 	if (buf)
 		osd_fid2str(buf, fid, bufsize);
 	if (zdn)
@@ -698,13 +734,15 @@ osd_oi_open_table(const struct lu_env *env, struct osd_device *o, int count)
 /**
  * Determine if the type and number of OIs used by this file system.
  */
-static int
-osd_oi_probe(const struct lu_env *env, struct osd_device *o, int *count)
+static int osd_oi_probe(const struct lu_env *env, struct osd_device *o)
 {
-	uint64_t	root_oid = o->od_root;
-	struct osd_oi	oi;
-	char		name[16];
-	int		rc;
+	struct lustre_scrub *scrub = &o->od_scrub;
+	struct scrub_file *sf = &scrub->os_file;
+	struct osd_oi oi;
+	char name[16];
+	int max = sf->sf_oi_count > 0 ? sf->sf_oi_count : OSD_OI_FID_NR_MAX;
+	int count;
+	int rc;
 	ENTRY;
 
 	/*
@@ -713,31 +751,25 @@ osd_oi_probe(const struct lu_env *env, struct osd_device *o, int *count)
 	 * The only safeguard is that we know the number of OIs must be a
 	 * power of two and this is checked for basic sanity.
 	 */
-	for (*count = 0; *count < OSD_OI_FID_NR_MAX; (*count)++) {
-		sprintf(name, "%s.%d", DMU_OSD_OI_NAME_BASE, *count);
-		rc = osd_oi_lookup(env, o, root_oid, name, &oi);
-		if (rc == 0)
+	for (count = 0; count < max; count++) {
+		snprintf(name, 15, "%s.%d", DMU_OSD_OI_NAME_BASE, count);
+		rc = osd_oi_lookup(env, o, o->od_root, name, &oi);
+		if (!rc)
 			continue;
 
 		if (rc == -ENOENT) {
-			if (*count == 0)
-				break;
-
-			if ((*count & (*count - 1)) != 0)
-				RETURN(-EDOM);
+			if (sf->sf_oi_count == 0)
+				RETURN(count);
 
-			RETURN(0);
+			zfs_set_bit(count, sf->sf_oi_bitmap);
+			continue;
 		}
 
-		RETURN(rc);
+		if (rc)
+			RETURN(rc);
 	}
 
-	/*
-	 * No OIs exist, this must be a new filesystem.
-	 */
-	*count = 0;
-
-	RETURN(0);
+	RETURN(count);
 }
 
 static void osd_ost_seq_fini(const struct lu_env *env, struct osd_device *osd)
@@ -802,47 +834,97 @@ osd_oi_init_remote_parent(const struct lu_env *env, struct osd_device *o)
  */
 int osd_oi_init(const struct lu_env *env, struct osd_device *o)
 {
-	char	*key = osd_oti_get(env)->oti_buf;
-	int	 i, rc, count = 0;
+	struct lustre_scrub *scrub = &o->od_scrub;
+	struct scrub_file *sf = &scrub->os_file;
+	char *key = osd_oti_get(env)->oti_buf;
+	uint64_t sdb;
+	int i, rc, count;
 	ENTRY;
 
+	LASSERTF((sf->sf_oi_count & (sf->sf_oi_count - 1)) == 0,
+		 "Invalid OI count in scrub file %d\n", sf->sf_oi_count);
+
 	osd_oi_init_remote_parent(env, o);
 
-	rc = osd_oi_probe(env, o, &count);
+	rc = osd_oi_init_compat(env, o);
 	if (rc)
 		RETURN(rc);
 
-	if (count == 0) {
-		uint64_t odb, sdb;
+	count = osd_oi_probe(env, o);
+	if (count < 0)
+		GOTO(out, rc = count);
 
-		count = osd_oi_count;
-		odb = o->od_root;
+	if (count > 0) {
+		if (count == sf->sf_oi_count)
+			goto open;
 
-		for (i = 0; i < count; i++) {
-			sprintf(key, "%s.%d", DMU_OSD_OI_NAME_BASE, i);
-			rc = osd_oi_find_or_create(env, o, odb, key, &sdb);
-			if (rc)
-				RETURN(rc);
+		if (sf->sf_oi_count == 0) {
+			if (likely((count & (count - 1)) == 0)) {
+				sf->sf_oi_count = count;
+				rc = scrub_file_store(env, scrub);
+				if (rc)
+					GOTO(out, rc);
+
+				goto open;
+			}
+
+			LCONSOLE_ERROR("%s: invalid oi count %d. You can "
+				       "remove all OIs, then remount it\n",
+				       osd_name(o), count);
+			GOTO(out, rc = -EDOM);
+		}
+
+		scrub_file_reset(scrub, o->od_uuid, SF_RECREATED);
+		count = sf->sf_oi_count;
+	} else {
+		if (sf->sf_oi_count > 0) {
+			count = sf->sf_oi_count;
+			memset(sf->sf_oi_bitmap, 0, SCRUB_OI_BITMAP_SIZE);
+			for (i = 0; i < count; i++)
+				zfs_set_bit(i, sf->sf_oi_bitmap);
+			scrub_file_reset(scrub, o->od_uuid, SF_RECREATED);
+		} else {
+			count = sf->sf_oi_count = osd_oi_count;
 		}
 	}
 
-	rc = osd_oi_init_compat(env, o);
+	rc = scrub_file_store(env, scrub);
 	if (rc)
-		RETURN(rc);
+		GOTO(out, rc);
 
+	for (i = 0; i < count; i++) {
+		LASSERT(sizeof(osd_oti_get(env)->oti_buf) >= 32);
+
+		snprintf(key, sizeof(osd_oti_get(env)->oti_buf) - 1,
+			 "%s.%d", DMU_OSD_OI_NAME_BASE, i);
+		rc = osd_oi_find_or_create(env, o, o->od_root, key, &sdb);
+		if (rc)
+			GOTO(out, rc);
+	}
+
+open:
 	LASSERT((count & (count - 1)) == 0);
 	o->od_oi_count = count;
 	OBD_ALLOC(o->od_oi_table, sizeof(struct osd_oi *) * count);
 	if (o->od_oi_table == NULL)
-		RETURN(-ENOMEM);
+		GOTO(out, rc = -ENOMEM);
 
 	rc = osd_oi_open_table(env, o, count);
+
+	GOTO(out, rc);
+
+out:
 	if (rc) {
-		OBD_FREE(o->od_oi_table, sizeof(struct osd_oi *) * count);
-		o->od_oi_table = NULL;
+		osd_ost_seq_fini(env, o);
+
+		if (o->od_oi_table) {
+			OBD_FREE(o->od_oi_table,
+				 sizeof(struct osd_oi *) * count);
+			o->od_oi_table = NULL;
+		}
 	}
 
-	RETURN(rc);
+	return rc;
 }
 
 void osd_oi_fini(const struct lu_env *env, struct osd_device *o)
@@ -1016,3 +1098,23 @@ int osd_idc_find_and_init(const struct lu_env *env, struct osd_device *osd,
 
 	return 0;
 }
+
+int osd_idc_find_and_init_with_oid(const struct lu_env *env,
+				   struct osd_device *osd,
+				   const struct lu_fid *fid,
+				   uint64_t oid)
+{
+	struct osd_idmap_cache *entry = osd_idc_find(env, osd, fid);
+
+	if (entry == NULL) {
+		/* Not cached yet: add an entry for this FID. */
+		entry = osd_idc_add(env, osd, fid);
+		if (IS_ERR(entry))
+			return PTR_ERR(entry);
+	}
+
+	entry->oic_remote = 0;
+	entry->oic_dnode = oid;
+
+	return 0;
+}
diff --git a/lustre/osd-zfs/osd_scrub.c b/lustre/osd-zfs/osd_scrub.c
new file mode 100644
index 0000000..b2cda0a
--- /dev/null
+++ b/lustre/osd-zfs/osd_scrub.c
@@ -0,0 +1,1743 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License version 2 for more details.  A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2017, Intel Corporation.
+ */
+/*
+ * lustre/osd-zfs/osd_scrub.c
+ *
+ * Top-level entry points into osd module
+ *
+ * The OI scrub is used for rebuilding Object Index files when restoring an MDT
+ * from a file-level backup.
+ *
+ * The otable-based iterator scans ZFS objects to feed the upper-layer LFSCK.
+ *
+ * Author: Fan Yong <fan.yong@intel.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LFSCK
+
+#include <linux/kthread.h>
+#include <uapi/linux/lustre/lustre_idl.h>
+#include <lustre_disk.h>
+#include <dt_object.h>
+#include <linux/xattr.h>
+#include <lustre_scrub.h>
+#include <obd_class.h>
+#include <lustre_nodemap.h>
+#include <sys/dsl_dataset.h>
+
+#include "osd_internal.h"
+
+#define OSD_OTABLE_MAX_HASH		((1ULL << 48) - 1)
+#define OTABLE_PREFETCH			256
+
+#define DTO_INDEX_INSERT		1
+#define DTO_INDEX_DELETE		2
+#define DTO_INDEX_UPDATE		3
+
+static inline bool osd_scrub_has_window(struct osd_otable_it *it)
+{
+	return OTABLE_PREFETCH > it->ooi_prefetched; /* below prefetch cap */
+}
+
+/**
+ * Update, insert or delete the OI mapping @fid => @oid according to @ops.
+ * With SP_DRYRUN set and @force false, nothing is modified and 0 is returned.
+ * \retval   1, changed nothing
+ * \retval   0, changed successfully
+ * \retval -ve, on error
+ */
+static int osd_scrub_refresh_mapping(const struct lu_env *env,
+				     struct osd_device *dev,
+				     const struct lu_fid *fid,
+				     uint64_t oid, int ops,
+				     bool force, const char *name)
+{
+	struct osd_thread_info *info = osd_oti_get(env);
+	struct zpl_direntry *zde = &info->oti_zde.lzd_reg;
+	char *buf = info->oti_str;
+	dmu_tx_t *tx = NULL;
+	dnode_t *dn = NULL;
+	uint64_t zapid;
+	int rc;
+	ENTRY;
+
+	if (dev->od_scrub.os_file.sf_param & SP_DRYRUN && !force)
+		GOTO(log, rc = 0);
+
+	tx = dmu_tx_create(dev->od_os);
+	if (!tx)
+		GOTO(log, rc = -ENOMEM);
+
+	zapid = osd_get_name_n_idx(env, dev, fid, buf,
+				   sizeof(info->oti_str), &dn);
+	osd_tx_hold_zap(tx, zapid, dn,
+			ops == DTO_INDEX_INSERT ? TRUE : FALSE, NULL);
+	rc = -dmu_tx_assign(tx, TXG_WAIT);
+	if (rc) {
+		dmu_tx_abort(tx); /* an unassigned tx is aborted, not committed */
+		GOTO(log, rc);
+	}
+
+	switch (ops) {
+	case DTO_INDEX_UPDATE:
+		zde->zde_pad = 0;
+		zde->zde_dnode = oid;
+		zde->zde_type = 0; /* The type in OI mapping is useless. */
+		rc = -zap_update(dev->od_os, zapid, buf, 8, sizeof(*zde) / 8,
+				 zde, tx);
+		if (unlikely(rc == -ENOENT)) {
+			/* An unlink thread may have removed the OI mapping. */
+			rc = 1;
+		}
+		break;
+	case DTO_INDEX_INSERT:
+		zde->zde_pad = 0;
+		zde->zde_dnode = oid;
+		zde->zde_type = 0; /* The type in OI mapping is useless. */
+		rc = osd_zap_add(dev, zapid, dn, buf, 8, sizeof(*zde) / 8,
+				 zde, tx);
+		if (unlikely(rc == -EEXIST)) /* mapping already present */
+			rc = 1;
+		break;
+	case DTO_INDEX_DELETE:
+		rc = osd_zap_remove(dev, zapid, dn, buf, tx);
+		if (rc == -ENOENT) {
+			/* It is normal that the unlink thread has removed the
+			 * OI mapping already. */
+			rc = 1;
+		}
+		break;
+	default:
+		LASSERTF(0, "Unexpected ops %d\n", ops);
+		rc = -EINVAL;
+		break;
+	}
+
+	dmu_tx_commit(tx); /* reached for every switch outcome */
+	GOTO(log, rc);
+
+log:
+	CDEBUG(D_LFSCK, "%s: refresh OI map for scrub, op %d, force %s, "
+	       DFID" => %llu (%s): rc = %d\n", osd_name(dev), ops,
+	       force ? "yes" : "no", PFID(fid), oid, name ? name : "null", rc);
+
+	return rc;
+}
+
+static int
+osd_scrub_check_update(const struct lu_env *env, struct osd_device *dev,
+		       const struct lu_fid *fid, uint64_t oid, int val)
+{
+	struct lustre_scrub *scrub = &dev->od_scrub;
+	struct scrub_file *sf = &scrub->os_file;
+	struct osd_inconsistent_item *oii = NULL;
+	nvlist_t *nvbuf = NULL;
+	dnode_t *dn = NULL;
+	uint64_t oid2;
+	int ops = DTO_INDEX_UPDATE;
+	int rc;
+	ENTRY;
+
+	down_write(&scrub->os_rwsem);
+	scrub->os_new_checked++; /* counted even when val < 0 */
+	if (val < 0)
+		GOTO(out, rc = val); /* recorded as a failed item at out: */
+
+	if (scrub->os_in_prior) /* use the first queued inconsistent item */
+		oii = list_entry(scrub->os_inconsistent_items.next,
+				 struct osd_inconsistent_item, oii_list);
+
+	if (oid < sf->sf_pos_latest_start && !oii)
+		GOTO(out, rc = 0);
+
+	if (oii && oii->oii_insert) {
+		ops = DTO_INDEX_INSERT;
+		goto zget;
+	}
+
+	rc = osd_fid_lookup(env, dev, fid, &oid2);
+	if (rc) {
+		if (rc != -ENOENT)
+			GOTO(out, rc);
+
+		ops = DTO_INDEX_INSERT;
+
+zget:
+		rc = __osd_obj2dnode(dev->od_os, oid, &dn);
+		if (rc) {
+			/* Someone removed the object by race. */
+			if (rc == -ENOENT || rc == -EEXIST)
+				rc = 0;
+			GOTO(out, rc);
+		}
+
+		scrub->os_full_speed = 1;
+		sf->sf_flags |= SF_INCONSISTENT;
+	} else if (oid == oid2) { /* OI mapping already matches */
+		GOTO(out, rc = 0);
+	} else { /* the OI maps @fid to a different object, oid2 */
+		struct lustre_mdt_attrs *lma = NULL;
+		int size;
+
+		rc = __osd_xattr_load_by_oid(dev, oid2, &nvbuf);
+		if (rc == -ENOENT || rc == -EEXIST || rc == -ENODATA)
+			goto update;
+		if (rc)
+			GOTO(out, rc);
+
+		rc = -nvlist_lookup_byte_array(nvbuf, XATTR_NAME_LMA,
+					       (uchar_t **)&lma, &size);
+		if (rc == -ENOENT || rc == -EEXIST || rc == -ENODATA)
+			goto update;
+		if (rc)
+			GOTO(out, rc);
+
+		lustre_lma_swab(lma);
+		if (unlikely(lu_fid_eq(&lma->lma_self_fid, fid))) {
+			CDEBUG(D_LFSCK, "%s: the FID "DFID" is used by "
+			       "two objects: %llu and %llu (in OI)\n",
+			       osd_name(dev), PFID(fid), oid, oid2);
+
+			GOTO(out, rc = -EEXIST);
+		}
+
+update:
+		scrub->os_full_speed = 1;
+		sf->sf_flags |= SF_INCONSISTENT;
+	}
+
+	rc = osd_scrub_refresh_mapping(env, dev, fid, oid, ops, false, NULL);
+	if (!rc) { /* 0 means the mapping was really changed */
+		if (scrub->os_in_prior)
+			sf->sf_items_updated_prior++;
+		else
+			sf->sf_items_updated++;
+	}
+
+	GOTO(out, rc);
+
+out:
+	if (nvbuf)
+		nvlist_free(nvbuf);
+
+	if (rc < 0) {
+		sf->sf_items_failed++;
+		if (sf->sf_pos_first_inconsistent == 0 ||
+		    sf->sf_pos_first_inconsistent > oid)
+			sf->sf_pos_first_inconsistent = oid;
+	} else {
+		rc = 0; /* positive "changed nothing" results become 0 */
+	}
+
+	/* There may be a conflicting unlink during the OI scrub;
+	 * if that happened, remove the newly added OI mapping. */
+	if (ops == DTO_INDEX_INSERT && dn && dn->dn_free_txg)
+		osd_scrub_refresh_mapping(env, dev, fid, oid,
+					  DTO_INDEX_DELETE, false, NULL);
+	up_write(&scrub->os_rwsem);
+
+	if (dn)
+		osd_dnode_rele(dn);
+
+	if (oii) { /* the prior item is unlinked and freed here */
+		spin_lock(&scrub->os_lock);
+		if (likely(!list_empty(&oii->oii_list)))
+			list_del(&oii->oii_list);
+		spin_unlock(&scrub->os_lock);
+		OBD_FREE_PTR(oii);
+	}
+	/* Errors are returned to the caller only when SP_FAILOUT is set. */
+	RETURN(sf->sf_param & SP_FAILOUT ? rc : 0);
+}
+
+static int osd_scrub_prep(const struct lu_env *env, struct osd_device *dev)
+{
+	struct lustre_scrub *scrub = &dev->od_scrub;
+	struct ptlrpc_thread *thread = &scrub->os_thread;
+	struct scrub_file *sf = &scrub->os_file;
+	__u32 flags = scrub->os_start_flags;
+	int rc;
+	bool drop_dryrun = false;
+	ENTRY;
+
+	CDEBUG(D_LFSCK, "%s: OI scrub prep, flags = 0x%x\n",
+	       scrub->os_name, flags);
+
+	down_write(&scrub->os_rwsem);
+	if (flags & SS_SET_FAILOUT) /* SET takes precedence over CLEAR */
+		sf->sf_param |= SP_FAILOUT;
+	else if (flags & SS_CLEAR_FAILOUT)
+		sf->sf_param &= ~SP_FAILOUT;
+
+	if (flags & SS_SET_DRYRUN) {
+		sf->sf_param |= SP_DRYRUN;
+	} else if (flags & SS_CLEAR_DRYRUN && sf->sf_param & SP_DRYRUN) {
+		sf->sf_param &= ~SP_DRYRUN;
+		drop_dryrun = true; /* leaving dry-run mode */
+	}
+
+	if (flags & SS_RESET)
+		scrub_file_reset(scrub, dev->od_uuid, 0);
+
+	scrub->os_partial_scan = 0;
+	if (flags & SS_AUTO_FULL) {
+		scrub->os_full_speed = 1;
+		sf->sf_flags |= SF_AUTO;
+	} else if (sf->sf_flags & (SF_RECREATED | SF_INCONSISTENT |
+				   SF_UPGRADE)) {
+		scrub->os_full_speed = 1;
+	} else {
+		scrub->os_full_speed = 0;
+	}
+
+	spin_lock(&scrub->os_lock); /* reset the run-time state flags */
+	scrub->os_in_prior = 0;
+	scrub->os_waiting = 0;
+	scrub->os_paused = 0;
+	scrub->os_in_join = 0;
+	scrub->os_full_scrub = 0;
+	spin_unlock(&scrub->os_lock);
+	scrub->os_new_checked = 0;
+	if (drop_dryrun && sf->sf_pos_first_inconsistent != 0)
+		sf->sf_pos_latest_start = sf->sf_pos_first_inconsistent;
+	else if (sf->sf_pos_last_checkpoint != 0) /* resume after ckpt */
+		sf->sf_pos_latest_start = sf->sf_pos_last_checkpoint + 1;
+	else /* no checkpoint recorded: start from position 1 */
+		sf->sf_pos_latest_start = 1;
+
+	scrub->os_pos_current = sf->sf_pos_latest_start;
+	sf->sf_status = SS_SCANNING;
+	sf->sf_time_latest_start = cfs_time_current_sec();
+	sf->sf_time_last_checkpoint = sf->sf_time_latest_start;
+	sf->sf_pos_last_checkpoint = sf->sf_pos_latest_start - 1;
+	rc = scrub_file_store(env, scrub);
+	if (!rc) { /* stored: mark the thread running and wake waiters */
+		spin_lock(&scrub->os_lock);
+		thread_set_flags(thread, SVC_RUNNING);
+		spin_unlock(&scrub->os_lock);
+		wake_up_all(&thread->t_ctl_waitq);
+	}
+	up_write(&scrub->os_rwsem);
+
+	RETURN(rc);
+}
+
+static int osd_scrub_post(const struct lu_env *env, struct osd_device *dev,
+			  int result)
+{
+	struct lustre_scrub *scrub = &dev->od_scrub;
+	struct scrub_file *sf = &scrub->os_file;
+	int rc;
+	ENTRY;
+
+	CDEBUG(D_LFSCK, "%s: OI scrub post with result = %d\n",
+	       scrub->os_name, result);
+
+	down_write(&scrub->os_rwsem);
+	spin_lock(&scrub->os_lock);
+	thread_set_flags(&scrub->os_thread, SVC_STOPPING);
+	spin_unlock(&scrub->os_lock);
+	if (scrub->os_new_checked > 0) { /* fold in the unsaved progress */
+		sf->sf_items_checked += scrub->os_new_checked;
+		scrub->os_new_checked = 0;
+		sf->sf_pos_last_checkpoint = scrub->os_pos_current;
+	}
+	sf->sf_time_last_checkpoint = cfs_time_current_sec();
+	if (result > 0) { /* recorded as SS_COMPLETED */
+		sf->sf_status = SS_COMPLETED;
+		if (!(sf->sf_param & SP_DRYRUN)) { /* dryrun keeps them */
+			memset(sf->sf_oi_bitmap, 0, SCRUB_OI_BITMAP_SIZE);
+			sf->sf_flags &= ~(SF_RECREATED | SF_INCONSISTENT |
+					  SF_UPGRADE | SF_AUTO);
+		}
+		sf->sf_time_last_complete = sf->sf_time_last_checkpoint;
+		sf->sf_success_count++;
+	} else if (result == 0) { /* recorded as paused or stopped */
+		if (scrub->os_paused)
+			sf->sf_status = SS_PAUSED;
+		else
+			sf->sf_status = SS_STOPPED;
+	} else { /* result < 0 */
+		sf->sf_status = SS_FAILED;
+	}
+	sf->sf_run_time += cfs_duration_sec(cfs_time_current() + HALF_SEC -
+					    scrub->os_time_last_checkpoint);
+	rc = scrub_file_store(env, scrub);
+	up_write(&scrub->os_rwsem);
+
+	RETURN(rc < 0 ? rc : result); /* a store error overrides result */
+}
+
+/* iteration engine */
+
+/*
+ * Wait condition of the scrub thread, evaluated by l_wait_event().
+ *
+ * Under os_lock, os_waiting is set only when the iterator offers no window,
+ * no inconsistent item is queued, the iterator itself is not waiting and the
+ * scrub thread is still running; otherwise it is cleared.
+ *
+ * Returns non-zero when the scrub thread may proceed.
+ */
+static inline int
+osd_scrub_wakeup(struct lustre_scrub *scrub, struct osd_otable_it *it)
+{
+	spin_lock(&scrub->os_lock);
+	if (!osd_scrub_has_window(it) &&
+	    list_empty(&scrub->os_inconsistent_items) &&
+	    !it->ooi_waiting && thread_is_running(&scrub->os_thread))
+		scrub->os_waiting = 1;
+	else
+		scrub->os_waiting = 0;
+	spin_unlock(&scrub->os_lock);
+
+	return scrub->os_waiting == 0;
+}
+
+/*
+ * Pick the next object for the OI scrub thread to verify.
+ *
+ * Items queued on scrub->os_inconsistent_items are served first. Otherwise
+ * the objects are walked with dmu_object_next() from scrub->os_pos_current,
+ * skipping those whose xattrs cannot be loaded or whose LMA is missing or
+ * flagged LMAC_NOT_IN_OI / LMAI_AGENT.
+ *
+ * On success (return 0) *fid and *oid describe the object to check.
+ * Other return values: SCRUB_NEXT_EXIT when the thread is no longer running,
+ * SCRUB_NEXT_BREAK when dmu_object_next() reports ESRCH, SCRUB_NEXT_CRASH /
+ * SCRUB_NEXT_FATAL for the fault injection hooks, or a negative errno.
+ */
+static int osd_scrub_next(const struct lu_env *env, struct osd_device *dev,
+			  struct lu_fid *fid, uint64_t *oid)
+{
+	struct l_wait_info lwi = { 0 };
+	struct lustre_scrub *scrub = &dev->od_scrub;
+	struct ptlrpc_thread *thread = &scrub->os_thread;
+	struct osd_otable_it *it = dev->od_otable_it;
+	struct lustre_mdt_attrs *lma = NULL;
+	nvlist_t *nvbuf = NULL;
+	int size = 0;
+	int rc = 0;
+	ENTRY;
+
+	/* Fault injection: delay for cfs_fail_val seconds, unless an
+	 * inconsistent item gets queued or the thread is told to stop. */
+	if (OBD_FAIL_CHECK(OBD_FAIL_OSD_SCRUB_DELAY) && cfs_fail_val > 0) {
+		lwi = LWI_TIMEOUT(cfs_time_seconds(cfs_fail_val), NULL, NULL);
+		if (likely(lwi.lwi_timeout > 0)) {
+			l_wait_event(thread->t_ctl_waitq,
+				!list_empty(&scrub->os_inconsistent_items) ||
+				!thread_is_running(thread),
+				&lwi);
+			if (unlikely(!thread_is_running(thread)))
+				RETURN(SCRUB_NEXT_EXIT);
+		}
+	}
+
+	/* Fault injection: flag the thread as stopping and report a crash. */
+	if (OBD_FAIL_CHECK(OBD_FAIL_OSD_SCRUB_CRASH)) {
+		spin_lock(&scrub->os_lock);
+		thread_set_flags(thread, SVC_STOPPING);
+		spin_unlock(&scrub->os_lock);
+		RETURN(SCRUB_NEXT_CRASH);
+	}
+
+	/* Fault injection: report a fatal failure. */
+	if (OBD_FAIL_CHECK(OBD_FAIL_OSD_SCRUB_FATAL))
+		RETURN(SCRUB_NEXT_FATAL);
+
+again:
+	/* Drop the xattr buffer of the object skipped in the last round;
+	 * lma is reset together with it. */
+	if (nvbuf) {
+		nvlist_free(nvbuf);
+		nvbuf = NULL;
+		lma = NULL;
+	}
+
+	/* Unlocked check first, re-checked under os_lock before use. */
+	if (!list_empty(&scrub->os_inconsistent_items)) {
+		spin_lock(&scrub->os_lock);
+		if (likely(!list_empty(&scrub->os_inconsistent_items))) {
+			struct osd_inconsistent_item *oii;
+
+			/* The item is only read here, it stays on the list;
+			 * os_in_prior tells osd_scrub_exec() that this round
+			 * came from the priority list. */
+			oii = list_entry(scrub->os_inconsistent_items.next,
+				struct osd_inconsistent_item, oii_list);
+			*fid = oii->oii_cache.oic_fid;
+			*oid = oii->oii_cache.oic_dnode;
+			scrub->os_in_prior = 1;
+			spin_unlock(&scrub->os_lock);
+
+			GOTO(out, rc = 0);
+		}
+		spin_unlock(&scrub->os_lock);
+	}
+
+	/* Not at full speed and no window left: wait until
+	 * osd_scrub_wakeup() allows the scrub to go ahead. */
+	if (!scrub->os_full_speed && !osd_scrub_has_window(it)) {
+		memset(&lwi, 0, sizeof(lwi));
+		l_wait_event(thread->t_ctl_waitq,
+			     osd_scrub_wakeup(scrub, it),
+			     &lwi);
+	}
+
+	if (unlikely(!thread_is_running(thread)))
+		GOTO(out, rc = SCRUB_NEXT_EXIT);
+
+	/* The result is negated before being compared with -ESRCH, which
+	 * is mapped to SCRUB_NEXT_BREAK. */
+	rc = -dmu_object_next(dev->od_os, &scrub->os_pos_current, B_FALSE, 0);
+	if (rc)
+		GOTO(out, rc = (rc == -ESRCH ? SCRUB_NEXT_BREAK : rc));
+
+	/* Objects whose xattrs cannot be loaded for these reasons are
+	 * skipped silently. */
+	rc = __osd_xattr_load_by_oid(dev, scrub->os_pos_current, &nvbuf);
+	if (rc == -ENOENT || rc == -EEXIST || rc == -ENODATA)
+		goto again;
+
+	if (rc)
+		GOTO(out, rc);
+
+	LASSERT(nvbuf != NULL);
+	rc = -nvlist_lookup_byte_array(nvbuf, XATTR_NAME_LMA,
+				       (uchar_t **)&lma, &size);
+	if (!rc) {
+		lustre_lma_swab(lma);
+		if (likely(!(lma->lma_compat & LMAC_NOT_IN_OI) &&
+			   !(lma->lma_incompat & LMAI_AGENT))) {
+			*fid = lma->lma_self_fid;
+			*oid = scrub->os_pos_current;
+
+			GOTO(out, rc = 0);
+		}
+	}
+
+	/* The object is skipped (no LMA, or flagged as not in OI / agent):
+	 * still count it for the iterator and wake the iterator up if it
+	 * is waiting. */
+	if (!scrub->os_full_speed) {
+		spin_lock(&scrub->os_lock);
+		it->ooi_prefetched++;
+		if (it->ooi_waiting) {
+			it->ooi_waiting = 0;
+			wake_up_all(&thread->t_ctl_waitq);
+		}
+		spin_unlock(&scrub->os_lock);
+	}
+
+	goto again;
+
+out:
+	if (nvbuf)
+		nvlist_free(nvbuf);
+
+	return rc;
+}
+
+/*
+ * Verify one object picked by osd_scrub_next() and take a checkpoint.
+ *
+ * The object is handed to osd_scrub_check_update(). For an object that did
+ * not come from the priority list, and when not running at full speed, the
+ * iterator's prefetch count is bumped and a waiting iterator is woken up.
+ *
+ * Returns the non-zero result of osd_scrub_check_update(), otherwise 0;
+ * a checkpoint failure is only logged.
+ */
+static int osd_scrub_exec(const struct lu_env *env, struct osd_device *dev,
+			  const struct lu_fid *fid, uint64_t oid, int rc)
+{
+	struct lustre_scrub *scrub = &dev->od_scrub;
+	struct osd_otable_it *it = dev->od_otable_it;
+
+	rc = osd_scrub_check_update(env, dev, fid, oid, rc);
+	if (scrub->os_in_prior) {
+		/* Priority item: just clear the marker. */
+		scrub->os_in_prior = 0;
+	} else if (!scrub->os_full_speed) {
+		spin_lock(&scrub->os_lock);
+		it->ooi_prefetched++;
+		if (it->ooi_waiting) {
+			it->ooi_waiting = 0;
+			wake_up_all(&scrub->os_thread.t_ctl_waitq);
+		}
+		spin_unlock(&scrub->os_lock);
+	}
+
+	if (rc != 0)
+		return rc;
+
+	rc = scrub_checkpoint(env, scrub);
+	if (rc != 0)
+		/* Continue, as long as the scrub itself can go ahead. */
+		CDEBUG(D_LFSCK, "%s: fail to checkpoint, pos = %llu: "
+		       "rc = %d\n", scrub->os_name, scrub->os_pos_current, rc);
+
+	return 0;
+}
+
+/*
+ * Main function of the OI scrub thread.
+ *
+ * @args is the struct osd_device to scrub. The thread prepares the scrub
+ * (osd_scrub_prep()), then repeatedly picks an object (osd_scrub_next())
+ * and verifies it (osd_scrub_exec()) until an error occurs, the scan
+ * breaks out, or the thread is no longer running. osd_scrub_post() records
+ * the final state, except on the paths that jump directly to "out" or
+ * "noenv". The thread always ends by flagging itself SVC_STOPPED and
+ * waking up the waiters on t_ctl_waitq.
+ */
+static int osd_scrub_main(void *args)
+{
+	struct lu_env env;
+	struct osd_device *dev = (struct osd_device *)args;
+	struct lustre_scrub *scrub = &dev->od_scrub;
+	struct ptlrpc_thread *thread = &scrub->os_thread;
+	struct lu_fid *fid;
+	uint64_t oid;
+	int rc = 0;
+	ENTRY;
+
+	rc = lu_env_init(&env, LCT_LOCAL | LCT_DT_THREAD);
+	if (rc) {
+		CDEBUG(D_LFSCK, "%s: OI scrub fail to init env: rc = %d\n",
+		       scrub->os_name, rc);
+		/* No env: skip both the post processing and lu_env_fini(). */
+		GOTO(noenv, rc);
+	}
+
+	rc = osd_scrub_prep(&env, dev);
+	if (rc) {
+		CDEBUG(D_LFSCK, "%s: OI scrub fail to scrub prep: rc = %d\n",
+		       scrub->os_name, rc);
+		/* Preparation failed: osd_scrub_post() is skipped. */
+		GOTO(out, rc);
+	}
+
+	/* When not at full speed, wait until the iterator user is ready
+	 * (or the thread is stopped) and start from the iterator's
+	 * position. */
+	if (!scrub->os_full_speed) {
+		struct l_wait_info lwi = { 0 };
+		struct osd_otable_it *it = dev->od_otable_it;
+
+		l_wait_event(thread->t_ctl_waitq,
+			     it->ooi_user_ready || !thread_is_running(thread),
+			     &lwi);
+		if (unlikely(!thread_is_running(thread)))
+			GOTO(post, rc = 0);
+
+		scrub->os_pos_current = it->ooi_pos;
+	}
+
+	CDEBUG(D_LFSCK, "%s: OI scrub start, flags = 0x%x, pos = %llu\n",
+	       scrub->os_name, scrub->os_start_flags,
+	       scrub->os_pos_current);
+
+	fid = &osd_oti_get(&env)->oti_fid;
+	while (!rc && thread_is_running(thread)) {
+		rc = osd_scrub_next(&env, dev, fid, &oid);
+		switch (rc) {
+		case SCRUB_NEXT_EXIT:
+			/* Stopped from outside: post with result 0. */
+			GOTO(post, rc = 0);
+		case SCRUB_NEXT_CRASH:
+			/* This path jumps to "out", so osd_scrub_post()
+			 * is not called and the scrub file is not
+			 * updated here. */
+			spin_lock(&scrub->os_lock);
+			thread_set_flags(&scrub->os_thread, SVC_STOPPING);
+			spin_unlock(&scrub->os_lock);
+			GOTO(out, rc = -EINVAL);
+		case SCRUB_NEXT_FATAL:
+			GOTO(post, rc = -EINVAL);
+		case SCRUB_NEXT_BREAK:
+			/* Posted with result 1, which osd_scrub_post()
+			 * records as SS_COMPLETED. */
+			GOTO(post, rc = 1);
+		}
+
+		/* Any other value (0 or an error) is passed on to
+		 * osd_scrub_exec() together with the object. */
+		rc = osd_scrub_exec(&env, dev, fid, oid, rc);
+	}
+
+	GOTO(post, rc);
+
+post:
+	rc = osd_scrub_post(&env, dev, rc);
+	CDEBUG(D_LFSCK, "%s: OI scrub: stop, pos = %llu: rc = %d\n",
+	       scrub->os_name, scrub->os_pos_current, rc);
+
+out:
+	/* Release the inconsistent items that are still queued. */
+	while (!list_empty(&scrub->os_inconsistent_items)) {
+		struct osd_inconsistent_item *oii;
+
+		oii = list_entry(scrub->os_inconsistent_items.next,
+				 struct osd_inconsistent_item, oii_list);
+		list_del_init(&oii->oii_list);
+		OBD_FREE_PTR(oii);
+	}
+
+	lu_env_fini(&env);
+
+noenv:
+	spin_lock(&scrub->os_lock);
+	thread_set_flags(thread, SVC_STOPPED);
+	wake_up_all(&thread->t_ctl_waitq);
+	spin_unlock(&scrub->os_lock);
+	return rc;
+}
+
+/* initial OI scrub */
+
+/* Forward declaration; the structure is defined after the prototypes. */
+struct osd_lf_map;
+
+/*
+ * Handler called for one name entry found while scanning a directory.
+ * As used by osd_ios_general_sd() the arguments are: env, device, entry
+ * name, dnode of the parent directory, dnode of the entry, scan flags and
+ * whether the entry is a directory.
+ */
+typedef int (*handle_dirent_t)(const struct lu_env *, struct osd_device *,
+			       const char *, uint64_t, uint64_t,
+			       enum osd_lf_flags, bool);
+static int osd_ios_varfid_hd(const struct lu_env *, struct osd_device *,
+			     const char *, uint64_t, uint64_t,
+			     enum osd_lf_flags, bool);
+static int osd_ios_uld_hd(const struct lu_env *, struct osd_device *,
+			  const char *, uint64_t, uint64_t,
+			  enum osd_lf_flags, bool);
+
+/*
+ * Directory scanner. The arguments are: env, device, dnode of the directory
+ * to scan, the handler to apply to its entries and the scan flags.
+ */
+typedef int (*scan_dir_t)(const struct lu_env *, struct osd_device *,
+			  uint64_t, handle_dirent_t, enum osd_lf_flags);
+static int osd_ios_general_sd(const struct lu_env *, struct osd_device *,
+			      uint64_t, handle_dirent_t, enum osd_lf_flags);
+static int osd_ios_ROOT_sd(const struct lu_env *, struct osd_device *,
+			   uint64_t, handle_dirent_t, enum osd_lf_flags);
+
+/* Description of one well-known local file checked by the initial OI scrub. */
+struct osd_lf_map {
+	/* entry name to look up; NULL terminates a table */
+	char			*olm_name;
+	/* FID passed to osd_ios_scan_one() for the entry; an all-zero FID
+	 * also disables the stale OI mapping removal when the entry is
+	 * missing */
+	struct lu_fid		 olm_fid;
+	/* OLF_* flags, e.g. OLF_SCAN_SUBITEMS or OLF_IDX_IN_FID */
+	enum osd_lf_flags	 olm_flags;
+	/* scanner queued for the entry when OLF_SCAN_SUBITEMS is set */
+	scan_dir_t		 olm_scan_dir;
+	/* per-entry handler handed to olm_scan_dir */
+	handle_dirent_t		 olm_handle_dirent;
+};
+
+/*
+ * Local files looked up by name under the device root (dev->od_root) by
+ * osd_initial_OI_scrub(). The table is terminated by the entry whose
+ * olm_name is NULL.
+ *
+ * Add newly introduced local files to this list in the future.
+ */
+static const struct osd_lf_map osd_lf_maps[] = {
+	/* CONFIGS */
+	{
+		.olm_name		= MOUNT_CONFIGS_DIR,
+		.olm_fid		= {
+			.f_seq	= FID_SEQ_LOCAL_FILE,
+			.f_oid	= MGS_CONFIGS_OID,
+		},
+		.olm_flags		= OLF_SCAN_SUBITEMS,
+		.olm_scan_dir		= osd_ios_general_sd,
+		.olm_handle_dirent	= osd_ios_varfid_hd,
+	},
+
+	/* NIDTBL_VERSIONS */
+	{
+		.olm_name		= MGS_NIDTBL_DIR,
+		.olm_flags		= OLF_SCAN_SUBITEMS,
+		.olm_scan_dir		= osd_ios_general_sd,
+		.olm_handle_dirent	= osd_ios_varfid_hd,
+	},
+
+	/* PENDING */
+	{
+		.olm_name		= "PENDING",
+	},
+
+	/* ROOT */
+	{
+		.olm_name		= "ROOT",
+		.olm_fid		= {
+			.f_seq	= FID_SEQ_ROOT,
+			.f_oid	= FID_OID_ROOT,
+		},
+		.olm_flags		= OLF_SCAN_SUBITEMS,
+		.olm_scan_dir		= osd_ios_ROOT_sd,
+	},
+
+	/* fld */
+	{
+		.olm_name		= "fld",
+		.olm_fid		= {
+			.f_seq	= FID_SEQ_LOCAL_FILE,
+			.f_oid	= FLD_INDEX_OID,
+		},
+	},
+
+	/* changelog_catalog */
+	{
+		.olm_name		= CHANGELOG_CATALOG,
+	},
+
+	/* changelog_users */
+	{
+		.olm_name		= CHANGELOG_USERS,
+	},
+
+	/* quota_master */
+	{
+		.olm_name		= QMT_DIR,
+		.olm_flags		= OLF_SCAN_SUBITEMS,
+		.olm_scan_dir		= osd_ios_general_sd,
+		.olm_handle_dirent	= osd_ios_varfid_hd,
+	},
+
+	/* quota_slave */
+	{
+		.olm_name		= QSD_DIR,
+		.olm_flags		= OLF_SCAN_SUBITEMS,
+		.olm_scan_dir		= osd_ios_general_sd,
+		.olm_handle_dirent	= osd_ios_varfid_hd,
+	},
+
+	/* LFSCK */
+	{
+		.olm_name		= LFSCK_DIR,
+		.olm_flags		= OLF_SCAN_SUBITEMS,
+		.olm_scan_dir		= osd_ios_general_sd,
+		.olm_handle_dirent	= osd_ios_varfid_hd,
+	},
+
+	/* lfsck_bookmark */
+	{
+		.olm_name		= LFSCK_BOOKMARK,
+	},
+
+	/* lfsck_layout */
+	{
+		.olm_name		= LFSCK_LAYOUT,
+	},
+
+	/* lfsck_namespace */
+	{
+		.olm_name		= LFSCK_NAMESPACE,
+	},
+
+	/* OSP update logs update_log{_dir} use f_seq = FID_SEQ_UPDATE_LOG{_DIR}
+	 * and f_oid = index for their log files.  See lu_update_log{_dir}_fid()
+	 * for more details. */
+
+	/* update_log */
+	{
+		.olm_name		= "update_log",
+		.olm_fid		= {
+			.f_seq	= FID_SEQ_UPDATE_LOG,
+		},
+		.olm_flags		= OLF_IDX_IN_FID,
+	},
+
+	/* update_log_dir */
+	{
+		.olm_name		= "update_log_dir",
+		.olm_fid	= {
+			.f_seq	= FID_SEQ_UPDATE_LOG_DIR,
+		},
+		.olm_flags		= OLF_SCAN_SUBITEMS | OLF_IDX_IN_FID,
+		.olm_scan_dir		= osd_ios_general_sd,
+		.olm_handle_dirent	= osd_ios_uld_hd,
+	},
+
+	/* hsm_actions */
+	{
+		.olm_name		= HSM_ACTIONS,
+	},
+
+	/* nodemap */
+	{
+		.olm_name		= LUSTRE_NODEMAP_NAME,
+	},
+
+	/* terminator */
+	{
+		.olm_name		= NULL
+	}
+};
+
+/*
+ * Entries looked up by name under the .lustre directory by
+ * osd_ios_ROOT_sd(). The table is terminated by the entry whose olm_name
+ * is NULL.
+ *
+ * Add newly introduced files under .lustre/ to this list in the future.
+ */
+static const struct osd_lf_map osd_dl_maps[] = {
+	/* .lustre/fid */
+	{
+		.olm_name		= "fid",
+		.olm_fid		= {
+			.f_seq	= FID_SEQ_DOT_LUSTRE,
+			.f_oid	= FID_OID_DOT_LUSTRE_OBF,
+		},
+	},
+
+	/* .lustre/lost+found */
+	{
+		.olm_name		= "lost+found",
+		.olm_fid		= {
+			.f_seq	= FID_SEQ_DOT_LUSTRE,
+			.f_oid	= FID_OID_DOT_LUSTRE_LPF,
+		},
+	},
+
+	/* terminator */
+	{
+		.olm_name		= NULL
+	}
+};
+
+/*
+ * One directory queued on dev->od_ios_list, waiting to be scanned by
+ * osd_initial_OI_scrub().
+ */
+struct osd_ios_item {
+	/* linkage into dev->od_ios_list */
+	struct list_head	oii_list;
+	/* dnode of the directory to scan */
+	uint64_t		oii_parent;
+	/* flags handed to oii_scan_dir */
+	enum osd_lf_flags	oii_flags;
+	/* scanner to run on the directory */
+	scan_dir_t		oii_scan_dir;
+	/* per-entry handler handed to oii_scan_dir */
+	handle_dirent_t		oii_handle_dirent;
+};
+
+/*
+ * Queue the directory @parent at the tail of dev->od_ios_list so that the
+ * initial OI scrub scans it later with @scan_dir / @handle_dirent.
+ *
+ * Returns 0 on success or -ENOMEM if the item cannot be allocated.
+ */
+static int osd_ios_new_item(struct osd_device *dev, uint64_t parent,
+			    enum osd_lf_flags flags, scan_dir_t scan_dir,
+			    handle_dirent_t handle_dirent)
+{
+	struct osd_ios_item *item;
+
+	OBD_ALLOC_PTR(item);
+	if (item != NULL) {
+		INIT_LIST_HEAD(&item->oii_list);
+		item->oii_handle_dirent = handle_dirent;
+		item->oii_scan_dir = scan_dir;
+		item->oii_flags = flags;
+		item->oii_parent = parent;
+		list_add_tail(&item->oii_list, &dev->od_ios_list);
+
+		return 0;
+	}
+
+	CWARN("%s: initial OI scrub failed to add item for %llu\n",
+	      osd_name(dev), parent);
+
+	return -ENOMEM;
+}
+
+/**
+ * Verify the FID-in-LMA and the OI entry for one object.
+ *
+ * ios: Initial OI Scrub.
+ *
+ * The FID to check is taken from the object's LMA xattr when present;
+ * otherwise the caller supplied @fid is used (with f_oid replaced by
+ * dev->od_index when OLF_IDX_IN_FID is set). The FID is then looked up in
+ * the OI: a missing mapping is inserted, a mapping that points to another
+ * dnode is updated, and the corresponding SF_RECREATED / SF_INCONSISTENT
+ * flag is recorded in the scrub file first if not set yet.
+ *
+ * @fid may be NULL, in which case an object without LMA is skipped.
+ * @parent is not used by this function.
+ *
+ * Returns 0 on success or when the object is skipped, negative errno
+ * otherwise.
+ */
+static int osd_ios_scan_one(const struct lu_env *env, struct osd_device *dev,
+			    const struct lu_fid *fid, uint64_t parent,
+			    uint64_t oid, const char *name,
+			    enum osd_lf_flags flags)
+{
+	struct lustre_scrub *scrub = &dev->od_scrub;
+	struct scrub_file *sf = &scrub->os_file;
+	struct lustre_mdt_attrs	*lma = NULL;
+	nvlist_t *nvbuf = NULL;
+	struct lu_fid tfid;
+	uint64_t oid2 = 0;
+	__u64 flag = 0;
+	int size = 0;
+	int op = 0;
+	int rc;
+	ENTRY;
+
+	rc = __osd_xattr_load_by_oid(dev, oid, &nvbuf);
+	/* These two errors make the object be skipped silently. */
+	if (unlikely(rc == -ENOENT || rc == -EEXIST))
+		RETURN(0);
+
+	/* -ENODATA is handled below like a missing LMA. */
+	if (rc && rc != -ENODATA) {
+		CWARN("%s: initial OI scrub failed to get lma for %llu: "
+		      "rc = %d\n", osd_name(dev), oid, rc);
+
+		RETURN(rc);
+	}
+
+	if (!rc) {
+		LASSERT(nvbuf != NULL);
+		rc = -nvlist_lookup_byte_array(nvbuf, XATTR_NAME_LMA,
+					       (uchar_t **)&lma, &size);
+		if (rc || size == 0) {
+			/* No LMA in the xattr list: fall back to @fid. */
+			LASSERT(lma == NULL);
+			rc = -ENODATA;
+		} else {
+			LASSERTF(lma != NULL, "corrupted LMA, size %d\n", size);
+			lustre_lma_swab(lma);
+			if (lma->lma_compat & LMAC_NOT_IN_OI) {
+				nvlist_free(nvbuf);
+				RETURN(0);
+			}
+
+			/* Copy the FID out before nvbuf is released. */
+			tfid = lma->lma_self_fid;
+		}
+		nvlist_free(nvbuf);
+	}
+
+	if (rc == -ENODATA) {
+		if (!fid) {
+			/* Skip the object without FID-in-LMA */
+			CDEBUG(D_LFSCK, "%s: %llu has no FID-in-LMA, skip it\n",
+			       osd_name(dev), oid);
+
+			RETURN(0);
+		}
+
+		LASSERT(!fid_is_zero(fid));
+
+		tfid = *fid;
+		if (flags & OLF_IDX_IN_FID) {
+			LASSERT(dev->od_index >= 0);
+
+			tfid.f_oid = dev->od_index;
+		}
+	}
+
+	rc = osd_fid_lookup(env, dev, &tfid, &oid2);
+	if (rc) {
+		if (rc != -ENOENT) {
+			CWARN("%s: initial OI scrub failed to lookup fid for "
+			      DFID"=>%llu: rc = %d\n",
+			      osd_name(dev), PFID(&tfid), oid, rc);
+
+			RETURN(rc);
+		}
+
+		/* No OI mapping for the FID: insert one. */
+		flag = SF_RECREATED;
+		op = DTO_INDEX_INSERT;
+	} else {
+		/* The OI mapping already points to this object. */
+		if (oid == oid2)
+			RETURN(0);
+
+		/* The OI mapping points to another dnode: update it. */
+		flag = SF_INCONSISTENT;
+		op = DTO_INDEX_UPDATE;
+	}
+
+	/* Record the kind of problem in the scrub file before repairing. */
+	if (!(sf->sf_flags & flag)) {
+		scrub_file_reset(scrub, dev->od_uuid, flag);
+		rc = scrub_file_store(env, scrub);
+		if (rc)
+			RETURN(rc);
+	}
+
+	rc = osd_scrub_refresh_mapping(env, dev, &tfid, oid, op, true, name);
+
+	/* A positive result is reported to the caller as success. */
+	RETURN(rc > 0 ? 0 : rc);
+}
+
+/*
+ * Dirent handler for entries without a known FID: the entry is verified by
+ * osd_ios_scan_one() with no fallback FID, and a sub-directory that passed
+ * the check is queued to be scanned with the same handler.
+ */
+static int osd_ios_varfid_hd(const struct lu_env *env, struct osd_device *dev,
+			     const char *name, uint64_t parent, uint64_t oid,
+			     enum osd_lf_flags flags, bool is_dir)
+{
+	int rc;
+	ENTRY;
+
+	rc = osd_ios_scan_one(env, dev, NULL, parent, oid, name, 0);
+	if (rc != 0 || !is_dir)
+		RETURN(rc);
+
+	rc = osd_ios_new_item(dev, oid, flags, osd_ios_general_sd,
+			      osd_ios_varfid_hd);
+
+	RETURN(rc);
+}
+
+/*
+ * Dirent handler for entries whose name is a FID in "[seq:oid:ver]" form.
+ *
+ * Names that do not start with '[' are skipped. Otherwise the FID is parsed
+ * from the name and, if it is sane, the entry is verified against it by
+ * osd_ios_scan_one().
+ *
+ * Returns 0 for skipped names, -EIO when the parsed FID is not sane,
+ * otherwise the result of osd_ios_scan_one().
+ */
+static int osd_ios_uld_hd(const struct lu_env *env, struct osd_device *dev,
+			  const char *name, uint64_t parent, uint64_t oid,
+			  enum osd_lf_flags flags, bool is_dir)
+{
+	struct lu_fid tfid;
+	int rc;
+	ENTRY;
+
+	/* skip any non-DFID format name */
+	if (name[0] != '[')
+		RETURN(0);
+
+	/* Start from a zeroed FID: the sscanf() result is not checked, so
+	 * without this a failed or partial parse would leave uninitialized
+	 * stack data in the fields that fid_is_sane() inspects. */
+	fid_zero(&tfid);
+
+	/* skip the start '[' */
+	sscanf(&name[1], SFID, RFID(&tfid));
+	if (fid_is_sane(&tfid))
+		rc = osd_ios_scan_one(env, dev, &tfid, parent, oid, name, 0);
+	else
+		rc = -EIO;
+
+	RETURN(rc);
+}
+
+/*
+ * General scanner for the directories except /ROOT during initial OI scrub.
+ * It scans the name entries under the given directory one by one. For each
+ * entry, verifies its OI mapping via the given @handle_dirent.
+ *
+ * Entries whose name starts with '.' are skipped. A failure to look up or
+ * to handle a single entry is logged and the scan moves on to the next
+ * entry.
+ *
+ * Returns 0 when the end of the directory is reached, or the negative errno
+ * of the cursor operation that stopped the scan.
+ */
+static int osd_ios_general_sd(const struct lu_env *env, struct osd_device *dev,
+			      uint64_t parent, handle_dirent_t handle_dirent,
+			      enum osd_lf_flags flags)
+{
+	struct osd_thread_info *info = osd_oti_get(env);
+	struct luz_direntry *zde = &info->oti_zde;
+	zap_attribute_t *za = &info->oti_za;
+	zap_cursor_t *zc = &info->oti_zc;
+	int rc;
+	ENTRY;
+
+	zap_cursor_init_serialized(zc, dev->od_os, parent, 0);
+	rc = -zap_cursor_retrieve(zc, za);
+	if (rc == -ENOENT)
+		zap_cursor_advance(zc);
+	else if (rc)
+		GOTO(log, rc);
+
+	while (1) {
+		rc = -zap_cursor_retrieve(zc, za);
+		if (rc)
+			GOTO(log, rc = (rc == -ENOENT ? 0 : rc));
+
+		/* skip the entry started with '.' */
+		if (likely(za->za_name[0] != '.')) {
+			rc = osd_zap_lookup(dev, parent, NULL, za->za_name,
+					za->za_integer_length,
+					sizeof(*zde) / za->za_integer_length,
+					(void *)zde);
+			if (rc) {
+				/* Skip the entry, but fall through to the
+				 * cursor advance below: retrying without
+				 * advancing would retrieve the very same
+				 * entry again and loop forever. */
+				CWARN("%s: initial OI scrub failed to lookup "
+				      "%s under %llu: rc = %d\n",
+				      osd_name(dev), za->za_name, parent, rc);
+			} else {
+				rc = handle_dirent(env, dev, za->za_name,
+					parent, zde->lzd_reg.zde_dnode, flags,
+					S_ISDIR(DTTOIF(zde->lzd_reg.zde_type)) ?
+					true : false);
+				CDEBUG(D_LFSCK, "%s: initial OI scrub handled "
+				       "%s under %llu: rc = %d\n",
+				       osd_name(dev), za->za_name, parent, rc);
+			}
+		}
+
+		zap_cursor_advance(zc);
+	}
+
+log:
+	if (rc)
+		CWARN("%s: initial OI scrub failed to scan the directory %llu: "
+		      "rc = %d\n", osd_name(dev), parent, rc);
+	zap_cursor_fini(zc);
+
+	return rc;
+}
+
+/*
+ * The scanner for the /ROOT directory. Not all the items under /ROOT are
+ * scanned during the initial OI scrub: only .lustre and the sub-items
+ * listed in osd_dl_maps under .lustre are handled.
+ *
+ * @handle_dirent and @flags are not used by this scanner.
+ *
+ * Returns 0 when .lustre does not exist, a negative errno when .lustre
+ * cannot be looked up or verified, otherwise the last error reported by
+ * osd_ios_scan_one() for a sub-item (0 if there was none).
+ */
+static int osd_ios_ROOT_sd(const struct lu_env *env, struct osd_device *dev,
+			   uint64_t parent, handle_dirent_t handle_dirent,
+			   enum osd_lf_flags flags)
+{
+	struct luz_direntry *zde = &osd_oti_get(env)->oti_zde;
+	const struct osd_lf_map *map;
+	uint64_t oid;
+	int rc;
+	int rc1 = 0;
+	ENTRY;
+
+	rc = osd_zap_lookup(dev, parent, NULL, dot_lustre_name, 8,
+			    sizeof(*zde) / 8, (void *)zde);
+	if (rc == -ENOENT) {
+		/* The .lustre directory is lost. That is not fatal. It can
+		 * be re-created in the subsequent MDT start processing. */
+		RETURN(0);
+	}
+
+	if (rc) {
+		CWARN("%s: initial OI scrub failed to find .lustre: "
+		      "rc = %d\n", osd_name(dev), rc);
+
+		RETURN(rc);
+	}
+
+	oid = zde->lzd_reg.zde_dnode;
+	rc = osd_ios_scan_one(env, dev, &LU_DOT_LUSTRE_FID, parent, oid,
+			      dot_lustre_name, 0);
+	if (rc)
+		RETURN(rc);
+
+	for (map = osd_dl_maps; map->olm_name; map++) {
+		rc = osd_zap_lookup(dev, oid, NULL, map->olm_name, 8,
+				    sizeof(*zde) / 8, (void *)zde);
+		if (rc) {
+			if (rc != -ENOENT)
+				/* The literals are concatenated, so the
+				 * first one must end with a space. */
+				CWARN("%s: initial OI scrub failed to find "
+				      "the entry %s under .lustre: rc = %d\n",
+				      osd_name(dev), map->olm_name, rc);
+			else if (!fid_is_zero(&map->olm_fid))
+				/* Try to remove the stale OI mapping. */
+				osd_scrub_refresh_mapping(env, dev,
+						&map->olm_fid, 0,
+						DTO_INDEX_DELETE, true,
+						map->olm_name);
+			continue;
+		}
+
+		rc = osd_ios_scan_one(env, dev, &map->olm_fid, oid,
+				      zde->lzd_reg.zde_dnode, map->olm_name,
+				      map->olm_flags);
+		/* Remember the failure, but keep checking the others. */
+		if (rc)
+			rc1 = rc;
+	}
+
+	RETURN(rc1);
+}
+
+/*
+ * Initial OI scrub: verify the OI mappings of the local files listed in
+ * osd_lf_maps under the device root, then scan every directory that got
+ * queued on dev->od_ios_list along the way. Failures are not reported to
+ * the caller.
+ */
+static void osd_initial_OI_scrub(const struct lu_env *env,
+				 struct osd_device *dev)
+{
+	struct luz_direntry *zde = &osd_oti_get(env)->oti_zde;
+	const struct osd_lf_map *map = osd_lf_maps;
+	int rc;
+	ENTRY;
+
+	for (; map->olm_name != NULL; map++) {
+		rc = osd_zap_lookup(dev, dev->od_root, NULL, map->olm_name, 8,
+				    sizeof(*zde) / 8, (void *)zde);
+		if (rc == -ENOENT) {
+			/* Try to remove the stale OI mapping. */
+			if (!fid_is_zero(&map->olm_fid))
+				osd_scrub_refresh_mapping(env, dev,
+						&map->olm_fid, 0,
+						DTO_INDEX_DELETE, true,
+						map->olm_name);
+			continue;
+		}
+
+		if (rc != 0) {
+			CWARN("%s: initial OI scrub failed "
+			      "to find the entry %s: rc = %d\n",
+			      osd_name(dev), map->olm_name, rc);
+			continue;
+		}
+
+		rc = osd_ios_scan_one(env, dev, &map->olm_fid, dev->od_root,
+				      zde->lzd_reg.zde_dnode, map->olm_name,
+				      map->olm_flags);
+		if (rc == 0 && (map->olm_flags & OLF_SCAN_SUBITEMS))
+			osd_ios_new_item(dev, zde->lzd_reg.zde_dnode,
+					 map->olm_flags, map->olm_scan_dir,
+					 map->olm_handle_dirent);
+	}
+
+	/* Drain the queue; the scanners may append further directories. */
+	while (!list_empty(&dev->od_ios_list)) {
+		struct osd_ios_item *cur;
+
+		cur = list_entry(dev->od_ios_list.next,
+				 struct osd_ios_item, oii_list);
+		list_del_init(&cur->oii_list);
+		cur->oii_scan_dir(env, dev, cur->oii_parent,
+				  cur->oii_handle_dirent, cur->oii_flags);
+		OBD_FREE_PTR(cur);
+	}
+
+	EXIT;
+}
+
+/* OI scrub start/stop */
+
+/*
+ * Start the OI scrub thread with the given start @flags.
+ *
+ * Returns -EROFS on a read-only device, 0 when the scrub was started or is
+ * already running (-EALREADY from scrub_start()), otherwise the error from
+ * scrub_start().
+ */
+int osd_scrub_start(const struct lu_env *env, struct osd_device *dev,
+		    __u32 flags)
+{
+	int rc;
+	ENTRY;
+
+	if (dev->od_dt_dev.dd_rdonly)
+		RETURN(-EROFS);
+
+	/* od_otable_sem: prevent concurrent start/stop */
+	down(&dev->od_otable_sem);
+	rc = scrub_start(osd_scrub_main, &dev->od_scrub, dev, flags);
+	up(&dev->od_otable_sem);
+
+	if (rc == -EALREADY)
+		rc = 0;
+
+	RETURN(rc);
+}
+
+/*
+ * Stop the OI scrub thread of @dev. os_paused is set before scrub_stop()
+ * is called; osd_scrub_post() checks that flag to record SS_PAUSED instead
+ * of SS_STOPPED when the scrub ends with result 0.
+ */
+static void osd_scrub_stop(struct osd_device *dev)
+{
+	struct lustre_scrub *scrub = &dev->od_scrub;
+	ENTRY;
+
+	/* od_otable_sem: prevent concurrent start/stop */
+	down(&dev->od_otable_sem);
+	scrub->os_paused = 1;
+	scrub_stop(scrub);
+	up(&dev->od_otable_sem);
+
+	EXIT;
+}
+
+/* OI scrub setup/cleanup */
+
+/* Name under the device root of the object used by osd_scrub_setup() to
+ * hold the scrub file. */
+static const char osd_scrub_name[] = "OI_scrub";
+
+/*
+ * Set up the OI scrub for @dev.
+ *
+ * The scrub state in dev->od_scrub is initialized, the "OI_scrub" object
+ * under the device root is found or created, and the scrub file is loaded
+ * from it (or initialized when scrub_file_load() returns -ENOENT or
+ * -EFAULT). A changed dataset UUID resets the scrub file, and a status
+ * left at SS_SCANNING is turned into SS_CRASHED. Then the OI files are
+ * initialized and, unless the device is read-only, the initial OI scrub is
+ * run and the scrub thread is started automatically if the recorded state
+ * and od_auto_scrub_interval ask for it.
+ *
+ * Returns 0 on success; on failure the references taken here are dropped
+ * and the error is returned.
+ */
+int osd_scrub_setup(const struct lu_env *env, struct osd_device *dev)
+{
+	struct osd_thread_info *info = osd_oti_get(env);
+	struct lustre_scrub *scrub = &dev->od_scrub;
+	struct scrub_file *sf = &scrub->os_file;
+	struct lu_fid *fid = &info->oti_fid;
+	struct dt_object *obj;
+	uint64_t oid;
+	int rc = 0;
+	bool dirty = false;
+	ENTRY;
+
+	/* The dataset guid is used as the UUID stored in the scrub file. */
+	memcpy(dev->od_uuid,
+	       &dsl_dataset_phys(dev->od_os->os_dsl_dataset)->ds_guid,
+	       sizeof(dsl_dataset_phys(dev->od_os->os_dsl_dataset)->ds_guid));
+	memset(&dev->od_scrub, 0, sizeof(struct lustre_scrub));
+	init_waitqueue_head(&scrub->os_thread.t_ctl_waitq);
+	init_rwsem(&scrub->os_rwsem);
+	spin_lock_init(&scrub->os_lock);
+	INIT_LIST_HEAD(&scrub->os_inconsistent_items);
+	scrub->os_name = osd_name(dev);
+
+	/* 'What the @fid is' is not important, because the object
+	 * has no OI mapping, and only is visible inside the OSD.*/
+	fid->f_seq = FID_SEQ_IGIF_MAX;
+	if (dev->od_is_ost)
+		/* 1U: shifting a signed 1 into the sign bit is undefined */
+		fid->f_oid = ((1U << 31) | dev->od_index) + 1;
+	else
+		fid->f_oid = dev->od_index + 1;
+	fid->f_ver = 0;
+	rc = osd_obj_find_or_create(env, dev, dev->od_root,
+				    osd_scrub_name, &oid, fid, false);
+	if (rc)
+		RETURN(rc);
+
+	rc = osd_idc_find_and_init_with_oid(env, dev, fid, oid);
+	if (rc)
+		RETURN(rc);
+
+	obj = lu2dt(lu_object_find_slice(env, osd2lu_dev(dev), fid, NULL));
+	if (IS_ERR_OR_NULL(obj))
+		RETURN(obj ? PTR_ERR(obj) : -ENOENT);
+
+	scrub->os_obj = obj;
+	rc = scrub_file_load(env, scrub);
+	if (rc == -ENOENT || rc == -EFAULT) {
+		scrub_file_init(scrub, dev->od_uuid);
+		dirty = true;
+	} else if (rc < 0) {
+		GOTO(cleanup_obj, rc);
+	} else {
+		if (memcmp(sf->sf_uuid, dev->od_uuid, 16) != 0) {
+			struct obd_uuid *old_uuid;
+			struct obd_uuid *new_uuid;
+
+			/* The buffers are only needed for the report; the
+			 * reset below happens even if they are missing. */
+			OBD_ALLOC_PTR(old_uuid);
+			OBD_ALLOC_PTR(new_uuid);
+			if (!old_uuid || !new_uuid) {
+				CERROR("%s: UUID has been changed, but "
+				       "failed to allocate RAM for report\n",
+				       osd_name(dev));
+			} else {
+				class_uuid_unparse(sf->sf_uuid, old_uuid);
+				class_uuid_unparse(dev->od_uuid, new_uuid);
+				CDEBUG(D_LFSCK, "%s: UUID has been changed "
+				       "from %s to %s\n", osd_name(dev),
+				       old_uuid->uuid, new_uuid->uuid);
+			}
+			scrub_file_reset(scrub, dev->od_uuid, SF_INCONSISTENT);
+			dirty = true;
+			if (old_uuid)
+				OBD_FREE_PTR(old_uuid);
+			if (new_uuid)
+				OBD_FREE_PTR(new_uuid);
+		} else if (sf->sf_status == SS_SCANNING) {
+			/* The previous run did not go through
+			 * osd_scrub_post(). */
+			sf->sf_status = SS_CRASHED;
+			dirty = true;
+		}
+
+		/* The OI count must be a power of two. */
+		if ((sf->sf_oi_count & (sf->sf_oi_count - 1)) != 0) {
+			LCONSOLE_WARN("%s: invalid oi count %d, set it to %d\n",
+				      osd_name(dev), sf->sf_oi_count,
+				      osd_oi_count);
+			sf->sf_oi_count = osd_oi_count;
+			dirty = true;
+		}
+	}
+
+	if (sf->sf_pos_last_checkpoint != 0)
+		scrub->os_pos_current = sf->sf_pos_last_checkpoint + 1;
+	else
+		scrub->os_pos_current = 1;
+
+	if (dirty) {
+		rc = scrub_file_store(env, scrub);
+		if (rc)
+			GOTO(cleanup_obj, rc);
+	}
+
+	/* Initialize OI files. */
+	rc = osd_oi_init(env, dev);
+	if (rc < 0)
+		GOTO(cleanup_obj, rc);
+
+	if (!dev->od_dt_dev.dd_rdonly)
+		osd_initial_OI_scrub(env, dev);
+
+	/* Resume or trigger the scrub automatically when allowed. */
+	if (!dev->od_dt_dev.dd_rdonly &&
+	    dev->od_auto_scrub_interval != AS_NEVER &&
+	    ((sf->sf_status == SS_PAUSED) ||
+	     (sf->sf_status == SS_CRASHED &&
+	      sf->sf_flags & (SF_RECREATED | SF_INCONSISTENT |
+			      SF_UPGRADE | SF_AUTO)) ||
+	     (sf->sf_status == SS_INIT &&
+	      sf->sf_flags & (SF_RECREATED | SF_INCONSISTENT |
+			      SF_UPGRADE))))
+		rc = osd_scrub_start(env, dev, SS_AUTO_FULL);
+
+	if (rc)
+		GOTO(cleanup_oi, rc);
+
+	RETURN(0);
+
+cleanup_oi:
+	osd_oi_fini(env, dev);
+cleanup_obj:
+	dt_object_put_nocache(env, scrub->os_obj);
+	scrub->os_obj = NULL;
+
+	return rc;
+}
+
+/*
+ * Undo osd_scrub_setup(): stop the scrub thread and drop the reference on
+ * the scrub file object if it is still held, then release the OI table if
+ * present. No object table iterator may be attached to the device any more.
+ */
+void osd_scrub_cleanup(const struct lu_env *env, struct osd_device *dev)
+{
+	struct lustre_scrub *scrub = &dev->od_scrub;
+
+	LASSERT(!dev->od_otable_it);
+
+	if (scrub->os_obj) {
+		osd_scrub_stop(dev);
+		dt_object_put_nocache(env, scrub->os_obj);
+		scrub->os_obj = NULL;
+	}
+
+	if (dev->od_oi_table)
+		osd_oi_fini(env, dev);
+}
+
+/* object table based iteration APIs */
+
+/*
+ * Create the object table iterator of the device and start the OI scrub
+ * on its behalf.
+ *
+ * @attr carries the dt_otable_it_flags in its upper bits (above
+ * DT_OTABLE_IT_FLAGS_SHIFT) and the dt_otable_it_valid bits below; they are
+ * translated into scrub start flags here.
+ *
+ * Returns the new iterator, or ERR_PTR(-EROFS) on a read-only device,
+ * ERR_PTR(-EALREADY) if the device already has an iterator,
+ * ERR_PTR(-ENOMEM), or the ERR_PTR of the scrub_start() failure.
+ */
+static struct dt_it *osd_otable_it_init(const struct lu_env *env,
+				       struct dt_object *dt, __u32 attr)
+{
+	enum dt_otable_it_flags flags = attr >> DT_OTABLE_IT_FLAGS_SHIFT;
+	enum dt_otable_it_valid valid = attr & ~DT_OTABLE_IT_FLAGS_MASK;
+	struct osd_device *dev = osd_dev(dt->do_lu.lo_dev);
+	struct lustre_scrub *scrub = &dev->od_scrub;
+	struct osd_otable_it *it;
+	__u32 start = 0;
+	int rc;
+	ENTRY;
+
+	if (dev->od_dt_dev.dd_rdonly)
+		RETURN(ERR_PTR(-EROFS));
+
+	/* od_otable_sem: prevent concurrent init/fini */
+	down(&dev->od_otable_sem);
+	/* Only one iterator per device at a time. */
+	if (dev->od_otable_it)
+		GOTO(out, it = ERR_PTR(-EALREADY));
+
+	OBD_ALLOC_PTR(it);
+	if (!it)
+		GOTO(out, it = ERR_PTR(-ENOMEM));
+
+	if (flags & DOIF_OUTUSED)
+		it->ooi_used_outside = 1;
+
+	if (flags & DOIF_RESET)
+		start |= SS_RESET;
+
+	/* The failout / dryrun settings are only touched when the matching
+	 * "valid" bit says that the caller specified them. */
+	if (valid & DOIV_ERROR_HANDLE) {
+		if (flags & DOIF_FAILOUT)
+			start |= SS_SET_FAILOUT;
+		else
+			start |= SS_CLEAR_FAILOUT;
+	}
+
+	if (valid & DOIV_DRYRUN) {
+		if (flags & DOIF_DRYRUN)
+			start |= SS_SET_DRYRUN;
+		else
+			start |= SS_CLEAR_DRYRUN;
+	}
+
+	/* XXX: dmu_object_next() does NOT find dnodes allocated
+	 *	in the current non-committed txg, so we force txg
+	 *	commit to find all existing dnodes ... */
+	txg_wait_synced(dmu_objset_pool(dev->od_os), 0ULL);
+
+	/* The iterator is published before the scrub thread is started;
+	 * osd_scrub_main() reads dev->od_otable_it. */
+	dev->od_otable_it = it;
+	it->ooi_dev = dev;
+	rc = scrub_start(osd_scrub_main, scrub, dev, start & ~SS_AUTO_PARTIAL);
+	if (rc == -EALREADY) {
+		/* The scrub is running already: iterate from position 1. */
+		it->ooi_pos = 1;
+	} else if (rc < 0) {
+		dev->od_otable_it = NULL;
+		OBD_FREE_PTR(it);
+		it = ERR_PTR(rc);
+	} else {
+		it->ooi_pos = scrub->os_pos_current;
+	}
+
+	GOTO(out, it);
+
+out:
+	up(&dev->od_otable_sem);
+	return (struct dt_it *)it;
+}
+
+/*
+ * Release the object table iterator @di: stop the OI scrub, detach the
+ * iterator from its device and free it.
+ */
+static void osd_otable_it_fini(const struct lu_env *env, struct dt_it *di)
+{
+	struct osd_otable_it *it = (struct osd_otable_it *)di;
+	struct osd_device *dev = it->ooi_dev;
+
+	/* od_otable_sem: prevent concurrent init/fini */
+	down(&dev->od_otable_sem);
+	scrub_stop(&dev->od_scrub);
+	LASSERT(dev->od_otable_it == it);
+
+	dev->od_otable_it = NULL;
+	up(&dev->od_otable_sem);
+	OBD_FREE_PTR(it);
+}
+
+/* The .get method of osd_otable_ops: does nothing and returns 0. */
+static int osd_otable_it_get(const struct lu_env *env,
+			     struct dt_it *di, const struct dt_key *key)
+{
+	return 0;
+}
+
+/* The .put method of osd_otable_ops: nothing to release. */
+static void osd_otable_it_put(const struct lu_env *env, struct dt_it *di)
+{
+}
+
+/*
+ * Issue asynchronous prefetches for the dnodes ahead of the iterator.
+ *
+ * Nothing is done while at least half of OTABLE_PREFETCH objects are still
+ * accounted as prefetched; otherwise objects are prefetched, continuing
+ * from ooi_prefetched_dnode, until OTABLE_PREFETCH is reached or
+ * dmu_object_next() fails.
+ */
+static void osd_otable_it_preload(const struct lu_env *env,
+				  struct osd_otable_it *it)
+{
+	struct osd_device *dev = it->ooi_dev;
+
+	/* can go negative on the very first access to the iterator
+	 * or if some non-Lustre objects were found */
+	if (unlikely(it->ooi_prefetched < 0))
+		it->ooi_prefetched = 0;
+
+	if (it->ooi_prefetched >= (OTABLE_PREFETCH >> 1))
+		return;
+
+	/* First round: start right from the iterator's position. */
+	if (it->ooi_prefetched_dnode == 0)
+		it->ooi_prefetched_dnode = it->ooi_pos;
+
+	for (; it->ooi_prefetched < OTABLE_PREFETCH; it->ooi_prefetched++) {
+		if (dmu_object_next(dev->od_os, &it->ooi_prefetched_dnode,
+				    B_FALSE, 0) != 0)
+			break;
+
+		osd_dmu_prefetch(dev->od_os, it->ooi_prefetched_dnode,
+				 0, 0, 0, ZIO_PRIORITY_ASYNC_READ);
+	}
+}
+
+/*
+ * Wait condition of the iterator, evaluated by l_wait_event().
+ *
+ * Under os_lock, ooi_waiting is set only when the iterator has caught up
+ * with the scrub position, the scrub thread is not waiting itself and is
+ * still running; otherwise it is cleared.
+ *
+ * Returns non-zero when the iterator may proceed.
+ */
+static inline int
+osd_otable_it_wakeup(struct lustre_scrub *scrub, struct osd_otable_it *it)
+{
+	spin_lock(&scrub->os_lock);
+	if (it->ooi_pos >= scrub->os_pos_current && !scrub->os_waiting &&
+	    thread_is_running(&scrub->os_thread))
+		it->ooi_waiting = 1;
+	else
+		it->ooi_waiting = 0;
+	spin_unlock(&scrub->os_lock);
+
+	return it->ooi_waiting == 0;
+}
+
+/*
+ * Move the object table iterator to the next object that has a usable LMA
+ * and store its FID in it->ooi_fid.
+ *
+ * The iterator does not run ahead of the OI scrub: when it has caught up
+ * with scrub->os_pos_current it waits until osd_otable_it_wakeup() lets it
+ * go. Objects whose xattrs cannot be loaded, that have no LMA, or whose
+ * LMA is flagged LMAC_NOT_IN_OI / LMAI_AGENT are skipped.
+ *
+ * Returns 0 when it->ooi_fid is valid, 1 when the iteration is over (all
+ * objects visited, or the scrub thread stopped and the iterator is not
+ * used outside), or a negative errno.
+ */
+static int osd_otable_it_next(const struct lu_env *env, struct dt_it *di)
+{
+	struct osd_otable_it *it = (struct osd_otable_it *)di;
+	struct osd_device *dev = it->ooi_dev;
+	struct lustre_scrub *scrub = &dev->od_scrub;
+	struct ptlrpc_thread *thread = &scrub->os_thread;
+	struct l_wait_info lwi = { 0 };
+	struct lustre_mdt_attrs *lma = NULL;
+	nvlist_t *nvbuf = NULL;
+	bool throttled;
+	int size = 0;
+	int rc;
+	ENTRY;
+
+	LASSERT(it->ooi_user_ready);
+	fid_zero(&it->ooi_fid);
+
+	if (unlikely(it->ooi_all_cached))
+		RETURN(1);
+
+again:
+	/* Drop the xattr buffer of the object skipped in the last round. */
+	if (nvbuf) {
+		nvlist_free(nvbuf);
+		nvbuf = NULL;
+		lma = NULL;
+		size = 0;
+	}
+
+	if (it->ooi_pos >= scrub->os_pos_current)
+		l_wait_event(thread->t_ctl_waitq,
+			     osd_otable_it_wakeup(scrub, it),
+			     &lwi);
+
+	if (!thread_is_running(thread) && !it->ooi_used_outside)
+		GOTO(out, rc = 1);
+
+	rc = -dmu_object_next(dev->od_os, &it->ooi_pos, B_FALSE, 0);
+	if (rc) {
+		/* No more objects: remember it for the following calls. */
+		if (unlikely(rc == -ESRCH)) {
+			it->ooi_all_cached = 1;
+			rc = 1;
+		}
+
+		GOTO(out, rc);
+	}
+
+	rc = __osd_xattr_load_by_oid(dev, it->ooi_pos, &nvbuf);
+
+	/* Sample os_full_speed once, so that taking and releasing os_lock
+	 * stay paired even if the flag is changed in between. */
+	throttled = !scrub->os_full_speed;
+	if (throttled)
+		spin_lock(&scrub->os_lock);
+	it->ooi_prefetched--;
+	if (throttled) {
+		/* One object consumed: let a waiting scrub thread go on. */
+		if (scrub->os_waiting) {
+			scrub->os_waiting = 0;
+			wake_up_all(&thread->t_ctl_waitq);
+		}
+		spin_unlock(&scrub->os_lock);
+	}
+
+	if (rc == -ENOENT || rc == -EEXIST || rc == -ENODATA)
+		goto again;
+
+	if (rc)
+		GOTO(out, rc);
+
+	LASSERT(nvbuf != NULL);
+	rc = -nvlist_lookup_byte_array(nvbuf, XATTR_NAME_LMA,
+				       (uchar_t **)&lma, &size);
+	if (rc || size == 0)
+		/* It is either non-Lustre object or OSD internal object,
+		 * ignore it, go ahead */
+		goto again;
+
+	LASSERTF(lma != NULL, "corrupted LMA, size %d\n", size);
+	lustre_lma_swab(lma);
+	if (unlikely(lma->lma_compat & LMAC_NOT_IN_OI ||
+		     lma->lma_incompat & LMAI_AGENT))
+		goto again;
+
+	it->ooi_fid = lma->lma_self_fid;
+
+	GOTO(out, rc = 0);
+
+out:
+	if (nvbuf)
+		nvlist_free(nvbuf);
+
+	if (!rc && scrub->os_full_speed)
+		osd_otable_it_preload(env, it);
+
+	return rc;
+}
+
+/* The .key method of osd_otable_ops: always returns NULL. */
+static struct dt_key *osd_otable_it_key(const struct lu_env *env,
+					const struct dt_it *di)
+{
+	return NULL;
+}
+
+/* The .key_size method of osd_otable_ops: always sizeof(__u64). */
+static int osd_otable_it_key_size(const struct lu_env *env,
+				  const struct dt_it *di)
+{
+	return sizeof(__u64);
+}
+
+/*
+ * The .rec method of osd_otable_ops: copy the FID of the current object
+ * (it->ooi_fid) into @rec, which is treated as a struct lu_fid.
+ * Always returns 0.
+ */
+static int osd_otable_it_rec(const struct lu_env *env, const struct dt_it *di,
+			     struct dt_rec *rec, __u32 attr)
+{
+	const struct osd_otable_it *it = (const struct osd_otable_it *)di;
+
+	*(struct lu_fid *)rec = it->ooi_fid;
+
+	return 0;
+}
+
+/* The .store method of osd_otable_ops: return the current position
+ * (it->ooi_pos) of the iterator. */
+static __u64 osd_otable_it_store(const struct lu_env *env,
+				 const struct dt_it *di)
+{
+	struct osd_otable_it *it = (struct osd_otable_it *)di;
+
+	return it->ooi_pos;
+}
+
+/**
+ * Set the OSD layer iteration start position as the specified hash.
+ *
+ * @hash is clamped to OSD_OTABLE_MAX_HASH. The iterator is flagged as
+ * user-ready, the scrub thread is woken up when it does not run at full
+ * speed, and the first object is fetched with osd_otable_it_next().
+ *
+ * Returns -EPERM if the iteration has been started already, otherwise the
+ * result of osd_otable_it_next().
+ */
+static int osd_otable_it_load(const struct lu_env *env,
+			      const struct dt_it *di, __u64 hash)
+{
+	struct osd_otable_it *it = (struct osd_otable_it *)di;
+	struct lustre_scrub *scrub = &it->ooi_dev->od_scrub;
+	ENTRY;
+
+	/* Forbid to set iteration position after iteration started. */
+	if (it->ooi_user_ready)
+		RETURN(-EPERM);
+
+	if (hash > OSD_OTABLE_MAX_HASH)
+		hash = OSD_OTABLE_MAX_HASH;
+
+	/* The hash is the last checkpoint position,
+	 * we will start from the next one. */
+	it->ooi_pos = hash + 1;
+	it->ooi_prefetched_dnode = 0;
+	it->ooi_prefetched = 0;
+	it->ooi_user_ready = 1;
+	if (!scrub->os_full_speed)
+		wake_up_all(&scrub->os_thread.t_ctl_waitq);
+
+	/* Unplug OSD layer iteration by the first next() call. */
+	RETURN(osd_otable_it_next(env, (struct dt_it *)it));
+}
+
+/* The .key_rec method of osd_otable_ops: does nothing and returns 0. */
+static int osd_otable_it_key_rec(const struct lu_env *env,
+				 const struct dt_it *di, void *key_rec)
+{
+	return 0;
+}
+
+/* Index operations of the object table: only the iterator methods
+ * (dio_it) are provided. */
+const struct dt_index_operations osd_otable_ops = {
+	.dio_it = {
+		.init     = osd_otable_it_init,
+		.fini     = osd_otable_it_fini,
+		.get      = osd_otable_it_get,
+		.put	  = osd_otable_it_put,
+		.next     = osd_otable_it_next,
+		.key	  = osd_otable_it_key,
+		.key_size = osd_otable_it_key_size,
+		.rec      = osd_otable_it_rec,
+		.store    = osd_otable_it_store,
+		.load     = osd_otable_it_load,
+		.key_rec  = osd_otable_it_key_rec,
+	}
+};
+
+/* high priority inconsistent items list APIs */
+
+/*
+ * Queue the mapping @fid => @oid on the high priority list of inconsistent
+ * items of the OI scrub, and wake the scrub thread up if the list was
+ * empty before.
+ *
+ * Returns 0 on success, -ENOMEM if the item cannot be allocated, or
+ * -EAGAIN if the scrub thread is not running.
+ */
+int osd_oii_insert(const struct lu_env *env, struct osd_device *dev,
+		   const struct lu_fid *fid, uint64_t oid, bool insert)
+{
+	struct lustre_scrub *scrub = &dev->od_scrub;
+	struct ptlrpc_thread *thread = &scrub->os_thread;
+	struct osd_inconsistent_item *oii;
+	bool was_empty;
+	ENTRY;
+
+	/* The result of this call is not checked. */
+	osd_idc_find_and_init_with_oid(env, dev, fid, oid);
+	OBD_ALLOC_PTR(oii);
+	if (unlikely(oii == NULL))
+		RETURN(-ENOMEM);
+
+	INIT_LIST_HEAD(&oii->oii_list);
+	oii->oii_insert = insert;
+	oii->oii_cache.oic_dnode = oid;
+	oii->oii_cache.oic_fid = *fid;
+	oii->oii_cache.oic_dev = dev;
+
+	spin_lock(&scrub->os_lock);
+	if (likely(thread_is_running(thread))) {
+		was_empty = list_empty(&scrub->os_inconsistent_items);
+		list_add_tail(&oii->oii_list, &scrub->os_inconsistent_items);
+		spin_unlock(&scrub->os_lock);
+
+		/* Only the first item needs to kick the scrub thread. */
+		if (was_empty)
+			wake_up_all(&thread->t_ctl_waitq);
+
+		RETURN(0);
+	}
+	spin_unlock(&scrub->os_lock);
+
+	/* Nobody would consume the item. */
+	OBD_FREE_PTR(oii);
+
+	RETURN(-EAGAIN);
+}
+
+/*
+ * Search the high priority list of inconsistent items for @fid.
+ *
+ * Returns 0 and stores the dnode of the first matching item in *oid, or
+ * -ENOENT (leaving *oid untouched) if there is no such item.
+ */
+int osd_oii_lookup(struct osd_device *dev, const struct lu_fid *fid,
+		   uint64_t *oid)
+{
+	struct lustre_scrub *scrub = &dev->od_scrub;
+	struct osd_inconsistent_item *pos;
+	int rc = -ENOENT;
+	ENTRY;
+
+	spin_lock(&scrub->os_lock);
+	list_for_each_entry(pos, &scrub->os_inconsistent_items, oii_list) {
+		if (!lu_fid_eq(fid, &pos->oii_cache.oic_fid))
+			continue;
+
+		*oid = pos->oii_cache.oic_dnode;
+		rc = 0;
+		break;
+	}
+	spin_unlock(&scrub->os_lock);
+
+	RETURN(rc);
+}
diff --git a/lustre/tests/sanity-scrub.sh b/lustre/tests/sanity-scrub.sh
index aa4e7aa..88fd417 100644
--- a/lustre/tests/sanity-scrub.sh
+++ b/lustre/tests/sanity-scrub.sh
@@ -27,52 +27,25 @@ if ! check_versions; then
 	exit 0
 fi
 
-[ $(facet_fstype $SINGLEMDS) != "ldiskfs" ] &&
-	skip "ldiskfs only test" && exit 0
-
-[ $(facet_fstype ost1) != "ldiskfs" ] &&
-	skip "ldiskfs only test" && exit 0
-
-[[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.2.90) ]] &&
-	skip "Need MDS version at least 2.2.90" && exit 0
+stopall
 
 SAVED_MDSSIZE=${MDSSIZE}
 SAVED_OSTSIZE=${OSTSIZE}
 SAVED_OSTCOUNT=${OSTCOUNT}
+
 # use small MDS + OST size to speed formatting time
 # do not use too small MDSSIZE/OSTSIZE, which affect the default journal size
-# 200M MDT device can guarantee uninitialized groups during the OI scrub
-MDSSIZE=200000
-OSTSIZE=100000
-# no need too much OSTs, to reduce the format/start/stop overhead
-stopall
-[ $OSTCOUNT -gt 4 ] && OSTCOUNT=4
+# 400M MDT device can guarantee uninitialized groups during the OI scrub
+MDSSIZE=400000
+OSTSIZE=200000
 
-MOUNT_2=""
+# cap the number of OSTs to reduce the format/start/stop overhead
+[ $OSTCOUNT -gt 4 ] && OSTCOUNT=4
 
 # build up a clean test environment.
 formatall
 setupall
 
-[[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.3.90) ]] &&
-	ALWAYS_EXCEPT="$ALWAYS_EXCEPT 1a"
-
-[[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.6.50) ]] &&
-	ALWAYS_EXCEPT="$ALWAYS_EXCEPT 4"
-
-[[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.4.1) ]] &&
-	ALWAYS_EXCEPT="$ALWAYS_EXCEPT 15"
-
-[[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.4.90) ]] &&
-[[ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.4.50) ]] &&
-	ALWAYS_EXCEPT="$ALWAYS_EXCEPT 15"
-
-[[ $(lustre_version_code ost1) -lt $(version_code 2.4.50) ]] &&
-	ALWAYS_EXCEPT="$ALWAYS_EXCEPT 11 12 13 14"
-
-[[ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.5.59) ]] &&
-	SCRUB_ONLY="-t scrub"
-
 build_test_filter
 
 MDT_DEV="${FSNAME}-MDT0000"
@@ -86,7 +59,7 @@ scrub_start() {
 	# use "lfsck_start -A" when we no longer need testing interop
 	for n in $(seq $MDSCOUNT); do
 		do_facet mds$n $LCTL lfsck_start -M $(facet_svc mds$n) \
-			$SCRUB_ONLY "$@" ||
+			-t scrub "$@" ||
 			error "($error_id) Failed to start OI scrub on mds$n"
 	done
 }
@@ -105,22 +78,22 @@ scrub_stop() {
 scrub_status() {
 	local n=$1
 
-	do_facet mds$n $LCTL get_param -n \
-		osd-ldiskfs.$(facet_svc mds$n).oi_scrub
+	do_facet mds$n $LCTL get_param -n osd-*.$(facet_svc mds$n).oi_scrub
 }
 
-START_SCRUB="do_facet $SINGLEMDS $LCTL lfsck_start -M ${MDT_DEV} $SCRUB_ONLY"
-START_SCRUB_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} $SCRUB_ONLY"
+START_SCRUB="do_facet $SINGLEMDS $LCTL lfsck_start -M ${MDT_DEV} -t scrub"
+START_SCRUB_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} -t scrub"
 STOP_SCRUB="do_facet $SINGLEMDS $LCTL lfsck_stop -M ${MDT_DEV}"
 SHOW_SCRUB="do_facet $SINGLEMDS \
-		$LCTL get_param -n osd-ldiskfs.${MDT_DEV}.oi_scrub"
+		$LCTL get_param -n osd-*.${MDT_DEV}.oi_scrub"
 SHOW_SCRUB_ON_OST="do_facet ost1 \
-		$LCTL get_param -n osd-ldiskfs.${OST_DEV}.oi_scrub"
+		$LCTL get_param -n osd-*.${OST_DEV}.oi_scrub"
 MOUNT_OPTS_SCRUB="-o user_xattr"
 MOUNT_OPTS_NOSCRUB="-o user_xattr,noscrub"
 
 scrub_prep() {
 	local nfiles=$1
+	local inject=$2
 	local n
 
 	check_mount_and_prep
@@ -142,6 +115,34 @@ scrub_prep() {
 		fi
 	done
 	echo "prepared $(date)."
+
+	[ ! -z $inject ] && [ $inject -eq 2 ] && {
+		#define OBD_FAIL_OSD_NO_OI_ENTRY	0x198
+		do_nodes $(comma_list $(mdts_nodes)) \
+				$LCTL set_param fail_loc=0x198
+
+		for n in $(seq $MDSCOUNT); do
+			cp $LUSTRE/tests/runas $DIR/$tdir/mds$n ||
+				error "Fail to copy runas to MDS$n"
+		done
+
+		do_nodes $(comma_list $(mdts_nodes)) $LCTL set_param fail_loc=0
+	}
+
+	[ ! -z $inject ] && [ $inject -eq 1 ] &&
+		[ $(facet_fstype $SINGLEMDS) = "zfs" ] && {
+		#define OBD_FAIL_OSD_FID_MAPPING	0x193
+		do_nodes $(comma_list $(mdts_nodes)) \
+			$LCTL set_param fail_loc=0x193
+
+		for n in $(seq $MDSCOUNT); do
+			chmod 0400 $DIR/$tdir/mds$n/test-framework.sh
+			chmod 0400 $DIR/$tdir/mds$n/sanity-scrub.sh
+		done
+
+		do_nodes $(comma_list $(mdts_nodes)) $LCTL set_param fail_loc=0
+	}
+
 	cleanup_mount $MOUNT > /dev/null || error "Fail to stop client!"
 
 	# sync local transactions on every MDT
@@ -159,6 +160,17 @@ scrub_prep() {
 		echo "stop mds$n"
 		stop mds$n > /dev/null || error "Fail to stop MDS$n!"
 	done
+
+	[ ! -z $inject ] && [ $(facet_fstype $SINGLEMDS) = "ldiskfs" ] && {
+		if [ $inject -eq 1 ]; then
+			for n in $(seq $MDSCOUNT); do
+				mds_backup_restore mds$n ||
+					error "Backup/restore on mds$n failed"
+			done
+		elif [ $inject -eq 2 ]; then
+			scrub_remove_ois 1
+		fi
+	}
 }
 
 scrub_start_mds() {
@@ -190,7 +202,7 @@ scrub_check_status() {
 
 	for n in $(seq $MDSCOUNT); do
 		wait_update_facet mds$n "$LCTL get_param -n \
-			osd-ldiskfs.$(facet_svc mds$n).oi_scrub |
+			osd-*.$(facet_svc mds$n).oi_scrub |
 			awk '/^status/ { print \\\$2 }'" "$expected" 6 ||
 			error "($error_id) Expected '$expected' on mds$n"
 	done
@@ -204,7 +216,7 @@ scrub_check_flags() {
 
 	for n in $(seq $MDSCOUNT); do
 		actual=$(do_facet mds$n $LCTL get_param -n \
-			osd-ldiskfs.$(facet_svc mds$n).oi_scrub |
+			osd-*.$(facet_svc mds$n).oi_scrub |
 			awk '/^flags/ { print $2 }')
 		if [ "$actual" != "$expected" ]; then
 			error "($error_id) Expected '$expected' on mds$n, but" \
@@ -221,7 +233,7 @@ scrub_check_params() {
 
 	for n in $(seq $MDSCOUNT); do
 		actual=$(do_facet mds$n $LCTL get_param -n \
-			osd-ldiskfs.$(facet_svc mds$n).oi_scrub |
+			osd-*.$(facet_svc mds$n).oi_scrub |
 			awk '/^param/ { print $2 }')
 		if [ "$actual" != "$expected" ]; then
 			error "($error_id) Expected '$expected' on mds$n, but" \
@@ -240,11 +252,11 @@ scrub_check_repaired() {
 	for n in $(seq $MDSCOUNT); do
 		if [ $dryrun -eq 1 ]; then
 			actual=$(do_facet mds$n $LCTL get_param -n \
-				osd-ldiskfs.$(facet_svc mds$n).oi_scrub |
+				osd-*.$(facet_svc mds$n).oi_scrub |
 				awk '/^inconsistent:/ { print $2 }')
 		else
 			actual=$(do_facet mds$n $LCTL get_param -n \
-				osd-ldiskfs.$(facet_svc mds$n).oi_scrub |
+				osd-*.$(facet_svc mds$n).oi_scrub |
 				awk '/^updated:/ { print $2 }')
 		fi
 
@@ -284,6 +296,8 @@ scrub_check_data2() {
 }
 
 scrub_remove_ois() {
+	[ $(facet_fstype $SINGLEMDS) != "ldiskfs" ] && return
+
 	local error_id=$1
 	local index=$2
 	local n
@@ -294,40 +308,27 @@ scrub_remove_ois() {
 	done
 }
 
-scrub_backup_restore() {
-	local error_id=$1
-	local igif=$2
-	local n
-
-	for n in $(seq $MDSCOUNT); do
-		mds_backup_restore mds$n $igif ||
-			error "($error_id) Backup/restore on mds$n failed"
-	done
-}
-
 scrub_enable_auto() {
 	do_nodes $(comma_list $(mdts_nodes)) $LCTL set_param -n \
-		osd-ldiskfs.*.auto_scrub=1
+		osd-*.*.auto_scrub=1
 }
 
 full_scrub_ratio() {
-	[[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.6.50) ]] &&
-		return
+	[ $(facet_fstype $SINGLEMDS) != "ldiskfs" ] && return
 
 	local ratio=$1
 
 	do_nodes $(comma_list $(mdts_nodes)) $LCTL set_param -n \
-		osd-ldiskfs.*.full_scrub_ratio=$ratio
+		osd-*.*.full_scrub_ratio=$ratio
 }
 
 full_scrub_threshold_rate() {
-	[[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.6.50) ]] &&
-		return
+	[ $(facet_fstype $SINGLEMDS) != "ldiskfs" ] && return
 
 	local rate=$1
 
 	do_nodes $(comma_list $(mdts_nodes)) $LCTL set_param -n \
-		osd-ldiskfs.*.full_scrub_threshold_rate=$rate
+		osd-*.*.full_scrub_threshold_rate=$rate
 }
 
 test_0() {
@@ -371,17 +372,21 @@ test_1a() {
 run_test 1a "Auto trigger initial OI scrub when server mounts"
 
 test_1b() {
-	scrub_prep 0
-	scrub_remove_ois 1
+	scrub_prep 0 2
 	echo "start MDTs without disabling OI scrub"
 	scrub_start_mds 2 "$MOUNT_OPTS_SCRUB"
-	scrub_check_status 3 completed
+	[ $(facet_fstype $SINGLEMDS) != "ldiskfs" ] ||
+		scrub_check_status 3 completed
 	mount_client $MOUNT || error "(4) Fail to start client!"
-	scrub_check_data 5
+	scrub_check_data2 runas 5
+	scrub_check_status 6 completed
 }
 run_test 1b "Trigger OI scrub when MDT mounts for OI files remove/recreate case"
 
 test_1c() {
+	[ $(facet_fstype $SINGLEMDS) != "ldiskfs" ] &&
+		skip "ldiskfs special test" && return
+
 	local index
 
 	# OI files to be removed:
@@ -402,8 +407,10 @@ test_1c() {
 run_test 1c "Auto detect kinds of OI file(s) removed/recreated cases"
 
 test_2() {
-	scrub_prep 0
-	scrub_backup_restore 1
+	[ $(facet_fstype $SINGLEMDS) != "ldiskfs" ] &&
+		skip "ldiskfs special test" && return
+
+	scrub_prep 0 1
 	echo "starting MDTs without disabling OI scrub"
 	scrub_start_mds 2 "$MOUNT_OPTS_SCRUB"
 	scrub_check_status 3 completed
@@ -417,21 +424,21 @@ test_3() {
 	formatall > /dev/null
 	setupall > /dev/null
 
-	scrub_prep 0
-	scrub_backup_restore 1
+	scrub_prep 0 1
 	echo "starting MDTs with OI scrub disabled"
 	scrub_start_mds 2 "$MOUNT_OPTS_NOSCRUB"
 	scrub_check_status 3 init
-	scrub_check_flags 4 recreated,inconsistent
+	[ $(facet_fstype $SINGLEMDS) != "ldiskfs" ] ||
+		scrub_check_flags 4 recreated,inconsistent
 }
 #run_test 3 "Do not trigger OI scrub when MDT mounts if 'noscrub' specified"
 
 test_4a() {
-	scrub_prep 0
-	scrub_backup_restore 1
+	scrub_prep 0 1
 	echo "starting MDTs with OI scrub disabled"
 	scrub_start_mds 2 "$MOUNT_OPTS_NOSCRUB"
-	scrub_check_flags 4 recreated,inconsistent
+	[ $(facet_fstype $SINGLEMDS) != "ldiskfs" ] ||
+		scrub_check_flags 4 recreated,inconsistent
 	mount_client $MOUNT || error "(5) Fail to start client!"
 	scrub_enable_auto
 	full_scrub_ratio 0
@@ -461,8 +468,10 @@ test_4a() {
 run_test 4a "Auto trigger OI scrub if bad OI mapping was found (1)"
 
 test_4b() {
-	scrub_prep 5
-	scrub_backup_restore 1
+	[ $(facet_fstype $SINGLEMDS) != "ldiskfs" ] &&
+		skip "ldiskfs special test" && return
+
+	scrub_prep 5 1
 	echo "starting MDTs with OI scrub disabled"
 	scrub_start_mds 2 "$MOUNT_OPTS_NOSCRUB"
 	scrub_check_flags 4 recreated,inconsistent
@@ -483,7 +492,7 @@ test_4b() {
 
 		echo "OI scrub on MDS$n status for the 1st time:"
 		do_facet mds$n $LCTL get_param -n \
-			osd-ldiskfs.$(facet_svc mds$n).oi_scrub
+			osd-*.$(facet_svc mds$n).oi_scrub
 	done
 
 	scrub_check_data2 sanity-scrub.sh 9
@@ -499,7 +508,7 @@ test_4b() {
 
 		echo "OI scrub on MDS$n status for the 2nd time:"
 		do_facet mds$n $LCTL get_param -n \
-			osd-ldiskfs.$(facet_svc mds$n).oi_scrub
+			osd-*.$(facet_svc mds$n).oi_scrub
 
 		[ ${updated0[$n]} -lt ${updated1[$n]} ] ||
 			error "(12) Auto trigger full scrub unexpectedly"
@@ -520,7 +529,7 @@ test_4b() {
 
 		echo "OI scrub on MDS$n status for the 3rd time:"
 		do_facet mds$n $LCTL get_param -n \
-			osd-ldiskfs.$(facet_svc mds$n).oi_scrub
+			osd-*.$(facet_svc mds$n).oi_scrub
 
 		[ ${updated0[$n]} -gt ${updated1[$n]} ] ||
 			error "(16) Auto trigger full scrub unexpectedly"
@@ -537,7 +546,7 @@ test_4b() {
 		[ ${updated0[$n]} -eq ${updated1[$n]} ] || {
 			echo "OI scrub on MDS$n status for the 4th time:"
 			do_facet mds$n $LCTL get_param -n \
-				osd-ldiskfs.$(facet_svc mds$n).oi_scrub
+				osd-*.$(facet_svc mds$n).oi_scrub
 
 			error "(18) NOT auto trigger full scrub as expected"
 		}
@@ -546,8 +555,10 @@ test_4b() {
 run_test 4b "Auto trigger OI scrub if bad OI mapping was found (2)"
 
 test_4c() {
-	scrub_prep 500
-	scrub_backup_restore 1
+	[ $(facet_fstype $SINGLEMDS) != "ldiskfs" ] &&
+		skip "ldiskfs special test" && return
+
+	scrub_prep 500 1
 	echo "starting MDTs with OI scrub disabled"
 	scrub_start_mds 2 "$MOUNT_OPTS_NOSCRUB"
 	scrub_check_flags 4 recreated,inconsistent
@@ -568,7 +579,7 @@ test_4c() {
 
 		echo "OI scrub on MDS$n status for the 1st time:"
 		do_facet mds$n $LCTL get_param -n \
-			osd-ldiskfs.$(facet_svc mds$n).oi_scrub
+			osd-*.$(facet_svc mds$n).oi_scrub
 	done
 
 	scrub_check_data2 sanity-scrub.sh 9
@@ -584,7 +595,7 @@ test_4c() {
 
 		echo "OI scrub on MDS$n status for the 2nd time:"
 		do_facet mds$n $LCTL get_param -n \
-			osd-ldiskfs.$(facet_svc mds$n).oi_scrub
+			osd-*.$(facet_svc mds$n).oi_scrub
 
 		[ ${updated0[$n]} -lt ${updated1[$n]} ] ||
 			error "(12) Auto trigger full scrub unexpectedly"
@@ -605,7 +616,7 @@ test_4c() {
 
 		echo "OI scrub on MDS$n status for the 3rd time:"
 		do_facet mds$n $LCTL get_param -n \
-			osd-ldiskfs.$(facet_svc mds$n).oi_scrub
+			osd-*.$(facet_svc mds$n).oi_scrub
 
 		[ ${updated0[$n]} -gt ${updated1[$n]} ] ||
 			error "(16) Auto trigger full scrub unexpectedly"
@@ -622,7 +633,7 @@ test_4c() {
 		[ ${updated0[$n]} -eq ${updated1[$n]} ] || {
 			echo "OI scrub on MDS$n status for the 4th time:"
 			do_facet mds$n $LCTL get_param -n \
-				osd-ldiskfs.$(facet_svc mds$n).oi_scrub
+				osd-*.$(facet_svc mds$n).oi_scrub
 
 			error "(18) NOT auto trigger full scrub as expected"
 		}
@@ -634,12 +645,12 @@ test_5() {
 	formatall > /dev/null
 	setupall > /dev/null
 
-	scrub_prep 1000
-	scrub_backup_restore 1
+	scrub_prep 100 1
 	echo "starting MDTs with OI scrub disabled (1)"
 	scrub_start_mds 2 "$MOUNT_OPTS_NOSCRUB"
 	scrub_check_status 3 init
-	scrub_check_flags 4 recreated,inconsistent
+	[ $(facet_fstype $SINGLEMDS) != "ldiskfs" ] ||
+		scrub_check_flags 4 recreated,inconsistent
 	mount_client $MOUNT || error "(5) Fail to start client!"
 	scrub_enable_auto
 	full_scrub_ratio 0
@@ -688,12 +699,13 @@ test_5() {
 	declare -a pids
 
 	for n in $(seq $MDSCOUNT); do
-		stat $DIR/$tdir/mds$n/${tfile}800 &
+		stat $DIR/$tdir/mds$n/sanity-scrub.sh &
 		pids[$n]=$!
 	done
 
 	for n in $(seq $MDSCOUNT); do
-		wait ${pids[$n]} || error "(18) Fail to stat mds$n/${tfile}800"
+		wait ${pids[$n]} ||
+			error "(18) Fail to stat mds$n/sanity-scrub.sh"
 	done
 
 	scrub_check_status 19 completed
@@ -702,11 +714,11 @@ test_5() {
 run_test 5 "OI scrub state machine"
 
 test_6() {
-	scrub_prep 1000
-	scrub_backup_restore 1
+	scrub_prep 100 1
 	echo "starting MDTs with OI scrub disabled"
 	scrub_start_mds 2 "$MOUNT_OPTS_NOSCRUB"
-	scrub_check_flags 4 recreated,inconsistent
+	[ $(facet_fstype $SINGLEMDS) != "ldiskfs" ] ||
+		scrub_check_flags 4 recreated,inconsistent
 	mount_client $MOUNT || error "(5) Fail to start client!"
 	scrub_enable_auto
 	full_scrub_ratio 0
@@ -732,8 +744,8 @@ test_6() {
 	local n
 	for n in $(seq $MDSCOUNT); do
 		# stat will re-trigger OI scrub
-		stat $DIR/$tdir/mds$n/${tfile}800 ||
-			error "(8) Failed to stat mds$n/${tfile}800"
+		stat $DIR/$tdir/mds$n/sanity-scrub.sh ||
+			error "(8) Failed to stat mds$n/sanity-scrub.sh"
 	done
 
 	umount_client $MOUNT || error "(9) Fail to stop client!"
@@ -780,11 +792,11 @@ test_6() {
 run_test 6 "OI scrub resumes from last checkpoint"
 
 test_7() {
-	scrub_prep 500
-	scrub_backup_restore 1
+	scrub_prep 500 1
 	echo "starting MDTs with OI scrub disabled"
 	scrub_start_mds 2 "$MOUNT_OPTS_NOSCRUB"
-	scrub_check_flags 4 recreated,inconsistent
+	[ $(facet_fstype $SINGLEMDS) != "ldiskfs" ] ||
+		scrub_check_flags 4 recreated,inconsistent
 	mount_client $MOUNT || error "(5) Fail to start client!"
 	scrub_enable_auto
 	full_scrub_ratio 0
@@ -802,7 +814,11 @@ test_7() {
 	done
 
 	scrub_check_status 8 scanning
-	scrub_check_flags 9 recreated,inconsistent,auto
+	if [ $(facet_fstype $SINGLEMDS) != "ldiskfs" ]; then
+		scrub_check_flags 9 inconsistent,auto
+	else
+		scrub_check_flags 9 recreated,inconsistent,auto
+	fi
 
 	do_nodes $(comma_list $(mdts_nodes)) \
 		$LCTL set_param fail_loc=0 fail_val=0
@@ -813,11 +829,11 @@ test_7() {
 run_test 7 "System is available during OI scrub scanning"
 
 test_8() {
-	scrub_prep 128
-	scrub_backup_restore 1
+	scrub_prep 128 1
 	echo "starting MDTs with OI scrub disabled"
 	scrub_start_mds 2 "$MOUNT_OPTS_NOSCRUB"
-	scrub_check_flags 4 recreated,inconsistent
+	[ $(facet_fstype $SINGLEMDS) != "ldiskfs" ] ||
+		scrub_check_flags 4 recreated,inconsistent
 
 	#define OBD_FAIL_OSD_SCRUB_DELAY	 0x190
 	do_nodes $(comma_list $(mdts_nodes)) \
@@ -839,13 +855,16 @@ test_8() {
 run_test 8 "Control OI scrub manually"
 
 test_9() {
+	# Skip the scrub speed test on ZFS because its performance is unstable
+	[ $(facet_fstype $SINGLEMDS) != "ldiskfs" ] &&
+		skip "test scrub speed only on ldiskfs" && return
+
 	if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
 		skip "Testing on UP system, the speed may be inaccurate."
 		return 0
 	fi
 
-	scrub_prep 6000
-	scrub_backup_restore 1
+	scrub_prep 6000 1
 
 	echo "starting MDTs with OI scrub disabled"
 	scrub_start_mds 2 "$MOUNT_OPTS_NOSCRUB"
@@ -919,11 +938,11 @@ test_9() {
 run_test 9 "OI scrub speed control"
 
 test_10a() {
-	scrub_prep 0
-	scrub_backup_restore 1
+	scrub_prep 0 1
 	echo "starting mds$n with OI scrub disabled (1)"
 	scrub_start_mds 2 "$MOUNT_OPTS_NOSCRUB"
-	scrub_check_flags 4 recreated,inconsistent
+	[ $(facet_fstype $SINGLEMDS) != "ldiskfs" ] ||
+		scrub_check_flags 4 recreated,inconsistent
 	mount_client $MOUNT || error "(5) Fail to start client!"
 	scrub_enable_auto
 	full_scrub_ratio 0
@@ -954,11 +973,11 @@ run_test 10a "non-stopped OI scrub should auto restarts after MDS remount (1)"
 
 # test_10b is obsolete, it will be coverded by related sanity-lfsck tests.
 test_10b() {
-	scrub_prep 0
-	scrub_backup_restore 1
+	scrub_prep 0 1
 	echo "starting MDTs with OI scrub disabled"
 	scrub_start_mds 2 "$MOUNT_OPTS_NOSCRUB"
-	scrub_check_flags 4 recreated,inconsistent
+	[ $(facet_fstype $SINGLEMDS) != "ldiskfs" ] ||
+		scrub_check_flags 4 recreated,inconsistent
 
 	#define OBD_FAIL_OSD_SCRUB_DELAY	 0x190
 	do_nodes $(comma_list $(mdts_nodes)) \
@@ -984,6 +1003,9 @@ test_10b() {
 #run_test 10b "non-stopped OI scrub should auto restarts after MDS remount (2)"
 
 test_11() {
+	[ $(facet_fstype $SINGLEMDS) != "ldiskfs" ] &&
+		skip "ldiskfs special test" && return
+
 	local CREATED=100
 	local n
 
@@ -1061,7 +1083,7 @@ test_12() {
 
 	do_facet ost1 $LCTL set_param fail_loc=0
 	wait_update_facet ost1 "$LCTL get_param -n \
-		osd-ldiskfs.$(facet_svc ost1).oi_scrub |
+		osd-*.$(facet_svc ost1).oi_scrub |
 		awk '/^status/ { print \\\$2 }'" "completed" 6 ||
 		error "(7) Expected '$expected' on ost1"
 
@@ -1097,7 +1119,7 @@ test_13() {
 	$START_SCRUB_ON_OST -r || error "(6) Fail to start OI scrub on OST!"
 
 	wait_update_facet ost1 "$LCTL get_param -n \
-		osd-ldiskfs.$(facet_svc ost1).oi_scrub |
+		osd-*.$(facet_svc ost1).oi_scrub |
 		awk '/^status/ { print \\\$2 }'" "completed" 6 ||
 		error "(7) Expected '$expected' on ost1"
 
@@ -1106,6 +1128,9 @@ test_13() {
 run_test 13 "OI scrub can rebuild missed /O entries"
 
 test_14() {
+	[ $(facet_fstype $SINGLEMDS) != "ldiskfs" ] &&
+		skip "ldiskfs special test" && return
+
 	check_mount_and_prep
 	$SETSTRIPE -c 1 -i 0 $DIR/$tdir
 
@@ -1139,57 +1164,51 @@ test_14() {
 run_test 14 "OI scrub can repair objects under lost+found"
 
 test_15() {
-	local server_version=$(lustre_version_code $SINGLEMDS)
-	scrub_prep 20
-	scrub_backup_restore 1
+	local repaired
+
+	formatall > /dev/null
+	setupall > /dev/null
+
+	scrub_prep 20 1
 	echo "starting MDTs with OI scrub disabled"
 	scrub_start_mds 2 "$MOUNT_OPTS_NOSCRUB"
 	scrub_check_status 3 init
-	scrub_check_flags 4 recreated,inconsistent
+	[ $(facet_fstype $SINGLEMDS) != "ldiskfs" ] ||
+		scrub_check_flags 4 recreated,inconsistent
 
 	# run under dryrun mode
-	if [ $server_version -lt $(version_code 2.5.58) ]; then
-		scrub_start 5 --dryrun on
+	scrub_start 5 --dryrun
+	scrub_check_status 6 completed
+	if [ $(facet_fstype $SINGLEMDS) != "ldiskfs" ]; then
+		scrub_check_flags 7 inconsistent
+		repaired=2
 	else
-		scrub_start 5 --dryrun
+		scrub_check_flags 7 recreated,inconsistent
+		repaired=20
 	fi
-	scrub_check_status 6 completed
-	scrub_check_flags 7 recreated,inconsistent
 	scrub_check_params 8 dryrun
-	scrub_check_repaired 9 20 1
+	scrub_check_repaired 9 $repaired 1
 
 	# run under dryrun mode again
-	if [ $server_version -lt $(version_code 2.5.58) ]; then
-		scrub_start 10 --dryrun on
+	scrub_start 10 --dryrun
+	scrub_check_status 11 completed
+	if [ $(facet_fstype $SINGLEMDS) != "ldiskfs" ]; then
+		scrub_check_flags 12 inconsistent
 	else
-		scrub_start 10 --dryrun
+		scrub_check_flags 12 recreated,inconsistent
 	fi
-	scrub_check_status 11 completed
-	scrub_check_flags 12 recreated,inconsistent
 	scrub_check_params 13 dryrun
-	scrub_check_repaired 14 20 1
+	scrub_check_repaired 14 $repaired 1
 
 	# run under normal mode
-	#
-	# Lustre-2.x (x <= 5) used "-n off" to disable dryrun which does not
-	# work under Lustre-2.y (y >= 6), the test script should be fixed as
-	# "-noff" or "--dryrun=off" or nothing by default.
-	if [ $server_version -lt $(version_code 2.5.58) ]; then
-		scrub_start 15 --dryrun off
-	else
-		scrub_start 15
-	fi
+	scrub_start 15
 	scrub_check_status 16 completed
 	scrub_check_flags 17 ""
 	scrub_check_params 18 ""
-	scrub_check_repaired 19 20 0
+	scrub_check_repaired 19 $repaired 0
 
 	# run under normal mode again
-	if [ $server_version -lt $(version_code 2.5.58) ]; then
-		scrub_start 20 --dryrun off
-	else
-		scrub_start 20
-	fi
+	scrub_start 20
 	scrub_check_status 21 completed
 	scrub_check_flags 22 ""
 	scrub_check_params 23 ""
-- 
1.8.3.1