#define OBD_FAIL_OSD_COMPAT_INVALID_ENTRY 0x195
#define OBD_FAIL_OSD_COMPAT_NO_ENTRY 0x196
#define OBD_FAIL_OSD_OST_EA_FID_SET 0x197
+#define OBD_FAIL_OSD_NO_OI_ENTRY 0x198
#define OBD_FAIL_OST 0x200
#define OBD_FAIL_OST_CONNECT_NET 0x201
if (!lfsck->li_current_oit_processed && !init)
pos->lp_oit_cookie--;
- LASSERT(pos->lp_oit_cookie > 0);
+ if (unlikely(pos->lp_oit_cookie == 0))
+ pos->lp_oit_cookie = 1;
if (lfsck->li_di_dir != NULL) {
struct dt_object *dto = lfsck->li_obj_dir;
err_ops:
lu_site_purge(env, mgs2lu_dev(mgs)->ld_site, ~0);
if (!cfs_hash_is_empty(mgs2lu_dev(mgs)->ld_site->ls_obj_hash)) {
- LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_ERROR, NULL);
+ LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_OTHER, NULL);
lu_site_print(env, mgs2lu_dev(mgs)->ld_site, &msgdata,
lu_cdebug_printer);
}
lu_site_purge(env, d->ld_site, ~0);
if (!cfs_hash_is_empty(d->ld_site->ls_obj_hash)) {
- LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_ERROR, NULL);
+ LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_OTHER, NULL);
lu_site_print(env, d->ld_site, &msgdata, lu_cdebug_printer);
}
lu_site_purge(env, top->ld_site, ~0);
if (!cfs_hash_is_empty(top->ld_site->ls_obj_hash)) {
- LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_ERROR, NULL);
+ LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_OTHER, NULL);
lu_site_print(env, top->ld_site, &msgdata, lu_cdebug_printer);
}
obj->oo_dt.do_body_ops = &osd_body_ops;
}
- if (result == 0)
+ if (!result && !CFS_FAIL_CHECK(OBD_FAIL_OSD_NO_OI_ENTRY))
result = __osd_oi_insert(env, obj, fid, th);
/* a small optimization - dt_insert() isn't usually applied
MODULES := osd_zfs
osd_zfs-objs := osd_handler.o osd_lproc.o osd_quota.o
osd_zfs-objs += osd_object.o osd_io.o osd_oi.o osd_xattr.o osd_index.o
+osd_zfs-objs += osd_scrub.o
EXTRA_PRE_CFLAGS += -include @SPL_OBJ@/spl_config.h
EXTRA_PRE_CFLAGS += -include @ZFS_OBJ@/zfs_config.h
if (rc >= sizeof(o->od_svname))
RETURN(-E2BIG);
+ o->od_index = -1; /* -1 means index is invalid */
+ rc = server_name2index(o->od_svname, &o->od_index, NULL);
str = strstr(str, ":");
if (str) {
unsigned long flags;
LCONSOLE_WARN("%s: set dev_rdonly on this device\n",
svname);
}
+
+ if (flags & LMD_FLG_NOSCRUB)
+ o->od_auto_scrub_interval = AS_NEVER;
}
if (server_name_is_ost(o->od_svname))
}
#endif
- /* 1. initialize oi before any file create or file open */
- rc = osd_oi_init(env, o);
- if (rc)
- GOTO(err, rc);
-
rc = lu_site_init(&o->od_site, osd2lu_dev(o));
if (rc)
GOTO(err, rc);
if (rc)
GOTO(err, rc);
+ o->od_in_init = 1;
+ rc = osd_scrub_setup(env, o);
+ o->od_in_init = 0;
+ if (rc)
+ GOTO(err, rc);
+
rc = osd_procfs_init(o, o->od_svname);
if (rc)
GOTO(err, rc);
l->ld_ops = &osd_lu_ops;
o->od_dt_dev.dd_ops = &osd_dt_ops;
+ sema_init(&o->od_otable_sem, 1);
+ INIT_LIST_HEAD(&o->od_ios_list);
+ o->od_auto_scrub_interval = AS_DEFAULT;
out:
RETURN(rc);
/* now with all the callbacks completed we can cleanup the remainings */
osd_shutdown(env, o);
- osd_oi_fini(env, o);
+ osd_scrub_cleanup(env, o);
rc = osd_procfs_fini(o);
if (rc) {
lu_kmem_fini(osd_caches);
}
-extern unsigned int osd_oi_count;
module_param(osd_oi_count, int, 0444);
MODULE_PARM_DESC(osd_oi_count, "Number of Object Index containers to be created, it's only valid for new filesystem.");
*/
static int osd_find_parent_by_dnode(const struct lu_env *env,
struct dt_object *o,
- struct lu_fid *fid)
+ struct lu_fid *fid, uint64_t *oid)
{
struct osd_object *obj = osd_dt_obj(o);
struct osd_device *osd = osd_obj2dev(obj);
if (rc != 0)
RETURN(rc);
rc = -sa_lookup(obj->oo_sa_hdl, SA_ZPL_PARENT(osd), &dnode, 8);
- if (rc == 0)
+ if (!rc) {
+ if (oid)
+ *oid = dnode;
rc = osd_get_fid_by_oid(env, osd, dnode, fid);
+ }
RETURN(rc);
}
static int osd_find_parent_fid(const struct lu_env *env, struct dt_object *o,
- struct lu_fid *fid)
+ struct lu_fid *fid, uint64_t *oid)
{
struct link_ea_header *leh;
struct link_ea_entry *lee;
if (rc == 0) {
struct lu_fid fid2;
int rc2;
- rc2 = osd_find_parent_by_dnode(env, o, &fid2);
+ rc2 = osd_find_parent_by_dnode(env, o, &fid2, oid);
if (rc2 == 0)
if (lu_fid_eq(fid, &fid2) == 0)
CERROR("wrong parent: "DFID" != "DFID"\n",
/* no LinkEA is found, let's try to find the fid in parent's LMA */
if (unlikely(rc != 0))
- rc = osd_find_parent_by_dnode(env, o, fid);
+ rc = osd_find_parent_by_dnode(env, o, fid, oid);
RETURN(rc);
}
+/*
+ * When lookup item under striped directory, we need to locate the master
+ * MDT-object of the striped directory firstly, then the client will send
+ * lookup (getattr_by_name) RPC to the MDT with some slave MDT-object's FID
+ * and the item's name. If the system is restored from MDT file level backup,
+ * then before the OI scrub completely built the OI files, the OI mappings of
+ * the master MDT-object and slave MDT-object may be invalid. Usually, it is
+ * not a problem for the master MDT-object. Because when locate the master
+ * MDT-object, we will do name based lookup (for the striped directory itself)
+ * firstly, during such process we can setup the correct OI mapping for the
+ * master MDT-object. But it will be trouble for the slave MDT-object. Because
+ * the client will not trigger name based lookup on the MDT to locate the slave
+ * MDT-object before locating item under the striped directory, then when
+ * osd_fid_lookup(), it will find that the OI mapping for the slave MDT-object
+ * is invalid and does not know what the right OI mapping is, then the MDT has
+ * to return -EINPROGRESS to the client to notify that the OI scrub is rebuilding
+ * the OI file, related OI mapping is unknown yet, please try again later. And
+ * then client will re-try the RPC again and again until related OI mapping has
+ * been updated. That is quite inefficient.
+ *
+ * To resolve above trouble, we will handle it as the following two cases:
+ *
+ * 1) The slave MDT-object and the master MDT-object are on different MDTs.
+ * It is relatively easy. Being one of the remote MDT-objects, the slave MDT-object
+ * is linked under /REMOTE_PARENT_DIR with the name of its FID string.
+ * We can locate the slave MDT-object via lookup the /REMOTE_PARENT_DIR
+ * directly. Please check osd_fid_lookup().
+ *
+ * 2) The slave MDT-object and the master MDT-object reside on the same MDT.
+ * Under such case, during lookup the master MDT-object, we will lookup the
+ * slave MDT-object via readdir against the master MDT-object, because the
+ * slave MDT-objects information are stored as sub-directories with the name
+ * "${FID}:${index}". Then when find the local slave MDT-object, its OI
+ * mapping will be recorded. Then subsequent osd_fid_lookup() will know
+ * the correct OI mapping for the slave MDT-object.
+ */
+static int osd_check_lmv(const struct lu_env *env, struct osd_device *osd,
+ uint64_t oid, const struct lu_fid *fid)
+{
+ struct osd_thread_info *info = osd_oti_get(env);
+ struct luz_direntry *zde = &info->oti_zde;
+ zap_attribute_t *za = &info->oti_za;
+ zap_cursor_t *zc = &info->oti_zc;
+ struct lu_fid *tfid = &info->oti_fid;
+ nvlist_t *nvbuf = NULL;
+ struct lmv_mds_md_v1 *lmv = NULL;
+ int size;
+ int rc;
+ ENTRY;
+
+ /* Load the xattrs of @oid; if it has none (or is going away),
+ * there is nothing to check here. */
+ rc = __osd_xattr_load_by_oid(osd, oid, &nvbuf);
+ if (rc == -ENOENT || rc == -EEXIST || rc == -ENODATA)
+ RETURN(0);
+
+ if (rc)
+ RETURN(rc);
+
+ /* Only striped directories carry the LMV xattr; plain directories
+ * are not our concern. */
+ rc = -nvlist_lookup_byte_array(nvbuf, XATTR_NAME_LMV,
+ (uchar_t **)&lmv, &size);
+ if (rc == -ENOENT || rc == -EEXIST || rc == -ENODATA)
+ GOTO(out_nvbuf, rc = 0);
+
+ /* Note: rc == 0 with a non-V1 magic also exits here (rc stays 0). */
+ if (rc || le32_to_cpu(lmv->lmv_magic) != LMV_MAGIC_V1)
+ GOTO(out_nvbuf, rc);
+
+ zap_cursor_init_serialized(zc, osd->od_os, oid, 0);
+ rc = -zap_cursor_retrieve(zc, za);
+ /* -ENOENT on the very first retrieve means serialized position 0
+ * holds no entry; step the cursor to the first real entry. Any
+ * other error aborts the check. */
+ if (rc == -ENOENT) {
+ zap_cursor_advance(zc);
+ } else if (rc) {
+ CERROR("%s: fail to init for check LMV "DFID"(%llu): rc = %d\n",
+ osd_name(osd), PFID(fid), oid, rc);
+ GOTO(out_zc, rc);
+ }
+
+ /* Walk the directory entries looking for a local slave MDT-object,
+ * whose entry name encodes its FID (see the comment above). */
+ while (1) {
+ rc = -zap_cursor_retrieve(zc, za);
+ if (rc == -ENOENT)
+ GOTO(out_zc, rc = 0);
+
+ if (rc) {
+ CERROR("%s: fail to locate next for check LMV "
+ DFID"(%llu): rc = %d\n",
+ osd_name(osd), PFID(fid), oid, rc);
+ GOTO(out_zc, rc);
+ }
+
+ /* Parse the FID out of the entry name. NOTE(review): the
+ * first byte of za_name is skipped before parsing -- confirm
+ * against the on-disk "${FID}:${index}" name format. */
+ fid_zero(tfid);
+ sscanf(za->za_name + 1, SFID, RFID(tfid));
+ if (fid_is_sane(tfid) && !osd_remote_fid(env, osd, tfid)) {
+ rc = osd_zap_lookup(osd, oid, NULL, za->za_name,
+ za->za_integer_length,
+ sizeof(*zde) / za->za_integer_length,
+ (void *)zde);
+ if (rc) {
+ CERROR("%s: fail to lookup for check LMV "
+ DFID"(%llu): rc = %d\n",
+ osd_name(osd), PFID(fid), oid, rc);
+ GOTO(out_zc, rc);
+ }
+
+ /* Queue the slave's FID <=> dnode mapping for the
+ * OI scrub, then stop after the first local slave. */
+ rc = osd_oii_insert(env, osd, tfid,
+ zde->lzd_reg.zde_dnode, false);
+ GOTO(out_zc, rc);
+ }
+
+ zap_cursor_advance(zc);
+ }
+
+out_zc:
+ zap_cursor_fini(zc);
+out_nvbuf:
+ nvlist_free(nvbuf);
+
+ return rc;
+}
+
+/* Verify that the OI file maps @fid to @oid (the dnode just found via a
+ * name based lookup). If the mapping is missing or stale, either queue
+ * the correct mapping for the running OI scrub, or trigger an automatic
+ * OI scrub (rate limited). Returns 0 when consistent or queued; negative
+ * errno on failure. */
+static int
+osd_consistency_check(const struct lu_env *env, struct osd_device *osd,
+ struct osd_object *obj, const struct lu_fid *fid,
+ uint64_t oid, bool is_dir)
+{
+ struct lustre_scrub *scrub = &osd->od_scrub;
+ dnode_t *dn = NULL;
+ uint64_t oid2;
+ int once = 0;
+ bool insert;
+ int rc;
+ ENTRY;
+
+ /* Only normal and IGIF FIDs are checked against the OI files. */
+ if (!fid_is_norm(fid) && !fid_is_igif(fid))
+ RETURN(0);
+
+ /* oid == ZFS_NO_OBJECT must be for lookup ".." case */
+ if (oid == ZFS_NO_OBJECT) {
+ rc = osd_sa_handle_get(obj);
+ if (rc)
+ RETURN(rc);
+
+ rc = -sa_lookup(obj->oo_sa_hdl, SA_ZPL_PARENT(osd), &oid, 8);
+ if (rc)
+ RETURN(rc);
+ }
+
+ if (thread_is_running(&scrub->os_thread)) {
+ /* The running scrub has already passed this dnode, so its
+ * OI mapping can be trusted. */
+ if (scrub->os_pos_current > oid)
+ RETURN(0);
+ } else if (osd->od_auto_scrub_interval == AS_NEVER) {
+ /* Automatic OI scrub is disabled. */
+ RETURN(0);
+ } else {
+ /* Rate-limit automatic triggering after a completed scrub. */
+ if (cfs_time_before(cfs_time_current_sec(),
+ scrub->os_file.sf_time_last_complete +
+ osd->od_auto_scrub_interval))
+ RETURN(0);
+ }
+
+again:
+ rc = osd_fid_lookup(env, osd, fid, &oid2);
+ if (rc == -ENOENT) {
+ /* No OI mapping at all: it needs to be inserted. */
+ insert = true;
+ if (dn)
+ goto trigger;
+
+ rc = __osd_obj2dnode(osd->od_os, oid, &dn);
+ /* The object has been removed (by race maybe). */
+ if (rc)
+ RETURN(rc = (rc == -EEXIST ? -ENOENT : rc));
+
+ goto trigger;
+ } else if (rc || oid == oid2) {
+ /* Lookup failure, or the OI mapping already matches: done. */
+ GOTO(out, rc);
+ }
+
+ /* A mapping exists but points at another dnode: update it. */
+ insert = false;
+
+trigger:
+ if (thread_is_running(&scrub->os_thread)) {
+ if (!dn) {
+ rc = __osd_obj2dnode(osd->od_os, oid, &dn);
+ /* The object has been removed (by race maybe). */
+ if (rc)
+ RETURN(rc = (rc == -EEXIST ? -ENOENT : rc));
+ }
+
+ rc = osd_oii_insert(env, osd, fid, oid, insert);
+ /* There is race condition between osd_oi_lookup and OI scrub.
+ * The OI scrub finished just after osd_oi_lookup() failure.
+ * Under such case, it is unnecessary to trigger OI scrub again,
+ * but try to call osd_oi_lookup() again. */
+ if (unlikely(rc == -EAGAIN))
+ goto again;
+
+ /* For a striped directory, also queue the OI mappings of its
+ * local slave MDT-objects. */
+ if (is_dir)
+ rc = osd_check_lmv(env, osd, oid, fid);
+ else
+ rc = 0;
+
+ GOTO(out, rc);
+ }
+
+ /* No scrub running: start one ourselves, but only try once. */
+ if (osd->od_auto_scrub_interval != AS_NEVER && ++once == 1) {
+ rc = osd_scrub_start(env, osd, SS_AUTO_FULL |
+ SS_CLEAR_DRYRUN | SS_CLEAR_FAILOUT);
+ CDEBUG(D_LFSCK | D_CONSOLE | D_WARNING,
+ "%s: trigger partial OI scrub for RPC inconsistency "
+ "checking FID "DFID": rc = %d\n",
+ osd_name(osd), PFID(fid), rc);
+ if (!rc)
+ goto again;
+ }
+
+ GOTO(out, rc);
+
+out:
+ if (dn)
+ osd_dnode_rele(dn);
+
+ return rc;
+}
+
static int osd_dir_lookup(const struct lu_env *env, struct dt_object *dt,
struct dt_rec *rec, const struct dt_key *key)
{
struct osd_thread_info *oti = osd_oti_get(env);
- struct osd_object *obj = osd_dt_obj(dt);
- struct osd_device *osd = osd_obj2dev(obj);
- char *name = (char *)key;
- int rc;
+ struct osd_object *obj = osd_dt_obj(dt);
+ struct osd_device *osd = osd_obj2dev(obj);
+ struct lu_fid *fid = (struct lu_fid *)rec;
+ char *name = (char *)key;
+ uint64_t oid = ZFS_NO_OBJECT;
+ int rc;
ENTRY;
if (name[0] == '.') {
memcpy(rec, f, sizeof(*f));
RETURN(1);
} else if (name[1] == '.' && name[2] == 0) {
- rc = osd_find_parent_fid(env, dt, (struct lu_fid *)rec);
- RETURN(rc == 0 ? 1 : rc);
+ rc = osd_find_parent_fid(env, dt, fid, &oid);
+ GOTO(out, rc);
}
}
if (rc != 0)
RETURN(rc);
+ oid = oti->oti_zde.lzd_reg.zde_dnode;
if (likely(fid_is_sane(&oti->oti_zde.lzd_fid))) {
memcpy(rec, &oti->oti_zde.lzd_fid, sizeof(struct lu_fid));
- RETURN(1);
+ GOTO(out, rc = 0);
}
- rc = osd_get_fid_by_oid(env, osd, oti->oti_zde.lzd_reg.zde_dnode,
- (struct lu_fid *)rec);
+ rc = osd_get_fid_by_oid(env, osd, oti->oti_zde.lzd_reg.zde_dnode, fid);
+
+ GOTO(out, rc);
+
+out:
+ if (!rc && !osd_remote_fid(env, osd, fid)) {
+ rc = osd_consistency_check(env, osd, obj, fid, oid,
+ S_ISDIR(DTTOIF(oti->oti_zde.lzd_reg.zde_type)));
+ /* Only -ENOENT error will affect the lookup result. */
+ if (rc != -ENOENT)
+ rc = 0;
+ }
- RETURN(rc == 0 ? 1 : (rc == -ENOENT ? -ENODATA : rc));
+ return rc == 0 ? 1 : (rc == -ENOENT ? -ENODATA : rc);
}
/*
lde->lde_hash = cpu_to_le64(2);
strcpy(lde->lde_name, "..");
lde->lde_namelen = cpu_to_le16(2);
- rc = osd_find_parent_fid(env, &it->ozi_obj->oo_dt, fid);
+ rc = osd_find_parent_fid(env, &it->ozi_obj->oo_dt, fid, NULL);
if (!rc) {
fid_cpu_to_le(&lde->lde_fid, fid);
lde->lde_attrs = LUDA_FID;
}
};
-struct osd_metadnode_it {
- struct osd_device *mit_dev;
- __u64 mit_pos;
- struct lu_fid mit_fid;
- int mit_prefetched;
- __u64 mit_prefetched_dnode;
-};
-
-static struct dt_it *osd_zfs_otable_it_init(const struct lu_env *env,
- struct dt_object *dt, __u32 attr)
-{
- struct osd_device *dev = osd_dev(dt->do_lu.lo_dev);
- struct osd_metadnode_it *it;
- ENTRY;
-
- OBD_ALLOC_PTR(it);
- if (unlikely(it == NULL))
- RETURN(ERR_PTR(-ENOMEM));
-
- it->mit_dev = dev;
-
- /* XXX: dmu_object_next() does NOT find dnodes allocated
- * in the current non-committed txg, so we force txg
- * commit to find all existing dnodes ... */
- if (!dev->od_dt_dev.dd_rdonly)
- txg_wait_synced(dmu_objset_pool(dev->od_os), 0ULL);
-
- RETURN((struct dt_it *)it);
-}
-
-static void osd_zfs_otable_it_fini(const struct lu_env *env, struct dt_it *di)
-{
- struct osd_metadnode_it *it = (struct osd_metadnode_it *)di;
-
- OBD_FREE_PTR(it);
-}
-
-static int osd_zfs_otable_it_get(const struct lu_env *env,
- struct dt_it *di, const struct dt_key *key)
-{
- return 0;
-}
-
-static void osd_zfs_otable_it_put(const struct lu_env *env, struct dt_it *di)
-{
-}
-
-#define OTABLE_PREFETCH 256
-
-static void osd_zfs_otable_prefetch(const struct lu_env *env,
- struct osd_metadnode_it *it)
-{
- struct osd_device *dev = it->mit_dev;
- int rc;
-
- /* can go negative on the very first access to the iterator
- * or if some non-Lustre objects were found */
- if (unlikely(it->mit_prefetched < 0))
- it->mit_prefetched = 0;
-
- if (it->mit_prefetched >= (OTABLE_PREFETCH >> 1))
- return;
-
- if (it->mit_prefetched_dnode == 0)
- it->mit_prefetched_dnode = it->mit_pos;
-
- while (it->mit_prefetched < OTABLE_PREFETCH) {
- rc = -dmu_object_next(dev->od_os, &it->mit_prefetched_dnode,
- B_FALSE, 0);
- if (unlikely(rc != 0))
- break;
-
- osd_dmu_prefetch(dev->od_os, it->mit_prefetched_dnode,
- 0, 0, 0, ZIO_PRIORITY_ASYNC_READ);
-
- it->mit_prefetched++;
- }
-}
-
-static int osd_zfs_otable_it_next(const struct lu_env *env, struct dt_it *di)
-{
- struct osd_metadnode_it *it = (struct osd_metadnode_it *)di;
- struct lustre_mdt_attrs *lma;
- struct osd_device *dev = it->mit_dev;
- nvlist_t *nvbuf = NULL;
- uchar_t *v;
- __u64 dnode;
- int rc, s;
-
- memset(&it->mit_fid, 0, sizeof(it->mit_fid));
-
- dnode = it->mit_pos;
- do {
- rc = -dmu_object_next(dev->od_os, &it->mit_pos, B_FALSE, 0);
- if (unlikely(rc != 0))
- GOTO(out, rc = 1);
- it->mit_prefetched--;
-
- /* LMA is required for this to be a Lustre object.
- * If there is no xattr skip it. */
- rc = __osd_xattr_load_by_oid(dev, it->mit_pos, &nvbuf);
- if (unlikely(rc != 0))
- continue;
-
- LASSERT(nvbuf != NULL);
- rc = -nvlist_lookup_byte_array(nvbuf, XATTR_NAME_LMA, &v, &s);
- if (likely(rc == 0)) {
- /* Lustre object */
- lma = (struct lustre_mdt_attrs *)v;
- lustre_lma_swab(lma);
- if (likely(!(lma->lma_compat & LMAC_NOT_IN_OI) &&
- !(lma->lma_incompat & LMAI_AGENT))) {
- it->mit_fid = lma->lma_self_fid;
- nvlist_free(nvbuf);
- break;
- }
- }
-
- /* not a Lustre visible object, try next one */
- nvlist_free(nvbuf);
- } while (1);
-
-
- /* we aren't prefetching in the above loop because the number of
- * non-Lustre objects is very small and we will be repeating very
- * rare. in case we want to use this to iterate over non-Lustre
- * objects (i.e. when we convert regular ZFS in Lustre) it makes
- * sense to initiate prefetching in the loop */
-
- /* 0 - there are more items, +1 - the end */
- if (likely(rc == 0))
- osd_zfs_otable_prefetch(env, it);
-
- CDEBUG(D_OTHER, "advance: %llu -> %llu "DFID": %d\n", dnode,
- it->mit_pos, PFID(&it->mit_fid), rc);
-
-out:
- return rc;
-}
-
-static struct dt_key *osd_zfs_otable_it_key(const struct lu_env *env,
- const struct dt_it *di)
-{
- return NULL;
-}
-
-static int osd_zfs_otable_it_key_size(const struct lu_env *env,
- const struct dt_it *di)
-{
- return sizeof(__u64);
-}
-
-static int osd_zfs_otable_it_rec(const struct lu_env *env,
- const struct dt_it *di,
- struct dt_rec *rec, __u32 attr)
-{
- struct osd_metadnode_it *it = (struct osd_metadnode_it *)di;
- struct lu_fid *fid = (struct lu_fid *)rec;
- ENTRY;
-
- *fid = it->mit_fid;
-
- RETURN(0);
-}
-
-
-static __u64 osd_zfs_otable_it_store(const struct lu_env *env,
- const struct dt_it *di)
-{
- struct osd_metadnode_it *it = (struct osd_metadnode_it *)di;
-
- return it->mit_pos;
-}
-
-static int osd_zfs_otable_it_load(const struct lu_env *env,
- const struct dt_it *di, __u64 hash)
-{
- struct osd_metadnode_it *it = (struct osd_metadnode_it *)di;
-
- it->mit_pos = hash;
- it->mit_prefetched = 0;
- it->mit_prefetched_dnode = 0;
-
- return osd_zfs_otable_it_next(env, (struct dt_it *)di);
-}
-
-static int osd_zfs_otable_it_key_rec(const struct lu_env *env,
- const struct dt_it *di, void *key_rec)
-{
- return 0;
-}
-
-const struct dt_index_operations osd_zfs_otable_ops = {
- .dio_it = {
- .init = osd_zfs_otable_it_init,
- .fini = osd_zfs_otable_it_fini,
- .get = osd_zfs_otable_it_get,
- .put = osd_zfs_otable_it_put,
- .next = osd_zfs_otable_it_next,
- .key = osd_zfs_otable_it_key,
- .key_size = osd_zfs_otable_it_key_size,
- .rec = osd_zfs_otable_it_rec,
- .store = osd_zfs_otable_it_store,
- .load = osd_zfs_otable_it_load,
- .key_rec = osd_zfs_otable_it_key_rec,
- }
-};
-
int osd_index_try(const struct lu_env *env, struct dt_object *dt,
const struct dt_index_features *feat)
{
GOTO(out, rc = -ERANGE);
if (unlikely(feat == &dt_otable_features)) {
- dt->do_index_ops = &osd_zfs_otable_ops;
+ dt->do_index_ops = &osd_otable_ops;
GOTO(out, rc = 0);
}
#include <dt_object.h>
#include <md_object.h>
#include <lustre_quota.h>
+#include <lustre_scrub.h>
+#include <obd.h>
#ifdef SHRINK_STOP
#undef SHRINK_STOP
#endif
oic_remote:1; /* FID isn't local */
};
+/* One FID <=> oid mapping known to be missing or stale, queued for the
+ * OI scrub thread to repair. */
+struct osd_inconsistent_item {
+ /* link into lustre_scrub::os_inconsistent_items,
+ * protected by lustre_scrub::os_lock. */
+ struct list_head oii_list;
+
+ /* The right FID <=> oid mapping. */
+ struct osd_idmap_cache oii_cache;
+
+ unsigned int oii_insert:1; /* insert or update mapping. */
+};
+
+/* State of an object-table iteration (used when dt_otable_features is
+ * requested via osd_index_try). */
+struct osd_otable_it {
+ struct osd_device *ooi_dev;
+ struct lu_fid ooi_fid; /* FID at the current position */
+ __u64 ooi_pos; /* current dnode number */
+ __u64 ooi_prefetched_dnode; /* next dnode to prefetch */
+ int ooi_prefetched; /* prefetched-but-unvisited count */
+
+ /* The following bits can be updated/checked w/o lock protection.
+ * If more bits will be introduced in the future and need lock to
+ * protect, please add comment. */
+ unsigned int ooi_used_outside:1, /* Some user out of OSD
+ * uses the iteration. */
+ ooi_all_cached:1, /* No more entries can be
+ * filled into cache. */
+ ooi_user_ready:1, /* The user out of OSD is
+ * ready to iterate. */
+ ooi_waiting:1; /* it::next is waiting. */
+};
+
+extern const struct dt_index_operations osd_otable_ops;
+
/* max.number of regular attributes the callers may ask for */
# define OSD_MAX_IN_BULK (sizeof(struct osa_attr)/sizeof(uint64_t))
int oti_ins_cache_size;
int oti_ins_cache_used;
struct lu_buf oti_xattr_lbuf;
+ zap_cursor_t oti_zc;
};
extern struct lu_context_key osd_key;
od_prop_rdonly:1, /**< ZFS property readonly */
od_xattr_in_sa:1,
od_is_ost:1,
+ od_in_init:1,
od_posix_acl:1;
unsigned int od_dnsize;
char od_mntdev[128];
char od_svname[128];
+ char od_uuid[16];
int od_connects;
+ int od_index;
+ __s64 od_auto_scrub_interval;
struct lu_site od_site;
dnode_t *od_groupused_dn;
/* osd seq instance */
struct lu_client_seq *od_cl_seq;
+
+ struct semaphore od_otable_sem;
+ struct osd_otable_it *od_otable_it;
+ struct lustre_scrub od_scrub;
+ struct list_head od_ios_list;
};
enum osd_destroy_type {
};
uint64_t oo_parent; /* used only at object creation */
};
+ struct lu_object_header *oo_header;
};
int osd_statfs(const struct lu_env *, struct dt_device *, struct obd_statfs *);
static inline char *osd_name(struct osd_device *osd)
{
- return osd->od_dt_dev.dd_lu_dev.ld_obd->obd_name;
+ /* NOTE(review): od_svname avoids dereferencing ld_obd, which is
+ * presumably not yet set up during early device configuration --
+ * confirm against the setup ordering. */
+ return osd->od_svname;
+}
+
+/* Bit helpers over a byte-addressed bitmap (e.g. sf_oi_bitmap).
+ * NOTE(review): casting __u8 * to unsigned long * assumes the buffer is
+ * suitably aligned for the kernel bitops -- confirm at allocation sites. */
+static inline void zfs_set_bit(int nr, __u8 *addr)
+{
+ set_bit(nr, (unsigned long *)addr);
+}
+
+static inline int zfs_test_bit(int nr, __u8 *addr)
+{
+ return test_bit(nr, (const unsigned long *)addr);
+}
+
+/* Map a FID to the index of its Object Index container. The mask only
+ * works if od_oi_count is a power of two -- presumably guaranteed by
+ * osd_oi_init(); TODO confirm. */
+static inline int osd_oi_fid2idx(struct osd_device *dev,
+ const struct lu_fid *fid)
+{
+ return fid->f_seq & (dev->od_oi_count - 1);
+}
+
+/* Return the OI container that holds the mapping for @fid. */
+static inline struct osd_oi *osd_fid2oi(struct osd_device *osd,
+ const struct lu_fid *fid)
+{
+ LASSERTF(osd->od_oi_table && osd->od_oi_count >= 1,
+ "%s: "DFID", oi_count %d\n",
+ osd_name(osd), PFID(fid), osd->od_oi_count);
+
+ return osd->od_oi_table[osd_oi_fid2idx(osd, fid)];
}
#ifdef CONFIG_PROC_FS
int __osd_attr_init(const struct lu_env *env, struct osd_device *osd,
struct osd_object *obj, sa_handle_t *sa_hdl, dmu_tx_t *tx,
struct lu_attr *la, uint64_t parent, nvlist_t *);
+int osd_find_new_dnode(const struct lu_env *env, dmu_tx_t *tx,
+ uint64_t oid, dnode_t **dnp);
+int osd_object_init0(const struct lu_env *env, struct osd_object *obj);
/* osd_oi.c */
int osd_oi_init(const struct lu_env *env, struct osd_device *o);
struct osd_idmap_cache *osd_idc_find(const struct lu_env *env,
struct osd_device *osd,
const struct lu_fid *fid);
+int osd_idc_find_and_init_with_oid(const struct lu_env *env,
+ struct osd_device *osd,
+ const struct lu_fid *fid,
+ uint64_t oid);
+int fid_is_on_ost(const struct lu_env *env, struct osd_device *osd,
+ const struct lu_fid *fid);
+int osd_obj_find_or_create(const struct lu_env *env, struct osd_device *o,
+ uint64_t parent, const char *name, uint64_t *child,
+ const struct lu_fid *fid, bool isdir);
+
+extern unsigned int osd_oi_count;
/* osd_index.c */
int osd_index_try(const struct lu_env *env, struct dt_object *dt,
struct osd_device *osd,
struct osd_object *obj,
struct osd_thandle *oh, bool destroy);
+int __osd_xattr_load_by_oid(struct osd_device *osd, uint64_t oid,
+ nvlist_t **sa);
+
+/* osd_scrub.c */
+int osd_scrub_setup(const struct lu_env *env, struct osd_device *dev);
+void osd_scrub_cleanup(const struct lu_env *env, struct osd_device *dev);
+int osd_scrub_start(const struct lu_env *env, struct osd_device *dev,
+ __u32 flags);
+int osd_oii_insert(const struct lu_env *env, struct osd_device *dev,
+ const struct lu_fid *fid, uint64_t oid, bool insert);
+int osd_oii_lookup(struct osd_device *dev, const struct lu_fid *fid,
+ uint64_t *oid);
/* osd_xattr.c */
int __osd_sa_xattr_schedule_update(const struct lu_env *env,
#include <obd.h>
#include <obd_class.h>
#include <lprocfs_status.h>
+#include <lustre_scrub.h>
#include "osd_internal.h"
RETURN(result);
}
+/* lprocfs "auto_scrub" read: show od_auto_scrub_interval, the automatic
+ * OI scrub trigger interval in seconds (AS_NEVER disables it). */
+static int zfs_osd_auto_scrub_seq_show(struct seq_file *m, void *data)
+{
+ struct osd_device *dev = osd_dt_dev((struct dt_device *)m->private);
+
+ LASSERT(dev != NULL);
+ /* Device not fully set up (or already torn down). */
+ if (unlikely(!dev->od_os))
+ return -EINPROGRESS;
+
+ seq_printf(m, "%lld\n", dev->od_auto_scrub_interval);
+ return 0;
+}
+
+/* lprocfs "auto_scrub" write: set od_auto_scrub_interval, the automatic
+ * OI scrub trigger interval in seconds (AS_NEVER disables it). */
+static ssize_t
+zfs_osd_auto_scrub_seq_write(struct file *file, const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct seq_file *m = file->private_data;
+ struct dt_device *dt = m->private;
+ struct osd_device *dev = osd_dt_dev(dt);
+ int rc;
+ __s64 val;
+
+ LASSERT(dev != NULL);
+ /* Device not fully set up (or already torn down). */
+ if (unlikely(!dev->od_os))
+ return -EINPROGRESS;
+
+ rc = lprocfs_str_to_s64(buffer, count, &val);
+ if (rc)
+ return rc;
+
+ /* NOTE(review): no range check -- any __s64 is accepted. Consider
+ * rejecting values that are neither AS_NEVER nor a non-negative
+ * interval; confirm the intended valid range. */
+ dev->od_auto_scrub_interval = val;
+ return count;
+}
+LPROC_SEQ_FOPS(zfs_osd_auto_scrub);
+
+/* lprocfs "oi_scrub" read: dump the OI scrub status/statistics. */
+static int zfs_osd_oi_scrub_seq_show(struct seq_file *m, void *data)
+{
+ struct osd_device *dev = osd_dt_dev((struct dt_device *)m->private);
+
+ LASSERT(dev != NULL);
+ /* Device not fully set up (or already torn down). */
+ if (unlikely(!dev->od_os))
+ return -EINPROGRESS;
+
+ scrub_dump(m, &dev->od_scrub);
+ return 0;
+}
+LPROC_SEQ_FOPS_RO(zfs_osd_oi_scrub);
+
static int zfs_osd_fstype_seq_show(struct seq_file *m, void *data)
{
seq_puts(m, "zfs\n");
.fops = &zfs_dt_filestotal_fops },
{ .name = "filesfree",
.fops = &zfs_dt_filesfree_fops },
+ { .name = "auto_scrub",
+ .fops = &zfs_osd_auto_scrub_fops },
+ { .name = "oi_scrub",
+ .fops = &zfs_osd_oi_scrub_fops },
{ .name = "fstype",
.fops = &zfs_osd_fstype_fops },
{ .name = "mntdev",
OBD_SLAB_ALLOC_PTR_GFP(mo, osd_object_kmem, GFP_NOFS);
if (mo != NULL) {
struct lu_object *l;
+ struct lu_object_header *h;
+ struct osd_device *o = osd_dev(d);
l = &mo->oo_dt.do_lu;
- dt_object_init(&mo->oo_dt, NULL, d);
+ if (unlikely(o->od_in_init)) {
+ OBD_ALLOC_PTR(h);
+ if (!h) {
+ OBD_FREE_PTR(mo);
+ return NULL;
+ }
+
+ lu_object_header_init(h);
+ lu_object_init(l, h, d);
+ lu_object_add_top(h, l);
+ mo->oo_header = h;
+ } else {
+ dt_object_init(&mo->oo_dt, NULL, d);
+ mo->oo_header = NULL;
+ }
+
mo->oo_dt.do_ops = &osd_obj_ops;
l->lo_ops = &osd_lu_obj_ops;
INIT_LIST_HEAD(&mo->oo_sa_linkage);
struct lu_buf buf;
int rc;
struct lustre_mdt_attrs *lma;
+ const struct lu_fid *rfid = lu_object_fid(&obj->oo_dt.do_lu);
ENTRY;
CLASSERT(sizeof(info->oti_buf) >= sizeof(*lma));
CWARN("%s: unsupported incompat LMA feature(s) %#x for "
"fid = "DFID"\n", osd_obj2dev(obj)->od_svname,
lma->lma_incompat & ~LMA_INCOMPAT_SUPP,
- PFID(lu_object_fid(&obj->oo_dt.do_lu)));
+ PFID(rfid));
rc = -EOPNOTSUPP;
+ } else if (unlikely(!lu_fid_eq(rfid, &lma->lma_self_fid))) {
+ CERROR("%s: FID-in-LMA "DFID" does not match the "
+ "object self-fid "DFID"\n",
+ osd_obj2dev(obj)->od_svname,
+ PFID(&lma->lma_self_fid), PFID(rfid));
+ rc = -EREMCHG;
} else {
struct osd_device *osd = osd_obj2dev(obj);
struct osd_object *obj = osd_obj(l);
struct osd_device *osd = osd_obj2dev(obj);
const struct lu_fid *fid = lu_object_fid(l);
+ struct lustre_scrub *scrub = &osd->od_scrub;
+ struct osd_thread_info *info = osd_oti_get(env);
+ struct luz_direntry *zde = &info->oti_zde;
+ struct osd_idmap_cache *idc;
+ char *name = info->oti_str;
uint64_t oid;
int rc = 0;
+ int rc1;
+ bool remote = false;
ENTRY;
LASSERT(osd_invariant(obj));
if (fid_is_otable_it(&l->lo_header->loh_fid)) {
obj->oo_dt.do_ops = &osd_obj_otable_it_ops;
l->lo_header->loh_attr |= LOHA_EXISTS;
- RETURN(0);
+
+ GOTO(out, rc = 0);
}
- if (conf != NULL && conf->loc_flags & LOC_F_NEW)
+ if (conf && conf->loc_flags & LOC_F_NEW)
GOTO(out, rc = 0);
if (unlikely(fid_is_acct(fid))) {
GOTO(out, rc = 0);
}
- rc = osd_fid_lookup(env, osd, fid, &oid);
- if (rc == 0) {
- LASSERT(obj->oo_dn == NULL);
- rc = __osd_obj2dnode(osd->od_os, oid, &obj->oo_dn);
- /* EEXIST will be returned if object is being deleted in ZFS */
- if (rc == -EEXIST) {
- rc = 0;
- GOTO(out, rc);
+ idc = osd_idc_find(env, osd, fid);
+ if (idc && !idc->oic_remote && idc->oic_dnode != ZFS_NO_OBJECT) {
+ oid = idc->oic_dnode;
+ goto zget;
+ }
+
+ rc = -ENOENT;
+ if (!list_empty(&osd->od_scrub.os_inconsistent_items))
+ rc = osd_oii_lookup(osd, fid, &oid);
+
+ if (rc)
+ rc = osd_fid_lookup(env, osd, fid, &oid);
+
+ if (rc == -ENOENT) {
+ if (likely(!(fid_is_norm(fid) || fid_is_igif(fid)) ||
+ fid_is_on_ost(env, osd, fid) ||
+ !zfs_test_bit(osd_oi_fid2idx(osd, fid),
+ scrub->os_file.sf_oi_bitmap)))
+ GOTO(out, rc = 0);
+
+ rc = -EREMCHG;
+ goto trigger;
+ }
+
+ if (rc)
+ GOTO(out, rc);
+
+zget:
+ LASSERT(obj->oo_dn == NULL);
+
+ rc = __osd_obj2dnode(osd->od_os, oid, &obj->oo_dn);
+ /* EEXIST will be returned if object is being deleted in ZFS */
+ if (rc == -EEXIST)
+ GOTO(out, rc = 0);
+
+ if (rc) {
+ CERROR("%s: lookup "DFID"/%#llx failed: rc = %d\n",
+ osd->od_svname, PFID(lu_object_fid(l)), oid, rc);
+ GOTO(out, rc);
+ }
+
+ rc = osd_object_init0(env, obj);
+ if (rc)
+ GOTO(out, rc);
+
+ if (unlikely(obj->oo_header))
+ GOTO(out, rc = 0);
+
+ rc = osd_check_lma(env, obj);
+ if ((!rc && !remote) || (rc != -EREMCHG))
+ GOTO(out, rc);
+
+trigger:
+ /* We still have chance to get the valid dnode: for the object that is
+ * referenced by remote name entry, the object on the local MDT will be
+ * linked under the dir /REMOTE_PARENT_DIR with its FID string as name.
+ *
+ * During the OI scrub, if we cannot find the OI mapping, we may still
+ * have a chance to map the FID to the local OID via looking up the dir
+ * /REMOTE_PARENT_DIR. */
+ if (!remote && !fid_is_on_ost(env, osd, fid)) {
+ osd_fid2str(name, fid, sizeof(info->oti_str));
+ rc = osd_zap_lookup(osd, osd->od_remote_parent_dir,
+ NULL, name, 8, 3, (void *)zde);
+ if (!rc) {
+ oid = zde->lzd_reg.zde_dnode;
+ osd_dnode_rele(obj->oo_dn);
+ obj->oo_dn = NULL;
+ remote = true;
+ goto zget;
}
- if (rc != 0) {
- CERROR("%s: lookup "DFID"/%#llx failed: rc = %d\n",
- osd->od_svname, PFID(lu_object_fid(l)), oid, rc);
- GOTO(out, rc);
+ }
+
+ /* The case someone triggered the OI scrub already. */
+ if (thread_is_running(&scrub->os_thread)) {
+ if (!rc) {
+ LASSERT(remote);
+
+ lu_object_set_agent_entry(l);
+ osd_oii_insert(env, osd, fid, oid, false);
+ } else {
+ rc = -EINPROGRESS;
}
- rc = osd_object_init0(env, obj);
- if (rc != 0)
- GOTO(out, rc);
- rc = osd_check_lma(env, obj);
- if (rc != 0)
- GOTO(out, rc);
- } else if (rc == -ENOENT) {
- rc = 0;
+ GOTO(out, rc);
}
- LASSERT(osd_invariant(obj));
+
+ /* The case NOT allow to trigger OI scrub automatically. */
+ if (osd->od_auto_scrub_interval == AS_NEVER)
+ GOTO(out, rc);
+
+ /* It is me to trigger the OI scrub. */
+ rc1 = osd_scrub_start(env, osd, SS_CLEAR_DRYRUN |
+ SS_CLEAR_FAILOUT | SS_AUTO_FULL);
+ LCONSOLE_WARN("%s: trigger OI scrub by RPC for the "DFID": rc = %d\n",
+ osd_name(osd), PFID(fid), rc1);
+ if (!rc) {
+ LASSERT(remote);
+
+ lu_object_set_agent_entry(l);
+ if (!rc1)
+ osd_oii_insert(env, osd, fid, oid, false);
+ } else {
+ if (!rc1)
+ rc = -EINPROGRESS;
+ else
+ rc = -EREMCHG;
+ }
+
+ GOTO(out, rc);
+
out:
RETURN(rc);
}
static void osd_object_free(const struct lu_env *env, struct lu_object *l)
{
struct osd_object *obj = osd_obj(l);
+ struct lu_object_header *h = obj->oo_header;
LASSERT(osd_invariant(obj));
dt_object_fini(&obj->oo_dt);
OBD_SLAB_FREE_PTR(obj, osd_object_kmem);
+ /* Objects allocated with a private header (the od_in_init case in
+ * object allocation) own it: save it before freeing the object,
+ * then release it last. */
+ if (unlikely(h)) {
+ lu_object_header_fini(h);
+ OBD_FREE_PTR(h);
+ }
}
static int
/* remove obj ref from index dir (it depends) */
zapid = osd_get_name_n_idx(env, osd, fid, buf,
sizeof(info->oti_str), &zdn);
- rc = osd_zap_remove(osd, zapid, zdn, buf, oh->ot_tx);
- if (rc) {
- CERROR("%s: zap_remove(%s) failed: rc = %d\n",
- osd->od_svname, buf, rc);
- GOTO(out, rc);
- }
-
rc = osd_xattrs_destroy(env, obj, oh);
if (rc) {
CERROR("%s: cannot destroy xattrs for %s: rc = %d\n",
osd->od_svname, buf, oid, rc);
}
+ /* Remove the OI mapping after the destroy to handle the race with
+ * OI scrub that may insert missed OI mapping during the interval. */
+ rc = osd_zap_remove(osd, zapid, zdn, buf, oh->ot_tx);
+ if (unlikely(rc == -ENOENT))
+ rc = 0;
+ if (rc)
+ CERROR("%s: zap_remove(%s) failed: rc = %d\n",
+ osd->od_svname, buf, rc);
+
+ GOTO(out, rc);
+
out:
/* not needed in the cache anymore */
set_bit(LU_OBJECT_HEARD_BANSHEE, &dt->do_lu.lo_header->loh_flags);
transaction group. */
LASSERT(oh->ot_tx->tx_txg != 0);
+ if (OBD_FAIL_CHECK(OBD_FAIL_OSD_FID_MAPPING) && !osd->od_is_ost) {
+ struct zpl_direntry *zde = &info->oti_zde.lzd_reg;
+ char *buf = info->oti_str;
+ dnode_t *zdn = NULL;
+ uint64_t zapid;
+
+ zapid = osd_get_name_n_idx(env, osd, lu_object_fid(&dt->do_lu),
+ buf, sizeof(info->oti_str), &zdn);
+ rc = osd_zap_lookup(osd, zapid, zdn, buf, 8,
+ sizeof(*zde) / 8, zde);
+ if (!rc) {
+ zde->zde_dnode -= 1;
+ rc = -zap_update(osd->od_os, zapid, buf, 8,
+ sizeof(*zde) / 8, zde, oh->ot_tx);
+ }
+ up_read(&obj->oo_guard);
+
+ RETURN(rc > 0 ? 0 : rc);
+ }
+
/* Only allow set size for regular file */
if (!S_ISREG(dt->do_lu.lo_header->loh_attr))
valid &= ~(LA_SIZE | LA_BLOCKS);
return rc;
}
-static int osd_find_new_dnode(const struct lu_env *env, dmu_tx_t *tx,
- uint64_t oid, dnode_t **dnp)
+int osd_find_new_dnode(const struct lu_env *env, dmu_tx_t *tx,
+ uint64_t oid, dnode_t **dnp)
{
dmu_tx_hold_t *txh;
int rc = 0;
dnode_t *dn = NULL, *zdn = NULL;
uint64_t zapid, parent = 0;
int rc;
+ __u32 compat = 0;
ENTRY;
zapid = osd_get_name_n_idx(env, osd, fid, buf,
sizeof(info->oti_str), &zdn);
- rc = osd_zap_add(osd, zapid, zdn, buf, 8, 1, zde, oh->ot_tx);
- if (rc)
- GOTO(out, rc);
+ if (!CFS_FAIL_CHECK(OBD_FAIL_OSD_NO_OI_ENTRY)) {
+ if (osd->od_is_ost &&
+ OBD_FAIL_CHECK(OBD_FAIL_OSD_COMPAT_INVALID_ENTRY))
+ zde->zde_dnode++;
+
+ if (!osd->od_is_ost ||
+ !OBD_FAIL_CHECK(OBD_FAIL_OSD_COMPAT_NO_ENTRY)) {
+ rc = osd_zap_add(osd, zapid, zdn, buf, 8, 1,
+ zde, oh->ot_tx);
+ if (rc)
+ GOTO(out, rc);
+ }
+ }
+
obj->oo_dn = dn;
/* Now add in all of the "SA" attributes */
rc = osd_sa_handle_get(obj);
GOTO(out, rc);
/* initialize LMA */
- lustre_lma_init(lma, fid, 0, 0);
+ if (fid_is_idif(fid) || (fid_is_norm(fid) && osd->od_is_ost))
+ compat |= LMAC_FID_ON_OST;
+ lustre_lma_init(lma, fid, compat, 0);
lustre_lma_swab(lma);
rc = -nvlist_add_byte_array(obj->oo_sa_xattr, XATTR_NAME_LMA,
(uchar_t *)lma, sizeof(*lma));
return 0;
}
-/**
- * Create a new OI with the given name.
- */
-static int
-osd_oi_create(const struct lu_env *env, struct osd_device *o,
-	      uint64_t parent, const char *name, uint64_t *child)
+/**
+ * Create a local object @name under @parent: either a 64-bit-hash ZAP
+ * directory or a plain metadata dnode, per @isdir. When @fid is given,
+ * the object's LMA is initialized with that FID plus LMAC_NOT_IN_OI
+ * (and LMAC_FID_ON_OST on an OST), so the object is skipped by OI scrub.
+ * Generalizes the former osd_oi_create(). On success *child is the new
+ * dnode number.
+ */
+static int osd_obj_create(const struct lu_env *env, struct osd_device *o,
+			  uint64_t parent, const char *name, uint64_t *child,
+			  const struct lu_fid *fid, bool isdir)
{
-	struct zpl_direntry *zde = &osd_oti_get(env)->oti_zde.lzd_reg;
-	struct lu_attr *la = &osd_oti_get(env)->oti_la;
-	sa_handle_t *sa_hdl = NULL;
-	dmu_tx_t *tx;
-	uint64_t oid;
-	int rc;
-
-	/* verify it doesn't already exist */
-	rc = -zap_lookup(o->od_os, parent, name, 8, 1, (void *)zde);
-	if (rc == 0)
-		return -EEXIST;
+	struct osd_thread_info *info = osd_oti_get(env);
+	struct zpl_direntry *zde = &info->oti_zde.lzd_reg;
+	struct lustre_mdt_attrs *lma = &info->oti_mdt_attrs;
+	struct lu_attr *la = &info->oti_la;
+	sa_handle_t *sa_hdl = NULL;
+	nvlist_t *nvbuf = NULL;
+	dmu_tx_t *tx;
+	uint64_t oid;
+	__u32 compat = LMAC_NOT_IN_OI;
+	int rc;
+	ENTRY;
if (o->od_dt_dev.dd_rdonly)
-		return -EROFS;
+		RETURN(-EROFS);
+
+	memset(la, 0, sizeof(*la));
+	la->la_valid = LA_MODE | LA_UID | LA_GID;
+	la->la_mode = S_IRUGO | S_IWUSR | (isdir ? S_IXUGO | S_IFDIR : S_IFREG);
+
+	/* Prepare the LMA xattr outside the transaction. */
+	if (fid) {
+		rc = -nvlist_alloc(&nvbuf, NV_UNIQUE_NAME, KM_SLEEP);
+		if (rc)
+			RETURN(rc);
+
+		if (o->od_is_ost)
+			compat |= LMAC_FID_ON_OST;
+		lustre_lma_init(lma, fid, compat, 0);
+		lustre_lma_swab(lma);
+		rc = -nvlist_add_byte_array(nvbuf, XATTR_NAME_LMA,
+					    (uchar_t *)lma, sizeof(*lma));
+		if (rc)
+			GOTO(out, rc);
+	}
/* create fid-to-dnode index */
tx = dmu_tx_create(o->od_os);
-	if (tx == NULL)
-		return -ENOMEM;
+	if (!tx)
+		GOTO(out, rc = -ENOMEM);
-	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, 1, NULL);
+	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
dmu_tx_hold_bonus(tx, parent);
dmu_tx_hold_zap(tx, parent, TRUE, name);
dmu_tx_hold_sa_create(tx, ZFS_SA_BASE_ATTR_SIZE);
-
rc = -dmu_tx_assign(tx, TXG_WAIT);
if (rc) {
dmu_tx_abort(tx);
-		return rc;
+		GOTO(out, rc);
}
-	oid = osd_zap_create_flags(o->od_os, 0, ZAP_FLAG_HASH64,
-				   DMU_OT_DIRECTORY_CONTENTS,
-				   14, /* == ZFS fzap_default_block_shift */
-				   DN_MAX_INDBLKSHIFT,
-				   0, tx);
-
+	if (isdir)
+		/* 14 == ZFS fzap_default_block_shift */
+		oid = osd_zap_create_flags(o->od_os, 0, ZAP_FLAG_HASH64,
+					   DMU_OT_DIRECTORY_CONTENTS,
+					   14, DN_MAX_INDBLKSHIFT, 0, tx);
+	else
+		oid = osd_dmu_object_alloc(o->od_os, DMU_OTN_UINT8_METADATA,
+					   0, 0, tx);
rc = -sa_handle_get(o->od_os, oid, NULL, SA_HDL_PRIVATE, &sa_hdl);
if (rc)
-		goto commit;
-	memset(la, 0, sizeof(*la));
-	la->la_valid = LA_MODE | LA_UID | LA_GID;
-	la->la_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
-	rc = __osd_attr_init(env, o, NULL, sa_hdl, tx, la, parent, NULL);
+		GOTO(commit, rc);
+
+	rc = __osd_attr_init(env, o, NULL, sa_hdl, tx, la, parent, nvbuf);
sa_handle_destroy(sa_hdl);
if (rc)
-		goto commit;
+		GOTO(commit, rc);
zde->zde_dnode = oid;
zde->zde_pad = 0;
-	zde->zde_type = IFTODT(S_IFDIR);
-
+	zde->zde_type = IFTODT(isdir ? S_IFDIR : S_IFREG);
rc = -zap_add(o->od_os, parent, name, 8, 1, (void *)zde, tx);
+	GOTO(commit, rc);
+
commit:
if (rc)
dmu_object_free(o->od_os, oid, tx);
-	dmu_tx_commit(tx);
-
-	if (rc == 0)
+	else
*child = oid;
-
+	dmu_tx_commit(tx);
+out:
+	if (nvbuf)
+		nvlist_free(nvbuf);
return rc;
}
if (rc == 0)
*child = oi.oi_zapid;
else if (rc == -ENOENT)
- rc = osd_oi_create(env, o, parent, name, child);
+ rc = osd_obj_create(env, o, parent, name, child, NULL, true);
+
+ return rc;
+}
+
+/* Look up local object @name under @parent; if it does not exist yet,
+ * create it via osd_obj_create() (optionally stamping @fid into LMA).
+ * On success *child holds the object's dnode number. */
+int osd_obj_find_or_create(const struct lu_env *env, struct osd_device *o,
+			   uint64_t parent, const char *name, uint64_t *child,
+			   const struct lu_fid *fid, bool isdir)
+{
+	struct osd_oi oi;
+	int rc;
+
+	rc = osd_oi_lookup(env, o, parent, name, &oi);
+	if (!rc)
+		*child = oi.oi_zapid;
+	else if (rc == -ENOENT)
+		rc = osd_obj_create(env, o, parent, name, child, fid, isdir);
+
+	return rc;
+}
return 0;
}
- LASSERT(ss != NULL);
+ /* The seq_server_site may be NOT ready during initial OI scrub */
+ if (unlikely(!ss || !ss->ss_server_fld ||
+ !ss->ss_server_fld->lsf_cache))
+ return -ENOENT;
+
fld_range_set_any(range);
/* OSD will only do local fld lookup */
return fld_local_lookup(env, ss->ss_server_fld, seq, range);
RETURN(1);
if (unlikely(fid_is_local_file(fid) || fid_is_llog(fid)) ||
- fid_is_name_llog(fid) || fid_is_quota(fid))
+ fid_is_name_llog(fid) || fid_is_quota(fid) ||
+ fid_is_igif(fid))
RETURN(0);
rc = osd_fld_lookup(env, osd, fid_seq(fid), range);
{
struct osd_oi *oi;
- LASSERT(osd->od_oi_table != NULL);
- oi = osd->od_oi_table[fid_seq(fid) & (osd->od_oi_count - 1)];
+ oi = osd_fid2oi(osd, fid);
if (buf)
osd_fid2str(buf, fid, bufsize);
if (zdn)
/**
* Determine if the type and number of OIs used by this file system.
*/
-static int
-osd_oi_probe(const struct lu_env *env, struct osd_device *o, int *count)
+static int osd_oi_probe(const struct lu_env *env, struct osd_device *o)
{
- uint64_t root_oid = o->od_root;
- struct osd_oi oi;
- char name[16];
- int rc;
+ struct lustre_scrub *scrub = &o->od_scrub;
+ struct scrub_file *sf = &scrub->os_file;
+ struct osd_oi oi;
+ char name[16];
+ int max = sf->sf_oi_count > 0 ? sf->sf_oi_count : OSD_OI_FID_NR_MAX;
+ int count;
+ int rc;
ENTRY;
/*
* The only safeguard is that we know the number of OIs must be a
* power of two and this is checked for basic sanity.
*/
- for (*count = 0; *count < OSD_OI_FID_NR_MAX; (*count)++) {
- sprintf(name, "%s.%d", DMU_OSD_OI_NAME_BASE, *count);
- rc = osd_oi_lookup(env, o, root_oid, name, &oi);
- if (rc == 0)
+ for (count = 0; count < max; count++) {
+ snprintf(name, 15, "%s.%d", DMU_OSD_OI_NAME_BASE, count);
+ rc = osd_oi_lookup(env, o, o->od_root, name, &oi);
+ if (!rc)
continue;
if (rc == -ENOENT) {
- if (*count == 0)
- break;
-
- if ((*count & (*count - 1)) != 0)
- RETURN(-EDOM);
+ if (sf->sf_oi_count == 0)
+ RETURN(count);
- RETURN(0);
+ zfs_set_bit(count, sf->sf_oi_bitmap);
+ continue;
}
- RETURN(rc);
+ if (rc)
+ RETURN(rc);
}
- /*
- * No OIs exist, this must be a new filesystem.
- */
- *count = 0;
-
- RETURN(0);
+ RETURN(count);
}
static void osd_ost_seq_fini(const struct lu_env *env, struct osd_device *osd)
*/
int osd_oi_init(const struct lu_env *env, struct osd_device *o)
{
- char *key = osd_oti_get(env)->oti_buf;
- int i, rc, count = 0;
+ struct lustre_scrub *scrub = &o->od_scrub;
+ struct scrub_file *sf = &scrub->os_file;
+ char *key = osd_oti_get(env)->oti_buf;
+ uint64_t sdb;
+ int i, rc, count;
ENTRY;
+ LASSERTF((sf->sf_oi_count & (sf->sf_oi_count - 1)) == 0,
+ "Invalid OI count in scrub file %d\n", sf->sf_oi_count);
+
osd_oi_init_remote_parent(env, o);
- rc = osd_oi_probe(env, o, &count);
+ rc = osd_oi_init_compat(env, o);
if (rc)
RETURN(rc);
- if (count == 0) {
- uint64_t odb, sdb;
+ count = osd_oi_probe(env, o);
+ if (count < 0)
+ GOTO(out, rc = count);
- count = osd_oi_count;
- odb = o->od_root;
+ if (count > 0) {
+ if (count == sf->sf_oi_count)
+ goto open;
- for (i = 0; i < count; i++) {
- sprintf(key, "%s.%d", DMU_OSD_OI_NAME_BASE, i);
- rc = osd_oi_find_or_create(env, o, odb, key, &sdb);
- if (rc)
- RETURN(rc);
+ if (sf->sf_oi_count == 0) {
+ if (likely((count & (count - 1)) == 0)) {
+ sf->sf_oi_count = count;
+ rc = scrub_file_store(env, scrub);
+ if (rc)
+ GOTO(out, rc);
+
+ goto open;
+ }
+
+ LCONSOLE_ERROR("%s: invalid oi count %d. You can "
+ "remove all OIs, then remount it\n",
+ osd_name(o), count);
+ GOTO(out, rc = -EDOM);
+ }
+
+ scrub_file_reset(scrub, o->od_uuid, SF_RECREATED);
+ count = sf->sf_oi_count;
+ } else {
+ if (sf->sf_oi_count > 0) {
+ count = sf->sf_oi_count;
+ memset(sf->sf_oi_bitmap, 0, SCRUB_OI_BITMAP_SIZE);
+ for (i = 0; i < count; i++)
+ zfs_set_bit(i, sf->sf_oi_bitmap);
+ scrub_file_reset(scrub, o->od_uuid, SF_RECREATED);
+ } else {
+ count = sf->sf_oi_count = osd_oi_count;
}
}
- rc = osd_oi_init_compat(env, o);
+ rc = scrub_file_store(env, scrub);
if (rc)
- RETURN(rc);
+ GOTO(out, rc);
+ for (i = 0; i < count; i++) {
+ LASSERT(sizeof(osd_oti_get(env)->oti_buf) >= 32);
+
+ snprintf(key, sizeof(osd_oti_get(env)->oti_buf) - 1,
+ "%s.%d", DMU_OSD_OI_NAME_BASE, i);
+ rc = osd_oi_find_or_create(env, o, o->od_root, key, &sdb);
+ if (rc)
+ GOTO(out, rc);
+ }
+
+open:
LASSERT((count & (count - 1)) == 0);
o->od_oi_count = count;
OBD_ALLOC(o->od_oi_table, sizeof(struct osd_oi *) * count);
if (o->od_oi_table == NULL)
- RETURN(-ENOMEM);
+ GOTO(out, rc = -ENOMEM);
rc = osd_oi_open_table(env, o, count);
+
+ GOTO(out, rc);
+
+out:
if (rc) {
- OBD_FREE(o->od_oi_table, sizeof(struct osd_oi *) * count);
- o->od_oi_table = NULL;
+ osd_ost_seq_fini(env, o);
+
+ if (o->od_oi_table) {
+ OBD_FREE(o->od_oi_table,
+ sizeof(struct osd_oi *) * count);
+ o->od_oi_table = NULL;
+ }
}
- RETURN(rc);
+ return rc;
}
void osd_oi_fini(const struct lu_env *env, struct osd_device *o)
return 0;
}
+
+/* Find (or add) the idmap cache entry for @fid and bind it to the known
+ * local dnode @oid, clearing the remote flag. */
+int osd_idc_find_and_init_with_oid(const struct lu_env *env,
+				   struct osd_device *osd,
+				   const struct lu_fid *fid,
+				   uint64_t oid)
+{
+	struct osd_idmap_cache *idc;
+
+	idc = osd_idc_find(env, osd, fid);
+	if (!idc) {
+		idc = osd_idc_add(env, osd, fid);
+		if (IS_ERR(idc))
+			return PTR_ERR(idc);
+	}
+
+	idc->oic_dnode = oid;
+	idc->oic_remote = 0;
+
+	return 0;
+}
--- /dev/null
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License version 2 for more details. A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2017, Intel Corporation.
+ */
+/*
+ * lustre/osd-zfs/osd_scrub.c
+ *
+ * Top-level entry points into osd module
+ *
+ * The OI scrub is used for rebuilding Object Index files when restoring the
+ * MDT from a file-level backup.
+ *
+ * The otable based iterator scans ZFS objects to feed up layer LFSCK.
+ *
+ * Author: Fan Yong <fan.yong@intel.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LFSCK
+
+#include <linux/kthread.h>
+#include <uapi/linux/lustre/lustre_idl.h>
+#include <lustre_disk.h>
+#include <dt_object.h>
+#include <linux/xattr.h>
+#include <lustre_scrub.h>
+#include <obd_class.h>
+#include <lustre_nodemap.h>
+#include <sys/dsl_dataset.h>
+
+#include "osd_internal.h"
+
+#define OSD_OTABLE_MAX_HASH ((1ULL << 48) - 1)
+#define OTABLE_PREFETCH 256
+
+#define DTO_INDEX_INSERT 1
+#define DTO_INDEX_DELETE 2
+#define DTO_INDEX_UPDATE 3
+
+static inline bool osd_scrub_has_window(struct osd_otable_it *it)
+{
+	/* The scrub may run at most OTABLE_PREFETCH objects ahead of the
+	 * up-layer LFSCK consumer; true while prefetching may continue. */
+	return it->ooi_prefetched < OTABLE_PREFETCH;
+}
+
+/**
+ * update/insert/delete the specified OI mapping (@fid @id) according to the ops
+ *
+ * Runs one ZFS transaction that applies the requested change to the OI ZAP
+ * selected by @fid. ENOENT on update/delete and EEXIST on insert are treated
+ * as benign races with concurrent unlink/create.
+ *
+ * \retval 1, changed nothing
+ * \retval 0, changed successfully
+ * \retval -ve, on error
+ */
+static int osd_scrub_refresh_mapping(const struct lu_env *env,
+				     struct osd_device *dev,
+				     const struct lu_fid *fid,
+				     uint64_t oid, int ops,
+				     bool force, const char *name)
+{
+	struct osd_thread_info *info = osd_oti_get(env);
+	struct zpl_direntry *zde = &info->oti_zde.lzd_reg;
+	char *buf = info->oti_str;
+	dmu_tx_t *tx = NULL;
+	dnode_t *dn = NULL;
+	uint64_t zapid;
+	int rc;
+	ENTRY;
+
+	/* Under dryrun mode the OI table is left untouched unless the
+	 * caller explicitly forces the repair. */
+	if (dev->od_scrub.os_file.sf_param & SP_DRYRUN && !force)
+		GOTO(log, rc = 0);
+
+	tx = dmu_tx_create(dev->od_os);
+	if (!tx)
+		GOTO(log, rc = -ENOMEM);
+
+	zapid = osd_get_name_n_idx(env, dev, fid, buf,
+				   sizeof(info->oti_str), &dn);
+	osd_tx_hold_zap(tx, zapid, dn,
+			ops == DTO_INDEX_INSERT ? TRUE : FALSE, NULL);
+	rc = -dmu_tx_assign(tx, TXG_WAIT);
+	if (rc) {
+		dmu_tx_abort(tx);
+		GOTO(log, rc);
+	}
+
+	switch (ops) {
+	case DTO_INDEX_UPDATE:
+		zde->zde_pad = 0;
+		zde->zde_dnode = oid;
+		zde->zde_type = 0; /* The type in OI mapping is useless. */
+		rc = -zap_update(dev->od_os, zapid, buf, 8, sizeof(*zde) / 8,
+				 zde, tx);
+		if (unlikely(rc == -ENOENT)) {
+			/* Some unlink thread may have removed the OI mapping. */
+			rc = 1;
+		}
+		break;
+	case DTO_INDEX_INSERT:
+		zde->zde_pad = 0;
+		zde->zde_dnode = oid;
+		zde->zde_type = 0; /* The type in OI mapping is useless. */
+		rc = osd_zap_add(dev, zapid, dn, buf, 8, sizeof(*zde) / 8,
+				 zde, tx);
+		if (unlikely(rc == -EEXIST))
+			rc = 1;
+		break;
+	case DTO_INDEX_DELETE:
+		rc = osd_zap_remove(dev, zapid, dn, buf, tx);
+		if (rc == -ENOENT) {
+			/* It is normal that the unlink thread has removed the
+			 * OI mapping already. */
+			rc = 1;
+		}
+		break;
+	default:
+		LASSERTF(0, "Unexpected ops %d\n", ops);
+		rc = -EINVAL;
+		break;
+	}
+
+	dmu_tx_commit(tx);
+	GOTO(log, rc);
+
+log:
+	CDEBUG(D_LFSCK, "%s: refresh OI map for scrub, op %d, force %s, "
+	       DFID" => %llu (%s): rc = %d\n", osd_name(dev), ops,
+	       force ? "yes" : "no", PFID(fid), oid, name ? name : "null", rc);
+
+	return rc;
+}
+
+/*
+ * Check the OI mapping for one scanned object (@fid => @oid) and repair it
+ * (insert or update) when inconsistent. @val is the pre-check result from
+ * the caller; a negative @val is accounted as a failure. Serialized by
+ * os_rwsem. Returns the error only when SP_FAILOUT is set, otherwise 0.
+ */
+static int
+osd_scrub_check_update(const struct lu_env *env, struct osd_device *dev,
+		       const struct lu_fid *fid, uint64_t oid, int val)
+{
+	struct lustre_scrub *scrub = &dev->od_scrub;
+	struct scrub_file *sf = &scrub->os_file;
+	struct osd_inconsistent_item *oii = NULL;
+	nvlist_t *nvbuf = NULL;
+	dnode_t *dn = NULL;
+	uint64_t oid2;
+	int ops = DTO_INDEX_UPDATE;
+	int rc;
+	ENTRY;
+
+	down_write(&scrub->os_rwsem);
+	scrub->os_new_checked++;
+	if (val < 0)
+		GOTO(out, rc = val);
+
+	/* os_in_prior means this pair came from the inconsistent list head. */
+	if (scrub->os_in_prior)
+		oii = list_entry(scrub->os_inconsistent_items.next,
+				 struct osd_inconsistent_item, oii_list);
+
+	if (oid < sf->sf_pos_latest_start && !oii)
+		GOTO(out, rc = 0);
+
+	if (oii && oii->oii_insert) {
+		ops = DTO_INDEX_INSERT;
+		goto zget;
+	}
+
+	rc = osd_fid_lookup(env, dev, fid, &oid2);
+	if (rc) {
+		if (rc != -ENOENT)
+			GOTO(out, rc);
+
+		/* No OI mapping at all: insert one. */
+		ops = DTO_INDEX_INSERT;
+
+zget:
+		rc = __osd_obj2dnode(dev->od_os, oid, &dn);
+		if (rc) {
+			/* Someone removed the object by race. */
+			if (rc == -ENOENT || rc == -EEXIST)
+				rc = 0;
+			GOTO(out, rc);
+		}
+
+		scrub->os_full_speed = 1;
+		sf->sf_flags |= SF_INCONSISTENT;
+	} else if (oid == oid2) {
+		/* The OI mapping already points at this object. */
+		GOTO(out, rc = 0);
+	} else {
+		struct lustre_mdt_attrs *lma = NULL;
+		int size;
+
+		/* The OI maps @fid to a different dnode @oid2; check whether
+		 * that object really claims the FID before overwriting. */
+		rc = __osd_xattr_load_by_oid(dev, oid2, &nvbuf);
+		if (rc == -ENOENT || rc == -EEXIST || rc == -ENODATA)
+			goto update;
+		if (rc)
+			GOTO(out, rc);
+
+		rc = -nvlist_lookup_byte_array(nvbuf, XATTR_NAME_LMA,
+					       (uchar_t **)&lma, &size);
+		if (rc == -ENOENT || rc == -EEXIST || rc == -ENODATA)
+			goto update;
+		if (rc)
+			GOTO(out, rc);
+
+		lustre_lma_swab(lma);
+		if (unlikely(lu_fid_eq(&lma->lma_self_fid, fid))) {
+			CDEBUG(D_LFSCK, "%s: the FID "DFID" is used by "
+			       "two objects: %llu and %llu (in OI)\n",
+			       osd_name(dev), PFID(fid), oid, oid2);
+
+			GOTO(out, rc = -EEXIST);
+		}
+
+update:
+		scrub->os_full_speed = 1;
+		sf->sf_flags |= SF_INCONSISTENT;
+	}
+
+	rc = osd_scrub_refresh_mapping(env, dev, fid, oid, ops, false, NULL);
+	if (!rc) {
+		if (scrub->os_in_prior)
+			sf->sf_items_updated_prior++;
+		else
+			sf->sf_items_updated++;
+	}
+
+	GOTO(out, rc);
+
+out:
+	if (nvbuf)
+		nvlist_free(nvbuf);
+
+	if (rc < 0) {
+		sf->sf_items_failed++;
+		if (sf->sf_pos_first_inconsistent == 0 ||
+		    sf->sf_pos_first_inconsistent > oid)
+			sf->sf_pos_first_inconsistent = oid;
+	} else {
+		rc = 0;
+	}
+
+	/* A conflicting unlink may happen during the OI scrub; if that
+	 * happened, remove the newly added OI mapping. */
+	if (ops == DTO_INDEX_INSERT && dn && dn->dn_free_txg)
+		osd_scrub_refresh_mapping(env, dev, fid, oid,
+					  DTO_INDEX_DELETE, false, NULL);
+	up_write(&scrub->os_rwsem);
+
+	if (dn)
+		osd_dnode_rele(dn);
+
+	if (oii) {
+		spin_lock(&scrub->os_lock);
+		if (likely(!list_empty(&oii->oii_list)))
+			list_del(&oii->oii_list);
+		spin_unlock(&scrub->os_lock);
+		OBD_FREE_PTR(oii);
+	}
+
+	RETURN(sf->sf_param & SP_FAILOUT ? rc : 0);
+}
+
+/* Prepare one scrub run: apply the start flags to the scrub file, pick
+ * full/partial speed, compute the start position, persist the file and
+ * mark the scrub thread SVC_RUNNING on success. */
+static int osd_scrub_prep(const struct lu_env *env, struct osd_device *dev)
+{
+	struct lustre_scrub *scrub = &dev->od_scrub;
+	struct ptlrpc_thread *thread = &scrub->os_thread;
+	struct scrub_file *sf = &scrub->os_file;
+	__u32 flags = scrub->os_start_flags;
+	int rc;
+	bool drop_dryrun = false;
+	ENTRY;
+
+	CDEBUG(D_LFSCK, "%s: OI scrub prep, flags = 0x%x\n",
+	       scrub->os_name, flags);
+
+	down_write(&scrub->os_rwsem);
+	if (flags & SS_SET_FAILOUT)
+		sf->sf_param |= SP_FAILOUT;
+	else if (flags & SS_CLEAR_FAILOUT)
+		sf->sf_param &= ~SP_FAILOUT;
+
+	if (flags & SS_SET_DRYRUN) {
+		sf->sf_param |= SP_DRYRUN;
+	} else if (flags & SS_CLEAR_DRYRUN && sf->sf_param & SP_DRYRUN) {
+		/* Leaving dryrun: rescan from the first known inconsistency. */
+		sf->sf_param &= ~SP_DRYRUN;
+		drop_dryrun = true;
+	}
+
+	if (flags & SS_RESET)
+		scrub_file_reset(scrub, dev->od_uuid, 0);
+
+	scrub->os_partial_scan = 0;
+	if (flags & SS_AUTO_FULL) {
+		scrub->os_full_speed = 1;
+		sf->sf_flags |= SF_AUTO;
+	} else if (sf->sf_flags & (SF_RECREATED | SF_INCONSISTENT |
+				   SF_UPGRADE)) {
+		scrub->os_full_speed = 1;
+	} else {
+		scrub->os_full_speed = 0;
+	}
+
+	spin_lock(&scrub->os_lock);
+	scrub->os_in_prior = 0;
+	scrub->os_waiting = 0;
+	scrub->os_paused = 0;
+	scrub->os_in_join = 0;
+	scrub->os_full_scrub = 0;
+	spin_unlock(&scrub->os_lock);
+	scrub->os_new_checked = 0;
+	if (drop_dryrun && sf->sf_pos_first_inconsistent != 0)
+		sf->sf_pos_latest_start = sf->sf_pos_first_inconsistent;
+	else if (sf->sf_pos_last_checkpoint != 0)
+		sf->sf_pos_latest_start = sf->sf_pos_last_checkpoint + 1;
+	else
+		sf->sf_pos_latest_start = 1;
+
+	scrub->os_pos_current = sf->sf_pos_latest_start;
+	sf->sf_status = SS_SCANNING;
+	sf->sf_time_latest_start = cfs_time_current_sec();
+	sf->sf_time_last_checkpoint = sf->sf_time_latest_start;
+	sf->sf_pos_last_checkpoint = sf->sf_pos_latest_start - 1;
+	rc = scrub_file_store(env, scrub);
+	if (!rc) {
+		spin_lock(&scrub->os_lock);
+		thread_set_flags(thread, SVC_RUNNING);
+		spin_unlock(&scrub->os_lock);
+		wake_up_all(&thread->t_ctl_waitq);
+	}
+	up_write(&scrub->os_rwsem);
+
+	RETURN(rc);
+}
+
+/* Finalize one scrub run: fold counters into the scrub file, record the
+ * final status according to @result (>0 completed, 0 stopped/paused,
+ * <0 failed) and persist the file. Returns @result unless storing fails. */
+static int osd_scrub_post(const struct lu_env *env, struct osd_device *dev,
+			  int result)
+{
+	struct lustre_scrub *scrub = &dev->od_scrub;
+	struct scrub_file *sf = &scrub->os_file;
+	int rc;
+	ENTRY;
+
+	CDEBUG(D_LFSCK, "%s: OI scrub post with result = %d\n",
+	       scrub->os_name, result);
+
+	down_write(&scrub->os_rwsem);
+	spin_lock(&scrub->os_lock);
+	thread_set_flags(&scrub->os_thread, SVC_STOPPING);
+	spin_unlock(&scrub->os_lock);
+	if (scrub->os_new_checked > 0) {
+		sf->sf_items_checked += scrub->os_new_checked;
+		scrub->os_new_checked = 0;
+		sf->sf_pos_last_checkpoint = scrub->os_pos_current;
+	}
+	sf->sf_time_last_checkpoint = cfs_time_current_sec();
+	if (result > 0) {
+		sf->sf_status = SS_COMPLETED;
+		if (!(sf->sf_param & SP_DRYRUN)) {
+			/* A complete non-dryrun pass clears the repair flags. */
+			memset(sf->sf_oi_bitmap, 0, SCRUB_OI_BITMAP_SIZE);
+			sf->sf_flags &= ~(SF_RECREATED | SF_INCONSISTENT |
+					  SF_UPGRADE | SF_AUTO);
+		}
+		sf->sf_time_last_complete = sf->sf_time_last_checkpoint;
+		sf->sf_success_count++;
+	} else if (result == 0) {
+		if (scrub->os_paused)
+			sf->sf_status = SS_PAUSED;
+		else
+			sf->sf_status = SS_STOPPED;
+	} else {
+		sf->sf_status = SS_FAILED;
+	}
+	sf->sf_run_time += cfs_duration_sec(cfs_time_current() + HALF_SEC -
+					    scrub->os_time_last_checkpoint);
+	rc = scrub_file_store(env, scrub);
+	up_write(&scrub->os_rwsem);
+
+	RETURN(rc < 0 ? rc : result);
+}
+
+/* iteration engine */
+
+/* Wait condition for the scrub thread: wake when the prefetch window has
+ * room, prior inconsistent items are queued, the iterator user is waiting,
+ * or the thread is being stopped. */
+static inline int
+osd_scrub_wakeup(struct lustre_scrub *scrub, struct osd_otable_it *it)
+{
+	spin_lock(&scrub->os_lock);
+	if (osd_scrub_has_window(it) ||
+	    !list_empty(&scrub->os_inconsistent_items) ||
+	    it->ooi_waiting || !thread_is_running(&scrub->os_thread))
+		scrub->os_waiting = 0;
+	else
+		scrub->os_waiting = 1;
+	spin_unlock(&scrub->os_lock);
+
+	return !scrub->os_waiting;
+}
+
+/* Produce the next (@fid, @oid) pair to verify: prior inconsistent items
+ * first, then the forward dnode scan via dmu_object_next(). Honours the
+ * fail-injection hooks and, in partial-speed mode, the prefetch window.
+ * Returns 0 on success or a SCRUB_NEXT_* action code / -ve error. */
+static int osd_scrub_next(const struct lu_env *env, struct osd_device *dev,
+			  struct lu_fid *fid, uint64_t *oid)
+{
+	struct l_wait_info lwi = { 0 };
+	struct lustre_scrub *scrub = &dev->od_scrub;
+	struct ptlrpc_thread *thread = &scrub->os_thread;
+	struct osd_otable_it *it = dev->od_otable_it;
+	struct lustre_mdt_attrs *lma = NULL;
+	nvlist_t *nvbuf = NULL;
+	int size = 0;
+	int rc = 0;
+	ENTRY;
+
+	/* Fault injection: delay the scan by cfs_fail_val seconds. */
+	if (OBD_FAIL_CHECK(OBD_FAIL_OSD_SCRUB_DELAY) && cfs_fail_val > 0) {
+		lwi = LWI_TIMEOUT(cfs_time_seconds(cfs_fail_val), NULL, NULL);
+		if (likely(lwi.lwi_timeout > 0)) {
+			l_wait_event(thread->t_ctl_waitq,
+				!list_empty(&scrub->os_inconsistent_items) ||
+				!thread_is_running(thread),
+				&lwi);
+			if (unlikely(!thread_is_running(thread)))
+				RETURN(SCRUB_NEXT_EXIT);
+		}
+	}
+
+	if (OBD_FAIL_CHECK(OBD_FAIL_OSD_SCRUB_CRASH)) {
+		spin_lock(&scrub->os_lock);
+		thread_set_flags(thread, SVC_STOPPING);
+		spin_unlock(&scrub->os_lock);
+		RETURN(SCRUB_NEXT_CRASH);
+	}
+
+	if (OBD_FAIL_CHECK(OBD_FAIL_OSD_SCRUB_FATAL))
+		RETURN(SCRUB_NEXT_FATAL);
+
+again:
+	if (nvbuf) {
+		nvlist_free(nvbuf);
+		nvbuf = NULL;
+		lma = NULL;
+	}
+
+	/* Prior inconsistent items (reported by RPC handlers) take
+	 * precedence over the forward scan. */
+	if (!list_empty(&scrub->os_inconsistent_items)) {
+		spin_lock(&scrub->os_lock);
+		if (likely(!list_empty(&scrub->os_inconsistent_items))) {
+			struct osd_inconsistent_item *oii;
+
+			oii = list_entry(scrub->os_inconsistent_items.next,
+				struct osd_inconsistent_item, oii_list);
+			*fid = oii->oii_cache.oic_fid;
+			*oid = oii->oii_cache.oic_dnode;
+			scrub->os_in_prior = 1;
+			spin_unlock(&scrub->os_lock);
+
+			GOTO(out, rc = 0);
+		}
+		spin_unlock(&scrub->os_lock);
+	}
+
+	if (!scrub->os_full_speed && !osd_scrub_has_window(it)) {
+		memset(&lwi, 0, sizeof(lwi));
+		l_wait_event(thread->t_ctl_waitq,
+			     osd_scrub_wakeup(scrub, it),
+			     &lwi);
+	}
+
+	if (unlikely(!thread_is_running(thread)))
+		GOTO(out, rc = SCRUB_NEXT_EXIT);
+
+	rc = -dmu_object_next(dev->od_os, &scrub->os_pos_current, B_FALSE, 0);
+	if (rc)
+		GOTO(out, rc = (rc == -ESRCH ? SCRUB_NEXT_BREAK : rc));
+
+	rc = __osd_xattr_load_by_oid(dev, scrub->os_pos_current, &nvbuf);
+	if (rc == -ENOENT || rc == -EEXIST || rc == -ENODATA)
+		goto again;
+
+	if (rc)
+		GOTO(out, rc);
+
+	LASSERT(nvbuf != NULL);
+	rc = -nvlist_lookup_byte_array(nvbuf, XATTR_NAME_LMA,
+				       (uchar_t **)&lma, &size);
+	if (!rc) {
+		lustre_lma_swab(lma);
+		/* Skip objects excluded from OI and agent objects. */
+		if (likely(!(lma->lma_compat & LMAC_NOT_IN_OI) &&
+			   !(lma->lma_incompat & LMAI_AGENT))) {
+			*fid = lma->lma_self_fid;
+			*oid = scrub->os_pos_current;
+
+			GOTO(out, rc = 0);
+		}
+	}
+
+	if (!scrub->os_full_speed) {
+		spin_lock(&scrub->os_lock);
+		it->ooi_prefetched++;
+		if (it->ooi_waiting) {
+			it->ooi_waiting = 0;
+			wake_up_all(&thread->t_ctl_waitq);
+		}
+		spin_unlock(&scrub->os_lock);
+	}
+
+	goto again;
+
+out:
+	if (nvbuf)
+		nvlist_free(nvbuf);
+
+	return rc;
+}
+
+/* Verify/repair one (@fid, @oid) pair, advance the prefetch accounting in
+ * partial-speed mode, and take a periodic checkpoint. Checkpoint failures
+ * are logged but do not stop the scrub. */
+static int osd_scrub_exec(const struct lu_env *env, struct osd_device *dev,
+			  const struct lu_fid *fid, uint64_t oid, int rc)
+{
+	struct lustre_scrub *scrub = &dev->od_scrub;
+	struct ptlrpc_thread *thread = &scrub->os_thread;
+	struct osd_otable_it *it = dev->od_otable_it;
+
+	rc = osd_scrub_check_update(env, dev, fid, oid, rc);
+	if (!scrub->os_in_prior) {
+		if (!scrub->os_full_speed) {
+			spin_lock(&scrub->os_lock);
+			it->ooi_prefetched++;
+			if (it->ooi_waiting) {
+				it->ooi_waiting = 0;
+				wake_up_all(&thread->t_ctl_waitq);
+			}
+			spin_unlock(&scrub->os_lock);
+		}
+	} else {
+		scrub->os_in_prior = 0;
+	}
+
+	if (rc)
+		return rc;
+
+	rc = scrub_checkpoint(env, scrub);
+	if (rc) {
+		CDEBUG(D_LFSCK, "%s: fail to checkpoint, pos = %llu: "
+		       "rc = %d\n", scrub->os_name, scrub->os_pos_current, rc);
+		/* Continue, as long as the scrub itself can go ahead. */
+	}
+
+	return 0;
+}
+
+/* Body of the OI scrub kthread: prepare, optionally wait for the otable
+ * iterator user, loop over osd_scrub_next()/osd_scrub_exec(), then post
+ * the result and drain any leftover inconsistent items. */
+static int osd_scrub_main(void *args)
+{
+	struct lu_env env;
+	struct osd_device *dev = (struct osd_device *)args;
+	struct lustre_scrub *scrub = &dev->od_scrub;
+	struct ptlrpc_thread *thread = &scrub->os_thread;
+	struct lu_fid *fid;
+	uint64_t oid;
+	int rc = 0;
+	ENTRY;
+
+	rc = lu_env_init(&env, LCT_LOCAL | LCT_DT_THREAD);
+	if (rc) {
+		CDEBUG(D_LFSCK, "%s: OI scrub fail to init env: rc = %d\n",
+		       scrub->os_name, rc);
+		GOTO(noenv, rc);
+	}
+
+	rc = osd_scrub_prep(&env, dev);
+	if (rc) {
+		CDEBUG(D_LFSCK, "%s: OI scrub fail to scrub prep: rc = %d\n",
+		       scrub->os_name, rc);
+		GOTO(out, rc);
+	}
+
+	if (!scrub->os_full_speed) {
+		/* Partial speed: wait until the otable iterator user is
+		 * ready, then start from the user's position. */
+		struct l_wait_info lwi = { 0 };
+		struct osd_otable_it *it = dev->od_otable_it;
+
+		l_wait_event(thread->t_ctl_waitq,
+			     it->ooi_user_ready || !thread_is_running(thread),
+			     &lwi);
+		if (unlikely(!thread_is_running(thread)))
+			GOTO(post, rc = 0);
+
+		scrub->os_pos_current = it->ooi_pos;
+	}
+
+	CDEBUG(D_LFSCK, "%s: OI scrub start, flags = 0x%x, pos = %llu\n",
+	       scrub->os_name, scrub->os_start_flags,
+	       scrub->os_pos_current);
+
+	fid = &osd_oti_get(&env)->oti_fid;
+	while (!rc && thread_is_running(thread)) {
+		rc = osd_scrub_next(&env, dev, fid, &oid);
+		switch (rc) {
+		case SCRUB_NEXT_EXIT:
+			GOTO(post, rc = 0);
+		case SCRUB_NEXT_CRASH:
+			spin_lock(&scrub->os_lock);
+			thread_set_flags(&scrub->os_thread, SVC_STOPPING);
+			spin_unlock(&scrub->os_lock);
+			GOTO(out, rc = -EINVAL);
+		case SCRUB_NEXT_FATAL:
+			GOTO(post, rc = -EINVAL);
+		case SCRUB_NEXT_BREAK:
+			GOTO(post, rc = 1);
+		}
+
+		rc = osd_scrub_exec(&env, dev, fid, oid, rc);
+	}
+
+	GOTO(post, rc);
+
+post:
+	rc = osd_scrub_post(&env, dev, rc);
+	CDEBUG(D_LFSCK, "%s: OI scrub: stop, pos = %llu: rc = %d\n",
+	       scrub->os_name, scrub->os_pos_current, rc);
+
+out:
+	while (!list_empty(&scrub->os_inconsistent_items)) {
+		struct osd_inconsistent_item *oii;
+
+		oii = list_entry(scrub->os_inconsistent_items.next,
+				 struct osd_inconsistent_item, oii_list);
+		list_del_init(&oii->oii_list);
+		OBD_FREE_PTR(oii);
+	}
+
+	lu_env_fini(&env);
+
+noenv:
+	spin_lock(&scrub->os_lock);
+	thread_set_flags(thread, SVC_STOPPED);
+	wake_up_all(&thread->t_ctl_waitq);
+	spin_unlock(&scrub->os_lock);
+	return rc;
+}
+
+/* initial OI scrub */
+
+struct osd_lf_map;
+
+/* Callback to handle one directory entry found during initial OI scrub. */
+typedef int (*handle_dirent_t)(const struct lu_env *, struct osd_device *,
+			       const char *, uint64_t, uint64_t,
+			       enum osd_lf_flags, bool);
+static int osd_ios_varfid_hd(const struct lu_env *, struct osd_device *,
+			     const char *, uint64_t, uint64_t,
+			     enum osd_lf_flags, bool);
+static int osd_ios_uld_hd(const struct lu_env *, struct osd_device *,
+			  const char *, uint64_t, uint64_t,
+			  enum osd_lf_flags, bool);
+
+/* Callback to scan one directory during initial OI scrub. */
+typedef int (*scan_dir_t)(const struct lu_env *, struct osd_device *,
+			  uint64_t, handle_dirent_t, enum osd_lf_flags);
+static int osd_ios_general_sd(const struct lu_env *, struct osd_device *,
+			      uint64_t, handle_dirent_t, enum osd_lf_flags);
+static int osd_ios_ROOT_sd(const struct lu_env *, struct osd_device *,
+			   uint64_t, handle_dirent_t, enum osd_lf_flags);
+
+/* Describes one known local file/directory and how to scan it. */
+struct osd_lf_map {
+	char *olm_name;				/* local file name */
+	struct lu_fid olm_fid;			/* well-known FID, if any */
+	enum osd_lf_flags olm_flags;
+	scan_dir_t olm_scan_dir;		/* how to scan its sub-items */
+	handle_dirent_t olm_handle_dirent;	/* how to handle one entry */
+};
+
+/* Known local files handled by the initial OI scrub; add newly introduced
+ * local files to this list in the future. Terminated by a NULL olm_name. */
+static const struct osd_lf_map osd_lf_maps[] = {
+	/* CONFIGS */
+	{
+		.olm_name		= MOUNT_CONFIGS_DIR,
+		.olm_fid		= {
+			.f_seq	= FID_SEQ_LOCAL_FILE,
+			.f_oid	= MGS_CONFIGS_OID,
+		},
+		.olm_flags		= OLF_SCAN_SUBITEMS,
+		.olm_scan_dir		= osd_ios_general_sd,
+		.olm_handle_dirent	= osd_ios_varfid_hd,
+	},
+
+	/* NIDTBL_VERSIONS */
+	{
+		.olm_name		= MGS_NIDTBL_DIR,
+		.olm_flags		= OLF_SCAN_SUBITEMS,
+		.olm_scan_dir		= osd_ios_general_sd,
+		.olm_handle_dirent	= osd_ios_varfid_hd,
+	},
+
+	/* PENDING */
+	{
+		.olm_name		= "PENDING",
+	},
+
+	/* ROOT */
+	{
+		.olm_name		= "ROOT",
+		.olm_fid		= {
+			.f_seq	= FID_SEQ_ROOT,
+			.f_oid	= FID_OID_ROOT,
+		},
+		.olm_flags		= OLF_SCAN_SUBITEMS,
+		.olm_scan_dir		= osd_ios_ROOT_sd,
+	},
+
+	/* fld */
+	{
+		.olm_name		= "fld",
+		.olm_fid		= {
+			.f_seq	= FID_SEQ_LOCAL_FILE,
+			.f_oid	= FLD_INDEX_OID,
+		},
+	},
+
+	/* changelog_catalog */
+	{
+		.olm_name		= CHANGELOG_CATALOG,
+	},
+
+	/* changelog_users */
+	{
+		.olm_name		= CHANGELOG_USERS,
+	},
+
+	/* quota_master */
+	{
+		.olm_name		= QMT_DIR,
+		.olm_flags		= OLF_SCAN_SUBITEMS,
+		.olm_scan_dir		= osd_ios_general_sd,
+		.olm_handle_dirent	= osd_ios_varfid_hd,
+	},
+
+	/* quota_slave */
+	{
+		.olm_name		= QSD_DIR,
+		.olm_flags		= OLF_SCAN_SUBITEMS,
+		.olm_scan_dir		= osd_ios_general_sd,
+		.olm_handle_dirent	= osd_ios_varfid_hd,
+	},
+
+	/* LFSCK */
+	{
+		.olm_name		= LFSCK_DIR,
+		.olm_flags		= OLF_SCAN_SUBITEMS,
+		.olm_scan_dir		= osd_ios_general_sd,
+		.olm_handle_dirent	= osd_ios_varfid_hd,
+	},
+
+	/* lfsck_bookmark */
+	{
+		.olm_name		= LFSCK_BOOKMARK,
+	},
+
+	/* lfsck_layout */
+	{
+		.olm_name		= LFSCK_LAYOUT,
+	},
+
+	/* lfsck_namespace */
+	{
+		.olm_name		= LFSCK_NAMESPACE,
+	},
+
+	/* OSP update logs update_log{_dir} use f_seq = FID_SEQ_UPDATE_LOG{_DIR}
+	 * and f_oid = index for their log files. See lu_update_log{_dir}_fid()
+	 * for more details. */
+
+	/* update_log */
+	{
+		.olm_name		= "update_log",
+		.olm_fid		= {
+			.f_seq	= FID_SEQ_UPDATE_LOG,
+		},
+		.olm_flags		= OLF_IDX_IN_FID,
+	},
+
+	/* update_log_dir */
+	{
+		.olm_name		= "update_log_dir",
+		.olm_fid		= {
+			.f_seq	= FID_SEQ_UPDATE_LOG_DIR,
+		},
+		.olm_flags		= OLF_SCAN_SUBITEMS | OLF_IDX_IN_FID,
+		.olm_scan_dir		= osd_ios_general_sd,
+		.olm_handle_dirent	= osd_ios_uld_hd,
+	},
+
+	/* hsm_actions */
+	{
+		.olm_name		= HSM_ACTIONS,
+	},
+
+	/* nodemap */
+	{
+		.olm_name		= LUSTRE_NODEMAP_NAME,
+	},
+
+	{
+		.olm_name		= NULL
+	}
+};
+
+/* Known files under .lustre/; add newly introduced ones to this list in the
+ * future. Terminated by a NULL olm_name. */
+static const struct osd_lf_map osd_dl_maps[] = {
+	/* .lustre/fid */
+	{
+		.olm_name		= "fid",
+		.olm_fid		= {
+			.f_seq	= FID_SEQ_DOT_LUSTRE,
+			.f_oid	= FID_OID_DOT_LUSTRE_OBF,
+		},
+	},
+
+	/* .lustre/lost+found */
+	{
+		.olm_name		= "lost+found",
+		.olm_fid		= {
+			.f_seq	= FID_SEQ_DOT_LUSTRE,
+			.f_oid	= FID_OID_DOT_LUSTRE_LPF,
+		},
+	},
+
+	{
+		.olm_name		= NULL
+	}
+};
+
+/* One directory queued for later scanning, linked on od_ios_list. */
+struct osd_ios_item {
+	struct list_head oii_list;		/* link on od_ios_list */
+	uint64_t	 oii_parent;		/* dnode of the directory */
+	enum osd_lf_flags oii_flags;
+	scan_dir_t	 oii_scan_dir;		/* scanner for this dir */
+	handle_dirent_t	 oii_handle_dirent;	/* handler for its entries */
+};
+
+/* Queue directory @parent on od_ios_list for later scanning during the
+ * initial OI scrub. Returns 0 or -ENOMEM. */
+static int osd_ios_new_item(struct osd_device *dev, uint64_t parent,
+			    enum osd_lf_flags flags, scan_dir_t scan_dir,
+			    handle_dirent_t handle_dirent)
+{
+	struct osd_ios_item *item;
+
+	OBD_ALLOC_PTR(item);
+	if (!item) {
+		CWARN("%s: initial OI scrub failed to add item for %llu\n",
+		      osd_name(dev), parent);
+		return -ENOMEM;
+	}
+
+	INIT_LIST_HEAD(&item->oii_list);
+	item->oii_parent = parent;
+	item->oii_flags = flags;
+	item->oii_scan_dir = scan_dir;
+	item->oii_handle_dirent = handle_dirent;
+	list_add_tail(&item->oii_list, &dev->od_ios_list);
+
+	return 0;
+}
+
+/**
+ * verify FID-in-LMA and OI entry for one object
+ *
+ * ios: Initial OI Scrub.
+ *
+ * Loads the object's LMA to find its self FID, falling back to @fid when
+ * no LMA is present (possibly with f_oid replaced by od_index under
+ * OLF_IDX_IN_FID). Then checks the OI mapping and inserts/updates it when
+ * missing or stale, marking the scrub file SF_RECREATED/SF_INCONSISTENT.
+ */
+static int osd_ios_scan_one(const struct lu_env *env, struct osd_device *dev,
+			    const struct lu_fid *fid, uint64_t parent,
+			    uint64_t oid, const char *name,
+			    enum osd_lf_flags flags)
+{
+	struct lustre_scrub *scrub = &dev->od_scrub;
+	struct scrub_file *sf = &scrub->os_file;
+	struct lustre_mdt_attrs *lma = NULL;
+	nvlist_t *nvbuf = NULL;
+	struct lu_fid tfid;
+	uint64_t oid2 = 0;
+	__u64 flag = 0;
+	int size = 0;
+	int op = 0;
+	int rc;
+	ENTRY;
+
+	rc = __osd_xattr_load_by_oid(dev, oid, &nvbuf);
+	if (unlikely(rc == -ENOENT || rc == -EEXIST))
+		RETURN(0);
+
+	if (rc && rc != -ENODATA) {
+		CWARN("%s: initial OI scrub failed to get lma for %llu: "
+		      "rc = %d\n", osd_name(dev), oid, rc);
+
+		RETURN(rc);
+	}
+
+	if (!rc) {
+		LASSERT(nvbuf != NULL);
+		rc = -nvlist_lookup_byte_array(nvbuf, XATTR_NAME_LMA,
+					       (uchar_t **)&lma, &size);
+		if (rc || size == 0) {
+			LASSERT(lma == NULL);
+			rc = -ENODATA;
+		} else {
+			LASSERTF(lma != NULL, "corrupted LMA, size %d\n", size);
+			lustre_lma_swab(lma);
+			if (lma->lma_compat & LMAC_NOT_IN_OI) {
+				/* Objects out of OI need no mapping. */
+				nvlist_free(nvbuf);
+				RETURN(0);
+			}
+
+			tfid = lma->lma_self_fid;
+		}
+		nvlist_free(nvbuf);
+	}
+
+	if (rc == -ENODATA) {
+		if (!fid) {
+			/* Skip the object without FID-in-LMA */
+			CDEBUG(D_LFSCK, "%s: %llu has no FID-in-LMA, skip it\n",
+			       osd_name(dev), oid);
+
+			RETURN(0);
+		}
+
+		LASSERT(!fid_is_zero(fid));
+
+		tfid = *fid;
+		if (flags & OLF_IDX_IN_FID) {
+			LASSERT(dev->od_index >= 0);
+
+			tfid.f_oid = dev->od_index;
+		}
+	}
+
+	rc = osd_fid_lookup(env, dev, &tfid, &oid2);
+	if (rc) {
+		if (rc != -ENOENT) {
+			CWARN("%s: initial OI scrub failed to lookup fid for "
+			      DFID"=>%llu: rc = %d\n",
+			      osd_name(dev), PFID(&tfid), oid, rc);
+
+			RETURN(rc);
+		}
+
+		flag = SF_RECREATED;
+		op = DTO_INDEX_INSERT;
+	} else {
+		if (oid == oid2)
+			RETURN(0);
+
+		flag = SF_INCONSISTENT;
+		op = DTO_INDEX_UPDATE;
+	}
+
+	/* Record the inconsistency kind in the scrub file before repairing. */
+	if (!(sf->sf_flags & flag)) {
+		scrub_file_reset(scrub, dev->od_uuid, flag);
+		rc = scrub_file_store(env, scrub);
+		if (rc)
+			RETURN(rc);
+	}
+
+	rc = osd_scrub_refresh_mapping(env, dev, &tfid, oid, op, true, name);
+
+	RETURN(rc > 0 ? 0 : rc);
+}
+
+/*
+ * Dirent handler for directories whose entries have variable FIDs (the
+ * FID comes from each object's own LMA, so NULL is passed as @fid).
+ * Sub-directories are queued for recursive scanning with this same handler.
+ */
+static int osd_ios_varfid_hd(const struct lu_env *env, struct osd_device *dev,
+			     const char *name, uint64_t parent, uint64_t oid,
+			     enum osd_lf_flags flags, bool is_dir)
+{
+	int rc;
+	ENTRY;
+
+	rc = osd_ios_scan_one(env, dev, NULL, parent, oid, name, 0);
+	if (!rc && is_dir)
+		rc = osd_ios_new_item(dev, oid, flags, osd_ios_general_sd,
+				      osd_ios_varfid_hd);
+
+	RETURN(rc);
+}
+
+/*
+ * Dirent handler for directories whose entry names encode the FID in
+ * "[seq:oid:ver]" (DFID) format, e.g. update-log dirs. The FID parsed
+ * from the name is verified against the OI.
+ */
+static int osd_ios_uld_hd(const struct lu_env *env, struct osd_device *dev,
+			  const char *name, uint64_t parent, uint64_t oid,
+			  enum osd_lf_flags flags, bool is_dir)
+{
+	struct lu_fid tfid;
+	int rc;
+	ENTRY;
+
+	/* skip any non-DFID format name */
+	if (name[0] != '[')
+		RETURN(0);
+
+	/* skip the start '[' */
+	sscanf(&name[1], SFID, RFID(&tfid));
+	if (fid_is_sane(&tfid))
+		rc = osd_ios_scan_one(env, dev, &tfid, parent, oid, name, 0);
+	else
+		rc = -EIO;
+
+	RETURN(rc);
+}
+
+/*
+ * General scanner for the directories except /ROOT during initial OI scrub.
+ * It scans the name entries under the given directory one by one. For each
+ * entry, verifies its OI mapping via the given @handle_dirent.
+ */
+static int osd_ios_general_sd(const struct lu_env *env, struct osd_device *dev,
+			      uint64_t parent, handle_dirent_t handle_dirent,
+			      enum osd_lf_flags flags)
+{
+	struct osd_thread_info *info = osd_oti_get(env);
+	struct luz_direntry *zde = &info->oti_zde;
+	zap_attribute_t *za = &info->oti_za;
+	zap_cursor_t *zc = &info->oti_zc;
+	int rc;
+	ENTRY;
+
+	zap_cursor_init_serialized(zc, dev->od_os, parent, 0);
+	rc = -zap_cursor_retrieve(zc, za);
+	/* -ENOENT at the initial position means no entry there yet;
+	 * step forward and let the loop below decide. Any other error
+	 * aborts the scan of this directory. */
+	if (rc == -ENOENT)
+		zap_cursor_advance(zc);
+	else if (rc)
+		GOTO(log, rc);
+
+	while (1) {
+		rc = -zap_cursor_retrieve(zc, za);
+		/* -ENOENT here means the cursor ran past the last entry:
+		 * normal end of the directory, report success. */
+		if (rc)
+			GOTO(log, rc = (rc == -ENOENT ? 0 : rc));
+
+		/* skip the entry started with '.' */
+		if (likely(za->za_name[0] != '.')) {
+			rc = osd_zap_lookup(dev, parent, NULL, za->za_name,
+					za->za_integer_length,
+					sizeof(*zde) / za->za_integer_length,
+					(void *)zde);
+			if (rc) {
+				/* Lookup failure for one entry is logged
+				 * but does not stop the directory scan. */
+				CWARN("%s: initial OI scrub failed to lookup "
+				      "%s under %llu: rc = %d\n",
+				      osd_name(dev), za->za_name, parent, rc);
+				continue;
+			}
+
+			rc = handle_dirent(env, dev, za->za_name, parent,
+					zde->lzd_reg.zde_dnode, flags,
+					S_ISDIR(DTTOIF(zde->lzd_reg.zde_type)) ?
+					true : false);
+			CDEBUG(D_LFSCK, "%s: initial OI scrub handled %s under "
+			       "%llu: rc = %d\n",
+			       osd_name(dev), za->za_name, parent, rc);
+		}
+
+		zap_cursor_advance(zc);
+	}
+
+log:
+	if (rc)
+		CWARN("%s: initial OI scrub failed to scan the directory %llu: "
+		      "rc = %d\n", osd_name(dev), parent, rc);
+	zap_cursor_fini(zc);
+
+	return rc;
+}
+
+/*
+ * The scanner for the /ROOT directory. Not all items under /ROOT are
+ * scanned during the initial OI scrub; only the .lustre directory and
+ * the sub-items under .lustre are handled.
+ *
+ * Returns 0 on success (including a missing .lustre, which is re-created
+ * later by the MDT); otherwise the last scan failure is returned.
+ */
+static int osd_ios_ROOT_sd(const struct lu_env *env, struct osd_device *dev,
+			   uint64_t parent, handle_dirent_t handle_dirent,
+			   enum osd_lf_flags flags)
+{
+	struct luz_direntry *zde = &osd_oti_get(env)->oti_zde;
+	const struct osd_lf_map *map;
+	uint64_t oid;
+	int rc;
+	int rc1 = 0;
+	ENTRY;
+
+	rc = osd_zap_lookup(dev, parent, NULL, dot_lustre_name, 8,
+			    sizeof(*zde) / 8, (void *)zde);
+	if (rc == -ENOENT) {
+		/* The .lustre directory is lost. That is not fatal. It can
+		 * be re-created in the subsequent MDT start processing. */
+		RETURN(0);
+	}
+
+	if (rc) {
+		CWARN("%s: initial OI scrub failed to find .lustre: "
+		      "rc = %d\n", osd_name(dev), rc);
+
+		RETURN(rc);
+	}
+
+	oid = zde->lzd_reg.zde_dnode;
+	rc = osd_ios_scan_one(env, dev, &LU_DOT_LUSTRE_FID, parent, oid,
+			      dot_lustre_name, 0);
+	if (rc)
+		RETURN(rc);
+
+	/* Verify each known file under .lustre (see osd_dl_maps). */
+	for (map = osd_dl_maps; map->olm_name; map++) {
+		rc = osd_zap_lookup(dev, oid, NULL, map->olm_name, 8,
+				    sizeof(*zde) / 8, (void *)zde);
+		if (rc) {
+			if (rc != -ENOENT)
+				CWARN("%s: initial OI scrub failed to find "
+				      "the entry %s under .lustre: rc = %d\n",
+				      osd_name(dev), map->olm_name, rc);
+			else if (!fid_is_zero(&map->olm_fid))
+				/* Try to remove the stale OI mapping. */
+				osd_scrub_refresh_mapping(env, dev,
+						&map->olm_fid, 0,
+						DTO_INDEX_DELETE, true,
+						map->olm_name);
+			continue;
+		}
+
+		rc = osd_ios_scan_one(env, dev, &map->olm_fid, oid,
+				      zde->lzd_reg.zde_dnode, map->olm_name,
+				      map->olm_flags);
+		if (rc)
+			rc1 = rc;
+	}
+
+	RETURN(rc1);
+}
+
+/*
+ * Run the initial (boot-time) OI scrub: verify the OI mappings of the
+ * known server-local files in osd_lf_maps, then drain the queue of
+ * sub-directories (od_ios_list) populated by the handlers.
+ */
+static void osd_initial_OI_scrub(const struct lu_env *env,
+				 struct osd_device *dev)
+{
+	struct luz_direntry *zde = &osd_oti_get(env)->oti_zde;
+	const struct osd_lf_map *map;
+	int rc;
+	ENTRY;
+
+	for (map = osd_lf_maps; map->olm_name; map++) {
+		rc = osd_zap_lookup(dev, dev->od_root, NULL, map->olm_name, 8,
+				    sizeof(*zde) / 8, (void *)zde);
+		if (rc) {
+			if (rc != -ENOENT)
+				CWARN("%s: initial OI scrub failed "
+				      "to find the entry %s: rc = %d\n",
+				      osd_name(dev), map->olm_name, rc);
+			else if (!fid_is_zero(&map->olm_fid))
+				/* Try to remove the stale OI mapping. */
+				osd_scrub_refresh_mapping(env, dev,
+						&map->olm_fid, 0,
+						DTO_INDEX_DELETE, true,
+						map->olm_name);
+			continue;
+		}
+
+		rc = osd_ios_scan_one(env, dev, &map->olm_fid, dev->od_root,
+				      zde->lzd_reg.zde_dnode, map->olm_name,
+				      map->olm_flags);
+		if (!rc && map->olm_flags & OLF_SCAN_SUBITEMS)
+			osd_ios_new_item(dev, zde->lzd_reg.zde_dnode,
+					 map->olm_flags, map->olm_scan_dir,
+					 map->olm_handle_dirent);
+	}
+
+	/* Process queued directories; handlers may queue more while we
+	 * iterate, so loop until the list is empty. */
+	while (!list_empty(&dev->od_ios_list)) {
+		struct osd_ios_item *item;
+
+		item = list_entry(dev->od_ios_list.next,
+				  struct osd_ios_item, oii_list);
+		list_del_init(&item->oii_list);
+		item->oii_scan_dir(env, dev, item->oii_parent,
+				   item->oii_handle_dirent, item->oii_flags);
+		OBD_FREE_PTR(item);
+	}
+
+	EXIT;
+}
+
+/* OI scrub start/stop */
+
+/*
+ * Start the OI scrub thread on @dev with the given SS_* @flags.
+ * Already-running scrub (-EALREADY) is treated as success.
+ * Returns -EROFS on a read-only device.
+ */
+int osd_scrub_start(const struct lu_env *env, struct osd_device *dev,
+		    __u32 flags)
+{
+	int rc;
+	ENTRY;
+
+	if (dev->od_dt_dev.dd_rdonly)
+		RETURN(-EROFS);
+
+	/* od_otable_sem: prevent concurrent start/stop */
+	down(&dev->od_otable_sem);
+	rc = scrub_start(osd_scrub_main, &dev->od_scrub, dev, flags);
+	up(&dev->od_otable_sem);
+
+	RETURN(rc == -EALREADY ? 0 : rc);
+}
+
+/*
+ * Stop the OI scrub thread on @dev. os_paused is set first so the
+ * checkpoint records a paused (resumable) state rather than a failure.
+ */
+static void osd_scrub_stop(struct osd_device *dev)
+{
+	struct lustre_scrub *scrub = &dev->od_scrub;
+	ENTRY;
+
+	/* od_otable_sem: prevent concurrent start/stop */
+	down(&dev->od_otable_sem);
+	scrub->os_paused = 1;
+	scrub_stop(scrub);
+	up(&dev->od_otable_sem);
+
+	EXIT;
+}
+
+/* OI scrub setup/cleanup */
+
+/* Name of the on-disk scrub status file under the OSD root. */
+static const char osd_scrub_name[] = "OI_scrub";
+
+/*
+ * Set up OI scrub for @dev at mount time:
+ * - find or create the OI_scrub status object and load the scrub file;
+ * - reset the scrub file on UUID change (device was copied/restored) or
+ *   mark it SS_CRASHED if a previous scan did not complete;
+ * - initialize the OI tables, run the initial OI scrub, and auto-start
+ *   a full scrub when the recorded state requires it.
+ *
+ * Returns 0 on success; on error the status object and OI tables that
+ * were set up are torn down again.
+ */
+int osd_scrub_setup(const struct lu_env *env, struct osd_device *dev)
+{
+	struct osd_thread_info *info = osd_oti_get(env);
+	struct lustre_scrub *scrub = &dev->od_scrub;
+	struct scrub_file *sf = &scrub->os_file;
+	struct lu_fid *fid = &info->oti_fid;
+	struct dt_object *obj;
+	uint64_t oid;
+	int rc = 0;
+	bool dirty = false;
+	ENTRY;
+
+	/* Use the dataset GUID as the device UUID for the scrub file. */
+	memcpy(dev->od_uuid,
+	       &dsl_dataset_phys(dev->od_os->os_dsl_dataset)->ds_guid,
+	       sizeof(dsl_dataset_phys(dev->od_os->os_dsl_dataset)->ds_guid));
+	memset(&dev->od_scrub, 0, sizeof(struct lustre_scrub));
+	init_waitqueue_head(&scrub->os_thread.t_ctl_waitq);
+	init_rwsem(&scrub->os_rwsem);
+	spin_lock_init(&scrub->os_lock);
+	INIT_LIST_HEAD(&scrub->os_inconsistent_items);
+	scrub->os_name = osd_name(dev);
+
+	/* 'What the @fid is' is not important, because the object
+	 * has no OI mapping, and only is visible inside the OSD.*/
+	fid->f_seq = FID_SEQ_IGIF_MAX;
+	if (dev->od_is_ost)
+		fid->f_oid = ((1 << 31) | dev->od_index) + 1;
+	else
+		fid->f_oid = dev->od_index + 1;
+	fid->f_ver = 0;
+	rc = osd_obj_find_or_create(env, dev, dev->od_root,
+				    osd_scrub_name, &oid, fid, false);
+	if (rc)
+		RETURN(rc);
+
+	rc = osd_idc_find_and_init_with_oid(env, dev, fid, oid);
+	if (rc)
+		RETURN(rc);
+
+	obj = lu2dt(lu_object_find_slice(env, osd2lu_dev(dev), fid, NULL));
+	if (IS_ERR_OR_NULL(obj))
+		RETURN(obj ? PTR_ERR(obj) : -ENOENT);
+
+	scrub->os_obj = obj;
+	rc = scrub_file_load(env, scrub);
+	if (rc == -ENOENT || rc == -EFAULT) {
+		/* Missing or corrupted scrub file: start fresh. */
+		scrub_file_init(scrub, dev->od_uuid);
+		dirty = true;
+	} else if (rc < 0) {
+		GOTO(cleanup_obj, rc);
+	} else {
+		if (memcmp(sf->sf_uuid, dev->od_uuid, 16) != 0) {
+			struct obd_uuid *old_uuid;
+			struct obd_uuid *new_uuid;
+
+			OBD_ALLOC_PTR(old_uuid);
+			OBD_ALLOC_PTR(new_uuid);
+			if (!old_uuid || !new_uuid) {
+				CERROR("%s: UUID has been changed, but "
+				       "failed to allocate RAM for report\n",
+				       osd_name(dev));
+			} else {
+				class_uuid_unparse(sf->sf_uuid, old_uuid);
+				class_uuid_unparse(dev->od_uuid, new_uuid);
+				CDEBUG(D_LFSCK, "%s: UUID has been changed "
+				       "from %s to %s\n", osd_name(dev),
+				       old_uuid->uuid, new_uuid->uuid);
+			}
+			/* Device identity changed: all OI data is suspect. */
+			scrub_file_reset(scrub, dev->od_uuid, SF_INCONSISTENT);
+			dirty = true;
+			if (old_uuid)
+				OBD_FREE_PTR(old_uuid);
+			if (new_uuid)
+				OBD_FREE_PTR(new_uuid);
+		} else if (sf->sf_status == SS_SCANNING) {
+			/* Scan was interrupted by a crash/power loss. */
+			sf->sf_status = SS_CRASHED;
+			dirty = true;
+		}
+
+		/* OI count must be a power of two; repair if corrupted. */
+		if ((sf->sf_oi_count & (sf->sf_oi_count - 1)) != 0) {
+			LCONSOLE_WARN("%s: invalid oi count %d, set it to %d\n",
+				      osd_name(dev), sf->sf_oi_count,
+				      osd_oi_count);
+			sf->sf_oi_count = osd_oi_count;
+			dirty = true;
+		}
+	}
+
+	if (sf->sf_pos_last_checkpoint != 0)
+		scrub->os_pos_current = sf->sf_pos_last_checkpoint + 1;
+	else
+		scrub->os_pos_current = 1;
+
+	if (dirty) {
+		rc = scrub_file_store(env, scrub);
+		if (rc)
+			GOTO(cleanup_obj, rc);
+	}
+
+	/* Initialize OI files. */
+	rc = osd_oi_init(env, dev);
+	if (rc < 0)
+		GOTO(cleanup_obj, rc);
+
+	if (!dev->od_dt_dev.dd_rdonly)
+		osd_initial_OI_scrub(env, dev);
+
+	/* Auto-restart a full scrub when a previous run was paused,
+	 * crashed mid-repair, or left unrepaired inconsistencies. */
+	if (!dev->od_dt_dev.dd_rdonly &&
+	    dev->od_auto_scrub_interval != AS_NEVER &&
+	    ((sf->sf_status == SS_PAUSED) ||
+	     (sf->sf_status == SS_CRASHED &&
+	      sf->sf_flags & (SF_RECREATED | SF_INCONSISTENT |
+			      SF_UPGRADE | SF_AUTO)) ||
+	     (sf->sf_status == SS_INIT &&
+	      sf->sf_flags & (SF_RECREATED | SF_INCONSISTENT |
+			      SF_UPGRADE))))
+		rc = osd_scrub_start(env, dev, SS_AUTO_FULL);
+
+	if (rc)
+		GOTO(cleanup_oi, rc);
+
+	RETURN(0);
+
+cleanup_oi:
+	osd_oi_fini(env, dev);
+cleanup_obj:
+	dt_object_put_nocache(env, scrub->os_obj);
+	scrub->os_obj = NULL;
+
+	return rc;
+}
+
+/*
+ * Tear down OI scrub state at umount: stop the scrub thread, release the
+ * OI_scrub status object, and finalize the OI tables. Must not be called
+ * while an otable iterator is active (asserted).
+ */
+void osd_scrub_cleanup(const struct lu_env *env, struct osd_device *dev)
+{
+	struct lustre_scrub *scrub = &dev->od_scrub;
+
+	LASSERT(!dev->od_otable_it);
+
+	if (scrub->os_obj) {
+		osd_scrub_stop(dev);
+		dt_object_put_nocache(env, scrub->os_obj);
+		scrub->os_obj = NULL;
+	}
+
+	if (dev->od_oi_table)
+		osd_oi_fini(env, dev);
+}
+
+/* object table based iteration APIs */
+
+/*
+ * Create the object-table iterator used by LFSCK to walk all dnodes.
+ * The @attr word packs DOIF_* flags (upper bits) and DOIV_* valid bits;
+ * the scrub thread is (re)started to drive the iteration. Only one
+ * iterator may exist per device (-EALREADY otherwise).
+ */
+static struct dt_it *osd_otable_it_init(const struct lu_env *env,
+					struct dt_object *dt, __u32 attr)
+{
+	enum dt_otable_it_flags flags = attr >> DT_OTABLE_IT_FLAGS_SHIFT;
+	enum dt_otable_it_valid valid = attr & ~DT_OTABLE_IT_FLAGS_MASK;
+	struct osd_device *dev = osd_dev(dt->do_lu.lo_dev);
+	struct lustre_scrub *scrub = &dev->od_scrub;
+	struct osd_otable_it *it;
+	__u32 start = 0;
+	int rc;
+	ENTRY;
+
+	if (dev->od_dt_dev.dd_rdonly)
+		RETURN(ERR_PTR(-EROFS));
+
+	/* od_otable_sem: prevent concurrent init/fini */
+	down(&dev->od_otable_sem);
+	if (dev->od_otable_it)
+		GOTO(out, it = ERR_PTR(-EALREADY));
+
+	OBD_ALLOC_PTR(it);
+	if (!it)
+		GOTO(out, it = ERR_PTR(-ENOMEM));
+
+	if (flags & DOIF_OUTUSED)
+		it->ooi_used_outside = 1;
+
+	if (flags & DOIF_RESET)
+		start |= SS_RESET;
+
+	if (valid & DOIV_ERROR_HANDLE) {
+		if (flags & DOIF_FAILOUT)
+			start |= SS_SET_FAILOUT;
+		else
+			start |= SS_CLEAR_FAILOUT;
+	}
+
+	if (valid & DOIV_DRYRUN) {
+		if (flags & DOIF_DRYRUN)
+			start |= SS_SET_DRYRUN;
+		else
+			start |= SS_CLEAR_DRYRUN;
+	}
+
+	/* XXX: dmu_object_next() does NOT find dnodes allocated
+	 *	in the current non-committed txg, so we force txg
+	 *	commit to find all existing dnodes ... */
+	txg_wait_synced(dmu_objset_pool(dev->od_os), 0ULL);
+
+	dev->od_otable_it = it;
+	it->ooi_dev = dev;
+	rc = scrub_start(osd_scrub_main, scrub, dev, start & ~SS_AUTO_PARTIAL);
+	if (rc == -EALREADY) {
+		/* Scrub already running: iterate from the beginning. */
+		it->ooi_pos = 1;
+	} else if (rc < 0) {
+		dev->od_otable_it = NULL;
+		OBD_FREE_PTR(it);
+		it = ERR_PTR(rc);
+	} else {
+		it->ooi_pos = scrub->os_pos_current;
+	}
+
+	GOTO(out, it);
+
+out:
+	up(&dev->od_otable_sem);
+	return (struct dt_it *)it;
+}
+
+/* Destroy the object-table iterator and stop the driving scrub thread. */
+static void osd_otable_it_fini(const struct lu_env *env, struct dt_it *di)
+{
+	struct osd_otable_it *it = (struct osd_otable_it *)di;
+	struct osd_device *dev = it->ooi_dev;
+
+	/* od_otable_sem: prevent concurrent init/fini */
+	down(&dev->od_otable_sem);
+	scrub_stop(&dev->od_scrub);
+	LASSERT(dev->od_otable_it == it);
+
+	dev->od_otable_it = NULL;
+	up(&dev->od_otable_sem);
+	OBD_FREE_PTR(it);
+}
+
+/* No-op: positioning is done via ->load(), not ->get(). */
+static int osd_otable_it_get(const struct lu_env *env,
+			     struct dt_it *di, const struct dt_key *key)
+{
+	return 0;
+}
+
+/* No-op: nothing is pinned by ->get(). */
+static void osd_otable_it_put(const struct lu_env *env, struct dt_it *di)
+{
+}
+
+/*
+ * Keep up to OTABLE_PREFETCH dnodes ahead of the iterator prefetched
+ * via async reads, refilling once the backlog drops below half.
+ */
+static void osd_otable_it_preload(const struct lu_env *env,
+				  struct osd_otable_it *it)
+{
+	struct osd_device *dev = it->ooi_dev;
+	int rc;
+
+	/* can go negative on the very first access to the iterator
+	 * or if some non-Lustre objects were found */
+	if (unlikely(it->ooi_prefetched < 0))
+		it->ooi_prefetched = 0;
+
+	if (it->ooi_prefetched >= (OTABLE_PREFETCH >> 1))
+		return;
+
+	if (it->ooi_prefetched_dnode == 0)
+		it->ooi_prefetched_dnode = it->ooi_pos;
+
+	while (it->ooi_prefetched < OTABLE_PREFETCH) {
+		rc = -dmu_object_next(dev->od_os, &it->ooi_prefetched_dnode,
+				      B_FALSE, 0);
+		if (rc)
+			break;
+
+		osd_dmu_prefetch(dev->od_os, it->ooi_prefetched_dnode,
+				 0, 0, 0, ZIO_PRIORITY_ASYNC_READ);
+		it->ooi_prefetched++;
+	}
+}
+
+/*
+ * Wait condition for ->next(): wake when the scrub has scanned past the
+ * iterator position, the scrub itself is waiting, or it stopped running.
+ * Returns non-zero when the iterator should stop waiting.
+ */
+static inline int
+osd_otable_it_wakeup(struct lustre_scrub *scrub, struct osd_otable_it *it)
+{
+	spin_lock(&scrub->os_lock);
+	if (it->ooi_pos < scrub->os_pos_current || scrub->os_waiting ||
+	    !thread_is_running(&scrub->os_thread))
+		it->ooi_waiting = 0;
+	else
+		it->ooi_waiting = 1;
+	spin_unlock(&scrub->os_lock);
+
+	return !it->ooi_waiting;
+}
+
+/*
+ * Advance the iterator to the next Lustre-visible dnode and cache its
+ * FID (from the LMA xattr) in it->ooi_fid. Objects without LMA, marked
+ * LMAC_NOT_IN_OI, or agent objects are skipped transparently.
+ *
+ * Returns 0 with a valid FID, 1 at end of the object table, or a
+ * negative errno.
+ */
+static int osd_otable_it_next(const struct lu_env *env, struct dt_it *di)
+{
+	struct osd_otable_it *it = (struct osd_otable_it *)di;
+	struct osd_device *dev = it->ooi_dev;
+	struct lustre_scrub *scrub = &dev->od_scrub;
+	struct ptlrpc_thread *thread = &scrub->os_thread;
+	struct l_wait_info lwi = { 0 };
+	struct lustre_mdt_attrs *lma = NULL;
+	nvlist_t *nvbuf = NULL;
+	int size = 0;
+	int rc;
+	ENTRY;
+
+	LASSERT(it->ooi_user_ready);
+	fid_zero(&it->ooi_fid);
+
+	if (unlikely(it->ooi_all_cached))
+		RETURN(1);
+
+again:
+	/* Drop state from a previous (skipped) object before retrying. */
+	if (nvbuf) {
+		nvlist_free(nvbuf);
+		nvbuf = NULL;
+		lma = NULL;
+		size = 0;
+	}
+
+	/* Do not run ahead of the scrub thread's scan position. */
+	if (it->ooi_pos >= scrub->os_pos_current)
+		l_wait_event(thread->t_ctl_waitq,
+			     osd_otable_it_wakeup(scrub, it),
+			     &lwi);
+
+	if (!thread_is_running(thread) && !it->ooi_used_outside)
+		GOTO(out, rc = 1);
+
+	rc = -dmu_object_next(dev->od_os, &it->ooi_pos, B_FALSE, 0);
+	if (rc) {
+		/* -ESRCH: no more allocated dnodes, iteration is done. */
+		if (unlikely(rc == -ESRCH)) {
+			it->ooi_all_cached = 1;
+			rc = 1;
+		}
+
+		GOTO(out, rc);
+	}
+
+	rc = __osd_xattr_load_by_oid(dev, it->ooi_pos, &nvbuf);
+
+	/* NOTE(review): ooi_prefetched is decremented under os_lock only
+	 * when not in full-speed mode — presumably in full-speed mode the
+	 * iterator is the sole accessor; confirm against the scrub thread. */
+	if (!scrub->os_full_speed)
+		spin_lock(&scrub->os_lock);
+	it->ooi_prefetched--;
+	if (!scrub->os_full_speed) {
+		if (scrub->os_waiting) {
+			scrub->os_waiting = 0;
+			wake_up_all(&thread->t_ctl_waitq);
+		}
+		spin_unlock(&scrub->os_lock);
+	}
+
+	/* Object disappeared, is mid-creation, or has no xattrs: skip. */
+	if (rc == -ENOENT || rc == -EEXIST || rc == -ENODATA)
+		goto again;
+
+	if (rc)
+		GOTO(out, rc);
+
+	LASSERT(nvbuf != NULL);
+	rc = -nvlist_lookup_byte_array(nvbuf, XATTR_NAME_LMA,
+				       (uchar_t **)&lma, &size);
+	if (rc || size == 0)
+		/* It is either non-Lustre object or OSD internal object,
+		 * ignore it, go ahead */
+		goto again;
+
+	LASSERTF(lma != NULL, "corrupted LMA, size %d\n", size);
+	lustre_lma_swab(lma);
+	if (unlikely(lma->lma_compat & LMAC_NOT_IN_OI ||
+		     lma->lma_incompat & LMAI_AGENT))
+		goto again;
+
+	it->ooi_fid = lma->lma_self_fid;
+
+	GOTO(out, rc = 0);
+
+out:
+	if (nvbuf)
+		nvlist_free(nvbuf);
+
+	if (!rc && scrub->os_full_speed)
+		osd_otable_it_preload(env, it);
+
+	return rc;
+}
+
+/* The otable iterator has no key; records are fetched via ->rec(). */
+static struct dt_key *osd_otable_it_key(const struct lu_env *env,
+					const struct dt_it *di)
+{
+	return NULL;
+}
+
+/* Nominal key size: the 64-bit dnode position. */
+static int osd_otable_it_key_size(const struct lu_env *env,
+				  const struct dt_it *di)
+{
+	return sizeof(__u64);
+}
+
+/* Return the FID cached by the last successful ->next() call. */
+static int osd_otable_it_rec(const struct lu_env *env, const struct dt_it *di,
+			     struct dt_rec *rec, __u32 attr)
+{
+	struct osd_otable_it *it = (struct osd_otable_it *)di;
+	struct lu_fid *fid = (struct lu_fid *)rec;
+
+	*fid = it->ooi_fid;
+	return 0;
+}
+
+/* Return the current iterator position (dnode number) for checkpointing. */
+static __u64 osd_otable_it_store(const struct lu_env *env,
+				 const struct dt_it *di)
+{
+	struct osd_otable_it *it = (struct osd_otable_it *)di;
+
+	return it->ooi_pos;
+}
+
+/**
+ * Set the OSD layer iteration start position as the specified hash.
+ *
+ * @hash is the last checkpointed position; iteration resumes at the
+ * following dnode. May only be called before the first ->next()
+ * (-EPERM afterwards). The first next() is issued here to "unplug"
+ * the iteration.
+ */
+static int osd_otable_it_load(const struct lu_env *env,
+			      const struct dt_it *di, __u64 hash)
+{
+	struct osd_otable_it *it = (struct osd_otable_it *)di;
+	struct osd_device *dev = it->ooi_dev;
+	struct lustre_scrub *scrub = &dev->od_scrub;
+	int rc;
+	ENTRY;
+
+	/* Forbid to set iteration position after iteration started. */
+	if (it->ooi_user_ready)
+		RETURN(-EPERM);
+
+	if (hash > OSD_OTABLE_MAX_HASH)
+		hash = OSD_OTABLE_MAX_HASH;
+
+	/* The hash is the last checkpoint position,
+	 * we will start from the next one. */
+	it->ooi_pos = hash + 1;
+	it->ooi_prefetched = 0;
+	it->ooi_prefetched_dnode = 0;
+	it->ooi_user_ready = 1;
+	if (!scrub->os_full_speed)
+		wake_up_all(&scrub->os_thread.t_ctl_waitq);
+
+	/* Unplug OSD layer iteration by the first next() call. */
+	rc = osd_otable_it_next(env, (struct dt_it *)it);
+
+	RETURN(rc);
+}
+
+/* No-op: the otable iterator exposes no key record. */
+static int osd_otable_it_key_rec(const struct lu_env *env,
+				 const struct dt_it *di, void *key_rec)
+{
+	return 0;
+}
+
+/* dt_index_operations vector wiring the otable iterator into the dt API. */
+const struct dt_index_operations osd_otable_ops = {
+	.dio_it = {
+		.init     = osd_otable_it_init,
+		.fini     = osd_otable_it_fini,
+		.get      = osd_otable_it_get,
+		.put      = osd_otable_it_put,
+		.next     = osd_otable_it_next,
+		.key      = osd_otable_it_key,
+		.key_size = osd_otable_it_key_size,
+		.rec      = osd_otable_it_rec,
+		.store    = osd_otable_it_store,
+		.load     = osd_otable_it_load,
+		.key_rec  = osd_otable_it_key_rec,
+	}
+};
+
+/* high priority inconsistent items list APIs */
+
+/*
+ * Queue an inconsistent FID<->dnode mapping for urgent repair by the
+ * running scrub thread. @insert selects OI insert vs. update. Returns
+ * -EAGAIN if the scrub thread is not running, -ENOMEM on allocation
+ * failure, 0 on success.
+ */
+int osd_oii_insert(const struct lu_env *env, struct osd_device *dev,
+		   const struct lu_fid *fid, uint64_t oid, bool insert)
+{
+	struct lustre_scrub *scrub = &dev->od_scrub;
+	struct ptlrpc_thread *thread = &scrub->os_thread;
+	struct osd_inconsistent_item *oii;
+	bool wakeup = false;
+	ENTRY;
+
+	osd_idc_find_and_init_with_oid(env, dev, fid, oid);
+	OBD_ALLOC_PTR(oii);
+	if (unlikely(!oii))
+		RETURN(-ENOMEM);
+
+	INIT_LIST_HEAD(&oii->oii_list);
+	oii->oii_cache.oic_dev = dev;
+	oii->oii_cache.oic_fid = *fid;
+	oii->oii_cache.oic_dnode = oid;
+	oii->oii_insert = insert;
+
+	spin_lock(&scrub->os_lock);
+	if (unlikely(!thread_is_running(thread))) {
+		spin_unlock(&scrub->os_lock);
+		OBD_FREE_PTR(oii);
+		RETURN(-EAGAIN);
+	}
+
+	/* Only wake the scrub thread on an empty->non-empty transition. */
+	if (list_empty(&scrub->os_inconsistent_items))
+		wakeup = true;
+	list_add_tail(&oii->oii_list, &scrub->os_inconsistent_items);
+	spin_unlock(&scrub->os_lock);
+
+	if (wakeup)
+		wake_up_all(&thread->t_ctl_waitq);
+
+	RETURN(0);
+}
+
+/*
+ * Look up @fid in the pending inconsistent-items list; on a hit store
+ * the cached dnode number in @oid and return 0, else return -ENOENT.
+ */
+int osd_oii_lookup(struct osd_device *dev, const struct lu_fid *fid,
+		   uint64_t *oid)
+{
+	struct lustre_scrub *scrub = &dev->od_scrub;
+	struct osd_inconsistent_item *oii;
+	int ret = -ENOENT;
+	ENTRY;
+
+	spin_lock(&scrub->os_lock);
+	list_for_each_entry(oii, &scrub->os_inconsistent_items, oii_list) {
+		if (lu_fid_eq(fid, &oii->oii_cache.oic_fid)) {
+			*oid = oii->oii_cache.oic_dnode;
+			ret = 0;
+			break;
+		}
+	}
+	spin_unlock(&scrub->os_lock);
+
+	RETURN(ret);
+}
exit 0
fi
-[ $(facet_fstype $SINGLEMDS) != "ldiskfs" ] &&
- skip "ldiskfs only test" && exit 0
-
-[ $(facet_fstype ost1) != "ldiskfs" ] &&
- skip "ldiskfs only test" && exit 0
-
-[[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.2.90) ]] &&
- skip "Need MDS version at least 2.2.90" && exit 0
+stopall
SAVED_MDSSIZE=${MDSSIZE}
SAVED_OSTSIZE=${OSTSIZE}
SAVED_OSTCOUNT=${OSTCOUNT}
+
# use small MDS + OST size to speed formatting time
# do not use too small MDSSIZE/OSTSIZE, which affect the default journal size
-# 200M MDT device can guarantee uninitialized groups during the OI scrub
-MDSSIZE=200000
-OSTSIZE=100000
-# no need too much OSTs, to reduce the format/start/stop overhead
-stopall
-[ $OSTCOUNT -gt 4 ] && OSTCOUNT=4
+# 400M MDT device can guarantee uninitialized groups during the OI scrub
+MDSSIZE=400000
+OSTSIZE=200000
-MOUNT_2=""
+# no need too many OSTs, to reduce the format/start/stop overhead
+[ $OSTCOUNT -gt 4 ] && OSTCOUNT=4
# build up a clean test environment.
formatall
setupall
-[[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.3.90) ]] &&
- ALWAYS_EXCEPT="$ALWAYS_EXCEPT 1a"
-
-[[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.6.50) ]] &&
- ALWAYS_EXCEPT="$ALWAYS_EXCEPT 4"
-
-[[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.4.1) ]] &&
- ALWAYS_EXCEPT="$ALWAYS_EXCEPT 15"
-
-[[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.4.90) ]] &&
-[[ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.4.50) ]] &&
- ALWAYS_EXCEPT="$ALWAYS_EXCEPT 15"
-
-[[ $(lustre_version_code ost1) -lt $(version_code 2.4.50) ]] &&
- ALWAYS_EXCEPT="$ALWAYS_EXCEPT 11 12 13 14"
-
-[[ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.5.59) ]] &&
- SCRUB_ONLY="-t scrub"
-
build_test_filter
MDT_DEV="${FSNAME}-MDT0000"
# use "lfsck_start -A" when we no longer need testing interop
for n in $(seq $MDSCOUNT); do
do_facet mds$n $LCTL lfsck_start -M $(facet_svc mds$n) \
- $SCRUB_ONLY "$@" ||
+ -t scrub "$@" ||
error "($error_id) Failed to start OI scrub on mds$n"
done
}
scrub_status() {
local n=$1
- do_facet mds$n $LCTL get_param -n \
- osd-ldiskfs.$(facet_svc mds$n).oi_scrub
+ do_facet mds$n $LCTL get_param -n osd-*.$(facet_svc mds$n).oi_scrub
}
-START_SCRUB="do_facet $SINGLEMDS $LCTL lfsck_start -M ${MDT_DEV} $SCRUB_ONLY"
-START_SCRUB_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} $SCRUB_ONLY"
+START_SCRUB="do_facet $SINGLEMDS $LCTL lfsck_start -M ${MDT_DEV} -t scrub"
+START_SCRUB_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} -t scrub"
STOP_SCRUB="do_facet $SINGLEMDS $LCTL lfsck_stop -M ${MDT_DEV}"
SHOW_SCRUB="do_facet $SINGLEMDS \
- $LCTL get_param -n osd-ldiskfs.${MDT_DEV}.oi_scrub"
+ $LCTL get_param -n osd-*.${MDT_DEV}.oi_scrub"
SHOW_SCRUB_ON_OST="do_facet ost1 \
- $LCTL get_param -n osd-ldiskfs.${OST_DEV}.oi_scrub"
+ $LCTL get_param -n osd-*.${OST_DEV}.oi_scrub"
MOUNT_OPTS_SCRUB="-o user_xattr"
MOUNT_OPTS_NOSCRUB="-o user_xattr,noscrub"
scrub_prep() {
local nfiles=$1
+ local inject=$2
local n
check_mount_and_prep
fi
done
echo "prepared $(date)."
+
+ [ ! -z $inject ] && [ $inject -eq 2 ] && {
+ #define OBD_FAIL_OSD_NO_OI_ENTRY 0x198
+ do_nodes $(comma_list $(mdts_nodes)) \
+ $LCTL set_param fail_loc=0x198
+
+ for n in $(seq $MDSCOUNT); do
+ cp $LUSTRE/tests/runas $DIR/$tdir/mds$n ||
+ error "Fail to copy runas to MDS$n"
+ done
+
+ do_nodes $(comma_list $(mdts_nodes)) $LCTL set_param fail_loc=0
+ }
+
+ [ ! -z $inject ] && [ $inject -eq 1 ] &&
+ [ $(facet_fstype $SINGLEMDS) = "zfs" ] && {
+ #define OBD_FAIL_OSD_FID_MAPPING 0x193
+ do_nodes $(comma_list $(mdts_nodes)) \
+ $LCTL set_param fail_loc=0x193
+
+ for n in $(seq $MDSCOUNT); do
+ chmod 0400 $DIR/$tdir/mds$n/test-framework.sh
+ chmod 0400 $DIR/$tdir/mds$n/sanity-scrub.sh
+ done
+
+ do_nodes $(comma_list $(mdts_nodes)) $LCTL set_param fail_loc=0
+ }
+
cleanup_mount $MOUNT > /dev/null || error "Fail to stop client!"
# sync local transactions on every MDT
echo "stop mds$n"
stop mds$n > /dev/null || error "Fail to stop MDS$n!"
done
+
+ [ ! -z $inject ] && [ $(facet_fstype $SINGLEMDS) = "ldiskfs" ] && {
+ if [ $inject -eq 1 ]; then
+ for n in $(seq $MDSCOUNT); do
+ mds_backup_restore mds$n ||
+ error "Backup/restore on mds$n failed"
+ done
+ elif [ $inject -eq 2 ]; then
+ scrub_remove_ois 1
+ fi
+ }
}
scrub_start_mds() {
for n in $(seq $MDSCOUNT); do
wait_update_facet mds$n "$LCTL get_param -n \
- osd-ldiskfs.$(facet_svc mds$n).oi_scrub |
+ osd-*.$(facet_svc mds$n).oi_scrub |
awk '/^status/ { print \\\$2 }'" "$expected" 6 ||
error "($error_id) Expected '$expected' on mds$n"
done
for n in $(seq $MDSCOUNT); do
actual=$(do_facet mds$n $LCTL get_param -n \
- osd-ldiskfs.$(facet_svc mds$n).oi_scrub |
+ osd-*.$(facet_svc mds$n).oi_scrub |
awk '/^flags/ { print $2 }')
if [ "$actual" != "$expected" ]; then
error "($error_id) Expected '$expected' on mds$n, but" \
for n in $(seq $MDSCOUNT); do
actual=$(do_facet mds$n $LCTL get_param -n \
- osd-ldiskfs.$(facet_svc mds$n).oi_scrub |
+ osd-*.$(facet_svc mds$n).oi_scrub |
awk '/^param/ { print $2 }')
if [ "$actual" != "$expected" ]; then
error "($error_id) Expected '$expected' on mds$n, but" \
for n in $(seq $MDSCOUNT); do
if [ $dryrun -eq 1 ]; then
actual=$(do_facet mds$n $LCTL get_param -n \
- osd-ldiskfs.$(facet_svc mds$n).oi_scrub |
+ osd-*.$(facet_svc mds$n).oi_scrub |
awk '/^inconsistent:/ { print $2 }')
else
actual=$(do_facet mds$n $LCTL get_param -n \
- osd-ldiskfs.$(facet_svc mds$n).oi_scrub |
+ osd-*.$(facet_svc mds$n).oi_scrub |
awk '/^updated:/ { print $2 }')
fi
}
scrub_remove_ois() {
+ [ $(facet_fstype $SINGLEMDS) != "ldiskfs" ] && return
+
local error_id=$1
local index=$2
local n
done
}
-scrub_backup_restore() {
- local error_id=$1
- local igif=$2
- local n
-
- for n in $(seq $MDSCOUNT); do
- mds_backup_restore mds$n $igif ||
- error "($error_id) Backup/restore on mds$n failed"
- done
-}
-
scrub_enable_auto() {
do_nodes $(comma_list $(mdts_nodes)) $LCTL set_param -n \
- osd-ldiskfs.*.auto_scrub=1
+ osd-*.*.auto_scrub=1
}
full_scrub_ratio() {
- [[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.6.50) ]] &&
- return
+ [ $(facet_fstype $SINGLEMDS) != "ldiskfs" ] && return
local ratio=$1
do_nodes $(comma_list $(mdts_nodes)) $LCTL set_param -n \
- osd-ldiskfs.*.full_scrub_ratio=$ratio
+ osd-*.*.full_scrub_ratio=$ratio
}
full_scrub_threshold_rate() {
- [[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.6.50) ]] &&
- return
+ [ $(facet_fstype $SINGLEMDS) != "ldiskfs" ] && return
local rate=$1
do_nodes $(comma_list $(mdts_nodes)) $LCTL set_param -n \
- osd-ldiskfs.*.full_scrub_threshold_rate=$rate
+ osd-*.*.full_scrub_threshold_rate=$rate
}
test_0() {
run_test 1a "Auto trigger initial OI scrub when server mounts"
test_1b() {
- scrub_prep 0
- scrub_remove_ois 1
+ scrub_prep 0 2
echo "start MDTs without disabling OI scrub"
scrub_start_mds 2 "$MOUNT_OPTS_SCRUB"
- scrub_check_status 3 completed
+ [ $(facet_fstype $SINGLEMDS) != "ldiskfs" ] ||
+ scrub_check_status 3 completed
mount_client $MOUNT || error "(4) Fail to start client!"
- scrub_check_data 5
+ scrub_check_data2 runas 5
+ scrub_check_status 6 completed
}
run_test 1b "Trigger OI scrub when MDT mounts for OI files remove/recreate case"
test_1c() {
+ [ $(facet_fstype $SINGLEMDS) != "ldiskfs" ] &&
+ skip "ldiskfs special test" && return
+
local index
# OI files to be removed:
run_test 1c "Auto detect kinds of OI file(s) removed/recreated cases"
test_2() {
- scrub_prep 0
- scrub_backup_restore 1
+ [ $(facet_fstype $SINGLEMDS) != "ldiskfs" ] &&
+ skip "ldiskfs special test" && return
+
+ scrub_prep 0 1
echo "starting MDTs without disabling OI scrub"
scrub_start_mds 2 "$MOUNT_OPTS_SCRUB"
scrub_check_status 3 completed
formatall > /dev/null
setupall > /dev/null
- scrub_prep 0
- scrub_backup_restore 1
+ scrub_prep 0 1
echo "starting MDTs with OI scrub disabled"
scrub_start_mds 2 "$MOUNT_OPTS_NOSCRUB"
scrub_check_status 3 init
- scrub_check_flags 4 recreated,inconsistent
+ [ $(facet_fstype $SINGLEMDS) != "ldiskfs" ] ||
+ scrub_check_flags 4 recreated,inconsistent
}
#run_test 3 "Do not trigger OI scrub when MDT mounts if 'noscrub' specified"
test_4a() {
- scrub_prep 0
- scrub_backup_restore 1
+ scrub_prep 0 1
echo "starting MDTs with OI scrub disabled"
scrub_start_mds 2 "$MOUNT_OPTS_NOSCRUB"
- scrub_check_flags 4 recreated,inconsistent
+ [ $(facet_fstype $SINGLEMDS) != "ldiskfs" ] ||
+ scrub_check_flags 4 recreated,inconsistent
mount_client $MOUNT || error "(5) Fail to start client!"
scrub_enable_auto
full_scrub_ratio 0
run_test 4a "Auto trigger OI scrub if bad OI mapping was found (1)"
test_4b() {
- scrub_prep 5
- scrub_backup_restore 1
+ [ $(facet_fstype $SINGLEMDS) != "ldiskfs" ] &&
+ skip "ldiskfs special test" && return
+
+ scrub_prep 5 1
echo "starting MDTs with OI scrub disabled"
scrub_start_mds 2 "$MOUNT_OPTS_NOSCRUB"
scrub_check_flags 4 recreated,inconsistent
echo "OI scrub on MDS$n status for the 1st time:"
do_facet mds$n $LCTL get_param -n \
- osd-ldiskfs.$(facet_svc mds$n).oi_scrub
+ osd-*.$(facet_svc mds$n).oi_scrub
done
scrub_check_data2 sanity-scrub.sh 9
echo "OI scrub on MDS$n status for the 2nd time:"
do_facet mds$n $LCTL get_param -n \
- osd-ldiskfs.$(facet_svc mds$n).oi_scrub
+ osd-*.$(facet_svc mds$n).oi_scrub
[ ${updated0[$n]} -lt ${updated1[$n]} ] ||
error "(12) Auto trigger full scrub unexpectedly"
echo "OI scrub on MDS$n status for the 3rd time:"
do_facet mds$n $LCTL get_param -n \
- osd-ldiskfs.$(facet_svc mds$n).oi_scrub
+ osd-*.$(facet_svc mds$n).oi_scrub
[ ${updated0[$n]} -gt ${updated1[$n]} ] ||
error "(16) Auto trigger full scrub unexpectedly"
[ ${updated0[$n]} -eq ${updated1[$n]} ] || {
echo "OI scrub on MDS$n status for the 4th time:"
do_facet mds$n $LCTL get_param -n \
- osd-ldiskfs.$(facet_svc mds$n).oi_scrub
+ osd-*.$(facet_svc mds$n).oi_scrub
error "(18) NOT auto trigger full scrub as expected"
}
run_test 4b "Auto trigger OI scrub if bad OI mapping was found (2)"
test_4c() {
- scrub_prep 500
- scrub_backup_restore 1
+ [ $(facet_fstype $SINGLEMDS) != "ldiskfs" ] &&
+ skip "ldiskfs special test" && return
+
+ scrub_prep 500 1
echo "starting MDTs with OI scrub disabled"
scrub_start_mds 2 "$MOUNT_OPTS_NOSCRUB"
scrub_check_flags 4 recreated,inconsistent
echo "OI scrub on MDS$n status for the 1st time:"
do_facet mds$n $LCTL get_param -n \
- osd-ldiskfs.$(facet_svc mds$n).oi_scrub
+ osd-*.$(facet_svc mds$n).oi_scrub
done
scrub_check_data2 sanity-scrub.sh 9
echo "OI scrub on MDS$n status for the 2nd time:"
do_facet mds$n $LCTL get_param -n \
- osd-ldiskfs.$(facet_svc mds$n).oi_scrub
+ osd-*.$(facet_svc mds$n).oi_scrub
[ ${updated0[$n]} -lt ${updated1[$n]} ] ||
error "(12) Auto trigger full scrub unexpectedly"
echo "OI scrub on MDS$n status for the 3rd time:"
do_facet mds$n $LCTL get_param -n \
- osd-ldiskfs.$(facet_svc mds$n).oi_scrub
+ osd-*.$(facet_svc mds$n).oi_scrub
[ ${updated0[$n]} -gt ${updated1[$n]} ] ||
error "(16) Auto trigger full scrub unexpectedly"
[ ${updated0[$n]} -eq ${updated1[$n]} ] || {
echo "OI scrub on MDS$n status for the 4th time:"
do_facet mds$n $LCTL get_param -n \
- osd-ldiskfs.$(facet_svc mds$n).oi_scrub
+ osd-*.$(facet_svc mds$n).oi_scrub
error "(18) NOT auto trigger full scrub as expected"
}
formatall > /dev/null
setupall > /dev/null
- scrub_prep 1000
- scrub_backup_restore 1
+ scrub_prep 100 1
echo "starting MDTs with OI scrub disabled (1)"
scrub_start_mds 2 "$MOUNT_OPTS_NOSCRUB"
scrub_check_status 3 init
- scrub_check_flags 4 recreated,inconsistent
+ [ $(facet_fstype $SINGLEMDS) != "ldiskfs" ] ||
+ scrub_check_flags 4 recreated,inconsistent
mount_client $MOUNT || error "(5) Fail to start client!"
scrub_enable_auto
full_scrub_ratio 0
declare -a pids
for n in $(seq $MDSCOUNT); do
- stat $DIR/$tdir/mds$n/${tfile}800 &
+ stat $DIR/$tdir/mds$n/sanity-scrub.sh &
pids[$n]=$!
done
for n in $(seq $MDSCOUNT); do
- wait ${pids[$n]} || error "(18) Fail to stat mds$n/${tfile}800"
+ wait ${pids[$n]} ||
+ error "(18) Fail to stat mds$n/sanity-scrub.sh"
done
scrub_check_status 19 completed
run_test 5 "OI scrub state machine"
test_6() {
- scrub_prep 1000
- scrub_backup_restore 1
+ scrub_prep 100 1
echo "starting MDTs with OI scrub disabled"
scrub_start_mds 2 "$MOUNT_OPTS_NOSCRUB"
- scrub_check_flags 4 recreated,inconsistent
+ [ $(facet_fstype $SINGLEMDS) != "ldiskfs" ] ||
+ scrub_check_flags 4 recreated,inconsistent
mount_client $MOUNT || error "(5) Fail to start client!"
scrub_enable_auto
full_scrub_ratio 0
local n
for n in $(seq $MDSCOUNT); do
# stat will re-trigger OI scrub
- stat $DIR/$tdir/mds$n/${tfile}800 ||
- error "(8) Failed to stat mds$n/${tfile}800"
+ stat $DIR/$tdir/mds$n/sanity-scrub.sh ||
+ error "(8) Failed to stat mds$n/sanity-scrub.sh"
done
umount_client $MOUNT || error "(9) Fail to stop client!"
run_test 6 "OI scrub resumes from last checkpoint"
test_7() {
- scrub_prep 500
- scrub_backup_restore 1
+ scrub_prep 500 1
echo "starting MDTs with OI scrub disabled"
scrub_start_mds 2 "$MOUNT_OPTS_NOSCRUB"
- scrub_check_flags 4 recreated,inconsistent
+ [ $(facet_fstype $SINGLEMDS) != "ldiskfs" ] ||
+ scrub_check_flags 4 recreated,inconsistent
mount_client $MOUNT || error "(5) Fail to start client!"
scrub_enable_auto
full_scrub_ratio 0
done
scrub_check_status 8 scanning
- scrub_check_flags 9 recreated,inconsistent,auto
+ if [ $(facet_fstype $SINGLEMDS) != "ldiskfs" ]; then
+ scrub_check_flags 9 inconsistent,auto
+ else
+ scrub_check_flags 9 recreated,inconsistent,auto
+ fi
do_nodes $(comma_list $(mdts_nodes)) \
$LCTL set_param fail_loc=0 fail_val=0
run_test 7 "System is available during OI scrub scanning"
test_8() {
- scrub_prep 128
- scrub_backup_restore 1
+ scrub_prep 128 1
echo "starting MDTs with OI scrub disabled"
scrub_start_mds 2 "$MOUNT_OPTS_NOSCRUB"
- scrub_check_flags 4 recreated,inconsistent
+ [ $(facet_fstype $SINGLEMDS) != "ldiskfs" ] ||
+ scrub_check_flags 4 recreated,inconsistent
#define OBD_FAIL_OSD_SCRUB_DELAY 0x190
do_nodes $(comma_list $(mdts_nodes)) \
run_test 8 "Control OI scrub manually"
test_9() {
+ # Skip scrub speed test for ZFS because of unstable performance
+ [ $(facet_fstype $SINGLEMDS) != "ldiskfs" ] &&
+ skip "test scrub speed only on ldiskfs" && return
+
if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
skip "Testing on UP system, the speed may be inaccurate."
return 0
fi
- scrub_prep 6000
- scrub_backup_restore 1
+ scrub_prep 6000 1
echo "starting MDTs with OI scrub disabled"
scrub_start_mds 2 "$MOUNT_OPTS_NOSCRUB"
run_test 9 "OI scrub speed control"
test_10a() {
- scrub_prep 0
- scrub_backup_restore 1
+ scrub_prep 0 1
echo "starting mds$n with OI scrub disabled (1)"
scrub_start_mds 2 "$MOUNT_OPTS_NOSCRUB"
- scrub_check_flags 4 recreated,inconsistent
+ [ $(facet_fstype $SINGLEMDS) != "ldiskfs" ] ||
+ scrub_check_flags 4 recreated,inconsistent
mount_client $MOUNT || error "(5) Fail to start client!"
scrub_enable_auto
full_scrub_ratio 0
# test_10b is obsolete; it will be covered by related sanity-lfsck tests.
test_10b() {
- scrub_prep 0
- scrub_backup_restore 1
+ scrub_prep 0 1
echo "starting MDTs with OI scrub disabled"
scrub_start_mds 2 "$MOUNT_OPTS_NOSCRUB"
- scrub_check_flags 4 recreated,inconsistent
+ [ $(facet_fstype $SINGLEMDS) != "ldiskfs" ] ||
+ scrub_check_flags 4 recreated,inconsistent
#define OBD_FAIL_OSD_SCRUB_DELAY 0x190
do_nodes $(comma_list $(mdts_nodes)) \
#run_test 10b "non-stopped OI scrub should auto restarts after MDS remount (2)"
test_11() {
+ [ $(facet_fstype $SINGLEMDS) != "ldiskfs" ] &&
+ skip "ldiskfs special test" && return
+
local CREATED=100
local n
do_facet ost1 $LCTL set_param fail_loc=0
wait_update_facet ost1 "$LCTL get_param -n \
- osd-ldiskfs.$(facet_svc ost1).oi_scrub |
+ osd-*.$(facet_svc ost1).oi_scrub |
awk '/^status/ { print \\\$2 }'" "completed" 6 ||
error "(7) Expected '$expected' on ost1"
$START_SCRUB_ON_OST -r || error "(6) Fail to start OI scrub on OST!"
wait_update_facet ost1 "$LCTL get_param -n \
- osd-ldiskfs.$(facet_svc ost1).oi_scrub |
+ osd-*.$(facet_svc ost1).oi_scrub |
awk '/^status/ { print \\\$2 }'" "completed" 6 ||
error "(7) Expected '$expected' on ost1"
run_test 13 "OI scrub can rebuild missed /O entries"
test_14() {
+ [ $(facet_fstype $SINGLEMDS) != "ldiskfs" ] &&
+ skip "ldiskfs special test" && return
+
check_mount_and_prep
$SETSTRIPE -c 1 -i 0 $DIR/$tdir
run_test 14 "OI scrub can repair objects under lost+found"
test_15() {
- local server_version=$(lustre_version_code $SINGLEMDS)
- scrub_prep 20
- scrub_backup_restore 1
+ local repaired
+
+ formatall > /dev/null
+ setupall > /dev/null
+
+ scrub_prep 20 1
echo "starting MDTs with OI scrub disabled"
scrub_start_mds 2 "$MOUNT_OPTS_NOSCRUB"
scrub_check_status 3 init
- scrub_check_flags 4 recreated,inconsistent
+ [ $(facet_fstype $SINGLEMDS) != "ldiskfs" ] ||
+ scrub_check_flags 4 recreated,inconsistent
# run under dryrun mode
- if [ $server_version -lt $(version_code 2.5.58) ]; then
- scrub_start 5 --dryrun on
+ scrub_start 5 --dryrun
+ scrub_check_status 6 completed
+ if [ $(facet_fstype $SINGLEMDS) != "ldiskfs" ]; then
+ scrub_check_flags 7 inconsistent
+ repaired=2
else
- scrub_start 5 --dryrun
+ scrub_check_flags 7 recreated,inconsistent
+ repaired=20
fi
- scrub_check_status 6 completed
- scrub_check_flags 7 recreated,inconsistent
scrub_check_params 8 dryrun
- scrub_check_repaired 9 20 1
+ scrub_check_repaired 9 $repaired 1
# run under dryrun mode again
- if [ $server_version -lt $(version_code 2.5.58) ]; then
- scrub_start 10 --dryrun on
+ scrub_start 10 --dryrun
+ scrub_check_status 11 completed
+ if [ $(facet_fstype $SINGLEMDS) != "ldiskfs" ]; then
+ scrub_check_flags 12 inconsistent
else
- scrub_start 10 --dryrun
+ scrub_check_flags 12 recreated,inconsistent
fi
- scrub_check_status 11 completed
- scrub_check_flags 12 recreated,inconsistent
scrub_check_params 13 dryrun
- scrub_check_repaired 14 20 1
+ scrub_check_repaired 14 $repaired 1
# run under normal mode
- #
- # Lustre-2.x (x <= 5) used "-n off" to disable dryrun which does not
- # work under Lustre-2.y (y >= 6), the test script should be fixed as
- # "-noff" or "--dryrun=off" or nothing by default.
- if [ $server_version -lt $(version_code 2.5.58) ]; then
- scrub_start 15 --dryrun off
- else
- scrub_start 15
- fi
+ scrub_start 15
scrub_check_status 16 completed
scrub_check_flags 17 ""
scrub_check_params 18 ""
- scrub_check_repaired 19 20 0
+ scrub_check_repaired 19 $repaired 0
# run under normal mode again
- if [ $server_version -lt $(version_code 2.5.58) ]; then
- scrub_start 20 --dryrun off
- else
- scrub_start 20
- fi
+ scrub_start 20
scrub_check_status 21 completed
scrub_check_flags 22 ""
scrub_check_params 23 ""