Whamcloud - gitweb
LU-957 scrub: OI scrub against ldiskfs
author Fan Yong <yong.fan@whamcloud.com>
Fri, 1 Jun 2012 01:56:52 +0000 (09:56 +0800)
committer Oleg Drokin <green@whamcloud.com>
Tue, 5 Jun 2012 11:48:12 +0000 (07:48 -0400)
The OI scrub is implemented entirely inside osd-ldiskfs and is
driven by the inode iterator. For each object with a normal FID,
the FID stored in its LMA is compared with the related entry in
the OI file; if the two are inconsistent, the OI entry is
updated.

Signed-off-by: Fan Yong <yong.fan@whamcloud.com>
Change-Id: I04d25b3433336fb63c312f795a8631328beb3aa7
Reviewed-on: http://review.whamcloud.com/2552
Tested-by: Hudson
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Tested-by: Maloo <whamcloud.maloo@gmail.com>
Reviewed-by: Alex Zhuravlev <bzzz@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
12 files changed:
lustre/osd-ldiskfs/Makefile.in
lustre/osd-ldiskfs/autoMakefile.am
lustre/osd-ldiskfs/osd_handler.c
lustre/osd-ldiskfs/osd_iam.c
lustre/osd-ldiskfs/osd_iam.h
lustre/osd-ldiskfs/osd_iam_lfix.c
lustre/osd-ldiskfs/osd_iam_lvar.c
lustre/osd-ldiskfs/osd_internal.h
lustre/osd-ldiskfs/osd_oi.c
lustre/osd-ldiskfs/osd_oi.h
lustre/osd-ldiskfs/osd_scrub.c [new file with mode: 0644]
lustre/osd-ldiskfs/osd_scrub.h [new file with mode: 0644]

index 9a9e0f6..179bf2f 100644 (file)
@@ -1,6 +1,7 @@
 MODULES := osd_ldiskfs
 osd_ldiskfs-objs := osd_handler.o osd_oi.o osd_igif.o osd_lproc.o osd_iam.o \
 MODULES := osd_ldiskfs
 osd_ldiskfs-objs := osd_handler.o osd_oi.o osd_igif.o osd_lproc.o osd_iam.o \
-                    osd_iam_lfix.o osd_iam_lvar.o osd_io.o osd_compat.o
+                   osd_iam_lfix.o osd_iam_lvar.o osd_io.o osd_compat.o \
+                   osd_scrub.o
 
 EXTRA_PRE_CFLAGS := -I@LINUX@/fs -I@LDISKFS_DIR@ -I@LDISKFS_DIR@/ldiskfs
 
 
 EXTRA_PRE_CFLAGS := -I@LINUX@/fs -I@LDISKFS_DIR@ -I@LDISKFS_DIR@/ldiskfs
 
index 09186d1..3294c28 100644 (file)
@@ -40,4 +40,4 @@ endif
 
 MOSTLYCLEANFILES := @MOSTLYCLEANFILES@
 EXTRA_DIST := $(osd_ldiskfs-objs:%.o=%.c) osd_internal.h osd_oi.h osd_igif.h \
 
 MOSTLYCLEANFILES := @MOSTLYCLEANFILES@
 EXTRA_DIST := $(osd_ldiskfs-objs:%.o=%.c) osd_internal.h osd_oi.h osd_igif.h \
-              osd_iam.h
+             osd_iam.h osd_scrub.h
index b68f7f7..9c1949b 100644 (file)
@@ -1057,7 +1057,7 @@ const int osd_dto_credits_noquota[DTO_NR] = {
         [DTO_INDEX_INSERT]  = 16,
         [DTO_INDEX_DELETE]  = 16,
         /**
         [DTO_INDEX_INSERT]  = 16,
         [DTO_INDEX_DELETE]  = 16,
         /**
-         * Unused now
+        * Used for OI scrub
          */
         [DTO_INDEX_UPDATE]  = 16,
         /**
          */
         [DTO_INDEX_UPDATE]  = 16,
         /**
@@ -1541,6 +1541,7 @@ static int osd_mkfile(struct osd_thread_info *info, struct osd_object *obj,
                  * NB: don't need any lock because no contention at this
                  * early stage */
                 inode->i_flags |= S_NOCMTIME;
                  * NB: don't need any lock because no contention at this
                  * early stage */
                 inode->i_flags |= S_NOCMTIME;
+               inode->i_state |= I_LUSTRE_NOSCRUB;
                 obj->oo_inode = inode;
                 result = 0;
         } else {
                 obj->oo_inode = inode;
                 result = 0;
         } else {
@@ -1874,6 +1875,9 @@ static int osd_object_destroy(const struct lu_env *env,
         LASSERT(inode);
         LASSERT(!lu_object_is_dying(dt->do_lu.lo_header));
 
         LASSERT(inode);
         LASSERT(!lu_object_is_dying(dt->do_lu.lo_header));
 
+       /* Parallel control for OI scrub. For most of cases, there is no
+        * lock contention. So it will not affect unlink performance. */
+       cfs_mutex_lock(&inode->i_mutex);
         if (S_ISDIR(inode->i_mode)) {
                 LASSERT(osd_inode_unlinked(inode) ||
                         inode->i_nlink == 1);
         if (S_ISDIR(inode->i_mode)) {
                 LASSERT(osd_inode_unlinked(inode) ||
                         inode->i_nlink == 1);
@@ -1888,6 +1892,7 @@ static int osd_object_destroy(const struct lu_env *env,
         OSD_EXEC_OP(th, destroy);
 
         result = osd_oi_delete(osd_oti_get(env), osd, fid, th);
         OSD_EXEC_OP(th, destroy);
 
         result = osd_oi_delete(osd_oti_get(env), osd, fid, th);
+       cfs_mutex_unlock(&inode->i_mutex);
 
         /* XXX: add to ext3 orphan list */
         /* rc = ext3_orphan_add(handle_t *handle, struct inode *inode) */
 
         /* XXX: add to ext3 orphan list */
         /* rc = ext3_orphan_add(handle_t *handle, struct inode *inode) */
@@ -4004,19 +4009,16 @@ static int osd_device_init(const struct lu_env *env, struct lu_device *d,
 
 static int osd_shutdown(const struct lu_env *env, struct osd_device *o)
 {
 
 static int osd_shutdown(const struct lu_env *env, struct osd_device *o)
 {
-        struct osd_thread_info *info = osd_oti_get(env);
-
-        ENTRY;
+       ENTRY;
 
 
-        if (o->od_oi_table != NULL)
-                osd_oi_fini(info, o);
+       osd_scrub_cleanup(env, o);
 
 
-        if (o->od_fsops) {
-                fsfilt_put_ops(o->od_fsops);
-                o->od_fsops = NULL;
-        }
+       if (o->od_fsops) {
+               fsfilt_put_ops(o->od_fsops);
+       o->od_fsops = NULL;
+       }
 
 
-        RETURN(0);
+       RETURN(0);
 }
 
 static int osd_mount(const struct lu_env *env,
 }
 
 static int osd_mount(const struct lu_env *env,
@@ -4113,6 +4115,7 @@ static struct lu_device *osd_device_alloc(const struct lu_env *env,
                         l->ld_ops = &osd_lu_ops;
                         o->od_dt_dev.dd_ops = &osd_dt_ops;
                         cfs_spin_lock_init(&o->od_osfs_lock);
                         l->ld_ops = &osd_lu_ops;
                         o->od_dt_dev.dd_ops = &osd_dt_ops;
                         cfs_spin_lock_init(&o->od_osfs_lock);
+                       cfs_mutex_init(&o->od_otable_mutex);
                         o->od_osfs_age = cfs_time_shift_64(-1000);
                         o->od_capa_hash = init_capa_hash();
                         if (o->od_capa_hash == NULL) {
                         o->od_osfs_age = cfs_time_shift_64(-1000);
                         o->od_capa_hash = init_capa_hash();
                         if (o->od_capa_hash == NULL) {
@@ -4171,14 +4174,12 @@ static int osd_recovery_complete(const struct lu_env *env,
 static int osd_prepare(const struct lu_env *env, struct lu_device *pdev,
                        struct lu_device *dev)
 {
 static int osd_prepare(const struct lu_env *env, struct lu_device *pdev,
                        struct lu_device *dev)
 {
-        struct osd_device      *osd = osd_dev(dev);
-        struct osd_thread_info *oti = osd_oti_get(env);
-        int                     result;
-
-        ENTRY;
+       struct osd_device *osd = osd_dev(dev);
+       int                result;
+       ENTRY;
 
 
-        /* 1. initialize oi before any file create or file open */
-        result = osd_oi_init(oti, osd);
+       /* 1. setup scrub, including OI files initialization */
+       result = osd_scrub_setup(env, osd);
         if (result < 0)
                 RETURN(result);
 
         if (result < 0)
                 RETURN(result);
 
index a495491..faf99ac 100644 (file)
@@ -2249,20 +2249,27 @@ EXPORT_SYMBOL(iam_insert);
  * Update record with the key @k in container @c (within context of
  * transaction @h), new record is given by @r.
  *
  * Update record with the key @k in container @c (within context of
  * transaction @h), new record is given by @r.
  *
- * Return values: 0: success, -ve: error, including -ENOENT if no record with
- * the given key found.
+ * Return values: +1: skip because of the same rec value, 0: success,
+ * -ve: error, including -ENOENT if no record with the given key found.
  */
 int iam_update(handle_t *h, struct iam_container *c, const struct iam_key *k,
                const struct iam_rec *r, struct iam_path_descr *pd)
 {
         struct iam_iterator it;
  */
 int iam_update(handle_t *h, struct iam_container *c, const struct iam_key *k,
                const struct iam_rec *r, struct iam_path_descr *pd)
 {
         struct iam_iterator it;
-        int result;
-
-        iam_it_init(&it, c, IAM_IT_WRITE, pd);
-
-        result = iam_it_get_exact(&it, k);
-        if (result == 0)
-                iam_it_rec_set(h, &it, r);
+       struct iam_leaf *folio;
+       int result;
+
+       iam_it_init(&it, c, IAM_IT_WRITE, pd);
+
+       result = iam_it_get_exact(&it, k);
+       if (result == 0) {
+               folio = &it.ii_path.ip_leaf;
+               result = iam_leaf_ops(folio)->rec_eq(folio, r);
+               if (result == 0)
+                       iam_it_rec_set(h, &it, r);
+               else
+                       result = 1;
+       }
         iam_it_put(&it);
         iam_it_fini(&it);
         return result;
         iam_it_put(&it);
         iam_it_fini(&it);
         return result;
index 3e0aabd..98da250 100644 (file)
@@ -374,6 +374,8 @@ struct iam_leaf_operations {
         int (*key_cmp)(const struct iam_leaf *l, const struct iam_key *k);
         int (*key_eq)(const struct iam_leaf *l, const struct iam_key *k);
 
         int (*key_cmp)(const struct iam_leaf *l, const struct iam_key *k);
         int (*key_eq)(const struct iam_leaf *l, const struct iam_key *k);
 
+       int (*rec_eq)(const struct iam_leaf *l, const struct iam_rec *r);
+
         int (*key_size)(const struct iam_leaf *l);
         /*
          * Search leaf @l for a record with key @k or for a place
         int (*key_size)(const struct iam_leaf *l);
         /*
          * Search leaf @l for a record with key @k or for a place
index d9ee5c9..f676794 100644 (file)
@@ -26,6 +26,8 @@
 /*
  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
  * Use is subject to license terms.
 /*
  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
  * Use is subject to license terms.
+ *
+ * Copyright (c) 2012 Whamcloud, Inc.
  */
 /*
  * This file is part of Lustre, http://www.lustre.org/
  */
 /*
  * This file is part of Lustre, http://www.lustre.org/
@@ -356,6 +358,18 @@ static void iam_lfix_rec_set(struct iam_leaf *l, const struct iam_rec *r)
         memcpy(iam_lfix_rec(l), r, iam_leaf_descr(l)->id_rec_size);
 }
 
         memcpy(iam_lfix_rec(l), r, iam_leaf_descr(l)->id_rec_size);
 }
 
+static inline int lfix_reccmp(const struct iam_container *c,
+                             const struct iam_rec *r1,
+                             const struct iam_rec *r2)
+{
+       return memcmp(r1, r2, c->ic_descr->id_rec_size);
+}
+
+static int iam_lfix_rec_eq(const struct iam_leaf *l, const struct iam_rec *r)
+{
+       return !lfix_reccmp(iam_leaf_container(l), iam_lfix_rec(l), r);
+}
+
 static void iam_lfix_rec_get(const struct iam_leaf *l, struct iam_rec *r)
 {
         assert_corr(iam_leaf_at_rec(l));
 static void iam_lfix_rec_get(const struct iam_leaf *l, struct iam_rec *r)
 {
         assert_corr(iam_leaf_at_rec(l));
@@ -511,6 +525,7 @@ static struct iam_leaf_operations iam_lfix_leaf_ops = {
         .key_eq         = iam_lfix_key_eq,
         .key_size       = iam_lfix_key_size,
         .rec_set        = iam_lfix_rec_set,
         .key_eq         = iam_lfix_key_eq,
         .key_size       = iam_lfix_key_size,
         .rec_set        = iam_lfix_rec_set,
+       .rec_eq         = iam_lfix_rec_eq,
         .rec_get        = iam_lfix_rec_get,
         .lookup         = iam_lfix_lookup,
         .ilookup        = iam_lfix_ilookup,
         .rec_get        = iam_lfix_rec_get,
         .lookup         = iam_lfix_lookup,
         .ilookup        = iam_lfix_ilookup,
index 4d1fa30..75068e5 100644 (file)
@@ -26,6 +26,8 @@
 /*
  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
  * Use is subject to license terms.
 /*
  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
  * Use is subject to license terms.
+ *
+ * Copyright (c) 2012 Whamcloud, Inc.
  */
 /*
  * This file is part of Lustre, http://www.lustre.org/
  */
 /*
  * This file is part of Lustre, http://www.lustre.org/
@@ -571,6 +573,15 @@ static void lvar_rec_set(struct iam_leaf *l, const struct iam_rec *r)
         assert_inv(n_invariant(l));
 }
 
         assert_inv(n_invariant(l));
 }
 
+static int lvar_rec_eq(const struct iam_leaf *l, const struct iam_rec *r)
+{
+       struct iam_rec *rec = e_rec(n_cur(l));
+
+       if (rec_size(rec) != rec_size(r))
+               return 0;
+       return !memcmp(rec, r, rec_size(r));
+}
+
 static void lvar_rec_get(const struct iam_leaf *l, struct iam_rec *r)
 {
         struct iam_rec *rec;
 static void lvar_rec_get(const struct iam_leaf *l, struct iam_rec *r)
 {
         struct iam_rec *rec;
@@ -768,6 +779,7 @@ static struct iam_leaf_operations lvar_leaf_ops = {
         .key_eq         = lvar_key_eq,
         .key_size       = lvar_key_size,
         .rec_set        = lvar_rec_set,
         .key_eq         = lvar_key_eq,
         .key_size       = lvar_key_size,
         .rec_set        = lvar_rec_set,
+       .rec_eq         = lvar_rec_eq,
         .rec_get        = lvar_rec_get,
         .lookup         = lvar_lookup,
         .ilookup        = lvar_ilookup,
         .rec_get        = lvar_rec_get,
         .lookup         = lvar_lookup,
         .ilookup        = lvar_ilookup,
index f6858e1..f75bb7e 100644 (file)
 
 #include "osd_oi.h"
 #include "osd_iam.h"
 
 #include "osd_oi.h"
 #include "osd_iam.h"
+#include "osd_scrub.h"
 
 struct inode;
 
 #define OSD_COUNTERS (0)
 
 
 struct inode;
 
 #define OSD_COUNTERS (0)
 
+/* Lustre special inode::i_state to indicate OI scrub skip this inode. */
+#define I_LUSTRE_NOSCRUB       (1 << 31)
+
 /** Enable thandle usage statistics */
 #define OSD_THANDLE_STATS (0)
 
 /** Enable thandle usage statistics */
 #define OSD_THANDLE_STATS (0)
 
@@ -191,6 +195,55 @@ static inline void ldiskfs_htree_lock_free(struct htree_lock *lk)
 
 #endif /* HAVE_LDISKFS_PDO */
 
 
 #endif /* HAVE_LDISKFS_PDO */
 
+#define OSD_OTABLE_IT_CACHE_SIZE       128
+#define OSD_OTABLE_IT_CACHE_MASK       (~(OSD_OTABLE_IT_CACHE_SIZE - 1))
+
+struct osd_inconsistent_item {
+       /* link into osd_scrub::os_inconsistent_items,
+        * protected by osd_scrub::os_lock. */
+       cfs_list_t             oii_list;
+
+       /* The right FID <=> ino#/gen mapping. */
+       struct osd_idmap_cache oii_cache;
+
+       unsigned int           oii_insert:1; /* insert or update mapping. */
+};
+
+struct osd_otable_cache {
+       struct osd_idmap_cache ooc_cache[OSD_OTABLE_IT_CACHE_SIZE];
+
+       /* Index for next cache slot to be filled. */
+       int                    ooc_producer_idx;
+
+       /* Index for next cache slot to be returned by it::next(). */
+       int                    ooc_consumer_idx;
+
+       /* How many items in ooc_cache. */
+       int                    ooc_cached_items;
+
+       /* Position for up layer LFSCK iteration pre-loading. */
+       __u32                  ooc_pos_preload;
+};
+
+struct osd_otable_it {
+       struct osd_device       *ooi_dev;
+       struct osd_otable_cache  ooi_cache;
+
+       /* For osd_otable_it_key. */
+       __u8                     ooi_key[16];
+
+       /* The following bits can be updated/checked w/o lock protection.
+        * If more bits will be introduced in the future and need lock to
+        * protect, please add comment. */
+       unsigned long            ooi_used_outside:1, /* Some user out of OSD
+                                                     * uses the iteration. */
+                                ooi_all_cached:1, /* No more entries can be
+                                                   * filled into cache. */
+                                ooi_user_ready:1, /* The user out of OSD is
+                                                   * ready to iterate. */
+                                ooi_waiting:1; /* it::next is waiting. */
+};
+
 extern const int osd_dto_credits_noquota[];
 
 /*
 extern const int osd_dto_credits_noquota[];
 
 /*
@@ -244,6 +297,10 @@ struct osd_device {
         struct brw_stats          od_brw_stats;
         cfs_atomic_t              od_r_in_flight;
         cfs_atomic_t              od_w_in_flight;
         struct brw_stats          od_brw_stats;
         cfs_atomic_t              od_r_in_flight;
         cfs_atomic_t              od_w_in_flight;
+
+       cfs_mutex_t               od_otable_mutex;
+       struct osd_otable_it     *od_otable_it;
+       struct osd_scrub          od_scrub;
 };
 
 #define OSD_TRACK_DECLARES
 };
 
 #define OSD_TRACK_DECLARES
@@ -561,6 +618,12 @@ int osd_compat_spec_insert(struct osd_thread_info *info,
                            const struct lu_fid *fid,
                            const struct osd_inode_id *id, struct thandle *th);
 
                            const struct lu_fid *fid,
                            const struct osd_inode_id *id, struct thandle *th);
 
+void osd_scrub_file_reset(struct osd_scrub *scrub, __u8 *uuid, __u64 flags);
+int osd_scrub_file_store(struct osd_scrub *scrub);
+int osd_scrub_start(struct osd_device *dev);
+int osd_scrub_setup(const struct lu_env *env, struct osd_device *dev);
+void osd_scrub_cleanup(const struct lu_env *env, struct osd_device *dev);
+
 /*
  * Invariants, assertions.
  */
 /*
  * Invariants, assertions.
  */
index d13a0fa..737e8d3 100644 (file)
@@ -70,6 +70,7 @@
 /* osd_lookup(), struct osd_thread_info */
 #include "osd_internal.h"
 #include "osd_igif.h"
 /* osd_lookup(), struct osd_thread_info */
 #include "osd_internal.h"
 #include "osd_igif.h"
+#include "osd_scrub.h"
 
 static unsigned int osd_oi_count = OSD_OI_FID_NR;
 CFS_MODULE_PARM(osd_oi_count, "i", int, 0444,
 
 static unsigned int osd_oi_count = OSD_OI_FID_NR;
 CFS_MODULE_PARM(osd_oi_count, "i", int, 0444,
@@ -89,22 +90,25 @@ static struct dt_index_features oi_feat = {
 #define OSD_OI_NAME_BASE        "oi.16"
 
 static void osd_oi_table_put(struct osd_thread_info *info,
 #define OSD_OI_NAME_BASE        "oi.16"
 
 static void osd_oi_table_put(struct osd_thread_info *info,
-                             struct osd_oi **oi_table, unsigned oi_count)
+                            struct osd_oi **oi_table, unsigned oi_count)
 {
 {
-        struct iam_container *bag;
-        int                   i;
-
-        for (i = 0; i < oi_count; i++) {
-                LASSERT(oi_table[i] != NULL);
-                LASSERT(oi_table[i]->oi_inode != NULL);
-
-                bag = &(oi_table[i]->oi_dir.od_container);
-                if (bag->ic_object == oi_table[i]->oi_inode)
-                        iam_container_fini(bag);
-                iput(oi_table[i]->oi_inode);
-                oi_table[i]->oi_inode = NULL;
-                OBD_FREE_PTR(oi_table[i]);
-        }
+       struct iam_container *bag;
+       int                   i;
+
+       for (i = 0; i < oi_count; i++) {
+               if (oi_table[i] == NULL)
+                       continue;
+
+               LASSERT(oi_table[i]->oi_inode != NULL);
+
+               bag = &(oi_table[i]->oi_dir.od_container);
+               if (bag->ic_object == oi_table[i]->oi_inode)
+                       iam_container_fini(bag);
+               iput(oi_table[i]->oi_inode);
+               oi_table[i]->oi_inode = NULL;
+               OBD_FREE_PTR(oi_table[i]);
+               oi_table[i] = NULL;
+       }
 }
 
 static int osd_oi_index_create_one(struct osd_thread_info *info,
 }
 
 static int osd_oi_index_create_one(struct osd_thread_info *info,
@@ -281,89 +285,136 @@ out_inode:
  */
 static int
 osd_oi_table_open(struct osd_thread_info *info, struct osd_device *osd,
  */
 static int
 osd_oi_table_open(struct osd_thread_info *info, struct osd_device *osd,
-                  struct osd_oi **oi_table, unsigned oi_count, bool create)
+                 struct osd_oi **oi_table, unsigned oi_count, bool create)
 {
 {
-        struct dt_device *dev = &osd->od_dt_dev;
-        int               count = 0;
-        int               rc = 0;
-        int               i;
-
-        /* NB: oi_count != 0 means that we have already created/known all OIs
-         * and have known exact number of OIs. */
-        LASSERT(oi_count <= OSD_OI_FID_NR_MAX);
-
-        for (i = 0; i < (oi_count != 0 ? oi_count : OSD_OI_FID_NR_MAX); i++) {
-                char name[12];
-
-                sprintf(name, "%s.%d", OSD_OI_NAME_BASE, i);
-                rc = osd_oi_open(info, osd, name, &oi_table[i], create);
-                if (rc == 0) {
-                        count++;
-                        continue;
-                }
-
-                if (rc == -ENOENT && oi_count == 0)
-                        return count;
-
-                CERROR("%s: can't open %s: rc = %d\n",
-                       dev->dd_lu_dev.ld_obd->obd_name, name, rc);
-                if (oi_count > 0) {
-                        CERROR("%s: expect to open total %d OI files.\n",
-                               dev->dd_lu_dev.ld_obd->obd_name, oi_count);
-                }
-                break;
-        }
+       struct dt_device  *dev = &osd->od_dt_dev;
+       struct scrub_file *sf = &osd->od_scrub.os_file;
+       int                count = 0;
+       int                rc = 0;
+       int                i;
+       ENTRY;
+
+       /* NB: oi_count != 0 means that we have already created/known all OIs
+        * and have known exact number of OIs. */
+       LASSERT(oi_count <= OSD_OI_FID_NR_MAX);
+
+       for (i = 0; i < (oi_count != 0 ? oi_count : OSD_OI_FID_NR_MAX); i++) {
+               char name[12];
+
+               if (oi_table[i] != NULL) {
+                       count++;
+                       continue;
+               }
 
 
-        if (rc < 0) {
-                osd_oi_table_put(info, oi_table, count);
-                return rc;
-        }
+               sprintf(name, "%s.%d", OSD_OI_NAME_BASE, i);
+               rc = osd_oi_open(info, osd, name, &oi_table[i], create);
+               if (rc == 0) {
+                       count++;
+                       continue;
+               }
 
 
-        return count;
+               if (rc == -ENOENT && create == false) {
+                       if (oi_count == 0)
+                               return count;
+
+                       rc = 0;
+                       ldiskfs_set_bit(i, sf->sf_oi_bitmap);
+                       continue;
+               }
+
+               CERROR("%s: can't open %s: rc = %d\n",
+                      dev->dd_lu_dev.ld_obd->obd_name, name, rc);
+               if (oi_count > 0)
+                       CERROR("%s: expect to open total %d OI files.\n",
+                              dev->dd_lu_dev.ld_obd->obd_name, oi_count);
+               break;
+       }
+
+       if (rc < 0) {
+               osd_oi_table_put(info, oi_table, oi_count > 0 ? oi_count : i);
+               count = rc;
+       }
+
+       RETURN(count);
 }
 
 int osd_oi_init(struct osd_thread_info *info, struct osd_device *osd)
 {
 }
 
 int osd_oi_init(struct osd_thread_info *info, struct osd_device *osd)
 {
-        struct dt_device *dev = &osd->od_dt_dev;
-        struct osd_oi   **oi;
-        int               rc;
+       struct dt_device  *dev = &osd->od_dt_dev;
+       struct osd_scrub  *scrub = &osd->od_scrub;
+       struct scrub_file *sf = &scrub->os_file;
+       struct osd_oi    **oi;
+       int                rc;
+       ENTRY;
+
+       OBD_ALLOC(oi, sizeof(*oi) * OSD_OI_FID_NR_MAX);
+       if (oi == NULL)
+               RETURN(-ENOMEM);
+
+       cfs_mutex_lock(&oi_init_lock);
+       /* try to open existing multiple OIs first */
+       rc = osd_oi_table_open(info, osd, oi, sf->sf_oi_count, false);
+       if (rc < 0)
+               GOTO(out, rc);
 
 
-        OBD_ALLOC(oi, sizeof(*oi) * OSD_OI_FID_NR_MAX);
-        if (oi == NULL)
-                return -ENOMEM;
-
-        cfs_mutex_lock(&oi_init_lock);
-        /* try to open existing multiple OIs first */
-        rc = osd_oi_table_open(info, osd, oi, 0, false);
-        if (rc != 0)
-                goto out;
-
-        /* if previous failed then try found single OI from old filesystem */
-        rc = osd_oi_open(info, osd, OSD_OI_NAME_BASE, &oi[0], false);
-        if (rc == 0) { /* found single OI from old filesystem */
-                rc = 1;
-                goto out;
-        } else if (rc != -ENOENT) {
-                CERROR("%s: can't open %s: rc = %d\n",
-                       dev->dd_lu_dev.ld_obd->obd_name, OSD_OI_NAME_BASE, rc);
-                goto out;
-        }
+       if (rc > 0) {
+               if (rc == sf->sf_oi_count || sf->sf_oi_count == 0)
+                       GOTO(out, rc);
+
+               osd_scrub_file_reset(scrub,
+                                    LDISKFS_SB(osd_sb(osd))->s_es->s_uuid,
+                                    SF_RECREATED);
+               osd_oi_count = sf->sf_oi_count;
+               goto create;
+       }
+
+       /* if previous failed then try found single OI from old filesystem */
+       rc = osd_oi_open(info, osd, OSD_OI_NAME_BASE, &oi[0], false);
+       if (rc == 0) { /* found single OI from old filesystem */
+               GOTO(out, rc = 1);
+       } else if (rc != -ENOENT) {
+               CERROR("%s: can't open %s: rc = %d\n",
+                      dev->dd_lu_dev.ld_obd->obd_name, OSD_OI_NAME_BASE, rc);
+               GOTO(out, rc);
+       }
+
+       if (sf->sf_oi_count > 0) {
+               int i;
+
+               memset(sf->sf_oi_bitmap, 0, SCRUB_OI_BITMAP_SIZE);
+               for (i = 0; i < osd_oi_count; i++)
+                       ldiskfs_set_bit(i, sf->sf_oi_bitmap);
+               osd_scrub_file_reset(scrub,
+                                    LDISKFS_SB(osd_sb(osd))->s_es->s_uuid,
+                                    SF_RECREATED);
+       }
+       sf->sf_oi_count = osd_oi_count;
+
+create:
+       rc = osd_scrub_file_store(scrub);
+       if (rc < 0) {
+               osd_oi_table_put(info, oi, sf->sf_oi_count);
+               GOTO(out, rc);
+       }
+
+       /* No OIs exist, new filesystem, create OI objects */
+       rc = osd_oi_table_open(info, osd, oi, osd_oi_count, true);
+       LASSERT(ergo(rc >= 0, rc == osd_oi_count));
+
+       GOTO(out, rc);
 
 
-        /* No OIs exist, new filesystem, create OI objects */
-        rc = osd_oi_table_open(info, osd, oi, osd_oi_count, true);
-        LASSERT(ergo(rc >= 0, rc == osd_oi_count));
 out:
 out:
-        if (rc < 0) {
-                OBD_FREE(oi, sizeof(*oi) * OSD_OI_FID_NR_MAX);
-        } else {
-                LASSERT((rc & (rc - 1)) == 0);
-                osd->od_oi_table = oi;
-                osd->od_oi_count = rc;
-                rc = 0;
-        }
+       if (rc < 0) {
+               OBD_FREE(oi, sizeof(*oi) * OSD_OI_FID_NR_MAX);
+       } else {
+               LASSERT((rc & (rc - 1)) == 0);
+               osd->od_oi_table = oi;
+               osd->od_oi_count = rc;
+               rc = 0;
+       }
 
 
-        cfs_mutex_unlock(&oi_init_lock);
-        return rc;
+       cfs_mutex_unlock(&oi_init_lock);
+       return rc;
 }
 
 void osd_oi_fini(struct osd_thread_info *info, struct osd_device *osd)
 }
 
 void osd_oi_fini(struct osd_thread_info *info, struct osd_device *osd)
index bae99ed..8db80dd 100644 (file)
@@ -84,6 +84,11 @@ struct osd_inode_id {
        __u32 oii_gen; /* inode generation */
 };
 
        __u32 oii_gen; /* inode generation */
 };
 
+struct osd_idmap_cache {
+       struct lu_fid           oic_fid;
+       struct osd_inode_id     oic_lid;
+};
+
 static inline void osd_id_pack(struct osd_inode_id *tgt,
                               const struct osd_inode_id *src)
 {
 static inline void osd_id_pack(struct osd_inode_id *tgt,
                               const struct osd_inode_id *src)
 {
diff --git a/lustre/osd-ldiskfs/osd_scrub.c b/lustre/osd-ldiskfs/osd_scrub.c
new file mode 100644 (file)
index 0000000..ccdca14
--- /dev/null
@@ -0,0 +1,940 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License version 2 for more details.  A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2012 Whamcloud, Inc.
+ */
+/*
+ * lustre/osd-ldiskfs/osd_scrub.c
+ *
+ * Top-level entry points into osd module
+ *
+ * The OI scrub is used for rebuilding Object Index files when restoring an MDT
+ * from a file-level backup.
+ *
+ * The otable based iterator scans the ldiskfs inode table to feed the
+ * upper-layer LFSCK.
+ *
+ * Author: Fan Yong <yong.fan@whamcloud.com>
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_MDS
+
+#include <lustre/lustre_idl.h>
+#include <lustre_disk.h>
+#include <dt_object.h>
+
+#include "osd_internal.h"
+#include "osd_oi.h"
+#include "osd_scrub.h"
+
+#define HALF_SEC       (CFS_HZ >> 1)
+
+/* Map an embedded osd_scrub back to the osd_device that contains it. */
+static inline struct osd_device *osd_scrub2dev(struct osd_scrub *scrub)
+{
+       struct osd_device *owner;
+
+       owner = container_of0(scrub, struct osd_device, od_scrub);
+       return owner;
+}
+
+/* Return the super block of the device that owns this scrub instance. */
+static inline struct super_block *osd_scrub2sb(struct osd_scrub *scrub)
+{
+       struct osd_device *owner = osd_scrub2dev(scrub);
+
+       return osd_sb(owner);
+}
+
+/*
+ * Convert a scrub_file from little-endian (src) into CPU byte order (des).
+ * The uuid and the OI bitmap are byte arrays and are copied verbatim.
+ */
+static void osd_scrub_file_to_cpu(struct scrub_file *des,
+                                 struct scrub_file *src)
+{
+       /* Byte arrays: no swapping required. */
+       memcpy(des->sf_uuid, src->sf_uuid, 16);
+       memcpy(des->sf_oi_bitmap, src->sf_oi_bitmap, SCRUB_OI_BITMAP_SIZE);
+
+       /* 16-bit fields. */
+       des->sf_status = le16_to_cpu(src->sf_status);
+       des->sf_param = le16_to_cpu(src->sf_param);
+       des->sf_oi_count = le16_to_cpu(src->sf_oi_count);
+
+       /* 32-bit fields. */
+       des->sf_magic = le32_to_cpu(src->sf_magic);
+       des->sf_run_time = le32_to_cpu(src->sf_run_time);
+       des->sf_success_count = le32_to_cpu(src->sf_success_count);
+
+       /* 64-bit fields: flags, timestamps, positions and counters. */
+       des->sf_flags = le64_to_cpu(src->sf_flags);
+       des->sf_time_last_complete = le64_to_cpu(src->sf_time_last_complete);
+       des->sf_time_latest_start = le64_to_cpu(src->sf_time_latest_start);
+       des->sf_time_last_checkpoint =
+               le64_to_cpu(src->sf_time_last_checkpoint);
+       des->sf_pos_latest_start = le64_to_cpu(src->sf_pos_latest_start);
+       des->sf_pos_last_checkpoint =
+               le64_to_cpu(src->sf_pos_last_checkpoint);
+       des->sf_pos_first_inconsistent =
+               le64_to_cpu(src->sf_pos_first_inconsistent);
+       des->sf_items_checked = le64_to_cpu(src->sf_items_checked);
+       des->sf_items_updated = le64_to_cpu(src->sf_items_updated);
+       des->sf_items_failed = le64_to_cpu(src->sf_items_failed);
+       des->sf_items_updated_prior =
+               le64_to_cpu(src->sf_items_updated_prior);
+}
+
+/*
+ * Convert a scrub_file from CPU byte order (src) into little-endian (des).
+ * The uuid and the OI bitmap are byte arrays and are copied verbatim.
+ */
+static void osd_scrub_file_to_le(struct scrub_file *des,
+                                struct scrub_file *src)
+{
+       /* Byte arrays: no swapping required. */
+       memcpy(des->sf_uuid, src->sf_uuid, 16);
+       memcpy(des->sf_oi_bitmap, src->sf_oi_bitmap, SCRUB_OI_BITMAP_SIZE);
+
+       /* 16-bit fields. */
+       des->sf_status = cpu_to_le16(src->sf_status);
+       des->sf_param = cpu_to_le16(src->sf_param);
+       des->sf_oi_count = cpu_to_le16(src->sf_oi_count);
+
+       /* 32-bit fields. */
+       des->sf_magic = cpu_to_le32(src->sf_magic);
+       des->sf_run_time = cpu_to_le32(src->sf_run_time);
+       des->sf_success_count = cpu_to_le32(src->sf_success_count);
+
+       /* 64-bit fields: flags, timestamps, positions and counters. */
+       des->sf_flags = cpu_to_le64(src->sf_flags);
+       des->sf_time_last_complete = cpu_to_le64(src->sf_time_last_complete);
+       des->sf_time_latest_start = cpu_to_le64(src->sf_time_latest_start);
+       des->sf_time_last_checkpoint =
+               cpu_to_le64(src->sf_time_last_checkpoint);
+       des->sf_pos_latest_start = cpu_to_le64(src->sf_pos_latest_start);
+       des->sf_pos_last_checkpoint =
+               cpu_to_le64(src->sf_pos_last_checkpoint);
+       des->sf_pos_first_inconsistent =
+               cpu_to_le64(src->sf_pos_first_inconsistent);
+       des->sf_items_checked = cpu_to_le64(src->sf_items_checked);
+       des->sf_items_updated = cpu_to_le64(src->sf_items_updated);
+       des->sf_items_failed = cpu_to_le64(src->sf_items_failed);
+       des->sf_items_updated_prior =
+               cpu_to_le64(src->sf_items_updated_prior);
+}
+
+/*
+ * Initialize the in-memory scrub file from scratch: everything is zeroed,
+ * then the uuid, the V1 magic and the SS_INIT status are filled in.
+ */
+static void osd_scrub_file_init(struct osd_scrub *scrub, __u8 *uuid)
+{
+       struct scrub_file *file = &scrub->os_file;
+
+       memset(file, 0, sizeof(struct scrub_file));
+       file->sf_status = SS_INIT;
+       file->sf_magic = SCRUB_MAGIC_V1;
+       memcpy(file->sf_uuid, uuid, 16);
+}
+
+/*
+ * Reset the in-memory scrub file for a fresh run.
+ *
+ * The uuid is replaced, the status goes back to SS_INIT, \a flags are OR-ed
+ * into sf_flags, and the parameters, run time, start/checkpoint timestamps,
+ * positions and item counters are cleared.  Fields not listed here
+ * (e.g. sf_time_last_complete, sf_success_count, the OI bitmap) are left
+ * untouched.
+ */
+void osd_scrub_file_reset(struct osd_scrub *scrub, __u8 *uuid, __u64 flags)
+{
+       struct scrub_file *file = &scrub->os_file;
+
+       CDEBUG(D_LFSCK, "Reset OI scrub file, flags = "LPX64"\n", flags);
+
+       /* Identity and state. */
+       memcpy(file->sf_uuid, uuid, 16);
+       file->sf_flags |= flags;
+       file->sf_status = SS_INIT;
+       file->sf_param = 0;
+
+       /* Timing. */
+       file->sf_run_time = 0;
+       file->sf_time_latest_start = 0;
+       file->sf_time_last_checkpoint = 0;
+
+       /* Positions. */
+       file->sf_pos_latest_start = 0;
+       file->sf_pos_last_checkpoint = 0;
+       file->sf_pos_first_inconsistent = 0;
+
+       /* Statistics. */
+       file->sf_items_checked = 0;
+       file->sf_items_updated = 0;
+       file->sf_items_updated_prior = 0;
+       file->sf_items_failed = 0;
+}
+
+/*
+ * Read the scrub file from scrub->os_inode into scrub->os_file_disk and
+ * convert it into the CPU-endian copy scrub->os_file.
+ *
+ * \retval 0        the file was read completely and carries SCRUB_MAGIC_V1
+ * \retval -ENOENT  the file is empty or has an unexpected magic; the caller
+ *                  is expected to treat it as a new scrub file
+ * \retval -EFAULT  short read
+ * \retval <0       other error returned by osd_ldiskfs_read()
+ */
+static int osd_scrub_file_load(struct osd_scrub *scrub)
+{
+       loff_t  pos  = 0;
+       char   *name = LDISKFS_SB(osd_scrub2sb(scrub))->s_es->s_volume_name;
+       int     len  = sizeof(scrub->os_file_disk);
+       int     rc;
+
+       rc = osd_ldiskfs_read(scrub->os_inode, &scrub->os_file_disk, len, &pos);
+       if (rc == len) {
+               struct scrub_file *sf = &scrub->os_file;
+
+               osd_scrub_file_to_cpu(sf, &scrub->os_file_disk);
+               if (sf->sf_magic != SCRUB_MAGIC_V1) {
+                       /* The message used to end in "\n," which leaked a
+                        * stray comma onto the next console line. */
+                       CWARN("%.16s: invalid scrub magic 0x%x != 0x%x\n",
+                             name, sf->sf_magic, SCRUB_MAGIC_V1);
+                       /* Process it as new scrub file. */
+                       rc = -ENOENT;
+               } else {
+                       rc = 0;
+               }
+       } else if (rc != 0) {
+               CERROR("%.16s: fail to load scrub file, expected = %d, "
+                      "rc = %d\n", name, len, rc);
+               /* A positive rc here is a short read, not an errno. */
+               if (rc > 0)
+                       rc = -EFAULT;
+       } else {
+               /* return -ENOENT for empty scrub file case. */
+               rc = -ENOENT;
+       }
+
+       return rc;
+}
+
+/*
+ * Write the in-memory scrub file to disk inside its own journal transaction.
+ *
+ * scrub->os_file is converted to little-endian into scrub->os_file_disk and
+ * written at offset 0 of scrub->os_inode.  Once a transaction could be
+ * started, the checkpoint timers are re-armed whether or not the write
+ * succeeded.
+ *
+ * \retval 0   success
+ * \retval <0  failure to start the transaction or to write the record
+ */
+int osd_scrub_file_store(struct osd_scrub *scrub)
+{
+       struct super_block *sb      = osd_scrub2sb(scrub);
+       int                 len     = sizeof(scrub->os_file_disk);
+       loff_t              pos     = 0;
+       handle_t           *jh;
+       int                 credits;
+       int                 rc;
+
+       credits = osd_dto_credits_noquota[DTO_WRITE_BASE] +
+                 osd_dto_credits_noquota[DTO_WRITE_BLOCK];
+       jh = ldiskfs_journal_start_sb(sb, credits);
+       if (IS_ERR(jh)) {
+               rc = PTR_ERR(jh);
+               CERROR("%.16s: fail to start trans for scrub store, rc = %d\n",
+                      LDISKFS_SB(sb)->s_es->s_volume_name,rc);
+               return rc;
+       }
+
+       osd_scrub_file_to_le(&scrub->os_file_disk, &scrub->os_file);
+       rc = osd_ldiskfs_write_record(scrub->os_inode, &scrub->os_file_disk,
+                                     len, &pos, jh);
+       ldiskfs_journal_stop(jh);
+       if (rc != 0) {
+               CERROR("%.16s: fail to store scrub file, expected = %d, "
+                      "rc = %d\n",
+                      LDISKFS_SB(sb)->s_es->s_volume_name,
+                      len, rc);
+       }
+
+       /* Re-arm the checkpoint timers. */
+       scrub->os_time_last_checkpoint = cfs_time_current();
+       scrub->os_time_next_checkpoint = scrub->os_time_last_checkpoint +
+                               cfs_time_seconds(SCRUB_CHECKPOINT_INTERVAL);
+       return rc;
+}
+
+/*
+ * Prepare the scrub state for a new run of the scrub thread.
+ *
+ * Applies scrub->os_start_flags to the scrub file, picks the start position
+ * (just after the last checkpoint, or the first inode of the filesystem when
+ * there is none), marks the file SS_SCANNING and stores it.  On success the
+ * thread is flagged SVC_RUNNING and waiters on t_ctl_waitq are woken.
+ *
+ * \retval 0   success
+ * \retval <0  failure to store the scrub file; the thread flags are untouched
+ */
+static int osd_scrub_prep(struct osd_device *dev)
+{
+       struct osd_scrub     *scrub  = &dev->od_scrub;
+       struct ptlrpc_thread *thread = &scrub->os_thread;
+       struct scrub_file    *sf     = &scrub->os_file;
+       __u32                 flags  = scrub->os_start_flags;
+       int                   rc;
+       ENTRY;
+
+       cfs_down_write(&scrub->os_rwsem);
+       /* Reset first: osd_scrub_file_reset() zeroes sf_param, so the failout
+        * setting requested for this run must be applied afterwards or an
+        * SS_RESET | SS_SET_FAILOUT start would silently lose SP_FAILOUT. */
+       if (flags & SS_RESET)
+               osd_scrub_file_reset(scrub,
+                       LDISKFS_SB(osd_sb(dev))->s_es->s_uuid, sf->sf_flags);
+
+       if (flags & SS_SET_FAILOUT)
+               sf->sf_param |= SP_FAILOUT;
+
+       if (flags & SS_CLEAR_FAILOUT)
+               sf->sf_param &= ~SP_FAILOUT;
+
+       if (flags & SS_AUTO) {
+               scrub->os_full_speed = 1;
+               sf->sf_flags |= SF_AUTO;
+       } else {
+               scrub->os_full_speed = 0;
+       }
+
+       /* Known damage also forces full speed. */
+       if (sf->sf_flags & (SF_RECREATED | SF_INCONSISTENT))
+               scrub->os_full_speed = 1;
+
+       scrub->os_in_prior = 0;
+       scrub->os_waiting = 0;
+       scrub->os_new_checked = 0;
+       /* Resume right after the last checkpoint when there is one. */
+       if (sf->sf_pos_last_checkpoint != 0)
+               sf->sf_pos_latest_start = sf->sf_pos_last_checkpoint + 1;
+       else
+               sf->sf_pos_latest_start = LDISKFS_FIRST_INO(osd_sb(dev));
+
+       scrub->os_pos_current = sf->sf_pos_latest_start;
+       sf->sf_status = SS_SCANNING;
+       sf->sf_time_latest_start = cfs_time_current_sec();
+       sf->sf_time_last_checkpoint = sf->sf_time_latest_start;
+       rc = osd_scrub_file_store(scrub);
+       if (rc == 0) {
+               cfs_spin_lock(&scrub->os_lock);
+               thread_set_flags(thread, SVC_RUNNING);
+               cfs_spin_unlock(&scrub->os_lock);
+               cfs_waitq_broadcast(&thread->t_ctl_waitq);
+       }
+       cfs_up_write(&scrub->os_rwsem);
+
+       RETURN(rc);
+}
+
+/*
+ * Account for an inode that could not be examined.
+ *
+ * Bumps the checked and failed counters and lowers
+ * sf_pos_first_inconsistent to this inode number if it is unset or larger.
+ *
+ * \retval rc  when SP_FAILOUT is set in sf_param
+ * \retval 0   otherwise, so that the scan carries on
+ */
+static int osd_scrub_error_handler(struct osd_device *dev,
+                                  struct osd_inode_id *lid, int rc)
+{
+       struct osd_scrub  *scrub = &dev->od_scrub;
+       struct scrub_file *file  = &scrub->os_file;
+
+       cfs_down_write(&scrub->os_rwsem);
+       file->sf_items_failed++;
+       scrub->os_new_checked++;
+       if (file->sf_pos_first_inconsistent == 0 ||
+           lid->oii_ino < file->sf_pos_first_inconsistent)
+               file->sf_pos_first_inconsistent = lid->oii_ino;
+       cfs_up_write(&scrub->os_rwsem);
+
+       if (file->sf_param & SP_FAILOUT)
+               return rc;
+       return 0;
+}
+
+/*
+ * Check the OI mapping of one object and repair it if necessary.
+ *
+ * The FID in \a oic is looked up in the OI:
+ *  - no entry (-ENOENT), or the item was queued with oii_insert set: the
+ *    inode is fetched and a new OI entry is inserted (SF_RECREATED is set);
+ *  - an entry exists and equals oic->oic_lid: nothing to do;
+ *  - an entry exists but differs: the entry is updated.
+ * Any insert or update also sets SF_INCONSISTENT.
+ *
+ * When scrub->os_in_prior is set, \a oic is embedded in an
+ * osd_inconsistent_item, which is unlinked and freed before returning.
+ *
+ * \retval rc  the failure code, only when SP_FAILOUT is set in sf_param
+ * \retval 0   otherwise (failures are only counted in the scrub file)
+ */
+static int
+osd_scrub_check_update(struct osd_thread_info *info,  struct osd_device *dev,
+                      struct osd_idmap_cache *oic)
+{
+       struct osd_scrub             *scrub  = &dev->od_scrub;
+       struct scrub_file            *sf     = &scrub->os_file;
+       /* NOTE: lid2 and oi_id both alias info->oti_id.  The lookup result in
+        * lid2 is consumed by osd_id_eq() before osd_id_pack() reuses the
+        * same storage through oi_id. */
+       struct osd_inode_id          *lid2   = &info->oti_id;
+       struct lu_fid                *oi_fid = &info->oti_fid;
+       struct osd_inode_id          *oi_id  = &info->oti_id;
+       handle_t                     *jh     = NULL;
+       struct osd_inconsistent_item *oii    = NULL;
+       struct inode                 *inode  = NULL;
+       struct lu_fid                *fid    = &oic->oic_fid;
+       struct osd_inode_id          *lid    = &oic->oic_lid;
+       struct iam_container         *bag;
+       struct iam_path_descr        *ipd;
+       /* ops doubles as the "inode is held and locked" marker: it only
+        * becomes DTO_INDEX_INSERT once i_mutex has been taken below. */
+       int                           ops    = DTO_INDEX_UPDATE;
+       int                           idx;
+       int                           rc;
+       ENTRY;
+
+       /* In prior mode oic is the oii_cache member of a queued item. */
+       if (scrub->os_in_prior)
+               oii = cfs_list_entry(oic, struct osd_inconsistent_item,
+                                    oii_cache);
+
+       cfs_down_write(&scrub->os_rwsem);
+       scrub->os_new_checked++;
+       /* Inodes before the start position are skipped unless they come
+        * from the inconsistent-items list. */
+       if (lid->oii_ino < sf->sf_pos_latest_start && oii == NULL)
+               GOTO(out, rc = 0);
+
+       /* The queued item asks for an insert: skip the lookup. */
+       if (oii != NULL && oii->oii_insert)
+               goto iget;
+
+       rc = osd_oi_lookup(info, dev, fid, lid2);
+       if (rc != 0) {
+               if (rc != -ENOENT)
+                       GOTO(out, rc);
+
+iget:
+               inode = osd_iget(info, dev, lid);
+               if (IS_ERR(inode)) {
+                       rc = PTR_ERR(inode);
+                       /* Someone removed the inode. */
+                       if (rc == -ENOENT || rc == -ESTALE)
+                               rc = 0;
+                       GOTO(out, rc);
+               }
+
+               /* Prevent the inode to be unlinked during OI scrub. */
+               cfs_mutex_lock(&inode->i_mutex);
+               if (unlikely(inode->i_nlink == 0)) {
+                       cfs_mutex_unlock(&inode->i_mutex);
+                       iput(inode);
+                       GOTO(out, rc = 0);
+               }
+
+               /* From here on the out: path unlocks and releases the inode. */
+               ops = DTO_INDEX_INSERT;
+               idx = osd_oi_fid2idx(dev, fid);
+               if (unlikely(!ldiskfs_test_bit(idx, sf->sf_oi_bitmap)))
+                       ldiskfs_set_bit(idx, sf->sf_oi_bitmap);
+               sf->sf_flags |= SF_RECREATED;
+       } else if (osd_id_eq(lid, lid2)) {
+                       GOTO(out, rc = 0);
+       }
+
+       sf->sf_flags |= SF_INCONSISTENT;
+       /* Build the key (big-endian FID) and the record (packed inode id). */
+       fid_cpu_to_be(oi_fid, fid);
+       osd_id_pack(oi_id, &oic->oic_lid);
+       jh = ldiskfs_journal_start_sb(osd_sb(dev),
+                               osd_dto_credits_noquota[ops]);
+       if (IS_ERR(jh)) {
+               rc = PTR_ERR(jh);
+               CERROR("%.16s: fail to start trans for scrub store, rc = %d\n",
+                      LDISKFS_SB(osd_sb(dev))->s_es->s_volume_name, rc);
+               GOTO(out, rc);
+       }
+
+       bag = &osd_fid2oi(dev, fid)->oi_dir.od_container;
+       ipd = osd_idx_ipd_get(info->oti_env, bag);
+       if (unlikely(ipd == NULL)) {
+               ldiskfs_journal_stop(jh);
+               CERROR("%.16s: fail to get ipd for scrub store\n",
+                       LDISKFS_SB(osd_sb(dev))->s_es->s_volume_name);
+               GOTO(out, rc = -ENOMEM);
+       }
+
+       if (ops == DTO_INDEX_UPDATE)
+               rc = iam_update(jh, bag, (const struct iam_key *)oi_fid,
+                               (struct iam_rec *)oi_id, ipd);
+       else
+               rc = iam_insert(jh, bag, (const struct iam_key *)oi_fid,
+                               (struct iam_rec *)oi_id, ipd);
+       osd_ipd_put(info->oti_env, bag, ipd);
+       ldiskfs_journal_stop(jh);
+       if (rc == 0) {
+               if (scrub->os_in_prior)
+                       sf->sf_items_updated_prior++;
+               else
+                       sf->sf_items_updated++;
+       }
+
+       GOTO(out, rc);
+
+out:
+       /* Record the failure and the lowest failing inode number. */
+       if (rc != 0) {
+               sf->sf_items_failed++;
+               if (sf->sf_pos_first_inconsistent == 0 ||
+                   sf->sf_pos_first_inconsistent > lid->oii_ino)
+                       sf->sf_pos_first_inconsistent = lid->oii_ino;
+       }
+
+       if (ops == DTO_INDEX_INSERT) {
+               cfs_mutex_unlock(&inode->i_mutex);
+               iput(inode);
+       }
+       cfs_up_write(&scrub->os_rwsem);
+
+       /* A prior item is consumed whatever the outcome was. */
+       if (oii != NULL) {
+               LASSERT(!cfs_list_empty(&oii->oii_list));
+
+               cfs_spin_lock(&scrub->os_lock);
+               cfs_list_del_init(&oii->oii_list);
+               cfs_spin_unlock(&scrub->os_lock);
+               OBD_FREE_PTR(oii);
+       }
+       RETURN(sf->sf_param & SP_FAILOUT ? rc : 0);
+}
+
+/*
+ * Take a checkpoint: fold the pending counters and the current position
+ * into the scrub file, extend the accumulated run time, and store the file.
+ *
+ * \retval 0   success
+ * \retval <0  failure reported by osd_scrub_file_store()
+ */
+static int do_osd_scrub_checkpoint(struct osd_scrub *scrub)
+{
+       struct scrub_file *file = &scrub->os_file;
+       int                rc;
+       ENTRY;
+
+       cfs_down_write(&scrub->os_rwsem);
+
+       /* Flush the counter accumulated since the previous checkpoint. */
+       file->sf_items_checked += scrub->os_new_checked;
+       scrub->os_new_checked = 0;
+
+       file->sf_pos_last_checkpoint = scrub->os_pos_current;
+       file->sf_time_last_checkpoint = cfs_time_current_sec();
+       /* HALF_SEC is added before converting the elapsed time to seconds.
+        * This must precede the store, which resets
+        * os_time_last_checkpoint. */
+       file->sf_run_time +=
+               cfs_duration_sec(cfs_time_current() + HALF_SEC -
+                                scrub->os_time_last_checkpoint);
+
+       rc = osd_scrub_file_store(scrub);
+       cfs_up_write(&scrub->os_rwsem);
+
+       RETURN(rc);
+}
+
+/*
+ * Take a checkpoint only if the next-checkpoint time has been reached and
+ * something was checked since the previous one.  Returns 0 when nothing
+ * is done, otherwise the result of do_osd_scrub_checkpoint().
+ */
+static inline int osd_scrub_checkpoint(struct osd_scrub *scrub)
+{
+       int due;
+
+       due = cfs_time_beforeq(scrub->os_time_next_checkpoint,
+                              cfs_time_current());
+       if (likely(!due))
+               return 0;
+
+       if (scrub->os_new_checked > 0)
+               return do_osd_scrub_checkpoint(scrub);
+
+       return 0;
+}
+
+/*
+ * Finish a scrub run: mark the thread SVC_STOPPING, flush pending counters,
+ * record the final status and store the scrub file.
+ *
+ * \param result  > 0: completed (SS_COMPLETED; OI bitmap and the
+ *                     SF_RECREATED/SF_INCONSISTENT/SF_AUTO flags are cleared)
+ *                  0: paused (SS_PAUSED)
+ *                < 0: failed (SS_FAILED)
+ */
+static void osd_scrub_post(struct osd_scrub *scrub, int result)
+{
+       struct scrub_file *file = &scrub->os_file;
+       ENTRY;
+
+       cfs_down_write(&scrub->os_rwsem);
+
+       cfs_spin_lock(&scrub->os_lock);
+       thread_set_flags(&scrub->os_thread, SVC_STOPPING);
+       cfs_spin_unlock(&scrub->os_lock);
+
+       /* Flush what has been checked since the last checkpoint, if any. */
+       if (scrub->os_new_checked > 0) {
+               file->sf_items_checked += scrub->os_new_checked;
+               scrub->os_new_checked = 0;
+               file->sf_pos_last_checkpoint = scrub->os_pos_current;
+       }
+       file->sf_time_last_checkpoint = cfs_time_current_sec();
+
+       if (result < 0) {
+               file->sf_status = SS_FAILED;
+       } else if (result == 0) {
+               file->sf_status = SS_PAUSED;
+       } else {
+               file->sf_status = SS_COMPLETED;
+               memset(file->sf_oi_bitmap, 0, SCRUB_OI_BITMAP_SIZE);
+               file->sf_flags &= ~(SF_RECREATED | SF_INCONSISTENT | SF_AUTO);
+               file->sf_time_last_complete = file->sf_time_last_checkpoint;
+               file->sf_success_count++;
+       }
+
+       file->sf_run_time +=
+               cfs_duration_sec(cfs_time_current() + HALF_SEC -
+                                scrub->os_time_last_checkpoint);
+
+       result = osd_scrub_file_store(scrub);
+       if (result < 0)
+               CERROR("%.16s: fail to osd_scrub_post, rc = %d\n",
+                      LDISKFS_SB(osd_scrub2sb(scrub))->s_es->s_volume_name,
+                      result);
+       cfs_up_write(&scrub->os_rwsem);
+
+       EXIT;
+}
+
+#define SCRUB_NEXT_BREAK       1
+#define SCRUB_NEXT_CONTINUE    2
+
+/*
+ * Select the next object to be checked.
+ *
+ * Queued inconsistent items take priority: if the list is not empty its
+ * first item is returned through \a oic and os_in_prior is set.  Otherwise
+ * the inode bitmap of group \a bg is searched from \a *offset for the next
+ * set bit and the matching inode is read to obtain its FID.
+ *
+ * \retval 0                    \a *oic holds an object to check
+ * \retval SCRUB_NEXT_BREAK     no more set bits in this group; \a bitmap has
+ *                              been released and os_pos_current moved to the
+ *                              first inode of the next group
+ * \retval SCRUB_NEXT_CONTINUE  the inode is to be skipped
+ * \retval <0                   the inode could not be read
+ */
+static int
+osd_scrub_next(struct osd_thread_info *info, struct osd_device *dev,
+              struct osd_scrub *scrub, struct super_block *sb,
+              ldiskfs_group_t bg, struct buffer_head *bitmap, __u32 gbase,
+              __u32 *offset, struct osd_idmap_cache **oic)
+{
+       struct osd_inconsistent_item *item;
+       struct lu_fid                *fid;
+       struct osd_inode_id          *lid;
+       struct inode                 *inode;
+       int                           skip;
+       int                           rc;
+
+       /* Queued inconsistent items come first. */
+       if (!cfs_list_empty(&scrub->os_inconsistent_items)) {
+               item = cfs_list_entry(scrub->os_inconsistent_items.next,
+                                     struct osd_inconsistent_item, oii_list);
+               scrub->os_in_prior = 1;
+               *oic = &item->oii_cache;
+               return 0;
+       }
+
+       *oic = &scrub->os_oic;
+       fid = &(*oic)->oic_fid;
+       lid = &(*oic)->oic_lid;
+
+       *offset = ldiskfs_find_next_bit(bitmap->b_data,
+                                       LDISKFS_INODES_PER_GROUP(sb), *offset);
+       if (*offset >= LDISKFS_INODES_PER_GROUP(sb)) {
+               /* Nothing left in this group: jump to the next one. */
+               brelse(bitmap);
+               scrub->os_pos_current = 1 + (bg + 1) *
+                                       LDISKFS_INODES_PER_GROUP(sb);
+               return SCRUB_NEXT_BREAK;
+       }
+
+       scrub->os_pos_current = gbase + *offset;
+       osd_id_gen(lid, scrub->os_pos_current, OSD_OII_NOGEN);
+       inode = osd_iget_fid(info, dev, lid, fid);
+       if (IS_ERR(inode)) {
+               rc = PTR_ERR(inode);
+               /* The inode may be removed after bitmap searching, or the
+                * file is new created without inode initialized yet. */
+               if (rc == -ENOENT || rc == -ESTALE)
+                       return SCRUB_NEXT_CONTINUE;
+
+               CERROR("%.16s: fail to read inode, group = %u, "
+                      "ino# = %u, rc = %d\n",
+                      LDISKFS_SB(sb)->s_es->s_volume_name,
+                      bg, scrub->os_pos_current, rc);
+               return rc;
+       }
+
+       /* Decide before dropping the inode reference: these FID classes and
+        * inodes flagged I_LUSTRE_NOSCRUB are skipped. */
+       skip = fid_is_igif(fid) || fid_is_idif(fid) ||
+              fid_seq(fid) == FID_SEQ_LLOG ||
+              fid_seq(fid) == FID_SEQ_LOCAL_FILE ||
+              fid_seq_is_rsvd(fid_seq(fid)) ||
+              inode->i_state & I_LUSTRE_NOSCRUB;
+       iput(inode);
+
+       return skip ? SCRUB_NEXT_CONTINUE : 0;
+}
+
+/*
+ * Tell whether the scrub position is still less than SCRUB_WINDOW_SIZE
+ * inodes ahead of the preload position of the otable cache.
+ */
+static inline int osd_scrub_has_window(struct osd_scrub *scrub,
+                                      struct osd_otable_cache *ooc)
+{
+       if (scrub->os_pos_current < ooc->ooc_pos_preload + SCRUB_WINDOW_SIZE)
+               return 1;
+
+       return 0;
+}
+
+/*
+ * Body of the OI scrub thread.
+ *
+ * After osd_scrub_prep() the thread walks the inode bitmaps group by group,
+ * starting at scrub->os_pos_current, and for every object handed out by
+ * osd_scrub_next() either checks/repairs its OI mapping or accounts for the
+ * failure.  Unless running at full speed, it throttles itself against the
+ * otable iterator through osd_scrub_has_window().
+ *
+ * The value passed to osd_scrub_post() is > 0 when the whole inode table was
+ * scanned, 0 when the thread was asked to stop, and < 0 on failure.
+ *
+ * \param args  the osd_device to scrub
+ */
+static int osd_scrub_main(void *args)
+{
+       struct lu_env                 env;
+       struct osd_thread_info       *info;
+       struct osd_device            *dev    = (struct osd_device *)args;
+       struct osd_scrub             *scrub  = &dev->od_scrub;
+       struct ptlrpc_thread         *thread = &scrub->os_thread;
+       cfs_list_t                   *list   = &scrub->os_inconsistent_items;
+       struct l_wait_info            lwi    = { 0 };
+       struct super_block           *sb     = osd_sb(dev);
+       struct osd_otable_it         *it     = NULL;
+       struct osd_otable_cache      *ooc    = NULL;
+       int                           noslot = 0;
+       int                           rc;
+       __u32                         max;
+       ENTRY;
+
+       cfs_daemonize("OI_scrub");
+       rc = lu_env_init(&env, LCT_DT_THREAD);
+       if (rc != 0) {
+               CERROR("%.16s: OI scrub, fail to init env, rc = %d\n",
+                      LDISKFS_SB(sb)->s_es->s_volume_name, rc);
+               GOTO(noenv, rc);
+       }
+
+       info = osd_oti_get(&env);
+       rc = osd_scrub_prep(dev);
+       if (rc != 0) {
+               CERROR("%.16s: OI scrub, fail to scrub prep, rc = %d\n",
+                      LDISKFS_SB(sb)->s_es->s_volume_name, rc);
+               GOTO(out, rc);
+       }
+
+       if (!scrub->os_full_speed) {
+               LASSERT(dev->od_otable_it != NULL);
+
+               /* Paced mode: wait for the iterator user, then start from
+                * the iterator's preload position. */
+               it = dev->od_otable_it;
+               ooc = &it->ooi_cache;
+               l_wait_event(thread->t_ctl_waitq,
+                            it->ooi_user_ready || !thread_is_running(thread),
+                            &lwi);
+               if (unlikely(!thread_is_running(thread)))
+                       GOTO(post, rc = 0);
+
+               LASSERT(scrub->os_pos_current >= ooc->ooc_pos_preload);
+               scrub->os_pos_current = ooc->ooc_pos_preload;
+       }
+
+       CDEBUG(D_LFSCK, "OI scrub: flags = 0x%x, pos = %u\n",
+              scrub->os_start_flags, scrub->os_pos_current);
+
+       max = le32_to_cpu(LDISKFS_SB(sb)->s_es->s_inodes_count);
+       while (scrub->os_pos_current <= max) {
+               struct buffer_head *bitmap = NULL;
+               struct osd_idmap_cache *oic = NULL;
+               ldiskfs_group_t bg = (scrub->os_pos_current - 1) /
+                                    LDISKFS_INODES_PER_GROUP(sb);
+               __u32 offset = (scrub->os_pos_current - 1) %
+                              LDISKFS_INODES_PER_GROUP(sb);
+               __u32 gbase = 1 + bg * LDISKFS_INODES_PER_GROUP(sb);
+
+               bitmap = ldiskfs_read_inode_bitmap(sb, bg);
+               if (bitmap == NULL) {
+                       CERROR("%.16s: fail to read bitmap at pos = %u, "
+                              "bg = %u, scrub will stop\n",
+                              LDISKFS_SB(sb)->s_es->s_volume_name,
+                              scrub->os_pos_current, (__u32)bg);
+                       GOTO(post, rc = -EIO);
+               }
+
+               while (offset < LDISKFS_INODES_PER_GROUP(sb)) {
+                       if (unlikely(!thread_is_running(thread))) {
+                               brelse(bitmap);
+                               GOTO(post, rc = 0);
+                       }
+
+                       /* Still out of window and nothing queued: go back
+                        * to waiting without consuming another inode. */
+                       if (cfs_list_empty(list) && noslot != 0)
+                               goto wait;
+
+                       rc = osd_scrub_next(info, dev, scrub, sb, bg,
+                                           bitmap, gbase, &offset, &oic);
+                       if (rc == SCRUB_NEXT_BREAK) {
+                               /* osd_scrub_next() has already released the
+                                * bitmap on this path. */
+                               bitmap = NULL;
+                               break;
+                       } else if (rc == SCRUB_NEXT_CONTINUE) {
+                               goto next;
+                       }
+
+                       if (rc != 0)
+                               rc = osd_scrub_error_handler(dev, &oic->oic_lid,
+                                                            rc);
+                       else
+                               rc = osd_scrub_check_update(info, dev, oic);
+                       if (rc != 0) {
+                               brelse(bitmap);
+                               GOTO(post, rc);
+                       }
+
+                       rc = osd_scrub_checkpoint(scrub);
+                       if (rc != 0) {
+                               CERROR("%.16s: fail to checkpoint, pos = %u, "
+                                      "rc = %d\n",
+                                      LDISKFS_SB(sb)->s_es->s_volume_name,
+                                      scrub->os_pos_current, rc);
+                               brelse(bitmap);
+                               GOTO(post, rc);
+                       }
+
+                       /* A prior item does not advance the bitmap scan. */
+                       if (scrub->os_in_prior) {
+                               scrub->os_in_prior = 0;
+                               continue;
+                       }
+
+next:
+                       scrub->os_pos_current = gbase + ++offset;
+                       if (dev->od_otable_it != NULL) {
+                               if (unlikely(it == NULL)) {
+                                       it = dev->od_otable_it;
+                                       ooc = &it->ooi_cache;
+                               }
+
+                               /* Wake the iterator once the scrub has moved
+                                * past its preload position. */
+                               if (it->ooi_waiting &&
+                                   (ooc->ooc_pos_preload <
+                                    scrub->os_pos_current)) {
+                                       it->ooi_waiting = 0;
+                                       cfs_waitq_broadcast(
+                                                       &thread->t_ctl_waitq);
+                               }
+                       }
+
+                       if (scrub->os_full_speed || rc == SCRUB_NEXT_CONTINUE)
+                               continue;
+
+wait:
+                       if (osd_scrub_has_window(scrub, ooc)) {
+                               noslot = 0;
+                               continue;
+                       }
+
+                       scrub->os_waiting = 1;
+                       l_wait_event(thread->t_ctl_waitq,
+                                    osd_scrub_has_window(scrub, ooc) ||
+                                    !cfs_list_empty(list) ||
+                                    !thread_is_running(thread),
+                                    &lwi);
+                       scrub->os_waiting = 0;
+
+                       if (osd_scrub_has_window(scrub, ooc))
+                               noslot = 0;
+                       else
+                               noslot = 1;
+               }
+
+               /* The loop may also end because offset ran off the end of
+                * the group (its last inode was in use and got processed).
+                * Nobody has released the bitmap in that case, so drop the
+                * reference here instead of leaking it. */
+               if (bitmap != NULL)
+                       brelse(bitmap);
+       }
+
+       GOTO(post, rc = (scrub->os_pos_current > max ? 1 : rc));
+
+post:
+       osd_scrub_post(scrub, rc);
+       CDEBUG(D_LFSCK, "OI scrub: stop, rc = %d, pos = %u\n",
+              rc, scrub->os_pos_current);
+
+out:
+       /* Drop whatever is still queued on the inconsistent-items list. */
+       while (!cfs_list_empty(list)) {
+               struct osd_inconsistent_item *oii;
+
+               oii = cfs_list_entry(list->next,
+                                    struct osd_inconsistent_item, oii_list);
+               cfs_list_del_init(&oii->oii_list);
+               OBD_FREE_PTR(oii);
+       }
+       lu_env_fini(&env);
+
+noenv:
+       cfs_spin_lock(&scrub->os_lock);
+       thread_set_flags(thread, SVC_STOPPED);
+       cfs_waitq_broadcast(&thread->t_ctl_waitq);
+       cfs_spin_unlock(&scrub->os_lock);
+       return rc;
+}
+
+/*
+ * Start the OI scrub thread with the given start flags.
+ *
+ * If a previous instance is still stopping, wait for it to reach the
+ * stopped state first.  After creating the thread, wait until it reports
+ * either running or stopped.
+ *
+ * \retval 0          the thread was created
+ * \retval -EALREADY  a scrub thread is already running
+ * \retval <0         failure to create the thread
+ */
+static int do_osd_scrub_start(struct osd_device *dev, __u32 flags)
+{
+       struct osd_scrub     *scrub  = &dev->od_scrub;
+       struct ptlrpc_thread *thread = &scrub->os_thread;
+       struct l_wait_info    lwi    = { 0 };
+       int                   rc;
+       ENTRY;
+
+       for (;;) {
+               /* os_lock: sync status between stop and scrub thread */
+               cfs_spin_lock(&scrub->os_lock);
+               if (thread_is_running(thread)) {
+                       cfs_spin_unlock(&scrub->os_lock);
+                       RETURN(-EALREADY);
+               }
+
+               if (likely(!thread_is_stopping(thread)))
+                       break;
+
+               /* Let the previous instance finish, then re-check. */
+               cfs_spin_unlock(&scrub->os_lock);
+               l_wait_event(thread->t_ctl_waitq,
+                            thread_is_stopped(thread),
+                            &lwi);
+       }
+       cfs_spin_unlock(&scrub->os_lock);
+
+       scrub->os_start_flags = flags;
+       thread_set_flags(thread, 0);
+       rc = cfs_create_thread(osd_scrub_main, dev, 0);
+       if (rc < 0) {
+               CERROR("%.16s: cannot start iteration thread, rc = %d\n",
+                      LDISKFS_SB(osd_sb(dev))->s_es->s_volume_name, rc);
+               RETURN(rc);
+       }
+
+       l_wait_event(thread->t_ctl_waitq,
+                    thread_is_running(thread) || thread_is_stopped(thread),
+                    &lwi);
+
+       RETURN(0);
+}
+
+/*
+ * Start the OI scrub automatically (SS_AUTO).  If the previous scrub has
+ * completed, SS_RESET is requested as well.  An already running scrub is
+ * not an error.
+ *
+ * \retval 0   the scrub was started or is already running
+ * \retval <0  failure reported by do_osd_scrub_start()
+ */
+int osd_scrub_start(struct osd_device *dev)
+{
+       __u32 flags;
+       int   rc;
+       ENTRY;
+
+       flags = (dev->od_scrub.os_file.sf_status == SS_COMPLETED) ?
+               (SS_AUTO | SS_RESET) : SS_AUTO;
+
+       /* od_otable_mutex: prevent concurrent start/stop */
+       cfs_mutex_lock(&dev->od_otable_mutex);
+       rc = do_osd_scrub_start(dev, flags);
+       cfs_mutex_unlock(&dev->od_otable_mutex);
+
+       if (rc == -EALREADY)
+               rc = 0;
+
+       RETURN(rc);
+}
+
+/*
+ * Ask the scrub thread to stop and wait until it reports stopped.
+ * Nothing is done when the thread is in its init or stopped state.
+ */
+static void do_osd_scrub_stop(struct osd_scrub *scrub)
+{
+       struct ptlrpc_thread *thread = &scrub->os_thread;
+       struct l_wait_info    lwi    = { 0 };
+
+       /* os_lock: sync status between stop and scrub thread */
+       cfs_spin_lock(&scrub->os_lock);
+       if (thread_is_init(thread) || thread_is_stopped(thread)) {
+               cfs_spin_unlock(&scrub->os_lock);
+               return;
+       }
+
+       thread_set_flags(thread, SVC_STOPPING);
+       cfs_spin_unlock(&scrub->os_lock);
+
+       cfs_waitq_broadcast(&thread->t_ctl_waitq);
+       l_wait_event(thread->t_ctl_waitq,
+                    thread_is_stopped(thread),
+                    &lwi);
+
+       /* Do not skip the last lock/unlock, which can guarantee that
+        * the caller cannot return until the OI scrub thread exit. */
+       cfs_spin_lock(&scrub->os_lock);
+       cfs_spin_unlock(&scrub->os_lock);
+}
+
+/* Stop the OI scrub of \a dev, serialized against concurrent start/stop. */
+static void osd_scrub_stop(struct osd_device *dev)
+{
+       struct osd_scrub *scrub = &dev->od_scrub;
+
+       /* od_otable_mutex: prevent concurrent start/stop */
+       cfs_mutex_lock(&dev->od_otable_mutex);
+       do_osd_scrub_stop(scrub);
+       cfs_mutex_unlock(&dev->od_otable_mutex);
+}
+
+static const char osd_scrub_name[] = "OI_scrub"; /* scrub file name */
+
+/*
+ * Set up the OI scrub state for @dev.
+ *
+ * Opens (creating it if needed) the scrub file named by osd_scrub_name
+ * under the lvfs context built from dev->od_mnt and keeps a reference on
+ * its inode in os_inode, loads or initializes the in-memory scrub file,
+ * initializes the OI files, and finally starts the OI scrub when the
+ * stored status/flags ask for it and LMD_FLG_NOSCRUB is not set.
+ *
+ * @env: execution environment, used to fetch the osd_thread_info
+ * @dev: the OSD device whose od_scrub is being set up
+ *
+ * Returns 0 on success, otherwise the error code of the failing step.
+ */
+int osd_scrub_setup(const struct lu_env *env, struct osd_device *dev)
+{
+       struct osd_thread_info     *info   = osd_oti_get(env);
+       struct osd_scrub           *scrub  = &dev->od_scrub;
+       struct lvfs_run_ctxt       *ctxt   = &scrub->os_ctxt;
+       struct scrub_file          *sf     = &scrub->os_file;
+       struct osd_inode_id        *id     = &scrub->os_oic.oic_lid;
+       struct super_block         *sb     = osd_sb(dev);
+       struct ldiskfs_super_block *es     = LDISKFS_SB(sb)->s_es;
+       struct inode               *inode;
+       struct lvfs_run_ctxt        saved;
+       struct file                *filp;
+       int                         dirty  = 0;
+       int                         init   = 0;
+       int                         rc     = 0;
+       ENTRY;
+
+       OBD_SET_CTXT_MAGIC(ctxt);
+       ctxt->pwdmnt = dev->od_mnt;
+       ctxt->pwd = dev->od_mnt->mnt_root;
+       ctxt->fs = get_ds();
+
+       cfs_waitq_init(&scrub->os_thread.t_ctl_waitq);
+       cfs_init_rwsem(&scrub->os_rwsem);
+       cfs_spin_lock_init(&scrub->os_lock);
+       CFS_INIT_LIST_HEAD(&scrub->os_inconsistent_items);
+       if (get_mount_flags(dev->od_mount->lmi_sb) & LMD_FLG_NOSCRUB)
+               scrub->os_no_scrub = 1;
+
+       push_ctxt(&saved, ctxt, NULL);
+       filp = filp_open(osd_scrub_name, O_RDWR | O_CREAT, 0644);
+       if (IS_ERR(filp)) {
+               /* Every push_ctxt() needs its pop_ctxt(), also on failure,
+                * otherwise the caller keeps running in the scrub context. */
+               pop_ctxt(&saved, ctxt, NULL);
+               RETURN(PTR_ERR(filp));
+       }
+
+       /* Keep only an inode reference; the struct file is not needed. */
+       scrub->os_inode = igrab(filp->f_dentry->d_inode);
+       filp_close(filp, 0);
+       pop_ctxt(&saved, ctxt, NULL);
+
+       rc = osd_scrub_file_load(scrub);
+       if (rc == -ENOENT) {
+               /* No scrub file content yet: start from a fresh one. */
+               osd_scrub_file_init(scrub, es->s_uuid);
+               dirty = 1;
+               init = 1;
+       } else if (rc != 0) {
+               RETURN(rc);
+       } else {
+               if (memcmp(sf->sf_uuid, es->s_uuid, 16) != 0) {
+                       /* The scrub file belongs to another volume uuid. */
+                       osd_scrub_file_reset(scrub, es->s_uuid,
+                                            SF_INCONSISTENT);
+                       dirty = 1;
+               } else if (sf->sf_status == SS_SCANNING) {
+                       /* Still marked as scanning at setup time: the
+                        * previous scrub did not finish. */
+                       sf->sf_status = SS_CRASHED;
+                       dirty = 1;
+               }
+       }
+
+       /* Resume right after the last checkpoint, if there is one. */
+       if (sf->sf_pos_last_checkpoint != 0)
+               scrub->os_pos_current = sf->sf_pos_last_checkpoint + 1;
+       else
+               scrub->os_pos_current = LDISKFS_FIRST_INO(sb);
+
+       if (dirty != 0) {
+               rc = osd_scrub_file_store(scrub);
+               if (rc != 0)
+                       RETURN(rc);
+       }
+
+       /* Initialize OI files. */
+       rc = osd_oi_init(info, dev);
+       if (rc < 0)
+               RETURN(rc);
+
+       if (init != 0) {
+               /* The scrub file was just initialized: check that the OI
+                * mapping for LU_DOT_LUSTRE_FID points to a usable inode. */
+               rc = __osd_oi_lookup(info, dev, &LU_DOT_LUSTRE_FID, id);
+               if (rc == 0) {
+                       inode = osd_iget(info, dev, id);
+                       if (IS_ERR(inode)) {
+                               rc = PTR_ERR(inode);
+                               /* It is restored from old 2.x backup. */
+                               if (rc == -ENOENT || rc == -ESTALE) {
+                                       osd_scrub_file_reset(scrub, es->s_uuid,
+                                                            SF_INCONSISTENT);
+                                       rc = osd_scrub_file_store(scrub);
+                               }
+                       } else {
+                               iput(inode);
+                       }
+               } else if (rc == -ENOENT) {
+                       /* A missing mapping is not an error here. */
+                       rc = 0;
+               }
+       }
+
+       /* Auto-start the scrub unless disabled by the mount flag: either a
+        * crashed scrub with any of the listed flags set, or a fresh scrub
+        * file flagged as recreated/inconsistent. */
+       if (rc == 0 && !scrub->os_no_scrub &&
+           ((sf->sf_status == SS_CRASHED &&
+             sf->sf_flags & (SF_RECREATED | SF_INCONSISTENT | SF_AUTO)) ||
+            (sf->sf_status == SS_INIT &&
+             sf->sf_flags & (SF_RECREATED | SF_INCONSISTENT))))
+               rc = osd_scrub_start(dev);
+
+       RETURN(rc);
+}
+
+/*
+ * Release the OI scrub resources of @dev.
+ *
+ * Asserts that no otable iterator is attached (od_otable_it == NULL).
+ * If the scrub file inode is held, the scrub is stopped and the inode
+ * reference dropped; the OI files are finalized if od_oi_table is set.
+ */
+void osd_scrub_cleanup(const struct lu_env *env, struct osd_device *dev)
+{
+       struct inode **scrub_inodep = &dev->od_scrub.os_inode;
+
+       LASSERT(dev->od_otable_it == NULL);
+
+       if (*scrub_inodep != NULL) {
+               osd_scrub_stop(dev);
+               iput(*scrub_inodep);
+               *scrub_inodep = NULL;
+       }
+
+       if (dev->od_oi_table == NULL)
+               return;
+
+       osd_oi_fini(osd_oti_get(env), dev);
+}
diff --git a/lustre/osd-ldiskfs/osd_scrub.h b/lustre/osd-ldiskfs/osd_scrub.h
new file mode 100644 (file)
index 0000000..5c9df6d
--- /dev/null
@@ -0,0 +1,195 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License version 2 for more details.  A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2012 Whamcloud, Inc.
+ */
+/*
+ * lustre/osd-ldiskfs/osd_scrub.h
+ *
+ * Shared definitions and declarations for OI scrub.
+ *
+ * Author: Fan Yong <yong.fan@whamcloud.com>
+ */
+
+#ifndef _OSD_SCRUB_H
+# define _OSD_SCRUB_H
+
+#include "osd_oi.h"
+
+#define SCRUB_MAGIC_V1                 0x4C5FD252 /* presumably for sf_magic */
+#define SCRUB_CHECKPOINT_INTERVAL      60 /* unit not shown here; TODO confirm */
+#define SCRUB_OI_BITMAP_SIZE           (OSD_OI_FID_NR_MAX >> 3) /* sf_oi_bitmap[] size */
+#define SCRUB_WINDOW_SIZE              1024 /* presumably the scan window; confirm */
+
+enum scrub_status { /* OI scrub states, see scrub_file::sf_status. */
+       /* The scrub file is newly created: for a new MDT, when upgrading
+        * from an old disk, or after re-creating the scrub file manually. */
+       SS_INIT         = 0,
+
+       /* The scrub is checking/repairing the OI files. */
+       SS_SCANNING     = 1,
+
+       /* The scrub checked/repaired the OI files successfully. */
+       SS_COMPLETED    = 2,
+
+       /* The scrub failed to check/repair the OI files. */
+       SS_FAILED       = 3,
+
+       /* The scrub was stopped manually, the OI files may be inconsistent. */
+       SS_PAUSED       = 4,
+
+       /* The scrub crashed during the scanning and should be restarted.
+        * osd_scrub_setup() sets this when the loaded status is still
+        * SS_SCANNING. */
+       SS_CRASHED      = 5,
+};
+
+enum scrub_flags { /* Bit flags, see scrub_file::sf_flags. */
+       /* OI files have been recreated, OI mappings should be re-inserted. */
+       SF_RECREATED    = 0x0000000000000001ULL,
+
+       /* OI files are invalid and should be rebuilt ASAP. */
+       SF_INCONSISTENT = 0x0000000000000002ULL,
+
+       /* OI scrub is triggered automatically. */
+       SF_AUTO         = 0x0000000000000004ULL,
+};
+
+enum scrub_param { /* Bit flags, see scrub_file::sf_param. */
+       /* Exit when fail. */
+       SP_FAILOUT      = 0x0001,
+};
+
+enum scrub_start { /* Start flags given to do_osd_scrub_start() and kept in
+                    * os_start_flags. NOTE: they share the SS_ prefix with
+                    * enum scrub_status but are unrelated bit flags. */
+       /* Set failout flag. */
+       SS_SET_FAILOUT          = 0x00000001,
+
+       /* Clear failout flag. */
+       SS_CLEAR_FAILOUT        = 0x00000002,
+
+       /* Reset scrub start position. */
+       SS_RESET                = 0x00000004,
+
+       /* Trigger scrub automatically. */
+       SS_AUTO                 = 0x00000008,
+};
+
+struct scrub_file { /* Contents of the scrub file; struct osd_scrub holds one
+                     * in-memory copy (os_file) and one load/store buffer
+                     * (os_file_disk). Presumably also the stored layout, so
+                     * do not reorder or resize fields - TODO confirm. */
+       /* 128-bit uuid for volume. */
+       __u8    sf_uuid[16];
+
+       /* See 'enum scrub_flags'. */
+       __u64   sf_flags;
+
+       /* The scrub magic. */
+       __u32   sf_magic;
+
+       /* See 'enum scrub_status'. */
+       __u16   sf_status;
+
+       /* See 'enum scrub_param'. */
+       __u16   sf_param;
+
+       /* The time when the last OI scrub completed. */
+       __u64   sf_time_last_complete;
+
+       /* The time when the latest OI scrub was started. */
+       __u64   sf_time_latest_start;
+
+       /* The time of the last OI scrub checkpoint. */
+       __u64   sf_time_last_checkpoint;
+
+       /* The position the latest OI scrub started from. */
+       __u64   sf_pos_latest_start;
+
+       /* The position of the last OI scrub checkpoint. */
+       __u64   sf_pos_last_checkpoint;
+
+       /* The position of the first object that should be updated. */
+       __u64   sf_pos_first_inconsistent;
+
+       /* How many objects have been checked. */
+       __u64   sf_items_checked;
+
+       /* How many objects have been updated. */
+       __u64   sf_items_updated;
+
+       /* How many objects failed to be processed. */
+       __u64   sf_items_failed;
+
+       /* How many prior objects have been updated during scanning. */
+       __u64   sf_items_updated_prior;
+
+       /* How long the OI scrub has run. */
+       __u32   sf_run_time;
+
+       /* How many OI scrubs have completed on the device. */
+       __u32   sf_success_count;
+
+       /* How many OI files. */
+       __u16   sf_oi_count;
+
+       /* Update the magic or flags if you want to use the reserved fields. */
+       __u16   sf_reserved_0;
+       __u32   sf_reserved_1;
+       __u64   sf_reserved_2[16];
+
+       /* Bitmap for the case that OI files have been recreated;
+        * SCRUB_OI_BITMAP_SIZE bytes. */
+       __u8    sf_oi_bitmap[SCRUB_OI_BITMAP_SIZE];
+};
+
+struct osd_scrub { /* Per-device OI scrub state (osd_device::od_scrub). */
+       struct lvfs_run_ctxt    os_ctxt; /* pushed to open the scrub file */
+       struct ptlrpc_thread    os_thread; /* state/waitq of the scrub thread */
+       struct osd_idmap_cache  os_oic;
+       cfs_list_t              os_inconsistent_items;
+
+       /* write lock for scrub prep/update/post/checkpoint,
+        * read lock for scrub dump. */
+       cfs_rw_semaphore_t      os_rwsem;
+       cfs_spinlock_t          os_lock; /* sync status: stop vs. scrub thread */
+
+       /* Scrub file in memory. */
+       struct scrub_file       os_file;
+
+       /* Buffer for scrub file load/store. */
+       struct scrub_file       os_file_disk;
+
+       /* Inode for the scrub file; the reference is taken in
+        * osd_scrub_setup() and dropped in osd_scrub_cleanup(). */
+       struct inode           *os_inode;
+
+       /* The time for last checkpoint, jiffies */
+       cfs_time_t              os_time_last_checkpoint;
+
+       /* The time for next checkpoint, jiffies */
+       cfs_time_t              os_time_next_checkpoint;
+
+       /* How many objects have been checked since last checkpoint. */
+       __u32                   os_new_checked;
+       __u32                   os_pos_current; /* set at setup: last checkpoint
+                                                * + 1, else LDISKFS_FIRST_INO */
+       __u32                   os_start_flags; /* see 'enum scrub_start' */
+       unsigned int            os_in_prior:1, /* process inconsistent item
+                                               * found by RPC prior */
+                               os_waiting:1, /* Waiting for scan window. */
+                               os_full_speed:1, /* run w/o speed limit */
+                               os_no_scrub:1; /* NOT auto trigger OI scrub*/
+};
+
+#endif /* _OSD_SCRUB_H */