Whamcloud - gitweb
LU-1816 scrub: OI scrub skips new created objects for once
[fs/lustre-release.git] / lustre / osd-ldiskfs / osd_scrub.c
index ccdca14..65157f4 100644 (file)
@@ -157,6 +157,8 @@ void osd_scrub_file_reset(struct osd_scrub *scrub, __u8 *uuid, __u64 flags)
        sf->sf_items_updated = 0;
        sf->sf_items_failed = 0;
        sf->sf_items_updated_prior = 0;
+       sf->sf_items_noscrub = 0;
+       sf->sf_items_igif = 0;
 }
 
 static int osd_scrub_file_load(struct osd_scrub *scrub)
@@ -214,7 +216,7 @@ int osd_scrub_file_store(struct osd_scrub *scrub)
 
        osd_scrub_file_to_le(&scrub->os_file_disk, &scrub->os_file);
        rc = osd_ldiskfs_write_record(scrub->os_inode, &scrub->os_file_disk,
-                                     len, &pos, jh);
+                                     len, 0, &pos, jh);
        ldiskfs_journal_stop(jh);
        if (rc != 0)
                CERROR("%.16s: fail to store scrub file, expected = %d, "
@@ -281,8 +283,8 @@ static int osd_scrub_prep(struct osd_device *dev)
        RETURN(rc);
 }
 
-static int osd_scrub_error_handler(struct osd_device *dev,
-                                  struct osd_inode_id *lid, int rc)
+static int
+osd_scrub_error(struct osd_device *dev, struct osd_inode_id *lid, int rc)
 {
        struct osd_scrub  *scrub = &dev->od_scrub;
        struct scrub_file *sf    = &scrub->os_file;
@@ -298,14 +300,14 @@ static int osd_scrub_error_handler(struct osd_device *dev,
 }
 
 static int
-osd_scrub_check_update(struct osd_thread_info *info,  struct osd_device *dev,
+osd_scrub_check_update(struct osd_thread_info *info, struct osd_device *dev,
                       struct osd_idmap_cache *oic)
 {
        struct osd_scrub             *scrub  = &dev->od_scrub;
        struct scrub_file            *sf     = &scrub->os_file;
        struct osd_inode_id          *lid2   = &info->oti_id;
        struct lu_fid                *oi_fid = &info->oti_fid;
-       struct osd_inode_id          *oi_id  = &info->oti_id;
+       struct osd_inode_id          *oi_id  = &info->oti_id2;
        handle_t                     *jh     = NULL;
        struct osd_inconsistent_item *oii    = NULL;
        struct inode                 *inode  = NULL;
@@ -475,7 +477,10 @@ static void osd_scrub_post(struct osd_scrub *scrub, int result)
                sf->sf_time_last_complete = sf->sf_time_last_checkpoint;
                sf->sf_success_count++;
        } else if (result == 0) {
-               sf->sf_status = SS_PAUSED;
+               if (scrub->os_paused)
+                       sf->sf_status = SS_PAUSED;
+               else
+                       sf->sf_status = SS_STOPPED;
        } else {
                sf->sf_status = SS_FAILED;
        }
@@ -491,22 +496,116 @@ static void osd_scrub_post(struct osd_scrub *scrub, int result)
        EXIT;
 }
 
-#define SCRUB_NEXT_BREAK       1
-#define SCRUB_NEXT_CONTINUE    2
+#define SCRUB_NEXT_BREAK       1 /* exit current loop and process next group */
+#define SCRUB_NEXT_CONTINUE    2 /* skip current object and process next bit */
+#define SCRUB_NEXT_EXIT        3 /* exit all the loops */
+#define SCRUB_NEXT_WAIT        4 /* wait for free cache slot */
+#define SCRUB_NEXT_CRASH       5 /* simulate system crash during OI scrub */
+#define SCRUB_NEXT_FATAL       6 /* simulate failure during OI scrub */
+#define SCRUB_NEXT_NOSCRUB     7 /* new created object, no scrub on it */
+#define SCRUB_NEXT_IGIF        8 /* IGIF object */
+
+struct osd_iit_param { /* per-group cursor handed to the next/exec policies */
+       struct super_block *sb;     /* super block of the device being walked */
+       struct buffer_head *bitmap; /* inode bitmap read for group @bg */
+       ldiskfs_group_t bg;         /* current group: (pos - 1) / inodes-per-group */
+       __u32 gbase;                /* 1 + bg * inodes-per-group */
+       __u32 offset;               /* bit offset inside @bitmap: pos - gbase */
+};
+
+typedef int (*osd_iit_next_policy)(struct osd_thread_info *info,
+                                  struct osd_device *dev,
+                                  struct osd_iit_param *param,
+                                  struct osd_idmap_cache **oic,
+                                  int noslot);
+
+typedef int (*osd_iit_exec_policy)(struct osd_thread_info *info,
+                                  struct osd_device *dev,
+                                  struct osd_iit_param *param,
+                                  struct osd_idmap_cache *oic,
+                                  int *noslot, int rc);
 
-static int
-osd_scrub_next(struct osd_thread_info *info, struct osd_device *dev,
-              struct osd_scrub *scrub, struct super_block *sb,
-              ldiskfs_group_t bg, struct buffer_head *bitmap, __u32 gbase,
-              __u32 *offset, struct osd_idmap_cache **oic)
+static inline int osd_scrub_has_window(struct osd_scrub *scrub,
+                                      struct osd_otable_cache *ooc)
 {
-       struct osd_inconsistent_item *oii;
-       struct lu_fid                *fid;
-       struct osd_inode_id          *lid;
-       struct inode                 *inode;
-       int                          rc    = 0;
+       return scrub->os_pos_current < ooc->ooc_pos_preload + SCRUB_WINDOW_SIZE;
+}
+
+static int osd_iit_next(struct osd_iit_param *param, __u32 *pos)
+{ /* Move @param->offset to the next bit found in the group's inode bitmap. */
+       param->offset = ldiskfs_find_next_bit(param->bitmap->b_data,
+                       LDISKFS_INODES_PER_GROUP(param->sb), param->offset);
+       if (param->offset >= LDISKFS_INODES_PER_GROUP(param->sb)) {
+               *pos = 1 + (param->bg+1) * LDISKFS_INODES_PER_GROUP(param->sb);
+               return SCRUB_NEXT_BREAK; /* @pos = first inode of next group */
+       } else {
+               *pos = param->gbase + param->offset;
+               return 0; /* @pos = inode number matching the found bit */
+       }
+}
+
+static int osd_iit_iget(struct osd_thread_info *info, struct osd_device *dev,
+                       struct lu_fid *fid, struct osd_inode_id *lid, __u32 pos,
+                       struct super_block *sb, struct inode **pinode)
+{ /* Load inode number @pos via osd_iget_fid(); on success set *@pinode. */
+       struct inode *inode;
+       int           rc;
+
+       osd_id_gen(lid, pos, OSD_OII_NOGEN);
+       inode = osd_iget_fid(info, dev, lid, fid);
+       if (IS_ERR(inode)) {
+               rc = PTR_ERR(inode);
+               /* The inode may have been removed after the bitmap search, or
+                * the file is newly created and not yet initialized. */
+               if (rc == -ENOENT || rc == -ESTALE)
+                       return SCRUB_NEXT_CONTINUE;
+
+               CERROR("%.16s: fail to read inode, ino# = %u, rc = %d\n",
+                      LDISKFS_SB(sb)->s_es->s_volume_name, pos, rc);
+               return rc; /* any other error is handed back unchanged */
+       }
+
+       *pinode = inode; /* callers in this file iput() it when done */
+       return 0;
+}
+
+static int osd_scrub_next(struct osd_thread_info *info, struct osd_device *dev,
+                         struct osd_iit_param *param,
+                         struct osd_idmap_cache **oic, int noslot)
+{
+       struct osd_scrub     *scrub  = &dev->od_scrub;
+       struct ptlrpc_thread *thread = &scrub->os_thread;
+       struct lu_fid        *fid;
+       struct osd_inode_id  *lid;
+       struct inode         *inode;
+       int                   rc;
+
+       if (OBD_FAIL_CHECK(OBD_FAIL_OSD_SCRUB_DELAY) && cfs_fail_val > 0) {
+               struct l_wait_info lwi;
+
+               lwi = LWI_TIMEOUT(cfs_time_seconds(cfs_fail_val), NULL, NULL);
+               l_wait_event(thread->t_ctl_waitq,
+                            !cfs_list_empty(&scrub->os_inconsistent_items) ||
+                            !thread_is_running(thread),
+                            &lwi);
+       }
+
+       if (OBD_FAIL_CHECK(OBD_FAIL_OSD_SCRUB_CRASH)) {
+               cfs_spin_lock(&scrub->os_lock);
+               thread_set_flags(thread, SVC_STOPPING);
+               cfs_spin_unlock(&scrub->os_lock);
+               return SCRUB_NEXT_CRASH;
+       }
+
+       if (OBD_FAIL_CHECK(OBD_FAIL_OSD_SCRUB_FATAL))
+               return SCRUB_NEXT_FATAL;
+
+       if (unlikely(!thread_is_running(thread)))
+               return SCRUB_NEXT_EXIT;
 
        if (!cfs_list_empty(&scrub->os_inconsistent_items)) {
+               struct osd_inconsistent_item *oii;
+
                oii = cfs_list_entry(scrub->os_inconsistent_items.next,
                                     struct osd_inconsistent_item, oii_list);
                *oic = &oii->oii_cache;
@@ -514,65 +613,256 @@ osd_scrub_next(struct osd_thread_info *info, struct osd_device *dev,
                return 0;
        }
 
+       if (noslot != 0)
+               return SCRUB_NEXT_WAIT;
+
+       rc = osd_iit_next(param, &scrub->os_pos_current);
+       if (rc != 0)
+               return rc;
+
        *oic = &scrub->os_oic;
        fid = &(*oic)->oic_fid;
        lid = &(*oic)->oic_lid;
-       *offset = ldiskfs_find_next_bit(bitmap->b_data,
-                                       LDISKFS_INODES_PER_GROUP(sb), *offset);
-       if (*offset >= LDISKFS_INODES_PER_GROUP(sb)) {
-               brelse(bitmap);
-               scrub->os_pos_current = 1 + (bg + 1) *
-                                       LDISKFS_INODES_PER_GROUP(sb);
-               return SCRUB_NEXT_BREAK;
+       rc = osd_iit_iget(info, dev, fid, lid,
+                         scrub->os_pos_current, param->sb, &inode);
+       if (rc != 0)
+               return rc;
+
+       if (inode->i_state & I_LUSTRE_NOSCRUB) {
+               /* Only skip it the first time OI scrub accesses it. */
+               inode->i_state &= ~I_LUSTRE_NOSCRUB;
+               rc = SCRUB_NEXT_NOSCRUB;
+       } else if (!fid_is_norm(fid)) {
+               rc = SCRUB_NEXT_IGIF;
        }
 
-       scrub->os_pos_current = gbase + *offset;
-       osd_id_gen(lid, scrub->os_pos_current, OSD_OII_NOGEN);
-       inode = osd_iget_fid(info, dev, lid, fid);
-       if (IS_ERR(inode)) {
-               rc = PTR_ERR(inode);
-               /* The inode may be removed after bitmap searching, or the
-                * file is new created without inode initialized yet. */
-               if (rc == -ENOENT || rc == -ESTALE)
-                       rc = SCRUB_NEXT_CONTINUE;
-               else
-                       CERROR("%.16s: fail to read inode, group = %u, "
-                              "ino# = %u, rc = %d\n",
-                              LDISKFS_SB(sb)->s_es->s_volume_name,
-                              bg, scrub->os_pos_current, rc);
-       } else {
-               if (fid_is_igif(fid) || fid_is_idif(fid) ||
-                   fid_seq(fid) == FID_SEQ_LLOG ||
-                   fid_seq(fid) == FID_SEQ_LOCAL_FILE ||
-                   fid_seq_is_rsvd(fid_seq(fid)) ||
-                   inode->i_state & I_LUSTRE_NOSCRUB)
-                       rc = SCRUB_NEXT_CONTINUE;
+       iput(inode);
+       return rc;
+}
+
+static int osd_preload_next(struct osd_thread_info *info,
+                           struct osd_device *dev, struct osd_iit_param *param,
+                           struct osd_idmap_cache **oic, int noslot)
+{
+       struct osd_otable_cache *ooc    = &dev->od_otable_it->ooi_cache;
+       struct osd_scrub        *scrub;
+       struct ptlrpc_thread    *thread;
+       struct inode            *inode;
+       int                      rc;
+
+       rc = osd_iit_next(param, &ooc->ooc_pos_preload);
+       if (rc != 0)
+               return rc;
+
+       scrub = &dev->od_scrub;
+       thread = &scrub->os_thread;
+       if (thread_is_running(thread) &&
+           ooc->ooc_pos_preload >= scrub->os_pos_current)
+               return SCRUB_NEXT_EXIT;
+
+       rc = osd_iit_iget(info, dev,
+                         &ooc->ooc_cache[ooc->ooc_producer_idx].oic_fid,
+                         &ooc->ooc_cache[ooc->ooc_producer_idx].oic_lid,
+                         ooc->ooc_pos_preload, param->sb, &inode);
+       /* On success the position must advance; on failure the upper layer
+        * LFSCK may ignore the error, so still skip this inode next time. */
+       ooc->ooc_pos_preload = param->gbase + ++(param->offset);
+       if (rc == 0)
                iput(inode);
-       }
        return rc;
 }
 
-static inline int osd_scrub_has_window(struct osd_scrub *scrub,
-                                      struct osd_otable_cache *ooc)
+static int osd_scrub_exec(struct osd_thread_info *info, struct osd_device *dev,
+                         struct osd_iit_param *param,
+                         struct osd_idmap_cache *oic, int *noslot, int rc)
 {
-       return scrub->os_pos_current < ooc->ooc_pos_preload + SCRUB_WINDOW_SIZE;
+       struct l_wait_info       lwi    = { 0 };
+       struct osd_scrub        *scrub  = &dev->od_scrub;
+       struct scrub_file       *sf     = &scrub->os_file;
+       __u64                   *items  = NULL;
+       struct ptlrpc_thread    *thread = &scrub->os_thread;
+       struct osd_otable_it    *it     = dev->od_otable_it;
+       struct osd_otable_cache *ooc    = it ? &it->ooi_cache : NULL;
+
+       switch (rc) {
+       case SCRUB_NEXT_CONTINUE:
+               goto next;
+       case SCRUB_NEXT_WAIT:
+               goto wait;
+       case SCRUB_NEXT_NOSCRUB:
+               items = &sf->sf_items_noscrub;
+               break;
+       case SCRUB_NEXT_IGIF:
+               items = &sf->sf_items_igif;
+               break;
+       }
+
+       if (items != NULL) {
+               cfs_down_write(&scrub->os_rwsem);
+               scrub->os_new_checked++;
+               (*items)++;
+               cfs_up_write(&scrub->os_rwsem);
+               goto next;
+       }
+
+       LASSERTF(rc <= 0, "unexpected rc = %d\n", rc);
+
+       if (rc != 0)
+               rc = osd_scrub_error(dev, &oic->oic_lid, rc);
+       else
+               rc = osd_scrub_check_update(info, dev, oic);
+       if (rc != 0)
+               return rc;
+
+       rc = osd_scrub_checkpoint(scrub);
+       if (rc != 0) {
+               CERROR("%.16s: fail to checkpoint, pos = %u, rc = %d\n",
+                      LDISKFS_SB(param->sb)->s_es->s_volume_name,
+                      scrub->os_pos_current, rc);
+               /* Continue, as long as the scrub itself can go ahead. */
+       }
+
+       if (scrub->os_in_prior) {
+               scrub->os_in_prior = 0;
+               return 0;
+       }
+
+next:
+       scrub->os_pos_current = param->gbase + ++(param->offset);
+       if (it != NULL && it->ooi_waiting &&
+           ooc->ooc_pos_preload < scrub->os_pos_current) {
+               it->ooi_waiting = 0;
+               cfs_waitq_broadcast(&thread->t_ctl_waitq);
+       }
+
+       if (scrub->os_full_speed || rc == SCRUB_NEXT_CONTINUE)
+               return 0;
+
+wait:
+       if (osd_scrub_has_window(scrub, ooc)) {
+               *noslot = 0;
+               return 0;
+       }
+
+       scrub->os_waiting = 1;
+       l_wait_event(thread->t_ctl_waitq,
+                    osd_scrub_has_window(scrub, ooc) ||
+                    !cfs_list_empty(&scrub->os_inconsistent_items) ||
+                    !thread_is_running(thread),
+                    &lwi);
+       scrub->os_waiting = 0;
+
+       if (osd_scrub_has_window(scrub, ooc))
+               *noslot = 0;
+       else
+               *noslot = 1;
+       return 0;
+}
+
+static int osd_preload_exec(struct osd_thread_info *info,
+                           struct osd_device *dev, struct osd_iit_param *param,
+                           struct osd_idmap_cache *oic, int *noslot, int rc)
+{ /* exec policy paired with osd_preload_next(); @rc is that call's result. */
+       struct osd_otable_cache *ooc = &dev->od_otable_it->ooi_cache;
+
+       if (rc == 0) { /* inode loaded: count it and advance the producer index */
+               ooc->ooc_cached_items++;
+               ooc->ooc_producer_idx = (ooc->ooc_producer_idx + 1) &
+                                       ~OSD_OTABLE_IT_CACHE_MASK;
+       }
+       return rc > 0 ? 0 : rc; /* positive SCRUB_NEXT_* codes are not errors */
+}
+
+#define SCRUB_IT_ALL   1 /* osd_inode_iteration() walked past the last inode */
+#define SCRUB_IT_CRASH 2 /* the next() policy returned SCRUB_NEXT_CRASH */
+
+static int osd_inode_iteration(struct osd_thread_info *info,
+                              struct osd_device *dev, __u32 max, int preload)
+{ /* Walk the inode bitmaps group by group, driving a next/exec policy pair. */
+       osd_iit_next_policy   next;
+       osd_iit_exec_policy   exec;
+       __u32                *pos;
+       __u32                *count;
+       struct osd_iit_param  param;
+       __u32                 limit;
+       int                   noslot = 0;
+       int                   rc;
+       ENTRY;
+
+       if (preload == 0) { /* OI scrub: cursor and counter live in od_scrub */
+               struct osd_scrub *scrub = &dev->od_scrub;
+
+               next = osd_scrub_next;
+               exec = osd_scrub_exec;
+               pos = &scrub->os_pos_current;
+               count = &scrub->os_new_checked;
+       } else { /* preload: cursor and counter live in the otable cache */
+               struct osd_otable_cache *ooc = &dev->od_otable_it->ooi_cache;
+
+               next = osd_preload_next;
+               exec = osd_preload_exec;
+               pos = &ooc->ooc_pos_preload;
+               count = &ooc->ooc_cached_items;
+       }
+       param.sb = osd_sb(dev);
+       limit = le32_to_cpu(LDISKFS_SB(param.sb)->s_es->s_inodes_count);
+
+       while (*pos <= limit && *count < max) { /* one pass per block group */
+               struct osd_idmap_cache *oic = NULL;
+
+               param.bg = (*pos - 1) / LDISKFS_INODES_PER_GROUP(param.sb);
+               param.offset = (*pos - 1) % LDISKFS_INODES_PER_GROUP(param.sb);
+               param.gbase = 1 + param.bg * LDISKFS_INODES_PER_GROUP(param.sb);
+               param.bitmap = ldiskfs_read_inode_bitmap(param.sb, param.bg);
+               if (param.bitmap == NULL) {
+                       CERROR("%.16s: fail to read bitmap for %u, "
+                              "scrub will stop, urgent mode\n",
+                              LDISKFS_SB(param.sb)->s_es->s_volume_name,
+                              (__u32)param.bg);
+                       RETURN(-EIO);
+               }
+
+               while (param.offset < LDISKFS_INODES_PER_GROUP(param.sb) &&
+                      *count < max) {
+                       rc = next(info, dev, &param, &oic, noslot);
+                       switch (rc) { /* codes that end the inner loop */
+                       case SCRUB_NEXT_BREAK: /* go on to the next group */
+                               goto next_group;
+                       case SCRUB_NEXT_EXIT:
+                               brelse(param.bitmap);
+                               RETURN(0);
+                       case SCRUB_NEXT_CRASH:
+                               brelse(param.bitmap);
+                               RETURN(SCRUB_IT_CRASH);
+                       case SCRUB_NEXT_FATAL:
+                               brelse(param.bitmap);
+                               RETURN(-EINVAL);
+                       }
+
+                       rc = exec(info, dev, &param, oic, &noslot, rc);
+                       if (rc != 0) { /* non-zero from exec() aborts the walk */
+                               brelse(param.bitmap);
+                               RETURN(rc);
+                       }
+               }
+
+next_group:
+               brelse(param.bitmap);
+       }
+
+       if (*pos > limit)
+               RETURN(SCRUB_IT_ALL); /* cursor moved past s_inodes_count */
+       RETURN(0); /* loop ended with *pos <= limit, i.e. *count reached @max */
 }
 
 static int osd_scrub_main(void *args)
 {
-       struct lu_env                 env;
-       struct osd_thread_info       *info;
-       struct osd_device            *dev    = (struct osd_device *)args;
-       struct osd_scrub             *scrub  = &dev->od_scrub;
-       struct ptlrpc_thread         *thread = &scrub->os_thread;
-       cfs_list_t                   *list   = &scrub->os_inconsistent_items;
-       struct l_wait_info            lwi    = { 0 };
-       struct super_block           *sb     = osd_sb(dev);
-       struct osd_otable_it         *it     = NULL;
-       struct osd_otable_cache      *ooc    = NULL;
-       int                           noslot = 0;
-       int                           rc;
-       __u32                         max;
+       struct lu_env         env;
+       struct osd_device    *dev    = (struct osd_device *)args;
+       struct osd_scrub     *scrub  = &dev->od_scrub;
+       struct ptlrpc_thread *thread = &scrub->os_thread;
+       struct super_block   *sb     = osd_sb(dev);
+       int                   rc;
        ENTRY;
 
        cfs_daemonize("OI_scrub");
@@ -583,7 +873,6 @@ static int osd_scrub_main(void *args)
                GOTO(noenv, rc);
        }
 
-       info = osd_oti_get(&env);
        rc = osd_scrub_prep(dev);
        if (rc != 0) {
                CERROR("%.16s: OI scrub, fail to scrub prep, rc = %d\n",
@@ -592,10 +881,10 @@ static int osd_scrub_main(void *args)
        }
 
        if (!scrub->os_full_speed) {
-               LASSERT(dev->od_otable_it != NULL);
+               struct l_wait_info lwi = { 0 };
+               struct osd_otable_it *it = dev->od_otable_it;
+               struct osd_otable_cache *ooc = &it->ooi_cache;
 
-               it = dev->od_otable_it;
-               ooc = &it->ooi_cache;
                l_wait_event(thread->t_ctl_waitq,
                             it->ooi_user_ready || !thread_is_running(thread),
                             &lwi);
@@ -609,108 +898,10 @@ static int osd_scrub_main(void *args)
        CDEBUG(D_LFSCK, "OI scrub: flags = 0x%x, pos = %u\n",
               scrub->os_start_flags, scrub->os_pos_current);
 
-       max = le32_to_cpu(LDISKFS_SB(sb)->s_es->s_inodes_count);
-       while (scrub->os_pos_current <= max) {
-               struct buffer_head *bitmap = NULL;
-               struct osd_idmap_cache *oic = NULL;
-               ldiskfs_group_t bg = (scrub->os_pos_current - 1) /
-                                    LDISKFS_INODES_PER_GROUP(sb);
-               __u32 offset = (scrub->os_pos_current - 1) %
-                              LDISKFS_INODES_PER_GROUP(sb);
-               __u32 gbase = 1 + bg * LDISKFS_INODES_PER_GROUP(sb);
-
-               bitmap = ldiskfs_read_inode_bitmap(sb, bg);
-               if (bitmap == NULL) {
-                       CERROR("%.16s: fail to read bitmap at pos = %u, "
-                              "bg = %u, scrub will stop\n",
-                              LDISKFS_SB(sb)->s_es->s_volume_name,
-                              scrub->os_pos_current, (__u32)bg);
-                       GOTO(post, rc = -EIO);
-               }
-
-               while (offset < LDISKFS_INODES_PER_GROUP(sb)) {
-                       if (unlikely(!thread_is_running(thread))) {
-                               brelse(bitmap);
-                               GOTO(post, rc = 0);
-                       }
-
-                       if (cfs_list_empty(list) && noslot != 0)
-                               goto wait;
-
-                       rc = osd_scrub_next(info, dev, scrub, sb, bg,
-                                           bitmap, gbase, &offset, &oic);
-                       if (rc == SCRUB_NEXT_BREAK)
-                               break;
-                       else if (rc == SCRUB_NEXT_CONTINUE)
-                               goto next;
-
-                       if (rc != 0)
-                               rc = osd_scrub_error_handler(dev, &oic->oic_lid,
-                                                            rc);
-                       else
-                               rc = osd_scrub_check_update(info, dev, oic);
-                       if (rc != 0) {
-                               brelse(bitmap);
-                               GOTO(post, rc);
-                       }
-
-                       rc = osd_scrub_checkpoint(scrub);
-                       if (rc != 0) {
-                               CERROR("%.16s: fail to checkpoint, pos = %u, "
-                                      "rc = %d\n",
-                                      LDISKFS_SB(sb)->s_es->s_volume_name,
-                                      scrub->os_pos_current, rc);
-                               brelse(bitmap);
-                               GOTO(post, rc);
-                       }
-
-                       if (scrub->os_in_prior) {
-                               scrub->os_in_prior = 0;
-                               continue;
-                       }
-
-next:
-                       scrub->os_pos_current = gbase + ++offset;
-                       if (dev->od_otable_it != NULL) {
-                               if (unlikely(it == NULL)) {
-                                       it = dev->od_otable_it;
-                                       ooc = &it->ooi_cache;
-                               }
-
-                               if (it->ooi_waiting &&
-                                   (ooc->ooc_pos_preload <
-                                    scrub->os_pos_current)) {
-                                       it->ooi_waiting = 0;
-                                       cfs_waitq_broadcast(
-                                                       &thread->t_ctl_waitq);
-                               }
-                       }
-
-                       if (scrub->os_full_speed || rc == SCRUB_NEXT_CONTINUE)
-                               continue;
-
-wait:
-                       if (osd_scrub_has_window(scrub, ooc)) {
-                               noslot = 0;
-                               continue;
-                       }
-
-                       scrub->os_waiting = 1;
-                       l_wait_event(thread->t_ctl_waitq,
-                                    osd_scrub_has_window(scrub, ooc) ||
-                                    !cfs_list_empty(list) ||
-                                    !thread_is_running(thread),
-                                    &lwi);
-                       scrub->os_waiting = 0;
-
-                       if (osd_scrub_has_window(scrub, ooc))
-                               noslot = 0;
-                       else
-                               noslot = 1;
-               }
-       }
-
-       GOTO(post, rc = (scrub->os_pos_current > max ? 1 : rc));
+       rc = osd_inode_iteration(osd_oti_get(&env), dev, ~0U, 0);
+       if (unlikely(rc == SCRUB_IT_CRASH))
+               GOTO(out, rc = -EINVAL);
+       GOTO(post, rc);
 
 post:
        osd_scrub_post(scrub, rc);
@@ -718,10 +909,10 @@ post:
               rc, scrub->os_pos_current);
 
 out:
-       while (!cfs_list_empty(list)) {
+       while (!cfs_list_empty(&scrub->os_inconsistent_items)) {
                struct osd_inconsistent_item *oii;
 
-               oii = cfs_list_entry(list->next,
+               oii = cfs_list_entry(scrub->os_inconsistent_items.next,
                                     struct osd_inconsistent_item, oii_list);
                cfs_list_del_init(&oii->oii_list);
                OBD_FREE_PTR(oii);
@@ -817,6 +1008,7 @@ static void osd_scrub_stop(struct osd_device *dev)
 {
        /* od_otable_mutex: prevent curcurrent start/stop */
        cfs_mutex_lock(&dev->od_otable_mutex);
+       dev->od_scrub.os_paused = 1;
        do_osd_scrub_stop(&dev->od_scrub);
        cfs_mutex_unlock(&dev->od_otable_mutex);
 }
@@ -840,6 +1032,7 @@ int osd_scrub_setup(const struct lu_env *env, struct osd_device *dev)
        int                         rc     = 0;
        ENTRY;
 
+       memset(scrub, 0, sizeof(*scrub));
        OBD_SET_CTXT_MAGIC(ctxt);
        ctxt->pwdmnt = dev->od_mnt;
        ctxt->pwd = dev->od_mnt->mnt_root;
@@ -915,7 +1108,8 @@ int osd_scrub_setup(const struct lu_env *env, struct osd_device *dev)
        }
 
        if (rc == 0 && !scrub->os_no_scrub &&
-           ((sf->sf_status == SS_CRASHED &&
+           ((sf->sf_status == SS_PAUSED) ||
+            (sf->sf_status == SS_CRASHED &&
              sf->sf_flags & (SF_RECREATED | SF_INCONSISTENT | SF_AUTO)) ||
             (sf->sf_status == SS_INIT &&
              sf->sf_flags & (SF_RECREATED | SF_INCONSISTENT))))
@@ -938,3 +1132,516 @@ void osd_scrub_cleanup(const struct lu_env *env, struct osd_device *dev)
        if (dev->od_oi_table != NULL)
                osd_oi_fini(osd_oti_get(env), dev);
 }
+
+/**
+ * Create the object-table iterator of a device and start (or attach to) the
+ * OI scrub that feeds it.
+ *
+ * Only one iterator may exist per device at a time; it is published in
+ * dev->od_otable_it while od_otable_mutex is held.
+ *
+ * \param[in] env      execution environment (not used here)
+ * \param[in] dt       object whose device is going to be iterated
+ * \param[in] attr     iteration attributes: the dt_otable_it_flags are taken
+ *                     from (attr >> DT_OTABLE_IT_FLAGS_SHIFT), the
+ *                     dt_otable_it_valid bits from
+ *                     (attr & ~DT_OTABLE_IT_FLAGS_MASK)
+ * \param[in] capa     capability (not used here)
+ *
+ * \retval             the new iterator on success
+ * \retval             ERR_PTR(-EALREADY) if the device already has an iterator
+ * \retval             ERR_PTR(-ENOMEM) if the iterator cannot be allocated
+ * \retval             ERR_PTR(rc) with the negative value returned by
+ *                     do_osd_scrub_start() for any other start failure
+ */
+static struct dt_it *osd_otable_it_init(const struct lu_env *env,
+                                      struct dt_object *dt, __u32 attr,
+                                      struct lustre_capa *capa)
+{
+       enum dt_otable_it_flags flags = attr >> DT_OTABLE_IT_FLAGS_SHIFT;
+       enum dt_otable_it_valid valid = attr & ~DT_OTABLE_IT_FLAGS_MASK;
+       struct osd_device      *dev   = osd_dev(dt->do_lu.lo_dev);
+       struct osd_scrub       *scrub = &dev->od_scrub;
+       struct osd_otable_it   *it;
+       __u32                   start = 0;
+       int                     rc;
+       ENTRY;
+
+       /* od_otable_mutex: prevent concurrent init/fini */
+       cfs_mutex_lock(&dev->od_otable_mutex);
+       if (dev->od_otable_it != NULL)
+               GOTO(out, it = ERR_PTR(-EALREADY));
+
+       OBD_ALLOC_PTR(it);
+       if (it == NULL)
+               GOTO(out, it = ERR_PTR(-ENOMEM));
+
+       dev->od_otable_it = it;
+       it->ooi_dev = dev;
+       it->ooi_cache.ooc_consumer_idx = -1;
+       if (flags & DOIF_OUTUSED)
+               it->ooi_used_outside = 1;
+
+       /* Translate the iterator flags into scrub start flags. */
+       if (flags & DOIF_RESET)
+               start |= SS_RESET;
+
+       if (valid & DOIV_ERROR_HANDLE) {
+               if (flags & DOIF_FAILOUT)
+                       start |= SS_SET_FAILOUT;
+               else
+                       start |= SS_CLEAR_FAILOUT;
+       }
+
+       rc = do_osd_scrub_start(dev, start);
+       if (rc == -EALREADY) {
+               /* The scrub was already started: preload from the position
+                * just before the one it is currently working on. */
+               it->ooi_cache.ooc_pos_preload = scrub->os_pos_current - 1;
+       } else if (rc < 0) {
+               /* Report the real failure instead of masking every start
+                * error as -EALREADY. */
+               dev->od_otable_it = NULL;
+               OBD_FREE_PTR(it);
+               GOTO(out, it = ERR_PTR(rc));
+       } else {
+               it->ooi_cache.ooc_pos_preload = scrub->os_pos_current;
+       }
+
+       GOTO(out, it);
+
+out:
+       cfs_mutex_unlock(&dev->od_otable_mutex);
+       return (struct dt_it *)it;
+}
+
+/**
+ * Stop the scrub of the iterator's device and release the iterator.
+ *
+ * \param[in] env      execution environment (not used here)
+ * \param[in] di       the iterator returned by osd_otable_it_init()
+ */
+static void osd_otable_it_fini(const struct lu_env *env, struct dt_it *di)
+{
+       struct osd_otable_it *it    = (struct osd_otable_it *)di;
+       struct osd_device    *dev   = it->ooi_dev;
+       struct osd_scrub     *scrub = &dev->od_scrub;
+
+       /* od_otable_mutex serializes this against a concurrent init/fini. */
+       cfs_mutex_lock(&dev->od_otable_mutex);
+       do_osd_scrub_stop(scrub);
+
+       /* The device must still be pointing at the iterator being freed. */
+       LASSERT(dev->od_otable_it == it);
+       dev->od_otable_it = NULL;
+       cfs_mutex_unlock(&dev->od_otable_mutex);
+
+       OBD_FREE_PTR(it);
+}
+
+/**
+ * XXX: Temporarily used to notify the otable iteration that it should pause.
+ *
+ * Sets the os_paused flag of the device scrub while holding od_otable_mutex.
+ *
+ * \param[in] env      execution environment (not used here)
+ * \param[in] di       the otable iterator
+ */
+static void osd_otable_it_put(const struct lu_env *env, struct dt_it *di)
+{
+       struct osd_otable_it *it    = (struct osd_otable_it *)di;
+       struct osd_device    *dev   = it->ooi_dev;
+       struct osd_scrub     *scrub = &dev->od_scrub;
+
+       /* od_otable_mutex: keep a concurrent init/fini away */
+       cfs_mutex_lock(&dev->od_otable_mutex);
+       scrub->os_paused = 1;
+       cfs_mutex_unlock(&dev->od_otable_mutex);
+}
+
+/**
+ * Use the given key as a candidate start position of the OSD layer iteration.
+ *
+ * The LFSCK code above the OSD layer does not understand the key format and
+ * cannot compare keys itself. It therefore calls "::get()" once per key and
+ * the OSD keeps the smallest resulting position.
+ *
+ * \param[in] env      execution environment (not used here)
+ * \param[in] di       the otable iterator
+ * \param[in] key      NUL-terminated string holding a decimal inode number
+ *
+ * \retval             0 on success
+ * \retval             -EPERM if the iteration has already been started
+ * \retval             -EINVAL if the key is empty or not a number
+ */
+static int osd_otable_it_get(const struct lu_env *env,
+                            struct dt_it *di, const struct dt_key *key)
+{
+       struct osd_otable_it *it   = (struct osd_otable_it *)di;
+       const char           *text = (const char *)key;
+       __u32                 next;
+       ENTRY;
+
+       /* The start position is frozen once the iteration has started. */
+       if (it->ooi_user_ready)
+               RETURN(-EPERM);
+
+       if (text[0] == '\0' || sscanf(text, "%u", &next) <= 0)
+               RETURN(-EINVAL);
+
+       /* Step over the inode that was already processed last time. */
+       next++;
+       if (next < it->ooi_cache.ooc_pos_preload)
+               it->ooi_cache.ooc_pos_preload = next;
+
+       RETURN(0);
+}
+
+/**
+ * Fill the iterator cache by running the inode iteration in preload mode.
+ *
+ * Also wakes the scrub thread if it is flagged as waiting and
+ * osd_scrub_has_window() reports room for it to proceed.
+ *
+ * \param[in] env      execution environment
+ * \param[in] it       the otable iterator whose cache is to be filled
+ *
+ * \retval             the number of cached items if the iteration did not fail
+ * \retval             negative value returned by osd_inode_iteration()
+ */
+static int osd_otable_it_preload(const struct lu_env *env,
+                                struct osd_otable_it *it)
+{
+       struct osd_device       *dev   = it->ooi_dev;
+       struct osd_otable_cache *cache = &it->ooi_cache;
+       struct osd_scrub        *scrub = &dev->od_scrub;
+       int                      rc;
+       ENTRY;
+
+       rc = osd_inode_iteration(osd_oti_get(env), dev,
+                                OSD_OTABLE_IT_CACHE_SIZE, 1);
+       /* Remember that there is nothing left to preload afterwards. */
+       if (rc == SCRUB_IT_ALL)
+               it->ooi_all_cached = 1;
+
+       CDEBUG(D_LFSCK, "OSD pre-loaded: max = %u, preload = %u, rc = %d\n",
+              le32_to_cpu(LDISKFS_SB(osd_sb(dev))->s_es->s_inodes_count),
+              cache->ooc_pos_preload, rc);
+
+       if (scrub->os_waiting && osd_scrub_has_window(scrub, cache)) {
+               scrub->os_waiting = 0;
+               cfs_waitq_broadcast(&scrub->os_thread.t_ctl_waitq);
+       }
+
+       RETURN(rc < 0 ? rc : cache->ooc_cached_items);
+}
+
+/**
+ * Move the iterator to the next cached entry, pre-loading more entries when
+ * the cache has been drained.
+ *
+ * Must only be called after osd_otable_it_load() has set ooi_user_ready.
+ *
+ * \param[in] env      execution environment
+ * \param[in] di       the otable iterator
+ *
+ * \retval             0 if the iterator now points at a new cached entry
+ * \retval             1 if there is nothing more to iterate
+ * \retval             negative value returned by osd_otable_it_preload()
+ */
+static int osd_otable_it_next(const struct lu_env *env, struct dt_it *di)
+{
+       struct osd_otable_it    *it     = (struct osd_otable_it *)di;
+       struct osd_device       *dev    = it->ooi_dev;
+       struct osd_scrub        *scrub  = &dev->od_scrub;
+       struct osd_otable_cache *ooc    = &it->ooi_cache;
+       struct ptlrpc_thread    *thread = &scrub->os_thread;
+       struct l_wait_info       lwi    = { 0 };
+       int                      rc;
+       ENTRY;
+
+       LASSERT(it->ooi_user_ready);
+
+again:
+       /* The scrub thread is gone and the iterator is not flagged as used
+        * outside: the iteration is over. */
+       if (!thread_is_running(thread) && !it->ooi_used_outside)
+               RETURN(1);
+
+       /* Consume one cached item; the consumer index is advanced and masked
+        * with ~OSD_OTABLE_IT_CACHE_MASK (presumably to wrap it around the
+        * cache array - confirm against the macro definition). */
+       if (ooc->ooc_cached_items > 0) {
+               ooc->ooc_cached_items--;
+               ooc->ooc_consumer_idx = (ooc->ooc_consumer_idx + 1) &
+                                       ~OSD_OTABLE_IT_CACHE_MASK;
+               RETURN(0);
+       }
+
+       /* Everything was cached and consumed already: wait until the scrub
+        * thread stops running, then report the end of the iteration. */
+       if (it->ooi_all_cached) {
+               l_wait_event(thread->t_ctl_waitq,
+                            !thread_is_running(thread),
+                            &lwi);
+               RETURN(1);
+       }
+
+       /* Cache is empty: wait until the scrub position has moved past the
+        * preload position, or until the scrub thread stops running. */
+       it->ooi_waiting = 1;
+       l_wait_event(thread->t_ctl_waitq,
+                    ooc->ooc_pos_preload < scrub->os_pos_current ||
+                    !thread_is_running(thread),
+                    &lwi);
+       it->ooi_waiting = 0;
+
+       if (!thread_is_running(thread) && !it->ooi_used_outside)
+               RETURN(1);
+
+       /* Refill the cache and retry from the top on success. */
+       rc = osd_otable_it_preload(env, it);
+       if (rc >= 0)
+               goto again;
+
+       RETURN(rc);
+}
+
+/**
+ * Return the key of the entry the iterator currently points at.
+ *
+ * The key is the inode number of the current cache entry, formatted as a
+ * decimal string into the iterator's own ooi_key buffer.
+ *
+ * \param[in] env      execution environment (not used here)
+ * \param[in] di       the otable iterator
+ *
+ * \retval             pointer to the iterator's ooi_key buffer
+ */
+static struct dt_key *osd_otable_it_key(const struct lu_env *env,
+                                       const struct dt_it *di)
+{
+       struct osd_otable_it    *it    = (struct osd_otable_it *)di;
+       struct osd_otable_cache *cache = &it->ooi_cache;
+
+       sprintf(it->ooi_key, "%u",
+               cache->ooc_cache[cache->ooc_consumer_idx].oic_lid.oii_ino);
+
+       return (struct dt_key *)it->ooi_key;
+}
+
+/**
+ * Return the size of the iterator's key buffer (ooi_key).
+ *
+ * \param[in] env      execution environment (not used here)
+ * \param[in] di       the otable iterator
+ *
+ * \retval             sizeof() of the ooi_key member
+ */
+static int osd_otable_it_key_size(const struct lu_env *env,
+                                 const struct dt_it *di)
+{
+       const struct osd_otable_it *it = (const struct osd_otable_it *)di;
+
+       return sizeof(it->ooi_key);
+}
+
+/**
+ * Copy the FID of the entry the iterator currently points at into \a rec.
+ *
+ * \param[in]  env     execution environment (not used here)
+ * \param[in]  di      the otable iterator
+ * \param[out] rec     destination, written as a struct lu_fid
+ * \param[in]  attr    not used here
+ *
+ * \retval             0 always
+ */
+static int osd_otable_it_rec(const struct lu_env *env, const struct dt_it *di,
+                            struct dt_rec *rec, __u32 attr)
+{
+       struct osd_otable_it    *it    = (struct osd_otable_it *)di;
+       struct osd_otable_cache *cache = &it->ooi_cache;
+       struct lu_fid           *fid   = (struct lu_fid *)rec;
+
+       *fid = cache->ooc_cache[cache->ooc_consumer_idx].oic_fid;
+
+       return 0;
+}
+
+/**
+ * Mark the iterator as ready and position it on its first entry.
+ *
+ * Only the first call does any work; later calls return 0 immediately.
+ *
+ * \param[in] env      execution environment
+ * \param[in] di       the otable iterator
+ * \param[in] hash     not used here
+ *
+ * \retval             0 if the iterator was already loaded
+ * \retval             otherwise the result of the first
+ *                     osd_otable_it_next() call
+ */
+static int osd_otable_it_load(const struct lu_env *env,
+                             const struct dt_it *di, __u64 hash)
+{
+       struct osd_otable_it    *it    = (struct osd_otable_it *)di;
+       struct osd_device       *dev   = it->ooi_dev;
+       struct osd_scrub        *scrub = &dev->od_scrub;
+       struct osd_otable_cache *cache = &it->ooi_cache;
+
+       if (it->ooi_user_ready)
+               return 0;
+
+       /* Never start the preload below LDISKFS_FIRST_INO(). */
+       if (cache->ooc_pos_preload < LDISKFS_FIRST_INO(osd_sb(dev)))
+               cache->ooc_pos_preload = LDISKFS_FIRST_INO(osd_sb(dev));
+
+       it->ooi_user_ready = 1;
+
+       /* Wake anybody sleeping on the scrub thread's control queue, unless
+        * the scrub runs at full speed. */
+       if (!scrub->os_full_speed)
+               cfs_waitq_broadcast(&scrub->os_thread.t_ctl_waitq);
+
+       /* Unplug OSD layer iteration by the first next() call. */
+       return osd_otable_it_next(env, (struct dt_it *)it);
+}
+
+/* Index operations for the object table: only the iterator methods are
+ * provided. */
+const struct dt_index_operations osd_otable_ops = {
+       .dio_it = {
+               /* iterator life cycle */
+               .init     = osd_otable_it_init,
+               .fini     = osd_otable_it_fini,
+
+               /* positioning */
+               .get      = osd_otable_it_get,
+               .put      = osd_otable_it_put,
+               .load     = osd_otable_it_load,
+               .next     = osd_otable_it_next,
+
+               /* access to the current entry */
+               .key      = osd_otable_it_key,
+               .key_size = osd_otable_it_key_size,
+               .rec      = osd_otable_it_rec,
+       }
+};
+
+/**
+ * Queue an inconsistent OI mapping on the scrub's os_inconsistent_items list.
+ *
+ * The scrub thread's control wait queue is signalled when the list was empty
+ * before the new item was added.
+ *
+ * \param[in] dev      the OSD device
+ * \param[in] oic      the mapping to queue; it is copied into the new item
+ * \param[in] insert   stored in the item's oii_insert field
+ *
+ * \retval             0 on success
+ * \retval             -ENOMEM if the item cannot be allocated
+ * \retval             -EAGAIN if the scrub thread is not running
+ */
+int osd_oii_insert(struct osd_device *dev, struct osd_idmap_cache *oic,
+                  int insert)
+{
+       struct osd_scrub             *scrub  = &dev->od_scrub;
+       struct ptlrpc_thread         *thread = &scrub->os_thread;
+       struct osd_inconsistent_item *oii;
+       int                           was_empty;
+       ENTRY;
+
+       OBD_ALLOC_PTR(oii);
+       if (unlikely(oii == NULL))
+               RETURN(-ENOMEM);
+
+       CFS_INIT_LIST_HEAD(&oii->oii_list);
+       oii->oii_insert = insert;
+       oii->oii_cache = *oic;
+
+       cfs_spin_lock(&scrub->os_lock);
+       if (unlikely(!thread_is_running(thread))) {
+               /* Nobody would ever process the item: drop it. */
+               cfs_spin_unlock(&scrub->os_lock);
+               OBD_FREE_PTR(oii);
+               RETURN(-EAGAIN);
+       }
+
+       was_empty = cfs_list_empty(&scrub->os_inconsistent_items);
+       cfs_list_add_tail(&oii->oii_list, &scrub->os_inconsistent_items);
+       cfs_spin_unlock(&scrub->os_lock);
+
+       if (was_empty)
+               cfs_waitq_broadcast(&thread->t_ctl_waitq);
+
+       RETURN(0);
+}
+
+/**
+ * Look up a FID among the queued inconsistent items.
+ *
+ * \param[in]  dev     the OSD device
+ * \param[in]  fid     the FID to search for
+ * \param[out] id      receives the inode id of the first matching item
+ *
+ * \retval             0 if a matching item was found
+ * \retval             -ENOENT if no queued item has this FID
+ */
+int osd_oii_lookup(struct osd_device *dev, const struct lu_fid *fid,
+                  struct osd_inode_id *id)
+{
+       struct osd_scrub             *scrub = &dev->od_scrub;
+       struct osd_inconsistent_item *oii;
+       int                           rc    = -ENOENT;
+       ENTRY;
+
+       /* os_lock is held while walking the list. */
+       cfs_spin_lock(&scrub->os_lock);
+       cfs_list_for_each_entry(oii, &scrub->os_inconsistent_items, oii_list) {
+               if (!lu_fid_eq(fid, &oii->oii_cache.oic_fid))
+                       continue;
+
+               *id = oii->oii_cache.oic_lid;
+               rc = 0;
+               break;
+       }
+       cfs_spin_unlock(&scrub->os_lock);
+
+       RETURN(rc);
+}
+
+/* Printable scrub status names; osd_scrub_dump() indexes this table directly
+ * with sf_status, so the order must match the status values. */
+static const char *scrub_status_names[] = {
+       "init",
+       "scanning",
+       "completed",
+       "failed",
+       "stopped",
+       "paused",
+       "crashed",
+       NULL
+};
+
+/* Printable names of the sf_flags bits; scrub_bits_dump() prints entry i for
+ * bit i. */
+static const char *scrub_flags_names[] = {
+       "recreated",
+       "inconsistent",
+       "auto",
+       NULL
+};
+
+/* Printable names of the sf_param bits; scrub_bits_dump() prints entry i for
+ * bit i. */
+static const char *scrub_param_names[] = {
+       "failout",
+       NULL
+};
+
+/**
+ * Append "<prefix>: name[,name...]\n" for the bits set in \a bits, or
+ * "<prefix>:\n" when no bit is set.
+ *
+ * \param[in,out] buf          output cursor, advanced past the written text
+ * \param[in,out] len          remaining space, reduced by the written length
+ * \param[in]     bits         bit set to print
+ * \param[in]     names        names[i] is the printable name of bit i
+ * \param[in]     prefix       label printed in front of the names
+ *
+ * \retval        number of bytes appended
+ * \retval        -ENOSPC if the output does not fit into the remaining space
+ */
+static int scrub_bits_dump(char **buf, int *len, int bits, const char *names[],
+                          const char *prefix)
+{
+       int save = *len;
+       int flag;
+       int rc;
+       int i;
+
+       rc = snprintf(*buf, *len, "%s:%c", prefix, bits != 0 ? ' ' : '\n');
+       /* snprintf() returns the length the full output would have had, so
+        * rc >= *len means the text was truncated; advancing the cursor by rc
+        * would then move it past the end of the buffer. */
+       if (rc <= 0 || rc >= *len)
+               return -ENOSPC;
+
+       *buf += rc;
+       *len -= rc;
+       for (i = 0, flag = 1; bits != 0; i++, flag = 1 << i) {
+               if (flag & bits) {
+                       bits &= ~flag;
+                       /* ',' between names, '\n' after the last one */
+                       rc = snprintf(*buf, *len, "%s%c", names[i],
+                                     bits != 0 ? ',' : '\n');
+                       if (rc <= 0 || rc >= *len)
+                               return -ENOSPC;
+
+                       *buf += rc;
+                       *len -= rc;
+               }
+       }
+       return save - *len;
+}
+
+/**
+ * Append "<prefix>: <N> seconds\n", where N is the time elapsed since
+ * \a time, or "<prefix>: N/A\n" when \a time is 0.
+ *
+ * \param[in,out] buf          output cursor, advanced past the written text
+ * \param[in,out] len          remaining space, reduced by the written length
+ * \param[in]     time         time stamp in seconds, 0 if never set
+ * \param[in]     prefix       label printed in front of the value
+ *
+ * \retval        number of bytes appended
+ * \retval        -ENOSPC if the output does not fit into the remaining space
+ */
+static int scrub_time_dump(char **buf, int *len, __u64 time, const char *prefix)
+{
+       int rc;
+
+       if (time != 0)
+               rc = snprintf(*buf, *len, "%s: "LPU64" seconds\n", prefix,
+                             cfs_time_current_sec() - time);
+       else
+               rc = snprintf(*buf, *len, "%s: N/A\n", prefix);
+       /* rc >= *len: snprintf() truncated the output, and advancing by rc
+        * would move the cursor past the end of the buffer. */
+       if (rc <= 0 || rc >= *len)
+               return -ENOSPC;
+
+       *buf += rc;
+       *len -= rc;
+       return rc;
+}
+
+/**
+ * Append "<prefix>: <pos>\n", or "<prefix>: N/A\n" when \a pos is 0.
+ *
+ * \param[in,out] buf          output cursor, advanced past the written text
+ * \param[in,out] len          remaining space, reduced by the written length
+ * \param[in]     pos          position to print, 0 if never set
+ * \param[in]     prefix       label printed in front of the value
+ *
+ * \retval        number of bytes appended
+ * \retval        -ENOSPC if the output does not fit into the remaining space
+ */
+static int scrub_pos_dump(char **buf, int *len, __u64 pos, const char *prefix)
+{
+       int rc;
+
+       if (pos != 0)
+               rc = snprintf(*buf, *len, "%s: "LPU64"\n", prefix, pos);
+       else
+               rc = snprintf(*buf, *len, "%s: N/A\n", prefix);
+       /* rc >= *len: snprintf() truncated the output, and advancing by rc
+        * would move the cursor past the end of the buffer. */
+       if (rc <= 0 || rc >= *len)
+               return -ENOSPC;
+
+       *buf += rc;
+       *len -= rc;
+       return rc;
+}
+
+/**
+ * Format the OI scrub state of a device as text.
+ *
+ * The scrub file fields are read with os_rwsem held for reading. While the
+ * scrub thread is running, the run time, speeds and current position are
+ * derived from the in-memory counters; otherwise only the stored run time
+ * and the average speed are reported.
+ *
+ * \param[in]  dev     the OSD device
+ * \param[out] buf     buffer receiving the text
+ * \param[in]  len     size of \a buf in bytes
+ *
+ * \retval             number of bytes written into \a buf
+ * \retval             -ENOSPC if the text does not fit into \a buf
+ */
+int osd_scrub_dump(struct osd_device *dev, char *buf, int len)
+{
+       struct osd_scrub  *scrub   = &dev->od_scrub;
+       struct scrub_file *sf      = &scrub->os_file;
+       __u64              checked;
+       __u64              speed;
+       int                save    = len;
+       int                ret     = -ENOSPC;
+       int                rc;
+
+       cfs_down_read(&scrub->os_rwsem);
+       rc = snprintf(buf, len,
+                     "name: OI scrub\n"
+                     "magic: 0x%x\n"
+                     "oi_files: %d\n"
+                     "status: %s\n",
+                     sf->sf_magic, (int)sf->sf_oi_count,
+                     scrub_status_names[sf->sf_status]);
+       /* snprintf() returns the untruncated length: rc >= len means the
+        * text did not fit, and "buf += rc" would leave the buffer. */
+       if (rc <= 0 || rc >= len)
+               goto out;
+
+       buf += rc;
+       len -= rc;
+       rc = scrub_bits_dump(&buf, &len, sf->sf_flags, scrub_flags_names,
+                            "flags");
+       if (rc < 0)
+               goto out;
+
+       rc = scrub_bits_dump(&buf, &len, sf->sf_param, scrub_param_names,
+                            "param");
+       if (rc < 0)
+               goto out;
+
+       rc = scrub_time_dump(&buf, &len, sf->sf_time_last_complete,
+                            "time_since_last_completed");
+       if (rc < 0)
+               goto out;
+
+       rc = scrub_time_dump(&buf, &len, sf->sf_time_latest_start,
+                            "time_since_latest_start");
+       if (rc < 0)
+               goto out;
+
+       rc = scrub_time_dump(&buf, &len, sf->sf_time_last_checkpoint,
+                            "time_since_last_checkpoint");
+       if (rc < 0)
+               goto out;
+
+       rc = scrub_pos_dump(&buf, &len, sf->sf_pos_latest_start,
+                           "latest_start_position");
+       if (rc < 0)
+               goto out;
+
+       rc = scrub_pos_dump(&buf, &len, sf->sf_pos_last_checkpoint,
+                           "last_checkpoint_position");
+       if (rc < 0)
+               goto out;
+
+       rc = scrub_pos_dump(&buf, &len, sf->sf_pos_first_inconsistent,
+                           "first_failure_position");
+       if (rc < 0)
+               goto out;
+
+       /* Items counted in the scrub file plus those checked since then. */
+       checked = sf->sf_items_checked + scrub->os_new_checked;
+       rc = snprintf(buf, len,
+                     "checked: "LPU64"\n"
+                     "updated: "LPU64"\n"
+                     "failed: "LPU64"\n"
+                     "prior_updated: "LPU64"\n"
+                     "noscrub: "LPU64"\n"
+                     "igif: "LPU64"\n"
+                     "success_count: %u\n",
+                     checked, sf->sf_items_updated, sf->sf_items_failed,
+                     sf->sf_items_updated_prior, sf->sf_items_noscrub,
+                     sf->sf_items_igif, sf->sf_success_count);
+       if (rc <= 0 || rc >= len)
+               goto out;
+
+       buf += rc;
+       len -= rc;
+       speed = checked;
+       if (thread_is_running(&scrub->os_thread)) {
+               cfs_duration_t duration = cfs_time_current() -
+                                         scrub->os_time_last_checkpoint;
+               __u64 new_checked = scrub->os_new_checked * CFS_HZ;
+               __u32 rtime = sf->sf_run_time +
+                             cfs_duration_sec(duration + HALF_SEC);
+
+               /* real-time speed: items checked since the last checkpoint
+                * divided by the time elapsed since then */
+               if (duration != 0)
+                       do_div(new_checked, duration);
+               /* average speed: all checked items over the total run time */
+               if (rtime != 0)
+                       do_div(speed, rtime);
+               rc = snprintf(buf, len,
+                             "run_time: %u seconds\n"
+                             "average_speed: "LPU64" objects/sec\n"
+                             "real-time_speed: "LPU64" objects/sec\n"
+                             "current_position: %u\n",
+                             rtime, speed, new_checked, scrub->os_pos_current);
+       } else {
+               if (sf->sf_run_time != 0)
+                       do_div(speed, sf->sf_run_time);
+               rc = snprintf(buf, len,
+                             "run_time: %u seconds\n"
+                             "average_speed: "LPU64" objects/sec\n"
+                             "real-time_speed: N/A\n"
+                             "current_position: N/A\n",
+                             sf->sf_run_time, speed);
+       }
+       if (rc <= 0 || rc >= len)
+               goto out;
+
+       buf += rc;
+       len -= rc;
+       ret = save - len;
+
+out:
+       cfs_up_read(&scrub->os_rwsem);
+       return ret;
+}