Whamcloud - gitweb
LU-14927 scrub: share osd_scrub[prep|post] code
[fs/lustre-release.git] / lustre / osd-zfs / osd_scrub.c
index 9a5c234..c581303 100644 (file)
 #define OSD_OTABLE_MAX_HASH            ((1ULL << 48) - 1)
 #define OTABLE_PREFETCH                        256
 
-#define DTO_INDEX_INSERT               1
-#define DTO_INDEX_DELETE               2
-#define DTO_INDEX_UPDATE               3
-
 static inline bool osd_scrub_has_window(struct osd_otable_it *it)
 {
        return it->ooi_prefetched < OTABLE_PREFETCH;
@@ -71,11 +67,11 @@ static inline bool osd_scrub_has_window(struct osd_otable_it *it)
  * \retval   0, changed successfully
  * \retval -ve, on error
  */
-static int osd_scrub_refresh_mapping(const struct lu_env *env,
-                                    struct osd_device *dev,
-                                    const struct lu_fid *fid,
-                                    uint64_t oid, int ops,
-                                    bool force, const char *name)
+int osd_scrub_refresh_mapping(const struct lu_env *env,
+                             struct osd_device *dev,
+                             const struct lu_fid *fid,
+                             uint64_t oid, enum dt_txn_op ops,
+                             bool force, const char *name)
 {
        struct osd_thread_info *info = osd_oti_get(env);
        struct zpl_direntry *zde = &info->oti_zde.lzd_reg;
@@ -196,7 +192,10 @@ zget:
                        GOTO(out, rc);
                }
 
+               spin_lock(&scrub->os_lock);
                scrub->os_full_speed = 1;
+               spin_unlock(&scrub->os_lock);
+
                sf->sf_flags |= SF_INCONSISTENT;
        } else if (oid == oid2) {
                GOTO(out, rc = 0);
@@ -227,7 +226,9 @@ zget:
                }
 
 update:
+               spin_lock(&scrub->os_lock);
                scrub->os_full_speed = 1;
+               spin_unlock(&scrub->os_lock);
                sf->sf_flags |= SF_INCONSISTENT;
        }
 
@@ -242,6 +243,29 @@ update:
        GOTO(out, rc);
 
 out:
+       if (dev->od_is_ost) {
+               sa_handle_t *hdl;
+               uint64_t nlink, mode;
+
+               rc = -sa_handle_get(dev->od_os, oid, NULL, SA_HDL_PRIVATE,
+                                   &hdl);
+               if (rc)
+                       GOTO(cleanup, rc);
+
+               rc = -sa_lookup(hdl, SA_ZPL_MODE(dev), &mode, sizeof(mode));
+               if (rc || !S_ISREG(mode)) {
+                       sa_handle_destroy(hdl);
+                       GOTO(cleanup, rc);
+               }
+
+               rc = -sa_lookup(hdl, SA_ZPL_LINKS(dev), &nlink, sizeof(nlink));
+               if (rc == 0 && nlink > 1)
+                       scrub->os_has_ml_file = 1;
+
+               sa_handle_destroy(hdl);
+       }
+
+cleanup:
        if (nvbuf)
                nvlist_free(nvbuf);
 
@@ -275,125 +299,6 @@ out:
        RETURN(sf->sf_param & SP_FAILOUT ? rc : 0);
 }
 
-static int osd_scrub_prep(const struct lu_env *env, struct osd_device *dev)
-{
-       struct lustre_scrub *scrub = &dev->od_scrub;
-       struct ptlrpc_thread *thread = &scrub->os_thread;
-       struct scrub_file *sf = &scrub->os_file;
-       __u32 flags = scrub->os_start_flags;
-       int rc;
-       bool drop_dryrun = false;
-       ENTRY;
-
-       CDEBUG(D_LFSCK, "%s: OI scrub prep, flags = 0x%x\n",
-              scrub->os_name, flags);
-
-       down_write(&scrub->os_rwsem);
-       if (flags & SS_SET_FAILOUT)
-               sf->sf_param |= SP_FAILOUT;
-       else if (flags & SS_CLEAR_FAILOUT)
-               sf->sf_param &= ~SP_FAILOUT;
-
-       if (flags & SS_SET_DRYRUN) {
-               sf->sf_param |= SP_DRYRUN;
-       } else if (flags & SS_CLEAR_DRYRUN && sf->sf_param & SP_DRYRUN) {
-               sf->sf_param &= ~SP_DRYRUN;
-               drop_dryrun = true;
-       }
-
-       if (flags & SS_RESET)
-               scrub_file_reset(scrub, dev->od_uuid, 0);
-
-       scrub->os_partial_scan = 0;
-       if (flags & SS_AUTO_FULL) {
-               scrub->os_full_speed = 1;
-               sf->sf_flags |= SF_AUTO;
-       } else if (sf->sf_flags & (SF_RECREATED | SF_INCONSISTENT |
-                                  SF_UPGRADE)) {
-               scrub->os_full_speed = 1;
-       } else {
-               scrub->os_full_speed = 0;
-       }
-
-       spin_lock(&scrub->os_lock);
-       scrub->os_in_prior = 0;
-       scrub->os_waiting = 0;
-       scrub->os_paused = 0;
-       scrub->os_in_join = 0;
-       scrub->os_full_scrub = 0;
-       spin_unlock(&scrub->os_lock);
-       scrub->os_new_checked = 0;
-       if (drop_dryrun && sf->sf_pos_first_inconsistent != 0)
-               sf->sf_pos_latest_start = sf->sf_pos_first_inconsistent;
-       else if (sf->sf_pos_last_checkpoint != 0)
-               sf->sf_pos_latest_start = sf->sf_pos_last_checkpoint + 1;
-       else
-               sf->sf_pos_latest_start = 1;
-
-       scrub->os_pos_current = sf->sf_pos_latest_start;
-       sf->sf_status = SS_SCANNING;
-       sf->sf_time_latest_start = ktime_get_real_seconds();
-       sf->sf_time_last_checkpoint = sf->sf_time_latest_start;
-       sf->sf_pos_last_checkpoint = sf->sf_pos_latest_start - 1;
-       rc = scrub_file_store(env, scrub);
-       if (!rc) {
-               spin_lock(&scrub->os_lock);
-               thread_set_flags(thread, SVC_RUNNING);
-               spin_unlock(&scrub->os_lock);
-               wake_up_all(&thread->t_ctl_waitq);
-       }
-       up_write(&scrub->os_rwsem);
-
-       RETURN(rc);
-}
-
-static int osd_scrub_post(const struct lu_env *env, struct osd_device *dev,
-                         int result)
-{
-       struct lustre_scrub *scrub = &dev->od_scrub;
-       struct scrub_file *sf = &scrub->os_file;
-       int rc;
-       ENTRY;
-
-       CDEBUG(D_LFSCK, "%s: OI scrub post with result = %d\n",
-              scrub->os_name, result);
-
-       down_write(&scrub->os_rwsem);
-       spin_lock(&scrub->os_lock);
-       thread_set_flags(&scrub->os_thread, SVC_STOPPING);
-       spin_unlock(&scrub->os_lock);
-       if (scrub->os_new_checked > 0) {
-               sf->sf_items_checked += scrub->os_new_checked;
-               scrub->os_new_checked = 0;
-               sf->sf_pos_last_checkpoint = scrub->os_pos_current;
-       }
-       sf->sf_time_last_checkpoint = ktime_get_real_seconds();
-       if (result > 0) {
-               sf->sf_status = SS_COMPLETED;
-               if (!(sf->sf_param & SP_DRYRUN)) {
-                       memset(sf->sf_oi_bitmap, 0, SCRUB_OI_BITMAP_SIZE);
-                       sf->sf_flags &= ~(SF_RECREATED | SF_INCONSISTENT |
-                                         SF_UPGRADE | SF_AUTO);
-               }
-               sf->sf_time_last_complete = sf->sf_time_last_checkpoint;
-               sf->sf_success_count++;
-       } else if (result == 0) {
-               if (scrub->os_paused)
-                       sf->sf_status = SS_PAUSED;
-               else
-                       sf->sf_status = SS_STOPPED;
-       } else {
-               sf->sf_status = SS_FAILED;
-       }
-       sf->sf_run_time += ktime_get_seconds() -
-                          scrub->os_time_last_checkpoint;
-
-       rc = scrub_file_store(env, scrub);
-       up_write(&scrub->os_rwsem);
-
-       RETURN(rc < 0 ? rc : result);
-}
-
 /* iteration engine */
 
 static inline int
@@ -402,7 +307,7 @@ osd_scrub_wakeup(struct lustre_scrub *scrub, struct osd_otable_it *it)
        spin_lock(&scrub->os_lock);
        if (osd_scrub_has_window(it) ||
            !list_empty(&scrub->os_inconsistent_items) ||
-           it->ooi_waiting || !thread_is_running(&scrub->os_thread))
+           it->ooi_waiting || kthread_should_stop())
                scrub->os_waiting = 0;
        else
                scrub->os_waiting = 1;
@@ -414,9 +319,7 @@ osd_scrub_wakeup(struct lustre_scrub *scrub, struct osd_otable_it *it)
 static int osd_scrub_next(const struct lu_env *env, struct osd_device *dev,
                          struct lu_fid *fid, uint64_t *oid)
 {
-       struct l_wait_info lwi = { 0 };
        struct lustre_scrub *scrub = &dev->od_scrub;
-       struct ptlrpc_thread *thread = &scrub->os_thread;
        struct osd_otable_it *it = dev->od_otable_it;
        struct lustre_mdt_attrs *lma = NULL;
        nvlist_t *nvbuf = NULL;
@@ -425,20 +328,19 @@ static int osd_scrub_next(const struct lu_env *env, struct osd_device *dev,
        ENTRY;
 
        if (OBD_FAIL_CHECK(OBD_FAIL_OSD_SCRUB_DELAY) && cfs_fail_val > 0) {
-               lwi = LWI_TIMEOUT(cfs_time_seconds(cfs_fail_val), NULL, NULL);
-               if (likely(lwi.lwi_timeout > 0)) {
-                       l_wait_event(thread->t_ctl_waitq,
-                               !list_empty(&scrub->os_inconsistent_items) ||
-                               !thread_is_running(thread),
-                               &lwi);
-                       if (unlikely(!thread_is_running(thread)))
-                               RETURN(SCRUB_NEXT_EXIT);
-               }
+               wait_var_event_timeout(
+                       scrub,
+                       !list_empty(&scrub->os_inconsistent_items) ||
+                       kthread_should_stop(),
+                       cfs_time_seconds(cfs_fail_val));
+
+               if (kthread_should_stop())
+                       RETURN(SCRUB_NEXT_EXIT);
        }
 
        if (OBD_FAIL_CHECK(OBD_FAIL_OSD_SCRUB_CRASH)) {
                spin_lock(&scrub->os_lock);
-               thread_set_flags(thread, SVC_STOPPING);
+               scrub->os_running = 0;
                spin_unlock(&scrub->os_lock);
                RETURN(SCRUB_NEXT_CRASH);
        }
@@ -470,14 +372,10 @@ again:
                spin_unlock(&scrub->os_lock);
        }
 
-       if (!scrub->os_full_speed && !osd_scrub_has_window(it)) {
-               memset(&lwi, 0, sizeof(lwi));
-               l_wait_event(thread->t_ctl_waitq,
-                            osd_scrub_wakeup(scrub, it),
-                            &lwi);
-       }
+       if (!scrub->os_full_speed && !osd_scrub_has_window(it))
+               wait_var_event(scrub, osd_scrub_wakeup(scrub, it));
 
-       if (unlikely(!thread_is_running(thread)))
+       if (kthread_should_stop())
                GOTO(out, rc = SCRUB_NEXT_EXIT);
 
        rc = -dmu_object_next(dev->od_os, &scrub->os_pos_current, B_FALSE, 0);
@@ -510,7 +408,7 @@ again:
                it->ooi_prefetched++;
                if (it->ooi_waiting) {
                        it->ooi_waiting = 0;
-                       wake_up_all(&thread->t_ctl_waitq);
+                       wake_up_var(scrub);
                }
                spin_unlock(&scrub->os_lock);
        }
@@ -528,7 +426,6 @@ static int osd_scrub_exec(const struct lu_env *env, struct osd_device *dev,
                          const struct lu_fid *fid, uint64_t oid, int rc)
 {
        struct lustre_scrub *scrub = &dev->od_scrub;
-       struct ptlrpc_thread *thread = &scrub->os_thread;
        struct osd_otable_it *it = dev->od_otable_it;
 
        rc = osd_scrub_check_update(env, dev, fid, oid, rc);
@@ -538,12 +435,14 @@ static int osd_scrub_exec(const struct lu_env *env, struct osd_device *dev,
                        it->ooi_prefetched++;
                        if (it->ooi_waiting) {
                                it->ooi_waiting = 0;
-                               wake_up_all(&thread->t_ctl_waitq);
+                               wake_up_var(scrub);
                        }
                        spin_unlock(&scrub->os_lock);
                }
        } else {
+               spin_lock(&scrub->os_lock);
                scrub->os_in_prior = 0;
+               spin_unlock(&scrub->os_lock);
        }
 
        if (rc)
@@ -559,15 +458,17 @@ static int osd_scrub_exec(const struct lu_env *env, struct osd_device *dev,
        return 0;
 }
 
+static int osd_scan_ml_file_main(const struct lu_env *env,
+                                struct osd_device *dev);
+
 static int osd_scrub_main(void *args)
 {
        struct lu_env env;
        struct osd_device *dev = (struct osd_device *)args;
        struct lustre_scrub *scrub = &dev->od_scrub;
-       struct ptlrpc_thread *thread = &scrub->os_thread;
        struct lu_fid *fid;
        uint64_t oid;
-       int rc = 0;
+       int rc = 0, ret;
        ENTRY;
 
        rc = lu_env_init(&env, LCT_LOCAL | LCT_DT_THREAD);
@@ -577,7 +478,7 @@ static int osd_scrub_main(void *args)
                GOTO(noenv, rc);
        }
 
-       rc = osd_scrub_prep(&env, dev);
+       rc = scrub_thread_prep(&env, scrub, dev->od_uuid, 1);
        if (rc) {
                CDEBUG(D_LFSCK, "%s: OI scrub fail to scrub prep: rc = %d\n",
                       scrub->os_name, rc);
@@ -585,13 +486,13 @@ static int osd_scrub_main(void *args)
        }
 
        if (!scrub->os_full_speed) {
-               struct l_wait_info lwi = { 0 };
                struct osd_otable_it *it = dev->od_otable_it;
 
-               l_wait_event(thread->t_ctl_waitq,
-                            it->ooi_user_ready || !thread_is_running(thread),
-                            &lwi);
-               if (unlikely(!thread_is_running(thread)))
+               wait_var_event(scrub,
+                              it->ooi_user_ready ||
+                              kthread_should_stop());
+
+               if (kthread_should_stop())
                        GOTO(post, rc = 0);
 
                scrub->os_pos_current = it->ooi_pos;
@@ -602,14 +503,14 @@ static int osd_scrub_main(void *args)
               scrub->os_pos_current);
 
        fid = &osd_oti_get(&env)->oti_fid;
-       while (!rc && thread_is_running(thread)) {
+       while (!rc && !kthread_should_stop()) {
                rc = osd_scrub_next(&env, dev, fid, &oid);
                switch (rc) {
                case SCRUB_NEXT_EXIT:
                        GOTO(post, rc = 0);
                case SCRUB_NEXT_CRASH:
                        spin_lock(&scrub->os_lock);
-                       thread_set_flags(&scrub->os_thread, SVC_STOPPING);
+                       scrub->os_running = 0;
                        spin_unlock(&scrub->os_lock);
                        GOTO(out, rc = -EINVAL);
                case SCRUB_NEXT_FATAL:
@@ -624,7 +525,13 @@ static int osd_scrub_main(void *args)
        GOTO(post, rc);
 
 post:
-       rc = osd_scrub_post(&env, dev, rc);
+       if (scrub->os_has_ml_file) {
+               ret = osd_scan_ml_file_main(&env, dev);
+               if (ret != 0)
+                       rc = ret;
+       }
+
+       rc = scrub_thread_post(&env, &dev->od_scrub, rc);
        CDEBUG(D_LFSCK, "%s: OI scrub: stop, pos = %llu: rc = %d\n",
               scrub->os_name, scrub->os_pos_current, rc);
 
@@ -642,9 +549,12 @@ out:
 
 noenv:
        spin_lock(&scrub->os_lock);
-       thread_set_flags(thread, SVC_STOPPED);
-       wake_up_all(&thread->t_ctl_waitq);
+       scrub->os_running = 0;
        spin_unlock(&scrub->os_lock);
+       if (xchg(&scrub->os_task, NULL) == NULL)
+               /* scrub_stop is waiting, we need to synchronize */
+               wait_var_event(scrub, kthread_should_stop());
+       wake_up_var(scrub);
        return rc;
 }
 
@@ -701,7 +611,7 @@ static const struct osd_lf_map osd_lf_maps[] = {
 
        /* PENDING */
        {
-               .olm_name               = "PENDING",
+               .olm_name               = MDT_ORPHAN_DIR,
        },
 
        /* ROOT */
@@ -1273,8 +1183,7 @@ static int osd_ios_ROOT_sd(const struct lu_env *env, struct osd_device *dev,
                                    sizeof(*zde) / 8, (void *)zde);
                if (rc) {
                        if (rc != -ENOENT)
-                               CWARN("%s: initial OI scrub failed to find"
-                                     "the entry %s under .lustre: rc = %d\n",
+                               CWARN("%s: initial OI scrub failed to find the entry %s under .lustre: rc = %d\n",
                                      osd_name(dev), map->olm_name, rc);
                        else if (!fid_is_zero(&map->olm_fid))
                                /* Try to remove the stale OI mapping. */
@@ -1394,7 +1303,9 @@ void osd_scrub_stop(struct osd_device *dev)
 
        /* od_otable_sem: prevent concurrent start/stop */
        down(&dev->od_otable_sem);
+       spin_lock(&scrub->os_lock);
        scrub->os_paused = 1;
+       spin_unlock(&scrub->os_lock);
        scrub_stop(scrub);
        up(&dev->od_otable_sem);
 
@@ -1405,7 +1316,8 @@ void osd_scrub_stop(struct osd_device *dev)
 
 static const char osd_scrub_name[] = "OI_scrub";
 
-int osd_scrub_setup(const struct lu_env *env, struct osd_device *dev)
+int osd_scrub_setup(const struct lu_env *env, struct osd_device *dev,
+                   bool resetoi)
 {
        struct osd_thread_info *info = osd_oti_get(env);
        struct lustre_scrub *scrub = &dev->od_scrub;
@@ -1417,11 +1329,10 @@ int osd_scrub_setup(const struct lu_env *env, struct osd_device *dev)
        bool dirty = false;
        ENTRY;
 
-       memcpy(dev->od_uuid,
+       memcpy(dev->od_uuid.b,
               &dsl_dataset_phys(dev->od_os->os_dsl_dataset)->ds_guid,
               sizeof(dsl_dataset_phys(dev->od_os->os_dsl_dataset)->ds_guid));
        memset(&dev->od_scrub, 0, sizeof(struct lustre_scrub));
-       init_waitqueue_head(&scrub->os_thread.t_ctl_waitq);
        init_rwsem(&scrub->os_rwsem);
        spin_lock_init(&scrub->os_lock);
        INIT_LIST_HEAD(&scrub->os_inconsistent_items);
@@ -1448,6 +1359,7 @@ int osd_scrub_setup(const struct lu_env *env, struct osd_device *dev)
        if (IS_ERR_OR_NULL(obj))
                RETURN(obj ? PTR_ERR(obj) : -ENOENT);
 
+       obj->do_body_ops = &osd_body_scrub_ops;
        scrub->os_obj = obj;
        rc = scrub_file_load(env, scrub);
        if (rc == -ENOENT || rc == -EFAULT) {
@@ -1456,29 +1368,12 @@ int osd_scrub_setup(const struct lu_env *env, struct osd_device *dev)
        } else if (rc < 0) {
                GOTO(cleanup_obj, rc);
        } else {
-               if (memcmp(sf->sf_uuid, dev->od_uuid, 16) != 0) {
-                       struct obd_uuid *old_uuid;
-                       struct obd_uuid *new_uuid;
-
-                       OBD_ALLOC_PTR(old_uuid);
-                       OBD_ALLOC_PTR(new_uuid);
-                       if (!old_uuid || !new_uuid) {
-                               CERROR("%s: UUID has been changed, but"
-                                      "failed to allocate RAM for report\n",
-                                      osd_name(dev));
-                       } else {
-                               class_uuid_unparse(sf->sf_uuid, old_uuid);
-                               class_uuid_unparse(dev->od_uuid, new_uuid);
-                               CDEBUG(D_LFSCK, "%s: UUID has been changed "
-                                      "from %s to %s\n", osd_name(dev),
-                                      old_uuid->uuid, new_uuid->uuid);
-                       }
+               if (!uuid_equal(&sf->sf_uuid, &dev->od_uuid)) {
+                       CDEBUG(D_LFSCK,
+                              "%s: UUID has been changed from %pU to %pU\n",
+                              osd_name(dev), &sf->sf_uuid, &dev->od_uuid);
                        scrub_file_reset(scrub, dev->od_uuid, SF_INCONSISTENT);
                        dirty = true;
-                       if (old_uuid)
-                               OBD_FREE_PTR(old_uuid);
-                       if (new_uuid)
-                               OBD_FREE_PTR(new_uuid);
                } else if (sf->sf_status == SS_SCANNING) {
                        sf->sf_status = SS_CRASHED;
                        dirty = true;
@@ -1505,7 +1400,7 @@ int osd_scrub_setup(const struct lu_env *env, struct osd_device *dev)
        }
 
        /* Initialize OI files. */
-       rc = osd_oi_init(env, dev);
+       rc = osd_oi_init(env, dev, resetoi);
        if (rc < 0)
                GOTO(cleanup_obj, rc);
 
@@ -1683,7 +1578,7 @@ osd_otable_it_wakeup(struct lustre_scrub *scrub, struct osd_otable_it *it)
 {
        spin_lock(&scrub->os_lock);
        if (it->ooi_pos < scrub->os_pos_current || scrub->os_waiting ||
-           !thread_is_running(&scrub->os_thread))
+           !scrub->os_running)
                it->ooi_waiting = 0;
        else
                it->ooi_waiting = 1;
@@ -1697,12 +1592,10 @@ static int osd_otable_it_next(const struct lu_env *env, struct dt_it *di)
        struct osd_otable_it *it = (struct osd_otable_it *)di;
        struct osd_device *dev = it->ooi_dev;
        struct lustre_scrub *scrub = &dev->od_scrub;
-       struct ptlrpc_thread *thread = &scrub->os_thread;
-       struct l_wait_info lwi = { 0 };
        struct lustre_mdt_attrs *lma = NULL;
        nvlist_t *nvbuf = NULL;
-       int size = 0;
-       int rc;
+       int rc, size = 0;
+       bool locked;
        ENTRY;
 
        LASSERT(it->ooi_user_ready);
@@ -1720,11 +1613,10 @@ again:
        }
 
        if (it->ooi_pos >= scrub->os_pos_current)
-               l_wait_event(thread->t_ctl_waitq,
-                            osd_otable_it_wakeup(scrub, it),
-                            &lwi);
+               wait_var_event(scrub,
+                              osd_otable_it_wakeup(scrub, it));
 
-       if (!thread_is_running(thread) && !it->ooi_used_outside)
+       if (!scrub->os_running && !it->ooi_used_outside)
                GOTO(out, rc = 1);
 
        rc = -dmu_object_next(dev->od_os, &it->ooi_pos, B_FALSE, 0);
@@ -1739,16 +1631,20 @@ again:
 
        rc = __osd_xattr_load_by_oid(dev, it->ooi_pos, &nvbuf);
 
-       if (!scrub->os_full_speed)
+       locked = false;
+       if (!scrub->os_full_speed) {
                spin_lock(&scrub->os_lock);
+               locked = true;
+       }
        it->ooi_prefetched--;
        if (!scrub->os_full_speed) {
                if (scrub->os_waiting) {
                        scrub->os_waiting = 0;
-                       wake_up_all(&thread->t_ctl_waitq);
+                       wake_up_var(scrub);
                }
-               spin_unlock(&scrub->os_lock);
        }
+       if (locked)
+               spin_unlock(&scrub->os_lock);
 
        if (rc == -ENOENT || rc == -EEXIST || rc == -ENODATA)
                goto again;
@@ -1840,7 +1736,7 @@ static int osd_otable_it_load(const struct lu_env *env,
        it->ooi_prefetched_dnode = 0;
        it->ooi_user_ready = 1;
        if (!scrub->os_full_speed)
-               wake_up_all(&scrub->os_thread.t_ctl_waitq);
+               wake_up_var(scrub);
 
        /* Unplug OSD layer iteration by the first next() call. */
        rc = osd_otable_it_next(env, (struct dt_it *)it);
@@ -1876,7 +1772,6 @@ int osd_oii_insert(const struct lu_env *env, struct osd_device *dev,
                   const struct lu_fid *fid, uint64_t oid, bool insert)
 {
        struct lustre_scrub *scrub = &dev->od_scrub;
-       struct ptlrpc_thread *thread = &scrub->os_thread;
        struct osd_inconsistent_item *oii;
        bool wakeup = false;
        ENTRY;
@@ -1893,7 +1788,7 @@ int osd_oii_insert(const struct lu_env *env, struct osd_device *dev,
        oii->oii_insert = insert;
 
        spin_lock(&scrub->os_lock);
-       if (unlikely(!thread_is_running(thread))) {
+       if (!scrub->os_running) {
                spin_unlock(&scrub->os_lock);
                OBD_FREE_PTR(oii);
                RETURN(-EAGAIN);
@@ -1905,7 +1800,7 @@ int osd_oii_insert(const struct lu_env *env, struct osd_device *dev,
        spin_unlock(&scrub->os_lock);
 
        if (wakeup)
-               wake_up_all(&thread->t_ctl_waitq);
+               wake_up_var(scrub);
 
        RETURN(0);
 }
@@ -1930,3 +1825,218 @@ int osd_oii_lookup(struct osd_device *dev, const struct lu_fid *fid,
 
        RETURN(ret);
 }
+
+/*
+ * Per-entry callback used by osd_scan_dir(). It is invoked with the id of
+ * the directory being scanned and the iterator that holds the current
+ * entry. A non-zero return value stops the scan and is handed back to the
+ * caller of osd_scan_dir().
+ */
+typedef int (*scan_dir_helper_t)(const struct lu_env *env,
+                                struct osd_device *dev, uint64_t dir_oid,
+                                struct osd_zap_it *ozi);
+
+/*
+ * Walk the entries of the ZAP directory object \a id and call \a cb for
+ * each of them, skipping "." and "..".
+ *
+ * For every entry the name is copied into the iterator (ozi_name) and the
+ * entry value is looked up into the iterator (ozi_zde) before \a cb runs,
+ * so the callback can read both from the iterator it receives.
+ *
+ * \retval 0   the cursor reported -ENOENT, i.e. nothing is left to visit
+ * \retval !0  first non-zero code from the allocation, the cursor, the
+ *             lookup or \a cb; -EIO if an entry's integer length is not 8
+ */
+static int osd_scan_dir(const struct lu_env *env, struct osd_device *dev,
+                       uint64_t id, scan_dir_helper_t cb)
+{
+       struct osd_zap_it *it;
+       struct luz_direntry *zde;
+       zap_attribute_t *za;
+       int rc;
+
+       ENTRY;
+
+       OBD_SLAB_ALLOC_PTR_GFP(it, osd_zapit_cachep, GFP_NOFS);
+       if (it == NULL)
+               RETURN(-ENOMEM);
+
+       rc = osd_zap_cursor_init(&it->ozi_zc, dev->od_os, id, 0);
+       if (rc != 0)
+               GOTO(out, rc);
+
+       za = &it->ozi_za;
+       zde = &it->ozi_zde;
+       while (1) {
+               rc = -zap_cursor_retrieve(it->ozi_zc, za);
+               if (unlikely(rc)) {
+                       /* -ENOENT from the cursor is the normal way out */
+                       if (rc == -ENOENT)
+                               rc = 0;
+
+                       break;
+               }
+
+               if (name_is_dot_or_dotdot(za->za_name, strlen(za->za_name))) {
+                       zap_cursor_advance(it->ozi_zc);
+                       continue;
+               }
+
+               /*
+                * strncpy() does not terminate the destination when the
+                * source fills it; the name is later used as a C string
+                * (strcmp(), "%s"), so bound the copy and terminate it.
+                */
+               strncpy(it->ozi_name, za->za_name, sizeof(it->ozi_name) - 1);
+               it->ozi_name[sizeof(it->ozi_name) - 1] = '\0';
+               if (za->za_integer_length != 8) {
+                       rc = -EIO;
+                       break;
+               }
+
+               rc = osd_zap_lookup(dev, it->ozi_zc->zc_zapobj, NULL,
+                                   za->za_name, za->za_integer_length,
+                                   sizeof(*zde) / za->za_integer_length, zde);
+               if (rc)
+                       break;
+
+               rc = cb(env, dev, id, it);
+               if (rc)
+                       break;
+
+               zap_cursor_advance(it->ozi_zc);
+       }
+       osd_zap_cursor_fini(it->ozi_zc);
+
+out:
+       OBD_SLAB_FREE_PTR(it, osd_zapit_cachep);
+       RETURN(rc);
+}
+
+/*
+ * Remove the entry \a name from the ZAP directory \a dir and decrement the
+ * link count of the object \a id, both within one transaction.
+ *
+ * Nothing is changed when the object's link count is already <= 1; that
+ * case is only reported and 0 is returned. When an in-memory object is
+ * found for \a fid, its oo_guard is held for read while the entry is
+ * removed.
+ *
+ * \retval 0   entry removed, or nothing to do
+ * \retval -ve on error
+ */
+static int osd_remove_ml_file(const struct lu_env *env, struct osd_device *dev,
+                             uint64_t dir, uint64_t id, struct lu_fid *fid,
+                             char *name)
+{
+       struct osd_thread_info *info = osd_oti_get(env);
+       struct dt_object *dt;
+       struct osd_object *obj = NULL;
+       dmu_tx_t *tx;
+       sa_handle_t *hdl;
+       uint64_t nlink;
+       int rc;
+
+       ENTRY;
+
+       rc = -sa_handle_get(dev->od_os, id, NULL, SA_HDL_PRIVATE, &hdl);
+       if (rc)
+               RETURN(rc);
+
+       dt = lu2dt(lu_object_find_slice(env, osd2lu_dev(dev), fid, NULL));
+       if (IS_ERR(dt)) {
+               /* the SA handle is already held, do not leak it */
+               sa_handle_destroy(hdl);
+               RETURN(PTR_ERR(dt));
+       }
+
+       if (dt) {
+               obj = osd_dt_obj(dt);
+               down_read(&obj->oo_guard);
+       }
+
+       rc = -sa_lookup(hdl, SA_ZPL_LINKS(dev), &nlink, sizeof(nlink));
+       if (rc)
+               GOTO(out, rc);
+
+       if (nlink <= 1) {
+               CERROR("%s: multi-link file O/%s/%s/%s has nlink %llu\n",
+                      osd_name(dev), info->oti_seq_name, info->oti_dir_name,
+                      name, nlink);
+               GOTO(out, rc = 0);
+       }
+
+       tx = dmu_tx_create(dev->od_os);
+       if (!tx) {
+               CERROR("%s: fail to create tx to remove multi-link file!\n",
+                      osd_name(dev));
+               GOTO(out, rc = -ENOMEM);
+       }
+
+       /*
+        * NOTE(review): only the directory ZAP is held in this tx although
+        * the link count of the object is updated below as well - confirm
+        * that no additional hold is required for the SA update.
+        */
+       dmu_tx_hold_zap(tx, dir, FALSE, NULL);
+       rc = -dmu_tx_assign(tx, TXG_WAIT);
+       if (rc)
+               GOTO(abort, rc);
+
+       nlink--;
+       rc = -sa_update(hdl, SA_ZPL_LINKS(dev), &nlink, sizeof(nlink), tx);
+       if (rc)
+               GOTO(abort, rc);
+
+       rc = -zap_remove(dev->od_os, dir, name, tx);
+       if (rc)
+               GOTO(abort, rc);
+
+       dmu_tx_commit(tx);
+       GOTO(out, rc);
+
+abort:
+       dmu_tx_abort(tx);
+
+out:
+       if (dt) {
+               up_read(&obj->oo_guard);
+               dt_object_put_nocache(env, dt);
+       }
+
+       sa_handle_destroy(hdl);
+       RETURN(rc);
+}
+
+/*
+ * osd_scan_dir() callback for one object entry found under
+ * O/<oti_seq_name>/<oti_dir_name>/.
+ *
+ * The FID of the entry's dnode is fetched and the sequence name, the
+ * "d<N>" directory name and the object name are recomputed from it. If all
+ * three match the path the entry was actually found at, nothing is done.
+ * Otherwise the entry is handed to osd_remove_ml_file().
+ *
+ * \retval 0   entry is at the expected place, or was handled
+ * \retval !0  error from osd_get_fid_by_oid() or osd_remove_ml_file()
+ */
+static int osd_scan_ml_file(const struct lu_env *env, struct osd_device *dev,
+                           uint64_t dir_oid, struct osd_zap_it *ozi)
+{
+       struct osd_thread_info *info = osd_oti_get(env);
+       struct lu_fid *fid = &info->oti_fid;
+       struct ost_id *ostid = &info->oti_ostid;
+       char name[32];
+       u64 seq;
+       int rc = 0;
+
+       ENTRY;
+
+       rc = osd_get_fid_by_oid(env, dev, ozi->ozi_zde.lzd_reg.zde_dnode, fid);
+       if (rc)
+               RETURN(rc);
+
+       seq = fid_seq(fid);
+       fid_to_ostid(fid, ostid);
+
+       snprintf(name, sizeof(name), (fid_seq_is_rsvd(seq) ||
+                                     fid_seq_is_mdt0(seq)) ? "%llu" : "%llx",
+                                     fid_seq_is_idif(seq) ? 0 : seq);
+       if (strcmp(info->oti_seq_name, name) != 0)
+               GOTO(fix, rc);
+
+       /*
+        * Take the modulo on the full-width id and only then narrow it:
+        * casting first would turn an id with bit 31 set into a negative
+        * int and produce a bogus name such as "d-5".
+        */
+       snprintf(name, sizeof(name), "d%d",
+               (int)(ostid_id(ostid) % OSD_OST_MAP_SIZE));
+       if (strcmp(info->oti_dir_name, name) != 0)
+               GOTO(fix, rc);
+
+       snprintf(name, sizeof(name), "%llu", ostid_id(ostid));
+       if (strcmp(ozi->ozi_name, name) == 0)
+               RETURN(0);
+
+fix:
+       CDEBUG(D_LFSCK, "%s: the file O/%s/%s/%s is corrupted\n",
+              osd_name(dev), info->oti_seq_name, info->oti_dir_name,
+              ozi->ozi_name);
+
+       rc = osd_remove_ml_file(env, dev, dir_oid,
+                               ozi->ozi_zde.lzd_reg.zde_dnode, fid,
+                               ozi->ozi_name);
+       RETURN(rc);
+}
+
+/*
+ * osd_scan_dir() callback for one entry of a sequence directory.
+ *
+ * Entries that are not directories are ignored. For a directory, its name
+ * is recorded in the per-thread info (oti_dir_name) and its own entries
+ * are scanned with osd_scan_ml_file().
+ *
+ * \retval 0   entry skipped or sub-directory scanned
+ * \retval !0  code returned by the nested osd_scan_dir()
+ */
+static int osd_scan_ml_file_dir(const struct lu_env *env,
+                               struct osd_device *dev, uint64_t dir_oid,
+                               struct osd_zap_it *ozi)
+{
+       struct osd_thread_info *info = osd_oti_get(env);
+
+       /*
+        * S_ISDIR() compares against CPU-endian mode constants, so the mode
+        * bits built by DTTOIF() are tested as they are; a byte-order
+        * conversion here would break the check on big-endian hosts.
+        */
+       if (!S_ISDIR(DTTOIF(ozi->ozi_zde.lzd_reg.zde_type)))
+               return 0;
+
+       info->oti_dir_name = ozi->ozi_name;
+       return osd_scan_dir(env, dev, ozi->ozi_zde.lzd_reg.zde_dnode,
+                           osd_scan_ml_file);
+}
+
+/*
+ * osd_scan_dir() callback for one entry of the directory scanned by
+ * osd_scan_ml_file_main().
+ *
+ * Entries that are not directories are ignored. For a directory, its name
+ * is recorded in the per-thread info (oti_seq_name) and its own entries
+ * are scanned with osd_scan_ml_file_dir().
+ *
+ * \retval 0   entry skipped or sub-directory scanned
+ * \retval !0  code returned by the nested osd_scan_dir()
+ */
+static int osd_scan_ml_file_seq(const struct lu_env *env,
+                               struct osd_device *dev, uint64_t dir_oid,
+                               struct osd_zap_it *ozi)
+{
+       struct osd_thread_info *info = osd_oti_get(env);
+
+       /*
+        * S_ISDIR() compares against CPU-endian mode constants, so the mode
+        * bits built by DTTOIF() are tested as they are; a byte-order
+        * conversion here would break the check on big-endian hosts.
+        */
+       if (!S_ISDIR(DTTOIF(ozi->ozi_zde.lzd_reg.zde_type)))
+               return 0;
+
+       info->oti_seq_name = ozi->ozi_name;
+       return osd_scan_dir(env, dev, ozi->ozi_zde.lzd_reg.zde_dnode,
+                           osd_scan_ml_file_seq == NULL ? NULL :
+                           osd_scan_ml_file_dir);
+}
+
+/*
+ * Entry point of the multi-link file scan: walk the directory identified
+ * by dev->od_O_id, handling each of its entries with
+ * osd_scan_ml_file_seq(), and return whatever osd_scan_dir() reports.
+ */
+static int osd_scan_ml_file_main(const struct lu_env *env,
+                                struct osd_device *dev)
+{
+       uint64_t root_oid = dev->od_O_id;
+       int rc;
+
+       rc = osd_scan_dir(env, dev, root_oid, osd_scan_ml_file_seq);
+       return rc;
+}