Whamcloud - gitweb
LU-13124 scrub: check for multiple linked file
[fs/lustre-release.git] / lustre / osd-zfs / osd_scrub.c
index b2cda0a..247973a 100644 (file)
 #include <obd_class.h>
 #include <lustre_nodemap.h>
 #include <sys/dsl_dataset.h>
+#include <sys/zap_impl.h>
+#include <sys/zap.h>
+#include <sys/zap_leaf.h>
 
 #include "osd_internal.h"
 
 #define OSD_OTABLE_MAX_HASH            ((1ULL << 48) - 1)
 #define OTABLE_PREFETCH                        256
 
-#define DTO_INDEX_INSERT               1
-#define DTO_INDEX_DELETE               2
-#define DTO_INDEX_UPDATE               3
-
 static inline bool osd_scrub_has_window(struct osd_otable_it *it)
 {
        return it->ooi_prefetched < OTABLE_PREFETCH;
@@ -68,11 +67,11 @@ static inline bool osd_scrub_has_window(struct osd_otable_it *it)
  * \retval   0, changed successfully
  * \retval -ve, on error
  */
-static int osd_scrub_refresh_mapping(const struct lu_env *env,
-                                    struct osd_device *dev,
-                                    const struct lu_fid *fid,
-                                    uint64_t oid, int ops,
-                                    bool force, const char *name)
+int osd_scrub_refresh_mapping(const struct lu_env *env,
+                             struct osd_device *dev,
+                             const struct lu_fid *fid,
+                             uint64_t oid, enum dt_txn_op ops,
+                             bool force, const char *name)
 {
        struct osd_thread_info *info = osd_oti_get(env);
        struct zpl_direntry *zde = &info->oti_zde.lzd_reg;
@@ -193,7 +192,10 @@ zget:
                        GOTO(out, rc);
                }
 
+               spin_lock(&scrub->os_lock);
                scrub->os_full_speed = 1;
+               spin_unlock(&scrub->os_lock);
+
                sf->sf_flags |= SF_INCONSISTENT;
        } else if (oid == oid2) {
                GOTO(out, rc = 0);
@@ -224,7 +226,9 @@ zget:
                }
 
 update:
+               spin_lock(&scrub->os_lock);
                scrub->os_full_speed = 1;
+               spin_unlock(&scrub->os_lock);
                sf->sf_flags |= SF_INCONSISTENT;
        }
 
@@ -239,6 +243,29 @@ update:
        GOTO(out, rc);
 
 out:
+       if (dev->od_is_ost) {
+               sa_handle_t *hdl;
+               uint64_t nlink, mode;
+
+               rc = -sa_handle_get(dev->od_os, oid, NULL, SA_HDL_PRIVATE,
+                                   &hdl);
+               if (rc)
+                       GOTO(cleanup, rc);
+
+               rc = -sa_lookup(hdl, SA_ZPL_MODE(dev), &mode, sizeof(mode));
+               if (rc || !S_ISREG(mode)) {
+                       sa_handle_destroy(hdl);
+                       GOTO(cleanup, rc);
+               }
+
+               rc = -sa_lookup(hdl, SA_ZPL_LINKS(dev), &nlink, sizeof(nlink));
+               if (rc == 0 && nlink > 1)
+                       scrub->os_has_ml_file = 1;
+
+               sa_handle_destroy(hdl);
+       }
+
+cleanup:
        if (nvbuf)
                nvlist_free(nvbuf);
 
@@ -275,7 +302,6 @@ out:
 static int osd_scrub_prep(const struct lu_env *env, struct osd_device *dev)
 {
        struct lustre_scrub *scrub = &dev->od_scrub;
-       struct ptlrpc_thread *thread = &scrub->os_thread;
        struct scrub_file *sf = &scrub->os_file;
        __u32 flags = scrub->os_start_flags;
        int rc;
@@ -301,6 +327,7 @@ static int osd_scrub_prep(const struct lu_env *env, struct osd_device *dev)
        if (flags & SS_RESET)
                scrub_file_reset(scrub, dev->od_uuid, 0);
 
+       spin_lock(&scrub->os_lock);
        scrub->os_partial_scan = 0;
        if (flags & SS_AUTO_FULL) {
                scrub->os_full_speed = 1;
@@ -312,7 +339,6 @@ static int osd_scrub_prep(const struct lu_env *env, struct osd_device *dev)
                scrub->os_full_speed = 0;
        }
 
-       spin_lock(&scrub->os_lock);
        scrub->os_in_prior = 0;
        scrub->os_waiting = 0;
        scrub->os_paused = 0;
@@ -329,15 +355,15 @@ static int osd_scrub_prep(const struct lu_env *env, struct osd_device *dev)
 
        scrub->os_pos_current = sf->sf_pos_latest_start;
        sf->sf_status = SS_SCANNING;
-       sf->sf_time_latest_start = cfs_time_current_sec();
+       sf->sf_time_latest_start = ktime_get_real_seconds();
        sf->sf_time_last_checkpoint = sf->sf_time_latest_start;
        sf->sf_pos_last_checkpoint = sf->sf_pos_latest_start - 1;
        rc = scrub_file_store(env, scrub);
        if (!rc) {
                spin_lock(&scrub->os_lock);
-               thread_set_flags(thread, SVC_RUNNING);
+               scrub->os_running = 1;
                spin_unlock(&scrub->os_lock);
-               wake_up_all(&thread->t_ctl_waitq);
+               wake_up_var(scrub);
        }
        up_write(&scrub->os_rwsem);
 
@@ -357,14 +383,14 @@ static int osd_scrub_post(const struct lu_env *env, struct osd_device *dev,
 
        down_write(&scrub->os_rwsem);
        spin_lock(&scrub->os_lock);
-       thread_set_flags(&scrub->os_thread, SVC_STOPPING);
+       scrub->os_running = 0;
        spin_unlock(&scrub->os_lock);
        if (scrub->os_new_checked > 0) {
                sf->sf_items_checked += scrub->os_new_checked;
                scrub->os_new_checked = 0;
                sf->sf_pos_last_checkpoint = scrub->os_pos_current;
        }
-       sf->sf_time_last_checkpoint = cfs_time_current_sec();
+       sf->sf_time_last_checkpoint = ktime_get_real_seconds();
        if (result > 0) {
                sf->sf_status = SS_COMPLETED;
                if (!(sf->sf_param & SP_DRYRUN)) {
@@ -382,8 +408,9 @@ static int osd_scrub_post(const struct lu_env *env, struct osd_device *dev,
        } else {
                sf->sf_status = SS_FAILED;
        }
-       sf->sf_run_time += cfs_duration_sec(cfs_time_current() + HALF_SEC -
-                                           scrub->os_time_last_checkpoint);
+       sf->sf_run_time += ktime_get_seconds() -
+                          scrub->os_time_last_checkpoint;
+
        rc = scrub_file_store(env, scrub);
        up_write(&scrub->os_rwsem);
 
@@ -398,7 +425,7 @@ osd_scrub_wakeup(struct lustre_scrub *scrub, struct osd_otable_it *it)
        spin_lock(&scrub->os_lock);
        if (osd_scrub_has_window(it) ||
            !list_empty(&scrub->os_inconsistent_items) ||
-           it->ooi_waiting || !thread_is_running(&scrub->os_thread))
+           it->ooi_waiting || kthread_should_stop())
                scrub->os_waiting = 0;
        else
                scrub->os_waiting = 1;
@@ -410,9 +437,7 @@ osd_scrub_wakeup(struct lustre_scrub *scrub, struct osd_otable_it *it)
 static int osd_scrub_next(const struct lu_env *env, struct osd_device *dev,
                          struct lu_fid *fid, uint64_t *oid)
 {
-       struct l_wait_info lwi = { 0 };
        struct lustre_scrub *scrub = &dev->od_scrub;
-       struct ptlrpc_thread *thread = &scrub->os_thread;
        struct osd_otable_it *it = dev->od_otable_it;
        struct lustre_mdt_attrs *lma = NULL;
        nvlist_t *nvbuf = NULL;
@@ -421,20 +446,19 @@ static int osd_scrub_next(const struct lu_env *env, struct osd_device *dev,
        ENTRY;
 
        if (OBD_FAIL_CHECK(OBD_FAIL_OSD_SCRUB_DELAY) && cfs_fail_val > 0) {
-               lwi = LWI_TIMEOUT(cfs_time_seconds(cfs_fail_val), NULL, NULL);
-               if (likely(lwi.lwi_timeout > 0)) {
-                       l_wait_event(thread->t_ctl_waitq,
-                               !list_empty(&scrub->os_inconsistent_items) ||
-                               !thread_is_running(thread),
-                               &lwi);
-                       if (unlikely(!thread_is_running(thread)))
-                               RETURN(SCRUB_NEXT_EXIT);
-               }
+               wait_var_event_timeout(
+                       scrub,
+                       !list_empty(&scrub->os_inconsistent_items) ||
+                       kthread_should_stop(),
+                       cfs_time_seconds(cfs_fail_val));
+
+               if (kthread_should_stop())
+                       RETURN(SCRUB_NEXT_EXIT);
        }
 
        if (OBD_FAIL_CHECK(OBD_FAIL_OSD_SCRUB_CRASH)) {
                spin_lock(&scrub->os_lock);
-               thread_set_flags(thread, SVC_STOPPING);
+               scrub->os_running = 0;
                spin_unlock(&scrub->os_lock);
                RETURN(SCRUB_NEXT_CRASH);
        }
@@ -466,14 +490,10 @@ again:
                spin_unlock(&scrub->os_lock);
        }
 
-       if (!scrub->os_full_speed && !osd_scrub_has_window(it)) {
-               memset(&lwi, 0, sizeof(lwi));
-               l_wait_event(thread->t_ctl_waitq,
-                            osd_scrub_wakeup(scrub, it),
-                            &lwi);
-       }
+       if (!scrub->os_full_speed && !osd_scrub_has_window(it))
+               wait_var_event(scrub, osd_scrub_wakeup(scrub, it));
 
-       if (unlikely(!thread_is_running(thread)))
+       if (kthread_should_stop())
                GOTO(out, rc = SCRUB_NEXT_EXIT);
 
        rc = -dmu_object_next(dev->od_os, &scrub->os_pos_current, B_FALSE, 0);
@@ -506,7 +526,7 @@ again:
                it->ooi_prefetched++;
                if (it->ooi_waiting) {
                        it->ooi_waiting = 0;
-                       wake_up_all(&thread->t_ctl_waitq);
+                       wake_up_var(scrub);
                }
                spin_unlock(&scrub->os_lock);
        }
@@ -524,7 +544,6 @@ static int osd_scrub_exec(const struct lu_env *env, struct osd_device *dev,
                          const struct lu_fid *fid, uint64_t oid, int rc)
 {
        struct lustre_scrub *scrub = &dev->od_scrub;
-       struct ptlrpc_thread *thread = &scrub->os_thread;
        struct osd_otable_it *it = dev->od_otable_it;
 
        rc = osd_scrub_check_update(env, dev, fid, oid, rc);
@@ -534,12 +553,14 @@ static int osd_scrub_exec(const struct lu_env *env, struct osd_device *dev,
                        it->ooi_prefetched++;
                        if (it->ooi_waiting) {
                                it->ooi_waiting = 0;
-                               wake_up_all(&thread->t_ctl_waitq);
+                               wake_up_var(scrub);
                        }
                        spin_unlock(&scrub->os_lock);
                }
        } else {
+               spin_lock(&scrub->os_lock);
                scrub->os_in_prior = 0;
+               spin_unlock(&scrub->os_lock);
        }
 
        if (rc)
@@ -555,15 +576,17 @@ static int osd_scrub_exec(const struct lu_env *env, struct osd_device *dev,
        return 0;
 }
 
+static int osd_scan_ml_file_main(const struct lu_env *env,
+                                struct osd_device *dev);
+
 static int osd_scrub_main(void *args)
 {
        struct lu_env env;
        struct osd_device *dev = (struct osd_device *)args;
        struct lustre_scrub *scrub = &dev->od_scrub;
-       struct ptlrpc_thread *thread = &scrub->os_thread;
        struct lu_fid *fid;
        uint64_t oid;
-       int rc = 0;
+       int rc = 0, ret;
        ENTRY;
 
        rc = lu_env_init(&env, LCT_LOCAL | LCT_DT_THREAD);
@@ -581,13 +604,13 @@ static int osd_scrub_main(void *args)
        }
 
        if (!scrub->os_full_speed) {
-               struct l_wait_info lwi = { 0 };
                struct osd_otable_it *it = dev->od_otable_it;
 
-               l_wait_event(thread->t_ctl_waitq,
-                            it->ooi_user_ready || !thread_is_running(thread),
-                            &lwi);
-               if (unlikely(!thread_is_running(thread)))
+               wait_var_event(scrub,
+                              it->ooi_user_ready ||
+                              kthread_should_stop());
+
+               if (kthread_should_stop())
                        GOTO(post, rc = 0);
 
                scrub->os_pos_current = it->ooi_pos;
@@ -598,14 +621,14 @@ static int osd_scrub_main(void *args)
               scrub->os_pos_current);
 
        fid = &osd_oti_get(&env)->oti_fid;
-       while (!rc && thread_is_running(thread)) {
+       while (!rc && !kthread_should_stop()) {
                rc = osd_scrub_next(&env, dev, fid, &oid);
                switch (rc) {
                case SCRUB_NEXT_EXIT:
                        GOTO(post, rc = 0);
                case SCRUB_NEXT_CRASH:
                        spin_lock(&scrub->os_lock);
-                       thread_set_flags(&scrub->os_thread, SVC_STOPPING);
+                       scrub->os_running = 0;
                        spin_unlock(&scrub->os_lock);
                        GOTO(out, rc = -EINVAL);
                case SCRUB_NEXT_FATAL:
@@ -620,6 +643,12 @@ static int osd_scrub_main(void *args)
        GOTO(post, rc);
 
 post:
+       if (scrub->os_has_ml_file) {
+               ret = osd_scan_ml_file_main(&env, dev);
+               if (ret != 0)
+                       rc = ret;
+       }
+
        rc = osd_scrub_post(&env, dev, rc);
        CDEBUG(D_LFSCK, "%s: OI scrub: stop, pos = %llu: rc = %d\n",
               scrub->os_name, scrub->os_pos_current, rc);
@@ -638,9 +667,12 @@ out:
 
 noenv:
        spin_lock(&scrub->os_lock);
-       thread_set_flags(thread, SVC_STOPPED);
-       wake_up_all(&thread->t_ctl_waitq);
+       scrub->os_running = 0;
        spin_unlock(&scrub->os_lock);
+       if (xchg(&scrub->os_task, NULL) == NULL)
+               /* scrub_stop is waiting, we need to synchronize */
+               wait_var_event(scrub, kthread_should_stop());
+       wake_up_var(scrub);
        return rc;
 }
 
@@ -697,7 +729,7 @@ static const struct osd_lf_map osd_lf_maps[] = {
 
        /* PENDING */
        {
-               .olm_name               = "PENDING",
+               .olm_name               = MDT_ORPHAN_DIR,
        },
 
        /* ROOT */
@@ -749,7 +781,7 @@ static const struct osd_lf_map osd_lf_maps[] = {
        /* LFSCK */
        {
                .olm_name               = LFSCK_DIR,
-               .olm_flags              = OLF_SCAN_SUBITEMS,
+               .olm_flags              = OLF_SCAN_SUBITEMS | OLF_NOT_BACKUP,
                .olm_scan_dir           = osd_ios_general_sd,
                .olm_handle_dirent      = osd_ios_varfid_hd,
        },
@@ -803,6 +835,18 @@ static const struct osd_lf_map osd_lf_maps[] = {
                .olm_name               = LUSTRE_NODEMAP_NAME,
        },
 
+       /* index_backup */
+       {
+               .olm_name               = INDEX_BACKUP_DIR,
+               .olm_fid                = {
+                       .f_seq  = FID_SEQ_LOCAL_FILE,
+                       .f_oid  = INDEX_BACKUP_OID,
+               },
+               .olm_flags              = OLF_SCAN_SUBITEMS | OLF_NOT_BACKUP,
+               .olm_scan_dir           = osd_ios_general_sd,
+               .olm_handle_dirent      = osd_ios_varfid_hd,
+       },
+
        {
                .olm_name               = NULL
        }
@@ -864,6 +908,130 @@ static int osd_ios_new_item(struct osd_device *dev, uint64_t parent,
        return 0;
 }
 
+/**
+ * Probe an index object to decide whether it has to be re-created.
+ *
+ * A serialized ZAP cursor is opened on \a oid at position 0 and a single
+ * retrieve is attempted; the cursor is released before the result is
+ * evaluated.
+ *
+ * \param[in] env      execution environment, supplies the per-thread
+ *                     cursor and attribute buffers
+ * \param[in] dev      OSD device whose object set holds the index
+ * \param[in] oid      object id of the index to probe
+ *
+ * \retval true        the retrieve failed with an error other than -ENOENT
+ * \retval false       the retrieve succeeded or reported -ENOENT
+ */
+static bool osd_index_need_recreate(const struct lu_env *env,
+                                   struct osd_device *dev, uint64_t oid)
+{
+       struct osd_thread_info *oti = osd_oti_get(env);
+       zap_cursor_t *cursor = &oti->oti_zc2;
+       int err;
+       ENTRY;
+
+       zap_cursor_init_serialized(cursor, dev->od_os, oid, 0);
+       err = -zap_cursor_retrieve(cursor, &oti->oti_za2);
+       zap_cursor_fini(cursor);
+
+       /* Only a real failure, not "no entry", asks for re-creation. */
+       if (err == 0 || err == -ENOENT)
+               RETURN(false);
+
+       RETURN(true);
+}
+
+/**
+ * Register the index object \a oid (with FID \a fid) if it qualifies.
+ *
+ * The object is silently skipped when its dnode cannot be found
+ * (-EEXIST / -ENOENT). It is skipped without a warning (rc = 1) when it is
+ * not a ZAP, its SA mode is not a regular file, the ZAP has no entry, the
+ * ZAP is a micro ZAP or lacks ZAP_FLAG_UINT64_KEY, or the derived key or
+ * record size is zero. Otherwise the key size is taken from leaf entry 0 of
+ * the cursor's current leaf, the record size from the retrieved attribute,
+ * and both are passed to osd_index_register().
+ *
+ * The outcome is only logged; nothing is returned to the caller:
+ * rc < 0 is reported with CWARN, rc == 0 with a D_LFSCK debug message,
+ * rc > 0 (skipped) is not logged.
+ *
+ * \param[in] env      execution environment, supplies the per-thread
+ *                     cursor and attribute buffers
+ * \param[in] osd      OSD device whose object set holds the index
+ * \param[in] fid      FID of the index object
+ * \param[in] oid      object id of the index object
+ */
+static void osd_ios_index_register(const struct lu_env *env,
+                                  struct osd_device *osd,
+                                  const struct lu_fid *fid, uint64_t oid)
+{
+       struct osd_thread_info *info = osd_oti_get(env);
+       zap_attribute_t *za = &info->oti_za2;
+       zap_cursor_t *zc = &info->oti_zc2;
+       struct zap_leaf_entry *le;
+       dnode_t *dn = NULL;
+       sa_handle_t *hdl;
+       __u64 mode = 0;
+       __u32 keysize = 0;
+       __u32 recsize = 0;
+       int rc;
+       ENTRY;
+
+       rc = __osd_obj2dnode(osd->od_os, oid, &dn);
+       if (rc == -EEXIST || rc == -ENOENT)
+               RETURN_EXIT;
+
+       if (rc < 0)
+               GOTO(log, rc);
+
+       if (!osd_object_is_zap(dn))
+               GOTO(log, rc = 1);
+
+       rc = -sa_handle_get(osd->od_os, oid, NULL, SA_HDL_PRIVATE, &hdl);
+       if (rc)
+               GOTO(log, rc);
+
+       /* The handle is only needed for the mode, drop it right away. */
+       rc = -sa_lookup(hdl, SA_ZPL_MODE(osd), &mode, sizeof(mode));
+       sa_handle_destroy(hdl);
+       if (rc)
+               GOTO(log, rc);
+
+       if (!S_ISREG(mode))
+               GOTO(log, rc = 1);
+
+       zap_cursor_init_serialized(zc, osd->od_os, oid, 0);
+       rc = -zap_cursor_retrieve(zc, za);
+       if (rc)
+               /* Skip empty index object */
+               GOTO(fini, rc = (rc == -ENOENT ? 1 : rc));
+
+       if (zc->zc_zap->zap_ismicro ||
+           !(zap_f_phys(zc->zc_zap)->zap_flags & ZAP_FLAG_UINT64_KEY))
+               GOTO(fini, rc = 1);
+
+       le = ZAP_LEAF_ENTRY(zc->zc_leaf, 0);
+       keysize = le->le_name_numints * 8;
+       recsize = za->za_integer_length * za->za_num_integers;
+       if (likely(keysize && recsize))
+               rc = osd_index_register(osd, fid, keysize, recsize);
+       else
+               /* Nothing was registered, so do not log success below. */
+               rc = 1;
+
+       GOTO(fini, rc);
+
+fini:
+       zap_cursor_fini(zc);
+
+log:
+       if (dn)
+               osd_dnode_rele(dn);
+       if (rc < 0)
+               CWARN("%s: failed to register index "DFID" (%u/%u): rc = %d\n",
+                     osd_name(osd), PFID(fid), keysize, recsize, rc);
+       else if (!rc)
+               CDEBUG(D_LFSCK, "%s: registered index "DFID" (%u/%u)\n",
+                      osd_name(osd), PFID(fid), keysize, recsize);
+}
+
+/**
+ * Restore one index object from its backup copy.
+ *
+ * The target FID is converted into a name in \a buf, that name is looked up
+ * under dev->od_index_backup_id to find the backup object, the backup
+ * object's FID is resolved, and lustre_index_restore() is called. The result
+ * is only logged; nothing is returned to the caller.
+ *
+ * \param[in] env      execution environment
+ * \param[in] dev      OSD device being scrubbed
+ * \param[in] liru     restore unit naming the index to rebuild
+ * \param[in] buf      scratch buffer, first used for the backup name and
+ *                     then handed to lustre_index_restore()
+ * \param[in] bufsize  size of \a buf in bytes
+ */
+static void osd_index_restore(const struct lu_env *env, struct osd_device *dev,
+                             struct lustre_index_restore_unit *liru, void *buf,
+                             int bufsize)
+{
+       struct osd_thread_info *oti = osd_oti_get(env);
+       struct luz_direntry *dirent = &oti->oti_zde;
+       struct lu_fid *target = &liru->liru_cfid;
+       struct lu_fid backup;
+       int rc;
+       ENTRY;
+
+       /* Find the backup object by the name derived from the target FID. */
+       lustre_fid2lbx(buf, target, bufsize);
+       rc = -zap_lookup(dev->od_os, dev->od_index_backup_id, buf, 8,
+                        sizeof(*dirent) / 8, (void *)dirent);
+       if (rc)
+               GOTO(log, rc);
+
+       rc = osd_get_fid_by_oid(env, dev, dirent->lzd_reg.zde_dnode, &backup);
+       if (rc)
+               GOTO(log, rc);
+
+       /* The index is about to be re-created, so its OI mapping may be
+        * invalid: do not update the OI mapping, only cache it in RAM. */
+       rc = osd_idc_find_and_init_with_oid(env, dev, target,
+                                           liru->liru_clid);
+       if (rc)
+               GOTO(log, rc);
+
+       rc = lustre_index_restore(env, &dev->od_dt_dev,
+                                 &liru->liru_pfid, target, &backup,
+                                 liru->liru_name, &dev->od_index_backup_list,
+                                 &dev->od_lock, buf, bufsize);
+       GOTO(log, rc);
+
+log:
+       CDEBUG(D_WARNING, "%s: restore index '%s' with "DFID": rc = %d\n",
+              osd_name(dev), liru->liru_name, PFID(target), rc);
+}
+
 /**
  * verify FID-in-LMA and OI entry for one object
  *
@@ -912,7 +1080,31 @@ static int osd_ios_scan_one(const struct lu_env *env, struct osd_device *dev,
                                RETURN(0);
                        }
 
+                       if (lma->lma_compat & LMAC_IDX_BACKUP &&
+                           osd_index_need_recreate(env, dev, oid)) {
+                               if (parent == dev->od_root) {
+                                       lu_local_obj_fid(&tfid,
+                                                        OSD_FS_ROOT_OID);
+                               } else {
+                                       rc = osd_get_fid_by_oid(env, dev,
+                                                               parent, &tfid);
+                                       if (rc) {
+                                               nvlist_free(nvbuf);
+                                               RETURN(rc);
+                                       }
+                               }
+
+                               rc = lustre_liru_new(
+                                               &dev->od_index_restore_list,
+                                               &tfid, &lma->lma_self_fid, oid,
+                                               name, strlen(name));
+                               nvlist_free(nvbuf);
+                               RETURN(rc);
+                       }
+
                        tfid = lma->lma_self_fid;
+                       if (!(flags & OLF_NOT_BACKUP))
+                               osd_ios_index_register(env, dev, &tfid, oid);
                }
                nvlist_free(nvbuf);
        }
@@ -1109,8 +1301,7 @@ static int osd_ios_ROOT_sd(const struct lu_env *env, struct osd_device *dev,
                                    sizeof(*zde) / 8, (void *)zde);
                if (rc) {
                        if (rc != -ENOENT)
-                               CWARN("%s: initial OI scrub failed to find"
-                                     "the entry %s under .lustre: rc = %d\n",
+                               CWARN("%s: initial OI scrub failed to find the entry %s under .lustre: rc = %d\n",
                                      osd_name(dev), map->olm_name, rc);
                        else if (!fid_is_zero(&map->olm_fid))
                                /* Try to remove the stale OI mapping. */
@@ -1176,6 +1367,31 @@ static void osd_initial_OI_scrub(const struct lu_env *env,
                OBD_FREE_PTR(item);
        }
 
+       if (!list_empty(&dev->od_index_restore_list)) {
+               char *buf;
+
+               OBD_ALLOC_LARGE(buf, INDEX_BACKUP_BUFSIZE);
+               if (!buf)
+                       CERROR("%s: not enough RAM for rebuild index\n",
+                              osd_name(dev));
+
+               while (!list_empty(&dev->od_index_restore_list)) {
+                       struct lustre_index_restore_unit *liru;
+
+                       liru = list_entry(dev->od_index_restore_list.next,
+                                         struct lustre_index_restore_unit,
+                                         liru_link);
+                       list_del(&liru->liru_link);
+                       if (buf)
+                               osd_index_restore(env, dev, liru, buf,
+                                                 INDEX_BACKUP_BUFSIZE);
+                       OBD_FREE(liru, liru->liru_len);
+               }
+
+               if (buf)
+                       OBD_FREE_LARGE(buf, INDEX_BACKUP_BUFSIZE);
+       }
+
        EXIT;
 }
 
@@ -1198,14 +1414,16 @@ int osd_scrub_start(const struct lu_env *env, struct osd_device *dev,
        RETURN(rc == -EALREADY ? 0 : rc);
 }
 
-static void osd_scrub_stop(struct osd_device *dev)
+void osd_scrub_stop(struct osd_device *dev)
 {
        struct lustre_scrub *scrub = &dev->od_scrub;
        ENTRY;
 
        /* od_otable_sem: prevent concurrent start/stop */
        down(&dev->od_otable_sem);
+       spin_lock(&scrub->os_lock);
        scrub->os_paused = 1;
+       spin_unlock(&scrub->os_lock);
        scrub_stop(scrub);
        up(&dev->od_otable_sem);
 
@@ -1216,7 +1434,8 @@ static void osd_scrub_stop(struct osd_device *dev)
 
 static const char osd_scrub_name[] = "OI_scrub";
 
-int osd_scrub_setup(const struct lu_env *env, struct osd_device *dev)
+int osd_scrub_setup(const struct lu_env *env, struct osd_device *dev,
+                   bool resetoi)
 {
        struct osd_thread_info *info = osd_oti_get(env);
        struct lustre_scrub *scrub = &dev->od_scrub;
@@ -1228,11 +1447,10 @@ int osd_scrub_setup(const struct lu_env *env, struct osd_device *dev)
        bool dirty = false;
        ENTRY;
 
-       memcpy(dev->od_uuid,
+       memcpy(dev->od_uuid.b,
               &dsl_dataset_phys(dev->od_os->os_dsl_dataset)->ds_guid,
               sizeof(dsl_dataset_phys(dev->od_os->os_dsl_dataset)->ds_guid));
        memset(&dev->od_scrub, 0, sizeof(struct lustre_scrub));
-       init_waitqueue_head(&scrub->os_thread.t_ctl_waitq);
        init_rwsem(&scrub->os_rwsem);
        spin_lock_init(&scrub->os_lock);
        INIT_LIST_HEAD(&scrub->os_inconsistent_items);
@@ -1259,6 +1477,7 @@ int osd_scrub_setup(const struct lu_env *env, struct osd_device *dev)
        if (IS_ERR_OR_NULL(obj))
                RETURN(obj ? PTR_ERR(obj) : -ENOENT);
 
+       obj->do_body_ops = &osd_body_scrub_ops;
        scrub->os_obj = obj;
        rc = scrub_file_load(env, scrub);
        if (rc == -ENOENT || rc == -EFAULT) {
@@ -1267,29 +1486,12 @@ int osd_scrub_setup(const struct lu_env *env, struct osd_device *dev)
        } else if (rc < 0) {
                GOTO(cleanup_obj, rc);
        } else {
-               if (memcmp(sf->sf_uuid, dev->od_uuid, 16) != 0) {
-                       struct obd_uuid *old_uuid;
-                       struct obd_uuid *new_uuid;
-
-                       OBD_ALLOC_PTR(old_uuid);
-                       OBD_ALLOC_PTR(new_uuid);
-                       if (!old_uuid || !new_uuid) {
-                               CERROR("%s: UUID has been changed, but"
-                                      "failed to allocate RAM for report\n",
-                                      osd_name(dev));
-                       } else {
-                               class_uuid_unparse(sf->sf_uuid, old_uuid);
-                               class_uuid_unparse(dev->od_uuid, new_uuid);
-                               CDEBUG(D_LFSCK, "%s: UUID has been changed "
-                                      "from %s to %s\n", osd_name(dev),
-                                      old_uuid->uuid, new_uuid->uuid);
-                       }
+               if (!uuid_equal(&sf->sf_uuid, &dev->od_uuid)) {
+                       CDEBUG(D_LFSCK,
+                              "%s: UUID has been changed from %pU to %pU\n",
+                              osd_name(dev), &sf->sf_uuid, &dev->od_uuid);
                        scrub_file_reset(scrub, dev->od_uuid, SF_INCONSISTENT);
                        dirty = true;
-                       if (old_uuid)
-                               OBD_FREE_PTR(old_uuid);
-                       if (new_uuid)
-                               OBD_FREE_PTR(new_uuid);
                } else if (sf->sf_status == SS_SCANNING) {
                        sf->sf_status = SS_CRASHED;
                        dirty = true;
@@ -1316,7 +1518,7 @@ int osd_scrub_setup(const struct lu_env *env, struct osd_device *dev)
        }
 
        /* Initialize OI files. */
-       rc = osd_oi_init(env, dev);
+       rc = osd_oi_init(env, dev, resetoi);
        if (rc < 0)
                GOTO(cleanup_obj, rc);
 
@@ -1494,7 +1696,7 @@ osd_otable_it_wakeup(struct lustre_scrub *scrub, struct osd_otable_it *it)
 {
        spin_lock(&scrub->os_lock);
        if (it->ooi_pos < scrub->os_pos_current || scrub->os_waiting ||
-           !thread_is_running(&scrub->os_thread))
+           !scrub->os_running)
                it->ooi_waiting = 0;
        else
                it->ooi_waiting = 1;
@@ -1508,12 +1710,10 @@ static int osd_otable_it_next(const struct lu_env *env, struct dt_it *di)
        struct osd_otable_it *it = (struct osd_otable_it *)di;
        struct osd_device *dev = it->ooi_dev;
        struct lustre_scrub *scrub = &dev->od_scrub;
-       struct ptlrpc_thread *thread = &scrub->os_thread;
-       struct l_wait_info lwi = { 0 };
        struct lustre_mdt_attrs *lma = NULL;
        nvlist_t *nvbuf = NULL;
-       int size = 0;
-       int rc;
+       int rc, size = 0;
+       bool locked;
        ENTRY;
 
        LASSERT(it->ooi_user_ready);
@@ -1531,11 +1731,10 @@ again:
        }
 
        if (it->ooi_pos >= scrub->os_pos_current)
-               l_wait_event(thread->t_ctl_waitq,
-                            osd_otable_it_wakeup(scrub, it),
-                            &lwi);
+               wait_var_event(scrub,
+                              osd_otable_it_wakeup(scrub, it));
 
-       if (!thread_is_running(thread) && !it->ooi_used_outside)
+       if (!scrub->os_running && !it->ooi_used_outside)
                GOTO(out, rc = 1);
 
        rc = -dmu_object_next(dev->od_os, &it->ooi_pos, B_FALSE, 0);
@@ -1550,16 +1749,20 @@ again:
 
        rc = __osd_xattr_load_by_oid(dev, it->ooi_pos, &nvbuf);
 
-       if (!scrub->os_full_speed)
+       locked = false;
+       if (!scrub->os_full_speed) {
                spin_lock(&scrub->os_lock);
+               locked = true;
+       }
        it->ooi_prefetched--;
        if (!scrub->os_full_speed) {
                if (scrub->os_waiting) {
                        scrub->os_waiting = 0;
-                       wake_up_all(&thread->t_ctl_waitq);
+                       wake_up_var(scrub);
                }
-               spin_unlock(&scrub->os_lock);
        }
+       if (locked)
+               spin_unlock(&scrub->os_lock);
 
        if (rc == -ENOENT || rc == -EEXIST || rc == -ENODATA)
                goto again;
@@ -1651,7 +1854,7 @@ static int osd_otable_it_load(const struct lu_env *env,
        it->ooi_prefetched_dnode = 0;
        it->ooi_user_ready = 1;
        if (!scrub->os_full_speed)
-               wake_up_all(&scrub->os_thread.t_ctl_waitq);
+               wake_up_var(scrub);
 
        /* Unplug OSD layer iteration by the first next() call. */
        rc = osd_otable_it_next(env, (struct dt_it *)it);
@@ -1687,7 +1890,6 @@ int osd_oii_insert(const struct lu_env *env, struct osd_device *dev,
                   const struct lu_fid *fid, uint64_t oid, bool insert)
 {
        struct lustre_scrub *scrub = &dev->od_scrub;
-       struct ptlrpc_thread *thread = &scrub->os_thread;
        struct osd_inconsistent_item *oii;
        bool wakeup = false;
        ENTRY;
@@ -1704,7 +1906,7 @@ int osd_oii_insert(const struct lu_env *env, struct osd_device *dev,
        oii->oii_insert = insert;
 
        spin_lock(&scrub->os_lock);
-       if (unlikely(!thread_is_running(thread))) {
+       if (!scrub->os_running) {
                spin_unlock(&scrub->os_lock);
                OBD_FREE_PTR(oii);
                RETURN(-EAGAIN);
@@ -1716,7 +1918,7 @@ int osd_oii_insert(const struct lu_env *env, struct osd_device *dev,
        spin_unlock(&scrub->os_lock);
 
        if (wakeup)
-               wake_up_all(&thread->t_ctl_waitq);
+               wake_up_var(scrub);
 
        RETURN(0);
 }
@@ -1741,3 +1943,218 @@ int osd_oii_lookup(struct osd_device *dev, const struct lu_fid *fid,
 
        RETURN(ret);
 }
+
+/*
+ * Per-entry callback used by osd_scan_dir().
+ *
+ * It receives the object id of the directory being walked (dir_oid) and the
+ * iterator whose ozi_name / ozi_zde describe the current entry.  Returning
+ * non-zero stops the walk and that value is returned by osd_scan_dir().
+ */
+typedef int (*scan_dir_helper_t)(const struct lu_env *env,
+                                struct osd_device *dev, uint64_t dir_oid,
+                                struct osd_zap_it *ozi);
+
+/**
+ * Walk every entry of one ZAP directory and hand it to a callback.
+ *
+ * "." and ".." are skipped.  For every other entry the name is copied into
+ * the iterator's ozi_name, the directory entry is looked up into ozi_zde,
+ * and \a cb is called with the iterator.  The walk stops at the first
+ * failure.
+ *
+ * \param[in] env      execution environment, passed through to \a cb
+ * \param[in] dev      OSD device whose objset (od_os) holds the directory
+ * \param[in] id       object id of the ZAP directory to walk
+ * \param[in] cb       helper called once per entry
+ *
+ * \retval 0           the whole directory was walked
+ * \retval -ENOMEM     the iterator could not be allocated
+ * \retval -EIO        an entry whose integer length is not 8 was found
+ * \retval non-zero    failure from cursor init/retrieve, lookup or \a cb
+ */
+static int osd_scan_dir(const struct lu_env *env, struct osd_device *dev,
+                       uint64_t id, scan_dir_helper_t cb)
+{
+       struct osd_zap_it *it;
+       struct luz_direntry *zde;
+       zap_attribute_t *za;
+       int rc;
+
+       ENTRY;
+
+       OBD_SLAB_ALLOC_PTR_GFP(it, osd_zapit_cachep, GFP_NOFS);
+       if (it == NULL)
+               RETURN(-ENOMEM);
+
+       rc = osd_zap_cursor_init(&it->ozi_zc, dev->od_os, id, 0);
+       if (rc != 0)
+               GOTO(out, rc);
+
+       za = &it->ozi_za;
+       zde = &it->ozi_zde;
+       while (1) {
+               rc = -zap_cursor_retrieve(it->ozi_zc, za);
+               if (unlikely(rc)) {
+                       /* -ENOENT is treated as the normal end of the walk */
+                       if (rc == -ENOENT)
+                               rc = 0;
+
+                       break;
+               }
+
+               if (name_is_dot_or_dotdot(za->za_name, strlen(za->za_name))) {
+                       zap_cursor_advance(it->ozi_zc);
+                       continue;
+               }
+
+               /*
+                * strncpy() leaves the destination unterminated when the
+                * source fills it; the callbacks use ozi_name as a C string,
+                * so terminate it explicitly.
+                */
+               strncpy(it->ozi_name, za->za_name, sizeof(it->ozi_name));
+               it->ozi_name[sizeof(it->ozi_name) - 1] = '\0';
+               if (za->za_integer_length != 8) {
+                       rc = -EIO;
+                       break;
+               }
+
+               rc = osd_zap_lookup(dev, it->ozi_zc->zc_zapobj, NULL,
+                                   za->za_name, za->za_integer_length,
+                                   sizeof(*zde) / za->za_integer_length, zde);
+               if (rc)
+                       break;
+
+               rc = cb(env, dev, id, it);
+               if (rc)
+                       break;
+
+               zap_cursor_advance(it->ozi_zc);
+       }
+       osd_zap_cursor_fini(it->ozi_zc);
+
+out:
+       OBD_SLAB_FREE_PTR(it, osd_zapit_cachep);
+       RETURN(rc);
+}
+
+/**
+ * Drop one surplus directory entry of a multiply linked object.
+ *
+ * The entry \a name is removed from directory \a dir and the link count of
+ * object \a id is decremented in the same transaction.  Nothing is changed
+ * when the object's link count is already <= 1; that case is only logged.
+ *
+ * \param[in] env      execution environment
+ * \param[in] dev      OSD device
+ * \param[in] dir      object id of the ZAP directory holding the entry
+ * \param[in] id       object id (dnode) the entry points to
+ * \param[in] fid      FID of the object, used to find it in the object cache
+ * \param[in] name     name of the entry to remove
+ *
+ * \retval 0           entry removed, or nothing to do (nlink <= 1)
+ * \retval -ve         on error
+ */
+static int osd_remove_ml_file(const struct lu_env *env, struct osd_device *dev,
+                             uint64_t dir, uint64_t id, struct lu_fid *fid,
+                             char *name)
+{
+       struct osd_thread_info *info = osd_oti_get(env);
+       struct dt_object *dt;
+       struct osd_object *obj = NULL;
+       dmu_tx_t *tx;
+       sa_handle_t *hdl;
+       uint64_t nlink;
+       int rc;
+
+       ENTRY;
+
+       rc = -sa_handle_get(dev->od_os, id, NULL, SA_HDL_PRIVATE, &hdl);
+       if (rc)
+               RETURN(rc);
+
+       dt = lu2dt(lu_object_find_slice(env, osd2lu_dev(dev), fid, NULL));
+       if (IS_ERR(dt)) {
+               /* still have to release the SA handle taken above */
+               rc = PTR_ERR(dt);
+               dt = NULL;
+               GOTO(out, rc);
+       }
+
+       if (dt) {
+               obj = osd_dt_obj(dt);
+               down_read(&obj->oo_guard);
+       }
+
+       rc = -sa_lookup(hdl, SA_ZPL_LINKS(dev), &nlink, sizeof(nlink));
+       if (rc)
+               GOTO(out, rc);
+
+       if (nlink <= 1) {
+               CERROR("%s: multi-link file O/%s/%s/%s has nlink %llu\n",
+                      osd_name(dev), info->oti_seq_name, info->oti_dir_name,
+                      name, nlink);
+               GOTO(out, rc = 0);
+       }
+
+       tx = dmu_tx_create(dev->od_os);
+       if (!tx) {
+               CERROR("%s: fail to create tx to remove multi-link file!\n",
+                      osd_name(dev));
+               GOTO(out, rc = -ENOMEM);
+       }
+
+       /* every object modified below must be held before the tx is assigned */
+       dmu_tx_hold_sa(tx, hdl, FALSE);
+       dmu_tx_hold_zap(tx, dir, FALSE, NULL);
+       rc = -dmu_tx_assign(tx, TXG_WAIT);
+       if (rc) {
+               /* a tx may only be aborted while it is still unassigned */
+               dmu_tx_abort(tx);
+               GOTO(out, rc);
+       }
+
+       /*
+        * Remove the name first so that a failed removal leaves the link
+        * count untouched.
+        */
+       rc = -zap_remove(dev->od_os, dir, name, tx);
+       if (rc == 0) {
+               nlink--;
+               rc = -sa_update(hdl, SA_ZPL_LINKS(dev), &nlink,
+                               sizeof(nlink), tx);
+       }
+
+       /* an assigned tx has to be committed, even on error */
+       dmu_tx_commit(tx);
+
+out:
+       if (dt) {
+               up_read(&obj->oo_guard);
+               dt_object_put_nocache(env, dt);
+       }
+
+       sa_handle_destroy(hdl);
+       RETURN(rc);
+}
+
+/**
+ * osd_scan_dir() callback for the object level (O/<seq>/<dN>/<name>).
+ *
+ * The FID stored with the object is used to rebuild the sequence name, the
+ * "dN" directory name and the object name it should be linked under.  If any
+ * of the three differs from where the entry was actually found, the entry is
+ * handed to osd_remove_ml_file().
+ *
+ * \param[in] env      execution environment
+ * \param[in] dev      OSD device
+ * \param[in] dir_oid  object id of the directory being walked
+ * \param[in] ozi      iterator describing the current entry
+ *
+ * \retval 0           entry is where it belongs, or was handled
+ * \retval -ve         on error
+ */
+static int osd_scan_ml_file(const struct lu_env *env, struct osd_device *dev,
+                           uint64_t dir_oid, struct osd_zap_it *ozi)
+{
+       struct osd_thread_info *info = osd_oti_get(env);
+       struct lu_fid *fid = &info->oti_fid;
+       struct ost_id *ostid = &info->oti_ostid;
+       char name[32];
+       u64 seq;
+       int rc = 0;
+
+       ENTRY;
+
+       rc = osd_get_fid_by_oid(env, dev, ozi->ozi_zde.lzd_reg.zde_dnode, fid);
+       if (rc)
+               RETURN(rc);
+
+       seq = fid_seq(fid);
+       fid_to_ostid(fid, ostid);
+
+       snprintf(name, sizeof(name), (fid_seq_is_rsvd(seq) ||
+                                     fid_seq_is_mdt0(seq)) ? "%llu" : "%llx",
+                                     fid_seq_is_idif(seq) ? 0 : seq);
+       if (strcmp(info->oti_seq_name, name) != 0)
+               GOTO(fix, rc);
+
+       /*
+        * Reduce the 64-bit id before narrowing it: casting first turns ids
+        * with bit 31 set into negative numbers and yields a bogus "d-N".
+        */
+       snprintf(name, sizeof(name), "d%d",
+               (int)(ostid_id(ostid) % OSD_OST_MAP_SIZE));
+       if (strcmp(info->oti_dir_name, name) != 0)
+               GOTO(fix, rc);
+
+       snprintf(name, sizeof(name), "%llu", ostid_id(ostid));
+       if (strcmp(ozi->ozi_name, name) == 0)
+               RETURN(0);
+
+fix:
+       CDEBUG(D_LFSCK, "%s: the file O/%s/%s/%s is corrupted\n",
+              osd_name(dev), info->oti_seq_name, info->oti_dir_name,
+              ozi->ozi_name);
+
+       rc = osd_remove_ml_file(env, dev, dir_oid,
+                               ozi->ozi_zde.lzd_reg.zde_dnode, fid,
+                               ozi->ozi_name);
+       RETURN(rc);
+}
+
+/**
+ * osd_scan_dir() callback for the entries of one sequence directory.
+ *
+ * Non-directory entries are ignored.  For a directory, its name is recorded
+ * in the thread info (oti_dir_name) and its entries are checked with
+ * osd_scan_ml_file().  oti_dir_name points into \a ozi and is only valid
+ * while this call is in progress.
+ *
+ * \retval 0           entry skipped or sub-directory scanned
+ * \retval non-zero    failure reported by osd_scan_dir()
+ */
+static int osd_scan_ml_file_dir(const struct lu_env *env,
+                               struct osd_device *dev, uint64_t dir_oid,
+                               struct osd_zap_it *ozi)
+{
+       struct osd_thread_info *info = osd_oti_get(env);
+
+       /*
+        * zde_type is tested in host byte order; byte-swapping the mode
+        * (as cpu_to_le16() does on big-endian) would make S_ISDIR() fail.
+        */
+       if (!S_ISDIR(DTTOIF(ozi->ozi_zde.lzd_reg.zde_type)))
+               return 0;
+
+       info->oti_dir_name = ozi->ozi_name;
+       return osd_scan_dir(env, dev, ozi->ozi_zde.lzd_reg.zde_dnode,
+                           osd_scan_ml_file);
+}
+
+/**
+ * osd_scan_dir() callback for the entries of the top-level directory.
+ *
+ * Non-directory entries are ignored.  For a directory, its name is recorded
+ * in the thread info (oti_seq_name) and its entries are processed with
+ * osd_scan_ml_file_dir().  oti_seq_name points into \a ozi and is only valid
+ * while this call is in progress.
+ *
+ * \retval 0           entry skipped or sub-directory scanned
+ * \retval non-zero    failure reported by osd_scan_dir()
+ */
+static int osd_scan_ml_file_seq(const struct lu_env *env,
+                               struct osd_device *dev, uint64_t dir_oid,
+                               struct osd_zap_it *ozi)
+{
+       struct osd_thread_info *info = osd_oti_get(env);
+
+       /*
+        * zde_type is tested in host byte order; byte-swapping the mode
+        * (as cpu_to_le16() does on big-endian) would make S_ISDIR() fail.
+        */
+       if (!S_ISDIR(DTTOIF(ozi->ozi_zde.lzd_reg.zde_type)))
+               return 0;
+
+       info->oti_seq_name = ozi->ozi_name;
+       return osd_scan_dir(env, dev, ozi->ozi_zde.lzd_reg.zde_dnode,
+                           osd_scan_ml_file_seq == NULL ? NULL :
+                           osd_scan_ml_file_dir);
+}
+
+/**
+ * Start the multi-link file scan at the directory identified by
+ * dev->od_O_id, processing each of its entries with osd_scan_ml_file_seq().
+ *
+ * \retval 0           scan completed
+ * \retval non-zero    failure reported by osd_scan_dir()
+ */
+static int osd_scan_ml_file_main(const struct lu_env *env,
+                                struct osd_device *dev)
+{
+       uint64_t top_oid = dev->od_O_id;
+
+       return osd_scan_dir(env, dev, top_oid, osd_scan_ml_file_seq);
+}