Whamcloud - gitweb
LU-3950 lfsck: control LFSCK on all devices via single command 65/7665/28
authorFan Yong <fan.yong@intel.com>
Fri, 24 Jan 2014 19:45:42 +0000 (03:45 +0800)
committerOleg Drokin <oleg.drokin@intel.com>
Thu, 6 Feb 2014 07:06:32 +0000 (07:06 +0000)
Under DNE mode, it is more convenient for the administrator to control
the LFSCK (start/stop) on all the MDT devices via a single command. Such
functionality is useful not only for DNE consistency verification, but
also for layout consistency (Phase II). It is also required for orphan
OST-object scanning.

Test-Parameters: allwaysuploadlogs
Signed-off-by: Fan Yong <fan.yong@intel.com>
Change-Id: Ie0d4611f969e51b80faf27b52dbdaee41caf5187
Reviewed-on: http://review.whamcloud.com/7665
Tested-by: Jenkins
Reviewed-by: Alex Zhuravlev <alexey.zhuravlev@intel.com>
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lustre/include/lustre/lustre_lfsck_user.h
lustre/lfsck/lfsck_internal.h
lustre/lfsck/lfsck_layout.c
lustre/lfsck/lfsck_lib.c
lustre/mdd/mdd_lproc.c
lustre/mdt/mdt_handler.c
lustre/ofd/ofd_dev.c
lustre/ofd/ofd_obd.c
lustre/tests/sanity-lfsck.sh
lustre/utils/lctl.c
lustre/utils/lustre_lfsck.c

index 4d901dc..2232ea7 100644 (file)
@@ -42,6 +42,12 @@ enum lfsck_param_flags {
 
        /* Dryrun mode, only check without modification */
        LPF_DRYRUN      = 0x0004,
+
+       /* Start/stop LFSCK on all MDT devices. */
+       LPF_ALL_MDT     = 0x0008,
+
+       /* Broadcast the command to other MDTs. */
+       LPF_BROADCAST   = 0x0010,
 };
 
 enum lfsck_type {
index 698fb5f..01d3c60 100644 (file)
@@ -312,6 +312,10 @@ struct lfsck_operations {
                                 struct lfsck_tgt_descs *ltds,
                                 struct lfsck_tgt_desc *ltd,
                                 struct ptlrpc_request_set *set);
+
+       int (*lfsck_join)(const struct lu_env *env,
+                         struct lfsck_component *com,
+                         struct lfsck_start_param *lsp);
 };
 
 #define TGT_PTRS               256     /* number of pointers at 1st level */
@@ -482,6 +486,9 @@ struct lfsck_instance {
        /* The status when the LFSCK stopped or paused. */
        __u32                     li_status;
 
+       /* The flags when the LFSCK stopped or paused. */
+       __u32                     li_flags;
+
        unsigned int              li_oit_over:1, /* oit is finished. */
                                  li_drop_dryrun:1, /* Ever dryrun, not now. */
                                  li_master:1, /* Master instance or not. */
index e7c12d2..bdcd4c4 100644 (file)
@@ -107,6 +107,15 @@ struct lfsck_layout_master_data {
        /* list for the ost targets in phase1 scanning. */
        struct list_head        llmd_ost_phase2_list;
 
+       /* list for the mdt targets involved in layout verification. */
+       struct list_head        llmd_mdt_list;
+
+       /* list for the mdt targets in phase1 scanning. */
+       struct list_head        llmd_mdt_phase1_list;
+
+       /* list for the mdt targets in phase2 scanning. */
+       struct list_head        llmd_mdt_phase2_list;
+
        struct ptlrpc_thread    llmd_thread;
        atomic_t                llmd_rpcs_in_flight;
        __u32                   llmd_touch_gen;
@@ -739,59 +748,112 @@ static int lfsck_layout_master_async_interpret(const struct lu_env *env,
 
        switch (lr->lr_event) {
        case LE_START:
-               if (rc == 0) {
-                       spin_lock(&ltds->ltd_lock);
-                       if (!ltd->ltd_dead && !ltd->ltd_layout_done) {
-                               if (list_empty(&ltd->ltd_layout_list))
-                                       list_add_tail(
-                                               &ltd->ltd_layout_list,
-                                               &llmd->llmd_ost_list);
-                               if (list_empty(&ltd->ltd_layout_phase_list))
-                                       list_add_tail(
-                                               &ltd->ltd_layout_phase_list,
-                                               &llmd->llmd_ost_phase1_list);
-                       }
-                       spin_unlock(&ltds->ltd_lock);
-               } else {
+               if (rc != 0) {
                        struct lfsck_layout *lo = com->lc_file_ram;
 
                        lo->ll_flags |= LF_INCOMPLETE;
+                       lfsck_tgt_put(ltd);
+                       break;
                }
+
+               spin_lock(&ltds->ltd_lock);
+               if (ltd->ltd_dead || ltd->ltd_layout_done) {
+                       spin_unlock(&ltds->ltd_lock);
+                       lfsck_tgt_put(ltd);
+                       break;
+               }
+
+               if (lr->lr_flags & LEF_TO_OST) {
+                       if (list_empty(&ltd->ltd_layout_list))
+                               list_add_tail(&ltd->ltd_layout_list,
+                                             &llmd->llmd_ost_list);
+                       if (list_empty(&ltd->ltd_layout_phase_list))
+                               list_add_tail(&ltd->ltd_layout_phase_list,
+                                             &llmd->llmd_ost_phase1_list);
+               } else {
+                       if (list_empty(&ltd->ltd_layout_list))
+                               list_add_tail(&ltd->ltd_layout_list,
+                                             &llmd->llmd_mdt_list);
+                       if (list_empty(&ltd->ltd_layout_phase_list))
+                               list_add_tail(&ltd->ltd_layout_phase_list,
+                                             &llmd->llmd_mdt_phase1_list);
+               }
+               spin_unlock(&ltds->ltd_lock);
                lfsck_tgt_put(ltd);
                break;
        case LE_STOP:
+       case LE_PHASE1_DONE:
        case LE_PHASE2_DONE:
+               if (rc != 0)
+                       CERROR("%s: fail to notify %s %x for layout: "
+                              "event = %d, rc = %d\n",
+                              lfsck_lfsck2name(com->lc_lfsck),
+                              (lr->lr_flags & LEF_TO_OST) ? "OST" : "MDT",
+                              ltd->ltd_index, lr->lr_event, rc);
                break;
-       case LE_QUERY:
-               spin_lock(&ltds->ltd_lock);
-               if (rc == 0 && !ltd->ltd_dead && !ltd->ltd_layout_done) {
-                       struct lfsck_reply *reply;
+       case LE_QUERY: {
+               struct lfsck_reply *reply;
 
-                       reply = req_capsule_server_get(&req->rq_pill,
-                                                      &RMF_LFSCK_REPLY);
-                       switch (reply->lr_status) {
-                       case LS_SCANNING_PHASE1:
+               if (rc != 0) {
+                       spin_lock(&ltds->ltd_lock);
+                       list_del_init(&ltd->ltd_layout_phase_list);
+                       list_del_init(&ltd->ltd_layout_list);
+                       spin_unlock(&ltds->ltd_lock);
+                       lfsck_tgt_put(ltd);
+                       break;
+               }
+
+               reply = req_capsule_server_get(&req->rq_pill,
+                                              &RMF_LFSCK_REPLY);
+               if (reply == NULL) {
+                       rc = -EPROTO;
+                       CERROR("%s: invalid return value: rc = %d\n",
+                              lfsck_lfsck2name(com->lc_lfsck), rc);
+                       spin_lock(&ltds->ltd_lock);
+                       list_del_init(&ltd->ltd_layout_phase_list);
+                       list_del_init(&ltd->ltd_layout_list);
+                       spin_unlock(&ltds->ltd_lock);
+                       lfsck_tgt_put(ltd);
+                       break;
+               }
+
+               switch (reply->lr_status) {
+               case LS_SCANNING_PHASE1:
+                       break;
+               case LS_SCANNING_PHASE2:
+                       spin_lock(&ltds->ltd_lock);
+                       list_del_init(&ltd->ltd_layout_phase_list);
+                       if (ltd->ltd_dead || ltd->ltd_layout_done) {
+                               spin_unlock(&ltds->ltd_lock);
                                break;
-                       case LS_SCANNING_PHASE2:
-                               list_del(&ltd->ltd_layout_phase_list);
+                       }
+
+                       if (lr->lr_flags & LEF_TO_OST)
                                list_add_tail(&ltd->ltd_layout_phase_list,
                                              &llmd->llmd_ost_phase2_list);
-                               break;
-                       default:
-                               list_del_init(&ltd->ltd_layout_phase_list);
-                               list_del_init(&ltd->ltd_layout_list);
-                               break;
-                       }
+                       else
+                               list_add_tail(&ltd->ltd_layout_phase_list,
+                                             &llmd->llmd_mdt_phase2_list);
+                       spin_unlock(&ltds->ltd_lock);
+                       break;
+               default:
+                       spin_lock(&ltds->ltd_lock);
+                       list_del_init(&ltd->ltd_layout_phase_list);
+                       list_del_init(&ltd->ltd_layout_list);
+                       spin_unlock(&ltds->ltd_lock);
+                       break;
                }
-               spin_unlock(&ltds->ltd_lock);
                lfsck_tgt_put(ltd);
                break;
+       }
        default:
                CERROR("%s: unexpected event: rc = %d\n",
                       lfsck_lfsck2name(com->lc_lfsck), lr->lr_event);
                break;
        }
 
+       lfsck_component_put(env, com);
+
        return 0;
 }
 
@@ -806,6 +868,7 @@ static int lfsck_layout_master_query_others(const struct lu_env *env,
        struct ptlrpc_request_set         *set;
        struct lfsck_tgt_descs            *ltds;
        struct lfsck_tgt_desc             *ltd;
+       struct list_head                  *head;
        __u32                              cnt   = 0;
        int                                rc    = 0;
        int                                rc1   = 0;
@@ -816,18 +879,29 @@ static int lfsck_layout_master_query_others(const struct lu_env *env,
                RETURN(-ENOMEM);
 
        llmd->llmd_touch_gen++;
-       ltds = &lfsck->li_ost_descs;
        memset(lr, 0, sizeof(*lr));
        lr->lr_index = lfsck_dev_idx(lfsck->li_bottom);
        lr->lr_event = LE_QUERY;
        lr->lr_active = LT_LAYOUT;
-
        laia->laia_com = com;
-       laia->laia_ltds = ltds;
        laia->laia_lr = lr;
+
+       if (!list_empty(&llmd->llmd_mdt_phase1_list)) {
+               ltds = &lfsck->li_mdt_descs;
+               lr->lr_flags = 0;
+               head = &llmd->llmd_mdt_phase1_list;
+       } else {
+
+again:
+               ltds = &lfsck->li_ost_descs;
+               lr->lr_flags = LEF_TO_OST;
+               head = &llmd->llmd_ost_phase1_list;
+       }
+
+       laia->laia_ltds = ltds;
        spin_lock(&ltds->ltd_lock);
-       while (!list_empty(&llmd->llmd_ost_phase1_list)) {
-               ltd = list_entry(llmd->llmd_ost_phase1_list.next,
+       while (!list_empty(head)) {
+               ltd = list_entry(head->next,
                                 struct lfsck_tgt_desc,
                                 ltd_layout_phase_list);
                if (ltd->ltd_layout_gen == llmd->llmd_touch_gen)
@@ -835,8 +909,7 @@ static int lfsck_layout_master_query_others(const struct lu_env *env,
 
                ltd->ltd_layout_gen = llmd->llmd_touch_gen;
                list_del(&ltd->ltd_layout_phase_list);
-               list_add_tail(&ltd->ltd_layout_phase_list,
-                             &llmd->llmd_ost_phase1_list);
+               list_add_tail(&ltd->ltd_layout_phase_list, head);
                atomic_inc(&ltd->ltd_ref);
                laia->laia_ltd = ltd;
                spin_unlock(&ltds->ltd_lock);
@@ -844,8 +917,10 @@ static int lfsck_layout_master_query_others(const struct lu_env *env,
                                         lfsck_layout_master_async_interpret,
                                         laia, LFSCK_QUERY);
                if (rc != 0) {
-                       CERROR("%s: fail to query OST %x for layout: rc = %d\n",
-                              lfsck_lfsck2name(lfsck), ltd->ltd_index, rc);
+                       CERROR("%s: fail to query %s %x for layout: rc = %d\n",
+                              lfsck_lfsck2name(lfsck),
+                              (lr->lr_flags & LEF_TO_OST) ? "OST" : "MDT",
+                              ltd->ltd_index, rc);
                        lfsck_tgt_put(ltd);
                        rc1 = rc;
                } else {
@@ -855,8 +930,19 @@ static int lfsck_layout_master_query_others(const struct lu_env *env,
        }
        spin_unlock(&ltds->ltd_lock);
 
-       if (cnt > 0)
+       if (cnt > 0) {
                rc = ptlrpc_set_wait(set);
+               if (rc < 0) {
+                       ptlrpc_set_destroy(set);
+                       RETURN(rc);
+               }
+               cnt = 0;
+       }
+
+       if (!(lr->lr_flags & LEF_TO_OST) &&
+           list_empty(&llmd->llmd_mdt_phase1_list))
+               goto again;
+
        ptlrpc_set_destroy(set);
 
        RETURN(rc1 != 0 ? rc1 : rc);
@@ -865,13 +951,15 @@ static int lfsck_layout_master_query_others(const struct lu_env *env,
 static inline bool
 lfsck_layout_master_to_orphan(struct lfsck_layout_master_data *llmd)
 {
-       return !list_empty(&llmd->llmd_ost_phase2_list) ||
-              list_empty(&llmd->llmd_ost_phase1_list);
+       return list_empty(&llmd->llmd_mdt_phase1_list) &&
+              (!list_empty(&llmd->llmd_ost_phase2_list) ||
+               list_empty(&llmd->llmd_ost_phase1_list));
 }
 
 static int lfsck_layout_master_notify_others(const struct lu_env *env,
                                             struct lfsck_component *com,
-                                            struct lfsck_request *lr)
+                                            struct lfsck_request *lr,
+                                            __u32 flags)
 {
        struct lfsck_thread_info          *info  = lfsck_env_info(env);
        struct lfsck_async_interpret_args *laia  = &info->lti_laia;
@@ -881,6 +969,8 @@ static int lfsck_layout_master_notify_others(const struct lu_env *env,
        struct ptlrpc_request_set         *set;
        struct lfsck_tgt_descs            *ltds;
        struct lfsck_tgt_desc             *ltd;
+       struct lfsck_tgt_desc             *next;
+       struct list_head                  *head;
        __u32                              idx;
        __u32                              cnt   = 0;
        int                                rc    = 0;
@@ -893,9 +983,14 @@ static int lfsck_layout_master_notify_others(const struct lu_env *env,
        lr->lr_active = LT_LAYOUT;
        laia->laia_com = com;
        laia->laia_lr = lr;
+       lr->lr_flags = 0;
        switch (lr->lr_event) {
        case LE_START:
+               /* Notify OSTs first, then other MDTs if needed. */
+               lr->lr_flags |= LEF_TO_OST;
                ltds = &lfsck->li_ost_descs;
+
+lable1:
                laia->laia_ltds = ltds;
                down_read(&ltds->ltd_rw_sem);
                cfs_foreach_bit(ltds->ltd_tgts_bitmap, idx) {
@@ -908,9 +1003,11 @@ static int lfsck_layout_master_notify_others(const struct lu_env *env,
                                        lfsck_layout_master_async_interpret,
                                        laia, LFSCK_NOTIFY);
                        if (rc != 0) {
-                               CERROR("%s: fail to notify OST %x for layout "
+                               CERROR("%s: fail to notify %s %x for layout "
                                       "start: rc = %d\n",
-                                      lfsck_lfsck2name(lfsck), idx, rc);
+                                      lfsck_lfsck2name(lfsck),
+                                      (lr->lr_flags & LEF_TO_OST) ? "OST" :
+                                      "MDT", idx, rc);
                                lfsck_tgt_put(ltd);
                                lo->ll_flags |= LF_INCOMPLETE;
                        } else {
@@ -918,17 +1015,84 @@ static int lfsck_layout_master_notify_others(const struct lu_env *env,
                        }
                }
                up_read(&ltds->ltd_rw_sem);
+
+               /* Sync up */
+               if (cnt > 0) {
+                       rc = ptlrpc_set_wait(set);
+                       if (rc < 0) {
+                               ptlrpc_set_destroy(set);
+                               RETURN(rc);
+                       }
+                       cnt = 0;
+               }
+
+               if (!(flags & LPF_ALL_MDT))
+                       break;
+
+               ltds = &lfsck->li_mdt_descs;
+               /* The sponsor broadcasts the request to other MDTs. */
+               if (flags & LPF_BROADCAST) {
+                       flags &= ~LPF_ALL_MDT;
+                       lr->lr_flags &= ~LEF_TO_OST;
+                       goto lable1;
+               }
+
+               /* Non-sponsors link other MDT targets locally. */
+               spin_lock(&ltds->ltd_lock);
+               cfs_foreach_bit(ltds->ltd_tgts_bitmap, idx) {
+                       ltd = LTD_TGT(ltds, idx);
+                       LASSERT(ltd != NULL);
+
+                       if (!list_empty(&ltd->ltd_layout_list))
+                               continue;
+
+                       list_add_tail(&ltd->ltd_layout_list,
+                                     &llmd->llmd_mdt_list);
+                       list_add_tail(&ltd->ltd_layout_phase_list,
+                                     &llmd->llmd_mdt_phase1_list);
+               }
+               spin_unlock(&ltds->ltd_lock);
+
                break;
        case LE_STOP:
+               if (flags & LPF_BROADCAST)
+                       lr->lr_flags |= LEF_FORCE_STOP;
        case LE_PHASE2_DONE:
+               /* Notify other MDTs if needed, then the OSTs. */
+               if (flags & LPF_ALL_MDT) {
+                       /* The sponsor broadcasts the request to other MDTs. */
+                       if (flags & LPF_BROADCAST) {
+                               lr->lr_flags &= ~LEF_TO_OST;
+                               head = &llmd->llmd_mdt_list;
+                               ltds = &lfsck->li_mdt_descs;
+                               goto lable3;
+                       }
+
+                       /* Non-sponsors unlink other MDT targets locally. */
+                       ltds = &lfsck->li_mdt_descs;
+                       spin_lock(&ltds->ltd_lock);
+                       list_for_each_entry_safe(ltd, next,
+                                                &llmd->llmd_mdt_list,
+                                                ltd_layout_list) {
+                               list_del_init(&ltd->ltd_layout_phase_list);
+                               list_del_init(&ltd->ltd_layout_list);
+                       }
+                       spin_unlock(&ltds->ltd_lock);
+               }
+
+lable2:
+               lr->lr_flags |= LEF_TO_OST;
+               head = &llmd->llmd_ost_list;
                ltds = &lfsck->li_ost_descs;
+
+lable3:
                laia->laia_ltds = ltds;
                spin_lock(&ltds->ltd_lock);
-               while (!list_empty(&llmd->llmd_ost_list)) {
-                       ltd = list_entry(llmd->llmd_ost_list.next,
-                                        struct lfsck_tgt_desc,
+               while (!list_empty(head)) {
+                       ltd = list_entry(head->next, struct lfsck_tgt_desc,
                                         ltd_layout_list);
-                       list_del_init(&ltd->ltd_layout_phase_list);
+                       if (!list_empty(&ltd->ltd_layout_phase_list))
+                               list_del_init(&ltd->ltd_layout_phase_list);
                        list_del_init(&ltd->ltd_layout_list);
                        laia->laia_ltd = ltd;
                        spin_unlock(&ltds->ltd_lock);
@@ -936,17 +1100,64 @@ static int lfsck_layout_master_notify_others(const struct lu_env *env,
                                        lfsck_layout_master_async_interpret,
                                        laia, LFSCK_NOTIFY);
                        if (rc != 0)
-                               CERROR("%s: fail to notify OST %x for layout "
-                                      "stop/done: rc = %d\n",
+                               CERROR("%s: fail to notify %s %x for layout "
+                                      "stop/phase2: rc = %d\n",
                                       lfsck_lfsck2name(lfsck),
-                                      ltd->ltd_index, rc);
+                                      (lr->lr_flags & LEF_TO_OST) ? "OST" :
+                                      "MDT", ltd->ltd_index, rc);
                        else
                                cnt++;
                        spin_lock(&ltds->ltd_lock);
                }
                spin_unlock(&ltds->ltd_lock);
-               break;
+
+               if (!(flags & LPF_BROADCAST))
+                       break;
+
+               /* Sync up */
+               if (cnt > 0) {
+                       rc = ptlrpc_set_wait(set);
+                       if (rc < 0) {
+                               ptlrpc_set_destroy(set);
+                               RETURN(rc);
+                       }
+                       cnt = 0;
+               }
+
+               flags &= ~LPF_BROADCAST;
+               goto lable2;
        case LE_PHASE1_DONE:
+               llmd->llmd_touch_gen++;
+               lr->lr_flags &= ~LEF_TO_OST;
+               ltds = &lfsck->li_mdt_descs;
+               laia->laia_ltds = ltds;
+               spin_lock(&ltds->ltd_lock);
+               while (!list_empty(&llmd->llmd_mdt_phase1_list)) {
+                       ltd = list_entry(llmd->llmd_mdt_phase1_list.next,
+                                        struct lfsck_tgt_desc,
+                                        ltd_layout_phase_list);
+                       if (ltd->ltd_layout_gen == llmd->llmd_touch_gen)
+                               break;
+
+                       ltd->ltd_layout_gen = llmd->llmd_touch_gen;
+                       list_del_init(&ltd->ltd_layout_phase_list);
+                       list_add_tail(&ltd->ltd_layout_phase_list,
+                                     &llmd->llmd_mdt_phase1_list);
+                       laia->laia_ltd = ltd;
+                       spin_unlock(&ltds->ltd_lock);
+                       rc = lfsck_async_request(env, ltd->ltd_exp, lr, set,
+                                       lfsck_layout_master_async_interpret,
+                                       laia, LFSCK_NOTIFY);
+                       if (rc != 0)
+                               CERROR("%s: fail to notify MDT %x for layout "
+                                      "phase1 done: rc = %d\n",
+                                      lfsck_lfsck2name(lfsck),
+                                      ltd->ltd_index, rc);
+                       else
+                               cnt++;
+                       spin_lock(&ltds->ltd_lock);
+               }
+               spin_unlock(&ltds->ltd_lock);
                break;
        default:
                CERROR("%s: unexpected LFSCK event: rc = %d\n",
@@ -1039,8 +1250,13 @@ static int lfsck_layout_assistant(void *args)
        struct l_wait_info               lwi     = { 0 };
        int                              rc      = 0;
        int                              rc1     = 0;
+       __u32                            flags;
        ENTRY;
 
+       if (lta->lta_lsp->lsp_start != NULL)
+               flags  = lta->lta_lsp->lsp_start->ls_flags;
+       else
+               flags = bk->lb_param;
        memset(lr, 0, sizeof(*lr));
        lr->lr_event = LE_START;
        lr->lr_index = lfsck_dev_idx(lfsck->li_bottom);
@@ -1053,7 +1269,7 @@ static int lfsck_layout_assistant(void *args)
        if (pos->lp_oit_cookie <= 1)
                lr->lr_param |= LPF_RESET;
 
-       rc = lfsck_layout_master_notify_others(env, com, lr);
+       rc = lfsck_layout_master_notify_others(env, com, lr, flags);
        if (rc != 0) {
                CERROR("%s: fail to notify others for layout start: rc = %d\n",
                       lfsck_lfsck2name(lfsck), rc);
@@ -1126,7 +1342,7 @@ static int lfsck_layout_assistant(void *args)
                        lr->lr_index = lfsck_dev_idx(lfsck->li_bottom);
                        lr->lr_event = LE_PHASE1_DONE;
                        lr->lr_status = llmd->llmd_post_result;
-                       rc = lfsck_layout_master_notify_others(env, com, lr);
+                       rc = lfsck_layout_master_notify_others(env, com, lr, 0);
                        if (rc != 0)
                                CERROR("%s: failed to notify others "
                                       "for layout post: rc = %d\n",
@@ -1236,23 +1452,31 @@ cleanup2:
        lr->lr_index = lfsck_dev_idx(lfsck->li_bottom);
        if (rc > 0) {
                lr->lr_event = LE_PHASE2_DONE;
+               flags = 0;
                lr->lr_status = rc;
        } else if (rc == 0) {
                lr->lr_event = LE_STOP;
                if (lfsck->li_status == LS_PAUSED ||
-                   lfsck->li_status == LS_CO_PAUSED)
+                   lfsck->li_status == LS_CO_PAUSED) {
+                       flags = 0;
                        lr->lr_status = LS_CO_PAUSED;
-               else if (lfsck->li_status == LS_STOPPED ||
-                        lfsck->li_status == LS_CO_STOPPED)
-                       lr->lr_status = LS_CO_STOPPED;
-               else
+               } else if (lfsck->li_status == LS_STOPPED ||
+                        lfsck->li_status == LS_CO_STOPPED) {
+                       flags = lfsck->li_flags;
+                       if (flags & LPF_BROADCAST)
+                               lr->lr_status = LS_STOPPED;
+                       else
+                               lr->lr_status = LS_CO_STOPPED;
+               } else {
                        LBUG();
+               }
        } else {
                lr->lr_event = LE_STOP;
+               flags = 0;
                lr->lr_status = LS_CO_FAILED;
        }
 
-       rc1 = lfsck_layout_master_notify_others(env, com, lr);
+       rc1 = lfsck_layout_master_notify_others(env, com, lr, flags);
        if (rc1 != 0) {
                CERROR("%s: failed to notify others for layout quit: rc = %d\n",
                       lfsck_lfsck2name(lfsck), rc1);
@@ -1470,6 +1694,7 @@ lfsck_layout_slave_notify_master(const struct lu_env *env,
 
        memset(lr, 0, sizeof(*lr));
        lr->lr_event = event;
+       lr->lr_flags = LEF_FROM_OST;
        lr->lr_status = result;
        lr->lr_index = lfsck_dev_idx(lfsck->li_bottom);
        lr->lr_active = LT_LAYOUT;
@@ -2319,6 +2544,18 @@ static void lfsck_layout_master_data_release(const struct lu_env *env,
                                 ltd_layout_list) {
                list_del_init(&ltd->ltd_layout_list);
        }
+       list_for_each_entry_safe(ltd, next, &llmd->llmd_mdt_phase1_list,
+                                ltd_layout_phase_list) {
+               list_del_init(&ltd->ltd_layout_phase_list);
+       }
+       list_for_each_entry_safe(ltd, next, &llmd->llmd_mdt_phase2_list,
+                                ltd_layout_phase_list) {
+               list_del_init(&ltd->ltd_layout_phase_list);
+       }
+       list_for_each_entry_safe(ltd, next, &llmd->llmd_mdt_list,
+                                ltd_layout_list) {
+               list_del_init(&ltd->ltd_layout_list);
+       }
        spin_unlock(&ltds->ltd_lock);
 
        OBD_FREE_PTR(llmd);
@@ -2380,10 +2617,15 @@ static int lfsck_layout_master_in_notify(const struct lu_env *env,
        struct lfsck_tgt_desc           *ltd;
        ENTRY;
 
-       if (lr->lr_event != LE_PHASE1_DONE)
+       if (lr->lr_event != LE_PHASE1_DONE &&
+           lr->lr_event != LE_PHASE2_DONE &&
+           lr->lr_event != LE_STOP)
                RETURN(-EINVAL);
 
-       ltds = &lfsck->li_ost_descs;
+       if (lr->lr_flags & LEF_FROM_OST)
+               ltds = &lfsck->li_ost_descs;
+       else
+               ltds = &lfsck->li_mdt_descs;
        spin_lock(&ltds->ltd_lock);
        ltd = LTD_TGT(ltds, lr->lr_index);
        if (ltd == NULL) {
@@ -2393,21 +2635,54 @@ static int lfsck_layout_master_in_notify(const struct lu_env *env,
        }
 
        list_del_init(&ltd->ltd_layout_phase_list);
-       if (lr->lr_status > 0) {
-               if (list_empty(&ltd->ltd_layout_list))
-                       list_add_tail(&ltd->ltd_layout_list,
-                                     &llmd->llmd_ost_list);
-               list_add_tail(&ltd->ltd_layout_phase_list,
-                             &llmd->llmd_ost_phase2_list);
-       } else {
+       switch (lr->lr_event) {
+       case LE_PHASE1_DONE:
+               if (lr->lr_status <= 0) {
+                       ltd->ltd_layout_done = 1;
+                       list_del_init(&ltd->ltd_layout_list);
+                       lo->ll_flags |= LF_INCOMPLETE;
+                       break;
+               }
+
+               if (lr->lr_flags & LEF_FROM_OST) {
+                       if (list_empty(&ltd->ltd_layout_list))
+                               list_add_tail(&ltd->ltd_layout_list,
+                                             &llmd->llmd_ost_list);
+                       list_add_tail(&ltd->ltd_layout_phase_list,
+                                     &llmd->llmd_ost_phase2_list);
+               } else {
+                       if (list_empty(&ltd->ltd_layout_list))
+                               list_add_tail(&ltd->ltd_layout_list,
+                                             &llmd->llmd_mdt_list);
+                       list_add_tail(&ltd->ltd_layout_phase_list,
+                                     &llmd->llmd_mdt_phase2_list);
+               }
+               break;
+       case LE_PHASE2_DONE:
                ltd->ltd_layout_done = 1;
                list_del_init(&ltd->ltd_layout_list);
-               lo->ll_flags |= LF_INCOMPLETE;
+               break;
+       case LE_STOP:
+               ltd->ltd_layout_done = 1;
+               list_del_init(&ltd->ltd_layout_list);
+               if (!(lr->lr_flags & LEF_FORCE_STOP))
+                       lo->ll_flags |= LF_INCOMPLETE;
+               break;
+       default:
+               break;
        }
        spin_unlock(&ltds->ltd_lock);
 
-       if (lfsck_layout_master_to_orphan(llmd))
+       if (lr->lr_flags & LEF_FORCE_STOP) {
+               struct lfsck_stop *stop = &lfsck_env_info(env)->lti_stop;
+
+               memset(stop, 0, sizeof(*stop));
+               stop->ls_status = lr->lr_status;
+               stop->ls_flags = lr->lr_param;
+               lfsck_stop(env, lfsck->li_bottom, stop);
+       } else if (lfsck_layout_master_to_orphan(llmd)) {
                wake_up_all(&llmd->llmd_thread.t_ctl_waitq);
+       }
 
        RETURN(0);
 }
@@ -2479,6 +2754,14 @@ static int lfsck_layout_master_stop_notify(const struct lu_env *env,
        lr->lr_index = lfsck_dev_idx(lfsck->li_bottom);
        lr->lr_event = LE_STOP;
        lr->lr_active = LT_LAYOUT;
+       if (ltds == &lfsck->li_ost_descs) {
+               lr->lr_flags = LEF_TO_OST;
+       } else {
+               if (ltd->ltd_index == lfsck_dev_idx(lfsck->li_bottom))
+                       return 0;
+
+               lr->lr_flags = 0;
+       }
        lr->lr_status = LS_CO_STOPPED;
 
        laia->laia_com = com;
@@ -2490,12 +2773,45 @@ static int lfsck_layout_master_stop_notify(const struct lu_env *env,
                                 lfsck_layout_master_async_interpret,
                                 laia, LFSCK_NOTIFY);
        if (rc != 0)
-               CERROR("%s: Fail to notify OST %x for stop: rc = %d\n",
-                      lfsck_lfsck2name(lfsck), ltd->ltd_index, rc);
+               CERROR("%s: Fail to notify %s %x for co-stop: rc = %d\n",
+                      lfsck_lfsck2name(lfsck),
+                      (lr->lr_flags & LEF_TO_OST) ? "OST" : "MDT",
+                      ltd->ltd_index, rc);
 
        return rc;
 }
 
+/*
+ * lfsck_join method of the layout LFSCK slave (see lfsck_layout_slave_ops).
+ * lfsck_start() calls it when a start request arrives while the LFSCK
+ * thread is neither in "init" nor in "stopped" state.
+ *
+ * Called with lfsck::li_lock held. The lock is dropped around the calls
+ * into the slave target list helpers and re-taken afterwards, so it is
+ * held again on every return path.
+ *
+ * \param[in] env      execution environment (not referenced here)
+ * \param[in] com      the layout LFSCK component
+ * \param[in] lsp      start parameters; lsp->lsp_index is the index that
+ *                     gets passed to lfsck_layout_llst_add()
+ *
+ * \retval 0           lfsck_layout_llst_add() succeeded and the LFSCK
+ *                     thread was still running afterwards
+ * \retval -EALREADY   nothing to join: lsp carries no valid index, no
+ *                     lfsck_start, or LPF_ALL_MDT was not requested
+ * \retval -EAGAIN     the index was added, but the LFSCK thread was found
+ *                     not running once li_lock was re-taken; the entry
+ *                     has been looked up, removed and released again
+ * \retval other       whatever lfsck_layout_llst_add() returned
+ */
+static int lfsck_layout_slave_join(const struct lu_env *env,
+                                  struct lfsck_component *com,
+                                  struct lfsck_start_param *lsp)
+{
+       struct lfsck_instance            *lfsck = com->lc_lfsck;
+       struct lfsck_layout_slave_data   *llsd  = com->lc_data;
+       struct lfsck_layout_slave_target *llst;
+       struct lfsck_start               *start = lsp->lsp_start;
+       int                               rc    = 0;
+       ENTRY;
+
+       if (!lsp->lsp_index_valid || start == NULL ||
+           !(start->ls_flags & LPF_ALL_MDT))
+               RETURN(-EALREADY);
+
+       /* NOTE(review): li_lock is released around the add, presumably
+        * because lfsck_layout_llst_add() must not run under a spinlock;
+        * confirm against its implementation. */
+       spin_unlock(&lfsck->li_lock);
+       rc = lfsck_layout_llst_add(llsd, lsp->lsp_index);
+       spin_lock(&lfsck->li_lock);
+       /* The thread state is re-checked because li_lock was dropped above;
+        * if the thread is no longer running, undo the add and fail with
+        * -EAGAIN. */
+       if (rc == 0 && !thread_is_running(&lfsck->li_thread)) {
+               spin_unlock(&lfsck->li_lock);
+               llst = lfsck_layout_llst_find_and_del(llsd, lsp->lsp_index);
+               if (llst != NULL)
+                       lfsck_layout_llst_put(llst);
+               spin_lock(&lfsck->li_lock);
+               rc = -EAGAIN;
+       }
+
+       RETURN(rc);
+}
+
 static struct lfsck_operations lfsck_layout_master_ops = {
        .lfsck_reset            = lfsck_layout_reset,
        .lfsck_fail             = lfsck_layout_fail,
@@ -2526,6 +2842,7 @@ static struct lfsck_operations lfsck_layout_slave_ops = {
        .lfsck_data_release     = lfsck_layout_slave_data_release,
        .lfsck_in_notify        = lfsck_layout_slave_in_notify,
        .lfsck_query            = lfsck_layout_query,
+       .lfsck_join             = lfsck_layout_slave_join,
 };
 
 int lfsck_layout_setup(const struct lu_env *env, struct lfsck_instance *lfsck)
@@ -2556,10 +2873,13 @@ int lfsck_layout_setup(const struct lu_env *env, struct lfsck_instance *lfsck)
                        GOTO(out, rc = -ENOMEM);
 
                INIT_LIST_HEAD(&llmd->llmd_req_list);
+               spin_lock_init(&llmd->llmd_lock);
                INIT_LIST_HEAD(&llmd->llmd_ost_list);
                INIT_LIST_HEAD(&llmd->llmd_ost_phase1_list);
                INIT_LIST_HEAD(&llmd->llmd_ost_phase2_list);
-               spin_lock_init(&llmd->llmd_lock);
+               INIT_LIST_HEAD(&llmd->llmd_mdt_list);
+               INIT_LIST_HEAD(&llmd->llmd_mdt_phase1_list);
+               INIT_LIST_HEAD(&llmd->llmd_mdt_phase2_list);
                init_waitqueue_head(&llmd->llmd_thread.t_ctl_waitq);
                atomic_set(&llmd->llmd_rpcs_in_flight, 0);
                com->lc_data = llmd;
index ad92e32..3eb3a02 100644 (file)
@@ -90,6 +90,7 @@ const char *lfsck_param_names[] = {
        NULL,
        "failout",
        "dryrun",
+       "all_targets",
        NULL
 };
 
@@ -330,9 +331,6 @@ void lfsck_instance_cleanup(const struct lu_env *env,
        LASSERT(list_empty(&lfsck->li_link));
        LASSERT(thread_is_init(thread) || thread_is_stopped(thread));
 
-       lfsck_tgt_descs_fini(&lfsck->li_ost_descs);
-       lfsck_tgt_descs_fini(&lfsck->li_mdt_descs);
-
        if (lfsck->li_obj_oit != NULL) {
                lu_object_put_nocache(env, &lfsck->li_obj_oit->do_lu);
                lfsck->li_obj_oit = NULL;
@@ -363,6 +361,9 @@ void lfsck_instance_cleanup(const struct lu_env *env,
                lfsck_component_cleanup(env, com);
        }
 
+       lfsck_tgt_descs_fini(&lfsck->li_ost_descs);
+       lfsck_tgt_descs_fini(&lfsck->li_mdt_descs);
+
        if (lfsck->li_bookmark_obj != NULL) {
                lu_object_put_nocache(env, &lfsck->li_bookmark_obj->do_lu);
                lfsck->li_bookmark_obj = NULL;
@@ -1094,6 +1095,7 @@ int lfsck_async_request(const struct lu_env *env, struct obd_export *exp,
 
        laia = ptlrpc_req_async_args(req);
        *laia = *(struct lfsck_async_interpret_args *)args;
+       lfsck_component_get(laia->laia_com);
        req->rq_interpret_reply = interpreter;
        ptlrpc_set_add_req(set, req);
 
@@ -1287,20 +1289,27 @@ int lfsck_start(const struct lu_env *env, struct dt_device *key,
        if (!thread_is_init(thread) && !thread_is_stopped(thread)) {
                rc = -EALREADY;
                while (start->ls_active != 0) {
-                       if (type & start->ls_active) {
+                       if (!(type & start->ls_active)) {
+                               type <<= 1;
+                               continue;
+                       }
+
+                       com = __lfsck_component_find(lfsck, type,
+                                                    &lfsck->li_list_scan);
+                       if (com == NULL)
                                com = __lfsck_component_find(lfsck, type,
-                                                       &lfsck->li_list_scan);
-                               if (com == NULL)
-                                       com = __lfsck_component_find(lfsck,
-                                               type,
                                                &lfsck->li_list_double_scan);
-                               if (com == NULL) {
-                                       rc = -EBUSY;
+                       if (com == NULL) {
+                               rc = -EOPNOTSUPP;
+                               break;
+                       }
+
+                       if (com->lc_ops->lfsck_join != NULL) {
+                               rc = com->lc_ops->lfsck_join( env, com, lsp);
+                               if (rc != 0 && rc != -EALREADY)
                                        break;
-                               } else {
-                                       start->ls_active &= ~type;
-                               }
                        }
+                       start->ls_active &= ~type;
                        type <<= 1;
                }
                spin_unlock(&lfsck->li_lock);
@@ -1363,6 +1372,16 @@ int lfsck_start(const struct lu_env *env, struct dt_device *key,
                }
        }
 
+       if (bk->lb_param & LPF_ALL_MDT &&
+           !(start->ls_flags & LPF_ALL_MDT)) {
+               bk->lb_param &= ~LPF_ALL_MDT;
+               dirty = true;
+       } else if (!(bk->lb_param & LPF_ALL_MDT) &&
+                  start->ls_flags & LPF_ALL_MDT) {
+               bk->lb_param |= LPF_ALL_MDT;
+               dirty = true;
+       }
+
        if (dirty) {
                rc = lfsck_bookmark_store(env, lfsck);
                if (rc != 0)
@@ -1486,10 +1505,13 @@ int lfsck_stop(const struct lu_env *env, struct dt_device *key,
                GOTO(out, rc = -EALREADY);
        }
 
-       if (stop != NULL)
+       if (stop != NULL) {
                lfsck->li_status = stop->ls_status;
-       else
+               lfsck->li_flags = stop->ls_flags;
+       } else {
                lfsck->li_status = LS_STOPPED;
+               lfsck->li_flags = 0;
+       }
 
        thread_set_flags(thread, SVC_STOPPING);
        spin_unlock(&lfsck->li_lock);
@@ -1796,6 +1818,7 @@ void lfsck_del_target(const struct lu_env *env, struct dt_device *key,
        spin_lock(&ltds->ltd_lock);
        ltd->ltd_dead = 1;
        if (!list_empty(&ltd->ltd_layout_list)) {
+               list_del_init(&ltd->ltd_layout_phase_list);
                list_del_init(&ltd->ltd_layout_list);
                stop = true;
        } else {
index 2573e33..de379b3 100644 (file)
@@ -269,13 +269,11 @@ static int lprocfs_rd_lfsck_speed_limit(char *page, char **start, off_t off,
                                        int count, int *eof, void *data)
 {
        struct mdd_device *mdd = data;
-       int rc;
 
        LASSERT(mdd != NULL);
        *eof = 1;
 
-       rc = lfsck_get_speed(mdd->mdd_bottom, page, count);
-       return rc != 0 ? rc : count;
+       return lfsck_get_speed(mdd->mdd_bottom, page, count);
 }
 
 static int lprocfs_wr_lfsck_speed_limit(struct file *file, const char *buffer,
@@ -336,6 +334,17 @@ static int lprocfs_rd_lfsck_namespace(char *page, char **start, off_t off,
        return rc;
 }
 
+/*
+ * procfs read handler behind the "lfsck_layout" entry of
+ * lprocfs_mdd_obd_vars: passes \a page and \a count to lfsck_dump() for
+ * the LT_LAYOUT component on mdd->mdd_bottom and returns lfsck_dump()'s
+ * result unchanged.
+ *
+ * \a data is the mdd_device (asserted non-NULL); \a start and \a off are
+ * not used. \a eof is always set to 1.
+ */
+static int lprocfs_rd_lfsck_layout(char *page, char **start, off_t off,
+                                  int count, int *eof, void *data)
+{
+       struct mdd_device *mdd = data;
+
+       LASSERT(mdd != NULL);
+       *eof = 1;
+
+       return lfsck_dump(mdd->mdd_bottom, page, count, LT_LAYOUT);
+}
+
 static struct lprocfs_vars lprocfs_mdd_obd_vars[] = {
         { "atime_diff",      lprocfs_rd_atime_diff, lprocfs_wr_atime_diff, 0 },
         { "changelog_mask",  lprocfs_rd_changelog_mask,
@@ -347,6 +356,7 @@ static struct lprocfs_vars lprocfs_mdd_obd_vars[] = {
        { "lfsck_async_windows", lprocfs_rd_lfsck_async_windows,
                                 lprocfs_wr_lfsck_async_windows, 0 },
        { "lfsck_namespace", lprocfs_rd_lfsck_namespace, 0, 0 },
+       { "lfsck_layout", lprocfs_rd_lfsck_layout, 0, 0 },
        { 0 }
 };
 
index 4c386b1..064fef1 100644 (file)
@@ -4238,6 +4238,7 @@ static void mdt_fini(const struct lu_env *env, struct mdt_device *m)
         }
 
        stop.ls_status = LS_PAUSED;
+       stop.ls_flags = 0;
        next->md_ops->mdo_iocontrol(env, next, OBD_IOC_STOP_LFSCK, 0, &stop);
 
         mdt_seq_fini(env, m);
@@ -5569,10 +5570,12 @@ static int mdt_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
        }
        case OBD_IOC_STOP_LFSCK: {
                struct md_device        *next = mdt->mdt_child;
-               struct lfsck_stop        stop;
+               struct obd_ioctl_data   *data = karg;
+               struct lfsck_stop       *stop =
+                               (struct lfsck_stop *)(data->ioc_inlbuf1);
 
-               stop.ls_status = LS_STOPPED;
-               rc = next->md_ops->mdo_iocontrol(&env, next, cmd, 0, &stop);
+               stop->ls_status = LS_STOPPED;
+               rc = next->md_ops->mdo_iocontrol(&env, next, cmd, 0, stop);
                break;
        }
         case OBD_IOC_GET_OBJ_VERSION: {
index 3559991..6782e5d 100644 (file)
@@ -2248,6 +2248,7 @@ static void ofd_fini(const struct lu_env *env, struct ofd_device *m)
        struct lfsck_stop        stop;
 
        stop.ls_status = LS_PAUSED;
+       stop.ls_flags = 0;
        lfsck_stop(env, m->ofd_osd, &stop);
        lfsck_degister(env, m->ofd_osd);
        target_recovery_fini(obd);
index 61b1610..a5e7a2d 100644 (file)
@@ -1030,10 +1030,10 @@ int ofd_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
                break;
        }
        case OBD_IOC_STOP_LFSCK: {
-               struct lfsck_stop stop;
+               struct obd_ioctl_data *data = karg;
 
-               stop.ls_status = LS_STOPPED;
-               rc = lfsck_stop(&env, ofd->ofd_osd, &stop);
+               rc = lfsck_stop(&env, ofd->ofd_osd,
+                               (struct lfsck_stop *)(data->ioc_inlbuf1));
                break;
        }
        case OBD_IOC_GET_OBJ_VERSION:
index 36fd071..898b373 100644 (file)
@@ -43,7 +43,7 @@ check_and_setup_lustre
        ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2c"
 
 [[ $(lustre_version_code ost1) -lt $(version_code 2.5.50) ]] &&
-       ALWAYS_EXCEPT="$ALWAYS_EXCEPT 11"
+       ALWAYS_EXCEPT="$ALWAYS_EXCEPT 11 12"
 
 build_test_filter
 
@@ -1155,6 +1155,80 @@ test_11b() {
 }
 run_test 11b "LFSCK can rebuild crashed last_id"
 
+# test_12: drive the layout LFSCK on every target with a single lctl
+# command issued on mds1 (the -A option).
+#
+# Steps:
+#  1. Re-create the filesystem (stopall / formatall / setupall) and check
+#     that lfsck_layout reports 'init' on every MDT; create one directory
+#     per MDT index with 100 files in it.
+#  2. Start the layout LFSCK with -A at speed limit 10 and check that
+#     every MDT reports 'scanning-phase1'.
+#  3. Stop it with -A and check that every MDT and every OST reports
+#     'stopped'.
+#  4. Start it again with -A at full speed (-s 0) and wait for every MDT
+#     to report 'completed'.
+#
+# Needs at least 2 MDSes; skipped otherwise.
+test_12() {
+       # NOTE(review): 'exit 0' ends the running shell instead of just
+       # returning from test_12 -- confirm this is what the harness expects.
+       [ $MDSCOUNT -lt 2 ] &&
+               skip "We need at least 2 MDSes for test_12" && exit 0
+
+       echo "stopall"
+       stopall > /dev/null
+       echo "formatall"
+       formatall > /dev/null
+       echo "setupall"
+       setupall > /dev/null
+
+       echo "All the LFSCK targets should be in 'init' status."
+       for k in $(seq $MDSCOUNT); do
+               local STATUS=$(do_facet mds${k} $LCTL get_param -n \
+                               mdd.$(facet_svc mds${k}).lfsck_layout |
+                               awk '/^status/ { print $2 }')
+               [ "$STATUS" == "init" ] ||
+                       error "(1) MDS${k} Expect 'init', but got '$STATUS'"
+
+               $LFS mkdir -i $((k - 1)) $DIR/${k}
+               createmany -o $DIR/${k}/f 100
+       done
+
+       echo "Trigger LFSCK on all targets by single command (limited speed)."
+       do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
+               -s 10 || error "(2) Fail to start LFSCK on all devices!"
+
+       echo "All the LFSCK targets should be in 'scanning-phase1' status."
+       for k in $(seq $MDSCOUNT); do
+               local STATUS=$(do_facet mds${k} $LCTL get_param -n \
+                               mdd.$(facet_svc mds${k}).lfsck_layout |
+                               awk '/^status/ { print $2 }')
+               [ "$STATUS" == "scanning-phase1" ] ||
+               error "(3) MDS${k} Expect 'scanning-phase1', but got '$STATUS'"
+       done
+
+       echo "Stop layout LFSCK on all targets by single lctl command."
+       do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
+               error "(4) Fail to stop LFSCK on all devices!"
+
+       echo "All the LFSCK targets should be in 'stopped' status."
+       for k in $(seq $MDSCOUNT); do
+               local STATUS=$(do_facet mds${k} $LCTL get_param -n \
+                               mdd.$(facet_svc mds${k}).lfsck_layout |
+                               awk '/^status/ { print $2 }')
+               [ "$STATUS" == "stopped" ] ||
+                       error "(5) MDS${k} Expect 'stopped', but got '$STATUS'"
+       done
+
+       for k in $(seq $OSTCOUNT); do
+               local STATUS=$(do_facet ost${k} $LCTL get_param -n \
+                               obdfilter.$(facet_svc ost${k}).lfsck_layout |
+                               awk '/^status/ { print $2 }')
+               [ "$STATUS" == "stopped" ] ||
+                       error "(6) OST${k} Expect 'stopped', but got '$STATUS'"
+       done
+
+       echo "Re-trigger LFSCK on all targets by single command (full speed)."
+       do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
+               -s 0 || error "(7) Fail to start LFSCK on all devices!"
+
+       echo "All the LFSCK targets should be in 'completed' status."
+       for k in $(seq $MDSCOUNT); do
+               # The LFSCK status query interval is 30 seconds. In case some
+               # LFSCK_NOTIFY RPCs fail or are lost, wait long enough for
+               # the status to be synced up.
+               wait_update_facet mds${k} "$LCTL get_param -n \
+                       mdd.$(facet_svc mds${k}).lfsck_layout |
+                       awk '/^status/ { print \\\$2 }'" "completed" 32 ||
+                       error "(8) MDS${k} is not the expected 'completed'"
+       done
+}
+run_test 12 "single command to trigger LFSCK on all devices"
+
 $LCTL set_param debug=-lfsck > /dev/null || true
 
 # restore MDS/OST size
index 1875b4b..c3fc3c4 100644 (file)
@@ -367,11 +367,12 @@ command_t cmdlist[] = {
         "usage: lfsck_start <-M | --device [MDT,OST]_device>\n"
         "                   [-e | --error error_handle] [-h | --help]\n"
         "                   [-n | --dryrun switch] [-r | --reset]\n"
-        "                   [-s | --speed speed_limit]\n"
+        "                   [-s | --speed speed_limit] [-A | --all]\n"
         "                   [-t | --type lfsck_type[,lfsck_type...]]\n"
         "                   [-w | --windows win_size]"},
        {"lfsck_stop", jt_lfsck_stop, 0, "stop lfsck(s)\n"
-        "usage: lfsck_stop <-M | --device [MDT,OST]_device> [-h | --help]"},
+        "usage: lfsck_stop <-M | --device [MDT,OST]_device>\n"
+        "                  [-A | --all] [-h | --help]"},
 
        {"==== obsolete (DANGEROUS) ====", jt_noop, 0, "obsolete (DANGEROUS)"},
        /* some test scripts still use these */
index e04cc21..0dd7018 100644 (file)
@@ -52,6 +52,7 @@ static struct option long_opt_start[] = {
        {"dryrun",      required_argument, 0, 'n'},
        {"reset",       no_argument,       0, 'r'},
        {"speed",       required_argument, 0, 's'},
+       {"all",         no_argument,       0, 'A'},
        {"type",        required_argument, 0, 't'},
        {"windows",     required_argument, 0, 'w'},
        {0,             0,                 0,   0}
@@ -59,6 +60,7 @@ static struct option long_opt_start[] = {
 
 static struct option long_opt_stop[] = {
+       /* Long options of lfsck_stop. The last field of each entry is the
+        * short option character; jt_lfsck_stop()'s optstring ("M:Ah")
+        * lists the same three. The all-zero entry ends the table. */
        {"device",      required_argument, 0, 'M'},
+       {"all",         no_argument,       0, 'A'},
        {"help",        no_argument,       0, 'h'},
        {0,             0,                 0,   0}
 };
@@ -95,7 +97,7 @@ static void usage_start(void)
                "lfsck_start <-M | --device [MDT,OST]_device>\n"
                "            [-e | --error error_handle] [-h | --help]\n"
                "            [-n | --dryrun switch] [-r | --reset]\n"
-               "            [-s | --speed speed_limit]\n"
+               "            [-s | --speed speed_limit] [-A | --all]\n"
                "            [-t | --type lfsck_type[,lfsck_type...]]\n"
                "            [-w | --windows win_size]\n"
                "OPTIONS:\n"
@@ -106,6 +108,7 @@ static void usage_start(void)
                "-r: Reset scanning start position to the device beginning.\n"
                "-s: How many items can be scanned at most per second. "
                    "'%d' means no limit (default).\n"
+               "-A: Start LFSCK on all MDT devices.\n"
                "-t: The LFSCK type(s) to be started.\n"
                "-w: The windows size for async requests pipeline.\n",
                LFSCK_SPEED_NO_LIMIT);
@@ -115,9 +118,11 @@ static void usage_stop(void)
 {
+       /* Print the lfsck_stop help text to stderr. The continuation line
+        * of the synopsis is indented to sit under the first option, the
+        * same layout usage_start() prints. */
        fprintf(stderr, "Stop LFSCK.\n"
                "SYNOPSIS:\n"
-               "lfsck_stop <-M | --device [MDT,OST]_device> [-h | --help]\n"
+               "lfsck_stop <-M | --device [MDT,OST]_device>\n"
+               "           [-A | --all] [-h | --help]\n"
                "OPTIONS:\n"
                "-M: The device to stop LFSCK/scrub on.\n"
+               "-A: Stop LFSCK on all MDT devices.\n"
                "-h: Help information.\n");
 }
 
@@ -144,7 +149,7 @@ int jt_lfsck_start(int argc, char **argv)
        char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf;
        char device[MAX_OBD_NAME];
        struct lfsck_start start;
-       char *optstring = "M:e:hn:rs:t:w:";
+       char *optstring = "M:e:hn:rs:At:w:";
        int opt, index, rc, val, i, type;
 
        memset(&data, 0, sizeof(data));
@@ -197,6 +202,9 @@ int jt_lfsck_start(int argc, char **argv)
                        start.ls_speed_limit = val;
                        start.ls_valid |= LSV_SPEED_LIMIT;
                        break;
+               case 'A':
+                       start.ls_flags |= LPF_ALL_MDT | LPF_BROADCAST;
+                       break;
                case 't': {
                        char *str = optarg, *p, c;
 
@@ -312,10 +320,12 @@ int jt_lfsck_stop(int argc, char **argv)
        struct obd_ioctl_data data;
        char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf;
        char device[MAX_OBD_NAME];
-       char *optstring = "M:h";
+       struct lfsck_stop stop;
+       char *optstring = "M:Ah";
        int opt, index, rc;
 
        memset(&data, 0, sizeof(data));
+       memset(&stop, 0, sizeof(stop));
        memset(device, 0, MAX_OBD_NAME);
 
        /* Reset the 'optind' for the case of getopt_long() called multiple
@@ -329,6 +339,9 @@ int jt_lfsck_stop(int argc, char **argv)
                        if (rc != 0)
                                return rc;
                        break;
+               case 'A':
+                       stop.ls_flags |= LPF_ALL_MDT | LPF_BROADCAST;
+                       break;
                case 'h':
                        usage_stop();
                        return 0;
@@ -350,6 +363,8 @@ int jt_lfsck_stop(int argc, char **argv)
                }
        }
 
+       data.ioc_inlbuf1 = (char *)&stop;
+       data.ioc_inllen1 = sizeof(stop);
        memset(buf, 0, sizeof(rawbuf));
        rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf));
        if (rc) {