During the first-stage scanning, the local LFSCK instance records
which OSTs have ever failed to respond to LFSCK verification requests
(perhaps because of network issues or trouble on the OST itself). Then,
before starting the second-stage scanning, the local LFSCK instance
notifies those failed OSTs, via la_sync_failures(), to skip orphan
handling, since they missed some OST-object verifications.
Originally, after la_sync_failures(), the related OSTs were removed
from the LFSCK targets list regardless of whether la_sync_failures()
succeeded, so the subsequent LFSCK notification RPCs were not sent
to those OSTs. As a result, some OST(s) could not exit LFSCK as
expected, and a subsequent LFSCK start would then fail because the
former LFSCK instance had not exited.
Signed-off-by: Fan Yong <fan.yong@intel.com>
Change-Id: Id0283c78527d6a3a6c563de7ce6af1fe2d3f1a30
Reviewed-on: http://review.whamcloud.com/13525
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Lai Siyao <lai.siyao@intel.com>
Reviewed-by: Alex Zhuravlev <alexey.zhuravlev@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
+ lfsck_pos_fill(env, lfsck, &lfsck->li_pos_checkpoint, false);
CDEBUG(D_LFSCK, "LFSCK exit: oit_flags = %#x, dir_flags = %#x, "
"oit_cookie = "LPU64", dir_cookie = "LPX64", parent = "DFID
", pid = %d, rc = %d\n", lfsck->li_args_oit, lfsck->li_args_dir,
CDEBUG(D_LFSCK, "LFSCK exit: oit_flags = %#x, dir_flags = %#x, "
"oit_cookie = "LPU64", dir_cookie = "LPX64", parent = "DFID
", pid = %d, rc = %d\n", lfsck->li_args_oit, lfsck->li_args_dir,
laia->laia_ltd = ltd;
ltd->ltd_layout_done = 0;
laia->laia_ltd = ltd;
ltd->ltd_layout_done = 0;
+ ltd->ltd_synced_failures = 0;
rc = lfsck_async_request(env, ltd->ltd_exp, lr, set,
lfsck_async_interpret_common,
laia, LFSCK_NOTIFY);
rc = lfsck_async_request(env, ltd->ltd_exp, lr, set,
lfsck_async_interpret_common,
laia, LFSCK_NOTIFY);
*gen = lad->lad_touch_gen;
list_move_tail(list, &lad->lad_mdt_list);
*gen = lad->lad_touch_gen;
list_move_tail(list, &lad->lad_mdt_list);
- if (ltd->ltd_namespace_failed)
+ if (ltd->ltd_synced_failures)
continue;
atomic_inc(<d->ltd_ref);
continue;
atomic_inc(<d->ltd_ref);
unsigned int ltd_dead:1,
ltd_layout_done:1,
ltd_namespace_done:1,
unsigned int ltd_dead:1,
ltd_layout_done:1,
ltd_namespace_done:1,
- ltd_namespace_failed:1;
};
struct lfsck_tgt_desc_idx {
};
struct lfsck_tgt_desc_idx {
struct ptlrpc_request *req,
void *args, int rc)
{
struct ptlrpc_request *req,
void *args, int rc)
{
- struct lfsck_async_interpret_args *laia = args;
+ if (rc == 0) {
+ struct lfsck_async_interpret_args *laia = args;
+ struct lfsck_tgt_desc *ltd = laia->laia_ltd;
+ ltd->ltd_synced_failures = 1;
atomic_dec(laia->laia_count);
atomic_dec(laia->laia_count);
ltd = LTD_TGT(ltds, idx);
LASSERT(ltd != NULL);
ltd = LTD_TGT(ltds, idx);
LASSERT(ltd != NULL);
- spin_lock(<ds->ltd_lock);
- list_del_init(<d->ltd_layout_phase_list);
- list_del_init(<d->ltd_layout_list);
- spin_unlock(<ds->ltd_lock);
-
rc = lfsck_async_request(env, ltd->ltd_exp, lr, set,
lfsck_layout_assistant_sync_failures_interpret,
laia, LFSCK_NOTIFY);
rc = lfsck_async_request(env, ltd->ltd_exp, lr, set,
lfsck_layout_assistant_sync_failures_interpret,
laia, LFSCK_NOTIFY);
CDEBUG(D_LFSCK, "%s: layout LFSCK slave phase2 scan start\n",
lfsck_lfsck2name(lfsck));
CDEBUG(D_LFSCK, "%s: layout LFSCK slave phase2 scan start\n",
lfsck_lfsck2name(lfsck));
+ atomic_inc(&lfsck->li_double_scan_count);
+
if (lo->ll_flags & LF_INCOMPLETE)
GOTO(done, rc = 1);
if (lo->ll_flags & LF_INCOMPLETE)
GOTO(done, rc = 1);
- atomic_inc(&lfsck->li_double_scan_count);
-
com->lc_new_checked = 0;
com->lc_new_scanned = 0;
com->lc_time_last_checkpoint = cfs_time_current();
com->lc_new_checked = 0;
com->lc_new_scanned = 0;
com->lc_time_last_checkpoint = cfs_time_current();
rc = l_wait_event(thread->t_ctl_waitq,
!thread_is_running(thread) ||
rc = l_wait_event(thread->t_ctl_waitq,
!thread_is_running(thread) ||
+ lo->ll_flags & LF_INCOMPLETE ||
list_empty(&llsd->llsd_master_list),
&lwi);
if (unlikely(!thread_is_running(thread)))
GOTO(done, rc = 0);
list_empty(&llsd->llsd_master_list),
&lwi);
if (unlikely(!thread_is_running(thread)))
GOTO(done, rc = 0);
+ if (lo->ll_flags & LF_INCOMPLETE)
+ GOTO(done, rc = 1);
+
if (rc == -ETIMEDOUT)
continue;
if (rc == -ETIMEDOUT)
continue;
CDEBUG(D_LFSCK, "%s: layout LFSCK master handles notify %u "
"from %s %x, status %d, flags %x, flags2 %x\n",
lfsck_lfsck2name(lfsck), lr->lr_event,
CDEBUG(D_LFSCK, "%s: layout LFSCK master handles notify %u "
"from %s %x, status %d, flags %x, flags2 %x\n",
lfsck_lfsck2name(lfsck), lr->lr_event,
- (lr->lr_flags & LEF_TO_OST) ? "OST" : "MDT",
+ (lr->lr_flags & LEF_FROM_OST) ? "OST" : "MDT",
lr->lr_index, lr->lr_status, lr->lr_flags, lr->lr_flags2);
if (lr->lr_event != LE_PHASE1_DONE &&
lr->lr_index, lr->lr_status, lr->lr_flags, lr->lr_flags2);
if (lr->lr_event != LE_PHASE1_DONE &&
break;
case LE_PHASE2_DONE:
ltd->ltd_layout_done = 1;
break;
case LE_PHASE2_DONE:
ltd->ltd_layout_done = 1;
- list_del_init(<d->ltd_layout_list);
+ if (!list_empty(<d->ltd_layout_list)) {
+ list_del_init(<d->ltd_layout_list);
+ if (lr->lr_flags2 & LF_INCOMPLETE) {
+ lfsck_lad_set_bitmap(env, com, ltd->ltd_index);
+ fail = true;
+ }
+ }
+
break;
case LE_PEER_EXIT:
fail = true;
break;
case LE_PEER_EXIT:
fail = true;
true);
if (llst != NULL) {
lfsck_layout_llst_put(llst);
true);
if (llst != NULL) {
lfsck_layout_llst_put(llst);
- if (list_empty(&llsd->llsd_master_list))
- wake_up_all(
- &lfsck->li_thread.t_ctl_waitq);
+ wake_up_all(&lfsck->li_thread.t_ctl_waitq);
laia->laia_ltd = ltd;
ltd->ltd_layout_done = 0;
ltd->ltd_namespace_done = 0;
laia->laia_ltd = ltd;
ltd->ltd_layout_done = 0;
ltd->ltd_namespace_done = 0;
+ ltd->ltd_synced_failures = 0;
rc = lfsck_async_request(env, ltd->ltd_exp, lr, set,
lfsck_async_interpret, laia,
LFSCK_NOTIFY);
rc = lfsck_async_request(env, ltd->ltd_exp, lr, set,
lfsck_async_interpret, laia,
LFSCK_NOTIFY);
struct ptlrpc_request *req,
void *args, int rc)
{
struct ptlrpc_request *req,
void *args, int rc)
{
+ if (rc == 0) {
+ struct lfsck_async_interpret_args *laia = args;
+ struct lfsck_tgt_desc *ltd = laia->laia_ltd;
+
+ ltd->ltd_synced_failures = 1;
+ }
+
struct lfsck_tgt_descs *ltds = &lfsck->li_mdt_descs;
struct lfsck_tgt_desc *ltd;
struct ptlrpc_request_set *set;
struct lfsck_tgt_descs *ltds = &lfsck->li_mdt_descs;
struct lfsck_tgt_desc *ltd;
struct ptlrpc_request_set *set;
+ if (!lad->lad_incomplete)
+ RETURN_EXIT;
+
set = ptlrpc_prep_set();
if (set == NULL)
GOTO(out, rc = -ENOMEM);
set = ptlrpc_prep_set();
if (set == NULL)
GOTO(out, rc = -ENOMEM);
memset(laia, 0, sizeof(*laia));
lad->lad_touch_gen++;
memset(laia, 0, sizeof(*laia));
lad->lad_touch_gen++;
- spin_lock(<ds->ltd_lock);
- while (!list_empty(&lad->lad_mdt_list)) {
- ltd = list_entry(lad->lad_mdt_list.next,
- struct lfsck_tgt_desc,
- ltd_namespace_list);
- if (ltd->ltd_namespace_gen == lad->lad_touch_gen)
- break;
+ down_read(<ds->ltd_rw_sem);
+ cfs_foreach_bit(lad->lad_bitmap, idx) {
+ ltd = LTD_TGT(ltds, idx);
+ LASSERT(ltd != NULL);
- ltd->ltd_namespace_gen = lad->lad_touch_gen;
- list_move_tail(<d->ltd_namespace_list,
- &lad->lad_mdt_list);
- if (!lad->lad_incomplete ||
- !cfs_bitmap_check(lad->lad_bitmap, ltd->ltd_index)) {
- ltd->ltd_namespace_failed = 0;
- continue;
- }
-
- ltd->ltd_namespace_failed = 1;
- spin_unlock(<ds->ltd_lock);
rc = lfsck_async_request(env, ltd->ltd_exp, lr, set,
lfsck_namespace_assistant_sync_failures_interpret,
laia, LFSCK_NOTIFY);
rc = lfsck_async_request(env, ltd->ltd_exp, lr, set,
lfsck_namespace_assistant_sync_failures_interpret,
laia, LFSCK_NOTIFY);
CDEBUG(D_LFSCK, "%s: namespace LFSCK assistant fail "
"to sync failure with MDT %x: rc = %d\n",
lfsck_lfsck2name(lfsck), ltd->ltd_index, rc);
CDEBUG(D_LFSCK, "%s: namespace LFSCK assistant fail "
"to sync failure with MDT %x: rc = %d\n",
lfsck_lfsck2name(lfsck), ltd->ltd_index, rc);
-
- spin_lock(<ds->ltd_lock);
- spin_unlock(<ds->ltd_lock);
+ up_read(<ds->ltd_rw_sem);
rc = ptlrpc_set_wait(set);
ptlrpc_set_destroy(set);
rc = ptlrpc_set_wait(set);
ptlrpc_set_destroy(set);