From f0ead95dd1275ee906eccdf117abb92b36949a1b Mon Sep 17 00:00:00 2001 From: Alexander Boyko Date: Tue, 26 Mar 2019 07:43:14 -0400 Subject: [PATCH] LU-8760 lfsck: fix bit operations lfsck_assistant_data Race between lfsck_master_engine->lfsck_double_scan_generic() and lfsck_layout->lfsck_assistant_engine() take a place. Both threads were sleeping and waiting for each other. lad_to_double_scan was set before sleep, but lfsck_assistant_data had zeros lad_post_result = 1, lad_to_post = 0, lad_to_double_scan = 0, lad_in_double_scan = 0, lad_exit = 0, lad_incomplete = 0, Using unsigned int a:1, b:1; f1() {a = 1;} f2() {b = 0;} is racy for multithread execution. These type of logic requires atomic bit operations. The race is lad_to_double_scan vs lad_to_post. Signed-off-by: Alexander Boyko Cray-bug-id: LUS-7076 Change-Id: I4f971ce2acb244f32ae2e108b96995dc2f27e7a3 Reviewed-on: https://review.whamcloud.com/34502 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Andrew Perepechko Reviewed-by: Oleg Drokin --- lustre/lfsck/lfsck_engine.c | 44 ++++++++++++++++++++++-------------------- lustre/lfsck/lfsck_internal.h | 15 ++++++++------ lustre/lfsck/lfsck_layout.c | 16 +++++++-------- lustre/lfsck/lfsck_lib.c | 17 +++++++--------- lustre/lfsck/lfsck_namespace.c | 10 +++++----- 5 files changed, 52 insertions(+), 50 deletions(-) diff --git a/lustre/lfsck/lfsck_engine.c b/lustre/lfsck/lfsck_engine.c index 4eb62d4..a25d949 100644 --- a/lustre/lfsck/lfsck_engine.c +++ b/lustre/lfsck/lfsck_engine.c @@ -1617,7 +1617,7 @@ int lfsck_assistant_engine(void *args) while (!list_empty(&lad->lad_req_list)) { bool wakeup = false; - if (unlikely(lad->lad_exit || + if (unlikely(test_bit(LAD_EXIT, &lad->lad_flags) || !thread_is_running(mthread))) GOTO(cleanup, rc = lad->lad_post_result); @@ -1649,25 +1649,25 @@ int lfsck_assistant_engine(void *args) l_wait_event(athread->t_ctl_waitq, !lfsck_assistant_req_empty(lad) || - lad->lad_exit || - lad->lad_to_post || - lad->lad_to_double_scan, + test_bit(LAD_EXIT, &lad->lad_flags) || + test_bit(LAD_TO_POST, &lad->lad_flags) || + test_bit(LAD_TO_DOUBLE_SCAN, &lad->lad_flags), &lwi); - if (unlikely(lad->lad_exit)) + if (unlikely(test_bit(LAD_EXIT, &lad->lad_flags))) GOTO(cleanup, rc = lad->lad_post_result); if (!list_empty(&lad->lad_req_list)) continue; - if (lad->lad_to_post) { + if (test_bit(LAD_TO_POST, &lad->lad_flags)) { CDEBUG(D_LFSCK, "%s: %s LFSCK assistant thread post\n", lfsck_lfsck2name(lfsck), lad->lad_name); - if (unlikely(lad->lad_exit)) + if (unlikely(test_bit(LAD_EXIT, &lad->lad_flags))) GOTO(cleanup, rc = lad->lad_post_result); - lad->lad_to_post = 0; + clear_bit(LAD_TO_POST, &lad->lad_flags); LASSERT(lad->lad_post_result > 0); /* Wakeup the master engine to go ahead. */ @@ -1684,10 +1684,10 @@ int lfsck_assistant_engine(void *args) lad->lad_name, rc); } - if (lad->lad_to_double_scan) { - lad->lad_to_double_scan = 0; + if (test_bit(LAD_TO_DOUBLE_SCAN, &lad->lad_flags)) { + clear_bit(LAD_TO_DOUBLE_SCAN, &lad->lad_flags); atomic_inc(&lfsck->li_double_scan_count); - lad->lad_in_double_scan = 1; + set_bit(LAD_IN_DOUBLE_SCAN, &lad->lad_flags); wake_up_all(&mthread->t_ctl_waitq); com->lc_new_checked = 0; @@ -1711,7 +1711,7 @@ int lfsck_assistant_engine(void *args) if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_NO_DOUBLESCAN)) GOTO(cleanup, rc = 0); - while (lad->lad_in_double_scan) { + while (test_bit(LAD_IN_DOUBLE_SCAN, &lad->lad_flags)) { rc = lfsck_assistant_query_others(env, com); if (lfsck_phase2_next_ready(lad)) goto p2_next; @@ -1726,12 +1726,13 @@ int lfsck_assistant_engine(void *args) NULL, NULL); rc = l_wait_event(athread->t_ctl_waitq, lfsck_phase2_next_ready(lad) || - lad->lad_exit || + test_bit(LAD_EXIT, &lad->lad_flags) || !thread_is_running(mthread), &lwi); - if (unlikely(lad->lad_exit || - !thread_is_running(mthread))) + if (unlikely( + test_bit(LAD_EXIT, &lad->lad_flags) || + !thread_is_running(mthread))) GOTO(cleanup, rc = 0); if (rc == -ETIMEDOUT) @@ -1745,8 +1746,9 @@ p2_next: if (rc != 0) GOTO(cleanup, rc); - if (unlikely(lad->lad_exit || - !thread_is_running(mthread))) + if (unlikely( + test_bit(LAD_EXIT, &lad->lad_flags) || + !thread_is_running(mthread))) GOTO(cleanup, rc = 0); } } @@ -1758,7 +1760,7 @@ cleanup: if (rc < 0) lad->lad_assistant_status = rc; - if (lad->lad_exit && lad->lad_post_result <= 0) + if (test_bit(LAD_EXIT, &lad->lad_flags) && lad->lad_post_result <= 0) lao->la_fill_pos(env, com, &lfsck->li_pos_checkpoint); thread_set_flags(athread, SVC_STOPPING); @@ -1832,8 +1834,8 @@ cleanup: /* Under force exit case, some requests may be just freed without * verification, those objects should be re-handled when next run. * So not update the on-disk trace file under such case. */ - if (lad->lad_in_double_scan) { - if (!lad->lad_exit) + if (test_bit(LAD_IN_DOUBLE_SCAN, &lad->lad_flags)) { + if (!test_bit(LAD_EXIT, &lad->lad_flags)) rc1 = lao->la_double_scan_result(env, com, rc); CDEBUG(D_LFSCK, "%s: LFSCK assistant phase2 scan " @@ -1842,7 +1844,7 @@ cleanup: } fini: - if (lad->lad_in_double_scan) + if (test_bit(LAD_IN_DOUBLE_SCAN, &lad->lad_flags)) atomic_dec(&lfsck->li_double_scan_count); spin_lock(&lad->lad_lock); diff --git a/lustre/lfsck/lfsck_internal.h b/lustre/lfsck/lfsck_internal.h index c302b43..16e6a05 100644 --- a/lustre/lfsck/lfsck_internal.h +++ b/lustre/lfsck/lfsck_internal.h @@ -855,13 +855,16 @@ struct lfsck_assistant_data { int lad_prefetched; int lad_assistant_status; int lad_post_result; - unsigned int lad_to_post:1, - lad_to_double_scan:1, - lad_in_double_scan:1, - lad_exit:1, - lad_incomplete:1; + unsigned long lad_flags; bool lad_advance_lock; }; +enum { + LAD_TO_POST = 0, + LAD_TO_DOUBLE_SCAN = 1, + LAD_IN_DOUBLE_SCAN = 2, + LAD_EXIT = 3, + LAD_INCOMPLETE = 4, +}; #define LFSCK_TMPBUF_LEN 64 @@ -1430,7 +1433,7 @@ static inline void lfsck_lad_set_bitmap(const struct lu_env *env, if (likely(bitmap->size > index)) { cfs_bitmap_set(bitmap, index); - lad->lad_incomplete = 1; + set_bit(LAD_INCOMPLETE, &lad->lad_flags); } else if (com->lc_type == LFSCK_TYPE_NAMESPACE) { struct lfsck_namespace *ns = com->lc_file_ram; diff --git a/lustre/lfsck/lfsck_layout.c b/lustre/lfsck/lfsck_layout.c index 1a21536..4f250e0 100644 --- a/lustre/lfsck/lfsck_layout.c +++ b/lustre/lfsck/lfsck_layout.c @@ -270,7 +270,7 @@ static void lfsck_layout_assistant_sync_failures(const struct lu_env *env, int rc = 0; ENTRY; - if (!lad->lad_incomplete) + if (!test_bit(LAD_INCOMPLETE, &lad->lad_flags)) RETURN_EXIT; /* If the MDT has ever failed to verfiy some OST-objects, @@ -908,7 +908,7 @@ static int lfsck_layout_load_bitmap(const struct lu_env *env, } if (lo->ll_bitmap_size == 0) { - lad->lad_incomplete = 0; + clear_bit(LAD_INCOMPLETE, &lad->lad_flags); CFS_RESET_BITMAP(bitmap); RETURN(0); @@ -920,9 +920,9 @@ static int lfsck_layout_load_bitmap(const struct lu_env *env, RETURN(rc >= 0 ? -EINVAL : rc); if (cfs_bitmap_check_empty(bitmap)) - lad->lad_incomplete = 0; + clear_bit(LAD_INCOMPLETE, &lad->lad_flags); else - lad->lad_incomplete = 1; + set_bit(LAD_INCOMPLETE, &lad->lad_flags); RETURN(0); } @@ -1467,7 +1467,7 @@ static int lfsck_layout_double_scan_result(const struct lu_env *env, if (lfsck->li_master) { struct lfsck_assistant_data *lad = com->lc_data; - if (lad->lad_incomplete) + if (test_bit(LAD_INCOMPLETE, &lad->lad_flags)) lo->ll_status = LS_PARTIAL; else lo->ll_status = LS_COMPLETED; @@ -4346,7 +4346,7 @@ out: if (rc < 0) { struct lfsck_assistant_data *lad = com->lc_data; - if (unlikely(lad->lad_exit)) { + if (unlikely(test_bit(LAD_EXIT, &lad->lad_flags))) { rc = 0; } else if (rc == -ENOTCONN || rc == -ESHUTDOWN || rc == -ETIMEDOUT || rc == -EHOSTDOWN || @@ -4536,7 +4536,7 @@ static int lfsck_layout_assistant_handler_p2(const struct lu_env *env, if (rc != 0 && bk->lb_param & LPF_FAILOUT) RETURN(rc); - if (unlikely(lad->lad_exit || + if (unlikely(test_bit(LAD_EXIT, &lad->lad_flags) || !thread_is_running(&lfsck->li_thread))) RETURN(0); spin_lock(<ds->ltd_lock); @@ -5083,7 +5083,7 @@ static int lfsck_layout_reset(const struct lu_env *env, if (com->lc_lfsck->li_master) { struct lfsck_assistant_data *lad = com->lc_data; - lad->lad_incomplete = 0; + clear_bit(LAD_INCOMPLETE, &lad->lad_flags); CFS_RESET_BITMAP(lad->lad_bitmap); } diff --git a/lustre/lfsck/lfsck_lib.c b/lustre/lfsck/lfsck_lib.c index 0581be4..4e96050 100644 --- a/lustre/lfsck/lfsck_lib.c +++ b/lustre/lfsck/lfsck_lib.c @@ -2510,10 +2510,7 @@ int lfsck_start_assistant(const struct lu_env *env, struct lfsck_component *com, lad->lad_assistant_status = 0; lad->lad_post_result = 0; - lad->lad_to_post = 0; - lad->lad_to_double_scan = 0; - lad->lad_in_double_scan = 0; - lad->lad_exit = 0; + lad->lad_flags = 0; lad->lad_advance_lock = false; thread_set_flags(athread, 0); @@ -2577,8 +2574,8 @@ void lfsck_post_generic(const struct lu_env *env, lad->lad_post_result = *result; if (*result <= 0) - lad->lad_exit = 1; - lad->lad_to_post = 1; + set_bit(LAD_EXIT, &lad->lad_flags); + set_bit(LAD_TO_POST, &lad->lad_flags); CDEBUG(D_LFSCK, "%s: waiting for assistant to do %s post, rc = %d\n", lfsck_lfsck2name(com->lc_lfsck), lad->lad_name, *result); @@ -2605,9 +2602,9 @@ int lfsck_double_scan_generic(const struct lu_env *env, struct l_wait_info lwi = { 0 }; if (status != LS_SCANNING_PHASE2) - lad->lad_exit = 1; + set_bit(LAD_EXIT, &lad->lad_flags); else - lad->lad_to_double_scan = 1; + set_bit(LAD_TO_DOUBLE_SCAN, &lad->lad_flags); CDEBUG(D_LFSCK, "%s: waiting for assistant to do %s double_scan, " "status %d\n", @@ -2615,7 +2612,7 @@ int lfsck_double_scan_generic(const struct lu_env *env, wake_up_all(&athread->t_ctl_waitq); l_wait_event(mthread->t_ctl_waitq, - lad->lad_in_double_scan || + test_bit(LAD_IN_DOUBLE_SCAN, &lad->lad_flags) || thread_is_stopped(athread), &lwi); @@ -2637,7 +2634,7 @@ void lfsck_quit_generic(const struct lu_env *env, struct ptlrpc_thread *athread = &lad->lad_thread; struct l_wait_info lwi = { 0 }; - lad->lad_exit = 1; + set_bit(LAD_EXIT, &lad->lad_flags); wake_up_all(&athread->t_ctl_waitq); l_wait_event(mthread->t_ctl_waitq, thread_is_init(athread) || diff --git a/lustre/lfsck/lfsck_namespace.c b/lustre/lfsck/lfsck_namespace.c index d88a352..269a2f0 100644 --- a/lustre/lfsck/lfsck_namespace.c +++ b/lustre/lfsck/lfsck_namespace.c @@ -312,7 +312,7 @@ static int lfsck_namespace_load_bitmap(const struct lu_env *env, } if (ns->ln_bitmap_size == 0) { - lad->lad_incomplete = 0; + clear_bit(LAD_INCOMPLETE, &lad->lad_flags); CFS_RESET_BITMAP(bitmap); RETURN(0); @@ -326,9 +326,9 @@ static int lfsck_namespace_load_bitmap(const struct lu_env *env, RETURN(rc >= 0 ? -EINVAL : rc); if (cfs_bitmap_check_empty(bitmap)) - lad->lad_incomplete = 0; + clear_bit(LAD_INCOMPLETE, &lad->lad_flags); else - lad->lad_incomplete = 1; + set_bit(LAD_INCOMPLETE, &lad->lad_flags); RETURN(0); } @@ -4100,7 +4100,7 @@ static int lfsck_namespace_reset(const struct lu_env *env, if (rc != 0) GOTO(out, rc); - lad->lad_incomplete = 0; + clear_bit(LAD_INCOMPLETE, &lad->lad_flags); CFS_RESET_BITMAP(lad->lad_bitmap); rc = lfsck_namespace_store(env, com); @@ -6555,7 +6555,7 @@ static void lfsck_namespace_assistant_sync_failures(const struct lu_env *env, int rc = 0; ENTRY; - if (!lad->lad_incomplete) + if (!test_bit(LAD_INCOMPLETE, &lad->lad_flags)) RETURN_EXIT; set = ptlrpc_prep_set(); -- 1.8.3.1