Whamcloud - gitweb
LU-8760 lfsck: fix bit operations lfsck_assistant_data 02/34502/2
authorAlexander Boyko <c17825@cray.com>
Tue, 26 Mar 2019 11:43:14 +0000 (07:43 -0400)
committerOleg Drokin <green@whamcloud.com>
Mon, 8 Apr 2019 05:32:03 +0000 (05:32 +0000)
Race between lfsck_master_engine->lfsck_double_scan_generic()
and  lfsck_layout->lfsck_assistant_engine() take a place.
Both threads were sleeping and waiting for each other.
lad_to_double_scan was set before sleep, but lfsck_assistant_data had
zeros
  lad_post_result = 1,
  lad_to_post = 0,
  lad_to_double_scan = 0,
  lad_in_double_scan = 0,
  lad_exit = 0,
  lad_incomplete = 0,

Using
 unsigned int a:1,
           b:1;
  f1() {a = 1;}
  f2() {b = 0;}
is racy for multithread execution. These type of logic requires
atomic bit operations.
The race is lad_to_double_scan vs lad_to_post.

Signed-off-by: Alexander Boyko <c17825@cray.com>
Cray-bug-id: LUS-7076
Change-Id: I4f971ce2acb244f32ae2e108b96995dc2f27e7a3
Reviewed-on: https://review.whamcloud.com/34502
Tested-by: Jenkins
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Andrew Perepechko <c17827@cray.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/lfsck/lfsck_engine.c
lustre/lfsck/lfsck_internal.h
lustre/lfsck/lfsck_layout.c
lustre/lfsck/lfsck_lib.c
lustre/lfsck/lfsck_namespace.c

index 4eb62d4..a25d949 100644 (file)
@@ -1617,7 +1617,7 @@ int lfsck_assistant_engine(void *args)
                while (!list_empty(&lad->lad_req_list)) {
                        bool wakeup = false;
 
-                       if (unlikely(lad->lad_exit ||
+                       if (unlikely(test_bit(LAD_EXIT, &lad->lad_flags) ||
                                     !thread_is_running(mthread)))
                                GOTO(cleanup, rc = lad->lad_post_result);
 
@@ -1649,25 +1649,25 @@ int lfsck_assistant_engine(void *args)
 
                l_wait_event(athread->t_ctl_waitq,
                             !lfsck_assistant_req_empty(lad) ||
-                            lad->lad_exit ||
-                            lad->lad_to_post ||
-                            lad->lad_to_double_scan,
+                            test_bit(LAD_EXIT, &lad->lad_flags) ||
+                            test_bit(LAD_TO_POST, &lad->lad_flags) ||
+                            test_bit(LAD_TO_DOUBLE_SCAN, &lad->lad_flags),
                             &lwi);
 
-               if (unlikely(lad->lad_exit))
+               if (unlikely(test_bit(LAD_EXIT, &lad->lad_flags)))
                        GOTO(cleanup, rc = lad->lad_post_result);
 
                if (!list_empty(&lad->lad_req_list))
                        continue;
 
-               if (lad->lad_to_post) {
+               if (test_bit(LAD_TO_POST, &lad->lad_flags)) {
                        CDEBUG(D_LFSCK, "%s: %s LFSCK assistant thread post\n",
                               lfsck_lfsck2name(lfsck), lad->lad_name);
 
-                       if (unlikely(lad->lad_exit))
+                       if (unlikely(test_bit(LAD_EXIT, &lad->lad_flags)))
                                GOTO(cleanup, rc = lad->lad_post_result);
 
-                       lad->lad_to_post = 0;
+                       clear_bit(LAD_TO_POST, &lad->lad_flags);
                        LASSERT(lad->lad_post_result > 0);
 
                        /* Wakeup the master engine to go ahead. */
@@ -1684,10 +1684,10 @@ int lfsck_assistant_engine(void *args)
                               lad->lad_name, rc);
                }
 
-               if (lad->lad_to_double_scan) {
-                       lad->lad_to_double_scan = 0;
+               if (test_bit(LAD_TO_DOUBLE_SCAN, &lad->lad_flags)) {
+                       clear_bit(LAD_TO_DOUBLE_SCAN, &lad->lad_flags);
                        atomic_inc(&lfsck->li_double_scan_count);
-                       lad->lad_in_double_scan = 1;
+                       set_bit(LAD_IN_DOUBLE_SCAN, &lad->lad_flags);
                        wake_up_all(&mthread->t_ctl_waitq);
 
                        com->lc_new_checked = 0;
@@ -1711,7 +1711,7 @@ int lfsck_assistant_engine(void *args)
                        if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_NO_DOUBLESCAN))
                                GOTO(cleanup, rc = 0);
 
-                       while (lad->lad_in_double_scan) {
+                       while (test_bit(LAD_IN_DOUBLE_SCAN, &lad->lad_flags)) {
                                rc = lfsck_assistant_query_others(env, com);
                                if (lfsck_phase2_next_ready(lad))
                                        goto p2_next;
@@ -1726,12 +1726,13 @@ int lfsck_assistant_engine(void *args)
                                                           NULL, NULL);
                                rc = l_wait_event(athread->t_ctl_waitq,
                                        lfsck_phase2_next_ready(lad) ||
-                                       lad->lad_exit ||
+                                       test_bit(LAD_EXIT, &lad->lad_flags) ||
                                        !thread_is_running(mthread),
                                        &lwi);
 
-                               if (unlikely(lad->lad_exit ||
-                                            !thread_is_running(mthread)))
+                               if (unlikely(
+                                       test_bit(LAD_EXIT, &lad->lad_flags) ||
+                                       !thread_is_running(mthread)))
                                        GOTO(cleanup, rc = 0);
 
                                if (rc == -ETIMEDOUT)
@@ -1745,8 +1746,9 @@ p2_next:
                                if (rc != 0)
                                        GOTO(cleanup, rc);
 
-                               if (unlikely(lad->lad_exit ||
-                                            !thread_is_running(mthread)))
+                               if (unlikely(
+                                       test_bit(LAD_EXIT, &lad->lad_flags) ||
+                                       !thread_is_running(mthread)))
                                        GOTO(cleanup, rc = 0);
                        }
                }
@@ -1758,7 +1760,7 @@ cleanup:
        if (rc < 0)
                lad->lad_assistant_status = rc;
 
-       if (lad->lad_exit && lad->lad_post_result <= 0)
+       if (test_bit(LAD_EXIT, &lad->lad_flags) && lad->lad_post_result <= 0)
                lao->la_fill_pos(env, com, &lfsck->li_pos_checkpoint);
 
        thread_set_flags(athread, SVC_STOPPING);
@@ -1832,8 +1834,8 @@ cleanup:
        /* Under force exit case, some requests may be just freed without
         * verification, those objects should be re-handled when next run.
         * So not update the on-disk trace file under such case. */
-       if (lad->lad_in_double_scan) {
-               if (!lad->lad_exit)
+       if (test_bit(LAD_IN_DOUBLE_SCAN, &lad->lad_flags)) {
+               if (!test_bit(LAD_EXIT, &lad->lad_flags))
                        rc1 = lao->la_double_scan_result(env, com, rc);
 
                CDEBUG(D_LFSCK, "%s: LFSCK assistant phase2 scan "
@@ -1842,7 +1844,7 @@ cleanup:
        }
 
 fini:
-       if (lad->lad_in_double_scan)
+       if (test_bit(LAD_IN_DOUBLE_SCAN, &lad->lad_flags))
                atomic_dec(&lfsck->li_double_scan_count);
 
        spin_lock(&lad->lad_lock);
index c302b43..16e6a05 100644 (file)
@@ -855,13 +855,16 @@ struct lfsck_assistant_data {
        int                                      lad_prefetched;
        int                                      lad_assistant_status;
        int                                      lad_post_result;
-       unsigned int                             lad_to_post:1,
-                                                lad_to_double_scan:1,
-                                                lad_in_double_scan:1,
-                                                lad_exit:1,
-                                                lad_incomplete:1;
+       unsigned long                            lad_flags;
        bool                                     lad_advance_lock;
 };
+enum {
+       LAD_TO_POST = 0,
+       LAD_TO_DOUBLE_SCAN = 1,
+       LAD_IN_DOUBLE_SCAN = 2,
+       LAD_EXIT = 3,
+       LAD_INCOMPLETE = 4,
+};
 
 #define LFSCK_TMPBUF_LEN       64
 
@@ -1430,7 +1433,7 @@ static inline void lfsck_lad_set_bitmap(const struct lu_env *env,
 
        if (likely(bitmap->size > index)) {
                cfs_bitmap_set(bitmap, index);
-               lad->lad_incomplete = 1;
+               set_bit(LAD_INCOMPLETE, &lad->lad_flags);
        } else if (com->lc_type == LFSCK_TYPE_NAMESPACE) {
                struct lfsck_namespace *ns = com->lc_file_ram;
 
index 1a21536..4f250e0 100644 (file)
@@ -270,7 +270,7 @@ static void lfsck_layout_assistant_sync_failures(const struct lu_env *env,
        int                                rc    = 0;
        ENTRY;
 
-       if (!lad->lad_incomplete)
+       if (!test_bit(LAD_INCOMPLETE, &lad->lad_flags))
                RETURN_EXIT;
 
        /* If the MDT has ever failed to verfiy some OST-objects,
@@ -908,7 +908,7 @@ static int lfsck_layout_load_bitmap(const struct lu_env *env,
        }
 
        if (lo->ll_bitmap_size == 0) {
-               lad->lad_incomplete = 0;
+               clear_bit(LAD_INCOMPLETE, &lad->lad_flags);
                CFS_RESET_BITMAP(bitmap);
 
                RETURN(0);
@@ -920,9 +920,9 @@ static int lfsck_layout_load_bitmap(const struct lu_env *env,
                RETURN(rc >= 0 ? -EINVAL : rc);
 
        if (cfs_bitmap_check_empty(bitmap))
-               lad->lad_incomplete = 0;
+               clear_bit(LAD_INCOMPLETE, &lad->lad_flags);
        else
-               lad->lad_incomplete = 1;
+               set_bit(LAD_INCOMPLETE, &lad->lad_flags);
 
        RETURN(0);
 }
@@ -1467,7 +1467,7 @@ static int lfsck_layout_double_scan_result(const struct lu_env *env,
                        if (lfsck->li_master) {
                                struct lfsck_assistant_data *lad = com->lc_data;
 
-                               if (lad->lad_incomplete)
+                               if (test_bit(LAD_INCOMPLETE, &lad->lad_flags))
                                        lo->ll_status = LS_PARTIAL;
                                else
                                        lo->ll_status = LS_COMPLETED;
@@ -4346,7 +4346,7 @@ out:
        if (rc < 0) {
                struct lfsck_assistant_data *lad = com->lc_data;
 
-               if (unlikely(lad->lad_exit)) {
+               if (unlikely(test_bit(LAD_EXIT, &lad->lad_flags))) {
                        rc = 0;
                } else if (rc == -ENOTCONN || rc == -ESHUTDOWN ||
                           rc == -ETIMEDOUT || rc == -EHOSTDOWN ||
@@ -4536,7 +4536,7 @@ static int lfsck_layout_assistant_handler_p2(const struct lu_env *env,
                        if (rc != 0 && bk->lb_param & LPF_FAILOUT)
                                RETURN(rc);
 
-                       if (unlikely(lad->lad_exit ||
+                       if (unlikely(test_bit(LAD_EXIT, &lad->lad_flags) ||
                                     !thread_is_running(&lfsck->li_thread)))
                                RETURN(0);
                        spin_lock(&ltds->ltd_lock);
@@ -5083,7 +5083,7 @@ static int lfsck_layout_reset(const struct lu_env *env,
        if (com->lc_lfsck->li_master) {
                struct lfsck_assistant_data *lad = com->lc_data;
 
-               lad->lad_incomplete = 0;
+               clear_bit(LAD_INCOMPLETE, &lad->lad_flags);
                CFS_RESET_BITMAP(lad->lad_bitmap);
        }
 
index 0581be4..4e96050 100644 (file)
@@ -2510,10 +2510,7 @@ int lfsck_start_assistant(const struct lu_env *env, struct lfsck_component *com,
 
        lad->lad_assistant_status = 0;
        lad->lad_post_result = 0;
-       lad->lad_to_post = 0;
-       lad->lad_to_double_scan = 0;
-       lad->lad_in_double_scan = 0;
-       lad->lad_exit = 0;
+       lad->lad_flags = 0;
        lad->lad_advance_lock = false;
        thread_set_flags(athread, 0);
 
@@ -2577,8 +2574,8 @@ void lfsck_post_generic(const struct lu_env *env,
 
        lad->lad_post_result = *result;
        if (*result <= 0)
-               lad->lad_exit = 1;
-       lad->lad_to_post = 1;
+               set_bit(LAD_EXIT, &lad->lad_flags);
+       set_bit(LAD_TO_POST, &lad->lad_flags);
 
        CDEBUG(D_LFSCK, "%s: waiting for assistant to do %s post, rc = %d\n",
               lfsck_lfsck2name(com->lc_lfsck), lad->lad_name, *result);
@@ -2605,9 +2602,9 @@ int lfsck_double_scan_generic(const struct lu_env *env,
        struct l_wait_info               lwi     = { 0 };
 
        if (status != LS_SCANNING_PHASE2)
-               lad->lad_exit = 1;
+               set_bit(LAD_EXIT, &lad->lad_flags);
        else
-               lad->lad_to_double_scan = 1;
+               set_bit(LAD_TO_DOUBLE_SCAN, &lad->lad_flags);
 
        CDEBUG(D_LFSCK, "%s: waiting for assistant to do %s double_scan, "
               "status %d\n",
@@ -2615,7 +2612,7 @@ int lfsck_double_scan_generic(const struct lu_env *env,
 
        wake_up_all(&athread->t_ctl_waitq);
        l_wait_event(mthread->t_ctl_waitq,
-                    lad->lad_in_double_scan ||
+                    test_bit(LAD_IN_DOUBLE_SCAN, &lad->lad_flags) ||
                     thread_is_stopped(athread),
                     &lwi);
 
@@ -2637,7 +2634,7 @@ void lfsck_quit_generic(const struct lu_env *env,
        struct ptlrpc_thread            *athread = &lad->lad_thread;
        struct l_wait_info               lwi     = { 0 };
 
-       lad->lad_exit = 1;
+       set_bit(LAD_EXIT, &lad->lad_flags);
        wake_up_all(&athread->t_ctl_waitq);
        l_wait_event(mthread->t_ctl_waitq,
                     thread_is_init(athread) ||
index d88a352..269a2f0 100644 (file)
@@ -312,7 +312,7 @@ static int lfsck_namespace_load_bitmap(const struct lu_env *env,
        }
 
        if (ns->ln_bitmap_size == 0) {
-               lad->lad_incomplete = 0;
+               clear_bit(LAD_INCOMPLETE, &lad->lad_flags);
                CFS_RESET_BITMAP(bitmap);
 
                RETURN(0);
@@ -326,9 +326,9 @@ static int lfsck_namespace_load_bitmap(const struct lu_env *env,
                RETURN(rc >= 0 ? -EINVAL : rc);
 
        if (cfs_bitmap_check_empty(bitmap))
-               lad->lad_incomplete = 0;
+               clear_bit(LAD_INCOMPLETE, &lad->lad_flags);
        else
-               lad->lad_incomplete = 1;
+               set_bit(LAD_INCOMPLETE, &lad->lad_flags);
 
        RETURN(0);
 }
@@ -4100,7 +4100,7 @@ static int lfsck_namespace_reset(const struct lu_env *env,
        if (rc != 0)
                GOTO(out, rc);
 
-       lad->lad_incomplete = 0;
+       clear_bit(LAD_INCOMPLETE, &lad->lad_flags);
        CFS_RESET_BITMAP(lad->lad_bitmap);
 
        rc = lfsck_namespace_store(env, com);
@@ -6555,7 +6555,7 @@ static void lfsck_namespace_assistant_sync_failures(const struct lu_env *env,
        int                                rc    = 0;
        ENTRY;
 
-       if (!lad->lad_incomplete)
+       if (!test_bit(LAD_INCOMPLETE, &lad->lad_flags))
                RETURN_EXIT;
 
        set = ptlrpc_prep_set();