Whamcloud - gitweb
LU-6146 tests: race condition for check/use cfs_fail_val 81/13481/9
authorFan Yong <fan.yong@intel.com>
Tue, 4 Nov 2014 08:02:22 +0000 (16:02 +0800)
committerOleg Drokin <oleg.drokin@intel.com>
Sun, 25 Jan 2015 01:53:26 +0000 (01:53 +0000)
There are some race conditions when check/use cfs_fail_val.
For example: when inject failure stub for LFSCK test as following:

764   if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY2) &&
765       cfs_fail_val > 0) {
766           struct l_wait_info lwi;
767
768           lwi = LWI_TIMEOUT(cfs_time_seconds(cfs_fail_val),
769                             NULL, NULL);
770           l_wait_event(thread->t_ctl_waitq,
771                        !thread_is_running(thread),
772                        &lwi);
773
774           if (unlikely(!thread_is_running(thread))) {
775                   CDEBUG(D_LFSCK, "%s: scan dir exit for engine "
776                          "stop, parent "DFID", cookie "LPX64"n",
777                          lfsck_lfsck2name(lfsck),
778                          PFID(lfsck_dto2fid(dir)),
779                          lfsck->li_cookie_dir);
780                   RETURN(0);
781           }
782   }

The "cfs_fail_val" may be changed as zero by others after the check
at the line 765 but before using it at the line 768. Then the LFSCK
engine will fall into "wait" until someone run "lfsck_stop".

Signed-off-by: Fan Yong <fan.yong@intel.com>
Change-Id: I418621faaf6a1f42ba1d541b37374c1dc21831be
Reviewed-on: http://review.whamcloud.com/13481
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Lai Siyao <lai.siyao@intel.com>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
libcfs/libcfs/fail.c
lustre/lfsck/lfsck_engine.c
lustre/lfsck/lfsck_layout.c
lustre/lfsck/lfsck_namespace.c
lustre/lfsck/lfsck_striped_dir.c
lustre/osd-ldiskfs/osd_scrub.c

index 58a698e..af8784d 100644 (file)
@@ -127,7 +127,7 @@ int __cfs_fail_timeout_set(__u32 id, __u32 value, int ms, int set)
        int ret = 0;
 
        ret = __cfs_fail_check_set(id, value, set);
        int ret = 0;
 
        ret = __cfs_fail_check_set(id, value, set);
-       if (ret) {
+       if (ret && likely(ms > 0)) {
                CERROR("cfs_fail_timeout id %x sleeping for %dms\n",
                       id, ms);
                schedule_timeout_and_set_state(TASK_UNINTERRUPTIBLE,
                CERROR("cfs_fail_timeout id %x sleeping for %dms\n",
                       id, ms);
                schedule_timeout_and_set_state(TASK_UNINTERRUPTIBLE,
index 950c3ec..1b0ce1e 100644 (file)
@@ -761,24 +761,14 @@ static int lfsck_master_dir_engine(const struct lu_env *env,
        ENTRY;
 
        do {
        ENTRY;
 
        do {
-               if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY2) &&
-                   cfs_fail_val > 0) {
-                       struct l_wait_info lwi;
-
-                       lwi = LWI_TIMEOUT(cfs_time_seconds(cfs_fail_val),
-                                         NULL, NULL);
-                       l_wait_event(thread->t_ctl_waitq,
-                                    !thread_is_running(thread),
-                                    &lwi);
-
-                       if (unlikely(!thread_is_running(thread))) {
-                               CDEBUG(D_LFSCK, "%s: scan dir exit for engine "
-                                      "stop, parent "DFID", cookie "LPX64"\n",
-                                      lfsck_lfsck2name(lfsck),
-                                      PFID(lfsck_dto2fid(dir)),
-                                      lfsck->li_cookie_dir);
-                               RETURN(0);
-                       }
+               if (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY2, cfs_fail_val) &&
+                   unlikely(!thread_is_running(thread))) {
+                       CDEBUG(D_LFSCK, "%s: scan dir exit for engine stop, "
+                              "parent "DFID", cookie "LPX64"\n",
+                              lfsck_lfsck2name(lfsck),
+                              PFID(lfsck_dto2fid(dir)), lfsck->li_cookie_dir);
+
+                       RETURN(0);
                }
 
                lfsck->li_new_scanned++;
                }
 
                lfsck->li_new_scanned++;
@@ -897,23 +887,13 @@ static int lfsck_master_oit_engine(const struct lu_env *env,
                if (unlikely(lfsck->li_oit_over))
                        RETURN(1);
 
                if (unlikely(lfsck->li_oit_over))
                        RETURN(1);
 
-               if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY1) &&
-                   cfs_fail_val > 0) {
-                       struct l_wait_info lwi;
-
-                       lwi = LWI_TIMEOUT(cfs_time_seconds(cfs_fail_val),
-                                         NULL, NULL);
-                       l_wait_event(thread->t_ctl_waitq,
-                                    !thread_is_running(thread),
-                                    &lwi);
+               if (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY1, cfs_fail_val) &&
+                   unlikely(!thread_is_running(thread))) {
+                       CDEBUG(D_LFSCK, "%s: OIT scan exit for engine stop, "
+                              "cookie "LPU64"\n",
+                              lfsck_lfsck2name(lfsck), iops->store(env, di));
 
 
-                       if (unlikely(!thread_is_running(thread))) {
-                               CDEBUG(D_LFSCK, "%s: OIT scan exit for engine "
-                                      "stop, cookie "LPU64"\n",
-                                      lfsck_lfsck2name(lfsck),
-                                      iops->store(env, di));
-                               RETURN(0);
-                       }
+                       RETURN(0);
                }
 
                if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_CRASH))
                }
 
                if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_CRASH))
index 148fbdf..a16396a 100644 (file)
@@ -1310,11 +1310,19 @@ lfsck_layout_lastid_load(const struct lu_env *env,
                                                cfs_time_seconds(cfs_fail_val),
                                                NULL, NULL);
 
                                                cfs_time_seconds(cfs_fail_val),
                                                NULL, NULL);
 
-                               up_write(&com->lc_sem);
-                               l_wait_event(lfsck->li_thread.t_ctl_waitq,
-                                            !thread_is_running(&lfsck->li_thread),
-                                            &lwi);
-                               down_write(&com->lc_sem);
+                               /* Some others may changed the cfs_fail_val
+                                * as zero after above check, re-check it for
+                                * sure to avoid falling into wait for ever. */
+                               if (likely(lwi.lwi_timeout > 0)) {
+                                       struct ptlrpc_thread *thread =
+                                               &lfsck->li_thread;
+
+                                       up_write(&com->lc_sem);
+                                       l_wait_event(thread->t_ctl_waitq,
+                                                    !thread_is_running(thread),
+                                                    &lwi);
+                                       down_write(&com->lc_sem);
+                               }
                        }
                }
 
                        }
                }
 
@@ -2625,17 +2633,9 @@ static int lfsck_layout_scan_orphan(const struct lu_env *env,
                struct dt_key           *key;
                struct lu_orphan_rec    *rec = &info->lti_rec;
 
                struct dt_key           *key;
                struct lu_orphan_rec    *rec = &info->lti_rec;
 
-               if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY3) &&
-                   cfs_fail_val > 0) {
-                       struct ptlrpc_thread    *thread = &lfsck->li_thread;
-                       struct l_wait_info       lwi;
-
-                       lwi = LWI_TIMEOUT(cfs_time_seconds(cfs_fail_val),
-                                         NULL, NULL);
-                       l_wait_event(thread->t_ctl_waitq,
-                                    !thread_is_running(thread),
-                                    &lwi);
-               }
+               if (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY3, cfs_fail_val) &&
+                   unlikely(!thread_is_running(&lfsck->li_thread)))
+                       break;
 
                key = iops->key(env, di);
                com->lc_fid_latest_scanned_phase2 = *(struct lu_fid *)key;
 
                key = iops->key(env, di);
                com->lc_fid_latest_scanned_phase2 = *(struct lu_fid *)key;
index 887c145..f254e1d 100644 (file)
@@ -5519,19 +5519,9 @@ static void lfsck_namespace_scan_local_lpf(const struct lu_env *env,
                rc = 0;
 
        while (rc == 0) {
                rc = 0;
 
        while (rc == 0) {
-               if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY3) &&
-                   cfs_fail_val > 0) {
-                       struct l_wait_info lwi;
-
-                       lwi = LWI_TIMEOUT(cfs_time_seconds(cfs_fail_val),
-                                         NULL, NULL);
-                       l_wait_event(thread->t_ctl_waitq,
-                                    !thread_is_running(thread),
-                                    &lwi);
-
-                       if (unlikely(!thread_is_running(thread)))
-                               break;
-               }
+               if (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY3, cfs_fail_val) &&
+                   unlikely(!thread_is_running(thread)))
+                       break;
 
                rc = iops->rec(env, di, (struct dt_rec *)ent,
                               LUDA_64BITHASH | LUDA_TYPE);
 
                rc = iops->rec(env, di, (struct dt_rec *)ent,
                               LUDA_64BITHASH | LUDA_TYPE);
@@ -5795,19 +5785,9 @@ static int lfsck_namespace_assistant_handler_p2(const struct lu_env *env,
                GOTO(put, rc);
 
        do {
                GOTO(put, rc);
 
        do {
-               if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY3) &&
-                   cfs_fail_val > 0) {
-                       struct l_wait_info lwi;
-
-                       lwi = LWI_TIMEOUT(cfs_time_seconds(cfs_fail_val),
-                                         NULL, NULL);
-                       l_wait_event(thread->t_ctl_waitq,
-                                    !thread_is_running(thread),
-                                    &lwi);
-
-                       if (unlikely(!thread_is_running(thread)))
-                               GOTO(put, rc = 0);
-               }
+               if (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY3, cfs_fail_val) &&
+                   unlikely(!thread_is_running(thread)))
+                       GOTO(put, rc = 0);
 
                key = iops->key(env, di);
                fid_be_to_cpu(&fid, (const struct lu_fid *)key);
 
                key = iops->key(env, di);
                fid_be_to_cpu(&fid, (const struct lu_fid *)key);
index b0655d0..c21e1dc 100644 (file)
@@ -1603,19 +1603,9 @@ int lfsck_namespace_scan_shard(const struct lu_env *env,
                rc = 0;
 
        while (rc == 0) {
                rc = 0;
 
        while (rc == 0) {
-               if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY3) &&
-                   cfs_fail_val > 0) {
-                       struct l_wait_info lwi;
-
-                       lwi = LWI_TIMEOUT(cfs_time_seconds(cfs_fail_val),
-                                         NULL, NULL);
-                       l_wait_event(thread->t_ctl_waitq,
-                                    !thread_is_running(thread),
-                                    &lwi);
-
-                       if (unlikely(!thread_is_running(thread)))
-                               GOTO(out, rc = 0);
-               }
+               if (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY3, cfs_fail_val) &&
+                   unlikely(!thread_is_running(thread)))
+                       GOTO(out, rc = 0);
 
                rc = iops->rec(env, di, (struct dt_rec *)ent, args);
                if (rc == 0)
 
                rc = iops->rec(env, di, (struct dt_rec *)ent, args);
                if (rc == 0)
index d31f720..4b3e4a1 100644 (file)
@@ -975,10 +975,11 @@ static int osd_scrub_next(struct osd_thread_info *info, struct osd_device *dev,
                struct l_wait_info lwi;
 
                lwi = LWI_TIMEOUT(cfs_time_seconds(cfs_fail_val), NULL, NULL);
                struct l_wait_info lwi;
 
                lwi = LWI_TIMEOUT(cfs_time_seconds(cfs_fail_val), NULL, NULL);
-               l_wait_event(thread->t_ctl_waitq,
-                            !list_empty(&scrub->os_inconsistent_items) ||
-                            !thread_is_running(thread),
-                            &lwi);
+               if (likely(lwi.lwi_timeout > 0))
+                       l_wait_event(thread->t_ctl_waitq,
+                               !list_empty(&scrub->os_inconsistent_items) ||
+                               !thread_is_running(thread),
+                               &lwi);
        }
 
        if (OBD_FAIL_CHECK(OBD_FAIL_OSD_SCRUB_CRASH)) {
        }
 
        if (OBD_FAIL_CHECK(OBD_FAIL_OSD_SCRUB_CRASH)) {