From f6ef1b797f2f6b28e7c5860b6cf16759cadfc9a4 Mon Sep 17 00:00:00 2001 From: Fan Yong Date: Tue, 4 Nov 2014 16:02:22 +0800 Subject: [PATCH] LU-6146 tests: race condition for check/use cfs_fail_val There are some race conditions when check/use cfs_fail_val. For example: when inject failure stub for LFSCK test as following: 764 if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY2) && 765 cfs_fail_val > 0) { 766 struct l_wait_info lwi; 767 768 lwi = LWI_TIMEOUT(cfs_time_seconds(cfs_fail_val), 769 NULL, NULL); 770 l_wait_event(thread->t_ctl_waitq, 771 !thread_is_running(thread), 772 &lwi); 773 774 if (unlikely(!thread_is_running(thread))) { 775 CDEBUG(D_LFSCK, "%s: scan dir exit for engine " 776 "stop, parent "DFID", cookie "LPX64"n", 777 lfsck_lfsck2name(lfsck), 778 PFID(lfsck_dto2fid(dir)), 779 lfsck->li_cookie_dir); 780 RETURN(0); 781 } 782 } The "cfs_fail_val" may be changed as zero by others after the check at the line 765 but before using it at the line 768. Then the LFSCK engine will fall into "wait" until someone run "lfsck_stop". Signed-off-by: Fan Yong Change-Id: I418621faaf6a1f42ba1d541b37374c1dc21831be Reviewed-on: http://review.whamcloud.com/13481 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Lai Siyao Reviewed-by: Andreas Dilger --- libcfs/libcfs/fail.c | 2 +- lustre/lfsck/lfsck_engine.c | 48 ++++++++++++---------------------------- lustre/lfsck/lfsck_layout.c | 32 +++++++++++++-------------- lustre/lfsck/lfsck_namespace.c | 32 +++++---------------------- lustre/lfsck/lfsck_striped_dir.c | 16 +++----------- lustre/osd-ldiskfs/osd_scrub.c | 9 ++++---- 6 files changed, 45 insertions(+), 94 deletions(-) diff --git a/libcfs/libcfs/fail.c b/libcfs/libcfs/fail.c index 58a698e..af8784d 100644 --- a/libcfs/libcfs/fail.c +++ b/libcfs/libcfs/fail.c @@ -127,7 +127,7 @@ int __cfs_fail_timeout_set(__u32 id, __u32 value, int ms, int set) int ret = 0; ret = __cfs_fail_check_set(id, value, set); - if (ret) { + if (ret && likely(ms > 0)) { CERROR("cfs_fail_timeout id %x sleeping for %dms\n", id, ms); schedule_timeout_and_set_state(TASK_UNINTERRUPTIBLE, diff --git a/lustre/lfsck/lfsck_engine.c b/lustre/lfsck/lfsck_engine.c index 950c3ec..1b0ce1e 100644 --- a/lustre/lfsck/lfsck_engine.c +++ b/lustre/lfsck/lfsck_engine.c @@ -761,24 +761,14 @@ static int lfsck_master_dir_engine(const struct lu_env *env, ENTRY; do { - if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY2) && - cfs_fail_val > 0) { - struct l_wait_info lwi; - - lwi = LWI_TIMEOUT(cfs_time_seconds(cfs_fail_val), - NULL, NULL); - l_wait_event(thread->t_ctl_waitq, - !thread_is_running(thread), - &lwi); - - if (unlikely(!thread_is_running(thread))) { - CDEBUG(D_LFSCK, "%s: scan dir exit for engine " - "stop, parent "DFID", cookie "LPX64"\n", - lfsck_lfsck2name(lfsck), - PFID(lfsck_dto2fid(dir)), - lfsck->li_cookie_dir); - RETURN(0); - } + if (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY2, cfs_fail_val) && + unlikely(!thread_is_running(thread))) { + CDEBUG(D_LFSCK, "%s: scan dir exit for engine stop, " + "parent "DFID", cookie "LPX64"\n", + lfsck_lfsck2name(lfsck), + PFID(lfsck_dto2fid(dir)), lfsck->li_cookie_dir); + + RETURN(0); } lfsck->li_new_scanned++; @@ -897,23 +887,13 @@ static int lfsck_master_oit_engine(const struct lu_env *env, if (unlikely(lfsck->li_oit_over)) RETURN(1); - if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY1) && - cfs_fail_val > 0) { - struct l_wait_info lwi; - - lwi = LWI_TIMEOUT(cfs_time_seconds(cfs_fail_val), - NULL, NULL); - l_wait_event(thread->t_ctl_waitq, - !thread_is_running(thread), - &lwi); + if (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY1, cfs_fail_val) && + unlikely(!thread_is_running(thread))) { + CDEBUG(D_LFSCK, "%s: OIT scan exit for engine stop, " + "cookie "LPU64"\n", + lfsck_lfsck2name(lfsck), iops->store(env, di)); - if (unlikely(!thread_is_running(thread))) { - CDEBUG(D_LFSCK, "%s: OIT scan exit for engine " - "stop, cookie "LPU64"\n", - lfsck_lfsck2name(lfsck), - iops->store(env, di)); - RETURN(0); - } + RETURN(0); } if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_CRASH)) diff --git a/lustre/lfsck/lfsck_layout.c b/lustre/lfsck/lfsck_layout.c index 148fbdf..a16396a 100644 --- a/lustre/lfsck/lfsck_layout.c +++ b/lustre/lfsck/lfsck_layout.c @@ -1310,11 +1310,19 @@ lfsck_layout_lastid_load(const struct lu_env *env, cfs_time_seconds(cfs_fail_val), NULL, NULL); - up_write(&com->lc_sem); - l_wait_event(lfsck->li_thread.t_ctl_waitq, - !thread_is_running(&lfsck->li_thread), - &lwi); - down_write(&com->lc_sem); + /* Some others may changed the cfs_fail_val + * as zero after above check, re-check it for + * sure to avoid falling into wait for ever. */ + if (likely(lwi.lwi_timeout > 0)) { + struct ptlrpc_thread *thread = + &lfsck->li_thread; + + up_write(&com->lc_sem); + l_wait_event(thread->t_ctl_waitq, + !thread_is_running(thread), + &lwi); + down_write(&com->lc_sem); + } } } @@ -2625,17 +2633,9 @@ static int lfsck_layout_scan_orphan(const struct lu_env *env, struct dt_key *key; struct lu_orphan_rec *rec = &info->lti_rec; - if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY3) && - cfs_fail_val > 0) { - struct ptlrpc_thread *thread = &lfsck->li_thread; - struct l_wait_info lwi; - - lwi = LWI_TIMEOUT(cfs_time_seconds(cfs_fail_val), - NULL, NULL); - l_wait_event(thread->t_ctl_waitq, - !thread_is_running(thread), - &lwi); - } + if (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY3, cfs_fail_val) && + unlikely(!thread_is_running(&lfsck->li_thread))) + break; key = iops->key(env, di); com->lc_fid_latest_scanned_phase2 = *(struct lu_fid *)key; diff --git a/lustre/lfsck/lfsck_namespace.c b/lustre/lfsck/lfsck_namespace.c index 887c145..f254e1d7 100644 --- a/lustre/lfsck/lfsck_namespace.c +++ b/lustre/lfsck/lfsck_namespace.c @@ -5519,19 +5519,9 @@ static void lfsck_namespace_scan_local_lpf(const struct lu_env *env, rc = 0; while (rc == 0) { - if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY3) && - cfs_fail_val > 0) { - struct l_wait_info lwi; - - lwi = LWI_TIMEOUT(cfs_time_seconds(cfs_fail_val), - NULL, NULL); - l_wait_event(thread->t_ctl_waitq, - !thread_is_running(thread), - &lwi); - - if (unlikely(!thread_is_running(thread))) - break; - } + if (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY3, cfs_fail_val) && + unlikely(!thread_is_running(thread))) + break; rc = iops->rec(env, di, (struct dt_rec *)ent, LUDA_64BITHASH | LUDA_TYPE); @@ -5795,19 +5785,9 @@ static int lfsck_namespace_assistant_handler_p2(const struct lu_env *env, GOTO(put, rc); do { - if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY3) && - cfs_fail_val > 0) { - struct l_wait_info lwi; - - lwi = LWI_TIMEOUT(cfs_time_seconds(cfs_fail_val), - NULL, NULL); - l_wait_event(thread->t_ctl_waitq, - !thread_is_running(thread), - &lwi); - - if (unlikely(!thread_is_running(thread))) - GOTO(put, rc = 0); - } + if (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY3, cfs_fail_val) && + unlikely(!thread_is_running(thread))) + GOTO(put, rc = 0); key = iops->key(env, di); fid_be_to_cpu(&fid, (const struct lu_fid *)key); diff --git a/lustre/lfsck/lfsck_striped_dir.c b/lustre/lfsck/lfsck_striped_dir.c index b0655d0..c21e1dc 100644 --- a/lustre/lfsck/lfsck_striped_dir.c +++ b/lustre/lfsck/lfsck_striped_dir.c @@ -1603,19 +1603,9 @@ int lfsck_namespace_scan_shard(const struct lu_env *env, rc = 0; while (rc == 0) { - if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY3) && - cfs_fail_val > 0) { - struct l_wait_info lwi; - - lwi = LWI_TIMEOUT(cfs_time_seconds(cfs_fail_val), - NULL, NULL); - l_wait_event(thread->t_ctl_waitq, - !thread_is_running(thread), - &lwi); - - if (unlikely(!thread_is_running(thread))) - GOTO(out, rc = 0); - } + if (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY3, cfs_fail_val) && + unlikely(!thread_is_running(thread))) + GOTO(out, rc = 0); rc = iops->rec(env, di, (struct dt_rec *)ent, args); if (rc == 0) diff --git a/lustre/osd-ldiskfs/osd_scrub.c b/lustre/osd-ldiskfs/osd_scrub.c index d31f720..4b3e4a1 100644 --- a/lustre/osd-ldiskfs/osd_scrub.c +++ b/lustre/osd-ldiskfs/osd_scrub.c @@ -975,10 +975,11 @@ static int osd_scrub_next(struct osd_thread_info *info, struct osd_device *dev, struct l_wait_info lwi; lwi = LWI_TIMEOUT(cfs_time_seconds(cfs_fail_val), NULL, NULL); - l_wait_event(thread->t_ctl_waitq, - !list_empty(&scrub->os_inconsistent_items) || - !thread_is_running(thread), - &lwi); + if (likely(lwi.lwi_timeout > 0)) + l_wait_event(thread->t_ctl_waitq, + !list_empty(&scrub->os_inconsistent_items) || + !thread_is_running(thread), + &lwi); } if (OBD_FAIL_CHECK(OBD_FAIL_OSD_SCRUB_CRASH)) { -- 1.8.3.1