There are some race conditions between checking and using cfs_fail_val.
For example, consider the failure injection stub for the LFSCK test below:
764 if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY2) &&
765 cfs_fail_val > 0) {
766 struct l_wait_info lwi;
767
768 lwi = LWI_TIMEOUT(cfs_time_seconds(cfs_fail_val),
769 NULL, NULL);
770 l_wait_event(thread->t_ctl_waitq,
771 !thread_is_running(thread),
772 &lwi);
773
774 if (unlikely(!thread_is_running(thread))) {
775 CDEBUG(D_LFSCK, "%s: scan dir exit for engine "
776 "stop, parent "DFID", cookie "LPX64"\n",
777 lfsck_lfsck2name(lfsck),
778 PFID(lfsck_dto2fid(dir)),
779 lfsck->li_cookie_dir);
780 RETURN(0);
781 }
782 }
The "cfs_fail_val" may be reset to zero by someone else after the check
at line 765 but before it is used at line 768. The wait then has a zero
timeout, so the LFSCK engine blocks until someone runs "lfsck_stop".
Signed-off-by: Fan Yong <fan.yong@intel.com>
Change-Id: I418621faaf6a1f42ba1d541b37374c1dc21831be
Reviewed-on: http://review.whamcloud.com/13481
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Lai Siyao <lai.siyao@intel.com>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
int ret = 0;
ret = __cfs_fail_check_set(id, value, set);
int ret = 0;
ret = __cfs_fail_check_set(id, value, set);
+ if (ret && likely(ms > 0)) {
CERROR("cfs_fail_timeout id %x sleeping for %dms\n",
id, ms);
schedule_timeout_and_set_state(TASK_UNINTERRUPTIBLE,
CERROR("cfs_fail_timeout id %x sleeping for %dms\n",
id, ms);
schedule_timeout_and_set_state(TASK_UNINTERRUPTIBLE,
- if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY2) &&
- cfs_fail_val > 0) {
- struct l_wait_info lwi;
-
- lwi = LWI_TIMEOUT(cfs_time_seconds(cfs_fail_val),
- NULL, NULL);
- l_wait_event(thread->t_ctl_waitq,
- !thread_is_running(thread),
- &lwi);
-
- if (unlikely(!thread_is_running(thread))) {
- CDEBUG(D_LFSCK, "%s: scan dir exit for engine "
- "stop, parent "DFID", cookie "LPX64"\n",
- lfsck_lfsck2name(lfsck),
- PFID(lfsck_dto2fid(dir)),
- lfsck->li_cookie_dir);
- RETURN(0);
- }
+ if (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY2, cfs_fail_val) &&
+ unlikely(!thread_is_running(thread))) {
+ CDEBUG(D_LFSCK, "%s: scan dir exit for engine stop, "
+ "parent "DFID", cookie "LPX64"\n",
+ lfsck_lfsck2name(lfsck),
+ PFID(lfsck_dto2fid(dir)), lfsck->li_cookie_dir);
+
+ RETURN(0);
}
lfsck->li_new_scanned++;
}
lfsck->li_new_scanned++;
if (unlikely(lfsck->li_oit_over))
RETURN(1);
if (unlikely(lfsck->li_oit_over))
RETURN(1);
- if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY1) &&
- cfs_fail_val > 0) {
- struct l_wait_info lwi;
-
- lwi = LWI_TIMEOUT(cfs_time_seconds(cfs_fail_val),
- NULL, NULL);
- l_wait_event(thread->t_ctl_waitq,
- !thread_is_running(thread),
- &lwi);
+ if (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY1, cfs_fail_val) &&
+ unlikely(!thread_is_running(thread))) {
+ CDEBUG(D_LFSCK, "%s: OIT scan exit for engine stop, "
+ "cookie "LPU64"\n",
+ lfsck_lfsck2name(lfsck), iops->store(env, di));
- if (unlikely(!thread_is_running(thread))) {
- CDEBUG(D_LFSCK, "%s: OIT scan exit for engine "
- "stop, cookie "LPU64"\n",
- lfsck_lfsck2name(lfsck),
- iops->store(env, di));
- RETURN(0);
- }
}
if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_CRASH))
}
if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_CRASH))
cfs_time_seconds(cfs_fail_val),
NULL, NULL);
cfs_time_seconds(cfs_fail_val),
NULL, NULL);
- up_write(&com->lc_sem);
- l_wait_event(lfsck->li_thread.t_ctl_waitq,
- !thread_is_running(&lfsck->li_thread),
- &lwi);
- down_write(&com->lc_sem);
+ /* Someone else may have reset cfs_fail_val
+ * to zero after the above check; re-check it
+ * to avoid waiting forever. */
+ if (likely(lwi.lwi_timeout > 0)) {
+ struct ptlrpc_thread *thread =
+ &lfsck->li_thread;
+
+ up_write(&com->lc_sem);
+ l_wait_event(thread->t_ctl_waitq,
+ !thread_is_running(thread),
+ &lwi);
+ down_write(&com->lc_sem);
+ }
struct dt_key *key;
struct lu_orphan_rec *rec = &info->lti_rec;
struct dt_key *key;
struct lu_orphan_rec *rec = &info->lti_rec;
- if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY3) &&
- cfs_fail_val > 0) {
- struct ptlrpc_thread *thread = &lfsck->li_thread;
- struct l_wait_info lwi;
-
- lwi = LWI_TIMEOUT(cfs_time_seconds(cfs_fail_val),
- NULL, NULL);
- l_wait_event(thread->t_ctl_waitq,
- !thread_is_running(thread),
- &lwi);
- }
+ if (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY3, cfs_fail_val) &&
+ unlikely(!thread_is_running(&lfsck->li_thread)))
+ break;
key = iops->key(env, di);
com->lc_fid_latest_scanned_phase2 = *(struct lu_fid *)key;
key = iops->key(env, di);
com->lc_fid_latest_scanned_phase2 = *(struct lu_fid *)key;
rc = 0;
while (rc == 0) {
rc = 0;
while (rc == 0) {
- if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY3) &&
- cfs_fail_val > 0) {
- struct l_wait_info lwi;
-
- lwi = LWI_TIMEOUT(cfs_time_seconds(cfs_fail_val),
- NULL, NULL);
- l_wait_event(thread->t_ctl_waitq,
- !thread_is_running(thread),
- &lwi);
-
- if (unlikely(!thread_is_running(thread)))
- break;
- }
+ if (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY3, cfs_fail_val) &&
+ unlikely(!thread_is_running(thread)))
+ break;
rc = iops->rec(env, di, (struct dt_rec *)ent,
LUDA_64BITHASH | LUDA_TYPE);
rc = iops->rec(env, di, (struct dt_rec *)ent,
LUDA_64BITHASH | LUDA_TYPE);
- if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY3) &&
- cfs_fail_val > 0) {
- struct l_wait_info lwi;
-
- lwi = LWI_TIMEOUT(cfs_time_seconds(cfs_fail_val),
- NULL, NULL);
- l_wait_event(thread->t_ctl_waitq,
- !thread_is_running(thread),
- &lwi);
-
- if (unlikely(!thread_is_running(thread)))
- GOTO(put, rc = 0);
- }
+ if (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY3, cfs_fail_val) &&
+ unlikely(!thread_is_running(thread)))
+ GOTO(put, rc = 0);
key = iops->key(env, di);
fid_be_to_cpu(&fid, (const struct lu_fid *)key);
key = iops->key(env, di);
fid_be_to_cpu(&fid, (const struct lu_fid *)key);
rc = 0;
while (rc == 0) {
rc = 0;
while (rc == 0) {
- if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY3) &&
- cfs_fail_val > 0) {
- struct l_wait_info lwi;
-
- lwi = LWI_TIMEOUT(cfs_time_seconds(cfs_fail_val),
- NULL, NULL);
- l_wait_event(thread->t_ctl_waitq,
- !thread_is_running(thread),
- &lwi);
-
- if (unlikely(!thread_is_running(thread)))
- GOTO(out, rc = 0);
- }
+ if (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY3, cfs_fail_val) &&
+ unlikely(!thread_is_running(thread)))
+ GOTO(out, rc = 0);
rc = iops->rec(env, di, (struct dt_rec *)ent, args);
if (rc == 0)
rc = iops->rec(env, di, (struct dt_rec *)ent, args);
if (rc == 0)
struct l_wait_info lwi;
lwi = LWI_TIMEOUT(cfs_time_seconds(cfs_fail_val), NULL, NULL);
struct l_wait_info lwi;
lwi = LWI_TIMEOUT(cfs_time_seconds(cfs_fail_val), NULL, NULL);
- l_wait_event(thread->t_ctl_waitq,
- !list_empty(&scrub->os_inconsistent_items) ||
- !thread_is_running(thread),
- &lwi);
+ if (likely(lwi.lwi_timeout > 0))
+ l_wait_event(thread->t_ctl_waitq,
+ !list_empty(&scrub->os_inconsistent_items) ||
+ !thread_is_running(thread),
+ &lwi);
}
if (OBD_FAIL_CHECK(OBD_FAIL_OSD_SCRUB_CRASH)) {
}
if (OBD_FAIL_CHECK(OBD_FAIL_OSD_SCRUB_CRASH)) {