From 555eb580e547fb263faee0a4d88482789171211b Mon Sep 17 00:00:00 2001 From: Fan Yong Date: Sun, 13 Jul 2014 20:11:56 +0800 Subject: [PATCH 1/1] LU-5395 lfsck: deadlock between LFSCK and destroy There is potential deadlock race condition between object destroy and layout LFSCK. Consider the following scenario: 1) The LFSCK thread obtained the parent object firstly, at that time, the parent object has not been destroyed yet. 2) One RPC service thread destroyed the parent and all its children objects. Because the LFSCK is referencing the parent object, then the parent object will be marked as dying in RAM. On the other hand, the parent object is referencing all its children objects, then all children objects will be marked as dying in RAM also. 3) The LFSCK thread tries to find some child object with the parent object referenced. Then it will find that the child object is dying. According to the object visibility rules: the object with dying flag cannot be returned to others. So the LFSCK thread has to wait until the dying object has been purged from RAM, then it can allocate a new object (with the same FID) in RAM. Unfortunately, the LFSCK thread itself is referencing the parent object, and cause the parent object cannot be purged, then cause the child object cannot be purged also. So the LFSCK thread will fall into deadlock. We introduce non-blocked version lu_object_find() to allow the LFSCK thread to return failure immediately (instead of wait) when it finds dying (child) object, then the LFSCK thread can check whether the parent object is dying or not. So avoid above deadlock. Signed-off-by: Fan Yong Change-Id: I7f465259011ad5fb92ef1b4dba0ff9f46d134352 Reviewed-on: http://review.whamcloud.com/11373 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Lai Siyao Reviewed-by: Oleg Drokin --- lustre/include/lu_object.h | 4 +++ lustre/lfsck/lfsck_internal.h | 16 ++++++++++++ lustre/lfsck/lfsck_layout.c | 60 +++++++++++++++++++++++++++++++++++-------- lustre/obdclass/lu_object.c | 17 +++++++++--- 4 files changed, 82 insertions(+), 15 deletions(-) diff --git a/lustre/include/lu_object.h b/lustre/include/lu_object.h index a66790a..7c8c737 100644 --- a/lustre/include/lu_object.h +++ b/lustre/include/lu_object.h @@ -169,6 +169,10 @@ typedef enum { /* This is a new object to be allocated, or the file * corresponding to the object does not exists. */ LOC_F_NEW = 0x00000001, + + /* When find a dying object, just return -EAGAIN at once instead of + * blocking the thread. */ + LOC_F_NOWAIT = 0x00000002, } loc_flags_t; /** diff --git a/lustre/lfsck/lfsck_internal.h b/lustre/lfsck/lfsck_internal.h index a4f3bf8..93d960b 100644 --- a/lustre/lfsck/lfsck_internal.h +++ b/lustre/lfsck/lfsck_internal.h @@ -575,6 +575,7 @@ struct lfsck_thread_info { struct lu_orphan_rec lti_rec; struct lov_user_md lti_lum; struct dt_insert_rec lti_dt_rec; + struct lu_object_conf lti_conf; }; /* lfsck_lib.c */ @@ -785,6 +786,21 @@ static inline void lfsck_object_put(const struct lu_env *env, } static inline struct dt_object * +lfsck_object_find_by_dev_nowait(const struct lu_env *env, struct dt_device *dev, + const struct lu_fid *fid) +{ + struct lu_object_conf *conf = &lfsck_env_info(env)->lti_conf; + struct dt_object *obj; + + conf->loc_flags = LOC_F_NOWAIT; + obj = lu2dt(lu_object_find_slice(env, dt2lu_dev(dev), fid, conf)); + if (unlikely(obj == NULL)) + return ERR_PTR(-ENOENT); + + return obj; +} + +static inline struct dt_object * lfsck_object_find_by_dev(const struct lu_env *env, struct dt_device *dev, const struct lu_fid *fid) { diff --git a/lustre/lfsck/lfsck_layout.c b/lustre/lfsck/lfsck_layout.c index 10101ab..3db63dc 100644 --- a/lustre/lfsck/lfsck_layout.c +++ b/lustre/lfsck/lfsck_layout.c @@ -2970,7 +2970,7 @@ static int lfsck_layout_repair_dangling(const struct lu_env *env, GOTO(stop, rc); dt_read_lock(env, parent, 0); - if (unlikely(lu_object_is_dying(parent->do_lu.lo_header))) + if (unlikely(lfsck_is_dead_obj(parent))) GOTO(unlock2, rc = 1); rc = dt_create(env, child, cla, hint, NULL, handle); @@ -3058,7 +3058,7 @@ static int lfsck_layout_repair_unmatched_pair(const struct lu_env *env, GOTO(stop, rc); dt_write_lock(env, parent, 0); - if (unlikely(lu_object_is_dying(parent->do_lu.lo_header))) + if (unlikely(lfsck_is_dead_obj(parent))) GOTO(unlock2, rc = 1); rc = dt_xattr_set(env, child, buf, XATTR_NAME_FID, 0, handle, @@ -3161,7 +3161,7 @@ static int lfsck_layout_repair_multiple_references(const struct lu_env *env, GOTO(stop, rc); dt_write_lock(env, parent, 0); - if (unlikely(lu_object_is_dying(parent->do_lu.lo_header))) + if (unlikely(lfsck_is_dead_obj(parent))) GOTO(unlock2, rc = 0); rc = dt_xattr_get(env, parent, buf, XATTR_NAME_LOV, BYPASS_CAPA); @@ -3259,7 +3259,7 @@ static int lfsck_layout_repair_owner(const struct lu_env *env, /* Use the dt_object lock to serialize with destroy and attr_set. */ dt_read_lock(env, parent, 0); - if (unlikely(lu_object_is_dying(parent->do_lu.lo_header))) + if (unlikely(lfsck_is_dead_obj(parent))) GOTO(unlock, rc = 1); /* Get the latest parent's owner. */ @@ -3429,17 +3429,16 @@ static int lfsck_layout_assistant_handle_one(const struct lu_env *env, int rc; ENTRY; - rc = dt_attr_get(env, parent, pla, BYPASS_CAPA); - if (rc != 0) { - if (lu_object_is_dying(parent->do_lu.lo_header)) - RETURN(0); + if (unlikely(lfsck_is_dead_obj(parent))) + RETURN(0); + rc = dt_attr_get(env, parent, pla, BYPASS_CAPA); + if (rc != 0) GOTO(out, rc); - } rc = dt_attr_get(env, child, cla, BYPASS_CAPA); if (rc == -ENOENT) { - if (lu_object_is_dying(parent->do_lu.lo_header)) + if (unlikely(lfsck_is_dead_obj(parent))) RETURN(0); type = LLIT_DANGLING; @@ -4665,6 +4664,9 @@ static int lfsck_layout_scan_stripes(const struct lu_env *env, thread_is_stopped(athread)) GOTO(out, rc = 0); + if (unlikely(lfsck_is_dead_obj(parent))) + GOTO(out, rc = 0); + ostid_le_to_cpu(&objs->l_ost_oi, oi); index = le32_to_cpu(objs->l_ost_idx); rc = ostid_to_fid(fid, oi, index); @@ -4684,8 +4686,44 @@ static int lfsck_layout_scan_stripes(const struct lu_env *env, goto next; } - cobj = lfsck_object_find_by_dev(env, tgt->ltd_tgt, fid); + /* There is potential deadlock race condition between object + * destroy and layout LFSCK. Consider the following scenario: + * + * 1) The LFSCK thread obtained the parent object firstly, at + * that time, the parent object has not been destroyed yet. + * + * 2) One RPC service thread destroyed the parent and all its + * children objects. Because the LFSCK is referencing the + * parent object, then the parent object will be marked as + * dying in RAM. On the other hand, the parent object is + * referencing all its children objects, then all children + * objects will be marked as dying in RAM also. + * + * 3) The LFSCK thread tries to find some child object with + * the parent object referenced. Then it will find that the + * child object is dying. According to the object visibility + * rules: the object with dying flag cannot be returned to + * others. So the LFSCK thread has to wait until the dying + * object has been purged from RAM, then it can allocate a + * new object (with the same FID) in RAM. Unfortunately, the + * LFSCK thread itself is referencing the parent object, and + * cause the parent object cannot be purged, then cause the + * child object cannot be purged also. So the LFSCK thread + * will fall into deadlock. + * + * We introduce non-blocked version lu_object_find() to allow + * the LFSCK thread to return failure immediately (instead of + * wait) when it finds dying (child) object, then the LFSCK + * thread can check whether the parent object is dying or not. + * So avoid above deadlock. LU-5395 */ + cobj = lfsck_object_find_by_dev_nowait(env, tgt->ltd_tgt, fid); if (IS_ERR(cobj)) { + if (lfsck_is_dead_obj(parent)) { + lfsck_tgt_put(tgt); + + GOTO(out, rc = 0); + } + rc = PTR_ERR(cobj); goto next; } diff --git a/lustre/obdclass/lu_object.c b/lustre/obdclass/lu_object.c index eda5e9c..6384421 100644 --- a/lustre/obdclass/lu_object.c +++ b/lustre/obdclass/lu_object.c @@ -605,10 +605,13 @@ static struct lu_object *htable_lookup(struct lu_site *s, * drained), and moreover, lookup has to wait until object is freed. */ - init_waitqueue_entry_current(waiter); - add_wait_queue(&bkt->lsb_marche_funebre, waiter); - set_current_state(TASK_UNINTERRUPTIBLE); - lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_DEATH_RACE); + if (likely(waiter != NULL)) { + init_waitqueue_entry_current(waiter); + add_wait_queue(&bkt->lsb_marche_funebre, waiter); + set_current_state(TASK_UNINTERRUPTIBLE); + lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_DEATH_RACE); + } + return ERR_PTR(-EAGAIN); } @@ -794,6 +797,12 @@ struct lu_object *lu_object_find_at(const struct lu_env *env, wait_queue_t wait; while (1) { + if (conf != NULL && conf->loc_flags & LOC_F_NOWAIT) { + obj = lu_object_find_try(env, dev, f, conf, NULL); + + return obj; + } + obj = lu_object_find_try(env, dev, f, conf, &wait); if (obj != ERR_PTR(-EAGAIN)) return obj; -- 1.8.3.1