Whamcloud - gitweb
LU-5395 lfsck: deadlock between LFSCK and destroy 73/11373/6
authorFan Yong <fan.yong@intel.com>
Sun, 13 Jul 2014 12:11:56 +0000 (20:11 +0800)
committerOleg Drokin <oleg.drokin@intel.com>
Fri, 5 Sep 2014 00:24:00 +0000 (00:24 +0000)
There is potential deadlock race condition between object
destroy and layout LFSCK. Consider the following scenario:

1) The LFSCK thread obtained the parent object firstly, at
   that time, the parent object has not been destroyed yet.

2) One RPC service thread destroyed the parent and all its
   children objects. Because the LFSCK is referencing the
   parent object, then the parent object will be marked as
   dying in RAM. On the other hand, the parent object is
   referencing all its children objects, then all children
   objects will be marked as dying in RAM also.

3) The LFSCK thread tries to find some child object with
   the parent object referenced. Then it will find that the
   child object is dying. According to the object visibility
   rules: the object with dying flag cannot be returned to
   others. So the LFSCK thread has to wait until the dying
   object has been purged from RAM, then it can allocate a
   new object (with the same FID) in RAM. Unfortunately, the
   LFSCK thread itself is referencing the parent object, and
   cause the parent object cannot be purged, then cause the
   child object cannot be purged also. So the LFSCK thread
   will fall into deadlock.

We introduce non-blocked version lu_object_find() to allow
the LFSCK thread to return failure immediately (instead of
wait) when it finds dying (child) object, then the LFSCK
thread can check whether the parent object is dying or not.
So avoid above deadlock.

Signed-off-by: Fan Yong <fan.yong@intel.com>
Change-Id: I7f465259011ad5fb92ef1b4dba0ff9f46d134352
Reviewed-on: http://review.whamcloud.com/11373
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Lai Siyao <lai.siyao@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lustre/include/lu_object.h
lustre/lfsck/lfsck_internal.h
lustre/lfsck/lfsck_layout.c
lustre/obdclass/lu_object.c

index a66790a..7c8c737 100644 (file)
@@ -169,6 +169,10 @@ typedef enum {
        /* This is a new object to be allocated, or the file
         * corresponding to the object does not exists. */
        LOC_F_NEW       = 0x00000001,
+
+       /* When find a dying object, just return -EAGAIN at once instead of
+        * blocking the thread. */
+       LOC_F_NOWAIT    = 0x00000002,
 } loc_flags_t;
 
 /**
index a4f3bf8..93d960b 100644 (file)
@@ -575,6 +575,7 @@ struct lfsck_thread_info {
        struct lu_orphan_rec    lti_rec;
        struct lov_user_md      lti_lum;
        struct dt_insert_rec    lti_dt_rec;
+       struct lu_object_conf   lti_conf;
 };
 
 /* lfsck_lib.c */
@@ -785,6 +786,21 @@ static inline void lfsck_object_put(const struct lu_env *env,
 }
 
 static inline struct dt_object *
+lfsck_object_find_by_dev_nowait(const struct lu_env *env, struct dt_device *dev,
+                               const struct lu_fid *fid)
+{
+       struct lu_object_conf   *conf = &lfsck_env_info(env)->lti_conf;
+       struct dt_object        *obj;
+
+       conf->loc_flags = LOC_F_NOWAIT;
+       obj = lu2dt(lu_object_find_slice(env, dt2lu_dev(dev), fid, conf));
+       if (unlikely(obj == NULL))
+               return ERR_PTR(-ENOENT);
+
+       return obj;
+}
+
+static inline struct dt_object *
 lfsck_object_find_by_dev(const struct lu_env *env, struct dt_device *dev,
                         const struct lu_fid *fid)
 {
index 10101ab..3db63dc 100644 (file)
@@ -2970,7 +2970,7 @@ static int lfsck_layout_repair_dangling(const struct lu_env *env,
                GOTO(stop, rc);
 
        dt_read_lock(env, parent, 0);
-       if (unlikely(lu_object_is_dying(parent->do_lu.lo_header)))
+       if (unlikely(lfsck_is_dead_obj(parent)))
                GOTO(unlock2, rc = 1);
 
        rc = dt_create(env, child, cla, hint, NULL, handle);
@@ -3058,7 +3058,7 @@ static int lfsck_layout_repair_unmatched_pair(const struct lu_env *env,
                GOTO(stop, rc);
 
        dt_write_lock(env, parent, 0);
-       if (unlikely(lu_object_is_dying(parent->do_lu.lo_header)))
+       if (unlikely(lfsck_is_dead_obj(parent)))
                GOTO(unlock2, rc = 1);
 
        rc = dt_xattr_set(env, child, buf, XATTR_NAME_FID, 0, handle,
@@ -3161,7 +3161,7 @@ static int lfsck_layout_repair_multiple_references(const struct lu_env *env,
                GOTO(stop, rc);
 
        dt_write_lock(env, parent, 0);
-       if (unlikely(lu_object_is_dying(parent->do_lu.lo_header)))
+       if (unlikely(lfsck_is_dead_obj(parent)))
                GOTO(unlock2, rc = 0);
 
        rc = dt_xattr_get(env, parent, buf, XATTR_NAME_LOV, BYPASS_CAPA);
@@ -3259,7 +3259,7 @@ static int lfsck_layout_repair_owner(const struct lu_env *env,
 
        /* Use the dt_object lock to serialize with destroy and attr_set. */
        dt_read_lock(env, parent, 0);
-       if (unlikely(lu_object_is_dying(parent->do_lu.lo_header)))
+       if (unlikely(lfsck_is_dead_obj(parent)))
                GOTO(unlock, rc = 1);
 
        /* Get the latest parent's owner. */
@@ -3429,17 +3429,16 @@ static int lfsck_layout_assistant_handle_one(const struct lu_env *env,
        int                                   rc;
        ENTRY;
 
-       rc = dt_attr_get(env, parent, pla, BYPASS_CAPA);
-       if (rc != 0) {
-               if (lu_object_is_dying(parent->do_lu.lo_header))
-                       RETURN(0);
+       if (unlikely(lfsck_is_dead_obj(parent)))
+               RETURN(0);
 
+       rc = dt_attr_get(env, parent, pla, BYPASS_CAPA);
+       if (rc != 0)
                GOTO(out, rc);
-       }
 
        rc = dt_attr_get(env, child, cla, BYPASS_CAPA);
        if (rc == -ENOENT) {
-               if (lu_object_is_dying(parent->do_lu.lo_header))
+               if (unlikely(lfsck_is_dead_obj(parent)))
                        RETURN(0);
 
                type = LLIT_DANGLING;
@@ -4665,6 +4664,9 @@ static int lfsck_layout_scan_stripes(const struct lu_env *env,
                             thread_is_stopped(athread))
                        GOTO(out, rc = 0);
 
+               if (unlikely(lfsck_is_dead_obj(parent)))
+                       GOTO(out, rc = 0);
+
                ostid_le_to_cpu(&objs->l_ost_oi, oi);
                index = le32_to_cpu(objs->l_ost_idx);
                rc = ostid_to_fid(fid, oi, index);
@@ -4684,8 +4686,44 @@ static int lfsck_layout_scan_stripes(const struct lu_env *env,
                        goto next;
                }
 
-               cobj = lfsck_object_find_by_dev(env, tgt->ltd_tgt, fid);
+               /* There is potential deadlock race condition between object
+                * destroy and layout LFSCK. Consider the following scenario:
+                *
+                * 1) The LFSCK thread obtained the parent object firstly, at
+                *    that time, the parent object has not been destroyed yet.
+                *
+                * 2) One RPC service thread destroyed the parent and all its
+                *    children objects. Because the LFSCK is referencing the
+                *    parent object, then the parent object will be marked as
+                *    dying in RAM. On the other hand, the parent object is
+                *    referencing all its children objects, then all children
+                *    objects will be marked as dying in RAM also.
+                *
+                * 3) The LFSCK thread tries to find some child object with
+                *    the parent object referenced. Then it will find that the
+                *    child object is dying. According to the object visibility
+                *    rules: the object with dying flag cannot be returned to
+                *    others. So the LFSCK thread has to wait until the dying
+                *    object has been purged from RAM, then it can allocate a
+                *    new object (with the same FID) in RAM. Unfortunately, the
+                *    LFSCK thread itself is referencing the parent object, and
+                *    cause the parent object cannot be purged, then cause the
+                *    child object cannot be purged also. So the LFSCK thread
+                *    will fall into deadlock.
+                *
+                * We introduce non-blocked version lu_object_find() to allow
+                * the LFSCK thread to return failure immediately (instead of
+                * wait) when it finds dying (child) object, then the LFSCK
+                * thread can check whether the parent object is dying or not.
+                * So avoid above deadlock. LU-5395 */
+               cobj = lfsck_object_find_by_dev_nowait(env, tgt->ltd_tgt, fid);
                if (IS_ERR(cobj)) {
+                       if (lfsck_is_dead_obj(parent)) {
+                               lfsck_tgt_put(tgt);
+
+                               GOTO(out, rc = 0);
+                       }
+
                        rc = PTR_ERR(cobj);
                        goto next;
                }
index eda5e9c..6384421 100644 (file)
@@ -605,10 +605,13 @@ static struct lu_object *htable_lookup(struct lu_site *s,
          * drained), and moreover, lookup has to wait until object is freed.
          */
 
-       init_waitqueue_entry_current(waiter);
-       add_wait_queue(&bkt->lsb_marche_funebre, waiter);
-       set_current_state(TASK_UNINTERRUPTIBLE);
-       lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_DEATH_RACE);
+       if (likely(waiter != NULL)) {
+               init_waitqueue_entry_current(waiter);
+               add_wait_queue(&bkt->lsb_marche_funebre, waiter);
+               set_current_state(TASK_UNINTERRUPTIBLE);
+               lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_DEATH_RACE);
+       }
+
        return ERR_PTR(-EAGAIN);
 }
 
@@ -794,6 +797,12 @@ struct lu_object *lu_object_find_at(const struct lu_env *env,
        wait_queue_t           wait;
 
        while (1) {
+               if (conf != NULL && conf->loc_flags & LOC_F_NOWAIT) {
+                       obj = lu_object_find_try(env, dev, f, conf, NULL);
+
+                       return obj;
+               }
+
                obj = lu_object_find_try(env, dev, f, conf, &wait);
                if (obj != ERR_PTR(-EAGAIN))
                        return obj;