Whamcloud - gitweb
LU-15139 osp: block reads until the object is created 03/47003/24
author Alex Zhuravlev <bzzz@whamcloud.com>
Wed, 6 Apr 2022 08:00:30 +0000 (11:00 +0300)
committer Oleg Drokin <green@whamcloud.com>
Wed, 2 Nov 2022 07:10:46 +0000 (07:10 +0000)
It's possible that a remote llog can be read and written simultaneously
during recovery. For example, the dtx recovery thread may be fetching updates
while MDD's orphan cleanup procedure is removing orphans from PENDING.

OSP can be asked to read an object that has just been created in the OSP
cache while the actual object on the remote MDS hasn't been created yet.
OSP should block such reads until the creation is done.

Signed-off-by: Alex Zhuravlev <bzzz@whamcloud.com>
Change-Id: Id0f52b90761839399102bed825569da6bfd17864
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/47003
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Mikhail Pershin <mpershin@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/osp/osp_dev.c
lustre/osp/osp_internal.h
lustre/osp/osp_md_object.c

index ab408ea..95c196e 100644 (file)
@@ -1251,6 +1251,7 @@ static int osp_init0(const struct lu_env *env, struct osp_device *osp,
                GOTO(out, rc);
        if (osdname)
                OBD_FREE(osdname, MAX_OBD_NAME);
+       init_waitqueue_head(&osp->opd_out_waitq);
        RETURN(0);
 
 out:
index 43f39a7..4cbdc9b 100644 (file)
@@ -270,6 +270,8 @@ struct osp_device {
        unsigned int                    opd_reserved_mb_low;
        unsigned int                    opd_reserved_ino_high;
        unsigned int                    opd_reserved_ino_low;
+
+       wait_queue_head_t                opd_out_waitq;
        bool                            opd_cleanup_orphans_done;
        bool                            opd_force_creation;
 };
@@ -309,7 +311,8 @@ struct osp_object {
        unsigned int            opo_reserved:1,
                                opo_non_exist:1,
                                opo_stale:1,
-                               opo_destroyed:1;
+                               opo_destroyed:1,
+                               opo_creating:1; /* create in progress */
 
        /* read/write lock for md osp object */
        struct rw_semaphore     opo_sem;
@@ -322,6 +325,7 @@ struct osp_object {
        /* to implement in-flight invalidation */
        atomic_t                opo_invalidate_seq;
        struct rw_semaphore     opo_invalidate_sem;
+       atomic_t                opo_writes_in_flight;
 };
 
 extern const struct lu_object_operations osp_lu_obj_ops;
index 6ff56f4..502dce0 100644 (file)
@@ -83,10 +83,15 @@ static int osp_create_interpreter(const struct lu_env *env,
                                  struct osp_object *obj,
                                  void *data, int index, int rc)
 {
+       struct osp_device *osp = lu2osp_dev(obj->opo_obj.do_lu.lo_dev);
+
+       spin_lock(&obj->opo_lock);
        if (rc != 0 && rc != -EEXIST) {
                obj->opo_obj.do_lu.lo_header->loh_attr &= ~LOHA_EXISTS;
                obj->opo_non_exist = 1;
        }
+       obj->opo_creating = 0;
+       spin_unlock(&obj->opo_lock);
 
        /*
         * invalidate opo cache for the object after the object is created, so
@@ -94,6 +99,15 @@ static int osp_create_interpreter(const struct lu_env *env,
         */
        osp_obj_invalidate_cache(obj);
 
+       /*
+        * currently reads from objects being created
+        * are exceptional - during recovery only, when
+        * remote llog update fetching can race with
+        * orphan cleanup. so don't waste memory adding
+        * a wait queue to every osp object
+        */
+       wake_up_all(&osp->opd_out_waitq);
+
        return 0;
 }
 
@@ -181,9 +195,12 @@ int osp_md_create(const struct lu_env *env, struct dt_object *dt,
        if (rc < 0)
                GOTO(out, rc);
 
+       spin_lock(&obj->opo_lock);
+       obj->opo_creating = 1;
        dt->do_lu.lo_header->loh_attr |= LOHA_EXISTS | (attr->la_mode & S_IFMT);
        dt2osp_obj(dt)->opo_non_exist = 0;
        obj->opo_stale = 0;
+       spin_unlock(&obj->opo_lock);
 
        obj->opo_attr = *attr;
 out:
@@ -1157,6 +1174,8 @@ static int osp_write_interpreter(const struct lu_env *env,
                                  struct osp_object *obj,
                                  void *data, int index, int rc)
 {
+       struct osp_device *osp = lu2osp_dev(obj->opo_obj.do_lu.lo_dev);
+
        if (rc) {
                CDEBUG(D_HA, "error "DFID": rc = %d\n",
                       PFID(lu_object_fid(&obj->opo_obj.do_lu)), rc);
@@ -1166,6 +1185,8 @@ static int osp_write_interpreter(const struct lu_env *env,
                obj->opo_stale = 1;
                spin_unlock(&obj->opo_lock);
        }
+       if (atomic_dec_and_test(&obj->opo_writes_in_flight))
+               wake_up_all(&osp->opd_out_waitq);
        return 0;
 }
 
@@ -1233,6 +1254,8 @@ static ssize_t osp_md_write(const struct lu_env *env, struct dt_object *dt,
        }
        spin_unlock(&obj->opo_lock);
 
+       atomic_inc(&obj->opo_writes_in_flight);
+
        RETURN(buf->lb_len);
 }
 
@@ -1244,7 +1267,16 @@ static inline void orr_le_to_cpu(struct out_read_reply *orr_dst,
        orr_dst->orr_offset = le64_to_cpu(orr_dst->orr_offset);
 }
 
+static int osp_md_check_creating(struct osp_object *obj)
+{
+       int rc;
 
+       spin_lock(&obj->opo_lock);
+       rc = obj->opo_creating;
+       spin_unlock(&obj->opo_lock);
+
+       return rc;
+}
 
 static ssize_t osp_md_read(const struct lu_env *env, struct dt_object *dt,
                           struct lu_buf *rbuf, loff_t *pos)
@@ -1265,6 +1297,10 @@ static ssize_t osp_md_read(const struct lu_env *env, struct dt_object *dt,
        if (dt2osp_obj(dt)->opo_destroyed)
                RETURN(-ENOENT);
 
+       wait_event_idle(osp->opd_out_waitq,
+                       !atomic_read(&dt2osp_obj(dt)->opo_writes_in_flight) &&
+                       osp_md_check_creating(dt2osp_obj(dt)) == 0);
+
        /* Because it needs send the update buffer right away,
         * just create an update buffer, instead of attaching the
         * update_remote list of the thandle.  */