Whamcloud - gitweb
LU-15139 osp: block reads until the object is created
author: Alex Zhuravlev <bzzz@whamcloud.com>
Sun, 13 Nov 2022 14:51:30 +0000 (17:51 +0300)
committer: Andreas Dilger <adilger@whamcloud.com>
Sat, 19 Nov 2022 17:33:17 +0000 (17:33 +0000)
It's possible for a remote llog to be read and written simultaneously
during recovery. For example, the dtx recovery thread may be fetching
updates while MDD's orphan cleanup procedure is removing orphans from PENDING.

OSP can be asked to read an object that was just created in the OSP cache
while the actual object on the remote MDS hasn't been created yet. OSP
should block such reads until the creation is done.

Lustre-change: https://review.whamcloud.com/47003/
Lustre-commit: 4f2914537cc32fe89c4781bcfc87c38e3fe4419c

Signed-off-by: Alex Zhuravlev <bzzz@whamcloud.com>
Change-Id: I5596c791a758dd542746afd961eb1ed9c97845be
Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/49146
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
lustre/osp/osp_dev.c
lustre/osp/osp_internal.h
lustre/osp/osp_md_object.c

index 574a0c3..00d5989 100644 (file)
@@ -1254,6 +1254,7 @@ static int osp_init0(const struct lu_env *env, struct osp_device *osp,
                GOTO(out, rc);
        if (osdname)
                OBD_FREE(osdname, MAX_OBD_NAME);
+       init_waitqueue_head(&osp->opd_out_waitq);
        RETURN(0);
 
 out:
index 5cb003d..295ede8 100644 (file)
@@ -278,6 +278,8 @@ struct osp_device {
        int                             opd_reserved_ino_low;
        int                             opd_reserved_mb_high;
        int                             opd_reserved_mb_low;
+
+       wait_queue_head_t                opd_out_waitq;
 };
 
 #define opd_pre_used_fid               opd_pre->osp_pre_used_fid
@@ -314,7 +316,8 @@ struct osp_object {
        unsigned int            opo_reserved:1,
                                opo_non_exist:1,
                                opo_stale:1,
-                               opo_destroyed:1;
+                               opo_destroyed:1,
+                               opo_creating:1; /* create in progress */
 
        /* read/write lock for md osp object */
        struct rw_semaphore     opo_sem;
@@ -327,6 +330,7 @@ struct osp_object {
        /* to implement in-flight invalidation */
        atomic_t                opo_invalidate_seq;
        struct rw_semaphore     opo_invalidate_sem;
+       atomic_t                opo_writes_in_flight;
 };
 
 extern struct lu_object_operations osp_lu_obj_ops;
index 8715c70..25ebb2c 100644 (file)
@@ -83,10 +83,15 @@ static int osp_create_interpreter(const struct lu_env *env,
                                  struct osp_object *obj,
                                  void *data, int index, int rc)
 {
+       struct osp_device *osp = lu2osp_dev(obj->opo_obj.do_lu.lo_dev);
+
+       spin_lock(&obj->opo_lock);
        if (rc != 0 && rc != -EEXIST) {
                obj->opo_obj.do_lu.lo_header->loh_attr &= ~LOHA_EXISTS;
                obj->opo_non_exist = 1;
        }
+       obj->opo_creating = 0;
+       spin_unlock(&obj->opo_lock);
 
        /*
         * invalidate opo cache for the object after the object is created, so
@@ -94,6 +99,15 @@ static int osp_create_interpreter(const struct lu_env *env,
         */
        osp_obj_invalidate_cache(obj);
 
+       /*
+        * currently reads from objects being created
+        * are exceptional - during recovery only, when
+        * remote llog update fetching can race with
+        * orphan cleanup. so don't waste memory adding
+        * a wait queue to every osp object
+        */
+       wake_up_all(&osp->opd_out_waitq);
+
        return 0;
 }
 
@@ -181,9 +195,12 @@ int osp_md_create(const struct lu_env *env, struct dt_object *dt,
        if (rc < 0)
                GOTO(out, rc);
 
+       spin_lock(&obj->opo_lock);
+       obj->opo_creating = 1;
        dt->do_lu.lo_header->loh_attr |= LOHA_EXISTS | (attr->la_mode & S_IFMT);
        dt2osp_obj(dt)->opo_non_exist = 0;
        obj->opo_stale = 0;
+       spin_unlock(&obj->opo_lock);
 
        obj->opo_attr = *attr;
 out:
@@ -1157,6 +1174,8 @@ static int osp_write_interpreter(const struct lu_env *env,
                                  struct osp_object *obj,
                                  void *data, int index, int rc)
 {
+       struct osp_device *osp = lu2osp_dev(obj->opo_obj.do_lu.lo_dev);
+
        if (rc) {
                CDEBUG(D_HA, "error "DFID": rc = %d\n",
                       PFID(lu_object_fid(&obj->opo_obj.do_lu)), rc);
@@ -1166,6 +1185,8 @@ static int osp_write_interpreter(const struct lu_env *env,
                obj->opo_stale = 1;
                spin_unlock(&obj->opo_lock);
        }
+       if (atomic_dec_and_test(&obj->opo_writes_in_flight))
+               wake_up_all(&osp->opd_out_waitq);
        return 0;
 }
 
@@ -1233,6 +1254,8 @@ static ssize_t osp_md_write(const struct lu_env *env, struct dt_object *dt,
        }
        spin_unlock(&obj->opo_lock);
 
+       atomic_inc(&obj->opo_writes_in_flight);
+
        RETURN(buf->lb_len);
 }
 
@@ -1244,7 +1267,16 @@ static inline void orr_le_to_cpu(struct out_read_reply *orr_dst,
        orr_dst->orr_offset = le64_to_cpu(orr_dst->orr_offset);
 }
 
+static int osp_md_check_creating(struct osp_object *obj)
+{
+       int rc;
 
+       spin_lock(&obj->opo_lock);
+       rc = obj->opo_creating;
+       spin_unlock(&obj->opo_lock);
+
+       return rc;
+}
 
 static ssize_t osp_md_read(const struct lu_env *env, struct dt_object *dt,
                           struct lu_buf *rbuf, loff_t *pos)
@@ -1265,6 +1297,10 @@ static ssize_t osp_md_read(const struct lu_env *env, struct dt_object *dt,
        if (dt2osp_obj(dt)->opo_destroyed)
                RETURN(-ENOENT);
 
+       wait_event_idle(osp->opd_out_waitq,
+                       !atomic_read(&dt2osp_obj(dt)->opo_writes_in_flight) &&
+                       osp_md_check_creating(dt2osp_obj(dt)) == 0);
+
        /* Because it needs send the update buffer right away,
         * just create an update buffer, instead of attaching the
         * update_remote list of the thandle.  */