From: Alex Zhuravlev Date: Sun, 13 Nov 2022 14:51:30 +0000 (+0300) Subject: LU-15139 osp: block reads until the object is created X-Git-Url: https://git.whamcloud.com/gitweb?a=commitdiff_plain;h=d53a7f35f4fa60613b857a9b005fa88e36c6b331;p=fs%2Flustre-release.git LU-15139 osp: block reads until the object is created It is possible for a remote llog to be read and written simultaneously during recovery. For example, the dtx recovery thread may be fetching updates while MDD's orphan cleanup procedure is removing orphans from PENDING. OSP can then be asked to read an object that has just been created in the OSP cache while the actual object on the remote MDS hasn't been created yet. OSP should block such reads until the creation is done. Lustre-change: https://review.whamcloud.com/47003/ Lustre-commit: 4f2914537cc32fe89c4781bcfc87c38e3fe4419c Signed-off-by: Alex Zhuravlev Change-Id: I5596c791a758dd542746afd961eb1ed9c97845be Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/49146 Tested-by: jenkins Tested-by: Andreas Dilger Reviewed-by: Andreas Dilger --- diff --git a/lustre/osp/osp_dev.c b/lustre/osp/osp_dev.c index 574a0c3..00d5989 100644 --- a/lustre/osp/osp_dev.c +++ b/lustre/osp/osp_dev.c @@ -1254,6 +1254,7 @@ static int osp_init0(const struct lu_env *env, struct osp_device *osp, GOTO(out, rc); if (osdname) OBD_FREE(osdname, MAX_OBD_NAME); + init_waitqueue_head(&osp->opd_out_waitq); RETURN(0); out: diff --git a/lustre/osp/osp_internal.h b/lustre/osp/osp_internal.h index 5cb003d..295ede8 100644 --- a/lustre/osp/osp_internal.h +++ b/lustre/osp/osp_internal.h @@ -278,6 +278,8 @@ struct osp_device { int opd_reserved_ino_low; int opd_reserved_mb_high; int opd_reserved_mb_low; + + wait_queue_head_t opd_out_waitq; }; #define opd_pre_used_fid opd_pre->osp_pre_used_fid @@ -314,7 +316,8 @@ struct osp_object { unsigned int opo_reserved:1, opo_non_exist:1, opo_stale:1, - opo_destroyed:1; + opo_destroyed:1, + opo_creating:1; /* create in progress */ /* read/write lock for md osp object */ struct rw_semaphore
opo_sem; @@ -327,6 +330,7 @@ struct osp_object { /* to implement in-flight invalidation */ atomic_t opo_invalidate_seq; struct rw_semaphore opo_invalidate_sem; + atomic_t opo_writes_in_flight; }; extern struct lu_object_operations osp_lu_obj_ops; diff --git a/lustre/osp/osp_md_object.c b/lustre/osp/osp_md_object.c index 8715c70..25ebb2c 100644 --- a/lustre/osp/osp_md_object.c +++ b/lustre/osp/osp_md_object.c @@ -83,10 +83,15 @@ static int osp_create_interpreter(const struct lu_env *env, struct osp_object *obj, void *data, int index, int rc) { + struct osp_device *osp = lu2osp_dev(obj->opo_obj.do_lu.lo_dev); + + spin_lock(&obj->opo_lock); if (rc != 0 && rc != -EEXIST) { obj->opo_obj.do_lu.lo_header->loh_attr &= ~LOHA_EXISTS; obj->opo_non_exist = 1; } + obj->opo_creating = 0; + spin_unlock(&obj->opo_lock); /* * invalidate opo cache for the object after the object is created, so @@ -94,6 +99,15 @@ static int osp_create_interpreter(const struct lu_env *env, */ osp_obj_invalidate_cache(obj); + /* + * currently reads from objects being created + * are exceptional - during recovery only, when + * remote llog update fetching can race with + * orphan cleanup. 
so don't waste memory adding + * a wait queue to every osp object + */ + wake_up_all(&osp->opd_out_waitq); + return 0; } @@ -181,9 +195,12 @@ int osp_md_create(const struct lu_env *env, struct dt_object *dt, if (rc < 0) GOTO(out, rc); + spin_lock(&obj->opo_lock); + obj->opo_creating = 1; dt->do_lu.lo_header->loh_attr |= LOHA_EXISTS | (attr->la_mode & S_IFMT); dt2osp_obj(dt)->opo_non_exist = 0; obj->opo_stale = 0; + spin_unlock(&obj->opo_lock); obj->opo_attr = *attr; out: @@ -1157,6 +1174,8 @@ static int osp_write_interpreter(const struct lu_env *env, struct osp_object *obj, void *data, int index, int rc) { + struct osp_device *osp = lu2osp_dev(obj->opo_obj.do_lu.lo_dev); + if (rc) { CDEBUG(D_HA, "error "DFID": rc = %d\n", PFID(lu_object_fid(&obj->opo_obj.do_lu)), rc); @@ -1166,6 +1185,8 @@ static int osp_write_interpreter(const struct lu_env *env, obj->opo_stale = 1; spin_unlock(&obj->opo_lock); } + if (atomic_dec_and_test(&obj->opo_writes_in_flight)) + wake_up_all(&osp->opd_out_waitq); return 0; } @@ -1233,6 +1254,8 @@ static ssize_t osp_md_write(const struct lu_env *env, struct dt_object *dt, } spin_unlock(&obj->opo_lock); + atomic_inc(&obj->opo_writes_in_flight); + RETURN(buf->lb_len); } @@ -1244,7 +1267,16 @@ static inline void orr_le_to_cpu(struct out_read_reply *orr_dst, orr_dst->orr_offset = le64_to_cpu(orr_dst->orr_offset); } +static int osp_md_check_creating(struct osp_object *obj) +{ + int rc; + spin_lock(&obj->opo_lock); + rc = obj->opo_creating; + spin_unlock(&obj->opo_lock); + + return rc; +} static ssize_t osp_md_read(const struct lu_env *env, struct dt_object *dt, struct lu_buf *rbuf, loff_t *pos) @@ -1265,6 +1297,10 @@ static ssize_t osp_md_read(const struct lu_env *env, struct dt_object *dt, if (dt2osp_obj(dt)->opo_destroyed) RETURN(-ENOENT); + wait_event_idle(osp->opd_out_waitq, + !atomic_read(&dt2osp_obj(dt)->opo_writes_in_flight) && + osp_md_check_creating(dt2osp_obj(dt)) == 0); + /* Because it needs send the update buffer right away, * just 
create an update buffer, instead of attaching the * update_remote list of the thandle. */