From 4f2914537cc32fe89c4781bcfc87c38e3fe4419c Mon Sep 17 00:00:00 2001 From: Alex Zhuravlev Date: Wed, 6 Apr 2022 11:00:30 +0300 Subject: [PATCH] LU-15139 osp: block reads until the object is created It is possible for a remote llog to be read and written simultaneously during recovery. For example, the dtx recovery thread may be fetching updates while MDD's orphan cleanup procedure is removing orphans from PENDING. OSP can then be asked to read an object that has just been created in the OSP cache while the actual object on the remote MDS has not been created yet. OSP should block such reads until the creation is done. Signed-off-by: Alex Zhuravlev Change-Id: Id0f52b90761839399102bed825569da6bfd17864 Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/47003 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Mikhail Pershin Reviewed-by: Oleg Drokin --- lustre/osp/osp_dev.c | 1 + lustre/osp/osp_internal.h | 6 +++++- lustre/osp/osp_md_object.c | 36 ++++++++++++++++++++++++++++++++++++ 3 files changed, 42 insertions(+), 1 deletion(-) diff --git a/lustre/osp/osp_dev.c b/lustre/osp/osp_dev.c index ab408ea..95c196e 100644 --- a/lustre/osp/osp_dev.c +++ b/lustre/osp/osp_dev.c @@ -1251,6 +1251,7 @@ static int osp_init0(const struct lu_env *env, struct osp_device *osp, GOTO(out, rc); if (osdname) OBD_FREE(osdname, MAX_OBD_NAME); + init_waitqueue_head(&osp->opd_out_waitq); RETURN(0); out: diff --git a/lustre/osp/osp_internal.h b/lustre/osp/osp_internal.h index 43f39a7..4cbdc9b 100644 --- a/lustre/osp/osp_internal.h +++ b/lustre/osp/osp_internal.h @@ -270,6 +270,8 @@ struct osp_device { unsigned int opd_reserved_mb_low; unsigned int opd_reserved_ino_high; unsigned int opd_reserved_ino_low; + + wait_queue_head_t opd_out_waitq; bool opd_cleanup_orphans_done; bool opd_force_creation; }; @@ -309,7 +311,8 @@ struct osp_object { unsigned int opo_reserved:1, opo_non_exist:1, opo_stale:1, - opo_destroyed:1; + opo_destroyed:1, + opo_creating:1; /* create in progress */ /* read/write lock for 
md osp object */ struct rw_semaphore opo_sem; @@ -322,6 +325,7 @@ struct osp_object { /* to implement in-flight invalidation */ atomic_t opo_invalidate_seq; struct rw_semaphore opo_invalidate_sem; + atomic_t opo_writes_in_flight; }; extern const struct lu_object_operations osp_lu_obj_ops; diff --git a/lustre/osp/osp_md_object.c b/lustre/osp/osp_md_object.c index 6ff56f4..502dce0 100644 --- a/lustre/osp/osp_md_object.c +++ b/lustre/osp/osp_md_object.c @@ -83,10 +83,15 @@ static int osp_create_interpreter(const struct lu_env *env, struct osp_object *obj, void *data, int index, int rc) { + struct osp_device *osp = lu2osp_dev(obj->opo_obj.do_lu.lo_dev); + + spin_lock(&obj->opo_lock); if (rc != 0 && rc != -EEXIST) { obj->opo_obj.do_lu.lo_header->loh_attr &= ~LOHA_EXISTS; obj->opo_non_exist = 1; } + obj->opo_creating = 0; + spin_unlock(&obj->opo_lock); /* * invalidate opo cache for the object after the object is created, so @@ -94,6 +99,15 @@ static int osp_create_interpreter(const struct lu_env *env, */ osp_obj_invalidate_cache(obj); + /* + * currently reads from objects being created + * are exceptional - during recovery only, when + * remote llog update fetching can race with + * orphan cleanup. 
so don't waste memory adding + * a wait queue to every osp object + */ + wake_up_all(&osp->opd_out_waitq); + return 0; } @@ -181,9 +195,12 @@ int osp_md_create(const struct lu_env *env, struct dt_object *dt, if (rc < 0) GOTO(out, rc); + spin_lock(&obj->opo_lock); + obj->opo_creating = 1; dt->do_lu.lo_header->loh_attr |= LOHA_EXISTS | (attr->la_mode & S_IFMT); dt2osp_obj(dt)->opo_non_exist = 0; obj->opo_stale = 0; + spin_unlock(&obj->opo_lock); obj->opo_attr = *attr; out: @@ -1157,6 +1174,8 @@ static int osp_write_interpreter(const struct lu_env *env, struct osp_object *obj, void *data, int index, int rc) { + struct osp_device *osp = lu2osp_dev(obj->opo_obj.do_lu.lo_dev); + if (rc) { CDEBUG(D_HA, "error "DFID": rc = %d\n", PFID(lu_object_fid(&obj->opo_obj.do_lu)), rc); @@ -1166,6 +1185,8 @@ static int osp_write_interpreter(const struct lu_env *env, obj->opo_stale = 1; spin_unlock(&obj->opo_lock); } + if (atomic_dec_and_test(&obj->opo_writes_in_flight)) + wake_up_all(&osp->opd_out_waitq); return 0; } @@ -1233,6 +1254,8 @@ static ssize_t osp_md_write(const struct lu_env *env, struct dt_object *dt, } spin_unlock(&obj->opo_lock); + atomic_inc(&obj->opo_writes_in_flight); + RETURN(buf->lb_len); } @@ -1244,7 +1267,16 @@ static inline void orr_le_to_cpu(struct out_read_reply *orr_dst, orr_dst->orr_offset = le64_to_cpu(orr_dst->orr_offset); } +static int osp_md_check_creating(struct osp_object *obj) +{ + int rc; + spin_lock(&obj->opo_lock); + rc = obj->opo_creating; + spin_unlock(&obj->opo_lock); + + return rc; +} static ssize_t osp_md_read(const struct lu_env *env, struct dt_object *dt, struct lu_buf *rbuf, loff_t *pos) @@ -1265,6 +1297,10 @@ static ssize_t osp_md_read(const struct lu_env *env, struct dt_object *dt, if (dt2osp_obj(dt)->opo_destroyed) RETURN(-ENOENT); + wait_event_idle(osp->opd_out_waitq, + !atomic_read(&dt2osp_obj(dt)->opo_writes_in_flight) && + osp_md_check_creating(dt2osp_obj(dt)) == 0); + /* Because it needs send the update buffer right away, * just 
create an update buffer, instead of attaching the * update_remote list of the thandle. */ -- 1.8.3.1