From: Alexander Zarochentsev Date: Tue, 6 May 2025 15:27:48 +0000 (+0000) Subject: LU-18989 osd: no tx restart in fallocate X-Git-Tag: 2.16.56~60 X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=3143e595cafa252df66e374e94fb700e6cb6e500;p=fs%2Flustre-release.git LU-18989 osd: no tx restart in fallocate A transaction restart in osd_fallocate_preallocate() while holding an object lock may lead to a deadlock with another thread which takes the lock first and then starts a transaction. crash> bt 44982 #0 __schedule at ffffffffa2331fe8 #1 schedule at ffffffffa233241a #2 wait_transaction_locked at ffffffffc106f08a #3 add_transaction_credits at ffffffffc106f67a #4 start_this_handle at ffffffffc106fa10 #5 jbd2__journal_restart at ffffffffc107015e #6 osd_fallocate_preallocate.constprop.0 at ffffffffc23e3b1b #7 osd_fallocate at ffffffffc23e40cb #8 mdt_object_fallocate at ffffffffc21418f9 #9 mdt_fallocate_hdl at ffffffffc2144125 and crash> bt PID: 47838 TASK: ffff9f66f5288000 CPU: 9 COMMAND: "mdt00_020" #0 __schedule at ffffffffa2331fe8 #1 schedule at ffffffffa233241a #2 rwsem_down_write_slowpath at ffffffffa2334a9b #3 osd_write_lock at ffffffffc23b6b4d #4 mdd_xattr_del at ffffffffc208cd88 #5 mdt_reint_setxattr at ffffffffc21224a9 #6 mdt_reint_rec at ffffffffc211ec69 #7 mdt_reint_internal at ffffffffc20f0a64 #8 mdt_reint at ffffffffc20fc6b9 #9 tgt_handle_request0 at ffffffffc1dea9e7 HPE-bug-id: LUS-12819 Signed-off-by: Alexander Zarochentsev Change-Id: I8baa3ba1505c7a55e96841524368a66d447d18c5 Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/59158 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andrew Perepechko Reviewed-by: Arshad Hussain Reviewed-by: Oleg Drokin --- diff --git a/lustre/include/dt_object.h b/lustre/include/dt_object.h index d1c7ea0..90f27c3 100644 --- a/lustre/include/dt_object.h +++ b/lustre/include/dt_object.h @@ -1454,7 +1454,7 @@ struct dt_body_operations { */ int (*dbo_fallocate)(const struct lu_env *env, struct dt_object *dt, - __u64 start, + __u64 *start, __u64 end, int mode, struct thandle *th); @@ -2723,7 +2723,7 @@ static inline int dt_declare_fallocate(const struct lu_env *env, } static inline int dt_falloc(const struct lu_env *env, struct dt_object *dt, - __u64 start, __u64 end, int mode, + __u64 *start, __u64 end, int mode, struct thandle *th) { LASSERT(dt); diff --git a/lustre/mdt/mdt_io.c b/lustre/mdt/mdt_io.c index f3cc37c..ba6eb1b 100644 --- a/lustre/mdt/mdt_io.c +++ b/lustre/mdt/mdt_io.c @@ -844,43 +844,51 @@ static int mdt_object_fallocate(const struct lu_env *env, struct dt_device *dt, struct dt_object *dob, __u64 start, __u64 end, int mode, struct lu_attr *la) { - struct thandle *th; int rc; + bool restart; ENTRY; if (!dt_object_exists(dob)) RETURN(-ENOENT); - th = dt_trans_create(env, dt); - if (IS_ERR(th)) - RETURN(PTR_ERR(th)); + do { + struct thandle *th; - rc = dt_declare_attr_set(env, dob, la, th); - if (rc) - GOTO(stop, rc); + restart = false; - rc = dt_declare_fallocate(env, dob, start, end, mode, th); - if (rc) - GOTO(stop, rc); + th = dt_trans_create(env, dt); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); - tgt_vbr_obj_data_set(env, dob, true); - rc = dt_trans_start(env, dt, th); - if (rc) - GOTO(stop, rc); + rc = dt_declare_attr_set(env, dob, la, th); + if (rc) + GOTO(stop, rc); - dt_write_lock(env, dob, 0); - rc = dt_falloc(env, dob, start, end, mode, th); - if (rc) - GOTO(unlock, rc); - rc = dt_attr_set(env, dob, la, th); - if (rc) - GOTO(unlock, rc); + rc = dt_declare_fallocate(env, dob, start, end, mode, th); + if (rc) + GOTO(stop, rc); + + tgt_vbr_obj_data_set(env, dob, true); + rc = dt_trans_start(env, dt, th); + if (rc) + GOTO(stop, rc); + + dt_write_lock(env, dob, 0); + rc = dt_falloc(env, dob, &start, end, mode, th); + if (rc == -EAGAIN) + restart = true; + if (rc) + GOTO(unlock, rc); + rc = dt_attr_set(env, dob, la, th); + if (rc) + GOTO(unlock, rc); unlock: - dt_write_unlock(env, dob); + dt_write_unlock(env, dob); stop: - th->th_result = rc; - dt_trans_stop(env, dt, th); + th->th_result = rc; + dt_trans_stop(env, dt, th); + } while (restart); RETURN(rc); } diff --git a/lustre/ofd/ofd_objects.c b/lustre/ofd/ofd_objects.c index fa4185d..a11c3c5 100644 --- a/lustre/ofd/ofd_objects.c +++ b/lustre/ofd/ofd_objects.c @@ -788,9 +788,9 @@ int ofd_object_fallocate(const struct lu_env *env, struct ofd_object *fo, struct ofd_thread_info *info = ofd_info(env); struct ofd_device *ofd = ofd_obj2dev(fo); struct dt_object *dob = ofd_object_child(fo); - struct thandle *th; struct filter_fid *ff = &info->fti_mds_fid; bool ff_needed = false; + bool restart; int rc; ENTRY; @@ -824,58 +824,67 @@ int ofd_object_fallocate(const struct lu_env *env, struct ofd_object *fo, } } - th = ofd_trans_create(env, ofd); - if (IS_ERR(th)) - RETURN(PTR_ERR(th)); + do { + struct thandle *th; - rc = dt_declare_attr_set(env, dob, la, th); - if (rc) - GOTO(stop, rc); + restart = false; - rc = dt_declare_fallocate(env, dob, start, end, mode, th); - if (rc) - GOTO(stop, rc); + th = ofd_trans_create(env, ofd); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); - if (ff_needed) { - info->fti_buf.lb_buf = ff; - info->fti_buf.lb_len = sizeof(*ff); - rc = dt_declare_xattr_set(env, ofd_object_child(fo), - &info->fti_buf, XATTR_NAME_FID, 0, - th); + rc = dt_declare_attr_set(env, dob, la, th); if (rc) GOTO(stop, rc); - } - rc = ofd_trans_start(env, ofd, fo, th); - if (rc) - GOTO(stop, rc); + if (ff_needed) { + info->fti_buf.lb_buf = ff; + info->fti_buf.lb_len = sizeof(*ff); + rc = dt_declare_xattr_set(env, ofd_object_child(fo), + &info->fti_buf, XATTR_NAME_FID, 0, + th); + if (rc) + GOTO(stop, rc); + } - ofd_read_lock(env, fo); - if (!ofd_object_exists(fo)) - GOTO(unlock, rc = -ENOENT); + rc = dt_declare_fallocate(env, dob, start, end, mode, th); + if (rc) + GOTO(stop, rc); - if (la->la_valid & (LA_ATIME | LA_MTIME | LA_CTIME)) - tgt_fmd_update(info->fti_exp, &fo->ofo_header.loh_fid, - info->fti_xid); + rc = ofd_trans_start(env, ofd, fo, th); + if (rc) + GOTO(stop, rc); - rc = dt_falloc(env, dob, start, end, mode, th); - if (rc) - GOTO(unlock, rc); + ofd_read_lock(env, fo); + if (!ofd_object_exists(fo)) + GOTO(unlock, rc = -ENOENT); - rc = dt_attr_set(env, dob, la, th); - if (rc) - GOTO(unlock, rc); + if (la->la_valid & (LA_ATIME | LA_MTIME | LA_CTIME)) + tgt_fmd_update(info->fti_exp, &fo->ofo_header.loh_fid, + info->fti_xid); - if (ff_needed) { - rc = dt_xattr_set(env, ofd_object_child(fo), &info->fti_buf, - XATTR_NAME_FID, 0, th); - if (!rc) - filter_fid_le_to_cpu(&fo->ofo_ff, ff, sizeof(*ff)); - } + rc = dt_falloc(env, dob, &start, end, mode, th); + if (rc == -EAGAIN) + restart = true; + if (rc) + GOTO(unlock, rc); + + rc = dt_attr_set(env, dob, la, th); + if (rc) + GOTO(unlock, rc); + + if (ff_needed) { + rc = dt_xattr_set(env, ofd_object_child(fo), + &info->fti_buf, XATTR_NAME_FID, 0, th); + if (!rc) + filter_fid_le_to_cpu(&fo->ofo_ff, ff, + sizeof(*ff)); + } unlock: - ofd_read_unlock(env, fo); + ofd_read_unlock(env, fo); stop: - ofd_trans_stop(env, ofd, th, rc); + ofd_trans_stop(env, ofd, th, rc); + } while (restart); RETURN(rc); } diff --git a/lustre/osd-ldiskfs/osd_internal.h b/lustre/osd-ldiskfs/osd_internal.h index 999fd07..000c8c6 100644 --- a/lustre/osd-ldiskfs/osd_internal.h +++ b/lustre/osd-ldiskfs/osd_internal.h @@ -408,6 +408,12 @@ struct osd_thandle { struct list_head ot_commit_dcb_list; struct list_head ot_stop_dcb_list; unsigned int ot_credits; + /* + * For iterative operation within one transaction, + * for example fallocate, it is a number of credits + * enough for completing at least one iteration. + */ + unsigned int ot_credits_iter; unsigned int oh_declared_ext; /* quota IDs related to the transaction */ diff --git a/lustre/osd-ldiskfs/osd_io.c b/lustre/osd-ldiskfs/osd_io.c index 95dadf9..9ad5ccd 100644 --- a/lustre/osd-ldiskfs/osd_io.c +++ b/lustre/osd-ldiskfs/osd_io.c @@ -810,21 +810,18 @@ cleanup: } #ifdef HAVE_LDISKFS_JOURNAL_ENSURE_CREDITS -static int osd_extend_restart_trans(handle_t *handle, int needed, +static int osd_extend_trans(handle_t *handle, int needed, struct inode *inode) { int rc; - rc = ldiskfs_journal_ensure_credits(handle, needed, + rc = __ldiskfs_journal_ensure_credits(handle, needed, 2 * needed, ldiskfs_trans_default_revoke_credits(inode->i_sb)); - /* this means journal has been restarted */ - if (rc > 0) - rc = 0; return rc; } #else -static int osd_extend_restart_trans(handle_t *handle, int needed, +static int osd_extend_trans(handle_t *handle, int needed, struct inode *inode) { int rc; @@ -833,10 +830,7 @@ static int osd_extend_restart_trans(handle_t *handle, int needed, return 0; rc = ldiskfs_journal_extend(handle, needed - handle->h_buffer_credits); - if (rc <= 0) - return rc; - - return ldiskfs_journal_restart(handle, needed); + return rc; } #endif /* HAVE_LDISKFS_JOURNAL_ENSURE_CREDITS */ @@ -2137,19 +2131,31 @@ static int osd_declare_fallocate(const struct lu_env *env, /* quota space should be reported in 1K blocks */ quota_space = toqb(quota_space) + toqb(end - start) + LDISKFS_META_TRANS_BLOCKS(inode->i_sb); - - /* - * We don't need to reserve credits for whole fallocate here. - * We reserve space only for metadata. Fallocate credits are - * extended as required - */ } + rc = osd_declare_inode_qid(env, i_uid_read(inode), i_gid_read(inode), i_projid_read(inode), quota_space, oh, osd_dt_obj(dt), NULL, OSD_QID_BLK); if (rc) RETURN(rc); + if ((mode & FALLOC_FL_PUNCH_HOLE) == 0) { + unsigned int crds_per_ext; + ldiskfs_lblk_t blen; + + blen = osd_i_blocks(inode, ALIGN(end, 1 << inode->i_blkbits)) - + osd_i_blocks(inode, start); + + crds_per_ext = ldiskfs_chunk_trans_blocks(inode, blen); + /* + * allow one more fallocate iteration when num credits + * enough to insert one extend + quota/xattrs updates + */ + oh->ot_credits_iter = oh->ot_credits + crds_per_ext; + /* at tx start, reserve tx space for 5 extent inserts */ + oh->ot_credits += 5 * crds_per_ext; + } + /* * The both hole punch and allocation may need few transactions * to complete, so we have to avoid concurrent writes/truncates @@ -2165,16 +2171,14 @@ static int osd_declare_fallocate(const struct lu_env *env, static int osd_fallocate_preallocate(const struct lu_env *env, struct dt_object *dt, - __u64 start, __u64 end, int mode, + __u64 *start, __u64 end, int mode, struct thandle *th) { struct osd_thandle *oh = container_of(th, struct osd_thandle, ot_super); handle_t *handle = ldiskfs_journal_current_handle(); - unsigned int save_credits = oh->ot_credits; struct osd_object *obj = osd_dt_obj(dt); struct inode *inode = obj->oo_inode; struct ldiskfs_map_blocks map; - unsigned int credits; ldiskfs_lblk_t blen; ldiskfs_lblk_t boff; loff_t new_size = 0; @@ -2189,13 +2193,13 @@ static int osd_fallocate_preallocate(const struct lu_env *env, LASSERT(inode != NULL); CDEBUG(D_INODE, "fallocate: inode #%lu: start %llu end %llu mode %d\n", - inode->i_ino, start, end, mode); + inode->i_ino, *start, end, mode); dquot_initialize(inode); LASSERT(th); - boff = osd_i_blocks(inode, start); + boff = osd_i_blocks(inode, *start); blen = osd_i_blocks(inode, ALIGN(end, 1 << inode->i_blkbits)) - boff; /* Create and mark new extents as either zero or unwritten */ @@ -2228,36 +2232,11 @@ static int osd_fallocate_preallocate(const struct lu_env *env, if (blen <= EXT_UNWRITTEN_MAX_LEN) flags |= LDISKFS_GET_BLOCKS_NO_NORMALIZE; - /* - * credits to insert 1 extent into extent tree. - */ - credits = ldiskfs_chunk_trans_blocks(inode, blen); depth = ext_depth(inode); while (rc >= 0 && blen) { loff_t epos; - /* - * Recalculate credits when extent tree depth changes. - */ - if (depth != ext_depth(inode)) { - credits = ldiskfs_chunk_trans_blocks(inode, blen); - depth = ext_depth(inode); - } - - /* TODO: quota check */ - if (handle->h_transaction->t_state == T_RUNNING) { - rc = osd_extend_restart_trans(handle, credits, inode); - } else { - rc = ldiskfs_journal_restart(handle, credits -#ifdef HAVE_LDISKFS_JOURNAL_ENSURE_CREDITS - ,ldiskfs_trans_default_revoke_credits(inode->i_sb) -#endif - ); - } - if (rc) - break; - rc = ldiskfs_map_blocks(handle, inode, &map, flags); if (rc <= 0) { CDEBUG(D_INODE, @@ -2268,6 +2247,7 @@ static int osd_fallocate_preallocate(const struct lu_env *env, } map.m_lblk += rc; + *start += rc; map.m_len = blen = blen - rc; epos = (loff_t)map.m_lblk << inode->i_blkbits; inode_set_ctime_current(inode); @@ -2286,20 +2266,32 @@ static int osd_fallocate_preallocate(const struct lu_env *env, } ldiskfs_mark_inode_dirty(handle, inode); - } -out: - /* extand credits if needed for operations such as attribute set */ - if (rc >= 0) - rc = osd_extend_restart_trans(handle, save_credits, inode); + /* do not attempt to extend an old transaction */ + if (handle->h_transaction->t_state != T_RUNNING) + GOTO(out, rc = -EAGAIN); + /* + * Recalculate credits when extent tree depth changes. + */ + if (depth != ext_depth(inode)) + GOTO(out, rc = -EAGAIN); + + rc = osd_extend_trans(handle, oh->ot_credits_iter, inode); + if (rc > 0) + GOTO(out, rc = -EAGAIN); + if (rc) + GOTO(out, rc); +} + +out: inode_unlock(inode); RETURN(rc); } static int osd_fallocate_advance(const struct lu_env *env, struct dt_object *dt, - __u64 start, __u64 end, int mode, + __u64 *start, __u64 end, int mode, struct thandle *th) { struct osd_object *obj = osd_dt_obj(dt); @@ -2326,7 +2318,7 @@ static int osd_fallocate_advance(const struct lu_env *env, struct dt_object *dt, LASSERT(al->tl_shared == 0); found = 1; /* do actual punch/zero in osd_trans_stop() */ - al->tl_start = start; + al->tl_start = *start; al->tl_end = end; al->tl_mode = mode; al->tl_fallocate = true; @@ -2337,7 +2329,7 @@ static int osd_fallocate_advance(const struct lu_env *env, struct dt_object *dt, } static int osd_fallocate(const struct lu_env *env, struct dt_object *dt, - __u64 start, __u64 end, int mode, struct thandle *th) + __u64 *start, __u64 end, int mode, struct thandle *th) { int rc;