Whamcloud - gitweb
LU-18989 osd: no tx restart in fallocate 58/59158/4
authorAlexander Zarochentsev <alexander.zarochentsev@hpe.com>
Tue, 6 May 2025 15:27:48 +0000 (15:27 +0000)
committerOleg Drokin <green@whamcloud.com>
Wed, 21 May 2025 05:18:56 +0000 (05:18 +0000)
A transaction restart in osd_fallocate_preallocate()
while holding an object lock may lead to a deadlock
with another thread which takes the lock first and then
starts a transaction.

crash> bt 44982
 #0 __schedule at ffffffffa2331fe8
 #1 schedule at ffffffffa233241a
 #2 wait_transaction_locked at ffffffffc106f08a
 #3  add_transaction_credits at ffffffffc106f67a
 #4  start_this_handle at ffffffffc106fa10
 #5  jbd2__journal_restart at ffffffffc107015e
 #6  osd_fallocate_preallocate.constprop.0 at ffffffffc23e3b1b
 #7  osd_fallocate at ffffffffc23e40cb
 #8  mdt_object_fallocate at ffffffffc21418f9
 #9  mdt_fallocate_hdl at ffffffffc2144125

and

crash> bt
PID: 47838    TASK: ffff9f66f5288000  CPU: 9    COMMAND: "mdt00_020"
 #0  __schedule at ffffffffa2331fe8
 #1  schedule at ffffffffa233241a
 #2  rwsem_down_write_slowpath at ffffffffa2334a9b
 #3  osd_write_lock at ffffffffc23b6b4d
 #4  mdd_xattr_del at ffffffffc208cd88
 #5  mdt_reint_setxattr at ffffffffc21224a9
 #6  mdt_reint_rec at ffffffffc211ec69
 #7  mdt_reint_internal at ffffffffc20f0a64
 #8  mdt_reint at ffffffffc20fc6b9
 #9  tgt_handle_request0 at ffffffffc1dea9e7

HPE-bug-id: LUS-12819
Signed-off-by: Alexander Zarochentsev <alexander.zarochentsev@hpe.com>
Change-Id: I8baa3ba1505c7a55e96841524368a66d447d18c5
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/59158
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andrew Perepechko <andrew.perepechko@hpe.com>
Reviewed-by: Arshad Hussain <arshad.hussain@aeoncomputing.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/include/dt_object.h
lustre/mdt/mdt_io.c
lustre/ofd/ofd_objects.c
lustre/osd-ldiskfs/osd_internal.h
lustre/osd-ldiskfs/osd_io.c

index d1c7ea0..90f27c3 100644 (file)
@@ -1454,7 +1454,7 @@ struct dt_body_operations {
         */
        int (*dbo_fallocate)(const struct lu_env *env,
                            struct dt_object *dt,
-                           __u64 start,
+                           __u64 *start,
                            __u64 end,
                            int mode,
                            struct thandle *th);
@@ -2723,7 +2723,7 @@ static inline int dt_declare_fallocate(const struct lu_env *env,
 }
 
 static inline int dt_falloc(const struct lu_env *env, struct dt_object *dt,
-                             __u64 start, __u64 end, int mode,
+                             __u64 *start, __u64 end, int mode,
                              struct thandle *th)
 {
        LASSERT(dt);
index f3cc37c..ba6eb1b 100644 (file)
@@ -844,43 +844,51 @@ static int mdt_object_fallocate(const struct lu_env *env, struct dt_device *dt,
                                struct dt_object *dob, __u64 start, __u64 end,
                                int mode, struct lu_attr *la)
 {
-       struct thandle *th;
        int rc;
+       bool restart;
 
        ENTRY;
 
        if (!dt_object_exists(dob))
                RETURN(-ENOENT);
 
-       th = dt_trans_create(env, dt);
-       if (IS_ERR(th))
-               RETURN(PTR_ERR(th));
+       do {
+               struct thandle *th;
 
-       rc = dt_declare_attr_set(env, dob, la, th);
-       if (rc)
-               GOTO(stop, rc);
+               restart = false;
 
-       rc = dt_declare_fallocate(env, dob, start, end, mode, th);
-       if (rc)
-               GOTO(stop, rc);
+               th = dt_trans_create(env, dt);
+               if (IS_ERR(th))
+                       RETURN(PTR_ERR(th));
 
-       tgt_vbr_obj_data_set(env, dob, true);
-       rc = dt_trans_start(env, dt, th);
-       if (rc)
-               GOTO(stop, rc);
+               rc = dt_declare_attr_set(env, dob, la, th);
+               if (rc)
+                       GOTO(stop, rc);
 
-       dt_write_lock(env, dob, 0);
-       rc = dt_falloc(env, dob, start, end, mode, th);
-       if (rc)
-               GOTO(unlock, rc);
-       rc = dt_attr_set(env, dob, la, th);
-       if (rc)
-               GOTO(unlock, rc);
+               rc = dt_declare_fallocate(env, dob, start, end, mode, th);
+               if (rc)
+                       GOTO(stop, rc);
+
+               tgt_vbr_obj_data_set(env, dob, true);
+               rc = dt_trans_start(env, dt, th);
+               if (rc)
+                       GOTO(stop, rc);
+
+               dt_write_lock(env, dob, 0);
+               rc = dt_falloc(env, dob, &start, end, mode, th);
+               if (rc == -EAGAIN)
+                       restart = true;
+               if (rc)
+                       GOTO(unlock, rc);
+               rc = dt_attr_set(env, dob, la, th);
+               if (rc)
+                       GOTO(unlock, rc);
 unlock:
-       dt_write_unlock(env, dob);
+               dt_write_unlock(env, dob);
 stop:
-       th->th_result = rc;
-       dt_trans_stop(env, dt, th);
+               th->th_result = rc;
+               dt_trans_stop(env, dt, th);
+       } while (restart);
        RETURN(rc);
 }
 
index fa4185d..a11c3c5 100644 (file)
@@ -788,9 +788,9 @@ int ofd_object_fallocate(const struct lu_env *env, struct ofd_object *fo,
        struct ofd_thread_info *info = ofd_info(env);
        struct ofd_device *ofd = ofd_obj2dev(fo);
        struct dt_object *dob = ofd_object_child(fo);
-       struct thandle *th;
        struct filter_fid *ff = &info->fti_mds_fid;
        bool ff_needed = false;
+       bool restart;
        int rc;
 
        ENTRY;
@@ -824,58 +824,67 @@ int ofd_object_fallocate(const struct lu_env *env, struct ofd_object *fo,
                }
        }
 
-       th = ofd_trans_create(env, ofd);
-       if (IS_ERR(th))
-               RETURN(PTR_ERR(th));
+       do {
+               struct thandle *th;
 
-       rc = dt_declare_attr_set(env, dob, la, th);
-       if (rc)
-               GOTO(stop, rc);
+               restart = false;
 
-       rc = dt_declare_fallocate(env, dob, start, end, mode, th);
-       if (rc)
-               GOTO(stop, rc);
+               th = ofd_trans_create(env, ofd);
+               if (IS_ERR(th))
+                       RETURN(PTR_ERR(th));
 
-       if (ff_needed) {
-               info->fti_buf.lb_buf = ff;
-               info->fti_buf.lb_len = sizeof(*ff);
-               rc = dt_declare_xattr_set(env, ofd_object_child(fo),
-                                         &info->fti_buf, XATTR_NAME_FID, 0,
-                                         th);
+               rc = dt_declare_attr_set(env, dob, la, th);
                if (rc)
                        GOTO(stop, rc);
-       }
 
-       rc = ofd_trans_start(env, ofd, fo, th);
-       if (rc)
-               GOTO(stop, rc);
+               if (ff_needed) {
+                       info->fti_buf.lb_buf = ff;
+                       info->fti_buf.lb_len = sizeof(*ff);
+                       rc = dt_declare_xattr_set(env, ofd_object_child(fo),
+                                       &info->fti_buf, XATTR_NAME_FID, 0,
+                                       th);
+                       if (rc)
+                               GOTO(stop, rc);
+               }
 
-       ofd_read_lock(env, fo);
-       if (!ofd_object_exists(fo))
-               GOTO(unlock, rc = -ENOENT);
+               rc = dt_declare_fallocate(env, dob, start, end, mode, th);
+               if (rc)
+                       GOTO(stop, rc);
 
-       if (la->la_valid & (LA_ATIME | LA_MTIME | LA_CTIME))
-               tgt_fmd_update(info->fti_exp, &fo->ofo_header.loh_fid,
-                              info->fti_xid);
+               rc = ofd_trans_start(env, ofd, fo, th);
+               if (rc)
+                       GOTO(stop, rc);
 
-       rc = dt_falloc(env, dob, start, end, mode, th);
-       if (rc)
-               GOTO(unlock, rc);
+               ofd_read_lock(env, fo);
+               if (!ofd_object_exists(fo))
+                       GOTO(unlock, rc = -ENOENT);
 
-       rc = dt_attr_set(env, dob, la, th);
-       if (rc)
-               GOTO(unlock, rc);
+               if (la->la_valid & (LA_ATIME | LA_MTIME | LA_CTIME))
+                       tgt_fmd_update(info->fti_exp, &fo->ofo_header.loh_fid,
+                                       info->fti_xid);
 
-       if (ff_needed) {
-               rc = dt_xattr_set(env, ofd_object_child(fo), &info->fti_buf,
-                                 XATTR_NAME_FID, 0, th);
-               if (!rc)
-                       filter_fid_le_to_cpu(&fo->ofo_ff, ff, sizeof(*ff));
-       }
+               rc = dt_falloc(env, dob, &start, end, mode, th);
+               if (rc == -EAGAIN)
+                       restart = true;
+               if (rc)
+                       GOTO(unlock, rc);
+
+               rc = dt_attr_set(env, dob, la, th);
+               if (rc)
+                       GOTO(unlock, rc);
+
+               if (ff_needed) {
+                       rc = dt_xattr_set(env, ofd_object_child(fo),
+                                       &info->fti_buf, XATTR_NAME_FID, 0, th);
+                       if (!rc)
+                               filter_fid_le_to_cpu(&fo->ofo_ff, ff,
+                                                    sizeof(*ff));
+               }
 unlock:
-       ofd_read_unlock(env, fo);
+               ofd_read_unlock(env, fo);
 stop:
-       ofd_trans_stop(env, ofd, th, rc);
+               ofd_trans_stop(env, ofd, th, rc);
+       } while (restart);
        RETURN(rc);
 }
 
index 999fd07..000c8c6 100644 (file)
@@ -408,6 +408,12 @@ struct osd_thandle {
        struct list_head       ot_commit_dcb_list;
        struct list_head       ot_stop_dcb_list;
        unsigned int            ot_credits;
+       /*
+        * For iterative operation within one transaction,
+        * for example fallocate, it is a number of credits
+        * enough for completing at least one iteration.
+        */
+       unsigned int            ot_credits_iter;
        unsigned int            oh_declared_ext;
 
        /* quota IDs related to the transaction */
index 95dadf9..9ad5ccd 100644 (file)
@@ -810,21 +810,18 @@ cleanup:
 }
 
 #ifdef HAVE_LDISKFS_JOURNAL_ENSURE_CREDITS
-static int osd_extend_restart_trans(handle_t *handle, int needed,
+static int osd_extend_trans(handle_t *handle, int needed,
                                    struct inode *inode)
 {
        int rc;
 
-       rc = ldiskfs_journal_ensure_credits(handle, needed,
+       rc = __ldiskfs_journal_ensure_credits(handle, needed, 2 * needed,
                ldiskfs_trans_default_revoke_credits(inode->i_sb));
-       /* this means journal has been restarted */
-       if (rc > 0)
-               rc = 0;
 
        return rc;
 }
 #else
-static int osd_extend_restart_trans(handle_t *handle, int needed,
+static int osd_extend_trans(handle_t *handle, int needed,
                                    struct inode *inode)
 {
        int rc;
@@ -833,10 +830,7 @@ static int osd_extend_restart_trans(handle_t *handle, int needed,
                return 0;
        rc = ldiskfs_journal_extend(handle,
                                needed - handle->h_buffer_credits);
-       if (rc <= 0)
-               return rc;
-
-       return ldiskfs_journal_restart(handle, needed);
+       return rc;
 }
 #endif /* HAVE_LDISKFS_JOURNAL_ENSURE_CREDITS */
 
@@ -2137,19 +2131,31 @@ static int osd_declare_fallocate(const struct lu_env *env,
                /* quota space should be reported in 1K blocks */
                quota_space = toqb(quota_space) + toqb(end - start) +
                        LDISKFS_META_TRANS_BLOCKS(inode->i_sb);
-
-               /*
-                * We don't need to reserve credits for whole fallocate here.
-                * We reserve space only for metadata. Fallocate credits are
-                * extended as required
-                */
        }
+
        rc = osd_declare_inode_qid(env, i_uid_read(inode), i_gid_read(inode),
                                   i_projid_read(inode), quota_space, oh,
                                   osd_dt_obj(dt), NULL, OSD_QID_BLK);
        if (rc)
                RETURN(rc);
 
+       if ((mode & FALLOC_FL_PUNCH_HOLE) == 0) {
+               unsigned int crds_per_ext;
+               ldiskfs_lblk_t blen;
+
+               blen = osd_i_blocks(inode, ALIGN(end, 1 << inode->i_blkbits)) -
+                      osd_i_blocks(inode, start);
+
+               crds_per_ext = ldiskfs_chunk_trans_blocks(inode, blen);
+               /*
+                * allow one more fallocate iteration when num credits
+                * enough to insert one extend + quota/xattrs updates
+                */
+               oh->ot_credits_iter = oh->ot_credits + crds_per_ext;
+               /* at tx start, reserve tx space for 5 extent inserts */
+               oh->ot_credits += 5 * crds_per_ext;
+       }
+
        /*
         * The both hole punch and allocation may need few transactions
         * to complete, so we have to avoid concurrent writes/truncates
@@ -2165,16 +2171,14 @@ static int osd_declare_fallocate(const struct lu_env *env,
 
 static int osd_fallocate_preallocate(const struct lu_env *env,
                                     struct dt_object *dt,
-                                    __u64 start, __u64 end, int mode,
+                                    __u64 *start, __u64 end, int mode,
                                     struct thandle *th)
 {
        struct osd_thandle *oh = container_of(th, struct osd_thandle, ot_super);
        handle_t *handle = ldiskfs_journal_current_handle();
-       unsigned int save_credits = oh->ot_credits;
        struct osd_object *obj = osd_dt_obj(dt);
        struct inode *inode = obj->oo_inode;
        struct ldiskfs_map_blocks map;
-       unsigned int credits;
        ldiskfs_lblk_t blen;
        ldiskfs_lblk_t boff;
        loff_t new_size = 0;
@@ -2189,13 +2193,13 @@ static int osd_fallocate_preallocate(const struct lu_env *env,
        LASSERT(inode != NULL);
 
        CDEBUG(D_INODE, "fallocate: inode #%lu: start %llu end %llu mode %d\n",
-              inode->i_ino, start, end, mode);
+              inode->i_ino, *start, end, mode);
 
        dquot_initialize(inode);
 
        LASSERT(th);
 
-       boff = osd_i_blocks(inode, start);
+       boff = osd_i_blocks(inode, *start);
        blen = osd_i_blocks(inode, ALIGN(end, 1 << inode->i_blkbits)) - boff;
 
        /* Create and mark new extents as either zero or unwritten */
@@ -2228,36 +2232,11 @@ static int osd_fallocate_preallocate(const struct lu_env *env,
        if (blen <= EXT_UNWRITTEN_MAX_LEN)
                flags |= LDISKFS_GET_BLOCKS_NO_NORMALIZE;
 
-       /*
-        * credits to insert 1 extent into extent tree.
-        */
-       credits = ldiskfs_chunk_trans_blocks(inode, blen);
        depth = ext_depth(inode);
 
        while (rc >= 0 && blen) {
                loff_t epos;
 
-               /*
-                * Recalculate credits when extent tree depth changes.
-                */
-               if (depth != ext_depth(inode)) {
-                       credits = ldiskfs_chunk_trans_blocks(inode, blen);
-                       depth = ext_depth(inode);
-               }
-
-               /* TODO: quota check */
-               if (handle->h_transaction->t_state == T_RUNNING) {
-                       rc = osd_extend_restart_trans(handle, credits, inode);
-               } else {
-                       rc = ldiskfs_journal_restart(handle, credits
-#ifdef HAVE_LDISKFS_JOURNAL_ENSURE_CREDITS
-                               ,ldiskfs_trans_default_revoke_credits(inode->i_sb)
-#endif
-                               );
-               }
-               if (rc)
-                       break;
-
                rc = ldiskfs_map_blocks(handle, inode, &map, flags);
                if (rc <= 0) {
                        CDEBUG(D_INODE,
@@ -2268,6 +2247,7 @@ static int osd_fallocate_preallocate(const struct lu_env *env,
                }
 
                map.m_lblk += rc;
+               *start += rc;
                map.m_len = blen = blen - rc;
                epos = (loff_t)map.m_lblk << inode->i_blkbits;
                inode_set_ctime_current(inode);
@@ -2286,20 +2266,32 @@ static int osd_fallocate_preallocate(const struct lu_env *env,
                }
 
                ldiskfs_mark_inode_dirty(handle, inode);
-       }
 
-out:
-       /* extand credits if needed for operations such as attribute set */
-       if (rc >= 0)
-               rc = osd_extend_restart_trans(handle, save_credits, inode);
+               /* do not attempt to extend an old transaction */
+               if (handle->h_transaction->t_state != T_RUNNING)
+                       GOTO(out, rc = -EAGAIN);
 
+               /*
+                * Recalculate credits when extent tree depth changes.
+                */
+               if (depth != ext_depth(inode))
+                       GOTO(out, rc = -EAGAIN);
+
+               rc = osd_extend_trans(handle, oh->ot_credits_iter, inode);
+               if (rc > 0)
+                       GOTO(out, rc = -EAGAIN);
+               if (rc)
+                       GOTO(out, rc);
+}
+
+out:
        inode_unlock(inode);
 
        RETURN(rc);
 }
 
 static int osd_fallocate_advance(const struct lu_env *env, struct dt_object *dt,
-                                 __u64 start, __u64 end, int mode,
+                                 __u64 *start, __u64 end, int mode,
                                  struct thandle *th)
 {
        struct osd_object *obj = osd_dt_obj(dt);
@@ -2326,7 +2318,7 @@ static int osd_fallocate_advance(const struct lu_env *env, struct dt_object *dt,
                LASSERT(al->tl_shared == 0);
                found = 1;
                /* do actual punch/zero in osd_trans_stop() */
-               al->tl_start = start;
+               al->tl_start = *start;
                al->tl_end = end;
                al->tl_mode = mode;
                al->tl_fallocate = true;
@@ -2337,7 +2329,7 @@ static int osd_fallocate_advance(const struct lu_env *env, struct dt_object *dt,
 }
 
 static int osd_fallocate(const struct lu_env *env, struct dt_object *dt,
-                        __u64 start, __u64 end, int mode, struct thandle *th)
+                        __u64 *start, __u64 end, int mode, struct thandle *th)
 {
        int rc;