Whamcloud - gitweb
LU-14187 osd-ldiskfs: fix locking in write commit 91/40991/12
authorWang Shilong <wshilong@ddn.com>
Wed, 16 Dec 2020 07:10:56 +0000 (15:10 +0800)
committerOleg Drokin <green@whamcloud.com>
Tue, 5 Jan 2021 08:29:08 +0000 (08:29 +0000)
Restarting the transaction in the osd layer breaks the rule that
locks are taken after the transaction has started.

This patch fixes that by moving the transaction restart
to the OFD layer.

We record how many extents we declared in
@oh_declared_ext. During IO we check whether they have
run out, and only then restart the whole transaction.

We track the average extent size in bytes per filesystem and
use it as a hint when declaring write commit. This
avoids restarting the transaction often when the filesystem
is heavily fragmented.

Fixes: 0271b17b ("LU-14134 osd-ldiskfs: reduce credits for new writing")
Signed-off-by: Wang Shilong <wshilong@ddn.com>
Change-Id: I289a3a6775befe159b7fef29004eaaaff873e2c6
Reviewed-on: https://review.whamcloud.com/40991
Tested-by: jenkins <devops@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Alex Zhuravlev <bzzz@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
libcfs/libcfs/libcfs_mem.c
lustre/include/dt_object.h
lustre/include/lustre_compat.h
lustre/include/obd_support.h
lustre/include/uapi/linux/lustre/lustre_idl.h
lustre/mdt/mdt_io.c
lustre/ofd/ofd_io.c
lustre/osd-ldiskfs/osd_handler.c
lustre/osd-ldiskfs/osd_internal.h
lustre/osd-ldiskfs/osd_io.c
lustre/tests/sanity.sh

index 757cc19..15542bb 100644 (file)
@@ -35,6 +35,7 @@
 
 #include <linux/workqueue.h>
 #include <libcfs/libcfs.h>
 
 #include <linux/workqueue.h>
 #include <libcfs/libcfs.h>
+#include <lustre_compat.h>
 
 struct cfs_var_array {
        unsigned int            va_count;       /* # of buffers */
 
 struct cfs_var_array {
        unsigned int            va_count;       /* # of buffers */
@@ -177,10 +178,6 @@ EXPORT_SYMBOL(cfs_array_alloc);
  * minimum changes needed to work on older kernels too.
  */
 
  * minimum changes needed to work on older kernels too.
  */
 
-#ifndef raw_cpu_ptr
-#define raw_cpu_ptr(p) __this_cpu_ptr(p)
-#endif
-
 #ifndef llist_for_each_safe
 #define llist_for_each_safe(pos, n, node)                       \
         for ((pos) = (node); (pos) && ((n) = (pos)->next, true); (pos) = (n))
 #ifndef llist_for_each_safe
 #define llist_for_each_safe(pos, n, node)                       \
         for ((pos) = (node); (pos) && ((n) = (pos)->next, true); (pos) = (n))
index e1d574a..b5c0148 100644 (file)
@@ -2000,7 +2000,9 @@ struct thandle {
         * including OSTs */
                                th_complex:1,
        /* whether ignore quota */
         * including OSTs */
                                th_complex:1,
        /* whether ignore quota */
-                               th_ignore_quota:1;
+                               th_ignore_quota:1,
+       /* whether restart transaction */
+                               th_restart_tran:1;
 };
 
 /**
 };
 
 /**
index a39b617..50b0399 100644 (file)
@@ -576,4 +576,8 @@ static inline int ll_vfs_removexattr(struct dentry *dentry, struct inode *inode,
 #define FALLOC_FL_INSERT_RANGE 0x20 /* insert space within file */
 #endif
 
 #define FALLOC_FL_INSERT_RANGE 0x20 /* insert space within file */
 #endif
 
+#ifndef raw_cpu_ptr
+#define raw_cpu_ptr(p) __this_cpu_ptr(p)
+#endif
+
 #endif /* _LUSTRE_COMPAT_H */
 #endif /* _LUSTRE_COMPAT_H */
index b871dad..7160e54 100644 (file)
@@ -343,6 +343,7 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_OST_FALLOCATE_NET      0x249
 #define OBD_FAIL_OST_SEEK_NET           0x24a
 #define OBD_FAIL_OST_WR_ATTR_DELAY      0x250
 #define OBD_FAIL_OST_FALLOCATE_NET      0x249
 #define OBD_FAIL_OST_SEEK_NET           0x24a
 #define OBD_FAIL_OST_WR_ATTR_DELAY      0x250
+#define OBD_FAIL_OST_RESTART_IO                 0x251
 
 #define OBD_FAIL_LDLM                    0x300
 #define OBD_FAIL_LDLM_NAMESPACE_NEW      0x301
 
 #define OBD_FAIL_LDLM                    0x300
 #define OBD_FAIL_LDLM_NAMESPACE_NEW      0x301
index caeb244..84f118d 100644 (file)
@@ -1391,12 +1391,16 @@ struct hsm_state_set {
                               OBD_BRW_OVER_GRPQUOTA | \
                               OBD_BRW_OVER_PRJQUOTA)
 
                               OBD_BRW_OVER_GRPQUOTA | \
                               OBD_BRW_OVER_PRJQUOTA)
 
+#define OBD_BRW_DONE   0x40000000UL   /*
+                                       * osd-ldiskfs internal,
+                                       * IO has been issued before
+                                       */
 #define OBD_BRW_LOCAL1 0x80000000UL    /*
                                         * osd-ldiskfs internal,
                                         * page mapped to real block
                                         */
 
 #define OBD_BRW_LOCAL1 0x80000000UL    /*
                                         * osd-ldiskfs internal,
                                         * page mapped to real block
                                         */
 
-#define OBD_BRW_LOCALS (OBD_BRW_LOCAL1)
+#define OBD_BRW_LOCALS (OBD_BRW_LOCAL1 | OBD_BRW_DONE)
 
 #define OBD_MAX_GRANT 0x7fffffffUL /* Max grant allowed to one client: 2 GiB */
 
 
 #define OBD_MAX_GRANT 0x7fffffffUL /* Max grant allowed to one client: 2 GiB */
 
index 929563b..c80d602 100644 (file)
@@ -595,7 +595,7 @@ static int mdt_commitrw_write(const struct lu_env *env, struct obd_export *exp,
        struct thandle *th;
        int rc = 0;
        int retries = 0;
        struct thandle *th;
        int rc = 0;
        int retries = 0;
-       int i;
+       int i, restart = 0;
 
        ENTRY;
 
 
        ENTRY;
 
@@ -655,8 +655,10 @@ retry:
 
        dt_write_lock(env, dob, 0);
        rc = dt_write_commit(env, dob, lnb, niocount, th, oa->o_size);
 
        dt_write_lock(env, dob, 0);
        rc = dt_write_commit(env, dob, lnb, niocount, th, oa->o_size);
-       if (rc)
+       if (rc) {
+               restart = th->th_restart_tran;
                GOTO(unlock, rc);
                GOTO(unlock, rc);
+       }
 
        if (la->la_valid) {
                rc = dt_attr_set(env, dob, la, th);
 
        if (la->la_valid) {
                rc = dt_attr_set(env, dob, la, th);
@@ -680,13 +682,23 @@ out_stop:
                        granted = 0;
        }
 
                        granted = 0;
        }
 
-       th->th_result = rc;
+       th->th_result = restart ? 0 : rc;
        dt_trans_stop(env, dt, th);
        if (rc == -ENOSPC && retries++ < 3) {
                CDEBUG(D_INODE, "retry after force commit, retries:%d\n",
                       retries);
                goto retry;
        }
        dt_trans_stop(env, dt, th);
        if (rc == -ENOSPC && retries++ < 3) {
                CDEBUG(D_INODE, "retry after force commit, retries:%d\n",
                       retries);
                goto retry;
        }
+       if (restart) {
+               retries++;
+               restart = 0;
+               if (retries % 10000 == 0)
+                       CERROR("%s: restart IO write too many times: %d\n",
+                              exp->exp_obd->obd_name, retries);
+               CDEBUG(D_INODE, "retry transaction, retries:%d\n",
+                      retries);
+               goto retry;
+       }
 
 out:
        dt_bufs_put(env, dob, lnb, niocount);
 
 out:
        dt_bufs_put(env, dob, lnb, niocount);
index 44d2fa7..90d1807 100644 (file)
@@ -1216,7 +1216,7 @@ ofd_commitrw_write(const struct lu_env *env, struct obd_export *exp,
        int rc = 0;
        int rc2 = 0;
        int retries = 0;
        int rc = 0;
        int rc2 = 0;
        int retries = 0;
-       int i;
+       int i, restart = 0;
        bool soft_sync = false;
        bool cb_registered = false;
        bool fake_write = false;
        bool soft_sync = false;
        bool cb_registered = false;
        bool fake_write = false;
@@ -1329,8 +1329,10 @@ retry:
                OBD_FAIL_TIMEOUT_ORSET(OBD_FAIL_OST_WR_ATTR_DELAY,
                                       OBD_FAIL_ONCE, cfs_fail_val);
                rc = dt_write_commit(env, o, lnb, niocount, th, oa->o_size);
                OBD_FAIL_TIMEOUT_ORSET(OBD_FAIL_OST_WR_ATTR_DELAY,
                                       OBD_FAIL_ONCE, cfs_fail_val);
                rc = dt_write_commit(env, o, lnb, niocount, th, oa->o_size);
-               if (rc)
+               if (rc) {
+                       restart = th->th_restart_tran;
                        GOTO(out_unlock, rc);
                        GOTO(out_unlock, rc);
+               }
        }
 
        /* get attr to return */
        }
 
        /* get attr to return */
@@ -1355,7 +1357,7 @@ out_stop:
                        granted = 0;
        }
 
                        granted = 0;
        }
 
-       rc2 = ofd_trans_stop(env, ofd, th, rc);
+       rc2 = ofd_trans_stop(env, ofd, th, restart ? 0 : rc);
        if (!rc)
                rc = rc2;
        if (rc == -ENOSPC && retries++ < 3) {
        if (!rc)
                rc = rc2;
        if (rc == -ENOSPC && retries++ < 3) {
@@ -1364,6 +1366,16 @@ out_stop:
                goto retry;
        }
 
                goto retry;
        }
 
+       if (restart) {
+               retries++;
+               restart = 0;
+               if (retries % 10000 == 0)
+                       CERROR("%s: restart IO write too many times: %d\n",
+                               ofd_name(ofd), retries);
+               CDEBUG(D_INODE, "retry transaction, retries:%d\n",
+                      retries);
+               goto retry;
+       }
        if (!soft_sync)
                /* reset fed_soft_sync_count upon non-SOFT_SYNC RPC */
                atomic_set(&fed->fed_soft_sync_count, 0);
        if (!soft_sync)
                /* reset fed_soft_sync_count upon non-SOFT_SYNC RPC */
                atomic_set(&fed->fed_soft_sync_count, 0);
index 24db85c..bd03e70 100644 (file)
@@ -1832,6 +1832,7 @@ static struct thandle *osd_trans_create(const struct lu_env *env,
        th->th_dev = d;
        th->th_result = 0;
        oh->ot_credits = 0;
        th->th_dev = d;
        th->th_result = 0;
        oh->ot_credits = 0;
+       oh->oh_declared_ext = 0;
        INIT_LIST_HEAD(&oh->ot_commit_dcb_list);
        INIT_LIST_HEAD(&oh->ot_stop_dcb_list);
        INIT_LIST_HEAD(&oh->ot_trunc_locks);
        INIT_LIST_HEAD(&oh->ot_commit_dcb_list);
        INIT_LIST_HEAD(&oh->ot_stop_dcb_list);
        INIT_LIST_HEAD(&oh->ot_trunc_locks);
@@ -7860,6 +7861,8 @@ static struct lu_device *osd_device_fini(const struct lu_env *env,
        osd_procfs_fini(o);
        if (o->od_oi_table != NULL)
                osd_oi_fini(osd_oti_get(env), o);
        osd_procfs_fini(o);
        if (o->od_oi_table != NULL)
                osd_oi_fini(osd_oti_get(env), o);
+       if (o->od_extent_bytes_percpu)
+               free_percpu(o->od_extent_bytes_percpu);
        osd_obj_map_fini(o);
        osd_umount(env, o);
 
        osd_obj_map_fini(o);
        osd_umount(env, o);
 
@@ -7983,6 +7986,12 @@ static int osd_device_init0(const struct lu_env *env,
                GOTO(out_procfs, rc);
        }
 
                GOTO(out_procfs, rc);
        }
 
+       o->od_extent_bytes_percpu = alloc_percpu(unsigned int);
+       if (!o->od_extent_bytes_percpu) {
+               rc = -ENOMEM;
+               GOTO(out_procfs, rc);
+       }
+
        RETURN(0);
 
 out_procfs:
        RETURN(0);
 
 out_procfs:
index f374321..7d2f970 100644 (file)
@@ -91,6 +91,9 @@ extern struct kmem_cache *dynlock_cachep;
 #define OSD_STATFS_RESERVED            (1ULL << 23) /* 8MB */
 #define OSD_STATFS_RESERVED_SHIFT      (7) /* reserve 0.78% of all space */
 
 #define OSD_STATFS_RESERVED            (1ULL << 23) /* 8MB */
 #define OSD_STATFS_RESERVED_SHIFT      (7) /* reserve 0.78% of all space */
 
+/* Default extent bytes when declaring write commit */
+#define OSD_DEFAULT_EXTENT_BYTES       (1U << 20)
+
 /* check if ldiskfs support project quota */
 #ifndef LDISKFS_IOC_FSSETXATTR
 #undef HAVE_PROJECT_QUOTA
 /* check if ldiskfs support project quota */
 #ifndef LDISKFS_IOC_FSSETXATTR
 #undef HAVE_PROJECT_QUOTA
@@ -345,6 +348,7 @@ struct osd_device {
        enum osd_t10_type        od_t10_type;
        atomic_t                 od_commit_cb_in_flight;
        wait_queue_head_t        od_commit_cb_done;
        enum osd_t10_type        od_t10_type;
        atomic_t                 od_commit_cb_in_flight;
        wait_queue_head_t        od_commit_cb_done;
+       unsigned int __percpu   *od_extent_bytes_percpu;
 };
 
 static inline struct qsd_instance *osd_def_qsd(struct osd_device *osd)
 };
 
 static inline struct qsd_instance *osd_def_qsd(struct osd_device *osd)
@@ -413,6 +417,7 @@ struct osd_thandle {
        /* Link to the device, for debugging. */
        struct lu_ref_link      ot_dev_link;
        unsigned int            ot_credits;
        /* Link to the device, for debugging. */
        struct lu_ref_link      ot_dev_link;
        unsigned int            ot_credits;
+       unsigned int            oh_declared_ext;
 
        /* quota IDs related to the transaction */
        unsigned short          ot_id_cnt;
 
        /* quota IDs related to the transaction */
        unsigned short          ot_id_cnt;
index deaa0af..b9660d9 100644 (file)
@@ -440,6 +440,29 @@ static int osd_bio_init(struct bio *bio, struct osd_iobuf *iobuf,
        RETURN(0);
 }
 
        RETURN(0);
 }
 
+static void osd_mark_page_io_done(struct osd_iobuf *iobuf,
+                                 struct inode *inode,
+                                 sector_t start_blocks,
+                                 sector_t count)
+{
+       struct niobuf_local *lnb;
+       int blocks_per_page = PAGE_SIZE >> inode->i_blkbits;
+       pgoff_t pg_start, pg_end;
+       /*
+        * Flag with OBD_BRW_DONE every iobuf->dr_lnbs[] entry whose page
+        * lies entirely inside [start_blocks, start_blocks + count):
+        * pg_start is rounded up to the first whole page in the range and
+        * pg_end is the last page that still ends within it.  A range
+        * shorter than one page marks nothing.
+        * NOTE(review): dr_lnbs[] is indexed directly by these page
+        * numbers, so start_blocks is presumably relative to the start
+        * of the iobuf rather than to the file -- confirm with callers.
+        */
+       pg_start = start_blocks / blocks_per_page;
+       if (start_blocks % blocks_per_page)
+               pg_start++;
+       if (count >= blocks_per_page)
+               pg_end = (start_blocks + count -
+                         blocks_per_page) / blocks_per_page;
+       else
+               return; /* nothing to mark */
+       for ( ; pg_start <= pg_end; pg_start++) {
+               lnb = iobuf->dr_lnbs[pg_start];
+               lnb->lnb_flags |= OBD_BRW_DONE;
+       }
+}
+
 static int osd_do_bio(struct osd_device *osd, struct inode *inode,
                      struct osd_iobuf *iobuf, sector_t start_blocks,
                      sector_t count)
 static int osd_do_bio(struct osd_device *osd, struct inode *inode,
                      struct osd_iobuf *iobuf, sector_t start_blocks,
                      sector_t count)
@@ -612,6 +635,11 @@ out:
                        OBD_FREE_PTR(bio_private);
        }
 
                        OBD_FREE_PTR(bio_private);
        }
 
+       /* Write only now */
+       if (rc == 0 && iobuf->dr_rw)
+               osd_mark_page_io_done(iobuf, inode,
+                                     start_blocks, count);
+
        RETURN(rc);
 }
 
        RETURN(rc);
 }
 
@@ -932,13 +960,6 @@ static int osd_chunk_trans_blocks(struct inode *inode, int nrblocks)
 }
 
 #ifdef HAVE_LDISKFS_JOURNAL_ENSURE_CREDITS
 }
 
 #ifdef HAVE_LDISKFS_JOURNAL_ENSURE_CREDITS
-static int osd_extend_trans(handle_t *handle, int needed,
-                           struct inode *inode)
-{
-       return  __ldiskfs_journal_ensure_credits(handle, needed, needed,
-               ldiskfs_trans_default_revoke_credits(inode->i_sb));
-}
-
 static int osd_extend_restart_trans(handle_t *handle, int needed,
                                    struct inode *inode)
 {
 static int osd_extend_restart_trans(handle_t *handle, int needed,
                                    struct inode *inode)
 {
@@ -953,22 +974,15 @@ static int osd_extend_restart_trans(handle_t *handle, int needed,
        return rc;
 }
 #else
        return rc;
 }
 #else
-static int osd_extend_trans(handle_t *handle, int needed,
-                           struct inode *inode)
-{
-       if (ldiskfs_handle_has_enough_credits(handle, needed))
-               return 0;
-
-       return ldiskfs_journal_extend(handle,
-                                     needed - handle->h_buffer_credits);
-}
-
 static int osd_extend_restart_trans(handle_t *handle, int needed,
                                    struct inode *inode)
 {
 static int osd_extend_restart_trans(handle_t *handle, int needed,
                                    struct inode *inode)
 {
+       int rc;
 
 
-       int rc = osd_extend_trans(handle, needed, inode);
-
+       if (ldiskfs_handle_has_enough_credits(handle, needed))
+               return 0;
+       rc = ldiskfs_journal_extend(handle,
+                               needed - handle->h_buffer_credits);
        if (rc <= 0)
                return rc;
 
        if (rc <= 0)
                return rc;
 
@@ -1002,12 +1016,45 @@ static int osd_ldiskfs_map_write(struct inode *inode, struct osd_iobuf *iobuf,
        return osd_do_bio(osd, inode, iobuf, start_blocks, count);
 }
 
        return osd_do_bio(osd, inode, iobuf, start_blocks, count);
 }
 
+static unsigned int osd_extent_bytes(const struct osd_device *o)
+{
+       unsigned int *extent_bytes_ptr =
+                       raw_cpu_ptr(o->od_extent_bytes_percpu);
+       /*
+        * Return the extent-size hint, in bytes, stored in this device's
+        * per-CPU od_extent_bytes_percpu slot.  A stored value of zero is
+        * treated as "not initialized yet" and is filled in below.
+        */
+       if (likely(*extent_bytes_ptr))
+               return *extent_bytes_ptr;
+
+       /*
+        * initialize on first access or CPU hotplug: one filesystem block
+        * when the extents feature is off, otherwise
+        * OSD_DEFAULT_EXTENT_BYTES
+        */
+       if (!ldiskfs_has_feature_extents(osd_sb(o)))
+               *extent_bytes_ptr = 1 << osd_sb(o)->s_blocksize_bits;
+       else
+               *extent_bytes_ptr = OSD_DEFAULT_EXTENT_BYTES;
+
+       return *extent_bytes_ptr;
+}
+/*
+ * Fold a newly observed extent size into the per-CPU extent-size hint.
+ *
+ * The stored value becomes the weighted average
+ *   (old * (EXTENT_BYTES_DECAY - 1) + min(new_bytes, default)) /
+ *   EXTENT_BYTES_DECAY
+ * rounded up, where "default" is OSD_DEFAULT_EXTENT_BYTES, so a single
+ * sample moves the hint by at most 1/EXTENT_BYTES_DECAY of the gap.
+ * Nothing is done when the filesystem lacks the extents feature.
+ *
+ * \param osd        device whose hint is updated
+ * \param new_bytes  size in bytes of the extent just observed
+ */
+#define EXTENT_BYTES_DECAY 64
+static void osd_decay_extent_bytes(struct osd_device *osd,
+                                  unsigned int new_bytes)
+{
+       unsigned int old_bytes;
+
+       if (!ldiskfs_has_feature_extents(osd_sb(osd)))
+               return;
+
+       old_bytes = osd_extent_bytes(osd);
+       *raw_cpu_ptr(osd->od_extent_bytes_percpu) =
+               (old_bytes * (EXTENT_BYTES_DECAY - 1) +
+                min(new_bytes, OSD_DEFAULT_EXTENT_BYTES) +
+                EXTENT_BYTES_DECAY - 1) / EXTENT_BYTES_DECAY;
+}
 
 static int osd_ldiskfs_map_inode_pages(struct inode *inode,
                                       struct osd_iobuf *iobuf,
                                       struct osd_device *osd,
                                       int create, __u64 user_size,
 
 static int osd_ldiskfs_map_inode_pages(struct inode *inode,
                                       struct osd_iobuf *iobuf,
                                       struct osd_device *osd,
                                       int create, __u64 user_size,
-                                      int check_credits)
+                                      int check_credits,
+                                      struct thandle *thandle)
 {
        int blocks_per_page = PAGE_SIZE >> inode->i_blkbits;
        int rc = 0, i = 0, mapped_index = 0;
 {
        int blocks_per_page = PAGE_SIZE >> inode->i_blkbits;
        int rc = 0, i = 0, mapped_index = 0;
@@ -1015,7 +1062,6 @@ static int osd_ldiskfs_map_inode_pages(struct inode *inode,
        int clen = 0;
        pgoff_t max_page_index;
        handle_t *handle = NULL;
        int clen = 0;
        pgoff_t max_page_index;
        handle_t *handle = NULL;
-       int credits;
        sector_t start_blocks = 0, count = 0;
        loff_t disk_size = 0;
        struct page **page = iobuf->dr_pages;
        sector_t start_blocks = 0, count = 0;
        loff_t disk_size = 0;
        struct page **page = iobuf->dr_pages;
@@ -1075,37 +1121,30 @@ cont_map:
                 * transaction to make sure consistency.
                 */
                if (handle && check_credits) {
                 * transaction to make sure consistency.
                 */
                if (handle && check_credits) {
-                       /*
-                        * credits to insert 1 extent into extent tree.
-                        */
-                       credits = osd_chunk_trans_blocks(inode, blen);
-                       rc = osd_extend_trans(handle, credits, inode);
-                       if (rc < 0)
-                               GOTO(cleanup, rc);
+                       struct osd_thandle *oh;
+
+                       LASSERT(thandle != NULL);
+                       oh = container_of(thandle, struct osd_thandle,
+                                         ot_super);
                        /*
                         * only issue IO if restart transaction needed,
                         * as update disk size need hold inode lock, we
                         * want to avoid that as much as possible.
                         */
                        /*
                         * only issue IO if restart transaction needed,
                         * as update disk size need hold inode lock, we
                         * want to avoid that as much as possible.
                         */
-                       if (rc > 0) {
-                               WARN_ON_ONCE(start_blocks == 0);
+                       if (oh->oh_declared_ext <= 0) {
                                rc = osd_ldiskfs_map_write(inode,
                                        iobuf, osd, start_blocks,
                                        count, &disk_size, user_size);
                                if (rc)
                                        GOTO(cleanup, rc);
                                rc = osd_ldiskfs_map_write(inode,
                                        iobuf, osd, start_blocks,
                                        count, &disk_size, user_size);
                                if (rc)
                                        GOTO(cleanup, rc);
-#ifdef HAVE_LDISKFS_JOURNAL_ENSURE_CREDITS
-                               rc = ldiskfs_journal_restart(handle, credits,
-                                       ldiskfs_trans_default_revoke_credits(inode->i_sb));
-#else
-                               rc = ldiskfs_journal_restart(handle, credits);
-#endif
-                               if (rc)
-                                       GOTO(cleanup, rc);
-                               start_blocks += count;
-                               /* reset IO block count */
-                               count = 0;
+                               thandle->th_restart_tran = 1;
+                               GOTO(cleanup, rc = -EAGAIN);
                        }
                        }
+
+                       if (OBD_FAIL_CHECK(OBD_FAIL_OST_RESTART_IO))
+                               oh->oh_declared_ext = 0;
+                       else
+                               oh->oh_declared_ext--;
                }
                rc = ldiskfs_map_blocks(handle, inode, &map, create);
                if (rc >= 0) {
                }
                rc = ldiskfs_map_blocks(handle, inode, &map, create);
                if (rc >= 0) {
@@ -1148,6 +1187,12 @@ cont_map:
                }
 
                if (rc == 0 && total < blen) {
                }
 
                if (rc == 0 && total < blen) {
+                       /*
+                        * decay extent blocks if we could not
+                        * allocate extent once.
+                        */
+                       osd_decay_extent_bytes(osd,
+                               (total - previous_total) << inode->i_blkbits);
                        map.m_lblk = fp->index * blocks_per_page + total;
                        map.m_len = blen - total;
                        previous_total = total;
                        map.m_lblk = fp->index * blocks_per_page + total;
                        map.m_len = blen - total;
                        previous_total = total;
@@ -1155,7 +1200,14 @@ cont_map:
                }
                if (rc != 0)
                        GOTO(cleanup, rc);
                }
                if (rc != 0)
                        GOTO(cleanup, rc);
-
+               /*
+                * decay extent blocks if we could allocate
+                * good large(1M) extent.
+                */
+               if (previous_total == 0 &&
+                   total >= OSD_DEFAULT_EXTENT_BYTES >> inode->i_blkbits)
+                       osd_decay_extent_bytes(osd,
+                                              total << inode->i_blkbits);
                /* look for next extent */
                fp = NULL;
                blocks += blocks_per_page * clen;
                /* look for next extent */
                fp = NULL;
                blocks += blocks_per_page * clen;
@@ -1227,7 +1279,7 @@ static int osd_write_prep(const struct lu_env *env, struct dt_object *dt,
 
        if (iobuf->dr_npages) {
                rc = osd_ldiskfs_map_inode_pages(inode, iobuf, osd, 0,
 
        if (iobuf->dr_npages) {
                rc = osd_ldiskfs_map_inode_pages(inode, iobuf, osd, 0,
-                                                0, 0);
+                                                0, 0, NULL);
                if (likely(rc == 0)) {
                        rc = osd_do_bio(osd, inode, iobuf, 0, 0);
                        /* do IO stats for preparation reads */
                if (likely(rc == 0)) {
                        rc = osd_do_bio(osd, inode, iobuf, 0, 0);
                        /* do IO stats for preparation reads */
@@ -1308,12 +1360,21 @@ static int osd_declare_write_commit(const struct lu_env *env,
        struct osd_fextent      mapped = { 0 }, extent = { 0 };
        enum osd_quota_local_flags local_flags = 0;
        enum osd_qid_declare_flags declare_flags = OSD_QID_BLK;
        struct osd_fextent      mapped = { 0 }, extent = { 0 };
        enum osd_quota_local_flags local_flags = 0;
        enum osd_qid_declare_flags declare_flags = OSD_QID_BLK;
+       unsigned int            extent_bytes;
        ENTRY;
 
        LASSERT(handle != NULL);
        oh = container_of(handle, struct osd_thandle, ot_super);
        LASSERT(oh->ot_handle == NULL);
 
        ENTRY;
 
        LASSERT(handle != NULL);
        oh = container_of(handle, struct osd_thandle, ot_super);
        LASSERT(oh->ot_handle == NULL);
 
+       /*
+        * We track a decaying average of extent bytes per filesystem.
+        * Most of the time it will be 1M; as the filesystem becomes
+        * heavily fragmented, it will be reduced to 4K at the worst.
+        */
+       extent_bytes = osd_extent_bytes(osd);
+       LASSERT(extent_bytes >= (1 << osd_sb(osd)->s_blocksize));
+
        /* calculate number of extents (probably better to pass nb) */
        for (i = 0; i < npages; i++) {
                /* ignore quota for the whole request if any page is from
        /* calculate number of extents (probably better to pass nb) */
        for (i = 0; i < npages; i++) {
                /* ignore quota for the whole request if any page is from
@@ -1336,10 +1397,18 @@ static int osd_declare_write_commit(const struct lu_env *env,
                        continue;
                }
 
                        continue;
                }
 
+               if (lnb[i].lnb_flags & OBD_BRW_DONE) {
+                       lnb[i].lnb_flags |= OBD_BRW_MAPPED;
+                       continue;
+               }
+
                /* count only unmapped changes */
                newblocks++;
                if (lnb[i].lnb_file_offset != extent.end || extent.end == 0) {
                /* count only unmapped changes */
                newblocks++;
                if (lnb[i].lnb_file_offset != extent.end || extent.end == 0) {
-                       extents++;
+                       if (extent.end != 0)
+                               extents += (extent.end - extent.start +
+                                       extent_bytes - 1) / extent_bytes;
+                       extent.start = lnb[i].lnb_file_offset;
                        extent.end = lnb[i].lnb_file_offset + lnb[i].lnb_len;
                } else {
                        extent.end += lnb[i].lnb_len;
                        extent.end = lnb[i].lnb_file_offset + lnb[i].lnb_len;
                } else {
                        extent.end += lnb[i].lnb_len;
@@ -1355,6 +1424,9 @@ static int osd_declare_write_commit(const struct lu_env *env,
         */
        if (!newblocks)
                goto out_declare;
         */
        if (!newblocks)
                goto out_declare;
+
+       extents += (extent.end - extent.start +
+                   extent_bytes - 1) / extent_bytes;
        /*
         * each extent can go into new leaf causing a split
         * 5 is max tree depth: inode + 4 index blocks
        /*
         * each extent can go into new leaf causing a split
         * 5 is max tree depth: inode + 4 index blocks
@@ -1375,12 +1447,7 @@ static int osd_declare_write_commit(const struct lu_env *env,
                credits += depth * extents;
        }
 
                credits += depth * extents;
        }
 
-       /*
-        * try a bit more extents to avoid restart
-        * as much as possible in normal case.
-        */
-       if (npages > 1 && extents)
-               extents <<= 1;
+       oh->oh_declared_ext = extents;
 
        /* quota space for metadata blocks */
        quota_space += depth * extents * LDISKFS_BLOCK_SIZE(osd_sb(osd));
 
        /* quota space for metadata blocks */
        quota_space += depth * extents * LDISKFS_BLOCK_SIZE(osd_sb(osd));
@@ -1439,9 +1506,6 @@ static int osd_write_commit(const struct lu_env *env, struct dt_object *dt,
        struct inode *inode = osd_dt_obj(dt)->oo_inode;
        struct osd_device  *osd = osd_obj2dev(osd_dt_obj(dt));
        int rc = 0, i, check_credits = 0;
        struct inode *inode = osd_dt_obj(dt)->oo_inode;
        struct osd_device  *osd = osd_obj2dev(osd_dt_obj(dt));
        int rc = 0, i, check_credits = 0;
-       struct osd_thandle *oh = container_of(thandle,
-                                             struct osd_thandle, ot_super);
-       unsigned int save_credits = oh->ot_credits;
 
        LASSERT(inode);
 
 
        LASSERT(inode);
 
@@ -1469,6 +1533,9 @@ static int osd_write_commit(const struct lu_env *env, struct dt_object *dt,
                        continue;
                }
 
                        continue;
                }
 
+               if (lnb[i].lnb_flags & OBD_BRW_DONE)
+                       continue;
+
                if (!(lnb[i].lnb_flags & OBD_BRW_MAPPED))
                        check_credits = 1;
 
                if (!(lnb[i].lnb_flags & OBD_BRW_MAPPED))
                        check_credits = 1;
 
@@ -1494,29 +1561,19 @@ static int osd_write_commit(const struct lu_env *env, struct dt_object *dt,
        } else if (iobuf->dr_npages > 0) {
                rc = osd_ldiskfs_map_inode_pages(inode, iobuf, osd,
                                                 1, user_size,
        } else if (iobuf->dr_npages > 0) {
                rc = osd_ldiskfs_map_inode_pages(inode, iobuf, osd,
                                                 1, user_size,
-                                                check_credits);
-               /*
-                * Write might restart transaction, extend credits
-                * if needed for operations such as attribute set.
-                */
-               if (rc == 0) {
-                       handle_t *handle = ldiskfs_journal_current_handle();
-
-                       LASSERT(handle != NULL);
-                       rc = osd_extend_restart_trans(handle, save_credits,
-                                                     inode);
-               }
+                                                check_credits,
+                                                thandle);
        } else {
                /* no pages to write, no transno is needed */
                thandle->th_local = 1;
        }
 
        } else {
                /* no pages to write, no transno is needed */
                thandle->th_local = 1;
        }
 
-       if (rc != 0)
+       if (rc != 0 && !thandle->th_restart_tran)
                osd_fini_iobuf(osd, iobuf);
 
        osd_trans_exec_check(env, thandle, OSD_OT_WRITE);
 
                osd_fini_iobuf(osd, iobuf);
 
        osd_trans_exec_check(env, thandle, OSD_OT_WRITE);
 
-       if (unlikely(rc != 0)) {
+       if (unlikely(rc != 0 && !thandle->th_restart_tran)) {
                /* if write fails, we should drop pages from the cache */
                for (i = 0; i < npages; i++) {
                        if (lnb[i].lnb_page == NULL)
                /* if write fails, we should drop pages from the cache */
                for (i = 0; i < npages; i++) {
                        if (lnb[i].lnb_page == NULL)
@@ -1601,7 +1658,7 @@ static int osd_read_prep(const struct lu_env *env, struct dt_object *dt,
 
        if (iobuf->dr_npages) {
                rc = osd_ldiskfs_map_inode_pages(inode, iobuf, osd, 0,
 
        if (iobuf->dr_npages) {
                rc = osd_ldiskfs_map_inode_pages(inode, iobuf, osd, 0,
-                                                0, 0);
+                                                0, 0, NULL);
                if (!rc)
                        rc = osd_do_bio(osd, inode, iobuf, 0, 0);
 
                if (!rc)
                        rc = osd_do_bio(osd, inode, iobuf, 0, 0);
 
index e8c0832..b8d75c6 100755 (executable)
@@ -24038,6 +24038,25 @@ test_430c() {
 }
 run_test 430c "lseek: external tools check"
 
 }
 run_test 430c "lseek: external tools check"
 
+test_431() { # LU-14187
+       local file=$DIR/$tdir/$tfile
+       # write a sparse file, copy it under fail_loc 0x251, compare contents
+       mkdir -p $DIR/$tdir
+       $LFS setstripe -c 1 -i 0 $file || error "lfs setstripe failed"
+       dd if=/dev/urandom of=$file bs=4k count=1
+       dd if=/dev/urandom of=$file bs=4k count=1 seek=10 conv=notrunc
+       dd if=/dev/urandom of=$file bs=4k count=1 seek=12 conv=notrunc
+       #define OBD_FAIL_OST_RESTART_IO 0x251
+       do_facet ost1 "$LCTL set_param fail_loc=0x251"
+       $LFS setstripe -c 1 -i 0 $file.0 || error "lfs setstripe failed"
+       cp $file $file.0
+       cancel_lru_locks
+       sync_all_data
+       echo 3 > /proc/sys/vm/drop_caches
+       diff  $file $file.0 || error "data diff"
+}
+run_test 431 "Restart transaction for IO"
+
 prep_801() {
        [[ $MDS1_VERSION -lt $(version_code 2.9.55) ]] ||
        [[ $OST1_VERSION -lt $(version_code 2.9.55) ]] &&
 prep_801() {
        [[ $MDS1_VERSION -lt $(version_code 2.9.55) ]] ||
        [[ $OST1_VERSION -lt $(version_code 2.9.55) ]] &&