]) # LC_HAVE_FOLIO_BATCH_REINIT
#
+# LC_HAVE_INODE_ATTACH_WB_FOLIO
+#
+# linux kernel v6.2-rc4 commit: 9cfb816b1c6c99f4b3c1d4a0fb096162cd17ec71
+# mm/fs: convert inode_attach_wb() to take a folio
+#
+AC_DEFUN([LC_SRC_HAVE_INODE_ATTACH_WB_FOLIO], [
+ LB2_LINUX_TEST_SRC([inode_attach_wb_folio_arg], [
+ #include <linux/writeback.h>
+ ],[
+ struct folio *folio = NULL;
+
+ inode_attach_wb(NULL, folio);
+ ],[-Werror])
+])
+AC_DEFUN([LC_HAVE_INODE_ATTACH_WB_FOLIO], [
+ LB2_MSG_LINUX_TEST_RESULT([if 'inode_attach_wb()' takes a folio],
+ [inode_attach_wb_folio_arg], [
+ AC_DEFINE(HAVE_INODE_ATTACH_WB_FOLIO, 1,
+ ['inode_attach_wb()' takes a folio])
+ ])
+]) # LC_HAVE_INODE_ATTACH_WB_FOLIO
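The guard defined above is consumed from C as a plain preprocessor switch. A minimal sketch of the pattern, for orientation only (the wrapper name ll_inode_attach_wb() is hypothetical; the patch itself calls inode_attach_wb() directly under the same #ifdef in the ll_write_end() hunk further below):

	#include <linux/writeback.h>
	#include <linux/pagemap.h>

	/* Attach @inode to a cgroup writeback domain using whichever
	 * inode_attach_wb() signature the running kernel provides.
	 */
	static inline void ll_inode_attach_wb(struct inode *inode,
					      struct page *vmpage)
	{
	#ifdef HAVE_INODE_ATTACH_WB_FOLIO
		/* newer kernels: inode_attach_wb() takes a folio */
		inode_attach_wb(inode, page_folio(vmpage));
	#else
		/* older kernels: inode_attach_wb() takes a page */
		inode_attach_wb(inode, vmpage);
	#endif
	}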
+
+#
# LC_HAVE_IOV_ITER_IOVEC
#
# linux kernel v6.3-rc4-32-g6eb203e1a868
LC_SRC_HAVE_LOCKS_LOCK_FILE_WAIT_IN_FILELOCK
LC_SRC_HAVE_U64_CAPABILITY
LC_SRC_HAVE_FOLIO_BATCH_REINIT
+ LC_SRC_HAVE_INODE_ATTACH_WB_FOLIO
# 6.4
LC_SRC_HAVE_IOV_ITER_IOVEC
LC_HAVE_LOCKS_LOCK_FILE_WAIT_IN_FILELOCK
LC_HAVE_U64_CAPABILITY
LC_HAVE_FOLIO_BATCH_REINIT
+ LC_HAVE_INODE_ATTACH_WB_FOLIO
# 6.4
LC_HAVE_IOV_ITER_IOVEC
ra->cra_release(env, ra);
}
+enum cl_io_priority {
+ /* Normal I/O, usually just queue the pages in the client side cache. */
+ IO_PRIO_NORMAL = 0,
+ /* I/O is urgent and should flush queued pages to OSTs ASAP. */
+ IO_PRIO_URGENT,
+ /* The memcg is under high memory pressure, and the user process doing
+ * the write has exceeded its dirty limit and is being rate limited in
+ * balance_dirty_pages(). Dirty pages for the corresponding @wb need to
+ * be flushed ASAP.
+ */
+ IO_PRIO_DIRTY_EXCEEDED,
+ /*
+ * I/O is urgent and the pages being flushed are marked with the
+ * OBD_BRW_SOFT_SYNC flag, which may trigger a soft sync on the OSTs
+ * and thus free unstable pages much more quickly.
+ */
+ IO_PRIO_SOFT_SYNC,
+ /*
+ * The system or a certain memcg is under high memory pressure. Dirty
+ * pages must be flushed to the OSTs immediately, and the I/O RPC must
+ * wait synchronously for the write transaction to commit on the OSTs
+ * so that unstable pages can be released.
+ */
+ IO_PRIO_HARD_SYNC,
+ IO_PRIO_MAX,
+};
+
+static inline bool cl_io_high_prio(enum cl_io_priority prio)
+{
+ return prio >= IO_PRIO_URGENT;
+}
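For orientation, the priority levels above are ordered by aggressiveness and cl_io_high_prio() simply splits them at IO_PRIO_URGENT: everything from IO_PRIO_URGENT upward is treated as high priority. A purely illustrative sketch of how a caller might map writeback state onto these levels (the helper name and its inputs are hypothetical; the real selection logic is in the ll_writepages() and ll_write_end() hunks below):

	/* Hypothetical helper: pick a flush priority from writeback state. */
	static enum cl_io_priority example_pick_prio(bool dirty_exceeded,
						     bool hard_mem_pressure)
	{
		if (hard_mem_pressure)
			return IO_PRIO_HARD_SYNC;	/* wait for OST commit */
		if (dirty_exceeded)
			return IO_PRIO_DIRTY_EXCEEDED;	/* rate-limited writer */
		return IO_PRIO_NORMAL;			/* just queue in cache */
	}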
+
/**
* Per-layer io operations.
* \see vvp_io_ops, lov_io_ops, lovsub_io_ops, osc_io_ops
int (*cio_commit_async)(const struct lu_env *env,
const struct cl_io_slice *slice,
struct cl_page_list *queue, int from, int to,
- cl_commit_cbt cb);
+ cl_commit_cbt cb, enum cl_io_priority prio);
/**
* Release active extent.
*/
void (*cio_extent_release)(const struct lu_env *env,
- const struct cl_io_slice *slice);
+ const struct cl_io_slice *slice,
+ enum cl_io_priority prio);
/**
* Decide maximum read ahead extent
*
struct cl_page *ft_page;
} ci_fault;
struct cl_fsync_io {
- loff_t fi_start;
- loff_t fi_end;
+ loff_t fi_start;
+ loff_t fi_end;
/** file system level fid */
- struct lu_fid *fi_fid;
- enum cl_fsync_mode fi_mode;
+ struct lu_fid *fi_fid;
+ enum cl_fsync_mode fi_mode;
/* how many pages were written/discarded */
- unsigned int fi_nr_written;
+ unsigned int fi_nr_written;
+ enum cl_io_priority fi_prio;
} ci_fsync;
struct cl_ladvise_io {
__u64 lio_start;
long timeout);
int cl_io_commit_async(const struct lu_env *env, struct cl_io *io,
struct cl_page_list *queue, int from, int to,
- cl_commit_cbt cb);
-void cl_io_extent_release(const struct lu_env *env, struct cl_io *io);
+ cl_commit_cbt cb, enum cl_io_priority prio);
+void cl_io_extent_release(const struct lu_env *env, struct cl_io *io,
+ enum cl_io_priority prio);
int cl_io_lru_reserve(const struct lu_env *env, struct cl_io *io,
loff_t pos, size_t bytes);
int cl_io_read_ahead(const struct lu_env *env, struct cl_io *io,
__u64 size, struct osc_extent **extp);
void osc_cache_truncate_end(const struct lu_env *env, struct osc_extent *ext);
int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj,
- pgoff_t start, pgoff_t end, int hp, int discard);
+ pgoff_t start, pgoff_t end, int hp, int discard,
+ enum cl_io_priority prio);
int osc_cache_wait_range(const struct lu_env *env, struct osc_object *obj,
pgoff_t start, pgoff_t end);
int osc_io_unplug0(const struct lu_env *env, struct client_obd *cli,
int osc_io_commit_async(const struct lu_env *env,
const struct cl_io_slice *ios,
struct cl_page_list *qin, int from, int to,
- cl_commit_cbt cb);
+ cl_commit_cbt cb, enum cl_io_priority prio);
void osc_io_extent_release(const struct lu_env *env,
- const struct cl_io_slice *ios);
+ const struct cl_io_slice *ios,
+ enum cl_io_priority prio);
int osc_io_iter_init(const struct lu_env *env, const struct cl_io_slice *ios);
void osc_io_iter_fini(const struct lu_env *env,
const struct cl_io_slice *ios);
return &osc_export(obj)->exp_obd->u.cli;
}
+static inline char *cli_name(struct client_obd *cli)
+{
+ return cli->cl_import->imp_obd->obd_name;
+}
+
static inline struct osc_object *cl2osc(const struct cl_object *obj)
{
return container_of_safe(obj, struct osc_object, oo_cl);
/* flush local cache first if any */
cl_sync_file_range(inode, offset, OBD_OBJECT_EOF,
- CL_FSYNC_LOCAL, 0);
+ CL_FSYNC_LOCAL, 0, IO_PRIO_NORMAL);
retval = ll_lseek(file, offset, origin);
if (retval < 0)
* Return how many pages have been written.
*/
int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
- enum cl_fsync_mode mode, int ignore_layout)
+ enum cl_fsync_mode mode, int ignore_layout,
+ enum cl_io_priority prio)
{
struct lu_env *env;
struct cl_io *io;
fio->fi_fid = ll_inode2fid(inode);
fio->fi_mode = mode;
fio->fi_nr_written = 0;
+ fio->fi_prio = prio;
if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
result = cl_io_loop(env, io);
err = pcc_fsync(file, start, end, datasync, &cached);
if (!cached)
err = cl_sync_file_range(inode, start, end,
- CL_FSYNC_ALL, 0);
+ CL_FSYNC_ALL, 0,
+ IO_PRIO_NORMAL);
if (rc == 0 && err < 0)
rc = err;
if (rc < 0)
int ll_io_read_page(const struct lu_env *env, struct cl_io *io,
struct cl_page *page, struct file *file);
void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras);
-int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io);
+int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io,
+ enum cl_io_priority prio);
enum lcc_type;
void ll_cl_add(struct inode *inode, const struct lu_env *env, struct cl_io *io,
}
int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
- enum cl_fsync_mode mode, int ignore_layout);
+ enum cl_fsync_mode mode, int ignore_layout,
+ enum cl_io_priority prio);
static inline int ll_file_nolock(const struct file *file)
{
* unlink, so that file is not opened somewhere else
*/
cl_sync_file_range(inode, 0, OBD_OBJECT_EOF, inode->i_nlink ?
- CL_FSYNC_LOCAL : CL_FSYNC_DISCARD, 1);
+ CL_FSYNC_LOCAL : CL_FSYNC_DISCARD, 1,
+ IO_PRIO_NORMAL);
}
ll_truncate_inode_pages_final(inode);
*/
result = cl_sync_file_range(inode, offset,
offset + PAGE_SIZE - 1,
- CL_FSYNC_LOCAL, 1);
+ CL_FSYNC_LOCAL, 1, IO_PRIO_NORMAL);
if (result > 0) {
/* May have written more than one page. decreasing this
* page because the caller will count it.
int ll_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
struct inode *inode = mapping->host;
+ enum cl_io_priority prio = IO_PRIO_NORMAL;
loff_t start;
loff_t end;
enum cl_fsync_mode mode;
wb = inode_to_wb(inode);
if (wbc->for_background ||
(wb->start_all_reason == WB_REASON_VMSCAN &&
- test_bit(WB_start_all, &wb->state)))
+ test_bit(WB_start_all, &wb->state))) {
mode = CL_FSYNC_RECLAIM;
+ if (wb->dirty_exceeded)
+ prio = IO_PRIO_DIRTY_EXCEEDED;
+ }
spin_unlock(&inode->i_lock);
#else
/*
* inside the IO context of write, which will cause deadlock at
* layout_conf since it waits for active IOs to complete.
*/
- result = cl_sync_file_range(inode, start, end, mode, 1);
+ result = cl_sync_file_range(inode, start, end, mode, 1, prio);
if (result > 0) {
wbc->nr_to_write -= result;
result = 0;
}
/* commit pages and then wait for page lock */
- result = vvp_io_write_commit(env, io);
+ result = vvp_io_write_commit(env, io, IO_PRIO_NORMAL);
if (result < 0)
GOTO(out, result);
struct cl_page *page;
struct page *vmpage = wbe_folio_page(vmfolio);
unsigned from = pos & (PAGE_SIZE - 1);
+ enum cl_io_priority prio = IO_PRIO_NORMAL;
bool unplug = false;
int result = 0;
ENTRY;
LASSERT(cl_page_is_owned(page, io));
if (copied > 0) {
struct cl_page_list *plist = &vio->u.readwrite.vui_queue;
+#ifdef SB_I_CGROUPWB
+ struct inode *inode = file_inode(file);
+ struct bdi_writeback *wb;
+
+ spin_lock(&inode->i_lock);
+#ifdef HAVE_INODE_ATTACH_WB_FOLIO
+ inode_attach_wb(inode, page_folio(vmpage));
+#else
+ inode_attach_wb(inode, vmpage);
+#endif
+ wb = inode_to_wb(inode);
+ LASSERTF(wb != NULL, "wb@%pK\n", wb);
+ if (wb->dirty_exceeded) {
+ unplug = true;
+ prio = IO_PRIO_URGENT;
+ CDEBUG(D_IOTRACE, "wb@%pK dirty_ratelimit=%lu balanced_dirty_ratelimit=%lu dirty_exceeded=%d state=%lX last_old_flush=%lu\n",
+ wb, wb->dirty_ratelimit,
+ wb->balanced_dirty_ratelimit,
+ wb->dirty_exceeded, wb->state,
+ wb->last_old_flush);
+ }
+ spin_unlock(&inode->i_lock);
+#endif
lcc->lcc_page = NULL; /* page will be queued */
io->u.ci_rw.crw_pos + io->u.ci_rw.crw_bytes)
unplug = true;
if (unplug)
- result = vvp_io_write_commit(env, io);
+ result = vvp_io_write_commit(env, io, prio);
if (result < 0)
io->ci_result = result;
int vvp_io_init(const struct lu_env *env, struct cl_object *obj,
struct cl_io *io);
-int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io);
+int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io,
+ enum cl_io_priority prio);
int vvp_page_init(const struct lu_env *env, struct cl_object *obj,
struct cl_page *page, pgoff_t index);
struct lu_object *vvp_object_alloc(const struct lu_env *env,
}
/* Return how many bytes have queued or written */
-int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io)
+int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io,
+ enum cl_io_priority prio)
{
struct cl_object *obj = io->ci_obj;
struct inode *inode = vvp_object_inode(obj);
if (npages == 0)
RETURN(0);
- CDEBUG(D_VFSTRACE, "commit async pages: %d, from %d, to %d\n",
- npages, vio->u.readwrite.vui_from, vio->u.readwrite.vui_to);
+ CDEBUG(D_VFSTRACE, "commit async pages: %d, from %d, to %d, prio %d\n",
+ npages, vio->u.readwrite.vui_from, vio->u.readwrite.vui_to,
+ prio);
LASSERT(page_list_sanity_check(obj, queue));
rc = cl_io_commit_async(env, io, queue,
vio->u.readwrite.vui_from,
vio->u.readwrite.vui_to,
- write_commit_callback);
+ write_commit_callback, prio);
npages -= queue->pl_nr; /* already committed pages */
if (npages > 0) {
/* calculate how many bytes were written */
LASSERT(ergo(rc == 0, queue->pl_nr == 0));
/* out of quota, try sync write */
- if (rc == -EDQUOT && !cl_io_is_mkwrite(io)) {
+ if ((rc == -EDQUOT && !cl_io_is_mkwrite(io)) || prio > IO_PRIO_NORMAL) {
struct ll_inode_info *lli = ll_i2info(inode);
rc = vvp_io_commit_sync(env, io, queue,
}
if (result > 0) {
- result = vvp_io_write_commit(env, io);
+ result = vvp_io_write_commit(env, io, IO_PRIO_NORMAL);
/* Simulate short commit */
if (CFS_FAULT_CHECK(OBD_FAIL_LLITE_SHORT_COMMIT)) {
vio->u.readwrite.vui_written >>= 1;
* still have chance to detect it.
*/
result = cl_io_commit_async(env, io, plist, 0, to,
- mkwrite_commit_callback);
+ mkwrite_commit_callback,
+ IO_PRIO_NORMAL);
/* Have overquota flag, trying sync write to check
* whether indeed out of quota
*/
cl_page_list_add(plist, page, true);
result = cl_io_commit_async(env, io,
plist, 0, to,
- mkwrite_commit_callback);
+ mkwrite_commit_callback,
+ IO_PRIO_NORMAL);
io->ci_noquota = 0;
} else {
cl_page_put(env, page);
int rc;
ENTRY;
- rc = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF, CL_FSYNC_LOCAL, 1);
+ rc = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF, CL_FSYNC_LOCAL, 1,
+ IO_PRIO_NORMAL);
if (rc < 0) {
CDEBUG(D_VFSTRACE, DFID ": writeback failed: %d\n",
PFID(lu_object_fid(&obj->co_lu)), rc);
io->u.ci_fsync.fi_end = end;
io->u.ci_fsync.fi_fid = parent->u.ci_fsync.fi_fid;
io->u.ci_fsync.fi_mode = parent->u.ci_fsync.fi_mode;
+ io->u.ci_fsync.fi_prio = parent->u.ci_fsync.fi_prio;
break;
}
case CIT_READ:
static int lov_io_commit_async(const struct lu_env *env,
const struct cl_io_slice *ios,
struct cl_page_list *queue, int from, int to,
- cl_commit_cbt cb)
+ cl_commit_cbt cb, enum cl_io_priority prio)
{
struct cl_page_list *plist = &lov_env_info(env)->lti_plist;
struct lov_io *lio = cl2lov_io(env, ios);
+ bool hp = cl_io_high_prio(prio);
struct lov_io_sub *sub;
struct cl_page *page;
int rc = 0;
LASSERT(!IS_ERR(sub));
LASSERT(sub == &lio->lis_single_subio);
rc = cl_io_commit_async(sub->sub_env, &sub->sub_io, queue,
- from, to, cb);
+ from, to, cb, prio);
RETURN(rc);
}
sub = lov_sub_get(env, lio, index);
if (!IS_ERR(sub)) {
rc = cl_io_commit_async(sub->sub_env, &sub->sub_io,
- plist, from, stripe_to, cb);
+ plist, from, stripe_to, cb,
+ prio);
} else {
rc = PTR_ERR(sub);
break;
from = 0;
- if (lov_comp_entry(index) !=
+ if (!hp && lov_comp_entry(index) !=
lov_comp_entry(page->cp_lov_index))
- cl_io_extent_release(sub->sub_env, &sub->sub_io);
+ cl_io_extent_release(sub->sub_env, &sub->sub_io, prio);
+ }
+
+ if (rc == 0 && hp) {
+ list_for_each_entry(sub, &lio->lis_subios, sub_list)
+ cl_io_extent_release(sub->sub_env, &sub->sub_io, prio);
}
/* for error case, add the page back into the qin list */
if (mode == CLM_WRITE) {
result = osc_cache_writeback_range(env, obj, start, end, 1,
- discard);
+ discard, IO_PRIO_NORMAL);
CDEBUG(D_CACHE, "object %p: [%lu -> %lu] %d pages were %s.\n",
obj, start, end, result,
discard ? "discarded" : "written back");
if (fio->fi_mode == CL_FSYNC_RECLAIM) {
struct client_obd *cli = osc_cli(osc);
- if (!atomic_long_read(&cli->cl_unstable_count)) {
- /* Stop flush when there are no unstable pages? */
- CDEBUG(D_CACHE, "unstable count is zero\n");
+ if (!atomic_read(&osc->oo_nr_ios) &&
+ !atomic_read(&osc->oo_nr_writes) &&
+ !atomic_long_read(&cli->cl_unstable_count)) {
+ /*
+ * No active I/O, no dirty pages needing to be written and no
+ * unstable pages needing to commit.
+ */
+ CDEBUG(D_CACHE,
+ "%s: dirty/unstable counts are both zero\n",
+ cli_name(cli));
RETURN(0);
}
}
* possible range despite of supplied start/end values.
*/
result = osc_cache_writeback_range(env, osc, 0, CL_PAGE_EOF, 0,
- fio->fi_mode == CL_FSYNC_DISCARD);
+ fio->fi_mode == CL_FSYNC_DISCARD,
+ fio->fi_prio);
if (result > 0) {
fio->fi_nr_written += result;
result = 0;
* @from: Starting position
* @to: Ending position
* @cb: callback function
+ * @prio: I/O priority
*
* Returns 0 if all pages committed, or errcode if error occurred.
* see cl_io_operations::cio_commit_async()
*/
int cl_io_commit_async(const struct lu_env *env, struct cl_io *io,
struct cl_page_list *queue, int from, int to,
- cl_commit_cbt cb)
+ cl_commit_cbt cb, enum cl_io_priority prio)
{
const struct cl_io_slice *scan;
int result = 0;
if (scan->cis_iop->cio_commit_async == NULL)
continue;
result = scan->cis_iop->cio_commit_async(env, scan, queue,
- from, to, cb);
+ from, to, cb, prio);
if (result != 0)
break;
}
}
EXPORT_SYMBOL(cl_io_commit_async);
-void cl_io_extent_release(const struct lu_env *env, struct cl_io *io)
+void cl_io_extent_release(const struct lu_env *env, struct cl_io *io,
+ enum cl_io_priority prio)
{
const struct cl_io_slice *scan;
ENTRY;
list_for_each_entry(scan, &io->ci_layers, cis_linkage) {
if (scan->cis_iop->cio_extent_release == NULL)
continue;
- scan->cis_iop->cio_extent_release(env, scan);
+ scan->cis_iop->cio_extent_release(env, scan, prio);
}
EXIT;
}
GOTO(out, rc = 65);
fallthrough;
default:
- if (atomic_read(&ext->oe_users) > 0)
- GOTO(out, rc = 70);
+ break;
}
if (ext->oe_max_end < ext->oe_end || ext->oe_end < ext->oe_start)
/**
- * Drop user count of osc_extent, and unplug IO asynchronously.
+ * Drop user count of osc_extent, and unplug I/O (asynchronously, or
+ * synchronously for high-priority I/O).
*/
-void osc_extent_release(const struct lu_env *env, struct osc_extent *ext)
+void osc_extent_release(const struct lu_env *env, struct osc_extent *ext,
+ enum cl_io_priority prio)
{
struct osc_object *obj = ext->oe_obj;
struct client_obd *cli = osc_cli(obj);
+ bool hp = cl_io_high_prio(prio);
+
ENTRY;
LASSERT(atomic_read(&ext->oe_users) > 0);
LASSERT(ext->oe_grants > 0);
if (atomic_dec_and_lock(&ext->oe_users, &obj->oo_lock)) {
- LASSERT(ext->oe_state == OES_ACTIVE);
if (ext->oe_trunc_pending) {
- /* a truncate process is waiting for this extent.
+ /*
+ * A truncate process is waiting for this extent.
* This may happen due to a race, check
- * osc_cache_truncate_start(). */
+ * osc_cache_truncate_start().
+ */
+ if (ext->oe_state != OES_ACTIVE) {
+ int rc;
+
+ osc_object_unlock(obj);
+ rc = osc_extent_wait(env, ext, OES_INV);
+ if (rc < 0)
+ OSC_EXTENT_DUMP(D_ERROR, ext,
+ "error: %d.\n", rc);
+ osc_object_lock(obj);
+ }
osc_extent_state_set(ext, OES_TRUNC);
ext->oe_trunc_pending = 0;
osc_object_unlock(obj);
- } else {
+ } else if (ext->oe_state == OES_ACTIVE) {
int grant = 0;
osc_extent_state_set(ext, OES_CACHE);
if (osc_extent_merge(env, ext, next_extent(ext)) == 0)
grant += cli->cl_grant_extent_tax;
- if (!ext->oe_rw && ext->oe_dlmlock) {
- bool hp;
-
+ if (!hp && !ext->oe_rw && ext->oe_dlmlock) {
lock_res_and_lock(ext->oe_dlmlock);
hp = ldlm_is_cbpending(ext->oe_dlmlock);
unlock_res_and_lock(ext->oe_dlmlock);
-
- /* HP extent should be written ASAP. */
- if (hp)
- ext->oe_hp = 1;
}
+
+ /* HP extent should be written ASAP. */
+ if (hp)
+ ext->oe_hp = 1;
+
if (ext->oe_hp)
list_move_tail(&ext->oe_link,
&obj->oo_hp_exts);
osc_object_unlock(obj);
if (grant > 0)
osc_unreserve_grant(cli, 0, grant);
+ } else {
+ osc_object_unlock(obj);
}
- osc_io_unplug_async(env, cli, obj);
+ if (unlikely(hp))
+ osc_io_unplug(env, cli, obj);
+ else
+ osc_io_unplug_async(env, cli, obj);
}
osc_extent_put(env, ext);
}
osc_object_unlock(obj);
if (rc == 1)
- osc_extent_release(env, ext);
+ osc_extent_release(env, ext, IO_PRIO_NORMAL);
/* wait for the extent until its state becomes @state */
rc = wait_event_idle_timeout(ext->oe_waitq,
LASSERT(ext->oe_max_end >= index && ext->oe_start <= index);
osc_object_lock(obj);
+ if (ext->oe_state != OES_ACTIVE)
+ GOTO(out, rc = -ESTALE);
+
LASSERT(sanity_check_nolock(ext) == 0);
end_chunk = ext->oe_end >> ppc_bits;
if (chunk > end_chunk + 1)
* 2. otherwise, a new extent will be allocated. */
ext = oio->oi_active;
- if (ext != NULL && ext->oe_start <= index && ext->oe_max_end >= index) {
+ if (ext != NULL && ext->oe_state != OES_ACTIVE) {
+ need_release = 1;
+ } else if (ext != NULL && ext->oe_start <= index &&
+ ext->oe_max_end >= index) {
/* one chunk plus extent overhead must be enough to write this
* page */
grants = (1 << cli->cl_chunkbits) + cli->cl_grant_extent_tax;
need_release = 1;
}
if (need_release) {
- osc_extent_release(env, ext);
+ osc_extent_release(env, ext, IO_PRIO_NORMAL);
oio->oi_active = NULL;
ext = NULL;
}
grants = tmp;
}
+restart_find:
tmp = grants;
if (rc == 0) {
ext = osc_extent_find(env, osc, index, &tmp);
LASSERT((oap->oap_brw_flags & OBD_BRW_FROM_GRANT) != 0);
osc_object_lock(osc);
+ if (ext->oe_state != OES_ACTIVE) {
+ if (ext->oe_state == OES_CACHE) {
+ osc_extent_state_set(ext, OES_ACTIVE);
+ osc_update_pending(osc, OBD_BRW_WRITE,
+ -ext->oe_nr_pages);
+ list_del_init(&ext->oe_link);
+ } else {
+ osc_object_unlock(osc);
+ osc_extent_get(ext);
+ osc_extent_release(env, ext, IO_PRIO_NORMAL);
+ oio->oi_active = NULL;
+
+ /* Waiting for IO finished. */
+ rc = osc_extent_wait(env, ext, OES_INV);
+ osc_extent_put(env, ext);
+ if (rc < 0)
+ RETURN(rc);
+
+ GOTO(restart_find, rc);
+ }
+ }
+
if (ext->oe_nr_pages == 0)
ext->oe_srvlock = ops->ops_srvlock;
else
* Return how many pages will be issued, or error code if error occurred.
*/
int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj,
- pgoff_t start, pgoff_t end, int hp, int discard)
+ pgoff_t start, pgoff_t end, int hp, int discard,
+ enum cl_io_priority prio)
{
struct osc_extent *ext;
LIST_HEAD(discard_list);
+ bool active_ext_check = false;
bool unplug = false;
int result = 0;
+
ENTRY;
+repeat:
osc_object_lock(obj);
ext = osc_extent_search(obj, start);
if (ext == NULL)
* grants. We do this for the correctness of fsync. */
LASSERT(hp == 0 && discard == 0);
ext->oe_urgent = 1;
+
+ if (active_ext_check) {
+ osc_extent_state_set(ext, OES_CACHE);
+ list_move_tail(&ext->oe_link,
+ &obj->oo_urgent_exts);
+ osc_update_pending(obj, OBD_BRW_WRITE,
+ ext->oe_nr_pages);
+ unplug = true;
+ }
+
break;
case OES_TRUNC:
/* this extent is being truncated, can't do anything
result = rc;
}
- OSC_IO_DEBUG(obj, "pageout [%lu, %lu], %d.\n", start, end, result);
+ OSC_IO_DEBUG(obj, "pageout [%lu, %lu] npages %lu: rc=%d.\n",
+ start, end, obj->oo_npages, result);
+
+ /*
+ * Try to flush the active I/O extents of the object.
+ * Otherwise, the user process writing the file may have exceeded its
+ * dirty limit and be stuck waiting endlessly in balance_dirty_pages().
+ */
+ if (result == 0 && prio == IO_PRIO_DIRTY_EXCEEDED &&
+ !active_ext_check && atomic_read(&obj->oo_nr_ios) &&
+ obj->oo_npages > 0) {
+ osc_extent_tree_dump(D_CACHE, obj);
+ active_ext_check = true;
+ GOTO(repeat, result);
+ }
+
RETURN(result);
}
EXPORT_SYMBOL(osc_cache_writeback_range);
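To recap the retry introduced above: when the caller requests IO_PRIO_DIRTY_EXCEEDED writeback and the first pass queued nothing while the object still has active I/O and cached pages, the range is scanned once more with active_ext_check set so that OES_ACTIVE extents are moved to the urgent list and flushed. A condensed, purely illustrative view of that control flow (struct obj_sketch, scan_and_queue() and has_active_io() are hypothetical stand-ins for the real osc_object machinery):

	struct obj_sketch;	/* stand-in for struct osc_object */

	static int scan_and_queue(struct obj_sketch *obj, bool force_active);
	static bool has_active_io(struct obj_sketch *obj);

	static int writeback_range_sketch(struct obj_sketch *obj,
					  bool dirty_exceeded)
	{
		bool active_ext_check = false;
		int queued;

	repeat:
		/* The first pass behaves as before; a second pass additionally
		 * pushes OES_ACTIVE extents onto the urgent list.
		 */
		queued = scan_and_queue(obj, active_ext_check);

		if (queued == 0 && dirty_exceeded && !active_ext_check &&
		    has_active_io(obj)) {
			active_ext_check = true;
			goto repeat;	/* at most one extra pass */
		}

		return queued;
	}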
int lru_queue_work(const struct lu_env *env, void *data);
int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext,
int sent, int rc);
-void osc_extent_release(const struct lu_env *env, struct osc_extent *ext);
+void osc_extent_release(const struct lu_env *env, struct osc_extent *ext,
+ enum cl_io_priority prio);
int osc_lock_discard_pages(const struct lu_env *env, struct osc_object *osc,
pgoff_t start, pgoff_t end, bool discard);
int osc_ldlm_hp_handle(const struct lu_env *env, struct osc_object *obj,
return cli->cl_r_in_flight + cli->cl_w_in_flight;
}
-static inline char *cli_name(struct client_obd *cli)
-{
- return cli->cl_import->imp_obd->obd_name;
-}
-
static inline char list_empty_marker(struct list_head *list)
{
return list_empty(list) ? '-' : '+';
int osc_io_commit_async(const struct lu_env *env,
const struct cl_io_slice *ios,
struct cl_page_list *qin, int from, int to,
- cl_commit_cbt cb)
+ cl_commit_cbt cb, enum cl_io_priority prio)
{
struct cl_io *io = ios->cis_io;
struct osc_io *oio = cl2osc_io(env, ios);
/* for sync write, kernel will wait for this page to be flushed before
* osc_io_end() is called, so release it earlier.
* for mkwrite(), it's known there is no further pages. */
- if (cl_io_is_sync_write(io) && oio->oi_active != NULL) {
- osc_extent_release(env, oio->oi_active);
+ if (cl_io_is_sync_write(io) && oio->oi_active) {
+ osc_extent_release(env, oio->oi_active, prio);
oio->oi_active = NULL;
}
EXPORT_SYMBOL(osc_io_commit_async);
void osc_io_extent_release(const struct lu_env *env,
- const struct cl_io_slice *ios)
+ const struct cl_io_slice *ios,
+ enum cl_io_priority prio)
{
struct osc_io *oio = cl2osc_io(env, ios);
if (oio->oi_active != NULL) {
- osc_extent_release(env, oio->oi_active);
+ osc_extent_release(env, oio->oi_active, prio);
oio->oi_active = NULL;
}
}
int rc;
ENTRY;
- rc = osc_cache_writeback_range(env, osc, pg_start, pg_end, 1, 0);
+ rc = osc_cache_writeback_range(env, osc, pg_start, pg_end, 1, 0,
+ IO_PRIO_NORMAL);
if (rc < 0)
RETURN(rc);
if (fio->fi_mode == CL_FSYNC_RECLAIM) {
struct client_obd *cli = osc_cli(osc);
- if (!atomic_long_read(&cli->cl_unstable_count)) {
- /* Stop flush when there are no unstable pages? */
- CDEBUG(D_CACHE, "unstable count is zero\n");
+ if (!atomic_read(&osc->oo_nr_ios) &&
+ !atomic_read(&osc->oo_nr_writes) &&
+ !atomic_long_read(&cli->cl_unstable_count)) {
+ /*
+ * No active I/O, no dirty pages needing to be written and
+ * no unstable pages needing to commit.
+ */
+ CDEBUG(D_CACHE,
+ "%s: unstable/dirty counts are both zero\n",
+ cli_name(cli));
RETURN(0);
}
}
end = CL_PAGE_EOF;
result = osc_cache_writeback_range(env, osc, start, end, 0,
- fio->fi_mode == CL_FSYNC_DISCARD);
+ fio->fi_mode == CL_FSYNC_DISCARD,
+ fio->fi_prio);
if (result < 0 && fio->fi_mode == CL_FSYNC_DISCARD) {
CDEBUG(D_CACHE,
"%s: ignore error %d on discarding "DFID":[%lu-%lu]\n",
struct osc_io *oio = cl2osc_io(env, slice);
if (oio->oi_active) {
- osc_extent_release(env, oio->oi_active);
+ osc_extent_release(env, oio->oi_active, IO_PRIO_NORMAL);
oio->oi_active = NULL;
}
}
if (mode == CLM_WRITE) {
rc = osc_cache_writeback_range(env, obj, start, end, 1,
- discard);
+ discard, IO_PRIO_NORMAL);
CDEBUG(D_CACHE, "object %p: [%lu -> %lu] %d pages were %s.\n",
obj, start, end, rc,
discard ? "discarded" : "written back");