From: Qian Yingjin
Date: Sat, 24 May 2025 08:30:48 +0000 (+0800)
Subject: LU-19014 memcg: fix client hang in balance_dirty_pages()
X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=c413d2ede5dfe71a89878aed305a8d61f03a30a3;p=fs%2Flustre-release.git

LU-19014 memcg: fix client hang in balance_dirty_pages()

When two (or more) nodes append-write a shared file in Lustre with
memcg enabled, the client randomly hangs in balance_dirty_pages() with
the following call trace:

[<0>] balance_dirty_pages+0x2ee/0xd10
[<0>] balance_dirty_pages_ratelimited_flags+0x27a/0x380
[<0>] generic_perform_write+0x150/0x210
[<0>] vvp_io_write_start+0x516/0xc00 [lustre]
[<0>] cl_io_start+0x5a/0x110 [obdclass]
[<0>] cl_io_loop+0x97/0x1f0 [obdclass]
[<0>] ll_file_io_generic+0x4d2/0xe50 [lustre]
[<0>] do_file_write_iter+0x3e9/0x5d0 [lustre]
[<0>] vfs_write+0x2cb/0x410
[<0>] ksys_write+0x5f/0xe0
[<0>] do_syscall_64+0x5c/0xf0

After analyzing a core dump of the hung system, we found that the
bdi_writeback data structure (wb) corresponding to the memcg had
pending dirty pages (in state WB_registered | WB_has_dirty_io) but
could not write them out, so the writer looped forever in
balance_dirty_pages().

This is a bug in the Lustre memcg code. The OSC/MDC layer stops
flushing dirty pages as soon as it finds that there are no unstable
pages, even though dirty pages may still be queued in the cache. In
that case the client should keep writing back the dirty pages so that
the wb stat accounting is updated and the write process can continue
instead of looping endlessly.

Moreover, there are problems in the current Lustre CLIO engine. When
the system or a certain memcg is under memory pressure, the client
just queues the dirty page in the page cache or in the current active
extent (an OES_ACTIVE osc_extent) when vvp_io_write_commit()/
cl_io_commit_async() is called from ->write_end(). The queued pages
cannot be written back even while the kernel is trying to flush dirty
pages via ->ll_writepages(). The client loops in the following call
sequence:

loop:
	->write_begin()
	->write_end()
	->balance_dirty_pages()
		-> Launch file writeback in the background, which cannot
		   flush any dirty pages.
		-> Pause the current process for a while (e.g. 200ms)
		   because the corresponding @wb is dirty exceeded.
	-> GOTO loop

Write progress is therefore very slow: the process alternately writes
a single page and then sleeps for the pause period.

We fix this hang in ->ll_write_end(). When the client detects that the
corresponding @wb is dirty exceeded, it submits the dirty pages into
the OSC writeback cache. The state of the current extent changes from
OES_ACTIVE to OES_CACHE, and extents in this state can be written
back. Moreover, we mark the current extent as urgent so that it is
flushed much more quickly.
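The shape of the fix in ->write_end() is sketched below. This is a
condensed illustration of the lustre/llite/rw26.c hunk in the diff that
follows, with the config guards (SB_I_CGROUPWB,
HAVE_INODE_ATTACH_WB_FOLIO) and debug logging stripped; the helper name
ll_write_end_escalate() is invented for exposition and does not exist
in the tree:

  /* Sketch only: how ll_write_end() decides to escalate the commit.
   * Condensed from the rw26.c hunk below; the helper is hypothetical.
   */
  static void ll_write_end_escalate(struct inode *inode,
				    struct page *vmpage, bool *unplug,
				    enum cl_io_priority *prio)
  {
	struct bdi_writeback *wb;

	spin_lock(&inode->i_lock);
	/* Bind the inode to the memcg writeback context of the writer. */
	inode_attach_wb(inode, page_folio(vmpage));
	wb = inode_to_wb(inode);
	if (wb->dirty_exceeded) {
		/* The writer is being throttled in balance_dirty_pages():
		 * commit the queued pages now and flush the active extent
		 * urgently so the wb dirty stats drop and the writer can
		 * make progress.
		 */
		*unplug = true;
		*prio = IO_PRIO_URGENT;
	}
	spin_unlock(&inode->i_lock);
  }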
Fixes: 8aa231a99 ("LU-16713 llite: writeback/commit pages under memory pressure")
Signed-off-by: Yingjin Qian
Change-Id: Iecee60484f1b65fad6f4c9eac7bd4d2c53f38b8d
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/59223
Tested-by: jenkins
Tested-by: Maloo
Reviewed-by: Zhenyu Xu
Reviewed-by: Patrick Farrell
Reviewed-by: Oleg Drokin
---

diff --git a/config/lustre-core.m4 b/config/lustre-core.m4
index 2cdb1bc..d22eab8 100644
--- a/config/lustre-core.m4
+++ b/config/lustre-core.m4
@@ -4406,6 +4406,29 @@ AC_DEFUN([LC_HAVE_FOLIO_BATCH_REINIT], [
 ]) # LC_HAVE_FOLIO_BATCH_REINIT
 
 #
+# LC_HAVE_INODE_ATTACH_WB_FOLIO
+#
+# linux kernel v6.2-rc4 commit: 9cfb816b1c6c99f4b3c1d4a0fb096162cd17ec71
+# mm/fs: convert inode_attach_wb() to take a folio
+#
+AC_DEFUN([LC_SRC_HAVE_INODE_ATTACH_WB_FOLIO], [
+	LB2_LINUX_TEST_SRC([inode_attach_wb_folio_arg], [
+		#include <linux/backing-dev.h>
+	],[
+		struct folio *folio = NULL;
+
+		inode_attach_wb(NULL, folio);
+	],[-Werror])
+])
+AC_DEFUN([LC_HAVE_INODE_ATTACH_WB_FOLIO], [
+	LB2_MSG_LINUX_TEST_RESULT([if 'inode_attach_wb()' takes folio],
+	[inode_attach_wb_folio_arg], [
+		AC_DEFINE(HAVE_INODE_ATTACH_WB_FOLIO, 1,
+			['inode_attach_wb()' takes folio])
+	])
+]) # LC_HAVE_INODE_ATTACH_WB_FOLIO
+
+#
 # LC_HAVE_IOV_ITER_IOVEC
 #
 # linux kernel v6.3-rc4-32-g6eb203e1a868
@@ -5399,6 +5422,7 @@ AC_DEFUN([LC_PROG_LINUX_SRC], [
 	LC_SRC_HAVE_LOCKS_LOCK_FILE_WAIT_IN_FILELOCK
 	LC_SRC_HAVE_U64_CAPABILITY
 	LC_SRC_HAVE_FOLIO_BATCH_REINIT
+	LC_SRC_HAVE_INODE_ATTACH_WB_FOLIO
 
 	# 6.4
 	LC_SRC_HAVE_IOV_ITER_IOVEC
@@ -5737,6 +5761,7 @@ AC_DEFUN([LC_PROG_LINUX_RESULTS], [
 	LC_HAVE_LOCKS_LOCK_FILE_WAIT_IN_FILELOCK
 	LC_HAVE_U64_CAPABILITY
 	LC_HAVE_FOLIO_BATCH_REINIT
+	LC_HAVE_INODE_ATTACH_WB_FOLIO
 
 	# 6.4
 	LC_HAVE_IOV_ITER_IOVEC
diff --git a/lustre/include/cl_object.h b/lustre/include/cl_object.h
index c672249..2db876f 100644
--- a/lustre/include/cl_object.h
+++ b/lustre/include/cl_object.h
@@ -1428,6 +1428,36 @@ static inline void cl_read_ahead_release(const struct lu_env *env,
 		ra->cra_release(env, ra);
 }
 
+enum cl_io_priority {
+	/* Normal I/O, usually just queue the pages in the client-side cache. */
+	IO_PRIO_NORMAL = 0,
+	/* I/O is urgent and should flush queued pages to OSTs ASAP. */
+	IO_PRIO_URGENT,
+	/* The memcg is under high memory pressure and the user write process
+	 * is dirty exceeded and under rate limiting in balance_dirty_pages().
+	 * It needs to flush dirty pages for the corresponding @wb ASAP.
+	 */
+	IO_PRIO_DIRTY_EXCEEDED,
+	/*
+	 * I/O is urgent and flushing pages are marked with the
+	 * OBD_BRW_SOFT_SYNC flag and may trigger a soft sync on OSTs. Thus
+	 * it can free unstable pages more quickly.
+	 */
+	IO_PRIO_SOFT_SYNC,
+	/*
+	 * The system or a certain memcg is under high memory pressure. Need
+	 * to flush dirty pages to OSTs immediately, and the I/O RPC must wait
+	 * for the write transaction commit on OSTs synchronously to release
+	 * unstable pages.
+	 */
+	IO_PRIO_HARD_SYNC,
+	IO_PRIO_MAX,
+};
+
+static inline bool cl_io_high_prio(enum cl_io_priority prio)
+{
+	return prio >= IO_PRIO_URGENT;
+}
+
 /**
  * Per-layer io operations.
  * \see vvp_io_ops, lov_io_ops, lovsub_io_ops, osc_io_ops
@@ -1540,12 +1570,13 @@ struct cl_io_operations {
 	int  (*cio_commit_async)(const struct lu_env *env,
 				 const struct cl_io_slice *slice,
 				 struct cl_page_list *queue, int from, int to,
-				 cl_commit_cbt cb);
+				 cl_commit_cbt cb, enum cl_io_priority prio);
 	/**
 	 * Release active extent.
 	 */
 	void (*cio_extent_release)(const struct lu_env *env,
-				   const struct cl_io_slice *slice);
+				   const struct cl_io_slice *slice,
+				   enum cl_io_priority prio);
 	/**
 	 * Decide maximum read ahead extent
 	 *
@@ -1833,13 +1864,14 @@ struct cl_io {
 			struct cl_page *ft_page;
 		} ci_fault;
 		struct cl_fsync_io {
-			loff_t fi_start;
-			loff_t fi_end;
+			loff_t			fi_start;
+			loff_t			fi_end;
 			/** file system level fid */
-			struct lu_fid *fi_fid;
-			enum cl_fsync_mode fi_mode;
+			struct lu_fid		*fi_fid;
+			enum cl_fsync_mode	fi_mode;
 			/* how many pages were written/discarded */
-			unsigned int fi_nr_written;
+			unsigned int		fi_nr_written;
+			enum cl_io_priority	fi_prio;
 		} ci_fsync;
 		struct cl_ladvise_io {
 			__u64 lio_start;
@@ -2389,8 +2421,9 @@ int cl_io_submit_sync(const struct lu_env *env, struct cl_io *io,
 		      long timeout);
 int cl_io_commit_async(const struct lu_env *env, struct cl_io *io,
 		       struct cl_page_list *queue, int from, int to,
-		       cl_commit_cbt cb);
-void cl_io_extent_release(const struct lu_env *env, struct cl_io *io);
+		       cl_commit_cbt cb, enum cl_io_priority prio);
+void cl_io_extent_release(const struct lu_env *env, struct cl_io *io,
+			  enum cl_io_priority prio);
 int cl_io_lru_reserve(const struct lu_env *env, struct cl_io *io,
 		      loff_t pos, size_t bytes);
 int cl_io_read_ahead(const struct lu_env *env, struct cl_io *io,
diff --git a/lustre/include/lustre_osc.h b/lustre/include/lustre_osc.h
index 55027da..f0e4741 100644
--- a/lustre/include/lustre_osc.h
+++ b/lustre/include/lustre_osc.h
@@ -563,7 +563,8 @@ int osc_cache_truncate_start(const struct lu_env *env, struct osc_object *obj,
 			     __u64 size, struct osc_extent **extp);
 void osc_cache_truncate_end(const struct lu_env *env, struct osc_extent *ext);
 int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj,
-			      pgoff_t start, pgoff_t end, int hp, int discard);
+			      pgoff_t start, pgoff_t end, int hp, int discard,
+			      enum cl_io_priority prio);
 int osc_cache_wait_range(const struct lu_env *env, struct osc_object *obj,
 			 pgoff_t start, pgoff_t end);
 int osc_io_unplug0(const struct lu_env *env, struct client_obd *cli,
@@ -645,9 +646,10 @@ int osc_dio_submit(const struct lu_env *env, struct cl_io *io,
 int osc_io_commit_async(const struct lu_env *env,
 			const struct cl_io_slice *ios,
 			struct cl_page_list *qin, int from, int to,
-			cl_commit_cbt cb);
+			cl_commit_cbt cb, enum cl_io_priority prio);
 void osc_io_extent_release(const struct lu_env *env,
-			   const struct cl_io_slice *ios);
+			   const struct cl_io_slice *ios,
+			   enum cl_io_priority prio);
 int osc_io_iter_init(const struct lu_env *env, const struct cl_io_slice *ios);
 void osc_io_iter_fini(const struct lu_env *env, const struct cl_io_slice *ios);
@@ -733,6 +735,11 @@ static inline struct client_obd *osc_cli(const struct osc_object *obj)
 	return &osc_export(obj)->exp_obd->u.cli;
 }
 
+static inline char *cli_name(struct client_obd *cli)
+{
+	return cli->cl_import->imp_obd->obd_name;
+}
+
 static inline struct osc_object *cl2osc(const struct cl_object *obj)
 {
 	return container_of_safe(obj, struct osc_object, oo_cl);
diff --git a/lustre/llite/file.c b/lustre/llite/file.c
index fdd0ebc..8434133 100644
--- a/lustre/llite/file.c
+++ b/lustre/llite/file.c
@@ -5287,7 +5287,7 @@ static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
 
 	/* flush local cache first if any */
 	cl_sync_file_range(inode, offset, OBD_OBJECT_EOF,
-			   CL_FSYNC_LOCAL, 0);
+			   CL_FSYNC_LOCAL, 0, IO_PRIO_NORMAL);
 
 	retval = ll_lseek(file, offset, origin);
 	if (retval < 0)
@@ -5343,7 +5343,8 @@ static int ll_flush(struct file *file, fl_owner_t id)
  * Return how many pages have been written.
  */
 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
-		       enum cl_fsync_mode mode, int ignore_layout)
+		       enum cl_fsync_mode mode, int ignore_layout,
+		       enum cl_io_priority prio)
 {
 	struct lu_env *env;
 	struct cl_io *io;
@@ -5373,6 +5374,7 @@ int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
 	fio->fi_fid = ll_inode2fid(inode);
 	fio->fi_mode = mode;
 	fio->fi_nr_written = 0;
+	fio->fi_prio = prio;
 
 	if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
 		result = cl_io_loop(env, io);
@@ -5451,7 +5453,8 @@ int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 		err = pcc_fsync(file, start, end, datasync, &cached);
 		if (!cached)
 			err = cl_sync_file_range(inode, start, end,
-						 CL_FSYNC_ALL, 0);
+						 CL_FSYNC_ALL, 0,
+						 IO_PRIO_NORMAL);
 	if (rc == 0 && err < 0)
 		rc = err;
 	if (rc < 0)
diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h
index 37434f2..e18631e08 100644
--- a/lustre/llite/llite_internal.h
+++ b/lustre/llite/llite_internal.h
@@ -1379,7 +1379,8 @@ int ll_read_folio(struct file *file, struct folio *folio);
 int ll_io_read_page(const struct lu_env *env, struct cl_io *io,
 		    struct cl_page *page, struct file *file);
 void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras);
-int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io);
+int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io,
+			enum cl_io_priority prio);
 
 enum lcc_type;
 void ll_cl_add(struct inode *inode, const struct lu_env *env, struct cl_io *io,
@@ -2002,7 +2003,8 @@ dentry_may_statahead(struct inode *dir, struct dentry *dentry)
 }
 
 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
-		       enum cl_fsync_mode mode, int ignore_layout);
+		       enum cl_fsync_mode mode, int ignore_layout,
+		       enum cl_io_priority prio);
 
 static inline int ll_file_nolock(const struct file *file)
 {
diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c
index 6c13a68..68627d4 100644
--- a/lustre/llite/llite_lib.c
+++ b/lustre/llite/llite_lib.c
@@ -3265,7 +3265,8 @@ void ll_delete_inode(struct inode *inode)
 		 * unlink, so that file is not opened somewhere else
 		 */
 		cl_sync_file_range(inode, 0, OBD_OBJECT_EOF, inode->i_nlink ?
-				   CL_FSYNC_LOCAL : CL_FSYNC_DISCARD, 1);
+				   CL_FSYNC_LOCAL : CL_FSYNC_DISCARD, 1,
+				   IO_PRIO_NORMAL);
 	}
 
 	ll_truncate_inode_pages_final(inode);
diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c
index d0d04f5..d7825a5 100644
--- a/lustre/llite/rw.c
+++ b/lustre/llite/rw.c
@@ -1543,7 +1543,7 @@ int ll_writepage(struct page *vmpage, struct writeback_control *wbc)
 		 */
 		result = cl_sync_file_range(inode, offset,
 					    offset + PAGE_SIZE - 1,
-					    CL_FSYNC_LOCAL, 1);
+					    CL_FSYNC_LOCAL, 1, IO_PRIO_NORMAL);
 		if (result > 0) {
 			/* May have written more than one page. decreasing this
 			 * page because the caller will count it.
@@ -1570,6 +1570,7 @@ out:
 int ll_writepages(struct address_space *mapping, struct writeback_control *wbc)
 {
 	struct inode *inode = mapping->host;
+	enum cl_io_priority prio = IO_PRIO_NORMAL;
 	loff_t start;
 	loff_t end;
 	enum cl_fsync_mode mode;
@@ -1611,8 +1612,11 @@ int ll_writepages(struct address_space *mapping, struct writeback_control *wbc)
 	wb = inode_to_wb(inode);
 	if (wbc->for_background ||
 	    (wb->start_all_reason == WB_REASON_VMSCAN &&
-	     test_bit(WB_start_all, &wb->state)))
+	     test_bit(WB_start_all, &wb->state))) {
 		mode = CL_FSYNC_RECLAIM;
+		if (wb->dirty_exceeded)
+			prio = IO_PRIO_DIRTY_EXCEEDED;
+	}
 	spin_unlock(&inode->i_lock);
 #else
 	/*
@@ -1634,7 +1638,7 @@ int ll_writepages(struct address_space *mapping, struct writeback_control *wbc)
 	 * inside the IO context of write, which will cause deadlock at
 	 * layout_conf since it waits for active IOs to complete.
 	 */
-	result = cl_sync_file_range(inode, start, end, mode, 1);
+	result = cl_sync_file_range(inode, start, end, mode, 1, prio);
 	if (result > 0) {
 		wbc->nr_to_write -= result;
 		result = 0;
diff --git a/lustre/llite/rw26.c b/lustre/llite/rw26.c
index d5e2f43..116c57b 100644
--- a/lustre/llite/rw26.c
+++ b/lustre/llite/rw26.c
@@ -781,7 +781,7 @@ again:
 	}
 
 	/* commit pages and then wait for page lock */
-	result = vvp_io_write_commit(env, io);
+	result = vvp_io_write_commit(env, io, IO_PRIO_NORMAL);
 	if (result < 0)
 		GOTO(out, result);
 
@@ -903,6 +903,7 @@ static int ll_write_end(struct file *file, struct address_space *mapping,
 	struct cl_page *page;
 	struct page *vmpage = wbe_folio_page(vmfolio);
 	unsigned from = pos & (PAGE_SIZE - 1);
+	enum cl_io_priority prio = IO_PRIO_NORMAL;
 	bool unplug = false;
 	int result = 0;
 	ENTRY;
@@ -926,6 +927,29 @@ static int ll_write_end(struct file *file, struct address_space *mapping,
 	LASSERT(cl_page_is_owned(page, io));
 	if (copied > 0) {
 		struct cl_page_list *plist = &vio->u.readwrite.vui_queue;
+#ifdef SB_I_CGROUPWB
+		struct inode *inode = file_inode(file);
+		struct bdi_writeback *wb;
+
+		spin_lock(&inode->i_lock);
+#ifdef HAVE_INODE_ATTACH_WB_FOLIO
+		inode_attach_wb(inode, page_folio(vmpage));
+#else
+		inode_attach_wb(inode, vmpage);
+#endif
+		wb = inode_to_wb(inode);
+		LASSERTF(wb != NULL, "wb@%pK\n", wb);
+		if (wb->dirty_exceeded) {
+			unplug = true;
+			prio = IO_PRIO_URGENT;
+			CDEBUG(D_IOTRACE, "wb@%pK dirty_ratelimit=%lu balanced_dirty_ratelimit=%lu dirty_exceeded=%d state=%lX last_old_flush=%lu\n",
+			       wb, wb->dirty_ratelimit,
+			       wb->balanced_dirty_ratelimit,
+			       wb->dirty_exceeded, wb->state,
+			       wb->last_old_flush);
+		}
+		spin_unlock(&inode->i_lock);
+#endif
 
 		lcc->lcc_page = NULL; /* page will be queued */
@@ -962,7 +986,7 @@ static int ll_write_end(struct file *file, struct address_space *mapping,
 		    io->u.ci_rw.crw_pos + io->u.ci_rw.crw_bytes)
 			unplug = true;
 		if (unplug)
-			result = vvp_io_write_commit(env, io);
+			result = vvp_io_write_commit(env, io, prio);
 
 	if (result < 0)
 		io->ci_result = result;
diff --git a/lustre/llite/vvp_internal.h b/lustre/llite/vvp_internal.h
index 33821e6..8776db1 100644
--- a/lustre/llite/vvp_internal.h
+++ b/lustre/llite/vvp_internal.h
@@ -247,7 +247,8 @@ struct vvp_object *cl_inode2vvp(struct inode *inode);
 
 int vvp_io_init(const struct lu_env *env, struct cl_object *obj,
 		struct cl_io *io);
-int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io);
+int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io,
+			enum cl_io_priority prio);
 int vvp_page_init(const struct lu_env *env, struct cl_object *obj,
 		  struct cl_page *page, pgoff_t index);
 struct lu_object *vvp_object_alloc(const struct lu_env *env,
diff --git a/lustre/llite/vvp_io.c b/lustre/llite/vvp_io.c
index cf8bf44..47c3cf0 100644
--- a/lustre/llite/vvp_io.c
+++ b/lustre/llite/vvp_io.c
@@ -1182,7 +1182,8 @@ static bool page_list_sanity_check(struct cl_object *obj,
 }
 
 /* Return how many bytes have queued or written */
-int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io)
+int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io,
+			enum cl_io_priority prio)
 {
 	struct cl_object *obj = io->ci_obj;
 	struct inode *inode = vvp_object_inode(obj);
@@ -1198,8 +1199,9 @@ int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io)
 	if (npages == 0)
 		RETURN(0);
 
-	CDEBUG(D_VFSTRACE, "commit async pages: %d, from %d, to %d\n",
-	       npages, vio->u.readwrite.vui_from, vio->u.readwrite.vui_to);
+	CDEBUG(D_VFSTRACE, "commit async pages: %d, from %d, to %d prio %d\n",
+	       npages, vio->u.readwrite.vui_from, vio->u.readwrite.vui_to,
+	       prio);
 
 	LASSERT(page_list_sanity_check(obj, queue));
 
@@ -1207,7 +1209,7 @@ int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io)
 	rc = cl_io_commit_async(env, io, queue,
 				vio->u.readwrite.vui_from,
 				vio->u.readwrite.vui_to,
-				write_commit_callback);
+				write_commit_callback, prio);
 	npages -= queue->pl_nr; /* already committed pages */
 	if (npages > 0) {
 		/* calculate how many bytes were written */
@@ -1231,7 +1233,7 @@ int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io)
 	LASSERT(ergo(rc == 0, queue->pl_nr == 0));
 
 	/* out of quota, try sync write */
-	if (rc == -EDQUOT && !cl_io_is_mkwrite(io)) {
+	if ((rc == -EDQUOT && !cl_io_is_mkwrite(io)) || prio > IO_PRIO_NORMAL) {
 		struct ll_inode_info *lli = ll_i2info(inode);
 
 		rc = vvp_io_commit_sync(env, io, queue,
@@ -1375,7 +1377,7 @@ static int vvp_io_write_start(const struct lu_env *env,
 	}
 
 	if (result > 0) {
-		result = vvp_io_write_commit(env, io);
+		result = vvp_io_write_commit(env, io, IO_PRIO_NORMAL);
 		/* Simulate short commit */
 		if (CFS_FAULT_CHECK(OBD_FAIL_LLITE_SHORT_COMMIT)) {
 			vio->u.readwrite.vui_written >>= 1;
@@ -1613,7 +1615,8 @@ static int vvp_io_fault_start(const struct lu_env *env,
 			 * still have chance to detect it.
 			 */
 			result = cl_io_commit_async(env, io, plist, 0, to,
-						    mkwrite_commit_callback);
+						    mkwrite_commit_callback,
+						    IO_PRIO_NORMAL);
 			/* Have overquota flag, trying sync write to check
 			 * whether indeed out of quota
 			 */
@@ -1627,7 +1630,8 @@ static int vvp_io_fault_start(const struct lu_env *env,
 				cl_page_list_add(plist, page, true);
 				result = cl_io_commit_async(env, io, plist, 0, to,
-							    mkwrite_commit_callback);
+							    mkwrite_commit_callback,
+							    IO_PRIO_NORMAL);
 				io->ci_noquota = 0;
 			} else {
 				cl_page_put(env, page);
diff --git a/lustre/llite/vvp_object.c b/lustre/llite/vvp_object.c
index 4d67be6..299d83c 100644
--- a/lustre/llite/vvp_object.c
+++ b/lustre/llite/vvp_object.c
@@ -139,7 +139,8 @@ static int vvp_prune(const struct lu_env *env, struct cl_object *obj)
 	int rc;
 	ENTRY;
 
-	rc = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF, CL_FSYNC_LOCAL, 1);
+	rc = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF, CL_FSYNC_LOCAL, 1,
+				IO_PRIO_NORMAL);
 	if (rc < 0) {
 		CDEBUG(D_VFSTRACE, DFID ": writeback failed: %d\n",
 		       PFID(lu_object_fid(&obj->co_lu)), rc);
diff --git a/lustre/lov/lov_io.c b/lustre/lov/lov_io.c
index b6bbbcc..43e7eb8 100644
--- a/lustre/lov/lov_io.c
+++ b/lustre/lov/lov_io.c
@@ -758,6 +758,7 @@ static void lov_io_sub_inherit(struct lov_io_sub *sub, struct lov_io *lio,
 		io->u.ci_fsync.fi_end = end;
 		io->u.ci_fsync.fi_fid = parent->u.ci_fsync.fi_fid;
 		io->u.ci_fsync.fi_mode = parent->u.ci_fsync.fi_mode;
+		io->u.ci_fsync.fi_prio = parent->u.ci_fsync.fi_prio;
 		break;
 	}
 	case CIT_READ:
@@ -1445,10 +1446,11 @@ static int lov_io_submit(const struct lu_env *env,
 static int lov_io_commit_async(const struct lu_env *env,
 			       const struct cl_io_slice *ios,
 			       struct cl_page_list *queue, int from, int to,
-			       cl_commit_cbt cb)
+			       cl_commit_cbt cb, enum cl_io_priority prio)
 {
 	struct cl_page_list *plist = &lov_env_info(env)->lti_plist;
 	struct lov_io *lio = cl2lov_io(env, ios);
+	bool hp = cl_io_high_prio(prio);
 	struct lov_io_sub *sub;
 	struct cl_page *page;
 	int rc = 0;
@@ -1463,7 +1465,7 @@ static int lov_io_commit_async(const struct lu_env *env,
 		LASSERT(!IS_ERR(sub));
 		LASSERT(sub == &lio->lis_single_subio);
 		rc = cl_io_commit_async(sub->sub_env, &sub->sub_io, queue,
-					from, to, cb);
+					from, to, cb, prio);
 		RETURN(rc);
 	}
 
@@ -1493,7 +1495,8 @@ static int lov_io_commit_async(const struct lu_env *env,
 		sub = lov_sub_get(env, lio, index);
 		if (!IS_ERR(sub)) {
 			rc = cl_io_commit_async(sub->sub_env, &sub->sub_io,
-						plist, from, stripe_to, cb);
+						plist, from, stripe_to, cb,
+						prio);
 		} else {
 			rc = PTR_ERR(sub);
 			break;
@@ -1504,9 +1507,14 @@ static int lov_io_commit_async(const struct lu_env *env,
 
 		from = 0;
 
-		if (lov_comp_entry(index) !=
+		if (!hp && lov_comp_entry(index) !=
 		    lov_comp_entry(page->cp_lov_index))
-			cl_io_extent_release(sub->sub_env, &sub->sub_io);
+			cl_io_extent_release(sub->sub_env, &sub->sub_io, prio);
+	}
+
+	if (rc == 0 && hp) {
+		list_for_each_entry(sub, &lio->lis_subios, sub_list)
+			cl_io_extent_release(sub->sub_env, &sub->sub_io, prio);
 	}
 
 	/* for error case, add the page back into the qin list */
diff --git a/lustre/mdc/mdc_dev.c b/lustre/mdc/mdc_dev.c
index 67a7451..1817ff8 100644
--- a/lustre/mdc/mdc_dev.c
+++ b/lustre/mdc/mdc_dev.c
@@ -256,7 +256,7 @@ static int mdc_lock_flush(const struct lu_env *env, struct osc_object *obj,
 	if (mode == CLM_WRITE) {
 		result = osc_cache_writeback_range(env, obj, start, end, 1,
-						   discard);
+						   discard, IO_PRIO_NORMAL);
 		CDEBUG(D_CACHE, "object %p: [%lu -> %lu] %d pages were %s.\n",
 		       obj, start, end, result, discard ?
 		       "discarded" : "written back");
@@ -1182,9 +1182,16 @@ static int mdc_io_fsync_start(const struct lu_env *env,
 	if (fio->fi_mode == CL_FSYNC_RECLAIM) {
 		struct client_obd *cli = osc_cli(osc);
 
-		if (!atomic_long_read(&cli->cl_unstable_count)) {
-			/* Stop flush when there are no unstable pages? */
-			CDEBUG(D_CACHE, "unstable count is zero\n");
+		if (!atomic_read(&osc->oo_nr_ios) &&
+		    !atomic_read(&osc->oo_nr_writes) &&
+		    !atomic_long_read(&cli->cl_unstable_count)) {
+			/*
+			 * No active IO, no dirty pages needing to write and no
+			 * unstable pages needing to commit.
+			 */
+			CDEBUG(D_CACHE,
+			       "%s: dirty/unstable counts are both zero\n",
+			       cli_name(cli));
 			RETURN(0);
 		}
 	}
@@ -1193,7 +1200,8 @@ static int mdc_io_fsync_start(const struct lu_env *env,
 	 * possible range despite of supplied start/end values.
 	 */
 	result = osc_cache_writeback_range(env, osc, 0, CL_PAGE_EOF, 0,
-					   fio->fi_mode == CL_FSYNC_DISCARD);
+					   fio->fi_mode == CL_FSYNC_DISCARD,
+					   fio->fi_prio);
 	if (result > 0) {
 		fio->fi_nr_written += result;
 		result = 0;
diff --git a/lustre/obdclass/cl_io.c b/lustre/obdclass/cl_io.c
index dff5c16..770bc92 100644
--- a/lustre/obdclass/cl_io.c
+++ b/lustre/obdclass/cl_io.c
@@ -631,13 +631,14 @@ EXPORT_SYMBOL(cl_io_lru_reserve);
  * @from: Starting position
  * @to: Ending position
  * @cb: callback function
+ * @prio: I/O priority
  *
 * Returns 0 if all pages committed, or errcode if error occurred.
 * see cl_io_operations::cio_commit_async()
 */
 int cl_io_commit_async(const struct lu_env *env, struct cl_io *io,
 		       struct cl_page_list *queue, int from, int to,
-		       cl_commit_cbt cb)
+		       cl_commit_cbt cb, enum cl_io_priority prio)
 {
 	const struct cl_io_slice *scan;
 	int result = 0;
@@ -647,7 +648,7 @@ int cl_io_commit_async(const struct lu_env *env, struct cl_io *io,
 		if (scan->cis_iop->cio_commit_async == NULL)
 			continue;
 		result = scan->cis_iop->cio_commit_async(env, scan, queue,
-							 from, to, cb);
+							 from, to, cb, prio);
 		if (result != 0)
 			break;
 	}
@@ -655,7 +656,8 @@ int cl_io_commit_async(const struct lu_env *env, struct cl_io *io,
 }
 EXPORT_SYMBOL(cl_io_commit_async);
 
-void cl_io_extent_release(const struct lu_env *env, struct cl_io *io)
+void cl_io_extent_release(const struct lu_env *env, struct cl_io *io,
+			  enum cl_io_priority prio)
 {
 	const struct cl_io_slice *scan;
 	ENTRY;
@@ -663,7 +665,7 @@ void cl_io_extent_release(const struct lu_env *env, struct cl_io *io)
 	list_for_each_entry(scan, &io->ci_layers, cis_linkage) {
 		if (scan->cis_iop->cio_extent_release == NULL)
 			continue;
-		scan->cis_iop->cio_extent_release(env, scan);
+		scan->cis_iop->cio_extent_release(env, scan, prio);
 	}
 	EXIT;
 }
diff --git a/lustre/osc/osc_cache.c b/lustre/osc/osc_cache.c
index a4f29f8..013d5f5 100644
--- a/lustre/osc/osc_cache.c
+++ b/lustre/osc/osc_cache.c
@@ -196,8 +196,7 @@ static int osc_extent_sanity_check0(struct osc_extent *ext,
 			GOTO(out, rc = 65);
 		fallthrough;
 	default:
-		if (atomic_read(&ext->oe_users) > 0)
-			GOTO(out, rc = 70);
+		break;
 	}
 
 	if (ext->oe_max_end < ext->oe_end || ext->oe_end < ext->oe_start)
@@ -564,10 +563,13 @@ static int osc_extent_merge(const struct lu_env *env, struct osc_extent *cur,
 /**
 * Drop user count of osc_extent, and unplug IO asynchronously.
 */
-void osc_extent_release(const struct lu_env *env, struct osc_extent *ext)
+void osc_extent_release(const struct lu_env *env, struct osc_extent *ext,
+			enum cl_io_priority prio)
 {
 	struct osc_object *obj = ext->oe_obj;
 	struct client_obd *cli = osc_cli(obj);
+	bool hp = cl_io_high_prio(prio);
+
 	ENTRY;
 
 	LASSERT(atomic_read(&ext->oe_users) > 0);
@@ -575,15 +577,26 @@ void osc_extent_release(const struct lu_env *env, struct osc_extent *ext)
 	LASSERT(ext->oe_grants > 0);
 	if (atomic_dec_and_lock(&ext->oe_users, &obj->oo_lock)) {
-		LASSERT(ext->oe_state == OES_ACTIVE);
 		if (ext->oe_trunc_pending) {
-			/* a truncate process is waiting for this extent.
+			/*
+			 * A truncate process is waiting for this extent.
 			 * This may happen due to a race, check
-			 * osc_cache_truncate_start(). */
+			 * osc_cache_truncate_start().
+			 */
+			if (ext->oe_state != OES_ACTIVE) {
+				int rc;
+
+				osc_object_unlock(obj);
+				rc = osc_extent_wait(env, ext, OES_INV);
+				if (rc < 0)
+					OSC_EXTENT_DUMP(D_ERROR, ext,
+							"error: %d.\n", rc);
+				osc_object_lock(obj);
+			}
 			osc_extent_state_set(ext, OES_TRUNC);
 			ext->oe_trunc_pending = 0;
 			osc_object_unlock(obj);
-		} else {
+		} else if (ext->oe_state == OES_ACTIVE) {
 			int grant = 0;
 
 			osc_extent_state_set(ext, OES_CACHE);
@@ -596,18 +609,17 @@ void osc_extent_release(const struct lu_env *env, struct osc_extent *ext)
 			if (osc_extent_merge(env, ext, next_extent(ext)) == 0)
 				grant += cli->cl_grant_extent_tax;
 
-			if (!ext->oe_rw && ext->oe_dlmlock) {
-				bool hp;
-
+			if (!hp && !ext->oe_rw && ext->oe_dlmlock) {
 				lock_res_and_lock(ext->oe_dlmlock);
 				hp = ldlm_is_cbpending(ext->oe_dlmlock);
 				unlock_res_and_lock(ext->oe_dlmlock);
-
-				/* HP extent should be written ASAP. */
-				if (hp)
-					ext->oe_hp = 1;
 			}
+
+			/* HP extent should be written ASAP. */
+			if (hp)
+				ext->oe_hp = 1;
+
 			if (ext->oe_hp)
 				list_move_tail(&ext->oe_link,
 					       &obj->oo_hp_exts);
@@ -621,9 +633,14 @@ void osc_extent_release(const struct lu_env *env, struct osc_extent *ext)
 			osc_object_unlock(obj);
 			if (grant > 0)
 				osc_unreserve_grant(cli, 0, grant);
+		} else {
+			osc_object_unlock(obj);
 		}
 
-		osc_io_unplug_async(env, cli, obj);
+		if (unlikely(cl_io_high_prio(prio)))
+			osc_io_unplug(env, cli, obj);
+		else
+			osc_io_unplug_async(env, cli, obj);
 	}
 	osc_extent_put(env, ext);
@@ -916,7 +933,7 @@ static int osc_extent_wait(const struct lu_env *env, struct osc_extent *ext,
 	}
 	osc_object_unlock(obj);
 	if (rc == 1)
-		osc_extent_release(env, ext);
+		osc_extent_release(env, ext, IO_PRIO_NORMAL);
 
 	/* wait for the extent until its state becomes @state */
 	rc = wait_event_idle_timeout(ext->oe_waitq,
@@ -1160,6 +1177,9 @@ static int osc_extent_expand(struct osc_extent *ext, pgoff_t index,
 	LASSERT(ext->oe_max_end >= index && ext->oe_start <= index);
 	osc_object_lock(obj);
+	if (ext->oe_state != OES_ACTIVE)
+		GOTO(out, rc = -ESTALE);
+
 	LASSERT(sanity_check_nolock(ext) == 0);
 	end_chunk = ext->oe_end >> ppc_bits;
 	if (chunk > end_chunk + 1)
@@ -2342,7 +2362,10 @@ int osc_queue_async_io(const struct lu_env *env, struct cl_io *io,
 	 * 2. otherwise, a new extent will be allocated.
 	 */
 	ext = oio->oi_active;
-	if (ext != NULL && ext->oe_start <= index && ext->oe_max_end >= index) {
+	if (ext != NULL && ext->oe_state != OES_ACTIVE) {
+		need_release = 1;
+	} else if (ext != NULL && ext->oe_start <= index &&
+		   ext->oe_max_end >= index) {
 		/* one chunk plus extent overhead must be enough to write this
 		 * page */
 		grants = (1 << cli->cl_chunkbits) + cli->cl_grant_extent_tax;
@@ -2376,7 +2399,7 @@ int osc_queue_async_io(const struct lu_env *env, struct cl_io *io,
 		need_release = 1;
 	}
 	if (need_release) {
-		osc_extent_release(env, ext);
+		osc_extent_release(env, ext, IO_PRIO_NORMAL);
 		oio->oi_active = NULL;
 		ext = NULL;
 	}
@@ -2407,6 +2430,7 @@ int osc_queue_async_io(const struct lu_env *env, struct cl_io *io,
 		grants = tmp;
 	}
 
+restart_find:
 	tmp = grants;
 	if (rc == 0) {
 		ext = osc_extent_find(env, osc, index, &tmp);
@@ -2430,6 +2454,28 @@ int osc_queue_async_io(const struct lu_env *env, struct cl_io *io,
 	LASSERT((oap->oap_brw_flags & OBD_BRW_FROM_GRANT) != 0);
 
 	osc_object_lock(osc);
+	if (ext->oe_state != OES_ACTIVE) {
+		if (ext->oe_state == OES_CACHE) {
+			osc_extent_state_set(ext, OES_ACTIVE);
+			osc_update_pending(osc, OBD_BRW_WRITE,
+					   -ext->oe_nr_pages);
+			list_del_init(&ext->oe_link);
+		} else {
+			osc_object_unlock(osc);
+			osc_extent_get(ext);
+			osc_extent_release(env, ext, IO_PRIO_NORMAL);
+			oio->oi_active = NULL;
+
+			/* Wait for the in-flight IO to finish. */
+			rc = osc_extent_wait(env, ext, OES_INV);
+			osc_extent_put(env, ext);
+			if (rc < 0)
+				RETURN(rc);
+
+			GOTO(restart_find, rc);
+		}
+	}
+
 	if (ext->oe_nr_pages == 0)
 		ext->oe_srvlock = ops->ops_srvlock;
 	else
@@ -3097,14 +3143,18 @@ EXPORT_SYMBOL(osc_cache_wait_range);
 * Return how many pages will be issued, or error code if error occurred.
 */
 int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj,
-			      pgoff_t start, pgoff_t end, int hp, int discard)
+			      pgoff_t start, pgoff_t end, int hp, int discard,
+			      enum cl_io_priority prio)
 {
 	struct osc_extent *ext;
 	LIST_HEAD(discard_list);
+	bool active_ext_check = false;
 	bool unplug = false;
 	int result = 0;
+
 	ENTRY;
 
+repeat:
 	osc_object_lock(obj);
 	ext = osc_extent_search(obj, start);
 	if (ext == NULL)
@@ -3176,6 +3226,16 @@ int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj,
 			 * grants. We do this for the correctness of fsync. */
 			LASSERT(hp == 0 && discard == 0);
 			ext->oe_urgent = 1;
+
+			if (active_ext_check) {
+				osc_extent_state_set(ext, OES_CACHE);
+				list_move_tail(&ext->oe_link,
+					       &obj->oo_urgent_exts);
+				osc_update_pending(obj, OBD_BRW_WRITE,
+						   ext->oe_nr_pages);
+				unplug = true;
+			}
+
 			break;
 		case OES_TRUNC:
 			/* this extent is being truncated, can't do anything
@@ -3223,7 +3283,22 @@ int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj,
 			result = rc;
 	}
 
-	OSC_IO_DEBUG(obj, "pageout [%lu, %lu], %d.\n", start, end, result);
+	OSC_IO_DEBUG(obj, "pageout [%lu, %lu] npages %lu: rc=%d.\n",
+		     start, end, obj->oo_npages, result);
+
+	/*
+	 * Try to flush the active I/O extents of the object.
+	 * Otherwise, the user process writing the file may be dirty exceeded
+	 * and wait endlessly in balance_dirty_pages().
+	 */
+	if (result == 0 && prio == IO_PRIO_DIRTY_EXCEEDED &&
+	    !active_ext_check && atomic_read(&obj->oo_nr_ios) &&
+	    obj->oo_npages > 0) {
+		osc_extent_tree_dump(D_CACHE, obj);
+		active_ext_check = true;
+		GOTO(repeat, result);
+	}
+
 	RETURN(result);
 }
 EXPORT_SYMBOL(osc_cache_writeback_range);
diff --git a/lustre/osc/osc_internal.h b/lustre/osc/osc_internal.h
index 40770ce..89e2c71 100644
--- a/lustre/osc/osc_internal.h
+++ b/lustre/osc/osc_internal.h
@@ -29,7 +29,8 @@ void osc_update_next_shrink(struct client_obd *cli);
 int lru_queue_work(const struct lu_env *env, void *data);
 int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext,
 		      int sent, int rc);
-void osc_extent_release(const struct lu_env *env, struct osc_extent *ext);
+void osc_extent_release(const struct lu_env *env, struct osc_extent *ext,
+			enum cl_io_priority prio);
 int osc_lock_discard_pages(const struct lu_env *env, struct osc_object *osc,
 			   pgoff_t start, pgoff_t end, bool discard);
 int osc_ldlm_hp_handle(const struct lu_env *env, struct osc_object *obj,
@@ -110,11 +111,6 @@ static inline unsigned long rpcs_in_flight(struct client_obd *cli)
 	return cli->cl_r_in_flight + cli->cl_w_in_flight;
 }
 
-static inline char *cli_name(struct client_obd *cli)
-{
-	return cli->cl_import->imp_obd->obd_name;
-}
-
 static inline char list_empty_marker(struct list_head *list)
 {
 	return list_empty(list) ? '-' : '+';
diff --git a/lustre/osc/osc_io.c b/lustre/osc/osc_io.c
index 4f9ecd0..42ff8eb 100644
--- a/lustre/osc/osc_io.c
+++ b/lustre/osc/osc_io.c
@@ -422,7 +422,7 @@ void osc_page_touch_at(const struct lu_env *env, struct cl_object *obj,
 int osc_io_commit_async(const struct lu_env *env,
 			const struct cl_io_slice *ios,
 			struct cl_page_list *qin, int from, int to,
-			cl_commit_cbt cb)
+			cl_commit_cbt cb, enum cl_io_priority prio)
 {
 	struct cl_io *io = ios->cis_io;
 	struct osc_io *oio = cl2osc_io(env, ios);
@@ -500,8 +500,8 @@ int osc_io_commit_async(const struct lu_env *env,
 	/* for sync write, kernel will wait for this page to be flushed before
 	 * osc_io_end() is called, so release it earlier.
 	 * for mkwrite(), it's known there is no further pages. */
-	if (cl_io_is_sync_write(io) && oio->oi_active != NULL) {
-		osc_extent_release(env, oio->oi_active);
+	if (cl_io_is_sync_write(io) && oio->oi_active) {
+		osc_extent_release(env, oio->oi_active, prio);
 		oio->oi_active = NULL;
 	}
 
@@ -511,12 +511,13 @@ int osc_io_commit_async(const struct lu_env *env,
 }
 EXPORT_SYMBOL(osc_io_commit_async);
 
 void osc_io_extent_release(const struct lu_env *env,
-			   const struct cl_io_slice *ios)
+			   const struct cl_io_slice *ios,
+			   enum cl_io_priority prio)
 {
 	struct osc_io *oio = cl2osc_io(env, ios);
 
 	if (oio->oi_active != NULL) {
-		osc_extent_release(env, oio->oi_active);
+		osc_extent_release(env, oio->oi_active, prio);
 		oio->oi_active = NULL;
 	}
 }
@@ -686,7 +687,8 @@ int osc_punch_start(const struct lu_env *env, struct cl_io *io,
 	int rc;
 	ENTRY;
 
-	rc = osc_cache_writeback_range(env, osc, pg_start, pg_end, 1, 0);
+	rc = osc_cache_writeback_range(env, osc, pg_start, pg_end, 1, 0,
+				       IO_PRIO_NORMAL);
 	if (rc < 0)
 		RETURN(rc);
 
@@ -1106,9 +1108,16 @@ static int osc_io_fsync_start(const struct lu_env *env,
 	if (fio->fi_mode == CL_FSYNC_RECLAIM) {
 		struct client_obd *cli = osc_cli(osc);
 
-		if (!atomic_long_read(&cli->cl_unstable_count)) {
-			/* Stop flush when there are no unstable pages? */
-			CDEBUG(D_CACHE, "unstable count is zero\n");
+		if (!atomic_read(&osc->oo_nr_ios) &&
+		    !atomic_read(&osc->oo_nr_writes) &&
+		    !atomic_long_read(&cli->cl_unstable_count)) {
+			/*
+			 * No active I/O, no dirty pages needing to write and
+			 * no unstable pages needing to commit.
+			 */
+			CDEBUG(D_CACHE,
+			       "%s: unstable/dirty counts are both zero\n",
+			       cli_name(cli));
 			RETURN(0);
 		}
 	}
@@ -1117,7 +1126,8 @@ static int osc_io_fsync_start(const struct lu_env *env,
 		end = CL_PAGE_EOF;
 
 	result = osc_cache_writeback_range(env, osc, start, end, 0,
-					   fio->fi_mode == CL_FSYNC_DISCARD);
+					   fio->fi_mode == CL_FSYNC_DISCARD,
+					   fio->fi_prio);
 	if (result < 0 && fio->fi_mode == CL_FSYNC_DISCARD) {
 		CDEBUG(D_CACHE,
 		       "%s: ignore error %d on discarding "DFID":[%lu-%lu]\n",
@@ -1258,7 +1268,7 @@ void osc_io_end(const struct lu_env *env, const struct cl_io_slice *slice)
 	struct osc_io *oio = cl2osc_io(env, slice);
 
 	if (oio->oi_active) {
-		osc_extent_release(env, oio->oi_active);
+		osc_extent_release(env, oio->oi_active, IO_PRIO_NORMAL);
 		oio->oi_active = NULL;
 	}
 }
diff --git a/lustre/osc/osc_lock.c b/lustre/osc/osc_lock.c
index f6689f1..237d20c 100644
--- a/lustre/osc/osc_lock.c
+++ b/lustre/osc/osc_lock.c
@@ -349,7 +349,7 @@ static int osc_lock_flush(struct osc_object *obj, pgoff_t start, pgoff_t end,
 	if (mode == CLM_WRITE) {
 		rc = osc_cache_writeback_range(env, obj, start, end, 1,
-					       discard);
+					       discard, IO_PRIO_NORMAL);
 		CDEBUG(D_CACHE, "object %p: [%lu -> %lu] %d pages were %s.\n",
 		       obj, start, end, rc, discard ?
 		       "discarded" : "written back");
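
A note on how the new priority plumbing fits together. The helper below
is hypothetical and exists only to summarize where each cl_io_priority
value originates in this patch; it is not code added to the tree:

  /* Illustrative summary only (condensed from the hunks above). */
  static enum cl_io_priority writeback_prio(bool write_end_dirty_exceeded,
					    bool reclaim_dirty_exceeded)
  {
	/* ll_write_end(): the writer's @wb is dirty exceeded */
	if (write_end_dirty_exceeded)
		return IO_PRIO_URGENT;
	/* ll_writepages(): CL_FSYNC_RECLAIM with wb->dirty_exceeded set */
	if (reclaim_dirty_exceeded)
		return IO_PRIO_DIRTY_EXCEEDED;
	/* every other caller converted by this patch */
	return IO_PRIO_NORMAL;
  }

Any value at or above IO_PRIO_URGENT makes cl_io_high_prio() true, so
osc_extent_release() marks the extent ->oe_hp and unplugs the client
OBD synchronously via osc_io_unplug(). IO_PRIO_DIRTY_EXCEEDED
additionally makes osc_cache_writeback_range() re-scan the object
(active_ext_check) and move OES_ACTIVE extents to OES_CACHE on the
urgent list so they can actually be flushed. IO_PRIO_SOFT_SYNC and
IO_PRIO_HARD_SYNC are defined in the new enum but are not yet passed by
any call site in the hunks above.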