From 1dad1b111b163704aaff40e96e0248416a49b1d9 Mon Sep 17 00:00:00 2001
From: Qian Yingjin
Date: Fri, 6 Sep 2024 00:57:05 +0800
Subject: [PATCH] LU-17190 llite: release locks after all read-ahead pages are
 submitted

Put all DLM extent locks acquired for read-ahead in a list, and only
release them after all read-ahead pages have been submitted.  This
way, by the time an extent lock blocking AST runs, all reading extents
have already been submitted and added to the list @oo_reading_exts of
the OSC object.  The client can then check this list to find the
conflicting outstanding extents when all I/O RPC slots (limited by
osc.*.max_rpcs_in_flight) have been used up by direct I/Os, which take
server-side locking.

In the original code, read-ahead matched a DLM extent lock, added the
read-ahead pages to the queue list, and released the previously
matched lock; this was repeated across the read-ahead window, and only
then was the I/O containing all read-ahead pages submitted
(@osc_io_submit).  With that ordering, conflicting extents in
OES_LOCK_DONE state could be added to @oo_reading_exts after the
blocking AST had already checked the list.

On the client side, the blocking AST triggered by server-side locking
for DIO tries to lock the pages of these lock-done extents in order to
write back or discard the cached pages covered by the lock.  All pages
in a lock-done extent are locked (PG_locked), and the extent waits for
an RPC slot while every RPC slot is held by DIO.  This can deadlock.

This patch is a prerequisite for the next patch, which adds
high-priority I/O for blocking ASTs: the client checks the list
@oo_reading_exts to find the conflicting outstanding extents and puts
them on the HP list, so they are sent to the OSTs and handled as soon
as possible, avoiding the possible deadlock.

Change-Id: I5661607ecba3b6cbd6e29ae3fa14566a5ec045f1
Signed-off-by: Qian Yingjin
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/56324
Tested-by: jenkins
Tested-by: Maloo
Reviewed-by: Patrick Farrell
Reviewed-by: Shaun Tancheff
Reviewed-by: Oleg Drokin
---
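
For illustration, the new lock lifetime reduces to the standalone C
sketch below; struct ra_ref, ra_hold() and ra_release_all() are
simplified stand-ins for struct cl_read_ahead, cl_io_read_ahead() and
ll_readahead_locks_release(), not the actual Lustre API:

/* Sketch: hold every lock matched during read-ahead on a list and
 * drop them all only after every read-ahead page has been submitted. */
#include <stdio.h>
#include <stdlib.h>

struct ra_ref {			/* stands in for struct cl_read_ahead */
	struct ra_ref *next;	/* stands in for cra_linkage */
	int lock_id;		/* stands in for cra_dlmlock */
};

/* Step 1: remember each matched lock instead of releasing its
 * predecessor (cf. list_add_tail() onto ria_cl_ra_list). */
static void ra_hold(struct ra_ref **head, int lock_id)
{
	struct ra_ref *ra = malloc(sizeof(*ra));

	if (ra == NULL)
		return;		/* mirrors the patch's "Ignore the error" */
	ra->lock_id = lock_id;
	ra->next = *head;
	*head = ra;
}

/* Step 2: once all pages are submitted, walk the list and release
 * every held lock (cf. ll_readahead_locks_release()). */
static void ra_release_all(struct ra_ref **head)
{
	while (*head != NULL) {
		struct ra_ref *ra = *head;

		*head = ra->next;
		printf("release DLM lock %d\n", ra->lock_id);
		free(ra);
	}
}

int main(void)
{
	struct ra_ref *held = NULL;

	ra_hold(&held, 1);	/* match lock, queue pages ... */
	ra_hold(&held, 2);	/* ... next extent, more pages ... */
	/* all read-ahead pages are submitted here */
	ra_release_all(&held);	/* only now are the locks dropped */
	return 0;
}

Because no lock is dropped until submission is complete, a blocking
AST that arrives in the meantime always finds the reading extents
already on @oo_reading_exts.
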
 lustre/include/cl_object.h    |  9 ++++--
 lustre/llite/llite_internal.h |  5 +++
 lustre/llite/rw.c             | 74 ++++++++++++++++++++++++++++---------------
 3 files changed, 61 insertions(+), 27 deletions(-)

diff --git a/lustre/include/cl_object.h b/lustre/include/cl_object.h
index 042c6a1..8467947 100644
--- a/lustre/include/cl_object.h
+++ b/lustre/include/cl_object.h
@@ -1387,8 +1387,14 @@ struct cl_read_ahead {
 	void *cra_dlmlock;
 	void *cra_oio;
 
+	/*
+	 * Linkage tracking all cl_read_aheads for a read-ahead operation,
+	 * used for releasing the DLM locks acquired during read-ahead.
+	 */
+	struct list_head cra_linkage;
+
 	/* whether lock is in contention */
-	bool cra_contention;
+	bool cra_contention;
 };
 
 static inline void cl_read_ahead_release(const struct lu_env *env,
@@ -1396,7 +1402,6 @@ static inline void cl_read_ahead_release(const struct lu_env *env,
 {
 	if (ra->cra_release != NULL)
 		ra->cra_release(env, ra);
-	memset(ra, 0, sizeof(*ra));
 }
 
 
diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h
index de47230..814bb1d 100644
--- a/lustre/llite/llite_internal.h
+++ b/lustre/llite/llite_internal.h
@@ -805,6 +805,11 @@ struct ra_io_arg {
 	 */
 	loff_t ria_length;
 	loff_t ria_bytes;
+	/*
+	 * List of the cl_read_aheads matched during read-ahead, used to
+	 * release the DLM locks acquired during read-ahead.
+	 */
+	struct list_head ria_cl_ra_list;
 };
 
 /* LL_HIST_MAX=32 causes an overflow */
diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c
index 57aa58f..bfd4f72 100644
--- a/lustre/llite/rw.c
+++ b/lustre/llite/rw.c
@@ -425,7 +425,7 @@ ll_read_ahead_pages(const struct lu_env *env, struct cl_io *io,
 		    struct cl_page_list *queue, struct ll_readahead_state *ras,
 		    struct ra_io_arg *ria, pgoff_t *ra_end, pgoff_t skip_index)
 {
-	struct cl_read_ahead ra = { 0 };
+	struct cl_read_ahead *ra = NULL;
 	/* busy page count is per stride */
 	int rc = 0, count = 0, busy_page_count = 0;
 	pgoff_t page_idx;
@@ -439,7 +439,8 @@ ll_read_ahead_pages(const struct lu_env *env, struct cl_io *io,
 		if (skip_index && page_idx == skip_index)
 			continue;
 		if (ras_inside_ra_window(page_idx, ria)) {
-			if (ra.cra_end_idx == 0 || ra.cra_end_idx < page_idx) {
+			if (!ra || ra->cra_end_idx == 0 ||
+			    ra->cra_end_idx < page_idx) {
 				pgoff_t end_idx;
 
 				/*
@@ -449,39 +450,46 @@ ll_read_ahead_pages(const struct lu_env *env, struct cl_io *io,
 				 * Do not extend read lock accross stripe if
 				 * lock contention detected.
 				 */
-				if (ra.cra_contention &&
+				if (ra && ra->cra_contention &&
 				    page_idx > ria->ria_end_idx_min) {
 					ria->ria_end_idx = *ra_end;
 					break;
 				}
 
-				cl_read_ahead_release(env, &ra);
+				OBD_ALLOC_PTR(ra);
+				if (ra == NULL)
+					/* Ignore the error */
+					break;
 
-				rc = cl_io_read_ahead(env, io, page_idx, &ra);
-				if (rc < 0)
+				INIT_LIST_HEAD(&ra->cra_linkage);
+				rc = cl_io_read_ahead(env, io, page_idx, ra);
+				if (rc < 0) {
+					OBD_FREE_PTR(ra);
 					break;
+				}
+				list_add_tail(&ra->cra_linkage,
+					      &ria->ria_cl_ra_list);
 				/*
 				 * Only shrink ria_end_idx if the matched
 				 * LDLM lock doesn't cover more.
 				 */
-				if (page_idx > ra.cra_end_idx) {
-					ria->ria_end_idx = ra.cra_end_idx;
+				if (page_idx > ra->cra_end_idx) {
+					ria->ria_end_idx = ra->cra_end_idx;
 					break;
 				}
 
 				CDEBUG(D_READA, "idx: %lu, ra: %lu, rpc: %lu\n",
-				       page_idx, ra.cra_end_idx,
-				       ra.cra_rpc_pages);
-				LASSERTF(ra.cra_end_idx >= page_idx,
+				       page_idx, ra->cra_end_idx,
+				       ra->cra_rpc_pages);
+				LASSERTF(ra->cra_end_idx >= page_idx,
 					 "object: %px, indcies %lu / %lu\n",
-					 io->ci_obj, ra.cra_end_idx, page_idx);
+					 io->ci_obj, ra->cra_end_idx, page_idx);
 
 				/* update read ahead RPC size.
-				 * NB: it's racy but doesn't matter
-				 */
-				if (ras->ras_rpc_pages != ra.cra_rpc_pages &&
-				    ra.cra_rpc_pages > 0)
-					ras->ras_rpc_pages = ra.cra_rpc_pages;
+				 * NB: it's racy but doesn't matter */
+				if (ras->ras_rpc_pages != ra->cra_rpc_pages &&
+				    ra->cra_rpc_pages > 0)
+					ras->ras_rpc_pages = ra->cra_rpc_pages;
 				if (!skip_index) {
 					/* trim (align with optimal RPC size) */
 					end_idx = ras_align(ras,
@@ -545,8 +553,6 @@ ll_read_ahead_pages(const struct lu_env *env, struct cl_io *io,
 		}
 	}
 
-	cl_read_ahead_release(env, &ra);
-
 	if (count)
 		ll_ra_stats_add(vvp_object_inode(io->ci_obj),
 				RA_STAT_READAHEAD_PAGES, count);
@@ -554,6 +560,18 @@ ll_read_ahead_pages(const struct lu_env *env, struct cl_io *io,
 	return count;
 }
 
+static void ll_readahead_locks_release(const struct lu_env *env,
+				       struct list_head *cl_ra_list)
+{
+	struct cl_read_ahead *ra, *n;
+
+	list_for_each_entry_safe(ra, n, cl_ra_list, cra_linkage) {
+		list_del_init(&ra->cra_linkage);
+		cl_read_ahead_release(env, ra);
+		OBD_FREE_PTR(ra);
+	}
+}
+
 static void ll_readahead_work_free(struct ll_readahead_work *work)
 {
 	fput(work->lrw_file);
@@ -643,6 +661,7 @@ static void ll_readahead_handle_work(struct work_struct *wq)
 
 	ria = &ll_env_info(env)->lti_ria;
 	memset(ria, 0, sizeof(*ria));
+	INIT_LIST_HEAD(&ria->ria_cl_ra_list);
 	ria->ria_start_idx = work->lrw_start_idx;
 	/* Truncate RA window to end of file */
@@ -706,6 +725,8 @@ static void ll_readahead_handle_work(struct work_struct *wq)
 	if (ria->ria_end_idx == ra_end_idx && ra_end_idx == (kms >> PAGE_SHIFT))
 		ll_ra_stats_inc(inode, RA_STAT_EOF);
 
+	ll_readahead_locks_release(env, &ria->ria_cl_ra_list);
+
 	if (ra_end_idx != ria->ria_end_idx)
 		ll_ra_stats_inc(inode, RA_STAT_FAILED_REACH_END);
 
@@ -729,17 +750,15 @@ out_free_work:
 }
 
 static int ll_readahead(const struct lu_env *env, struct cl_io *io,
-			struct cl_page_list *queue,
+			struct cl_page_list *queue, struct ra_io_arg *ria,
 			struct ll_readahead_state *ras, bool hit,
 			struct file *file, pgoff_t skip_index,
 			pgoff_t *start_idx)
 {
 	struct vvp_io *vio = vvp_env_io(env);
-	struct ll_thread_info *lti = ll_env_info(env);
 	unsigned long pages, pages_min = 0;
 	pgoff_t ra_end_idx = 0, end_idx = 0;
 	struct inode *inode;
-	struct ra_io_arg *ria = &lti->lti_ria;
 	struct cl_object *clob;
 	int ret = 0;
 	__u64 kms;
@@ -762,7 +781,6 @@ static int ll_readahead(const struct lu_env *env, struct cl_io *io,
 		RETURN(0);
 	}
 
-	memset(ria, 0, sizeof(*ria));
 	ret = ll_readahead_file_kms(env, io, &kms);
 	if (ret != 0)
 		RETURN(ret);
@@ -1679,7 +1697,7 @@ void ll_cl_remove(struct inode *inode, const struct lu_env *env)
 }
 
 int ll_io_read_page(const struct lu_env *env, struct cl_io *io,
-		    struct cl_page *page, struct file *file)
+		    struct cl_page *page, struct file *file)
 {
 	struct inode *inode = vvp_object_inode(page->cp_obj);
 	struct ll_sb_info *sbi = ll_i2sbi(inode);
@@ -1695,6 +1713,7 @@ int ll_io_read_page(const struct lu_env *env, struct cl_io *io,
 	pgoff_t io_start_index;
 	pgoff_t io_end_index;
 	bool unlockpage = true;
+	struct ra_io_arg *ria = NULL;
 
 	ENTRY;
 
@@ -1747,9 +1766,12 @@ int ll_io_read_page(const struct lu_env *env, struct cl_io *io,
 	if (ll_readahead_enabled(sbi) && ras && !io->ci_rand_read) {
 		pgoff_t skip_index = 0;
 
+		ria = &ll_env_info(env)->lti_ria;
+		memset(ria, 0, sizeof(*ria));
+		INIT_LIST_HEAD(&ria->ria_cl_ra_list);
 		if (ras->ras_next_readahead_idx < cl_page_index(page))
 			skip_index = cl_page_index(page);
-		rc2 = ll_readahead(env, io, &queue->c2_qin, ras,
+		rc2 = ll_readahead(env, io, &queue->c2_qin, ria, ras,
 				   uptodate, file, skip_index, &ra_start_index);
 		/* Keep iotrace clean. Print only on actual page read */
@@ -1775,6 +1797,8 @@ int ll_io_read_page(const struct lu_env *env, struct cl_io *io,
 		task_io_account_read(PAGE_SIZE * count);
 	}
 
+	if (ria)
+		ll_readahead_locks_release(env, &ria->ria_cl_ra_list);
 	if (anchor != NULL && !cl_page_is_owned(page, io)) {
 		/* have sent */
 		rc = cl_sync_io_wait(env, anchor, 0);
-- 
1.8.3.1