From: Qian Yingjin Date: Fri, 13 Oct 2023 02:49:34 +0000 (-0400) Subject: LU-17190 osc: client-side high prio I/O under blocking AST X-Git-Tag: 2.16.51~24 X-Git-Url: https://git.whamcloud.com/gitweb?a=commitdiff_plain;h=refs%2Fchanges%2F27%2F56327%2F5;p=fs%2Flustre-release.git LU-17190 osc: client-side high prio I/O under blocking AST We found the following deadlock with parallel DIO: T1: writer Obtain DLM extent lock: L1= T2: parallel DIO reader - read 50M data, iosize=64M, max_pages_per_rpc=1024(4M) max_rpcs_in_flight=8 ll_direct_IO_impl() - use out all available RPC slots: number of read RPC in flight: 9 - OST side ->tgt_brw_read() ->tgt_brw_lock() - Server side locking. Try to cancel the conflict lock L1. T3: reader - Take DLM lock ref on L1=. ->ll_readpage() ->ll_io_read_page() ->cl_io_submit_rw() - wait for RPC slots to send the read RPC to OST ... deadlock: - T2 => T3: T2 is waiting for T3 to release DLM lock L1; - T3 => T2: T3 is waiting for T2 to finish and free RPC slots; To solve this problem, we introduce a client-side high priority I/O handling mechanism where the extent lock protecting the I/O is under blocking AST. It is implemented as follows: When a lock blocking AST is received and the corresponding lock is in use (reader and writer count is not zero), it checks whether there are any I/Os (osc_extent) using this lock that are outstanding (i.e. waiting for an RPC slot). If found, mark these I/Os as high priority and put them into the HP list of the client. Thus the client will be forced to send HP I/Os even if all available RPC slots are used up. This makes the I/O engine on the OSC layer more efficient. For normal I/Os, the client needs to iterate over the object list and send I/O one by one. Moreover, the in-flight I/Os cannot exceed the @max_rpcs_in_flight. High priority I/Os are put into the HP list and will be handled more quickly. This can avoid a possible deadlock caused by parallel DIO and the client can respond to the lock blocking AST more quickly. 
Test-Parameters: testlist=sanity-pcc env=ONLY=99a,ONLY_REPEAT=100 clientdistro=el8.10 Test-Parameters: testlist=sanity-pcc env=ONLY=99a,ONLY_REPEAT=100 clientdistro=el9.3 Test-Parameters: testlist=sanity-pcc env=ONLY=99b,ONLY_REPEAT=100 clientdistro=el8.10 Test-Parameters: testlist=sanity-pcc env=ONLY=99b,ONLY_REPEAT=100 clientdistro=el9.3 Change-Id: I9afe032a79f40d55b800ddb13d8b8e9a3e10ba56 Signed-off-by: Qian Yingjin Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/56327 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Shaun Tancheff Reviewed-by: Andreas Dilger Reviewed-by: Oleg Drokin --- diff --git a/lustre/include/lustre_dlm.h b/lustre/include/lustre_dlm.h index b81803e..32da69e 100644 --- a/lustre/include/lustre_dlm.h +++ b/lustre/include/lustre_dlm.h @@ -287,6 +287,8 @@ typedef int (*ldlm_res_policy)(const struct lu_env *env, typedef int (*ldlm_cancel_cbt)(struct ldlm_lock *lock); +typedef int (*ldlm_hp_handler_t)(struct ldlm_lock *lock); + /** * LVB operations. * LVB is Lock Value Block. This is a special opaque (to LDLM) value that could @@ -547,6 +549,12 @@ struct ldlm_namespace { */ ldlm_cancel_cbt ns_cancel; + /** + * Callback to check whether an object protected by a lock needs to + * be handled with high priority (i.e. in case of lock blocking AST). + */ + ldlm_hp_handler_t ns_hp_handler; + /** LDLM lock stats */ struct lprocfs_stats *ns_stats; @@ -632,6 +640,13 @@ static inline void ns_register_cancel(struct ldlm_namespace *ns, ns->ns_cancel = arg; } +static inline void ns_register_hp_handler(struct ldlm_namespace *ns, + ldlm_hp_handler_t arg) +{ + LASSERT(ns != NULL); + ns->ns_hp_handler = arg; +} + struct ldlm_lock; /** Type for blocking callback function of a lock. */ diff --git a/lustre/include/lustre_osc.h b/lustre/include/lustre_osc.h index d103ea9..bf87865 100644 --- a/lustre/include/lustre_osc.h +++ b/lustre/include/lustre_osc.h @@ -264,6 +264,7 @@ struct osc_object { * Manage write(dirty) extents. 
*/ struct list_head oo_hp_exts; /* list of hp extents */ + struct list_head oo_hp_read_exts;/* list for hp read extents */ struct list_head oo_urgent_exts; /* list of writeback extents */ struct list_head oo_full_exts; struct list_head oo_dio_exts; diff --git a/lustre/ldlm/ldlm_extent.c b/lustre/ldlm/ldlm_extent.c index f44926b..ae061bd 100644 --- a/lustre/ldlm/ldlm_extent.c +++ b/lustre/ldlm/ldlm_extent.c @@ -771,7 +771,7 @@ void ldlm_lock_prolong_one(struct ldlm_lock *lock, * let's refresh eviction timer for it. */ timeout = ldlm_bl_timeout_by_rpc(arg->lpa_req); - LDLM_DEBUG(lock, "refreshed to %ds", timeout); + LDLM_DEBUG(lock, "refreshed to %ds. ", timeout); ldlm_refresh_waiting_lock(lock, timeout); } EXPORT_SYMBOL(ldlm_lock_prolong_one); diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c index 350d3b9..4692a3d 100644 --- a/lustre/ldlm/ldlm_lockd.c +++ b/lustre/ldlm/ldlm_lockd.c @@ -1961,6 +1961,8 @@ void ldlm_handle_bl_callback(struct ldlm_namespace *ns, CDEBUG(D_DLMTRACE, "Lock %p is referenced, will be cancelled later\n", lock); + if (ns->ns_hp_handler != NULL) + ns->ns_hp_handler(lock); } LDLM_DEBUG(lock, "client blocking callback handler END"); @@ -2528,7 +2530,7 @@ static int ldlm_callback_handler(struct ptlrpc_request *req) switch (lustre_msg_get_opc(req->rq_reqmsg)) { case LDLM_BL_CALLBACK: - LDLM_DEBUG(lock, "blocking ast"); + LDLM_DEBUG(lock, "blocking ast "); req_capsule_extend(&req->rq_pill, &RQF_LDLM_BL_CALLBACK); if (!ldlm_is_cancel_on_block(lock)) { rc = ldlm_callback_reply(req, 0); @@ -2540,14 +2542,14 @@ static int ldlm_callback_handler(struct ptlrpc_request *req) ldlm_handle_bl_callback(ns, &dlm_req->lock_desc, lock); break; case LDLM_CP_CALLBACK: - LDLM_DEBUG(lock, "completion ast"); + LDLM_DEBUG(lock, "completion ast "); req_capsule_extend(&req->rq_pill, &RQF_LDLM_CP_CALLBACK); rc = ldlm_handle_cp_callback(req, ns, dlm_req, lock); if (!CFS_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE)) ldlm_callback_reply(req, rc); break; case 
LDLM_GL_CALLBACK: - LDLM_DEBUG(lock, "glimpse ast"); + LDLM_DEBUG(lock, "glimpse ast "); req_capsule_extend(&req->rq_pill, &RQF_LDLM_GL_CALLBACK); ldlm_handle_gl_callback(req, ns, dlm_req, lock); break; diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c index bfd4f72..62fdd11 100644 --- a/lustre/llite/rw.c +++ b/lustre/llite/rw.c @@ -470,10 +470,10 @@ ll_read_ahead_pages(const struct lu_env *env, struct cl_io *io, list_add_tail(&ra->cra_linkage, &ria->ria_cl_ra_list); - /* - * Only shrink ria_end_idx if the matched - * LDLM lock doesn't cover more. - */ + /* + * Only shrink ria_end_idx if the matched + * LDLM lock doesn't cover more. + */ if (page_idx > ra->cra_end_idx) { ria->ria_end_idx = ra->cra_end_idx; break; @@ -722,11 +722,12 @@ static void ll_readahead_handle_work(struct work_struct *wq) if (rc == 0) task_io_account_read(PAGE_SIZE * count); } - if (ria->ria_end_idx == ra_end_idx && ra_end_idx == (kms >> PAGE_SHIFT)) - ll_ra_stats_inc(inode, RA_STAT_EOF); ll_readahead_locks_release(env, &ria->ria_cl_ra_list); + if (ria->ria_end_idx == ra_end_idx && ra_end_idx == (kms >> PAGE_SHIFT)) + ll_ra_stats_inc(inode, RA_STAT_EOF); + if (ra_end_idx != ria->ria_end_idx) ll_ra_stats_inc(inode, RA_STAT_FAILED_REACH_END); diff --git a/lustre/osc/osc_cache.c b/lustre/osc/osc_cache.c index 22cd0136..83951f1 100644 --- a/lustre/osc/osc_cache.c +++ b/lustre/osc/osc_cache.c @@ -596,6 +596,18 @@ void osc_extent_release(const struct lu_env *env, struct osc_extent *ext) if (osc_extent_merge(env, ext, next_extent(ext)) == 0) grant += cli->cl_grant_extent_tax; + if (!ext->oe_rw && ext->oe_dlmlock) { + bool hp; + + lock_res_and_lock(ext->oe_dlmlock); + hp = ldlm_is_cbpending(ext->oe_dlmlock); + unlock_res_and_lock(ext->oe_dlmlock); + + /* HP extent should be written ASAP. 
*/ + if (hp) + ext->oe_hp = 1; + } + if (ext->oe_hp) list_move_tail(&ext->oe_link, &obj->oo_hp_exts); @@ -727,7 +739,8 @@ restart: ext->oe_end >= cur->oe_end), ext, EXTSTR"\n", EXTPARA(cur)); - if (ext->oe_state > OES_CACHE || ext->oe_fsync_wait) { + if (ext->oe_state > OES_CACHE || ext->oe_hp || + ext->oe_fsync_wait) { /* for simplicity, we wait for this extent to * finish before going forward. */ conflict = osc_extent_get(ext); @@ -739,7 +752,8 @@ restart: } /* non-overlapped extent */ - if (ext->oe_state != OES_CACHE || ext->oe_fsync_wait) + if (ext->oe_state != OES_CACHE || ext->oe_hp || + ext->oe_fsync_wait) /* we can't do anything for a non OES_CACHE extent, or * if there is someone waiting for this extent to be * flushed, try next one. */ @@ -1617,6 +1631,15 @@ static int osc_max_rpc_in_flight(struct client_obd *cli, struct osc_object *osc) return rpcs_in_flight(cli) >= cli->cl_max_rpcs_in_flight + hprpc; } +/* Check whether all I/O RPC slots are used out by parallel DIO. */ +static inline bool osc_full_dio_in_flight(struct client_obd *cli) +{ + __u32 rpcs = rpcs_in_flight(cli); + + return rpcs >= cli->cl_max_rpcs_in_flight && + rpcs <= cli->cl_d_in_flight; +} + /* This maintains the lists of pending pages to read/write for a given object * (lop). This is used by osc_check_rpcs->osc_next_obj() and osc_list_maint() * to quickly find objects that are ready to send an RPC. */ @@ -1667,6 +1690,10 @@ static int osc_makes_rpc(struct client_obd *cli, struct osc_object *osc, CDEBUG(D_CACHE, "invalid import forcing RPC\n"); RETURN(1); } + if (!list_empty(&osc->oo_hp_read_exts)) { + CDEBUG(D_CACHE, "high prio read request forcing RPC\n"); + RETURN(1); + } /* all read are urgent. 
*/ if (!list_empty(&osc->oo_reading_exts)) RETURN(1); @@ -1690,9 +1717,10 @@ static void osc_update_pending(struct osc_object *obj, int cmd, int delta) OSC_IO_DEBUG(obj, "update pending cmd %d delta %d.\n", cmd, delta); } -static int osc_makes_hprpc(struct osc_object *obj) +static bool osc_makes_hprpc(struct osc_object *obj) { - return !list_empty(&obj->oo_hp_exts); + return !list_empty(&obj->oo_hp_exts) || + !list_empty(&obj->oo_hp_read_exts); } static void on_list(struct list_head *item, struct list_head *list, @@ -1995,6 +2023,45 @@ __must_hold(osc) RETURN(rc); } +static unsigned int get_read_extents(struct osc_object *obj, + struct list_head *rpclist) +{ + struct client_obd *cli = osc_cli(obj); + struct osc_extent *ext; + struct osc_extent *next; + struct extent_rpc_data data = { + .erd_rpc_list = rpclist, + .erd_page_count = 0, + .erd_max_pages = cli->cl_max_pages_per_rpc, + .erd_max_chunks = UINT_MAX, + .erd_max_extents = UINT_MAX, + }; + + assert_osc_object_is_locked(obj); + while ((ext = list_first_entry_or_null(&obj->oo_hp_read_exts, + struct osc_extent, + oe_link)) != NULL) { + EASSERT(ext->oe_state == OES_LOCK_DONE, ext); + if (!try_to_add_extent_for_io(cli, ext, &data)) + return data.erd_page_count; + osc_extent_state_set(ext, OES_RPC); + EASSERT(ext->oe_nr_pages <= data.erd_max_pages, ext); + } + if (data.erd_page_count == data.erd_max_pages) + return data.erd_page_count; + + list_for_each_entry_safe(ext, next, &obj->oo_reading_exts, oe_link) { + EASSERT(ext->oe_state == OES_LOCK_DONE, ext); + if (!try_to_add_extent_for_io(cli, ext, &data)) + break; + osc_extent_state_set(ext, OES_RPC); + EASSERT(ext->oe_nr_pages <= data.erd_max_pages, ext); + } + + LASSERT(data.erd_page_count <= data.erd_max_pages); + return data.erd_page_count; +} + /** * prepare pages for ASYNC io and put pages in send queue. 
* @@ -2010,30 +2077,16 @@ osc_send_read_rpc(const struct lu_env *env, struct client_obd *cli, struct osc_object *osc) __must_hold(osc) { - struct osc_extent *ext; - struct osc_extent *next; LIST_HEAD(rpclist); - struct extent_rpc_data data = { - .erd_rpc_list = &rpclist, - .erd_page_count = 0, - .erd_max_pages = cli->cl_max_pages_per_rpc, - .erd_max_chunks = UINT_MAX, - .erd_max_extents = UINT_MAX, - }; + unsigned int page_count; int rc = 0; + ENTRY; assert_osc_object_is_locked(osc); - list_for_each_entry_safe(ext, next, &osc->oo_reading_exts, oe_link) { - EASSERT(ext->oe_state == OES_LOCK_DONE, ext); - if (!try_to_add_extent_for_io(cli, ext, &data)) - break; - osc_extent_state_set(ext, OES_RPC); - EASSERT(ext->oe_nr_pages <= data.erd_max_pages, ext); - } - LASSERT(data.erd_page_count <= data.erd_max_pages); + page_count = get_read_extents(osc, &rpclist); - osc_update_pending(osc, OBD_BRW_READ, -data.erd_page_count); + osc_update_pending(osc, OBD_BRW_READ, -page_count); if (!list_empty(&rpclist)) { osc_object_unlock(osc); @@ -2106,7 +2159,8 @@ __must_hold(&cli->cl_loi_list_lock) * starvation and leading to server evicting us for not * writing out pages in a timely manner LU-13131 */ if (osc_max_rpc_in_flight(cli, osc) && - list_empty(&osc->oo_hp_exts)) { + list_empty(&osc->oo_hp_exts) && + list_empty(&osc->oo_hp_read_exts)) { __osc_list_maint(cli, osc); break; } @@ -2525,7 +2579,7 @@ int osc_queue_sync_pages(const struct lu_env *env, struct cl_io *io, bool can_merge = true; pgoff_t start = CL_PAGE_EOF; pgoff_t end = 0; - struct osc_lock *oscl; + ENTRY; list_for_each_entry(oap, list, oap_pending_item) { @@ -2576,10 +2630,6 @@ int osc_queue_sync_pages(const struct lu_env *env, struct cl_io *io, anchor = clpage->cp_sync_io; ext->oe_csd = anchor->csi_dio_aio; } - oscl = oio->oi_write_osclock ? 
: oio->oi_read_osclock; - if (oscl && oscl->ols_dlmlock != NULL) { - ext->oe_dlmlock = ldlm_lock_get(oscl->ols_dlmlock); - } if (ext->oe_dio && !ext->oe_rw) { /* direct io write */ int grants; int ppc; @@ -2642,9 +2692,41 @@ int osc_queue_sync_pages(const struct lu_env *env, struct cl_io *io, } osc_update_pending(obj, OBD_BRW_WRITE, page_count); } else { - list_add_tail(&ext->oe_link, &obj->oo_reading_exts); + bool hp_read = false; + struct ldlm_lock *dlmlock; + struct osc_lock *oscl; + + /* + * The DLM extent lock is under blocking AST, and make + * this I/O with high priority. + */ + + oscl = oio->oi_read_osclock ? : oio->oi_write_osclock; + dlmlock = oscl ? oscl->ols_dlmlock : NULL; + + if (dlmlock == NULL && !ext->oe_srvlock) { + CDEBUG(D_CACHE, + "NOLCK: io %pK "EXTSTR" dio: %d srvlock: %d\n", + io, EXTPARA(ext), ext->oe_dio, ext->oe_srvlock); + } + if (!ext->oe_srvlock && dlmlock != NULL) { + lock_res_and_lock(dlmlock); + hp_read = ldlm_is_cbpending(dlmlock); + unlock_res_and_lock(dlmlock); + if (hp_read) + CDEBUG(D_CACHE, + "HP read: io %pK ext@%pK "EXTSTR"\n", + io, ext, EXTPARA(ext)); + } + + if (hp_read) + list_add_tail(&ext->oe_link, &obj->oo_hp_read_exts); + else + list_add_tail(&ext->oe_link, &obj->oo_reading_exts); osc_update_pending(obj, OBD_BRW_READ, page_count); } + + OSC_EXTENT_DUMP(D_CACHE, ext, "allocate ext: rw=%d\n", ext->oe_rw); osc_object_unlock(obj); osc_io_unplug_async(env, cli, obj); @@ -2900,8 +2982,17 @@ int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj, result += ext->oe_nr_pages; if (!discard) { struct list_head *list = NULL; - if (hp) { - EASSERT(!ext->oe_hp, ext); + + if (ext->oe_hp) { + /* + * The extent is already added into HP + * list. + * Another thread has already written + * back the extent with high priority. 
+ */ + unplug = true; + break; + } else if (hp) { ext->oe_hp = 1; list = &obj->oo_hp_exts; } else if (!ext->oe_urgent && !ext->oe_hp) { @@ -3243,5 +3334,100 @@ out: RETURN(result); } +int osc_ldlm_hp_handle(const struct lu_env *env, struct osc_object *obj, + pgoff_t start, pgoff_t end, bool read_check_only) +{ + struct client_obd *cli = osc_cli(obj); + struct osc_extent *ext; + struct osc_extent *next; + bool no_rpc_slots = false; + bool unplug = false; + + ENTRY; + + spin_lock(&cli->cl_loi_list_lock); + no_rpc_slots = osc_full_dio_in_flight(cli); + spin_unlock(&cli->cl_loi_list_lock); + + /* + * Current we only handle with high priority for the case that + * all I/O RPC slots are used out by parallel DIO and there are + * conflict I/O extents in lock blocking AST. + * TODO: Send all I/Os to OSTs on the object corresponding to + * the lock in blocking AST. With higher priority, it does not + * need to iterate over all OSC objects one by one, the conflict + * I/O can be handled more quickly. Thus the lock taken by this + * I/O can be release quickly. + */ + + CDEBUG(D_CACHE, + "High prio I/O check: start %lu end %lu RPC(%d):r%u/w%u/d%u\n", + start, end, no_rpc_slots, cli->cl_r_in_flight, + cli->cl_w_in_flight, cli->cl_d_in_flight); + osc_object_lock(obj); + /* Check buffered read extents. */ + list_for_each_entry_safe(ext, next, &obj->oo_reading_exts, oe_link) { + EASSERT(ext->oe_state == OES_LOCK_DONE, ext); + if (ext->oe_end < start || ext->oe_start > end) + continue; + if (ext->oe_dio || ext->oe_srvlock) + continue; + + list_move_tail(&ext->oe_link, &obj->oo_hp_read_exts); + OSC_EXTENT_DUMP(D_CACHE, ext, "HP read this extent\n"); + unplug = true; + } + + if (read_check_only) + GOTO(out_unlock, unplug); + + /* Check buffered write extents. 
*/ + ext = osc_extent_search(obj, start); + if (ext == NULL) + ext = first_extent(obj); + else if (ext->oe_end < start) + ext = next_extent(ext); + while (ext != NULL) { + if (ext->oe_start > end) + break; + + ext->oe_fsync_wait = 1; + switch (ext->oe_state) { + case OES_CACHE: + /* + * The extent in HP (oe_hp) is being written back by + * another thread. + */ + if (ext->oe_hp || ext->oe_dio || ext->oe_srvlock) + break; + + ext->oe_hp = 1; + list_move_tail(&ext->oe_link, &obj->oo_hp_exts); + OSC_EXTENT_DUMP(D_CACHE, ext, "HP write this extent\n"); + unplug = true; + break; + case OES_ACTIVE: + /* + * It is pretty bad to wait for ACTIVE extents, because + * we do not know how long we will wait for it to be + * flushed since it may be blocked at awaiting more + * grants. We do this for the correctness of fsync. + */ + ext->oe_urgent = 1; + break; + default: + break; + } + ext = next_extent(ext); + } + +out_unlock: + osc_object_unlock(obj); + + if (unplug) + osc_io_unplug(env, cli, obj); + + RETURN(0); +} /** @} osc */ diff --git a/lustre/osc/osc_internal.h b/lustre/osc/osc_internal.h index 445de73..a576bbd 100644 --- a/lustre/osc/osc_internal.h +++ b/lustre/osc/osc_internal.h @@ -32,6 +32,8 @@ int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext, void osc_extent_release(const struct lu_env *env, struct osc_extent *ext); int osc_lock_discard_pages(const struct lu_env *env, struct osc_object *osc, pgoff_t start, pgoff_t end, bool discard); +int osc_ldlm_hp_handle(const struct lu_env *env, struct osc_object *obj, + pgoff_t start, pgoff_t end, bool need_rpc_check); void osc_lock_lvb_update(const struct lu_env *env, struct osc_object *osc, diff --git a/lustre/osc/osc_io.c b/lustre/osc/osc_io.c index 3382fe9..c6015e9 100644 --- a/lustre/osc/osc_io.c +++ b/lustre/osc/osc_io.c @@ -72,6 +72,7 @@ static int osc_io_read_ahead(const struct lu_env *env, LASSERT(dlmlock->l_ast_data == osc); if (dlmlock->l_req_mode != LCK_PR) { struct lustre_handle lockh; + 
ldlm_lock2handle(dlmlock, &lockh); ldlm_lock_addref(&lockh, LCK_PR); ldlm_lock_decref(&lockh, dlmlock->l_req_mode); diff --git a/lustre/osc/osc_lock.c b/lustre/osc/osc_lock.c index 4af3d99..5a0480e 100644 --- a/lustre/osc/osc_lock.c +++ b/lustre/osc/osc_lock.c @@ -340,6 +340,12 @@ static int osc_lock_flush(struct osc_object *obj, pgoff_t start, pgoff_t end, if (IS_ERR(env)) RETURN(PTR_ERR(env)); + /* For blocking AST, it only needs to check conflict read extents. */ + rc = osc_ldlm_hp_handle(env, obj, start, end, true); + if (rc < 0) + CERROR("%s: HP read check failed: rc = %d\n", + cli_name(osc_cli(obj)), rc); + if (mode == CLM_WRITE) { rc = osc_cache_writeback_range(env, obj, start, end, 1, discard); diff --git a/lustre/osc/osc_object.c b/lustre/osc/osc_object.c index a6b63f6..9e3afb7 100644 --- a/lustre/osc/osc_object.c +++ b/lustre/osc/osc_object.c @@ -58,6 +58,7 @@ int osc_object_init(const struct lu_env *env, struct lu_object *obj, osc->oo_root.rb_node = NULL; INIT_LIST_HEAD(&osc->oo_hp_exts); + INIT_LIST_HEAD(&osc->oo_hp_read_exts); INIT_LIST_HEAD(&osc->oo_urgent_exts); INIT_LIST_HEAD(&osc->oo_full_exts); INIT_LIST_HEAD(&osc->oo_reading_exts); @@ -90,6 +91,7 @@ void osc_object_free(const struct lu_env *env, struct lu_object *obj) LASSERT(osc->oo_root.rb_node == NULL); LASSERT(list_empty(&osc->oo_hp_exts)); + LASSERT(list_empty(&osc->oo_hp_read_exts)); LASSERT(list_empty(&osc->oo_urgent_exts)); LASSERT(list_empty(&osc->oo_full_exts)); LASSERT(list_empty(&osc->oo_reading_exts)); diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index 6e92742..5c58fe9 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -3843,6 +3843,48 @@ static int osc_cancel_weight(struct ldlm_lock *lock) RETURN(0); } +static int osc_hp_handler(struct ldlm_lock *lock) +{ + struct cl_object *clob = NULL; + struct lu_env *env; + __u16 refcheck; + int rc = 0; + + ENTRY; + + if (lock->l_resource->lr_type != LDLM_EXTENT) + RETURN(0); + + env = 
cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + lock_res_and_lock(lock); + if (!ldlm_is_granted(lock)) { + unlock_res_and_lock(lock); + GOTO(out, rc = 0); + } + + if (lock->l_ast_data != NULL) { + clob = osc2cl(lock->l_ast_data); + cl_object_get(clob); + } + unlock_res_and_lock(lock); + + if (clob != NULL) { + struct ldlm_extent *extent = &lock->l_policy_data.l_extent; + + /* HP handling for extents covered by the DLM lock. */ + rc = osc_ldlm_hp_handle(env, cl2osc(clob), + extent->start >> PAGE_SHIFT, + extent->end >> PAGE_SHIFT, false); + cl_object_put(env, clob); + } +out: + cl_env_put(env, &refcheck); + RETURN(rc); +} + static int brw_queue_work(const struct lu_env *env, void *data) { struct client_obd *cli = data; @@ -3940,6 +3982,7 @@ int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg) } ns_register_cancel(obd->obd_namespace, osc_cancel_weight); + ns_register_hp_handler(obd->obd_namespace, osc_hp_handler); spin_lock(&osc_shrink_lock); list_add_tail(&cli->cl_shrink_list, &osc_shrink_list); diff --git a/lustre/tests/sanity-pcc.sh b/lustre/tests/sanity-pcc.sh index 1be67bb..11de54e 100755 --- a/lustre/tests/sanity-pcc.sh +++ b/lustre/tests/sanity-pcc.sh @@ -4120,6 +4120,7 @@ test_99() { local mntpt="/mnt/pcc.$tdir" local hsm_root="$mntpt/$tdir" local file=$DIR/$tfile + local cnt=50 $LCTL get_param -n mdc.*.connect_flags | grep -q pcc_ro || skip "Server does not support PCC-RO" @@ -4130,7 +4131,7 @@ test_99() { "projid={0}\ roid=$HSM_ARCHIVE_NUMBER\ pccro=1" do_facet $SINGLEAGT $LCTL pcc list $MOUNT - do_facet $SINGLEAGT dd if=/dev/zero of=$file bs=1M count=50 || + do_facet $SINGLEAGT dd if=/dev/zero of=$file bs=1M count=$cnt || error "Write $file failed" local rpid @@ -4140,27 +4141,33 @@ test_99() { local dpid local lpcc_path + local lckf=$DIR/$tfile.lck + + rm -f $lckf lpcc_path=$(lpcc_fid2path $hsm_root $file) ( - while [ ! 
-e $DIR/sanity-pcc.99.lck ]; do - do_facet $SINGLEAGT dd if=/dev/zero of=$file bs=1M count=50 conv=notrunc || + while [ ! -e $lckf ]; do + do_facet $SINGLEAGT dd if=/dev/zero of=$file bs=1M count=$cnt conv=notrunc || { + touch $lckf error "failed to write $file" + } sleep 0.$((RANDOM % 4 + 1)) done )& wpid=$! ( - while [ ! -e $DIR/sanity-pcc.99.lck ]; do - do_facet $SINGLEAGT dd if=$file of=/dev/null bs=1M count=50 || - error "failed to write $file" + while [ ! -e $lckf ]; do + echo "Read $file ..." + do_facet $SINGLEAGT dd if=$file of=/dev/null bs=1M count=$cnt || + error "failed to read $file" sleep 0.$((RANDOM % 4 + 1)) done )& rpid=$! ( - while [ ! -e $DIR/sanity-pcc.99.lck ]; do + while [ ! -e $lckf ]; do do_facet $SINGLEAGT $MMAP_CAT $file > /dev/null || error "failed to mmap_cat $file" sleep 0.$((RANDOM % 4 + 1)) @@ -4169,7 +4176,7 @@ test_99() { rpid2=$! ( - while [ ! -e $DIR/sanity-pcc.99.lck ]; do + while [ ! -e $lckf ]; do echo "Unlink $lpcc_path" do_facet $SINGLEAGT unlink $lpcc_path sleep 1 @@ -4179,7 +4186,7 @@ test_99() { upid=$! ( - while [ ! -e $DIR/sanity-pcc.99.lck ]; do + while [ ! -e $lckf ]; do echo "Detach $file ..." do_facet $SINGLEAGT $LFS pcc detach $file sleep 0.$((RANDOM % 8 + 1)) @@ -4188,19 +4195,114 @@ test_99() { dpid=$! 
sleep 60 - stack_trap "rm -f $DIR/sanity-pcc.99.lck" - touch $DIR/sanity-pcc.99.lck + echo "==== DONE ====" + stack_trap "rm -f $lckf" + touch $lckf wait $wpid || error "$?: write failed" wait $rpid || error "$?: read failed" - wait $rpid2 || error "$?: read2 failed" + wait $rpid2 || error "$?: mmap read2 failed" wait $upid || error "$?: unlink failed" wait $dpid || error "$?: detach failed" + echo "==== DONE WIAT ====" + lctl get_param osc.*.rpc_stats do_facet $SINGLEAGT $LFS pcc detach $file - rm -f $DIR/sanity-pcc.99.lck + rm -f $lckf } run_test 99 "race among unlink | mmap read | write | detach for PCC-RO file" +test_99b() { + local loopfile="$TMP/$tfile" + local mntpt="/mnt/pcc.$tdir" + local hsm_root="$mntpt/$tdir" + local file=$DIR/$tfile + local cnt=50 + + setup_loopdev $SINGLEAGT $loopfile $mntpt 200 + do_facet $SINGLEAGT mkdir $hsm_root || error "mkdir $hsm_root failed" + setup_pcc_mapping $SINGLEAGT \ + "projid={0}\ roid=$HSM_ARCHIVE_NUMBER\ pccro=1" + do_facet $SINGLEAGT $LCTL pcc list $MOUNT + + do_facet $SINGLEAGT dd if=/dev/zero of=$file bs=1M count=$cnt || + error "Write $file failed" + + local rpid + local rpid2 + local wpid + local upid + local dpid + local lpcc_path + + local lckf=$DIR/$tfile.lck + + rm -f $lckf + lpcc_path=$(lpcc_fid2path $hsm_root $file) + ( + while [ ! -e $lckf ]; do + do_facet $SINGLEAGT dd if=/dev/zero of=$file bs=1M count=$cnt conv=notrunc || { + touch $lckf + error "failed to write $file" + } + sleep 0.$((RANDOM % 4 + 1)) + done + )& + wpid=$! + + ( + while [ ! -e $lckf ]; do + echo "Read $file ..." + do_facet $SINGLEAGT dd if=$file of=/dev/null bs=1M count=$cnt || + error "failed to read $file (1)" + sleep 0.$((RANDOM % 4 + 1)) + done + )& + rpid=$! + + ( + while [ ! -e $lckf ]; do + do_facet $SINGLEAGT dd if=$file of=/dev/null bs=1M count=$cnt || + error "failed to read $file (2)" + sleep 0.$((RANDOM % 4 + 1)) + done + )& + rpid2=$! + + ( + while [ ! 
-e $lckf ]; do + echo "Unlink $lpcc_path" + do_facet $SINGLEAGT unlink $lpcc_path + sleep 1 + done + true + )& + upid=$! + + ( + while [ ! -e $lckf ]; do + echo "Detach $file ..." + do_facet $SINGLEAGT $LFS pcc detach $file + sleep 0.$((RANDOM % 8 + 1)) + done + )& + dpid=$! + + sleep 60 + echo "==== DONE ====" + touch $lckf + wait $wpid || error "$?: write failed" + wait $rpid || error "$?: read failed" + wait $rpid2 || error "$?: read2 failed" + wait $upid || error "$?: unlink failed" + wait $dpid || error "$?: detach failed" + + echo "==== DONE WIAT ====" + lctl get_param osc.*.rpc_stats + do_facet $SINGLEAGT $LFS pcc detach $file + rm -f $lckf +} +run_test 99b "race among unlink | two readers | write | detach for PCC-RO file" + test_100() { local loopfile="$TMP/$tfile" local mntpt="/mnt/pcc.$tdir"