From b2ede01d1ed77ddc512c013220f6ea8b509e9541 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 14 Dec 2018 14:48:45 +1100 Subject: [PATCH] LU-9679 osc: convert cl_cache_waiters to a wait_queue. cli->cl_cache_waiters is a list of tasks that need to be woken when grant-space becomes available. This means it is acting much like a wait queue. So let's change it to really be a wait queue. The current implementation adds new waiters to the end of the list, and calls osc_enter_cache_try() on each in order. We can provide the same behaviour by using an exclusive wait, and having each waiter wake the next task when it succeeds. If a waiter notices that success has become impossible, it wakes all other waiters. If a waiter times out, it doesn't wake others - just leaves them to time out themselves. Note that the old code handled -EINTR from the wait function. That is not a possible return value when wait_event_idle* is used, so that case is discarded. As we need wait_event_idle_exclusive_timeout_cmd(), we should fix the bug in that macro - the "might_sleep()" is wrong, as a spinlock might be held at that point. 
Linux-Commit: 31f45f56ecdf ("lustre: osc_cache: convert cl_cache_waiters to a wait_queue.") Signed-off-by: Mr NeilBrown Change-Id: Ib7622ea2daea8f6e59bef95d3b6c5a80d209b81e Reviewed-on: https://review.whamcloud.com/37605 Reviewed-by: Andreas Dilger Tested-by: jenkins Tested-by: Maloo Reviewed-by: James Simmons Reviewed-by: Oleg Drokin --- libcfs/include/libcfs/linux/linux-wait.h | 2 - lustre/include/lustre_osc.h | 13 +-- lustre/include/obd.h | 2 +- lustre/ldlm/ldlm_lib.c | 2 +- lustre/osc/osc_cache.c | 159 ++++++++++--------------------- lustre/osc/osc_internal.h | 1 - lustre/osc/osc_page.c | 2 +- 7 files changed, 58 insertions(+), 123 deletions(-) diff --git a/libcfs/include/libcfs/linux/linux-wait.h b/libcfs/include/libcfs/linux/linux-wait.h index 8fac3db..fd154ba 100644 --- a/libcfs/include/libcfs/linux/linux-wait.h +++ b/libcfs/include/libcfs/linux/linux-wait.h @@ -281,7 +281,6 @@ do { \ cmd1, cmd2) \ ({ \ long __ret = timeout; \ - might_sleep(); \ if (!___wait_cond_timeout1(condition)) \ __ret = __wait_event_idle_exclusive_timeout_cmd( \ wq_head, condition, timeout, cmd1, cmd2); \ @@ -400,7 +399,6 @@ do { \ cmd1, cmd2) \ ({ \ long __ret = timeout; \ - might_sleep(); \ if (!___wait_cond_timeout1(condition)) \ __ret = __wait_event_idle_exclusive_timeout_cmd( \ wq_head, condition, timeout, cmd1, cmd2); \ diff --git a/lustre/include/lustre_osc.h b/lustre/include/lustre_osc.h index 4c243ed..9a0fd6a 100644 --- a/lustre/include/lustre_osc.h +++ b/lustre/include/lustre_osc.h @@ -99,14 +99,6 @@ static inline struct osc_async_page *brw_page2oap(struct brw_page *pga) return container_of(pga, struct osc_async_page, oap_brw_page); } -struct osc_cache_waiter { - struct list_head ocw_entry; - wait_queue_head_t ocw_waitq; - struct osc_async_page *ocw_oap; - int ocw_grant; - int ocw_rc; -}; - struct osc_device { struct cl_device od_cl; struct obd_export *od_exp; @@ -598,7 +590,10 @@ int osc_cache_wait_range(const struct lu_env *env, struct osc_object *obj, pgoff_t start, 
pgoff_t end); int osc_io_unplug0(const struct lu_env *env, struct client_obd *cli, struct osc_object *osc, int async); -void osc_wake_cache_waiters(struct client_obd *cli); +static inline void osc_wake_cache_waiters(struct client_obd *cli) +{ + wake_up(&cli->cl_cache_waiters); +} static inline int osc_io_unplug_async(const struct lu_env *env, struct client_obd *cli, diff --git a/lustre/include/obd.h b/lustre/include/obd.h index 6a5f2dc..9cbc8f7 100644 --- a/lustre/include/obd.h +++ b/lustre/include/obd.h @@ -221,7 +221,7 @@ struct client_obd { * grant before trying to dirty a page and unreserve the rest. * See osc_{reserve|unreserve}_grant for details. */ long cl_reserved_grant; - struct list_head cl_cache_waiters; /* waiting for cache/grant */ + wait_queue_head_t cl_cache_waiters; /* waiting for cache/grant */ time64_t cl_next_shrink_grant; /* seconds */ struct list_head cl_grant_chain; time64_t cl_grant_shrink_interval; /* seconds */ diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index 8a0ea40..6d14580 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -378,7 +378,7 @@ int client_obd_setup(struct obd_device *obd, struct lustre_cfg *lcfg) * ptlrpc_connect_interpret(). */ client_adjust_max_dirty(cli); - INIT_LIST_HEAD(&cli->cl_cache_waiters); + init_waitqueue_head(&cli->cl_cache_waiters); INIT_LIST_HEAD(&cli->cl_loi_ready_list); INIT_LIST_HEAD(&cli->cl_loi_hp_ready_list); INIT_LIST_HEAD(&cli->cl_loi_write_list); diff --git a/lustre/osc/osc_cache.c b/lustre/osc/osc_cache.c index 56fd9e4..1f343e2 100644 --- a/lustre/osc/osc_cache.c +++ b/lustre/osc/osc_cache.c @@ -1533,15 +1533,26 @@ out: return rc; } -static int ocw_granted(struct client_obd *cli, struct osc_cache_waiter *ocw) +/* Following two inlines exist to pass code fragments + * to wait_event_idle_exclusive_timeout_cmd(). Passing + * code fragments as macro args can look confusing, so + * we provide inlines to encapsulate them. 
+ */ +static inline void cli_unlock_and_unplug(const struct lu_env *env, + struct client_obd *cli, + struct osc_async_page *oap) { - int rc; - spin_lock(&cli->cl_loi_list_lock); - rc = list_empty(&ocw->ocw_entry); spin_unlock(&cli->cl_loi_list_lock); - return rc; + osc_io_unplug_async(env, cli, NULL); + CDEBUG(D_CACHE, + "%s: sleeping for cache space for %p\n", + cli_name(cli), oap); } +static inline void cli_lock_after_unplug(struct client_obd *cli) +{ + spin_lock(&cli->cl_loi_list_lock); +} /** * The main entry to reserve dirty page accounting. Usually the grant reserved * in this function will be freed in bulk in osc_free_grant() unless it fails @@ -1554,8 +1565,11 @@ static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli, { struct osc_object *osc = oap->oap_obj; struct lov_oinfo *loi = osc->oo_oinfo; - struct osc_cache_waiter ocw; int rc = -EDQUOT; + unsigned long timeout = cfs_time_seconds(AT_OFF ? obd_timeout : at_max); + int remain; + bool entered = false; + ENTRY; OSC_DUMP_GRANT(D_CACHE, cli, "need:%d\n", bytes); @@ -1571,83 +1585,40 @@ static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli, GOTO(out, rc = -EDQUOT); } - /* Hopefully normal case - cache space and write credits available */ - if (list_empty(&cli->cl_cache_waiters) && - osc_enter_cache_try(cli, oap, bytes)) { - OSC_DUMP_GRANT(D_CACHE, cli, "granted from cache\n"); - GOTO(out, rc = 0); - } - - /* We can get here for two reasons: too many dirty pages in cache, or + /* + * We can wait here for two reasons: too many dirty pages in cache, or * run out of grants. In both cases we should write dirty pages out. * Adding a cache waiter will trigger urgent write-out no matter what * RPC size will be. - * The exiting condition is no avail grants and no dirty pages caching, - * that really means there is no space on the OST. 
*/ - init_waitqueue_head(&ocw.ocw_waitq); - ocw.ocw_oap = oap; - ocw.ocw_grant = bytes; - while (cli->cl_dirty_pages > 0 || cli->cl_w_in_flight > 0) { - list_add_tail(&ocw.ocw_entry, &cli->cl_cache_waiters); - ocw.ocw_rc = 0; - spin_unlock(&cli->cl_loi_list_lock); - - osc_io_unplug_async(env, cli, NULL); - - CDEBUG(D_CACHE, "%s: sleeping for cache space @ %p for %p\n", - cli_name(cli), &ocw, oap); - - rc = wait_event_idle_timeout(ocw.ocw_waitq, - ocw_granted(cli, &ocw), - cfs_time_seconds(AT_OFF ? - obd_timeout : - at_max)); - - spin_lock(&cli->cl_loi_list_lock); - - if (rc <= 0) { - /* wait_event_idle_timeout timed out */ - list_del_init(&ocw.ocw_entry); - if (rc == 0) - rc = -ETIMEDOUT; - break; - } - LASSERT(list_empty(&ocw.ocw_entry)); - rc = ocw.ocw_rc; - - if (rc != -EDQUOT) - break; - if (osc_enter_cache_try(cli, oap, bytes)) { - rc = 0; - break; - } - } - - switch (rc) { - case 0: - OSC_DUMP_GRANT(D_CACHE, cli, "finally got grant space\n"); - break; - case -ETIMEDOUT: + * The exiting condition (other than success) is no avail grants + * and no dirty pages caching, that really means there is no space + * on the OST. 
+ */ + remain = wait_event_idle_exclusive_timeout_cmd( + cli->cl_cache_waiters, + (entered = osc_enter_cache_try(cli, oap, bytes)) || + (cli->cl_dirty_pages == 0 && cli->cl_w_in_flight == 0), + timeout, + cli_unlock_and_unplug(env, cli, oap), + cli_lock_after_unplug(cli)); + + if (entered) { + if (remain == timeout) + OSC_DUMP_GRANT(D_CACHE, cli, "granted from cache\n"); + else + OSC_DUMP_GRANT(D_CACHE, cli, + "finally got grant space\n"); + wake_up(&cli->cl_cache_waiters); + rc = 0; + } else if (remain == 0) { OSC_DUMP_GRANT(D_CACHE, cli, "timeout, fall back to sync i/o\n"); osc_extent_tree_dump(D_CACHE, osc); /* fall back to synchronous I/O */ - rc = -EDQUOT; - break; - case -EINTR: - /* Ensures restartability - LU-3581 */ - OSC_DUMP_GRANT(D_CACHE, cli, "interrupted\n"); - rc = -ERESTARTSYS; - break; - case -EDQUOT: + } else { OSC_DUMP_GRANT(D_CACHE, cli, "no grant space, fall back to sync i/o\n"); - break; - default: - CDEBUG(D_CACHE, "%s: event for cache space @ %p never arrived " - "due to %d, fall back to sync i/o\n", - cli_name(cli), &ocw, rc); - break; + wake_up_all(&cli->cl_cache_waiters); } EXIT; out: @@ -1655,36 +1626,6 @@ out: RETURN(rc); } -/* caller must hold loi_list_lock */ -void osc_wake_cache_waiters(struct client_obd *cli) -{ - struct list_head *l, *tmp; - struct osc_cache_waiter *ocw; - - ENTRY; - list_for_each_safe(l, tmp, &cli->cl_cache_waiters) { - ocw = list_entry(l, struct osc_cache_waiter, ocw_entry); - - ocw->ocw_rc = -EDQUOT; - - if (osc_enter_cache_try(cli, ocw->ocw_oap, ocw->ocw_grant)) - ocw->ocw_rc = 0; - - if (ocw->ocw_rc == 0 || - !(cli->cl_dirty_pages > 0 || cli->cl_w_in_flight > 0)) { - list_del_init(&ocw->ocw_entry); - CDEBUG(D_CACHE, "wake up %p for oap %p, avail grant " - "%ld, %d\n", ocw, ocw->ocw_oap, - cli->cl_avail_grant, ocw->ocw_rc); - - wake_up(&ocw->ocw_waitq); - } - } - - EXIT; -} -EXPORT_SYMBOL(osc_wake_cache_waiters); - static int osc_max_rpc_in_flight(struct client_obd *cli, struct osc_object *osc) { int hprpc = 
!!list_empty(&osc->oo_hp_exts); @@ -1724,8 +1665,9 @@ static int osc_makes_rpc(struct client_obd *cli, struct osc_object *osc, } /* trigger a write rpc stream as long as there are dirtiers * waiting for space. as they're waiting, they're not going to - * create more pages to coalesce with what's waiting.. */ - if (!list_empty(&cli->cl_cache_waiters)) { + * create more pages to coalesce with what's waiting.. + */ + if (waitqueue_active(&cli->cl_cache_waiters)) { CDEBUG(D_CACHE, "cache waiters forcing RPC\n"); RETURN(1); } @@ -2177,8 +2119,9 @@ static struct osc_object *osc_next_obj(struct client_obd *cli) /* then if we have cache waiters, return all objects with queued * writes. This is especially important when many small files * have filled up the cache and not been fired into rpcs because - * they don't pass the nr_pending/object threshhold */ - if (!list_empty(&cli->cl_cache_waiters) && + * they don't pass the nr_pending/object threshhold + */ + if (waitqueue_active(&cli->cl_cache_waiters) && !list_empty(&cli->cl_loi_write_list)) RETURN(list_to_obj(&cli->cl_loi_write_list, write_item)); diff --git a/lustre/osc/osc_internal.h b/lustre/osc/osc_internal.h index b2bc407..5a65dcd 100644 --- a/lustre/osc/osc_internal.h +++ b/lustre/osc/osc_internal.h @@ -42,7 +42,6 @@ extern atomic_t osc_pool_req_count; extern unsigned int osc_reqpool_maxreqcount; extern struct ptlrpc_request_pool *osc_rq_pool; -void osc_wake_cache_waiters(struct client_obd *cli); int osc_shrink_grant_to_target(struct client_obd *cli, __u64 target_bytes); void osc_schedule_grant_work(void); void osc_update_next_shrink(struct client_obd *cli); diff --git a/lustre/osc/osc_page.c b/lustre/osc/osc_page.c index d0fd5e2..8f3258f 100644 --- a/lustre/osc/osc_page.c +++ b/lustre/osc/osc_page.c @@ -150,7 +150,7 @@ static int osc_page_print(const struct lu_env *env, cli->cl_r_in_flight, cli->cl_w_in_flight, cli->cl_max_rpcs_in_flight, cli->cl_avail_grant, - list_empty_marker(&cli->cl_cache_waiters), + 
waitqueue_active(&cli->cl_cache_waiters) ? '+' : '-', list_empty_marker(&cli->cl_loi_ready_list), list_empty_marker(&cli->cl_loi_hp_ready_list), list_empty_marker(&cli->cl_loi_write_list), -- 1.8.3.1