LU-11089 obdclass: remove locking from lu_context_exit() 13/32713/8
author NeilBrown <neilb@suse.com>
Wed, 8 May 2019 14:17:24 +0000 (10:17 -0400)
committer Oleg Drokin <green@whamcloud.com>
Tue, 21 May 2019 05:10:53 +0000 (05:10 +0000)
Recent patches suggest that the locking in lu_context_exit() hurts
performance, as the changes they make are aimed at improving
performance.  Let's go all the way and remove the locking completely.

The race of interest is between lu_context_exit() finalizing a
value with ->lct_exit, and lu_context_key_quiesce() freeing
the value with key_fini().

If lu_context_key_quiesce() has started, there is no need to
finalize the value - it can just be freed.  So lu_context_exit()
is changed to skip the call to ->lct_exit if LCT_QUIESCENT is set.

If lu_context_exit() has started, lu_context_key_quiesce() must wait
for it to complete - it cannot just skip the freeing.  To allow
this we introduce a new lc_state, LCS_LEAVING.  This indicates that
->lct_exit might be called.  Before calling key_fini() on a context,
lu_context_key_quiesce() waits (spinning) for lc_state to move on from
LCS_LEAVING.
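
In outline, the handshake between the two paths looks like the
userspace sketch below (a minimal model using C11 atomics; ctx_state
and key_quiescent are illustrative names standing in for ctx->lc_state
and the key's LCT_QUIESCENT bit - this is not the Lustre code itself):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <sched.h>

    enum lc_state { LCS_ENTERED, LCS_LEAVING, LCS_LEFT };

    static _Atomic enum lc_state ctx_state = LCS_ENTERED;
    static atomic_bool key_quiescent;       /* models LCT_QUIESCENT */

    /* Exit side: publish LCS_LEAVING, finalize unless quiesced, then
     * publish LCS_LEFT.  The patch uses smp_store_mb() and
     * smp_store_release() for the two stores. */
    static void context_exit(void)
    {
            atomic_store(&ctx_state, LCS_LEAVING);
            if (!atomic_load(&key_quiescent)) {
                    /* ->lct_exit() would be called here */
            }
            atomic_store(&ctx_state, LCS_LEFT);
    }

    /* Quiesce side: mark the key quiescent, then wait out any exit in
     * progress before freeing the value (key_fini() in the patch). */
    static void key_quiesce(void)
    {
            atomic_store(&key_quiescent, true);
            while (atomic_load(&ctx_state) == LCS_LEAVING)
                    sched_yield();  /* spin_until_cond() in the patch */
            /* the value can be freed safely from here on */
    }

With sequentially consistent stores and loads, at least one side must
observe the other: either the exit path sees key_quiescent and skips
finalization, or the quiesce path sees LCS_LEAVING and waits.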

Linux-commit: ac3f8fd6e61b245fa9c14e3164203c1211c5ef6b

fix possible hang waiting for LCS_LEAVING

As lu_context_key_quiesce() spins waiting for LCS_LEAVING to
change, it is important that we set and then clear it within a
non-preemptible region.  If the thread that spins preempts the
thread that sets-and-clears the state while the state is LCS_LEAVING,
then it can spin indefinitely, particularly on a single-CPU machine.

Also update the comment to explain this dependency.
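
Condensed, the exit side therefore looks like this (a summary of the
patched lu_context_exit() in the diff below, not a verbatim copy):

    preempt_disable();          /* the spinner can never preempt this window */
    smp_store_mb(ctx->lc_state, LCS_LEAVING);
    /* ... run ->lct_exit() for keys that are not LCT_QUIESCENT ... */
    smp_store_release(&ctx->lc_state, LCS_LEFT);
    preempt_enable();           /* state is no longer LCS_LEAVING */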

Linux-commit: 4859716f66db6989ef4bf52434b5b1d813c6adc1

Change-Id: I92ef27304eab43518fcb216b9c9cb4875cc9b98c
Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: James Simmons <uja.ornl@yahoo.com>
Reviewed-on: https://review.whamcloud.com/32713
Tested-by: Jenkins
Reviewed-by: Gu Zheng <gzheng@ddn.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
libcfs/autoconf/lustre-libcfs.m4
libcfs/include/libcfs/linux/Makefile.am
libcfs/include/libcfs/linux/processor.h [new file with mode: 0644]
lustre/include/lu_object.h
lustre/include/lustre_compat.h
lustre/obdclass/lu_object.c

diff --git a/libcfs/autoconf/lustre-libcfs.m4 b/libcfs/autoconf/lustre-libcfs.m4
index 82a4ebc..12efc01 100644
@@ -905,6 +905,17 @@ uuid_t, [
 ]) # LIBCFS_UUID_T
 
 #
+# Kernel version 4.12-rc3 commit fd851a3cdc196bfc1d229b5f22369069af532bf8
+# introduced processor.h
+#
+AC_DEFUN([LIBCFS_HAVE_PROCESSOR_HEADER], [
+LB_CHECK_LINUX_HEADER([linux/processor.h], [
+       AC_DEFINE(HAVE_PROCESSOR_H, 1,
+               [processor.h is present])])
+]) # LIBCFS_HAVE_PROCESSOR_HEADER
+
+
+#
 # LIBCFS_WAIT_QUEUE_ENTRY
 #
 # Kernel version 4.13 ac6424b981bce1c4bc55675c6ce11bfe1bbfa64f
@@ -1071,6 +1082,7 @@ LIBCFS_HOTPLUG_STATE_MACHINE
 LIBCFS_RHASHTABLE_LOOKUP_GET_INSERT_FAST
 LIBCFS_SCHED_HEADERS
 # 4.12
+LIBCFS_HAVE_PROCESSOR_HEADER
 LIBCFS_UUID_T
 # 4.13
 LIBCFS_WAIT_QUEUE_ENTRY
diff --git a/libcfs/include/libcfs/linux/Makefile.am b/libcfs/include/libcfs/linux/Makefile.am
index 0aaef36..82bc7db 100644
@@ -1,2 +1,3 @@
 EXTRA_DIST = linux-misc.h linux-fs.h linux-mem.h linux-time.h linux-cpu.h \
-            linux-list.h linux-hash.h linux-uuid.h linux-crypto.h
+            linux-list.h linux-hash.h linux-uuid.h linux-crypto.h \
+            processor.h
diff --git a/libcfs/include/libcfs/linux/processor.h b/libcfs/include/libcfs/linux/processor.h
new file mode 100644
index 0000000..700d01e
--- /dev/null
@@ -0,0 +1,79 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Misc low level processor primitives */
+#ifndef _LINUX_PROCESSOR_H
+#define _LINUX_PROCESSOR_H
+
+#include <asm/processor.h>
+
+/*
+ * spin_begin is used before beginning a busy-wait loop, and must be paired
+ * with spin_end when the loop is exited. spin_cpu_relax must be called
+ * within the loop.
+ *
+ * The loop body should be as small and fast as possible, on the order of
+ * tens of instructions/cycles as a guide. It should avoid calling
+ * cpu_relax, or any "spin" or sleep type of primitive including nested uses
+ * of these primitives. It should not lock or take any other resource.
+ * Violations of these guidelines will not cause a bug, but may cause
+ * sub-optimal performance.
+ *
+ * These loops are optimized to be used where wait times are expected to be
+ * less than the cost of a context switch (and associated overhead).
+ *
+ * Detection of resource owner and decision to spin or sleep or guest-yield
+ * (e.g., spin lock holder vcpu preempted, or mutex owner not on CPU) can be
+ * tested within the loop body.
+ */
+#ifndef spin_begin
+#ifdef CONFIG_PPC64
+#define spin_begin()   HMT_low()
+#else
+#define spin_begin()
+#endif /* CONFIG_PPC64 */
+#endif /* spin_begin */
+
+#ifndef spin_cpu_relax
+#define spin_cpu_relax() cpu_relax()
+#endif
+
+/*
+ * spin_cpu_yield may be called to yield (undirected) to the hypervisor if
+ * necessary. This should be used if the wait is expected to take longer
+ * than context switch overhead, but we can't sleep or do a directed yield.
+ */
+#ifndef spin_cpu_yield
+#define spin_cpu_yield() cpu_relax_yield()
+#endif
+
+#ifndef spin_end
+#ifdef CONFIG_PPC64
+#define spin_end()     HMT_medium()
+#else
+#define spin_end()
+#endif /* CONFIG_PPC64 */
+#endif /* spin_end */
+
+/*
+ * spin_until_cond can be used to wait for a condition to become true. It
+ * may be expected that the first iteration will be true in the common case
+ * (no spinning), so that callers should not require a first "likely" test
+ * for the uncontended case before using this primitive.
+ *
+ * Usage and implementation guidelines are the same as for the spin_begin
+ * primitives, above.
+ */
+#ifndef spin_until_cond
+#define spin_until_cond(cond)                                  \
+do {                                                           \
+       if (unlikely(!(cond))) {                                \
+               spin_begin();                                   \
+               do {                                            \
+                       spin_cpu_relax();                       \
+               } while (!(cond));                              \
+               spin_end();                                     \
+       }                                                       \
+} while (0)
+
+#endif
+
+#endif /* _LINUX_PROCESSOR_H */
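
For reference, the fast-path-then-spin shape that spin_until_cond()
encodes can be modelled in userspace roughly as follows (illustrative
only; atomic_load stands in for the kernel's READ_ONCE):

    #include <stdatomic.h>

    static atomic_int cond_flag;

    static void wait_for_cond(void)
    {
            if (!atomic_load(&cond_flag)) {   /* fast path: often already true */
                    do {
                            /* spin_cpu_relax() maps to cpu_relax() here */
                    } while (!atomic_load(&cond_flag));
            }
    }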
diff --git a/lustre/include/lu_object.h b/lustre/include/lu_object.h
index 54e0410..0451563 100644
@@ -890,6 +890,7 @@ enum lu_xattr_flags {
 enum lu_context_state {
         LCS_INITIALIZED = 1,
         LCS_ENTERED,
+       LCS_LEAVING,
         LCS_LEFT,
         LCS_FINALIZED
 };
diff --git a/lustre/include/lustre_compat.h b/lustre/include/lustre_compat.h
index f951411..4ea091a 100644
@@ -698,6 +698,10 @@ static inline struct timespec current_time(struct inode *inode)
 #define alloc_workqueue(name, flags, max_active) create_workqueue(name)
 #endif
 
+#ifndef smp_store_mb
+#define smp_store_mb(var, value)       set_mb(var, value)
+#endif
+
 #ifndef READ_ONCE
 #define READ_ONCE ACCESS_ONCE
 #endif
diff --git a/lustre/obdclass/lu_object.c b/lustre/obdclass/lu_object.c
index 63729fe..9f62693 100644
 
 #include <linux/module.h>
 #include <linux/list.h>
+#ifdef HAVE_PROCESSOR_H
+#include <linux/processor.h>
+#else
+#include <libcfs/linux/processor.h>
+#endif
+
 #include <libcfs/libcfs.h>
 #include <libcfs/libcfs_hash.h> /* hash_long() */
 #include <libcfs/linux/linux-mem.h>
@@ -1560,8 +1566,11 @@ void lu_context_key_quiesce(struct lu_context_key *key)
                up_write(&lu_key_initing);
 
                write_lock(&lu_keys_guard);
-               list_for_each_entry(ctx, &lu_context_remembered, lc_remember)
+               list_for_each_entry(ctx, &lu_context_remembered, lc_remember) {
+                       spin_until_cond(READ_ONCE(ctx->lc_state) != LCS_LEAVING);
                        key_fini(ctx, key->lct_index);
+               }
+
                write_unlock(&lu_keys_guard);
        }
 }
@@ -1721,28 +1730,35 @@ void lu_context_exit(struct lu_context *ctx)
 {
        unsigned int i;
 
-        LINVRNT(ctx->lc_state == LCS_ENTERED);
-        ctx->lc_state = LCS_LEFT;
-        if (ctx->lc_tags & LCT_HAS_EXIT && ctx->lc_value != NULL) {
-               /* could race with key quiescency */
-               if (ctx->lc_tags & LCT_REMEMBER)
-                       read_lock(&lu_keys_guard);
-
+       LINVRNT(ctx->lc_state == LCS_ENTERED);
+       /*
+        * Disable preempt to ensure we get a warning if
+        * any lct_exit ever tries to sleep.  That would hurt
+        * lu_context_key_quiesce() which spins waiting for us.
+        * This also ensures we aren't preempted while the state
+        * is LCS_LEAVING, as that too would cause problems for
+        * lu_context_key_quiesce().
+        */
+       preempt_disable();
+       /*
+        * Ensure lu_context_key_quiesce() sees LCS_LEAVING
+        * or we see LCT_QUIESCENT
+        */
+       smp_store_mb(ctx->lc_state, LCS_LEAVING);
+       if (ctx->lc_tags & LCT_HAS_EXIT && ctx->lc_value) {
                 for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) {
-                       if (ctx->lc_value[i] != NULL) {
-                               struct lu_context_key *key;
-
-                               key = lu_keys[i];
-                               LASSERT(key != NULL);
-                               if (key->lct_exit != NULL)
-                                       key->lct_exit(ctx,
-                                                     key, ctx->lc_value[i]);
-                       }
-                }
+                       struct lu_context_key *key;
 
-               if (ctx->lc_tags & LCT_REMEMBER)
-                       read_unlock(&lu_keys_guard);
+                       key = lu_keys[i];
+                       if (ctx->lc_value[i] &&
+                           !(key->lct_tags & LCT_QUIESCENT) &&
+                           key->lct_exit)
+                               key->lct_exit(ctx, key, ctx->lc_value[i]);
+               }
         }
+
+       smp_store_release(&ctx->lc_state, LCS_LEFT);
+       preempt_enable();
 }
 EXPORT_SYMBOL(lu_context_exit);