LU-8760 lib: avoid unexpected out of order execution
[fs/lustre-release.git] lustre/include/lustre_lib.h
index 3c6b5ae..325fc9b 100644
  *
  * You should have received a copy of the GNU General Public License
  * version 2 along with this program; If not, see
- * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
- *
- * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
- * CA 95054 USA or visit www.sun.com if you need additional information or
- * have any questions.
+ * http://www.gnu.org/licenses/gpl-2.0.html
  *
  * GPL HEADER END
  */
  * @{
  */
 
-#include <linux/signal.h>
 #include <libcfs/libcfs.h>
 #include <lustre/lustre_idl.h>
 #include <lustre_ver.h>
-#include <lustre_cfg.h>
+#include <uapi/linux/lustre_cfg.h>
 
 /* target.c */
 struct ptlrpc_request;
@@ -60,9 +55,16 @@ struct l_wait_info;
 #include <lustre_ha.h>
 #include <lustre_net.h>
 
+#define LI_POISON 0x5a5a5a5a
+#if BITS_PER_LONG > 32
+# define LL_POISON 0x5a5a5a5a5a5a5a5aL
+#else
+# define LL_POISON 0x5a5a5a5aL
+#endif
+#define LP_POISON ((void *)LL_POISON)
+
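These 0x5a poison patterns are meant to overwrite freed or torn-down fields so
that a stale access fails loudly. A minimal sketch of the idiom, assuming the
LP_POISON definition above (the struct and functions here are hypothetical
illustrations, not part of this patch):

    struct foo {
            void *fo_handle;
    };

    static void foo_fini(struct foo *foo)
    {
            /* 0x5a5a... is never a valid kernel address, so a later
             * dereference of fo_handle oopses immediately instead of
             * silently reading reused memory. */
            foo->fo_handle = LP_POISON;
    }

    static bool foo_is_poisoned(struct foo *foo)
    {
            return foo->fo_handle == LP_POISON;
    }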
 #ifdef HAVE_SERVER_SUPPORT
-void target_client_add_cb(struct obd_device *obd, __u64 transno, void *cb_data,
-                          int error);
+int rev_import_init(struct obd_export *exp);
 int target_handle_connect(struct ptlrpc_request *req);
 int target_handle_disconnect(struct ptlrpc_request *req);
 void target_destroy_export(struct obd_export *exp);
@@ -212,6 +214,40 @@ struct l_wait_info {
         sigmask(SIGQUIT) | sigmask(SIGALRM))
 
 /*
+ * Wait Queue
+ */
+#ifndef HAVE___ADD_WAIT_QUEUE_EXCLUSIVE
+static inline void __add_wait_queue_exclusive(wait_queue_head_t *q,
+                                             wait_queue_t *wait)
+{
+       wait->flags |= WQ_FLAG_EXCLUSIVE;
+       __add_wait_queue(q, wait);
+}
+#endif /* HAVE___ADD_WAIT_QUEUE_EXCLUSIVE */
+
+/**
+ * The wait_queue_t of Linux (version < 2.6.34) is a FIFO list for
+ * exclusively waiting threads, which is not always desirable: the threads
+ * are woken in turn, again and again, even when only a few of them need
+ * to be active most of the time. That hurts performance, because the
+ * caches get polluted by many different threads.
+ *
+ * A LIFO list resolves this problem, because by default it always wakes
+ * up the most recently active thread.
+ *
+ * NB: do not mix non-exclusive and exclusive waits on the same waitq
+ * when add_wait_queue_exclusive_head() is used.
+ */
+#define add_wait_queue_exclusive_head(waitq, link)             \
+{                                                              \
+       unsigned long flags;                                    \
+                                                               \
+       spin_lock_irqsave(&((waitq)->lock), flags);             \
+       __add_wait_queue_exclusive(waitq, link);                \
+       spin_unlock_irqrestore(&((waitq)->lock), flags);        \
+}
+
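A rough usage sketch of the LIFO exclusive wait, assuming a caller-owned
condition flag (svc_thread_wait() and has_work are hypothetical, not part of
this patch): each idle thread parks itself at the head of the queue, so
wake_up() picks the most recently idled, cache-warm thread first.

    static void svc_thread_wait(wait_queue_head_t *waitq, int *has_work)
    {
            wait_queue_t wait;

            init_waitqueue_entry(&wait, current);
            /* LIFO: the newest waiter is the first one wake_up() finds. */
            add_wait_queue_exclusive_head(waitq, &wait);
            for (;;) {
                    set_current_state(TASK_INTERRUPTIBLE);
                    if (*has_work)
                            break;
                    schedule();
            }
            set_current_state(TASK_RUNNING);
            remove_wait_queue(waitq, &wait);
    }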
+/*
  * wait for @condition to become true, but no longer than timeout, specified
  * by @info.
  */
@@ -226,7 +262,7 @@ do {                                                                           \
        if (condition)                                                         \
                break;                                                         \
                                                                               \
-       init_waitqueue_entry_current(&__wait);                                 \
+       init_waitqueue_entry(&__wait, current);                                \
        l_add_wait(&wq, &__wait);                                              \
                                                                               \
        /* Block all signals (just the non-fatal ones if no timeout). */       \
@@ -236,27 +272,43 @@ do {                                                                           \
                __blocked = cfs_block_sigsinv(0);                              \
                                                                               \
        for (;;) {                                                             \
-               unsigned       __wstate;                                       \
+               set_current_state(TASK_INTERRUPTIBLE);                         \
                                                                               \
-               __wstate = info->lwi_on_signal != NULL &&                      \
-                          (__timeout == 0 || __allow_intr) ?                  \
-                       TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE;             \
+               /* Guarantee that the condition is checked only after the */   \
+               /* thread state has been set to TASK_INTERRUPTIBLE; */         \
+               /* otherwise, out-of-order execution can cause a race. */      \
+               /* Consider the following real execution order: */             \
                                                                               \
-               set_current_state(TASK_INTERRUPTIBLE);                         \
+               /* 1. Thread1 checks the condition on CPU1 and gets false. */  \
+               /* 2. Thread2 sets the condition on CPU2. */                   \
+               /* 3. Thread2 calls wake_up() on CPU2 to wake the threads */   \
+               /*    in state TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, */   \
+               /*    but Thread1's state is still TASK_RUNNING then. */       \
+               /* 4. Thread1 sets its state to TASK_INTERRUPTIBLE on CPU1, */ \
+               /*    then schedules. */                                       \
+                                                                              \
+               /* If the '__timeout' variable is zero, Thread1 never gets */  \
+               /* another chance to check the condition. */                   \
+                                                                              \
+               /* The window between the reordered step 1 and step 4 is */    \
+               /* usually tiny, so steps 2 and 3 rarely land inside it, */    \
+               /* which is why this race is seldom hit in practice. But */    \
+               /* it is real, especially since steps 1 and 4 can be */        \
+               /* interrupted. Add a barrier to prevent the reordering. */    \
+               smp_mb();                                                      \
                                                                               \
                if (condition)                                                 \
                        break;                                                 \
                                                                               \
                if (__timeout == 0) {                                          \
-                       waitq_wait(&__wait, __wstate);                         \
+                       schedule();                                            \
                } else {                                                       \
                        cfs_duration_t interval = info->lwi_interval?          \
                                             min_t(cfs_duration_t,             \
                                                 info->lwi_interval,__timeout):\
                                             __timeout;                        \
-                       cfs_duration_t remaining = waitq_timedwait(&__wait,    \
-                                                  __wstate,                   \
-                                                  interval);                  \
+                       cfs_duration_t remaining = schedule_timeout(interval); \
                        __timeout = cfs_time_sub(__timeout,                    \
                                            cfs_time_sub(interval, remaining));\
                        if (__timeout == 0) {                                  \
@@ -273,7 +325,7 @@ do {                                                                           \
                                                                                \
                 if (condition)                                                 \
                         break;                                                 \
-                if (cfs_signal_pending()) {                                    \
+               if (signal_pending(current)) {                                 \
                         if (info->lwi_on_signal != NULL &&                     \
                             (__timeout == 0 || __allow_intr)) {                \
                                 if (info->lwi_on_signal != LWI_ON_SIGNAL_NOOP) \
@@ -294,12 +346,11 @@ do {                                                                           \
                                                                                \
        cfs_restore_sigs(__blocked);                                           \
                                                                                \
-       set_current_state(TASK_RUNNING);                                       \
+       set_current_state(TASK_RUNNING);                                       \
        remove_wait_queue(&wq, &__wait);                                       \
 } while (0)
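
The barrier added above enforces the classic sleep/wake protocol: the
sleeper's state store must be ordered before its condition load, and the
waker's condition store before the wake-up. A condensed sketch of both sides,
mirroring the ordering this patch relies on (sleeper(), waker() and the
'cond' flag are hypothetical, not part of this patch):

    static void sleeper(wait_queue_head_t *waitq, int *cond)
    {
            wait_queue_t wait;

            init_waitqueue_entry(&wait, current);
            add_wait_queue(waitq, &wait);
            for (;;) {
                    set_current_state(TASK_INTERRUPTIBLE);
                    /* Order the state store before the condition load,
                     * as the smp_mb() in the macro above does. */
                    smp_mb();
                    if (*cond)
                            break;
                    /* If waker() ran after our state store, wake_up()
                     * already set us back to TASK_RUNNING, so this
                     * schedule() returns promptly instead of hanging. */
                    schedule();
            }
            set_current_state(TASK_RUNNING);
            remove_wait_queue(waitq, &wait);
    }

    static void waker(wait_queue_head_t *waitq, int *cond)
    {
            *cond = 1;
            /* wake_up() takes the waitqueue lock, ordering the store
             * to *cond before it inspects the sleeper's task state. */
            wake_up(waitq);
    }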
 
 
-
 #define l_wait_event(wq, condition, info)                       \
 ({                                                              \
        int                 __ret;                              \