Whamcloud - gitweb
LU-1715 ptlrpc: flock deadlock detection does not work 53/3553/19
authorAndriy Skulysh <Andriy_Skulysh@xyratex.com>
Tue, 23 Jul 2013 23:08:51 +0000 (02:08 +0300)
committerOleg Drokin <oleg.drokin@intel.com>
Tue, 6 Aug 2013 17:10:29 +0000 (17:10 +0000)
Flock deadlocks are checked on the first attempt to grant
the flock only. If we try again to grant it after its
blocking lock is cancelled, we don't check for deadlocks
which also may exist.

Perform deadlock detection during reprocess

Xyratex-bug-id: MRP-393
Signed-off-by: Andriy Skulysh <Andriy_Skulysh@xyratex.com>
Reviewed-by: Vitaly Fertman <vitaly_fertman@xyratex.com>
Reviewed-by: Bruce Korb <bruce_korb@xyratex.com>
Change-Id: Ie4d21da0a0b18458f786c9903663372aa24700fb
Reviewed-on: http://review.whamcloud.com/3553
Tested-by: Hudson
Tested-by: Maloo <whamcloud.maloo@gmail.com>
Reviewed-by: Keith Mannthey <keith.mannthey@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
12 files changed:
contrib/bit-masks/lustre_dlm_flags.def
lustre/include/lustre/lustre_idl.h
lustre/include/lustre_dlm_flags.h
lustre/ldlm/ldlm_flock.c
lustre/llite/llite_lib.c
lustre/obdclass/lprocfs_status.c
lustre/ptlrpc/wiretest.c
lustre/tests/Makefile.am
lustre/tests/flock_deadlock.c [new file with mode: 0644]
lustre/tests/sanity.sh
lustre/utils/wirecheck.c
lustre/utils/wiretest.c

index 0f59691..828cc23 100644 (file)
@@ -64,7 +64,13 @@ flag[12] = {
     f-desc  = 'lock request has intent';
 };
 
-// Skipped bits 13, 14 and 15
+// Skipped bits 13, 14
+
+flag[15] = {
+    f-name  = flock_deadlock;
+    f-mask  = on_wire, ast;
+    f-desc  = 'flock deadlock detected';
+};
 
 flag[16] = {
     f-name  = discard_data;
index d8537f4..59e72f4 100644 (file)
@@ -1292,6 +1292,8 @@ extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb);
 #define OBD_CONNECT_LIGHTWEIGHT 0x1000000000000ULL/* lightweight connection */
 #define OBD_CONNECT_SHORTIO     0x2000000000000ULL/* short io */
 #define OBD_CONNECT_PINGLESS   0x4000000000000ULL/* pings not required */
+#define OBD_CONNECT_FLOCK_DEAD 0x8000000000000ULL/* improved flock deadlock detection */
+
 /* XXX README XXX:
  * Please DO NOT add flag values here before first ensuring that this same
  * flag value is not in use on some other branch.  Please clear any such
@@ -1333,7 +1335,8 @@ extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb);
                                OBD_CONNECT_EINPROGRESS | \
                                OBD_CONNECT_LIGHTWEIGHT | OBD_CONNECT_UMASK | \
                                OBD_CONNECT_LVB_TYPE | OBD_CONNECT_LAYOUTLOCK |\
-                               OBD_CONNECT_PINGLESS | OBD_CONNECT_MAX_EASIZE)
+                               OBD_CONNECT_PINGLESS | OBD_CONNECT_MAX_EASIZE |\
+                               OBD_CONNECT_FLOCK_DEAD)
 #define OST_CONNECT_SUPPORTED  (OBD_CONNECT_SRVLOCK | OBD_CONNECT_GRANT | \
                                 OBD_CONNECT_REQPORTAL | OBD_CONNECT_VERSION | \
                                 OBD_CONNECT_TRUNCLOCK | OBD_CONNECT_INDEX | \
index a632217..855e18f 100644 (file)
 #ifndef LDLM_ALL_FLAGS_MASK
 
 /** l_flags bits marked as "all_flags" bits */
-#define LDLM_FL_ALL_FLAGS_MASK          0x00FFFFFFC08F132FULL
+#define LDLM_FL_ALL_FLAGS_MASK          0x00FFFFFFC08F932FULL
 
 /** l_flags bits marked as "ast" bits */
-#define LDLM_FL_AST_MASK                0x0000000080000000ULL
+#define LDLM_FL_AST_MASK                0x0000000080008000ULL
 
 /** l_flags bits marked as "blocked" bits */
 #define LDLM_FL_BLOCKED_MASK            0x000000000000000EULL
@@ -56,7 +56,7 @@
 #define LDLM_FL_LOCAL_ONLY_MASK         0x00FFFFFF00000000ULL
 
 /** l_flags bits marked as "on_wire" bits */
-#define LDLM_FL_ON_WIRE_MASK            0x00000000C08F132FULL
+#define LDLM_FL_ON_WIRE_MASK            0x00000000C08F932FULL
 
 /** extent, mode, or resource changed */
 #define LDLM_FL_LOCK_CHANGED            0x0000000000000001ULL // bit   0
 #define ldlm_set_has_intent(_l)         LDLM_SET_FLAG((  _l), 1ULL << 12)
 #define ldlm_clear_has_intent(_l)       LDLM_CLEAR_FLAG((_l), 1ULL << 12)
 
+/** flock deadlock detected */
+#define LDLM_FL_FLOCK_DEADLOCK          0x0000000000008000ULL // bit  15
+#define ldlm_is_flock_deadlock(_l)      LDLM_TEST_FLAG(( _l), 1ULL << 15)
+#define ldlm_set_flock_deadlock(_l)     LDLM_SET_FLAG((  _l), 1ULL << 15)
+#define ldlm_clear_flock_deadlock(_l)   LDLM_CLEAR_FLAG((_l), 1ULL << 15)
+
 /** discard (no writeback) on cancel */
 #define LDLM_FL_DISCARD_DATA            0x0000000000010000ULL // bit  16
 #define ldlm_is_discard_data(_l)        LDLM_TEST_FLAG(( _l), 1ULL << 16)
@@ -390,6 +396,7 @@ static int hf_lustre_ldlm_fl_ast_sent            = -1;
 static int hf_lustre_ldlm_fl_replay              = -1;
 static int hf_lustre_ldlm_fl_intent_only         = -1;
 static int hf_lustre_ldlm_fl_has_intent          = -1;
+static int hf_lustre_ldlm_fl_flock_deadlock      = -1;
 static int hf_lustre_ldlm_fl_discard_data        = -1;
 static int hf_lustre_ldlm_fl_no_timeout          = -1;
 static int hf_lustre_ldlm_fl_block_nowait        = -1;
@@ -431,6 +438,7 @@ const value_string lustre_ldlm_flags_vals[] = {
   {LDLM_FL_REPLAY,              "LDLM_FL_REPLAY"},
   {LDLM_FL_INTENT_ONLY,         "LDLM_FL_INTENT_ONLY"},
   {LDLM_FL_HAS_INTENT,          "LDLM_FL_HAS_INTENT"},
+  {LDLM_FL_FLOCK_DEADLOCK,      "LDLM_FL_FLOCK_DEADLOCK"},
   {LDLM_FL_DISCARD_DATA,        "LDLM_FL_DISCARD_DATA"},
   {LDLM_FL_NO_TIMEOUT,          "LDLM_FL_NO_TIMEOUT"},
   {LDLM_FL_BLOCK_NOWAIT,        "LDLM_FL_BLOCK_NOWAIT"},
index a765cc7..c141438 100644 (file)
@@ -213,6 +213,26 @@ ldlm_flock_deadlock(struct ldlm_lock *req, struct ldlm_lock *bl_lock)
         return 0;
 }
 
+static void ldlm_flock_cancel_on_deadlock(struct ldlm_lock *lock,
+                                                cfs_list_t *work_list)
+{
+       CDEBUG(D_INFO, "reprocess deadlock req=%p\n", lock);
+
+       if ((exp_connect_flags(lock->l_export) &
+                               OBD_CONNECT_FLOCK_DEAD) == 0) {
+               CERROR("deadlock found, but client doesn't "
+                               "support flock canceliation\n");
+       } else {
+               LASSERT(lock->l_completion_ast);
+               LASSERT((lock->l_flags & LDLM_FL_AST_SENT) == 0);
+               lock->l_flags |= LDLM_FL_AST_SENT | LDLM_FL_CANCEL_ON_BLOCK |
+                       LDLM_FL_FLOCK_DEADLOCK;
+               ldlm_flock_blocking_unlink(lock);
+               ldlm_resource_unlink_lock(lock);
+               ldlm_add_ast_work_item(lock, NULL, work_list);
+       }
+}
+
 /**
  * Process a granting attempt for flock lock.
  * Must be called under ns lock held.
@@ -281,6 +301,7 @@ reprocess:
                         }
                 }
         } else {
+               int reprocess_failed = 0;
                 lockmode_verify(mode);
 
                 /* This loop determines if there are existing locks
@@ -302,8 +323,15 @@ reprocess:
                         if (!ldlm_flocks_overlap(lock, req))
                                 continue;
 
-                        if (!first_enq)
-                                RETURN(LDLM_ITER_CONTINUE);
+                       if (!first_enq) {
+                               reprocess_failed = 1;
+                               if (ldlm_flock_deadlock(req, lock)) {
+                                       ldlm_flock_cancel_on_deadlock(req,
+                                                       work_list);
+                                       RETURN(LDLM_ITER_CONTINUE);
+                               }
+                               continue;
+                       }
 
                         if (*flags & LDLM_FL_BLOCK_NOWAIT) {
                                 ldlm_flock_destroy(req, mode, *flags);
@@ -339,6 +367,8 @@ reprocess:
                         *flags |= LDLM_FL_BLOCK_GRANTED;
                         RETURN(LDLM_ITER_STOP);
                 }
+               if (reprocess_failed)
+                       RETURN(LDLM_ITER_CONTINUE);
         }
 
         if (*flags & LDLM_FL_TEST_LOCK) {
@@ -690,7 +720,10 @@ granted:
         /* ldlm_lock_enqueue() has already placed lock on the granted list. */
         cfs_list_del_init(&lock->l_res_link);
 
-        if (flags & LDLM_FL_TEST_LOCK) {
+       if (lock->l_flags & LDLM_FL_FLOCK_DEADLOCK) {
+               LDLM_DEBUG(lock, "client-side enqueue deadlock received");
+               rc = -EDEADLK;
+       } else if (flags & LDLM_FL_TEST_LOCK) {
                 /* fcntl(F_GETLK) request */
                 /* The old mode was saved in getlk->fl_type so that if the mode
                  * in the lock changes we can decref the appropriate refcount.*/
@@ -719,7 +752,7 @@ granted:
                ldlm_process_flock_lock(lock, &noreproc, 1, &err, NULL);
        }
        unlock_res_and_lock(lock);
-       RETURN(0);
+       RETURN(rc);
 }
 EXPORT_SYMBOL(ldlm_flock_completion_ast);
 
index 7dd1ee4..d65710d 100644 (file)
@@ -214,7 +214,8 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt,
                                  OBD_CONNECT_EINPROGRESS |
                                  OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE |
                                  OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_PINGLESS |
-                                 OBD_CONNECT_MAX_EASIZE;
+                                 OBD_CONNECT_MAX_EASIZE |
+                                 OBD_CONNECT_FLOCK_DEAD;
 
         if (sbi->ll_flags & LL_SBI_SOM_PREVIEW)
                 data->ocd_connect_flags |= OBD_CONNECT_SOM;
index 74f6fb1..ec9e3b3 100644 (file)
@@ -882,6 +882,7 @@ static const char *obd_connect_names[] = {
        "lightweight_conn",
        "short_io",
        "pingless",
+       "flock_deadlock",
        "unknown",
         NULL
 };
index 4ee8feb..4ae2e64 100644 (file)
@@ -1158,6 +1158,8 @@ void lustre_assert_wire_constants(void)
                 OBD_CONNECT_SHORTIO);
        LASSERTF(OBD_CONNECT_PINGLESS == 0x4000000000000ULL, "found 0x%.16llxULL\n",
                 OBD_CONNECT_PINGLESS);
+       LASSERTF(OBD_CONNECT_FLOCK_DEAD == 0x8000000000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_FLOCK_DEAD);
        LASSERTF(OBD_CKSUM_CRC32 == 0x00000001UL, "found 0x%.8xUL\n",
                (unsigned)OBD_CKSUM_CRC32);
        LASSERTF(OBD_CKSUM_ADLER == 0x00000002UL, "found 0x%.8xUL\n",
index 38ad9c0..e1ea3f9 100644 (file)
@@ -66,7 +66,7 @@ noinst_PROGRAMS += statone runas openfile rmdirmany
 noinst_PROGRAMS += small_write multiop ll_sparseness_verify
 noinst_PROGRAMS += ll_sparseness_write mrename ll_dirstripe_verify mkdirmany
 noinst_PROGRAMS += openfilleddirunlink rename_many memhog
-noinst_PROGRAMS += mmap_sanity writemany reads flocks_test
+noinst_PROGRAMS += mmap_sanity writemany reads flocks_test flock_deadlock
 noinst_PROGRAMS += write_time_limit rwv lgetxattr_size_check checkfiemap
 
 bin_PROGRAMS = mcreate munlink
diff --git a/lustre/tests/flock_deadlock.c b/lustre/tests/flock_deadlock.c
new file mode 100644 (file)
index 0000000..8a85802
--- /dev/null
@@ -0,0 +1,229 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please  visit http://www.xyratex.com/contact if you need additional
+ * information or have any questions.
+ *
+ * GPL HEADER END
+*/
+
+/*
+ * Copyright 2012 Xyratex Technology Limited
+*/
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/file.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdarg.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/sem.h>
+#include <semaphore.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <errno.h>
+
+#define LOCK_LEN 100
+
+struct flock flocks[4] = {
+       /* 1st region */
+       {
+               .l_type         = F_WRLCK,
+               .l_whence       = SEEK_SET,
+               .l_start        = 0,
+               .l_len          = LOCK_LEN,
+               .l_pid          = 0,
+       },
+       /* 2nd region */
+       {
+               .l_type         = F_WRLCK,
+               .l_whence       = SEEK_SET,
+               .l_start        = LOCK_LEN,
+               .l_len          = LOCK_LEN,
+               .l_pid          = 0,
+       },
+       /* 3rd region */
+       {
+               .l_type         = F_WRLCK,
+               .l_whence       = SEEK_SET,
+               .l_start        = 2 * LOCK_LEN,
+               .l_len          = LOCK_LEN,
+               .l_pid          = 0,
+       },
+       /* 2nd & 3rd regions */
+       {
+               .l_type         = F_WRLCK,
+               .l_whence       = SEEK_SET,
+               .l_start        = LOCK_LEN,
+               .l_len          = 2 * LOCK_LEN,
+               .l_pid          = 0,
+       },
+};
+
+enum {
+       FLOCK_GET       = 0,
+       FLOCK_PUT       = 1,
+};
+
+#define flock_call(fd, num, get, label)                                                \
+       flocks[num].l_type = get == FLOCK_GET ? F_WRLCK : F_UNLCK;              \
+       printf("%d: %s lock%d [%llu, %llu]\n", pid,                             \
+               get == FLOCK_GET ? "taking" : "putting",                        \
+               num, (unsigned long long)flocks[num].l_start,                   \
+               (unsigned long long)flocks[num].l_start + flocks[num].l_len);   \
+       rc = fcntl(fd, F_SETLKW, &flocks[num]);                                 \
+       if (rc < 0) {                                                           \
+               rc = errno;                                                     \
+               fprintf(stderr, "%d: failed to %s lock%d, %s\n",                \
+                       pid, get == FLOCK_GET ? "take" : "put",                 \
+                       num, strerror(errno));                                  \
+               goto label;                                                     \
+       } else {                                                                \
+               printf("%d: done\n", pid);                                      \
+       }
+
+void catch_alarm()
+{
+       fprintf(stderr, "lock timeout\n");
+       exit(124);
+}
+
+int main(int argc, char* argv[])
+{
+       struct sigaction act;
+       int status;
+       pid_t wpid = 0;
+       int fd, i, pid, num = 0, rc = 0;
+
+       if (argc != 2) {
+               fprintf(stderr, "usage: %s <file>\n", argv[0]);
+               return EXIT_FAILURE;
+       }
+       fd = open(argv[1], O_RDWR|O_CREAT, (mode_t)0666);
+       if (fd < 0) {
+               fprintf(stderr, "error open file %s\n", argv[1]);
+               return EXIT_FAILURE;
+       }
+
+       for (i = 0; i < 2; i++) {
+               fflush(stdout);
+               pid = fork();
+               if (pid && i == 0)
+                       wpid = pid;
+               if (pid == 0)
+                       wpid = 0;
+               if (pid == 0 && i == 0) {
+                       pid = getpid();
+
+                       flock_call(fd, num, FLOCK_GET, err_lock0);
+
+                       printf("%d sleeping 1\n", pid);
+                       sleep(1);
+
+                       /* First of all, it should get blocked on flocks[1]
+                        * 2nd child. Later, should deadlock with flocks[2]
+                        * parent, after cancelling flocks[1] 2nd child. */
+                       printf("%d: taking lock3 [%llu, %llu]\n", pid,
+                               (unsigned long long)flocks[3].l_start,
+                               (unsigned long long)flocks[3].l_start +
+                               flocks[3].l_len);
+                       memset(&act, 0, sizeof(act));
+                       act.sa_handler = catch_alarm;
+                       sigemptyset(&act.sa_mask);
+                       sigaddset(&act.sa_mask, SIGALRM);
+                       if (sigaction(SIGALRM, &act, NULL) < 0) {
+                               fprintf(stderr, "SIGALRM signal setup failed"
+                                               ", errno: %d", errno);
+                               rc = 3;
+                               goto err_lock1;
+                       }
+                       alarm(5);
+                       rc = fcntl(fd, F_SETLKW, &flocks[3]);
+                       if (rc >= 0) {
+                               fprintf(stderr, "%d: should not succeed to "
+                                               "take lock3\n", pid);
+
+                               flock_call(fd, 3, FLOCK_PUT, err_lock1);
+                               rc = EINVAL;
+                               goto err_lock1;
+                       }
+                       if (errno != EDEADLK) {
+                               rc = errno;
+                               fprintf(stderr, "%d: failed to take lock3: "
+                                               "%s\n", pid, strerror(errno));
+                               goto err_lock1;
+                       }
+
+                       printf("%d: expected deadlock\n", pid);
+
+                       flock_call(fd, num, FLOCK_PUT, err_lock0);
+                       break;
+               } else if (pid == 0 && i == 1) {
+                       pid = getpid();
+
+                       flock_call(fd, 1, FLOCK_GET, err_lock0);
+
+                       /* Let flocks[2] 2nd child get granted and
+                        * flocks[3] 1st child, flocks[0] parent get blocked.*/
+                       printf("%d sleeping 2\n", pid);
+                       sleep(2);
+
+                       flock_call(fd, 1, FLOCK_PUT, err_lock0);
+                       break;
+               } else if (pid && i == 1) {
+                       pid = getpid();
+                       num = 2;
+
+                       /* Let flocks[1] 2nd child get granted first */
+                       printf("%d: sleeping 1\n", pid);
+                       sleep(1);
+
+                       flock_call(fd, num, FLOCK_GET, err_lock0);
+
+                       /* Should get blocked on flocks[0], 1st shild
+                        * and succeed later. */
+                       flock_call(fd, 0, FLOCK_GET, err_lock1);
+
+                       flock_call(fd, 0, FLOCK_PUT, err_lock1);
+                       flock_call(fd, num, FLOCK_PUT, err_lock0);
+                       break;
+               }
+       }
+
+       if (pid == 0)
+               sleep(2);
+       if (wpid) {
+               waitpid(wpid, &status, 0);
+               rc = WEXITSTATUS(status);
+       }
+       printf("%d Exit\n", pid);
+       close(fd);
+       return rc;
+
+err_lock1:
+       flocks[num].l_type = F_UNLCK;
+       fcntl(fd, F_SETLKW, &flocks[num]);
+err_lock0:
+       close(fd);
+       return rc;
+}
index 4ef4b53..5b5b0dd 100644 (file)
@@ -11460,6 +11460,22 @@ test_234() {
 }
 run_test 234 "xattr cache should not crash on ENOMEM"
 
+test_235() {
+        [ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.4.52) ] &&
+               skip "Need MDS version at least 2.4.52" && return
+       flock_deadlock $DIR/$tfile
+       local RC=$?
+       case $RC in
+               0)
+               ;;
+               124) error "process hangs on a deadlock"
+               ;;
+               *) error "error executing flock_deadlock $DIR/$tfile"
+               ;;
+       esac
+}
+run_test 235 "LU-1715: flock deadlock detection does not work properly"
+
 #
 # tests that do cleanup/setup should be run at the end
 #
index 52155e2..1ac48ab 100644 (file)
@@ -522,6 +522,7 @@ check_obd_connect_data(void)
        CHECK_DEFINE_64X(OBD_CONNECT_LIGHTWEIGHT);
        CHECK_DEFINE_64X(OBD_CONNECT_SHORTIO);
        CHECK_DEFINE_64X(OBD_CONNECT_PINGLESS);
+       CHECK_DEFINE_64X(OBD_CONNECT_FLOCK_DEAD);
 
        CHECK_VALUE_X(OBD_CKSUM_CRC32);
        CHECK_VALUE_X(OBD_CKSUM_ADLER);
index 801dcd6..62cbfa1 100644 (file)
@@ -1166,6 +1166,8 @@ void lustre_assert_wire_constants(void)
                 OBD_CONNECT_SHORTIO);
        LASSERTF(OBD_CONNECT_PINGLESS == 0x4000000000000ULL, "found 0x%.16llxULL\n",
                 OBD_CONNECT_PINGLESS);
+       LASSERTF(OBD_CONNECT_FLOCK_DEAD == 0x8000000000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_FLOCK_DEAD);
        LASSERTF(OBD_CKSUM_CRC32 == 0x00000001UL, "found 0x%.8xUL\n",
                (unsigned)OBD_CKSUM_CRC32);
        LASSERTF(OBD_CKSUM_ADLER == 0x00000002UL, "found 0x%.8xUL\n",