b=16774 mdc to cancel unused dlm locks before replay.
author Eric Mei <eric.mei@oracle.com>
Mon, 30 Aug 2010 15:24:57 +0000 (19:24 +0400)
committer Mikhail Pershin <tappro@sun.com>
Wed, 8 Sep 2010 16:40:29 +0000 (20:40 +0400)
r=oleg.drokin
r=di.wang

lustre/include/lustre_dlm.h
lustre/ldlm/ldlm_internal.h
lustre/ldlm/ldlm_lock.c
lustre/ldlm/ldlm_request.c
lustre/ldlm/ldlm_resource.c
lustre/mdc/mdc_request.c
lustre/tests/replay-single.sh

diff --git a/lustre/include/lustre_dlm.h b/lustre/include/lustre_dlm.h
index 1747a77..bf0d9f8 100644
@@ -203,6 +203,10 @@ typedef enum {
  * emulation + race with upcoming bl_ast.  */
 #define LDLM_FL_FAIL_LOC       0x100000000ULL
 
+/* Used while processing the unused list to know that we have already
+ * handled this lock and decided to skip it */
+#define LDLM_FL_SKIPPED        0x200000000ULL
+
 /* The blocking callback is overloaded to perform two functions.  These flags
  * indicate which operation should be performed. */
 #define LDLM_CB_BLOCKING    1
@@ -368,6 +372,8 @@ typedef int (*ldlm_res_policy)(struct ldlm_namespace *, struct ldlm_lock **,
                                void *req_cookie, ldlm_mode_t mode, int flags,
                                void *data);
 
+typedef int (*ldlm_cancel_for_recovery)(struct ldlm_lock *lock);
+
 struct ldlm_valblock_ops {
         int (*lvbo_init)(struct ldlm_resource *res);
         int (*lvbo_update)(struct ldlm_resource *res,
@@ -484,6 +490,9 @@ struct ldlm_namespace {
         struct obd_device     *ns_obd;
 
         struct adaptive_timeout ns_at_estimate;/* estimated lock callback time*/
+
+        /* callback to cancel locks before replaying them during recovery */
+        ldlm_cancel_for_recovery ns_cancel_for_recovery;
 };
 
 static inline int ns_is_client(struct ldlm_namespace *ns)
@@ -512,6 +521,13 @@ static inline int ns_connect_lru_resize(struct ldlm_namespace *ns)
         return !!(ns->ns_connect_flags & OBD_CONNECT_LRU_RESIZE);
 }
 
+static inline void ns_register_cancel(struct ldlm_namespace *ns,
+                                      ldlm_cancel_for_recovery arg)
+{
+        LASSERT(ns != NULL);
+        ns->ns_cancel_for_recovery = arg;
+}
+
 /*
  *
  * Resource hash table
diff --git a/lustre/ldlm/ldlm_internal.h b/lustre/ldlm/ldlm_internal.h
index d6ff17f..c6345cb 100644
@@ -72,7 +72,9 @@ enum {
         LDLM_CANCEL_AGED   = 1 << 0, /* Cancel aged locks (non lru resize). */
         LDLM_CANCEL_PASSED = 1 << 1, /* Cancel passed number of locks. */
         LDLM_CANCEL_SHRINK = 1 << 2, /* Cancel locks from shrinker. */
-        LDLM_CANCEL_LRUR   = 1 << 3  /* Cancel locks from lru resize. */
+        LDLM_CANCEL_LRUR   = 1 << 3, /* Cancel locks from lru resize. */
+        LDLM_CANCEL_NO_WAIT = 1 << 4 /* Cancel locks w/o blocking (neither
+                                      * sending nor waiting for any rpcs) */
 };
 
 int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr, ldlm_sync_t sync,
@@ -200,7 +202,8 @@ void ldlm_exit(void);
 
 enum ldlm_policy_res {
         LDLM_POLICY_CANCEL_LOCK,
-        LDLM_POLICY_KEEP_LOCK
+        LDLM_POLICY_KEEP_LOCK,
+        LDLM_POLICY_SKIP_LOCK
 };
 
 typedef enum ldlm_policy_res ldlm_policy_res_t;
diff --git a/lustre/ldlm/ldlm_lock.c b/lustre/ldlm/ldlm_lock.c
index 2cdd68b..c000c42 100644
@@ -187,6 +187,8 @@ int ldlm_lock_remove_from_lru_nolock(struct ldlm_lock *lock)
                 struct ldlm_namespace *ns = lock->l_resource->lr_namespace;
                 LASSERT(lock->l_resource->lr_type != LDLM_FLOCK);
                 cfs_list_del_init(&lock->l_lru);
+                if (lock->l_flags & LDLM_FL_SKIPPED)
+                        lock->l_flags &= ~LDLM_FL_SKIPPED;
                 LASSERT(ns->ns_nr_unused > 0);
                 ns->ns_nr_unused--;
                 rc = 1;
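
The LRU state this hunk maintains is visible from userspace. A quick way to inspect it on a live client (a sketch, assuming a mounted client and the standard ldlm proc names; note that a manual lru_size=clear, unlike the new no-wait path added below, may still send cancel RPCs):

    # count unused (LRU) locks in each namespace on a client
    lctl get_param ldlm.namespaces.*.lock_unused_count

    # drop the whole LRU by hand, roughly the userspace analogue of
    # what this patch does automatically before lock replay
    lctl set_param ldlm.namespaces.*.lru_size=clear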
diff --git a/lustre/ldlm/ldlm_request.c b/lustre/ldlm/ldlm_request.c
index e283eee..dc59d90 100644
@@ -50,6 +50,9 @@ int ldlm_enqueue_min = OBD_TIMEOUT_DEFAULT;
 CFS_MODULE_PARM(ldlm_enqueue_min, "i", int, 0644,
                 "lock enqueue timeout minimum");
 
+/* on the client side, whether cached locks will be canceled before replay */
+unsigned int ldlm_cancel_unused_locks_before_replay = 1;
+
 static void interrupted_completion_wait(void *data)
 {
 }
@@ -1311,6 +1314,37 @@ int ldlm_cli_cancel_list_local(cfs_list_t *cancels, int count,
 }
 
 /**
+ * Cancel as many locks as possible w/o sending any rpcs (e.g. to write back
+ * dirty data, to close a file, ...) or waiting for any rpcs in-flight (e.g.
+ * readahead requests, ...)
+ */
+static ldlm_policy_res_t ldlm_cancel_no_wait_policy(struct ldlm_namespace *ns,
+                                                    struct ldlm_lock *lock,
+                                                    int unused, int added,
+                                                    int count)
+{
+        ldlm_policy_res_t result = LDLM_POLICY_CANCEL_LOCK;
+        ldlm_cancel_for_recovery cb = ns->ns_cancel_for_recovery;
+        lock_res_and_lock(lock);
+
+        /* don't check added & count since we want to process all locks
+         * from the unused list */
+        switch (lock->l_resource->lr_type) {
+                case LDLM_EXTENT:
+                case LDLM_IBITS:
+                        if (cb && cb(lock))
+                                break;
+                default:
+                        result = LDLM_POLICY_SKIP_LOCK;
+                        lock->l_flags |= LDLM_FL_SKIPPED;
+                        break;
+        }
+
+        unlock_res_and_lock(lock);
+        RETURN(result);
+}
+
+/**
  * Callback function for lru-resize policy. Makes decision whether to keep
  * \a lock in LRU for current \a LRU size \a unused, added in current scan
  * \a added and number of locks to be preferably canceled \a count.
@@ -1431,6 +1465,9 @@ typedef ldlm_policy_res_t (*ldlm_cancel_lru_policy_t)(struct ldlm_namespace *,
 static ldlm_cancel_lru_policy_t
 ldlm_cancel_lru_policy(struct ldlm_namespace *ns, int flags)
 {
+        if (flags & LDLM_CANCEL_NO_WAIT)
+                return ldlm_cancel_no_wait_policy;
+
         if (ns_connect_lru_resize(ns)) {
                 if (flags & LDLM_CANCEL_SHRINK)
                         /* We kill passed number of old locks. */
@@ -1472,17 +1509,23 @@ ldlm_cancel_lru_policy(struct ldlm_namespace *ns, int flags)
 *                              memory pressure policy function;
  *
 * flags & LDLM_CANCEL_AGED -   cancel locks according to "aged policy".
+ *
+ * flags & LDLM_CANCEL_NO_WAIT - cancel as many unused locks as possible
+ *                               (typically before replaying locks) w/o
+ *                               sending any rpcs or waiting for any
+ *                               outstanding rpc to complete.
  */
 static int ldlm_prepare_lru_list(struct ldlm_namespace *ns, cfs_list_t *cancels,
                                  int count, int max, int flags)
 {
         ldlm_cancel_lru_policy_t pf;
         struct ldlm_lock *lock, *next;
-        int added = 0, unused;
+        int added = 0, unused, remained;
         ENTRY;
 
         cfs_spin_lock(&ns->ns_unused_lock);
         unused = ns->ns_nr_unused;
+        remained = unused;
 
         if (!ns_connect_lru_resize(ns))
                 count += unused - ns->ns_max_unused;
@@ -1491,6 +1534,12 @@ static int ldlm_prepare_lru_list(struct ldlm_namespace *ns, cfs_list_t *cancels,
         LASSERT(pf != NULL);
 
         while (!cfs_list_empty(&ns->ns_unused_list)) {
+                ldlm_policy_res_t result;
+
+                /* the whole unused list has been scanned */
+                if (remained-- <= 0)
+                        break;
+
                 /* For any flags, stop scanning if @max is reached. */
                 if (max && added >= max)
                         break;
@@ -1500,6 +1549,11 @@ static int ldlm_prepare_lru_list(struct ldlm_namespace *ns, cfs_list_t *cancels,
                         /* No locks which got blocking requests. */
                         LASSERT(!(lock->l_flags & LDLM_FL_BL_AST));
 
+                        if (flags & LDLM_CANCEL_NO_WAIT &&
+                            lock->l_flags & LDLM_FL_SKIPPED)
+                                /* already processed */
+                                continue;
+
                         /* Somebody is already doing CANCEL. No need in this
                          * lock in lru, do not traverse it again. */
                         if (!(lock->l_flags & LDLM_FL_CANCELING))
@@ -1527,14 +1581,21 @@ static int ldlm_prepare_lru_list(struct ldlm_namespace *ns, cfs_list_t *cancels,
                  * old locks, but additionally chose them by
                  * their weight. Big extent locks will stay in
                  * the cache. */
-                if (pf(ns, lock, unused, added, count) ==
-                    LDLM_POLICY_KEEP_LOCK) {
+                result = pf(ns, lock, unused, added, count);
+                if (result == LDLM_POLICY_KEEP_LOCK) {
                         lu_ref_del(&lock->l_reference,
                                    __FUNCTION__, cfs_current());
                         LDLM_LOCK_RELEASE(lock);
                         cfs_spin_lock(&ns->ns_unused_lock);
                         break;
                 }
+                if (result == LDLM_POLICY_SKIP_LOCK) {
+                        lu_ref_del(&lock->l_reference,
+                                   __FUNCTION__, cfs_current());
+                        LDLM_LOCK_RELEASE(lock);
+                        cfs_spin_lock(&ns->ns_unused_lock);
+                        continue;
+                }
 
                 lock_res_and_lock(lock);
                 /* Check flags again under the lock. */
@@ -2105,6 +2166,35 @@ static int replay_one_lock(struct obd_import *imp, struct ldlm_lock *lock)
         RETURN(0);
 }
 
+/**
+ * Cancel as many unused locks as possible before replay. Since we are
+ * in recovery, we cannot send new RPCs to the server or wait for any
+ * outstanding RPCs to complete.
+ *
+ * Called only in recovery, before replaying locks; there is no need to
+ * replay locks that are unused. Since clients may hold thousands of
+ * cached unused locks, dropping them can greatly reduce the load on
+ * the servers at recovery time.
+ */
+static void ldlm_cancel_unused_locks_for_replay(struct ldlm_namespace *ns)
+{
+        int canceled;
+        CFS_LIST_HEAD(cancels);
+
+        CDEBUG(D_DLMTRACE, "Dropping as many unused locks as possible before "
+                           "replay for namespace %s (%d)\n", ns->ns_name,
+                           ns->ns_nr_unused);
+
+        /* We don't need to care whether or not LRU resize is enabled
+         * because the LDLM_CANCEL_NO_WAIT policy doesn't use the
+         * count parameter */
+        canceled = ldlm_cancel_lru_local(ns, &cancels, ns->ns_nr_unused, 0,
+                                         LCF_LOCAL, LDLM_CANCEL_NO_WAIT);
+
+        CDEBUG(D_DLMTRACE, "Canceled %d unused locks from namespace %s\n",
+                           canceled, ns->ns_name);
+}
+
 int ldlm_replay_locks(struct obd_import *imp)
 {
         struct ldlm_namespace *ns = imp->imp_obd->obd_namespace;
@@ -2123,6 +2213,9 @@ int ldlm_replay_locks(struct obd_import *imp)
         /* ensure this doesn't fall to 0 before all have been queued */
         cfs_atomic_inc(&imp->imp_replay_inflight);
 
+        if (ldlm_cancel_unused_locks_before_replay)
+                ldlm_cancel_unused_locks_for_replay(ns);
+
         (void)ldlm_namespace_foreach(ns, ldlm_chain_lock_for_replay, &list);
 
         cfs_list_for_each_entry_safe(lock, next, &list, l_pending_chain) {
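
Both CDEBUG messages above use the D_DLMTRACE mask, so the pre-replay cancellation can be observed from a client. A minimal sketch, assuming debug tracing is enabled and recovery is forced by a server failover:

    # enable dlm tracing and reset the kernel debug buffer
    lctl set_param debug=+dlmtrace
    lctl clear

    # ... fail over or restart the MDS here to force recovery ...

    # dump the debug log and look for the new messages
    lctl dk | grep "unused locks"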
diff --git a/lustre/ldlm/ldlm_resource.c b/lustre/ldlm/ldlm_resource.c
index 6d0d4a1..2d947fe 100644
@@ -64,6 +64,8 @@ cfs_proc_dir_entry_t *ldlm_type_proc_dir = NULL;
 cfs_proc_dir_entry_t *ldlm_ns_proc_dir = NULL;
 cfs_proc_dir_entry_t *ldlm_svc_proc_dir = NULL;
 
+extern unsigned int ldlm_cancel_unused_locks_before_replay;
+
 #ifdef LPROCFS
 static int ldlm_proc_dump_ns(struct file *file, const char *buffer,
                              unsigned long count, void *data)
@@ -78,6 +80,9 @@ int ldlm_proc_setup(void)
         int rc;
         struct lprocfs_vars list[] = {
                 { "dump_namespaces", NULL, ldlm_proc_dump_ns, NULL },
+                { "cancel_unused_locks_before_replay",
+                  lprocfs_rd_uint, lprocfs_wr_uint,
+                  &ldlm_cancel_unused_locks_before_replay, NULL },
                 { NULL }};
         ENTRY;
         LASSERT(ldlm_ns_proc_dir == NULL);
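
With this entry registered, the behaviour added in ldlm_request.c can be tuned at runtime. A usage sketch (the parameter path matches the one used by test_85a below):

    # read the current setting; it defaults to 1 (cancel before replay)
    lctl get_param ldlm.cancel_unused_locks_before_replay

    # disable, then re-enable, the pre-replay cancellation
    lctl set_param -n ldlm.cancel_unused_locks_before_replay 0
    lctl set_param -n ldlm.cancel_unused_locks_before_replay 1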
diff --git a/lustre/mdc/mdc_request.c b/lustre/mdc/mdc_request.c
index a1e4044..7c1db00 100644
@@ -1908,6 +1908,25 @@ struct obd_uuid *mdc_get_uuid(struct obd_export *exp) {
         return &cli->cl_target_uuid;
 }
 
+/**
+ * Determine whether a lock can be canceled before being replayed during
+ * recovery: returns nonzero if the lock can be canceled, or zero if it
+ * must be kept and replayed.
+ */
+static int mdc_cancel_for_recovery(struct ldlm_lock *lock)
+{
+        if (lock->l_resource->lr_type != LDLM_IBITS)
+                RETURN(0);
+
+        /* FIXME: if we ever get into a situation where there are too many
+         * open files with open locks on a single node, then we really
+         * should replay these open locks to reacquire them */
+        if (lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_OPEN)
+                RETURN(0);
+
+        RETURN(1);
+}
+
 static int mdc_setup(struct obd_device *obd, struct lustre_cfg *cfg)
 {
         struct client_obd *cli = &obd->u.cli;
@@ -1940,6 +1959,8 @@ static int mdc_setup(struct obd_device *obd, struct lustre_cfg *cfg)
         sptlrpc_lprocfs_cliobd_attach(obd);
         ptlrpc_lprocfs_register_obd(obd);
 
+        ns_register_cancel(obd->obd_namespace, mdc_cancel_for_recovery);
+
         rc = obd_llog_init(obd, &obd->obd_olg, obd, NULL);
         if (rc) {
                 mdc_cleanup(obd);
diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh
index 23fa404..6be1270 100755
@@ -2073,6 +2073,30 @@ test_84a() {
 }
 run_test 84a "stale open during export disconnect"
 
+test_85a() { #bug 16774
+    lctl set_param -n ldlm.cancel_unused_locks_before_replay "1"
+
+    for i in `seq 100`; do
+        echo "tag-$i" > $DIR/$tfile-$i
+        grep -q "tag-$i" $DIR/$tfile-$i || error "verify of $tfile-$i failed"
+    done
+
+    lov_id=`lctl dl | grep "clilov"`
+    addr=`echo $lov_id | awk '{print $4}' | awk -F '-' '{print $3}'`
+    count=`lctl get_param -n ldlm.namespaces.*MDT0000*$addr.lock_unused_count`
+    echo "before recovery: unused locks count = $count"
+
+    fail $SINGLEMDS
+
+    count2=`lctl get_param -n ldlm.namespaces.*MDT0000*$addr.lock_unused_count`
+    echo "after recovery: unused locks count = $count2"
+
+    if [ $count2 -ge $count ]; then
+        error "unused locks are not canceled"
+    fi
+}
+run_test 85a "check the cancellation of unused locks during recovery (IBITS)"
+
 test_86() {
         local clients=${CLIENTS:-$HOSTNAME}
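
test_85a covers only IBITS (metadata) locks. A companion check for EXTENT locks, which this patch also allows to be canceled, would follow the same pattern against an OST. The sketch below is hypothetical and not part of this patch; the *OST0000* parameter glob, the dd workload, and fail ost1 are assumptions modeled on existing tests in this file:

    test_85b() { #bug 16774
        lctl set_param -n ldlm.cancel_unused_locks_before_replay "1"

        # populate the OSC lock LRU with extent locks from file I/O
        for i in `seq 100`; do
            dd if=/dev/zero of=$DIR/$tfile-$i bs=4k count=32 >/dev/null 2>&1
        done

        count=`lctl get_param -n ldlm.namespaces.*OST0000*.lock_unused_count`
        echo "before recovery: unused locks count = $count"

        fail ost1

        count2=`lctl get_param -n ldlm.namespaces.*OST0000*.lock_unused_count`
        echo "after recovery: unused locks count = $count2"

        if [ $count2 -ge $count ]; then
            error "unused locks are not canceled"
        fi
    }
    run_test 85b "check the cancellation of unused locks during recovery (EXTENT)"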