LU-13456 ldlm: fix reprocessing of locks with more bits

[fs/lustre-release.git] / lustre / ldlm / ldlm_reclaim.c
diff --git a/lustre/ldlm/ldlm_reclaim.c b/lustre/ldlm/ldlm_reclaim.c

index 722dd9f..d371dc2 100644 (file)
--- a/lustre/ldlm/ldlm_reclaim.c
+++ b/lustre/ldlm/ldlm_reclaim.c
@@ -35,45 +35,36 @@
  
  /*
   * To avoid ldlm lock exhausting server memory, two global parameters:
- * ldlm_watermark_low & ldlm_watermark_high are used for reclaiming
+ * ldlm_reclaim_threshold & ldlm_lock_limit are used for reclaiming
   * granted locks and rejecting incoming enqueue requests defensively.
   *
- * ldlm_watermark_low: When the amount of granted locks reaching this
+ * ldlm_reclaim_threshold: When the number of granted locks reaches this
   * threshold, server start to revoke locks gradually.
   *
- * ldlm_watermark_high: When the amount of granted locks reaching this
+ * ldlm_lock_limit: When the number of granted locks reaches this
   * threshold, server will return -EINPROGRESS to any incoming enqueue
   * request until the lock count is shrunk below the threshold again.
   *
- * ldlm_watermark_low & ldlm_watermark_high is set to 20% & 30% of the
+ * ldlm_reclaim_threshold & ldlm_lock_limit are set to 20% & 30% of the
   * total memory by default. It is tunable via proc entry, when it's set
   * to 0, the feature is disabled.
   */
  
-/*
- * FIXME:
- *
- * In current implementation, server identifies which locks should be
- * revoked by choosing locks from namespace/resource in a roundrobin
- * manner, which isn't optimal. The ideal way should be server notifies
- * clients to cancel locks voluntarily, because only client knows exactly
- * when the lock is last used.
- *
- * However how to notify client immediately is a problem, one idea
- * is to leverage the glimplse callbacks on some artificial global
- * lock (like quota global lock does), but that requires protocol
- * changes, let's fix it in future long-term solution.
- */
+#ifdef HAVE_SERVER_SUPPORT
  
-__u64 ldlm_watermark_low;
-__u64 ldlm_watermark_high;
+/* Lock count is stored in ldlm_reclaim_threshold & ldlm_lock_limit */
+__u64 ldlm_reclaim_threshold;
+__u64 ldlm_lock_limit;
  
-#ifdef HAVE_SERVER_SUPPORT
+/* Represents ldlm_reclaim_threshold & ldlm_lock_limit in MB, used for
+ * proc interface. */
+__u64 ldlm_reclaim_threshold_mb;
+__u64 ldlm_lock_limit_mb;
  
-static struct percpu_counter   ldlm_granted_total;
+struct percpu_counter          ldlm_granted_total;
  static atomic_t                        ldlm_nr_reclaimer;
-static cfs_duration_t          ldlm_last_reclaim_age;
-static cfs_time_t              ldlm_last_reclaim_time;
+static s64                     ldlm_last_reclaim_age_ns;
+static ktime_t                 ldlm_last_reclaim_time;
  
  struct ldlm_reclaim_cb_data {
         struct list_head         rcd_rpc_list;
@@ -82,7 +73,7 @@ struct ldlm_reclaim_cb_data {
         int                      rcd_cursor;
         int                      rcd_start;
         bool                     rcd_skip;
-       cfs_duration_t           rcd_age;
+       s64                      rcd_age_ns;
         struct cfs_hash_bd      *rcd_prev_bd;
  };
  
@@ -101,6 +92,17 @@ static inline bool ldlm_lock_reclaimable(struct ldlm_lock *lock)
         return false;
  }
  
+/**
+ * Callback function for revoking locks from certain resource.
+ *
+ * \param [in] hs      ns_rs_hash
+ * \param [in] bd      current bucket of ns_rs_hash
+ * \param [in] hnode   hnode of the resource
+ * \param [in] arg     opaque data
+ *
+ * \retval 0           continue the scan
+ * \retval 1           stop the iteration
+ */
  static int ldlm_reclaim_lock_cb(struct cfs_hash *hs, struct cfs_hash_bd *bd,
                                 struct hlist_node *hnode, void *arg)
  
@@ -141,9 +143,9 @@ static int ldlm_reclaim_lock_cb(struct cfs_hash *hs, struct cfs_hash_bd *bd,
                         continue;
  
                 if (!OBD_FAIL_CHECK(OBD_FAIL_LDLM_WATERMARK_LOW) &&
-                   cfs_time_before(cfs_time_current(),
-                                   cfs_time_add(lock->l_last_used,
-                                                data->rcd_age)))
+                   ktime_before(ktime_get(),
+                                ktime_add_ns(lock->l_last_used,
+                                             data->rcd_age_ns)))
                         continue;
  
                 if (!ldlm_is_ast_sent(lock)) {
@@ -162,11 +164,24 @@ static int ldlm_reclaim_lock_cb(struct cfs_hash *hs, struct cfs_hash_bd *bd,
         return rc;
  }
  
+/**
+ * Revoke locks from the resources of a namespace in a roundrobin
+ * manner.
+ *
+ * \param[in] ns       namespace to do the lock revoke on
+ * \param[in,out] count        in: count of locks to be revoked;
+ *                     out: count of locks still to be revoked
+ * \param[in] age_ns   only revoke locks older than 'age_ns' nanoseconds
+ * \param[in] skip     scan from the first lock on resource if the
+ *                     'skip' is false, otherwise, continue scan
+ *                     from the last scanned position
+ */
  static void ldlm_reclaim_res(struct ldlm_namespace *ns, int *count,
-                            cfs_duration_t age, bool skip)
+                            s64 age_ns, bool skip)
  {
         struct ldlm_reclaim_cb_data     data;
         int                             idx, type, start;
+       int                             rc;
         ENTRY;
  
         LASSERT(*count != 0);
@@ -187,7 +202,7 @@ static void ldlm_reclaim_res(struct ldlm_namespace *ns, int *count,
         INIT_LIST_HEAD(&data.rcd_rpc_list);
         data.rcd_added = 0;
         data.rcd_total = *count;
-       data.rcd_age = age;
+       data.rcd_age_ns = age_ns;
         data.rcd_skip = skip;
         data.rcd_prev_bd = NULL;
         start = ns->ns_reclaim_start % CFS_HASH_NBKT(ns->ns_rs_hash);
@@ -202,35 +217,45 @@ static void ldlm_reclaim_res(struct ldlm_namespace *ns, int *count,
         LASSERTF(*count >= data.rcd_added, "count:%d, added:%d\n", *count,
                  data.rcd_added);
  
-       ldlm_run_ast_work(ns, &data.rcd_rpc_list, LDLM_WORK_REVOKE_AST);
+       rc  = ldlm_run_ast_work(ns, &data.rcd_rpc_list, LDLM_WORK_REVOKE_AST);
+       if (rc == -ERESTART)
+               ldlm_reprocess_recovery_done(ns);
+
         *count -= data.rcd_added;
         EXIT;
  }
  
  #define LDLM_RECLAIM_BATCH     512
-#define LDLM_RECLAIM_AGE_MIN   cfs_time_seconds(300)
-#define LDLM_RECLAIM_AGE_MAX   (LDLM_DEFAULT_MAX_ALIVE * 3 / 4)
+#define LDLM_RECLAIM_AGE_MIN   (300 * NSEC_PER_SEC)
+#define LDLM_RECLAIM_AGE_MAX   (LDLM_DEFAULT_MAX_ALIVE * NSEC_PER_SEC * 3 / 4)
  
-static inline cfs_duration_t ldlm_reclaim_age(void)
+static inline s64 ldlm_reclaim_age(void)
  {
-       cfs_duration_t  age;
-
-       age = ldlm_last_reclaim_age +
-               cfs_time_sub(cfs_time_current(), ldlm_last_reclaim_time);
-       if (age > LDLM_RECLAIM_AGE_MAX)
-               age = LDLM_RECLAIM_AGE_MAX;
-       else if (age < (LDLM_RECLAIM_AGE_MIN * 2))
-               age = LDLM_RECLAIM_AGE_MIN;
-       return age;
+       s64 age_ns = ldlm_last_reclaim_age_ns;
+       ktime_t now = ktime_get();
+       ktime_t diff;
+
+       diff = ktime_sub(now, ldlm_last_reclaim_time);
+       age_ns += ktime_to_ns(diff);
+       if (age_ns > LDLM_RECLAIM_AGE_MAX)
+               age_ns = LDLM_RECLAIM_AGE_MAX;
+       else if (age_ns < (LDLM_RECLAIM_AGE_MIN * 2))
+               age_ns = LDLM_RECLAIM_AGE_MIN;
+       return age_ns;
  }
  
+/**
+ * Revoke a certain number of locks from all the server namespaces
+ * in a round-robin manner. Lock age is used to avoid reclaiming
+ * locks that are not yet old enough.
+ */
  static void ldlm_reclaim_ns(void)
  {
         struct ldlm_namespace   *ns;
         int                      count = LDLM_RECLAIM_BATCH;
         int                      ns_nr, nr_processed;
         enum ldlm_side           ns_cli = LDLM_NAMESPACE_SERVER;
-       cfs_duration_t           age;
+       s64 age_ns;
         bool                     skip = true;
         ENTRY;
  
@@ -239,7 +264,7 @@ static void ldlm_reclaim_ns(void)
                 return;
         }
  
-       age = ldlm_reclaim_age();
+       age_ns = ldlm_reclaim_age();
  again:
         nr_processed = 0;
         ns_nr = ldlm_namespace_nr_read(ns_cli);
@@ -255,21 +280,21 @@ again:
                 ldlm_namespace_move_to_active_locked(ns, ns_cli);
                 mutex_unlock(ldlm_namespace_lock(ns_cli));
  
-               ldlm_reclaim_res(ns, &count, age, skip);
+               ldlm_reclaim_res(ns, &count, age_ns, skip);
                 ldlm_namespace_put(ns);
                 nr_processed++;
         }
  
-       if (count > 0 && age > LDLM_RECLAIM_AGE_MIN) {
-               age >>= 1;
-               if (age < (LDLM_RECLAIM_AGE_MIN * 2))
-                       age = LDLM_RECLAIM_AGE_MIN;
+       if (count > 0 && age_ns > LDLM_RECLAIM_AGE_MIN) {
+               age_ns >>= 1;
+               if (age_ns < (LDLM_RECLAIM_AGE_MIN * 2))
+                       age_ns = LDLM_RECLAIM_AGE_MIN;
                 skip = false;
                 goto again;
         }
  
-       ldlm_last_reclaim_age = age;
-       ldlm_last_reclaim_time = cfs_time_current();
+       ldlm_last_reclaim_age_ns = age_ns;
+       ldlm_last_reclaim_time = ktime_get();
  out:
         atomic_add_unless(&ldlm_nr_reclaimer, -1, 0);
         EXIT;
@@ -280,7 +305,7 @@ void ldlm_reclaim_add(struct ldlm_lock *lock)
         if (!ldlm_lock_reclaimable(lock))
                 return;
         percpu_counter_add(&ldlm_granted_total, 1);
-       lock->l_last_used = cfs_time_current();
+       lock->l_last_used = ktime_get();
  }
  
  void ldlm_reclaim_del(struct ldlm_lock *lock)
@@ -290,23 +315,32 @@ void ldlm_reclaim_del(struct ldlm_lock *lock)
         percpu_counter_sub(&ldlm_granted_total, 1);
  }
  
+/**
+ * Check the total number of granted locks: return true if it exceeds
+ * the high watermark (ldlm_lock_limit), otherwise return false. It also
+ * triggers lock reclaim if the low watermark (ldlm_reclaim_threshold)
+ * is exceeded.
+ *
+ * \retval true                high watermark reached.
+ * \retval false       high watermark not reached.
+ */
  bool ldlm_reclaim_full(void)
  {
-       __u64 high = ldlm_watermark_high;
-       __u64 low = ldlm_watermark_low;
+       __u64 high = ldlm_lock_limit;
+       __u64 low = ldlm_reclaim_threshold;
  
         if (low != 0 && OBD_FAIL_CHECK(OBD_FAIL_LDLM_WATERMARK_LOW))
                 low = cfs_fail_val;
  
         if (low != 0 &&
-           percpu_counter_read_positive(&ldlm_granted_total) > low)
+           percpu_counter_sum_positive(&ldlm_granted_total) > low)
                 ldlm_reclaim_ns();
  
         if (high != 0 && OBD_FAIL_CHECK(OBD_FAIL_LDLM_WATERMARK_HIGH))
                 high = cfs_fail_val;
  
         if (high != 0 &&
-           percpu_counter_read_positive(&ldlm_granted_total) > high)
+           percpu_counter_sum_positive(&ldlm_granted_total) > high)
                 return true;
  
         return false;
@@ -316,24 +350,37 @@ static inline __u64 ldlm_ratio2locknr(int ratio)
  {
         __u64 locknr;
  
-       locknr = ((__u64)NUM_CACHEPAGES << PAGE_CACHE_SHIFT) * ratio;
+       locknr = ((__u64)NUM_CACHEPAGES << PAGE_SHIFT) * ratio;
         do_div(locknr, 100 * sizeof(struct ldlm_lock));
  
         return locknr;
  }
  
+static inline __u64 ldlm_locknr2mb(__u64 locknr)
+{
+       return (locknr * sizeof(struct ldlm_lock) + 512 * 1024) >> 20;
+}
+
  #define LDLM_WM_RATIO_LOW_DEFAULT      20
  #define LDLM_WM_RATIO_HIGH_DEFAULT     30
  
  int ldlm_reclaim_setup(void)
  {
         atomic_set(&ldlm_nr_reclaimer, 0);
-       ldlm_watermark_low = ldlm_ratio2locknr(LDLM_WM_RATIO_LOW_DEFAULT);
-       ldlm_watermark_high = ldlm_ratio2locknr(LDLM_WM_RATIO_HIGH_DEFAULT);
-       ldlm_last_reclaim_age = LDLM_RECLAIM_AGE_MAX;
-       ldlm_last_reclaim_time = cfs_time_current();
  
+       ldlm_reclaim_threshold = ldlm_ratio2locknr(LDLM_WM_RATIO_LOW_DEFAULT);
+       ldlm_reclaim_threshold_mb = ldlm_locknr2mb(ldlm_reclaim_threshold);
+       ldlm_lock_limit = ldlm_ratio2locknr(LDLM_WM_RATIO_HIGH_DEFAULT);
+       ldlm_lock_limit_mb = ldlm_locknr2mb(ldlm_lock_limit);
+
+       ldlm_last_reclaim_age_ns = LDLM_RECLAIM_AGE_MAX;
+       ldlm_last_reclaim_time = ktime_get();
+
+#ifdef HAVE_PERCPU_COUNTER_INIT_GFP_FLAG
+       return percpu_counter_init(&ldlm_granted_total, 0, GFP_KERNEL);
+#else
         return percpu_counter_init(&ldlm_granted_total, 0);
+#endif
  }
  
  void ldlm_reclaim_cleanup(void)