Whamcloud - gitweb
d256c64be05d526e08261a0d55770dd03f0aef0c
[fs/lustre-release.git] / lustre / ldlm / ldlm_reclaim.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License version 2 for more details (a copy is included
14  * in the LICENSE file that accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License
17  * version 2 along with this program; If not, see
18  * http://www.gnu.org/licenses/gpl-2.0.html
19  *
20  * GPL HEADER END
21  */
22 /*
23  * Copyright (c) 2015, Intel Corporation.
24  * Use is subject to license terms.
25  *
26  * Author: Niu    Yawei    <yawei.niu@intel.com>
27  */
28
29 #define DEBUG_SUBSYSTEM S_LDLM
30
31 #include <linux/kthread.h>
32 #include <lustre_dlm.h>
33 #include <obd_class.h>
34 #include "ldlm_internal.h"
35
36 /*
37  * To avoid ldlm lock exhausting server memory, two global parameters:
38  * ldlm_reclaim_threshold & ldlm_lock_limit are used for reclaiming
39  * granted locks and rejecting incoming enqueue requests defensively.
40  *
 * ldlm_reclaim_threshold: when the number of granted locks reaches this
 * threshold, the server starts to revoke locks gradually.
 *
 * ldlm_lock_limit: when the number of granted locks reaches this
 * threshold, the server will return -EINPROGRESS to any incoming enqueue
 * request until the lock count shrinks below the threshold again.
 *
 * ldlm_reclaim_threshold & ldlm_lock_limit are set to 20% & 30% of the
 * total memory by default. They are tunable via proc entries; when set
 * to 0, the feature is disabled.
51  */
52
53 #ifdef HAVE_SERVER_SUPPORT
54
55 /* Lock count is stored in ldlm_reclaim_threshold & ldlm_lock_limit */
56 __u64 ldlm_reclaim_threshold;
57 __u64 ldlm_lock_limit;
58
59 /* Represents ldlm_reclaim_threshold & ldlm_lock_limit in MB, used for
60  * proc interface. */
61 __u64 ldlm_reclaim_threshold_mb;
62 __u64 ldlm_lock_limit_mb;
63
/* Total number of reclaimable granted locks on this server
 * (maintained by ldlm_reclaim_add()/ldlm_reclaim_del()). */
struct percpu_counter           ldlm_granted_total;
/* Nonzero while a reclaim pass is in flight; caps concurrency at one. */
static atomic_t                 ldlm_nr_reclaimer;
/* Age cutoff used by the most recent reclaim pass. */
static cfs_duration_t           ldlm_last_reclaim_age;
/* Time the most recent reclaim pass finished. */
static cfs_time_t               ldlm_last_reclaim_time;
68
/* Per-scan state threaded through ldlm_reclaim_lock_cb() while walking
 * a namespace's resource hash. */
struct ldlm_reclaim_cb_data {
        struct list_head         rcd_rpc_list;  /* locks queued for revoke AST */
        int                      rcd_added;     /* locks queued so far */
        int                      rcd_total;     /* max locks to queue this scan */
        int                      rcd_cursor;    /* current position in bucket */
        int                      rcd_start;     /* position to resume scan from */
        bool                     rcd_skip;      /* skip up to rcd_start if true */
        cfs_duration_t           rcd_age;       /* only pick locks older than this */
        struct cfs_hash_bd      *rcd_prev_bd;   /* last hash bucket visited */
};
79
80 static inline bool ldlm_lock_reclaimable(struct ldlm_lock *lock)
81 {
82         struct ldlm_namespace *ns = ldlm_lock_to_ns(lock);
83
84         /* FLOCK & PLAIN lock are not reclaimable. FLOCK is
85          * explicitly controlled by application, PLAIN lock
86          * is used by quota global lock and config lock.
87          */
88         if (ns->ns_client == LDLM_NAMESPACE_SERVER &&
89             (lock->l_resource->lr_type == LDLM_IBITS ||
90              lock->l_resource->lr_type == LDLM_EXTENT))
91                 return true;
92         return false;
93 }
94
95 /**
96  * Callback function for revoking locks from certain resource.
97  *
98  * \param [in] hs       ns_rs_hash
99  * \param [in] bd       current bucket of ns_rsh_hash
100  * \param [in] hnode    hnode of the resource
101  * \param [in] arg      opaque data
102  *
103  * \retval 0            continue the scan
104  * \retval 1            stop the iteration
105  */
static int ldlm_reclaim_lock_cb(struct cfs_hash *hs, struct cfs_hash_bd *bd,
                                struct hlist_node *hnode, void *arg)

{
        struct ldlm_resource            *res;
        struct ldlm_reclaim_cb_data     *data;
        struct ldlm_lock                *lock;
        struct ldlm_ns_bucket           *nsb;
        int                              rc = 0;

        data = (struct ldlm_reclaim_cb_data *)arg;

        LASSERTF(data->rcd_added < data->rcd_total, "added:%d >= total:%d\n",
                 data->rcd_added, data->rcd_total);

        nsb = cfs_hash_bd_extra_get(hs, bd);
        res = cfs_hash_object(hs, hnode);

        /* First visit of a new bucket: reset the per-bucket cursor and
         * compute the position where the previous scan of this bucket
         * stopped, so a 'skip' scan resumes there. Bumping the
         * namespace's ns_reclaim_start on leaving a bucket rotates the
         * start bucket for the next reclaim pass. */
        if (data->rcd_prev_bd != bd) {
                if (data->rcd_prev_bd != NULL)
                        ldlm_res_to_ns(res)->ns_reclaim_start++;
                data->rcd_prev_bd = bd;
                data->rcd_cursor = 0;
                data->rcd_start = nsb->nsb_reclaim_start %
                                  cfs_hash_bd_count_get(bd);
        }

        /* In 'skip' mode, fast-forward over resources already scanned
         * by the previous pass until the saved start position. */
        if (data->rcd_skip && data->rcd_cursor < data->rcd_start) {
                data->rcd_cursor++;
                return 0;
        }

        nsb->nsb_reclaim_start++;

        lock_res(res);
        list_for_each_entry(lock, &res->lr_granted, l_res_link) {
                if (!ldlm_lock_reclaimable(lock))
                        continue;

                /* Skip locks used more recently than rcd_age, unless
                 * the OBD_FAIL_LDLM_WATERMARK_LOW fault injection is
                 * armed (which forces age to be ignored for testing). */
                if (!OBD_FAIL_CHECK(OBD_FAIL_LDLM_WATERMARK_LOW) &&
                    cfs_time_before(cfs_time_current(),
                                    cfs_time_add(lock->l_last_used,
                                                 data->rcd_age)))
                        continue;

                /* Queue the lock for a revoke AST exactly once; the
                 * ast_sent flag guards against double-queueing, and a
                 * reference is held for the rcd_rpc_list entry. */
                if (!ldlm_is_ast_sent(lock)) {
                        ldlm_set_ast_sent(lock);
                        LASSERT(list_empty(&lock->l_rk_ast));
                        list_add(&lock->l_rk_ast, &data->rcd_rpc_list);
                        LDLM_LOCK_GET(lock);
                        if (++data->rcd_added == data->rcd_total) {
                                rc = 1; /* stop the iteration */
                                break;
                        }
                }
        }
        unlock_res(res);

        return rc;
}
166
167 /**
168  * Revoke locks from the resources of a namespace in a roundrobin
169  * manner.
170  *
171  * \param[in] ns        namespace to do the lock revoke on
172  * \param[in] count     count of lock to be revoked
173  * \param[in] age       only revoke locks older than the 'age'
174  * \param[in] skip      scan from the first lock on resource if the
175  *                      'skip' is false, otherwise, continue scan
176  *                      from the last scanned position
177  * \param[out] count    count of lock still to be revoked
178  */
static void ldlm_reclaim_res(struct ldlm_namespace *ns, int *count,
                             cfs_duration_t age, bool skip)
{
        struct ldlm_reclaim_cb_data     data;
        int                             idx, type, start;
        ENTRY;

        LASSERT(*count != 0);

        /* Reclaim only from MDT/OST namespaces; other server types
         * (e.g. MGS) are left alone. */
        if (ns->ns_obd) {
                type = server_name2index(ns->ns_obd->obd_name, &idx, NULL);
                if (type != LDD_F_SV_TYPE_MDT && type != LDD_F_SV_TYPE_OST) {
                        EXIT;
                        return;
                }
        }

        /* NOTE(review): ns_bref == 0 is treated as "namespace holds
         * nothing worth scanning" — bail out early. */
        if (atomic_read(&ns->ns_bref) == 0) {
                EXIT;
                return;
        }

        /* Set up the per-scan state consumed by ldlm_reclaim_lock_cb()
         * and start the hash walk from where the last pass left off,
         * so successive passes rotate over buckets. */
        INIT_LIST_HEAD(&data.rcd_rpc_list);
        data.rcd_added = 0;
        data.rcd_total = *count;
        data.rcd_age = age;
        data.rcd_skip = skip;
        data.rcd_prev_bd = NULL;
        start = ns->ns_reclaim_start % CFS_HASH_NBKT(ns->ns_rs_hash);

        cfs_hash_for_each_nolock(ns->ns_rs_hash, ldlm_reclaim_lock_cb, &data,
                                 start);

        CDEBUG(D_DLMTRACE, "NS(%s): %d locks to be reclaimed, found %d/%d "
               "locks.\n", ldlm_ns_name(ns), *count, data.rcd_added,
               data.rcd_total);

        LASSERTF(*count >= data.rcd_added, "count:%d, added:%d\n", *count,
                 data.rcd_added);

        /* Send revoke ASTs for everything collected, then report how
         * many locks are still left to revoke via *count. */
        ldlm_run_ast_work(ns, &data.rcd_rpc_list, LDLM_WORK_REVOKE_AST);
        *count -= data.rcd_added;
        EXIT;
}
223
224 #define LDLM_RECLAIM_BATCH      512
225 #define LDLM_RECLAIM_AGE_MIN    cfs_time_seconds(300)
226 #define LDLM_RECLAIM_AGE_MAX    (LDLM_DEFAULT_MAX_ALIVE * 3 / 4)
227
228 static inline cfs_duration_t ldlm_reclaim_age(void)
229 {
230         cfs_duration_t  age;
231
232         age = ldlm_last_reclaim_age +
233                 cfs_time_sub(cfs_time_current(), ldlm_last_reclaim_time);
234         if (age > LDLM_RECLAIM_AGE_MAX)
235                 age = LDLM_RECLAIM_AGE_MAX;
236         else if (age < (LDLM_RECLAIM_AGE_MIN * 2))
237                 age = LDLM_RECLAIM_AGE_MIN;
238         return age;
239 }
240
241 /**
242  * Revoke certain amount of locks from all the server namespaces
243  * in a roundrobin manner. Lock age is used to avoid reclaim on
244  * the non-aged locks.
245  */
static void ldlm_reclaim_ns(void)
{
        struct ldlm_namespace   *ns;
        int                      count = LDLM_RECLAIM_BATCH;
        int                      ns_nr, nr_processed;
        enum ldlm_side           ns_cli = LDLM_NAMESPACE_SERVER;
        cfs_duration_t           age;
        bool                     skip = true;
        ENTRY;

        /* Allow only one reclaim pass at a time; concurrent callers
         * simply return rather than piling up. */
        if (!atomic_add_unless(&ldlm_nr_reclaimer, 1, 1)) {
                EXIT;
                return;
        }

        /* Start from an age cutoff derived from the previous pass. */
        age = ldlm_reclaim_age();
again:
        nr_processed = 0;
        ns_nr = ldlm_namespace_nr_read(ns_cli);
        while (count > 0 && nr_processed < ns_nr) {
                mutex_lock(ldlm_namespace_lock(ns_cli));

                if (list_empty(ldlm_namespace_list(ns_cli))) {
                        mutex_unlock(ldlm_namespace_lock(ns_cli));
                        goto out;
                }

                /* Take the first namespace and rotate it within the
                 * list so the batch is spread round-robin over all
                 * server namespaces. */
                ns = ldlm_namespace_first_locked(ns_cli);
                ldlm_namespace_move_to_active_locked(ns, ns_cli);
                mutex_unlock(ldlm_namespace_lock(ns_cli));

                ldlm_reclaim_res(ns, &count, age, skip);
                ldlm_namespace_put(ns);
                nr_processed++;
        }

        /* Batch not filled with aged locks: halve the age cutoff
         * (bounded below by LDLM_RECLAIM_AGE_MIN) and rescan each
         * resource from its start (skip = false). */
        if (count > 0 && age > LDLM_RECLAIM_AGE_MIN) {
                age >>= 1;
                if (age < (LDLM_RECLAIM_AGE_MIN * 2))
                        age = LDLM_RECLAIM_AGE_MIN;
                skip = false;
                goto again;
        }

        /* Remember the effective cutoff and finish time so the next
         * pass can adapt via ldlm_reclaim_age(). */
        ldlm_last_reclaim_age = age;
        ldlm_last_reclaim_time = cfs_time_current();
out:
        atomic_add_unless(&ldlm_nr_reclaimer, -1, 0);
        EXIT;
}
296
297 void ldlm_reclaim_add(struct ldlm_lock *lock)
298 {
299         if (!ldlm_lock_reclaimable(lock))
300                 return;
301         percpu_counter_add(&ldlm_granted_total, 1);
302         lock->l_last_used = cfs_time_current();
303 }
304
305 void ldlm_reclaim_del(struct ldlm_lock *lock)
306 {
307         if (!ldlm_lock_reclaimable(lock))
308                 return;
309         percpu_counter_sub(&ldlm_granted_total, 1);
310 }
311
312 /**
313  * Check on the total granted locks: return true if it reaches the
314  * high watermark (ldlm_lock_limit), otherwise return false; It also
315  * triggers lock reclaim if the low watermark (ldlm_reclaim_threshold)
316  * is reached.
317  *
318  * \retval true         high watermark reached.
319  * \retval false        high watermark not reached.
320  */
321 bool ldlm_reclaim_full(void)
322 {
323         __u64 high = ldlm_lock_limit;
324         __u64 low = ldlm_reclaim_threshold;
325
326         if (low != 0 && OBD_FAIL_CHECK(OBD_FAIL_LDLM_WATERMARK_LOW))
327                 low = cfs_fail_val;
328
329         if (low != 0 &&
330             percpu_counter_sum_positive(&ldlm_granted_total) > low)
331                 ldlm_reclaim_ns();
332
333         if (high != 0 && OBD_FAIL_CHECK(OBD_FAIL_LDLM_WATERMARK_HIGH))
334                 high = cfs_fail_val;
335
336         if (high != 0 &&
337             percpu_counter_sum_positive(&ldlm_granted_total) > high)
338                 return true;
339
340         return false;
341 }
342
343 static inline __u64 ldlm_ratio2locknr(int ratio)
344 {
345         __u64 locknr;
346
347         locknr = ((__u64)NUM_CACHEPAGES << PAGE_CACHE_SHIFT) * ratio;
348         do_div(locknr, 100 * sizeof(struct ldlm_lock));
349
350         return locknr;
351 }
352
353 static inline __u64 ldlm_locknr2mb(__u64 locknr)
354 {
355         return (locknr * sizeof(struct ldlm_lock) + 512 * 1024) >> 20;
356 }
357
358 #define LDLM_WM_RATIO_LOW_DEFAULT       20
359 #define LDLM_WM_RATIO_HIGH_DEFAULT      30
360
361 int ldlm_reclaim_setup(void)
362 {
363         atomic_set(&ldlm_nr_reclaimer, 0);
364
365         ldlm_reclaim_threshold = ldlm_ratio2locknr(LDLM_WM_RATIO_LOW_DEFAULT);
366         ldlm_reclaim_threshold_mb = ldlm_locknr2mb(ldlm_reclaim_threshold);
367         ldlm_lock_limit = ldlm_ratio2locknr(LDLM_WM_RATIO_HIGH_DEFAULT);
368         ldlm_lock_limit_mb = ldlm_locknr2mb(ldlm_lock_limit);
369
370         ldlm_last_reclaim_age = LDLM_RECLAIM_AGE_MAX;
371         ldlm_last_reclaim_time = cfs_time_current();
372
373         return percpu_counter_init(&ldlm_granted_total, 0);
374 }
375
/* Tear down the per-cpu counter allocated in ldlm_reclaim_setup(). */
void ldlm_reclaim_cleanup(void)
{
        percpu_counter_destroy(&ldlm_granted_total);
}
380
381 #else /* HAVE_SERVER_SUPPORT */
382
/* Client-only build: no server locks exist, so enqueue is never
 * throttled by the lock-limit watermark. */
bool ldlm_reclaim_full(void)
{
        return false;
}

/* Client-only build: no reclaim accounting is needed. */
void ldlm_reclaim_add(struct ldlm_lock *lock)
{
}

/* Client-only build: no reclaim accounting is needed. */
void ldlm_reclaim_del(struct ldlm_lock *lock)
{
}

/* Client-only build: nothing to set up. */
int ldlm_reclaim_setup(void)
{
        return 0;
}

/* Client-only build: nothing to tear down. */
void ldlm_reclaim_cleanup(void)
{
}
404
405 #endif /* HAVE_SERVER_SUPPORT */