/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
 * vim:expandtab:shiftwidth=8:tabstop=8:
 *
 * Copyright (c) 2007 Cluster File Systems, Inc.
 * Author: Yury Umanets <umka@clusterfs.com>
 *
 * This file is part of the Lustre file system, http://www.lustre.org
 * Lustre is a trademark of Cluster File Systems, Inc.
 *
 * You may have signed or agreed to another license before downloading
 * this software. If so, you are bound by the terms and conditions
 * of that agreement, and the following does not apply to you. See the
 * LICENSE file included with this distribution for more information.
 *
 * If you did not agree to a different license, then this copy of Lustre
 * is open source software; you can redistribute it and/or modify it
 * under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * In either case, Lustre is distributed in the hope that it will be
 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * license text for more details.
 */
/* The idea behind this code is rather simple. Every second, for each server
 * namespace, we calculate SLV - the server lock volume - from the current
 * number of granted locks, the grant speed over the last period, and so on;
 * that is, from the locking load. For simplicity, this SLV number may be
 * thought of as a flow definition. It is sent to clients at every opportunity
 * so that they know the current load situation on the server. Initially, SLV
 * on the server is set to its maximum value, calculated as follows: allow one
 * client to hold all ->pl_limit locks for 10h.
 *
 * On clients, the number of cached locks is no longer limited artificially as
 * it was before. Instead, the client calculates CLV - the client lock volume -
 * for each lock and compares it with the last SLV received from the server.
 * CLV is calculated as the number of locks in the LRU * the lock's live time
 * in seconds. If CLV > SLV, the lock is canceled.
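 *
 * For example (illustrative numbers only): with 100 locks in the client LRU,
 * a lock that has lived 30 seconds has CLV = 100 * 30 = 3000; if the last SLV
 * received from the server is 2500, that lock is canceled (CLV > SLV), while
 * a 10 second old lock (CLV = 1000) is kept.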
 *
 * The client also has LVF - the lock volume factor - which regulates how
 * sensitive the client should be to the last SLV received from the server.
 * The higher LVF is, the more locks will be canceled on the client. Its
 * default value is 1. Setting LVF to 2 means that the client will cancel
 * locks twice as fast.
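 *
 * In terms of the comparison above, this can be read (an illustrative
 * interpretation, not the exact cancellation formula) as scaling CLV by LVF
 * before comparing it with SLV: with LVF = 2, the 10 second old lock from the
 * example above weighs 1000 * 2 = 2000 and is canceled as soon as SLV drops
 * below 2000 rather than below 1000.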
 *
 * Locks on a client will be canceled more aggressively in the following cases:
 * (1) SLV is smaller, that is, the load on the server is higher;
 * (2) the client holds many locks (the more locks a client holds, the higher
 *     the chance that some of them should be canceled);
 * (3) the client has old locks (taken some time ago).
 *
 * Thus, in the flow paradigm that we use to reason about SLV, CLV is the
 * volume of a particle in the flow described by SLV. If the flow becomes
 * thinner, more and more particles fall outside of it, and since the particles
 * are locks, they should be canceled.
 *
 * The general idea belongs to Vitaly Fertman (vitaly@clusterfs.com). Andreas
 * Dilger (adilger@clusterfs.com) proposed a few nice ideas, such as using LVF,
 * and many cleanups. The flow definition that makes the logic easier to
 * understand belongs to Nikita Danilov (nikita@clusterfs.com), along with many
 * cleanups and fixes. The design and implementation are by Yury Umanets
 * (umka@clusterfs.com).
 *
 * Glossary of terms used:
 *
 * pl_limit - Number of allowed locks in the pool. Applies to the server and
 * the client side;
 *
 * pl_granted - Number of granted locks (calculated);
 * pl_grant_rate - Number of granted locks for the last T (calculated);
 * pl_cancel_rate - Number of canceled locks for the last T (calculated);
 * pl_grant_speed - Grant speed (GR - CR) for the last T (calculated);
 * pl_grant_plan - Planned number of granted locks for the next T (calculated);
 *
 * pl_grant_step - Grant plan step, that is, how ->pl_grant_plan
 * will change in the next T (tunable);
 *
 * pl_server_lock_volume - Current server lock volume (calculated);
 *
 * As can be seen from the list above, we have a few tunables that may affect
 * behavior considerably. They can all be modified via proc. They also make it
 * possible to construct a few pre-defined behavior policies. If none of the
 * pre-defined policies suits the working pattern in use, a new one may be
 * "constructed" via the proc tunables.
 */
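
/*
 * For instance (illustrative numbers only): on the server side, with
 * pl_limit = 10000, 1000 currently granted locks and the default
 * pl_grant_step of 5%, ldlm_srv_pool_recalc() plans
 * 1000 + (10000 - 1000) * 5 / 100 = 1450 granted locks for the next T.
 */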

#define DEBUG_SUBSYSTEM S_LDLM

#ifdef __KERNEL__
# include <lustre_dlm.h>
#else
# include <liblustre.h>
# include <libcfs/kp30.h>
#endif

#include <obd_class.h>
#include <obd_support.h>
#include "ldlm_internal.h"

#ifdef HAVE_LRU_RESIZE_SUPPORT

/* 50 ldlm locks for 1MB of RAM. */
#define LDLM_POOL_HOST_L ((num_physpages >> (20 - PAGE_SHIFT)) * 50)
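/* E.g. (illustrative figure): on a host with 1GB of RAM this evaluates to
 * 1024 * 50 = 51200 locks. */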

/* Default step in % for the grant plan. */
#define LDLM_POOL_GSP (5)

/* By default, the grant plan (GP) is LDLM_POOL_GSP% of all locks. */
#define LDLM_POOL_GP(L) ((L) * LDLM_POOL_GSP / 100)

/* Max age (in seconds) for locks on clients (36000 sec = 10h). */
#define LDLM_POOL_MAX_AGE (36000)
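/* E.g. (illustrative figures): with the 1GB example limit of 51200 locks, the
 * initial (maximal) SLV computed by ldlm_pool_slv_max() below is
 * 51200 * 36000 = 1843200000. */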

extern cfs_proc_dir_entry_t *ldlm_ns_proc_dir;

extern atomic_t ldlm_srv_namespace_nr;
extern atomic_t ldlm_cli_namespace_nr;
extern struct list_head ldlm_namespace_list;
extern struct semaphore ldlm_namespace_lock;

/* Running average: the new value is the mean of the old value and the new
 * sample. */
#define avg(src, add) \
        ((src) = ((src) + (add)) / 2)

/* Round-up division for 64-bit values. */
static inline __u64 dru(__u64 val, __u32 div)
{
        __u64 ret = val + (div - 1);
        do_div(ret, div);
        return ret;
}

static inline __u64 ldlm_pool_slv_max(__u32 L)
{
        /* Allow one client to hold all the locks for 10 hrs.
         * The formula is: limit * 10h / 1 client. */
        __u64 lim = L * LDLM_POOL_MAX_AGE / 1;
        return lim;
}

static inline __u64 ldlm_pool_slv_min(__u32 L)
{
        return 1;
}

enum {
        LDLM_POOL_GRANTED_STAT = 0,
        LDLM_POOL_GRANT_RATE_STAT,
        LDLM_POOL_CANCEL_RATE_STAT,
        LDLM_POOL_GRANT_PLAN_STAT,
        LDLM_POOL_SLV_STAT,
        LDLM_POOL_LAST_STAT
};

static inline struct ldlm_namespace *ldlm_pl2ns(struct ldlm_pool *pl)
{
        return container_of(pl, struct ldlm_namespace, ns_pool);
}

static int ldlm_srv_pool_recalc(struct ldlm_pool *pl)
{
        int slv_factor, limit, granted, grant_speed;
        int grant_rate, cancel_rate, grant_step;
        time_t recalc_interval_sec;
        int grant_plan;
        __u64 slv;

        spin_lock(&pl->pl_lock);

        /* Get all values into local variables to avoid some of them changing
         * in the middle of the recalculation. */
        slv = ldlm_pool_get_slv(pl);
        limit = ldlm_pool_get_limit(pl);
        granted = atomic_read(&pl->pl_granted);
        grant_rate = atomic_read(&pl->pl_grant_rate);
        grant_plan = atomic_read(&pl->pl_grant_plan);
        grant_step = atomic_read(&pl->pl_grant_step);
        grant_speed = atomic_read(&pl->pl_grant_speed);
        cancel_rate = atomic_read(&pl->pl_cancel_rate);

        /* Zero out grant/cancel rates and speed for this T. */
        atomic_set(&pl->pl_grant_rate, 0);
        atomic_set(&pl->pl_cancel_rate, 0);
        atomic_set(&pl->pl_grant_speed, 0);

        /* Make sure that we use correct data for statistics. The pools thread
         * may not be scheduled for a long time due to heavy CPU contention, so
         * the accumulated counters have to be scaled by the real length of the
         * recalculation interval. */
        recalc_interval_sec = cfs_duration_sec(cfs_time_current() -
                                               pl->pl_update_time);
        if (recalc_interval_sec == 0)
                recalc_interval_sec = 1;

        lprocfs_counter_add(pl->pl_stats, LDLM_POOL_SLV_STAT, slv);
        lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANTED_STAT,
                            granted);
        lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANT_RATE_STAT,
                            grant_rate / recalc_interval_sec);
        lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANT_PLAN_STAT,
                            grant_plan / recalc_interval_sec);
        lprocfs_counter_add(pl->pl_stats, LDLM_POOL_CANCEL_RATE_STAT,
                            cancel_rate / recalc_interval_sec);

        /* Correct the old @grant_plan, which may be obsolete under heavy
         * server load, when the pools thread is not scheduled every 1s sharp
         * (the current period). All values used in the calculation are updated
         * from other threads and are up to date. Only @grant_plan is
         * calculated by the pools thread and directly affects SLV. */
        grant_plan += grant_speed - (grant_speed / recalc_interval_sec);
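        /* E.g. (illustrative numbers only): if the thread was delayed so that
         * recalc_interval_sec = 3 and grant_speed accumulated to 300 over
         * those 3 seconds, grant_plan absorbs 300 - 300 / 3 = 200, i.e. all
         * but the last second's worth of the accumulated speed. */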

        if ((slv_factor = limit - (granted - grant_plan)) <= 0)
                slv_factor = 1;

        grant_plan = granted + ((limit - granted) * grant_step) / 100;
        slv = (slv * ((slv_factor * 100) / limit));
        /* Undo the x100 scaling used above to keep the factor integral. */
        slv = dru(slv, 100);

        if (slv > ldlm_pool_slv_max(limit)) {
                CDEBUG(D_DLMTRACE, "Correcting SLV to allowed max "LPU64"\n",
                       ldlm_pool_slv_max(limit));
                slv = ldlm_pool_slv_max(limit);
        } else if (slv < ldlm_pool_slv_min(limit)) {
                CDEBUG(D_DLMTRACE, "Correcting SLV to allowed min "LPU64"\n",
                       ldlm_pool_slv_min(limit));
                slv = ldlm_pool_slv_min(limit);
        }

        ldlm_pool_set_slv(pl, slv);
        atomic_set(&pl->pl_grant_plan, grant_plan);
        pl->pl_update_time = cfs_time_current();
        spin_unlock(&pl->pl_lock);
        return 0;
}

/* Our goal here is to decrease SLV in such a way that a client will hold
 * @nr fewer locks over the next 10h. */
static int ldlm_srv_pool_shrink(struct ldlm_pool *pl,
                                int nr, unsigned int gfp_mask)
{
        __u32 granted, limit;
        __u64 slv_delta;

        /* The client may already have canceled its locks while the server is
         * in the shrinker and has nothing left to cancel. Catch this race. */
        if ((granted = atomic_read(&pl->pl_granted)) == 0)
                return 0;

        spin_lock(&pl->pl_lock);

        /* A simple proportion, but it gives an idea of how much SLV should be
         * changed for the requested @nr locks to be canceled. */
        slv_delta = nr * ldlm_pool_get_slv(pl);
        limit = ldlm_pool_get_limit(pl);
        do_div(slv_delta, granted);
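        /* E.g. (illustrative numbers only): with SLV = 1000000, 50000 granted
         * locks and a request to shrink nr = 5000 locks, slv_delta =
         * 5000 * 1000000 / 50000 = 100000, i.e. SLV drops by 10%. */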

        /* As SLV depends on historical data (that is, the new value is based
         * on the old one), this decrease will make clients return some locks
         * to the server, and after some time the situation will stabilize. */
        if (slv_delta < ldlm_pool_get_slv(pl))
                ldlm_pool_set_slv(pl, ldlm_pool_get_slv(pl) - slv_delta);
        else
                ldlm_pool_set_slv(pl, ldlm_pool_slv_min(limit));
        spin_unlock(&pl->pl_lock);

        /* We have not actually freed any memory here so far; it may only be
         * freed later. Return 0 so as not to confuse the VM. */
        return 0;
}

static int ldlm_cli_pool_recalc(struct ldlm_pool *pl)
{
        int grant_rate, cancel_rate;
        time_t recalc_interval_sec;

        spin_lock(&pl->pl_lock);
        grant_rate = atomic_read(&pl->pl_grant_rate);
        cancel_rate = atomic_read(&pl->pl_cancel_rate);

        recalc_interval_sec = cfs_duration_sec(cfs_time_current() -
                                               pl->pl_update_time);
        if (recalc_interval_sec == 0)
                recalc_interval_sec = 1;

        lprocfs_counter_add(pl->pl_stats, LDLM_POOL_SLV_STAT,
                            ldlm_pool_get_slv(pl));
        lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANTED_STAT,
                            atomic_read(&pl->pl_granted));
        lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANT_RATE_STAT,
                            grant_rate / recalc_interval_sec);
        lprocfs_counter_add(pl->pl_stats, LDLM_POOL_CANCEL_RATE_STAT,
                            cancel_rate / recalc_interval_sec);

        spin_unlock(&pl->pl_lock);

        ldlm_cancel_lru(ldlm_pl2ns(pl), 0, LDLM_ASYNC);
        return 0;
}

static int ldlm_cli_pool_shrink(struct ldlm_pool *pl,
                                int nr, unsigned int gfp_mask)
{
        RETURN(ldlm_cancel_lru(ldlm_pl2ns(pl), nr, LDLM_SYNC));
}

int ldlm_pool_recalc(struct ldlm_pool *pl)
{
        if (pl->pl_recalc != NULL && pool_recalc_enabled(pl))
                return pl->pl_recalc(pl);
        return 0;
}
EXPORT_SYMBOL(ldlm_pool_recalc);

int ldlm_pool_shrink(struct ldlm_pool *pl, int nr,
                     unsigned int gfp_mask)
{
        if (pl->pl_shrink != NULL && pool_shrink_enabled(pl)) {
                CDEBUG(D_DLMTRACE, "%s: request to shrink %d locks\n",
                       pl->pl_name, nr);
                return pl->pl_shrink(pl, nr, gfp_mask);
        }
        return 0;
}
EXPORT_SYMBOL(ldlm_pool_shrink);

/* Re-set up the limit and the maximal allowed SLV according to the passed
 * @limit. */
int ldlm_pool_setup(struct ldlm_pool *pl, __u32 limit)
{
        if (ldlm_pl2ns(pl)->ns_client == LDLM_NAMESPACE_SERVER) {
                spin_lock(&pl->pl_lock);
                ldlm_pool_set_limit(pl, limit);
                spin_unlock(&pl->pl_lock);
        }
        return 0;
}
EXPORT_SYMBOL(ldlm_pool_setup);

#ifdef __KERNEL__
static int lprocfs_rd_pool_state(char *page, char **start, off_t off,
                                 int count, int *eof, void *data)
{
        int nr = 0, granted, grant_rate, cancel_rate;
        int grant_speed, grant_plan, grant_step;
        struct ldlm_pool *pl = data;
        __u64 slv;
        __u32 limit;

        spin_lock(&pl->pl_lock);
        slv = pl->pl_server_lock_volume;
        limit = ldlm_pool_get_limit(pl);
        granted = atomic_read(&pl->pl_granted);
        grant_rate = atomic_read(&pl->pl_grant_rate);
        cancel_rate = atomic_read(&pl->pl_cancel_rate);
        grant_speed = atomic_read(&pl->pl_grant_speed);
        grant_plan = atomic_read(&pl->pl_grant_plan);
        grant_step = atomic_read(&pl->pl_grant_step);
        spin_unlock(&pl->pl_lock);

        nr += snprintf(page + nr, count - nr, "LDLM pool state (%s):\n",
                       pl->pl_name);
        nr += snprintf(page + nr, count - nr, " SLV: "LPU64"\n", slv);
        if (ldlm_pl2ns(pl)->ns_client == LDLM_NAMESPACE_SERVER) {
                nr += snprintf(page + nr, count - nr, " GSP: %d%%\n",
                               grant_step);
                nr += snprintf(page + nr, count - nr, " GP: %d\n",
                               grant_plan);
        } else {
                nr += snprintf(page + nr, count - nr, " LVF: %d\n",
                               atomic_read(&pl->pl_lock_volume_factor));
        }
        nr += snprintf(page + nr, count - nr, " GR: %d\n", grant_rate);
        nr += snprintf(page + nr, count - nr, " CR: %d\n", cancel_rate);
        nr += snprintf(page + nr, count - nr, " GS: %d\n", grant_speed);
        nr += snprintf(page + nr, count - nr, " G: %d\n", granted);
        nr += snprintf(page + nr, count - nr, " L: %d\n", limit);
        return nr;
}

static int ldlm_pool_proc_init(struct ldlm_pool *pl)
{
        struct ldlm_namespace *ns = ldlm_pl2ns(pl);
        struct proc_dir_entry *parent_ns_proc;
        struct lprocfs_vars pool_vars[2];
        char *var_name = NULL;
        int rc = 0;

        OBD_ALLOC(var_name, MAX_STRING_SIZE + 1);
        if (var_name == NULL)
                return -ENOMEM;

        parent_ns_proc = lprocfs_srch(ldlm_ns_proc_dir, ns->ns_name);
        if (parent_ns_proc == NULL) {
                CERROR("%s: proc entry is not initialized\n",
                       ns->ns_name);
                GOTO(out_free_name, rc = -EINVAL);
        }

        pl->pl_proc_dir = lprocfs_register("pool", parent_ns_proc,
                                           NULL, NULL);
        if (IS_ERR(pl->pl_proc_dir)) {
                CERROR("LProcFS failed in ldlm-pool-init\n");
                rc = PTR_ERR(pl->pl_proc_dir);
                GOTO(out_free_name, rc);
        }

        var_name[MAX_STRING_SIZE] = '\0';
        memset(pool_vars, 0, sizeof(pool_vars));
        pool_vars[0].name = var_name;

        snprintf(var_name, MAX_STRING_SIZE, "server_lock_volume");
        pool_vars[0].data = &pl->pl_server_lock_volume;
        pool_vars[0].read_fptr = lprocfs_rd_u64;
        lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);

        snprintf(var_name, MAX_STRING_SIZE, "limit");
        pool_vars[0].data = &pl->pl_limit;
        pool_vars[0].read_fptr = lprocfs_rd_atomic;
        pool_vars[0].write_fptr = lprocfs_wr_atomic;
        lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);

        snprintf(var_name, MAX_STRING_SIZE, "granted");
        pool_vars[0].data = &pl->pl_granted;
        pool_vars[0].read_fptr = lprocfs_rd_atomic;
        lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);

        snprintf(var_name, MAX_STRING_SIZE, "control");
        pool_vars[0].data = &pl->pl_control;
        pool_vars[0].read_fptr = lprocfs_rd_uint;
        pool_vars[0].write_fptr = lprocfs_wr_uint;
        lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);

        snprintf(var_name, MAX_STRING_SIZE, "grant_speed");
        pool_vars[0].data = &pl->pl_grant_speed;
        pool_vars[0].read_fptr = lprocfs_rd_atomic;
        lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);

        snprintf(var_name, MAX_STRING_SIZE, "cancel_rate");
        pool_vars[0].data = &pl->pl_cancel_rate;
        pool_vars[0].read_fptr = lprocfs_rd_atomic;
        lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);

        snprintf(var_name, MAX_STRING_SIZE, "grant_rate");
        pool_vars[0].data = &pl->pl_grant_rate;
        pool_vars[0].read_fptr = lprocfs_rd_atomic;
        lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);

        if (ns->ns_client == LDLM_NAMESPACE_SERVER) {
                snprintf(var_name, MAX_STRING_SIZE, "grant_plan");
                pool_vars[0].data = &pl->pl_grant_plan;
                pool_vars[0].read_fptr = lprocfs_rd_atomic;
                lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);

                snprintf(var_name, MAX_STRING_SIZE, "grant_step");
                pool_vars[0].data = &pl->pl_grant_step;
                pool_vars[0].read_fptr = lprocfs_rd_atomic;
                pool_vars[0].write_fptr = lprocfs_wr_atomic;
                lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
        } else {
                snprintf(var_name, MAX_STRING_SIZE, "lock_volume_factor");
                pool_vars[0].data = &pl->pl_lock_volume_factor;
                pool_vars[0].read_fptr = lprocfs_rd_uint;
                pool_vars[0].write_fptr = lprocfs_wr_uint;
                lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
        }

        snprintf(var_name, MAX_STRING_SIZE, "state");
        pool_vars[0].data = pl;
        pool_vars[0].read_fptr = lprocfs_rd_pool_state;
        lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);

        pl->pl_stats = lprocfs_alloc_stats(LDLM_POOL_LAST_STAT -
                                           LDLM_POOL_GRANTED_STAT);
        if (pl->pl_stats == NULL)
                GOTO(out_free_name, rc = -ENOMEM);

        lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANTED_STAT,
                             LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
                             "granted", "locks");
        lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_RATE_STAT,
                             LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
                             "grant_rate", "locks/s");
        lprocfs_counter_init(pl->pl_stats, LDLM_POOL_CANCEL_RATE_STAT,
                             LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
                             "cancel_rate", "locks/s");
        lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_PLAN_STAT,
                             LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
                             "grant_plan", "locks/s");
        lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SLV_STAT,
                             LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
                             "slv", "slv");
        lprocfs_register_stats(pl->pl_proc_dir, "stats", pl->pl_stats);

out_free_name:
        OBD_FREE(var_name, MAX_STRING_SIZE + 1);
        return rc;
}

static void ldlm_pool_proc_fini(struct ldlm_pool *pl)
{
        if (pl->pl_stats != NULL) {
                lprocfs_free_stats(&pl->pl_stats);
                pl->pl_stats = NULL;
        }
        if (pl->pl_proc_dir != NULL) {
                lprocfs_remove(&pl->pl_proc_dir);
                pl->pl_proc_dir = NULL;
        }
}
#else /* !__KERNEL__ */
#define ldlm_pool_proc_init(pl) (0)
#define ldlm_pool_proc_fini(pl) while (0) {}
#endif

int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns,
                   int idx, ldlm_side_t client)
{
        int rc;

        spin_lock_init(&pl->pl_lock);
        atomic_set(&pl->pl_granted, 0);
        pl->pl_update_time = cfs_time_current();
        atomic_set(&pl->pl_lock_volume_factor, 1);

        atomic_set(&pl->pl_grant_rate, 0);
        atomic_set(&pl->pl_cancel_rate, 0);
        atomic_set(&pl->pl_grant_speed, 0);
        pl->pl_control = LDLM_POOL_CTL_FULL;
        atomic_set(&pl->pl_grant_step, LDLM_POOL_GSP);
        atomic_set(&pl->pl_grant_plan, LDLM_POOL_GP(LDLM_POOL_HOST_L));

        snprintf(pl->pl_name, sizeof(pl->pl_name), "ldlm-pool-%s-%d",
                 ns->ns_name, idx);

        if (client == LDLM_NAMESPACE_SERVER) {
                pl->pl_recalc = ldlm_srv_pool_recalc;
                pl->pl_shrink = ldlm_srv_pool_shrink;
                ldlm_pool_set_limit(pl, LDLM_POOL_HOST_L);
                ldlm_pool_set_slv(pl, ldlm_pool_slv_max(LDLM_POOL_HOST_L));
        } else {
                ldlm_pool_set_slv(pl, 1);
                ldlm_pool_set_limit(pl, 1);
                pl->pl_recalc = ldlm_cli_pool_recalc;
                pl->pl_shrink = ldlm_cli_pool_shrink;
        }

        rc = ldlm_pool_proc_init(pl);
        if (rc)
                return rc;

        CDEBUG(D_DLMTRACE, "Lock pool %s is initialized\n", pl->pl_name);

        return 0;
}
EXPORT_SYMBOL(ldlm_pool_init);

void ldlm_pool_fini(struct ldlm_pool *pl)
{
        ldlm_pool_proc_fini(pl);
        pl->pl_recalc = NULL;
        pl->pl_shrink = NULL;
}
EXPORT_SYMBOL(ldlm_pool_fini);

void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock)
{
        atomic_inc(&pl->pl_granted);
        atomic_inc(&pl->pl_grant_rate);
        atomic_inc(&pl->pl_grant_speed);
}
EXPORT_SYMBOL(ldlm_pool_add);

void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock)
{
        LASSERT(atomic_read(&pl->pl_granted) > 0);
        atomic_dec(&pl->pl_granted);
        atomic_inc(&pl->pl_cancel_rate);
        atomic_dec(&pl->pl_grant_speed);
}
EXPORT_SYMBOL(ldlm_pool_del);

/* The caller should hold ->pl_lock. */
__u64 ldlm_pool_get_slv(struct ldlm_pool *pl)
{
        return pl->pl_server_lock_volume;
}
EXPORT_SYMBOL(ldlm_pool_get_slv);

/* The caller should hold ->pl_lock. */
void ldlm_pool_set_slv(struct ldlm_pool *pl, __u64 slv)
{
        pl->pl_server_lock_volume = slv;
}
EXPORT_SYMBOL(ldlm_pool_set_slv);

__u32 ldlm_pool_get_limit(struct ldlm_pool *pl)
{
        return atomic_read(&pl->pl_limit);
}
EXPORT_SYMBOL(ldlm_pool_get_limit);

void ldlm_pool_set_limit(struct ldlm_pool *pl, __u32 limit)
{
        atomic_set(&pl->pl_limit, limit);
}
EXPORT_SYMBOL(ldlm_pool_set_limit);

/* The server side is only enabled for kernel space for now. */
#ifdef __KERNEL__

static int ldlm_pool_granted(struct ldlm_pool *pl)
{
        return atomic_read(&pl->pl_granted);
}

static struct ptlrpc_thread *ldlm_pools_thread;
static struct shrinker *ldlm_pools_shrinker;
static struct completion ldlm_pools_comp;

static int ldlm_pools_thread_main(void *arg)
{
        struct ptlrpc_thread *thread = (struct ptlrpc_thread *)arg;
        char *t_name = "ldlm_poold";

        cfs_daemonize(t_name);
        thread->t_flags = SVC_RUNNING;
        cfs_waitq_signal(&thread->t_ctl_waitq);

        CDEBUG(D_DLMTRACE, "%s: pool thread starting, process %d\n",
               t_name, cfs_curproc_pid());

        while (1) {
                __u32 nr_l = 0, nr_p = 0, l;
                struct ldlm_namespace *ns;
                struct l_wait_info lwi;
                int rc, equal = 0;

                /* Check all namespaces. */
                mutex_down(&ldlm_namespace_lock);
                list_for_each_entry(ns, &ldlm_namespace_list, ns_list_chain) {
                        if (ns->ns_appetite != LDLM_NAMESPACE_MODEST)
                                continue;

                        if (ns->ns_client == LDLM_NAMESPACE_SERVER) {
                                l = ldlm_pool_granted(&ns->ns_pool);

                                /* Set the modest pools limit equal to
                                 * their avg granted locks + 5%. */
                                l += dru(l * LDLM_POOLS_MODEST_MARGIN, 100);
                                ldlm_pool_setup(&ns->ns_pool, l);
                                nr_l += l;
                                nr_p++;
                        }

                        /* After setup is done - recalc the pool. */
                        rc = ldlm_pool_recalc(&ns->ns_pool);
                        if (rc)
                                CERROR("%s: pool recalculation error "
                                       "%d\n", ns->ns_pool.pl_name, rc);
                }

                if (nr_l >= 2 * (LDLM_POOL_HOST_L / 3)) {
                        CWARN("Modest pools eat out 2/3 of locks limit. %d of %lu. "
                              "Upgrade server!\n", nr_l, LDLM_POOL_HOST_L);
                        equal = 1;
                }

                list_for_each_entry(ns, &ldlm_namespace_list, ns_list_chain) {
                        if (!equal && ns->ns_appetite != LDLM_NAMESPACE_GREEDY)
                                continue;

                        if (ns->ns_client == LDLM_NAMESPACE_SERVER) {
                                if (equal) {
                                        /* In the case 2/3 of the locks are
                                         * eaten out by modest pools, we
                                         * re-setup an equal limit for _all_
                                         * pools. */
                                        l = LDLM_POOL_HOST_L /
                                            atomic_read(&ldlm_srv_namespace_nr);
                                } else {
                                        /* All the rest of the greedy pools
                                         * share the remaining locks in equal
                                         * parts. */
                                        l = (LDLM_POOL_HOST_L - nr_l) /
                                            (atomic_read(&ldlm_srv_namespace_nr) -
                                             nr_p);
                                }
                                ldlm_pool_setup(&ns->ns_pool, l);
                        }

                        /* After setup is done - recalc the pool. */
                        rc = ldlm_pool_recalc(&ns->ns_pool);
                        if (rc)
                                CERROR("%s: pool recalculation error "
                                       "%d\n", ns->ns_pool.pl_name, rc);
                }
                mutex_up(&ldlm_namespace_lock);

                /* Wait until the next check time, or until we're
                 * stopped. */
                lwi = LWI_TIMEOUT(cfs_time_seconds(LDLM_POOLS_THREAD_PERIOD),
                                  NULL, NULL);
                l_wait_event(thread->t_ctl_waitq, (thread->t_flags &
                             (SVC_STOPPING|SVC_EVENT)),
                             &lwi);

                if (thread->t_flags & SVC_STOPPING) {
                        thread->t_flags &= ~SVC_STOPPING;
                        break;
                } else if (thread->t_flags & SVC_EVENT) {
                        thread->t_flags &= ~SVC_EVENT;
                }
        }

        thread->t_flags = SVC_STOPPED;
        cfs_waitq_signal(&thread->t_ctl_waitq);

        CDEBUG(D_DLMTRACE, "%s: pool thread exiting, process %d\n",
               t_name, cfs_curproc_pid());

        complete_and_exit(&ldlm_pools_comp, 0);
}

static int ldlm_pools_thread_start(ldlm_side_t client)
{
        struct l_wait_info lwi = { 0 };
        int rc;

        if (ldlm_pools_thread != NULL)
                return -EALREADY;

        OBD_ALLOC_PTR(ldlm_pools_thread);
        if (ldlm_pools_thread == NULL)
                return -ENOMEM;

        ldlm_pools_thread->t_id = client;
        init_completion(&ldlm_pools_comp);
        cfs_waitq_init(&ldlm_pools_thread->t_ctl_waitq);

        /* CLONE_VM and CLONE_FILES just avoid a needless copy, because we
         * just drop the VM and FILES in ptlrpc_daemonize() right away. */
        rc = cfs_kernel_thread(ldlm_pools_thread_main, ldlm_pools_thread,
                               CLONE_VM | CLONE_FILES);
        if (rc < 0) {
                CERROR("Can't start pool thread, error %d\n",
                       rc);
                OBD_FREE(ldlm_pools_thread, sizeof(*ldlm_pools_thread));
                ldlm_pools_thread = NULL;
                return rc;
        }

        l_wait_event(ldlm_pools_thread->t_ctl_waitq,
                     (ldlm_pools_thread->t_flags & SVC_RUNNING), &lwi);
        return 0;
}

static void ldlm_pools_thread_stop(void)
{
        if (ldlm_pools_thread == NULL) {
                return;
        }

        ldlm_pools_thread->t_flags = SVC_STOPPING;
        cfs_waitq_signal(&ldlm_pools_thread->t_ctl_waitq);

        /* Make sure that the pools thread is finished before freeing @thread.
         * This fixes a possible race and oops caused by accessing freed memory
         * in the pools thread. */
        wait_for_completion(&ldlm_pools_comp);
        OBD_FREE_PTR(ldlm_pools_thread);
        ldlm_pools_thread = NULL;
}

int ldlm_pools_init(ldlm_side_t client)
{
        int rc;

        rc = ldlm_pools_thread_start(client);
        if (rc == 0)
                ldlm_pools_shrinker = set_shrinker(DEFAULT_SEEKS,
                                                   ldlm_pools_shrink);
        return rc;
}
EXPORT_SYMBOL(ldlm_pools_init);

void ldlm_pools_fini(void)
{
        if (ldlm_pools_shrinker != NULL) {
                remove_shrinker(ldlm_pools_shrinker);
                ldlm_pools_shrinker = NULL;
        }
        ldlm_pools_thread_stop();
}
EXPORT_SYMBOL(ldlm_pools_fini);

void ldlm_pools_wakeup(void)
{
        if (ldlm_pools_thread == NULL)
                return;
        ldlm_pools_thread->t_flags |= SVC_EVENT;
        cfs_waitq_signal(&ldlm_pools_thread->t_ctl_waitq);
}
EXPORT_SYMBOL(ldlm_pools_wakeup);

/* Cancel @nr locks from all namespaces (if possible). Returns the number of
 * cached locks remaining after the shrink is finished. Each namespace is asked
 * to cancel a share of @nr roughly proportional to the number of locks it
 * holds. */
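/* For example (illustrative numbers only): if two namespaces hold 300 and 100
 * granted locks and the VM asks to shrink nr = 40 locks, they will be asked to
 * cancel about 1 + 300 * 40 / 400 = 31 and 1 + 100 * 40 / 400 = 11 locks
 * respectively. */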
int ldlm_pools_shrink(int nr, unsigned int gfp_mask)
{
        struct ldlm_namespace *ns;
        int total = 0, cached = 0;

        if (nr != 0 && !(gfp_mask & __GFP_FS))
                return -1;

        CDEBUG(D_DLMTRACE, "request to shrink %d locks from all pools\n",
               nr);
        mutex_down(&ldlm_namespace_lock);
        list_for_each_entry(ns, &ldlm_namespace_list, ns_list_chain)
                total += ldlm_pool_granted(&ns->ns_pool);

        if (nr == 0 || total == 0) {
                mutex_up(&ldlm_namespace_lock);
                return total;
        }

        /* Check all namespaces. */
        list_for_each_entry(ns, &ldlm_namespace_list, ns_list_chain) {
                struct ldlm_pool *pl = &ns->ns_pool;
                int cancel, nr_locks;

                nr_locks = ldlm_pool_granted(&ns->ns_pool);
                cancel = 1 + nr_locks * nr / total;
                cancel = ldlm_pool_shrink(pl, cancel, gfp_mask);
                cached += ldlm_pool_granted(&ns->ns_pool);
        }
        mutex_up(&ldlm_namespace_lock);
        return cached;
}
EXPORT_SYMBOL(ldlm_pools_shrink);
#endif /* __KERNEL__ */

#else /* !HAVE_LRU_RESIZE_SUPPORT */

int ldlm_pool_setup(struct ldlm_pool *pl, __u32 limit)
{
        return 0;
}
EXPORT_SYMBOL(ldlm_pool_setup);

int ldlm_pool_recalc(struct ldlm_pool *pl)
{
        return 0;
}
EXPORT_SYMBOL(ldlm_pool_recalc);

int ldlm_pool_shrink(struct ldlm_pool *pl,
                     int nr, unsigned int gfp_mask)
{
        return 0;
}
EXPORT_SYMBOL(ldlm_pool_shrink);

int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns,
                   int idx, ldlm_side_t client)
{
        return 0;
}
EXPORT_SYMBOL(ldlm_pool_init);

void ldlm_pool_fini(struct ldlm_pool *pl)
{
}
EXPORT_SYMBOL(ldlm_pool_fini);

void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock)
{
}
EXPORT_SYMBOL(ldlm_pool_add);

void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock)
{
}
EXPORT_SYMBOL(ldlm_pool_del);

__u64 ldlm_pool_get_slv(struct ldlm_pool *pl)
{
        return 1;
}
EXPORT_SYMBOL(ldlm_pool_get_slv);

void ldlm_pool_set_slv(struct ldlm_pool *pl, __u64 slv)
{
}
EXPORT_SYMBOL(ldlm_pool_set_slv);

__u32 ldlm_pool_get_limit(struct ldlm_pool *pl)
{
        return 0;
}
EXPORT_SYMBOL(ldlm_pool_get_limit);

void ldlm_pool_set_limit(struct ldlm_pool *pl, __u32 limit)
{
}
EXPORT_SYMBOL(ldlm_pool_set_limit);

int ldlm_pools_init(ldlm_side_t client)
{
        return 0;
}
EXPORT_SYMBOL(ldlm_pools_init);

void ldlm_pools_fini(void)
{
}
EXPORT_SYMBOL(ldlm_pools_fini);

void ldlm_pools_wakeup(void)
{
}
EXPORT_SYMBOL(ldlm_pools_wakeup);
#endif /* HAVE_LRU_RESIZE_SUPPORT */