1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (c) 2007 Cluster File Systems, Inc.
5 * Author: Yury Umanets <umka@clusterfs.com>
7 * This file is part of the Lustre file system, http://www.lustre.org
8 * Lustre is a trademark of Cluster File Systems, Inc.
10 * You may have signed or agreed to another license before downloading
11 * this software. If so, you are bound by the terms and conditions
12 * of that agreement, and the following does not apply to you. See the
13 * LICENSE file included with this distribution for more information.
15 * If you did not agree to a different license, then this copy of Lustre
16 * is open source software; you can redistribute it and/or modify it
17 * under the terms of version 2 of the GNU General Public License as
18 * published by the Free Software Foundation.
20 * In either case, Lustre is distributed in the hope that it will be
21 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
22 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * license text for more details.
26 /* Idea of this code is rather simple. Each second, for each server namespace
27 * we have SLV - server lock volume which is calculated on current number of
28 * granted locks, grant speed for past period, etc - that is, locking load.
29 * This SLV number may be thought as a flow definition for simplicity. It is
30 * sent to clients with each occasion to let them know what is current load
31 * situation on the server. By default, at the beginning, SLV on server is
32 * set to the max value, which is calculated as follows: allow one client to
33 * hold all locks of limit ->pl_limit for 10h.
35 * Next, on clients, number of cached locks is not limited artificially in any
36 * way as it was before. Instead, client calculates CLV, that is, client lock
37 * volume for each lock and compares it with last SLV from the server. CLV is
38 * calculated as the number of locks in LRU * lock live time in seconds. If
39 * CLV > SLV - lock is canceled.
41 * Client has LVF, that is, lock volume factor which regulates how sensitive the
42 * client should be to the last SLV from the server. The higher LVF is, the more locks
43 * will be canceled on client. Default value for it is 1. Setting LVF to 2 means
44 * that client will cancel locks 2 times faster.
46 * Locks on a client will be canceled more intensively in these cases:
47 * (1) if SLV is smaller, that is, load is higher on the server;
48 * (2) client has a lot of locks (the more locks are held by client, the bigger
49 * chances that some of them should be canceled);
50 * (3) client has old locks (taken some time ago);
52 * Thus, according to flow paradigm that we use for better understanding SLV,
53 * CLV is the volume of particle in flow described by SLV. According to this,
54 * if flow is getting thinner, more and more particles become outside of it and
55 * as particles are locks, they should be canceled.
57 * General idea of this belongs to Vitaly Fertman (vitaly@clusterfs.com). Andreas
58 * Dilger (adilger@clusterfs.com) proposed few nice ideas like using LVF and many
59 * cleanups. The flow definition that allows easier understanding of the logic belongs
60 * to Nikita Danilov (nikita@clusterfs.com) as well as many cleanups and fixes.
61 * And design and implementation are done by Yury Umanets (umka@clusterfs.com).
63 * Glossary for terms used:
65 * pl_limit - Number of allowed locks in pool. Applies to server and client
68 * pl_granted - Number of granted locks (calculated);
69 * pl_grant_rate - Number of granted locks for last T (calculated);
70 * pl_cancel_rate - Number of canceled locks for last T (calculated);
71 * pl_grant_speed - Grant speed (GR - CR) for last T (calculated);
72 * pl_grant_plan - Planned number of granted locks for next T (calculated);
74 * pl_grant_step - Grant plan step, that is how ->pl_grant_plan
75 * will change in next T (tunable);
77 * pl_server_lock_volume - Current server lock volume (calculated);
79 * As it may be seen from list above, we have few possible tunables which may
80 * affect behavior much. They all may be modified via proc. However, they also
81 * give a possibility for constructing few pre-defined behavior policies. If
82 * none of predefines is suitable for a working pattern being used, new one may
83 * be "constructed" via proc tunables.
86 #define DEBUG_SUBSYSTEM S_LDLM
89 # include <lustre_dlm.h>
91 # include <liblustre.h>
92 # include <libcfs/kp30.h>
95 #include <obd_class.h>
96 #include <obd_support.h>
97 #include "ldlm_internal.h"
99 #ifdef HAVE_LRU_RESIZE_SUPPORT
101 /* 50 ldlm locks for 1MB of RAM. */
102 #define LDLM_POOL_HOST_L ((num_physpages >> (20 - PAGE_SHIFT)) * 50)
104 /* Default step in % for grant plan. */
105 #define LDLM_POOL_GSP (5)
107 /* LDLM_POOL_GSP% of all locks is default GP. */
108 #define LDLM_POOL_GP(L) ((L) * LDLM_POOL_GSP / 100)
110 /* Max age for locks on clients. */
111 #define LDLM_POOL_MAX_AGE (36000)
114 extern cfs_proc_dir_entry_t *ldlm_ns_proc_dir;
/* Running average: overwrite @src with the mean of @src and @add.
 * NOTE: macro evaluates @src twice -- callers must pass a side-effect-free
 * lvalue. */
117 #define avg(src, add) \
118 ((src) = ((src) + (add)) / 2)
/* Round-up ("divide rounding up") 64-bit division helper. The do_div() call
 * and return are elided from this listing. */
120 static inline __u64 dru(__u64 val, __u32 div)
122 __u64 ret = val + (div - 1);
/* Upper bound for SLV: let one client keep all @L locks for
 * LDLM_POOL_MAX_AGE (10h) seconds.
 * NOTE(review): @L is __u32 and LDLM_POOL_MAX_AGE is a plain int, so
 * L * LDLM_POOL_MAX_AGE is computed in 32-bit arithmetic and may wrap
 * before being widened to __u64 -- confirm a (__u64) cast is applied
 * upstream. */
127 static inline __u64 ldlm_pool_slv_max(__u32 L)
129 /* Allow to have all locks for 1 client for 10 hrs.
130 * Formula is the following: limit * 10h / 1 client. */
131 __u64 lim = L * LDLM_POOL_MAX_AGE / 1;
/* Lower bound for SLV (body elided in this listing). */
135 static inline __u64 ldlm_pool_slv_min(__u32 L)
/* Indices of the per-pool lprocfs statistics counters (the enclosing enum
 * declaration is elided in this listing). */
141 LDLM_POOL_GRANTED_STAT = 0,
142 LDLM_POOL_GRANT_RATE_STAT,
143 LDLM_POOL_CANCEL_RATE_STAT,
144 LDLM_POOL_GRANT_PLAN_STAT,
/* Map a pool back to its owning namespace; the pool is embedded in
 * struct ldlm_namespace as ->ns_pool, hence container_of(). */
149 static inline struct ldlm_namespace *ldlm_pl2ns(struct ldlm_pool *pl)
151 return container_of(pl, struct ldlm_namespace, ns_pool);
154 /* Should be called under ->pl_lock taken */
/* Recompute ->pl_grant_plan: current granted count plus ->pl_grant_step
 * percent of the remaining headroom (limit - granted). */
155 static inline void ldlm_pool_recalc_grant_plan(struct ldlm_pool *pl)
157 int grant_plan, granted;
160 limit = ldlm_pool_get_limit(pl);
161 granted = atomic_read(&pl->pl_granted);
/* pl_grant_step is a percentage, hence the final / 100. */
163 grant_plan = granted + ((limit - granted) *
164 atomic_read(&pl->pl_grant_step)) / 100;
165 atomic_set(&pl->pl_grant_plan, grant_plan);
168 /* Should be called under ->pl_lock taken */
/* Recompute the server lock volume (SLV). slv_factor measures how far
 * the granted count stays below plan relative to the limit; SLV is then
 * scaled by (slv_factor * 100) / limit and clamped into
 * [slv_min, slv_max]. The branch body for slv_factor <= 0 is elided in
 * this listing -- presumably it pins slv_factor to a small positive
 * value; confirm against the full source. */
169 static inline void ldlm_pool_recalc_slv(struct ldlm_pool *pl)
171 int slv_factor, granted, grant_plan;
175 slv = ldlm_pool_get_slv(pl);
176 limit = ldlm_pool_get_limit(pl);
177 granted = atomic_read(&pl->pl_granted);
178 grant_plan = atomic_read(&pl->pl_grant_plan);
180 if ((slv_factor = limit - (granted - grant_plan)) <= 0)
183 slv = (slv * ((slv_factor * 100) / limit));
/* Clamp the new SLV into the allowed [min, max] band. */
186 if (slv > ldlm_pool_slv_max(limit)) {
187 slv = ldlm_pool_slv_max(limit);
188 } else if (slv < ldlm_pool_slv_min(limit)) {
189 slv = ldlm_pool_slv_min(limit);
192 ldlm_pool_set_slv(pl, slv);
/* Snapshot the current pool counters and fold them into the lprocfs
 * statistics (the value argument of each lprocfs_counter_add() call is
 * elided in this listing). Reads ->pl_server_lock_volume via
 * ldlm_pool_get_slv(), so callers should hold ->pl_lock. */
195 static inline void ldlm_pool_recalc_stats(struct ldlm_pool *pl)
197 __u64 slv = ldlm_pool_get_slv(pl);
198 __u32 granted = atomic_read(&pl->pl_granted);
199 __u32 grant_rate = atomic_read(&pl->pl_grant_rate);
200 __u32 grant_plan = atomic_read(&pl->pl_grant_plan);
201 __u32 cancel_rate = atomic_read(&pl->pl_cancel_rate);
203 lprocfs_counter_add(pl->pl_stats, LDLM_POOL_SLV_STAT,
205 lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANTED_STAT,
207 lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANT_RATE_STAT,
209 lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANT_PLAN_STAT,
211 lprocfs_counter_add(pl->pl_stats, LDLM_POOL_CANCEL_RATE_STAT,
/* Server-side periodic recalc: if at least one second elapsed since
 * ->pl_update_time (the second operand of the subtraction is elided in
 * this listing), refresh stats, recompute SLV and then the grant plan,
 * and reset the per-period rate counters. Runs under ->pl_lock. */
215 static int ldlm_srv_pool_recalc(struct ldlm_pool *pl)
217 time_t recalc_interval_sec;
220 spin_lock(&pl->pl_lock);
221 recalc_interval_sec = cfs_duration_sec(cfs_time_current() -
223 if (recalc_interval_sec > 0) {
224 /* Update statistics */
225 ldlm_pool_recalc_stats(pl);
227 /* Recalc SLV after last period. This should be done
228 * _before_ recalculating new grant plan. */
229 ldlm_pool_recalc_slv(pl);
231 /* Update grant_plan for new period. */
232 ldlm_pool_recalc_grant_plan(pl);
233 pl->pl_update_time = cfs_time_current();
235 /* Zero out all rates and speed for the last period. */
236 atomic_set(&pl->pl_grant_rate, 0);
237 atomic_set(&pl->pl_cancel_rate, 0);
238 atomic_set(&pl->pl_grant_speed, 0);
240 spin_unlock(&pl->pl_lock);
244 /* Our goal here is to decrease SLV the way to make a client hold
245 * @nr locks smaller in next 10h. */
/* Server-side shrinker callback: lower SLV in proportion to @nr/granted
 * so clients voluntarily cancel ~@nr locks. Returns 0 (no memory is
 * freed directly here -- see comment at the bottom). */
246 static int ldlm_srv_pool_shrink(struct ldlm_pool *pl,
247 int nr, unsigned int gfp_mask)
249 __u32 granted, limit;
253 /* Client already canceled locks but server is already in shrinker and
254 * can't cancel anything. Let's catch this race. */
255 if ((granted = atomic_read(&pl->pl_granted)) == 0)
258 spin_lock(&pl->pl_lock);
260 /* Simple proportion but it gives impression on how much should be
261 * SLV changed for request @nr of locks to be canceled.*/
262 slv_delta = nr * ldlm_pool_get_slv(pl);
263 limit = ldlm_pool_get_limit(pl);
264 do_div(slv_delta, granted);
266 /* As SLV has some dependence on historical data, that is new value
267 * is based on old one, this decreasing will make clients get some
268 * locks back to the server and after some time it will stabilize.*/
269 if (slv_delta < ldlm_pool_get_slv(pl))
270 ldlm_pool_set_slv(pl, ldlm_pool_get_slv(pl) - slv_delta);
/* Otherwise the delta would underflow SLV: floor it at slv_min. The
 * "else" keyword itself is elided in this listing. */
272 ldlm_pool_set_slv(pl, ldlm_pool_slv_min(limit));
273 spin_unlock(&pl->pl_lock);
275 /* We did not really free any memory here so far, it only will be
276 * freed later may be, so that we return 0 to not confuse VM. */
/* Client-side recalc: refresh stats and reset per-period counters once
 * per second (under ->pl_lock), then unconditionally kick asynchronous
 * LRU cancellation, which compares each lock's CLV against the last SLV
 * received from the server. */
280 static int ldlm_cli_pool_recalc(struct ldlm_pool *pl)
282 time_t recalc_interval_sec;
285 spin_lock(&pl->pl_lock);
287 recalc_interval_sec = cfs_duration_sec(cfs_time_current() -
289 if (recalc_interval_sec > 0) {
290 /* Update statistics only every T */
291 ldlm_pool_recalc_stats(pl);
293 /* Zero out grant/cancel rates and speed for last period. */
294 atomic_set(&pl->pl_grant_rate, 0);
295 atomic_set(&pl->pl_cancel_rate, 0);
296 atomic_set(&pl->pl_grant_speed, 0);
298 spin_unlock(&pl->pl_lock);
300 /* Recalc client pool is done without taking into account pl_update_time
301 * as this may be called voluntary in the case of emergency. Client
302 * recalc does not calculate anything, we do not risk to have skew
303 * of some pool param. */
304 ldlm_cancel_lru(ldlm_pl2ns(pl), 0, LDLM_ASYNC);
/* Client-side shrinker callback: synchronously cancel up to @nr locks
 * from this namespace's LRU. @gfp_mask is unused here. */
308 static int ldlm_cli_pool_shrink(struct ldlm_pool *pl,
309 int nr, unsigned int gfp_mask)
312 RETURN(ldlm_cancel_lru(ldlm_pl2ns(pl), nr, LDLM_SYNC));
/* Dispatch to the side-specific recalc callback (srv or cli variant set
 * up in ldlm_pool_init()) when recalc is enabled for this pool. */
315 int ldlm_pool_recalc(struct ldlm_pool *pl)
317 if (pl->pl_recalc != NULL && pool_recalc_enabled(pl))
318 return pl->pl_recalc(pl);
321 EXPORT_SYMBOL(ldlm_pool_recalc);
/* Dispatch to the side-specific shrink callback when shrinking is
 * enabled for this pool. */
323 int ldlm_pool_shrink(struct ldlm_pool *pl, int nr,
324 unsigned int gfp_mask)
326 if (pl->pl_shrink != NULL && pool_shrink_enabled(pl)) {
327 CDEBUG(D_DLMTRACE, "%s: request to shrink %d locks\n",
329 return pl->pl_shrink(pl, nr, gfp_mask);
333 EXPORT_SYMBOL(ldlm_pool_shrink);
335 /* The purpose of this function is to re-setup limit and maximal allowed
336 * slv according to the passed limit. */
/* Only server namespaces carry a meaningful limit; client pools keep
 * the dummy limit set in ldlm_pool_init(). */
337 int ldlm_pool_setup(struct ldlm_pool *pl, __u32 limit)
340 if (ldlm_pl2ns(pl)->ns_client == LDLM_NAMESPACE_SERVER)
341 ldlm_pool_set_limit(pl, limit);
344 EXPORT_SYMBOL(ldlm_pool_setup);
/* /proc read handler for the pool "state" file: snapshot all pool
 * counters under ->pl_lock, then format them into @page outside the
 * lock. Returns the number of bytes written (return statement elided in
 * this listing). */
347 static int lprocfs_rd_pool_state(char *page, char **start, off_t off,
348 int count, int *eof, void *data)
350 __u32 granted, grant_rate, cancel_rate, grant_step;
351 int nr = 0, grant_speed, grant_plan;
352 struct ldlm_pool *pl = data;
/* Take a consistent snapshot first; printing happens unlocked. */
356 spin_lock(&pl->pl_lock);
357 slv = ldlm_pool_get_slv(pl);
358 limit = ldlm_pool_get_limit(pl);
359 granted = atomic_read(&pl->pl_granted);
360 grant_rate = atomic_read(&pl->pl_grant_rate);
361 grant_plan = atomic_read(&pl->pl_grant_plan);
362 grant_step = atomic_read(&pl->pl_grant_step);
363 grant_speed = atomic_read(&pl->pl_grant_speed);
364 cancel_rate = atomic_read(&pl->pl_cancel_rate);
365 spin_unlock(&pl->pl_lock);
367 nr += snprintf(page + nr, count - nr, "LDLM pool state (%s):\n",
369 nr += snprintf(page + nr, count - nr, " SLV: "LPU64"\n", slv);
/* LVF is only meaningful on the client side. */
371 if (ldlm_pl2ns(pl)->ns_client == LDLM_NAMESPACE_CLIENT) {
372 nr += snprintf(page + nr, count - nr, " LVF: %d\n",
373 atomic_read(&pl->pl_lock_volume_factor));
375 nr += snprintf(page + nr, count - nr, " GSP: %d%%\n",
377 nr += snprintf(page + nr, count - nr, " GP: %d\n",
379 nr += snprintf(page + nr, count - nr, " GR: %d\n",
381 nr += snprintf(page + nr, count - nr, " CR: %d\n",
383 nr += snprintf(page + nr, count - nr, " GS: %d\n",
385 nr += snprintf(page + nr, count - nr, " G: %d\n",
387 nr += snprintf(page + nr, count - nr, " L: %d\n",
/* Create the per-pool /proc hierarchy: a "pool" directory under the
 * namespace's proc entry, one file per tunable/counter, plus an lprocfs
 * stats file. Returns 0 on success or a negative errno via the
 * out_free_name cleanup path. */
392 static int ldlm_pool_proc_init(struct ldlm_pool *pl)
394 struct ldlm_namespace *ns = ldlm_pl2ns(pl);
395 struct proc_dir_entry *parent_ns_proc;
396 struct lprocfs_vars pool_vars[2];
397 char *var_name = NULL;
401 OBD_ALLOC(var_name, MAX_STRING_SIZE + 1);
405 parent_ns_proc = lprocfs_srch(ldlm_ns_proc_dir, ns->ns_name);
406 if (parent_ns_proc == NULL) {
407 CERROR("%s: proc entry is not initialized\n",
409 GOTO(out_free_name, rc = -EINVAL);
411 pl->pl_proc_dir = lprocfs_register("pool", parent_ns_proc,
413 if (IS_ERR(pl->pl_proc_dir)) {
414 CERROR("LProcFS failed in ldlm-pool-init\n");
415 rc = PTR_ERR(pl->pl_proc_dir);
416 GOTO(out_free_name, rc);
/* pool_vars[1] stays zeroed by the memset below and acts as the
 * terminating entry for each lprocfs_add_vars() call; pool_vars[0] is
 * re-filled for every proc file.
 * NOTE(review): pool_vars[0] is memset only once, so fields like
 * ->write_fptr set for one entry (e.g. "limit", "control") carry over
 * to the following entries unless explicitly overwritten -- confirm
 * this does not make read-only files ("granted", "grant_speed", ...)
 * unintentionally writable. */
419 var_name[MAX_STRING_SIZE] = '\0';
420 memset(pool_vars, 0, sizeof(pool_vars));
421 pool_vars[0].name = var_name;
423 snprintf(var_name, MAX_STRING_SIZE, "server_lock_volume");
424 pool_vars[0].data = &pl->pl_server_lock_volume;
425 pool_vars[0].read_fptr = lprocfs_rd_u64;
426 lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
428 snprintf(var_name, MAX_STRING_SIZE, "limit");
429 pool_vars[0].data = &pl->pl_limit;
430 pool_vars[0].read_fptr = lprocfs_rd_atomic;
431 pool_vars[0].write_fptr = lprocfs_wr_atomic;
432 lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
434 snprintf(var_name, MAX_STRING_SIZE, "granted");
435 pool_vars[0].data = &pl->pl_granted;
436 pool_vars[0].read_fptr = lprocfs_rd_atomic;
437 lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
439 snprintf(var_name, MAX_STRING_SIZE, "control");
440 pool_vars[0].data = &pl->pl_control;
441 pool_vars[0].read_fptr = lprocfs_rd_uint;
442 pool_vars[0].write_fptr = lprocfs_wr_uint;
443 lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
445 snprintf(var_name, MAX_STRING_SIZE, "grant_speed");
446 pool_vars[0].data = &pl->pl_grant_speed;
447 pool_vars[0].read_fptr = lprocfs_rd_atomic;
448 lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
450 snprintf(var_name, MAX_STRING_SIZE, "cancel_rate");
451 pool_vars[0].data = &pl->pl_cancel_rate;
452 pool_vars[0].read_fptr = lprocfs_rd_atomic;
453 lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
455 snprintf(var_name, MAX_STRING_SIZE, "grant_rate");
456 pool_vars[0].data = &pl->pl_grant_rate;
457 pool_vars[0].read_fptr = lprocfs_rd_atomic;
458 lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
460 snprintf(var_name, MAX_STRING_SIZE, "grant_plan");
461 pool_vars[0].data = &pl->pl_grant_plan;
462 pool_vars[0].read_fptr = lprocfs_rd_atomic;
463 lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
/* grant_step is writable only on the server side. */
465 snprintf(var_name, MAX_STRING_SIZE, "grant_step");
466 pool_vars[0].data = &pl->pl_grant_step;
467 pool_vars[0].read_fptr = lprocfs_rd_atomic;
468 if (ns->ns_client == LDLM_NAMESPACE_SERVER)
469 pool_vars[0].write_fptr = lprocfs_wr_atomic;
470 lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
/* LVF exists only on client namespaces. */
472 if (ns->ns_client == LDLM_NAMESPACE_CLIENT) {
473 snprintf(var_name, MAX_STRING_SIZE, "lock_volume_factor");
474 pool_vars[0].data = &pl->pl_lock_volume_factor;
475 pool_vars[0].read_fptr = lprocfs_rd_uint;
476 pool_vars[0].write_fptr = lprocfs_wr_uint;
477 lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
480 snprintf(var_name, MAX_STRING_SIZE, "state");
481 pool_vars[0].data = pl;
482 pool_vars[0].read_fptr = lprocfs_rd_pool_state;
483 lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
/* Allocate and register the lprocfs statistics counters. */
485 pl->pl_stats = lprocfs_alloc_stats(LDLM_POOL_LAST_STAT -
486 LDLM_POOL_GRANTED_STAT, 0);
488 GOTO(out_free_name, rc = -ENOMEM);
490 lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANTED_STAT,
491 LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
493 lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_RATE_STAT,
494 LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
495 "grant_rate", "locks/s");
496 lprocfs_counter_init(pl->pl_stats, LDLM_POOL_CANCEL_RATE_STAT,
497 LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
498 "cancel_rate", "locks/s");
499 lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_PLAN_STAT,
500 LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
501 "grant_plan", "locks/s");
502 lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SLV_STAT,
503 LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
505 lprocfs_register_stats(pl->pl_proc_dir, "stats", pl->pl_stats);
/* Cleanup label: var_name is a scratch buffer and is always freed. */
509 OBD_FREE(var_name, MAX_STRING_SIZE + 1);
/* Tear down the per-pool /proc files and stats created by
 * ldlm_pool_proc_init(). Safe to call when init partially failed --
 * both members are NULL-checked. */
513 static void ldlm_pool_proc_fini(struct ldlm_pool *pl)
515 if (pl->pl_stats != NULL) {
516 lprocfs_free_stats(&pl->pl_stats);
519 if (pl->pl_proc_dir != NULL) {
520 lprocfs_remove(&pl->pl_proc_dir);
521 pl->pl_proc_dir = NULL;
/* Userspace (liblustre) build: proc support compiled out.
 * NOTE(review): the _fini stub uses "while (0) {}" instead of the
 * conventional "do {} while (0)" -- harmless as a statement but it does
 * not compose safely in un-braced if/else; consider fixing upstream. */
524 #else /* !__KERNEL__*/
525 #define ldlm_pool_proc_init(pl) (0)
526 #define ldlm_pool_proc_fini(pl) while (0) {}
/* Initialize one pool embedded in namespace @ns. @idx is used only in
 * the pool name; @client selects the server or client behavior: servers
 * get real limit/SLV and the srv callbacks, clients get dummy values
 * (their volume control comes from the server via SLV) and the cli
 * callbacks. Finishes by creating the /proc files. */
529 int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns,
530 int idx, ldlm_side_t client)
535 spin_lock_init(&pl->pl_lock);
536 atomic_set(&pl->pl_granted, 0);
537 pl->pl_update_time = cfs_time_current();
538 atomic_set(&pl->pl_lock_volume_factor, 1);
540 atomic_set(&pl->pl_grant_rate, 0);
541 atomic_set(&pl->pl_cancel_rate, 0);
542 atomic_set(&pl->pl_grant_speed, 0);
543 pl->pl_control = LDLM_POOL_CTL_FULL;
544 atomic_set(&pl->pl_grant_step, LDLM_POOL_GSP);
545 atomic_set(&pl->pl_grant_plan, LDLM_POOL_GP(LDLM_POOL_HOST_L));
547 snprintf(pl->pl_name, sizeof(pl->pl_name), "ldlm-pool-%s-%d",
550 if (client == LDLM_NAMESPACE_SERVER) {
551 pl->pl_recalc = ldlm_srv_pool_recalc;
552 pl->pl_shrink = ldlm_srv_pool_shrink;
553 ldlm_pool_set_limit(pl, LDLM_POOL_HOST_L);
554 ldlm_pool_set_slv(pl, ldlm_pool_slv_max(LDLM_POOL_HOST_L));
/* Client side ("else" branch; keyword elided in this listing):
 * SLV/limit are placeholders until the first server reply. */
556 ldlm_pool_set_slv(pl, 1);
557 ldlm_pool_set_limit(pl, 1);
558 pl->pl_recalc = ldlm_cli_pool_recalc;
559 pl->pl_shrink = ldlm_cli_pool_shrink;
562 rc = ldlm_pool_proc_init(pl);
566 CDEBUG(D_DLMTRACE, "Lock pool %s is initialized\n", pl->pl_name);
570 EXPORT_SYMBOL(ldlm_pool_init);
/* Undo ldlm_pool_init(): remove /proc files and clear the callbacks so
 * later recalc/shrink calls become no-ops. */
572 void ldlm_pool_fini(struct ldlm_pool *pl)
575 ldlm_pool_proc_fini(pl);
576 pl->pl_recalc = NULL;
577 pl->pl_shrink = NULL;
580 EXPORT_SYMBOL(ldlm_pool_fini);
/* Account a newly granted lock: bump granted count, grant rate and
 * grant speed; trigger a recalc on server pools. @lock itself is not
 * inspected here. */
582 void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock)
585 atomic_inc(&pl->pl_granted);
586 atomic_inc(&pl->pl_grant_rate);
587 atomic_inc(&pl->pl_grant_speed);
589 /* No need to recalc client pools here as this is already done
590 * on enqueue/cancel and locks to cancel already packed to the
592 if (ldlm_pl2ns(pl)->ns_client == LDLM_NAMESPACE_SERVER)
593 ldlm_pool_recalc(pl);
596 EXPORT_SYMBOL(ldlm_pool_add);
/* Account a canceled/released lock: mirror of ldlm_pool_add() -- drop
 * granted count and grant speed, bump the cancel rate. */
598 void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock)
601 LASSERT(atomic_read(&pl->pl_granted) > 0);
602 atomic_dec(&pl->pl_granted);
603 atomic_inc(&pl->pl_cancel_rate);
604 atomic_dec(&pl->pl_grant_speed);
606 /* Same as in ldlm_pool_add() */
607 if (ldlm_pl2ns(pl)->ns_client == LDLM_NAMESPACE_SERVER)
608 ldlm_pool_recalc(pl);
611 EXPORT_SYMBOL(ldlm_pool_del);
613 /* ->pl_lock should be taken. */
/* Read the current server lock volume. Plain (non-atomic) __u64 read,
 * hence the lock requirement above. */
614 __u64 ldlm_pool_get_slv(struct ldlm_pool *pl)
616 return pl->pl_server_lock_volume;
618 EXPORT_SYMBOL(ldlm_pool_get_slv);
620 /* ->pl_lock should be taken. */
/* Write the server lock volume; same locking contract as the getter. */
621 void ldlm_pool_set_slv(struct ldlm_pool *pl, __u64 slv)
623 pl->pl_server_lock_volume = slv;
625 EXPORT_SYMBOL(ldlm_pool_set_slv);
/* Limit accessors use atomic_t, so no ->pl_lock is needed here. */
627 __u32 ldlm_pool_get_limit(struct ldlm_pool *pl)
629 return atomic_read(&pl->pl_limit);
631 EXPORT_SYMBOL(ldlm_pool_get_limit);
633 void ldlm_pool_set_limit(struct ldlm_pool *pl, __u32 limit)
635 atomic_set(&pl->pl_limit, limit);
637 EXPORT_SYMBOL(ldlm_pool_set_limit);
639 /* Server side is only enabled for kernel space for now. */
/* Snapshot of the pool's granted-lock counter. */
641 static int ldlm_pool_granted(struct ldlm_pool *pl)
643 return atomic_read(&pl->pl_granted);
/* Single pools thread shared by all namespaces, its two memory
 * shrinkers (server and client side) and the completion used to wait
 * for thread exit in ldlm_pools_thread_stop(). */
646 static struct ptlrpc_thread *ldlm_pools_thread;
647 static struct shrinker *ldlm_pools_srv_shrinker;
648 static struct shrinker *ldlm_pools_cli_shrinker;
649 static struct completion ldlm_pools_comp;
/* Kick the pools thread out of its periodic sleep (no-op if the thread
 * was never started). */
651 void ldlm_pools_wakeup(void)
654 if (ldlm_pools_thread == NULL)
656 ldlm_pools_thread->t_flags |= SVC_EVENT;
657 cfs_waitq_signal(&ldlm_pools_thread->t_ctl_waitq);
660 EXPORT_SYMBOL(ldlm_pools_wakeup);
662 /* Cancel @nr locks from all namespaces (if possible). Returns number of
663 * cached locks after shrink is finished. All namespaces are asked to
664 * cancel approximately equal amount of locks. */
665 static int ldlm_pools_shrink(ldlm_side_t client, int nr,
666 unsigned int gfp_mask)
668 int total = 0, cached = 0, nr_ns;
669 struct ldlm_namespace *ns;
/* Refuse real shrinking from non-__GFP_FS contexts: canceling locks
 * may re-enter the filesystem. */
671 if (nr != 0 && !(gfp_mask & __GFP_FS))
674 CDEBUG(D_DLMTRACE, "request to shrink %d %s locks from all pools\n",
675 nr, client == LDLM_NAMESPACE_CLIENT ? "client" : "server");
677 /* Find out how many resources we may release. */
678 mutex_down(ldlm_namespace_lock(client));
679 list_for_each_entry(ns, ldlm_namespace_list(client), ns_list_chain)
680 total += ldlm_pool_granted(&ns->ns_pool);
681 mutex_up(ldlm_namespace_lock(client));
683 if (nr == 0 || total == 0)
686 /* Shrink at least ldlm_namespace_nr(client) namespaces. */
687 for (nr_ns = atomic_read(ldlm_namespace_nr(client));
690 int cancel, nr_locks;
692 /* Do not call shrink under ldlm_namespace_lock(client) */
693 mutex_down(ldlm_namespace_lock(client));
694 if (list_empty(ldlm_namespace_list(client))) {
695 mutex_up(ldlm_namespace_lock(client));
696 /* If list is empty, we can't return any @cached > 0,
697 * that probably would cause needless shrinker
/* Take a ref on the first namespace and rotate it to the list tail
 * so successive iterations visit each namespace in round-robin. */
702 ns = ldlm_namespace_first(client);
703 ldlm_namespace_get(ns);
704 ldlm_namespace_move(ns, client);
705 mutex_up(ldlm_namespace_lock(client));
/* Each namespace cancels its proportional share of @nr.
 * NOTE(review): nr_locks * nr is int arithmetic and could overflow
 * for very large lock counts -- confirm acceptable ranges. */
707 nr_locks = ldlm_pool_granted(&ns->ns_pool);
708 cancel = 1 + nr_locks * nr / total;
709 ldlm_pool_shrink(&ns->ns_pool, cancel, gfp_mask);
710 cached += ldlm_pool_granted(&ns->ns_pool);
711 ldlm_namespace_put(ns, 1);
/* Thin VM-shrinker adapters binding each side to ldlm_pools_shrink(). */
716 static int ldlm_pools_srv_shrink(int nr, unsigned int gfp_mask)
718 return ldlm_pools_shrink(LDLM_NAMESPACE_SERVER, nr, gfp_mask);
721 static int ldlm_pools_cli_shrink(int nr, unsigned int gfp_mask)
723 return ldlm_pools_shrink(LDLM_NAMESPACE_CLIENT, nr, gfp_mask);
/* Periodic rebalance of lock limits across all namespaces on @client
 * side: modest namespaces get a limit based on their average granted
 * count (server side only), greedy namespaces split the remaining
 * LDLM_POOL_HOST_L budget equally, then every pool is recalculated. */
726 void ldlm_pools_recalc(ldlm_side_t client)
728 __u32 nr_l = 0, nr_p = 0, l;
729 struct ldlm_namespace *ns;
730 int rc, nr, equal = 0;
732 /* Check all modest namespaces. */
733 mutex_down(ldlm_namespace_lock(client));
734 list_for_each_entry(ns, ldlm_namespace_list(client), ns_list_chain) {
735 if (ns->ns_appetite != LDLM_NAMESPACE_MODEST)
738 if (client == LDLM_NAMESPACE_SERVER) {
739 l = ldlm_pool_granted(&ns->ns_pool);
743 /* Set the modest pools limit equal to their avg granted
745 l += dru(l * LDLM_POOLS_MODEST_MARGIN, 100);
746 ldlm_pool_setup(&ns->ns_pool, l);
752 /* Make sure that modest namespaces did not eat more than 2/3 of limit */
753 if (nr_l >= 2 * (LDLM_POOL_HOST_L / 3)) {
754 CWARN("Modest pools eat out 2/3 of locks limit. %d of %lu. "
755 "Upgrade server!\n", nr_l, LDLM_POOL_HOST_L);
759 /* The rest is given to greedy namespaces. */
760 list_for_each_entry(ns, ldlm_namespace_list(client), ns_list_chain) {
761 if (!equal && ns->ns_appetite != LDLM_NAMESPACE_GREEDY)
764 if (client == LDLM_NAMESPACE_SERVER) {
766 /* In the case 2/3 locks are eaten out by
767 * modest pools, we re-setup equal limit
768 * for _all_ pools. */
769 l = LDLM_POOL_HOST_L /
770 atomic_read(ldlm_namespace_nr(client));
772 /* All the rest of greedy pools will have
773 * all locks in equal parts.*/
774 l = (LDLM_POOL_HOST_L - nr_l) /
775 (atomic_read(ldlm_namespace_nr(client)) -
778 ldlm_pool_setup(&ns->ns_pool, l);
781 mutex_up(ldlm_namespace_lock(client));
783 /* Recalc at least ldlm_namespace_nr(client) namespaces. */
784 for (nr = atomic_read(ldlm_namespace_nr(client)); nr > 0; nr--) {
785 /* Lock the list, get first @ns in the list, getref, move it
786 * to the tail, unlock and call pool recalc. This way we avoid
787 * calling recalc under @ns lock what is really good as we get
788 * rid of potential deadlock on client nodes when canceling
789 * locks synchronously. */
790 mutex_down(ldlm_namespace_lock(client));
791 if (list_empty(ldlm_namespace_list(client))) {
792 mutex_up(ldlm_namespace_lock(client));
795 ns = ldlm_namespace_first(client);
796 ldlm_namespace_get(ns);
797 ldlm_namespace_move(ns, client);
798 mutex_up(ldlm_namespace_lock(client));
800 /* After setup is done - recalc the pool. */
801 rc = ldlm_pool_recalc(&ns->ns_pool);
803 CERROR("%s: pool recalculation error "
804 "%d\n", ns->ns_pool.pl_name, rc);
806 ldlm_namespace_put(ns, 1);
809 EXPORT_SYMBOL(ldlm_pools_recalc);
/* Body of the "ldlm_poold" kernel thread: daemonize, announce
 * SVC_RUNNING to the starter, then loop -- recalc all client and server
 * pools, sleep up to LDLM_POOLS_THREAD_PERIOD or until woken by
 * SVC_STOPPING/SVC_EVENT -- and finally signal completion so the
 * stopper can free the thread structure. */
811 static int ldlm_pools_thread_main(void *arg)
813 struct ptlrpc_thread *thread = (struct ptlrpc_thread *)arg;
814 char *t_name = "ldlm_poold";
817 cfs_daemonize(t_name);
818 thread->t_flags = SVC_RUNNING;
/* Wake the starter waiting in ldlm_pools_thread_start(). */
819 cfs_waitq_signal(&thread->t_ctl_waitq);
821 CDEBUG(D_DLMTRACE, "%s: pool thread starting, process %d\n",
822 t_name, cfs_curproc_pid());
825 struct l_wait_info lwi;
827 /* Recalc all pools on this tick. */
828 ldlm_pools_recalc(LDLM_NAMESPACE_CLIENT);
829 ldlm_pools_recalc(LDLM_NAMESPACE_SERVER);
831 /* Wait until the next check time, or until we're
833 lwi = LWI_TIMEOUT(cfs_time_seconds(LDLM_POOLS_THREAD_PERIOD),
835 l_wait_event(thread->t_ctl_waitq, (thread->t_flags &
836 (SVC_STOPPING|SVC_EVENT)),
/* SVC_STOPPING breaks the loop; SVC_EVENT (from
 * ldlm_pools_wakeup()) just forces an early recalc pass. */
839 if (thread->t_flags & SVC_STOPPING) {
840 thread->t_flags &= ~SVC_STOPPING;
842 } else if (thread->t_flags & SVC_EVENT) {
843 thread->t_flags &= ~SVC_EVENT;
847 thread->t_flags = SVC_STOPPED;
848 cfs_waitq_signal(&thread->t_ctl_waitq);
850 CDEBUG(D_DLMTRACE, "%s: pool thread exiting, process %d\n",
851 t_name, cfs_curproc_pid());
/* Pairs with wait_for_completion() in ldlm_pools_thread_stop(). */
853 complete_and_exit(&ldlm_pools_comp, 0);
/* Start the single pools thread if it is not running yet: allocate the
 * control structure, spawn the kernel thread and wait until it reports
 * SVC_RUNNING. On spawn failure the structure is freed and the global
 * pointer reset. */
856 static int ldlm_pools_thread_start(ldlm_side_t client)
858 struct l_wait_info lwi = { 0 };
/* Already started -- nothing to do (early-return body elided). */
862 if (ldlm_pools_thread != NULL)
865 OBD_ALLOC_PTR(ldlm_pools_thread);
866 if (ldlm_pools_thread == NULL)
869 ldlm_pools_thread->t_id = client;
870 init_completion(&ldlm_pools_comp);
871 cfs_waitq_init(&ldlm_pools_thread->t_ctl_waitq);
873 /* CLONE_VM and CLONE_FILES just avoid a needless copy, because we
874 * just drop the VM and FILES in ptlrpc_daemonize() right away. */
875 rc = cfs_kernel_thread(ldlm_pools_thread_main, ldlm_pools_thread,
876 CLONE_VM | CLONE_FILES);
878 CERROR("Can't start pool thread, error %d\n",
880 OBD_FREE(ldlm_pools_thread, sizeof(*ldlm_pools_thread));
881 ldlm_pools_thread = NULL;
/* Block until the new thread has set SVC_RUNNING. */
884 l_wait_event(ldlm_pools_thread->t_ctl_waitq,
885 (ldlm_pools_thread->t_flags & SVC_RUNNING), &lwi);
/* Ask the pools thread to stop and wait for it to fully exit before
 * freeing its control structure. */
889 static void ldlm_pools_thread_stop(void)
893 if (ldlm_pools_thread == NULL) {
898 ldlm_pools_thread->t_flags = SVC_STOPPING;
899 cfs_waitq_signal(&ldlm_pools_thread->t_ctl_waitq);
901 /* Make sure that pools thread is finished before freeing @thread.
902 * This fixes possible race and oops due to accessing freed memory
903 * in pools thread. */
904 wait_for_completion(&ldlm_pools_comp);
905 OBD_FREE_PTR(ldlm_pools_thread);
906 ldlm_pools_thread = NULL;
/* Module-level setup: start the pools thread and, on success paths
 * (elided here), register the server and client VM shrinkers. */
910 int ldlm_pools_init(ldlm_side_t client)
915 rc = ldlm_pools_thread_start(client);
917 ldlm_pools_srv_shrinker = set_shrinker(DEFAULT_SEEKS,
918 ldlm_pools_srv_shrink);
919 ldlm_pools_cli_shrinker = set_shrinker(DEFAULT_SEEKS,
920 ldlm_pools_cli_shrink);
924 EXPORT_SYMBOL(ldlm_pools_init);
/* Module-level teardown: unregister both shrinkers (NULL-safe) and stop
 * the pools thread. */
926 void ldlm_pools_fini(void)
928 if (ldlm_pools_srv_shrinker != NULL) {
929 remove_shrinker(ldlm_pools_srv_shrinker);
930 ldlm_pools_srv_shrinker = NULL;
932 if (ldlm_pools_cli_shrinker != NULL) {
933 remove_shrinker(ldlm_pools_cli_shrinker);
934 ldlm_pools_cli_shrinker = NULL;
936 ldlm_pools_thread_stop();
938 EXPORT_SYMBOL(ldlm_pools_fini);
939 #endif /* __KERNEL__ */
941 #else /* !HAVE_LRU_RESIZE_SUPPORT */
/* Stub implementations used when HAVE_LRU_RESIZE_SUPPORT is not
 * defined: every function keeps the public interface but does nothing
 * (the trivial bodies are elided in this listing). */
942 int ldlm_pool_setup(struct ldlm_pool *pl, __u32 limit)
946 EXPORT_SYMBOL(ldlm_pool_setup);
948 int ldlm_pool_recalc(struct ldlm_pool *pl)
952 EXPORT_SYMBOL(ldlm_pool_recalc);
954 int ldlm_pool_shrink(struct ldlm_pool *pl,
955 int nr, unsigned int gfp_mask)
959 EXPORT_SYMBOL(ldlm_pool_shrink);
961 int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns,
962 int idx, ldlm_side_t client)
966 EXPORT_SYMBOL(ldlm_pool_init);
968 void ldlm_pool_fini(struct ldlm_pool *pl)
972 EXPORT_SYMBOL(ldlm_pool_fini);
974 void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock)
978 EXPORT_SYMBOL(ldlm_pool_add);
980 void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock)
984 EXPORT_SYMBOL(ldlm_pool_del);
986 __u64 ldlm_pool_get_slv(struct ldlm_pool *pl)
990 EXPORT_SYMBOL(ldlm_pool_get_slv);
992 void ldlm_pool_set_slv(struct ldlm_pool *pl, __u64 slv)
996 EXPORT_SYMBOL(ldlm_pool_set_slv);
998 __u32 ldlm_pool_get_limit(struct ldlm_pool *pl)
1002 EXPORT_SYMBOL(ldlm_pool_get_limit);
1004 void ldlm_pool_set_limit(struct ldlm_pool *pl, __u32 limit)
1008 EXPORT_SYMBOL(ldlm_pool_set_limit);
1010 int ldlm_pools_init(ldlm_side_t client)
1014 EXPORT_SYMBOL(ldlm_pools_init);
1016 void ldlm_pools_fini(void)
1020 EXPORT_SYMBOL(ldlm_pools_fini);
1022 void ldlm_pools_wakeup(void)
1026 EXPORT_SYMBOL(ldlm_pools_wakeup);
1028 void ldlm_pools_recalc(ldlm_side_t client)
1032 EXPORT_SYMBOL(ldlm_pools_recalc);
1033 #endif /* HAVE_LRU_RESIZE_SUPPORT */