1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (c) 2007 Cluster File Systems, Inc.
5 * Author: Yury Umanets <umka@clusterfs.com>
7 * This file is part of the Lustre file system, http://www.lustre.org
8 * Lustre is a trademark of Cluster File Systems, Inc.
10 * You may have signed or agreed to another license before downloading
11 * this software. If so, you are bound by the terms and conditions
12 * of that agreement, and the following does not apply to you. See the
13 * LICENSE file included with this distribution for more information.
15 * If you did not agree to a different license, then this copy of Lustre
16 * is open source software; you can redistribute it and/or modify it
17 * under the terms of version 2 of the GNU General Public License as
18 * published by the Free Software Foundation.
20 * In either case, Lustre is distributed in the hope that it will be
21 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
22 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * license text for more details.
26 /* Idea of this code is rather simple. Each second, for each server namespace
27 * we have SLV - server lock volume which is calculated on current number of
28 * granted locks, grant speed for past period, etc - that is, locking load.
29 * This SLV number may be thought as a flow definition for simplicity. It is
30 * sent to clients with each occasion to let them know what is current load
31 * situation on the server. By default, at the beginning, SLV on server is
32 * set max value which is calculated as the following: allow to one client
33 * have all locks of limit ->pl_limit for 10h.
35 * Next, on clients, number of cached locks is not limited artificially in any
36 * way as it was before. Instead, client calculates CLV, that is, client lock
37 * volume for each lock and compares it with last SLV from the server. CLV is
38 * calculated as the number of locks in LRU * lock live time in seconds. If
39 * CLV > SLV - lock is canceled.
* Client has LVF, that is, lock volume factor, which regulates how sensitive
* the client should be to the last SLV from the server. The higher the LVF, the
* more locks will be canceled on the client. The default value is 1. Setting
* LVF to 2 means that the client will cancel locks 2 times faster.
46 * Locks on a client will be canceled more intensively in these cases:
47 * (1) if SLV is smaller, that is, load is higher on the server;
48 * (2) client has a lot of locks (the more locks are held by client, the bigger
49 * chances that some of them should be canceled);
50 * (3) client has old locks (taken some time ago);
52 * Thus, according to flow paradigm that we use for better understanding SLV,
53 * CLV is the volume of particle in flow described by SLV. According to this,
54 * if flow is getting thinner, more and more particles become outside of it and
55 * as particles are locks, they should be canceled.
* The general idea of this belongs to Vitaly Fertman (vitaly@clusterfs.com).
* Andreas Dilger (adilger@clusterfs.com) proposed a few nice ideas like using
* LVF and many cleanups. The flow definition, allowing easier understanding of
* the logic, belongs to Nikita Danilov (nikita@clusterfs.com) as well as many
* cleanups and fixes.
61 * And design and implementation are done by Yury Umanets (umka@clusterfs.com).
63 * Glossary for terms used:
65 * pl_limit - Number of allowed locks in pool. Applies to server and client
68 * pl_granted - Number of granted locks (calculated);
69 * pl_grant_rate - Number of granted locks for last T (calculated);
70 * pl_cancel_rate - Number of canceled locks for last T (calculated);
71 * pl_grant_speed - Grant speed (GR - CR) for last T (calculated);
72 * pl_grant_plan - Planned number of granted locks for next T (calculated);
74 * pl_grant_step - Grant plan step, that is how ->pl_grant_plan
75 * will change in next T (tunable);
77 * pl_server_lock_volume - Current server lock volume (calculated);
79 * As it may be seen from list above, we have few possible tunables which may
80 * affect behavior much. They all may be modified via proc. However, they also
81 * give a possibility for constructing few pre-defined behavior policies. If
82 * none of predefines is suitable for a working pattern being used, new one may
83 * be "constructed" via proc tunables.
86 #define DEBUG_SUBSYSTEM S_LDLM
89 # include <lustre_dlm.h>
91 # include <liblustre.h>
92 # include <libcfs/kp30.h>
95 #include <obd_class.h>
96 #include <obd_support.h>
97 #include "ldlm_internal.h"
99 #ifdef HAVE_LRU_RESIZE_SUPPORT
/* 50 ldlm locks for 1MB of RAM. Scales the default host lock limit with
 * the amount of physical memory present. */
#define LDLM_POOL_HOST_L ((num_physpages >> (20 - PAGE_SHIFT)) * 50)

/* Default step in % for grant plan. */
#define LDLM_POOL_GSP (5)

/* LDLM_POOL_GSP% of all locks is default GP. */
#define LDLM_POOL_GP(L) ((L) * LDLM_POOL_GSP / 100)

/* Max age for locks on clients, in seconds (36000s == 10h, matching the
 * "10 hrs" note in ldlm_pool_slv_max()). */
#define LDLM_POOL_MAX_AGE (36000)
114 extern cfs_proc_dir_entry_t *ldlm_ns_proc_dir;
/* Running average: fold @add into @src with equal weight,
 * i.e. src = (src + add) / 2. Note that @src is both read and written,
 * so it must not have side effects. */
#define avg(src, add) \
        ((src) = ((src) + (add)) / 2)
/* "div round up": the (div - 1) bias prepares @ret so that the subsequent
 * integer division by @div yields ceil(val / div) — see the round-up use
 * in ldlm_pools_recalc(). */
static inline __u64 dru(__u64 val, __u32 div)
        __u64 ret = val + (div - 1);
/* Upper bound for SLV: enough volume for one client to hold all @L locks
 * for LDLM_POOL_MAX_AGE (10h).
 * NOTE(review): L * LDLM_POOL_MAX_AGE is a 32-bit multiply that may
 * overflow before being widened to __u64 for very large limits — confirm. */
static inline __u64 ldlm_pool_slv_max(__u32 L)
        /* Allow to have all locks for 1 client for 10 hrs.
         * Formula is the following: limit * 10h / 1 client. */
        __u64 lim = L * LDLM_POOL_MAX_AGE / 1;
/* Lower bound for SLV; used to clamp recalculated SLV in
 * ldlm_pool_recalc_slv(). */
static inline __u64 ldlm_pool_slv_min(__u32 L)
        /* Indices of the pool lprocfs counters filled in by
         * ldlm_pool_recalc_stats(). */
        LDLM_POOL_GRANTED_STAT = 0,
        LDLM_POOL_GRANT_RATE_STAT,
        LDLM_POOL_CANCEL_RATE_STAT,
        LDLM_POOL_GRANT_PLAN_STAT,
149 static inline struct ldlm_namespace *ldlm_pl2ns(struct ldlm_pool *pl)
151 return container_of(pl, struct ldlm_namespace, ns_pool);
/* Should be called under ->pl_lock taken */
/* Recompute ->pl_grant_plan, the planned number of granted locks for the
 * next period: move ->pl_grant_step percent of the way from the current
 * granted count towards the pool limit. */
static inline void ldlm_pool_recalc_grant_plan(struct ldlm_pool *pl)
        int grant_plan, granted;
        limit = ldlm_pool_get_limit(pl);
        granted = atomic_read(&pl->pl_granted);
        /* plan = granted + grant_step% of the remaining headroom */
        grant_plan = granted + ((limit - granted) *
                     atomic_read(&pl->pl_grant_step)) / 100;
        atomic_set(&pl->pl_grant_plan, grant_plan);
/* Should be called under ->pl_lock taken */
/* Recompute the server lock volume (SLV): shrink it when more locks were
 * granted than planned (high load), and clamp the result to the
 * [slv_min, slv_max] window. */
static inline void ldlm_pool_recalc_slv(struct ldlm_pool *pl)
        int slv_factor, granted, grant_plan;
        slv = ldlm_pool_get_slv(pl);
        limit = ldlm_pool_get_limit(pl);
        granted = atomic_read(&pl->pl_granted);
        grant_plan = atomic_read(&pl->pl_grant_plan);
        /* factor < limit when granted count overshoots the plan */
        if ((slv_factor = limit - (granted - grant_plan)) <= 0)
        /* Scale SLV by factor/limit, in percent (integer math). */
        slv = (slv * ((slv_factor * 100) / limit));
        /* Clamp into the allowed window. */
        if (slv > ldlm_pool_slv_max(limit)) {
                slv = ldlm_pool_slv_max(limit);
        } else if (slv < ldlm_pool_slv_min(limit)) {
                slv = ldlm_pool_slv_min(limit);
        ldlm_pool_set_slv(pl, slv);
/* Snapshot the current pool values into the lprocfs stats counters
 * (one sample per recalc period). */
static inline void ldlm_pool_recalc_stats(struct ldlm_pool *pl)
        __u64 slv = ldlm_pool_get_slv(pl);
        __u32 granted = atomic_read(&pl->pl_granted);
        __u32 grant_rate = atomic_read(&pl->pl_grant_rate);
        __u32 grant_plan = atomic_read(&pl->pl_grant_plan);
        __u32 cancel_rate = atomic_read(&pl->pl_cancel_rate);
        lprocfs_counter_add(pl->pl_stats, LDLM_POOL_SLV_STAT,
        lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANTED_STAT,
        lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANT_RATE_STAT,
        lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANT_PLAN_STAT,
        lprocfs_counter_add(pl->pl_stats, LDLM_POOL_CANCEL_RATE_STAT,
/* Server-side pool recalc: at most once per period, update stats, recompute
 * SLV and the grant plan, then reset the per-period rate counters. */
static int ldlm_srv_pool_recalc(struct ldlm_pool *pl)
        time_t recalc_interval_sec;
        spin_lock(&pl->pl_lock);
        recalc_interval_sec = cfs_duration_sec(cfs_time_current() -
        if (recalc_interval_sec > 0) {
                /* Update statistics */
                ldlm_pool_recalc_stats(pl);
                /* Recalc SLV after last period. This should be done
                 * _before_ recalculating new grant plan. */
                ldlm_pool_recalc_slv(pl);
                /* Update grant_plan for new period. */
                ldlm_pool_recalc_grant_plan(pl);
                pl->pl_update_time = cfs_time_current();
                /* Zero out all rates and speed for the last period. */
                atomic_set(&pl->pl_grant_rate, 0);
                atomic_set(&pl->pl_cancel_rate, 0);
                atomic_set(&pl->pl_grant_speed, 0);
        spin_unlock(&pl->pl_lock);
/* Our goal here is to decrease SLV the way to make a client hold
 * @nr locks smaller in next 10h. */
/* Server-side shrink: lower SLV proportionally to @nr/granted so clients
 * start canceling; does not free memory directly. @gfp_mask is unused here. */
static int ldlm_srv_pool_shrink(struct ldlm_pool *pl,
                                int nr, unsigned int gfp_mask)
        __u32 granted, limit;
        /* Client already canceled locks but server is already in shrinker and
         * can't cancel anything. Let's catch this race. */
        if ((granted = atomic_read(&pl->pl_granted)) == 0)
        spin_lock(&pl->pl_lock);
        /* Simple proportion but it gives impression on how much should be
         * SLV changed for request @nr of locks to be canceled.*/
        slv_delta = nr * ldlm_pool_get_slv(pl);
        limit = ldlm_pool_get_limit(pl);
        do_div(slv_delta, granted);
        /* As SLV has some dependence on historical data, that is new value
         * is based on old one, this decreasing will make clients get some
         * locks back to the server and after some time it will stabilize.*/
        if (slv_delta < ldlm_pool_get_slv(pl))
                ldlm_pool_set_slv(pl, ldlm_pool_get_slv(pl) - slv_delta);
                /* otherwise drop SLV straight to its minimum */
                ldlm_pool_set_slv(pl, ldlm_pool_slv_min(limit));
        spin_unlock(&pl->pl_lock);
        /* We did not really free any memory here so far, it only will be
         * freed later may be, so that we return 0 to not confuse VM. */
/* Client-side pool recalc: once per period update stats and reset rate
 * counters, then always kick an async LRU cancel on the owning namespace. */
static int ldlm_cli_pool_recalc(struct ldlm_pool *pl)
        time_t recalc_interval_sec;
        spin_lock(&pl->pl_lock);
        recalc_interval_sec = cfs_duration_sec(cfs_time_current() -
        if (recalc_interval_sec > 0) {
                /* Update statistics only every T */
                ldlm_pool_recalc_stats(pl);
                /* Zero out grant/cancel rates and speed for last period. */
                atomic_set(&pl->pl_grant_rate, 0);
                atomic_set(&pl->pl_cancel_rate, 0);
                atomic_set(&pl->pl_grant_speed, 0);
        spin_unlock(&pl->pl_lock);
        /* Recalc client pool is done without taking into account pl_update_time
         * as this may be called voluntary in the case of emergency. Client
         * recalc does not calculate anything, we do not risk to have skew
         * of some pool param. */
        ldlm_cancel_lru(ldlm_pl2ns(pl), 0, LDLM_ASYNC);
/* Client-side shrink: synchronously cancel @nr locks from the namespace
 * LRU. @gfp_mask is unused on the client side. */
static int ldlm_cli_pool_shrink(struct ldlm_pool *pl,
                                int nr, unsigned int gfp_mask)
        RETURN(ldlm_cancel_lru(ldlm_pl2ns(pl), nr, LDLM_SYNC));
/* Public recalc entry: dispatch to the side-specific ->pl_recalc method
 * if it is set and recalc is enabled for this pool. */
int ldlm_pool_recalc(struct ldlm_pool *pl)
        if (pl->pl_recalc != NULL && pool_recalc_enabled(pl))
                return pl->pl_recalc(pl);
EXPORT_SYMBOL(ldlm_pool_recalc);
/* Public shrink entry: dispatch to the side-specific ->pl_shrink method
 * if it is set and shrinking is enabled for this pool. */
int ldlm_pool_shrink(struct ldlm_pool *pl, int nr,
                     unsigned int gfp_mask)
        if (pl->pl_shrink != NULL && pool_shrink_enabled(pl)) {
                CDEBUG(D_DLMTRACE, "%s: request to shrink %d locks\n",
                return pl->pl_shrink(pl, nr, gfp_mask);
EXPORT_SYMBOL(ldlm_pool_shrink);
/* The purpose of this function is to re-setup limit and maximal allowed
 * slv according to the passed limit. */
/* Only meaningful on the server side; clients receive SLV/limit from the
 * server instead. */
int ldlm_pool_setup(struct ldlm_pool *pl, __u32 limit)
        if (ldlm_pl2ns(pl)->ns_client == LDLM_NAMESPACE_SERVER) {
                spin_lock(&pl->pl_lock);
                ldlm_pool_set_limit(pl, limit);
                spin_unlock(&pl->pl_lock);
EXPORT_SYMBOL(ldlm_pool_setup);
/* lprocfs read handler for the pool "state" file: print a human-readable
 * snapshot of all pool parameters into @page. */
static int lprocfs_rd_pool_state(char *page, char **start, off_t off,
                                 int count, int *eof, void *data)
        __u32 granted, grant_rate, cancel_rate, grant_step;
        int nr = 0, grant_speed, grant_plan;
        struct ldlm_pool *pl = data;
        /* Take a consistent snapshot under ->pl_lock. */
        spin_lock(&pl->pl_lock);
        slv = ldlm_pool_get_slv(pl);
        limit = ldlm_pool_get_limit(pl);
        granted = atomic_read(&pl->pl_granted);
        grant_rate = atomic_read(&pl->pl_grant_rate);
        grant_plan = atomic_read(&pl->pl_grant_plan);
        grant_step = atomic_read(&pl->pl_grant_step);
        grant_speed = atomic_read(&pl->pl_grant_speed);
        cancel_rate = atomic_read(&pl->pl_cancel_rate);
        spin_unlock(&pl->pl_lock);
        nr += snprintf(page + nr, count - nr, "LDLM pool state (%s):\n",
        nr += snprintf(page + nr, count - nr, " SLV: "LPU64"\n", slv);
        /* LVF is only meaningful on the client side. */
        if (ldlm_pl2ns(pl)->ns_client == LDLM_NAMESPACE_CLIENT) {
                nr += snprintf(page + nr, count - nr, " LVF: %d\n",
                               atomic_read(&pl->pl_lock_volume_factor));
        nr += snprintf(page + nr, count - nr, " GSP: %d%%\n",
        nr += snprintf(page + nr, count - nr, " GP: %d\n",
        nr += snprintf(page + nr, count - nr, " GR: %d\n",
        nr += snprintf(page + nr, count - nr, " CR: %d\n",
        nr += snprintf(page + nr, count - nr, " GS: %d\n",
        nr += snprintf(page + nr, count - nr, " G: %d\n",
        nr += snprintf(page + nr, count - nr, " L: %d\n",
/* Create the per-pool proc tree under the namespace proc dir: a "pool"
 * directory with one file per tunable/observable, plus a stats file.
 * Returns 0 on success or a negative errno; frees @var_name on all paths. */
static int ldlm_pool_proc_init(struct ldlm_pool *pl)
        struct ldlm_namespace *ns = ldlm_pl2ns(pl);
        struct proc_dir_entry *parent_ns_proc;
        struct lprocfs_vars pool_vars[2];
        char *var_name = NULL;
        OBD_ALLOC(var_name, MAX_STRING_SIZE + 1);
        /* The pool dir hangs off the already-registered namespace entry. */
        parent_ns_proc = lprocfs_srch(ldlm_ns_proc_dir, ns->ns_name);
        if (parent_ns_proc == NULL) {
                CERROR("%s: proc entry is not initialized\n",
                GOTO(out_free_name, rc = -EINVAL);
        pl->pl_proc_dir = lprocfs_register("pool", parent_ns_proc,
        if (IS_ERR(pl->pl_proc_dir)) {
                CERROR("LProcFS failed in ldlm-pool-init\n");
                rc = PTR_ERR(pl->pl_proc_dir);
                GOTO(out_free_name, rc);
        /* pool_vars[0] is reused for every file; var_name is the shared
         * name buffer (always NUL-terminated). */
        var_name[MAX_STRING_SIZE] = '\0';
        memset(pool_vars, 0, sizeof(pool_vars));
        pool_vars[0].name = var_name;
        /* Read-only: current SLV. */
        snprintf(var_name, MAX_STRING_SIZE, "server_lock_volume");
        pool_vars[0].data = &pl->pl_server_lock_volume;
        pool_vars[0].read_fptr = lprocfs_rd_u64;
        lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
        /* Read-write tunable: pool limit. */
        snprintf(var_name, MAX_STRING_SIZE, "limit");
        pool_vars[0].data = &pl->pl_limit;
        pool_vars[0].read_fptr = lprocfs_rd_atomic;
        pool_vars[0].write_fptr = lprocfs_wr_atomic;
        lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
        snprintf(var_name, MAX_STRING_SIZE, "granted");
        pool_vars[0].data = &pl->pl_granted;
        pool_vars[0].read_fptr = lprocfs_rd_atomic;
        lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
        /* Read-write: pool control policy. */
        snprintf(var_name, MAX_STRING_SIZE, "control");
        pool_vars[0].data = &pl->pl_control;
        pool_vars[0].read_fptr = lprocfs_rd_uint;
        pool_vars[0].write_fptr = lprocfs_wr_uint;
        lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
        snprintf(var_name, MAX_STRING_SIZE, "grant_speed");
        pool_vars[0].data = &pl->pl_grant_speed;
        pool_vars[0].read_fptr = lprocfs_rd_atomic;
        lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
        snprintf(var_name, MAX_STRING_SIZE, "cancel_rate");
        pool_vars[0].data = &pl->pl_cancel_rate;
        pool_vars[0].read_fptr = lprocfs_rd_atomic;
        lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
        snprintf(var_name, MAX_STRING_SIZE, "grant_rate");
        pool_vars[0].data = &pl->pl_grant_rate;
        pool_vars[0].read_fptr = lprocfs_rd_atomic;
        lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
        snprintf(var_name, MAX_STRING_SIZE, "grant_plan");
        pool_vars[0].data = &pl->pl_grant_plan;
        pool_vars[0].read_fptr = lprocfs_rd_atomic;
        lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
        /* grant_step is writable only on the server side. */
        snprintf(var_name, MAX_STRING_SIZE, "grant_step");
        pool_vars[0].data = &pl->pl_grant_step;
        pool_vars[0].read_fptr = lprocfs_rd_atomic;
        if (ns->ns_client == LDLM_NAMESPACE_SERVER)
                pool_vars[0].write_fptr = lprocfs_wr_atomic;
        lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
        /* LVF exists only on clients. */
        if (ns->ns_client == LDLM_NAMESPACE_CLIENT) {
                snprintf(var_name, MAX_STRING_SIZE, "lock_volume_factor");
                pool_vars[0].data = &pl->pl_lock_volume_factor;
                pool_vars[0].read_fptr = lprocfs_rd_uint;
                pool_vars[0].write_fptr = lprocfs_wr_uint;
                lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
        /* Aggregate human-readable state dump. */
        snprintf(var_name, MAX_STRING_SIZE, "state");
        pool_vars[0].data = pl;
        pool_vars[0].read_fptr = lprocfs_rd_pool_state;
        lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
        pl->pl_stats = lprocfs_alloc_stats(LDLM_POOL_LAST_STAT -
                                           LDLM_POOL_GRANTED_STAT, 0);
        GOTO(out_free_name, rc = -ENOMEM);
        lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANTED_STAT,
                             LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
        lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_RATE_STAT,
                             LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
                             "grant_rate", "locks/s");
        lprocfs_counter_init(pl->pl_stats, LDLM_POOL_CANCEL_RATE_STAT,
                             LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
                             "cancel_rate", "locks/s");
        lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_PLAN_STAT,
                             LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
                             "grant_plan", "locks/s");
        lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SLV_STAT,
                             LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
        lprocfs_register_stats(pl->pl_proc_dir, "stats", pl->pl_stats);
        OBD_FREE(var_name, MAX_STRING_SIZE + 1);
/* Tear down the pool stats and proc directory created by
 * ldlm_pool_proc_init(). */
static void ldlm_pool_proc_fini(struct ldlm_pool *pl)
        if (pl->pl_stats != NULL) {
                lprocfs_free_stats(&pl->pl_stats);
        if (pl->pl_proc_dir != NULL) {
                lprocfs_remove(&pl->pl_proc_dir);
                pl->pl_proc_dir = NULL;
527 #else /* !__KERNEL__*/
528 #define ldlm_pool_proc_init(pl) (0)
529 #define ldlm_pool_proc_fini(pl) while (0) {}
/* Initialize pool @pl for side @client (pool @idx of namespace @ns): set up
 * locking and counters, install the side-specific recalc/shrink methods,
 * initial SLV/limit, and register the proc files. Returns 0 or -errno. */
int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns,
                   int idx, ldlm_side_t client)
        spin_lock_init(&pl->pl_lock);
        atomic_set(&pl->pl_granted, 0);
        pl->pl_update_time = cfs_time_current();
        atomic_set(&pl->pl_lock_volume_factor, 1);
        atomic_set(&pl->pl_grant_rate, 0);
        atomic_set(&pl->pl_cancel_rate, 0);
        atomic_set(&pl->pl_grant_speed, 0);
        pl->pl_control = LDLM_POOL_CTL_FULL;
        atomic_set(&pl->pl_grant_step, LDLM_POOL_GSP);
        atomic_set(&pl->pl_grant_plan, LDLM_POOL_GP(LDLM_POOL_HOST_L));
        snprintf(pl->pl_name, sizeof(pl->pl_name), "ldlm-pool-%s-%d",
        if (client == LDLM_NAMESPACE_SERVER) {
                pl->pl_recalc = ldlm_srv_pool_recalc;
                pl->pl_shrink = ldlm_srv_pool_shrink;
                ldlm_pool_set_limit(pl, LDLM_POOL_HOST_L);
                ldlm_pool_set_slv(pl, ldlm_pool_slv_max(LDLM_POOL_HOST_L));
                /* Client pools start with nominal SLV/limit; the real
                 * values come from the server. */
                ldlm_pool_set_slv(pl, 1);
                ldlm_pool_set_limit(pl, 1);
                pl->pl_recalc = ldlm_cli_pool_recalc;
                pl->pl_shrink = ldlm_cli_pool_shrink;
        rc = ldlm_pool_proc_init(pl);
        CDEBUG(D_DLMTRACE, "Lock pool %s is initialized\n", pl->pl_name);
EXPORT_SYMBOL(ldlm_pool_init);
/* Finalize a pool: remove its proc files and clear the method pointers so
 * late recalc/shrink calls become no-ops. */
void ldlm_pool_fini(struct ldlm_pool *pl)
        ldlm_pool_proc_fini(pl);
        pl->pl_recalc = NULL;
        pl->pl_shrink = NULL;
EXPORT_SYMBOL(ldlm_pool_fini);
/* Account a newly granted @lock in @pl (granted count, grant rate and
 * speed) and, on the server side only, trigger a pool recalc. */
void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock)
        atomic_inc(&pl->pl_granted);
        atomic_inc(&pl->pl_grant_rate);
        atomic_inc(&pl->pl_grant_speed);
        /* No need to recalc client pools here as this is already done
         * on enqueue/cancel and locks to cancel already packed to the
        if (ldlm_pl2ns(pl)->ns_client == LDLM_NAMESPACE_SERVER)
                ldlm_pool_recalc(pl);
EXPORT_SYMBOL(ldlm_pool_add);
/* Account a canceled @lock in @pl (granted count, cancel rate, grant
 * speed) and, on the server side only, trigger a pool recalc. */
void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock)
        LASSERT(atomic_read(&pl->pl_granted) > 0);
        atomic_dec(&pl->pl_granted);
        atomic_inc(&pl->pl_cancel_rate);
        atomic_dec(&pl->pl_grant_speed);
        /* Same as in ldlm_pool_add() */
        if (ldlm_pl2ns(pl)->ns_client == LDLM_NAMESPACE_SERVER)
                ldlm_pool_recalc(pl);
EXPORT_SYMBOL(ldlm_pool_del);
616 /* ->pl_lock should be taken. */
617 __u64 ldlm_pool_get_slv(struct ldlm_pool *pl)
619 return pl->pl_server_lock_volume;
621 EXPORT_SYMBOL(ldlm_pool_get_slv);
623 /* ->pl_lock should be taken. */
624 void ldlm_pool_set_slv(struct ldlm_pool *pl, __u64 slv)
626 pl->pl_server_lock_volume = slv;
628 EXPORT_SYMBOL(ldlm_pool_set_slv);
630 __u32 ldlm_pool_get_limit(struct ldlm_pool *pl)
632 return atomic_read(&pl->pl_limit);
634 EXPORT_SYMBOL(ldlm_pool_get_limit);
636 void ldlm_pool_set_limit(struct ldlm_pool *pl, __u32 limit)
638 atomic_set(&pl->pl_limit, limit);
640 EXPORT_SYMBOL(ldlm_pool_set_limit);
642 /* Server side is only enabled for kernel space for now. */
644 static int ldlm_pool_granted(struct ldlm_pool *pl)
646 return atomic_read(&pl->pl_granted);
/* Single pools thread shared by both sides, the completion used to
 * synchronize its shutdown, and the registered VM shrinkers. */
static struct ptlrpc_thread *ldlm_pools_thread;
static struct shrinker *ldlm_pools_srv_shrinker;
static struct shrinker *ldlm_pools_cli_shrinker;
static struct completion ldlm_pools_comp;
/* Wake the pools thread (if it is running) to force an immediate recalc
 * instead of waiting for the next periodic tick. */
void ldlm_pools_wakeup(void)
        if (ldlm_pools_thread == NULL)
        ldlm_pools_thread->t_flags |= SVC_EVENT;
        cfs_waitq_signal(&ldlm_pools_thread->t_ctl_waitq);
EXPORT_SYMBOL(ldlm_pools_wakeup);
/* Cancel @nr locks from all namespaces (if possible). Returns number of
 * cached locks after shrink is finished. All namespaces are asked to
 * cancel approximately equal amount of locks. */
static int ldlm_pools_shrink(ldlm_side_t client, int nr,
                             unsigned int gfp_mask)
        int total = 0, cached = 0, nr_ns;
        struct ldlm_namespace *ns;
        /* Refuse real shrinking in contexts where FS recursion is unsafe. */
        if (nr != 0 && !(gfp_mask & __GFP_FS))
        CDEBUG(D_DLMTRACE, "request to shrink %d %s locks from all pools\n",
               nr, client == LDLM_NAMESPACE_CLIENT ? "client" : "server");
        /* Find out how many resources we may release. */
        mutex_down(ldlm_namespace_lock(client));
        list_for_each_entry(ns, ldlm_namespace_list(client), ns_list_chain)
                total += ldlm_pool_granted(&ns->ns_pool);
        mutex_up(ldlm_namespace_lock(client));
        if (nr == 0 || total == 0)
        /* Shrink at least ldlm_namespace_nr(client) namespaces. */
        for (nr_ns = atomic_read(ldlm_namespace_nr(client));
                int cancel, nr_locks;
                /* Do not call shrink under ldlm_namespace_lock(client) */
                mutex_down(ldlm_namespace_lock(client));
                if (list_empty(ldlm_namespace_list(client))) {
                        mutex_up(ldlm_namespace_lock(client));
                        /* If list is empty, we can't return any @cached > 0,
                         * that probably would cause needless shrinker
                ns = ldlm_namespace_first(client);
                ldlm_namespace_get(ns);
                ldlm_namespace_move(ns, client);
                mutex_up(ldlm_namespace_lock(client));
                nr_locks = ldlm_pool_granted(&ns->ns_pool);
                /* Each pool gets its proportional share of @nr; +1 so even
                 * tiny pools make progress. */
                cancel = 1 + nr_locks * nr / total;
                ldlm_pool_shrink(&ns->ns_pool, cancel, gfp_mask);
                cached += ldlm_pool_granted(&ns->ns_pool);
                ldlm_namespace_put(ns, 1);
719 static int ldlm_pools_srv_shrink(int nr, unsigned int gfp_mask)
721 return ldlm_pools_shrink(LDLM_NAMESPACE_SERVER, nr, gfp_mask);
724 static int ldlm_pools_cli_shrink(int nr, unsigned int gfp_mask)
726 return ldlm_pools_shrink(LDLM_NAMESPACE_CLIENT, nr, gfp_mask);
/* Redistribute the host lock limit over all namespaces of @client side
 * (modest namespaces get their granted count plus a margin, greedy ones
 * split the remainder equally), then run recalc on every pool. */
void ldlm_pools_recalc(ldlm_side_t client)
        __u32 nr_l = 0, nr_p = 0, l;
        struct ldlm_namespace *ns;
        int rc, nr, equal = 0;
        /* Check all modest namespaces. */
        mutex_down(ldlm_namespace_lock(client));
        list_for_each_entry(ns, ldlm_namespace_list(client), ns_list_chain) {
                if (ns->ns_appetite != LDLM_NAMESPACE_MODEST)
                if (client == LDLM_NAMESPACE_SERVER) {
                        l = ldlm_pool_granted(&ns->ns_pool);
                        /* Set the modest pools limit equal to their avg granted
                        l += dru(l * LDLM_POOLS_MODEST_MARGIN, 100);
                        ldlm_pool_setup(&ns->ns_pool, l);
        /* Make sure that modest namespaces did not eat more than 2/3 of limit */
        if (nr_l >= 2 * (LDLM_POOL_HOST_L / 3)) {
                CWARN("Modest pools eat out 2/3 of locks limit. %d of %lu. "
                      "Upgrade server!\n", nr_l, LDLM_POOL_HOST_L);
        /* The rest is given to greedy namespaces. */
        list_for_each_entry(ns, ldlm_namespace_list(client), ns_list_chain) {
                if (!equal && ns->ns_appetite != LDLM_NAMESPACE_GREEDY)
                if (client == LDLM_NAMESPACE_SERVER) {
                                /* In the case 2/3 locks are eaten out by
                                 * modest pools, we re-setup equal limit
                                 * for _all_ pools. */
                                l = LDLM_POOL_HOST_L /
                                    atomic_read(ldlm_namespace_nr(client));
                                /* All the rest of greedy pools will have
                                 * all locks in equal parts.*/
                                l = (LDLM_POOL_HOST_L - nr_l) /
                                    (atomic_read(ldlm_namespace_nr(client)) -
                        ldlm_pool_setup(&ns->ns_pool, l);
        mutex_up(ldlm_namespace_lock(client));
        /* Recalc at least ldlm_namespace_nr(client) namespaces. */
        for (nr = atomic_read(ldlm_namespace_nr(client)); nr > 0; nr--) {
                /* Lock the list, get first @ns in the list, getref, move it
                 * to the tail, unlock and call pool recalc. This way we avoid
                 * calling recalc under @ns lock what is really good as we get
                 * rid of potential deadlock on client nodes when canceling
                 * locks synchronously. */
                mutex_down(ldlm_namespace_lock(client));
                if (list_empty(ldlm_namespace_list(client))) {
                        mutex_up(ldlm_namespace_lock(client));
                ns = ldlm_namespace_first(client);
                ldlm_namespace_get(ns);
                ldlm_namespace_move(ns, client);
                mutex_up(ldlm_namespace_lock(client));
                /* After setup is done - recalc the pool. */
                rc = ldlm_pool_recalc(&ns->ns_pool);
                        CERROR("%s: pool recalculation error "
                               "%d\n", ns->ns_pool.pl_name, rc);
                ldlm_namespace_put(ns, 1);
EXPORT_SYMBOL(ldlm_pools_recalc);
/* Main loop of the pools thread: recalc all client and server pools every
 * LDLM_POOLS_THREAD_PERIOD seconds (or earlier on a wakeup event) until
 * asked to stop. */
static int ldlm_pools_thread_main(void *arg)
        struct ptlrpc_thread *thread = (struct ptlrpc_thread *)arg;
        char *t_name = "ldlm_poold";
        cfs_daemonize(t_name);
        thread->t_flags = SVC_RUNNING;
        /* Tell ldlm_pools_thread_start() we are up. */
        cfs_waitq_signal(&thread->t_ctl_waitq);
        CDEBUG(D_DLMTRACE, "%s: pool thread starting, process %d\n",
               t_name, cfs_curproc_pid());
                struct l_wait_info lwi;
                /* Recalc all pools on this tick. */
                ldlm_pools_recalc(LDLM_NAMESPACE_CLIENT);
                ldlm_pools_recalc(LDLM_NAMESPACE_SERVER);
                /* Wait until the next check time, or until we're
                lwi = LWI_TIMEOUT(cfs_time_seconds(LDLM_POOLS_THREAD_PERIOD),
                l_wait_event(thread->t_ctl_waitq, (thread->t_flags &
                             (SVC_STOPPING|SVC_EVENT)),
                if (thread->t_flags & SVC_STOPPING) {
                        thread->t_flags &= ~SVC_STOPPING;
                } else if (thread->t_flags & SVC_EVENT) {
                        /* Explicit wakeup: clear the event and loop again. */
                        thread->t_flags &= ~SVC_EVENT;
        thread->t_flags = SVC_STOPPED;
        cfs_waitq_signal(&thread->t_ctl_waitq);
        CDEBUG(D_DLMTRACE, "%s: pool thread exiting, process %d\n",
               t_name, cfs_curproc_pid());
        /* Signal ldlm_pools_thread_stop() that we are done. */
        complete_and_exit(&ldlm_pools_comp, 0);
/* Start the single shared pools thread and wait until it is running;
 * no-op if the thread already exists. */
static int ldlm_pools_thread_start(ldlm_side_t client)
        struct l_wait_info lwi = { 0 };
        if (ldlm_pools_thread != NULL)
        OBD_ALLOC_PTR(ldlm_pools_thread);
        if (ldlm_pools_thread == NULL)
        ldlm_pools_thread->t_id = client;
        init_completion(&ldlm_pools_comp);
        cfs_waitq_init(&ldlm_pools_thread->t_ctl_waitq);
        /* CLONE_VM and CLONE_FILES just avoid a needless copy, because we
         * just drop the VM and FILES in ptlrpc_daemonize() right away. */
        rc = cfs_kernel_thread(ldlm_pools_thread_main, ldlm_pools_thread,
                               CLONE_VM | CLONE_FILES);
                CERROR("Can't start pool thread, error %d\n",
                OBD_FREE(ldlm_pools_thread, sizeof(*ldlm_pools_thread));
                ldlm_pools_thread = NULL;
        /* Wait for the thread to signal SVC_RUNNING before returning. */
        l_wait_event(ldlm_pools_thread->t_ctl_waitq,
                     (ldlm_pools_thread->t_flags & SVC_RUNNING), &lwi);
/* Ask the pools thread to stop and wait until it has fully exited before
 * freeing its state. */
static void ldlm_pools_thread_stop(void)
        if (ldlm_pools_thread == NULL) {
        ldlm_pools_thread->t_flags = SVC_STOPPING;
        cfs_waitq_signal(&ldlm_pools_thread->t_ctl_waitq);
        /* Make sure that pools thread is finished before freeing @thread.
         * This fixes possible race and oops due to accessing freed memory
         * in pools thread. */
        wait_for_completion(&ldlm_pools_comp);
        OBD_FREE_PTR(ldlm_pools_thread);
        ldlm_pools_thread = NULL;
/* Module init for pools: start the pools thread and register the VM
 * shrinkers for the server and client sides. */
int ldlm_pools_init(ldlm_side_t client)
        rc = ldlm_pools_thread_start(client);
        ldlm_pools_srv_shrinker = set_shrinker(DEFAULT_SEEKS,
                                               ldlm_pools_srv_shrink);
        ldlm_pools_cli_shrinker = set_shrinker(DEFAULT_SEEKS,
                                               ldlm_pools_cli_shrink);
EXPORT_SYMBOL(ldlm_pools_init);
/* Module cleanup for pools: unregister both shrinkers, then stop the
 * pools thread. */
void ldlm_pools_fini(void)
        if (ldlm_pools_srv_shrinker != NULL) {
                remove_shrinker(ldlm_pools_srv_shrinker);
                ldlm_pools_srv_shrinker = NULL;
        if (ldlm_pools_cli_shrinker != NULL) {
                remove_shrinker(ldlm_pools_cli_shrinker);
                ldlm_pools_cli_shrinker = NULL;
        ldlm_pools_thread_stop();
EXPORT_SYMBOL(ldlm_pools_fini);
942 #endif /* __KERNEL__ */
944 #else /* !HAVE_LRU_RESIZE_SUPPORT */
/* HAVE_LRU_RESIZE_SUPPORT is not defined: provide no-op stubs with the
 * same external interface so callers need no conditional code. */
int ldlm_pool_setup(struct ldlm_pool *pl, __u32 limit)
EXPORT_SYMBOL(ldlm_pool_setup);
int ldlm_pool_recalc(struct ldlm_pool *pl)
EXPORT_SYMBOL(ldlm_pool_recalc);
int ldlm_pool_shrink(struct ldlm_pool *pl,
                     int nr, unsigned int gfp_mask)
EXPORT_SYMBOL(ldlm_pool_shrink);
int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns,
                   int idx, ldlm_side_t client)
EXPORT_SYMBOL(ldlm_pool_init);
void ldlm_pool_fini(struct ldlm_pool *pl)
EXPORT_SYMBOL(ldlm_pool_fini);
void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock)
EXPORT_SYMBOL(ldlm_pool_add);
void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock)
EXPORT_SYMBOL(ldlm_pool_del);
__u64 ldlm_pool_get_slv(struct ldlm_pool *pl)
EXPORT_SYMBOL(ldlm_pool_get_slv);
void ldlm_pool_set_slv(struct ldlm_pool *pl, __u64 slv)
EXPORT_SYMBOL(ldlm_pool_set_slv);
__u32 ldlm_pool_get_limit(struct ldlm_pool *pl)
EXPORT_SYMBOL(ldlm_pool_get_limit);
void ldlm_pool_set_limit(struct ldlm_pool *pl, __u32 limit)
EXPORT_SYMBOL(ldlm_pool_set_limit);
int ldlm_pools_init(ldlm_side_t client)
EXPORT_SYMBOL(ldlm_pools_init);
void ldlm_pools_fini(void)
EXPORT_SYMBOL(ldlm_pools_fini);
void ldlm_pools_wakeup(void)
EXPORT_SYMBOL(ldlm_pools_wakeup);
void ldlm_pools_recalc(ldlm_side_t client)
EXPORT_SYMBOL(ldlm_pools_recalc);
1036 #endif /* HAVE_LRU_RESIZE_SUPPORT */