1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (c) 2007 Cluster File Systems, Inc.
5 * Author: Yury Umanets <umka@clusterfs.com>
7 * This file is part of the Lustre file system, http://www.lustre.org
8 * Lustre is a trademark of Cluster File Systems, Inc.
10 * You may have signed or agreed to another license before downloading
11 * this software. If so, you are bound by the terms and conditions
12 * of that agreement, and the following does not apply to you. See the
13 * LICENSE file included with this distribution for more information.
15 * If you did not agree to a different license, then this copy of Lustre
16 * is open source software; you can redistribute it and/or modify it
17 * under the terms of version 2 of the GNU General Public License as
18 * published by the Free Software Foundation.
20 * In either case, Lustre is distributed in the hope that it will be
21 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
22 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * license text for more details.
 */
/* Idea of this code is rather simple. Each second, for each server namespace
 * we have SLV - server lock volume which is calculated on current number of
 * granted locks, grant speed for past period, etc - that is, locking load.
 * This SLV number may be thought of as a flow definition for simplicity. It
 * is sent to clients with each occasion to let them know what the current
 * load situation on the server is. By default, at the beginning, SLV on the
 * server is set to the maximum value, calculated as follows: allow one
 * client to hold all locks of limit ->pl_limit for 10h.
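 *
 * For example (numbers invented for illustration): with ->pl_limit = 100000
 * locks and LDLM_POOL_MAX_AGE = 36000 (10h in seconds), the initial SLV is
 * 100000 * 36000 / 1 = 3600000000; see ldlm_pool_slv_max() below.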
 *
 * Next, on clients, the number of cached locks is not limited artificially
 * in any way as it was before. Instead, the client calculates CLV, that is,
 * client lock volume, for each lock and compares it with the last SLV from
 * the server. CLV is calculated as the number of locks in LRU * lock live
 * time in seconds. If CLV > SLV, the lock is canceled.
 *
 * The client has LVF, that is, lock volume factor, which regulates how
 * sensitive the client should be about the last SLV from the server. The
 * higher LVF is, the more locks will be canceled on the client. The default
 * value is 1. Setting LVF to 2 means that the client will cancel locks
 * 2 times faster.
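 *
 * As an illustration (numbers invented; the exact LVF scaling is applied in
 * the LRU cancel path): with SLV = 20000 from the server and 500 locks in
 * the client LRU, a lock that has lived 50s has CLV = 500 * 50 = 25000 and
 * is canceled since CLV > SLV; at 30s of live time, CLV = 15000 and the
 * lock survives. With LVF = 2 the same lock is canceled in half the time.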
 *
 * Locks on a client will be canceled more intensively in these cases:
 * (1) if SLV is smaller, that is, load is higher on the server;
 * (2) client has a lot of locks (the more locks a client holds, the bigger
 *     the chance that some of them should be canceled);
 * (3) client has old locks (taken some time ago);
 *
 * Thus, according to the flow paradigm that we use for better understanding
 * of SLV, CLV is the volume of a particle in the flow described by SLV. If
 * the flow is getting thinner, more and more particles fall outside of it,
 * and as particles are locks, they should be canceled.
 *
 * The general idea of this belongs to Vitaly Fertman (vitaly@clusterfs.com).
 * Andreas Dilger (adilger@clusterfs.com) proposed a few nice ideas like using
 * LVF and many cleanups. The flow definition allowing easier understanding of
 * the logic belongs to Nikita Danilov (nikita@clusterfs.com), as well as many
 * cleanups and fixes. And design and implementation are done by Yury Umanets
 * (umka@clusterfs.com).
 *
63 * Glossary for terms used:
 * pl_limit - Number of allowed locks in pool. Applies to server and client
 * side (tunable);
68 * pl_granted - Number of granted locks (calculated);
69 * pl_grant_rate - Number of granted locks for last T (calculated);
70 * pl_cancel_rate - Number of canceled locks for last T (calculated);
71 * pl_grant_speed - Grant speed (GR - CR) for last T (calculated);
72 * pl_grant_plan - Planned number of granted locks for next T (calculated);
74 * pl_grant_step - Grant plan step, that is how ->pl_grant_plan
75 * will change in next T (tunable);
77 * pl_server_lock_volume - Current server lock volume (calculated);
 *
 * As may be seen from the list above, we have a few tunables which may
 * affect behavior much. They all may be modified via proc. However, they
 * also make it possible to construct several pre-defined behavior policies.
 * If none of the predefined policies suits the workload pattern being used,
 * a new one may be "constructed" via proc tunables.
 */
86 #define DEBUG_SUBSYSTEM S_LDLM
#ifdef __KERNEL__
# include <lustre_dlm.h>
#else
# include <liblustre.h>
# include <libcfs/kp30.h>
#endif
95 #include <obd_class.h>
96 #include <obd_support.h>
97 #include "ldlm_internal.h"
99 #ifdef HAVE_LRU_RESIZE_SUPPORT
101 /* 50 ldlm locks for 1MB of RAM. */
102 #define LDLM_POOL_HOST_L ((num_physpages >> (20 - PAGE_SHIFT)) * 50)
104 /* Default step in % for grant plan. */
105 #define LDLM_POOL_GSP (10)
107 /* LDLM_POOL_GSP% of all locks is default GP. */
108 #define LDLM_POOL_GP(L) (((L) * LDLM_POOL_GSP) / 100)
/* Max age for locks on clients (10h in seconds). */
#define LDLM_POOL_MAX_AGE (36000)

#ifdef __KERNEL__
extern cfs_proc_dir_entry_t *ldlm_ns_proc_dir;
#endif

117 #define avg(src, add) \
118 ((src) = ((src) + (add)) / 2)
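
/* For example, with src = 6, avg(src, 10) leaves src = 8: each call mixes
 * the new sample in with equal weight (a crude running average). */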

/* Returns @val / @div rounded up. */
static inline __u64 dru(__u64 val, __u32 div)
{
        __u64 ret = val + (div - 1);
        do_div(ret, div);
        return ret;
}

static inline __u64 ldlm_pool_slv_max(__u32 L)
{
        /* Allow to have all locks for 1 client for 10 hrs.
         * Formula is the following: limit * 10h / 1 client. */
        __u64 lim = L * LDLM_POOL_MAX_AGE / 1;
        return lim;
}

static inline __u64 ldlm_pool_slv_min(__u32 L)
{
        return 1;
}

enum {
        LDLM_POOL_FIRST_STAT = 0,
        LDLM_POOL_GRANTED_STAT = LDLM_POOL_FIRST_STAT,
        LDLM_POOL_GRANT_STAT,
        LDLM_POOL_CANCEL_STAT,
        LDLM_POOL_GRANT_RATE_STAT,
        LDLM_POOL_CANCEL_RATE_STAT,
        LDLM_POOL_GRANT_PLAN_STAT,
        LDLM_POOL_SLV_STAT,
        LDLM_POOL_SHRINK_REQTD_STAT,
        LDLM_POOL_SHRINK_FREED_STAT,
        LDLM_POOL_RECALC_STAT,
        LDLM_POOL_TIMING_STAT,
        LDLM_POOL_LAST_STAT
};

static inline struct ldlm_namespace *ldlm_pl2ns(struct ldlm_pool *pl)
{
        return container_of(pl, struct ldlm_namespace, ns_pool);
}

/* Should be called under ->pl_lock taken. */
static inline void ldlm_pool_recalc_grant_plan(struct ldlm_pool *pl)
{
        int granted, grant_step, limit;

        limit = ldlm_pool_get_limit(pl);
        granted = atomic_read(&pl->pl_granted);

        grant_step = ((limit - granted) * pl->pl_grant_step) / 100;
        pl->pl_grant_plan = granted + grant_step;
}
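
/* Example with made-up numbers: with limit = 1000, granted = 400 and the
 * default pl_grant_step of 10%, grant_step = (1000 - 400) * 10 / 100 = 60,
 * so pl_grant_plan = 400 + 60 = 460 locks planned for the next interval. */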

/* Should be called under ->pl_lock taken. */
static inline void ldlm_pool_recalc_slv(struct ldlm_pool *pl)
{
        int grant_usage, granted, grant_plan;
        __u64 slv, slv_factor;
        __u32 limit;

        slv = ldlm_pool_get_slv(pl);
        grant_plan = pl->pl_grant_plan;
        limit = ldlm_pool_get_limit(pl);
        granted = atomic_read(&pl->pl_granted);

        grant_usage = limit - (granted - grant_plan);
        if (grant_usage <= 0)
                grant_usage = 1;

        /* Find out SLV change factor which is the ratio of grant usage
         * from limit. SLV changes as fast as the ratio of grant plan
         * consumption. The more locks from grant plan are not consumed
         * by clients in the last interval (idle time), the faster SLV
         * grows. And the opposite, the more grant plan is over-consumed
         * (load time), the faster SLV drops. */
        slv_factor = (grant_usage * 100) / limit;
        if (2 * abs(granted - limit) > limit) {
                slv_factor *= slv_factor;
                slv_factor = dru(slv_factor, 100);
        }
        slv = slv * slv_factor;
        slv = dru(slv, 100);

        if (slv > ldlm_pool_slv_max(limit)) {
                slv = ldlm_pool_slv_max(limit);
        } else if (slv < ldlm_pool_slv_min(limit)) {
                slv = ldlm_pool_slv_min(limit);
        }

        ldlm_pool_set_slv(pl, slv);
}
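
/* Worked example (numbers invented for illustration): with limit = 1000,
 * granted = 460 and grant_plan = 460, grant_usage = 1000 - (460 - 460) =
 * 1000, slv_factor = 1000 * 100 / 1000 = 100 and SLV is unchanged
 * (slv * 100 / 100). If clients over-consume instead, say granted = 1200
 * with grant_plan = 1000, then grant_usage = 800, slv_factor = 80 and SLV
 * drops by 20%. The quadratic branch applies once |granted - limit| exceeds
 * limit / 2, making SLV react faster when far from the target. */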

static inline void ldlm_pool_recalc_stats(struct ldlm_pool *pl)
{
        __u64 slv = ldlm_pool_get_slv(pl);
        int grant_plan = pl->pl_grant_plan;
        int granted = atomic_read(&pl->pl_granted);
        int grant_rate = atomic_read(&pl->pl_grant_rate);
        int cancel_rate = atomic_read(&pl->pl_cancel_rate);

        lprocfs_counter_add(pl->pl_stats, LDLM_POOL_SLV_STAT,
                            slv);
        lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANTED_STAT,
                            granted);
        lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANT_RATE_STAT,
                            grant_rate);
        lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANT_PLAN_STAT,
                            grant_plan);
        lprocfs_counter_add(pl->pl_stats, LDLM_POOL_CANCEL_RATE_STAT,
                            cancel_rate);
}

static int ldlm_srv_pool_recalc(struct ldlm_pool *pl)
{
        time_t recalc_interval_sec;
        ENTRY;

        spin_lock(&pl->pl_lock);
        recalc_interval_sec = cfs_time_current_sec() - pl->pl_recalc_time;
        if (recalc_interval_sec > 0) {
                /* Update statistics. */
                ldlm_pool_recalc_stats(pl);

                /* Recalc SLV after last period. This should be done
                 * _before_ recalculating new grant plan. */
                ldlm_pool_recalc_slv(pl);

                /* Update grant_plan for new period. */
                ldlm_pool_recalc_grant_plan(pl);

                /* Zero out all rates and speed for the last period. */
                atomic_set(&pl->pl_grant_rate, 0);
                atomic_set(&pl->pl_cancel_rate, 0);
                atomic_set(&pl->pl_grant_speed, 0);
                pl->pl_recalc_time = cfs_time_current_sec();
                lprocfs_counter_add(pl->pl_stats, LDLM_POOL_TIMING_STAT,
                                    recalc_interval_sec);
        }
        spin_unlock(&pl->pl_lock);
        RETURN(0);
}

/* Our goal here is to decrease SLV so that a client holds approximately
 * @nr fewer locks during the next 10h. */
static int ldlm_srv_pool_shrink(struct ldlm_pool *pl,
                                int nr, unsigned int gfp_mask)
{
        __u32 limit;
        ENTRY;

        /* VM is asking how many entries may be potentially freed. */
        if (nr == 0)
                RETURN(atomic_read(&pl->pl_granted));

        /* Client already canceled locks but server is already in shrinker
         * and can't cancel anything. Let's catch this race. */
        if (atomic_read(&pl->pl_granted) == 0)
                RETURN(0);

        spin_lock(&pl->pl_lock);

        /* We want shrinker to possibly cause cancellation of @nr locks from
         * clients or grant approximately @nr locks smaller next intervals.
         *
         * This is why we decreased SLV by @nr. This effect will only last
         * one recalc interval (1s these days) and this should be enough to
         * pass this decreased SLV to all clients. On next recalc interval
         * pool will either increase SLV if locks load is not high, or will
         * keep it on the same level or even decrease again; thus, shrinker
         * decreased SLV will affect next recalc intervals and this way will
         * make locking load lower. */
        if (nr < ldlm_pool_get_slv(pl)) {
                ldlm_pool_set_slv(pl, ldlm_pool_get_slv(pl) - nr);
        } else {
                limit = ldlm_pool_get_limit(pl);
                ldlm_pool_set_slv(pl, ldlm_pool_slv_min(limit));
        }
        spin_unlock(&pl->pl_lock);

        /* We did not really free any memory here so far, it will only be
         * freed later, maybe, so we return 0 to not confuse VM. */
        RETURN(0);
}

static int ldlm_srv_pool_setup(struct ldlm_pool *pl, int limit)
{
        ENTRY;
        ldlm_pool_set_limit(pl, limit);
        RETURN(0);
}

static int ldlm_cli_pool_recalc(struct ldlm_pool *pl)
{
        time_t recalc_interval_sec;
        ENTRY;

        spin_lock(&pl->pl_lock);

        recalc_interval_sec = cfs_time_current_sec() - pl->pl_recalc_time;
        if (recalc_interval_sec > 0) {
                /* Update statistics only every T. */
                ldlm_pool_recalc_stats(pl);

                /* Zero out grant/cancel rates and speed for last period. */
                atomic_set(&pl->pl_grant_rate, 0);
                atomic_set(&pl->pl_cancel_rate, 0);
                atomic_set(&pl->pl_grant_speed, 0);
                pl->pl_recalc_time = cfs_time_current_sec();
                lprocfs_counter_add(pl->pl_stats, LDLM_POOL_TIMING_STAT,
                                    recalc_interval_sec);
        }
        spin_unlock(&pl->pl_lock);

        /* Do not cancel locks in case lru resize is disabled for this ns. */
        if (!ns_connect_lru_resize(ldlm_pl2ns(pl)))
                RETURN(0);

        /* In the time of canceling locks on client we do not need to
         * maintain sharp timing, we only want to cancel locks asap
         * according to new SLV. This may be called when SLV has changed
         * much, which is why we do not take pl->pl_recalc_time into
         * account here. */
        RETURN(ldlm_cancel_lru(ldlm_pl2ns(pl), 0, LDLM_ASYNC,
                               LDLM_CANCEL_LRUR));
}

static int ldlm_cli_pool_shrink(struct ldlm_pool *pl,
                                int nr, unsigned int gfp_mask)
{
        ENTRY;

        /* Do not cancel locks in case lru resize is disabled for this ns. */
        if (!ns_connect_lru_resize(ldlm_pl2ns(pl)))
                RETURN(0);

        /* Find out how many locks may be released according to shrink
         * policy. */
        if (nr == 0)
                RETURN(ldlm_cancel_lru_estimate(ldlm_pl2ns(pl), 0, 0,
                                                LDLM_CANCEL_SHRINK));

        /* Cancel @nr locks according to shrink policy. */
        RETURN(ldlm_cancel_lru(ldlm_pl2ns(pl), nr, LDLM_SYNC,
                               LDLM_CANCEL_SHRINK));
}

struct ldlm_pool_ops ldlm_srv_pool_ops = {
        .po_recalc = ldlm_srv_pool_recalc,
        .po_shrink = ldlm_srv_pool_shrink,
        .po_setup  = ldlm_srv_pool_setup
};

struct ldlm_pool_ops ldlm_cli_pool_ops = {
        .po_recalc = ldlm_cli_pool_recalc,
        .po_shrink = ldlm_cli_pool_shrink
};

int ldlm_pool_recalc(struct ldlm_pool *pl)
{
        int count;

        if (pl->pl_ops->po_recalc != NULL) {
                count = pl->pl_ops->po_recalc(pl);
                lprocfs_counter_add(pl->pl_stats, LDLM_POOL_RECALC_STAT,
                                    count);
                return count;
        }
        return 0;
}
388 EXPORT_SYMBOL(ldlm_pool_recalc);

int ldlm_pool_shrink(struct ldlm_pool *pl, int nr,
                     unsigned int gfp_mask)
{
        int cancel = 0;

        if (pl->pl_ops->po_shrink != NULL) {
                cancel = pl->pl_ops->po_shrink(pl, nr, gfp_mask);
                if (nr > 0) {
                        lprocfs_counter_add(pl->pl_stats,
                                            LDLM_POOL_SHRINK_REQTD_STAT,
                                            nr);
                        lprocfs_counter_add(pl->pl_stats,
                                            LDLM_POOL_SHRINK_FREED_STAT,
                                            cancel);
                        CDEBUG(D_DLMTRACE, "%s: request to shrink %d locks, "
                               "shrunk %d\n", pl->pl_name, nr, cancel);
                }
        }
        return cancel;
}
410 EXPORT_SYMBOL(ldlm_pool_shrink);
412 /* The purpose of this function is to re-setup limit and maximal allowed
413 * slv according to the passed limit. */
int ldlm_pool_setup(struct ldlm_pool *pl, int limit)
{
        ENTRY;
        if (pl->pl_ops->po_setup != NULL)
                RETURN(pl->pl_ops->po_setup(pl, limit));
        RETURN(0);
}
421 EXPORT_SYMBOL(ldlm_pool_setup);

#ifdef __KERNEL__
static int lprocfs_rd_pool_state(char *page, char **start, off_t off,
                                 int count, int *eof, void *data)
{
        int granted, grant_rate, cancel_rate, grant_step;
        int nr = 0, grant_speed, grant_plan;
        struct ldlm_pool *pl = data;
        __u32 limit;
        __u64 slv;

        spin_lock(&pl->pl_lock);
        slv = ldlm_pool_get_slv(pl);
        limit = ldlm_pool_get_limit(pl);
        grant_plan = pl->pl_grant_plan;
        grant_step = pl->pl_grant_step;
        granted = atomic_read(&pl->pl_granted);
        grant_rate = atomic_read(&pl->pl_grant_rate);
        grant_speed = atomic_read(&pl->pl_grant_speed);
        cancel_rate = atomic_read(&pl->pl_cancel_rate);
        spin_unlock(&pl->pl_lock);

        nr += snprintf(page + nr, count - nr, "LDLM pool state (%s):\n",
                       pl->pl_name);
        nr += snprintf(page + nr, count - nr, "  SLV: "LPU64"\n", slv);
        nr += snprintf(page + nr, count - nr, "  LVF: %d\n",
                       atomic_read(&pl->pl_lock_volume_factor));
        nr += snprintf(page + nr, count - nr, "  GSP: %d%%\n",
                       grant_step);
        nr += snprintf(page + nr, count - nr, "  GP:  %d\n",
                       grant_plan);
        nr += snprintf(page + nr, count - nr, "  GR:  %d\n",
                       grant_rate);
        nr += snprintf(page + nr, count - nr, "  CR:  %d\n",
                       cancel_rate);
        nr += snprintf(page + nr, count - nr, "  GS:  %d\n",
                       grant_speed);
        nr += snprintf(page + nr, count - nr, "  G:   %d\n",
                       granted);
        nr += snprintf(page + nr, count - nr, "  L:   %d\n",
                       limit);
        return nr;
}

468 LDLM_POOL_PROC_READER(grant_plan, int);
469 LDLM_POOL_PROC_READER(grant_step, int);
470 LDLM_POOL_PROC_WRITER(grant_step, int);

static int ldlm_pool_proc_init(struct ldlm_pool *pl)
{
        struct ldlm_namespace *ns = ldlm_pl2ns(pl);
        struct proc_dir_entry *parent_ns_proc;
        struct lprocfs_vars pool_vars[2];
        char *var_name = NULL;
        int rc = 0;
        ENTRY;

        OBD_ALLOC(var_name, MAX_STRING_SIZE + 1);
        if (!var_name)
                RETURN(-ENOMEM);

        parent_ns_proc = lprocfs_srch(ldlm_ns_proc_dir, ns->ns_name);
        if (parent_ns_proc == NULL) {
                CERROR("%s: proc entry is not initialized\n",
                       ns->ns_name);
                GOTO(out_free_name, rc = -EINVAL);
        }
        pl->pl_proc_dir = lprocfs_register("pool", parent_ns_proc,
                                           NULL, NULL);
        if (IS_ERR(pl->pl_proc_dir)) {
                CERROR("LProcFS failed in ldlm-pool-init\n");
                rc = PTR_ERR(pl->pl_proc_dir);
                GOTO(out_free_name, rc);
        }

499 var_name[MAX_STRING_SIZE] = '\0';
500 memset(pool_vars, 0, sizeof(pool_vars));
501 pool_vars[0].name = var_name;
503 snprintf(var_name, MAX_STRING_SIZE, "server_lock_volume");
504 pool_vars[0].data = &pl->pl_server_lock_volume;
505 pool_vars[0].read_fptr = lprocfs_rd_u64;
506 lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
508 snprintf(var_name, MAX_STRING_SIZE, "limit");
509 pool_vars[0].data = &pl->pl_limit;
510 pool_vars[0].read_fptr = lprocfs_rd_atomic;
511 pool_vars[0].write_fptr = lprocfs_wr_atomic;
512 lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
514 snprintf(var_name, MAX_STRING_SIZE, "granted");
515 pool_vars[0].data = &pl->pl_granted;
516 pool_vars[0].read_fptr = lprocfs_rd_atomic;
517 lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
519 snprintf(var_name, MAX_STRING_SIZE, "grant_speed");
520 pool_vars[0].data = &pl->pl_grant_speed;
521 pool_vars[0].read_fptr = lprocfs_rd_atomic;
522 lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
524 snprintf(var_name, MAX_STRING_SIZE, "cancel_rate");
525 pool_vars[0].data = &pl->pl_cancel_rate;
526 pool_vars[0].read_fptr = lprocfs_rd_atomic;
527 lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
529 snprintf(var_name, MAX_STRING_SIZE, "grant_rate");
530 pool_vars[0].data = &pl->pl_grant_rate;
531 pool_vars[0].read_fptr = lprocfs_rd_atomic;
532 lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
534 snprintf(var_name, MAX_STRING_SIZE, "grant_plan");
535 pool_vars[0].data = pl;
536 pool_vars[0].read_fptr = lprocfs_rd_grant_plan;
537 lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
539 snprintf(var_name, MAX_STRING_SIZE, "grant_step");
540 pool_vars[0].data = pl;
541 pool_vars[0].read_fptr = lprocfs_rd_grant_step;
542 if (ns_is_server(ns))
543 pool_vars[0].write_fptr = lprocfs_wr_grant_step;
544 lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
546 snprintf(var_name, MAX_STRING_SIZE, "lock_volume_factor");
547 pool_vars[0].data = &pl->pl_lock_volume_factor;
548 pool_vars[0].read_fptr = lprocfs_rd_atomic;
549 pool_vars[0].write_fptr = lprocfs_wr_atomic;
550 lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
552 snprintf(var_name, MAX_STRING_SIZE, "state");
553 pool_vars[0].data = pl;
554 pool_vars[0].read_fptr = lprocfs_rd_pool_state;
555 lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
        pl->pl_stats = lprocfs_alloc_stats(LDLM_POOL_LAST_STAT -
                                           LDLM_POOL_FIRST_STAT, 0);
        if (!pl->pl_stats)
                GOTO(out_free_name, rc = -ENOMEM);

        lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANTED_STAT,
                             LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
                             "granted", "locks");
        lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_STAT,
                             LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
                             "grant", "locks");
        lprocfs_counter_init(pl->pl_stats, LDLM_POOL_CANCEL_STAT,
                             LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
                             "cancel", "locks");
571 lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_RATE_STAT,
572 LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
573 "grant_rate", "locks/s");
574 lprocfs_counter_init(pl->pl_stats, LDLM_POOL_CANCEL_RATE_STAT,
575 LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
576 "cancel_rate", "locks/s");
577 lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_PLAN_STAT,
578 LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
579 "grant_plan", "locks/s");
        lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SLV_STAT,
                             LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
                             "slv", "slv");
583 lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SHRINK_REQTD_STAT,
584 LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
585 "shrink_request", "locks");
586 lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SHRINK_FREED_STAT,
587 LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
588 "shrink_freed", "locks");
589 lprocfs_counter_init(pl->pl_stats, LDLM_POOL_RECALC_STAT,
590 LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
591 "recalc_freed", "locks");
592 lprocfs_counter_init(pl->pl_stats, LDLM_POOL_TIMING_STAT,
593 LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
594 "recalc_timing", "sec");
        lprocfs_register_stats(pl->pl_proc_dir, "stats", pl->pl_stats);

        EXIT;
out_free_name:
        OBD_FREE(var_name, MAX_STRING_SIZE + 1);
        return rc;
}

static void ldlm_pool_proc_fini(struct ldlm_pool *pl)
{
        if (pl->pl_stats != NULL) {
                lprocfs_free_stats(&pl->pl_stats);
                pl->pl_stats = NULL;
        }
        if (pl->pl_proc_dir != NULL) {
                lprocfs_remove(&pl->pl_proc_dir);
                pl->pl_proc_dir = NULL;
        }
}

#else /* !__KERNEL__ */
#define ldlm_pool_proc_init(pl) (0)
#define ldlm_pool_proc_fini(pl) while (0) {}
#endif

int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns,
                   int idx, ldlm_side_t client)
{
        int rc;
        ENTRY;

        spin_lock_init(&pl->pl_lock);
        atomic_set(&pl->pl_granted, 0);
        pl->pl_recalc_time = cfs_time_current_sec();
        atomic_set(&pl->pl_lock_volume_factor, 1);

        atomic_set(&pl->pl_grant_rate, 0);
        atomic_set(&pl->pl_cancel_rate, 0);
        atomic_set(&pl->pl_grant_speed, 0);
        pl->pl_grant_step = LDLM_POOL_GSP;
        pl->pl_grant_plan = LDLM_POOL_GP(LDLM_POOL_HOST_L);

        snprintf(pl->pl_name, sizeof(pl->pl_name), "ldlm-pool-%s-%d",
                 ns->ns_name, idx);

        if (client == LDLM_NAMESPACE_SERVER) {
                pl->pl_ops = &ldlm_srv_pool_ops;
                ldlm_pool_set_limit(pl, LDLM_POOL_HOST_L);
                ldlm_pool_set_slv(pl, ldlm_pool_slv_max(LDLM_POOL_HOST_L));
        } else {
                ldlm_pool_set_slv(pl, 1);
                ldlm_pool_set_limit(pl, 1);
                pl->pl_ops = &ldlm_cli_pool_ops;
        }

        rc = ldlm_pool_proc_init(pl);
        if (rc)
                RETURN(rc);

        CDEBUG(D_DLMTRACE, "Lock pool %s is initialized\n", pl->pl_name);
        RETURN(0);
}
657 EXPORT_SYMBOL(ldlm_pool_init);

void ldlm_pool_fini(struct ldlm_pool *pl)
{
        ENTRY;
        ldlm_pool_proc_fini(pl);
        pl->pl_ops = NULL;
        EXIT;
}
666 EXPORT_SYMBOL(ldlm_pool_fini);

void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock)
{
        /* FLOCK locks are special in a sense that they are almost never
         * cancelled; instead, a special kind of lock is used to drop them.
         * Also there is no LRU for flock locks, so there is no point in
         * tracking them anyway. */
        if (lock->l_resource->lr_type == LDLM_FLOCK)
                return;
        ENTRY;

        atomic_inc(&pl->pl_granted);
        atomic_inc(&pl->pl_grant_rate);
        atomic_inc(&pl->pl_grant_speed);

        lprocfs_counter_incr(pl->pl_stats, LDLM_POOL_GRANT_STAT);

        /* Do not do pool recalc for client side as all locks which
         * potentially may be canceled have already been packed into
         * enqueue/cancel rpc. Also we do not want to run out of stack
         * with too long call paths. */
        if (ns_is_server(ldlm_pl2ns(pl)))
                ldlm_pool_recalc(pl);
        EXIT;
}
693 EXPORT_SYMBOL(ldlm_pool_add);

void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock)
{
        if (lock->l_resource->lr_type == LDLM_FLOCK)
                return;
        ENTRY;

        LASSERT(atomic_read(&pl->pl_granted) > 0);
        atomic_dec(&pl->pl_granted);
        atomic_inc(&pl->pl_cancel_rate);
        atomic_dec(&pl->pl_grant_speed);

        lprocfs_counter_incr(pl->pl_stats, LDLM_POOL_CANCEL_STAT);

        if (ns_is_server(ldlm_pl2ns(pl)))
                ldlm_pool_recalc(pl);
        EXIT;
}
711 EXPORT_SYMBOL(ldlm_pool_del);

/* ->pl_lock should be taken. */
__u64 ldlm_pool_get_slv(struct ldlm_pool *pl)
{
        return pl->pl_server_lock_volume;
}
718 EXPORT_SYMBOL(ldlm_pool_get_slv);

/* ->pl_lock should be taken. */
void ldlm_pool_set_slv(struct ldlm_pool *pl, __u64 slv)
{
        pl->pl_server_lock_volume = slv;
}
725 EXPORT_SYMBOL(ldlm_pool_set_slv);

__u32 ldlm_pool_get_limit(struct ldlm_pool *pl)
{
        return atomic_read(&pl->pl_limit);
}
731 EXPORT_SYMBOL(ldlm_pool_get_limit);

void ldlm_pool_set_limit(struct ldlm_pool *pl, __u32 limit)
{
        atomic_set(&pl->pl_limit, limit);
}
737 EXPORT_SYMBOL(ldlm_pool_set_limit);

/* Server side is only enabled for kernel space for now. */
#ifdef __KERNEL__
static int ldlm_pool_granted(struct ldlm_pool *pl)
{
        return atomic_read(&pl->pl_granted);
}

746 static struct ptlrpc_thread *ldlm_pools_thread;
747 static struct shrinker *ldlm_pools_srv_shrinker;
748 static struct shrinker *ldlm_pools_cli_shrinker;
749 static struct completion ldlm_pools_comp;

void ldlm_pools_wakeup(void)
{
        ENTRY;
        if (ldlm_pools_thread == NULL)
                return;
        ldlm_pools_thread->t_flags |= SVC_EVENT;
        cfs_waitq_signal(&ldlm_pools_thread->t_ctl_waitq);
        EXIT;
}
760 EXPORT_SYMBOL(ldlm_pools_wakeup);

/* Cancel @nr locks from all namespaces (if possible). Returns the number of
 * cached locks after the shrink is finished. All namespaces are asked to
 * cancel an approximately equal number of locks. */
static int ldlm_pools_shrink(ldlm_side_t client, int nr,
                             unsigned int gfp_mask)
{
        int total = 0, cached = 0, nr_ns;
        struct ldlm_namespace *ns;

        if (nr != 0 && !(gfp_mask & __GFP_FS))
                return -1;

        CDEBUG(D_DLMTRACE, "request to shrink %d %s locks from all pools\n",
               nr, client == LDLM_NAMESPACE_CLIENT ? "client" : "server");

        /* Find out how many resources we may release. */
        for (nr_ns = atomic_read(ldlm_namespace_nr(client));
             nr_ns > 0; nr_ns--)
        {
                mutex_down(ldlm_namespace_lock(client));
                if (list_empty(ldlm_namespace_list(client))) {
                        mutex_up(ldlm_namespace_lock(client));
                        return 0;
                }
                ns = ldlm_namespace_first(client);
                ldlm_namespace_get(ns);
                ldlm_namespace_move(ns, client);
                mutex_up(ldlm_namespace_lock(client));
                total += ldlm_pool_shrink(&ns->ns_pool, 0, gfp_mask);
                ldlm_namespace_put(ns, 1);
        }

        if (nr == 0 || total == 0)
                return total;

        /* Shrink at least ldlm_namespace_nr(client) namespaces. */
        for (nr_ns = atomic_read(ldlm_namespace_nr(client));
             nr_ns > 0; nr_ns--)
        {
                int cancel, nr_locks;

                /* Do not call shrink under ldlm_namespace_lock(client). */
                mutex_down(ldlm_namespace_lock(client));
                if (list_empty(ldlm_namespace_list(client))) {
                        mutex_up(ldlm_namespace_lock(client));
                        /* If the list is empty, we can't return any
                         * @cached > 0, as that probably would cause a
                         * needless shrinker call. */
                        cached = 0;
                        break;
                }
                ns = ldlm_namespace_first(client);
                ldlm_namespace_get(ns);
                ldlm_namespace_move(ns, client);
                mutex_up(ldlm_namespace_lock(client));

                nr_locks = ldlm_pool_granted(&ns->ns_pool);
                cancel = 1 + nr_locks * nr / total;
                ldlm_pool_shrink(&ns->ns_pool, cancel, gfp_mask);
                cached += ldlm_pool_granted(&ns->ns_pool);
                ldlm_namespace_put(ns, 1);
        }
        return cached;
}
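
/* For example (numbers invented for illustration): if the VM asks to shrink
 * nr = 128 locks and the first pass found total = 4000 granted locks, a
 * namespace holding nr_locks = 1000 of them is asked to cancel
 * 1 + 1000 * 128 / 4000 = 33 locks, i.e. roughly its proportional share. */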

static int ldlm_pools_srv_shrink(int nr, unsigned int gfp_mask)
{
        return ldlm_pools_shrink(LDLM_NAMESPACE_SERVER, nr, gfp_mask);
}

static int ldlm_pools_cli_shrink(int nr, unsigned int gfp_mask)
{
        return ldlm_pools_shrink(LDLM_NAMESPACE_CLIENT, nr, gfp_mask);
}

void ldlm_pools_recalc(ldlm_side_t client)
{
        __u32 nr_l = 0, nr_p = 0, l;
        struct ldlm_namespace *ns;
        int nr, equal = 0;

        /* No need to setup pool limit for client pools. */
        if (client == LDLM_NAMESPACE_SERVER) {
                /* Check all modest namespaces first. */
                mutex_down(ldlm_namespace_lock(client));
                list_for_each_entry(ns, ldlm_namespace_list(client),
                                    ns_list_chain)
                {
                        if (ns->ns_appetite != LDLM_NAMESPACE_MODEST)
                                continue;

                        l = ldlm_pool_granted(&ns->ns_pool);
                        if (l == 0)
                                l = 1;

                        /* Set the modest pools limit equal to their avg
                         * granted locks + 5%. */
                        l += dru(l * LDLM_POOLS_MODEST_MARGIN, 100);
                        ldlm_pool_setup(&ns->ns_pool, l);
                        nr_l += l;
                        nr_p++;
                }

                /* Make sure that modest namespaces did not eat more than
                 * 2/3 of limit. */
                if (nr_l >= 2 * (LDLM_POOL_HOST_L / 3)) {
                        CWARN("\"Modest\" pools eat out 2/3 of server locks "
                              "limit (%d of %lu). This means that you have "
                              "too many clients for this amount of server "
                              "RAM. Upgrade server!\n", nr_l,
                              LDLM_POOL_HOST_L);
                        equal = 1;
                }

                /* The rest is given to greedy namespaces. */
                list_for_each_entry(ns, ldlm_namespace_list(client),
                                    ns_list_chain)
                {
                        if (!equal && ns->ns_appetite != LDLM_NAMESPACE_GREEDY)
                                continue;

                        if (equal) {
                                /* In the case 2/3 of locks are eaten out by
                                 * modest pools, we re-setup equal limit
                                 * for _all_ pools. */
                                l = LDLM_POOL_HOST_L /
                                        atomic_read(ldlm_namespace_nr(client));
                        } else {
                                /* All the rest of greedy pools will have
                                 * all locks in equal parts. */
                                l = (LDLM_POOL_HOST_L - nr_l) /
                                        (atomic_read(ldlm_namespace_nr(client)) -
                                         nr_p);
                        }
                        ldlm_pool_setup(&ns->ns_pool, l);
                }
                mutex_up(ldlm_namespace_lock(client));
        }

        /* Recalc at least ldlm_namespace_nr(client) namespaces. */
        for (nr = atomic_read(ldlm_namespace_nr(client)); nr > 0; nr--) {
                /* Lock the list, get first @ns in the list, getref, move it
                 * to the tail, unlock and call pool recalc. This way we
                 * avoid calling recalc under @ns lock, which is really good
                 * as we get rid of potential deadlock on client nodes when
                 * canceling locks synchronously. */
                mutex_down(ldlm_namespace_lock(client));
                if (list_empty(ldlm_namespace_list(client))) {
                        mutex_up(ldlm_namespace_lock(client));
                        break;
                }
                ns = ldlm_namespace_first(client);
                ldlm_namespace_get(ns);
                ldlm_namespace_move(ns, client);
                mutex_up(ldlm_namespace_lock(client));

                /* After setup is done - recalc the pool. */
                ldlm_pool_recalc(&ns->ns_pool);
                ldlm_namespace_put(ns, 1);
        }
}
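
/* To see how the server limit is split (assuming, for illustration, that
 * LDLM_POOLS_MODEST_MARGIN is the 5% used above and LDLM_POOL_HOST_L is
 * 90000 locks): two modest namespaces with 10000 granted locks each get
 * limits of 10500, so nr_l = 21000; with five namespaces total, the three
 * greedy ones then get (90000 - 21000) / (5 - 2) = 23000 locks each. */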
922 EXPORT_SYMBOL(ldlm_pools_recalc);

static int ldlm_pools_thread_main(void *arg)
{
        struct ptlrpc_thread *thread = (struct ptlrpc_thread *)arg;
        char *t_name = "ldlm_poold";
        ENTRY;

        cfs_daemonize(t_name);
        thread->t_flags = SVC_RUNNING;
        cfs_waitq_signal(&thread->t_ctl_waitq);

        CDEBUG(D_DLMTRACE, "%s: pool thread starting, process %d\n",
               t_name, cfs_curproc_pid());

        while (1) {
                struct l_wait_info lwi;

                /* Recalc all pools on this tick. */
                ldlm_pools_recalc(LDLM_NAMESPACE_SERVER);
                ldlm_pools_recalc(LDLM_NAMESPACE_CLIENT);

                /* Wait until the next check time, or until we're
                 * stopped. */
                lwi = LWI_TIMEOUT(cfs_time_seconds(LDLM_POOLS_THREAD_PERIOD),
                                  NULL, NULL);
                l_wait_event(thread->t_ctl_waitq, (thread->t_flags &
                             (SVC_STOPPING|SVC_EVENT)),
                             &lwi);

                if (thread->t_flags & SVC_STOPPING) {
                        thread->t_flags &= ~SVC_STOPPING;
                        break;
                } else if (thread->t_flags & SVC_EVENT) {
                        thread->t_flags &= ~SVC_EVENT;
                }
        }

        thread->t_flags = SVC_STOPPED;
        cfs_waitq_signal(&thread->t_ctl_waitq);

        CDEBUG(D_DLMTRACE, "%s: pool thread exiting, process %d\n",
               t_name, cfs_curproc_pid());

        complete_and_exit(&ldlm_pools_comp, 0);
}

static int ldlm_pools_thread_start(void)
{
        struct l_wait_info lwi = { 0 };
        int rc;
        ENTRY;

        if (ldlm_pools_thread != NULL)
                RETURN(-EALREADY);

        OBD_ALLOC_PTR(ldlm_pools_thread);
        if (ldlm_pools_thread == NULL)
                RETURN(-ENOMEM);

        init_completion(&ldlm_pools_comp);
        cfs_waitq_init(&ldlm_pools_thread->t_ctl_waitq);

        /* CLONE_VM and CLONE_FILES just avoid a needless copy, because we
         * just drop the VM and FILES in ptlrpc_daemonize() right away. */
        rc = cfs_kernel_thread(ldlm_pools_thread_main, ldlm_pools_thread,
                               CLONE_VM | CLONE_FILES);
        if (rc < 0) {
                CERROR("Can't start pool thread, error %d\n",
                       rc);
                OBD_FREE(ldlm_pools_thread, sizeof(*ldlm_pools_thread));
                ldlm_pools_thread = NULL;
                RETURN(rc);
        }
        l_wait_event(ldlm_pools_thread->t_ctl_waitq,
                     (ldlm_pools_thread->t_flags & SVC_RUNNING), &lwi);
        RETURN(0);
}

static void ldlm_pools_thread_stop(void)
{
        ENTRY;

        if (ldlm_pools_thread == NULL) {
                EXIT;
                return;
        }

        ldlm_pools_thread->t_flags = SVC_STOPPING;
        cfs_waitq_signal(&ldlm_pools_thread->t_ctl_waitq);

        /* Make sure that pools thread is finished before freeing @thread.
         * This fixes possible race and oops due to accessing freed memory
         * in pools thread. */
        wait_for_completion(&ldlm_pools_comp);
        OBD_FREE_PTR(ldlm_pools_thread);
        ldlm_pools_thread = NULL;
        EXIT;
}

int ldlm_pools_init(void)
{
        int rc;
        ENTRY;

        rc = ldlm_pools_thread_start();
        if (rc == 0) {
                ldlm_pools_srv_shrinker = set_shrinker(DEFAULT_SEEKS,
                                                       ldlm_pools_srv_shrink);
                ldlm_pools_cli_shrinker = set_shrinker(DEFAULT_SEEKS,
                                                       ldlm_pools_cli_shrink);
        }
        RETURN(rc);
}
1036 EXPORT_SYMBOL(ldlm_pools_init);

void ldlm_pools_fini(void)
{
        if (ldlm_pools_srv_shrinker != NULL) {
                remove_shrinker(ldlm_pools_srv_shrinker);
                ldlm_pools_srv_shrinker = NULL;
        }
        if (ldlm_pools_cli_shrinker != NULL) {
                remove_shrinker(ldlm_pools_cli_shrinker);
                ldlm_pools_cli_shrinker = NULL;
        }
        ldlm_pools_thread_stop();
}
1050 EXPORT_SYMBOL(ldlm_pools_fini);
1051 #endif /* __KERNEL__ */
1053 #else /* !HAVE_LRU_RESIZE_SUPPORT */

int ldlm_pool_setup(struct ldlm_pool *pl, int limit)
{
        return 0;
}
EXPORT_SYMBOL(ldlm_pool_setup);

int ldlm_pool_recalc(struct ldlm_pool *pl)
{
        return 0;
}
EXPORT_SYMBOL(ldlm_pool_recalc);

int ldlm_pool_shrink(struct ldlm_pool *pl,
                     int nr, unsigned int gfp_mask)
{
        return 0;
}
EXPORT_SYMBOL(ldlm_pool_shrink);

int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns,
                   int idx, ldlm_side_t client)
{
        return 0;
}
EXPORT_SYMBOL(ldlm_pool_init);

void ldlm_pool_fini(struct ldlm_pool *pl)
{
        return;
}
EXPORT_SYMBOL(ldlm_pool_fini);

void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock)
{
        return;
}
EXPORT_SYMBOL(ldlm_pool_add);

void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock)
{
        return;
}
EXPORT_SYMBOL(ldlm_pool_del);

__u64 ldlm_pool_get_slv(struct ldlm_pool *pl)
{
        return 1;
}
EXPORT_SYMBOL(ldlm_pool_get_slv);

void ldlm_pool_set_slv(struct ldlm_pool *pl, __u64 slv)
{
        return;
}
EXPORT_SYMBOL(ldlm_pool_set_slv);

__u32 ldlm_pool_get_limit(struct ldlm_pool *pl)
{
        return 0;
}
EXPORT_SYMBOL(ldlm_pool_get_limit);

void ldlm_pool_set_limit(struct ldlm_pool *pl, __u32 limit)
{
        return;
}
EXPORT_SYMBOL(ldlm_pool_set_limit);

int ldlm_pools_init(void)
{
        return 0;
}
EXPORT_SYMBOL(ldlm_pools_init);

void ldlm_pools_fini(void)
{
        return;
}
EXPORT_SYMBOL(ldlm_pools_fini);

void ldlm_pools_wakeup(void)
{
        return;
}
EXPORT_SYMBOL(ldlm_pools_wakeup);

void ldlm_pools_recalc(ldlm_side_t client)
{
        return;
}
EXPORT_SYMBOL(ldlm_pools_recalc);
1145 #endif /* HAVE_LRU_RESIZE_SUPPORT */