1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (c) 2007 Cluster File Systems, Inc.
5 * Author: Yury Umanets <umka@clusterfs.com>
7 * This file is part of the Lustre file system, http://www.lustre.org
8 * Lustre is a trademark of Cluster File Systems, Inc.
10 * You may have signed or agreed to another license before downloading
11 * this software. If so, you are bound by the terms and conditions
12 * of that agreement, and the following does not apply to you. See the
13 * LICENSE file included with this distribution for more information.
15 * If you did not agree to a different license, then this copy of Lustre
16 * is open source software; you can redistribute it and/or modify it
17 * under the terms of version 2 of the GNU General Public License as
18 * published by the Free Software Foundation.
20 * In either case, Lustre is distributed in the hope that it will be
21 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
22 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * license text for more details.
26 /* Idea of this code is rather simple. Each second, for each server namespace
27 * we have SLV - server lock volume which is calculated on current number of
28 * granted locks, grant speed for past period, etc - that is, locking load.
29 * This SLV number may be thought as a flow definition for simplicity. It is
30 * sent to clients with each occasion to let them know what is current load
31 * situation on the server. By default, at the beginning, SLV on server is
32 * set max value which is calculated as the following: allow to one client
33 * have all locks of limit ->pl_limit for 10h.
35 * Next, on clients, number of cached locks is not limited artificially in any
36 * way as it was before. Instead, client calculates CLV, that is, client lock
37 * volume for each lock and compares it with last SLV from the server. CLV is
38 * calculated as the number of locks in LRU * lock live time in seconds. If
39 * CLV > SLV - lock is canceled.
41 * Client has LVF, that is, lock volume factor which regulates how sensitive the
42 * client should be about last SLV from server. The higher LVF is the more locks
43 * will be canceled on client. Default value for it is 1. Setting LVF to 2 means
44 * that client will cancel locks 2 times faster.
46 * Locks on a client will be canceled more intensively in these cases:
47 * (1) if SLV is smaller, that is, load is higher on the server;
48 * (2) client has a lot of locks (the more locks are held by client, the bigger
49 * chances that some of them should be canceled);
50 * (3) client has old locks (taken some time ago);
52 * Thus, according to flow paradigm that we use for better understanding SLV,
53 * CLV is the volume of particle in flow described by SLV. According to this,
54 * if flow is getting thinner, more and more particles become outside of it and
55 * as particles are locks, they should be canceled.
57 * General idea of this belongs to Vitaly Fertman (vitaly@clusterfs.com). Andreas
58 * Dilger (adilger@clusterfs.com) proposed a few nice ideas like using LVF and many
59 * cleanups. Flow definition to allow easier understanding of the logic belongs
60 * to Nikita Danilov (nikita@clusterfs.com) as well as many cleanups and fixes.
61 * And design and implementation are done by Yury Umanets (umka@clusterfs.com).
63 * Glossary for terms used:
65 * pl_limit - Number of allowed locks in pool. Applies to server and client
68 * pl_granted - Number of granted locks (calculated);
69 * pl_grant_rate - Number of granted locks for last T (calculated);
70 * pl_cancel_rate - Number of canceled locks for last T (calculated);
71 * pl_grant_speed - Grant speed (GR - CR) for last T (calculated);
72 * pl_grant_plan - Planned number of granted locks for next T (calculated);
74 * pl_grant_step - Grant plan step, that is how ->pl_grant_plan
75 * will change in next T (tunable);
77 * pl_server_lock_volume - Current server lock volume (calculated);
79 * As it may be seen from list above, we have few possible tunables which may
80 * affect behavior much. They all may be modified via proc. However, they also
81 * give a possibility for constructing few pre-defined behavior policies. If
82 * none of predefines is suitable for a working pattern being used, new one may
83 * be "constructed" via proc tunables.
86 #define DEBUG_SUBSYSTEM S_LDLM
89 # include <lustre_dlm.h>
91 # include <liblustre.h>
92 # include <libcfs/kp30.h>
95 #include <obd_class.h>
96 #include <obd_support.h>
97 #include "ldlm_internal.h"
99 #ifdef HAVE_LRU_RESIZE_SUPPORT
101 /* 50 ldlm locks for 1MB of RAM. */
102 #define LDLM_POOL_HOST_L ((num_physpages >> (20 - PAGE_SHIFT)) * 50)
104 /* Default step in % for grant plan. */
105 #define LDLM_POOL_GSP (5)
107 /* LDLM_POOL_GSP% of all locks is default GP. */
108 #define LDLM_POOL_GP(L) ((L) * LDLM_POOL_GSP / 100)
110 /* Max age for locks on clients. */
111 #define LDLM_POOL_MAX_AGE (36000)
114 extern cfs_proc_dir_entry_t *ldlm_ns_proc_dir;
/* Fold @add into running average @src: src = (src + add) / 2.
 * NOTE: @src is evaluated twice and assigned to — pass only plain
 * lvalues, never expressions with side effects. */
117 #define avg(src, add) \
118 ((src) = ((src) + (add)) / 2)
/* Divide-round-up helper: biases @val by (div - 1) so the subsequent
 * 64-bit division yields ceil(val / div).
 * NOTE(review): the division itself is not visible in this excerpt —
 * presumably done via do_div(); confirm against the full source. */
120 static inline __u64 dru(__u64 val, __u32 div)
122 __u64 ret = val + (div - 1);
/* Maximal allowed SLV for a pool with limit @L: enough volume for one
 * client to hold all @L locks for LDLM_POOL_MAX_AGE (10h) each. */
127 static inline __u64 ldlm_pool_slv_max(__u32 L)
129 /* Allow to have all locks for 1 client for 10 hrs.
130 * Formula is the following: limit * 10h / 1 client. */
/* NOTE(review): L and LDLM_POOL_MAX_AGE are both 32-bit, so the product
 * is computed in 32 bits before widening to __u64 — this can overflow
 * for large limits (L > ~119k). Consider casting L to __u64 first. */
131 __u64 lim = L * LDLM_POOL_MAX_AGE / 1;
/* Minimal allowed SLV for a pool with limit @L; used as the lower clamp
 * for recalculated SLV values (body elided in this excerpt). */
135 static inline __u64 ldlm_pool_slv_min(__u32 L)
/* Indices of the per-pool lprocfs stat counters (see the
 * lprocfs_counter_init() calls in ldlm_pool_proc_init()). */
141 LDLM_POOL_GRANTED_STAT = 0,
142 LDLM_POOL_GRANT_RATE_STAT,
143 LDLM_POOL_CANCEL_RATE_STAT,
144 LDLM_POOL_GRANT_PLAN_STAT,
/* Map a pool back to its owning namespace: @pl is embedded in
 * struct ldlm_namespace as the ns_pool member. */
149 static inline struct ldlm_namespace *ldlm_pl2ns(struct ldlm_pool *pl)
151 return container_of(pl, struct ldlm_namespace, ns_pool);
/* Server-side pool recalculation, run once per period T (nominally 1s)
 * by the pools thread. Under pl_lock: snapshot all pool counters into
 * locals, zero the per-period rates, record stats scaled by the real
 * elapsed interval, then derive a new grant plan and SLV from current
 * load (granted vs. limit) and clamp SLV to [slv_min, slv_max]. */
154 static int ldlm_srv_pool_recalc(struct ldlm_pool *pl)
156 int slv_factor, limit, granted, grant_speed;
157 int grant_rate, cancel_rate, grant_step;
158 time_t recalc_interval_sec;
163 spin_lock(&pl->pl_lock);
165 /* Get all values to local variables to avoid change some of them in
166 * the middle of re-calc. */
167 slv = ldlm_pool_get_slv(pl);
168 limit = ldlm_pool_get_limit(pl);
169 granted = atomic_read(&pl->pl_granted);
170 grant_rate = atomic_read(&pl->pl_grant_rate);
171 grant_plan = atomic_read(&pl->pl_grant_plan);
172 grant_step = atomic_read(&pl->pl_grant_step);
173 grant_speed = atomic_read(&pl->pl_grant_speed);
174 cancel_rate = atomic_read(&pl->pl_cancel_rate);
176 /* Zero out grant/cancel rates and speed for this T. */
177 atomic_set(&pl->pl_grant_rate, 0);
178 atomic_set(&pl->pl_cancel_rate, 0);
179 atomic_set(&pl->pl_grant_speed, 0);
181 /* Make sure that we use correct data for statistics. Pools thread may
182 * be not scheduled long time due to big CPU contention. We need to
184 recalc_interval_sec = cfs_duration_sec(cfs_time_current() -
/* Guard against division by zero when less than a second elapsed. */
186 if (recalc_interval_sec == 0)
187 recalc_interval_sec = 1;
189 lprocfs_counter_add(pl->pl_stats, LDLM_POOL_SLV_STAT, slv);
190 lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANTED_STAT,
192 lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANT_RATE_STAT,
193 grant_rate / recalc_interval_sec);
194 lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANT_PLAN_STAT,
195 grant_plan / recalc_interval_sec);
196 lprocfs_counter_add(pl->pl_stats, LDLM_POOL_CANCEL_RATE_STAT,
197 cancel_rate / recalc_interval_sec);
199 /* Correcting old @grant_plan which may be obsolete in the case of big
200 * load on the server, when pools thread is not scheduled every 1s sharp
201 * (current period). All values used in calculation are updated from
202 * other threads and up-to-date. Only @grant_plan is calculated by pool
203 * thread and directly affects SLV. */
204 grant_plan += grant_speed - (grant_speed / recalc_interval_sec);
/* slv_factor reflects free capacity: limit minus locks granted beyond
 * plan. The branch body for slv_factor <= 0 is elided here. */
206 if ((slv_factor = limit - (granted - grant_plan)) <= 0)
/* New plan: move grant_step% of the way from @granted to @limit;
 * scale SLV by the capacity factor expressed in percent of limit. */
209 grant_plan = granted + ((limit - granted) * grant_step) / 100;
210 slv = (slv * ((slv_factor * 100) / limit));
/* Clamp SLV into the allowed [slv_min, slv_max] window. */
213 if (slv > ldlm_pool_slv_max(limit)) {
214 CDEBUG(D_DLMTRACE, "Correcting SLV to allowed max "LPU64"\n",
215 ldlm_pool_slv_max(limit));
216 slv = ldlm_pool_slv_max(limit);
217 } else if (slv < ldlm_pool_slv_min(limit)) {
218 CDEBUG(D_DLMTRACE, "Correcting SLV to allowed min "LPU64"\n",
219 ldlm_pool_slv_min(limit));
220 slv = ldlm_pool_slv_min(limit);
/* Publish the new SLV/plan and stamp the recalc time under pl_lock. */
223 ldlm_pool_set_slv(pl, slv);
224 atomic_set(&pl->pl_grant_plan, grant_plan);
225 pl->pl_update_time = cfs_time_current();
226 spin_unlock(&pl->pl_lock);
231 /* Our goal here is to decrease SLV the way to make a client hold
232 * @nr locks smaller in next 10h. */
/* @nr       - number of locks the VM asks to become reclaimable;
 * @gfp_mask - allocation context of the caller.
 * Lowers SLV proportionally (nr / granted share of current SLV) so that
 * clients cancel roughly @nr locks; returns 0 since no memory is freed
 * synchronously here. */
233 static int ldlm_srv_pool_shrink(struct ldlm_pool *pl,
234 int nr, unsigned int gfp_mask)
236 __u32 granted, limit;
240 /* Client already canceled locks but server is already in shrinker and
241 * can't cancel anything. Let's catch this race. */
242 if ((granted = atomic_read(&pl->pl_granted)) == 0)
245 spin_lock(&pl->pl_lock);
247 /* Simple proportion but it gives impression on how much should be
248 * SLV changed for request @nr of locks to be canceled.*/
249 slv_delta = nr * ldlm_pool_get_slv(pl);
250 limit = ldlm_pool_get_limit(pl);
251 do_div(slv_delta, granted);
253 /* As SLV has some dependence on historical data, that is new value
254 * is based on old one, this decreasing will make clients get some
255 * locks back to the server and after some time it will stabilize.*/
256 if (slv_delta < ldlm_pool_get_slv(pl))
257 ldlm_pool_set_slv(pl, ldlm_pool_get_slv(pl) - slv_delta);
/* Delta would underflow SLV - floor it at the allowed minimum. */
259 ldlm_pool_set_slv(pl, ldlm_pool_slv_min(limit));
260 spin_unlock(&pl->pl_lock);
262 /* We did not really free any memory here so far, it only will be
263 * freed later may be, so that we return 0 to not confuse VM. */
/* Client-side pool recalculation: under pl_lock, snapshot and zero the
 * per-period grant/cancel rates, record stats scaled by the real elapsed
 * interval, then (outside the lock) kick an asynchronous cancel pass
 * over the namespace LRU so CLV > SLV locks get dropped. */
267 static int ldlm_cli_pool_recalc(struct ldlm_pool *pl)
269 int grant_rate, cancel_rate;
270 time_t recalc_interval_sec;
273 spin_lock(&pl->pl_lock);
274 grant_rate = atomic_read(&pl->pl_grant_rate);
275 cancel_rate = atomic_read(&pl->pl_cancel_rate);
277 /* Zero out grant/cancel rates and speed for this T. */
278 atomic_set(&pl->pl_grant_rate, 0);
279 atomic_set(&pl->pl_cancel_rate, 0);
280 atomic_set(&pl->pl_grant_speed, 0);
282 recalc_interval_sec = cfs_duration_sec(cfs_time_current() -
/* Guard against division by zero when less than a second elapsed. */
284 if (recalc_interval_sec == 0)
285 recalc_interval_sec = 1;
287 lprocfs_counter_add(pl->pl_stats, LDLM_POOL_SLV_STAT,
288 ldlm_pool_get_slv(pl));
289 lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANTED_STAT,
290 atomic_read(&pl->pl_granted));
291 lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANT_RATE_STAT,
292 grant_rate / recalc_interval_sec);
293 lprocfs_counter_add(pl->pl_stats, LDLM_POOL_CANCEL_RATE_STAT,
294 cancel_rate / recalc_interval_sec);
296 spin_unlock(&pl->pl_lock);
/* Async LRU cancel - must run without pl_lock held. */
298 ldlm_cancel_lru(ldlm_pl2ns(pl), 0, LDLM_ASYNC);
/* Client-side shrinker: synchronously cancel up to @nr locks from the
 * namespace LRU and return the result to the VM. @gfp_mask is the
 * caller's allocation context (not consulted in the visible path). */
302 static int ldlm_cli_pool_shrink(struct ldlm_pool *pl,
303 int nr, unsigned int gfp_mask)
306 RETURN(ldlm_cancel_lru(ldlm_pl2ns(pl), nr, LDLM_SYNC));
/* Dispatch pool recalculation to the side-specific handler
 * (ldlm_srv_pool_recalc or ldlm_cli_pool_recalc) if recalc is
 * enabled for this pool via its control mask. */
309 int ldlm_pool_recalc(struct ldlm_pool *pl)
311 if (pl->pl_recalc != NULL && pool_recalc_enabled(pl))
312 return pl->pl_recalc(pl);
315 EXPORT_SYMBOL(ldlm_pool_recalc);
/* Dispatch a shrink request for @nr locks to the side-specific handler
 * (ldlm_srv_pool_shrink or ldlm_cli_pool_shrink) if shrinking is
 * enabled for this pool via its control mask. */
317 int ldlm_pool_shrink(struct ldlm_pool *pl, int nr,
318 unsigned int gfp_mask)
320 if (pl->pl_shrink != NULL && pool_shrink_enabled(pl)) {
321 CDEBUG(D_DLMTRACE, "%s: request to shrink %d locks\n",
323 return pl->pl_shrink(pl, nr, gfp_mask);
327 EXPORT_SYMBOL(ldlm_pool_shrink);
329 /* The purpose of this function is to re-setup limit and maximal allowed
330 * slv according to the passed limit. */
/* Only meaningful on the server side; client pools keep their own
 * limit handling. Updates pl_limit under pl_lock. */
331 int ldlm_pool_setup(struct ldlm_pool *pl, __u32 limit)
334 if (ldlm_pl2ns(pl)->ns_client == LDLM_NAMESPACE_SERVER) {
335 spin_lock(&pl->pl_lock);
336 ldlm_pool_set_limit(pl, limit);
337 spin_unlock(&pl->pl_lock);
341 EXPORT_SYMBOL(ldlm_pool_setup);
/* /proc read handler for the pool "state" file: snapshot the pool
 * counters under pl_lock (rates scaled by real elapsed interval) and
 * pretty-print them into @page. Server pools additionally report grant
 * step/plan; client pools report the lock volume factor. */
344 static int lprocfs_rd_pool_state(char *page, char **start, off_t off,
345 int count, int *eof, void *data)
347 int nr = 0, granted, grant_rate, cancel_rate;
348 int grant_speed, grant_plan, grant_step;
349 struct ldlm_pool *pl = data;
350 time_t recalc_interval_sec;
354 recalc_interval_sec = cfs_duration_sec(cfs_time_current() -
/* Guard against division by zero when less than a second elapsed. */
356 if (recalc_interval_sec == 0)
357 recalc_interval_sec = 1;
359 spin_lock(&pl->pl_lock);
360 slv = pl->pl_server_lock_volume;
361 limit = ldlm_pool_get_limit(pl);
362 granted = atomic_read(&pl->pl_granted);
363 grant_rate = atomic_read(&pl->pl_grant_rate) /
365 cancel_rate = atomic_read(&pl->pl_cancel_rate) /
367 grant_speed = atomic_read(&pl->pl_grant_speed) /
369 grant_plan = atomic_read(&pl->pl_grant_plan);
/* Apply the same in-flight grant_plan correction as the recalc path. */
370 grant_plan += atomic_read(&pl->pl_grant_speed) -
371 (atomic_read(&pl->pl_grant_speed) /
372 recalc_interval_sec);
373 grant_step = atomic_read(&pl->pl_grant_step);
374 spin_unlock(&pl->pl_lock);
376 nr += snprintf(page + nr, count - nr, "LDLM pool state (%s):\n",
378 nr += snprintf(page + nr, count - nr, " SLV: "LPU64"\n", slv);
379 if (ldlm_pl2ns(pl)->ns_client == LDLM_NAMESPACE_SERVER) {
380 nr += snprintf(page + nr, count - nr, " GSP: %d%%\n",
382 nr += snprintf(page + nr, count - nr, " GP: %d\n",
385 nr += snprintf(page + nr, count - nr, " LVF: %d\n",
386 atomic_read(&pl->pl_lock_volume_factor));
388 nr += snprintf(page + nr, count - nr, " GR: %d\n", grant_rate);
389 nr += snprintf(page + nr, count - nr, " CR: %d\n", cancel_rate);
390 nr += snprintf(page + nr, count - nr, " GS: %d\n", grant_speed);
391 nr += snprintf(page + nr, count - nr, " G: %d\n", granted);
392 nr += snprintf(page + nr, count - nr, " L: %d\n", limit);
/* Convert an accumulated per-period counter @rate into a per-second
 * rate by dividing by the seconds elapsed since the last recalc
 * (minimum 1 to avoid division by zero). */
396 static int ldlm_pool_rate_helper(struct ldlm_pool *pl, int rate)
398 time_t recalc_interval_sec;
400 recalc_interval_sec = cfs_duration_sec(cfs_time_current() -
402 if (recalc_interval_sec == 0)
403 recalc_interval_sec = 1;
404 return rate / recalc_interval_sec;
/* /proc read handler: report the pool's current grant rate in locks/s. */
407 int lprocfs_rd_grant_rate(char *page, char **start, off_t off,
408 int count, int *eof, void *data)
410 struct ldlm_pool *pl = data;
413 grant_rate = ldlm_pool_rate_helper(pl, atomic_read(&pl->pl_grant_rate));
414 return lprocfs_rd_uint(page, start, off, count, eof, &grant_rate);
/* /proc read handler: report the pool's current cancel rate in locks/s. */
417 int lprocfs_rd_cancel_rate(char *page, char **start, off_t off,
418 int count, int *eof, void *data)
420 struct ldlm_pool *pl = data;
423 cancel_rate = ldlm_pool_rate_helper(pl, atomic_read(&pl->pl_cancel_rate));
424 return lprocfs_rd_uint(page, start, off, count, eof, &cancel_rate);
/* /proc read handler: report the grant plan, corrected for grants made
 * since the last recalc tick (same adjustment as in the recalc path). */
427 int lprocfs_rd_grant_plan(char *page, char **start, off_t off,
428 int count, int *eof, void *data)
430 struct ldlm_pool *pl = data;
431 time_t recalc_interval_sec;
434 recalc_interval_sec = cfs_duration_sec(cfs_time_current() -
436 if (recalc_interval_sec == 0)
437 recalc_interval_sec = 1;
439 grant_plan = atomic_read(&pl->pl_grant_plan);
440 grant_plan += atomic_read(&pl->pl_grant_speed) -
441 (atomic_read(&pl->pl_grant_speed) /
442 recalc_interval_sec);
444 return lprocfs_rd_uint(page, start, off, count, eof, &grant_plan);
/* /proc read handler: report grant speed (grants minus cancels) per
 * second. Speed may be negative, hence the signed atomic output. */
447 int lprocfs_rd_grant_speed(char *page, char **start, off_t off,
448 int count, int *eof, void *data)
450 struct ldlm_pool *pl = data;
451 atomic_t grant_speed;
453 atomic_set(&grant_speed, ldlm_pool_rate_helper(pl,
454 atomic_read(&pl->pl_grant_speed)));
456 /* We need to output signed value here as speed may be < 0. Easiest
457 * way to do is to use existing function lprocfs_rd_atomic() */
458 return lprocfs_rd_atomic(page, start, off, count, eof, &grant_speed);
/* Register this pool's /proc interface under <ns>/pool: scalar tunables
 * and readouts (slv, limit, granted, control, rates, plan/step/LVF,
 * state) plus the lprocfs stats file. Returns 0 on success or a
 * negative errno; the temporary name buffer is freed on all paths via
 * out_free_name. */
461 static int ldlm_pool_proc_init(struct ldlm_pool *pl)
463 struct ldlm_namespace *ns = ldlm_pl2ns(pl);
464 struct proc_dir_entry *parent_ns_proc;
465 struct lprocfs_vars pool_vars[2];
466 char *var_name = NULL;
470 OBD_ALLOC(var_name, MAX_STRING_SIZE + 1);
/* The namespace proc dir must already exist - pool dir nests in it. */
474 parent_ns_proc = lprocfs_srch(ldlm_ns_proc_dir, ns->ns_name);
475 if (parent_ns_proc == NULL) {
476 CERROR("%s: proc entry is not initialized\n",
478 GOTO(out_free_name, rc = -EINVAL);
480 pl->pl_proc_dir = lprocfs_register("pool", parent_ns_proc,
482 if (IS_ERR(pl->pl_proc_dir)) {
483 CERROR("LProcFS failed in ldlm-pool-init\n");
484 rc = PTR_ERR(pl->pl_proc_dir);
485 GOTO(out_free_name, rc);
/* pool_vars[0] is reused for each entry; [1] stays zeroed as the
 * array terminator for lprocfs_add_vars(). */
488 var_name[MAX_STRING_SIZE] = '\0';
489 memset(pool_vars, 0, sizeof(pool_vars));
490 pool_vars[0].name = var_name;
492 snprintf(var_name, MAX_STRING_SIZE, "server_lock_volume");
493 pool_vars[0].data = &pl->pl_server_lock_volume;
494 pool_vars[0].read_fptr = lprocfs_rd_u64;
495 lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
497 snprintf(var_name, MAX_STRING_SIZE, "limit");
498 pool_vars[0].data = &pl->pl_limit;
499 pool_vars[0].read_fptr = lprocfs_rd_atomic;
500 pool_vars[0].write_fptr = lprocfs_wr_atomic;
501 lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
503 snprintf(var_name, MAX_STRING_SIZE, "granted");
504 pool_vars[0].data = &pl->pl_granted;
505 pool_vars[0].read_fptr = lprocfs_rd_atomic;
506 lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
508 snprintf(var_name, MAX_STRING_SIZE, "control");
509 pool_vars[0].data = &pl->pl_control;
510 pool_vars[0].read_fptr = lprocfs_rd_uint;
511 pool_vars[0].write_fptr = lprocfs_wr_uint;
512 lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
514 snprintf(var_name, MAX_STRING_SIZE, "grant_speed");
515 pool_vars[0].data = pl;
516 pool_vars[0].read_fptr = lprocfs_rd_grant_speed;
517 lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
519 snprintf(var_name, MAX_STRING_SIZE, "cancel_rate");
520 pool_vars[0].data = pl;
521 pool_vars[0].read_fptr = lprocfs_rd_cancel_rate;
522 lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
524 snprintf(var_name, MAX_STRING_SIZE, "grant_rate");
525 pool_vars[0].data = pl;
526 pool_vars[0].read_fptr = lprocfs_rd_grant_rate;
527 lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
/* Server-only tunables/readouts; clients expose LVF instead. */
529 if (ns->ns_client == LDLM_NAMESPACE_SERVER) {
530 snprintf(var_name, MAX_STRING_SIZE, "grant_plan");
531 pool_vars[0].data = pl;
532 pool_vars[0].read_fptr = lprocfs_rd_grant_plan;
533 lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
535 snprintf(var_name, MAX_STRING_SIZE, "grant_step");
536 pool_vars[0].data = &pl->pl_grant_step;
537 pool_vars[0].read_fptr = lprocfs_rd_atomic;
538 pool_vars[0].write_fptr = lprocfs_wr_atomic;
539 lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
541 snprintf(var_name, MAX_STRING_SIZE, "lock_volume_factor");
542 pool_vars[0].data = &pl->pl_lock_volume_factor;
543 pool_vars[0].read_fptr = lprocfs_rd_uint;
544 pool_vars[0].write_fptr = lprocfs_wr_uint;
545 lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
548 snprintf(var_name, MAX_STRING_SIZE, "state");
549 pool_vars[0].data = pl;
550 pool_vars[0].read_fptr = lprocfs_rd_pool_state;
551 lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
553 pl->pl_stats = lprocfs_alloc_stats(LDLM_POOL_LAST_STAT -
554 LDLM_POOL_GRANTED_STAT);
556 GOTO(out_free_name, rc = -ENOMEM);
558 lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANTED_STAT,
559 LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
561 lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_RATE_STAT,
562 LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
563 "grant_rate", "locks/s");
564 lprocfs_counter_init(pl->pl_stats, LDLM_POOL_CANCEL_RATE_STAT,
565 LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
566 "cancel_rate", "locks/s");
567 lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_PLAN_STAT,
568 LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
569 "grant_plan", "locks/s");
570 lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SLV_STAT,
571 LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
573 lprocfs_register_stats(pl->pl_proc_dir, "stats", pl->pl_stats);
577 OBD_FREE(var_name, MAX_STRING_SIZE + 1);
/* Tear down the pool's /proc interface: free stats and remove the
 * pool directory, NULLing the pointers so fini is idempotent. */
581 static void ldlm_pool_proc_fini(struct ldlm_pool *pl)
583 if (pl->pl_stats != NULL) {
584 lprocfs_free_stats(&pl->pl_stats);
587 if (pl->pl_proc_dir != NULL) {
588 lprocfs_remove(&pl->pl_proc_dir);
589 pl->pl_proc_dir = NULL;
592 #else /* !__KERNEL__*/
/* Userspace build: no /proc, so init trivially succeeds and fini is a
 * no-op. */
593 #define ldlm_pool_proc_init(pl) (0)
594 #define ldlm_pool_proc_fini(pl) while (0) {}
/* Initialize pool @pl belonging to namespace @ns (pool index @idx):
 * zero the counters, install default control/step/plan, pick the
 * server or client recalc/shrink callbacks with matching initial
 * SLV/limit, and register the /proc interface. Server pools start at
 * the host-wide limit and maximal SLV; client pools start at SLV=1,
 * limit=1 and learn the real SLV from server replies. */
597 int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns,
598 int idx, ldlm_side_t client)
603 spin_lock_init(&pl->pl_lock);
604 atomic_set(&pl->pl_granted, 0);
605 pl->pl_update_time = cfs_time_current();
606 atomic_set(&pl->pl_lock_volume_factor, 1);
608 atomic_set(&pl->pl_grant_rate, 0);
609 atomic_set(&pl->pl_cancel_rate, 0);
610 atomic_set(&pl->pl_grant_speed, 0);
611 pl->pl_control = LDLM_POOL_CTL_FULL;
612 atomic_set(&pl->pl_grant_step, LDLM_POOL_GSP);
613 atomic_set(&pl->pl_grant_plan, LDLM_POOL_GP(LDLM_POOL_HOST_L));
615 snprintf(pl->pl_name, sizeof(pl->pl_name), "ldlm-pool-%s-%d",
618 if (client == LDLM_NAMESPACE_SERVER) {
619 pl->pl_recalc = ldlm_srv_pool_recalc;
620 pl->pl_shrink = ldlm_srv_pool_shrink;
621 ldlm_pool_set_limit(pl, LDLM_POOL_HOST_L);
622 ldlm_pool_set_slv(pl, ldlm_pool_slv_max(LDLM_POOL_HOST_L));
624 ldlm_pool_set_slv(pl, 1);
625 ldlm_pool_set_limit(pl, 1);
626 pl->pl_recalc = ldlm_cli_pool_recalc;
627 pl->pl_shrink = ldlm_cli_pool_shrink;
630 rc = ldlm_pool_proc_init(pl);
634 CDEBUG(D_DLMTRACE, "Lock pool %s is initialized\n", pl->pl_name);
/* Finalize a pool: remove its /proc interface and clear the recalc and
 * shrink callbacks so no further pool operations dispatch into it. */
640 void ldlm_pool_fini(struct ldlm_pool *pl)
643 ldlm_pool_proc_fini(pl);
644 pl->pl_recalc = NULL;
645 pl->pl_shrink = NULL;
648 EXPORT_SYMBOL(ldlm_pool_fini);
/* Account a newly granted @lock in @pl: bump granted count and the
 * per-period grant rate/speed counters. */
650 void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock)
653 atomic_inc(&pl->pl_granted);
654 atomic_inc(&pl->pl_grant_rate);
655 atomic_inc(&pl->pl_grant_speed);
658 EXPORT_SYMBOL(ldlm_pool_add);
/* Account a canceled @lock in @pl: drop granted count, bump the
 * per-period cancel rate, and decrement speed (speed = grants - cancels). */
660 void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock)
663 LASSERT(atomic_read(&pl->pl_granted) > 0);
664 atomic_dec(&pl->pl_granted);
665 atomic_inc(&pl->pl_cancel_rate);
666 atomic_dec(&pl->pl_grant_speed);
669 EXPORT_SYMBOL(ldlm_pool_del);
671 /* ->pl_lock should be taken. */
/* Read the current server lock volume (64-bit, hence the lock rule). */
672 __u64 ldlm_pool_get_slv(struct ldlm_pool *pl)
674 return pl->pl_server_lock_volume;
676 EXPORT_SYMBOL(ldlm_pool_get_slv);
678 /* ->pl_lock should be taken. */
/* Set the current server lock volume. */
679 void ldlm_pool_set_slv(struct ldlm_pool *pl, __u64 slv)
681 pl->pl_server_lock_volume = slv;
683 EXPORT_SYMBOL(ldlm_pool_set_slv);
/* Limit is atomic, so get/set need no external locking. */
685 __u32 ldlm_pool_get_limit(struct ldlm_pool *pl)
687 return atomic_read(&pl->pl_limit);
689 EXPORT_SYMBOL(ldlm_pool_get_limit);
691 void ldlm_pool_set_limit(struct ldlm_pool *pl, __u32 limit)
693 atomic_set(&pl->pl_limit, limit);
695 EXPORT_SYMBOL(ldlm_pool_set_limit);
697 /* Server side is only enabled for kernel space for now. */
/* Number of currently granted locks in @pl. */
699 static int ldlm_pool_granted(struct ldlm_pool *pl)
701 return atomic_read(&pl->pl_granted);
/* Singleton pools thread plus the VM shrinkers registered for each
 * side; ldlm_pools_comp signals thread exit to the stop path. */
704 static struct ptlrpc_thread *ldlm_pools_thread;
705 static struct shrinker *ldlm_pools_srv_shrinker;
706 static struct shrinker *ldlm_pools_cli_shrinker;
707 static struct completion ldlm_pools_comp;
/* Wake the pools thread early (before its periodic timeout) by raising
 * SVC_EVENT; no-op if the thread was never started. */
709 void ldlm_pools_wakeup(void)
712 if (ldlm_pools_thread == NULL)
714 ldlm_pools_thread->t_flags |= SVC_EVENT;
715 cfs_waitq_signal(&ldlm_pools_thread->t_ctl_waitq);
718 EXPORT_SYMBOL(ldlm_pools_wakeup);
720 /* Cancel @nr locks from all namespaces (if possible). Returns number of
721 * cached locks after shrink is finished. All namespaces are asked to
722 * cancel approximately equal amount of locks. */
723 static int ldlm_pools_shrink(ldlm_side_t client, int nr,
724 unsigned int gfp_mask)
726 int total = 0, cached = 0, nr_ns;
727 struct ldlm_namespace *ns;
/* Never block on FS reclaim from a no-FS allocation context. */
729 if (nr != 0 && !(gfp_mask & __GFP_FS))
732 CDEBUG(D_DLMTRACE, "request to shrink %d %s locks from all pools\n",
733 nr, client == LDLM_NAMESPACE_CLIENT ? "client" : "server");
735 /* Find out how many resources we may release. */
736 mutex_down(ldlm_namespace_lock(client));
737 list_for_each_entry(ns, ldlm_namespace_list(client), ns_list_chain)
738 total += ldlm_pool_granted(&ns->ns_pool);
739 mutex_up(ldlm_namespace_lock(client));
744 /* Shrink at least ldlm_namespace_nr(client) namespaces. */
745 for (nr_ns = atomic_read(ldlm_namespace_nr(client));
748 int cancel, nr_locks;
750 /* Do not call shrink under ldlm_namespace_lock(client) */
751 mutex_down(ldlm_namespace_lock(client));
752 if (list_empty(ldlm_namespace_list(client))) {
753 mutex_up(ldlm_namespace_lock(client));
754 /* If list is empty, we can't return any @cached > 0,
755 * that probably would cause needless shrinker
/* Rotate: take the first namespace, pin it, move it to the list
 * tail so repeated shrinks visit namespaces round-robin. */
760 ns = ldlm_namespace_first(client);
761 ldlm_namespace_get(ns);
762 ldlm_namespace_move(ns, client);
763 mutex_up(ldlm_namespace_lock(client));
/* Each namespace cancels its proportional share of @nr (+1 so a
 * nonzero request always cancels something). */
765 nr_locks = ldlm_pool_granted(&ns->ns_pool);
766 cancel = 1 + nr_locks * nr / total;
767 ldlm_pool_shrink(&ns->ns_pool, cancel, gfp_mask);
768 cached += ldlm_pool_granted(&ns->ns_pool);
769 ldlm_namespace_put(ns, 1);
/* VM shrinker entry points: forward to ldlm_pools_shrink() for the
 * server- and client-side namespace lists respectively. */
774 static int ldlm_pools_srv_shrink(int nr, unsigned int gfp_mask)
776 return ldlm_pools_shrink(LDLM_NAMESPACE_SERVER, nr, gfp_mask);
779 static int ldlm_pools_cli_shrink(int nr, unsigned int gfp_mask)
781 return ldlm_pools_shrink(LDLM_NAMESPACE_CLIENT, nr, gfp_mask);
/* Periodic pass over all namespaces on side @client. On the server:
 * first size "modest" pools from their granted history, then split the
 * remaining host limit equally among "greedy" pools (or among all pools
 * if modest ones consumed over 2/3 of the limit). Finally run
 * ldlm_pool_recalc() on every namespace, round-robin, without holding
 * the namespace list lock across the recalc call. */
784 void ldlm_pools_recalc(ldlm_side_t client)
786 __u32 nr_l = 0, nr_p = 0, l;
787 struct ldlm_namespace *ns;
788 int rc, nr, equal = 0;
790 /* Check all modest namespaces. */
791 mutex_down(ldlm_namespace_lock(client));
792 list_for_each_entry(ns, ldlm_namespace_list(client), ns_list_chain) {
793 if (ns->ns_appetite != LDLM_NAMESPACE_MODEST)
796 if (client == LDLM_NAMESPACE_SERVER) {
797 l = ldlm_pool_granted(&ns->ns_pool);
801 /* Set the modest pools limit equal to their avg granted
/* Add a safety margin (rounded up) on top of the granted count. */
803 l += dru(l * LDLM_POOLS_MODEST_MARGIN, 100);
804 ldlm_pool_setup(&ns->ns_pool, l);
810 /* Make sure that modest namespaces did not eat more that 2/3 of limit */
811 if (nr_l >= 2 * (LDLM_POOL_HOST_L / 3)) {
812 CWARN("Modest pools eat out 2/3 of locks limit. %d of %lu. "
813 "Upgrade server!\n", nr_l, LDLM_POOL_HOST_L);
817 /* The rest is given to greedy namespaces. */
818 list_for_each_entry(ns, ldlm_namespace_list(client), ns_list_chain) {
819 if (!equal && ns->ns_appetite != LDLM_NAMESPACE_GREEDY)
822 if (client == LDLM_NAMESPACE_SERVER) {
824 /* In the case 2/3 locks are eaten out by
825 * modest pools, we re-setup equal limit
826 * for _all_ pools. */
827 l = LDLM_POOL_HOST_L /
828 atomic_read(ldlm_namespace_nr(client));
830 /* All the rest of greedy pools will have
831 * all locks in equal parts.*/
832 l = (LDLM_POOL_HOST_L - nr_l) /
833 (atomic_read(ldlm_namespace_nr(client)) -
836 ldlm_pool_setup(&ns->ns_pool, l);
839 mutex_up(ldlm_namespace_lock(client));
841 /* Recalc at least ldlm_namespace_nr(client) namespaces. */
842 for (nr = atomic_read(ldlm_namespace_nr(client)); nr > 0; nr--) {
843 /* Lock the list, get first @ns in the list, getref, move it
844 * to the tail, unlock and call pool recalc. This way we avoid
845 * calling recalc under @ns lock what is really good as we get
846 * rid of potential deadlock on client nodes when canceling
847 * locks synchronously. */
848 mutex_down(ldlm_namespace_lock(client));
849 if (list_empty(ldlm_namespace_list(client))) {
850 mutex_up(ldlm_namespace_lock(client));
853 ns = ldlm_namespace_first(client);
854 ldlm_namespace_get(ns);
855 ldlm_namespace_move(ns, client);
856 mutex_up(ldlm_namespace_lock(client));
858 /* After setup is done - recalc the pool. */
859 rc = ldlm_pool_recalc(&ns->ns_pool);
861 CERROR("%s: pool recalculation error "
862 "%d\n", ns->ns_pool.pl_name, rc);
864 ldlm_namespace_put(ns, 1);
/* Body of the "ldlm_poold" kernel thread: daemonize, signal the starter
 * that we are running, then loop recalculating client and server pools
 * each tick, sleeping LDLM_POOLS_THREAD_PERIOD between passes or until
 * woken by SVC_EVENT / told to stop via SVC_STOPPING. Exits through
 * ldlm_pools_comp so the stop path can wait for full teardown. */
869 static int ldlm_pools_thread_main(void *arg)
871 struct ptlrpc_thread *thread = (struct ptlrpc_thread *)arg;
872 char *t_name = "ldlm_poold";
875 cfs_daemonize(t_name);
876 thread->t_flags = SVC_RUNNING;
877 cfs_waitq_signal(&thread->t_ctl_waitq);
879 CDEBUG(D_DLMTRACE, "%s: pool thread starting, process %d\n",
880 t_name, cfs_curproc_pid());
883 struct l_wait_info lwi;
885 /* Recal all pools on this tick. */
886 ldlm_pools_recalc(LDLM_NAMESPACE_CLIENT);
887 ldlm_pools_recalc(LDLM_NAMESPACE_SERVER);
889 /* Wait until the next check time, or until we're
891 lwi = LWI_TIMEOUT(cfs_time_seconds(LDLM_POOLS_THREAD_PERIOD),
893 l_wait_event(thread->t_ctl_waitq, (thread->t_flags &
894 (SVC_STOPPING|SVC_EVENT)),
897 if (thread->t_flags & SVC_STOPPING) {
898 thread->t_flags &= ~SVC_STOPPING;
900 } else if (thread->t_flags & SVC_EVENT) {
901 thread->t_flags &= ~SVC_EVENT;
905 thread->t_flags = SVC_STOPPED;
906 cfs_waitq_signal(&thread->t_ctl_waitq);
908 CDEBUG(D_DLMTRACE, "%s: pool thread exiting, process %d\n",
909 t_name, cfs_curproc_pid());
911 complete_and_exit(&ldlm_pools_comp, 0);
/* Spawn the singleton pools thread (idempotent if already running):
 * allocate the thread descriptor, fork the kernel thread, then block
 * until it reports SVC_RUNNING. On fork failure the descriptor is
 * freed and the global pointer reset. */
914 static int ldlm_pools_thread_start(ldlm_side_t client)
916 struct l_wait_info lwi = { 0 };
920 if (ldlm_pools_thread != NULL)
923 OBD_ALLOC_PTR(ldlm_pools_thread);
924 if (ldlm_pools_thread == NULL)
927 ldlm_pools_thread->t_id = client;
928 init_completion(&ldlm_pools_comp);
929 cfs_waitq_init(&ldlm_pools_thread->t_ctl_waitq);
931 /* CLONE_VM and CLONE_FILES just avoid a needless copy, because we
932 * just drop the VM and FILES in ptlrpc_daemonize() right away. */
933 rc = cfs_kernel_thread(ldlm_pools_thread_main, ldlm_pools_thread,
934 CLONE_VM | CLONE_FILES);
936 CERROR("Can't start pool thread, error %d\n",
938 OBD_FREE(ldlm_pools_thread, sizeof(*ldlm_pools_thread));
939 ldlm_pools_thread = NULL;
/* Wait for the thread to confirm startup before returning. */
942 l_wait_event(ldlm_pools_thread->t_ctl_waitq,
943 (ldlm_pools_thread->t_flags & SVC_RUNNING), &lwi);
/* Stop the pools thread: raise SVC_STOPPING, wake it, and wait on the
 * completion before freeing the descriptor so the thread never touches
 * freed memory. No-op if the thread was never started. */
947 static void ldlm_pools_thread_stop(void)
951 if (ldlm_pools_thread == NULL) {
956 ldlm_pools_thread->t_flags = SVC_STOPPING;
957 cfs_waitq_signal(&ldlm_pools_thread->t_ctl_waitq);
959 /* Make sure that pools thread is finished before freeing @thread.
960 * This fixes possible race and oops due to accessing freed memory
961 * in pools thread. */
962 wait_for_completion(&ldlm_pools_comp);
963 OBD_FREE_PTR(ldlm_pools_thread);
964 ldlm_pools_thread = NULL;
/* Subsystem init for side @client: start the pools thread, then
 * register the server and client VM shrinkers so memory pressure can
 * trigger lock cancellation. */
968 int ldlm_pools_init(ldlm_side_t client)
973 rc = ldlm_pools_thread_start(client);
975 ldlm_pools_srv_shrinker = set_shrinker(DEFAULT_SEEKS,
976 ldlm_pools_srv_shrink);
977 ldlm_pools_cli_shrinker = set_shrinker(DEFAULT_SEEKS,
978 ldlm_pools_cli_shrink);
982 EXPORT_SYMBOL(ldlm_pools_init);
/* Subsystem teardown: unregister both shrinkers (idempotent via NULL
 * checks) and stop the pools thread. */
984 void ldlm_pools_fini(void)
986 if (ldlm_pools_srv_shrinker != NULL) {
987 remove_shrinker(ldlm_pools_srv_shrinker);
988 ldlm_pools_srv_shrinker = NULL;
990 if (ldlm_pools_cli_shrinker != NULL) {
991 remove_shrinker(ldlm_pools_cli_shrinker);
992 ldlm_pools_cli_shrinker = NULL;
994 ldlm_pools_thread_stop();
996 EXPORT_SYMBOL(ldlm_pools_fini);
997 #endif /* __KERNEL__ */
999 #else /* !HAVE_LRU_RESIZE_SUPPORT */
/* LRU resize disabled at build time: export the same public interface
 * as no-op stubs so callers need no conditional compilation. Bodies
 * are elided in this excerpt; the functions return trivial values. */
1000 int ldlm_pool_setup(struct ldlm_pool *pl, __u32 limit)
1004 EXPORT_SYMBOL(ldlm_pool_setup);
1006 int ldlm_pool_recalc(struct ldlm_pool *pl)
1010 EXPORT_SYMBOL(ldlm_pool_recalc);
1012 int ldlm_pool_shrink(struct ldlm_pool *pl,
1013 int nr, unsigned int gfp_mask)
1017 EXPORT_SYMBOL(ldlm_pool_shrink);
1019 int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns,
1020 int idx, ldlm_side_t client)
1024 EXPORT_SYMBOL(ldlm_pool_init);
1026 void ldlm_pool_fini(struct ldlm_pool *pl)
1030 EXPORT_SYMBOL(ldlm_pool_fini);
1032 void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock)
1036 EXPORT_SYMBOL(ldlm_pool_add);
1038 void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock)
1042 EXPORT_SYMBOL(ldlm_pool_del);
1044 __u64 ldlm_pool_get_slv(struct ldlm_pool *pl)
1048 EXPORT_SYMBOL(ldlm_pool_get_slv);
1050 void ldlm_pool_set_slv(struct ldlm_pool *pl, __u64 slv)
1054 EXPORT_SYMBOL(ldlm_pool_set_slv);
1056 __u32 ldlm_pool_get_limit(struct ldlm_pool *pl)
1060 EXPORT_SYMBOL(ldlm_pool_get_limit);
1062 void ldlm_pool_set_limit(struct ldlm_pool *pl, __u32 limit)
1066 EXPORT_SYMBOL(ldlm_pool_set_limit);
1068 int ldlm_pools_init(ldlm_side_t client)
1072 EXPORT_SYMBOL(ldlm_pools_init);
1074 void ldlm_pools_fini(void)
1078 EXPORT_SYMBOL(ldlm_pools_fini);
1080 void ldlm_pools_wakeup(void)
1084 EXPORT_SYMBOL(ldlm_pools_wakeup);
1086 void ldlm_pools_recalc(ldlm_side_t client)
1090 EXPORT_SYMBOL(ldlm_pools_recalc);
1091 #endif /* HAVE_LRU_RESIZE_SUPPORT */