4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved
24 * Use is subject to license terms.
26 * Copyright (c) 2012, 2017, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
32 * lustre/lod/lod_pool.c
36 * This file provides code related to the Logical Object Device (LOD)
37 * handling of OST Pools on the MDT. Pools are named lists of targets
38 * that allow userspace to group targets that share a particular property
39 * together so that users or kernel helpers can make decisions about file
40 * allocation based on these properties. For example, pools could be
41 * defined based on fault domains (e.g. separate racks of server nodes) so
42 * that RAID-1 mirroring could select targets from independent fault
43 * domains, or pools could define target performance characteristics so
44 * that applications could select IOP-optimized storage or stream-optimized
45 * storage for a particular output file.
47 * This file handles creation, lookup, and removal of pools themselves, as
48 * well as adding and removing targets to pools. It also handles lprocfs
49 * display of configured pool. The pools are accessed by name in the pool
50 * hash, and are refcounted to ensure proper pool structure lifetimes.
52 * Author: Jacques-Charles LAFOUCRIERE <jc.lafoucriere@cea.fr>
53 * Author: Alex Lyashkov <Alexey.Lyashkov@Sun.COM>
54 * Author: Nathaniel Rutman <Nathan.Rutman@Sun.COM>
57 #define DEBUG_SUBSYSTEM S_LOV
59 #include <libcfs/libcfs.h>
60 #include <libcfs/linux/linux-hash.h>
61 #include <libcfs/linux/linux-fs.h>
63 #include "lod_internal.h"
/*
 * Resolve member \a _i of pool \a _p to its OST target descriptor.
 * op_array[_i] is handed to OST_TGT() together with the LOD device that
 * owns the pool.
 */
#define pool_tgt(_p, _i)						\
	OST_TGT(lu2lod_dev((_p)->pool_lobd->obd_lu_dev),		\
		(_p)->pool_obds.op_array[_i])
69 * Get a reference on the specified pool.
71 * To ensure the pool descriptor is not freed before the caller is finished
72 * with it. Any process that is accessing \a pool directly needs to hold
73 * reference on it, including /proc since a userspace thread may be holding
74 * the /proc file open and busy in the kernel.
76 * \param[in] pool pool descriptor on which to gain reference
78 static void pool_getref(struct lod_pool_desc *pool)
80 CDEBUG(D_INFO, "pool %p\n", pool);
81 kref_get(&pool->pool_refcount);
84 static void lod_pool_putref_free(struct kref *kref)
86 struct lod_pool_desc *pool = container_of(kref, struct lod_pool_desc,
89 LASSERT(list_empty(&pool->pool_list));
90 LASSERT(pool->pool_proc_entry == NULL);
91 lu_tgt_pool_free(&(pool->pool_rr.lqr_pool));
92 lu_tgt_pool_free(&(pool->pool_obds));
93 kfree_rcu(pool, pool_rcu);
98 * Drop a reference on the specified pool and free its memory if needed.
100 * One reference is held by the LOD OBD device while it is configured, from
101 * the time the configuration log defines the pool until the time when it is
102 * dropped when the LOD OBD is cleaned up or the pool is deleted. This means
103 * that the pool will not be freed while the LOD device is configured, unless
104 * it is explicitly destroyed by the sysadmin. The pool structure is freed
105 * after the last reference on the structure is released.
107 * \param[in] pool lod pool descriptor to drop reference on and possibly
110 void lod_pool_putref(struct lod_pool_desc *pool)
112 CDEBUG(D_INFO, "pool %p\n", pool);
113 kref_put(&pool->pool_refcount, lod_pool_putref_free);
116 static u32 pool_hashfh(const void *data, u32 len, u32 seed)
118 const char *pool_name = data;
120 return hashlen_hash(cfs_hashlen_string((void *)(unsigned long)seed,
124 static int pool_cmpfn(struct rhashtable_compare_arg *arg, const void *obj)
126 const struct lod_pool_desc *pool = obj;
127 const char *pool_name = arg->key;
129 return strcmp(pool_name, pool->pool_name);
132 static const struct rhashtable_params pools_hash_params = {
133 .key_len = 1, /* actually variable */
134 .key_offset = offsetof(struct lod_pool_desc, pool_name),
135 .head_offset = offsetof(struct lod_pool_desc, pool_hash),
136 .hashfn = pool_hashfh,
137 .obj_cmpfn = pool_cmpfn,
138 .automatic_shrinking = true,
/*
 * Methods for /proc seq_file iteration of the defined pools.
 */

/* Marker stored in lpi_magic so that an iterator can be recognised. */
#define POOL_IT_MAGIC 0xB001CEA0

/* Per-open iteration state over the targets of a single pool. */
struct lod_pool_iterator {
	unsigned int		 lpi_magic;	/* POOL_IT_MAGIC */
	unsigned int		 lpi_idx;	/* from 0 to pool_tgt_size - 1 */
	struct lod_pool_desc	*lpi_pool;
};
153 * Return the next configured target within one pool for seq_file iteration.
155 * Iterator is used to go through the target entries of a single pool
156 * (i.e. the list of OSTs configured for a named pool).
157 * lpi_idx is the current target index in the pool's op_array[].
159 * The return type is a void * because this function is one of the
160 * struct seq_operations methods and must match the function template.
162 * \param[in] seq /proc sequence file iteration tracking structure
163 * \param[in] v unused
164 * \param[in] pos position within iteration; 0 to number of targets - 1
166 * \retval struct pool_iterator of the next pool descriptor
168 static void *pool_proc_next(struct seq_file *seq, void *v, loff_t *pos)
170 struct lod_pool_iterator *iter = seq->private;
173 LASSERTF(iter->lpi_magic == POOL_IT_MAGIC, "%08X\n", iter->lpi_magic);
176 /* test if end of file */
177 if (*pos > pool_tgt_count(iter->lpi_pool))
180 CFS_FAIL_TIMEOUT(OBD_FAIL_OST_LIST_ASSERT, cfs_fail_val);
182 /* iterate to find a non empty entry */
183 prev_idx = iter->lpi_idx;
185 if (iter->lpi_idx >= pool_tgt_count(iter->lpi_pool)) {
186 iter->lpi_idx = prev_idx; /* we stay on the last entry */
190 /* return != NULL to continue */
195 * Start seq_file iteration via /proc for a single pool.
197 * The \a pos parameter may be non-zero, indicating that the iteration
198 * is starting at some offset in the target list. Use the seq_file
199 * private field to memorize the iterator so we can free it at stop().
200 * Need to restore the private pointer to the pool before freeing it.
202 * \param[in] seq new sequence file structure to initialize
203 * \param[in] pos initial target number at which to start iteration
205 * \retval initialized pool iterator private structure
206 * \retval NULL if \a pos exceeds the number of targets in \a pool
207 * \retval negative error number on failure
209 static void *pool_proc_start(struct seq_file *seq, loff_t *pos)
211 struct lod_pool_desc *pool = seq->private;
212 struct lod_pool_iterator *iter;
215 if ((pool_tgt_count(pool) == 0) ||
216 (*pos >= pool_tgt_count(pool))) {
217 /* iter is not created, so stop() has no way to
218 * find pool to dec ref */
219 lod_pool_putref(pool);
225 return ERR_PTR(-ENOMEM);
226 iter->lpi_magic = POOL_IT_MAGIC;
227 iter->lpi_pool = pool;
231 down_read(&pool_tgt_rw_sem(pool));
238 ptr = pool_proc_next(seq, &iter, &i);
239 } while ((i < *pos) && (ptr != NULL));
248 * Finish seq_file iteration for a single pool.
250 * Once iteration has been completed, the pool_iterator struct must be
251 * freed, and the seq_file private pointer restored to the pool, as it
252 * was initially when pool_proc_start() was called.
254 * In some cases the stop() method may be called 2 times, without calling
255 * the start() method (see seq_read() from fs/seq_file.c). We have to free
256 * the private iterator struct only if seq->private points to the iterator.
258 * \param[in] seq sequence file structure to clean up
259 * \param[in] v (unused)
261 static void pool_proc_stop(struct seq_file *seq, void *v)
263 struct lod_pool_iterator *iter = seq->private;
265 if (iter != NULL && iter->lpi_magic == POOL_IT_MAGIC) {
266 up_read(&pool_tgt_rw_sem(iter->lpi_pool));
267 seq->private = iter->lpi_pool;
268 lod_pool_putref(iter->lpi_pool);
274 * Print out one target entry from the pool for seq_file iteration.
276 * The currently referenced pool target is given by op_array[lpi_idx].
278 * \param[in] seq new sequence file structure to initialize
279 * \param[in] v (unused)
281 static int pool_proc_show(struct seq_file *seq, void *v)
283 struct lod_pool_iterator *iter = v;
284 struct lod_tgt_desc *tgt;
286 LASSERTF(iter->lpi_magic == POOL_IT_MAGIC, "%08X\n", iter->lpi_magic);
287 LASSERT(iter->lpi_pool != NULL);
288 LASSERT(iter->lpi_idx <= pool_tgt_count(iter->lpi_pool));
290 tgt = pool_tgt(iter->lpi_pool, iter->lpi_idx);
292 seq_printf(seq, "%s\n", obd_uuid2str(&(tgt->ltd_uuid)));
297 static const struct seq_operations pool_proc_ops = {
298 .start = pool_proc_start,
299 .next = pool_proc_next,
300 .stop = pool_proc_stop,
301 .show = pool_proc_show,
305 * Open a new /proc file for seq_file iteration of targets in one pool.
307 * Initialize the seq_file private pointer to reference the pool.
309 * \param inode inode to store iteration state for /proc
310 * \param file file descriptor to store iteration methods
312 * \retval 0 for success
313 * \retval negative error number on failure
315 static int pool_proc_open(struct inode *inode, struct file *file)
319 rc = seq_open(file, &pool_proc_ops);
321 struct seq_file *seq = file->private_data;
322 seq->private = pde_data(inode);
327 const static struct proc_ops pool_proc_operations = {
328 .proc_open = pool_proc_open,
329 .proc_read = seq_read,
330 .proc_lseek = seq_lseek,
331 .proc_release = seq_release,
/*
 * Per-element callback for rhashtable_free_and_destroy(): drop one
 * reference on the pool being removed from the table. \a data is unused.
 */
static void pools_hash_exit(void *vpool, void *data)
{
	lod_pool_putref((struct lod_pool_desc *)vpool);
}
341 int lod_pool_hash_init(struct rhashtable *tbl)
343 return rhashtable_init(tbl, &pools_hash_params);
346 void lod_pool_hash_destroy(struct rhashtable *tbl)
348 rhashtable_free_and_destroy(tbl, pools_hash_exit, NULL);
351 bool lod_pool_exists(struct lod_device *lod, char *poolname)
353 struct lod_pool_desc *pool;
356 pool = rhashtable_lookup(&lod->lod_pools_hash_body,
363 struct lod_pool_desc *lod_pool_find(struct lod_device *lod, const char *poolname)
365 struct lod_pool_desc *pool;
368 pool = rhashtable_lookup(&lod->lod_pools_hash_body,
371 if (pool && !kref_get_unless_zero(&pool->pool_refcount))
377 static int lod_ost_pool_weights_seq_show(struct seq_file *m, void *data)
379 struct lod_pool_desc *pool = m->private;
380 struct lod_device *lod = lu2lod_dev(pool->pool_lobd->obd_lu_dev);
382 return lod_tgt_weights_seq_show(m, lod, &pool->pool_obds, false);
386 lod_ost_pool_weights_seq_write(struct file *file, const char __user *buf,
387 size_t count, loff_t *off)
389 struct seq_file *m = file->private_data;
390 struct lod_pool_desc *pool = m->private;
391 struct lod_device *lod = lu2lod_dev(pool->pool_lobd->obd_lu_dev);
393 return lod_tgt_weights_seq_write(m, buf, count, lod, &pool->pool_obds,
396 LDEBUGFS_SEQ_FOPS(lod_ost_pool_weights);
398 static struct ldebugfs_vars ldebugfs_lod_pool_vars[] = {
399 { .name = "qos_ost_weights",
400 .fops = &lod_ost_pool_weights_fops,
406 * Allocate a new pool for the specified device.
408 * Allocate a new pool_desc structure for the specified \a new_pool
409 * device to create a pool with the given \a poolname. The new pool
410 * structure is created with a single reference, and is freed when the
411 * reference count drops to zero.
413 * \param[in] obd Lustre OBD device on which to add a pool iterator
414 * \param[in] poolname the name of the pool to be created
416 * \retval 0 in case of success
417 * \retval negative error code in case of error
419 int lod_pool_new(struct obd_device *obd, char *poolname)
421 struct lod_device *lod = lu2lod_dev(obd->obd_lu_dev);
422 struct lod_pool_desc *new_pool;
426 if (strlen(poolname) > LOV_MAXPOOLNAME)
427 RETURN(-ENAMETOOLONG);
429 /* OBD_ALLOC_* doesn't work with direct kfree_rcu use */
430 new_pool = kmalloc(sizeof(*new_pool), __GFP_ZERO);
431 if (new_pool == NULL)
434 strscpy(new_pool->pool_name, poolname, sizeof(new_pool->pool_name));
435 new_pool->pool_spill_target[0] = '\0';
436 atomic_set(&new_pool->pool_spill_hit, 0);
437 new_pool->pool_lobd = obd;
438 kref_init(&new_pool->pool_refcount);
439 rc = lu_tgt_pool_init(&new_pool->pool_obds, 0);
441 GOTO(out_free_pool, rc);
443 lu_qos_rr_init(&new_pool->pool_rr);
445 rc = lu_tgt_pool_init(&new_pool->pool_rr.lqr_pool, 0);
447 GOTO(out_free_pool_obds, rc);
449 #ifdef CONFIG_PROC_FS
450 pool_getref(new_pool);
451 new_pool->pool_proc_entry = lprocfs_add_simple(lod->lod_pool_proc_entry,
453 &pool_proc_operations);
454 if (IS_ERR(new_pool->pool_proc_entry)) {
455 CDEBUG(D_CONFIG, "%s: cannot add proc entry "LOV_POOLNAMEF"\n",
456 obd->obd_name, poolname);
457 new_pool->pool_proc_entry = NULL;
458 lod_pool_putref(new_pool);
461 pool_getref(new_pool);
462 new_pool->pool_spill_proc_entry =
463 lprocfs_register(poolname, lod->lod_spill_proc_entry,
464 lprocfs_lod_spill_vars, new_pool);
465 if (IS_ERR(new_pool->pool_spill_proc_entry)) {
466 rc = PTR_ERR(new_pool->pool_spill_proc_entry);
467 new_pool->pool_proc_entry = NULL;
468 lod_pool_putref(new_pool);
471 CDEBUG(D_INFO, "pool %p - proc %p\n", new_pool,
472 new_pool->pool_proc_entry);
475 spin_lock(&obd->obd_dev_lock);
476 list_add_tail(&new_pool->pool_list, &lod->lod_pool_list);
477 lod->lod_pool_count++;
478 spin_unlock(&obd->obd_dev_lock);
480 /* Add to hash table only when it is fully ready. */
481 rc = rhashtable_lookup_insert_fast(&lod->lod_pools_hash_body,
482 &new_pool->pool_hash,
487 * Hide -E2BIG and -EBUSY which
494 new_pool->pool_debugfs = debugfs_create_dir(poolname,
495 lod->lod_pool_debugfs);
496 ldebugfs_add_vars(new_pool->pool_debugfs, ldebugfs_lod_pool_vars,
499 CDEBUG(D_CONFIG, LOV_POOLNAMEF" is pool #%d\n",
500 poolname, lod->lod_pool_count);
505 spin_lock(&obd->obd_dev_lock);
506 list_del_init(&new_pool->pool_list);
507 lod->lod_pool_count--;
508 spin_unlock(&obd->obd_dev_lock);
510 lprocfs_remove(&new_pool->pool_spill_proc_entry);
511 lprocfs_remove(&new_pool->pool_proc_entry);
513 lu_tgt_pool_free(&new_pool->pool_rr.lqr_pool);
515 lu_tgt_pool_free(&new_pool->pool_obds);
517 OBD_FREE_PTR(new_pool);
522 * Remove the named pool from the OBD device.
524 * \param[in] obd OBD device on which pool was previously created
525 * \param[in] poolname name of pool to remove from \a obd
527 * \retval 0 on successfully removing the pool
528 * \retval negative error numbers for failures
530 int lod_pool_del(struct obd_device *obd, char *poolname)
532 struct lod_device *lod = lu2lod_dev(obd->obd_lu_dev);
533 struct lod_pool_desc *pool;
536 /* lookup and kill hash reference */
538 pool = rhashtable_lookup(&lod->lod_pools_hash_body, poolname,
540 if (pool && rhashtable_remove_fast(&lod->lod_pools_hash_body,
542 pools_hash_params) != 0)
548 debugfs_remove_recursive(pool->pool_debugfs);
550 if (pool->pool_proc_entry != NULL) {
551 CDEBUG(D_INFO, "proc entry %p\n", pool->pool_proc_entry);
552 lprocfs_remove(&pool->pool_proc_entry);
553 lod_pool_putref(pool);
555 if (pool->pool_spill_proc_entry != NULL) {
556 CDEBUG(D_INFO, "proc entry %p\n", pool->pool_spill_proc_entry);
557 lprocfs_remove(&pool->pool_spill_proc_entry);
558 lod_pool_putref(pool);
561 spin_lock(&obd->obd_dev_lock);
562 list_del_init(&pool->pool_list);
563 lod->lod_pool_count--;
564 spin_unlock(&obd->obd_dev_lock);
566 /* release last reference */
567 lod_pool_putref(pool);
573 * Add a single target device to the named pool.
575 * Add the target specified by \a ostname to the specified \a poolname.
577 * \param[in] obd OBD device on which to add the pool
578 * \param[in] poolname name of the pool to which to add the target \a ostname
579 * \param[in] ostname name of the target device to be added
581 * \retval 0 if \a ostname was (previously) added to the named pool
582 * \retval negative error number on failure
584 int lod_pool_add(struct obd_device *obd, char *poolname, char *ostname)
586 struct lod_device *lod = lu2lod_dev(obd->obd_lu_dev);
587 struct obd_uuid ost_uuid;
588 struct lod_pool_desc *pool;
589 struct lu_tgt_desc *tgt;
593 pool = lod_pool_find(lod, poolname);
597 obd_str2uuid(&ost_uuid, ostname);
599 /* search ost in lod array */
600 lod_getref(&lod->lod_ost_descs);
601 lod_foreach_ost(lod, tgt) {
602 if (obd_uuid_equals(&ost_uuid, &tgt->ltd_uuid)) {
611 rc = lu_tgt_pool_add(&pool->pool_obds, tgt->ltd_index,
616 set_bit(LQ_DIRTY, &pool->pool_rr.lqr_flags);
618 CDEBUG(D_CONFIG, "Added %s to "LOV_POOLNAMEF" as member %d\n",
619 ostname, poolname, pool_tgt_count(pool));
623 lod_putref(lod, &lod->lod_ost_descs);
624 lod_pool_putref(pool);
629 * Remove the named target from the specified pool.
631 * Remove one target named \a ostname from \a poolname. The \a ostname
632 * is searched for in the lod_device lod_ost_bitmap array, to ensure the
633 * specified name actually exists in the pool.
635 * \param[in] obd OBD device from which to remove \a poolname
636 * \param[in] poolname name of the pool to be changed
637 * \param[in] ostname name of the target to remove from \a poolname
639 * \retval 0 on successfully removing \a ostname from the pool
640 * \retval negative number on error (e.g. \a ostname not in pool)
642 int lod_pool_remove(struct obd_device *obd, char *poolname, char *ostname)
644 struct lod_device *lod = lu2lod_dev(obd->obd_lu_dev);
645 struct lu_tgt_desc *ost;
646 struct obd_uuid ost_uuid;
647 struct lod_pool_desc *pool;
651 /* lookup and kill hash reference */
652 pool = lod_pool_find(lod, poolname);
656 obd_str2uuid(&ost_uuid, ostname);
658 lod_getref(&lod->lod_ost_descs);
659 lod_foreach_ost(lod, ost) {
660 if (obd_uuid_equals(&ost_uuid, &ost->ltd_uuid)) {
666 /* test if ost found in lod array */
670 lu_tgt_pool_remove(&pool->pool_obds, ost->ltd_index);
671 set_bit(LQ_DIRTY, &pool->pool_rr.lqr_flags);
673 CDEBUG(D_CONFIG, "%s removed from "LOV_POOLNAMEF"\n", ostname,
678 lod_putref(lod, &lod->lod_ost_descs);
679 lod_pool_putref(pool);
684 * Check if the specified target exists in the pool.
686 * The caller may not have a reference on \a pool if it got the pool without
687 * calling lod_find_pool() (e.g. directly from the lod pool list)
689 * \param[in] idx Target index to check
690 * \param[in] pool Pool in which to check if target is added.
692 * \retval 0 successfully found index in \a pool
693 * \retval negative error if device not found in \a pool
695 int lod_check_index_in_pool(__u32 idx, struct lod_pool_desc *pool)
700 rc = lu_tgt_check_index(idx, &pool->pool_obds);
701 lod_pool_putref(pool);
706 * Find the pool descriptor for the specified pool and return it with a
707 * reference to the caller if found.
709 * \param[in] lod LOD on which the pools are configured
710 * \param[in] poolname NUL-terminated name of the pool
712 * \retval pointer to pool descriptor on success
713 * \retval NULL if \a poolname could not be found or poolname is empty
715 struct lod_pool_desc *lod_find_pool(struct lod_device *lod, const char *poolname)
717 struct lod_pool_desc *pool;
719 if (poolname[0] == '\0' || lov_pool_is_reserved(poolname))
722 pool = lod_pool_find(lod, poolname);
725 "%s: request for an unknown pool (" LOV_POOLNAMEF ")\n",
726 lod->lod_child_exp->exp_obd->obd_name, poolname);
727 if (pool != NULL && pool_tgt_count(pool) == 0) {
728 CDEBUG(D_CONFIG, "%s: request for an empty pool ("
730 lod->lod_child_exp->exp_obd->obd_name, poolname);
731 /* pool is ignored, so we remove ref on it */
732 lod_pool_putref(pool);
739 void lod_spill_target_refresh(const struct lu_env *env, struct lod_device *lod,
740 struct lod_pool_desc *pool)
742 __u64 avail_bytes = 0, total_bytes = 0;
743 struct lu_tgt_pool *osts;
746 if (ktime_get_seconds() < pool->pool_spill_expire)
749 if (pool->pool_spill_threshold_pct == 0)
752 lod_qos_statfs_update(env, lod, &lod->lod_ost_descs);
754 down_write(&pool_tgt_rw_sem(pool));
755 if (ktime_get_seconds() < pool->pool_spill_expire)
757 pool->pool_spill_expire = ktime_get_seconds() +
758 lod->lod_ost_descs.ltd_lov_desc.ld_qos_maxage;
760 osts = &(pool->pool_obds);
761 for (i = 0; i < osts->op_count; i++) {
762 int idx = osts->op_array[i];
763 struct lod_tgt_desc *tgt;
764 struct obd_statfs *sfs;
766 if (!test_bit(idx, lod->lod_ost_bitmap))
768 tgt = OST_TGT(lod, idx);
769 if (!tgt->ltd_active)
771 sfs = &tgt->ltd_statfs;
773 avail_bytes += sfs->os_bavail * sfs->os_bsize;
774 total_bytes += sfs->os_blocks * sfs->os_bsize;
776 if (total_bytes - avail_bytes >=
777 total_bytes * pool->pool_spill_threshold_pct / 100)
778 pool->pool_spill_is_active = true;
780 pool->pool_spill_is_active = false;
783 up_write(&pool_tgt_rw_sem(pool));
787 * XXX: consider a better schema to detect loops
789 void lod_check_and_spill_pool(const struct lu_env *env, struct lod_device *lod,
792 struct lod_pool_desc *pool;
794 if (!poolname || !*poolname || (*poolname)[0] == '\0')
797 pool = lod_pool_find(lod, *poolname);
801 lod_spill_target_refresh(env, lod, pool);
802 if (pool->pool_spill_is_active) {
803 lod_set_pool(poolname, pool->pool_spill_target);
804 atomic_inc(&pool->pool_spill_hit);
805 lod_pool_putref(pool);
809 lod_pool_putref(pool);