X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Finclude%2Flu_object.h;h=9ceabafb2636f8723d078220fd78cbe574ec8a39;hp=ab41873bea3c375e7ec6d7e7e1eeac4392e93259;hb=c04adbcd76725a360f411f09c63df785bf7db426;hpb=fa14bdf6b648d1d4023a4fa88789059d185f4a07 diff --git a/lustre/include/lu_object.h b/lustre/include/lu_object.h index ab41873..9ceabaf 100644 --- a/lustre/include/lu_object.h +++ b/lustre/include/lu_object.h @@ -23,26 +23,32 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. */ #ifndef __LUSTRE_LU_OBJECT_H #define __LUSTRE_LU_OBJECT_H +#ifdef HAVE_LINUX_STDARG_HEADER +#include +#else #include +#endif #include -#include +#include #include #include +#include +#include struct seq_file; struct proc_dir_entry; struct lustre_cfg; struct lprocfs_stats; +struct obd_type; /** \defgroup lu lu * lu_* data-types represent server-side entities shared by data and meta-data @@ -99,6 +105,7 @@ struct lu_device; struct lu_object_header; struct lu_context; struct lu_env; +struct lu_name; /** * Operations common for data and meta-data devices. @@ -157,6 +164,26 @@ struct lu_device_operations { struct lu_device *parent, struct lu_device *dev); + + /** + * Allocate new FID for file with @name under @parent + * + * \param[in] env execution environment for this thread + * \param[in] dev dt device + * \param[out] fid new FID allocated + * \param[in] parent parent object + * \param[in] name lu_name + * + * \retval 0 0 FID allocated successfully. + * \retval 1 1 FID allocated successfully and new sequence + * requested from seq meta server + * \retval negative negative errno if FID allocation failed. 
+ */ + int (*ldo_fid_alloc)(const struct lu_env *env, + struct lu_device *dev, + struct lu_fid *fid, + struct lu_object *parent, + const struct lu_name *name); }; /** @@ -222,12 +249,13 @@ struct lu_object_operations { */ void (*loo_object_delete)(const struct lu_env *env, struct lu_object *o); - /** - * Dual to lu_device_operations::ldo_object_alloc(). Called when - * object is removed from memory. - */ - void (*loo_object_free)(const struct lu_env *env, - struct lu_object *o); + /** + * Dual to lu_device_operations::ldo_object_alloc(). Called when + * object is removed from memory. Must use call_rcu or kfree_rcu + * if the object contains an lu_object_header. + */ + void (*loo_object_free)(const struct lu_env *env, + struct lu_object *o); /** * Called when last active reference to the object is released (and * object returns to the cache). This method is optional. @@ -295,12 +323,12 @@ struct lu_device_type_operations; * device types. */ enum lu_device_tag { - /** this is meta-data device */ - LU_DEVICE_MD = (1 << 0), - /** this is data device */ - LU_DEVICE_DT = (1 << 1), - /** data device in the client stack */ - LU_DEVICE_CL = (1 << 2) + /** this is meta-data device */ + LU_DEVICE_MD = BIT(0), + /** this is data device */ + LU_DEVICE_DT = BIT(1), + /** data device in the client stack */ + LU_DEVICE_CL = BIT(2) }; /** @@ -320,10 +348,6 @@ struct lu_device_type { */ const struct lu_device_type_operations *ldt_ops; /** - * \todo XXX: temporary pointer to associated obd_type. - */ - struct obd_type *ldt_obd_type; - /** * \todo XXX: temporary: context tags used by obd_*() calls. 
*/ __u32 ldt_ctx_tags; @@ -397,56 +421,44 @@ struct lu_attr { * * \see enum la_valid */ - __u64 la_valid; + __u64 la_valid; /** size in bytes */ - __u64 la_size; + __u64 la_size; /** modification time in seconds since Epoch */ s64 la_mtime; /** access time in seconds since Epoch */ s64 la_atime; /** change time in seconds since Epoch */ s64 la_ctime; + /** create time in seconds since Epoch */ + s64 la_btime; /** 512-byte blocks allocated to object */ - __u64 la_blocks; + __u64 la_blocks; /** permission bits and file type */ - __u32 la_mode; + __u32 la_mode; /** owner id */ - __u32 la_uid; + __u32 la_uid; /** group id */ - __u32 la_gid; + __u32 la_gid; /** object flags */ - __u32 la_flags; + __u32 la_flags; /** number of persistent references to this object */ - __u32 la_nlink; + __u32 la_nlink; /** blk bits of the object*/ - __u32 la_blkbits; + __u32 la_blkbits; /** blk size of the object*/ - __u32 la_blksize; + __u32 la_blksize; /** real device */ - __u32 la_rdev; + __u32 la_rdev; /** project id */ - __u32 la_projid; + __u32 la_projid; + /** set layout version to OST objects. */ + __u32 la_layout_version; + /** dirent count */ + __u64 la_dirent_count; }; -/** Bit-mask of valid attributes */ -enum la_valid { - LA_ATIME = 1 << 0, - LA_MTIME = 1 << 1, - LA_CTIME = 1 << 2, - LA_SIZE = 1 << 3, - LA_MODE = 1 << 4, - LA_UID = 1 << 5, - LA_GID = 1 << 6, - LA_BLOCKS = 1 << 7, - LA_TYPE = 1 << 8, - LA_FLAGS = 1 << 9, - LA_NLINK = 1 << 10, - LA_RDEV = 1 << 11, - LA_BLKSIZE = 1 << 12, - LA_KILL_SUID = 1 << 13, - LA_KILL_SGID = 1 << 14, - LA_PROJID = 1 << 15, -}; +#define LU_DIRENT_COUNT_UNSET ~0ULL /** * Layer in the layered object. @@ -484,17 +496,23 @@ enum lu_object_header_flags { /** * Mark this object has already been taken out of cache. */ - LU_OBJECT_UNHASHED = 1, + LU_OBJECT_UNHASHED = 1, + /** + * Object is initialized, when object is found in cache, it may not be + * initialized yet, the object allocator will initialize it.
+ */ LU_OBJECT_INITED = 2, }; enum lu_object_header_attr { - LOHA_EXISTS = 1 << 0, - LOHA_REMOTE = 1 << 1, - /** - * UNIX file type is stored in S_IFMT bits. - */ - LOHA_FT_START = 001 << 12, /**< S_IFIFO */ - LOHA_FT_END = 017 << 12, /**< S_IFMT */ + LOHA_EXISTS = BIT(0), + LOHA_REMOTE = BIT(1), + LOHA_HAS_AGENT_ENTRY = BIT(2), + /** + * UNIX file type is stored in S_IFMT bits. + */ + LOHA_FT_START = 001 << 12, /**< S_IFIFO */ + LOHA_FT_END = 017 << 12, /**< S_IFMT */ }; /** @@ -507,6 +525,8 @@ enum lu_object_header_attr { * it is created for things like not-yet-existing child created by mkdir or * create calls. lu_object_operations::loo_exists() can be used to check * whether object is backed by persistent storage entity. + * Any object containing this structure which might be placed in an + * rhashtable via loh_hash MUST be freed using call_rcu() or kfree_rcu(). */ struct lu_object_header { /** @@ -528,9 +548,9 @@ struct lu_object_header { */ __u32 loh_attr; /** - * Linkage into per-site hash table. Protected by lu_site::ls_guard. + * Linkage into per-site hash table. */ - struct hlist_node loh_hash; + struct rhash_head loh_hash; /** * Linkage into per-site LRU list. Protected by lu_site::ls_guard. */ @@ -544,35 +564,14 @@ struct lu_object_header { * A list of references to this object, for debugging. */ struct lu_ref loh_reference; + /* + * Handle used for kfree_rcu() or similar. + */ + struct rcu_head loh_rcu; }; struct fld; -struct lu_site_bkt_data { - /** - * number of object in this bucket on the lsb_lru list. - */ - long lsb_lru_len; - /** - * LRU list, updated on each access to object. Protected by - * bucket lock of lu_site::ls_obj_hash. - * - * "Cold" end of LRU is lu_site::ls_lru.next. Accessed object are - * moved to the lu_site::ls_lru.prev (this is due to the non-existence - * of list_for_each_entry_safe_reverse()).
- */ - struct list_head lsb_lru; - /** - * Wait-queue signaled when an object in this site is ultimately - * destroyed (lu_object_free()). It is used by lu_object_find() to - * wait before re-trying when object in the process of destruction is - * found in the hash table. - * - * \see htable_lookup(). - */ - wait_queue_head_t lsb_marche_funebre; -}; - enum { LU_SS_CREATED = 0, LU_SS_CACHE_HIT, @@ -597,7 +596,13 @@ struct lu_site { /** * objects hash table */ - struct cfs_hash *ls_obj_hash; + struct rhashtable ls_obj_hash; + /* + * buckets for summary data + */ + struct lu_site_bkt_data *ls_bkts; + int ls_bkt_cnt; + u32 ls_bkt_seed; /** * index of bucket on hash table while purging */ @@ -643,14 +648,8 @@ struct lu_site { struct percpu_counter ls_lru_len_counter; }; -static inline struct lu_site_bkt_data * -lu_site_bkt_from_fid(struct lu_site *site, struct lu_fid *fid) -{ - struct cfs_hash_bd bd; - - cfs_hash_bd_get(site->ls_obj_hash, fid, &bd); - return cfs_hash_bd_extra_get(site->ls_obj_hash, &bd); -} +wait_queue_head_t * +lu_site_wq_from_fid(struct lu_site *site, struct lu_fid *fid); static inline struct seq_server_site *lu_site2seq(const struct lu_site *s) { @@ -672,12 +671,14 @@ int lu_device_init (struct lu_device *d, struct lu_device_type *t); void lu_device_fini (struct lu_device *d); int lu_object_header_init(struct lu_object_header *h); void lu_object_header_fini(struct lu_object_header *h); +void lu_object_header_free(struct lu_object_header *h); int lu_object_init (struct lu_object *o, struct lu_object_header *h, struct lu_device *d); void lu_object_fini (struct lu_object *o); void lu_object_add_top (struct lu_object_header *h, struct lu_object *o); void lu_object_add (struct lu_object *before, struct lu_object *o); - +struct lu_object *lu_object_get_first(struct lu_object_header *h, + struct lu_device *dev); void lu_dev_add_linkage(struct lu_site *s, struct lu_device *d); void lu_dev_del_linkage(struct lu_site *s, struct lu_device *d); @@ -715,6 +716,14 
@@ static inline int lu_object_is_dying(const struct lu_object_header *h) return test_bit(LU_OBJECT_HEARD_BANSHEE, &h->loh_flags); } +/** + * Return true if object is initialized. + */ +static inline int lu_object_is_inited(const struct lu_object_header *h) +{ + return test_bit(LU_OBJECT_INITED, &h->loh_flags); +} + void lu_object_put(const struct lu_env *env, struct lu_object *o); void lu_object_put_nocache(const struct lu_env *env, struct lu_object *o); void lu_object_unhash(const struct lu_env *env, struct lu_object *o); @@ -727,8 +736,8 @@ static inline int lu_site_purge(const struct lu_env *env, struct lu_site *s, return lu_site_purge_objects(env, s, nr, 1); } -void lu_site_print(const struct lu_env *env, struct lu_site *s, void *cookie, - lu_printer_t printer); +void lu_site_print(const struct lu_env *env, struct lu_site *s, atomic_t *ref, + int msg_flags, lu_printer_t printer); struct lu_object *lu_object_find(const struct lu_env *env, struct lu_device *dev, const struct lu_fid *f, const struct lu_object_conf *conf); @@ -753,7 +762,7 @@ struct lu_object *lu_object_find_slice(const struct lu_env *env, static inline struct lu_object *lu_object_top(struct lu_object_header *h) { LASSERT(!list_empty(&h->loh_layers)); - return container_of0(h->loh_layers.next, struct lu_object, lo_linkage); + return container_of(h->loh_layers.next, struct lu_object, lo_linkage); } /** @@ -761,7 +770,7 @@ static inline struct lu_object *lu_object_top(struct lu_object_header *h) */ static inline struct lu_object *lu_object_next(const struct lu_object *o) { - return container_of0(o->lo_linkage.next, struct lu_object, lo_linkage); + return container_of(o->lo_linkage.next, struct lu_object, lo_linkage); } /** @@ -844,6 +853,22 @@ int lu_object_invariant(const struct lu_object *o); */ #define lu_object_remote(o) unlikely((o)->lo_header->loh_attr & LOHA_REMOTE) +/** + * Check whether the object has agent entry on current target + */ +#define lu_object_has_agent_entry(o) \ +
unlikely((o)->lo_header->loh_attr & LOHA_HAS_AGENT_ENTRY) + +static inline void lu_object_set_agent_entry(struct lu_object *o) +{ + o->lo_header->loh_attr |= LOHA_HAS_AGENT_ENTRY; +} + +static inline void lu_object_clear_agent_entry(struct lu_object *o) +{ + o->lo_header->loh_attr &= ~LOHA_HAS_AGENT_ENTRY; +} + static inline int lu_object_assert_exists(const struct lu_object *o) { return lu_object_exists(o); @@ -860,7 +885,15 @@ static inline int lu_object_assert_not_exists(const struct lu_object *o) static inline __u32 lu_object_attr(const struct lu_object *o) { LASSERT(lu_object_exists(o) != 0); - return o->lo_header->loh_attr; + + return o->lo_header->loh_attr & S_IFMT; +} + +static inline void lu_object_ref_add_atomic(struct lu_object *o, + const char *scope, + const void *source) +{ + lu_ref_add_atomic(&o->lo_header->loh_reference, scope, source); } static inline void lu_object_ref_add(struct lu_object *o, @@ -906,8 +939,11 @@ struct lu_rdpg { }; enum lu_xattr_flags { - LU_XATTR_REPLACE = (1 << 0), - LU_XATTR_CREATE = (1 << 1) + LU_XATTR_REPLACE = BIT(0), + LU_XATTR_CREATE = BIT(1), + LU_XATTR_MERGE = BIT(2), + LU_XATTR_SPLIT = BIT(3), + LU_XATTR_PURGE = BIT(4), }; /** @} helpers */ @@ -919,6 +955,7 @@ enum lu_xattr_flags { enum lu_context_state { LCS_INITIALIZED = 1, LCS_ENTERED, + LCS_LEAVING, LCS_LEFT, LCS_FINALIZED }; @@ -989,66 +1026,62 @@ struct lu_context { */ enum lu_context_tag { - /** - * Thread on md server - */ - LCT_MD_THREAD = 1 << 0, - /** - * Thread on dt server - */ - LCT_DT_THREAD = 1 << 1, - /** - * Context for transaction handle - */ - LCT_TX_HANDLE = 1 << 2, - /** - * Thread on client - */ - LCT_CL_THREAD = 1 << 3, - /** - * A per-request session on a server, and a per-system-call session on - * a client. 
- */ - LCT_SESSION = 1 << 4, - /** - * A per-request data on OSP device - */ - LCT_OSP_THREAD = 1 << 5, - /** - * MGS device thread - */ - LCT_MG_THREAD = 1 << 6, - /** - * Context for local operations - */ - LCT_LOCAL = 1 << 7, + /** + * Thread on md server + */ + LCT_MD_THREAD = BIT(0), + /** + * Thread on dt server + */ + LCT_DT_THREAD = BIT(1), + /** + * Thread on client + */ + LCT_CL_THREAD = BIT(3), + /** + * A per-request session on a server, and a per-system-call session on + * a client. + */ + LCT_SESSION = BIT(4), + /** + * A per-request data on OSP device + */ + LCT_OSP_THREAD = BIT(5), + /** + * MGS device thread + */ + LCT_MG_THREAD = BIT(6), + /** + * Context for local operations + */ + LCT_LOCAL = BIT(7), /** * session for server thread **/ - LCT_SERVER_SESSION = 1 << 8, - /** - * Set when at least one of keys, having values in this context has - * non-NULL lu_context_key::lct_exit() method. This is used to - * optimize lu_context_exit() call. - */ - LCT_HAS_EXIT = 1 << 28, - /** - * Don't add references for modules creating key values in that context. - * This is only for contexts used internally by lu_object framework. - */ - LCT_NOREF = 1 << 29, - /** - * Key is being prepared for retiring, don't create new values for it. - */ - LCT_QUIESCENT = 1 << 30, - /** - * Context should be remembered. - */ - LCT_REMEMBER = 1 << 31, - /** - * Contexts usable in cache shrinker thread. - */ - LCT_SHRINKER = LCT_MD_THREAD|LCT_DT_THREAD|LCT_CL_THREAD|LCT_NOREF + LCT_SERVER_SESSION = BIT(8), + /** + * Set when at least one of keys, having values in this context has + * non-NULL lu_context_key::lct_exit() method. This is used to + * optimize lu_context_exit() call. + */ + LCT_HAS_EXIT = BIT(28), + /** + * Don't add references for modules creating key values in that context. + * This is only for contexts used internally by lu_object framework. + */ + LCT_NOREF = BIT(29), + /** + * Key is being prepared for retiring, don't create new values for it. 
+ */ + LCT_QUIESCENT = BIT(30), + /** + * Context should be remembered. + */ + LCT_REMEMBER = BIT(31), + /** + * Contexts usable in cache shrinker thread. + */ + LCT_SHRINKER = LCT_MD_THREAD|LCT_DT_THREAD|LCT_CL_THREAD|LCT_NOREF, }; /** @@ -1133,20 +1166,20 @@ struct lu_context_key { }; #define LU_KEY_INIT(mod, type) \ - static void* mod##_key_init(const struct lu_context *ctx, \ - struct lu_context_key *key) \ - { \ - type *value; \ - \ - CLASSERT(PAGE_SIZE >= sizeof(*value)); \ + static void *mod##_key_init(const struct lu_context *ctx, \ + struct lu_context_key *key) \ + { \ + type *value; \ \ - OBD_ALLOC_PTR(value); \ - if (value == NULL) \ - value = ERR_PTR(-ENOMEM); \ + BUILD_BUG_ON(PAGE_SIZE < sizeof(*value)); \ \ - return value; \ - } \ - struct __##mod##__dummy_init {;} /* semicolon catcher */ + OBD_ALLOC_PTR(value); \ + if (value == NULL) \ + value = ERR_PTR(-ENOMEM); \ + \ + return value; \ + } \ + struct __##mod##__dummy_init { ; } /* semicolon catcher */ #define LU_KEY_FINI(mod, type) \ static void mod##_key_fini(const struct lu_context *ctx, \ @@ -1178,8 +1211,9 @@ int lu_context_key_register(struct lu_context_key *key); void lu_context_key_degister(struct lu_context_key *key); void *lu_context_key_get (const struct lu_context *ctx, const struct lu_context_key *key); -void lu_context_key_quiesce (struct lu_context_key *key); -void lu_context_key_revive (struct lu_context_key *key); +void lu_context_key_quiesce(struct lu_device_type *t, + struct lu_context_key *key); +void lu_context_key_revive(struct lu_context_key *key); /* @@ -1224,12 +1258,12 @@ void lu_context_key_revive (struct lu_context_key *key); } \ struct __##mod##_dummy_type_start {;} -#define LU_TYPE_STOP(mod, ...) \ - static void mod##_type_stop(struct lu_device_type *t) \ - { \ - lu_context_key_quiesce_many(__VA_ARGS__, NULL); \ - } \ - struct __##mod##_dummy_type_stop {;} +#define LU_TYPE_STOP(mod, ...) 
\ + static void mod##_type_stop(struct lu_device_type *t) \ + { \ + lu_context_key_quiesce_many(t, __VA_ARGS__, NULL); \ + } \ + struct __##mod##_dummy_type_stop { } @@ -1253,7 +1287,8 @@ int lu_context_refill(struct lu_context *ctx); int lu_context_key_register_many(struct lu_context_key *k, ...); void lu_context_key_degister_many(struct lu_context_key *k, ...); void lu_context_key_revive_many (struct lu_context_key *k, ...); -void lu_context_key_quiesce_many (struct lu_context_key *k, ...); +void lu_context_key_quiesce_many(struct lu_device_type *t, + struct lu_context_key *k, ...); /* * update/clear ctx/ses tags. @@ -1282,6 +1317,24 @@ void lu_env_fini (struct lu_env *env); int lu_env_refill(struct lu_env *env); int lu_env_refill_by_tags(struct lu_env *env, __u32 ctags, __u32 stags); +static inline void* lu_env_info(const struct lu_env *env, + const struct lu_context_key *key) +{ + void *info; + info = lu_context_key_get(&env->le_ctx, key); + if (!info) { + if (!lu_env_refill((struct lu_env *)env)) + info = lu_context_key_get(&env->le_ctx, key); + } + LASSERT(info); + return info; +} + +struct lu_env *lu_env_find(void); +int lu_env_add(struct lu_env *env); +int lu_env_add_task(struct lu_env *env, struct task_struct *task); +void lu_env_remove(struct lu_env *env); + /** @} lu_context */ /** @@ -1298,6 +1351,87 @@ struct lu_name { int ln_namelen; }; +static inline bool name_is_dot_or_dotdot(const char *name, int namelen) +{ + return name[0] == '.' 
&& + (namelen == 1 || (namelen == 2 && name[1] == '.')); +} + +static inline bool lu_name_is_dot_or_dotdot(const struct lu_name *lname) +{ + return name_is_dot_or_dotdot(lname->ln_name, lname->ln_namelen); +} + +static inline bool lu_name_is_temp_file(const char *name, int namelen, + bool dot_prefix, int suffixlen) +{ + int lower = 0; + int upper = 0; + int digit = 0; + int len = suffixlen; + + if (dot_prefix && name[0] != '.') + return false; + + if (namelen < dot_prefix + suffixlen + 2 || + name[namelen - suffixlen - 1] != '.') + return false; + + while (len) { + lower += islower(name[namelen - len]); + upper += isupper(name[namelen - len]); + digit += isdigit(name[namelen - len]); + len--; + } + /* mktemp() filename suffixes will have a mix of upper- and lower-case + * letters and/or numbers, not all numbers, or all upper or lower-case. + * About 0.07% of randomly-generated names will slip through, + * but this avoids 99.93% of cross-MDT renames for those files. + */ + if ((digit >= suffixlen - 1 && !isdigit(name[namelen - suffixlen])) || + upper == suffixlen || lower == suffixlen) + return false; + + return true; +} + +static inline bool lu_name_is_backup_file(const char *name, int namelen, + int *suffixlen) +{ + if (namelen > 1 && + name[namelen - 2] != '.' && name[namelen - 1] == '~') { + if (suffixlen) + *suffixlen = 1; + return true; + } + + if (namelen > 4 && name[namelen - 4] == '.' && + (!strncasecmp(name + namelen - 3, "bak", 3) || + !strncasecmp(name + namelen - 3, "sav", 3))) { + if (suffixlen) + *suffixlen = 4; + return true; + } + + if (namelen > 5 && name[namelen - 5] == '.' 
&& + !strncasecmp(name + namelen - 4, "orig", 4)) { + if (suffixlen) + *suffixlen = 5; + return true; + } + + return false; +} + +static inline bool lu_name_is_valid_len(const char *name, size_t name_len) +{ + return name != NULL && + name_len > 0 && + name_len < INT_MAX && + strlen(name) == name_len && + memchr(name, '/', name_len) == NULL; +} + /** * Validate names (path components) * @@ -1309,12 +1443,7 @@ struct lu_name { */ static inline bool lu_name_is_valid_2(const char *name, size_t name_len) { - return name != NULL && - name_len > 0 && - name_len < INT_MAX && - name[name_len] == '\0' && - strlen(name) == name_len && - memchr(name, '/', name_len) == NULL; + return lu_name_is_valid_len(name, name_len) && name[name_len] == '\0'; } static inline bool lu_name_is_valid(const struct lu_name *ln) @@ -1400,5 +1529,232 @@ static inline bool lu_object_is_cl(const struct lu_object *o) return lu_device_is_cl(o->lo_dev); } +/* Generic subset of tgts */ +struct lu_tgt_pool { + __u32 *op_array; /* array of index of + * lov_obd->lov_tgts + */ + unsigned int op_count; /* number of tgts in the array */ + unsigned int op_size; /* allocated size of op_array */ + struct rw_semaphore op_rw_sem; /* to protect lu_tgt_pool use */ +}; + +int lu_tgt_pool_init(struct lu_tgt_pool *op, unsigned int count); +int lu_tgt_pool_add(struct lu_tgt_pool *op, __u32 idx, unsigned int min_count); +int lu_tgt_pool_remove(struct lu_tgt_pool *op, __u32 idx); +void lu_tgt_pool_free(struct lu_tgt_pool *op); +int lu_tgt_check_index(int idx, struct lu_tgt_pool *osts); +int lu_tgt_pool_extend(struct lu_tgt_pool *op, unsigned int min_count); + +/* bitflags used in rr / qos allocation */ +enum lq_flag { + LQ_DIRTY = 0, /* recalc qos data */ + LQ_SAME_SPACE, /* the OSTs all have approx. 
+ * the same space avail */ + LQ_RESET, /* zero current penalties */ + LQ_SF_PROGRESS, /* statfs op in progress */ +}; + +#ifdef HAVE_SERVER_SUPPORT +/* round-robin QoS data for LOD/LMV */ +struct lu_qos_rr { + spinlock_t lqr_alloc; /* protect allocation index */ + atomic_t lqr_start_idx; /* start index of new inode */ + __u32 lqr_offset_idx;/* aliasing for start_idx */ + int lqr_start_count;/* reseed counter */ + struct lu_tgt_pool lqr_pool; /* round-robin optimized list */ + unsigned long lqr_flags; +}; + +static inline void lu_qos_rr_init(struct lu_qos_rr *lqr) +{ + spin_lock_init(&lqr->lqr_alloc); + set_bit(LQ_DIRTY, &lqr->lqr_flags); +} + +#endif /* HAVE_SERVER_SUPPORT */ + +/* QoS data per MDS/OSS */ +struct lu_svr_qos { + struct obd_uuid lsq_uuid; /* ptlrpc's c_remote_uuid */ + struct list_head lsq_svr_list; /* link to lq_svr_list */ + __u64 lsq_bavail; /* total bytes avail on svr */ + __u64 lsq_iavail; /* total inode avail on svr */ + __u64 lsq_penalty; /* current penalty */ + __u64 lsq_penalty_per_obj; /* penalty decrease + * every obj*/ + time64_t lsq_used; /* last used time, seconds */ + __u32 lsq_tgt_count; /* number of tgts on this svr */ + __u32 lsq_id; /* unique svr id */ +}; + +/* QoS data per MDT/OST */ +struct lu_tgt_qos { + struct lu_svr_qos *ltq_svr; /* svr info */ + __u64 ltq_penalty; /* current penalty */ + __u64 ltq_penalty_per_obj; /* penalty decrease + * every obj*/ + __u64 ltq_avail; /* bytes/inode avail */ + __u64 ltq_weight; /* net weighting */ + time64_t ltq_used; /* last used time, seconds */ + bool ltq_usable:1; /* usable for striping */ +}; + +/* target descriptor */ +#define LOV_QOS_DEF_THRESHOLD_RR_PCT 17 +#define LMV_QOS_DEF_THRESHOLD_RR_PCT 5 + +#define LOV_QOS_DEF_PRIO_FREE 90 +#define LMV_QOS_DEF_PRIO_FREE 90 + +struct lu_tgt_desc { + union { + struct dt_device *ltd_tgt; + struct obd_device *ltd_obd; + }; + struct obd_export *ltd_exp; + struct obd_uuid ltd_uuid; + __u32 ltd_index; + __u32 ltd_gen; + struct list_head ltd_kill; + 
struct task_struct *ltd_recovery_task; + struct mutex ltd_fid_mutex; + struct lu_tgt_qos ltd_qos; /* qos info per target */ + struct obd_statfs ltd_statfs; + time64_t ltd_statfs_age; + unsigned long ltd_active:1,/* is this target up for requests */ + ltd_activate:1,/* should target be activated */ + ltd_reap:1, /* should this target be deleted */ + ltd_got_update_log:1, /* Already got update log */ + ltd_connecting:1; /* target is connecting */ +}; + +/* number of pointers at 2nd level */ +#define TGT_PTRS_PER_BLOCK (PAGE_SIZE / sizeof(void *)) +/* number of pointers at 1st level - only need as many as max OST/MDT count */ +#define TGT_PTRS ((LOV_ALL_STRIPES + 1) / TGT_PTRS_PER_BLOCK) + +struct lu_tgt_desc_idx { + struct lu_tgt_desc *ldi_tgt[TGT_PTRS_PER_BLOCK]; +}; + + +/* QoS data for LOD/LMV */ +#define QOS_THRESHOLD_MAX 256 /* should be power of two */ +struct lu_qos { + struct list_head lq_svr_list; /* lu_svr_qos list */ + struct rw_semaphore lq_rw_sem; + __u32 lq_active_svr_count; + unsigned int lq_prio_free; /* priority for free space */ + unsigned int lq_threshold_rr;/* priority for rr */ +#ifdef HAVE_SERVER_SUPPORT + struct lu_qos_rr lq_rr; /* round robin qos data */ +#endif + unsigned long lq_flags; +#if 0 + unsigned long lq_dirty:1, /* recalc qos data */ + lq_same_space:1,/* the servers all have approx. 
+ * the same space avail */ + lq_reset:1; /* zero current penalties */ +#endif +}; + +struct lu_tgt_descs { + union { + struct lov_desc ltd_lov_desc; + struct lmv_desc ltd_lmv_desc; + }; + /* list of known TGTs */ + struct lu_tgt_desc_idx *ltd_tgt_idx[TGT_PTRS]; + /* Size of the lu_tgts array, granted to be a power of 2 */ + __u32 ltd_tgts_size; + /* bitmap of TGTs available */ + unsigned long *ltd_tgt_bitmap; + /* TGTs scheduled to be deleted */ + __u32 ltd_death_row; + /* Table refcount used for delayed deletion */ + int ltd_refcount; + /* mutex to serialize concurrent updates to the tgt table */ + struct mutex ltd_mutex; + /* read/write semaphore used for array relocation */ + struct rw_semaphore ltd_rw_sem; + /* QoS */ + struct lu_qos ltd_qos; + /* all tgts in a packed array */ + struct lu_tgt_pool ltd_tgt_pool; + /* true if tgt is MDT */ + bool ltd_is_mdt; +}; + +#define LTD_TGT(ltd, index) \ + (ltd)->ltd_tgt_idx[(index) / TGT_PTRS_PER_BLOCK]-> \ + ldi_tgt[(index) % TGT_PTRS_PER_BLOCK] + +u64 lu_prandom_u64_max(u64 ep_ro); +int lu_qos_add_tgt(struct lu_qos *qos, struct lu_tgt_desc *ltd); +void lu_tgt_qos_weight_calc(struct lu_tgt_desc *tgt); + +int lu_tgt_descs_init(struct lu_tgt_descs *ltd, bool is_mdt); +void lu_tgt_descs_fini(struct lu_tgt_descs *ltd); +int ltd_add_tgt(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt); +void ltd_del_tgt(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt); +int ltd_qos_penalties_calc(struct lu_tgt_descs *ltd); +int ltd_qos_update(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt, + __u64 *total_wt); + +/** + * Whether MDT inode and space usages are balanced. + */ +static inline bool ltd_qos_is_balanced(struct lu_tgt_descs *ltd) +{ + return !test_bit(LQ_DIRTY, &ltd->ltd_qos.lq_flags) && + test_bit(LQ_SAME_SPACE, &ltd->ltd_qos.lq_flags); +} + +/** + * Whether QoS data is up-to-date and QoS can be applied.
+ */ +static inline bool ltd_qos_is_usable(struct lu_tgt_descs *ltd) +{ + if (ltd_qos_is_balanced(ltd)) + return false; + + if (ltd->ltd_lov_desc.ld_active_tgt_count < 2) + return false; + + return true; +} + +static inline struct lu_tgt_desc *ltd_first_tgt(struct lu_tgt_descs *ltd) +{ + int index; + + index = find_first_bit(ltd->ltd_tgt_bitmap, + ltd->ltd_tgts_size); + return (index < ltd->ltd_tgts_size) ? LTD_TGT(ltd, index) : NULL; +} + +static inline struct lu_tgt_desc *ltd_next_tgt(struct lu_tgt_descs *ltd, + struct lu_tgt_desc *tgt) +{ + int index; + + if (!tgt) + return NULL; + + index = tgt->ltd_index; + LASSERT(index < ltd->ltd_tgts_size); + index = find_next_bit(ltd->ltd_tgt_bitmap, + ltd->ltd_tgts_size, index + 1); + return (index < ltd->ltd_tgts_size) ? LTD_TGT(ltd, index) : NULL; +} + +#define ltd_foreach_tgt(ltd, tgt) \ + for (tgt = ltd_first_tgt(ltd); tgt; tgt = ltd_next_tgt(ltd, tgt)) + +#define ltd_foreach_tgt_safe(ltd, tgt, tmp) \ + for (tgt = ltd_first_tgt(ltd), tmp = ltd_next_tgt(ltd, tgt); tgt; \ + tgt = tmp, tmp = ltd_next_tgt(ltd, tgt)) + /** @} lu */ #endif /* __LUSTRE_LU_OBJECT_H */