X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Finclude%2Flu_object.h;h=9ceabafb2636f8723d078220fd78cbe574ec8a39;hp=4bb797f40b3e04c166780668bc46612fd5262879;hb=c04adbcd76725a360f411f09c63df785bf7db426;hpb=736d2d62ab1f00926000f0c3aa31fcb6aa53050f diff --git a/lustre/include/lu_object.h b/lustre/include/lu_object.h index 4bb797f..9ceabaf 100644 --- a/lustre/include/lu_object.h +++ b/lustre/include/lu_object.h @@ -27,19 +27,22 @@ */ /* * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. */ #ifndef __LUSTRE_LU_OBJECT_H #define __LUSTRE_LU_OBJECT_H +#ifdef HAVE_LINUX_STDARG_HEADER +#include +#else #include +#endif #include #include #include #include +#include #include -#include struct seq_file; struct proc_dir_entry; @@ -171,7 +174,6 @@ struct lu_device_operations { * \param[in] parent parent object * \param[in] name lu_name * - * \retval 0 on success * \retval 0 0 FID allocated successfully. * \retval 1 1 FID allocated successfully and new sequence * requested from seq meta server @@ -321,12 +323,12 @@ struct lu_device_type_operations; * device types. */ enum lu_device_tag { - /** this is meta-data device */ - LU_DEVICE_MD = (1 << 0), - /** this is data device */ - LU_DEVICE_DT = (1 << 1), - /** data device in the client stack */ - LU_DEVICE_CL = (1 << 2) + /** this is meta-data device */ + LU_DEVICE_MD = BIT(0), + /** this is data device */ + LU_DEVICE_DT = BIT(1), + /** data device in the client stack */ + LU_DEVICE_CL = BIT(2) }; /** @@ -452,8 +454,12 @@ struct lu_attr { __u32 la_projid; /** set layout version to OST objects. */ __u32 la_layout_version; + /** dirent count */ + __u64 la_dirent_count; }; +#define LU_DIRENT_COUNT_UNSET ~0ULL + /** * Layer in the layered object. */ @@ -496,17 +502,12 @@ enum lu_object_header_flags { * intialized yet, the object allocator will initialize it. 
*/ LU_OBJECT_INITED = 2, - /** - * Object is being purged, so mustn't be returned by - * htable_lookup() - */ - LU_OBJECT_PURGING = 3, }; enum lu_object_header_attr { - LOHA_EXISTS = 1 << 0, - LOHA_REMOTE = 1 << 1, - LOHA_HAS_AGENT_ENTRY = 1 << 2, + LOHA_EXISTS = BIT(0), + LOHA_REMOTE = BIT(1), + LOHA_HAS_AGENT_ENTRY = BIT(2), /** * UNIX file type is stored in S_IFMT bits. */ @@ -524,6 +525,8 @@ enum lu_object_header_attr { * it is created for things like not-yet-existing child created by mkdir or * create calls. lu_object_operations::loo_exists() can be used to check * whether object is backed by persistent storage entity. + * Any object containing this structure which might be placed in an + * rhashtable via loh_hash MUST be freed using call_rcu() or kfree_rcu(). */ struct lu_object_header { /** @@ -545,9 +548,9 @@ struct lu_object_header { */ __u32 loh_attr; /** - * Linkage into per-site hash table. Protected by lu_site::ls_guard. + * Linkage into per-site hash table. */ - struct hlist_node loh_hash; + struct rhash_head loh_hash; /** * Linkage into per-site LRU list. Protected by lu_site::ls_guard. 
*/ @@ -593,7 +596,7 @@ struct lu_site { /** * objects hash table */ - struct cfs_hash *ls_obj_hash; + struct rhashtable ls_obj_hash; /* * buckets for summary data */ @@ -668,12 +671,14 @@ int lu_device_init (struct lu_device *d, struct lu_device_type *t); void lu_device_fini (struct lu_device *d); int lu_object_header_init(struct lu_object_header *h); void lu_object_header_fini(struct lu_object_header *h); +void lu_object_header_free(struct lu_object_header *h); int lu_object_init (struct lu_object *o, struct lu_object_header *h, struct lu_device *d); void lu_object_fini (struct lu_object *o); void lu_object_add_top (struct lu_object_header *h, struct lu_object *o); void lu_object_add (struct lu_object *before, struct lu_object *o); - +struct lu_object *lu_object_get_first(struct lu_object_header *h, + struct lu_device *dev); void lu_dev_add_linkage(struct lu_site *s, struct lu_device *d); void lu_dev_del_linkage(struct lu_site *s, struct lu_device *d); @@ -731,8 +736,8 @@ static inline int lu_site_purge(const struct lu_env *env, struct lu_site *s, return lu_site_purge_objects(env, s, nr, 1); } -void lu_site_print(const struct lu_env *env, struct lu_site *s, void *cookie, - lu_printer_t printer); +void lu_site_print(const struct lu_env *env, struct lu_site *s, atomic_t *ref, + int msg_flags, lu_printer_t printer); struct lu_object *lu_object_find(const struct lu_env *env, struct lu_device *dev, const struct lu_fid *f, const struct lu_object_conf *conf); @@ -757,7 +762,7 @@ struct lu_object *lu_object_find_slice(const struct lu_env *env, static inline struct lu_object *lu_object_top(struct lu_object_header *h) { LASSERT(!list_empty(&h->loh_layers)); - return container_of0(h->loh_layers.next, struct lu_object, lo_linkage); + return container_of(h->loh_layers.next, struct lu_object, lo_linkage); } /** @@ -765,7 +770,7 @@ static inline struct lu_object *lu_object_top(struct lu_object_header *h) */ static inline struct lu_object *lu_object_next(const struct lu_object 
*o) { - return container_of0(o->lo_linkage.next, struct lu_object, lo_linkage); + return container_of(o->lo_linkage.next, struct lu_object, lo_linkage); } /** @@ -884,6 +889,13 @@ static inline __u32 lu_object_attr(const struct lu_object *o) return o->lo_header->loh_attr & S_IFMT; } +static inline void lu_object_ref_add_atomic(struct lu_object *o, + const char *scope, + const void *source) +{ + lu_ref_add_atomic(&o->lo_header->loh_reference, scope, source); +} + static inline void lu_object_ref_add(struct lu_object *o, const char *scope, const void *source) @@ -927,10 +939,11 @@ struct lu_rdpg { }; enum lu_xattr_flags { - LU_XATTR_REPLACE = (1 << 0), - LU_XATTR_CREATE = (1 << 1), - LU_XATTR_MERGE = (1 << 2), - LU_XATTR_SPLIT = (1 << 3), + LU_XATTR_REPLACE = BIT(0), + LU_XATTR_CREATE = BIT(1), + LU_XATTR_MERGE = BIT(2), + LU_XATTR_SPLIT = BIT(3), + LU_XATTR_PURGE = BIT(4), }; /** @} helpers */ @@ -1013,62 +1026,62 @@ struct lu_context { */ enum lu_context_tag { - /** - * Thread on md server - */ - LCT_MD_THREAD = 1 << 0, - /** - * Thread on dt server - */ - LCT_DT_THREAD = 1 << 1, - /** - * Thread on client - */ - LCT_CL_THREAD = 1 << 3, - /** - * A per-request session on a server, and a per-system-call session on - * a client. - */ - LCT_SESSION = 1 << 4, - /** - * A per-request data on OSP device - */ - LCT_OSP_THREAD = 1 << 5, - /** - * MGS device thread - */ - LCT_MG_THREAD = 1 << 6, - /** - * Context for local operations - */ - LCT_LOCAL = 1 << 7, + /** + * Thread on md server + */ + LCT_MD_THREAD = BIT(0), + /** + * Thread on dt server + */ + LCT_DT_THREAD = BIT(1), + /** + * Thread on client + */ + LCT_CL_THREAD = BIT(3), + /** + * A per-request session on a server, and a per-system-call session on + * a client. 
+ */ + LCT_SESSION = BIT(4), + /** + * A per-request data on OSP device + */ + LCT_OSP_THREAD = BIT(5), + /** + * MGS device thread + */ + LCT_MG_THREAD = BIT(6), + /** + * Context for local operations + */ + LCT_LOCAL = BIT(7), /** * session for server thread **/ - LCT_SERVER_SESSION = 1 << 8, - /** - * Set when at least one of keys, having values in this context has - * non-NULL lu_context_key::lct_exit() method. This is used to - * optimize lu_context_exit() call. - */ - LCT_HAS_EXIT = 1 << 28, - /** - * Don't add references for modules creating key values in that context. - * This is only for contexts used internally by lu_object framework. - */ - LCT_NOREF = 1 << 29, - /** - * Key is being prepared for retiring, don't create new values for it. - */ - LCT_QUIESCENT = 1 << 30, - /** - * Context should be remembered. - */ - LCT_REMEMBER = 1 << 31, - /** - * Contexts usable in cache shrinker thread. - */ - LCT_SHRINKER = LCT_MD_THREAD|LCT_DT_THREAD|LCT_CL_THREAD|LCT_NOREF + LCT_SERVER_SESSION = BIT(8), + /** + * Set when at least one of keys, having values in this context has + * non-NULL lu_context_key::lct_exit() method. This is used to + * optimize lu_context_exit() call. + */ + LCT_HAS_EXIT = BIT(28), + /** + * Don't add references for modules creating key values in that context. + * This is only for contexts used internally by lu_object framework. + */ + LCT_NOREF = BIT(29), + /** + * Key is being prepared for retiring, don't create new values for it. + */ + LCT_QUIESCENT = BIT(30), + /** + * Context should be remembered. + */ + LCT_REMEMBER = BIT(31), + /** + * Contexts usable in cache shrinker thread. 
+ */ + LCT_SHRINKER = LCT_MD_THREAD|LCT_DT_THREAD|LCT_CL_THREAD|LCT_NOREF, }; /** @@ -1198,8 +1211,9 @@ int lu_context_key_register(struct lu_context_key *key); void lu_context_key_degister(struct lu_context_key *key); void *lu_context_key_get (const struct lu_context *ctx, const struct lu_context_key *key); -void lu_context_key_quiesce (struct lu_context_key *key); -void lu_context_key_revive (struct lu_context_key *key); +void lu_context_key_quiesce(struct lu_device_type *t, + struct lu_context_key *key); +void lu_context_key_revive(struct lu_context_key *key); /* @@ -1244,12 +1258,12 @@ void lu_context_key_revive (struct lu_context_key *key); } \ struct __##mod##_dummy_type_start {;} -#define LU_TYPE_STOP(mod, ...) \ - static void mod##_type_stop(struct lu_device_type *t) \ - { \ - lu_context_key_quiesce_many(__VA_ARGS__, NULL); \ - } \ - struct __##mod##_dummy_type_stop {;} +#define LU_TYPE_STOP(mod, ...) \ + static void mod##_type_stop(struct lu_device_type *t) \ + { \ + lu_context_key_quiesce_many(t, __VA_ARGS__, NULL); \ + } \ + struct __##mod##_dummy_type_stop { } @@ -1273,7 +1287,8 @@ int lu_context_refill(struct lu_context *ctx); int lu_context_key_register_many(struct lu_context_key *k, ...); void lu_context_key_degister_many(struct lu_context_key *k, ...); void lu_context_key_revive_many (struct lu_context_key *k, ...); -void lu_context_key_quiesce_many (struct lu_context_key *k, ...); +void lu_context_key_quiesce_many(struct lu_device_type *t, + struct lu_context_key *k, ...); /* * update/clear ctx/ses tags. @@ -1373,7 +1388,8 @@ static inline bool lu_name_is_temp_file(const char *name, int namelen, * About 0.07% of randomly-generated names will slip through, * but this avoids 99.93% of cross-MDT renames for those files. 
*/ - if (digit >= suffixlen - 2 || upper == suffixlen || lower == suffixlen) + if ((digit >= suffixlen - 1 && !isdigit(name[namelen - suffixlen])) || + upper == suffixlen || lower == suffixlen) return false; return true; @@ -1513,22 +1529,57 @@ static inline bool lu_object_is_cl(const struct lu_object *o) return lu_device_is_cl(o->lo_dev); } +/* Generic subset of tgts */ +struct lu_tgt_pool { + __u32 *op_array; /* array of index of + * lov_obd->lov_tgts + */ + unsigned int op_count; /* number of tgts in the array */ + unsigned int op_size; /* allocated size of op_array */ + struct rw_semaphore op_rw_sem; /* to protect lu_tgt_pool use */ +}; + +int lu_tgt_pool_init(struct lu_tgt_pool *op, unsigned int count); +int lu_tgt_pool_add(struct lu_tgt_pool *op, __u32 idx, unsigned int min_count); +int lu_tgt_pool_remove(struct lu_tgt_pool *op, __u32 idx); +void lu_tgt_pool_free(struct lu_tgt_pool *op); +int lu_tgt_check_index(int idx, struct lu_tgt_pool *osts); +int lu_tgt_pool_extend(struct lu_tgt_pool *op, unsigned int min_count); + +/* bitflags used in rr / qos allocation */ +enum lq_flag { + LQ_DIRTY = 0, /* recalc qos data */ + LQ_SAME_SPACE, /* the OSTs all have approx. 
+ * the same space avail */ + LQ_RESET, /* zero current penalties */ + LQ_SF_PROGRESS, /* statfs op in progress */ +}; + +#ifdef HAVE_SERVER_SUPPORT /* round-robin QoS data for LOD/LMV */ struct lu_qos_rr { spinlock_t lqr_alloc; /* protect allocation index */ - __u32 lqr_start_idx; /* start index of new inode */ + atomic_t lqr_start_idx; /* start index of new inode */ __u32 lqr_offset_idx;/* aliasing for start_idx */ int lqr_start_count;/* reseed counter */ struct lu_tgt_pool lqr_pool; /* round-robin optimized list */ - unsigned long lqr_dirty:1; /* recalc round-robin list */ + unsigned long lqr_flags; }; +static inline void lu_qos_rr_init(struct lu_qos_rr *lqr) +{ + spin_lock_init(&lqr->lqr_alloc); + set_bit(LQ_DIRTY, &lqr->lqr_flags); +} + +#endif /* HAVE_SERVER_SUPPORT */ + /* QoS data per MDS/OSS */ struct lu_svr_qos { struct obd_uuid lsq_uuid; /* ptlrpc's c_remote_uuid */ struct list_head lsq_svr_list; /* link to lq_svr_list */ __u64 lsq_bavail; /* total bytes avail on svr */ - __u64 lsq_iavail; /* tital inode avail on svr */ + __u64 lsq_iavail; /* total inode avail on svr */ __u64 lsq_penalty; /* current penalty */ __u64 lsq_penalty_per_obj; /* penalty decrease * every obj*/ @@ -1543,12 +1594,19 @@ struct lu_tgt_qos { __u64 ltq_penalty; /* current penalty */ __u64 ltq_penalty_per_obj; /* penalty decrease * every obj*/ + __u64 ltq_avail; /* bytes/inode avail */ __u64 ltq_weight; /* net weighting */ time64_t ltq_used; /* last used time, seconds */ bool ltq_usable:1; /* usable for striping */ }; /* target descriptor */ +#define LOV_QOS_DEF_THRESHOLD_RR_PCT 17 +#define LMV_QOS_DEF_THRESHOLD_RR_PCT 5 + +#define LOV_QOS_DEF_PRIO_FREE 90 +#define LMV_QOS_DEF_PRIO_FREE 90 + struct lu_tgt_desc { union { struct dt_device *ltd_tgt; @@ -1571,27 +1629,34 @@ struct lu_tgt_desc { ltd_connecting:1; /* target is connecting */ }; -/* number of pointers at 1st level */ -#define TGT_PTRS (PAGE_SIZE / sizeof(void *)) /* number of pointers at 2nd level */ #define 
TGT_PTRS_PER_BLOCK (PAGE_SIZE / sizeof(void *)) +/* number of pointers at 1st level - only need as many as max OST/MDT count */ +#define TGT_PTRS ((LOV_ALL_STRIPES + 1) / TGT_PTRS_PER_BLOCK) struct lu_tgt_desc_idx { struct lu_tgt_desc *ldi_tgt[TGT_PTRS_PER_BLOCK]; }; + /* QoS data for LOD/LMV */ +#define QOS_THRESHOLD_MAX 256 /* should be power of two */ struct lu_qos { struct list_head lq_svr_list; /* lu_svr_qos list */ struct rw_semaphore lq_rw_sem; __u32 lq_active_svr_count; unsigned int lq_prio_free; /* priority for free space */ unsigned int lq_threshold_rr;/* priority for rr */ +#ifdef HAVE_SERVER_SUPPORT struct lu_qos_rr lq_rr; /* round robin qos data */ +#endif + unsigned long lq_flags; +#if 0 unsigned long lq_dirty:1, /* recalc qos data */ lq_same_space:1,/* the servers all have approx. * the same space avail */ lq_reset:1; /* zero current penalties */ +#endif }; struct lu_tgt_descs { @@ -1604,7 +1669,7 @@ struct lu_tgt_descs { /* Size of the lu_tgts array, granted to be a power of 2 */ __u32 ltd_tgts_size; /* bitmap of TGTs available */ - struct cfs_bitmap *ltd_tgt_bitmap; + unsigned long *ltd_tgt_bitmap; /* TGTs scheduled to be deleted */ __u32 ltd_death_row; /* Table refcount used for delayed deletion */ @@ -1622,11 +1687,10 @@ struct lu_tgt_descs { }; #define LTD_TGT(ltd, index) \ - (ltd)->ltd_tgt_idx[(index) / \ - TGT_PTRS_PER_BLOCK]->ldi_tgt[(index) % TGT_PTRS_PER_BLOCK] + (ltd)->ltd_tgt_idx[(index) / TGT_PTRS_PER_BLOCK]-> \ + ldi_tgt[(index) % TGT_PTRS_PER_BLOCK] u64 lu_prandom_u64_max(u64 ep_ro); -void lu_qos_rr_init(struct lu_qos_rr *lqr); int lu_qos_add_tgt(struct lu_qos *qos, struct lu_tgt_desc *ltd); void lu_tgt_qos_weight_calc(struct lu_tgt_desc *tgt); @@ -1634,18 +1698,40 @@ int lu_tgt_descs_init(struct lu_tgt_descs *ltd, bool is_mdt); void lu_tgt_descs_fini(struct lu_tgt_descs *ltd); int ltd_add_tgt(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt); void ltd_del_tgt(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt); -bool 
ltd_qos_is_usable(struct lu_tgt_descs *ltd); int ltd_qos_penalties_calc(struct lu_tgt_descs *ltd); int ltd_qos_update(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt, __u64 *total_wt); +/** + * Whether MDT inode and space usages are balanced. + */ +static inline bool ltd_qos_is_balanced(struct lu_tgt_descs *ltd) +{ + return !test_bit(LQ_DIRTY, <d->ltd_qos.lq_flags) && + test_bit(LQ_SAME_SPACE, <d->ltd_qos.lq_flags); +} + +/** + * Whether QoS data is up-to-date and QoS can be applied. + */ +static inline bool ltd_qos_is_usable(struct lu_tgt_descs *ltd) +{ + if (ltd_qos_is_balanced(ltd)) + return false; + + if (ltd->ltd_lov_desc.ld_active_tgt_count < 2) + return false; + + return true; +} + static inline struct lu_tgt_desc *ltd_first_tgt(struct lu_tgt_descs *ltd) { int index; - index = find_first_bit(ltd->ltd_tgt_bitmap->data, - ltd->ltd_tgt_bitmap->size); - return (index < ltd->ltd_tgt_bitmap->size) ? LTD_TGT(ltd, index) : NULL; + index = find_first_bit(ltd->ltd_tgt_bitmap, + ltd->ltd_tgts_size); + return (index < ltd->ltd_tgts_size) ? LTD_TGT(ltd, index) : NULL; } static inline struct lu_tgt_desc *ltd_next_tgt(struct lu_tgt_descs *ltd, @@ -1657,10 +1743,10 @@ static inline struct lu_tgt_desc *ltd_next_tgt(struct lu_tgt_descs *ltd, return NULL; index = tgt->ltd_index; - LASSERT(index < ltd->ltd_tgt_bitmap->size); - index = find_next_bit(ltd->ltd_tgt_bitmap->data, - ltd->ltd_tgt_bitmap->size, index + 1); - return (index < ltd->ltd_tgt_bitmap->size) ? LTD_TGT(ltd, index) : NULL; + LASSERT(index < ltd->ltd_tgts_size); + index = find_next_bit(ltd->ltd_tgt_bitmap, + ltd->ltd_tgts_size, index + 1); + return (index < ltd->ltd_tgts_size) ? LTD_TGT(ltd, index) : NULL; } #define ltd_foreach_tgt(ltd, tgt) \