X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Finclude%2Fobd.h;h=286779dec35d3bb4fd22b4e7694ceb753a25d39b;hp=7568dec7057c2cd9856ba763aa0e8e9c5f04c980;hb=1ba794f6ec9e7ce7ad65fd74f170089fffc31d91;hpb=800e18fc318096e0e552e9cb1927ad99b61d205e diff --git a/lustre/include/obd.h b/lustre/include/obd.h index 7568dec..286779d 100644 --- a/lustre/include/obd.h +++ b/lustre/include/obd.h @@ -15,11 +15,7 @@ * * You should have received a copy of the GNU General Public License * version 2 along with this program; If not, see - * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf - * - * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, - * CA 95054 USA or visit www.sun.com if you need additional information or - * have any questions. + * http://www.gnu.org/licenses/gpl-2.0.html * * GPL HEADER END */ @@ -27,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2014, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -37,9 +33,11 @@ #ifndef __OBD_H #define __OBD_H +#include #include +#include -#include +#include #include #include #ifdef HAVE_SERVER_SUPPORT @@ -52,8 +50,8 @@ #include #include #include -#include #include +#include #define MAX_OBD_DEVICES 8192 @@ -80,73 +78,15 @@ static inline void loi_kms_set(struct lov_oinfo *oinfo, __u64 kms) oinfo->loi_kms_valid = 1; } -static inline void loi_init(struct lov_oinfo *loi) -{ -} - -/* If we are unable to get the maximum object size from the OST in - * ocd_maxbytes using OBD_CONNECT_MAXBYTES, then we fall back to using - * the old maximum object size from ext3. */ -#define LUSTRE_EXT3_STRIPE_MAXBYTES 0x1fffffff000ULL - -struct lov_stripe_md { - atomic_t lsm_refc; - spinlock_t lsm_lock; - pid_t lsm_lock_owner; /* debugging */ - - /* maximum possible file size, might change as OSTs status changes, - * e.g. disconnected, deactivated */ - loff_t lsm_maxbytes; - struct ost_id lsm_oi; - __u32 lsm_magic; - __u32 lsm_stripe_size; - __u32 lsm_pattern; /* RAID0, RAID1, released, ... */ - __u16 lsm_stripe_count; - __u16 lsm_layout_gen; - char lsm_pool_name[LOV_MAXPOOLNAME + 1]; - struct lov_oinfo *lsm_oinfo[0]; -}; - -static inline bool lsm_is_released(struct lov_stripe_md *lsm) -{ - return !!(lsm->lsm_pattern & LOV_PATTERN_F_RELEASED); -} - -static inline bool lsm_has_objects(struct lov_stripe_md *lsm) -{ - if (lsm == NULL) - return false; - if (lsm_is_released(lsm)) - return false; - return true; -} - -static inline int lov_stripe_md_size(unsigned int stripe_count) -{ - struct lov_stripe_md lsm; - - return sizeof(lsm) + stripe_count * sizeof(lsm.lsm_oinfo[0]); -} - +struct lov_stripe_md; struct obd_info; typedef int (*obd_enqueue_update_f)(void *cookie, int rc); /* obd info for a particular level (lov, osc). */ struct obd_info { - /* Lock policy. It keeps an extent which is specific for a particular - * OSC. (e.g. lov_prep_enqueue_set initialises extent of the policy, - * and osc_enqueue passes it into ldlm_lock_match & ldlm_cli_enqueue. */ - ldlm_policy_data_t oi_policy; - /* Flags used for set request specific flags: - - while lock handling, the flags obtained on the enqueue - request are set here. - - while stats, the flags used for control delay/resend. - - while setattr, the flags used for distinguish punch operation - */ + /* OBD_STATFS_* flags */ __u64 oi_flags; - /* obdo data specific for every OSC, if needed at all. */ - struct obdo *oi_oa; /* statfs data specific for every OSC, if needed at all. */ struct obd_statfs *oi_osfs; /* An update callback which is called to update some data on upper @@ -154,9 +94,6 @@ struct obd_info { * request in osc level for enqueue requests. It is also possible to * update some caller data from LOV layer if needed. */ obd_enqueue_update_f oi_cb_up; - /* oss capability, its type is obd_capa in client to avoid copy. - * in contrary its type is lustre_capa in OSS. */ - void *oi_capa; }; struct obd_type { @@ -165,11 +102,15 @@ struct obd_type { struct md_ops *typ_md_ops; struct proc_dir_entry *typ_procroot; struct proc_dir_entry *typ_procsym; - __u32 typ_sym_filter; + struct dentry *typ_debugfs_entry; +#ifdef HAVE_SERVER_SUPPORT + bool typ_sym_filter; +#endif char *typ_name; int typ_refcnt; struct lu_device_type *typ_lu; spinlock_t obd_type_lock; + struct kobject *typ_kobj; }; struct brw_page { @@ -181,7 +122,7 @@ struct brw_page { struct timeout_item { enum timeout_event ti_event; - cfs_time_t ti_timeout; + time64_t ti_timeout; timeout_cb_t ti_cb; void *ti_cb_data; struct list_head ti_obd_list; @@ -191,7 +132,7 @@ struct timeout_item { #define OBD_MAX_RIF_DEFAULT 8 #define OBD_MAX_RIF_MAX 512 #define OSC_MAX_RIF_MAX 256 -#define OSC_MAX_DIRTY_DEFAULT (OBD_MAX_RIF_DEFAULT * 4) +#define OSC_MAX_DIRTY_DEFAULT 2000 /* Arbitrary large value */ #define OSC_MAX_DIRTY_MB_MAX 2048 /* arbitrary, but < MAX_LONG bytes */ #define OSC_DEFAULT_RESENDS 10 @@ -212,6 +153,12 @@ enum { */ #define OBD_MAX_DEFAULT_EA_SIZE 4096 +enum obd_cl_sem_lock_class { + OBD_CLI_SEM_NORMAL, + OBD_CLI_SEM_MGC, + OBD_CLI_SEM_MDCOSC, +}; + struct mdc_rpc_lock; struct obd_import; struct client_obd { @@ -240,6 +187,17 @@ struct client_obd { * run-time if a larger observed size is advertised by the MDT. */ __u32 cl_max_mds_easize; + /* Data-on-MDT specific value to set larger reply buffer for possible + * data read along with open/stat requests. By default it tries to use + * unused space in reply buffer. + * This value is used to ensure that reply buffer has at least as + * much free space as value indicates. That free space is gained from + * LOV EA buffer which is small for DoM files and on big systems can + * provide up to 32KB of extra space in reply buffer. + * Default value is 8K now. + */ + __u32 cl_dom_min_inline_repsize; + enum lustre_sec_part cl_sp_me; enum lustre_sec_part cl_sp_to; struct sptlrpc_flavor cl_flvr_mgc; /* fixed flavor of mgc->mgs */ @@ -250,6 +208,8 @@ struct client_obd { unsigned long cl_dirty_transit; /* dirty synchronous */ unsigned long cl_avail_grant; /* bytes of credit for ost */ unsigned long cl_lost_grant; /* lost credits (trunc) */ + /* grant consumed for dirty pages */ + unsigned long cl_dirty_grant; /* since we allocate grant by blocks, we don't know how many grant will * be used to add a page into cache. As a solution, we reserve maximum @@ -257,14 +217,18 @@ struct client_obd { * See osc_{reserve|unreserve}_grant for details. */ long cl_reserved_grant; struct list_head cl_cache_waiters; /* waiting for cache/grant */ - cfs_time_t cl_next_shrink_grant; /* jiffies */ - struct list_head cl_grant_shrink_list; /* Timeout event list */ - int cl_grant_shrink_interval; /* seconds */ + time64_t cl_next_shrink_grant; /* seconds */ + struct list_head cl_grant_chain; + time64_t cl_grant_shrink_interval; /* seconds */ /* A chunk is an optimal size used by osc_extent to determine - * the extent size. A chunk is max(PAGE_CACHE_SIZE, OST block size) */ + * the extent size. A chunk is max(PAGE_SIZE, OST block size) */ int cl_chunkbits; - unsigned int cl_extent_tax; /* extent overhead, by bytes */ + /* extent insertion metadata overhead to be accounted in grant, + * in bytes */ + unsigned int cl_grant_extent_tax; + /* maximum extent size, in number of pages */ + unsigned int cl_max_extent_pages; /* keep track of objects that have lois that contain pages which * have been queued for async brw. this lock also protects the @@ -293,8 +257,9 @@ struct client_obd { /* just a sum of the loi/lop pending numbers to be exported by /proc */ atomic_t cl_pending_w_pages; atomic_t cl_pending_r_pages; - __u32 cl_max_pages_per_rpc; - __u32 cl_max_rpcs_in_flight; + u32 cl_max_pages_per_rpc; + u32 cl_max_rpcs_in_flight; + u32 cl_max_short_io_bytes; struct obd_histogram cl_read_rpc_hist; struct obd_histogram cl_write_rpc_hist; struct obd_histogram cl_read_page_hist; @@ -302,51 +267,92 @@ struct client_obd { struct obd_histogram cl_read_offset_hist; struct obd_histogram cl_write_offset_hist; - /* lru for osc caching pages */ - struct cl_client_cache *cl_cache; - struct list_head cl_lru_osc; /* member of cl_cache->ccc_lru */ - atomic_long_t *cl_lru_left; - atomic_long_t cl_lru_busy; - atomic_long_t cl_lru_in_list; - atomic_long_t cl_unstable_count; - struct list_head cl_lru_list; /* lru page list */ - spinlock_t cl_lru_list_lock; /* page list protector */ - atomic_t cl_lru_shrinkers; + /** LRU for osc caching pages */ + struct cl_client_cache *cl_cache; + /** member of cl_cache->ccc_lru */ + struct list_head cl_lru_osc; + /** # of available LRU slots left in the per-OSC cache. + * Available LRU slots are shared by all OSCs of the same file system, + * therefore this is a pointer to cl_client_cache::ccc_lru_left. */ + atomic_long_t *cl_lru_left; + /** # of busy LRU pages. A page is considered busy if it's in writeback + * queue, or in transfer. Busy pages can't be discarded so they are not + * in LRU cache. */ + atomic_long_t cl_lru_busy; + /** # of LRU pages in the cache for this client_obd */ + atomic_long_t cl_lru_in_list; + /** # of threads are shrinking LRU cache. To avoid contention, it's not + * allowed to have multiple threads shrinking LRU cache. */ + atomic_t cl_lru_shrinkers; + /** The time when this LRU cache was last used. */ + time64_t cl_lru_last_used; + /** stats: how many reclaims have happened for this client_obd. + * reclaim and shrink - shrink is async, voluntarily rebalancing; + * reclaim is sync, initiated by IO thread when the LRU slots are + * in shortage. */ + __u64 cl_lru_reclaim; + /** List of LRU pages for this client_obd */ + struct list_head cl_lru_list; + /** Lock for LRU page list */ + spinlock_t cl_lru_list_lock; + /** # of unstable pages in this client_obd. + * An unstable page is a page state that WRITE RPC has finished but + * the transaction has NOT yet committed. */ + atomic_long_t cl_unstable_count; + /** Link to osc_shrinker_list */ + struct list_head cl_shrink_list; /* number of in flight destroy rpcs is limited to max_rpcs_in_flight */ atomic_t cl_destroy_in_flight; wait_queue_head_t cl_destroy_waitq; - struct mdc_rpc_lock *cl_rpc_lock; - struct mdc_rpc_lock *cl_close_lock; + /* modify rpcs in flight + * currently used for metadata only */ + spinlock_t cl_mod_rpcs_lock; + __u16 cl_max_mod_rpcs_in_flight; + __u16 cl_mod_rpcs_in_flight; + __u16 cl_close_rpcs_in_flight; + wait_queue_head_t cl_mod_rpcs_waitq; + unsigned long *cl_mod_tag_bitmap; + struct obd_histogram cl_mod_rpcs_hist; /* mgc datastruct */ struct mutex cl_mgc_mutex; struct local_oid_storage *cl_mgc_los; struct dt_object *cl_mgc_configs_dir; - atomic_t cl_mgc_refcount; struct obd_export *cl_mgc_mgsexp; + atomic_t cl_mgc_refcount; + /* in-flight control list and total RPCs counter */ + struct list_head cl_flight_waiters; + __u32 cl_rpcs_in_flight; /* checksumming for data sent over the network */ - unsigned int cl_checksum:1; /* 0 = disabled, 1 = enabled */ + unsigned int cl_checksum:1, /* 0 = disabled, 1 = enabled */ + cl_checksum_dump:1; /* same */ /* supported checksum types that are worked out at connect time */ __u32 cl_supp_cksum_types; /* checksum algorithm to be used */ - cksum_type_t cl_cksum_type; + enum cksum_types cl_cksum_type; /* also protected by the poorly named _loi_list_lock lock above */ struct osc_async_rc cl_ar; /* sequence manager */ struct lu_client_seq *cl_seq; + struct rw_semaphore cl_seq_rwsem; - atomic_t cl_resends; /* resend count */ + atomic_t cl_resends; /* resend count */ /* ptlrpc work for writeback in ptlrpcd context */ void *cl_writeback_work; void *cl_lru_work; + struct mutex cl_quota_mutex; /* hash tables for osc_quota_info */ - cfs_hash_t *cl_quota_hash[MAXQUOTAS]; + struct cfs_hash *cl_quota_hash[LL_MAXQUOTAS]; + /* the xid of the request updating the hash tables */ + __u64 cl_quota_last_xid; + /* Links to the global list of registered changelog devices */ + struct list_head cl_chg_dev_linkage; }; #define obd2cli_tgt(obd) ((char *)(obd)->u.cli.cl_target_uuid.uuid) @@ -387,6 +393,11 @@ struct lov_tgt_desc { ltd_reap:1; /* should this target be deleted */ }; +struct lov_md_tgt_desc { + struct obd_device *lmtd_mdc; + __u32 lmtd_index; +}; + struct lov_obd { struct lov_desc desc; struct lov_tgt_desc **lov_tgts; /* sparse array */ @@ -400,7 +411,7 @@ struct lov_obd { __u32 lov_tgt_size; /* size of tgts array */ int lov_connects; int lov_pool_count; - cfs_hash_t *lov_pools_hash_body; /* used for key access */ + struct cfs_hash *lov_pools_hash_body; /* used for key access */ struct list_head lov_pool_list; /* used for sequential access */ struct proc_dir_entry *lov_pool_proc_entry; enum lustre_sec_part lov_sp_me; @@ -409,54 +420,57 @@ struct lov_obd { struct cl_client_cache *lov_cache; struct rw_semaphore lov_notify_lock; + /* Data-on-MDT: MDC array */ + struct lov_md_tgt_desc *lov_mdc_tgts; + + struct kobject *lov_tgts_kobj; }; struct lmv_tgt_desc { struct obd_uuid ltd_uuid; + struct obd_device *ltd_obd; struct obd_export *ltd_exp; __u32 ltd_idx; struct mutex ltd_fid_mutex; unsigned long ltd_active:1; /* target up for requests */ }; -enum placement_policy { - PLACEMENT_CHAR_POLICY = 0, - PLACEMENT_NID_POLICY = 1, - PLACEMENT_INVAL_POLICY = 2, - PLACEMENT_MAX_POLICY -}; - -typedef enum placement_policy placement_policy_t; - struct lmv_obd { - int refcount; struct lu_client_fld lmv_fld; spinlock_t lmv_lock; - placement_policy_t lmv_placement; struct lmv_desc desc; - struct obd_uuid cluuid; - struct obd_export *exp; - struct proc_dir_entry *targets_proc_entry; struct mutex lmv_init_mutex; int connected; int max_easize; int max_def_easize; + u32 lmv_statfs_start; - __u32 tgts_size; /* size of tgts array */ + u32 tgts_size; /* size of tgts array */ struct lmv_tgt_desc **tgts; struct obd_connect_data conn_data; + struct kobject *lmv_tgts_kobj; }; +/* Minimum sector size is 512 */ +#define MAX_GUARD_NUMBER (PAGE_SIZE / 512) + struct niobuf_local { __u64 lnb_file_offset; __u32 lnb_page_offset; __u32 lnb_len; __u32 lnb_flags; + int lnb_rc; struct page *lnb_page; void *lnb_data; - int lnb_rc; + __u16 lnb_guards[MAX_GUARD_NUMBER]; + __u16 lnb_guard_rpc:1; + __u16 lnb_guard_disk:1; +}; + +struct tgt_thread_big_cache { + struct niobuf_local local[PTLRPC_MAX_BRW_PAGES]; }; #define LUSTRE_FLD_NAME "fld" @@ -494,46 +508,82 @@ struct niobuf_local { #define LUSTRE_MGS_OBDNAME "MGS" #define LUSTRE_MGC_OBDNAME "MGC" +static inline int is_lwp_on_mdt(char *name) +{ + char *ptr; + + ptr = strrchr(name, '-'); + if (ptr == NULL) { + CERROR("%s is not a obdname\n", name); + return 0; + } + + /* LWP name on MDT is fsname-MDTxxxx-lwp-MDTxxxx */ + + if (strncmp(ptr + 1, "MDT", 3) != 0) + return 0; + + while (*(--ptr) != '-' && ptr != name); + + if (ptr == name) + return 0; + + if (strncmp(ptr + 1, LUSTRE_LWP_NAME, strlen(LUSTRE_LWP_NAME)) != 0) + return 0; + + return 1; +} + +static inline int is_lwp_on_ost(char *name) +{ + char *ptr; + + ptr = strrchr(name, '-'); + if (ptr == NULL) { + CERROR("%s is not a obdname\n", name); + return 0; + } + + /* LWP name on OST is fsname-MDTxxxx-lwp-OSTxxxx */ + + if (strncmp(ptr + 1, "OST", 3) != 0) + return 0; + + while (*(--ptr) != '-' && ptr != name); + + if (ptr == name) + return 0; + + if (strncmp(ptr + 1, LUSTRE_LWP_NAME, strlen(LUSTRE_LWP_NAME)) != 0) + return 0; + + return 1; +} + /* * Events signalled through obd_notify() upcall-chain. */ enum obd_notify_event { - /* target added */ - OBD_NOTIFY_CREATE, /* Device connect start */ OBD_NOTIFY_CONNECT, /* Device activated */ OBD_NOTIFY_ACTIVE, /* Device deactivated */ OBD_NOTIFY_INACTIVE, - /* Device disconnected */ - OBD_NOTIFY_DISCON, /* Connect data for import were changed */ OBD_NOTIFY_OCD, - /* Sync request */ - OBD_NOTIFY_SYNC_NONBLOCK, - OBD_NOTIFY_SYNC, - /* Configuration event */ - OBD_NOTIFY_CONFIG, /* Administratively deactivate/activate event */ OBD_NOTIFY_DEACTIVATE, OBD_NOTIFY_ACTIVATE }; -/* bit-mask flags for config events */ -enum config_flags { - CONFIG_LOG = 0x1, /* finished processing config log */ - CONFIG_SYNC = 0x2, /* mdt synced 1 ost */ - CONFIG_TARGET = 0x4 /* one target is added */ -}; - /* * Data structure used to pass obd_notify()-event to non-obd listeners (llite - * and liblustre being main examples). + * being main example). */ struct obd_notify_upcall { - int (*onu_upcall)(struct obd_device *host, struct obd_device *watched, - enum obd_notify_event ev, void *owner, void *data); + int (*onu_upcall)(struct obd_device *host, struct obd_device *watched, + enum obd_notify_event ev, void *owner); /* Opaque datum supplied by upper layer listener */ void *onu_owner; }; @@ -556,14 +606,14 @@ struct obd_llog_group { #define OBD_DEVICE_MAGIC 0XAB5CD6EF struct obd_device { - struct obd_type *obd_type; - __u32 obd_magic; + struct obd_type *obd_type; + __u32 obd_magic; /* OBD_DEVICE_MAGIC */ + int obd_minor; /* device number: lctl dl */ + struct lu_device *obd_lu_dev; - /* common and UUID name of this device */ - char obd_name[MAX_OBD_NAME]; - struct obd_uuid obd_uuid; - int obd_minor; - struct lu_device *obd_lu_dev; + /* common and UUID name of this device */ + struct obd_uuid obd_uuid; + char obd_name[MAX_OBD_NAME]; /* bitfield modification is protected by obd_dev_lock */ unsigned long @@ -572,8 +622,7 @@ struct obd_device { obd_recovering:1, /* there are recoverable clients */ obd_abort_recovery:1, /* recovery expired */ obd_version_recov:1, /* obd uses version checking */ - obd_replayable:1, /* recovery is enabled; - * inform clients */ + obd_replayable:1, /* recovery enabled; inform clients */ obd_no_transno:1, /* no committed-transno notification */ obd_no_recov:1, /* fail instead of retry messages */ obd_stopping:1, /* started cleanup */ @@ -585,34 +634,36 @@ struct obd_device { * (for /proc/status only!!) */ obd_no_ir:1, /* no imperative recovery. */ obd_process_conf:1, /* device is processing mgs config */ - obd_uses_nid_stats:1; /* maintain per-client OBD stats */ + obd_checksum_dump:1; /* dump pages upon cksum error */ /* use separate field as it is set in interrupt to don't mess with * protection of other bits using _bh lock */ unsigned long obd_recovery_expired:1; /* uuid-export hash body */ - cfs_hash_t *obd_uuid_hash; + struct cfs_hash *obd_uuid_hash; /* nid-export hash body */ - cfs_hash_t *obd_nid_hash; + struct cfs_hash *obd_nid_hash; /* nid stats body */ - cfs_hash_t *obd_nid_stats_hash; + struct cfs_hash *obd_nid_stats_hash; + /* client_generation-export hash body */ + struct cfs_hash *obd_gen_hash; struct list_head obd_nid_stats; - atomic_t obd_refcount; struct list_head obd_exports; struct list_head obd_unlinked_exports; struct list_head obd_delayed_exports; struct list_head obd_lwp_list; + atomic_t obd_refcount; int obd_num_exports; spinlock_t obd_nid_lock; struct ldlm_namespace *obd_namespace; struct ptlrpc_client obd_ldlm_client; /* XXX OST/MDS only */ /* a spinlock is OK for what we do now, may need a semaphore later */ spinlock_t obd_dev_lock; /* protect OBD bitfield above */ - struct mutex obd_dev_mutex; - __u64 obd_last_committed; spinlock_t obd_osfs_lock; struct obd_statfs obd_osfs; /* locked by obd_osfs_lock */ - __u64 obd_osfs_age; + time64_t obd_osfs_age; + __u64 obd_last_committed; + struct mutex obd_dev_mutex; struct lvfs_run_ctxt obd_lvfs_ctxt; struct obd_llog_group obd_olg; /* default llog group */ struct obd_device *obd_observer; @@ -622,7 +673,7 @@ struct obd_device { struct obd_export *obd_lwp_export; /* list of exports in LRU order, for ping evictor, with obd_dev_lock */ struct list_head obd_exports_timed; - time_t obd_eviction_timer; /* for ping evictor */ + time64_t obd_eviction_timer; /* for ping evictor */ int obd_max_recoverable_clients; atomic_t obd_connected_clients; @@ -635,20 +686,21 @@ struct obd_device { int obd_requests_queued_for_recovery; wait_queue_head_t obd_next_transno_waitq; /* protected by obd_recovery_task_lock */ - struct timer_list obd_recovery_timer; + struct hrtimer obd_recovery_timer; /* seconds */ - time_t obd_recovery_start; + time64_t obd_recovery_start; /* seconds, for lprocfs_status */ - time_t obd_recovery_end; - int obd_recovery_time_hard; - int obd_recovery_timeout; + time64_t obd_recovery_end; + time64_t obd_recovery_time_hard; + time64_t obd_recovery_timeout; int obd_recovery_ir_factor; /* new recovery stuff from CMD2 */ - struct target_recovery_data obd_recovery_data; int obd_replayed_locks; atomic_t obd_req_replay_clients; atomic_t obd_lock_replay_clients; + struct target_recovery_data obd_recovery_data; + /* all lists are protected by obd_recovery_task_lock */ struct list_head obd_req_replay_queue; struct list_head obd_lock_replay_queue; @@ -666,48 +718,43 @@ struct obd_device { struct lov_obd lov; struct lmv_obd lmv; } u; + /* Fields used by LProcFS */ - unsigned int obd_cntr_base; - struct lprocfs_stats *obd_stats; + struct lprocfs_stats *obd_stats; - unsigned int obd_md_cntr_base; - struct lprocfs_stats *obd_md_stats; + struct lprocfs_stats *obd_md_stats; + struct dentry *obd_debugfs_entry; struct proc_dir_entry *obd_proc_entry; struct proc_dir_entry *obd_proc_exports_entry; - struct proc_dir_entry *obd_svc_procroot; + struct dentry *obd_svc_debugfs_entry; struct lprocfs_stats *obd_svc_stats; + const struct attribute **obd_attrs; struct lprocfs_vars *obd_vars; atomic_t obd_evict_inprogress; wait_queue_head_t obd_evict_inprogress_waitq; struct list_head obd_evict_list; /* protected with pet_lock */ - /** - * Ldlm pool part. Save last calculated SLV and Limit. - */ - rwlock_t obd_pool_lock; - int obd_pool_limit; - __u64 obd_pool_slv; + /** + * LDLM pool part. Save last calculated SLV and Limit. + */ + rwlock_t obd_pool_lock; + __u64 obd_pool_slv; + int obd_pool_limit; - /** - * A list of outstanding class_incref()'s against this obd. For - * debugging. - */ - struct lu_ref obd_reference; + int obd_conn_inprogress; - int obd_conn_inprogress; -}; + /** + * List of outstanding class_incref()'s fo this OBD. For debugging. */ + struct lu_ref obd_reference; -enum obd_cleanup_stage { -/* Special case hack for MDS LOVs */ - OBD_CLEANUP_EARLY, -/* can be directly mapped to .ldto_device_fini() */ - OBD_CLEANUP_EXPORTS, + struct kset obd_kset; /* sysfs object collection */ + struct kobj_type obd_ktype; + struct completion obd_kobj_unregister; }; /* get/set_info keys */ #define KEY_ASYNC "async" -#define KEY_CAPA_KEY "capa_key" #define KEY_CHANGELOG_CLEAR "changelog_clear" #define KEY_FID2PATH "fid2path" #define KEY_CHECKSUM "checksum" @@ -736,38 +783,32 @@ enum obd_cleanup_stage { #define KEY_CACHE_LRU_SHRINK "cache_lru_shrink" #define KEY_OSP_CONNECTED "osp_connected" -struct lu_context; +/* Flags for op_xvalid */ +enum op_xvalid { + OP_XVALID_CTIME_SET = BIT(0), /* 0x0001 */ + OP_XVALID_BLOCKS = BIT(1), /* 0x0002 */ + OP_XVALID_OWNEROVERRIDE = BIT(2), /* 0x0004 */ + OP_XVALID_FLAGS = BIT(3), /* 0x0008 */ + OP_XVALID_PROJID = BIT(4), /* 0x0010 */ + OP_XVALID_LAZYSIZE = BIT(5), /* 0x0020 */ + OP_XVALID_LAZYBLOCKS = BIT(6), /* 0x0040 */ +}; -/* /!\ must be coherent with include/linux/namei.h on patched kernel */ -#define IT_OPEN (1 << 0) -#define IT_CREAT (1 << 1) -#define IT_READDIR (1 << 2) -#define IT_GETATTR (1 << 3) -#define IT_LOOKUP (1 << 4) -#define IT_UNLINK (1 << 5) -#define IT_TRUNC (1 << 6) -#define IT_GETXATTR (1 << 7) -#define IT_EXEC (1 << 8) -#define IT_PIN (1 << 9) -#define IT_LAYOUT (1 << 10) -#define IT_QUOTA_DQACQ (1 << 11) -#define IT_QUOTA_CONN (1 << 12) -#define IT_SETXATTR (1 << 13) +struct lu_context; static inline int it_to_lock_mode(struct lookup_intent *it) { /* CREAT needs to be tested before open (both could be set) */ if (it->it_op & IT_CREAT) return LCK_CW; - else if (it->it_op & (IT_GETATTR | IT_OPEN | IT_LOOKUP | - IT_LAYOUT)) + else if (it->it_op & (IT_GETATTR | IT_OPEN | IT_LOOKUP)) return LCK_CR; + else if (it->it_op & IT_LAYOUT) + return (it->it_flags & FMODE_WRITE) ? LCK_EX : LCK_CR; else if (it->it_op & IT_READDIR) return LCK_PR; else if (it->it_op & IT_GETXATTR) return LCK_PR; - else if (it->it_op & IT_SETXATTR) - return LCK_PW; LASSERTF(0, "Invalid it_op: %d\n", it->it_op); return -EINVAL; @@ -789,44 +830,50 @@ enum md_cli_flags { CLI_MIGRATE = 1 << 4, }; +/** + * GETXATTR is not included as only a couple of fields in the reply body + * is filled, but not FID which is needed for common intent handling in + * mdc_finish_intent_lock() + */ +static inline bool it_has_reply_body(const struct lookup_intent *it) +{ + return it->it_op & (IT_OPEN | IT_LOOKUP | IT_GETATTR); +} + struct md_op_data { - struct lu_fid op_fid1; /* operation fid1 (usualy parent) */ - struct lu_fid op_fid2; /* operation fid2 (usualy child) */ - struct lu_fid op_fid3; /* 2 extra fids to find conflicting */ - struct lu_fid op_fid4; /* to the operation locks. */ + struct lu_fid op_fid1; /* operation fid1 (usualy parent) */ + struct lu_fid op_fid2; /* operation fid2 (usualy child) */ + struct lu_fid op_fid3; /* 2 extra fids to find conflicting */ + struct lu_fid op_fid4; /* to the operation locks. */ u32 op_mds; /* what mds server open will go to */ - struct lustre_handle op_handle; + __u32 op_mode; + struct lustre_handle op_open_handle; s64 op_mod_time; - const char *op_name; + const char *op_name; size_t op_namelen; - __u32 op_mode; - struct lmv_stripe_md *op_mea1; - struct lmv_stripe_md *op_mea2; - __u32 op_suppgids[2]; - __u32 op_fsuid; - __u32 op_fsgid; - cfs_cap_t op_cap; - void *op_data; + struct rw_semaphore *op_mea1_sem; + struct rw_semaphore *op_mea2_sem; + struct lmv_stripe_md *op_mea1; + struct lmv_stripe_md *op_mea2; + __u32 op_suppgids[2]; + __u32 op_fsuid; + __u32 op_fsgid; + cfs_cap_t op_cap; + void *op_data; size_t op_data_size; - /* iattr fields and blocks. */ + /* iattr fields and blocks. */ struct iattr op_attr; + enum op_xvalid op_xvalid; /* eXtra validity flags */ loff_t op_attr_blocks; - unsigned int op_attr_flags; /* LUSTRE_{SYNC,..}_FL */ - __u64 op_valid; /* OBD_MD_* */ + u64 op_valid; /* OBD_MD_* */ + unsigned int op_attr_flags; /* LUSTRE_{SYNC,..}_FL */ enum md_op_flags op_flags; - /* Capa fields */ - struct obd_capa *op_capa1; - struct obd_capa *op_capa2; - /* Various operation flags. */ enum mds_op_bias op_bias; - /* Used by readdir */ - unsigned int op_max_pages; - /* used to transfer info between the stacks of MD client * see enum op_cli_flags */ enum md_cli_flags op_cli_flags; @@ -835,8 +882,33 @@ struct md_op_data { __u64 op_data_version; struct lustre_handle op_lease_handle; + /* File security context, for creates/metadata ops */ + const char *op_file_secctx_name; + __u32 op_file_secctx_name_size; + void *op_file_secctx; + __u32 op_file_secctx_size; + /* default stripe offset */ __u32 op_default_stripe_offset; + + __u32 op_projid; + + /* Used by readdir */ + unsigned int op_max_pages; + + __u16 op_mirror_id; + + /* + * used to access migrating dir: if it's set, assume migration is + * finished, use the new layout to access dir, otherwise use old layout. + * By default it's not set, because new files are created under new + * layout, if we can't find file with name under both old and new + * layout, we are sure file with name doesn't exist, but in reverse + * order there may be a race with creation by others. + */ + bool op_post_migrate; + /* used to access dir with bash hash */ + __u32 op_stripe_index; }; struct md_callback { @@ -852,12 +924,13 @@ typedef int (* md_enqueue_cb_t)(struct ptlrpc_request *req, int rc); struct md_enqueue_info { - struct md_op_data mi_data; - struct lookup_intent mi_it; - struct lustre_handle mi_lockh; - struct inode *mi_dir; - md_enqueue_cb_t mi_cb; - void *mi_cbdata; + struct md_op_data mi_data; + struct lookup_intent mi_it; + struct lustre_handle mi_lockh; + struct inode *mi_dir; + struct ldlm_enqueue_info mi_einfo; + md_enqueue_cb_t mi_cb; + void *mi_cbdata; }; struct obd_ops { @@ -870,29 +943,28 @@ struct obd_ops { __u32 keylen, void *key, __u32 vallen, void *val, struct ptlrpc_request_set *set); - int (*o_setup) (struct obd_device *dev, struct lustre_cfg *cfg); - int (*o_precleanup)(struct obd_device *dev, - enum obd_cleanup_stage cleanup_stage); - int (*o_cleanup)(struct obd_device *dev); + int (*o_setup) (struct obd_device *dev, struct lustre_cfg *cfg); + int (*o_precleanup)(struct obd_device *dev); + int (*o_cleanup)(struct obd_device *dev); int (*o_process_config)(struct obd_device *dev, size_t len, void *data); - int (*o_postrecov)(struct obd_device *dev); - int (*o_add_conn)(struct obd_import *imp, struct obd_uuid *uuid, - int priority); - int (*o_del_conn)(struct obd_import *imp, struct obd_uuid *uuid); - /* connect to the target device with given connection - * data. @ocd->ocd_connect_flags is modified to reflect flags actually - * granted by the target, which are guaranteed to be a subset of flags - * asked for. If @ocd == NULL, use default parameters. */ - int (*o_connect)(const struct lu_env *env, - struct obd_export **exp, struct obd_device *src, - struct obd_uuid *cluuid, struct obd_connect_data *ocd, - void *localdata); - int (*o_reconnect)(const struct lu_env *env, - struct obd_export *exp, struct obd_device *src, - struct obd_uuid *cluuid, - struct obd_connect_data *ocd, - void *localdata); - int (*o_disconnect)(struct obd_export *exp); + int (*o_postrecov)(struct obd_device *dev); + int (*o_add_conn)(struct obd_import *imp, struct obd_uuid *uuid, + int priority); + int (*o_del_conn)(struct obd_import *imp, struct obd_uuid *uuid); + /* connect to the target device with given connection + * data. @ocd->ocd_connect_flags is modified to reflect flags actually + * granted by the target, which are guaranteed to be a subset of flags + * asked for. If @ocd == NULL, use default parameters. */ + int (*o_connect)(const struct lu_env *env, + struct obd_export **exp, struct obd_device *src, + struct obd_uuid *cluuid, struct obd_connect_data *ocd, + void *localdata); + int (*o_reconnect)(const struct lu_env *env, + struct obd_export *exp, struct obd_device *src, + struct obd_uuid *cluuid, + struct obd_connect_data *ocd, + void *localdata); + int (*o_disconnect)(struct obd_export *exp); /* Initialize/finalize fids infrastructure. */ int (*o_fid_init)(struct obd_device *obd, @@ -903,26 +975,22 @@ struct obd_ops { int (*o_fid_alloc)(const struct lu_env *env, struct obd_export *exp, struct lu_fid *fid, struct md_op_data *op_data); - /* - * Object with @fid is getting deleted, we may want to do something - * about this. - */ - int (*o_statfs)(const struct lu_env *, struct obd_export *exp, - struct obd_statfs *osfs, __u64 max_age, __u32 flags); - int (*o_statfs_async)(struct obd_export *exp, struct obd_info *oinfo, - __u64 max_age, struct ptlrpc_request_set *set); - int (*o_packmd)(struct obd_export *exp, struct lov_mds_md **disk_tgt, - struct lov_stripe_md *mem_src); - int (*o_unpackmd)(struct obd_export *exp,struct lov_stripe_md **mem_tgt, - struct lov_mds_md *disk_src, int disk_len); + /* + * Object with @fid is getting deleted, we may want to do something + * about this. + */ + int (*o_statfs)(const struct lu_env *, struct obd_export *exp, + struct obd_statfs *osfs, time64_t max_age, __u32 flags); + int (*o_statfs_async)(struct obd_export *exp, struct obd_info *oinfo, + time64_t max_age, struct ptlrpc_request_set *set); int (*o_create)(const struct lu_env *env, struct obd_export *exp, struct obdo *oa); int (*o_destroy)(const struct lu_env *env, struct obd_export *exp, struct obdo *oa); int (*o_setattr)(const struct lu_env *, struct obd_export *exp, - struct obd_info *oinfo); - int (*o_getattr)(const struct lu_env *env, struct obd_export *exp, - struct obd_info *oinfo); + struct obdo *oa); + int (*o_getattr)(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa); int (*o_preprw)(const struct lu_env *env, int cmd, struct obd_export *exp, struct obdo *oa, int objcount, struct obd_ioobj *obj, struct niobuf_remote *remote, @@ -932,50 +1000,39 @@ struct obd_ops { int objcount, struct obd_ioobj *obj, struct niobuf_remote *remote, int pages, struct niobuf_local *local, int rc); - int (*o_init_export)(struct obd_export *exp); - int (*o_destroy_export)(struct obd_export *exp); - - int (*o_import_event)(struct obd_device *, struct obd_import *, - enum obd_import_event); + int (*o_init_export)(struct obd_export *exp); + int (*o_destroy_export)(struct obd_export *exp); - int (*o_notify)(struct obd_device *obd, struct obd_device *watched, - enum obd_notify_event ev, void *data); + int (*o_import_event)(struct obd_device *, struct obd_import *, + enum obd_import_event); - int (*o_health_check)(const struct lu_env *env, struct obd_device *); - struct obd_uuid *(*o_get_uuid) (struct obd_export *exp); + int (*o_notify)(struct obd_device *obd, struct obd_device *watched, + enum obd_notify_event ev); - /* quota methods */ - int (*o_quotactl)(struct obd_device *, struct obd_export *, - struct obd_quotactl *); + int (*o_health_check)(const struct lu_env *env, struct obd_device *); + struct obd_uuid *(*o_get_uuid) (struct obd_export *exp); - int (*o_ping)(const struct lu_env *, struct obd_export *exp); + /* quota methods */ + int (*o_quotactl)(struct obd_device *, struct obd_export *, + struct obd_quotactl *); - /* pools methods */ - int (*o_pool_new)(struct obd_device *obd, char *poolname); - int (*o_pool_del)(struct obd_device *obd, char *poolname); - int (*o_pool_add)(struct obd_device *obd, char *poolname, - char *ostname); - int (*o_pool_rem)(struct obd_device *obd, char *poolname, - char *ostname); - void (*o_getref)(struct obd_device *obd); - void (*o_putref)(struct obd_device *obd); - /* - * NOTE: If adding ops, add another LPROCFS_OBD_OP_INIT() line - * to lprocfs_alloc_obd_stats() in obdclass/lprocfs_status.c. - * Also, add a wrapper function in include/linux/obd_class.h. */ + /* pools methods */ + int (*o_pool_new)(struct obd_device *obd, char *poolname); + int (*o_pool_del)(struct obd_device *obd, char *poolname); + int (*o_pool_add)(struct obd_device *obd, char *poolname, + char *ostname); + int (*o_pool_rem)(struct obd_device *obd, char *poolname, + char *ostname); }; /* lmv structures */ struct lustre_md { struct mdt_body *body; - struct lov_stripe_md *lsm; + struct lu_buf layout; struct lmv_stripe_md *lmv; #ifdef CONFIG_FS_POSIX_ACL struct posix_acl *posix_acl; #endif - struct mdt_remote_perm *remote_perm; - struct obd_capa *mds_capa; - struct obd_capa *oss_capa; }; struct md_open_data { @@ -987,7 +1044,7 @@ struct md_open_data { }; struct obd_client_handle { - struct lustre_handle och_fh; + struct lustre_handle och_open_handle; struct lu_fid och_fid; struct md_open_data *och_mod; struct lustre_handle och_lease_handle; /* open lock for lease */ @@ -1001,18 +1058,6 @@ struct lookup_intent; struct cl_attr; struct md_ops { - /* Every operation from MD_STATS_FIRST_OP up to and including - * MD_STATS_LAST_OP will be counted by EXP_MD_OP_INCREMENT() - * and will appear in /proc/fs/lustre/{lmv,mdc}/.../md_stats. - * Operations after MD_STATS_LAST_OP are excluded from stats. - * There are a few reasons for doing this: we prune the 17 - * counters which will be of minimal use in understanding - * metadata utilization, we save memory by allocating 15 - * instead of 32 counters, we save cycles by not counting. - * - * MD_STATS_FIRST_OP must be the first member of md_ops. - */ -#define MD_STATS_FIRST_OP m_close int (*m_close)(struct obd_export *, struct md_op_data *, struct md_open_data *, struct ptlrpc_request **); @@ -1021,8 +1066,7 @@ struct md_ops { cfs_cap_t, __u64, struct ptlrpc_request **); int (*m_enqueue)(struct obd_export *, struct ldlm_enqueue_info *, - const union ldlm_policy_data *, - struct lookup_intent *, struct md_op_data *, + const union ldlm_policy_data *, struct md_op_data *, struct lustre_handle *, __u64); int (*m_getattr)(struct obd_export *, struct md_op_data *, @@ -1044,7 +1088,7 @@ struct md_ops { size_t , struct ptlrpc_request **); int (*m_fsync)(struct obd_export *, const struct lu_fid *, - struct obd_capa *, struct ptlrpc_request **); + struct ptlrpc_request **); int (*m_read_page)(struct obd_export *, struct md_op_data *, struct md_callback *cb_op, __u64 hash_offset, @@ -1054,32 +1098,23 @@ struct md_ops { struct ptlrpc_request **); int (*m_setxattr)(struct obd_export *, const struct lu_fid *, - struct obd_capa *, u64, const char *, - const char *, int, int, int, __u32, - struct ptlrpc_request **); + u64, const char *, const void *, size_t, unsigned int, + u32, struct ptlrpc_request **); int (*m_getxattr)(struct obd_export *, const struct lu_fid *, - struct obd_capa *, u64, const char *, - const char *, int, int, int, - struct ptlrpc_request **); + u64, const char *, size_t, struct ptlrpc_request **); - int (*m_intent_getattr_async)(struct obd_export *, - struct md_enqueue_info *, - struct ldlm_enqueue_info *); + int (*m_intent_getattr_async)(struct obd_export *, + struct md_enqueue_info *); int (*m_revalidate_lock)(struct obd_export *, struct lookup_intent *, struct lu_fid *, __u64 *bits); -#define MD_STATS_LAST_OP m_revalidate_lock - - int (*m_getstatus)(struct obd_export *, struct lu_fid *, - struct obd_capa **); + int (*m_file_resync)(struct obd_export *, struct md_op_data *); + int (*m_get_root)(struct obd_export *, const char *, struct lu_fid *); int (*m_null_inode)(struct obd_export *, const struct lu_fid *); - int (*m_find_cbdata)(struct obd_export *, const struct lu_fid *, - ldlm_iterator_t, void *); - int (*m_getattr_name)(struct obd_export *, struct md_op_data *, struct ptlrpc_request **); @@ -1102,60 +1137,26 @@ struct md_ops { int (*m_clear_open_replay_data)(struct obd_export *, struct obd_client_handle *); - int (*m_set_lock_data)(struct obd_export *, __u64 *, void *, __u64 *); + int (*m_set_lock_data)(struct obd_export *, + const struct lustre_handle *, void *, __u64 *); - ldlm_mode_t (*m_lock_match)(struct obd_export *, __u64, - const struct lu_fid *, ldlm_type_t, - ldlm_policy_data_t *, ldlm_mode_t, - struct lustre_handle *); + enum ldlm_mode (*m_lock_match)(struct obd_export *, __u64, + const struct lu_fid *, enum ldlm_type, + union ldlm_policy_data *, enum ldlm_mode, + struct lustre_handle *); int (*m_cancel_unused)(struct obd_export *, const struct lu_fid *, - ldlm_policy_data_t *, ldlm_mode_t, - ldlm_cancel_flags_t flags, void *opaque); - - int (*m_renew_capa)(struct obd_export *, struct obd_capa *oc, - renew_capa_cb_t cb); - - int (*m_unpack_capa)(struct obd_export *, struct ptlrpc_request *, - const struct req_msg_field *, struct obd_capa **); - - int (*m_get_remote_perm)(struct obd_export *, const struct lu_fid *, - struct obd_capa *, __u32, - struct ptlrpc_request **); + union ldlm_policy_data *, enum ldlm_mode, + enum ldlm_cancel_flags flags, void *opaque); int (*m_get_fid_from_lsm)(struct obd_export *, const struct lmv_stripe_md *, const char *name, int namelen, struct lu_fid *fid); + int (*m_unpackmd)(struct obd_export *exp, struct lmv_stripe_md **plsm, + const union lmv_mds_md *lmv, size_t lmv_size); }; -struct lsm_operations { - void (*lsm_free)(struct lov_stripe_md *); - void (*lsm_stripe_by_index)(struct lov_stripe_md *, int *, - loff_t *, loff_t *); - void (*lsm_stripe_by_offset)(struct lov_stripe_md *, int *, - loff_t *, loff_t *); - int (*lsm_lmm_verify) (struct lov_mds_md *lmm, int lmm_bytes, - __u16 *stripe_count); - int (*lsm_unpackmd) (struct lov_obd *lov, struct lov_stripe_md *lsm, - struct lov_mds_md *lmm); -}; - -extern const struct lsm_operations lsm_v1_ops; -extern const struct lsm_operations lsm_v3_ops; -static inline const struct lsm_operations *lsm_op_find(u32 magic) -{ - switch(magic) { - case LOV_MAGIC_V1: - return &lsm_v1_ops; - case LOV_MAGIC_V3: - return &lsm_v3_ops; - default: - CERROR("Cannot recognize lsm_magic %08x\n", magic); - return NULL; - } -} - static inline struct md_open_data *obd_mod_alloc(void) { struct md_open_data *mod; @@ -1178,6 +1179,7 @@ static inline struct md_open_data *obd_mod_alloc(void) void obdo_from_inode(struct obdo *dst, struct inode *src, u64 valid); void obdo_set_parent_fid(struct obdo *dst, const struct lu_fid *parent); +void obdo_set_o_projid(struct obdo *dst, u32 projid); /* return 1 if client should be resend request */ static inline int client_should_resend(int resend, struct client_obd *cli) @@ -1244,10 +1246,11 @@ bad_format: static inline int cli_brw_size(struct obd_device *obd) { LASSERT(obd != NULL); - return obd->u.cli.cl_max_pages_per_rpc << PAGE_CACHE_SHIFT; + return obd->u.cli.cl_max_pages_per_rpc << PAGE_SHIFT; } -/* when RPC size or the max RPCs in flight is increased, the max dirty pages +/* + * When RPC size or the max RPCs in flight is increased, the max dirty pages * of the client should be increased accordingly to avoid sending fragmented * RPCs over the network when the client runs out of the maximum dirty space * when so many RPCs are being generated. @@ -1255,10 +1258,10 @@ static inline int cli_brw_size(struct obd_device *obd) static inline void client_adjust_max_dirty(struct client_obd *cli) { /* initializing */ - if (cli->cl_dirty_max_pages <= 0) - cli->cl_dirty_max_pages = (OSC_MAX_DIRTY_DEFAULT * 1024 * 1024) - >> PAGE_CACHE_SHIFT; - else { + if (cli->cl_dirty_max_pages <= 0) { + cli->cl_dirty_max_pages = + (OSC_MAX_DIRTY_DEFAULT * 1024 * 1024) >> PAGE_SHIFT; + } else { unsigned long dirty_max = cli->cl_max_rpcs_in_flight * cli->cl_max_pages_per_rpc; @@ -1268,6 +1271,12 @@ static inline void client_adjust_max_dirty(struct client_obd *cli) if (cli->cl_dirty_max_pages > totalram_pages / 8) cli->cl_dirty_max_pages = totalram_pages / 8; + + /* This value is exported to userspace through the max_dirty_mb + * parameter. So we round up the number of pages to make it a round + * number of MBs. */ + cli->cl_dirty_max_pages = round_up(cli->cl_dirty_max_pages, + 1 << (20 - PAGE_SHIFT)); } #endif /* __OBD_H */