#ifndef __OBD_H
#define __OBD_H
+#include <linux/fs.h>
+#include <linux/posix_acl.h>
#include <linux/kobject.h>
#include <linux/spinlock.h>
#include <linux/sysfs.h>
+#include <linux/xarray.h>
#include <uapi/linux/lustre/lustre_idl.h>
#include <lustre_lib.h>
struct obd_info {
/* OBD_STATFS_* flags */
__u64 oi_flags;
+ struct obd_device *oi_obd;
+ struct lu_tgt_desc *oi_tgt;
/* statfs data specific for every OSC, if needed at all. */
struct obd_statfs *oi_osfs;
/* An update callback which is called to update some data on upper
};
struct obd_type {
- struct list_head typ_chain;
- struct obd_ops *typ_dt_ops;
- struct md_ops *typ_md_ops;
+ const struct obd_ops *typ_dt_ops;
+ const struct md_ops *typ_md_ops;
struct proc_dir_entry *typ_procroot;
- struct proc_dir_entry *typ_procsym;
struct dentry *typ_debugfs_entry;
#ifdef HAVE_SERVER_SUPPORT
bool typ_sym_filter;
#endif
- char *typ_name;
- int typ_refcnt;
+ atomic_t typ_refcnt;
struct lu_device_type *typ_lu;
- spinlock_t obd_type_lock;
- struct kobject *typ_kobj;
+ struct kobject typ_kobj;
};
+#define typ_name typ_kobj.name
+#define OBD_LU_TYPE_SETUP ((void *)0x01UL)
struct brw_page {
u64 off;
struct page *pg;
u32 count;
u32 flag;
+ /* used for encryption: difference with offset in clear text page */
+ u16 bp_off_diff;
+ /* used for encryption: difference with count in clear text page */
+ u16 bp_count_diff;
+ u32 bp_padding;
};
struct timeout_item {
#define OSC_MAX_DIRTY_MB_MAX 2048 /* arbitrary, but < MAX_LONG bytes */
#define OSC_DEFAULT_RESENDS 10
-/* possible values for fo_sync_lock_cancel */
-enum {
- NEVER_SYNC_ON_CANCEL = 0,
- BLOCKING_SYNC_ON_CANCEL = 1,
- ALWAYS_SYNC_ON_CANCEL = 2,
- NUM_SYNC_ON_CANCEL_STATES
+/* possible values for lut_sync_lock_cancel */
+enum tgt_sync_lock_cancel {
+ SYNC_LOCK_CANCEL_NEVER = 0,
+ SYNC_LOCK_CANCEL_BLOCKING = 1,
+ SYNC_LOCK_CANCEL_ALWAYS = 2,
};
/*
* vmalloc(). Excessive use of vmalloc() may cause spinlock contention
* on the MDS.
*/
-#define OBD_MAX_DEFAULT_EA_SIZE 4096
+#define OBD_MAX_DEFAULT_EA_SIZE 4096
+
+/*
+ * Lustre can handle larger xattrs internally, but we must respect the Linux
+ * VFS limitation or tools like tar cannot interact with Lustre volumes
+ * correctly.
+ */
+#define OBD_MAX_EA_SIZE XATTR_SIZE_MAX
+
enum obd_cl_sem_lock_class {
OBD_CLI_SEM_NORMAL,
/* the grant values are protected by loi_list_lock below */
unsigned long cl_dirty_pages; /* all _dirty_ in pages */
unsigned long cl_dirty_max_pages; /* allowed w/o rpc */
- unsigned long cl_dirty_transit; /* dirty synchronous */
unsigned long cl_avail_grant; /* bytes of credit for ost */
unsigned long cl_lost_grant; /* lost credits (trunc) */
/* grant consumed for dirty pages */
* grant before trying to dirty a page and unreserve the rest.
* See osc_{reserve|unreserve}_grant for details. */
long cl_reserved_grant;
- struct list_head cl_cache_waiters; /* waiting for cache/grant */
+ wait_queue_head_t cl_cache_waiters; /* waiting for cache/grant */
time64_t cl_next_shrink_grant; /* seconds */
struct list_head cl_grant_chain;
time64_t cl_grant_shrink_interval; /* seconds */
/* just a sum of the loi/lop pending numbers to be exported by /proc */
atomic_t cl_pending_w_pages;
atomic_t cl_pending_r_pages;
- __u32 cl_max_pages_per_rpc;
- __u32 cl_max_rpcs_in_flight;
- __u32 cl_short_io_bytes;
+ u32 cl_max_pages_per_rpc;
+ u32 cl_max_rpcs_in_flight;
+ u32 cl_max_short_io_bytes;
struct obd_histogram cl_read_rpc_hist;
struct obd_histogram cl_write_rpc_hist;
struct obd_histogram cl_read_page_hist;
atomic_t cl_destroy_in_flight;
wait_queue_head_t cl_destroy_waitq;
- struct mdc_rpc_lock *cl_rpc_lock;
-
/* modify rpcs in flight
* currently used for metadata only */
spinlock_t cl_mod_rpcs_lock;
__u32 cl_supp_cksum_types;
/* checksum algorithm to be used */
enum cksum_types cl_cksum_type;
+ /* preferred checksum algorithm to be used */
+ enum cksum_types cl_preferred_cksum_type;
/* also protected by the poorly named _loi_list_lock lock above */
struct osc_async_rc cl_ar;
/* ptlrpc work for writeback in ptlrpcd context */
void *cl_writeback_work;
void *cl_lru_work;
+ struct mutex cl_quota_mutex;
/* hash tables for osc_quota_info */
struct cfs_hash *cl_quota_hash[LL_MAXQUOTAS];
+ /* the xid of the request updating the hash tables */
+ __u64 cl_quota_last_xid;
/* Links to the global list of registered changelog devices */
struct list_head cl_chg_dev_linkage;
};
__u64 ec_unique;
};
-/* Generic subset of OSTs */
-struct ost_pool {
- __u32 *op_array; /* array of index of
- lov_obd->lov_tgts */
- unsigned int op_count; /* number of OSTs in the array */
- unsigned int op_size; /* allocated size of lp_array */
- struct rw_semaphore op_rw_sem; /* to protect ost_pool use */
-};
-
/* allow statfs data caching for 1 second */
#define OBD_STATFS_CACHE_SECONDS 1
+/* arbitrary maximum. larger would be useless, allows catching bogus input */
+#define OBD_STATFS_CACHE_MAX_AGE 3600 /* seconds */
-struct lov_tgt_desc {
- struct list_head ltd_kill;
- struct obd_uuid ltd_uuid;
- struct obd_device *ltd_obd;
- struct obd_export *ltd_exp;
- __u32 ltd_gen;
- __u32 ltd_index; /* index in lov_obd->tgts */
- unsigned long ltd_active:1,/* is this target up for requests */
- ltd_activate:1,/* should target be activated */
- ltd_reap:1; /* should this target be deleted */
-};
+#define lov_tgt_desc lu_tgt_desc
struct lov_md_tgt_desc {
struct obd_device *lmtd_mdc;
struct lov_obd {
struct lov_desc desc;
struct lov_tgt_desc **lov_tgts; /* sparse array */
- struct ost_pool lov_packed; /* all OSTs in a packed
+ struct lu_tgt_pool lov_packed; /* all OSTs in a packed
array */
struct mutex lov_lock;
struct obd_connect_data lov_ocd;
__u32 lov_tgt_size; /* size of tgts array */
int lov_connects;
int lov_pool_count;
- struct cfs_hash *lov_pools_hash_body; /* used for key access */
+ struct rhashtable lov_pools_hash_body; /* used for key access */
struct list_head lov_pool_list; /* used for sequential access */
struct proc_dir_entry *lov_pool_proc_entry;
enum lustre_sec_part lov_sp_me;
struct kobject *lov_tgts_kobj;
};
-struct lmv_tgt_desc {
- struct obd_uuid ltd_uuid;
- struct obd_device *ltd_obd;
- struct obd_export *ltd_exp;
- __u32 ltd_idx;
- struct mutex ltd_fid_mutex;
- unsigned long ltd_active:1; /* target up for requests */
-};
+#define lmv_tgt_desc lu_tgt_desc
struct lmv_obd {
struct lu_client_fld lmv_fld;
spinlock_t lmv_lock;
- struct lmv_desc desc;
- struct mutex lmv_init_mutex;
int connected;
int max_easize;
int max_def_easize;
+ u32 lmv_statfs_start;
- __u32 tgts_size; /* size of tgts array */
- struct lmv_tgt_desc **tgts;
- int lmv_statfs_start;
-
+ struct lu_tgt_descs lmv_mdt_descs;
struct obd_connect_data conn_data;
struct kobject *lmv_tgts_kobj;
+ void *lmv_cache;
+
+ __u32 lmv_qos_rr_index;
};
+#define lmv_mdt_count lmv_mdt_descs.ltd_lmv_desc.ld_tgt_count
+#define lmv_qos lmv_mdt_descs.ltd_qos
+
+/* Minimum sector size is 512 */
+#define MAX_GUARD_NUMBER (PAGE_SIZE / 512)
+
struct niobuf_local {
__u64 lnb_file_offset;
__u32 lnb_page_offset;
int lnb_rc;
struct page *lnb_page;
void *lnb_data;
+ __u16 lnb_guards[MAX_GUARD_NUMBER];
+ __u16 lnb_guard_rpc:1;
+ __u16 lnb_guard_disk:1;
+ /* separate unlock for read path to allow shared access */
+ __u16 lnb_locked:1;
};
struct tgt_thread_big_cache {
struct llog_ctxt *olg_ctxts[LLOG_MAX_CTXTS];
wait_queue_head_t olg_waitq;
spinlock_t olg_lock;
- struct mutex olg_cat_processing;
};
/* corresponds to one of the obd's */
obd_abort_recovery:1, /* recovery expired */
obd_version_recov:1, /* obd uses version checking */
obd_replayable:1, /* recovery enabled; inform clients */
- obd_no_transno:1, /* no committed-transno notification */
obd_no_recov:1, /* fail instead of retry messages */
obd_stopping:1, /* started cleanup */
obd_starting:1, /* started setup */
obd_no_ir:1, /* no imperative recovery. */
obd_process_conf:1, /* device is processing mgs config */
obd_checksum_dump:1; /* dump pages upon cksum error */
+#ifdef HAVE_SERVER_SUPPORT
+ /* no committed-transno notification */
+ unsigned long obd_no_transno:1;
+#endif
/* use separate field as it is set in interrupt to don't mess with
* protection of other bits using _bh lock */
unsigned long obd_recovery_expired:1;
/* uuid-export hash body */
- struct cfs_hash *obd_uuid_hash;
+ struct rhashtable obd_uuid_hash;
/* nid-export hash body */
- struct cfs_hash *obd_nid_hash;
+ struct rhltable obd_nid_hash;
/* nid stats body */
struct cfs_hash *obd_nid_stats_hash;
/* client_generation-export hash body */
struct list_head obd_exports_timed;
time64_t obd_eviction_timer; /* for ping evictor */
- int obd_max_recoverable_clients;
+ atomic_t obd_max_recoverable_clients;
atomic_t obd_connected_clients;
int obd_stale_clients;
/* this lock protects all recovery list_heads, timer and
int obd_requests_queued_for_recovery;
wait_queue_head_t obd_next_transno_waitq;
/* protected by obd_recovery_task_lock */
- struct timer_list obd_recovery_timer;
+ struct hrtimer obd_recovery_timer;
/* seconds */
time64_t obd_recovery_start;
/* seconds, for lprocfs_status */
time64_t obd_recovery_end;
- time64_t obd_recovery_time_hard;
- time64_t obd_recovery_timeout;
- int obd_recovery_ir_factor;
+ /* To tell timeouts from time stamps Lustre uses timeout_t
+ * instead of time64_t.
+ */
+ timeout_t obd_recovery_time_hard;
+ timeout_t obd_recovery_timeout;
+ int obd_recovery_ir_factor;
/* new recovery stuff from CMD2 */
int obd_replayed_locks;
struct completion obd_kobj_unregister;
};
+int obd_uuid_add(struct obd_device *obd, struct obd_export *export);
+void obd_uuid_del(struct obd_device *obd, struct obd_export *export);
+#ifdef HAVE_SERVER_SUPPORT
+struct obd_export *obd_uuid_lookup(struct obd_device *obd,
+ struct obd_uuid *uuid);
+
+int obd_nid_export_for_each(struct obd_device *obd, lnet_nid_t nid,
+ int cb(struct obd_export *exp, void *data),
+ void *data);
+int obd_nid_add(struct obd_device *obd, struct obd_export *exp);
+void obd_nid_del(struct obd_device *obd, struct obd_export *exp);
+#endif
+
/* get/set_info keys */
#define KEY_ASYNC "async"
#define KEY_CHANGELOG_CLEAR "changelog_clear"
/* KEY_SET_INFO in lustre_idl.h */
#define KEY_SPTLRPC_CONF "sptlrpc_conf"
-#define KEY_CACHE_SET "cache_set"
#define KEY_CACHE_LRU_SHRINK "cache_lru_shrink"
#define KEY_OSP_CONNECTED "osp_connected"
+/*
+ * Flags for op_xvalid, the "eXtra validity flags" field declared further
+ * down in this header alongside op_valid (which carries the OBD_MD_* bits).
+ * Each flag is defined with BIT() on its own bit position; the trailing
+ * comment on each line records the corresponding value in hex.
+ */
+enum op_xvalid {
+ OP_XVALID_CTIME_SET = BIT(0), /* 0x0001 */
+ OP_XVALID_BLOCKS = BIT(1), /* 0x0002 */
+ OP_XVALID_OWNEROVERRIDE = BIT(2), /* 0x0004 */
+ OP_XVALID_FLAGS = BIT(3), /* 0x0008 */
+ OP_XVALID_PROJID = BIT(4), /* 0x0010 */
+ OP_XVALID_LAZYSIZE = BIT(5), /* 0x0020 */
+ OP_XVALID_LAZYBLOCKS = BIT(6), /* 0x0040 */
+};
+
struct lu_context;
static inline int it_to_lock_mode(struct lookup_intent *it)
}
enum md_op_flags {
- MF_MDC_CANCEL_FID1 = 1 << 0,
- MF_MDC_CANCEL_FID2 = 1 << 1,
- MF_MDC_CANCEL_FID3 = 1 << 2,
- MF_MDC_CANCEL_FID4 = 1 << 3,
- MF_GET_MDT_IDX = 1 << 4,
+ MF_MDC_CANCEL_FID1 = BIT(0),
+ MF_MDC_CANCEL_FID2 = BIT(1),
+ MF_MDC_CANCEL_FID3 = BIT(2),
+ MF_MDC_CANCEL_FID4 = BIT(3),
+ MF_GET_MDT_IDX = BIT(4),
};
enum md_cli_flags {
- CLI_SET_MEA = 1 << 0,
- CLI_RM_ENTRY = 1 << 1,
- CLI_HASH64 = 1 << 2,
- CLI_API32 = 1 << 3,
- CLI_MIGRATE = 1 << 4,
+ CLI_SET_MEA = BIT(0),
+ CLI_RM_ENTRY = BIT(1),
+ CLI_HASH64 = BIT(2),
+ CLI_API32 = BIT(3),
+ CLI_MIGRATE = BIT(4),
+ CLI_DIRTY_DATA = BIT(5),
+};
+
+enum md_op_code { /* type of the op_code field declared below */
+ LUSTRE_OPC_MKDIR = 0,
+ LUSTRE_OPC_SYMLINK = 1,
+ LUSTRE_OPC_MKNOD = 2,
+ LUSTRE_OPC_CREATE = 3,
+ LUSTRE_OPC_ANY = 5, /* value 4 is not assigned in this enum */
};
/**
struct lu_fid op_fid4; /* to the operation locks. */
u32 op_mds; /* what mds server open will go to */
__u32 op_mode;
- struct lustre_handle op_handle;
+ enum md_op_code op_code;
+ struct lustre_handle op_open_handle;
s64 op_mod_time;
const char *op_name;
size_t op_namelen;
+ struct rw_semaphore *op_mea1_sem;
+ struct rw_semaphore *op_mea2_sem;
struct lmv_stripe_md *op_mea1;
struct lmv_stripe_md *op_mea2;
+ struct lmv_stripe_md *op_default_mea1; /* default LMV */
__u32 op_suppgids[2];
__u32 op_fsuid;
__u32 op_fsgid;
/* iattr fields and blocks. */
struct iattr op_attr;
+ enum op_xvalid op_xvalid; /* eXtra validity flags */
loff_t op_attr_blocks;
- __u64 op_valid; /* OBD_MD_* */
- unsigned int op_attr_flags; /* LUSTRE_{SYNC,..}_FL */
+ u64 op_valid; /* OBD_MD_* */
+ unsigned int op_attr_flags; /* LUSTRE_{SYNC,..}_FL */
enum md_op_flags op_flags;
__u64 op_data_version;
struct lustre_handle op_lease_handle;
- /* File security context, for creates. */
+ /* File security context, for creates/metadata ops */
const char *op_file_secctx_name;
+ __u32 op_file_secctx_name_size;
void *op_file_secctx;
__u32 op_file_secctx_size;
- /* default stripe offset */
- __u32 op_default_stripe_offset;
-
__u32 op_projid;
/* Used by readdir */
unsigned int op_max_pages;
__u16 op_mirror_id;
+
+ /*
+ * Used to access a directory whose layout is changing: if set, the
+ * directory is accessed by its new layout, otherwise by its old layout.
+ * It is not set by default because new files are created under the new
+ * layout: if a name is found under neither the old nor the new layout,
+ * checked in that order, the file is certain not to exist, whereas
+ * checking in the reverse order could race with a creation by others.
+ */
+ bool op_new_layout;
+ /* used to access dir with bad hash */
+ __u32 op_stripe_index;
+ /* Archive ID for PCC attach */
+ __u32 op_archive_id;
};
struct md_callback {
__u32 keylen, void *key,
__u32 vallen, void *val,
struct ptlrpc_request_set *set);
- int (*o_setup) (struct obd_device *dev, struct lustre_cfg *cfg);
- int (*o_precleanup)(struct obd_device *dev);
- int (*o_cleanup)(struct obd_device *dev);
- int (*o_process_config)(struct obd_device *dev, size_t len, void *data);
- int (*o_postrecov)(struct obd_device *dev);
+ int (*o_setup) (struct obd_device *obd, struct lustre_cfg *cfg);
+ int (*o_precleanup)(struct obd_device *obd);
+ int (*o_cleanup)(struct obd_device *obd);
+ int (*o_process_config)(struct obd_device *obd, size_t len, void *data);
+ int (*o_postrecov)(struct obd_device *obd);
int (*o_add_conn)(struct obd_import *imp, struct obd_uuid *uuid,
int priority);
int (*o_del_conn)(struct obd_import *imp, struct obd_uuid *uuid);
int (*o_quotactl)(struct obd_device *, struct obd_export *,
struct obd_quotactl *);
- int (*o_ping)(const struct lu_env *, struct obd_export *exp);
-
/* pools methods */
int (*o_pool_new)(struct obd_device *obd, char *poolname);
int (*o_pool_del)(struct obd_device *obd, char *poolname);
struct lustre_md {
struct mdt_body *body;
struct lu_buf layout;
- struct lmv_stripe_md *lmv;
-#ifdef CONFIG_FS_POSIX_ACL
+ union {
+ struct lmv_stripe_md *lmv;
+ struct lmv_foreign_md *lfm;
+ };
+ struct lmv_stripe_md *default_lmv;
+#ifdef CONFIG_LUSTRE_FS_POSIX_ACL
struct posix_acl *posix_acl;
#endif
};
+#ifdef CONFIG_LUSTRE_FS_POSIX_ACL
+/*
+ * Release md->posix_acl through posix_acl_release() and reset the pointer
+ * to NULL.  Does nothing when no ACL is attached.
+ */
+static inline void lmd_clear_acl(struct lustre_md *md)
+{
+	if (!md->posix_acl)
+		return;
+
+	posix_acl_release(md->posix_acl);
+	md->posix_acl = NULL;
+}
+
+/* ACL-related OBD_CONNECT_* bits, available when ACL support is built in */
+#define OBD_CONNECT_ACL_FLAGS \
+	(OBD_CONNECT_ACL | OBD_CONNECT_UMASK | OBD_CONNECT_LARGE_ACL)
+#else
+/*
+ * ACL support is compiled out, and with it the posix_acl member of
+ * struct lustre_md, so there is nothing to release.
+ */
+static inline void lmd_clear_acl(struct lustre_md *md)
+{
+}
+
+/* no ACL-related connect bits when ACL support is compiled out */
+#define OBD_CONNECT_ACL_FLAGS (0)
+#endif
+
struct md_open_data {
struct obd_client_handle *mod_och;
struct ptlrpc_request *mod_open_req;
};
struct obd_client_handle {
- struct lustre_handle och_fh;
+ struct lustre_handle och_open_handle;
struct lu_fid och_fid;
struct md_open_data *och_mod;
struct lustre_handle och_lease_handle; /* open lock for lease */
int (*m_free_lustre_md)(struct obd_export *, struct lustre_md *);
- int (*m_merge_attr)(struct obd_export *,
+ int (*m_merge_attr)(struct obd_export *, const struct lu_fid *fid,
const struct lmv_stripe_md *lsm,
struct cl_attr *attr, ldlm_blocking_callback);
struct lu_fid *fid);
int (*m_unpackmd)(struct obd_export *exp, struct lmv_stripe_md **plsm,
const union lmv_mds_md *lmv, size_t lmv_size);
+ int (*m_rmfid)(struct obd_export *exp, struct fid_array *fa, int *rcs,
+ struct ptlrpc_request_set *set);
};
static inline struct md_open_data *obd_mod_alloc(void)
cli->cl_dirty_max_pages = dirty_max;
}
- if (cli->cl_dirty_max_pages > totalram_pages / 8)
- cli->cl_dirty_max_pages = totalram_pages / 8;
+ if (cli->cl_dirty_max_pages > cfs_totalram_pages() / 8)
+ cli->cl_dirty_max_pages = cfs_totalram_pages() / 8;
/* This value is exported to userspace through the max_dirty_mb
* parameter. So we round up the number of pages to make it a round
1 << (20 - PAGE_SHIFT));
}
+/*
+ * Map @page to page->mapping->host.
+ *
+ * Returns NULL when the page has no mapping, or when PageAnon() reports
+ * the page as anonymous.  PageAnon() is only consulted once the mapping
+ * is known to be non-NULL.
+ */
+static inline struct inode *page2inode(struct page *page)
+{
+	if (!page->mapping || PageAnon(page))
+		return NULL;
+
+	return page->mapping->host;
+}
+
#endif /* __OBD_H */