X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Fosd-ldiskfs%2Fosd_internal.h;h=bb3bec1158dcc405fe4bbffd1022f2d0af80b5a9;hp=ce78da1d420f4cf2c5c78564f72d098aa177acb1;hb=68635c3d9b3113621b93fd989f1a3f8f064385b9;hpb=ae85eecd5fd7921e248fbe84bb2bd9ad22f07639;ds=sidebyside diff --git a/lustre/osd-ldiskfs/osd_internal.h b/lustre/osd-ldiskfs/osd_internal.h index ce78da1..bb3bec1 100644 --- a/lustre/osd-ldiskfs/osd_internal.h +++ b/lustre/osd-ldiskfs/osd_internal.h @@ -15,11 +15,7 @@ * * You should have received a copy of the GNU General Public License * version 2 along with this program; If not, see - * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf - * - * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, - * CA 95054 USA or visit www.sun.com if you need additional information or - * have any questions. + * http://www.gnu.org/licenses/gpl-2.0.html * * GPL HEADER END */ @@ -27,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2013, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -53,6 +49,7 @@ /* struct dirent64 */ #include #include +#include #include #include @@ -70,6 +67,7 @@ #include "osd_quota_fmt.h" struct inode; +extern struct kmem_cache *dynlock_cachep; #define OSD_COUNTERS (0) @@ -78,6 +76,7 @@ struct inode; /* OI scrub should skip this inode. */ #define LDISKFS_STATE_LUSTRE_NOSCRUB 31 +#define LDISKFS_STATE_LUSTRE_DESTROY 30 /** Enable thandle usage statistics */ #define OSD_THANDLE_STATS (0) @@ -88,6 +87,17 @@ struct inode; #define ADMIN_USR "admin_quotafile_v2.usr" #define ADMIN_GRP "admin_quotafile_v2.grp" +/* Statfs space reservation for fragmentation and local objects */ +#define OSD_STATFS_RESERVED (1ULL << 23) /* 8MB */ +#define OSD_STATFS_RESERVED_SHIFT (7) /* reserve 0.78% of all space */ + +/* check if ldiskfs support project quota */ +#ifndef LDISKFS_IOC_FSSETXATTR +#undef HAVE_PROJECT_QUOTA +#endif + +#define OBD_BRW_MAPPED OBD_BRW_LOCAL1 + struct osd_directory { struct iam_container od_container; struct iam_descr od_descr; @@ -107,36 +117,42 @@ struct osd_oi { extern const int osd_dto_credits_noquota[]; struct osd_object { - struct dt_object oo_dt; - /** - * Inode for file system object represented by this osd_object. This - * inode is pinned for the whole duration of lu_object life. - * - * Not modified concurrently (either setup early during object - * creation, or assigned by osd_object_create() under write lock). - */ - struct inode *oo_inode; - /** - * to protect index ops. - */ - struct htree_lock_head *oo_hl_head; + struct dt_object oo_dt; + /** + * Inode for file system object represented by this osd_object. This + * inode is pinned for the whole duration of lu_object life. + * + * Not modified concurrently (either setup early during object + * creation, or assigned by osd_create() under write lock). + */ + struct inode *oo_inode; + /** + * to protect index ops. + */ + struct htree_lock_head *oo_hl_head; struct rw_semaphore oo_ext_idx_sem; struct rw_semaphore oo_sem; struct osd_directory *oo_dir; /** protects inode attributes. */ spinlock_t oo_guard; - /** - * Following two members are used to indicate the presence of dot and - * dotdot in the given directory. This is required for interop mode - * (b11826). - */ - int oo_compat_dot_created; - int oo_compat_dotdot_created; + + /** + * Following two members *compat_dot* are used to indicate + * the presence of dot and dotdot in the given directory. + * This is required for interop mode (b11826). + */ + __u32 oo_destroyed:1, + oo_pfid_in_lma:1, + oo_compat_dot_created:1, + oo_compat_dotdot_created:1; + + /* the i_flags in LMA */ + __u32 oo_lma_flags; const struct lu_env *oo_owner; -#ifdef CONFIG_LOCKDEP - struct lockdep_map oo_dep_map; -#endif + + struct list_head oo_xattr_list; + struct lu_object_header *oo_header; }; struct osd_obj_seq { @@ -144,7 +160,7 @@ struct osd_obj_seq { int oos_subdir_count; /* subdir count for each seq */ struct dentry *oos_root; /* O/ */ struct dentry **oos_dirs; /* O//d0-dXX */ - obd_seq oos_seq; /* seq number */ + u64 oos_seq; /* seq number */ struct list_head oos_seq_list; /* list to seq_list */ }; @@ -158,23 +174,23 @@ struct osd_obj_map { struct osd_mdobj { struct dentry *om_root; /* AGENT/ */ - obd_seq om_index; /* mdt index */ + u64 om_index; /* mdt index */ struct list_head om_list; /* list to omm_list */ }; struct osd_mdobj_map { struct dentry *omm_remote_parent; }; - -#define osd_ldiskfs_add_entry(handle, child, cinode, hlock) \ - ldiskfs_add_entry(handle, child, cinode, hlock) +int osd_ldiskfs_add_entry(struct osd_thread_info *info, struct osd_device *osd, + handle_t *handle, struct dentry *child, + struct inode *inode, struct htree_lock *hlock); #define OSD_OTABLE_IT_CACHE_SIZE 64 #define OSD_OTABLE_IT_CACHE_MASK (~(OSD_OTABLE_IT_CACHE_SIZE - 1)) struct osd_inconsistent_item { - /* link into osd_scrub::os_inconsistent_items, - * protected by osd_scrub::os_lock. */ + /* link into lustre_scrub::os_inconsistent_items, + * protected by lustre_scrub::os_lock. */ struct list_head oii_list; /* The right FID <=> ino#/gen mapping. */ @@ -193,15 +209,16 @@ struct osd_otable_cache { int ooc_consumer_idx; /* How many items in ooc_cache. */ - int ooc_cached_items; + __u64 ooc_cached_items; /* Position for up layer LFSCK iteration pre-loading. */ - __u32 ooc_pos_preload; + __u64 ooc_pos_preload; }; struct osd_otable_it { struct osd_device *ooi_dev; struct osd_otable_cache ooi_cache; + struct osd_iit_param ooi_iit_param; /* The following bits can be updated/checked w/o lock protection. * If more bits will be introduced in the future and need lock to @@ -215,6 +232,20 @@ struct osd_otable_it { ooi_waiting:1; /* it::next is waiting. */ }; +struct osd_obj_orphan { + struct list_head oor_list; + struct lu_env *oor_env; /* to identify "own" records */ + __u32 oor_ino; +}; + +enum osd_t10_type { + OSD_T10_TYPE_UNKNOWN = 0, + OSD_T10_TYPE1_CRC, + OSD_T10_TYPE3_CRC, + OSD_T10_TYPE1_IP, + OSD_T10_TYPE3_IP +}; + /* * osd device. */ @@ -232,20 +263,19 @@ struct osd_device { */ unsigned int od_fl_capa:1, od_maybe_new:1, - od_noscrub:1, - od_dirent_journal:1, od_igif_inoi:1, od_check_ff:1, od_is_ost:1, - od_lma_self_repair:1; - - unsigned long od_capa_timeout; - __u32 od_capa_alg; - struct lustre_capa_key *od_capa_keys; - struct hlist_head *od_capa_hash; - + od_in_init:1, + od_index_in_idif:1, + /* Other flags */ + od_nonrotational:1; + + __s64 od_auto_scrub_interval; + __u32 od_dirent_journal; + int od_index; struct proc_dir_entry *od_proc_entry; - struct lprocfs_stats *od_stats; + struct lprocfs_stats *od_stats; spinlock_t od_osfs_lock; @@ -272,27 +302,71 @@ struct osd_device { char od_svname[MAX_OBD_NAME]; char od_mntdev[MAX_OBD_NAME]; - /* quota slave instance */ - struct qsd_instance *od_quota_slave; + /* quota slave instance for inode */ + struct qsd_instance *od_quota_slave_md; + + /* quota slave instance for block */ + struct qsd_instance *od_quota_slave_dt; /* osd seq instance */ struct lu_client_seq *od_cl_seq; + /* If the ratio of "the total OI mappings count" vs + * "the bad OI mappings count" is lower than the + * osd_device::od_full_scrub_ratio, then trigger + * OI scrub to scan the whole the device. */ + __u64 od_full_scrub_ratio; + /* If the speed of found bad OI mappings (per minute) + * exceeds the osd_device::od_full_scrub_threshold_rate, + * then trigger OI scrub to scan the whole device. */ + __u64 od_full_scrub_threshold_rate; + + /* a list of orphaned agent inodes, protected with od_osfs_lock */ + struct list_head od_orphan_list; + struct list_head od_index_backup_list; + struct list_head od_index_restore_list; + spinlock_t od_lock; + struct inode *od_index_backup_inode; + enum lustre_index_backup_policy od_index_backup_policy; + int od_index_backup_stop; + /* T10PI type, zero if not supported */ + enum osd_t10_type od_t10_type; +}; + +static inline struct qsd_instance *osd_def_qsd(struct osd_device *osd) +{ + if (osd->od_is_ost) + return osd->od_quota_slave_dt; + else + return osd->od_quota_slave_md; +} + +enum osd_full_scrub_ratio { + /* Trigger OI scrub to scan the whole device directly. */ + OFSR_DIRECTLY = 0, + + /* Because the bad OI mappings count cannot be larger than + * the total OI mappints count, then setting OFSR_NEVER means + * that the whole device scanning cannot be triggered by auto + * detected bad OI mappings during the RPC services. */ + OFSR_NEVER = 1, + OFSR_DEFAULT = 10000, }; -/* There are at most 10 uid/gids are affected in a transaction, and +#define FULL_SCRUB_THRESHOLD_RATE_DEFAULT 60 + +/* There are at most 15 uid/gid/projids are affected in a transaction, and * that's rename case: - * - 2 for source parent uid & gid; - * - 2 for source child uid & gid ('..' entry update when child is directory); - * - 2 for target parent uid & gid; - * - 2 for target child uid & gid (if the target child exists); - * - 2 for root uid & gid (last_rcvd, llog, etc); + * - 3 for source parent uid & gid & projid; + * - 3 for source child uid & gid & projid ('..' entry update when + * child is directory); + * - 3 for target parent uid & gid & projid; + * - 3 for target child uid & gid & projid(if the target child exists); + * - 3 for root uid & gid(last_rcvd, llog, etc); * - * The 0 to (OSD_MAX_UGID_CNT - 1) bits of ot_id_type is for indicating - * the id type of each id in the ot_id_array. */ -#define OSD_MAX_UGID_CNT 10 +#define OSD_MAX_UGID_CNT 15 -enum { +enum osd_op_type { OSD_OT_ATTR_SET = 0, OSD_OT_PUNCH = 1, OSD_OT_XATTR_SET = 2, @@ -303,30 +377,43 @@ enum { OSD_OT_WRITE = 7, OSD_OT_INSERT = 8, OSD_OT_DELETE = 9, - OSD_OT_UPDATE = 10, - OSD_OT_QUOTA = 11, - OSD_OT_MAX = 12 + OSD_OT_QUOTA = 10, + OSD_OT_MAX = 11 +}; + +struct osd_access_lock { + struct list_head tl_list; + struct osd_object *tl_obj; + bool tl_shared; + bool tl_truncate; }; struct osd_thandle { struct thandle ot_super; handle_t *ot_handle; struct ldiskfs_journal_cb_entry ot_jcb; - struct list_head ot_dcb_list; + struct list_head ot_commit_dcb_list; + struct list_head ot_stop_dcb_list; /* Link to the device, for debugging. */ struct lu_ref_link ot_dev_link; - unsigned short ot_credits; - unsigned short ot_id_cnt; - unsigned short ot_id_type; - uid_t ot_id_array[OSD_MAX_UGID_CNT]; + unsigned int ot_credits; + + /* quota IDs related to the transaction */ + unsigned short ot_id_cnt; + __u8 ot_id_res[OSD_MAX_UGID_CNT]; + __u8 ot_id_types[OSD_MAX_UGID_CNT]; + uid_t ot_id_array[OSD_MAX_UGID_CNT]; struct lquota_trans *ot_quota_trans; + + unsigned int ot_remove_agents:1; #if OSD_THANDLE_STATS /** time when this handle was allocated */ - cfs_time_t oth_alloced; + ktime_t oth_alloced; /** time when this thanle was started */ - cfs_time_t oth_started; + ktime_t oth_started; #endif + struct list_head ot_trunc_locks; }; /** @@ -351,7 +438,7 @@ enum dt_txn_op { * osd dev stats */ -#ifdef LPROCFS +#ifdef CONFIG_PROC_FS enum { LPROC_OSD_READ_BYTES = 0, LPROC_OSD_WRITE_BYTES = 1, @@ -396,24 +483,25 @@ struct osd_it_ea_dirent { * there would be one ext3 readdir for every mdd readdir page. */ -#define OSD_IT_EA_BUFSIZE (PAGE_CACHE_SIZE + PAGE_CACHE_SIZE/4) +#define OSD_IT_EA_BUFSIZE (PAGE_SIZE + PAGE_SIZE/4) /** * This is iterator's in-memory data structure in interoperability * mode (i.e. iterator over ldiskfs style directory) */ struct osd_it_ea { - struct osd_object *oie_obj; - /** used in ldiskfs iterator, to stored file pointer */ - struct file oie_file; - /** how many entries have been read-cached from storage */ - int oie_rd_dirent; - /** current entry is being iterated by caller */ - int oie_it_dirent; - /** current processing entry */ - struct osd_it_ea_dirent *oie_dirent; - /** buffer to hold entries, size == OSD_IT_EA_BUFSIZE */ - void *oie_buf; + struct osd_object *oie_obj; + /** used in ldiskfs iterator, to stored file pointer */ + struct file oie_file; + /** how many entries have been read-cached from storage */ + int oie_rd_dirent; + /** current entry is being iterated by caller */ + int oie_it_dirent; + /** current processing entry */ + struct osd_it_ea_dirent *oie_dirent; + /** buffer to hold entries, size == OSD_IT_EA_BUFSIZE */ + void *oie_buf; + struct dentry oie_dentry; }; /** @@ -447,7 +535,7 @@ struct osd_it_quota { struct list_head oiq_list; }; -#define MAX_BLOCKS_PER_PAGE (PAGE_CACHE_SIZE / 512) +#define MAX_BLOCKS_PER_PAGE (PAGE_SIZE / 512) struct osd_iobuf { wait_queue_head_t dr_wait; @@ -456,32 +544,43 @@ struct osd_iobuf { int dr_npages; int dr_error; int dr_frags; - unsigned int dr_ignore_quota:1; unsigned int dr_elapsed_valid:1; /* we really did count time */ unsigned int dr_rw:1; struct lu_buf dr_pg_buf; struct page **dr_pages; + struct niobuf_local **dr_lnbs; struct lu_buf dr_bl_buf; - unsigned long *dr_blocks; - unsigned long dr_start_time; - unsigned long dr_elapsed; /* how long io took */ + struct lu_buf dr_lnb_buf; + sector_t *dr_blocks; + ktime_t dr_start_time; + ktime_t dr_elapsed; /* how long io took */ struct osd_device *dr_dev; unsigned int dr_init_at; /* the line iobuf was initialized */ }; +#define OSD_INS_CACHE_SIZE 8 + struct osd_thread_info { - const struct lu_env *oti_env; - /** - * used for index operations. - */ - struct dentry oti_obj_dentry; - struct dentry oti_child_dentry; + const struct lu_env *oti_env; + /** + * used for index operations. + */ + struct dentry oti_obj_dentry; + struct dentry oti_child_dentry; + + /** dentry for Iterator context. */ + struct dentry oti_it_dentry; + + union { + /* fake struct file for osd_object_sync */ + struct file oti_file; + /* osd_statfs() */ + struct kstatfs oti_ksfs; + }; - /** dentry for Iterator context. */ - struct dentry oti_it_dentry; - struct htree_lock *oti_hlock; + struct htree_lock *oti_hlock; - struct lu_fid oti_fid; + struct lu_fid oti_fid; struct lu_fid oti_fid2; struct lu_fid oti_fid3; struct osd_inode_id oti_id; @@ -493,19 +592,6 @@ struct osd_thread_info { * XXX temporary: for ->i_op calls. */ struct timespec oti_time; - /* - * XXX temporary: fake struct file for osd_object_sync - */ - struct file oti_file; - /* - * XXX temporary: for capa operations. - */ - struct lustre_capa_key oti_capa_key; - struct lustre_capa oti_capa; - - /** osd_device reference, initialized in osd_trans_start() and - used in osd_trans_stop() */ - struct osd_device *oti_dev; /** * following ipd and it structures are used for osd_index_iam_lookup() @@ -513,26 +599,16 @@ struct osd_thread_info { * in open iterator session. */ - /** osd iterator context used for iterator session */ - - union { - struct osd_it_iam oti_it; - /* ldiskfs iterator data structure, - * see osd_it_ea_{init, fini} */ - struct osd_it_ea oti_it_ea; - struct osd_it_quota oti_it_quota; - }; - /** pre-allocated buffer used by oti_it_ea, size OSD_IT_EA_BUFSIZE */ void *oti_it_ea_buf; + unsigned int oti_it_ea_buf_used:1; - struct kstatfs oti_ksfs; - - /** IAM iterator for index operation. */ - struct iam_iterator oti_idx_it; + /* IAM iterator for index operation. */ + struct iam_iterator oti_idx_it; /** union to guarantee that ->oti_ipd[] has proper alignment. */ union { + char oti_name[48]; char oti_it_ipd[DX_IPD_MAX_SIZE]; long long oti_alignment_lieutenant; }; @@ -544,28 +620,38 @@ struct osd_thread_info { struct osd_idmap_cache oti_cache; - int oti_r_locks; - int oti_w_locks; - int oti_txns; - /** used in osd_fid_set() to put xattr */ - struct lu_buf oti_buf; - struct lu_buf oti_big_buf; - /** used in osd_ea_fid_set() to set fid into common ea */ + /* dedicated OI cache for insert (which needs inum) */ + struct osd_idmap_cache *oti_ins_cache; + int oti_ins_cache_size; + int oti_ins_cache_used; + /* inc by osd_trans_create and dec by osd_trans_stop */ + int oti_ins_cache_depth; + + int oti_r_locks; + int oti_w_locks; + int oti_txns; + /** used in osd_fid_set() to put xattr */ + struct lu_buf oti_buf; + struct lu_buf oti_big_buf; + /** used in osd_ea_fid_set() to set fid into common ea */ union { - struct lustre_mdt_attrs oti_mdt_attrs; - /* old LMA for compatibility */ - char oti_mdt_attrs_old[LMA_OLD_SIZE]; + struct lustre_ost_attrs oti_ost_attrs; + struct filter_fid_18_23 oti_ff_old; + struct filter_fid oti_ff; }; /** 0-copy IO */ - struct osd_iobuf oti_iobuf; - struct inode oti_inode; + struct osd_iobuf oti_iobuf; + /* used to access objects in /O */ + struct inode *oti_inode; #define OSD_FID_REC_SZ 32 - char oti_ldp[OSD_FID_REC_SZ]; - char oti_ldp2[OSD_FID_REC_SZ]; + char oti_ldp[OSD_FID_REC_SZ]; + char oti_ldp2[OSD_FID_REC_SZ]; /* used by quota code */ union { -#ifdef HAVE_DQUOT_FS_DISK_QUOTA +#if defined(HAVE_DQUOT_QC_DQBLK) + struct qc_dqblk oti_qdq; +#elif defined(HAVE_DQUOT_FS_DISK_QUOTA) struct fs_disk_quota oti_fdq; #else struct if_dqblk oti_dqblk; @@ -581,20 +667,34 @@ struct osd_thread_info { /* Tracking for transaction credits, to allow debugging and optimizing * cases where a large number of credits are being allocated for * single transaction. */ - unsigned short oti_declare_ops[OSD_OT_MAX]; - unsigned short oti_declare_ops_rb[OSD_OT_MAX]; - unsigned short oti_declare_ops_cred[OSD_OT_MAX]; - bool oti_rollback; - - char oti_name[48]; - union { - struct filter_fid_old oti_ff; - struct filter_fid oti_ff_new; - }; + unsigned int oti_credits_before; + unsigned int oti_declare_ops[OSD_OT_MAX]; + unsigned int oti_declare_ops_cred[OSD_OT_MAX]; + unsigned int oti_declare_ops_used[OSD_OT_MAX]; + struct osd_directory oti_iam; + + struct page **oti_dio_pages; + int oti_dio_pages_used; }; extern int ldiskfs_pdo; +#ifndef HAVE_VFS_SETXATTR +#define osd_setxattr(dentry, inode, name, buf, len, flag) \ + ((inode)->i_op->setxattr(dentry, name, buf, len, flag)) +#define osd_getxattr(dentry, inode, name, buf, len) \ + ((inode)->i_op->getxattr(dentry, name, buf, len)) +#define osd_removexattr(dentry, inode, name) \ + ((inode)->i_op->removexattr(dentry, name)) +#else /* HAVE_VFS_SETXATTR */ +#define osd_setxattr(dentry, inode, name, buf, len, flag) \ + __vfs_setxattr(dentry, inode, name, buf, len, flag) +#define osd_getxattr(dentry, inode, name, buf, len) \ + __vfs_getxattr(dentry, inode, name, buf, len) +#define osd_removexattr(dentry, inode, name) \ + __vfs_removexattr(dentry, name) +#endif /* !HAVE_VFS_SETXATTR */ + static inline int __osd_xattr_get(struct inode *inode, struct dentry *dentry, const char *name, void *buf, int len) { @@ -603,7 +703,7 @@ static inline int __osd_xattr_get(struct inode *inode, struct dentry *dentry, dentry->d_inode = inode; dentry->d_sb = inode->i_sb; - return inode->i_op->getxattr(dentry, name, buf, len); + return osd_getxattr(dentry, inode, name, buf, len); } static inline int __osd_xattr_set(struct osd_thread_info *info, @@ -615,30 +715,33 @@ static inline int __osd_xattr_set(struct osd_thread_info *info, ll_vfs_dq_init(inode); dentry->d_inode = inode; dentry->d_sb = inode->i_sb; - return inode->i_op->setxattr(dentry, name, buf, buflen, fl); + return osd_setxattr(dentry, inode, name, buf, buflen, fl); } -#ifdef LPROCFS +#ifdef CONFIG_PROC_FS /* osd_lproc.c */ -extern struct lprocfs_seq_vars lprocfs_osd_obd_vars[]; -extern struct lprocfs_seq_vars lprocfs_osd_module_vars[]; +extern struct lprocfs_vars lprocfs_osd_obd_vars[]; int osd_procfs_init(struct osd_device *osd, const char *name); int osd_procfs_fini(struct osd_device *osd); void osd_brw_stats_update(struct osd_device *osd, struct osd_iobuf *iobuf); +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 0, 52, 0) +int osd_register_proc_index_in_idif(struct osd_device *osd); +#endif #endif int osd_statfs(const struct lu_env *env, struct dt_device *dev, struct obd_statfs *sfs); -int osd_object_auth(const struct lu_env *env, struct dt_object *dt, - struct lustre_capa *capa, __u64 opc); struct inode *osd_iget(struct osd_thread_info *info, struct osd_device *dev, struct osd_inode_id *id); +struct inode * +osd_iget_fid(struct osd_thread_info *info, struct osd_device *dev, + struct osd_inode_id *id, struct lu_fid *fid); int osd_ea_fid_set(struct osd_thread_info *info, struct inode *inode, const struct lu_fid *fid, __u32 compat, __u32 incompat); int osd_get_lma(struct osd_thread_info *info, struct inode *inode, - struct dentry *dentry, struct lustre_mdt_attrs *lma); -int osd_add_oi_cache(struct osd_thread_info *info, struct osd_device *osd, - struct osd_inode_id *id, const struct lu_fid *fid); + struct dentry *dentry, struct lustre_ost_attrs *loa); +void osd_add_oi_cache(struct osd_thread_info *info, struct osd_device *osd, + struct osd_inode_id *id, const struct lu_fid *fid); int osd_get_idif(struct osd_thread_info *info, struct inode *inode, struct dentry *dentry, struct lu_fid *fid); @@ -666,25 +769,25 @@ int osd_obj_spec_update(struct osd_thread_info *info, struct osd_device *osd, const struct lu_fid *fid, const struct osd_inode_id *id, handle_t *th); -void osd_scrub_file_reset(struct osd_scrub *scrub, __u8 *uuid, __u64 flags); -int osd_scrub_file_store(struct osd_scrub *scrub); char *osd_lf_fid2name(const struct lu_fid *fid); -int osd_scrub_start(struct osd_device *dev); +int osd_scrub_start(const struct lu_env *env, struct osd_device *dev, + __u32 flags); +void osd_scrub_stop(struct osd_device *dev); int osd_scrub_setup(const struct lu_env *env, struct osd_device *dev); void osd_scrub_cleanup(const struct lu_env *env, struct osd_device *dev); int osd_oii_insert(struct osd_device *dev, struct osd_idmap_cache *oic, int insert); int osd_oii_lookup(struct osd_device *dev, const struct lu_fid *fid, struct osd_inode_id *id); -int osd_scrub_dump(struct seq_file *m, struct osd_device *dev); +void osd_scrub_dump(struct seq_file *m, struct osd_device *dev); int osd_fld_lookup(const struct lu_env *env, struct osd_device *osd, - obd_seq seq, struct lu_seq_range *range); + u64 seq, struct lu_seq_range *range); int osd_delete_from_remote_parent(const struct lu_env *env, struct osd_device *osd, struct osd_object *obj, - struct osd_thandle *oh); + struct osd_thandle *oh, bool destroy); int osd_add_to_remote_parent(const struct lu_env *env, struct osd_device *osd, struct osd_object *obj, struct osd_thandle *oh); int osd_lookup_in_remote_parent(struct osd_thread_info *oti, @@ -707,24 +810,16 @@ loff_t find_tree_dqentry(const struct lu_env *env, struct osd_it_quota *it); /* osd_quota.c */ int osd_declare_qid(const struct lu_env *env, struct osd_thandle *oh, - struct lquota_id_info *qi, bool allocated, int *flags); + struct lquota_id_info *qi, struct osd_object *obj, + bool enforce, int *flags); int osd_declare_inode_qid(const struct lu_env *env, qid_t uid, qid_t gid, - long long space, struct osd_thandle *oh, - bool is_blk, bool allocated, int *flags, bool force); + __u32 projid, long long space, struct osd_thandle *oh, + struct osd_object *obj, int *flags, + enum osd_qid_declare_flags); const struct dt_rec *osd_quota_pack(struct osd_object *obj, const struct dt_rec *rec, union lquota_rec *quota_rec); void osd_quota_unpack(struct osd_object *obj, const struct dt_rec *rec); -int osd_quota_migration(const struct lu_env *env, struct dt_object *dt, - const struct dt_index_features *feat); - -static inline bool is_quota_glb_feat(const struct dt_index_features *feat) -{ - return (feat == &dt_quota_iusr_features || - feat == &dt_quota_busr_features || - feat == &dt_quota_igrp_features || - feat == &dt_quota_bgrp_features) ? true : false; -} #ifndef HAVE_I_UID_READ static inline uid_t i_uid_read(const struct inode *inode) @@ -748,6 +843,84 @@ static inline void i_gid_write(struct inode *inode, gid_t gid) } #endif +#ifdef HAVE_PROJECT_QUOTA +static inline __u32 i_projid_read(struct inode *inode) +{ + return (__u32)from_kprojid(&init_user_ns, LDISKFS_I(inode)->i_projid); +} + +static inline void i_projid_write(struct inode *inode, __u32 projid) +{ + kprojid_t kprojid; + kprojid = make_kprojid(&init_user_ns, (projid_t)projid); + LDISKFS_I(inode)->i_projid = kprojid; +} +#else +static inline uid_t i_projid_read(struct inode *inode) +{ + return 0; +} +static inline void i_projid_write(struct inode *inode, __u32 projid) +{ + return; +} +#endif + +#ifdef HAVE_LDISKFS_INFO_JINODE +# define osd_attach_jinode(inode) ldiskfs_inode_attach_jinode(inode) +#else /* HAVE_LDISKFS_INFO_JINODE */ +# define osd_attach_jinode(inode) 0 +#endif /* HAVE_LDISKFS_INFO_JINODE */ + +#ifdef LDISKFS_HT_MISC +# define osd_journal_start_sb(sb, type, nblock) \ + ldiskfs_journal_start_sb(sb, type, nblock) +static inline struct buffer_head *osd_ldiskfs_append(handle_t *handle, + struct inode *inode, + ldiskfs_lblk_t *nblock) +{ + int rc; + + rc = osd_attach_jinode(inode); + if (rc) + return ERR_PTR(rc); + return ldiskfs_append(handle, inode, nblock); +} +# define osd_ldiskfs_find_entry(dir, name, de, inlined, lock) \ + (__ldiskfs_find_entry(dir, name, de, inlined, lock) ?: \ + ERR_PTR(-ENOENT)) +# define osd_journal_start(inode, type, nblocks) \ + ldiskfs_journal_start(inode, type, nblocks) +# define osd_transaction_size(dev) \ + (osd_journal(dev)->j_max_transaction_buffers / 2) +#else +# define LDISKFS_HT_MISC 0 +# define osd_journal_start_sb(sb, type, nblock) \ + ldiskfs_journal_start_sb(sb, nblock) + +static inline struct buffer_head *osd_ldiskfs_append(handle_t *handle, + struct inode *inode, + ldiskfs_lblk_t *nblock) +{ + struct buffer_head *bh; + int err = 0; + + bh = ldiskfs_append(handle, inode, nblock, &err); + if (bh == NULL) + bh = ERR_PTR(err); + + return bh; +} + +# define osd_ldiskfs_find_entry(dir, name, de, inlined, lock) \ + (__ldiskfs_find_entry(dir, name, de, lock) ?: \ + ERR_PTR(-ENOENT)) +# define osd_journal_start(inode, type, nblocks) \ + ldiskfs_journal_start(inode, nblocks) +# define osd_transaction_size(dev) \ + (osd_journal(dev)->j_max_transaction_buffers) +#endif + /* * Invariants, assertions. */ @@ -825,6 +998,27 @@ static inline struct super_block *osd_sb(const struct osd_device *dev) return dev->od_mnt->mnt_sb; } +static inline const char *osd_dev2name(const struct osd_device *dev) +{ + return osd_sb(dev)->s_id; +} + +static inline const char *osd_ino2name(const struct inode *inode) +{ + return inode->i_sb->s_id; +} + +/** + * Put the osd object once done with it. + * + * \param obj osd object that needs to be put + */ +static inline void osd_object_put(const struct lu_env *env, + struct osd_object *obj) +{ + dt_object_put(env, &obj->oo_dt); +} + static inline int osd_object_is_root(const struct osd_object *obj) { return osd_sb(osd_obj2dev(obj))->s_root->d_inode == obj->oo_inode; @@ -858,7 +1052,12 @@ static inline struct seq_server_site *osd_seq_site(struct osd_device *osd) static inline char *osd_name(struct osd_device *osd) { - return osd->od_dt_dev.dd_lu_dev.ld_obd->obd_name; + return osd->od_svname; +} + +static inline bool osd_is_ea_inode(struct inode *inode) +{ + return !!(LDISKFS_I(inode)->i_flags & LDISKFS_EA_INODE_FL); } extern const struct dt_body_operations osd_body_ops; @@ -897,6 +1096,10 @@ static inline void osd_ipd_put(const struct lu_env *env, bag->ic_descr->id_ops->id_ipd_free(ipd); } +int osd_calc_bkmap_credits(struct super_block *sb, struct inode *inode, + const loff_t size, const loff_t pos, + const int blocks); + int osd_ldiskfs_read(struct inode *inode, void *buf, int size, loff_t *offs); int osd_ldiskfs_write_record(struct inode *inode, void *buf, int bufsize, int write_NUL, loff_t *offs, handle_t *handle); @@ -923,10 +1126,11 @@ struct dentry *osd_child_dentry_by_inode(const struct lu_env *env, extern int osd_trans_declare_op2rb[]; extern int ldiskfs_track_declares_assert; +void osd_trans_dump_creds(const struct lu_env *env, struct thandle *th); static inline void osd_trans_declare_op(const struct lu_env *env, struct osd_thandle *oh, - unsigned int op, int credits) + enum osd_op_type op, int credits) { struct osd_thread_info *oti = osd_oti_get(env); @@ -936,7 +1140,7 @@ static inline void osd_trans_declare_op(const struct lu_env *env, LASSERT(op < OSD_OT_MAX); } else { CWARN("%s: Invalid operation index %d\n", - osd_name(oti->oti_dev), op); + osd_name(osd_dt_dev(oh->ot_super.th_dev)), op); libcfs_debug_dumpstack(NULL); } } else { @@ -947,77 +1151,119 @@ static inline void osd_trans_declare_op(const struct lu_env *env, } static inline void osd_trans_exec_op(const struct lu_env *env, - struct thandle *th, unsigned int op) + struct thandle *th, + enum osd_op_type op) { struct osd_thread_info *oti = osd_oti_get(env); struct osd_thandle *oh = container_of(th, struct osd_thandle, ot_super); - unsigned int rb; + unsigned int rb, left; LASSERT(oh->ot_handle != NULL); if (unlikely(op >= OSD_OT_MAX)) { if (unlikely(ldiskfs_track_declares_assert)) LASSERT(op < OSD_OT_MAX); else { - CWARN("%s: Invalid operation index %d\n", - osd_name(oti->oti_dev), op); + CWARN("%s: opcode %u: invalid value >= %u\n", + osd_name(osd_dt_dev(oh->ot_super.th_dev)), + op, OSD_OT_MAX); libcfs_debug_dumpstack(NULL); return; } } - if (likely(!oti->oti_rollback && oti->oti_declare_ops[op] > 0)) { - oti->oti_declare_ops[op]--; - oti->oti_declare_ops_rb[op]++; - } else { - /* all future updates are considered rollback */ - oti->oti_rollback = true; - rb = osd_trans_declare_op2rb[op]; - if (unlikely(rb >= OSD_OT_MAX)) { - if (unlikely(ldiskfs_track_declares_assert)) - LASSERTF(rb < OSD_OT_MAX, "rb = %u\n", rb); - else { - CWARN("%s: Invalid rollback index %d\n", - osd_name(oti->oti_dev), rb); - libcfs_debug_dumpstack(NULL); - return; - } - } - if (unlikely(oti->oti_declare_ops_rb[rb] == 0)) { - if (unlikely(ldiskfs_track_declares_assert)) - LASSERTF(oti->oti_declare_ops_rb[rb] > 0, - "rb = %u\n", rb); - else { - CWARN("%s: Overflow in tracking declares for " - "index, rb = %d\n", - osd_name(oti->oti_dev), rb); - libcfs_debug_dumpstack(NULL); - return; - } - } - oti->oti_declare_ops_rb[rb]--; + /* find rollback (or reverse) operation for the given one + * such an operation doesn't require additional credits + * as the same set of blocks are modified */ + rb = osd_trans_declare_op2rb[op]; + + /* check whether credits for this operation were reserved at all */ + if (unlikely(oti->oti_declare_ops_cred[op] == 0 && + oti->oti_declare_ops_cred[rb] == 0)) { + /* the API is not perfect yet: CREATE does REF_ADD internally + * while DESTROY does not. To rollback CREATE the callers + * needs to call REF_DEL+DESTROY which is hard to detect using + * a simple table of rollback operations */ + if (op == OSD_OT_REF_DEL && + oti->oti_declare_ops_cred[OSD_OT_CREATE] > 0) + goto proceed; + if (op == OSD_OT_REF_ADD && + oti->oti_declare_ops_cred[OSD_OT_DESTROY] > 0) + goto proceed; + CWARN("%s: opcode %u: credits = 0, rollback = %u\n", + osd_name(osd_dt_dev(oh->ot_super.th_dev)), op, rb); + osd_trans_dump_creds(env, th); + LASSERT(!ldiskfs_track_declares_assert); + } + +proceed: + /* remember how many credits we have unused before the operation */ + oti->oti_credits_before = oh->ot_handle->h_buffer_credits; + left = oti->oti_declare_ops_cred[op] - oti->oti_declare_ops_used[op]; + if (unlikely(oti->oti_credits_before < left)) { + CWARN("%s: opcode %u: before %u < left %u, rollback = %u\n", + osd_name(osd_dt_dev(oh->ot_super.th_dev)), op, + oti->oti_credits_before, left, rb); + osd_trans_dump_creds(env, th); + /* on a very small fs (testing?) it's possible that + * the transaction can't fit 1/4 of journal, so we + * just request less credits (see osd_trans_start()). + * ignore the same case here */ + rb = osd_transaction_size(osd_dt_dev(th->th_dev)); + if (unlikely(oh->ot_credits < rb)) + LASSERT(!ldiskfs_track_declares_assert); } } -static inline void osd_trans_declare_rb(const struct lu_env *env, - struct thandle *th, unsigned int op) +static inline void osd_trans_exec_check(const struct lu_env *env, + struct thandle *th, + enum osd_op_type op) { struct osd_thread_info *oti = osd_oti_get(env); struct osd_thandle *oh = container_of(th, struct osd_thandle, ot_super); + int used, over, quota; - LASSERT(oh->ot_handle != NULL); - if (unlikely(op >= OSD_OT_MAX)) { - if (unlikely(ldiskfs_track_declares_assert)) - LASSERT(op < OSD_OT_MAX); - else { - CWARN("%s: Invalid operation index %d\n", - osd_name(oti->oti_dev), op); - libcfs_debug_dumpstack(NULL); - } + /* how many credits have been used by the operation */ + used = oti->oti_credits_before - oh->ot_handle->h_buffer_credits; + if (unlikely(used < 0)) { + /* if some block was allocated and released in the same + * transaction, then it won't be a part of the transaction + * and delta can be negative */ + return; + } + + if (used == 0) { + /* rollback operations (e.g. when we destroy just created + * object) should not consume any credits. there is no point + * to confuse the checks below */ + return; + } + + oti->oti_declare_ops_used[op] += used; + if (oti->oti_declare_ops_used[op] <= oti->oti_declare_ops_cred[op]) + return; + + /* we account quota for a whole transaction and any operation can + * consume corresponding credits */ + over = oti->oti_declare_ops_used[op] - + oti->oti_declare_ops_cred[op]; + quota = oti->oti_declare_ops_cred[OSD_OT_QUOTA] - + oti->oti_declare_ops_used[OSD_OT_QUOTA]; + if (over <= quota) { + /* probably that credits were consumed by + * quota indirectly (in the depths of ldiskfs) */ + oti->oti_declare_ops_used[OSD_OT_QUOTA] += over; + oti->oti_declare_ops_used[op] -= over; } else { - oti->oti_declare_ops_rb[op]++; + CWARN("%s: opcode %d: used %u, used now %u, reserved %u\n", + osd_name(osd_dt_dev(oh->ot_super.th_dev)), op, + oti->oti_declare_ops_used[op], used, + oti->oti_declare_ops_cred[op]); + osd_trans_dump_creds(env, th); + if (unlikely(ldiskfs_track_declares_assert)) + LBUG(); } } @@ -1062,7 +1308,7 @@ int osd_acct_obj_lookup(struct osd_thread_info *info, struct osd_device *osd, static inline int is_32bit_api(void) { #ifdef CONFIG_COMPAT - return is_compat_task(); + return in_compat_syscall(); #else return (BITS_PER_LONG == 32); #endif @@ -1082,34 +1328,57 @@ static inline int fid_is_internal(const struct lu_fid *fid) return (!fid_is_namespace_visible(fid) && !fid_is_idif(fid)); } -static inline unsigned long osd_remote_parent_ino(struct osd_device *dev) +static inline bool is_remote_parent_ino(struct osd_device *o, unsigned long ino) { - return dev->od_mdt_map->omm_remote_parent->d_inode->i_ino; + if (o->od_is_ost) + return false; + + LASSERT(o->od_mdt_map != NULL); + + return ino == o->od_mdt_map->omm_remote_parent->d_inode->i_ino; } -#ifdef JOURNAL_START_HAS_3ARGS -# define osd_journal_start_sb(sb, type, nblock) \ - ldiskfs_journal_start_sb(sb, type, nblock) -# define osd_ldiskfs_append(handle, inode, nblock, err) \ - ldiskfs_append(handle, inode, nblock) -# define osd_ldiskfs_find_entry(dir, name, de, inlined, lock) \ - ldiskfs_find_entry(dir, name, de, inlined, lock) -# define osd_journal_start(inode, type, nblocks) \ - ldiskfs_journal_start(inode, type, nblocks) -# define osd_transaction_size(dev) \ - (osd_journal(dev)->j_max_transaction_buffers / 2) +/** + * ext4_bread/ldiskfs_bread has either 5 or 4 parameters. The error + * return code has been removed and integrated into the pointer in the + * kernel 3.18. + */ +static inline struct buffer_head *__ldiskfs_bread(handle_t *handle, + struct inode *inode, + ldiskfs_lblk_t block, + int create) +{ + int rc = 0; + struct buffer_head *bh; + + if (create) { + rc = osd_attach_jinode(inode); + if (rc) + return ERR_PTR(rc); + } +#ifdef HAVE_EXT4_BREAD_4ARGS + bh = ldiskfs_bread(handle, inode, block, create); #else -# define LDISKFS_HT_MISC 0 -# define osd_journal_start_sb(sb, type, nblock) \ - ldiskfs_journal_start_sb(sb, nblock) -# define osd_ldiskfs_append(handle, inode, nblock, err) \ - ldiskfs_append(handle, inode, nblock, err) -# define osd_ldiskfs_find_entry(dir, name, de, inlined, lock) \ - ldiskfs_find_entry(dir, name, de, lock) -# define osd_journal_start(inode, type, nblocks) \ - ldiskfs_journal_start(inode, nblocks) -# define osd_transaction_size(dev) \ - (osd_journal(dev)->j_max_transaction_buffers) + + bh = ldiskfs_bread(handle, inode, block, create, &rc); + if (bh == NULL && rc != 0) + bh = ERR_PTR(rc); +#endif + return bh; +} + +#ifndef HAVE_BIO_INTEGRITY_ENABLED +bool bio_integrity_enabled(struct bio *bio); +#endif + +#ifdef HAVE_BI_BDEV +# define bio_get_dev(bio) ((bio)->bi_bdev) +# define bio_get_disk(bio) (bio_get_dev(bio)->bd_disk) +# define bio_get_queue(bio) bdev_get_queue(bio_get_dev(bio)) +# define bio_set_dev(bio, bdev) (bio_get_dev(bio) = (bdev)) +#else +# define bio_get_disk(bio) ((bio)->bi_disk) +# define bio_get_queue(bio) (bio_get_disk(bio)->queue) #endif void ldiskfs_inc_count(handle_t *handle, struct inode *inode); @@ -1117,4 +1386,123 @@ void ldiskfs_dec_count(handle_t *handle, struct inode *inode); void osd_fini_iobuf(struct osd_device *d, struct osd_iobuf *iobuf); +static inline int +osd_index_register(struct osd_device *osd, const struct lu_fid *fid, + __u32 keysize, __u32 recsize) +{ + return lustre_index_register(&osd->od_dt_dev, osd_name(osd), + &osd->od_index_backup_list, &osd->od_lock, + &osd->od_index_backup_stop, + fid, keysize, recsize); +} + +static inline void +osd_index_backup(const struct lu_env *env, struct osd_device *osd, bool backup) +{ + struct osd_thread_info *info = osd_oti_get(env); + struct lu_fid *fid = &info->oti_fid3; + struct osd_inode_id *id = &info->oti_id3; + + lu_local_obj_fid(fid, INDEX_BACKUP_OID); + osd_id_gen(id, osd->od_index_backup_inode->i_ino, + osd->od_index_backup_inode->i_generation); + osd_add_oi_cache(info, osd, id, fid); + + lustre_index_backup(env, &osd->od_dt_dev, osd_name(osd), + &osd->od_index_backup_list, &osd->od_lock, + &osd->od_index_backup_stop, backup); +} + +#ifdef LDISKFS_HAS_INCOMPAT_FEATURE + +# ifdef LDISKFS_FEATURE_INCOMPAT_EXTENTS +# define ldiskfs_has_feature_extents(sb) \ + LDISKFS_HAS_INCOMPAT_FEATURE(sb, LDISKFS_FEATURE_INCOMPAT_EXTENTS) +# endif +# ifdef LDISKFS_FEATURE_INCOMPAT_EA_INODE +# define ldiskfs_has_feature_ea_inode(sb) \ + LDISKFS_HAS_INCOMPAT_FEATURE(sb, LDISKFS_FEATURE_INCOMPAT_EA_INODE) +# endif +# ifdef LDISKFS_FEATURE_INCOMPAT_DIRDATA +# define ldiskfs_has_feature_dirdata(sb) \ + LDISKFS_HAS_INCOMPAT_FEATURE(sb, LDISKFS_FEATURE_INCOMPAT_DIRDATA) +# endif +# ifdef LDISKFS_FEATURE_COMPAT_HAS_JOURNAL +# define ldiskfs_has_feature_journal(sb) \ + LDISKFS_HAS_COMPAT_FEATURE(sb, LDISKFS_FEATURE_COMPAT_HAS_JOURNAL) +# endif +# ifdef LDISKFS_FEATURE_RO_COMPAT_QUOTA +# define ldiskfs_has_feature_quota(sb) \ + LDISKFS_HAS_RO_COMPAT_FEATURE(sb, LDISKFS_FEATURE_RO_COMPAT_QUOTA) +# endif +# ifdef LDISKFS_FEATURE_RO_COMPAT_PROJECT +# define ldiskfs_has_feature_project(sb) \ + LDISKFS_HAS_RO_COMPAT_FEATURE(sb, LDISKFS_FEATURE_RO_COMPAT_PROJECT) +# endif + +#endif + +int osd_trunc_lock(struct osd_object *obj, struct osd_thandle *oh, + bool shared); +void osd_trunc_unlock_all(struct list_head *list); +void osd_process_truncates(struct list_head *list); +void osd_execute_truncate(struct osd_object *obj); + +#ifdef HAVE_BIO_ENDIO_USES_ONE_ARG +#define osd_dio_complete_routine(bio, error) dio_complete_routine(bio) +#else +#define osd_dio_complete_routine(bio, error) dio_complete_routine(bio, error) +#endif + +#ifndef HAVE___BI_CNT +#define __bi_cnt bi_cnt +#endif + +#ifndef HAVE_BI_OPF +#define bi_opf bi_rw +#endif + +#ifndef HAVE_CLEAN_BDEV_ALIASES +#define clean_bdev_aliases(bdev, block, len) \ + unmap_underlying_metadata((bdev), (block)) +#endif + +#ifndef HAVE_BI_STATUS +#define bi_status bi_error +#endif + +/* + * Maximum size of xattr attributes for FEATURE_INCOMPAT_EA_INODE 1Mb + * This limit is arbitrary, but is reasonable for the xattr API. + */ +#define LDISKFS_XATTR_MAX_LARGE_EA_SIZE (1024 * 1024) + +struct osd_bio_private { + struct osd_iobuf *obp_iobuf; + /* Start page index in the obp_iobuf for the bio */ + int obp_start_page_idx; +}; + +#ifdef HAVE_BIO_INTEGRITY_PREP_FN +int osd_get_integrity_profile(struct osd_device *osd, + integrity_gen_fn **generate_fn, + integrity_vrfy_fn **verify_fn); +#else +#define integrity_gen_fn void +#define integrity_vrfy_fn int +static inline int osd_get_integrity_profile(struct osd_device *osd, + integrity_gen_fn **generate_fn, + integrity_vrfy_fn **verify_fn) +{ + return 0; +} + +static inline bool bio_integrity_prep_fn(struct bio *bio, + integrity_gen_fn *generate_fn, + integrity_vrfy_fn *verify_fn) +{ + return bio_integrity_prep(bio); +} +#endif + #endif /* _OSD_INTERNAL_H */