X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Fosd-ldiskfs%2Fosd_internal.h;h=ae51525f74e12f904c6bb89dcf9daa1518640576;hb=07660ad33a7d109cced29b6400f99f25adab3f54;hp=527fa5cf01a4288bdde6195cf0e3e3dac1326dbd;hpb=9fb46705ae86aa2c0ac29427f0ff24f923560eb7;p=fs%2Flustre-release.git diff --git a/lustre/osd-ldiskfs/osd_internal.h b/lustre/osd-ldiskfs/osd_internal.h index 527fa5c..ae51525 100644 --- a/lustre/osd-ldiskfs/osd_internal.h +++ b/lustre/osd-ldiskfs/osd_internal.h @@ -27,7 +27,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2012, Whamcloud, Inc. + * Copyright (c) 2011, 2015, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -43,28 +43,18 @@ #ifndef _OSD_INTERNAL_H #define _OSD_INTERNAL_H -#if defined(__KERNEL__) +/* struct mutex */ +#include /* struct rw_semaphore */ #include /* struct dentry */ #include /* struct dirent64 */ #include - +#include #include #include -#ifdef HAVE_LDISKFS_JOURNAL_CALLBACK_ADD -# define journal_callback ldiskfs_journal_cb_entry -# define osd_journal_callback_set(handle, func, jcb) \ - ldiskfs_journal_callback_add(handle, func, jcb) -#else -# define osd_journal_callback_set(handle, func, jcb) \ - jbd2_journal_callback_set(handle, func, jcb) -#endif - -/* fsfilt_{get|put}_ops */ -#include /* LUSTRE_OSD_NAME */ #include @@ -80,15 +70,26 @@ #include "osd_quota_fmt.h" struct inode; +extern struct kmem_cache *dynlock_cachep; #define OSD_COUNTERS (0) -/* Lustre special inode::i_state to indicate OI scrub skip this inode. */ -#define I_LUSTRE_NOSCRUB (1 << 31) +/* ldiskfs special inode::i_state_flags need to be accessed with + * ldiskfs_{set,clear,test}_inode_state() only */ + +/* OI scrub should skip this inode. */ +#define LDISKFS_STATE_LUSTRE_NOSCRUB 31 +#define LDISKFS_STATE_LUSTRE_DESTROY 30 /** Enable thandle usage statistics */ #define OSD_THANDLE_STATS (0) +#define MAX_OBJID_GROUP (FID_SEQ_ECHO + 1) + +#define OBJECTS "OBJECTS" +#define ADMIN_USR "admin_quotafile_v2.usr" +#define ADMIN_GRP "admin_quotafile_v2.grp" + struct osd_directory { struct iam_container od_container; struct iam_descr od_descr; @@ -126,6 +127,8 @@ struct osd_object { struct osd_directory *oo_dir; /** protects inode attributes. */ spinlock_t oo_guard; + + __u32 oo_destroyed:1; /** * Following two members are used to indicate the presence of dot and * dotdot in the given directory. This is required for interop mode @@ -140,62 +143,43 @@ struct osd_object { #endif }; -#ifdef HAVE_LDISKFS_PDO - -#define osd_ldiskfs_find_entry(dir, dentry, de, lock) \ - ll_ldiskfs_find_entry(dir, dentry, de, lock) -#define osd_ldiskfs_add_entry(handle, child, cinode, hlock) \ - ldiskfs_add_entry(handle, child, cinode, hlock) - -#else /* HAVE_LDISKFS_PDO */ - -struct htree_lock { - int dummy; +struct osd_obj_seq { + /* protects on-fly initialization */ + int oos_subdir_count; /* subdir count for each seq */ + struct dentry *oos_root; /* O/ */ + struct dentry **oos_dirs; /* O//d0-dXX */ + u64 oos_seq; /* seq number */ + struct list_head oos_seq_list; /* list to seq_list */ }; -struct htree_lock_head { - int dummy; +struct osd_obj_map { + struct dentry *om_root; /* dentry for /O */ + rwlock_t om_seq_list_lock; /* lock for seq_list */ + struct list_head om_seq_list; /* list head for seq */ + int om_subdir_count; + struct mutex om_dir_init_mutex; }; -#define ldiskfs_htree_lock(lock, head, inode, op) do { LBUG(); } while (0) -#define ldiskfs_htree_unlock(lock) do { LBUG(); } while (0) - -static inline struct htree_lock_head *ldiskfs_htree_lock_head_alloc(int dep) -{ - LBUG(); - return NULL; -} - -#define ldiskfs_htree_lock_head_free(lh) do { LBUG(); } while (0) - -#define LDISKFS_DUMMY_HTREE_LOCK 0xbabecafe - -static inline struct htree_lock *ldiskfs_htree_lock_alloc(void) -{ - return (struct htree_lock *)LDISKFS_DUMMY_HTREE_LOCK; -} - -static inline void ldiskfs_htree_lock_free(struct htree_lock *lk) -{ - LASSERT((unsigned long)lk == LDISKFS_DUMMY_HTREE_LOCK); -} - -#define HTREE_HBITS_DEF 0 - -#define osd_ldiskfs_find_entry(dir, dentry, de, lock) \ - ll_ldiskfs_find_entry(dir, dentry, de) -#define osd_ldiskfs_add_entry(handle, child, cinode, lock) \ - ldiskfs_add_entry(handle, child, cinode) +struct osd_mdobj { + struct dentry *om_root; /* AGENT/ */ + u64 om_index; /* mdt index */ + struct list_head om_list; /* list to omm_list */ +}; -#endif /* HAVE_LDISKFS_PDO */ +struct osd_mdobj_map { + struct dentry *omm_remote_parent; +}; +int osd_ldiskfs_add_entry(struct osd_thread_info *info, + handle_t *handle, struct dentry *child, + struct inode *inode, struct htree_lock *hlock); -#define OSD_OTABLE_IT_CACHE_SIZE 128 +#define OSD_OTABLE_IT_CACHE_SIZE 64 #define OSD_OTABLE_IT_CACHE_MASK (~(OSD_OTABLE_IT_CACHE_SIZE - 1)) struct osd_inconsistent_item { /* link into osd_scrub::os_inconsistent_items, * protected by osd_scrub::os_lock. */ - cfs_list_t oii_list; + struct list_head oii_list; /* The right FID <=> ino#/gen mapping. */ struct osd_idmap_cache oii_cache; @@ -223,9 +207,6 @@ struct osd_otable_it { struct osd_device *ooi_dev; struct osd_otable_cache ooi_cache; - /* For osd_otable_it_key. */ - __u8 ooi_key[16]; - /* The following bits can be updated/checked w/o lock protection. * If more bits will be introduced in the future and need lock to * protect, please add comment. */ @@ -238,7 +219,11 @@ struct osd_otable_it { ooi_waiting:1; /* it::next is waiting. */ }; -extern const int osd_dto_credits_noquota[]; +struct osd_obj_orphan { + struct list_head oor_list; + struct lu_env *oor_env; /* to identify "own" records */ + __u32 oor_ino; +}; /* * osd device. @@ -256,70 +241,75 @@ struct osd_device { * Fid Capability */ unsigned int od_fl_capa:1, - od_is_md:1; /* set in ->ldo_prepare */ - unsigned long od_capa_timeout; - __u32 od_capa_alg; - struct lustre_capa_key *od_capa_keys; - cfs_hlist_head_t *od_capa_hash; - - cfs_proc_dir_entry_t *od_proc_entry; - struct lprocfs_stats *od_stats; - /* - * statfs optimization: we cache a bit. - */ - cfs_time_t od_osfs_age; - struct obd_statfs od_statfs; - spinlock_t od_osfs_lock; + od_maybe_new:1, + od_noscrub:1, + od_igif_inoi:1, + od_check_ff:1, + od_is_ost:1, + od_index_in_idif:1; + + __u32 od_dirent_journal; + int od_index; + struct proc_dir_entry *od_proc_entry; + struct lprocfs_stats *od_stats; - unsigned int od_noscrub:1; + spinlock_t od_osfs_lock; - struct fsfilt_operations *od_fsops; int od_connects; struct lu_site od_site; - /* - * mapping for legacy OST objids - */ - struct osd_compat_objid *od_ost_map; + struct osd_obj_map *od_ost_map; + struct osd_mdobj_map *od_mdt_map; - unsigned long long od_readcache_max_filesize; - int od_read_cache; - int od_writethrough_cache; + unsigned long long od_readcache_max_filesize; + int od_read_cache; + int od_writethrough_cache; - struct brw_stats od_brw_stats; - cfs_atomic_t od_r_in_flight; - cfs_atomic_t od_w_in_flight; + struct brw_stats od_brw_stats; + atomic_t od_r_in_flight; + atomic_t od_w_in_flight; struct mutex od_otable_mutex; struct osd_otable_it *od_otable_it; struct osd_scrub od_scrub; + struct list_head od_ios_list; /* service name associated with the osd device */ char od_svname[MAX_OBD_NAME]; + char od_mntdev[MAX_OBD_NAME]; /* quota slave instance */ struct qsd_instance *od_quota_slave; + + /* osd seq instance */ + struct lu_client_seq *od_cl_seq; + /* If the ratio of "the total OI mappings count" vs + * "the bad OI mappings count" is lower than the + * osd_device::od_full_scrub_ratio, then trigger + * OI scrub to scan the whole the device. */ + __u64 od_full_scrub_ratio; + /* If the speed of found bad OI mappings (per minute) + * exceeds the osd_device::od_full_scrub_threshold_rate, + * then trigger OI scrub to scan the whole device. */ + __u64 od_full_scrub_threshold_rate; + + /* a list of orphaned agent inodes, protected with od_osfs_lock */ + struct list_head od_orphan_list; }; -#define OSD_TRACK_DECLARES -#ifdef OSD_TRACK_DECLARES -#define OSD_DECLARE_OP(oh, op, credits) \ -do { \ - LASSERT((oh)->ot_handle == NULL); \ - ((oh)->ot_declare_ ##op)++; \ - ((oh)->ot_declare_ ##op ##_cred) += (credits); \ - (oh)->ot_credits += (credits); \ -} while (0) -#define OSD_EXEC_OP(handle, op) \ -do { \ - struct osd_thandle *oh = container_of(handle, typeof(*oh), ot_super); \ - LASSERT((oh)->ot_declare_ ##op > 0); \ - ((oh)->ot_declare_ ##op)--; \ -} while (0) -#else -#define OSD_DECLARE_OP(oh, op, credits) (oh)->ot_credits += (credits) -#define OSD_EXEC_OP(oh, op) -#endif +enum osd_full_scrub_ratio { + /* Trigger OI scrub to scan the whole device directly. */ + OFSR_DIRECTLY = 0, + + /* Because the bad OI mappings count cannot be larger than + * the total OI mappints count, then setting OFSR_NEVER means + * that the whole device scanning cannot be triggered by auto + * detected bad OI mappings during the RPC services. */ + OFSR_NEVER = 1, + OFSR_DEFAULT = 10000, +}; + +#define FULL_SCRUB_THRESHOLD_RATE_DEFAULT 60 /* There are at most 10 uid/gids are affected in a transaction, and * that's rename case: @@ -334,48 +324,35 @@ do { \ */ #define OSD_MAX_UGID_CNT 10 +enum { + OSD_OT_ATTR_SET = 0, + OSD_OT_PUNCH = 1, + OSD_OT_XATTR_SET = 2, + OSD_OT_CREATE = 3, + OSD_OT_DESTROY = 4, + OSD_OT_REF_ADD = 5, + OSD_OT_REF_DEL = 6, + OSD_OT_WRITE = 7, + OSD_OT_INSERT = 8, + OSD_OT_DELETE = 9, + OSD_OT_QUOTA = 10, + OSD_OT_MAX = 11 +}; + struct osd_thandle { struct thandle ot_super; handle_t *ot_handle; - struct journal_callback ot_jcb; - cfs_list_t ot_dcb_list; - /* Link to the device, for debugging. */ - struct lu_ref_link *ot_dev_link; + struct ldiskfs_journal_cb_entry ot_jcb; + struct list_head ot_commit_dcb_list; + struct list_head ot_stop_dcb_list; + /* Link to the device, for debugging. */ + struct lu_ref_link ot_dev_link; unsigned short ot_credits; unsigned short ot_id_cnt; unsigned short ot_id_type; + int ot_remove_agents:1; uid_t ot_id_array[OSD_MAX_UGID_CNT]; struct lquota_trans *ot_quota_trans; - -#ifdef OSD_TRACK_DECLARES - /* Tracking for transaction credits, to allow debugging and optimizing - * cases where a large number of credits are being allocated for - * single transaction. */ - unsigned char ot_declare_attr_set; - unsigned char ot_declare_punch; - unsigned char ot_declare_xattr_set; - unsigned char ot_declare_create; - unsigned char ot_declare_destroy; - unsigned char ot_declare_ref_add; - unsigned char ot_declare_ref_del; - unsigned char ot_declare_write; - unsigned char ot_declare_insert; - unsigned char ot_declare_delete; - unsigned char ot_declare_quota; - - unsigned short ot_declare_attr_set_cred; - unsigned short ot_declare_punch_cred; - unsigned short ot_declare_xattr_set_cred; - unsigned short ot_declare_create_cred; - unsigned short ot_declare_destroy_cred; - unsigned short ot_declare_ref_add_cred; - unsigned short ot_declare_ref_del_cred; - unsigned short ot_declare_write_cred; - unsigned short ot_declare_insert_cred; - unsigned short ot_declare_delete_cred; - unsigned short ot_declare_quota_cred; -#endif - #if OSD_THANDLE_STATS /** time when this handle was allocated */ cfs_time_t oth_alloced; @@ -396,7 +373,6 @@ enum dt_txn_op { DTO_OBJECT_DELETE, DTO_ATTR_SET_BASE, DTO_XATTR_SET, - DTO_LOG_REC, /**< XXX temporary: dt layer knows nothing about llog. */ DTO_WRITE_BASE, DTO_WRITE_BLOCK, DTO_ATTR_SET_CHOWN, @@ -408,7 +384,7 @@ enum dt_txn_op { * osd dev stats */ -#ifdef LPROCFS +#ifdef CONFIG_PROC_FS enum { LPROC_OSD_READ_BYTES = 0, LPROC_OSD_WRITE_BYTES = 1, @@ -453,24 +429,25 @@ struct osd_it_ea_dirent { * there would be one ext3 readdir for every mdd readdir page. */ -#define OSD_IT_EA_BUFSIZE (CFS_PAGE_SIZE + CFS_PAGE_SIZE/4) +#define OSD_IT_EA_BUFSIZE (PAGE_CACHE_SIZE + PAGE_CACHE_SIZE/4) /** * This is iterator's in-memory data structure in interoperability * mode (i.e. iterator over ldiskfs style directory) */ struct osd_it_ea { - struct osd_object *oie_obj; - /** used in ldiskfs iterator, to stored file pointer */ - struct file oie_file; - /** how many entries have been read-cached from storage */ - int oie_rd_dirent; - /** current entry is being iterated by caller */ - int oie_it_dirent; - /** current processing entry */ - struct osd_it_ea_dirent *oie_dirent; - /** buffer to hold entries, size == OSD_IT_EA_BUFSIZE */ - void *oie_buf; + struct osd_object *oie_obj; + /** used in ldiskfs iterator, to stored file pointer */ + struct file oie_file; + /** how many entries have been read-cached from storage */ + int oie_rd_dirent; + /** current entry is being iterated by caller */ + int oie_it_dirent; + /** current processing entry */ + struct osd_it_ea_dirent *oie_dirent; + /** buffer to hold entries, size == OSD_IT_EA_BUFSIZE */ + void *oie_buf; + struct dentry oie_dentry; }; /** @@ -483,7 +460,7 @@ struct osd_it_iam { }; struct osd_quota_leaf { - cfs_list_t oql_link; + struct list_head oql_link; uint oql_blk; }; @@ -501,60 +478,65 @@ struct osd_it_quota { /** the record index in the leaf/index block */ uint oiq_index[LUSTRE_DQTREEDEPTH + 1]; /** list of already processed leaf blocks */ - cfs_list_t oiq_list; + struct list_head oiq_list; }; -#define MAX_BLOCKS_PER_PAGE (CFS_PAGE_SIZE / 512) +#define MAX_BLOCKS_PER_PAGE (PAGE_CACHE_SIZE / 512) struct osd_iobuf { - cfs_waitq_t dr_wait; - cfs_atomic_t dr_numreqs; /* number of reqs being processed */ - int dr_max_pages; - int dr_npages; - int dr_error; - int dr_frags; - unsigned int dr_ignore_quota:1; - unsigned int dr_elapsed_valid:1; /* we really did count time */ - unsigned int dr_rw:1; - struct page *dr_pages[PTLRPC_MAX_BRW_PAGES]; - unsigned long dr_blocks[PTLRPC_MAX_BRW_PAGES*MAX_BLOCKS_PER_PAGE]; - unsigned long dr_start_time; - unsigned long dr_elapsed; /* how long io took */ - struct osd_device *dr_dev; + wait_queue_head_t dr_wait; + atomic_t dr_numreqs; /* number of reqs being processed */ + int dr_max_pages; + int dr_npages; + int dr_error; + int dr_frags; + unsigned int dr_ignore_quota:1; + unsigned int dr_elapsed_valid:1; /* we really did count time */ + unsigned int dr_rw:1; + struct lu_buf dr_pg_buf; + struct page **dr_pages; + struct lu_buf dr_bl_buf; + sector_t *dr_blocks; + unsigned long dr_start_time; + unsigned long dr_elapsed; /* how long io took */ + struct osd_device *dr_dev; unsigned int dr_init_at; /* the line iobuf was initialized */ }; +#define OSD_INS_CACHE_SIZE 8 + struct osd_thread_info { - const struct lu_env *oti_env; - /** - * used for index operations. - */ - struct dentry oti_obj_dentry; - struct dentry oti_child_dentry; + const struct lu_env *oti_env; + /** + * used for index operations. + */ + struct dentry oti_obj_dentry; + struct dentry oti_child_dentry; + + /** dentry for Iterator context. */ + struct dentry oti_it_dentry; + + union { + /* fake struct file for osd_object_sync */ + struct file oti_file; + /* osd_statfs() */ + struct kstatfs oti_ksfs; + }; - /** dentry for Iterator context. */ - struct dentry oti_it_dentry; - struct htree_lock *oti_hlock; + struct htree_lock *oti_hlock; - struct lu_fid oti_fid; + struct lu_fid oti_fid; struct lu_fid oti_fid2; + struct lu_fid oti_fid3; struct osd_inode_id oti_id; struct osd_inode_id oti_id2; + struct osd_inode_id oti_id3; struct ost_id oti_ostid; /* * XXX temporary: for ->i_op calls. */ struct timespec oti_time; - /* - * XXX temporary: fake struct file for osd_object_sync - */ - struct file oti_file; - /* - * XXX temporary: for capa operations. - */ - struct lustre_capa_key oti_capa_key; - struct lustre_capa oti_capa; /** osd_device reference, initialized in osd_trans_start() and used in osd_trans_stop() */ @@ -566,26 +548,16 @@ struct osd_thread_info { * in open iterator session. */ - /** osd iterator context used for iterator session */ - - union { - struct osd_it_iam oti_it; - /* ldiskfs iterator data structure, - * see osd_it_ea_{init, fini} */ - struct osd_it_ea oti_it_ea; - struct osd_it_quota oti_it_quota; - }; - - /** pre-allocated buffer used by oti_it_ea, size OSD_IT_EA_BUFSIZE */ - void *oti_it_ea_buf; + /** pre-allocated buffer used by oti_it_ea, size OSD_IT_EA_BUFSIZE */ + void *oti_it_ea_buf; + unsigned int oti_it_ea_buf_used:1; - cfs_kstatfs_t oti_ksfs; - - /** IAM iterator for index operation. */ - struct iam_iterator oti_idx_it; + /* IAM iterator for index operation. */ + struct iam_iterator oti_idx_it; /** union to guarantee that ->oti_ipd[] has proper alignment. */ union { + char oti_name[48]; char oti_it_ipd[DX_IPD_MAX_SIZE]; long long oti_alignment_lieutenant; }; @@ -597,86 +569,159 @@ struct osd_thread_info { struct osd_idmap_cache oti_cache; + /* dedicated OI cache for insert (which needs inum) */ + struct osd_idmap_cache *oti_ins_cache; + int oti_ins_cache_size; + int oti_ins_cache_used; + int oti_r_locks; int oti_w_locks; int oti_txns; /** used in osd_fid_set() to put xattr */ struct lu_buf oti_buf; + struct lu_buf oti_big_buf; /** used in osd_ea_fid_set() to set fid into common ea */ - struct lustre_mdt_attrs oti_mdt_attrs; - /** 0-copy IO */ - struct osd_iobuf oti_iobuf; - struct inode oti_inode; - int oti_created[PTLRPC_MAX_BRW_PAGES]; - struct lu_env oti_obj_delete_tx_env; + union { + struct lustre_mdt_attrs oti_mdt_attrs; + /* old LMA for compatibility */ + char oti_mdt_attrs_old[LMA_OLD_SIZE]; + struct filter_fid_old oti_ff; + struct filter_fid oti_ff_new; + }; + /** 0-copy IO */ + struct osd_iobuf oti_iobuf; + /* used to access objects in /O */ + struct inode *oti_inode; #define OSD_FID_REC_SZ 32 - char oti_ldp[OSD_FID_REC_SZ]; - char oti_ldp2[OSD_FID_REC_SZ]; + char oti_ldp[OSD_FID_REC_SZ]; + char oti_ldp2[OSD_FID_REC_SZ]; /* used by quota code */ union { +#ifdef HAVE_DQUOT_FS_DISK_QUOTA + struct fs_disk_quota oti_fdq; +#else struct if_dqblk oti_dqblk; +#endif struct if_dqinfo oti_dqinfo; }; struct lquota_id_info oti_qi; struct lquota_trans oti_quota_trans; union lquota_rec oti_quota_rec; __u64 oti_quota_id; + struct lu_seq_range oti_seq_range; + + /* Tracking for transaction credits, to allow debugging and optimizing + * cases where a large number of credits are being allocated for + * single transaction. */ + unsigned int oti_credits_before; + unsigned short oti_declare_ops[OSD_OT_MAX]; + unsigned short oti_declare_ops_cred[OSD_OT_MAX]; + unsigned short oti_declare_ops_used[OSD_OT_MAX]; }; extern int ldiskfs_pdo; -#ifdef LPROCFS +static inline int __osd_xattr_get(struct inode *inode, struct dentry *dentry, + const char *name, void *buf, int len) +{ + if (inode == NULL) + return -EINVAL; + + dentry->d_inode = inode; + dentry->d_sb = inode->i_sb; + return inode->i_op->getxattr(dentry, name, buf, len); +} + +static inline int __osd_xattr_set(struct osd_thread_info *info, + struct inode *inode, const char *name, + const void *buf, int buflen, int fl) +{ + struct dentry *dentry = &info->oti_child_dentry; + + ll_vfs_dq_init(inode); + dentry->d_inode = inode; + dentry->d_sb = inode->i_sb; + return inode->i_op->setxattr(dentry, name, buf, buflen, fl); +} + +#ifdef CONFIG_PROC_FS /* osd_lproc.c */ -void lprocfs_osd_init_vars(struct lprocfs_static_vars *lvars); +extern struct lprocfs_vars lprocfs_osd_obd_vars[]; +extern struct lprocfs_vars lprocfs_osd_module_vars[]; int osd_procfs_init(struct osd_device *osd, const char *name); int osd_procfs_fini(struct osd_device *osd); -void osd_lprocfs_time_start(const struct lu_env *env); -void osd_lprocfs_time_end(const struct lu_env *env, - struct osd_device *osd, int op); void osd_brw_stats_update(struct osd_device *osd, struct osd_iobuf *iobuf); +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 0, 52, 0) +int osd_register_proc_index_in_idif(struct osd_device *osd); +#endif #endif int osd_statfs(const struct lu_env *env, struct dt_device *dev, struct obd_statfs *sfs); -int osd_object_auth(const struct lu_env *env, struct dt_object *dt, - struct lustre_capa *capa, __u64 opc); struct inode *osd_iget(struct osd_thread_info *info, struct osd_device *dev, struct osd_inode_id *id); -struct inode *osd_iget_fid(struct osd_thread_info *info, struct osd_device *dev, - struct osd_inode_id *id, struct lu_fid *fid); - -int osd_compat_init(struct osd_device *dev); -void osd_compat_fini(struct osd_device *dev); -int osd_compat_objid_lookup(struct osd_thread_info *info, - struct osd_device *osd, - const struct lu_fid *fid, struct osd_inode_id *id); -int osd_compat_objid_insert(struct osd_thread_info *info, - struct osd_device *osd, - const struct lu_fid *fid, - const struct osd_inode_id *id, struct thandle *th); -int osd_compat_objid_delete(struct osd_thread_info *info, - struct osd_device *osd, - const struct lu_fid *fid, struct thandle *th); -int osd_compat_spec_lookup(struct osd_thread_info *info, - struct osd_device *osd, - const struct lu_fid *fid, struct osd_inode_id *id); -int osd_compat_spec_insert(struct osd_thread_info *info, - struct osd_device *osd, - const struct lu_fid *fid, - const struct osd_inode_id *id, struct thandle *th); +int osd_ea_fid_set(struct osd_thread_info *info, struct inode *inode, + const struct lu_fid *fid, __u32 compat, __u32 incompat); +int osd_get_lma(struct osd_thread_info *info, struct inode *inode, + struct dentry *dentry, struct lustre_mdt_attrs *lma); +void osd_add_oi_cache(struct osd_thread_info *info, struct osd_device *osd, + struct osd_inode_id *id, const struct lu_fid *fid); +int osd_get_idif(struct osd_thread_info *info, struct inode *inode, + struct dentry *dentry, struct lu_fid *fid); + +int osd_obj_map_init(const struct lu_env *env, struct osd_device *osd); +void osd_obj_map_fini(struct osd_device *dev); +int osd_obj_map_lookup(struct osd_thread_info *info, struct osd_device *osd, + const struct lu_fid *fid, struct osd_inode_id *id); +int osd_obj_map_insert(struct osd_thread_info *info, struct osd_device *osd, + const struct lu_fid *fid, const struct osd_inode_id *id, + handle_t *th); +int osd_obj_map_delete(struct osd_thread_info *info, struct osd_device *osd, + const struct lu_fid *fid, handle_t *th); +int osd_obj_map_update(struct osd_thread_info *info, struct osd_device *osd, + const struct lu_fid *fid, const struct osd_inode_id *id, + handle_t *th); +int osd_obj_map_recover(struct osd_thread_info *info, struct osd_device *osd, + struct inode *src_parent, struct dentry *src_child, + const struct lu_fid *fid); +int osd_obj_spec_lookup(struct osd_thread_info *info, struct osd_device *osd, + const struct lu_fid *fid, struct osd_inode_id *id); +int osd_obj_spec_insert(struct osd_thread_info *info, struct osd_device *osd, + const struct lu_fid *fid, const struct osd_inode_id *id, + handle_t *th); +int osd_obj_spec_update(struct osd_thread_info *info, struct osd_device *osd, + const struct lu_fid *fid, const struct osd_inode_id *id, + handle_t *th); void osd_scrub_file_reset(struct osd_scrub *scrub, __u8 *uuid, __u64 flags); int osd_scrub_file_store(struct osd_scrub *scrub); -int osd_scrub_start(struct osd_device *dev); +char *osd_lf_fid2name(const struct lu_fid *fid); +int osd_scrub_start(struct osd_device *dev, __u32 flags); int osd_scrub_setup(const struct lu_env *env, struct osd_device *dev); void osd_scrub_cleanup(const struct lu_env *env, struct osd_device *dev); int osd_oii_insert(struct osd_device *dev, struct osd_idmap_cache *oic, int insert); int osd_oii_lookup(struct osd_device *dev, const struct lu_fid *fid, struct osd_inode_id *id); -int osd_scrub_dump(struct osd_device *dev, char *buf, int len); - +int osd_scrub_dump(struct seq_file *m, struct osd_device *dev); + +int osd_fld_lookup(const struct lu_env *env, struct osd_device *osd, + u64 seq, struct lu_seq_range *range); + +int osd_delete_from_remote_parent(const struct lu_env *env, + struct osd_device *osd, + struct osd_object *obj, + struct osd_thandle *oh); +int osd_add_to_remote_parent(const struct lu_env *env, struct osd_device *osd, + struct osd_object *obj, struct osd_thandle *oh); +int osd_lookup_in_remote_parent(struct osd_thread_info *oti, + struct osd_device *osd, + const struct lu_fid *fid, + struct osd_inode_id *id); + +int osd_ost_seq_exists(struct osd_thread_info *info, struct osd_device *osd, + __u64 seq); /* osd_quota_fmt.c */ int walk_tree_dqentry(const struct lu_env *env, struct osd_object *obj, int type, uint blk, int depth, uint index, @@ -690,25 +735,78 @@ loff_t find_tree_dqentry(const struct lu_env *env, struct osd_it_quota *it); /* osd_quota.c */ int osd_declare_qid(const struct lu_env *env, struct osd_thandle *oh, - struct lquota_id_info *qi, bool allocated, int *flags); + struct lquota_id_info *qi, struct osd_object *obj, + bool enforce, int *flags); int osd_declare_inode_qid(const struct lu_env *env, qid_t uid, qid_t gid, long long space, struct osd_thandle *oh, - bool is_blk, bool allocated, int *flags, bool force); + struct osd_object *obj, bool is_blk, int *flags, + bool force); const struct dt_rec *osd_quota_pack(struct osd_object *obj, const struct dt_rec *rec, union lquota_rec *quota_rec); void osd_quota_unpack(struct osd_object *obj, const struct dt_rec *rec); -int osd_quota_migration(const struct lu_env *env, struct dt_object *dt, - const struct dt_index_features *feat); +int osd_quota_migration(const struct lu_env *env, struct dt_object *dt); + +#ifndef HAVE_I_UID_READ +static inline uid_t i_uid_read(const struct inode *inode) +{ + return inode->i_uid; +} + +static inline gid_t i_gid_read(const struct inode *inode) +{ + return inode->i_gid; +} -static inline bool is_quota_glb_feat(const struct dt_index_features *feat) +static inline void i_uid_write(struct inode *inode, uid_t uid) { - return (feat == &dt_quota_iusr_features || - feat == &dt_quota_busr_features || - feat == &dt_quota_igrp_features || - feat == &dt_quota_bgrp_features) ? true : false; + inode->i_uid = uid; } +static inline void i_gid_write(struct inode *inode, gid_t gid) +{ + inode->i_gid = gid; +} +#endif + +#ifdef LDISKFS_HT_MISC +# define osd_journal_start_sb(sb, type, nblock) \ + ldiskfs_journal_start_sb(sb, type, nblock) +# define osd_ldiskfs_append(handle, inode, nblock) \ + ldiskfs_append(handle, inode, nblock) +# define osd_ldiskfs_find_entry(dir, name, de, inlined, lock) \ + __ldiskfs_find_entry(dir, name, de, inlined, lock) +# define osd_journal_start(inode, type, nblocks) \ + ldiskfs_journal_start(inode, type, nblocks) +# define osd_transaction_size(dev) \ + (osd_journal(dev)->j_max_transaction_buffers / 2) +#else +# define LDISKFS_HT_MISC 0 +# define osd_journal_start_sb(sb, type, nblock) \ + ldiskfs_journal_start_sb(sb, nblock) + +static inline struct buffer_head *osd_ldiskfs_append(handle_t *handle, + struct inode *inode, + ldiskfs_lblk_t *nblock) +{ + struct buffer_head *bh; + int err = 0; + + bh = ldiskfs_append(handle, inode, nblock, &err); + if (bh == NULL) + bh = ERR_PTR(err); + + return bh; +} + +# define osd_ldiskfs_find_entry(dir, name, de, inlined, lock) \ + __ldiskfs_find_entry(dir, name, de, lock) +# define osd_journal_start(inode, type, nblocks) \ + ldiskfs_journal_start(inode, nblocks) +# define osd_transaction_size(dev) \ + (osd_journal(dev)->j_max_transaction_buffers) +#endif + /* * Invariants, assertions. */ @@ -749,8 +847,9 @@ static inline struct osd_oi *osd_fid2oi(struct osd_device *osd, const struct lu_fid *fid) { LASSERTF(!fid_is_idif(fid), DFID"\n", PFID(fid)); - LASSERTF(!fid_is_igif(fid), DFID"\n", PFID(fid)); - LASSERT(osd->od_oi_table != NULL && osd->od_oi_count >= 1); + LASSERTF(!fid_is_last_id(fid), DFID"\n", PFID(fid)); + LASSERTF(osd->od_oi_table != NULL && osd->od_oi_count >= 1, + DFID"\n", PFID(fid)); /* It can work even od_oi_count equals to 1 although it's unexpected, * the only reason we set it to 1 is for performance measurement */ return osd->od_oi_table[osd_oi_fid2idx(osd, fid)]; @@ -811,6 +910,21 @@ static inline journal_t *osd_journal(const struct osd_device *dev) return LDISKFS_SB(osd_sb(dev))->s_journal; } +static inline struct seq_server_site *osd_seq_site(struct osd_device *osd) +{ + return osd->od_dt_dev.dd_lu_dev.ld_site->ld_seq_site; +} + +static inline char *osd_name(struct osd_device *osd) +{ + return osd->od_dt_dev.dd_lu_dev.ld_obd->obd_name; +} + +static inline bool osd_is_ea_inode(struct inode *inode) +{ + return !!(LDISKFS_I(inode)->i_flags & LDISKFS_EA_INODE_FL); +} + extern const struct dt_body_operations osd_body_ops; extern struct lu_context_key osd_key; @@ -847,6 +961,10 @@ static inline void osd_ipd_put(const struct lu_env *env, bag->ic_descr->id_ops->id_ipd_free(ipd); } +int osd_calc_bkmap_credits(struct super_block *sb, struct inode *inode, + const loff_t size, const loff_t pos, + const int blocks); + int osd_ldiskfs_read(struct inode *inode, void *buf, int size, loff_t *offs); int osd_ldiskfs_write_record(struct inode *inode, void *buf, int bufsize, int write_NUL, loff_t *offs, handle_t *handle); @@ -871,6 +989,148 @@ struct dentry *osd_child_dentry_by_inode(const struct lu_env *env, return child_dentry; } +extern int osd_trans_declare_op2rb[]; +extern int ldiskfs_track_declares_assert; +void osd_trans_dump_creds(const struct lu_env *env, struct thandle *th); + +static inline void osd_trans_declare_op(const struct lu_env *env, + struct osd_thandle *oh, + unsigned int op, int credits) +{ + struct osd_thread_info *oti = osd_oti_get(env); + + LASSERT(oh->ot_handle == NULL); + if (unlikely(op >= OSD_OT_MAX)) { + if (unlikely(ldiskfs_track_declares_assert)) { + LASSERT(op < OSD_OT_MAX); + } else { + CWARN("%s: Invalid operation index %d\n", + osd_name(osd_dt_dev(oh->ot_super.th_dev)), op); + libcfs_debug_dumpstack(NULL); + } + } else { + oti->oti_declare_ops[op]++; + oti->oti_declare_ops_cred[op] += credits; + } + oh->ot_credits += credits; +} + +static inline void osd_trans_exec_op(const struct lu_env *env, + struct thandle *th, unsigned int op) +{ + struct osd_thread_info *oti = osd_oti_get(env); + struct osd_thandle *oh = container_of(th, struct osd_thandle, + ot_super); + unsigned int rb, left; + + LASSERT(oh->ot_handle != NULL); + if (unlikely(op >= OSD_OT_MAX)) { + if (unlikely(ldiskfs_track_declares_assert)) + LASSERT(op < OSD_OT_MAX); + else { + CWARN("%s: Invalid operation index %d\n", + osd_name(osd_dt_dev(oh->ot_super.th_dev)), op); + libcfs_debug_dumpstack(NULL); + return; + } + } + + /* find rollback (or reverse) operation for the given one + * such an operation doesn't require additional credits + * as the same set of blocks are modified */ + rb = osd_trans_declare_op2rb[op]; + + /* check whether credits for this operation were reserved at all */ + if (unlikely(oti->oti_declare_ops_cred[op] == 0 && + oti->oti_declare_ops_cred[rb] == 0)) { + /* the API is not perfect yet: CREATE does REF_ADD internally + * while DESTROY does not. To rollback CREATE the callers + * needs to call REF_DEL+DESTROY which is hard to detect using + * a simple table of rollback operations */ + if (op == OSD_OT_REF_DEL && + oti->oti_declare_ops_cred[OSD_OT_CREATE] > 0) + goto proceed; + if (op == OSD_OT_REF_ADD && + oti->oti_declare_ops_cred[OSD_OT_DESTROY] > 0) + goto proceed; + osd_trans_dump_creds(env, th); + CERROR("%s: op = %d, rb = %d\n", + osd_name(osd_dt_dev(oh->ot_super.th_dev)), op, rb); + if (unlikely(ldiskfs_track_declares_assert)) + LBUG(); + } + +proceed: + /* remember how many credits we have unused before the operation */ + oti->oti_credits_before = oh->ot_handle->h_buffer_credits; + left = oti->oti_declare_ops_cred[op] - oti->oti_declare_ops_used[op]; + if (unlikely(oti->oti_credits_before < left)) { + osd_trans_dump_creds(env, th); + CERROR("%s: op = %d, rb = %d\n", + osd_name(osd_dt_dev(oh->ot_super.th_dev)), op, rb); + /* on a very small fs (testing?) it's possible that + * the transaction can't fit 1/4 of journal, so we + * just request less credits (see osd_trans_start()). + * ignore the same case here */ + rb = osd_transaction_size(osd_dt_dev(th->th_dev)); + if (unlikely(oh->ot_credits < rb)) { + if (unlikely(ldiskfs_track_declares_assert)) + LBUG(); + } + } +} + +static inline void osd_trans_exec_check(const struct lu_env *env, + struct thandle *th, + unsigned int op) +{ + struct osd_thread_info *oti = osd_oti_get(env); + struct osd_thandle *oh = container_of(th, struct osd_thandle, + ot_super); + int used, over, quota; + + /* how many credits have been used by the operation */ + used = oti->oti_credits_before - oh->ot_handle->h_buffer_credits; + + if (unlikely(used < 0)) { + /* if some block was allocated and released in the same + * transaction, then it won't be a part of the transaction + * and delta can be negative */ + return; + } + + if (used == 0) { + /* rollback operations (e.g. when we destroy just created + * object) should not consume any credits. there is no point + * to confuse the checks below */ + return; + } + + oti->oti_declare_ops_used[op] += used; + if (oti->oti_declare_ops_used[op] <= oti->oti_declare_ops_cred[op]) + return; + + /* we account quota for a whole transaction and any operation can + * consume corresponding credits */ + over = oti->oti_declare_ops_used[op] - + oti->oti_declare_ops_cred[op]; + quota = oti->oti_declare_ops_cred[OSD_OT_QUOTA] - + oti->oti_declare_ops_used[OSD_OT_QUOTA]; + if (over <= quota) { + /* probably that credits were consumed by + * quota indirectly (in the depths of ldiskfs) */ + oti->oti_declare_ops_used[OSD_OT_QUOTA] += over; + oti->oti_declare_ops_used[op] -= over; + } else { + CWARN("op %d: used %u, used now %u, reserved %u\n", + op, oti->oti_declare_ops_used[op], used, + oti->oti_declare_ops_cred[op]); + osd_trans_dump_creds(env, th); + if (unlikely(ldiskfs_track_declares_assert)) + LBUG(); + } +} + /** * Helper function to pack the fid, ldiskfs stores fid in packed format. */ @@ -927,5 +1187,19 @@ static inline loff_t ldiskfs_get_htree_eof(struct file *filp) return LDISKFS_HTREE_EOF_64BIT; } -#endif /* __KERNEL__ */ +static inline int fid_is_internal(const struct lu_fid *fid) +{ + return (!fid_is_namespace_visible(fid) && !fid_is_idif(fid)); +} + +static inline unsigned long osd_remote_parent_ino(struct osd_device *dev) +{ + return dev->od_mdt_map->omm_remote_parent->d_inode->i_ino; +} + +void ldiskfs_inc_count(handle_t *handle, struct inode *inode); +void ldiskfs_dec_count(handle_t *handle, struct inode *inode); + +void osd_fini_iobuf(struct osd_device *d, struct osd_iobuf *iobuf); + #endif /* _OSD_INTERNAL_H */