X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Fosd-ldiskfs%2Fosd_internal.h;h=7ad0c244c8a5df11bc8713f17b9c82758714df8b;hb=113aac9c212d63ec880a9731bd9a364f9b9a99bf;hp=ed3571ffbcbcc038dadfe9f3cad51a8442f231df;hpb=6b0fa766a4444cf655e965aba067a07143101966;p=fs%2Flustre-release.git diff --git a/lustre/osd-ldiskfs/osd_internal.h b/lustre/osd-ldiskfs/osd_internal.h index ed3571f..7ad0c24 100644 --- a/lustre/osd-ldiskfs/osd_internal.h +++ b/lustre/osd-ldiskfs/osd_internal.h @@ -27,7 +27,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2013, Intel Corporation. + * Copyright (c) 2011, 2015, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -79,6 +79,7 @@ extern struct kmem_cache *dynlock_cachep; /* OI scrub should skip this inode. */ #define LDISKFS_STATE_LUSTRE_NOSCRUB 31 +#define LDISKFS_STATE_LUSTRE_DESTROY 30 /** Enable thandle usage statistics */ #define OSD_THANDLE_STATS (0) @@ -126,6 +127,8 @@ struct osd_object { struct osd_directory *oo_dir; /** protects inode attributes. */ spinlock_t oo_guard; + + __u32 oo_destroyed:1; /** * Following two members are used to indicate the presence of dot and * dotdot in the given directory. This is required for interop mode @@ -145,7 +148,7 @@ struct osd_obj_seq { int oos_subdir_count; /* subdir count for each seq */ struct dentry *oos_root; /* O/ */ struct dentry **oos_dirs; /* O//d0-dXX */ - obd_seq oos_seq; /* seq number */ + u64 oos_seq; /* seq number */ struct list_head oos_seq_list; /* list to seq_list */ }; @@ -159,7 +162,7 @@ struct osd_obj_map { struct osd_mdobj { struct dentry *om_root; /* AGENT/ */ - obd_seq om_index; /* mdt index */ + u64 om_index; /* mdt index */ struct list_head om_list; /* list to omm_list */ }; @@ -168,7 +171,7 @@ struct osd_mdobj_map { }; #define osd_ldiskfs_add_entry(handle, child, cinode, hlock) \ - ldiskfs_add_entry(handle, child, cinode, hlock) + __ldiskfs_add_entry(handle, child, cinode, hlock) #define OSD_OTABLE_IT_CACHE_SIZE 64 #define OSD_OTABLE_IT_CACHE_MASK (~(OSD_OTABLE_IT_CACHE_SIZE - 1)) @@ -216,6 +219,12 @@ struct osd_otable_it { ooi_waiting:1; /* it::next is waiting. */ }; +struct osd_obj_orphan { + struct list_head oor_list; + struct lu_env *oor_env; /* to identify "own" records */ + __u32 oor_ino; +}; + /* * osd device. */ @@ -234,19 +243,15 @@ struct osd_device { unsigned int od_fl_capa:1, od_maybe_new:1, od_noscrub:1, - od_dirent_journal:1, od_igif_inoi:1, od_check_ff:1, od_is_ost:1, - od_lma_self_repair:1; - - unsigned long od_capa_timeout; - __u32 od_capa_alg; - struct lustre_capa_key *od_capa_keys; - struct hlist_head *od_capa_hash; + od_index_in_idif:1; + __u32 od_dirent_journal; + int od_index; struct proc_dir_entry *od_proc_entry; - struct lprocfs_stats *od_stats; + struct lprocfs_stats *od_stats; spinlock_t od_osfs_lock; @@ -284,9 +289,12 @@ struct osd_device { * OI scrub to scan the whole the device. */ __u64 od_full_scrub_ratio; /* If the speed of found bad OI mappings (per minute) - * exceeds the osd_device::od_full_scrub_speed, then - * trigger OI scrub to scan the whole the device. */ - __u64 od_full_scrub_speed; + * exceeds the osd_device::od_full_scrub_threshold_rate, + * then trigger OI scrub to scan the whole device. */ + __u64 od_full_scrub_threshold_rate; + + /* a list of orphaned agent inodes, protected with od_osfs_lock */ + struct list_head od_orphan_list; }; enum osd_full_scrub_ratio { @@ -301,7 +309,7 @@ enum osd_full_scrub_ratio { OFSR_DEFAULT = 10000, }; -#define FULL_SCRUB_SPEED_DEFULT 60 +#define FULL_SCRUB_THRESHOLD_RATE_DEFAULT 60 /* There are at most 10 uid/gids are affected in a transaction, and * that's rename case: @@ -327,21 +335,22 @@ enum { OSD_OT_WRITE = 7, OSD_OT_INSERT = 8, OSD_OT_DELETE = 9, - OSD_OT_UPDATE = 10, - OSD_OT_QUOTA = 11, - OSD_OT_MAX = 12 + OSD_OT_QUOTA = 10, + OSD_OT_MAX = 11 }; struct osd_thandle { struct thandle ot_super; handle_t *ot_handle; struct ldiskfs_journal_cb_entry ot_jcb; - struct list_head ot_dcb_list; + struct list_head ot_commit_dcb_list; + struct list_head ot_stop_dcb_list; /* Link to the device, for debugging. */ struct lu_ref_link ot_dev_link; unsigned short ot_credits; unsigned short ot_id_cnt; unsigned short ot_id_type; + int ot_remove_agents:1; uid_t ot_id_array[OSD_MAX_UGID_CNT]; struct lquota_trans *ot_quota_trans; #if OSD_THANDLE_STATS @@ -375,7 +384,7 @@ enum dt_txn_op { * osd dev stats */ -#ifdef LPROCFS +#ifdef CONFIG_PROC_FS enum { LPROC_OSD_READ_BYTES = 0, LPROC_OSD_WRITE_BYTES = 1, @@ -427,17 +436,18 @@ struct osd_it_ea_dirent { * mode (i.e. iterator over ldiskfs style directory) */ struct osd_it_ea { - struct osd_object *oie_obj; - /** used in ldiskfs iterator, to stored file pointer */ - struct file oie_file; - /** how many entries have been read-cached from storage */ - int oie_rd_dirent; - /** current entry is being iterated by caller */ - int oie_it_dirent; - /** current processing entry */ - struct osd_it_ea_dirent *oie_dirent; - /** buffer to hold entries, size == OSD_IT_EA_BUFSIZE */ - void *oie_buf; + struct osd_object *oie_obj; + /** used in ldiskfs iterator, to stored file pointer */ + struct file oie_file; + /** how many entries have been read-cached from storage */ + int oie_rd_dirent; + /** current entry is being iterated by caller */ + int oie_it_dirent; + /** current processing entry */ + struct osd_it_ea_dirent *oie_dirent; + /** buffer to hold entries, size == OSD_IT_EA_BUFSIZE */ + void *oie_buf; + struct dentry oie_dentry; }; /** @@ -486,7 +496,7 @@ struct osd_iobuf { struct lu_buf dr_pg_buf; struct page **dr_pages; struct lu_buf dr_bl_buf; - unsigned long *dr_blocks; + sector_t *dr_blocks; unsigned long dr_start_time; unsigned long dr_elapsed; /* how long io took */ struct osd_device *dr_dev; @@ -494,18 +504,26 @@ struct osd_iobuf { }; struct osd_thread_info { - const struct lu_env *oti_env; - /** - * used for index operations. - */ - struct dentry oti_obj_dentry; - struct dentry oti_child_dentry; + const struct lu_env *oti_env; + /** + * used for index operations. + */ + struct dentry oti_obj_dentry; + struct dentry oti_child_dentry; + + /** dentry for Iterator context. */ + struct dentry oti_it_dentry; + + union { + /* fake struct file for osd_object_sync */ + struct file oti_file; + /* osd_statfs() */ + struct kstatfs oti_ksfs; + }; - /** dentry for Iterator context. */ - struct dentry oti_it_dentry; - struct htree_lock *oti_hlock; + struct htree_lock *oti_hlock; - struct lu_fid oti_fid; + struct lu_fid oti_fid; struct lu_fid oti_fid2; struct lu_fid oti_fid3; struct osd_inode_id oti_id; @@ -517,15 +535,6 @@ struct osd_thread_info { * XXX temporary: for ->i_op calls. */ struct timespec oti_time; - /* - * XXX temporary: fake struct file for osd_object_sync - */ - struct file oti_file; - /* - * XXX temporary: for capa operations. - */ - struct lustre_capa_key oti_capa_key; - struct lustre_capa oti_capa; /** osd_device reference, initialized in osd_trans_start() and used in osd_trans_stop() */ @@ -537,26 +546,16 @@ struct osd_thread_info { * in open iterator session. */ - /** osd iterator context used for iterator session */ - - union { - struct osd_it_iam oti_it; - /* ldiskfs iterator data structure, - * see osd_it_ea_{init, fini} */ - struct osd_it_ea oti_it_ea; - struct osd_it_quota oti_it_quota; - }; - /** pre-allocated buffer used by oti_it_ea, size OSD_IT_EA_BUFSIZE */ void *oti_it_ea_buf; + unsigned int oti_it_ea_buf_used:1; - struct kstatfs oti_ksfs; - - /** IAM iterator for index operation. */ - struct iam_iterator oti_idx_it; + /* IAM iterator for index operation. */ + struct iam_iterator oti_idx_it; /** union to guarantee that ->oti_ipd[] has proper alignment. */ union { + char oti_name[48]; char oti_it_ipd[DX_IPD_MAX_SIZE]; long long oti_alignment_lieutenant; }; @@ -568,7 +567,6 @@ struct osd_thread_info { struct osd_idmap_cache oti_cache; - unsigned int oti_it_inline:1; int oti_r_locks; int oti_w_locks; int oti_txns; @@ -580,10 +578,13 @@ struct osd_thread_info { struct lustre_mdt_attrs oti_mdt_attrs; /* old LMA for compatibility */ char oti_mdt_attrs_old[LMA_OLD_SIZE]; + struct filter_fid_old oti_ff; + struct filter_fid oti_ff_new; }; /** 0-copy IO */ struct osd_iobuf oti_iobuf; - struct inode oti_inode; + /* used to access objects in /O */ + struct inode *oti_inode; #define OSD_FID_REC_SZ 32 char oti_ldp[OSD_FID_REC_SZ]; char oti_ldp2[OSD_FID_REC_SZ]; @@ -606,16 +607,10 @@ struct osd_thread_info { /* Tracking for transaction credits, to allow debugging and optimizing * cases where a large number of credits are being allocated for * single transaction. */ + unsigned int oti_credits_before; unsigned short oti_declare_ops[OSD_OT_MAX]; - unsigned short oti_declare_ops_rb[OSD_OT_MAX]; unsigned short oti_declare_ops_cred[OSD_OT_MAX]; - bool oti_rollback; - - char oti_name[48]; - union { - struct filter_fid_old oti_ff; - struct filter_fid oti_ff_new; - }; + unsigned short oti_declare_ops_used[OSD_OT_MAX]; }; extern int ldiskfs_pdo; @@ -643,19 +638,20 @@ static inline int __osd_xattr_set(struct osd_thread_info *info, return inode->i_op->setxattr(dentry, name, buf, buflen, fl); } -#ifdef LPROCFS +#ifdef CONFIG_PROC_FS /* osd_lproc.c */ -extern struct lprocfs_seq_vars lprocfs_osd_obd_vars[]; -extern struct lprocfs_seq_vars lprocfs_osd_module_vars[]; +extern struct lprocfs_vars lprocfs_osd_obd_vars[]; +extern struct lprocfs_vars lprocfs_osd_module_vars[]; int osd_procfs_init(struct osd_device *osd, const char *name); int osd_procfs_fini(struct osd_device *osd); void osd_brw_stats_update(struct osd_device *osd, struct osd_iobuf *iobuf); +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 0, 52, 0) +int osd_register_proc_index_in_idif(struct osd_device *osd); +#endif #endif int osd_statfs(const struct lu_env *env, struct dt_device *dev, struct obd_statfs *sfs); -int osd_object_auth(const struct lu_env *env, struct dt_object *dt, - struct lustre_capa *capa, __u64 opc); struct inode *osd_iget(struct osd_thread_info *info, struct osd_device *dev, struct osd_inode_id *id); int osd_ea_fid_set(struct osd_thread_info *info, struct inode *inode, @@ -704,7 +700,7 @@ int osd_oii_lookup(struct osd_device *dev, const struct lu_fid *fid, int osd_scrub_dump(struct seq_file *m, struct osd_device *dev); int osd_fld_lookup(const struct lu_env *env, struct osd_device *osd, - obd_seq seq, struct lu_seq_range *range); + u64 seq, struct lu_seq_range *range); int osd_delete_from_remote_parent(const struct lu_env *env, struct osd_device *osd, @@ -742,16 +738,7 @@ const struct dt_rec *osd_quota_pack(struct osd_object *obj, const struct dt_rec *rec, union lquota_rec *quota_rec); void osd_quota_unpack(struct osd_object *obj, const struct dt_rec *rec); -int osd_quota_migration(const struct lu_env *env, struct dt_object *dt, - const struct dt_index_features *feat); - -static inline bool is_quota_glb_feat(const struct dt_index_features *feat) -{ - return (feat == &dt_quota_iusr_features || - feat == &dt_quota_busr_features || - feat == &dt_quota_igrp_features || - feat == &dt_quota_bgrp_features) ? true : false; -} +int osd_quota_migration(const struct lu_env *env, struct dt_object *dt); #ifndef HAVE_I_UID_READ static inline uid_t i_uid_read(const struct inode *inode) @@ -775,6 +762,44 @@ static inline void i_gid_write(struct inode *inode, gid_t gid) } #endif +#ifdef LDISKFS_HT_MISC +# define osd_journal_start_sb(sb, type, nblock) \ + ldiskfs_journal_start_sb(sb, type, nblock) +# define osd_ldiskfs_append(handle, inode, nblock) \ + ldiskfs_append(handle, inode, nblock) +# define osd_ldiskfs_find_entry(dir, name, de, inlined, lock) \ + __ldiskfs_find_entry(dir, name, de, inlined, lock) +# define osd_journal_start(inode, type, nblocks) \ + ldiskfs_journal_start(inode, type, nblocks) +# define osd_transaction_size(dev) \ + (osd_journal(dev)->j_max_transaction_buffers / 2) +#else +# define LDISKFS_HT_MISC 0 +# define osd_journal_start_sb(sb, type, nblock) \ + ldiskfs_journal_start_sb(sb, nblock) + +static inline struct buffer_head *osd_ldiskfs_append(handle_t *handle, + struct inode *inode, + ldiskfs_lblk_t *nblock) +{ + struct buffer_head *bh; + int err = 0; + + bh = ldiskfs_append(handle, inode, nblock, &err); + if (bh == NULL) + bh = ERR_PTR(err); + + return bh; +} + +# define osd_ldiskfs_find_entry(dir, name, de, inlined, lock) \ + __ldiskfs_find_entry(dir, name, de, lock) +# define osd_journal_start(inode, type, nblocks) \ + ldiskfs_journal_start(inode, nblocks) +# define osd_transaction_size(dev) \ + (osd_journal(dev)->j_max_transaction_buffers) +#endif + /* * Invariants, assertions. */ @@ -924,6 +949,10 @@ static inline void osd_ipd_put(const struct lu_env *env, bag->ic_descr->id_ops->id_ipd_free(ipd); } +int osd_calc_bkmap_credits(struct super_block *sb, struct inode *inode, + const loff_t size, const loff_t pos, + const int blocks); + int osd_ldiskfs_read(struct inode *inode, void *buf, int size, loff_t *offs); int osd_ldiskfs_write_record(struct inode *inode, void *buf, int bufsize, int write_NUL, loff_t *offs, handle_t *handle); @@ -950,6 +979,7 @@ struct dentry *osd_child_dentry_by_inode(const struct lu_env *env, extern int osd_trans_declare_op2rb[]; extern int ldiskfs_track_declares_assert; +void osd_trans_dump_creds(const struct lu_env *env, struct thandle *th); static inline void osd_trans_declare_op(const struct lu_env *env, struct osd_thandle *oh, @@ -963,7 +993,7 @@ static inline void osd_trans_declare_op(const struct lu_env *env, LASSERT(op < OSD_OT_MAX); } else { CWARN("%s: Invalid operation index %d\n", - osd_name(oti->oti_dev), op); + osd_name(osd_dt_dev(oh->ot_super.th_dev)), op); libcfs_debug_dumpstack(NULL); } } else { @@ -979,7 +1009,7 @@ static inline void osd_trans_exec_op(const struct lu_env *env, struct osd_thread_info *oti = osd_oti_get(env); struct osd_thandle *oh = container_of(th, struct osd_thandle, ot_super); - unsigned int rb; + unsigned int rb, left; LASSERT(oh->ot_handle != NULL); if (unlikely(op >= OSD_OT_MAX)) { @@ -987,64 +1017,105 @@ static inline void osd_trans_exec_op(const struct lu_env *env, LASSERT(op < OSD_OT_MAX); else { CWARN("%s: Invalid operation index %d\n", - osd_name(oti->oti_dev), op); + osd_name(osd_dt_dev(oh->ot_super.th_dev)), op); libcfs_debug_dumpstack(NULL); return; } } - if (likely(!oti->oti_rollback && oti->oti_declare_ops[op] > 0)) { - oti->oti_declare_ops[op]--; - oti->oti_declare_ops_rb[op]++; - } else { - /* all future updates are considered rollback */ - oti->oti_rollback = true; - rb = osd_trans_declare_op2rb[op]; - if (unlikely(rb >= OSD_OT_MAX)) { - if (unlikely(ldiskfs_track_declares_assert)) - LASSERTF(rb < OSD_OT_MAX, "rb = %u\n", rb); - else { - CWARN("%s: Invalid rollback index %d\n", - osd_name(oti->oti_dev), rb); - libcfs_debug_dumpstack(NULL); - return; - } - } - if (unlikely(oti->oti_declare_ops_rb[rb] == 0)) { + /* find rollback (or reverse) operation for the given one + * such an operation doesn't require additional credits + * as the same set of blocks are modified */ + rb = osd_trans_declare_op2rb[op]; + + /* check whether credits for this operation were reserved at all */ + if (unlikely(oti->oti_declare_ops_cred[op] == 0 && + oti->oti_declare_ops_cred[rb] == 0)) { + /* the API is not perfect yet: CREATE does REF_ADD internally + * while DESTROY does not. To rollback CREATE the callers + * needs to call REF_DEL+DESTROY which is hard to detect using + * a simple table of rollback operations */ + if (op == OSD_OT_REF_DEL && + oti->oti_declare_ops_cred[OSD_OT_CREATE] > 0) + goto proceed; + if (op == OSD_OT_REF_ADD && + oti->oti_declare_ops_cred[OSD_OT_DESTROY] > 0) + goto proceed; + osd_trans_dump_creds(env, th); + CERROR("%s: op = %d, rb = %d\n", + osd_name(osd_dt_dev(oh->ot_super.th_dev)), op, rb); + if (unlikely(ldiskfs_track_declares_assert)) + LBUG(); + } + +proceed: + /* remember how many credits we have unused before the operation */ + oti->oti_credits_before = oh->ot_handle->h_buffer_credits; + left = oti->oti_declare_ops_cred[op] - oti->oti_declare_ops_used[op]; + if (unlikely(oti->oti_credits_before < left)) { + osd_trans_dump_creds(env, th); + CERROR("%s: op = %d, rb = %d\n", + osd_name(osd_dt_dev(oh->ot_super.th_dev)), op, rb); + /* on a very small fs (testing?) it's possible that + * the transaction can't fit 1/4 of journal, so we + * just request less credits (see osd_trans_start()). + * ignore the same case here */ + rb = osd_transaction_size(osd_dt_dev(th->th_dev)); + if (unlikely(oh->ot_credits < rb)) { if (unlikely(ldiskfs_track_declares_assert)) - LASSERTF(oti->oti_declare_ops_rb[rb] > 0, - "rb = %u\n", rb); - else { - CWARN("%s: Overflow in tracking declares for " - "index, rb = %d\n", - osd_name(oti->oti_dev), rb); - libcfs_debug_dumpstack(NULL); - return; - } + LBUG(); } - oti->oti_declare_ops_rb[rb]--; } } -static inline void osd_trans_declare_rb(const struct lu_env *env, - struct thandle *th, unsigned int op) +static inline void osd_trans_exec_check(const struct lu_env *env, + struct thandle *th, + unsigned int op) { struct osd_thread_info *oti = osd_oti_get(env); struct osd_thandle *oh = container_of(th, struct osd_thandle, ot_super); + int used, over, quota; - LASSERT(oh->ot_handle != NULL); - if (unlikely(op >= OSD_OT_MAX)) { - if (unlikely(ldiskfs_track_declares_assert)) - LASSERT(op < OSD_OT_MAX); - else { - CWARN("%s: Invalid operation index %d\n", - osd_name(oti->oti_dev), op); - libcfs_debug_dumpstack(NULL); - } + /* how many credits have been used by the operation */ + used = oti->oti_credits_before - oh->ot_handle->h_buffer_credits; + if (unlikely(used < 0)) { + /* if some block was allocated and released in the same + * transaction, then it won't be a part of the transaction + * and delta can be negative */ + return; + } + + if (used == 0) { + /* rollback operations (e.g. when we destroy just created + * object) should not consume any credits. there is no point + * to confuse the checks below */ + return; + } + + oti->oti_declare_ops_used[op] += used; + if (oti->oti_declare_ops_used[op] <= oti->oti_declare_ops_cred[op]) + return; + + /* we account quota for a whole transaction and any operation can + * consume corresponding credits */ + over = oti->oti_declare_ops_used[op] - + oti->oti_declare_ops_cred[op]; + quota = oti->oti_declare_ops_cred[OSD_OT_QUOTA] - + oti->oti_declare_ops_used[OSD_OT_QUOTA]; + if (over <= quota) { + /* probably that credits were consumed by + * quota indirectly (in the depths of ldiskfs) */ + oti->oti_declare_ops_used[OSD_OT_QUOTA] += over; + oti->oti_declare_ops_used[op] -= over; } else { - oti->oti_declare_ops_rb[op]++; + CWARN("op %d: used %u, used now %u, reserved %u\n", + op, oti->oti_declare_ops_used[op], used, + oti->oti_declare_ops_cred[op]); + osd_trans_dump_creds(env, th); + if (unlikely(ldiskfs_track_declares_assert)) + LBUG(); } } @@ -1114,31 +1185,6 @@ static inline unsigned long osd_remote_parent_ino(struct osd_device *dev) return dev->od_mdt_map->omm_remote_parent->d_inode->i_ino; } -#ifdef JOURNAL_START_HAS_3ARGS -# define osd_journal_start_sb(sb, type, nblock) \ - ldiskfs_journal_start_sb(sb, type, nblock) -# define osd_ldiskfs_append(handle, inode, nblock, err) \ - ldiskfs_append(handle, inode, nblock) -# define osd_ldiskfs_find_entry(dir, name, de, inlined, lock) \ - ldiskfs_find_entry(dir, name, de, inlined, lock) -# define osd_journal_start(inode, type, nblocks) \ - ldiskfs_journal_start(inode, type, nblocks) -# define osd_transaction_size(dev) \ - (osd_journal(dev)->j_max_transaction_buffers / 2) -#else -# define LDISKFS_HT_MISC 0 -# define osd_journal_start_sb(sb, type, nblock) \ - ldiskfs_journal_start_sb(sb, nblock) -# define osd_ldiskfs_append(handle, inode, nblock, err) \ - ldiskfs_append(handle, inode, nblock, err) -# define osd_ldiskfs_find_entry(dir, name, de, inlined, lock) \ - ldiskfs_find_entry(dir, name, de, lock) -# define osd_journal_start(inode, type, nblocks) \ - ldiskfs_journal_start(inode, nblocks) -# define osd_transaction_size(dev) \ - (osd_journal(dev)->j_max_transaction_buffers) -#endif - void ldiskfs_inc_count(handle_t *handle, struct inode *inode); void ldiskfs_dec_count(handle_t *handle, struct inode *inode);