4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2012, Whamcloud, Inc.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
36 * lustre/osd/osd_internal.h
38 * Shared definitions and declarations for osd module
40 * Author: Nikita Danilov <nikita@clusterfs.com>
43 #ifndef _OSD_INTERNAL_H
44 #define _OSD_INTERNAL_H
46 #if defined(__KERNEL__)
48 /* struct rw_semaphore */
49 #include <linux/rwsem.h>
51 #include <linux/dcache.h>
53 #include <linux/dirent.h>
55 #include <ldiskfs/ldiskfs.h>
56 #include <ldiskfs/ldiskfs_jbd2.h>
57 #ifdef HAVE_LDISKFS_JOURNAL_CALLBACK_ADD
58 # define journal_callback ldiskfs_journal_cb_entry
59 # define osd_journal_callback_set(handle, func, jcb) \
60 ldiskfs_journal_callback_add(handle, func, jcb)
62 # define osd_journal_callback_set(handle, func, jcb) \
63 jbd2_journal_callback_set(handle, func, jcb)
66 /* fsfilt_{get|put}_ops */
67 #include <lustre_fsfilt.h>
71 /* class_register_type(), class_unregister_type(), class_get_type() */
72 #include <obd_class.h>
73 #include <lustre_disk.h>
74 #include <dt_object.h>
81 #define OSD_COUNTERS (0)
83 /** Enable thandle usage statistics */
84 #define OSD_THANDLE_STATS (0)
86 #ifdef HAVE_QUOTA_SUPPORT
90 cfs_kernel_cap_t oc_cap;
94 struct osd_directory {
95 struct iam_container od_container;
96 struct iam_descr od_descr;
100 * Object Index (oi) instance.
104 * underlying index object, where fid->id mapping is stored.
106 struct inode *oi_inode;
107 struct osd_directory oi_dir;
110 extern const int osd_dto_credits_noquota[];
113 struct dt_object oo_dt;
115 * Inode for file system object represented by this osd_object. This
116 * inode is pinned for the whole duration of lu_object life.
118 * Not modified concurrently (either setup early during object
119 * creation, or assigned by osd_object_create() under write lock).
121 struct inode *oo_inode;
123 * to protect index ops.
125 struct htree_lock_head *oo_hl_head;
126 cfs_rw_semaphore_t oo_ext_idx_sem;
127 cfs_rw_semaphore_t oo_sem;
128 struct osd_directory *oo_dir;
129 /** protects inode attributes. */
130 cfs_spinlock_t oo_guard;
132 * Following two members are used to indicate the presence of dot and
133 * dotdot in the given directory. This is required for interop mode
136 int oo_compat_dot_created;
137 int oo_compat_dotdot_created;
139 const struct lu_env *oo_owner;
140 #ifdef CONFIG_LOCKDEP
141 struct lockdep_map oo_dep_map;
145 #ifdef HAVE_LDISKFS_PDO
147 #define osd_ldiskfs_find_entry(dir, dentry, de, lock) \
148 ll_ldiskfs_find_entry(dir, dentry, de, lock)
149 #define osd_ldiskfs_add_entry(handle, child, cinode, hlock) \
150 ldiskfs_add_entry(handle, child, cinode, hlock)
152 #else /* HAVE_LDISKFS_PDO */
158 struct htree_lock_head {
162 #define ldiskfs_htree_lock(lock, head, inode, op) do { LBUG(); } while (0)
163 #define ldiskfs_htree_unlock(lock) do { LBUG(); } while (0)
165 static inline struct htree_lock_head *ldiskfs_htree_lock_head_alloc(int dep)
171 #define ldiskfs_htree_lock_head_free(lh) do { LBUG(); } while (0)
/** Sentinel handed out in place of a real htree lock when PDO is absent. */
#define LDISKFS_DUMMY_HTREE_LOCK        0xbabecafe

/**
 * Stub lock allocator for the !HAVE_LDISKFS_PDO case.
 *
 * Nothing is allocated: the LDISKFS_DUMMY_HTREE_LOCK sentinel is returned,
 * cast to a lock pointer.
 *
 * \retval the sentinel value as a struct htree_lock pointer
 */
static inline struct htree_lock *ldiskfs_htree_lock_alloc(void)
{
        struct htree_lock *dummy;

        dummy = (struct htree_lock *)LDISKFS_DUMMY_HTREE_LOCK;
        return dummy;
}
180 static inline void ldiskfs_htree_lock_free(struct htree_lock *lk)
182 LASSERT((unsigned long)lk == LDISKFS_DUMMY_HTREE_LOCK);
185 #define HTREE_HBITS_DEF 0
187 #define osd_ldiskfs_find_entry(dir, dentry, de, lock) \
188 ll_ldiskfs_find_entry(dir, dentry, de)
189 #define osd_ldiskfs_add_entry(handle, child, cinode, lock) \
190 ldiskfs_add_entry(handle, child, cinode)
192 #endif /* HAVE_LDISKFS_PDO */
194 extern const int osd_dto_credits_noquota[];
201 struct dt_device od_dt_dev;
202 /* information about underlying file system */
203 struct lustre_mount_info *od_mount;
204 struct vfsmount *od_mnt;
206 struct osd_oi **od_oi_table;
207 /* total number of OI containers */
212 unsigned int od_fl_capa:1;
213 unsigned long od_capa_timeout;
215 struct lustre_capa_key *od_capa_keys;
216 cfs_hlist_head_t *od_capa_hash;
218 cfs_proc_dir_entry_t *od_proc_entry;
219 struct lprocfs_stats *od_stats;
221 * statfs optimization: we cache a bit.
223 cfs_time_t od_osfs_age;
224 struct obd_statfs od_statfs;
225 cfs_spinlock_t od_osfs_lock;
228 * The following flag indicates, if it is interop mode or not.
229 * It will be initialized, using mount param.
233 struct fsfilt_operations *od_fsops;
236 * mapping for legacy OST objids
238 struct osd_compat_objid *od_ost_map;
240 unsigned long long od_readcache_max_filesize;
242 int od_writethrough_cache;
244 struct brw_stats od_brw_stats;
245 cfs_atomic_t od_r_in_flight;
246 cfs_atomic_t od_w_in_flight;
249 #define OSD_TRACK_DECLARES
250 #ifdef OSD_TRACK_DECLARES
/*
 * Record that operation \a op has been declared on transaction handle \a oh
 * by bumping the matching ot_declare_<op> counter.  Asserts that ot_handle
 * is still NULL when the declaration is made.
 *
 * \a oh is parenthesized at every use so that any pointer expression may be
 * passed; note that it is still evaluated twice, so it must be free of side
 * effects.
 */
#define OSD_DECLARE_OP(oh, op)   {                               \
        LASSERT((oh)->ot_handle == NULL);                        \
        ((oh)->ot_declare_ ##op)++; }
254 #define OSD_EXEC_OP(handle,op) { \
255 struct osd_thandle *oh; \
256 oh = container_of0(handle, struct osd_thandle, ot_super);\
257 if (((oh)->ot_declare_ ##op) > 0) { \
258 ((oh)->ot_declare_ ##op)--; \
262 #define OSD_DECLARE_OP(oh, op)
263 #define OSD_EXEC_OP(oh, op)
266 /* At most 10 uid/gids are affected in a transaction, and
267 * that's rename case:
268 * - 2 for source parent uid & gid;
269 * - 2 for source child uid & gid ('..' entry update when the child
271 * - 2 for target parent uid & gid;
272 * - 2 for target child uid & gid (if the target child exists);
273 * - 2 for root uid & gid (last_rcvd, llog, etc);
275 * The 0 to (OSD_MAX_UGID_CNT - 1) bits of ot_id_type is for indicating
276 * the id type of each id in the ot_id_array.
278 #define OSD_MAX_UGID_CNT 10
281 struct thandle ot_super;
283 struct journal_callback ot_jcb;
284 cfs_list_t ot_dcb_list;
285 /* Link to the device, for debugging. */
286 struct lu_ref_link *ot_dev_link;
287 unsigned short ot_credits;
288 unsigned short ot_id_cnt;
289 unsigned short ot_id_type;
290 uid_t ot_id_array[OSD_MAX_UGID_CNT];
292 #ifdef OSD_TRACK_DECLARES
293 unsigned char ot_declare_attr_set;
294 unsigned char ot_declare_punch;
295 unsigned char ot_declare_xattr_set;
296 unsigned char ot_declare_create;
297 unsigned char ot_declare_destroy;
298 unsigned char ot_declare_ref_add;
299 unsigned char ot_declare_ref_del;
300 unsigned char ot_declare_write;
301 unsigned char ot_declare_insert;
302 unsigned char ot_declare_delete;
305 #if OSD_THANDLE_STATS
306 /** time when this handle was allocated */
307 cfs_time_t oth_alloced;
309 /** time when this thandle was started */
310 cfs_time_t oth_started;
315 * Basic transaction credit op
325 DTO_LOG_REC, /**< XXX temporary: dt layer knows nothing about llog. */
339 LPROC_OSD_READ_BYTES = 0,
340 LPROC_OSD_WRITE_BYTES = 1,
341 LPROC_OSD_GET_PAGE = 2,
342 LPROC_OSD_NO_PAGE = 3,
343 LPROC_OSD_CACHE_ACCESS = 4,
344 LPROC_OSD_CACHE_HIT = 5,
345 LPROC_OSD_CACHE_MISS = 6,
347 #if OSD_THANDLE_STATS
348 LPROC_OSD_THANDLE_STARTING,
349 LPROC_OSD_THANDLE_OPEN,
350 LPROC_OSD_THANDLE_CLOSING,
357 * Storage representation for fids.
359 * Variable size, first byte contains the length of the whole record.
361 struct osd_fid_pack {
362 unsigned char fp_len;
363 char fp_area[sizeof(struct lu_fid)];
366 struct osd_it_ea_dirent {
367 struct lu_fid oied_fid;
370 unsigned short oied_namelen;
371 unsigned int oied_type;
373 } __attribute__((packed));
376 * As osd_it_ea_dirent (the in-memory dirent struct for osd) is larger
377 * than struct lu_dirent, osd readdir reads fewer dirents than are
378 * required for an mdd dir page. So the buffer size needs to be increased so
379 * that there is one ext3 readdir for every mdd readdir page.
382 #define OSD_IT_EA_BUFSIZE (CFS_PAGE_SIZE + CFS_PAGE_SIZE/4)
385 * This is iterator's in-memory data structure in interoperability
386 * mode (i.e. iterator over ldiskfs style directory)
389 struct osd_object *oie_obj;
390 /** used in ldiskfs iterator, to store the file pointer */
391 struct file oie_file;
392 /** how many entries have been read-cached from storage */
394 /** current entry is being iterated by caller */
396 /** current processing entry */
397 struct osd_it_ea_dirent *oie_dirent;
398 /** buffer to hold entries, size == OSD_IT_EA_BUFSIZE */
403 * Iterator's in-memory data structure for IAM mode.
406 struct osd_object *oi_obj;
407 struct iam_path_descr *oi_ipd;
408 struct iam_iterator oi_it;
411 #define MAX_BLOCKS_PER_PAGE (CFS_PAGE_SIZE / 512)
415 cfs_atomic_t dr_numreqs; /* number of reqs being processed */
420 unsigned int dr_ignore_quota:1;
421 unsigned int dr_elapsed_valid:1; /* we really did count time */
422 unsigned int dr_rw:1;
423 struct page *dr_pages[PTLRPC_MAX_BRW_PAGES];
424 unsigned long dr_blocks[PTLRPC_MAX_BRW_PAGES*MAX_BLOCKS_PER_PAGE];
425 unsigned long dr_start_time;
426 unsigned long dr_elapsed; /* how long io took */
427 struct osd_device *dr_dev;
430 struct osd_thread_info {
431 const struct lu_env *oti_env;
433 * used for index operations.
435 struct dentry oti_obj_dentry;
436 struct dentry oti_child_dentry;
438 /** dentry for Iterator context. */
439 struct dentry oti_it_dentry;
440 struct htree_lock *oti_hlock;
442 struct lu_fid oti_fid;
443 struct lu_fid oti_fid2;
444 struct osd_inode_id oti_id;
445 struct osd_inode_id oti_id2;
446 struct ost_id oti_ostid;
449 * XXX temporary: for ->i_op calls.
451 struct timespec oti_time;
453 * XXX temporary: fake struct file for osd_object_sync
455 struct file oti_file;
457 * XXX temporary: for capa operations.
459 struct lustre_capa_key oti_capa_key;
460 struct lustre_capa oti_capa;
462 /** osd_device reference, initialized in osd_trans_start() and
463 used in osd_trans_stop() */
464 struct osd_device *oti_dev;
467 * following ipd and it structures are used for osd_index_iam_lookup()
468 * these are defined separately as we might do index operation
469 * in open iterator session.
472 /** osd iterator context used for iterator session */
475 struct osd_it_iam oti_it;
476 /** ldiskfs iterator data structure, see osd_it_ea_{init, fini} */
477 struct osd_it_ea oti_it_ea;
480 /** pre-allocated buffer used by oti_it_ea, size OSD_IT_EA_BUFSIZE */
483 cfs_kstatfs_t oti_ksfs;
485 /** IAM iterator for index operation. */
486 struct iam_iterator oti_idx_it;
488 /** union to guarantee that ->oti_ipd[] has proper alignment. */
490 char oti_it_ipd[DX_IPD_MAX_SIZE];
491 long long oti_alignment_lieutenant;
495 char oti_idx_ipd[DX_IPD_MAX_SIZE];
496 long long oti_alignment_lieutenant_colonel;
503 /** used in osd_fid_set() to put xattr */
504 struct lu_buf oti_buf;
505 /** used in osd_ea_fid_set() to set fid into common ea */
506 struct lustre_mdt_attrs oti_mdt_attrs;
508 struct osd_iobuf oti_iobuf;
509 struct inode oti_inode;
510 int oti_created[PTLRPC_MAX_BRW_PAGES];
511 #ifdef HAVE_QUOTA_SUPPORT
512 struct osd_ctxt oti_ctxt;
514 struct lu_env oti_obj_delete_tx_env;
515 #define OSD_FID_REC_SZ 32
516 char oti_ldp[OSD_FID_REC_SZ];
517 char oti_ldp2[OSD_FID_REC_SZ];
520 extern int ldiskfs_pdo;
524 void lprocfs_osd_init_vars(struct lprocfs_static_vars *lvars);
525 int osd_procfs_init(struct osd_device *osd, const char *name);
526 int osd_procfs_fini(struct osd_device *osd);
527 void osd_lprocfs_time_start(const struct lu_env *env);
528 void osd_lprocfs_time_end(const struct lu_env *env,
529 struct osd_device *osd, int op);
530 void osd_brw_stats_update(struct osd_device *osd, struct osd_iobuf *iobuf);
533 int osd_statfs(const struct lu_env *env, struct dt_device *dev,
534 struct obd_statfs *sfs);
535 int osd_object_auth(const struct lu_env *env, struct dt_object *dt,
536 struct lustre_capa *capa, __u64 opc);
537 void osd_declare_qid(struct dt_object *dt, struct osd_thandle *oh,
538 int type, uid_t id, struct inode *inode);
539 struct inode *osd_iget(struct osd_thread_info *info, struct osd_device *dev,
540 struct osd_inode_id *id);
541 struct inode *osd_iget_fid(struct osd_thread_info *info, struct osd_device *dev,
542 struct osd_inode_id *id, struct lu_fid *fid);
544 int osd_compat_init(struct osd_device *dev);
545 void osd_compat_fini(struct osd_device *dev);
546 int osd_compat_objid_lookup(struct osd_thread_info *info,
547 struct osd_device *osd,
548 const struct lu_fid *fid, struct osd_inode_id *id);
549 int osd_compat_objid_insert(struct osd_thread_info *info,
550 struct osd_device *osd,
551 const struct lu_fid *fid,
552 const struct osd_inode_id *id, struct thandle *th);
553 int osd_compat_objid_delete(struct osd_thread_info *info,
554 struct osd_device *osd,
555 const struct lu_fid *fid, struct thandle *th);
556 int osd_compat_spec_lookup(struct osd_thread_info *info,
557 struct osd_device *osd,
558 const struct lu_fid *fid, struct osd_inode_id *id);
559 int osd_compat_spec_insert(struct osd_thread_info *info,
560 struct osd_device *osd,
561 const struct lu_fid *fid,
562 const struct osd_inode_id *id, struct thandle *th);
565 * Invariants, assertions.
569 * XXX: do not enable this, until invariant checking code is made thread safe
570 * in the face of pdirops locking.
572 #define OSD_INVARIANT_CHECKS (0)
574 #if OSD_INVARIANT_CHECKS
575 static inline int osd_invariant(const struct osd_object *obj)
579 ergo(obj->oo_inode != NULL,
580 obj->oo_inode->i_sb == osd_sb(osd_obj2dev(obj)) &&
581 atomic_read(&obj->oo_inode->i_count) > 0) &&
582 ergo(obj->oo_dir != NULL &&
583 obj->oo_dir->od_conationer.ic_object != NULL,
584 obj->oo_dir->od_conationer.ic_object == obj->oo_inode);
587 #define osd_invariant(obj) (1)
590 extern const struct dt_index_operations osd_otable_ops;
592 static inline int osd_oi_fid2idx(struct osd_device *dev,
593 const struct lu_fid *fid)
595 return fid->f_seq & (dev->od_oi_count - 1);
598 static inline struct osd_oi *osd_fid2oi(struct osd_device *osd,
599 const struct lu_fid *fid)
601 LASSERT(!fid_is_idif(fid));
602 LASSERT(!fid_is_igif(fid));
603 LASSERT(osd->od_oi_table != NULL && osd->od_oi_count >= 1);
604 /* It can work even od_oi_count equals to 1 although it's unexpected,
605 * the only reason we set it to 1 is for performance measurement */
606 return osd->od_oi_table[osd_oi_fid2idx(osd, fid)];
609 extern const struct lu_device_operations osd_lu_ops;
611 static inline int lu_device_is_osd(const struct lu_device *d)
613 return ergo(d != NULL && d->ld_ops != NULL, d->ld_ops == &osd_lu_ops);
616 static inline struct osd_device *osd_dt_dev(const struct dt_device *d)
618 LASSERT(lu_device_is_osd(&d->dd_lu_dev));
619 return container_of0(d, struct osd_device, od_dt_dev);
622 static inline struct osd_device *osd_dev(const struct lu_device *d)
624 LASSERT(lu_device_is_osd(d));
625 return osd_dt_dev(container_of0(d, struct dt_device, dd_lu_dev));
628 static inline struct osd_device *osd_obj2dev(const struct osd_object *o)
630 return osd_dev(o->oo_dt.do_lu.lo_dev);
633 static inline struct super_block *osd_sb(const struct osd_device *dev)
635 return dev->od_mount->lmi_mnt->mnt_sb;
638 static inline int osd_object_is_root(const struct osd_object *obj)
640 return osd_sb(osd_obj2dev(obj))->s_root->d_inode == obj->oo_inode;
643 static inline struct osd_object *osd_obj(const struct lu_object *o)
645 LASSERT(lu_device_is_osd(o->lo_dev));
646 return container_of0(o, struct osd_object, oo_dt.do_lu);
649 static inline struct osd_object *osd_dt_obj(const struct dt_object *d)
651 return osd_obj(&d->do_lu);
654 static inline struct lu_device *osd2lu_dev(struct osd_device *osd)
656 return &osd->od_dt_dev.dd_lu_dev;
659 static inline journal_t *osd_journal(const struct osd_device *dev)
661 return LDISKFS_SB(osd_sb(dev))->s_journal;
664 extern const struct dt_body_operations osd_body_ops;
665 extern struct lu_context_key osd_key;
667 static inline struct osd_thread_info *osd_oti_get(const struct lu_env *env)
669 return lu_context_key_get(&env->le_ctx, &osd_key);
672 extern const struct dt_body_operations osd_body_ops_new;
678 struct iam_path_descr *osd_it_ipd_get(const struct lu_env *env,
679 const struct iam_container *bag)
681 return bag->ic_descr->id_ops->id_ipd_alloc(bag,
682 osd_oti_get(env)->oti_it_ipd);
686 struct iam_path_descr *osd_idx_ipd_get(const struct lu_env *env,
687 const struct iam_container *bag)
689 return bag->ic_descr->id_ops->id_ipd_alloc(bag,
690 osd_oti_get(env)->oti_idx_ipd);
693 static inline void osd_ipd_put(const struct lu_env *env,
694 const struct iam_container *bag,
695 struct iam_path_descr *ipd)
697 bag->ic_descr->id_ops->id_ipd_free(ipd);
700 int osd_ldiskfs_read(struct inode *inode, void *buf, int size, loff_t *offs);
701 int osd_ldiskfs_write_record(struct inode *inode, void *buf, int bufsize,
702 loff_t *offs, handle_t *handle);
705 struct dentry *osd_child_dentry_by_inode(const struct lu_env *env,
707 const char *name, const int namelen)
709 struct osd_thread_info *info = osd_oti_get(env);
710 struct dentry *child_dentry = &info->oti_child_dentry;
711 struct dentry *obj_dentry = &info->oti_obj_dentry;
713 obj_dentry->d_inode = inode;
714 obj_dentry->d_sb = inode->i_sb;
715 obj_dentry->d_name.hash = 0;
717 child_dentry->d_name.hash = 0;
718 child_dentry->d_parent = obj_dentry;
719 child_dentry->d_name.name = name;
720 child_dentry->d_name.len = namelen;
725 * Helper function to pack the fid, ldiskfs stores fid in packed format.
728 void osd_fid_pack(struct osd_fid_pack *pack, const struct dt_rec *fid,
729 struct lu_fid *befider)
731 fid_cpu_to_be(befider, (struct lu_fid *)fid);
732 memcpy(pack->fp_area, befider, sizeof(*befider));
733 pack->fp_len = sizeof(*befider) + 1;
737 int osd_fid_unpack(struct lu_fid *fid, const struct osd_fid_pack *pack)
742 switch (pack->fp_len) {
743 case sizeof *fid + 1:
744 memcpy(fid, pack->fp_area, sizeof *fid);
745 fid_be_to_cpu(fid, fid);
748 CERROR("Unexpected packed fid size: %d\n", pack->fp_len);
754 #endif /* __KERNEL__ */
755 #endif /* _OSD_INTERNAL_H */