4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2012, Whamcloud, Inc.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
36 * lustre/osd/osd_internal.h
38 * Shared definitions and declarations for osd module
40 * Author: Nikita Danilov <nikita@clusterfs.com>
43 #ifndef _OSD_INTERNAL_H
44 #define _OSD_INTERNAL_H
46 #if defined(__KERNEL__)
48 /* struct rw_semaphore */
49 #include <linux/rwsem.h>
51 #include <linux/dcache.h>
53 #include <linux/dirent.h>
55 #include <ldiskfs/ldiskfs.h>
56 #include <ldiskfs/ldiskfs_jbd2.h>
/*
 * Journal callback shim.  When HAVE_LDISKFS_JOURNAL_CALLBACK_ADD is defined,
 * journal_callback is an alias for ldiskfs_journal_cb_entry and
 * osd_journal_callback_set() forwards its three arguments to
 * ldiskfs_journal_callback_add().
 */
57 #ifdef HAVE_LDISKFS_JOURNAL_CALLBACK_ADD
58 # define journal_callback ldiskfs_journal_cb_entry
59 # define osd_journal_callback_set(handle, func, jcb) \
60 ldiskfs_journal_callback_add(handle, func, jcb)
/*
 * Alternative definition forwarding to jbd2_journal_callback_set();
 * presumably the #else branch -- the directive itself is not visible here.
 */
62 # define osd_journal_callback_set(handle, func, jcb) \
63 jbd2_journal_callback_set(handle, func, jcb)
66 /* fsfilt_{get|put}_ops */
67 #include <lustre_fsfilt.h>
71 /* class_register_type(), class_unregister_type(), class_get_type() */
72 #include <obd_class.h>
73 #include <lustre_disk.h>
74 #include <dt_object.h>
81 #define OSD_OII_NOGEN (0)
82 #define OSD_COUNTERS (0)
84 /** Enable thandle usage statistics */
85 #define OSD_THANDLE_STATS (0)
87 #ifdef HAVE_QUOTA_SUPPORT
91 cfs_kernel_cap_t oc_cap;
/*
 * Index state attached to an osd object (see osd_object::oo_dir): an IAM
 * container together with the descriptor used for it.
 */
95 struct osd_directory {
96 struct iam_container od_container;
97 struct iam_descr od_descr;
101 * Object Index (oi) instance.
105 * underlying index object, where fid->id mapping is stored.
107 struct inode *oi_inode;
108 struct osd_directory oi_dir;
111 extern const int osd_dto_credits_noquota[];
114 struct dt_object oo_dt;
116 * Inode for file system object represented by this osd_object. This
117 * inode is pinned for the whole duration of lu_object life.
119 * Not modified concurrently (either setup early during object
120 * creation, or assigned by osd_object_create() under write lock).
122 struct inode *oo_inode;
124 * to protect index ops.
126 struct htree_lock_head *oo_hl_head;
127 cfs_rw_semaphore_t oo_ext_idx_sem;
128 cfs_rw_semaphore_t oo_sem;
129 struct osd_directory *oo_dir;
130 /** protects inode attributes. */
131 cfs_spinlock_t oo_guard;
133 * Following two members are used to indicate the presence of dot and
134 * dotdot in the given directory. This is required for interop mode
137 int oo_compat_dot_created;
138 int oo_compat_dotdot_created;
140 const struct lu_env *oo_owner;
141 #ifdef CONFIG_LOCKDEP
142 struct lockdep_map oo_dep_map;
/*
 * Directory entry helpers when ldiskfs has PDO support
 * (HAVE_LDISKFS_PDO): the htree lock argument is passed through unchanged to
 * ll_ldiskfs_find_entry() and ldiskfs_add_entry().
 */
146 #ifdef HAVE_LDISKFS_PDO
148 #define osd_ldiskfs_find_entry(dir, dentry, de, lock) \
149 ll_ldiskfs_find_entry(dir, dentry, de, lock)
150 #define osd_ldiskfs_add_entry(handle, child, cinode, hlock) \
151 ldiskfs_add_entry(handle, child, cinode, hlock)
153 #else /* HAVE_LDISKFS_PDO */
/*
 * Stand-ins for the htree lock API, used in the #else branch of
 * HAVE_LDISKFS_PDO (i.e. when ldiskfs has no PDO support).
 */
159 struct htree_lock_head {
/* Lock and unlock expand to LBUG(): presumably never expected to be reached in this configuration. */
163 #define ldiskfs_htree_lock(lock, head, inode, op) do { LBUG(); } while (0)
164 #define ldiskfs_htree_unlock(lock) do { LBUG(); } while (0)
/* Lock-head allocator stand-in; its body is not visible in this listing. */
166 static inline struct htree_lock_head *ldiskfs_htree_lock_head_alloc(int dep)
172 #define ldiskfs_htree_lock_head_free(lh) do { LBUG(); } while (0)
/* Sentinel value handed out in place of a real htree lock. */
174 #define LDISKFS_DUMMY_HTREE_LOCK 0xbabecafe
/* Allocates nothing: returns the sentinel cast to a lock pointer. */
176 static inline struct htree_lock *ldiskfs_htree_lock_alloc(void)
178 return (struct htree_lock *)LDISKFS_DUMMY_HTREE_LOCK;
/* Frees nothing: only asserts that lk is the sentinel returned by ldiskfs_htree_lock_alloc(). */
181 static inline void ldiskfs_htree_lock_free(struct htree_lock *lk)
183 LASSERT((unsigned long)lk == LDISKFS_DUMMY_HTREE_LOCK);
186 #define HTREE_HBITS_DEF 0
/*
 * Directory entry helpers without PDO support: same macro signatures as the
 * PDO variants, but the trailing lock argument is accepted and dropped
 * before calling ll_ldiskfs_find_entry() / ldiskfs_add_entry().
 */
188 #define osd_ldiskfs_find_entry(dir, dentry, de, lock) \
189 ll_ldiskfs_find_entry(dir, dentry, de)
190 #define osd_ldiskfs_add_entry(handle, child, cinode, lock) \
191 ldiskfs_add_entry(handle, child, cinode)
193 #endif /* HAVE_LDISKFS_PDO */
195 extern const int osd_dto_credits_noquota[];
202 struct dt_device od_dt_dev;
203 /* information about underlying file system */
204 struct lustre_mount_info *od_mount;
205 struct vfsmount *od_mnt;
207 struct osd_oi **od_oi_table;
208 /* total number of OI containers */
213 unsigned int od_fl_capa:1;
214 unsigned long od_capa_timeout;
216 struct lustre_capa_key *od_capa_keys;
217 cfs_hlist_head_t *od_capa_hash;
219 cfs_proc_dir_entry_t *od_proc_entry;
220 struct lprocfs_stats *od_stats;
222 * statfs optimization: we cache a bit.
224 cfs_time_t od_osfs_age;
225 struct obd_statfs od_statfs;
226 cfs_spinlock_t od_osfs_lock;
229 * The following flag indicates, if it is interop mode or not.
230 * It will be initialized, using mount param.
234 struct fsfilt_operations *od_fsops;
237 * mapping for legacy OST objids
239 struct osd_compat_objid *od_ost_map;
241 unsigned long long od_readcache_max_filesize;
243 int od_writethrough_cache;
245 struct brw_stats od_brw_stats;
246 cfs_atomic_t od_r_in_flight;
247 cfs_atomic_t od_w_in_flight;
250 #define OSD_TRACK_DECLARES
251 #ifdef OSD_TRACK_DECLARES
/*
 * Record that operation `op` has been declared on transaction handle `oh`:
 * asserts that oh->ot_handle is still NULL, then increments the matching
 * ot_declare_<op> counter.
 *
 * `oh` is parenthesized at every use so that any pointer expression
 * (for example `base + 1`) can be passed, not only a plain identifier.
 *
 * NOTE(review): the expansion is a bare `{ ... }` block rather than
 * `do { ... } while (0)`; left unchanged because call sites are not visible
 * from this header.
 */
#define OSD_DECLARE_OP(oh, op)   {                               \
        LASSERT((oh)->ot_handle == NULL);                        \
        ((oh)->ot_declare_ ##op)++;      }
255 #define OSD_EXEC_OP(handle,op) { \
256 struct osd_thandle *oh; \
257 oh = container_of0(handle, struct osd_thandle, ot_super);\
258 if (((oh)->ot_declare_ ##op) > 0) { \
259 ((oh)->ot_declare_ ##op)--; \
263 #define OSD_DECLARE_OP(oh, op)
264 #define OSD_EXEC_OP(oh, op)
267 /* At most 10 uid/gids are affected in a transaction, and
268 * that's rename case:
269 * - 2 for source parent uid & gid;
270 * - 2 for source child uid & gid ('..' entry update when the child
272 * - 2 for target parent uid & gid;
273 * - 2 for target child uid & gid (if the target child exists);
274 * - 2 for root uid & gid (last_rcvd, llog, etc);
276 * The 0 to (OSD_MAX_UGID_CNT - 1) bits of ot_id_type is for indicating
277 * the id type of each id in the ot_id_array.
279 #define OSD_MAX_UGID_CNT 10
282 struct thandle ot_super;
284 struct journal_callback ot_jcb;
285 cfs_list_t ot_dcb_list;
286 /* Link to the device, for debugging. */
287 struct lu_ref_link *ot_dev_link;
288 unsigned short ot_credits;
289 unsigned short ot_id_cnt;
290 unsigned short ot_id_type;
291 uid_t ot_id_array[OSD_MAX_UGID_CNT];
293 #ifdef OSD_TRACK_DECLARES
294 unsigned char ot_declare_attr_set;
295 unsigned char ot_declare_punch;
296 unsigned char ot_declare_xattr_set;
297 unsigned char ot_declare_create;
298 unsigned char ot_declare_destroy;
299 unsigned char ot_declare_ref_add;
300 unsigned char ot_declare_ref_del;
301 unsigned char ot_declare_write;
302 unsigned char ot_declare_insert;
303 unsigned char ot_declare_delete;
306 #if OSD_THANDLE_STATS
307 /** time when this handle was allocated */
308 cfs_time_t oth_alloced;
310 /** time when this thandle was started */
311 cfs_time_t oth_started;
316 * Basic transaction credit op
326 DTO_LOG_REC, /**< XXX temporary: dt layer knows nothing about llog. */
340 LPROC_OSD_READ_BYTES = 0,
341 LPROC_OSD_WRITE_BYTES = 1,
342 LPROC_OSD_GET_PAGE = 2,
343 LPROC_OSD_NO_PAGE = 3,
344 LPROC_OSD_CACHE_ACCESS = 4,
345 LPROC_OSD_CACHE_HIT = 5,
346 LPROC_OSD_CACHE_MISS = 6,
348 #if OSD_THANDLE_STATS
349 LPROC_OSD_THANDLE_STARTING,
350 LPROC_OSD_THANDLE_OPEN,
351 LPROC_OSD_THANDLE_CLOSING,
358 * Storage representation for fids.
360 * Variable size, first byte contains the length of the whole record.
362 struct osd_fid_pack {
/* Length of the whole record in bytes; osd_fid_pack() stores sizeof(struct lu_fid) + 1. */
363 unsigned char fp_len;
/* Payload, sized for one struct lu_fid; osd_fid_pack() fills it with the big-endian fid. */
364 char fp_area[sizeof(struct lu_fid)];
/*
 * In-memory directory entry used by the osd ldiskfs-style iterator.
 * Declared with __attribute__((packed)); some members are not visible in
 * this listing.
 */
367 struct osd_it_ea_dirent {
368 struct lu_fid oied_fid;
371 unsigned short oied_namelen;
372 unsigned int oied_type;
374 } __attribute__((packed));
377 * As osd_it_ea_dirent (the in-memory dirent struct for osd) is larger
378 * than the lu_dirent struct, an osd readdir reads fewer dirents than
379 * required for an mdd dir page. So the buffer size needs to be increased so
380 * that there is one ext3 readdir for every mdd readdir page.
383 #define OSD_IT_EA_BUFSIZE (CFS_PAGE_SIZE + CFS_PAGE_SIZE/4)
386 * This is iterator's in-memory data structure in interoperability
387 * mode (i.e. iterator over ldiskfs style directory)
390 struct osd_object *oie_obj;
391 /** used in ldiskfs iterator, to store the file pointer */
392 struct file oie_file;
393 /** how many entries have been read-cached from storage */
395 /** current entry is being iterated by caller */
397 /** current processing entry */
398 struct osd_it_ea_dirent *oie_dirent;
399 /** buffer to hold entries, size == OSD_IT_EA_BUFSIZE */
404 * Iterator's in-memory data structure for IAM mode.
407 struct osd_object *oi_obj;
408 struct iam_path_descr *oi_ipd;
409 struct iam_iterator oi_it;
412 #define MAX_BLOCKS_PER_PAGE (CFS_PAGE_SIZE / 512)
416 cfs_atomic_t dr_numreqs; /* number of reqs being processed */
421 unsigned int dr_ignore_quota:1;
422 unsigned int dr_elapsed_valid:1; /* we really did count time */
423 unsigned int dr_rw:1;
424 struct page *dr_pages[PTLRPC_MAX_BRW_PAGES];
425 unsigned long dr_blocks[PTLRPC_MAX_BRW_PAGES*MAX_BLOCKS_PER_PAGE];
426 unsigned long dr_start_time;
427 unsigned long dr_elapsed; /* how long io took */
428 struct osd_device *dr_dev;
/*
 * Scratch state for the osd layer, obtained through osd_oti_get(env).
 * It bundles temporaries that the helpers in this header hand out by
 * address (dentries, IPD buffers), so their contents are only valid until
 * the next helper call that reuses the same member.
 */
431 struct osd_thread_info {
432 const struct lu_env *oti_env;
434 * used for index operations.
436 struct dentry oti_obj_dentry;
/* oti_obj_dentry / oti_child_dentry: parent and child dentries filled in by osd_child_dentry_by_inode(). */
437 struct dentry oti_child_dentry;
439 /** dentry for Iterator context. */
440 struct dentry oti_it_dentry;
441 struct htree_lock *oti_hlock;
443 struct lu_fid oti_fid;
444 struct osd_inode_id oti_id;
445 struct ost_id oti_ostid;
448 * XXX temporary: for ->i_op calls.
450 struct timespec oti_time;
452 * XXX temporary: fake struct file for osd_object_sync
454 struct file oti_file;
456 * XXX temporary: for capa operations.
458 struct lustre_capa_key oti_capa_key;
459 struct lustre_capa oti_capa;
461 /** osd_device reference, initialized in osd_trans_start() and
462 used in osd_trans_stop() */
463 struct osd_device *oti_dev;
466 * following ipd and it structures are used for osd_index_iam_lookup()
467 * these are defined separately as we might do index operation
468 * in open iterator session.
471 /** osd iterator context used for iterator session */
474 struct osd_it_iam oti_it;
475 /** ldiskfs iterator data structure, see osd_it_ea_{init, fini} */
476 struct osd_it_ea oti_it_ea;
479 /** pre-allocated buffer used by oti_it_ea, size OSD_IT_EA_BUFSIZE */
482 cfs_kstatfs_t oti_ksfs;
484 /** IAM iterator for index operation. */
485 struct iam_iterator oti_idx_it;
487 /** union to guarantee that ->oti_ipd[] has proper alignment. */
489 char oti_it_ipd[DX_IPD_MAX_SIZE];
490 long long oti_alignment_lieutenant;
494 char oti_idx_ipd[DX_IPD_MAX_SIZE];
495 long long oti_alignment_lieutenant_colonel;
502 /** used in osd_fid_set() to put xattr */
503 struct lu_buf oti_buf;
504 /** used in osd_ea_fid_set() to set fid into common ea */
505 struct lustre_mdt_attrs oti_mdt_attrs;
507 struct osd_iobuf oti_iobuf;
508 struct inode oti_inode;
509 int oti_created[PTLRPC_MAX_BRW_PAGES];
510 #ifdef HAVE_QUOTA_SUPPORT
511 struct osd_ctxt oti_ctxt;
513 struct lu_env oti_obj_delete_tx_env;
514 #define OSD_FID_REC_SZ 32
515 char oti_ldp[OSD_FID_REC_SZ];
516 char oti_ldp2[OSD_FID_REC_SZ];
519 extern int ldiskfs_pdo;
/*
 * procfs and statistics entry points of the osd module.  Only prototypes
 * are given here; the definitions are not part of this header.
 */
523 void lprocfs_osd_init_vars(struct lprocfs_static_vars *lvars);
524 int osd_procfs_init(struct osd_device *osd, const char *name);
525 int osd_procfs_fini(struct osd_device *osd);
526 void osd_lprocfs_time_start(const struct lu_env *env);
527 void osd_lprocfs_time_end(const struct lu_env *env,
528 struct osd_device *osd, int op);
529 void osd_brw_stats_update(struct osd_device *osd, struct osd_iobuf *iobuf);
/*
 * Core osd entry points shared between the module's source files: statfs,
 * capability check, quota id declaration and inode lookup by
 * osd_inode_id.  Prototypes only; behaviour is defined by the
 * implementations, which are not visible here.
 */
532 int osd_statfs(const struct lu_env *env, struct dt_device *dev,
533 struct obd_statfs *sfs);
534 int osd_object_auth(const struct lu_env *env, struct dt_object *dt,
535 struct lustre_capa *capa, __u64 opc);
536 void osd_declare_qid(struct dt_object *dt, struct osd_thandle *oh,
537 int type, uid_t id, struct inode *inode);
538 struct inode *osd_iget(struct osd_thread_info *info,
539 struct osd_device *dev,
540 const struct osd_inode_id *id);
/*
 * osd_compat_* interface: setup/teardown for a device, plus
 * lookup/insert/delete operations that relate a fid to an osd_inode_id.
 * Two families are declared, "objid" and "spec".
 * NOTE(review): presumably these back the legacy OST objid mapping kept in
 * osd_device::od_ost_map -- confirm against the implementation.
 */
542 int osd_compat_init(struct osd_device *dev);
543 void osd_compat_fini(struct osd_device *dev);
544 int osd_compat_objid_lookup(struct osd_thread_info *info,
545 struct osd_device *osd,
546 const struct lu_fid *fid, struct osd_inode_id *id);
547 int osd_compat_objid_insert(struct osd_thread_info *info,
548 struct osd_device *osd,
549 const struct lu_fid *fid,
550 const struct osd_inode_id *id, struct thandle *th);
551 int osd_compat_objid_delete(struct osd_thread_info *info,
552 struct osd_device *osd,
553 const struct lu_fid *fid, struct thandle *th);
554 int osd_compat_spec_lookup(struct osd_thread_info *info,
555 struct osd_device *osd,
556 const struct lu_fid *fid, struct osd_inode_id *id);
557 int osd_compat_spec_insert(struct osd_thread_info *info,
558 struct osd_device *osd,
559 const struct lu_fid *fid,
560 const struct osd_inode_id *id, struct thandle *th);
563 * Invariants, assertions.
567 * XXX: do not enable this, until invariant checking code is made thread safe
568 * in the face of pdirops locking.
570 #define OSD_INVARIANT_CHECKS (0)
572 #if OSD_INVARIANT_CHECKS
/*
 * Consistency predicate for an osd_object; only compiled when
 * OSD_INVARIANT_CHECKS is non-zero.  The visible conditions are:
 *  - if the object has an inode, that inode belongs to the device's super
 *    block and has a positive i_count;
 *  - if the object has an index directory whose container has an
 *    ic_object, that ic_object is the object's inode.
 *
 * The container member is spelled od_container, matching its declaration
 * in struct osd_directory (the previous spelling named a member that does
 * not exist, so this code could not compile once enabled).
 *
 * NOTE(review): osd_sb() and osd_obj2dev() are defined further down in
 * this header; enabling the checks also requires them to be declared
 * before this point.
 */
573 static inline int osd_invariant(const struct osd_object *obj)
577 ergo(obj->oo_inode != NULL,
578 obj->oo_inode->i_sb == osd_sb(osd_obj2dev(obj)) &&
579 atomic_read(&obj->oo_inode->i_count) > 0) &&
580 ergo(obj->oo_dir != NULL &&
581 obj->oo_dir->od_container.ic_object != NULL,
582 obj->oo_dir->od_container.ic_object == obj->oo_inode);
585 #define osd_invariant(obj) (1)
/*
 * Select the Object Index (OI) container responsible for a fid.
 *
 * Asserts that the fid is neither IDIF nor IGIF and that the OI table
 * exists with at least one entry, then indexes od_oi_table with
 * f_seq & (od_oi_count - 1).
 *
 * NOTE(review): the mask only reaches every slot when od_oi_count is a
 * power of two -- confirm where the count is set.
 */
588 static inline struct osd_oi *osd_fid2oi(struct osd_device *osd,
589 const struct lu_fid *fid)
591 LASSERT(!fid_is_idif(fid));
592 LASSERT(!fid_is_igif(fid));
593 LASSERT(osd->od_oi_table != NULL && osd->od_oi_count >= 1);
594 /* It works even when od_oi_count equals 1, although that is unexpected;
595 * the only reason to set it to 1 is for performance measurement */
596 return osd->od_oi_table[fid->f_seq & (osd->od_oi_count - 1)];
599 extern const struct lu_device_operations osd_lu_ops;
/*
 * Check that a lu_device belongs to the osd layer by comparing its ld_ops
 * with osd_lu_ops.  The comparison is the consequent of ergo(); presumably
 * ergo(a, b) is logical implication, in which case a NULL device or NULL
 * ld_ops also yields true -- verify against the ergo() definition.
 */
601 static inline int lu_device_is_osd(const struct lu_device *d)
603 return ergo(d != NULL && d->ld_ops != NULL, d->ld_ops == &osd_lu_ops);
/*
 * Convert a dt_device into the osd_device that embeds it as od_dt_dev.
 * Asserts first that the embedded lu_device passes lu_device_is_osd().
 */
606 static inline struct osd_device *osd_dt_dev(const struct dt_device *d)
608 LASSERT(lu_device_is_osd(&d->dd_lu_dev));
609 return container_of0(d, struct osd_device, od_dt_dev);
/*
 * Convert a lu_device into its osd_device: asserts the device passes
 * lu_device_is_osd(), steps out to the enclosing dt_device (member
 * dd_lu_dev) and then delegates to osd_dt_dev().
 */
612 static inline struct osd_device *osd_dev(const struct lu_device *d)
614 LASSERT(lu_device_is_osd(d));
615 return osd_dt_dev(container_of0(d, struct dt_device, dd_lu_dev));
/* Return the osd_device of an osd_object, reached through the lo_dev of its embedded lu_object. */
618 static inline struct osd_device *osd_obj2dev(const struct osd_object *o)
620 return osd_dev(o->oo_dt.do_lu.lo_dev);
/*
 * Return the super block of the mount backing this device
 * (od_mount->lmi_mnt->mnt_sb).  No pointer in the chain is checked for NULL.
 */
623 static inline struct super_block *osd_sb(const struct osd_device *dev)
625 return dev->od_mount->lmi_mnt->mnt_sb;
/* Non-zero when the object's inode is the inode of the root dentry of the device's super block. */
628 static inline int osd_object_is_root(const struct osd_object *obj)
630 return osd_sb(osd_obj2dev(obj))->s_root->d_inode == obj->oo_inode;
/*
 * Convert a lu_object into the osd_object that embeds it (as oo_dt.do_lu).
 * Asserts first that the object's device passes lu_device_is_osd().
 */
633 static inline struct osd_object *osd_obj(const struct lu_object *o)
635 LASSERT(lu_device_is_osd(o->lo_dev));
636 return container_of0(o, struct osd_object, oo_dt.do_lu);
/* Convert a dt_object into its osd_object by delegating to osd_obj() on the embedded lu_object. */
639 static inline struct osd_object *osd_dt_obj(const struct dt_object *d)
641 return osd_obj(&d->do_lu);
/* Return the lu_device embedded in an osd_device (od_dt_dev.dd_lu_dev). */
644 static inline struct lu_device *osd2lu_dev(struct osd_device *osd)
646 return &osd->od_dt_dev.dd_lu_dev;
/* Return the journal of the device's backing file system: LDISKFS_SB(osd_sb(dev))->s_journal. */
649 static inline journal_t *osd_journal(const struct osd_device *dev)
651 return LDISKFS_SB(osd_sb(dev))->s_journal;
654 extern const struct dt_body_operations osd_body_ops;
655 extern struct lu_context_key osd_key;
/* Fetch the osd_thread_info stored under osd_key in the environment's context (env->le_ctx). */
657 static inline struct osd_thread_info *osd_oti_get(const struct lu_env *env)
659 return lu_context_key_get(&env->le_ctx, &osd_key);
662 extern const struct dt_body_operations osd_body_ops_new;
/*
 * Obtain an IAM path descriptor for `bag` through the container's
 * id_ipd_alloc operation, handing it the oti_it_ipd area of the caller's
 * osd_thread_info.  Release the result with osd_ipd_put().
 */
668 struct iam_path_descr *osd_it_ipd_get(const struct lu_env *env,
669 const struct iam_container *bag)
671 return bag->ic_descr->id_ops->id_ipd_alloc(bag,
672 osd_oti_get(env)->oti_it_ipd);
/*
 * Same as osd_it_ipd_get() but backed by the separate oti_idx_ipd area, so
 * a descriptor obtained here does not share storage with one obtained from
 * osd_it_ipd_get().
 */
676 struct iam_path_descr *osd_idx_ipd_get(const struct lu_env *env,
677 const struct iam_container *bag)
679 return bag->ic_descr->id_ops->id_ipd_alloc(bag,
680 osd_oti_get(env)->oti_idx_ipd);
/*
 * Release a path descriptor through the container's id_ipd_free operation.
 * `env` is not used in the visible body.
 */
683 static inline void osd_ipd_put(const struct lu_env *env,
684 const struct iam_container *bag,
685 struct iam_path_descr *ipd)
687 bag->ic_descr->id_ops->id_ipd_free(ipd);
690 int osd_ldiskfs_read(struct inode *inode, void *buf, int size, loff_t *offs);
/*
 * Set up a parent/child dentry pair for `name` under `inode`, using the
 * two scratch dentries kept in the caller's osd_thread_info:
 *  - oti_obj_dentry is pointed at `inode` and its super block;
 *  - oti_child_dentry gets the name, its length and oti_obj_dentry as parent.
 * Both name hashes are reset to 0.
 *
 * Because the dentries live in the osd_thread_info, a later call on the
 * same env overwrites the result of an earlier one.  `name` is stored by
 * pointer, not copied.  The return statement is not visible in this
 * listing; presumably the child dentry is returned.
 */
693 struct dentry *osd_child_dentry_by_inode(const struct lu_env *env,
695 const char *name, const int namelen)
697 struct osd_thread_info *info = osd_oti_get(env);
698 struct dentry *child_dentry = &info->oti_child_dentry;
699 struct dentry *obj_dentry = &info->oti_obj_dentry;
701 obj_dentry->d_inode = inode;
702 obj_dentry->d_sb = inode->i_sb;
703 obj_dentry->d_name.hash = 0;
705 child_dentry->d_name.hash = 0;
706 child_dentry->d_parent = obj_dentry;
707 child_dentry->d_name.name = name;
708 child_dentry->d_name.len = namelen;
713 * Helper function to pack the fid, ldiskfs stores fid in packed format.
/*
 * Fill `pack` from `fid`: the fid is converted with fid_cpu_to_be() into
 * the caller-supplied scratch `befider`, copied into pack->fp_area, and
 * fp_len is set to sizeof(struct lu_fid) + 1 (the extra byte presumably
 * accounts for the fp_len byte itself, since the length covers the whole
 * record).
 *
 * NOTE(review): `fid` is a const struct dt_rec * that is cast to a
 * non-const struct lu_fid *; the cast drops the const qualifier.
 */
716 void osd_fid_pack(struct osd_fid_pack *pack, const struct dt_rec *fid,
717 struct lu_fid *befider)
719 fid_cpu_to_be(befider, (struct lu_fid *)fid);
720 memcpy(pack->fp_area, befider, sizeof(*befider));
721 pack->fp_len = sizeof(*befider) + 1;
/*
 * Inverse of osd_fid_pack(): when pack->fp_len equals
 * sizeof(struct lu_fid) + 1, copy fp_area into `fid` and convert it in
 * place with fid_be_to_cpu().  An unexpected length is reported through
 * CERROR.
 *
 * The result variable, the default label and the return statements are not
 * visible in this listing, so the exact return values are not documented
 * here.
 */
725 int osd_fid_unpack(struct lu_fid *fid, const struct osd_fid_pack *pack)
730 switch (pack->fp_len) {
731 case sizeof *fid + 1:
732 memcpy(fid, pack->fp_area, sizeof *fid);
733 fid_be_to_cpu(fid, fid);
736 CERROR("Unexpected packed fid size: %d\n", pack->fp_len);
742 #endif /* __KERNEL__ */
743 #endif /* _OSD_INTERNAL_H */