1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
6 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 only,
10 * as published by the Free Software Foundation.
12 * This program is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License version 2 for more details (a copy is included
16 * in the LICENSE file that accompanied this code).
18 * You should have received a copy of the GNU General Public License
19 * version 2 along with this program; If not, see
20 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
22 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23 * CA 95054 USA or visit www.sun.com if you need additional information or
29 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
30 * Use is subject to license terms.
32 * Copyright (c) 2011, 2012, Whamcloud, Inc.
35 * This file is part of Lustre, http://www.lustre.org/
36 * Lustre is a trademark of Sun Microsystems, Inc.
38 * lustre/osd/osd_internal.h
40 * Shared definitions and declarations for osd module
42 * Author: Nikita Danilov <nikita@clusterfs.com>
45 #ifndef _OSD_INTERNAL_H
46 #define _OSD_INTERNAL_H
48 #if defined(__KERNEL__)
50 /* struct rw_semaphore */
51 #include <linux/rwsem.h>
53 #include <linux/dcache.h>
55 #include <linux/dirent.h>
57 #include <ldiskfs/ldiskfs.h>
58 #include <ldiskfs/ldiskfs_jbd2.h>
59 #ifdef HAVE_LDISKFS_JOURNAL_CALLBACK_ADD
60 # define journal_callback ldiskfs_journal_cb_entry
61 # define osd_journal_callback_set(handle, func, jcb) \
62 ldiskfs_journal_callback_add(handle, func, jcb)
64 # define osd_journal_callback_set(handle, func, jcb) \
65 jbd2_journal_callback_set(handle, func, jcb)
68 /* fsfilt_{get|put}_ops */
69 #include <lustre_fsfilt.h>
73 /* class_register_type(), class_unregister_type(), class_get_type() */
74 #include <obd_class.h>
75 #include <lustre_disk.h>
76 #include <dt_object.h>
83 #define OSD_OII_NOGEN (0)
84 #define OSD_COUNTERS (0)
86 /** Enable thandle usage statistics */
87 #define OSD_THANDLE_STATS (0)
89 #ifdef HAVE_QUOTA_SUPPORT
93 cfs_kernel_cap_t oc_cap;
97 struct osd_directory {
98 struct iam_container od_container;
99 struct iam_descr od_descr;
103 * Object Index (oi) instance.
107 * underlying index object, where fid->id mapping is stored.
109 struct inode *oi_inode;
110 struct osd_directory oi_dir;
113 extern const int osd_dto_credits_noquota[];
116 struct dt_object oo_dt;
118 * Inode for file system object represented by this osd_object. This
119 * inode is pinned for the whole duration of lu_object life.
121 * Not modified concurrently (either setup early during object
122 * creation, or assigned by osd_object_create() under write lock).
124 struct inode *oo_inode;
126 * to protect index ops.
128 struct htree_lock_head *oo_hl_head;
129 cfs_rw_semaphore_t oo_ext_idx_sem;
130 cfs_rw_semaphore_t oo_sem;
131 struct osd_directory *oo_dir;
132 /** protects inode attributes. */
133 cfs_spinlock_t oo_guard;
135 * Following two members are used to indicate the presence of dot and
136 * dotdot in the given directory. This is required for interop mode
139 int oo_compat_dot_created;
140 int oo_compat_dotdot_created;
142 const struct lu_env *oo_owner;
143 #ifdef CONFIG_LOCKDEP
144 struct lockdep_map oo_dep_map;
148 #ifdef HAVE_LDISKFS_PDO
150 #define osd_ldiskfs_find_entry(dir, dentry, de, lock) \
151 ll_ldiskfs_find_entry(dir, dentry, de, lock)
152 #define osd_ldiskfs_add_entry(handle, child, cinode, hlock) \
153 ldiskfs_add_entry(handle, child, cinode, hlock)
155 #else /* HAVE_LDISKFS_PDO */
161 struct htree_lock_head {
165 #define ldiskfs_htree_lock(lock, head, inode, op) do { LBUG(); } while (0)
166 #define ldiskfs_htree_unlock(lock) do { LBUG(); } while (0)
168 static inline struct htree_lock_head *ldiskfs_htree_lock_head_alloc(int dep)
174 #define ldiskfs_htree_lock_head_free(lh) do { LBUG(); } while (0)
176 #define LDISKFS_DUMMY_HTREE_LOCK 0xbabecafe
178 static inline struct htree_lock *ldiskfs_htree_lock_alloc(void)
180 return (struct htree_lock *)LDISKFS_DUMMY_HTREE_LOCK;
183 static inline void ldiskfs_htree_lock_free(struct htree_lock *lk)
185 LASSERT((unsigned long)lk == LDISKFS_DUMMY_HTREE_LOCK);
188 #define HTREE_HBITS_DEF 0
190 #define osd_ldiskfs_find_entry(dir, dentry, de, lock) \
191 ll_ldiskfs_find_entry(dir, dentry, de)
192 #define osd_ldiskfs_add_entry(handle, child, cinode, lock) \
193 ldiskfs_add_entry(handle, child, cinode)
195 #endif /* HAVE_LDISKFS_PDO */
197 extern const int osd_dto_credits_noquota[];
204 struct dt_device od_dt_dev;
205 /* information about underlying file system */
206 struct lustre_mount_info *od_mount;
207 struct vfsmount *od_mnt;
209 struct osd_oi **od_oi_table;
210 /* total number of OI containers */
215 unsigned int od_fl_capa:1;
216 unsigned long od_capa_timeout;
218 struct lustre_capa_key *od_capa_keys;
219 cfs_hlist_head_t *od_capa_hash;
221 cfs_proc_dir_entry_t *od_proc_entry;
222 struct lprocfs_stats *od_stats;
224 * statfs optimization: we cache a bit.
226 cfs_time_t od_osfs_age;
227 struct obd_statfs od_statfs;
228 cfs_spinlock_t od_osfs_lock;
231 * The following flag indicates, if it is interop mode or not.
232 * It will be initialized, using mount param.
236 struct fsfilt_operations *od_fsops;
239 * mapping for legacy OST objids
241 struct osd_compat_objid *od_ost_map;
243 unsigned long long od_readcache_max_filesize;
245 int od_writethrough_cache;
247 struct brw_stats od_brw_stats;
248 cfs_atomic_t od_r_in_flight;
249 cfs_atomic_t od_w_in_flight;
/*
 * Declare/execute bookkeeping for transaction handles.
 *
 * With OSD_TRACK_DECLARES defined:
 *  - OSD_DECLARE_OP(oh, op) asserts that oh->ot_handle is still NULL and
 *    increments the oh->ot_declare_<op> counter;
 *  - OSD_EXEC_OP(handle, op) maps \a handle (a pointer to the ot_super
 *    member) back to its osd_thandle and decrements ot_declare_<op> when
 *    the counter is positive.
 *
 * Without it both macros expand to an empty statement.
 *
 * The bodies are wrapped in do { } while (0) so that the macros behave as a
 * single statement (safe in un-braced if/else), every use of a macro
 * argument is parenthesized, and the temporary inside OSD_EXEC_OP has a name
 * that cannot capture an identifier appearing in the caller's argument.
 */
#define OSD_TRACK_DECLARES
#ifdef OSD_TRACK_DECLARES
#define OSD_DECLARE_OP(oh, op) do {                                     \
        LASSERT((oh)->ot_handle == NULL);                               \
        ((oh)->ot_declare_ ##op)++;                                     \
} while (0)
#define OSD_EXEC_OP(handle, op) do {                                    \
        struct osd_thandle *exec_op_oh_;                                \
        exec_op_oh_ = container_of0((handle), struct osd_thandle,       \
                                    ot_super);                          \
        if ((exec_op_oh_->ot_declare_ ##op) > 0)                        \
                (exec_op_oh_->ot_declare_ ##op)--;                      \
} while (0)
#else
#define OSD_DECLARE_OP(oh, op)  do { } while (0)
#define OSD_EXEC_OP(oh, op)     do { } while (0)
#endif
269 /* At most 10 uid/gids are affected in a transaction, and
270 * that's rename case:
271 * - 2 for source parent uid & gid;
272 * - 2 for source child uid & gid ('..' entry update when the child
274 * - 2 for target parent uid & gid;
275 * - 2 for target child uid & gid (if the target child exists);
276 * - 2 for root uid & gid (last_rcvd, llog, etc);
278 * The 0 to (OSD_MAX_UGID_CNT - 1) bits of ot_id_type is for indicating
279 * the id type of each id in the ot_id_array.
281 #define OSD_MAX_UGID_CNT 10
284 struct thandle ot_super;
286 struct journal_callback ot_jcb;
287 cfs_list_t ot_dcb_list;
288 /* Link to the device, for debugging. */
289 struct lu_ref_link *ot_dev_link;
290 unsigned short ot_credits;
291 unsigned short ot_id_cnt;
292 unsigned short ot_id_type;
293 uid_t ot_id_array[OSD_MAX_UGID_CNT];
295 #ifdef OSD_TRACK_DECLARES
296 unsigned char ot_declare_attr_set;
297 unsigned char ot_declare_punch;
298 unsigned char ot_declare_xattr_set;
299 unsigned char ot_declare_create;
300 unsigned char ot_declare_destroy;
301 unsigned char ot_declare_ref_add;
302 unsigned char ot_declare_ref_del;
303 unsigned char ot_declare_write;
304 unsigned char ot_declare_insert;
305 unsigned char ot_declare_delete;
308 #if OSD_THANDLE_STATS
309 /** time when this handle was allocated */
310 cfs_time_t oth_alloced;
312 /** time when this thandle was started */
313 cfs_time_t oth_started;
318 * Basic transaction credit op
328 DTO_LOG_REC, /**< XXX temporary: dt layer knows nothing about llog. */
342 LPROC_OSD_READ_BYTES = 0,
343 LPROC_OSD_WRITE_BYTES = 1,
344 LPROC_OSD_GET_PAGE = 2,
345 LPROC_OSD_NO_PAGE = 3,
346 LPROC_OSD_CACHE_ACCESS = 4,
347 LPROC_OSD_CACHE_HIT = 5,
348 LPROC_OSD_CACHE_MISS = 6,
350 #if OSD_THANDLE_STATS
351 LPROC_OSD_THANDLE_STARTING,
352 LPROC_OSD_THANDLE_OPEN,
353 LPROC_OSD_THANDLE_CLOSING,
360 * Storage representation for fids.
362 * Variable size, first byte contains the length of the whole record.
364 struct osd_fid_pack {
365 unsigned char fp_len;
366 char fp_area[sizeof(struct lu_fid)];
369 struct osd_it_ea_dirent {
370 struct lu_fid oied_fid;
373 unsigned short oied_namelen;
374 unsigned int oied_type;
376 } __attribute__((packed));
379 * The in-memory dirent (osd_it_ea_dirent) is larger than struct
380 * lu_dirent, so one osd readdir yields fewer entries than an mdd dir
381 * page needs. The buffer is therefore enlarged so that a single ext3
382 * readdir is enough for every mdd readdir page.
385 #define OSD_IT_EA_BUFSIZE (CFS_PAGE_SIZE + CFS_PAGE_SIZE/4)
388 * This is iterator's in-memory data structure in interoperability
389 * mode (i.e. iterator over ldiskfs style directory)
392 struct osd_object *oie_obj;
393 /** used in ldiskfs iterator, to store the file pointer */
394 struct file oie_file;
395 /** how many entries have been read-cached from storage */
397 /** current entry is being iterated by caller */
399 /** current processing entry */
400 struct osd_it_ea_dirent *oie_dirent;
401 /** buffer to hold entries, size == OSD_IT_EA_BUFSIZE */
406 * Iterator's in-memory data structure for IAM mode.
409 struct osd_object *oi_obj;
410 struct iam_path_descr *oi_ipd;
411 struct iam_iterator oi_it;
414 #define MAX_BLOCKS_PER_PAGE (CFS_PAGE_SIZE / 512)
418 cfs_atomic_t dr_numreqs; /* number of reqs being processed */
423 unsigned int dr_ignore_quota:1;
424 unsigned int dr_elapsed_valid:1; /* we really did count time */
425 unsigned int dr_rw:1;
426 struct page *dr_pages[PTLRPC_MAX_BRW_PAGES];
427 unsigned long dr_blocks[PTLRPC_MAX_BRW_PAGES*MAX_BLOCKS_PER_PAGE];
428 unsigned long dr_start_time;
429 unsigned long dr_elapsed; /* how long io took */
430 struct osd_device *dr_dev;
433 struct osd_thread_info {
434 const struct lu_env *oti_env;
436 * used for index operations.
438 struct dentry oti_obj_dentry;
439 struct dentry oti_child_dentry;
441 /** dentry for Iterator context. */
442 struct dentry oti_it_dentry;
443 struct htree_lock *oti_hlock;
445 struct lu_fid oti_fid;
446 struct osd_inode_id oti_id;
447 struct ost_id oti_ostid;
450 * XXX temporary: for ->i_op calls.
452 struct timespec oti_time;
454 * XXX temporary: fake struct file for osd_object_sync
456 struct file oti_file;
458 * XXX temporary: for capa operations.
460 struct lustre_capa_key oti_capa_key;
461 struct lustre_capa oti_capa;
463 /** osd_device reference, initialized in osd_trans_start() and
464 used in osd_trans_stop() */
465 struct osd_device *oti_dev;
468 * following ipd and it structures are used for osd_index_iam_lookup()
469 * these are defined separately as we might do index operation
470 * in open iterator session.
473 /** osd iterator context used for iterator session */
476 struct osd_it_iam oti_it;
477 /** ldiskfs iterator data structure, see osd_it_ea_{init, fini} */
478 struct osd_it_ea oti_it_ea;
481 /** pre-allocated buffer used by oti_it_ea, size OSD_IT_EA_BUFSIZE */
484 cfs_kstatfs_t oti_ksfs;
486 /** IAM iterator for index operation. */
487 struct iam_iterator oti_idx_it;
489 /** union to guarantee that ->oti_ipd[] has proper alignment. */
491 char oti_it_ipd[DX_IPD_MAX_SIZE];
492 long long oti_alignment_lieutenant;
496 char oti_idx_ipd[DX_IPD_MAX_SIZE];
497 long long oti_alignment_lieutenant_colonel;
504 /** used in osd_fid_set() to put xattr */
505 struct lu_buf oti_buf;
506 /** used in osd_ea_fid_set() to set fid into common ea */
507 struct lustre_mdt_attrs oti_mdt_attrs;
509 struct osd_iobuf oti_iobuf;
510 struct inode oti_inode;
511 int oti_created[PTLRPC_MAX_BRW_PAGES];
512 #ifdef HAVE_QUOTA_SUPPORT
513 struct osd_ctxt oti_ctxt;
515 struct lu_env oti_obj_delete_tx_env;
516 #define OSD_FID_REC_SZ 32
517 char oti_ldp[OSD_FID_REC_SZ];
518 char oti_ldp2[OSD_FID_REC_SZ];
521 extern int ldiskfs_pdo;
525 void lprocfs_osd_init_vars(struct lprocfs_static_vars *lvars);
526 int osd_procfs_init(struct osd_device *osd, const char *name);
527 int osd_procfs_fini(struct osd_device *osd);
528 void osd_lprocfs_time_start(const struct lu_env *env);
529 void osd_lprocfs_time_end(const struct lu_env *env,
530 struct osd_device *osd, int op);
531 void osd_brw_stats_update(struct osd_device *osd, struct osd_iobuf *iobuf);
534 int osd_statfs(const struct lu_env *env, struct dt_device *dev,
535 struct obd_statfs *sfs);
536 int osd_object_auth(const struct lu_env *env, struct dt_object *dt,
537 struct lustre_capa *capa, __u64 opc);
538 void osd_declare_qid(struct dt_object *dt, struct osd_thandle *oh,
539 int type, uid_t id, struct inode *inode);
540 struct inode *osd_iget(struct osd_thread_info *info,
541 struct osd_device *dev,
542 const struct osd_inode_id *id);
544 int osd_compat_init(struct osd_device *dev);
545 void osd_compat_fini(struct osd_device *dev);
546 int osd_compat_objid_lookup(struct osd_thread_info *info,
547 struct osd_device *osd,
548 const struct lu_fid *fid, struct osd_inode_id *id);
549 int osd_compat_objid_insert(struct osd_thread_info *info,
550 struct osd_device *osd,
551 const struct lu_fid *fid,
552 const struct osd_inode_id *id, struct thandle *th);
553 int osd_compat_objid_delete(struct osd_thread_info *info,
554 struct osd_device *osd,
555 const struct lu_fid *fid, struct thandle *th);
556 int osd_compat_spec_lookup(struct osd_thread_info *info,
557 struct osd_device *osd,
558 const struct lu_fid *fid, struct osd_inode_id *id);
559 int osd_compat_spec_insert(struct osd_thread_info *info,
560 struct osd_device *osd,
561 const struct lu_fid *fid,
562 const struct osd_inode_id *id, struct thandle *th);
565 * Invariants, assertions.
569 * XXX: do not enable this, until invariant checking code is made thread safe
570 * in the face of pdirops locking.
572 #define OSD_INVARIANT_CHECKS (0)
#if OSD_INVARIANT_CHECKS
/**
 * Consistency check for an osd object.
 *
 * The invariant holds when \a obj is non-NULL and
 *  - if the object has an inode, that inode belongs to the super block of
 *    the object's device and has a positive reference count;
 *  - if the object has an index directory whose container is bound to an
 *    inode, that inode is the object's own inode.
 *
 * \retval non-zero  the invariant holds
 * \retval 0         the invariant is violated
 */
static inline int osd_invariant(const struct osd_object *obj)
{
        return
                obj != NULL &&
                ergo(obj->oo_inode != NULL,
                     obj->oo_inode->i_sb == osd_sb(osd_obj2dev(obj)) &&
                     atomic_read(&obj->oo_inode->i_count) > 0) &&
                /* the member of struct osd_directory is od_container */
                ergo(obj->oo_dir != NULL &&
                     obj->oo_dir->od_container.ic_object != NULL,
                     obj->oo_dir->od_container.ic_object == obj->oo_inode);
}
#else
/* invariant checking disabled: always report success */
#define osd_invariant(obj) (1)
#endif
590 static inline struct osd_oi *osd_fid2oi(struct osd_device *osd,
591 const struct lu_fid *fid)
593 LASSERT(!fid_is_idif(fid));
594 LASSERT(!fid_is_igif(fid));
595 LASSERT(osd->od_oi_table != NULL && osd->od_oi_count >= 1);
596 /* It can work even od_oi_count equals to 1 although it's unexpected,
597 * the only reason we set it to 1 is for performance measurement */
598 return osd->od_oi_table[fid->f_seq & (osd->od_oi_count - 1)];
601 extern const struct lu_device_operations osd_lu_ops;
603 static inline int lu_device_is_osd(const struct lu_device *d)
605 return ergo(d != NULL && d->ld_ops != NULL, d->ld_ops == &osd_lu_ops);
608 static inline struct osd_device *osd_dt_dev(const struct dt_device *d)
610 LASSERT(lu_device_is_osd(&d->dd_lu_dev));
611 return container_of0(d, struct osd_device, od_dt_dev);
614 static inline struct osd_device *osd_dev(const struct lu_device *d)
616 LASSERT(lu_device_is_osd(d));
617 return osd_dt_dev(container_of0(d, struct dt_device, dd_lu_dev));
620 static inline struct osd_device *osd_obj2dev(const struct osd_object *o)
622 return osd_dev(o->oo_dt.do_lu.lo_dev);
625 static inline struct super_block *osd_sb(const struct osd_device *dev)
627 return dev->od_mount->lmi_mnt->mnt_sb;
630 static inline int osd_object_is_root(const struct osd_object *obj)
632 return osd_sb(osd_obj2dev(obj))->s_root->d_inode == obj->oo_inode;
635 static inline struct osd_object *osd_obj(const struct lu_object *o)
637 LASSERT(lu_device_is_osd(o->lo_dev));
638 return container_of0(o, struct osd_object, oo_dt.do_lu);
641 static inline struct osd_object *osd_dt_obj(const struct dt_object *d)
643 return osd_obj(&d->do_lu);
646 static inline struct lu_device *osd2lu_dev(struct osd_device *osd)
648 return &osd->od_dt_dev.dd_lu_dev;
651 static inline journal_t *osd_journal(const struct osd_device *dev)
653 return LDISKFS_SB(osd_sb(dev))->s_journal;
656 extern const struct dt_body_operations osd_body_ops;
657 extern struct lu_context_key osd_key;
659 static inline struct osd_thread_info *osd_oti_get(const struct lu_env *env)
661 return lu_context_key_get(&env->le_ctx, &osd_key);
664 extern const struct dt_body_operations osd_body_ops_new;
670 struct iam_path_descr *osd_it_ipd_get(const struct lu_env *env,
671 const struct iam_container *bag)
673 return bag->ic_descr->id_ops->id_ipd_alloc(bag,
674 osd_oti_get(env)->oti_it_ipd);
678 struct iam_path_descr *osd_idx_ipd_get(const struct lu_env *env,
679 const struct iam_container *bag)
681 return bag->ic_descr->id_ops->id_ipd_alloc(bag,
682 osd_oti_get(env)->oti_idx_ipd);
685 static inline void osd_ipd_put(const struct lu_env *env,
686 const struct iam_container *bag,
687 struct iam_path_descr *ipd)
689 bag->ic_descr->id_ops->id_ipd_free(ipd);
692 int osd_ldiskfs_read(struct inode *inode, void *buf, int size, loff_t *offs);
695 struct dentry *osd_child_dentry_by_inode(const struct lu_env *env,
697 const char *name, const int namelen)
699 struct osd_thread_info *info = osd_oti_get(env);
700 struct dentry *child_dentry = &info->oti_child_dentry;
701 struct dentry *obj_dentry = &info->oti_obj_dentry;
703 obj_dentry->d_inode = inode;
704 obj_dentry->d_sb = inode->i_sb;
705 obj_dentry->d_name.hash = 0;
707 child_dentry->d_name.hash = 0;
708 child_dentry->d_parent = obj_dentry;
709 child_dentry->d_name.name = name;
710 child_dentry->d_name.len = namelen;
715 * Helper function to pack the fid, ldiskfs stores fid in packed format.
718 void osd_fid_pack(struct osd_fid_pack *pack, const struct dt_rec *fid,
719 struct lu_fid *befider)
721 fid_cpu_to_be(befider, (struct lu_fid *)fid);
722 memcpy(pack->fp_area, befider, sizeof(*befider));
723 pack->fp_len = sizeof(*befider) + 1;
727 int osd_fid_unpack(struct lu_fid *fid, const struct osd_fid_pack *pack)
732 switch (pack->fp_len) {
733 case sizeof *fid + 1:
734 memcpy(fid, pack->fp_area, sizeof *fid);
735 fid_be_to_cpu(fid, fid);
738 CERROR("Unexpected packed fid size: %d\n", pack->fp_len);
744 #endif /* __KERNEL__ */
745 #endif /* _OSD_INTERNAL_H */