1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
6 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 only,
10 * as published by the Free Software Foundation.
12 * This program is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License version 2 for more details (a copy is included
16 * in the LICENSE file that accompanied this code).
18 * You should have received a copy of the GNU General Public License
19 * version 2 along with this program; If not, see
20 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
22 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23 * CA 95054 USA or visit www.sun.com if you need additional information or
29 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
30 * Use is subject to license terms.
32 * Copyright (c) 2011, 2012, Whamcloud, Inc.
35 * This file is part of Lustre, http://www.lustre.org/
36 * Lustre is a trademark of Sun Microsystems, Inc.
38 * lustre/osd/osd_internal.h
40 * Shared definitions and declarations for osd module
42 * Author: Nikita Danilov <nikita@clusterfs.com>
45 #ifndef _OSD_INTERNAL_H
46 #define _OSD_INTERNAL_H
48 #if defined(__KERNEL__)
50 /* struct rw_semaphore */
51 #include <linux/rwsem.h>
53 #include <linux/dcache.h>
55 #include <linux/dirent.h>
57 #include <ldiskfs/ldiskfs.h>
58 #include <ldiskfs/ldiskfs_jbd2.h>
/*
 * Journal callback shim.  osd_journal_callback_set(handle, func, jcb)
 * forwards its three arguments unchanged to the registration call offered
 * by the ldiskfs/jbd2 build.  When HAVE_LDISKFS_JOURNAL_CALLBACK_ADD is
 * defined, the name journal_callback is also aliased to
 * ldiskfs_journal_cb_entry and ldiskfs_journal_callback_add() is used.
 */
59 #ifdef HAVE_LDISKFS_JOURNAL_CALLBACK_ADD
60 # define journal_callback ldiskfs_journal_cb_entry
61 # define osd_journal_callback_set(handle, func, jcb) \
62 ldiskfs_journal_callback_add(handle, func, jcb)
/* Alternative definition: register through jbd2_journal_callback_set(). */
64 # define osd_journal_callback_set(handle, func, jcb) \
65 jbd2_journal_callback_set(handle, func, jcb)
68 /* fsfilt_{get|put}_ops */
69 #include <lustre_fsfilt.h>
73 /* class_register_type(), class_unregister_type(), class_get_type() */
74 #include <obd_class.h>
75 #include <lustre_disk.h>
76 #include <dt_object.h>
/*
 * Compile-time feature switches; every one of them is defined as 0 here.
 * NOTE(review): the effect of OSD_OII_NOGEN and OSD_COUNTERS is not
 * established by this header - confirm against their users.
 */
83 #define OSD_OII_NOGEN (0)
84 #define OSD_COUNTERS (0)
86 /** Enable thandle usage statistics */
/*
 * When non-zero this adds the oth_alloced/oth_started timestamps to the
 * transaction handle and the LPROC_OSD_THANDLE_* statistics counters.
 */
87 #define OSD_THANDLE_STATS (0)
/* Quota-only state: compiled in only when HAVE_QUOTA_SUPPORT is defined. */
89 #ifdef HAVE_QUOTA_SUPPORT
/*
 * NOTE(review): presumably a member of struct osd_ctxt (the type of
 * oti_ctxt in struct osd_thread_info) holding a kernel capability set -
 * confirm against the code that fills it in.
 */
93 cfs_kernel_cap_t oc_cap;
/*
 * IAM index state of an object: the IAM container together with its
 * descriptor.  struct osd_object refers to it through oo_dir.
 */
97 struct osd_directory {
/* IAM container of the index. */
98 struct iam_container od_container;
/* IAM descriptor that goes with od_container. */
99 struct iam_descr od_descr;
/*
 * Generic dt_object embedded in the osd object.  osd_obj() maps a pointer
 * to oo_dt.do_lu back to the containing osd_object with container_of0().
 */
103 struct dt_object oo_dt;
105 * Inode for file system object represented by this osd_object. This
106 * inode is pinned for the whole duration of lu_object life.
108 * Not modified concurrently (either setup early during object
109 * creation, or assigned by osd_object_create() under write lock).
111 struct inode *oo_inode;
113 * to protect index ops.
115 struct htree_lock_head *oo_hl_head;
/*
 * NOTE(review): what exactly oo_ext_idx_sem and oo_sem serialize is not
 * shown in this header - confirm against their users.
 */
116 cfs_rw_semaphore_t oo_ext_idx_sem;
117 cfs_rw_semaphore_t oo_sem;
/* IAM index state of this object (see struct osd_directory). */
118 struct osd_directory *oo_dir;
119 /** protects inode attributes. */
120 cfs_spinlock_t oo_guard;
122 * Following two members are used to indicate the presence of dot and
123 * dotdot in the given directory. This is required for interop mode
126 int oo_compat_dot_created;
127 int oo_compat_dotdot_created;
/*
 * NOTE(review): presumably the environment that currently holds the
 * object lock - confirm against the locking code.
 */
129 const struct lu_env *oo_owner;
/* Lock-dependency map; present only in CONFIG_LOCKDEP builds. */
130 #ifdef CONFIG_LOCKDEP
131 struct lockdep_map oo_dep_map;
/*
 * Parallel directory operations (PDO) compatibility layer.
 *
 * With HAVE_LDISKFS_PDO the osd_ldiskfs_find_entry()/osd_ldiskfs_add_entry()
 * wrappers pass the htree lock argument on to ll_ldiskfs_find_entry() and
 * ldiskfs_add_entry().
 */
135 #ifdef HAVE_LDISKFS_PDO
137 #define osd_ldiskfs_find_entry(dir, dentry, de, lock) \
138 ll_ldiskfs_find_entry(dir, dentry, de, lock)
139 #define osd_ldiskfs_add_entry(handle, child, cinode, hlock) \
140 ldiskfs_add_entry(handle, child, cinode, hlock)
/*
 * Without PDO support the htree lock API is replaced by stand-ins:
 * locking, unlocking and freeing a lock head hit LBUG(), and a lock
 * "allocation" hands out the LDISKFS_DUMMY_HTREE_LOCK sentinel instead
 * of real memory.
 */
142 #else /* HAVE_LDISKFS_PDO */
148 struct htree_lock_head {
152 #define ldiskfs_htree_lock(lock, head, inode, op) do { LBUG(); } while (0)
153 #define ldiskfs_htree_unlock(lock) do { LBUG(); } while (0)
155 static inline struct htree_lock_head *ldiskfs_htree_lock_head_alloc(int dep)
161 #define ldiskfs_htree_lock_head_free(lh) do { LBUG(); } while (0)
/* Sentinel value handed out in place of a real htree lock. */
163 #define LDISKFS_DUMMY_HTREE_LOCK 0xbabecafe
/* Returns the sentinel cast to a lock pointer; it must never be dereferenced. */
165 static inline struct htree_lock *ldiskfs_htree_lock_alloc(void)
167 return (struct htree_lock *)LDISKFS_DUMMY_HTREE_LOCK;
/* Only asserts that \a lk is the sentinel returned by the allocator above. */
170 static inline void ldiskfs_htree_lock_free(struct htree_lock *lk)
172 LASSERT((unsigned long)lk == LDISKFS_DUMMY_HTREE_LOCK);
175 #define HTREE_HBITS_DEF 0
/* Non-PDO wrappers: the lock argument is accepted but dropped. */
177 #define osd_ldiskfs_find_entry(dir, dentry, de, lock) \
178 ll_ldiskfs_find_entry(dir, dentry, de)
179 #define osd_ldiskfs_add_entry(handle, child, cinode, lock) \
180 ldiskfs_add_entry(handle, child, cinode)
182 #endif /* HAVE_LDISKFS_PDO */
/*
 * Table of integer credit values, defined elsewhere.
 * NOTE(review): presumably indexed by the DTO_* transaction credit
 * operations and used when quota is not accounted - confirm at the
 * definition.
 */
184 extern const int osd_dto_credits_noquota[];
/*
 * Generic dt_device embedded in the osd device.  osd_dt_dev() maps a
 * pointer to this member back to the containing osd_device with
 * container_of0().
 */
191 struct dt_device od_dt_dev;
192 /* information about underlying file system */
/* osd_sb() reaches the super block through od_mount->lmi_mnt->mnt_sb. */
193 struct lustre_mount_info *od_mount;
195 * XXX temporary stuff for object index: directory where every object
196 * is named by its fid.
198 struct dt_object *od_obj_area;
/* Array of object-index (OI) containers; osd_fid2oi() selects one per FID. */
200 struct osd_oi *od_oi_table;
201 /* total number of OI containers */
/*
 * Capability support.
 * NOTE(review): the meaning of the od_fl_capa flag and the unit of
 * od_capa_timeout are not visible in this header - confirm.
 */
206 unsigned int od_fl_capa:1;
207 unsigned long od_capa_timeout;
209 struct lustre_capa_key *od_capa_keys;
210 cfs_hlist_head_t *od_capa_hash;
/* procfs directory entry and statistics block of this device. */
212 cfs_proc_dir_entry_t *od_proc_entry;
213 struct lprocfs_stats *od_stats;
215 * statfs optimization: we cache a bit.
/*
 * NOTE(review): presumably od_osfs_age is the time of the last refresh
 * of od_kstatfs and od_osfs_lock guards both - confirm in osd_statfs().
 */
217 cfs_time_t od_osfs_age;
218 cfs_kstatfs_t od_kstatfs;
219 cfs_spinlock_t od_osfs_lock;
222 * The following flag indicates, if it is interop mode or not.
223 * It will be initialized, using mount param.
227 struct fsfilt_operations *od_fsops;
/*
 * Cache tuning knobs.
 * NOTE(review): exact semantics are defined by the I/O code - confirm.
 */
229 unsigned long long od_readcache_max_filesize;
231 int od_writethrough_cache;
/* Bulk read/write statistics and atomic counters of requests in flight. */
233 struct brw_stats od_brw_stats;
234 cfs_atomic_t od_r_in_flight;
235 cfs_atomic_t od_w_in_flight;
/*
 * Declare/execute bookkeeping for transaction handles.
 *
 * While OSD_TRACK_DECLARES is defined, every declared operation bumps the
 * matching ot_declare_<op> counter of the osd_thandle and every executed
 * operation drops it again, asserting that the operation had been declared.
 *
 * Both macros expand to a single statement (do { } while (0)), so they can
 * be used as the body of an unbraced if/else and must be followed by a
 * semicolon.
 */
#define OSD_TRACK_DECLARES
#ifdef OSD_TRACK_DECLARES
/**
 * Record that operation \a op has been declared on \a oh.
 *
 * \param oh  pointer to the struct osd_thandle; the transaction must not
 *            have been started yet (ot_handle == NULL is asserted)
 * \param op  operation suffix, selects the ot_declare_<op> counter
 */
#define OSD_DECLARE_OP(oh, op)                                          \
do {                                                                    \
        LASSERT((oh)->ot_handle == NULL);                               \
        ((oh)->ot_declare_ ##op)++;                                     \
} while (0)

/**
 * Account for the execution of a previously declared operation \a op.
 *
 * \param handle  pointer to the struct thandle embedded (as ot_super) in
 *                the struct osd_thandle
 * \param op      operation suffix, selects the ot_declare_<op> counter
 *
 * The local is deliberately given an unusual name so that a \a handle
 * expression mentioning a caller variable called "oh" is not captured.
 */
#define OSD_EXEC_OP(handle, op)                                         \
do {                                                                    \
        struct osd_thandle *osd_exec_op_oh_;                            \
                                                                        \
        osd_exec_op_oh_ = container_of0(handle, struct osd_thandle,     \
                                        ot_super);                      \
        LASSERT(osd_exec_op_oh_->ot_declare_ ##op > 0);                 \
        (osd_exec_op_oh_->ot_declare_ ##op)--;                          \
} while (0)
#else
/* Tracking disabled: both macros expand to nothing. */
#define OSD_DECLARE_OP(oh, op)
#define OSD_EXEC_OP(oh, op)
#endif
253 /* At most 10 uid/gids are affected in a single transaction, and
254 * that is the rename case:
255 * - 2 for source parent uid & gid;
256 * - 2 for source child uid & gid ('..' entry update when the child
258 * - 2 for target parent uid & gid;
259 * - 2 for target child uid & gid (if the target child exists);
260 * - 2 for root uid & gid (last_rcvd, llog, etc);
262 * Bits 0 to (OSD_MAX_UGID_CNT - 1) of ot_id_type indicate
263 * the id type of each id in the ot_id_array.
/* Capacity of ot_id_array: the most uid/gids one transaction may touch. */
265 #define OSD_MAX_UGID_CNT 10
/*
 * Generic transaction handle embedded in the osd handle; OSD_EXEC_OP()
 * maps a pointer to it back to the osd_thandle with container_of0().
 */
268 struct thandle ot_super;
/* Journal callback entry (see osd_journal_callback_set()). */
270 struct journal_callback ot_jcb;
/* NOTE(review): presumably the list of registered commit callbacks - confirm. */
271 cfs_list_t ot_dcb_list;
272 /* Link to the device, for debugging. */
273 struct lu_ref_link *ot_dev_link;
/* NOTE(review): presumably the journal credits declared so far - confirm. */
274 unsigned short ot_credits;
/* Number of used slots in ot_id_array. */
275 unsigned short ot_id_cnt;
/* Bit i gives the id type of ot_id_array[i]; OSD_MAX_UGID_CNT bits are used. */
276 unsigned short ot_id_type;
277 uid_t ot_id_array[OSD_MAX_UGID_CNT];
/*
 * Per-operation declaration counters: incremented by OSD_DECLARE_OP() and
 * decremented by OSD_EXEC_OP().
 * NOTE(review): the counters are unsigned char and OSD_DECLARE_OP() does
 * not check for wrap-around past 255.
 */
279 #ifdef OSD_TRACK_DECLARES
280 unsigned char ot_declare_attr_set;
281 unsigned char ot_declare_punch;
282 unsigned char ot_declare_xattr_set;
283 unsigned char ot_declare_create;
284 unsigned char ot_declare_destroy;
285 unsigned char ot_declare_ref_add;
286 unsigned char ot_declare_ref_del;
287 unsigned char ot_declare_write;
288 unsigned char ot_declare_insert;
289 unsigned char ot_declare_delete;
/* Timestamps kept only when thandle statistics are enabled. */
292 #if OSD_THANDLE_STATS
293 /** time when this handle was allocated */
294 cfs_time_t oth_alloced;
296 /** time when this thandle was started */
297 cfs_time_t oth_started;
302 * Basic transaction credit op
312 DTO_LOG_REC, /**< XXX temporary: dt layer knows nothing about llog. */
/*
 * OSD statistics counter identifiers, numbered explicitly from 0 to 6.
 * NOTE(review): presumably used as indices into the od_stats block of
 * struct osd_device - confirm in the lprocfs code.
 */
326 LPROC_OSD_READ_BYTES = 0,
327 LPROC_OSD_WRITE_BYTES = 1,
328 LPROC_OSD_GET_PAGE = 2,
329 LPROC_OSD_NO_PAGE = 3,
330 LPROC_OSD_CACHE_ACCESS = 4,
331 LPROC_OSD_CACHE_HIT = 5,
332 LPROC_OSD_CACHE_MISS = 6,
/* Extra counters that exist only when thandle statistics are compiled in. */
334 #if OSD_THANDLE_STATS
335 LPROC_OSD_THANDLE_STARTING,
336 LPROC_OSD_THANDLE_OPEN,
337 LPROC_OSD_THANDLE_CLOSING,
344 * Storage representation for fids.
346 * Variable size, first byte contains the length of the whole record.
/*
 * Packed FID record: a one-byte length followed by a data area that is
 * sized to hold one struct lu_fid.
 */
348 struct osd_fid_pack {
/* Length of the whole record, stored in its first byte. */
349 unsigned char fp_len;
/* Payload area, sizeof(struct lu_fid) bytes. */
350 char fp_area[sizeof(struct lu_fid)];
/*
 * In-memory directory entry used by the ldiskfs-style directory iterator.
 * The structure is declared packed, so its members carry no padding.
 */
353 struct osd_it_ea_dirent {
/* FID of the entry. */
354 struct lu_fid oied_fid;
/* Length of the entry name. */
357 unsigned short oied_namelen;
/* NOTE(review): presumably the d_type style file type of the entry - confirm. */
358 unsigned int oied_type;
360 } __attribute__((packed));
363 * As osd_it_ea_dirent (the in-memory dirent struct for osd) is larger
364 * than struct lu_dirent, osd readdir reads fewer dirents than are
365 * required for an mdd dir page. So the buffer size needs to be increased so that
366 * there is one ext3 readdir for every mdd readdir page.
369 #define OSD_IT_EA_BUFSIZE (CFS_PAGE_SIZE + CFS_PAGE_SIZE/4)
372 * This is iterator's in-memory data structure in interoperability
373 * mode (i.e. iterator over ldiskfs style directory)
/* Object (directory) being iterated. */
376 struct osd_object *oie_obj;
377 /** used in ldiskfs iterator, to store the file pointer */
378 struct file oie_file;
379 /** how many entries have been read-cached from storage */
381 /** current entry is being iterated by caller */
383 /** current processing entry */
384 struct osd_it_ea_dirent *oie_dirent;
385 /** buffer to hold entries, size == OSD_IT_EA_BUFSIZE */
390 * Iterator's in-memory data structure for IAM mode.
/* Object whose IAM index is being iterated. */
393 struct osd_object *oi_obj;
/* IAM path descriptor used by the iterator. */
394 struct iam_path_descr *oi_ipd;
/* The embedded IAM iterator itself. */
395 struct iam_iterator oi_it;
/* Number of 512-byte units in one page; sizes dr_blocks[] below. */
398 #define MAX_BLOCKS_PER_PAGE (CFS_PAGE_SIZE / 512)
/*
 * NOTE(review): the members below presumably belong to struct osd_iobuf
 * (the type of oti_iobuf in struct osd_thread_info) - confirm.
 */
402 cfs_atomic_t dr_numreqs; /* number of reqs being processed */
/* One-bit flags. NOTE(review): encoding of dr_rw (read vs. write) - confirm. */
407 unsigned int dr_ignore_quota:1;
408 unsigned int dr_elapsed_valid:1; /* we really did count time */
409 unsigned int dr_rw:1;
/* Pages of the request, up to PTLRPC_MAX_BRW_PAGES of them. */
410 struct page *dr_pages[PTLRPC_MAX_BRW_PAGES];
/* Room for MAX_BLOCKS_PER_PAGE block numbers for every page. */
411 unsigned long dr_blocks[PTLRPC_MAX_BRW_PAGES*MAX_BLOCKS_PER_PAGE];
/* NOTE(review): time unit of dr_start_time/dr_elapsed is not shown here - confirm. */
412 unsigned long dr_start_time;
413 unsigned long dr_elapsed; /* how long io took */
/* Device the I/O is issued against. */
414 struct osd_device *dr_dev;
/*
 * Per-context scratch area of the osd layer.  osd_oti_get() obtains it from
 * an environment with lu_context_key_get(&env->le_ctx, &osd_key).  It holds
 * pre-allocated temporaries (dentries, iterators, buffers, a fake inode and
 * file) that the osd code would otherwise have to allocate per call.
 */
417 struct osd_thread_info {
/* Environment this scratch area is associated with. */
418 const struct lu_env *oti_env;
420 * used for index operations.
422 struct dentry oti_obj_dentry;
423 struct dentry oti_child_dentry;
425 /** dentry for Iterator context. */
426 struct dentry oti_it_dentry;
/* htree lock (see the HAVE_LDISKFS_PDO compatibility definitions). */
427 struct htree_lock *oti_hlock;
/* Scratch FID and inode identifier. */
429 struct lu_fid oti_fid;
430 struct osd_inode_id oti_id;
432 * XXX temporary: for ->i_op calls.
434 struct timespec oti_time;
436 * XXX temporary: fake struct file for osd_object_sync
438 struct file oti_file;
440 * XXX temporary: for capa operations.
442 struct lustre_capa_key oti_capa_key;
443 struct lustre_capa oti_capa;
445 /** osd_device reference, initialized in osd_trans_start() and
446 used in osd_trans_stop() */
447 struct osd_device *oti_dev;
450 * following ipd and it structures are used for osd_index_iam_lookup()
451 * these are defined separately as we might do index operation
452 * in open iterator session.
455 /** osd iterator context used for iterator session */
458 struct osd_it_iam oti_it;
459 /** ldiskfs iterator data structure, see osd_it_ea_{init, fini} */
460 struct osd_it_ea oti_it_ea;
463 /** pre-allocated buffer used by oti_it_ea, size OSD_IT_EA_BUFSIZE */
466 /** IAM iterator for index operation. */
467 struct iam_iterator oti_idx_it;
469 /** union to guarantee that ->oti_ipd[] has proper alignment. */
/* The long long members exist only to force alignment of the byte arrays. */
471 char oti_it_ipd[DX_IPD_MAX_SIZE];
472 long long oti_alignment_lieutenant;
476 char oti_idx_ipd[DX_IPD_MAX_SIZE];
477 long long oti_alignment_lieutenant_colonel;
484 /** used in osd_fid_set() to put xattr */
485 struct lu_buf oti_buf;
486 /** used in osd_ea_fid_set() to set fid into common ea */
487 struct lustre_mdt_attrs oti_mdt_attrs;
/* Pre-allocated I/O buffer descriptor and scratch inode. */
489 struct osd_iobuf oti_iobuf;
490 struct inode oti_inode;
/* NOTE(review): presumably one "block was created" flag per bulk page - confirm. */
491 int oti_created[PTLRPC_MAX_BRW_PAGES];
/* Quota context; present only with HAVE_QUOTA_SUPPORT. */
492 #ifdef HAVE_QUOTA_SUPPORT
493 struct osd_ctxt oti_ctxt;
/* NOTE(review): presumably a private environment for the object-delete transaction - confirm. */
495 struct lu_env oti_obj_delete_tx_env;
/* Size in bytes of each of the two scratch record buffers below. */
496 #define OSD_FID_REC_SZ 32
497 char oti_ldp[OSD_FID_REC_SZ];
498 char oti_ldp2[OSD_FID_REC_SZ];
/*
 * NOTE(review): presumably a run-time switch for parallel directory
 * operations, defined elsewhere - confirm at the definition.
 */
501 extern int ldiskfs_pdo;
/* procfs / statistics interface of the osd module. */
505 void lprocfs_osd_init_vars(struct lprocfs_static_vars *lvars);
506 int osd_procfs_init(struct osd_device *osd, const char *name);
507 int osd_procfs_fini(struct osd_device *osd);
508 void osd_lprocfs_time_start(const struct lu_env *env);
509 void osd_lprocfs_time_end(const struct lu_env *env,
510 struct osd_device *osd, int op);
511 void osd_brw_stats_update(struct osd_device *osd, struct osd_iobuf *iobuf);
/* Device and object operations shared between the osd source files. */
514 int osd_statfs(const struct lu_env *env, struct dt_device *dev,
516 int osd_object_auth(const struct lu_env *env, struct dt_object *dt,
517 struct lustre_capa *capa, __u64 opc);
518 void osd_declare_qid(struct dt_object *dt, struct osd_thandle *oh,
519 int type, uid_t id, struct inode *inode);
520 int generic_error_remove_page(struct address_space *mapping,
524 * Invariants, assertions.
528 * XXX: do not enable this, until invariant checking code is made thread safe
529 * in the face of pdirops locking.
/* Invariant checking is compiled out while this is 0. */
#define OSD_INVARIANT_CHECKS (0)

#if OSD_INVARIANT_CHECKS
/*
 * Both helpers are defined further down in this header; declare them here
 * so that osd_invariant() can call them.
 */
static inline struct osd_device *osd_obj2dev(const struct osd_object *o);
static inline struct super_block *osd_sb(const struct osd_device *dev);

/**
 * Check the structural invariants of \a obj.
 *
 * The object must be non-NULL.  If it has an inode, that inode must belong
 * to the super block of the object's device and must be referenced.  If it
 * has IAM index state whose container is bound to an inode, that inode must
 * be the object's own inode.
 *
 * \retval non-zero  all invariants hold
 * \retval 0         at least one invariant is violated
 */
static inline int osd_invariant(const struct osd_object *obj)
{
        return
                obj != NULL &&
                ergo(obj->oo_inode != NULL,
                     obj->oo_inode->i_sb == osd_sb(osd_obj2dev(obj)) &&
                     atomic_read(&obj->oo_inode->i_count) > 0) &&
                ergo(obj->oo_dir != NULL &&
                     obj->oo_dir->od_container.ic_object != NULL,
                     obj->oo_dir->od_container.ic_object == obj->oo_inode);
}
#else
/* Checks disabled: the invariant trivially holds. */
#define osd_invariant(obj) (1)
#endif
549 /* The on-disk extN format reserves inodes 0-11 for internal filesystem
550 * use, and these inodes will be invisible on client side, so the valid
551 * sequence for IGIF fid is 12-0xffffffff. But root inode (2#) will be seen
552 * on server side (osd), and it should be valid too here.
/* FID sequence that denotes the root object (the root inode number, 2). */
554 #define OSD_ROOT_SEQ 2
/** Return non-zero when the sequence of \a fid equals OSD_ROOT_SEQ. */
555 static inline int osd_fid_is_root(const struct lu_fid *fid)
557 return fid_seq(fid) == OSD_ROOT_SEQ;
/**
 * OSD-side IGIF test: non-zero when \a fid satisfies fid_is_igif() or is
 * the root FID as defined by osd_fid_is_root().
 */
560 static inline int osd_fid_is_igif(const struct lu_fid *fid)
562 return fid_is_igif(fid) || osd_fid_is_root(fid);
/**
 * Select the object-index (OI) container of \a osd that is responsible
 * for \a fid.
 *
 * FIDs that fail fid_is_norm() are handled by a separate early branch.
 * For normal FIDs the slot index is fid->f_seq masked with
 * (od_oi_count - 1).
 *
 * NOTE(review): masking only reaches every slot when od_oi_count is a
 * power of two; just od_oi_count >= 1 is asserted here - confirm where
 * the count is set up.
 */
565 static inline struct osd_oi *
566 osd_fid2oi(struct osd_device *osd, const struct lu_fid *fid)
568 if (!fid_is_norm(fid))
571 LASSERT(osd->od_oi_table != NULL && osd->od_oi_count >= 1);
572 /* It still works when od_oi_count equals 1, although that is unexpected;
573 * the only reason to set it to 1 is performance measurement */
574 return &osd->od_oi_table[fid->f_seq & (osd->od_oi_count - 1)];
/* lu_device operations vector of the osd layer, defined elsewhere. */
580 extern const struct lu_device_operations osd_lu_ops;
/**
 * Test whether \a d is an osd device by comparing its operations vector
 * with osd_lu_ops.  The comparison is only applied when \a d and d->ld_ops
 * are both non-NULL.
 * NOTE(review): ergo(a, b) is presumably logical implication, which makes
 * the result non-zero for a NULL device - confirm the macro.
 */
582 static inline int lu_device_is_osd(const struct lu_device *d)
584 return ergo(d != NULL && d->ld_ops != NULL, d->ld_ops == &osd_lu_ops);
/**
 * Convert a dt_device into the osd_device that embeds it as od_dt_dev.
 * Asserts that the underlying lu_device passes lu_device_is_osd().
 */
587 static inline struct osd_device *osd_dt_dev(const struct dt_device *d)
589 LASSERT(lu_device_is_osd(&d->dd_lu_dev));
590 return container_of0(d, struct osd_device, od_dt_dev);
/**
 * Convert a lu_device into its osd_device: first step up to the dt_device
 * that embeds it as dd_lu_dev, then to the osd_device via osd_dt_dev().
 * Asserts that \a d passes lu_device_is_osd().
 */
593 static inline struct osd_device *osd_dev(const struct lu_device *d)
595 LASSERT(lu_device_is_osd(d));
596 return osd_dt_dev(container_of0(d, struct dt_device, dd_lu_dev));
/** Return the osd_device that object \a o lives on (via its lo_dev pointer). */
599 static inline struct osd_device *osd_obj2dev(const struct osd_object *o)
601 return osd_dev(o->oo_dt.do_lu.lo_dev);
/** Return the super block of the file system mounted underneath \a dev. */
604 static inline struct super_block *osd_sb(const struct osd_device *dev)
606 return dev->od_mount->lmi_mnt->mnt_sb;
/**
 * Return non-zero when the inode of \a obj is the inode of the root dentry
 * of the device's super block.
 */
609 static inline int osd_object_is_root(const struct osd_object *obj)
611 return osd_sb(osd_obj2dev(obj))->s_root->d_inode == obj->oo_inode;
/**
 * Convert a lu_object into the osd_object that embeds it as oo_dt.do_lu.
 * Asserts that the object's device passes lu_device_is_osd().
 */
614 static inline struct osd_object *osd_obj(const struct lu_object *o)
616 LASSERT(lu_device_is_osd(o->lo_dev));
617 return container_of0(o, struct osd_object, oo_dt.do_lu);
/** Convert a dt_object into its osd_object by way of osd_obj(&d->do_lu). */
620 static inline struct osd_object *osd_dt_obj(const struct dt_object *d)
622 return osd_obj(&d->do_lu);
/** Return the lu_device embedded in \a osd (od_dt_dev.dd_lu_dev). */
625 static inline struct lu_device *osd2lu_dev(struct osd_device *osd)
627 return &osd->od_dt_dev.dd_lu_dev;
/** Return the journal (s_journal) of the ldiskfs super block under \a dev. */
630 static inline journal_t *osd_journal(const struct osd_device *dev)
632 return LDISKFS_SB(osd_sb(dev))->s_journal;
/* Body operations vector and context key of the osd layer, defined elsewhere. */
635 extern const struct dt_body_operations osd_body_ops;
636 extern struct lu_context_key osd_key;
/**
 * Return the osd_thread_info registered under osd_key in the context of
 * \a env.
 */
638 static inline struct osd_thread_info *osd_oti_get(const struct lu_env *env)
640 return lu_context_key_get(&env->le_ctx, &osd_key);
/*
 * Second body operations vector, defined elsewhere.
 * NOTE(review): when it is chosen over osd_body_ops is not visible in this
 * header - confirm at the point of assignment.
 */
643 extern const struct dt_body_operations osd_body_ops_new;
645 #endif /* __KERNEL__ */
646 #endif /* _OSD_INTERNAL_H */