Whamcloud - gitweb
LU-911 osd: support for legacy OST objects in ldiskfs osd
[fs/lustre-release.git] / lustre / osd-ldiskfs / osd_internal.h
index 662f7a1..6af354c 100644 (file)
@@ -28,9 +28,8 @@
 /*
  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
  * Use is subject to license terms.
- */
-/*
- * Copyright (c) 2011 Whamcloud, Inc.
+ *
+ * Copyright (c) 2011, 2012, Whamcloud, Inc.
  */
 /*
  * This file is part of Lustre, http://www.lustre.org/
 /* struct dirent64 */
 #include <linux/dirent.h>
 
-#ifdef HAVE_EXT4_LDISKFS
 #include <ldiskfs/ldiskfs.h>
 #include <ldiskfs/ldiskfs_jbd2.h>
-# ifdef HAVE_LDISKFS_JOURNAL_CALLBACK_ADD
-#  define journal_callback ldiskfs_journal_cb_entry
-#  define osd_journal_callback_set(handle, func, jcb) ldiskfs_journal_callback_add(handle, func, jcb)
-# else
-#  define osd_journal_callback_set(handle, func, jcb) jbd2_journal_callback_set(handle, func, jcb)
-# endif
+#ifdef HAVE_LDISKFS_JOURNAL_CALLBACK_ADD
+# define journal_callback ldiskfs_journal_cb_entry
+# define osd_journal_callback_set(handle, func, jcb) \
+         ldiskfs_journal_callback_add(handle, func, jcb)
 #else
-#include <linux/jbd.h>
-#include <linux/ldiskfs_fs.h>
-#include <linux/ldiskfs_jbd.h>
-#define osd_journal_callback_set(handle, func, jcb) journal_callback_set(handle, func, jcb)
+# define osd_journal_callback_set(handle, func, jcb) \
+         jbd2_journal_callback_set(handle, func, jcb)
 #endif
 
+/* fsfilt_{get|put}_ops */
+#include <lustre_fsfilt.h>
 
 /* LUSTRE_OSD_NAME */
 #include <obd.h>
 /* class_register_type(), class_unregister_type(), class_get_type() */
 #include <obd_class.h>
 #include <lustre_disk.h>
-
 #include <dt_object.h>
+
 #include "osd_oi.h"
 #include "osd_iam.h"
 
@@ -98,6 +94,57 @@ struct osd_ctxt {
 };
 #endif
 
+struct osd_directory {
+        struct iam_container od_container;
+        struct iam_descr     od_descr;
+};
+
+/*
+ * Object Index (oi) instance.
+ */
+struct osd_oi {
+        /*
+         * underlying index object, where fid->id mapping in stored.
+         */
+        struct inode         *oi_inode;
+        struct osd_directory  oi_dir;
+};
+
+extern const int osd_dto_credits_noquota[];
+
+struct osd_object {
+        struct dt_object        oo_dt;
+        /**
+         * Inode for file system object represented by this osd_object. This
+         * inode is pinned for the whole duration of lu_object life.
+         *
+         * Not modified concurrently (either setup early during object
+         * creation, or assigned by osd_object_create() under write lock).
+         */
+        struct inode           *oo_inode;
+        /**
+         * to protect index ops.
+         */
+        struct htree_lock_head *oo_hl_head;
+        cfs_rw_semaphore_t      oo_ext_idx_sem;
+        cfs_rw_semaphore_t      oo_sem;
+        struct osd_directory   *oo_dir;
+        /** protects inode attributes. */
+        cfs_spinlock_t          oo_guard;
+        /**
+         * Following two members are used to indicate the presence of dot and
+         * dotdot in the given directory. This is required for interop mode
+         * (b11826).
+         */
+        int                     oo_compat_dot_created;
+        int                     oo_compat_dotdot_created;
+
+        const struct lu_env    *oo_owner;
+#ifdef CONFIG_LOCKDEP
+        struct lockdep_map      oo_dep_map;
+#endif
+};
+
 #ifdef HAVE_LDISKFS_PDO
 
 #define osd_ldiskfs_find_entry(dir, dentry, de, lock)   \
@@ -147,6 +194,8 @@ static inline void ldiskfs_htree_lock_free(struct htree_lock *lk)
 
 #endif /* HAVE_LDISKFS_PDO */
 
+extern const int osd_dto_credits_noquota[];
+
 /*
  * osd device.
  */
@@ -155,14 +204,11 @@ struct osd_device {
         struct dt_device          od_dt_dev;
         /* information about underlying file system */
         struct lustre_mount_info *od_mount;
+        struct vfsmount          *od_mnt;
         /* object index */
-        struct osd_oi             od_oi;
-        /*
-         * XXX temporary stuff for object index: directory where every object
-         * is named by its fid.
-         */
-        struct dt_object         *od_obj_area;
-
+        struct osd_oi           **od_oi_table;
+        /* total number of OI containers */
+        int                       od_oi_count;
         /*
          * Fid Capability
          */
@@ -186,6 +232,103 @@ struct osd_device {
          * It will be initialized, using mount param.
          */
         __u32                     od_iop_mode;
+
+        struct fsfilt_operations *od_fsops;
+
+        /*
+         * mapping for legacy OST objids
+         */
+        struct osd_compat_objid  *od_ost_map;
+
+        unsigned long long        od_readcache_max_filesize;
+        int                       od_read_cache;
+        int                       od_writethrough_cache;
+
+        struct brw_stats          od_brw_stats;
+        cfs_atomic_t              od_r_in_flight;
+        cfs_atomic_t              od_w_in_flight;
+};
+
+#define OSD_TRACK_DECLARES
+#ifdef OSD_TRACK_DECLARES
+#define OSD_DECLARE_OP(oh, op)   {                               \
+        LASSERT(oh->ot_handle == NULL);                          \
+        ((oh)->ot_declare_ ##op)++; }
+#define OSD_EXEC_OP(handle, op)     {                            \
+        struct osd_thandle *oh;                                  \
+        oh = container_of0(handle, struct osd_thandle, ot_super);\
+        LASSERT((oh)->ot_declare_ ##op > 0);                     \
+        ((oh)->ot_declare_ ##op)--; }
+#else
+#define OSD_DECLARE_OP(oh, op)
+#define OSD_EXEC_OP(oh, op)
+#endif
+
+/* There are at most 10 uid/gids are affected in a transaction, and
+ * that's rename case:
+ * - 2 for source parent uid & gid;
+ * - 2 for source child uid & gid ('..' entry update when the child
+ *   is directory);
+ * - 2 for target parent uid & gid;
+ * - 2 for target child uid & gid (if the target child exists);
+ * - 2 for root uid & gid (last_rcvd, llog, etc);
+ *
+ * The 0 to (OSD_MAX_UGID_CNT - 1) bits of ot_id_type is for indicating
+ * the id type of each id in the ot_id_array.
+ */
+#define OSD_MAX_UGID_CNT        10
+
+struct osd_thandle {
+        struct thandle          ot_super;
+        handle_t               *ot_handle;
+        struct journal_callback ot_jcb;
+        cfs_list_t              ot_dcb_list;
+        /* Link to the device, for debugging. */
+        struct lu_ref_link     *ot_dev_link;
+        unsigned short          ot_credits;
+        unsigned short          ot_id_cnt;
+        unsigned short          ot_id_type;
+        uid_t                   ot_id_array[OSD_MAX_UGID_CNT];
+
+#ifdef OSD_TRACK_DECLARES
+        unsigned char           ot_declare_attr_set;
+        unsigned char           ot_declare_punch;
+        unsigned char           ot_declare_xattr_set;
+        unsigned char           ot_declare_create;
+        unsigned char           ot_declare_destroy;
+        unsigned char           ot_declare_ref_add;
+        unsigned char           ot_declare_ref_del;
+        unsigned char           ot_declare_write;
+        unsigned char           ot_declare_insert;
+        unsigned char           ot_declare_delete;
+#endif
+
+#if OSD_THANDLE_STATS
+        /** time when this handle was allocated */
+        cfs_time_t oth_alloced;
+
+        /** time when this thanle was started */
+        cfs_time_t oth_started;
+#endif
+};
+
+/**
+ * Basic transaction credit op
+ */
+enum dt_txn_op {
+        DTO_INDEX_INSERT,
+        DTO_INDEX_DELETE,
+        DTO_INDEX_UPDATE,
+        DTO_OBJECT_CREATE,
+        DTO_OBJECT_DELETE,
+        DTO_ATTR_SET_BASE,
+        DTO_XATTR_SET,
+        DTO_LOG_REC, /**< XXX temporary: dt layer knows nothing about llog. */
+        DTO_WRITE_BASE,
+        DTO_WRITE_BLOCK,
+        DTO_ATTR_SET_CHOWN,
+
+        DTO_NR
 };
 
 /*
@@ -194,12 +337,20 @@ struct osd_device {
 
 #ifdef LPROCFS
 enum {
+        LPROC_OSD_READ_BYTES    = 0,
+        LPROC_OSD_WRITE_BYTES   = 1,
+        LPROC_OSD_GET_PAGE      = 2,
+        LPROC_OSD_NO_PAGE       = 3,
+        LPROC_OSD_CACHE_ACCESS  = 4,
+        LPROC_OSD_CACHE_HIT     = 5,
+        LPROC_OSD_CACHE_MISS    = 6,
+
 #if OSD_THANDLE_STATS
         LPROC_OSD_THANDLE_STARTING,
         LPROC_OSD_THANDLE_OPEN,
         LPROC_OSD_THANDLE_CLOSING,
 #endif
-        LPROC_OSD_NR
+        LPROC_OSD_LAST,
 };
 #endif
 
@@ -258,6 +409,25 @@ struct osd_it_iam {
         struct iam_iterator    oi_it;
 };
 
+#define MAX_BLOCKS_PER_PAGE (CFS_PAGE_SIZE / 512)
+
+struct osd_iobuf {
+        cfs_waitq_t        dr_wait;
+        cfs_atomic_t       dr_numreqs;  /* number of reqs being processed */
+        int                dr_max_pages;
+        int                dr_npages;
+        int                dr_error;
+        int                dr_frags;
+        unsigned int       dr_ignore_quota:1;
+        unsigned int       dr_elapsed_valid:1; /* we really did count time */
+        unsigned int       dr_rw:1;
+        struct page       *dr_pages[PTLRPC_MAX_BRW_PAGES];
+        unsigned long      dr_blocks[PTLRPC_MAX_BRW_PAGES*MAX_BLOCKS_PER_PAGE];
+        unsigned long      dr_start_time;
+        unsigned long      dr_elapsed;  /* how long io took */
+        struct osd_device *dr_dev;
+};
+
 struct osd_thread_info {
         const struct lu_env   *oti_env;
         /**
@@ -272,10 +442,11 @@ struct osd_thread_info {
 
         struct lu_fid          oti_fid;
         struct osd_inode_id    oti_id;
+        struct ost_id          oti_ostid;
+
         /*
          * XXX temporary: for ->i_op calls.
          */
-        struct txn_param       oti_txn;
         struct timespec        oti_time;
         /*
          * XXX temporary: fake struct file for osd_object_sync
@@ -330,6 +501,10 @@ struct osd_thread_info {
         struct lu_buf          oti_buf;
         /** used in osd_ea_fid_set() to set fid into common ea */
         struct lustre_mdt_attrs oti_mdt_attrs;
+        /** 0-copy IO */
+        struct osd_iobuf       oti_iobuf;
+        struct inode           oti_inode;
+        int                    oti_created[PTLRPC_MAX_BRW_PAGES];
 #ifdef HAVE_QUOTA_SUPPORT
         struct osd_ctxt        oti_ctxt;
 #endif
@@ -349,9 +524,38 @@ int osd_procfs_fini(struct osd_device *osd);
 void osd_lprocfs_time_start(const struct lu_env *env);
 void osd_lprocfs_time_end(const struct lu_env *env,
                           struct osd_device *osd, int op);
+void osd_brw_stats_update(struct osd_device *osd, struct osd_iobuf *iobuf);
+
 #endif
 int osd_statfs(const struct lu_env *env, struct dt_device *dev,
                cfs_kstatfs_t *sfs);
+int osd_object_auth(const struct lu_env *env, struct dt_object *dt,
+                    struct lustre_capa *capa, __u64 opc);
+void osd_declare_qid(struct dt_object *dt, struct osd_thandle *oh,
+                     int type, uid_t id, struct inode *inode);
+struct inode *osd_iget(struct osd_thread_info *info,
+                       struct osd_device *dev,
+                       const struct osd_inode_id *id);
+
+int osd_compat_init(struct osd_device *dev);
+void osd_compat_fini(struct osd_device *dev);
+int osd_compat_objid_lookup(struct osd_thread_info *info,
+                            struct osd_device *osd,
+                            const struct lu_fid *fid, struct osd_inode_id *id);
+int osd_compat_objid_insert(struct osd_thread_info *info,
+                            struct osd_device *osd,
+                            const struct lu_fid *fid,
+                            const struct osd_inode_id *id, struct thandle *th);
+int osd_compat_objid_delete(struct osd_thread_info *info,
+                            struct osd_device *osd,
+                            const struct lu_fid *fid, struct thandle *th);
+int osd_compat_spec_lookup(struct osd_thread_info *info,
+                           struct osd_device *osd,
+                           const struct lu_fid *fid, struct osd_inode_id *id);
+int osd_compat_spec_insert(struct osd_thread_info *info,
+                           struct osd_device *osd,
+                           const struct lu_fid *fid,
+                           const struct osd_inode_id *id, struct thandle *th);
 
 /*
  * Invariants, assertions.
@@ -379,20 +583,158 @@ static inline int osd_invariant(const struct osd_object *obj)
 #define osd_invariant(obj) (1)
 #endif
 
-/* The on-disk extN format reserves inodes 0-11 for internal filesystem
- * use, and these inodes will be invisible on client side, so the valid
- * sequence for IGIF fid is 12-0xffffffff. But root inode (2#) will be seen
- * on server side (osd), and it should be valid too here.
+static inline struct osd_oi *osd_fid2oi(struct osd_device *osd,
+                                        const struct lu_fid *fid)
+{
+        LASSERT(!fid_is_idif(fid));
+        LASSERT(!fid_is_igif(fid));
+        LASSERT(osd->od_oi_table != NULL && osd->od_oi_count >= 1);
+        /* It can work even od_oi_count equals to 1 although it's unexpected,
+         * the only reason we set it to 1 is for performance measurement */
+        return osd->od_oi_table[fid->f_seq & (osd->od_oi_count - 1)];
+}
+
+extern const struct lu_device_operations  osd_lu_ops;
+
+static inline int lu_device_is_osd(const struct lu_device *d)
+{
+        return ergo(d != NULL && d->ld_ops != NULL, d->ld_ops == &osd_lu_ops);
+}
+
+static inline struct osd_device *osd_dt_dev(const struct dt_device *d)
+{
+        LASSERT(lu_device_is_osd(&d->dd_lu_dev));
+        return container_of0(d, struct osd_device, od_dt_dev);
+}
+
+static inline struct osd_device *osd_dev(const struct lu_device *d)
+{
+        LASSERT(lu_device_is_osd(d));
+        return osd_dt_dev(container_of0(d, struct dt_device, dd_lu_dev));
+}
+
+static inline struct osd_device *osd_obj2dev(const struct osd_object *o)
+{
+        return osd_dev(o->oo_dt.do_lu.lo_dev);
+}
+
+static inline struct super_block *osd_sb(const struct osd_device *dev)
+{
+        return dev->od_mount->lmi_mnt->mnt_sb;
+}
+
+static inline int osd_object_is_root(const struct osd_object *obj)
+{
+        return osd_sb(osd_obj2dev(obj))->s_root->d_inode == obj->oo_inode;
+}
+
+static inline struct osd_object *osd_obj(const struct lu_object *o)
+{
+        LASSERT(lu_device_is_osd(o->lo_dev));
+        return container_of0(o, struct osd_object, oo_dt.do_lu);
+}
+
+static inline struct osd_object *osd_dt_obj(const struct dt_object *d)
+{
+        return osd_obj(&d->do_lu);
+}
+
+static inline struct lu_device *osd2lu_dev(struct osd_device *osd)
+{
+        return &osd->od_dt_dev.dd_lu_dev;
+}
+
+static inline journal_t *osd_journal(const struct osd_device *dev)
+{
+        return LDISKFS_SB(osd_sb(dev))->s_journal;
+}
+
+extern const struct dt_body_operations osd_body_ops;
+extern struct lu_context_key osd_key;
+
+static inline struct osd_thread_info *osd_oti_get(const struct lu_env *env)
+{
+        return lu_context_key_get(&env->le_ctx, &osd_key);
+}
+
+extern const struct dt_body_operations osd_body_ops_new;
+
+/**
+ * IAM Iterator
+ */
+static inline
+struct iam_path_descr *osd_it_ipd_get(const struct lu_env *env,
+                                      const struct iam_container *bag)
+{
+        return bag->ic_descr->id_ops->id_ipd_alloc(bag,
+                                           osd_oti_get(env)->oti_it_ipd);
+}
+
+static inline
+struct iam_path_descr *osd_idx_ipd_get(const struct lu_env *env,
+                                       const struct iam_container *bag)
+{
+        return bag->ic_descr->id_ops->id_ipd_alloc(bag,
+                                           osd_oti_get(env)->oti_idx_ipd);
+}
+
+static inline void osd_ipd_put(const struct lu_env *env,
+                               const struct iam_container *bag,
+                               struct iam_path_descr *ipd)
+{
+        bag->ic_descr->id_ops->id_ipd_free(ipd);
+}
+
+int osd_ldiskfs_read(struct inode *inode, void *buf, int size, loff_t *offs);
+
+static inline
+struct dentry *osd_child_dentry_by_inode(const struct lu_env *env,
+                                         struct inode *inode,
+                                         const char *name, const int namelen)
+{
+        struct osd_thread_info *info = osd_oti_get(env);
+        struct dentry *child_dentry = &info->oti_child_dentry;
+        struct dentry *obj_dentry = &info->oti_obj_dentry;
+
+        obj_dentry->d_inode = inode;
+        obj_dentry->d_sb = inode->i_sb;
+        obj_dentry->d_name.hash = 0;
+
+        child_dentry->d_name.hash = 0;
+        child_dentry->d_parent = obj_dentry;
+        child_dentry->d_name.name = name;
+        child_dentry->d_name.len = namelen;
+        return child_dentry;
+}
+
+/**
+ * Helper function to pack the fid, ldiskfs stores fid in packed format.
  */
-#define OSD_ROOT_SEQ            2
-static inline int osd_fid_is_root(const struct lu_fid *fid)
+static inline
+void osd_fid_pack(struct osd_fid_pack *pack, const struct dt_rec *fid,
+                  struct lu_fid *befider)
 {
-        return fid_seq(fid) == OSD_ROOT_SEQ;
+        fid_cpu_to_be(befider, (struct lu_fid *)fid);
+        memcpy(pack->fp_area, befider, sizeof(*befider));
+        pack->fp_len =  sizeof(*befider) + 1;
 }
 
-static inline int osd_fid_is_igif(const struct lu_fid *fid)
+static inline
+int osd_fid_unpack(struct lu_fid *fid, const struct osd_fid_pack *pack)
 {
-        return fid_is_igif(fid) || osd_fid_is_root(fid);
+        int result;
+
+        result = 0;
+        switch (pack->fp_len) {
+        case sizeof *fid + 1:
+                memcpy(fid, pack->fp_area, sizeof *fid);
+                fid_be_to_cpu(fid, fid);
+                break;
+        default:
+                CERROR("Unexpected packed fid size: %d\n", pack->fp_len);
+                result = -EIO;
+        }
+        return result;
 }
 
 #endif /* __KERNEL__ */