Whamcloud - gitweb
LU-8560 llite: handle is_compat_task() rename
[fs/lustre-release.git] / lustre / osd-ldiskfs / osd_internal.h
index 0caaf0d..3f685a7 100644 (file)
@@ -27,7 +27,7 @@
  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
  * Use is subject to license terms.
  *
- * Copyright (c) 2011, 2014, Intel Corporation.
+ * Copyright (c) 2011, 2015, Intel Corporation.
  */
 /*
  * This file is part of Lustre, http://www.lustre.org/
@@ -79,6 +79,7 @@ extern struct kmem_cache *dynlock_cachep;
 
 /* OI scrub should skip this inode. */
 #define LDISKFS_STATE_LUSTRE_NOSCRUB   31
+#define LDISKFS_STATE_LUSTRE_DESTROY   30
 
 /** Enable thandle usage statistics */
 #define OSD_THANDLE_STATS (0)
@@ -89,6 +90,10 @@ extern struct kmem_cache *dynlock_cachep;
 #define ADMIN_USR      "admin_quotafile_v2.usr"
 #define ADMIN_GRP      "admin_quotafile_v2.grp"
 
+/* Statfs space reservation for fragmentation and local objects */
+#define OSD_STATFS_RESERVED            (1ULL << 23) /* 8MB */
+#define OSD_STATFS_RESERVED_SHIFT      (7) /* reserve 0.78% of all space */
+
 struct osd_directory {
         struct iam_container od_container;
         struct iam_descr     od_descr;
@@ -126,6 +131,11 @@ struct osd_object {
        struct osd_directory    *oo_dir;
        /** protects inode attributes. */
        spinlock_t              oo_guard;
+
+       __u32                   oo_destroyed:1;
+
+       /* the i_flags in LMA */
+       __u32                   oo_lma_flags;
         /**
          * Following two members are used to indicate the presence of dot and
          * dotdot in the given directory. This is required for interop mode
@@ -138,6 +148,8 @@ struct osd_object {
 #ifdef CONFIG_LOCKDEP
         struct lockdep_map      oo_dep_map;
 #endif
+
+       struct list_head        oo_xattr_list;
 };
 
 struct osd_obj_seq {
@@ -166,9 +178,9 @@ struct osd_mdobj {
 struct osd_mdobj_map {
        /* dentry whose d_inode->i_ino is returned by
         * osd_remote_parent_ino() (reached through od_mdt_map) */
        struct dentry   *omm_remote_parent;
 };
-
-#define osd_ldiskfs_add_entry(handle, child, cinode, hlock) \
-       __ldiskfs_add_entry(handle, child, cinode, hlock)
+int osd_ldiskfs_add_entry(struct osd_thread_info *info,
+                         handle_t *handle, struct dentry *child,
+                         struct inode *inode, struct htree_lock *hlock);
 
 #define OSD_OTABLE_IT_CACHE_SIZE       64
 #define OSD_OTABLE_IT_CACHE_MASK       (~(OSD_OTABLE_IT_CACHE_SIZE - 1))
@@ -216,6 +228,12 @@ struct osd_otable_it {
                                 ooi_waiting:1; /* it::next is waiting. */
 };
 
+struct osd_obj_orphan { /* One orphan record kept on a list.
+                        * NOTE(review): presumably linked into
+                        * osd_device::od_orphan_list -- confirm. */
+       struct list_head oor_list; /* list linkage for this record */
+       struct lu_env   *oor_env; /* to identify "own" records */
+       __u32 oor_ino; /* inode number of the record; NOTE(review):
+                       * presumably the orphaned agent inode -- confirm */
+};
+
 /*
  * osd device.
  */
@@ -283,6 +301,9 @@ struct osd_device {
         * exceeds the osd_device::od_full_scrub_threshold_rate,
         * then trigger OI scrub to scan the whole device. */
        __u64                    od_full_scrub_threshold_rate;
+
+       /* a list of orphaned agent inodes, protected with od_osfs_lock */
+       struct list_head         od_orphan_list;
 };
 
 enum osd_full_scrub_ratio {
@@ -338,6 +359,7 @@ struct osd_thandle {
         unsigned short          ot_credits;
         unsigned short          ot_id_cnt;
         unsigned short          ot_id_type;
+       unsigned int            ot_remove_agents:1;
         uid_t                   ot_id_array[OSD_MAX_UGID_CNT];
        struct lquota_trans    *ot_quota_trans;
 #if OSD_THANDLE_STATS
@@ -416,7 +438,7 @@ struct osd_it_ea_dirent {
  * there  would be one ext3 readdir for every mdd readdir page.
  */
 
-#define OSD_IT_EA_BUFSIZE       (PAGE_CACHE_SIZE + PAGE_CACHE_SIZE/4)
+#define OSD_IT_EA_BUFSIZE       (PAGE_SIZE + PAGE_SIZE/4)
 
 /**
  * This is iterator's in-memory data structure in interoperability
@@ -468,7 +490,7 @@ struct osd_it_quota {
        struct list_head         oiq_list;
 };
 
-#define MAX_BLOCKS_PER_PAGE (PAGE_CACHE_SIZE / 512)
+#define MAX_BLOCKS_PER_PAGE (PAGE_SIZE / 512)
 
 struct osd_iobuf {
        wait_queue_head_t  dr_wait;
@@ -490,6 +512,8 @@ struct osd_iobuf {
        unsigned int       dr_init_at;  /* the line iobuf was initialized */
 };
 
+#define OSD_INS_CACHE_SIZE     8
+
 struct osd_thread_info {
        const struct lu_env   *oti_env;
        /**
@@ -554,6 +578,11 @@ struct osd_thread_info {
 
        struct osd_idmap_cache oti_cache;
 
+       /* dedicated OI cache for insert (which needs inum) */
+       struct osd_idmap_cache *oti_ins_cache;
+       int                    oti_ins_cache_size;
+       int                    oti_ins_cache_used;
+
         int                    oti_r_locks;
         int                    oti_w_locks;
         int                    oti_txns;
@@ -578,7 +607,9 @@ struct osd_thread_info {
 
        /* used by quota code */
        union {
-#ifdef HAVE_DQUOT_FS_DISK_QUOTA
+#if defined(HAVE_DQUOT_QC_DQBLK)
+               struct qc_dqblk         oti_qdq;
+#elif defined(HAVE_DQUOT_FS_DISK_QUOTA)
                struct fs_disk_quota    oti_fdq;
 #else
                struct if_dqblk         oti_dqblk;
@@ -594,10 +625,10 @@ struct osd_thread_info {
        /* Tracking for transaction credits, to allow debugging and optimizing
         * cases where a large number of credits are being allocated for
         * single transaction. */
+       unsigned int            oti_credits_before;
        unsigned short          oti_declare_ops[OSD_OT_MAX];
-       unsigned short          oti_declare_ops_rb[OSD_OT_MAX];
        unsigned short          oti_declare_ops_cred[OSD_OT_MAX];
-       bool                    oti_rollback;
+       unsigned short          oti_declare_ops_used[OSD_OT_MAX];
 };
 
 extern int ldiskfs_pdo;
@@ -749,6 +780,46 @@ static inline void i_gid_write(struct inode *inode, gid_t gid)
 }
 #endif
 
+#ifdef LDISKFS_HT_MISC /* Compatibility wrappers over two ldiskfs API shapes.
+                       * In both branches osd_ldiskfs_append() takes three
+                       * arguments and osd_ldiskfs_find_entry() turns a
+                       * NULL result into ERR_PTR(-ENOENT). This branch
+                       * forwards "type" and "inlined" to ldiskfs and
+                       * reports half of j_max_transaction_buffers as the
+                       * transaction size. */
+# define osd_journal_start_sb(sb, type, nblock) \
+               ldiskfs_journal_start_sb(sb, type, nblock)
+# define osd_ldiskfs_append(handle, inode, nblock) \
+               ldiskfs_append(handle, inode, nblock)
+# define osd_ldiskfs_find_entry(dir, name, de, inlined, lock) \
+               (__ldiskfs_find_entry(dir, name, de, inlined, lock) ?: \
+                ERR_PTR(-ENOENT))
+# define osd_journal_start(inode, type, nblocks) \
+               ldiskfs_journal_start(inode, type, nblocks)
+# define osd_transaction_size(dev) \
+               (osd_journal(dev)->j_max_transaction_buffers / 2)
+#else /* !LDISKFS_HT_MISC: "type" and "inlined" are dropped before calling
+       * ldiskfs, and the full j_max_transaction_buffers is reported */
+# define LDISKFS_HT_MISC       0 /* define the symbol as 0 when it is not
+                                 * already defined */
+# define osd_journal_start_sb(sb, type, nblock) \
+               ldiskfs_journal_start_sb(sb, nblock)
+
+static inline struct buffer_head *osd_ldiskfs_append(handle_t *handle,
+                                                    struct inode *inode,
+                                                    ldiskfs_lblk_t *nblock)
+{      /* Call the four-argument ldiskfs_append(), which reports failure
+        * through an int out-parameter, and fold that error into the
+        * returned pointer. */
+       struct buffer_head *bh;
+       int err = 0;
+
+       bh = ldiskfs_append(handle, inode, nblock, &err);
+       if (bh == NULL) /* a NULL buffer is replaced by ERR_PTR(err) */
+               bh = ERR_PTR(err);
+
+       return bh;
+}
+
+# define osd_ldiskfs_find_entry(dir, name, de, inlined, lock) \
+               (__ldiskfs_find_entry(dir, name, de, lock) ?: \
+                ERR_PTR(-ENOENT))
+# define osd_journal_start(inode, type, nblocks) \
+               ldiskfs_journal_start(inode, nblocks)
+# define osd_transaction_size(dev) \
+               (osd_journal(dev)->j_max_transaction_buffers)
+#endif /* LDISKFS_HT_MISC */
+
 /*
  * Invariants, assertions.
  */
@@ -862,6 +933,11 @@ static inline char *osd_name(struct osd_device *osd)
        return osd->od_dt_dev.dd_lu_dev.ld_obd->obd_name;
 }
 
+static inline bool osd_is_ea_inode(struct inode *inode)
+{      /* true when LDISKFS_EA_INODE_FL is set in the i_flags of the
+        * ldiskfs inode info obtained via LDISKFS_I(inode) */
+       return !!(LDISKFS_I(inode)->i_flags & LDISKFS_EA_INODE_FL); /* !! maps the masked bit to 0 or 1 */
+}
+
 extern const struct dt_body_operations osd_body_ops;
 extern struct lu_context_key osd_key;
 
@@ -898,6 +974,10 @@ static inline void osd_ipd_put(const struct lu_env *env,
         bag->ic_descr->id_ops->id_ipd_free(ipd);
 }
 
+int osd_calc_bkmap_credits(struct super_block *sb, struct inode *inode,
+                          const loff_t size, const loff_t pos,
+                          const int blocks);
+
 int osd_ldiskfs_read(struct inode *inode, void *buf, int size, loff_t *offs);
 int osd_ldiskfs_write_record(struct inode *inode, void *buf, int bufsize,
                             int write_NUL, loff_t *offs, handle_t *handle);
@@ -924,6 +1004,7 @@ struct dentry *osd_child_dentry_by_inode(const struct lu_env *env,
 
 extern int osd_trans_declare_op2rb[];
 extern int ldiskfs_track_declares_assert;
+void osd_trans_dump_creds(const struct lu_env *env, struct thandle *th);
 
 static inline void osd_trans_declare_op(const struct lu_env *env,
                                        struct osd_thandle *oh,
@@ -953,7 +1034,7 @@ static inline void osd_trans_exec_op(const struct lu_env *env,
        struct osd_thread_info *oti = osd_oti_get(env);
        struct osd_thandle     *oh  = container_of(th, struct osd_thandle,
                                                   ot_super);
-       unsigned int            rb;
+       unsigned int            rb, left;
 
        LASSERT(oh->ot_handle != NULL);
        if (unlikely(op >= OSD_OT_MAX)) {
@@ -967,58 +1048,99 @@ static inline void osd_trans_exec_op(const struct lu_env *env,
                }
        }
 
-       if (likely(!oti->oti_rollback && oti->oti_declare_ops[op] > 0)) {
-               oti->oti_declare_ops[op]--;
-               oti->oti_declare_ops_rb[op]++;
-       } else {
-               /* all future updates are considered rollback */
-               oti->oti_rollback = true;
-               rb = osd_trans_declare_op2rb[op];
-               if (unlikely(rb >= OSD_OT_MAX)) {
-                       if (unlikely(ldiskfs_track_declares_assert))
-                               LASSERTF(rb < OSD_OT_MAX, "rb = %u\n", rb);
-                       else {
-                               CWARN("%s: Invalid rollback index %d\n",
-                                     osd_name(osd_dt_dev(th->th_dev)), rb);
-                               libcfs_debug_dumpstack(NULL);
-                               return;
-                       }
-               }
-               if (unlikely(oti->oti_declare_ops_rb[rb] == 0)) {
+       /* find rollback (or reverse) operation for the given one
+        * such an operation doesn't require additional credits
+        * as the same set of blocks are modified */
+       rb = osd_trans_declare_op2rb[op];
+
+       /* check whether credits for this operation were reserved at all */
+       if (unlikely(oti->oti_declare_ops_cred[op] == 0 &&
+                    oti->oti_declare_ops_cred[rb] == 0)) {
+               /* the API is not perfect yet: CREATE does REF_ADD internally
+                * while DESTROY does not. To rollback CREATE the callers
+                * needs to call REF_DEL+DESTROY which is hard to detect using
+                * a simple table of rollback operations */
+               if (op == OSD_OT_REF_DEL &&
+                   oti->oti_declare_ops_cred[OSD_OT_CREATE] > 0)
+                       goto proceed;
+               if (op == OSD_OT_REF_ADD &&
+                   oti->oti_declare_ops_cred[OSD_OT_DESTROY] > 0)
+                       goto proceed;
+               osd_trans_dump_creds(env, th);
+               CERROR("%s: op = %d, rb = %d\n",
+                      osd_name(osd_dt_dev(oh->ot_super.th_dev)), op, rb);
+               if (unlikely(ldiskfs_track_declares_assert))
+                       LBUG();
+       }
+
+proceed:
+       /* remember how many credits we have unused before the operation */
+       oti->oti_credits_before = oh->ot_handle->h_buffer_credits;
+       left = oti->oti_declare_ops_cred[op] - oti->oti_declare_ops_used[op];
+       if (unlikely(oti->oti_credits_before < left)) {
+               osd_trans_dump_creds(env, th);
+               CERROR("%s: op = %d, rb = %d\n",
+                      osd_name(osd_dt_dev(oh->ot_super.th_dev)), op, rb);
+               /* on a very small fs (testing?) it's possible that
+                * the transaction can't fit 1/4 of journal, so we
+                * just request less credits (see osd_trans_start()).
+                * ignore the same case here */
+               rb = osd_transaction_size(osd_dt_dev(th->th_dev));
+               if (unlikely(oh->ot_credits < rb)) {
                        if (unlikely(ldiskfs_track_declares_assert))
-                               LASSERTF(oti->oti_declare_ops_rb[rb] > 0,
-                                        "rb = %u\n", rb);
-                       else {
-                               CWARN("%s: Overflow in tracking declares for "
-                                     "index, rb = %d\n",
-                                     osd_name(osd_dt_dev(th->th_dev)), rb);
-                               libcfs_debug_dumpstack(NULL);
-                               return;
-                       }
+                               LBUG();
                }
-               oti->oti_declare_ops_rb[rb]--;
        }
 }
 
-static inline void osd_trans_declare_rb(const struct lu_env *env,
-                                       struct thandle *th, unsigned int op)
+static inline void osd_trans_exec_check(const struct lu_env *env,
+                                       struct thandle *th,
+                                       unsigned int op)
 {
        struct osd_thread_info *oti = osd_oti_get(env);
        struct osd_thandle     *oh  = container_of(th, struct osd_thandle,
                                                   ot_super);
+       int                     used, over, quota;
 
-       LASSERT(oh->ot_handle != NULL);
-       if (unlikely(op >= OSD_OT_MAX)) {
-               if (unlikely(ldiskfs_track_declares_assert))
-                       LASSERT(op < OSD_OT_MAX);
-               else {
-                       CWARN("%s: Invalid operation index %d\n",
-                             osd_name(osd_dt_dev(th->th_dev)), op);
-                       libcfs_debug_dumpstack(NULL);
-               }
+       /* Post-operation accounting for operation index "op" on the
+        * transaction "th": the number of credits the operation used is
+        * the value saved in oti_credits_before minus the buffer credits
+        * still left on the journal handle. */
+       used = oti->oti_credits_before - oh->ot_handle->h_buffer_credits;
+
+       if (unlikely(used < 0)) {
+               /* If a block was allocated and released within the same
+                * transaction it is not part of the transaction, so the
+                * delta can be negative; nothing is accounted then. */
+               return;
+       }
+
+       if (used == 0) {
+               /* Rollback operations (e.g. destroying a just-created
+                * object) should not consume any credits; return early so
+                * that they do not confuse the checks below. */
+               return;
+       }
 
+       oti->oti_declare_ops_used[op] += used; /* charge the credits to this operation */
+       if (oti->oti_declare_ops_used[op] <= oti->oti_declare_ops_cred[op])
+               return; /* still within what was reserved for op */
+
+       /* Quota is accounted for the whole transaction and any operation
+        * may consume the corresponding credits, so first try to cover
+        * the excess from the unused part of the quota reservation. */
+       over = oti->oti_declare_ops_used[op] -
+               oti->oti_declare_ops_cred[op];
+       quota = oti->oti_declare_ops_cred[OSD_OT_QUOTA] -
+               oti->oti_declare_ops_used[OSD_OT_QUOTA];
+       if (over <= quota) {
+               /* The extra credits were probably consumed by quota
+                * indirectly (deep inside ldiskfs): move the excess from
+                * this operation to the quota usage counter. */
+               oti->oti_declare_ops_used[OSD_OT_QUOTA] += over;
+               oti->oti_declare_ops_used[op] -= over;
        } else {
-               oti->oti_declare_ops_rb[op]++;
+               CWARN("op %d: used %u, used now %u, reserved %u\n",
+                     op, oti->oti_declare_ops_used[op], used,
+                     oti->oti_declare_ops_cred[op]); /* prints the accumulated use for op, then the
+                                                      * use of this call; NOTE(review): op is
+                                                      * unsigned but printed with %d, and used is
+                                                      * a signed int printed with %u */
+               osd_trans_dump_creds(env, th);
+               if (unlikely(ldiskfs_track_declares_assert))
+                       LBUG(); /* overuse is fatal only when the tunable is set */
        }
 }
 
@@ -1063,7 +1185,7 @@ int osd_acct_obj_lookup(struct osd_thread_info *info, struct osd_device *osd,
 static inline int is_32bit_api(void)
 {
 #ifdef CONFIG_COMPAT
-       return is_compat_task();
+       return in_compat_syscall();
 #else
        return (BITS_PER_LONG == 32);
 #endif
@@ -1088,30 +1210,29 @@ static inline unsigned long osd_remote_parent_ino(struct osd_device *dev)
        return dev->od_mdt_map->omm_remote_parent->d_inode->i_ino;
 }
 
-#ifdef LDISKFS_HT_MISC
-# define osd_journal_start_sb(sb, type, nblock) \
-               ldiskfs_journal_start_sb(sb, type, nblock)
-# define osd_ldiskfs_append(handle, inode, nblock, err) \
-               ldiskfs_append(handle, inode, nblock)
-# define osd_ldiskfs_find_entry(dir, name, de, inlined, lock) \
-               __ldiskfs_find_entry(dir, name, de, inlined, lock)
-# define osd_journal_start(inode, type, nblocks) \
-               ldiskfs_journal_start(inode, type, nblocks)
-# define osd_transaction_size(dev) \
-               (osd_journal(dev)->j_max_transaction_buffers / 2)
+/**
+ * Call ldiskfs_bread() with whichever signature is available.
+ *
+ * ext4_bread/ldiskfs_bread takes either 4 or 5 parameters: as of kernel
+ * 3.18 the separate error return argument has been removed and the error
+ * is encoded in the returned pointer instead.
+ *
+ * With HAVE_EXT4_BREAD_4ARGS the 4-argument form is called and its result
+ * is returned unchanged. Otherwise the 5-argument form is called, and a
+ * NULL result accompanied by a non-zero error is converted to
+ * ERR_PTR(error); a NULL result with error == 0 is returned as NULL.
+ *
+ * The handle, inode, block and create arguments are forwarded to
+ * ldiskfs_bread() unmodified.
+ */
+static inline struct buffer_head *__ldiskfs_bread(handle_t *handle,
+                                                 struct inode *inode,
+                                                 ldiskfs_lblk_t block,
+                                                 int create)
+{
+#ifdef HAVE_EXT4_BREAD_4ARGS
+       return ldiskfs_bread(handle, inode, block, create);
 #else /* !HAVE_EXT4_BREAD_4ARGS */
-# define LDISKFS_HT_MISC       0
-# define osd_journal_start_sb(sb, type, nblock) \
-               ldiskfs_journal_start_sb(sb, nblock)
-# define osd_ldiskfs_append(handle, inode, nblock, err) \
-               ldiskfs_append(handle, inode, nblock, err)
-# define osd_ldiskfs_find_entry(dir, name, de, inlined, lock) \
-               __ldiskfs_find_entry(dir, name, de, lock)
-# define osd_journal_start(inode, type, nblocks) \
-               ldiskfs_journal_start(inode, nblocks)
-# define osd_transaction_size(dev) \
-               (osd_journal(dev)->j_max_transaction_buffers)
+       struct buffer_head *bh;
+       int error = 0;
+
+       bh = ldiskfs_bread(handle, inode, block, create, &error);
+       if (bh == NULL && error != 0) /* fold the out-parameter into the pointer */
+               bh = ERR_PTR(error);
+
+       return bh;
 #endif /* HAVE_EXT4_BREAD_4ARGS */
+}
 
 void ldiskfs_inc_count(handle_t *handle, struct inode *inode);
 void ldiskfs_dec_count(handle_t *handle, struct inode *inode);