LU-1187 osp: add osp_md_object for remote directory.

[fs/lustre-release.git] / lustre / include / cl_object.h
diff --git a/lustre/include/cl_object.h b/lustre/include/cl_object.h

index 3d44b1c..087e251 100644 (file)
--- a/lustre/include/cl_object.h
+++ b/lustre/include/cl_object.h
@@ -1,6 +1,4 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
+/*
   * GPL HEADER START
   *
   * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
@@ -29,8 +27,7 @@
   * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
   * Use is subject to license terms.
   *
- * Copyright (c) 2011 Whamcloud, Inc.
- *
+ * Copyright (c) 2011, 2012, Intel Corporation.
   */
  /*
   * This file is part of Lustre, http://www.lustre.org/
@@ -254,6 +251,8 @@ struct cl_object {
          struct lu_object                   co_lu;
          /** per-object-layer operations */
          const struct cl_object_operations *co_ops;
+       /** offset of page slice in cl_page buffer */
+       int                                co_slice_off;
  };
  
  /**
@@ -279,6 +278,26 @@ struct cl_object_conf {
           * VFS inode. This is consumed by vvp.
           */
          struct inode             *coc_inode;
+       /**
+        * Layout lock handle.
+        */
+       struct ldlm_lock         *coc_lock;
+       /**
+        * Operation to handle layout, OBJECT_CONF_XYZ.
+        */
+       int                       coc_opc;
+};
+
+enum {
+       /** configure layout, set up a new stripe, must be called while
+        * holding layout lock. */
+       OBJECT_CONF_SET = 0,
+       /** invalidate the current stripe configuration due to losing
+        * layout lock. */
+       OBJECT_CONF_INVALIDATE = 1,
+       /** wait for old layout to go away so that new layout can be
+        * set up. */
+       OBJECT_CONF_WAIT = 2
  };
  
  /**
@@ -302,10 +321,8 @@ struct cl_object_operations {
           * \retval valid-pointer pointer to already existing referenced page
           *         to be used instead of newly created.
           */
-        struct cl_page *(*coo_page_init)(const struct lu_env *env,
-                                         struct cl_object *obj,
-                                         struct cl_page *page,
-                                         cfs_page_t *vmpage);
+       int  (*coo_page_init)(const struct lu_env *env, struct cl_object *obj,
+                               struct cl_page *page, cfs_page_t *vmpage);
          /**
           * Initialize lock slice for this layer. Called top-to-bottom through
           * every object layer when a new cl_lock is instantiated. Layer
@@ -386,9 +403,9 @@ struct cl_object_header {
           */
          /** @{ */
          /** Lock protecting page tree. */
-        cfs_spinlock_t           coh_page_guard;
-        /** Lock protecting lock list. */
-        cfs_spinlock_t           coh_lock_guard;
+       spinlock_t               coh_page_guard;
+       /** Lock protecting lock list. */
+       spinlock_t               coh_lock_guard;
          /** @} locks */
          /** Radix tree of cl_page's, cached for this object. */
          struct radix_tree_root   coh_tree;
@@ -412,12 +429,16 @@ struct cl_object_header {
           *
           * \todo XXX this can be read/write lock if needed.
           */
-        cfs_spinlock_t           coh_attr_guard;
-        /**
-         * Number of objects above this one: 0 for a top-object, 1 for its
-         * sub-object, etc.
-         */
-        unsigned                 coh_nesting;
+       spinlock_t               coh_attr_guard;
+       /**
+        * Size of cl_page + page slices
+        */
+       unsigned short           coh_page_bufsize;
+       /**
+        * Number of objects above this one: 0 for a top-object, 1 for its
+        * sub-object, etc.
+        */
+       unsigned char            coh_nesting;
  };
  
  /**
@@ -716,12 +737,10 @@ struct cl_page {
           * modified only internally within cl_page.c. Protected by a VM lock.
           */
          const enum cl_page_state cp_state;
-        /**
-         * Linkage of pages within some group. Protected by
-         * cl_page::cp_mutex. */
-        cfs_list_t               cp_batch;
-        /** Mutex serializing membership of a page in a batch. */
-        cfs_mutex_t              cp_mutex;
+       /** Linkage of pages within group. Protected by cl_page::cp_mutex. */
+       cfs_list_t              cp_batch;
+       /** Mutex serializing membership of a page in a batch. */
+       struct mutex            cp_mutex;
          /** Linkage of pages within cl_req. */
          cfs_list_t               cp_flight;
          /** Transfer error. */
@@ -1057,6 +1076,15 @@ struct cl_page_operations {
           */
          int (*cpo_cancel)(const struct lu_env *env,
                            const struct cl_page_slice *slice);
+       /**
+        * Write out a page by kernel. This is only called by ll_writepage
+        * right now.
+        *
+        * \see cl_page_flush()
+        */
+       int (*cpo_flush)(const struct lu_env *env,
+                        const struct cl_page_slice *slice,
+                        struct cl_io *io);
          /** @} transfer */
  };
  
@@ -1065,10 +1093,10 @@ struct cl_page_operations {
   */
  #define CL_PAGE_DEBUG(mask, env, page, format, ...)                     \
  do {                                                                    \
-        static DECLARE_LU_CDEBUG_PRINT_INFO(__info, mask);              \
+        LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL);                \
                                                                          \
          if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) {                   \
-                cl_page_print(env, &__info, lu_cdebug_printer, page);   \
+                cl_page_print(env, &msgdata, lu_cdebug_printer, page);  \
                  CDEBUG(mask, format , ## __VA_ARGS__);                  \
          }                                                               \
  } while (0)
@@ -1076,16 +1104,26 @@ do {                                                                    \
  /**
   * Helper macro, dumping shorter information about \a page into a log.
   */
-#define CL_PAGE_HEADER(mask, env, page, format, ...)                    \
-do {                                                                    \
-        static DECLARE_LU_CDEBUG_PRINT_INFO(__info, mask);              \
-                                                                        \
-        if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) {                   \
-                cl_page_header_print(env, &__info, lu_cdebug_printer, page); \
-                CDEBUG(mask, format , ## __VA_ARGS__);                  \
-        }                                                               \
+#define CL_PAGE_HEADER(mask, env, page, format, ...)                          \
+do {                                                                          \
+        LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL);                      \
+                                                                              \
+        if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) {                         \
+                cl_page_header_print(env, &msgdata, lu_cdebug_printer, page); \
+                CDEBUG(mask, format , ## __VA_ARGS__);                        \
+        }                                                                     \
  } while (0)
  
+static inline int __page_in_use(const struct cl_page *page, int refc)
+{
+       if (page->cp_type == CPT_CACHEABLE)
+               ++refc;
+       LASSERT(cfs_atomic_read(&page->cp_ref) > 0);
+       return (cfs_atomic_read(&page->cp_ref) > refc);
+}
+#define cl_page_in_use(pg)       __page_in_use(pg, 1)
+#define cl_page_in_use_noref(pg) __page_in_use(pg, 0)
+
  /** @} cl_page */
  
  /** \addtogroup cl_lock cl_lock
@@ -1443,7 +1481,9 @@ enum cl_lock_flags {
          /** cancellation is pending for this lock. */
          CLF_CANCELPEND = 1 << 1,
          /** destruction is pending for this lock. */
-        CLF_DOOMED     = 1 << 2
+        CLF_DOOMED     = 1 << 2,
+        /** from enqueue RPC reply upcall. */
+        CLF_FROM_UPCALL= 1 << 3,
  };
  
  /**
@@ -1537,7 +1577,7 @@ struct cl_lock {
           *
           * \see osc_lock_enqueue_wait(), lov_lock_cancel(), lov_sublock_wait().
           */
-        cfs_mutex_t           cll_guard;
+       struct mutex            cll_guard;
          cfs_task_t           *cll_guarder;
          int                   cll_depth;
  
@@ -1616,9 +1656,11 @@ struct cl_lock_slice {
   */
  enum cl_lock_transition {
          /** operation cannot be completed immediately. Wait for state change. */
-        CLO_WAIT   = 1,
+        CLO_WAIT        = 1,
          /** operation had to release lock mutex, restart. */
-        CLO_REPEAT = 2
+        CLO_REPEAT      = 2,
+        /** lower layer re-enqueued. */
+        CLO_REENQUEUED  = 3,
  };
  
  /**
@@ -1789,14 +1831,22 @@ struct cl_lock_operations {
  
  #define CL_LOCK_DEBUG(mask, env, lock, format, ...)                     \
  do {                                                                    \
-        static DECLARE_LU_CDEBUG_PRINT_INFO(__info, mask);              \
+        LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL);                \
                                                                          \
          if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) {                   \
-                cl_lock_print(env, &__info, lu_cdebug_printer, lock);   \
+                cl_lock_print(env, &msgdata, lu_cdebug_printer, lock);  \
                  CDEBUG(mask, format , ## __VA_ARGS__);                  \
          }                                                               \
  } while (0)
  
+#define CL_LOCK_ASSERT(expr, env, lock) do {                            \
+       if (likely(expr))                                               \
+               break;                                                  \
+                                                                       \
+       CL_LOCK_DEBUG(D_ERROR, env, lock, "failed at %s.\n", #expr);    \
+       LBUG();                                                         \
+} while (0)
+
  /** @} cl_lock */
  
  /** \addtogroup cl_page_list cl_page_list
@@ -1900,6 +1950,11 @@ enum cl_io_type {
           */
          CIT_FAULT,
          /**
+        * fsync system call handling.
+        * Writes out a range of a file.
+        */
+       CIT_FSYNC,
+       /**
           * Miscellaneous io. This is used for occasional io activity that
           * doesn't fit into other types. Currently this is used for:
           *
@@ -1946,11 +2001,6 @@ enum cl_io_state {
          CIS_FINI
  };
  
-enum cl_req_priority {
-        CRP_NORMAL,
-        CRP_CANCEL
-};
-
  /**
   * IO state private for a layer.
   *
@@ -2068,8 +2118,7 @@ struct cl_io_operations {
                  int  (*cio_submit)(const struct lu_env *env,
                                     const struct cl_io_slice *slice,
                                     enum cl_req_type crt,
-                                   struct cl_2queue *queue,
-                                   enum cl_req_priority priority);
+                                  struct cl_2queue *queue);
          } req_op[CRT_NR];
          /**
           * Read missing page.
@@ -2155,9 +2204,13 @@ enum cl_enq_flags {
           */
          CEF_NEVER        = 0x00000010,
          /**
+         * for async glimpse lock.
+         */
+        CEF_AGL          = 0x00000020,
+        /**
           * mask of enq_flags.
           */
-        CEF_MASK         = 0x0000001f
+        CEF_MASK         = 0x0000003f,
  };
  
  /**
@@ -2227,6 +2280,18 @@ enum cl_io_lock_dmd {
          CILR_PEEK
  };
  
+enum cl_fsync_mode {
+       /** start writeback, do not wait for it to finish */
+       CL_FSYNC_NONE  = 0,
+       /** start writeback and wait for it to finish */
+       CL_FSYNC_LOCAL = 1,
+       /** discard all dirty pages in a specific file range */
+       CL_FSYNC_DISCARD = 2,
+       /** start writeback and make sure the pages have reached storage before
+        * returning. OST_SYNC RPC must be issued and finished */
+       CL_FSYNC_ALL   = 3
+};
+
  struct cl_io_rw_common {
          loff_t      crw_pos;
          size_t      crw_count;
@@ -2261,11 +2326,6 @@ struct cl_io {
          struct cl_lockset              ci_lockset;
          /** lock requirements, this is just a help info for sublayers. */
          enum cl_io_lock_dmd            ci_lockreq;
-        /**
-         * This io has held grouplock, to inform sublayers that
-         * don't do lockless i/o.
-         */
-        int                            ci_no_srvlock;
          union {
                  struct cl_rd_io {
                          struct cl_io_rw_common rd;
@@ -2273,6 +2333,7 @@ struct cl_io {
                  struct cl_wr_io {
                          struct cl_io_rw_common wr;
                          int                    wr_append;
+                       int                    wr_sync;
                  } ci_wr;
                  struct cl_io_rw_common ci_rw;
                  struct cl_setattr_io {
@@ -2294,11 +2355,44 @@ struct cl_io {
                          /** resulting page */
                          struct cl_page *ft_page;
                  } ci_fault;
+               struct cl_fsync_io {
+                       loff_t             fi_start;
+                       loff_t             fi_end;
+                       struct obd_capa   *fi_capa;
+                       /** file system level fid */
+                       struct lu_fid     *fi_fid;
+                       enum cl_fsync_mode fi_mode;
+                       /* how many pages were written/discarded */
+                       unsigned int       fi_nr_written;
+               } ci_fsync;
          } u;
          struct cl_2queue     ci_queue;
          size_t               ci_nob;
          int                  ci_result;
-        int                  ci_continue;
+       unsigned int         ci_continue:1,
+       /**
+        * This io has held grouplock, to inform sublayers that
+        * don't do lockless i/o.
+        */
+                            ci_no_srvlock:1,
+       /**
+        * The whole IO need to be restarted because layout has been changed
+        */
+                            ci_need_restart:1,
+       /**
+        * to not refresh layout - the IO issuer knows that the layout won't
+        * change (page operations, layout change causes all pages to be
+        * discarded), or it doesn't matter if it changes (sync).
+        */
+                            ci_ignore_layout:1,
+       /**
+        * Check if layout changed after the IO finishes. Mainly for HSM
+        * requirement. If IO occurs to open files, it doesn't need to
+        * verify layout because HSM won't release open files.
+        * Right now, only two operations need to verify layout: glimpse
+        * and setattr.
+        */
+                            ci_verify_layout:1;
          /**
           * Number of pages owned by this IO. For invariant checking.
           */
@@ -2373,10 +2467,12 @@ struct cl_io {
   * Per-transfer attributes.
   */
  struct cl_req_attr {
-        /** Generic attributes for the server consumption. */
-        struct obdo     *cra_oa;
-        /** Capability. */
-        struct obd_capa *cra_capa;
+       /** Generic attributes for the server consumption. */
+       struct obdo     *cra_oa;
+       /** Capability. */
+       struct obd_capa *cra_capa;
+       /** Jobid */
+       char             cra_jobid[JOBSTATS_JOBID_SIZE];
  };
  
  /**
@@ -2479,22 +2575,29 @@ struct cl_req_slice {
  
  /* @} cl_req */
  
+enum cache_stats_item {
+       /** how many cache lookups were performed */
+       CS_lookup = 0,
+       /** how many times cache lookup resulted in a hit */
+       CS_hit,
+       /** how many entities are in the cache right now */
+       CS_total,
+       /** how many entities in the cache are actively used (and cannot be
+        * evicted) right now */
+       CS_busy,
+       /** how many entities were created at all */
+       CS_create,
+       CS_NR
+};
+
+#define CS_NAMES { "lookup", "hit", "total", "busy", "create" }
+
  /**
   * Stats for a generic cache (similar to inode, lu_object, etc. caches).
   */
  struct cache_stats {
          const char    *cs_name;
-        /** how many entities were created at all */
-        cfs_atomic_t   cs_created;
-        /** how many cache lookups were performed */
-        cfs_atomic_t   cs_lookup;
-        /** how many times cache lookup resulted in a hit */
-        cfs_atomic_t   cs_hit;
-        /** how many entities are in the cache right now */
-        cfs_atomic_t   cs_total;
-        /** how many entities in the cache are actively used (and cannot be
-         * evicted) right now */
-        cfs_atomic_t   cs_busy;
+        cfs_atomic_t   cs_stats[CS_NR];
  };
  
  /** These are not exported so far */
@@ -2657,21 +2760,37 @@ static inline int cl_object_same(struct cl_object *o0, struct cl_object *o1)
          return cl_object_header(o0) == cl_object_header(o1);
  }
  
+static inline void cl_object_page_init(struct cl_object *clob, int size)
+{
+       clob->co_slice_off = cl_object_header(clob)->coh_page_bufsize;
+       cl_object_header(clob)->coh_page_bufsize += ALIGN(size, 8);
+}
+
+static inline void *cl_object_page_slice(struct cl_object *clob,
+                                        struct cl_page *page)
+{
+       return (void *)((char *)page + clob->co_slice_off);
+}
+
  /** @} cl_object */
  
  /** \defgroup cl_page cl_page
   * @{ */
  enum {
          CLP_GANG_OKAY = 0,
+        CLP_GANG_RESCHED,
          CLP_GANG_AGAIN,
-        CLP_GANG_RESCHED
+        CLP_GANG_ABORT
  };
  
+/* callback of cl_page_gang_lookup() */
+typedef int   (*cl_page_gang_cb_t)  (const struct lu_env *, struct cl_io *,
+                                     struct cl_page *, void *);
  int             cl_page_gang_lookup (const struct lu_env *env,
                                       struct cl_object *obj,
                                       struct cl_io *io,
                                       pgoff_t start, pgoff_t end,
-                                     struct cl_page_list *plist);
+                                     cl_page_gang_cb_t cb, void *cbdata);
  struct cl_page *cl_page_lookup      (struct cl_object_header *hdr,
                                       pgoff_t index);
  struct cl_page *cl_page_find        (const struct lu_env *env,
@@ -2738,6 +2857,8 @@ int  cl_page_cache_add  (const struct lu_env *env, struct cl_io *io,
  void cl_page_clip       (const struct lu_env *env, struct cl_page *pg,
                           int from, int to);
  int  cl_page_cancel     (const struct lu_env *env, struct cl_page *page);
+int  cl_page_flush      (const struct lu_env *env, struct cl_io *io,
+                        struct cl_page *pg);
  
  /** @} transfer */
  
@@ -2784,9 +2905,20 @@ struct cl_lock *cl_lock_peek(const struct lu_env *env, const struct cl_io *io,
  struct cl_lock *cl_lock_request(const struct lu_env *env, struct cl_io *io,
                                  const struct cl_lock_descr *need,
                                  const char *scope, const void *source);
-struct cl_lock *cl_lock_at_page(const struct lu_env *env, struct cl_object *obj,
-                                struct cl_page *page, struct cl_lock *except,
-                                int pending, int canceld);
+struct cl_lock *cl_lock_at_pgoff(const struct lu_env *env,
+                                struct cl_object *obj, pgoff_t index,
+                                struct cl_lock *except, int pending,
+                                int canceld);
+static inline struct cl_lock *cl_lock_at_page(const struct lu_env *env,
+                                             struct cl_object *obj,
+                                             struct cl_page *page,
+                                             struct cl_lock *except,
+                                             int pending, int canceld)
+{
+       LASSERT(cl_object_header(obj) == cl_object_header(page->cp_obj));
+       return cl_lock_at_pgoff(env, obj, page->cp_index, except,
+                               pending, canceld);
+}
  
  const struct cl_lock_slice *cl_lock_at(const struct cl_lock *lock,
                                         const struct lu_device_type *dtype);
@@ -2796,12 +2928,14 @@ void  cl_lock_get_trust (struct cl_lock *lock);
  void  cl_lock_put       (const struct lu_env *env, struct cl_lock *lock);
  void  cl_lock_hold_add  (const struct lu_env *env, struct cl_lock *lock,
                           const char *scope, const void *source);
+void cl_lock_hold_release(const struct lu_env *env, struct cl_lock *lock,
+                         const char *scope, const void *source);
  void  cl_lock_unhold    (const struct lu_env *env, struct cl_lock *lock,
                           const char *scope, const void *source);
  void  cl_lock_release   (const struct lu_env *env, struct cl_lock *lock,
                           const char *scope, const void *source);
  void  cl_lock_user_add  (const struct lu_env *env, struct cl_lock *lock);
-int   cl_lock_user_del  (const struct lu_env *env, struct cl_lock *lock);
+void  cl_lock_user_del  (const struct lu_env *env, struct cl_lock *lock);
  
  enum cl_lock_state cl_lock_intransit(const struct lu_env *env,
                                       struct cl_lock *lock);
@@ -2868,8 +3002,7 @@ int  cl_lock_mutex_try  (const struct lu_env *env, struct cl_lock *lock);
  void cl_lock_mutex_put  (const struct lu_env *env, struct cl_lock *lock);
  int  cl_lock_is_mutexed (struct cl_lock *lock);
  int  cl_lock_nr_mutexed (const struct lu_env *env);
-int  cl_lock_page_out   (const struct lu_env *env, struct cl_lock *lock,
-                         int discard);
+int  cl_lock_discard_pages(const struct lu_env *env, struct cl_lock *lock);
  int  cl_lock_ext_match  (const struct cl_lock_descr *has,
                           const struct cl_lock_descr *need);
  int  cl_lock_descr_match(const struct cl_lock_descr *has,
@@ -2927,11 +3060,10 @@ int   cl_io_prepare_write(const struct lu_env *env, struct cl_io *io,
  int   cl_io_commit_write (const struct lu_env *env, struct cl_io *io,
                            struct cl_page *page, unsigned from, unsigned to);
  int   cl_io_submit_rw    (const struct lu_env *env, struct cl_io *io,
-                          enum cl_req_type iot, struct cl_2queue *queue,
-                          enum cl_req_priority priority);
+                         enum cl_req_type iot, struct cl_2queue *queue);
  int   cl_io_submit_sync  (const struct lu_env *env, struct cl_io *io,
-                          enum cl_req_type iot, struct cl_2queue *queue,
-                          enum cl_req_priority priority, long timeout);
+                         enum cl_req_type iot, struct cl_2queue *queue,
+                         long timeout);
  void  cl_io_rw_advance   (const struct lu_env *env, struct cl_io *io,
                            size_t nob);
  int   cl_io_cancel       (const struct lu_env *env, struct cl_io *io,
@@ -2946,6 +3078,16 @@ static inline int cl_io_is_append(const struct cl_io *io)
          return io->ci_type == CIT_WRITE && io->u.ci_wr.wr_append;
  }
  
+static inline int cl_io_is_sync_write(const struct cl_io *io)
+{
+       return io->ci_type == CIT_WRITE && io->u.ci_wr.wr_sync;
+}
+
+static inline int cl_io_is_mkwrite(const struct cl_io *io)
+{
+       return io->ci_type == CIT_FAULT && io->u.ci_fault.ft_mkwrite;
+}
+
  /**
   * True, iff \a io is a truncate(2).
   */
@@ -3051,12 +3193,14 @@ void cl_req_completion(const struct lu_env *env, struct cl_req *req, int ioret);
   * anchor and wakes up waiting thread when transfer is complete.
   */
  struct cl_sync_io {
-        /** number of pages yet to be transferred. */
-        cfs_atomic_t          csi_sync_nr;
-        /** completion to be signaled when transfer is complete. */
-        cfs_waitq_t          csi_waitq;
-        /** error code. */
-        int                   csi_sync_rc;
+       /** number of pages yet to be transferred. */
+       cfs_atomic_t            csi_sync_nr;
+       /** error code. */
+       int                     csi_sync_rc;
+       /** barrier guarding destruction of this structure */
+       cfs_atomic_t            csi_barrier;
+       /** completion to be signaled when transfer is complete. */
+       cfs_waitq_t             csi_waitq;
  };
  
  void cl_sync_io_init(struct cl_sync_io *anchor, int nrpages);