* Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
* Use is subject to license terms.
*
- * Copyright (c) 2012, 2016, Intel Corporation.
+ * Copyright (c) 2012, 2017, Intel Corporation.
*/
/*
* This file is part of Lustre, http://www.lustre.org/
#include <lustre_fid.h>
#include <lustre_update.h>
#include <lu_target.h>
-#include <lustre_mdc.h>
/*
* Infrastructure to support tracking of last committed llog record
/*
* Precreation pool
*/
- spinlock_t osp_pre_lock;
/* last fid to assign in creation */
struct lu_fid osp_pre_used_fid;
__u64 ou_generation;
};
+struct osp_rpc_lock {
+ /** Lock protecting in-flight RPC concurrency. */
+ struct mutex rpcl_mutex;
+ /** Used for MDS/RPC load testing purposes. */
+ unsigned int rpcl_fakes;
+};
+
struct osp_device {
struct dt_device opd_dt_dev;
/* corresponding OST index */
* and required le64_to_cpu() conversion before use.
* Protected by opd_pre_lock */
struct lu_fid opd_last_used_fid;
+ /* on-disk copy of last_used_fid.f_oid or IDIF */
+ u64 opd_last_id;
struct lu_fid opd_gap_start_fid;
int opd_gap_count;
/* connection to OST */
+ struct osp_rpc_lock opd_rpc_lock;
struct obd_device *opd_obd;
struct obd_export *opd_exp;
- struct obd_uuid opd_cluuid;
struct obd_connect_data *opd_connect_data;
int opd_connects;
- struct proc_dir_entry *opd_proc_entry;
- struct lprocfs_stats *opd_stats;
/* connection status. */
unsigned int opd_new_connection:1,
opd_got_disconnected:1,
struct osp_precreate *opd_pre;
/* dedicated precreate thread */
struct ptlrpc_thread opd_pre_thread;
+ spinlock_t opd_pre_lock;
/* thread waits for signals about pool going empty */
wait_queue_head_t opd_pre_waitq;
atomic_t opd_sync_barrier;
wait_queue_head_t opd_sync_barrier_waitq;
/* last generated id */
- cfs_time_t opd_sync_next_commit_cb;
+ ktime_t opd_sync_next_commit_cb;
atomic_t opd_commits_registered;
/*
* statfs related fields: OSP maintains it on its own
*/
struct obd_statfs opd_statfs;
- cfs_time_t opd_statfs_fresh_till;
- struct timer_list opd_statfs_timer;
+ ktime_t opd_statfs_fresh_till;
+ struct timer_list opd_statfs_timer;
int opd_statfs_update_in_progress;
/* how often to update statfs data */
- int opd_statfs_maxage;
+ time64_t opd_statfs_maxage;
- struct proc_dir_entry *opd_symlink;
+ struct dentry *opd_debugfs;
/* If the caller wants to do some idempotent async operations on the
 * remote server, it can append the async remote requests to the
int opd_reserved_mb_low;
};
-#define opd_pre_lock opd_pre->osp_pre_lock
#define opd_pre_used_fid opd_pre->osp_pre_used_fid
#define opd_pre_last_created_fid opd_pre->osp_pre_last_created_fid
#define opd_pre_reserved opd_pre->osp_pre_reserved
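+
+/*
+ * A minimal sketch of how the cached-statfs fields above fit together
+ * (illustrative only; osp_statfs_cached() is a hypothetical helper, not
+ * part of this patch): results are served from opd_statfs while
+ * opd_statfs_fresh_till has not expired, and refreshed at most every
+ * opd_statfs_maxage seconds.
+ *
+ *	static inline bool osp_statfs_cached(struct osp_device *d)
+ *	{
+ *		return ktime_before(ktime_get(), d->opd_statfs_fresh_till);
+ *	}
+ */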
struct list_head opo_invalidate_cb_list;
/* Protect opo_ooa. */
spinlock_t opo_lock;
+ /* to implement in-flight invalidation */
+ atomic_t opo_invalidate_seq;
+ struct rw_semaphore opo_invalidate_sem;
};
extern struct lu_object_operations osp_lu_obj_ops;
struct lu_attr osi_attr;
struct ost_id osi_oi;
struct ost_id osi_oi2;
- u64 osi_id;
loff_t osi_off;
union {
struct llog_rec_hdr osi_hdr;
return osp->opd_dt_dev.dd_lu_dev.ld_site->ld_seq_site;
}
-#define osp_init_rpc_lock(lck) mdc_init_rpc_lock(lck)
+/**
+ * Serializes in-flight MDT-modifying RPC requests to preserve idempotency.
+ *
+ * This mutex is used to implement execute-once semantics on the MDT.
+ * The MDT stores the last transaction ID and result for every client in
+ * its last_rcvd file. If the client doesn't get a reply, it can safely
+ * resend the request and the MDT will reconstruct the reply, aware
+ * that the request has already been executed. Without this lock, the
+ * execution status of concurrent in-flight requests could overwrite
+ * one another in that file.
+ *
+ * This implementation limits the extent to which a single client can
+ * keep a full pipeline of in-flight requests. The limitation can be
+ * overcome by allowing multiple slots per client in the last_rcvd
+ * file; see LU-6864.
+ */
+#define OSP_FAKE_RPCL_IT ((void *)0x2c0012bfUL)
+
+static inline void osp_init_rpc_lock(struct osp_device *osp)
+{
+ struct osp_rpc_lock *lck = &osp->opd_rpc_lock;
+
+ mutex_init(&lck->rpcl_mutex);
+ lck->rpcl_fakes = 0;
+}
static inline void osp_get_rpc_lock(struct osp_device *osp)
{
- struct mdc_rpc_lock *rpc_lock = osp->opd_obd->u.cli.cl_rpc_lock;
+ struct osp_rpc_lock *lck = &osp->opd_rpc_lock;
+
+ /* This would normally block until the existing request finishes.
+ * If fail_loc is set it will block until the regular request is
+ * done, then increment rpcl_fakes. Once rpcl_fakes is non-zero it
+ * is only cleared when all fake requests have finished; normal
+ * requests cannot be sent until then, so that they remain
+ * recoverable.
+ */
+ again:
+ mutex_lock(&lck->rpcl_mutex);
+
+ if (CFS_FAIL_CHECK_QUIET(OBD_FAIL_MDC_RPCS_SEM) ||
+ CFS_FAIL_CHECK_QUIET(OBD_FAIL_OSP_RPCS_SEM)) {
+ lck->rpcl_fakes++;
+ mutex_unlock(&lck->rpcl_mutex);
- mdc_get_rpc_lock(rpc_lock, NULL);
+ return;
+ }
+
+ /* This will only happen when the CFS_FAIL_CHECK() was just turned
+ * off but there are still requests in progress. Wait until they
+ * finish. This doesn't need to be efficient in this extremely rare
+ * case, only low-overhead in the common case where the check is false.
+ */
+ if (unlikely(lck->rpcl_fakes)) {
+ mutex_unlock(&lck->rpcl_mutex);
+ schedule_timeout_uninterruptible(cfs_time_seconds(1) / 4);
+
+ goto again;
+ }
}
static inline void osp_put_rpc_lock(struct osp_device *osp)
{
- struct mdc_rpc_lock *rpc_lock = osp->opd_obd->u.cli.cl_rpc_lock;
+ struct osp_rpc_lock *lck = &osp->opd_rpc_lock;
+
+ if (lck->rpcl_fakes) { /* OBD_FAIL_OSP_RPCS_SEM */
+ mutex_lock(&lck->rpcl_mutex);
- mdc_put_rpc_lock(rpc_lock, NULL);
+ if (lck->rpcl_fakes) /* check again under lock */
+ lck->rpcl_fakes--;
+ }
+
+ mutex_unlock(&lck->rpcl_mutex);
}
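+
+/*
+ * Minimal usage sketch for the lock above; osp_send_modify_rpc() is a
+ * hypothetical caller-side helper, not part of this file:
+ *
+ *	osp_get_rpc_lock(osp);
+ *	rc = osp_send_modify_rpc(env, osp);
+ *	osp_put_rpc_lock(osp);
+ *
+ * Holding the mutex across the send keeps at most one modifying RPC in
+ * flight per OSP, so the single last_rcvd slot on the MDT is enough to
+ * reconstruct a lost reply on resend.
+ */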
static inline int osp_fid_diff(const struct lu_fid *fid1,
fid_idif_id(fid2->f_seq, fid2->f_oid, 0);
}
- LASSERTF(fid_seq(fid1) == fid_seq(fid2), "fid1:"DFID
- ", fid2:"DFID"\n", PFID(fid1), PFID(fid2));
+ LASSERTF(fid_seq(fid1) == fid_seq(fid2), "fid1:"DFID", fid2:"DFID"\n",
+ PFID(fid1), PFID(fid2));
return fid_oid(fid1) - fid_oid(fid2);
}
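+
+/*
+ * Example (illustrative values): two fids in the same sequence with
+ * f_oid 105 and 100 give osp_fid_diff() == 5; IDIF fids are first
+ * normalized with fid_idif_id() so the delta is taken in one id space.
+ */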
+static inline void osp_fid_to_obdid(struct lu_fid *last_fid, u64 *osi_id)
+{
+ if (fid_is_idif(last_fid))
+ *osi_id = fid_idif_id(fid_seq(last_fid), fid_oid(last_fid),
+ fid_ver(last_fid));
+ else
+ *osi_id = fid_oid(last_fid);
+}
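+
+/*
+ * For a regular OST fid the on-disk object id is just f_oid; for an
+ * IDIF fid the 48-bit object id packed into f_seq/f_oid is reassembled
+ * by fid_idif_id(). Either way *osi_id holds the id in the OST's
+ * traditional objid namespace.
+ */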
static inline void osp_update_last_fid(struct osp_device *d, struct lu_fid *fid)
{
int diff = osp_fid_diff(fid, &d->opd_last_used_fid);
struct lu_fid *gap_start = &d->opd_gap_start_fid;
+
/*
* we might have lost precreated objects due to VBR and precreate
* orphans, the gap in objid can be calculated properly only here
PFID(&d->opd_gap_start_fid), d->opd_gap_count);
}
d->opd_last_used_fid = *fid;
+ osp_fid_to_obdid(fid, &d->opd_last_id);
}
}
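+
+/*
+ * Gap example (illustrative): if opd_last_used_fid.f_oid is 100 and the
+ * newly reported fid has f_oid 105, objects 101..104 were never seen
+ * (e.g. lost to VBR or orphan cleanup); that range is remembered via
+ * opd_gap_start_fid/opd_gap_count for later handling.
+ */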
int osp_object_update_request_create(struct osp_update_request *our,
size_t size);
-#define osp_update_rpc_pack(env, name, our, op, ...) \
+#define OSP_UPDATE_RPC_PACK(env, out_something_pack, our, ...) \
({ \
- struct object_update *object_update; \
- size_t max_update_length; \
+ struct object_update *object_update; \
+ size_t max_update_length; \
struct osp_update_request_sub *ours; \
int ret; \
\
\
object_update = update_buffer_get_update(ours->ours_req,\
ours->ours_req->ourq_count); \
- ret = out_##name##_pack(env, object_update, \
- &max_update_length, \
- __VA_ARGS__); \
+ ret = out_something_pack(env, object_update, \
+ &max_update_length, \
+ __VA_ARGS__); \
if (ret == -E2BIG) { \
int rc1; \
/* Create new object update request */ \
/* osp_dev.c */
void osp_update_last_id(struct osp_device *d, u64 objid);
-extern struct llog_operations osp_mds_ost_orig_logops;
/* osp_trans.c */
int osp_insert_async_request(const struct lu_env *env, enum update_type op,
int osp_write_last_oid_seq_files(struct lu_env *env, struct osp_device *osp,
struct lu_fid *fid, int sync);
int osp_init_pre_fid(struct osp_device *osp);
+int osp_init_statfs(struct osp_device *osp);
+void osp_fini_statfs(struct osp_device *osp);
+void osp_statfs_fini(struct osp_device *d);
/* lproc_osp.c */
-void osp_lprocfs_init(struct osp_device *osp);
+void osp_tunables_init(struct osp_device *osp);
+void osp_tunables_fini(struct osp_device *osp);
/* osp_sync.c */
int osp_sync_declare_add(const struct lu_env *env, struct osp_object *o,
- llog_op_type type, struct thandle *th);
+ enum llog_op_type type, struct thandle *th);
int osp_sync_add(const struct lu_env *env, struct osp_object *o,
- llog_op_type type, struct thandle *th,
+ enum llog_op_type type, struct thandle *th,
const struct lu_attr *attr);
int osp_sync_init(const struct lu_env *env, struct osp_device *d);
int osp_sync_fini(struct osp_device *d);
void osp_sync_check_for_work(struct osp_device *osp);
void osp_sync_force(const struct lu_env *env, struct osp_device *d);
-int osp_sync_add_commit_cb(const struct lu_env *env, struct osp_device *d,
- struct thandle *th);
int osp_sync_add_commit_cb_1s(const struct lu_env *env, struct osp_device *d,
struct thandle *th);