X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Finclude%2Flustre_mdc.h;h=de7bc748355e751ebadb5409a4877b27fc6f98cb;hb=6712478e79588e73e28c7ccac3afc7ac2368a4f3;hp=a30c2ecf4946a611113db93c08bc7efbaf2a4aaa;hpb=6e3ec5812ebd1b5ecf7cae584f429b013ffe7431;p=fs%2Flustre-release.git

diff --git a/lustre/include/lustre_mdc.h b/lustre/include/lustre_mdc.h
index a30c2ec..de7bc74 100644
--- a/lustre/include/lustre_mdc.h
+++ b/lustre/include/lustre_mdc.h
@@ -1,6 +1,4 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
+/*
  * GPL HEADER START
  *
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
@@ -17,19 +15,18 @@
  *
  * You should have received a copy of the GNU General Public License
  * version 2 along with this program; If not, see
- * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
- *
- * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
- * CA 95054 USA or visit www.sun.com if you need additional information or
- * have any questions.
+ * http://www.gnu.org/licenses/gpl-2.0.html
  *
  * GPL HEADER END
  */
 /*
- * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
  * Use is subject to license terms.
  */
 /*
+ * Copyright (c) 2011, 2014, Intel Corporation.
+ */
+/*
  * This file is part of Lustre, http://www.lustre.org/
  * Lustre is a trademark of Sun Microsystems, Inc.
  *
@@ -42,27 +39,23 @@
 #ifndef _LUSTRE_MDC_H
 #define _LUSTRE_MDC_H
 
-#ifdef __KERNEL__
-# include <linux/fs.h>
-# include <linux/dcache.h>
-# ifdef CONFIG_FS_POSIX_ACL
-#  ifdef HAVE_XATTR_ACL
-#   include <linux/xattr_acl.h>
-#  endif /*HAVE_XATTR_ACL */
-#  ifdef HAVE_LINUX_POSIX_ACL_XATTR_H
-#   include <linux/posix_acl_xattr.h>
-#  endif /* HAVE_LINUX_POSIX_ACL_XATTR_H */
-# endif /* CONFIG_FS_POSIX_ACL */
-# ifndef HAVE_VFS_INTENT_PATCHES
-# include <linux/lustre_intent.h>
-# endif /* HAVE_VFS_INTENT_PATCHES */
-#endif /* __KERNEL__ */
+/** \defgroup mdc mdc
+ *
+ * @{
+ */
+
+#include <linux/fs.h>
+#include <linux/dcache.h>
+#ifdef CONFIG_FS_POSIX_ACL
+# include <linux/posix_acl_xattr.h>
+#endif /* CONFIG_FS_POSIX_ACL */
 #include <lustre_handles.h>
+#include <lustre_intent.h>
 #include <libcfs/libcfs.h>
-#include <lustre/lustre_idl.h>
+#include <obd_class.h>
+#include <uapi/linux/lustre/lustre_idl.h>
 #include <lustre_lib.h>
 #include <lustre_dlm.h>
-#include <lustre_log.h>
 #include <lustre_export.h>
 
 struct ptlrpc_client;
@@ -70,66 +63,177 @@ struct obd_export;
 struct ptlrpc_request;
 struct obd_device;
 
+/**
+ * Serializes in-flight MDT-modifying RPC requests to preserve idempotency.
+ *
+ * This mutex is used to implement execute-once semantics on the MDT.
+ * The MDT stores the last transaction ID and result for every client in
+ * its last_rcvd file. If the client doesn't get a reply, it can safely
+ * resend the request and the MDT will reconstruct the reply being aware
+ * that the request has already been executed. Without this lock,
+ * execution status of concurrent in-flight requests would be
+ * overwritten.
+ *
+ * This design limits the extent to which we can keep a full pipeline of
+ * in-flight requests from a single client.  This limitation could be
+ * overcome by allowing multiple slots per client in the last_rcvd file.
+ */
 struct mdc_rpc_lock {
-        cfs_semaphore_t  rpcl_sem;
-        struct lookup_intent *rpcl_it;
+	/** Lock protecting in-flight RPC concurrency. */
+	struct mutex		rpcl_mutex;
+	/** Intent associated with currently executing request. */
+	struct lookup_intent	*rpcl_it;
+	/** Used for MDS/RPC load testing purposes. */
+	int			rpcl_fakes;
 };
 
+#define MDC_FAKE_RPCL_IT ((void *)0x2c0012bfUL)
+
 static inline void mdc_init_rpc_lock(struct mdc_rpc_lock *lck)
 {
-        cfs_sema_init(&lck->rpcl_sem, 1);
+	mutex_init(&lck->rpcl_mutex);
         lck->rpcl_it = NULL;
 }
 
 static inline void mdc_get_rpc_lock(struct mdc_rpc_lock *lck,
-                                    struct lookup_intent *it)
+				    struct lookup_intent *it)
 {
-        ENTRY;
-        if (!it || (it->it_op != IT_GETATTR && it->it_op != IT_LOOKUP)) {
-                cfs_down(&lck->rpcl_sem);
-                LASSERT(lck->rpcl_it == NULL);
-                lck->rpcl_it = it;
-        }
+	ENTRY;
+
+	if (it != NULL && (it->it_op == IT_GETATTR || it->it_op == IT_LOOKUP ||
+			   it->it_op == IT_LAYOUT || it->it_op == IT_READDIR))
+		return;
+
+	/* This would normally block until the existing request finishes.
+	 * If fail_loc is set it will block until the regular request is
+	 * done, then set rpcl_it to MDC_FAKE_RPCL_IT.  Once that is set
+	 * it will only be cleared when all fake requests are finished.
+	 * Only when all fake requests are finished can normal requests
+	 * be sent, to ensure they are recoverable again. */
+ again:
+	mutex_lock(&lck->rpcl_mutex);
+
+	if (CFS_FAIL_CHECK_QUIET(OBD_FAIL_MDC_RPCS_SEM)) {
+		lck->rpcl_it = MDC_FAKE_RPCL_IT;
+		lck->rpcl_fakes++;
+		mutex_unlock(&lck->rpcl_mutex);
+		return;
+	}
+
+	/* This will only happen when the CFS_FAIL_CHECK() was
+	 * just turned off but there are still requests in progress.
+	 * Wait until they finish.  It doesn't need to be efficient
+	 * in this extremely rare case, just have low overhead in
+	 * the common case when it isn't true. */
+	while (unlikely(lck->rpcl_it == MDC_FAKE_RPCL_IT)) {
+		mutex_unlock(&lck->rpcl_mutex);
+		schedule_timeout(cfs_time_seconds(1) / 4);
+		goto again;
+	}
+
+	LASSERT(lck->rpcl_it == NULL);
+	lck->rpcl_it = it;
 }
 
 static inline void mdc_put_rpc_lock(struct mdc_rpc_lock *lck,
-                                    struct lookup_intent *it)
+				    struct lookup_intent *it)
 {
-        if (!it || (it->it_op != IT_GETATTR && it->it_op != IT_LOOKUP)) {
-                LASSERT(it == lck->rpcl_it);
-                lck->rpcl_it = NULL;
-                cfs_up(&lck->rpcl_sem);
-        }
-        EXIT;
+	if (it != NULL && (it->it_op == IT_GETATTR || it->it_op == IT_LOOKUP ||
+			   it->it_op == IT_LAYOUT || it->it_op == IT_READDIR))
+		goto out;
+
+	if (lck->rpcl_it == MDC_FAKE_RPCL_IT) { /* OBD_FAIL_MDC_RPCS_SEM */
+		mutex_lock(&lck->rpcl_mutex);
+
+		LASSERTF(lck->rpcl_fakes > 0, "%d\n", lck->rpcl_fakes);
+		lck->rpcl_fakes--;
+
+		if (lck->rpcl_fakes == 0)
+			lck->rpcl_it = NULL;
+
+	} else {
+		LASSERTF(it == lck->rpcl_it, "%p != %p\n", it, lck->rpcl_it);
+		lck->rpcl_it = NULL;
+	}
+
+	mutex_unlock(&lck->rpcl_mutex);
+ out:
+	EXIT;
 }
 
-static inline void mdc_update_max_ea_from_body(struct obd_export *exp,
-                                               struct mdt_body *body)
+static inline void mdc_get_mod_rpc_slot(struct ptlrpc_request *req,
+					struct lookup_intent *it)
 {
-        if (body->valid & OBD_MD_FLMODEASIZE) {
-                if (exp->exp_obd->u.cli.cl_max_mds_easize < body->max_mdsize)
-                        exp->exp_obd->u.cli.cl_max_mds_easize =
-                                                body->max_mdsize;
-                if (exp->exp_obd->u.cli.cl_max_mds_cookiesize <
-                                                body->max_cookiesize)
-                        exp->exp_obd->u.cli.cl_max_mds_cookiesize =
-                                                body->max_cookiesize;
-        }
+	struct client_obd *cli = &req->rq_import->imp_obd->u.cli;
+	__u32 opc;
+	__u16 tag;
+
+	opc = lustre_msg_get_opc(req->rq_reqmsg);
+	tag = obd_get_mod_rpc_slot(cli, opc, it);
+	lustre_msg_set_tag(req->rq_reqmsg, tag);
 }
 
+static inline void mdc_put_mod_rpc_slot(struct ptlrpc_request *req,
+					struct lookup_intent *it)
+{
+	struct client_obd *cli = &req->rq_import->imp_obd->u.cli;
+	__u32 opc;
+	__u16 tag;
+
+	opc = lustre_msg_get_opc(req->rq_reqmsg);
+	tag = lustre_msg_get_tag(req->rq_reqmsg);
+	obd_put_mod_rpc_slot(cli, opc, it, tag);
+}
+
+
+/**
+ * Update the maximum possible easize.
+ *
+ * This value is learned from ptlrpc replies sent by the MDT.  The
+ * default easize is initialized to the minimum value but allowed to
+ * grow up to a single page in size if required to handle the common
+ * case.
+ *
+ * \see client_obd::cl_default_mds_easize
+ *
+ * \param[in] exp	export for MDC device
+ * \param[in] body	body of ptlrpc reply from MDT
+ *
+ */
+static inline void mdc_update_max_ea_from_body(struct obd_export *exp,
+					       struct mdt_body *body)
+{
+	if (body->mbo_valid & OBD_MD_FLMODEASIZE) {
+		struct client_obd *cli = &exp->exp_obd->u.cli;
+		__u32 def_easize;
+
+		if (cli->cl_max_mds_easize < body->mbo_max_mdsize)
+			cli->cl_max_mds_easize = body->mbo_max_mdsize;
+
+		def_easize = min_t(__u32, body->mbo_max_mdsize,
+				   OBD_MAX_DEFAULT_EA_SIZE);
+		cli->cl_default_mds_easize = def_easize;
+	}
+}
 
-struct mdc_cache_waiter {
-        cfs_list_t              mcw_entry;
-        cfs_waitq_t             mcw_waitq;
-};
 
 /* mdc/mdc_locks.c */
-int it_disposition(struct lookup_intent *it, int flag);
-void it_clear_disposition(struct lookup_intent *it, int flag);
-void it_set_disposition(struct lookup_intent *it, int flag);
 int it_open_error(int phase, struct lookup_intent *it);
-#ifdef HAVE_SPLIT_SUPPORT
-int mdc_sendpage(struct obd_export *exp, const struct lu_fid *fid,
-                 const struct page *page, int offset);
-#endif
+
+static inline bool cl_is_lov_delay_create(unsigned int flags)
+{
+	return  (flags & O_LOV_DELAY_CREATE_1_8) != 0 ||
+		(flags & O_LOV_DELAY_CREATE_MASK) == O_LOV_DELAY_CREATE_MASK;
+}
+
+static inline void cl_lov_delay_create_clear(unsigned int *flags)
+{
+	if ((*flags & O_LOV_DELAY_CREATE_1_8) != 0)
+		*flags &= ~O_LOV_DELAY_CREATE_1_8;
+	if ((*flags & O_LOV_DELAY_CREATE_MASK) == O_LOV_DELAY_CREATE_MASK)
+		*flags &= ~O_LOV_DELAY_CREATE_MASK;
+}
+
+/** @} mdc */
+
 #endif