X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Finclude%2Flustre_mdc.h;h=7b09164da97d4c1feba1934f57cb51d76ed70767;hp=381a676f23897450be08a3ce2722aa0fb3e7238c;hb=617a53daff1768d88f694ae349214d5c6606d3cf;hpb=45c495867c903379ec9e9acf01d3d49caffa36e4 diff --git a/lustre/include/lustre_mdc.h b/lustre/include/lustre_mdc.h index 381a676..7b09164 100644 --- a/lustre/include/lustre_mdc.h +++ b/lustre/include/lustre_mdc.h @@ -1,6 +1,4 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * +/* * GPL HEADER START * * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. @@ -26,10 +24,13 @@ * GPL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. */ /* + * Copyright (c) 2011, 2014, Intel Corporation. + */ +/* * This file is part of Lustre, http://www.lustre.org/ * Lustre is a trademark of Sun Microsystems, Inc. * @@ -47,27 +48,18 @@ * @{ */ -#ifdef __KERNEL__ -# include -# include -# ifdef CONFIG_FS_POSIX_ACL -# ifdef HAVE_XATTR_ACL -# include -# endif /*HAVE_XATTR_ACL */ -# ifdef HAVE_LINUX_POSIX_ACL_XATTR_H -# include -# endif /* HAVE_LINUX_POSIX_ACL_XATTR_H */ -# endif /* CONFIG_FS_POSIX_ACL */ -# ifndef HAVE_VFS_INTENT_PATCHES -# include -# endif /* HAVE_VFS_INTENT_PATCHES */ -#endif /* __KERNEL__ */ +#include +#include +#ifdef CONFIG_FS_POSIX_ACL +# include +#endif /* CONFIG_FS_POSIX_ACL */ #include +#include #include +#include #include #include #include -#include #include struct ptlrpc_client; @@ -75,68 +67,160 @@ struct obd_export; struct ptlrpc_request; struct obd_device; +/** + * Serializes in-flight MDT-modifying RPC requests to preserve idempotency. + * + * This mutex is used to implement execute-once semantics on the MDT. + * The MDT stores the last transaction ID and result for every client in + * its last_rcvd file. If the client doesn't get a reply, it can safely + * resend the request and the MDT will reconstruct the reply being aware + * that the request has already been executed. Without this lock, + * execution status of concurrent in-flight requests would be + * overwritten. + * + * This design limits the extent to which we can keep a full pipeline of + * in-flight requests from a single client. This limitation could be + * overcome by allowing multiple slots per client in the last_rcvd file. + */ struct mdc_rpc_lock { - cfs_semaphore_t rpcl_sem; - struct lookup_intent *rpcl_it; + /** Lock protecting in-flight RPC concurrency. */ + struct mutex rpcl_mutex; + /** Intent associated with currently executing request. */ + struct lookup_intent *rpcl_it; + /** Used for MDS/RPC load testing purposes. */ + int rpcl_fakes; }; +#define MDC_FAKE_RPCL_IT ((void *)0x2c0012bfUL) + static inline void mdc_init_rpc_lock(struct mdc_rpc_lock *lck) { - cfs_sema_init(&lck->rpcl_sem, 1); + mutex_init(&lck->rpcl_mutex); lck->rpcl_it = NULL; } static inline void mdc_get_rpc_lock(struct mdc_rpc_lock *lck, - struct lookup_intent *it) + struct lookup_intent *it) { - ENTRY; - if (!it || (it->it_op != IT_GETATTR && it->it_op != IT_LOOKUP)) { - cfs_down(&lck->rpcl_sem); - LASSERT(lck->rpcl_it == NULL); - lck->rpcl_it = it; - } + ENTRY; + + if (it != NULL && (it->it_op == IT_GETATTR || it->it_op == IT_LOOKUP || + it->it_op == IT_LAYOUT || it->it_op == IT_READDIR)) + return; + + /* This would normally block until the existing request finishes. + * If fail_loc is set it will block until the regular request is + * done, then set rpcl_it to MDC_FAKE_RPCL_IT. Once that is set + * it will only be cleared when all fake requests are finished. + * Only when all fake requests are finished can normal requests + * be sent, to ensure they are recoverable again. */ + again: + mutex_lock(&lck->rpcl_mutex); + + if (CFS_FAIL_CHECK_QUIET(OBD_FAIL_MDC_RPCS_SEM)) { + lck->rpcl_it = MDC_FAKE_RPCL_IT; + lck->rpcl_fakes++; + mutex_unlock(&lck->rpcl_mutex); + return; + } + + /* This will only happen when the CFS_FAIL_CHECK() was + * just turned off but there are still requests in progress. + * Wait until they finish. It doesn't need to be efficient + * in this extremely rare case, just have low overhead in + * the common case when it isn't true. */ + while (unlikely(lck->rpcl_it == MDC_FAKE_RPCL_IT)) { + mutex_unlock(&lck->rpcl_mutex); + schedule_timeout(cfs_time_seconds(1) / 4); + goto again; + } + + LASSERT(lck->rpcl_it == NULL); + lck->rpcl_it = it; } static inline void mdc_put_rpc_lock(struct mdc_rpc_lock *lck, - struct lookup_intent *it) + struct lookup_intent *it) { - if (!it || (it->it_op != IT_GETATTR && it->it_op != IT_LOOKUP)) { - LASSERT(it == lck->rpcl_it); - lck->rpcl_it = NULL; - cfs_up(&lck->rpcl_sem); - } - EXIT; + if (it != NULL && (it->it_op == IT_GETATTR || it->it_op == IT_LOOKUP || + it->it_op == IT_LAYOUT || it->it_op == IT_READDIR)) + goto out; + + if (lck->rpcl_it == MDC_FAKE_RPCL_IT) { /* OBD_FAIL_MDC_RPCS_SEM */ + mutex_lock(&lck->rpcl_mutex); + + LASSERTF(lck->rpcl_fakes > 0, "%d\n", lck->rpcl_fakes); + lck->rpcl_fakes--; + + if (lck->rpcl_fakes == 0) + lck->rpcl_it = NULL; + + } else { + LASSERTF(it == lck->rpcl_it, "%p != %p\n", it, lck->rpcl_it); + lck->rpcl_it = NULL; + } + + mutex_unlock(&lck->rpcl_mutex); + out: + EXIT; } +/** + * Update the maximum possible easize and cookiesize. + * + * The values are learned from ptlrpc replies sent by the MDT. The + * default easize and cookiesize is initialized to the minimum value but + * allowed to grow up to a single page in size if required to handle the + * common case. + * + * \see client_obd::cl_default_mds_easize and + * client_obd::cl_default_mds_cookiesize + * + * \param[in] exp export for MDC device + * \param[in] body body of ptlrpc reply from MDT + * + */ static inline void mdc_update_max_ea_from_body(struct obd_export *exp, - struct mdt_body *body) + struct mdt_body *body) { - if (body->valid & OBD_MD_FLMODEASIZE) { - if (exp->exp_obd->u.cli.cl_max_mds_easize < body->max_mdsize) - exp->exp_obd->u.cli.cl_max_mds_easize = - body->max_mdsize; - if (exp->exp_obd->u.cli.cl_max_mds_cookiesize < - body->max_cookiesize) - exp->exp_obd->u.cli.cl_max_mds_cookiesize = - body->max_cookiesize; - } -} + if (body->mbo_valid & OBD_MD_FLMODEASIZE) { + struct client_obd *cli = &exp->exp_obd->u.cli; + __u32 def_easize; + __u32 def_cookiesize; + if (cli->cl_max_mds_easize < body->mbo_max_mdsize) + cli->cl_max_mds_easize = body->mbo_max_mdsize; + + def_easize = min_t(__u32, body->mbo_max_mdsize, + OBD_MAX_DEFAULT_EA_SIZE); + cli->cl_default_mds_easize = def_easize; + + if (cli->cl_max_mds_cookiesize < body->mbo_max_cookiesize) + cli->cl_max_mds_cookiesize = body->mbo_max_cookiesize; + + def_cookiesize = min_t(__u32, body->mbo_max_cookiesize, + OBD_MAX_DEFAULT_COOKIE_SIZE); + cli->cl_default_mds_cookiesize = def_cookiesize; + } +} -struct mdc_cache_waiter { - cfs_list_t mcw_entry; - cfs_waitq_t mcw_waitq; -}; /* mdc/mdc_locks.c */ -int it_disposition(struct lookup_intent *it, int flag); -void it_clear_disposition(struct lookup_intent *it, int flag); -void it_set_disposition(struct lookup_intent *it, int flag); int it_open_error(int phase, struct lookup_intent *it); -#ifdef HAVE_SPLIT_SUPPORT -int mdc_sendpage(struct obd_export *exp, const struct lu_fid *fid, - const struct page *page, int offset); -#endif + +static inline bool cl_is_lov_delay_create(unsigned int flags) +{ + return (flags & O_LOV_DELAY_CREATE_1_8) != 0 || + (flags & O_LOV_DELAY_CREATE_MASK) == O_LOV_DELAY_CREATE_MASK; +} + +static inline void cl_lov_delay_create_clear(unsigned int *flags) +{ + if ((*flags & O_LOV_DELAY_CREATE_1_8) != 0) + *flags &= ~O_LOV_DELAY_CREATE_1_8; + if ((*flags & O_LOV_DELAY_CREATE_MASK) == O_LOV_DELAY_CREATE_MASK) + *flags &= ~O_LOV_DELAY_CREATE_MASK; +} /** @} mdc */