/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
* vim:expandtab:shiftwidth=8:tabstop=8:
*
- * lustre/fid/fid_handler.c
- * Lustre Sequence Manager
+ * GPL HEADER START
*
- * Copyright (c) 2006 Cluster File Systems, Inc.
- * Author: Yury Umanets <umka@clusterfs.com>
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
- * This file is part of the Lustre file system, http://www.lustre.org
- * Lustre is a trademark of Cluster File Systems, Inc.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
*
- * You may have signed or agreed to another license before downloading
- * this software. If so, you are bound by the terms and conditions
- * of that agreement, and the following does not apply to you. See the
- * LICENSE file included with this distribution for more information.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
*
- * If you did not agree to a different license, then this copy of Lustre
- * is open source software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
*
- * In either case, Lustre is distributed in the hope that it will be
- * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * license text for more details.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fid/fid_handler.c
+ *
+ * Lustre Sequence Manager
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
*/
#ifndef EXPORT_SYMTAB
#include <obd.h>
#include <obd_class.h>
+#include <lu_target.h>
#include <dt_object.h>
#include <md_object.h>
#include <obd_support.h>
* Ask client for new range, assign that range to ->seq_space and write
* seq state to backing store should be atomic.
*/
- down(&seq->lss_sem);
+ cfs_down(&seq->lss_sem);
if (cli == NULL) {
CDEBUG(D_INFO, "%s: Detached sequence client %s\n",
seq->lss_name, cli->lcs_name);
seq->lss_cli = cli;
+ cli->lcs_space.lsr_index = seq->lss_site->ms_node_id;
EXIT;
out_up:
- up(&seq->lss_sem);
+ cfs_up(&seq->lss_sem);
return rc;
}
EXPORT_SYMBOL(seq_server_set_cli);
-
/*
+ * allocate \a w units of sequence from range \a from.
+ */
+static inline void range_alloc(struct lu_seq_range *to,
+ struct lu_seq_range *from,
+ __u64 width)
+{
+ width = min(range_space(from), width);
+ to->lsr_start = from->lsr_start;
+ to->lsr_end = from->lsr_start + width;
+ from->lsr_start += width;
+}
+
+/**
* On controller node, allocate new super sequence for regular sequence server.
+ * As the super sequence controller, this node is supposed to maintain the
+ * fld and update the index.
+ * \a out range always has the correct mds node number of the requester.
*/
+
static int __seq_server_alloc_super(struct lu_server_seq *seq,
- struct lu_range *in,
- struct lu_range *out,
+ struct lu_seq_range *out,
const struct lu_env *env)
{
- struct lu_range *space = &seq->lss_space;
+ struct lu_seq_range *space = &seq->lss_space;
int rc;
ENTRY;
LASSERT(range_is_sane(space));
- if (in != NULL) {
- CDEBUG(D_INFO, "%s: Input seq range: "
- DRANGE"\n", seq->lss_name, PRANGE(in));
-
- if (in->lr_end > space->lr_start)
- space->lr_start = in->lr_end;
- *out = *in;
-
- CDEBUG(D_INFO, "%s: Recovered space: "DRANGE"\n",
- seq->lss_name, PRANGE(space));
+ if (range_is_exhausted(space)) {
+ CERROR("%s: Sequences space is exhausted\n",
+ seq->lss_name);
+ RETURN(-ENOSPC);
} else {
- if (range_space(space) < seq->lss_width) {
- CWARN("%s: Sequences space to be exhausted soon. "
- "Only "LPU64" sequences left\n", seq->lss_name,
- range_space(space));
- *out = *space;
- space->lr_start = space->lr_end;
- } else if (range_is_exhausted(space)) {
- CERROR("%s: Sequences space is exhausted\n",
- seq->lss_name);
- RETURN(-ENOSPC);
- } else {
- range_alloc(out, space, seq->lss_width);
- }
+ range_alloc(out, space, seq->lss_width);
}
- rc = seq_store_write(seq, env);
- if (rc) {
- CERROR("%s: Can't write space data, rc %d\n",
- seq->lss_name, rc);
- RETURN(rc);
- }
+ rc = seq_store_update(env, seq, out, 1 /* sync */);
- CDEBUG(D_INFO, "%s: Allocated super-sequence "
- DRANGE"\n", seq->lss_name, PRANGE(out));
+ CDEBUG(D_INFO, "%s: super-sequence allocation rc = %d "
+ DRANGE"\n", seq->lss_name, rc, PRANGE(out));
RETURN(rc);
}
int seq_server_alloc_super(struct lu_server_seq *seq,
- struct lu_range *in,
- struct lu_range *out,
+ struct lu_seq_range *out,
const struct lu_env *env)
{
int rc;
ENTRY;
- down(&seq->lss_sem);
- rc = __seq_server_alloc_super(seq, in, out, env);
- up(&seq->lss_sem);
+ cfs_down(&seq->lss_sem);
+ rc = __seq_server_alloc_super(seq, out, env);
+ cfs_up(&seq->lss_sem);
+
+ RETURN(rc);
+}
+
+static int __seq_set_init(const struct lu_env *env,
+ struct lu_server_seq *seq)
+{
+ struct lu_seq_range *space = &seq->lss_space;
+ int rc;
+
+ range_alloc(&seq->lss_lowater_set, space, seq->lss_set_width);
+ range_alloc(&seq->lss_hiwater_set, space, seq->lss_set_width);
+
+ rc = seq_store_update(env, seq, NULL, 1);
+ seq->lss_set_transno = 0;
+
+ return rc;
+}
+
+/*
+ * This function implements new seq allocation algorithm using async
+ * updates to seq file on disk. ref bug 18857 for details.
+ * there are three variables to keep track of this process:
+ *
+ * lss_space; - available lss_space
+ * lss_lowater_set; - lu_seq_range for all seqs before barrier, i.e. safe to use
+ * lss_hiwater_set; - lu_seq_range after barrier, i.e. allocated but may be
+ * not yet committed
+ *
+ * when lss_lowater_set reaches the end it is replaced with hiwater one and
+ * a write operation is initiated to allocate new hiwater range.
+ * if the last seq write operation is still not committed, the current
+ * operation is flagged as a sync write op.
+ */
+static int range_alloc_set(const struct lu_env *env,
+ struct lu_seq_range *out,
+ struct lu_server_seq *seq)
+{
+ struct lu_seq_range *space = &seq->lss_space;
+ struct lu_seq_range *loset = &seq->lss_lowater_set;
+ struct lu_seq_range *hiset = &seq->lss_hiwater_set;
+ int rc = 0;
+
+ if (range_is_zero(loset))
+ __seq_set_init(env, seq);
+
+ if (OBD_FAIL_CHECK(OBD_FAIL_SEQ_ALLOC)) /* exhaust set */
+ loset->lsr_start = loset->lsr_end;
+
+ if (range_is_exhausted(loset)) {
+ /* reached high water mark. */
+ struct lu_device *dev = seq->lss_site->ms_lu.ls_top_dev;
+ struct lu_target *tg = dev->ld_obd->u.obt.obt_lut;
+ int obd_num_clients = dev->ld_obd->obd_num_exports;
+ __u64 set_sz;
+ int sync = 0;
+
+ /* calculate new seq width based on number of clients */
+ set_sz = max(seq->lss_set_width,
+ obd_num_clients * seq->lss_width);
+ set_sz = min(range_space(space), set_sz);
+
+ /* Switch to hiwater range now */
+ loset = hiset;
+ /* allocate new hiwater range */
+ range_alloc(hiset, space, set_sz);
+
+ if (seq->lss_set_transno > dev->ld_obd->obd_last_committed)
+ sync = 1;
+
+ /* update ondisk seq with new *space */
+ rc = seq_store_update(env, seq, NULL, sync);
+
+ /* set new hiwater transno */
+ cfs_spin_lock(&tg->lut_translock);
+ seq->lss_set_transno = tg->lut_last_transno;
+ cfs_spin_unlock(&tg->lut_translock);
+ }
+
+ LASSERTF(!range_is_exhausted(loset) || range_is_sane(loset),
+ DRANGE"\n", PRANGE(loset));
+
+ if (rc == 0)
+ range_alloc(out, loset, seq->lss_width);
RETURN(rc);
}
static int __seq_server_alloc_meta(struct lu_server_seq *seq,
- struct lu_range *in,
- struct lu_range *out,
+ struct lu_seq_range *out,
const struct lu_env *env)
{
- struct lu_range *space = &seq->lss_space;
+ struct lu_seq_range *space = &seq->lss_space;
int rc = 0;
+
ENTRY;
LASSERT(range_is_sane(space));
- /*
- * This is recovery case. Adjust super range if input range looks like
- * it is allocated from new super.
- */
- if (in != NULL) {
- CDEBUG(D_INFO, "%s: Input seq range: "
- DRANGE"\n", seq->lss_name, PRANGE(in));
-
- if (range_is_exhausted(space)) {
- /*
- * Server cannot send empty range to client, this is why
- * we check here that range from client is "newer" than
- * exhausted super.
- */
- LASSERT(in->lr_end > space->lr_start);
-
- /*
- * Start is set to end of last allocated, because it
- * *is* already allocated so we take that into account
- * and do not use for other allocations.
- */
- space->lr_start = in->lr_end;
-
- /*
- * End is set to in->lr_start + super sequence
- * allocation unit. That is because in->lr_start is
- * first seq in new allocated range from controller
- * before failure.
- */
- space->lr_end = in->lr_start + LUSTRE_SEQ_SUPER_WIDTH;
-
- if (!seq->lss_cli) {
- CERROR("%s: No sequence controller "
- "is attached.\n", seq->lss_name);
- RETURN(-ENODEV);
- }
-
- /*
- * Let controller know that this is recovery and last
- * obtained range from it was @space.
- */
- rc = seq_client_replay_super(seq->lss_cli, space, env);
- if (rc) {
- CERROR("%s: Can't replay super-sequence, "
- "rc %d\n", seq->lss_name, rc);
- RETURN(rc);
- }
- } else {
- /*
- * Update super start by end from client's range. Super
- * end should not be changed if range was not exhausted.
- */
- if (in->lr_end > space->lr_start)
- space->lr_start = in->lr_end;
+ /* Check if available space ends and allocate new super seq */
+ if (range_is_exhausted(space)) {
+ if (!seq->lss_cli) {
+ CERROR("%s: No sequence controller is attached.\n",
+ seq->lss_name);
+ RETURN(-ENODEV);
}
- *out = *in;
-
- CDEBUG(D_INFO, "%s: Recovered space: "DRANGE"\n",
- seq->lss_name, PRANGE(space));
- } else {
- /*
- * XXX: Avoid cascading RPCs using kind of async preallocation
- * when meta-sequence is close to exhausting.
- */
- if (range_is_exhausted(space)) {
- if (!seq->lss_cli) {
- CERROR("%s: No sequence controller "
- "is attached.\n", seq->lss_name);
- RETURN(-ENODEV);
- }
-
- rc = seq_client_alloc_super(seq->lss_cli, env);
- if (rc) {
- CERROR("%s: Can't allocate super-sequence, "
- "rc %d\n", seq->lss_name, rc);
- RETURN(rc);
- }
-
- /* Saving new range to allocation space. */
- *space = seq->lss_cli->lcs_space;
- LASSERT(range_is_sane(space));
+ rc = seq_client_alloc_super(seq->lss_cli, env);
+ if (rc) {
+ CERROR("%s: Can't allocate super-sequence, rc %d\n",
+ seq->lss_name, rc);
+ RETURN(rc);
}
- range_alloc(out, space, seq->lss_width);
- }
-
- rc = seq_store_write(seq, env);
- if (rc) {
- CERROR("%s: Can't write space data, rc %d\n",
- seq->lss_name, rc);
+ /* Saving new range to allocation space. */
+ *space = seq->lss_cli->lcs_space;
+ LASSERT(range_is_sane(space));
}
+ rc = range_alloc_set(env, out, seq);
if (rc == 0) {
CDEBUG(D_INFO, "%s: Allocated meta-sequence "
DRANGE"\n", seq->lss_name, PRANGE(out));
}
int seq_server_alloc_meta(struct lu_server_seq *seq,
- struct lu_range *in,
- struct lu_range *out,
+ struct lu_seq_range *out,
const struct lu_env *env)
{
int rc;
ENTRY;
- down(&seq->lss_sem);
- rc = __seq_server_alloc_meta(seq, in, out, env);
- up(&seq->lss_sem);
+ cfs_down(&seq->lss_sem);
+ rc = __seq_server_alloc_meta(seq, out, env);
+ cfs_up(&seq->lss_sem);
RETURN(rc);
}
static int seq_server_handle(struct lu_site *site,
const struct lu_env *env,
- __u32 opc, struct lu_range *in,
- struct lu_range *out)
+ __u32 opc, struct lu_seq_range *out)
{
int rc;
+ struct md_site *mite;
ENTRY;
+ mite = lu_site2md(site);
switch (opc) {
case SEQ_ALLOC_META:
- if (!site->ls_server_seq) {
+ if (!mite->ms_server_seq) {
CERROR("Sequence server is not "
"initialized\n");
RETURN(-EINVAL);
}
- rc = seq_server_alloc_meta(site->ls_server_seq,
- in, out, env);
+ rc = seq_server_alloc_meta(mite->ms_server_seq, out, env);
break;
case SEQ_ALLOC_SUPER:
- if (!site->ls_control_seq) {
+ if (!mite->ms_control_seq) {
CERROR("Sequence controller is not "
"initialized\n");
RETURN(-EINVAL);
}
- rc = seq_server_alloc_super(site->ls_control_seq,
- in, out, env);
+ rc = seq_server_alloc_super(mite->ms_control_seq, out, env);
break;
default:
rc = -EINVAL;
const struct lu_env *env,
struct seq_thread_info *info)
{
- struct lu_range *out, *in = NULL;
+ struct lu_seq_range *out, *tmp;
struct lu_site *site;
int rc = -EPROTO;
__u32 *opc;
ENTRY;
+ LASSERT(!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY));
site = req->rq_export->exp_obd->obd_lu_dev->ld_site;
LASSERT(site != NULL);
-
- rc = req_capsule_pack(&info->sti_pill);
+
+ rc = req_capsule_server_pack(info->sti_pill);
if (rc)
RETURN(err_serious(rc));
- opc = req_capsule_client_get(&info->sti_pill,
- &RMF_SEQ_OPC);
+ opc = req_capsule_client_get(info->sti_pill, &RMF_SEQ_OPC);
if (opc != NULL) {
- out = req_capsule_server_get(&info->sti_pill,
- &RMF_SEQ_RANGE);
+ out = req_capsule_server_get(info->sti_pill, &RMF_SEQ_RANGE);
if (out == NULL)
RETURN(err_serious(-EPROTO));
- if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) {
- in = req_capsule_client_get(&info->sti_pill,
- &RMF_SEQ_RANGE);
+ tmp = req_capsule_client_get(info->sti_pill, &RMF_SEQ_RANGE);
- LASSERT(!range_is_zero(in) && range_is_sane(in));
- }
+ /* the seq client passed the mdt id; we need to pass that back using
+ * the out range parameter */
- rc = seq_server_handle(site, env, *opc, in, out);
+ out->lsr_index = tmp->lsr_index;
+ out->lsr_flags = tmp->lsr_flags;
+ rc = seq_server_handle(site, env, *opc, out);
} else
rc = err_serious(-EPROTO);
RETURN(rc);
}
-static void *seq_key_init(const struct lu_context *ctx,
- struct lu_context_key *key)
-{
- struct seq_thread_info *info;
-
- /*
- * check that no high order allocations are incurred.
- */
- CLASSERT(CFS_PAGE_SIZE >= sizeof *info);
- OBD_ALLOC_PTR(info);
- if (info == NULL)
- info = ERR_PTR(-ENOMEM);
- return info;
-}
-
-static void seq_key_fini(const struct lu_context *ctx,
- struct lu_context_key *key, void *data)
-{
- struct seq_thread_info *info = data;
- OBD_FREE_PTR(info);
-}
+/* context key constructor/destructor: seq_key_init, seq_key_fini */
+LU_KEY_INIT_FINI(seq, struct seq_thread_info);
-struct lu_context_key seq_thread_key = {
- .lct_tags = LCT_MD_THREAD,
- .lct_init = seq_key_init,
- .lct_fini = seq_key_fini
-};
+/* context key: seq_thread_key */
+LU_CONTEXT_KEY_DEFINE(seq, LCT_MD_THREAD);
static void seq_thread_info_init(struct ptlrpc_request *req,
struct seq_thread_info *info)
{
- int i;
-
- /* Mark rep buffer as req-layout stuff expects */
- for (i = 0; i < ARRAY_SIZE(info->sti_rep_buf_size); i++)
- info->sti_rep_buf_size[i] = -1;
-
+ info->sti_pill = &req->rq_pill;
/* Init request capsule */
- req_capsule_init(&info->sti_pill, req, RCL_SERVER,
- info->sti_rep_buf_size);
-
- req_capsule_set(&info->sti_pill, &RQF_SEQ_QUERY);
+ req_capsule_init(info->sti_pill, req, RCL_SERVER);
+ req_capsule_set(info->sti_pill, &RQF_SEQ_QUERY);
}
static void seq_thread_info_fini(struct seq_thread_info *info)
{
- req_capsule_fini(&info->sti_pill);
+ req_capsule_fini(info->sti_pill);
}
static int seq_handle(struct ptlrpc_request *req)
seq_thread_info_init(req, info);
rc = seq_req_handle(req, env, info);
+ /* XXX: we don't need replay, but the MDT assigns a transno in any case;
+ * remove it manually before replying */
+ lustre_msg_set_transno(req->rq_repmsg, 0);
seq_thread_info_fini(info);
return rc;
*/
int seq_query(struct com_thread_info *info)
{
- return seq_handle(info->cti_pill.rc_req);
+ return seq_handle(info->cti_pill->rc_req);
}
EXPORT_SYMBOL(seq_query);
}
#endif
+
int seq_server_init(struct lu_server_seq *seq,
struct dt_device *dev,
const char *prefix,
enum lu_mgr_type type,
+ struct md_site *ms,
const struct lu_env *env)
{
int rc, is_srv = (type == LUSTRE_SEQ_SERVER);
ENTRY;
- LASSERT(dev != NULL);
+ LASSERT(dev != NULL);
LASSERT(prefix != NULL);
seq->lss_cli = NULL;
seq->lss_type = type;
- range_zero(&seq->lss_space);
- sema_init(&seq->lss_sem, 1);
+ seq->lss_site = ms;
+ range_init(&seq->lss_space);
+
+ range_init(&seq->lss_lowater_set);
+ range_init(&seq->lss_hiwater_set);
+ seq->lss_set_width = LUSTRE_SEQ_BATCH_WIDTH;
+
+ cfs_sema_init(&seq->lss_sem, 1);
seq->lss_width = is_srv ?
LUSTRE_SEQ_META_WIDTH : LUSTRE_SEQ_SUPER_WIDTH;
rc = seq_store_init(seq, env, dev);
if (rc)
GOTO(out, rc);
-
/* Request backing store for saved sequence info. */
rc = seq_store_read(seq, env);
if (rc == -ENODATA) {
LUSTRE_SEQ_ZERO_RANGE:
LUSTRE_SEQ_SPACE_RANGE;
+ seq->lss_space.lsr_index = ms->ms_node_id;
CDEBUG(D_INFO, "%s: No data found "
"on store. Initialize space\n",
seq->lss_name);
- /* Save default controller value to store. */
- rc = seq_store_write(seq, env);
+ rc = seq_store_update(env, seq, NULL, 0);
if (rc) {
CERROR("%s: Can't write space data, "
"rc %d\n", seq->lss_name, rc);
}
} else if (rc) {
- CERROR("%s: Can't read space data, rc %d\n",
- seq->lss_name, rc);
- GOTO(out, rc);
- }
+ CERROR("%s: Can't read space data, rc %d\n",
+ seq->lss_name, rc);
+ GOTO(out, rc);
+ }
if (is_srv) {
LASSERT(range_is_sane(&seq->lss_space));
rc = seq_server_proc_init(seq);
if (rc)
- GOTO(out, rc);
+ GOTO(out, rc);
- EXIT;
+ EXIT;
out:
- if (rc)
- seq_server_fini(seq, env);
- return rc;
+ if (rc)
+ seq_server_fini(seq, env);
+ return rc;
}
EXPORT_SYMBOL(seq_server_init);
cfs_proc_dir_entry_t *seq_type_proc_dir = NULL;
+static struct lu_local_obj_desc llod_seq_srv = {
+ .llod_name = LUSTRE_SEQ_SRV_NAME,
+ .llod_oid = FID_SEQ_SRV_OID,
+ .llod_is_index = 0,
+};
+
+static struct lu_local_obj_desc llod_seq_ctl = {
+ .llod_name = LUSTRE_SEQ_CTL_NAME,
+ .llod_oid = FID_SEQ_CTL_OID,
+ .llod_is_index = 0,
+};
+
static int __init fid_mod_init(void)
{
seq_type_proc_dir = lprocfs_register(LUSTRE_SEQ_NAME,
if (IS_ERR(seq_type_proc_dir))
return PTR_ERR(seq_type_proc_dir);
+ llo_local_obj_register(&llod_seq_srv);
+ llo_local_obj_register(&llod_seq_ctl);
+
LU_CONTEXT_KEY_INIT(&seq_thread_key);
lu_context_key_register(&seq_thread_key);
return 0;
static void __exit fid_mod_exit(void)
{
+ llo_local_obj_unregister(&llod_seq_srv);
+ llo_local_obj_unregister(&llod_seq_ctl);
+
lu_context_key_degister(&seq_thread_key);
if (seq_type_proc_dir != NULL && !IS_ERR(seq_type_proc_dir)) {
lprocfs_remove(&seq_type_proc_dir);
}
}
-MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
MODULE_DESCRIPTION("Lustre FID Module");
MODULE_LICENSE("GPL");