-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
+/*
+ * GPL HEADER START
*
- * lustre/cmm/cmm_split.c
- * Lustre splitting dir
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
- * Copyright (c) 2006 Cluster File Systems, Inc.
- * Author: Alex Thomas <alex@clusterfs.com>
- * Wang Di <wangdi@clusterfs.com>
- * Yury Umanets <umka@clusterfs.com>
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
*
- * This file is part of the Lustre file system, http://www.lustre.org
- * Lustre is a trademark of Cluster File Systems, Inc.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
*
- * You may have signed or agreed to another license before downloading
- * this software. If so, you are bound by the terms and conditions
- * of that agreement, and the following does not apply to you. See the
- * LICENSE file included with this distribution for more information.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
*
- * If you did not agree to a different license, then this copy of Lustre
- * is open source software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
*
- * In either case, Lustre is distributed in the hope that it will be
- * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * license text for more details.
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/cmm/cmm_split.c
+ *
+ * Lustre splitting dir
+ *
+ * Author: Alex Thomas <alex@clusterfs.com>
+ * Author: Wang Di <wangdi@clusterfs.com>
+ * Author: Yury Umanets <umka@clusterfs.com>
*/
-
-#ifndef EXPORT_SYMTAB
-# define EXPORT_SYMTAB
-#endif
#define DEBUG_SUBSYSTEM S_MDS
#include "cmm_internal.h"
#include "mdc_internal.h"
+/**
+ * \addtogroup split
+ * @{
+ */
enum {
CMM_SPLIT_SIZE = 128 * 1024
};
-/*
- * This function checks if passed @name come to correct server (local MDT). If
- * not - return -ERESTART and let client know that dir was split and client
- * needs to chose correct stripe.
+/**
+ * This function checks whether the passed \a name came to the correct server
+ * (local MDT).
+ *
+ * \param mp Parent directory
+ * \param name Name to lookup
+ * \retval -ERESTART Let client know that dir was split and client needs to
+ * choose the correct stripe.
*/
int cmm_split_check(const struct lu_env *env, struct md_object *mp,
const char *name)
if (ma->ma_lmv->mea_count != 0) {
int idx;
- /*
- * Get stripe by name to check the name belongs to master dir,
- * otherwise return the -ERESTART
+ /**
+ * This gets stripe by name to check the name belongs to master
+ * dir, otherwise return the -ERESTART
*/
idx = mea_name2idx(ma->ma_lmv, name, strlen(name));
- /*
- * Check if name came to correct MDT server. We suppose that if
- * client does not know about split, it sends create operation
- * to master MDT. And this is master job to say it that dir got
- * split and client should orward request to correct MDT. This
+ /**
+ * When client does not know about split, it sends create() to
+ * the master MDT and master replies back if directory is split.
+ * So client should forward request to correct MDT. This
* is why we check here if stripe zero or not. Zero stripe means
* master stripe. If stripe calculated from name is not zero -
* return -ERESTART.
return rc;
}
-/*
- * Return preferable access mode to caller taking into account possible split
- * and the fact of existing not splittable dirs in principle.
+/**
+ * Return preferable access mode to the caller taking into account the split
+ * case and the existence of non-splittable dirs.
*/
int cmm_split_access(const struct lu_env *env, struct md_object *mo,
mdl_mode_t lm)
RETURN(MDL_MINMODE);
}
-/* Check if split is expected for current thread. */
+/**
+ * Check if split is expected for current thread.
+ *
+ * \param mo Directory to split.
+ * \param ma md attributes.
+ * \param split Flag to save split information.
+ */
int cmm_split_expect(const struct lu_env *env, struct md_object *mo,
struct md_attr *ma, int *split)
{
struct cmm_device *d,
const struct lu_fid *f)
{
- struct lu_object *o;
- struct cmm_object *m;
- ENTRY;
-
- o = lu_object_find(env, d->cmm_md_dev.md_lu_dev.ld_site, f);
- if (IS_ERR(o))
- m = (struct cmm_object *)o;
- else
- m = lu2cmm_obj(lu_object_locate(o->lo_header,
- d->cmm_md_dev.md_lu_dev.ld_type));
- RETURN(m);
+ return md2cmm_obj(md_object_find_slice(env, &d->cmm_md_dev, fid));
}
static inline void cmm_object_put(const struct lu_env *env,
lu_object_put(env, &o->cmo_obj.mo_lu);
}
-/*
- * Allocate new on passed @mc for slave object which is going to create there
- * soon.
+/**
+ * Allocate new FID on passed \a mc for slave object which is going to
+ * be created there soon.
*/
static int cmm_split_fid_alloc(const struct lu_env *env,
struct cmm_device *cmm,
LASSERT(cmm != NULL && mc != NULL && fid != NULL);
- down(&mc->mc_fid_sem);
+ cfs_down(&mc->mc_fid_sem);
- /* Alloc new fid on @mc. */
+ /* Alloc new fid on \a mc. */
rc = obd_fid_alloc(mc->mc_desc.cl_exp, fid, NULL);
- if (rc > 0) {
- /* Setup FLD for new sequenceif needed. */
- rc = fld_client_create(cmm->cmm_fld, fid_seq(fid),
- mc->mc_num, env);
- if (rc)
- CERROR("Can't create fld entry, rc %d\n", rc);
- }
- up(&mc->mc_fid_sem);
+ if (rc > 0)
+ rc = 0;
+ cfs_up(&mc->mc_fid_sem);
RETURN(rc);
}
-/* Allocate new slave object on passed @mc */
+/**
+ * Allocate new slave object on passed \a mc.
+ */
static int cmm_split_slave_create(const struct lu_env *env,
struct cmm_device *cmm,
struct mdc_device *mc,
RETURN(rc);
}
-/*
- * Create so many slaves as number of stripes. This is called in split time
- * before sending pages to slaves.
+/**
+ * Create as many slaves as there are stripes.
+ * This is called at split time before sending pages to slaves.
*/
static int cmm_split_slaves_create(const struct lu_env *env,
struct md_object *mo,
slave_lmv->mea_magic = MEA_MAGIC_HASH_SEGMENT;
slave_lmv->mea_count = 0;
- list_for_each_entry_safe(mc, tmp, &cmm->cmm_targets, mc_linkage) {
+ cfs_list_for_each_entry_safe(mc, tmp, &cmm->cmm_targets, mc_linkage) {
rc = cmm_split_slave_create(env, cmm, mc, &lmv->mea_ids[i],
ma, slave_lmv, sizeof(*slave_lmv));
if (rc)
GOTO(cleanup, rc);
i++;
}
-
- ma->ma_valid |= MA_LMV;
EXIT;
cleanup:
return rc;
return 0;
}
+/**
+ * Convert string to the lu_name structure.
+ */
static inline struct lu_name *cmm_name(const struct lu_env *env,
char *name, int buflen)
{
cmi = cmm_env_info(env);
lname = &cmi->cti_name;
lname->ln_name = name;
- /* NOT count the terminating '\0' of name for length */
+ /* do NOT count the terminating '\0' of name for length */
lname->ln_namelen = buflen - 1;
return lname;
}
-/*
- * Remove one entry from local MDT. Do not corrupt byte order in page, it will
- * be sent to remote MDT.
+/**
+ * Helper for cmm_split_remove_page(). It removes one entry from local MDT.
+ * Do not corrupt byte order in page, it will be sent to remote MDT.
*/
static int cmm_split_remove_entry(const struct lu_env *env,
struct md_object *mo,
{
struct cmm_device *cmm = cmm_obj2dev(md2cmm_obj(mo));
struct cmm_thread_info *cmi;
- struct md_attr *ma;
+ struct md_attr *ma;
struct cmm_object *obj;
int is_dir, rc;
char *name;
if (lu_object_exists(&obj->cmo_obj.mo_lu) > 0)
is_dir = S_ISDIR(lu_object_attr(&obj->cmo_obj.mo_lu));
else
- /*
- * XXX: These days only cross-ref dirs are possible, so for the
+ /**
+ * \note These days only cross-ref dirs are possible, so for the
* sake of simplicity, in split, we suppose that all cross-ref
- * names pint to directory and do not do additional getattr to
+ * names point to directory and do not do additional getattr to
* remote MDT.
*/
is_dir = 1;
memcpy(name, ent->lde_name, le16_to_cpu(ent->lde_namelen));
lname = cmm_name(env, name, le16_to_cpu(ent->lde_namelen) + 1);
- /*
- * When split, no need update parent's ctime,
+ /**
+ * \note When split, no need update parent's ctime,
* and no permission check for name_remove.
*/
ma->ma_attr.la_ctime = 0;
if (rc)
GOTO(cleanup, rc);
- /*
- * This @ent will be transferred to slave MDS and insert there, so in
- * the slave MDS, we should know whether this object is dir or not, so
- * use the highest bit of the hash to indicate that (because we do not
- * use highest bit of hash).
+ /**
+ * \note For each entry transferred to the slave MDS we should know
+ * whether this object is dir or not. Therefore the highest bit of the
+ * hash is used to indicate that (it is unused for hash purposes anyway).
*/
if (is_dir) {
ent->lde_hash = le64_to_cpu(ent->lde_hash);
return rc;
}
-/*
- * Remove all entries from passed page. These entries are going to remote MDT
- * and thus should be removed locally.
+/**
+ * Remove all entries from passed page.
+ * These entries are going to remote MDT and thus should be removed locally.
*/
static int cmm_split_remove_page(const struct lu_env *env,
struct md_object *mo,
return rc;
}
-/* Send one page to remote MDT for creating entries there. */
+/**
+ * Send one page of entries to the slave MDT.
+ * This page contains entries to be created there.
+ */
static int cmm_split_send_page(const struct lu_env *env,
struct md_object *mo,
struct lu_rdpg *rdpg,
RETURN(rc);
}
-/* Read one page of entries from local MDT. */
+/** Read one page of entries from local MDT. */
static int cmm_split_read_page(const struct lu_env *env,
struct md_object *mo,
struct lu_rdpg *rdpg)
RETURN(rc);
}
-/*
- * This function performs migration of all pages with entries which fit into one
- * stripe and one hash segment.
+/**
+ * This function performs migration of each directory stripe to its MDS.
*/
static int cmm_split_process_stripe(const struct lu_env *env,
struct md_object *mo,
struct lu_dirpage *ldp;
__u32 len = 0;
- /* Read one page from local MDT. */
+ /** - Read one page of entries from local MDT. */
rc = cmm_split_read_page(env, mo, rdpg);
if (rc) {
CERROR("Error in readpage: %d\n", rc);
RETURN(rc);
}
- /* Remove local entries which are going to remite MDT. */
+ /** - Remove local entries which are going to remote MDT. */
rc = cmm_split_remove_page(env, mo, rdpg, end, &len);
if (rc) {
CERROR("Error in remove stripe entries: %d\n", rc);
RETURN(rc);
}
- /* Send entries page to slave MDT. */
+ /**
+ * - Send entries page to slave MDT and repeat while there are
+ * more pages.
+ */
if (len > 0) {
rc = cmm_split_send_page(env, mo, rdpg, lf, len);
if (rc) {
RETURN(rc);
}
+/**
+ * Directory scanner for split operation.
+ *
+ * It calculates hashes for names and organizes files to stripes.
+ */
static int cmm_split_process_dir(const struct lu_env *env,
struct md_object *mo,
struct md_attr *ma)
{
struct cmm_device *cmm = cmm_obj2dev(md2cmm_obj(mo));
struct lu_rdpg *rdpg = &cmm_env_info(env)->cmi_rdpg;
- __u64 hash_segement;
+ __u64 hash_segment;
int rc = 0, i;
ENTRY;
GOTO(cleanup, rc = -ENOMEM);
}
- LASSERT(ma->ma_valid & MA_LMV);
- hash_segement = MAX_HASH_SIZE / (cmm->cmm_tgt_count + 1);
+ hash_segment = MAX_HASH_SIZE;
+ /** Whole hash range is divided into segments by number of MDS-es. */
+ do_div(hash_segment, cmm->cmm_tgt_count + 1);
+ /**
+ * For each segment the cmm_split_process_stripe() is called to move
+ * entries to the new server.
+ */
for (i = 1; i < cmm->cmm_tgt_count + 1; i++) {
struct lu_fid *lf;
__u64 hash_end;
lf = &ma->ma_lmv->mea_ids[i];
- rdpg->rp_hash = i * hash_segement;
+ rdpg->rp_hash = i * hash_segment;
if (i == cmm->cmm_tgt_count)
hash_end = MAX_HASH_SIZE;
else
- hash_end = rdpg->rp_hash + hash_segement;
+ hash_end = rdpg->rp_hash + hash_segment;
rc = cmm_split_process_stripe(env, mo, rdpg, lf, hash_end);
if (rc) {
CERROR("Error (rc = %d) while splitting for %d: fid="
- DFID", %08x:%08x\n", rc, i, PFID(lf),
+ DFID", "LPX64":"LPX64"\n", rc, i, PFID(lf),
rdpg->rp_hash, hash_end);
GOTO(cleanup, rc);
}
cleanup:
for (i = 0; i < rdpg->rp_npages; i++)
if (rdpg->rp_pages[i] != NULL)
- __cfs_free_page(rdpg->rp_pages[i]);
+ cfs_free_page(rdpg->rp_pages[i]);
return rc;
}
+/**
+ * Directory splitting.
+ *
+ * Big directory can be split eventually.
+ */
int cmm_split_dir(const struct lu_env *env, struct md_object *mo)
{
struct cmm_device *cmm = cmm_obj2dev(md2cmm_obj(mo));
LASSERT(S_ISDIR(lu_object_attr(&mo->mo_lu)));
memset(ma, 0, sizeof(*ma));
- /* Step1: Checking whether the dir needs to be split. */
+ /** - Step1: Checking whether the dir needs to be split. */
rc = cmm_split_expect(env, mo, ma, &split);
if (rc)
GOTO(out, rc);
CWARN("Dir "DFID" is going to split (size: "LPU64")\n",
PFID(lu_object_fid(&mo->mo_lu)), ma->ma_attr.la_size);
- /*
- * Disable transacrions for split, since there will be so many trans in
+ /**
+ * \note Disable transactions for split, since there will be so many trans in
* this one ops, conflict with current recovery design.
*/
- rc = cmm_upcall(env, &cmm->cmm_md_dev, MD_NO_TRANS);
+ rc = cmm_upcall(env, &cmm->cmm_md_dev, MD_NO_TRANS, NULL);
if (rc) {
CERROR("Can't disable trans for split, rc %d\n", rc);
GOTO(out, rc);
}
- /* Step2: Prepare the md memory */
+ /** - Step2: Prepare the md memory */
ma->ma_lmv_size = CMM_MD_SIZE(cmm->cmm_tgt_count + 1);
OBD_ALLOC(ma->ma_lmv, ma->ma_lmv_size);
if (ma->ma_lmv == NULL)
GOTO(out, rc = -ENOMEM);
- /* Step3: Create slave objects and fill the ma->ma_lmv */
+ /** - Step3: Create slave objects and fill the ma->ma_lmv */
rc = cmm_split_slaves_create(env, mo, ma);
if (rc) {
CERROR("Can't create slaves for split, rc %d\n", rc);
GOTO(cleanup, rc);
}
- /* Step4: Scan and split the object. */
+ /** - Step4: Scan and split the object. */
rc = cmm_split_process_dir(env, mo, ma);
if (rc) {
CERROR("Can't scan and split, rc %d\n", rc);
GOTO(cleanup, rc);
}
- /* Step5: Set mea to the master object. */
- LASSERT(ma->ma_valid & MA_LMV);
+ /** - Step5: Set mea to the master object. */
buf = cmm_buf_get(env, ma->ma_lmv, ma->ma_lmv_size);
rc = mo_xattr_set(env, md_object_next(mo), buf,
- MDS_LMV_MD_NAME, 0, NULL);
+ MDS_LMV_MD_NAME, 0);
if (rc) {
CERROR("Can't set MEA to master dir, " "rc %d\n", rc);
GOTO(cleanup, rc);
/* set flag in cmm_object */
md2cml_obj(mo)->clo_split = CMM_SPLIT_DONE;
- /*
- * Finally, split succeed, tell client to repeat opetartion on correct
+ /**
+ * - Finally, split succeeded, tell client to repeat operation on correct
* MDT.
*/
CWARN("Dir "DFID" has been split\n", PFID(lu_object_fid(&mo->mo_lu)));
cmm_lprocfs_time_end(env, cmm, LPROC_CMM_SPLIT);
return rc;
}
+/** @} */