1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
6 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 only,
10 * as published by the Free Software Foundation.
12 * This program is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License version 2 for more details (a copy is included
16 * in the LICENSE file that accompanied this code).
18 * You should have received a copy of the GNU General Public License
19 * version 2 along with this program; If not, see
20 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
22 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23 * CA 95054 USA or visit www.sun.com if you need additional information or
29 * Copyright 2008 Sun Microsystems, Inc. All rights reserved
30 * Use is subject to license terms.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
36 * lustre/mds/handler.c
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Andreas Dilger <adilger@clusterfs.com>
40 * Author: Phil Schwan <phil@clusterfs.com>
41 * Author: Mike Shaver <shaver@clusterfs.com>
44 #define DEBUG_SUBSYSTEM S_MDS
46 #include <lustre_mds.h>
47 #include <linux/module.h>
48 #include <linux/init.h>
49 #include <linux/random.h>
51 #include <linux/jbd.h>
52 #include <linux/smp_lock.h>
53 #include <linux/buffer_head.h>
54 #include <linux/workqueue.h>
55 #include <linux/mount.h>
57 #include <lustre_acl.h>
58 #include <obd_class.h>
59 #include <lustre_dlm.h>
61 #include <lustre_fsfilt.h>
62 #include <lprocfs_status.h>
63 #include <lustre_disk.h>
64 #include <lustre_param.h>
66 #include "mds_internal.h"
// Module-wide tunable, initialised to 0xFFFF and registered through
// CFS_MODULE_PARM as an int parameter with mode 0444.
// Presumably the highest OST index the MDS accepts -- confirm against users.
// NOTE(review): the rest of the CFS_MODULE_PARM invocation is not visible in
// this extract.
68 __u32 mds_max_ost_index=0xFFFF;
69 CFS_MODULE_PARM(mds_max_ost_index, "i", int, 0444,
// mds_fid2dentry: resolve a fid to a dentry by looking up the inode number,
// formatted as hex, under mds->mds_fid_de.
//
// mds: MDS state; supplies the lookup parent (mds_fid_de) and the vfsmount
//      stored through mnt.
// fid: fid->id is used as the inode number and fid->generation as the expected
//      generation; a zero generation skips the generation comparison.
// mnt: out parameter that receives mds->mds_obt.obt_vfsmnt.
//      NOTE(review): mds_lvfs_fid2dentry passes NULL here, so the store is
//      presumably guarded by a line missing from this extract -- confirm.
//
// Error returns visible here are ERR_PTR(-ESTALE) and ERR_PTR(-ENOENT).
// NOTE(review): the success return and several guarding conditions are not
// visible in this extract.
72 /* Look up an entry by inode number. */
73 /* this function ONLY returns valid dget'd dentries with an initialized inode
75 struct dentry *mds_fid2dentry(struct mds_obd *mds, struct ll_fid *fid,
76 struct vfsmount **mnt)
79 unsigned long ino = fid->id;
80 __u32 generation = fid->generation;
82 struct dentry *result;
// NOTE(review): the condition guarding this -ESTALE return is not visible.
85 RETURN(ERR_PTR(-ESTALE));
// The name looked up is the inode number printed as 0x followed by hex digits.
87 snprintf(fid_name, sizeof(fid_name), "0x%lx", ino);
89 /* under ext3 this is neither supposed to return bad inodes
91 result = ll_lookup_one_len(fid_name, mds->mds_fid_de, strlen(fid_name));
95 inode = result->d_inode;
// NOTE(review): the condition guarding this -ENOENT return is not visible.
97 RETURN(ERR_PTR(-ENOENT));
// A zero generation or zero link count is reported on the console as possible
// disk corruption and the lookup fails with -ENOENT.
99 if (inode->i_generation == 0 || inode->i_nlink == 0) {
100 LCONSOLE_WARN("Found inode with zero generation or link -- this"
101 " may indicate disk corruption (inode: %lu/%u, "
102 "link %lu, count %d)\n", inode->i_ino,
103 inode->i_generation,(unsigned long)inode->i_nlink,
104 atomic_read(&inode->i_count));
106 RETURN(ERR_PTR(-ENOENT));
// The generation is only enforced when the caller supplied a non-zero one.
109 if (generation && inode->i_generation != generation) {
110 /* we didn't find the right inode.. */
111 CDEBUG(D_INODE, "found wrong generation: inode %lu, link: %lu, "
112 "count: %d, generation %u/%u\n", inode->i_ino,
113 (unsigned long)inode->i_nlink,
114 atomic_read(&inode->i_count), inode->i_generation,
117 RETURN(ERR_PTR(-ENOENT));
121 *mnt = mds->mds_obt.obt_vfsmnt;
// mds_lov_presetup: record the profile name carried in config buffer 3.
// When lcfg holds at least four buffers and buffer 3 is non-empty, a random
// UUID is generated and unparsed into mds->mds_lov_uuid, and the buffer 3
// string is copied into a newly allocated mds->mds_profile.
// NOTE(review): the return statements, including the one taken when the
// allocation yields NULL, are not visible in this extract.
128 static int mds_lov_presetup (struct mds_obd *mds, struct lustre_cfg *lcfg)
133 if (lcfg->lcfg_bufcount >= 4 && LUSTRE_CFG_BUFLEN(lcfg, 3) > 0) {
136 ll_generate_random_uuid(uuid);
137 class_uuid_unparse(uuid, &mds->mds_lov_uuid);
139 OBD_ALLOC(mds->mds_profile, LUSTRE_CFG_BUFLEN(lcfg, 3));
140 if (mds->mds_profile == NULL)
// The copy is bounded by the same length that sized the allocation.
// Assumes that length includes the terminating NUL -- TODO confirm, since
// strncpy does not terminate the destination when the source fills the bound.
143 strncpy(mds->mds_profile, lustre_cfg_string(lcfg, 3),
144 LUSTRE_CFG_BUFLEN(lcfg, 3));
// mds_lov_clean: drop the stored profile and shut down the device held in
// mds->mds_osc_obd.
// Visible steps: delete and free the profile name, unregister the observer on
// the osc device, copy the obd_force and obd_fail flags onto it, disconnect
// mds->mds_osc_exp, then call class_manual_cleanup on it.
// NOTE(review): the check that leads to RETURN(PTR_ERR(osc)) and the final
// return are not visible in this extract.
149 static int mds_lov_clean(struct obd_device *obd)
151 struct mds_obd *mds = &obd->u.mds;
152 struct obd_device *osc = mds->mds_osc_obd;
155 if (mds->mds_profile) {
156 class_del_profile(mds->mds_profile);
// NOTE(review): the size freed is strlen + 1, while mds_lov_presetup allocates
// LUSTRE_CFG_BUFLEN(lcfg, 3) bytes; confirm the two always agree.
157 OBD_FREE(mds->mds_profile, strlen(mds->mds_profile) + 1);
158 mds->mds_profile = NULL;
161 /* There better be a lov */
165 RETURN(PTR_ERR(osc));
// Passing NULL clears the observer registered on the osc device.
167 obd_register_observer(osc, NULL);
169 /* Give lov our same shutdown flags */
170 osc->obd_force = obd->obd_force;
171 osc->obd_fail = obd->obd_fail;
173 /* Cleanup the lov */
174 obd_disconnect(mds->mds_osc_exp);
175 class_manual_cleanup(osc);
// mds_postsetup: set up the llog contexts for this device and, when a profile
// name was recorded, connect to the target named by that profile.
// Visible steps: llog_setup for LLOG_CONFIG_ORIG_CTXT and LLOG_LOVEA_ORIG_CTXT,
// mds_changelog_llog_init, then class_get_profile followed by mds_lov_connect
// with lprof->lp_dt. Failures in the profile branch jump to err_cleanup with
// rc set (-ENOENT when no profile is found).
// NOTE(review): declarations, result checks, labels, the final return and the
// trailing arguments of both llog_setup calls are not visible in this extract.
180 static int mds_postsetup(struct obd_device *obd)
182 struct mds_obd *mds = &obd->u.mds;
183 struct llog_ctxt *ctxt;
187 rc = llog_setup(obd, &obd->obd_olg, LLOG_CONFIG_ORIG_CTXT, obd, 0, NULL,
192 rc = llog_setup(obd, &obd->obd_olg, LLOG_LOVEA_ORIG_CTXT, obd, 0, NULL,
197 mds_changelog_llog_init(obd, obd);
199 if (mds->mds_profile) {
200 struct lustre_profile *lprof;
201 /* The profile defines which osc and mdc to connect to, for a
202 client. We reuse that here to figure out the name of the
203 lov to use (and ignore lprof->lp_md).
204 The profile was set in the config log with
205 LCFG_MOUNTOPT profilenm oscnm mdcnm */
206 lprof = class_get_profile(mds->mds_profile);
208 CERROR("No profile found: %s\n", mds->mds_profile);
209 GOTO(err_cleanup, rc = -ENOENT);
211 rc = mds_lov_connect(obd, lprof->lp_dt);
213 GOTO(err_cleanup, rc);
// NOTE(review): the two lookups below presumably belong to the err_cleanup
// path that releases the contexts set up above -- confirm against the full
// file, since the label and the release calls are not visible here.
220 ctxt = llog_get_context(obd, LLOG_LOVEA_ORIG_CTXT);
224 ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT);
// mds_postrecov: post-recovery hook, installed as o_postrecov in
// mds_cmd_obd_ops. Asserts that the device is no longer recovering, cleans the
// PENDING directory for devices whose name does not start with MDD_OBD_NAME,
// then notifies the device in obd->u.mds.mds_osc_obd.
// NOTE(review): declarations, result checks and the return are not visible in
// this extract.
230 int mds_postrecov(struct obd_device *obd)
238 LASSERT(!obd->obd_recovering);
239 /* clean PENDING dir */
// strncmp is non-zero when the name prefix differs from MDD_OBD_NAME.
241 if (strncmp(obd->obd_name, MDD_OBD_NAME, strlen(MDD_OBD_NAME)))
242 rc = mds_cleanup_pending(obd);
246 /* FIXME Does target_finish_recovery really need this to block? */
247 /* Notify the LOV, which will in turn call mds_notify for each tgt */
248 /* This means that we have to hack obd_notify to think we're obd_set_up
249 during mds_lov_connect. */
// The non-blocking notify event is chosen when obd_async_recov is set.
250 obd_notify(obd->u.mds.mds_osc_obd, NULL,
251 obd->obd_async_recov ? OBD_NOTIFY_SYNC_NONBLOCK :
252 OBD_NOTIFY_SYNC, NULL);
// mds_lov_early_clean: run the OBD_CLEANUP_EARLY precleanup stage on the
// device in mds->mds_osc_obd and return its result.
// The early-out test fires when there is no such device, or when neither
// obd_force nor obd_fail is set on obd.
// NOTE(review): the statement executed by that early-out is not visible.
257 /* We need to be able to stop an mds_lov_synchronize */
258 static int mds_lov_early_clean(struct obd_device *obd)
260 struct mds_obd *mds = &obd->u.mds;
261 struct obd_device *osc = mds->mds_osc_obd;
263 if (!osc || (!obd->obd_force && !obd->obd_fail))
266 CDEBUG(D_HA, "abort inflight\n");
267 return (obd_precleanup(osc, OBD_CLEANUP_EARLY));
// mds_precleanup: o_precleanup handler (installed in mds_cmd_obd_ops),
// dispatched on the cleanup stage.
// Visible work under OBD_CLEANUP_EXPORTS: call mds_lov_early_clean, then, with
// mds_notify_lock held for write, call mds_lov_disconnect, look up both llog
// contexts, call obd_llog_finish and clear mds->mds_osc_exp.
// NOTE(review): the switch header, the body of the OBD_CLEANUP_EARLY case, the
// handling of the looked-up contexts and the return are not visible here.
270 static int mds_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
272 struct mds_obd *mds = &obd->u.mds;
273 struct llog_ctxt *ctxt;
278 case OBD_CLEANUP_EARLY:
280 case OBD_CLEANUP_EXPORTS:
281 mds_lov_early_clean(obd);
// The write lock spans the disconnect through the clearing of mds_osc_exp.
282 cfs_down_write(&mds->mds_notify_lock);
283 mds_lov_disconnect(obd);
285 ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT);
288 ctxt = llog_get_context(obd, LLOG_LOVEA_ORIG_CTXT);
291 rc = obd_llog_finish(obd, 0);
292 mds->mds_osc_exp = NULL;
293 cfs_up_write(&mds->mds_notify_lock);
// mds_lvfs_fid2dentry: adapter with the lvfs callback signature that forwards
// to mds_fid2dentry. data is treated as the obd_device, gen becomes
// fid.generation, and NULL is passed for the vfsmount out parameter.
// NOTE(review): the remaining parameters, the fid declaration and the store of
// id into the fid are not visible in this extract.
299 static struct dentry *mds_lvfs_fid2dentry(__u64 id, __u32 gen, __u64 gr,
302 struct obd_device *obd = data;
305 fid.generation = gen;
306 return mds_fid2dentry(&obd->u.mds, &fid, NULL);
// lvfs callback table for this device; mds_init_ctxt copies it into
// obd->obd_lvfs_ctxt.cb_ops. Only the l_fid2dentry slot is visible here, set
// with the GNU label-style initializer syntax.
310 struct lvfs_callback_ops mds_lvfs_ops = {
311 l_fid2dentry: mds_lvfs_fid2dentry,
// mds_init_ctxt: bind the device to the mount mnt.
// Stores mnt in mds->mds_obt.obt_vfsmnt, takes the superblock from the inode
// of the mount root, calls fsfilt_setup with it, and fills obd->obd_lvfs_ctxt
// with the mount, its root dentry, get_ds() and the mds_lvfs_ops callbacks.
314 static void mds_init_ctxt(struct obd_device *obd, struct vfsmount *mnt)
316 struct mds_obd *mds = &obd->u.mds;
318 mds->mds_obt.obt_vfsmnt = mnt;
319 /* why not mnt->mnt_sb instead of mnt->mnt_root->d_inode->i_sb? */
320 obd->u.obt.obt_sb = mnt->mnt_root->d_inode->i_sb;
322 fsfilt_setup(obd, obd->u.obt.obt_sb);
324 OBD_SET_CTXT_MAGIC(&obd->obd_lvfs_ctxt);
// pwdmnt and pwd together name the root of the mount.
325 obd->obd_lvfs_ctxt.pwdmnt = mnt;
326 obd->obd_lvfs_ctxt.pwd = mnt->mnt_root;
327 obd->obd_lvfs_ctxt.fs = get_ds();
328 obd->obd_lvfs_ctxt.cb_ops = mds_lvfs_ops;
// mds_cmd_setup: o_setup handler (installed in mds_cmd_obd_ops).
// Visible sequence: validate the config (at least five buffers), take the
// server mount named by buffer 4, initialise the notify lock and fsfilt ops,
// bind the lvfs context, then inside push_ctxt and pop_ctxt obtain the OBJECTS
// and __iopen__ dentries, initialise the LOV object ids, run mds_lov_presetup
// and mds_postsetup. Failures jump to unwind labels at the end.
// NOTE(review): several declarations, guarding conditions, the assignment of
// mnt, the labels and the returns are not visible in this extract.
332 /*mds still need lov setup here*/
333 static int mds_cmd_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
335 struct mds_obd *mds = &obd->u.mds;
336 struct lvfs_run_ctxt saved;
338 struct vfsmount *mnt;
339 struct lustre_sb_info *lsi;
340 struct lustre_mount_info *lmi;
341 struct dentry *dentry;
345 CDEBUG(D_INFO, "obd %s setup \n", obd->obd_name);
// Taken when the device name does not start with MDD_OBD_NAME.
// NOTE(review): the statement this condition guards is not visible.
346 if (strncmp(obd->obd_name, MDD_OBD_NAME, strlen(MDD_OBD_NAME)))
349 if (lcfg->lcfg_bufcount < 5) {
350 CERROR("invalid arg for setup %s\n", MDD_OBD_NAME);
// Config buffer 4 names the device whose server mount is looked up.
353 dev = lustre_cfg_string(lcfg, 4);
354 lmi = server_get_mount(dev);
355 LASSERT(lmi != NULL);
357 lsi = s2lsi(lmi->lmi_sb);
359 /* FIXME: MDD LOV initialize objects.
360 * we need only lmi here but not get mount
361 * OSD did mount already, so put mount back
363 cfs_atomic_dec(&lsi->lsi_mounts);
365 cfs_init_rwsem(&mds->mds_notify_lock);
367 obd->obd_fsops = fsfilt_get_ops(MT_STR(lsi->lsi_ldd));
// NOTE(review): the assignment of mnt is not visible in this extract.
368 mds_init_ctxt(obd, mnt);
370 push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
371 dentry = simple_mkdir(current->fs->pwd, mnt, "OBJECTS", 0777, 1);
372 if (IS_ERR(dentry)) {
373 rc = PTR_ERR(dentry);
374 CERROR("cannot create OBJECTS directory: rc = %d\n", rc);
377 mds->mds_objects_dir = dentry;
// __iopen__ becomes mds_fid_de, the parent that mds_fid2dentry looks up under.
379 dentry = lookup_one_len("__iopen__", current->fs->pwd,
380 strlen("__iopen__"));
381 if (IS_ERR(dentry)) {
382 rc = PTR_ERR(dentry);
383 CERROR("cannot lookup __iopen__ directory: rc = %d\n", rc);
384 GOTO(err_objects, rc);
387 mds->mds_fid_de = dentry;
388 if (!dentry->d_inode || is_bad_inode(dentry->d_inode)) {
390 CERROR("__iopen__ directory has no inode? rc = %d\n", rc);
393 rc = mds_lov_init_objids(obd);
395 CERROR("cannot init lov objid rc = %d\n", rc);
399 rc = mds_lov_presetup(mds, lcfg);
401 GOTO(err_objects, rc);
403 /* Don't wait for mds_postrecov trying to clear orphans */
404 obd->obd_async_recov = 1;
405 rc = mds_postsetup(obd);
406 /* Bug 11557 - allow async abort_recov start
407 FIXME can remove most of this obd_async_recov plumbing
408 obd->obd_async_recov = 0;
412 GOTO(err_objects, rc);
415 pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
// Unwind in reverse order of acquisition: the __iopen__ dentry, the OBJECTS
// dentry, then the fsfilt ops.
// NOTE(review): the labels separating these steps are not visible.
418 dput(mds->mds_fid_de);
420 dput(mds->mds_objects_dir);
422 fsfilt_put_ops(obd->obd_fsops);
// mds_cmd_cleanup: o_cleanup handler (installed in mds_cmd_obd_ops).
// Clears mds->mds_osc_exp, then inside push_ctxt and pop_ctxt destroys the LOV
// object ids, drops the OBJECTS and mds_fid_de dentries, turns quota off on
// the superblock, shrinks its dcache and releases the fsfilt ops.
// NOTE(review): the condition for the failover message, the statement guarded
// by the MDD_OBD_NAME test and the return are not visible in this extract.
426 static int mds_cmd_cleanup(struct obd_device *obd)
428 struct mds_obd *mds = &obd->u.mds;
429 struct lvfs_run_ctxt saved;
433 mds->mds_osc_exp = NULL;
436 LCONSOLE_WARN("%s: shutting down for failover; client state "
437 "will be preserved.\n", obd->obd_name);
// Taken when the device name does not start with MDD_OBD_NAME.
439 if (strncmp(obd->obd_name, MDD_OBD_NAME, strlen(MDD_OBD_NAME)))
442 push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
444 mds_lov_destroy_objids(obd);
// The OBJECTS dentry is NULL-checked and released with l_dput, while
// mds_fid_de below is released with plain dput and no check.
446 if (mds->mds_objects_dir != NULL) {
447 l_dput(mds->mds_objects_dir);
448 mds->mds_objects_dir = NULL;
451 dput(mds->mds_fid_de);
452 LL_DQUOT_OFF(obd->u.obt.obt_sb);
453 shrink_dcache_sb(mds->mds_obt.obt_sb);
454 fsfilt_put_ops(obd->obd_fsops);
456 pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
// mds_cmd_health_check: health-check hook for the device. Its slot in
// mds_cmd_obd_ops (o_health_check) is commented out, so nothing visible in
// this file installs it.
// NOTE(review): the function body is not visible in this extract.
461 static int mds_cmd_health_check(struct obd_device *obd)
// Method table registered by mds_cmd_init under LUSTRE_MDS_NAME.
// It wires setup, cleanup, precleanup, object create and destroy, llog init
// and finish, notify and post-recovery; the health-check entry is left
// commented out.
466 static struct obd_ops mds_cmd_obd_ops = {
467 .o_owner = THIS_MODULE,
468 .o_setup = mds_cmd_setup,
469 .o_cleanup = mds_cmd_cleanup,
470 .o_precleanup = mds_precleanup,
471 .o_create = mds_obd_create,
472 .o_destroy = mds_obd_destroy,
473 .o_llog_init = mds_llog_init,
474 .o_llog_finish = mds_llog_finish,
475 .o_notify = mds_notify,
476 .o_postrecov = mds_postrecov,
477 // .o_health_check = mds_cmd_health_check,
// Handle on the quota interface: assigned from PORTAL_SYMBOL_GET in
// mds_cmd_init, released with PORTAL_SYMBOL_PUT, and exported with
// EXPORT_SYMBOL. The extern declaration names the symbol being looked up.
480 quota_interface_t *mds_quota_interface_ref;
481 extern quota_interface_t mds_quota_interface;
// mds_cmd_init: module entry point (passed to module_init).
// Requests the lquota module, takes a reference on mds_quota_interface, runs
// lquota_init, installs the quota methods into mds_cmd_obd_ops, then registers
// the obd type under LUSTRE_MDS_NAME with the lprocfs module variables.
// NOTE(review): the check on rc that guards the symbol release below, and the
// return statements, are not visible in this extract.
483 static int __init mds_cmd_init(void)
485 struct lprocfs_static_vars lvars;
488 cfs_request_module("%s", "lquota");
489 mds_quota_interface_ref = PORTAL_SYMBOL_GET(mds_quota_interface);
490 rc = lquota_init(mds_quota_interface_ref);
// Presumably the failure path of lquota_init: drop the symbol reference taken
// above -- confirm against the full file.
492 if (mds_quota_interface_ref)
493 PORTAL_SYMBOL_PUT(mds_quota_interface);
496 init_obd_quota_ops(mds_quota_interface_ref, &mds_cmd_obd_ops);
498 lprocfs_mds_init_vars(&lvars);
// NOTE(review): the result of class_register_type is not captured by this
// statement; confirm that a registration failure is reported to the caller.
499 class_register_type(&mds_cmd_obd_ops, NULL, lvars.module_vars,
500 LUSTRE_MDS_NAME, NULL);
// mds_cmd_exit: module exit point (passed to module_exit).
// Shuts down quota, releases the quota interface symbol when a reference is
// held, then unregisters the LUSTRE_MDS_NAME obd type.
505 static void /*__exit*/ mds_cmd_exit(void)
507 lquota_exit(mds_quota_interface_ref);
508 if (mds_quota_interface_ref)
509 PORTAL_SYMBOL_PUT(mds_quota_interface);
511 class_unregister_type(LUSTRE_MDS_NAME);
// Module metadata: export the quota interface reference, declare author,
// description and licence, and name the init and exit entry points.
514 EXPORT_SYMBOL(mds_quota_interface_ref);
515 MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
516 MODULE_DESCRIPTION("Lustre Metadata Server (MDS)");
517 MODULE_LICENSE("GPL");
519 module_init(mds_cmd_init);
520 module_exit(mds_cmd_exit);