1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * lustre/mgs/mgs_handler.c
5 * Lustre Management Server (mgs) request handler
7 * Copyright (C) 2001-2005 Cluster File Systems, Inc.
8 * Author Nathan <nathan@clusterfs.com>
9 * Author LinSongTao <lincent@clusterfs.com>
11 * This file is part of Lustre, http://www.lustre.org.
13 * Lustre is free software; you can redistribute it and/or
14 * modify it under the terms of version 2 of the GNU General Public
15 * License as published by the Free Software Foundation.
17 * Lustre is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
22 * You should have received a copy of the GNU General Public License
23 * along with Lustre; if not, write to the Free Software
24 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
28 # define EXPORT_SYMTAB
30 #define DEBUG_SUBSYSTEM S_MGS
33 # include <linux/module.h>
34 # include <linux/pagemap.h>
35 # include <linux/miscdevice.h>
36 # include <linux/init.h>
38 # include <liblustre.h>
41 #include <linux/obd_class.h>
42 #include <linux/lustre_dlm.h>
43 #include <linux/lprocfs_status.h>
44 #include <linux/lustre_fsfilt.h>
45 #include <linux/lustre_commit_confd.h>
46 #include "mgs_internal.h"
48 static int mgs_postsetup(struct obd_device *obd);
49 static int mgs_cleanup(struct obd_device *obd);
51 /* Establish a connection to the MGS.*/
/*
 * Connect handler for the MGS obd (installed as .o_connect in mgs_obd_ops).
 *
 * conn:   handle passed to class_connect() and then resolved to an export
 * obd:    the MGS device being connected to
 * cluuid: UUID of the connecting client; copied into the new client record
 * data:   connect data; its ocd_connect_flags are masked in place
 *
 * NOTE(review): this listing appears to have lost its short lines (braces,
 * bare "if" tests, returns, labels), so the conditions guarding several of
 * the statements below cannot be seen here. The comments describe only the
 * statements that are visible.
 */
52 static int mgs_connect(struct lustre_handle *conn, struct obd_device *obd,
53 struct obd_uuid *cluuid, struct obd_connect_data *data)
55 struct obd_export *exp;
56 struct mgs_export_data *med;
57 struct mgs_client_data *mcd;
58 int rc, abort_recovery;
/* All three pointers are required (the failure action is not visible). */
61 if (!conn || !obd || !cluuid)
64 /* Check for aborted recovery. */
/* The flag is sampled under the lock and acted on after it is dropped. */
65 spin_lock_bh(&obd->obd_processing_task_lock);
66 abort_recovery = obd->obd_abort_recovery;
67 spin_unlock_bh(&obd->obd_processing_task_lock);
69 target_abort_recovery(obd);
71 rc = class_connect(conn, obd, cluuid);
74 exp = class_conn2export(conn);
76 med = &exp->exp_mgs_data;
/*
 * Keep only the connect flags the MGS supports and record the result on
 * the export. The caller's data is modified in place.
 */
79 data->ocd_connect_flags &= MGS_CONNECT_SUPPORTED;
80 exp->exp_connect_flags = data->ocd_connect_flags;
/* Per-client record; it receives a copy of the client UUID. */
83 OBD_ALLOC(mcd, sizeof(*mcd));
85 CERROR("mgs: out of memory for client data\n");
86 GOTO(out, rc = -ENOMEM);
89 memcpy(mcd->mcd_uuid, cluuid, sizeof(mcd->mcd_uuid));
/*
 * Presumably the failure path behind the "out" label: release the client
 * record and undo the connection. Which of these calls also run on
 * success cannot be determined from the visible lines.
 */
95 OBD_FREE(mcd, sizeof(*mcd));
98 class_disconnect(exp);
100 class_export_put(exp);
/*
 * Export-initialisation hook (.o_init_export in mgs_obd_ops): prepares the
 * MGS-specific part of a new export by initialising the med_open_head list
 * and the med_open_lock spinlock.
 */
106 static int mgs_init_export(struct obd_export *exp)
108 struct mgs_export_data *med = &exp->exp_mgs_data;
110 INIT_LIST_HEAD(&med->med_open_head);
111 spin_lock_init(&med->med_open_lock);
/*
 * Disconnect handler (.o_disconnect in mgs_obd_ops).
 *
 * Holds an extra reference on the export for the duration of the call,
 * disconnects it with class_disconnect(), and then drains the export's
 * list of outstanding replies so that each one is handed to
 * ptlrpc_schedule_difficult_reply().
 *
 * NOTE(review): short lines (declarations, braces, the return) are missing
 * from this listing, so how rc is finally returned is not visible.
 */
115 static int mgs_disconnect(struct obd_export *exp)
117 unsigned long irqflags;
122 class_export_get(exp);
124 /* Disconnect early so that clients can't keep using export */
125 rc = class_disconnect(exp);
127 /* complete all outstanding replies */
/*
 * exp_lock is held with interrupts saved across the whole loop. Each reply
 * state is unlinked and scheduled while its own service's srv_lock is held.
 */
128 spin_lock_irqsave(&exp->exp_lock, irqflags);
129 while (!list_empty(&exp->exp_outstanding_replies)) {
130 struct ptlrpc_reply_state *rs =
131 list_entry(exp->exp_outstanding_replies.next,
132 struct ptlrpc_reply_state, rs_exp_list);
133 struct ptlrpc_service *svc = rs->rs_service;
135 spin_lock(&svc->srv_lock);
136 list_del_init(&rs->rs_exp_list);
137 ptlrpc_schedule_difficult_reply(rs);
138 spin_unlock(&svc->srv_lock);
140 spin_unlock_irqrestore(&exp->exp_lock, irqflags);
/* Drop the reference taken at the top of the function. */
142 class_export_put(exp);
146 /* mount the file system (secretly) */
/*
 * Setup hook for the MGS obd (.o_setup in mgs_obd_ops).
 *
 * buf is a struct lustre_cfg. Per the "setup" comment below, its string
 * buffers are: 1 = block device, 2 = filesystem type, 3 = "mgs",
 * 4 = optional mount options.
 *
 * Visible sequence: validate the config, look up the fsfilt operations for
 * the filesystem type, build the option string in a freshly allocated and
 * zeroed page, mount the device with do_kern_mount(), run mgs_fs_setup(),
 * initialise the two llog lists, start the llog commit thread, set up the
 * LDLM callback client, mark the device replayable, run mgs_postsetup(),
 * register the lprocfs entries, print a console message describing the
 * recovery state and start the ping evictor. The trailing mntput() and
 * fsfilt_put_ops() calls are presumably the error-unwind path.
 *
 * The option string is formatted with snprintf() bounded by PAGE_SIZE,
 * because the buffer is exactly one page and the option text comes from
 * the configuration record.
 *
 * NOTE(review): this listing has lost its short lines (braces, bare "if"
 * tests, labels, some declarations), so the error checks between the
 * steps are not visible here.
 */
147 static int mgs_setup(struct obd_device *obd, obd_count len, void *buf)
149 struct lprocfs_static_vars lvars;
150 struct lustre_cfg* lcfg = buf;
151 char *options = NULL;
152 struct mgs_obd *mgs = &obd->u.mgs;
153 struct vfsmount *mnt;
158 /* setup 1:/dev/loop/0 2:ext3 3:mgs 4:errors=remount-ro,iopen_nopriv*/
160 if (lcfg->lcfg_bufcount < 3)
161 RETURN(rc = -EINVAL);
/* Both the device name and the filesystem type must be non-empty. */
163 if (LUSTRE_CFG_BUFLEN(lcfg, 1) == 0 || LUSTRE_CFG_BUFLEN(lcfg, 2) == 0)
164 RETURN(rc = -EINVAL);
166 obd->obd_fsops = fsfilt_get_ops(lustre_cfg_string(lcfg, 2));
167 if (IS_ERR(obd->obd_fsops))
168 RETURN(rc = PTR_ERR(obd->obd_fsops));
/* One zeroed page serves as the mount-option buffer. */
170 page = __get_free_page(GFP_KERNEL);
174 options = (char *)page;
175 memset(options, 0, PAGE_SIZE);
/* Never write more than the one page that backs the buffer. */
177 if (LUSTRE_CFG_BUFLEN(lcfg, 4) > 0 && lustre_cfg_buf(lcfg, 4))
178 snprintf(options, PAGE_SIZE, ",%s", lustre_cfg_string(lcfg, 4));
180 //FIXME mount was already done in lustre_fill_super,
181 //we just need to access it
182 mnt = do_kern_mount(lustre_cfg_string(lcfg, 2), 0,
183 lustre_cfg_string(lcfg, 1), (void *)options);
187 CERROR("do_kern_mount failed: rc = %d\n", rc);
191 CDEBUG(D_SUPER, "%s: mnt = %p\n", lustre_cfg_string(lcfg, 1), mnt);
193 LASSERT(!lvfs_check_rdonly(lvfs_sbdev(mnt->mnt_sb)));
195 rc = mgs_fs_setup(obd, mnt);
197 CERROR("%s: MGS filesystem method init failed: rc = %d\n",
202 INIT_LIST_HEAD(&mgs->mgs_open_llogs);
203 INIT_LIST_HEAD(&mgs->mgs_update_llhs);
205 rc = llog_start_commit_thread();
209 //FIXME: no LDLM support for llog now
210 ptlrpc_init_client(LDLM_CB_REQUEST_PORTAL, LDLM_CB_REPLY_PORTAL,
211 "mgs_ldlm_client", &obd->obd_ldlm_client);
213 obd->obd_replayable = 1;
215 rc = mgs_postsetup(obd);
219 lprocfs_init_vars(mgs, &lvars);
220 lprocfs_obd_setup(obd, lvars.obd_vars);
/* Tell the operator whether the device came up in recovery. */
222 if (obd->obd_recovering) {
223 LCONSOLE_WARN("MGT %s now serving %s, but will be in recovery "
224 "until %d %s reconnect, or if no clients "
225 "reconnect for %d:%.02d; during that time new "
226 "clients will not be allowed to connect. "
227 "Recovery progress can be monitored by watching "
228 "/proc/fs/lustre/mgs/%s/recovery_status.\n",
230 lustre_cfg_string(lcfg, 1),
231 obd->obd_recoverable_clients,
232 (obd->obd_recoverable_clients == 1)
233 ? "client" : "clients",
234 (int)(OBD_RECOVERY_TIMEOUT / HZ) / 60,
235 (int)(OBD_RECOVERY_TIMEOUT / HZ) % 60,
238 LCONSOLE_INFO("MGT %s now serving %s with recovery %s.\n",
240 lustre_cfg_string(lcfg, 1),
241 obd->obd_replayable ? "enabled" : "disabled");
243 //FIXME: no ldlm support now
245 ping_evictor_start();
250 /* No extra cleanup needed for llog_init_commit_thread() */
254 mntput(mgs->mgs_vfsmnt);
258 fsfilt_put_ops(obd->obd_fsops);
/*
 * Second-stage setup, called from mgs_setup(): registers the
 * LLOG_CONFIG_ORIG_CTXT llog context on the device through llog_setup().
 *
 * NOTE(review): the remaining arguments of the llog_setup() call and the
 * return statement are on lines missing from this listing.
 */
262 static int mgs_postsetup(struct obd_device *obd)
267 rc = llog_setup(obd, LLOG_CONFIG_ORIG_CTXT, obd, 0, NULL,
/*
 * Pre-cleanup hook (.o_precleanup in mgs_obd_ops): cleans up the
 * LLOG_CONFIG_ORIG_CTXT llog context and then calls obd_llog_finish(),
 * whose result is stored in rc. "stage" is not used by the visible lines.
 */
272 static int mgs_precleanup(struct obd_device *obd, int stage)
277 llog_cleanup(llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT));
278 rc = obd_llog_finish(obd, 0);
/*
 * Cleanup hook for the MGS obd (.o_cleanup in mgs_obd_ops).
 *
 * Visible sequence: remember the backing device of the MGS superblock,
 * remove the lprocfs entries, call mgs_update_server_data(), warn if the
 * mount still has more than two references, drop the mount reference,
 * cancel any recovery still in progress (under obd_processing_task_lock),
 * clear the read-only flag on the saved device, release the fsfilt
 * operations and announce the stop on the console.
 *
 * The final console message names the device "MGT", matching the
 * "MGT %s now serving" messages printed by mgs_setup().
 *
 * NOTE(review): short lines are missing from this listing (braces, the
 * action taken when mgs_sb is NULL, the body of the kernel_locked()
 * branch, the return), so those parts are not described here.
 */
282 static int mgs_cleanup(struct obd_device *obd)
284 struct mgs_obd *mgs = &obd->u.mgs;
285 lvfs_sbdev_type save_dev;
291 if (mgs->mgs_sb == NULL)
/* Saved now because it is still needed after the mount has been dropped. */
293 save_dev = lvfs_sbdev(mgs->mgs_sb);
295 lprocfs_obd_cleanup(obd);
297 mgs_update_server_data(obd, 1);
/* More than two references means something else still uses the mount. */
301 if (atomic_read(&obd->u.mgs.mgs_vfsmnt->mnt_count) > 2)
302 CERROR("%s: mount busy, mnt_count %d != 2\n", obd->obd_name,
303 atomic_read(&obd->u.mgs.mgs_vfsmnt->mnt_count));
305 /* We can only unlock kernel if we are in the context of sys_ioctl,
306 otherwise we never called lock_kernel */
307 if (kernel_locked()) {
312 mntput(mgs->mgs_vfsmnt);
315 spin_lock_bh(&obd->obd_processing_task_lock);
316 if (obd->obd_recovering) {
317 target_cancel_recovery_timer(obd);
318 obd->obd_recovering = 0;
320 spin_unlock_bh(&obd->obd_processing_task_lock);
322 lvfs_clear_rdonly(save_dev);
327 fsfilt_put_ops(obd->obd_fsops);
329 LCONSOLE_INFO("MGT %s has stopped.\n", obd->obd_name);
334 /* Look up an entry by inode number. */
/*
 * mgs_fid2dentry() resolves a fid against the MGS list of open llogs
 * (mgs->mgs_open_llogs) rather than against the filesystem: the first
 * entry whose mol_id equals fid->id supplies the dentry.
 *
 * mgs: MGS state holding the open-llog list, superblock and mount
 * fid: id is used as the inode number; a non-zero generation must match
 *      the inode's i_generation
 * mnt: receives mgs->mgs_vfsmnt
 *
 * The visible error returns are ERR_PTR(-ESTALE) and ERR_PTR(-ENOENT);
 * a generation mismatch is logged with CERROR before -ENOENT is returned.
 *
 * NOTE(review): the list is walked without any lock being taken in the
 * visible lines, whereas mgs_open_llog() adds entries under
 * mgs_llogs_lock - confirm this is safe.
 * NOTE(review): mgs_lvfs_fid2dentry() passes mnt == NULL; any guard around
 * the "*mnt =" store is on a line missing from this listing - confirm.
 * NOTE(review): the listing has lost short lines, including the closing
 * line of the comment that follows and the conditions guarding the
 * returns.
 */
335 /* this function ONLY returns valid dget'd dentries with an initialized inode
337 struct dentry *mgs_fid2dentry(struct mgs_obd *mgs, struct ll_fid *fid,
338 struct vfsmount **mnt)
340 unsigned long ino = fid->id;
341 __u32 generation = fid->generation;
342 struct mgs_open_llog *mollog, *n;
343 struct list_head *llog_list = &mgs->mgs_open_llogs;
345 struct dentry *result = NULL;
348 RETURN(ERR_PTR(-ESTALE));
351 CDEBUG(D_DENTRY, "--> mgs_fid2dentry: ino/gen %lu/%u, sb %p\n",
352 ino, generation, mgs->mgs_sb);
354 list_for_each_entry_safe(mollog, n, llog_list, mol_list) {
355 if (mollog->mol_id == ino) {
356 result = mollog->mol_dentry;
364 inode = result->d_inode;
366 RETURN(ERR_PTR(-ENOENT));
368 if (generation && inode->i_generation != generation) {
369 /* we didn't find the right inode.. */
370 CERROR("bad inode %lu, link: %lu ct: %d or generation %u/%u\n",
371 inode->i_ino, (unsigned long)inode->i_nlink,
372 atomic_read(&inode->i_count), inode->i_generation,
375 RETURN(ERR_PTR(-ENOENT));
379 *mnt = mgs->mgs_vfsmnt;
/*
 * lvfs callback (l_fid2dentry in mgs_lvfs_ops): adapts the callback
 * arguments to mgs_fid2dentry(). "data" is the obd_device; the generation
 * is copied into a local ll_fid and the lookup is made against that
 * device's MGS state with a NULL mount pointer.
 *
 * NOTE(review): the rest of the parameter list, the fid declaration and
 * the assignment of its id are on lines missing from this listing; "gr"
 * is not used by the visible lines.
 */
386 static struct dentry *mgs_lvfs_fid2dentry(__u64 id, __u32 gen, __u64 gr,
389 struct obd_device *obd = data;
392 fid.generation = gen;
393 return mgs_fid2dentry(&obd->u.mgs, &fid, NULL);
/*
 * Record that an llog is open on the MGS.
 *
 * id:     overwritten with the inode number of the llog's dentry (see below)
 * data:   the obd_device of the MGS
 * handle: a struct mgs_update_llh whose embedded llog handle names the file
 *
 * The open-llog list is searched for an entry whose mol_id matches. A match
 * is updated under that entry's mol_lock. Otherwise a new entry is
 * allocated, initialised and added to mgs->mgs_open_llogs under
 * mgs_llogs_lock.
 *
 * The original declared a local "__u64 id" although "id" is already a
 * parameter. Redeclaring a parameter in the function's outermost block
 * does not compile, so the local is now a plain assignment placed after
 * the declarations. The inode number still takes precedence, as the
 * shadowing local intended.
 *
 * NOTE(review): the lookup loop runs without mgs_llogs_lock although
 * insertion takes it - confirm this is safe.
 * NOTE(review): this listing has lost its short lines (braces, returns and
 * some assignments), so only the visible statements are described.
 */
396 static int mgs_open_llog(__u64 id, void *data, void *handle)
398 struct obd_device *obd = data;
399 struct mgs_update_llh *mul = handle;
400 struct llog_handle *lgh = &mul->mul_lgh;
401 struct dentry *dentry = lgh->lgh_file->f_dentry;
403 struct mgs_obd *mgs = &obd->u.mgs;
404 struct mgs_open_llog *mollog, *n;
405 struct list_head *llog_list = &mgs->mgs_open_llogs;
/* Key the lookup on the inode number of the open llog file. */
402 id = dentry->d_inode->i_ino;
407 list_for_each_entry_safe(mollog, n, llog_list, mol_list) {
408 if (mollog->mol_id == id) {
409 spin_lock(&mollog->mol_lock);
411 spin_unlock(&mollog->mol_lock);
417 /* add a new open llog to mgs_open_llogs */
418 OBD_ALLOC(mollog, sizeof(*mollog));
420 CERROR("No memory for mollog.\n");
424 mollog->mol_dentry = dentry;
425 mollog->mol_update = 0;
427 spin_lock_init(&mollog->mol_lock);
/* Publish the new entry; the list is modified only under mgs_llogs_lock. */
429 spin_lock(&mgs->mgs_llogs_lock);
430 list_add(&mollog->mol_list, &mgs->mgs_open_llogs);
431 spin_unlock(&mgs->mgs_llogs_lock);
/*
 * Request handler for the MGS service. It is registered with
 * ptlrpc_init_svc() in mgt_setup() and is also handed to
 * target_handle_connect() when a connect request is processed.
 *
 * For every opcode other than MGS_CONNECT:
 *   - the request must carry an export, otherwise it fails with -ENOTCONN;
 *   - a request whose xid equals the client's recorded last xid is asserted
 *     to carry MSG_RESENT or MSG_REPLAY;
 *   - a pending recovery abort is acted on.
 * The request is then dispatched on its opcode. The visible cases cover
 * connect, disconnect, ping, log cancel (answered with -ENOTSUPP) and the
 * llog origin-handle/catinfo operations. The last visible case sets
 * -ENOTSUPP and replies through ptlrpc_error().
 *
 * After dispatch, for a successful request other than MGS_DISCONNECT, the
 * client's last xid is copied into the reply and target_committed_to_req()
 * is called. A request flagged MSG_LAST_REPLAY is queued through
 * target_queue_final_reply() while the device is still recovering, and is
 * otherwise failed with -ENOTCONN. All remaining replies are sent with
 * target_send_reply().
 *
 * NOTE(review): mcd_last_xid is compared raw against rq_xid in the sanity
 * check but is passed through le64_to_cpu() when copied into the reply;
 * one of the two looks wrong on big-endian hosts - confirm the byte order
 * in which the field is stored.
 * NOTE(review): this listing has lost its short lines (several "case"
 * labels, every "break", braces, the "out" label, the final return), so
 * the exact control flow between the visible statements is not shown.
 */
437 int mgs_handle(struct ptlrpc_request *req)
439 int fail = OBD_FAIL_MGS_ALL_REPLY_NET;
441 struct mgs_obd *mgs = NULL; /* quell gcc overwarning */
442 struct obd_device *obd = NULL;
445 OBD_FAIL_RETURN(OBD_FAIL_MGS_ALL_REQUEST_NET | OBD_FAIL_ONCE, 0);
447 LASSERT(current->journal_info == NULL);
448 /* XXX identical to MDS */
449 if (req->rq_reqmsg->opc != MGS_CONNECT) {
450 struct mgs_export_data *med;
453 if (req->rq_export == NULL) {
454 CERROR("lustre_mgs: operation %d on unconnected MGS\n",
455 req->rq_reqmsg->opc);
456 req->rq_status = -ENOTCONN;
457 GOTO(out, rc = -ENOTCONN);
460 med = &req->rq_export->exp_mgs_data;
461 obd = req->rq_export->exp_obd;
464 /* sanity check: if the xid matches, the request must
465 * be marked as a resent or replayed */
466 if (req->rq_xid == med->med_mcd->mcd_last_xid)
467 LASSERTF(lustre_msg_get_flags(req->rq_reqmsg) &
468 (MSG_RESENT | MSG_REPLAY),
469 "rq_xid "LPU64" matches last_xid, "
470 "expected RESENT flag\n",
472 /* else: note the opposite is not always true; a
473 * RESENT req after a failover will usually not match
474 * the last_xid, since it was likely never
475 * committed. A REPLAYed request will almost never
476 * match the last xid, however it could for a
477 * committed, but still retained, open. */
479 /* Check for aborted recovery. */
480 spin_lock_bh(&obd->obd_processing_task_lock);
481 abort_recovery = obd->obd_abort_recovery;
482 spin_unlock_bh(&obd->obd_processing_task_lock);
483 if (abort_recovery) {
484 target_abort_recovery(obd);
/* Dispatch on the request opcode. */
488 switch (req->rq_reqmsg->opc) {
490 DEBUG_REQ(D_INODE, req, "connect");
491 OBD_FAIL_RETURN(OBD_FAIL_MGS_CONNECT_NET, 0);
492 rc = target_handle_connect(req, mgs_handle);
494 /* Now that we have an export, set mgs. */
495 obd = req->rq_export->exp_obd;
496 mgs = mgs_req2mgs(req);
501 DEBUG_REQ(D_INODE, req, "disconnect");
502 OBD_FAIL_RETURN(OBD_FAIL_MGS_DISCONNECT_NET, 0);
503 rc = target_handle_disconnect(req);
504 req->rq_status = rc; /* superfluous? */
508 DEBUG_REQ(D_INODE, req, "ping");
509 rc = target_handle_ping(req);
513 CDEBUG(D_INODE, "log cancel\n");
514 OBD_FAIL_RETURN(OBD_FAIL_OBD_LOG_CANCEL_NET, 0);
515 rc = -ENOTSUPP; /* la la la */
518 case LLOG_ORIGIN_HANDLE_CREATE:
519 DEBUG_REQ(D_INODE, req, "llog_init");
520 OBD_FAIL_RETURN(OBD_FAIL_OBD_LOGD_NET, 0);
521 rc = llog_origin_handle_create(req);
523 case LLOG_ORIGIN_HANDLE_NEXT_BLOCK:
524 DEBUG_REQ(D_INODE, req, "llog next block");
525 OBD_FAIL_RETURN(OBD_FAIL_OBD_LOGD_NET, 0);
526 rc = llog_origin_handle_next_block(req);
528 case LLOG_ORIGIN_HANDLE_READ_HEADER:
529 DEBUG_REQ(D_INODE, req, "llog read header");
530 OBD_FAIL_RETURN(OBD_FAIL_OBD_LOGD_NET, 0);
531 rc = llog_origin_handle_read_header(req);
533 case LLOG_ORIGIN_HANDLE_CLOSE:
534 DEBUG_REQ(D_INODE, req, "llog close");
535 OBD_FAIL_RETURN(OBD_FAIL_OBD_LOGD_NET, 0);
536 rc = llog_origin_handle_close(req);
539 DEBUG_REQ(D_INODE, req, "llog catinfo");
540 OBD_FAIL_RETURN(OBD_FAIL_OBD_LOGD_NET, 0);
541 rc = llog_catinfo(req);
544 req->rq_status = -ENOTSUPP;
545 rc = ptlrpc_error(req);
549 LASSERT(current->journal_info == NULL);
551 /* If we're DISCONNECTing, the mgs_export_data is already freed */
552 if (!rc && req->rq_reqmsg->opc != MGS_DISCONNECT) {
553 struct mgs_export_data *med = &req->rq_export->exp_mgs_data;
554 req->rq_repmsg->last_xid =
555 le64_to_cpu(med->med_mcd->mcd_last_xid);
557 target_committed_to_req(req);
/* The final replay of recovery gets its reply queued instead of sent. */
563 if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_LAST_REPLAY) {
564 if (obd && obd->obd_recovering) {
565 DEBUG_REQ(D_HA, req, "LAST_REPLAY, queuing reply");
566 return target_queue_final_reply(req, rc);
568 /* Lost a race with recovery; let the error path DTRT. */
569 rc = req->rq_status = -ENOTCONN;
572 target_send_reply(req, rc, fail);
/*
 * Setup hook for the MGT obd (.o_setup in mgt_obd_ops): registers the
 * lprocfs entries, creates the ptlrpc service named "mgs" with
 * mgs_handle() as its request handler, and starts MGT_NUM_THREADS service
 * threads.
 *
 * If mgs->mgs_service is NULL after service creation, the error is logged
 * and the function unwinds through err_lprocfs with -ENOMEM. A failure
 * reported for thread start-up unwinds through err_thread. The two
 * trailing calls undo the setup in reverse order: unregister the service,
 * then remove the lprocfs entries.
 *
 * NOTE(review): the assignment of the ptlrpc_init_svc() result, the labels
 * and the return are on lines missing from this listing. "len" and "buf"
 * are not used by the visible lines.
 */
576 static int mgt_setup(struct obd_device *obd, obd_count len, void *buf)
578 struct mgs_obd *mgs = &obd->u.mgs;
579 struct lprocfs_static_vars lvars;
583 lprocfs_init_vars(mgt, &lvars);
584 lprocfs_obd_setup(obd, lvars.obd_vars);
587 ptlrpc_init_svc(MGS_NBUFS, MGS_BUFSIZE, MGS_MAXREQSIZE,
588 MGS_REQUEST_PORTAL, MGC_REPLY_PORTAL,
589 MGS_SERVICE_WATCHDOG_TIMEOUT,
590 mgs_handle, "mgs", obd->obd_proc_entry, NULL);
592 if (!mgs->mgs_service) {
593 CERROR("failed to start service\n");
594 GOTO(err_lprocfs, rc = -ENOMEM);
597 rc = ptlrpc_start_n_threads(obd, mgs->mgs_service, MGT_NUM_THREADS,
600 GOTO(err_thread, rc);
605 ptlrpc_unregister_service(mgs->mgs_service);
607 lprocfs_obd_cleanup(obd);
/*
 * Cleanup hook for the MGT obd (.o_cleanup in mgt_obd_ops): unregisters
 * the ptlrpc service created by mgt_setup() and removes the lprocfs
 * entries, in that order.
 */
611 static int mgt_cleanup(struct obd_device *obd)
613 struct mgs_obd *mgs = &obd->u.mgs;
616 ptlrpc_unregister_service(mgs->mgs_service);
618 lprocfs_obd_cleanup(obd);
623 struct lvfs_callback_ops mgs_lvfs_ops = {
624 l_fid2dentry: mgs_lvfs_fid2dentry,
625 l_open_llog: mgs_lvfs_open_llog,
628 /* use obd ops to offer management infrastructure */
/*
 * Method table for the MGS obd type, registered under LUSTRE_MGS_NAME in
 * mgs_init(). Apart from mgs_iocontrol, which is not defined in the
 * visible part of this file, every handler is defined above.
 */
629 static struct obd_ops mgs_obd_ops = {
630 .o_owner = THIS_MODULE,
631 .o_connect = mgs_connect,
632 .o_init_export = mgs_init_export,
633 .o_disconnect = mgs_disconnect,
634 .o_setup = mgs_setup,
635 .o_precleanup = mgs_precleanup,
636 .o_cleanup = mgs_cleanup,
637 .o_iocontrol = mgs_iocontrol,
/*
 * Method table for the MGT obd type, registered under LUSTRE_MGT_NAME in
 * mgs_init(). It provides only setup and cleanup, which start and stop the
 * ptlrpc service.
 */
640 static struct obd_ops mgt_obd_ops = {
641 .o_owner = THIS_MODULE,
642 .o_setup = mgt_setup,
643 .o_cleanup = mgt_cleanup,
646 static int __init mgs_init(void)
648 struct lprocfs_static_vars lvars;
650 lprocfs_init_vars(mgs, &lvars);
651 class_register_type(&mgs_obd_ops, lvars.module_vars, LUSTRE_MGS_NAME);
652 lprocfs_init_vars(mgt, &lvars);
653 class_register_type(&mgt_obd_ops, lvars.module_vars, LUSTRE_MGT_NAME);
/*
 * Module exit point: unregisters the two obd types registered by
 * mgs_init(), in the order MGS then MGT.
 */
658 static void /*__exit*/ mgs_exit(void)
660 class_unregister_type(LUSTRE_MGS_NAME);
661 class_unregister_type(LUSTRE_MGT_NAME);
/* Module metadata, followed by the hooks that bind mgs_init()/mgs_exit(). */
664 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
665 MODULE_DESCRIPTION("Lustre Management Server (MGS)");
666 MODULE_LICENSE("GPL");
668 module_init(mgs_init);
669 module_exit(mgs_exit);