1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * lustre/mgs/mgs_handler.c
5 * Lustre Management Server (mgs) request handler
7 * Copyright (C) 2006 Cluster File Systems, Inc.
8 * Author: Nathan Rutman <nathan@clusterfs.com>
10 * This file is part of Lustre, http://www.lustre.org.
12 * Lustre is free software; you can redistribute it and/or
13 * modify it under the terms of version 2 of the GNU General Public
14 * License as published by the Free Software Foundation.
16 * Lustre is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
21 * You should have received a copy of the GNU General Public License
22 * along with Lustre; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27 # define EXPORT_SYMTAB
29 #define DEBUG_SUBSYSTEM S_MGS
30 #define D_MGS D_CONFIG/*|D_WARNING*/
33 # include <linux/module.h>
34 # include <linux/pagemap.h>
35 # include <linux/miscdevice.h>
36 # include <linux/init.h>
38 # include <liblustre.h>
41 #include <obd_class.h>
42 #include <lustre_dlm.h>
43 #include <lprocfs_status.h>
44 #include <lustre_fsfilt.h>
45 #include <lustre_commit_confd.h>
46 #include <lustre_disk.h>
47 #include "mgs_internal.h"
50 /* Establish a connection to the MGS.
 *
 * Tests conn, obd and cluuid for NULL, registers the connection with
 * class_connect() and looks the resulting export up through
 * class_conn2export().  The connect flags carried in data are masked down
 * to MGS_CONNECT_SUPPORTED and mirrored into the export, and the MGS
 * version code is written back into data for the caller.
 *
 * env is not referenced by any statement shown in this function.
 */
51 static int mgs_connect(const struct lu_env *env,
52 struct lustre_handle *conn, struct obd_device *obd,
53 struct obd_uuid *cluuid, struct obd_connect_data *data)
55 struct obd_export *exp;
59 if (!conn || !obd || !cluuid)
62 rc = class_connect(conn, obd, cluuid);
65 exp = class_conn2export(conn);
/* Negotiate features: keep only the flags this MGS supports and record
 * the agreed set on the export.
 * NOTE(review): data is dereferenced here - confirm it is guarded
 * against NULL on this path. */
69 data->ocd_connect_flags &= MGS_CONNECT_SUPPORTED;
70 exp->exp_connect_flags = data->ocd_connect_flags;
71 data->ocd_version = LUSTRE_VERSION_CODE;
/* NOTE(review): presumably class_disconnect() is the failure branch and
 * class_export_put() the success branch (dropping the reference taken
 * by class_conn2export()) - confirm against the guarding condition. */
75 class_disconnect(exp);
77 class_export_put(exp);
/* Tear down a client export.
 *
 * Holds an extra export reference for the duration of the teardown,
 * disconnects the export, cancels the LDLM locks it holds, and then
 * drains exp_outstanding_replies: each reply state is unlinked from the
 * export list and handed to ptlrpc_schedule_difficult_reply() while the
 * owning service lock is held.
 *
 * rc receives the result of class_disconnect().
 * NOTE(review): presumably that value is what the function returns -
 * confirm.
 */
83 static int mgs_disconnect(struct obd_export *exp)
/* Pin the export so it stays valid until the final put below. */
89 class_export_get(exp);
91 /* Disconnect early so that clients can't keep using export */
92 rc = class_disconnect(exp);
93 ldlm_cancel_locks_for_export(exp);
95 /* complete all outstanding replies */
96 spin_lock(&exp->exp_lock);
97 while (!list_empty(&exp->exp_outstanding_replies)) {
/* Always take the head entry; list_del_init() below removes it, so the
 * loop ends once the list has been emptied. */
98 struct ptlrpc_reply_state *rs =
99 list_entry(exp->exp_outstanding_replies.next,
100 struct ptlrpc_reply_state, rs_exp_list);
101 struct ptlrpc_service *svc = rs->rs_service;
/* Lock order here is exp_lock first, then the service srv_lock. */
103 spin_lock(&svc->srv_lock);
104 list_del_init(&rs->rs_exp_list);
105 ptlrpc_schedule_difficult_reply(rs);
106 spin_unlock(&svc->srv_lock);
108 spin_unlock(&exp->exp_lock);
/* Drop the reference taken at the top of the function. */
110 class_export_put(exp);
/* Forward declarations.  mgs_handle() is passed to ptlrpc_init_svc() in
 * mgs_setup() ahead of its definition further down in this file. */
114 static int mgs_cleanup(struct obd_device *obd);
115 static int mgs_handle(struct ptlrpc_request *req);
117 /* Start the MGS obd.
 *
 * Bring-up sequence as shown below:
 *   1. look up the server mount registered under obd->obd_name;
 *   2. obtain the fsfilt operations for the mounted filesystem type;
 *   3. create the MGS server LDLM namespace and the callback client;
 *   4. set up the on-disk MGS filesystem state (mgs_fs_setup());
 *   5. start the llog commit thread and the config llog context;
 *   6. initialise the fsdb list and mgs_sem;
 *   7. create the ptlrpc service with mgs_handle() as request handler
 *      and start its threads;
 *   8. register procfs entries and start the ping evictor.
 *
 * The tail of the function is the unwind path: it unregisters the
 * service, frees the namespace, releases the fsfilt ops and drops the
 * server mount, in that order.
 *
 * lcfg is not referenced by any statement shown in this function.
 */
118 static int mgs_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
120 struct lprocfs_static_vars lvars;
121 struct mgs_obd *mgs = &obd->u.mgs;
122 struct lustre_mount_info *lmi;
123 struct lustre_sb_info *lsi;
124 struct vfsmount *mnt;
128 CDEBUG(D_CONFIG, "Starting MGS\n");
131 lmi = server_get_mount(obd->obd_name);
133 RETURN(rc = -EINVAL);
/* Select the filesystem-specific operations from the mount type stored
 * in the superblock info. */
136 lsi = s2lsi(lmi->lmi_sb);
137 obd->obd_fsops = fsfilt_get_ops(MT_STR(lsi->lsi_ldd));
138 if (IS_ERR(obd->obd_fsops))
139 GOTO(err_put, rc = PTR_ERR(obd->obd_fsops));
141 /* namespace for mgs llog */
142 obd->obd_namespace = ldlm_namespace_new("MGS", LDLM_NAMESPACE_SERVER,
143 LDLM_NAMESPACE_MODEST);
144 if (obd->obd_namespace == NULL)
145 GOTO(err_ops, rc = -ENOMEM);
148 ptlrpc_init_client(LDLM_CB_REQUEST_PORTAL, LDLM_CB_REPLY_PORTAL,
149 "mgs_ldlm_client", &obd->obd_ldlm_client);
/* The backing device must be writable.
 * NOTE(review): mnt is presumably taken from lmi earlier in the
 * function - confirm it is assigned before this use. */
151 LASSERT(!lvfs_check_rdonly(lvfs_sbdev(mnt->mnt_sb)));
153 rc = mgs_fs_setup(obd, mnt);
155 CERROR("%s: MGS filesystem method init failed: rc = %d\n",
160 rc = llog_start_commit_thread();
164 rc = llog_setup(obd, NULL, LLOG_CONFIG_ORIG_CTXT, obd, 0, NULL,
169 /* No recovery for MGCs */
170 obd->obd_replayable = 0;
172 /* Internal mgs setup */
173 mgs_init_fsdb_list(obd);
174 sema_init(&mgs->mgs_sem, 1);
176 /* Start the service threads */
178 ptlrpc_init_svc(MGS_NBUFS, MGS_BUFSIZE, MGS_MAXREQSIZE,
179 MGS_MAXREPSIZE, MGS_REQUEST_PORTAL,
180 MGC_REPLY_PORTAL, MGS_SERVICE_WATCHDOG_TIMEOUT,
181 mgs_handle, LUSTRE_MGS_NAME,
182 obd->obd_proc_entry, NULL,
183 MGS_THREADS_AUTO_MIN, MGS_THREADS_AUTO_MAX,
184 "ll_mgs", LCT_MD_THREAD);
186 if (!mgs->mgs_service) {
187 CERROR("failed to start service\n");
188 GOTO(err_fs, rc = -ENOMEM);
191 rc = ptlrpc_start_threads(obd, mgs->mgs_service);
193 GOTO(err_thread, rc);
/* procfs registration is best effort: the MGS-specific entries are
 * added only when the generic obd setup succeeds, and no error path is
 * shown for a failure here. */
196 lprocfs_mgs_init_vars(&lvars);
197 if (lprocfs_obd_setup(obd, lvars.obd_vars) == 0) {
198 lproc_mgs_setup(obd);
201 ping_evictor_start();
203 LCONSOLE_INFO("MGS %s started\n", obd->obd_name);
/* Error unwind: release resources in reverse order of acquisition. */
208 ptlrpc_unregister_service(mgs->mgs_service);
210 /* No extra cleanup needed for llog_start_commit_thread() */
213 ldlm_namespace_free(obd->obd_namespace, 0);
214 obd->obd_namespace = NULL;
216 fsfilt_put_ops(obd->obd_fsops);
218 server_put_mount(obd->obd_name, mnt);
/* Staged pre-cleanup hook, dispatched on the cleanup stage.
 *
 * The only work shown is for OBD_CLEANUP_SELF_EXP: the config llog
 * context is cleaned up and obd_llog_finish() is called, with its result
 * stored in rc.  No statements are shown for the other stages.
 */
223 static int mgs_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
229 case OBD_CLEANUP_EARLY:
230 case OBD_CLEANUP_EXPORTS:
232 case OBD_CLEANUP_SELF_EXP:
/* Tear down the config llog context created by llog_setup() in
 * mgs_setup(). */
233 llog_cleanup(llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT));
234 rc = obd_llog_finish(obd, 0);
236 case OBD_CLEANUP_OBD:
/* Thread entry point that frees an LDLM namespace.
 *
 * data is the struct ldlm_namespace to free.  mgs_cleanup() launches
 * this through cfs_kernel_thread() so that the free runs outside the
 * context of the caller.  The thread daemonizes under the name
 * ll_mgs_nsfree and frees the namespace with the force argument set.
 */
242 static int mgs_ldlm_nsfree(void *data)
244 struct ldlm_namespace *ns = (struct ldlm_namespace *)data;
248 ptlrpc_daemonize("ll_mgs_nsfree");
249 rc = ldlm_namespace_free(ns, 1 /* obd_force should always be on */);
/* Stop the MGS obd and release what mgs_setup() acquired.
 *
 * Sequence shown below: test mgs_sb for NULL, unregister the ptlrpc
 * service, free the fsdb list, remove the procfs entries, drop the
 * server mount, hand the LDLM namespace to a separate kernel thread
 * (mgs_ldlm_nsfree()) to be freed, release the fsfilt ops and log that
 * the device has stopped.
 *
 * NOTE(review): mgs_sb == NULL presumably means setup never completed
 * and leads to an early return - confirm.
 */
253 static int mgs_cleanup(struct obd_device *obd)
255 struct mgs_obd *mgs = &obd->u.mgs;
258 if (mgs->mgs_sb == NULL)
263 ptlrpc_unregister_service(mgs->mgs_service);
265 mgs_cleanup_fsdb_list(obd);
267 lprocfs_obd_cleanup(obd);
268 mgs->mgs_proc_live = NULL;
272 server_put_mount(obd->obd_name, mgs->mgs_vfsmnt);
275 /* Free the namespace in it's own thread, so that if the
276 ldlm_cancel_handler put the last mgs obd ref, we won't
278 cfs_kernel_thread(mgs_ldlm_nsfree, obd->obd_namespace,
279 CLONE_VM | CLONE_FILES);
282 fsfilt_put_ops(obd->obd_fsops);
284 LCONSOLE_INFO("%s has stopped.\n", obd->obd_name);
288 /* Take the per-filesystem configuration lock (similar to
 * filter_prepare_destroy).
 *
 * The filesystem name is converted to an LDLM resource id with
 * mgc_fsname2resid(), then an exclusive (LCK_EX) LDLM_PLAIN lock on that
 * resource is enqueued locally in the MGS namespace.  On success the
 * handle is stored in lockh; the caller releases it with
 * mgs_put_cfg_lock().  A failure is logged with CERROR.
 *
 * NOTE(review): flags is passed by address to the enqueue call -
 * confirm it is initialised beforehand.
 */
289 static int mgs_get_cfg_lock(struct obd_device *obd, char *fsname,
290 struct lustre_handle *lockh)
292 struct ldlm_res_id res_id;
296 rc = mgc_fsname2resid(fsname, &res_id);
298 rc = ldlm_cli_enqueue_local(obd->obd_namespace, &res_id,
299 LDLM_PLAIN, NULL, LCK_EX,
300 &flags, ldlm_blocking_ast,
301 ldlm_completion_ast, NULL,
302 fsname, 0, NULL, lockh);
304 CERROR("can't take cfg lock for %s (%d)\n", fsname, rc);
/* Release a configuration lock obtained with mgs_get_cfg_lock() by
 * dropping the LCK_EX reference on lockh. */
309 static int mgs_put_cfg_lock(struct lustre_handle *lockh)
312 ldlm_lock_decref(lockh, LCK_EX);
/* Sanity-check a target that says it has already registered.
 *
 * The result of mgs_check_index() selects one of the branches below:
 *   - the MGS has no record of the target: a console error is printed
 *     and LDD_F_WRITECONF is set in mti->mti_flags;
 *   - rc == -1 (the client log has gone missing): a console error is
 *     printed and LDD_F_WRITECONF is set as well;
 *   - the index is correctly marked as used: mgs_check_failnid() adds
 *     any target NIDs missing from the logs as failover NIDs.
 *
 * Side effect: may modify mti->mti_flags, which the caller inspects
 * afterwards.
 */
319 static int mgs_check_target(struct obd_device *obd, struct mgs_target_info *mti)
324 rc = mgs_check_index(obd, mti);
326 LCONSOLE_ERROR_MSG(0x13b, "%s claims to have registered, but "
327 "this MGS does not know about it. Assuming"
328 " writeconf.\n", mti->mti_svname);
329 mti->mti_flags |= LDD_F_WRITECONF;
331 } else if (rc == -1) {
332 LCONSOLE_ERROR_MSG(0x13c, "Client log %s-client has "
333 "disappeared! Regenerating all logs.\n",
335 mti->mti_flags |= LDD_F_WRITECONF;
338 /* Index is correctly marked as used */
340 /* If the logs don't contain the mti_nids then add
341 them as failover nids */
342 rc = mgs_check_failnid(obd, mti);
348 /* Called whenever a target starts up. Flags indicate first connect, etc.
 *
 * Flow, as shown below:
 *   1. Unpack the mgs_target_info from the request buffer.
 *   2. If none of the update-type flags tested below is set, the request
 *      is only a startup ping: mgs_check_target() is run (it may set
 *      flags) and the out_nolock path is taken when nothing further is
 *      needed or a fatal error occurred.
 *   3. Take the config lock for the filesystem.  Failure is reported on
 *      the console but the log update still proceeds.
 *   4. LDD_F_WRITECONF: erase all logs of the filesystem (MDT with
 *      stripe index 0) or only the log of this target (other OST or
 *      MDT), then force LDD_F_UPDATE and clear LDD_F_UPGRADE14.
 *   5. LDD_F_UPGRADE14: run mgs_upgrade_sv_14() and set LDD_F_UPDATE.
 *   6. LDD_F_UPDATE: write the target log, clear the one-shot flags and
 *      set LDD_F_REWRITE_LDD.
 *   7. Drop the config lock if it was obtained, pack the reply with a
 *      copy of the whole mti, and sync the MGS filesystem.
 *
 * NOTE(review): no NULL check on the result of lustre_swab_reqbuf() is
 * visible before mti is dereferenced, the return value of
 * lustre_pack_reply() is discarded, and rep_mti is passed to memcpy()
 * with no visible NULL check - confirm these are safe.
 */
349 static int mgs_handle_target_reg(struct ptlrpc_request *req)
351 struct obd_device *obd = req->rq_export->exp_obd;
352 struct lustre_handle lockh;
353 struct mgs_target_info *mti, *rep_mti;
354 int rep_size[] = { sizeof(struct ptlrpc_body), sizeof(*mti) };
358 mti = lustre_swab_reqbuf(req, REQ_REC_OFF, sizeof(*mti),
359 lustre_swab_mgs_target_info);
361 if (!(mti->mti_flags & (LDD_F_WRITECONF | LDD_F_UPGRADE14 |
363 /* We're just here as a startup ping. */
364 CDEBUG(D_MGS, "Server %s is running on %s\n",
365 mti->mti_svname, obd_export_nid2str(req->rq_export));
366 rc = mgs_check_target(obd, mti);
367 /* above will set appropriate mti flags */
369 /* Nothing wrong, or fatal error */
370 GOTO(out_nolock, rc);
373 /* Revoke the config lock to make sure nobody is reading. */
374 /* Although actually I think it should be alright if
375 someone was reading while we were updating the logs - if we
376 revoke at the end they will just update from where they left off. */
377 lockrc = mgs_get_cfg_lock(obd, mti->mti_fsname, &lockh);
378 if (lockrc != ELDLM_OK) {
379 LCONSOLE_ERROR_MSG(0x13d, "%s: Can't signal other nodes to "
380 "update their configuration (%d). Updating "
381 "local logs anyhow; you might have to "
382 "manually restart other nodes to get the "
383 "latest configuration.\n",
384 obd->obd_name, lockrc);
/* Fault-injection point used to simulate a slow target registration. */
387 OBD_FAIL_TIMEOUT(OBD_FAIL_MGS_SLOW_TARGET_REG, 10);
389 /* Log writing contention is handled by the fsdb_sem */
391 if (mti->mti_flags & LDD_F_WRITECONF) {
392 if (mti->mti_flags & LDD_F_SV_TYPE_MDT &&
393 mti->mti_stripe_index == 0) {
394 rc = mgs_erase_logs(obd, mti->mti_fsname);
395 LCONSOLE_WARN("%s: Logs for fs %s were removed by user "
396 "request. All servers must be restarted "
397 "in order to regenerate the logs."
398 "\n", obd->obd_name, mti->mti_fsname);
399 } else if (mti->mti_flags &
400 (LDD_F_SV_TYPE_OST | LDD_F_SV_TYPE_MDT)) {
401 rc = mgs_erase_log(obd, mti->mti_svname);
402 LCONSOLE_WARN("%s: Regenerating %s log by user "
404 obd->obd_name, mti->mti_svname);
406 mti->mti_flags |= LDD_F_UPDATE;
407 /* Erased logs means start from scratch. */
408 mti->mti_flags &= ~LDD_F_UPGRADE14;
412 if (mti->mti_flags & LDD_F_UPGRADE14) {
413 rc = mgs_upgrade_sv_14(obd, mti);
415 CERROR("Can't upgrade from 1.4 (%d)\n", rc);
419 /* We're good to go */
420 mti->mti_flags |= LDD_F_UPDATE;
424 if (mti->mti_flags & LDD_F_UPDATE) {
425 CDEBUG(D_MGS, "updating %s, index=%d\n", mti->mti_svname,
426 mti->mti_stripe_index);
428 /* create or update the target log
429 and update the client/mdt logs */
430 rc = mgs_write_log_target(obd, mti);
432 CERROR("Failed to write %s log (%d)\n",
433 mti->mti_svname, rc);
/* The update is done: clear the one-shot request flags and tell the
 * target to rewrite its on-disk data with the returned mti. */
437 mti->mti_flags &= ~(LDD_F_VIRGIN | LDD_F_UPDATE |
438 LDD_F_NEED_INDEX | LDD_F_WRITECONF |
440 mti->mti_flags |= LDD_F_REWRITE_LDD;
444 /* done with log update */
445 if (lockrc == ELDLM_OK)
446 mgs_put_cfg_lock(&lockh);
448 CDEBUG(D_MGS, "replying with %s, index=%d, rc=%d\n", mti->mti_svname,
449 mti->mti_stripe_index, rc);
450 lustre_pack_reply(req, 2, rep_size, NULL);
451 /* send back the whole mti in the reply */
452 rep_mti = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF,
454 memcpy(rep_mti, mti, sizeof(*rep_mti));
456 /* Flush logs to disk */
457 fsfilt_sync(obd, obd->u.mgs.mgs_sb);
/* Handle a set-parameter request that arrives over RPC.
 *
 * The mgs_send_param payload is taken from the request buffer (no
 * swabber is supplied), its parameter string is wrapped as buffer 1 of
 * an LCFG_PARAM lustre_cfg, and mgs_setparam() is called; that call is
 * also given fsname as a buffer for the filesystem name.  After a
 * successful update the config lock for that filesystem is taken and
 * released again so that other nodes re-read their configuration.  The
 * lustre_cfg is then freed and the received payload is copied into the
 * reply.
 *
 * NOTE(review): no NULL checks are visible on msp (from
 * lustre_swab_reqbuf()), lcfg (from lustre_cfg_new()) or rep_msp (from
 * lustre_msg_buf()), and the return value of lustre_pack_reply() is
 * discarded - confirm these are safe.
 */
461 static int mgs_set_info_rpc(struct ptlrpc_request *req)
463 struct obd_device *obd = req->rq_export->exp_obd;
464 struct mgs_send_param *msp, *rep_msp;
465 struct lustre_handle lockh;
466 int rep_size[] = { sizeof(struct ptlrpc_body), sizeof(*msp) };
468 struct lustre_cfg_bufs bufs;
469 struct lustre_cfg *lcfg;
470 char fsname[MTI_NAME_MAXLEN];
473 msp = lustre_swab_reqbuf(req, REQ_REC_OFF, sizeof(*msp), NULL);
475 /* Construct lustre_cfg structure to pass to function mgs_setparam */
476 lustre_cfg_bufs_reset(&bufs, NULL);
477 lustre_cfg_bufs_set_string(&bufs, 1, msp->mgs_param);
478 lcfg = lustre_cfg_new(LCFG_PARAM, &bufs);
479 rc = mgs_setparam(obd, lcfg, fsname);
481 CERROR("Error %d in setting the parameter %s for fs %s\n",
482 rc, msp->mgs_param, fsname);
486 /* Revoke lock so everyone updates. Should be alright if
487 * someone was already reading while we were updating the logs,
488 * so we don't really need to hold the lock while we're
492 lockrc = mgs_get_cfg_lock(obd, fsname, &lockh);
493 if (lockrc != ELDLM_OK)
494 CERROR("lock error %d for fs %s\n", lockrc,
497 mgs_put_cfg_lock(&lockh);
499 lustre_cfg_free(lcfg);
501 lustre_pack_reply(req, 2, rep_size, NULL);
502 rep_msp = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF,
504 memcpy(rep_msp, msp, sizeof(*rep_msp));
/* Top-level request handler of the MGS ptlrpc service.
 *
 * Reads the opcode from the request message.  Every opcode other than
 * MGS_CONNECT requires an export on the request; without one the request
 * status is set to -ENOTCONN and the out path is taken.  The request is
 * then dispatched by opcode to one of the handlers called below:
 *   connect, disconnect, target registration, target deletion (its
 *   handler call is commented out), set-info, LDLM enqueue, LDLM
 *   callbacks (logged as an error), ping, llog cancel (-ENOTSUPP) and
 *   the llog origin operations create, next block, read header, close
 *   and catinfo.
 * An unrecognised opcode sets -ENOTSUPP and replies through
 * ptlrpc_error().
 *
 * The reply is sent with target_send_reply(); fail starts out as
 * OBD_FAIL_MGS_ALL_REPLY_NET and is handed to that call.
 */
509 int mgs_handle(struct ptlrpc_request *req)
511 int fail = OBD_FAIL_MGS_ALL_REPLY_NET;
515 OBD_FAIL_TIMEOUT(OBD_FAIL_MGS_SLOW_REQUEST_NET, 2);
/* The service thread must not have a journal transaction open on
 * entry. */
517 LASSERT(current->journal_info == NULL);
518 opc = lustre_msg_get_opc(req->rq_reqmsg);
519 if (opc != MGS_CONNECT) {
520 if (req->rq_export == NULL) {
521 CERROR("lustre_mgs: operation %d on unconnected MGS\n",
523 req->rq_status = -ENOTCONN;
524 GOTO(out, rc = -ENOTCONN);
530 DEBUG_REQ(D_MGS, req, "connect");
531 rc = target_handle_connect(req);
532 if (!rc && (lustre_msg_get_conn_cnt(req->rq_reqmsg) > 1))
533 /* Make clients trying to reconnect after a MGS restart
534 happy; also requires obd_replayable */
535 lustre_msg_add_op_flags(req->rq_repmsg,
536 MSG_CONNECT_RECONNECT);
539 DEBUG_REQ(D_MGS, req, "disconnect");
540 rc = target_handle_disconnect(req);
541 req->rq_status = rc; /* superfluous? */
544 DEBUG_REQ(D_MGS, req, "target add");
545 rc = mgs_handle_target_reg(req);
548 DEBUG_REQ(D_MGS, req, "target del");
549 //rc = mgs_handle_target_del(req);
552 rc = mgs_set_info_rpc(req);
556 DEBUG_REQ(D_MGS, req, "enqueue");
557 rc = ldlm_handle_enqueue(req, ldlm_server_completion_ast,
558 ldlm_server_blocking_ast, NULL);
560 case LDLM_BL_CALLBACK:
561 case LDLM_CP_CALLBACK:
562 DEBUG_REQ(D_MGS, req, "callback");
563 CERROR("callbacks should not happen on MGS\n");
568 DEBUG_REQ(D_INFO, req, "ping");
569 rc = target_handle_ping(req);
572 DEBUG_REQ(D_MGS, req, "log cancel");
573 rc = -ENOTSUPP; /* la la la */
576 case LLOG_ORIGIN_HANDLE_CREATE:
577 DEBUG_REQ(D_MGS, req, "llog_init");
578 rc = llog_origin_handle_create(req);
580 case LLOG_ORIGIN_HANDLE_NEXT_BLOCK:
581 DEBUG_REQ(D_MGS, req, "llog next block");
582 rc = llog_origin_handle_next_block(req);
584 case LLOG_ORIGIN_HANDLE_READ_HEADER:
585 DEBUG_REQ(D_MGS, req, "llog read header");
586 rc = llog_origin_handle_read_header(req);
588 case LLOG_ORIGIN_HANDLE_CLOSE:
589 DEBUG_REQ(D_MGS, req, "llog close");
590 rc = llog_origin_handle_close(req);
593 DEBUG_REQ(D_MGS, req, "llog catinfo");
594 rc = llog_catinfo(req);
597 req->rq_status = -ENOTSUPP;
598 rc = ptlrpc_error(req);
/* No handler may leave a journal transaction open on this thread. */
602 LASSERT(current->journal_info == NULL);
605 CERROR("MGS handle cmd=%d rc=%d\n", opc, rc);
608 target_send_reply(req, rc, fail);
/* Export destructor hook (installed as o_destroy_export): delegates to
 * the generic target_destroy_export(). */
612 static inline int mgs_destroy_export(struct obd_export *exp)
616 target_destroy_export(exp);
621 /* ioctl entry point for the MGS (adapted from mdt_iocontrol).
 *
 * karg is used as a struct obd_ioctl_data.  Commands handled below:
 *
 *   OBD_IOC_PARAM
 *       Copies a lustre_cfg of data->ioc_plen1 bytes from the user
 *       buffer data->ioc_pbuf1, requires at least one cfg buffer, applies
 *       it with mgs_setparam(), then takes and releases the config lock
 *       of the affected filesystem so that other nodes refresh their
 *       configuration.  The copy is freed afterwards.
 *   OBD_IOC_DUMP_LOG
 *       Dumps the config llog named by data->ioc_inlbuf1 through
 *       class_config_dump_llog(), run inside the lvfs context of the
 *       obd.
 *   OBD_IOC_LLOG_CHECK, OBD_IOC_LLOG_INFO, OBD_IOC_LLOG_PRINT
 *       Forwarded to llog_ioctl() on the config llog context, run inside
 *       the lvfs context of the obd that owns the context export.
 *
 *   Any other command is logged as unknown.
 *
 * len and uarg are not referenced by any statement shown in this
 * function.
 *
 * NOTE(review): data->ioc_plen1 comes from the ioctl caller and is used
 * directly as the allocation and copy size - confirm it is validated
 * (lower and upper bound) before lcfg fields are read.
 * NOTE(review): copy_from_user() conventionally returns the number of
 * bytes left uncopied, not an errno - confirm the failure path turns a
 * non-zero rc into -EFAULT.
 */
622 int mgs_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
623 void *karg, void *uarg)
625 struct obd_device *obd = exp->exp_obd;
626 struct obd_ioctl_data *data = karg;
627 struct lvfs_run_ctxt saved;
631 CDEBUG(D_IOCTL, "handling ioctl cmd %#x\n", cmd);
635 case OBD_IOC_PARAM: {
636 struct lustre_handle lockh;
637 struct lustre_cfg *lcfg;
638 struct llog_rec_hdr rec;
639 char fsname[MTI_NAME_MAXLEN];
642 rec.lrh_len = llog_data_len(data->ioc_plen1);
644 if (data->ioc_type == LUSTRE_CFG_TYPE) {
645 rec.lrh_type = OBD_CFG_REC;
647 CERROR("unknown cfg record type:%d \n", data->ioc_type);
651 OBD_ALLOC(lcfg, data->ioc_plen1);
654 rc = copy_from_user(lcfg, data->ioc_pbuf1, data->ioc_plen1);
658 if (lcfg->lcfg_bufcount < 1)
659 GOTO(out_free, rc = -EINVAL);
661 rc = mgs_setparam(obd, lcfg, fsname);
663 CERROR("setparam err %d\n", rc);
667 /* Revoke lock so everyone updates. Should be alright if
668 someone was already reading while we were updating the logs,
669 so we don't really need to hold the lock while we're
672 lockrc = mgs_get_cfg_lock(obd, fsname, &lockh);
673 if (lockrc != ELDLM_OK)
674 CERROR("lock error %d for fs %s\n", lockrc,
677 mgs_put_cfg_lock(&lockh);
681 OBD_FREE(lcfg, data->ioc_plen1);
685 case OBD_IOC_DUMP_LOG: {
686 struct llog_ctxt *ctxt =
687 llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT);
688 push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
689 rc = class_config_dump_llog(ctxt, data->ioc_inlbuf1, NULL);
690 pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
697 case OBD_IOC_LLOG_CHECK:
698 case OBD_IOC_LLOG_INFO:
699 case OBD_IOC_LLOG_PRINT: {
700 struct llog_ctxt *ctxt =
701 llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT);
703 push_ctxt(&saved, &ctxt->loc_exp->exp_obd->obd_lvfs_ctxt, NULL);
704 rc = llog_ioctl(ctxt, cmd, data);
705 pop_ctxt(&saved, &ctxt->loc_exp->exp_obd->obd_lvfs_ctxt, NULL);
711 CDEBUG(D_INFO, "unknown command %x\n", cmd);
717 /* Use obd ops to offer management infrastructure.
 *
 * Method table for the MGS device type.  It wires the connect and
 * disconnect, setup, precleanup and cleanup, export destruction and
 * ioctl entry points defined in this file; mgs_init() registers it with
 * class_register_type(). */
718 static struct obd_ops mgs_obd_ops = {
719 .o_owner = THIS_MODULE,
720 .o_connect = mgs_connect,
721 .o_disconnect = mgs_disconnect,
722 .o_setup = mgs_setup,
723 .o_precleanup = mgs_precleanup,
724 .o_cleanup = mgs_cleanup,
725 .o_destroy_export = mgs_destroy_export,
726 .o_iocontrol = mgs_iocontrol,
/* Module init: fetch the MGS procfs variable tables and register the MGS
 * obd type (mgs_obd_ops, module-level proc vars) under LUSTRE_MGS_NAME.
 *
 * NOTE(review): the result of class_register_type() is not captured in
 * the statement below - confirm a registration failure is reported to
 * the module loader. */
729 static int __init mgs_init(void)
731 struct lprocfs_static_vars lvars;
733 lprocfs_mgs_init_vars(&lvars);
734 class_register_type(&mgs_obd_ops, NULL,
735 lvars.module_vars, LUSTRE_MGS_NAME, NULL);
/* Module exit: unregister the MGS obd type registered by mgs_init().
 * The __exit annotation is deliberately commented out in the
 * signature. */
740 static void /*__exit*/ mgs_exit(void)
742 class_unregister_type(LUSTRE_MGS_NAME);
/* Kernel module metadata, followed by the hooks that make mgs_init() and
 * mgs_exit() the load and unload entry points of the module. */
745 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
746 MODULE_DESCRIPTION("Lustre Management Server (MGS)");
747 MODULE_LICENSE("GPL");
749 module_init(mgs_init);
750 module_exit(mgs_exit);