1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * lustre/mgs/mgs_handler.c
5 * Lustre Management Server (mgs) request handler
7 * Copyright (C) 2006 Cluster File Systems, Inc.
8 * Author: Nathan Rutman <nathan@clusterfs.com>
10 * This file is part of Lustre, http://www.lustre.org.
12 * Lustre is free software; you can redistribute it and/or
13 * modify it under the terms of version 2 of the GNU General Public
14 * License as published by the Free Software Foundation.
16 * Lustre is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
21 * You should have received a copy of the GNU General Public License
22 * along with Lustre; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27 # define EXPORT_SYMTAB
29 #define DEBUG_SUBSYSTEM S_MGS
30 #define D_MGS D_CONFIG/*|D_WARNING*/
33 # include <linux/module.h>
34 # include <linux/pagemap.h>
35 # include <linux/miscdevice.h>
36 # include <linux/init.h>
38 # include <liblustre.h>
41 #include <obd_class.h>
42 #include <lustre_dlm.h>
43 #include <lprocfs_status.h>
44 #include <lustre_fsfilt.h>
45 #include <lustre_commit_confd.h>
46 #include <lustre_disk.h>
47 #include "mgs_internal.h"
50 /* Establish a connection to the MGS.
 *
 * Visible steps: check that conn, obd and cluuid are non-NULL, call
 * class_connect(), look up the export with class_conn2export(), set the
 * export's RPC flavor to SPTLRPC_FLVR_NULL, then fill in the connect data.
 * env is not referenced in the visible lines.
 *
 * NOTE(review): this listing omits lines (locals, error checks, braces and
 * the return), so branch conditions and the return value are not shown
 * here; confirm them against the full file.
 */
51 static int mgs_connect(const struct lu_env *env,
52 struct lustre_handle *conn, struct obd_device *obd,
53 struct obd_uuid *cluuid, struct obd_connect_data *data)
55 struct obd_export *exp;
/* All three pointers are required; the failure action is not visible. */
59 if (!conn || !obd || !cluuid)
62 rc = class_connect(conn, obd, cluuid);
65 exp = class_conn2export(conn);
68 exp->exp_flvr.sf_rpc = SPTLRPC_FLVR_NULL;
/* Keep only the connect flags present in MGS_CONNECT_SUPPORTED, record the
 * result on the export, and report LUSTRE_VERSION_CODE back through data.
 * NOTE(review): any NULL guard on data is not visible here. */
71 data->ocd_connect_flags &= MGS_CONNECT_SUPPORTED;
72 exp->exp_connect_flags = data->ocd_connect_flags;
73 data->ocd_version = LUSTRE_VERSION_CODE;
/* Presumably the error path: undo the connection. The guarding condition
 * is not visible -- TODO confirm. */
77 class_disconnect(exp);
/* Presumably drops the reference obtained through class_conn2export() --
 * TODO confirm. */
79 class_export_put(exp);
/* Tear down one export's connection to the MGS (wired to .o_disconnect in
 * mgs_obd_ops).
 *
 * An extra export reference is taken with class_export_get() and released
 * with class_export_put() at the end. In between, the export is
 * disconnected, its DLM locks are cancelled, and every reply still queued
 * on exp_outstanding_replies is unlinked and scheduled.
 *
 * NOTE(review): the lines that declare rc and return are omitted from this
 * listing; presumably the class_disconnect() result is returned -- confirm.
 */
85 static int mgs_disconnect(struct obd_export *exp)
91 class_export_get(exp);
93 /* Disconnect early so that clients can't keep using export */
94 rc = class_disconnect(exp);
95 ldlm_cancel_locks_for_export(exp);
97 /* complete all outstanding replies */
/* exp_lock is held for the whole walk; each reply state is removed from the
 * export's list and scheduled while holding its service's srv_lock. The
 * loop ends once list_del_init() has emptied the list. */
98 spin_lock(&exp->exp_lock);
99 while (!list_empty(&exp->exp_outstanding_replies)) {
100 struct ptlrpc_reply_state *rs =
101 list_entry(exp->exp_outstanding_replies.next,
102 struct ptlrpc_reply_state, rs_exp_list);
103 struct ptlrpc_service *svc = rs->rs_service;
105 spin_lock(&svc->srv_lock);
106 list_del_init(&rs->rs_exp_list);
107 ptlrpc_schedule_difficult_reply(rs);
108 spin_unlock(&svc->srv_lock);
110 spin_unlock(&exp->exp_lock);
112 class_export_put(exp);
/* Forward declarations. mgs_handle() is passed to ptlrpc_init_svc() in
 * mgs_setup() before its definition; mgs_cleanup() is also defined further
 * down in this file. */
116 static int mgs_cleanup(struct obd_device *obd);
117 static int mgs_handle(struct ptlrpc_request *req);
119 /* Start the MGS obd
 *
 * Setup order as visible below: get the server mount for obd_name, get
 * the fsfilt ops for its disk type, create the "MGS" LDLM namespace, init
 * the LDLM callback client, run mgs_fs_setup(), start the llog commit
 * thread, set up the LLOG_CONFIG_ORIG_CTXT llog context, init the fsdb
 * list and mgs_sem, create the ptlrpc service and start its threads,
 * register lprocfs entries, and start the ping evictor.
 *
 * Failures jump to err_put / err_ops / err_fs / err_thread; the calls at
 * the end of the function release resources in reverse order of setup.
 * lcfg is not referenced in the visible lines.
 *
 * NOTE(review): this listing omits lines (rc declaration, several error
 * checks, the unwind labels, braces and returns); confirm branch
 * conditions against the full file.
 */
120 static int mgs_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
122 struct lprocfs_static_vars lvars;
123 struct mgs_obd *mgs = &obd->u.mgs;
124 struct lustre_mount_info *lmi;
125 struct lustre_sb_info *lsi;
126 struct vfsmount *mnt;
130 CDEBUG(D_CONFIG, "Starting MGS\n");
/* Look up the mount by device name. The check guarding the -EINVAL return
 * below is not visible in this listing. */
133 lmi = server_get_mount(obd->obd_name);
135 RETURN(rc = -EINVAL);
/* NOTE(review): mnt is used below, but its assignment is not visible in
 * this listing -- confirm where it is set. */
138 lsi = s2lsi(lmi->lmi_sb);
139 obd->obd_fsops = fsfilt_get_ops(MT_STR(lsi->lsi_ldd));
140 if (IS_ERR(obd->obd_fsops))
141 GOTO(err_put, rc = PTR_ERR(obd->obd_fsops));
143 /* namespace for mgs llog */
144 obd->obd_namespace = ldlm_namespace_new("MGS", LDLM_NAMESPACE_SERVER,
145 LDLM_NAMESPACE_MODEST);
146 if (obd->obd_namespace == NULL)
147 GOTO(err_ops, rc = -ENOMEM);
/* Initialize obd_ldlm_client on the LDLM callback request/reply portals. */
150 ptlrpc_init_client(LDLM_CB_REQUEST_PORTAL, LDLM_CB_REPLY_PORTAL,
151 "mgs_ldlm_client", &obd->obd_ldlm_client);
/* Setup asserts that lvfs_check_rdonly() is false for the mount's device. */
153 LASSERT(!lvfs_check_rdonly(lvfs_sbdev(mnt->mnt_sb)));
155 rc = mgs_fs_setup(obd, mnt);
157 CERROR("%s: MGS filesystem method init failed: rc = %d\n",
162 rc = llog_start_commit_thread();
166 rc = llog_setup(obd, NULL, LLOG_CONFIG_ORIG_CTXT, obd, 0, NULL,
171 /* No recovery for MGC's */
172 obd->obd_replayable = 0;
174 /* Internal mgs setup */
175 mgs_init_fsdb_list(obd);
176 sema_init(&mgs->mgs_sem, 1);
178 /* Start the service threads */
180 ptlrpc_init_svc(MGS_NBUFS, MGS_BUFSIZE, MGS_MAXREQSIZE,
181 MGS_MAXREPSIZE, MGS_REQUEST_PORTAL,
182 MGC_REPLY_PORTAL, MGS_SERVICE_WATCHDOG_TIMEOUT,
183 mgs_handle, LUSTRE_MGS_NAME,
184 obd->obd_proc_entry, NULL,
185 MGS_THREADS_AUTO_MIN, MGS_THREADS_AUTO_MAX,
186 "ll_mgs", LCT_MD_THREAD);
188 if (!mgs->mgs_service) {
189 CERROR("failed to start service\n");
190 GOTO(err_fs, rc = -ENOMEM);
193 rc = ptlrpc_start_threads(obd, mgs->mgs_service);
195 GOTO(err_thread, rc);
/* lproc_mgs_setup() runs only when lprocfs_obd_setup() returned 0. */
198 lprocfs_mgs_init_vars(&lvars);
199 if (lprocfs_obd_setup(obd, lvars.obd_vars) == 0) {
200 lproc_mgs_setup(obd);
203 ping_evictor_start();
205 LCONSOLE_INFO("MGS %s started\n", obd->obd_name);
/* Error unwinding starts here: the calls below release, in reverse order,
 * what was acquired above. The labels targeted by the GOTO()s are not
 * visible in this listing. */
210 ptlrpc_unregister_service(mgs->mgs_service);
212 /* No extra cleanup needed for llog_init_commit_thread() */
215 ldlm_namespace_free(obd->obd_namespace, 0);
216 obd->obd_namespace = NULL;
218 fsfilt_put_ops(obd->obd_fsops);
220 server_put_mount(obd->obd_name, mnt);
/* Staged pre-cleanup hook (wired to .o_precleanup in mgs_obd_ops).
 *
 * The only work visible is under OBD_CLEANUP_SELF_EXP: the
 * LLOG_CONFIG_ORIG_CTXT llog context is cleaned up and obd_llog_finish()
 * is called, its result stored in rc.
 *
 * NOTE(review): the switch header, break statements and return are omitted
 * from this listing; presumably the other stages do nothing -- confirm.
 */
225 static int mgs_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
231 case OBD_CLEANUP_EARLY:
232 case OBD_CLEANUP_EXPORTS:
234 case OBD_CLEANUP_SELF_EXP:
235 llog_cleanup(llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT));
236 rc = obd_llog_finish(obd, 0);
238 case OBD_CLEANUP_OBD:
/* Thread body that frees an LDLM namespace; started from mgs_cleanup()
 * through cfs_kernel_thread().
 *
 * data is the struct ldlm_namespace pointer to free. The thread daemonizes
 * itself as "ll_mgs_nsfree" and then calls ldlm_namespace_free() with the
 * force argument set to 1.
 *
 * NOTE(review): the rc declaration and the return are not visible in this
 * listing.
 */
244 static int mgs_ldlm_nsfree(void *data)
246 struct ldlm_namespace *ns = (struct ldlm_namespace *)data;
250 ptlrpc_daemonize("ll_mgs_nsfree");
251 rc = ldlm_namespace_free(ns, 1 /* obd_force should always be on */);
/* Stop the MGS obd (wired to .o_cleanup in mgs_obd_ops).
 *
 * Visible order: check mgs_sb, unregister the ptlrpc service, clean up the
 * fsdb list, remove the lprocfs entries and clear mgs_proc_live, release
 * the server mount, hand the LDLM namespace to a separate kernel thread
 * running mgs_ldlm_nsfree(), release the fsfilt ops, and log that the
 * device has stopped.
 *
 * NOTE(review): this listing omits lines, including the end of the
 * comment that explains the separate thread; the action taken when mgs_sb
 * is NULL and the return value are not visible -- confirm against the
 * full file.
 */
255 static int mgs_cleanup(struct obd_device *obd)
257 struct mgs_obd *mgs = &obd->u.mgs;
/* The action taken when mgs_sb is NULL is not visible here. */
260 if (mgs->mgs_sb == NULL)
265 ptlrpc_unregister_service(mgs->mgs_service);
267 mgs_cleanup_fsdb_list(obd);
269 lprocfs_obd_cleanup(obd);
270 mgs->mgs_proc_live = NULL;
274 server_put_mount(obd->obd_name, mgs->mgs_vfsmnt);
277 /* Free the namespace in it's own thread, so that if the
278 ldlm_cancel_handler put the last mgs obd ref, we won't
280 cfs_kernel_thread(mgs_ldlm_nsfree, obd->obd_namespace,
281 CLONE_VM | CLONE_FILES);
284 fsfilt_put_ops(obd->obd_fsops);
286 LCONSOLE_INFO("%s has stopped.\n", obd->obd_name);
290 /* similar to filter_prepare_destroy
 *
 * Take the config lock for one filesystem. fsname is mapped to a resource
 * id with mgc_fsname2resid(), then a local LDLM_PLAIN lock in LCK_EX mode
 * is enqueued on that resource in the MGS namespace. lockh is handed to
 * ldlm_cli_enqueue_local(), presumably to receive the lock handle.
 *
 * Callers in this file compare the result with ELDLM_OK and release the
 * lock with mgs_put_cfg_lock().
 *
 * NOTE(review): the rc/flags declarations, the error checks and the return
 * are omitted from this listing.
 */
291 static int mgs_get_cfg_lock(struct obd_device *obd, char *fsname,
292 struct lustre_handle *lockh)
294 struct ldlm_res_id res_id;
298 rc = mgc_fsname2resid(fsname, &res_id);
300 rc = ldlm_cli_enqueue_local(obd->obd_namespace, &res_id,
301 LDLM_PLAIN, NULL, LCK_EX,
302 &flags, ldlm_blocking_ast,
303 ldlm_completion_ast, NULL,
304 fsname, 0, NULL, lockh);
/* Logged on enqueue failure; the guarding condition is not visible. */
306 CERROR("can't take cfg lock for %s (%d)\n", fsname, rc);
/* Release a config lock taken by mgs_get_cfg_lock() by dropping the LCK_EX
 * reference on lockh.
 * NOTE(review): the return statement is not visible in this listing. */
311 static int mgs_put_cfg_lock(struct lustre_handle *lockh)
314 ldlm_lock_decref(lockh, LCK_EX);
/* Compare a target's claimed registration with what this MGS has on record.
 *
 * The result of mgs_check_index() selects one of the branches below. Two
 * of them log a console error and set LDD_F_WRITECONF in mti->mti_flags,
 * which mgs_handle_target_reg() then acts on by erasing logs. Otherwise
 * mgs_check_failnid() is called to add any missing nids as failover nids.
 *
 * NOTE(review): the condition of the first branch, the rc declaration and
 * the return are omitted from this listing -- confirm against the full
 * file.
 */
321 static int mgs_check_target(struct obd_device *obd, struct mgs_target_info *mti)
326 rc = mgs_check_index(obd, mti);
/* Branch 1 (condition not visible): the target is unknown to this MGS. */
328 LCONSOLE_ERROR_MSG(0x13b, "%s claims to have registered, but "
329 "this MGS does not know about it. Assuming"
330 " writeconf.\n", mti->mti_svname);
331 mti->mti_flags |= LDD_F_WRITECONF;
/* Branch 2: mgs_check_index() returned -1. */
333 } else if (rc == -1) {
334 LCONSOLE_ERROR_MSG(0x13c, "Client log %s-client has "
335 "disappeared! Regenerating all logs.\n",
337 mti->mti_flags |= LDD_F_WRITECONF;
340 /* Index is correctly marked as used */
342 /* If the logs don't contain the mti_nids then add
343 them as failover nids */
344 rc = mgs_check_failnid(obd, mti);
350 /* Called whenever a target starts up. Flags indicate first connect, etc.
 *
 * Flow as visible below:
 *  1. Unpack the mgs_target_info (mti) from the request.
 *  2. If none of the update-type flags are set, treat the request as a
 *     startup ping and run mgs_check_target(), which may set
 *     LDD_F_WRITECONF.
 *  3. Try to take the config lock. Failure is logged but is not fatal; the
 *     local logs are still updated.
 *  4. LDD_F_WRITECONF: erase the logs, then set LDD_F_UPDATE and clear
 *     LDD_F_UPGRADE14.
 *  5. LDD_F_UPGRADE14: run mgs_upgrade_sv_14(), then set LDD_F_UPDATE.
 *  6. LDD_F_UPDATE: write the target log, clear the handled flags and set
 *     LDD_F_REWRITE_LDD.
 *  7. Release the config lock if it was taken, copy the whole mti into the
 *     reply, and sync the MGS filesystem.
 *
 * req->rq_export is dereferenced immediately; mgs_handle() rejects
 * non-connect requests that have no export before dispatching.
 *
 * NOTE(review): this listing omits lines (rc/lockrc declarations, part of
 * the flag masks, error checks, the out / out_nolock labels and the
 * return); confirm branch conditions against the full file.
 */
351 static int mgs_handle_target_reg(struct ptlrpc_request *req)
353 struct obd_device *obd = req->rq_export->exp_obd;
354 struct lustre_handle lockh;
355 struct mgs_target_info *mti, *rep_mti;
/* Reply layout: ptlrpc body followed by one mgs_target_info. */
356 int rep_size[] = { sizeof(struct ptlrpc_body), sizeof(*mti) };
360 mti = lustre_swab_reqbuf(req, REQ_REC_OFF, sizeof(*mti),
361 lustre_swab_mgs_target_info);
363 if (!(mti->mti_flags & (LDD_F_WRITECONF | LDD_F_UPGRADE14 |
365 /* We're just here as a startup ping. */
366 CDEBUG(D_MGS, "Server %s is running on %s\n",
367 mti->mti_svname, obd_export_nid2str(req->rq_export));
368 rc = mgs_check_target(obd, mti);
369 /* above will set appropriate mti flags */
371 /* Nothing wrong, or fatal error */
372 GOTO(out_nolock, rc);
375 /* Revoke the config lock to make sure nobody is reading. */
376 /* Although actually I think it should be alright if
377 someone was reading while we were updating the logs - if we
378 revoke at the end they will just update from where they left off. */
379 lockrc = mgs_get_cfg_lock(obd, mti->mti_fsname, &lockh);
380 if (lockrc != ELDLM_OK) {
381 LCONSOLE_ERROR_MSG(0x13d, "%s: Can't signal other nodes to "
382 "update their configuration (%d). Updating "
383 "local logs anyhow; you might have to "
384 "manually restart other nodes to get the "
385 "latest configuration.\n",
386 obd->obd_name, lockrc);
/* Fault-injection hook for OBD_FAIL_MGS_SLOW_TARGET_REG. */
389 OBD_FAIL_TIMEOUT(OBD_FAIL_MGS_SLOW_TARGET_REG, 10);
391 /* Log writing contention is handled by the fsdb_sem */
/* Writeconf from MDT index 0 erases every log of the filesystem; from any
 * other OST or MDT it erases only that target's log. */
393 if (mti->mti_flags & LDD_F_WRITECONF) {
394 if (mti->mti_flags & LDD_F_SV_TYPE_MDT &&
395 mti->mti_stripe_index == 0) {
396 rc = mgs_erase_logs(obd, mti->mti_fsname);
397 LCONSOLE_WARN("%s: Logs for fs %s were removed by user "
398 "request. All servers must be restarted "
399 "in order to regenerate the logs."
400 "\n", obd->obd_name, mti->mti_fsname);
401 } else if (mti->mti_flags &
402 (LDD_F_SV_TYPE_OST | LDD_F_SV_TYPE_MDT)) {
403 rc = mgs_erase_log(obd, mti->mti_svname);
404 LCONSOLE_WARN("%s: Regenerating %s log by user "
406 obd->obd_name, mti->mti_svname);
408 mti->mti_flags |= LDD_F_UPDATE;
409 /* Erased logs means start from scratch. */
410 mti->mti_flags &= ~LDD_F_UPGRADE14;
414 if (mti->mti_flags & LDD_F_UPGRADE14) {
415 rc = mgs_upgrade_sv_14(obd, mti);
417 CERROR("Can't upgrade from 1.4 (%d)\n", rc);
421 /* We're good to go */
422 mti->mti_flags |= LDD_F_UPDATE;
426 if (mti->mti_flags & LDD_F_UPDATE) {
427 CDEBUG(D_MGS, "updating %s, index=%d\n", mti->mti_svname,
428 mti->mti_stripe_index);
430 /* create or update the target log
431 and update the client/mdt logs */
432 rc = mgs_write_log_target(obd, mti);
434 CERROR("Failed to write %s log (%d)\n",
435 mti->mti_svname, rc);
/* The request has been handled: clear the flags that drove it and set
 * LDD_F_REWRITE_LDD in the mti that is sent back. */
439 mti->mti_flags &= ~(LDD_F_VIRGIN | LDD_F_UPDATE |
440 LDD_F_NEED_INDEX | LDD_F_WRITECONF |
442 mti->mti_flags |= LDD_F_REWRITE_LDD;
446 /* done with log update */
447 if (lockrc == ELDLM_OK)
448 mgs_put_cfg_lock(&lockh);
450 CDEBUG(D_MGS, "replying with %s, index=%d, rc=%d\n", mti->mti_svname,
451 mti->mti_stripe_index, rc);
452 lustre_pack_reply(req, 2, rep_size, NULL);
453 /* send back the whole mti in the reply */
454 rep_mti = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF,
456 memcpy(rep_mti, mti, sizeof(*rep_mti));
458 /* Flush logs to disk */
459 fsfilt_sync(obd, obd->u.mgs.mgs_sb);
/* Handle a set-parameter request sent over RPC.
 *
 * Flow as visible below:
 *  1. Read the mgs_send_param (msp) from the request buffer.
 *  2. Wrap msp->mgs_param in an LCFG_PARAM lustre_cfg and pass it to
 *     mgs_setparam(); fsname is handed in as a buffer and used afterwards,
 *     so it is presumably filled in by that call -- TODO confirm.
 *  3. Take the config lock for fsname and release it again, which per the
 *     comment below makes other nodes update.
 *  4. Free the lustre_cfg and echo msp back in the reply.
 *
 * NOTE(review): this listing omits lines (rc/lockrc declarations, error
 * checks, the else branch around the lock release, labels and the
 * return); confirm branch conditions against the full file.
 */
463 static int mgs_set_info_rpc(struct ptlrpc_request *req)
465 struct obd_device *obd = req->rq_export->exp_obd;
466 struct mgs_send_param *msp, *rep_msp;
467 struct lustre_handle lockh;
/* Reply layout: ptlrpc body followed by one mgs_send_param. */
468 int rep_size[] = { sizeof(struct ptlrpc_body), sizeof(*msp) };
470 struct lustre_cfg_bufs bufs;
471 struct lustre_cfg *lcfg;
472 char fsname[MTI_NAME_MAXLEN];
/* No swab function is supplied for this buffer (last argument is NULL). */
475 msp = lustre_swab_reqbuf(req, REQ_REC_OFF, sizeof(*msp), NULL);
477 /* Construct lustre_cfg structure to pass to function mgs_setparam */
478 lustre_cfg_bufs_reset(&bufs, NULL);
479 lustre_cfg_bufs_set_string(&bufs, 1, msp->mgs_param);
480 lcfg = lustre_cfg_new(LCFG_PARAM, &bufs);
481 rc = mgs_setparam(obd, lcfg, fsname);
483 CERROR("Error %d in setting the parameter %s for fs %s\n",
484 rc, msp->mgs_param, fsname);
488 /* Revoke lock so everyone updates. Should be alright if
489 * someone was already reading while we were updating the logs,
490 * so we don't really need to hold the lock while we're
494 lockrc = mgs_get_cfg_lock(obd, fsname, &lockh);
495 if (lockrc != ELDLM_OK)
496 CERROR("lock error %d for fs %s\n", lockrc,
499 mgs_put_cfg_lock(&lockh);
501 lustre_cfg_free(lcfg);
503 lustre_pack_reply(req, 2, rep_size, NULL);
504 rep_msp = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF,
506 memcpy(rep_msp, msp, sizeof(*rep_msp));
// Request handler for the MGS ptlrpc service; mgs_setup() passes it to
// ptlrpc_init_svc().
//
// Reads the opcode from the request, rejects any request other than
// MGS_CONNECT that has no export with -ENOTCONN, dispatches on the opcode,
// and finishes with target_send_reply(). Opcodes with no matching case get
// -ENOTSUPP through ptlrpc_error().
//
// NOTE(review): this listing omits most case labels, the break statements,
// the out label and the return, so the opcode names given in the comments
// below are inferred from the DEBUG_REQ() strings; confirm them against
// the full file.
511 int mgs_handle(struct ptlrpc_request *req)
// Failure id handed to target_send_reply() at the end.
513 int fail = OBD_FAIL_MGS_ALL_REPLY_NET;
// Fault-injection hook for OBD_FAIL_MGS_SLOW_REQUEST_NET.
517 OBD_FAIL_TIMEOUT(OBD_FAIL_MGS_SLOW_REQUEST_NET, 2);
// The handler asserts that no journal handle is attached to the current
// task, both here and again after the dispatch.
519 LASSERT(current->journal_info == NULL);
520 opc = lustre_msg_get_opc(req->rq_reqmsg);
521 if (opc != MGS_CONNECT) {
522 if (req->rq_export == NULL) {
523 CERROR("lustre_mgs: operation %d on unconnected MGS\n",
525 req->rq_status = -ENOTCONN;
526 GOTO(out, rc = -ENOTCONN);
// Connect (case label not visible).
532 DEBUG_REQ(D_MGS, req, "connect");
533 rc = target_handle_connect(req);
534 if (!rc && (lustre_msg_get_conn_cnt(req->rq_reqmsg) > 1))
535 /* Make clients trying to reconnect after a MGS restart
536 happy; also requires obd_replayable */
537 lustre_msg_add_op_flags(req->rq_repmsg,
538 MSG_CONNECT_RECONNECT);
// Disconnect (case label not visible).
541 DEBUG_REQ(D_MGS, req, "disconnect");
542 rc = target_handle_disconnect(req);
543 req->rq_status = rc; /* superfluous? */
// Target registration (case label not visible).
546 DEBUG_REQ(D_MGS, req, "target add");
547 rc = mgs_handle_target_reg(req);
// Target removal: the handler call is commented out, so no visible line
// sets rc for this opcode.
550 DEBUG_REQ(D_MGS, req, "target del");
551 //rc = mgs_handle_target_del(req);
// Set-parameter request (case label not visible).
554 rc = mgs_set_info_rpc(req);
// Lock enqueue (case label not visible).
558 DEBUG_REQ(D_MGS, req, "enqueue");
559 rc = ldlm_handle_enqueue(req, ldlm_server_completion_ast,
560 ldlm_server_blocking_ast, NULL);
562 case LDLM_BL_CALLBACK:
563 case LDLM_CP_CALLBACK:
564 DEBUG_REQ(D_MGS, req, "callback");
565 CERROR("callbacks should not happen on MGS\n");
// Ping (case label not visible).
570 DEBUG_REQ(D_INFO, req, "ping");
571 rc = target_handle_ping(req);
// Log cancel is answered with -ENOTSUPP (case label not visible).
574 DEBUG_REQ(D_MGS, req, "log cancel");
575 rc = -ENOTSUPP; /* la la la */
// llog origin operations are forwarded to the llog_origin_handle_*()
// helpers.
578 case LLOG_ORIGIN_HANDLE_CREATE:
579 DEBUG_REQ(D_MGS, req, "llog_init");
580 rc = llog_origin_handle_create(req);
582 case LLOG_ORIGIN_HANDLE_NEXT_BLOCK:
583 DEBUG_REQ(D_MGS, req, "llog next block");
584 rc = llog_origin_handle_next_block(req);
586 case LLOG_ORIGIN_HANDLE_READ_HEADER:
587 DEBUG_REQ(D_MGS, req, "llog read header");
588 rc = llog_origin_handle_read_header(req);
590 case LLOG_ORIGIN_HANDLE_CLOSE:
591 DEBUG_REQ(D_MGS, req, "llog close");
592 rc = llog_origin_handle_close(req);
595 DEBUG_REQ(D_MGS, req, "llog catinfo");
596 rc = llog_catinfo(req);
// Presumably the default case: unsupported opcode.
599 req->rq_status = -ENOTSUPP;
600 rc = ptlrpc_error(req);
604 LASSERT(current->journal_info == NULL);
// Logged on failure; the guarding condition is not visible.
607 CERROR("MGS handle cmd=%d rc=%d\n", opc, rc);
610 target_send_reply(req, rc, fail);
/* Export destructor (wired to .o_destroy_export in mgs_obd_ops); forwards
 * to target_destroy_export(). It is declared inline, but its address is
 * taken in the ops table.
 * NOTE(review): the return statement is not visible in this listing. */
614 static inline int mgs_destroy_export(struct obd_export *exp)
618 target_destroy_export(exp);
623 /* from mdt_iocontrol
 *
 * ioctl entry point for the MGS (wired to .o_iocontrol in mgs_obd_ops).
 * karg is read as a struct obd_ioctl_data; len and uarg are not referenced
 * in the visible lines.
 *
 * Commands visible below:
 *  - OBD_IOC_PARAM: copy a lustre_cfg of ioc_plen1 bytes from user space
 *    (ioc_pbuf1), require lcfg_bufcount >= 1, apply it with
 *    mgs_setparam(), then take and release the config lock so that
 *    everyone updates. The copy is freed before leaving the case.
 *  - OBD_IOC_DUMP_LOG: dump the config llog named by ioc_inlbuf1 with
 *    class_config_dump_llog(), inside the obd's lvfs context.
 *  - OBD_IOC_LLOG_CHECK / OBD_IOC_LLOG_INFO / OBD_IOC_LLOG_PRINT: forward
 *    to llog_ioctl(), inside the lvfs context of the obd that owns the
 *    llog context's export.
 *  - anything else: logged as an unknown command.
 *
 * NOTE(review): this listing omits lines (rc/lockrc declarations, the
 * switch header, error checks, the out_free label, returns). In the
 * visible lines only lcfg_bufcount of the user-supplied lustre_cfg is
 * validated; confirm that the buffer lengths inside it are checked before
 * use, here or in mgs_setparam().
 */
624 int mgs_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
625 void *karg, void *uarg)
627 struct obd_device *obd = exp->exp_obd;
628 struct obd_ioctl_data *data = karg;
629 struct lvfs_run_ctxt saved;
633 CDEBUG(D_IOCTL, "handling ioctl cmd %#x\n", cmd);
637 case OBD_IOC_PARAM: {
638 struct lustre_handle lockh;
639 struct lustre_cfg *lcfg;
640 struct llog_rec_hdr rec;
641 char fsname[MTI_NAME_MAXLEN];
/* rec is filled in here, but no visible line reads it afterwards. */
644 rec.lrh_len = llog_data_len(data->ioc_plen1);
/* Only LUSTRE_CFG_TYPE records are accepted. */
646 if (data->ioc_type == LUSTRE_CFG_TYPE) {
647 rec.lrh_type = OBD_CFG_REC;
649 CERROR("unknown cfg record type:%d \n", data->ioc_type);
/* Kernel copy of the user's lustre_cfg; the allocation-failure and
 * copy-failure checks are not visible in this listing. */
653 OBD_ALLOC(lcfg, data->ioc_plen1);
656 rc = copy_from_user(lcfg, data->ioc_pbuf1, data->ioc_plen1);
660 if (lcfg->lcfg_bufcount < 1)
661 GOTO(out_free, rc = -EINVAL);
663 rc = mgs_setparam(obd, lcfg, fsname);
665 CERROR("setparam err %d\n", rc);
669 /* Revoke lock so everyone updates. Should be alright if
670 someone was already reading while we were updating the logs,
671 so we don't really need to hold the lock while we're
674 lockrc = mgs_get_cfg_lock(obd, fsname, &lockh);
675 if (lockrc != ELDLM_OK)
676 CERROR("lock error %d for fs %s\n", lockrc,
679 mgs_put_cfg_lock(&lockh);
683 OBD_FREE(lcfg, data->ioc_plen1);
687 case OBD_IOC_DUMP_LOG: {
688 struct llog_ctxt *ctxt =
689 llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT);
690 push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
691 rc = class_config_dump_llog(ctxt, data->ioc_inlbuf1, NULL);
692 pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
699 case OBD_IOC_LLOG_CHECK:
700 case OBD_IOC_LLOG_INFO:
701 case OBD_IOC_LLOG_PRINT: {
702 struct llog_ctxt *ctxt =
703 llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT);
705 push_ctxt(&saved, &ctxt->loc_exp->exp_obd->obd_lvfs_ctxt, NULL);
706 rc = llog_ioctl(ctxt, cmd, data);
707 pop_ctxt(&saved, &ctxt->loc_exp->exp_obd->obd_lvfs_ctxt, NULL);
713 CDEBUG(D_INFO, "unknown command %x\n", cmd);
719 /* use obd ops to offer management infrastructure
 *
 * Method table for the MGS obd type; mgs_init() registers it through
 * class_register_type(). Each entry points at a function defined earlier
 * in this file.
 */
720 static struct obd_ops mgs_obd_ops = {
721 .o_owner = THIS_MODULE,
722 .o_connect = mgs_connect,
723 .o_disconnect = mgs_disconnect,
724 .o_setup = mgs_setup,
725 .o_precleanup = mgs_precleanup,
726 .o_cleanup = mgs_cleanup,
727 .o_destroy_export = mgs_destroy_export,
728 .o_iocontrol = mgs_iocontrol,
/* Module entry point: fetch the MGS lprocfs variable tables and register
 * the MGS obd type, under the name LUSTRE_MGS_NAME, with mgs_obd_ops as
 * its method table.
 * NOTE(review): no visible line captures the class_register_type() result
 * or returns a value; the listing omits lines here -- confirm what is
 * returned. */
731 static int __init mgs_init(void)
733 struct lprocfs_static_vars lvars;
735 lprocfs_mgs_init_vars(&lvars);
736 class_register_type(&mgs_obd_ops, NULL,
737 lvars.module_vars, LUSTRE_MGS_NAME, NULL);
/* Module exit point: unregister the obd type that mgs_init() registered.
 * The __exit annotation is commented out in the signature. */
742 static void /*__exit*/ mgs_exit(void)
744 class_unregister_type(LUSTRE_MGS_NAME);
/* Module metadata, followed by the hooks that bind mgs_init() and
 * mgs_exit() as the module's load and unload entry points. */
747 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
748 MODULE_DESCRIPTION("Lustre Management Server (MGS)");
749 MODULE_LICENSE("GPL");
751 module_init(mgs_init);
752 module_exit(mgs_exit);