4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License version 2 for more details. A copy is
14 * included in the COPYING file that accompanied this code.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2013, 2017, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
31 * lustre/mdt/mdt_mds.c
33 * Lustre Metadata Service Layer
35 * Author: Di Wang <di.wang@whamcloud.com>
38 #define DEBUG_SUBSYSTEM S_MDS
40 #include <linux/module.h>
42 #include <obd_support.h>
43 /* struct ptlrpc_request */
44 #include <lustre_net.h>
45 /* struct obd_export */
46 #include <lustre_export.h>
47 /* struct obd_device */
50 #include <dt_object.h>
51 #include <lustre_mds.h>
52 #include "mdt_internal.h"
53 #include <lustre_quota.h>
54 #include <lustre_acl.h>
55 #include <lustre_nodemap.h>
56 #include <uapi/linux/lustre/lustre_param.h>
/*
 * Device state used throughout this file: the embedded md_device, one ptlrpc
 * service handle per request portal, and the mutex that guards the handles.
 * NOTE(review): the code below reaches these through a struct mds_device
 * pointer (m->mds_regular_service, m->mds_health_mutex, ...), so they look
 * like members of struct mds_device; the enclosing declaration line is not
 * visible in this view -- confirm.
 */
60 struct md_device mds_md_dev;
/* Regular metadata requests, registered on MDS_REQUEST_PORTAL. */
61 struct ptlrpc_service *mds_regular_service;
/* Readpage requests, registered on MDS_READPAGE_PORTAL. */
62 struct ptlrpc_service *mds_readpage_service;
/* Object update (OUT) requests, registered on OUT_PORTAL. */
63 struct ptlrpc_service *mds_out_service;
/* Sequence controller requests, registered on SEQ_CONTROLLER_PORTAL. */
64 struct ptlrpc_service *mds_mdsc_service;
/* Metadata sequence server requests, registered on SEQ_METADATA_PORTAL. */
65 struct ptlrpc_service *mds_mdss_service;
/* FLD requests, registered on FLD_REQUEST_PORTAL. */
66 struct ptlrpc_service *mds_fld_service;
/* MDT IO requests, registered on MDS_IO_PORTAL. */
67 struct ptlrpc_service *mds_io_service;
/*
 * Held while the service pointers are unregistered and cleared in
 * mds_stop_ptlrpc_service() and while mds_health_check() reads them.
 */
68 struct mutex mds_health_mutex;
72 * * Initialized in mds_mod_init().
/*
 * Module parameters.  All of them are registered with permissions 0444 and
 * are consumed by mds_start_ptlrpc_service() when it fills in the
 * ptlrpc_service_conf of each service.
 */

/*
 * User-requested thread count, passed as tc_nthrs_user for the regular, out
 * and io services.
 */
74 static unsigned long mds_num_threads;
75 module_param(mds_num_threads, ulong, 0444);
76 MODULE_PARM_DESC(mds_num_threads, "number of MDS service threads to start");
/* Passed as tc_cpu_bind for the regular and out services; defaults to 1. */
78 static unsigned int mds_cpu_bind = 1;
79 module_param(mds_cpu_bind, uint, 0444);
80 MODULE_PARM_DESC(mds_cpu_bind,
81 "bind MDS threads to particular CPU partitions");
/*
 * Passed as tc_nthrs_max for the io service; defaults to 512.  Unlike the
 * other parameters here it is not declared static.
 */
83 int mds_max_io_threads = 512;
84 module_param(mds_max_io_threads, int, 0444);
85 MODULE_PARM_DESC(mds_max_io_threads,
86 "maximum number of MDS IO service threads");
/* Passed as tc_cpu_bind for the io service; defaults to 1. */
88 static unsigned int mds_io_cpu_bind = 1;
89 module_param(mds_io_cpu_bind, uint, 0444);
90 MODULE_PARM_DESC(mds_io_cpu_bind,
91 "bind MDS IO threads to particular CPU partitions");
/*
 * CPT pattern for the io service.  It is only passed as cc_pattern when
 * mdt_io_cptable is NULL.
 */
93 static char *mds_io_num_cpts;
94 module_param(mds_io_num_cpts, charp, 0444);
95 MODULE_PARM_DESC(mds_io_num_cpts,
96 "CPU partitions MDS IO threads should run on");
/*
 * Optional CPT table for the io service.  It is allocated in
 * mds_start_ptlrpc_service() and freed in mds_stop_ptlrpc_service(); it is
 * file-scope state, not per-device state.
 */
98 static struct cfs_cpt_table *mdt_io_cptable;
/* CPT pattern passed as cc_pattern for the regular and out services. */
100 static char *mds_num_cpts;
101 module_param(mds_num_cpts, charp, 0444);
102 MODULE_PARM_DESC(mds_num_cpts, "CPU partitions MDS threads should run on");
/* User-requested thread count, passed as tc_nthrs_user for the readpage service. */
104 static unsigned long mds_rdpg_num_threads;
105 module_param(mds_rdpg_num_threads, ulong, 0444);
106 MODULE_PARM_DESC(mds_rdpg_num_threads,
107 "number of MDS readpage service threads to start");
/* Passed as tc_cpu_bind for the readpage service; defaults to 1. */
109 static unsigned int mds_rdpg_cpu_bind = 1;
110 module_param(mds_rdpg_cpu_bind, uint, 0444);
111 MODULE_PARM_DESC(mds_rdpg_cpu_bind,
112 "bind MDS readpage threads to particular CPU partitions");
/* CPT pattern passed as cc_pattern for the readpage service. */
114 static char *mds_rdpg_num_cpts;
115 module_param(mds_rdpg_num_cpts, charp, 0444);
116 MODULE_PARM_DESC(mds_rdpg_num_cpts,
117 "CPU partitions MDS readpage threads should run on");
119 /* device init/fini methods */
/*
 * Unregister every ptlrpc service that is still registered on the device m
 * and reset its pointer to NULL, then free the io CPT table if one exists.
 *
 * Each service is checked for NULL first, so the function can be called on a
 * partially started device (it is used on the failure path of
 * mds_start_ptlrpc_service()) and can be called more than once.
 */
120 static void mds_stop_ptlrpc_service(struct mds_device *m)
/*
 * The pointers are cleared under mds_health_mutex, the same mutex
 * mds_health_check() holds while it reads them.
 */
124 mutex_lock(&m->mds_health_mutex);
125 if (m->mds_regular_service != NULL) {
126 ptlrpc_unregister_service(m->mds_regular_service);
127 m->mds_regular_service = NULL;
129 if (m->mds_readpage_service != NULL) {
130 ptlrpc_unregister_service(m->mds_readpage_service);
131 m->mds_readpage_service = NULL;
133 if (m->mds_out_service != NULL) {
134 ptlrpc_unregister_service(m->mds_out_service);
135 m->mds_out_service = NULL;
137 if (m->mds_mdsc_service != NULL) {
138 ptlrpc_unregister_service(m->mds_mdsc_service);
139 m->mds_mdsc_service = NULL;
141 if (m->mds_mdss_service != NULL) {
142 ptlrpc_unregister_service(m->mds_mdss_service);
143 m->mds_mdss_service = NULL;
145 if (m->mds_fld_service != NULL) {
146 ptlrpc_unregister_service(m->mds_fld_service);
147 m->mds_fld_service = NULL;
149 if (m->mds_io_service != NULL) {
150 ptlrpc_unregister_service(m->mds_io_service);
151 m->mds_io_service = NULL;
153 mutex_unlock(&m->mds_health_mutex);
/*
 * The io CPT table is file-scope state and is released outside the mutex,
 * after the io service that referenced it has been unregistered.
 */
155 if (mdt_io_cptable != NULL) {
156 cfs_cpt_table_free(mdt_io_cptable);
157 mdt_io_cptable = NULL;
/*
 * hpreq_check callback installed on LDLM_ENQUEUE requests (see
 * ldlm_enqueue_hpreq_ops).
 *
 * It looks at the MSG_REPLAY and MSG_RESENT flags of the request, unpacks
 * the ldlm_request from the request capsule and, when the request carries at
 * least one lock handle, looks the first handle up in the lock hash of the
 * export.  rc is then set to whether the granted mode of that lock equals
 * its requested mode.
 *
 * NOTE(review): the right-hand operand of the flag comparison, the
 * declaration of rc, the NULL checks and the return statements are not
 * visible in this view -- confirm the exact return values against the full
 * source.
 */
163 static int ldlm_enqueue_hpreq_check(struct ptlrpc_request *req)
165 struct ldlm_request *dlm_req;
169 if ((lustre_msg_get_flags(req->rq_reqmsg) & (MSG_REPLAY|MSG_RESENT)) !=
/* Set up the capsule on the server side so the DLM request body can be read. */
173 req_capsule_init(&req->rq_pill, req, RCL_SERVER);
174 req_capsule_set(&req->rq_pill, &RQF_LDLM_ENQUEUE);
175 dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ);
179 if (dlm_req->lock_count > 0) {
180 struct ldlm_lock *lock;
/* Only the first handle, lock_handle[0], is examined. */
182 lock = cfs_hash_lookup(req->rq_export->exp_lock_hash,
183 (void *)&dlm_req->lock_handle[0]);
185 DEBUG_REQ(D_RPCTRACE, req, "lock %p cookie 0x%llx",
186 lock, dlm_req->lock_handle[0].cookie);
/* Non-zero when the lock found for this request is already granted in the requested mode. */
188 rc = lock->l_granted_mode == lock->l_req_mode;
190 LDLM_DEBUG(lock, "hpreq resend");
/* Release the lock obtained from the hash lookup above. */
191 LDLM_LOCK_RELEASE(lock);
/*
 * High-priority request operations that mds_hpreq_handler() installs on
 * LDLM_ENQUEUE requests.  Only the check callback is provided; there is no
 * lock-match callback.
 */
198 static struct ptlrpc_hpreq_ops ldlm_enqueue_hpreq_ops = {
199 .hpreq_lock_match = NULL,
200 .hpreq_check = ldlm_enqueue_hpreq_check,
/*
 * High-priority request handler of the regular MDT service (set as
 * so_hpreq_handler in mds_start_ptlrpc_service()).
 *
 * Requests whose opcode is LDLM_ENQUEUE get ldlm_enqueue_hpreq_ops as their
 * rq_ops; ptlrpc_hpreq_handler() is the generic handler.
 * NOTE(review): the line between the two statements and the return statement
 * are not visible in this view, so whether ptlrpc_hpreq_handler() also runs
 * for LDLM_ENQUEUE requests must be confirmed against the full source.
 */
204 static int mds_hpreq_handler(struct ptlrpc_request *req)
206 if (lustre_msg_get_opc(req->rq_reqmsg) == LDLM_ENQUEUE)
207 req->rq_ops = &ldlm_enqueue_hpreq_ops;
209 ptlrpc_hpreq_handler(req);
/*
 * Register all ptlrpc services of the MDS device m, in this order: regular
 * request, readpage, out (object update), sequence controller, metadata
 * sequence server, FLD and io.
 *
 * Every service is described by assigning a compound literal to the
 * function-static conf and passing it to ptlrpc_register_service() together
 * with the kset and debugfs entry of the obd.  When registration fails, the
 * error taken from PTR_ERR() is logged with CERROR() and the corresponding
 * service pointer is reset to NULL, so that mds_stop_ptlrpc_service() skips
 * it.
 *
 * NOTE(review): the declarations of rc, mask, cpt and i, the error label and
 * the return statements are not visible in this view.
 */
213 static int mds_start_ptlrpc_service(struct mds_device *m)
215 static struct ptlrpc_service_conf conf;
216 struct obd_device *obd = m->mds_md_dev.md_lu_dev.ld_obd;
/*
 * Regular MDT request service.  It is the only service in this function
 * that uses mds_hpreq_handler as its high-priority handler.
 */
222 conf = (typeof(conf)) {
223 .psc_name = LUSTRE_MDT_NAME,
224 .psc_watchdog_factor = MDT_SERVICE_WATCHDOG_FACTOR,
226 .bc_nbufs = MDS_NBUFS,
227 .bc_buf_size = MDS_REG_BUFSIZE,
228 .bc_req_max_size = MDS_REG_MAXREQSIZE,
229 .bc_rep_max_size = MDS_REG_MAXREPSIZE,
230 .bc_req_portal = MDS_REQUEST_PORTAL,
231 .bc_rep_portal = MDC_REPLY_PORTAL,
234 * We'd like to have a mechanism to set this on a per-device
238 .tc_thr_name = LUSTRE_MDT_NAME,
239 .tc_thr_factor = MDS_THR_FACTOR,
240 .tc_nthrs_init = MDS_NTHRS_INIT,
241 .tc_nthrs_base = MDS_NTHRS_BASE,
242 .tc_nthrs_max = MDS_NTHRS_MAX,
243 .tc_nthrs_user = mds_num_threads,
244 .tc_cpu_bind = mds_cpu_bind,
245 /* LCT_DT_THREAD is required as MDT threads may scan
246 * all LDLM namespaces (including OFD-originated) to
247 * cancel LDLM locks */
248 .tc_ctx_tags = LCT_MD_THREAD | LCT_DT_THREAD,
251 .cc_pattern = mds_num_cpts,
255 .so_req_handler = tgt_request_handle,
256 .so_req_printer = target_print_req,
257 .so_hpreq_handler = mds_hpreq_handler,
260 m->mds_regular_service = ptlrpc_register_service(&conf, &obd->obd_kset,
261 obd->obd_debugfs_entry);
262 if (IS_ERR(m->mds_regular_service)) {
263 rc = PTR_ERR(m->mds_regular_service);
264 CERROR("failed to start regular mdt service: %d\n", rc);
265 m->mds_regular_service = NULL;
271 * readpage service configuration. Parameters have to be adjusted,
274 memset(&conf, 0, sizeof(conf));
275 conf = (typeof(conf)) {
276 .psc_name = LUSTRE_MDT_NAME "_readpage",
277 .psc_watchdog_factor = MDT_SERVICE_WATCHDOG_FACTOR,
279 .bc_nbufs = MDS_NBUFS,
280 .bc_buf_size = MDS_BUFSIZE,
281 .bc_req_max_size = MDS_MAXREQSIZE,
282 .bc_rep_max_size = MDS_MAXREPSIZE,
283 .bc_req_portal = MDS_READPAGE_PORTAL,
284 .bc_rep_portal = MDC_REPLY_PORTAL,
287 .tc_thr_name = LUSTRE_MDT_NAME "_rdpg",
288 .tc_thr_factor = MDS_RDPG_THR_FACTOR,
289 .tc_nthrs_init = MDS_RDPG_NTHRS_INIT,
290 .tc_nthrs_base = MDS_RDPG_NTHRS_BASE,
291 .tc_nthrs_max = MDS_RDPG_NTHRS_MAX,
292 .tc_nthrs_user = mds_rdpg_num_threads,
293 .tc_cpu_bind = mds_rdpg_cpu_bind,
294 .tc_ctx_tags = LCT_MD_THREAD,
297 .cc_pattern = mds_rdpg_num_cpts,
301 .so_req_handler = tgt_request_handle,
302 .so_req_printer = target_print_req,
305 m->mds_readpage_service = ptlrpc_register_service(&conf, &obd->obd_kset,
306 obd->obd_debugfs_entry);
307 if (IS_ERR(m->mds_readpage_service)) {
308 rc = PTR_ERR(m->mds_readpage_service);
309 CERROR("failed to start readpage service: %d\n", rc);
310 m->mds_readpage_service = NULL;
312 GOTO(err_mds_svc, rc);
315 /* Object update service */
316 conf = (typeof(conf)) {
317 .psc_name = LUSTRE_MDT_NAME "_out",
318 .psc_watchdog_factor = MDT_SERVICE_WATCHDOG_FACTOR,
320 .bc_nbufs = MDS_NBUFS,
321 .bc_buf_size = OUT_BUFSIZE,
322 .bc_req_max_size = OUT_MAXREQSIZE,
323 .bc_rep_max_size = OUT_MAXREPSIZE,
324 .bc_req_portal = OUT_PORTAL,
325 .bc_rep_portal = OSC_REPLY_PORTAL,
328 * We'd like to have a mechanism to set this on a per-device
332 .tc_thr_name = LUSTRE_MDT_NAME "_out",
333 .tc_thr_factor = MDS_THR_FACTOR,
334 .tc_nthrs_init = MDS_NTHRS_INIT,
335 .tc_nthrs_base = MDS_NTHRS_BASE,
336 .tc_nthrs_max = MDS_NTHRS_MAX,
337 .tc_nthrs_user = mds_num_threads,
338 .tc_cpu_bind = mds_cpu_bind,
339 .tc_ctx_tags = LCT_MD_THREAD |
343 .cc_pattern = mds_num_cpts,
347 .so_req_handler = tgt_request_handle,
348 .so_req_printer = target_print_req,
349 .so_hpreq_handler = NULL,
352 m->mds_out_service = ptlrpc_register_service(&conf, &obd->obd_kset,
353 obd->obd_debugfs_entry);
354 if (IS_ERR(m->mds_out_service)) {
355 rc = PTR_ERR(m->mds_out_service);
356 CERROR("failed to start out service: %d\n", rc);
357 m->mds_out_service = NULL;
358 GOTO(err_mds_svc, rc);
362 * sequence controller service configuration
/*
 * The memset is followed immediately by a whole-struct compound-literal
 * assignment, which already zero-initializes every member that is not named.
 * The sequence and FLD services below set no thread factor, base, user count,
 * CPU binding or CPT pattern, so those members stay zero or NULL.
 */
364 memset(&conf, 0, sizeof(conf));
365 conf = (typeof(conf)) {
366 .psc_name = LUSTRE_MDT_NAME "_seqs",
367 .psc_watchdog_factor = MDT_SERVICE_WATCHDOG_FACTOR,
369 .bc_nbufs = MDS_NBUFS,
370 .bc_buf_size = SEQ_BUFSIZE,
371 .bc_req_max_size = SEQ_MAXREQSIZE,
372 .bc_rep_max_size = SEQ_MAXREPSIZE,
373 .bc_req_portal = SEQ_CONTROLLER_PORTAL,
374 .bc_rep_portal = MDC_REPLY_PORTAL,
377 .tc_thr_name = LUSTRE_MDT_NAME "_seqs",
378 .tc_nthrs_init = MDS_OTHR_NTHRS_INIT,
379 .tc_nthrs_max = MDS_OTHR_NTHRS_MAX,
380 .tc_ctx_tags = LCT_MD_THREAD,
383 .so_req_handler = tgt_request_handle,
384 .so_req_printer = target_print_req,
385 .so_hpreq_handler = NULL,
388 m->mds_mdsc_service = ptlrpc_register_service(&conf, &obd->obd_kset,
389 obd->obd_debugfs_entry);
390 if (IS_ERR(m->mds_mdsc_service)) {
391 rc = PTR_ERR(m->mds_mdsc_service);
392 CERROR("failed to start seq controller service: %d\n", rc);
393 m->mds_mdsc_service = NULL;
395 GOTO(err_mds_svc, rc);
399 * metadata sequence server service configuration
401 memset(&conf, 0, sizeof(conf));
402 conf = (typeof(conf)) {
403 .psc_name = LUSTRE_MDT_NAME "_seqm",
404 .psc_watchdog_factor = MDT_SERVICE_WATCHDOG_FACTOR,
406 .bc_nbufs = MDS_NBUFS,
407 .bc_buf_size = SEQ_BUFSIZE,
408 .bc_req_max_size = SEQ_MAXREQSIZE,
409 .bc_rep_max_size = SEQ_MAXREPSIZE,
410 .bc_req_portal = SEQ_METADATA_PORTAL,
411 .bc_rep_portal = MDC_REPLY_PORTAL,
414 .tc_thr_name = LUSTRE_MDT_NAME "_seqm",
415 .tc_nthrs_init = MDS_OTHR_NTHRS_INIT,
416 .tc_nthrs_max = MDS_OTHR_NTHRS_MAX,
417 .tc_ctx_tags = LCT_MD_THREAD | LCT_DT_THREAD
420 .so_req_handler = tgt_request_handle,
421 .so_req_printer = target_print_req,
422 .so_hpreq_handler = NULL,
425 m->mds_mdss_service = ptlrpc_register_service(&conf, &obd->obd_kset,
426 obd->obd_debugfs_entry);
427 if (IS_ERR(m->mds_mdss_service)) {
428 rc = PTR_ERR(m->mds_mdss_service);
429 CERROR("failed to start metadata seq server service: %d\n", rc);
430 m->mds_mdss_service = NULL;
432 GOTO(err_mds_svc, rc);
435 /* FLD service start */
436 memset(&conf, 0, sizeof(conf));
437 conf = (typeof(conf)) {
438 .psc_name = LUSTRE_MDT_NAME "_fld",
439 .psc_watchdog_factor = MDT_SERVICE_WATCHDOG_FACTOR,
441 .bc_nbufs = MDS_NBUFS,
442 .bc_buf_size = FLD_BUFSIZE,
443 .bc_req_max_size = FLD_MAXREQSIZE,
444 .bc_rep_max_size = FLD_MAXREPSIZE,
445 .bc_req_portal = FLD_REQUEST_PORTAL,
446 .bc_rep_portal = MDC_REPLY_PORTAL,
449 .tc_thr_name = LUSTRE_MDT_NAME "_fld",
450 .tc_nthrs_init = MDS_OTHR_NTHRS_INIT,
451 .tc_nthrs_max = MDS_OTHR_NTHRS_MAX,
452 .tc_ctx_tags = LCT_DT_THREAD | LCT_MD_THREAD,
455 .so_req_handler = tgt_request_handle,
456 .so_req_printer = target_print_req,
457 .so_hpreq_handler = NULL,
460 m->mds_fld_service = ptlrpc_register_service(&conf, &obd->obd_kset,
461 obd->obd_debugfs_entry);
462 if (IS_ERR(m->mds_fld_service)) {
463 rc = PTR_ERR(m->mds_fld_service);
464 CERROR("failed to start fld service: %d\n", rc);
465 m->mds_fld_service = NULL;
467 GOTO(err_mds_svc, rc);
471 mask = cfs_cpt_nodemask(cfs_cpt_tab, CFS_CPT_ANY);
472 /* event CPT feature is disabled in libcfs level by set partition
473 * number to 1, we still want to set node affinity for io service */
474 if (cfs_cpt_number(cfs_cpt_tab) == 1 && nodes_weight(*mask) > 1) {
/*
 * Build a private CPT table for the io service with as many partitions as
 * there are nodes in the mask, then give each node its own partition index
 * (cpt++).  A failure here is only reported with CWARN(); on a set_node
 * failure the table is freed and the pointer reset to NULL.
 */
478 mdt_io_cptable = cfs_cpt_table_alloc(nodes_weight(*mask));
479 for_each_node_mask(i, *mask) {
480 if (mdt_io_cptable == NULL) {
481 CWARN("MDS failed to create CPT table\n");
485 rc = cfs_cpt_set_node(mdt_io_cptable, cpt++, i);
487 CWARN("MDS Failed to set node %d for IO CPT table\n",
489 cfs_cpt_table_free(mdt_io_cptable);
490 mdt_io_cptable = NULL;
/*
 * MDT io service.  It uses the OST_* buffer sizes and OSS_* thread tunables,
 * with mds_max_io_threads as the thread cap, and installs per-thread
 * init/done hooks plus tgt_hpreq_handler.  When mdt_io_cptable was built
 * above it is passed as cc_cptable and no pattern is given; otherwise the
 * mds_io_num_cpts pattern is used.
 * NOTE(review): tc_nthrs_user is taken from mds_num_threads; this file has
 * no io-specific thread-count parameter -- confirm this is intended.
 */
496 memset(&conf, 0, sizeof(conf));
497 conf = (typeof(conf)) {
498 .psc_name = LUSTRE_MDT_NAME "_io",
499 .psc_watchdog_factor = MDT_SERVICE_WATCHDOG_FACTOR,
501 .bc_nbufs = OST_NBUFS,
502 .bc_buf_size = OST_IO_BUFSIZE,
503 .bc_req_max_size = OST_IO_MAXREQSIZE,
504 .bc_rep_max_size = OST_IO_MAXREPSIZE,
505 .bc_req_portal = MDS_IO_PORTAL,
506 .bc_rep_portal = MDC_REPLY_PORTAL,
509 .tc_thr_name = LUSTRE_MDT_NAME "_io",
510 .tc_thr_factor = OSS_THR_FACTOR,
511 .tc_nthrs_init = OSS_NTHRS_INIT,
512 .tc_nthrs_base = OSS_NTHRS_BASE,
513 .tc_nthrs_max = mds_max_io_threads,
514 .tc_nthrs_user = mds_num_threads,
515 .tc_cpu_bind = mds_io_cpu_bind,
516 .tc_ctx_tags = LCT_DT_THREAD | LCT_MD_THREAD,
519 .cc_cptable = mdt_io_cptable,
520 .cc_pattern = mdt_io_cptable == NULL ?
521 mds_io_num_cpts : NULL,
525 .so_thr_init = tgt_io_thread_init,
526 .so_thr_done = tgt_io_thread_done,
527 .so_req_handler = tgt_request_handle,
528 .so_req_printer = target_print_req,
529 .so_hpreq_handler = tgt_hpreq_handler,
532 m->mds_io_service = ptlrpc_register_service(&conf, &obd->obd_kset,
533 obd->obd_debugfs_entry);
534 if (IS_ERR(m->mds_io_service)) {
535 rc = PTR_ERR(m->mds_io_service);
536 CERROR("failed to start MDT I/O service: %d\n", rc);
537 m->mds_io_service = NULL;
538 GOTO(err_mds_svc, rc);
/*
 * Tear down whatever was registered so far; mds_stop_ptlrpc_service() skips
 * services whose pointer is NULL.
 * NOTE(review): presumably this is the err_mds_svc target of the GOTO()
 * calls above; the label line is not visible in this view.
 */
544 mds_stop_ptlrpc_service(m);
/*
 * Convert an lu_device pointer into the mds_device that embeds it as
 * mds_md_dev.md_lu_dev.
 */
549 static inline struct mds_device *mds_dev(struct lu_device *d)
551 return container_of_safe(d, struct mds_device, mds_md_dev.md_lu_dev);
/*
 * ldto_device_fini method: stop all ptlrpc services of the device and clean
 * up the lprocfs state of its obd.
 * NOTE(review): the second parameter and the return statement are not
 * visible in this view.
 */
554 static struct lu_device *mds_device_fini(const struct lu_env *env,
557 struct mds_device *m = mds_dev(d);
558 struct obd_device *obd = d->ld_obd;
/* Services are stopped before the lprocfs entries of the obd are removed. */
561 mds_stop_ptlrpc_service(m);
562 lprocfs_obd_cleanup(obd);
/*
 * ldto_device_free method: finalize the md_device embedded in the
 * mds_device.  It is also called from the failure paths of
 * mds_device_alloc().
 * NOTE(review): the second parameter, any release of the mds_device memory
 * and the return statement are not visible in this view.
 */
566 static struct lu_device *mds_device_free(const struct lu_env *env,
569 struct mds_device *m = mds_dev(d);
572 md_device_fini(&m->mds_md_dev);
/*
 * ldto_device_alloc method: create an mds_device for the obd named by
 * string 0 of cfg, set up lprocfs for that obd and start the ptlrpc
 * services.
 *
 * Visible failure handling: ERR_PTR(-ENOMEM) is returned on the early error
 * path; when lprocfs_obd_setup() fails the device is freed; when the
 * services fail to start lprocfs is cleaned up first and then the device is
 * freed.
 * NOTE(review): the allocation itself, the declarations of l and rc, the
 * assignment of the lu_device to the obd and the return statements are not
 * visible in this view.
 */
577 static struct lu_device *mds_device_alloc(const struct lu_env *env,
578 struct lu_device_type *t,
579 struct lustre_cfg *cfg)
581 struct mds_device *m;
582 struct obd_device *obd;
588 return ERR_PTR(-ENOMEM);
590 md_device_init(&m->mds_md_dev, t);
591 l = &m->mds_md_dev.md_lu_dev;
/* The obd is expected to exist already; a failed lookup trips the LASSERT. */
593 obd = class_name2obd(lustre_cfg_string(cfg, 0));
594 LASSERT(obd != NULL);
597 /* set this lu_device to obd, because error handling need it */
600 rc = lprocfs_obd_setup(obd, true);
602 mds_device_free(env, l);
/* The mutex must be initialized before the services start, because the stop path locks it. */
607 mutex_init(&m->mds_health_mutex);
609 rc = mds_start_ptlrpc_service(m);
/* Undo in reverse order of setup: lprocfs first, then the device itself. */
611 lprocfs_obd_cleanup(obd);
612 mds_device_free(env, l);
619 /* type constructor/destructor: mds_type_init, mds_type_fini */
/*
 * The macro is invoked with the mds prefix and the mdt_thread_key context
 * key; mds_device_type_ops refers to mds_type_init, mds_type_fini,
 * mds_type_start and mds_type_stop, which are presumably generated here.
 */
620 LU_TYPE_INIT_FINI(mds, &mdt_thread_key);
/*
 * Device type operations of the MDS device type: the type-level
 * init/fini/start/stop hooks plus the per-device alloc, free and fini
 * methods defined in this file.
 */
622 static const struct lu_device_type_operations mds_device_type_ops = {
623 .ldto_init = mds_type_init,
624 .ldto_fini = mds_type_fini,
626 .ldto_start = mds_type_start,
627 .ldto_stop = mds_type_stop,
629 .ldto_device_alloc = mds_device_alloc,
630 .ldto_device_free = mds_device_free,
631 .ldto_device_fini = mds_device_fini
/*
 * The MDS lu_device_type: a metadata device type (LU_DEVICE_MD) named
 * LUSTRE_MDS_NAME that uses mds_device_type_ops.  It is passed to
 * class_register_type() in mds_mod_init().
 */
634 static struct lu_device_type mds_device_type = {
635 .ldt_tags = LU_DEVICE_MD,
636 .ldt_name = LUSTRE_MDS_NAME,
637 .ldt_ops = &mds_device_type_ops,
638 .ldt_ctx_tags = LCT_MD_THREAD
/*
 * o_health_check method: run ptlrpc_service_health_check() on each of the
 * seven MDS services and combine the results.
 *
 * Returns 1 when any service reported a non-zero result, 0 otherwise.
 * NOTE(review): the declaration and initial value of rc are not visible in
 * this view.
 */
641 static int mds_health_check(const struct lu_env *env, struct obd_device *obd)
643 struct mds_device *mds = mds_dev(obd->obd_lu_dev);
/*
 * Hold mds_health_mutex so the service pointers cannot be unregistered and
 * cleared by mds_stop_ptlrpc_service() while they are being checked.
 */
647 mutex_lock(&mds->mds_health_mutex);
648 rc |= ptlrpc_service_health_check(mds->mds_regular_service);
649 rc |= ptlrpc_service_health_check(mds->mds_readpage_service);
650 rc |= ptlrpc_service_health_check(mds->mds_out_service);
651 rc |= ptlrpc_service_health_check(mds->mds_mdsc_service);
652 rc |= ptlrpc_service_health_check(mds->mds_mdss_service);
653 rc |= ptlrpc_service_health_check(mds->mds_fld_service);
654 rc |= ptlrpc_service_health_check(mds->mds_io_service);
655 mutex_unlock(&mds->mds_health_mutex);
/* Collapse the OR of all results into a plain 0/1 answer. */
657 return rc != 0 ? 1 : 0;
660 /* ioctls on obd dev */
/*
 * o_iocontrol method of the MDS obd.
 *
 * The call is traced with CDEBUG().  Only OBD_IOC_NODEMAP is accepted; any
 * other command ends with rc = -EINVAL.  Accepted commands are forwarded to
 * server_iocontrol_nodemap().
 * NOTE(review): the declaration of rc, the assignment of data (presumably
 * from karg), the out label and the return statement are not visible in
 * this view.
 */
661 static int mds_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
662 void *karg, void __user *uarg)
664 struct obd_device *obd = exp->exp_obd;
665 struct obd_ioctl_data *data;
669 CDEBUG(D_IOCTL, "%s: cmd=%x len=%u karg=%pK uarg=%pK\n",
670 obd->obd_name, cmd, len, karg, uarg);
673 /* we only support nodemap ioctls, for now */
674 if (cmd != OBD_IOC_NODEMAP)
675 GOTO(out, rc = -EINVAL);
677 rc = server_iocontrol_nodemap(obd, data, true);
/*
 * obd operations of the MDS obd type: only the health check and ioctl
 * entry points are provided.  Registered in mds_mod_init().
 */
685 static const struct obd_ops mds_obd_device_ops = {
686 .o_owner = THIS_MODULE,
687 .o_health_check = mds_health_check,
688 .o_iocontrol = mds_iocontrol,
/*
 * Register the MDS obd type under LUSTRE_MDS_NAME with mds_obd_device_ops
 * and mds_device_type.  Returns the result of class_register_type().
 */
691 int mds_mod_init(void)
693 return class_register_type(&mds_obd_device_ops, NULL, false,
694 LUSTRE_MDS_NAME, &mds_device_type);
/* Unregister the obd type that mds_mod_init() registered under LUSTRE_MDS_NAME. */
697 void mds_mod_exit(void)
699 class_unregister_type(LUSTRE_MDS_NAME);