4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License version 2 for more details. A copy is
14 * included in the COPYING file that accompanied this code.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2013, 2015, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
31 * lustre/mdt/mdt_mds.c
33 * Lustre Metadata Service Layer
35 * Author: Di Wang <di.wang@whamcloud.com>
38 #define DEBUG_SUBSYSTEM S_MDS
40 #include <linux/module.h>
42 #include <obd_support.h>
43 /* struct ptlrpc_request */
44 #include <lustre_net.h>
45 /* struct obd_export */
46 #include <lustre_export.h>
47 /* struct obd_device */
50 #include <dt_object.h>
51 #include <lustre_mds.h>
52 #include "mdt_internal.h"
53 #include <lustre_quota.h>
54 #include <lustre_acl.h>
55 #include <uapi/linux/lustre/lustre_param.h>
/*
 * Per-device MDS state.  The functions below reach these declarations as
 * members of struct mds_device (m->mds_md_dev, m->mds_kset, ...).
 * NOTE(review): the opening and closing lines of the enclosing struct are
 * not visible in this view.
 */
/* Embedded md_device; mds_dev() maps its md_lu_dev back to the container. */
59 struct md_device mds_md_dev;
/*
 * One handle per ptlrpc service.  Each is assigned in
 * mds_start_ptlrpc_service() and reset to NULL by
 * mds_stop_ptlrpc_service() after it has been unregistered.
 */
60 struct ptlrpc_service *mds_regular_service;
61 struct ptlrpc_service *mds_readpage_service;
62 struct ptlrpc_service *mds_out_service;
63 struct ptlrpc_service *mds_setattr_service;
64 struct ptlrpc_service *mds_mdsc_service;
65 struct ptlrpc_service *mds_mdss_service;
66 struct ptlrpc_service *mds_fld_service;
67 struct ptlrpc_service *mds_io_service;
/*
 * Taken by mds_stop_ptlrpc_service() and mds_health_check() around their
 * accesses to the service pointers above.
 */
68 struct mutex mds_health_mutex;
/*
 * Passed to every ptlrpc_register_service() call; handed to
 * lprocfs_kset_register() and lprocfs_kset_unregister() in the device
 * alloc and fini paths.
 */
69 struct kset *mds_kset;
73 * Initialized in mds_mod_init().
/*
 * Module parameters.  Every one is registered with mode 0444 and is read
 * by mds_start_ptlrpc_service() when it fills in a service configuration.
 */
/* Used as tc_nthrs_user for both the regular and the out service. */
75 static unsigned long mds_num_threads;
76 module_param(mds_num_threads, ulong, 0444);
77 MODULE_PARM_DESC(mds_num_threads, "number of MDS service threads to start");
/* Used as tc_nthrs_max of the MDT I/O service; defaults to 512. */
79 int mds_max_io_threads = 512;
80 module_param(mds_max_io_threads, int, 0444);
81 MODULE_PARM_DESC(mds_max_io_threads, "maximum number of MDS IO service threads");
/* Used as cc_pattern for both the regular and the out service. */
83 static char *mds_num_cpts;
84 module_param(mds_num_cpts, charp, 0444);
85 MODULE_PARM_DESC(mds_num_cpts, "CPU partitions MDS threads should run on");
/* Used as tc_nthrs_user of the readpage service. */
87 static unsigned long mds_rdpg_num_threads;
88 module_param(mds_rdpg_num_threads, ulong, 0444);
89 MODULE_PARM_DESC(mds_rdpg_num_threads,
90 "number of MDS readpage service threads to start");
/* Used as cc_pattern of the readpage service. */
92 static char *mds_rdpg_num_cpts;
93 module_param(mds_rdpg_num_cpts, charp, 0444);
94 MODULE_PARM_DESC(mds_rdpg_num_cpts,
95 "CPU partitions MDS readpage threads should run on");
97 /* NB: these two should be removed along with setattr service in the future */
/* Used as tc_nthrs_user of the setattr service. */
98 static unsigned long mds_attr_num_threads;
99 module_param(mds_attr_num_threads, ulong, 0444);
100 MODULE_PARM_DESC(mds_attr_num_threads,
101 "number of MDS setattr service threads to start");
/* Used as cc_pattern of the setattr service. */
103 static char *mds_attr_num_cpts;
104 module_param(mds_attr_num_cpts, charp, 0444);
105 MODULE_PARM_DESC(mds_attr_num_cpts,
106 "CPU partitions MDS setattr threads should run on");
108 /* device init/fini methods */
/*
 * mds_stop_ptlrpc_service() - unregister every ptlrpc service of an MDS device
 * @m: MDS device whose services are torn down
 *
 * Holds m->mds_health_mutex for the whole teardown.  Each of the eight
 * service pointers is checked in turn: a non-NULL service is passed to
 * ptlrpc_unregister_service() and its pointer is then cleared.  Pointers
 * that are already NULL are skipped, which is what lets the error path of
 * mds_start_ptlrpc_service() call this after only some services were
 * registered.
 */
109 static void mds_stop_ptlrpc_service(struct mds_device *m)
113 mutex_lock(&m->mds_health_mutex);
114 if (m->mds_regular_service != NULL) {
115 ptlrpc_unregister_service(m->mds_regular_service);
116 m->mds_regular_service = NULL;
118 if (m->mds_readpage_service != NULL) {
119 ptlrpc_unregister_service(m->mds_readpage_service);
120 m->mds_readpage_service = NULL;
122 if (m->mds_out_service != NULL) {
123 ptlrpc_unregister_service(m->mds_out_service);
124 m->mds_out_service = NULL;
126 if (m->mds_setattr_service != NULL) {
127 ptlrpc_unregister_service(m->mds_setattr_service);
128 m->mds_setattr_service = NULL;
130 if (m->mds_mdsc_service != NULL) {
131 ptlrpc_unregister_service(m->mds_mdsc_service);
132 m->mds_mdsc_service = NULL;
134 if (m->mds_mdss_service != NULL) {
135 ptlrpc_unregister_service(m->mds_mdss_service);
136 m->mds_mdss_service = NULL;
138 if (m->mds_fld_service != NULL) {
139 ptlrpc_unregister_service(m->mds_fld_service);
140 m->mds_fld_service = NULL;
142 if (m->mds_io_service != NULL) {
143 ptlrpc_unregister_service(m->mds_io_service);
144 m->mds_io_service = NULL;
146 mutex_unlock(&m->mds_health_mutex);
/*
 * mds_start_ptlrpc_service() - register all ptlrpc services of an MDS device
 * @m: MDS device whose service pointers are filled in
 *
 * One static ptlrpc_service_conf is reassigned from a compound literal
 * before each ptlrpc_register_service() call.  Every call passes &conf and
 * m->mds_kset (further arguments are on lines not visible here).  The proc
 * entry is read from the owning obd device and asserted to be non-NULL.
 *
 * Services are registered in this order:
 *   1. regular  - MDS_REQUEST_PORTAL; tc_nthrs_user from mds_num_threads,
 *                 cc_pattern from mds_num_cpts, ptlrpc_hpreq_handler as
 *                 so_hpreq_handler
 *   2. readpage - MDS_READPAGE_PORTAL; mds_rdpg_num_threads and
 *                 mds_rdpg_num_cpts
 *   3. setattr  - MDS_SETATTR_PORTAL; mds_attr_num_threads and
 *                 mds_attr_num_cpts
 *   4. out      - OUT_PORTAL with replies on OSC_REPLY_PORTAL; reuses
 *                 mds_num_threads and mds_num_cpts
 *   5. seqs     - SEQ_CONTROLLER_PORTAL
 *   6. seqm     - SEQ_METADATA_PORTAL
 *   7. fld      - FLD_REQUEST_PORTAL
 *   8. io       - MDS_IO_PORTAL; tc_nthrs_max from mds_max_io_threads, and
 *                 the only service that sets so_thr_init and so_thr_done
 *
 * All services use tgt_request_handle as request handler and
 * target_print_req as request printer.
 *
 * When a registration yields an error pointer, the error code is logged
 * with CERROR() and the matching pointer in @m is reset to NULL.  For
 * services 2 to 8 control then goes to err_mds_svc via GOTO().
 *
 * NOTE(review): the declaration of rc, what follows the first failure, the
 * err_mds_svc label and the final return are on lines not visible here.
 * conf has static storage, so concurrent callers would share it - confirm
 * that callers are serialised.
 */
151 static int mds_start_ptlrpc_service(struct mds_device *m)
153 static struct ptlrpc_service_conf conf;
154 struct obd_device *obd = m->mds_md_dev.md_lu_dev.ld_obd;
155 struct proc_dir_entry *procfs_entry;
159 procfs_entry = obd->obd_proc_entry;
160 LASSERT(procfs_entry != NULL);
/* Regular MDS request service. */
162 conf = (typeof(conf)) {
163 .psc_name = LUSTRE_MDT_NAME,
164 .psc_watchdog_factor = MDT_SERVICE_WATCHDOG_FACTOR,
166 .bc_nbufs = MDS_NBUFS,
167 .bc_buf_size = MDS_REG_BUFSIZE,
168 .bc_req_max_size = MDS_REG_MAXREQSIZE,
169 .bc_rep_max_size = MDS_REG_MAXREPSIZE,
170 .bc_req_portal = MDS_REQUEST_PORTAL,
171 .bc_rep_portal = MDC_REPLY_PORTAL,
174 * We'd like to have a mechanism to set this on a per-device
178 .tc_thr_name = LUSTRE_MDT_NAME,
179 .tc_thr_factor = MDS_THR_FACTOR,
180 .tc_nthrs_init = MDS_NTHRS_INIT,
181 .tc_nthrs_base = MDS_NTHRS_BASE,
182 .tc_nthrs_max = MDS_NTHRS_MAX,
183 .tc_nthrs_user = mds_num_threads,
184 .tc_cpu_affinity = 1,
185 .tc_ctx_tags = LCT_MD_THREAD,
188 .cc_pattern = mds_num_cpts,
191 .so_req_handler = tgt_request_handle,
192 .so_req_printer = target_print_req,
193 .so_hpreq_handler = ptlrpc_hpreq_handler,
196 m->mds_regular_service = ptlrpc_register_service(&conf, m->mds_kset,
198 if (IS_ERR(m->mds_regular_service)) {
199 rc = PTR_ERR(m->mds_regular_service);
200 CERROR("failed to start regular mdt service: %d\n", rc);
201 m->mds_regular_service = NULL;
207 * readpage service configuration. Parameters have to be adjusted,
210 memset(&conf, 0, sizeof(conf));
211 conf = (typeof(conf)) {
212 .psc_name = LUSTRE_MDT_NAME "_readpage",
213 .psc_watchdog_factor = MDT_SERVICE_WATCHDOG_FACTOR,
215 .bc_nbufs = MDS_NBUFS,
216 .bc_buf_size = MDS_BUFSIZE,
217 .bc_req_max_size = MDS_MAXREQSIZE,
218 .bc_rep_max_size = MDS_MAXREPSIZE,
219 .bc_req_portal = MDS_READPAGE_PORTAL,
220 .bc_rep_portal = MDC_REPLY_PORTAL,
223 .tc_thr_name = LUSTRE_MDT_NAME "_rdpg",
224 .tc_thr_factor = MDS_RDPG_THR_FACTOR,
225 .tc_nthrs_init = MDS_RDPG_NTHRS_INIT,
226 .tc_nthrs_base = MDS_RDPG_NTHRS_BASE,
227 .tc_nthrs_max = MDS_RDPG_NTHRS_MAX,
228 .tc_nthrs_user = mds_rdpg_num_threads,
229 .tc_cpu_affinity = 1,
230 .tc_ctx_tags = LCT_MD_THREAD,
233 .cc_pattern = mds_rdpg_num_cpts,
236 .so_req_handler = tgt_request_handle,
237 .so_req_printer = target_print_req,
240 m->mds_readpage_service = ptlrpc_register_service(&conf, m->mds_kset,
242 if (IS_ERR(m->mds_readpage_service)) {
243 rc = PTR_ERR(m->mds_readpage_service);
244 CERROR("failed to start readpage service: %d\n", rc);
245 m->mds_readpage_service = NULL;
247 GOTO(err_mds_svc, rc);
251 * setattr service configuration.
253 * XXX To keep the compatibility with old client(< 2.2), we need to
254 * preserve this portal for a certain time, it should be removed
255 * eventually. LU-617.
257 memset(&conf, 0, sizeof(conf));
258 conf = (typeof(conf)) {
259 .psc_name = LUSTRE_MDT_NAME "_setattr",
260 .psc_watchdog_factor = MDT_SERVICE_WATCHDOG_FACTOR,
262 .bc_nbufs = MDS_NBUFS,
263 .bc_buf_size = MDS_BUFSIZE,
264 .bc_req_max_size = MDS_MAXREQSIZE,
265 .bc_rep_max_size = MDS_LOV_MAXREPSIZE,
266 .bc_req_portal = MDS_SETATTR_PORTAL,
267 .bc_rep_portal = MDC_REPLY_PORTAL,
270 .tc_thr_name = LUSTRE_MDT_NAME "_attr",
271 .tc_thr_factor = MDS_SETA_THR_FACTOR,
272 .tc_nthrs_init = MDS_SETA_NTHRS_INIT,
273 .tc_nthrs_base = MDS_SETA_NTHRS_BASE,
274 .tc_nthrs_max = MDS_SETA_NTHRS_MAX,
275 .tc_nthrs_user = mds_attr_num_threads,
276 .tc_cpu_affinity = 1,
277 .tc_ctx_tags = LCT_MD_THREAD,
280 .cc_pattern = mds_attr_num_cpts,
283 .so_req_handler = tgt_request_handle,
284 .so_req_printer = target_print_req,
285 .so_hpreq_handler = NULL,
288 m->mds_setattr_service = ptlrpc_register_service(&conf, m->mds_kset,
290 if (IS_ERR(m->mds_setattr_service)) {
291 rc = PTR_ERR(m->mds_setattr_service);
292 CERROR("failed to start setattr service: %d\n", rc);
293 m->mds_setattr_service = NULL;
295 GOTO(err_mds_svc, rc);
298 /* Object update service */
299 conf = (typeof(conf)) {
300 .psc_name = LUSTRE_MDT_NAME "_out",
301 .psc_watchdog_factor = MDT_SERVICE_WATCHDOG_FACTOR,
303 .bc_nbufs = MDS_NBUFS,
304 .bc_buf_size = OUT_BUFSIZE,
305 .bc_req_max_size = OUT_MAXREQSIZE,
306 .bc_rep_max_size = OUT_MAXREPSIZE,
307 .bc_req_portal = OUT_PORTAL,
308 .bc_rep_portal = OSC_REPLY_PORTAL,
311 * We'd like to have a mechanism to set this on a per-device
315 .tc_thr_name = LUSTRE_MDT_NAME "_out",
316 .tc_thr_factor = MDS_THR_FACTOR,
317 .tc_nthrs_init = MDS_NTHRS_INIT,
318 .tc_nthrs_base = MDS_NTHRS_BASE,
319 .tc_nthrs_max = MDS_NTHRS_MAX,
320 .tc_nthrs_user = mds_num_threads,
321 .tc_cpu_affinity = 1,
322 .tc_ctx_tags = LCT_MD_THREAD |
326 .cc_pattern = mds_num_cpts,
329 .so_req_handler = tgt_request_handle,
330 .so_req_printer = target_print_req,
331 .so_hpreq_handler = NULL,
334 m->mds_out_service = ptlrpc_register_service(&conf, m->mds_kset,
336 if (IS_ERR(m->mds_out_service)) {
337 rc = PTR_ERR(m->mds_out_service);
338 CERROR("failed to start out service: %d\n", rc);
339 m->mds_out_service = NULL;
340 GOTO(err_mds_svc, rc);
344 * sequence controller service configuration
346 memset(&conf, 0, sizeof(conf));
347 conf = (typeof(conf)) {
348 .psc_name = LUSTRE_MDT_NAME "_seqs",
349 .psc_watchdog_factor = MDT_SERVICE_WATCHDOG_FACTOR,
351 .bc_nbufs = MDS_NBUFS,
352 .bc_buf_size = SEQ_BUFSIZE,
353 .bc_req_max_size = SEQ_MAXREQSIZE,
354 .bc_rep_max_size = SEQ_MAXREPSIZE,
355 .bc_req_portal = SEQ_CONTROLLER_PORTAL,
356 .bc_rep_portal = MDC_REPLY_PORTAL,
359 .tc_thr_name = LUSTRE_MDT_NAME "_seqs",
360 .tc_nthrs_init = MDS_OTHR_NTHRS_INIT,
361 .tc_nthrs_max = MDS_OTHR_NTHRS_MAX,
362 .tc_ctx_tags = LCT_MD_THREAD,
365 .so_req_handler = tgt_request_handle,
366 .so_req_printer = target_print_req,
367 .so_hpreq_handler = NULL,
370 m->mds_mdsc_service = ptlrpc_register_service(&conf, m->mds_kset,
372 if (IS_ERR(m->mds_mdsc_service)) {
373 rc = PTR_ERR(m->mds_mdsc_service);
374 CERROR("failed to start seq controller service: %d\n", rc);
375 m->mds_mdsc_service = NULL;
377 GOTO(err_mds_svc, rc);
381 * metadata sequence server service configuration
383 memset(&conf, 0, sizeof(conf));
384 conf = (typeof(conf)) {
385 .psc_name = LUSTRE_MDT_NAME "_seqm",
386 .psc_watchdog_factor = MDT_SERVICE_WATCHDOG_FACTOR,
388 .bc_nbufs = MDS_NBUFS,
389 .bc_buf_size = SEQ_BUFSIZE,
390 .bc_req_max_size = SEQ_MAXREQSIZE,
391 .bc_rep_max_size = SEQ_MAXREPSIZE,
392 .bc_req_portal = SEQ_METADATA_PORTAL,
393 .bc_rep_portal = MDC_REPLY_PORTAL,
396 .tc_thr_name = LUSTRE_MDT_NAME "_seqm",
397 .tc_nthrs_init = MDS_OTHR_NTHRS_INIT,
398 .tc_nthrs_max = MDS_OTHR_NTHRS_MAX,
399 .tc_ctx_tags = LCT_MD_THREAD | LCT_DT_THREAD
402 .so_req_handler = tgt_request_handle,
403 .so_req_printer = target_print_req,
404 .so_hpreq_handler = NULL,
407 m->mds_mdss_service = ptlrpc_register_service(&conf, m->mds_kset,
409 if (IS_ERR(m->mds_mdss_service)) {
410 rc = PTR_ERR(m->mds_mdss_service);
411 CERROR("failed to start metadata seq server service: %d\n", rc);
412 m->mds_mdss_service = NULL;
414 GOTO(err_mds_svc, rc);
417 /* FLD service start */
418 memset(&conf, 0, sizeof(conf));
419 conf = (typeof(conf)) {
420 .psc_name = LUSTRE_MDT_NAME "_fld",
421 .psc_watchdog_factor = MDT_SERVICE_WATCHDOG_FACTOR,
423 .bc_nbufs = MDS_NBUFS,
424 .bc_buf_size = FLD_BUFSIZE,
425 .bc_req_max_size = FLD_MAXREQSIZE,
426 .bc_rep_max_size = FLD_MAXREPSIZE,
427 .bc_req_portal = FLD_REQUEST_PORTAL,
428 .bc_rep_portal = MDC_REPLY_PORTAL,
431 .tc_thr_name = LUSTRE_MDT_NAME "_fld",
432 .tc_nthrs_init = MDS_OTHR_NTHRS_INIT,
433 .tc_nthrs_max = MDS_OTHR_NTHRS_MAX,
434 .tc_ctx_tags = LCT_DT_THREAD | LCT_MD_THREAD,
437 .so_req_handler = tgt_request_handle,
438 .so_req_printer = target_print_req,
439 .so_hpreq_handler = NULL,
442 m->mds_fld_service = ptlrpc_register_service(&conf, m->mds_kset,
444 if (IS_ERR(m->mds_fld_service)) {
445 rc = PTR_ERR(m->mds_fld_service);
446 CERROR("failed to start fld service: %d\n", rc);
447 m->mds_fld_service = NULL;
449 GOTO(err_mds_svc, rc);
/*
 * MDT I/O service.  Buffers use the OST_ constants and threads the OSS_
 * constants; the thread maximum comes from mds_max_io_threads.
 */
452 memset(&conf, 0, sizeof(conf));
453 conf = (typeof(conf)) {
454 .psc_name = LUSTRE_MDT_NAME "_io",
455 .psc_watchdog_factor = MDT_SERVICE_WATCHDOG_FACTOR,
457 .bc_nbufs = OST_NBUFS,
458 .bc_buf_size = OST_IO_BUFSIZE,
459 .bc_req_max_size = OST_IO_MAXREQSIZE,
460 .bc_rep_max_size = OST_IO_MAXREPSIZE,
461 .bc_req_portal = MDS_IO_PORTAL,
462 .bc_rep_portal = MDC_REPLY_PORTAL,
465 .tc_thr_name = "ll_mdt_io",
466 .tc_thr_factor = OSS_THR_FACTOR,
467 .tc_nthrs_init = OSS_NTHRS_INIT,
468 .tc_nthrs_base = OSS_NTHRS_BASE,
469 .tc_nthrs_max = mds_max_io_threads,
470 .tc_cpu_affinity = 1,
471 .tc_ctx_tags = LCT_DT_THREAD | LCT_MD_THREAD,
474 .so_thr_init = tgt_io_thread_init,
475 .so_thr_done = tgt_io_thread_done,
476 .so_req_handler = tgt_request_handle,
477 .so_req_printer = target_print_req,
480 m->mds_io_service = ptlrpc_register_service(&conf, m->mds_kset,
482 if (IS_ERR(m->mds_io_service)) {
483 rc = PTR_ERR(m->mds_io_service);
484 CERROR("failed to start MDT I/O service: %d\n", rc);
485 m->mds_io_service = NULL;
486 GOTO(err_mds_svc, rc);
/*
 * Tear down whatever was registered; pointers already reset to NULL are
 * skipped by the stop routine.  NOTE(review): presumably this is the
 * err_mds_svc target - the label line is not visible here.
 */
492 mds_stop_ptlrpc_service(m);
/*
 * mds_dev() - map an lu_device to the mds_device that embeds it
 * @d: lu_device stored as mds_md_dev.md_lu_dev inside a struct mds_device
 *
 * Returns the result of container_of0() on that member path.
 */
497 static inline struct mds_device *mds_dev(struct lu_device *d)
499 return container_of0(d, struct mds_device, mds_md_dev.md_lu_dev);
/*
 * mds_device_fini() - shut down an MDS device
 * @env: execution environment (not used by the visible statements)
 *
 * Stops all ptlrpc services of the device and then unregisters its kset
 * from the owning obd device.  Installed as .ldto_device_fini.
 * NOTE(review): the remaining parameter and the return statement are on
 * lines not visible here.
 */
502 static struct lu_device *mds_device_fini(const struct lu_env *env,
505 struct mds_device *m = mds_dev(d);
506 struct obd_device *obd = d->ld_obd;
509 mds_stop_ptlrpc_service(m);
510 lprocfs_kset_unregister(obd, m->mds_kset);
/*
 * mds_device_free() - release an MDS device
 * @env: execution environment (not used by the visible statements)
 *
 * Finalises the embedded md_device with md_device_fini().  Installed as
 * .ldto_device_free and also used by the error paths of
 * mds_device_alloc().
 * NOTE(review): the remaining parameter, any release of the mds_device
 * memory and the return statement are on lines not visible here.
 */
514 static struct lu_device *mds_device_free(const struct lu_env *env,
517 struct mds_device *m = mds_dev(d);
520 md_device_fini(&m->mds_md_dev);
/*
 * mds_device_alloc() - create an MDS device and start its services
 * @env: execution environment, forwarded to mds_device_free() on failure
 * @t:   device type handed to md_device_init()
 * @cfg: configuration; string 0 names the obd device to attach to
 *
 * Visible steps, in order: initialise the embedded md_device, look up the
 * obd device by name (asserted to exist), register the kset, initialise
 * the health mutex and start the ptlrpc services.  A failure after the
 * device exists is undone with mds_device_free(); a service start failure
 * additionally unregisters the kset first.
 * NOTE(review): the allocation of m, the declarations of l and rc, the rc
 * checks and the success return are on lines not visible here.
 */
525 static struct lu_device *mds_device_alloc(const struct lu_env *env,
526 struct lu_device_type *t,
527 struct lustre_cfg *cfg)
529 struct mds_device *m;
530 struct obd_device *obd;
/* Presumably the failure path of the allocation of m - confirm. */
536 return ERR_PTR(-ENOMEM);
538 md_device_init(&m->mds_md_dev, t);
539 l = &m->mds_md_dev.md_lu_dev;
541 obd = class_name2obd(lustre_cfg_string(cfg, 0));
542 LASSERT(obd != NULL);
545 /* set this lu_device to obd, because error handling needs it */
548 rc = lprocfs_kset_register(obd, &m->mds_kset);
550 mds_device_free(env, l);
/*
 * The mutex must be ready before the services are started: the error path
 * of mds_start_ptlrpc_service() calls mds_stop_ptlrpc_service(), which
 * locks it.
 */
555 mutex_init(&m->mds_health_mutex);
557 rc = mds_start_ptlrpc_service(m);
/* Undo the kset registration before releasing the device. */
559 lprocfs_kset_unregister(obd, m->mds_kset);
560 mds_device_free(env, l);
567 /* type constructor/destructor: mds_type_init, mds_type_fini */
/*
 * Instantiated with the mds prefix and mdt_thread_key.  The ops table
 * below also references mds_type_start and mds_type_stop, which are not
 * defined anywhere else in the visible source - presumably this macro
 * provides them as well; confirm against its definition.
 */
568 LU_TYPE_INIT_FINI(mds, &mdt_thread_key);
/*
 * Operations of the MDS device type: the four mds_type_* hooks cover the
 * type itself, the three mds_device_* functions defined above cover
 * individual devices.
 */
570 static struct lu_device_type_operations mds_device_type_ops = {
571 .ldto_init = mds_type_init,
572 .ldto_fini = mds_type_fini,
574 .ldto_start = mds_type_start,
575 .ldto_stop = mds_type_stop,
577 .ldto_device_alloc = mds_device_alloc,
578 .ldto_device_free = mds_device_free,
579 .ldto_device_fini = mds_device_fini
/*
 * The MDS device type: tagged LU_DEVICE_MD, named LUSTRE_MDS_NAME, using
 * mds_device_type_ops and the LCT_MD_THREAD context tag.  Passed to
 * class_register_type() by mds_mod_init().
 */
582 static struct lu_device_type mds_device_type = {
583 .ldt_tags = LU_DEVICE_MD,
584 .ldt_name = LUSTRE_MDS_NAME,
585 .ldt_ops = &mds_device_type_ops,
586 .ldt_ctx_tags = LCT_MD_THREAD
/*
 * mds_health_check() - report whether any MDS ptlrpc service is unhealthy
 * @env: execution environment (not used by the visible statements)
 * @obd: obd device whose obd_lu_dev is the MDS device to inspect
 *
 * Under mds_health_mutex, ORs together the results of
 * ptlrpc_service_health_check() for all eight services.
 *
 * Return: 1 if the combined result is non-zero, otherwise 0.
 * NOTE(review): the declaration and initial value of rc are on lines not
 * visible here.
 */
589 static int mds_health_check(const struct lu_env *env, struct obd_device *obd)
591 struct mds_device *mds = mds_dev(obd->obd_lu_dev);
595 mutex_lock(&mds->mds_health_mutex);
596 rc |= ptlrpc_service_health_check(mds->mds_regular_service);
597 rc |= ptlrpc_service_health_check(mds->mds_readpage_service);
598 rc |= ptlrpc_service_health_check(mds->mds_out_service);
599 rc |= ptlrpc_service_health_check(mds->mds_setattr_service);
600 rc |= ptlrpc_service_health_check(mds->mds_mdsc_service);
601 rc |= ptlrpc_service_health_check(mds->mds_mdss_service);
602 rc |= ptlrpc_service_health_check(mds->mds_fld_service);
603 rc |= ptlrpc_service_health_check(mds->mds_io_service);
604 mutex_unlock(&mds->mds_health_mutex);
/* Collapse the OR-ed results to a plain 0/1 answer. */
606 return rc != 0 ? 1 : 0;
/*
 * obd operations of the MDS type: owned by this module, with
 * mds_health_check() as the only method set here.
 */
609 static struct obd_ops mds_obd_device_ops = {
610 .o_owner = THIS_MODULE,
611 .o_health_check = mds_health_check,
/*
 * mds_mod_init() - register the MDS obd type
 *
 * Registers mds_obd_device_ops and mds_device_type under LUSTRE_MDS_NAME.
 *
 * Return: the value returned by class_register_type().
 */
614 int mds_mod_init(void)
616 return class_register_type(&mds_obd_device_ops, NULL, true, NULL,
617 LUSTRE_MDS_NAME, &mds_device_type);
/*
 * mds_mod_exit() - unregister the obd type that mds_mod_init() registered
 * under LUSTRE_MDS_NAME.
 */
620 void mds_mod_exit(void)
622 class_unregister_type(LUSTRE_MDS_NAME);