4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License version 2 for more details. A copy is
14 * included in the COPYING file that accompanied this code.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2013, 2017, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
31 * lustre/mdt/mdt_mds.c
33 * Lustre Metadata Service Layer
35 * Author: Di Wang <di.wang@whamcloud.com>
38 #define DEBUG_SUBSYSTEM S_MDS
40 #include <linux/module.h>
42 #include <obd_support.h>
43 /* struct ptlrpc_request */
44 #include <lustre_net.h>
45 /* struct obd_export */
46 #include <lustre_export.h>
47 /* struct obd_device */
50 #include <dt_object.h>
51 #include <lustre_mds.h>
52 #include "mdt_internal.h"
53 #include <lustre_quota.h>
54 #include <lustre_acl.h>
55 #include <uapi/linux/lustre/lustre_param.h>
/*
 * Per-device MDS state: the embedded md_device, one handle for each PTLRPC
 * service registered by mds_start_ptlrpc_service(), and the mutex held by
 * mds_stop_ptlrpc_service() and mds_health_check() while they touch those
 * handles.
 *
 * NOTE(review): the code below accesses these as members of
 * struct mds_device (m->mds_regular_service, m->mds_md_dev, ...), so the
 * enclosing struct opening and closing lines appear to be missing from this
 * listing -- confirm against the full file.
 */
/* embedded md_device; mds_dev() maps its md_lu_dev back to the mds_device */
59 struct md_device mds_md_dev;
/* service named LUSTRE_MDT_NAME, request portal MDS_REQUEST_PORTAL */
60 struct ptlrpc_service *mds_regular_service;
/* readpage service, request portal MDS_READPAGE_PORTAL */
61 struct ptlrpc_service *mds_readpage_service;
/* object update (out) service, request portal OUT_PORTAL */
62 struct ptlrpc_service *mds_out_service;
/* setattr service, request portal MDS_SETATTR_PORTAL */
63 struct ptlrpc_service *mds_setattr_service;
/* sequence controller service, request portal SEQ_CONTROLLER_PORTAL */
64 struct ptlrpc_service *mds_mdsc_service;
/* metadata sequence server service, request portal SEQ_METADATA_PORTAL */
65 struct ptlrpc_service *mds_mdss_service;
/* FLD service, request portal FLD_REQUEST_PORTAL */
66 struct ptlrpc_service *mds_fld_service;
/* MDT I/O service, request portal MDS_IO_PORTAL */
67 struct ptlrpc_service *mds_io_service;
/* taken around service teardown and around the health check */
68 struct mutex mds_health_mutex;
/*
 * Module parameters, all registered with mode 0444. They feed the thread
 * count and CPT pattern fields of the ptlrpc_service_conf that
 * mds_start_ptlrpc_service() fills in.
 *
 * NOTE(review): the fragment below says these are initialized in
 * mds_mod_init(), but mds_mod_init() in this file only calls
 * class_register_type() -- the remark looks stale; confirm.
 */
72 * * Initialized in mds_mod_init().
/* tc_nthrs_user for the regular and the out services */
74 static unsigned long mds_num_threads;
75 module_param(mds_num_threads, ulong, 0444);
76 MODULE_PARM_DESC(mds_num_threads, "number of MDS service threads to start");
/* tc_nthrs_max for the I/O service; defaults to 512 and is not static */
78 int mds_max_io_threads = 512;
79 module_param(mds_max_io_threads, int, 0444);
80 MODULE_PARM_DESC(mds_max_io_threads, "maximum number of MDS IO service threads");
/* cc_pattern for the regular and the out services */
82 static char *mds_num_cpts;
83 module_param(mds_num_cpts, charp, 0444);
84 MODULE_PARM_DESC(mds_num_cpts, "CPU partitions MDS threads should run on");
/* tc_nthrs_user for the readpage service */
86 static unsigned long mds_rdpg_num_threads;
87 module_param(mds_rdpg_num_threads, ulong, 0444);
88 MODULE_PARM_DESC(mds_rdpg_num_threads,
89 "number of MDS readpage service threads to start");
/* cc_pattern for the readpage service */
91 static char *mds_rdpg_num_cpts;
92 module_param(mds_rdpg_num_cpts, charp, 0444);
93 MODULE_PARM_DESC(mds_rdpg_num_cpts,
94 "CPU partitions MDS readpage threads should run on");
96 /* NB: these two should be removed along with setattr service in the future */
/* tc_nthrs_user for the setattr service */
97 static unsigned long mds_attr_num_threads;
98 module_param(mds_attr_num_threads, ulong, 0444);
99 MODULE_PARM_DESC(mds_attr_num_threads,
100 "number of MDS setattr service threads to start");
/* cc_pattern for the setattr service */
102 static char *mds_attr_num_cpts;
103 module_param(mds_attr_num_cpts, charp, 0444);
104 MODULE_PARM_DESC(mds_attr_num_cpts,
105 "CPU partitions MDS setattr threads should run on");
107 /* device init/fini methods */
/*
 * Unregister every PTLRPC service that is currently registered on \a m.
 *
 * Holds m->mds_health_mutex for the whole teardown, the same mutex that
 * mds_health_check() takes. Each non-NULL service handle is passed to
 * ptlrpc_unregister_service() and then reset to NULL, so calling this on a
 * partially started or already stopped device only touches the handles that
 * are still set. Used by the error path of mds_start_ptlrpc_service() and by
 * mds_device_fini().
 *
 * \param m  MDS device whose services are stopped
 */
108 static void mds_stop_ptlrpc_service(struct mds_device *m)
112 mutex_lock(&m->mds_health_mutex);
/* regular request service */
113 if (m->mds_regular_service != NULL) {
114 ptlrpc_unregister_service(m->mds_regular_service);
115 m->mds_regular_service = NULL;
/* readpage service */
117 if (m->mds_readpage_service != NULL) {
118 ptlrpc_unregister_service(m->mds_readpage_service);
119 m->mds_readpage_service = NULL;
/* object update (out) service */
121 if (m->mds_out_service != NULL) {
122 ptlrpc_unregister_service(m->mds_out_service);
123 m->mds_out_service = NULL;
/* setattr service */
125 if (m->mds_setattr_service != NULL) {
126 ptlrpc_unregister_service(m->mds_setattr_service);
127 m->mds_setattr_service = NULL;
/* sequence controller service */
129 if (m->mds_mdsc_service != NULL) {
130 ptlrpc_unregister_service(m->mds_mdsc_service);
131 m->mds_mdsc_service = NULL;
/* metadata sequence server service */
133 if (m->mds_mdss_service != NULL) {
134 ptlrpc_unregister_service(m->mds_mdss_service);
135 m->mds_mdss_service = NULL;
/* FLD service */
137 if (m->mds_fld_service != NULL) {
138 ptlrpc_unregister_service(m->mds_fld_service);
139 m->mds_fld_service = NULL;
/* MDT I/O service */
141 if (m->mds_io_service != NULL) {
142 ptlrpc_unregister_service(m->mds_io_service);
143 m->mds_io_service = NULL;
145 mutex_unlock(&m->mds_health_mutex);
/*
 * Register the PTLRPC services of MDS device \a m.
 *
 * One function-static ptlrpc_service_conf is refilled and handed to
 * ptlrpc_register_service() once per service, together with the obd kset
 * and debugfs entry of the owning obd_device. Services are registered in
 * this order (name suffix after LUSTRE_MDT_NAME, request portal):
 *
 *   regular              (none)       MDS_REQUEST_PORTAL
 *   readpage             _readpage    MDS_READPAGE_PORTAL
 *   setattr              _setattr     MDS_SETATTR_PORTAL
 *   object update        _out         OUT_PORTAL
 *   sequence controller  _seqs        SEQ_CONTROLLER_PORTAL
 *   metadata seq server  _seqm        SEQ_METADATA_PORTAL
 *   FLD                  _fld         FLD_REQUEST_PORTAL
 *   MDT I/O              _io          MDS_IO_PORTAL
 *
 * Every service uses tgt_request_handle and target_print_req. All reply on
 * MDC_REPLY_PORTAL except the out service, which replies on
 * OSC_REPLY_PORTAL. The regular service installs ptlrpc_hpreq_handler; the
 * I/O service installs tgt_hpreq_handler plus the tgt_io_thread_init and
 * tgt_io_thread_done hooks.
 *
 * When a registration fails, the error is taken from the returned pointer
 * with PTR_ERR(), logged with CERROR() and the handle is reset to NULL.
 * For every service after the first one the visible code then jumps to
 * err_mds_svc, where mds_stop_ptlrpc_service() unregisters whatever was
 * already started.
 *
 * NOTE(review): the declaration of rc, the err_mds_svc label, the return
 * statements and the failure exit of the first registration are not
 * visible in this listing -- confirm against the full file.
 * NOTE(review): conf is function-static, so concurrent callers would share
 * it -- confirm that callers are serialized.
 * NOTE(review): conf is memset() before most refills but not before the
 * _out one; each refill assigns a whole compound literal, so the memset()
 * calls look redundant -- confirm before removing.
 *
 * \param m  MDS device whose service handles are filled in
 */
150 static int mds_start_ptlrpc_service(struct mds_device *m)
152 static struct ptlrpc_service_conf conf;
153 struct obd_device *obd = m->mds_md_dev.md_lu_dev.ld_obd;
157 conf = (typeof(conf)) {
158 .psc_name = LUSTRE_MDT_NAME,
159 .psc_watchdog_factor = MDT_SERVICE_WATCHDOG_FACTOR,
161 .bc_nbufs = MDS_NBUFS,
162 .bc_buf_size = MDS_REG_BUFSIZE,
163 .bc_req_max_size = MDS_REG_MAXREQSIZE,
164 .bc_rep_max_size = MDS_REG_MAXREPSIZE,
165 .bc_req_portal = MDS_REQUEST_PORTAL,
166 .bc_rep_portal = MDC_REPLY_PORTAL,
169 * We'd like to have a mechanism to set this on a per-device
173 .tc_thr_name = LUSTRE_MDT_NAME,
174 .tc_thr_factor = MDS_THR_FACTOR,
175 .tc_nthrs_init = MDS_NTHRS_INIT,
176 .tc_nthrs_base = MDS_NTHRS_BASE,
177 .tc_nthrs_max = MDS_NTHRS_MAX,
178 .tc_nthrs_user = mds_num_threads,
179 .tc_cpu_affinity = 1,
180 .tc_ctx_tags = LCT_MD_THREAD,
183 .cc_pattern = mds_num_cpts,
186 .so_req_handler = tgt_request_handle,
187 .so_req_printer = target_print_req,
188 .so_hpreq_handler = ptlrpc_hpreq_handler,
191 m->mds_regular_service = ptlrpc_register_service(&conf, &obd->obd_kset,
192 obd->obd_debugfs_entry);
193 if (IS_ERR(m->mds_regular_service)) {
194 rc = PTR_ERR(m->mds_regular_service);
195 CERROR("failed to start regular mdt service: %d\n", rc);
196 m->mds_regular_service = NULL;
202 * readpage service configuration. Parameters have to be adjusted,
205 memset(&conf, 0, sizeof(conf));
206 conf = (typeof(conf)) {
207 .psc_name = LUSTRE_MDT_NAME "_readpage",
208 .psc_watchdog_factor = MDT_SERVICE_WATCHDOG_FACTOR,
210 .bc_nbufs = MDS_NBUFS,
211 .bc_buf_size = MDS_BUFSIZE,
212 .bc_req_max_size = MDS_MAXREQSIZE,
213 .bc_rep_max_size = MDS_MAXREPSIZE,
214 .bc_req_portal = MDS_READPAGE_PORTAL,
215 .bc_rep_portal = MDC_REPLY_PORTAL,
218 .tc_thr_name = LUSTRE_MDT_NAME "_rdpg",
219 .tc_thr_factor = MDS_RDPG_THR_FACTOR,
220 .tc_nthrs_init = MDS_RDPG_NTHRS_INIT,
221 .tc_nthrs_base = MDS_RDPG_NTHRS_BASE,
222 .tc_nthrs_max = MDS_RDPG_NTHRS_MAX,
223 .tc_nthrs_user = mds_rdpg_num_threads,
224 .tc_cpu_affinity = 1,
225 .tc_ctx_tags = LCT_MD_THREAD,
228 .cc_pattern = mds_rdpg_num_cpts,
231 .so_req_handler = tgt_request_handle,
232 .so_req_printer = target_print_req,
235 m->mds_readpage_service = ptlrpc_register_service(&conf, &obd->obd_kset,
236 obd->obd_debugfs_entry);
237 if (IS_ERR(m->mds_readpage_service)) {
238 rc = PTR_ERR(m->mds_readpage_service);
239 CERROR("failed to start readpage service: %d\n", rc);
240 m->mds_readpage_service = NULL;
242 GOTO(err_mds_svc, rc);
246 * setattr service configuration.
248 * XXX To keep the compatibility with old client(< 2.2), we need to
249 * preserve this portal for a certain time, it should be removed
250 * eventually. LU-617.
252 memset(&conf, 0, sizeof(conf));
253 conf = (typeof(conf)) {
254 .psc_name = LUSTRE_MDT_NAME "_setattr",
255 .psc_watchdog_factor = MDT_SERVICE_WATCHDOG_FACTOR,
257 .bc_nbufs = MDS_NBUFS,
258 .bc_buf_size = MDS_BUFSIZE,
259 .bc_req_max_size = MDS_MAXREQSIZE,
260 .bc_rep_max_size = MDS_LOV_MAXREPSIZE,
261 .bc_req_portal = MDS_SETATTR_PORTAL,
262 .bc_rep_portal = MDC_REPLY_PORTAL,
265 .tc_thr_name = LUSTRE_MDT_NAME "_attr",
266 .tc_thr_factor = MDS_SETA_THR_FACTOR,
267 .tc_nthrs_init = MDS_SETA_NTHRS_INIT,
268 .tc_nthrs_base = MDS_SETA_NTHRS_BASE,
269 .tc_nthrs_max = MDS_SETA_NTHRS_MAX,
270 .tc_nthrs_user = mds_attr_num_threads,
271 .tc_cpu_affinity = 1,
272 .tc_ctx_tags = LCT_MD_THREAD,
275 .cc_pattern = mds_attr_num_cpts,
278 .so_req_handler = tgt_request_handle,
279 .so_req_printer = target_print_req,
280 .so_hpreq_handler = NULL,
283 m->mds_setattr_service = ptlrpc_register_service(&conf, &obd->obd_kset,
284 obd->obd_debugfs_entry);
285 if (IS_ERR(m->mds_setattr_service)) {
286 rc = PTR_ERR(m->mds_setattr_service);
287 CERROR("failed to start setattr service: %d\n", rc);
288 m->mds_setattr_service = NULL;
290 GOTO(err_mds_svc, rc);
293 /* Object update service */
294 conf = (typeof(conf)) {
295 .psc_name = LUSTRE_MDT_NAME "_out",
296 .psc_watchdog_factor = MDT_SERVICE_WATCHDOG_FACTOR,
298 .bc_nbufs = MDS_NBUFS,
299 .bc_buf_size = OUT_BUFSIZE,
300 .bc_req_max_size = OUT_MAXREQSIZE,
301 .bc_rep_max_size = OUT_MAXREPSIZE,
302 .bc_req_portal = OUT_PORTAL,
303 .bc_rep_portal = OSC_REPLY_PORTAL,
306 * We'd like to have a mechanism to set this on a per-device
310 .tc_thr_name = LUSTRE_MDT_NAME "_out",
311 .tc_thr_factor = MDS_THR_FACTOR,
312 .tc_nthrs_init = MDS_NTHRS_INIT,
313 .tc_nthrs_base = MDS_NTHRS_BASE,
314 .tc_nthrs_max = MDS_NTHRS_MAX,
315 .tc_nthrs_user = mds_num_threads,
316 .tc_cpu_affinity = 1,
317 .tc_ctx_tags = LCT_MD_THREAD |
321 .cc_pattern = mds_num_cpts,
324 .so_req_handler = tgt_request_handle,
325 .so_req_printer = target_print_req,
326 .so_hpreq_handler = NULL,
329 m->mds_out_service = ptlrpc_register_service(&conf, &obd->obd_kset,
330 obd->obd_debugfs_entry);
331 if (IS_ERR(m->mds_out_service)) {
332 rc = PTR_ERR(m->mds_out_service);
333 CERROR("failed to start out service: %d\n", rc);
334 m->mds_out_service = NULL;
335 GOTO(err_mds_svc, rc);
339 * sequence controller service configuration
341 memset(&conf, 0, sizeof(conf));
342 conf = (typeof(conf)) {
343 .psc_name = LUSTRE_MDT_NAME "_seqs",
344 .psc_watchdog_factor = MDT_SERVICE_WATCHDOG_FACTOR,
346 .bc_nbufs = MDS_NBUFS,
347 .bc_buf_size = SEQ_BUFSIZE,
348 .bc_req_max_size = SEQ_MAXREQSIZE,
349 .bc_rep_max_size = SEQ_MAXREPSIZE,
350 .bc_req_portal = SEQ_CONTROLLER_PORTAL,
351 .bc_rep_portal = MDC_REPLY_PORTAL,
354 .tc_thr_name = LUSTRE_MDT_NAME "_seqs",
355 .tc_nthrs_init = MDS_OTHR_NTHRS_INIT,
356 .tc_nthrs_max = MDS_OTHR_NTHRS_MAX,
357 .tc_ctx_tags = LCT_MD_THREAD,
360 .so_req_handler = tgt_request_handle,
361 .so_req_printer = target_print_req,
362 .so_hpreq_handler = NULL,
365 m->mds_mdsc_service = ptlrpc_register_service(&conf, &obd->obd_kset,
366 obd->obd_debugfs_entry);
367 if (IS_ERR(m->mds_mdsc_service)) {
368 rc = PTR_ERR(m->mds_mdsc_service);
369 CERROR("failed to start seq controller service: %d\n", rc);
370 m->mds_mdsc_service = NULL;
372 GOTO(err_mds_svc, rc);
376 * metadata sequence server service configuration
378 memset(&conf, 0, sizeof(conf));
379 conf = (typeof(conf)) {
380 .psc_name = LUSTRE_MDT_NAME "_seqm",
381 .psc_watchdog_factor = MDT_SERVICE_WATCHDOG_FACTOR,
383 .bc_nbufs = MDS_NBUFS,
384 .bc_buf_size = SEQ_BUFSIZE,
385 .bc_req_max_size = SEQ_MAXREQSIZE,
386 .bc_rep_max_size = SEQ_MAXREPSIZE,
387 .bc_req_portal = SEQ_METADATA_PORTAL,
388 .bc_rep_portal = MDC_REPLY_PORTAL,
391 .tc_thr_name = LUSTRE_MDT_NAME "_seqm",
392 .tc_nthrs_init = MDS_OTHR_NTHRS_INIT,
393 .tc_nthrs_max = MDS_OTHR_NTHRS_MAX,
394 .tc_ctx_tags = LCT_MD_THREAD | LCT_DT_THREAD
397 .so_req_handler = tgt_request_handle,
398 .so_req_printer = target_print_req,
399 .so_hpreq_handler = NULL,
402 m->mds_mdss_service = ptlrpc_register_service(&conf, &obd->obd_kset,
403 obd->obd_debugfs_entry);
404 if (IS_ERR(m->mds_mdss_service)) {
405 rc = PTR_ERR(m->mds_mdss_service);
406 CERROR("failed to start metadata seq server service: %d\n", rc);
407 m->mds_mdss_service = NULL;
409 GOTO(err_mds_svc, rc);
412 /* FLD service start */
413 memset(&conf, 0, sizeof(conf));
414 conf = (typeof(conf)) {
415 .psc_name = LUSTRE_MDT_NAME "_fld",
416 .psc_watchdog_factor = MDT_SERVICE_WATCHDOG_FACTOR,
418 .bc_nbufs = MDS_NBUFS,
419 .bc_buf_size = FLD_BUFSIZE,
420 .bc_req_max_size = FLD_MAXREQSIZE,
421 .bc_rep_max_size = FLD_MAXREPSIZE,
422 .bc_req_portal = FLD_REQUEST_PORTAL,
423 .bc_rep_portal = MDC_REPLY_PORTAL,
426 .tc_thr_name = LUSTRE_MDT_NAME "_fld",
427 .tc_nthrs_init = MDS_OTHR_NTHRS_INIT,
428 .tc_nthrs_max = MDS_OTHR_NTHRS_MAX,
429 .tc_ctx_tags = LCT_DT_THREAD | LCT_MD_THREAD,
432 .so_req_handler = tgt_request_handle,
433 .so_req_printer = target_print_req,
434 .so_hpreq_handler = NULL,
437 m->mds_fld_service = ptlrpc_register_service(&conf, &obd->obd_kset,
438 obd->obd_debugfs_entry);
439 if (IS_ERR(m->mds_fld_service)) {
440 rc = PTR_ERR(m->mds_fld_service);
441 CERROR("failed to start fld service: %d\n", rc);
442 m->mds_fld_service = NULL;
444 GOTO(err_mds_svc, rc);
/*
 * MDT I/O service: sized with the OST_IO_ and OSS_ constants rather than
 * the MDS_ ones, thread ceiling taken from the mds_max_io_threads module
 * parameter, and the only service here with per-thread init/done hooks.
 */
447 memset(&conf, 0, sizeof(conf));
448 conf = (typeof(conf)) {
449 .psc_name = LUSTRE_MDT_NAME "_io",
450 .psc_watchdog_factor = MDT_SERVICE_WATCHDOG_FACTOR,
452 .bc_nbufs = OST_NBUFS,
453 .bc_buf_size = OST_IO_BUFSIZE,
454 .bc_req_max_size = OST_IO_MAXREQSIZE,
455 .bc_rep_max_size = OST_IO_MAXREPSIZE,
456 .bc_req_portal = MDS_IO_PORTAL,
457 .bc_rep_portal = MDC_REPLY_PORTAL,
460 .tc_thr_name = "ll_mdt_io",
461 .tc_thr_factor = OSS_THR_FACTOR,
462 .tc_nthrs_init = OSS_NTHRS_INIT,
463 .tc_nthrs_base = OSS_NTHRS_BASE,
464 .tc_nthrs_max = mds_max_io_threads,
465 .tc_cpu_affinity = 1,
466 .tc_ctx_tags = LCT_DT_THREAD | LCT_MD_THREAD,
469 .so_thr_init = tgt_io_thread_init,
470 .so_thr_done = tgt_io_thread_done,
471 .so_req_handler = tgt_request_handle,
472 .so_req_printer = target_print_req,
473 .so_hpreq_handler = tgt_hpreq_handler,
476 m->mds_io_service = ptlrpc_register_service(&conf, &obd->obd_kset,
477 obd->obd_debugfs_entry);
478 if (IS_ERR(m->mds_io_service)) {
479 rc = PTR_ERR(m->mds_io_service);
480 CERROR("failed to start MDT I/O service: %d\n", rc);
481 m->mds_io_service = NULL;
482 GOTO(err_mds_svc, rc);
/*
 * Failure path: unregister every service that was registered before the
 * error. NOTE(review): presumably this sits under the err_mds_svc label,
 * which is not visible in this listing.
 */
488 mds_stop_ptlrpc_service(m);
/*
 * Map a generic lu_device back to the mds_device that embeds it.
 *
 * \param d  lu_device located at mds_md_dev.md_lu_dev inside an mds_device
 * \return   the enclosing mds_device, computed with container_of0()
 */
493 static inline struct mds_device *mds_dev(struct lu_device *d)
495 return container_of0(d, struct mds_device, mds_md_dev.md_lu_dev);
/*
 * Device fini method (installed as ldto_device_fini).
 *
 * Stops all PTLRPC services of the device and then calls
 * lprocfs_obd_cleanup() on the obd_device attached to \a d.
 *
 * NOTE(review): the remaining parameter line and the return statement are
 * not visible in this listing -- confirm the return value against the full
 * file.
 */
498 static struct lu_device *mds_device_fini(const struct lu_env *env,
501 struct mds_device *m = mds_dev(d);
502 struct obd_device *obd = d->ld_obd;
/* services first, so nothing is still running when lprocfs is cleaned up */
505 mds_stop_ptlrpc_service(m);
506 lprocfs_obd_cleanup(obd);
/*
 * Device free method (installed as ldto_device_free); also used by
 * mds_device_alloc() to undo a partially constructed device.
 *
 * Finalizes the embedded md_device with md_device_fini().
 *
 * NOTE(review): the remaining parameter line, any release of the
 * mds_device memory and the return statement are not visible in this
 * listing -- confirm against the full file.
 */
510 static struct lu_device *mds_device_free(const struct lu_env *env,
513 struct mds_device *m = mds_dev(d);
516 md_device_fini(&m->mds_md_dev);
/*
 * Device alloc method (installed as ldto_device_alloc).
 *
 * Visible steps, in order:
 *  - return ERR_PTR(-ENOMEM) on the early failure path (the allocation and
 *    its check are not visible in this listing);
 *  - initialize the embedded md_device for type \a t and take its lu_device;
 *  - look up the obd_device by the name in string 0 of \a cfg and assert
 *    that it exists;
 *  - set up lprocfs for the obd; on failure free the device again;
 *  - initialize mds_health_mutex and start the PTLRPC services; on failure
 *    clean up lprocfs and free the device.
 *
 * NOTE(review): the declarations of l and rc, the linking of l and obd, the
 * rc checks and the return statements are not visible in this listing --
 * confirm against the full file.
 *
 * \param env  execution environment, passed through to mds_device_free()
 * \param t    device type used to initialize the embedded md_device
 * \param cfg  configuration record; string 0 names the obd_device
 */
521 static struct lu_device *mds_device_alloc(const struct lu_env *env,
522 struct lu_device_type *t,
523 struct lustre_cfg *cfg)
525 struct mds_device *m;
526 struct obd_device *obd;
532 return ERR_PTR(-ENOMEM);
534 md_device_init(&m->mds_md_dev, t);
535 l = &m->mds_md_dev.md_lu_dev;
537 obd = class_name2obd(lustre_cfg_string(cfg, 0));
538 LASSERT(obd != NULL);
541 /* set this lu_device to obd, because error handling needs it */
544 rc = lprocfs_obd_setup(obd, true);
/* lprocfs setup failed: nothing else to undo yet, just free the device */
546 mds_device_free(env, l);
/* must be ready before the services start and before any health check */
551 mutex_init(&m->mds_health_mutex);
553 rc = mds_start_ptlrpc_service(m);
/* service start failed: undo lprocfs setup, then free the device */
555 lprocfs_obd_cleanup(obd);
556 mds_device_free(env, l);
/*
 * NOTE(review): mds_type_init, mds_type_fini, mds_type_start and
 * mds_type_stop are referenced by mds_device_type_ops but not defined
 * anywhere else in this listing, so they are presumably generated by this
 * macro from the mds prefix -- confirm against the macro definition.
 */
563 /* type constructor/destructor: mds_type_init, mds_type_fini */
564 LU_TYPE_INIT_FINI(mds, &mdt_thread_key);
/*
 * Type-level operations for the MDS device type: the mds_type_* hooks for
 * init/fini/start/stop, and the mds_device_* functions defined above for
 * device alloc/free/fini. Referenced by mds_device_type.ldt_ops.
 */
566 static struct lu_device_type_operations mds_device_type_ops = {
567 .ldto_init = mds_type_init,
568 .ldto_fini = mds_type_fini,
570 .ldto_start = mds_type_start,
571 .ldto_stop = mds_type_stop,
573 .ldto_device_alloc = mds_device_alloc,
574 .ldto_device_free = mds_device_free,
575 .ldto_device_fini = mds_device_fini
/*
 * MDS device type descriptor: tagged LU_DEVICE_MD, named LUSTRE_MDS_NAME,
 * with LCT_MD_THREAD context tags. Passed to class_register_type() by
 * mds_mod_init().
 */
578 static struct lu_device_type mds_device_type = {
579 .ldt_tags = LU_DEVICE_MD,
580 .ldt_name = LUSTRE_MDS_NAME,
581 .ldt_ops = &mds_device_type_ops,
582 .ldt_ctx_tags = LCT_MD_THREAD
/*
 * Health check method (installed as o_health_check).
 *
 * Under mds_health_mutex, ORs together the results of
 * ptlrpc_service_health_check() for all eight service handles of the
 * device, then collapses the result to 1 (any check was non-zero) or 0.
 *
 * NOTE(review): the declaration and initial value of rc are not visible in
 * this listing -- confirm it starts at 0.
 * NOTE(review): a handle is NULL after mds_stop_ptlrpc_service() or after a
 * failed registration and is passed on unchecked here -- confirm that
 * ptlrpc_service_health_check() accepts NULL.
 *
 * \param env  execution environment (not used in the visible code)
 * \param obd  obd_device whose obd_lu_dev is embedded in an mds_device
 * \return     1 if any service health check returned non-zero, otherwise 0
 */
585 static int mds_health_check(const struct lu_env *env, struct obd_device *obd)
587 struct mds_device *mds = mds_dev(obd->obd_lu_dev);
/* same mutex as mds_stop_ptlrpc_service(), so handles cannot change here */
591 mutex_lock(&mds->mds_health_mutex);
592 rc |= ptlrpc_service_health_check(mds->mds_regular_service);
593 rc |= ptlrpc_service_health_check(mds->mds_readpage_service);
594 rc |= ptlrpc_service_health_check(mds->mds_out_service);
595 rc |= ptlrpc_service_health_check(mds->mds_setattr_service);
596 rc |= ptlrpc_service_health_check(mds->mds_mdsc_service);
597 rc |= ptlrpc_service_health_check(mds->mds_mdss_service);
598 rc |= ptlrpc_service_health_check(mds->mds_fld_service);
599 rc |= ptlrpc_service_health_check(mds->mds_io_service);
600 mutex_unlock(&mds->mds_health_mutex);
/* normalize the OR of all results to a plain 0/1 answer */
602 return rc != 0 ? 1 : 0;
/*
 * obd operations for the MDS obd type: module owner plus the health check.
 * Passed to class_register_type() by mds_mod_init().
 */
605 static struct obd_ops mds_obd_device_ops = {
606 .o_owner = THIS_MODULE,
607 .o_health_check = mds_health_check,
/*
 * Register the MDS obd type under LUSTRE_MDS_NAME, with mds_obd_device_ops
 * as its obd operations and mds_device_type as its lu_device type.
 *
 * \return  the value returned by class_register_type()
 */
610 int mds_mod_init(void)
612 return class_register_type(&mds_obd_device_ops, NULL, false, NULL,
613 LUSTRE_MDS_NAME, &mds_device_type);
/*
 * Unregister the obd type that mds_mod_init() registered under
 * LUSTRE_MDS_NAME.
 */
616 void mds_mod_exit(void)
618 class_unregister_type(LUSTRE_MDS_NAME);