Whamcloud - gitweb
LU-17871 ldlm: FLOCK ownlocks may be not set
[fs/lustre-release.git] / lustre / mdt / mdt_mds.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License version 2 for more details.  A copy is
14  * included in the COPYING file that accompanied this code.
15
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19  *
20  * GPL HEADER END
21  */
22 /*
23  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Use is subject to license terms.
25  *
26  * Copyright (c) 2013, 2017, Intel Corporation.
27  */
28 /*
29  * This file is part of Lustre, http://www.lustre.org/
30  *
31  * lustre/mdt/mdt_mds.c
32  *
33  * Lustre Metadata Service Layer
34  *
35  * Author: Di Wang <di.wang@whamcloud.com>
36  **/
37
38 #define DEBUG_SUBSYSTEM S_MDS
39
40 #include <linux/module.h>
41
42 #include <obd_support.h>
43 /* struct ptlrpc_request */
44 #include <lustre_net.h>
45 /* struct obd_export */
46 #include <lustre_export.h>
47 /* struct obd_device */
48 #include <obd.h>
49 /* lu2dt_dev() */
50 #include <dt_object.h>
51 #include <lustre_mds.h>
52 #include "mdt_internal.h"
53 #include <lustre_quota.h>
54 #include <lustre_acl.h>
55 #include <lustre_nodemap.h>
56 #include <uapi/linux/lustre/lustre_param.h>
57
58 struct mds_device {
59         /* super-class */
60         struct md_device         mds_md_dev;
61         struct ptlrpc_service   *mds_regular_service;
62         struct ptlrpc_service   *mds_readpage_service;
63         struct ptlrpc_service   *mds_out_service;
64         struct ptlrpc_service   *mds_mdsc_service;
65         struct ptlrpc_service   *mds_mdss_service;
66         struct ptlrpc_service   *mds_fld_service;
67         struct ptlrpc_service   *mds_io_service;
68         struct mutex             mds_health_mutex;
69 };
70
71 /*
72  *  * Initialized in mds_mod_init().
73  *   */
74 static unsigned long mds_num_threads;
75 module_param(mds_num_threads, ulong, 0444);
76 MODULE_PARM_DESC(mds_num_threads, "number of MDS service threads to start");
77
78 static unsigned int mds_cpu_bind = 1;
79 module_param(mds_cpu_bind, uint, 0444);
80 MODULE_PARM_DESC(mds_cpu_bind,
81                  "bind MDS threads to particular CPU partitions");
82
83 int mds_max_io_threads = 512;
84 module_param(mds_max_io_threads, int, 0444);
85 MODULE_PARM_DESC(mds_max_io_threads,
86                  "maximum number of MDS IO service threads");
87
88 static unsigned int mds_io_cpu_bind = 1;
89 module_param(mds_io_cpu_bind, uint, 0444);
90 MODULE_PARM_DESC(mds_io_cpu_bind,
91                  "bind MDS IO threads to particular CPU partitions");
92
93 static char *mds_io_num_cpts;
94 module_param(mds_io_num_cpts, charp, 0444);
95 MODULE_PARM_DESC(mds_io_num_cpts,
96                  "CPU partitions MDS IO threads should run on");
97
98 static struct cfs_cpt_table *mdt_io_cptable;
99
100 static char *mds_num_cpts;
101 module_param(mds_num_cpts, charp, 0444);
102 MODULE_PARM_DESC(mds_num_cpts, "CPU partitions MDS threads should run on");
103
104 static unsigned long mds_rdpg_num_threads;
105 module_param(mds_rdpg_num_threads, ulong, 0444);
106 MODULE_PARM_DESC(mds_rdpg_num_threads,
107                  "number of MDS readpage service threads to start");
108
109 static unsigned int mds_rdpg_cpu_bind = 1;
110 module_param(mds_rdpg_cpu_bind, uint, 0444);
111 MODULE_PARM_DESC(mds_rdpg_cpu_bind,
112                  "bind MDS readpage threads to particular CPU partitions");
113
114 static char *mds_rdpg_num_cpts;
115 module_param(mds_rdpg_num_cpts, charp, 0444);
116 MODULE_PARM_DESC(mds_rdpg_num_cpts,
117                  "CPU partitions MDS readpage threads should run on");
118
119 /* device init/fini methods */
120 static void mds_stop_ptlrpc_service(struct mds_device *m)
121 {
122         ENTRY;
123
124         mutex_lock(&m->mds_health_mutex);
125         if (m->mds_regular_service != NULL) {
126                 ptlrpc_unregister_service(m->mds_regular_service);
127                 m->mds_regular_service = NULL;
128         }
129         if (m->mds_readpage_service != NULL) {
130                 ptlrpc_unregister_service(m->mds_readpage_service);
131                 m->mds_readpage_service = NULL;
132         }
133         if (m->mds_out_service != NULL) {
134                 ptlrpc_unregister_service(m->mds_out_service);
135                 m->mds_out_service = NULL;
136         }
137         if (m->mds_mdsc_service != NULL) {
138                 ptlrpc_unregister_service(m->mds_mdsc_service);
139                 m->mds_mdsc_service = NULL;
140         }
141         if (m->mds_mdss_service != NULL) {
142                 ptlrpc_unregister_service(m->mds_mdss_service);
143                 m->mds_mdss_service = NULL;
144         }
145         if (m->mds_fld_service != NULL) {
146                 ptlrpc_unregister_service(m->mds_fld_service);
147                 m->mds_fld_service = NULL;
148         }
149         if (m->mds_io_service != NULL) {
150                 ptlrpc_unregister_service(m->mds_io_service);
151                 m->mds_io_service = NULL;
152         }
153         mutex_unlock(&m->mds_health_mutex);
154
155         if (mdt_io_cptable != NULL) {
156                 cfs_cpt_table_free(mdt_io_cptable);
157                 mdt_io_cptable = NULL;
158         }
159
160         EXIT;
161 }
162
163 static int ldlm_enqueue_hpreq_check(struct ptlrpc_request *req)
164 {
165         struct ldlm_request *dlm_req;
166         int rc = 0;
167         ENTRY;
168
169         if ((lustre_msg_get_flags(req->rq_reqmsg) & (MSG_REPLAY|MSG_RESENT)) !=
170             MSG_RESENT)
171                 RETURN(0);
172
173         req_capsule_init(&req->rq_pill, req, RCL_SERVER);
174         req_capsule_set(&req->rq_pill, &RQF_LDLM_ENQUEUE);
175         dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ);
176         if (dlm_req == NULL)
177                 RETURN(-EFAULT);
178
179         if (dlm_req->lock_count > 0) {
180                 struct ldlm_lock *lock;
181
182                 lock = cfs_hash_lookup(req->rq_export->exp_lock_hash,
183                                        (void *)&dlm_req->lock_handle[0]);
184
185                 DEBUG_REQ(D_RPCTRACE, req, "lock %p cookie 0x%llx",
186                         lock, dlm_req->lock_handle[0].cookie);
187                 if (lock != NULL) {
188                         rc = lock->l_granted_mode == lock->l_req_mode;
189                         if (rc)
190                                 LDLM_DEBUG(lock, "hpreq resend");
191                         LDLM_LOCK_RELEASE(lock);
192                 }
193         }
194
195         RETURN(rc);
196 }
197
198 static struct ptlrpc_hpreq_ops ldlm_enqueue_hpreq_ops = {
199         .hpreq_lock_match = NULL,
200         .hpreq_check      = ldlm_enqueue_hpreq_check,
201         .hpreq_fini       = NULL,
202 };
203
204 static int mds_hpreq_handler(struct ptlrpc_request *req)
205 {
206         if (lustre_msg_get_opc(req->rq_reqmsg) == LDLM_ENQUEUE)
207                 req->rq_ops = &ldlm_enqueue_hpreq_ops;
208         else
209                 ptlrpc_hpreq_handler(req);
210         return 0;
211 }
212
213 static int mds_start_ptlrpc_service(struct mds_device *m)
214 {
215         static struct ptlrpc_service_conf conf;
216         struct obd_device *obd = m->mds_md_dev.md_lu_dev.ld_obd;
217         nodemask_t *mask;
218         int rc = 0;
219
220         ENTRY;
221
222         conf = (typeof(conf)) {
223                 .psc_name               = LUSTRE_MDT_NAME,
224                 .psc_watchdog_factor    = MDT_SERVICE_WATCHDOG_FACTOR,
225                 .psc_buf                = {
226                         .bc_nbufs               = MDS_NBUFS,
227                         .bc_buf_size            = MDS_REG_BUFSIZE,
228                         .bc_req_max_size        = MDS_REG_MAXREQSIZE,
229                         .bc_rep_max_size        = MDS_REG_MAXREPSIZE,
230                         .bc_req_portal          = MDS_REQUEST_PORTAL,
231                         .bc_rep_portal          = MDC_REPLY_PORTAL,
232                 },
233                 /*
234                  * We'd like to have a mechanism to set this on a per-device
235                  * basis, but alas...
236                  */
237                 .psc_thr                = {
238                         .tc_thr_name            = LUSTRE_MDT_NAME,
239                         .tc_thr_factor          = MDS_THR_FACTOR,
240                         .tc_nthrs_init          = MDS_NTHRS_INIT,
241                         .tc_nthrs_base          = MDS_NTHRS_BASE,
242                         .tc_nthrs_max           = MDS_NTHRS_MAX,
243                         .tc_nthrs_user          = mds_num_threads,
244                         .tc_cpu_bind            = mds_cpu_bind,
245                         /* LCT_DT_THREAD is required as MDT threads may scan
246                          * all LDLM namespaces (including OFD-originated) to
247                          * cancel LDLM locks */
248                         .tc_ctx_tags            = LCT_MD_THREAD | LCT_DT_THREAD,
249                 },
250                 .psc_cpt                = {
251                         .cc_pattern             = mds_num_cpts,
252                         .cc_affinity            = true,
253                 },
254                 .psc_ops                = {
255                         .so_req_handler         = tgt_request_handle,
256                         .so_req_printer         = target_print_req,
257                         .so_hpreq_handler       = mds_hpreq_handler,
258                 },
259         };
260         m->mds_regular_service = ptlrpc_register_service(&conf, &obd->obd_kset,
261                                                          obd->obd_debugfs_entry);
262         if (IS_ERR(m->mds_regular_service)) {
263                 rc = PTR_ERR(m->mds_regular_service);
264                 CERROR("failed to start regular mdt service: %d\n", rc);
265                 m->mds_regular_service = NULL;
266
267                 RETURN(rc);
268         }
269
270         /*
271          * readpage service configuration. Parameters have to be adjusted,
272          * ideally.
273          */
274         memset(&conf, 0, sizeof(conf));
275         conf = (typeof(conf)) {
276                 .psc_name               = LUSTRE_MDT_NAME "_readpage",
277                 .psc_watchdog_factor    = MDT_SERVICE_WATCHDOG_FACTOR,
278                 .psc_buf                = {
279                         .bc_nbufs               = MDS_NBUFS,
280                         .bc_buf_size            = MDS_BUFSIZE,
281                         .bc_req_max_size        = MDS_MAXREQSIZE,
282                         .bc_rep_max_size        = MDS_MAXREPSIZE,
283                         .bc_req_portal          = MDS_READPAGE_PORTAL,
284                         .bc_rep_portal          = MDC_REPLY_PORTAL,
285                 },
286                 .psc_thr                = {
287                         .tc_thr_name            = LUSTRE_MDT_NAME "_rdpg",
288                         .tc_thr_factor          = MDS_RDPG_THR_FACTOR,
289                         .tc_nthrs_init          = MDS_RDPG_NTHRS_INIT,
290                         .tc_nthrs_base          = MDS_RDPG_NTHRS_BASE,
291                         .tc_nthrs_max           = MDS_RDPG_NTHRS_MAX,
292                         .tc_nthrs_user          = mds_rdpg_num_threads,
293                         .tc_cpu_bind            = mds_rdpg_cpu_bind,
294                         .tc_ctx_tags            = LCT_MD_THREAD,
295                 },
296                 .psc_cpt                = {
297                         .cc_pattern             = mds_rdpg_num_cpts,
298                         .cc_affinity            = true,
299                 },
300                 .psc_ops                = {
301                         .so_req_handler         = tgt_request_handle,
302                         .so_req_printer         = target_print_req,
303                 },
304         };
305         m->mds_readpage_service = ptlrpc_register_service(&conf, &obd->obd_kset,
306                                                           obd->obd_debugfs_entry);
307         if (IS_ERR(m->mds_readpage_service)) {
308                 rc = PTR_ERR(m->mds_readpage_service);
309                 CERROR("failed to start readpage service: %d\n", rc);
310                 m->mds_readpage_service = NULL;
311
312                 GOTO(err_mds_svc, rc);
313         }
314
315         /* Object update service */
316         conf = (typeof(conf)) {
317                 .psc_name               = LUSTRE_MDT_NAME "_out",
318                 .psc_watchdog_factor    = MDT_SERVICE_WATCHDOG_FACTOR,
319                 .psc_buf                = {
320                         .bc_nbufs               = MDS_NBUFS,
321                         .bc_buf_size            = OUT_BUFSIZE,
322                         .bc_req_max_size        = OUT_MAXREQSIZE,
323                         .bc_rep_max_size        = OUT_MAXREPSIZE,
324                         .bc_req_portal          = OUT_PORTAL,
325                         .bc_rep_portal          = OSC_REPLY_PORTAL,
326                 },
327                 /*
328                  * We'd like to have a mechanism to set this on a per-device
329                  * basis, but alas...
330                  */
331                 .psc_thr                = {
332                         .tc_thr_name            = LUSTRE_MDT_NAME "_out",
333                         .tc_thr_factor          = MDS_THR_FACTOR,
334                         .tc_nthrs_init          = MDS_NTHRS_INIT,
335                         .tc_nthrs_base          = MDS_NTHRS_BASE,
336                         .tc_nthrs_max           = MDS_NTHRS_MAX,
337                         .tc_nthrs_user          = mds_num_threads,
338                         .tc_cpu_bind            = mds_cpu_bind,
339                         .tc_ctx_tags            = LCT_MD_THREAD |
340                                                   LCT_DT_THREAD,
341                 },
342                 .psc_cpt                = {
343                         .cc_pattern             = mds_num_cpts,
344                         .cc_affinity            = true,
345                 },
346                 .psc_ops                = {
347                         .so_req_handler         = tgt_request_handle,
348                         .so_req_printer         = target_print_req,
349                         .so_hpreq_handler       = NULL,
350                 },
351         };
352         m->mds_out_service = ptlrpc_register_service(&conf, &obd->obd_kset,
353                                                      obd->obd_debugfs_entry);
354         if (IS_ERR(m->mds_out_service)) {
355                 rc = PTR_ERR(m->mds_out_service);
356                 CERROR("failed to start out service: %d\n", rc);
357                 m->mds_out_service = NULL;
358                 GOTO(err_mds_svc, rc);
359         }
360
361         /*
362          * sequence controller service configuration
363          */
364         memset(&conf, 0, sizeof(conf));
365         conf = (typeof(conf)) {
366                 .psc_name               = LUSTRE_MDT_NAME "_seqs",
367                 .psc_watchdog_factor    = MDT_SERVICE_WATCHDOG_FACTOR,
368                 .psc_buf                = {
369                         .bc_nbufs               = MDS_NBUFS,
370                         .bc_buf_size            = SEQ_BUFSIZE,
371                         .bc_req_max_size        = SEQ_MAXREQSIZE,
372                         .bc_rep_max_size        = SEQ_MAXREPSIZE,
373                         .bc_req_portal          = SEQ_CONTROLLER_PORTAL,
374                         .bc_rep_portal          = MDC_REPLY_PORTAL,
375                 },
376                 .psc_thr                = {
377                         .tc_thr_name            = LUSTRE_MDT_NAME "_seqs",
378                         .tc_nthrs_init          = MDS_OTHR_NTHRS_INIT,
379                         .tc_nthrs_max           = MDS_OTHR_NTHRS_MAX,
380                         .tc_ctx_tags            = LCT_MD_THREAD,
381                 },
382                 .psc_ops                = {
383                         .so_req_handler         = tgt_request_handle,
384                         .so_req_printer         = target_print_req,
385                         .so_hpreq_handler       = NULL,
386                 },
387         };
388         m->mds_mdsc_service = ptlrpc_register_service(&conf, &obd->obd_kset,
389                                                       obd->obd_debugfs_entry);
390         if (IS_ERR(m->mds_mdsc_service)) {
391                 rc = PTR_ERR(m->mds_mdsc_service);
392                 CERROR("failed to start seq controller service: %d\n", rc);
393                 m->mds_mdsc_service = NULL;
394
395                 GOTO(err_mds_svc, rc);
396         }
397
398         /*
399          * metadata sequence server service configuration
400          */
401         memset(&conf, 0, sizeof(conf));
402         conf = (typeof(conf)) {
403                 .psc_name               = LUSTRE_MDT_NAME "_seqm",
404                 .psc_watchdog_factor    = MDT_SERVICE_WATCHDOG_FACTOR,
405                 .psc_buf                = {
406                         .bc_nbufs               = MDS_NBUFS,
407                         .bc_buf_size            = SEQ_BUFSIZE,
408                         .bc_req_max_size        = SEQ_MAXREQSIZE,
409                         .bc_rep_max_size        = SEQ_MAXREPSIZE,
410                         .bc_req_portal          = SEQ_METADATA_PORTAL,
411                         .bc_rep_portal          = MDC_REPLY_PORTAL,
412                 },
413                 .psc_thr                = {
414                         .tc_thr_name            = LUSTRE_MDT_NAME "_seqm",
415                         .tc_nthrs_init          = MDS_OTHR_NTHRS_INIT,
416                         .tc_nthrs_max           = MDS_OTHR_NTHRS_MAX,
417                         .tc_ctx_tags            = LCT_MD_THREAD | LCT_DT_THREAD
418                 },
419                 .psc_ops                = {
420                         .so_req_handler         = tgt_request_handle,
421                         .so_req_printer         = target_print_req,
422                         .so_hpreq_handler       = NULL,
423                 },
424         };
425         m->mds_mdss_service = ptlrpc_register_service(&conf, &obd->obd_kset,
426                                                       obd->obd_debugfs_entry);
427         if (IS_ERR(m->mds_mdss_service)) {
428                 rc = PTR_ERR(m->mds_mdss_service);
429                 CERROR("failed to start metadata seq server service: %d\n", rc);
430                 m->mds_mdss_service = NULL;
431
432                 GOTO(err_mds_svc, rc);
433         }
434
435         /* FLD service start */
436         memset(&conf, 0, sizeof(conf));
437         conf = (typeof(conf)) {
438                 .psc_name            = LUSTRE_MDT_NAME "_fld",
439                 .psc_watchdog_factor = MDT_SERVICE_WATCHDOG_FACTOR,
440                 .psc_buf                = {
441                         .bc_nbufs               = MDS_NBUFS,
442                         .bc_buf_size            = FLD_BUFSIZE,
443                         .bc_req_max_size        = FLD_MAXREQSIZE,
444                         .bc_rep_max_size        = FLD_MAXREPSIZE,
445                         .bc_req_portal          = FLD_REQUEST_PORTAL,
446                         .bc_rep_portal          = MDC_REPLY_PORTAL,
447                 },
448                 .psc_thr                = {
449                         .tc_thr_name            = LUSTRE_MDT_NAME "_fld",
450                         .tc_nthrs_init          = MDS_OTHR_NTHRS_INIT,
451                         .tc_nthrs_max           = MDS_OTHR_NTHRS_MAX,
452                         .tc_ctx_tags            = LCT_DT_THREAD | LCT_MD_THREAD,
453                 },
454                 .psc_ops                = {
455                         .so_req_handler         = tgt_request_handle,
456                         .so_req_printer         = target_print_req,
457                         .so_hpreq_handler       = NULL,
458                 },
459         };
460         m->mds_fld_service = ptlrpc_register_service(&conf, &obd->obd_kset,
461                                                      obd->obd_debugfs_entry);
462         if (IS_ERR(m->mds_fld_service)) {
463                 rc = PTR_ERR(m->mds_fld_service);
464                 CERROR("failed to start fld service: %d\n", rc);
465                 m->mds_fld_service = NULL;
466
467                 GOTO(err_mds_svc, rc);
468         }
469
470
471         mask = cfs_cpt_nodemask(cfs_cpt_tab, CFS_CPT_ANY);
472         /* event CPT feature is disabled in libcfs level by set partition
473          * number to 1, we still want to set node affinity for io service */
474         if (cfs_cpt_number(cfs_cpt_tab) == 1 && nodes_weight(*mask) > 1) {
475                 int cpt = 0;
476                 int i;
477
478                 mdt_io_cptable = cfs_cpt_table_alloc(nodes_weight(*mask));
479                 for_each_node_mask(i, *mask) {
480                         if (mdt_io_cptable == NULL) {
481                                 CWARN("MDS failed to create CPT table\n");
482                                 break;
483                         }
484
485                         rc = cfs_cpt_set_node(mdt_io_cptable, cpt++, i);
486                         if (!rc) {
487                                 CWARN("MDS Failed to set node %d for IO CPT table\n",
488                                       i);
489                                 cfs_cpt_table_free(mdt_io_cptable);
490                                 mdt_io_cptable = NULL;
491                                 break;
492                         }
493                 }
494         }
495
496         memset(&conf, 0, sizeof(conf));
497         conf = (typeof(conf)) {
498                 .psc_name               = LUSTRE_MDT_NAME "_io",
499                 .psc_watchdog_factor    = MDT_SERVICE_WATCHDOG_FACTOR,
500                 .psc_buf                = {
501                         .bc_nbufs               = OST_NBUFS,
502                         .bc_buf_size            = OST_IO_BUFSIZE,
503                         .bc_req_max_size        = OST_IO_MAXREQSIZE,
504                         .bc_rep_max_size        = OST_IO_MAXREPSIZE,
505                         .bc_req_portal          = MDS_IO_PORTAL,
506                         .bc_rep_portal          = MDC_REPLY_PORTAL,
507                 },
508                 .psc_thr                = {
509                         .tc_thr_name            = LUSTRE_MDT_NAME "_io",
510                         .tc_thr_factor          = OSS_THR_FACTOR,
511                         .tc_nthrs_init          = OSS_NTHRS_INIT,
512                         .tc_nthrs_base          = OSS_NTHRS_BASE,
513                         .tc_nthrs_max           = mds_max_io_threads,
514                         .tc_nthrs_user          = mds_num_threads,
515                         .tc_cpu_bind            = mds_io_cpu_bind,
516                         .tc_ctx_tags            = LCT_DT_THREAD | LCT_MD_THREAD,
517                 },
518                 .psc_cpt                = {
519                         .cc_cptable             = mdt_io_cptable,
520                         .cc_pattern             = mdt_io_cptable == NULL ?
521                                                   mds_io_num_cpts : NULL,
522                         .cc_affinity            = true,
523                 },
524                 .psc_ops                = {
525                         .so_thr_init            = tgt_io_thread_init,
526                         .so_thr_done            = tgt_io_thread_done,
527                         .so_req_handler         = tgt_request_handle,
528                         .so_req_printer         = target_print_req,
529                         .so_hpreq_handler       = tgt_hpreq_handler,
530                 },
531         };
532         m->mds_io_service = ptlrpc_register_service(&conf, &obd->obd_kset,
533                                                     obd->obd_debugfs_entry);
534         if (IS_ERR(m->mds_io_service)) {
535                 rc = PTR_ERR(m->mds_io_service);
536                 CERROR("failed to start MDT I/O service: %d\n", rc);
537                 m->mds_io_service = NULL;
538                 GOTO(err_mds_svc, rc);
539         }
540
541         EXIT;
542 err_mds_svc:
543         if (rc)
544                 mds_stop_ptlrpc_service(m);
545
546         return rc;
547 }
548
549 static inline struct mds_device *mds_dev(struct lu_device *d)
550 {
551         return container_of_safe(d, struct mds_device, mds_md_dev.md_lu_dev);
552 }
553
554 static struct lu_device *mds_device_fini(const struct lu_env *env,
555                                          struct lu_device *d)
556 {
557         struct mds_device *m = mds_dev(d);
558         struct obd_device *obd = d->ld_obd;
559         ENTRY;
560
561         mds_stop_ptlrpc_service(m);
562         lprocfs_obd_cleanup(obd);
563         RETURN(NULL);
564 }
565
566 static struct lu_device *mds_device_free(const struct lu_env *env,
567                                          struct lu_device *d)
568 {
569         struct mds_device *m = mds_dev(d);
570         ENTRY;
571
572         md_device_fini(&m->mds_md_dev);
573         OBD_FREE_PTR(m);
574         RETURN(NULL);
575 }
576
577 static struct lu_device *mds_device_alloc(const struct lu_env *env,
578                                           struct lu_device_type *t,
579                                           struct lustre_cfg *cfg)
580 {
581         struct mds_device        *m;
582         struct obd_device        *obd;
583         struct lu_device          *l;
584         int rc;
585
586         OBD_ALLOC_PTR(m);
587         if (m == NULL)
588                 return ERR_PTR(-ENOMEM);
589
590         md_device_init(&m->mds_md_dev, t);
591         l = &m->mds_md_dev.md_lu_dev;
592
593         obd = class_name2obd(lustre_cfg_string(cfg, 0));
594         LASSERT(obd != NULL);
595
596         l->ld_obd = obd;
597         /* set this lu_device to obd, because error handling need it */
598         obd->obd_lu_dev = l;
599
600         rc = lprocfs_obd_setup(obd, true);
601         if (rc != 0) {
602                 mds_device_free(env, l);
603                 l = ERR_PTR(rc);
604                 return l;
605         }
606
607         mutex_init(&m->mds_health_mutex);
608
609         rc = mds_start_ptlrpc_service(m);
610         if (rc != 0) {
611                 lprocfs_obd_cleanup(obd);
612                 mds_device_free(env, l);
613                 l = ERR_PTR(rc);
614                 return l;
615         }
616         return l;
617 }
618
619 /* type constructor/destructor: mdt_type_init, mdt_type_fini */
620 LU_TYPE_INIT_FINI(mds, &mdt_thread_key);
621
622 static const struct lu_device_type_operations mds_device_type_ops = {
623         .ldto_init = mds_type_init,
624         .ldto_fini = mds_type_fini,
625
626         .ldto_start = mds_type_start,
627         .ldto_stop  = mds_type_stop,
628
629         .ldto_device_alloc = mds_device_alloc,
630         .ldto_device_free  = mds_device_free,
631         .ldto_device_fini  = mds_device_fini
632 };
633
634 static struct lu_device_type mds_device_type = {
635         .ldt_tags     = LU_DEVICE_MD,
636         .ldt_name     = LUSTRE_MDS_NAME,
637         .ldt_ops      = &mds_device_type_ops,
638         .ldt_ctx_tags = LCT_MD_THREAD
639 };
640
641 static int mds_health_check(const struct lu_env *env, struct obd_device *obd)
642 {
643         struct mds_device *mds = mds_dev(obd->obd_lu_dev);
644         int rc = 0;
645
646
647         mutex_lock(&mds->mds_health_mutex);
648         rc |= ptlrpc_service_health_check(mds->mds_regular_service);
649         rc |= ptlrpc_service_health_check(mds->mds_readpage_service);
650         rc |= ptlrpc_service_health_check(mds->mds_out_service);
651         rc |= ptlrpc_service_health_check(mds->mds_mdsc_service);
652         rc |= ptlrpc_service_health_check(mds->mds_mdss_service);
653         rc |= ptlrpc_service_health_check(mds->mds_fld_service);
654         rc |= ptlrpc_service_health_check(mds->mds_io_service);
655         mutex_unlock(&mds->mds_health_mutex);
656
657         return rc != 0 ? 1 : 0;
658 }
659
660 /* ioctls on obd dev */
661 static int mds_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
662                          void *karg, void __user *uarg)
663 {
664         struct obd_device *obd = exp->exp_obd;
665         struct obd_ioctl_data *data;
666         int rc = 0;
667
668         ENTRY;
669         CDEBUG(D_IOCTL, "%s: cmd=%x len=%u karg=%pK uarg=%pK\n",
670                obd->obd_name, cmd, len, karg, uarg);
671
672         data = karg;
673         /* we only support nodemap ioctls, for now */
674         if (cmd != OBD_IOC_NODEMAP)
675                 GOTO(out, rc = -EINVAL);
676
677         rc = server_iocontrol_nodemap(obd, data, true);
678         if (rc)
679                 GOTO(out, rc);
680
681 out:
682         RETURN(rc);
683 }
684
685 static const struct obd_ops mds_obd_device_ops = {
686         .o_owner           = THIS_MODULE,
687         .o_health_check    = mds_health_check,
688         .o_iocontrol       = mds_iocontrol,
689 };
690
691 int mds_mod_init(void)
692 {
693         return class_register_type(&mds_obd_device_ops, NULL, false,
694                                    LUSTRE_MDS_NAME, &mds_device_type);
695 }
696
697 void mds_mod_exit(void)
698 {
699         class_unregister_type(LUSTRE_MDS_NAME);
700 }