Whamcloud - gitweb
7c5560db3543ed7e9c3a090035bbaf3e21aa0f7f
[fs/lustre-release.git] / lustre / mdt / mdt_mds.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License version 2 for more details.  A copy is
14  * included in the COPYING file that accompanied this code.
15
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19  *
20  * GPL HEADER END
21  */
22 /*
23  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Use is subject to license terms.
25  *
26  * Copyright (c) 2013, 2015, Intel Corporation.
27  */
28 /*
29  * This file is part of Lustre, http://www.lustre.org/
30  *
31  * lustre/mdt/mdt_mds.c
32  *
33  * Lustre Metadata Service Layer
34  *
35  * Author: Di Wang <di.wang@whamcloud.com>
36  **/
37
38 #define DEBUG_SUBSYSTEM S_MDS
39
40 #include <linux/module.h>
41
42 #include <obd_support.h>
43 /* struct ptlrpc_request */
44 #include <lustre_net.h>
45 /* struct obd_export */
46 #include <lustre_export.h>
47 /* struct obd_device */
48 #include <obd.h>
49 /* lu2dt_dev() */
50 #include <dt_object.h>
51 #include <lustre_mds.h>
52 #include "mdt_internal.h"
53 #include <lustre_quota.h>
54 #include <lustre_acl.h>
55 #include <uapi/linux/lustre_param.h>
56
57 struct mds_device {
58         /* super-class */
59         struct md_device         mds_md_dev;
60         struct ptlrpc_service   *mds_regular_service;
61         struct ptlrpc_service   *mds_readpage_service;
62         struct ptlrpc_service   *mds_out_service;
63         struct ptlrpc_service   *mds_setattr_service;
64         struct ptlrpc_service   *mds_mdsc_service;
65         struct ptlrpc_service   *mds_mdss_service;
66         struct ptlrpc_service   *mds_fld_service;
67         struct mutex             mds_health_mutex;
68 };
69
70 /*
71  *  * Initialized in mds_mod_init().
72  *   */
73 static unsigned long mds_num_threads;
74 module_param(mds_num_threads, ulong, 0444);
75 MODULE_PARM_DESC(mds_num_threads, "number of MDS service threads to start");
76
77 static char *mds_num_cpts;
78 module_param(mds_num_cpts, charp, 0444);
79 MODULE_PARM_DESC(mds_num_cpts, "CPU partitions MDS threads should run on");
80
81 static unsigned long mds_rdpg_num_threads;
82 module_param(mds_rdpg_num_threads, ulong, 0444);
83 MODULE_PARM_DESC(mds_rdpg_num_threads,
84                  "number of MDS readpage service threads to start");
85
86 static char *mds_rdpg_num_cpts;
87 module_param(mds_rdpg_num_cpts, charp, 0444);
88 MODULE_PARM_DESC(mds_rdpg_num_cpts,
89                  "CPU partitions MDS readpage threads should run on");
90
91 /* NB: these two should be removed along with setattr service in the future */
92 static unsigned long mds_attr_num_threads;
93 module_param(mds_attr_num_threads, ulong, 0444);
94 MODULE_PARM_DESC(mds_attr_num_threads,
95                  "number of MDS setattr service threads to start");
96
97 static char *mds_attr_num_cpts;
98 module_param(mds_attr_num_cpts, charp, 0444);
99 MODULE_PARM_DESC(mds_attr_num_cpts,
100                  "CPU partitions MDS setattr threads should run on");
101
102 /* device init/fini methods */
103 static void mds_stop_ptlrpc_service(struct mds_device *m)
104 {
105         ENTRY;
106
107         mutex_lock(&m->mds_health_mutex);
108         if (m->mds_regular_service != NULL) {
109                 ptlrpc_unregister_service(m->mds_regular_service);
110                 m->mds_regular_service = NULL;
111         }
112         if (m->mds_readpage_service != NULL) {
113                 ptlrpc_unregister_service(m->mds_readpage_service);
114                 m->mds_readpage_service = NULL;
115         }
116         if (m->mds_out_service != NULL) {
117                 ptlrpc_unregister_service(m->mds_out_service);
118                 m->mds_out_service = NULL;
119         }
120         if (m->mds_setattr_service != NULL) {
121                 ptlrpc_unregister_service(m->mds_setattr_service);
122                 m->mds_setattr_service = NULL;
123         }
124         if (m->mds_mdsc_service != NULL) {
125                 ptlrpc_unregister_service(m->mds_mdsc_service);
126                 m->mds_mdsc_service = NULL;
127         }
128         if (m->mds_mdss_service != NULL) {
129                 ptlrpc_unregister_service(m->mds_mdss_service);
130                 m->mds_mdss_service = NULL;
131         }
132         if (m->mds_fld_service != NULL) {
133                 ptlrpc_unregister_service(m->mds_fld_service);
134                 m->mds_fld_service = NULL;
135         }
136         mutex_unlock(&m->mds_health_mutex);
137
138         EXIT;
139 }
140
141 static int mds_start_ptlrpc_service(struct mds_device *m)
142 {
143         static struct ptlrpc_service_conf conf;
144         struct obd_device *obd = m->mds_md_dev.md_lu_dev.ld_obd;
145         struct proc_dir_entry *procfs_entry;
146         int rc = 0;
147         ENTRY;
148
149         procfs_entry = obd->obd_proc_entry;
150         LASSERT(procfs_entry != NULL);
151
152         conf = (typeof(conf)) {
153                 .psc_name               = LUSTRE_MDT_NAME,
154                 .psc_watchdog_factor    = MDT_SERVICE_WATCHDOG_FACTOR,
155                 .psc_buf                = {
156                         .bc_nbufs               = MDS_NBUFS,
157                         .bc_buf_size            = MDS_REG_BUFSIZE,
158                         .bc_req_max_size        = MDS_REG_MAXREQSIZE,
159                         .bc_rep_max_size        = MDS_REG_MAXREPSIZE,
160                         .bc_req_portal          = MDS_REQUEST_PORTAL,
161                         .bc_rep_portal          = MDC_REPLY_PORTAL,
162                 },
163                 /*
164                  * We'd like to have a mechanism to set this on a per-device
165                  * basis, but alas...
166                  */
167                 .psc_thr                = {
168                         .tc_thr_name            = LUSTRE_MDT_NAME,
169                         .tc_thr_factor          = MDS_THR_FACTOR,
170                         .tc_nthrs_init          = MDS_NTHRS_INIT,
171                         .tc_nthrs_base          = MDS_NTHRS_BASE,
172                         .tc_nthrs_max           = MDS_NTHRS_MAX,
173                         .tc_nthrs_user          = mds_num_threads,
174                         .tc_cpu_affinity        = 1,
175                         .tc_ctx_tags            = LCT_MD_THREAD,
176                 },
177                 .psc_cpt                = {
178                         .cc_pattern             = mds_num_cpts,
179                 },
180                 .psc_ops                = {
181                         .so_req_handler         = tgt_request_handle,
182                         .so_req_printer         = target_print_req,
183                         .so_hpreq_handler       = ptlrpc_hpreq_handler,
184                 },
185         };
186         m->mds_regular_service = ptlrpc_register_service(&conf, procfs_entry);
187         if (IS_ERR(m->mds_regular_service)) {
188                 rc = PTR_ERR(m->mds_regular_service);
189                 CERROR("failed to start regular mdt service: %d\n", rc);
190                 m->mds_regular_service = NULL;
191
192                 RETURN(rc);
193         }
194
195         /*
196          * readpage service configuration. Parameters have to be adjusted,
197          * ideally.
198          */
199         memset(&conf, 0, sizeof(conf));
200         conf = (typeof(conf)) {
201                 .psc_name               = LUSTRE_MDT_NAME "_readpage",
202                 .psc_watchdog_factor    = MDT_SERVICE_WATCHDOG_FACTOR,
203                 .psc_buf                = {
204                         .bc_nbufs               = MDS_NBUFS,
205                         .bc_buf_size            = MDS_BUFSIZE,
206                         .bc_req_max_size        = MDS_MAXREQSIZE,
207                         .bc_rep_max_size        = MDS_MAXREPSIZE,
208                         .bc_req_portal          = MDS_READPAGE_PORTAL,
209                         .bc_rep_portal          = MDC_REPLY_PORTAL,
210                 },
211                 .psc_thr                = {
212                         .tc_thr_name            = LUSTRE_MDT_NAME "_rdpg",
213                         .tc_thr_factor          = MDS_RDPG_THR_FACTOR,
214                         .tc_nthrs_init          = MDS_RDPG_NTHRS_INIT,
215                         .tc_nthrs_base          = MDS_RDPG_NTHRS_BASE,
216                         .tc_nthrs_max           = MDS_RDPG_NTHRS_MAX,
217                         .tc_nthrs_user          = mds_rdpg_num_threads,
218                         .tc_cpu_affinity        = 1,
219                         .tc_ctx_tags            = LCT_MD_THREAD,
220                 },
221                 .psc_cpt                = {
222                         .cc_pattern             = mds_rdpg_num_cpts,
223                 },
224                 .psc_ops                = {
225                         .so_req_handler         = tgt_request_handle,
226                         .so_req_printer         = target_print_req,
227                 },
228         };
229         m->mds_readpage_service = ptlrpc_register_service(&conf, procfs_entry);
230         if (IS_ERR(m->mds_readpage_service)) {
231                 rc = PTR_ERR(m->mds_readpage_service);
232                 CERROR("failed to start readpage service: %d\n", rc);
233                 m->mds_readpage_service = NULL;
234
235                 GOTO(err_mds_svc, rc);
236         }
237
238         /*
239          * setattr service configuration.
240          *
241          * XXX To keep the compatibility with old client(< 2.2), we need to
242          * preserve this portal for a certain time, it should be removed
243          * eventually. LU-617.
244          */
245         memset(&conf, 0, sizeof(conf));
246         conf = (typeof(conf)) {
247                 .psc_name               = LUSTRE_MDT_NAME "_setattr",
248                 .psc_watchdog_factor    = MDT_SERVICE_WATCHDOG_FACTOR,
249                 .psc_buf                = {
250                         .bc_nbufs               = MDS_NBUFS,
251                         .bc_buf_size            = MDS_BUFSIZE,
252                         .bc_req_max_size        = MDS_MAXREQSIZE,
253                         .bc_rep_max_size        = MDS_LOV_MAXREPSIZE,
254                         .bc_req_portal          = MDS_SETATTR_PORTAL,
255                         .bc_rep_portal          = MDC_REPLY_PORTAL,
256                 },
257                 .psc_thr                = {
258                         .tc_thr_name            = LUSTRE_MDT_NAME "_attr",
259                         .tc_thr_factor          = MDS_SETA_THR_FACTOR,
260                         .tc_nthrs_init          = MDS_SETA_NTHRS_INIT,
261                         .tc_nthrs_base          = MDS_SETA_NTHRS_BASE,
262                         .tc_nthrs_max           = MDS_SETA_NTHRS_MAX,
263                         .tc_nthrs_user          = mds_attr_num_threads,
264                         .tc_cpu_affinity        = 1,
265                         .tc_ctx_tags            = LCT_MD_THREAD,
266                 },
267                 .psc_cpt                = {
268                         .cc_pattern             = mds_attr_num_cpts,
269                 },
270                 .psc_ops                = {
271                         .so_req_handler         = tgt_request_handle,
272                         .so_req_printer         = target_print_req,
273                         .so_hpreq_handler       = NULL,
274                 },
275         };
276         m->mds_setattr_service = ptlrpc_register_service(&conf, procfs_entry);
277         if (IS_ERR(m->mds_setattr_service)) {
278                 rc = PTR_ERR(m->mds_setattr_service);
279                 CERROR("failed to start setattr service: %d\n", rc);
280                 m->mds_setattr_service = NULL;
281
282                 GOTO(err_mds_svc, rc);
283         }
284
285         /* Object update service */
286         conf = (typeof(conf)) {
287                 .psc_name               = LUSTRE_MDT_NAME "_out",
288                 .psc_watchdog_factor    = MDT_SERVICE_WATCHDOG_FACTOR,
289                 .psc_buf                = {
290                         .bc_nbufs               = MDS_NBUFS,
291                         .bc_buf_size            = OUT_BUFSIZE,
292                         .bc_req_max_size        = OUT_MAXREQSIZE,
293                         .bc_rep_max_size        = OUT_MAXREPSIZE,
294                         .bc_req_portal          = OUT_PORTAL,
295                         .bc_rep_portal          = OSC_REPLY_PORTAL,
296                 },
297                 /*
298                  * We'd like to have a mechanism to set this on a per-device
299                  * basis, but alas...
300                  */
301                 .psc_thr                = {
302                         .tc_thr_name            = LUSTRE_MDT_NAME "_out",
303                         .tc_thr_factor          = MDS_THR_FACTOR,
304                         .tc_nthrs_init          = MDS_NTHRS_INIT,
305                         .tc_nthrs_base          = MDS_NTHRS_BASE,
306                         .tc_nthrs_max           = MDS_NTHRS_MAX,
307                         .tc_nthrs_user          = mds_num_threads,
308                         .tc_cpu_affinity        = 1,
309                         .tc_ctx_tags            = LCT_MD_THREAD |
310                                                   LCT_DT_THREAD,
311                 },
312                 .psc_cpt                = {
313                         .cc_pattern             = mds_num_cpts,
314                 },
315                 .psc_ops                = {
316                         .so_req_handler         = tgt_request_handle,
317                         .so_req_printer         = target_print_req,
318                         .so_hpreq_handler       = NULL,
319                 },
320         };
321         m->mds_out_service = ptlrpc_register_service(&conf, procfs_entry);
322         if (IS_ERR(m->mds_out_service)) {
323                 rc = PTR_ERR(m->mds_out_service);
324                 CERROR("failed to start out service: %d\n", rc);
325                 m->mds_out_service = NULL;
326                 GOTO(err_mds_svc, rc);
327         }
328
329         /*
330          * sequence controller service configuration
331          */
332         memset(&conf, 0, sizeof(conf));
333         conf = (typeof(conf)) {
334                 .psc_name               = LUSTRE_MDT_NAME "_seqs",
335                 .psc_watchdog_factor    = MDT_SERVICE_WATCHDOG_FACTOR,
336                 .psc_buf                = {
337                         .bc_nbufs               = MDS_NBUFS,
338                         .bc_buf_size            = SEQ_BUFSIZE,
339                         .bc_req_max_size        = SEQ_MAXREQSIZE,
340                         .bc_rep_max_size        = SEQ_MAXREPSIZE,
341                         .bc_req_portal          = SEQ_CONTROLLER_PORTAL,
342                         .bc_rep_portal          = MDC_REPLY_PORTAL,
343                 },
344                 .psc_thr                = {
345                         .tc_thr_name            = LUSTRE_MDT_NAME "_seqs",
346                         .tc_nthrs_init          = MDS_OTHR_NTHRS_INIT,
347                         .tc_nthrs_max           = MDS_OTHR_NTHRS_MAX,
348                         .tc_ctx_tags            = LCT_MD_THREAD,
349                 },
350                 .psc_ops                = {
351                         .so_req_handler         = tgt_request_handle,
352                         .so_req_printer         = target_print_req,
353                         .so_hpreq_handler       = NULL,
354                 },
355         };
356         m->mds_mdsc_service = ptlrpc_register_service(&conf, procfs_entry);
357         if (IS_ERR(m->mds_mdsc_service)) {
358                 rc = PTR_ERR(m->mds_mdsc_service);
359                 CERROR("failed to start seq controller service: %d\n", rc);
360                 m->mds_mdsc_service = NULL;
361
362                 GOTO(err_mds_svc, rc);
363         }
364
365         /*
366          * metadata sequence server service configuration
367          */
368         memset(&conf, 0, sizeof(conf));
369         conf = (typeof(conf)) {
370                 .psc_name               = LUSTRE_MDT_NAME "_seqm",
371                 .psc_watchdog_factor    = MDT_SERVICE_WATCHDOG_FACTOR,
372                 .psc_buf                = {
373                         .bc_nbufs               = MDS_NBUFS,
374                         .bc_buf_size            = SEQ_BUFSIZE,
375                         .bc_req_max_size        = SEQ_MAXREQSIZE,
376                         .bc_rep_max_size        = SEQ_MAXREPSIZE,
377                         .bc_req_portal          = SEQ_METADATA_PORTAL,
378                         .bc_rep_portal          = MDC_REPLY_PORTAL,
379                 },
380                 .psc_thr                = {
381                         .tc_thr_name            = LUSTRE_MDT_NAME "_seqm",
382                         .tc_nthrs_init          = MDS_OTHR_NTHRS_INIT,
383                         .tc_nthrs_max           = MDS_OTHR_NTHRS_MAX,
384                         .tc_ctx_tags            = LCT_MD_THREAD | LCT_DT_THREAD
385                 },
386                 .psc_ops                = {
387                         .so_req_handler         = tgt_request_handle,
388                         .so_req_printer         = target_print_req,
389                         .so_hpreq_handler       = NULL,
390                 },
391         };
392         m->mds_mdss_service = ptlrpc_register_service(&conf, procfs_entry);
393         if (IS_ERR(m->mds_mdss_service)) {
394                 rc = PTR_ERR(m->mds_mdss_service);
395                 CERROR("failed to start metadata seq server service: %d\n", rc);
396                 m->mds_mdss_service = NULL;
397
398                 GOTO(err_mds_svc, rc);
399         }
400
401         /* FLD service start */
402         memset(&conf, 0, sizeof(conf));
403         conf = (typeof(conf)) {
404                 .psc_name            = LUSTRE_MDT_NAME "_fld",
405                 .psc_watchdog_factor = MDT_SERVICE_WATCHDOG_FACTOR,
406                 .psc_buf                = {
407                         .bc_nbufs               = MDS_NBUFS,
408                         .bc_buf_size            = FLD_BUFSIZE,
409                         .bc_req_max_size        = FLD_MAXREQSIZE,
410                         .bc_rep_max_size        = FLD_MAXREPSIZE,
411                         .bc_req_portal          = FLD_REQUEST_PORTAL,
412                         .bc_rep_portal          = MDC_REPLY_PORTAL,
413                 },
414                 .psc_thr                = {
415                         .tc_thr_name            = LUSTRE_MDT_NAME "_fld",
416                         .tc_nthrs_init          = MDS_OTHR_NTHRS_INIT,
417                         .tc_nthrs_max           = MDS_OTHR_NTHRS_MAX,
418                         .tc_ctx_tags            = LCT_DT_THREAD | LCT_MD_THREAD,
419                 },
420                 .psc_ops                = {
421                         .so_req_handler         = tgt_request_handle,
422                         .so_req_printer         = target_print_req,
423                         .so_hpreq_handler       = NULL,
424                 },
425         };
426         m->mds_fld_service = ptlrpc_register_service(&conf, procfs_entry);
427         if (IS_ERR(m->mds_fld_service)) {
428                 rc = PTR_ERR(m->mds_fld_service);
429                 CERROR("failed to start fld service: %d\n", rc);
430                 m->mds_fld_service = NULL;
431
432                 GOTO(err_mds_svc, rc);
433         }
434
435         EXIT;
436 err_mds_svc:
437         if (rc)
438                 mds_stop_ptlrpc_service(m);
439
440         return rc;
441 }
442
443 static inline struct mds_device *mds_dev(struct lu_device *d)
444 {
445         return container_of0(d, struct mds_device, mds_md_dev.md_lu_dev);
446 }
447
448 static struct lu_device *mds_device_fini(const struct lu_env *env,
449                                          struct lu_device *d)
450 {
451         struct mds_device *m = mds_dev(d);
452         struct obd_device *obd = d->ld_obd;
453         ENTRY;
454
455         mds_stop_ptlrpc_service(m);
456         lprocfs_obd_cleanup(obd);
457         RETURN(NULL);
458 }
459
460 static struct lu_device *mds_device_free(const struct lu_env *env,
461                                          struct lu_device *d)
462 {
463         struct mds_device *m = mds_dev(d);
464         ENTRY;
465
466         md_device_fini(&m->mds_md_dev);
467         OBD_FREE_PTR(m);
468         RETURN(NULL);
469 }
470
471 LPROC_SEQ_FOPS_RO_TYPE(mds, uuid);
472
473 static struct lprocfs_vars lprocfs_mds_obd_vars[] = {
474         {
475                 .name   = "uuid",
476                 .fops   = &mds_uuid_fops
477         },
478         {
479                 .name   = NULL
480         }
481 };
482
483 static struct lu_device *mds_device_alloc(const struct lu_env *env,
484                                           struct lu_device_type *t,
485                                           struct lustre_cfg *cfg)
486 {
487         struct mds_device        *m;
488         struct obd_device        *obd;
489         struct lu_device          *l;
490         int rc;
491
492         OBD_ALLOC_PTR(m);
493         if (m == NULL)
494                 return ERR_PTR(-ENOMEM);
495
496         md_device_init(&m->mds_md_dev, t);
497         l = &m->mds_md_dev.md_lu_dev;
498
499         obd = class_name2obd(lustre_cfg_string(cfg, 0));
500         LASSERT(obd != NULL);
501
502         l->ld_obd = obd;
503         /* set this lu_device to obd, because error handling need it */
504         obd->obd_lu_dev = l;
505
506         obd->obd_vars = lprocfs_mds_obd_vars;
507         rc = lprocfs_obd_setup(obd);
508         if (rc != 0) {
509                 mds_device_free(env, l);
510                 l = ERR_PTR(rc);
511                 return l;
512         }
513
514         mutex_init(&m->mds_health_mutex);
515
516         rc = mds_start_ptlrpc_service(m);
517
518         if (rc != 0) {
519                 mds_device_free(env, l);
520                 l = ERR_PTR(rc);
521                 return l;
522         }
523         return l;
524 }
525
526 /* type constructor/destructor: mdt_type_init, mdt_type_fini */
527 LU_TYPE_INIT_FINI(mds, &mdt_thread_key);
528
529 static struct lu_device_type_operations mds_device_type_ops = {
530         .ldto_init = mds_type_init,
531         .ldto_fini = mds_type_fini,
532
533         .ldto_start = mds_type_start,
534         .ldto_stop  = mds_type_stop,
535
536         .ldto_device_alloc = mds_device_alloc,
537         .ldto_device_free  = mds_device_free,
538         .ldto_device_fini  = mds_device_fini
539 };
540
541 static struct lu_device_type mds_device_type = {
542         .ldt_tags     = LU_DEVICE_MD,
543         .ldt_name     = LUSTRE_MDS_NAME,
544         .ldt_ops      = &mds_device_type_ops,
545         .ldt_ctx_tags = LCT_MD_THREAD
546 };
547
548 static int mds_health_check(const struct lu_env *env, struct obd_device *obd)
549 {
550         struct mds_device *mds = mds_dev(obd->obd_lu_dev);
551         int rc = 0;
552
553
554         mutex_lock(&mds->mds_health_mutex);
555         rc |= ptlrpc_service_health_check(mds->mds_regular_service);
556         rc |= ptlrpc_service_health_check(mds->mds_readpage_service);
557         rc |= ptlrpc_service_health_check(mds->mds_out_service);
558         rc |= ptlrpc_service_health_check(mds->mds_setattr_service);
559         rc |= ptlrpc_service_health_check(mds->mds_mdsc_service);
560         rc |= ptlrpc_service_health_check(mds->mds_mdss_service);
561         rc |= ptlrpc_service_health_check(mds->mds_fld_service);
562         mutex_unlock(&mds->mds_health_mutex);
563
564         return rc != 0 ? 1 : 0;
565 }
566
567 static struct obd_ops mds_obd_device_ops = {
568         .o_owner           = THIS_MODULE,
569         .o_health_check    = mds_health_check,
570 };
571
572 int mds_mod_init(void)
573 {
574         return class_register_type(&mds_obd_device_ops, NULL, true, NULL,
575                                    LUSTRE_MDS_NAME, &mds_device_type);
576 }
577
578 void mds_mod_exit(void)
579 {
580         class_unregister_type(LUSTRE_MDS_NAME);
581 }