Whamcloud - gitweb
LU-9859 libcfs: rename cfs_cpt_table to cfs_cpt_tab
[fs/lustre-release.git] / lustre / mdt / mdt_mds.c
index cbdf73a..860cdf9 100644 (file)
@@ -23,7 +23,7 @@
  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
  * Use is subject to license terms.
  *
- * Copyright (c) 2011, 2012 Intel Corporation
+ * Copyright (c) 2013, 2017, Intel Corporation.
  */
 /*
  * This file is part of Lustre, http://www.lustre.org/
 /* lu2dt_dev() */
 #include <dt_object.h>
 #include <lustre_mds.h>
-#include <lustre_mdt.h>
 #include "mdt_internal.h"
 #include <lustre_quota.h>
 #include <lustre_acl.h>
-#include <lustre_param.h>
-#include <lustre_fsfilt.h>
+#include <uapi/linux/lustre/lustre_param.h>
 
 struct mds_device {
        /* super-class */
-       struct md_device           mds_md_dev;
-       struct ptlrpc_service     *mds_regular_service;
-       struct ptlrpc_service     *mds_readpage_service;
-       struct ptlrpc_service     *mds_out_service;
-       struct ptlrpc_service     *mds_setattr_service;
-       struct ptlrpc_service     *mds_mdsc_service;
-       struct ptlrpc_service     *mds_mdss_service;
-       struct ptlrpc_service     *mds_fld_service;
+       struct md_device         mds_md_dev;
+       struct ptlrpc_service   *mds_regular_service;
+       struct ptlrpc_service   *mds_readpage_service;
+       struct ptlrpc_service   *mds_out_service;
+       struct ptlrpc_service   *mds_setattr_service;
+       struct ptlrpc_service   *mds_mdsc_service;
+       struct ptlrpc_service   *mds_mdss_service;
+       struct ptlrpc_service   *mds_fld_service;
+       struct ptlrpc_service   *mds_io_service;
+       struct mutex             mds_health_mutex;
 };
 
 /*
- *  * Initialized in mdt_mod_init().
+ *  * Initialized in mds_mod_init().
  *   */
-static unsigned long mdt_num_threads;
-CFS_MODULE_PARM(mdt_num_threads, "ul", ulong, 0444,
-               "number of MDS service threads to start "
-               "(deprecated in favor of mds_num_threads)");
-
 static unsigned long mds_num_threads;
-CFS_MODULE_PARM(mds_num_threads, "ul", ulong, 0444,
-               "number of MDS service threads to start");
-
-static char *mds_num_cpts;
-CFS_MODULE_PARM(mds_num_cpts, "c", charp, 0444,
-               "CPU partitions MDS threads should run on");
-
-static unsigned long mds_rdpg_num_threads;
-CFS_MODULE_PARM(mds_rdpg_num_threads, "ul", ulong, 0444,
-               "number of MDS readpage service threads to start");
+module_param(mds_num_threads, ulong, 0444);
+MODULE_PARM_DESC(mds_num_threads, "number of MDS service threads to start");
 
-static char *mds_rdpg_num_cpts;
-CFS_MODULE_PARM(mds_rdpg_num_cpts, "c", charp, 0444,
-               "CPU partitions MDS readpage threads should run on");
+static unsigned int mds_cpu_bind = 1;
+module_param(mds_cpu_bind, uint, 0444);
+MODULE_PARM_DESC(mds_cpu_bind,
+                "bind MDS threads to particular CPU partitions");
 
-/* NB: these two should be removed along with setattr service in the future */
-static unsigned long mds_attr_num_threads;
-CFS_MODULE_PARM(mds_attr_num_threads, "ul", ulong, 0444,
-               "number of MDS setattr service threads to start");
+int mds_max_io_threads = 512;
+module_param(mds_max_io_threads, int, 0444);
+MODULE_PARM_DESC(mds_max_io_threads,
+                "maximum number of MDS IO service threads");
 
-static char *mds_attr_num_cpts;
-CFS_MODULE_PARM(mds_attr_num_cpts, "c", charp, 0444,
-               "CPU partitions MDS setattr threads should run on");
-
-#define DEFINE_RPC_HANDLER(base, flags, opc, fn, fmt)                  \
-[opc - base] = {                                                       \
-       .mh_name        = #opc,                                         \
-       .mh_fail_id     = OBD_FAIL_ ## opc ## _NET,                     \
-       .mh_opc         = opc,                                          \
-       .mh_flags       = flags,                                        \
-       .mh_act         = fn,                                           \
-       .mh_fmt         = fmt                                           \
-}
-
-/* Request with a format known in advance */
-#define DEF_MDT_HDL(flags, name, fn)                                   \
-       DEFINE_RPC_HANDLER(MDS_GETATTR, flags, name, fn, &RQF_ ## name)
-
-/* Request with a format we do not yet know */
-#define DEF_MDT_HDL_VAR(flags, name, fn)                               \
-       DEFINE_RPC_HANDLER(MDS_GETATTR, flags, name, fn, NULL)
-
-/* Map one non-standard request format handler.  This should probably get
- * a common OBD_SET_INFO RPC opcode instead of this mismatch. */
-#define RQF_MDS_SET_INFO RQF_OBD_SET_INFO
-
-static struct mdt_handler mdt_mds_ops[] = {
-DEF_MDT_HDL(0,                         MDS_CONNECT,      mdt_connect),
-DEF_MDT_HDL(0,                         MDS_DISCONNECT,   mdt_disconnect),
-DEF_MDT_HDL(0,                         MDS_SET_INFO,     mdt_set_info),
-DEF_MDT_HDL(0,                         MDS_GET_INFO,     mdt_get_info),
-DEF_MDT_HDL(0          | HABEO_REFERO, MDS_GETSTATUS,    mdt_getstatus),
-DEF_MDT_HDL(HABEO_CORPUS,              MDS_GETATTR,      mdt_getattr),
-DEF_MDT_HDL(HABEO_CORPUS| HABEO_REFERO,        MDS_GETATTR_NAME, mdt_getattr_name),
-DEF_MDT_HDL(HABEO_CORPUS,              MDS_GETXATTR,     mdt_getxattr),
-DEF_MDT_HDL(0          | HABEO_REFERO, MDS_STATFS,       mdt_statfs),
-DEF_MDT_HDL(0          | MUTABOR,      MDS_REINT,        mdt_reint),
-DEF_MDT_HDL(HABEO_CORPUS,              MDS_CLOSE,        mdt_close),
-DEF_MDT_HDL(HABEO_CORPUS,              MDS_DONE_WRITING, mdt_done_writing),
-DEF_MDT_HDL(0          | HABEO_REFERO, MDS_PIN,          mdt_pin),
-DEF_MDT_HDL_VAR(0,                     MDS_SYNC,         mdt_sync),
-DEF_MDT_HDL(HABEO_CORPUS| HABEO_REFERO,        MDS_IS_SUBDIR,    mdt_is_subdir),
-DEF_MDT_HDL(0,                         MDS_QUOTACHECK,   mdt_quotacheck),
-DEF_MDT_HDL(0,                         MDS_QUOTACTL,     mdt_quotactl),
-DEF_MDT_HDL(0          | HABEO_REFERO, MDS_HSM_PROGRESS, mdt_hsm_progress),
-DEF_MDT_HDL(0          | HABEO_REFERO, MDS_HSM_CT_REGISTER,
-                                               mdt_hsm_ct_register),
-DEF_MDT_HDL(0          | HABEO_REFERO, MDS_HSM_CT_UNREGISTER,
-                                               mdt_hsm_ct_unregister),
-DEF_MDT_HDL(HABEO_CORPUS| HABEO_REFERO, MDS_HSM_STATE_GET,
-                                               mdt_hsm_state_get),
-DEF_MDT_HDL(HABEO_CORPUS| HABEO_REFERO, MDS_HSM_STATE_SET,
-                                               mdt_hsm_state_set),
-DEF_MDT_HDL(HABEO_CORPUS| HABEO_REFERO, MDS_HSM_ACTION, mdt_hsm_action),
-DEF_MDT_HDL(0          | HABEO_REFERO, MDS_HSM_REQUEST, mdt_hsm_request),
-DEF_MDT_HDL(HABEO_CORPUS|HABEO_REFERO, MDS_SWAP_LAYOUTS, mdt_swap_layouts)
-};
-
-#define DEF_OBD_HDL(flags, name, fn)                                   \
-       DEFINE_RPC_HANDLER(OBD_PING, flags, name, fn, NULL)
-
-static struct mdt_handler mdt_obd_ops[] = {
-DEF_OBD_HDL(0,                         OBD_PING,         mdt_obd_ping),
-DEF_OBD_HDL(0,                         OBD_LOG_CANCEL,   mdt_obd_log_cancel),
-DEF_OBD_HDL(0,                         OBD_QC_CALLBACK,  mdt_obd_qc_callback),
-DEF_OBD_HDL(0,                         OBD_IDX_READ,     mdt_obd_idx_read)
-};
+static unsigned int mds_io_cpu_bind = 1;
+module_param(mds_io_cpu_bind, uint, 0444);
+MODULE_PARM_DESC(mds_io_cpu_bind,
+                "bind MDS IO threads to particular CPU partitions");
 
-#define DEF_DLM_HDL_VAR(flags, name, fn)                               \
-       DEFINE_RPC_HANDLER(LDLM_ENQUEUE, flags, name, fn, NULL)
-#define DEF_DLM_HDL(flags, name, fn)                                   \
-       DEFINE_RPC_HANDLER(LDLM_ENQUEUE, flags, name, fn, &RQF_ ## name)
+static char *mds_io_num_cpts;
+module_param(mds_io_num_cpts, charp, 0444);
+MODULE_PARM_DESC(mds_io_num_cpts,
+                "CPU partitions MDS IO threads should run on");
 
-static struct mdt_handler mdt_dlm_ops[] = {
-DEF_DLM_HDL    (HABEO_CLAVIS,          LDLM_ENQUEUE,     mdt_enqueue),
-DEF_DLM_HDL_VAR(HABEO_CLAVIS,          LDLM_CONVERT,     mdt_convert),
-DEF_DLM_HDL_VAR(0,                     LDLM_BL_CALLBACK, mdt_bl_callback),
-DEF_DLM_HDL_VAR(0,                     LDLM_CP_CALLBACK, mdt_cp_callback)
-};
-
-#define DEF_LLOG_HDL(flags, name, fn)                                  \
-       DEFINE_RPC_HANDLER(LLOG_ORIGIN_HANDLE_CREATE, flags, name, fn, NULL)
-
-static struct mdt_handler mdt_llog_ops[] = {
-DEF_LLOG_HDL(0,                LLOG_ORIGIN_HANDLE_CREATE,        mdt_llog_create),
-DEF_LLOG_HDL(0,                LLOG_ORIGIN_HANDLE_NEXT_BLOCK,    mdt_llog_next_block),
-DEF_LLOG_HDL(0,                LLOG_ORIGIN_HANDLE_READ_HEADER,   mdt_llog_read_header),
-DEF_LLOG_HDL(0,                LLOG_ORIGIN_HANDLE_WRITE_REC,     NULL),
-DEF_LLOG_HDL(0,                LLOG_ORIGIN_HANDLE_CLOSE,         NULL),
-DEF_LLOG_HDL(0,                LLOG_ORIGIN_CONNECT,              NULL),
-DEF_LLOG_HDL(0,                LLOG_CATINFO,                     NULL),
-DEF_LLOG_HDL(0,                LLOG_ORIGIN_HANDLE_PREV_BLOCK,    mdt_llog_prev_block),
-DEF_LLOG_HDL(0,                LLOG_ORIGIN_HANDLE_DESTROY,       mdt_llog_destroy),
-};
-
-#define DEF_SEC_HDL(flags, name, fn)                                   \
-       DEFINE_RPC_HANDLER(SEC_CTX_INIT, flags, name, fn, NULL)
-
-static struct mdt_handler mdt_sec_ctx_ops[] = {
-DEF_SEC_HDL(0,                         SEC_CTX_INIT,     mdt_sec_ctx_handle),
-DEF_SEC_HDL(0,                         SEC_CTX_INIT_CONT,mdt_sec_ctx_handle),
-DEF_SEC_HDL(0,                         SEC_CTX_FINI,     mdt_sec_ctx_handle)
-};
-
-#define DEF_QUOTA_HDL(flags, name, fn)                         \
-       DEFINE_RPC_HANDLER(QUOTA_DQACQ, flags, name, fn, &RQF_ ## name)
-
-static struct mdt_handler mdt_quota_ops[] = {
-DEF_QUOTA_HDL(HABEO_REFERO,            QUOTA_DQACQ,      mdt_quota_dqacq),
-};
-
-struct mdt_opc_slice mdt_regular_handlers[] = {
-       {
-               .mos_opc_start  = MDS_GETATTR,
-               .mos_opc_end    = MDS_LAST_OPC,
-               .mos_hs         = mdt_mds_ops
-       },
-       {
-               .mos_opc_start  = OBD_PING,
-               .mos_opc_end    = OBD_LAST_OPC,
-               .mos_hs         = mdt_obd_ops
-       },
-       {
-               .mos_opc_start  = LDLM_ENQUEUE,
-               .mos_opc_end    = LDLM_LAST_OPC,
-               .mos_hs         = mdt_dlm_ops
-       },
-       {
-               .mos_opc_start  = LLOG_ORIGIN_HANDLE_CREATE,
-               .mos_opc_end    = LLOG_LAST_OPC,
-               .mos_hs         = mdt_llog_ops
-       },
-       {
-               .mos_opc_start  = SEC_CTX_INIT,
-               .mos_opc_end    = SEC_LAST_OPC,
-               .mos_hs         = mdt_sec_ctx_ops
-       },
-       {
-               .mos_opc_start  = QUOTA_DQACQ,
-               .mos_opc_end    = QUOTA_LAST_OPC,
-               .mos_hs         = mdt_quota_ops
-       },
-       {
-               .mos_hs         = NULL
-       }
-};
-
-/* Readpage/readdir handlers */
-static struct mdt_handler mdt_readpage_ops[] = {
-DEF_MDT_HDL(0,                 MDS_CONNECT,  mdt_connect),
-DEF_MDT_HDL(HABEO_CORPUS | HABEO_REFERO, MDS_READPAGE, mdt_readpage),
-/* XXX: this is ugly and should be fixed one day, see mdc_close() for
- * detailed comments. --umka */
-DEF_MDT_HDL(HABEO_CORPUS,              MDS_CLOSE,        mdt_close),
-DEF_MDT_HDL(HABEO_CORPUS,              MDS_DONE_WRITING, mdt_done_writing),
-};
+static struct cfs_cpt_table *mdt_io_cptable;
 
-static struct mdt_opc_slice mdt_readpage_handlers[] = {
-       {
-               .mos_opc_start = MDS_GETATTR,
-               .mos_opc_end   = MDS_LAST_OPC,
-               .mos_hs = mdt_readpage_ops
-       },
-       {
-               .mos_opc_start = OBD_FIRST_OPC,
-               .mos_opc_end   = OBD_LAST_OPC,
-               .mos_hs = mdt_obd_ops
-       },
-       {
-               .mos_hs = NULL
-       }
-};
-
-/* Sequence service handlers */
-#define DEF_SEQ_HDL(flags, name, fn)                                   \
-       DEFINE_RPC_HANDLER(SEQ_QUERY, flags, name, fn, &RQF_ ## name)
-
-static struct mdt_handler mdt_seq_ops[] = {
-DEF_SEQ_HDL(0,                         SEQ_QUERY,        (void *)seq_query),
-};
-
-struct mdt_opc_slice mdt_seq_handlers[] = {
-       {
-               .mos_opc_start = SEQ_QUERY,
-               .mos_opc_end   = SEQ_LAST_OPC,
-               .mos_hs = mdt_seq_ops
-       },
-       {
-               .mos_hs = NULL
-       }
-};
-
-/* FID Location Database handlers */
-#define DEF_FLD_HDL(flags, name, fn)                                   \
-       DEFINE_RPC_HANDLER(FLD_QUERY, flags, name, fn, &RQF_ ## name)
-
-static struct mdt_handler mdt_fld_ops[] = {
-DEF_FLD_HDL(0,                         FLD_QUERY,        (void *)fld_query),
-};
-
-struct mdt_opc_slice mdt_fld_handlers[] = {
-       {
-               .mos_opc_start = FLD_QUERY,
-               .mos_opc_end   = FLD_LAST_OPC,
-               .mos_hs = mdt_fld_ops
-       },
-       {
-               .mos_hs = NULL
-       }
-};
-
-/* Request with a format known in advance */
-#define DEF_UPDATE_HDL(flags, name, fn)                                        \
-       DEFINE_RPC_HANDLER(UPDATE_OBJ, flags, name, fn, &RQF_ ## name)
-
-#define target_handler mdt_handler
-static struct target_handler out_ops[] = {
-       DEF_UPDATE_HDL(MUTABOR,         UPDATE_OBJ,     out_handle),
-};
-
-static struct mdt_opc_slice update_handlers[] = {
-       {
-               .mos_opc_start = MDS_GETATTR,
-               .mos_opc_end   = MDS_LAST_OPC,
-               .mos_hs        = mdt_mds_ops
-       },
-       {
-               .mos_opc_start = OBD_PING,
-               .mos_opc_end   = OBD_LAST_OPC,
-               .mos_hs        = mdt_obd_ops
-       },
-       {
-               .mos_opc_start = LDLM_ENQUEUE,
-               .mos_opc_end   = LDLM_LAST_OPC,
-               .mos_hs        = mdt_dlm_ops
-       },
-       {
-               .mos_opc_start = SEC_CTX_INIT,
-               .mos_opc_end   = SEC_LAST_OPC,
-               .mos_hs        = mdt_sec_ctx_ops
-       },
-       {
-               .mos_opc_start = UPDATE_OBJ,
-               .mos_opc_end   = UPDATE_LAST_OPC,
-               .mos_hs        = out_ops
-       },
-       {
-               .mos_hs        = NULL
-       }
-};
+static char *mds_num_cpts;
+module_param(mds_num_cpts, charp, 0444);
+MODULE_PARM_DESC(mds_num_cpts, "CPU partitions MDS threads should run on");
 
-static int mds_regular_handle(struct ptlrpc_request *req)
-{
-       return mdt_handle_common(req, mdt_regular_handlers);
-}
+static unsigned long mds_rdpg_num_threads;
+module_param(mds_rdpg_num_threads, ulong, 0444);
+MODULE_PARM_DESC(mds_rdpg_num_threads,
+                "number of MDS readpage service threads to start");
 
-static int mds_readpage_handle(struct ptlrpc_request *req)
-{
-       return mdt_handle_common(req, mdt_readpage_handlers);
-}
+static unsigned int mds_rdpg_cpu_bind = 1;
+module_param(mds_rdpg_cpu_bind, uint, 0444);
+MODULE_PARM_DESC(mds_rdpg_cpu_bind,
+                "bind MDS readpage threads to particular CPU partitions");
 
-static int mds_mdsc_handle(struct ptlrpc_request *req)
-{
-       return mdt_handle_common(req, mdt_seq_handlers);
-}
+static char *mds_rdpg_num_cpts;
+module_param(mds_rdpg_num_cpts, charp, 0444);
+MODULE_PARM_DESC(mds_rdpg_num_cpts,
+                "CPU partitions MDS readpage threads should run on");
 
-static int mdt_out_handle(struct ptlrpc_request *req)
-{
-       return mdt_handle_common(req, update_handlers);
-}
+/* NB: these should be removed along with the setattr service in the future */
+static unsigned long mds_attr_num_threads;
+module_param(mds_attr_num_threads, ulong, 0444);
+MODULE_PARM_DESC(mds_attr_num_threads,
+                "number of MDS setattr service threads to start");
 
-static int mds_mdss_handle(struct ptlrpc_request *req)
-{
-       return mdt_handle_common(req, mdt_seq_handlers);
-}
+static unsigned int mds_attr_cpu_bind = 1;
+module_param(mds_attr_cpu_bind, uint, 0444);
+MODULE_PARM_DESC(mds_attr_cpu_bind,
+                "bind MDS setattr threads to particular CPU partitions");
 
-static int mds_fld_handle(struct ptlrpc_request *req)
-{
-       return mdt_handle_common(req, mdt_fld_handlers);
-}
+static char *mds_attr_num_cpts;
+module_param(mds_attr_num_cpts, charp, 0444);
+MODULE_PARM_DESC(mds_attr_num_cpts,
+                "CPU partitions MDS setattr threads should run on");
 
 /* device init/fini methods */
 static void mds_stop_ptlrpc_service(struct mds_device *m)
 {
        ENTRY;
+
+       mutex_lock(&m->mds_health_mutex);
        if (m->mds_regular_service != NULL) {
                ptlrpc_unregister_service(m->mds_regular_service);
                m->mds_regular_service = NULL;
@@ -410,6 +166,17 @@ static void mds_stop_ptlrpc_service(struct mds_device *m)
                ptlrpc_unregister_service(m->mds_fld_service);
                m->mds_fld_service = NULL;
        }
+       if (m->mds_io_service != NULL) {
+               ptlrpc_unregister_service(m->mds_io_service);
+               m->mds_io_service = NULL;
+       }
+       mutex_unlock(&m->mds_health_mutex);
+
+       if (mdt_io_cptable != NULL) {
+               cfs_cpt_table_free(mdt_io_cptable);
+               mdt_io_cptable = NULL;
+       }
+
        EXIT;
 }
 
@@ -417,21 +184,19 @@ static int mds_start_ptlrpc_service(struct mds_device *m)
 {
        static struct ptlrpc_service_conf conf;
        struct obd_device *obd = m->mds_md_dev.md_lu_dev.ld_obd;
-       cfs_proc_dir_entry_t *procfs_entry;
+       nodemask_t *mask;
        int rc = 0;
-       ENTRY;
 
-       procfs_entry = obd->obd_proc_entry;
-       LASSERT(procfs_entry != NULL);
+       ENTRY;
 
        conf = (typeof(conf)) {
                .psc_name               = LUSTRE_MDT_NAME,
                .psc_watchdog_factor    = MDT_SERVICE_WATCHDOG_FACTOR,
                .psc_buf                = {
                        .bc_nbufs               = MDS_NBUFS,
-                       .bc_buf_size            = MDS_LOV_BUFSIZE,
-                       .bc_req_max_size        = MDS_LOV_MAXREQSIZE,
-                       .bc_rep_max_size        = MDS_LOV_MAXREPSIZE,
+                       .bc_buf_size            = MDS_REG_BUFSIZE,
+                       .bc_req_max_size        = MDS_REG_MAXREQSIZE,
+                       .bc_rep_max_size        = MDS_REG_MAXREPSIZE,
                        .bc_req_portal          = MDS_REQUEST_PORTAL,
                        .bc_rep_portal          = MDC_REPLY_PORTAL,
                },
@@ -446,19 +211,24 @@ static int mds_start_ptlrpc_service(struct mds_device *m)
                        .tc_nthrs_base          = MDS_NTHRS_BASE,
                        .tc_nthrs_max           = MDS_NTHRS_MAX,
                        .tc_nthrs_user          = mds_num_threads,
-                       .tc_cpu_affinity        = 1,
-                       .tc_ctx_tags            = LCT_MD_THREAD,
+                       .tc_cpu_bind            = mds_cpu_bind,
+                       /* LCT_DT_THREAD is required as MDT threads may scan
+                        * all LDLM namespaces (including OFD-originated) to
+                        * cancel LDLM locks */
+                       .tc_ctx_tags            = LCT_MD_THREAD | LCT_DT_THREAD,
                },
                .psc_cpt                = {
                        .cc_pattern             = mds_num_cpts,
+                       .cc_affinity            = true,
                },
                .psc_ops                = {
-                       .so_req_handler         = mds_regular_handle,
+                       .so_req_handler         = tgt_request_handle,
                        .so_req_printer         = target_print_req,
                        .so_hpreq_handler       = ptlrpc_hpreq_handler,
                },
        };
-       m->mds_regular_service = ptlrpc_register_service(&conf, procfs_entry);
+       m->mds_regular_service = ptlrpc_register_service(&conf, &obd->obd_kset,
+                                                        obd->obd_debugfs_entry);
        if (IS_ERR(m->mds_regular_service)) {
                rc = PTR_ERR(m->mds_regular_service);
                CERROR("failed to start regular mdt service: %d\n", rc);
@@ -490,18 +260,20 @@ static int mds_start_ptlrpc_service(struct mds_device *m)
                        .tc_nthrs_base          = MDS_RDPG_NTHRS_BASE,
                        .tc_nthrs_max           = MDS_RDPG_NTHRS_MAX,
                        .tc_nthrs_user          = mds_rdpg_num_threads,
-                       .tc_cpu_affinity        = 1,
+                       .tc_cpu_bind            = mds_rdpg_cpu_bind,
                        .tc_ctx_tags            = LCT_MD_THREAD,
                },
                .psc_cpt                = {
                        .cc_pattern             = mds_rdpg_num_cpts,
+                       .cc_affinity            = true,
                },
                .psc_ops                = {
-                       .so_req_handler         = mds_readpage_handle,
+                       .so_req_handler         = tgt_request_handle,
                        .so_req_printer         = target_print_req,
                },
        };
-       m->mds_readpage_service = ptlrpc_register_service(&conf, procfs_entry);
+       m->mds_readpage_service = ptlrpc_register_service(&conf, &obd->obd_kset,
+                                                         obd->obd_debugfs_entry);
        if (IS_ERR(m->mds_readpage_service)) {
                rc = PTR_ERR(m->mds_readpage_service);
                CERROR("failed to start readpage service: %d\n", rc);
@@ -536,19 +308,21 @@ static int mds_start_ptlrpc_service(struct mds_device *m)
                        .tc_nthrs_base          = MDS_SETA_NTHRS_BASE,
                        .tc_nthrs_max           = MDS_SETA_NTHRS_MAX,
                        .tc_nthrs_user          = mds_attr_num_threads,
-                       .tc_cpu_affinity        = 1,
+                       .tc_cpu_bind            = mds_attr_cpu_bind,
                        .tc_ctx_tags            = LCT_MD_THREAD,
                },
                .psc_cpt                = {
                        .cc_pattern             = mds_attr_num_cpts,
+                       .cc_affinity            = true,
                },
                .psc_ops                = {
-                       .so_req_handler         = mds_regular_handle,
+                       .so_req_handler         = tgt_request_handle,
                        .so_req_printer         = target_print_req,
                        .so_hpreq_handler       = NULL,
                },
        };
-       m->mds_setattr_service = ptlrpc_register_service(&conf, procfs_entry);
+       m->mds_setattr_service = ptlrpc_register_service(&conf, &obd->obd_kset,
+                                                        obd->obd_debugfs_entry);
        if (IS_ERR(m->mds_setattr_service)) {
                rc = PTR_ERR(m->mds_setattr_service);
                CERROR("failed to start setattr service: %d\n", rc);
@@ -563,11 +337,11 @@ static int mds_start_ptlrpc_service(struct mds_device *m)
                .psc_watchdog_factor    = MDT_SERVICE_WATCHDOG_FACTOR,
                .psc_buf                = {
                        .bc_nbufs               = MDS_NBUFS,
-                       .bc_buf_size            = MDS_OUT_BUFSIZE,
-                       .bc_req_max_size        = MDS_OUT_MAXREQSIZE,
-                       .bc_rep_max_size        = MDS_OUT_MAXREPSIZE,
-                       .bc_req_portal          = MDS_MDS_PORTAL,
-                       .bc_rep_portal          = MDC_REPLY_PORTAL,
+                       .bc_buf_size            = OUT_BUFSIZE,
+                       .bc_req_max_size        = OUT_MAXREQSIZE,
+                       .bc_rep_max_size        = OUT_MAXREPSIZE,
+                       .bc_req_portal          = OUT_PORTAL,
+                       .bc_rep_portal          = OSC_REPLY_PORTAL,
                },
                /*
                 * We'd like to have a mechanism to set this on a per-device
@@ -580,19 +354,22 @@ static int mds_start_ptlrpc_service(struct mds_device *m)
                        .tc_nthrs_base          = MDS_NTHRS_BASE,
                        .tc_nthrs_max           = MDS_NTHRS_MAX,
                        .tc_nthrs_user          = mds_num_threads,
-                       .tc_cpu_affinity        = 1,
-                       .tc_ctx_tags            = LCT_MD_THREAD,
+                       .tc_cpu_bind            = mds_cpu_bind,
+                       .tc_ctx_tags            = LCT_MD_THREAD |
+                                                 LCT_DT_THREAD,
                },
                .psc_cpt                = {
                        .cc_pattern             = mds_num_cpts,
+                       .cc_affinity            = true,
                },
                .psc_ops                = {
-                       .so_req_handler         = mdt_out_handle,
+                       .so_req_handler         = tgt_request_handle,
                        .so_req_printer         = target_print_req,
                        .so_hpreq_handler       = NULL,
                },
        };
-       m->mds_out_service = ptlrpc_register_service(&conf, procfs_entry);
+       m->mds_out_service = ptlrpc_register_service(&conf, &obd->obd_kset,
+                                                    obd->obd_debugfs_entry);
        if (IS_ERR(m->mds_out_service)) {
                rc = PTR_ERR(m->mds_out_service);
                CERROR("failed to start out service: %d\n", rc);
@@ -622,12 +399,13 @@ static int mds_start_ptlrpc_service(struct mds_device *m)
                        .tc_ctx_tags            = LCT_MD_THREAD,
                },
                .psc_ops                = {
-                       .so_req_handler         = mds_mdsc_handle,
+                       .so_req_handler         = tgt_request_handle,
                        .so_req_printer         = target_print_req,
                        .so_hpreq_handler       = NULL,
                },
        };
-       m->mds_mdsc_service = ptlrpc_register_service(&conf, procfs_entry);
+       m->mds_mdsc_service = ptlrpc_register_service(&conf, &obd->obd_kset,
+                                                     obd->obd_debugfs_entry);
        if (IS_ERR(m->mds_mdsc_service)) {
                rc = PTR_ERR(m->mds_mdsc_service);
                CERROR("failed to start seq controller service: %d\n", rc);
@@ -658,12 +436,13 @@ static int mds_start_ptlrpc_service(struct mds_device *m)
                        .tc_ctx_tags            = LCT_MD_THREAD | LCT_DT_THREAD
                },
                .psc_ops                = {
-                       .so_req_handler         = mds_mdss_handle,
+                       .so_req_handler         = tgt_request_handle,
                        .so_req_printer         = target_print_req,
                        .so_hpreq_handler       = NULL,
                },
        };
-       m->mds_mdss_service = ptlrpc_register_service(&conf, procfs_entry);
+       m->mds_mdss_service = ptlrpc_register_service(&conf, &obd->obd_kset,
+                                                     obd->obd_debugfs_entry);
        if (IS_ERR(m->mds_mdss_service)) {
                rc = PTR_ERR(m->mds_mdss_service);
                CERROR("failed to start metadata seq server service: %d\n", rc);
@@ -689,15 +468,16 @@ static int mds_start_ptlrpc_service(struct mds_device *m)
                        .tc_thr_name            = LUSTRE_MDT_NAME "_fld",
                        .tc_nthrs_init          = MDS_OTHR_NTHRS_INIT,
                        .tc_nthrs_max           = MDS_OTHR_NTHRS_MAX,
-                       .tc_ctx_tags            = LCT_DT_THREAD | LCT_MD_THREAD
+                       .tc_ctx_tags            = LCT_DT_THREAD | LCT_MD_THREAD,
                },
                .psc_ops                = {
-                       .so_req_handler         = mds_fld_handle,
+                       .so_req_handler         = tgt_request_handle,
                        .so_req_printer         = target_print_req,
                        .so_hpreq_handler       = NULL,
                },
        };
-       m->mds_fld_service = ptlrpc_register_service(&conf, procfs_entry);
+       m->mds_fld_service = ptlrpc_register_service(&conf, &obd->obd_kset,
+                                                    obd->obd_debugfs_entry);
        if (IS_ERR(m->mds_fld_service)) {
                rc = PTR_ERR(m->mds_fld_service);
                CERROR("failed to start fld service: %d\n", rc);
@@ -706,6 +486,77 @@ static int mds_start_ptlrpc_service(struct mds_device *m)
                GOTO(err_mds_svc, rc);
        }
 
+
+       mask = cfs_cpt_nodemask(cfs_cpt_tab, CFS_CPT_ANY);
+       /* even if the CPT feature is disabled at the libcfs level by setting
+        * the partition number to 1, we still want node affinity for io service */
+       if (cfs_cpt_number(cfs_cpt_tab) == 1 && nodes_weight(*mask) > 1) {
+               int cpt = 0;
+               int i;
+
+               mdt_io_cptable = cfs_cpt_table_alloc(nodes_weight(*mask));
+               for_each_node_mask(i, *mask) {
+                       if (mdt_io_cptable == NULL) {
+                               CWARN("MDS failed to create CPT table\n");
+                               break;
+                       }
+
+                       rc = cfs_cpt_set_node(mdt_io_cptable, cpt++, i);
+                       if (!rc) {
+                               CWARN("MDS Failed to set node %d for IO CPT table\n",
+                                     i);
+                               cfs_cpt_table_free(mdt_io_cptable);
+                               mdt_io_cptable = NULL;
+                               break;
+                       }
+               }
+       }
+
+       memset(&conf, 0, sizeof(conf));
+       conf = (typeof(conf)) {
+               .psc_name               = LUSTRE_MDT_NAME "_io",
+               .psc_watchdog_factor    = MDT_SERVICE_WATCHDOG_FACTOR,
+               .psc_buf                = {
+                       .bc_nbufs               = OST_NBUFS,
+                       .bc_buf_size            = OST_IO_BUFSIZE,
+                       .bc_req_max_size        = OST_IO_MAXREQSIZE,
+                       .bc_rep_max_size        = OST_IO_MAXREPSIZE,
+                       .bc_req_portal          = MDS_IO_PORTAL,
+                       .bc_rep_portal          = MDC_REPLY_PORTAL,
+               },
+               .psc_thr                = {
+                       .tc_thr_name            = LUSTRE_MDT_NAME "_io",
+                       .tc_thr_factor          = OSS_THR_FACTOR,
+                       .tc_nthrs_init          = OSS_NTHRS_INIT,
+                       .tc_nthrs_base          = OSS_NTHRS_BASE,
+                       .tc_nthrs_max           = mds_max_io_threads,
+                       .tc_nthrs_user          = mds_num_threads,
+                       .tc_cpu_bind            = mds_io_cpu_bind,
+                       .tc_ctx_tags            = LCT_DT_THREAD | LCT_MD_THREAD,
+               },
+               .psc_cpt                = {
+                       .cc_cptable             = mdt_io_cptable,
+                       .cc_pattern             = mdt_io_cptable == NULL ?
+                                                 mds_io_num_cpts : NULL,
+                       .cc_affinity            = true,
+               },
+               .psc_ops                = {
+                       .so_thr_init            = tgt_io_thread_init,
+                       .so_thr_done            = tgt_io_thread_done,
+                       .so_req_handler         = tgt_request_handle,
+                       .so_req_printer         = target_print_req,
+                       .so_hpreq_handler       = tgt_hpreq_handler,
+               },
+       };
+       m->mds_io_service = ptlrpc_register_service(&conf, &obd->obd_kset,
+                                                   obd->obd_debugfs_entry);
+       if (IS_ERR(m->mds_io_service)) {
+               rc = PTR_ERR(m->mds_io_service);
+               CERROR("failed to start MDT I/O service: %d\n", rc);
+               m->mds_io_service = NULL;
+               GOTO(err_mds_svc, rc);
+       }
+
        EXIT;
 err_mds_svc:
        if (rc)
@@ -765,21 +616,22 @@ static struct lu_device *mds_device_alloc(const struct lu_env *env,
        /* set this lu_device to obd, because error handling need it */
        obd->obd_lu_dev = l;
 
-       rc = lprocfs_obd_setup(obd, lprocfs_mds_obd_vars);
+       rc = lprocfs_obd_setup(obd, true);
        if (rc != 0) {
                mds_device_free(env, l);
                l = ERR_PTR(rc);
                return l;
        }
 
-       rc = mds_start_ptlrpc_service(m);
+       mutex_init(&m->mds_health_mutex);
 
+       rc = mds_start_ptlrpc_service(m);
        if (rc != 0) {
+               lprocfs_obd_cleanup(obd);
                mds_device_free(env, l);
                l = ERR_PTR(rc);
                return l;
        }
-
        return l;
 }
 
@@ -805,25 +657,35 @@ static struct lu_device_type mds_device_type = {
        .ldt_ctx_tags = LCT_MD_THREAD
 };
 
-static struct obd_ops mds_obd_device_ops = {
+static int mds_health_check(const struct lu_env *env, struct obd_device *obd)
+{
+       struct mds_device *mds = mds_dev(obd->obd_lu_dev);
+       int rc = 0;
+
+
+       mutex_lock(&mds->mds_health_mutex);
+       rc |= ptlrpc_service_health_check(mds->mds_regular_service);
+       rc |= ptlrpc_service_health_check(mds->mds_readpage_service);
+       rc |= ptlrpc_service_health_check(mds->mds_out_service);
+       rc |= ptlrpc_service_health_check(mds->mds_setattr_service);
+       rc |= ptlrpc_service_health_check(mds->mds_mdsc_service);
+       rc |= ptlrpc_service_health_check(mds->mds_mdss_service);
+       rc |= ptlrpc_service_health_check(mds->mds_fld_service);
+       rc |= ptlrpc_service_health_check(mds->mds_io_service);
+       mutex_unlock(&mds->mds_health_mutex);
+
+       return rc != 0 ? 1 : 0;
+}
+
+static const struct obd_ops mds_obd_device_ops = {
        .o_owner           = THIS_MODULE,
+       .o_health_check    = mds_health_check,
 };
 
 int mds_mod_init(void)
 {
-       int rc;
-
-       if (mdt_num_threads != 0 && mds_num_threads == 0) {
-               LCONSOLE_INFO("mdt_num_threads module parameter is deprecated, "
-                             "use mds_num_threads instead or unset both for "
-                             "dynamic thread startup\n");
-               mds_num_threads = mdt_num_threads;
-       }
-
-       rc = class_register_type(&mds_obd_device_ops, NULL,
-                                lprocfs_mds_module_vars, LUSTRE_MDS_NAME,
-                                &mds_device_type);
-       return rc;
+       return class_register_type(&mds_obd_device_ops, NULL, false, NULL,
+                                  LUSTRE_MDS_NAME, &mds_device_type);
 }
 
 void mds_mod_exit(void)