Whamcloud - gitweb
LU-1940 ost: wait a while for OBD_FAIL_OST_EROFS
[fs/lustre-release.git] / lustre / ost / ost_handler.c
index 714d7e1..23c1420 100644 (file)
@@ -51,7 +51,6 @@
 #include <linux/init.h>
 #include <lprocfs_status.h>
 #include <libcfs/list.h>
-#include <lustre_quota.h>
 #include "ost_internal.h"
 
 static int oss_num_threads;
@@ -66,6 +65,23 @@ static int oss_num_create_threads;
 CFS_MODULE_PARM(oss_num_create_threads, "i", int, 0444,
                 "number of OSS create threads to start");
 
+static char *oss_cpts;
+CFS_MODULE_PARM(oss_cpts, "s", charp, 0444,
+               "CPU partitions OSS threads should run on");
+
+static char *oss_io_cpts;
+CFS_MODULE_PARM(oss_io_cpts, "s", charp, 0444,
+               "CPU partitions OSS IO threads should run on");
+
+/*
+ * This page is allocated once when the module is initialized.
+ * It is used to simulate data corruption, see ost_checksum_bulk()
+ * for details. As the original pages provided by the layers below
+ * may remain in the internal cache, we do not want to modify
+ * them.
+ */
+static struct page *ost_page_to_corrupt = NULL;
+
 /**
  * Do not return server-side uid/gid to remote client
  */
@@ -195,7 +211,7 @@ static int ost_destroy(struct obd_export *exp, struct ptlrpc_request *req,
  */
 static int ost_lock_get(struct obd_export *exp, struct obdo *oa,
                         __u64 start, __u64 count, struct lustre_handle *lh,
-                        int mode, int flags)
+                       int mode, __u64 flags)
 {
         struct ldlm_res_id res_id;
         ldlm_policy_data_t policy;
@@ -314,6 +330,9 @@ static int ost_statfs(struct ptlrpc_request *req)
         if (req->rq_status != 0)
                 CERROR("ost: statfs failed: rc %d\n", req->rq_status);
 
+       if (OBD_FAIL_CHECK(OBD_FAIL_OST_STATFS_EINPROGRESS))
+               req->rq_status = -EINPROGRESS;
+
         RETURN(0);
 }
 
@@ -350,8 +369,9 @@ static int ost_punch(struct obd_export *exp, struct ptlrpc_request *req,
                      struct obd_trans_info *oti)
 {
         struct ost_body *body, *repbody;
-        int rc, flags = 0;
+       __u64 flags = 0;
         struct lustre_handle lh = {0,};
+       int rc;
         ENTRY;
 
         /* check that we do support OBD_CONNECT_TRUNCLOCK. */
@@ -544,9 +564,19 @@ static __u32 ost_checksum_bulk(struct ptlrpc_bulk_desc *desc, int opc,
                    OBD_FAIL_CHECK(OBD_FAIL_OST_CHECKSUM_RECEIVE)) {
                        int off = desc->bd_iov[i].kiov_offset & ~CFS_PAGE_MASK;
                        int len = desc->bd_iov[i].kiov_len;
+                       struct page *np = ost_page_to_corrupt;
                        char *ptr = kmap(desc->bd_iov[i].kiov_page) + off;
-                       memcpy(ptr, "bad3", min(4, len));
-                       kunmap(desc->bd_iov[i].kiov_page);
+
+                       if (np) {
+                               char *ptr2 = kmap(np) + off;
+
+                               memcpy(ptr2, ptr, len);
+                               memcpy(ptr2, "bad3", min(4, len));
+                               kunmap(np);
+                               desc->bd_iov[i].kiov_page = np;
+                       } else {
+                               CERROR("can't alloc page for corruption\n");
+                       }
                }
                cfs_crypto_hash_update_page(hdesc, desc->bd_iov[i].kiov_page,
                                  desc->bd_iov[i].kiov_offset & ~CFS_PAGE_MASK,
@@ -558,11 +588,19 @@ static __u32 ost_checksum_bulk(struct ptlrpc_bulk_desc *desc, int opc,
                    OBD_FAIL_CHECK(OBD_FAIL_OST_CHECKSUM_SEND)) {
                        int off = desc->bd_iov[i].kiov_offset & ~CFS_PAGE_MASK;
                        int len = desc->bd_iov[i].kiov_len;
+                       struct page *np = ost_page_to_corrupt;
                        char *ptr = kmap(desc->bd_iov[i].kiov_page) + off;
-                       memcpy(ptr, "bad4", min(4, len));
-                       kunmap(desc->bd_iov[i].kiov_page);
-                       /* nobody should use corrupted page again */
-                       ClearPageUptodate(desc->bd_iov[i].kiov_page);
+
+                       if (np) {
+                               char *ptr2 = kmap(np) + off;
+
+                               memcpy(ptr2, ptr, len);
+                               memcpy(ptr2, "bad4", min(4, len));
+                               kunmap(np);
+                               desc->bd_iov[i].kiov_page = np;
+                       } else {
+                               CERROR("can't alloc page for corruption\n");
+                       }
                }
        }
 
@@ -578,7 +616,7 @@ static int ost_brw_lock_get(int mode, struct obd_export *exp,
                             struct obd_ioobj *obj, struct niobuf_remote *nb,
                             struct lustre_handle *lh)
 {
-        int flags                 = 0;
+       __u64 flags               = 0;
         int nrbufs                = obj->ioo_bufcnt;
         struct ldlm_res_id res_id;
         ldlm_policy_data_t policy;
@@ -628,7 +666,7 @@ static struct ost_thread_local_cache *ost_tls_get(struct ptlrpc_request *r)
 
         /* In normal mode of operation an I/O request is serviced only
          * by ll_ost_io threads each of them has own tls buffers allocated by
-         * ost_thread_init().
+         * ost_io_thread_init().
          * During recovery, an I/O request may be queued until any of the ost
          * service threads process it. Not necessary it should be one of
          * ll_ost_io threads. In that case we dynamically allocating tls
@@ -778,9 +816,9 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti)
                 nob += page_rc;
                 if (page_rc != 0) {             /* some data! */
                         LASSERT (local_nb[i].page != NULL);
-                        ptlrpc_prep_bulk_page(desc, local_nb[i].page,
-                                              local_nb[i].offset & ~CFS_PAGE_MASK,
-                                              page_rc);
+                       ptlrpc_prep_bulk_page_nopin(desc, local_nb[i].page,
+                                                   local_nb[i].lnb_page_offset,
+                                                   page_rc);
                 }
 
                 if (page_rc != local_nb[i].len) { /* short read */
@@ -828,7 +866,7 @@ out_tls:
         ost_tls_put(req);
 out_bulk:
         if (desc && !CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2))
-                ptlrpc_free_bulk(desc);
+               ptlrpc_free_bulk_nopin(desc);
 out:
         LASSERT(rc <= 0);
         if (rc == 0) {
@@ -862,7 +900,7 @@ out:
                 lwi1 = LWI_TIMEOUT_INTR(cfs_time_seconds(3), NULL, NULL, NULL);
                 l_wait_event(waitq, 0, &lwi1);
                 rc = target_bulk_io(exp, desc, &lwi);
-                ptlrpc_free_bulk(desc);
+               ptlrpc_free_bulk_nopin(desc);
         }
 
         RETURN(rc);
@@ -1012,9 +1050,9 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
         /* NB Having prepped, we must commit... */
 
         for (i = 0; i < npages; i++)
-                ptlrpc_prep_bulk_page(desc, local_nb[i].page,
-                                      local_nb[i].offset & ~CFS_PAGE_MASK,
-                                      local_nb[i].len);
+               ptlrpc_prep_bulk_page_nopin(desc, local_nb[i].page,
+                                           local_nb[i].lnb_page_offset,
+                                           local_nb[i].len);
 
         rc = sptlrpc_svc_prep_bulk(req, desc);
         if (rc != 0)
@@ -1101,8 +1139,8 @@ skip_transfer:
                                    body->oa.o_id,
                                    body->oa.o_valid & OBD_MD_FLGROUP ?
                                                 body->oa.o_seq : (__u64)0,
-                                   local_nb[0].offset,
-                                   local_nb[npages-1].offset +
+                                  local_nb[0].lnb_file_offset,
+                                  local_nb[npages-1].lnb_file_offset +
                                    local_nb[npages-1].len - 1 );
                 CERROR("client csum %x, original server csum %x, "
                        "server csum now %x\n",
@@ -1137,7 +1175,7 @@ out_tls:
         ost_tls_put(req);
 out_bulk:
         if (desc)
-                ptlrpc_free_bulk(desc);
+               ptlrpc_free_bulk_nopin(desc);
 out:
         if (rc == 0) {
                 oti_to_request(oti, req);
@@ -1288,7 +1326,6 @@ static int ost_get_info(struct obd_export *exp, struct ptlrpc_request *req)
         RETURN(rc);
 }
 
-#ifdef HAVE_QUOTA_SUPPORT
 static int ost_handle_quotactl(struct ptlrpc_request *req)
 {
         struct obd_quotactl *oqctl, *repoqc;
@@ -1325,35 +1362,11 @@ static int ost_handle_quotacheck(struct ptlrpc_request *req)
         if (rc)
                 RETURN(-ENOMEM);
 
-        req->rq_status = obd_quotacheck(req->rq_export, oqctl);
-        RETURN(0);
+       /* deprecated, not used any more */
+       req->rq_status = -EOPNOTSUPP;
+       RETURN(-EOPNOTSUPP);
 }
 
-static int ost_handle_quota_adjust_qunit(struct ptlrpc_request *req)
-{
-        struct quota_adjust_qunit *oqaq, *repoqa;
-        struct lustre_quota_ctxt *qctxt;
-        int rc;
-        ENTRY;
-
-        qctxt = &req->rq_export->exp_obd->u.obt.obt_qctxt;
-        oqaq = req_capsule_client_get(&req->rq_pill, &RMF_QUOTA_ADJUST_QUNIT);
-        if (oqaq == NULL)
-                GOTO(out, rc = -EPROTO);
-
-        rc = req_capsule_server_pack(&req->rq_pill);
-        if (rc)
-                GOTO(out, rc);
-
-        repoqa = req_capsule_server_get(&req->rq_pill, &RMF_QUOTA_ADJUST_QUNIT);
-        req->rq_status = obd_quota_adjust_qunit(req->rq_export, oqaq, qctxt, NULL);
-        *repoqa = *oqaq;
-
- out:
-        RETURN(rc);
-}
-#endif
-
 static int ost_llog_handle_connect(struct obd_export *exp,
                                    struct ptlrpc_request *req)
 {
@@ -1543,52 +1556,59 @@ static int ost_connect_check_sptlrpc(struct ptlrpc_request *req)
 
 /* Ensure that data and metadata are synced to the disk when lock is cancelled
  * (if requested) */
-int ost_blocking_ast(struct ldlm_lock *lock,
-                             struct ldlm_lock_desc *desc,
-                             void *data, int flag)
+int ost_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
+                    void *data, int flag)
 {
-        __u32 sync_lock_cancel = 0;
-        __u32 len = sizeof(sync_lock_cancel);
-        int rc = 0;
-        ENTRY;
-
-        rc = obd_get_info(NULL, lock->l_export, sizeof(KEY_SYNC_LOCK_CANCEL),
-                          KEY_SYNC_LOCK_CANCEL, &len, &sync_lock_cancel, NULL);
-
-        if (!rc && flag == LDLM_CB_CANCELING &&
-            (lock->l_granted_mode & (LCK_PW|LCK_GROUP)) &&
-            (sync_lock_cancel == ALWAYS_SYNC_ON_CANCEL ||
-             (sync_lock_cancel == BLOCKING_SYNC_ON_CANCEL &&
-              lock->l_flags & LDLM_FL_CBPENDING))) {
-                struct obd_info *oinfo;
-                struct obdo *oa;
-                int rc;
-
-                OBD_ALLOC_PTR(oinfo);
-                if (!oinfo)
-                        RETURN(-ENOMEM);
-                OBDO_ALLOC(oa);
-                if (!oa) {
-                        OBD_FREE_PTR(oinfo);
-                        RETURN(-ENOMEM);
-                }
-                oa->o_id = lock->l_resource->lr_name.name[0];
-                oa->o_seq = lock->l_resource->lr_name.name[1];
-                oa->o_valid = OBD_MD_FLID|OBD_MD_FLGROUP;
-                oinfo->oi_oa = oa;
-
-                rc = obd_sync(NULL, lock->l_export, oinfo,
-                              lock->l_policy_data.l_extent.start,
-                              lock->l_policy_data.l_extent.end, NULL);
-                if (rc)
-                        CERROR("Error %d syncing data on lock cancel\n", rc);
-
-                OBDO_FREE(oa);
-                OBD_FREE_PTR(oinfo);
-        }
+       struct lu_env   env;
+       __u32           sync_lock_cancel = 0;
+       __u32           len = sizeof(sync_lock_cancel);
+       int             rc = 0;
+
+       ENTRY;
+
+       rc = lu_env_init(&env, LCT_DT_THREAD);
+       if (unlikely(rc != 0))
+               RETURN(rc);
+
+       rc = obd_get_info(&env, lock->l_export, sizeof(KEY_SYNC_LOCK_CANCEL),
+                         KEY_SYNC_LOCK_CANCEL, &len, &sync_lock_cancel, NULL);
+       if (rc == 0 && flag == LDLM_CB_CANCELING &&
+           (lock->l_granted_mode & (LCK_PW|LCK_GROUP)) &&
+           (sync_lock_cancel == ALWAYS_SYNC_ON_CANCEL ||
+            (sync_lock_cancel == BLOCKING_SYNC_ON_CANCEL &&
+             lock->l_flags & LDLM_FL_CBPENDING))) {
+               struct obd_info *oinfo;
+               struct obdo     *oa;
+               /* reuse outer rc so GOTO(out_env, rc = ...) is returned */
+
+               OBD_ALLOC_PTR(oinfo);
+               if (!oinfo)
+                       GOTO(out_env, rc = -ENOMEM);
+               OBDO_ALLOC(oa);
+               if (!oa) {
+                       OBD_FREE_PTR(oinfo);
+                       GOTO(out_env, rc = -ENOMEM);
+               }
+               oa->o_id = lock->l_resource->lr_name.name[0];
+               oa->o_seq = lock->l_resource->lr_name.name[1];
+               oa->o_valid = OBD_MD_FLID|OBD_MD_FLGROUP;
+               oinfo->oi_oa = oa;
+               oinfo->oi_capa = BYPASS_CAPA;
+
+               rc = obd_sync(&env, lock->l_export, oinfo,
+                             lock->l_policy_data.l_extent.start,
+                             lock->l_policy_data.l_extent.end, NULL);
+               if (rc)
+                       CERROR("Error %d syncing data on lock cancel\n", rc);
+
+               OBDO_FREE(oa);
+               OBD_FREE_PTR(oinfo);
+       }
 
-        rc = ldlm_server_blocking_ast(lock, desc, data, flag);
-        RETURN(rc);
+       rc = ldlm_server_blocking_ast(lock, desc, data, flag);
+out_env:
+       lu_env_fini(&env);
+       RETURN(rc);
 }
 
 static int ost_filter_recovery_request(struct ptlrpc_request *req,
@@ -1648,11 +1668,8 @@ int ost_msg_check_version(struct lustre_msg *msg)
         case OST_SYNC:
         case OST_SET_INFO:
         case OST_GET_INFO:
-#ifdef HAVE_QUOTA_SUPPORT
         case OST_QUOTACHECK:
         case OST_QUOTACTL:
-        case OST_QUOTA_ADJUST_QUNIT:
-#endif
                 rc = lustre_msg_check_version(msg, LUSTRE_OST_VERSION);
                 if (rc)
                         CERROR("bad opc %u version %08x, expecting %08x\n",
@@ -1681,6 +1698,10 @@ int ost_msg_check_version(struct lustre_msg *msg)
                                lustre_msg_get_version(msg),
                                LUSTRE_LOG_VERSION);
                 break;
+       case OST_QUOTA_ADJUST_QUNIT:
+               rc = -ENOTSUPP;
+               CERROR("Quota adjust is deprecated as of 2.4.0\n");
+               break;
         default:
                 CERROR("Unexpected opcode %d\n", lustre_msg_get_opc(msg));
                 rc = -ENOTSUPP;
@@ -1704,12 +1725,13 @@ struct ost_prolong_data {
  */
 static inline int prolong_timeout(struct ptlrpc_request *req)
 {
-        struct ptlrpc_service *svc = req->rq_rqbd->rqbd_service;
+       struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt;
 
-        if (AT_OFF)
-                return obd_timeout / 2;
+       if (AT_OFF)
+               return obd_timeout / 2;
 
-        return max(at_est2timeout(at_get(&svc->srv_at_estimate)), ldlm_timeout);
+       return max(at_est2timeout(at_get(&svcpt->scp_at_estimate)),
+                  ldlm_timeout);
 }
 
 static void ost_prolong_lock_one(struct ost_prolong_data *opd,
@@ -1980,7 +2002,7 @@ struct ptlrpc_hpreq_ops ost_hpreq_punch = {
 };
 
 /** Assign high priority operations to the request if needed. */
-static int ost_hpreq_handler(struct ptlrpc_request *req)
+static int ost_io_hpreq_handler(struct ptlrpc_request *req)
 {
         ENTRY;
         if (req->rq_export) {
@@ -2204,18 +2226,18 @@ int ost_handle(struct ptlrpc_request *req)
                 req_capsule_set(&req->rq_pill, &RQF_OST_BRW_WRITE);
                 CDEBUG(D_INODE, "write\n");
                 /* req->rq_request_portal would be nice, if it was set */
-                if (req->rq_rqbd->rqbd_service->srv_req_portal !=OST_IO_PORTAL){
-                        CERROR("%s: deny write request from %s to portal %u\n",
-                               req->rq_export->exp_obd->obd_name,
-                               obd_export_nid2str(req->rq_export),
-                               req->rq_rqbd->rqbd_service->srv_req_portal);
+               if (ptlrpc_req2svc(req)->srv_req_portal != OST_IO_PORTAL) {
+                       CERROR("%s: deny write request from %s to portal %u\n",
+                              req->rq_export->exp_obd->obd_name,
+                              obd_export_nid2str(req->rq_export),
+                              ptlrpc_req2svc(req)->srv_req_portal);
                         GOTO(out, rc = -EPROTO);
                 }
                 if (OBD_FAIL_CHECK(OBD_FAIL_OST_BRW_NET))
                         RETURN(0);
                 if (OBD_FAIL_CHECK(OBD_FAIL_OST_ENOSPC))
                         GOTO(out, rc = -ENOSPC);
-                if (OBD_FAIL_CHECK(OBD_FAIL_OST_EROFS))
+                if (OBD_FAIL_TIMEOUT(OBD_FAIL_OST_EROFS, 1))
                         GOTO(out, rc = -EROFS);
                 rc = ost_brw_write(req, oti);
                 LASSERT(current->journal_info == NULL);
@@ -2225,11 +2247,11 @@ int ost_handle(struct ptlrpc_request *req)
                 req_capsule_set(&req->rq_pill, &RQF_OST_BRW_READ);
                 CDEBUG(D_INODE, "read\n");
                 /* req->rq_request_portal would be nice, if it was set */
-                if (req->rq_rqbd->rqbd_service->srv_req_portal !=OST_IO_PORTAL){
-                        CERROR("%s: deny read request from %s to portal %u\n",
-                               req->rq_export->exp_obd->obd_name,
-                               obd_export_nid2str(req->rq_export),
-                               req->rq_rqbd->rqbd_service->srv_req_portal);
+               if (ptlrpc_req2svc(req)->srv_req_portal != OST_IO_PORTAL) {
+                       CERROR("%s: deny read request from %s to portal %u\n",
+                              req->rq_export->exp_obd->obd_name,
+                              obd_export_nid2str(req->rq_export),
+                              ptlrpc_req2svc(req)->srv_req_portal);
                         GOTO(out, rc = -EPROTO);
                 }
                 if (OBD_FAIL_CHECK(OBD_FAIL_OST_BRW_NET))
@@ -2271,7 +2293,6 @@ int ost_handle(struct ptlrpc_request *req)
                 req_capsule_set(&req->rq_pill, &RQF_OST_GET_INFO_GENERIC);
                 rc = ost_get_info(req->rq_export, req);
                 break;
-#ifdef HAVE_QUOTA_SUPPORT
         case OST_QUOTACHECK:
                 CDEBUG(D_INODE, "quotacheck\n");
                 req_capsule_set(&req->rq_pill, &RQF_OST_QUOTACHECK);
@@ -2286,12 +2307,6 @@ int ost_handle(struct ptlrpc_request *req)
                         RETURN(0);
                 rc = ost_handle_quotactl(req);
                 break;
-        case OST_QUOTA_ADJUST_QUNIT:
-                CDEBUG(D_INODE, "quota_adjust_qunit\n");
-                req_capsule_set(&req->rq_pill, &RQF_OST_QUOTA_ADJUST_QUNIT);
-                rc = ost_handle_quota_adjust_qunit(req);
-                break;
-#endif
         case OBD_PING:
                 DEBUG_REQ(D_INODE, req, "ping");
                 req_capsule_set(&req->rq_pill, &RQF_OBD_PING);
@@ -2320,30 +2335,30 @@ int ost_handle(struct ptlrpc_request *req)
                 if (rc)
                         RETURN(rc);
                 RETURN(ptlrpc_reply(req));
-        case LDLM_ENQUEUE:
-                CDEBUG(D_INODE, "enqueue\n");
-                req_capsule_set(&req->rq_pill, &RQF_LDLM_ENQUEUE);
-                if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_ENQUEUE))
-                        RETURN(0);
-                rc = ldlm_handle_enqueue(req, ldlm_server_completion_ast,
-                                         ost_blocking_ast,
-                                         ldlm_server_glimpse_ast);
-                fail = OBD_FAIL_OST_LDLM_REPLY_NET;
-                break;
-        case LDLM_CONVERT:
-                CDEBUG(D_INODE, "convert\n");
-                req_capsule_set(&req->rq_pill, &RQF_LDLM_CONVERT);
-                if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CONVERT))
-                        RETURN(0);
-                rc = ldlm_handle_convert(req);
-                break;
-        case LDLM_CANCEL:
-                CDEBUG(D_INODE, "cancel\n");
-                req_capsule_set(&req->rq_pill, &RQF_LDLM_CANCEL);
-                if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL))
-                        RETURN(0);
-                rc = ldlm_handle_cancel(req);
-                break;
+       case LDLM_ENQUEUE:
+               CDEBUG(D_INODE, "enqueue\n");
+               req_capsule_set(&req->rq_pill, &RQF_LDLM_ENQUEUE);
+               if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_ENQUEUE_NET))
+                       RETURN(0);
+               rc = ldlm_handle_enqueue(req, ldlm_server_completion_ast,
+                                        ost_blocking_ast,
+                                        ldlm_server_glimpse_ast);
+               fail = OBD_FAIL_OST_LDLM_REPLY_NET;
+               break;
+       case LDLM_CONVERT:
+               CDEBUG(D_INODE, "convert\n");
+               req_capsule_set(&req->rq_pill, &RQF_LDLM_CONVERT);
+               if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CONVERT_NET))
+                       RETURN(0);
+               rc = ldlm_handle_convert(req);
+               break;
+       case LDLM_CANCEL:
+               CDEBUG(D_INODE, "cancel\n");
+               req_capsule_set(&req->rq_pill, &RQF_LDLM_CANCEL);
+               if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_NET))
+                       RETURN(0);
+               rc = ldlm_handle_cancel(req);
+               break;
         case LDLM_BL_CALLBACK:
         case LDLM_CP_CALLBACK:
                 CDEBUG(D_INODE, "callback\n");
@@ -2372,10 +2387,11 @@ out:
         return 0;
 }
 EXPORT_SYMBOL(ost_handle);
+
 /*
- * free per-thread pool created by ost_thread_init().
+ * free per-thread pool created by ost_io_thread_init().
  */
-static void ost_thread_done(struct ptlrpc_thread *thread)
+static void ost_io_thread_done(struct ptlrpc_thread *thread)
 {
         struct ost_thread_local_cache *tls; /* TLS stands for Thread-Local
                                              * Storage */
@@ -2386,7 +2402,7 @@ static void ost_thread_done(struct ptlrpc_thread *thread)
 
         /*
          * be prepared to handle partially-initialized pools (because this is
-         * called from ost_thread_init() for cleanup.
+         * called from ost_io_thread_init() for cleanup).
          */
         tls = thread->t_data;
         if (tls != NULL) {
@@ -2399,7 +2415,7 @@ static void ost_thread_done(struct ptlrpc_thread *thread)
 /*
  * initialize per-thread page pool (bug 5137).
  */
-static int ost_thread_init(struct ptlrpc_thread *thread)
+static int ost_io_thread_init(struct ptlrpc_thread *thread)
 {
         struct ost_thread_local_cache *tls;
 
@@ -2407,7 +2423,6 @@ static int ost_thread_init(struct ptlrpc_thread *thread)
 
         LASSERT(thread != NULL);
         LASSERT(thread->t_data == NULL);
-        LASSERTF(thread->t_id <= OSS_THREADS_MAX, "%u\n", thread->t_id);
 
         OBD_ALLOC_PTR(tls);
         if (tls == NULL)
@@ -2418,18 +2433,17 @@ static int ost_thread_init(struct ptlrpc_thread *thread)
 
 #define OST_WATCHDOG_TIMEOUT (obd_timeout * 1000)
 
+static struct cfs_cpt_table    *ost_io_cptable;
+
 /* Sigh - really, this is an OSS, the _server_, not the _target_ */
 static int ost_setup(struct obd_device *obd, struct lustre_cfg* lcfg)
 {
        static struct ptlrpc_service_conf       svc_conf;
-        struct ost_obd *ost = &obd->u.ost;
-        struct lprocfs_static_vars lvars;
-        int oss_min_threads;
-        int oss_max_threads;
-        int oss_min_create_threads;
-        int oss_max_create_threads;
-        int rc;
-        ENTRY;
+       struct ost_obd *ost = &obd->u.ost;
+       struct lprocfs_static_vars lvars;
+       nodemask_t              *mask;
+       int rc;
+       ENTRY;
 
         rc = cfs_cleanup_group_info();
         if (rc)
@@ -2440,26 +2454,6 @@ static int ost_setup(struct obd_device *obd, struct lustre_cfg* lcfg)
 
         cfs_mutex_init(&ost->ost_health_mutex);
 
-        if (oss_num_threads) {
-                /* If oss_num_threads is set, it is the min and the max. */
-                if (oss_num_threads > OSS_THREADS_MAX)
-                        oss_num_threads = OSS_THREADS_MAX;
-                if (oss_num_threads < OSS_THREADS_MIN)
-                        oss_num_threads = OSS_THREADS_MIN;
-                oss_max_threads = oss_min_threads = oss_num_threads;
-       } else {
-               /* Base min threads on memory and cpus */
-               oss_min_threads =
-                       cfs_num_online_cpus() * CFS_NUM_CACHEPAGES >>
-                       (27 - CFS_PAGE_SHIFT);
-               if (oss_min_threads < OSS_THREADS_MIN)
-                       oss_min_threads = OSS_THREADS_MIN;
-               /* Insure a 4x range for dynamic threads */
-               if (oss_min_threads > OSS_THREADS_MAX / 4)
-                       oss_min_threads = OSS_THREADS_MAX / 4;
-               oss_max_threads = min(OSS_THREADS_MAX, oss_min_threads * 4 + 1);
-       }
-
        svc_conf = (typeof(svc_conf)) {
                .psc_name               = LUSTRE_OSS_NAME,
                .psc_watchdog_factor    = OSS_SERVICE_WATCHDOG_FACTOR,
@@ -2473,13 +2467,21 @@ static int ost_setup(struct obd_device *obd, struct lustre_cfg* lcfg)
                },
                .psc_thr                = {
                        .tc_thr_name            = "ll_ost",
-                       .tc_nthrs_min           = oss_min_threads,
-                       .tc_nthrs_max           = oss_max_threads,
+                       .tc_thr_factor          = OSS_THR_FACTOR,
+                       .tc_nthrs_init          = OSS_NTHRS_INIT,
+                       .tc_nthrs_base          = OSS_NTHRS_BASE,
+                       .tc_nthrs_max           = OSS_NTHRS_MAX,
+                       .tc_nthrs_user          = oss_num_threads,
+                       .tc_cpu_affinity        = 1,
                        .tc_ctx_tags            = LCT_DT_THREAD,
                },
+               .psc_cpt                = {
+                       .cc_pattern             = oss_cpts,
+               },
                .psc_ops                = {
                        .so_req_handler         = ost_handle,
                        .so_req_printer         = target_print_req,
+                       .so_hpreq_handler       = ptlrpc_hpreq_handler,
                },
        };
        ost->ost_service = ptlrpc_register_service(&svc_conf,
@@ -2490,18 +2492,6 @@ static int ost_setup(struct obd_device *obd, struct lustre_cfg* lcfg)
                GOTO(out_lprocfs, rc);
         }
 
-        if (oss_num_create_threads) {
-                if (oss_num_create_threads > OSS_MAX_CREATE_THREADS)
-                        oss_num_create_threads = OSS_MAX_CREATE_THREADS;
-                if (oss_num_create_threads < OSS_MIN_CREATE_THREADS)
-                        oss_num_create_threads = OSS_MIN_CREATE_THREADS;
-                oss_min_create_threads = oss_max_create_threads =
-                        oss_num_create_threads;
-        } else {
-                oss_min_create_threads = OSS_MIN_CREATE_THREADS;
-                oss_max_create_threads = OSS_MAX_CREATE_THREADS;
-        }
-
        memset(&svc_conf, 0, sizeof(svc_conf));
        svc_conf = (typeof(svc_conf)) {
                .psc_name               = "ost_create",
@@ -2516,10 +2506,17 @@ static int ost_setup(struct obd_device *obd, struct lustre_cfg* lcfg)
                },
                .psc_thr                = {
                        .tc_thr_name            = "ll_ost_create",
-                       .tc_nthrs_min           = oss_min_create_threads,
-                       .tc_nthrs_max           = oss_max_create_threads,
+                       .tc_thr_factor          = OSS_CR_THR_FACTOR,
+                       .tc_nthrs_init          = OSS_CR_NTHRS_INIT,
+                       .tc_nthrs_base          = OSS_CR_NTHRS_BASE,
+                       .tc_nthrs_max           = OSS_CR_NTHRS_MAX,
+                       .tc_nthrs_user          = oss_num_create_threads,
+                       .tc_cpu_affinity        = 1,
                        .tc_ctx_tags            = LCT_DT_THREAD,
                },
+               .psc_cpt                = {
+                       .cc_pattern             = oss_cpts,
+               },
                .psc_ops                = {
                        .so_req_handler         = ost_handle,
                        .so_req_printer         = target_print_req,
@@ -2533,6 +2530,31 @@ static int ost_setup(struct obd_device *obd, struct lustre_cfg* lcfg)
                GOTO(out_service, rc);
         }
 
+       mask = cfs_cpt_table->ctb_nodemask;
+       /* even when CPT is disabled in libcfs (partition number set to 1),
+        * we still want to set node affinity for the io service */
+       if (cfs_cpt_number(cfs_cpt_table) == 1 && nodes_weight(*mask) > 1) {
+               int     cpt = 0;
+               int     i;
+
+               ost_io_cptable = cfs_cpt_table_alloc(nodes_weight(*mask));
+               for_each_node_mask(i, *mask) {
+                       if (ost_io_cptable == NULL) {
+                               CWARN("OSS failed to create CPT table\n");
+                               break;
+                       }
+
+                       rc = cfs_cpt_set_node(ost_io_cptable, cpt++, i);
+                       if (!rc) {
+                               CWARN("OSS Failed to set node %d for"
+                                     "IO CPT table\n", i);
+                               cfs_cpt_table_free(ost_io_cptable);
+                               ost_io_cptable = NULL;
+                               break;
+                       }
+               }
+       }
+
        memset(&svc_conf, 0, sizeof(svc_conf));
        svc_conf = (typeof(svc_conf)) {
                .psc_name               = "ost_io",
@@ -2547,16 +2569,24 @@ static int ost_setup(struct obd_device *obd, struct lustre_cfg* lcfg)
                },
                .psc_thr                = {
                        .tc_thr_name            = "ll_ost_io",
-                       .tc_nthrs_min           = oss_min_threads,
-                       .tc_nthrs_max           = oss_max_threads,
+                       .tc_thr_factor          = OSS_THR_FACTOR,
+                       .tc_nthrs_init          = OSS_NTHRS_INIT,
+                       .tc_nthrs_base          = OSS_NTHRS_BASE,
+                       .tc_nthrs_max           = OSS_NTHRS_MAX,
+                       .tc_nthrs_user          = oss_num_threads,
                        .tc_cpu_affinity        = 1,
                        .tc_ctx_tags            = LCT_DT_THREAD,
                },
+               .psc_cpt                = {
+                       .cc_cptable             = ost_io_cptable,
+                       .cc_pattern             = ost_io_cptable == NULL ?
+                                                 oss_io_cpts : NULL,
+               },
                .psc_ops                = {
-                       .so_thr_init            = ost_thread_init,
-                       .so_thr_done            = ost_thread_done,
+                       .so_thr_init            = ost_io_thread_init,
+                       .so_thr_done            = ost_io_thread_done,
                        .so_req_handler         = ost_handle,
-                       .so_hpreq_handler       = ost_hpreq_handler,
+                       .so_hpreq_handler       = ost_io_hpreq_handler,
                        .so_req_printer         = target_print_req,
                },
        };
@@ -2603,11 +2633,16 @@ static int ost_cleanup(struct obd_device *obd)
         ost->ost_create_service = NULL;
        ost->ost_io_service = NULL;
 
-        cfs_mutex_unlock(&ost->ost_health_mutex);
+       cfs_mutex_unlock(&ost->ost_health_mutex);
 
-        lprocfs_obd_cleanup(obd);
+       lprocfs_obd_cleanup(obd);
 
-        RETURN(err);
+       if (ost_io_cptable != NULL) {
+               cfs_cpt_table_free(ost_io_cptable);
+               ost_io_cptable = NULL;
+       }
+
+       RETURN(err);
 }
 
 static int ost_health_check(const struct lu_env *env, struct obd_device *obd)
@@ -2651,6 +2686,8 @@ static int __init ost_init(void)
         int rc;
         ENTRY;
 
+       ost_page_to_corrupt = cfs_alloc_page(CFS_ALLOC_STD);
+
         lprocfs_ost_init_vars(&lvars);
         rc = class_register_type(&ost_obd_ops, NULL, lvars.module_vars,
                                  LUSTRE_OSS_NAME, NULL);
@@ -2667,6 +2704,9 @@ static int __init ost_init(void)
 
 static void /*__exit*/ ost_exit(void)
 {
+       if (ost_page_to_corrupt)
+               page_cache_release(ost_page_to_corrupt);
+
         class_unregister_type(LUSTRE_OSS_NAME);
 }