Whamcloud - gitweb
LU-3751 ost: disable OUT_PORTAL request handler on OST.
[fs/lustre-release.git] / lustre / ost / ost_handler.c
index cbe543b..405d69c 100644 (file)
@@ -1,6 +1,4 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
+/*
  * GPL HEADER START
  *
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
@@ -29,7 +27,7 @@
  * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved.
  * Use is subject to license terms.
  *
- * Copyright (c) 2011, 2012, Whamcloud, Inc.
+ * Copyright (c) 2011, 2013, Intel Corporation.
  */
 /*
  * This file is part of Lustre, http://www.lustre.org/
@@ -41,9 +39,6 @@
  * Author: Phil Schwan <phil@clusterfs.com>
  */
 
-#ifndef EXPORT_SYMTAB
-# define EXPORT_SYMTAB
-#endif
 #define DEBUG_SUBSYSTEM S_OST
 
 #include <linux/module.h>
 #include <lustre_dlm.h>
 #include <lustre_export.h>
 #include <lustre_debug.h>
+#include <lustre_fid.h>
+#include <lustre_fld.h>
 #include <linux/init.h>
 #include <lprocfs_status.h>
 #include <libcfs/list.h>
 #include <lustre_quota.h>
+#include <lustre_fid.h>
 #include "ost_internal.h"
+#include <lustre_fid.h>
 
 static int oss_num_threads;
 CFS_MODULE_PARM(oss_num_threads, "i", int, 0444,
@@ -71,6 +70,23 @@ static int oss_num_create_threads;
 CFS_MODULE_PARM(oss_num_create_threads, "i", int, 0444,
                 "number of OSS create threads to start");
 
+static char *oss_cpts;
+CFS_MODULE_PARM(oss_cpts, "s", charp, 0444,
+               "CPU partitions OSS threads should run on");
+
+static char *oss_io_cpts;
+CFS_MODULE_PARM(oss_io_cpts, "s", charp, 0444,
+               "CPU partitions OSS IO threads should run on");
+
+/*
+ * This page is allocated statically when the module is initializing.
+ * It is used to simulate data corruption, see ost_checksum_bulk()
+ * for details. As the original pages provided by the layers below
+ * can remain in the internal cache, we do not want to modify
+ * them.
+ */
+static struct page *ost_page_to_corrupt = NULL;
+
 /**
  * Do not return server-side uid/gid to remote client
  */
@@ -87,28 +103,72 @@ static void ost_drop_id(struct obd_export *exp, struct obdo *oa)
  * Validate oa from client.
  * If the request comes from 2.0 clients, currently only RSVD seq and IDIF
  * req are valid.
- *    a. for single MDS  seq = FID_SEQ_OST_MDT0,
- *    b. for CMD, seq = FID_SEQ_OST_MDT0, FID_SEQ_OST_MDT1 - FID_SEQ_OST_MAX
+ *    a. objects in Single MDT FS  seq = FID_SEQ_OST_MDT0, oi_id != 0
+ *    b. Echo objects (seq = 2): old echo clients still use oi_id/oi_seq to
+ *       pack ost_id. Because a non-zero oi_seq makes it difficult to tell
+ *       whether this is an oi_fid or a real ostid, the code checks
+ *       OBD_CONNECT_FID, then converts the ostid to a FID for old clients.
+ *    c. Old FID-disabled osc will send IDIF.
+ *    d. New FID-enabled osc/osp will send normal FID.
+ *
+ * Also, oi_id/f_oid should always start from 1. oi_id/f_oid = 0 is used
+ * for the LAST_ID file, which is currently only accessed inside the OST.
  */
 static int ost_validate_obdo(struct obd_export *exp, struct obdo *oa,
-                             struct obd_ioobj *ioobj)
+                            struct obd_ioobj *ioobj)
 {
-        if (oa != NULL && !(oa->o_valid & OBD_MD_FLGROUP)) {
-                oa->o_seq = FID_SEQ_OST_MDT0;
-                if (ioobj)
-                        ioobj->ioo_seq = FID_SEQ_OST_MDT0;
-        /* remove fid_seq_is_rsvd() after FID-on-OST allows SEQ > 9 */
-        } else if (oa == NULL ||
-                   !(fid_seq_is_rsvd(oa->o_seq) || fid_seq_is_idif(oa->o_seq))) {
-                CERROR("%s: client %s sent invalid object "POSTID"\n",
-                       exp->exp_obd->obd_name, obd_export_nid2str(exp),
-                       oa ? oa->o_id : -1, oa ? oa->o_seq : -1);
-                return -EPROTO;
-        }
-        obdo_from_ostid(oa, &oa->o_oi);
-        if (ioobj)
-                ioobj_from_obdo(ioobj, oa);
-        return 0;
+       int rc = 0;
+
+       if (unlikely(!(exp_connect_flags(exp) & OBD_CONNECT_FID) &&
+                    fid_seq_is_echo(oa->o_oi.oi.oi_seq) && oa != NULL)) {
+               /* Sigh 2.[123] client still sends echo req with oi_id = 0
+                * during create, and we will reset this to 1, since this
+                * oi_id is basically useless in the following create process,
+                * but oi_id == 0 will make it difficult to tell whether it is
+                * real FID or ost_id. */
+               oa->o_oi.oi_fid.f_oid = oa->o_oi.oi.oi_id ?: 1;
+               oa->o_oi.oi_fid.f_seq = FID_SEQ_ECHO;
+               oa->o_oi.oi_fid.f_ver = 0;
+       } else {
+               if (unlikely((oa == NULL) || ostid_id(&oa->o_oi) == 0))
+                       GOTO(out, rc = -EPROTO);
+
+               /* Note: this check might be forced in 2.5 or 2.6, i.e.
+                * all of the requests are required to setup FLGROUP */
+               if (unlikely(!(oa->o_valid & OBD_MD_FLGROUP))) {
+                       ostid_set_seq_mdt0(&oa->o_oi);
+                       if (ioobj)
+                               ostid_set_seq_mdt0(&ioobj->ioo_oid);
+                       oa->o_valid |= OBD_MD_FLGROUP;
+               }
+
+               if (unlikely(!(fid_seq_is_idif(ostid_seq(&oa->o_oi)) ||
+                              fid_seq_is_mdt0(ostid_seq(&oa->o_oi)) ||
+                              fid_seq_is_norm(ostid_seq(&oa->o_oi)) ||
+                              fid_seq_is_echo(ostid_seq(&oa->o_oi)))))
+                       GOTO(out, rc = -EPROTO);
+       }
+
+       if (ioobj != NULL) {
+               unsigned max_brw = ioobj_max_brw_get(ioobj);
+
+               if (unlikely((max_brw & (max_brw - 1)) != 0)) {
+                       CERROR("%s: client %s sent bad ioobj max %u for "DOSTID
+                              ": rc = -EPROTO\n", exp->exp_obd->obd_name,
+                              obd_export_nid2str(exp), max_brw,
+                              POSTID(&oa->o_oi));
+                       GOTO(out, rc = -EPROTO);
+               }
+               ioobj->ioo_oid = oa->o_oi;
+       }
+
+out:
+       if (rc != 0)
+               CERROR("%s: client %s sent bad object "DOSTID": rc = %d\n",
+                      exp->exp_obd->obd_name, obd_export_nid2str(exp),
+                      oa ? ostid_seq(&oa->o_oi) : -1,
+                      oa ? ostid_id(&oa->o_oi) : -1, rc);
+       return rc;
 }
 
 void oti_to_request(struct obd_trans_info *oti, struct ptlrpc_request *req)
@@ -149,8 +209,8 @@ static int ost_destroy(struct obd_export *exp, struct ptlrpc_request *req,
         if (body == NULL)
                 RETURN(-EFAULT);
 
-        if (body->oa.o_id == 0)
-                RETURN(-EPROTO);
+       if (ostid_id(&body->oa.o_oi) == 0)
+               RETURN(-EPROTO);
 
         rc = ost_validate_obdo(exp, &body->oa, NULL);
         if (rc)
@@ -189,7 +249,8 @@ static int ost_destroy(struct obd_export *exp, struct ptlrpc_request *req,
         memcpy(&repbody->oa, &body->oa, sizeof(body->oa));
 
         /* Do the destroy and set the reply status accordingly  */
-        req->rq_status = obd_destroy(exp, &repbody->oa, NULL, oti, NULL, capa);
+        req->rq_status = obd_destroy(req->rq_svc_thread->t_env, exp,
+                                     &repbody->oa, NULL, oti, NULL, capa);
         RETURN(0);
 }
 
@@ -199,7 +260,7 @@ static int ost_destroy(struct obd_export *exp, struct ptlrpc_request *req,
  */
 static int ost_lock_get(struct obd_export *exp, struct obdo *oa,
                         __u64 start, __u64 count, struct lustre_handle *lh,
-                        int mode, int flags)
+                       int mode, __u64 flags)
 {
         struct ldlm_res_id res_id;
         ldlm_policy_data_t policy;
@@ -218,7 +279,7 @@ static int ost_lock_get(struct obd_export *exp, struct obdo *oa,
             !(oa->o_flags & OBD_FL_SRVLOCK))
                 RETURN(0);
 
-        osc_build_res_name(oa->o_id, oa->o_seq, &res_id);
+       ostid_build_res_name(&oa->o_oi, &res_id);
         CDEBUG(D_INODE, "OST-side extent lock.\n");
 
         policy.l_extent.start = start & CFS_PAGE_MASK;
@@ -233,7 +294,8 @@ static int ost_lock_get(struct obd_export *exp, struct obdo *oa,
         RETURN(ldlm_cli_enqueue_local(exp->exp_obd->obd_namespace, &res_id,
                                       LDLM_EXTENT, &policy, mode, &flags,
                                       ldlm_blocking_ast, ldlm_completion_ast,
-                                      ldlm_glimpse_ast, NULL, 0, NULL, lh));
+                                     ldlm_glimpse_ast, NULL, 0, LVB_T_NONE,
+                                     NULL, lh));
 }
 
 /* Helper function: release lock, if any. */
@@ -288,7 +350,7 @@ static int ost_getattr(struct obd_export *exp, struct ptlrpc_request *req)
         oinfo->oi_oa = &repbody->oa;
         oinfo->oi_capa = capa;
 
-        req->rq_status = obd_getattr(exp, oinfo);
+        req->rq_status = obd_getattr(req->rq_svc_thread->t_env, exp, oinfo);
 
         OBD_FREE_PTR(oinfo);
 
@@ -311,12 +373,16 @@ static int ost_statfs(struct ptlrpc_request *req)
 
         osfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS);
 
-        req->rq_status = obd_statfs(req->rq_export->exp_obd, osfs,
+        req->rq_status = obd_statfs(req->rq_svc_thread->t_env, req->rq_export,
+                                    osfs,
                                     cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
                                     0);
         if (req->rq_status != 0)
                 CERROR("ost: statfs failed: rc %d\n", req->rq_status);
 
+       if (OBD_FAIL_CHECK(OBD_FAIL_OST_STATFS_EINPROGRESS))
+               req->rq_status = -EINPROGRESS;
+
         RETURN(0);
 }
 
@@ -343,7 +409,8 @@ static int ost_create(struct obd_export *exp, struct ptlrpc_request *req,
         repbody->oa = body->oa;
         oti->oti_logcookies = &body->oa.o_lcookie;
 
-        req->rq_status = obd_create(exp, &repbody->oa, NULL, oti);
+        req->rq_status = obd_create(req->rq_svc_thread->t_env, exp,
+                                    &repbody->oa, NULL, oti);
         //obd_log_cancel(conn, NULL, 1, oti->oti_logcookies, 0);
         RETURN(0);
 }
@@ -352,8 +419,9 @@ static int ost_punch(struct obd_export *exp, struct ptlrpc_request *req,
                      struct obd_trans_info *oti)
 {
         struct ost_body *body, *repbody;
-        int rc, flags = 0;
+       __u64 flags = 0;
         struct lustre_handle lh = {0,};
+       int rc;
         ENTRY;
 
         /* check that we do support OBD_CONNECT_TRUNCLOCK. */
@@ -378,7 +446,7 @@ static int ost_punch(struct obd_export *exp, struct ptlrpc_request *req,
         /* standard truncate optimization: if file body is completely
          * destroyed, don't send data back to the server. */
         if (body->oa.o_size == 0)
-                flags |= LDLM_AST_DISCARD_DATA;
+               flags |= LDLM_FL_AST_DISCARD_DATA;
 
         repbody = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
         repbody->oa = body->oa;
@@ -416,7 +484,8 @@ static int ost_punch(struct obd_export *exp, struct ptlrpc_request *req,
                 oinfo->oi_capa = capa;
                 oinfo->oi_flags = OBD_FL_PUNCH;
 
-                req->rq_status = obd_punch(exp, oinfo, oti, NULL);
+                req->rq_status = obd_punch(req->rq_svc_thread->t_env, exp,
+                                           oinfo, oti, NULL);
                 OBD_FREE_PTR(oinfo);
 unlock:
                 ost_lock_put(exp, &lh, LCK_PW);
@@ -426,7 +495,8 @@ unlock:
         RETURN(rc);
 }
 
-static int ost_sync(struct obd_export *exp, struct ptlrpc_request *req)
+static int ost_sync(struct obd_export *exp, struct ptlrpc_request *req,
+                   struct obd_trans_info *oti)
 {
         struct ost_body *body, *repbody;
         struct obd_info *oinfo;
@@ -463,8 +533,10 @@ static int ost_sync(struct obd_export *exp, struct ptlrpc_request *req)
 
         oinfo->oi_oa = &repbody->oa;
         oinfo->oi_capa = capa;
-        req->rq_status = obd_sync(exp, oinfo, repbody->oa.o_size,
-                                  repbody->oa.o_blocks, NULL);
+       oinfo->oi_jobid = oti->oti_jobid;
+        req->rq_status = obd_sync(req->rq_svc_thread->t_env, exp, oinfo,
+                                  repbody->oa.o_size, repbody->oa.o_blocks,
+                                  NULL);
         OBD_FREE_PTR(oinfo);
 
         ost_drop_id(exp, &repbody->oa);
@@ -509,7 +581,8 @@ static int ost_setattr(struct obd_export *exp, struct ptlrpc_request *req,
         oinfo->oi_oa = &repbody->oa;
         oinfo->oi_capa = capa;
 
-        req->rq_status = obd_setattr(exp, oinfo, oti);
+        req->rq_status = obd_setattr(req->rq_svc_thread->t_env, exp, oinfo,
+                                     oti);
 
         OBD_FREE_PTR(oinfo);
 
@@ -518,50 +591,89 @@ static int ost_setattr(struct obd_export *exp, struct ptlrpc_request *req,
 }
 
 static __u32 ost_checksum_bulk(struct ptlrpc_bulk_desc *desc, int opc,
-                               cksum_type_t cksum_type)
+                              cksum_type_t cksum_type)
 {
-        __u32 cksum;
-        int i;
-
-        cksum = init_checksum(cksum_type);
-        for (i = 0; i < desc->bd_iov_count; i++) {
-                struct page *page = desc->bd_iov[i].kiov_page;
-                int off = desc->bd_iov[i].kiov_offset & ~CFS_PAGE_MASK;
-                char *ptr = kmap(page) + off;
-                int len = desc->bd_iov[i].kiov_len;
-
-                /* corrupt the data before we compute the checksum, to
-                 * simulate a client->OST data error */
-                if (i == 0 && opc == OST_WRITE &&
-                    OBD_FAIL_CHECK(OBD_FAIL_OST_CHECKSUM_RECEIVE))
-                        memcpy(ptr, "bad3", min(4, len));
-                cksum = compute_checksum(cksum, ptr, len, cksum_type);
-                /* corrupt the data after we compute the checksum, to
-                 * simulate an OST->client data error */
-                if (i == 0 && opc == OST_READ &&
-                    OBD_FAIL_CHECK(OBD_FAIL_OST_CHECKSUM_SEND)) {
-                        memcpy(ptr, "bad4", min(4, len));
-                        /* nobody should use corrupted page again */
-                        ClearPageUptodate(page);
-                }
-                kunmap(page);
-        }
-
-        return fini_checksum(cksum, cksum_type);
+       struct cfs_crypto_hash_desc     *hdesc;
+       unsigned int                    bufsize;
+       int                             i, err;
+       unsigned char                   cfs_alg = cksum_obd2cfs(cksum_type);
+       __u32                           cksum;
+
+       hdesc = cfs_crypto_hash_init(cfs_alg, NULL, 0);
+       if (IS_ERR(hdesc)) {
+               CERROR("Unable to initialize checksum hash %s\n",
+                      cfs_crypto_hash_name(cfs_alg));
+               return PTR_ERR(hdesc);
+       }
+       CDEBUG(D_INFO, "Checksum for algo %s\n", cfs_crypto_hash_name(cfs_alg));
+       for (i = 0; i < desc->bd_iov_count; i++) {
+
+               /* corrupt the data before we compute the checksum, to
+                * simulate a client->OST data error */
+               if (i == 0 && opc == OST_WRITE &&
+                   OBD_FAIL_CHECK(OBD_FAIL_OST_CHECKSUM_RECEIVE)) {
+                       int off = desc->bd_iov[i].kiov_offset & ~CFS_PAGE_MASK;
+                       int len = desc->bd_iov[i].kiov_len;
+                       struct page *np = ost_page_to_corrupt;
+                       char *ptr = kmap(desc->bd_iov[i].kiov_page) + off;
+
+                       if (np) {
+                               char *ptr2 = kmap(np) + off;
+
+                               memcpy(ptr2, ptr, len);
+                               memcpy(ptr2, "bad3", min(4, len));
+                               kunmap(np);
+                               desc->bd_iov[i].kiov_page = np;
+                       } else {
+                               CERROR("can't alloc page for corruption\n");
+                       }
+               }
+               cfs_crypto_hash_update_page(hdesc, desc->bd_iov[i].kiov_page,
+                                 desc->bd_iov[i].kiov_offset & ~CFS_PAGE_MASK,
+                                 desc->bd_iov[i].kiov_len);
+
+                /* corrupt the data after we compute the checksum, to
+                * simulate an OST->client data error */
+               if (i == 0 && opc == OST_READ &&
+                   OBD_FAIL_CHECK(OBD_FAIL_OST_CHECKSUM_SEND)) {
+                       int off = desc->bd_iov[i].kiov_offset & ~CFS_PAGE_MASK;
+                       int len = desc->bd_iov[i].kiov_len;
+                       struct page *np = ost_page_to_corrupt;
+                       char *ptr = kmap(desc->bd_iov[i].kiov_page) + off;
+
+                       if (np) {
+                               char *ptr2 = kmap(np) + off;
+
+                               memcpy(ptr2, ptr, len);
+                               memcpy(ptr2, "bad4", min(4, len));
+                               kunmap(np);
+                               desc->bd_iov[i].kiov_page = np;
+                       } else {
+                               CERROR("can't alloc page for corruption\n");
+                       }
+               }
+       }
+
+       bufsize = 4;
+       err = cfs_crypto_hash_final(hdesc, (unsigned char *)&cksum, &bufsize);
+       if (err)
+               cfs_crypto_hash_final(hdesc, NULL, NULL);
+
+       return cksum;
 }
 
 static int ost_brw_lock_get(int mode, struct obd_export *exp,
                             struct obd_ioobj *obj, struct niobuf_remote *nb,
                             struct lustre_handle *lh)
 {
-        int flags                 = 0;
+       __u64 flags               = 0;
         int nrbufs                = obj->ioo_bufcnt;
         struct ldlm_res_id res_id;
         ldlm_policy_data_t policy;
         int i;
         ENTRY;
 
-        osc_build_res_name(obj->ioo_id, obj->ioo_seq, &res_id);
+       ostid_build_res_name(&obj->ioo_oid, &res_id);
         LASSERT(mode == LCK_PR || mode == LCK_PW);
         LASSERT(!lustre_handle_is_used(lh));
 
@@ -580,7 +692,8 @@ static int ost_brw_lock_get(int mode, struct obd_export *exp,
         RETURN(ldlm_cli_enqueue_local(exp->exp_obd->obd_namespace, &res_id,
                                       LDLM_EXTENT, &policy, mode, &flags,
                                       ldlm_blocking_ast, ldlm_completion_ast,
-                                      ldlm_glimpse_ast, NULL, 0, NULL, lh));
+                                     ldlm_glimpse_ast, NULL, 0, LVB_T_NONE,
+                                     NULL, lh));
 }
 
 static void ost_brw_lock_put(int mode,
@@ -604,7 +717,7 @@ static struct ost_thread_local_cache *ost_tls_get(struct ptlrpc_request *r)
 
         /* In normal mode of operation an I/O request is serviced only
          * by ll_ost_io threads each of them has own tls buffers allocated by
-         * ost_thread_init().
+         * ost_io_thread_init().
          * During recovery, an I/O request may be queued until any of the ost
          * service threads process it. Not necessary it should be one of
          * ll_ost_io threads. In that case we dynamically allocating tls
@@ -712,34 +825,35 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti)
         if (rc != 0)
                 GOTO(out_tls, rc);
 
-        /*
-         * If getting the lock took more time than
-         * client was willing to wait, drop it. b=11330
-         */
-        if (cfs_time_current_sec() > req->rq_deadline ||
-            OBD_FAIL_CHECK(OBD_FAIL_OST_DROP_REQ)) {
-                no_reply = 1;
-                CERROR("Dropping timed-out read from %s because locking"
-                       "object "LPX64" took %ld seconds (limit was %ld).\n",
-                       libcfs_id2str(req->rq_peer), ioo->ioo_id,
-                       cfs_time_current_sec() - req->rq_arrival_time.tv_sec,
-                       req->rq_deadline - req->rq_arrival_time.tv_sec);
-                GOTO(out_lock, rc = -ETIMEDOUT);
-        }
+       /*
+        * If getting the lock took more time than
+        * client was willing to wait, drop it. b=11330
+        */
+       if (cfs_time_current_sec() > req->rq_deadline ||
+           OBD_FAIL_CHECK(OBD_FAIL_OST_DROP_REQ)) {
+               no_reply = 1;
+               CERROR("Dropping timed-out read from %s because locking"
+                      "object "DOSTID" took %ld seconds (limit was %ld).\n",
+                      libcfs_id2str(req->rq_peer), POSTID(&ioo->ioo_oid),
+                      cfs_time_current_sec() - req->rq_arrival_time.tv_sec,
+                      req->rq_deadline - req->rq_arrival_time.tv_sec);
+               GOTO(out_lock, rc = -ETIMEDOUT);
+       }
 
         repbody = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
         memcpy(&repbody->oa, &body->oa, sizeof(repbody->oa));
 
         npages = OST_THREAD_POOL_SIZE;
-        rc = obd_preprw(OBD_BRW_READ, exp, &repbody->oa, 1, ioo,
-                        remote_nb, &npages, local_nb, oti, capa);
+        rc = obd_preprw(req->rq_svc_thread->t_env, OBD_BRW_READ, exp,
+                        &repbody->oa, 1, ioo, remote_nb, &npages, local_nb,
+                        oti, capa);
         if (rc != 0)
                 GOTO(out_lock, rc);
 
-        desc = ptlrpc_prep_bulk_exp(req, npages,
-                                     BULK_PUT_SOURCE, OST_BULK_PORTAL);
-        if (desc == NULL)
-                GOTO(out_commitrw, rc = -ENOMEM);
+       desc = ptlrpc_prep_bulk_exp(req, npages, ioobj_max_brw_get(ioo),
+                                   BULK_PUT_SOURCE, OST_BULK_PORTAL);
+       if (desc == NULL)
+               GOTO(out_commitrw, rc = -ENOMEM);
 
         nob = 0;
         for (i = 0; i < npages; i++) {
@@ -753,9 +867,9 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti)
                 nob += page_rc;
                 if (page_rc != 0) {             /* some data! */
                         LASSERT (local_nb[i].page != NULL);
-                        ptlrpc_prep_bulk_page(desc, local_nb[i].page,
-                                              local_nb[i].offset & ~CFS_PAGE_MASK,
-                                              page_rc);
+                       ptlrpc_prep_bulk_page_nopin(desc, local_nb[i].page,
+                                                   local_nb[i].lnb_page_offset,
+                                                   page_rc);
                 }
 
                 if (page_rc != local_nb[i].len) { /* short read */
@@ -790,8 +904,9 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti)
 
 out_commitrw:
         /* Must commit after prep above in all cases */
-        rc = obd_commitrw(OBD_BRW_READ, exp, &repbody->oa, 1, ioo,
-                          remote_nb, npages, local_nb, oti, rc);
+        rc = obd_commitrw(req->rq_svc_thread->t_env, OBD_BRW_READ, exp,
+                          &repbody->oa, 1, ioo, remote_nb, npages, local_nb,
+                          oti, rc);
 
         if (rc == 0)
                 ost_drop_id(exp, &repbody->oa);
@@ -802,7 +917,7 @@ out_tls:
         ost_tls_put(req);
 out_bulk:
         if (desc && !CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2))
-                ptlrpc_free_bulk(desc);
+               ptlrpc_free_bulk_nopin(desc);
 out:
         LASSERT(rc <= 0);
         if (rc == 0) {
@@ -818,12 +933,11 @@ out:
         } else {
                 /* reply out callback would free */
                 ptlrpc_req_drop_rs(req);
-                CWARN("%s: ignoring bulk IO comm error with %s@%s id %s - "
-                      "client will retry\n",
-                      exp->exp_obd->obd_name,
-                      exp->exp_client_uuid.uuid,
-                      exp->exp_connection->c_remote_uuid.uuid,
-                      libcfs_id2str(req->rq_peer));
+                LCONSOLE_WARN("%s: Bulk IO read error with %s (at %s), "
+                              "client will retry: rc %d\n",
+                              exp->exp_obd->obd_name,
+                              obd_uuid2str(&exp->exp_client_uuid),
+                              obd_export_nid2str(exp), rc);
         }
         /* send a bulk after reply to simulate a network delay or reordering
          * by a router */
@@ -837,12 +951,57 @@ out:
                 lwi1 = LWI_TIMEOUT_INTR(cfs_time_seconds(3), NULL, NULL, NULL);
                 l_wait_event(waitq, 0, &lwi1);
                 rc = target_bulk_io(exp, desc, &lwi);
-                ptlrpc_free_bulk(desc);
+               ptlrpc_free_bulk_nopin(desc);
         }
 
         RETURN(rc);
 }
 
+static void ost_warn_on_cksum(struct ptlrpc_request *req,
+                             struct ptlrpc_bulk_desc *desc,
+                             struct niobuf_local *local_nb, int npages,
+                             obd_count client_cksum, obd_count server_cksum,
+                             int mmap)
+{
+       struct obd_export *exp = req->rq_export;
+       struct ost_body *body;
+       char *router;
+       char *via;
+
+       body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
+       LASSERT (body != NULL);
+
+       if (req->rq_peer.nid == desc->bd_sender) {
+               via = router = "";
+       } else {
+               via = " via ";
+               router = libcfs_nid2str(desc->bd_sender);
+       }
+
+       if (mmap) {
+               CDEBUG_LIMIT(D_INFO, "client csum %x, server csum %x\n",
+                            client_cksum, server_cksum);
+               return;
+       }
+
+       LCONSOLE_ERROR_MSG(0x168, "BAD WRITE CHECKSUM: %s from %s%s%s inode "
+                          DFID" object "DOSTID" extent ["LPU64"-"LPU64
+                          "]: client csum %x, server csum %x\n",
+                          exp->exp_obd->obd_name, libcfs_id2str(req->rq_peer),
+                          via, router,
+                          body->oa.o_valid & OBD_MD_FLFID ?
+                          body->oa.o_parent_seq : (__u64)0,
+                          body->oa.o_valid & OBD_MD_FLFID ?
+                          body->oa.o_parent_oid : 0,
+                          body->oa.o_valid & OBD_MD_FLFID ?
+                          body->oa.o_parent_ver : 0,
+                          POSTID(&body->oa.o_oi),
+                          local_nb[0].lnb_file_offset,
+                          local_nb[npages-1].lnb_file_offset +
+                          local_nb[npages-1].len - 1,
+                          client_cksum, server_cksum);
+}
+
 static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
 {
         struct ptlrpc_bulk_desc *desc = NULL;
@@ -905,7 +1064,7 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
 
         if ((remote_nb[0].flags & OBD_BRW_MEMALLOC) &&
             (exp->exp_connection->c_peer.nid == exp->exp_connection->c_self))
-                cfs_memory_pressure_set();
+               memory_pressure_set();
 
         if (body->oa.o_valid & OBD_MD_FLOSSCAPA) {
                 capa = req_capsule_client_get(&req->rq_pill, &RMF_CAPA1);
@@ -932,20 +1091,20 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
         if (rc != 0)
                 GOTO(out_tls, rc);
 
-        /*
-         * If getting the lock took more time than
-         * client was willing to wait, drop it. b=11330
-         */
-        if (cfs_time_current_sec() > req->rq_deadline ||
-            OBD_FAIL_CHECK(OBD_FAIL_OST_DROP_REQ)) {
-                no_reply = 1;
-                CERROR("Dropping timed-out write from %s because locking "
-                       "object "LPX64" took %ld seconds (limit was %ld).\n",
-                       libcfs_id2str(req->rq_peer), ioo->ioo_id,
-                       cfs_time_current_sec() - req->rq_arrival_time.tv_sec,
-                       req->rq_deadline - req->rq_arrival_time.tv_sec);
-                GOTO(out_lock, rc = -ETIMEDOUT);
-        }
+       /*
+        * If getting the lock took more time than
+        * client was willing to wait, drop it. b=11330
+        */
+       if (cfs_time_current_sec() > req->rq_deadline ||
+           OBD_FAIL_CHECK(OBD_FAIL_OST_DROP_REQ)) {
+               no_reply = 1;
+               CERROR("Dropping timed-out write from %s because locking "
+                      "object "DOSTID" took %ld seconds (limit was %ld).\n",
+                      libcfs_id2str(req->rq_peer), POSTID(&ioo->ioo_oid),
+                      cfs_time_current_sec() - req->rq_arrival_time.tv_sec,
+                      req->rq_deadline - req->rq_arrival_time.tv_sec);
+               GOTO(out_lock, rc = -ETIMEDOUT);
+       }
 
         /* obd_preprw clobbers oa->valid, so save what we need */
         if (body->oa.o_valid & OBD_MD_FLCKSUM) {
@@ -973,22 +1132,22 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
         memcpy(&repbody->oa, &body->oa, sizeof(repbody->oa));
 
         npages = OST_THREAD_POOL_SIZE;
-        rc = obd_preprw(OBD_BRW_WRITE, exp, &repbody->oa, objcount,
-                        ioo, remote_nb, &npages, local_nb, oti, capa);
+        rc = obd_preprw(req->rq_svc_thread->t_env, OBD_BRW_WRITE, exp,
+                        &repbody->oa, objcount, ioo, remote_nb, &npages,
+                        local_nb, oti, capa);
         if (rc != 0)
                 GOTO(out_lock, rc);
 
-        desc = ptlrpc_prep_bulk_exp(req, npages,
-                                     BULK_GET_SINK, OST_BULK_PORTAL);
-        if (desc == NULL)
-                GOTO(skip_transfer, rc = -ENOMEM);
-
-        /* NB Having prepped, we must commit... */
+       desc = ptlrpc_prep_bulk_exp(req, npages, ioobj_max_brw_get(ioo),
+                                   BULK_GET_SINK, OST_BULK_PORTAL);
+       if (desc == NULL)
+               GOTO(skip_transfer, rc = -ENOMEM);
 
-        for (i = 0; i < npages; i++)
-                ptlrpc_prep_bulk_page(desc, local_nb[i].page,
-                                      local_nb[i].offset & ~CFS_PAGE_MASK,
-                                      local_nb[i].len);
+       /* NB Having prepped, we must commit... */
+       for (i = 0; i < npages; i++)
+               ptlrpc_prep_bulk_page_nopin(desc, local_nb[i].page,
+                                           local_nb[i].lnb_page_offset,
+                                           local_nb[i].len);
 
         rc = sptlrpc_svc_prep_bulk(req, desc);
         if (rc != 0)
@@ -998,7 +1157,7 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
         no_reply = rc != 0;
 
 skip_transfer:
-        if (unlikely(client_cksum != 0 && rc == 0)) {
+        if (client_cksum != 0 && rc == 0) {
                 static int cksum_counter;
                 repbody->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS;
                 repbody->oa.o_flags &= ~OBD_FL_CKSUM_ALL;
@@ -1007,10 +1166,10 @@ skip_transfer:
                 repbody->oa.o_cksum = server_cksum;
                 cksum_counter++;
                 if (unlikely(client_cksum != server_cksum)) {
-                        CDEBUG_LIMIT(mmap ? D_INFO : D_ERROR,
-                                     "client csum %x, server csum %x\n",
-                                     client_cksum, server_cksum);
+                       ost_warn_on_cksum(req, desc, local_nb, npages,
+                                         client_cksum, server_cksum, mmap);
                         cksum_counter = 0;
+
                 } else if ((cksum_counter & (-cksum_counter)) == cksum_counter){
                         CDEBUG(D_INFO, "Checksum %u from %s OK: %x\n",
                                cksum_counter, libcfs_id2str(req->rq_peer),
@@ -1019,8 +1178,9 @@ skip_transfer:
         }
 
         /* Must commit after prep above in all cases */
-        rc = obd_commitrw(OBD_BRW_WRITE, exp, &repbody->oa, objcount, ioo,
-                          remote_nb, npages, local_nb, oti, rc);
+        rc = obd_commitrw(req->rq_svc_thread->t_env, OBD_BRW_WRITE, exp,
+                          &repbody->oa, objcount, ioo, remote_nb, npages,
+                          local_nb, oti, rc);
         if (rc == -ENOTCONN)
                 /* quota acquire process has been given up because
                  * either the client has been evicted or the client
@@ -1039,49 +1199,6 @@ skip_transfer:
          */
         repbody->oa.o_valid &= ~(OBD_MD_FLMTIME | OBD_MD_FLATIME);
 
-        if (unlikely(client_cksum != server_cksum && rc == 0 &&  !mmap)) {
-                int  new_cksum = ost_checksum_bulk(desc, OST_WRITE, cksum_type);
-                char *msg;
-                char *via;
-                char *router;
-
-                if (new_cksum == server_cksum)
-                        msg = "changed in transit before arrival at OST";
-                else if (new_cksum == client_cksum)
-                        msg = "initial checksum before message complete";
-                else
-                        msg = "changed in transit AND after initial checksum";
-
-                if (req->rq_peer.nid == desc->bd_sender) {
-                        via = router = "";
-                } else {
-                        via = " via ";
-                        router = libcfs_nid2str(desc->bd_sender);
-                }
-
-                LCONSOLE_ERROR_MSG(0x168, "%s: BAD WRITE CHECKSUM: %s from "
-                                   "%s%s%s inode "DFID" object "
-                                   LPU64"/"LPU64" extent ["LPU64"-"LPU64"]\n",
-                                   exp->exp_obd->obd_name, msg,
-                                   libcfs_id2str(req->rq_peer),
-                                   via, router,
-                                   body->oa.o_valid & OBD_MD_FLFID ?
-                                                body->oa.o_parent_seq : (__u64)0,
-                                   body->oa.o_valid & OBD_MD_FLFID ?
-                                                body->oa.o_parent_oid : 0,
-                                   body->oa.o_valid & OBD_MD_FLFID ?
-                                                body->oa.o_parent_ver : 0,
-                                   body->oa.o_id,
-                                   body->oa.o_valid & OBD_MD_FLGROUP ?
-                                                body->oa.o_seq : (__u64)0,
-                                   local_nb[0].offset,
-                                   local_nb[npages-1].offset +
-                                   local_nb[npages-1].len - 1 );
-                CERROR("client csum %x, original server csum %x, "
-                       "server csum now %x\n",
-                       client_cksum, server_cksum, new_cksum);
-        }
-
         if (rc == 0) {
                 int nob = 0;
 
@@ -1110,7 +1227,7 @@ out_tls:
         ost_tls_put(req);
 out_bulk:
         if (desc)
-                ptlrpc_free_bulk(desc);
+               ptlrpc_free_bulk_nopin(desc);
 out:
         if (rc == 0) {
                 oti_to_request(oti, req);
@@ -1124,14 +1241,13 @@ out:
         } else {
                 /* reply out callback would free */
                 ptlrpc_req_drop_rs(req);
-                CWARN("%s: ignoring bulk IO comm error with %s@%s id %s - "
-                      "client will retry\n",
-                      exp->exp_obd->obd_name,
-                      exp->exp_client_uuid.uuid,
-                      exp->exp_connection->c_remote_uuid.uuid,
-                      libcfs_id2str(req->rq_peer));
+                LCONSOLE_WARN("%s: Bulk IO write error with %s (at %s), "
+                              "client will retry: rc %d\n",
+                              exp->exp_obd->obd_name,
+                              obd_uuid2str(&exp->exp_client_uuid),
+                              obd_export_nid2str(exp), rc);
         }
-        cfs_memory_pressure_clr();
+       memory_pressure_clr();
         RETURN(rc);
 }
 
@@ -1207,17 +1323,96 @@ static int ost_set_info(struct obd_export *exp, struct ptlrpc_request *req)
 
         /* OBD will also check if KEY_IS(KEY_GRANT_SHRINK), and will cast val to
          * a struct ost_body * value */
-        rc = obd_set_info_async(exp, keylen, key, vallen, val, NULL);
+        rc = obd_set_info_async(req->rq_svc_thread->t_env, exp, keylen,
+                                key, vallen, val, NULL);
 out:
         lustre_msg_set_status(req->rq_repmsg, 0);
         RETURN(rc);
 }
 
+/*
+ * Bookkeeping node for one lock taken by lock_region().  Nodes are chained
+ * on a caller-supplied list head and released by unlock_zero_regions().
+ */
+struct locked_region {
+       cfs_list_t  list;       /* linkage into the caller's "locked" list */
+       struct lustre_handle lh;        /* handle given to ost_lock_get()/ost_lock_put() */
+};
+
+/*
+ * Take an LCK_PR lock through ost_lock_get() for the range that starts at
+ * \a begin and is (\a end - \a begin) long, and remember it on \a locked.
+ *
+ * \param exp     export the lock is requested on
+ * \param oa      obdo handed unchanged to ost_lock_get()
+ * \param begin   start of the range, must not exceed \a end
+ * \param end     end of the range
+ * \param locked  list head that receives the new locked_region on success
+ *
+ * \retval 0        lock taken and recorded on \a locked
+ * \retval -ENOMEM  the bookkeeping node could not be allocated
+ * \retval other    whatever non-zero code ost_lock_get() returned
+ */
+static int lock_region(struct obd_export *exp, struct obdo *oa,
+                      unsigned long long begin, unsigned long long end,
+                      cfs_list_t *locked)
+{
+       struct locked_region *lr = NULL;
+       int err;
+
+       LASSERT(begin <= end);
+
+       OBD_ALLOC_PTR(lr);
+       if (lr == NULL)
+               return -ENOMEM;
+
+       err = ost_lock_get(exp, oa, begin, end - begin, &lr->lh, LCK_PR, 0);
+       if (err == 0) {
+               CDEBUG(D_OTHER, "ost lock [%llu,%llu], lh=%p\n",
+                      begin, end, &lr->lh);
+               cfs_list_add(&lr->list, locked);
+
+               return 0;
+       }
+
+       /* the node never made it onto the list, so it is released here */
+       OBD_FREE_PTR(lr);
+       return err;
+}
+
+/*
+ * Walk the extents reported in \a fiemap and call lock_region() for every
+ * gap between them, i.e. for each range inside
+ * [fm_start, fm_start + fm_length) that no mapped extent covers.
+ *
+ * Locks already recorded on \a locked are NOT dropped when a later
+ * lock_region() call fails; the caller is expected to run
+ * unlock_zero_regions() on the list in every case.
+ *
+ * NOTE(review): fm_start + fm_length is computed without a wrap-around
+ * check -- confirm that callers bound fm_length.
+ *
+ * \retval 0      all gaps locked (or there were none)
+ * \retval other  first non-zero code returned by lock_region()
+ */
+static int lock_zero_regions(struct obd_export *exp, struct obdo *oa,
+                            struct ll_user_fiemap *fiemap,
+                            cfs_list_t *locked)
+{
+       struct ll_fiemap_extent *ext = fiemap->fm_extents;
+       __u64 cursor = fiemap->fm_start;
+       __u64 range_end;
+       unsigned int idx;
+       int rc = 0;
+       ENTRY;
+
+       CDEBUG(D_OTHER, "extents count %u\n", fiemap->fm_mapped_extents);
+       for (idx = 0; idx < fiemap->fm_mapped_extents; idx++, ext++) {
+               /* hole between the previous extent's end and this one */
+               if (cursor < ext->fe_logical) {
+                       CDEBUG(D_OTHER, "ost lock [%llu,%llu]\n",
+                              cursor, ext->fe_logical);
+                       rc = lock_region(exp, oa, cursor,
+                                        ext->fe_logical, locked);
+                       if (rc != 0)
+                               RETURN(rc);
+               }
+
+               cursor = ext->fe_logical + ext->fe_length;
+       }
+
+       /* trailing hole after the last mapped extent */
+       range_end = fiemap->fm_start + fiemap->fm_length;
+       if (cursor < range_end) {
+               CDEBUG(D_OTHER, "ost lock [%llu,%llu]\n",
+                      cursor, range_end);
+               rc = lock_region(exp, oa, cursor, range_end, locked);
+       }
+
+       RETURN(rc);
+}
+
+/*
+ * Release every lock recorded on \a locked: drop the LCK_PR lock with
+ * ost_lock_put(), unlink the node and free it.  Safe on an empty list.
+ */
+static void unlock_zero_regions(struct obd_export *exp, cfs_list_t *locked)
+{
+       struct locked_region *cur;
+       struct locked_region *next;
+
+       /* the _safe iterator is needed because each node is freed in the loop */
+       cfs_list_for_each_entry_safe(cur, next, locked, list) {
+               struct lustre_handle *lh = &cur->lh;
+
+               CDEBUG(D_OTHER, "ost unlock lh=%p\n", lh);
+               ost_lock_put(exp, lh, LCK_PR);
+               cfs_list_del(&cur->list);
+               OBD_FREE_PTR(cur);
+       }
+}
+
 static int ost_get_info(struct obd_export *exp, struct ptlrpc_request *req)
 {
         void *key, *reply;
         int keylen, replylen, rc = 0;
         struct req_capsule *pill = &req->rq_pill;
+       cfs_list_t locked = CFS_LIST_HEAD_INIT(locked);
+       struct ll_fiemap_info_key *fm_key = NULL;
+       struct ll_user_fiemap *fiemap;
         ENTRY;
 
         /* this common part for get_info rpc */
@@ -1229,37 +1424,70 @@ static int ost_get_info(struct obd_export *exp, struct ptlrpc_request *req)
         keylen = req_capsule_get_size(pill, &RMF_SETINFO_KEY, RCL_CLIENT);
 
         if (KEY_IS(KEY_FIEMAP)) {
-                struct ll_fiemap_info_key *fm_key = key;
-                int rc;
-
+               fm_key = key;
                 rc = ost_validate_obdo(exp, &fm_key->oa, NULL);
                 if (rc)
                         RETURN(rc);
-        }
+       }
 
-        rc = obd_get_info(exp, keylen, key, &replylen, NULL, NULL);
+        rc = obd_get_info(req->rq_svc_thread->t_env, exp, keylen, key,
+                          &replylen, NULL, NULL);
         if (rc)
-                RETURN(rc);
+               RETURN(rc);
 
         req_capsule_set_size(pill, &RMF_GENERIC_DATA,
                              RCL_SERVER, replylen);
 
         rc = req_capsule_server_pack(pill);
         if (rc)
-                RETURN(rc);
+               RETURN(rc);
 
         reply = req_capsule_server_get(pill, &RMF_GENERIC_DATA);
         if (reply == NULL)
-                RETURN(-ENOMEM);
+               RETURN(-ENOMEM);
+
+       if (KEY_IS(KEY_LAST_FID)) {
+               void *val;
+               int vallen;
+
+               req_capsule_extend(pill, &RQF_OST_GET_INFO_LAST_FID);
+               val = req_capsule_client_get(pill, &RMF_SETINFO_VAL);
+               vallen = req_capsule_get_size(pill, &RMF_SETINFO_VAL,
+                                             RCL_CLIENT);
+               if (val != NULL && vallen > 0 && replylen >= vallen) {
+                       memcpy(reply, val, vallen);
+               } else {
+                       CERROR("%s: invalid req val %p vallen %d replylen %d\n",
+                              exp->exp_obd->obd_name, val, vallen, replylen);
+                       RETURN(-EINVAL);
+               }
+       }
+
+       /* call again to fill in the reply buffer */
+       rc = obd_get_info(req->rq_svc_thread->t_env, exp, keylen, key,
+                         &replylen, reply, NULL);
+
+       /* LU-3219: Lock the sparse areas to make sure dirty flushed back
+        * from client, then call fiemap again. */
+       if (KEY_IS(KEY_FIEMAP) && (fm_key->oa.o_valid & OBD_MD_FLFLAGS) &&
+           (fm_key->oa.o_flags & OBD_FL_SRVLOCK)) {
+               fiemap = (struct ll_user_fiemap *)reply;
+               fm_key = key;
+
+               rc = lock_zero_regions(exp, &fm_key->oa, fiemap, &locked);
+               if (rc == 0 && !cfs_list_empty(&locked))
+                       rc = obd_get_info(req->rq_svc_thread->t_env, exp,
+                                         keylen, key, &replylen, reply, NULL);
+               unlock_zero_regions(exp, &locked);
+               if (rc)
+                       RETURN(rc);
+       }
+
+       lustre_msg_set_status(req->rq_repmsg, 0);
 
-        /* call again to fill in the reply buffer */
-        rc = obd_get_info(exp, keylen, key, &replylen, reply, NULL);
-
-        lustre_msg_set_status(req->rq_repmsg, 0);
         RETURN(rc);
 }
 
-#ifdef HAVE_QUOTA_SUPPORT
 static int ost_handle_quotactl(struct ptlrpc_request *req)
 {
         struct obd_quotactl *oqctl, *repoqc;
@@ -1296,34 +1524,10 @@ static int ost_handle_quotacheck(struct ptlrpc_request *req)
         if (rc)
                 RETURN(-ENOMEM);
 
-        req->rq_status = obd_quotacheck(req->rq_export, oqctl);
-        RETURN(0);
-}
-
-static int ost_handle_quota_adjust_qunit(struct ptlrpc_request *req)
-{
-        struct quota_adjust_qunit *oqaq, *repoqa;
-        struct lustre_quota_ctxt *qctxt;
-        int rc;
-        ENTRY;
-
-        qctxt = &req->rq_export->exp_obd->u.obt.obt_qctxt;
-        oqaq = req_capsule_client_get(&req->rq_pill, &RMF_QUOTA_ADJUST_QUNIT);
-        if (oqaq == NULL)
-                GOTO(out, rc = -EPROTO);
-
-        rc = req_capsule_server_pack(&req->rq_pill);
-        if (rc)
-                GOTO(out, rc);
-
-        repoqa = req_capsule_server_get(&req->rq_pill, &RMF_QUOTA_ADJUST_QUNIT);
-        req->rq_status = obd_quota_adjust_qunit(req->rq_export, oqaq, qctxt, NULL);
-        *repoqa = *oqaq;
-
- out:
-        RETURN(rc);
+       /* deprecated, not used any more */
+       req->rq_status = -EOPNOTSUPP;
+       RETURN(-EOPNOTSUPP);
 }
-#endif
 
 static int ost_llog_handle_connect(struct obd_export *exp,
                                    struct ptlrpc_request *req)
@@ -1337,14 +1541,14 @@ static int ost_llog_handle_connect(struct obd_export *exp,
         RETURN(rc);
 }
 
-#define ost_init_sec_none(reply, exp)                                   \
-do {                                                                    \
-        reply->ocd_connect_flags &= ~(OBD_CONNECT_RMT_CLIENT |          \
-                                      OBD_CONNECT_RMT_CLIENT_FORCE |    \
-                                      OBD_CONNECT_OSS_CAPA);            \
-        cfs_spin_lock(&exp->exp_lock);                                  \
-        exp->exp_connect_flags = reply->ocd_connect_flags;              \
-        cfs_spin_unlock(&exp->exp_lock);                                \
+/*
+ * Clear OBD_CONNECT_RMT_CLIENT, OBD_CONNECT_RMT_CLIENT_FORCE and
+ * OBD_CONNECT_OSS_CAPA in reply->ocd_connect_flags, then store the
+ * resulting flags in the export while holding exp->exp_lock.
+ *
+ * NOTE(review): the macro arguments are used unparenthesized, so callers
+ * must pass plain identifiers.
+ */
+#define ost_init_sec_none(reply, exp)                                  \
+do {                                                                   \
+       reply->ocd_connect_flags &= ~(OBD_CONNECT_RMT_CLIENT |          \
+                                     OBD_CONNECT_RMT_CLIENT_FORCE |    \
+                                     OBD_CONNECT_OSS_CAPA);            \
+       spin_lock(&exp->exp_lock);                                      \
+       *exp_connect_flags_ptr(exp) = reply->ocd_connect_flags;         \
+       spin_unlock(&exp->exp_lock);                                    \
 } while (0)
 
 static int ost_init_sec_level(struct ptlrpc_request *req)
@@ -1441,9 +1645,9 @@ static int ost_init_sec_level(struct ptlrpc_request *req)
                         if (!filter->fo_fl_oss_capa)
                                 reply->ocd_connect_flags &= ~OBD_CONNECT_OSS_CAPA;
 
-                        cfs_spin_lock(&exp->exp_lock);
-                        exp->exp_connect_flags = reply->ocd_connect_flags;
-                        cfs_spin_unlock(&exp->exp_lock);
+                       spin_lock(&exp->exp_lock);
+                       *exp_connect_flags_ptr(exp) = reply->ocd_connect_flags;
+                       spin_unlock(&exp->exp_lock);
                 }
                 break;
         default:
@@ -1476,14 +1680,14 @@ static int ost_connect_check_sptlrpc(struct ptlrpc_request *req)
         }
 
         if (exp->exp_flvr.sf_rpc == SPTLRPC_FLVR_INVALID) {
-                cfs_read_lock(&filter->fo_sptlrpc_lock);
-                sptlrpc_target_choose_flavor(&filter->fo_sptlrpc_rset,
-                                             req->rq_sp_from,
-                                             req->rq_peer.nid,
-                                             &flvr);
-                cfs_read_unlock(&filter->fo_sptlrpc_lock);
+               read_lock(&filter->fo_sptlrpc_lock);
+               sptlrpc_target_choose_flavor(&filter->fo_sptlrpc_rset,
+                                            req->rq_sp_from,
+                                            req->rq_peer.nid,
+                                            &flvr);
+               read_unlock(&filter->fo_sptlrpc_lock);
 
-                cfs_spin_lock(&exp->exp_lock);
+               spin_lock(&exp->exp_lock);
 
                 exp->exp_sp_peer = req->rq_sp_from;
                 exp->exp_flvr = flvr;
@@ -1497,7 +1701,7 @@ static int ost_connect_check_sptlrpc(struct ptlrpc_request *req)
                         rc = -EACCES;
                 }
 
-                cfs_spin_unlock(&exp->exp_lock);
+               spin_unlock(&exp->exp_lock);
         } else {
                 if (exp->exp_sp_peer != req->rq_sp_from) {
                         CERROR("RPC source %s doesn't match %s\n",
@@ -1514,52 +1718,59 @@ static int ost_connect_check_sptlrpc(struct ptlrpc_request *req)
 
 /* Ensure that data and metadata are synced to the disk when lock is cancelled
  * (if requested) */
-int ost_blocking_ast(struct ldlm_lock *lock,
-                             struct ldlm_lock_desc *desc,
-                             void *data, int flag)
+/*
+ * Blocking AST for OST locks.
+ *
+ * Queries KEY_SYNC_LOCK_CANCEL from the export and, when a PW/GROUP lock is
+ * being cancelled and the returned policy asks for it, syncs the lock's
+ * extent with obd_sync() before handing over to ldlm_server_blocking_ast().
+ *
+ * \retval non-zero lu_env_init() result if the environment cannot be set up
+ * \retval -ENOMEM  if the obd_info/obdo needed for the sync cannot be
+ *                  allocated (ldlm_server_blocking_ast() is not called)
+ * \retval other    result of ldlm_server_blocking_ast(); a failed
+ *                  obd_get_info() or obd_sync() does not change it
+ */
+int ost_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
+                    void *data, int flag)
 {
-        __u32 sync_lock_cancel = 0;
-        __u32 len = sizeof(sync_lock_cancel);
-        int rc = 0;
-        ENTRY;
-
-        rc = obd_get_info(lock->l_export, sizeof(KEY_SYNC_LOCK_CANCEL),
-                          KEY_SYNC_LOCK_CANCEL, &len, &sync_lock_cancel, NULL);
-
-        if (!rc && flag == LDLM_CB_CANCELING &&
-            (lock->l_granted_mode & (LCK_PW|LCK_GROUP)) &&
-            (sync_lock_cancel == ALWAYS_SYNC_ON_CANCEL ||
-             (sync_lock_cancel == BLOCKING_SYNC_ON_CANCEL &&
-              lock->l_flags & LDLM_FL_CBPENDING))) {
-                struct obd_info *oinfo;
-                struct obdo *oa;
-                int rc;
-
-                OBD_ALLOC_PTR(oinfo);
-                if (!oinfo)
-                        RETURN(-ENOMEM);
-                OBDO_ALLOC(oa);
-                if (!oa) {
-                        OBD_FREE_PTR(oinfo);
-                        RETURN(-ENOMEM);
-                }
-                oa->o_id = lock->l_resource->lr_name.name[0];
-                oa->o_seq = lock->l_resource->lr_name.name[1];
-                oa->o_valid = OBD_MD_FLID|OBD_MD_FLGROUP;
-                oinfo->oi_oa = oa;
-
-                rc = obd_sync(lock->l_export, oinfo,
-                              lock->l_policy_data.l_extent.start,
-                              lock->l_policy_data.l_extent.end, NULL);
-                if (rc)
-                        CERROR("Error %d syncing data on lock cancel\n", rc);
-
-                OBDO_FREE(oa);
-                OBD_FREE_PTR(oinfo);
-        }
-
-        rc = ldlm_server_blocking_ast(lock, desc, data, flag);
-        RETURN(rc);
+       struct lu_env   env;
+       __u32           sync_lock_cancel = 0;
+       __u32           len = sizeof(sync_lock_cancel);
+       int             rc = 0;
+
+       ENTRY;
+
+       rc = lu_env_init(&env, LCT_DT_THREAD);
+       if (unlikely(rc != 0))
+               RETURN(rc);
+
+       rc = obd_get_info(&env, lock->l_export, sizeof(KEY_SYNC_LOCK_CANCEL),
+                         KEY_SYNC_LOCK_CANCEL, &len, &sync_lock_cancel, NULL);
+       if (rc == 0 && flag == LDLM_CB_CANCELING &&
+           (lock->l_granted_mode & (LCK_PW|LCK_GROUP)) &&
+           (sync_lock_cancel == ALWAYS_SYNC_ON_CANCEL ||
+            (sync_lock_cancel == BLOCKING_SYNC_ON_CANCEL &&
+             lock->l_flags & LDLM_FL_CBPENDING))) {
+               struct obd_info *oinfo;
+               struct obdo     *oa;
+
+               /*
+                * Deliberately no block-local "rc" here: the GOTO()s below
+                * must set the function-level rc, otherwise -ENOMEM would
+                * be lost and out_env would return 0.
+                */
+               OBD_ALLOC_PTR(oinfo);
+               if (!oinfo)
+                       GOTO(out_env, rc = -ENOMEM);
+               OBDO_ALLOC(oa);
+               if (!oa) {
+                       OBD_FREE_PTR(oinfo);
+                       GOTO(out_env, rc = -ENOMEM);
+               }
+
+               ostid_res_name_to_id(&oa->o_oi, &lock->l_resource->lr_name);
+               oa->o_valid = OBD_MD_FLID|OBD_MD_FLGROUP;
+               oinfo->oi_oa = oa;
+               oinfo->oi_capa = BYPASS_CAPA;
+
+               /* a sync failure is only logged; rc is overwritten below */
+               rc = obd_sync(&env, lock->l_export, oinfo,
+                             lock->l_policy_data.l_extent.start,
+                             lock->l_policy_data.l_extent.end, NULL);
+               if (rc)
+                       CERROR("Error %d syncing data on lock cancel\n", rc);
+
+               OBDO_FREE(oa);
+               OBD_FREE_PTR(oinfo);
+       }
+
+       rc = ldlm_server_blocking_ast(lock, desc, data, flag);
+out_env:
+       lu_env_fini(&env);
+       RETURN(rc);
 }
 
 static int ost_filter_recovery_request(struct ptlrpc_request *req,
@@ -1584,7 +1795,7 @@ static int ost_filter_recovery_request(struct ptlrpc_request *req,
                 RETURN(0);
 
         default:
-                DEBUG_REQ(D_ERROR, req, "not permitted during recovery");
+                DEBUG_REQ(D_WARNING, req, "not permitted during recovery");
                 *process = -EAGAIN;
                 RETURN(0);
         }
@@ -1619,11 +1830,8 @@ int ost_msg_check_version(struct lustre_msg *msg)
         case OST_SYNC:
         case OST_SET_INFO:
         case OST_GET_INFO:
-#ifdef HAVE_QUOTA_SUPPORT
         case OST_QUOTACHECK:
         case OST_QUOTACTL:
-        case OST_QUOTA_ADJUST_QUNIT:
-#endif
                 rc = lustre_msg_check_version(msg, LUSTRE_OST_VERSION);
                 if (rc)
                         CERROR("bad opc %u version %08x, expecting %08x\n",
@@ -1652,6 +1860,10 @@ int ost_msg_check_version(struct lustre_msg *msg)
                                lustre_msg_get_version(msg),
                                LUSTRE_LOG_VERSION);
                 break;
+       case OST_QUOTA_ADJUST_QUNIT:
+               rc = -ENOTSUPP;
+               CERROR("Quota adjust is deprecated as of 2.4.0\n");
+               break;
         default:
                 CERROR("Unexpected opcode %d\n", lustre_msg_get_opc(msg));
                 rc = -ENOTSUPP;
@@ -1675,19 +1887,22 @@ struct ost_prolong_data {
  */
 static inline int prolong_timeout(struct ptlrpc_request *req)
 {
-        struct ptlrpc_service *svc = req->rq_rqbd->rqbd_service;
+       struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt;
 
-        if (AT_OFF)
-                return obd_timeout / 2;
+       /* AT_OFF set (presumably adaptive timeouts disabled -- confirm):
+        * fall back to half of obd_timeout */
+       if (AT_OFF)
+               return obd_timeout / 2;
 
-        return max(at_est2timeout(at_get(&svc->srv_at_estimate)), ldlm_timeout);
+       /* otherwise use the larger of the timeout derived from this service
+        * partition's scp_at_estimate and ldlm_timeout */
+       return max(at_est2timeout(at_get(&svcpt->scp_at_estimate)),
+                  ldlm_timeout);
 }
 
 static void ost_prolong_lock_one(struct ost_prolong_data *opd,
                                  struct ldlm_lock *lock)
 {
-        LASSERT(lock->l_req_mode == lock->l_granted_mode);
-        LASSERT(lock->l_export == opd->opd_exp);
+       LASSERT(lock->l_export == opd->opd_exp);
+
+       if (lock->l_flags & LDLM_FL_DESTROYED) /* lock already cancelled */
+               return;
 
         /* XXX: never try to grab resource lock here because we're inside
          * exp_bl_list_lock; in ldlm_lockd.c to handle waiting list we take
@@ -1735,7 +1950,7 @@ static void ost_prolong_locks(struct ost_prolong_data *data)
         }
 
 
-        cfs_spin_lock_bh(&exp->exp_bl_list_lock);
+       spin_lock_bh(&exp->exp_bl_list_lock);
         cfs_list_for_each_entry(lock, &exp->exp_bl_list, l_exp_list) {
                 LASSERT(lock->l_flags & LDLM_FL_AST_SENT);
                 LASSERT(lock->l_resource->lr_type == LDLM_EXTENT);
@@ -1749,9 +1964,9 @@ static void ost_prolong_locks(struct ost_prolong_data *data)
 
                 ost_prolong_lock_one(data, lock);
         }
-        cfs_spin_unlock_bh(&exp->exp_bl_list_lock);
+       spin_unlock_bh(&exp->exp_bl_list_lock);
 
-        EXIT;
+       EXIT;
 }
 
 /**
@@ -1780,10 +1995,9 @@ static int ost_rw_hpreq_lock_match(struct ptlrpc_request *req,
         nb += ioo->ioo_bufcnt - 1;
         ext.end = nb->offset + nb->len - 1;
 
-        LASSERT(lock->l_resource != NULL);
-        if (!osc_res_name_eq(ioo->ioo_id, ioo->ioo_seq,
-                             &lock->l_resource->lr_name))
-                RETURN(0);
+       LASSERT(lock->l_resource != NULL);
+       if (!ostid_res_name_eq(&ioo->ioo_oid, &lock->l_resource->lr_name))
+               RETURN(0);
 
         mode = LCK_PW;
         if (opc == OST_READ)
@@ -1831,7 +2045,7 @@ static int ost_rw_hpreq_check(struct ptlrpc_request *req)
         LASSERT(nb != NULL);
         LASSERT(!(nb->flags & OBD_BRW_SRVLOCK));
 
-        osc_build_res_name(ioo->ioo_id, ioo->ioo_seq, &opd.opd_resid);
+       ostid_build_res_name(&ioo->ioo_oid, &opd.opd_resid);
 
         opd.opd_req = req;
         mode = LCK_PW;
@@ -1856,7 +2070,7 @@ static int ost_rw_hpreq_check(struct ptlrpc_request *req)
         CDEBUG(D_DLMTRACE, "%s: refreshed %u locks timeout for req %p.\n",
                obd->obd_name, opd.opd_locks, req);
 
-        RETURN(opd.opd_locks);
+        RETURN(opd.opd_locks > 0);
 }
 
 static void ost_rw_hpreq_fini(struct ptlrpc_request *req)
@@ -1915,7 +2129,7 @@ static int ost_punch_hpreq_check(struct ptlrpc_request *req)
                 opd.opd_extent.end = OBD_OBJECT_EOF;
         opd.opd_timeout = prolong_timeout(req);
 
-        osc_build_res_name(oa->o_id, oa->o_seq, &opd.opd_resid);
+       ostid_build_res_name(&oa->o_oi, &opd.opd_resid);
 
         CDEBUG(D_DLMTRACE,
                "%s: refresh locks: "LPU64"/"LPU64" ("LPU64"->"LPU64")\n",
@@ -1949,7 +2163,7 @@ struct ptlrpc_hpreq_ops ost_hpreq_punch = {
 };
 
 /** Assign high priority operations to the request if needed. */
-static int ost_hpreq_handler(struct ptlrpc_request *req)
+static int ost_io_hpreq_handler(struct ptlrpc_request *req)
 {
         ENTRY;
         if (req->rq_export) {
@@ -2055,29 +2269,35 @@ static int ost_hpreq_handler(struct ptlrpc_request *req)
 /* TODO: handle requests in a similar way as MDT: see mdt_handle_common() */
 int ost_handle(struct ptlrpc_request *req)
 {
-        struct obd_trans_info trans_info = { 0, };
-        struct obd_trans_info *oti = &trans_info;
-        int should_process, fail = OBD_FAIL_OST_ALL_REPLY_NET, rc = 0;
-        struct obd_device *obd = NULL;
-        ENTRY;
-
-        LASSERT(current->journal_info == NULL);
-
-        /* primordial rpcs don't affect server recovery */
-        switch (lustre_msg_get_opc(req->rq_reqmsg)) {
-        case SEC_CTX_INIT:
-        case SEC_CTX_INIT_CONT:
-        case SEC_CTX_FINI:
-                GOTO(out, rc = 0);
-        }
-
-        req_capsule_init(&req->rq_pill, req, RCL_SERVER);
-
-        if (lustre_msg_get_opc(req->rq_reqmsg) != OST_CONNECT) {
-                if (!class_connected_export(req->rq_export)) {
-                        CDEBUG(D_HA,"operation %d on unconnected OST from %s\n",
-                               lustre_msg_get_opc(req->rq_reqmsg),
-                               libcfs_id2str(req->rq_peer));
+       struct obd_trans_info trans_info = { 0, };
+       struct obd_trans_info *oti = &trans_info;
+       int should_process, fail = OBD_FAIL_OST_ALL_REPLY_NET, rc = 0;
+       struct obd_device *obd = NULL;
+       __u32 opc = lustre_msg_get_opc(req->rq_reqmsg);
+       ENTRY;
+
+       /* OST module is kept between remounts, but the last reference
+        * to specific module (say, osd or ofd) kills all related keys
+        * from the environment. so we have to refill it until the root
+        * cause is fixed properly */
+       lu_env_refill(req->rq_svc_thread->t_env);
+
+       LASSERT(current->journal_info == NULL);
+
+       /* primordial rpcs don't affect server recovery */
+       switch (opc) {
+       case SEC_CTX_INIT:
+       case SEC_CTX_INIT_CONT:
+       case SEC_CTX_FINI:
+               GOTO(out, rc = 0);
+       }
+
+       req_capsule_init(&req->rq_pill, req, RCL_SERVER);
+
+       if (opc != OST_CONNECT) {
+               if (!class_connected_export(req->rq_export)) {
+                       CDEBUG(D_HA,"operation %d on unconnected OST from %s\n",
+                              opc, libcfs_id2str(req->rq_peer));
                         req->rq_status = -ENOTCONN;
                         GOTO(out, rc = -ENOTCONN);
                 }
@@ -2104,7 +2324,11 @@ int ost_handle(struct ptlrpc_request *req)
         if (rc)
                 RETURN(rc);
 
-        switch (lustre_msg_get_opc(req->rq_reqmsg)) {
+       if (req && req->rq_reqmsg && req->rq_export &&
+           (exp_connect_flags(req->rq_export) & OBD_CONNECT_JOBSTATS))
+               oti->oti_jobid = lustre_msg_get_jobid(req->rq_reqmsg);
+
+       switch (opc) {
         case OST_CONNECT: {
                 CDEBUG(D_INODE, "connect\n");
                 req_capsule_set(&req->rq_pill, &RQF_OST_CONNECT);
@@ -2163,18 +2387,18 @@ int ost_handle(struct ptlrpc_request *req)
                 req_capsule_set(&req->rq_pill, &RQF_OST_BRW_WRITE);
                 CDEBUG(D_INODE, "write\n");
                 /* req->rq_request_portal would be nice, if it was set */
-                if (req->rq_rqbd->rqbd_service->srv_req_portal !=OST_IO_PORTAL){
-                        CERROR("%s: deny write request from %s to portal %u\n",
-                               req->rq_export->exp_obd->obd_name,
-                               obd_export_nid2str(req->rq_export),
-                               req->rq_rqbd->rqbd_service->srv_req_portal);
+               if (ptlrpc_req2svc(req)->srv_req_portal != OST_IO_PORTAL) {
+                       CERROR("%s: deny write request from %s to portal %u\n",
+                              req->rq_export->exp_obd->obd_name,
+                              obd_export_nid2str(req->rq_export),
+                              ptlrpc_req2svc(req)->srv_req_portal);
                         GOTO(out, rc = -EPROTO);
                 }
                 if (OBD_FAIL_CHECK(OBD_FAIL_OST_BRW_NET))
                         RETURN(0);
                 if (OBD_FAIL_CHECK(OBD_FAIL_OST_ENOSPC))
                         GOTO(out, rc = -ENOSPC);
-                if (OBD_FAIL_CHECK(OBD_FAIL_OST_EROFS))
+                if (OBD_FAIL_TIMEOUT(OBD_FAIL_OST_EROFS, 1))
                         GOTO(out, rc = -EROFS);
                 rc = ost_brw_write(req, oti);
                 LASSERT(current->journal_info == NULL);
@@ -2184,11 +2408,11 @@ int ost_handle(struct ptlrpc_request *req)
                 req_capsule_set(&req->rq_pill, &RQF_OST_BRW_READ);
                 CDEBUG(D_INODE, "read\n");
                 /* req->rq_request_portal would be nice, if it was set */
-                if (req->rq_rqbd->rqbd_service->srv_req_portal !=OST_IO_PORTAL){
-                        CERROR("%s: deny read request from %s to portal %u\n",
-                               req->rq_export->exp_obd->obd_name,
-                               obd_export_nid2str(req->rq_export),
-                               req->rq_rqbd->rqbd_service->srv_req_portal);
+               if (ptlrpc_req2svc(req)->srv_req_portal != OST_IO_PORTAL) {
+                       CERROR("%s: deny read request from %s to portal %u\n",
+                              req->rq_export->exp_obd->obd_name,
+                              obd_export_nid2str(req->rq_export),
+                              ptlrpc_req2svc(req)->srv_req_portal);
                         GOTO(out, rc = -EPROTO);
                 }
                 if (OBD_FAIL_CHECK(OBD_FAIL_OST_BRW_NET))
@@ -2218,7 +2442,7 @@ int ost_handle(struct ptlrpc_request *req)
                 req_capsule_set(&req->rq_pill, &RQF_OST_SYNC);
                 if (OBD_FAIL_CHECK(OBD_FAIL_OST_SYNC_NET))
                         RETURN(0);
-                rc = ost_sync(req->rq_export, req);
+               rc = ost_sync(req->rq_export, req, oti);
                 break;
         case OST_SET_INFO:
                 DEBUG_REQ(D_INODE, req, "set_info");
@@ -2230,7 +2454,6 @@ int ost_handle(struct ptlrpc_request *req)
                 req_capsule_set(&req->rq_pill, &RQF_OST_GET_INFO_GENERIC);
                 rc = ost_get_info(req->rq_export, req);
                 break;
-#ifdef HAVE_QUOTA_SUPPORT
         case OST_QUOTACHECK:
                 CDEBUG(D_INODE, "quotacheck\n");
                 req_capsule_set(&req->rq_pill, &RQF_OST_QUOTACHECK);
@@ -2245,12 +2468,6 @@ int ost_handle(struct ptlrpc_request *req)
                         RETURN(0);
                 rc = ost_handle_quotactl(req);
                 break;
-        case OST_QUOTA_ADJUST_QUNIT:
-                CDEBUG(D_INODE, "quota_adjust_qunit\n");
-                req_capsule_set(&req->rq_pill, &RQF_OST_QUOTA_ADJUST_QUNIT);
-                rc = ost_handle_quota_adjust_qunit(req);
-                break;
-#endif
         case OBD_PING:
                 DEBUG_REQ(D_INODE, req, "ping");
                 req_capsule_set(&req->rq_pill, &RQF_OBD_PING);
@@ -2266,51 +2483,37 @@ int ost_handle(struct ptlrpc_request *req)
                 if (rc)
                         RETURN(rc);
                 RETURN(ptlrpc_reply(req));
-        case OBD_LOG_CANCEL:
-                CDEBUG(D_INODE, "log cancel\n");
-                req_capsule_set(&req->rq_pill, &RQF_LOG_CANCEL);
-                if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LOG_CANCEL_NET))
-                        RETURN(0);
-                rc = llog_origin_handle_cancel(req);
-                if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LOG_CANCEL_REP))
-                        RETURN(0);
-                req->rq_status = rc;
-                rc = req_capsule_server_pack(&req->rq_pill);
-                if (rc)
-                        RETURN(rc);
-                RETURN(ptlrpc_reply(req));
-        case LDLM_ENQUEUE:
-                CDEBUG(D_INODE, "enqueue\n");
-                req_capsule_set(&req->rq_pill, &RQF_LDLM_ENQUEUE);
-                if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_ENQUEUE))
-                        RETURN(0);
-                rc = ldlm_handle_enqueue(req, ldlm_server_completion_ast,
-                                         ost_blocking_ast,
-                                         ldlm_server_glimpse_ast);
-                fail = OBD_FAIL_OST_LDLM_REPLY_NET;
-                break;
-        case LDLM_CONVERT:
-                CDEBUG(D_INODE, "convert\n");
-                req_capsule_set(&req->rq_pill, &RQF_LDLM_CONVERT);
-                if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CONVERT))
-                        RETURN(0);
-                rc = ldlm_handle_convert(req);
-                break;
-        case LDLM_CANCEL:
-                CDEBUG(D_INODE, "cancel\n");
-                req_capsule_set(&req->rq_pill, &RQF_LDLM_CANCEL);
-                if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL))
-                        RETURN(0);
-                rc = ldlm_handle_cancel(req);
-                break;
+       case LDLM_ENQUEUE:
+               CDEBUG(D_INODE, "enqueue\n");
+               req_capsule_set(&req->rq_pill, &RQF_LDLM_ENQUEUE);
+               if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_ENQUEUE_NET))
+                       RETURN(0);
+               rc = ldlm_handle_enqueue(req, ldlm_server_completion_ast,
+                                        ost_blocking_ast,
+                                        ldlm_server_glimpse_ast);
+               fail = OBD_FAIL_OST_LDLM_REPLY_NET;
+               break;
+       case LDLM_CONVERT:
+               CDEBUG(D_INODE, "convert\n");
+               req_capsule_set(&req->rq_pill, &RQF_LDLM_CONVERT);
+               if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CONVERT_NET))
+                       RETURN(0);
+               rc = ldlm_handle_convert(req);
+               break;
+       case LDLM_CANCEL:
+               CDEBUG(D_INODE, "cancel\n");
+               req_capsule_set(&req->rq_pill, &RQF_LDLM_CANCEL);
+               if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_NET))
+                       RETURN(0);
+               rc = ldlm_handle_cancel(req);
+               break;
         case LDLM_BL_CALLBACK:
         case LDLM_CP_CALLBACK:
                 CDEBUG(D_INODE, "callback\n");
                 CERROR("callbacks should not happen on OST\n");
                 /* fall through */
         default:
-                CERROR("Unexpected opcode %d\n",
-                       lustre_msg_get_opc(req->rq_reqmsg));
+               CERROR("Unexpected opcode %d\n", opc);
                 req->rq_status = -ENOTSUPP;
                 rc = ptlrpc_error(req);
                 RETURN(rc);
@@ -2320,7 +2523,7 @@ int ost_handle(struct ptlrpc_request *req)
 
         EXIT;
         /* If we're DISCONNECTing, the export_data is already freed */
-        if (!rc && lustre_msg_get_opc(req->rq_reqmsg) != OST_DISCONNECT)
+       if (!rc && opc != OST_DISCONNECT)
                 target_committed_to_req(req);
 
 out:
@@ -2331,10 +2534,11 @@ out:
         return 0;
 }
 EXPORT_SYMBOL(ost_handle);
+
 /*
- * free per-thread pool created by ost_thread_init().
+ * free per-thread pool created by ost_io_thread_init().
  */
-static void ost_thread_done(struct ptlrpc_thread *thread)
+static void ost_io_thread_done(struct ptlrpc_thread *thread)
 {
         struct ost_thread_local_cache *tls; /* TLS stands for Thread-Local
                                              * Storage */
@@ -2345,7 +2549,7 @@ static void ost_thread_done(struct ptlrpc_thread *thread)
 
         /*
          * be prepared to handle partially-initialized pools (because this is
-         * called from ost_thread_init() for cleanup.
+         * called from ost_io_thread_init() for cleanup).
          */
         tls = thread->t_data;
         if (tls != NULL) {
@@ -2358,7 +2562,7 @@ static void ost_thread_done(struct ptlrpc_thread *thread)
 /*
  * initialize per-thread page pool (bug 5137).
  */
-static int ost_thread_init(struct ptlrpc_thread *thread)
+static int ost_io_thread_init(struct ptlrpc_thread *thread)
 {
         struct ost_thread_local_cache *tls;
 
@@ -2366,7 +2570,6 @@ static int ost_thread_init(struct ptlrpc_thread *thread)
 
         LASSERT(thread != NULL);
         LASSERT(thread->t_data == NULL);
-        LASSERTF(thread->t_id <= OSS_THREADS_MAX, "%u\n", thread->t_id);
 
         OBD_ALLOC_PTR(tls);
         if (tls == NULL)
@@ -2377,17 +2580,17 @@ static int ost_thread_init(struct ptlrpc_thread *thread)
 
 #define OST_WATCHDOG_TIMEOUT (obd_timeout * 1000)
 
+static struct cfs_cpt_table    *ost_io_cptable;
+
 /* Sigh - really, this is an OSS, the _server_, not the _target_ */
 static int ost_setup(struct obd_device *obd, struct lustre_cfg* lcfg)
 {
-        struct ost_obd *ost = &obd->u.ost;
-        struct lprocfs_static_vars lvars;
-        int oss_min_threads;
-        int oss_max_threads;
-        int oss_min_create_threads;
-        int oss_max_create_threads;
-        int rc;
-        ENTRY;
+       static struct ptlrpc_service_conf       svc_conf;
+       struct ost_obd *ost = &obd->u.ost;
+       struct lprocfs_static_vars lvars;
+       nodemask_t              *mask;
+       int rc;
+       ENTRY;
 
         rc = cfs_cleanup_group_info();
         if (rc)
@@ -2396,101 +2599,246 @@ static int ost_setup(struct obd_device *obd, struct lustre_cfg* lcfg)
         lprocfs_ost_init_vars(&lvars);
         lprocfs_obd_setup(obd, lvars.obd_vars);
 
-        cfs_sema_init(&ost->ost_health_sem, 1);
-
-        if (oss_num_threads) {
-                /* If oss_num_threads is set, it is the min and the max. */
-                if (oss_num_threads > OSS_THREADS_MAX)
-                        oss_num_threads = OSS_THREADS_MAX;
-                if (oss_num_threads < OSS_THREADS_MIN)
-                        oss_num_threads = OSS_THREADS_MIN;
-                oss_max_threads = oss_min_threads = oss_num_threads;
-        } else {
-                /* Base min threads on memory and cpus */
-                oss_min_threads =
-                        cfs_num_possible_cpus() * CFS_NUM_CACHEPAGES >>
-                        (27 - CFS_PAGE_SHIFT);
-                if (oss_min_threads < OSS_THREADS_MIN)
-                        oss_min_threads = OSS_THREADS_MIN;
-                /* Insure a 4x range for dynamic threads */
-                if (oss_min_threads > OSS_THREADS_MAX / 4)
-                        oss_min_threads = OSS_THREADS_MAX / 4;
-                oss_max_threads = min(OSS_THREADS_MAX, oss_min_threads * 4 + 1);
-        }
-
-        ost->ost_service =
-                ptlrpc_init_svc(OST_NBUFS, OST_BUFSIZE, OST_MAXREQSIZE,
-                                OST_MAXREPSIZE, OST_REQUEST_PORTAL,
-                                OSC_REPLY_PORTAL, OSS_SERVICE_WATCHDOG_FACTOR,
-                                ost_handle, LUSTRE_OSS_NAME,
-                                obd->obd_proc_entry, target_print_req,
-                                oss_min_threads, oss_max_threads,
-                                "ll_ost", LCT_DT_THREAD, NULL);
-        if (ost->ost_service == NULL) {
-                CERROR("failed to start service\n");
-                GOTO(out_lprocfs, rc = -ENOMEM);
+       mutex_init(&ost->ost_health_mutex);
+
+       svc_conf = (typeof(svc_conf)) {
+               .psc_name               = LUSTRE_OSS_NAME,
+               .psc_watchdog_factor    = OSS_SERVICE_WATCHDOG_FACTOR,
+               .psc_buf                = {
+                       .bc_nbufs               = OST_NBUFS,
+                       .bc_buf_size            = OST_BUFSIZE,
+                       .bc_req_max_size        = OST_MAXREQSIZE,
+                       .bc_rep_max_size        = OST_MAXREPSIZE,
+                       .bc_req_portal          = OST_REQUEST_PORTAL,
+                       .bc_rep_portal          = OSC_REPLY_PORTAL,
+               },
+               .psc_thr                = {
+                       .tc_thr_name            = "ll_ost",
+                       .tc_thr_factor          = OSS_THR_FACTOR,
+                       .tc_nthrs_init          = OSS_NTHRS_INIT,
+                       .tc_nthrs_base          = OSS_NTHRS_BASE,
+                       .tc_nthrs_max           = OSS_NTHRS_MAX,
+                       .tc_nthrs_user          = oss_num_threads,
+                       .tc_cpu_affinity        = 1,
+                       .tc_ctx_tags            = LCT_DT_THREAD,
+               },
+               .psc_cpt                = {
+                       .cc_pattern             = oss_cpts,
+               },
+               .psc_ops                = {
+                       .so_req_handler         = ost_handle,
+                       .so_req_printer         = target_print_req,
+                       .so_hpreq_handler       = ptlrpc_hpreq_handler,
+               },
+       };
+       ost->ost_service = ptlrpc_register_service(&svc_conf,
+                                                  obd->obd_proc_entry);
+       if (IS_ERR(ost->ost_service)) {
+               rc = PTR_ERR(ost->ost_service);
+               CERROR("failed to start service: %d\n", rc);
+               GOTO(out_lprocfs, rc);
         }
 
-        rc = ptlrpc_start_threads(ost->ost_service);
-        if (rc)
-                GOTO(out_service, rc = -EINVAL);
-
-        if (oss_num_create_threads) {
-                if (oss_num_create_threads > OSS_MAX_CREATE_THREADS)
-                        oss_num_create_threads = OSS_MAX_CREATE_THREADS;
-                if (oss_num_create_threads < OSS_MIN_CREATE_THREADS)
-                        oss_num_create_threads = OSS_MIN_CREATE_THREADS;
-                oss_min_create_threads = oss_max_create_threads =
-                        oss_num_create_threads;
-        } else {
-                oss_min_create_threads = OSS_MIN_CREATE_THREADS;
-                oss_max_create_threads = OSS_MAX_CREATE_THREADS;
+       memset(&svc_conf, 0, sizeof(svc_conf));
+       svc_conf = (typeof(svc_conf)) {
+               .psc_name               = "ost_create",
+               .psc_watchdog_factor    = OSS_SERVICE_WATCHDOG_FACTOR,
+               .psc_buf                = {
+                       .bc_nbufs               = OST_NBUFS,
+                       .bc_buf_size            = OST_BUFSIZE,
+                       .bc_req_max_size        = OST_MAXREQSIZE,
+                       .bc_rep_max_size        = OST_MAXREPSIZE,
+                       .bc_req_portal          = OST_CREATE_PORTAL,
+                       .bc_rep_portal          = OSC_REPLY_PORTAL,
+               },
+               .psc_thr                = {
+                       .tc_thr_name            = "ll_ost_create",
+                       .tc_thr_factor          = OSS_CR_THR_FACTOR,
+                       .tc_nthrs_init          = OSS_CR_NTHRS_INIT,
+                       .tc_nthrs_base          = OSS_CR_NTHRS_BASE,
+                       .tc_nthrs_max           = OSS_CR_NTHRS_MAX,
+                       .tc_nthrs_user          = oss_num_create_threads,
+                       .tc_cpu_affinity        = 1,
+                       .tc_ctx_tags            = LCT_DT_THREAD,
+               },
+               .psc_cpt                = {
+                       .cc_pattern             = oss_cpts,
+               },
+               .psc_ops                = {
+                       .so_req_handler         = ost_handle,
+                       .so_req_printer         = target_print_req,
+               },
+       };
+       ost->ost_create_service = ptlrpc_register_service(&svc_conf,
+                                                         obd->obd_proc_entry);
+       if (IS_ERR(ost->ost_create_service)) {
+               rc = PTR_ERR(ost->ost_create_service);
+               CERROR("failed to start OST create service: %d\n", rc);
+               GOTO(out_service, rc);
         }
 
-        ost->ost_create_service =
-                ptlrpc_init_svc(OST_NBUFS, OST_BUFSIZE, OST_MAXREQSIZE,
-                                OST_MAXREPSIZE, OST_CREATE_PORTAL,
-                                OSC_REPLY_PORTAL, OSS_SERVICE_WATCHDOG_FACTOR,
-                                ost_handle, "ost_create",
-                                obd->obd_proc_entry, target_print_req,
-                                oss_min_create_threads, oss_max_create_threads,
-                                "ll_ost_creat", LCT_DT_THREAD, NULL);
-        if (ost->ost_create_service == NULL) {
-                CERROR("failed to start OST create service\n");
-                GOTO(out_service, rc = -ENOMEM);
-        }
-
-        rc = ptlrpc_start_threads(ost->ost_create_service);
-        if (rc)
-                GOTO(out_create, rc = -EINVAL);
-
-        ost->ost_io_service =
-                ptlrpc_init_svc(OST_NBUFS, OST_BUFSIZE, OST_MAXREQSIZE,
-                                OST_MAXREPSIZE, OST_IO_PORTAL,
-                                OSC_REPLY_PORTAL, OSS_SERVICE_WATCHDOG_FACTOR,
-                                ost_handle, "ost_io",
-                                obd->obd_proc_entry, target_print_req,
-                                oss_min_threads, oss_max_threads,
-                                "ll_ost_io", LCT_DT_THREAD, ost_hpreq_handler);
-        if (ost->ost_io_service == NULL) {
-                CERROR("failed to start OST I/O service\n");
-                GOTO(out_create, rc = -ENOMEM);
+       mask = cfs_cpt_table->ctb_nodemask;
+       /* even if CPT is disabled at the libcfs level by setting the partition
+        * number to 1, we still want to set node affinity for io service */
+       if (cfs_cpt_number(cfs_cpt_table) == 1 && nodes_weight(*mask) > 1) {
+               int     cpt = 0;
+               int     i;
+
+               ost_io_cptable = cfs_cpt_table_alloc(nodes_weight(*mask));
+               for_each_node_mask(i, *mask) {
+                       if (ost_io_cptable == NULL) {
+                               CWARN("OSS failed to create CPT table\n");
+                               break;
+                       }
+
+                       rc = cfs_cpt_set_node(ost_io_cptable, cpt++, i);
+                       if (!rc) {
+                               CWARN("OSS Failed to set node %d for"
+                                     "IO CPT table\n", i);
+                               cfs_cpt_table_free(ost_io_cptable);
+                               ost_io_cptable = NULL;
+                               break;
+                       }
+               }
+       }
+
+       memset(&svc_conf, 0, sizeof(svc_conf));
+       svc_conf = (typeof(svc_conf)) {
+               .psc_name               = "ost_io",
+               .psc_watchdog_factor    = OSS_SERVICE_WATCHDOG_FACTOR,
+               .psc_buf                = {
+                       .bc_nbufs               = OST_NBUFS,
+                       .bc_buf_size            = OST_IO_BUFSIZE,
+                       .bc_req_max_size        = OST_IO_MAXREQSIZE,
+                       .bc_rep_max_size        = OST_IO_MAXREPSIZE,
+                       .bc_req_portal          = OST_IO_PORTAL,
+                       .bc_rep_portal          = OSC_REPLY_PORTAL,
+               },
+               .psc_thr                = {
+                       .tc_thr_name            = "ll_ost_io",
+                       .tc_thr_factor          = OSS_THR_FACTOR,
+                       .tc_nthrs_init          = OSS_NTHRS_INIT,
+                       .tc_nthrs_base          = OSS_NTHRS_BASE,
+                       .tc_nthrs_max           = OSS_NTHRS_MAX,
+                       .tc_nthrs_user          = oss_num_threads,
+                       .tc_cpu_affinity        = 1,
+                       .tc_ctx_tags            = LCT_DT_THREAD,
+               },
+               .psc_cpt                = {
+                       .cc_cptable             = ost_io_cptable,
+                       .cc_pattern             = ost_io_cptable == NULL ?
+                                                 oss_io_cpts : NULL,
+               },
+               .psc_ops                = {
+                       .so_thr_init            = ost_io_thread_init,
+                       .so_thr_done            = ost_io_thread_done,
+                       .so_req_handler         = ost_handle,
+                       .so_hpreq_handler       = ost_io_hpreq_handler,
+                       .so_req_printer         = target_print_req,
+               },
+       };
+       ost->ost_io_service = ptlrpc_register_service(&svc_conf,
+                                                     obd->obd_proc_entry);
+       if (IS_ERR(ost->ost_io_service)) {
+               rc = PTR_ERR(ost->ost_io_service);
+               CERROR("failed to start OST I/O service: %d\n", rc);
+               ost->ost_io_service = NULL;
+               GOTO(out_create, rc);
         }
 
-        ost->ost_io_service->srv_init = ost_thread_init;
-        ost->ost_io_service->srv_done = ost_thread_done;
-        ost->ost_io_service->srv_cpu_affinity = 1;
-        rc = ptlrpc_start_threads(ost->ost_io_service);
-        if (rc)
-                GOTO(out_io, rc = -EINVAL);
-
-        ping_evictor_start();
-
-        RETURN(0);
+       memset(&svc_conf, 0, sizeof(svc_conf));
+       svc_conf = (typeof(svc_conf)) {
+               .psc_name               = "ost_seq",
+               .psc_watchdog_factor    = OSS_SERVICE_WATCHDOG_FACTOR,
+               .psc_buf                = {
+                       .bc_nbufs               = OST_NBUFS,
+                       .bc_buf_size            = OST_BUFSIZE,
+                       .bc_req_max_size        = OST_MAXREQSIZE,
+                       .bc_rep_max_size        = OST_MAXREPSIZE,
+                       .bc_req_portal          = SEQ_DATA_PORTAL,
+                       .bc_rep_portal          = OSC_REPLY_PORTAL,
+               },
+               .psc_thr                = {
+                       .tc_thr_name            = "ll_ost_seq",
+                       .tc_thr_factor          = OSS_CR_THR_FACTOR,
+                       .tc_nthrs_init          = OSS_CR_NTHRS_INIT,
+                       .tc_nthrs_base          = OSS_CR_NTHRS_BASE,
+                       .tc_nthrs_max           = OSS_CR_NTHRS_MAX,
+                       .tc_nthrs_user          = oss_num_create_threads,
+                       .tc_cpu_affinity        = 1,
+                       .tc_ctx_tags            = LCT_DT_THREAD,
+               },
+
+               .psc_cpt                = {
+                       .cc_pattern          = oss_cpts,
+               },
+               .psc_ops                = {
+                       .so_req_handler         = tgt_request_handle,
+                       .so_req_printer         = target_print_req,
+                       .so_hpreq_handler       = NULL,
+               },
+       };
+       ost->ost_seq_service = ptlrpc_register_service(&svc_conf,
+                                                     obd->obd_proc_entry);
+       if (IS_ERR(ost->ost_seq_service)) {
+               rc = PTR_ERR(ost->ost_seq_service);
+               CERROR("failed to start OST seq service: %d\n", rc);
+               ost->ost_seq_service = NULL;
+               GOTO(out_io, rc);
+       }
+
+#if 0
+       /* Object update service */
+       memset(&svc_conf, 0, sizeof(svc_conf));
+       svc_conf = (typeof(svc_conf)) {
+               .psc_name               = "ost_out",
+               .psc_watchdog_factor    = OSS_SERVICE_WATCHDOG_FACTOR,
+               .psc_buf                = {
+                       .bc_nbufs               = OST_NBUFS,
+                       .bc_buf_size            = OUT_BUFSIZE,
+                       .bc_req_max_size        = OUT_MAXREQSIZE,
+                       .bc_rep_max_size        = OUT_MAXREPSIZE,
+                       .bc_req_portal          = OUT_PORTAL,
+                       .bc_rep_portal          = OSC_REPLY_PORTAL,
+               },
+               /*
+                * We'd like to have a mechanism to set this on a per-device
+                * basis, but alas...
+                */
+               .psc_thr                = {
+                       .tc_thr_name            = "ll_ost_out",
+                       .tc_thr_factor          = OSS_CR_THR_FACTOR,
+                       .tc_nthrs_init          = OSS_CR_NTHRS_INIT,
+                       .tc_nthrs_base          = OSS_CR_NTHRS_BASE,
+                       .tc_nthrs_max           = OSS_CR_NTHRS_MAX,
+                       .tc_nthrs_user          = oss_num_create_threads,
+                       .tc_cpu_affinity        = 1,
+                       .tc_ctx_tags            = LCT_DT_THREAD,
+               },
+               .psc_cpt                = {
+                       .cc_pattern             = oss_cpts,
+               },
+               .psc_ops                = {
+                       .so_req_handler         = tgt_request_handle,
+                       .so_req_printer         = target_print_req,
+                       .so_hpreq_handler       = NULL,
+               },
+       };
+       ost->ost_out_service = ptlrpc_register_service(&svc_conf,
+                                                      obd->obd_proc_entry);
+       if (IS_ERR(ost->ost_out_service)) {
+               rc = PTR_ERR(ost->ost_out_service);
+               CERROR("failed to start out service: %d\n", rc);
+               ost->ost_out_service = NULL;
+               GOTO(out_seq, rc);
+       }
+#endif
+       ping_evictor_start();
 
+       RETURN(0);
 out_io:
-        ptlrpc_unregister_service(ost->ost_io_service);
-        ost->ost_io_service = NULL;
+       ptlrpc_unregister_service(ost->ost_io_service);
+       ost->ost_io_service = NULL;
 out_create:
         ptlrpc_unregister_service(ost->ost_create_service);
         ost->ost_create_service = NULL;
@@ -2504,38 +2852,51 @@ out_lprocfs:
 
 static int ost_cleanup(struct obd_device *obd)
 {
-        struct ost_obd *ost = &obd->u.ost;
-        int err = 0;
-        ENTRY;
+       struct ost_obd *ost = &obd->u.ost;
+       int err = 0;
+       ENTRY;
+
+       ping_evictor_stop();
+
+       /* there is no recovery for OST OBD, all recovery is controlled by
+        * obdfilter OBD */
+       LASSERT(obd->obd_recovering == 0);
+       mutex_lock(&ost->ost_health_mutex);
+       ptlrpc_unregister_service(ost->ost_service);
+       ptlrpc_unregister_service(ost->ost_create_service);
+       ptlrpc_unregister_service(ost->ost_io_service);
+       ptlrpc_unregister_service(ost->ost_seq_service);
+#if 0
+       ptlrpc_unregister_service(ost->ost_out_service);
+#endif
+       ost->ost_service = NULL;
+       ost->ost_create_service = NULL;
+       ost->ost_io_service = NULL;
+       ost->ost_seq_service = NULL;
+       ost->ost_out_service = NULL;
 
-        ping_evictor_stop();
+       mutex_unlock(&ost->ost_health_mutex);
 
-        /* there is no recovery for OST OBD, all recovery is controlled by
-         * obdfilter OBD */
-        LASSERT(obd->obd_recovering == 0);
-        cfs_down(&ost->ost_health_sem);
-        ptlrpc_unregister_service(ost->ost_service);
-        ptlrpc_unregister_service(ost->ost_create_service);
-        ptlrpc_unregister_service(ost->ost_io_service);
-        ost->ost_service = NULL;
-        ost->ost_create_service = NULL;
-        cfs_up(&ost->ost_health_sem);
+       lprocfs_obd_cleanup(obd);
 
-        lprocfs_obd_cleanup(obd);
+       if (ost_io_cptable != NULL) {
+               cfs_cpt_table_free(ost_io_cptable);
+               ost_io_cptable = NULL;
+       }
 
-        RETURN(err);
+       RETURN(err);
 }
 
-static int ost_health_check(struct obd_device *obd)
+static int ost_health_check(const struct lu_env *env, struct obd_device *obd)
 {
         struct ost_obd *ost = &obd->u.ost;
         int rc = 0;
 
-        cfs_down(&ost->ost_health_sem);
+       mutex_lock(&ost->ost_health_mutex);
         rc |= ptlrpc_service_health_check(ost->ost_service);
         rc |= ptlrpc_service_health_check(ost->ost_create_service);
         rc |= ptlrpc_service_health_check(ost->ost_io_service);
-        cfs_up(&ost->ost_health_sem);
+       mutex_unlock(&ost->ost_health_mutex);
 
         /*
          * health_check to return 0 on healthy
@@ -2567,6 +2928,8 @@ static int __init ost_init(void)
         int rc;
         ENTRY;
 
+       ost_page_to_corrupt = alloc_page(GFP_IOFS);
+
         lprocfs_ost_init_vars(&lvars);
         rc = class_register_type(&ost_obd_ops, NULL, lvars.module_vars,
                                  LUSTRE_OSS_NAME, NULL);
@@ -2583,6 +2946,9 @@ static int __init ost_init(void)
 
 static void /*__exit*/ ost_exit(void)
 {
+       if (ost_page_to_corrupt)
+               page_cache_release(ost_page_to_corrupt);
+
         class_unregister_type(LUSTRE_OSS_NAME);
 }