+static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req)
+{
+ struct ptlrpc_service *svc = req->rq_rqbd->rqbd_service;
+ struct ptlrpc_request *reqcopy;
+ struct lustre_msg *reqmsg;
+ cfs_duration_t olddl = req->rq_deadline - cfs_time_current_sec();
+ time_t newdl;
+ int rc;
+ ENTRY;
+
+ /* deadline is when the client expects us to reply, margin is the
+ difference between clients' and servers' expectations */
+ DEBUG_REQ(D_ADAPTTO, req,
+ "%ssending early reply (deadline %+lds, margin %+lds) for "
+ "%d+%d", AT_OFF ? "AT off - not " : "",
+ olddl, olddl - at_get(&svc->srv_at_estimate),
+ at_get(&svc->srv_at_estimate), at_extra);
+
+ if (AT_OFF)
+ RETURN(0);
+
+ if (olddl < 0) {
+ DEBUG_REQ(D_WARNING, req, "Already past deadline (%+lds), "
+ "not sending early reply. Consider increasing "
+ "at_early_margin (%d)?", olddl, at_early_margin);
+
+ /* Return an error so we're not re-added to the timed list. */
+ RETURN(-ETIMEDOUT);
+ }
+
+ if ((lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) == 0){
+ DEBUG_REQ(D_INFO, req, "Wanted to ask client for more time, "
+ "but no AT support");
+ RETURN(-ENOSYS);
+ }
+
+ if (req->rq_export &&
+ lustre_msg_get_flags(req->rq_reqmsg) &
+ (MSG_REPLAY | MSG_REQ_REPLAY_DONE | MSG_LOCK_REPLAY_DONE)) {
+ /* During recovery, we don't want to send too many early
+ * replies, but on the other hand we want to make sure the
+ * client has enough time to resend if the rpc is lost. So
+ * during the recovery period send at least 4 early replies,
+ * spacing them every at_extra if we can. at_estimate should
+ * always equal this fixed value during recovery. */
+ at_measured(&svc->srv_at_estimate, min(at_extra,
+ req->rq_export->exp_obd->obd_recovery_timeout / 4));
+ } else {
+ /* Fake our processing time into the future to ask the clients
+ * for some extra amount of time */
+ at_measured(&svc->srv_at_estimate, at_extra +
+ cfs_time_current_sec() -
+ req->rq_arrival_time.tv_sec);
+
+ /* Check to see if we've actually increased the deadline -
+ * we may be past adaptive_max */
+ if (req->rq_deadline >= req->rq_arrival_time.tv_sec +
+ at_get(&svc->srv_at_estimate)) {
+ DEBUG_REQ(D_WARNING, req, "Couldn't add any time "
+ "(%ld/%ld), not sending early reply\n",
+ olddl, req->rq_arrival_time.tv_sec +
+ at_get(&svc->srv_at_estimate) -
+ cfs_time_current_sec());
+ RETURN(-ETIMEDOUT);
+ }
+ }
+ newdl = cfs_time_current_sec() + at_get(&svc->srv_at_estimate);
+
+ OBD_ALLOC(reqcopy, sizeof *reqcopy);
+ if (reqcopy == NULL)
+ RETURN(-ENOMEM);
+ OBD_ALLOC(reqmsg, req->rq_reqlen);
+ if (!reqmsg) {
+ OBD_FREE(reqcopy, sizeof *reqcopy);
+ RETURN(-ENOMEM);
+ }
+
+ *reqcopy = *req;
+ reqcopy->rq_reply_state = NULL;
+ reqcopy->rq_rep_swab_mask = 0;
+ reqcopy->rq_pack_bulk = 0;
+ reqcopy->rq_pack_udesc = 0;
+ reqcopy->rq_packed_final = 0;
+ sptlrpc_svc_ctx_addref(reqcopy);
+ /* We only need the reqmsg for the magic */
+ reqcopy->rq_reqmsg = reqmsg;
+ memcpy(reqmsg, req->rq_reqmsg, req->rq_reqlen);
+
+ LASSERT(cfs_atomic_read(&req->rq_refcount));
+ /** if it is last refcount then early reply isn't needed */
+ if (cfs_atomic_read(&req->rq_refcount) == 1) {
+ DEBUG_REQ(D_ADAPTTO, reqcopy, "Normal reply already sent out, "
+ "abort sending early reply\n");
+ GOTO(out, rc = -EINVAL);
+ }
+
+ /* Connection ref */
+ reqcopy->rq_export = class_conn2export(
+ lustre_msg_get_handle(reqcopy->rq_reqmsg));
+ if (reqcopy->rq_export == NULL)
+ GOTO(out, rc = -ENODEV);
+
+ /* RPC ref */
+ class_export_rpc_get(reqcopy->rq_export);
+ if (reqcopy->rq_export->exp_obd &&
+ reqcopy->rq_export->exp_obd->obd_fail)
+ GOTO(out_put, rc = -ENODEV);
+
+ rc = lustre_pack_reply_flags(reqcopy, 1, NULL, NULL, LPRFL_EARLY_REPLY);
+ if (rc)
+ GOTO(out_put, rc);
+
+ rc = ptlrpc_send_reply(reqcopy, PTLRPC_REPLY_EARLY);
+
+ if (!rc) {
+ /* Adjust our own deadline to what we told the client */
+ req->rq_deadline = newdl;
+ req->rq_early_count++; /* number sent, server side */
+ } else {
+ DEBUG_REQ(D_ERROR, req, "Early reply send failed %d", rc);
+ }
+
+ /* Free the (early) reply state from lustre_pack_reply.
+ (ptlrpc_send_reply takes it's own rs ref, so this is safe here) */
+ ptlrpc_req_drop_rs(reqcopy);
+
+out_put:
+ class_export_rpc_put(reqcopy->rq_export);
+ class_export_put(reqcopy->rq_export);
+out:
+ sptlrpc_svc_ctx_decref(reqcopy);
+ OBD_FREE(reqmsg, req->rq_reqlen);
+ OBD_FREE(reqcopy, sizeof *reqcopy);
+ RETURN(rc);
+}
+
+/* Send early replies to everybody expiring within at_early_margin
+ asking for at_extra time */
+static int ptlrpc_at_check_timed(struct ptlrpc_service *svc)
+{
+ struct ptlrpc_request *rq, *n;
+ cfs_list_t work_list;
+ struct ptlrpc_at_array *array = &svc->srv_at_array;
+ __u32 index, count;
+ time_t deadline;
+ time_t now = cfs_time_current_sec();
+ cfs_duration_t delay;
+ int first, counter = 0;
+ ENTRY;
+
+ cfs_spin_lock(&svc->srv_at_lock);
+ if (svc->srv_at_check == 0) {
+ cfs_spin_unlock(&svc->srv_at_lock);
+ RETURN(0);
+ }
+ delay = cfs_time_sub(cfs_time_current(), svc->srv_at_checktime);
+ svc->srv_at_check = 0;
+
+ if (array->paa_count == 0) {
+ cfs_spin_unlock(&svc->srv_at_lock);
+ RETURN(0);
+ }
+
+ /* The timer went off, but maybe the nearest rpc already completed. */
+ first = array->paa_deadline - now;
+ if (first > at_early_margin) {
+ /* We've still got plenty of time. Reset the timer. */
+ cfs_spin_unlock(&svc->srv_at_lock);
+ ptlrpc_at_set_timer(svc);
+ RETURN(0);
+ }
+
+ /* We're close to a timeout, and we don't know how much longer the
+ server will take. Send early replies to everyone expiring soon. */
+ CFS_INIT_LIST_HEAD(&work_list);
+ deadline = -1;
+ index = (unsigned long)array->paa_deadline % array->paa_size;
+ count = array->paa_count;
+ while (count > 0) {
+ count -= array->paa_reqs_count[index];
+ cfs_list_for_each_entry_safe(rq, n,
+ &array->paa_reqs_array[index],
+ rq_timed_list) {
+ if (rq->rq_deadline <= now + at_early_margin) {
+ cfs_list_del_init(&rq->rq_timed_list);
+ /**
+ * ptlrpc_server_drop_request() may drop
+ * refcount to 0 already. Let's check this and
+ * don't add entry to work_list
+ */
+ if (likely(cfs_atomic_inc_not_zero(&rq->rq_refcount)))
+ cfs_list_add(&rq->rq_timed_list, &work_list);
+ counter++;
+ array->paa_reqs_count[index]--;
+ array->paa_count--;
+ cfs_spin_lock(&rq->rq_lock);
+ rq->rq_at_linked = 0;
+ cfs_spin_unlock(&rq->rq_lock);
+ continue;
+ }
+
+ /* update the earliest deadline */
+ if (deadline == -1 || rq->rq_deadline < deadline)
+ deadline = rq->rq_deadline;
+
+ break;
+ }
+
+ if (++index >= array->paa_size)
+ index = 0;
+ }
+ array->paa_deadline = deadline;
+ cfs_spin_unlock(&svc->srv_at_lock);
+
+ /* we have a new earliest deadline, restart the timer */
+ ptlrpc_at_set_timer(svc);
+
+ CDEBUG(D_ADAPTTO, "timeout in %+ds, asking for %d secs on %d early "
+ "replies\n", first, at_extra, counter);
+ if (first < 0) {
+ /* We're already past request deadlines before we even get a
+ chance to send early replies */
+ LCONSOLE_WARN("%s: This server is not able to keep up with "
+ "request traffic (cpu-bound).\n", svc->srv_name);
+ CWARN("earlyQ=%d reqQ=%d recA=%d, svcEst=%d, "
+ "delay="CFS_DURATION_T"(jiff)\n",
+ counter, svc->srv_n_queued_reqs, svc->srv_n_active_reqs,
+ at_get(&svc->srv_at_estimate), delay);
+ }
+
+ /* we took additional refcount so entries can't be deleted from list, no
+ * locking is needed */
+ while (!cfs_list_empty(&work_list)) {
+ rq = cfs_list_entry(work_list.next, struct ptlrpc_request,
+ rq_timed_list);
+ cfs_list_del_init(&rq->rq_timed_list);
+
+ if (ptlrpc_at_send_early_reply(rq) == 0)
+ ptlrpc_at_add_timed(rq);
+
+ ptlrpc_server_drop_request(rq);
+ }
+
+ RETURN(0);
+}
+
+/**
+ * Put the request to the export list if the request may become
+ * a high priority one.
+ */
+static int ptlrpc_hpreq_init(struct ptlrpc_service *svc,
+ struct ptlrpc_request *req)
+{
+ int rc;
+ ENTRY;
+
+ if (svc->srv_hpreq_handler) {
+ rc = svc->srv_hpreq_handler(req);
+ if (rc)
+ RETURN(rc);
+ }
+ if (req->rq_export && req->rq_ops) {
+ cfs_spin_lock_bh(&req->rq_export->exp_rpc_lock);
+ cfs_list_add(&req->rq_exp_list,
+ &req->rq_export->exp_queued_rpc);
+ cfs_spin_unlock_bh(&req->rq_export->exp_rpc_lock);
+ }
+
+ RETURN(0);
+}
+
+/** Remove the request from the export list. */
+static void ptlrpc_hpreq_fini(struct ptlrpc_request *req)
+{
+ ENTRY;
+ if (req->rq_export && req->rq_ops) {
+ cfs_spin_lock_bh(&req->rq_export->exp_rpc_lock);
+ cfs_list_del_init(&req->rq_exp_list);
+ cfs_spin_unlock_bh(&req->rq_export->exp_rpc_lock);
+ }
+ EXIT;
+}
+
+/**
+ * Make the request a high priority one.
+ *
+ * All the high priority requests are queued in a separate FIFO
+ * ptlrpc_service::srv_request_hpq list which is parallel to
+ * ptlrpc_service::srv_request_queue list but has a higher priority
+ * for handling.
+ *
+ * \see ptlrpc_server_handle_request().
+ */
+static void ptlrpc_hpreq_reorder_nolock(struct ptlrpc_service *svc,
+ struct ptlrpc_request *req)
+{
+ ENTRY;
+ LASSERT(svc != NULL);
+ cfs_spin_lock(&req->rq_lock);
+ if (req->rq_hp == 0) {
+ int opc = lustre_msg_get_opc(req->rq_reqmsg);
+
+ /* Add to the high priority queue. */
+ cfs_list_move_tail(&req->rq_list, &svc->srv_request_hpq);
+ req->rq_hp = 1;
+ if (opc != OBD_PING)
+ DEBUG_REQ(D_NET, req, "high priority req");
+ }
+ cfs_spin_unlock(&req->rq_lock);
+ EXIT;
+}
+
+/**
+ * \see ptlrpc_hpreq_reorder_nolock
+ */
+void ptlrpc_hpreq_reorder(struct ptlrpc_request *req)
+{
+ struct ptlrpc_service *svc = req->rq_rqbd->rqbd_service;
+ ENTRY;
+
+ cfs_spin_lock(&svc->srv_rq_lock);
+ /* It may happen that the request is already taken for the processing
+ * but still in the export list, do not re-add it into the HP list. */
+ if (req->rq_phase == RQ_PHASE_NEW)
+ ptlrpc_hpreq_reorder_nolock(svc, req);
+ cfs_spin_unlock(&svc->srv_rq_lock);
+ EXIT;
+}
+
+/** Check if the request is a high priority one. */
+static int ptlrpc_server_hpreq_check(struct ptlrpc_request *req)
+{
+ int opc, rc = 0;
+ ENTRY;
+
+ /* Check by request opc. */
+ opc = lustre_msg_get_opc(req->rq_reqmsg);
+ if (opc == OBD_PING)
+ RETURN(1);
+
+ /* Perform request specific check. */
+ if (req->rq_ops && req->rq_ops->hpreq_check)
+ rc = req->rq_ops->hpreq_check(req);
+ RETURN(rc);
+}
+
+/** Check if a request is a high priority one. */
+static int ptlrpc_server_request_add(struct ptlrpc_service *svc,
+ struct ptlrpc_request *req)
+{
+ int rc;
+ ENTRY;
+
+ rc = ptlrpc_server_hpreq_check(req);
+ if (rc < 0)
+ RETURN(rc);
+
+ cfs_spin_lock(&svc->srv_rq_lock);
+ /* Before inserting the request into the queue, check if it is not
+ * inserted yet, or even already handled -- it may happen due to
+ * a racing ldlm_server_blocking_ast(). */
+ if (req->rq_phase == RQ_PHASE_NEW && cfs_list_empty(&req->rq_list)) {
+ if (rc)
+ ptlrpc_hpreq_reorder_nolock(svc, req);
+ else
+ cfs_list_add_tail(&req->rq_list,
+ &svc->srv_request_queue);
+ }
+ cfs_spin_unlock(&svc->srv_rq_lock);
+
+ RETURN(0);
+}
+
+/**
+ * Allow to handle high priority request
+ * User can call it w/o any lock but need to hold ptlrpc_service::srv_rq_lock
+ * to get reliable result
+ */
+static int ptlrpc_server_allow_high(struct ptlrpc_service *svc, int force)
+{
+ if (force)
+ return 1;
+
+ if (svc->srv_n_active_reqs >= svc->srv_threads_running - 1)
+ return 0;
+
+ return cfs_list_empty(&svc->srv_request_queue) ||
+ svc->srv_hpreq_count < svc->srv_hpreq_ratio;
+}
+
+static int ptlrpc_server_high_pending(struct ptlrpc_service *svc, int force)
+{
+ return ptlrpc_server_allow_high(svc, force) &&
+ !cfs_list_empty(&svc->srv_request_hpq);
+}
+
+/**
+ * Only allow normal priority requests on a service that has a high-priority
+ * queue if forced (i.e. cleanup), if there are other high priority requests
+ * already being processed (i.e. those threads can service more high-priority
+ * requests), or if there are enough idle threads that a later thread can do
+ * a high priority request.
+ * User can call it w/o any lock but need to hold ptlrpc_service::srv_rq_lock
+ * to get reliable result
+ */
+static int ptlrpc_server_allow_normal(struct ptlrpc_service *svc, int force)
+{
+ if (force ||
+ svc->srv_n_active_reqs < svc->srv_threads_running - 2)
+ return 1;
+
+ if (svc->srv_n_active_reqs >= svc->srv_threads_running - 1)
+ return 0;
+
+ return svc->srv_n_active_hpreq > 0 || svc->srv_hpreq_handler == NULL;
+}
+
+static int ptlrpc_server_normal_pending(struct ptlrpc_service *svc, int force)
+{
+ return ptlrpc_server_allow_normal(svc, force) &&
+ !cfs_list_empty(&svc->srv_request_queue);
+}
+
+/**
+ * Returns true if there are requests available in incoming
+ * request queue for processing and it is allowed to fetch them.
+ * User can call it w/o any lock but need to hold ptlrpc_service::srv_rq_lock
+ * to get reliable result
+ * \see ptlrpc_server_allow_normal
+ * \see ptlrpc_server_allow high
+ */
+static inline int
+ptlrpc_server_request_pending(struct ptlrpc_service *svc, int force)
+{
+ return ptlrpc_server_high_pending(svc, force) ||
+ ptlrpc_server_normal_pending(svc, force);
+}
+
+/**
+ * Fetch a request for processing from queue of unprocessed requests.
+ * Favors high-priority requests.
+ * Returns a pointer to fetched request.
+ */
+static struct ptlrpc_request *
+ptlrpc_server_request_get(struct ptlrpc_service *svc, int force)
+{
+ struct ptlrpc_request *req;
+ ENTRY;
+
+ if (ptlrpc_server_high_pending(svc, force)) {
+ req = cfs_list_entry(svc->srv_request_hpq.next,
+ struct ptlrpc_request, rq_list);
+ svc->srv_hpreq_count++;
+ RETURN(req);
+
+ }
+
+ if (ptlrpc_server_normal_pending(svc, force)) {
+ req = cfs_list_entry(svc->srv_request_queue.next,
+ struct ptlrpc_request, rq_list);
+ svc->srv_hpreq_count = 0;
+ RETURN(req);
+ }
+ RETURN(NULL);
+}
+
+/**
+ * Handle freshly incoming reqs, add to timed early reply list,
+ * pass on to regular request queue.
+ * All incoming requests pass through here before getting into
+ * ptlrpc_server_handle_req later on.
+ */
+static int
+ptlrpc_server_handle_req_in(struct ptlrpc_service *svc)
+{
+ struct ptlrpc_request *req;
+ __u32 deadline;
+ int rc;
+ ENTRY;
+
+ LASSERT(svc);
+
+ cfs_spin_lock(&svc->srv_lock);
+ if (cfs_list_empty(&svc->srv_req_in_queue)) {
+ cfs_spin_unlock(&svc->srv_lock);
+ RETURN(0);
+ }
+
+ req = cfs_list_entry(svc->srv_req_in_queue.next,
+ struct ptlrpc_request, rq_list);
+ cfs_list_del_init (&req->rq_list);
+ svc->srv_n_queued_reqs--;
+ /* Consider this still a "queued" request as far as stats are
+ concerned */
+ cfs_spin_unlock(&svc->srv_lock);
+
+ /* go through security check/transform */
+ rc = sptlrpc_svc_unwrap_request(req);
+ switch (rc) {
+ case SECSVC_OK:
+ break;
+ case SECSVC_COMPLETE:
+ target_send_reply(req, 0, OBD_FAIL_MDS_ALL_REPLY_NET);
+ goto err_req;
+ case SECSVC_DROP:
+ goto err_req;
+ default:
+ LBUG();
+ }
+
+ /*
+ * for null-flavored rpc, msg has been unpacked by sptlrpc, although
+ * redo it wouldn't be harmful.
+ */
+ if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL) {
+ rc = ptlrpc_unpack_req_msg(req, req->rq_reqlen);
+ if (rc != 0) {
+ CERROR("error unpacking request: ptl %d from %s "
+ "x"LPU64"\n", svc->srv_req_portal,
+ libcfs_id2str(req->rq_peer), req->rq_xid);
+ goto err_req;
+ }
+ }
+
+ rc = lustre_unpack_req_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF);
+ if (rc) {
+ CERROR ("error unpacking ptlrpc body: ptl %d from %s x"
+ LPU64"\n", svc->srv_req_portal,
+ libcfs_id2str(req->rq_peer), req->rq_xid);
+ goto err_req;
+ }
+
+ if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_REQ_OPC) &&
+ lustre_msg_get_opc(req->rq_reqmsg) == obd_fail_val) {
+ CERROR("drop incoming rpc opc %u, x"LPU64"\n",
+ obd_fail_val, req->rq_xid);
+ goto err_req;
+ }
+
+ rc = -EINVAL;
+ if (lustre_msg_get_type(req->rq_reqmsg) != PTL_RPC_MSG_REQUEST) {
+ CERROR("wrong packet type received (type=%u) from %s\n",
+ lustre_msg_get_type(req->rq_reqmsg),
+ libcfs_id2str(req->rq_peer));
+ goto err_req;
+ }
+
+ switch(lustre_msg_get_opc(req->rq_reqmsg)) {
+ case MDS_WRITEPAGE:
+ case OST_WRITE:
+ req->rq_bulk_write = 1;
+ break;
+ case MDS_READPAGE:
+ case OST_READ:
+ req->rq_bulk_read = 1;
+ break;
+ }
+
+ CDEBUG(D_NET, "got req "LPU64"\n", req->rq_xid);
+
+ req->rq_export = class_conn2export(
+ lustre_msg_get_handle(req->rq_reqmsg));
+ if (req->rq_export) {
+ rc = ptlrpc_check_req(req);
+ if (rc == 0) {
+ rc = sptlrpc_target_export_check(req->rq_export, req);
+ if (rc)
+ DEBUG_REQ(D_ERROR, req, "DROPPING req with "
+ "illegal security flavor,");
+ }
+
+ if (rc)
+ goto err_req;
+ ptlrpc_update_export_timer(req->rq_export, 0);
+ }
+
+ /* req_in handling should/must be fast */
+ if (cfs_time_current_sec() - req->rq_arrival_time.tv_sec > 5)
+ DEBUG_REQ(D_WARNING, req, "Slow req_in handling "CFS_DURATION_T"s",
+ cfs_time_sub(cfs_time_current_sec(),
+ req->rq_arrival_time.tv_sec));
+
+ /* Set rpc server deadline and add it to the timed list */
+ deadline = (lustre_msghdr_get_flags(req->rq_reqmsg) &
+ MSGHDR_AT_SUPPORT) ?
+ /* The max time the client expects us to take */
+ lustre_msg_get_timeout(req->rq_reqmsg) : obd_timeout;
+ req->rq_deadline = req->rq_arrival_time.tv_sec + deadline;
+ if (unlikely(deadline == 0)) {
+ DEBUG_REQ(D_ERROR, req, "Dropping request with 0 timeout");
+ goto err_req;
+ }
+
+ ptlrpc_at_add_timed(req);
+ rc = ptlrpc_hpreq_init(svc, req);
+ if (rc)
+ GOTO(err_req, rc);
+
+ /* Move it over to the request processing queue */
+ rc = ptlrpc_server_request_add(svc, req);
+ if (rc)
+ GOTO(err_req, rc);
+ cfs_waitq_signal(&svc->srv_waitq);
+ RETURN(1);
+
+err_req:
+ cfs_spin_lock(&svc->srv_rq_lock);
+ svc->srv_n_active_reqs++;
+ cfs_spin_unlock(&svc->srv_rq_lock);
+ ptlrpc_server_finish_request(svc, req);
+
+ RETURN(1);