LU-12838 ptlrpc: fix watchdog ratelimit logic
diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c
index e8e887c..987982a 100644
--- a/lustre/ptlrpc/service.c
+++ b/lustre/ptlrpc/service.c
@@ -41,6 +41,7 @@
 #include <lu_object.h>
 #include <uapi/linux/lnet/lnet-types.h>
 #include "ptlrpc_internal.h"
+#include <linux/delay.h>
 
 /* The following are visible and mutable through /sys/module/ptlrpc */
 int test_req_buffer_pressure = 0;
@@ -449,11 +450,10 @@ static int ptlrpc_server_post_idle_rqbds(struct ptlrpc_service_part *svcpt)
                rqbd = list_entry(svcpt->scp_rqbd_idle.next,
                                      struct ptlrpc_request_buffer_desc,
                                      rqbd_list);
-               list_del(&rqbd->rqbd_list);
 
                /* assume we will post successfully */
                svcpt->scp_nrqbds_posted++;
-               list_add(&rqbd->rqbd_list, &svcpt->scp_rqbd_posted);
+               list_move(&rqbd->rqbd_list, &svcpt->scp_rqbd_posted);
 
                spin_unlock(&svcpt->scp_lock);
 
@@ -467,8 +467,7 @@ static int ptlrpc_server_post_idle_rqbds(struct ptlrpc_service_part *svcpt)
        spin_lock(&svcpt->scp_lock);
 
        svcpt->scp_nrqbds_posted--;
-       list_del(&rqbd->rqbd_list);
-       list_add_tail(&rqbd->rqbd_list, &svcpt->scp_rqbd_idle);
+       list_move_tail(&rqbd->rqbd_list, &svcpt->scp_rqbd_idle);
 
        /*
         * Don't complain if no request buffers are posted right now; LNET
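
The list_del()/list_add() pairs collapsed in the hunks above (and in two more hunks below) rely on the standard <linux/list.h> helpers; list_move() and list_move_tail() are, in essence:

	static inline void list_move(struct list_head *list, struct list_head *head)
	{
		__list_del_entry(list);
		list_add(list, head);
	}

	static inline void list_move_tail(struct list_head *list, struct list_head *head)
	{
		__list_del_entry(list);
		list_add_tail(list, head);
	}

so each replacement is a pure cleanup with identical behavior.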
@@ -944,9 +943,7 @@ void ptlrpc_server_drop_request(struct ptlrpc_request *req)
        refcount = --(rqbd->rqbd_refcount);
        if (refcount == 0) {
                /* request buffer is now idle: add to history */
-               list_del(&rqbd->rqbd_list);
-
-               list_add_tail(&rqbd->rqbd_list, &svcpt->scp_hist_rqbds);
+               list_move_tail(&rqbd->rqbd_list, &svcpt->scp_hist_rqbds);
                svcpt->scp_hist_nrqbds++;
 
                /*
@@ -1028,6 +1025,30 @@ void ptlrpc_server_drop_request(struct ptlrpc_request *req)
        }
 }
 
+static void ptlrpc_add_exp_list_nolock(struct ptlrpc_request *req,
+                                      struct obd_export *export, bool hp)
+{
+       __u16 tag = lustre_msg_get_tag(req->rq_reqmsg);
+
+       if (hp)
+               list_add(&req->rq_exp_list, &export->exp_hp_rpcs);
+       else
+               list_add(&req->rq_exp_list, &export->exp_reg_rpcs);
+       if (tag && export->exp_used_slots)
+               set_bit(tag - 1, export->exp_used_slots);
+}
+
+static void ptlrpc_del_exp_list(struct ptlrpc_request *req)
+{
+       __u16 tag = lustre_msg_get_tag(req->rq_reqmsg);
+
+       spin_lock(&req->rq_export->exp_rpc_lock);
+       list_del_init(&req->rq_exp_list);
+       if (tag && !req->rq_obsolete && req->rq_export->exp_used_slots)
+               clear_bit(tag - 1, req->rq_export->exp_used_slots);
+       spin_unlock(&req->rq_export->exp_rpc_lock);
+}
+
 /** Change request export and move hp request from old export to new */
 void ptlrpc_request_change_export(struct ptlrpc_request *req,
                                  struct obd_export *export)
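
The two helpers above centralize a pattern used in several places below: each client RPC may carry a tag (a 1-based slot number; 0 means untagged), and the export tracks busy slots in the exp_used_slots bitmap, with tag N occupying bit N-1. A minimal standalone sketch of the slot bookkeeping, with the bitmap size chosen arbitrarily for illustration:

	#include <linux/bitmap.h>

	#define DEMO_MAX_SLOTS 64	/* illustrative; the real size follows
					 * the client's max requests in flight */
	static DECLARE_BITMAP(demo_used_slots, DEMO_MAX_SLOTS);

	static void demo_slot_acquire(__u16 tag)
	{
		if (tag)			/* tag 0 == no slot assigned */
			set_bit(tag - 1, demo_used_slots);
	}

	static void demo_slot_release(__u16 tag, bool obsolete)
	{
		/* an obsoleted request must not clear the bit: a newer
		 * resend with the same tag still owns the slot */
		if (tag && !obsolete)
			clear_bit(tag - 1, demo_used_slots);
	}

This is why ptlrpc_del_exp_list() checks rq_obsolete before clearing the bit, and why the -EBUSY path later in this patch sets rq_obsolete = 1.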
@@ -1035,19 +1056,13 @@ void ptlrpc_request_change_export(struct ptlrpc_request *req,
        if (req->rq_export != NULL) {
                LASSERT(!list_empty(&req->rq_exp_list));
                /* remove rq_exp_list from last export */
-               spin_lock(&req->rq_export->exp_rpc_lock);
-               list_del_init(&req->rq_exp_list);
-               spin_unlock(&req->rq_export->exp_rpc_lock);
-               /*
-                * export has one reference already, so it`s safe to
+               ptlrpc_del_exp_list(req);
+               /* export has one reference already, so it's safe to
                 * add req to export queue here and get another
                 * reference for request later
                 */
                spin_lock(&export->exp_rpc_lock);
-               if (req->rq_ops != NULL) /* hp request */
-                       list_add(&req->rq_exp_list, &export->exp_hp_rpcs);
-               else
-                       list_add(&req->rq_exp_list, &export->exp_reg_rpcs);
+               ptlrpc_add_exp_list_nolock(req, export, req->rq_ops != NULL);
                spin_unlock(&export->exp_rpc_lock);
 
                class_export_rpc_dec(req->rq_export);
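
The refactored ptlrpc_request_change_export() keeps the original locking shape: unlink under the old export's exp_rpc_lock, then relink under the new export's lock, so rq_exp_list is never on two lists at once and each list is only touched under its own lock. Schematically (a sketch with illustrative names, not the verbatim code):

	/* 1: detach from the old export under its own lock */
	spin_lock(&old_exp->exp_rpc_lock);
	list_del_init(&req->rq_exp_list);
	spin_unlock(&old_exp->exp_rpc_lock);

	/* 2: attach to the new export under the new lock; the caller's
	 * existing reference on new_exp keeps it alive in between */
	spin_lock(&new_exp->exp_rpc_lock);
	list_add(&req->rq_exp_list, is_hp ? &new_exp->exp_hp_rpcs
					  : &new_exp->exp_reg_rpcs);
	spin_unlock(&new_exp->exp_rpc_lock);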
@@ -1363,6 +1378,7 @@ static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req)
                RETURN(0);
 
        if (olddl < 0) {
+               /* below message is checked in replay-ost-single.sh test_9 */
                DEBUG_REQ(D_WARNING, req,
                          "Already past deadline (%+llds), not sending early reply. Consider increasing at_early_margin (%d)?",
                          (s64)olddl, at_early_margin);
@@ -1421,7 +1437,8 @@ static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req)
         * we may be past adaptive_max
         */
        if (req->rq_deadline >= newdl) {
-               DEBUG_REQ(D_WARNING, req, "Couldn't add any time (%lld/%lld), not sending early reply\n",
+               DEBUG_REQ(D_WARNING, req,
+                         "Could not add any time (%lld/%lld), not sending early reply",
                          (s64)olddl, (s64)(newdl - ktime_get_real_seconds()));
                RETURN(-ETIMEDOUT);
        }
@@ -1453,10 +1470,10 @@ static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req)
                GOTO(out, rc = -ETIMEDOUT);
 
        LASSERT(atomic_read(&req->rq_refcount));
-       /** if it is last refcount then early reply isn't needed */
+       /* if it is last refcount then early reply isn't needed */
        if (atomic_read(&req->rq_refcount) == 1) {
                DEBUG_REQ(D_ADAPTTO, reqcopy,
-                         "Normal reply already sent out, abort sending early reply\n");
+                         "Normal reply already sent, abort early reply");
                GOTO(out, rc = -EINVAL);
        }
 
@@ -1483,7 +1500,7 @@ static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req)
                req->rq_deadline = newdl;
                req->rq_early_count++; /* number sent, server side */
        } else {
-               DEBUG_REQ(D_ERROR, req, "Early reply send failed %d", rc);
+               DEBUG_REQ(D_ERROR, req, "Early reply send failed: rc = %d", rc);
        }
 
        /*
@@ -1666,6 +1683,47 @@ found:
        return tmp;
 }
 
+#ifdef HAVE_SERVER_SUPPORT
+static void ptlrpc_server_mark_obsolete(struct ptlrpc_request *req)
+{
+       req->rq_obsolete = 1;
+}
+
+static void
+ptlrpc_server_mark_in_progress_obsolete(struct ptlrpc_request *req)
+{
+       struct ptlrpc_request *tmp = NULL;
+       __u16 tag;
+
+       if (!tgt_is_increasing_xid_client(req->rq_export) ||
+           req->rq_export->exp_used_slots == NULL)
+               return;
+
+       tag = lustre_msg_get_tag(req->rq_reqmsg);
+       if (tag == 0)
+               return;
+
+       if (!test_bit(tag - 1, req->rq_export->exp_used_slots))
+               return;
+
+       /* This list should not be longer than max_requests in
+        * flight on the client, so it is not all that long.
+        * Also we only hit this codepath for a resent request,
+        * which makes it even more rarely hit */
+       list_for_each_entry(tmp, &req->rq_export->exp_reg_rpcs, rq_exp_list) {
+               if (tag == lustre_msg_get_tag(tmp->rq_reqmsg) &&
+                   req->rq_xid > tmp->rq_xid)
+                       ptlrpc_server_mark_obsolete(tmp);
+       }
+       list_for_each_entry(tmp, &req->rq_export->exp_hp_rpcs, rq_exp_list) {
+               if (tag == lustre_msg_get_tag(tmp->rq_reqmsg) &&
+                   req->rq_xid > tmp->rq_xid)
+                       ptlrpc_server_mark_obsolete(tmp);
+       }
+}
+#endif
+
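
The scan above works because, for clients that advertise increasing XIDs (tgt_is_increasing_xid_client()), the XID orders incarnations of a slot: when a resend arrives holding a tag whose bit is still set, any queued request with the same tag and a smaller XID is an earlier incarnation and can safely be flagged obsolete. The core test, reduced to a standalone sketch (names hypothetical):

	struct demo_req {
		__u64 xid;
		__u16 tag;
		unsigned int obsolete:1;
	};

	/* flag every earlier in-flight request occupying new_req's slot */
	static void demo_mark_obsolete(struct demo_req *reqs, int nr,
				       const struct demo_req *new_req)
	{
		int i;

		for (i = 0; i < nr; i++)
			if (reqs[i].tag == new_req->tag &&
			    new_req->xid > reqs[i].xid)
				reqs[i].obsolete = 1;
	}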
 /**
  * Check if a request should be assigned with a high priority.
  *
@@ -1723,9 +1781,7 @@ static void ptlrpc_server_hpreq_fini(struct ptlrpc_request *req)
                if (req->rq_ops && req->rq_ops->hpreq_fini)
                        req->rq_ops->hpreq_fini(req);
 
-               spin_lock(&req->rq_export->exp_rpc_lock);
-               list_del_init(&req->rq_exp_list);
-               spin_unlock(&req->rq_export->exp_rpc_lock);
+               ptlrpc_del_exp_list(req);
        }
        EXIT;
 }
@@ -1772,7 +1828,7 @@ static int ptlrpc_server_request_add(struct ptlrpc_service_part *svcpt,
        hp = rc > 0;
        ptlrpc_nrs_req_initialize(svcpt, req, hp);
 
-       if (req->rq_export != NULL) {
+       while (req->rq_export != NULL) {
                struct obd_export *exp = req->rq_export;
 
                /*
@@ -1780,7 +1836,18 @@ static int ptlrpc_server_request_add(struct ptlrpc_service_part *svcpt,
                 * atomically
                 */
                spin_lock_bh(&exp->exp_rpc_lock);
+#ifdef HAVE_SERVER_SUPPORT
+               ptlrpc_server_mark_in_progress_obsolete(req);
+#endif
                orig = ptlrpc_server_check_resend_in_progress(req);
+               if (orig && OBD_FAIL_PRECHECK(OBD_FAIL_PTLRPC_RESEND_RACE)) {
+                       spin_unlock_bh(&exp->exp_rpc_lock);
+
+                       OBD_RACE(OBD_FAIL_PTLRPC_RESEND_RACE);
+                       msleep(4 * MSEC_PER_SEC);
+                       continue;
+               }
+
                if (orig && likely(atomic_inc_not_zero(&orig->rq_refcount))) {
                        bool linked;
 
@@ -1803,14 +1870,17 @@ static int ptlrpc_server_request_add(struct ptlrpc_service_part *svcpt,
                                ptlrpc_at_add_timed(orig);
                        ptlrpc_server_drop_request(orig);
                        ptlrpc_nrs_req_finalize(req);
+
+                       /* don't mark slot unused for resend in progress */
+                       req->rq_obsolete = 1;
+
                        RETURN(-EBUSY);
                }
 
-               if (hp || req->rq_ops != NULL)
-                       list_add(&req->rq_exp_list, &exp->exp_hp_rpcs);
-               else
-                       list_add(&req->rq_exp_list, &exp->exp_reg_rpcs);
+               ptlrpc_add_exp_list_nolock(req, exp, hp || req->rq_ops != NULL);
+
                spin_unlock_bh(&exp->exp_rpc_lock);
+               break;
        }
 
        /*
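
The if-to-while conversion above exists only so the OBD_FAIL_PTLRPC_RESEND_RACE injection point can drop exp_rpc_lock, sleep to widen the race window, and retry the resend check from the top; in normal operation the unconditional break makes the body run exactly once. The control flow, stripped to a skeleton:

	while (req->rq_export != NULL) {
		spin_lock_bh(&exp->exp_rpc_lock);
		orig = ptlrpc_server_check_resend_in_progress(req);
		if (orig && OBD_FAIL_PRECHECK(OBD_FAIL_PTLRPC_RESEND_RACE)) {
			spin_unlock_bh(&exp->exp_rpc_lock);
			msleep(4 * MSEC_PER_SEC);	/* widen race window */
			continue;			/* re-check from the top */
		}
		/* ... resend / normal enqueue handling ... */
		spin_unlock_bh(&exp->exp_rpc_lock);
		break;		/* exactly one pass when no fault is injected */
	}

The msleep() here is why the patch adds #include <linux/delay.h> at the top of the file.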
@@ -2083,7 +2153,7 @@ static int ptlrpc_server_handle_req_in(struct ptlrpc_service_part *svcpt,
                        rc = sptlrpc_target_export_check(req->rq_export, req);
                        if (rc)
                                DEBUG_REQ(D_ERROR, req,
-                                         "DROPPING req with illegal security flavor,");
+                                         "DROPPING req with illegal security flavor");
                }
 
                if (rc)
@@ -2211,7 +2281,8 @@ static int ptlrpc_server_handle_request(struct ptlrpc_service_part *svcpt,
         * The deadline is increased if we send an early reply.
         */
        if (ktime_get_real_seconds() > request->rq_deadline) {
-               DEBUG_REQ(D_ERROR, request, "Dropping timed-out request from %s: deadline %lld:%llds ago\n",
+               DEBUG_REQ(D_ERROR, request,
+                         "Dropping timed-out request from %s: deadline %lld/%llds ago",
                          libcfs_id2str(request->rq_peer),
                          request->rq_deadline -
                          request->rq_arrival_time.tv_sec,
@@ -2229,7 +2300,7 @@ static int ptlrpc_server_handle_request(struct ptlrpc_service_part *svcpt,
               lustre_msg_get_status(request->rq_reqmsg), request->rq_xid,
               libcfs_id2str(request->rq_peer),
               lustre_msg_get_opc(request->rq_reqmsg),
-              lustre_msg_get_jobid(request->rq_reqmsg));
+              lustre_msg_get_jobid(request->rq_reqmsg) ?: "");
 
        if (lustre_msg_get_opc(request->rq_reqmsg) != OBD_PING)
                CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_PAUSE_REQ, cfs_fail_val);
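
lustre_msg_get_jobid() can return NULL when the message carries no jobid, and passing NULL to a "%s" conversion is undefined behavior; the GNU `x ?: y` extension (widely used in kernel code) substitutes an empty string. Schematically, it is equivalent to:

	/* GNU extension: a ?: b == a ? a : b, evaluating a only once */
	const char *jobid = lustre_msg_get_jobid(request->rq_reqmsg);

	printk("... %s ...", jobid ? jobid : "");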
@@ -2250,8 +2321,7 @@ static int ptlrpc_server_handle_request(struct ptlrpc_service_part *svcpt,
 put_conn:
        if (unlikely(ktime_get_real_seconds() > request->rq_deadline)) {
                DEBUG_REQ(D_WARNING, request,
-                         "Request took longer than estimated (%lld:%llds); "
-                         "client may timeout.",
+                         "Request took longer than estimated (%lld/%llds); client may timeout",
                          request->rq_deadline -
                          request->rq_arrival_time.tv_sec,
                          ktime_get_real_seconds() - request->rq_deadline);
@@ -2271,7 +2341,7 @@ put_conn:
               request->rq_xid,
               libcfs_id2str(request->rq_peer),
               lustre_msg_get_opc(request->rq_reqmsg),
-              lustre_msg_get_jobid(request->rq_reqmsg),
+              lustre_msg_get_jobid(request->rq_reqmsg) ?: "",
               timediff_usecs,
               arrived_usecs,
               (request->rq_repmsg ?
@@ -2586,13 +2656,16 @@ static void ptlrpc_watchdog_fire(struct work_struct *w)
        u64 ms_lapse = ktime_ms_delta(ktime_get(), thread->t_touched);
        u32 ms_frac = do_div(ms_lapse, MSEC_PER_SEC);
 
-       if (!__ratelimit(&watchdog_limit)) {
+       /* ___ratelimit() returns true if the action is NOT ratelimited */
+       if (__ratelimit(&watchdog_limit)) {
+               /* below message is checked in sanity-quota.sh test_6,18 */
                LCONSOLE_WARN("%s: service thread pid %u was inactive for %llu.%03u seconds. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes:\n",
                              thread->t_task->comm, thread->t_task->pid,
                              ms_lapse, ms_frac);
 
                libcfs_debug_dumpstack(thread->t_task);
        } else {
+               /* below message is checked in sanity-quota.sh test_6,18 */
                LCONSOLE_WARN("%s: service thread pid %u was inactive for %llu.%03u seconds. Watchdog stack traces are limited to 3 per %u seconds, skipping this one.\n",
                              thread->t_task->comm, thread->t_task->pid,
                              ms_lapse, ms_frac, libcfs_watchdog_ratelimit);
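
This hunk is the actual LU-12838 fix. __ratelimit() follows the kernel convention of returning nonzero when the event should proceed, so the old `if (!__ratelimit(&watchdog_limit))` dumped the stack trace only after the limit had been exceeded, i.e. the two branches were swapped. Correct usage, as a minimal sketch (interval/burst values illustrative):

	#include <linux/ratelimit.h>

	static DEFINE_RATELIMIT_STATE(demo_limit, 300 * HZ, 3);	/* 3 per 300s */

	static void demo_watchdog_fire(void)
	{
		if (__ratelimit(&demo_limit)) {
			/* under the limit: full warning plus stack dump */
		} else {
			/* over the limit: short "skipping this one" notice */
		}
	}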
@@ -3023,8 +3096,7 @@ static void ptlrpc_svcpt_stop_threads(struct ptlrpc_service_part *svcpt)
                thread = list_entry(svcpt->scp_threads.next,
                                        struct ptlrpc_thread, t_link);
                if (thread_is_stopped(thread)) {
-                       list_del(&thread->t_link);
-                       list_add(&thread->t_link, &zombie);
+                       list_move(&thread->t_link, &zombie);
                        continue;
                }
                spin_unlock(&svcpt->scp_lock);