Whamcloud - gitweb
LU-6142 lustre: ptlrpc: don't use list_for_each_entry_safe unnecessarily.
[fs/lustre-release.git] / lustre / ptlrpc / import.c
index 321385c..beb67b3 100644 (file)
@@ -278,15 +278,12 @@ static time64_t ptlrpc_inflight_deadline(struct ptlrpc_request *req,
 static time64_t ptlrpc_inflight_timeout(struct obd_import *imp)
 {
        time64_t now = ktime_get_real_seconds();
-       struct list_head *tmp, *n;
        struct ptlrpc_request *req;
        time64_t timeout = 0;
 
        spin_lock(&imp->imp_lock);
-       list_for_each_safe(tmp, n, &imp->imp_sending_list) {
-               req = list_entry(tmp, struct ptlrpc_request, rq_list);
+       list_for_each_entry(req, &imp->imp_sending_list, rq_list)
                timeout = max(ptlrpc_inflight_deadline(req, now), timeout);
-       }
        spin_unlock(&imp->imp_lock);
        return timeout;
 }
@@ -299,7 +296,6 @@ static time64_t ptlrpc_inflight_timeout(struct obd_import *imp)
  */
 void ptlrpc_invalidate_import(struct obd_import *imp)
 {
-       struct list_head *tmp, *n;
        struct ptlrpc_request *req;
        time64_t timeout;
        int rc;
@@ -376,19 +372,13 @@ void ptlrpc_invalidate_import(struct obd_import *imp)
                                 * this point. */
                                rc = 1;
                        } else {
-                               list_for_each_safe(tmp, n,
-                                                  &imp->imp_sending_list) {
-                                       req = list_entry(tmp,
-                                                        struct ptlrpc_request,
-                                                        rq_list);
+                               list_for_each_entry(req, &imp->imp_sending_list,
+                                                   rq_list) {
                                        DEBUG_REQ(D_ERROR, req,
                                                  "still on sending list");
                                }
-                               list_for_each_safe(tmp, n,
-                                                  &imp->imp_delayed_list) {
-                                       req = list_entry(tmp,
-                                                        struct ptlrpc_request,
-                                                        rq_list);
+                               list_for_each_entry(req, &imp->imp_delayed_list,
+                                                   rq_list) {
                                        DEBUG_REQ(D_ERROR, req,
                                                  "still on delayed list");
                                }
@@ -466,7 +456,7 @@ void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt)
 
 int ptlrpc_reconnect_import(struct obd_import *imp)
 {
-#ifdef CONFIG_LUSTRE_PINGER
+#ifdef CONFIG_LUSTRE_FS_PINGER
        long timeout_jiffies = cfs_time_seconds(obd_timeout);
        int rc;
 
@@ -579,28 +569,28 @@ static int import_select_connection(struct obd_import *imp)
         */
        if (tried_all && (imp->imp_conn_list.next == &imp_conn->oic_item)) {
                struct adaptive_timeout *at = &imp->imp_at.iat_net_latency;
+
                if (at_get(at) < CONNECTION_SWITCH_MAX) {
                        at_measured(at, at_get(at) + CONNECTION_SWITCH_INC);
                        if (at_get(at) > CONNECTION_SWITCH_MAX)
                                at_reset(at, CONNECTION_SWITCH_MAX);
                }
                LASSERT(imp_conn->oic_last_attempt);
-               CDEBUG(D_HA, "%s: tried all connections, increasing latency "
-                      "to %ds\n", imp->imp_obd->obd_name, at_get(at));
+               CDEBUG(D_HA,
+                      "%s: tried all connections, increasing latency to %ds\n",
+                      imp->imp_obd->obd_name, at_get(at));
        }
 
        imp_conn->oic_last_attempt = ktime_get_seconds();
 
        /* switch connection, don't mind if it's same as the current one */
-       if (imp->imp_connection)
-               ptlrpc_connection_put(imp->imp_connection);
+       ptlrpc_connection_put(imp->imp_connection);
        imp->imp_connection = ptlrpc_connection_addref(imp_conn->oic_conn);
 
        dlmexp = class_conn2export(&imp->imp_dlm_handle);
        if (!dlmexp)
                GOTO(out_unlock, rc = -EINVAL);
-       if (dlmexp->exp_connection)
-               ptlrpc_connection_put(dlmexp->exp_connection);
+       ptlrpc_connection_put(dlmexp->exp_connection);
        dlmexp->exp_connection = ptlrpc_connection_addref(imp_conn->oic_conn);
        class_export_put(dlmexp);
 
@@ -634,14 +624,13 @@ out_unlock:
  */
 static int ptlrpc_first_transno(struct obd_import *imp, __u64 *transno)
 {
-       struct ptlrpc_request   *req;
-       struct list_head        *tmp;
+       struct ptlrpc_request *req;
 
        /* The requests in committed_list always have smaller transnos than
         * the requests in replay_list */
        if (!list_empty(&imp->imp_committed_list)) {
-               tmp = imp->imp_committed_list.next;
-               req = list_entry(tmp, struct ptlrpc_request, rq_replay_list);
+               req = list_first_entry(&imp->imp_committed_list,
+                                      struct ptlrpc_request, rq_replay_list);
                *transno = req->rq_transno;
                if (req->rq_transno == 0) {
                        DEBUG_REQ(D_ERROR, req,
@@ -651,8 +640,8 @@ static int ptlrpc_first_transno(struct obd_import *imp, __u64 *transno)
                return 1;
        }
        if (!list_empty(&imp->imp_replay_list)) {
-               tmp = imp->imp_replay_list.next;
-               req = list_entry(tmp, struct ptlrpc_request, rq_replay_list);
+               req = list_first_entry(&imp->imp_replay_list,
+                                      struct ptlrpc_request, rq_replay_list);
                *transno = req->rq_transno;
                if (req->rq_transno == 0) {
                        DEBUG_REQ(D_ERROR, req, "zero transno in replay_list");
@@ -1046,7 +1035,6 @@ static int ptlrpc_connect_interpret(const struct lu_env *env,
                 * for connecting*/
                imp->imp_force_reconnect = ptlrpc_busy_reconnect(rc);
                spin_unlock(&imp->imp_lock);
-               ptlrpc_maybe_ping_import_soon(imp);
                GOTO(out, rc);
        }
 
@@ -1347,6 +1335,8 @@ out:
 
        if (rc != 0) {
                bool inact = false;
+               time64_t now = ktime_get_seconds();
+               time64_t next_connect;
 
                import_set_state_nolock(imp, LUSTRE_IMP_DISCON);
                if (rc == -EACCES) {
@@ -1390,7 +1380,28 @@ out:
                                import_set_state_nolock(imp, LUSTRE_IMP_CLOSED);
                                inact = true;
                        }
+               } else if (rc == -ENODEV || rc == -ETIMEDOUT) {
+                       /* ENODEV means there is no service, force reconnection
+                        * to a pair if attempt happen ptlrpc_next_reconnect
+                        * before now. ETIMEDOUT could be set during network
+                        * error and do not guarantee request deadline happened.
+                        */
+                       struct obd_import_conn *conn;
+                       time64_t reconnect_time;
+
+                       /* Same as ptlrpc_next_reconnect, but in past */
+                       reconnect_time = now - INITIAL_CONNECT_TIMEOUT;
+                       list_for_each_entry(conn, &imp->imp_conn_list,
+                                           oic_item) {
+                               if (conn->oic_last_attempt <= reconnect_time) {
+                                       imp->imp_force_verify = 1;
+                                       break;
+                               }
+                       }
                }
+
+               next_connect = imp->imp_conn_current->oic_last_attempt +
+                              (request->rq_deadline - request->rq_sent);
                spin_unlock(&imp->imp_lock);
 
                if (inact)
@@ -1399,6 +1410,18 @@ out:
                if (rc == -EPROTO)
                        RETURN(rc);
 
+               /* adjust imp_next_ping to request deadline + 1 and reschedule
+                * a pinger if import lost processing during CONNECTING or far
+                * away from request deadline. It could happen when connection
+                * was initiated outside of pinger, like
+                * ptlrpc_set_import_discon().
+                */
+               if (!imp->imp_force_verify && (imp->imp_next_ping <= now ||
+                   imp->imp_next_ping > next_connect)) {
+                       imp->imp_next_ping = max(now, next_connect) + 1;
+                       ptlrpc_pinger_wake_up();
+               }
+
                ptlrpc_maybe_ping_import_soon(imp);
 
                CDEBUG(D_HA, "recovery of %s on %s failed (%d)\n",
@@ -1453,8 +1476,8 @@ static int signal_completed_replay(struct obd_import *imp)
        if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_FINISH_REPLAY)))
                RETURN(0);
 
-       LASSERT(atomic_read(&imp->imp_replay_inflight) == 0);
-       atomic_inc(&imp->imp_replay_inflight);
+       if (!atomic_add_unless(&imp->imp_replay_inflight, 1, 1))
+               RETURN(0);
 
        req = ptlrpc_request_alloc_pack(imp, &RQF_OBD_PING, LUSTRE_OBD_VERSION,
                                        OBD_PING);
@@ -1485,9 +1508,6 @@ static int ptlrpc_invalidate_import_thread(void *data)
         struct obd_import *imp = data;
 
         ENTRY;
-
-       unshare_fs_struct();
-
         CDEBUG(D_HA, "thread invalidate import %s to %s@%s\n",
                imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd),
                imp->imp_connection->c_remote_uuid.uuid);
@@ -1535,6 +1555,8 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp)
 
         ENTRY;
         if (imp->imp_state == LUSTRE_IMP_EVICTED) {
+               struct task_struct *task;
+
                 deuuidify(obd2cli_tgt(imp->imp_obd), NULL,
                           &target_start, &target_len);
                 /* Don't care about MGC eviction */
@@ -1545,7 +1567,7 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp)
                                           "using this service will fail.\n",
                                           imp->imp_obd->obd_name, target_len,
                                           target_start);
-                       LASSERTF(!obd_lbug_on_eviction, "LBUG upon eviction");
+                       LASSERTF(!obd_lbug_on_eviction, "LBUG upon eviction\n");
                 }
                 CDEBUG(D_HA, "evicted from %s@%s; invalidating\n",
                        obd2cli_tgt(imp->imp_obd),
@@ -1555,24 +1577,22 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp)
                imp->imp_vbr_failed = 0;
                spin_unlock(&imp->imp_lock);
 
-               {
-               struct task_struct *task;
                /* bug 17802:  XXX client_disconnect_export vs connect request
                 * race. if client is evicted at this time then we start
                 * invalidate thread without reference to import and import can
                 * be freed at same time. */
                class_import_get(imp);
                task = kthread_run(ptlrpc_invalidate_import_thread, imp,
-                                    "ll_imp_inval");
+                                  "ll_imp_inval");
                if (IS_ERR(task)) {
                        class_import_put(imp);
-                       CERROR("error starting invalidate thread: %d\n", rc);
                        rc = PTR_ERR(task);
+                       CERROR("%s: can't start invalidate thread: rc = %d\n",
+                              imp->imp_obd->obd_name, rc);
                } else {
                        rc = 0;
                }
                RETURN(rc);
-               }
         }
 
        if (imp->imp_state == LUSTRE_IMP_REPLAY) {
@@ -1612,12 +1632,16 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp)
                        GOTO(out, rc);
                ptlrpc_activate_import(imp, true);
 
-               CDEBUG_LIMIT(imp->imp_was_idle ?
-                               imp->imp_idle_debug : D_CONSOLE,
-                            "%s: Connection restored to %s (at %s)\n",
-                            imp->imp_obd->obd_name,
-                            obd_uuid2str(&conn->c_remote_uuid),
-                            obd_import_nid2str(imp));
+               /* Reverse import are flagged with dlm_fake == 1.
+                * They do not do recovery and connection are not "restored".
+                */
+               if (!imp->imp_dlm_fake)
+                       CDEBUG_LIMIT(imp->imp_was_idle ?
+                                       imp->imp_idle_debug : D_CONSOLE,
+                                    "%s: Connection restored to %s (at %s)\n",
+                                    imp->imp_obd->obd_name,
+                                    obd_uuid2str(&conn->c_remote_uuid),
+                                    obd_import_nid2str(imp));
                spin_lock(&imp->imp_lock);
                imp->imp_was_idle = 0;
                spin_unlock(&imp->imp_lock);
@@ -1786,7 +1810,7 @@ static int ptlrpc_disconnect_idle_interpret(const struct lu_env *env,
                memset(&imp->imp_remote_handle, 0,
                       sizeof(imp->imp_remote_handle));
                /* take our DISCONNECT into account */
-               if (atomic_read(&imp->imp_inflight) > 1) {
+               if (atomic_read(&imp->imp_reqs) > 1) {
                        imp->imp_generation++;
                        imp->imp_initiated_at = imp->imp_generation;
                        import_set_state_nolock(imp, LUSTRE_IMP_NEW);
@@ -1864,45 +1888,48 @@ void ptlrpc_cleanup_imp(struct obd_import *imp)
 
 /* Adaptive Timeout utils */
 
-/* Update at_current with the specified value (bounded by at_min and at_max),
- * as well as the AT history "bins".
+/* Update at_current_timeout with the specified value (bounded by at_min and
+ * at_max), as well as the AT history "bins".
  *  - Bin into timeslices using AT_BINS bins.
  *  - This gives us a max of the last at_history seconds without the storage,
  *    but still smoothing out a return to normalcy from a slow response.
  *  - (E.g. remember the maximum latency in each minute of the last 4 minutes.)
  */
-int at_measured(struct adaptive_timeout *at, unsigned int val)
+timeout_t at_measured(struct adaptive_timeout *at, timeout_t timeout)
 {
-        unsigned int old = at->at_current;
+       timeout_t old_timeout = at->at_current_timeout;
        time64_t now = ktime_get_real_seconds();
        long binlimit = max_t(long, at_history / AT_BINS, 1);
 
         LASSERT(at);
-        CDEBUG(D_OTHER, "add %u to %p time=%lu v=%u (%u %u %u %u)\n",
-              val, at, (long)(now - at->at_binstart), at->at_current,
+       CDEBUG(D_OTHER, "add %u to %p time=%lld v=%u (%u %u %u %u)\n",
+              timeout, at, now - at->at_binstart, at->at_current_timeout,
                at->at_hist[0], at->at_hist[1], at->at_hist[2], at->at_hist[3]);
 
-        if (val == 0)
-                /* 0's don't count, because we never want our timeout to
-                   drop to 0, and because 0 could mean an error */
+       if (timeout <= 0)
+               /* Negative timeouts and 0's don't count, because we never
+                * want our timeout to drop to 0 or below, and because 0 could
+                * mean an error
+                */
                 return 0;
 
        spin_lock(&at->at_lock);
 
         if (unlikely(at->at_binstart == 0)) {
                 /* Special case to remove default from history */
-                at->at_current = val;
-                at->at_worst_ever = val;
-                at->at_worst_time = now;
-                at->at_hist[0] = val;
+               at->at_current_timeout = timeout;
+               at->at_worst_timeout_ever = timeout;
+               at->at_worst_timestamp = now;
+               at->at_hist[0] = timeout;
                 at->at_binstart = now;
         } else if (now - at->at_binstart < binlimit ) {
                 /* in bin 0 */
-                at->at_hist[0] = max(val, at->at_hist[0]);
-                at->at_current = max(val, at->at_current);
+               at->at_hist[0] = max_t(timeout_t, timeout, at->at_hist[0]);
+               at->at_current_timeout = max_t(timeout_t, timeout,
+                                              at->at_current_timeout);
         } else {
                 int i, shift;
-                unsigned int maxv = val;
+               timeout_t maxv = timeout;
 
                /* move bins over */
                shift = (u32)(now - at->at_binstart) / binlimit;
@@ -1910,42 +1937,45 @@ int at_measured(struct adaptive_timeout *at, unsigned int val)
                 for(i = AT_BINS - 1; i >= 0; i--) {
                         if (i >= shift) {
                                 at->at_hist[i] = at->at_hist[i - shift];
-                                maxv = max(maxv, at->at_hist[i]);
+                               maxv = max_t(timeout_t, maxv, at->at_hist[i]);
                         } else {
                                 at->at_hist[i] = 0;
                         }
                 }
-                at->at_hist[0] = val;
-                at->at_current = maxv;
+               at->at_hist[0] = timeout;
+               at->at_current_timeout = maxv;
                 at->at_binstart += shift * binlimit;
         }
 
-        if (at->at_current > at->at_worst_ever) {
-                at->at_worst_ever = at->at_current;
-                at->at_worst_time = now;
-        }
+       if (at->at_current_timeout > at->at_worst_timeout_ever) {
+               at->at_worst_timeout_ever = at->at_current_timeout;
+               at->at_worst_timestamp = now;
+       }
 
-        if (at->at_flags & AT_FLG_NOHIST)
+       if (at->at_flags & AT_FLG_NOHIST)
                 /* Only keep last reported val; keeping the rest of the history
-                   for proc only */
-                at->at_current = val;
+                * for debugfs only
+                */
+               at->at_current_timeout = timeout;
 
         if (at_max > 0)
-                at->at_current =  min(at->at_current, at_max);
-        at->at_current =  max(at->at_current, at_min);
-
-        if (at->at_current != old)
-                CDEBUG(D_OTHER, "AT %p change: old=%u new=%u delta=%d "
-                       "(val=%u) hist %u %u %u %u\n", at,
-                       old, at->at_current, at->at_current - old, val,
+               at->at_current_timeout = min_t(timeout_t,
+                                              at->at_current_timeout, at_max);
+       at->at_current_timeout = max_t(timeout_t, at->at_current_timeout,
+                                      at_min);
+       if (at->at_current_timeout != old_timeout)
+               CDEBUG(D_OTHER,
+                      "AT %p change: old=%u new=%u delta=%d (val=%d) hist %u %u %u %u\n",
+                      at, old_timeout, at->at_current_timeout,
+                      at->at_current_timeout - old_timeout, timeout,
                        at->at_hist[0], at->at_hist[1], at->at_hist[2],
                        at->at_hist[3]);
 
-        /* if we changed, report the old value */
-        old = (at->at_current != old) ? old : 0;
+       /* if we changed, report the old timeout value */
+       old_timeout = (at->at_current_timeout != old_timeout) ? old_timeout : 0;
 
        spin_unlock(&at->at_lock);
-        return old;
+       return old_timeout;
 }
 
 /* Find the imp_at index for a given portal; assign if space available */