Whamcloud - gitweb
LU-2467 ptlrpc: Allow OBD_PINGs to be suppressed
author Li Wei <wei.g.li@intel.com>
Wed, 30 Jan 2013 12:38:59 +0000 (20:38 +0800)
committer Oleg Drokin <oleg.drokin@intel.com>
Tue, 5 Feb 2013 03:33:21 +0000 (22:33 -0500)
This patch introduces a new ptlrpc module parameter, "suppress_pings",
to provide an option for reducing excessive OBD_PINGs in large
clusters.  The parameter affects all MDTs and OSTs on a node.  It is
off (zero) by default, giving behavior identical to the current
implementation.  If it is on (non-zero), all clients of the affected
targets that understand OBD_CONNECT_PINGLESS will know, at connect
time, that pings are not required.  When suppressing pings, there must
be an external mechanism to notify the targets of client deaths, via
the targets' "evict_client" procfs entries.  In addition, a highly
available standalone MGS is also recommended when suppressing pings,
so that clients are notified (through Imperative Recovery) of target
recoveries.

The changes do two basically independent things.  One is initializing
import and export states (i.e., imp_connect_data and
exp_obd_chain_timed) according to "suppress_pings", since whether to
ping or not is a property of each import-export pair.  MGC pings
cannot be suppressed, because maintaining MGS connections is dictated by
the reliance on Imperative Recovery.

The other thing is changing pinger and import routines to respect the
import property set earlier.  (The export side does not need any
change at all.)  Pings are still needed to query last committed
transactions if there are uncommitted requests on an import, so that
resources pinned for replays can be released even when applications
become idle.  An early version of this patch removed imports that do
not need to be pinged from pinger_imports and added them back when last
committed transactions were needed or recoveries had to be initiated.
This version does not do that because a) the overhead of iterating
through 10,000 imports is not prohibitively large---around 10 ms, b)
adding imports back to pinger_imports requires the global pinger_mutex
to be held, and c) the imp_lock contention added on each import is
small.

Change-Id: Iabc84d395c978c3f156c52aebfad83621facb4fe
Signed-off-by: Li Wei <wei.g.li@intel.com>
Reviewed-on: http://review.whamcloud.com/5009
Reviewed-by: Lai Siyao <lai.siyao@intel.com>
Tested-by: Hudson
Reviewed-by: Emoly Liu <emoly.liu@intel.com>
Tested-by: Maloo <whamcloud.maloo@gmail.com>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
12 files changed:
lustre/include/lustre/lustre_idl.h
lustre/include/lustre_import.h
lustre/include/lustre_net.h
lustre/ldlm/ldlm_lib.c
lustre/llite/llite_lib.c
lustre/lod/lod_lov.c
lustre/mdt/mdt_handler.c
lustre/obdclass/genops.c
lustre/obdclass/obd_mount.c
lustre/ofd/ofd_obd.c
lustre/ptlrpc/client.c
lustre/ptlrpc/pinger.c

index 09594bc..92613ef 100644 (file)
@@ -1219,7 +1219,8 @@ extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb);
                                OBD_CONNECT_64BITHASH | OBD_CONNECT_JOBSTATS | \
                                OBD_CONNECT_EINPROGRESS | \
                                OBD_CONNECT_LIGHTWEIGHT | OBD_CONNECT_UMASK | \
-                               OBD_CONNECT_LVB_TYPE | OBD_CONNECT_LAYOUTLOCK)
+                               OBD_CONNECT_LVB_TYPE | OBD_CONNECT_LAYOUTLOCK |\
+                               OBD_CONNECT_PINGLESS)
 #define OST_CONNECT_SUPPORTED  (OBD_CONNECT_SRVLOCK | OBD_CONNECT_GRANT | \
                                 OBD_CONNECT_REQPORTAL | OBD_CONNECT_VERSION | \
                                 OBD_CONNECT_TRUNCLOCK | OBD_CONNECT_INDEX | \
@@ -1235,11 +1236,12 @@ extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb);
                                OBD_CONNECT_EINPROGRESS | \
                                OBD_CONNECT_JOBSTATS | \
                                OBD_CONNECT_LIGHTWEIGHT | OBD_CONNECT_LVB_TYPE|\
-                               OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_FID)
+                               OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_FID | \
+                               OBD_CONNECT_PINGLESS)
 #define ECHO_CONNECT_SUPPORTED (0)
 #define MGS_CONNECT_SUPPORTED  (OBD_CONNECT_VERSION | OBD_CONNECT_AT | \
                                OBD_CONNECT_FULL20 | OBD_CONNECT_IMP_RECOV | \
-                               OBD_CONNECT_MNE_SWAB)
+                               OBD_CONNECT_MNE_SWAB | OBD_CONNECT_PINGLESS)
 
 /* Features required for this version of the client to work with server */
 #define CLIENT_CONNECT_MDT_REQD (OBD_CONNECT_IBITS | OBD_CONNECT_FID | \
index cba4647..cb1c04b 100644 (file)
@@ -261,6 +261,7 @@ struct obd_import {
                                   imp_no_lock_replay:1,   /* VBR: if gap was found then no lock replays */
                                   imp_vbr_failed:1,       /* recovery by versions was failed */
                                   imp_force_verify:1,     /* force an immidiate ping */
+                                 imp_force_next_verify:1,/* force a scheduled ping */
                                   imp_pingable:1,         /* pingable */
                                   imp_resend_replay:1,    /* resend for replay */
                                   imp_no_pinger_recover:1,/* disable normal recovery, for test only. */
index e5e7885..74983d0 100644 (file)
@@ -2956,6 +2956,7 @@ int server_disconnect_export(struct obd_export *exp);
  * Pinger API (client side only)
  * @{
  */
+extern int suppress_pings;
 enum timeout_event {
         TIMEOUT_GRANT = 1
 };
index 0db5ccd..85f0fec 100644 (file)
@@ -418,7 +418,6 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg)
                 GOTO(err_ldlm, rc = -ENOENT);
         imp->imp_client = &obddev->obd_ldlm_client;
         imp->imp_connect_op = connect_op;
-        CFS_INIT_LIST_HEAD(&imp->imp_pinger_chain);
         memcpy(cli->cl_target_uuid.uuid, lustre_cfg_buf(lcfg, 1),
                LUSTRE_CFG_BUFLEN(lcfg, 1));
         class_import_put(imp);
index 501fb38..af98592 100644 (file)
@@ -218,7 +218,7 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt,
                                   OBD_CONNECT_FULL20   | OBD_CONNECT_64BITHASH|
                                  OBD_CONNECT_EINPROGRESS |
                                  OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE |
-                                 OBD_CONNECT_LAYOUTLOCK;
+                                 OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_PINGLESS;
 
         if (sbi->ll_flags & LL_SBI_SOM_PREVIEW)
                 data->ocd_connect_flags |= OBD_CONNECT_SOM;
@@ -402,7 +402,7 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt,
                                   OBD_CONNECT_MAXBYTES |
                                  OBD_CONNECT_EINPROGRESS |
                                  OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE |
-                                 OBD_CONNECT_LAYOUTLOCK;
+                                 OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_PINGLESS;
 
         if (sbi->ll_flags & LL_SBI_SOM_PREVIEW)
                 data->ocd_connect_flags |= OBD_CONNECT_SOM;
index b7460b4..1d6986b 100644 (file)
@@ -227,7 +227,8 @@ int lod_add_device(const struct lu_env *env, struct lod_device *lod,
                                           OBD_CONNECT_SKIP_ORPHAN |
                                           OBD_CONNECT_FID |
                                           OBD_CONNECT_LVB_TYPE |
-                                          OBD_CONNECT_VERSION;
+                                          OBD_CONNECT_VERSION |
+                                          OBD_CONNECT_PINGLESS;
 
                data->ocd_group = tgt_index;
                ltd = &lod->lod_ost_descs;
@@ -242,7 +243,8 @@ int lod_add_device(const struct lu_env *env, struct lod_device *lod,
                                           OBD_CONNECT_MDS_MDS |
                                           OBD_CONNECT_FID |
                                           OBD_CONNECT_AT |
-                                          OBD_CONNECT_FULL20;
+                                          OBD_CONNECT_FULL20 |
+                                          OBD_CONNECT_PINGLESS;
                /* XXX set MDS-MDS flags, remove this when running this
                 * on client*/
                data->ocd_connect_flags |= OBD_CONNECT_MDS_MDS;
index bef495f..87c2fc7 100644 (file)
@@ -5066,6 +5066,16 @@ static int mdt_connect_internal(struct obd_export *exp,
                return -EBADE;
        }
 
+       if (data->ocd_connect_flags & OBD_CONNECT_PINGLESS) {
+               if (suppress_pings) {
+                       spin_lock(&exp->exp_obd->obd_dev_lock);
+                       list_del_init(&exp->exp_obd_chain_timed);
+                       spin_unlock(&exp->exp_obd->obd_dev_lock);
+               } else {
+                       data->ocd_connect_flags &= ~OBD_CONNECT_PINGLESS;
+               }
+       }
+
        return 0;
 }
 
index 7c06088..cd1443c 100644 (file)
@@ -1037,6 +1037,7 @@ struct obd_import *class_new_import(struct obd_device *obd)
         if (imp == NULL)
                 return NULL;
 
+       CFS_INIT_LIST_HEAD(&imp->imp_pinger_chain);
         CFS_INIT_LIST_HEAD(&imp->imp_zombie_chain);
         CFS_INIT_LIST_HEAD(&imp->imp_replay_list);
         CFS_INIT_LIST_HEAD(&imp->imp_sending_list);
index ccf5a6e..8a2c429 100644 (file)
@@ -1056,7 +1056,8 @@ static int lustre_lwp_connect(struct obd_device *lwp)
        data->ocd_connect_flags |= OBD_CONNECT_MDS_MDS | OBD_CONNECT_FID |
                                   OBD_CONNECT_AT | OBD_CONNECT_LRU_RESIZE |
                                   OBD_CONNECT_FULL20 | OBD_CONNECT_LVB_TYPE |
-                                  OBD_CONNECT_LIGHTWEIGHT;
+                                  OBD_CONNECT_LIGHTWEIGHT |
+                                  OBD_CONNECT_PINGLESS;
        OBD_ALLOC_PTR(uuid);
        if (uuid == NULL)
                GOTO(out, rc = -ENOMEM);
index 1f4c108..be76426 100644 (file)
@@ -237,7 +237,17 @@ static int ofd_parse_connect_data(const struct lu_env *env,
        if (data->ocd_connect_flags & OBD_CONNECT_MAXBYTES)
                data->ocd_maxbytes = ofd->ofd_dt_conf.ddp_maxbytes;
 
-        RETURN(0);
+       if (data->ocd_connect_flags & OBD_CONNECT_PINGLESS) {
+               if (suppress_pings) {
+                       spin_lock(&exp->exp_obd->obd_dev_lock);
+                       list_del_init(&exp->exp_obd_chain_timed);
+                       spin_unlock(&exp->exp_obd->obd_dev_lock);
+               } else {
+                       data->ocd_connect_flags &= ~OBD_CONNECT_PINGLESS;
+               }
+       }
+
+       RETURN(0);
 }
 
 static int ofd_obd_reconnect(const struct lu_env *env, struct obd_export *exp,
index 54208b8..819276c 100644 (file)
@@ -1334,10 +1334,22 @@ static int after_reply(struct ptlrpc_request *req)
                         imp->imp_peer_committed_transno =
                                 lustre_msg_get_last_committed(req->rq_repmsg);
                 }
-                ptlrpc_free_committed(imp);
 
-                if (req->rq_transno > imp->imp_peer_committed_transno)
-                        ptlrpc_pinger_commit_expected(imp);
+               ptlrpc_free_committed(imp);
+
+               if (!cfs_list_empty(&imp->imp_replay_list)) {
+                       struct ptlrpc_request *last;
+
+                       last = cfs_list_entry(imp->imp_replay_list.prev,
+                                             struct ptlrpc_request,
+                                             rq_replay_list);
+                       /*
+                        * Requests with rq_replay stay on the list even if no
+                        * commit is expected.
+                        */
+                       if (last->rq_transno > imp->imp_peer_committed_transno)
+                               ptlrpc_pinger_commit_expected(imp);
+               }
 
                spin_unlock(&imp->imp_lock);
        }
index 61f66b9..f5bf639 100644 (file)
 #include <obd_class.h>
 #include "ptlrpc_internal.h"
 
+int suppress_pings;
+EXPORT_SYMBOL(suppress_pings);
+CFS_MODULE_PARM(suppress_pings, "i", int, 0644, "Suppress pings");
+
 struct mutex pinger_mutex;
 static CFS_LIST_HEAD(pinger_imports);
 static cfs_list_t timeout_list = CFS_LIST_HEAD_INIT(timeout_list);
+
 struct ptlrpc_request *
 ptlrpc_prep_ping(struct obd_import *imp)
 {
@@ -225,23 +230,39 @@ int ptlrpc_check_and_wait_suspend(struct ptlrpc_request *req)
 static void ptlrpc_pinger_process_import(struct obd_import *imp,
                                          unsigned long this_ping)
 {
-       int force, level;
+       int level;
+       int force;
+       int force_next;
+       int suppress;
 
        spin_lock(&imp->imp_lock);
+
        level = imp->imp_state;
        force = imp->imp_force_verify;
-       if (force)
-               imp->imp_force_verify = 0;
-       spin_unlock(&imp->imp_lock);
+       force_next = imp->imp_force_next_verify;
+       /*
+        * This will be used below only if the import is "FULL".
+        */
+       suppress = !!(imp->imp_connect_data.ocd_connect_flags &
+                     OBD_CONNECT_PINGLESS);
 
-        CDEBUG(level == LUSTRE_IMP_FULL ? D_INFO : D_HA,
-               "level %s/%u force %u deactive %u pingable %u\n",
-               ptlrpc_import_state_name(level), level,
-               force, imp->imp_deactive, imp->imp_pingable);
+       imp->imp_force_verify = 0;
 
-        if (cfs_time_aftereq(imp->imp_next_ping - 5 * CFS_TICK,
-                             this_ping) && force == 0)
-                return;
+       if (cfs_time_aftereq(imp->imp_next_ping - 5 * CFS_TICK, this_ping) &&
+           !force) {
+               spin_unlock(&imp->imp_lock);
+               return;
+       }
+
+       imp->imp_force_next_verify = 0;
+
+       spin_unlock(&imp->imp_lock);
+
+       CDEBUG(level == LUSTRE_IMP_FULL ? D_INFO : D_HA, "%s->%s: level %s/%u "
+              "force %u force_next %u deactive %u pingable %u suppress %u\n",
+              imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd),
+              ptlrpc_import_state_name(level), level, force, force_next,
+              imp->imp_deactive, imp->imp_pingable, suppress);
 
         if (level == LUSTRE_IMP_DISCON && !imp_is_deactive(imp)) {
                 /* wait for a while before trying recovery again */
@@ -251,13 +272,13 @@ static void ptlrpc_pinger_process_import(struct obd_import *imp,
         } else if (level != LUSTRE_IMP_FULL ||
                    imp->imp_obd->obd_no_recov ||
                    imp_is_deactive(imp)) {
-                CDEBUG(D_HA, "not pinging %s (in recovery "
-                       " or recovery disabled: %s)\n",
-                       obd2cli_tgt(imp->imp_obd),
-                       ptlrpc_import_state_name(level));
-        } else if (imp->imp_pingable || force) {
-                ptlrpc_ping(imp);
-        }
+               CDEBUG(D_HA, "%s->%s: not pinging (in recovery "
+                      "or recovery disabled: %s)\n",
+                      imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd),
+                      ptlrpc_import_state_name(level));
+       } else if ((imp->imp_pingable && !suppress) || force_next || force) {
+               ptlrpc_ping(imp);
+       }
 }
 
 static int ptlrpc_pinger_main(void *arg)
@@ -372,7 +393,14 @@ int ptlrpc_start_pinger(void)
         l_wait_event(pinger_thread->t_ctl_waitq,
                      thread_is_running(pinger_thread), &lwi);
 
-        RETURN(0);
+       if (suppress_pings)
+               CWARN("Pings will be suppressed at the request of the "
+                     "administrator.  The configuration shall meet the "
+                     "additional requirements described in the manual.  "
+                     "(Search for the \"suppress_pings\" kernel module "
+                     "parameter.)\n");
+
+       RETURN(0);
 }
 
 int ptlrpc_pinger_remove_timeouts(void);
@@ -411,7 +439,17 @@ EXPORT_SYMBOL(ptlrpc_pinger_sending_on_import);
 
 void ptlrpc_pinger_commit_expected(struct obd_import *imp)
 {
-        ptlrpc_update_next_ping(imp, 1);
+       ptlrpc_update_next_ping(imp, 1);
+       LASSERT_SPIN_LOCKED(&imp->imp_lock);
+       /*
+        * Avoid reading stale imp_connect_data.  When not sure if pings are
+        * expected or not on next connection, we assume they are not and force
+        * one anyway to guarantee the chance of updating
+        * imp_peer_committed_transno.
+        */
+       if (imp->imp_state != LUSTRE_IMP_FULL ||
+           imp->imp_connect_data.ocd_connect_flags & OBD_CONNECT_PINGLESS)
+               imp->imp_force_next_verify = 1;
 }
 
 int ptlrpc_pinger_add_import(struct obd_import *imp)