From 0aa58d26f5df2b71a040ed6f0f419b925528b6ad Mon Sep 17 00:00:00 2001 From: Andreas Dilger Date: Thu, 13 Sep 2018 21:44:23 -0600 Subject: [PATCH] LU-11128 ptlrpc: add debugging for idle connections Add a "debug" parameter for the idle client disconnection so that it can log disconnect/reconnect events to the console. Print the idle time in the "import" file. Enable the connection debugging for all test runs. Signed-off-by: Andreas Dilger Change-Id: I47a4f73542fb4864e4f35275ecfe8907493ebbe5 Reviewed-on: https://review.whamcloud.com/33168 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Alex Zhuravlev Reviewed-by: Nathaniel Clark Reviewed-by: Oleg Drokin --- lustre/include/lustre_import.h | 13 +++++++------ lustre/obdclass/lprocfs_status.c | 6 ++++-- lustre/osc/lproc_osc.c | 35 +++++++++++++++++++++++------------ lustre/osc/osc_request.c | 1 + lustre/ptlrpc/client.c | 6 ++++-- lustre/ptlrpc/import.c | 4 +++- lustre/ptlrpc/pinger.c | 2 ++ lustre/tests/test-framework.sh | 3 ++- 8 files changed, 46 insertions(+), 24 deletions(-) diff --git a/lustre/include/lustre_import.h b/lustre/include/lustre_import.h index dfa22bb..9ba6a87 100644 --- a/lustre/include/lustre_import.h +++ b/lustre/include/lustre_import.h @@ -306,12 +306,13 @@ struct obd_import { imp_connect_tried:1, /* connected but not FULL yet */ imp_connected:1; - __u32 imp_connect_op; - __u32 imp_idle_timeout; - struct obd_connect_data imp_connect_data; - __u64 imp_connect_flags_orig; - __u64 imp_connect_flags2_orig; - int imp_connect_error; + u32 imp_connect_op; + u32 imp_idle_timeout; + u32 imp_idle_debug; + struct obd_connect_data imp_connect_data; + __u64 imp_connect_flags_orig; + __u64 imp_connect_flags2_orig; + int imp_connect_error; enum lustre_msg_magic imp_msg_magic; /* adjusted based on server capability */ diff --git a/lustre/obdclass/lprocfs_status.c b/lustre/obdclass/lprocfs_status.c index 663fdf1..e98ca6b 100644 --- a/lustre/obdclass/lprocfs_status.c +++ b/lustre/obdclass/lprocfs_status.c @@ -996,11 +996,13 @@ int lprocfs_import_seq_show(struct seq_file *m, void *data) " current_connection: %s\n" " connection_attempts: %u\n" " generation: %u\n" - " in-progress_invalidations: %u\n", + " in-progress_invalidations: %u\n" + " idle: %lld sec\n", nidstr, imp->imp_conn_cnt, imp->imp_generation, - atomic_read(&imp->imp_inval_count)); + atomic_read(&imp->imp_inval_count), + ktime_get_real_seconds() - imp->imp_last_reply_time); spin_unlock(&imp->imp_lock); if (obd->obd_svc_stats == NULL) diff --git a/lustre/osc/lproc_osc.c b/lustre/osc/lproc_osc.c index 463c179..7ff4d11 100644 --- a/lustre/osc/lproc_osc.c +++ b/lustre/osc/lproc_osc.c @@ -630,24 +630,35 @@ static ssize_t idle_timeout_store(struct kobject *kobj, struct attribute *attr, obd_kset.kobj); struct client_obd *cli = &dev->u.cli; struct ptlrpc_request *req; + unsigned int idle_debug = 0; unsigned int val; int rc; - rc = kstrtouint(buffer, 10, &val); - if (rc) - return rc; + if (strncmp(buffer, "debug", 5) == 0) { + idle_debug = D_CONSOLE; + } else if (strncmp(buffer, "nodebug", 6) == 0) { + idle_debug = D_HA; + } else { + rc = kstrtouint(buffer, 10, &val); + if (rc) + return rc; - if (val > CONNECTION_SWITCH_MAX) - return -ERANGE; + if (val > CONNECTION_SWITCH_MAX) + return -ERANGE; + } LPROCFS_CLIMP_CHECK(dev); - cli->cl_import->imp_idle_timeout = val; - - /* to initiate the connection if it's in IDLE state */ - if (!val) { - req = ptlrpc_request_alloc(cli->cl_import, &RQF_OST_STATFS); - if (req != NULL) - ptlrpc_req_finished(req); + if (idle_debug) { + cli->cl_import->imp_idle_debug = idle_debug; + } else { + if (!val) { + /* initiate the connection if it's in IDLE state */ + req = ptlrpc_request_alloc(cli->cl_import, + &RQF_OST_STATFS); + if (req != NULL) + ptlrpc_req_finished(req); + } + cli->cl_import->imp_idle_timeout = val; } LPROCFS_CLIMP_EXIT(dev); diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index 5c707de..1ac295b8 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -3252,6 +3252,7 @@ int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg) list_add_tail(&cli->cl_shrink_list, &osc_shrink_list); spin_unlock(&osc_shrink_lock); cli->cl_import->imp_idle_timeout = osc_idle_timeout; + cli->cl_import->imp_idle_debug = D_HA; RETURN(0); } diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index 22c0d6f..a3b8504 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -869,8 +869,10 @@ ptlrpc_request_alloc_internal(struct obd_import *imp, if (unlikely(imp->imp_state == LUSTRE_IMP_IDLE)) { int rc; - CDEBUG(D_INFO, "%s: connect at new req\n", - imp->imp_obd->obd_name); + CDEBUG_LIMIT(imp->imp_idle_debug, + "%s: reconnect after %llds idle\n", + imp->imp_obd->obd_name, ktime_get_real_seconds() - + imp->imp_last_reply_time); spin_lock(&imp->imp_lock); if (imp->imp_state == LUSTRE_IMP_IDLE) { imp->imp_generation++; diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index 82fe902..dd9ba73 100644 --- a/lustre/ptlrpc/import.c +++ b/lustre/ptlrpc/import.c @@ -1696,7 +1696,9 @@ int ptlrpc_disconnect_and_idle_import(struct obd_import *imp) if (IS_ERR(req)) RETURN(PTR_ERR(req)); - CDEBUG(D_INFO, "%s: disconnect\n", imp->imp_obd->obd_name); + CDEBUG_LIMIT(imp->imp_idle_debug, "%s: disconnect after %llus idle\n", + imp->imp_obd->obd_name, + ktime_get_real_seconds() - imp->imp_last_reply_time); req->rq_interpret_reply = ptlrpc_disconnect_idle_interpret; ptlrpcd_add_req(req); diff --git a/lustre/ptlrpc/pinger.c b/lustre/ptlrpc/pinger.c index b704b9e..ef808d8 100644 --- a/lustre/ptlrpc/pinger.c +++ b/lustre/ptlrpc/pinger.c @@ -99,6 +99,7 @@ static bool ptlrpc_check_import_is_idle(struct obd_import *imp) if (!imp->imp_idle_timeout) return false; + /* 4 comes from: * - client_obd_setup() - hashed import * - ptlrpcd_alloc_work() @@ -107,6 +108,7 @@ static bool ptlrpc_check_import_is_idle(struct obd_import *imp) */ if (atomic_read(&imp->imp_refcount) > 4) return false; + /* any lock increases ns_bref being a resource holder */ if (ns && atomic_read(&ns->ns_bref) > 0) return false; diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index afd64d2..1604790 100755 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -4804,6 +4804,7 @@ init_param_vars () { osc_ensure_active $SINGLEMDS $TIMEOUT osc_ensure_active client $TIMEOUT + $LCTL set_param osc.*.idle_timeout=debug if [ -n "$(lctl get_param -n mdc.*.connect_flags|grep jobstats)" ]; then local current_jobid_var=$($LCTL get_param -n jobid_var) @@ -4811,7 +4812,7 @@ init_param_vars () { if [ $JOBID_VAR = "existing" ]; then echo "keeping jobstats as $current_jobid_var" elif [ $current_jobid_var != $JOBID_VAR ]; then - echo "seting jobstats to $JOBID_VAR" + echo "setting jobstats to $JOBID_VAR" set_conf_param_and_check client \ "$LCTL get_param -n jobid_var" \ -- 1.8.3.1