From f53eea15d470c9bb29a7b867b733db9249aec95b Mon Sep 17 00:00:00 2001 From: James Simmons Date: Sun, 30 Aug 2020 12:45:42 -0400 Subject: [PATCH] LU-11986 lnet: don't read debugfs lnet stats when shutting down A race exist on shutdown with an external application reading the debugfs file containing lnet stats which causes an kernel crash. [ 257.192117] BUG: unable to handle kernel paging request at fffffffffffffff0 [ 257.194859] IP: [] cfs_percpt_number+0x6/0x10 [libcfs] [ 257.196863] PGD 7c14067 PUD 7c16067 PMD 0 [ 257.198665] Oops: 0000 [#1] SMP [ 257.200431] Modules linked in: ksocklnd(OE) lnet(OE) libcfs(OE) dm_service_time iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi sunrpc zfs(POE) zunicode(POE) zavl(POE) icp(POE) zcommon(POE) znvpair(POE) spl(OE) ppdev iosf_mbi crc32_pclmul crc32c_intel ghash_clmulni_intel aesni_intel lrw gf128mul glue_helper ablk_helper cryptd pcspkr sg e1000 video parport_pc parport i2c_piix4 dm_multipath dm_mod ip_tables xfs libcrc32c sd_mod crc_t10dif crct10dif_generic ata_generic pata_acpi crct10dif_pclmul crct10dif_common ata_piix serio_raw libata [last unloaded: obdclass] [ 257.222895] CPU: 0 PID: 7331 Comm: lctl Tainted: P OE ------------ 3.10.0-957.el7_lustre.x86_64 #1 [ 257.229312] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 [ 257.233659] task: ffff9c9fbaf15140 ti: ffff9c9fbabcc000 task.ti: ffff9c9fbabcc000 [ 257.238388] RIP: 0010:[] [] cfs_percpt_number+0x6/0x10 [libcfs] [ 257.243851] RSP: 0018:ffff9c9fbabcfdb0 EFLAGS: 00010296 [ 257.246400] RAX: 0000000000000000 RBX: ffff9c9fba2a5200 RCX: 0000000000000000 [ 257.250304] RDX: 0000000000000001 RSI: 00000000ffffffff RDI: 0000000000000000 [ 257.253677] RBP: ffff9c9fbabcfdd0 R08: 000000000001f120 R09: ffff9c9fbe001700 [ 257.257073] R10: ffffffffc0c376db R11: 0000000000000246 R12: 0000000000000000 [ 257.260339] R13: 0000000000000000 R14: 0000000000001000 R15: ffff9c9fba2a5200 [ 257.263204] FS: 00007fbdc89c6740(0000) GS:ffff9c9fbfc00000(0000) knlGS:0000000000000000 [ 257.266409] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 257.269105] CR2: fffffffffffffff0 CR3: 0000000022e36000 CR4: 00000000000606f0 [ 257.272529] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 257.275209] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 257.277936] Call Trace: [ 257.279245] [] ? lnet_counters_get_common+0xeb/0x150 [lnet] [ 257.283071] [] lnet_counters_get+0x6c/0x150 [lnet] [ 257.286224] [] __proc_lnet_stats+0xfb/0x810 [lnet] [ 257.288975] [] lprocfs_call_handler+0x22/0x50 [libcfs] [ 257.292387] [] proc_lnet_stats+0x25/0x30 [lnet] [ 257.295184] [] lnet_debugfs_read+0x2d/0x40 [libcfs] The solution is to only allow reading of the lnet stats when the lnet state is LNET_STATE_RUNNING. Test-Parameters: trivial testlist=sanity-lnet Change-Id: I8720a51ec358e4f6ae121acb34cc23020054ab84 Signed-off-by: James Simmons Reviewed-on: https://review.whamcloud.com/39404 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Chris Horn Reviewed-by: Serguei Smirnov Reviewed-by: Nathaniel Clark Reviewed-by: Oleg Drokin --- lnet/include/lnet/lib-lnet.h | 2 +- lnet/lnet/api-ni.c | 38 +++++++++++++++++++++++++++----------- lnet/lnet/router_proc.c | 17 ++++++----------- 3 files changed, 34 insertions(+), 23 deletions(-) diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h index a7de9a4..c7d7afc 100644 --- a/lnet/include/lnet/lib-lnet.h +++ b/lnet/include/lnet/lib-lnet.h @@ -708,7 +708,7 @@ bool lnet_delay_rule_match_locked(struct lnet_hdr *hdr, struct lnet_msg *msg); /** @} lnet_fault_simulation */ void lnet_counters_get_common(struct lnet_counters_common *common); -void lnet_counters_get(struct lnet_counters *counters); +int lnet_counters_get(struct lnet_counters *counters); void lnet_counters_reset(void); unsigned int lnet_iov_nob(unsigned int niov, struct kvec *iov); diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c index 500f4a1..dd606e8 100644 --- a/lnet/lnet/api-ni.c +++ b/lnet/lnet/api-ni.c @@ -926,16 +926,17 @@ lnet_unregister_lnd(const struct lnet_lnd *lnd) } EXPORT_SYMBOL(lnet_unregister_lnd); -void -lnet_counters_get_common(struct lnet_counters_common *common) +static void +lnet_counters_get_common_locked(struct lnet_counters_common *common) { struct lnet_counters *ctr; int i; + /* FIXME !!! Their is no assert_lnet_net_locked() to ensure this + * actually called under the protection of the lnet_net_lock. + */ memset(common, 0, sizeof(*common)); - lnet_net_lock(LNET_LOCK_EX); - cfs_percpt_for_each(ctr, i, the_lnet.ln_counters) { common->lcc_msgs_max += ctr->lct_common.lcc_msgs_max; common->lcc_msgs_alloc += ctr->lct_common.lcc_msgs_alloc; @@ -949,23 +950,33 @@ lnet_counters_get_common(struct lnet_counters_common *common) common->lcc_route_length += ctr->lct_common.lcc_route_length; common->lcc_drop_length += ctr->lct_common.lcc_drop_length; } +} + +void +lnet_counters_get_common(struct lnet_counters_common *common) +{ + lnet_net_lock(LNET_LOCK_EX); + lnet_counters_get_common_locked(common); lnet_net_unlock(LNET_LOCK_EX); } EXPORT_SYMBOL(lnet_counters_get_common); -void +int lnet_counters_get(struct lnet_counters *counters) { struct lnet_counters *ctr; struct lnet_counters_health *health = &counters->lct_health; - int i; + int i, rc = 0; memset(counters, 0, sizeof(*counters)); - lnet_counters_get_common(&counters->lct_common); - lnet_net_lock(LNET_LOCK_EX); + if (the_lnet.ln_state != LNET_STATE_RUNNING) + GOTO(out_unlock, rc = -ENODEV); + + lnet_counters_get_common_locked(&counters->lct_common); + cfs_percpt_for_each(ctr, i, the_lnet.ln_counters) { health->lch_rst_alloc += ctr->lct_health.lch_rst_alloc; health->lch_resend_count += ctr->lct_health.lch_resend_count; @@ -992,7 +1003,9 @@ lnet_counters_get(struct lnet_counters *counters) health->lch_network_timeout_count += ctr->lct_health.lch_network_timeout_count; } +out_unlock: lnet_net_unlock(LNET_LOCK_EX); + return rc; } EXPORT_SYMBOL(lnet_counters_get); @@ -1004,9 +1017,12 @@ lnet_counters_reset(void) lnet_net_lock(LNET_LOCK_EX); + if (the_lnet.ln_state != LNET_STATE_RUNNING) + goto avoid_reset; + cfs_percpt_for_each(counters, i, the_lnet.ln_counters) memset(counters, 0, sizeof(struct lnet_counters)); - +avoid_reset: lnet_net_unlock(LNET_LOCK_EX); } @@ -3751,9 +3767,9 @@ LNetCtl(unsigned int cmd, void *arg) return -EINVAL; mutex_lock(&the_lnet.ln_api_mutex); - lnet_counters_get(&lnet_stats->st_cntrs); + rc = lnet_counters_get(&lnet_stats->st_cntrs); mutex_unlock(&the_lnet.ln_api_mutex); - return 0; + return rc; } case IOC_LIBCFS_CONFIG_RTR: diff --git a/lnet/lnet/router_proc.c b/lnet/lnet/router_proc.c index b517dcf..4f42ea3 100644 --- a/lnet/lnet/router_proc.c +++ b/lnet/lnet/router_proc.c @@ -85,8 +85,7 @@ static int __proc_lnet_stats(void *data, int write, struct lnet_counters *ctrs; struct lnet_counters_common common; int len; - char *tmpstr; - const int tmpsiz = 256; /* 7 %u and 4 __u64 */ + char tmpstr[256]; /* 7 %u and 4 u64 */ if (write) { lnet_counters_reset(); @@ -99,16 +98,13 @@ static int __proc_lnet_stats(void *data, int write, if (ctrs == NULL) return -ENOMEM; - LIBCFS_ALLOC(tmpstr, tmpsiz); - if (tmpstr == NULL) { - LIBCFS_FREE(ctrs, sizeof(*ctrs)); - return -ENOMEM; - } + rc = lnet_counters_get(ctrs); + if (rc) + goto out_no_ctrs; - lnet_counters_get(ctrs); common = ctrs->lct_common; - len = scnprintf(tmpstr, tmpsiz, + len = scnprintf(tmpstr, sizeof(tmpstr), "%u %u %u %u %u %u %u %llu %llu " "%llu %llu", common.lcc_msgs_alloc, common.lcc_msgs_max, @@ -123,8 +119,7 @@ static int __proc_lnet_stats(void *data, int write, else rc = cfs_trace_copyout_string(buffer, nob, tmpstr + pos, "\n"); - - LIBCFS_FREE(tmpstr, tmpsiz); +out_no_ctrs: LIBCFS_FREE(ctrs, sizeof(*ctrs)); return rc; } -- 1.8.3.1