From 703a700b4135d669bac66e9b9f780ff6e72c2024 Mon Sep 17 00:00:00 2001 From: bobijam Date: Wed, 20 Feb 2008 02:34:29 +0000 Subject: [PATCH] Branch HEAD b=14529 i=adilger i=panda Description: MDS or OSS nodes crash due to stack overflow Details : Code changes in 1.8.0 increased the stack usage of some functions. In some cases, in conjunction with device drivers that use a lot of stack the MDS (or possibly OSS) service threads could overflow the stack. One change which was identified to consume additional stack has been reworked to avoid the extra stack usage. --- lustre/ChangeLog | 10 ++++++++ lustre/include/lprocfs_status.h | 54 +++------------------------------------ lustre/lvfs/lvfs_lib.c | 56 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 70 insertions(+), 50 deletions(-) diff --git a/lustre/ChangeLog b/lustre/ChangeLog index b98e81e..ab46c21 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -12,6 +12,16 @@ tbd Sun Microsystems, Inc. * RHEL 4 and RHEL 5/SLES 10 clients behaves differently on 'cd' to a removed cwd "./" (refer to Bugzilla 14399). +Severity : major +Frequency : rare, depends on device drivers and load +Bugzilla : 14529 +Description: MDS or OSS nodes crash due to stack overflow +Details : Code changes in 1.8.0 increased the stack usage of some functions. + In some cases, in conjunction with device drivers that use a lot + of stack the MDS (or possibly OSS) service threads could overflow + the stack. One change which was identified to consume additional + stack has been reworked to avoid the extra stack usage. + Severity : normal Frequency : occasional Bugzilla : 13730 diff --git a/lustre/include/lprocfs_status.h b/lustre/include/lprocfs_status.h index bce41a8..3dc9416 100644 --- a/lustre/include/lprocfs_status.h +++ b/lustre/include/lprocfs_status.h @@ -316,59 +316,13 @@ static inline void lprocfs_stats_unlock(struct lprocfs_stats *stats) * count itself to reside within a single cache line. 
*/ -static inline void lprocfs_counter_add(struct lprocfs_stats *stats, int idx, - long amount) -{ - struct lprocfs_counter *percpu_cntr; - int smp_id; - - if (stats == NULL) - return; - - /* With per-client stats, statistics are allocated only for - * single CPU area, so the smp_id should be 0 always. */ - smp_id = lprocfs_stats_lock(stats, LPROCFS_GET_SMP_ID); - - percpu_cntr = &(stats->ls_percpu[smp_id]->lp_cntr[idx]); - atomic_inc(&percpu_cntr->lc_cntl.la_entry); - percpu_cntr->lc_count++; - - if (percpu_cntr->lc_config & LPROCFS_CNTR_AVGMINMAX) { - percpu_cntr->lc_sum += amount; - if (percpu_cntr->lc_config & LPROCFS_CNTR_STDDEV) - percpu_cntr->lc_sumsquare += (__u64)amount * amount; - if (amount < percpu_cntr->lc_min) - percpu_cntr->lc_min = amount; - if (amount > percpu_cntr->lc_max) - percpu_cntr->lc_max = amount; - } - atomic_inc(&percpu_cntr->lc_cntl.la_exit); - lprocfs_stats_unlock(stats); -} +extern void lprocfs_counter_add(struct lprocfs_stats *stats, int idx, + long amount); +extern void lprocfs_counter_sub(struct lprocfs_stats *stats, int idx, + long amount); #define lprocfs_counter_incr(stats, idx) \ lprocfs_counter_add(stats, idx, 1) - -static inline void lprocfs_counter_sub(struct lprocfs_stats *stats, int idx, - long amount) -{ - struct lprocfs_counter *percpu_cntr; - int smp_id; - - if (stats == NULL) - return; - - /* With per-client stats, statistics are allocated only for - * single CPU area, so the smp_id should be 0 always. 
*/ - smp_id = lprocfs_stats_lock(stats, LPROCFS_GET_SMP_ID); - - percpu_cntr = &(stats->ls_percpu[smp_id]->lp_cntr[idx]); - atomic_inc(&percpu_cntr->lc_cntl.la_entry); - if (percpu_cntr->lc_config & LPROCFS_CNTR_AVGMINMAX) - percpu_cntr->lc_sum -= amount; - atomic_inc(&percpu_cntr->lc_cntl.la_exit); - lprocfs_stats_unlock(stats); -} #define lprocfs_counter_decr(stats, idx) \ lprocfs_counter_sub(stats, idx, 1) diff --git a/lustre/lvfs/lvfs_lib.c b/lustre/lvfs/lvfs_lib.c index 1306c06..a83c171 100644 --- a/lustre/lvfs/lvfs_lib.c +++ b/lustre/lvfs/lvfs_lib.c @@ -29,6 +29,7 @@ #include <liblustre.h> #endif #include <lustre_lib.h> +#include <lprocfs_status.h> unsigned int obd_fail_val = 0; unsigned long obd_fail_loc = 0; @@ -123,6 +124,61 @@ int __obd_fail_check_set(__u32 id, __u32 value, int set) } EXPORT_SYMBOL(__obd_fail_check_set); +#ifdef LPROCFS +void lprocfs_counter_add(struct lprocfs_stats *stats, int idx, + long amount) +{ + struct lprocfs_counter *percpu_cntr; + int smp_id; + + if (stats == NULL) + return; + + /* With per-client stats, statistics are allocated only for + * single CPU area, so the smp_id should be 0 always. 
*/ + smp_id = lprocfs_stats_lock(stats, LPROCFS_GET_SMP_ID); + + percpu_cntr = &(stats->ls_percpu[smp_id]->lp_cntr[idx]); + atomic_inc(&percpu_cntr->lc_cntl.la_entry); + percpu_cntr->lc_count++; + + if (percpu_cntr->lc_config & LPROCFS_CNTR_AVGMINMAX) { + percpu_cntr->lc_sum += amount; + if (percpu_cntr->lc_config & LPROCFS_CNTR_STDDEV) + percpu_cntr->lc_sumsquare += (__u64)amount * amount; + if (amount < percpu_cntr->lc_min) + percpu_cntr->lc_min = amount; + if (amount > percpu_cntr->lc_max) + percpu_cntr->lc_max = amount; + } + atomic_inc(&percpu_cntr->lc_cntl.la_exit); + lprocfs_stats_unlock(stats); +} +EXPORT_SYMBOL(lprocfs_counter_add); + +void lprocfs_counter_sub(struct lprocfs_stats *stats, int idx, + long amount) +{ + struct lprocfs_counter *percpu_cntr; + int smp_id; + + if (stats == NULL) + return; + + /* With per-client stats, statistics are allocated only for + * single CPU area, so the smp_id should be 0 always. */ + smp_id = lprocfs_stats_lock(stats, LPROCFS_GET_SMP_ID); + + percpu_cntr = &(stats->ls_percpu[smp_id]->lp_cntr[idx]); + atomic_inc(&percpu_cntr->lc_cntl.la_entry); + if (percpu_cntr->lc_config & LPROCFS_CNTR_AVGMINMAX) + percpu_cntr->lc_sum -= amount; + atomic_inc(&percpu_cntr->lc_cntl.la_exit); + lprocfs_stats_unlock(stats); +} +EXPORT_SYMBOL(lprocfs_counter_sub); +#endif /* LPROCFS */ + EXPORT_SYMBOL(obd_fail_loc); EXPORT_SYMBOL(obd_alloc_fail_rate); EXPORT_SYMBOL(obd_fail_val); -- 1.8.3.1