From 08ab5c19529fe1200827cbee267bf3eb8d45c119 Mon Sep 17 00:00:00 2001
From: bobijam
Date: Wed, 20 Feb 2008 02:17:38 +0000
Subject: [PATCH] Branch b1_6

b=14529
i=adilger
i=panda

Description: MDS or OSS nodes crash due to stack overflow
Details    : Code changes in 1.6.4 increased the stack usage of some functions.
             In some cases, in conjunction with device drivers that use a lot
             of stack, the MDS (or possibly OSS) service threads could overflow
             the stack. One change which was identified to consume additional
             stack has been reworked to avoid the extra stack usage.
---
 lustre/ChangeLog                | 10 ++++++++
 lustre/include/lprocfs_status.h | 54 +++--------------------------------------
 lustre/lvfs/lvfs_lib.c          | 57 +++++++++++++++++++++++++++++++++++++++--
 3 files changed, 69 insertions(+), 52 deletions(-)

diff --git a/lustre/ChangeLog b/lustre/ChangeLog
index 45d57a2..4846887 100644
--- a/lustre/ChangeLog
+++ b/lustre/ChangeLog
@@ -17,6 +17,16 @@ tbd Sun Microsystems, Inc.
        * RHEL 4 and RHEL 5/SLES 10 clients behaves differently on 'cd' to a
          removed cwd "./" (refer to Bugzilla 14399).
 
+Severity   : major
+Frequency  : rare, depends on device drivers and load
+Bugzilla   : 14529
+Description: MDS or OSS nodes crash due to stack overflow
+Details    : Code changes in 1.6.4 increased the stack usage of some functions.
+             In some cases, in conjunction with device drivers that use a lot
+             of stack, the MDS (or possibly OSS) service threads could overflow
+             the stack. One change which was identified to consume additional
+             stack has been reworked to avoid the extra stack usage.
+
 Severity   : enhancement
 Bugzilla   : 14876
 Description: Update to RHEL5 latest kernel-2.6.18-53.1.13.el5.
diff --git a/lustre/include/lprocfs_status.h b/lustre/include/lprocfs_status.h
index 55c01d5..d2d0697 100644
--- a/lustre/include/lprocfs_status.h
+++ b/lustre/include/lprocfs_status.h
@@ -300,59 +300,13 @@ static inline void lprocfs_stats_unlock(struct lprocfs_stats *stats)
  * count itself to reside within a single cache line.
  */
 
-static inline void lprocfs_counter_add(struct lprocfs_stats *stats, int idx,
-                                       long amount)
-{
-        struct lprocfs_counter *percpu_cntr;
-        int smp_id;
-
-        if (!stats)
-                return;
-
-        /* With per-client stats, statistics are allocated only for
-         * single CPU area, so the smp_id should be 0 always. */
-        smp_id = lprocfs_stats_lock(stats, LPROCFS_GET_SMP_ID);
-
-        percpu_cntr = &(stats->ls_percpu[smp_id]->lp_cntr[idx]);
-        atomic_inc(&percpu_cntr->lc_cntl.la_entry);
-        percpu_cntr->lc_count++;
-
-        if (percpu_cntr->lc_config & LPROCFS_CNTR_AVGMINMAX) {
-                percpu_cntr->lc_sum += amount;
-                if (percpu_cntr->lc_config & LPROCFS_CNTR_STDDEV)
-                        percpu_cntr->lc_sumsquare += (__u64)amount * amount;
-                if (amount < percpu_cntr->lc_min)
-                        percpu_cntr->lc_min = amount;
-                if (amount > percpu_cntr->lc_max)
-                        percpu_cntr->lc_max = amount;
-        }
-        atomic_inc(&percpu_cntr->lc_cntl.la_exit);
-        lprocfs_stats_unlock(stats);
-}
+extern void lprocfs_counter_add(struct lprocfs_stats *stats, int idx,
+                                long amount);
+extern void lprocfs_counter_sub(struct lprocfs_stats *stats, int idx,
+                                long amount);
 
 #define lprocfs_counter_incr(stats, idx) \
         lprocfs_counter_add(stats, idx, 1)
-
-static inline void lprocfs_counter_sub(struct lprocfs_stats *stats, int idx,
-                                       long amount)
-{
-        struct lprocfs_counter *percpu_cntr;
-        int smp_id;
-
-        if (!stats)
-                return;
-
-        /* With per-client stats, statistics are allocated only for
-         * single CPU area, so the smp_id should be 0 always. */
-        smp_id = lprocfs_stats_lock(stats, LPROCFS_GET_SMP_ID);
-
-        percpu_cntr = &(stats->ls_percpu[smp_id]->lp_cntr[idx]);
-        atomic_inc(&percpu_cntr->lc_cntl.la_entry);
-        if (percpu_cntr->lc_config & LPROCFS_CNTR_AVGMINMAX)
-                percpu_cntr->lc_sum -= amount;
-        atomic_inc(&percpu_cntr->lc_cntl.la_exit);
-        lprocfs_stats_unlock(stats);
-}
 
 #define lprocfs_counter_decr(stats, idx) \
         lprocfs_counter_sub(stats, idx, 1)
diff --git a/lustre/lvfs/lvfs_lib.c b/lustre/lvfs/lvfs_lib.c
index 208801a..237b4be 100644
--- a/lustre/lvfs/lvfs_lib.c
+++ b/lustre/lvfs/lvfs_lib.c
@@ -29,6 +29,7 @@
 #include <liblustre.h>
 #endif
 #include <lustre_lib.h>
+#include <lprocfs_status.h>
 
 __u64 obd_max_pages = 0;
 __u64 obd_max_alloc = 0;
@@ -65,6 +66,7 @@ int obd_alloc_fail(const void *ptr, const char *name, const char *type,
         }
         return 0;
 }
+EXPORT_SYMBOL(obd_alloc_fail);
 
 #ifdef __KERNEL__
 void obd_update_maxusage()
@@ -155,9 +157,60 @@ __s64 lprocfs_read_helper(struct lprocfs_counter *lc,
         RETURN(ret);
 }
 EXPORT_SYMBOL(lprocfs_read_helper);
-#endif
 
-EXPORT_SYMBOL(obd_alloc_fail);
+void lprocfs_counter_add(struct lprocfs_stats *stats, int idx,
+                         long amount)
+{
+        struct lprocfs_counter *percpu_cntr;
+        int smp_id;
+
+        if (stats == NULL)
+                return;
+
+        /* With per-client stats, statistics are allocated only for
+         * single CPU area, so the smp_id should be 0 always. */
+        smp_id = lprocfs_stats_lock(stats, LPROCFS_GET_SMP_ID);
+
+        percpu_cntr = &(stats->ls_percpu[smp_id]->lp_cntr[idx]);
+        atomic_inc(&percpu_cntr->lc_cntl.la_entry);
+        percpu_cntr->lc_count++;
+
+        if (percpu_cntr->lc_config & LPROCFS_CNTR_AVGMINMAX) {
+                percpu_cntr->lc_sum += amount;
+                if (percpu_cntr->lc_config & LPROCFS_CNTR_STDDEV)
+                        percpu_cntr->lc_sumsquare += (__u64)amount * amount;
+                if (amount < percpu_cntr->lc_min)
+                        percpu_cntr->lc_min = amount;
+                if (amount > percpu_cntr->lc_max)
+                        percpu_cntr->lc_max = amount;
+        }
+        atomic_inc(&percpu_cntr->lc_cntl.la_exit);
+        lprocfs_stats_unlock(stats);
+}
+EXPORT_SYMBOL(lprocfs_counter_add);
+
+void lprocfs_counter_sub(struct lprocfs_stats *stats, int idx,
+                         long amount)
+{
+        struct lprocfs_counter *percpu_cntr;
+        int smp_id;
+
+        if (stats == NULL)
+                return;
+
+        /* With per-client stats, statistics are allocated only for
+         * single CPU area, so the smp_id should be 0 always. */
+        smp_id = lprocfs_stats_lock(stats, LPROCFS_GET_SMP_ID);
+
+        percpu_cntr = &(stats->ls_percpu[smp_id]->lp_cntr[idx]);
+        atomic_inc(&percpu_cntr->lc_cntl.la_entry);
+        if (percpu_cntr->lc_config & LPROCFS_CNTR_AVGMINMAX)
+                percpu_cntr->lc_sum -= amount;
+        atomic_inc(&percpu_cntr->lc_cntl.la_exit);
+        lprocfs_stats_unlock(stats);
+}
+EXPORT_SYMBOL(lprocfs_counter_sub);
+#endif /* LPROCFS */
 EXPORT_SYMBOL(obd_fail_loc);
 EXPORT_SYMBOL(obd_alloc_fail_rate);
-- 
1.8.3.1
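
Note on why un-inlining saves stack: code expanded inline contributes its
locals and temporaries to every caller's stack frame, where they typically
stay allocated for the caller's entire lifetime, while an out-of-line callee
keeps them in its own frame, which is popped as soon as the call returns.
Deep MDS/OSS service-thread call chains therefore no longer pay that cost at
every lprocfs_counter_add()/_sub() call site. The userspace sketch below
illustrates only this general effect; it is not Lustre code, the names in it
are hypothetical, and the oversized buffer is a deliberate exaggeration
standing in for whatever locals and temporaries the inlined helpers
accumulated.

/*
 * Minimal sketch (hypothetical names, not Lustre code): the same body,
 * once "static inline" and once out of line.
 */
#include <stdio.h>
#include <string.h>

#define SCRATCH_SIZE 512        /* exaggerated stand-in for bulky locals */

/* After inlining, "scratch" is typically folded into the caller's own
 * stack frame (exact behavior is compiler-dependent). */
static inline size_t fmt_inline(const char *msg)
{
        char scratch[SCRATCH_SIZE];

        snprintf(scratch, sizeof(scratch), "inline: %s", msg);
        return strlen(scratch);
}

/* Out of line, "scratch" lives in this function's own frame, which
 * exists only for the duration of the call. */
static size_t fmt_out_of_line(const char *msg)
{
        char scratch[SCRATCH_SIZE];

        snprintf(scratch, sizeof(scratch), "out-of-line: %s", msg);
        return strlen(scratch);
}

int main(void)
{
        /* Think of main() as a service thread deep in a call chain:
         * the inline call grows this frame by roughly SCRATCH_SIZE
         * bytes, the out-of-line call does not. */
        printf("%zu %zu\n", fmt_inline("x"), fmt_out_of_line("x"));
        return 0;
}

If a suitable compiler is available, building such a sketch with gcc's
-fstack-usage option reports the per-function frame sizes directly; on a
kernel tree, scripts/checkstack.pl gives a similar per-function view.
Neither tool is part of this patch.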