From 6180cba98a9f94d85f070e20fd57e254a09b25d3 Mon Sep 17 00:00:00 2001 From: adilger Date: Tue, 13 May 2008 05:10:52 +0000 Subject: [PATCH] Branch b_release_1_6_5 b=15759 Description: MDS or OSS service threads fail startup with -24 (-EMFILE) Details : During startup under recovery, it is possible for service thread startup to fail in ptlrpc_start_threads() if one of the threads begins processing a request and then starts an additional thread. This causes ptlrpc_start_threads() to try and start 1 too many threads and get an error. i=robert.read i=oleg.drokin --- lustre/ChangeLog | 12 +++++++++++- lustre/include/obd_support.h | 1 + lustre/ptlrpc/service.c | 11 ++++++++--- lustre/tests/conf-sanity.sh | 11 +++++++++++ 4 files changed, 31 insertions(+), 4 deletions(-) diff --git a/lustre/ChangeLog b/lustre/ChangeLog index 8f5d8c4..e182fa9 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -26,7 +26,17 @@ For more information, please refer to bugzilla 13904. Severity : minor -Frequency : very rare +Frequency : rare, only if {mds,oss}_num_threads is specified +Bugzilla : 15759 +Description: MDS or OSS service threads fail startup with -24 (-EMFILE) +Details : During startup under recovery, it is possible for service + thread startup to fail in ptlrpc_start_threads() if one + of the threads begins processing a request and then starts + an additional thread. This causes ptlrpc_start_threads() + to try and start 1 too many threads and get an error. + +Severity : minor +Frequency : rare Bugzilla : 13380 Description: MDT cannot be unmounted, reporting "Mount still busy" Details : Mountpoint references were being leaked during open reply diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index e97b58d..792a6d4 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -253,6 +253,7 @@ extern unsigned int obd_alloc_fail_rate; #define OBD_FAIL_TGT_DELAY_CONNECT 0x703 #define OBD_FAIL_TGT_DELAY_RECONNECT 0x704 #define OBD_FAIL_TGT_DELAY_PRECREATE 0x705 +#define OBD_FAIL_TGT_TOOMANY_THREADS 0x706 #define OBD_FAIL_MDC_REVALIDATE_PAUSE 0x800 #define OBD_FAIL_MDC_ENQUEUE_PAUSE 0x801 diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c index 9d8c811..dc97a3a 100644 --- a/lustre/ptlrpc/service.c +++ b/lustre/ptlrpc/service.c @@ -1490,11 +1490,14 @@ int ptlrpc_start_threads(struct obd_device *dev, struct ptlrpc_service *svc) int i, rc = 0; ENTRY; - /* We require 2 threads min - see note in - ptlrpc_server_handle_request */ + /* We require 2 threads min - see note in + * ptlrpc_server_handle_request() */ LASSERT(svc->srv_threads_min >= 2); for (i = 0; i < svc->srv_threads_min; i++) { rc = ptlrpc_start_thread(dev, svc); + /* We have enough threads, don't start more. b=15759 */ + if (rc == -EMFILE) + break; if (rc) { CERROR("cannot start %s thread #%d: rc %d\n", svc->srv_thread_name, i, rc); @@ -1516,7 +1519,9 @@ int ptlrpc_start_thread(struct obd_device *dev, struct ptlrpc_service *svc) CDEBUG(D_RPCTRACE, "%s started %d min %d max %d running %d\n", svc->srv_name, svc->srv_threads_started, svc->srv_threads_min, svc->srv_threads_max, svc->srv_threads_running); - if (svc->srv_threads_started >= svc->srv_threads_max) + if (unlikely(svc->srv_threads_started >= svc->srv_threads_max) || + (OBD_FAIL_CHECK(OBD_FAIL_TGT_TOOMANY_THREADS) && + svc->srv_threads_started == svc->srv_threads_min - 1)) RETURN(-EMFILE); OBD_ALLOC(thread, sizeof(*thread)); diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh index 6dbc058..c966af2 100644 --- a/lustre/tests/conf-sanity.sh +++ b/lustre/tests/conf-sanity.sh @@ -1155,6 +1155,7 @@ test_32a() { [ -z "$TUNEFS" ] && skip "No tunefs" && return local DISK1_4=$LUSTRE/tests/disk1_4.zip [ ! -r $DISK1_4 ] && skip "Cant find $DISK1_4, skipping" && return + mkdir -p $TMP/$tdir unzip -o -j -d $TMP/$tdir $DISK1_4 || { skip "Cant unzip $DISK1_4, skipping" && return ; } load_modules sysctl lnet.debug=$PTLDEBUG @@ -1220,6 +1221,7 @@ test_32b() { [ -z "$TUNEFS" ] && skip "No tunefs" && return local DISK1_4=$LUSTRE/tests/disk1_4.zip [ ! -r $DISK1_4 ] && skip "Cant find $DISK1_4, skipping" && return + mkdir -p $TMP/$tdir unzip -o -j -d $TMP/$tdir $DISK1_4 || { skip "Cant unzip $DISK1_4, skipping" && return ; } load_modules sysctl lnet.debug=$PTLDEBUG @@ -1564,5 +1566,14 @@ test_39() { #bug 14413 } run_test 39 "leak_finder recognizes both LUSTRE and LNET malloc messages" +test_40() { # bug 15759 + start_ost + #define OBD_FAIL_TGT_TOOMANY_THREADS 0x706 + do_facet mds "sysctl -w lustre.fail_loc=0x80000706" + start_mds + cleanup +} +run_test 40 "race during service thread startup" + equals_msg `basename $0`: test complete [ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG || true -- 1.8.3.1