From d17f6bf5031315a0c99502f760bc6f07ea3718b1 Mon Sep 17 00:00:00 2001
From: bobijam <bobijam>
Date: Thu, 21 May 2009 02:01:17 +0000
Subject: [PATCH] Branch b_release_1_8_1 b=19380 i=johann i=sheng.yang

* limit recursive symlink depth to 7 on machines with 8k kernel stacks
* fix sanity test_140() accordingly
* revert the local_nid_dist_zero=0 patch (commented out in test-framework.sh)
---
 lustre/llite/symlink.c         |  9 +++++---
 lustre/tests/sanity.sh         | 52 +++++++++++++++++++++---------------------
 lustre/tests/test-framework.sh | 50 +++++++++++++++++++++-------------------
 3 files changed, 58 insertions(+), 53 deletions(-)
diff --git a/lustre/llite/symlink.c b/lustre/llite/symlink.c
index acaf37a..351c837 100644
--- a/lustre/llite/symlink.c
+++ b/lustre/llite/symlink.c
@@ -80,7 +80,7 @@ static int ll_readlink_internal(struct inode *inode,
                 CERROR("OBD_MD_LINKNAME not set on reply\n");
                 GOTO(failed, rc = -EPROTO);
         }
-        
+
         LASSERT(symlen != 0);
         if (body->eadatasize != symlen) {
                 CERROR("inode %lu: symlink length %d not expected %d\n",
@@ -168,8 +168,11 @@ static LL_FOLLOW_LINK_RETURN_TYPE ll_follow_link(struct dentry *dentry, struct n
 
         CDEBUG(D_VFSTRACE, "VFS Op\n");
         /* Limit the recursive symlink depth to 5 instead of default
-         * 8 links when kernel has 4k stack to prevent stack overflow. */
-        if (THREAD_SIZE < 8192 && current->link_count >= 5) {
+         * 8 links when kernel has 4k stack to prevent stack overflow.
+         * For 8k stacks we need to limit it to 7 for local servers. */
+        if (THREAD_SIZE < 8192 && current->link_count >= 6) {
+                rc = -ELOOP;
+        } else if (THREAD_SIZE == 8192 && current->link_count >= 8) {
                 rc = -ELOOP;
         } else {
                 down(&lli->lli_size_sem);
diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh
index a5ad989..d697a17 100644
--- a/lustre/tests/sanity.sh
+++ b/lustre/tests/sanity.sh
@@ -2684,7 +2684,7 @@ test_57b() {
 
 	rm -rf $dir || error "removing $dir"
 	mkdir -p $dir || error "creating $dir"
-	
+
 	echo "mcreating $FILECOUNT files"
 	createmany -m $dir/f 1 $FILECOUNT || \
 		error "creating files in $dir"
@@ -3025,7 +3025,7 @@ test_67b() { # bug 3285 - supplementary group fails on MDS, passes on client
 	# needs to be in /etc/groups on MDS, gid == uid
 	# Let's use RUNAS_ID
 	T67_UID=${T67_UID:-$RUNAS_ID}
-	
+
 	[ "$UID" = "$T67_UID" ] && skip "UID = T67_UID = $UID -- skipping" && return
 	check_kernel_version 35 || return 0
 	do_facet mds grep -q ":$T67_UID:$T67_UID" /etc/passwd || \
@@ -3603,7 +3603,7 @@ test_99a() {
 	chown $RUNAS_ID $DIR/d99cvsroot || error "chown $DIR/d99cvsroot failed"
 	local oldPWD=$PWD	# bug 13584, use $TMP as working dir
 	cd $TMP
-	
+
 	$RUNAS cvs -d $DIR/d99cvsroot init || error "cvs init failed"
 	cd $oldPWD
 }
@@ -3963,9 +3963,9 @@ test_102b() {
 	local testfile2=${testfile}2
 	local value=`getfattr -n trusted.lov $testfile 2> /dev/null | \
 		     grep "trusted.lov" |sed -e 's/[^=]\+=//'`
-	
+
 	$MCREATE $testfile2
-	setfattr -n trusted.lov -v $value $testfile2 	
+	setfattr -n trusted.lov -v $value $testfile2
 	local tmp_file=${testfile}3
 	$GETSTRIPE -v $testfile2 > $tmp_file
 	local stripe_size=`grep "size"  $tmp_file| awk '{print $2}'`
@@ -3989,9 +3989,9 @@ test_102c() {
 	local testfile2=${testfile}2
 	local value=`getfattr -n lustre.lov $testfile 2> /dev/null | \
 		     grep "lustre.lov" |sed -e 's/[^=]\+=//'  `
-	
+
 	$RUNAS $MCREATE $testfile2
-	$RUNAS setfattr -n lustre.lov -v $value $testfile2 	
+	$RUNAS setfattr -n lustre.lov -v $value $testfile2
 	local tmp_file=${testfile}3
 	$RUNAS $GETSTRIPE -v $testfile2 > $tmp_file
 	local stripe_size=`grep "size"  $tmp_file| awk '{print $2}'`
@@ -4175,7 +4175,7 @@ test_104() {
 	lfs df -i $DIR || error "lfs df -i $DIR failed"
 	lfs df $DIR/$tfile || error "lfs df $DIR/$tfile failed"
 	lfs df -ih $DIR/$tfile || error "lfs df -ih $DIR/$tfile failed"
-	
+
 	OSC=`lctl get_param -n devices | awk '/-osc-|OSC.*MNT/ {print $4}' | head -n 1`
 	lctl --device %$OSC deactivate
 	lfs df || error "lfs df with deactivated OSC failed"
@@ -4433,7 +4433,7 @@ reset_async() {
 test_118a() #bug 11710
 {
 	reset_async
-	
+
  	multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c
 	DIRTY=$(lctl get_param -n "llite.*.dump_page_cache" | grep -c dirty)
         WRITEBACK=$(lctl get_param "llite.*.dump_page_cache" | grep -c writeback)
@@ -4475,7 +4475,7 @@ test_118b()
 	# until a subsequent RPC completes successfully without error.
 	multiop $DIR/$tfile Ow4096yc
 	rm -f $DIR/$tfile
-	
+
 	return 0
 }
 run_test 118b "Reclaim dirty pages on fatal error =========="
@@ -4515,7 +4515,7 @@ test_118c()
 	if [[ $DIRTY -ne 0 || $WRITEBACK -ne 0 ]]; then
 		error "Dirty pages not flushed to disk, dirty=$DIRTY, writeback=$WRITEBACK"
 	fi
-	
+
 	rm -f $DIR/$tfile
 	echo "Dirty pages flushed via fsync on EROFS"
 	return 0
@@ -4531,7 +4531,7 @@ test_118d()
 	#define OBD_FAIL_OST_BRW_PAUSE_BULK
 	set_nodes_failloc "$(osts_nodes)" 0x214
 	# multiop should block due to fsync until pages are written
-	multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c &	
+	multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c &
 	MULTIPID=$!
 	sleep 1
 
@@ -4571,7 +4571,7 @@ test_118f() {
 	if [[ $RC -eq 0 ]]; then
 		error "Must return error due to dropped pages, rc=$RC"
 	fi
-	
+
         lctl set_param fail_loc=0x0
 
         LOCKED=$(lctl get_param -n "llite.*.dump_page_cache" | grep -c locked)
@@ -4602,7 +4602,7 @@ test_118g() {
 	# simulate local -ENOMEM
         multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c
         RC=$?
-	
+
         lctl set_param fail_loc=0
 	if [[ $RC -eq 0 ]]; then
 		error "Must return error due to dropped pages, rc=$RC"
@@ -4614,7 +4614,7 @@ test_118g() {
 	if [[ $LOCKED -ne 0 ]]; then
 		error "Locked pages remain in cache, locked=$LOCKED"
 	fi
-	
+
 	if [[ $DIRTY -ne 0 || $WRITEBACK -ne 0 ]]; then
 		error "Dirty pages not flushed to disk, dirty=$DIRTY, writeback=$WRITEBACK"
 	fi
@@ -4637,7 +4637,7 @@ test_118h() {
 	# Should simulate ENOMEM error which is recoverable and should be handled by timeout
         multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c
         RC=$?
-	
+
         set_nodes_failloc "$(osts_nodes)" 0
 	if [[ $RC -eq 0 ]]; then
 		error "Must return error due to dropped pages, rc=$RC"
@@ -4649,7 +4649,7 @@ test_118h() {
 	if [[ $LOCKED -ne 0 ]]; then
 		error "Locked pages remain in cache, locked=$LOCKED"
 	fi
-	
+
 	if [[ $DIRTY -ne 0 || $WRITEBACK -ne 0 ]]; then
 		error "Dirty pages not flushed to disk, dirty=$DIRTY, writeback=$WRITEBACK"
 	fi
@@ -4668,13 +4668,13 @@ test_118i() {
 
 	#define OBD_FAIL_OST_BRW_WRITE_BULK      0x20e
         set_nodes_failloc "$(osts_nodes)" 0x20e
-	
+
 	# Should simulate ENOMEM error which is recoverable and should be handled by timeout
         multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c &
 	PID=$!
 	sleep 5
 	set_nodes_failloc "$(osts_nodes)" 0
-	
+
 	wait $PID
         RC=$?
 	if [[ $RC -ne 0 ]]; then
@@ -4687,7 +4687,7 @@ test_118i() {
 	if [[ $LOCKED -ne 0 ]]; then
 		error "Locked pages remain in cache, locked=$LOCKED"
 	fi
-	
+
 	if [[ $DIRTY -ne 0 || $WRITEBACK -ne 0 ]]; then
 		error "Dirty pages not flushed to disk, dirty=$DIRTY, writeback=$WRITEBACK"
 	fi
@@ -4721,7 +4721,7 @@ test_118j() {
 	if [[ $LOCKED -ne 0 ]]; then
 		error "Locked pages remain in cache, locked=$LOCKED"
 	fi
-	
+
 	# in recoverable error on OST we want resend and stay until it finished
 	if [[ $DIRTY -ne 0 || $WRITEBACK -ne 0 ]]; then
 		error "Dirty pages not flushed to disk, dirty=$DIRTY, writeback=$WRITEBACK"
@@ -5107,7 +5107,7 @@ run_test 123a "verify statahead work"
 test_123b () { # statahead(bug 15027)
 	mkdir -p $DIR/$tdir
 	createmany -o $DIR/$tdir/$tfile-%d 1000
-	
+
         cancel_lru_locks mdc
         cancel_lru_locks osc
 
@@ -5305,7 +5305,7 @@ test_127() { # bug 15521
                 echo "got $COUNT $NAME"
                 [ ! $MIN ] && error "Missing min value for $NAME proc entry"
                 eval $NAME=$COUNT || error "Wrong proc format"
-		
+
                 case $NAME in
                         read_bytes|write_bytes)
                         [ $MIN -lt 4096 ] && error "min is too small: $MIN"
@@ -5647,7 +5647,7 @@ test_140() { #bug-17379
         cd $DIR/$tdir || error "Changing to $DIR/$tdir"
         cp /usr/bin/stat . || error "Copying stat to $DIR/$tdir"
 
-        # VFS limits max symlink depth to 5(4KSTACK) or 8
+        # Max symlink depth is 5 (4K stack), 7 (8K stack) or 8 (VFS default)
         local i=0
         while i=`expr $i + 1`; do
                 mkdir -p $i || error "Creating dir $i"
@@ -5668,7 +5668,7 @@ test_140() { #bug-17379
         done
         i=`expr $i - 1`
         echo "The symlink depth = $i"
-        [ $i -eq 4 -o $i -eq 8 ] || error "Invalid symlink depth"
+        [ $i -eq 5 -o $i -eq 7 -o $i -eq 8 ] || error "Invalid symlink depth"
 }
 run_test 140 "Check reasonable stack depth (shouldn't LBUG) ===="
 
@@ -5773,7 +5773,7 @@ test_152() {
         cp $TF $DIR/$tfile
         sync || error "sync failed"
         lctl set_param fail_loc=0
-	
+
         # discard client's cache
         cancel_lru_locks osc
 
diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh
index 71917c6..b7adaee 100644
--- a/lustre/tests/test-framework.sh
+++ b/lustre/tests/test-framework.sh
@@ -181,10 +181,12 @@ load_modules() {
         LNETOPTS=$(awk '/^options lnet/ { print $0}' $MODPROBECONF | sed 's/^options lnet //g')
     echo $LNETOPTS | grep -q "accept=all"  || LNETOPTS="$LNETOPTS accept=all";
     # bug 19380
-    if [ "$NETTYPE" = "tcp" -o "$NETTYPE" = "o2ib" -o "$NETTYPE" = "ptl" ]; then
-        echo $LNETOPTS | grep -q "local_nid_dist_zero=0" ||
-        LNETOPTS="$LNETOPTS local_nid_dist_zero=0"
-    fi
+    # Disabled for now: setting local_nid_dist_zero=0 only hides the stack
+    # overflow seen when testing with local servers.
+#    if [ "$NETTYPE" = "tcp" -o "$NETTYPE" = "o2ib" -o "$NETTYPE" = "ptl" ]; then
+#        echo $LNETOPTS | grep -q "local_nid_dist_zero=0" ||
+#        LNETOPTS="$LNETOPTS local_nid_dist_zero=0"
+#    fi
     echo "lnet options: '$LNETOPTS'"
     # note that insmod will ignore anything in modprobe.conf
     load_module ../lnet/lnet/lnet $LNETOPTS
@@ -396,14 +398,14 @@ quota_save_version() {
     done
 }
 
-# client could mount several lustre 
+# client could mount several lustre
 quota_type () {
     local fsname=${1:-$FSNAME}
     local rc=0
     do_facet mgs lctl get_param mds.${fsname}-MDT*.quota_type || rc=$?
     do_nodes $(comma_list $(osts_nodes)) \
         lctl get_param obdfilter.${fsname}-OST*.quota_type || rc=$?
-    return $rc 
+    return $rc
 }
 
 restore_quota_type () {
@@ -486,7 +488,7 @@ zconf_umount() {
     local client=$1
     local mnt=$2
     local force
-    local busy 
+    local busy
     local need_kill
 
     [ "$3" ] && force=-f
@@ -527,7 +529,7 @@ if [ \\\$running -ne \\\$mpts ]; then
     echo \\\$(hostname) env are INSANE!;
     exit 1;
 fi"
-    [ $? -eq 0 ] || rc=1 
+    [ $? -eq 0 ] || rc=1
     done
     return $rc
 }
@@ -639,7 +641,7 @@ shudown_node_hard () {
         ping -w 3 -c 1 $host > /dev/null 2>&1 || return 0
         echo "waiting for $host to fail attempts=$attempts"
         [ $i -lt $attempts ] || \
-            { echo "$host still pingable after power down! attempts=$attempts" && return 1; } 
+            { echo "$host still pingable after power down! attempts=$attempts" && return 1; }
     done
 }
 
@@ -649,7 +651,7 @@ shutdown_client() {
     local attempts=3
 
     if [ "$FAILURE_MODE" = HARD ]; then
-        shudown_node_hard $client 
+        shudown_node_hard $client
     else
        zconf_umount_clients $client $mnt -f
     fi
@@ -729,14 +731,14 @@ start_client_loads () {
     done
 }
 
-# only for remote client 
+# only for remote client
 check_client_load () {
     local client=$1
     local var=$(client_var_name $client)_load
     local TESTLOAD=run_${!var}.sh
 
     ps auxww | grep -v grep | grep $client | grep -q "$TESTLOAD" || return 1
-    
+
     # bug 18914: try to connect several times not only when
     # check ps, but  while check_catastrophe also
     local tries=3
@@ -809,12 +811,12 @@ restart_client_loads () {
         if [ "$rc" != 0 -a "$expectedfail" ]; then
             start_client_load $client
             echo "Restarted client load: on $client. Checking ..."
-            check_client_load $client 
+            check_client_load $client
             rc=${PIPESTATUS[0]}
             if [ "$rc" != 0 ]; then
                 log "Client load failed to restart on node $client, rc=$rc"
                 # failure one client load means test fail
-                # we do not need to check other 
+                # we do not need to check other
                 return $rc
             fi
         else
@@ -916,7 +918,7 @@ wait_recovery_complete () {
     # as we are in process of changing obd_timeout in different ways
     # let's set MAX longer than that
     local MAX=${2:-$(( TIMEOUT * 4 ))}
- 
+
     local var_svc=${facet}_svc
     local procfile="*.${!var_svc}.recovery_status"
     local WAIT=0
@@ -1444,7 +1446,7 @@ init_facet_vars () {
 }
 
 init_facets_vars () {
-    remote_mds_nodsh || 
+    remote_mds_nodsh ||
         init_facet_vars mds $MDSDEV $MDS_MOUNT_OPTS
 
     remote_ost_nodsh && return
@@ -1462,7 +1464,7 @@ init_param_vars () {
         export CLIVER=$(lctl get_param version | cut -d. -f 1,2)
     fi
 
-    remote_mds_nodsh || 
+    remote_mds_nodsh ||
         TIMEOUT=$(do_facet mds "lctl get_param -n timeout")
 
     log "Using TIMEOUT=$TIMEOUT"
@@ -1474,9 +1476,9 @@ init_param_vars () {
 
 check_config () {
     local mntpt=$1
-    local myMGS_host=$mgs_HOST   
+    local myMGS_host=$mgs_HOST
     if [ "$NETTYPE" = "ptl" ]; then
-        myMGS_host=$(h2ptl $mgs_HOST | sed -e s/@ptl//) 
+        myMGS_host=$(h2ptl $mgs_HOST | sed -e s/@ptl//)
     fi
 
     echo Checking config lustre mounted on $mntpt
@@ -1590,7 +1592,7 @@ exclude_items_from_list () {
     for item in ${excluded//,/ }; do
         list=$(echo " $list " | sed -re "s/\s+$item\s+/ /g")
     done
-    echo $(comma_list $list) 
+    echo $(comma_list $list)
 }
 
 # list, expand  are the comma separated lists
@@ -1971,7 +1973,7 @@ trace() {
 }
 
 pass() {
-    $TEST_FAILED && echo -n "FAIL " || echo -n "PASS " 
+    $TEST_FAILED && echo -n "FAIL " || echo -n "PASS "
     echo $@
 }
 
@@ -2192,7 +2194,7 @@ get_random_entry () {
     rnodes=${rnodes//,/ }
 
     local -a nodes=($rnodes)
-    local num=${#nodes[@]} 
+    local num=${#nodes[@]}
     local i=$((RANDOM * num * 2 / 65536))
 
     echo ${nodes[i]}
@@ -2387,7 +2389,7 @@ delayed_recovery_enabled () {
 ################################################################################
 
 get_lustre_version () {
-    local node=${1:-"mds"}    
+    local node=${1:-"mds"}
     do_facet $node $LCTL get_param -n version |  awk '/^lustre:/ {print $2}'
 }
 
@@ -2505,7 +2507,7 @@ wait_osc_import_state() {
     while [ "${CONN_STATE}" != "${expected}" ]; do
         # for disconn we can check after proc entry is removed
         [ "x${CONN_STATE}" == "x" -a "${expected}" == "DISCONN" ] && return 0
-        # disconnect rpc should be wait not more obd_timeout 
+        # disconnect rpc should be wait not more obd_timeout
         [ $i -ge $(($TIMEOUT * 3 / 2)) ] && \
             error "can't put import for ${ost}(${ost_facet}) into ${expected} state" && return 1
         sleep 1
-- 
1.8.3.1