From d17f6bf5031315a0c99502f760bc6f07ea3718b1 Mon Sep 17 00:00:00 2001 From: bobijam Date: Thu, 21 May 2009 02:01:17 +0000 Subject: [PATCH] Branch b_release_1_8_1 b=19380 i=johann i=sheng.yang * limit recursive symlink depth to 7 on 8k stack machine * fix sanity test_140() accordingly. * revert local_nid_dist_zero=0 patch --- lustre/llite/symlink.c | 9 +++++--- lustre/tests/sanity.sh | 52 +++++++++++++++++++++--------------------- lustre/tests/test-framework.sh | 50 +++++++++++++++++++++------------------- 3 files changed, 58 insertions(+), 53 deletions(-) diff --git a/lustre/llite/symlink.c b/lustre/llite/symlink.c index acaf37a..351c837 100644 --- a/lustre/llite/symlink.c +++ b/lustre/llite/symlink.c @@ -80,7 +80,7 @@ static int ll_readlink_internal(struct inode *inode, CERROR("OBD_MD_LINKNAME not set on reply\n"); GOTO(failed, rc = -EPROTO); } - + LASSERT(symlen != 0); if (body->eadatasize != symlen) { CERROR("inode %lu: symlink length %d not expected %d\n", @@ -168,8 +168,11 @@ static LL_FOLLOW_LINK_RETURN_TYPE ll_follow_link(struct dentry *dentry, struct n CDEBUG(D_VFSTRACE, "VFS Op\n"); /* Limit the recursive symlink depth to 5 instead of default - * 8 links when kernel has 4k stack to prevent stack overflow. */ - if (THREAD_SIZE < 8192 && current->link_count >= 5) { + * 8 links when kernel has 4k stack to prevent stack overflow. + * For 8k stacks we need to limit it to 7 for local servers. */ + if (THREAD_SIZE < 8192 && current->link_count >= 6) { + rc = -ELOOP; + } else if (THREAD_SIZE == 8192 && current->link_count >= 8) { rc = -ELOOP; } else { down(&lli->lli_size_sem); diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index a5ad989..d697a17 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -2684,7 +2684,7 @@ test_57b() { rm -rf $dir || error "removing $dir" mkdir -p $dir || error "creating $dir" - + echo "mcreating $FILECOUNT files" createmany -m $dir/f 1 $FILECOUNT || \ error "creating files in $dir" @@ -3025,7 +3025,7 @@ test_67b() { # bug 3285 - supplementary group fails on MDS, passes on client # needs to be in /etc/groups on MDS, gid == uid # Let's use RUNAS_ID T67_UID=${T67_UID:-$RUNAS_ID} - + [ "$UID" = "$T67_UID" ] && skip "UID = T67_UID = $UID -- skipping" && return check_kernel_version 35 || return 0 do_facet mds grep -q ":$T67_UID:$T67_UID" /etc/passwd || \ @@ -3603,7 +3603,7 @@ test_99a() { chown $RUNAS_ID $DIR/d99cvsroot || error "chown $DIR/d99cvsroot failed" local oldPWD=$PWD # bug 13584, use $TMP as working dir cd $TMP - + $RUNAS cvs -d $DIR/d99cvsroot init || error "cvs init failed" cd $oldPWD } @@ -3963,9 +3963,9 @@ test_102b() { local testfile2=${testfile}2 local value=`getfattr -n trusted.lov $testfile 2> /dev/null | \ grep "trusted.lov" |sed -e 's/[^=]\+=//'` - + $MCREATE $testfile2 - setfattr -n trusted.lov -v $value $testfile2 + setfattr -n trusted.lov -v $value $testfile2 local tmp_file=${testfile}3 $GETSTRIPE -v $testfile2 > $tmp_file local stripe_size=`grep "size" $tmp_file| awk '{print $2}'` @@ -3989,9 +3989,9 @@ test_102c() { local testfile2=${testfile}2 local value=`getfattr -n lustre.lov $testfile 2> /dev/null | \ grep "lustre.lov" |sed -e 's/[^=]\+=//' ` - + $RUNAS $MCREATE $testfile2 - $RUNAS setfattr -n lustre.lov -v $value $testfile2 + $RUNAS setfattr -n lustre.lov -v $value $testfile2 local tmp_file=${testfile}3 $RUNAS $GETSTRIPE -v $testfile2 > $tmp_file local stripe_size=`grep "size" $tmp_file| awk '{print $2}'` @@ -4175,7 +4175,7 @@ test_104() { lfs df -i $DIR || error "lfs df -i $DIR failed" lfs df $DIR/$tfile || error "lfs df $DIR/$tfile failed" lfs df -ih $DIR/$tfile || error "lfs df -ih $DIR/$tfile failed" - + OSC=`lctl get_param -n devices | awk '/-osc-|OSC.*MNT/ {print $4}' | head -n 1` lctl --device %$OSC deactivate lfs df || error "lfs df with deactivated OSC failed" @@ -4433,7 +4433,7 @@ reset_async() { test_118a() #bug 11710 { reset_async - + multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c DIRTY=$(lctl get_param -n "llite.*.dump_page_cache" | grep -c dirty) WRITEBACK=$(lctl get_param "llite.*.dump_page_cache" | grep -c writeback) @@ -4475,7 +4475,7 @@ test_118b() # until a subsequent RPC completes successfully without error. multiop $DIR/$tfile Ow4096yc rm -f $DIR/$tfile - + return 0 } run_test 118b "Reclaim dirty pages on fatal error ==========" @@ -4515,7 +4515,7 @@ test_118c() if [[ $DIRTY -ne 0 || $WRITEBACK -ne 0 ]]; then error "Dirty pages not flushed to disk, dirty=$DIRTY, writeback=$WRITEBACK" fi - + rm -f $DIR/$tfile echo "Dirty pages flushed via fsync on EROFS" return 0 @@ -4531,7 +4531,7 @@ test_118d() #define OBD_FAIL_OST_BRW_PAUSE_BULK set_nodes_failloc "$(osts_nodes)" 0x214 # multiop should block due to fsync until pages are written - multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c & + multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c & MULTIPID=$! sleep 1 @@ -4571,7 +4571,7 @@ test_118f() { if [[ $RC -eq 0 ]]; then error "Must return error due to dropped pages, rc=$RC" fi - + lctl set_param fail_loc=0x0 LOCKED=$(lctl get_param -n "llite.*.dump_page_cache" | grep -c locked) @@ -4602,7 +4602,7 @@ test_118g() { # simulate local -ENOMEM multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c RC=$? - + lctl set_param fail_loc=0 if [[ $RC -eq 0 ]]; then error "Must return error due to dropped pages, rc=$RC" @@ -4614,7 +4614,7 @@ test_118g() { if [[ $LOCKED -ne 0 ]]; then error "Locked pages remain in cache, locked=$LOCKED" fi - + if [[ $DIRTY -ne 0 || $WRITEBACK -ne 0 ]]; then error "Dirty pages not flushed to disk, dirty=$DIRTY, writeback=$WRITEBACK" fi @@ -4637,7 +4637,7 @@ test_118h() { # Should simulate ENOMEM error which is recoverable and should be handled by timeout multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c RC=$? - + set_nodes_failloc "$(osts_nodes)" 0 if [[ $RC -eq 0 ]]; then error "Must return error due to dropped pages, rc=$RC" @@ -4649,7 +4649,7 @@ test_118h() { if [[ $LOCKED -ne 0 ]]; then error "Locked pages remain in cache, locked=$LOCKED" fi - + if [[ $DIRTY -ne 0 || $WRITEBACK -ne 0 ]]; then error "Dirty pages not flushed to disk, dirty=$DIRTY, writeback=$WRITEBACK" fi @@ -4668,13 +4668,13 @@ test_118i() { #define OBD_FAIL_OST_BRW_WRITE_BULK 0x20e set_nodes_failloc "$(osts_nodes)" 0x20e - + # Should simulate ENOMEM error which is recoverable and should be handled by timeout multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c & PID=$! sleep 5 set_nodes_failloc "$(osts_nodes)" 0 - + wait $PID RC=$? if [[ $RC -ne 0 ]]; then @@ -4687,7 +4687,7 @@ test_118i() { if [[ $LOCKED -ne 0 ]]; then error "Locked pages remain in cache, locked=$LOCKED" fi - + if [[ $DIRTY -ne 0 || $WRITEBACK -ne 0 ]]; then error "Dirty pages not flushed to disk, dirty=$DIRTY, writeback=$WRITEBACK" fi @@ -4721,7 +4721,7 @@ test_118j() { if [[ $LOCKED -ne 0 ]]; then error "Locked pages remain in cache, locked=$LOCKED" fi - + # in recoverable error on OST we want resend and stay until it finished if [[ $DIRTY -ne 0 || $WRITEBACK -ne 0 ]]; then error "Dirty pages not flushed to disk, dirty=$DIRTY, writeback=$WRITEBACK" @@ -5107,7 +5107,7 @@ run_test 123a "verify statahead work" test_123b () { # statahead(bug 15027) mkdir -p $DIR/$tdir createmany -o $DIR/$tdir/$tfile-%d 1000 - + cancel_lru_locks mdc cancel_lru_locks osc @@ -5305,7 +5305,7 @@ test_127() { # bug 15521 echo "got $COUNT $NAME" [ ! $MIN ] && error "Missing min value for $NAME proc entry" eval $NAME=$COUNT || error "Wrong proc format" - + case $NAME in read_bytes|write_bytes) [ $MIN -lt 4096 ] && error "min is too small: $MIN" @@ -5647,7 +5647,7 @@ test_140() { #bug-17379 cd $DIR/$tdir || error "Changing to $DIR/$tdir" cp /usr/bin/stat . || error "Copying stat to $DIR/$tdir" - # VFS limits max symlink depth to 5(4KSTACK) or 8 + # VFS limits max symlink depth to 5(4KSTACK) or 7(8KSTACK) or 8 local i=0 while i=`expr $i + 1`; do mkdir -p $i || error "Creating dir $i" @@ -5668,7 +5668,7 @@ test_140() { #bug-17379 done i=`expr $i - 1` echo "The symlink depth = $i" - [ $i -eq 4 -o $i -eq 8 ] || error "Invalid symlink depth" + [ $i -eq 5 -o $i -eq 7 -o $i -eq 8 ] || error "Invalid symlink depth" } run_test 140 "Check reasonable stack depth (shouldn't LBUG) ====" @@ -5773,7 +5773,7 @@ test_152() { cp $TF $DIR/$tfile sync || error "sync failed" lctl set_param fail_loc=0 - + # discard client's cache cancel_lru_locks osc diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index 71917c6..b7adaee 100644 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -181,10 +181,12 @@ load_modules() { LNETOPTS=$(awk '/^options lnet/ { print $0}' $MODPROBECONF | sed 's/^options lnet //g') echo $LNETOPTS | grep -q "accept=all" || LNETOPTS="$LNETOPTS accept=all"; # bug 19380 - if [ "$NETTYPE" = "tcp" -o "$NETTYPE" = "o2ib" -o "$NETTYPE" = "ptl" ]; then - echo $LNETOPTS | grep -q "local_nid_dist_zero=0" || - LNETOPTS="$LNETOPTS local_nid_dist_zero=0" - fi + # disable it for now since it only hides the stack overflow upon test w/ + # local servers +# if [ "$NETTYPE" = "tcp" -o "$NETTYPE" = "o2ib" -o "$NETTYPE" = "ptl" ]; then +# echo $LNETOPTS | grep -q "local_nid_dist_zero=0" || +# LNETOPTS="$LNETOPTS local_nid_dist_zero=0" +# fi echo "lnet options: '$LNETOPTS'" # note that insmod will ignore anything in modprobe.conf load_module ../lnet/lnet/lnet $LNETOPTS @@ -396,14 +398,14 @@ quota_save_version() { done } -# client could mount several lustre +# client could mount several lustre quota_type () { local fsname=${1:-$FSNAME} local rc=0 do_facet mgs lctl get_param mds.${fsname}-MDT*.quota_type || rc=$? do_nodes $(comma_list $(osts_nodes)) \ lctl get_param obdfilter.${fsname}-OST*.quota_type || rc=$? - return $rc + return $rc } restore_quota_type () { @@ -486,7 +488,7 @@ zconf_umount() { local client=$1 local mnt=$2 local force - local busy + local busy local need_kill [ "$3" ] && force=-f @@ -527,7 +529,7 @@ if [ \\\$running -ne \\\$mpts ]; then echo \\\$(hostname) env are INSANE!; exit 1; fi" - [ $? -eq 0 ] || rc=1 + [ $? -eq 0 ] || rc=1 done return $rc } @@ -639,7 +641,7 @@ shudown_node_hard () { ping -w 3 -c 1 $host > /dev/null 2>&1 || return 0 echo "waiting for $host to fail attempts=$attempts" [ $i -lt $attempts ] || \ - { echo "$host still pingable after power down! attempts=$attempts" && return 1; } + { echo "$host still pingable after power down! attempts=$attempts" && return 1; } done } @@ -649,7 +651,7 @@ shutdown_client() { local attempts=3 if [ "$FAILURE_MODE" = HARD ]; then - shudown_node_hard $client + shudown_node_hard $client else zconf_umount_clients $client $mnt -f fi @@ -729,14 +731,14 @@ start_client_loads () { done } -# only for remote client +# only for remote client check_client_load () { local client=$1 local var=$(client_var_name $client)_load local TESTLOAD=run_${!var}.sh ps auxww | grep -v grep | grep $client | grep -q "$TESTLOAD" || return 1 - + # bug 18914: try to connect several times not only when # check ps, but while check_catastrophe also local tries=3 @@ -809,12 +811,12 @@ restart_client_loads () { if [ "$rc" != 0 -a "$expectedfail" ]; then start_client_load $client echo "Restarted client load: on $client. Checking ..." - check_client_load $client + check_client_load $client rc=${PIPESTATUS[0]} if [ "$rc" != 0 ]; then log "Client load failed to restart on node $client, rc=$rc" # failure one client load means test fail - # we do not need to check other + # we do not need to check other return $rc fi else @@ -916,7 +918,7 @@ wait_recovery_complete () { # as we are in process of changing obd_timeout in different ways # let's set MAX longer than that local MAX=${2:-$(( TIMEOUT * 4 ))} - + local var_svc=${facet}_svc local procfile="*.${!var_svc}.recovery_status" local WAIT=0 @@ -1444,7 +1446,7 @@ init_facet_vars () { } init_facets_vars () { - remote_mds_nodsh || + remote_mds_nodsh || init_facet_vars mds $MDSDEV $MDS_MOUNT_OPTS remote_ost_nodsh && return @@ -1462,7 +1464,7 @@ init_param_vars () { export CLIVER=$(lctl get_param version | cut -d. -f 1,2) fi - remote_mds_nodsh || + remote_mds_nodsh || TIMEOUT=$(do_facet mds "lctl get_param -n timeout") log "Using TIMEOUT=$TIMEOUT" @@ -1474,9 +1476,9 @@ init_param_vars () { check_config () { local mntpt=$1 - local myMGS_host=$mgs_HOST + local myMGS_host=$mgs_HOST if [ "$NETTYPE" = "ptl" ]; then - myMGS_host=$(h2ptl $mgs_HOST | sed -e s/@ptl//) + myMGS_host=$(h2ptl $mgs_HOST | sed -e s/@ptl//) fi echo Checking config lustre mounted on $mntpt @@ -1590,7 +1592,7 @@ exclude_items_from_list () { for item in ${excluded//,/ }; do list=$(echo " $list " | sed -re "s/\s+$item\s+/ /g") done - echo $(comma_list $list) + echo $(comma_list $list) } # list, expand are the comma separated lists @@ -1971,7 +1973,7 @@ trace() { } pass() { - $TEST_FAILED && echo -n "FAIL " || echo -n "PASS " + $TEST_FAILED && echo -n "FAIL " || echo -n "PASS " echo $@ } @@ -2192,7 +2194,7 @@ get_random_entry () { rnodes=${rnodes//,/ } local -a nodes=($rnodes) - local num=${#nodes[@]} + local num=${#nodes[@]} local i=$((RANDOM * num * 2 / 65536)) echo ${nodes[i]} @@ -2387,7 +2389,7 @@ delayed_recovery_enabled () { ################################################################################ get_lustre_version () { - local node=${1:-"mds"} + local node=${1:-"mds"} do_facet $node $LCTL get_param -n version | awk '/^lustre:/ {print $2}' } @@ -2505,7 +2507,7 @@ wait_osc_import_state() { while [ "${CONN_STATE}" != "${expected}" ]; do # for disconn we can check after proc entry is removed [ "x${CONN_STATE}" == "x" -a "${expected}" == "DISCONN" ] && return 0 - # disconnect rpc should be wait not more obd_timeout + # disconnect rpc should be wait not more obd_timeout [ $i -ge $(($TIMEOUT * 3 / 2)) ] && \ error "can't put import for ${ost}(${ost_facet}) into ${expected} state" && return 1 sleep 1 -- 1.8.3.1