The sanity test_160f test was failing intermittently because the first
Changelog user ("cl6") was being unregistered in some cases. The test
set changelog_max_idle_time=10, then slept for 9s and did some
operations that could be slow. In rare cases the test ran too long and
the MDS evicted the "good" user along with the bad user:
MDD0000: Force deregister of ChangeLog user cl7 idle more than 35s
MDD0000: Force deregister of ChangeLog user cl6 idle more than 11s
Change the test sleep interval to be half of the max_idle limit so
that there is no risk of the "good" Changelog user being evicted.
Add some logging to the test so that it is easier to correlate test
script actions with events in the MDS debug log.
Lustre-change: https://review.whamcloud.com/36468
Lustre-commit: 4b0f0164c6ed761897409186376e9edc989323c9
Fixes: 31fef6845e8b ("LU-10680 mdd: create gc thread when no current transaction")
Test-Parameters: trivial envdefinitions=ONLY=160 testlist=sanity,sanity
Test-Parameters: envdefinitions=ONLY=160 mdscount=2 testlist=sanity,sanity
Signed-off-by: Andreas Dilger <adilger@whamcloud.com>
Change-Id: I0e4c9c271d98a2716f848e75676780b0383ebbe5
Reviewed-by: Faccini Bruno <bruno.faccini@intel.com>
Signed-off-by: James Nunez <jnunez@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Reviewed-on: https://review.whamcloud.com/38833
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
# generate some changelog records to accumulate on each MDT
test_mkdir -c $MDSCOUNT $DIR/$tdir || error "test_mkdir $tdir failed"
# generate some changelog records to accumulate on each MDT
test_mkdir -c $MDSCOUNT $DIR/$tdir || error "test_mkdir $tdir failed"
+ log "$(date +%s): creating first files"
createmany -m $DIR/$tdir/$tfile $((MDSCOUNT * 2)) ||
error "create $DIR/$tdir/$tfile failed"
# check changelogs have been generated
createmany -m $DIR/$tdir/$tfile $((MDSCOUNT * 2)) ||
error "create $DIR/$tdir/$tfile failed"
# check changelogs have been generated
+ local start=$SECONDS
+ local idle_time=$((MDSCOUNT * 5 + 5))
local nbcl=$(changelog_dump | wc -l)
[[ $nbcl -eq 0 ]] && error "no changelogs found"
local nbcl=$(changelog_dump | wc -l)
[[ $nbcl -eq 0 ]] && error "no changelogs found"
- for param in "changelog_max_idle_time=10" \
+ for param in "changelog_max_idle_time=$idle_time" \
"changelog_gc=1" \
"changelog_min_gc_interval=2" \
"changelog_min_free_cat_entries=3"; do
"changelog_gc=1" \
"changelog_min_gc_interval=2" \
"changelog_min_free_cat_entries=3"; do
do_nodes $mdts $LCTL set_param mdd.*.$param
done
do_nodes $mdts $LCTL set_param mdd.*.$param
done
- # force cl_user2 to be idle (1st part)
- sleep 9
+ # force cl_user2 to be idle (1st part), but also cancel the
+ # cl_user1 records so that it is not evicted later in the test.
+ local sleep1=$((idle_time / 2))
+ echo "$(date +%s): sleep1 $sleep1/${idle_time}s"
+ sleep $sleep1
# simulate changelog catalog almost full
#define OBD_FAIL_CAT_FREE_RECORDS 0x1313
# simulate changelog catalog almost full
#define OBD_FAIL_CAT_FREE_RECORDS 0x1313
"$user_rec1, but is $user_rec2"
done
"$user_rec1, but is $user_rec2"
done
- # force cl_user2 to be idle (2nd part) and to reach
- # changelog_max_idle_time
- sleep 2
+ # force cl_user2 idle (2nd part) to just exceed changelog_max_idle_time
+ local sleep2=$((idle_time - (SECONDS - start) + 1))
+ echo "$(date +%s): sleep2 $sleep2/${idle_time}s"
+ sleep $sleep2
- # generate one more changelog to trigger fail_loc
- createmany -m $DIR/$tdir/${tfile}bis $((MDSCOUNT * 2)) ||
- error "create $DIR/$tdir/${tfile}bis failed"
+ # Generate one more changelog to trigger GC at fail_loc for cl_user2.
+ # cl_user1 should be OK because it recently processed records.
+ echo "$(date +%s): creating $((MDSCOUNT * 2)) files"
+ createmany -m $DIR/$tdir/${tfile}b $((MDSCOUNT * 2)) ||
+ error "create $DIR/$tdir/${tfile}b failed"
# ensure gc thread is done
for i in $(mdts_nodes); do
# ensure gc thread is done
for i in $(mdts_nodes); do