From 23fde1f89bec0adf4f7181ccce5a236eac371a38 Mon Sep 17 00:00:00 2001 From: Andreas Dilger Date: Tue, 29 Mar 2016 11:29:57 -0600 Subject: [PATCH] LU-7759 llite: handle inactive OSTs better in statfs Change the order of checks for inactive OSCs in lov_prep_statfs_set() so that administratively disabled OSTs do not generate any output in "lfs df" at all, to avoid needlessly cluttering the output. Enable the lazystatfs mount option by default, so that "df" does not hang when an OST is temporarily offline. Fix test-framework.sh to use "lfs df $MOUNT" instead of "df $MOUNT" to determine when recovery is complete, now that "df" does not block. Signed-off-by: Andreas Dilger Change-Id: I993761a7eb120a36a1b80c2822cb6d8011ccab07 Reviewed-on: http://review.whamcloud.com/19195 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: John L. Hammond Reviewed-by: Oleg Drokin --- lustre/llite/llite_lib.c | 1 + lustre/lov/lov_request.c | 21 +++++++++++---------- lustre/obdclass/genops.c | 3 +++ lustre/tests/recovery-small.sh | 17 ++++++++++++----- lustre/tests/replay-ost-single.sh | 8 ++++---- lustre/tests/test-framework.sh | 29 +++++++++++++++-------------- 6 files changed, 46 insertions(+), 33 deletions(-) diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c index d442e22..276457b 100644 --- a/lustre/llite/llite_lib.c +++ b/lustre/llite/llite_lib.c @@ -117,6 +117,7 @@ static struct ll_sb_info *ll_init_sbi(void) #ifdef HAVE_LRU_RESIZE_SUPPORT sbi->ll_flags |= LL_SBI_LRU_RESIZE; #endif + sbi->ll_flags |= LL_SBI_LAZYSTATFS; for (i = 0; i <= LL_PROCESS_HIST_MAX; i++) { spin_lock_init(&sbi->ll_rw_extents_info.pp_extents[i]. 
diff --git a/lustre/lov/lov_request.c b/lustre/lov/lov_request.c index a96bfbb..dc03ce6 100644 --- a/lustre/lov/lov_request.c +++ b/lustre/lov/lov_request.c @@ -361,19 +361,20 @@ int lov_prep_statfs_set(struct obd_device *obd, struct obd_info *oinfo, continue; } + /* skip targets that have been explicitly disabled by the + * administrator */ + if (!lov->lov_tgts[i]->ltd_exp) { + CDEBUG(D_HA, "lov idx %d administratively disabled\n", + i); + continue; + } + if (!lov->lov_tgts[i]->ltd_active) lov_check_and_wait_active(lov, i); - /* skip targets that have been explicitely disabled by the - * administrator */ - if (!lov->lov_tgts[i]->ltd_exp) { - CDEBUG(D_HA, "lov idx %d administratively disabled\n", i); - continue; - } - - OBD_ALLOC(req, sizeof(*req)); - if (req == NULL) - GOTO(out_set, rc = -ENOMEM); + OBD_ALLOC(req, sizeof(*req)); + if (req == NULL) + GOTO(out_set, rc = -ENOMEM); OBD_ALLOC(req->rq_oi.oi_osfs, sizeof(*req->rq_oi.oi_osfs)); if (req->rq_oi.oi_osfs == NULL) { diff --git a/lustre/obdclass/genops.c b/lustre/obdclass/genops.c index 749d0ac..fc93b4a 100644 --- a/lustre/obdclass/genops.c +++ b/lustre/obdclass/genops.c @@ -1123,6 +1123,7 @@ void __class_export_add_lock_ref(struct obd_export *exp, struct ldlm_lock *lock) lock, exp, lock->l_exp_refs_nr); spin_unlock(&exp->exp_locks_list_guard); } +EXPORT_SYMBOL(__class_export_add_lock_ref); void __class_export_del_lock_ref(struct obd_export *exp, struct ldlm_lock *lock) { @@ -1141,6 +1142,7 @@ void __class_export_del_lock_ref(struct obd_export *exp, struct ldlm_lock *lock) lock, exp, lock->l_exp_refs_nr); spin_unlock(&exp->exp_locks_list_guard); } +EXPORT_SYMBOL(__class_export_del_lock_ref); #endif /* A connection defines an export context in which preallocation can @@ -1529,6 +1531,7 @@ int obd_export_evict_by_uuid(struct obd_device *obd, const char *uuid) #if LUSTRE_TRACKS_LOCK_EXP_REFS void (*class_export_dump_hook)(struct obd_export*) = NULL; +EXPORT_SYMBOL(class_export_dump_hook); #endif static void 
print_export_data(struct obd_export *exp, const char *status, diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index 0d2cfa7..1c498b7 100755 --- a/lustre/tests/recovery-small.sh +++ b/lustre/tests/recovery-small.sh @@ -255,11 +255,12 @@ test_10d() { rm -f $TMP/$tfile echo -n ", world" | dd of=$TMP/$tfile bs=1c seek=5 + remount_client $MOUNT mount_client $MOUNT2 cancel_lru_locks osc $LFS setstripe -i 0 -c 1 $DIR1/$tfile - echo -n hello > $DIR1/$tfile + echo -n hello | dd of=$DIR1/$tfile bs=5 stat $DIR2/$tfile >& /dev/null $LCTL set_param fail_err=71 @@ -267,8 +268,9 @@ test_10d() { client_reconnect - cmp $DIR1/$tfile $DIR2/$tfile || error "file contents differ" - cmp $DIR1/$tfile $TMP/$tfile || error "wrong content found" + cancel_lru_locks osc + cmp -l $DIR1/$tfile $DIR2/$tfile || error "file contents differ" + cmp -l $DIR1/$tfile $TMP/$tfile || error "wrong content found" evict=$(do_facet client $LCTL get_param osc.$FSNAME-OST0000*.state | \ tr -d '\-\[\] ' | \ @@ -519,10 +521,10 @@ test_18c() { do_facet ost1 lctl set_param fail_loc=0x80000225 # force reconnect sleep 1 - df $MOUNT > /dev/null 2>&1 + $LFS df $MOUNT > /dev/null 2>&1 sleep 2 # my understanding is that there should be nothing in the page - # cache after the client reconnects? + # cache after the client reconnects? 
rc=0 pgcache_empty || rc=2 rm -f $f $TMP/$tfile @@ -1495,6 +1497,11 @@ check_target_ir_state() local recovery_proc=obdfilter.${!name}.recovery_status local st + while : ; do + st=$(do_facet $target "$LCTL get_param -n $recovery_proc | + awk '/status:/{ print \\\$2}'") + [ x$st = xRECOVERING ] || break + done st=$(do_facet $target "lctl get_param -n $recovery_proc | awk '/IR:/{ print \\\$2}'") [ $st != ON -o $st != OFF -o $st != ENABLED -o $st != DISABLED ] || diff --git a/lustre/tests/replay-ost-single.sh b/lustre/tests/replay-ost-single.sh index 476b91d..ac9f543 100755 --- a/lustre/tests/replay-ost-single.sh +++ b/lustre/tests/replay-ost-single.sh @@ -49,7 +49,7 @@ test_0a() { # needs to run during initial client->OST connection #define OBD_FAIL_OST_ALL_REPLY_NET 0x211 do_facet ost1 "lctl set_param fail_loc=0x80000211" - zconf_mount $(hostname) $MOUNT && df $MOUNT || error "0a mount fail" + zconf_mount $(hostname) $MOUNT && $LFS df $MOUNT || error "mount fail" } run_test 0a "target handle mismatch (bug 5317)" @@ -382,12 +382,12 @@ test_8e() { sleep 1 # ensure we have a fresh statfs #define OBD_FAIL_OST_STATFS_EINPROGRESS 0x231 do_facet ost1 "lctl set_param fail_loc=0x231" - df $MOUNT & + $LFS df $MOUNT & dfpid=$! sleep $TIMEOUT if ! ps -p $dfpid > /dev/null 2>&1; then - do_facet ost1 "lctl set_param fail_loc=0" - error "df shouldn't have completed!" + do_facet ost1 "lctl set_param fail_loc=0" + error "df shouldn't have completed!" 
fi } run_test 8e "Verify that ptlrpc resends request on -EINPROGRESS" diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index 427683d..6d6843a 100755 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -2425,16 +2425,17 @@ client_evicted() { } client_reconnect_try() { - uname -n >> $MOUNT/recon - if [ -z "$CLIENTS" ]; then - df $MOUNT; uname -n >> $MOUNT/recon - else - do_nodes $CLIENTS "df $MOUNT; uname -n >> $MOUNT/recon" > /dev/null - fi - echo Connected clients: - cat $MOUNT/recon - ls -l $MOUNT/recon > /dev/null - rm $MOUNT/recon + local f=$MOUNT/recon + + uname -n >> $f + if [ -z "$CLIENTS" ]; then + $LFS df $MOUNT; uname -n >> $f + else + do_nodes $CLIENTS "$LFS df $MOUNT; uname -n >> $f" > /dev/null + fi + echo "Connected clients: $(cat $f)" + ls -l $f > /dev/null + rm $f } client_reconnect() { @@ -2529,7 +2530,7 @@ obd_name() { replay_barrier() { local facet=$1 do_facet $facet "sync; sync; sync" - df $MOUNT + $LFS df $MOUNT # make sure there will be no seq change local clients=${CLIENTS:-$HOSTNAME} @@ -2614,7 +2615,7 @@ fail() { facet_failover $* || error "failover: $?" wait_clients_import_state "$clients" "$facets" FULL - clients_up || error "post-failover df: $?" + clients_up || error "post-failover stat: $?" } fail_nodf() { @@ -2628,8 +2629,8 @@ fail_abort() { change_active $facet wait_for_facet $facet mount_facet $facet -o abort_recovery - clients_up || echo "first df failed: $?" - clients_up || error "post-failover df: $?" + clients_up || echo "first stat failed: $?" + clients_up || error "post-failover stat: $?" } do_lmc() { -- 1.8.3.1