From 74abc8b96393894fb8e5f600d66ee58f162110e3 Mon Sep 17 00:00:00 2001 From: shadow Date: Fri, 2 Oct 2009 11:12:28 +0000 Subject: [PATCH] Remove set_info(KEY_UNLINKED) from MDS/OSC Branch b1_8 b=19662 i=adilger i=shadow --- lustre/ChangeLog | 4 +++ lustre/include/obd.h | 1 - lustre/lov/lov_obd.c | 2 +- lustre/lov/lov_qos.c | 35 ++----------------- lustre/mds/mds_reint.c | 4 +-- lustre/osc/osc_create.c | 90 +++++++++++++++++++++++++----------------------- lustre/osc/osc_request.c | 43 ++++++++++++++++------- lustre/tests/sanity.sh | 26 +++++++------- 8 files changed, 98 insertions(+), 107 deletions(-) diff --git a/lustre/ChangeLog b/lustre/ChangeLog index b3340c5..9a68bc7 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -15,6 +15,10 @@ tbd Sun Microsystems, Inc. of Lustre filesystem with 4K stack may cause a stack overflow. For more information, please refer to bugzilla 17630. +Severity : enhancement +Bugzilla : 19662 +Description: Remove set_info(KEY_UNLINKED) from MDS/OSC + Severity : normal Bugzilla : 19917 Description: Drop unnecessary __GFP_NOMEMALLOC flag from filter_get_page() diff --git a/lustre/include/obd.h b/lustre/include/obd.h index 3229771..10375f0 100644 --- a/lustre/include/obd.h +++ b/lustre/include/obd.h @@ -1060,7 +1060,6 @@ enum obd_cleanup_stage { #define KEY_CHECKSUM "checksum" #define KEY_READONLY "read-only" #define KEY_READONLY_166COMPAT "readonly" -#define KEY_UNLINKED "unlinked" #define KEY_EVICT_BY_NID "evict_by_nid" #define KEY_REGISTER_TARGET "register_target" #define KEY_SET_FS "set_fs" diff --git a/lustre/lov/lov_obd.c b/lustre/lov/lov_obd.c index 9d1040a..9da4cd6 100644 --- a/lustre/lov/lov_obd.c +++ b/lustre/lov/lov_obd.c @@ -3011,7 +3011,7 @@ static int lov_set_info_async(struct obd_export *exp, obd_count keylen, next_id = 1; } else if (KEY_IS(KEY_CHECKSUM)) { do_inactive = 1; - } else if (KEY_IS(KEY_MDS_CONN) || KEY_IS(KEY_UNLINKED)) { + } else if (KEY_IS(KEY_MDS_CONN)) { check_uuid = val ? 
1 : 0; } else if (KEY_IS(KEY_EVICT_BY_NID)) { /* use defaults: diff --git a/lustre/lov/lov_qos.c b/lustre/lov/lov_qos.c index 8d5556e..0397878 100644 --- a/lustre/lov/lov_qos.c +++ b/lustre/lov/lov_qos.c @@ -55,8 +55,6 @@ #define TGT_BAVAIL(i) (lov->lov_tgts[i]->ltd_exp->exp_obd->obd_osfs.os_bavail*\ lov->lov_tgts[i]->ltd_exp->exp_obd->obd_osfs.os_bsize) -#define TGT_FFREE(i) (lov->lov_tgts[i]->ltd_exp->exp_obd->obd_osfs.os_ffree) - int qos_add_tgt(struct obd_device *obd, __u32 index) { @@ -730,10 +728,8 @@ static int alloc_qos(struct obd_export *exp, int *idx_arr, int *stripe_cnt, char *poolname, int flags) { struct lov_obd *lov = &exp->exp_obd->u.lov; - static time_t last_warn = 0; - time_t now = cfs_time_current_sec(); - __u64 total_bavail, total_weight = 0; - int nfound, good_osts, i, warn = 0, rc = 0; + __u64 total_weight = 0; + int nfound, good_osts, i, rc = 0; int stripe_cnt_min = min_stripe_count(*stripe_cnt, flags); struct pool_desc *pool; struct ost_pool *osts; @@ -768,35 +764,12 @@ static int alloc_qos(struct obd_export *exp, int *idx_arr, int *stripe_cnt, if (rc) GOTO(out, rc); - total_bavail = 0; good_osts = 0; - /* Warn users about zero available space/inode every 30 min */ - if (cfs_time_sub(now, last_warn) > 60 * 30) - warn = 1; /* Find all the OSTs that are valid stripe candidates */ for (i = 0; i < osts->op_count; i++) { - __u64 bavail; - if (!lov->lov_tgts[osts->op_array[i]] || !lov->lov_tgts[osts->op_array[i]]->ltd_active) continue; - bavail = TGT_BAVAIL(osts->op_array[i]); - if (!bavail) { - if (warn) { - CDEBUG(D_QOS, "no free space on %s\n", - obd_uuid2str(&lov->lov_tgts[osts->op_array[i]]->ltd_uuid)); - last_warn = now; - } - continue; - } - if (!TGT_FFREE(osts->op_array[i])) { - if (warn) { - CDEBUG(D_QOS, "no free inodes on %s\n", - obd_uuid2str(&lov->lov_tgts[osts->op_array[i]]->ltd_uuid)); - last_warn = now; - } - continue; - } /* Fail Check before osc_precreate() is called so we can only 'fail' single OSC. 
*/ @@ -808,7 +781,6 @@ static int alloc_qos(struct obd_export *exp, int *idx_arr, int *stripe_cnt, lov->lov_tgts[osts->op_array[i]]->ltd_qos.ltq_usable = 1; qos_calc_weight(lov, osts->op_array[i]); - total_bavail += bavail; total_weight += lov->lov_tgts[osts->op_array[i]]->ltd_qos.ltq_weight; good_osts++; @@ -821,9 +793,6 @@ static int alloc_qos(struct obd_export *exp, int *idx_arr, int *stripe_cnt, if (good_osts < stripe_cnt_min) GOTO(out, rc = -EAGAIN); - if (!total_bavail) - GOTO(out, rc = -ENOSPC); - /* We have enough osts */ if (good_osts < *stripe_cnt) *stripe_cnt = good_osts; diff --git a/lustre/mds/mds_reint.c b/lustre/mds/mds_reint.c index 6d23645..c3880f6 100644 --- a/lustre/mds/mds_reint.c +++ b/lustre/mds/mds_reint.c @@ -2085,9 +2085,7 @@ cleanup: inodes[0] = dparent ? dparent->d_inode : NULL; inodes[1] = child_inode; rc = mds_finish_transno(mds, inodes, handle, req, rc, 0, 0); - if (!rc) - (void)obd_set_info_async(mds->mds_osc_exp, sizeof(KEY_UNLINKED), - KEY_UNLINKED, 0, NULL, NULL); + cleanup_no_trans: switch(cleanup_phase) { case 5: /* pending_dir semaphore */ diff --git a/lustre/osc/osc_create.c b/lustre/osc/osc_create.c index 8f5af52..30b0736 100644 --- a/lustre/osc/osc_create.c +++ b/lustre/osc/osc_create.c @@ -127,7 +127,7 @@ static int osc_interpret_create(struct ptlrpc_request *req, void *data, int rc) oscc->oscc_flags |= OSCC_FLAG_RDONLY; case -ENOSPC: case -EFBIG: - if (rc != EROFS) { + if (rc != -EROFS) { oscc->oscc_flags |= OSCC_FLAG_NOSPC; if (body && rc == -ENOSPC) { oscc->oscc_last_id = body->oa.o_id; @@ -193,7 +193,8 @@ static int oscc_internal_create(struct osc_creator *oscc) LASSERT_SPIN_LOCKED(&oscc->oscc_lock); - if(oscc->oscc_flags & OSCC_FLAG_RECOVERING) { + if ((oscc->oscc_flags & OSCC_FLAG_RECOVERING) || + (oscc->oscc_flags & OSCC_FLAG_DEGRADED)) { spin_unlock(&oscc->oscc_lock); RETURN(0); } @@ -283,9 +284,9 @@ static int oscc_wait_for_objects(struct osc_creator *oscc, int count) OSCC_FLAG_EXITING) & oscc->oscc_flags; have_objs 
= oscc_has_objects_nolock(oscc, count); - if (!ost_unusable) + if (!ost_unusable && !have_objs) /* they release lock himself */ - oscc_internal_create(oscc); + have_objs = oscc_internal_create(oscc); else spin_unlock(&oscc->oscc_lock); @@ -307,16 +308,6 @@ static int oscc_precreate(struct osc_creator *oscc) NULL, NULL); rc = l_wait_event(oscc->oscc_waitq, oscc_wait_for_objects(oscc, 1), &lwi); - - if (!oscc_has_objects(oscc, 1) || (oscc->oscc_flags & OSCC_FLAG_NOSPC)) - rc = -ENOSPC; - - if (oscc->oscc_flags & OSCC_FLAG_RDONLY) - rc = -EROFS; - - if (oscc->oscc_obd->u.cli.cl_import->imp_invalid) - rc = -EIO; - RETURN(rc); } @@ -348,10 +339,11 @@ int osc_precreate(struct obd_export *exp) if (imp != NULL && imp->imp_deactive) RETURN(1000); - /* until oscc in recovery - other flags is wrong */ + /* Handle critical states first */ spin_lock(&oscc->oscc_lock); if (oscc->oscc_flags & OSCC_FLAG_NOSPC || - oscc->oscc_flags & OSCC_FLAG_RDONLY) { + oscc->oscc_flags & OSCC_FLAG_RDONLY || + oscc->oscc_flags & OSCC_FLAG_EXITING) { spin_unlock(&oscc->oscc_lock); RETURN(1000); } @@ -373,7 +365,8 @@ int osc_precreate(struct obd_export *exp) RETURN(1); } - oscc_internal_create(oscc); + if (oscc_internal_create(oscc)) + RETURN(1000); RETURN(1); } @@ -390,9 +383,22 @@ static int handle_async_create(struct ptlrpc_request *req, int rc) if(rc) GOTO(out_wake, rc); - if ((oscc->oscc_flags & OSCC_FLAG_EXITING)) + /* Handle the critical type errors first. + * Should we also test cl_import state as well ? 
*/ + if (oscc->oscc_flags & OSCC_FLAG_EXITING) GOTO(out_wake, rc = -EIO); + if (oscc->oscc_flags & OSCC_FLAG_NOSPC) + GOTO(out_wake, rc = -ENOSPC); + + if (oscc->oscc_flags & OSCC_FLAG_RDONLY) + GOTO(out_wake, rc = -EROFS); + + /* should try to wait until recovery has finished */ + if((oscc->oscc_flags & OSCC_FLAG_RECOVERING) || + (oscc->oscc_flags & OSCC_FLAG_DEGRADED)) + RETURN(-EAGAIN); + if (oscc_has_objects_nolock(oscc, 1)) { memcpy(oa, &oscc->oscc_oa, sizeof(*oa)); oa->o_id = oscc->oscc_next_id; @@ -401,20 +407,10 @@ static int handle_async_create(struct ptlrpc_request *req, int rc) CDEBUG(D_RPCTRACE, " set oscc_next_id = "LPU64"\n", oscc->oscc_next_id); - GOTO(out_wake, rc = 0); + GOTO(out_wake, rc = 0); } - /* should be try wait until recovery finished */ - if(oscc->oscc_flags & OSCC_FLAG_RECOVERING) - RETURN(-EAGAIN); - - if (oscc->oscc_flags & OSCC_FLAG_NOSPC) - GOTO(out_wake, rc = -ENOSPC); - - if (oscc->oscc_flags & OSCC_FLAG_RDONLY) - GOTO(out_wake, rc = -EROFS); - - /* we not have objects now - continue wait */ + /* we don't have objects now - continue wait */ RETURN(-EAGAIN); out_wake: @@ -600,20 +596,34 @@ int osc_create(struct obd_export *exp, struct obdo *oa, oscc->oscc_obd->obd_name); rc = oscc_precreate(oscc); - if (rc) { + if (rc) CDEBUG(D_HA,"%s: error create %d\n", oscc->oscc_obd->obd_name, rc); + + spin_lock(&oscc->oscc_lock); + + /* woken up but recovery did not finish */ + if ((oscc->oscc_obd->u.cli.cl_import->imp_invalid) || + (oscc->oscc_flags & OSCC_FLAG_RECOVERING)) { + rc = -EIO; + spin_unlock(&oscc->oscc_lock); break; } - spin_lock(&oscc->oscc_lock); - if (oscc->oscc_flags & OSCC_FLAG_EXITING) { + if (oscc->oscc_flags & OSCC_FLAG_NOSPC) { + rc = -ENOSPC; + spin_unlock(&oscc->oscc_lock); break; } - /* wakeup but recovery not finished */ - if (oscc->oscc_flags & OSCC_FLAG_RECOVERING) { - rc = -EIO; + + if (oscc->oscc_flags & OSCC_FLAG_RDONLY) { + rc = -EROFS; + spin_unlock(&oscc->oscc_lock); + break; + } + + // Should we report -EIO error ? 
+ if (oscc->oscc_flags & OSCC_FLAG_EXITING) { spin_unlock(&oscc->oscc_lock); break; } @@ -629,14 +639,6 @@ int osc_create(struct obd_export *exp, struct obdo *oa, CDEBUG(D_RPCTRACE, "%s: set oscc_next_id = "LPU64"\n", exp->exp_obd->obd_name, oscc->oscc_next_id); break; - } else if (oscc->oscc_flags & OSCC_FLAG_NOSPC) { - rc = -ENOSPC; - spin_unlock(&oscc->oscc_lock); - break; - } else if (oscc->oscc_flags & OSCC_FLAG_RDONLY) { - rc = -EROFS; - spin_unlock(&oscc->oscc_lock); - break; } spin_unlock(&oscc->oscc_lock); diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index a3051a5..ab0e243 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -3550,6 +3550,7 @@ static int osc_statfs_interpret(struct ptlrpc_request *req, struct osc_async_args *aa = data; struct client_obd *cli = &req->rq_import->imp_obd->u.cli; struct obd_statfs *msfs; + __u64 used; ENTRY; if (rc == -EBADR) @@ -3577,12 +3578,39 @@ static int osc_statfs_interpret(struct ptlrpc_request *req, /* Reinitialize the RDONLY and DEGRADED flags at the client * on each statfs, so they don't stay set permanently. */ spin_lock(&cli->cl_oscc.oscc_lock); - cli->cl_oscc.oscc_flags &= ~(OSCC_FLAG_RDONLY | OSCC_FLAG_DEGRADED); - if (msfs->os_state & OS_STATE_DEGRADED) + + if (unlikely(msfs->os_state & OS_STATE_DEGRADED)) cli->cl_oscc.oscc_flags |= OSCC_FLAG_DEGRADED; + else if (unlikely(cli->cl_oscc.oscc_flags & OSCC_FLAG_DEGRADED)) + cli->cl_oscc.oscc_flags &= ~OSCC_FLAG_DEGRADED; - if (msfs->os_state & OS_STATE_READONLY) + if (unlikely(msfs->os_state & OS_STATE_READONLY)) cli->cl_oscc.oscc_flags |= OSCC_FLAG_RDONLY; + else if (unlikely(cli->cl_oscc.oscc_flags & OSCC_FLAG_RDONLY)) + cli->cl_oscc.oscc_flags &= ~OSCC_FLAG_RDONLY; + + /* Add a bit of hysteresis so this flag isn't continually flapping, + * and ensure that new files don't get extremely fragmented due to + * only a small amount of available space in the filesystem. 
+ * We want to set the NOSPC flag when there is less than ~0.1% free + * and clear it when there is at least ~0.2% free space, so: + * avail < ~0.1% max max = avail + used + * 1025 * avail < avail + used used = blocks - free + * 1024 * avail < used + * 1024 * avail < blocks - free + * avail < ((blocks - free) >> 10) + * + * On a very large disk, say 16TB, 0.1% will be 16 GB. We don't want to + * lose that amount of space so in those cases we report no space left + * if there is less than 1 GB left. */ + used = min((msfs->os_blocks - msfs->os_bfree) >> 10, 1 << 30); + if (unlikely(((cli->cl_oscc.oscc_flags & OSCC_FLAG_NOSPC) == 0) && + ((msfs->os_ffree < 32) || (msfs->os_bavail < used)))) + cli->cl_oscc.oscc_flags |= OSCC_FLAG_NOSPC; + else if (unlikely(((cli->cl_oscc.oscc_flags & OSCC_FLAG_NOSPC) != 0) && + (msfs->os_ffree > 64) && (msfs->os_bavail > (used << 1)))) + cli->cl_oscc.oscc_flags &= ~OSCC_FLAG_NOSPC; + spin_unlock(&cli->cl_oscc.oscc_lock); memcpy(aa->aa_oi->oi_osfs, msfs, sizeof(*msfs)); @@ -4012,15 +4040,6 @@ static int osc_set_info_async(struct obd_export *exp, obd_count keylen, RETURN(0); } - if (KEY_IS(KEY_UNLINKED)) { - struct osc_creator *oscc = &obd->u.cli.cl_oscc; - - spin_lock(&oscc->oscc_lock); - oscc->oscc_flags &= ~OSCC_FLAG_NOSPC; - spin_unlock(&oscc->oscc_lock); - RETURN(0); - } - if (KEY_IS(KEY_INIT_RECOV)) { if (vallen != sizeof(int)) RETURN(-EINVAL); diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 86a899a..09cff2f 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -941,17 +941,22 @@ test_27m() { } run_test 27m "create file while OST0 was full ==================" -# osc's keep a NOSPC stick flag that gets unset with rmdir +sleep_maxage() { + local DELAY=$(do_facet mds lctl get_param -n lov.*.qos_maxage | awk '{print $1 + 2}') + sleep $DELAY +} + +# OSCs keep a NOSPC flag that will be reset after ~5s (qos_maxage) +# if the OST isn't full anymore. 
reset_enospc() { local FAIL_LOC=${1:-0} local OSTIDX=${2:-""} - mkdir -p $DIR/d27/nospc - rmdir $DIR/d27/nospc local list=$(comma_list $(osts_nodes)) [ "$OSTIDX" ] && list=$(facet_host ost$((OSTIDX + 1))) do_nodes $list lctl set_param fail_loc=$FAIL_LOC + sleep_maxage } exhaust_precreations() { @@ -1006,7 +1011,6 @@ test_27o() { reset_enospc rm -f $DIR/d27/f27o exhaust_all_precreations 0x215 - sleep 5 touch $DIR/d27/f27o && error "able to create $DIR/d27/f27o" @@ -1021,6 +1025,7 @@ test_27p() { reset_enospc rm -f $DIR/d27/f27p + mkdir -p $DIR/d27 $MCREATE $DIR/d27/f27p || error $TRUNCATE $DIR/d27/f27p 80000000 || error @@ -1129,8 +1134,6 @@ test_27v() { # bug 4900 local START=`date +%s` createmany -o $DIR/$tdir/$tfile 32 - reset_enospc - local FINISH=`date +%s` local TIMEOUT=`lctl get_param -n timeout` [ $((FINISH - START)) -ge $((TIMEOUT / 2)) ] && \ @@ -1161,7 +1164,6 @@ run_test 27w "check lfs setstripe -c -s -i options =============" test_27x() { [ "$OSTCOUNT" -lt "2" ] && skip_env "$OSTCOUNT < 2 OSTs" && return - DELAY=$(do_facet mds lctl get_param -n lov.*.qos_maxage | awk '{print $1 + 2}') OFFSET=$(($OSTCOUNTi - 1)) OSTIDX=0 local OST=$(lfs osts | awk '/'${OSTIDX}': / { print $2 }' | sed -e 's/_UUID$//') @@ -1169,7 +1171,7 @@ test_27x() { mkdir -p $DIR/$tdir $SETSTRIPE $DIR/$tdir -c 1 # 1 stripe per file do_facet ost$OSTIDX lctl set_param -n obdfilter.$OST.degraded 1 - sleep $DELAY + sleep_maxage createmany -o $DIR/$tdir/$tfile $OSTCOUNT for i in `seq 0 $OFFSET`; do [ `$GETSTRIPE $DIR/$tdir/$tfile$i | grep -A 10 obdidx | awk '{print $1}' | grep -w "$OSTIDX"` ] && @@ -1184,7 +1186,6 @@ test_27y() { remote_mds_nodsh && skip "remote MDS with nodsh" && return MDS_OSCS=`do_facet mds lctl dl | awk '/[oO][sS][cC].*md[ts]/ { print $4 }'` - DELAY=$(do_facet mds lctl get_param -n lov.*.qos_maxage | awk '{print $1 + 2}') OFFSET=$(($OSTCOUNT-1)) OST=-1 for OSC in $MDS_OSCS; do @@ -1201,7 +1202,7 @@ test_27y() { $SETSTRIPE $DIR/$tdir -c 1 # 1 stripe / file do_facet 
ost$OSTIDX lctl set_param -n obdfilter.$OST.degraded 1 - sleep $DELAY + sleep_maxage createmany -o $DIR/$tdir/$tfile $OSTCOUNT do_facet ost$OSTIDX lctl set_param -n obdfilter.$OST.degraded 0 @@ -4434,7 +4435,6 @@ test_116() { echo -n "Free space priority " lctl get_param -n lov.*.qos_prio_free - DELAY=$(lctl get_param -n lov.*.qos_maxage | head -1 | awk '{print $1}') declare -a AVAIL free_min_max [ $MINV -gt 960000 ] && skip "too much free space in OST$MINI" &&\ @@ -4455,7 +4455,7 @@ test_116() { done FILL=$(($MINV / 4)) sync - sleep $DELAY + sleep_maxage free_min_max DIFF=$(($MAXV - $MINV)) @@ -4484,7 +4484,7 @@ test_116() { done echo "wrote $i 200k files" sync - sleep $DELAY + sleep_maxage echo "Note: free space may not be updated, so measurements might be off" free_min_max -- 1.8.3.1