Whamcloud - gitweb
LU-13073 osp: don't block waiting for new objects
author: Alex Zhuravlev <bzzz@whamcloud.com>
Fri, 16 Oct 2020 16:09:04 +0000 (19:09 +0300)
committer: Andreas Dilger <adilger@whamcloud.com>
Wed, 5 May 2021 19:00:59 +0000 (19:00 +0000)
If an OST is down, a few threads trying to get already
precreated objects may get stuck. Even worse, all QoS-based
allocations are then serialized by a single semaphore,
even those that wouldn't try to allocate on the
failed OST.

The patch introduces a blocking-control flag (dah_can_block) in the
allocation hint, which is passed to OSP. The QoS code then first tries
to allocate objects in a non-blocking manner, blocking only if that fails.

Lustre-commit: 2112ccb3c48ccf86aaf2a61c9f040571a6323f9c
Lustre-change: https://review.whamcloud.com/40274

Signed-off-by: Alex Zhuravlev <bzzz@whamcloud.com>
Change-Id: I38e66d7569aefecf800dbc32f1049ac87853439e
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Yingjin Qian <qian@ddn.com>
Reviewed-on: https://review.whamcloud.com/43148
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
lustre/include/dt_object.h
lustre/lod/lod_internal.h
lustre/lod/lod_qos.c
lustre/osp/osp_internal.h
lustre/osp/osp_object.c
lustre/osp/osp_precreate.c
lustre/tests/sanity.sh

index 74abebc..33296de 100644 (file)
@@ -402,6 +402,7 @@ struct dt_allocation_hint {
        int                     dah_eadata_len;
        __u32                   dah_mode;
        int                     dah_append_stripes;
+       bool                    dah_can_block;
        char                    *dah_append_pool;
 };
 
index 7a63eb2..d4be19a 100644 (file)
@@ -438,6 +438,7 @@ struct lod_thread_info {
        /* object allocation avoid guide info */
        struct lod_avoid_guide          lti_avoid;
        union lmv_mds_md                lti_lmv;
+       struct dt_allocation_hint       lti_ah;
 };
 
 extern const struct lu_device_operations lod_lu_ops;
index cf7e209..8e3f8b1 100644 (file)
@@ -370,8 +370,10 @@ static int lod_qos_calc_rr(struct lod_device *lod, struct lu_tgt_descs *ltd,
 static struct dt_object *lod_qos_declare_object_on(const struct lu_env *env,
                                                   struct lod_device *d,
                                                   __u32 ost_idx,
+                                                  bool can_block,
                                                   struct thandle *th)
 {
+       struct dt_allocation_hint *ah = &lod_env_info(env)->lti_ah;
        struct lod_tgt_desc *ost;
        struct lu_object *o, *n;
        struct lu_device *nd;
@@ -405,7 +407,8 @@ static struct dt_object *lod_qos_declare_object_on(const struct lu_env *env,
 
        dt = container_of(n, struct dt_object, do_lu);
 
-       rc = lod_sub_declare_create(env, dt, NULL, NULL, NULL, th);
+       ah->dah_can_block = can_block;
+       rc = lod_sub_declare_create(env, dt, NULL, ah, NULL, th);
        if (rc < 0) {
                CDEBUG(D_OTHER, "can't declare creation on #%u: %d\n",
                       ost_idx, rc);
@@ -683,7 +686,7 @@ static int lod_check_and_reserve_ost(const struct lu_env *env,
                        RETURN(rc);
        }
 
-       o = lod_qos_declare_object_on(env, lod, ost_idx, th);
+       o = lod_qos_declare_object_on(env, lod, ost_idx, true, th);
        if (IS_ERR(o)) {
                CDEBUG(D_OTHER, "can't declare new object on #%u: %d\n",
                       ost_idx, (int) PTR_ERR(o));
@@ -1162,7 +1165,7 @@ static int lod_alloc_ost_list(const struct lu_env *env, struct lod_object *lo,
                if (rc < 0) /* this OSP doesn't feel well */
                        break;
 
-               o = lod_qos_declare_object_on(env, m, ost_idx, th);
+               o = lod_qos_declare_object_on(env, m, ost_idx, true, th);
                if (IS_ERR(o)) {
                        rc = PTR_ERR(o);
                        CDEBUG(D_OTHER,
@@ -1321,7 +1324,7 @@ repeat_find:
                if (i && !tgt->ltd_statfs.os_fprecreated && !speed)
                        continue;
 
-               o = lod_qos_declare_object_on(env, m, ost_idx, th);
+               o = lod_qos_declare_object_on(env, m, ost_idx, true, th);
                if (IS_ERR(o)) {
                        CDEBUG(D_OTHER, "can't declare new object on #%u: %d\n",
                               ost_idx, (int) PTR_ERR(o));
@@ -1423,6 +1426,7 @@ static int lod_ost_alloc_qos(const struct lu_env *env, struct lod_object *lo,
        __u32 nfound, good_osts, stripe_count, stripe_count_min;
        bool overstriped = false;
        int stripes_per_ost = 1;
+       bool slow = false;
        int rc = 0;
        ENTRY;
 
@@ -1528,6 +1532,7 @@ static int lod_ost_alloc_qos(const struct lu_env *env, struct lod_object *lo,
                 */
                for (i = 0; i < osts->op_count; i++) {
                        __u32 idx = osts->op_array[i];
+                       struct lod_tgt_desc *ost = OST_TGT(lod, idx);
 
                        if (lod_should_avoid_ost(lo, lag, idx))
                                continue;
@@ -1563,7 +1568,7 @@ static int lod_ost_alloc_qos(const struct lu_env *env, struct lod_object *lo,
                                        continue;
                        }
 
-                       o = lod_qos_declare_object_on(env, lod, idx, th);
+                       o = lod_qos_declare_object_on(env, lod, idx, slow, th);
                        if (IS_ERR(o)) {
                                QOS_DEBUG("can't declare object on #%u: %d\n",
                                          idx, (int) PTR_ERR(o));
@@ -1580,6 +1585,13 @@ static int lod_ost_alloc_qos(const struct lu_env *env, struct lod_object *lo,
                        break;
                }
 
+               if (rc && !slow && nfound < stripe_count) {
+                       /* couldn't allocate using precreated objects
+                        * so try to wait for new precreations */
+                       slow = true;
+                       rc = 0;
+               }
+
                if (rc) {
                        /* no OST found on this iteration, give up */
                        break;
index 7246cd2..7438355 100644 (file)
@@ -849,7 +849,8 @@ extern const struct dt_index_operations osp_md_index_ops;
 
 /* osp_precreate.c */
 int osp_init_precreate(struct osp_device *d);
-int osp_precreate_reserve(const struct lu_env *env, struct osp_device *d);
+int osp_precreate_reserve(const struct lu_env *env,
+                         struct osp_device *d, bool can_block);
 __u64 osp_precreate_get_id(struct osp_device *d);
 int osp_precreate_get_fid(const struct lu_env *env, struct osp_device *d,
                          struct lu_fid *fid);
index fabf7d6..a37cf14 100644 (file)
@@ -1446,7 +1446,7 @@ static int osp_declare_create(const struct lu_env *env, struct dt_object *dt,
         * in declaration we need to reserve object so that we don't block
         * awaiting precreation RPC to complete
         */
-       rc = osp_precreate_reserve(env, d);
+       rc = osp_precreate_reserve(env, d, !hint || hint->dah_can_block);
        /*
         * we also need to declare update to local "last used id" file for
         * recovery if object isn't used for a reason, we need to release
index bbbf7c4..1f2addb 100644 (file)
@@ -1406,7 +1406,8 @@ static int osp_precreate_ready_condition(const struct lu_env *env,
  * \retval             -EAGAIN try later, slow precreation in progress
  * \retval             -EIO when no access to OST
  */
-int osp_precreate_reserve(const struct lu_env *env, struct osp_device *d)
+int osp_precreate_reserve(const struct lu_env *env, struct osp_device *d,
+                         bool can_block)
 {
        time64_t expire = ktime_get_seconds() + obd_timeout;
        int precreated, rc, synced = 0;
@@ -1494,6 +1495,12 @@ int osp_precreate_reserve(const struct lu_env *env, struct osp_device *d)
                        break;
                }
 
+               if (!can_block) {
+                       LASSERT(d->opd_pre);
+                       rc = -ENOBUFS;
+                       break;
+               }
+
                if (wait_event_idle_timeout(
                            d->opd_pre_user_waitq,
                            osp_precreate_ready_condition(env, d),
index fc56e10..a731b80 100755 (executable)
@@ -1908,6 +1908,57 @@ test_27o() {
 }
 run_test 27o "create file with all full OSTs (should error)"
 
+function create_and_checktime() {
+       local fname=$1
+       local loops=$2
+       local i
+
+       for ((i=0; i < $loops; i++)); do
+               local start=$SECONDS
+               multiop $fname-$i Oc
+               ((SECONDS-start < TIMEOUT)) ||
+                       error "creation took " $((SECONDS-$start)) && return 1
+       done
+}
+
+test_27oo() {
+       local mdts=$(comma_list $(mdts_nodes))
+
+       [ $MDS1_VERSION -lt $(version_code 2.13.57) ] &&
+               skip "Need MDS version at least 2.13.57"
+
+       local f0=$DIR/${tfile}-0
+       local f1=$DIR/${tfile}-1
+
+       wait_delete_completed
+
+       # refill precreated objects
+       $LFS setstripe -i0 -c1 $f0
+
+       saved=$(do_facet mds1 $LCTL get_param -n lov.*0000*.qos_threshold_rr)
+       # force QoS allocation policy
+       do_nodes $mdts $LCTL set_param lov.*.qos_threshold_rr=0%
+       stack_trap "do_nodes $mdts $LCTL set_param \
+               lov.*.qos_threshold_rr=$saved" EXIT
+       sleep_maxage
+
+       # one OST is unavailable, but still have few objects preallocated
+       stop ost1
+       stack_trap "start ost1 $(ostdevname 1) $OST_MOUNT_OPTS; \
+               rm -rf $f1 $DIR/$tdir*" EXIT
+
+       for ((i=0; i < 7; i++)); do
+               mkdir $DIR/$tdir$i || error "can't create dir"
+               $LFS setstripe -c$((OSTCOUNT-1)) $DIR/$tdir$i ||
+                       error "can't set striping"
+       done
+       for ((i=0; i < 7; i++)); do
+               create_and_checktime $DIR/$tdir$i/$tfile 100 &
+       done
+       wait
+}
+run_test 27oo "don't let few threads to reserve too many objects"
+
 test_27p() {
        [[ $OSTCOUNT -lt 2 ]] && skip_env "needs >= 2 OSTs"
        [ $PARALLEL == "yes" ] && skip "skip parallel run"