Whamcloud - gitweb
LU-13073 osp: don't block waiting for new objects 02/43202/2
authorAlex Zhuravlev <bzzz@whamcloud.com>
Fri, 16 Oct 2020 16:09:04 +0000 (19:09 +0300)
committerOleg Drokin <green@whamcloud.com>
Wed, 5 May 2021 21:23:19 +0000 (21:23 +0000)
If an OST is down, a few threads trying to get an already
precreated object can get stuck. Even worse, all QoS-based
allocations are then serialized by a single semaphore,
even those that would not try to allocate on the
failed OST.

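The patch introduces a dah_can_block flag in the allocation
hint, which is passed down to OSP. The QoS code then first tries
to allocate objects in a non-blocking manner, and only falls back
to blocking allocation if that does not find enough objects.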

Lustre-change: https://review.whamcloud.com/40274
Lustre-commit: 2112ccb3c48ccf86aaf2a61c9f040571a6323f9c

Signed-off-by: Alex Zhuravlev <bzzz@whamcloud.com>
Change-Id: I38e66d7569aefecf800dbc32f1049ac87853439e
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Yingjin Qian <qian@ddn.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Signed-off-by: Etienne AUJAMES <eaujames@ddn.com>
Reviewed-on: https://review.whamcloud.com/43202
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
lustre/include/dt_object.h
lustre/lod/lod_internal.h
lustre/lod/lod_qos.c
lustre/osp/osp_internal.h
lustre/osp/osp_object.c
lustre/osp/osp_precreate.c
lustre/tests/sanity.sh

index 5afe481..d27a7d9 100644 (file)
@@ -379,6 +379,7 @@ struct dt_allocation_hint {
        int                     dah_eadata_len;
        __u32                   dah_mode;
        int                     dah_append_stripes;
+       bool                    dah_can_block;
        char                    *dah_append_pool;
 };
 
index 62bab33..bef4db5 100644 (file)
@@ -446,6 +446,7 @@ struct lod_thread_info {
        struct lu_attr                  lti_layout_attr;
        /* object allocation avoid guide info */
        struct lod_avoid_guide          lti_avoid;
+       struct dt_allocation_hint       lti_ah;
 };
 
 extern const struct lu_device_operations lod_lu_ops;
index f5754e8..ebb36c0 100644 (file)
@@ -690,8 +690,10 @@ static int lod_qos_calc_rr(struct lod_device *lod, struct ost_pool *src_pool,
 static struct dt_object *lod_qos_declare_object_on(const struct lu_env *env,
                                                   struct lod_device *d,
                                                   __u32 ost_idx,
+                                                  bool can_block,
                                                   struct thandle *th)
 {
+       struct dt_allocation_hint *ah = &lod_env_info(env)->lti_ah;
        struct lod_tgt_desc *ost;
        struct lu_object *o, *n;
        struct lu_device *nd;
@@ -725,7 +727,8 @@ static struct dt_object *lod_qos_declare_object_on(const struct lu_env *env,
 
        dt = container_of(n, struct dt_object, do_lu);
 
-       rc = lod_sub_declare_create(env, dt, NULL, NULL, NULL, th);
+       ah->dah_can_block = can_block;
+       rc = lod_sub_declare_create(env, dt, NULL, ah, NULL, th);
        if (rc < 0) {
                CDEBUG(D_OTHER, "can't declare creation on #%u: %d\n",
                       ost_idx, rc);
@@ -995,7 +998,7 @@ static int lod_check_and_reserve_ost(const struct lu_env *env,
        if (lod_qos_is_ost_used(env, ost_idx, stripe_idx))
                RETURN(rc);
 
-       o = lod_qos_declare_object_on(env, lod, ost_idx, th);
+       o = lod_qos_declare_object_on(env, lod, ost_idx, true, th);
        if (IS_ERR(o)) {
                CDEBUG(D_OTHER, "can't declare new object on #%u: %d\n",
                       ost_idx, (int) PTR_ERR(o));
@@ -1261,7 +1264,7 @@ static int lod_alloc_ost_list(const struct lu_env *env, struct lod_object *lo,
                if (rc < 0) /* this OSP doesn't feel well */
                        break;
 
-               o = lod_qos_declare_object_on(env, m, ost_idx, th);
+               o = lod_qos_declare_object_on(env, m, ost_idx, true, th);
                if (IS_ERR(o)) {
                        rc = PTR_ERR(o);
                        CDEBUG(D_OTHER,
@@ -1404,7 +1407,7 @@ repeat_find:
                if (i != 0 && sfs->os_fprecreated == 0 && speed == 0)
                        continue;
 
-               o = lod_qos_declare_object_on(env, m, ost_idx, th);
+               o = lod_qos_declare_object_on(env, m, ost_idx, true, th);
                if (IS_ERR(o)) {
                        CDEBUG(D_OTHER, "can't declare new object on #%u: %d\n",
                               ost_idx, (int) PTR_ERR(o));
@@ -1527,6 +1530,7 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo,
        struct ost_pool *osts;
        unsigned int i;
        __u32 nfound, good_osts, stripe_count, stripe_count_min;
+       bool slow = false;
        int rc = 0;
        ENTRY;
 
@@ -1646,6 +1650,7 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo,
                 * 0-weight OSTs will always get used last (only when rand=0) */
                for (i = 0; i < osts->op_count; i++) {
                        __u32 idx = osts->op_array[i];
+                       struct lod_tgt_desc *ost = OST_TGT(lod, idx);
 
                        if (lod_should_avoid_ost(lo, lag, idx))
                                continue;
@@ -1672,7 +1677,7 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo,
                            lod_comp_is_ost_used(env, lo, idx))
                                continue;
 
-                       o = lod_qos_declare_object_on(env, lod, idx, th);
+                       o = lod_qos_declare_object_on(env, lod, idx, slow, th);
                        if (IS_ERR(o)) {
                                QOS_DEBUG("can't declare object on #%u: %d\n",
                                          idx, (int) PTR_ERR(o));
@@ -1689,6 +1694,13 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo,
                        break;
                }
 
+               if (rc && !slow && nfound < stripe_count) {
+                       /* couldn't allocate using precreated objects
+                        * so try to wait for new precreations */
+                       slow = true;
+                       rc = 0;
+               }
+
                if (rc) {
                        /* no OST found on this iteration, give up */
                        break;
index 2c10cb9..a223c79 100644 (file)
@@ -858,7 +858,8 @@ extern const struct dt_index_operations osp_md_index_ops;
 
 /* osp_precreate.c */
 int osp_init_precreate(struct osp_device *d);
-int osp_precreate_reserve(const struct lu_env *env, struct osp_device *d);
+int osp_precreate_reserve(const struct lu_env *env,
+                         struct osp_device *d, bool can_block);
 __u64 osp_precreate_get_id(struct osp_device *d);
 int osp_precreate_get_fid(const struct lu_env *env, struct osp_device *d,
                          struct lu_fid *fid);
index 336eb54..8c42210 100644 (file)
@@ -1446,7 +1446,7 @@ static int osp_declare_create(const struct lu_env *env, struct dt_object *dt,
         * in declaration we need to reserve object so that we don't block
         * awaiting precreation RPC to complete
         */
-       rc = osp_precreate_reserve(env, d);
+       rc = osp_precreate_reserve(env, d, !hint || hint->dah_can_block);
        /*
         * we also need to declare update to local "last used id" file for
         * recovery if object isn't used for a reason, we need to release
index 2e585da..490fee5 100644 (file)
@@ -1439,7 +1439,8 @@ static int osp_precreate_timeout_condition(void *data)
  * \retval             -EAGAIN try later, slow precreation in progress
  * \retval             -EIO when no access to OST
  */
-int osp_precreate_reserve(const struct lu_env *env, struct osp_device *d)
+int osp_precreate_reserve(const struct lu_env *env, struct osp_device *d,
+                         bool can_block)
 {
        time64_t expire = ktime_get_seconds() + obd_timeout;
        struct l_wait_info lwi;
@@ -1529,6 +1530,13 @@ int osp_precreate_reserve(const struct lu_env *env, struct osp_device *d)
                        break;
                }
 
+
+               if (!can_block) {
+                       LASSERT(d->opd_pre);
+                       rc = -ENOBUFS;
+                       break;
+               }
+
                l_wait_event(d->opd_pre_user_waitq,
                             osp_precreate_ready_condition(env, d), &lwi);
        }
index 89d52fe..5fc5034 100755 (executable)
@@ -1738,6 +1738,57 @@ test_27o() {
 }
 run_test 27o "create file with all full OSTs (should error)"
 
+function create_and_checktime() {
+       local fname=$1
+       local loops=$2
+       local i
+
+       for ((i=0; i < $loops; i++)); do
+               local start=$SECONDS
+               multiop $fname-$i Oc
+               ((SECONDS-start < TIMEOUT)) ||
+                       error "creation took " $((SECONDS-$start)) && return 1
+       done
+}
+
+test_27oo() {
+       local mdts=$(comma_list $(mdts_nodes))
+
+       [ $MDS1_VERSION -lt $(version_code 2.12.6) ] &&
+               skip "Need MDS version at least 2.12.6"
+
+       local f0=$DIR/${tfile}-0
+       local f1=$DIR/${tfile}-1
+
+       wait_delete_completed
+
+       # refill precreated objects
+       $LFS setstripe -i0 -c1 $f0
+
+       saved=$(do_facet mds1 $LCTL get_param -n lov.*0000*.qos_threshold_rr)
+       # force QoS allocation policy
+       do_nodes $mdts $LCTL set_param lov.*.qos_threshold_rr=0%
+       stack_trap "do_nodes $mdts $LCTL set_param \
+               lov.*.qos_threshold_rr=$saved" EXIT
+       sleep_maxage
+
+       # one OST is unavailable, but still have few objects preallocated
+       stop ost1
+       stack_trap "start ost1 $(ostdevname 1) $OST_MOUNT_OPTS; \
+               rm -rf $f1 $DIR/$tdir*" EXIT
+
+       for ((i=0; i < 7; i++)); do
+               mkdir $DIR/$tdir$i || error "can't create dir"
+               $LFS setstripe -c$((OSTCOUNT-1)) $DIR/$tdir$i ||
+                       error "can't set striping"
+       done
+       for ((i=0; i < 7; i++)); do
+               create_and_checktime $DIR/$tdir$i/$tfile 100 &
+       done
+       wait
+}
+run_test 27oo "don't let few threads to reserve too many objects"
+
 test_27p() {
        [[ $OSTCOUNT -lt 2 ]] && skip_env "needs >= 2 OSTs"
        [ $PARALLEL == "yes" ] && skip "skip parallel run"