+ * Allocate a striping using an algorithm with weights.
+ *
+ * The function allocates remote MDT objects to create a striping; the first
+ * object has already been allocated on the current MDT to ensure the master
+ * object and the first object are on the same MDT. The algorithm used is based on weights
+ * (both free space and inodes), and it's trying to ensure the space/inodes are
+ * used evenly by MDTs and MDSs. The striping configuration (# of stripes,
+ * offset, pool) is taken from the object and is prepared by the caller.
+ *
+ * If the prepared configuration can't be met due to too few MDTs, then the
+ * allocation fails.
+ *
+ * No concurrent allocation is allowed on the object and this must be ensured
+ * by the caller. All the internal structures are protected by the function.
+ *
+ * The algorithm has two steps: find available MDTs and calculate their
+ * weights, then select the MDTs with their weights used as the probability.
+ * An MDT with a higher weight is proportionately more likely to be selected
+ * than one with a lower weight.
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] lo LOD object
+ * \param[out] stripes striping created
+ *
+ * \retval positive stripes allocated, and it should be equal to
+ * lo->ldo_dir_stripe_count
+ * \retval -EAGAIN not enough tgts are found for specified stripe count
+ * \retval -EINVAL requested MDT index is invalid
+ * \retval negative errno on failure
+ */
+int lod_mdt_alloc_qos(const struct lu_env *env, struct lod_object *lo,
+ struct dt_object **stripes)
+{
+ struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
+ struct lu_tgt_descs *ltd = &lod->lod_mdt_descs;
+ struct lu_object_conf conf = { .loc_flags = LOC_F_NEW };
+ struct lu_fid fid = { 0 };
+ const struct lu_tgt_pool *pool;
+ struct lu_tgt_desc *mdt;
+ struct dt_object *dto;
+ u64 total_weight = 0;
+ u32 stripe_count = lo->ldo_dir_stripe_count;
+ unsigned int nfound;
+ unsigned int good_mdts;
+ unsigned int i;
+ int rc = 0;
+
+ ENTRY;
+
+ /* Stripe 0 lives on the current MDT and was allocated by the caller,
+  * so a single-stripe directory needs no remote allocation at all. */
+ if (stripe_count == 1)
+ RETURN(1);
+
+ pool = &ltd->ltd_tgt_pool;
+
+ /* Detect -EAGAIN early, before expensive lock is taken. */
+ if (!ltd_qos_is_usable(ltd))
+ RETURN(-EAGAIN);
+
+ /* Do actual allocation, use write lock here. */
+ down_write(&ltd->ltd_qos.lq_rw_sem);
+
+ /*
+ * Check again, while we were sleeping on @lq_rw_sem things could
+ * change.
+ */
+ if (!ltd_qos_is_usable(ltd))
+ GOTO(unlock, rc = -EAGAIN);
+
+ rc = ltd_qos_penalties_calc(ltd);
+ if (rc)
+ GOTO(unlock, rc);
+
+ rc = lod_qos_tgt_in_use_clear(env, stripe_count);
+ if (rc)
+ GOTO(unlock, rc);
+
+ good_mdts = 0;
+ /* Find all the tgts that are valid stripe candidates */
+ for (i = 0; i < pool->op_count; i++) {
+ if (!cfs_bitmap_check(ltd->ltd_tgt_bitmap, pool->op_array[i]))
+ continue;
+
+ mdt = LTD_TGT(ltd, pool->op_array[i]);
+ mdt->ltd_qos.ltq_usable = 0;
+
+ rc = lod_is_tgt_usable(ltd, mdt);
+ if (rc)
+ continue;
+
+ /* Don't place new stripes on targets reporting degraded state. */
+ if (mdt->ltd_statfs.os_state & OS_STATE_DEGRADED)
+ continue;
+
+ mdt->ltd_qos.ltq_usable = 1;
+ lu_tgt_qos_weight_calc(mdt);
+ total_weight += mdt->ltd_qos.ltq_weight;
+
+ good_mdts++;
+ }
+
+ QOS_DEBUG("found %d good tgts\n", good_mdts);
+
+ /* Only stripe_count - 1 remote objects are needed: stripe 0 is
+  * already allocated on the local MDT. */
+ if (good_mdts < stripe_count - 1)
+ GOTO(unlock, rc = -EAGAIN);
+
+ /* Find enough tgts with weighted random allocation. */
+ nfound = 1;
+ while (nfound < stripe_count) {
+ u64 rand, cur_weight;
+
+ cur_weight = 0;
+ /* Assume failure for this pass; cleared below once a target
+  * accepts the stripe. */
+ rc = -ENOSPC;
+
+ rand = lu_prandom_u64_max(total_weight);
+
+ /* On average, this will hit larger-weighted tgts more often.
+ * 0-weight tgts will always get used last (only when rand=0) */
+ for (i = 0; i < pool->op_count; i++) {
+ __u32 idx = pool->op_array[i];
+ int rc2;
+
+ mdt = LTD_TGT(ltd, idx);
+
+ if (!mdt->ltd_qos.ltq_usable)
+ continue;
+
+ cur_weight += mdt->ltd_qos.ltq_weight;
+
+ QOS_DEBUG("idx=%d nfound=%d cur_weight=%llu rand=%llu total_weight=%llu\n",
+ idx, nfound, cur_weight, rand,
+ total_weight);
+
+ if (cur_weight < rand)
+ continue;
+
+ QOS_DEBUG("stripe=%d to idx=%d\n", nfound, idx);
+
+ /* Don't put two stripes of one dir on the same MDT. */
+ if (lod_qos_is_tgt_used(env, idx, nfound))
+ continue;
+
+ rc2 = obd_fid_alloc(env, mdt->ltd_exp, &fid, NULL);
+ if (rc2) {
+ QOS_DEBUG("can't alloc FID on #%u: %d\n",
+ idx, rc2);
+ continue;
+ }
+
+ conf.loc_flags = LOC_F_NEW;
+ dto = dt_locate_at(env, mdt->ltd_tgt, &fid,
+ lo->ldo_obj.do_lu.lo_dev->ld_site->ls_top_dev,
+ &conf);
+ if (IS_ERR(dto)) {
+ QOS_DEBUG("can't alloc stripe on #%u: %d\n",
+ idx, (int) PTR_ERR(dto));
+ continue;
+ }
+
+ lod_qos_tgt_in_use(env, nfound, idx);
+ stripes[nfound] = dto;
+ /* Rebalance remaining weights after taking this target. */
+ ltd_qos_update(ltd, mdt, &total_weight);
+ nfound++;
+ rc = 0;
+ break;
+ }
+
+ /* no MDT found on this iteration, give up */
+ if (rc)
+ break;
+ }
+
+ if (unlikely(nfound != stripe_count)) {
+ /*
+ * when the decision to use weighted algorithm was made
+ * we had enough appropriate OSPs, but this state can
+ * change anytime (no space on MDT, broken connection, etc)
+ * so it's possible OSP won't be able to provide us with
+ * an object due to just changed state
+ */
+ QOS_DEBUG("%s: wanted %d objects, found only %d\n",
+ lod2obd(lod)->obd_name, stripe_count, nfound);
+ /* Release only the remote objects allocated here; stripes[0]
+  * is owned by the caller. */
+ for (i = 1; i < nfound; i++) {
+ LASSERT(stripes[i] != NULL);
+ dt_object_put(env, stripes[i]);
+ stripes[i] = NULL;
+ }
+
+ /* makes sense to rebalance next time */
+ ltd->ltd_qos.lq_dirty = 1;
+ ltd->ltd_qos.lq_same_space = 0;
+
+ rc = -EAGAIN;
+ } else {
+ rc = nfound;
+ }
+
+unlock:
+ up_write(&ltd->ltd_qos.lq_rw_sem);
+
+ RETURN(rc);
+}
+
+/**