Whamcloud - gitweb
LU-14762 lmv: compare space to mkdir on parent MDT
[fs/lustre-release.git] / lustre / obdclass / lu_tgt_descs.c
index c7d8bbe..d872db4 100644 (file)
@@ -64,7 +64,7 @@ u64 lu_prandom_u64_max(u64 ep_ro)
                 * 32 bits (truncated to the upper limit, if needed)
                 */
                if (ep_ro > 0xffffffffULL)
-                       rand = prandom_u32_max((u32)(ep_ro >> 32)) << 32;
+                       rand = (u64)prandom_u32_max((u32)(ep_ro >> 32)) << 32;
 
                if (rand == (ep_ro & 0xffffffff00000000ULL))
                        rand |= prandom_u32_max((u32)ep_ro);
@@ -79,13 +79,6 @@ u64 lu_prandom_u64_max(u64 ep_ro)
 }
 EXPORT_SYMBOL(lu_prandom_u64_max);
 
-void lu_qos_rr_init(struct lu_qos_rr *lqr)
-{
-       spin_lock_init(&lqr->lqr_alloc);
-       lqr->lqr_dirty = 1;
-}
-EXPORT_SYMBOL(lu_qos_rr_init);
-
 /**
  * Add a new target to Quality of Service (QoS) target table.
  *
@@ -110,10 +103,6 @@ int lu_qos_add_tgt(struct lu_qos *qos, struct lu_tgt_desc *tgt)
 
        ENTRY;
 
-       /* tgt not connected, this function will be called again later */
-       if (!exp)
-               RETURN(0);
-
        down_write(&qos->lq_rw_sem);
        /*
         * a bit hacky approach to learn NID of corresponding connection
@@ -164,9 +153,10 @@ int lu_qos_add_tgt(struct lu_qos *qos, struct lu_tgt_desc *tgt)
         */
        list_add_tail(&svr->lsq_svr_list, &tempsvr->lsq_svr_list);
 
-       qos->lq_dirty = 1;
-       qos->lq_rr.lqr_dirty = 1;
-
+       set_bit(LQ_DIRTY, &qos->lq_flags);
+#ifdef HAVE_SERVER_SUPPORT
+       set_bit(LQ_DIRTY, &qos->lq_rr.lqr_flags);
+#endif
 out:
        up_write(&qos->lq_rw_sem);
        RETURN(rc);
@@ -206,8 +196,10 @@ static int lu_qos_del_tgt(struct lu_qos *qos, struct lu_tgt_desc *ltd)
                OBD_FREE_PTR(svr);
        }
 
-       qos->lq_dirty = 1;
-       qos->lq_rr.lqr_dirty = 1;
+       set_bit(LQ_DIRTY, &qos->lq_flags);
+#ifdef HAVE_SERVER_SUPPORT
+       set_bit(LQ_DIRTY, &qos->lq_rr.lqr_flags);
+#endif
 out:
        up_write(&qos->lq_rw_sem);
        RETURN(rc);
@@ -236,14 +228,15 @@ static inline __u64 tgt_statfs_iavail(struct lu_tgt_desc *tgt)
 void lu_tgt_qos_weight_calc(struct lu_tgt_desc *tgt)
 {
        struct lu_tgt_qos *ltq = &tgt->ltd_qos;
-       __u64 temp, temp2;
+       __u64 penalty;
 
-       temp = (tgt_statfs_bavail(tgt) >> 16) * (tgt_statfs_iavail(tgt) >> 8);
-       temp2 = ltq->ltq_penalty + ltq->ltq_svr->lsq_penalty;
-       if (temp < temp2)
+       ltq->ltq_avail = (tgt_statfs_bavail(tgt) >> 16) *
+                        (tgt_statfs_iavail(tgt) >> 8);
+       penalty = ltq->ltq_penalty + ltq->ltq_svr->lsq_penalty;
+       if (ltq->ltq_avail < penalty)
                ltq->ltq_weight = 0;
        else
-               ltq->ltq_weight = temp - temp2;
+               ltq->ltq_weight = ltq->ltq_avail - penalty;
 }
 EXPORT_SYMBOL(lu_tgt_qos_weight_calc);
 
@@ -268,7 +261,7 @@ int lu_tgt_descs_init(struct lu_tgt_descs *ltd, bool is_mdt)
         * the tgt array and bitmap are allocated/grown dynamically as tgts are
         * added to the LOD/LMV, see lu_tgt_descs_add()
         */
-       ltd->ltd_tgt_bitmap = CFS_ALLOCATE_BITMAP(BITS_PER_LONG);
+       ltd->ltd_tgt_bitmap = bitmap_zalloc(BITS_PER_LONG, GFP_NOFS);
        if (!ltd->ltd_tgt_bitmap)
                return -ENOMEM;
 
@@ -279,15 +272,23 @@ int lu_tgt_descs_init(struct lu_tgt_descs *ltd, bool is_mdt)
        /* Set up allocation policy (QoS and RR) */
        INIT_LIST_HEAD(&ltd->ltd_qos.lq_svr_list);
        init_rwsem(&ltd->ltd_qos.lq_rw_sem);
-       ltd->ltd_qos.lq_dirty = 1;
-       ltd->ltd_qos.lq_reset = 1;
-       /* Default priority is toward free space balance */
-       ltd->ltd_qos.lq_prio_free = 232;
-       /* Default threshold for rr (roughly 17%) */
-       ltd->ltd_qos.lq_threshold_rr = 43;
+       set_bit(LQ_DIRTY, &ltd->ltd_qos.lq_flags);
+       set_bit(LQ_RESET, &ltd->ltd_qos.lq_flags);
        ltd->ltd_is_mdt = is_mdt;
-
-       lu_qos_rr_init(&ltd->ltd_qos.lq_rr);
+       /* MDT imbalance threshold is low to balance across MDTs
+        * relatively quickly, because each directory may result
+        * in a large number of files/subdirs created therein.
+        */
+       if (is_mdt) {
+               ltd->ltd_lmv_desc.ld_pattern = LMV_HASH_TYPE_DEFAULT;
+               ltd->ltd_qos.lq_prio_free = LMV_QOS_DEF_PRIO_FREE * 256 / 100;
+               ltd->ltd_qos.lq_threshold_rr =
+                       LMV_QOS_DEF_THRESHOLD_RR_PCT * 256 / 100;
+       } else {
+               ltd->ltd_qos.lq_prio_free = LOV_QOS_DEF_PRIO_FREE * 256 / 100;
+               ltd->ltd_qos.lq_threshold_rr =
+                       LOV_QOS_DEF_THRESHOLD_RR_PCT * 256 / 100;
+       }
 
        return 0;
 }
@@ -302,8 +303,8 @@ void lu_tgt_descs_fini(struct lu_tgt_descs *ltd)
 {
        int i;
 
-       CFS_FREE_BITMAP(ltd->ltd_tgt_bitmap);
-       for (i = 0; i < TGT_PTRS; i++) {
+       bitmap_free(ltd->ltd_tgt_bitmap);
+       for (i = 0; i < ARRAY_SIZE(ltd->ltd_tgt_idx); i++) {
                if (ltd->ltd_tgt_idx[i])
                        OBD_FREE_PTR(ltd->ltd_tgt_idx[i]);
        }
@@ -326,27 +327,27 @@ EXPORT_SYMBOL(lu_tgt_descs_fini);
  */
 static int lu_tgt_descs_resize(struct lu_tgt_descs *ltd, __u32 newsize)
 {
-       struct cfs_bitmap *new_bitmap, *old_bitmap = NULL;
+       unsigned long *new_bitmap, *old_bitmap = NULL;
 
        /* someone else has already resized the array */
        if (newsize <= ltd->ltd_tgts_size)
                return 0;
 
-       new_bitmap = CFS_ALLOCATE_BITMAP(newsize);
+       new_bitmap = bitmap_zalloc(newsize, GFP_NOFS);
        if (!new_bitmap)
                return -ENOMEM;
 
        if (ltd->ltd_tgts_size > 0) {
                /* the bitmap already exists, copy data from old one */
-               cfs_bitmap_copy(new_bitmap, ltd->ltd_tgt_bitmap);
+               bitmap_copy(new_bitmap, ltd->ltd_tgt_bitmap,
+                           ltd->ltd_tgts_size);
                old_bitmap = ltd->ltd_tgt_bitmap;
        }
 
        ltd->ltd_tgts_size  = newsize;
        ltd->ltd_tgt_bitmap = new_bitmap;
 
-       if (old_bitmap)
-               CFS_FREE_BITMAP(old_bitmap);
+       bitmap_free(old_bitmap);
 
        CDEBUG(D_CONFIG, "tgt size: %d\n", ltd->ltd_tgts_size);
 
@@ -377,13 +378,16 @@ int ltd_add_tgt(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt)
        if (index >= ltd->ltd_tgts_size) {
                __u32 newsize = 1;
 
+               if (index > TGT_PTRS * TGT_PTRS_PER_BLOCK)
+                       RETURN(-ENFILE);
+
                while (newsize < index + 1)
                        newsize = newsize << 1;
 
                rc = lu_tgt_descs_resize(ltd, newsize);
                if (rc)
                        RETURN(rc);
-       } else if (cfs_bitmap_check(ltd->ltd_tgt_bitmap, index)) {
+       } else if (test_bit(index, ltd->ltd_tgt_bitmap)) {
                RETURN(-EEXIST);
        }
 
@@ -394,7 +398,7 @@ int ltd_add_tgt(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt)
        }
 
        LTD_TGT(ltd, tgt->ltd_index) = tgt;
-       cfs_bitmap_set(ltd->ltd_tgt_bitmap, tgt->ltd_index);
+       set_bit(tgt->ltd_index, ltd->ltd_tgt_bitmap);
 
        ltd->ltd_lov_desc.ld_tgt_count++;
        if (tgt->ltd_active)
@@ -411,7 +415,7 @@ void ltd_del_tgt(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt)
 {
        lu_qos_del_tgt(&ltd->ltd_qos, tgt);
        LTD_TGT(ltd, tgt->ltd_index) = NULL;
-       cfs_bitmap_clear(ltd->ltd_tgt_bitmap, tgt->ltd_index);
+       clear_bit(tgt->ltd_index, ltd->ltd_tgt_bitmap);
        ltd->ltd_lov_desc.ld_tgt_count--;
        if (tgt->ltd_active)
                ltd->ltd_lov_desc.ld_active_tgt_count--;
@@ -419,21 +423,6 @@ void ltd_del_tgt(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt)
 EXPORT_SYMBOL(ltd_del_tgt);
 
 /**
- * Whether QoS data is up-to-date and QoS can be applied.
- */
-bool ltd_qos_is_usable(struct lu_tgt_descs *ltd)
-{
-       if (!ltd->ltd_qos.lq_dirty && ltd->ltd_qos.lq_same_space)
-               return false;
-
-       if (ltd->ltd_lov_desc.ld_active_tgt_count < 2)
-               return false;
-
-       return true;
-}
-EXPORT_SYMBOL(ltd_qos_is_usable);
-
-/**
  * Calculate penalties per-tgt and per-server
  *
  * Re-calculate penalties when the configuration changes, active targets
@@ -465,7 +454,7 @@ int ltd_qos_penalties_calc(struct lu_tgt_descs *ltd)
 
        ENTRY;
 
-       if (!qos->lq_dirty)
+       if (!test_bit(LQ_DIRTY, &qos->lq_flags))
                GOTO(out, rc = 0);
 
        num_active = desc->ld_active_tgt_count - 1;
@@ -531,12 +520,13 @@ int ltd_qos_penalties_calc(struct lu_tgt_descs *ltd)
                 * per-tgt penalty is
                 * prio * bavail * iavail / (num_tgt - 1) / 2
                 */
-               tgt->ltd_qos.ltq_penalty_per_obj = prio_wide * ba * ia;
+               tgt->ltd_qos.ltq_penalty_per_obj = prio_wide * ba * ia >> 8;
                do_div(tgt->ltd_qos.ltq_penalty_per_obj, num_active);
                tgt->ltd_qos.ltq_penalty_per_obj >>= 1;
 
                age = (now - tgt->ltd_qos.ltq_used) >> 3;
-               if (qos->lq_reset || age > 32 * desc->ld_qos_maxage)
+               if (test_bit(LQ_RESET, &qos->lq_flags) || 
+                   age > 32 * desc->ld_qos_maxage)
                        tgt->ltd_qos.ltq_penalty = 0;
                else if (age > desc->ld_qos_maxage)
                        /* Decay tgt penalty. */
@@ -565,36 +555,38 @@ int ltd_qos_penalties_calc(struct lu_tgt_descs *ltd)
        list_for_each_entry(svr, &qos->lq_svr_list, lsq_svr_list) {
                ba = svr->lsq_bavail;
                ia = svr->lsq_iavail;
-               svr->lsq_penalty_per_obj = prio_wide * ba  * ia;
-               do_div(ba, svr->lsq_tgt_count * num_active);
+               svr->lsq_penalty_per_obj = prio_wide * ba  * ia >> 8;
+               do_div(svr->lsq_penalty_per_obj,
+                      svr->lsq_tgt_count * num_active);
                svr->lsq_penalty_per_obj >>= 1;
 
                age = (now - svr->lsq_used) >> 3;
-               if (qos->lq_reset || age > 32 * desc->ld_qos_maxage)
+               if (test_bit(LQ_RESET, &qos->lq_flags) || 
+                   age > 32 * desc->ld_qos_maxage)
                        svr->lsq_penalty = 0;
                else if (age > desc->ld_qos_maxage)
                        /* Decay server penalty. */
                        svr->lsq_penalty >>= age / desc->ld_qos_maxage;
        }
 
-       qos->lq_dirty = 0;
-       qos->lq_reset = 0;
+       clear_bit(LQ_DIRTY, &qos->lq_flags);
+       clear_bit(LQ_RESET, &qos->lq_flags);
 
        /*
         * If each tgt has almost same free space, do rr allocation for better
         * creation performance
         */
-       qos->lq_same_space = 0;
+       clear_bit(LQ_SAME_SPACE, &qos->lq_flags);
        if ((ba_max * (256 - qos->lq_threshold_rr)) >> 8 < ba_min &&
            (ia_max * (256 - qos->lq_threshold_rr)) >> 8 < ia_min) {
-               qos->lq_same_space = 1;
+               set_bit(LQ_SAME_SPACE, &qos->lq_flags);
                /* Reset weights for the next time we enter qos mode */
-               qos->lq_reset = 1;
+               set_bit(LQ_RESET, &qos->lq_flags);
        }
        rc = 0;
 
 out:
-       if (!rc && qos->lq_same_space)
+       if (!rc && test_bit(LQ_SAME_SPACE, &qos->lq_flags))
                RETURN(-EAGAIN);
 
        RETURN(rc);
@@ -645,7 +637,7 @@ int ltd_qos_update(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt,
        ltq->ltq_penalty += ltq->ltq_penalty_per_obj *
                            ltd->ltd_lov_desc.ld_active_tgt_count;
        svr->lsq_penalty += svr->lsq_penalty_per_obj *
-                           ltd->ltd_lov_desc.ld_active_tgt_count;
+                           qos->lq_active_svr_count;
 
        /* Decrease all MDS penalties */
        list_for_each_entry(svr, &qos->lq_svr_list, lsq_svr_list) {
@@ -661,6 +653,7 @@ int ltd_qos_update(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt,
                if (!tgt->ltd_active)
                        continue;
 
+               ltq = &tgt->ltd_qos;
                if (ltq->ltq_penalty < ltq->ltq_penalty_per_obj)
                        ltq->ltq_penalty = 0;
                else
@@ -672,9 +665,10 @@ int ltd_qos_update(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt,
                if (ltq->ltq_usable)
                        *total_wt += ltq->ltq_weight;
 
-               CDEBUG(D_OTHER, "recalc tgt %d usable=%d avail=%llu tgtppo=%llu tgtp=%llu svrppo=%llu svrp=%llu wt=%llu\n",
+               CDEBUG(D_OTHER, "recalc tgt %d usable=%d bavail=%llu ffree=%llu tgtppo=%llu tgtp=%llu svrppo=%llu svrp=%llu wt=%llu\n",
                          tgt->ltd_index, ltq->ltq_usable,
-                         tgt_statfs_bavail(tgt) >> 10,
+                         tgt_statfs_bavail(tgt) >> 16,
+                         tgt_statfs_iavail(tgt) >> 8,
                          ltq->ltq_penalty_per_obj >> 10,
                          ltq->ltq_penalty >> 10,
                          ltq->ltq_svr->lsq_penalty_per_obj >> 10,