Whamcloud - gitweb
LU-6910 osp: add procfs values for OST reserved size 56/17656/10
authorAlexander Boyko <alexander.boyko@seagate.com>
Mon, 10 Aug 2015 11:40:28 +0000 (14:40 +0300)
committerOleg Drokin <oleg.drokin@intel.com>
Thu, 29 Sep 2016 14:59:18 +0000 (14:59 +0000)
opd_pre_status=-ENOSPC is used to skip an OST during object allocation.
The error was set when the OST's available space was less than 0.1% of
the total OST size. This value was not configurable, so procfs files
were added:
reserved_mb_low - low watermark, if available space is less
  than it, object allocation is stopped.
reserved_mb_high - high watermark, if available space is more
   than it, object allocation is enabled.

By default ~0.1% is reserved as the low watermark. The high watermark
defaults to about twice the low watermark (2 * low + 1 MB).
High and low watermark could be changed by:
lctl set_param osp.lustre-OST0000-osc-MDT0000.reserved_mb_high=1024

When object allocation is disabled, clients can still append to
existing files, and 0.1% is too low for them. For example, if the OST
size is 8TB, 0.1% is 8GB; if the cluster has 1k clients, the reserved
space is ~8MB per client. The main purpose of the patch is the ability
to increase the reserved space.

Signed-off-by: Alexander Boyko <alexander.boyko@seagate.com>
Seagate-bug-id: MRP-2606
Test-Parameters: testlist=sanity,sanity,sanity,sanity,sanity,sanity envdefinitions=ONLY=253
Change-Id: Idd759352cec30a6039c228695f753465fbccc75f
Reviewed-on: http://review.whamcloud.com/17656
Tested-by: Jenkins
Reviewed-by: Alexander Zarochentsev <alexander.zarochentsev@seagate.com>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lustre/include/lustre/lustre_user.h
lustre/lod/lod_qos.c
lustre/ofd/ofd_obd.c
lustre/osd-ldiskfs/osd_handler.c
lustre/osp/lproc_osp.c
lustre/osp/osp_internal.h
lustre/osp/osp_precreate.c
lustre/tests/sanity.sh

index a27b876..c7f80f7 100644 (file)
@@ -99,9 +99,8 @@ typedef struct stat64 lstat_t;
 enum obd_statfs_state {
         OS_STATE_DEGRADED       = 0x00000001, /**< RAID degraded/rebuilding */
         OS_STATE_READONLY       = 0x00000002, /**< filesystem is read-only */
-        OS_STATE_RDONLY_1       = 0x00000004, /**< obsolete 1.6, was EROFS=30 */
-        OS_STATE_RDONLY_2       = 0x00000008, /**< obsolete 1.6, was EROFS=30 */
-        OS_STATE_RDONLY_3       = 0x00000010, /**< obsolete 1.6, was EROFS=30 */
+       OS_STATE_ENOSPC         = 0x00000020, /**< not enough free space */
+       OS_STATE_ENOINO         = 0x00000040, /**< not enough inodes */
 };
 
 struct obd_statfs {
index 5e3fe00..9159b11 100644 (file)
@@ -199,6 +199,11 @@ static int lod_statfs_and_check(const struct lu_env *env, struct lod_device *d,
        LASSERT(ost);
 
        rc = dt_statfs(env, ost->ltd_ost, sfs);
+
+       if (rc == 0 && ((sfs->os_state & OS_STATE_ENOSPC) ||
+           (sfs->os_state & OS_STATE_ENOINO && sfs->os_fprecreated == 0)))
+               RETURN(-ENOSPC);
+
        if (rc && rc != -ENOTCONN)
                CERROR("%s: statfs: rc = %d\n", lod2obd(d)->obd_name, rc);
 
@@ -742,30 +747,6 @@ static int min_stripe_count(__u32 stripe_cnt, int flags)
 #define LOV_CREATE_RESEED_MIN  2000
 
 /**
- * Check if an OST is full.
- *
- * Check whether an OST should be considered full based
- * on the given statfs data.
- *
- * \param[in] msfs     statfs data
- *
- * \retval false       not full
- * \retval true                full
- */
-static int inline lod_qos_dev_is_full(struct obd_statfs *msfs)
-{
-       __u64 used;
-       int   bs = msfs->os_bsize;
-
-       LASSERT(((bs - 1) & bs) == 0);
-
-       /* the minimum of 0.1% used blocks and 1GB bytes. */
-       used = min_t(__u64, (msfs->os_blocks - msfs->os_bfree) >> 10,
-                       1 << (31 - ffs(bs)));
-       return (msfs->os_bavail < used);
-}
-
-/**
  * Initialize temporary OST-in-use array.
  *
  * Allocate or extend the array used to mark targets already assigned to a new
@@ -856,14 +837,6 @@ static int lod_check_and_reserve_ost(const struct lu_env *env,
        }
 
        /*
-        * skip full devices
-        */
-       if (lod_qos_dev_is_full(sfs)) {
-               QOS_DEBUG("#%d is full\n", ost_idx);
-               goto out_return;
-       }
-
-       /*
         * We expect number of precreated objects in f_ffree at
         * the first iteration, skip OSPs with no objects ready
         */
@@ -1438,12 +1411,6 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo,
                        continue;
                }
 
-               /*
-                * skip full devices
-                */
-               if (lod_qos_dev_is_full(sfs))
-                       continue;
-
                if (sfs->os_state & OS_STATE_DEGRADED)
                        continue;
 
index 09922d8..307f678 100644 (file)
@@ -1439,7 +1439,7 @@ static int ofd_health_check(const struct lu_env *nul, struct obd_device *obd)
        if (unlikely(rc))
                GOTO(out, rc);
 
-       if (info->fti_u.osfs.os_state == OS_STATE_READONLY)
+       if (info->fti_u.osfs.os_state & OS_STATE_READONLY)
                GOTO(out, rc = -EROFS);
 
 #ifdef USE_HEALTH_CHECK_WRITE
index 2946850..54239b4 100644 (file)
@@ -1988,7 +1988,7 @@ int osd_statfs(const struct lu_env *env, struct dt_device *d,
 
        statfs_pack(sfs, ksfs);
        if (unlikely(sb->s_flags & MS_RDONLY))
-               sfs->os_state = OS_STATE_READONLY;
+               sfs->os_state |= OS_STATE_READONLY;
        if (LDISKFS_HAS_INCOMPAT_FEATURE(sb,
                                         LDISKFS_FEATURE_INCOMPAT_EXTENTS))
                sfs->os_maxbytes = sb->s_maxbytes;
index b1d782d..d315147 100644 (file)
@@ -744,6 +744,126 @@ LPROC_SEQ_FOPS_RO_TYPE(osp, timeouts);
 LPROC_SEQ_FOPS_RW_TYPE(osp, import);
 LPROC_SEQ_FOPS_RO_TYPE(osp, state);
 
+/**
+ * Print the high watermark of the OSP device (in megabytes).
+ *
+ * When object allocation for the OST is disabled and the space available
+ * on the OST becomes greater than the high watermark, allocation is enabled.
+ *
+ * \param[in] m                seq_file handle
+ * \param[in] data     unused for single entry
+ * \retval             0 on success
+ * \retval             -EINVAL if the OSP device cannot be resolved
+ */
+static int osp_reserved_mb_high_seq_show(struct seq_file *m, void *data)
+{
+       struct obd_device *obd = m->private;
+       struct osp_device *osp_dev = lu2osp_dev(obd->obd_lu_dev);
+
+       if (osp_dev != NULL) {
+               seq_printf(m, "%u\n", osp_dev->opd_reserved_mb_high);
+               return 0;
+       }
+
+       return -EINVAL;
+}
+
+/**
+ * Change high watermark
+ *
+ * The new value must be at least 1 MB and must fit into the int field that
+ * stores the watermark. If the new high watermark is not above the current
+ * low watermark, the low watermark is lowered to one megabyte below it so
+ * that low < high always holds.
+ *
+ * \param[in] file     proc file
+ * \param[in] buffer   string which represents new value (in megabytes)
+ * \param[in] count    \a buffer length
+ * \param[in] off      unused for single entry
+ * \retval             \a count on success
+ * \retval             -EINVAL if the OSP device cannot be resolved
+ * \retval             -ERANGE if the value is below 1 MB or too large
+ * \retval             other negative number if the string cannot be parsed
+ */
+static ssize_t
+osp_reserved_mb_high_seq_write(struct file *file, const char __user *buffer,
+                       size_t count, loff_t *off)
+{
+       struct seq_file         *m = file->private_data;
+       struct obd_device       *dev = m->private;
+       struct osp_device       *osp = lu2osp_dev(dev->obd_lu_dev);
+       __s64                   val;
+       int                     rc;
+
+       if (osp == NULL)
+               return -EINVAL;
+
+       rc = lprocfs_str_with_units_to_s64(buffer, count, &val, 'M');
+       if (rc)
+               return rc;
+       /* convert to megabytes (presumably the helper returns bytes -
+        * TODO confirm against lprocfs_str_with_units_to_s64())
+        */
+       val >>= 20;
+       /* opd_reserved_mb_high is an int: refuse values that would be
+        * silently truncated (and possibly turned negative) on assignment
+        */
+       if (val < 1 || val != (int)val)
+               return -ERANGE;
+
+       spin_lock(&osp->opd_pre_lock);
+       osp->opd_reserved_mb_high = val;
+       /* keep the low watermark strictly below the high one */
+       if (val <= osp->opd_reserved_mb_low)
+               osp->opd_reserved_mb_low = val - 1;
+       spin_unlock(&osp->opd_pre_lock);
+
+       return count;
+}
+LPROC_SEQ_FOPS(osp_reserved_mb_high);
+
+/**
+ * Print the low watermark of the OSP device (in megabytes).
+ *
+ * When the space available on the OST drops below the low watermark,
+ * object allocation for the OST is disabled.
+ *
+ * \param[in] m                seq_file handle
+ * \param[in] data     unused for single entry
+ * \retval             0 on success
+ * \retval             -EINVAL if the OSP device cannot be resolved
+ */
+static int osp_reserved_mb_low_seq_show(struct seq_file *m, void *data)
+{
+       struct obd_device *obd = m->private;
+       struct osp_device *osp_dev = lu2osp_dev(obd->obd_lu_dev);
+
+       if (osp_dev != NULL) {
+               seq_printf(m, "%u\n", osp_dev->opd_reserved_mb_low);
+               return 0;
+       }
+
+       return -EINVAL;
+}
+
+/**
+ * Change low watermark
+ *
+ * The new value must not be negative, and both it and the value one
+ * megabyte above it must fit into the int fields that store the watermarks.
+ * If the new low watermark is not below the current high watermark, the
+ * high watermark is raised to one megabyte above it so that low < high
+ * always holds.
+ *
+ * \param[in] file     proc file
+ * \param[in] buffer   string which represents new value (in megabytes)
+ * \param[in] count    \a buffer length
+ * \param[in] off      unused for single entry
+ * \retval             \a count on success
+ * \retval             -EINVAL if the OSP device cannot be resolved
+ * \retval             -ERANGE if the value is negative or too large
+ * \retval             other negative number if the string cannot be parsed
+ */
+static ssize_t
+osp_reserved_mb_low_seq_write(struct file *file, const char __user *buffer,
+                       size_t count, loff_t *off)
+{
+       struct seq_file         *m = file->private_data;
+       struct obd_device       *dev = m->private;
+       struct osp_device       *osp = lu2osp_dev(dev->obd_lu_dev);
+       __s64                   val;
+       int                     rc;
+
+       if (osp == NULL)
+               return -EINVAL;
+
+       rc = lprocfs_str_with_units_to_s64(buffer, count, &val, 'M');
+       if (rc)
+               return rc;
+       /* convert to megabytes (presumably the helper returns bytes -
+        * TODO confirm against lprocfs_str_with_units_to_s64())
+        */
+       val >>= 20;
+       /* A negative watermark would be converted to a huge unsigned value
+        * when compared with the available space and disable allocation
+        * forever; val + 1 may be stored in the int high watermark below,
+        * so it has to fit into an int as well.
+        */
+       if (val < 0 || val + 1 != (int)(val + 1))
+               return -ERANGE;
+
+       spin_lock(&osp->opd_pre_lock);
+       osp->opd_reserved_mb_low = val;
+       /* keep the high watermark strictly above the low one */
+       if (val >= osp->opd_reserved_mb_high)
+               osp->opd_reserved_mb_high = val + 1;
+       spin_unlock(&osp->opd_pre_lock);
+
+       return count;
+}
+LPROC_SEQ_FOPS(osp_reserved_mb_low);
+
 static struct lprocfs_vars lprocfs_osp_obd_vars[] = {
        { .name =       "uuid",
          .fops =       &osp_uuid_fops                  },
@@ -794,6 +914,10 @@ static struct lprocfs_vars lprocfs_osp_obd_vars[] = {
          .fops =       &osp_syn_in_prog_fops           },
        { .name =       "old_sync_processed",
          .fops =       &osp_old_sync_processed_fops    },
+       { .name =       "reserved_mb_high",
+         .fops =       &osp_reserved_mb_high_fops      },
+       { .name =       "reserved_mb_low",
+         .fops =       &osp_reserved_mb_low_fops       },
 
        /* for compatibility reasons */
        { .name =       "destroys_in_flight",
index bfb43a6..00e14ed 100644 (file)
@@ -251,6 +251,12 @@ struct osp_device {
        struct list_head                 opd_async_updates;
        struct rw_semaphore              opd_async_updates_rwsem;
        atomic_t                         opd_async_updates_count;
+
+       /*
+        * Limit the object allocation using ENOSPC for opd_pre_status
+        */
+       int                             opd_reserved_mb_high;
+       int                             opd_reserved_mb_low;
 };
 
 #define opd_pre_lock                   opd_pre->osp_pre_lock
index 022254b..17e062e 100644 (file)
@@ -931,17 +931,8 @@ out:
  * Add a bit of hysteresis so this flag isn't continually flapping,
  * and ensure that new files don't get extremely fragmented due to
  * only a small amount of available space in the filesystem.
- * We want to set the NOSPC flag when there is less than ~0.1% free
- * and clear it when there is at least ~0.2% free space, so:
- *                   avail < ~0.1% max          max = avail + used
- *            1025 * avail < avail + used       used = blocks - free
- *            1024 * avail < used
- *            1024 * avail < blocks - free
- *                   avail < ((blocks - free) >> 10)
- *
- * On very large disk, say 16TB 0.1% will be 16 GB. We don't want to
- * lose that amount of space so in those cases we report no space left
- * if their is less than 1 GB left.
+ * We want to set the ENOSPC when there is less than reserved size
+ * free and clear it when there is at least 2*reserved size free space.
  * the function updates current precreation status used: functional or not
  *
  * \param[in] d                OSP device
@@ -954,43 +945,72 @@ void osp_pre_update_status(struct osp_device *d, int rc)
 {
        struct obd_statfs       *msfs = &d->opd_statfs;
        int                      old = d->opd_pre_status;
-       __u64                    used;
+       __u64                    available;
 
        d->opd_pre_status = rc;
        if (rc)
                goto out;
 
        if (likely(msfs->os_type)) {
-               used = min_t(__u64, (msfs->os_blocks - msfs->os_bfree) >> 10,
-                                   1 << 30);
-               if ((msfs->os_ffree < 32) || (msfs->os_bavail < used)) {
+               if (unlikely(d->opd_reserved_mb_high == 0 &&
+                            d->opd_reserved_mb_low == 0)) {
+                       /* Use ~0.1% by default to disable object allocation,
+                        * and ~0.2% to enable, size in MB, set both watermark
+                        */
+                       spin_lock(&d->opd_pre_lock);
+                       if (d->opd_reserved_mb_high == 0 &&
+                           d->opd_reserved_mb_low == 0) {
+                               d->opd_reserved_mb_low =
+                                       ((msfs->os_bsize >> 10) *
+                                       msfs->os_blocks) >> 20;
+                               if (d->opd_reserved_mb_low == 0)
+                                       d->opd_reserved_mb_low = 1;
+                               d->opd_reserved_mb_high =
+                                       (d->opd_reserved_mb_low << 1) + 1;
+                       }
+                       spin_unlock(&d->opd_pre_lock);
+               }
+               /* in MB */
+               available = (msfs->os_bavail * (msfs->os_bsize >> 10)) >> 10;
+               if (msfs->os_ffree < 32)
+                       msfs->os_state |= OS_STATE_ENOINO;
+               else if (msfs->os_ffree > 64)
+                       msfs->os_state &= ~OS_STATE_ENOINO;
+
+               if (available < d->opd_reserved_mb_low)
+                       msfs->os_state |= OS_STATE_ENOSPC;
+               else if (available > d->opd_reserved_mb_high)
+                       msfs->os_state &= ~OS_STATE_ENOSPC;
+               if (msfs->os_state & (OS_STATE_ENOINO | OS_STATE_ENOSPC)) {
                        d->opd_pre_status = -ENOSPC;
                        if (old != -ENOSPC)
-                               CDEBUG(D_INFO, "%s: status: %llu blocks, "
-                                      "%llu free, %llu used, %llu "
-                                      "avail -> %d: rc = %d\n",
+                               CDEBUG(D_INFO, "%s: status: %llu blocks, %llu "
+                                      "free, %llu avail, %llu MB avail, %u "
+                                      "hwm -> %d: rc = %d\n",
                                       d->opd_obd->obd_name, msfs->os_blocks,
-                                      msfs->os_bfree, used, msfs->os_bavail,
+                                      msfs->os_bfree, msfs->os_bavail,
+                                      available, d->opd_reserved_mb_high,
                                       d->opd_pre_status, rc);
                        CDEBUG(D_INFO,
                               "non-committed changes: %u, in progress: %u\n",
                               atomic_read(&d->opd_syn_changes),
                               atomic_read(&d->opd_syn_rpc_in_progress));
-               } else if (old == -ENOSPC) {
+               } else if (unlikely(old == -ENOSPC)) {
                        d->opd_pre_status = 0;
                        spin_lock(&d->opd_pre_lock);
                        d->opd_pre_create_slow = 0;
                        d->opd_pre_create_count = OST_MIN_PRECREATE;
                        spin_unlock(&d->opd_pre_lock);
                        wake_up(&d->opd_pre_waitq);
-                       CDEBUG(D_INFO, "%s: no space: %llu blocks, %llu"
-                              " free, %llu used, %llu avail -> %d: "
-                              "rc = %d\n", d->opd_obd->obd_name,
-                              msfs->os_blocks, msfs->os_bfree, used,
-                              msfs->os_bavail, d->opd_pre_status, rc);
+
+                       CDEBUG(D_INFO, "%s: space available: %llu blocks, %llu"
+                              " free, %llu avail, %lluMB avail, %u lwm"
+                              " -> %d: rc = %d\n", d->opd_obd->obd_name,
+                              msfs->os_blocks, msfs->os_bfree, msfs->os_bavail,
+                              available, d->opd_reserved_mb_low,
+                              d->opd_pre_status, rc);
                }
        }
-
 out:
        wake_up(&d->opd_pre_user_waitq);
 }
@@ -1593,6 +1613,8 @@ int osp_init_precreate(struct osp_device *d)
        d->opd_pre_create_count = OST_MIN_PRECREATE;
        d->opd_pre_min_create_count = OST_MIN_PRECREATE;
        d->opd_pre_max_create_count = OST_MAX_PRECREATE;
+       d->opd_reserved_mb_high = 0;
+       d->opd_reserved_mb_low = 0;
 
        spin_lock_init(&d->opd_pre_lock);
        init_waitqueue_head(&d->opd_pre_waitq);
index 5a26c77..1492176 100755 (executable)
@@ -14028,6 +14028,129 @@ test_252() {
 }
 run_test 252 "check lr_reader tool"
 
+test_253_fill_ost() {
+       local size_mb #how many MB should we write to pass watermark
+       local lwm=$3  #low watermark
+       local free_10mb #10% of free space
+
+       free_kb=$($LFS df $MOUNT | grep $1 | awk '{ print $4 }')
+       size_mb=$((free_kb / 1024 - lwm))
+       free_10mb=$((free_kb / 10240))
+       #If 10% of free space cross low watermark use it
+       if (( free_10mb > size_mb )); then
+               size_mb=$free_10mb
+       else
+               #At least we need to store 1.1 of difference between
+               #free space and low watermark
+               size_mb=$((size_mb + size_mb / 10))
+       fi
+       if (( lwm <= $((free_kb / 1024)) )) || [ ! -f $DIR/$tdir/1 ]; then
+               dd if=/dev/zero of=$DIR/$tdir/1 bs=1M count=$size_mb \
+                        oflag=append conv=notrunc
+       fi
+
+       sleep_maxage
+
+       free_kb=$($LFS df $MOUNT | grep $1 | awk '{ print $4 }')
+       echo "OST still has $((free_kb / 1024)) mbytes free"
+}
+
+test_253() {
+       local ostidx=0
+       local rc=0
+
+       [ $PARALLEL == "yes" ] && skip "skip parallel run" && return
+       remote_mds_nodsh && skip "remote MDS with nodsh" && return
+       remote_mgs_nodsh && skip "remote MGS with nodsh" && return
+
+       local ost_name=$($LFS osts | grep ${ostidx}": " | \
+               awk '{print $2}' | sed -e 's/_UUID$//')
+       # on the mdt's osc
+       local mdtosc_proc1=$(get_mdtosc_proc_path $SINGLEMDS $ost_name)
+       do_facet $SINGLEMDS $LCTL get_param -n \
+               osp.$mdtosc_proc1.reserved_mb_high ||
+               { skip  "remote MDS does not support reserved_mb_high" &&
+                 return; }
+
+       rm -rf $DIR/$tdir
+       wait_mds_ost_sync
+       wait_delete_completed
+       mkdir $DIR/$tdir
+
+       local last_wm_h=$(do_facet $SINGLEMDS $LCTL get_param -n \
+                       osp.$mdtosc_proc1.reserved_mb_high)
+       local last_wm_l=$(do_facet $SINGLEMDS $LCTL get_param -n \
+                       osp.$mdtosc_proc1.reserved_mb_low)
+       echo "prev high watermark $last_wm_h, prev low watermark $last_wm_l"
+
+       do_facet mgs $LCTL pool_new $FSNAME.$TESTNAME ||
+               error "Pool creation failed"
+       do_facet mgs $LCTL pool_add $FSNAME.$TESTNAME $ost_name ||
+               error "Adding $ost_name to pool failed"
+
+       # Wait for client to see a OST at pool
+       wait_update $HOSTNAME "$LCTL get_param -n
+               lov.$FSNAME-*.pools.$TESTNAME | sort -u |
+               grep $ost_name" "$ost_name""_UUID" $((TIMEOUT/2)) ||
+               error "Client can not see the pool"
+       $SETSTRIPE $DIR/$tdir -i $ostidx -c 1 -p $FSNAME.$TESTNAME ||
+               error "Setstripe failed"
+
+       dd if=/dev/zero of=$DIR/$tdir/0 bs=1M count=10
+       local blocks=$($LFS df $MOUNT | grep $ost_name | awk '{ print $4 }')
+       echo "OST still has $((blocks/1024)) mbytes free"
+
+       local new_lwm=$((blocks/1024-10))
+       do_facet $SINGLEMDS $LCTL set_param \
+                       osp.$mdtosc_proc1.reserved_mb_high=$((new_lwm+5))
+       do_facet $SINGLEMDS $LCTL set_param \
+                       osp.$mdtosc_proc1.reserved_mb_low=$new_lwm
+
+       test_253_fill_ost $ost_name $mdtosc_proc1 $new_lwm
+
+       #First enospc could execute orphan deletion so repeat.
+       test_253_fill_ost $ost_name $mdtosc_proc1 $new_lwm
+
+       local oa_status=$(do_facet $SINGLEMDS $LCTL get_param -n \
+                       osp.$mdtosc_proc1.prealloc_status)
+       echo "prealloc_status $oa_status"
+
+       dd if=/dev/zero of=$DIR/$tdir/2 bs=1M count=1 &&
+               error "File creation should fail"
+       #object allocation was stopped, but we still able to append files
+       dd if=/dev/zero of=$DIR/$tdir/1 bs=1M seek=6 count=5 oflag=append ||
+               error "Append failed"
+       rm -f $DIR/$tdir/1 $DIR/$tdir/0 $DIR/$tdir/r*
+
+       wait_delete_completed
+
+       sleep_maxage
+
+       for i in $(seq 10 12); do
+               dd if=/dev/zero of=$DIR/$tdir/$i bs=1M count=1 2>/dev/null ||
+                       error "File creation failed after rm";
+       done
+
+       oa_status=$(do_facet $SINGLEMDS $LCTL get_param -n \
+                       osp.$mdtosc_proc1.prealloc_status)
+       echo "prealloc_status $oa_status"
+
+       if (( oa_status != 0 )); then
+               error "Object allocation still disable after rm"
+       fi
+       do_facet $SINGLEMDS $LCTL set_param \
+                       osp.$mdtosc_proc1.reserved_mb_high=$last_wm_h
+       do_facet $SINGLEMDS $LCTL set_param \
+                       osp.$mdtosc_proc1.reserved_mb_low=$last_wm_l
+
+
+       do_facet mgs $LCTL pool_remove $FSNAME.$TESTNAME $ost_name ||
+               error "Remove $ost_name from pool failed"
+       do_facet mgs $LCTL pool_destroy $FSNAME.$TESTNAME ||
+               error "Pool destroy fialed"
+}
+run_test 253 "Check object allocation limit"
+
 test_254() {
        local cl_user