Whamcloud - gitweb
LU-13802 llite: add read & write switch thresholds 95/52595/35
authorPatrick Farrell <paf0187@gmail.com>
Mon, 1 Apr 2024 15:30:29 +0000 (11:30 -0400)
committerOleg Drokin <green@whamcloud.com>
Wed, 29 May 2024 04:41:22 +0000 (04:41 +0000)
The main criterion for switching from buffered IO to
hybrid is IO size.  This patch adds that switching.  The
correct size for cutover is not the same for read and write,
so we have separate checks for read and write.

These checks are elaborated on in further patches, adding
different thresholds based on the backing storage type.

Adding the switching thresholds is what really enables
hybrid IO, so we have to adjust a number of tests which
assume buffered IO.

There are a few obscure hang bugs which have been difficult
to track down, and we are past feature freeze, so this patch
now leaves hybrid IO disabled by default.

Signed-off-by: Patrick Farrell <patrick.farrell@oracle.com>
Change-Id: I491cd7b2bdafe8bb2c1a4d692442a62154324bec
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/52595
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Shaun Tancheff <shaun.tancheff@hpe.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/llite/file.c
lustre/llite/llite_internal.h
lustre/llite/llite_lib.c
lustre/llite/lproc_llite.c
lustre/tests/sanity.sh

index 2346077..dff3caa 100644 (file)
@@ -1775,6 +1775,14 @@ ll_hybrid_bio_dio_switch_check(struct file *file, struct kiocb *iocb,
 
        if (!test_bit(LL_SBI_HYBRID_IO, sbi->ll_flags))
                RETURN(false);
+
+       if (iot == CIT_WRITE &&
+           count >= sbi->ll_hybrid_io_write_threshold_bytes)
+               RETURN(true);
+
+       if (iot == CIT_READ &&
+           count >= sbi->ll_hybrid_io_read_threshold_bytes)
+               RETURN(true);
 #endif
        RETURN(false);
 }
index dd80d27..e14d439 100644 (file)
@@ -966,6 +966,10 @@ struct ll_sb_info {
        /* Time in ms after last file close we no longer count prior opens*/
        u32                       ll_oc_max_ms;
 
+       /* I/O size thresholds for switching from buffered I/O to direct I/O */
+       u32                       ll_hybrid_io_write_threshold_bytes;
+       u32                       ll_hybrid_io_read_threshold_bytes;
+
        /* filesystem fsname */
        char                      ll_fsname[LUSTRE_MAXFSNAME + 1];
 
@@ -1970,6 +1974,11 @@ static inline int d_lustre_invalid(const struct dentry *dentry)
        return !ll_d2d(dentry) || ll_d2d(dentry)->lld_invalid;
 }
 
+/* 8 MiB is where reads are reliably better as DIO on most configs */
+#define SBI_DEFAULT_HYBRID_IO_READ_THRESHOLD   (8 * 1024 * 1024) /* 8 MiB */
+/* 2 MiB is where writes are reliably better as DIO on most configs */
+#define SBI_DEFAULT_HYBRID_IO_WRITE_THRESHOLD  (2 * 1024 * 1024) /* 2 MiB */
+
 /*
  * Mark dentry INVALID, if dentry refcount is zero (this is normally case for
  * ll_md_blocking_ast), it will be pruned by ll_prune_aliases() and
index 8babc83..a99cd35 100644 (file)
@@ -210,7 +210,6 @@ static struct ll_sb_info *ll_init_sbi(struct lustre_sb_info *lsi)
        set_bit(LL_SBI_STATFS_PROJECT, sbi->ll_flags);
        ll_sbi_set_encrypt(sbi, true);
        ll_sbi_set_name_encrypt(sbi, true);
-       set_bit(LL_SBI_HYBRID_IO, sbi->ll_flags);
 
        /* root squash */
        sbi->ll_squash.rsi_uid = 0;
@@ -226,6 +225,10 @@ static struct ll_sb_info *ll_init_sbi(struct lustre_sb_info *lsi)
        sbi->ll_oc_thrsh_count = SBI_DEFAULT_OPENCACHE_THRESHOLD_COUNT;
        sbi->ll_oc_max_ms = SBI_DEFAULT_OPENCACHE_THRESHOLD_MAX_MS;
        sbi->ll_oc_thrsh_ms = SBI_DEFAULT_OPENCACHE_THRESHOLD_MS;
+       sbi->ll_hybrid_io_write_threshold_bytes =
+               SBI_DEFAULT_HYBRID_IO_WRITE_THRESHOLD;
+       sbi->ll_hybrid_io_read_threshold_bytes =
+               SBI_DEFAULT_HYBRID_IO_READ_THRESHOLD;
 
        INIT_LIST_HEAD(&sbi->ll_all_quota_list);
        RETURN(sbi);
index 5ae18df..8d8151b 100644 (file)
@@ -1806,6 +1806,76 @@ static ssize_t inode_cache_store(struct kobject *kobj,
 }
 LUSTRE_RW_ATTR(inode_cache);
 
+/* arbitrary but very large upper bound for both thresholds (sanity check) */
+#define HYBRID_IO_THRESHOLD_BYTES_MAX (2 * 1024 * 1024 * 1024UL) /* 2 GiB */
+static ssize_t hybrid_io_write_threshold_bytes_show(struct kobject *kobj,
+                                                   struct attribute *attr,
+                                                   char *buf)
+{
+       struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info,
+                                             ll_kset.kobj);
+
+       return snprintf(buf, PAGE_SIZE, "%u\n",
+                       sbi->ll_hybrid_io_write_threshold_bytes);
+}
+
+static ssize_t hybrid_io_write_threshold_bytes_store(struct kobject *kobj,
+                                                    struct attribute *attr,
+                                                    const char *buffer,
+                                                    size_t count)
+{
+       struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info,
+                                             ll_kset.kobj);
+       u64 val;
+       int rc;
+
+       rc = sysfs_memparse(buffer, count, &val, "B");
+       if (rc)
+               return rc;
+
+       if (val > HYBRID_IO_THRESHOLD_BYTES_MAX)
+               return -ERANGE;
+
+       sbi->ll_hybrid_io_write_threshold_bytes = val;
+
+       return count;
+}
+LUSTRE_RW_ATTR(hybrid_io_write_threshold_bytes);
+
+static ssize_t hybrid_io_read_threshold_bytes_show(struct kobject *kobj,
+                                                  struct attribute *attr,
+                                                  char *buf)
+{
+       struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info,
+                                             ll_kset.kobj);
+
+       return snprintf(buf, PAGE_SIZE, "%u\n",
+                       sbi->ll_hybrid_io_read_threshold_bytes);
+}
+
+static ssize_t hybrid_io_read_threshold_bytes_store(struct kobject *kobj,
+                                                   struct attribute *attr,
+                                                   const char *buffer,
+                                                   size_t count)
+{
+       struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info,
+                                             ll_kset.kobj);
+       u64 val;
+       int rc;
+
+       rc = sysfs_memparse(buffer, count, &val, "B");
+       if (rc)
+               return rc;
+
+       if (val > HYBRID_IO_THRESHOLD_BYTES_MAX)
+               return -ERANGE;
+
+       sbi->ll_hybrid_io_read_threshold_bytes = val;
+
+       return count;
+}
+LUSTRE_RW_ATTR(hybrid_io_read_threshold_bytes);
+
 static int ll_unstable_stats_seq_show(struct seq_file *m, void *v)
 {
        struct super_block      *sb    = m->private;
@@ -2137,6 +2207,8 @@ static struct attribute *llite_attrs[] = {
        &lustre_attr_opencache_threshold_ms.attr,
        &lustre_attr_opencache_max_ms.attr,
        &lustre_attr_inode_cache.attr,
+       &lustre_attr_hybrid_io_write_threshold_bytes.attr,
+       &lustre_attr_hybrid_io_read_threshold_bytes.attr,
 #ifdef CONFIG_LL_ENCRYPTION
        &lustre_attr_enable_filename_encryption.attr,
 #endif
index c1c7f9d..a375ff9 100755 (executable)
@@ -3510,6 +3510,12 @@ test_27T() {
 #define OBD_FAIL_OST_ENOSPC              0x215
        do_facet ost1 "$LCTL set_param fail_loc=0x80000215"
        $LFS setstripe -i 0 -c 1 $DIR/$tfile
+       # DIO does not support partial writes to a single stripe - a write to
+       # each stripe will fail or succeed entirely.  So we disable hybrid IO
+       # so we can see the partial write behavior of buffered IO
+       local hybrid=$($LCTL get_param -n llite.*.hybrid_io)
+       $LCTL set_param llite.*.hybrid_io=0
+       stack_trap "$LCTL set_param -n llite.*.hybrid_io=$hybrid" EXIT
        $MULTIOP $DIR/$tfile oO_WRONLY:P$((4 * 1024 * 1024 + 10 * 4096))c ||
                error "multiop failed"
 }
@@ -9887,6 +9893,11 @@ test_64f() {
 
        $LFS setstripe -c 1 -i 0 $DIR/$tfile || error "lfs setstripe failed"
 
+       # Hybrid means this won't really be buffered IO, so we disable it for
+       # this part of the test
+       local hybrid=$($LCTL get_param -n llite.*.hybrid_io)
+       $LCTL set_param llite.*.hybrid_io=0
+       stack_trap "$LCTL set_param -n llite.*.hybrid_io=$hybrid" EXIT
        # Testing that buffered IO consumes grant on the client
 
        # Delay the RPC on the server so it's guaranteed to not complete even
@@ -11838,6 +11849,9 @@ run_test 101g "Big bulk(4/16 MiB) readahead"
 
 test_101h() {
        $LFS setstripe -i 0 -c 1 $DIR/$tfile
+       local hybrid=$($LCTL get_param -n llite.*.hybrid_io)
+       $LCTL set_param llite.*.hybrid_io=0
+       stack_trap "$LCTL set_param -n llite.*.hybrid_io=$hybrid" EXIT
 
        dd if=/dev/zero of=$DIR/$tfile bs=1M count=70 ||
                error "dd 70M file failed"
@@ -11886,6 +11900,9 @@ test_101j() {
        local file_size=$((1048576 * 16))
        local old_ra=$($LCTL get_param -n llite.*.max_read_ahead_mb | head -n 1)
        stack_trap "$LCTL set_param -n llite.*.max_read_ahead_mb $old_ra" EXIT
+       local hybrid=$($LCTL get_param -n llite.*.hybrid_io)
+       $LCTL set_param llite.*.hybrid_io=0
+       stack_trap "$LCTL set_param -n llite.*.hybrid_io=$hybrid" EXIT
 
        echo Disable read-ahead
        $LCTL set_param -n llite.*.max_read_ahead_mb=0
@@ -24443,6 +24460,12 @@ ladvise_willread_performance()
        local average_cache=0
        local average_ladvise=0
 
+       # Hybrid IO switches to DIO, which invalidates much of the caching
+       # So disable it for this test
+       local hybrid=$($LCTL get_param -n llite.*.hybrid_io)
+       $LCTL set_param llite.*.hybrid_io=0
+       stack_trap "$LCTL set_param -n llite.*.hybrid_io=$hybrid" EXIT
+
        for ((i = 1; i <= $repeat; i++)); do
                echo "Iter $i/$repeat: reading without willread hint"
                cancel_lru_locks osc
@@ -25434,6 +25457,11 @@ test_271ba() {
 
        lctl set_param -n mdc.*.stats=clear
        lctl set_param -n osc.*.stats=clear
+       # Hybrid switches to DIO, so does not hold the required lock to skip
+       # the glimpse, so we disable it here...
+       local hybrid=$($LCTL get_param -n llite.*.hybrid_io)
+       $LCTL set_param llite.*.hybrid_io=0
+       stack_trap "$LCTL set_param -n llite.*.hybrid_io=$hybrid" EXIT
        dd if=/dev/zero of=$dom bs=2048K count=1 || return 1
        cancel_lru_locks mdc
        $CHECKSTAT -t file -s 2097152 $dom || error "stat"