Whamcloud - gitweb
LU-10958 ofd: data corruption due to RPC reordering 81/32281/18
authorAndrew Perepechko <c17827@cray.com>
Mon, 9 Dec 2019 17:13:50 +0000 (20:13 +0300)
committerOleg Drokin <green@whamcloud.com>
Mon, 8 Feb 2021 21:54:44 +0000 (21:54 +0000)
Without read-only cache, it is possible that a client
resends a BRW RPC, receives a reply from the original
BRW RPC, modifies the same data and sends a new BRW
RPC, however, because of RPC reordering stale data
gets to disk.

Let's use range locking to protect against this race.

Change-Id: I35cbf95594601eacfc5f108b14e4c447962b0bbf
Signed-off-by: Andrew Perepechko <c17827@cray.com>
Cray-bug-id: LUS-5578,LUS-8943
Reviewed-on: https://review.whamcloud.com/32281
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Alexey Lyashkov <alexey.lyashkov@hpe.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
15 files changed:
lustre/include/Makefile.am
lustre/include/range_lock.h [moved from lustre/llite/range_lock.h with 100% similarity]
lustre/ldlm/Makefile.am
lustre/llite/Makefile.in
lustre/llite/llite_internal.h
lustre/obdclass/Makefile.in
lustre/obdclass/interval_tree.c [moved from lustre/ldlm/interval_tree.c with 99% similarity]
lustre/obdclass/range_lock.c [moved from lustre/llite/range_lock.c with 97% similarity]
lustre/ofd/ofd_dev.c
lustre/ofd/ofd_internal.h
lustre/ofd/ofd_io.c
lustre/ptlrpc/Makefile.in
lustre/ptlrpc/autoMakefile.am
lustre/target/tgt_handler.c
lustre/tests/recovery-small.sh

index 16a6cab..584b646 100644 (file)
@@ -95,6 +95,7 @@ EXTRA_DIST = \
        obd_support.h \
        obd_target.h \
        obj_update.h \
+       range_lock.h \
        upcall_cache.h \
        lustre_kernelcomm.h \
        seq_range.h
index 610a53f..092ef28 100644 (file)
@@ -40,4 +40,4 @@ MOSTLYCLEANFILES := @MOSTLYCLEANFILES@
 EXTRA_DIST = ldlm_extent.c ldlm_flock.c ldlm_internal.h ldlm_lib.c \
        ldlm_lock.c ldlm_lockd.c ldlm_plain.c ldlm_request.c         \
        ldlm_resource.c l_lock.c ldlm_inodebits.c ldlm_pool.c        \
-       interval_tree.c ldlm_reclaim.c
+       ldlm_reclaim.c
index 3a5293f..dd82b4d 100644 (file)
@@ -7,9 +7,9 @@ lustre-objs += glimpse.o
 lustre-objs += lcommon_cl.o
 lustre-objs += lcommon_misc.o
 lustre-objs += vvp_dev.o vvp_page.o vvp_io.o vvp_object.o
-lustre-objs += range_lock.o pcc.o crypto.o
+lustre-objs += pcc.o crypto.o
 
 EXTRA_DIST := $(lustre-objs:.o=.c) xattr.c rw26.c super25.c
-EXTRA_DIST += llite_internal.h vvp_internal.h range_lock.h pcc.h
+EXTRA_DIST += llite_internal.h vvp_internal.h pcc.h
 
 @INCLUDE_RULES@
index 1a061b0..45e6864 100644 (file)
@@ -46,9 +46,9 @@
 #include <linux/aio.h>
 #include <lustre_compat.h>
 #include <lustre_crypto.h>
+#include <range_lock.h>
 
 #include "vvp_internal.h"
-#include "range_lock.h"
 #include "pcc.h"
 
 #ifndef FMODE_EXEC
index f1925e9..50b4c46 100644 (file)
@@ -13,6 +13,7 @@ obdclass-all-objs += linkea.o
 obdclass-all-objs += kernelcomm.o jobid.o
 obdclass-all-objs += integrity.o obd_cksum.o
 obdclass-all-objs += lu_tgt_descs.o
+obdclass-all-objs += range_lock.o interval_tree.o
 
 @SERVER_TRUE@obdclass-all-objs += acl.o
 @SERVER_TRUE@obdclass-all-objs += idmap.o
@@ -31,6 +32,7 @@ EXTRA_PRE_CFLAGS := -I@LINUX@/fs -I@LDISKFS_DIR@ -I@LDISKFS_DIR@/ldiskfs
 
 EXTRA_DIST = $(obdclass-all-objs:.o=.c) llog_test.c llog_internal.h
 EXTRA_DIST += cl_internal.h local_storage.h
+EXTRA_DIST += range_lock.c interval_tree.c
 
 @SERVER_FALSE@EXTRA_DIST += acl.c
 @SERVER_FALSE@EXTRA_DIST += idmap.c
similarity index 99%
rename from lustre/ldlm/interval_tree.c
rename to lustre/obdclass/interval_tree.c
index 1b1bc3a..1a9d897 100644 (file)
@@ -780,3 +780,4 @@ void interval_expand(struct interval_node *root,
                ext->end = interval_expand_high(root, ext->end);
        LASSERT(interval_is_overlapped(root, ext) == 0);
 }
+EXPORT_SYMBOL(interval_expand);
similarity index 97%
rename from lustre/llite/range_lock.c
rename to lustre/obdclass/range_lock.c
index 7a4c9c4..140337f 100644 (file)
@@ -36,8 +36,8 @@
 #ifdef HAVE_SCHED_HEADERS
 #include <linux/sched/signal.h>
 #endif
-#include "range_lock.h"
 #include <uapi/linux/lustre/lustre_user.h>
+#include <range_lock.h>
 
 /**
  * Initialize a range lock tree
@@ -53,6 +53,7 @@ void range_lock_tree_init(struct range_lock_tree *tree)
        tree->rlt_sequence = 0;
        spin_lock_init(&tree->rlt_lock);
 }
+EXPORT_SYMBOL(range_lock_tree_init);
 
 /**
  * Intialize a range lock node
@@ -82,6 +83,7 @@ int range_lock_init(struct range_lock *lock, __u64 start, __u64 end)
        lock->rl_sequence = 0;
        return rc;
 }
+EXPORT_SYMBOL(range_lock_init);
 
 static inline struct range_lock *next_lock(struct range_lock *lock)
 {
@@ -168,6 +170,7 @@ void range_unlock(struct range_lock_tree *tree, struct range_lock *lock)
 
        EXIT;
 }
+EXPORT_SYMBOL(range_unlock);
 
 /**
  * Helper function of range_lock()
@@ -245,3 +248,4 @@ int range_lock(struct range_lock_tree *tree, struct range_lock *lock)
 out:
        RETURN(rc);
 }
+EXPORT_SYMBOL(range_lock);
index 8b76814..73364a0 100644 (file)
@@ -545,6 +545,7 @@ static struct lu_object *ofd_object_alloc(const struct lu_env *env,
                lu_object_init(o, h, d);
                lu_object_add_top(h, o);
                o->lo_ops = &ofd_obj_ops;
+               range_lock_tree_init(&of->ofo_write_tree);
                RETURN(o);
        } else {
                RETURN(NULL);
index 8bdb4c9..ce6561e 100644 (file)
@@ -38,6 +38,7 @@
 #include <dt_object.h>
 #include <md_object.h>
 #include <lustre_fid.h>
+#include <range_lock.h>
 
 #define OFD_INIT_OBJID 0
 #define OFD_PRECREATE_BATCH_DEFAULT (OBJ_SUBDIR_COUNT * 4)
@@ -188,6 +189,7 @@ struct ofd_object {
        time64_t                ofo_atime_ondisk;
        unsigned int            ofo_pfid_checking:1,
                                ofo_pfid_verified:1;
+       struct range_lock_tree  ofo_write_tree;
 };
 
 static inline struct ofd_object *ofd_obj(struct lu_object *o)
@@ -286,6 +288,8 @@ struct ofd_thread_info {
                struct lfsck_req_local   fti_lrl;
                struct obd_connect_data  fti_ocd;
        };
+       struct range_lock                fti_write_range;
+       unsigned                         fti_range_locked:1;
 };
 
 extern void target_recovery_fini(struct obd_device *obd);
index 90d1807..a3c2ac3 100644 (file)
@@ -688,6 +688,7 @@ static int ofd_preprw_write(const struct lu_env *env, struct obd_export *exp,
        int maxlnb = *nr_local;
        __u64 begin, end;
        ktime_t kstart = ktime_get();
+       struct range_lock *range = &ofd_info(env)->fti_write_range;
 
        ENTRY;
        LASSERT(env != NULL);
@@ -842,6 +843,26 @@ static int ofd_preprw_write(const struct lu_env *env, struct obd_export *exp,
                obj->ioo_bufcnt,
                WRITE);
 
+       /*
+        * Reordering precautions: make sure that request processing that
+        * was able to receive its bulk data should not get reordered with
+        * overlapping BRW requests, e.g.
+        *  1) BRW1 sent, bulk data received, but disk I/O delayed
+        *  2) BRW1 resent and fully processed
+        *  3) the page was unlocked on the client and its writeback bit reset
+        *  4) BRW2 sent and fully processed
+        *  5) BRW1 processing wakes up and writes stale data to disk
+        * If on step 1 bulk data was not received, client resend will invalidate
+        * its bulk descriptor and the RPC will be dropped due to failed bulk
+        * transfer, which is just fine.
+        */
+       range_lock_init(range,
+                       rnb[0].rnb_offset,
+                       rnb[obj->ioo_bufcnt - 1].rnb_offset +
+                       rnb[obj->ioo_bufcnt - 1].rnb_len - 1);
+       range_lock(&fo->ofo_write_tree, range);
+       ofd_info(env)->fti_range_locked = 1;
+
        ofd_counter_incr(exp, LPROC_OFD_STATS_WRITE_BYTES, jobid, tot_bytes);
        ofd_counter_incr(exp, LPROC_OFD_STATS_WRITE, jobid,
                         ktime_us_delta(ktime_get(), kstart));
@@ -1220,6 +1241,7 @@ ofd_commitrw_write(const struct lu_env *env, struct obd_export *exp,
        bool soft_sync = false;
        bool cb_registered = false;
        bool fake_write = false;
+       struct range_lock *range = &ofd_info(env)->fti_write_range;
 
        ENTRY;
 
@@ -1384,6 +1406,10 @@ out_stop:
                dt_commit_async(env, ofd->ofd_osd);
 
 out:
+       if (info->fti_range_locked) {
+               range_unlock(&fo->ofo_write_tree, range);
+               info->fti_range_locked = 0;
+       }
        dt_bufs_put(env, o, lnb, niocount);
        ofd_object_put(env, fo);
        if (granted > 0)
index 89bf4aa..12abf31 100644 (file)
@@ -7,8 +7,7 @@ ldlm_objs += $(LDLM)ldlm_resource.o $(LDLM)ldlm_lib.o
 ldlm_objs += $(LDLM)ldlm_plain.o $(LDLM)ldlm_extent.o
 ldlm_objs += $(LDLM)ldlm_request.o $(LDLM)ldlm_lockd.o
 ldlm_objs += $(LDLM)ldlm_flock.o $(LDLM)ldlm_inodebits.o
-ldlm_objs += $(LDLM)ldlm_pool.o $(LDLM)interval_tree.o
-ldlm_objs += $(LDLM)ldlm_reclaim.o
+ldlm_objs += $(LDLM)ldlm_pool.o $(LDLM)ldlm_reclaim.o
 
 target_objs := $(TARGET)tgt_main.o $(TARGET)tgt_lastrcvd.o
 target_objs += $(TARGET)tgt_handler.o $(TARGET)out_handler.o
@@ -43,9 +42,6 @@ ldlm_%.c: @LUSTRE@/ldlm/ldlm_%.c
 l_lock.c: @LUSTRE@/ldlm/l_lock.c
        ln -sf $< $@
 
-interval_tree.c: @LUSTRE@/ldlm/interval_tree.c
-       ln -sf $< $@
-
 tgt_%.c: @LUSTRE@/target/tgt_%.c
        ln -sf $< $@
 
index 8eefa63..cadc356 100644 (file)
@@ -44,4 +44,4 @@ if GSS
 SUBDIRS = gss
 endif
 
-MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ ldlm_*.c l_lock.c interval_tree.c
+MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ ldlm_*.c l_lock.c
index 45e3b30..340d491 100644 (file)
@@ -2700,6 +2700,8 @@ skip_transfer:
                }
        }
 
+       OBD_FAIL_TIMEOUT(OBD_FAIL_OST_BRW_PAUSE_BULK2, cfs_fail_val);
+
 out_commitrw:
        /* Must commit after prep above in all cases */
        rc = obd_commitrw(tsi->tsi_env, OBD_BRW_WRITE, exp, &repbody->oa,
index ab8fcfe..675f616 100755 (executable)
@@ -3107,6 +3107,51 @@ test_147() {
 }
 run_test 147 "Check client reconnect"
 
+test_148() {
+       local wce_param="obdfilter.$FSNAME-OST0000.writethrough_cache_enable"
+       local p="$TMP/$TESTSUITE-$TESTNAME.parameters"
+       local amc=$(at_max_get client)
+       local amo=$(at_max_get ost1)
+       local timeout
+
+       at_max_set 0 client
+       at_max_set 0 ost1
+       timeout=$(request_timeout client)
+
+       [ "$(facet_fstype ost1)" = "ldiskfs" ] && {
+               # save old r/o cache settings
+               save_lustre_params ost1 $wce_param > $p
+
+               # disable r/o cache
+               do_facet ost1 "$LCTL set_param -n $wce_param=0"
+       }
+
+       $LFS setstripe -i 0 -c 1 $DIR/$tfile
+       dd if=/dev/zero of=$DIR/$tfile bs=4096 count=1 oflag=direct
+       cp $DIR/$tfile $TMP/$tfile
+       #define OBD_FAIL_OST_BRW_PAUSE_BULK2     0x227
+       do_facet ost1 $LCTL set_param fail_loc=0x80000227
+       do_facet ost1 $LCTL set_param fail_val=$((timeout+2))
+       dd if=/dev/urandom of=$DIR/$tfile bs=4096 count=1 conv=notrunc,fdatasync
+       dd if=/dev/zero of=$DIR/$tfile bs=4096 count=1 conv=notrunc,fdatasync
+       sleep 2
+       cancel_lru_locks osc
+       cmp -b $DIR/$tfile $TMP/$tfile || error "wrong data"
+
+       rm -f $DIR/$tfile $TMP/$tfile
+
+       at_max_set $amc client
+       at_max_set $amo ost1
+
+       [ "$(facet_fstype ost1)" = "ldiskfs" ] && {
+               # restore initial r/o cache settings
+               restore_lustre_params < $p
+       }
+
+       return 0
+}
+run_test 148 "data corruption through resend"
+
 complete $SECONDS
 check_and_cleanup_lustre
 exit_status