From 35679a730bf0b7a8d4ce84cadc3ecc7c289ef491 Mon Sep 17 00:00:00 2001 From: Andrew Perepechko Date: Mon, 9 Dec 2019 20:13:50 +0300 Subject: [PATCH] LU-10958 ofd: data corruption due to RPC reordering Without read-only cache, it is possible that a client resends a BRW RPC, receives a reply from the original BRW RPC, modifies the same data and sends a new BRW RPC, however, because of RPC reordering stale data gets to disk. Let's use range locking to protect against this race. Change-Id: I35cbf95594601eacfc5f108b14e4c447962b0bbf Signed-off-by: Andrew Perepechko Cray-bug-id: LUS-5578,LUS-8943 Reviewed-on: https://review.whamcloud.com/32281 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Alexey Lyashkov Reviewed-by: Oleg Drokin --- lustre/include/Makefile.am | 1 + lustre/{llite => include}/range_lock.h | 0 lustre/ldlm/Makefile.am | 2 +- lustre/llite/Makefile.in | 4 +-- lustre/llite/llite_internal.h | 2 +- lustre/obdclass/Makefile.in | 2 ++ lustre/{ldlm => obdclass}/interval_tree.c | 1 + lustre/{llite => obdclass}/range_lock.c | 6 ++++- lustre/ofd/ofd_dev.c | 1 + lustre/ofd/ofd_internal.h | 4 +++ lustre/ofd/ofd_io.c | 26 ++++++++++++++++++ lustre/ptlrpc/Makefile.in | 6 +---- lustre/ptlrpc/autoMakefile.am | 2 +- lustre/target/tgt_handler.c | 2 ++ lustre/tests/recovery-small.sh | 45 +++++++++++++++++++++++++++++++ 15 files changed, 93 insertions(+), 11 deletions(-) rename lustre/{llite => include}/range_lock.h (100%) rename lustre/{ldlm => obdclass}/interval_tree.c (99%) rename lustre/{llite => obdclass}/range_lock.c (97%) diff --git a/lustre/include/Makefile.am b/lustre/include/Makefile.am index 16a6cab..584b646 100644 --- a/lustre/include/Makefile.am +++ b/lustre/include/Makefile.am @@ -95,6 +95,7 @@ EXTRA_DIST = \ obd_support.h \ obd_target.h \ obj_update.h \ + range_lock.h \ upcall_cache.h \ lustre_kernelcomm.h \ seq_range.h diff --git a/lustre/llite/range_lock.h b/lustre/include/range_lock.h similarity index 100% rename from lustre/llite/range_lock.h rename to lustre/include/range_lock.h diff --git a/lustre/ldlm/Makefile.am b/lustre/ldlm/Makefile.am index 610a53f..092ef28 100644 --- a/lustre/ldlm/Makefile.am +++ b/lustre/ldlm/Makefile.am @@ -40,4 +40,4 @@ MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ EXTRA_DIST = ldlm_extent.c ldlm_flock.c ldlm_internal.h ldlm_lib.c \ ldlm_lock.c ldlm_lockd.c ldlm_plain.c ldlm_request.c \ ldlm_resource.c l_lock.c ldlm_inodebits.c ldlm_pool.c \ - interval_tree.c ldlm_reclaim.c + ldlm_reclaim.c diff --git a/lustre/llite/Makefile.in b/lustre/llite/Makefile.in index 3a5293f..dd82b4d 100644 --- a/lustre/llite/Makefile.in +++ b/lustre/llite/Makefile.in @@ -7,9 +7,9 @@ lustre-objs += glimpse.o lustre-objs += lcommon_cl.o lustre-objs += lcommon_misc.o lustre-objs += vvp_dev.o vvp_page.o vvp_io.o vvp_object.o -lustre-objs += range_lock.o pcc.o crypto.o +lustre-objs += pcc.o crypto.o EXTRA_DIST := $(lustre-objs:.o=.c) xattr.c rw26.c super25.c -EXTRA_DIST += llite_internal.h vvp_internal.h range_lock.h pcc.h +EXTRA_DIST += llite_internal.h vvp_internal.h pcc.h @INCLUDE_RULES@ diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h index 1a061b0..45e6864 100644 --- a/lustre/llite/llite_internal.h +++ b/lustre/llite/llite_internal.h @@ -46,9 +46,9 @@ #include #include #include +#include #include "vvp_internal.h" -#include "range_lock.h" #include "pcc.h" #ifndef FMODE_EXEC diff --git a/lustre/obdclass/Makefile.in b/lustre/obdclass/Makefile.in index f1925e9..50b4c46 100644 --- a/lustre/obdclass/Makefile.in +++ b/lustre/obdclass/Makefile.in @@ -13,6 +13,7 @@ obdclass-all-objs += linkea.o obdclass-all-objs += kernelcomm.o jobid.o obdclass-all-objs += integrity.o obd_cksum.o obdclass-all-objs += lu_tgt_descs.o +obdclass-all-objs += range_lock.o interval_tree.o @SERVER_TRUE@obdclass-all-objs += acl.o @SERVER_TRUE@obdclass-all-objs += idmap.o @@ -31,6 +32,7 @@ EXTRA_PRE_CFLAGS := -I@LINUX@/fs -I@LDISKFS_DIR@ -I@LDISKFS_DIR@/ldiskfs EXTRA_DIST = $(obdclass-all-objs:.o=.c) llog_test.c llog_internal.h EXTRA_DIST += cl_internal.h local_storage.h +EXTRA_DIST += range_lock.c interval_tree.c @SERVER_FALSE@EXTRA_DIST += acl.c @SERVER_FALSE@EXTRA_DIST += idmap.c diff --git a/lustre/ldlm/interval_tree.c b/lustre/obdclass/interval_tree.c similarity index 99% rename from lustre/ldlm/interval_tree.c rename to lustre/obdclass/interval_tree.c index 1b1bc3a..1a9d897 100644 --- a/lustre/ldlm/interval_tree.c +++ b/lustre/obdclass/interval_tree.c @@ -780,3 +780,4 @@ void interval_expand(struct interval_node *root, ext->end = interval_expand_high(root, ext->end); LASSERT(interval_is_overlapped(root, ext) == 0); } +EXPORT_SYMBOL(interval_expand); diff --git a/lustre/llite/range_lock.c b/lustre/obdclass/range_lock.c similarity index 97% rename from lustre/llite/range_lock.c rename to lustre/obdclass/range_lock.c index 7a4c9c4..140337f 100644 --- a/lustre/llite/range_lock.c +++ b/lustre/obdclass/range_lock.c @@ -36,8 +36,8 @@ #ifdef HAVE_SCHED_HEADERS #include #endif -#include "range_lock.h" #include +#include /** * Initialize a range lock tree @@ -53,6 +53,7 @@ void range_lock_tree_init(struct range_lock_tree *tree) tree->rlt_sequence = 0; spin_lock_init(&tree->rlt_lock); } +EXPORT_SYMBOL(range_lock_tree_init); /** * Intialize a range lock node @@ -82,6 +83,7 @@ int range_lock_init(struct range_lock *lock, __u64 start, __u64 end) lock->rl_sequence = 0; return rc; } +EXPORT_SYMBOL(range_lock_init); static inline struct range_lock *next_lock(struct range_lock *lock) { @@ -168,6 +170,7 @@ void range_unlock(struct range_lock_tree *tree, struct range_lock *lock) EXIT; } +EXPORT_SYMBOL(range_unlock); /** * Helper function of range_lock() @@ -245,3 +248,4 @@ int range_lock(struct range_lock_tree *tree, struct range_lock *lock) out: RETURN(rc); } +EXPORT_SYMBOL(range_lock); diff --git a/lustre/ofd/ofd_dev.c b/lustre/ofd/ofd_dev.c index 8b76814..73364a0 100644 --- a/lustre/ofd/ofd_dev.c +++ b/lustre/ofd/ofd_dev.c @@ -545,6 +545,7 @@ static struct lu_object *ofd_object_alloc(const struct lu_env *env, lu_object_init(o, h, d); lu_object_add_top(h, o); o->lo_ops = &ofd_obj_ops; + range_lock_tree_init(&of->ofo_write_tree); RETURN(o); } else { RETURN(NULL); diff --git a/lustre/ofd/ofd_internal.h b/lustre/ofd/ofd_internal.h index 8bdb4c9..ce6561e 100644 --- a/lustre/ofd/ofd_internal.h +++ b/lustre/ofd/ofd_internal.h @@ -38,6 +38,7 @@ #include #include #include +#include #define OFD_INIT_OBJID 0 #define OFD_PRECREATE_BATCH_DEFAULT (OBJ_SUBDIR_COUNT * 4) @@ -188,6 +189,7 @@ struct ofd_object { time64_t ofo_atime_ondisk; unsigned int ofo_pfid_checking:1, ofo_pfid_verified:1; + struct range_lock_tree ofo_write_tree; }; static inline struct ofd_object *ofd_obj(struct lu_object *o) @@ -286,6 +288,8 @@ struct ofd_thread_info { struct lfsck_req_local fti_lrl; struct obd_connect_data fti_ocd; }; + struct range_lock fti_write_range; + unsigned fti_range_locked:1; }; extern void target_recovery_fini(struct obd_device *obd); diff --git a/lustre/ofd/ofd_io.c b/lustre/ofd/ofd_io.c index 90d1807..a3c2ac3 100644 --- a/lustre/ofd/ofd_io.c +++ b/lustre/ofd/ofd_io.c @@ -688,6 +688,7 @@ static int ofd_preprw_write(const struct lu_env *env, struct obd_export *exp, int maxlnb = *nr_local; __u64 begin, end; ktime_t kstart = ktime_get(); + struct range_lock *range = &ofd_info(env)->fti_write_range; ENTRY; LASSERT(env != NULL); @@ -842,6 +843,26 @@ static int ofd_preprw_write(const struct lu_env *env, struct obd_export *exp, obj->ioo_bufcnt, WRITE); + /* + * Reordering precautions: make sure that request processing that + * was able to receive its bulk data should not get reordered with + * overlapping BRW requests, e.g. + * 1) BRW1 sent, bulk data received, but disk I/O delayed + * 2) BRW1 resent and fully processed + * 3) the page was unlocked on the client and its writeback bit reset + * 4) BRW2 sent and fully processed + * 5) BRW1 processing wakes up and writes stale data to disk + * If on step 1 bulk data was not received, client resend will invalidate + * its bulk descriptor and the RPC will be dropped due to failed bulk + * transfer, which is just fine. + */ + range_lock_init(range, + rnb[0].rnb_offset, + rnb[obj->ioo_bufcnt - 1].rnb_offset + + rnb[obj->ioo_bufcnt - 1].rnb_len - 1); + range_lock(&fo->ofo_write_tree, range); + ofd_info(env)->fti_range_locked = 1; + ofd_counter_incr(exp, LPROC_OFD_STATS_WRITE_BYTES, jobid, tot_bytes); ofd_counter_incr(exp, LPROC_OFD_STATS_WRITE, jobid, ktime_us_delta(ktime_get(), kstart)); @@ -1220,6 +1241,7 @@ ofd_commitrw_write(const struct lu_env *env, struct obd_export *exp, bool soft_sync = false; bool cb_registered = false; bool fake_write = false; + struct range_lock *range = &ofd_info(env)->fti_write_range; ENTRY; @@ -1384,6 +1406,10 @@ out_stop: dt_commit_async(env, ofd->ofd_osd); out: + if (info->fti_range_locked) { + range_unlock(&fo->ofo_write_tree, range); + info->fti_range_locked = 0; + } dt_bufs_put(env, o, lnb, niocount); ofd_object_put(env, fo); if (granted > 0) diff --git a/lustre/ptlrpc/Makefile.in b/lustre/ptlrpc/Makefile.in index 89bf4aa..12abf31 100644 --- a/lustre/ptlrpc/Makefile.in +++ b/lustre/ptlrpc/Makefile.in @@ -7,8 +7,7 @@ ldlm_objs += $(LDLM)ldlm_resource.o $(LDLM)ldlm_lib.o ldlm_objs += $(LDLM)ldlm_plain.o $(LDLM)ldlm_extent.o ldlm_objs += $(LDLM)ldlm_request.o $(LDLM)ldlm_lockd.o ldlm_objs += $(LDLM)ldlm_flock.o $(LDLM)ldlm_inodebits.o -ldlm_objs += $(LDLM)ldlm_pool.o $(LDLM)interval_tree.o -ldlm_objs += $(LDLM)ldlm_reclaim.o +ldlm_objs += $(LDLM)ldlm_pool.o $(LDLM)ldlm_reclaim.o target_objs := $(TARGET)tgt_main.o $(TARGET)tgt_lastrcvd.o target_objs += $(TARGET)tgt_handler.o $(TARGET)out_handler.o @@ -43,9 +42,6 @@ ldlm_%.c: @LUSTRE@/ldlm/ldlm_%.c l_lock.c: @LUSTRE@/ldlm/l_lock.c ln -sf $< $@ -interval_tree.c: @LUSTRE@/ldlm/interval_tree.c - ln -sf $< $@ - tgt_%.c: @LUSTRE@/target/tgt_%.c ln -sf $< $@ diff --git a/lustre/ptlrpc/autoMakefile.am b/lustre/ptlrpc/autoMakefile.am index 8eefa63..cadc356 100644 --- a/lustre/ptlrpc/autoMakefile.am +++ b/lustre/ptlrpc/autoMakefile.am @@ -44,4 +44,4 @@ if GSS SUBDIRS = gss endif -MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ ldlm_*.c l_lock.c interval_tree.c +MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ ldlm_*.c l_lock.c diff --git a/lustre/target/tgt_handler.c b/lustre/target/tgt_handler.c index 45e3b30..340d491 100644 --- a/lustre/target/tgt_handler.c +++ b/lustre/target/tgt_handler.c @@ -2700,6 +2700,8 @@ skip_transfer: } } + OBD_FAIL_TIMEOUT(OBD_FAIL_OST_BRW_PAUSE_BULK2, cfs_fail_val); + out_commitrw: /* Must commit after prep above in all cases */ rc = obd_commitrw(tsi->tsi_env, OBD_BRW_WRITE, exp, &repbody->oa, diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index ab8fcfe..675f616 100755 --- a/lustre/tests/recovery-small.sh +++ b/lustre/tests/recovery-small.sh @@ -3107,6 +3107,51 @@ test_147() { } run_test 147 "Check client reconnect" +test_148() { + local wce_param="obdfilter.$FSNAME-OST0000.writethrough_cache_enable" + local p="$TMP/$TESTSUITE-$TESTNAME.parameters" + local amc=$(at_max_get client) + local amo=$(at_max_get ost1) + local timeout + + at_max_set 0 client + at_max_set 0 ost1 + timeout=$(request_timeout client) + + [ "$(facet_fstype ost1)" = "ldiskfs" ] && { + # save old r/o cache settings + save_lustre_params ost1 $wce_param > $p + + # disable r/o cache + do_facet ost1 "$LCTL set_param -n $wce_param=0" + } + + $LFS setstripe -i 0 -c 1 $DIR/$tfile + dd if=/dev/zero of=$DIR/$tfile bs=4096 count=1 oflag=direct + cp $DIR/$tfile $TMP/$tfile + #define OBD_FAIL_OST_BRW_PAUSE_BULK2 0x227 + do_facet ost1 $LCTL set_param fail_loc=0x80000227 + do_facet ost1 $LCTL set_param fail_val=$((timeout+2)) + dd if=/dev/urandom of=$DIR/$tfile bs=4096 count=1 conv=notrunc,fdatasync + dd if=/dev/zero of=$DIR/$tfile bs=4096 count=1 conv=notrunc,fdatasync + sleep 2 + cancel_lru_locks osc + cmp -b $DIR/$tfile $TMP/$tfile || error "wrong data" + + rm -f $DIR/$tfile $TMP/$tfile + + at_max_set $amc client + at_max_set $amo ost1 + + [ "$(facet_fstype ost1)" = "ldiskfs" ] && { + # restore initial r/o cache settings + restore_lustre_params < $p + } + + return 0 +} +run_test 148 "data corruption through resend" + complete $SECONDS check_and_cleanup_lustre exit_status -- 1.8.3.1