From b9e1bb635039c6d2d985754a9a029c9d5c20b569 Mon Sep 17 00:00:00 2001
From: Alexander Boyko <alexander.boyko@seagate.com>
Date: Tue, 29 Mar 2016 14:39:47 +0300
Subject: [PATCH] LU-7934 osp: fix tr->otr_next_id overflow

The tracker next_id and current_id u32 type was based on
max llog records. But llog use cyclic store for records, so
llog could store infinite number of records and limited by
max number at moument of time. The u32 type could be
overflowed easy if a server isn`t rebooted.
osp_sync_id_get()) snx11126-OST0045-osc-MDT0000: next 0,
	last synced 4294967205
This fix changes type u32 to u64 for *id. Now, we store only
low part current_id to llog record header id. And restore the
full u64 from record header later. It is possible because
llog catalog can store 64768^2 and it is less than u32 max value.

The patch adds test to check u32 overflow for otr_next_id field
of osp_id_tracker.

Recreate the next assertion
LustreError: 185667:0:(osp_sync.c:1544:osp_sync_id_get())
	snx11126-OST0045-osc-MDT0000: next 0, last synced 4294967205
LustreError: 231396:0:(osp_sync.c:1545:osp_sync_id_get()) LBUG

Seagate-bug-id: MRP-3367
Signed-off-by: Alexander Boyko <alexander.boyko@seagate.com>
Change-Id: I89d70ecb068f8d0b5a1e1ac35b85a4b6e53211e5
Reviewed-on: http://review.whamcloud.com/19190
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Andrew Perepechko <andrew.perepechko@seagate.com>
Reviewed-by: Alex Zhuravlev <alexey.zhuravlev@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
---
 lustre/include/obd_support.h  |  1 +
 lustre/osp/osp_dev.c          |  2 +-
 lustre/osp/osp_internal.h     | 12 +++++------
 lustre/osp/osp_sync.c         | 47 ++++++++++++++++++++++++++++++++-----------
 lustre/tests/replay-single.sh | 17 ++++++++++++++++
 5 files changed, 60 insertions(+), 19 deletions(-)

diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h
index 81a32c7..a6a83d9 100644
--- a/lustre/include/obd_support.h
+++ b/lustre/include/obd_support.h
@@ -250,6 +250,7 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_MDS_LLOG_CREATE_FAILED2 0x15b
 #define OBD_FAIL_MDS_FLD_LOOKUP			0x15c
 #define OBD_FAIL_MDS_INTENT_DELAY		0x160
+#define OBD_FAIL_MDS_TRACK_OVERFLOW	 0x162
 
 /* layout lock */
 #define OBD_FAIL_MDS_NO_LL_GETATTR	 0x170
diff --git a/lustre/osp/osp_dev.c b/lustre/osp/osp_dev.c
index 505e484..329da7b 100644
--- a/lustre/osp/osp_dev.c
+++ b/lustre/osp/osp_dev.c
@@ -811,7 +811,7 @@ static int osp_sync(const struct lu_env *env, struct dt_device *dev)
 	if (rc != 0)
 		GOTO(out, rc);
 
-	CDEBUG(D_CACHE, "%s: id: used %lu, processed %lu\n",
+	CDEBUG(D_CACHE, "%s: id: used %lu, processed "LPU64"\n",
 	       d->opd_obd->obd_name, id, d->opd_syn_last_processed_id);
 
 	/* wait till all-in-line are processed */
diff --git a/lustre/osp/osp_internal.h b/lustre/osp/osp_internal.h
index 88e2777..3714dc2 100644
--- a/lustre/osp/osp_internal.h
+++ b/lustre/osp/osp_internal.h
@@ -55,8 +55,8 @@
  */
 struct osp_id_tracker {
 	spinlock_t		 otr_lock;
-	__u32			 otr_next_id;
-	__u32			 otr_committed_id;
+	__u64			 otr_next_id;
+	__u64			 otr_committed_id;
 	/* callback is register once per diskfs -- that's the whole point */
 	struct dt_txn_callback	 otr_tx_cb;
 	/* single node can run many clusters */
@@ -215,12 +215,12 @@ struct osp_device {
 	/* osd api's commit cb control structure */
 	struct dt_txn_callback		 opd_syn_txn_cb;
 	/* last used change number -- semantically similar to transno */
-	unsigned long			 opd_syn_last_used_id;
+	__u64				 opd_syn_last_used_id;
 	/* last committed change number -- semantically similar to
 	 * last_committed */
-	unsigned long			 opd_syn_last_committed_id;
+	__u64				 opd_syn_last_committed_id;
 	/* last processed (taken from llog) id */
-	unsigned long			 opd_syn_last_processed_id;
+	__u64				 opd_syn_last_processed_id;
 	struct osp_id_tracker		*opd_syn_tracker;
 	struct list_head		 opd_syn_ontrack;
 	/* stop processing new requests until barrier=0 */
@@ -420,7 +420,7 @@ static inline struct osp_thread_info *osp_env_info(const struct lu_env *env)
 }
 
 struct osp_txn_info {
-	__u32   oti_current_id;
+	__u64   oti_current_id;
 };
 
 extern struct lu_context_key osp_txn_key;
diff --git a/lustre/osp/osp_sync.c b/lustre/osp/osp_sync.c
index 2c74443..582296f 100644
--- a/lustre/osp/osp_sync.c
+++ b/lustre/osp/osp_sync.c
@@ -50,7 +50,7 @@
 
 static int osp_sync_id_traction_init(struct osp_device *d);
 static void osp_sync_id_traction_fini(struct osp_device *d);
-static __u32 osp_sync_id_get(struct osp_device *d, __u32 id);
+static __u64 osp_sync_id_get(struct osp_device *d, __u64 id);
 static void osp_sync_remove_from_tracker(struct osp_device *d);
 
 /*
@@ -238,6 +238,25 @@ void __osp_sync_check_for_work(struct osp_device *d)
 	osp_sync_check_for_work(d);
 }
 
+static inline __u64 osp_sync_correct_id(struct osp_device *d,
+					struct llog_rec_hdr *rec)
+{
+	/*
+	 * llog use cyclic store with 32 bit lrh_id
+	 * so overflow lrh_id is possible. Range between
+	 * last_processed and last_committed is less than
+	 * 64745 ^ 2 and less than 2^32 - 1
+	 */
+	__u64 correct_id = d->opd_syn_last_committed_id;
+
+	if ((correct_id & 0xffffffffULL) < rec->lrh_id)
+		correct_id -= 0x100000000ULL;
+
+	correct_id &= ~0xffffffffULL;
+	correct_id |= rec->lrh_id;
+
+	return correct_id;
+}
 /**
  * Check and return ready-for-new status.
  *
@@ -271,7 +290,8 @@ static inline int osp_sync_can_process_new(struct osp_device *d,
 		return 1;
 	if (d->opd_syn_changes == 0)
 		return 0;
-	if (rec == NULL || rec->lrh_id <= d->opd_syn_last_committed_id)
+	if (rec == NULL ||
+	    osp_sync_correct_id(d, rec) <= d->opd_syn_last_committed_id)
 		return 1;
 	return 0;
 }
@@ -402,8 +422,7 @@ static int osp_sync_add_rec(const struct lu_env *env, struct osp_device *d,
 	LASSERT(txn);
 
 	txn->oti_current_id = osp_sync_id_get(d, txn->oti_current_id);
-	osi->osi_hdr.lrh_id = txn->oti_current_id;
-
+	osi->osi_hdr.lrh_id = (txn->oti_current_id & 0xffffffffULL);
 	ctxt = llog_get_context(d->opd_obd, LLOG_MDS_OST_ORIG_CTXT);
 	if (ctxt == NULL)
 		RETURN(-ENOMEM);
@@ -915,13 +934,14 @@ static void osp_sync_process_record(const struct lu_env *env,
 	 * we should decrease changes and bump last_processed_id.
 	 */
 	if (d->opd_syn_prev_done) {
+		__u64 correct_id = osp_sync_correct_id(d, rec);
 		LASSERT(d->opd_syn_changes > 0);
-		LASSERT(rec->lrh_id <= d->opd_syn_last_committed_id);
+		LASSERT(correct_id <= d->opd_syn_last_committed_id);
 		/* NOTE: it's possible to meet same id if
 		 * OST stores few stripes of same file
 		 */
-		if (rec->lrh_id > d->opd_syn_last_processed_id) {
-			d->opd_syn_last_processed_id = rec->lrh_id;
+		if (correct_id > d->opd_syn_last_processed_id) {
+			d->opd_syn_last_processed_id = correct_id;
 			wake_up(&d->opd_syn_barrier_waitq);
 		}
 		d->opd_syn_changes--;
@@ -1487,7 +1507,7 @@ static void osp_sync_tracker_commit_cb(struct thandle *th, void *cookie)
 
 	spin_lock(&tr->otr_lock);
 	if (likely(txn->oti_current_id > tr->otr_committed_id)) {
-		CDEBUG(D_OTHER, "committed: %u -> %u\n",
+		CDEBUG(D_OTHER, "committed: "LPU64" -> "LPU64"\n",
 		       tr->otr_committed_id, txn->oti_current_id);
 		tr->otr_committed_id = txn->oti_current_id;
 
@@ -1607,14 +1627,14 @@ static void osp_sync_id_traction_fini(struct osp_device *d)
  * Generates a new ID using the tracker associated with the given OSP device
  * \a d, if the given ID \a id is non-zero. Unconditially adds OSP device to
  * the wakeup list, so OSP won't miss when a transaction using the ID is
- * committed. Notice ID is 32bit, but llog doesn't support >2^32 records anyway.
+ * committed.
  *
  * \param[in] d		OSP device
  * \param[in] id	0 or ID generated previously
  *
  * \retval		ID the caller should use
  */
-static __u32 osp_sync_id_get(struct osp_device *d, __u32 id)
+static __u64 osp_sync_id_get(struct osp_device *d, __u64 id)
 {
 	struct osp_id_tracker *tr;
 
@@ -1623,9 +1643,12 @@ static __u32 osp_sync_id_get(struct osp_device *d, __u32 id)
 
 	/* XXX: we can improve this introducing per-cpu preallocated ids? */
 	spin_lock(&tr->otr_lock);
+	if (OBD_FAIL_CHECK(OBD_FAIL_MDS_TRACK_OVERFLOW))
+		tr->otr_next_id = 0xfffffff0;
+
 	if (unlikely(tr->otr_next_id <= d->opd_syn_last_used_id)) {
 		spin_unlock(&tr->otr_lock);
-		CERROR("%s: next %u, last synced %lu\n",
+		CERROR("%s: next "LPU64", last synced "LPU64"\n",
 		       d->opd_obd->obd_name, tr->otr_next_id,
 		       d->opd_syn_last_used_id);
 		LBUG();
@@ -1638,7 +1661,7 @@ static __u32 osp_sync_id_get(struct osp_device *d, __u32 id)
 	if (list_empty(&d->opd_syn_ontrack))
 		list_add(&d->opd_syn_ontrack, &tr->otr_wakeup_list);
 	spin_unlock(&tr->otr_lock);
-	CDEBUG(D_OTHER, "new id %u\n", (unsigned) id);
+	CDEBUG(D_OTHER, "new id "LPU64"\n", id);
 
 	return id;
 }
diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh
index 0df8804..31586b2 100755
--- a/lustre/tests/replay-single.sh
+++ b/lustre/tests/replay-single.sh
@@ -3598,6 +3598,23 @@ test_102d() {
 }
 run_test 102d "check replay & reconstruction with multiple mod RPCs in flight"
 
+test_103() {
+	remote_mds_nodsh && skip "remote MDS with nodsh" && return
+#define OBD_FAIL_MDS_TRACK_OVERFLOW 0x162
+	do_facet mds1 $LCTL set_param fail_loc=0x80000162
+
+	mkdir -p $DIR/$tdir
+	createmany -o $DIR/$tdir/t- 30 ||
+		error "create files on remote directory failed"
+	sync
+	rm -rf $DIR/$tdir/t-*
+	sync
+#MDS should crash with tr->otr_next_id overflow
+	fail mds1
+}
+run_test 103 "Check otr_next_id overflow"
+
+
 check_striped_dir_110()
 {
 	$CHECKSTAT -t dir $DIR/$tdir/striped_dir ||
-- 
1.8.3.1