From cc10922919325c212ae98a69d63328c0efbd4f83 Mon Sep 17 00:00:00 2001 From: Alexander Boyko Date: Thu, 12 Dec 2019 02:59:41 -0500 Subject: [PATCH] LU-13069 obdclass: don't skip records for wrapped catalog osp_sync_thread() uses opd_sync_last_catalog_idx as the start point of catalog processing. It is also used in llog_cat_process_cb() to skip records from processing. When the catalog is wrapped, processing starts from the second part of the catalog and then continues with the first part. So, the first part would be skipped in llog_cat_process_cb() based on lpd_startcat. osp_sync_thread() restarts the processing loop with opd_sync_last_catalog_idx. For a wrapped catalog it increases the last index, and llog_process_thread() then increases it once more. This leads to skipped records in the catalog, which would never be processed. The patch fixes these issues. It also adds sanity tests 135 and 136 as regression tests. Signed-off-by: Alexander Boyko Cray-bug-id: LUS-8053,LUS-8236 Change-Id: Ic75af1bf4468b9ef2de32cbf6d834b6a81376e88 Reviewed-on: https://review.whamcloud.com/36996 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andriy Skulysh Reviewed-by: Alexander Zarochentsev Reviewed-by: Oleg Drokin --- lustre/include/obd_support.h | 2 ++ lustre/obdclass/llog.c | 13 +++++++-- lustre/obdclass/llog_cat.c | 8 +++++ lustre/osp/osp_sync.c | 15 ++++++---- lustre/tests/sanity.sh | 69 +++++++++++++++++++++++++++++++++++++++++++- 5 files changed, 97 insertions(+), 10 deletions(-) diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 2a78d5a..a728bef 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -555,6 +555,8 @@ extern char obd_jobid_var[]; #define OBD_FAIL_FORCE_GC_THREAD 0x1316 #define OBD_FAIL_LLOG_PROCESS_TIMEOUT 0x1317 #define OBD_FAIL_LLOG_PURGE_DELAY 0x1318 +#define OBD_FAIL_PLAIN_RECORDS 0x1319 +#define OBD_FAIL_CATALOG_FULL_CHECK 0x131a #define OBD_FAIL_LLITE 0x1400 #define OBD_FAIL_LLITE_FAULT_TRUNC_RACE 0x1401 diff --git a/lustre/obdclass/llog.c 
b/lustre/obdclass/llog.c index 5ae88a2..c1fa40b 100644 --- a/lustre/obdclass/llog.c +++ b/lustre/obdclass/llog.c @@ -772,10 +772,12 @@ int llog_process_or_fork(const struct lu_env *env, struct llog_handle *loghandle, llog_cb_t cb, void *data, void *catdata, bool fork) { - struct llog_process_info *lpi; - int rc; + struct llog_process_info *lpi; + struct llog_process_data *d = data; + struct llog_process_cat_data *cd = catdata; + int rc; - ENTRY; + ENTRY; OBD_ALLOC_PTR(lpi); if (lpi == NULL) { @@ -787,6 +789,11 @@ int llog_process_or_fork(const struct lu_env *env, lpi->lpi_cbdata = data; lpi->lpi_catdata = catdata; + CDEBUG(D_OTHER, "Processing "DFID" flags 0x%03x startcat %d startidx %d first_idx %d last_idx %d\n", + PFID(&loghandle->lgh_id.lgl_oi.oi_fid), + loghandle->lgh_hdr->llh_flags, d ? d->lpd_startcat : -1, + d ? d->lpd_startidx : -1, cd ? cd->lpcd_first_idx : -1, + cd ? cd->lpcd_last_idx : -1); if (fork) { struct task_struct *task; diff --git a/lustre/obdclass/llog_cat.c b/lustre/obdclass/llog_cat.c index c5b15f1..c28ca74 100644 --- a/lustre/obdclass/llog_cat.c +++ b/lustre/obdclass/llog_cat.c @@ -206,6 +206,13 @@ static int llog_cat_new_log(const struct lu_env *env, if (freespace > (128 << 20)) loghandle->lgh_max_size = 128 << 20; } + if (unlikely(OBD_FAIL_PRECHECK(OBD_FAIL_PLAIN_RECORDS) || + OBD_FAIL_PRECHECK(OBD_FAIL_CATALOG_FULL_CHECK))) { + // limit the numer of plain records for test + loghandle->lgh_max_size = loghandle->lgh_hdr_size + + cfs_fail_val * 64; + } + rc = 0; out: @@ -926,6 +933,7 @@ int llog_cat_process_or_fork(const struct lu_env *env, * catalog bottom. 
*/ startcat = 0; + d.lpd_startcat = 0; if (rc != 0) RETURN(rc); } diff --git a/lustre/osp/osp_sync.c b/lustre/osp/osp_sync.c index 2907c70..f55672e 100644 --- a/lustre/osp/osp_sync.c +++ b/lustre/osp/osp_sync.c @@ -40,6 +40,7 @@ #define DEBUG_SUBSYSTEM S_MDS #include +#include #include #include #include "osp_internal.h" @@ -1176,6 +1177,9 @@ static int osp_sync_process_queues(const struct lu_env *env, llh = NULL; rec = NULL; } + if (OBD_FAIL_PRECHECK(OBD_FAIL_CATALOG_FULL_CHECK) && + cfs_fail_val != 1) + msleep(1 * MSEC_PER_SEC); wait_event_idle(d->opd_sync_waitq, !osp_sync_running(d) || @@ -1275,12 +1279,11 @@ next: /* processing reaches catalog bottom */ if (d->opd_sync_last_catalog_idx == size) d->opd_sync_last_catalog_idx = LLOG_CAT_FIRST; - else if (wrapped) - /* If catalog is wrapped we can`t predict last index of - * processing because lgh_last_idx could be changed. - * Starting form the next one */ - d->opd_sync_last_catalog_idx++; - + /* If catalog is wrapped we can`t predict last index of + * processing because lgh_last_idx could be changed. + * Starting form the next one. 
Index would be increased + * at llog_process_thread + */ } while (rc == 0 && (wrapped || d->opd_sync_last_catalog_idx == LLOG_CAT_FIRST)); diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index ab86f14..c1cb616 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -70,7 +70,7 @@ if [ $LINUX_VERSION_CODE -ge $(version_code 4.14.0) ];then fi # 5 12 (min)" -[ "$SLOW" = "no" ] && EXCEPT_SLOW="27m 64b 68 71 115 300o" +[ "$SLOW" = "no" ] && EXCEPT_SLOW="27m 64b 68 71 115 135 136 300o" if [ "$mds1_FSTYPE" = "zfs" ]; then # bug number for skipped test: @@ -12614,6 +12614,73 @@ test_134b() { } run_test 134b "Server rejects lock request when reaching lock_limit_mb" +test_135() { + remote_mds_nodsh && skip "remote MDS with nodsh" + [[ $MDS1_VERSION -lt $(version_code 2.13.50) ]] && + skip "Need MDS version at least 2.13.50" + local fname + + mkdir -p $DIR/$tdir || error "failed to create $DIR/$tdir" + +#define OBD_FAIL_PLAIN_RECORDS 0x1319 + #set only one record at plain llog + do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1319 fail_val=1 + + #fill already existed plain llog each 64767 + #wrapping whole catalog + createmany -o -u $DIR/$tdir/$tfile- $((64767 * 1)) + + createmany -o $DIR/$tdir/$tfile_ 64700 + for (( i = 0; i < 64700; i = i + 2 )) + do + rm $DIR/$tdir/$tfile_$i & + rm $DIR/$tdir/$tfile_$((i + 1)) & + local pid=$! 
+ wait $pid + done + + #waiting osp synchronization + wait_delete_completed +} +run_test 135 "Race catalog processing" + +test_136() { + remote_mds_nodsh && skip "remote MDS with nodsh" + [[ $MDS1_VERSION -lt $(version_code 2.13.50) ]] && + skip "Need MDS version at least 2.13.50" + local fname + + mkdir -p $DIR/$tdir || error "failed to create $DIR/$tdir" + $SETSTRIPE -c 1 -i 0 $DIR/$tdir || error "failed to set striping" + #set only one record at plain llog +#define OBD_FAIL_CATALOG_FULL_CHECK 0x131a + do_facet $SINGLEMDS $LCTL set_param fail_loc=0x131a fail_val=1 + + #fill already existed 2 plain llogs each 64767 + #wrapping whole catalog + createmany -o -u $DIR/$tdir/$tfile- $((64767 * 1)) + createmany -o -u $DIR/$tdir/$tfile- $((64767 * 3 / 2)) + wait_delete_completed + + createmany -o $DIR/$tdir/$tfile_ 10 + sleep 25 + + do_facet $SINGLEMDS $LCTL set_param fail_val=3 + for (( i = 0; i < 10; i = i + 3 )) + do + rm $DIR/$tdir/$tfile_$i & + rm $DIR/$tdir/$tfile_$((i + 1)) & + local pid=$! + wait $pid + sleep 7 + rm $DIR/$tdir/$tfile_$((i + 2)) & + done + + #waiting osp synchronization + wait_delete_completed +} +run_test 136 "Race catalog processing 2" + test_140() { #bug-17379 [ $PARALLEL == "yes" ] && skip "skip parallel run" -- 1.8.3.1