Whamcloud - gitweb
LU-13069 obdclass: don't skip records for wrapped catalog 96/36996/2
authorAlexander Boyko <c17825@cray.com>
Thu, 12 Dec 2019 07:59:41 +0000 (02:59 -0500)
committerOleg Drokin <green@whamcloud.com>
Fri, 10 Jan 2020 07:41:31 +0000 (07:41 +0000)
osp_sync_thread() uses opd_sync_last_catalog_idx as the starting point of
catalog processing. It is also used in llog_cat_process_cb() to skip
records during processing. When the catalog is wrapped, processing starts
from the second part of the catalog and then continues with the first
part. As a result, the first part would be skipped in
llog_cat_process_cb() based on lpd_startcat.

osp_sync_thread() restarts the processing loop with
opd_sync_last_catalog_idx. For a wrapped catalog it increments the last
index, and llog_process_thread() increments it once more. This leads
to records in the catalog being skipped, so they would never be processed.
The patch fixes these issues.
It also adds sanity tests 135 and 136 as regression tests.

Signed-off-by: Alexander Boyko <c17825@cray.com>
Cray-bug-id: LUS-8053,LUS-8236
Change-Id: Ic75af1bf4468b9ef2de32cbf6d834b6a81376e88
Reviewed-on: https://review.whamcloud.com/36996
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andriy Skulysh <c17819@cray.com>
Reviewed-by: Alexander Zarochentsev <c17826@cray.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/include/obd_support.h
lustre/obdclass/llog.c
lustre/obdclass/llog_cat.c
lustre/osp/osp_sync.c
lustre/tests/sanity.sh

index 2a78d5a..a728bef 100644 (file)
@@ -555,6 +555,8 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_FORCE_GC_THREAD                   0x1316
 #define OBD_FAIL_LLOG_PROCESS_TIMEOUT              0x1317
 #define OBD_FAIL_LLOG_PURGE_DELAY                  0x1318
+#define OBD_FAIL_PLAIN_RECORDS                     0x1319
+#define OBD_FAIL_CATALOG_FULL_CHECK                0x131a
 
 #define OBD_FAIL_LLITE                              0x1400
 #define OBD_FAIL_LLITE_FAULT_TRUNC_RACE             0x1401
index 5ae88a2..c1fa40b 100644 (file)
@@ -772,10 +772,12 @@ int llog_process_or_fork(const struct lu_env *env,
                         struct llog_handle *loghandle,
                         llog_cb_t cb, void *data, void *catdata, bool fork)
 {
-        struct llog_process_info *lpi;
-        int                      rc;
+       struct llog_process_info *lpi;
+       struct llog_process_data *d = data;
+       struct llog_process_cat_data *cd = catdata;
+       int                      rc;
 
-        ENTRY;
+       ENTRY;
 
        OBD_ALLOC_PTR(lpi);
        if (lpi == NULL) {
@@ -787,6 +789,11 @@ int llog_process_or_fork(const struct lu_env *env,
        lpi->lpi_cbdata    = data;
        lpi->lpi_catdata   = catdata;
 
+       CDEBUG(D_OTHER, "Processing "DFID" flags 0x%03x startcat %d startidx %d first_idx %d last_idx %d\n",
+              PFID(&loghandle->lgh_id.lgl_oi.oi_fid),
+              loghandle->lgh_hdr->llh_flags, d ? d->lpd_startcat : -1,
+              d ? d->lpd_startidx : -1, cd ? cd->lpcd_first_idx : -1,
+              cd ? cd->lpcd_last_idx : -1);
        if (fork) {
                struct task_struct *task;
 
index c5b15f1..c28ca74 100644 (file)
@@ -206,6 +206,13 @@ static int llog_cat_new_log(const struct lu_env *env,
                if (freespace > (128 << 20))
                        loghandle->lgh_max_size = 128 << 20;
        }
+       if (unlikely(OBD_FAIL_PRECHECK(OBD_FAIL_PLAIN_RECORDS) ||
+                    OBD_FAIL_PRECHECK(OBD_FAIL_CATALOG_FULL_CHECK))) {
+               // limit the number of plain records for test
+               loghandle->lgh_max_size = loghandle->lgh_hdr_size +
+                      cfs_fail_val * 64;
+       }
+
        rc = 0;
 
 out:
@@ -926,6 +933,7 @@ int llog_cat_process_or_fork(const struct lu_env *env,
                         * catalog bottom.
                         */
                        startcat = 0;
+                       d.lpd_startcat = 0;
                        if (rc != 0)
                                RETURN(rc);
                }
index 2907c70..f55672e 100644 (file)
@@ -40,6 +40,7 @@
 #define DEBUG_SUBSYSTEM S_MDS
 
 #include <linux/kthread.h>
+#include <linux/delay.h>
 #include <lustre_log.h>
 #include <lustre_update.h>
 #include "osp_internal.h"
@@ -1176,6 +1177,9 @@ static int osp_sync_process_queues(const struct lu_env *env,
                        llh = NULL;
                        rec = NULL;
                }
+               if (OBD_FAIL_PRECHECK(OBD_FAIL_CATALOG_FULL_CHECK) &&
+                           cfs_fail_val != 1)
+                       msleep(1 * MSEC_PER_SEC);
 
                wait_event_idle(d->opd_sync_waitq,
                                !osp_sync_running(d) ||
@@ -1275,12 +1279,11 @@ next:
                /* processing reaches catalog bottom */
                if (d->opd_sync_last_catalog_idx == size)
                        d->opd_sync_last_catalog_idx = LLOG_CAT_FIRST;
-               else if (wrapped)
-                       /* If catalog is wrapped we can`t predict last index of
-                        * processing because lgh_last_idx could be changed.
-                        * Starting form the next one */
-                       d->opd_sync_last_catalog_idx++;
-
+               /* If catalog is wrapped we can't predict last index of
+                * processing because lgh_last_idx could be changed.
+                * Starting from the next one. Index would be increased
+                * at llog_process_thread
+                */
        } while (rc == 0 && (wrapped ||
                             d->opd_sync_last_catalog_idx == LLOG_CAT_FIRST));
 
index ab86f14..c1cb616 100644 (file)
@@ -70,7 +70,7 @@ if [ $LINUX_VERSION_CODE -ge $(version_code 4.14.0) ];then
 fi
 
 #                                  5          12          (min)"
-[ "$SLOW" = "no" ] && EXCEPT_SLOW="27m 64b 68 71 115 300o"
+[ "$SLOW" = "no" ] && EXCEPT_SLOW="27m 64b 68 71 115 135 136 300o"
 
 if [ "$mds1_FSTYPE" = "zfs" ]; then
        # bug number for skipped test:
@@ -12614,6 +12614,73 @@ test_134b() {
 }
 run_test 134b "Server rejects lock request when reaching lock_limit_mb"
 
+test_135() { # regression test for LU-13069: records skipped in a wrapped catalog
+       remote_mds_nodsh && skip "remote MDS with nodsh"
+       [[ $MDS1_VERSION -lt $(version_code 2.13.50) ]] &&
+               skip "Need MDS version at least 2.13.50"
+       local fname # NOTE(review): never used in this function
+
+       mkdir -p $DIR/$tdir || error "failed to create $DIR/$tdir"
+
+#define OBD_FAIL_PLAIN_RECORDS 0x1319
+       # allow only one record per plain llog (fail_val=1)
+       do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1319 fail_val=1
+
+       # fill the already existing plain llog, 64767 each,
+       # wrapping the whole catalog
+       createmany -o -u $DIR/$tdir/$tfile- $((64767 * 1))
+
+       createmany -o $DIR/$tdir/$tfile_ 64700 # NOTE(review): "$tfile_" reads variable tfile_, not ${tfile}_ - confirm intent
+       for (( i = 0; i < 64700; i = i + 2 ))
+       do
+               rm $DIR/$tdir/$tfile_$i &
+               rm $DIR/$tdir/$tfile_$((i + 1)) &
+               local pid=$!
+               wait $pid # $! is the second rm only; the first rm may still be running
+       done
+
+       # wait for OSP synchronization to finish
+       wait_delete_completed
+}
+run_test 135 "Race catalog processing"
+
+test_136() { # second regression test for LU-13069 (wrapped catalog processing)
+       remote_mds_nodsh && skip "remote MDS with nodsh"
+       [[ $MDS1_VERSION -lt $(version_code 2.13.50) ]] &&
+               skip "Need MDS version at least 2.13.50"
+       local fname # NOTE(review): never used in this function
+
+       mkdir -p $DIR/$tdir || error "failed to create $DIR/$tdir"
+       $SETSTRIPE -c 1 -i 0 $DIR/$tdir || error "failed to set striping"
+       # allow only one record per plain llog (fail_val=1)
+#define OBD_FAIL_CATALOG_FULL_CHECK                0x131a
+       do_facet $SINGLEMDS $LCTL set_param fail_loc=0x131a fail_val=1
+
+       # fill the 2 already existing plain llogs, 64767 each,
+       # wrapping the whole catalog
+       createmany -o -u $DIR/$tdir/$tfile- $((64767 * 1))
+       createmany -o -u $DIR/$tdir/$tfile- $((64767 * 3 / 2))
+       wait_delete_completed
+
+       createmany -o $DIR/$tdir/$tfile_ 10 # NOTE(review): "$tfile_" reads variable tfile_, not ${tfile}_ - confirm intent
+       sleep 25
+
+       do_facet $SINGLEMDS $LCTL set_param fail_val=3 # fail_val != 1 also enables the 1s delay in osp_sync_process_queues()
+       for (( i = 0; i < 10; i = i + 3 )) # NOTE(review): at i=9 this removes indices 10 and 11, but only 10 files were requested above - confirm
+       do
+               rm $DIR/$tdir/$tfile_$i &
+               rm $DIR/$tdir/$tfile_$((i + 1)) &
+               local pid=$!
+               wait $pid # $! is the second rm only; the first rm may still be running
+               sleep 7
+               rm $DIR/$tdir/$tfile_$((i + 2)) &
+       done
+
+       # wait for OSP synchronization to finish
+       wait_delete_completed
+}
+run_test 136 "Race catalog processing 2"
+
 test_140() { #bug-17379
        [ $PARALLEL == "yes" ] && skip "skip parallel run"