Whamcloud - gitweb
LU-13102 llog: fix processing of a wrapped catalog 02/37102/4
authorAlexander Boyko <c17825@cray.com>
Mon, 16 Dec 2019 13:24:16 +0000 (08:24 -0500)
committerOleg Drokin <green@whamcloud.com>
Thu, 5 Mar 2020 22:35:23 +0000 (22:35 +0000)
The logic for rereading a llog buffer had an exception
for a full catalog, when lgh_last_idx = llh_cat_idx and a first
processing index is a llh_cat_idx+1. This check is based on
a value lh_last_idx, which stays unchanged between buffer read.
But llh_cat_idx could go forward, and this lead to a wrong check
where reread doesn't happen. As a result Lustre got ENOENT for
a record and stoped osp processing.

llog_cat_set_first_idx())
catlog [0x6:0x1:0x0] first idx 34730, last_idx 34731
osp_sync_process_queues()) 1 changes, 0 in progress, 0 in flight
llog_process_thread())
stop processing plain 0x76941:1:0 index 64767 count 1
llog_process_thread())
index: 34731, lh_last_idx: 34730 synced_idx: 34730 lgh_last_idx: 34731
llog_cat_process_common()) processing log [0x2780f:0x1:0x0]:0
at index 34731 of catalog [0x6:0x1:0x0]
llog_cat_id2handle()) snx11281-OST0001-osc-MDT0001:
error opening log id [0x2780f:0x1:0x0]:0:rc = -2

The patch fixes logic and also adds/changes debugging for
llog and osp.

Cray-bug-id: LUS-8193
Signed-off-by: Alexander Boyko <c17825@cray.com>
Change-Id: I9463223a1ea904b96643b19e1580f5894142c12b
Reviewed-on: https://review.whamcloud.com/37102
Tested-by: jenkins <devops@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Alexander Zarochentsev <alexander.zarochentsev@hpe.com>
Reviewed-by: Mike Pershin <mpershin@whamcloud.com>
lustre/obdclass/llog.c
lustre/obdclass/llog_cat.c
lustre/osp/osp_sync.c

index 4725f77..c9f9616 100644 (file)
@@ -585,13 +585,17 @@ repeat:
                         * could be less than index. So we detect last index
                         * for processing as index == lh_last_idx+1. But when
                         * catalog is wrapped and full lgh_last_idx=llh_cat_idx,
-                        * the first processing index is llh_cat_idx+1.
+                        * the first processing index is llh_cat_idx+1.The
+                        * exception is !(lgh_last_idx == llh_cat_idx &&
+                        * index == llh_cat_idx + 1), and after simplification
+                        * it turns to
+                        * lh_last_idx != LLOG_HDR_TAIL(llh)->lrt_index
+                        * This exception is working for catalog only.
                         */
 
                        if ((index == lh_last_idx && synced_idx != index) ||
                            (index == (lh_last_idx + 1) &&
-                            !(index == (llh->llh_cat_idx + 1) &&
-                              (llh->llh_flags & LLOG_F_IS_CAT))) ||
+                            lh_last_idx != LLOG_HDR_TAIL(llh)->lrt_index) ||
                            (rec->lrh_index == 0 && !repeated)) {
 
                                /* save offset inside buffer for the re-read */
@@ -652,7 +656,9 @@ repeat:
                                __u64   tmp_off;
                                int     tmp_idx;
 
-                               CDEBUG(D_OTHER, "index: %d, lh_last_idx: %d "
+                               CDEBUG((llh->llh_flags & LLOG_F_IS_CAT ?
+                                       D_HA : D_OTHER),
+                                      "index: %d, lh_last_idx: %d "
                                       "synced_idx: %d lgh_last_idx: %d\n",
                                       index, lh_last_idx, synced_idx,
                                       loghandle->lgh_last_idx);
@@ -702,6 +708,11 @@ repeat:
        }
 
 out:
+       CDEBUG(D_HA, "stop processing %s "DOSTID":%x index %d count %d\n",
+              ((llh->llh_flags & LLOG_F_IS_CAT) ? "catalog" : "plain"),
+              POSTID(&loghandle->lgh_id.lgl_oi), loghandle->lgh_id.lgl_ogen,
+              index, llh->llh_count);
+
        if (cd != NULL)
                cd->lpcd_last_idx = last_called_index;
 
index 4843ae4..9ada0e8 100644 (file)
@@ -1144,7 +1144,7 @@ static int llog_cat_set_first_idx(struct llog_handle *cathandle, int idx)
                        }
                }
 
-               CDEBUG(D_RPCTRACE, "catlog "DFID" first idx %u, last_idx %u\n",
+               CDEBUG(D_HA, "catlog "DFID" first idx %u, last_idx %u\n",
                       PFID(&cathandle->lgh_id.lgl_oi.oi_fid),
                       llh->llh_cat_idx, cathandle->lgh_last_idx);
        }
@@ -1157,11 +1157,13 @@ int llog_cat_cleanup(const struct lu_env *env, struct llog_handle *cathandle,
                     struct llog_handle *loghandle, int index)
 {
        int rc;
+       struct lu_fid fid = {.f_seq = 0, .f_oid = 0, .f_ver = 0};
 
        LASSERT(index);
        if (loghandle != NULL) {
                /* remove destroyed llog from catalog list and
                 * chd_current_log variable */
+               fid = loghandle->lgh_id.lgl_oi.oi_fid;
                down_write(&cathandle->lgh_lock);
                if (cathandle->u.chd.chd_current_log == loghandle)
                        cathandle->u.chd.chd_current_log = NULL;
@@ -1180,7 +1182,9 @@ int llog_cat_cleanup(const struct lu_env *env, struct llog_handle *cathandle,
        llog_cat_set_first_idx(cathandle, index);
        rc = llog_cancel_rec(env, cathandle, index);
        if (rc == 0)
-               CDEBUG(D_HA, "cancel plain log at index %u of catalog "DFID"\n",
-                      index, PFID(&cathandle->lgh_id.lgl_oi.oi_fid));
+               CDEBUG(D_HA,
+                      "cancel plain log "DFID" at index %u of catalog "DFID"\n",
+                      PFID(&fid), index,
+                      PFID(&cathandle->lgh_id.lgl_oi.oi_fid));
        return rc;
 }
index 1e82b9c..4df823e 100644 (file)
@@ -990,7 +990,7 @@ static void osp_sync_process_record(const struct lu_env *env,
                               rec->lrh_id, rc);
        }
 
-       CDEBUG(D_HA, "found record %x, %d, idx %u, id %u\n",
+       CDEBUG(D_OTHER, "found record %x, %d, idx %u, id %u\n",
               rec->lrh_type, rec->lrh_len, rec->lrh_index, rec->lrh_id);
 
        RETURN_EXIT;
@@ -1056,7 +1056,7 @@ static void osp_sync_process_committed(const struct lu_env *env,
        list_for_each(le, &list)
                count++;
        if (count > 2)
-               OBD_ALLOC_WAIT(arr, sizeof(int) * count);
+               OBD_ALLOC_LARGE(arr, sizeof(int) * count);
        else
                arr = NULL;
        i = 0;
@@ -1109,15 +1109,19 @@ static void osp_sync_process_committed(const struct lu_env *env,
                               PFID(&lgid.lgl_oi.oi_fid), i);
        }
        if (arr)
-               OBD_FREE(arr, sizeof(int) * count);
+               OBD_FREE_LARGE(arr, sizeof(int) * count);
 
        llog_ctxt_put(ctxt);
 
        LASSERT(atomic_read(&d->opd_sync_rpcs_in_progress) >= done);
        atomic_sub(done, &d->opd_sync_rpcs_in_progress);
-       CDEBUG(D_OTHER, "%s: %d in flight, %d in progress, done %d\n",
-              d->opd_obd->obd_name, atomic_read(&d->opd_sync_rpcs_in_flight),
-              atomic_read(&d->opd_sync_rpcs_in_progress), done);
+       CDEBUG((done > 2 ? D_HA : D_OTHER), "%s: %u changes, %u in progress,"
+                                    " %u in flight, %u done\n",
+                                    d->opd_obd->obd_name,
+                                    atomic_read(&d->opd_sync_changes),
+                                    atomic_read(&d->opd_sync_rpcs_in_progress),
+                                    atomic_read(&d->opd_sync_rpcs_in_flight),
+                                    done);
 
        osp_sync_check_for_work(d);
 
@@ -1166,11 +1170,6 @@ static int osp_sync_process_queues(const struct lu_env *env,
                if (osp_sync_can_process_new(d, rec)) {
                        if (llh == NULL) {
                                /* ask llog for another record */
-                               CDEBUG(D_HA, "%u changes, %u in progress,"
-                                    " %u in flight\n",
-                                    atomic_read(&d->opd_sync_changes),
-                                    atomic_read(&d->opd_sync_rpcs_in_progress),
-                                    atomic_read(&d->opd_sync_rpcs_in_flight));
                                return 0;
                        }
                        osp_sync_process_record(env, d, llh, rec);