From 21b34dd8a226fbaf2fee52b294f2482aa8670ad2 Mon Sep 17 00:00:00 2001 From: Alexander Boyko Date: Mon, 16 Dec 2019 08:24:16 -0500 Subject: [PATCH] LU-13102 llog: fix processing of a wrapped catalog The logic for rereading a llog buffer had an exception for a full catalog, when lgh_last_idx = llh_cat_idx and a first processing index is a llh_cat_idx+1. This check is based on a value lh_last_idx, which stays unchanged between buffer read. But llh_cat_idx could go forward, and this lead to a wrong check where reread doesn't happen. As a result Lustre got ENOENT for a record and stoped osp processing. llog_cat_set_first_idx()) catlog [0x6:0x1:0x0] first idx 34730, last_idx 34731 osp_sync_process_queues()) 1 changes, 0 in progress, 0 in flight llog_process_thread()) stop processing plain 0x76941:1:0 index 64767 count 1 llog_process_thread()) index: 34731, lh_last_idx: 34730 synced_idx: 34730 lgh_last_idx: 34731 llog_cat_process_common()) processing log [0x2780f:0x1:0x0]:0 at index 34731 of catalog [0x6:0x1:0x0] llog_cat_id2handle()) snx11281-OST0001-osc-MDT0001: error opening log id [0x2780f:0x1:0x0]:0:rc = -2 The patch fixes logic and also adds/changes debugging for llog and osp. Lustre-change: https://review.whamcloud.com/37102 Lustre-commit: a4f049b96562fd502b1948fb082767351e040a1c Cray-bug-id: LUS-8193 Signed-off-by: Alexander Boyko Change-Id: I9463223a1ea904b96643b19e1580f5894142c12b Reviewed-by: Andreas Dilger Reviewed-by: Alexander Zarochentsev Reviewed-by: Mike Pershin Signed-off-by: Minh Diep Reviewed-on: https://review.whamcloud.com/38541 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Oleg Drokin --- lustre/obdclass/llog.c | 19 +++++++++++++++---- lustre/obdclass/llog_cat.c | 10 +++++++--- lustre/osp/osp_sync.c | 17 ++++++++--------- 3 files changed, 30 insertions(+), 16 deletions(-) diff --git a/lustre/obdclass/llog.c b/lustre/obdclass/llog.c index 3880725..486c3a1 100644 --- a/lustre/obdclass/llog.c +++ b/lustre/obdclass/llog.c @@ -576,13 +576,17 @@ repeat: * could be less than index. So we detect last index * for processing as index == lh_last_idx+1. But when * catalog is wrapped and full lgh_last_idx=llh_cat_idx, - * the first processing index is llh_cat_idx+1. + * the first processing index is llh_cat_idx+1.The + * exception is !(lgh_last_idx == llh_cat_idx && + * index == llh_cat_idx + 1), and after simplification + * it turns to + * lh_last_idx != LLOG_HDR_TAIL(llh)->lrt_index + * This exception is working for catalog only. */ if ((index == lh_last_idx && synced_idx != index) || (index == (lh_last_idx + 1) && - !(index == (llh->llh_cat_idx + 1) && - (llh->llh_flags & LLOG_F_IS_CAT))) || + lh_last_idx != LLOG_HDR_TAIL(llh)->lrt_index) || (rec->lrh_index == 0 && !repeated)) { /* save offset inside buffer for the re-read */ @@ -643,7 +647,9 @@ repeat: __u64 tmp_off; int tmp_idx; - CDEBUG(D_OTHER, "index: %d, lh_last_idx: %d " + CDEBUG((llh->llh_flags & LLOG_F_IS_CAT ? + D_HA : D_OTHER), + "index: %d, lh_last_idx: %d " "synced_idx: %d lgh_last_idx: %d\n", index, lh_last_idx, synced_idx, loghandle->lgh_last_idx); @@ -693,6 +699,11 @@ repeat: } out: + CDEBUG(D_HA, "stop processing %s "DOSTID":%x index %d count %d\n", + ((llh->llh_flags & LLOG_F_IS_CAT) ? "catalog" : "plain"), + POSTID(&loghandle->lgh_id.lgl_oi), loghandle->lgh_id.lgl_ogen, + index, llh->llh_count); + if (cd != NULL) cd->lpcd_last_idx = last_called_index; diff --git a/lustre/obdclass/llog_cat.c b/lustre/obdclass/llog_cat.c index 6045261..1a9ae15 100644 --- a/lustre/obdclass/llog_cat.c +++ b/lustre/obdclass/llog_cat.c @@ -1118,7 +1118,7 @@ static int llog_cat_set_first_idx(struct llog_handle *cathandle, int idx) } } - CDEBUG(D_RPCTRACE, "catlog "DFID" first idx %u, last_idx %u\n", + CDEBUG(D_HA, "catlog "DFID" first idx %u, last_idx %u\n", PFID(&cathandle->lgh_id.lgl_oi.oi_fid), llh->llh_cat_idx, cathandle->lgh_last_idx); } @@ -1131,11 +1131,13 @@ int llog_cat_cleanup(const struct lu_env *env, struct llog_handle *cathandle, struct llog_handle *loghandle, int index) { int rc; + struct lu_fid fid = {.f_seq = 0, .f_oid = 0, .f_ver = 0}; LASSERT(index); if (loghandle != NULL) { /* remove destroyed llog from catalog list and * chd_current_log variable */ + fid = loghandle->lgh_id.lgl_oi.oi_fid; down_write(&cathandle->lgh_lock); if (cathandle->u.chd.chd_current_log == loghandle) cathandle->u.chd.chd_current_log = NULL; @@ -1154,7 +1156,9 @@ int llog_cat_cleanup(const struct lu_env *env, struct llog_handle *cathandle, llog_cat_set_first_idx(cathandle, index); rc = llog_cancel_rec(env, cathandle, index); if (rc == 0) - CDEBUG(D_HA, "cancel plain log at index %u of catalog "DFID"\n", - index, PFID(&cathandle->lgh_id.lgl_oi.oi_fid)); + CDEBUG(D_HA, + "cancel plain log "DFID" at index %u of catalog "DFID"\n", + PFID(&fid), index, + PFID(&cathandle->lgh_id.lgl_oi.oi_fid)); return rc; } diff --git a/lustre/osp/osp_sync.c b/lustre/osp/osp_sync.c index 5c42b2b..b18317a 100644 --- a/lustre/osp/osp_sync.c +++ b/lustre/osp/osp_sync.c @@ -989,7 +989,7 @@ static void osp_sync_process_record(const struct lu_env *env, rec->lrh_id, rc); } - CDEBUG(D_HA, "found record %x, %d, idx %u, id %u\n", + CDEBUG(D_OTHER, "found record %x, %d, idx %u, id %u\n", rec->lrh_type, rec->lrh_len, rec->lrh_index, rec->lrh_id); RETURN_EXIT; @@ -1083,9 +1083,13 @@ static void osp_sync_process_committed(const struct lu_env *env, LASSERT(atomic_read(&d->opd_sync_rpcs_in_progress) >= done); atomic_sub(done, &d->opd_sync_rpcs_in_progress); - CDEBUG(D_OTHER, "%s: %d in flight, %d in progress\n", - d->opd_obd->obd_name, atomic_read(&d->opd_sync_rpcs_in_flight), - atomic_read(&d->opd_sync_rpcs_in_progress)); + CDEBUG((done > 2 ? D_HA : D_OTHER), "%s: %u changes, %u in progress," + " %u in flight, %u done\n", + d->opd_obd->obd_name, + atomic_read(&d->opd_sync_changes), + atomic_read(&d->opd_sync_rpcs_in_progress), + atomic_read(&d->opd_sync_rpcs_in_flight), + done); osp_sync_check_for_work(d); @@ -1136,11 +1140,6 @@ static int osp_sync_process_queues(const struct lu_env *env, if (osp_sync_can_process_new(d, rec)) { if (llh == NULL) { /* ask llog for another record */ - CDEBUG(D_HA, "%u changes, %u in progress," - " %u in flight\n", - atomic_read(&d->opd_sync_changes), - atomic_read(&d->opd_sync_rpcs_in_progress), - atomic_read(&d->opd_sync_rpcs_in_flight)); return 0; } osp_sync_process_record(env, d, llh, rec); -- 1.8.3.1