Whamcloud - gitweb
LU-12674 osp: handle -EINPROGRESS on llog objects 44/35844/4
author Alexey Zhuravlev <bzzz@whamcloud.com>
Wed, 21 Aug 2019 08:32:56 +0000 (11:32 +0300)
committer Oleg Drokin <green@whamcloud.com>
Sat, 7 Sep 2019 01:53:15 +0000 (01:53 +0000)
If an llog object is corrupted and the OI doesn't allow access to it,
OSP panics when asked to declare a new llog record (e.g. for unlink).
Instead, OSP should complain in the logs, skip llogging, and suggest
running LFSCK to fix orphans.

Signed-off-by: Alex Zhuravlev <bzzz@whamcloud.com>
Change-Id: I18d4d68811833c08cdc1937d147ac6e8c3408a30
Reviewed-on: https://review.whamcloud.com/35844
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Mike Pershin <mpershin@whamcloud.com>
Reviewed-by: Lai Siyao <lai.siyao@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/include/obd_support.h
lustre/osp/osp_sync.c
lustre/tests/sanity.sh

index ddc11c1..6b73b9a 100644 (file)
@@ -687,6 +687,7 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_OSP_CHECK_ENOMEM              0x2101
 #define OBD_FAIL_OSP_FAKE_PRECREATE            0x2102
 #define OBD_FAIL_OSP_RPCS_SEM                  0x2104
 #define OBD_FAIL_OSP_CHECK_ENOMEM              0x2101
 #define OBD_FAIL_OSP_FAKE_PRECREATE            0x2102
 #define OBD_FAIL_OSP_RPCS_SEM                  0x2104
+#define OBD_FAIL_OSP_CANT_PROCESS_LLOG         0x2105
 
  /* barrier */
 #define OBD_FAIL_MGS_BARRIER_READ_NET          0x2200
 
  /* barrier */
 #define OBD_FAIL_MGS_BARRIER_READ_NET          0x2200
index 074864f..6aa00d2 100644 (file)
@@ -343,7 +343,13 @@ int osp_sync_declare_add(const struct lu_env *env, struct osp_object *o,
        }
 
        ctxt = llog_get_context(d->opd_obd, LLOG_MDS_OST_ORIG_CTXT);
        }
 
        ctxt = llog_get_context(d->opd_obd, LLOG_MDS_OST_ORIG_CTXT);
-       LASSERT(ctxt);
+       if (!ctxt) {
+               /* for a reason OSP wasn't able to open llog,
+                * just skip logging this operation and hope
+                * LFSCK will fix it eventually */
+               CERROR("logging isn't available, run LFSCK\n");
+               RETURN(0);
+       }
 
        rc = llog_declare_add(env, ctxt->loc_handle, &osi->osi_hdr,
                              storage_th);
 
        rc = llog_declare_add(env, ctxt->loc_handle, &osi->osi_hdr,
                              storage_th);
@@ -436,8 +442,10 @@ static int osp_sync_add_rec(const struct lu_env *env, struct osp_device *d,
        spin_unlock(&d->opd_sync_lock);
 
        ctxt = llog_get_context(d->opd_obd, LLOG_MDS_OST_ORIG_CTXT);
        spin_unlock(&d->opd_sync_lock);
 
        ctxt = llog_get_context(d->opd_obd, LLOG_MDS_OST_ORIG_CTXT);
-       if (ctxt == NULL)
-               RETURN(-ENOMEM);
+       if (ctxt == NULL) {
+               /* see comment in osp_sync_declare_add() */
+               RETURN(0);
+       }
 
        rc = llog_add(env, ctxt->loc_handle, &osi->osi_hdr, &osi->osi_cookie,
                      storage_th);
 
        rc = llog_add(env, ctxt->loc_handle, &osi->osi_hdr, &osi->osi_cookie,
                      storage_th);
@@ -1231,6 +1239,7 @@ static int osp_sync_thread(void *_arg)
        spin_unlock(&d->opd_sync_lock);
        wake_up(&thread->t_ctl_waitq);
 
        spin_unlock(&d->opd_sync_lock);
        wake_up(&thread->t_ctl_waitq);
 
+again:
        ctxt = llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT);
        if (ctxt == NULL) {
                CERROR("can't get appropriate context\n");
        ctxt = llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT);
        if (ctxt == NULL) {
                CERROR("can't get appropriate context\n");
@@ -1257,9 +1266,14 @@ static int osp_sync_thread(void *_arg)
                wrapped = (llh->lgh_hdr->llh_cat_idx >= llh->lgh_last_idx &&
                           llh->lgh_hdr->llh_count > 1);
 
                wrapped = (llh->lgh_hdr->llh_cat_idx >= llh->lgh_last_idx &&
                           llh->lgh_hdr->llh_count > 1);
 
+               if (OBD_FAIL_CHECK(OBD_FAIL_OSP_CANT_PROCESS_LLOG)) {
+                       rc = -EINPROGRESS;
+                       goto next;
+               }
                rc = llog_cat_process(&env, llh, osp_sync_process_queues, d,
                                      d->opd_sync_last_catalog_idx, 0);
 
                rc = llog_cat_process(&env, llh, osp_sync_process_queues, d,
                                      d->opd_sync_last_catalog_idx, 0);
 
+next:
                size = OBD_FAIL_PRECHECK(OBD_FAIL_CAT_RECORDS) ?
                       cfs_fail_val : (LLOG_HDR_BITMAP_SIZE(llh->lgh_hdr) - 1);
                /* processing reaches catalog bottom */
                size = OBD_FAIL_PRECHECK(OBD_FAIL_CAT_RECORDS) ?
                       cfs_fail_val : (LLOG_HDR_BITMAP_SIZE(llh->lgh_hdr) - 1);
                /* processing reaches catalog bottom */
@@ -1275,6 +1289,17 @@ static int osp_sync_thread(void *_arg)
                             d->opd_sync_last_catalog_idx == LLOG_CAT_FIRST));
 
        if (rc < 0) {
                             d->opd_sync_last_catalog_idx == LLOG_CAT_FIRST));
 
        if (rc < 0) {
+               if (rc == -EINPROGRESS) {
+                       /* can't access the llog now - OI scrub is trying to fix
+                        * underlying issue. let's wait and try again */
+                       llog_cat_close(&env, llh);
+                       rc = llog_cleanup(&env, ctxt);
+                       if (rc)
+                               GOTO(out, rc);
+                       schedule_timeout_interruptible(HZ * 5);
+                       goto again;
+               }
+
                CERROR("%s: llog process with osp_sync_process_queues "
                       "failed: %d\n", d->opd_obd->obd_name, rc);
                GOTO(close, rc);
                CERROR("%s: llog process with osp_sync_process_queues "
                       "failed: %d\n", d->opd_obd->obd_name, rc);
                GOTO(close, rc);
index 3924119..67f6563 100644 (file)
@@ -21873,6 +21873,19 @@ test_817() {
 }
 run_test 817 "nfsd won't cache write lock for exec file"
 
 }
 run_test 817 "nfsd won't cache write lock for exec file"
 
+test_818() {
+       mkdir $DIR/$tdir
+       $LFS setstripe -c1 -i0 $DIR/$tfile
+       $LFS setstripe -c1 -i1 $DIR/$tfile
+       stop $SINGLEMDS
+       #define OBD_FAIL_OSP_CANT_PROCESS_LLOG          0x2105
+       do_facet $SINGLEMDS lctl set_param fail_loc=0x80002105
+       start $SINGLEMDS $(mdsdevname ${SINGLEMDS//mds/}) $MDS_MOUNT_OPTS ||
+               error "start $SINGLEMDS failed"
+       rm -rf $DIR/$tdir
+}
+run_test 818 "unlink with failed llog"
+
 #
 # tests that do cleanup/setup should be run at the end
 #
 #
 # tests that do cleanup/setup should be run at the end
 #