Whamcloud - gitweb
LU-8367 osp: do not block orphan cleanup
[fs/lustre-release.git] / lustre / osp / osp_precreate.c
index 3d84f6d..6910860 100644 (file)
@@ -573,6 +573,7 @@ static int osp_precreate_send(const struct lu_env *env, struct osp_device *d)
                RETURN(rc);
        }
 
+       LASSERT(d->opd_pre->osp_pre_delorphan_sent != 0);
        spin_lock(&d->opd_pre_lock);
        if (d->opd_pre_create_count > d->opd_pre_max_create_count / 2)
                d->opd_pre_create_count = d->opd_pre_max_create_count / 2;
@@ -778,18 +779,20 @@ static int osp_precreate_cleanup_orphans(struct lu_env *env,
        int                      update_status = 0;
        int                      rc;
        int                      diff;
+       struct lu_fid            fid;
 
        ENTRY;
 
        /*
-        * wait for local recovery to finish, so we can cleanup orphans
-        * orphans are all objects since "last used" (assigned), but
-        * there might be objects reserved and in some cases they won't
-        * be used. we can't cleanup them till we're sure they won't be
-        * used. also can't we allow new reservations because they may
-        * end up getting orphans being cleaned up below. so we block
-        * new reservations and wait till all reserved objects either
-        * user or released.
+        * Wait for local recovery to finish so we can clean up orphans.
+        * Orphans are all objects since "last used" (assigned).
+        * Consider reserved objects as created; otherwise we can get into
+        * a livelock where one blocked thread holding a reservation can
+        * block recovery. See LU-8367 for the details. In some cases this
+        * can result in gaps (i.e. leaked objects), but we've got LFSCK...
+        *
+        * Do not allow new reservations, because the objects they get may
+        * be among the orphans cleaned up below. So we block new reservations.
         */
        spin_lock(&d->opd_pre_lock);
        d->opd_pre_recovering = 1;
@@ -799,16 +802,12 @@ static int osp_precreate_cleanup_orphans(struct lu_env *env,
         * catch all osp_precreate_reserve() calls who find
         * "!opd_pre_recovering".
         */
-       l_wait_event(d->opd_pre_waitq,
-                    (!d->opd_pre_reserved && d->opd_recovery_completed) ||
+       l_wait_event(d->opd_pre_waitq, d->opd_recovery_completed ||
                     !osp_precreate_running(d) || d->opd_got_disconnected,
                     &lwi);
        if (!osp_precreate_running(d) || d->opd_got_disconnected)
                GOTO(out, rc = -EAGAIN);
 
-       CDEBUG(D_HA, "%s: going to cleanup orphans since "DFID"\n",
-              d->opd_obd->obd_name, PFID(&d->opd_last_used_fid));
-
        *last_fid = d->opd_last_used_fid;
        /* The OSP should already get the valid seq now */
        LASSERT(!fid_is_zero(last_fid));
@@ -840,7 +839,19 @@ static int osp_precreate_cleanup_orphans(struct lu_env *env,
        body->oa.o_flags = OBD_FL_DELORPHAN;
        body->oa.o_valid = OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
 
-       fid_to_ostid(&d->opd_last_used_fid, &body->oa.o_oi);
+       /* unless this is the very first DELORPHAN (when we really
+        * can destroy some orphans), just tell OST to recreate
+        * missing objects in our precreate pool */
+       spin_lock(&d->opd_pre_lock);
+       if (d->opd_pre->osp_pre_delorphan_sent)
+               fid = d->opd_pre_last_created_fid;
+       else
+               fid = d->opd_last_used_fid;
+       spin_unlock(&d->opd_pre_lock);
+       fid_to_ostid(&fid, &body->oa.o_oi);
+
+       CDEBUG(D_HA, "%s: going to cleanup orphans since "DFID"\n",
+              d->opd_obd->obd_name, PFID(&fid));
 
        ptlrpc_request_set_replen(req);
 
@@ -863,10 +874,10 @@ static int osp_precreate_cleanup_orphans(struct lu_env *env,
        ostid_to_fid(last_fid, &body->oa.o_oi, d->opd_index);
 
        spin_lock(&d->opd_pre_lock);
-       diff = osp_fid_diff(&d->opd_last_used_fid, last_fid);
+       diff = osp_fid_diff(&fid, last_fid);
        if (diff > 0) {
                d->opd_pre_create_count = OST_MIN_PRECREATE + diff;
-               d->opd_pre_last_created_fid = d->opd_last_used_fid;
+               d->opd_pre_last_created_fid = *last_fid;
        } else {
                d->opd_pre_create_count = OST_MIN_PRECREATE;
                d->opd_pre_last_created_fid = *last_fid;
@@ -877,9 +888,11 @@ static int osp_precreate_cleanup_orphans(struct lu_env *env,
         */
        LASSERT(fid_oid(&d->opd_pre_last_created_fid) <=
                LUSTRE_DATA_SEQ_MAX_WIDTH);
-       d->opd_pre_used_fid = d->opd_pre_last_created_fid;
+       if (d->opd_pre->osp_pre_delorphan_sent == 0)
+               d->opd_pre_used_fid = d->opd_pre_last_created_fid;
        d->opd_pre_create_slow = 0;
        spin_unlock(&d->opd_pre_lock);
+       d->opd_pre->osp_pre_delorphan_sent = 1;
 
        CDEBUG(D_HA, "%s: Got last_id "DFID" from OST, last_created "DFID
               "last_used is "DFID"\n", d->opd_obd->obd_name, PFID(last_fid),
@@ -1355,6 +1368,12 @@ int osp_precreate_reserve(const struct lu_env *env, struct osp_device *d)
        if (d->opd_pre_max_create_count == 0)
                RETURN(-ENOBUFS);
 
+       if (OBD_FAIL_PRECHECK(OBD_FAIL_MDS_OSP_PRECREATE_WAIT)) {
+               if (d->opd_index == cfs_fail_val)
+                       OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_OSP_PRECREATE_WAIT,
+                                        obd_timeout);
+       }
+
        /*
         * wait till:
         *  - preallocation is done