/* couldn't update statfs, try again as soon as possible */
cfs_waitq_signal(&d->opd_pre_waitq);
if (req->rq_import_generation == imp->imp_generation)
- CERROR("%s: couldn't update statfs: rc = %d\n",
+ CDEBUG(D_CACHE, "%s: couldn't update statfs: rc = %d\n",
d->opd_obd->obd_name, rc);
RETURN(rc);
}
aa = ptlrpc_req_async_args(req);
aa->pointer_arg[0] = d;
- ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
-
- cfs_timer_disarm(&d->opd_statfs_timer);
-
/*
* no updates till reply
*/
+ cfs_timer_disarm(&d->opd_statfs_timer);
d->opd_statfs_fresh_till = cfs_time_shift(obd_timeout * 1000);
d->opd_statfs_update_in_progress = 1;
+ ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+
RETURN(0);
}
static inline int osp_precreate_near_empty_nolock(struct osp_device *d)
{
- int window = d->opd_pre_last_created - d->opd_pre_next;
+ int window = d->opd_pre_last_created - d->opd_pre_used_id;
/* don't consider new precreation till OST is healty and
* has free space */
int rc;
/* XXX: do we really need locking here? */
- cfs_spin_lock(&d->opd_pre_lock);
+ spin_lock(&d->opd_pre_lock);
rc = osp_precreate_near_empty_nolock(d);
- cfs_spin_unlock(&d->opd_pre_lock);
+ spin_unlock(&d->opd_pre_lock);
return rc;
}
RETURN(rc);
}
- cfs_spin_lock(&d->opd_pre_lock);
+ spin_lock(&d->opd_pre_lock);
if (d->opd_pre_grow_count > d->opd_pre_max_grow_count / 2)
d->opd_pre_grow_count = d->opd_pre_max_grow_count / 2;
grow = d->opd_pre_grow_count;
- cfs_spin_unlock(&d->opd_pre_lock);
+ spin_unlock(&d->opd_pre_lock);
body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
LASSERT(body);
if (body == NULL)
GOTO(out_req, rc = -EPROTO);
- CDEBUG(D_HA, "new last_created %lu\n", (unsigned long) body->oa.o_id);
- LASSERT(body->oa.o_id > d->opd_pre_next);
+ CDEBUG(D_HA, "%s: new last_created "LPU64"\n", d->opd_obd->obd_name,
+ body->oa.o_id);
+ LASSERT(body->oa.o_id > d->opd_pre_used_id);
diff = body->oa.o_id - d->opd_pre_last_created;
- cfs_spin_lock(&d->opd_pre_lock);
+ spin_lock(&d->opd_pre_lock);
if (diff < grow) {
/* the OST has not managed to create all the
* objects we asked for */
d->opd_pre_grow_slow = 0;
}
d->opd_pre_last_created = body->oa.o_id;
- cfs_spin_unlock(&d->opd_pre_lock);
+ spin_unlock(&d->opd_pre_lock);
CDEBUG(D_OTHER, "current precreated pool: %llu-%llu\n",
- d->opd_pre_next, d->opd_pre_last_created);
+ d->opd_pre_used_id, d->opd_pre_last_created);
out_req:
/* now we can wakeup all users awaiting for objects */
RETURN(rc);
}
+
+static int osp_get_lastid_from_ost(struct osp_device *d)
+{
+ struct ptlrpc_request *req;
+ struct obd_import *imp;
+ obd_id *reply;
+ char *tmp;
+ int rc;
+
+ imp = d->opd_obd->u.cli.cl_import;
+ LASSERT(imp);
+
+ req = ptlrpc_request_alloc(imp, &RQF_OST_GET_INFO_LAST_ID);
+ if (req == NULL)
+ RETURN(-ENOMEM);
+
+ req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_KEY,
+ RCL_CLIENT, sizeof(KEY_LAST_ID));
+ rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GET_INFO);
+ if (rc) {
+ ptlrpc_request_free(req);
+ RETURN(rc);
+ }
+
+ tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY);
+ memcpy(tmp, KEY_LAST_ID, sizeof(KEY_LAST_ID));
+
+ req->rq_no_delay = req->rq_no_resend = 1;
+ ptlrpc_request_set_replen(req);
+ rc = ptlrpc_queue_wait(req);
+ if (rc) {
+ /* bad-bad OST.. let sysadm sort this out */
+ ptlrpc_set_import_active(imp, 0);
+ GOTO(out, rc);
+ }
+
+ reply = req_capsule_server_get(&req->rq_pill, &RMF_OBD_ID);
+ if (reply == NULL)
+ GOTO(out, rc = -EPROTO);
+
+ d->opd_last_used_id = *reply;
+ CDEBUG(D_HA, "%s: got last_id "LPU64" from OST\n",
+ d->opd_obd->obd_name, d->opd_last_used_id);
+
+out:
+ ptlrpc_req_finished(req);
+ RETURN(rc);
+
+}
+
/**
* asks OST to clean precreate orphans
* and gets next id for new objects
struct ptlrpc_request *req = NULL;
struct obd_import *imp;
struct ost_body *body;
+ struct l_wait_info lwi = { 0 };
+ int update_status = 0;
int rc;
ENTRY;
- LASSERT(d->opd_recovery_completed);
- LASSERT(d->opd_pre_reserved == 0);
+ /*
+ * wait for local recovery to finish, so we can cleanup orphans
+ * orphans are all objects since "last used" (assigned), but
+ * there might be objects reserved and in some cases they won't
+ * be used. we can't cleanup them till we're sure they won't be
+ * used. also we can't allow new reservations because they may
+ * end up getting orphans being cleaned up below. so we block
+ * new reservations and wait till all reserved objects are either
+ * used or released.
+ */
+ spin_lock(&d->opd_pre_lock);
+ d->opd_pre_recovering = 1;
+ spin_unlock(&d->opd_pre_lock);
+ /*
+ * The locking above makes sure the opd_pre_reserved check below will
+ * catch all osp_precreate_reserve() calls who find
+ * "!opd_pre_recovering".
+ */
+ l_wait_event(d->opd_pre_waitq,
+ (!d->opd_pre_reserved && d->opd_recovery_completed) ||
+ !osp_precreate_running(d) || d->opd_got_disconnected,
+ &lwi);
+ if (!osp_precreate_running(d) || d->opd_got_disconnected)
+ GOTO(out, rc = -EAGAIN);
+
+ CDEBUG(D_HA, "%s: going to cleanup orphans since "LPU64"\n",
+ d->opd_obd->obd_name, d->opd_last_used_id);
+
+ if (d->opd_last_used_id < 2) {
+ /* lastid looks strange... ask OST */
+ rc = osp_get_lastid_from_ost(d);
+ if (rc)
+ GOTO(out, rc);
+ }
imp = d->opd_obd->u.cli.cl_import;
LASSERT(imp);
req = ptlrpc_request_alloc(imp, &RQF_OST_CREATE);
if (req == NULL)
- RETURN(-ENOMEM);
+ GOTO(out, rc = -ENOMEM);
rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_CREATE);
if (rc) {
ptlrpc_request_free(req);
- RETURN(rc);
+ GOTO(out, rc);
}
body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
if (body == NULL)
- GOTO(out_req, rc = -EPROTO);
+ GOTO(out, rc = -EPROTO);
body->oa.o_flags = OBD_FL_DELORPHAN;
body->oa.o_valid = OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
body->oa.o_seq = FID_SEQ_OST_MDT0;
- /* remove from NEXT after used one */
- body->oa.o_id = d->opd_last_used_id + 1;
+ body->oa.o_id = d->opd_last_used_id;
ptlrpc_request_set_replen(req);
req->rq_no_resend = req->rq_no_delay = 1;
rc = ptlrpc_queue_wait(req);
- if (rc)
- GOTO(out_req, rc);
+ if (rc) {
+ update_status = 1;
+ GOTO(out, rc);
+ }
body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
if (body == NULL)
- GOTO(out_req, rc = -EPROTO);
+ GOTO(out, rc = -EPROTO);
/*
* OST provides us with id new pool starts from in body->oa.o_id
*/
- cfs_spin_lock(&d->opd_pre_lock);
+ spin_lock(&d->opd_pre_lock);
if (le64_to_cpu(d->opd_last_used_id) > body->oa.o_id) {
d->opd_pre_grow_count = OST_MIN_PRECREATE +
le64_to_cpu(d->opd_last_used_id) -
body->oa.o_id;
- d->opd_pre_last_created = le64_to_cpu(d->opd_last_used_id) + 1;
+ d->opd_pre_last_created = le64_to_cpu(d->opd_last_used_id);
} else {
d->opd_pre_grow_count = OST_MIN_PRECREATE;
- d->opd_pre_last_created = body->oa.o_id + 1;
+ d->opd_pre_last_created = body->oa.o_id;
}
- d->opd_pre_next = d->opd_pre_last_created;
+ /*
+ * This empties the pre-creation pool and effectively blocks any new
+ * reservations.
+ */
+ d->opd_pre_used_id = d->opd_pre_last_created;
d->opd_pre_grow_slow = 0;
- cfs_spin_unlock(&d->opd_pre_lock);
+ spin_unlock(&d->opd_pre_lock);
- /* now we can wakeup all users awaiting for objects */
- osp_pre_update_status(d, rc);
- cfs_waitq_signal(&d->opd_pre_user_waitq);
+ CDEBUG(D_HA, "%s: Got last_id "LPU64" from OST, last_used is "LPU64
+ ", pre_used "LPU64"\n", d->opd_obd->obd_name, body->oa.o_id,
+ le64_to_cpu(d->opd_last_used_id), d->opd_pre_used_id);
+
+out:
+ if (req)
+ ptlrpc_req_finished(req);
- CDEBUG(D_HA, "Got last_id "LPU64" from OST, last_used is "LPU64
- ", next "LPU64"\n", body->oa.o_id,
- le64_to_cpu(d->opd_last_used_id), d->opd_pre_next);
+ d->opd_pre_recovering = 0;
+
+ /*
+ * If rc is zero, the pre-creation window should have been emptied.
+ * Since waking up the herd would be useless without pre-created
+ * objects, we defer the signal to osp_precreate_send() in that case.
+ */
+ if (rc != 0) {
+ if (update_status) {
+ CERROR("%s: cannot cleanup orphans: rc = %d\n",
+ d->opd_obd->obd_name, rc);
+ /* we can't proceed from here, OST seem to
+ * be in a bad shape, better to wait for
+ * a new instance of the server and repeat
+ * from the beginning. notify possible waiters
+ * this OSP isn't quite functional yet */
+ osp_pre_update_status(d, rc);
+ } else {
+ cfs_waitq_signal(&d->opd_pre_user_waitq);
+ }
+ }
-out_req:
- ptlrpc_req_finished(req);
RETURN(rc);
}
if (rc)
goto out;
+ /* Add a bit of hysteresis so this flag isn't continually flapping,
+ * and ensure that new files don't get extremely fragmented due to
+ * only a small amount of available space in the filesystem.
+ * We want to set the NOSPC flag when there is less than ~0.1% free
+ * and clear it when there is at least ~0.2% free space, so:
+ * avail < ~0.1% max max = avail + used
+ * 1025 * avail < avail + used used = blocks - free
+ * 1024 * avail < used
+ * 1024 * avail < blocks - free
+ * avail < ((blocks - free) >> 10)
+ *
+ * On very large disk, say 16TB 0.1% will be 16 GB. We don't want to
+ * lose that amount of space so in those cases we report no space left
+ * if there is less than 1 GB left. */
if (likely(msfs->os_type)) {
used = min_t(__u64, (msfs->os_blocks - msfs->os_bfree) >> 10,
1 << 30);
d->opd_obd->obd_name, msfs->os_blocks,
msfs->os_bfree, used, msfs->os_bavail,
d->opd_pre_status, rc);
+ CDEBUG(D_INFO,
+ "non-commited changes: %lu, in progress: %u\n",
+ d->opd_syn_changes, d->opd_syn_rpc_in_progress);
} else if (old == -ENOSPC) {
d->opd_pre_status = 0;
d->opd_pre_grow_slow = 0;
sprintf(pname, "osp-pre-%u\n", d->opd_index);
cfs_daemonize(pname);
- cfs_spin_lock(&d->opd_pre_lock);
+ spin_lock(&d->opd_pre_lock);
thread->t_flags = SVC_RUNNING;
- cfs_spin_unlock(&d->opd_pre_lock);
+ spin_unlock(&d->opd_pre_lock);
cfs_waitq_signal(&thread->t_ctl_waitq);
while (osp_precreate_running(d)) {
osp_statfs_update(d);
/*
- * wait for local recovery to finish, so we can cleanup orphans
- * orphans are all objects since "last used" (assigned), but
- * there might be objects reserved and in some cases they won't
- * be used. we can't cleanup them till we're sure they won't be
- * used. so we block new reservations and wait till all reserved
- * objects either user or released.
+ * Clean up orphans or recreate missing objects.
*/
- l_wait_event(d->opd_pre_waitq, (!d->opd_pre_reserved &&
- d->opd_recovery_completed) ||
- !osp_precreate_running(d) ||
- d->opd_got_disconnected, &lwi);
-
- if (osp_precreate_running(d) && !d->opd_got_disconnected) {
- rc = osp_precreate_cleanup_orphans(d);
- if (rc) {
- CERROR("%s: cannot cleanup orphans: rc = %d\n",
- d->opd_obd->obd_name, rc);
- }
- }
+ rc = osp_precreate_cleanup_orphans(d);
+ if (rc != 0)
+ continue;
/*
* connected, can handle precreates now
static int osp_precreate_ready_condition(struct osp_device *d)
{
+ __u64 next;
+
+ if (d->opd_pre_recovering)
+ return 0;
+
/* ready if got enough precreated objects */
- if (d->opd_pre_next + d->opd_pre_reserved < d->opd_pre_last_created)
+ /* we need to wait for others (opd_pre_reserved) and our object (+1) */
+ next = d->opd_pre_used_id + d->opd_pre_reserved + 1;
+ if (next <= d->opd_pre_last_created)
return 1;
- /* ready if OST reported no space */
- if (d->opd_pre_status != 0)
+ /* ready if OST reported no space and no destroys in progress */
+ if (d->opd_syn_changes + d->opd_syn_rpc_in_progress == 0 &&
+ d->opd_pre_status != 0)
return 1;
return 0;
struct osp_device *d = data;
LCONSOLE_WARN("%s: slow creates, last="LPU64", next="LPU64", "
- "reserved="LPU64", status=%d\n",
+ "reserved="LPU64", syn_changes=%lu, "
+ "syn_rpc_in_progress=%d, status=%d\n",
d->opd_obd->obd_name, d->opd_pre_last_created,
- d->opd_pre_next, d->opd_pre_reserved,
+ d->opd_pre_used_id, d->opd_pre_reserved,
+ d->opd_syn_changes, d->opd_syn_rpc_in_progress,
d->opd_pre_status);
return 0;
struct l_wait_info lwi;
cfs_time_t expire = cfs_time_shift(obd_timeout);
int precreated, rc;
+ int count = 0;
ENTRY;
- LASSERT(d->opd_pre_last_created >= d->opd_pre_next);
+ LASSERT(d->opd_pre_last_created >= d->opd_pre_used_id);
lwi = LWI_TIMEOUT(cfs_time_seconds(obd_timeout),
osp_precreate_timeout_condition, d);
break;
}
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 3, 90, 0)
+ /*
+ * to address Andreas's concern on possible busy-loop
+ * between this thread and osp_precreate_send()
+ */
+ if (unlikely(count++ == 1000)) {
+ osp_precreate_timeout_condition(d);
+ LBUG();
+ }
+#endif
+
/*
* increase number of precreations
*/
if (d->opd_pre_grow_count < d->opd_pre_max_grow_count &&
d->opd_pre_grow_slow == 0 &&
- (d->opd_pre_last_created - d->opd_pre_next <=
+ (d->opd_pre_last_created - d->opd_pre_used_id <=
d->opd_pre_grow_count / 4 + 1)) {
- cfs_spin_lock(&d->opd_pre_lock);
+ spin_lock(&d->opd_pre_lock);
d->opd_pre_grow_slow = 1;
d->opd_pre_grow_count *= 2;
- cfs_spin_unlock(&d->opd_pre_lock);
+ spin_unlock(&d->opd_pre_lock);
}
- /*
- * we never use the last object in the window
- */
- cfs_spin_lock(&d->opd_pre_lock);
- precreated = d->opd_pre_last_created - d->opd_pre_next;
- if (precreated > d->opd_pre_reserved) {
+ spin_lock(&d->opd_pre_lock);
+ precreated = d->opd_pre_last_created - d->opd_pre_used_id;
+ if (precreated > d->opd_pre_reserved &&
+ !d->opd_pre_recovering) {
d->opd_pre_reserved++;
- cfs_spin_unlock(&d->opd_pre_lock);
+ spin_unlock(&d->opd_pre_lock);
rc = 0;
/* XXX: don't wake up if precreation is in progress */
break;
}
- cfs_spin_unlock(&d->opd_pre_lock);
+ spin_unlock(&d->opd_pre_lock);
+
+ /*
+ * all precreated objects have been used and no-space
+ * status leave us no chance to succeed very soon
+ * but if there is destroy in progress, then we should
+ * wait till that is done - some space might be released
+ */
+ if (unlikely(rc == -ENOSPC)) {
+ if (d->opd_syn_changes) {
+ /* force local commit to release space */
+ dt_commit_async(env, d->opd_storage);
+ }
+ if (d->opd_syn_rpc_in_progress) {
+ /* just wait till destroys are done */
+ /* see l_wait_event() a few lines below */
+ }
+ if (d->opd_syn_changes +
+ d->opd_syn_rpc_in_progress == 0) {
+ /* no hope for free space */
+ break;
+ }
+ }
/* XXX: don't wake up if precreation is in progress */
cfs_waitq_signal(&d->opd_pre_waitq);
obd_id objid;
/* grab next id from the pool */
- cfs_spin_lock(&d->opd_pre_lock);
- LASSERT(d->opd_pre_next <= d->opd_pre_last_created);
- objid = d->opd_pre_next++;
+ spin_lock(&d->opd_pre_lock);
+ LASSERT(d->opd_pre_used_id < d->opd_pre_last_created);
+ objid = ++d->opd_pre_used_id;
d->opd_pre_reserved--;
/*
* last_used_id must be changed along with getting new id otherwise
* we might miscalculate gap causing object loss or leak
*/
osp_update_last_id(d, objid);
- cfs_spin_unlock(&d->opd_pre_lock);
+ spin_unlock(&d->opd_pre_lock);
/*
* probably main thread suspended orphan cleanup till
/* initially precreation isn't ready */
d->opd_pre_status = -EAGAIN;
- d->opd_pre_next = 1;
- d->opd_pre_last_created = 1;
+ d->opd_pre_used_id = 0;
+ d->opd_pre_last_created = 0;
d->opd_pre_reserved = 0;
d->opd_got_disconnected = 1;
d->opd_pre_grow_slow = 0;
d->opd_pre_min_grow_count = OST_MIN_PRECREATE;
d->opd_pre_max_grow_count = OST_MAX_PRECREATE;
- cfs_spin_lock_init(&d->opd_pre_lock);
+ spin_lock_init(&d->opd_pre_lock);
cfs_waitq_init(&d->opd_pre_waitq);
cfs_waitq_init(&d->opd_pre_user_waitq);
cfs_waitq_init(&d->opd_pre_thread.t_ctl_waitq);