/* couldn't update statfs, try again as soon as possible */
cfs_waitq_signal(&d->opd_pre_waitq);
if (req->rq_import_generation == imp->imp_generation)
- CERROR("%s: couldn't update statfs: rc = %d\n",
+ CDEBUG(D_CACHE, "%s: couldn't update statfs: rc = %d\n",
d->opd_obd->obd_name, rc);
RETURN(rc);
}
aa = ptlrpc_req_async_args(req);
aa->pointer_arg[0] = d;
- ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
-
- cfs_timer_disarm(&d->opd_statfs_timer);
-
/*
* no updates till reply
*/
+ cfs_timer_disarm(&d->opd_statfs_timer);
d->opd_statfs_fresh_till = cfs_time_shift(obd_timeout * 1000);
d->opd_statfs_update_in_progress = 1;
+ ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+
RETURN(0);
}
static inline int osp_precreate_near_empty_nolock(struct osp_device *d)
{
- int window = d->opd_pre_last_created - d->opd_pre_next;
+ int window = d->opd_pre_last_created - d->opd_pre_used_id;
/* don't consider new precreation till OST is healty and
* has free space */
int rc;
/* XXX: do we really need locking here? */
- cfs_spin_lock(&d->opd_pre_lock);
+ spin_lock(&d->opd_pre_lock);
rc = osp_precreate_near_empty_nolock(d);
- cfs_spin_unlock(&d->opd_pre_lock);
+ spin_unlock(&d->opd_pre_lock);
return rc;
}
RETURN(rc);
}
- cfs_spin_lock(&d->opd_pre_lock);
+ spin_lock(&d->opd_pre_lock);
if (d->opd_pre_grow_count > d->opd_pre_max_grow_count / 2)
d->opd_pre_grow_count = d->opd_pre_max_grow_count / 2;
grow = d->opd_pre_grow_count;
- cfs_spin_unlock(&d->opd_pre_lock);
+ spin_unlock(&d->opd_pre_lock);
body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
LASSERT(body);
if (body == NULL)
GOTO(out_req, rc = -EPROTO);
- CDEBUG(D_HA, "new last_created %lu\n", (unsigned long) body->oa.o_id);
- LASSERT(body->oa.o_id > d->opd_pre_next);
+ CDEBUG(D_HA, "%s: new last_created "LPU64"\n", d->opd_obd->obd_name,
+ body->oa.o_id);
+ LASSERT(body->oa.o_id > d->opd_pre_used_id);
diff = body->oa.o_id - d->opd_pre_last_created;
- cfs_spin_lock(&d->opd_pre_lock);
+ spin_lock(&d->opd_pre_lock);
if (diff < grow) {
/* the OST has not managed to create all the
* objects we asked for */
d->opd_pre_grow_slow = 0;
}
d->opd_pre_last_created = body->oa.o_id;
- cfs_spin_unlock(&d->opd_pre_lock);
+ spin_unlock(&d->opd_pre_lock);
CDEBUG(D_OTHER, "current precreated pool: %llu-%llu\n",
- d->opd_pre_next, d->opd_pre_last_created);
+ d->opd_pre_used_id, d->opd_pre_last_created);
out_req:
/* now we can wakeup all users awaiting for objects */
RETURN(rc);
}
+
+static int osp_get_lastid_from_ost(struct osp_device *d)
+{
+ struct ptlrpc_request *req;
+ struct obd_import *imp;
+ obd_id *reply;
+ char *tmp;
+ int rc;
+
+ imp = d->opd_obd->u.cli.cl_import;
+ LASSERT(imp);
+
+ req = ptlrpc_request_alloc(imp, &RQF_OST_GET_INFO_LAST_ID);
+ if (req == NULL)
+ RETURN(-ENOMEM);
+
+ req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_KEY,
+ RCL_CLIENT, sizeof(KEY_LAST_ID));
+ rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GET_INFO);
+ if (rc) {
+ ptlrpc_request_free(req);
+ RETURN(rc);
+ }
+
+ tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY);
+ memcpy(tmp, KEY_LAST_ID, sizeof(KEY_LAST_ID));
+
+ req->rq_no_delay = req->rq_no_resend = 1;
+ ptlrpc_request_set_replen(req);
+ rc = ptlrpc_queue_wait(req);
+ if (rc) {
+ /* bad-bad OST.. let sysadm sort this out */
+ ptlrpc_set_import_active(imp, 0);
+ GOTO(out, rc);
+ }
+
+ reply = req_capsule_server_get(&req->rq_pill, &RMF_OBD_ID);
+ if (reply == NULL)
+ GOTO(out, rc = -EPROTO);
+
+ d->opd_last_used_id = *reply;
+ CDEBUG(D_HA, "%s: got last_id "LPU64" from OST\n",
+ d->opd_obd->obd_name, d->opd_last_used_id);
+
+out:
+ ptlrpc_req_finished(req);
+ RETURN(rc);
+
+}
+
/**
* asks OST to clean precreate orphans
* and gets next id for new objects
struct ptlrpc_request *req = NULL;
struct obd_import *imp;
struct ost_body *body;
+ struct l_wait_info lwi = { 0 };
+ int update_status = 0;
int rc;
ENTRY;
- LASSERT(d->opd_recovery_completed);
- LASSERT(d->opd_pre_reserved == 0);
+ /*
+ * wait for local recovery to finish, so we can cleanup orphans
+ * orphans are all objects since "last used" (assigned), but
+ * there might be objects reserved and in some cases they won't
+ * be used. we can't cleanup them till we're sure they won't be
+ * used. also we can't allow new reservations because they may
+ * end up getting orphans being cleaned up below. so we block
+ * new reservations and wait till all reserved objects are either
+ * used or released.
+ */
+ spin_lock(&d->opd_pre_lock);
+ d->opd_pre_recovering = 1;
+ spin_unlock(&d->opd_pre_lock);
+ /*
+ * The locking above makes sure the opd_pre_reserved check below will
+ * catch all osp_precreate_reserve() calls who find
+ * "!opd_pre_recovering".
+ */
+ l_wait_event(d->opd_pre_waitq,
+ (!d->opd_pre_reserved && d->opd_recovery_completed) ||
+ !osp_precreate_running(d) || d->opd_got_disconnected,
+ &lwi);
+ if (!osp_precreate_running(d) || d->opd_got_disconnected)
+ GOTO(out, rc = -EAGAIN);
+
+ CDEBUG(D_HA, "%s: going to cleanup orphans since "LPU64"\n",
+ d->opd_obd->obd_name, d->opd_last_used_id);
+
+ if (d->opd_last_used_id < 2) {
+ /* lastid looks strange... ask OST */
+ rc = osp_get_lastid_from_ost(d);
+ if (rc)
+ GOTO(out, rc);
+ }
imp = d->opd_obd->u.cli.cl_import;
LASSERT(imp);
req = ptlrpc_request_alloc(imp, &RQF_OST_CREATE);
if (req == NULL)
- RETURN(-ENOMEM);
+ GOTO(out, rc = -ENOMEM);
rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_CREATE);
if (rc) {
ptlrpc_request_free(req);
- RETURN(rc);
+ GOTO(out, rc);
}
body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
if (body == NULL)
- GOTO(out_req, rc = -EPROTO);
+ GOTO(out, rc = -EPROTO);
body->oa.o_flags = OBD_FL_DELORPHAN;
body->oa.o_valid = OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
body->oa.o_seq = FID_SEQ_OST_MDT0;
- /* remove from NEXT after used one */
- body->oa.o_id = d->opd_last_used_id + 1;
+ body->oa.o_id = d->opd_last_used_id;
ptlrpc_request_set_replen(req);
req->rq_no_resend = req->rq_no_delay = 1;
rc = ptlrpc_queue_wait(req);
- if (rc)
- GOTO(out_req, rc);
+ if (rc) {
+ update_status = 1;
+ GOTO(out, rc);
+ }
body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
if (body == NULL)
- GOTO(out_req, rc = -EPROTO);
+ GOTO(out, rc = -EPROTO);
/*
* OST provides us with id new pool starts from in body->oa.o_id
*/
- cfs_spin_lock(&d->opd_pre_lock);
+ spin_lock(&d->opd_pre_lock);
if (le64_to_cpu(d->opd_last_used_id) > body->oa.o_id) {
d->opd_pre_grow_count = OST_MIN_PRECREATE +
le64_to_cpu(d->opd_last_used_id) -
body->oa.o_id;
- d->opd_pre_last_created = le64_to_cpu(d->opd_last_used_id) + 1;
+ d->opd_pre_last_created = le64_to_cpu(d->opd_last_used_id);
} else {
d->opd_pre_grow_count = OST_MIN_PRECREATE;
- d->opd_pre_last_created = body->oa.o_id + 1;
+ d->opd_pre_last_created = body->oa.o_id;
}
- d->opd_pre_next = d->opd_pre_last_created;
+ /*
+ * This empties the pre-creation pool and effectively blocks any new
+ * reservations.
+ */
+ d->opd_pre_used_id = d->opd_pre_last_created;
d->opd_pre_grow_slow = 0;
- cfs_spin_unlock(&d->opd_pre_lock);
+ spin_unlock(&d->opd_pre_lock);
- /* now we can wakeup all users awaiting for objects */
- osp_pre_update_status(d, rc);
- cfs_waitq_signal(&d->opd_pre_user_waitq);
+ CDEBUG(D_HA, "%s: Got last_id "LPU64" from OST, last_used is "LPU64
+ ", pre_used "LPU64"\n", d->opd_obd->obd_name, body->oa.o_id,
+ le64_to_cpu(d->opd_last_used_id), d->opd_pre_used_id);
+
+out:
+ if (req)
+ ptlrpc_req_finished(req);
- CDEBUG(D_HA, "Got last_id "LPU64" from OST, last_used is "LPU64
- ", next "LPU64"\n", body->oa.o_id,
- le64_to_cpu(d->opd_last_used_id), d->opd_pre_next);
+ d->opd_pre_recovering = 0;
+
+ /*
+ * If rc is zero, the pre-creation window should have been emptied.
+ * Since waking up the herd would be useless without pre-created
+ * objects, we defer the signal to osp_precreate_send() in that case.
+ */
+ if (rc != 0) {
+ if (update_status) {
+ CERROR("%s: cannot cleanup orphans: rc = %d\n",
+ d->opd_obd->obd_name, rc);
+ /* we can't proceed from here, OST seem to
+ * be in a bad shape, better to wait for
+ * a new instance of the server and repeat
+ * from the beginning. notify possible waiters
+ * this OSP isn't quite functional yet */
+ osp_pre_update_status(d, rc);
+ } else {
+ cfs_waitq_signal(&d->opd_pre_user_waitq);
+ }
+ }
-out_req:
- ptlrpc_req_finished(req);
RETURN(rc);
}
if (rc)
goto out;
+ /* Add a bit of hysteresis so this flag isn't continually flapping,
+ * and ensure that new files don't get extremely fragmented due to
+ * only a small amount of available space in the filesystem.
+ * We want to set the NOSPC flag when there is less than ~0.1% free
+ * and clear it when there is at least ~0.2% free space, so:
+ * avail < ~0.1% max max = avail + used
+ * 1025 * avail < avail + used used = blocks - free
+ * 1024 * avail < used
+ * 1024 * avail < blocks - free
+ * avail < ((blocks - free) >> 10)
+ *
+ * On very large disk, say 16TB 0.1% will be 16 GB. We don't want to
+ * lose that amount of space so in those cases we report no space left
+ * if there is less than 1 GB left. */
if (likely(msfs->os_type)) {
used = min_t(__u64, (msfs->os_blocks - msfs->os_bfree) >> 10,
1 << 30);
d->opd_obd->obd_name, msfs->os_blocks,
msfs->os_bfree, used, msfs->os_bavail,
d->opd_pre_status, rc);
+ CDEBUG(D_INFO,
+ "non-commited changes: %lu, in progress: %u\n",
+ d->opd_syn_changes, d->opd_syn_rpc_in_progress);
} else if (old == -ENOSPC) {
d->opd_pre_status = 0;
d->opd_pre_grow_slow = 0;
sprintf(pname, "osp-pre-%u\n", d->opd_index);
cfs_daemonize(pname);
- cfs_spin_lock(&d->opd_pre_lock);
+ spin_lock(&d->opd_pre_lock);
thread->t_flags = SVC_RUNNING;
- cfs_spin_unlock(&d->opd_pre_lock);
+ spin_unlock(&d->opd_pre_lock);
cfs_waitq_signal(&thread->t_ctl_waitq);
while (osp_precreate_running(d)) {
osp_statfs_update(d);
/*
- * wait for local recovery to finish, so we can cleanup orphans
- * orphans are all objects since "last used" (assigned), but
- * there might be objects reserved and in some cases they won't
- * be used. we can't cleanup them till we're sure they won't be
- * used. so we block new reservations and wait till all reserved
- * objects either user or released.
+ * Clean up orphans or recreate missing objects.
*/
- l_wait_event(d->opd_pre_waitq, (!d->opd_pre_reserved &&
- d->opd_recovery_completed) ||
- !osp_precreate_running(d) ||
- d->opd_got_disconnected, &lwi);
-
- if (osp_precreate_running(d) && !d->opd_got_disconnected) {
- rc = osp_precreate_cleanup_orphans(d);
- if (rc) {
- CERROR("%s: cannot cleanup orphans: rc = %d\n",
- d->opd_obd->obd_name, rc);
- }
- }
+ rc = osp_precreate_cleanup_orphans(d);
+ if (rc != 0)
+ continue;
/*
* connected, can handle precreates now
static int osp_precreate_ready_condition(struct osp_device *d)
{
+ __u64 next;
+
+ if (d->opd_pre_recovering)
+ return 0;
+
/* ready if got enough precreated objects */
- if (d->opd_pre_next + d->opd_pre_reserved < d->opd_pre_last_created)
+ /* we need to wait for others (opd_pre_reserved) and our object (+1) */
+ next = d->opd_pre_used_id + d->opd_pre_reserved + 1;
+ if (next <= d->opd_pre_last_created)
return 1;
- /* ready if OST reported no space */
- if (d->opd_pre_status != 0)
+ /* ready if OST reported no space and no destroys in progress */
+ if (d->opd_syn_changes + d->opd_syn_rpc_in_progress == 0 &&
+ d->opd_pre_status != 0)
return 1;
return 0;
struct osp_device *d = data;
LCONSOLE_WARN("%s: slow creates, last="LPU64", next="LPU64", "
- "reserved="LPU64", status=%d\n",
+ "reserved="LPU64", syn_changes=%lu, "
+ "syn_rpc_in_progress=%d, status=%d\n",
d->opd_obd->obd_name, d->opd_pre_last_created,
- d->opd_pre_next, d->opd_pre_reserved,
+ d->opd_pre_used_id, d->opd_pre_reserved,
+ d->opd_syn_changes, d->opd_syn_rpc_in_progress,
d->opd_pre_status);
return 0;
struct l_wait_info lwi;
cfs_time_t expire = cfs_time_shift(obd_timeout);
int precreated, rc;
+ int count = 0;
ENTRY;
- LASSERT(d->opd_pre_last_created >= d->opd_pre_next);
+ LASSERT(d->opd_pre_last_created >= d->opd_pre_used_id);
lwi = LWI_TIMEOUT(cfs_time_seconds(obd_timeout),
osp_precreate_timeout_condition, d);
break;
}
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 3, 90, 0)
+ /*
+ * to address Andreas's concern on possible busy-loop
+ * between this thread and osp_precreate_send()
+ */
+ if (unlikely(count++ == 1000)) {
+ osp_precreate_timeout_condition(d);
+ LBUG();
+ }
+#endif
+
/*
* increase number of precreations
*/
if (d->opd_pre_grow_count < d->opd_pre_max_grow_count &&
d->opd_pre_grow_slow == 0 &&
- (d->opd_pre_last_created - d->opd_pre_next <=
+ (d->opd_pre_last_created - d->opd_pre_used_id <=
d->opd_pre_grow_count / 4 + 1)) {
- cfs_spin_lock(&d->opd_pre_lock);
+ spin_lock(&d->opd_pre_lock);
d->opd_pre_grow_slow = 1;
d->opd_pre_grow_count *= 2;
- cfs_spin_unlock(&d->opd_pre_lock);
+ spin_unlock(&d->opd_pre_lock);
}
- /*
- * we never use the last object in the window
- */
- cfs_spin_lock(&d->opd_pre_lock);
- precreated = d->opd_pre_last_created - d->opd_pre_next;
- if (precreated > d->opd_pre_reserved) {
+ spin_lock(&d->opd_pre_lock);
+ precreated = d->opd_pre_last_created - d->opd_pre_used_id;
+ if (precreated > d->opd_pre_reserved &&
+ !d->opd_pre_recovering) {
d->opd_pre_reserved++;
- cfs_spin_unlock(&d->opd_pre_lock);
+ spin_unlock(&d->opd_pre_lock);
rc = 0;
/* XXX: don't wake up if precreation is in progress */
break;
}
- cfs_spin_unlock(&d->opd_pre_lock);
+ spin_unlock(&d->opd_pre_lock);
+
+ /*
+ * all precreated objects have been used and no-space
+ * status leave us no chance to succeed very soon
+ * but if there is destroy in progress, then we should
+ * wait till that is done - some space might be released
+ */
+ if (unlikely(rc == -ENOSPC)) {
+ if (d->opd_syn_changes) {
+ /* force local commit to release space */
+ dt_commit_async(env, d->opd_storage);
+ }
+ if (d->opd_syn_rpc_in_progress) {
+ /* just wait till destroys are done */
+ /* see l_wait_event() a few lines below */
+ }
+ if (d->opd_syn_changes +
+ d->opd_syn_rpc_in_progress == 0) {
+ /* no hope for free space */
+ break;
+ }
+ }
/* XXX: don't wake up if precreation is in progress */
cfs_waitq_signal(&d->opd_pre_waitq);
obd_id objid;
/* grab next id from the pool */
- cfs_spin_lock(&d->opd_pre_lock);
- LASSERT(d->opd_pre_next <= d->opd_pre_last_created);
- objid = d->opd_pre_next++;
+ spin_lock(&d->opd_pre_lock);
+ LASSERT(d->opd_pre_used_id < d->opd_pre_last_created);
+ objid = ++d->opd_pre_used_id;
d->opd_pre_reserved--;
/*
* last_used_id must be changed along with getting new id otherwise
* we might miscalculate gap causing object loss or leak
*/
osp_update_last_id(d, objid);
- cfs_spin_unlock(&d->opd_pre_lock);
+ spin_unlock(&d->opd_pre_lock);
/*
* probably main thread suspended orphan cleanup till
/* initially precreation isn't ready */
d->opd_pre_status = -EAGAIN;
- d->opd_pre_next = 1;
- d->opd_pre_last_created = 1;
+ d->opd_pre_used_id = 0;
+ d->opd_pre_last_created = 0;
d->opd_pre_reserved = 0;
d->opd_got_disconnected = 1;
d->opd_pre_grow_slow = 0;
d->opd_pre_min_grow_count = OST_MIN_PRECREATE;
d->opd_pre_max_grow_count = OST_MAX_PRECREATE;
- cfs_spin_lock_init(&d->opd_pre_lock);
+ spin_lock_init(&d->opd_pre_lock);
cfs_waitq_init(&d->opd_pre_waitq);
cfs_waitq_init(&d->opd_pre_user_waitq);
cfs_waitq_init(&d->opd_pre_thread.t_ctl_waitq);