#define DEBUG_SUBSYSTEM S_MDS
#include <linux/kthread.h>
+
+#include <lustre_obdo.h>
+
#include "osp_internal.h"
/*
/* don't consider new precreation till OST is healty and
* has free space */
- return ((window - d->opd_pre_reserved < d->opd_pre_grow_count / 2) &&
+ return ((window - d->opd_pre_reserved < d->opd_pre_create_count / 2) &&
(d->opd_pre_status == 0));
}
}
spin_lock(&d->opd_pre_lock);
- if (d->opd_pre_grow_count > d->opd_pre_max_grow_count / 2)
- d->opd_pre_grow_count = d->opd_pre_max_grow_count / 2;
- grow = d->opd_pre_grow_count;
+ if (d->opd_pre_create_count > d->opd_pre_max_create_count / 2)
+ d->opd_pre_create_count = d->opd_pre_max_create_count / 2;
+ grow = d->opd_pre_create_count;
spin_unlock(&d->opd_pre_lock);
body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
if (diff < grow) {
/* the OST has not managed to create all the
* objects we asked for */
- d->opd_pre_grow_count = max(diff, OST_MIN_PRECREATE);
- d->opd_pre_grow_slow = 1;
+ d->opd_pre_create_count = max(diff, OST_MIN_PRECREATE);
+ d->opd_pre_create_slow = 1;
} else {
/* the OST is able to keep up with the work,
- * we could consider increasing grow_count
+ * we could consider increasing create_count
* next time if needed */
- d->opd_pre_grow_slow = 0;
+ d->opd_pre_create_slow = 0;
}
body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
spin_lock(&d->opd_pre_lock);
diff = osp_fid_diff(&d->opd_last_used_fid, last_fid);
if (diff > 0) {
- d->opd_pre_grow_count = OST_MIN_PRECREATE + diff;
+ d->opd_pre_create_count = OST_MIN_PRECREATE + diff;
d->opd_pre_last_created_fid = d->opd_last_used_fid;
} else {
- d->opd_pre_grow_count = OST_MIN_PRECREATE;
+ d->opd_pre_create_count = OST_MIN_PRECREATE;
d->opd_pre_last_created_fid = *last_fid;
}
/*
LASSERT(fid_oid(&d->opd_pre_last_created_fid) <=
LUSTRE_DATA_SEQ_MAX_WIDTH);
d->opd_pre_used_fid = d->opd_pre_last_created_fid;
- d->opd_pre_grow_slow = 0;
+ d->opd_pre_create_slow = 0;
spin_unlock(&d->opd_pre_lock);
CDEBUG(D_HA, "%s: Got last_id "DFID" from OST, last_created "DFID
* Add a bit of hysteresis so this flag isn't continually flapping,
* and ensure that new files don't get extremely fragmented due to
* only a small amount of available space in the filesystem.
- * We want to set the ENOSPC when there is less than reserved size
- * free and clear it when there is at least 2*reserved size free space.
+ * We want to set the NOSPC flag when there is less than ~0.1% free
+ * and clear it when there is at least ~0.2% free space, so:
+ * avail < ~0.1% max max = avail + used
+ * 1025 * avail < avail + used used = blocks - free
+ * 1024 * avail < used
+ * 1024 * avail < blocks - free
+ * avail < ((blocks - free) >> 10)
+ *
+ * On a very large disk, say 16 TB, 0.1% will be 16 GB. We don't want to
+ * lose that amount of space, so in those cases we report no space left
+ * if there is less than 1 GB left.
* the function updates current precreation status used: functional or not
*
* \param[in] d OSP device
{
struct obd_statfs *msfs = &d->opd_statfs;
int old = d->opd_pre_status;
- __u64 available;
+ __u64 used;
d->opd_pre_status = rc;
if (rc)
goto out;
if (likely(msfs->os_type)) {
- if (d->opd_reserved_mb_high == 0 &&
- d->opd_reserved_mb_low == 0) {
- /* Use ~0.1% by default to disable object allocation,
- * and ~0.2% to enable, size in MB, set both watermark
- */
- spin_lock(&d->opd_pre_lock);
- if (d->opd_reserved_mb_high == 0 &&
- d->opd_reserved_mb_low == 0) {
- d->opd_reserved_mb_low = (msfs->os_bsize *
- msfs->os_blocks) >> 30;
- if (d->opd_reserved_mb_low == 0)
- d->opd_reserved_mb_low = 1;
- d->opd_reserved_mb_high =
- (d->opd_reserved_mb_low << 1) + 1;
- }
- spin_unlock(&d->opd_pre_lock);
- }
- /* in MB */
- available = (msfs->os_bavail * (msfs->os_bsize >> 10)) >> 10;
- if ((msfs->os_ffree < 32) ||
- (available < d->opd_reserved_mb_low)) {
+ used = min_t(__u64, (msfs->os_blocks - msfs->os_bfree) >> 10,
+ 1 << 30);
+ if ((msfs->os_ffree < 32) || (msfs->os_bavail < used)) {
d->opd_pre_status = -ENOSPC;
if (old != -ENOSPC)
CDEBUG(D_INFO, "%s: status: "LPU64" blocks, "
- LPU64" free, "LPU64" avail, "LPU64" "
- "MB avail, %u hwm -> %d: rc = %d\n",
+ LPU64" free, "LPU64" used, "LPU64" "
+ "avail -> %d: rc = %d\n",
d->opd_obd->obd_name, msfs->os_blocks,
- msfs->os_bfree, msfs->os_bavail,
- available, d->opd_reserved_mb_low,
+ msfs->os_bfree, used, msfs->os_bavail,
d->opd_pre_status, rc);
CDEBUG(D_INFO,
- "non-commited changes: %lu, in progress: %u\n",
+ "non-committed changes: %lu, in progress: %u\n",
d->opd_syn_changes, d->opd_syn_rpc_in_progress);
- } else if (unlikely(old == -ENOSPC &&
- (msfs->os_ffree > 64) &&
- (available > d->opd_reserved_mb_high))) {
+ } else if (old == -ENOSPC) {
d->opd_pre_status = 0;
spin_lock(&d->opd_pre_lock);
- d->opd_pre_grow_slow = 0;
- d->opd_pre_grow_count = OST_MIN_PRECREATE;
+ d->opd_pre_create_slow = 0;
+ d->opd_pre_create_count = OST_MIN_PRECREATE;
spin_unlock(&d->opd_pre_lock);
wake_up(&d->opd_pre_waitq);
CDEBUG(D_INFO, "%s: no space: "LPU64" blocks, "LPU64
- " free, "LPU64" avail, "LPU64"MB avail, %u nwm"
- " -> %d: rc = %d\n", d->opd_obd->obd_name,
- msfs->os_blocks, msfs->os_bfree, msfs->os_bavail,
- available, d->opd_reserved_mb_high,
- d->opd_pre_status, rc);
+ " free, "LPU64" used, "LPU64" avail -> %d: "
+ "rc = %d\n", d->opd_obd->obd_name,
+ msfs->os_blocks, msfs->os_bfree, used,
+ msfs->os_bavail, d->opd_pre_status, rc);
}
}
+
out:
wake_up(&d->opd_pre_user_waitq);
}
if (d->opd_pre_status != 0 &&
d->opd_pre_status != -EAGAIN &&
d->opd_pre_status != -ENODEV &&
+ d->opd_pre_status != -ENOTCONN &&
d->opd_pre_status != -ENOSPC) {
/* DEBUG LU-3230 */
if (d->opd_pre_status != -EIO)
* increase number of precreations
*/
precreated = osp_objs_precreated(env, d);
- if (d->opd_pre_grow_count < d->opd_pre_max_grow_count &&
- d->opd_pre_grow_slow == 0 &&
- precreated <= (d->opd_pre_grow_count / 4 + 1)) {
+ if (d->opd_pre_create_count < d->opd_pre_max_create_count &&
+ d->opd_pre_create_slow == 0 &&
+ precreated <= (d->opd_pre_create_count / 4 + 1)) {
spin_lock(&d->opd_pre_lock);
- d->opd_pre_grow_slow = 1;
- d->opd_pre_grow_count *= 2;
+ d->opd_pre_create_slow = 1;
+ d->opd_pre_create_count *= 2;
spin_unlock(&d->opd_pre_lock);
}
* XXX: decide how do we do here with resend
* if we don't resend, then client may see wrong file size
* if we do resend, then MDS thread can get stuck for quite long
+ * and if we don't resend, then client will also get -EWOULDBLOCK !!
+ * (see LU-7975 and sanity/test_27F use cases)
+ * but let's decide not to resend/delay this truncate request to OST
+ * and allow Client to decide to resend, in a less aggressive way from
+ * after_reply(), by returning -EINPROGRESS instead of
+ * -EAGAIN/-EWOULDBLOCK upon return from ptlrpc_queue_wait() at the
+ * end of this routine
*/
req->rq_no_resend = req->rq_no_delay = 1;
ptlrpc_request_set_replen(req);
rc = ptlrpc_queue_wait(req);
- if (rc)
- CERROR("can't punch object: %d\n", rc);
+ if (rc) {
+ /* -EWOULDBLOCK/-EAGAIN means OST is unreachable at the moment
+ * since we have decided not to resend/delay, but this could
+ * lead to wrong size to be seen at Client side and even process
+ * trying to open to exit/fail if not itself handling -EAGAIN.
+ * So it should be better to return -EINPROGRESS instead and
+ * leave the decision to resend at Client side in after_reply()
+ */
+ if (rc == -EWOULDBLOCK) {
+ rc = -EINPROGRESS;
+ CDEBUG(D_HA, "returning -EINPROGRESS instead of "
+ "-EWOULDBLOCK/-EAGAIN to allow Client to "
+ "resend\n");
+ } else {
+ CERROR("can't punch object: %d\n", rc);
+ }
+ }
out:
ptlrpc_req_finished(req);
if (oa)
d->opd_pre_last_created_fid.f_oid = 1;
d->opd_pre_reserved = 0;
d->opd_got_disconnected = 1;
- d->opd_pre_grow_slow = 0;
- d->opd_pre_grow_count = OST_MIN_PRECREATE;
- d->opd_pre_min_grow_count = OST_MIN_PRECREATE;
- d->opd_pre_max_grow_count = OST_MAX_PRECREATE;
- d->opd_reserved_mb_high = 0;
- d->opd_reserved_mb_low = 0;
+ d->opd_pre_create_slow = 0;
+ d->opd_pre_create_count = OST_MIN_PRECREATE;
+ d->opd_pre_min_create_count = OST_MIN_PRECREATE;
+ d->opd_pre_max_create_count = OST_MAX_PRECREATE;
spin_lock_init(&d->opd_pre_lock);
init_waitqueue_head(&d->opd_pre_waitq);