When the obdfilter code was split into separate OFD and OSD modules,
the bulk IO page allocation was implemented to use GFP_NOFS to avoid
allocations recursing into the filesystem and causing deadlocks.
However, such a deadlock is only possible if the RPC is coming from a
local client, as we might end up waiting on a page sent in the request
we're serving. Local RPCs keep using GFP_NOFS, together with __GFP_HIGHMEM
so that the pages can use all of the available memory on the OSS on
32-bit machines.
It is possible to use more aggressive GFP_HIGHUSER flags for non-local
clients to be able to generate more memory pressure on the OSS and
allow inactive pages to be reclaimed, since the OSS doesn't have any
other processes or allocations that generate memory reclaim pressure.
See also b=17576 (bdf50dc9) and b=19529 (3dcf18d3) for details.
The patch also implements an LNet function to determine if a client NID
is local or not. This becomes more complex in the LNet Multi-Rail world
and it is really LNet's job to handle NIDs, not that of Lustre.
Lustre-change: https://review.whamcloud.com/27908
Lustre-commit: b0ab95d6133e783acacc6329c025d17fb282775e
Signed-off-by: Andreas Dilger <andreas.dilger@intel.com>
Change-Id: I2806c9c5c2fe269669eafdafaf2310924c3ebbe5
Reviewed-by: Dmitry Eremin <dmitry.eremin@intel.com>
Reviewed-by: Patrick Farrell <paf@cray.com>
Signed-off-by: Minh Diep <minh.diep@intel.com>
Reviewed-on: https://review.whamcloud.com/28318
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: John L. Hammond <john.hammond@intel.com>
int LNetGetId(unsigned int index, struct lnet_process_id *id);
int LNetDist(lnet_nid_t nid, lnet_nid_t *srcnid, __u32 *order);
lnet_nid_t LNetPrimaryNID(lnet_nid_t nid);
+bool LNetIsPeerLocal(lnet_nid_t nid);
/** @} lnet_addr */
EXPORT_SYMBOL(LNetDebugPeer);
/**
+ * Check whether \a nid is the NID of one of this node's own LNet interfaces.
+ *
+ * Every network on the_lnet.ln_nets is visited, and on each network every
+ * interface on net_ni_list has its ni_nid compared against \a nid. The
+ * whole walk runs between lnet_net_lock_current() and lnet_net_unlock().
+ *
+ * \param nid	peer NID to look up
+ *
+ * \retval true		\a nid equals the NID of a local interface
+ * \retval false	no local interface carries this NID
+ */
+bool LNetIsPeerLocal(lnet_nid_t nid)
+{
+	struct lnet_net *net;
+	struct lnet_ni *ni;
+	bool is_local = false;
+	int cpt;
+
+	cpt = lnet_net_lock_current();
+	list_for_each_entry(net, &the_lnet.ln_nets, net_list) {
+		list_for_each_entry(ni, &net->net_ni_list, ni_netlist) {
+			if (ni->ni_nid != nid)
+				continue;
+
+			/* first match settles it; leave both loops at once */
+			is_local = true;
+			goto out_unlock;
+		}
+	}
+out_unlock:
+	lnet_net_unlock(cpt);
+
+	return is_local;
+}
+EXPORT_SYMBOL(LNetIsPeerLocal);
+
+/**
* Retrieve the struct lnet_process_id ID of LNet interface at \a index.
* Note that all interfaces share a same PID, as requested by LNetNIInit().
*
const struct lu_buf *buf, struct thandle *th);
};
+/*
+ * Flags passed as the \a rw argument of dt_bufs_get() / ->dbo_bufs_get().
+ *
+ * DT_BUFS_TYPE_READ is zero, so a read is recognised by the absence of
+ * DT_BUFS_TYPE_WRITE rather than by testing a bit of its own. The other
+ * values are distinct bits and may be OR'ed onto the operation type.
+ */
+enum dt_bufs_type {
+	DT_BUFS_TYPE_READ	= 0,		/* read (no bit set) */
+	DT_BUFS_TYPE_WRITE	= 1 << 0,	/* write */
+	DT_BUFS_TYPE_READAHEAD	= 1 << 1,	/* ladvise prefetch */
+	DT_BUFS_TYPE_LOCAL	= 1 << 2,	/* client is on this node */
+};
+
/**
* Per-dt-object operations on "file body" - unstructure raw data.
*/
loff_t pos,
ssize_t len,
struct niobuf_local *lb,
- int rw);
+ enum dt_bufs_type rw);
/**
* Release reference granted by ->dbo_bufs_get().
static inline int dt_bufs_get(const struct lu_env *env, struct dt_object *d,
struct niobuf_remote *rnb,
- struct niobuf_local *lnb, int rw)
+ struct niobuf_local *lnb, enum dt_bufs_type rw)
{
LASSERT(d);
LASSERT(d->do_body_ops);
void ptlrpc_connection_fini(void);
extern lnet_pid_t ptl_get_pid(void);
+/*
+ * Check if the peer connection is on the local node. We need to use GFP_NOFS
+ * for requests from a local client to avoid recursing into the filesystem
+ * as we might end up waiting on a page sent in the request we're serving.
+ *
+ * Use __GFP_HIGHMEM so that the pages can use all of the available memory
+ * on 32-bit machines. Use more aggressive GFP_HIGHUSER flags from non-local
+ * clients to be able to generate more memory pressure on the OSS and allow
+ * inactive pages to be reclaimed, since it doesn't have any other processes
+ * or allocations that generate memory reclaim pressure.
+ *
+ * See b=17576 (bdf50dc9) and b=19529 (3dcf18d3) for details.
+ *
+ * \param conn	connection to the peer; may be NULL
+ *
+ * \retval true		the peer NID equals conn->c_self, or LNetIsPeerLocal()
+ *			reports it as local
+ * \retval false	\a conn is NULL or the peer NID is not local
+ */
+static inline bool ptlrpc_connection_is_local(struct ptlrpc_connection *conn)
+{
+	/* nothing to inspect, so report the peer as not local */
+	if (!conn)
+		return false;
+
+	/* cheap test first: peer NID is the NID this connection uses itself */
+	if (conn->c_peer.nid == conn->c_self)
+		return true;
+
+	/*
+	 * Plain return, not RETURN(): this function has no ENTRY, so the
+	 * tracing macro would log an exit with no matching entry, and only
+	 * on this one of the three exit paths.
+	 */
+	return LNetIsPeerLocal(conn->c_peer.nid);
+}
+
/* ptlrpc/niobuf.c */
/**
* Actual interfacing with LNet to put/get/register/unregister stuff
static int ofd_ladvise_prefetch(const struct lu_env *env,
struct ofd_object *fo,
struct niobuf_local *lnb,
- __u64 start, __u64 end)
+ __u64 start, __u64 end, enum dt_bufs_type dbt)
{
- struct ofd_thread_info *info = ofd_info(env);
- pgoff_t start_index, end_index, pages;
- struct niobuf_remote rnb;
- unsigned long nr_local;
- int rc = 0;
+ struct ofd_thread_info *info = ofd_info(env);
+ pgoff_t start_index, end_index, pages;
+ struct niobuf_remote rnb;
+ unsigned long nr_local;
+ int rc = 0;
if (end <= start)
RETURN(-EINVAL);
PTLRPC_MAX_BRW_PAGES;
rnb.rnb_offset = start_index << PAGE_SHIFT;
rnb.rnb_len = nr_local << PAGE_SHIFT;
- rc = dt_bufs_get(env, ofd_object_child(fo), &rnb, lnb, 0);
+ rc = dt_bufs_get(env, ofd_object_child(fo), &rnb, lnb, dbt);
if (unlikely(rc < 0))
break;
nr_local = rc;
struct ptlrpc_thread *svc_thread = req->rq_svc_thread;
const struct lu_env *env = svc_thread->t_env;
struct tgt_thread_big_cache *tbc = svc_thread->t_data;
- int rc = 0;
+ enum dt_bufs_type dbt = DT_BUFS_TYPE_READAHEAD;
struct lu_ladvise *ladvise;
int num_advise;
struct ladvise_hdr *ladvise_hdr;
struct dt_object *dob;
__u64 start;
__u64 end;
+ int rc = 0;
ENTRY;
CFS_FAIL_TIMEOUT(OBD_FAIL_OST_LADVISE_PAUSE, cfs_fail_val);
LASSERT(fo != NULL);
dob = ofd_object_child(fo);
+ if (ptlrpc_connection_is_local(exp->exp_connection))
+ dbt |= DT_BUFS_TYPE_LOCAL;
+
for (i = 0; i < num_advise; i++, ladvise++) {
start = ladvise->lla_start;
end = ladvise->lla_end;
req->rq_status = ofd_ladvise_prefetch(env, fo,
tbc->local,
- start, end);
+ start, end, dbt);
tgt_extent_unlock(&lockh, LCK_PR);
break;
case LU_LADVISE_DONTNEED:
struct niobuf_remote *rnb, int *nr_local,
struct niobuf_local *lnb, char *jobid)
{
- struct ofd_object *fo;
- int i, j, rc, tot_bytes = 0;
+ struct ofd_object *fo;
+ int i, j, rc, tot_bytes = 0;
+ enum dt_bufs_type dbt = DT_BUFS_TYPE_READ;
ENTRY;
LASSERT(env != NULL);
GOTO(unlock, rc);
}
- *nr_local = 0;
- for (i = 0, j = 0; i < niocount; i++) {
+ if (ptlrpc_connection_is_local(exp->exp_connection))
+ dbt |= DT_BUFS_TYPE_LOCAL;
+
+ for (*nr_local = 0, i = 0, j = 0; i < niocount; i++) {
rc = dt_bufs_get(env, ofd_object_child(fo), rnb + i,
- lnb + j, 0);
+ lnb + j, dbt);
if (unlikely(rc < 0))
GOTO(buf_put, rc);
LASSERT(rc <= PTLRPC_MAX_BRW_PAGES);
struct niobuf_remote *rnb, int *nr_local,
struct niobuf_local *lnb, char *jobid)
{
- struct ofd_object *fo;
- int i, j, k, rc = 0, tot_bytes = 0;
+ struct ofd_object *fo;
+ int i, j, k, rc = 0, tot_bytes = 0;
+ enum dt_bufs_type dbt = DT_BUFS_TYPE_WRITE;
ENTRY;
LASSERT(env != NULL);
* space back if possible */
tgt_grant_prepare_write(env, exp, oa, rnb, obj->ioo_bufcnt);
+ if (ptlrpc_connection_is_local(exp->exp_connection))
+ dbt |= DT_BUFS_TYPE_LOCAL;
+
/* parse remote buffers to local buffers and prepare the latter */
- *nr_local = 0;
- for (i = 0, j = 0; i < obj->ioo_bufcnt; i++) {
+ for (*nr_local = 0, i = 0, j = 0; i < obj->ioo_bufcnt; i++) {
rc = dt_bufs_get(env, ofd_object_child(fo),
- rnb + i, lnb + j, 1);
+ rnb + i, lnb + j, dbt);
if (unlikely(rc < 0))
GOTO(err, rc);
LASSERT(rc <= PTLRPC_MAX_BRW_PAGES);
RETURN(0);
}
-static struct page *osd_get_page(struct dt_object *dt, loff_t offset, int rw)
+static struct page *osd_get_page(struct dt_object *dt, loff_t offset,
+ gfp_t gfp_mask)
{
- struct inode *inode = osd_dt_obj(dt)->oo_inode;
- struct osd_device *d = osd_obj2dev(osd_dt_obj(dt));
- struct page *page;
+ struct inode *inode = osd_dt_obj(dt)->oo_inode;
+ struct osd_device *d = osd_obj2dev(osd_dt_obj(dt));
+ struct page *page;
LASSERT(inode);
page = find_or_create_page(inode->i_mapping, offset >> PAGE_SHIFT,
- GFP_NOFS | __GFP_HIGHMEM);
+ gfp_mask);
+
if (unlikely(page == NULL))
lprocfs_counter_add(d->od_stats, LPROC_OSD_NO_PAGE, 1);
* \param pos byte offset of IO start
* \param len number of bytes of IO
* \param lnb array of extents undergoing IO
- * \param rw read or write operation?
+ * \param rw read or write operation, and other flags
* \param capa capabilities
*
* \retval pages (zero or more) loaded successfully
*/
static int osd_bufs_get(const struct lu_env *env, struct dt_object *dt,
loff_t pos, ssize_t len, struct niobuf_local *lnb,
- int rw)
+ enum dt_bufs_type rw)
{
- struct osd_object *obj = osd_dt_obj(dt);
+ struct osd_object *obj = osd_dt_obj(dt);
int npages, i, rc = 0;
+ gfp_t gfp_mask;
LASSERT(obj->oo_inode);
osd_map_remote_to_local(pos, len, &npages, lnb);
+ /* this could also try less hard for DT_BUFS_TYPE_READAHEAD pages */
+ gfp_mask = rw & DT_BUFS_TYPE_LOCAL ? (GFP_NOFS | __GFP_HIGHMEM) :
+ GFP_HIGHUSER;
for (i = 0; i < npages; i++, lnb++) {
- lnb->lnb_page = osd_get_page(dt, lnb->lnb_file_offset, rw);
+ lnb->lnb_page = osd_get_page(dt, lnb->lnb_file_offset,
+ gfp_mask);
if (lnb->lnb_page == NULL)
GOTO(cleanup, rc = -ENOMEM);
* \retval negative error number of failure
*/
static int osd_bufs_get_read(const struct lu_env *env, struct osd_object *obj,
- loff_t off, ssize_t len, struct niobuf_local *lnb)
+ loff_t off, ssize_t len, struct niobuf_local *lnb)
{
struct osd_device *osd = osd_obj2dev(obj);
unsigned long start = cfs_time_current();
}
static int osd_bufs_get_write(const struct lu_env *env, struct osd_object *obj,
- loff_t off, ssize_t len, struct niobuf_local *lnb)
+ loff_t off, ssize_t len, struct niobuf_local *lnb)
{
struct osd_device *osd = osd_obj2dev(obj);
int plen, off_in_block, sz_in_block;
static int osd_bufs_get(const struct lu_env *env, struct dt_object *dt,
loff_t offset, ssize_t len, struct niobuf_local *lnb,
- int rw)
+ enum dt_bufs_type rw)
{
struct osd_object *obj = osd_dt_obj(dt);
int rc;
LASSERT(dt_object_exists(dt));
LASSERT(obj->oo_dn);
- if (rw == 0)
- rc = osd_bufs_get_read(env, obj, offset, len, lnb);
- else
+ if (rw & DT_BUFS_TYPE_WRITE)
rc = osd_bufs_get_write(env, obj, offset, len, lnb);
+ else
+ rc = osd_bufs_get_read(env, obj, offset, len, lnb);
return rc;
}
RETURN(err_serious(-EPROTO));
if ((remote_nb[0].rnb_flags & OBD_BRW_MEMALLOC) &&
- (exp->exp_connection->c_peer.nid == exp->exp_connection->c_self))
+ ptlrpc_connection_is_local(exp->exp_connection))
memory_pressure_set();
req_capsule_set_size(&req->rq_pill, &RMF_RCS, RCL_SERVER,