From b0ab95d6133e783acacc6329c025d17fb282775e Mon Sep 17 00:00:00 2001 From: Andreas Dilger Date: Fri, 30 Jun 2017 14:37:07 -0600 Subject: [PATCH] LU-9728 osd: use GFP_HIGHUSER for non-local IO When the obdfilter code was split into separate OFD and OSD modules, the bulk IO page allocation was implemented to use GFP_NOFS to avoid allocations recursing into the filesystem and causing deadlocks. However, such a deadlock is only possible if the RPC is coming from a local client, as we might end up waiting on a page sent in the request we're serving. Local RPCs also use __GFP_HIGHMEM so that the pages can use all of the available memory on the OSS on 32-bit machines. It is possible to use more aggressive GFP_HIGHUSER flags for non-local clients to be able to generate more memory pressure on the OSS and allow inactive pages to be reclaimed, since the OSS doesn't have any other processes or allocations that generate memory reclaim pressure. See also b=17576 (bdf50dc9) and b=19529 (3dcf18d3) for details. The patch also implements an LNet function to determine if a client NID is local or not. This check becomes more complex in the LNet Multi-Rail world and it is really LNet's job to handle NIDs, not that of Lustre. 
Signed-off-by: Andreas Dilger Change-Id: I2806c9c5c2fe269669eafdafaf2310924c3ebbe5 Reviewed-on: https://review.whamcloud.com/27908 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Dmitry Eremin Reviewed-by: Patrick Farrell Reviewed-by: Oleg Drokin --- lnet/include/lnet/api.h | 1 + lnet/lnet/api-ni.c | 29 +++++++++++++++++++++++++++++ lustre/include/dt_object.h | 11 +++++++++-- lustre/include/lustre_net.h | 24 ++++++++++++++++++++++++ lustre/ofd/ofd_dev.c | 22 +++++++++++++--------- lustre/ofd/ofd_io.c | 26 ++++++++++++++++---------- lustre/osd-ldiskfs/osd_io.c | 25 ++++++++++++++++--------- lustre/osd-zfs/osd_io.c | 12 ++++++------ lustre/target/tgt_handler.c | 2 +- 9 files changed, 115 insertions(+), 37 deletions(-) diff --git a/lnet/include/lnet/api.h b/lnet/include/lnet/api.h index 4fa2f66..84c6bd0 100644 --- a/lnet/include/lnet/api.h +++ b/lnet/include/lnet/api.h @@ -78,6 +78,7 @@ int LNetNIFini(void); int LNetGetId(unsigned int index, struct lnet_process_id *id); int LNetDist(lnet_nid_t nid, lnet_nid_t *srcnid, __u32 *order); lnet_nid_t LNetPrimaryNID(lnet_nid_t nid); +bool LNetIsPeerLocal(lnet_nid_t nid); /** @} lnet_addr */ diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c index 4341d99..1a5aa34 100644 --- a/lnet/lnet/api-ni.c +++ b/lnet/lnet/api-ni.c @@ -2935,6 +2935,35 @@ void LNetDebugPeer(struct lnet_process_id id) EXPORT_SYMBOL(LNetDebugPeer); /** + * Determine if the specified peer \a nid is on the local node. + * + * \param nid peer nid to check + * + * \retval true If peer NID is on the local node. + * \retval false If peer NID is not on the local node. 
+ */ +bool LNetIsPeerLocal(lnet_nid_t nid) +{ + struct lnet_net *net; + struct lnet_ni *ni; + int cpt; + + cpt = lnet_net_lock_current(); + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + if (ni->ni_nid == nid) { + lnet_net_unlock(cpt); + return true; + } + } + } + lnet_net_unlock(cpt); + + return false; +} +EXPORT_SYMBOL(LNetIsPeerLocal); + +/** * Retrieve the struct lnet_process_id ID of LNet interface at \a index. * Note that all interfaces share a same PID, as requested by LNetNIInit(). * diff --git a/lustre/include/dt_object.h b/lustre/include/dt_object.h index c79cc02..436139f 100644 --- a/lustre/include/dt_object.h +++ b/lustre/include/dt_object.h @@ -1060,6 +1060,13 @@ struct dt_object_operations { const struct lu_buf *buf, struct thandle *th); }; +enum dt_bufs_type { + DT_BUFS_TYPE_READ = 0x0000, + DT_BUFS_TYPE_WRITE = 0x0001, + DT_BUFS_TYPE_READAHEAD = 0x0002, + DT_BUFS_TYPE_LOCAL = 0x0004, +}; + /** * Per-dt-object operations on "file body" - unstructure raw data. */ @@ -1177,7 +1184,7 @@ struct dt_body_operations { loff_t pos, ssize_t len, struct niobuf_local *lb, - int rw); + enum dt_bufs_type rw); /** * Release reference granted by ->dbo_bufs_get(). @@ -2379,7 +2386,7 @@ static inline int dt_ref_del(const struct lu_env *env, static inline int dt_bufs_get(const struct lu_env *env, struct dt_object *d, struct niobuf_remote *rnb, - struct niobuf_local *lnb, int rw) + struct niobuf_local *lnb, enum dt_bufs_type rw) { LASSERT(d); LASSERT(d->do_body_ops); diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h index f705ceb..db61399 100644 --- a/lustre/include/lustre_net.h +++ b/lustre/include/lustre_net.h @@ -2019,6 +2019,30 @@ int ptlrpc_connection_init(void); void ptlrpc_connection_fini(void); extern lnet_pid_t ptl_get_pid(void); +/* + * Check if the peer connection is on the local node. 
We need to use GFP_NOFS + * for requests from a local client to avoid recursing into the filesystem + * as we might end up waiting on a page sent in the request we're serving. + * + * Use __GFP_HIGHMEM so that the pages can use all of the available memory + * on 32-bit machines. Use more aggressive GFP_HIGHUSER flags from non-local + * clients to be able to generate more memory pressure on the OSS and allow + * inactive pages to be reclaimed, since it doesn't have any other processes + * or allocations that generate memory reclaim pressure. + * + * See b=17576 (bdf50dc9) and b=19529 (3dcf18d3) for details. + */ +static inline bool ptlrpc_connection_is_local(struct ptlrpc_connection *conn) +{ + if (!conn) + return false; + + if (conn->c_peer.nid == conn->c_self) + return true; + + RETURN(LNetIsPeerLocal(conn->c_peer.nid)); +} + /* ptlrpc/niobuf.c */ /** * Actual interfacing with LNet to put/get/register/unregister stuff diff --git a/lustre/ofd/ofd_dev.c b/lustre/ofd/ofd_dev.c index c3d578e..6879a92 100644 --- a/lustre/ofd/ofd_dev.c +++ b/lustre/ofd/ofd_dev.c @@ -2116,13 +2116,13 @@ out: static int ofd_ladvise_prefetch(const struct lu_env *env, struct ofd_object *fo, struct niobuf_local *lnb, - __u64 start, __u64 end) + __u64 start, __u64 end, enum dt_bufs_type dbt) { - struct ofd_thread_info *info = ofd_info(env); - pgoff_t start_index, end_index, pages; - struct niobuf_remote rnb; - unsigned long nr_local; - int rc = 0; + struct ofd_thread_info *info = ofd_info(env); + pgoff_t start_index, end_index, pages; + struct niobuf_remote rnb; + unsigned long nr_local; + int rc = 0; if (end <= start) RETURN(-EINVAL); @@ -2150,7 +2150,7 @@ static int ofd_ladvise_prefetch(const struct lu_env *env, PTLRPC_MAX_BRW_PAGES; rnb.rnb_offset = start_index << PAGE_SHIFT; rnb.rnb_len = nr_local << PAGE_SHIFT; - rc = dt_bufs_get(env, ofd_object_child(fo), &rnb, lnb, 0); + rc = dt_bufs_get(env, ofd_object_child(fo), &rnb, lnb, dbt); if (unlikely(rc < 0)) break; nr_local = rc; @@ -2188,7 
+2188,7 @@ static int ofd_ladvise_hdl(struct tgt_session_info *tsi) struct ptlrpc_thread *svc_thread = req->rq_svc_thread; const struct lu_env *env = svc_thread->t_env; struct tgt_thread_big_cache *tbc = svc_thread->t_data; - int rc = 0; + enum dt_bufs_type dbt = DT_BUFS_TYPE_READAHEAD; struct lu_ladvise *ladvise; int num_advise; struct ladvise_hdr *ladvise_hdr; @@ -2199,6 +2199,7 @@ static int ofd_ladvise_hdl(struct tgt_session_info *tsi) struct dt_object *dob; __u64 start; __u64 end; + int rc = 0; ENTRY; CFS_FAIL_TIMEOUT(OBD_FAIL_OST_LADVISE_PAUSE, cfs_fail_val); @@ -2247,6 +2248,9 @@ static int ofd_ladvise_hdl(struct tgt_session_info *tsi) LASSERT(fo != NULL); dob = ofd_object_child(fo); + if (ptlrpc_connection_is_local(exp->exp_connection)) + dbt |= DT_BUFS_TYPE_LOCAL; + for (i = 0; i < num_advise; i++, ladvise++) { start = ladvise->lla_start; end = ladvise->lla_end; @@ -2274,7 +2278,7 @@ static int ofd_ladvise_hdl(struct tgt_session_info *tsi) req->rq_status = ofd_ladvise_prefetch(env, fo, tbc->local, - start, end); + start, end, dbt); tgt_extent_unlock(&lockh, LCK_PR); break; case LU_LADVISE_DONTNEED: diff --git a/lustre/ofd/ofd_io.c b/lustre/ofd/ofd_io.c index 5ccdca5..5313655 100644 --- a/lustre/ofd/ofd_io.c +++ b/lustre/ofd/ofd_io.c @@ -453,8 +453,9 @@ static int ofd_preprw_read(const struct lu_env *env, struct obd_export *exp, struct niobuf_remote *rnb, int *nr_local, struct niobuf_local *lnb, char *jobid) { - struct ofd_object *fo; - int i, j, rc, tot_bytes = 0; + struct ofd_object *fo; + int i, j, rc, tot_bytes = 0; + enum dt_bufs_type dbt = DT_BUFS_TYPE_READ; ENTRY; LASSERT(env != NULL); @@ -474,10 +475,12 @@ static int ofd_preprw_read(const struct lu_env *env, struct obd_export *exp, GOTO(unlock, rc); } - *nr_local = 0; - for (i = 0, j = 0; i < niocount; i++) { + if (ptlrpc_connection_is_local(exp->exp_connection)) + dbt |= DT_BUFS_TYPE_LOCAL; + + for (*nr_local = 0, i = 0, j = 0; i < niocount; i++) { rc = dt_bufs_get(env, ofd_object_child(fo), rnb + 
i, - lnb + j, 0); + lnb + j, dbt); if (unlikely(rc < 0)) GOTO(buf_put, rc); LASSERT(rc <= PTLRPC_MAX_BRW_PAGES); @@ -538,8 +541,9 @@ static int ofd_preprw_write(const struct lu_env *env, struct obd_export *exp, struct niobuf_remote *rnb, int *nr_local, struct niobuf_local *lnb, char *jobid) { - struct ofd_object *fo; - int i, j, k, rc = 0, tot_bytes = 0; + struct ofd_object *fo; + int i, j, k, rc = 0, tot_bytes = 0; + enum dt_bufs_type dbt = DT_BUFS_TYPE_WRITE; ENTRY; LASSERT(env != NULL); @@ -628,11 +632,13 @@ static int ofd_preprw_write(const struct lu_env *env, struct obd_export *exp, * space back if possible */ tgt_grant_prepare_write(env, exp, oa, rnb, obj->ioo_bufcnt); + if (ptlrpc_connection_is_local(exp->exp_connection)) + dbt |= DT_BUFS_TYPE_LOCAL; + /* parse remote buffers to local buffers and prepare the latter */ - *nr_local = 0; - for (i = 0, j = 0; i < obj->ioo_bufcnt; i++) { + for (*nr_local = 0, i = 0, j = 0; i < obj->ioo_bufcnt; i++) { rc = dt_bufs_get(env, ofd_object_child(fo), - rnb + i, lnb + j, 1); + rnb + i, lnb + j, dbt); if (unlikely(rc < 0)) GOTO(err, rc); LASSERT(rc <= PTLRPC_MAX_BRW_PAGES); diff --git a/lustre/osd-ldiskfs/osd_io.c b/lustre/osd-ldiskfs/osd_io.c index c3545a4..56e5231 100644 --- a/lustre/osd-ldiskfs/osd_io.c +++ b/lustre/osd-ldiskfs/osd_io.c @@ -421,16 +421,18 @@ static int osd_map_remote_to_local(loff_t offset, ssize_t len, int *nrpages, RETURN(0); } -static struct page *osd_get_page(struct dt_object *dt, loff_t offset, int rw) +static struct page *osd_get_page(struct dt_object *dt, loff_t offset, + gfp_t gfp_mask) { - struct inode *inode = osd_dt_obj(dt)->oo_inode; - struct osd_device *d = osd_obj2dev(osd_dt_obj(dt)); - struct page *page; + struct inode *inode = osd_dt_obj(dt)->oo_inode; + struct osd_device *d = osd_obj2dev(osd_dt_obj(dt)); + struct page *page; LASSERT(inode); page = find_or_create_page(inode->i_mapping, offset >> PAGE_SHIFT, - GFP_NOFS | __GFP_HIGHMEM); + gfp_mask); + if (unlikely(page == NULL)) 
lprocfs_counter_add(d->od_stats, LPROC_OSD_NO_PAGE, 1); @@ -504,7 +506,7 @@ static int osd_bufs_put(const struct lu_env *env, struct dt_object *dt, * \param pos byte offset of IO start * \param len number of bytes of IO * \param lnb array of extents undergoing IO - * \param rw read or write operation? + * \param rw read or write operation, and other flags * \param capa capabilities * * \retval pages (zero or more) loaded successfully @@ -512,17 +514,22 @@ static int osd_bufs_put(const struct lu_env *env, struct dt_object *dt, */ static int osd_bufs_get(const struct lu_env *env, struct dt_object *dt, loff_t pos, ssize_t len, struct niobuf_local *lnb, - int rw) + enum dt_bufs_type rw) { - struct osd_object *obj = osd_dt_obj(dt); + struct osd_object *obj = osd_dt_obj(dt); int npages, i, rc = 0; + gfp_t gfp_mask; LASSERT(obj->oo_inode); osd_map_remote_to_local(pos, len, &npages, lnb); + /* this could also try less hard for DT_BUFS_TYPE_READAHEAD pages */ + gfp_mask = rw & DT_BUFS_TYPE_LOCAL ? 
(GFP_NOFS | __GFP_HIGHMEM) : + GFP_HIGHUSER; for (i = 0; i < npages; i++, lnb++) { - lnb->lnb_page = osd_get_page(dt, lnb->lnb_file_offset, rw); + lnb->lnb_page = osd_get_page(dt, lnb->lnb_file_offset, + gfp_mask); if (lnb->lnb_page == NULL) GOTO(cleanup, rc = -ENOMEM); diff --git a/lustre/osd-zfs/osd_io.c b/lustre/osd-zfs/osd_io.c index 082d7dd..41e6ee7 100644 --- a/lustre/osd-zfs/osd_io.c +++ b/lustre/osd-zfs/osd_io.c @@ -313,7 +313,7 @@ static inline struct page *kmem_to_page(void *addr) * \retval negative error number of failure */ static int osd_bufs_get_read(const struct lu_env *env, struct osd_object *obj, - loff_t off, ssize_t len, struct niobuf_local *lnb) + loff_t off, ssize_t len, struct niobuf_local *lnb) { struct osd_device *osd = osd_obj2dev(obj); unsigned long start = cfs_time_current(); @@ -420,7 +420,7 @@ static inline arc_buf_t *osd_request_arcbuf(dnode_t *dn, size_t bs) } static int osd_bufs_get_write(const struct lu_env *env, struct osd_object *obj, - loff_t off, ssize_t len, struct niobuf_local *lnb) + loff_t off, ssize_t len, struct niobuf_local *lnb) { struct osd_device *osd = osd_obj2dev(obj); int plen, off_in_block, sz_in_block; @@ -525,7 +525,7 @@ out_err: static int osd_bufs_get(const struct lu_env *env, struct dt_object *dt, loff_t offset, ssize_t len, struct niobuf_local *lnb, - int rw) + enum dt_bufs_type rw) { struct osd_object *obj = osd_dt_obj(dt); int rc; @@ -533,10 +533,10 @@ static int osd_bufs_get(const struct lu_env *env, struct dt_object *dt, LASSERT(dt_object_exists(dt)); LASSERT(obj->oo_dn); - if (rw == 0) - rc = osd_bufs_get_read(env, obj, offset, len, lnb); - else + if (rw & DT_BUFS_TYPE_WRITE) rc = osd_bufs_get_write(env, obj, offset, len, lnb); + else + rc = osd_bufs_get_read(env, obj, offset, len, lnb); return rc; } diff --git a/lustre/target/tgt_handler.c b/lustre/target/tgt_handler.c index 9cd2b1c..d630d85 100644 --- a/lustre/target/tgt_handler.c +++ b/lustre/target/tgt_handler.c @@ -2186,7 +2186,7 @@ int 
tgt_brw_write(struct tgt_session_info *tsi) RETURN(err_serious(-EPROTO)); if ((remote_nb[0].rnb_flags & OBD_BRW_MEMALLOC) && - (exp->exp_connection->c_peer.nid == exp->exp_connection->c_self)) + ptlrpc_connection_is_local(exp->exp_connection)) memory_pressure_set(); req_capsule_set_size(&req->rq_pill, &RMF_RCS, RCL_SERVER, -- 1.8.3.1