From: dzogin Date: Wed, 8 Jul 2009 06:01:49 +0000 (+0000) Subject: Branch HEAD X-Git-Tag: v1_9_220~45 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=3dcf18d3;ds=sidebyside Branch HEAD b=19529 i=andrew.perepechko i=adilger Description: Avoid deadlock for local client writes Details : Use new OBD_BRW_MEMALLOC flag to notify OST about writes in the memory freeing context. This allows OST threads to set the PF_MEMALLOC flag on task structures in order to allocate memory from reserved pools and complete IO. Use GFP_HIGHUSER for OST allocations for non-local client writes, so that the OST threads generate memory pressure and allow inactive pages to be reclaimed --- diff --git a/libcfs/include/libcfs/linux/linux-mem.h b/libcfs/include/libcfs/linux/linux-mem.h index 19dec15..8fb2184 100644 --- a/libcfs/include/libcfs/linux/linux-mem.h +++ b/libcfs/include/libcfs/linux/linux-mem.h @@ -110,6 +110,10 @@ extern void __cfs_free_pages(cfs_page_t *page, unsigned int order); #define __cfs_free_page(page) __cfs_free_pages(page, 0) #define cfs_free_page(p) __free_pages(p, 0) +#define libcfs_memory_pressure_get() (current->flags & PF_MEMALLOC) +#define libcfs_memory_pressure_set() do { current->flags |= PF_MEMALLOC; } while (0) +#define libcfs_memory_pressure_clr() do { current->flags &= ~PF_MEMALLOC; } while (0) + /* * In Linux there is no way to determine whether current execution context is * blockable. diff --git a/lustre/ChangeLog b/lustre/ChangeLog index dbfe209..1c2fd25 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -14,9 +14,20 @@ tbd Sun Microsystems, Inc. * File join has been disabled in this release, refer to Bugzilla 16929. Severity : normal +Bugzilla : 19529 +Description: Avoid deadlock for local client writes +Details : Use new OBD_BRW_MEMALLOC flag to notify OST about writes in the + memory freeing context. 
This allows OST threads to set the + PF_MEMALLOC flag on task structures in order to allocate memory + from reserved pools and complete IO. + Use GFP_HIGHUSER for OST allocations for non-local client writes, + so that the OST threads generate memory pressure and allow + inactive pages to be reclaimed. + +Severity : normal Frequency : rare Bugzilla : 18380 -Descriptoin: lock ordering violation between &cli->cl_sem and _lprocfs_lock +Description: lock ordering violation between &cli->cl_sem and _lprocfs_lock Details : move ldlm namespace creation in setup phase to avoid grab _lprocfs_lock with cli_sem held. diff --git a/lustre/include/liblustre.h b/lustre/include/liblustre.h index d758fb1..5d357c7 100644 --- a/lustre/include/liblustre.h +++ b/lustre/include/liblustre.h @@ -355,6 +355,10 @@ int test_and_clear_bit(int nr, unsigned long *addr) return oldbit; } +#define libcfs_memory_pressure_get() (0) +#define libcfs_memory_pressure_set() do {} while (0) +#define libcfs_memory_pressure_clr() do {} while (0) + /* FIXME sys/capability will finally included linux/fs.h thus * cause numerous trouble on x86-64. 
as temporary solution for * build broken at Cray, we copy definition we need from capability.h diff --git a/lustre/include/lustre/lustre_idl.h b/lustre/include/lustre/lustre_idl.h index 3bad9a1..d11173f 100644 --- a/lustre/include/lustre/lustre_idl.h +++ b/lustre/include/lustre/lustre_idl.h @@ -1088,6 +1088,7 @@ extern void lustre_swab_obd_statfs (struct obd_statfs *os); #define OBD_BRW_NOCACHE 0x80 /* this page is a part of non-cached IO */ #define OBD_BRW_NOQUOTA 0x100 #define OBD_BRW_SRVLOCK 0x200 /* Client holds no lock over this page */ +#define OBD_BRW_MEMALLOC 0x800 /* Client runs in the "kswapd" context */ #define OBD_OBJECT_EOF 0xffffffffffffffffULL diff --git a/lustre/obdfilter/filter_io.c b/lustre/obdfilter/filter_io.c index bfd7092..ea2721e 100644 --- a/lustre/obdfilter/filter_io.c +++ b/lustre/obdfilter/filter_io.c @@ -264,20 +264,26 @@ long filter_grant(struct obd_export *exp, obd_size current_grant, /* * the routine is used to request pages from pagecache * - * use GFP_NOFS not allowing to enter FS as the client can run on this node - * and we might end waiting on a page he sent in the request we're serving. - * + * use GFP_NOFS for requests from a local client not allowing to enter FS + * as we might end up waiting on a page he sent in the request we're serving. * use __GFP_HIGHMEM so that the pages can use all of the available memory * on 32-bit machines + * use more aggressive GFP_HIGHUSER flags from non-local clients to be able to + * generate more memory pressure, but at the same time use __GFP_NOMEMALLOC + * in order not to exhaust emergency reserves. + * + * See Bug 19529 and Bug 19917 for details. */ static struct page *filter_get_page(struct obd_device *obd, struct inode *inode, - obd_off offset) + obd_off offset, + int localreq) { struct page *page; page = find_or_create_page(inode->i_mapping, offset >> CFS_PAGE_SHIFT, - GFP_NOFS | __GFP_HIGHMEM); + (localreq ? 
(GFP_NOFS | __GFP_HIGHMEM) + : (GFP_HIGHUSER | __GFP_NOMEMALLOC))); if (unlikely(page == NULL)) lprocfs_counter_add(obd->obd_stats, LPROC_FILTER_NO_PAGE, 1); @@ -459,7 +465,7 @@ static int filter_preprw_read(int cmd, struct obd_export *exp, struct obdo *oa, * so it's easy to detect later. */ break; - lnb->page = filter_get_page(obd, inode, lnb->offset); + lnb->page = filter_get_page(obd, inode, lnb->offset, 0); if (lnb->page == NULL) GOTO(cleanup, rc = -ENOMEM); @@ -673,7 +679,7 @@ static int filter_preprw_write(int cmd, struct obd_export *exp, struct obdo *oa, void *iobuf; obd_size left; unsigned long now = jiffies, timediff; - int rc = 0, i, tot_bytes = 0, cleanup_phase = 0; + int rc = 0, i, tot_bytes = 0, cleanup_phase = 0, localreq = 0; ENTRY; LASSERT(objcount == 1); LASSERT(obj->ioo_bufcnt > 0); @@ -683,6 +689,9 @@ static int filter_preprw_write(int cmd, struct obd_export *exp, struct obdo *oa, if (rc) RETURN(rc); + if (exp->exp_connection->c_peer.nid == exp->exp_connection->c_self) + localreq = 1; + push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); iobuf = filter_iobuf_get(&obd->u.filter, oti); if (IS_ERR(iobuf)) @@ -768,7 +777,8 @@ static int filter_preprw_write(int cmd, struct obd_export *exp, struct obdo *oa, * needs to keep the pages all aligned properly. 
*/ lnb->dentry = dentry; - lnb->page = filter_get_page(obd, dentry->d_inode, lnb->offset); + lnb->page = filter_get_page(obd, dentry->d_inode, lnb->offset, + localreq); if (lnb->page == NULL) GOTO(cleanup, rc = -ENOMEM); diff --git a/lustre/osc/osc_page.c b/lustre/osc/osc_page.c index 72d0301..29d119c 100644 --- a/lustre/osc/osc_page.c +++ b/lustre/osc/osc_page.c @@ -541,6 +541,9 @@ void osc_io_submit_page(const struct lu_env *env, oap->oap_page_off = opg->ops_from; oap->oap_count = opg->ops_to - opg->ops_from; + /* Give a hint to OST that requests are coming from kswapd - bug19529 */ + if (libcfs_memory_pressure_get()) + oap->oap_brw_flags |= OBD_BRW_MEMALLOC; oap->oap_brw_flags |= OBD_BRW_SYNC; if (osc_io_srvlock(oio)) oap->oap_brw_flags |= OBD_BRW_SRVLOCK; diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index 31cb35e4..e6bbf9a 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -2897,6 +2897,9 @@ int osc_queue_async_io(const struct lu_env *env, oap->oap_page_off = off; oap->oap_count = count; oap->oap_brw_flags = brw_flags; + /* Give a hint to OST that requests are coming from kswapd - bug19529 */ + if (libcfs_memory_pressure_get()) + oap->oap_brw_flags |= OBD_BRW_MEMALLOC; oap->oap_async_flags = async_flags; if (cmd & OBD_BRW_WRITE) { diff --git a/lustre/ost/ost_handler.c b/lustre/ost/ost_handler.c index 2796d9f..faf551d 100644 --- a/lustre/ost/ost_handler.c +++ b/lustre/ost/ost_handler.c @@ -903,6 +903,10 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti) body = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF, sizeof(*body)); LASSERT(body != NULL); + if ((body->oa.o_flags & OBD_BRW_MEMALLOC) && + (exp->exp_connection->c_peer.nid == exp->exp_connection->c_self)) + libcfs_memory_pressure_set(); + objcount = lustre_msg_buflen(req->rq_reqmsg, REQ_REC_OFF + 1) / sizeof(*ioo); ioo = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF + 1, @@ -1166,6 +1170,7 @@ out: exp->exp_connection->c_remote_uuid.uuid, 
libcfs_id2str(req->rq_peer)); } + libcfs_memory_pressure_clr(); RETURN(rc); }