Whamcloud - gitweb
LU-6356 ptlrpc: do not sleep if encpool reached max capacity 70/15070/4
author: Sebastien Buisson <sebastien.buisson@bull.net>
Fri, 29 May 2015 13:20:41 +0000 (15:20 +0200)
committer: Oleg Drokin <oleg.drokin@intel.com>
Wed, 26 Aug 2015 21:42:12 +0000 (21:42 +0000)
When using krb5p Kerberos flavor, RPCs are encrypted just before being
sent. This encryption requires allocating memory in the encoding pool.
The current implementation in sptlrpc_enc_pool_get_pages() is
deadlock-prone. Indeed, if there are no more free pages in the pool,
all ptlrpcd threads can end up waiting in a queue, so there is no
thread available to process other requests. This means the client is not
able to process replies from servers, even though those replies contain
the last committed transno needed to release memory allocated by previous
requests, including enc_pool pages.

To fix this, in sptlrpc_enc_pool_get_pages(), do not make ptlrpcd
threads wait in the queue if the encoding pool has already reached its
maximum capacity. Instead, return -ENOMEM. If functions calling
ptl_send_rpc() get -ENOMEM, put the request back in the queue by moving
it back to the RQ_PHASE_NEW phase.
As an optimization, do not call ptl_send_rpc() again for requests that
already failed to allocate in the enc_pool, as long as there is not
enough memory in the enc_pool to satisfy their needs.

In /proc/fs/lustre/sptlrpc/encrypt_page_pools, add a new 'out of mem'
stat to track how many requests fail to allocate memory in the
enc_pool.

Signed-off-by: Sebastien Buisson <sebastien.buisson@bull.net>
Change-Id: Ie4217bb15c8514d28d360e50a9be7716b52f5147
Reviewed-on: http://review.whamcloud.com/15070
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Dmitry Eremin <dmitry.eremin@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lustre/include/lustre_sec.h
lustre/ptlrpc/client.c
lustre/ptlrpc/niobuf.c
lustre/ptlrpc/sec_bulk.c

index f2096f9..db2b553 100644 (file)
@@ -1129,6 +1129,8 @@ int sptlrpc_enc_pool_add_user(void);
 int sptlrpc_enc_pool_del_user(void);
 int  sptlrpc_enc_pool_get_pages(struct ptlrpc_bulk_desc *desc);
 void sptlrpc_enc_pool_put_pages(struct ptlrpc_bulk_desc *desc);
+int get_free_pages_in_pool(void);
+int pool_is_at_full_capacity(void);
 
 int sptlrpc_cli_wrap_bulk(struct ptlrpc_request *req,
                           struct ptlrpc_bulk_desc *desc);
index a1b15ef..370c145 100644 (file)
@@ -1478,6 +1478,13 @@ static int ptlrpc_send_new_req(struct ptlrpc_request *req)
         ENTRY;
 
         LASSERT(req->rq_phase == RQ_PHASE_NEW);
+
+       /* do not try to go further if there is not enough memory in enc_pool */
+       if (req->rq_sent && req->rq_bulk != NULL)
+               if (req->rq_bulk->bd_iov_count > get_free_pages_in_pool() &&
+                   pool_is_at_full_capacity())
+                       RETURN(-ENOMEM);
+
         if (req->rq_sent && (req->rq_sent > cfs_time_current_sec()) &&
             (!req->rq_generation_set ||
              req->rq_import_generation == imp->imp_generation))
@@ -1568,6 +1575,16 @@ static int ptlrpc_send_new_req(struct ptlrpc_request *req)
               lustre_msg_get_opc(req->rq_reqmsg));
 
         rc = ptl_send_rpc(req, 0);
+       if (rc == -ENOMEM) {
+               spin_lock(&imp->imp_lock);
+               if (!list_empty(&req->rq_list)) {
+                       list_del_init(&req->rq_list);
+                       atomic_dec(&req->rq_import->imp_inflight);
+               }
+               spin_unlock(&imp->imp_lock);
+               ptlrpc_rqphase_move(req, RQ_PHASE_NEW);
+               RETURN(rc);
+       }
         if (rc) {
                 DEBUG_REQ(D_HA, req, "send failed (%d); expect timeout", rc);
                spin_lock(&req->rq_lock);
@@ -1837,6 +1854,14 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set)
                                }
 
                                rc = ptl_send_rpc(req, 0);
+                               if (rc == -ENOMEM) {
+                                       spin_lock(&imp->imp_lock);
+                                       if (!list_empty(&req->rq_list))
+                                               list_del_init(&req->rq_list);
+                                       spin_unlock(&imp->imp_lock);
+                                       ptlrpc_rqphase_move(req, RQ_PHASE_NEW);
+                                       continue;
+                               }
                                if (rc) {
                                        DEBUG_REQ(D_HA, req,
                                                  "send failed: rc = %d", rc);
index c279ab5..7a96e2f 100644 (file)
@@ -730,9 +730,13 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
         if (request->rq_memalloc)
                 mpflag = cfs_memory_pressure_get_and_set();
 
-        rc = sptlrpc_cli_wrap_request(request);
-        if (rc)
-                GOTO(out, rc);
+       rc = sptlrpc_cli_wrap_request(request);
+       if (rc == -ENOMEM)
+               /* set rq_sent so that this request is treated
+                * as a delayed send in the upper layers */
+               request->rq_sent = cfs_time_current_sec();
+       if (rc)
+               GOTO(out, rc);
 
         /* bulk register should be done after wrap_request() */
         if (request->rq_bulk != NULL) {
index f6f9138..8067bd1 100644 (file)
@@ -119,6 +119,7 @@ static struct ptlrpc_enc_page_pool {
         unsigned long    epp_st_lowfree;        /* lowest free pages reached */
         unsigned int     epp_st_max_wqlen;      /* highest waitqueue length */
         cfs_time_t       epp_st_max_wait;       /* in jeffies */
+       unsigned long    epp_st_outofmem;       /* # of out of mem requests */
        /*
         * pointers to pools
         */
@@ -160,6 +161,7 @@ int sptlrpc_proc_enc_pool_seq_show(struct seq_file *m, void *v)
                       "low free mark:           %lu\n"
                       "max waitqueue depth:     %u\n"
                      "max wait time:           "CFS_TIME_T"/%lu\n"
+                     "out of mem:             %lu\n"
                       ,
                      totalram_pages,
                       PAGES_PER_POOL,
@@ -179,7 +181,8 @@ int sptlrpc_proc_enc_pool_seq_show(struct seq_file *m, void *v)
                      page_pools.epp_st_lowfree,
                      page_pools.epp_st_max_wqlen,
                      page_pools.epp_st_max_wait,
-                     msecs_to_jiffies(MSEC_PER_SEC)
+                     msecs_to_jiffies(MSEC_PER_SEC),
+                     page_pools.epp_st_outofmem
                     );
 
        spin_unlock(&page_pools.epp_lock);
@@ -526,6 +529,24 @@ static int enc_pools_should_grow(int page_needed, long now)
 }
 
 /*
+ * Export the number of free pages in the pool
+ */
+int get_free_pages_in_pool(void)
+{
+       return page_pools.epp_free_pages;
+}
+EXPORT_SYMBOL(get_free_pages_in_pool);
+
+/*
+ * Let outside world know if enc_pool full capacity is reached
+ */
+int pool_is_at_full_capacity(void)
+{
+       return (page_pools.epp_total_pages == page_pools.epp_max_pages);
+}
+EXPORT_SYMBOL(pool_is_at_full_capacity);
+
+/*
  * we allocate the requested pages atomically.
  */
 int sptlrpc_enc_pool_get_pages(struct ptlrpc_bulk_desc *desc)
@@ -574,21 +595,37 @@ again:
 
                        enc_pools_wakeup();
                } else {
-                       if (++page_pools.epp_waitqlen >
-                           page_pools.epp_st_max_wqlen)
-                               page_pools.epp_st_max_wqlen =
-                                               page_pools.epp_waitqlen;
-
-                       set_current_state(TASK_UNINTERRUPTIBLE);
-                       init_waitqueue_entry(&waitlink, current);
-                       add_wait_queue(&page_pools.epp_waitq, &waitlink);
-
-                       spin_unlock(&page_pools.epp_lock);
-                       schedule();
-                       remove_wait_queue(&page_pools.epp_waitq, &waitlink);
-                       LASSERT(page_pools.epp_waitqlen > 0);
-                       spin_lock(&page_pools.epp_lock);
-                       page_pools.epp_waitqlen--;
+                       if (page_pools.epp_growing) {
+                               if (++page_pools.epp_waitqlen >
+                                   page_pools.epp_st_max_wqlen)
+                                       page_pools.epp_st_max_wqlen =
+                                                       page_pools.epp_waitqlen;
+
+                               set_current_state(TASK_UNINTERRUPTIBLE);
+                               init_waitqueue_entry(&waitlink, current);
+                               add_wait_queue(&page_pools.epp_waitq,
+                                              &waitlink);
+
+                               spin_unlock(&page_pools.epp_lock);
+                               schedule();
+                               remove_wait_queue(&page_pools.epp_waitq,
+                                                 &waitlink);
+                               LASSERT(page_pools.epp_waitqlen > 0);
+                               spin_lock(&page_pools.epp_lock);
+                               page_pools.epp_waitqlen--;
+                       } else {
+                               /* ptlrpcd thread should not sleep in that case,
+                                * or deadlock may occur!
+                                * Instead, return -ENOMEM so that upper layers
+                                * will put request back in queue. */
+                               page_pools.epp_st_outofmem++;
+                               spin_unlock(&page_pools.epp_lock);
+                               OBD_FREE(GET_ENC_KIOV(desc),
+                                        desc->bd_iov_count *
+                                               sizeof(*GET_ENC_KIOV(desc)));
+                               GET_ENC_KIOV(desc) = NULL;
+                               return -ENOMEM;
+                       }
                }
 
                LASSERT(page_pools.epp_pages_short >= desc->bd_iov_count);
@@ -778,6 +815,7 @@ int sptlrpc_enc_pool_init(void)
         page_pools.epp_st_lowfree = 0;
         page_pools.epp_st_max_wqlen = 0;
         page_pools.epp_st_max_wait = 0;
+       page_pools.epp_st_outofmem = 0;
 
         enc_pools_alloc();
         if (page_pools.epp_pools == NULL)
@@ -812,13 +850,14 @@ void sptlrpc_enc_pool_fini(void)
                CDEBUG(D_SEC,
                       "max pages %lu, grows %u, grow fails %u, shrinks %u, "
                       "access %lu, missing %lu, max qlen %u, max wait "
-                      CFS_TIME_T"/%lu\n",
+                      CFS_TIME_T"/%lu, out of mem %lu\n",
                       page_pools.epp_st_max_pages, page_pools.epp_st_grows,
                       page_pools.epp_st_grow_fails,
                       page_pools.epp_st_shrinks, page_pools.epp_st_access,
                       page_pools.epp_st_missings, page_pools.epp_st_max_wqlen,
                       page_pools.epp_st_max_wait,
-                      msecs_to_jiffies(MSEC_PER_SEC));
+                      msecs_to_jiffies(MSEC_PER_SEC),
+                      page_pools.epp_st_outofmem);
        }
 }