#define page_cache_get(page) do { } while (0)
#define page_cache_release(page) do { } while (0)
+#define inc_zone_page_state(page, state) do {} while (0)
+#define dec_zone_page_state(page, state) do {} while (0)
+
/*
* Memory allocator
 * Inline functions, so utils can use them without linking libcfs
void ccc_inode_lsm_put(struct inode *inode, struct lov_stripe_md *lsm);
/**
- * Data structure managing a client's cached clean pages. An LRU of
- * pages is maintained, along with other statistics.
+ * Data structure managing a client's cached pages. It maintains a
+ * count of "unstable" pages alongside an LRU of clean pages.
+ * "Unstable" pages are pages that have been pinned by the ptlrpc
+ * layer for recovery purposes.
*/
struct cl_client_cache {
- cfs_atomic_t ccc_users; /* # of users (OSCs) of this data */
- cfs_list_t ccc_lru; /* LRU list of cached clean pages */
- spinlock_t ccc_lru_lock; /* lock for list */
- cfs_atomic_t ccc_lru_left; /* # of LRU entries available */
- unsigned long ccc_lru_max; /* Max # of LRU entries possible */
- unsigned int ccc_lru_shrinkers; /* # of threads reclaiming */
+ cfs_atomic_t ccc_users; /* # of users (OSCs) */
+ cfs_list_t ccc_lru; /* LRU of cached clean pages */
+ spinlock_t ccc_lru_lock; /* lock for list */
+ cfs_atomic_t ccc_lru_left; /* # of LRU entries available */
+ unsigned long ccc_lru_max; /* Max # of LRU entries */
+ unsigned int ccc_lru_shrinkers; /* # of threads shrinking */
+ cfs_atomic_t ccc_unstable_nr; /* # of pages pinned */
+ wait_queue_head_t ccc_unstable_waitq; /* Signaled on BRW commit */
};
#endif /*LCLIENT_H */
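The counter/waitqueue pair added above implements a simple drain protocol: bulk writers increment ccc_unstable_nr, the commit callback decrements it and signals ccc_unstable_waitq, and unmount sleeps until the count reaches zero. A minimal sketch of that protocol, reusing the patch's own names (the enclosing functions, locking, and error handling are elided):

	/* producer (osc_inc_unstable_pages): pages pinned for a bulk write */
	cfs_atomic_add(page_count, &cache->ccc_unstable_nr);

	/* consumer (osc_dec_unstable_pages): bulk RPC committed on server */
	cfs_atomic_sub(page_count, &cache->ccc_unstable_nr);
	wake_up_all(&cache->ccc_unstable_waitq);

	/* unmount (ll_put_super): wait for the count to drain to zero */
	struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
	l_wait_event(cache->ccc_unstable_waitq,
		     cfs_atomic_read(&cache->ccc_unstable_nr) == 0, &lwi);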
rq_no_retry_einprogress:1,
/* allow the req to be sent if the import is in recovery
* status */
- rq_allow_replay:1;
+ rq_allow_replay:1,
+ /* bulk request, sent to server, but uncommitted */
+ rq_unstable:1;
unsigned int rq_nr_resend;
cfs_proc_dir_entry_t *lov_pool_proc_entry;
enum lustre_sec_part lov_sp_me;
- /* Cached LRU pages from upper layer */
+ /* Cached LRU and unstable data from upper layer */
void *lov_cache;
struct rw_semaphore lov_notify_lock;
extern int at_extra;
extern unsigned int obd_sync_filter;
extern unsigned int obd_max_dirty_pages;
+extern cfs_atomic_t obd_unstable_pages;
extern cfs_atomic_t obd_dirty_pages;
extern cfs_atomic_t obd_dirty_transit_pages;
extern unsigned int obd_alloc_fail_rate;
struct lprocfs_stats *ll_stats; /* lprocfs stats counter */
+ /* Used to track "unstable" pages on a client, and to maintain
+ * an LRU list of clean pages. An "unstable" page is any page
+ * that has been sent to a server as part of a bulk request but
+ * is not yet committed to stable storage. */
struct cl_client_cache ll_cache;
struct lprocfs_stats *ll_ra_stats;
lru_page_max = (pages / 4) * 3;
}
- /* initialize lru data */
+ /* initialize ll_cache data */
cfs_atomic_set(&sbi->ll_cache.ccc_users, 0);
sbi->ll_cache.ccc_lru_max = lru_page_max;
cfs_atomic_set(&sbi->ll_cache.ccc_lru_left, lru_page_max);
spin_lock_init(&sbi->ll_cache.ccc_lru_lock);
CFS_INIT_LIST_HEAD(&sbi->ll_cache.ccc_lru);
+ cfs_atomic_set(&sbi->ll_cache.ccc_unstable_nr, 0);
+ init_waitqueue_head(&sbi->ll_cache.ccc_unstable_waitq);
+
sbi->ll_ra_info.ra_max_pages_per_file = min(pages / 32,
SBI_DEFAULT_READAHEAD_MAX);
sbi->ll_ra_info.ra_max_pages = sbi->ll_ra_info.ra_max_pages_per_file;
struct lustre_sb_info *lsi = s2lsi(sb);
struct ll_sb_info *sbi = ll_s2sbi(sb);
char *profilenm = get_profile_name(sb);
- int force = 1, next;
+ int ccc_count, next, force = 1, rc = 0;
ENTRY;
CDEBUG(D_VFSTRACE, "VFS Op: sb %p - %s\n", sb, profilenm);
force = obd->obd_force;
}
+ /* Wait for unstable pages to be committed to stable storage */
+ if (force == 0) {
+ struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
+ rc = l_wait_event(sbi->ll_cache.ccc_unstable_waitq,
+ cfs_atomic_read(&sbi->ll_cache.ccc_unstable_nr) == 0,
+ &lwi);
+ }
+
+ ccc_count = cfs_atomic_read(&sbi->ll_cache.ccc_unstable_nr);
+ if (force == 0 && rc != -EINTR)
+ LASSERTF(ccc_count == 0, "count: %i\n", ccc_count);
+
/* We need to set force before the lov_disconnect in
lustre_common_put_super, since l_d cleans up osc's as well. */
if (force) {
return rc;
}
+static int ll_rd_unstable_stats(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
+{
+ struct super_block *sb = data;
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+ struct cl_client_cache *cache = &sbi->ll_cache;
+ int pages, mb, rc;
+
+ pages = cfs_atomic_read(&cache->ccc_unstable_nr);
+ /* shift rather than multiply to avoid overflowing int */
+ mb = pages >> (20 - PAGE_CACHE_SHIFT);
+
+ rc = snprintf(page, count, "unstable_pages: %8d\n"
+ "unstable_mb: %8d\n", pages, mb);
+
+ return rc;
+}
+
static struct lprocfs_vars lprocfs_llite_obd_vars[] = {
{ "uuid", ll_rd_sb_uuid, 0, 0 },
//{ "mntpt_path", ll_rd_path, 0, 0 },
{ "max_easize", ll_rd_maxea_size, 0, 0 },
{ "sbi_flags", ll_rd_sbi_flags, 0, 0 },
{ "xattr_cache", ll_rd_xattr_cache, ll_wr_xattr_cache, 0 },
+ { "unstable_stats", ll_rd_unstable_stats, 0, 0},
{ 0 }
};
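Given the snprintf format above, reading unstable_stats (under the per-mount llite proc directory) should yield output of this shape; the values shown here are illustrative:

	unstable_pages:        0
	unstable_mb:           0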
#include <lprocfs_status.h>
#include <lustre_param.h>
#include <cl_object.h>
-#include <lclient.h> /* for cl_client_lru */
+#include <lclient.h>
#include <lustre/ll_fiemap.h>
#include <lustre_log.h>
#include <lustre_fid.h>
EXPORT_SYMBOL(obd_dump_on_eviction);
unsigned int obd_max_dirty_pages = 256;
EXPORT_SYMBOL(obd_max_dirty_pages);
+cfs_atomic_t obd_unstable_pages;
+EXPORT_SYMBOL(obd_unstable_pages);
cfs_atomic_t obd_dirty_pages;
EXPORT_SYMBOL(obd_dirty_pages);
unsigned int obd_timeout = OBD_TIMEOUT_DEFAULT; /* seconds */
#define OSC_DUMP_GRANT(lvl, cli, fmt, args...) do { \
struct client_obd *__tmp = (cli); \
CDEBUG(lvl, "%s: grant { dirty: %ld/%ld dirty_pages: %d/%d " \
- "dropped: %ld avail: %ld, reserved: %ld, flight: %d } " \
- "lru {in list: %d, left: %d, waiters: %d }" fmt, \
+ "unstable_pages: %d/%d dropped: %ld avail: %ld, " \
+ "reserved: %ld, flight: %d } lru {in list: %d, " \
+ "left: %d, waiters: %d }" fmt, \
__tmp->cl_import->imp_obd->obd_name, \
__tmp->cl_dirty, __tmp->cl_dirty_max, \
cfs_atomic_read(&obd_dirty_pages), obd_max_dirty_pages, \
+ cfs_atomic_read(&obd_unstable_pages), obd_max_dirty_pages, \
__tmp->cl_lost_grant, __tmp->cl_avail_grant, \
__tmp->cl_reserved_grant, __tmp->cl_w_in_flight, \
cfs_atomic_read(&__tmp->cl_lru_in_list), \
return 0;
if (cli->cl_dirty + PAGE_CACHE_SIZE <= cli->cl_dirty_max &&
- cfs_atomic_read(&obd_dirty_pages) + 1 <= obd_max_dirty_pages) {
+ cfs_atomic_read(&obd_unstable_pages) + 1 +
+ cfs_atomic_read(&obd_dirty_pages) <= obd_max_dirty_pages) {
osc_consume_write_grant(cli, &oap->oap_brw_page);
if (transient) {
cli->cl_dirty_transit += PAGE_CACHE_SIZE;
ocw->ocw_rc = -EDQUOT;
/* we can't dirty more */
if ((cli->cl_dirty + PAGE_CACHE_SIZE > cli->cl_dirty_max) ||
- (cfs_atomic_read(&obd_dirty_pages) + 1 >
- obd_max_dirty_pages)) {
+ (cfs_atomic_read(&obd_unstable_pages) + 1 +
+ cfs_atomic_read(&obd_dirty_pages) > obd_max_dirty_pages)) {
CDEBUG(D_CACHE, "no dirty room: dirty: %ld "
"osc max %ld, sys max %d\n", cli->cl_dirty,
cli->cl_dirty_max, obd_max_dirty_pages);
ar->ar_force_sync = 0;
}
+/* Performs "unstable" page accounting. This function balances the
+ * increments performed in osc_inc_unstable_pages. It is registered
+ * as the request's commit callback (rq_commit_cb) and runs once the
+ * bulk RPC has been committed on the server; at that point the pages
+ * involved in the transfer are no longer considered unstable. */
+void osc_dec_unstable_pages(struct ptlrpc_request *req)
+{
+ struct ptlrpc_bulk_desc *desc = req->rq_bulk;
+ struct client_obd *cli = &req->rq_import->imp_obd->u.cli;
+ obd_count page_count = desc->bd_iov_count;
+ int i;
+
+ /* No unstable page tracking */
+ if (cli->cl_cache == NULL)
+ return;
+
+ LASSERT(page_count >= 0);
+
+ for (i = 0; i < page_count; i++)
+ dec_zone_page_state(desc->bd_iov[i].kiov_page, NR_UNSTABLE_NFS);
+
+ cfs_atomic_sub(page_count, &cli->cl_cache->ccc_unstable_nr);
+ LASSERT(cfs_atomic_read(&cli->cl_cache->ccc_unstable_nr) >= 0);
+
+ cfs_atomic_sub(page_count, &obd_unstable_pages);
+ LASSERT(cfs_atomic_read(&obd_unstable_pages) >= 0);
+
+ spin_lock(&req->rq_lock);
+ req->rq_committed = 1;
+ req->rq_unstable = 0;
+ spin_unlock(&req->rq_lock);
+
+ wake_up_all(&cli->cl_cache->ccc_unstable_waitq);
+}
+
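A note on the zone page-state calls above: NR_UNSTABLE_NFS is the same per-zone counter NFS uses for its own unstable pages, so pages pinned here also feed the VM's writeback accounting and, on kernels of this vintage, should appear in /proc/meminfo, e.g.:

	NFS_Unstable:          0 kB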
+/* "unstable" page accounting. See: osc_dec_unstable_pages. */
+void osc_inc_unstable_pages(struct ptlrpc_request *req)
+{
+ struct ptlrpc_bulk_desc *desc = req->rq_bulk;
+ struct client_obd *cli = &req->rq_import->imp_obd->u.cli;
+ obd_count page_count = desc->bd_iov_count;
+ int i;
+
+ /* No unstable page tracking */
+ if (cli->cl_cache == NULL)
+ return;
+
+ LASSERT(page_count >= 0);
+
+ for (i = 0; i < page_count; i++)
+ inc_zone_page_state(desc->bd_iov[i].kiov_page, NR_UNSTABLE_NFS);
+
+ LASSERT(cfs_atomic_read(&cli->cl_cache->ccc_unstable_nr) >= 0);
+ cfs_atomic_add(page_count, &cli->cl_cache->ccc_unstable_nr);
+
+ LASSERT(cfs_atomic_read(&obd_unstable_pages) >= 0);
+ cfs_atomic_add(page_count, &obd_unstable_pages);
+
+ spin_lock(&req->rq_lock);
+
+ /* If the request has already been committed (i.e. brw_commit
+ * was called via rq_commit_cb), we need to undo the unstable
+ * page increments we just performed, because rq_commit_cb won't
+ * be called again. Otherwise, set the commit callback so the
+ * unstable page accounting is properly updated when the request
+ * is committed. */
+ if (req->rq_committed) {
+ /* Drop lock before calling osc_dec_unstable_pages */
+ spin_unlock(&req->rq_lock);
+ osc_dec_unstable_pages(req);
+ spin_lock(&req->rq_lock);
+ } else {
+ req->rq_unstable = 1;
+ req->rq_commit_cb = osc_dec_unstable_pages;
+ }
+
+ spin_unlock(&req->rq_lock);
+}
+
/* this must be called holding the loi list lock to give coverage to exit_cache,
* async_flag maintenance, and oap_request */
static void osc_ap_completion(const struct lu_env *env, struct client_obd *cli,
ENTRY;
if (oap->oap_request != NULL) {
+ if (rc == 0)
+ osc_inc_unstable_pages(oap->oap_request);
+
xid = ptlrpc_req_xid(oap->oap_request);
ptlrpc_req_finished(oap->oap_request);
oap->oap_request = NULL;
int osc_quotacheck(struct obd_device *unused, struct obd_export *exp,
struct obd_quotactl *oqctl);
int osc_quota_poll_check(struct obd_export *exp, struct if_quotacheck *qchk);
+
+void osc_inc_unstable_pages(struct ptlrpc_request *req);
+void osc_dec_unstable_pages(struct ptlrpc_request *req);
#endif /* OSC_INTERNAL_H */
CERROR("dirty %lu - %lu > dirty_max %lu\n",
cli->cl_dirty, cli->cl_dirty_transit, cli->cl_dirty_max);
oa->o_undirty = 0;
- } else if (unlikely(cfs_atomic_read(&obd_dirty_pages) -
+ } else if (unlikely(cfs_atomic_read(&obd_unstable_pages) +
+ cfs_atomic_read(&obd_dirty_pages) -
cfs_atomic_read(&obd_dirty_transit_pages) >
(long)(obd_max_dirty_pages + 1))) {
/* The cfs_atomic_read() allowing the cfs_atomic_inc() are
* not covered by a lock thus they may safely race and trip
* this CERROR() unless we add in a small fudge factor (+1). */
- CERROR("dirty %d - %d > system dirty_max %d\n",
+ CERROR("%s: dirty %d + %d - %d > system dirty_max %d\n",
+ cli->cl_import->imp_obd->obd_name,
+ cfs_atomic_read(&obd_unstable_pages),
cfs_atomic_read(&obd_dirty_pages),
cfs_atomic_read(&obd_dirty_transit_pages),
obd_max_dirty_pages);
aa->aa_resends++;
new_req->rq_interpret_reply = request->rq_interpret_reply;
new_req->rq_async_args = request->rq_async_args;
+ new_req->rq_commit_cb = request->rq_commit_cb;
/* cap resend delay to the current request timeout, this is similar to
* what ptlrpc does (see after_reply()) */
if (aa->aa_resends > new_req->rq_timeout)
RETURN(rc);
}
+static void brw_commit(struct ptlrpc_request *req)
+{
+ spin_lock(&req->rq_lock);
+ /* If osc_inc_unstable_pages (via osc_extent_finish) races with
+ * brw_commit (via rq_commit_cb), we need to ensure that
+ * osc_dec_unstable_pages is still called. Otherwise unstable
+ * pages may be leaked. */
+ if (req->rq_unstable) {
+ spin_unlock(&req->rq_lock);
+ osc_dec_unstable_pages(req);
+ spin_lock(&req->rq_lock);
+ } else {
+ req->rq_committed = 1;
+ }
+ spin_unlock(&req->rq_lock);
+}
+
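brw_commit and osc_inc_unstable_pages form a small handshake over rq_lock, and whichever side runs second performs the decrement. The two orderings that the rq_unstable/rq_committed flags resolve, sketched as comments:

	/* Ordering 1: osc_inc_unstable_pages runs before the commit callback.
	 *   inc:    rq_unstable = 1; rq_commit_cb = osc_dec_unstable_pages
	 *   commit: the decrement runs, either directly via the swapped-in
	 *           rq_commit_cb or via brw_commit seeing rq_unstable set
	 *
	 * Ordering 2: the commit callback (brw_commit) runs first.
	 *   commit: rq_unstable is not yet set, so only rq_committed = 1
	 *   inc:    sees rq_committed and immediately undoes its
	 *           increments by calling osc_dec_unstable_pages */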
/**
* Build an RPC by the list of extent @ext_list. The caller must ensure
* that the total pages in this list are NOT over max pages per RPC.
GOTO(out, rc);
}
+ req->rq_commit_cb = brw_commit;
req->rq_interpret_reply = brw_interpret;
+
if (mem_tight != 0)
req->rq_memalloc = 1;