Whamcloud - gitweb
LU-1666 obdclass: reduce lock contention on coh_page_guard
[fs/lustre-release.git] / lustre / obdclass / cl_page.c
index 0a466e9..ea160eb 100644 (file)
@@ -1,6 +1,4 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
+/*
  * GPL HEADER START
  *
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
@@ -28,6 +26,8 @@
 /*
  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Whamcloud, Inc.
  */
 /*
  * This file is part of Lustre, http://www.lustre.org/
@@ -39,9 +39,6 @@
  */
 
 #define DEBUG_SUBSYSTEM S_CLASS
-#ifndef EXPORT_SYMTAB
-# define EXPORT_SYMTAB
-#endif
 
 #include <libcfs/libcfs.h>
 #include <obd_class.h>
@@ -95,7 +92,7 @@ static struct lu_kmem_descr cl_page_caches[] = {
 
 /**
  * Internal version of cl_page_top, it should be called with page referenced,
- * or coh_page_guard held.
+ * or cp_lock held.
  */
 static struct cl_page *cl_page_top_trusted(struct cl_page *page)
 {
@@ -137,10 +134,8 @@ cl_page_at_trusted(const struct cl_page *page,
         const struct cl_page_slice *slice;
 
 #ifdef INVARIANT_CHECK
-        struct cl_object_header *ch = cl_object_header(page->cp_obj);
-
         if (!cfs_atomic_read(&page->cp_ref))
-                LASSERT_SPIN_LOCKED(&ch->coh_page_guard);
+                LASSERT_SPIN_LOCKED(&page->cp_lock);
 #endif
         ENTRY;
 
@@ -184,11 +179,12 @@ EXPORT_SYMBOL(cl_page_lookup);
  *
  * Gang tree lookup (radix_tree_gang_lookup()) optimization is absolutely
  * crucial in the face of [offset, EOF] locks.
+ *
+ * Return at least one page in @queue unless there is no covered page.
  */
-void cl_page_gang_lookup(const struct lu_env *env, struct cl_object *obj,
-                         struct cl_io *io, pgoff_t start, pgoff_t end,
-                         struct cl_page_list *queue, int nonblock,
-                         int *resched)
+int cl_page_gang_lookup(const struct lu_env *env, struct cl_object *obj,
+                        struct cl_io *io, pgoff_t start, pgoff_t end,
+                        cl_page_gang_cb_t cb, void *cbdata)
 {
         struct cl_object_header *hdr;
         struct cl_page          *page;
@@ -199,15 +195,10 @@ void cl_page_gang_lookup(const struct lu_env *env, struct cl_object *obj,
         unsigned int             nr;
         unsigned int             i;
         unsigned int             j;
-        int                    (*page_own)(const struct lu_env *env,
-                                           struct cl_io *io,
-                                           struct cl_page *pg);
+        int                      res = CLP_GANG_OKAY;
+        int                      tree_lock = 1;
         ENTRY;
 
-        if (resched != NULL)
-                *resched = 0;
-        page_own = nonblock ? cl_page_own_try : cl_page_own;
-
         idx = start;
         hdr = cl_object_header(obj);
         pvec = cl_env_info(env)->clt_pvec;
@@ -215,14 +206,17 @@ void cl_page_gang_lookup(const struct lu_env *env, struct cl_object *obj,
         cfs_spin_lock(&hdr->coh_page_guard);
         while ((nr = radix_tree_gang_lookup(&hdr->coh_tree, (void **)pvec,
                                             idx, CLT_PVEC_SIZE)) > 0) {
+                int end_of_region = 0;
                 idx = pvec[nr - 1]->cp_index + 1;
                 for (i = 0, j = 0; i < nr; ++i) {
                         page = pvec[i];
                         pvec[i] = NULL;
 
                         LASSERT(page->cp_type == CPT_CACHEABLE);
-                        if (page->cp_index > end)
+                        if (page->cp_index > end) {
+                                end_of_region = 1;
                                 break;
+                        }
                         if (page->cp_state == CPS_FREEING)
                                 continue;
 
@@ -243,7 +237,7 @@ void cl_page_gang_lookup(const struct lu_env *env, struct cl_object *obj,
                          */
                         cl_page_get_trust(page);
                         lu_ref_add_atomic(&page->cp_reference,
-                                          "page_list", cfs_current());
+                                          "gang_lookup", cfs_current());
                         pvec[j++] = page;
                 }
 
@@ -256,24 +250,30 @@ void cl_page_gang_lookup(const struct lu_env *env, struct cl_object *obj,
                  * error in the latter case).
                  */
                 cfs_spin_unlock(&hdr->coh_page_guard);
+                tree_lock = 0;
+
                 for (i = 0; i < j; ++i) {
                         page = pvec[i];
-                        if (page_own(env, io, page) == 0)
-                                cl_page_list_add(queue, page);
+                        if (res == CLP_GANG_OKAY)
+                                res = (*cb)(env, io, page, cbdata);
                         lu_ref_del(&page->cp_reference,
-                                   "page_list", cfs_current());
+                                   "gang_lookup", cfs_current());
                         cl_page_put(env, page);
                 }
-                cfs_spin_lock(&hdr->coh_page_guard);
-                if (nr < CLT_PVEC_SIZE)
+                if (nr < CLT_PVEC_SIZE || end_of_region)
                         break;
-                if (resched != NULL && cfs_need_resched()) {
-                        *resched = 1;
+
+                if (res == CLP_GANG_OKAY && cfs_need_resched())
+                        res = CLP_GANG_RESCHED;
+                if (res != CLP_GANG_OKAY)
                         break;
-                }
+
+                cfs_spin_lock(&hdr->coh_page_guard);
+                tree_lock = 1;
         }
-        cfs_spin_unlock(&hdr->coh_page_guard);
-        EXIT;
+        if (tree_lock)
+                cfs_spin_unlock(&hdr->coh_page_guard);
+        RETURN(res);
 }
 EXPORT_SYMBOL(cl_page_gang_lookup);
 
@@ -342,6 +342,7 @@ static int cl_page_alloc(const struct lu_env *env, struct cl_object *o,
                                                      "cl_page", page);
                 page->cp_index = ind;
                 cl_page_state_set_trust(page, CPS_CACHED);
+               cfs_spin_lock_init(&page->cp_lock);
                 page->cp_type = type;
                 CFS_INIT_LIST_HEAD(&page->cp_layers);
                 CFS_INIT_LIST_HEAD(&page->cp_batch);
@@ -401,7 +402,7 @@ static struct cl_page *cl_page_find0(const struct lu_env *env,
         struct cl_site          *site = cl_object_site(o);
         int err;
 
-        LINVRNT(type == CPT_CACHEABLE || type == CPT_TRANSIENT);
+        LASSERT(type == CPT_CACHEABLE || type == CPT_TRANSIENT);
         cfs_might_sleep();
 
         ENTRY;
@@ -413,6 +414,11 @@ static struct cl_page *cl_page_find0(const struct lu_env *env,
                idx, PFID(&hdr->coh_lu.loh_fid), vmpage, vmpage->private, type);
         /* fast path. */
         if (type == CPT_CACHEABLE) {
+               /* cl_page::cp_lock protects the page state and refcount,
+                * but an external lock is needed to protect the
+                * child/parent relationship, so the vmpage lock must be
+                * held for this purpose. */
+               KLASSERT(PageLocked(vmpage));
                 /*
                  * cl_vmpage_page() can be called here without any locks as
                  *
@@ -538,7 +544,7 @@ static inline int cl_page_invariant(const struct cl_page *pg)
                  * Either page is early in initialization (has neither child
                  * nor parent yet), or it is in the object radix tree.
                  */
-                ergo(pg->cp_state < CPS_FREEING,
+                ergo(pg->cp_state < CPS_FREEING && pg->cp_type == CPT_CACHEABLE,
                      (void *)radix_tree_lookup(&header->coh_tree,
                                                pg->cp_index) == pg ||
                      (child == NULL && parent == NULL));
@@ -615,7 +621,6 @@ static void cl_page_state_set0(const struct lu_env *env,
 static void cl_page_state_set(const struct lu_env *env,
                               struct cl_page *page, enum cl_page_state state)
 {
-        PINVRNT(env, page, cl_page_invariant(page));
         cl_page_state_set0(env, page, state);
 }
 
@@ -647,7 +652,6 @@ EXPORT_SYMBOL(cl_page_get);
  */
 void cl_page_put(const struct lu_env *env, struct cl_page *page)
 {
-        struct cl_object_header *hdr;
         struct cl_site *site = cl_object_site(page->cp_obj);
 
         PASSERT(env, page, cfs_atomic_read(&page->cp_ref) > !!page->cp_parent);
@@ -656,19 +660,18 @@ void cl_page_put(const struct lu_env *env, struct cl_page *page)
         CL_PAGE_HEADER(D_TRACE, env, page, "%d\n",
                        cfs_atomic_read(&page->cp_ref));
 
-        hdr = cl_object_header(cl_object_top(page->cp_obj));
-        if (cfs_atomic_dec_and_lock(&page->cp_ref, &hdr->coh_page_guard)) {
+        if (cfs_atomic_dec_and_lock(&page->cp_ref, &page->cp_lock)) {
                 cfs_atomic_dec(&site->cs_pages.cs_busy);
                 /* We're going to access the page w/o a reference, but it's
-                 * ok because we have grabbed the lock coh_page_guard, which
+                 * ok because we have grabbed the lock cp_lock, which
                  * means nobody is able to free this page behind us.
                  */
                 if (page->cp_state == CPS_FREEING) {
                         /* We drop the page reference and check the page state
-                         * inside the coh_page_guard. So that if it gets here,
+                         * inside the cp_lock. So that if it gets here,
                          * it is the REALLY last reference to this page.
                          */
-                        cfs_spin_unlock(&hdr->coh_page_guard);
+                        cfs_spin_unlock(&page->cp_lock);
 
                         LASSERT(cfs_atomic_read(&page->cp_ref) == 0);
                         PASSERT(env, page, page->cp_owner == NULL);
@@ -682,7 +685,7 @@ void cl_page_put(const struct lu_env *env, struct cl_page *page)
                         EXIT;
                         return;
                 }
-                cfs_spin_unlock(&hdr->coh_page_guard);
+                cfs_spin_unlock(&page->cp_lock);
         }
 
         EXIT;
@@ -717,8 +720,8 @@ EXPORT_SYMBOL(cl_page_vmpage);
  */
 struct cl_page *cl_vmpage_page(cfs_page_t *vmpage, struct cl_object *obj)
 {
-        struct cl_page *page;
-        struct cl_object_header *hdr;
+       struct cl_page *top;
+       struct cl_page *page;
 
         ENTRY;
         KLASSERT(PageLocked(vmpage));
@@ -733,16 +736,18 @@ struct cl_page *cl_vmpage_page(cfs_page_t *vmpage, struct cl_object *obj)
          * This loop assumes that ->private points to the top-most page. This
          * can be rectified easily.
          */
-        hdr = cl_object_header(cl_object_top(obj));
-        cfs_spin_lock(&hdr->coh_page_guard);
-        for (page = (void *)vmpage->private;
-             page != NULL; page = page->cp_child) {
+        top = (struct cl_page *)vmpage->private;
+       if (top == NULL)
+               RETURN(NULL);
+
+       cfs_spin_lock(&top->cp_lock);
+        for (page = top; page != NULL; page = page->cp_child) {
                 if (cl_object_same(page->cp_obj, obj)) {
                         cl_page_get_trust(page);
                         break;
                 }
         }
-        cfs_spin_unlock(&hdr->coh_page_guard);
+        cfs_spin_unlock(&top->cp_lock);
         LASSERT(ergo(page, page->cp_type == CPT_CACHEABLE));
         RETURN(page);
 }
@@ -960,7 +965,7 @@ static int cl_page_own0(const struct lu_env *env, struct cl_io *io,
         io = cl_io_top(io);
 
         if (pg->cp_state == CPS_FREEING) {
-                result = -EAGAIN;
+                result = -ENOENT;
         } else {
                 result = CL_PAGE_INVOKE(env, pg, CL_PAGE_OP(cpo_own),
                                         (const struct lu_env *,
@@ -977,7 +982,7 @@ static int cl_page_own0(const struct lu_env *env, struct cl_io *io,
                                 cl_page_state_set(env, pg, CPS_OWNED);
                         } else {
                                 cl_page_disown0(env, io, pg);
-                                result = -EAGAIN;
+                                result = -ENOENT;
                         }
                 }
         }
@@ -1022,16 +1027,14 @@ EXPORT_SYMBOL(cl_page_own_try);
 void cl_page_assume(const struct lu_env *env,
                     struct cl_io *io, struct cl_page *pg)
 {
-        PASSERT(env, pg, pg->cp_state < CPS_OWNED);
-        PASSERT(env, pg, pg->cp_owner == NULL);
         PINVRNT(env, pg, cl_object_same(pg->cp_obj, io->ci_obj));
-        PINVRNT(env, pg, cl_page_invariant(pg));
 
         ENTRY;
         pg = cl_page_top(pg);
         io = cl_io_top(io);
 
         cl_page_invoid(env, io, pg, CL_PAGE_OP(cpo_assume));
+        PASSERT(env, pg, pg->cp_owner == NULL);
         pg->cp_owner = io;
         pg->cp_task = current;
         cl_page_owner_set(pg);
@@ -1341,7 +1344,6 @@ void cl_page_completion(const struct lu_env *env,
         /* cl_page::cp_req already cleared by the caller (osc_completion()) */
         PASSERT(env, pg, pg->cp_req == NULL);
         PASSERT(env, pg, pg->cp_state == cl_req_type_state(crt));
-        PINVRNT(env, pg, cl_page_invariant(pg));
 
         ENTRY;
         CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, ioret);
@@ -1355,17 +1357,11 @@ void cl_page_completion(const struct lu_env *env,
                                (const struct lu_env *,
                                 const struct cl_page_slice *, int), ioret);
         if (anchor) {
+                LASSERT(cl_page_is_vmlocked(env, pg));
                 LASSERT(pg->cp_sync_io == anchor);
                 pg->cp_sync_io = NULL;
                 cl_sync_io_note(anchor, ioret);
         }
-
-        /* Don't assert the page writeback bit here because the lustre file
-         * may be as a backend of swap space. in this case, the page writeback
-         * is set by VM, and obvious we shouldn't clear it at all. Fortunately
-         * this type of pages are all TRANSIENT pages. */
-        KLASSERT(ergo(pg->cp_type == CPT_CACHEABLE,
-                      !PageWriteback(cl_page_vmpage(env, pg))));
         EXIT;
 }
 EXPORT_SYMBOL(cl_page_completion);
@@ -1407,32 +1403,61 @@ EXPORT_SYMBOL(cl_page_make_ready);
  * its queues.
  *
  * \pre  cl_page_is_owned(pg, io)
- * \post ergo(result == 0,
- *            pg->cp_state == CPS_PAGEIN || pg->cp_state == CPS_PAGEOUT)
+ * \post cl_page_is_owned(pg, io)
  *
  * \see cl_page_operations::cpo_cache_add()
  */
 int cl_page_cache_add(const struct lu_env *env, struct cl_io *io,
                       struct cl_page *pg, enum cl_req_type crt)
 {
-        int result;
+       const struct cl_page_slice *scan;
+       int result = 0;
 
-        PINVRNT(env, pg, crt < CRT_NR);
-        PINVRNT(env, pg, cl_page_is_owned(pg, io));
-        PINVRNT(env, pg, cl_page_invariant(pg));
+       PINVRNT(env, pg, crt < CRT_NR);
+       PINVRNT(env, pg, cl_page_is_owned(pg, io));
+       PINVRNT(env, pg, cl_page_invariant(pg));
 
-        ENTRY;
-        result = cl_page_invoke(env, io, pg, CL_PAGE_OP(io[crt].cpo_cache_add));
-        if (result == 0) {
-                cl_page_owner_clear(pg);
-                cl_page_state_set(env, pg, CPS_CACHED);
-        }
-        CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, result);
-        RETURN(result);
+       ENTRY;
+
+       cfs_list_for_each_entry(scan, &pg->cp_layers, cpl_linkage) {
+               if (scan->cpl_ops->io[crt].cpo_cache_add == NULL)
+                       continue;
+
+               result = scan->cpl_ops->io[crt].cpo_cache_add(env, scan, io);
+               if (result != 0)
+                       break;
+       }
+       CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, result);
+       RETURN(result);
 }
 EXPORT_SYMBOL(cl_page_cache_add);
 
 /**
+ * Called when a page is being written back at the kernel's request.
+ *
+ * \pre  cl_page_is_owned(pg, io)
+ * \post ergo(result == 0, pg->cp_state == CPS_PAGEOUT)
+ *
+ * \see cl_page_operations::cpo_flush()
+ */
+int cl_page_flush(const struct lu_env *env, struct cl_io *io,
+                 struct cl_page *pg)
+{
+       int result;
+
+       PINVRNT(env, pg, cl_page_is_owned(pg, io));
+       PINVRNT(env, pg, cl_page_invariant(pg));
+
+       ENTRY;
+
+       result = cl_page_invoke(env, io, pg, CL_PAGE_OP(cpo_flush));
+
+       CL_PAGE_HEADER(D_TRACE, env, pg, "%d\n", result);
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_page_flush);
+
+/**
  * Checks whether page is protected by any extent lock is at least required
  * mode.
  *
@@ -1456,6 +1481,16 @@ int cl_page_is_under_lock(const struct lu_env *env, struct cl_io *io,
 }
 EXPORT_SYMBOL(cl_page_is_under_lock);
 
+static int page_prune_cb(const struct lu_env *env, struct cl_io *io,
+                         struct cl_page *page, void *cbdata)
+{
+        cl_page_own(env, io, page);
+        cl_page_unmap(env, io, page);
+        cl_page_discard(env, io, page);
+        cl_page_disown(env, io, page);
+        return CLP_GANG_OKAY;
+}
+
 /**
  * Purges all cached pages belonging to the object \a obj.
  */
@@ -1464,13 +1499,10 @@ int cl_pages_prune(const struct lu_env *env, struct cl_object *clobj)
         struct cl_thread_info   *info;
         struct cl_object        *obj = cl_object_top(clobj);
         struct cl_io            *io;
-        struct cl_page_list     *plist;
-        int                      resched;
         int                      result;
 
         ENTRY;
         info  = cl_env_info(env);
-        plist = &info->clt_list;
         io    = &info->clt_io;
 
         /*
@@ -1478,6 +1510,7 @@ int cl_pages_prune(const struct lu_env *env, struct cl_object *clobj)
          * function, we just make cl_page_list functions happy. -jay
          */
         io->ci_obj = obj;
+       io->ci_ignore_layout = 1;
         result = cl_io_init(env, io, CIT_MISC, obj);
         if (result != 0) {
                 cl_io_fini(env, io);
@@ -1485,21 +1518,11 @@ int cl_pages_prune(const struct lu_env *env, struct cl_object *clobj)
         }
 
         do {
-                cl_page_list_init(plist);
-                cl_page_gang_lookup(env, obj, io, 0, CL_PAGE_EOF, plist, 0,
-                                    &resched);
-                /*
-                 * Since we're purging the pages of an object, we don't care
-                 * the possible outcomes of the following functions.
-                 */
-                cl_page_list_unmap(env, io, plist);
-                cl_page_list_discard(env, io, plist);
-                cl_page_list_disown(env, io, plist);
-                cl_page_list_fini(env, plist);
-
-                if (resched)
+                result = cl_page_gang_lookup(env, obj, io, 0, CL_PAGE_EOF,
+                                             page_prune_cb, NULL);
+                if (result == CLP_GANG_RESCHED)
                         cfs_cond_resched();
-        } while (resched);
+        } while (result != CLP_GANG_OKAY);
 
         cl_io_fini(env, io);
         RETURN(result);