Whamcloud - gitweb
LU-8135 osc: limits the number of chunks in write RPC
[fs/lustre-release.git] / lustre / osc / osc_cache.c
index 2b15cb8..590f9f9 100644 (file)
@@ -130,9 +130,9 @@ static const char *oes_strings[] = {
                /* ----- part 4 ----- */                                      \
                ## __VA_ARGS__);                                              \
        if (lvl == D_ERROR && __ext->oe_dlmlock != NULL)                      \
-               LDLM_ERROR(__ext->oe_dlmlock, "extent: %p\n", __ext);         \
+               LDLM_ERROR(__ext->oe_dlmlock, "extent: %p", __ext);           \
        else                                                                  \
-               LDLM_DEBUG(__ext->oe_dlmlock, "extent: %p\n", __ext);         \
+               LDLM_DEBUG(__ext->oe_dlmlock, "extent: %p", __ext);           \
 } while (0)
 
 #undef EASSERTF
@@ -322,7 +322,7 @@ static struct osc_extent *osc_extent_alloc(struct osc_object *obj)
 {
        struct osc_extent *ext;
 
-       OBD_SLAB_ALLOC_PTR_GFP(ext, osc_extent_kmem, GFP_IOFS);
+       OBD_SLAB_ALLOC_PTR_GFP(ext, osc_extent_kmem, GFP_NOFS);
        if (ext == NULL)
                return NULL;
 
@@ -523,7 +523,7 @@ static int osc_extent_merge(const struct lu_env *env, struct osc_extent *cur,
                return -ERANGE;
 
        LASSERT(cur->oe_dlmlock == victim->oe_dlmlock);
-       ppc_bits = osc_cli(obj)->cl_chunkbits - PAGE_CACHE_SHIFT;
+       ppc_bits = osc_cli(obj)->cl_chunkbits - PAGE_SHIFT;
        chunk_start = cur->oe_start >> ppc_bits;
        chunk_end   = cur->oe_end   >> ppc_bits;
        if (chunk_start   != (victim->oe_end >> ppc_bits) + 1 &&
@@ -647,15 +647,20 @@ static struct osc_extent *osc_extent_find(const struct lu_env *env,
        descr = &olck->ols_cl.cls_lock->cll_descr;
        LASSERT(descr->cld_mode >= CLM_WRITE);
 
-       LASSERT(cli->cl_chunkbits >= PAGE_CACHE_SHIFT);
-       ppc_bits   = cli->cl_chunkbits - PAGE_CACHE_SHIFT;
+       LASSERTF(cli->cl_chunkbits >= PAGE_SHIFT,
+                "chunkbits: %u\n", cli->cl_chunkbits);
+       ppc_bits   = cli->cl_chunkbits - PAGE_SHIFT;
        chunk_mask = ~((1 << ppc_bits) - 1);
        chunksize  = 1 << cli->cl_chunkbits;
        chunk      = index >> ppc_bits;
 
-       /* align end to rpc edge, rpc size may not be a power 2 integer. */
+       /* align end to RPC edge. */
        max_pages = cli->cl_max_pages_per_rpc;
-       LASSERT((max_pages & ~chunk_mask) == 0);
+       if ((max_pages & ~chunk_mask) != 0) {
+               CERROR("max_pages: %#x chunkbits: %u chunk_mask: %#lx\n",
+                      max_pages, cli->cl_chunkbits, chunk_mask);
+               RETURN(ERR_PTR(-EINVAL));
+       }
        max_end = index - (index % max_pages) + max_pages - 1;
        max_end = min_t(pgoff_t, max_end, descr->cld_end);
 
@@ -876,8 +881,8 @@ int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext,
 
        if (!sent) {
                lost_grant = ext->oe_grants;
-       } else if (blocksize < PAGE_CACHE_SIZE &&
-                  last_count != PAGE_CACHE_SIZE) {
+       } else if (blocksize < PAGE_SIZE &&
+                  last_count != PAGE_SIZE) {
                /* For short writes we shouldn't count parts of pages that
                 * span a whole chunk on the OST side, or our accounting goes
                 * wrong.  Should match the code in filter_grant_check. */
@@ -887,7 +892,7 @@ int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext,
                if (end)
                        count += blocksize - end;
 
-               lost_grant = PAGE_CACHE_SIZE - count;
+               lost_grant = PAGE_SIZE - count;
        }
        if (ext->oe_grants > 0)
                osc_free_grant(cli, nr_pages, lost_grant, ext->oe_grants);
@@ -961,7 +966,6 @@ static int osc_extent_wait(const struct lu_env *env, struct osc_extent *ext,
 static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index,
                                bool partial)
 {
-       struct cl_env_nest     nest;
        struct lu_env         *env;
        struct cl_io          *io;
        struct osc_object     *obj = ext->oe_obj;
@@ -970,11 +974,12 @@ static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index,
        struct osc_async_page *tmp;
        int                    pages_in_chunk = 0;
        int                    ppc_bits    = cli->cl_chunkbits -
-                                            PAGE_CACHE_SHIFT;
+                                            PAGE_SHIFT;
        __u64                  trunc_chunk = trunc_index >> ppc_bits;
        int                    grants   = 0;
        int                    nr_pages = 0;
        int                    rc       = 0;
+       __u16                  refcheck;
        ENTRY;
 
        LASSERT(sanity_check(ext) == 0);
@@ -984,9 +989,10 @@ static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index,
        /* Request new lu_env.
         * We can't use that env from osc_cache_truncate_start() because
         * it's from lov_io_sub and not fully initialized. */
-       env = cl_env_nested_get(&nest);
+       env = cl_env_get(&refcheck);
        io  = &osc_env_info(env)->oti_io;
        io->ci_obj = cl_object_top(osc2cl(obj));
+       io->ci_ignore_layout = 1;
        rc = cl_io_init(env, io, CIT_MISC, io->ci_obj);
        if (rc < 0)
                GOTO(out, rc);
@@ -1069,7 +1075,7 @@ static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index,
 
 out:
        cl_io_fini(env, io);
-       cl_env_nested_put(&nest, env);
+       cl_env_put(env, &refcheck);
        RETURN(rc);
 }
 
@@ -1126,7 +1132,7 @@ static int osc_extent_make_ready(const struct lu_env *env,
        if (!(last->oap_async_flags & ASYNC_COUNT_STABLE)) {
                int last_oap_count = osc_refresh_count(env, last, OBD_BRW_WRITE);
                LASSERT(last_oap_count > 0);
-               LASSERT(last->oap_page_off + last_oap_count <= PAGE_CACHE_SIZE);
+               LASSERT(last->oap_page_off + last_oap_count <= PAGE_SIZE);
                last->oap_count = last_oap_count;
                spin_lock(&last->oap_lock);
                last->oap_async_flags |= ASYNC_COUNT_STABLE;
@@ -1137,7 +1143,7 @@ static int osc_extent_make_ready(const struct lu_env *env,
         * because it's known they are not the last page */
        list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) {
                if (!(oap->oap_async_flags & ASYNC_COUNT_STABLE)) {
-                       oap->oap_count = PAGE_CACHE_SIZE - oap->oap_page_off;
+                       oap->oap_count = PAGE_SIZE - oap->oap_page_off;
                        spin_lock(&oap->oap_lock);
                        oap->oap_async_flags |= ASYNC_COUNT_STABLE;
                        spin_unlock(&oap->oap_lock);
@@ -1164,7 +1170,7 @@ static int osc_extent_expand(struct osc_extent *ext, pgoff_t index,
        struct osc_object *obj = ext->oe_obj;
        struct client_obd *cli = osc_cli(obj);
        struct osc_extent *next;
-       int ppc_bits = cli->cl_chunkbits - PAGE_CACHE_SHIFT;
+       int ppc_bits = cli->cl_chunkbits - PAGE_SHIFT;
        pgoff_t chunk = index >> ppc_bits;
        pgoff_t end_chunk;
        pgoff_t end_index;
@@ -1301,9 +1307,9 @@ static int osc_refresh_count(const struct lu_env *env,
                return 0;
        else if (cl_offset(obj, index + 1) > kms)
                /* catch sub-page write at end of file */
-               return kms % PAGE_CACHE_SIZE;
+               return kms % PAGE_SIZE;
        else
-               return PAGE_CACHE_SIZE;
+               return PAGE_SIZE;
 }
 
 static int osc_completion(const struct lu_env *env, struct osc_async_page *oap,
@@ -1382,7 +1388,7 @@ static void osc_consume_write_grant(struct client_obd *cli,
        cli->cl_dirty_pages++;
        pga->flag |= OBD_BRW_FROM_GRANT;
        CDEBUG(D_CACHE, "using %lu grant credits for brw %p page %p\n",
-              PAGE_CACHE_SIZE, pga, pga->pg);
+              PAGE_SIZE, pga, pga->pg);
        osc_update_next_shrink(cli);
 }
 
@@ -1463,7 +1469,7 @@ static void osc_unreserve_grant(struct client_obd *cli,
  * used, we should return these grants to OST. There're two cases where grants
  * can be lost:
  * 1. truncate;
- * 2. blocksize at OST is less than PAGE_CACHE_SIZE and a partial page was
+ * 2. blocksize at OST is less than PAGE_SIZE and a partial page was
  *    written. In this case OST may use less chunks to serve this partial
  *    write. OSTs don't actually know the page size on the client side. so
  *    clients have to calculate lost grant by the blocksize on the OST.
@@ -1491,7 +1497,7 @@ static void osc_free_grant(struct client_obd *cli, unsigned int nr_pages,
        spin_unlock(&cli->cl_loi_list_lock);
        CDEBUG(D_CACHE, "lost %u grant: %lu avail: %lu dirty: %lu/%lu\n",
               lost_grant, cli->cl_lost_grant,
-              cli->cl_avail_grant, cli->cl_dirty_pages << PAGE_CACHE_SHIFT,
+              cli->cl_avail_grant, cli->cl_dirty_pages << PAGE_SHIFT,
               cli->cl_dirty_grant);
 }
 
@@ -1877,6 +1883,21 @@ static void osc_ap_completion(const struct lu_env *env, struct client_obd *cli,
        EXIT;
 }
 
+struct extent_rpc_data {        /* state while packing extents into one RPC */
+       struct list_head        *erd_rpc_list;  /* extents chosen for RPC */
+       unsigned int            erd_page_count; /* pages added so far */
+       unsigned int            erd_max_pages;  /* page cap, >= oe_mppr */
+       unsigned int            erd_max_chunks; /* chunks still allowed */
+};
+
+static inline unsigned osc_extent_chunks(const struct osc_extent *ext)
+{
+       struct client_obd *cli = osc_cli(ext->oe_obj);
+       unsigned ppc_bits = cli->cl_chunkbits - PAGE_SHIFT;
+       /* Count chunks from oe_start's chunk to oe_end's chunk, inclusive. */
+       return (ext->oe_end >> ppc_bits) - (ext->oe_start >> ppc_bits) + 1;
+}
+
 /**
  * Try to add extent to one RPC. We need to think about the following things:
  * - # of pages must not be over max_pages_per_rpc
@@ -1884,10 +1905,10 @@ static void osc_ap_completion(const struct lu_env *env, struct client_obd *cli,
  */
 static int try_to_add_extent_for_io(struct client_obd *cli,
                                    struct osc_extent *ext,
-                                   struct list_head *rpclist,
-                                   unsigned int *pc, unsigned int *max_pages)
+                                   struct extent_rpc_data *data)
 {
        struct osc_extent *tmp;
+       unsigned int chunk_count;
        struct osc_async_page *oap = list_first_entry(&ext->oe_pages,
                                                      struct osc_async_page,
                                                      oap_pending_item);
@@ -1896,11 +1917,15 @@ static int try_to_add_extent_for_io(struct client_obd *cli,
        EASSERT((ext->oe_state == OES_CACHE || ext->oe_state == OES_LOCK_DONE),
                ext);
 
-       *max_pages = max(ext->oe_mppr, *max_pages);
-       if (*pc + ext->oe_nr_pages > *max_pages)
+       chunk_count = osc_extent_chunks(ext);
+       if (chunk_count > data->erd_max_chunks)
                RETURN(0);
 
-       list_for_each_entry(tmp, rpclist, oe_link) {
+       data->erd_max_pages = max(ext->oe_mppr, data->erd_max_pages);
+       if (data->erd_page_count + ext->oe_nr_pages > data->erd_max_pages)
+               RETURN(0);
+
+       list_for_each_entry(tmp, data->erd_rpc_list, oe_link) {
                struct osc_async_page *oap2;
                oap2 = list_first_entry(&tmp->oe_pages, struct osc_async_page,
                                        oap_pending_item);
@@ -1912,8 +1937,8 @@ static int try_to_add_extent_for_io(struct client_obd *cli,
                }
 #endif
                if (oap2cl_page(oap)->cp_type != oap2cl_page(oap2)->cp_type) {
-                       CDEBUG(D_CACHE, "Do not permit different type of IO"
-                                       " for a same RPC\n");
+                       CDEBUG(D_CACHE, "Do not permit different types of IO "
+                              "in one RPC\n");
                        RETURN(0);
                }
 
@@ -1926,12 +1951,41 @@ static int try_to_add_extent_for_io(struct client_obd *cli,
                break;
        }
 
-       *pc += ext->oe_nr_pages;
-       list_move_tail(&ext->oe_link, rpclist);
+       data->erd_max_chunks -= chunk_count;
+       data->erd_page_count += ext->oe_nr_pages;
+       list_move_tail(&ext->oe_link, data->erd_rpc_list);
        ext->oe_owner = current;
        RETURN(1);
 }
 
+static inline unsigned osc_max_write_chunks(const struct client_obd *cli)
+{
+       /*
+        * LU-8135:
+        *
+        * The maximum size of a single transaction is about 64MB in ZFS.
+        * #define DMU_MAX_ACCESS (64 * 1024 * 1024)
+        *
+        * Since ZFS is a copy-on-write file system, a single dirty page in
+        * a chunk results in the rewrite of the whole chunk. An RPC must
+        * therefore not contain too many chunks, otherwise the transaction
+        * could grow much bigger than 64MB, especially with a big ZFS
+        * block size.
+        *
+        * This function makes sure that OSC won't send write RPCs with too
+        * many chunks. The total size of the chunks one RPC may cover is
+        * capped at PTLRPC_MAX_BRW_SIZE, which is defined as 16MB. Ideally
+        * the OST should tell the client what the biggest transaction size
+        * is, but this is good enough for now.
+        *
+        * The ZFS limitation doesn't apply to ldiskfs, which allows as many
+        * chunks in one RPC as we want. However, there is no benefit in
+        * having too many discontiguous pages in one RPC, so the result is
+        * also capped at 256 chunks per RPC.
+        */
+       return min(PTLRPC_MAX_BRW_SIZE >> cli->cl_chunkbits, 256);
+}
+
 /**
  * In order to prevent multiple ptlrpcd from breaking contiguous extents,
  * get_write_extent() takes all appropriate extents in atomic.
@@ -1950,28 +2004,30 @@ static unsigned int get_write_extents(struct osc_object *obj,
 {
        struct client_obd *cli = osc_cli(obj);
        struct osc_extent *ext;
-       unsigned int page_count = 0;
-       unsigned int max_pages = cli->cl_max_pages_per_rpc;
+       struct extent_rpc_data data = {
+               .erd_rpc_list   = rpclist,
+               .erd_page_count = 0,
+               .erd_max_pages  = cli->cl_max_pages_per_rpc,
+               .erd_max_chunks = osc_max_write_chunks(cli),
+       };
 
        LASSERT(osc_object_is_locked(obj));
        while (!list_empty(&obj->oo_hp_exts)) {
                ext = list_entry(obj->oo_hp_exts.next, struct osc_extent,
                                 oe_link);
                LASSERT(ext->oe_state == OES_CACHE);
-               if (!try_to_add_extent_for_io(cli, ext, rpclist, &page_count,
-                                             &max_pages))
-                       return page_count;
-               EASSERT(ext->oe_nr_pages <= max_pages, ext);
+               if (!try_to_add_extent_for_io(cli, ext, &data))
+                       return data.erd_page_count;
+               EASSERT(ext->oe_nr_pages <= data.erd_max_pages, ext);
        }
-       if (page_count == max_pages)
-               return page_count;
+       if (data.erd_page_count == data.erd_max_pages)
+               return data.erd_page_count;
 
        while (!list_empty(&obj->oo_urgent_exts)) {
                ext = list_entry(obj->oo_urgent_exts.next,
                                 struct osc_extent, oe_link);
-               if (!try_to_add_extent_for_io(cli, ext, rpclist, &page_count,
-                                             &max_pages))
-                       return page_count;
+               if (!try_to_add_extent_for_io(cli, ext, &data))
+                       return data.erd_page_count;
 
                if (!ext->oe_intree)
                        continue;
@@ -1982,13 +2038,12 @@ static unsigned int get_write_extents(struct osc_object *obj,
                             ext->oe_owner != NULL))
                                continue;
 
-                       if (!try_to_add_extent_for_io(cli, ext, rpclist,
-                                                     &page_count, &max_pages))
-                               return page_count;
+                       if (!try_to_add_extent_for_io(cli, ext, &data))
+                               return data.erd_page_count;
                }
        }
-       if (page_count == max_pages)
-               return page_count;
+       if (data.erd_page_count == data.erd_max_pages)
+               return data.erd_page_count;
 
        ext = first_extent(obj);
        while (ext != NULL) {
@@ -1999,13 +2054,12 @@ static unsigned int get_write_extents(struct osc_object *obj,
                        continue;
                }
 
-               if (!try_to_add_extent_for_io(cli, ext, rpclist, &page_count,
-                                             &max_pages))
-                       return page_count;
+               if (!try_to_add_extent_for_io(cli, ext, &data))
+                       return data.erd_page_count;
 
                ext = next_extent(ext);
        }
-       return page_count;
+       return data.erd_page_count;
 }
 
 static int
@@ -2090,24 +2144,26 @@ __must_hold(osc)
        struct osc_extent *ext;
        struct osc_extent *next;
        struct list_head rpclist = LIST_HEAD_INIT(rpclist);
-       unsigned int page_count = 0;
-       unsigned int max_pages = cli->cl_max_pages_per_rpc;
+       struct extent_rpc_data data = {
+               .erd_rpc_list   = &rpclist,
+               .erd_page_count = 0,
+               .erd_max_pages  = cli->cl_max_pages_per_rpc,
+               .erd_max_chunks = UINT_MAX,
+       };
        int rc = 0;
        ENTRY;
 
        LASSERT(osc_object_is_locked(osc));
-       list_for_each_entry_safe(ext, next,
-                                    &osc->oo_reading_exts, oe_link) {
+       list_for_each_entry_safe(ext, next, &osc->oo_reading_exts, oe_link) {
                EASSERT(ext->oe_state == OES_LOCK_DONE, ext);
-               if (!try_to_add_extent_for_io(cli, ext, &rpclist, &page_count,
-                                             &max_pages))
+               if (!try_to_add_extent_for_io(cli, ext, &data))
                        break;
                osc_extent_state_set(ext, OES_RPC);
-               EASSERT(ext->oe_nr_pages <= max_pages, ext);
+               EASSERT(ext->oe_nr_pages <= data.erd_max_pages, ext);
        }
-       LASSERT(page_count <= max_pages);
+       LASSERT(data.erd_page_count <= data.erd_max_pages);
 
-       osc_update_pending(osc, OBD_BRW_READ, -page_count);
+       osc_update_pending(osc, OBD_BRW_READ, -data.erd_page_count);
 
        if (!list_empty(&rpclist)) {
                osc_object_unlock(osc);
@@ -2281,14 +2337,14 @@ int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops,
        oap->oap_obj_off = offset;
        LASSERT(!(offset & ~PAGE_MASK));
 
-       if (!client_is_remote(exp) && cfs_capable(CFS_CAP_SYS_RESOURCE))
+       if (cfs_capable(CFS_CAP_SYS_RESOURCE))
                oap->oap_brw_flags = OBD_BRW_NOQUOTA;
 
        INIT_LIST_HEAD(&oap->oap_pending_item);
        INIT_LIST_HEAD(&oap->oap_rpc_item);
 
        spin_lock_init(&oap->oap_lock);
-       CDEBUG(D_INFO, "oap %p page %p obj off "LPU64"\n",
+       CDEBUG(D_INFO, "oap %p page %p obj off %llu\n",
               oap, page, oap->oap_obj_off);
        RETURN(0);
 }
@@ -2322,8 +2378,7 @@ int osc_queue_async_io(const struct lu_env *env, struct cl_io *io,
 
        /* Set the OBD_BRW_SRVLOCK before the page is queued. */
        brw_flags |= ops->ops_srvlock ? OBD_BRW_SRVLOCK : 0;
-       if (!client_is_remote(osc_export(osc)) &&
-           cfs_capable(CFS_CAP_SYS_RESOURCE)) {
+       if (cfs_capable(CFS_CAP_SYS_RESOURCE)) {
                brw_flags |= OBD_BRW_NOQUOTA;
                cmd |= OBD_BRW_NOQUOTA;
        }
@@ -2332,7 +2387,7 @@ int osc_queue_async_io(const struct lu_env *env, struct cl_io *io,
        if (!(cmd & OBD_BRW_NOQUOTA)) {
                struct cl_object *obj;
                struct cl_attr   *attr;
-               unsigned int qid[MAXQUOTAS];
+               unsigned int qid[LL_MAXQUOTAS];
 
                obj = cl_object_top(&osc->oo_cl);
                attr = &osc_env_info(env)->oti_attr;
@@ -2748,7 +2803,7 @@ again:
                        break;
                }
 
-               OSC_EXTENT_DUMP(D_CACHE, ext, "try to trunc:"LPU64".\n", size);
+               OSC_EXTENT_DUMP(D_CACHE, ext, "try to trunc:%llu.\n", size);
 
                osc_extent_get(ext);
                if (ext->oe_state == OES_ACTIVE) {
@@ -2811,7 +2866,7 @@ again:
                        LASSERT(*extp == NULL);
                        *extp = osc_extent_get(ext);
                        OSC_EXTENT_DUMP(D_CACHE, ext,
-                                       "trunc at "LPU64"\n", size);
+                                       "trunc at %llu\n", size);
                }
                osc_extent_put(env, ext);
        }