LU-18931 build: Update ZFS version to 2.2.7

author Jian Yu <yujian@whamcloud.com>

Thu, 8 May 2025 06:48:47 +0000 (23:48 -0700)

committer Oleg Drokin <green@whamcloud.com>

Wed, 21 May 2025 03:42:33 +0000 (03:42 +0000)
author Jian Yu <yujian@whamcloud.com>
Thu, 8 May 2025 06:48:47 +0000 (23:48 -0700)
committer Oleg Drokin <green@whamcloud.com>
Wed, 21 May 2025 03:42:33 +0000 (03:42 +0000)
diff --git a/contrib/lbuild/lbuild b/contrib/lbuild/lbuild

index b08bf12..e4e2484 100755 (executable)
--- a/contrib/lbuild/lbuild
+++ b/contrib/lbuild/lbuild
@@ -1029,7 +1029,7 @@ build_spl_zfs() {
      # The spl/zfs spec files expect RPM_BUILD_ROOT to point to the root of the
      # destination for the rpms
      export RPM_BUILD_ROOT=$TOPDIR
-    SPLZFSVER=${SPLZFSVER:-2.1.15}
+    SPLZFSVER=${SPLZFSVER:-2.2.7}
      SPLZFSTAG=${SPLZFSTAG:-}
      # "spl zfs" prior to 0.8.0
      # "zfs" for 0.8.0 and later
diff --git a/lustre.spec.in b/lustre.spec.in

index ef340a9..52e60d9 100644 (file)
--- a/lustre.spec.in
+++ b/lustre.spec.in
@@ -313,7 +313,7 @@ Requires: libmount
  Provides: lustre-osd-mount = %{version}
  Obsoletes: lustre-osd-mount < %{version}
  %if 0%{confzfsdobjpath} != 0
-BuildRequires: (libzfs-devel or libzfs4-devel or libzfs5-devel)
+BuildRequires: (libzfs-devel or libzfs4-devel or libzfs5-devel or libzfs6-devel)
  %endif
  # end confzfsdobjpath
  # Tests also require zpool from zfs package:
diff --git a/lustre/osd-zfs/osd_handler.c b/lustre/osd-zfs/osd_handler.c

index c87d1c3..71f4715 100644 (file)
--- a/lustre/osd-zfs/osd_handler.c
+++ b/lustre/osd-zfs/osd_handler.c
@@ -757,10 +757,14 @@ static void *osd_key_init(const struct lu_context *ctx,
         struct osd_thread_info *info;
  
         OBD_ALLOC_PTR(info);
-       if (info != NULL)
-               info->oti_env = container_of(ctx, struct lu_env, le_ctx);
-       else
-               info = ERR_PTR(-ENOMEM);
+       if (!info)
+               return ERR_PTR(-ENOMEM);
+
+       info->oti_env = container_of(ctx, struct lu_env, le_ctx);
+#ifdef ZAP_MAXNAMELEN_NEW
+       info->oti_za.za_name_len = MAXNAMELEN;
+       info->oti_za2.za_name_len = MAXNAMELEN;
+#endif
         return info;
  }
  
diff --git a/lustre/osd-zfs/osd_index.c b/lustre/osd-zfs/osd_index.c

index 8b32422..03ee107 100644 (file)
--- a/lustre/osd-zfs/osd_index.c
+++ b/lustre/osd-zfs/osd_index.c
@@ -158,6 +158,9 @@ static struct dt_it *osd_index_it_init(const struct lu_env *env,
  
         it->ozi_obj   = obj;
         it->ozi_reset = 1;
+#ifdef ZAP_MAXNAMELEN_NEW
+       it->ozi_za.za_name_len = MAXNAMELEN;
+#endif
         lu_object_get(lo);
  
         RETURN((struct dt_it *)it);
@@ -1330,7 +1333,11 @@ static int osd_dir_it_next(const struct lu_env *env, struct dt_it *di)
         ENTRY;
  
         /* temp. storage should be enough for any key supported by ZFS */
+#ifdef ZAP_MAXNAMELEN_NEW
+       LASSERT(za->za_name_len <= sizeof(it->ozi_name));
+#else
         BUILD_BUG_ON(sizeof(za->za_name) > sizeof(it->ozi_name));
+#endif
  
         /*
          * the first ->next() moves the cursor to .
diff --git a/lustre/osd-zfs/osd_internal.h b/lustre/osd-zfs/osd_internal.h

index e11f84d..15c41dd 100644 (file)
--- a/lustre/osd-zfs/osd_internal.h
+++ b/lustre/osd-zfs/osd_internal.h
@@ -165,6 +165,10 @@ struct osd_zap_it {
         enum osd_zap_pos         ozi_pos;
         struct luz_direntry      ozi_zde;
         zap_attribute_t          ozi_za;
+#ifdef ZAP_MAXNAMELEN_NEW
+       /* flexible array: zap_attribute_t.za_name[], ensure space allocated */
+       char                     ozi_za_name_buffer[MAXNAMELEN];
+#endif
         union {
                 char             ozi_name[MAXNAMELEN]; /* file name for dir */
                 __u64            ozi_key; /* binary key for index files */
@@ -258,7 +262,15 @@ struct osd_thread_info {
         struct lu_attr           oti_la;
         struct osa_attr          oti_osa;
         zap_attribute_t          oti_za;
+#ifdef ZAP_MAXNAMELEN_NEW
+       /* flexible array: zap_attribute_t.za_name[], ensure space allocated */
+       char                     oti_za_name_buffer[MAXNAMELEN];
+#endif
         zap_attribute_t          oti_za2;
+#ifdef ZAP_MAXNAMELEN_NEW
+       /* flexible array: zap_attribute_t.za_name[], ensure space allocated */
+       char                     oti_za2_name_buffer[MAXNAMELEN];
+#endif
         dmu_object_info_t        oti_doi;
         struct luz_direntry      oti_zde;
  
@@ -466,6 +478,7 @@ struct osd_object {
  
         /* the i_flags in LMA */
         __u32                    oo_lma_flags;
+       __u32                    oo_next_blocksize;
         union {
                 int             oo_ea_in_bonus; /* EA bytes we expect */
                 struct {
diff --git a/lustre/osd-zfs/osd_io.c b/lustre/osd-zfs/osd_io.c

index ac98fba..ef867e9 100644 (file)
--- a/lustre/osd-zfs/osd_io.c
+++ b/lustre/osd-zfs/osd_io.c
@@ -62,9 +62,13 @@
  
  char osd_0copy_tag[] = "zerocopy";
  
+static void osd_choose_next_blocksize(struct osd_object *obj,
+                                     loff_t off, ssize_t len);
+
  static void dbuf_set_pending_evict(dmu_buf_t *db)
  {
         dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
+
         dbi->db_pending_evict = TRUE;
  }
  
@@ -168,19 +172,20 @@ static ssize_t osd_declare_write(const struct lu_env *env, struct dt_object *dt,
                                 const struct lu_buf *buf, loff_t pos,
                                 struct thandle *th)
  {
-       struct osd_object  *obj  = osd_dt_obj(dt);
-       struct osd_device  *osd = osd_obj2dev(obj);
+       struct osd_object *obj = osd_dt_obj(dt);
+       struct osd_device *osd = osd_obj2dev(obj);
         loff_t _pos = pos, max = 0;
         struct osd_thandle *oh;
-       uint64_t            oid;
-       ENTRY;
+       uint64_t oid;
  
+       ENTRY;
         oh = container_of(th, struct osd_thandle, ot_super);
  
         /* in some cases declare can race with creation (e.g. llog)
          * and we need to wait till object is initialized. notice
          * LOHA_EXISTs is supposed to be the last step in the
-        * initialization */
+        * initialization
+        */
  
         /* size change (in dnode) will be declared by dmu_tx_hold_write() */
         if (dt_object_exists(dt))
@@ -190,7 +195,8 @@ static ssize_t osd_declare_write(const struct lu_env *env, struct dt_object *dt,
  
         /* XXX: we still miss for append declaration support in ZFS
          *      -1 means append which is used by llog mostly, llog
-        *      can grow upto LLOG_MIN_CHUNK_SIZE*8 records */
+        *      can grow up to LLOG_MIN_CHUNK_SIZE*8 records
+        */
         max = max_t(loff_t, 256 * 8 * LLOG_MIN_CHUNK_SIZE,
                     obj->oo_attr.la_size + (2 << 20));
         if (pos == -1)
@@ -233,7 +239,8 @@ static ssize_t osd_declare_write(const struct lu_env *env, struct dt_object *dt,
  
         /* dt_declare_write() is usually called for system objects, such
          * as llog or last_rcvd files. We needn't enforce quota on those
-        * objects, so always set the lqi_space as 0. */
+        * objects, so always set the lqi_space as 0.
+        */
         RETURN(osd_declare_quota(env, osd, obj->oo_attr.la_uid,
                                  obj->oo_attr.la_gid, obj->oo_attr.la_projid,
                                  0, oh, NULL, OSD_QID_BLK));
@@ -248,6 +255,7 @@ static dmu_buf_t *osd_get_dbuf(struct osd_object *obj, uint64_t offset)
         blkid = dbuf_whichblock(obj->oo_dn, 0, offset);
         for (i = 0; i < OSD_MAX_DBUFS; i++) {
                 dmu_buf_impl_t *dbi = (void *)dbs[i];
+
                 if (!dbs[i])
                         continue;
                 if (dbi->db_blkid == blkid)
@@ -320,14 +328,13 @@ static ssize_t osd_write(const struct lu_env *env, struct dt_object *dt,
                         const struct lu_buf *buf, loff_t *pos,
                         struct thandle *th)
  {
-       struct osd_object  *obj  = osd_dt_obj(dt);
-       struct osd_device  *osd = osd_obj2dev(obj);
+       struct osd_object *obj = osd_dt_obj(dt);
+       struct osd_device *osd = osd_obj2dev(obj);
         struct osd_thandle *oh;
-       uint64_t            offset = *pos;
-       int                 rc;
+       uint64_t offset = *pos;
+       int rc;
  
         ENTRY;
-
         LASSERT(dt_object_exists(dt));
         LASSERT(obj->oo_dn);
  
@@ -350,7 +357,8 @@ static ssize_t osd_write(const struct lu_env *env, struct dt_object *dt,
                 write_unlock(&obj->oo_attr_lock);
                 /* osd_object_sa_update() will be copying directly from oo_attr
                  * into dbuf.  any update within a single txg will copy the
-                * most actual */
+                * most actual
+                */
                 rc = osd_object_sa_update(obj, SA_ZPL_SIZE(osd),
                                         &obj->oo_attr.la_size, 8, oh);
                 if (unlikely(rc))
@@ -382,8 +390,8 @@ static int osd_bufs_put(const struct lu_env *env, struct dt_object *dt,
  {
         struct osd_object *obj  = osd_dt_obj(dt);
         struct osd_device *osd = osd_obj2dev(obj);
-       unsigned long      ptr;
-       int                i;
+       unsigned long ptr;
+       int i;
  
         LASSERT(dt_object_exists(dt));
         LASSERT(obj->oo_dn);
@@ -405,10 +413,12 @@ static int osd_bufs_put(const struct lu_env *env, struct dt_object *dt,
                                 atomic_dec(&osd->od_zerocopy_pin);
                         } else if (lnb[i].lnb_data != NULL) {
                                 int j, apages, abufsz;
+
                                 abufsz = arc_buf_size(lnb[i].lnb_data);
                                 apages = abufsz >> PAGE_SHIFT;
                                 /* these references to pages must be invalidated
-                                * to prevent access in osd_bufs_put() */
+                                * to prevent access in osd_bufs_put()
+                                */
                                 for (j = 0; j < apages; j++)
                                         lnb[i + j].lnb_page = NULL;
                                 dmu_return_arcbuf(lnb[i].lnb_data);
@@ -521,7 +531,8 @@ static int osd_bufs_get_read(const struct lu_env *env, struct osd_object *obj,
                                 lnb->lnb_page = kmem_to_page(dbp[i]->db_data +
                                                              bufoff);
                                 /* mark just a single slot: we need this
-                                * reference to dbuf to be released once */
+                                * reference to dbuf to be released once
+                                */
                                 lnb->lnb_data = dbf;
                                 dbf = NULL;
  
@@ -537,8 +548,7 @@ static int osd_bufs_get_read(const struct lu_env *env, struct osd_object *obj,
                         if (drop_cache)
                                 dbuf_set_pending_evict(dbp[i]);
  
-                       /* steal dbuf so dmu_buf_rele_array() can't release
-                        * it */
+                       /* steal dbuf so dmu_buf_rele_array() can't free it */
                         dbp[i] = NULL;
                 }
  
@@ -587,13 +597,16 @@ static int osd_bufs_get_write(const struct lu_env *env, struct osd_object *obj,
                               int maxlnb)
  {
         struct osd_device *osd = osd_obj2dev(obj);
-       int                poff, plen, off_in_block, sz_in_block;
-       int                rc, i = 0, npages = 0;
+       int poff, plen, off_in_block, sz_in_block;
+       int rc, i = 0, npages = 0;
         dnode_t *dn = obj->oo_dn;
         arc_buf_t *abuf;
         uint32_t bs = dn->dn_datablksz;
+
         ENTRY;
  
+       osd_choose_next_blocksize(obj, off, len);
+
         /*
          * currently only full blocks are subject to zerocopy approach:
          * so that we're sure nobody is trying to update the same block
@@ -617,7 +630,8 @@ static int osd_bufs_get_write(const struct lu_env *env, struct osd_object *obj,
                         atomic_inc(&osd->od_zerocopy_loan);
  
                         /* go over pages arcbuf contains, put them as
-                        * local niobufs for ptlrpc's bulks */
+                        * local niobufs for ptlrpc's bulks
+                        */
                         while (sz_in_block > 0) {
                                 plen = min_t(int, sz_in_block, PAGE_SIZE);
  
@@ -704,7 +718,7 @@ static int osd_bufs_get(const struct lu_env *env, struct dt_object *dt,
                         int maxlnb, enum dt_bufs_type rw)
  {
         struct osd_object *obj  = osd_dt_obj(dt);
-       int                rc;
+       int rc;
  
         LASSERT(dt_object_exists(dt));
         LASSERT(obj->oo_dn);
@@ -747,21 +761,21 @@ static int osd_declare_write_commit(const struct lu_env *env,
                                     struct niobuf_local *lnb, int npages,
                                     struct thandle *th)
  {
-       struct osd_object  *obj = osd_dt_obj(dt);
-       struct osd_device  *osd = osd_obj2dev(obj);
+       struct osd_object *obj = osd_dt_obj(dt);
+       struct osd_device *osd = osd_obj2dev(obj);
         struct osd_thandle *oh;
-       uint64_t            offset = 0;
-       uint32_t            size = 0;
+       uint64_t offset = 0;
+       uint32_t size = 0;
         uint32_t blksz = obj->oo_dn->dn_datablksz;
-       int                 i, rc;
+       int i, rc;
         bool synced = false;
-       long long           space = 0;
-       struct page        *last_page = NULL;
-       unsigned long       discont_pages = 0;
+       long long space = 0;
+       struct page *last_page = NULL;
+       unsigned long discont_pages = 0;
         enum osd_quota_local_flags local_flags = 0;
         enum osd_qid_declare_flags declare_flags = OSD_QID_BLK;
-       ENTRY;
  
+       ENTRY;
         LASSERT(dt_object_exists(dt));
         LASSERT(obj->oo_dn);
  
@@ -778,13 +792,15 @@ static int osd_declare_write_commit(const struct lu_env *env,
                         /* ENOSPC, network RPC error, etc.
                          * We don't want to book space for pages which will be
                          * skipped in osd_write_commit(). Hence we skip pages
-                        * with lnb_rc != 0 here too */
+                        * with lnb_rc != 0 here too
+                        */
                         continue;
                 /* ignore quota for the whole request if any page is from
                  * client cache or written by root.
                  *
                  * XXX we could handle this on per-lnb basis as done by
-                * grant. */
+                * grant.
+                */
                 if ((lnb[i].lnb_flags & OBD_BRW_NOQUOTA) ||
                     (lnb[i].lnb_flags & OBD_BRW_SYS_RESOURCE) ||
                     !(lnb[i].lnb_flags & OBD_BRW_SYNC))
@@ -809,7 +825,8 @@ static int osd_declare_write_commit(const struct lu_env *env,
                  * indirect blocks and just use as a rough estimate the worse
                  * case where the old space is being held by a snapshot. Quota
                  * overrun will be adjusted once the operation is committed, if
-                * required. */
+                * required.
+                */
                 space += osd_roundup2blocksz(size, offset, blksz);
  
                 offset = lnb[i].lnb_file_offset;
@@ -822,8 +839,7 @@ static int osd_declare_write_commit(const struct lu_env *env,
                 space += osd_roundup2blocksz(size, offset, blksz);
         }
  
-       /* backend zfs filesystem might be configured to store multiple data
-        * copies */
+       /* backend zfs FS might be configured to store multiple data copies */
         space  *= osd->od_os->os_copies;
         space   = toqb(space);
         CDEBUG(D_QUOTA, "writing %d pages, reserving %lldK of quota space\n",
@@ -847,7 +863,8 @@ retry:
  
         /* we need only to store the overquota flags in the first lnb for
          * now, once we support multiple objects BRW, this code needs be
-        * revised. */
+        * revised.
+        */
         if (local_flags & QUOTA_FL_OVER_USRQUOTA)
                 lnb[0].lnb_flags |= OBD_BRW_OVER_USRQUOTA;
         if (local_flags & QUOTA_FL_OVER_GRPQUOTA)
@@ -866,57 +883,71 @@ retry:
   * maximum blocksize the dataset can support. Otherwise, it will pick a
   * a block size by the writing region of this I/O.
   */
-static int osd_grow_blocksize(struct osd_object *obj, struct osd_thandle *oh,
-                             uint64_t start, uint64_t end)
+static int osd_grow_blocksize(struct osd_object *obj, struct osd_thandle *oh)
  {
-       struct osd_device       *osd = osd_obj2dev(obj);
+       struct osd_device *osd = osd_obj2dev(obj);
         dnode_t *dn = obj->oo_dn;
-       uint32_t                 blksz;
-       int                      rc = 0;
+       int rc = 0;
  
         ENTRY;
  
+       if (obj->oo_next_blocksize == 0) /* no block size chosen yet */
+               return 0;
         if (dn->dn_maxblkid > 0) /* can't change block size */
                 GOTO(out, rc);
-
         if (dn->dn_datablksz >= osd->od_max_blksz)
                 GOTO(out, rc);
+       if (dn->dn_datablksz == obj->oo_next_blocksize) /* already set */
+               GOTO(out, rc);
  
         down_write(&obj->oo_guard);
-
-       blksz = dn->dn_datablksz;
-       if (blksz >= osd->od_max_blksz) /* check again after grabbing lock */
-               GOTO(out_unlock, rc);
-
-       /* now ZFS can support up to 16MB block size, and if the write
-        * is sequential, it just increases the block size gradually */
-       if (start <= blksz) { /* sequential */
-               blksz = (uint32_t)min_t(uint64_t, osd->od_max_blksz, end);
-       } else { /* sparse, pick a block size by write region */
-               blksz = (uint32_t)min_t(uint64_t, osd->od_max_blksz,
-                                       end - start);
-       }
-
-       if (!is_power_of_2(blksz))
-               blksz = size_roundup_power2(blksz);
-
-       if (blksz > dn->dn_datablksz) {
+       if (dn->dn_datablksz < obj->oo_next_blocksize) { /* only ever grow */
+               CDEBUG(D_INODE, "set blksz to %u\n", obj->oo_next_blocksize);
                 rc = -dmu_object_set_blocksize(osd->od_os, dn->dn_object,
-                                              blksz, 0, oh->ot_tx);
-               LASSERT(ergo(rc == 0, dn->dn_datablksz >= blksz));
+                                              obj->oo_next_blocksize, 0,
+                                              oh->ot_tx);
                 if (rc < 0)
-                       CDEBUG(D_INODE, "object "DFID": change block size"
+                       CDEBUG(D_ERROR, "object "DFID": change block size "
                                "%u -> %u error rc = %d\n",
                                PFID(lu_object_fid(&obj->oo_dt.do_lu)),
-                              dn->dn_datablksz, blksz, rc);
+                              dn->dn_datablksz, obj->oo_next_blocksize, rc);
         }
         EXIT;
-out_unlock:
         up_write(&obj->oo_guard);
  out:
         return rc;
  }
  
+static void osd_choose_next_blocksize(struct osd_object *obj,
+                                     loff_t off, ssize_t len)
+{
+       struct osd_device *osd = osd_obj2dev(obj);
+       dnode_t *dn = obj->oo_dn;
+       uint32_t blksz;
+
+       if (dn->dn_maxblkid > 0) /* can't change block size */
+               return;
+
+       if (dn->dn_datablksz >= osd->od_max_blksz) /* already at max */
+               return;
+
+       /*
+        * The client sends data from its own writeback cache after local
+        * aggregation, so a write starting at offset 0 is likely one "unit
+        * of write"; use its length as the candidate block size.
+        */
+       if (off != 0)
+               return;
+
+       blksz = (uint32_t)min_t(uint64_t, osd->od_max_blksz, len);
+       if (!is_power_of_2(blksz))
+               blksz = size_roundup_power2(blksz);
+
+       /* XXX: unlocked update; concurrent writers may race - locking? */
+       if (blksz > obj->oo_next_blocksize)
+               obj->oo_next_blocksize = blksz;
+}
+
  static void osd_evict_dbufs_after_write(struct osd_object *obj,
                                         loff_t off, ssize_t len)
  {
@@ -938,14 +969,14 @@ static int osd_write_commit(const struct lu_env *env, struct dt_object *dt,
                         struct niobuf_local *lnb, int npages,
                         struct thandle *th, __u64 user_size)
  {
-       struct osd_object  *obj  = osd_dt_obj(dt);
-       struct osd_device  *osd = osd_obj2dev(obj);
+       struct osd_object *obj = osd_dt_obj(dt);
+       struct osd_device *osd = osd_obj2dev(obj);
         struct osd_thandle *oh;
-       uint64_t            new_size = 0;
-       int                 i, abufsz, rc = 0, drop_cache = 0;
-       unsigned long      iosize = 0;
-       ENTRY;
+       uint64_t new_size = 0;
+       int i, abufsz, rc = 0, drop_cache = 0;
+       unsigned long iosize = 0;
  
+       ENTRY;
         LASSERT(dt_object_exists(dt));
         LASSERT(obj->oo_dn);
  
@@ -953,9 +984,7 @@ static int osd_write_commit(const struct lu_env *env, struct dt_object *dt,
         oh = container_of(th, struct osd_thandle, ot_super);
  
         /* adjust block size. Assume the buffers are sorted. */
-       (void)osd_grow_blocksize(obj, oh, lnb[0].lnb_file_offset,
-                                lnb[npages - 1].lnb_file_offset +
-                                lnb[npages - 1].lnb_len);
+       (void)osd_grow_blocksize(obj, oh);
  
         if (obj->oo_attr.la_size >= osd->od_readcache_max_filesize ||
             lnb[npages - 1].lnb_file_offset + lnb[npages - 1].lnb_len >=
@@ -1003,13 +1032,14 @@ static int osd_write_commit(const struct lu_env *env, struct dt_object *dt,
  
         for (i = 0; i < npages; i++) {
                 CDEBUG(D_INODE, "write %u bytes at %u\n",
-                       (unsigned) lnb[i].lnb_len,
-                       (unsigned) lnb[i].lnb_file_offset);
+                       (unsigned int) lnb[i].lnb_len,
+                       (unsigned int) lnb[i].lnb_file_offset);
  
                 if (lnb[i].lnb_rc) {
                         /* ENOSPC, network RPC error, etc.
                          * Unlike ldiskfs, zfs allocates new blocks on rewrite,
-                        * so we skip this page if lnb_rc is set to -ENOSPC */
+                        * so we skip this page if lnb_rc is set to -ENOSPC
+                        */
                         CDEBUG(D_INODE, "obj "DFID": skipping lnb[%u]: rc=%d\n",
                                 PFID(lu_object_fid(&dt->do_lu)), i,
                                 lnb[i].lnb_rc);
@@ -1030,30 +1060,35 @@ static int osd_write_commit(const struct lu_env *env, struct dt_object *dt,
                         abufsz = lnb[i].lnb_len; /* to drop cache below */
                 } else if (lnb[i].lnb_data) {
                         int j, apages;
+
                         LASSERT(((unsigned long)lnb[i].lnb_data & 1) == 0);
                         /* buffer loaned for zerocopy, try to use it.
                          * notice that dmu_assign_arcbuf() is smart
                          * enough to recognize changed blocksize
-                        * in this case it fallbacks to dmu_write() */
+                        * in this case it falls back to dmu_write()
+                        */
                         abufsz = arc_buf_size(lnb[i].lnb_data);
                         LASSERT(abufsz & PAGE_MASK);
                         apages = abufsz >> PAGE_SHIFT;
                         LASSERT(i + apages <= npages);
                         /* these references to pages must be invalidated
-                        * to prevent access in osd_bufs_put() */
+                        * to prevent access in osd_bufs_put()
+                        */
                         for (j = 0; j < apages; j++)
                                 lnb[i + j].lnb_page = NULL;
                         dmu_assign_arcbuf(&obj->oo_dn->dn_bonus->db,
                                           lnb[i].lnb_file_offset,
                                           lnb[i].lnb_data, oh->ot_tx);
                         /* drop the reference, otherwise osd_put_bufs()
-                        * will be releasing it - bad! */
+                        * will be releasing it - bad!
+                        */
                         lnb[i].lnb_data = NULL;
                         atomic_dec(&osd->od_zerocopy_loan);
                         iosize += abufsz;
                 } else {
                         /* we don't want to deal with cache if nothing
-                        * has been send to ZFS at this step */
+                        * has been sent to ZFS at this step
+                        */
                         continue;
                 }
  
@@ -1062,7 +1097,8 @@ static int osd_write_commit(const struct lu_env *env, struct dt_object *dt,
  
                 /* we have to mark dbufs for eviction here because
                  * dmu_assign_arcbuf() may create a new dbuf for
-                * loaned abuf */
+                * loaned abuf
+                */
                 osd_evict_dbufs_after_write(obj, lnb[i].lnb_file_offset,
                                             abufsz);
         }
@@ -1071,7 +1107,8 @@ static int osd_write_commit(const struct lu_env *env, struct dt_object *dt,
                 /* no pages to write, no transno is needed */
                 th->th_local = 1;
                 /* it is important to return 0 even when all lnb_rc == -ENOSPC
-                * since ofd_commitrw_write() retries several times on ENOSPC */
+                * since ofd_commitrw_write() retries several times on ENOSPC
+                */
                 up_read(&obj->oo_guard);
                 record_end_io(osd, WRITE, 0, 0, 0);
                 RETURN(0);
@@ -1086,7 +1123,8 @@ static int osd_write_commit(const struct lu_env *env, struct dt_object *dt,
                 write_unlock(&obj->oo_attr_lock);
                 /* osd_object_sa_update() will be copying directly from
                  * oo_attr into dbuf. any update within a single txg will copy
-                * the most actual */
+                * the most actual
+                */
                 rc = osd_object_sa_update(obj, SA_ZPL_SIZE(osd),
                                           &obj->oo_attr.la_size, 8, oh);
         } else {
@@ -1104,8 +1142,8 @@ static int osd_read_prep(const struct lu_env *env, struct dt_object *dt,
                         struct niobuf_local *lnb, int npages)
  {
         struct osd_object *obj  = osd_dt_obj(dt);
-       int                i;
-       loff_t             eof;
+       int i;
+       loff_t eof;
  
         LASSERT(dt_object_exists(dt));
         LASSERT(obj->oo_dn);
@@ -1156,12 +1194,9 @@ static int __osd_object_punch(struct osd_object *obj, objset_t *os,
         uint64_t size = obj->oo_attr.la_size;
         int rc = 0;
  
-       /* Assert that the transaction has been assigned to a
-          transaction group. */
+       /* Assert the transaction has been assigned to a transaction group */
         LASSERT(tx->tx_txg != 0);
-       /*
-        * Nothing to do if file already at desired length.
-        */
+       /* Nothing to do if file already at desired length. */
         if (len == DMU_OBJECT_END && size == off)
                 return 0;
  
@@ -1188,13 +1223,13 @@ static int __osd_object_punch(struct osd_object *obj, objset_t *os,
  static int osd_punch(const struct lu_env *env, struct dt_object *dt,
                         __u64 start, __u64 end, struct thandle *th)
  {
-       struct osd_object  *obj = osd_dt_obj(dt);
-       struct osd_device  *osd = osd_obj2dev(obj);
+       struct osd_object *obj = osd_dt_obj(dt);
+       struct osd_device *osd = osd_obj2dev(obj);
         struct osd_thandle *oh;
-       __u64               len;
-       int                 rc = 0;
-       ENTRY;
+       __u64 len;
+       int rc = 0;
  
+       ENTRY;
         LASSERT(dt_object_exists(dt));
         LASSERT(osd_invariant(obj));
  
@@ -1234,9 +1269,9 @@ static int osd_declare_punch(const struct lu_env *env, struct dt_object *dt,
         struct osd_object  *obj = osd_dt_obj(dt);
         struct osd_device  *osd = osd_obj2dev(obj);
         struct osd_thandle *oh;
-       __u64               len;
-       ENTRY;
+       __u64 len;
  
+       ENTRY;
         oh = container_of(handle, struct osd_thandle, ot_super);
  
         read_lock(&obj->oo_attr_lock);
@@ -1270,9 +1305,9 @@ static int osd_declare_punch(const struct lu_env *env, struct dt_object *dt,
  static int osd_ladvise(const struct lu_env *env, struct dt_object *dt,
                        __u64 start, __u64 end, enum lu_ladvise_type advice)
  {
-       int     rc;
-       ENTRY;
+       int rc;
  
+       ENTRY;
         switch (advice) {
         default:
                 rc = -ENOTSUPP;
@@ -1286,8 +1321,8 @@ static int osd_fallocate(const struct lu_env *env, struct dt_object *dt,
                          __u64 start, __u64 end, int mode, struct thandle *th)
  {
         int rc = -EOPNOTSUPP;
-       ENTRY;
  
+       ENTRY;
          /*
           * space preallocation is not supported for ZFS
           * Returns -EOPNOTSUPP for now
@@ -1300,8 +1335,8 @@ static int osd_declare_fallocate(const struct lu_env *env,
                                  int mode, struct thandle *th)
  {
         int rc = -EOPNOTSUPP;
-       ENTRY;
  
+       ENTRY;
          /*
           * space preallocation is not supported for ZFS
           * Returns -EOPNOTSUPP for now
@@ -1320,7 +1355,6 @@ static loff_t osd_lseek(const struct lu_env *env, struct dt_object *dt,
         boolean_t hole = whence == SEEK_HOLE;
  
         ENTRY;
-
         LASSERT(dt_object_exists(dt));
         LASSERT(osd_invariant(obj));
         LASSERT(offset >= 0);
diff --git a/lustre/osd-zfs/osd_object.c b/lustre/osd-zfs/osd_object.c

index a421ff8..5906076 100644 (file)
--- a/lustre/osd-zfs/osd_object.c
+++ b/lustre/osd-zfs/osd_object.c
@@ -142,7 +142,8 @@ void osd_object_sa_dirty_rele(const struct lu_env *env, struct osd_thandle *oh)
                         }
                         up_write(&obj->oo_guard);
                 }
-               sa_spill_rele(obj->oo_sa_hdl);
+               if (obj->oo_sa_hdl)
+                       sa_spill_rele(obj->oo_sa_hdl);
         }
  }
  
diff --git a/lustre/osd-zfs/osd_scrub.c b/lustre/osd-zfs/osd_scrub.c

index b2ce389..cb1d270 100644 (file)
--- a/lustre/osd-zfs/osd_scrub.c
+++ b/lustre/osd-zfs/osd_scrub.c
@@ -1852,6 +1852,11 @@ static int osd_scan_dir(const struct lu_env *env, struct osd_device *dev,
  
         za = &it->ozi_za;
         zde = &it->ozi_zde;
+
+#ifdef ZAP_MAXNAMELEN_NEW
+       za->za_name_len = MAXNAMELEN;
+#endif
+
         while (1) {
                 rc = -zap_cursor_retrieve(it->ozi_zc, za);
                 if (unlikely(rc)) {
diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh

index 69e54b1..228e220 100755 (executable)
--- a/lustre/tests/conf-sanity.sh
+++ b/lustre/tests/conf-sanity.sh
@@ -33,6 +33,12 @@ fi
  #                                  8  22  40  165  (min)
  [ "$SLOW" = "no" ] && EXCEPT_SLOW="45 69 106 111"
  
+if [[ "$mds1_FSTYPE" == "zfs" ]]; then
+       always_except LU-18652 108a 112a 112b 113 117 119 121 122a
+       always_except LU-18652 123aa 123ab 123ac 123ad 123ae 123af 123ag 123ah 123ai
+       always_except LU-18652 123F 123G 123H 126 129 132 133 135 136 137 150 152 153a 153b 153c 155 802a
+fi
+
  build_test_filter
  
  # use small MDS + OST size to speed formatting time
@@ -1796,8 +1802,8 @@ t32_verify_quota() {
                 "$fsname.quota.ost" ug
  
         chmod 0777 $mnt
-       runas -u $T32_QID -g $T32_QID dd if=/dev/zero of=$mnt/t32_qf_new \
-               bs=1M count=$((img_blimit / 1024)) oflag=sync && {
+       runas -u $T32_QID -g $T32_QID $DD of=$mnt/t32_qf_new \
+               count=$((img_blimit / 1024)) oflag=sync && {
                 echo "Write succeed, but expect -EDQUOT"
                 return 1
         }
diff --git a/lustre/tests/obdfilter-survey.sh b/lustre/tests/obdfilter-survey.sh

index 059694e..5a1f655 100644 (file)
--- a/lustre/tests/obdfilter-survey.sh
+++ b/lustre/tests/obdfilter-survey.sh
@@ -8,7 +8,12 @@ init_logging
  
  # bug number for skipped test:
  ALWAYS_EXCEPT="$OBDFILTER_SURVEY_EXCEPT "
-# UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
+
+# it would be nice to have an "all" option :-)
+ if [[ $mds1_FSTYPE == zfs ]] &&
+    (( $(zfs_version_code mds1) >= $(version_code 2.2.7) )); then
+       always_except LU-18889 1a 1b 1c 2a 2b 3a
+fi
  
  build_test_filter
  
diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh

index 9f968b6..c5ada84 100755 (executable)
--- a/lustre/tests/sanity.sh
+++ b/lustre/tests/sanity.sh
@@ -75,19 +75,17 @@ if (( $LINUX_VERSION_CODE >= $(version_code 4.18.0) &&
         ALWAYS_EXCEPT+=" 411"
  fi
  
-#                                  5              12     8   12  15   (min)"
-[ "$SLOW" = "no" ] && EXCEPT_SLOW="27m 60i 64b 68 71 115 135 136 230d 300o"
+# minutes runtime:                   5              12     8   12   15
+[[ "$SLOW" = "no" ]] && EXCEPT_SLOW="27m 60i 64b 68 71 115 135 136 230d 300o"
  
-if [ "$mds1_FSTYPE" = "zfs" ]; then
-       # bug number for skipped test:
-       ALWAYS_EXCEPT+="              "
+if [[ "$mds1_FSTYPE" == "zfs" ]]; then
         #                                               13    (min)"
-       [ "$SLOW" = "no" ] && EXCEPT_SLOW="$EXCEPT_SLOW 51b"
+       [[ "$SLOW" == "no" ]] && EXCEPT_SLOW="$EXCEPT_SLOW 51b"
  fi
  
-if [ "$ost1_FSTYPE" = "zfs" ]; then
-       # bug number for skipped test:  LU-1941  LU-1941  LU-1941  LU-1941
-       ALWAYS_EXCEPT+="                130b 130c 130d 130e 130f 130g"
+if [[ "$ost1_FSTYPE" = "zfs" ]]; then
+       always_except LU-1941 130b 130c 130d 130e 130f 130g
+       always_except LU-9054 312
  fi
  
  proc_regexp="/{proc,sys}/{fs,sys,kernel/debug}/{lustre,lnet}/"
@@ -9554,7 +9552,7 @@ test_66() {
         [ $PARALLEL == "yes" ] && skip "skip parallel run"
  
         COUNT=${COUNT:-8}
-       dd if=/dev/zero of=$DIR/f66 bs=1k count=$COUNT
+       dd if=/dev/urandom of=$DIR/f66 bs=1k count=$COUNT
         sync; sync_all_data; sync; sync_all_data
         cancel_lru_locks osc
         BLOCKS=`ls -s $DIR/f66 | awk '{ print $1 }'`
@@ -24204,6 +24202,7 @@ test_311() {
         remote_mds_nodsh && skip "remote MDS with nodsh"
  
         local old_iused=$($LFS df -i | awk '/OST0000/ { print $3; exit; }')
+       echo "old_iused=$old_iused"
         local mdts=$(comma_list $(mdts_nodes))
  
         mkdir -p $DIR/$tdir
@@ -24212,11 +24211,13 @@ test_311() {
  
         # statfs data is not real time, let's just calculate it
         old_iused=$((old_iused + 1000))
+       echo "suppose current old_iused=$old_iused"
  
         local count=$(do_facet $SINGLEMDS "$LCTL get_param -n \
                         osp.*OST0000*MDT0000.create_count")
         local max_count=$(do_facet $SINGLEMDS "$LCTL get_param -n \
                                 osp.*OST0000*MDT0000.max_create_count")
+       echo "create_count=$count, max_create_count=$max_count"
         do_nodes $mdts "$LCTL set_param -n osp.*OST0000*.max_create_count=0"
  
         $LFS setstripe -i 0 $DIR/$tdir/$tfile || error "setstripe failed"
@@ -24224,6 +24225,8 @@ test_311() {
         [ $index -ne 0 ] || error "$tfile stripe index is 0"
  
         unlinkmany $DIR/$tdir/$tfile. 1000
+       wait_delete_completed
+       wait_zfs_commit $SINGLEMDS 10
  
         do_nodes $mdts "$LCTL set_param -n \
                         osp.*OST0000*.max_create_count=$max_count"
@@ -24236,14 +24239,15 @@ test_311() {
         local new_iused
         for i in $(seq 120); do
                 new_iused=$($LFS df -i | awk '/OST0000/ { print $3; exit; }')
+               echo -n "$new_iused "
                 # system may be too busy to destroy all objs in time, use
                 # a somewhat small value to not fail autotest
-               [ $((old_iused - new_iused)) -gt 400 ] && break
+               ((old_iused - new_iused > 400)) && break
                 sleep 1
         done
  
-       echo "waited $i sec, old Iused $old_iused, new Iused $new_iused"
-       [ $((old_iused - new_iused)) -gt 400 ] ||
+       echo -e "\nwaited $i sec, old Iused $old_iused, new Iused $new_iused"
+       ((old_iused - new_iused > 400)) ||
                 error "objs not destroyed after unlink"
  }
  run_test 311 "disable OSP precreate, and unlink should destroy objs"
@@ -25694,7 +25698,7 @@ generate_uneven_mdts() {
         if check_fallocate_supported mds$((min_index + 1)); then
                 cmd="fallocate -l 128K "
         else
-               cmd="dd if=/dev/zero bs=128K count=1 of="
+               cmd="$DD bs=128K count=1 of="
         fi
  
         echo "using cmd $cmd"
diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh

index 9a385a2..075ee67 100755 (executable)
--- a/lustre/tests/test-framework.sh
+++ b/lustre/tests/test-framework.sh
@@ -429,6 +429,13 @@ init_test_env() {
         export MACHINEFILE=${MACHINEFILE:-$TMP/$(basename $0 .sh).machines}
         . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
         get_lustre_env
+       # use /dev/urandom when consuming space on ZFS to avoid compression
+       if [[ "$ost1_FSTYPE" == "zfs" ]]; then
+               DD_DEV="/dev/urandom"
+       else
+               DD_DEV="/dev/zero"
+       fi
+       DD="dd if=$DD_DEV bs=1M"
  
         # use localrecov to enable recovery for local clients, LU-12722
         [[ $MDS1_VERSION -lt $(version_code 2.13.52) ]] || {
@@ -540,6 +547,18 @@ lustre_version_code() {
         version_code $(lustre_build_version $1)
  }
  
+zfs_version_code() { # usage: zfs_version_code <facet>; echoes a version code
+       local facet=$1
+       local facet_version=${facet}_ZFS_VERSION # name of the per-facet variable
+
+       if [[ -z "${!facet_version}" ]]; then # only query when it is unset/empty
+               local zfs_ver=$(do_facet $facet "modinfo --field version zfs")
+
+               export $facet_version=$(version_code ${zfs_ver%-*}) # drop last "-suffix"
+       fi
+       echo ${!facet_version}
+}
+
  module_loaded () {
         /sbin/lsmod | grep -q "^\<$1\>"
  }
@@ -3185,7 +3204,7 @@ wait_zfs_commit() {
         # the occupied disk space will be released
         # only after TXGs are committed
         if [[ $(facet_fstype $1) == zfs ]]; then
-               echo "sleep $zfs_wait for ZFS $(facet_fstype $1)"
+               echo "sleep $zfs_wait for ZFS $(facet_type $1)"
                 sleep $zfs_wait
         fi
  }
diff --git a/lustre/utils/libmount_utils_zfs.c b/lustre/utils/libmount_utils_zfs.c

index 3f203c4..e6d30ca 100644 (file)
--- a/lustre/utils/libmount_utils_zfs.c
+++ b/lustre/utils/libmount_utils_zfs.c
@@ -631,6 +631,12 @@ static int zfs_create_vdev(struct mkfs_opts *mop, char *vdev)
  
         return ret;
  }
+/* "zpool create" option; interop will break if we change MAX_NAME from 255 */
+#ifdef ZAP_MAXNAMELEN_NEW
+#define ZFS_LONGNAME_FEATURE   " -o feature@longname=disabled"
+#else /* !ZAP_MAXNAMELEN_NEW: nothing is added to the command line */
+#define ZFS_LONGNAME_FEATURE   ""
+#endif /* ZAP_MAXNAMELEN_NEW */
  
  int zfs_make_lustre(struct mkfs_opts *mop)
  {
@@ -708,7 +714,8 @@ int zfs_make_lustre(struct mkfs_opts *mop)
  
                 memset(mkfs_cmd, 0, PATH_MAX);
                 snprintf(mkfs_cmd, PATH_MAX,
-                       "zpool create -f -O canmount=off %s", pool);
+                       "zpool create%s -f -O canmount=off %s",
+                       ZFS_LONGNAME_FEATURE, pool);
  
                 /* Append the vdev config and create file vdevs as required */
                 while (*mop->mo_pool_vdevs != NULL) {
@@ -774,6 +781,7 @@ int zfs_make_lustre(struct mkfs_opts *mop)
          * zfs 0.6.1 - system attribute based xattrs
          * zfs 0.6.5 - large block support
          * zfs 0.7.0 - large dnode support
+        * zfs 2.2.6 - compression handling
          *
          * Check if zhp is NULL as a defensive measure. Any dataset
          * validation errors that would cause zfs_open() to fail
@@ -781,6 +789,8 @@ int zfs_make_lustre(struct mkfs_opts *mop)
          */
         zhp = zfs_open(g_zfs, ds, ZFS_TYPE_FILESYSTEM);
         if (zhp) {
+               char *opt;
+
                 /* zfs 0.6.1 - system attribute based xattrs */
                 if (!strstr(mop->mo_mkfsopts, "xattr="))
                         zfs_set_prop_str(zhp, "xattr", "sa");
@@ -797,6 +807,24 @@ int zfs_make_lustre(struct mkfs_opts *mop)
                                 zfs_set_prop_str(zhp, "recordsize", "1M");
                 }
  
+               /* zfs 2.2.6 - compression handling */
+               opt = strstr(mop->mo_mkfsopts, "compression=");
+               if (opt) {
+                       char *val;
+
+                       /* pass the bare value ("lz4"), not "compression=lz4" */
+                       opt += strlen("compression=");
+                       /* value ends at ',' or NUL; skip on ENOMEM */
+                       val = strndup(opt, strcspn(opt, ","));
+                       if (val) {
+                               zfs_set_prop_str(zhp, "compression", val);
+                               free(val);
+                       }
+               } else {
+                       /* By default turn off compression */
+                       zfs_set_prop_str(zhp, "compression", "off");
+               }
+
                 zfs_close(zhp);
         }
  
diff --git a/rpm/kmp-lustre-osd-zfs.preamble b/rpm/kmp-lustre-osd-zfs.preamble

index 34a36b1..11fa5e8 100644 (file)
--- a/rpm/kmp-lustre-osd-zfs.preamble
+++ b/rpm/kmp-lustre-osd-zfs.preamble
@@ -1,4 +1,3 @@
-Summary:        Lustre osd-zfs feature support
  Requires:       %{name}-osd-zfs-mount = %{version}
  %if 0%{confzfsdobjpath} != 0
  BuildRequires:  kmod-zfs-devel
author	Jian Yu <yujian@whamcloud.com>
	Thu, 8 May 2025 06:48:47 +0000 (23:48 -0700)
committer	Oleg Drokin <green@whamcloud.com>
	Wed, 21 May 2025 03:42:33 +0000 (03:42 +0000)
contrib/lbuild/lbuild		patch \| blob \| history
lustre.spec.in		patch \| blob \| history
lustre/osd-zfs/osd_handler.c		patch \| blob \| history
lustre/osd-zfs/osd_index.c		patch \| blob \| history
lustre/osd-zfs/osd_internal.h		patch \| blob \| history
lustre/osd-zfs/osd_io.c		patch \| blob \| history
lustre/osd-zfs/osd_object.c		patch \| blob \| history
lustre/osd-zfs/osd_scrub.c		patch \| blob \| history
lustre/tests/conf-sanity.sh		patch \| blob \| history
lustre/tests/obdfilter-survey.sh		patch \| blob \| history
lustre/tests/sanity.sh		patch \| blob \| history
lustre/tests/test-framework.sh		patch \| blob \| history
lustre/utils/libmount_utils_zfs.c		patch \| blob \| history
rpm/kmp-lustre-osd-zfs.preamble		patch \| blob \| history