Whamcloud - gitweb
LU-18813 osd-wbcfs: MemFS-based OSD with writeback support 39/58439/33
authorQian Yingjin <qian@ddn.com>
Mon, 17 Mar 2025 15:41:17 +0000 (23:41 +0800)
committerOleg Drokin <green@whamcloud.com>
Wed, 14 May 2025 03:54:41 +0000 (03:54 +0000)
Implement a memory filesystem based OSD with writeback support for
Lustre.
It borrows lots of design from memory-based file systems such as
tmpfs/ramfs. The data is first written into the memory-based file
system (called MemFS for short). And then, the data can be flushed
to the persistent storage in a delayed write-back manner.

This patch implemented the basic functionality to store data in
MemFS. It can reuse lots of VFS codes in Linux kernel such as:
- Page caching for data;
- dcache for dentry management and lookup;
- icache for inode management and lookup;
- Writeback mechanism in Linux kernel

Test-Parameters: testlist=sanity fstype=wbcfs mdscount=1 mdtcount=1 osscount=4 ostcount=1
Test-Parameters: testlist=sanity fstype=wbcfs mdscount=1 mdtcount=1 osscount=4 ostcount=1
Test-Parameters: testlist=sanity fstype=wbcfs mdscount=4 mdtcount=1 osscount=4 ostcount=1
Test-Parameters: testlist=sanity fstype=wbcfs mdscount=4 mdtcount=1 osscount=4 ostcount=1
Test-Parameters: testlist=sanity fstype=wbcfs combinedmdsmgs=false standalonemgs=true mdscount=1 mdtcount=1 osscount=4 ostcount=1
Test-Parameters: testlist=sanity fstype=wbcfs combinedmdsmgs=false standalonemgs=true mdscount=1 mdtcount=1 osscount=4 ostcount=1
Test-Parameters: testlist=sanity fstype=wbcfs combinedmdsmgs=false standalonemgs=true mdscount=4 mdtcount=1 osscount=4 ostcount=1
Test-Parameters: testlist=sanity fstype=wbcfs combinedmdsmgs=false standalonemgs=true mdscount=4 mdtcount=1 osscount=4 ostcount=1
Signed-off-by: Yingjin Qian <qian@ddn.com>
Signed-off-by: Timothy Day <timday@amazon.com>
Change-Id: Ia07c1d95b7ad3f7f5e817a8de69d0a4ab6995ffa
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/58439
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Shaun Tancheff <shaun.tancheff@hpe.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
31 files changed:
MAINTAINERS
config/lustre-build.m4
config/lustre-core.m4
lustre.spec.in
lustre/Makefile.in
lustre/autoMakefile.am
lustre/include/obd.h
lustre/include/uapi/linux/lustre/lustre_disk.h
lustre/mgs/lproc_mgs.c
lustre/osd-wbcfs/Makefile.in [new file with mode: 0644]
lustre/osd-wbcfs/TODO [new file with mode: 0644]
lustre/osd-wbcfs/autoMakefile.am [new file with mode: 0644]
lustre/osd-wbcfs/index.h [new file with mode: 0644]
lustre/osd-wbcfs/osd_dirent.c [new file with mode: 0644]
lustre/osd-wbcfs/osd_handler.c [new file with mode: 0644]
lustre/osd-wbcfs/osd_hash.c [new file with mode: 0644]
lustre/osd-wbcfs/osd_index_hash.c [new file with mode: 0644]
lustre/osd-wbcfs/osd_internal.h [new file with mode: 0644]
lustre/osd-wbcfs/osd_io.c [new file with mode: 0644]
lustre/osd-wbcfs/osd_object.c [new file with mode: 0644]
lustre/osd-wbcfs/wbcfs.c [new file with mode: 0644]
lustre/osd-wbcfs/wbcfs.h [new file with mode: 0644]
lustre/tests/sanity.sh
lustre/tests/test-framework.sh
lustre/utils/Makefile.am
lustre/utils/libmount_utils_wbcfs.c [new file with mode: 0644]
lustre/utils/mount_lustre.c
lustre/utils/mount_utils.c
lustre/utils/mount_utils.h
rpm/kmp-lustre-osd-wbcfs.files [new file with mode: 0644]
rpm/kmp-lustre-osd-wbcfs.preamble [new file with mode: 0644]

index 9af2aec..4a4cd4b 100644 (file)
@@ -432,6 +432,14 @@ R: Olaf Faaland <faaland1@llnl.gov>
 S:     Maintained
 F:     lustre/osd-zfs/
 
+Lustre OSD wbcfs
+R:     Timothy Day <timday@amazon.com>
+R:     Yingjin Qian <qian@ddn.com>
+S:     Supported
+F:     Documentation/osd-api.txt
+F:     lustre/osd-wbcfs/
+F:     lustre/utils/libmount_utils_wbcfs.c
+
 Lustre Patch Commit Hooks
 R:     Andreas Dilger <adilger@whamcloud.com>
 S:     Odd Fixes
index 74b2821..bf31cb5 100644 (file)
@@ -477,7 +477,7 @@ AS_IF([test "x$enable_modules" = xyes], [
 AS_IF([test x$enable_ldiskfs = xno -a x$enable_zfs = xno], [
        AS_CASE([$enable_server],
                [maybe], [enable_server=no],
-               [yes], [AC_MSG_ERROR([cannot enable servers, no backends were configured])])
+               [yes], [AC_MSG_WARN([no backends were configured])])
        ], [
                AS_IF([test x$enable_server = xmaybe], [enable_server=yes])
        ])
index bb9bc69..25eab06 100644 (file)
@@ -2867,6 +2867,22 @@ AC_DEFUN([LC_GENL_FAMILY_HAS_RESV_START_OP], [
 ]) # LC_GENL_FAMILY_HAS_RESV_START_OP
 
 #
+# LC_HAVE_FS_CONTEXT_HEADER
+#
+# Kernel version 5.0-rc2 commit 9bc61ab18b1d41f26dc06b9e6d3c203e65f83fe6
+# vfs: Introduce fs_context, switch vfs_kern_mount() to it.
+#
+AC_DEFUN([LC_SRC_HAVE_FS_CONTEXT_HEADER], [
+       LB2_CHECK_LINUX_HEADER_SRC([linux/fs_context.h], [-Werror])
+])
+AC_DEFUN([LC_HAVE_FS_CONTEXT_HEADER], [
+       LB2_CHECK_LINUX_HEADER_RESULT([linux/fs_context.h], [
+               AC_DEFINE(HAVE_FS_CONTEXT_H, 1,
+                       [fs_context.h is present])
+       ])
+]) # LC_HAVE_FS_CONTEXT_HEADER
+
+#
 # LC_HAVE_BVEC_ITER_ALL
 #
 # kernel 5.1 commit 6dc4f100c175dd0511ae8674786e7c9006cdfbfa
@@ -5217,6 +5233,7 @@ AC_DEFUN([LC_PROG_LINUX_SRC], [
 
        # 5.0
        LC_SRC_GENL_FAMILY_HAS_RESV_START_OP
+       LC_SRC_HAVE_FS_CONTEXT_HEADER
 
        # 5.1
        LC_SRC_HAVE_BVEC_ITER_ALL
@@ -5543,6 +5560,7 @@ AC_DEFUN([LC_PROG_LINUX_RESULTS], [
 
        # 5.0
        LC_GENL_FAMILY_HAS_RESV_START_OP
+       LC_HAVE_FS_CONTEXT_HEADER
 
        # 5.1
        LC_HAVE_BVEC_ITER_ALL
@@ -6163,6 +6181,8 @@ lustre/osd-ldiskfs/Makefile
 lustre/osd-ldiskfs/autoMakefile
 lustre/osd-zfs/Makefile
 lustre/osd-zfs/autoMakefile
+lustre/osd-wbcfs/Makefile
+lustre/osd-wbcfs/autoMakefile
 lustre/mgc/Makefile
 lustre/mgc/autoMakefile
 lustre/mgs/Makefile
index 48fd059..69e8967 100644 (file)
@@ -243,6 +243,8 @@ Source17: kmp-lnet-kfilnd.preamble
 Source18: kmp-lnet-kfilnd.files
 Source19: kmp-lnet-in-kernel-o2iblnd.preamble
 Source20: kmp-lnet-in-kernel-o2iblnd.files
+Source21: kmp-lustre-osd-wbcfs.preamble
+Source22: kmp-lustre-osd-wbcfs.files
 URL: https://wiki.whamcloud.com/
 BuildRoot: %{_tmppath}/lustre-%{version}-root
 BuildRequires: libtool pkgconfig(yaml-0.1) pkgconfig(zlib) pkgconfig(libnl-3.0) flex bison
@@ -424,8 +426,34 @@ lustre tools (mount/mkfs) to provide support for ZFS.
 %endif
 # with zfs
 %endif
+
+%if 0%{?suse_version:1}
+%else
+%if %{with servers}
+%kernel_module_package -n %{name}-osd-wbcfs -p %SOURCE21 -f %SOURCE22 %{_flavor}
+%if %{with lustre_utils}
+%package osd-wbcfs-mount
+Summary: Lustre mount's wbcfs-specific helper library
+BuildRequires: pkgconfig(mount)
+Provides: %{name}-osd-mount = %{version}
+Obsoletes: lustre-osd-mount < %{version}
+Provides: %{name}-osd-mount = %{version}
+Provides: %{name}-osd-wbcfs-mount = %{version}
+Requires: %{name}-osd-wbcfs = %{version}
+
+%description osd-wbcfs-mount
+Provide a shared library (dso) that can be loaded into various
+lustre tools (mount/mkfs) to provide support for in-memory OSD
+with writeback support.
+
+# with lustre_utils
+%endif
+# with servers
+%endif
 # with lustre_modules
 %endif
+# suse
+%endif
 
 %if %{with servers}
 %package resource-agents
@@ -823,6 +851,13 @@ mv $basemodpath/fs/ldiskfs.ko $basemodpath-osd-ldiskfs/fs/ldiskfs.ko
 mkdir -p $basemodpath-osd-zfs/fs
 mv $basemodpath/fs/osd_zfs.ko $basemodpath-osd-zfs/fs/osd_zfs.ko
 %endif
+%if 0%{?suse_version:1}
+%else
+%if %{with servers}
+mkdir -p $basemodpath-osd-wbcfs/fs
+mv $basemodpath/fs/osd_wbcfs.ko $basemodpath-osd-wbcfs/fs/osd_wbcfs.ko
+%endif
+%endif
 %if %{with lustre_tests}
 mkdir -p $basemodpath-tests/fs
 mv $basemodpath/fs/obd_test.ko $basemodpath-tests/fs/obd_test.ko
@@ -1048,6 +1083,20 @@ echo '%{_libdir}/lustre/tests/lutf/*' >>lustre-tests.files
 %endif
 %endif
 
+%if %{with shared}
+%if 0%{?suse_version:1}
+%else
+%if %{with servers}
+%if %{with lustre_utils}
+%files osd-wbcfs-mount
+%defattr(-,root,root)
+%dir %{_libdir}/@PACKAGE@
+%{_libdir}/@PACKAGE@/mount_osd_wbcfs.so
+%endif
+%endif
+%endif
+%endif
+
 # with lustre_modules
 %endif
 
index 977d51d..cb63c14 100644 (file)
@@ -8,6 +8,7 @@ obj-m += ec/
 
 @TESTS_TRUE@obj-m += kunit/
 @SERVER_TRUE@obj-m += mgs/ mdt/ mdd/ ofd/ quota/ osp/ lod/ lfsck/ target/
+@SERVER_TRUE@obj-m += osd-wbcfs/
 @CLIENT_TRUE@obj-m += lov/ osc/ mdc/ lmv/ llite/ fld/
 @LDISKFS_ENABLED_TRUE@obj-m += osd-ldiskfs/
 @ZFS_ENABLED_TRUE@obj-m += osd-zfs/
index ad2390b..688fabd 100644 (file)
@@ -38,7 +38,7 @@ ALWAYS_SUBDIRS = include obdclass ldlm ptlrpc obdecho ec \
        mgc fid fld doc utils tests scripts conf
 
 SERVER_SUBDIRS = mgs mdt mdd ofd osd-zfs osd-ldiskfs \
-       quota osp lod target lfsck
+       quota osp lod target lfsck osd-wbcfs
 
 CLIENT_SUBDIRS = mdc lmv llite lov osc
 
index 92b8588..f75a646 100644 (file)
@@ -474,6 +474,7 @@ struct tgt_thread_big_cache {
 #define LUSTRE_MDD_NAME                "mdd"
 #define LUSTRE_OSD_LDISKFS_NAME        "osd-ldiskfs"
 #define LUSTRE_OSD_ZFS_NAME    "osd-zfs"
+#define LUSTRE_OSD_WBCFS_NAME  "osd-wbcfs"
 #define LUSTRE_VVP_NAME                "vvp"
 #define LUSTRE_LMV_NAME                "lmv"
 #define LUSTRE_SLP_NAME                "slp"
@@ -1527,4 +1528,9 @@ static inline struct inode *page2inode(struct page *page)
        }
 }
 
+static inline bool obd_is_osd_wbcfs(const struct obd_device *obd)
+{
+       return !strstr(obd->obd_name, LUSTRE_OSD_WBCFS_NAME);
+}
+
 #endif /* __OBD_H */
index 3fe114e..a39c436 100644 (file)
@@ -124,6 +124,7 @@ enum ldd_mount_type {
        LDD_MT_REISERFS = 3,
        LDD_MT_LDISKFS2 = 4,
        LDD_MT_ZFS = 5,
+       LDD_MT_WBCFS = 6,
        LDD_MT_LAST
 };
 
index 6be3812..bac5fe0 100644 (file)
@@ -307,6 +307,11 @@ int lproc_mgs_setup(struct mgs_device *mgs, const char *osd_name)
        debugfs_create_file("clear", 0644, obd->obd_debugfs_exports,
                            obd, &mgs_nid_stats_clear_fops);
 
+       /* TODO: OSD wbcfs does not have lprocfs. Add it later... */
+       osd_obd = mgs->mgs_bottom->dd_lu_dev.ld_obd;
+       if (obd_is_osd_wbcfs(osd_obd))
+               return 0;
+
        rc = sysfs_create_link(&obd->obd_kset.kobj, &mgs->mgs_bottom->dd_kobj,
                               "osd");
        if (rc) {
@@ -323,7 +328,6 @@ int lproc_mgs_setup(struct mgs_device *mgs, const char *osd_name)
        attr = get_attr_by_name(bottom_type, "mntdev");
        if (attr)
                mgs->mgs_fstype = mgs->mgs_mntdev;
-       osd_obd = mgs->mgs_bottom->dd_lu_dev.ld_obd;
        mgs->mgs_proc_osd = lprocfs_add_symlink("osd",
                                                obd->obd_proc_entry,
                                                "../../%s/%.*s",
diff --git a/lustre/osd-wbcfs/Makefile.in b/lustre/osd-wbcfs/Makefile.in
new file mode 100644 (file)
index 0000000..b3ff041
--- /dev/null
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0
+
+#
+# Copyright (c) 2025-2026, DDN/Whamcloud, Inc
+#
+
+MODULES := osd_wbcfs
+osd_wbcfs-objs := osd_handler.o osd_object.o osd_hash.o osd_index_hash.o
+osd_wbcfs-objs += osd_io.o osd_dirent.o wbcfs.o
+
+@INCLUDE_RULES@
diff --git a/lustre/osd-wbcfs/TODO b/lustre/osd-wbcfs/TODO
new file mode 100644 (file)
index 0000000..42b184b
--- /dev/null
@@ -0,0 +1,32 @@
+BACKGROUND
+----------
+
+Implement an MemFS-based OSD device with writeback support for Lustre.
+It borrows lots of design from memory-based file systems such as tmpfs/ramfs.
+The data is frist written into the memory-based file system (called MemFS in
+short). And then, the data can be persisted to the permant storage in a delayed
+writeback manner.
+
+ +---------------------------------------------------------+
+ | This is experimental! Do NOT use for important data!    |
+ | Only bugs and data corruption lie ahead! Turn back now! |
+ +---------------------------------------------------------+
+
+For questions, please contact:
+- Yingjin Qian <qian@ddn.com>
+- Timothy Day <timday@amazon.com>
+
+TODO
+----
+- Inode and space usage accounting for statfs() system call.
+- Limiting for inodes and blocks.
+- Refine the mount command support for MemFS-based OSD.
+- lprocfs support. Track OSD stats and access them via lprocfs.
+- Use Maple Tree in new kernel to manage and access entries within a directory.
+- Implement the functionality needed by LFSCK.
+- Quota support.
+- Swap space support for large files.
+- Metadata on MemFS; Data on Persistent storage
+  (just like PCC naming with FID for data).
+- Writeback support with ldiskfs/ZFS or KV store as persistent backends.
+- Add transcation support.
diff --git a/lustre/osd-wbcfs/autoMakefile.am b/lustre/osd-wbcfs/autoMakefile.am
new file mode 100644 (file)
index 0000000..ba4574e
--- /dev/null
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0
+
+#
+# Copyright (c) 2025-2026, DDN/Whamcloud, Inc.
+#
+
+if MODULES
+modulefs_DATA = osd_wbcfs.ko
+endif
+
+MOSTLYCLEANFILES := @MOSTLYCLEANFILES@
+EXTRA_DIST := $(osd_wbcfs-objs:%.o=%.c) osd_internal.h wbcfs.h index.h
diff --git a/lustre/osd-wbcfs/index.h b/lustre/osd-wbcfs/index.h
new file mode 100644 (file)
index 0000000..867c85c
--- /dev/null
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Copyright (c) 2025-2026, DDN/Whamcloud, Inc.
+ */
+
+/*
+ * Index Access Module.
+ *
+ * Author: Yingjin Qian <qian@ddn.com>
+ */
+
+#ifndef __OSD_INDEX_H_
+#define __OSD_INDEX_H_
+
+#include <linux/rhashtable.h>
+
+/* Store key and value together in @he_buf. */
+struct hash_index_entry {
+       struct rhash_head       he_hash;
+       struct list_head        he_list_item;
+       __u64                   he_offset;
+       size_t                  he_len;
+       size_t                  he_keylen;
+       char                    he_buf[];
+};
+
+/* Index access via @rhashtable. */
+struct hash_index {
+       struct rhashtable               hi_htbl;
+       struct rhashtable_params        hi_htbl_params;
+       struct list_head                hi_list;
+       size_t                          hi_reclen;
+       __u64                           hi_next_offset;
+};
+
+int hash_index_init(struct hash_index *hind, size_t kenlen, size_t reclen);
+void hash_index_fini(struct hash_index *hind);
+struct hash_index_entry *hash_index_lookup_entry(struct hash_index *hind,
+                                                const void *key);
+int hash_index_lookup(struct hash_index *hind, const void *key, void *rec);
+int hash_index_insert(struct hash_index *hind, void *key, size_t keylen,
+                     void *rec, size_t reclen);
+void hash_index_remove(struct hash_index *hind, const void *key);
+
+/* TODO: Index access via Maple Tree. Only support in newer kernels. */
+
+#endif /* __OSD_INDEX_H_ */
diff --git a/lustre/osd-wbcfs/osd_dirent.c b/lustre/osd-wbcfs/osd_dirent.c
new file mode 100644 (file)
index 0000000..2926494
--- /dev/null
@@ -0,0 +1,821 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (c) 2025-2026, DDN/Whamcloud, Inc.
+ */
+
+/*
+ * Author: Yingjin Qian <qian@ddn.com>
+ */
+
+#define DEBUG_SUBSYSTEM        S_OSD
+
+#include <lustre_crypto.h>
+
+#include "osd_internal.h"
+#include "wbcfs.h"
+
+/* Lookup the directory entry (dentry) specified by @key. */
+static int osd_index_dir_lookup(const struct lu_env *env, struct dt_object *dt,
+                               struct dt_rec *rec, const struct dt_key *key)
+{
+       struct osd_object *pobj = osd_dt_obj(dt);
+       struct inode *dir = pobj->oo_inode;
+       struct lu_fid *fid = (struct lu_fid *)rec;
+       char *name = (char *)key;
+       struct dentry *parent;
+       struct dentry *dchild;
+       struct qstr qstr;
+       int rc = 0;
+
+       ENTRY;
+
+       LASSERT(S_ISDIR(dir->i_mode));
+       parent = d_find_any_alias(dir);
+       if (IS_ERR(parent))
+               RETURN(PTR_ERR(parent));
+
+       /* FIXME: more checking for ".." lookup. */
+       if (strcmp(name, "..") == 0) {
+               *fid = MEMFS_I(d_inode(parent->d_parent))->mei_fid;
+               GOTO(out, rc = 1);
+       }
+
+       qstr.name = name;
+       qstr.len = strlen(name);
+       qstr.hash = ll_full_name_hash(parent, qstr.name, qstr.len);
+       dchild = d_lookup(parent, &qstr);
+       if (dchild) {
+               *fid = MEMFS_I(d_inode(dchild))->mei_fid;
+               dput(dchild);
+               rc = 1;
+       }
+
+out:
+       CDEBUG(D_CACHE, "%s: lookup '%s' from parent %pd@%pK "DFID": rc=%d\n",
+              osd_name(osd_obj2dev(pobj)), name, parent, parent,
+              PFID(fid), rc);
+       dput(parent);
+       RETURN(rc);
+}
+
+/**
+ * osd_index_dir_insert() - Index add function.
+ * @key: it is key i.e. file entry to be inserted
+ * @record: it is value of given key i.e. fid
+ *
+ * It will add the directory entry.This entry is needed to
+ * maintain name->fid mapping.
+ *
+ * Return:
+ * * %0 - on success
+ * * %-ve - on error
+ */
+static int osd_index_dir_insert(const struct lu_env *env, struct dt_object *dt,
+                               const struct dt_rec *record,
+                               const struct dt_key *key,
+                               struct thandle *th)
+{
+       struct osd_object *pobj = osd_dt_obj(dt);
+       struct osd_device *osd = osd_dev(dt->do_lu.lo_dev);
+       struct dt_insert_rec *rec = (struct dt_insert_rec *)record;
+       const struct lu_fid *fid = rec->rec_fid;
+       const char *name = (const char *)key;
+       struct inode *dir = pobj->oo_inode;
+       struct dentry *parent;
+       struct dentry *dentry;
+       struct dentry *dchild = NULL;
+       struct inode *inode;
+       struct qstr dname;
+       bool nedir_rename = false;
+       int rc = 0;
+
+       ENTRY;
+
+       if (!dt_object_exists(dt))
+               RETURN(-ENOENT);
+
+       LASSERT(!dt_object_remote(dt));
+       LASSERTF(fid_is_sane(fid), "fid "DFID" is insane!\n", PFID(fid));
+
+       /* Skip "." and ".." in MemFS. */
+       if (name[0] == '.' && (name[1] == '\0' ||
+                              (name[1] == '.' && name[2] == '\0')))
+               RETURN(0);
+
+       /* FIXME: handle remote object in DNE environment. */
+       /* TODO: Store inode in @osd_thread_info? */
+       inode = ilookup5(osd_sb(osd), lu_fid_build_ino(fid, 0),
+                        memfs_test_inode_by_fid, (void *)fid);
+       if (!inode) {
+               rc = -EINVAL;
+               CERROR("%s: lookup "DFID" from icache failed: rc=%d\n",
+                      osd_name(osd_obj2dev(pobj)), PFID(fid), rc);
+               RETURN(rc);
+       }
+
+       parent = d_find_any_alias(dir);
+       if (parent == NULL) {
+               rc = -ENOENT;
+               CERROR("%s: Cannot find dentry for inode@%pK "DFID": rc=%d\n",
+                      osd_name(osd_obj2dev(pobj)), dir,
+                      PFID(lu_object_fid(&pobj->oo_dt.do_lu)), rc);
+               GOTO(out_iput, rc);
+       }
+
+       dname.name = name;
+       dname.len = strlen(name);
+       dname.hash = ll_full_name_hash(parent, dname.name, dname.len);
+
+       dentry = d_alloc(parent, &dname);
+       if (!dentry)
+               GOTO(out_dput, rc = -ENOMEM);
+
+       switch (inode->i_mode & S_IFMT) {
+       case S_IFDIR:
+               /*
+                * TODO: Store these info into OSD thread info @osd_thread_info,
+                * thus we can do undo (recovery) operations upon failure.
+                */
+               dchild = d_find_any_alias(inode);
+               /* mv (rename) a non-empty directory. */
+               if (dchild && !simple_empty(dchild))
+                       nedir_rename = true;
+               fallthrough;
+       case S_IFREG:
+               dir->i_size += BOGO_DIRENT_SIZE;
+               inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
+               break;
+       case S_IFLNK:
+               /* FIXME: symlink support. */
+               CERROR("%s: symlink does not support\n",
+                      osd_name(osd_obj2dev(pobj)));
+               break;
+       default:
+               LBUG();
+       }
+
+       inode_inc_iversion(dir);
+       if (nedir_rename) {
+               d_move(dchild, dentry);
+               /* Put the refcount obtained by @d_find_any_alias() */
+               dput(dchild);
+               /* Finally release the @dentry. */
+               dput(dentry);
+       } else {
+               /* Add dentry into dentry hashtable for VFS lookup. */
+               d_add(dentry, inode);
+               ihold(inode);
+       }
+       /* Extra count (already obtain in @d_alloc) - pin the dentry in core */
+       /* dget(dentry); */
+
+       CDEBUG(D_CACHE,
+              "%s: Insert dirent "DFID"/%pd@%pK inode@%pK nlink=%d\n",
+              osd_name(osd_obj2dev(pobj)), PFID(fid), dentry, dentry,
+              inode, inode->i_nlink);
+out_dput:
+
+       dput(parent);
+out_iput:
+       iput(inode);
+
+       RETURN(rc);
+}
+
+/*
+ * Index delete funtion.
+ * It will remove the directory entry added by index insert.
+ * This entry is needed to maintain name->fid mapping.
+ */
+static int osd_index_dir_delete(const struct lu_env *env, struct dt_object *dt,
+                               const struct dt_key *key, struct thandle *th)
+{
+       struct osd_object *pobj = osd_dt_obj(dt);
+       struct inode *dir = pobj->oo_inode;
+       char *name = (char *)key;
+       struct dentry *parent;
+       struct dentry *dentry;
+       struct inode *inode;
+       struct qstr qstr;
+       bool nedir_rename = false;
+       int rc = 0;
+
+       ENTRY;
+
+       /* Skip "." and ".." in MemFS. */
+       if (name[0] == '.' && (name[1] == '\0' ||
+                              (name[1] == '.' && name[2] == '\0')))
+               RETURN(0);
+
+       parent = d_find_any_alias(dir);
+       if (parent == NULL && strcmp(name, "..") == 0) {
+               CDEBUG(D_CACHE, "%s: delete name %s from an empty dir@%pK\n",
+                      osd_name(osd_obj2dev(pobj)), name, dir);
+               RETURN(0);
+       }
+
+       if (parent == NULL) {
+               CDEBUG(D_CACHE, "%s: delete name %s from an empty dir@%pK\n",
+                      osd_name(osd_obj2dev(pobj)), name, dir);
+               RETURN(-ENOENT);
+       }
+
+       LASSERTF(parent != NULL, "dir@%pK name %s\n", dir, name);
+
+       qstr.name = name;
+       qstr.len = strlen(name);
+       qstr.hash = ll_full_name_hash(parent, qstr.name, qstr.len);
+       dentry = d_lookup(parent, &qstr);
+       if (dentry == NULL) {
+               CDEBUG(D_CACHE, "%s: cannot find %s from parent@%pK %pd\n",
+                      osd_name(osd_obj2dev(pobj)), name, dir, parent);
+               GOTO(out_dput_parent, rc = -ENOENT);
+       }
+
+       LASSERT(dentry != NULL);
+       inode = d_inode(dentry);
+
+       switch (inode->i_mode & S_IFMT) {
+       case S_IFDIR:
+               /*
+                * FIXME: rename() operation, @dentry may be not empty:
+                * (sanity/214).
+                * TODO: Put @dir_rename and @dentry into OSD thread info.
+                */
+               if (!simple_empty(dentry))
+                       nedir_rename = true;
+
+               /*
+                * MDD layer drops @nlink later via @dt_ref_del().
+                * drop_nlink(inode);
+                * drop_nlink(dir);
+                */
+               fallthrough;
+       case S_IFREG:
+       case S_IFLNK:
+               dir->i_size -= BOGO_DIRENT_SIZE;
+               inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
+               inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir,
+                                       inode_set_ctime_current(inode)));
+               inode_inc_iversion(dir);
+               /* MDD layer drops @nlink later via @dt_ref_del(). */
+               /* drop_nlink(inode); */
+               /*
+                * Undo the count from "create".
+                * Unhash the dentry from the parent dentry hashtable which is
+                * add by @d_add(), so that it would not be found through a VFS
+                * lookup anymore.
+                * Unpin/drop the dentry from dcache.
+                */
+               if (!nedir_rename)
+                       dput(dentry);
+               break;
+       default:
+               LBUG();
+       }
+
+       CDEBUG(D_CACHE,
+              "%s: Delete %s from dir@%pK %pd inode@%pK nlink=%d %d: rc=%d.\n",
+              osd_name(osd_obj2dev(pobj)), name, dir, parent, inode,
+              inode->i_nlink, dentry->d_lockref.count, rc);
+       dput(dentry);
+out_dput_parent:
+       dput(parent);
+       RETURN(rc);
+}
+
+static struct osd_it *
+__osd_dir_it_init(const struct lu_env *env, struct osd_device *dev,
+                 struct inode *inode, u32 attr)
+{
+       struct osd_it *oit;
+       struct file *file;
+       int rc;
+
+       ENTRY;
+
+       OBD_SLAB_ALLOC_PTR_GFP(oit, osd_it_cachep, GFP_NOFS);
+       if (oit == NULL)
+               RETURN(ERR_PTR(-ENOMEM));
+
+       /* TODO: store buffer as thread context data @osd_thread_info. */
+       OBD_ALLOC(oit->oit_buf, OSD_IT_BUFSIZE);
+       if (!oit->oit_buf)
+               GOTO(out_free, rc = -ENOMEM);
+
+       oit->oit_obj = NULL;
+       file = &oit->oit_file;
+       /* Only FMODE_64BITHASH or FMODE_32BITHASH should be set, NOT both. */
+       if (attr & LUDA_64BITHASH)
+               file->f_mode |= FMODE_64BITHASH;
+       else
+               file->f_mode |= FMODE_32BITHASH;
+       file->f_path.dentry = d_find_any_alias(inode);
+       file->f_flags = O_NOATIME | __FMODE_NONOTIFY;
+       file->f_mapping = inode->i_mapping;
+       file->f_op = inode->i_fop;
+       file->f_inode = inode;
+
+       if (file->f_op->open) {
+               rc = file->f_op->open(inode, file);
+               if (rc) {
+                       dput(file->f_path.dentry);
+                       GOTO(out_free, rc);
+               }
+       }
+
+       RETURN(oit);
+
+out_free:
+       OBD_SLAB_FREE_PTR(oit, osd_it_cachep);
+       return ERR_PTR(rc);
+}
+
+/**
+ * osd_dir_it_init() - Creates or initializes iterator context.
+ *
+ * Returns: struct osd_it, iterator structure on success
+ */
+static struct dt_it *osd_dir_it_init(const struct lu_env *env,
+                                    struct dt_object *dt, __u32 attr)
+{
+       struct osd_object *obj = osd_dt_obj(dt);
+       struct osd_device *dev = osd_obj2dev(obj);
+       struct lu_object *lo = &dt->do_lu;
+       struct osd_it *oit;
+
+       ENTRY;
+
+       if (!dt_object_exists(dt) || obj->oo_destroyed)
+               RETURN(ERR_PTR(-ENOENT));
+
+       oit = __osd_dir_it_init(env, dev, obj->oo_inode, attr);
+       if (IS_ERR(oit))
+               RETURN(ERR_CAST(oit));
+
+       oit->oit_obj = obj;
+       lu_object_get(lo);
+       RETURN((struct dt_it *)oit);
+}
+
+/**
+ * osd_dir_it_fini() - Destroy or finishes iterator context.
+ * @di: iterator structure to be destroyed
+ */
+static void osd_dir_it_fini(const struct lu_env *env, struct dt_it *di)
+{
+       struct osd_it *oit = (struct osd_it *)di;
+       struct osd_object *obj = oit->oit_obj;
+       struct inode *inode = obj->oo_inode;
+
+       ENTRY;
+
+       dput(oit->oit_file.f_path.dentry);
+       oit->oit_file.f_op->release(inode, &oit->oit_file);
+       OBD_FREE(oit->oit_buf, OSD_IT_BUFSIZE);
+       OBD_SLAB_FREE_PTR(oit, osd_it_cachep);
+
+       osd_object_put(env, obj);
+
+       EXIT;
+}
+
+
+/*
+ * It position the iterator at given key, so that next lookup continues from
+ * that key Or it is similar to dio_it->load() but based on a key,
+ * rather than file position.
+ *
+ * As a special convention, osd_it_ea_get(env, di, "") has to rewind iterator
+ * to the beginning.
+ *
+ * TODO: Presently return 1 considering it is only used by mdd_dir_is_empty().
+ */
+static int osd_dir_it_get(const struct lu_env *env,
+                         struct dt_it *di, const struct dt_key *key)
+{
+       struct osd_it *it = (struct osd_it *)di;
+       struct file *file = &it->oit_file;
+
+       ENTRY;
+
+       LASSERT(((const char *)key)[0] == '\0');
+       if (file->f_op->llseek) {
+               loff_t offset;
+
+               offset = file->f_op->llseek(file, 0, 0);
+               if (offset != 0)
+                       CWARN("Failed to llseek(): offset %lld != 0\n", offset);
+       } else {
+               it->oit_file.f_pos = 0;
+       }
+
+       it->oit_rd_dirent = 0;
+       it->oit_it_dirent = 0;
+       it->oit_dirent = NULL;
+
+       RETURN(1);
+}
+
+/* Does nothing */
+static void osd_dir_it_put(const struct lu_env *env, struct dt_it *di)
+{
+}
+
+/**
+ * osd_memfs_filldir() - It is called internally by ->iterate*()
+ * @buf: in which information to be filled in.
+ * @name: name of the file in given dir
+ *
+ * It fills the iterator's in-memory data structure with required
+ * information i.e. name, namelen, rec_size etc.
+ *
+ * Returns:
+ * * %0 - on success
+ * * %1 - on buffer full
+ */
+#ifdef HAVE_FILLDIR_USE_CTX
+static FILLDIR_TYPE do_osd_memfs_filldir(struct dir_context *ctx,
+#else
+static int osd_memfs_filldir(void *ctx,
+#endif
+                            const char *name, int namelen,
+                            loff_t offset, __u64 ino, unsigned int d_type)
+{
+       struct memfs_dir_context *mctx = (struct memfs_dir_context *)ctx;
+       struct osd_it *oit = (struct osd_it *)mctx->cbdata;
+       struct osd_object *obj = oit->oit_obj;
+       struct osd_it_dirent *ent = oit->oit_dirent;
+       struct lu_fid *fid = &ent->oitd_fid;
+       char *buf = oit->oit_buf;
+
+       ENTRY;
+
+       /* This should never happen */
+       if (unlikely(namelen == 0 || namelen > NAME_MAX)) {
+               CERROR("MemFS return invalid namelen %d\n", namelen);
+               RETURN(-EIO);
+       }
+
+       /* Check for enough space. Note oitd_name is not NUL terminated. */
+       if (&ent->oitd_name[namelen] > buf + OSD_IT_BUFSIZE)
+               RETURN(1);
+
+       /* "." is just the object itself. */
+       if (namelen == 1 && name[0] == '.') {
+               if (obj != NULL)
+                       *fid = obj->oo_dt.do_lu.lo_header->loh_fid;
+       } else if (namelen == 2 && name[0] == '.' && name[1] == '.') {
+               if (obj != NULL) {
+                       struct inode *inode = obj->oo_inode;
+                       struct dentry *dentry;
+                       struct dentry *parent;
+
+                       LASSERT(S_ISDIR(inode->i_mode));
+                       dentry = d_find_any_alias(inode);
+                       parent = dentry->d_parent;
+                       *fid = MEMFS_I(d_inode(parent))->mei_fid;
+                       dput(dentry);
+               }
+       } else if (mctx->dentry) {
+               *fid = MEMFS_I(d_inode(mctx->dentry))->mei_fid;
+       } else {
+               fid_zero(fid);
+       }
+
+       /* NOT export local root. */
+       if (obj != NULL &&
+           unlikely(osd_sb(osd_obj2dev(obj))->s_root->d_inode->i_ino == ino)) {
+               ino = obj->oo_inode->i_ino;
+               *fid = obj->oo_dt.do_lu.lo_header->loh_fid;
+       }
+
+       if (obj == NULL || !(obj->oo_lma_flags & LUSTRE_ENCRYPT_FL)) {
+               ent->oitd_namelen = namelen;
+               memcpy(ent->oitd_name, name, namelen);
+       } else {
+               int encoded_namelen = critical_chars(name, namelen);
+
+               /* Check again for enough space. */
+               if (&ent->oitd_name[encoded_namelen] > buf + OSD_IT_BUFSIZE)
+                       RETURN(1);
+
+               ent->oitd_namelen = encoded_namelen;
+
+               if (encoded_namelen == namelen)
+                       memcpy(ent->oitd_name, name, namelen);
+               else
+                       critical_encode(name, namelen, ent->oitd_name);
+       }
+
+       ent->oitd_ino = ino;
+       ent->oitd_off = offset;
+       ent->oitd_type = d_type;
+
+       oit->oit_rd_dirent++;
+       oit->oit_dirent = (void *)ent +
+                         round_up(sizeof(*ent) + ent->oitd_namelen, 8);
+       CDEBUG(D_DENTRY, "Filldir: fid="DFID" name=%s off=%llu rd_dirent=%u\n",
+              PFID(fid), name, offset, oit->oit_rd_dirent);
+       RETURN(0);
+}
+
+WRAP_FILLDIR_FN(do_, osd_memfs_filldir)
+
+/**
+ * osd_memfs_it_fill() - Calls ->iterate*() to load a directory entry at
+ * a time and stored it in iterator's in-memory data structure.
+ * @di: iterator's in memory structure
+ *
+ * Returns:
+ * * %0 - on success
+ * * %-ve - on error
+ * * %1 - reach the end of entry
+ */
+static int osd_memfs_it_fill(const struct lu_env *env, const struct dt_it *di)
+{
+       struct osd_it *it = (struct osd_it *)di;
+       struct file *filp = &it->oit_file;
+       struct inode *dir = file_inode(filp);
+       struct memfs_dir_context mctx = {
+               .super.actor = osd_memfs_filldir,
+               .dentry = NULL,
+               .cbdata = it
+       };
+       int rc = 0;
+
+       ENTRY;
+
+       it->oit_dirent = it->oit_buf;
+       it->oit_rd_dirent = 0;
+
+#ifdef HAVE_FOP_ITERATE_SHARED
+       inode_lock_shared(dir);
+#else
+       inode_lock(dir);
+#endif
+       if (!IS_DEADDIR(dir)) {
+               if (filp->f_op->iterate_shared) {
+                       mctx.super.pos = filp->f_pos;
+                       rc = filp->f_op->iterate_shared(filp, &mctx.super);
+                       filp->f_pos = mctx.super.pos;
+               } else {
+#ifdef HAVE_FOP_READDIR
+                       rc = filp->f_op->readdir(filp, &mctx.super,
+                                                mctx.super.actor);
+                       mctx.super.pos = filp->f_pos;
+#else
+                       rc = -ENOTDIR;
+#endif
+               }
+       }
+#ifdef HAVE_FOP_ITERATE_SHARED
+       inode_unlock_shared(dir);
+#else
+       inode_unlock(dir);
+#endif
+       if (rc)
+               RETURN(rc);
+
+       if (it->oit_rd_dirent == 0) {
+               /*
+                * If it does not get any dirent, it means it has been reached
+                * to the end of the dir
+                */
+               it->oit_file.f_pos = MEMFS_DIR_EOF;
+               rc = 1;
+       } else {
+               it->oit_dirent = it->oit_buf;
+               it->oit_it_dirent = 1;
+       }
+
+       RETURN(rc);
+}
+
+/**
+ * osd_dir_it_next() - It calls osd_memfs_it_fill() which will use
+ * ->iterate*() to load a directory entry at a time and stored it in
+ * iterator's in-memory data structure.
+ * @di: iterator's in memory structure
+ *
+ * Returns:
+ * * %ve - iterator reached to end
+ * * %0 - iterator not reached to end
+ * * %-ve - on error
+ */
+static int osd_dir_it_next(const struct lu_env *env, struct dt_it *di)
+{
+       struct osd_it *it = (struct osd_it *)di;
+       int rc;
+
+       ENTRY;
+
+       if (it->oit_it_dirent < it->oit_rd_dirent) {
+               it->oit_dirent =
+                       (void *)it->oit_dirent +
+                       round_up(sizeof(struct osd_it_dirent) +
+                                      it->oit_dirent->oitd_namelen, 8);
+               it->oit_it_dirent++;
+               rc = 0;
+       } else {
+               if (it->oit_file.f_pos == MEMFS_DIR_EOF)
+                       rc = 1;
+               else
+                       rc = osd_memfs_it_fill(env, di);
+       }
+
+       RETURN(rc);
+}
+
+/**
+ * osd_dir_it_key() - Returns the key at current position from
+ * iterator's in memory structure.
+ * @di: iterator's in memory structure
+ *
+ * Returns: key i.e. struct dt_key on success
+ */
+static struct dt_key *osd_dir_it_key(const struct lu_env *env,
+                                    const struct dt_it *di)
+{
+       struct osd_it *it = (struct osd_it *)di;
+
+       return (struct dt_key *)it->oit_dirent->oitd_name;
+}
+
+/**
+ * osd_dir_it_key_size() - Returns key's size at current position
+ * from iterator's in memory structure.
+ * @di: iterator's in memory structure
+ *
+ * Returns: key_size i.e. struct dt_key on success
+ */
+static int osd_dir_it_key_size(const struct lu_env *env, const struct dt_it *di)
+{
+       struct osd_it *it = (struct osd_it *)di;
+
+       return it->oit_dirent->oitd_namelen;
+}
+
+static inline void
+osd_it_append_attrs(struct lu_dirent *ent, int len, __u16 type)
+{
+       /* check if file type is required */
+       if (ent->lde_attrs & LUDA_TYPE) {
+               struct luda_type *lt;
+               int align = sizeof(*lt) - 1;
+
+               len = (len + align) & ~align;
+               lt = (struct luda_type *)(ent->lde_name + len);
+               lt->lt_type = cpu_to_le16(DTTOIF(type));
+       }
+
+       ent->lde_attrs = cpu_to_le32(ent->lde_attrs);
+}
+
+/*
+ * build lu direct from backend fs dirent.
+ */
+static inline void
+osd_it_pack_dirent(struct lu_dirent *ent, struct lu_fid *fid, __u64 offset,
+                  char *name, __u16 namelen, __u16 type, __u32 attr)
+{
+       ent->lde_attrs = attr | LUDA_FID;
+       fid_cpu_to_le(&ent->lde_fid, fid);
+
+       ent->lde_hash = cpu_to_le64(offset);
+       ent->lde_reclen = cpu_to_le16(lu_dirent_calc_size(namelen, attr));
+
+       strncpy(ent->lde_name, name, namelen);
+       ent->lde_name[namelen] = '\0';
+       ent->lde_namelen = cpu_to_le16(namelen);
+
+       /* append lustre attributes */
+       osd_it_append_attrs(ent, namelen, type);
+}
+
+/**
+ * osd_dir_it_rec() - Returns the value at current position from
+ * iterator's in memory structure.
+ * @di:        struct osd_it, iterator's in memory structure
+ * @dtrec: lustre dirent
+ * @attr: attr requested for dirent.
+ *
+ * Returns:
+ * %0 - no error and \param lde has correct lustre dirent.
+ * %-ve - on error
+ */
+static inline int osd_dir_it_rec(const struct lu_env *env,
+                                const struct dt_it *di,
+                                struct dt_rec *dtrec, __u32 attr)
+{
+       struct osd_it *it = (struct osd_it *)di;
+       struct lu_fid *fid = &it->oit_dirent->oitd_fid;
+       struct lu_dirent *lde = (struct lu_dirent *)dtrec;
+
+       ENTRY;
+
+       /* TODO: lfsck checking support.*/
+
+       attr &= ~LU_DIRENT_ATTRS_MASK;
+       /* Pack the entry anyway, at least the offset is right. */
+       osd_it_pack_dirent(lde, fid, it->oit_dirent->oitd_off,
+                          it->oit_dirent->oitd_name,
+                          it->oit_dirent->oitd_namelen,
+                          it->oit_dirent->oitd_type, attr);
+
+       RETURN(0);
+}
+
+/**
+ * osd_dir_it_rec_size() - Returns the record size at current position.
+ * @env: execution environment
+ * @di: iterator's in memory structure
+ * @attr: attribute of the entry, only requires LUDA_TYPE to
+ *        calculate the lu_dirent size.
+ *
+ * This function will return record(lu_dirent) size in bytes.
+ *
+ * Returns: record size(in bytes & in memory) of the current lu_dirent
+ *          entry.
+ */
+static int osd_dir_it_rec_size(const struct lu_env *env, const struct dt_it *di,
+                              __u32 attr)
+{
+       struct osd_it *it = (struct osd_it *)di;
+
+       return lu_dirent_calc_size(it->oit_dirent->oitd_namelen, attr);
+}
+
+/**
+ * osd_dir_it_store() - Returns a cookie for current position of the iterator
+ * head, so that user can use this cookie to load/start the iterator next
+ * time.
+ * @di: iterator's in memory structure
+ *
+ * Returns: cookie for current position, on success
+ */
+static __u64 osd_dir_it_store(const struct lu_env *env, const struct dt_it *di)
+{
+       struct osd_it *it = (struct osd_it *)di;
+
+       return it->oit_dirent->oitd_off;
+}
+
+/**
+ * osd_dir_it_load() - It calls osd_memfs_it_fill() which will use
+ * ->iterate*() to load a directory entry at a time and stored it
+ * in iterator's in-memory data structure.
+ * @di: struct osd_it, iterator's in memory structure
+ *
+ * Returns:
+ * * %ve - on success
+ * * %-ve - on error
+ */
+static int osd_dir_it_load(const struct lu_env *env,
+                          const struct dt_it *di, __u64 hash)
+{
+       struct osd_it *it = (struct osd_it *)di;
+       struct file *file = &it->oit_file;
+       loff_t offset;
+       int rc;
+
+       ENTRY;
+
+       if (file->f_op->llseek) {
+               offset = file->f_op->llseek(file, hash, 0);
+               if (offset != hash)
+                       CWARN("Failed to llseek(): offset %lld != hash %llu\n",
+                             offset, hash);
+       } else {
+               it->oit_file.f_pos = hash;
+       }
+
+       rc = osd_memfs_it_fill(env, di);
+       if (rc > 0)
+               rc = -ENODATA;
+
+       if (rc == 0)
+               rc = 1;
+
+       RETURN(rc);
+}
+
+const struct dt_index_operations osd_dir_ops = {
+       .dio_lookup             = osd_index_dir_lookup,
+       .dio_insert             = osd_index_dir_insert,
+       .dio_delete             = osd_index_dir_delete,
+       .dio_it = {
+               .init           = osd_dir_it_init,
+               .fini           = osd_dir_it_fini,
+               .get            = osd_dir_it_get,
+               .put            = osd_dir_it_put,
+               .next           = osd_dir_it_next,
+               .key            = osd_dir_it_key,
+               .key_size       = osd_dir_it_key_size,
+               .rec            = osd_dir_it_rec,
+               .rec_size       = osd_dir_it_rec_size,
+               .store          = osd_dir_it_store,
+               .load           = osd_dir_it_load
+       }
+};
diff --git a/lustre/osd-wbcfs/osd_handler.c b/lustre/osd-wbcfs/osd_handler.c
new file mode 100644 (file)
index 0000000..9404211
--- /dev/null
@@ -0,0 +1,680 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * wbcFS OSD module
+ *
+ * Author: Yingjin Qian <qian@ddn.com>
+ */
+
+#define DEBUG_SUBSYSTEM        S_OSD
+
+#include <libcfs/libcfs.h>
+#include <obd_support.h>
+#include <md_object.h>
+#include <obd_class.h>
+
+#include "osd_internal.h"
+#include "wbcfs.h"
+
+struct kmem_cache *osd_it_cachep;
+struct kmem_cache *osd_hash_it_cachep;
+
+static struct lu_kmem_descr wbcfs_caches[] = {
+       {
+               .ckd_cache = &osd_it_cachep,
+               .ckd_name  = "osd_it_cache",
+               .ckd_size  = sizeof(struct osd_it)
+       },
+       {
+               .ckd_cache = &osd_hash_it_cachep,
+               .ckd_name  = "osd_hash_it_cache",
+               .ckd_size  = sizeof(struct osd_hash_it)
+       },
+       {
+               .ckd_cache = NULL
+       }
+};
+
+/* Copied form osd-ldiskfs to open/put file handle in kenrel. */
+struct work_struct flush_fput;
+atomic_t descriptors_cnt;
+unsigned int wbcfs_flush_descriptors_cnt = 5000;
+
+#ifdef HAVE_FLUSH_DELAYED_FPUT
+# define cfs_flush_delayed_fput() flush_delayed_fput()
+#else
+void (*cfs_flush_delayed_fput)(void);
+#endif /* HAVE_FLUSH_DELAYED_FPUT */
+
+static void osd_flush_fput(struct work_struct *work)
+{
+       /* flush file descriptors when too many files */
+       CDEBUG_LIMIT(D_HA, "Flushing file descriptors limit %d\n",
+                    wbcfs_flush_descriptors_cnt);
+
+       /* descriptors_cnt triggers the threshold when a flush is started,
+        * but all pending descriptors will be flushed each time, so it
+        * doesn't need to exactly match the number of descriptors.
+        */
+       atomic_set(&descriptors_cnt, 0);
+       cfs_flush_delayed_fput();
+}
+
+static struct lu_object *osd_object_alloc(const struct lu_env *env,
+                                         const struct lu_object_header *hdr,
+                                         struct lu_device *d)
+{
+       struct osd_object *obj;
+       struct lu_object *l;
+
+       OBD_ALLOC_PTR(obj);
+       if (!obj)
+               return NULL;
+
+       l = &obj->oo_dt.do_lu;
+       dt_object_init(&obj->oo_dt, NULL, d);
+       obj->oo_header = NULL;
+       obj->oo_dt.do_ops = &osd_obj_ops;
+       l->lo_ops = &osd_lu_obj_ops;
+       spin_lock_init(&obj->oo_guard);
+       init_rwsem(&obj->oo_dt.dd_sem);
+       init_rwsem(&obj->oo_sem);
+       return l;
+}
+
+static int osd_shutdown(const struct lu_env *env, struct osd_device *osd)
+{
+       seq_target_fini(env, &osd->od_dt_dev);
+       return 0;
+}
+
+static int osd_mount(const struct lu_env *env,
+                    struct osd_device *osd, struct lustre_cfg *cfg)
+{
+       struct file_system_type *type;
+       struct inode *inode;
+       unsigned long flags = 0;
+       struct lu_fid fid;
+       int rc = 0;
+
+       ENTRY;
+
+       if (osd->od_mnt != NULL)
+               RETURN(0);
+
+       type = get_fs_type("wbcfs");
+       if (type == NULL) {
+               CERROR("%s: Cannot find wbcfs FS type.\n", osd_name(osd));
+               RETURN(-ENODEV);
+       }
+
+       flags |= SB_KERNMOUNT;
+       osd->od_mnt = vfs_kern_mount(type, flags, NULL, NULL);
+       module_put(type->owner);
+
+       if (IS_ERR(osd->od_mnt)) {
+               rc = PTR_ERR(osd->od_mnt);
+               osd->od_mnt = NULL;
+               CERROR("%s: Failed to mount wbcfs in kernel: rc=%d\n",
+                      osd_name(osd), rc);
+               RETURN(rc);
+       }
+
+       inode = osd_sb(osd)->s_root->d_inode;
+       lu_local_obj_fid(&fid, OSD_FS_ROOT_OID);
+       inode->i_ino = lu_fid_build_ino(&fid, 0);
+       inode->i_generation = lu_fid_build_gen(&fid);
+       MEMFS_I(inode)->mei_fid = fid;
+       __insert_inode_hash(inode, inode->i_ino);
+
+       RETURN(rc);
+}
+
+static int osd_process_config(const struct lu_env *env,
+                             struct lu_device *d, struct lustre_cfg *cfg)
+{
+       struct osd_device *osd = osd_dev(d);
+       int count;
+       int rc;
+
+       ENTRY;
+
+       switch (cfg->lcfg_command) {
+       case LCFG_SETUP:
+               rc = osd_mount(env, osd, cfg);
+               break;
+       case LCFG_CLEANUP:
+               /*
+                * For the case LCFG_PRE_CLEANUP is not called in advance,
+                * that may happen if hit failure during mount process.
+                */
+               lu_dev_del_linkage(d->ld_site, d);
+               rc = osd_shutdown(env, osd);
+               break;
+       case LCFG_PARAM:
+               LASSERT(&osd->od_dt_dev);
+               count = class_modify_config(cfg, PARAM_OSD,
+                                           &osd->od_dt_dev.dd_kobj);
+               if (count < 0)
+                       count = class_modify_config(cfg, PARAM_OST,
+                                                   &osd->od_dt_dev.dd_kobj);
+               rc = count > 0 ? 0 : count;
+               break;
+       case LCFG_PRE_CLEANUP:
+               rc = 0;
+               break;
+       default:
+               rc = -EOPNOTSUPP;
+       }
+
+       RETURN(rc);
+}
+
+static int osd_recovery_complete(const struct lu_env *env, struct lu_device *d)
+{
+       RETURN(0);
+}
+
+static int osd_prepare(const struct lu_env *env, struct lu_device *pdev,
+                      struct lu_device *dev)
+{
+       struct osd_device *osd = osd_dev(dev);
+       int rc = 0;
+
+       rc = seq_target_init(env, &osd->od_dt_dev, osd->od_svname,
+                            osd->od_is_ost);
+
+       RETURN(rc);
+}
+
+const struct lu_device_operations osd_lu_ops = {
+       .ldo_object_alloc       = osd_object_alloc,
+       .ldo_process_config     = osd_process_config,
+       .ldo_recovery_complete  = osd_recovery_complete,
+       .ldo_prepare            = osd_prepare,
+       .ldo_fid_alloc          = fid_alloc_generic,
+};
+
+static int osd_root_get(const struct lu_env *env,
+                       struct dt_device *dev, struct lu_fid *f)
+{
+       lu_local_obj_fid(f, OSD_FS_ROOT_OID);
+       return 0;
+}
+
+static int osd_statfs(const struct lu_env *env, struct dt_device *d,
+                     struct obd_statfs *sfs, struct obd_statfs_info *info)
+{
+       struct osd_device *osd = osd_dt_dev(d);
+       struct super_block *sb = osd_sb(osd);
+       struct kstatfs ksfs;
+       int rc;
+
+       if (unlikely(!sb))
+               return -EINPROGRESS;
+
+       memset(&ksfs, 0, sizeof(ksfs));
+       rc = sb->s_op->statfs(sb->s_root, &ksfs);
+       if (rc)
+               RETURN(rc);
+
+       statfs_pack(sfs, &ksfs);
+       if (unlikely(sb->s_flags & SB_RDONLY))
+               sfs->os_state |= OS_STATFS_READONLY;
+
+       if (sfs->os_blocks == 0) {
+               sfs->os_blocks = memfs_default_max_blocks();
+               sfs->os_bfree = sfs->os_blocks;
+               sfs->os_bavail = sfs->os_bfree;
+       }
+
+       if (sfs->os_files == 0) {
+               sfs->os_files = memfs_default_max_inodes();
+               sfs->os_ffree = sfs->os_files;
+       }
+
+       sfs->os_state |= OS_STATFS_NONROT;
+       sfs->os_namelen = NAME_MAX;
+       sfs->os_maxbytes = sb->s_maxbytes;
+
+       return 0;
+}
+
+static struct thandle *osd_trans_create(const struct lu_env *env,
+                                       struct dt_device *d)
+{
+       struct osd_thandle *oh;
+       struct thandle *th;
+
+       ENTRY;
+
+       if (d->dd_rdonly) {
+               CERROR("%s: someone try to start transaction under readonly mode, should be disabled.\n",
+                      osd_name(osd_dt_dev(d)));
+               dump_stack();
+               RETURN(ERR_PTR(-EROFS));
+       }
+
+       sb_start_write(osd_sb(osd_dt_dev(d)));
+
+       OBD_ALLOC_PTR(oh);
+       if (!oh) {
+               sb_end_write(osd_sb(osd_dt_dev(d)));
+               RETURN(ERR_PTR(-ENOMEM));
+       }
+
+       th = &oh->ot_super;
+       th->th_dev = d;
+       th->th_result = 0;
+       INIT_LIST_HEAD(&oh->ot_commit_dcb_list);
+       INIT_LIST_HEAD(&oh->ot_stop_dcb_list);
+
+       RETURN(th);
+}
+
+static int osd_trans_start(const struct lu_env *env, struct dt_device *d,
+                          struct thandle *th)
+{
+       int rc;
+
+       ENTRY;
+
+       rc = dt_txn_hook_start(env, d, th);
+       RETURN(rc);
+}
+
+static void osd_trans_commit_cb(struct osd_thandle *oh, int result)
+{
+       struct thandle *th = &oh->ot_super;
+       struct dt_txn_commit_cb *dcb, *tmp;
+
+       /* call per-transaction callbacks if any */
+       list_for_each_entry_safe(dcb, tmp, &oh->ot_commit_dcb_list,
+                                dcb_linkage) {
+               LASSERTF(dcb->dcb_magic == TRANS_COMMIT_CB_MAGIC,
+                        "commit callback entry: magic=%x name='%s'\n",
+                        dcb->dcb_magic, dcb->dcb_name);
+               list_del_init(&dcb->dcb_linkage);
+               dcb->dcb_func(NULL, th, dcb, result);
+       }
+}
+
+static void osd_trans_stop_cb(struct osd_thandle *oh, int result)
+{
+       struct thandle *th = &oh->ot_super;
+       struct dt_txn_commit_cb *dcb, *tmp;
+
+       /* call per-transaction stop callbacks if any */
+       list_for_each_entry_safe(dcb, tmp, &oh->ot_stop_dcb_list,
+                                dcb_linkage) {
+               LASSERTF(dcb->dcb_magic == TRANS_COMMIT_CB_MAGIC,
+                        "commit callback entry: magic=%x name='%s'\n",
+                        dcb->dcb_magic, dcb->dcb_name);
+               list_del_init(&dcb->dcb_linkage);
+               dcb->dcb_func(NULL, th, dcb, result);
+       }
+}
+
+static int osd_trans_stop(const struct lu_env *env, struct dt_device *dt,
+                         struct thandle *th)
+{
+       struct osd_device *osd = osd_dt_dev(th->th_dev);
+       struct osd_thandle *oh;
+       int rc = 0;
+
+       ENTRY;
+       oh = container_of(th, struct osd_thandle, ot_super);
+
+       rc = dt_txn_hook_stop(env, th);
+       if (rc)
+               CERROR("%s: failed in transaction hook: rc=%d\n",
+                      osd_name(osd), rc);
+
+       osd_trans_stop_cb(oh, rc);
+       /* FIXME: using th->th_result? */
+       osd_trans_commit_cb(oh, rc);
+       sb_end_write(osd_sb(osd));
+
+       th->th_dev = NULL;
+       OBD_FREE_PTR(oh);
+       RETURN(rc);
+}
+
+static int osd_trans_cb_add(struct thandle *th, struct dt_txn_commit_cb *dcb)
+{
+       struct osd_thandle *oh = container_of(th, struct osd_thandle,
+                                             ot_super);
+
+       LASSERT(dcb->dcb_magic == TRANS_COMMIT_CB_MAGIC);
+       LASSERT(&dcb->dcb_func != NULL);
+
+       if (dcb->dcb_flags & DCB_TRANS_STOP)
+               list_add(&dcb->dcb_linkage, &oh->ot_stop_dcb_list);
+       else
+               list_add(&dcb->dcb_linkage, &oh->ot_commit_dcb_list);
+
+       return 0;
+}
+
+static void osd_conf_get(const struct lu_env *env,
+                        const struct dt_device *dev,
+                        struct dt_device_param *param)
+{
+       struct osd_device *osd = osd_dt_dev(dev);
+       struct super_block *sb = osd_sb(osd);
+
+       param->ddp_max_name_len = NAME_MAX;
+       param->ddp_max_nlink = 1 << 31;
+       param->ddp_symlink_max = sb->s_blocksize;
+       param->ddp_mount_type = LDD_MT_WBCFS;
+       param->ddp_maxbytes = sb->s_maxbytes;
+       param->ddp_max_extent_blks = 1024;
+       param->ddp_extent_tax = 1024;
+
+       param->ddp_mntopts = MNTOPT_USERXATTR;
+
+       /* TODO: Add support for MNTOPT_ACL. */
+
+       param->ddp_max_ea_size = OBD_MAX_EA_SIZE;
+       param->ddp_inodespace = 1024;
+       param->ddp_brw_size = DT_DEF_BRW_SIZE;
+
+       param->ddp_has_lseek_data_hole = true;
+}
+
+static int osd_ro(const struct lu_env *env, struct dt_device *d)
+{
+       int rc = -EOPNOTSUPP;
+
+       ENTRY;
+
+       CERROR("%s: cannot be set readonly: rc=%d\n",
+              osd_dt_dev(d)->od_svname, rc);
+
+       RETURN(rc);
+}
+
+static int osd_reserve_or_free_quota(const struct lu_env *env,
+                                    struct dt_device *dev,
+                                    struct lquota_id_info *qi)
+{
+       RETURN(0);
+}
+
+static int osd_sync(const struct lu_env *env, struct dt_device *d)
+{
+       RETURN(0);
+}
+
+static int osd_commit_async(const struct lu_env *env, struct dt_device *dev)
+{
+       RETURN(0);
+}
+
+static const struct dt_device_operations osd_dt_ops = {
+       .dt_root_get              = osd_root_get,
+       .dt_statfs                = osd_statfs,
+       .dt_trans_create          = osd_trans_create,
+       .dt_trans_start           = osd_trans_start,
+       .dt_trans_stop            = osd_trans_stop,
+       .dt_trans_cb_add          = osd_trans_cb_add,
+       .dt_conf_get              = osd_conf_get,
+       .dt_ro                    = osd_ro,
+       .dt_reserve_or_free_quota = osd_reserve_or_free_quota,
+       .dt_sync                  = osd_sync,
+       .dt_commit_async          = osd_commit_async,
+};
+
+static void osd_umount(const struct lu_env *env, struct osd_device *dev)
+{
+       ENTRY;
+
+       if (dev->od_mnt) {
+               shrink_dcache_sb(osd_sb(dev));
+               mntput(dev->od_mnt);
+               dev->od_mnt = NULL;
+       }
+
+       /* to be sure all delayed fput are finished. */
+       cfs_flush_delayed_fput();
+
+       EXIT;
+}
+
+static int __osd_device_init(const struct lu_env *env, struct osd_device *osd,
+                            struct lustre_cfg *cfg)
+{
+       struct lu_device *ld = osd2lu_dev(osd);
+       int cplen = 0;
+       int rc;
+
+       rc = lu_env_refill((struct lu_env *)env);
+       if (rc)
+               RETURN(rc);
+
+       ld->ld_ops = &osd_lu_ops;
+       osd->od_dt_dev.dd_ops = &osd_dt_ops;
+
+       cplen = strscpy(osd->od_svname, lustre_cfg_string(cfg, 4),
+                       sizeof(osd->od_svname));
+       if (cplen < 0)
+               GOTO(out, rc = cplen);
+
+       /* -1 means that index is invalid. */
+       osd->od_index = -1;
+       rc = server_name2index(osd->od_svname, &osd->od_index, NULL);
+       if (rc == LDD_F_SV_TYPE_OST)
+               osd->od_is_ost = 1;
+
+       rc = osd_mount(env, osd, cfg);
+       if (rc)
+               GOTO(out, rc);
+
+       rc = lu_site_init(&osd->od_site, ld);
+       if (rc)
+               GOTO(out_mnt, rc);
+       osd->od_site.ls_bottom_dev = ld;
+
+       rc = lu_site_init_finish(&osd->od_site);
+       if (rc)
+               GOTO(out_site, rc);
+
+       RETURN(0);
+
+out_site:
+       lu_site_fini(&osd->od_site);
+out_mnt:
+       osd_umount(env, osd);
+out:
+       return rc;
+}
+
+static struct lu_device *osd_device_alloc(const struct lu_env *env,
+                                         struct lu_device_type *t,
+                                         struct lustre_cfg *cfg)
+{
+       struct osd_device *osd;
+       int rc;
+
+       ENTRY;
+
+       OBD_ALLOC_PTR(osd);
+       if (osd == NULL)
+               RETURN(ERR_PTR(-ENOMEM));
+
+       rc = dt_device_init(&osd->od_dt_dev, t);
+       if (unlikely(rc)) {
+               OBD_FREE_PTR(osd);
+               GOTO(out, rc);
+       }
+
+       rc = __osd_device_init(env, osd, cfg);
+out:
+       RETURN(rc == 0 ? osd2lu_dev(osd) : ERR_PTR(rc));
+}
+
+static struct lu_device *osd_device_free(const struct lu_env *env,
+                                        struct lu_device *d)
+{
+       struct osd_device *osd = osd_dev(d);
+
+       ENTRY;
+
+       /* XXX: make osd top device in order to release reference */
+       d->ld_site->ls_top_dev = d;
+       lu_site_purge(env, d->ld_site, -1);
+       lu_site_print(env, d->ld_site, &d->ld_site->ls_obj_hash.nelems,
+                     D_ERROR, lu_cdebug_printer);
+
+       lu_site_fini(&osd->od_site);
+       dt_device_fini(&osd->od_dt_dev);
+       OBD_FREE_PTR(osd);
+
+       RETURN(NULL);
+}
+
+static int osd_device_init(const struct lu_env *env, struct lu_device *d,
+                          const char *name, struct lu_device *next)
+{
+       return 0;
+}
+
+static struct lu_device *osd_device_fini(const struct lu_env *env,
+                                        struct lu_device *d)
+{
+       struct osd_device *osd = osd_dev(d);
+
+       ENTRY;
+
+       osd_shutdown(env, osd);
+       osd_umount(env, osd);
+       RETURN(NULL);
+}
+
+static const struct lu_device_type_operations osd_device_type_ops = {
+       .ldto_device_alloc      = osd_device_alloc,
+       .ldto_device_free       = osd_device_free,
+       .ldto_device_init       = osd_device_init,
+       .ldto_device_fini       = osd_device_fini
+};
+
+static struct lu_device_type osd_device_type = {
+       .ldt_tags       = LU_DEVICE_DT,
+       .ldt_name       = LUSTRE_OSD_WBCFS_NAME,
+       .ldt_ops        = &osd_device_type_ops,
+       .ldt_ctx_tags   = LCT_LOCAL
+};
+
+/* We use exports to track all osd users. */
+static int osd_obd_connect(const struct lu_env *env, struct obd_export **exp,
+                          struct obd_device *obd, struct obd_uuid *cluuid,
+                          struct obd_connect_data *data, void *localdata)
+{
+       struct osd_device *osd = osd_dev(obd->obd_lu_dev);
+       struct lustre_handle conn;
+       int rc;
+
+       ENTRY;
+
+       CDEBUG(D_CONFIG, "connect #%d\n", atomic_read(&osd->od_connects));
+
+       rc = class_connect(&conn, obd, cluuid);
+       if (rc)
+               RETURN(rc);
+
+       *exp = class_conn2export(&conn);
+       atomic_inc(&osd->od_connects);
+
+       RETURN(0);
+}
+
+/*
+ * Once last export (we do not count self-export) disappeared,
+ * OSD can be released.
+ */
+static int osd_obd_disconnect(struct obd_export *exp)
+{
+       struct obd_device *obd = exp->exp_obd;
+       struct osd_device *osd = osd_dev(obd->obd_lu_dev);
+       int rc, release = 0;
+
+       ENTRY;
+
+       /* Only disconnect the underlying layers on the final disconnect. */
+       release = atomic_dec_and_test(&osd->od_connects);
+       rc = class_disconnect(exp);
+
+       if (rc == 0 && release)
+               class_manual_cleanup(obd);
+
+       RETURN(rc);
+}
+
+static int osd_health_check(const struct lu_env *env, struct obd_device *obd)
+{
+       struct osd_device *osd = osd_dev(obd->obd_lu_dev);
+       struct super_block *sb = osd_sb(osd);
+
+       return (!sb || sb->s_flags & SB_RDONLY);
+}
+
+static const struct obd_ops osd_obd_device_ops = {
+       .o_owner        = THIS_MODULE,
+       .o_connect      = osd_obd_connect,
+       .o_disconnect   = osd_obd_disconnect,
+       .o_health_check = osd_health_check,
+};
+
+static int __init osd_init(void)
+{
+       int rc;
+
+       rc = libcfs_setup();
+       if (rc)
+               return rc;
+
+       rc = lu_kmem_init(wbcfs_caches);
+       if (rc)
+               return rc;
+
+       rc = memfs_init();
+       if (rc)
+               GOTO(out_kmem, rc);
+
+       rc = class_register_type(&osd_obd_device_ops, NULL, true,
+                                LUSTRE_OSD_WBCFS_NAME, &osd_device_type);
+       if (rc)
+               GOTO(out_memfs, rc);
+
+#ifndef HAVE_FLUSH_DELAYED_FPUT
+       if (unlikely(cfs_flush_delayed_fput == NULL))
+               cfs_flush_delayed_fput =
+                       cfs_kallsyms_lookup_name("flush_delayed_fput");
+#endif
+
+       INIT_WORK(&flush_fput, osd_flush_fput);
+
+       return 0;
+
+out_memfs:
+       memfs_fini();
+out_kmem:
+       lu_kmem_fini(wbcfs_caches);
+       return rc;
+}
+
+static void __exit osd_exit(void)
+{
+       cancel_work_sync(&flush_fput);
+       class_unregister_type(LUSTRE_OSD_WBCFS_NAME);
+       memfs_fini();
+       lu_kmem_fini(wbcfs_caches);
+}
+
+MODULE_AUTHOR("Yingjin Qian <qian@ddn.com>");
+MODULE_DESCRIPTION("Lustre Object Storage Device ("LUSTRE_OSD_WBCFS_NAME")");
+MODULE_VERSION(LUSTRE_VERSION_STRING);
+MODULE_LICENSE("GPL");
+
+module_init(osd_init);
+module_exit(osd_exit);
diff --git a/lustre/osd-wbcfs/osd_hash.c b/lustre/osd-wbcfs/osd_hash.c
new file mode 100644 (file)
index 0000000..1398f81
--- /dev/null
@@ -0,0 +1,177 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (c) 2024-2025, Amazon and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2025-2026, DDN/Whamcloud, Inc.
+ */
+
+/*
+ * Hash index with FIXED key length.
+ * Traverse the index via linear list scanning.
+ *
+ * Author: Timothy Day <timday@amazon.com>
+ * Author: Yingjin Qian <qian@ddn.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_OSD
+
+#include <libcfs/libcfs.h>
+#include <obd_support.h>
+
+#include "index.h"
+
+static u32 hash_index_keyhash(const void *data, u32 len, u32 seed)
+{
+       return jhash(data, len, seed);
+}
+
+static u32 hash_index_entry_keyhash(const void *data, u32 len, u32 seed)
+{
+       struct hash_index_entry *entry = (struct hash_index_entry *)data;
+
+       return hash_index_keyhash(&entry->he_buf, entry->he_keylen, seed);
+}
+
+static int hash_index_keycmp(struct rhashtable_compare_arg *arg,
+                            const void *obj)
+{
+       struct hash_index_entry *entry = (struct hash_index_entry *)obj;
+
+       LASSERT(arg->ht->key_len == entry->he_keylen);
+
+       if (!memcpy(entry->he_buf, arg->key, entry->he_keylen))
+               return 0;
+
+       /* ESRCH is typical for rhashtable */
+       return -ESRCH;
+}
+
+static const struct rhashtable_params hash_index_params = {
+       .head_offset            = offsetof(struct hash_index_entry, he_hash),
+       .hashfn                 = hash_index_keyhash,
+       .obj_hashfn             = hash_index_entry_keyhash,
+       .obj_cmpfn              = hash_index_keycmp,
+       .automatic_shrinking    = true,
+};
+
+int hash_index_init(struct hash_index *hind, size_t keylen, size_t reclen)
+{
+       int rc;
+
+       LASSERT(keylen > 0);
+       INIT_LIST_HEAD(&hind->hi_list);
+       hind->hi_htbl_params = hash_index_params;
+       hind->hi_htbl_params.key_len = keylen;
+       hind->hi_reclen = reclen;
+       rc = rhashtable_init(&hind->hi_htbl, &hind->hi_htbl_params);
+       return rc;
+}
+
+void hash_index_fini(struct hash_index *hind)
+{
+       struct hash_index_entry *entry, *tmp;
+
+       if (!hind)
+               return;
+
+       list_for_each_entry_safe(entry, tmp, &hind->hi_list, he_list_item) {
+               rhashtable_remove_fast(&hind->hi_htbl, &entry->he_hash,
+                                      hind->hi_htbl_params);
+               list_del(&entry->he_list_item);
+               OBD_FREE(entry, entry->he_len);
+       }
+
+       rhashtable_destroy(&hind->hi_htbl);
+}
+
+struct hash_index_entry *
+hash_index_lookup_entry(struct hash_index *hind, const void *key)
+{
+       struct hash_index_entry *entry;
+
+       entry = rhashtable_lookup_fast(&hind->hi_htbl, key,
+                                      hind->hi_htbl_params);
+       return entry;
+}
+
+int hash_index_lookup(struct hash_index *hind, const void *key, void *rec)
+{
+       struct hash_index_entry *entry;
+       int rc = 0;
+
+       entry = rhashtable_lookup_fast(&hind->hi_htbl, key,
+                                      hind->hi_htbl_params);
+       if (entry) {
+               size_t reclen;
+
+               reclen = entry->he_len - sizeof(*entry) - entry->he_keylen;
+               LASSERT(ergo(hind->hi_reclen, hind->hi_reclen == reclen));
+               memcpy(rec, entry->he_buf + entry->he_keylen, reclen);
+               return 1;
+       }
+
+       return rc;
+}
+
+int hash_index_insert(struct hash_index *hind, void *key, size_t keylen,
+                     void *rec, size_t reclen)
+{
+       struct hash_index_entry *entry;
+       size_t len;
+       int rc = 0;
+
+       ENTRY;
+
+       if (!keylen)
+               keylen = hind->hi_htbl_params.key_len;
+       else
+               LASSERT(keylen == hind->hi_htbl_params.key_len);
+       if (!reclen)
+               reclen = hind->hi_reclen;
+       else
+               LASSERT(reclen == hind->hi_reclen);
+
+       len = sizeof(*entry) + keylen + reclen;
+       OBD_ALLOC(entry, len);
+       if (!entry)
+               RETURN(-ENOMEM);
+
+       entry->he_len = len;
+       entry->he_keylen = keylen;
+       memcpy(entry->he_buf, key, keylen);
+       memcpy(entry->he_buf + keylen, rec, reclen);
+
+       rc = rhashtable_insert_fast(&hind->hi_htbl, &entry->he_hash,
+                                   hind->hi_htbl_params);
+       LASSERT(rc != -EBUSY);
+       if (rc)
+               GOTO(out_free, rc);
+
+       list_add_tail(&entry->he_list_item, &hind->hi_list);
+
+       /* TODO: Rollover? Should at least add detection... */
+       entry->he_offset = hind->hi_next_offset++;
+       RETURN(0);
+
+out_free:
+       OBD_FREE(entry, len);
+       RETURN(rc);
+}
+
+void hash_index_remove(struct hash_index *hind, const void *key)
+{
+       struct hash_index_entry *entry;
+
+       entry = rhashtable_lookup_fast(&hind->hi_htbl, key,
+                                      hind->hi_htbl_params);
+       if (!entry)
+               return;
+
+       rhashtable_remove_fast(&hind->hi_htbl, &entry->he_hash,
+                              hind->hi_htbl_params);
+       /* FIXME: use RCU for list insert/remove. */
+       list_del(&entry->he_list_item);
+       OBD_FREE(entry, entry->he_len);
+}
diff --git a/lustre/osd-wbcfs/osd_index_hash.c b/lustre/osd-wbcfs/osd_index_hash.c
new file mode 100644 (file)
index 0000000..886e9a2
--- /dev/null
@@ -0,0 +1,299 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (c) 2024-2025, Amazon and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2025-2026, DDN/Whamcloud, Inc.
+ */
+
+/*
+ * Index Access Module.
+ *
+ * Author: Timothy Day <timday@amazon.com>
+ * Author: Yingjin Qian <qian@ddn.com>
+ */
+
+#define DEBUG_SUBSYSTEM        S_OSD
+
+#include <libcfs/libcfs.h>
+#include <obd_support.h>
+#include <obd.h>
+#include <obd_class.h>
+
+#include "osd_internal.h"
+#include "wbcfs.h"
+
+static int osd_hash_index_lookup(const struct lu_env *env, struct dt_object *dt,
+                                struct dt_rec *rec, const struct dt_key *key)
+{
+       struct osd_object *obj = osd_dt_obj(dt);
+       struct hash_index *hind = &MEMFS_I(obj->oo_inode)->mei_hash_index;
+       int rc;
+
+       ENTRY;
+
+       down_read(&obj->oo_sem);
+       rc = hash_index_lookup(hind, (void *)key, rec);
+       up_read(&obj->oo_sem);
+
+       RETURN(rc);
+}
+
+static int
+osd_hash_index_insert(const struct lu_env *env, struct dt_object *dt,
+                     const struct dt_rec *rec, const struct dt_key *key,
+                     struct thandle *th)
+{
+       struct osd_object *obj = osd_dt_obj(dt);
+       struct hash_index *hind = &MEMFS_I(obj->oo_inode)->mei_hash_index;
+       int rc;
+
+       ENTRY;
+
+       down_write(&obj->oo_sem);
+       rc = hash_index_insert(hind, (void *)key, 0, (void *)rec, 0);
+       up_write(&obj->oo_sem);
+       RETURN(rc);
+}
+
+static int osd_hash_index_delete(const struct lu_env *env, struct dt_object *dt,
+                                const struct dt_key *key, struct thandle *th)
+{
+       struct osd_object *obj = osd_dt_obj(dt);
+       struct hash_index *hind = &MEMFS_I(obj->oo_inode)->mei_hash_index;
+
+       ENTRY;
+
+       down_write(&obj->oo_sem);
+       hash_index_remove(hind, (void *)key);
+       up_write(&obj->oo_sem);
+
+       RETURN(0);
+}
+
+static struct dt_it *osd_hash_index_it_init(const struct lu_env *env,
+                                           struct dt_object *dt, __u32 unused)
+{
+       struct osd_object *obj = osd_dt_obj(dt);
+       struct hash_index *hind = &MEMFS_I(obj->oo_inode)->mei_hash_index;
+       struct osd_hash_it *it;
+
+       ENTRY;
+
+       if (obj->oo_destroyed)
+               RETURN(ERR_PTR(-ENOENT));
+
+       OBD_SLAB_ALLOC_PTR(it, osd_hash_it_cachep);
+       if (!it)
+               RETURN(ERR_PTR(-ENOMEM));
+
+       /* FIXME: race between concurrent iterating and deleting */
+       it->hit_cursor = &hind->hi_list;
+       it->hit_obj = obj;
+
+       RETURN((struct dt_it *)it);
+}
+
+static void osd_hash_index_it_fini(const struct lu_env *env,
+                                  struct dt_it *di)
+{
+       struct osd_hash_it *it = (struct osd_hash_it *)di;
+
+       ENTRY;
+       OBD_SLAB_FREE_PTR(it, osd_hash_it_cachep);
+       EXIT;
+}
+
+static int osd_hash_index_it_get(const struct lu_env *env, struct dt_it *di,
+                                const struct dt_key *key)
+{
+       struct osd_hash_it *it = (struct osd_hash_it *)di;
+       struct osd_object *obj = it->hit_obj;
+       struct hash_index_entry *entry;
+       struct hash_index *hind;
+       size_t keylen;
+       int rc = -EIO;
+
+       ENTRY;
+
+       if (obj->oo_destroyed)
+               RETURN(-ENOENT);
+
+       hind = &MEMFS_I(obj->oo_inode)->mei_hash_index;
+       keylen = hind->hi_htbl_params.key_len;
+
+       down_read(&obj->oo_sem);
+       list_for_each_entry(entry, &hind->hi_list, he_list_item) {
+               if (memcmp(key, entry->he_buf, keylen) == 0) {
+                       it->hit_cursor = &entry->he_list_item;
+                       rc = 0;
+                       break;
+               }
+       }
+       up_read(&obj->oo_sem);
+
+       RETURN(rc);
+}
+
+/* TODO: remove and make fp optional. */
+static void osd_hash_index_it_put(const struct lu_env *env, struct dt_it *di)
+{
+}
+
+static int osd_hash_index_it_next(const struct lu_env *env, struct dt_it *di)
+{
+       struct osd_hash_it *it = (struct osd_hash_it *)di;
+       struct osd_object *obj = it->hit_obj;
+       struct hash_index *hind;
+       int rc = 0;
+
+       ENTRY;
+
+       if (obj->oo_destroyed)
+               RETURN(-ENOENT);
+
+       hind = &MEMFS_I(obj->oo_inode)->mei_hash_index;
+       down_read(&obj->oo_sem);
+       it->hit_cursor = it->hit_cursor->next;
+       if (it->hit_cursor == &hind->hi_list)
+               rc = 1;
+       up_read(&obj->oo_sem);
+       RETURN(rc);
+}
+
+static struct dt_key *osd_hash_index_it_key(const struct lu_env *env,
+                                           const struct dt_it *di)
+{
+       struct osd_hash_it *it = (struct osd_hash_it *)di;
+       struct osd_object *obj = it->hit_obj;
+       struct hash_index_entry *entry;
+
+       ENTRY;
+
+       if (obj->oo_destroyed)
+               RETURN(ERR_PTR(-ENOENT));
+
+       entry = container_of(it->hit_cursor, struct hash_index_entry,
+                            he_list_item);
+       RETURN((struct dt_key *)entry->he_buf);
+}
+
+static int osd_hash_index_it_key_size(const struct lu_env *env,
+                                     const struct dt_it *di)
+{
+       struct osd_hash_it *it = (struct osd_hash_it *)di;
+       struct osd_object *obj = it->hit_obj;
+
+       RETURN(MEMFS_I(obj->oo_inode)->mei_hash_index.hi_htbl_params.key_len);
+}
+
+static int osd_hash_index_it_rec(const struct lu_env *env,
+                                const struct dt_it *di, struct dt_rec *rec,
+                                __u32 attr)
+{
+       struct osd_hash_it *it = (struct osd_hash_it *)di;
+       struct osd_object *obj = it->hit_obj;
+       struct hash_index_entry *entry;
+       struct hash_index *hind;
+       size_t reclen;
+
+       ENTRY;
+
+       hind = &MEMFS_I(obj->oo_inode)->mei_hash_index;
+       /* FIXME: use RCU to avoid concurrent operations on the list. */
+       entry = container_of(it->hit_cursor, struct hash_index_entry,
+                            he_list_item);
+       reclen = entry->he_len - sizeof(*entry) - entry->he_keylen;
+       LASSERT(ergo(hind->hi_reclen, hind->hi_reclen == reclen));
+       memcpy(rec, entry->he_buf + entry->he_keylen, reclen);
+       RETURN(0);
+}
+
+static int osd_hash_index_it_rec_size(const struct lu_env *env,
+                                     const struct dt_it *di, __u32 attr)
+{
+       struct osd_hash_it *it = (struct osd_hash_it *)di;
+       struct osd_object *obj = it->hit_obj;
+       struct hash_index_entry *entry;
+       struct hash_index *hind;
+       size_t reclen;
+
+       ENTRY;
+
+       hind = &MEMFS_I(obj->oo_inode)->mei_hash_index;
+       if (hind->hi_reclen == 0) {
+               entry = container_of(it->hit_cursor, struct hash_index_entry,
+                                    he_list_item);
+               reclen = entry->he_len - sizeof(*entry) - entry->he_keylen;
+       } else {
+               reclen = hind->hi_reclen;
+       }
+
+       RETURN(reclen);
+}
+
+static __u64 osd_hash_index_it_store(const struct lu_env *env,
+                                    const struct dt_it *di)
+{
+       struct osd_hash_it *it = (struct osd_hash_it *)di;
+       struct hash_index_entry *entry;
+
+       ENTRY;
+
+       entry = container_of(it->hit_cursor, struct hash_index_entry,
+                            he_list_item);
+       RETURN(entry->he_offset);
+}
+
+static int osd_hash_index_it_load(const struct lu_env *env,
+                                 const struct dt_it *di, __u64 hash)
+{
+       struct osd_hash_it *it = (struct osd_hash_it *)di;
+       struct osd_object *obj = it->hit_obj;
+       struct hash_index_entry *entry;
+       struct hash_index *hind;
+       int rc = 1;
+
+       ENTRY;
+
+       hind = &MEMFS_I(obj->oo_inode)->mei_hash_index;
+       if (hash == 0) {
+               it->hit_cursor = &hind->hi_list;
+               it->hit_cursor = it->hit_cursor->next;
+               if (it->hit_cursor == &hind->hi_list)
+                       rc = 0;
+
+               RETURN(rc);
+       }
+
+       /* TODO: A linear scan is not efficient, will use Maple Tree instead. */
+       list_for_each_entry(entry, &hind->hi_list, he_list_item) {
+               if (entry->he_offset == hash) {
+                       it->hit_cursor = &entry->he_list_item;
+                       rc = 1;
+                       break;
+               }
+       }
+
+       RETURN(rc);
+}
+
+const struct dt_index_operations osd_hash_index_ops = {
+       .dio_lookup             = osd_hash_index_lookup,
+       .dio_insert             = osd_hash_index_insert,
+       .dio_delete             = osd_hash_index_delete,
+       .dio_it = {
+               .init           = osd_hash_index_it_init,
+               .fini           = osd_hash_index_it_fini,
+               .get            = osd_hash_index_it_get,
+               .put            = osd_hash_index_it_put,
+               .next           = osd_hash_index_it_next,
+               .key            = osd_hash_index_it_key,
+               .key_size       = osd_hash_index_it_key_size,
+               .rec            = osd_hash_index_it_rec,
+               .rec_size       = osd_hash_index_it_rec_size,
+               .store          = osd_hash_index_it_store,
+               .load           = osd_hash_index_it_load
+       }
+};
diff --git a/lustre/osd-wbcfs/osd_internal.h b/lustre/osd-wbcfs/osd_internal.h
new file mode 100644 (file)
index 0000000..5a8cff7
--- /dev/null
@@ -0,0 +1,254 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Copyright (c) 2025-2026, DDN/Whamcloud, Inc.
+ */
+
+/*
+ * Author: Yingjin Qian <qian@ddn.com>
+ */
+
+#ifndef _OSD_INTERNAL_H
+#define _OSD_INTERNAL_H
+
+#include <linux/rwsem.h>
+#include <linux/dcache.h>
+#include <linux/dirent.h>
+#include <linux/statfs.h>
+#include <linux/file.h>
+#include <lustre_compat.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <dt_object.h>
+
+struct osd_object {
+       struct dt_object         oo_dt;
+       /*
+        * Inode in the memory FS for file system object represented by this
+        * osd_object. This inode is pinned for the whole duration of the file
+        * life.
+        */
+       struct inode            *oo_inode;
+       /* Used to implement osd_{read|write}_{lock|unlock}. */
+       struct rw_semaphore      oo_sem;
+       /* protects inode attributes. */
+       spinlock_t               oo_guard;
+       /* the i_flags in LMA */
+       __u32                    oo_lma_flags;
+       __u32                    oo_destroyed:1;
+       struct lu_object_header *oo_header;
+};
+
+struct osd_device {
+       /* Super-class */
+       struct dt_device         od_dt_dev;
+       /* Information about underlying memory file system */
+       struct vfsmount         *od_mnt;
+       /* Service name associated with the OSD device. */
+       char                     od_svname[MAX_OBD_NAME];
+       char                     od_mntdev[MAX_OBD_NAME];
+       int                      od_index;
+       atomic_t                 od_connects;
+       struct lu_site           od_site;
+       /*
+        * Enable to write back the data in the memory FS into the
+        * persistent storage.
+        */
+       unsigned int             od_writeback_enabled:1;
+       unsigned int             od_is_ost:1;
+};
+
+struct osd_thandle {
+       struct thandle          ot_super;
+       struct list_head        ot_commit_dcb_list;
+       struct list_head        ot_stop_dcb_list;
+};
+
+struct osd_it_dirent {
+       struct lu_fid   oitd_fid;
+       __u64           oitd_ino;
+       __u64           oitd_off;
+       unsigned short  oitd_namelen;
+       unsigned int    oitd_type;
+       char            oitd_name[];
+} __attribute__((packed));
+
+/*
+ * As @osd_it_dirent (in memory dirent struct for osd) is greater
+ * than lu_dirent struct. osd readdir reads less number of dirent than
+ * required for mdd dir page. so buffer size need to be increased so that
+ * there would be one MemFS readdir for every mdd readdir page.
+ */
+
+#define OSD_IT_BUFSIZE       (PAGE_SIZE + PAGE_SIZE/4)
+
+struct osd_it {
+       struct osd_object       *oit_obj;
+       struct file              oit_file;
+       /* How many entries have been read-cached from storage */
+       int                      oit_rd_dirent;
+       /* Current entry is being iterated by caller */
+       int                      oit_it_dirent;
+       /* Current processing entry */
+       struct osd_it_dirent    *oit_dirent;
+       /* Buffer to hold entries, size == OSD_IT_BUFSIZE */
+       void                    *oit_buf;
+};
+
+extern atomic_t descriptors_cnt;
+extern unsigned int wbcfs_flush_descriptors_cnt;
+extern struct work_struct flush_fput;
+#define osd_alloc_file_pseudo(inode, mnt, name, flags, fops)           \
+({                                                                     \
+       struct file *__f;                                               \
+       int __descriptors_cnt;                                          \
+       __f = alloc_file_pseudo(inode, mnt, name, flags, fops);         \
+       __descriptors_cnt = atomic_inc_return(&descriptors_cnt);        \
+       if (unlikely(__descriptors_cnt >= wbcfs_flush_descriptors_cnt)) {\
+               /* drop here to skip queue_work */                      \
+               atomic_set(&descriptors_cnt, 0);                        \
+               queue_work(system_long_wq, &flush_fput);                \
+       }                                                               \
+       __f;                                                            \
+})
+
+/* Slab to allocate osd_it */
+extern struct kmem_cache *osd_it_cachep;
+
+struct osd_hash_it {
+       struct list_head        *hit_cursor;
+       struct osd_object       *hit_obj;
+};
+
+extern struct kmem_cache *osd_hash_it_cachep;
+
+extern const struct dt_body_operations osd_body_ops;
+extern const struct dt_object_operations osd_obj_ops;
+extern const struct lu_object_operations osd_lu_obj_ops;
+extern const struct lu_device_operations osd_lu_ops;
+extern const struct dt_index_operations osd_dir_ops;
+extern const struct dt_index_operations osd_hash_index_ops;
+
+static inline int lu_device_is_osd(const struct lu_device *d)
+{
+       return ergo(d != NULL && d->ld_ops != NULL, d->ld_ops == &osd_lu_ops);
+}
+
+static inline struct osd_device *osd_dt_dev(const struct dt_device *d)
+{
+       LASSERT(lu_device_is_osd(&d->dd_lu_dev));
+       return container_of(d, struct osd_device, od_dt_dev);
+}
+
+static inline struct osd_device *osd_dev(const struct lu_device *d)
+{
+       LASSERT(lu_device_is_osd(d));
+       return osd_dt_dev(container_of(d, struct dt_device, dd_lu_dev));
+}
+
+static inline struct osd_device *osd_obj2dev(const struct osd_object *o)
+{
+       return osd_dev(o->oo_dt.do_lu.lo_dev);
+}
+
+static inline struct super_block *osd_sb(const struct osd_device *dev)
+{
+       if (!dev->od_mnt)
+               return NULL;
+
+       return dev->od_mnt->mnt_sb;
+}
+
+static inline char *osd_name(struct osd_device *osd)
+{
+       return osd->od_svname;
+}
+
+static inline struct lu_device *osd2lu_dev(struct osd_device *osd)
+{
+       return &osd->od_dt_dev.dd_lu_dev;
+}
+
+static inline struct osd_object *osd_obj(const struct lu_object *o)
+{
+       LASSERT(lu_device_is_osd(o->lo_dev));
+       return container_of(o, struct osd_object, oo_dt.do_lu);
+}
+
+/*
+ * Put the osd object once done with it.
+ *
+ * \param obj osd object that needs to be put
+ */
+static inline void osd_object_put(const struct lu_env *env,
+                                 struct osd_object *obj)
+{
+       dt_object_put(env, &obj->oo_dt);
+}
+
+static inline struct osd_object *osd_dt_obj(const struct dt_object *d)
+{
+       return osd_obj(&d->do_lu);
+}
+
+#if defined HAVE_INODE_TIMESPEC64 || defined HAVE_INODE_GET_MTIME_SEC
+#define osd_timespec                   timespec64
+#else
+#define osd_timespec                   timespec
+#endif
+
+static inline struct osd_timespec osd_inode_time(struct inode *inode,
+                                                s64 seconds)
+{
+       struct osd_timespec ts = { .tv_sec = seconds };
+
+       return ts;
+}
+
+#ifdef HAVE_FILLDIR_USE_CTX_RETURN_BOOL
+#define WRAP_FILLDIR_FN(prefix, fill_fn) \
+static bool fill_fn(struct dir_context *buf, const char *name, int namelen, \
+                   loff_t offset, __u64 ino, unsigned int d_type)          \
+{                                                                          \
+       return !prefix##fill_fn(buf, name, namelen, offset, ino, d_type);   \
+}
+#elif defined(HAVE_FILLDIR_USE_CTX)
+#define WRAP_FILLDIR_FN(prefix, fill_fn) \
+static int fill_fn(struct dir_context *buf, const char *name, int namelen,  \
+                  loff_t offset, __u64 ino, unsigned int d_type)           \
+{                                                                          \
+       return prefix##fill_fn(buf, name, namelen, offset, ino, d_type);    \
+}
+#else
+#define WRAP_FILLDIR_FN(prefix, fill_fn)
+#endif
+
+/*
+ * Build inode number from passed @fid.
+ *
+ * For 32-bit systems or syscalls limit the inode number to a 32-bit value
+ * to avoid EOVERFLOW errors.  This will inevitably result in inode number
+ * collisions, but fid_flatten32() tries hard to avoid this if possible.
+ */
+static inline __u64 lu_fid_build_ino(const struct lu_fid *fid, int api32)
+{
+       if (BITS_PER_LONG == 32 || api32)
+               RETURN(fid_flatten32(fid));
+
+       RETURN(fid_flatten64(fid));
+}
+
+/*
+ * Build inode generation from passed @fid.  If our FID overflows the 32-bit
+ * inode number then return a non-zero generation to distinguish them.
+ */
+static inline __u32 lu_fid_build_gen(const struct lu_fid *fid)
+{
+       if (fid_is_igif(fid))
+               RETURN(lu_igif_gen(fid));
+
+       RETURN(fid_flatten64(fid) >> 32);
+}
+
+#endif /* _OSD_INTERNAL_H */
diff --git a/lustre/osd-wbcfs/osd_io.c b/lustre/osd-wbcfs/osd_io.c
new file mode 100644 (file)
index 0000000..5413647
--- /dev/null
@@ -0,0 +1,438 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (c) 2025-2026, DDN/Whamcloud, Inc.
+ */
+
+/*
+ * Author: Yingjin Qian <qian@ddn.com>
+ */
+
+#define DEBUG_SUBSYSTEM        S_OSD
+
+#include <linux/mm.h>
+#include <linux/swap.h>
+
+#include <lustre_compat.h>
+#include <obd_support.h>
+
+#include "osd_internal.h"
+
+/* Copied from osd-ldiskfs */
+static int osd_map_remote_to_local(loff_t offset, ssize_t len, int *nrpages,
+                                  struct niobuf_local *lnb, int maxlnb)
+{
+       int rc = 0;
+
+       ENTRY;
+
+       *nrpages = 0;
+
+       while (len > 0) {
+               int poff = offset & (PAGE_SIZE - 1);
+               int plen = PAGE_SIZE - poff;
+
+               if (*nrpages >= maxlnb) {
+                       rc = -EOVERFLOW;
+                       break;
+               }
+
+               if (plen > len)
+                       plen = len;
+               lnb->lnb_file_offset = offset;
+               lnb->lnb_page_offset = poff;
+               lnb->lnb_len = plen;
+               lnb->lnb_flags = 0;
+               lnb->lnb_page = NULL;
+               lnb->lnb_rc = 0;
+               lnb->lnb_guard_rpc = 0;
+               lnb->lnb_guard_disk = 0;
+               lnb->lnb_locked = 0;
+               lnb->lnb_hole = 0;
+
+               LASSERTF(plen <= len, "plen %u, len %lld\n", plen,
+                        (long long) len);
+               offset += plen;
+               len -= plen;
+               lnb++;
+               (*nrpages)++;
+       }
+
+       RETURN(rc);
+}
+
+static int osd_get_page(const struct lu_env *env, struct dt_object *dt,
+                       struct niobuf_local *lnb, gfp_t gfp_mask, bool write)
+{
+       struct inode *inode = osd_dt_obj(dt)->oo_inode;
+       struct page *page;
+       pgoff_t index;
+
+       LASSERT(inode);
+       index = lnb->lnb_file_offset >> PAGE_SHIFT;
+       if (write) {
+               page = find_or_create_page(inode->i_mapping, index, gfp_mask);
+               if (page == NULL)
+                       return -ENOMEM;
+
+               LASSERT(!PagePrivate2(page));
+       } else {
+               /*
+                * Specially handling for hole in the memory FS during read.
+                * It does not allocate pages for holes, just records them and
+                * free them after reading.
+                * Otherwise, reading on a large sparse file may hit OOM.
+                */
+               page = find_lock_page(inode->i_mapping, index);
+               /* fallocated page? */
+               if (page && !PageUptodate(page)) {
+                       unlock_page(page);
+                       put_page(page);
+                       page = NULL;
+               }
+
+               if (page == NULL) {
+                       page = alloc_page(gfp_mask);
+                       if (!page)
+                               return -ENOMEM;
+
+                       SetPagePrivate2(page);
+                       lock_page(page);
+                       ClearPageUptodate(page);
+                       page->index = index;
+                       lnb->lnb_hole = 1;
+               }
+       }
+
+       lnb->lnb_page = page;
+       lnb->lnb_locked = 1;
+       if (!lnb->lnb_hole)
+               mark_page_accessed(page);
+
+       return 0;
+}
+
+/*
+ * Unlock and release pages loaded by @osd_bufs_get().
+ *
+ * Unlock \a npages pages from \a lnb and drop the refcount on them.
+ */
+static int osd_bufs_put(const struct lu_env *env, struct dt_object *dt,
+                       struct niobuf_local *lnb, int npages)
+{
+       struct folio_batch fbatch;
+       int i;
+
+       ll_folio_batch_init(&fbatch, 0);
+       for (i = 0; i < npages; i++) {
+               struct page *page = lnb[i].lnb_page;
+
+               if (page == NULL)
+                       continue;
+
+               /* If the page is not cached in the memory FS, then free it. */
+               if (PagePrivate2(page)) {
+                       LASSERT(lnb[i].lnb_hole);
+                       LASSERT(PageLocked(page));
+                       ClearPagePrivate2(page);
+                       unlock_page(page);
+                       __free_page(page);
+               } else {
+                       if (lnb[i].lnb_locked)
+                               unlock_page(page);
+                       if (folio_batch_add_page(&fbatch, page) == 0)
+                               folio_batch_release(&fbatch);
+               }
+
+               lnb[i].lnb_page = NULL;
+       }
+
+       folio_batch_release(&fbatch);
+       return 0;
+}
+
+/**
+ * osd_bufs_get() - Load and lock pages undergoing IO
+ * @env: thread execution environment
+ * @dt: dt object undergoing IO (OSD object + methods)
+ * @pos: byte offset of IO start
+ * @len: number of bytes of IO
+ * @lnb: array of extents undergoing IO
+ * @maxlnb: maximum lnb
+ * @rw: read or write operation, and other flags
+ *
+ * Pages as described in the \a lnb array are fetched (from disk or cache)
+ * and locked for IO by the caller.
+ *
+ * Returns:
+ * %pages - (zero or more) loaded successfully
+ * %-ENOMEM - on memory/page allocation error
+ */
+static int osd_bufs_get(const struct lu_env *env, struct dt_object *dt,
+                       loff_t pos, ssize_t len, struct niobuf_local *lnb,
+                       int maxlnb, enum dt_bufs_type rw)
+{
+       struct osd_object *obj = osd_dt_obj(dt);
+       gfp_t gfp_mask;
+       int npages;
+       int rc;
+       int i;
+
+       LASSERT(obj->oo_inode);
+
+       if (unlikely(obj->oo_destroyed))
+               RETURN(-ENOENT);
+
+       rc = osd_map_remote_to_local(pos, len, &npages, lnb, maxlnb);
+       if (rc)
+               RETURN(rc);
+
+       /* this could also try less hard for DT_BUFS_TYPE_READAHEAD pages */
+       gfp_mask = rw & DT_BUFS_TYPE_LOCAL ? (GFP_NOFS | __GFP_HIGHMEM) :
+                                            GFP_HIGHUSER;
+       for (i = 0; i < npages; i++, lnb++) {
+               rc = osd_get_page(env, dt, lnb, gfp_mask,
+                                 rw & DT_BUFS_TYPE_WRITE);
+               if (rc)
+                       GOTO(cleanup, rc);
+       }
+
+       RETURN(i);
+
+cleanup:
+       if (i > 0)
+               osd_bufs_put(env, dt, lnb - i, i);
+       return rc;
+}
+
+static ssize_t osd_read(const struct lu_env *env, struct dt_object *dt,
+                       struct lu_buf *buf, loff_t *pos)
+{
+       struct osd_object *obj = osd_dt_obj(dt);
+       struct osd_device *dev = osd_obj2dev(obj);
+       struct inode *inode = obj->oo_inode;
+       struct file *file;
+       ssize_t result;
+
+       ENTRY;
+
+       /* TODO: Specially handling for symlink. */
+       if (S_ISLNK(dt->do_lu.lo_header->loh_attr))
+               RETURN(-EOPNOTSUPP);
+
+       file = osd_alloc_file_pseudo(inode, dev->od_mnt, "/",
+                                    O_NOATIME | O_RDONLY, inode->i_fop);
+       if (IS_ERR(file))
+               RETURN(PTR_ERR(file));
+
+       result = cfs_kernel_read(file, buf->lb_buf, buf->lb_len, pos);
+       ihold(inode);
+       fput(file);
+       RETURN(result);
+}
+
+static ssize_t osd_write(const struct lu_env *env, struct dt_object *dt,
+                        const struct lu_buf *buf, loff_t *pos,
+                        struct thandle *th)
+{
+       struct osd_object *obj = osd_dt_obj(dt);
+       struct osd_device *dev = osd_obj2dev(obj);
+       struct inode *inode = obj->oo_inode;
+       struct file *file;
+       ssize_t result;
+
+       ENTRY;
+
+       /* TODO: Specially handling for symlink. */
+       if (S_ISLNK(dt->do_lu.lo_header->loh_attr))
+               RETURN(-EOPNOTSUPP);
+
+       file = osd_alloc_file_pseudo(inode, dev->od_mnt, "/",
+                                    O_NOATIME | O_WRONLY, inode->i_fop);
+       if (IS_ERR(file))
+               RETURN(PTR_ERR(file));
+
+       result = cfs_kernel_write(file, buf->lb_buf, buf->lb_len, pos);
+       ihold(inode);
+       fput(file);
+       RETURN(result);
+}
+
+/* Can we move all osd_read_prep() codes into osd_bufs_get() ? */
+static int osd_read_prep(const struct lu_env *env, struct dt_object *dt,
+                        struct niobuf_local *lnb, int npages)
+{
+       struct inode *inode = osd_dt_obj(dt)->oo_inode;
+       loff_t isize;
+       int i;
+
+       ENTRY;
+
+       LASSERT(inode);
+       isize = i_size_read(inode);
+
+       for (i = 0; i < npages; i++) {
+               /*
+                * If there is no more data, abort early.
+                * lnb->lnb_rc == 0, so it is easy to detect later.
+                */
+               if (isize <= lnb[i].lnb_file_offset)
+                       break;
+
+               /*
+                * Instead of looking if we go beyond isize, send complete
+                * pages all the time.
+                */
+               lnb[i].lnb_rc = lnb[i].lnb_len;
+               if (lnb[i].lnb_hole) {
+                       void *kaddr;
+
+                       LASSERT(PagePrivate2(lnb[i].lnb_page));
+                       kaddr = kmap(lnb[i].lnb_page);
+                       memset(kaddr, 0, PAGE_SIZE);
+                       kunmap(lnb[i].lnb_page);
+                       SetPageUptodate(lnb[i].lnb_page);
+               } else {
+                       /*
+                        * The page in cache for MemFS should be always
+                        * in uptodate state.
+                        */
+                       LASSERT(PageUptodate(lnb[i].lnb_page));
+                       unlock_page(lnb[i].lnb_page);
+                       /*
+                        * No need to unlock in osd_bufs_put(). The sooner page
+                        * is unlocked, the earlier another client can access
+                        * it.
+                        */
+                       lnb[i].lnb_locked = 0;
+               }
+       }
+
+       RETURN(0);
+}
+
+static int osd_write_prep(const struct lu_env *env, struct dt_object *dt,
+                         struct niobuf_local *lnb, int npages)
+{
+       struct inode *inode = osd_dt_obj(dt)->oo_inode;
+       ssize_t isize;
+       __s64 maxidx;
+       int i;
+
+       ENTRY;
+
+       LASSERT(inode);
+
+       isize = i_size_read(inode);
+       maxidx = ((isize + PAGE_SIZE - 1) >> PAGE_SHIFT) - 1;
+       for (i = 0; i < npages; i++) {
+               /*
+                * Till commit the content of the page is undefined
+                * we will set it uptodate once bulk is done. Otherwise
+                * subsequent reads can access non-stable data.
+                */
+               ClearPageUptodate(lnb[i].lnb_page);
+
+               if (lnb[i].lnb_len == PAGE_SIZE)
+                       continue;
+
+               if (maxidx < lnb[i].lnb_page->index) {
+                       long off;
+                       char *p = kmap(lnb[i].lnb_page);
+
+                       off = lnb[i].lnb_page_offset;
+                       if (off)
+                               memset(p, 0, off);
+                       off = (lnb[i].lnb_page_offset + lnb[i].lnb_len) &
+                             ~PAGE_MASK;
+                       if (off)
+                               memset(p + off, 0, PAGE_SIZE - off);
+                       kunmap(lnb[i].lnb_page);
+               }
+       }
+
+       RETURN(0);
+}
+
+
+static int osd_write_commit(const struct lu_env *env, struct dt_object *dt,
+                           struct niobuf_local *lnb, int npages,
+                           struct thandle *th, __u64 user_size)
+{
+       struct inode *inode = osd_dt_obj(dt)->oo_inode;
+       struct address_space *mapping = inode->i_mapping;
+       size_t isize;
+       int i;
+
+       ENTRY;
+
+       LASSERT(inode);
+
+       for (i = 0; i < npages; i++) {
+               if (lnb[i].lnb_rc) { /* ENOSPC, network RPC error, etc. */
+                       LASSERT(lnb[i].lnb_page);
+                       generic_error_remove_folio(inode->i_mapping,
+                                                  page_folio(lnb[i].lnb_page));
+                       continue;
+               }
+
+               /*
+                * TODO: @lnb array is a sorted array according to the file
+                * offset, thus it just needs to check the last @lnb for
+                * file size.
+                */
+               if (user_size < lnb[i].lnb_file_offset + lnb[i].lnb_len)
+                       user_size = lnb[i].lnb_file_offset + lnb[i].lnb_len;
+
+               LASSERT(PageLocked(lnb[i].lnb_page));
+               LASSERT(!PageWriteback(lnb[i].lnb_page));
+               /* LASSERT(!PageDirty(lnb[i].lnb_page)); */
+
+               SetPageUptodate(lnb[i].lnb_page);
+#ifdef HAVE_DIRTY_FOLIO
+               mapping->a_ops->dirty_folio(mapping,
+                                           page_folio(lnb[i].lnb_page));
+#else
+               mapping->a_ops->set_page_dirty(lnb[i].lnb_page);
+#endif
+       }
+
+       spin_lock(&inode->i_lock);
+       isize = i_size_read(inode);
+       if (isize < user_size)
+               i_size_write(inode, user_size);
+       spin_unlock(&inode->i_lock);
+
+       CDEBUG(D_INFO, "Size after write: i_size=%lld user_size=%llu\n",
+              i_size_read(inode), user_size);
+       /* No transno is needed for in-memory FS. */
+       th->th_local = 1;
+       RETURN(0);
+}
+
+/* TODO: Implement punch operation. */
+static int osd_punch(const struct lu_env *env, struct dt_object *dt,
+                    __u64 start, __u64 end, struct thandle *th)
+{
+       RETURN(0);
+}
+
+/* TODO: Implemented lseek operation.  */
+static loff_t osd_lseek(const struct lu_env *env, struct dt_object *dt,
+                       loff_t offset, int whence)
+{
+       RETURN(0);
+}
+
+const struct dt_body_operations osd_body_ops = {
+       .dbo_read                       = osd_read,
+       .dbo_write                      = osd_write,
+       .dbo_bufs_get                   = osd_bufs_get,
+       .dbo_bufs_put                   = osd_bufs_put,
+       .dbo_write_prep                 = osd_write_prep,
+       .dbo_write_commit               = osd_write_commit,
+       .dbo_read_prep                  = osd_read_prep,
+       .dbo_punch                      = osd_punch,
+       .dbo_lseek                      = osd_lseek,
+};
+
diff --git a/lustre/osd-wbcfs/osd_object.c b/lustre/osd-wbcfs/osd_object.c
new file mode 100644 (file)
index 0000000..4856f28
--- /dev/null
@@ -0,0 +1,848 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (c) 2025-2026, DDN/Whamcloud, Inc.
+ */
+
+/*
+ * Author: Yingjin Qian <qian@ddn.com>
+ */
+
+#define DEBUG_SUBSYSTEM        S_OSD
+
+#include <linux/fs_struct.h>
+
+#include <dt_object.h>
+
+#include "osd_internal.h"
+#include "wbcfs.h"
+
+/* Concurrency: no external locking is necessary. */
+static int osd_index_try(const struct lu_env *env, struct dt_object *dt,
+                        const struct dt_index_features *feat)
+{
+       int rc;
+
+       if (likely(feat == &dt_directory_features)) {
+               dt->do_index_ops = &osd_dir_ops;
+               rc = 0;
+       } else if (unlikely(feat == &dt_acct_features)) {
+               /* TODO: Add quota support. */
+               rc = -ENOTSUPP;
+       } else if (unlikely(feat == &dt_otable_features)) {
+               /* TODO: Add scrub support. */
+               dt->do_index_ops = &osd_hash_index_ops;
+               rc = 0;
+       } else {
+               dt->do_index_ops = &osd_hash_index_ops;
+               rc = 0;
+       }
+
+       return rc;
+}
+
+static int osd_otable_it_attr_get(const struct lu_env *env,
+                                struct dt_object *dt,
+                                struct lu_attr *attr)
+{
+       attr->la_valid = 0;
+       return 0;
+}
+
+static const struct dt_object_operations osd_obj_otable_it_ops = {
+       .do_attr_get    = osd_otable_it_attr_get,
+       .do_index_try   = osd_index_try,
+};
+
+static void __osd_object_init(struct osd_object *obj)
+{
+       LASSERT(obj->oo_inode != NULL);
+       obj->oo_dt.do_body_ops = &osd_body_ops;
+       obj->oo_dt.do_lu.lo_header->loh_attr |=
+               (LOHA_EXISTS | (obj->oo_inode->i_mode & S_IFMT));
+}
+
+/*
+ * Concurrency: No concurrent access is possible that early in object
+ * life cycle.
+ */
+static int osd_object_init(const struct lu_env *env, struct lu_object *l,
+                          const struct lu_object_conf *conf)
+{
+       struct osd_object *obj = osd_obj(l);
+       struct osd_device *osd = osd_obj2dev(obj);
+       const struct lu_fid *fid = lu_object_fid(l);
+       struct inode *inode = NULL;
+       __u64 hash;
+
+       if (fid_is_otable_it(&l->lo_header->loh_fid)) {
+               obj->oo_dt.do_ops = &osd_obj_otable_it_ops;
+               l->lo_header->loh_attr |= LOHA_EXISTS;
+               return 0;
+       }
+
+       hash = lu_fid_build_ino(fid, 0);
+       inode = ilookup5(osd_sb(osd), hash, memfs_test_inode_by_fid,
+                        (void *)fid);
+       obj->oo_dt.do_body_ops = &osd_body_ops;
+       if (inode) {
+               obj->oo_inode = inode;
+               __osd_object_init(obj);
+
+               /*
+                * TODO: check LMA EA and convert LMAI flags to lustre
+                * LMA flags and cache it in object.
+                */
+       }
+
+       CDEBUG(D_INODE, "%s: object init for fid="DFID" inode@%pK nlink=%d\n",
+              osd_name(osd), PFID(fid), inode, inode ? inode->i_nlink : 0);
+
+       return 0;
+}
+
+static void osd_object_free(const struct lu_env *env, struct lu_object *l)
+{
+       struct osd_object *obj = osd_obj(l);
+       struct lu_object_header *h = obj->oo_header;
+
+       dt_object_fini(&obj->oo_dt);
+       OBD_FREE_PTR(obj);
+       if (unlikely(h))
+               lu_object_header_free(h);
+}
+
+/*
+ * Called just before the object is freed. Releases all resources except for
+ * object itself (that is released by osd_object_free()).
+ *
+ * Concurrency: no concurrent access is possible that late in object
+ * life-cycle.
+ */
+static void osd_object_delete(const struct lu_env *env, struct lu_object *l)
+{
+       struct osd_object *obj = osd_obj(l);
+       struct inode *inode = obj->oo_inode;
+
+       if (!inode)
+               return;
+
+       obj->oo_inode = NULL;
+       CDEBUG(D_INODE,
+              "%s: object "DFID" delete: inode@%pK nlink=%u count=%d\n",
+              osd_name(osd_obj2dev(obj)), PFID(lu_object_fid(l)),
+              inode, inode->i_nlink, atomic_read(&inode->i_count));
+       iput(inode);
+}
+
+/* Concurrency: ->loo_object_release() is called under site spin-lock. */
+static void osd_object_release(const struct lu_env *env, struct lu_object *l)
+{
+       struct osd_object *o = osd_obj(l);
+
+       /*
+        * Nobody should be releasing a non-destroyed object with nlink=0
+        * the API allows this, but wbcfs does not like and then report
+        * this inode as deleted.
+        */
+       if (o->oo_destroyed == 0 && o->oo_inode && o->oo_inode->i_nlink == 0)
+               CERROR("%s: Object "DFID" wrong: %d inode@%pK nlink=%u\n",
+                      osd_name(osd_obj2dev(o)), PFID(lu_object_fid(l)),
+                      o->oo_destroyed, o->oo_inode,
+                      o->oo_inode ? o->oo_inode->i_nlink : 0);
+
+       LASSERT(!(o->oo_destroyed == 0 && o->oo_inode &&
+                 o->oo_inode->i_nlink == 0));
+}
+
+static int osd_object_print(const struct lu_env *env, void *cookie,
+                           lu_printer_t p, const struct lu_object *l)
+{
+       struct osd_object *o = osd_obj(l);
+
+       return (*p)(env, cookie,
+                   LUSTRE_OSD_WBCFS_NAME"-object@%p(i:%p:%lu/%u)",
+                   o, o->oo_inode,
+                   o->oo_inode ? o->oo_inode->i_ino : 0UL,
+                   o->oo_inode ? o->oo_inode->i_generation : 0);
+}
+
+static void osd_inode_getattr(const struct lu_env *env,
+                             struct inode *inode, struct lu_attr *attr)
+{
+       attr->la_valid  |= LA_ATIME | LA_MTIME | LA_CTIME | LA_MODE |
+                          LA_SIZE | LA_BLOCKS | LA_UID | LA_GID |
+                          LA_PROJID | LA_FLAGS | LA_NLINK | LA_RDEV |
+                          LA_BLKSIZE | LA_TYPE | LA_BTIME;
+
+       attr->la_atime = inode_get_atime_sec(inode);
+       attr->la_mtime = inode_get_mtime_sec(inode);
+       attr->la_ctime = inode_get_ctime_sec(inode);
+       attr->la_btime = memfs_get_btime(inode);
+       attr->la_mode = inode->i_mode;
+       attr->la_size = i_size_read(inode);
+       attr->la_blocks = inode->i_blocks;
+       attr->la_uid = i_uid_read(inode);
+       attr->la_gid = i_gid_read(inode);
+       attr->la_projid = i_projid_read(inode);
+       attr->la_flags = ll_inode_to_ext_flags(inode->i_flags);
+       attr->la_nlink = inode->i_nlink;
+       attr->la_rdev = inode->i_rdev;
+       attr->la_blksize = 1 << inode->i_blkbits;
+       attr->la_blkbits = inode->i_blkbits;
+       /*
+        * MemFS did not transfer inherit flags from raw inode
+        * to inode flags, and MemFS internally test raw inode
+        * @i_flags directly. Instead of patching ext4, we do it here.
+        */
+       if (memfs_get_flags(inode) & LUSTRE_PROJINHERIT_FL)
+               attr->la_flags |= LUSTRE_PROJINHERIT_FL;
+}
+
+static int osd_attr_get(const struct lu_env *env, struct dt_object *dt,
+                       struct lu_attr *attr)
+{
+       struct osd_object *obj = osd_dt_obj(dt);
+
+       if (unlikely(!dt_object_exists(dt)))
+               return -ENOENT;
+       if (unlikely(obj->oo_destroyed))
+               return -ENOENT;
+
+       LASSERT(!dt_object_remote(dt));
+
+       spin_lock(&obj->oo_guard);
+       osd_inode_getattr(env, obj->oo_inode, attr);
+       if (obj->oo_lma_flags & LUSTRE_ORPHAN_FL) {
+               attr->la_valid |= LA_FLAGS;
+               attr->la_flags |= LUSTRE_ORPHAN_FL;
+       }
+       if (obj->oo_lma_flags & LUSTRE_ENCRYPT_FL) {
+               attr->la_valid |= LA_FLAGS;
+               attr->la_flags |= LUSTRE_ENCRYPT_FL;
+       }
+       spin_unlock(&obj->oo_guard);
+       CDEBUG(D_INFO, "%s: getattr "DFID" inode@%pK nlink=%d\n",
+              osd_name(osd_obj2dev(obj)), PFID(lu_object_fid(&dt->do_lu)),
+              obj->oo_inode, obj->oo_inode->i_nlink);
+       return 0;
+}
+
+static int osd_inode_setattr(const struct lu_env *env,
+                            struct inode *inode, const struct lu_attr *attr)
+{
+       __u64 bits = attr->la_valid;
+
+       /* Only allow set size for regular file */
+       if (!S_ISREG(inode->i_mode))
+               bits &= ~(LA_SIZE | LA_BLOCKS);
+
+       if (bits == 0)
+               return 0;
+
+       if (bits & LA_ATIME)
+               inode_set_atime_to_ts(inode,
+                                     osd_inode_time(inode, attr->la_atime));
+       if (bits & LA_CTIME)
+               inode_set_ctime_to_ts(inode,
+                                     osd_inode_time(inode, attr->la_ctime));
+       if (bits & LA_MTIME)
+               inode_set_mtime_to_ts(inode,
+                                     osd_inode_time(inode, attr->la_mtime));
+       if (bits & LA_SIZE) {
+               spin_lock(&inode->i_lock);
+               i_size_write(inode, attr->la_size);
+               spin_unlock(&inode->i_lock);
+       }
+
+       /*
+        * OSD should not change "i_blocks" which is used by quota.
+        * "i_blocks" should be changed by ldiskfs only.
+        */
+       if (bits & LA_MODE)
+               inode->i_mode = (inode->i_mode & S_IFMT) |
+                               (attr->la_mode & ~S_IFMT);
+       if (bits & LA_UID)
+               i_uid_write(inode, attr->la_uid);
+       if (bits & LA_GID)
+               i_gid_write(inode, attr->la_gid);
+       if (bits & LA_PROJID)
+               i_projid_write(inode, attr->la_projid);
+       if (bits & LA_NLINK)
+               set_nlink(inode, attr->la_nlink);
+       if (bits & LA_RDEV)
+               inode->i_rdev = attr->la_rdev;
+
+       if (bits & LA_FLAGS) {
+               /* always keep S_NOCMTIME */
+               inode->i_flags = ll_ext_to_inode_flags(attr->la_flags) |
+                                S_NOCMTIME;
+#if defined(S_ENCRYPTED)
+               /* Always remove S_ENCRYPTED, because ldiskfs must not be
+                * aware of encryption status. It is just stored into LMA
+                * so that it can be forwared to client side.
+                */
+               inode->i_flags &= ~S_ENCRYPTED;
+#endif
+               /*
+                * MemFS did not transfer inherit flags from
+                * @inode->i_flags to raw inode i_flags when writing
+                * flags, we do it explictly here.
+                */
+               if (attr->la_flags & LUSTRE_PROJINHERIT_FL)
+                       MEMFS_I(inode)->mei_flags |= LUSTRE_PROJINHERIT_FL;
+               else
+                       MEMFS_I(inode)->mei_flags &= ~LUSTRE_PROJINHERIT_FL;
+       }
+       return 0;
+}
+
+static int osd_attr_set(const struct lu_env *env, struct dt_object *dt,
+                       const struct lu_attr *attr, struct thandle *handle)
+{
+       struct osd_object *obj = osd_dt_obj(dt);
+       struct inode *inode;
+       int rc;
+
+       if (!dt_object_exists(dt))
+               return -ENOENT;
+
+       LASSERT(!dt_object_remote(dt));
+       inode = obj->oo_inode;
+       spin_lock(&obj->oo_guard);
+       rc = osd_inode_setattr(env, inode, attr);
+       spin_unlock(&obj->oo_guard);
+       if (rc)
+               RETURN(rc);
+
+       /* TODO: extra flags for LUSTRE_LMA_FL_MASKS */
+
+       return 0;
+}
+
+static int osd_mkfile(const struct lu_env *env, struct osd_object *obj,
+                     umode_t mode, struct dt_allocation_hint *hint,
+                     struct thandle *th, struct lu_attr *attr)
+{
+       struct osd_device *osd = osd_obj2dev(obj);
+       struct dt_object *parent = NULL;
+       struct inode *inode;
+       struct iattr iattr = {
+               .ia_valid = ATTR_UID | ATTR_GID |
+                           ATTR_CTIME | ATTR_MTIME | ATTR_ATIME,
+               .ia_ctime.tv_sec = attr->la_ctime,
+               .ia_mtime.tv_sec = attr->la_mtime,
+               .ia_atime.tv_sec = attr->la_atime,
+               .ia_uid = GLOBAL_ROOT_UID,
+               .ia_gid = GLOBAL_ROOT_GID,
+       };
+       const struct osd_timespec omit = { .tv_nsec = UTIME_OMIT };
+       const struct lu_fid *fid = lu_object_fid(&obj->oo_dt.do_lu);
+
+       if (attr->la_valid & LA_UID)
+               iattr.ia_uid = make_kuid(&init_user_ns, attr->la_uid);
+       if (attr->la_valid & LA_GID)
+               iattr.ia_gid = make_kgid(&init_user_ns, attr->la_gid);
+
+       LASSERT(obj->oo_inode == NULL);
+
+       if (hint != NULL && hint->dah_parent != NULL &&
+           !dt_object_remote(hint->dah_parent))
+               parent = hint->dah_parent;
+
+       /* if a time component is not valid set it to UTIME_OMIT */
+       if (!(attr->la_valid & LA_CTIME))
+               iattr.ia_ctime = omit;
+       if (!(attr->la_valid & LA_MTIME))
+               iattr.ia_mtime = omit;
+       if (!(attr->la_valid & LA_ATIME))
+               iattr.ia_atime = omit;
+
+       inode = memfs_create_inode(osd_sb(osd),
+                                  parent ? osd_dt_obj(parent)->oo_inode :
+                                           osd_sb(osd)->s_root->d_inode,
+                                  mode, &iattr, 0, false);
+       if (IS_ERR(inode))
+               return PTR_ERR(inode);
+
+       /* Do not update file c/mtime in MemFS. */
+       inode->i_flags |= S_NOCMTIME;
+       inode->i_ino = lu_fid_build_ino(fid, 0);
+       inode->i_generation = lu_fid_build_gen(fid);
+       MEMFS_I(inode)->mei_fid = *fid;
+       if (unlikely(insert_inode_locked(inode) < 0)) {
+               CERROR("%s: Failed to insert inode %lu "DFID": doubly allocated?\n",
+                      osd_name(osd), inode->i_ino, PFID(fid));
+               iput(inode);
+               RETURN(-EIO);
+       }
+
+       CDEBUG(D_INODE,
+              "%s: create object "DFID": inode@%pK nlink=%d mode=%#o\n",
+              osd_name(osd), PFID(fid), inode, inode->i_nlink, inode->i_mode);
+       obj->oo_inode = inode;
+       RETURN(0);
+}
+
+static int osd_mkdir(const struct lu_env *env, struct osd_object *obj,
+                    struct lu_attr *attr,
+                    struct dt_allocation_hint *hint,
+                    struct dt_object_format *dof,
+                    struct thandle *th)
+{
+       __u32 mode = (attr->la_mode & (S_IFMT | S_IRWXUGO | S_ISVTX | S_ISGID));
+
+       LASSERT(S_ISDIR(attr->la_mode));
+
+       return osd_mkfile(env, obj, mode, hint, th, attr);
+}
+
+static int osd_mk_index(const struct lu_env *env, struct osd_object *obj,
+                       struct lu_attr *attr,
+                       struct dt_allocation_hint *hint,
+                       struct dt_object_format *dof,
+                       struct thandle *th)
+{
+       __u32 mode = (attr->la_mode & (S_IFMT | S_IALLUGO | S_ISVTX));
+       const struct dt_index_features *feat = dof->u.dof_idx.di_feat;
+       struct memfs_inode_info *mei;
+       size_t keylen = 0;
+       size_t reclen = 0;
+       int rc;
+
+       ENTRY;
+
+       LASSERT(S_ISREG(attr->la_mode));
+
+       /* Only support index with fixed key length. */
+       if (feat->dif_flags & DT_IND_VARKEY)
+               RETURN(-EINVAL);
+
+       keylen = feat->dif_keysize_max;
+       if (!(feat->dif_flags & DT_IND_VARREC))
+               reclen = feat->dif_recsize_max;
+
+       rc = osd_mkfile(env, obj, mode, hint, th, attr);
+       if (rc)
+               GOTO(out, rc);
+
+       LASSERT(obj->oo_inode != NULL);
+       mei = MEMFS_I(obj->oo_inode);
+       mei->mei_index_type = INDEX_TYPE_HASH;
+       rc = hash_index_init(&mei->mei_hash_index, keylen, reclen);
+       if (rc) {
+               CERROR("%s: failed to create index for FID="DFID": rc=%d\n",
+                      osd_name(osd_obj2dev(obj)),
+                      PFID(lu_object_fid(&obj->oo_dt.do_lu)), rc);
+               /* TODO: cleanup @oo_inode... */
+       }
+out:
+       RETURN(rc);
+}
+
+static int osd_mkreg(const struct lu_env *env, struct osd_object *obj,
+                    struct lu_attr *attr,
+                    struct dt_allocation_hint *hint,
+                    struct dt_object_format *dof,
+                    struct thandle *th)
+{
+       LASSERT(S_ISREG(attr->la_mode));
+       return osd_mkfile(env, obj, (attr->la_mode &
+                        (S_IFMT | S_IALLUGO | S_ISVTX)), hint, th,
+                         attr);
+}
+
+static int osd_mksym(const struct lu_env *env, struct osd_object *obj,
+                    struct lu_attr *attr,
+                    struct dt_allocation_hint *hint,
+                    struct dt_object_format *dof,
+                    struct thandle *th)
+{
+       LASSERT(S_ISLNK(attr->la_mode));
+       /* TODO: symlink support. */
+       RETURN(-EOPNOTSUPP);
+}
+
+static int osd_mknod(const struct lu_env *env, struct osd_object *obj,
+                    struct lu_attr *attr,
+                    struct dt_allocation_hint *hint,
+                    struct dt_object_format *dof,
+                    struct thandle *th)
+{
+       umode_t mode = attr->la_mode & (S_IFMT | S_IALLUGO | S_ISVTX);
+       int result;
+
+       LASSERT(obj->oo_inode == NULL);
+       LASSERT(S_ISCHR(mode) || S_ISBLK(mode) ||
+               S_ISFIFO(mode) || S_ISSOCK(mode));
+
+       result = osd_mkfile(env, obj, mode, hint, th, attr);
+       if (result == 0) {
+               LASSERT(obj->oo_inode != NULL);
+               /*
+                * This inode should be marked dirty for i_rdev.  Currently
+                * that is done in the osd_attr_init().
+                */
+               init_special_inode(obj->oo_inode, obj->oo_inode->i_mode,
+                                  attr->la_rdev);
+       }
+       return result;
+}
+
+typedef int (*osd_obj_type_f)(const struct lu_env *env,
+                             struct osd_object *obj,
+                             struct lu_attr *attr,
+                             struct dt_allocation_hint *hint,
+                             struct dt_object_format *dof,
+                             struct thandle *th);
+
+static osd_obj_type_f osd_create_type_f(enum dt_format_type type)
+{
+       osd_obj_type_f result;
+
+       switch (type) {
+       case DFT_DIR:
+               result = osd_mkdir;
+               break;
+       case DFT_REGULAR:
+               result = osd_mkreg;
+               break;
+       case DFT_SYM:
+               result = osd_mksym;
+               break;
+       case DFT_NODE:
+               result = osd_mknod;
+               break;
+       case DFT_INDEX:
+               result = osd_mk_index;
+               break;
+       default:
+               LBUG();
+               break;
+       }
+       return result;
+}
+
+static void osd_attr_init(const struct lu_env *env, struct osd_object *obj,
+                         struct lu_attr *attr, struct dt_object_format *dof,
+                         struct thandle *handle)
+{
+       struct inode *inode = obj->oo_inode;
+       __u64 valid = attr->la_valid;
+       int result;
+
+       attr->la_valid &= ~(LA_TYPE | LA_MODE);
+
+       if (dof->dof_type != DFT_NODE)
+               attr->la_valid &= ~LA_RDEV;
+       if ((valid & LA_ATIME) &&
+           (attr->la_atime == inode_get_atime_sec(inode)))
+               attr->la_valid &= ~LA_ATIME;
+       if ((valid & LA_CTIME) &&
+           (attr->la_ctime == inode_get_ctime_sec(inode)))
+               attr->la_valid &= ~LA_CTIME;
+       if ((valid & LA_MTIME) &&
+           (attr->la_mtime == inode_get_mtime_sec(inode)))
+               attr->la_valid &= ~LA_MTIME;
+
+       /* TODO: Perform quota transfer. */
+
+       if (attr->la_valid != 0) {
+               result = osd_inode_setattr(env, inode, attr);
+               /*
+                * The osd_inode_setattr() should always succeed here.  The
+                * only error that could be returned is EDQUOT when we are
+                * trying to change the UID or GID of the inode. However, this
+                * should not happen since quota enforcement is no longer
+                * enabled on MemFS (lquota is supported and takes care of it).
+                */
+               LASSERTF(result == 0, "%d\n", result);
+       }
+
+       attr->la_valid = valid;
+}
+
+/* Helper function for osd_create(). */
+static int __osd_create(const struct lu_env *env, struct osd_object *obj,
+                       struct lu_attr *attr, struct dt_allocation_hint *hint,
+                       struct dt_object_format *dof, struct thandle *th)
+{
+       int result;
+       __u32 umask;
+
+       /* we drop umask so that permissions we pass are not affected */
+       umask = current->fs->umask;
+       current->fs->umask = 0;
+
+       result = osd_create_type_f(dof->dof_type)(env, obj, attr, hint, dof,
+                                                 th);
+       if (likely(obj->oo_inode && result == 0)) {
+               LASSERT(obj->oo_inode->i_state & I_NEW);
+
+               /*
+                * Unlock the inode before attr initialization to avoid
+                * unnecessary dqget operations. LU-6378
+                */
+               unlock_new_inode(obj->oo_inode);
+               osd_attr_init(env, obj, attr, dof, th);
+               __osd_object_init(obj);
+       }
+
+       /* restore previous umask value */
+       current->fs->umask = umask;
+
+       return result;
+}
+
+static void osd_ah_init(const struct lu_env *env, struct dt_allocation_hint *ah,
+                       struct dt_object *parent, struct dt_object *child,
+                       umode_t child_mode)
+{
+       LASSERT(ah);
+
+       ah->dah_parent = parent;
+}
+
+/* OSD layer object creation funcation for OST objects. */
+static int osd_create(const struct lu_env *env, struct dt_object *dt,
+                     struct lu_attr *attr, struct dt_allocation_hint *hint,
+                     struct dt_object_format *dof, struct thandle *th)
+{
+       const struct lu_fid *fid = lu_object_fid(&dt->do_lu);
+       struct osd_object *obj = osd_dt_obj(dt);
+       int rc;
+
+       ENTRY;
+
+       if (dt_object_exists(dt))
+               RETURN(-EEXIST);
+
+       LASSERT(!dt_object_remote(dt));
+       LASSERT(dt_write_locked(env, dt));
+
+       /* Quota files cannot be created from the kernel any more */
+       if (unlikely(fid_is_acct(fid)))
+               RETURN(-EPERM);
+
+       rc = __osd_create(env, obj, attr, hint, dof, th);
+       /* TODO: Update LMA EA with @fid. */
+       LASSERT(ergo(rc == 0,
+                    dt_object_exists(dt) && !dt_object_remote(dt)));
+       RETURN(rc);
+}
+
+static int osd_destroy(const struct lu_env *env, struct dt_object *dt,
+                      struct thandle *th)
+{
+       const struct lu_fid *fid = lu_object_fid(&dt->do_lu);
+       struct osd_object *obj = osd_dt_obj(dt);
+       struct inode *inode = obj->oo_inode;
+       struct osd_device *osd = osd_obj2dev(obj);
+
+       ENTRY;
+
+       LASSERT(inode);
+       LASSERT(!lu_object_is_dying(dt->do_lu.lo_header));
+
+       if (unlikely(fid_is_acct(fid)))
+               RETURN(-EPERM);
+
+       /* TODO: Agent entry remvoal... */
+       if (S_ISDIR(inode->i_mode)) {
+               if (inode->i_nlink > 2)
+                       CERROR("%s: dir "DFID" ino %lu nlink %u at unlink.\n",
+                              osd_name(osd), PFID(fid), inode->i_ino,
+                              inode->i_nlink);
+
+               spin_lock(&obj->oo_guard);
+               clear_nlink(inode);
+               spin_unlock(&obj->oo_guard);
+       }
+
+       set_bit(LU_OBJECT_HEARD_BANSHEE, &dt->do_lu.lo_header->loh_flags);
+       obj->oo_destroyed = 1;
+       CDEBUG(D_INODE,
+              "%s: Object "DFID" destroyed: inode@%pK nlink=%d mode=%#o\n",
+              osd_name(osd), PFID(lu_object_fid(&dt->do_lu)), inode,
+              inode->i_nlink, inode->i_mode);
+
+       RETURN(0);
+}
+
+/*
+ * Concurrency: @dt is write locked.
+ */
+static int osd_ref_add(const struct lu_env *env, struct dt_object *dt,
+                      struct thandle *th)
+{
+       struct osd_object *obj = osd_dt_obj(dt);
+       struct inode *inode = obj->oo_inode;
+       int rc = 0;
+
+       if (!dt_object_exists(dt) || obj->oo_destroyed)
+               return -ENOENT;
+
+       LASSERT(!dt_object_remote(dt));
+       LASSERT(dt_write_locked(env, dt));
+
+       CDEBUG(D_INODE, "%s:"DFID" increase nlink %d inode@%pK\n",
+              osd_name(osd_obj2dev(obj)), PFID(lu_object_fid(&dt->do_lu)),
+              inode->i_nlink, inode);
+       /*
+        * The DIR_NLINK feature allows directories to exceed LDISKFS_LINK_MAX
+        * (65000) subdirectories by storing "1" in i_nlink if the link count
+        * would otherwise overflow. Directory tranversal tools understand
+        * that (st_nlink == 1) indicates that the filesystem dose not track
+        * hard links count on the directory, and will not abort subdirectory
+        * scanning early once (st_nlink - 2) subdirs have been found.
+        *
+        * This also has to properly handle the case of inodes with nlink == 0
+        * in case they are being linked into the PENDING directory
+        */
+       spin_lock(&obj->oo_guard);
+       if (unlikely(inode->i_nlink == 0))
+               /* inc_nlink from 0 may cause WARN_ON */
+               set_nlink(inode, 1);
+       else
+               inc_nlink(inode);
+       spin_unlock(&obj->oo_guard);
+
+       return rc;
+}
+
+/*
+ * Concurrency: @dt is write locked.
+ */
+static int osd_ref_del(const struct lu_env *env, struct dt_object *dt,
+                      struct thandle *th)
+{
+       struct osd_object *obj = osd_dt_obj(dt);
+       struct inode *inode = obj->oo_inode;
+       struct osd_device *osd = osd_dev(dt->do_lu.lo_dev);
+
+       if (!dt_object_exists(dt))
+               return -ENOENT;
+
+       LASSERT(!dt_object_remote(dt));
+       LASSERT(dt_write_locked(env, dt));
+
+       if (CFS_FAIL_CHECK(OBD_FAIL_OSD_REF_DEL))
+               return -EIO;
+
+       spin_lock(&obj->oo_guard);
+       if (inode->i_nlink == 0) {
+               CDEBUG_LIMIT(fid_is_norm(lu_object_fid(&dt->do_lu)) ?
+                            D_ERROR : D_INODE, "%s: nlink == 0 on "DFID".\n",
+                            osd_name(osd), PFID(lu_object_fid(&dt->do_lu)));
+               spin_unlock(&obj->oo_guard);
+               return 0;
+       }
+
+       CDEBUG(D_INODE, DFID" decrease nlink %d inode@%pK\n",
+              PFID(lu_object_fid(&dt->do_lu)), inode->i_nlink, inode);
+
+       if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2)
+               drop_nlink(inode);
+       spin_unlock(&obj->oo_guard);
+
+       return 0;
+}
+
+/* Concurrency: @dt is write locked. */
+static int osd_xattr_set(const struct lu_env *env, struct dt_object *dt,
+                        const struct lu_buf *buf, const char *name, int fl,
+                        struct thandle *handle)
+{
+       struct inode *inode = osd_dt_obj(dt)->oo_inode;
+       int flags = 0;
+       int rc;
+
+       ENTRY;
+
+       LASSERT(inode);
+       LASSERT(buf);
+
+       if (fl & LU_XATTR_REPLACE)
+               flags |= XATTR_REPLACE;
+       if (fl & LU_XATTR_CREATE)
+               flags |= XATTR_CREATE;
+
+       /* FIXME: using VFS i_op->setxattr()? */
+       rc = memfs_xattr_set(inode, buf->lb_buf, buf->lb_len, name, flags);
+
+       RETURN(rc);
+}
+
+/* Concurrency: @dt is read locked. */
+static int osd_xattr_get(const struct lu_env *env, struct dt_object *dt,
+                        struct lu_buf *buf, const char *name)
+{
+       struct inode *inode = osd_dt_obj(dt)->oo_inode;
+       int rc;
+
+       ENTRY;
+       LASSERT(buf);
+
+       if (!dt_object_exists(dt))
+               RETURN(-ENOENT);
+
+       LASSERT(!dt_object_remote(dt));
+
+       /* FIXME: using VFS i_op->getxattr()? */
+       rc = memfs_xattr_get(inode, buf->lb_buf, buf->lb_len, name);
+       RETURN(rc);
+}
+
+/* Concurrency: @dt is write locked. */
+static int osd_xattr_del(const struct lu_env *env, struct dt_object *dt,
+                        const char *name, struct thandle *handle)
+{
+       struct inode *inode = osd_dt_obj(dt)->oo_inode;
+
+       if (!dt_object_exists(dt))
+               return -ENOENT;
+
+       LASSERT(!dt_object_remote(dt));
+       /* FIXME: using VFS i_op->removexattr() */
+       memfs_xattr_del(inode, name);
+
+       return 0;
+}
+
+/* TODO: Implement xattr listing. */
+static int osd_xattr_list(const struct lu_env *env, struct dt_object *dt,
+                         const struct lu_buf *buf)
+{
+       RETURN(0);
+}
+
+/* MemFS does not support object sync, return zero to ignore the error. */
+static int osd_object_sync(const struct lu_env *env, struct dt_object *dt,
+                          __u64 start, __u64 end)
+{
+       RETURN(0);
+}
+
+const struct dt_object_operations osd_obj_ops = {
+       .do_attr_get            = osd_attr_get,
+       .do_attr_set            = osd_attr_set,
+       .do_ah_init             = osd_ah_init,
+       .do_create              = osd_create,
+       .do_destroy             = osd_destroy,
+       .do_index_try           = osd_index_try,
+       .do_ref_add             = osd_ref_add,
+       .do_ref_del             = osd_ref_del,
+       .do_xattr_get           = osd_xattr_get,
+       .do_xattr_set           = osd_xattr_set,
+       .do_xattr_del           = osd_xattr_del,
+       .do_xattr_list          = osd_xattr_list,
+       .do_object_sync         = osd_object_sync,
+};
+
+const struct lu_object_operations osd_lu_obj_ops = {
+       .loo_object_init      = osd_object_init,
+       .loo_object_delete    = osd_object_delete,
+       .loo_object_release   = osd_object_release,
+       .loo_object_free      = osd_object_free,
+       .loo_object_print     = osd_object_print,
+};
diff --git a/lustre/osd-wbcfs/wbcfs.c b/lustre/osd-wbcfs/wbcfs.c
new file mode 100644 (file)
index 0000000..152c296
--- /dev/null
@@ -0,0 +1,1335 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * lustre/osd-wbcfs/osd_wbcfs.c
+ *
+ * Author: Yingjin Qian <qian@ddn.com>
+ */
+
+#define DEBUG_SUBSYSTEM        S_OSD
+
+#include <linux/namei.h>
+#include <linux/file.h>
+#include <linux/uidgid.h>
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/dirent.h>
+#include <linux/xattr.h>
+#include <linux/swap.h>
+#include <linux/statfs.h>
+#ifdef HAVE_FS_CONTEXT_H
+#include <linux/fs_context.h>
+#endif
+
+#include <lustre_compat.h>
+
+#include "wbcfs.h"
+
+#ifndef HAVE_USER_NAMESPACE_ARG
+#define inode_init_owner(ns, inode, dir, mode)  \
+       inode_init_owner(inode, dir, mode)
+#define memfs_mknod(ns, dir, dch, mode, rd)    memfs_mknod(dir, dch, mode, rd)
+#define memfs_mkdir(ns, dir, dch, mode)                memfs_mkdir(dir, dch, mode)
+#define memfs_create_nd(ns, dir, de, mode, ex) \
+       memfs_create_nd(dir, de, mode, ex)
+#endif /* HAVE_USER_NAMESPCE_ARG */
+
+/*
+ * In-memory xattr entry.
+ * Borrowed from osd-ldiskfs @osd_xattr_entry and @simple_xattrs in Linux
+ * kernel. This part of codes in-memory XATTRs should put into libcfs module.
+ * The first part of @mxe_buf is XATTR name, and is '\0' terminated.
+ * The left part is for value, binary mode.
+ */
+struct mem_xattr_entry {
+       struct list_head        mxe_list;
+       size_t                  mxe_len;
+       size_t                  mxe_namelen;
+       bool                    mxe_exist;
+       struct rcu_head         mxe_rcu;
+       char                    mxe_buf[];
+};
+
+static int mem_xattr_get(struct mem_xattrs *xattrs, const char *name,
+                        void *buf, size_t len)
+{
+       struct mem_xattr_entry *mxe = NULL;
+       struct mem_xattr_entry *tmp;
+       size_t namelen = strlen(name);
+       int rc;
+
+       ENTRY;
+
+       rcu_read_lock();
+       list_for_each_entry_rcu(tmp, &xattrs->mex_xattr_list, mxe_list) {
+               if (namelen == tmp->mxe_namelen &&
+                   strncmp(name, tmp->mxe_buf, namelen) == 0) {
+                       mxe = tmp;
+                       break;
+               }
+       }
+
+       if (mxe == NULL)
+               GOTO(out, rc = -ENODATA);
+
+       if (!mxe->mxe_exist)
+               GOTO(out, rc = -ENODATA);
+
+       /* Value length */
+       rc = mxe->mxe_len - sizeof(*mxe) - mxe->mxe_namelen - 1;
+       LASSERT(rc > 0);
+
+       if (buf == NULL)
+               GOTO(out, rc);
+
+       if (len < rc)
+               GOTO(out, rc = -ERANGE);
+
+       memcpy(buf, &mxe->mxe_buf[namelen + 1], rc);
+out:
+       rcu_read_unlock();
+       RETURN(rc);
+}
+
+static void mem_xattr_free(struct rcu_head *head)
+{
+       struct mem_xattr_entry *mxe;
+
+       mxe = container_of(head, struct mem_xattr_entry, mxe_rcu);
+       OBD_FREE(mxe, mxe->mxe_len);
+}
+
+static int mem_xattr_add(struct mem_xattrs *xattrs, const char *name,
+                        const char *buf, int buflen)
+{
+       struct mem_xattr_entry *mxe;
+       struct mem_xattr_entry *old = NULL;
+       struct mem_xattr_entry *tmp;
+       size_t namelen = strlen(name);
+       size_t len = sizeof(*mxe) + namelen + 1 + buflen;
+
+       ENTRY;
+
+       OBD_ALLOC(mxe, len);
+       if (mxe == NULL)
+               RETURN(-ENOMEM);
+
+       INIT_LIST_HEAD(&mxe->mxe_list);
+       mxe->mxe_len = len;
+       mxe->mxe_namelen = namelen;
+       memcpy(mxe->mxe_buf, name, namelen);
+       if (buflen > 0) {
+               LASSERT(buf != NULL);
+               memcpy(mxe->mxe_buf + namelen + 1, buf, buflen);
+               mxe->mxe_exist = true;
+       } else {
+               mxe->mxe_exist = false;
+       }
+
+       /* This should be rarely called, just remove old and add new */
+       spin_lock(&xattrs->mex_lock);
+       list_for_each_entry(tmp, &xattrs->mex_xattr_list, mxe_list) {
+               if (namelen == tmp->mxe_namelen &&
+                   strncmp(name, tmp->mxe_buf, namelen) == 0) {
+                       old = tmp;
+                       break;
+               }
+       }
+       if (old != NULL) {
+               list_replace_rcu(&old->mxe_list, &mxe->mxe_list);
+               call_rcu(&old->mxe_rcu, mem_xattr_free);
+       } else {
+               list_add_tail_rcu(&mxe->mxe_list, &xattrs->mex_xattr_list);
+       }
+       spin_unlock(&xattrs->mex_lock);
+
+       RETURN(0);
+}
+
+static void mem_xattr_del(struct mem_xattrs *xattrs, const char *name)
+{
+       struct mem_xattr_entry *mxe;
+       size_t namelen = strlen(name);
+
+       spin_lock(&xattrs->mex_lock);
+       list_for_each_entry(mxe, &xattrs->mex_xattr_list, mxe_list) {
+               if (namelen == mxe->mxe_namelen &&
+                   strncmp(name, mxe->mxe_buf, namelen) == 0) {
+                       list_del_rcu(&mxe->mxe_list);
+                       call_rcu(&mxe->mxe_rcu, mem_xattr_free);
+                       break;
+               }
+       }
+       spin_unlock(&xattrs->mex_lock);
+}
+
+static inline void mem_xattrs_init(struct mem_xattrs *xattrs)
+{
+       INIT_LIST_HEAD(&xattrs->mex_xattr_list);
+       spin_lock_init(&xattrs->mex_lock);
+}
+
+static void mem_xattrs_fini(struct mem_xattrs *xattrs)
+{
+       struct mem_xattr_entry *mxe, *next;
+
+       list_for_each_entry_safe(mxe, next, &xattrs->mex_xattr_list, mxe_list) {
+               list_del(&mxe->mxe_list);
+               OBD_FREE(mxe, mxe->mxe_len);
+       }
+}
+
+int memfs_xattr_get(struct inode *inode, void *buf, size_t len,
+                   const char *name)
+{
+       return mem_xattr_get(&MEMFS_I(inode)->mei_xattrs, name, buf, len);
+}
+
+int memfs_xattr_set(struct inode *inode, void *buf, size_t len,
+                   const char *name, int flags)
+{
+       return mem_xattr_add(&MEMFS_I(inode)->mei_xattrs, name, buf, len);
+}
+
+void memfs_xattr_del(struct inode *inode, const char *name)
+{
+       mem_xattr_del(&MEMFS_I(inode)->mei_xattrs, name);
+}
+
+static const struct super_operations memfs_ops;
+static const struct address_space_operations memfs_aops;
+static const struct file_operations memfs_file_operations;
+static const struct inode_operations memfs_inode_operations;
+static const struct file_operations memfs_dir_operations;
+static const struct inode_operations memfs_dir_inode_operations;
+static struct file_system_type memfs_fstype;
+
+static inline struct memfs_sb_info *MEMFS_SB(struct super_block *sb)
+{
+       return sb->s_fs_info;
+}
+
+static int memfs_reserve_inode(struct super_block *sb)
+{
+       return 0;
+}
+
+static void memfs_free_inode(struct super_block *sb)
+{
+}
+
+struct inode *memfs_create_inode(struct super_block *sb, struct inode *dir,
+                                umode_t mode, struct iattr *iattr, dev_t dev,
+                                bool update_link)
+{
+       struct memfs_sb_info *sbinfo = MEMFS_SB(sb);
+       struct memfs_inode_info *mei;
+       struct inode *inode;
+
+       ENTRY;
+
+       inode = new_inode(sb);
+       if (!inode)
+               RETURN(ERR_PTR(-ENOMEM));
+
+       if (iattr) {
+               uid_t owner[2] = { 0, 0 };
+
+               if (iattr->ia_valid & ATTR_UID)
+                       owner[0] = from_kuid(&init_user_ns, iattr->ia_uid);
+               if (iattr->ia_valid & ATTR_GID)
+                       owner[1] = from_kgid(&init_user_ns, iattr->ia_gid);
+
+               inode->i_mode = mode;
+               i_uid_write(inode, owner[0]);
+               i_gid_write(inode, owner[1]);
+       } else {
+               inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
+       }
+
+       if (iattr) {
+               if (iattr->ia_valid & ATTR_CTIME)
+                       inode_set_ctime_to_ts(inode, iattr->ia_ctime);
+               if (iattr->ia_valid & ATTR_MTIME)
+                       inode_set_mtime_to_ts(inode, iattr->ia_mtime);
+               if (iattr->ia_valid & ATTR_ATIME)
+                       inode_set_atime_to_ts(inode, iattr->ia_atime);
+       }
+
+       inode->i_blocks = 0;
+
+       mei = MEMFS_I(inode);
+       mei->mei_crtime = inode_get_mtime(inode);
+       mem_xattrs_init(&mei->mei_xattrs);
+       mei->mei_index_type = INDEX_TYPE_NONE;
+       cache_no_acl(inode);
+
+       if (sbinfo->msi_noswap)
+               mapping_set_unevictable(inode->i_mapping);
+
+       switch (mode & S_IFMT) {
+       case S_IFREG:
+               inode->i_mapping->a_ops = &memfs_aops;
+               inode->i_op = &memfs_inode_operations;
+               inode->i_fop = &memfs_file_operations;
+               break;
+       case S_IFDIR:
+               if (update_link)
+                       inc_nlink(inode);
+               /* Some things misbehave if size == 0 on a directory */
+               inode->i_size = 2 * BOGO_DIRENT_SIZE;
+               inode->i_op = &memfs_dir_inode_operations;
+               inode->i_fop = &memfs_dir_operations;
+               break;
+       case S_IFLNK:
+               break;
+       default:
+               CERROR("Unsupport file mode %#o\n", mode);
+               iput(inode);
+               /*
+                * TODO: Add support for other file types.
+                * Fix the error in sanity/test_28.
+                */
+               RETURN(ERR_PTR(-EOPNOTSUPP));
+       }
+
+       return inode;
+}
+
+static int memfs_mknod(struct mnt_idmap *map, struct inode *dir,
+                      struct dentry *dentry, umode_t mode, dev_t dev)
+{
+       struct inode *inode;
+
+       ENTRY;
+
+       inode = memfs_create_inode(dir->i_sb, dir, mode, NULL, dev, true);
+       if (IS_ERR(inode))
+               RETURN(PTR_ERR(inode));
+
+       dir->i_size += BOGO_DIRENT_SIZE;
+       inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
+       d_instantiate(dentry, inode);
+       dget(dentry); /* Extra count - pin the dentry in core */
+
+       RETURN(0);
+}
+
+static int memfs_mkdir(struct mnt_idmap *map, struct inode *dir,
+                      struct dentry *dchild, umode_t mode)
+{
+       int rc;
+
+       rc = memfs_mknod(map, dir, dchild, mode | S_IFDIR, 0);
+       if (rc)
+               return rc;
+
+       inc_nlink(dir);
+       return 0;
+}
+
+static int memfs_create_nd(struct mnt_idmap *map, struct inode *dir,
+                          struct dentry *dentry, umode_t mode, bool want_excl)
+{
+       return memfs_mknod(map, dir, dentry, mode | S_IFREG, 0);
+}
+
+static int memfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+       struct inode *inode = d_inode(dentry);
+
+       if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
+               memfs_free_inode(inode->i_sb);
+
+       dir->i_size -= BOGO_DIRENT_SIZE;
+       inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir,
+                             inode_set_ctime_current(inode)));
+       inode_inc_iversion(dir);
+       drop_nlink(inode);
+       dput(dentry);
+       return 0;
+}
+
+static int memfs_rmdir(struct inode *dir, struct dentry *dchild)
+{
+       if (!simple_empty(dchild))
+               return -ENOTEMPTY;
+
+       drop_nlink(d_inode(dchild));
+       drop_nlink(dir);
+       return memfs_unlink(dir, dchild);
+}
+
+static int memfs_link(struct dentry *old_dentry, struct inode *dir,
+                     struct dentry *dentry)
+{
+       struct inode *inode = d_inode(old_dentry);
+
+       ENTRY;
+
+       /*
+        * No ordinary (disk based) filesystem counts links as inodes;
+        * but each new link needs a new dentry, pinning lowmem, and
+        * tmpfs dentries cannot be pruned until they are unlinked.
+        * But if an O_TMPFILE file is linked into the tmpfs, the
+        * first link must skip that, to get the accounting right.
+        */
+       if (inode->i_nlink) {
+               int rc = 0;
+
+               rc = memfs_reserve_inode(inode->i_sb);
+               if (rc)
+                       RETURN(rc);
+       }
+
+       dir->i_size += BOGO_DIRENT_SIZE;
+       inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir,
+                             inode_set_ctime_current(inode)));
+       inode_inc_iversion(dir);
+       inc_nlink(inode);
+       ihold(inode);   /* New dentry reference */
+       dget(dentry);   /* Extra pinning count for the created dentry */
+       d_instantiate(dentry, inode);
+       return 0;
+}
+
+#ifdef HAVE_DENTRY_D_CHILDREN
+/* parent is locked at least shared */
+/*
+ * Returns an element of siblings' list.
+ * We are looking for <count>th positive after <p>; if
+ * found, dentry is grabbed and returned to caller.
+ * If no such element exists, NULL is returned.
+ */
+static struct dentry *scan_positives(struct dentry *cursor,
+                                       struct hlist_node **p,
+                                       loff_t count,
+                                       struct dentry *last)
+{
+       struct dentry *dentry = cursor->d_parent, *found = NULL;
+
+       spin_lock(&dentry->d_lock);
+       while (*p) {
+               struct dentry *d = hlist_entry(*p, struct dentry, d_sib);
+
+               p = &d->d_sib.next;
+               // we must at least skip cursors, to avoid livelocks
+               if (d->d_flags & DCACHE_DENTRY_CURSOR)
+                       continue;
+               if (simple_positive(d) && !--count) {
+                       spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
+                       if (simple_positive(d))
+                               found = dget_dlock(d);
+                       spin_unlock(&d->d_lock);
+                       if (likely(found))
+                               break;
+                       count = 1;
+               }
+               if (need_resched()) {
+                       if (!hlist_unhashed(&cursor->d_sib))
+                               __hlist_del(&cursor->d_sib);
+                       hlist_add_behind(&cursor->d_sib, &d->d_sib);
+                       p = &cursor->d_sib.next;
+                       spin_unlock(&dentry->d_lock);
+                       cond_resched();
+                       spin_lock(&dentry->d_lock);
+               }
+       }
+       spin_unlock(&dentry->d_lock);
+       dput(last);
+       return found;
+}
+
+/*
+ * Directory is locked and all positive dentries in it are safe, since
+ * for ramfs-type trees they can't go away without unlink() or rmdir(),
+ * both impossible due to the lock on directory.
+ */
+
+static int memfs_dcache_readdir(struct file *file, struct dir_context *ctx)
+{
+       struct dentry *dentry = file->f_path.dentry;
+       struct dentry *cursor = file->private_data;
+       struct memfs_dir_context *mctx = (struct memfs_dir_context *)ctx;
+       struct dentry *next = NULL;
+       struct hlist_node **p;
+
+       if (!dir_emit_dots(file, ctx))
+               return 0;
+
+       if (ctx->pos == 2)
+               p = &dentry->d_children.first;
+       else
+               p = &cursor->d_sib.next;
+
+       while ((next = scan_positives(cursor, p, 1, next)) != NULL) {
+               mctx->dentry = next;
+               if (!dir_emit(ctx, next->d_name.name, next->d_name.len,
+                             d_inode(next)->i_ino,
+                             fs_umode_to_dtype(d_inode(next)->i_mode)))
+                       break;
+               ctx->pos++;
+               p = &next->d_sib.next;
+       }
+       spin_lock(&dentry->d_lock);
+       hlist_del_init(&cursor->d_sib);
+       if (next)
+               hlist_add_before(&cursor->d_sib, &next->d_sib);
+       spin_unlock(&dentry->d_lock);
+       dput(next);
+
+       return 0;
+}
+
+#else /* !HAVE_DENTRY_D_CHILDREN */
+
+/* Relationship between i_mode and the DT_xxx types */
+static inline unsigned char dt_type(struct inode *inode)
+{
+       return (inode->i_mode >> 12) & 15;
+}
+
+/*
+ * linux/fs/libfs.c: simple_positive()
+ * Public in linux/include/linux/dcache.h
+ * kernel 4.1-rc3 commit dc3f4198eac14e52a98dfc79cd84b45e280f59cd
+ */
+static inline int __simple_positive(struct dentry *dentry)
+{
+       return dentry->d_inode && !d_unhashed(dentry);
+}
+
+/*
+ * Returns an element of siblings' list.
+ * We are looking for <count>th positive after <p>; if
+ * found, dentry is grabbed and returned to caller.
+ * If no such element exists, NULL is returned.
+ */
+/* parent is locked at least shared */
+static struct dentry *scan_positives(struct dentry *cursor,
+                                       struct list_head *p,
+                                       loff_t count,
+                                       struct dentry *last)
+{
+       struct dentry *dentry = cursor->d_parent, *found = NULL;
+
+       spin_lock(&dentry->d_lock);
+       while ((p = p->next) != &dentry->d_subdirs) {
+               struct dentry *d = list_entry(p, struct dentry, d_child);
+               /* We must at least skip cursors, to avoid livelocks */
+               if (d->d_flags & DCACHE_DENTRY_CURSOR)
+                       continue;
+               if (__simple_positive(d) && !--count) {
+                       spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
+                       if (__simple_positive(d))
+                               found = dget_dlock(d);
+                       spin_unlock(&d->d_lock);
+                       if (likely(found))
+                               break;
+                       count = 1;
+               }
+               if (need_resched()) {
+                       list_move(&cursor->d_child, p);
+                       p = &cursor->d_child;
+                       spin_unlock(&dentry->d_lock);
+                       cond_resched();
+                       spin_lock(&dentry->d_lock);
+               }
+       }
+       spin_unlock(&dentry->d_lock);
+       dput(last);
+       return found;
+}
+
+/* linux/fs/libfs.c: dcache_readdir() */
+/*
+ * Directory is locked and all positive dentries in it are safe, since
+ * for ramfs-type trees they can't go away without unlink() or rmdir(),
+ * both impossible due to the lock on directory.
+ */
+static int memfs_dcache_readdir(struct file *file, struct dir_context *ctx)
+{
+       struct dentry *dentry = file->f_path.dentry;
+       struct dentry *cursor = file->private_data;
+       struct list_head *anchor = &dentry->d_subdirs;
+       struct memfs_dir_context *mctx = (struct memfs_dir_context *)ctx;
+       struct dentry *next = NULL;
+       struct list_head *p;
+
+       if (!dir_emit_dots(file, ctx))
+               return 0;
+
+       if (ctx->pos == 2)
+               p = anchor;
+       else if (!list_empty(&cursor->d_child))
+               p = &cursor->d_child;
+       else
+               return 0;
+
+       while ((next = scan_positives(cursor, p, 1, next)) != NULL) {
+               /*
+                * TODO: Add a new f_flags O_HAVE_DIR_CONTEXT_EXT to
+                * distinguish the normal readdir() access from the user space.
+                */
+               mctx->dentry = next;
+               if (!dir_emit(ctx, next->d_name.name, next->d_name.len,
+                             d_inode(next)->i_ino, dt_type(d_inode(next))))
+                       break;
+               ctx->pos++;
+               p = &next->d_child;
+       }
+       spin_lock(&dentry->d_lock);
+       if (next)
+               list_move_tail(&cursor->d_child, &next->d_child);
+       else
+               list_del_init(&cursor->d_child);
+       spin_unlock(&dentry->d_lock);
+       dput(next);
+
+       return 0;
+}
+#endif /* HAVE_DENTRY_D_CHILDREN */
+
+/*
+ * Copied from @simple_write_end in the kernel.
+ * It does not export on the new kernel such as rhel9.
+ */
+static int memfs_write_end(struct file *file, struct address_space *mapping,
+                          loff_t pos, unsigned int len, unsigned int copied,
+                          struct page *page, void *fsdata)
+{
+       struct inode *inode = page->mapping->host;
+       loff_t last_pos = pos + copied;
+
+       /* zero the stale part of the page if we did a short copy */
+       if (!PageUptodate(page)) {
+               if (copied < len) {
+                       unsigned int from = pos & (PAGE_SIZE - 1);
+
+                       zero_user(page, from + copied, len - copied);
+               }
+               SetPageUptodate(page);
+       }
+       /*
+        * No need to use i_size_read() here, the i_size
+        * cannot change under us because we hold the i_mutex.
+        */
+       if (last_pos > inode->i_size)
+               i_size_write(inode, last_pos);
+
+       set_page_dirty(page);
+       unlock_page(page);
+       put_page(page);
+
+       return copied;
+}
+
+/* TODO: implement file splice read/write interface for MemFS. */
+static ssize_t memfs_file_splice_read(struct file *in_file, loff_t *ppos,
+                                     struct pipe_inode_info *pipe,
+                                     size_t count, unsigned int flags)
+{
+       RETURN(0);
+}
+
+/*
+ * linux/mm/shmem.c
+ * TODO: mmap support.
+ */
+static int memfs_getpage(struct inode *inode, pgoff_t index,
+                        struct page **pagep)
+{
+       struct address_space *mapping = inode->i_mapping;
+       struct page *page;
+
+       if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
+               return -EFBIG;
+
+       page = find_lock_page(mapping, index);
+       /* fallocated page? */
+       if (page && !PageUptodate(page)) {
+               unlock_page(page);
+               put_page(page);
+               page = NULL;
+       }
+
+       *pagep = page;
+       return 0;
+}
+
+#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
+/* linux/mm/shmem.c shmem_file_read_iter() */
+static ssize_t memfs_file_read_iter(struct kiocb *iocb,
+                                   struct iov_iter *to)
+{
+       struct file *file = iocb->ki_filp;
+       struct inode *inode = file_inode(file);
+       struct address_space *mapping = inode->i_mapping;
+       loff_t *ppos = &iocb->ki_pos;
+       unsigned long offset;
+       ssize_t retval = 0;
+       pgoff_t index;
+       int error = 0;
+
+       ENTRY;
+
+       /*
+        * Might this read be for a stacking filesystem?  Then when reading
+        * holes of a sparse file, we actually need to allocate those pages,
+        * and even mark them dirty, so it cannot exceed the max_blocks limit.
+        */
+
+       index = *ppos >> PAGE_SHIFT;
+       offset = *ppos & ~PAGE_MASK;
+
+       for (;;) {
+               struct page *page = NULL;
+               pgoff_t end_index;
+               unsigned long nr, ret;
+               loff_t i_size = i_size_read(inode);
+
+               end_index = i_size >> PAGE_SHIFT;
+               if (index > end_index)
+                       break;
+               if (index == end_index) {
+                       nr = i_size & ~PAGE_MASK;
+                       if (nr <= offset)
+                               break;
+               }
+
+               error = memfs_getpage(inode, index, &page);
+               if (error) {
+                       if (error == -EINVAL)
+                               error = 0;
+                       break;
+               }
+               if (page)
+                       unlock_page(page);
+
+               /*
+                * We must evaluate after, since reads (unlike writes)
+                * are called without i_mutex protection against truncate
+                */
+               nr = PAGE_SIZE;
+               i_size = i_size_read(inode);
+               end_index = i_size >> PAGE_SHIFT;
+               if (index == end_index) {
+                       nr = i_size & ~PAGE_MASK;
+                       if (nr <= offset) {
+                               if (page)
+                                       put_page(page);
+                               break;
+                       }
+               }
+               nr -= offset;
+
+               if (page) {
+                       /*
+                        * If users can be writing to this page using arbitrary
+                        * virtual addresses, take care about potential aliasing
+                        * before reading the page on the kernel side.
+                        */
+                       if (mapping_writably_mapped(mapping))
+                               flush_dcache_page(page);
+                       /*
+                        * Mark the page accessed if we read the beginning.
+                        */
+                       if (!offset)
+                               mark_page_accessed(page);
+               } else {
+                       page = ZERO_PAGE(0);
+                       get_page(page);
+               }
+
+               /*
+                * Ok, we have the page, and it's up-to-date, so
+                * now we can copy it to user space...
+                */
+               ret = copy_page_to_iter(page, offset, nr, to);
+               retval += ret;
+               offset += ret;
+               index += offset >> PAGE_SHIFT;
+               offset &= ~PAGE_MASK;
+
+               put_page(page);
+               if (!iov_iter_count(to))
+                       break;
+               if (ret < nr) {
+                       error = -EFAULT;
+                       break;
+               }
+               cond_resched();
+       }
+
+       *ppos = ((loff_t) index << PAGE_SHIFT) + offset;
+       file_accessed(file);
+       return retval ? retval : error;
+}
+
+/* TODO: space limiting for write. */
+static ssize_t memfs_file_write_iter(struct kiocb *iocb,
+                                    struct iov_iter *iter)
+{
+       RETURN(generic_file_write_iter(iocb, iter));
+}
+
+#else
+
+/*
+ * It can not use simple_readpage() directly in Linux ramfs especially when
+ * there are holes in the file which is cached MemFS. It must rewrite the read
+ * VFS interface similar to Linux tmpfs.
+ */
+/* linux/mm/filemap.c */
+static int memfs_file_read_actor(read_descriptor_t *desc, struct page *page,
+                                unsigned long offset, unsigned long size)
+{
+       char *kaddr;
+       unsigned long left, count = desc->count;
+
+       if (size > count)
+               size = count;
+
+       /*
+        * Faults on the destination of a read are common, so do it before
+        * taking the kmap.
+        */
+       if (IS_ENABLED(CONFIG_HIGHMEM) &&
+           !fault_in_pages_writeable(desc->arg.buf, size)) {
+               kaddr = kmap_atomic(page);
+               left = __copy_to_user_inatomic(desc->arg.buf,
+                                               kaddr + offset, size);
+               kunmap_atomic(kaddr);
+               if (left == 0)
+                       goto success;
+       }
+
+       /* Do it the slow way */
+       kaddr = kmap(page);
+       left = __copy_to_user(desc->arg.buf, kaddr + offset, size);
+       kunmap(page);
+
+       if (left) {
+               size -= left;
+               desc->error = -EFAULT;
+       }
+success:
+       desc->count = count - size;
+       desc->written += size;
+       desc->arg.buf += size;
+       return size;
+}
+
+/* linux/mm/shmem.c do_shmem_file_read() */
+static void do_memfs_file_read(struct file *filp,
+                              loff_t *ppos, read_descriptor_t *desc,
+                              read_actor_t actor)
+{
+       struct inode *inode = file_inode(filp);
+       struct address_space *mapping = inode->i_mapping;
+       pgoff_t index;
+       unsigned long offset;
+
+       /*
+        * Might this read be for a stacking filesystem?  Then when reading
+        * holes of a sparse file, we actually need to allocate those pages,
+        * and even mark them dirty, so it cannot exceed the max_blocks limit.
+        */
+
+       index = *ppos >> PAGE_SHIFT;
+       offset = *ppos & ~PAGE_MASK;
+
+       for (;;) {
+               struct page *page = NULL;
+               pgoff_t end_index;
+               unsigned long nr, ret;
+               loff_t i_size = i_size_read(inode);
+
+               end_index = i_size >> PAGE_SHIFT;
+               if (index > end_index)
+                       break;
+               if (index == end_index) {
+                       nr = i_size & ~PAGE_MASK;
+                       if (nr <= offset)
+                               break;
+               }
+
+               desc->error = memfs_getpage(inode, index, &page);
+               if (desc->error) {
+                       if (desc->error == -EINVAL)
+                               desc->error = 0;
+                       break;
+               }
+               if (page)
+                       unlock_page(page);
+
+               /*
+                * We must evaluate after, since reads (unlike writes)
+                * are called without i_mutex protection against truncate
+                */
+               nr = PAGE_SIZE;
+               i_size = i_size_read(inode);
+               end_index = i_size >> PAGE_SHIFT;
+               if (index == end_index) {
+                       nr = i_size & ~PAGE_MASK;
+                       if (nr <= offset) {
+                               if (page)
+                                       put_page(page);
+                               break;
+                       }
+               }
+               nr -= offset;
+
+               if (page) {
+                       /*
+                        * If users can be writing to this page using arbitrary
+                        * virtual addresses, take care about potential aliasing
+                        * before reading the page on the kernel side.
+                        */
+                       if (mapping_writably_mapped(mapping))
+                               flush_dcache_page(page);
+                       /*
+                        * Mark the page accessed if we read the beginning.
+                        */
+                       if (!offset)
+                               mark_page_accessed(page);
+               } else {
+                       page = ZERO_PAGE(0);
+                       get_page(page);
+               }
+
+               /*
+                * Ok, we have the page, and it's up-to-date, so
+                * now we can copy it to user space...
+                *
+                * The actor routine returns how many bytes were actually used..
+                * NOTE! This may not be the same as how much of a user buffer
+                * we filled up (we may be padding etc), so we can only update
+                * "pos" here (the actor routine has to update the user buffer
+                * pointers and the remaining count).
+                */
+               ret = actor(desc, page, offset, nr);
+               offset += ret;
+               index += offset >> PAGE_SHIFT;
+               offset &= ~PAGE_MASK;
+
+               put_page(page);
+               if (ret != nr || !desc->count)
+                       break;
+
+               cond_resched();
+       }
+
+       *ppos = ((loff_t) index << PAGE_SHIFT) + offset;
+       file_accessed(filp);
+}
+
+static ssize_t memfs_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
+                                  unsigned long nr_segs, loff_t pos)
+{
+       struct file *filp = iocb->ki_filp;
+       ssize_t retval;
+       unsigned long seg;
+       size_t count;
+       loff_t *ppos = &iocb->ki_pos;
+
+       retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
+       if (retval)
+               return retval;
+
+       for (seg = 0; seg < nr_segs; seg++) {
+               read_descriptor_t desc;
+
+               desc.written = 0;
+               desc.arg.buf = iov[seg].iov_base;
+               desc.count = iov[seg].iov_len;
+               if (desc.count == 0)
+                       continue;
+               desc.error = 0;
+               do_memfs_file_read(filp, ppos, &desc, memfs_file_read_actor);
+               retval += desc.written;
+               if (desc.error) {
+                       retval = retval ?: desc.error;
+                       break;
+               }
+               if (desc.count > 0)
+                       break;
+       }
+       return retval;
+}
+
+static ssize_t memfs_file_read(struct file *file, char __user *buf,
+                              size_t count, loff_t *ppos)
+{
+       RETURN(do_sync_read(file, buf, count, ppos));
+}
+
+static ssize_t memfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
+                                   unsigned long nr_segs, loff_t pos)
+{
+       RETURN(generic_file_aio_write(iocb, iov, nr_segs, pos));
+}
+
+static ssize_t memfs_file_write(struct file *file, const char __user *buf,
+                               size_t count, loff_t *ppos)
+{
+       RETURN(do_sync_write(file, buf, count, ppos));
+}
+#endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
+
+static void memfs_put_super(struct super_block *sb)
+{
+       struct memfs_sb_info *sbinfo = MEMFS_SB(sb);
+
+       OBD_FREE_PTR(sbinfo);
+       sb->s_fs_info = NULL;
+}
+
+#ifdef HAVE_FS_CONTEXT_H
+static int memfs_fill_super(struct super_block *sb, struct fs_context *fc)
+{
+       struct memfs_options *ctx = fc->fs_private;
+       struct memfs_sb_info *sbinfo;
+       struct inode *inode;
+       int rc;
+
+       ENTRY;
+
+       OBD_ALLOC_PTR(sbinfo);
+       if (!sbinfo)
+               return -ENOMEM;
+
+       sb->s_fs_info = sbinfo;
+       sb->s_flags |= SB_NOUSER | SB_NOSEC;
+
+       sbinfo->msi_uid = ctx->meo_uid;
+       sbinfo->msi_gid = ctx->meo_gid;
+       sbinfo->msi_mode = ctx->meo_mode;
+       sbinfo->msi_max_blocks = ctx->meo_blocks;
+       sbinfo->msi_free_inodes = sbinfo->msi_max_inodes = ctx->meo_inodes;
+       /* Swap space for the larger capacity is not supported. */
+       sbinfo->msi_noswap = true;
+
+       sb->s_maxbytes = MAX_LFS_FILESIZE;
+       sb->s_blocksize = PAGE_SIZE;
+       sb->s_blocksize_bits = PAGE_SHIFT;
+       sb->s_magic = WBCFS_MAGIC;
+       sb->s_op = &memfs_ops;
+       sb->s_d_op = &simple_dentry_operations;
+       sb->s_time_gran = 1;
+       uuid_gen(&sb->s_uuid);
+
+       inode = memfs_create_inode(sb, NULL, S_IFDIR | sbinfo->msi_mode,
+                                  NULL, 0, true);
+       if (IS_ERR(inode))
+               GOTO(out_fail, rc = PTR_ERR(inode));
+
+       inode->i_uid = sbinfo->msi_uid;
+       inode->i_gid = sbinfo->msi_gid;
+       sb->s_root = d_make_root(inode);
+       if (!sb->s_root)
+               GOTO(out_fail, rc = -ENOMEM);
+
+       RETURN(0);
+out_fail:
+       memfs_put_super(sb);
+       RETURN(rc);
+}
+
+static int memfs_get_tree(struct fs_context *fc)
+{
+       return get_tree_nodev(fc, memfs_fill_super);
+}
+
+static void memfs_free_fc(struct fs_context *fc)
+{
+       struct memfs_options *ctx = fc->fs_private;
+
+       if (ctx)
+               OBD_FREE_PTR(ctx);
+}
+
+static const struct fs_context_operations memfs_context_ops = {
+       .free           = memfs_free_fc,
+       .get_tree       = memfs_get_tree,
+};
+
+static int memfs_init_fs_context(struct fs_context *fc)
+{
+       struct memfs_options *ctx;
+
+       OBD_ALLOC_PTR(ctx);
+       if (!ctx)
+               return -ENOMEM;
+
+       ctx->meo_mode = 0777 | S_ISVTX;
+       ctx->meo_uid = current_fsuid();
+       ctx->meo_gid = current_fsgid();
+
+       fc->fs_private = ctx;
+       fc->ops = &memfs_context_ops;
+       return 0;
+}
+
+#else /* !HAVE_FS_CONTEXT_H */
+
+static int memfs_fill_super(struct super_block *sb, void *data, int silent)
+{
+       struct memfs_sb_info *sbinfo;
+       struct inode *inode;
+       int rc;
+
+       /* Round up to L1_CACHE_BYTES to resist false sharing */
+       OBD_ALLOC_PTR(sbinfo);
+       if (!sbinfo)
+               return -ENOMEM;
+
+       sbinfo->msi_mode = S_IRWXUGO | S_ISVTX;
+       sbinfo->msi_uid = current_fsuid();
+       sbinfo->msi_gid = current_fsgid();
+       sb->s_fs_info = sbinfo;
+
+       /*
+        * Per default we only allow half of the physical ram per
+        * tmpfs instance, limiting inodes to one per page of lowmem;
+        * but the internal instance is left unlimited.
+        */
+       if (!(sb->s_flags & MS_KERNMOUNT)) {
+               sbinfo->msi_max_blocks = memfs_default_max_blocks();
+               sbinfo->msi_max_inodes = memfs_default_max_inodes();
+       } else {
+               sb->s_flags |= MS_NOUSER;
+       }
+
+       sb->s_flags |= MS_NOSEC | MS_NOUSER;
+       sbinfo->msi_free_inodes = sbinfo->msi_max_inodes;
+       sb->s_maxbytes = MAX_LFS_FILESIZE;
+       sb->s_blocksize = PAGE_SIZE;
+       sb->s_blocksize_bits = PAGE_SHIFT;
+       sb->s_magic = WBCFS_MAGIC;
+       sb->s_op = &memfs_ops;
+       sb->s_d_op = &simple_dentry_operations;
+       sb->s_time_gran = 1;
+
+       inode = memfs_create_inode(sb, NULL, S_IFDIR | sbinfo->msi_mode, NULL,
+                                  0, true);
+       if (IS_ERR(inode))
+               GOTO(out_fail, rc = PTR_ERR(inode));
+
+       inode->i_uid = sbinfo->msi_uid;
+       inode->i_gid = sbinfo->msi_gid;
+       sb->s_root = d_make_root(inode);
+       if (!sb->s_root)
+               GOTO(out_fail, rc = -ENOMEM);
+       return 0;
+out_fail:
+       memfs_put_super(sb);
+       return rc;
+}
+
+static struct dentry *memfs_mount(struct file_system_type *fs_type,
+                                 int flags, const char *dev_name, void *data)
+{
+       return mount_nodev(fs_type, flags, data, memfs_fill_super);
+}
+#endif /* HAVE_FS_CONTEXT_H */
+
+static struct kmem_cache *memfs_inode_cachep;
+
+static struct inode *memfs_alloc_inode(struct super_block *sb)
+{
+       struct memfs_inode_info *mei;
+
+       mei = kmem_cache_alloc(memfs_inode_cachep, GFP_KERNEL);
+       if (!mei)
+               return NULL;
+
+       return &mei->mei_vfs_inode;
+}
+
+static void memfs_destroy_callback(struct rcu_head *head)
+{
+       struct inode *inode = container_of(head, struct inode, i_rcu);
+
+       ENTRY;
+       /* TOOD: free symlink name. */
+       kmem_cache_free(memfs_inode_cachep, MEMFS_I(inode));
+       EXIT;
+}
+
+static void memfs_destroy_inode(struct inode *inode)
+{
+       struct memfs_inode_info *mei = MEMFS_I(inode);
+
+       if (mei->mei_index_type == INDEX_TYPE_HASH)
+               hash_index_fini(&mei->mei_hash_index);
+
+       call_rcu(&inode->i_rcu, memfs_destroy_callback);
+}
+
+static void memfs_init_inode(void *foo)
+{
+       struct memfs_inode_info *mei = (struct memfs_inode_info *)foo;
+
+       inode_init_once(&mei->mei_vfs_inode);
+}
+
+static void memfs_init_inodecache(void)
+{
+       memfs_inode_cachep = kmem_cache_create("memfs_inode_cache",
+                                              sizeof(struct memfs_inode_info),
+                                              0, SLAB_PANIC | SLAB_ACCOUNT,
+                                              memfs_init_inode);
+}
+
+static void memfs_destroy_inodecache(void)
+{
+       kmem_cache_destroy(memfs_inode_cachep);
+}
+
+static inline bool memfs_mapping(struct address_space *mapping)
+{
+       return mapping->a_ops == &memfs_aops;
+}
+
+static void memfs_evict_inode(struct inode *inode)
+{
+       struct memfs_inode_info *mei = MEMFS_I(inode);
+
+       if (memfs_mapping(inode->i_mapping)) {
+               inode->i_size = 0;
+               mapping_set_exiting(inode->i_mapping);
+               truncate_inode_pages_range(inode->i_mapping, 0, (loff_t)-1);
+       }
+
+       mem_xattrs_fini(&mei->mei_xattrs);
+       memfs_free_inode(inode->i_sb);
+       clear_inode(inode);
+}
+
+static int memfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+       struct memfs_sb_info *sbinfo = MEMFS_SB(dentry->d_sb);
+
+       buf->f_type = WBCFS_MAGIC;
+       buf->f_bsize = PAGE_SIZE;
+       buf->f_namelen = NAME_MAX;
+       if (sbinfo->msi_max_blocks) {
+               buf->f_blocks = sbinfo->msi_max_blocks;
+               buf->f_bavail =
+               buf->f_bfree  = sbinfo->msi_max_blocks -
+                               percpu_counter_sum(&sbinfo->msi_used_blocks);
+       }
+       if (sbinfo->msi_max_inodes) {
+               buf->f_files = sbinfo->msi_max_inodes;
+               buf->f_ffree = sbinfo->msi_free_inodes;
+       }
+       /* else leave those fields 0 like simple_statfs */
+
+       return 0;
+}
+
+static const struct super_operations memfs_ops = {
+       .alloc_inode    = memfs_alloc_inode,
+       .destroy_inode  = memfs_destroy_inode,
+       .statfs         = memfs_statfs,
+       .evict_inode    = memfs_evict_inode,
+       .drop_inode     = generic_delete_inode,
+       .put_super      = memfs_put_super,
+};
+
+/*
+ * TODO: Using the new kernel data structure Maple Tree:
+ * @simple_offset_dir_operations to manage and access the dentries
+ * within a directory. It is much efficient than linear list.
+ */
+static const struct file_operations memfs_dir_operations = {
+       .open           = dcache_dir_open,
+       .release        = dcache_dir_close,
+       .llseek         = dcache_dir_lseek,
+       .read           = generic_read_dir,
+       .iterate_shared = memfs_dcache_readdir,
+       .fsync          = noop_fsync,
+};
+
+static const struct inode_operations memfs_dir_inode_operations = {
+       .mknod          = memfs_mknod,
+       .lookup         = simple_lookup,
+       .create         = memfs_create_nd,
+       .unlink         = memfs_unlink,
+       .mkdir          = memfs_mkdir,
+       .rmdir          = memfs_rmdir,
+       .link           = memfs_link,
+       .setattr        = simple_setattr,
+       .getattr        = simple_getattr,
+};
+
+static const struct file_operations memfs_file_operations = {
+#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
+# ifdef HAVE_SYNC_READ_WRITE
+       .read           = new_sync_read,
+       .write          = new_sync_write,
+# endif
+       .read_iter      = memfs_file_read_iter,
+       .write_iter     = memfs_file_write_iter,
+#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
+       .read           = memfs_file_read,
+       .aio_read       = memfs_file_aio_read,
+       .write          = memfs_file_write,
+       .aio_write      = memfs_file_aio_write,
+#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
+       .mmap           = generic_file_mmap,
+       .llseek         = generic_file_llseek,
+       .splice_read    = memfs_file_splice_read,
+       .fsync          = noop_fsync,
+};
+
+static const struct address_space_operations memfs_aops = {
+#ifdef HAVE_DIRTY_FOLIO
+       .dirty_folio    = noop_dirty_folio,
+#else
+       /*
+        * TODO: reimplemet ->set_page_dirty() interface.
+        * - The call __set_page_dirty_nobuffers will mark the inode dirty and
+        *   put the inode into the writeback control list. Instead, it would
+        *   better to call mark_inode_dirty() only one time when close the file
+        *   once the file data was modified.
+        * - Here it can be optimized to use light weight function:
+        *   __set_page_dirty_no_writeback(); The writeback related data
+        *   structure can be delayed to initilize during data assimliation.
+        */
+       .set_page_dirty = __set_page_dirty_nobuffers,
+#endif
+       .write_begin    = simple_write_begin,
+       .write_end      = memfs_write_end,
+};
+
+static struct file_system_type memfs_fstype = {
+       .owner                  = THIS_MODULE,
+       .name                   = "wbcfs",
+#ifdef HAVE_FS_CONTEXT_H
+       .init_fs_context        = memfs_init_fs_context,
+#else
+       .mount                  = memfs_mount,
+#endif
+       .kill_sb                = kill_litter_super,
+       .fs_flags               = FS_USERNS_MOUNT,
+};
+
+int memfs_init(void)
+{
+       int rc;
+
+       memfs_init_inodecache();
+       rc = register_filesystem(&memfs_fstype);
+       if (rc)
+               memfs_destroy_inodecache();
+
+       return rc;
+}
+
+void memfs_fini(void)
+{
+       unregister_filesystem(&memfs_fstype);
+       memfs_destroy_inodecache();
+}
diff --git a/lustre/osd-wbcfs/wbcfs.h b/lustre/osd-wbcfs/wbcfs.h
new file mode 100644 (file)
index 0000000..fdcf746
--- /dev/null
@@ -0,0 +1,183 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Copyright (c) 2025-2026, DDN/Whamcloud, Inc.
+ */
+
+/*
+ * Embed memory file system with writeback support that using for OSD.
+ *
+ * Author: Yingjin Qian <qian@ddn.com>
+ */
+
+#ifndef _OSD_WBCFS_H_
+#define _OSD_WBCFS_H_
+
+#include <linux/spinlock.h>
+#include <linux/uidgid.h>
+#include <linux/percpu.h>
+#ifdef HAVE_INODE_IVERSION
+#include <linux/iversion.h>
+#else
+#define inode_peek_iversion(__inode)    ((__inode)->i_version)
+#define inode_inc_iversion(__inode)
+#endif
+
+#include <lustre_fid.h>
+
+#include "index.h"
+
+/* Pretend that each entry is of this size in directory's i_size */
+#define BOGO_DIRENT_SIZE       20
+
+/* Pretend that one inode + its dentry occupy this much memory */
+#define BOGO_INODE_SIZE                1024
+
+#define WBCFS_MAGIC            0xbdacbd05
+
+/* In-memory xattr list */
+struct mem_xattrs {
+       spinlock_t              mex_lock;
+       struct list_head        mex_xattr_list;
+};
+
+struct memfs_options {
+       unsigned long long      meo_blocks;
+       unsigned long long      meo_inodes;
+       kuid_t                  meo_uid;
+       kgid_t                  meo_gid;
+       umode_t                 meo_mode;
+       bool                    meo_noswap;
+};
+
+struct memfs_sb_info {
+       /* How many blocks are allowed. */
+       unsigned long           msi_max_blocks;
+       /* How many blocks are allocated. */
+       struct percpu_counter   msi_used_blocks;
+       /* How many inodes are allowed. */
+       unsigned long           msi_max_inodes;
+       /* How much ispace left for allocation. */
+       unsigned long           msi_free_inodes;
+       /* Serialize memfs_sb_info changes. */
+       spinlock_t              msi_stat_lock;
+       /* Mount mode for root directory */
+       umode_t                 msi_mode;
+       /* Mount uid for root directory */
+       kuid_t                  msi_uid;
+       /* Mount gid for root directory */
+       kgid_t                  msi_gid;
+       /* Whether enable swap with much larger capacity. */
+       bool                    msi_noswap;
+       /* Whether there is backing persistent store. */
+       bool                    msi_no_backing;
+       /* TODO: Quota limits support for MemFS. */
+};
+
+enum index_type {
+       INDEX_TYPE_NONE = 0,
+       INDEX_TYPE_HASH,
+       INDEX_TYPE_MTREE,
+};
+
+/* MemFS inode in-kernel data */
+struct memfs_inode_info {
+       __u32                    mei_flags;
+       struct mem_xattrs        mei_xattrs;
+       struct lu_fid            mei_fid;
+#ifdef HAVE_PROJECT_QUOTA
+       /* Project ID */
+       kprojid_t                mei_projid;
+#endif
+       /* File creation time. */
+       struct timespec64        mei_crtime;
+       /*
+        * Index access for dir dentry or indexing KV store.
+        * Currently only support hash index with linear iterating.
+        * Next step add Maple Tree index.
+        * TODO: use maple tree to manage dir entries under this dir.
+        */
+       enum index_type          mei_index_type;
+       struct hash_index        mei_hash_index;
+       /* Stack backing inode with the persistent storage. */
+       struct inode            *mei_backing;
+       struct inode             mei_vfs_inode;
+};
+
+#define MEMFS_I(inode) (container_of(inode, struct memfs_inode_info, \
+                                    mei_vfs_inode))
+
+#define MEMFS_DIR_EOF   ((1ULL << (64 - 1)) - 1)
+
+struct memfs_dir_context {
+       struct dir_context       super;
+       struct dentry           *dentry;
+       void                    *cbdata;
+};
+
+#ifdef HAVE_PROJECT_QUOTA
+static inline __u32 i_projid_read(struct inode *inode)
+{
+       return (__u32)from_kprojid(&init_user_ns, MEMFS_I(inode)->mei_projid);
+}
+
+static inline void i_projid_write(struct inode *inode, __u32 projid)
+{
+       kprojid_t kprojid;
+
+       kprojid = make_kprojid(&init_user_ns, (projid_t)projid);
+       MEMFS_I(inode)->mei_projid = kprojid;
+}
+#else
+static inline uid_t i_projid_read(struct inode *inode)
+{
+       return 0;
+}
+static inline void i_projid_write(struct inode *inode, __u32 projid)
+{
+}
+#endif
+
+static inline int memfs_test_inode_by_fid(struct inode *inode, void *opaque)
+{
+       return lu_fid_eq(&MEMFS_I(inode)->mei_fid, opaque);
+}
+
+static inline __u64 memfs_get_btime(struct inode *inode)
+{
+       return MEMFS_I(inode)->mei_crtime.tv_sec;
+}
+
+static inline __u32 memfs_get_flags(struct inode *inode)
+{
+       return MEMFS_I(inode)->mei_flags;
+}
+
+static inline unsigned long memfs_default_max_blocks(void)
+{
+       return cfs_totalram_pages() / 2;
+}
+
+static inline unsigned long memfs_default_max_inodes(void)
+{
+       unsigned long nr_pages = cfs_totalram_pages();
+
+       /*
+        * return min(nr_pages - totalhigh_pages(), nr_pages / 2);
+        */
+       return nr_pages / 2;
+}
+
+int memfs_xattr_get(struct inode *inode, void *buf, size_t len,
+                   const char *name);
+int memfs_xattr_set(struct inode *inode, void *buf, size_t len,
+                   const char *name, int flags);
+void memfs_xattr_del(struct inode *inode, const char *name);
+
+struct inode *memfs_create_inode(struct super_block *sb, struct inode *dir,
+                                umode_t mode, struct iattr *iattr, dev_t dev,
+                                bool update_link);
+
+int memfs_init(void);
+void memfs_fini(void);
+#endif /* _OSD_WBCFS_H_ */
index 0c8d90a..2d6238f 100755 (executable)
@@ -86,6 +86,105 @@ if [[ "$CLIENT_OS_ID_LIKE" =~ "rhel" ]]; then
        fi
 fi
 
+if [[ "$FSTYPE" = "wbcfs" ]]; then
+       # Lack of lprocfs support
+       always_except LU-18813 0f 27A 53 66 270a
+       # lack of lprocfs: osd.*.nonrotational
+       always_except LU-18813 119e 119f 119g 119h
+       # No stats (similar to openZFS)
+       always_except LU-18813 156
+       # MemFS-based OSD (wbcfs) cannot recovery from a server restart
+       always_except LU-18813 17o 27oo 27z 27F  60a 64i 232 257
+       always_except LU-18813 278 280  427 801c 818 820
+       # Symlink/CHR/SOCK/FIFO/BLK file types do not support
+       always_except LU-18813 17a 17b 17e 17g 17i 17p 21   25a
+       always_except LU-18813 25b 26a 26c 26d 26e 26f 27ga 27Q
+       always_except LU-18813 28  32e 32f 32g 32h 32m 32n  32o
+       always_except LU-18813 32p 48a 54a 54c 54d 56l 56m  56n  56rd
+       always_except LU-18813 56xb 56eb 56eg 56eh 56ei 133a 140 170b
+       always_except LU-18813 162a 226a
+       # Truncate operation is not supported yet.
+       always_except LU-18813 27p 27q 34a
+       # cross directory hardlink in DNE env
+       always_except LU-18813 31g 31l 31m
+       # FMD not expired: cannot reproduce on local testing
+       always_except LU-18813 36g
+       # Filemap is not supported yet.
+       always_except LU-18813 44f 130a 130b 130c 130d 130e 130i 430a
+       # inodes/blocks space usage accounting and statfs() is not supported
+       always_except LU-18813 51b 56ab 81b 220 413 418 806
+       # lsattr: append-only/immutable flags
+       always_except LU-18813 52a 52b
+       # xattr_list() is not implemented yet
+       always_except LU-18813 102a 102h 102i 102r 102t
+       # linkea and fid2path wrong...
+       always_except LU-18813 154B 154f 154g
+       # changelog related failures: wbcfs-target device label is not correct
+       always_except LU-18813 160 161c 161d 205a 65k 807 808 812
+       # DNE does not work well
+       always_except LU-18813 56 65e 65a 406
+       # user.job XATTR
+       always_except LU-18813 205h
+       # Exclusive open timeout
+       always_except LU-18813 208
+       # OFD access log failure
+       always_except LU-18813 165
+       # rename() operations: the source may not empty
+       # always_except LU-18813 214
+       # Data page cache has been updated during bulk write
+       always_except LU-18813 224d
+       # fid2path failure
+       always_except LU-18813 226d
+       # ladvise failure
+       always_except LU-18813 255
+       # sec related failure
+       always_except LU-18813 258
+       # DoM migration failure
+       always_except LU-18813 272
+       # Unkown reason timeout!
+       always_except LU-18813 275 277 311 410 414 419 831
+       # last_rcvd should fail
+       always_except LU-18813 313 314 315
+       # block accting is wrong...
+       always_except LU-18813 317
+       # Other timeouts
+       always_except LU-18813 200 350 398 399 403 404 408 432 433
+       # DIO locking issue?
+       always_except LU-18813 398a
+       # Layout swap is not working
+       always_except LU-18813 405
+       # Memory pressure under memcg control
+       always_except LU-18813 411
+       # rmfid in DNE and in large numbers
+       always_except LU-18813 421
+       # local testing passed but Maloo testing failed!
+       always_except LU-18813 27Cg 27U 422 424 425 426 428 429 434 442
+       # OOM failure
+       always_except LU-18813 430b 430c 431 814 833 850
+       # Expired barrier
+       always_except LU-18813 801a 801b
+       # ro is not implemented yet
+       always_except LU-18813 802b
+       # openZFS related partial page write
+       always_except LU-18813 810
+       # Quota is not supported yet...
+       always_except LU-18813 812b
+       # ldlm kunit test
+       always_except LU-18813 842
+       # fanotify does not work
+       always_except LU-18813 851
+       # MGC locks and client umount
+       always_except LU-18813 901
+       # destroy takes too much time
+       always_except LU-18813 903
+fi
+
+# Although every sanity.sh test has been run, we stop sooner for
+# stability reasons. As we get farther, increment the STOP_AT value.
+if [[ "$FSTYPE" = "wbcfs" ]]; then
+       export STOP_AT=${STOP_AT:-"440"}
+fi
+
 build_test_filter
 FAIL_ON_ERROR=false
 
@@ -15637,7 +15736,7 @@ test_123a_base() { # was test 123, statahead(b=11401)
        log "$lsx done"
 
        stime=$SECONDS
-       rm -r $DIR/$tdir
+       rm -r $DIR/$tdir || error "failed to rm $DIR/$tdir"
        sync
        etime=$SECONDS
        delta=$((etime - stime))
@@ -25773,6 +25872,9 @@ test_250() {
        [ "$(facet_fstype ost$(($($LFS getstripe -i $DIR/$tfile) + 1)))" = "zfs" ] \
         && skip "no 16TB file size limit on ZFS"
 
+       [ "$(facet_fstype ost$(($($LFS getstripe -i $DIR/$tfile) + 1)))" = "wbcfs" ] \
+        && skip "no 16TB file size limit on wbcfs"
+
        $LFS setstripe -c 1 $DIR/$tfile
        # ldiskfs extent file size limit is (16TB - 4KB - 1) bytes
        local size=$((16 * 1024 * 1024 * 1024 * 1024 - 4096 - 1))
index 8c24bcc..e392963 100755 (executable)
@@ -1135,6 +1135,8 @@ load_modules_local() {
                elif [[ $(node_fstypes $HOSTNAME) == *ldiskfs* ]]; then
                        load_module ../ldiskfs/ldiskfs
                        load_module osd-ldiskfs/osd_ldiskfs
+               elif [[ $(node_fstypes $HOSTNAME) == *wbcfs* ]]; then
+                       load_module osd-wbcfs/osd_wbcfs
                fi
                load_module mgs/mgs
                load_module mdd/mdd
@@ -1759,6 +1761,8 @@ devicelabel() {
        zfs)
                label=$(do_facet ${facet} "$ZFS get -H -o value lustre:svname \
                                           ${dev} 2>/dev/null");;
+       wbcfs)
+               label="wbcfs-target";;
        *)
                error "unknown fstype!";;
        esac
@@ -2497,6 +2501,8 @@ mount_facet() {
        local fstype=$(facet_fstype $facet)
        local devicelabel
        local dm_dev=${!dev}
+       local index=$(facet_index $facet)
+       local node_type=$(facet_type $facet)
 
        [[ $dev == "mgsfailover_dev" ]] && combined_mgs_mds &&
                dev=mds1failover_dev
@@ -2519,21 +2525,63 @@ mount_facet() {
 
                devicelabel=$(do_facet ${facet} "$ZFS get -H -o value \
                                                lustre:svname $dm_dev");;
+       wbcfs)
+               :;;
        *)
                error "unknown fstype!";;
        esac
 
-       echo "Starting ${facet}: $opts $dm_dev $mntpt"
        # for testing LU-482 error handling in mount_facets() and test_0a()
        if [ -f $TMP/test-lu482-trigger ]; then
                RC=2
        else
                local seq_width=$(($OSTSEQWIDTH / $OSTCOUNT))
                (( $seq_width >= 16384 )) || seq_width=16384
-               do_facet ${facet} \
-                       "mkdir -p $mntpt; $MOUNT_CMD $opts $dm_dev $mntpt"
+
+               case $fstype in
+               wbcfs)
+                       echo "Start ${facet}: $MOUNT_CMD -v lustre-wbcfs $mntpt"
+
+                       export OSD_WBC_FSNAME="$FSNAME"
+                       export OSD_WBC_INDEX="$index"
+                       export OSD_WBC_MGS_NID="$MGSNID"
+
+                       case $node_type in
+                       OST)
+                               export OSD_WBC_TGT_TYPE="OST"
+                               ;;
+                       MDS)
+                               export OSD_WBC_TGT_TYPE="MDT"
+                               if (( $index == 0 )) &&
+                                       [[ "$mds_HOST" == "$mgs_HOST" ]]; then
+                                       export OSD_WBC_PRIMARY_MDT="1"
+                               else
+                                       export OSD_WBC_PRIMARY_MDT="0"
+                               fi
+                               ;;
+                       MGS)
+                               export OSD_WBC_TGT_TYPE="MGT"
+                               ;;
+                       *)
+                               error "Unhandled node_type!"
+                       esac
+
+                       do_facet ${facet} "mkdir -p $mntpt; \
+                                OSD_WBC_TGT_TYPE=$OSD_WBC_TGT_TYPE \
+                                OSD_WBC_INDEX=$OSD_WBC_INDEX \
+                                OSD_WBC_MGS_NID=$OSD_WBC_MGS_NID \
+                                OSD_WBC_PRIMARY_MDT=$OSD_WBC_PRIMARY_MDT \
+                                OSD_WBC_FSNAME=$OSD_WBC_FSNAME \
+                                $MOUNT_CMD -v lustre-wbcfs $mntpt"
+                       ;;
+               *)
+                       echo "Start ${facet}: $MOUNT_CMD $opts $dm_dev $mntpt"
+                       do_facet ${facet} \
+                               "mkdir -p $mntpt; $MOUNT_CMD $opts $dm_dev $mntpt"
+               esac
+
                RC=${PIPESTATUS[0]}
-               if [[ ${facet} =~ ost ]]; then
+               if [[ ${facet} =~ ost ]] && [[ ! "$fstype" == "wbcfs" ]]; then
                        do_facet ${facet} "$LCTL set_param \
                                seq.cli-$(devicelabel $facet $dm_dev)-super.width=$seq_width"
                fi
@@ -2566,6 +2614,8 @@ mount_facet() {
                                grep -E ':[a-zA-Z]{3}[0-9]{4}'" "" ||
                                error "$dm_dev failed to initialize!";;
 
+               wbcfs)
+                       :;;
                *)
                        error "unknown fstype!";;
                esac
@@ -5019,6 +5069,8 @@ ostdevname() {
                        #try $OSTZFSDEVn - independent of vdev
                        DEVNAME=OSTZFSDEV$num
                        eval DEVPTR=${!DEVNAME:=${FSNAME}-ost${num}/ost${num}};;
+               wbcfs )
+                       :;;
                * )
                        error "unknown fstype!";;
        esac
@@ -5043,6 +5095,8 @@ ostvdevname() {
                        # Device formatted by zfs
                        DEVNAME=OSTDEV$num
                        eval VDEVPTR=${!DEVNAME:=${OSTDEVBASE}${num}};;
+               wbcfs )
+                       :;;
                * )
                        error "unknown fstype!";;
        esac
@@ -5067,6 +5121,8 @@ mdsdevname() {
                        # try $MDSZFSDEVn - independent of vdev
                        DEVNAME=MDSZFSDEV$num
                        eval DEVPTR=${!DEVNAME:=${FSNAME}-mdt${num}/mdt${num}};;
+               wbcfs )
+                       :;;
                * )
                        error "unknown fstype!";;
        esac
@@ -5089,6 +5145,8 @@ mdsvdevname() {
                        # Device formatted by ZFS
                        local DEVNAME=MDSDEV$num
                        eval VDEVPTR=${!DEVNAME:=${MDSDEVBASE}${num}};;
+               wbcfs )
+                       :;;
                * )
                        error "unknown fstype!";;
        esac
@@ -5117,6 +5175,8 @@ mgsdevname() {
                else
                        DEVPTR=${MGSZFSDEV:-${FSNAME}-mgs/mgs}
                fi;;
+       wbcfs )
+               :;;
        * )
                error "unknown fstype!";;
        esac
@@ -5141,6 +5201,8 @@ mgsvdevname() {
                elif [ -n "$MGSDEV" ]; then
                        VDEVPTR=$MGSDEV
                fi;;
+       wbcfs )
+               :;;
        * )
                error "unknown fstype!";;
        esac
@@ -5546,6 +5608,9 @@ __touch_device()
 
 format_mgs() {
        local quiet
+       local fstype=$(facet_fstype mgs)
+
+       [[ "$fstype" == "wbcfs" ]] && return
 
        if ! $VERBOSE; then
                quiet=yes
@@ -5565,6 +5630,9 @@ format_mgs() {
 format_mdt() {
        local num=$1
        local quiet
+       local fstype=$(facet_fstype mdt$num)
+
+       [[ "$fstype" == "wbcfs" ]] && return
 
        if ! $VERBOSE; then
                quiet=yes
@@ -5581,6 +5649,9 @@ format_mdt() {
 
 format_ost() {
        local num=$1
+       local fstype=$(facet_fstype ost$num)
+
+       [[ "$fstype" == "wbcfs" ]] && return
 
        if ! $VERBOSE; then
                quiet=yes
@@ -6640,6 +6711,11 @@ do_check_and_cleanup_lustre() {
                run_lfsck
        fi
 
+       # FIXME: The cleanup takes too long, times out...
+       if [[ "$FSTYPE" == "wbcfs" ]]; then
+               DO_CLEANUP=false
+       fi
+
        if is_mounted $MOUNT; then
                if $DO_CLEANUP; then
                        [[ -n "$DIR" ]] && rm -rf $DIR/[Rdfs][0-9]* ||
@@ -7513,6 +7589,10 @@ run_test() {
                return 0
        else
                run_one_logged $testnum "$testmsg"
+               # TODO: Avoid running out of space!?
+               if [[ "$FSTYPE" == "wbcfs" ]]; then
+                       rm -rf "$MOUNT/*"
+               fi
                return $?
        fi
 }
@@ -8715,7 +8795,13 @@ convert_facet2label() {
        if [ -n "${!varsvc}" ]; then
                echo ${!varsvc}
        else
-               error "No label for $facet!"
+               # FIXME: Cannot find label correctly for some reason.
+               # Just assume wbcfs OSD and continue...
+               if [[ "$FSTYPE" == "wbcfs" ]]; then
+                       echo "wbcfs-target"
+               else
+                       error "No label for $facet!"
+               fi
        fi
 }
 
index 0f775c6..d95c416 100644 (file)
@@ -128,6 +128,9 @@ endif
 if ZFS_ENABLED
 LIB_TARGETS += mount_osd_zfs.so
 endif
+if SERVER
+LIB_TARGETS += mount_osd_wbcfs.so
+endif
 endif
 
 install-exec-hook:
@@ -214,6 +217,24 @@ PLUGIN_LIB += libmount_utils_ldiskfs.a
 endif # PLUGINS
 endif # LDISKFS_ENABLED
 
+if SERVER
+noinst_LIBRARIES += libmount_utils_wbcfs.a
+
+libmount_utils_wbcfs_a_SOURCES = libmount_utils_wbcfs.c
+libmount_utils_wbcfs_a_CPPFLAGS :=
+
+if PLUGINS
+lib_LTLIBRARIES += libmount_utils_wbcfs.la
+libmount_utils_wbcfs.la : libmount_utils_wbcfs.a
+       $(CC) $(LDFLAGS) $(MNTMODLDFLAGS) -shared -Wl,--export-dynamic \
+                        -o mount_osd_wbcfs.so \
+                        `$(AR) -t libmount_utils_wbcfs.a` \
+                        $(MNTMODLIBS)
+else
+PLUGIN_LIB += libmount_utils_wbcfs.a
+endif # PLUGINS
+endif # SERVER
+
 mount_lustre_SOURCES = mount_lustre.c mount_utils.c mount_utils.h $(GSSSRC) \
                        lustre_param.c
 mount_lustre_CPPFLAGS := ${MNTMODCFLAGS}
diff --git a/lustre/utils/libmount_utils_wbcfs.c b/lustre/utils/libmount_utils_wbcfs.c
new file mode 100644 (file)
index 0000000..41cdf08
--- /dev/null
@@ -0,0 +1,219 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (c) 2024, Amazon and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Author: Timothy Day <timday@amazon.com>
+ */
+
+#include "mount_utils.h"
+
+#define VAR_SIZE 64
+
+enum osd_tgt_type {
+       MGT,
+       MDT,
+       OST,
+       INVALID
+};
+
+int wbcfs_write_ldd(struct mkfs_opts *mop)
+{
+       return 0;
+}
+
+int wbcfs_erase_ldd(struct mkfs_opts *mop, char *param)
+{
+       return 0;
+}
+
+static int get_wbcfs_env(char *out, char *env)
+{
+       if (!getenv(env)) {
+               fprintf(stderr, "%s is undefined\n", env);
+               return -EINVAL;
+       }
+
+       strscpy(out, getenv(env), VAR_SIZE);
+       fprintf(stderr, "%s=%s\n", env, out);
+
+       return 0;
+}
+
+int wbcfs_read_ldd(char *ds, struct lustre_disk_data *ldd)
+{
+       enum osd_tgt_type tgt_type = INVALID;
+       char tgt_type_var[VAR_SIZE];
+       char name_var[VAR_SIZE];
+       char params[2 * VAR_SIZE];
+       char svname[2 * VAR_SIZE];
+       int rc = 0;
+
+       memset(ldd, 0, sizeof(struct lustre_disk_data));
+       ldd->ldd_magic = LDD_MAGIC;
+       ldd->ldd_config_ver = 1;
+       ldd->ldd_mount_type = LDD_MT_WBCFS;
+
+       rc = get_wbcfs_env(tgt_type_var, "OSD_WBC_TGT_TYPE");
+       if (rc)
+               return rc;
+
+       if (!strcmp(tgt_type_var, "OST")) {
+               ldd->ldd_flags = LDD_F_UPDATE | LDD_F_VIRGIN |
+                       LDD_F_SV_TYPE_OST;
+               tgt_type = OST;
+       }
+
+       if (!strcmp(tgt_type_var, "MGT")) {
+               ldd->ldd_flags = LDD_F_UPDATE | LDD_F_VIRGIN |
+                       LDD_F_SV_TYPE_MGS;
+               tgt_type = MGT;
+       }
+
+       if (!strcmp(tgt_type_var, "MDT")) {
+               rc = get_wbcfs_env(tgt_type_var, "OSD_WBC_PRIMARY_MDT");
+               if (rc)
+                       return rc;
+
+               if (!strcmp(tgt_type_var, "1")) {
+                       ldd->ldd_flags = LDD_F_UPDATE | LDD_F_VIRGIN |
+                               LDD_F_SV_TYPE_MDT | LDD_F_SV_TYPE_MGS;
+               } else {
+                       ldd->ldd_flags = LDD_F_UPDATE | LDD_F_VIRGIN |
+                               LDD_F_SV_TYPE_MDT;
+               }
+
+               tgt_type = MDT;
+       }
+
+       if (tgt_type == INVALID) {
+               fprintf(stderr, "OSD_WBC_TGT_TYPE is invalid\n");
+               return -EINVAL;
+       }
+
+       rc = get_wbcfs_env(name_var, "OSD_WBC_FSNAME");
+       if (rc)
+               return rc;
+
+       strscpy(ldd->ldd_fsname, name_var, VAR_SIZE);
+
+       if (!getenv("OSD_WBC_INDEX")) {
+               fprintf(stderr, "OSD_WBC_INDEX is undefined\n");
+               return -EINVAL;
+       }
+
+       rc = get_wbcfs_env(tgt_type_var, "OSD_WBC_INDEX");
+       if (rc)
+               return rc;
+
+       ldd->ldd_svindex = strtol(tgt_type_var,
+                                 NULL, 0);
+
+       if (tgt_type == MGT)
+               snprintf(svname, 2 * VAR_SIZE, "%s:%s%04x",
+                        ldd->ldd_fsname, "MGS",
+                        ldd->ldd_svindex);
+
+       if (tgt_type == MDT)
+               snprintf(svname, 2 * VAR_SIZE, "%s:%s%04x",
+                        ldd->ldd_fsname, "MDT",
+                        ldd->ldd_svindex);
+
+       if (tgt_type == OST)
+               snprintf(svname, 2 * VAR_SIZE, "%s:%s%04x",
+                        ldd->ldd_fsname, "OST",
+                        ldd->ldd_svindex);
+
+       strscpy(ldd->ldd_svname, svname, VAR_SIZE);
+
+       fprintf(stderr, "svname -> %s\n", svname);
+
+       rc = get_wbcfs_env(tgt_type_var, "OSD_WBC_MGS_NID");
+       if (rc)
+               return rc;
+
+       if (tgt_type != MGT) {
+               snprintf(params, 2 * VAR_SIZE, "mgsnode=%s",
+                        tgt_type_var);
+               strscpy(ldd->ldd_params, params, VAR_SIZE);
+               fprintf(stderr, "params -> %s\n", params);
+       }
+
+       return 0;
+}
+
+void wbcfs_print_ldd_params(struct mkfs_opts *mop)
+{
+}
+
+int wbcfs_is_lustre(char *ds, unsigned int *mount_type)
+{
+       if (!strcmp(ds, OSD_WBCFS_DEV)) {
+               fprintf(stderr, "Lustre is using wbcfs as backend\n");
+               *mount_type = LDD_MT_WBCFS;
+               return 1;
+       }
+
+       return 0;
+}
+
+int wbcfs_make_lustre(struct mkfs_opts *mop)
+{
+       return 0;
+}
+
+int wbcfs_enable_quota(struct mkfs_opts *mop)
+{
+       return -EOPNOTSUPP;
+}
+
+int wbcfs_prepare_lustre(struct mkfs_opts *mop,
+                        char *wanted_mountopts, size_t len)
+{
+       return 0;
+}
+
+int wbcfs_tune_lustre(char *dev, struct mount_opts *mop)
+{
+       return 0;
+}
+
+int wbcfs_label_lustre(struct mount_opts *mop)
+{
+       return 0;
+}
+
+int wbcfs_rename_fsname(struct mkfs_opts *mop, const char *oldname)
+{
+       return 0;
+}
+
+int wbcfs_init(void)
+{
+       return 0;
+}
+
+void wbcfs_fini(void)
+{
+}
+
+#ifndef PLUGIN_DIR
+struct module_backfs_ops wbcfs_ops = {
+       .init                   = wbcfs_init,
+       .fini                   = wbcfs_fini,
+       .read_ldd               = wbcfs_read_ldd,
+       .write_ldd              = wbcfs_write_ldd,
+       .erase_ldd              = wbcfs_erase_ldd,
+       .print_ldd_params       = wbcfs_print_ldd_params,
+       .is_lustre              = wbcfs_is_lustre,
+       .make_lustre            = wbcfs_make_lustre,
+       .prepare_lustre         = wbcfs_prepare_lustre,
+       .tune_lustre            = wbcfs_tune_lustre,
+       .label_lustre           = wbcfs_label_lustre,
+       .enable_quota           = wbcfs_enable_quota,
+       .rename_fsname          = wbcfs_rename_fsname,
+};
+#endif /* PLUGIN_DIR */
index 257b528..ad7be84 100644 (file)
@@ -732,6 +732,22 @@ static int parse_opts(int argc, char *const argv[], struct mount_opts *mop)
        if (!mop->mo_usource)
                usage(stderr);
 
+#ifdef HAVE_SERVER_SUPPORT
+       /* osd-wbcfs lustre_tgt */
+       if (strcmp(mop->mo_usource, OSD_WBCFS_DEV) == 0) {
+               mop->mo_ldd.ldd_mount_type = LDD_MT_WBCFS;
+               mop->mo_source = strdup(mop->mo_usource);
+               if (!realpath(argv[optind + 1], mop->mo_target)) {
+                       rc = errno;
+                       fprintf(stderr, "warning: %s: cannot resolve: %s\n",
+                               argv[optind], strerror(errno));
+                       return rc;
+               }
+
+               return 0;
+       }
+#endif
+
        /**
         * Try to get the real path to the device, in case it is a
         * symbolic link for instance
index 8224929..08014fa 100644 (file)
@@ -597,6 +597,9 @@ struct module_backfs_ops *load_backfs_module(enum ldd_mount_type mount_type)
                ops = &zfs_ops;
                break;
 #endif /* HAVE_ZFS_OSD */
+       case LDD_MT_WBCFS:
+               ops = &wbcfs_ops;
+               break;
        default:
                ops = NULL;
                break;
index 2ebf615..0f04381 100644 (file)
@@ -139,6 +139,7 @@ static inline const char *mt_str(enum ldd_mount_type mt)
                "reiserfs",
                "ldiskfs2",
                "zfs",
+               "wbcfs",
        };
 
        return mount_type_string[mt];
@@ -156,10 +157,13 @@ static inline const char *mt_type(enum ldd_mount_type mt)
                "osd-reiserfs",
                "osd-ldiskfs",
                "osd-zfs",
+               "osd-wbcfs",
        };
 
        return mount_type_string[mt];
 }
+
+#define OSD_WBCFS_DEV "lustre-wbcfs"
 #endif /* HAVE_SERVER_SUPPORT */
 
 #define MT_STR(data)   mt_str((data)->ldd_mount_type)
@@ -241,6 +245,7 @@ struct module_backfs_ops {
 
 extern struct module_backfs_ops zfs_ops;
 extern struct module_backfs_ops ldiskfs_ops;
+extern struct module_backfs_ops wbcfs_ops;
 
 struct module_backfs_ops *load_backfs_module(enum ldd_mount_type mount_type);
 void unload_backfs_ops(struct module_backfs_ops *ops);
diff --git a/rpm/kmp-lustre-osd-wbcfs.files b/rpm/kmp-lustre-osd-wbcfs.files
new file mode 100644 (file)
index 0000000..f567b09
--- /dev/null
@@ -0,0 +1,4 @@
+%defattr(-,root,root)
+%dir %{modules_fs_path}/%{lustre_name}-osd-wbcfs
+%dir %{modules_fs_path}/%{lustre_name}-osd-wbcfs/fs
+%{modules_fs_path}/%{lustre_name}-osd-wbcfs/fs/osd_wbcfs.ko
diff --git a/rpm/kmp-lustre-osd-wbcfs.preamble b/rpm/kmp-lustre-osd-wbcfs.preamble
new file mode 100644 (file)
index 0000000..0933568
--- /dev/null
@@ -0,0 +1,8 @@
+License:        GPL-2.0-only
+%if 0%{?suse_version} > 1
+Requires:       kernel-%1
+%endif
+Requires:       %{name}-osd-wbcfs-mount = %{version}
+Provides:       %{name}-osd = %{version}
+Provides:       %{name}-osd-wbcfs = %{version}
+Obsoletes:      %{name}-osd-wbcfs < %{version}