From 25813cf8ba532ca1c690be6f9659c09a70722536 Mon Sep 17 00:00:00 2001 From: Qian Yingjin Date: Mon, 17 Mar 2025 23:41:17 +0800 Subject: [PATCH] LU-18813 osd-wbcfs: MemFS-based OSD with writeback support Implement a memory filesystem based OSD with writeback support for Lustre. It borrows lots of design from memory-based file systems such as tmpfs/ramfs. The data is first written into the memory-based file system (called MemFS for short). And then, the data can be flushed to the persistent storage in a delayed write-back manner. This patch implemented the basic functionality to store data in MemFS. It can reuse lots of VFS codes in Linux kernel such as: - Page caching for data; - dcache for dentry management and lookup; - icache for inode management and lookup; - Writeback mechanism in Linux kernel Test-Parameters: testlist=sanity fstype=wbcfs mdscount=1 mdtcount=1 osscount=4 ostcount=1 Test-Parameters: testlist=sanity fstype=wbcfs mdscount=1 mdtcount=1 osscount=4 ostcount=1 Test-Parameters: testlist=sanity fstype=wbcfs mdscount=4 mdtcount=1 osscount=4 ostcount=1 Test-Parameters: testlist=sanity fstype=wbcfs mdscount=4 mdtcount=1 osscount=4 ostcount=1 Test-Parameters: testlist=sanity fstype=wbcfs combinedmdsmgs=false standalonemgs=true mdscount=1 mdtcount=1 osscount=4 ostcount=1 Test-Parameters: testlist=sanity fstype=wbcfs combinedmdsmgs=false standalonemgs=true mdscount=1 mdtcount=1 osscount=4 ostcount=1 Test-Parameters: testlist=sanity fstype=wbcfs combinedmdsmgs=false standalonemgs=true mdscount=4 mdtcount=1 osscount=4 ostcount=1 Test-Parameters: testlist=sanity fstype=wbcfs combinedmdsmgs=false standalonemgs=true mdscount=4 mdtcount=1 osscount=4 ostcount=1 Signed-off-by: Yingjin Qian Signed-off-by: Timothy Day Change-Id: Ia07c1d95b7ad3f7f5e817a8de69d0a4ab6995ffa Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/58439 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Shaun Tancheff Reviewed-by: Oleg Drokin --- MAINTAINERS | 8 + 
config/lustre-build.m4 | 2 +- config/lustre-core.m4 | 20 + lustre.spec.in | 49 + lustre/Makefile.in | 1 + lustre/autoMakefile.am | 2 +- lustre/include/obd.h | 6 + lustre/include/uapi/linux/lustre/lustre_disk.h | 1 + lustre/mgs/lproc_mgs.c | 6 +- lustre/osd-wbcfs/Makefile.in | 11 + lustre/osd-wbcfs/TODO | 32 + lustre/osd-wbcfs/autoMakefile.am | 12 + lustre/osd-wbcfs/index.h | 48 + lustre/osd-wbcfs/osd_dirent.c | 821 +++++++++++++++ lustre/osd-wbcfs/osd_handler.c | 680 ++++++++++++ lustre/osd-wbcfs/osd_hash.c | 177 ++++ lustre/osd-wbcfs/osd_index_hash.c | 299 ++++++ lustre/osd-wbcfs/osd_internal.h | 254 +++++ lustre/osd-wbcfs/osd_io.c | 438 ++++++++ lustre/osd-wbcfs/osd_object.c | 848 +++++++++++++++ lustre/osd-wbcfs/wbcfs.c | 1335 ++++++++++++++++++++++++ lustre/osd-wbcfs/wbcfs.h | 183 ++++ lustre/tests/sanity.sh | 104 +- lustre/tests/test-framework.sh | 96 +- lustre/utils/Makefile.am | 21 + lustre/utils/libmount_utils_wbcfs.c | 219 ++++ lustre/utils/mount_lustre.c | 16 + lustre/utils/mount_utils.c | 3 + lustre/utils/mount_utils.h | 5 + rpm/kmp-lustre-osd-wbcfs.files | 4 + rpm/kmp-lustre-osd-wbcfs.preamble | 8 + 31 files changed, 5700 insertions(+), 9 deletions(-) create mode 100644 lustre/osd-wbcfs/Makefile.in create mode 100644 lustre/osd-wbcfs/TODO create mode 100644 lustre/osd-wbcfs/autoMakefile.am create mode 100644 lustre/osd-wbcfs/index.h create mode 100644 lustre/osd-wbcfs/osd_dirent.c create mode 100644 lustre/osd-wbcfs/osd_handler.c create mode 100644 lustre/osd-wbcfs/osd_hash.c create mode 100644 lustre/osd-wbcfs/osd_index_hash.c create mode 100644 lustre/osd-wbcfs/osd_internal.h create mode 100644 lustre/osd-wbcfs/osd_io.c create mode 100644 lustre/osd-wbcfs/osd_object.c create mode 100644 lustre/osd-wbcfs/wbcfs.c create mode 100644 lustre/osd-wbcfs/wbcfs.h create mode 100644 lustre/utils/libmount_utils_wbcfs.c create mode 100644 rpm/kmp-lustre-osd-wbcfs.files create mode 100644 rpm/kmp-lustre-osd-wbcfs.preamble diff --git a/MAINTAINERS b/MAINTAINERS 
index 9af2aec..4a4cd4b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -432,6 +432,14 @@ R: Olaf Faaland S: Maintained F: lustre/osd-zfs/ +Lustre OSD wbcfs +R: Timothy Day +R: Yingjin Qian +S: Supported +F: Documentation/osd-api.txt +F: lustre/osd-wbcfs/ +F: lustre/utils/libmount_utils_wbcfs.c + Lustre Patch Commit Hooks R: Andreas Dilger S: Odd Fixes diff --git a/config/lustre-build.m4 b/config/lustre-build.m4 index 74b2821..bf31cb5 100644 --- a/config/lustre-build.m4 +++ b/config/lustre-build.m4 @@ -477,7 +477,7 @@ AS_IF([test "x$enable_modules" = xyes], [ AS_IF([test x$enable_ldiskfs = xno -a x$enable_zfs = xno], [ AS_CASE([$enable_server], [maybe], [enable_server=no], - [yes], [AC_MSG_ERROR([cannot enable servers, no backends were configured])]) + [yes], [AC_MSG_WARN([no backends were configured])]) ], [ AS_IF([test x$enable_server = xmaybe], [enable_server=yes]) ]) diff --git a/config/lustre-core.m4 b/config/lustre-core.m4 index bb9bc69..25eab06 100644 --- a/config/lustre-core.m4 +++ b/config/lustre-core.m4 @@ -2867,6 +2867,22 @@ AC_DEFUN([LC_GENL_FAMILY_HAS_RESV_START_OP], [ ]) # LC_GENL_FAMILY_HAS_RESV_START_OP # +# LC_HAVE_FS_CONTEXT_HEADER +# +# Kernel version 5.0-rc2 commit 9bc61ab18b1d41f26dc06b9e6d3c203e65f83fe6 +# vfs: Introduce fs_context, switch vfs_kern_mount() to it. 
+# +AC_DEFUN([LC_SRC_HAVE_FS_CONTEXT_HEADER], [ + LB2_CHECK_LINUX_HEADER_SRC([linux/fs_context.h], [-Werror]) +]) +AC_DEFUN([LC_HAVE_FS_CONTEXT_HEADER], [ + LB2_CHECK_LINUX_HEADER_RESULT([linux/fs_context.h], [ + AC_DEFINE(HAVE_FS_CONTEXT_H, 1, + [fs_context.h is present]) + ]) +]) # LC_HAVE_FS_CONTEXT_HEADER + +# # LC_HAVE_BVEC_ITER_ALL # # kernel 5.1 commit 6dc4f100c175dd0511ae8674786e7c9006cdfbfa @@ -5217,6 +5233,7 @@ AC_DEFUN([LC_PROG_LINUX_SRC], [ # 5.0 LC_SRC_GENL_FAMILY_HAS_RESV_START_OP + LC_SRC_HAVE_FS_CONTEXT_HEADER # 5.1 LC_SRC_HAVE_BVEC_ITER_ALL @@ -5543,6 +5560,7 @@ AC_DEFUN([LC_PROG_LINUX_RESULTS], [ # 5.0 LC_GENL_FAMILY_HAS_RESV_START_OP + LC_HAVE_FS_CONTEXT_HEADER # 5.1 LC_HAVE_BVEC_ITER_ALL @@ -6163,6 +6181,8 @@ lustre/osd-ldiskfs/Makefile lustre/osd-ldiskfs/autoMakefile lustre/osd-zfs/Makefile lustre/osd-zfs/autoMakefile +lustre/osd-wbcfs/Makefile +lustre/osd-wbcfs/autoMakefile lustre/mgc/Makefile lustre/mgc/autoMakefile lustre/mgs/Makefile diff --git a/lustre.spec.in b/lustre.spec.in index 48fd059..69e8967 100644 --- a/lustre.spec.in +++ b/lustre.spec.in @@ -243,6 +243,8 @@ Source17: kmp-lnet-kfilnd.preamble Source18: kmp-lnet-kfilnd.files Source19: kmp-lnet-in-kernel-o2iblnd.preamble Source20: kmp-lnet-in-kernel-o2iblnd.files +Source21: kmp-lustre-osd-wbcfs.preamble +Source22: kmp-lustre-osd-wbcfs.files URL: https://wiki.whamcloud.com/ BuildRoot: %{_tmppath}/lustre-%{version}-root BuildRequires: libtool pkgconfig(yaml-0.1) pkgconfig(zlib) pkgconfig(libnl-3.0) flex bison @@ -424,8 +426,34 @@ lustre tools (mount/mkfs) to provide support for ZFS. 
%endif # with zfs %endif + +%if 0%{?suse_version:1} +%else +%if %{with servers} +%kernel_module_package -n %{name}-osd-wbcfs -p %SOURCE21 -f %SOURCE22 %{_flavor} +%if %{with lustre_utils} +%package osd-wbcfs-mount +Summary: Lustre mount's wbcfs-specific helper library +BuildRequires: pkgconfig(mount) +Provides: %{name}-osd-mount = %{version} +Obsoletes: lustre-osd-mount < %{version} +Provides: %{name}-osd-mount = %{version} +Provides: %{name}-osd-wbcfs-mount = %{version} +Requires: %{name}-osd-wbcfs = %{version} + +%description osd-wbcfs-mount +Provide a shared library (dso) that can be loaded into various +lustre tools (mount/mkfs) to provide support for in-memory OSD +with writeback support. + +# with lustre_utils +%endif +# with servers +%endif # with lustre_modules %endif +# suse +%endif %if %{with servers} %package resource-agents @@ -823,6 +851,13 @@ mv $basemodpath/fs/ldiskfs.ko $basemodpath-osd-ldiskfs/fs/ldiskfs.ko mkdir -p $basemodpath-osd-zfs/fs mv $basemodpath/fs/osd_zfs.ko $basemodpath-osd-zfs/fs/osd_zfs.ko %endif +%if 0%{?suse_version:1} +%else +%if %{with servers} +mkdir -p $basemodpath-osd-wbcfs/fs +mv $basemodpath/fs/osd_wbcfs.ko $basemodpath-osd-wbcfs/fs/osd_wbcfs.ko +%endif +%endif %if %{with lustre_tests} mkdir -p $basemodpath-tests/fs mv $basemodpath/fs/obd_test.ko $basemodpath-tests/fs/obd_test.ko @@ -1048,6 +1083,20 @@ echo '%{_libdir}/lustre/tests/lutf/*' >>lustre-tests.files %endif %endif +%if %{with shared} +%if 0%{?suse_version:1} +%else +%if %{with servers} +%if %{with lustre_utils} +%files osd-wbcfs-mount +%defattr(-,root,root) +%dir %{_libdir}/@PACKAGE@ +%{_libdir}/@PACKAGE@/mount_osd_wbcfs.so +%endif +%endif +%endif +%endif + # with lustre_modules %endif diff --git a/lustre/Makefile.in b/lustre/Makefile.in index 977d51d..cb63c14 100644 --- a/lustre/Makefile.in +++ b/lustre/Makefile.in @@ -8,6 +8,7 @@ obj-m += ec/ @TESTS_TRUE@obj-m += kunit/ @SERVER_TRUE@obj-m += mgs/ mdt/ mdd/ ofd/ quota/ osp/ lod/ lfsck/ target/ +@SERVER_TRUE@obj-m 
+= osd-wbcfs/ @CLIENT_TRUE@obj-m += lov/ osc/ mdc/ lmv/ llite/ fld/ @LDISKFS_ENABLED_TRUE@obj-m += osd-ldiskfs/ @ZFS_ENABLED_TRUE@obj-m += osd-zfs/ diff --git a/lustre/autoMakefile.am b/lustre/autoMakefile.am index ad2390b..688fabd 100644 --- a/lustre/autoMakefile.am +++ b/lustre/autoMakefile.am @@ -38,7 +38,7 @@ ALWAYS_SUBDIRS = include obdclass ldlm ptlrpc obdecho ec \ mgc fid fld doc utils tests scripts conf SERVER_SUBDIRS = mgs mdt mdd ofd osd-zfs osd-ldiskfs \ - quota osp lod target lfsck + quota osp lod target lfsck osd-wbcfs CLIENT_SUBDIRS = mdc lmv llite lov osc diff --git a/lustre/include/obd.h b/lustre/include/obd.h index 92b8588..f75a646 100644 --- a/lustre/include/obd.h +++ b/lustre/include/obd.h @@ -474,6 +474,7 @@ struct tgt_thread_big_cache { #define LUSTRE_MDD_NAME "mdd" #define LUSTRE_OSD_LDISKFS_NAME "osd-ldiskfs" #define LUSTRE_OSD_ZFS_NAME "osd-zfs" +#define LUSTRE_OSD_WBCFS_NAME "osd-wbcfs" #define LUSTRE_VVP_NAME "vvp" #define LUSTRE_LMV_NAME "lmv" #define LUSTRE_SLP_NAME "slp" @@ -1527,4 +1528,9 @@ static inline struct inode *page2inode(struct page *page) } } +static inline bool obd_is_osd_wbcfs(const struct obd_device *obd) +{ + return strstr(obd->obd_name, LUSTRE_OSD_WBCFS_NAME) != NULL; +} + #endif /* __OBD_H */ diff --git a/lustre/include/uapi/linux/lustre/lustre_disk.h b/lustre/include/uapi/linux/lustre/lustre_disk.h index 3fe114e..a39c436 100644 --- a/lustre/include/uapi/linux/lustre/lustre_disk.h +++ b/lustre/include/uapi/linux/lustre/lustre_disk.h @@ -124,6 +124,7 @@ enum ldd_mount_type { LDD_MT_REISERFS = 3, LDD_MT_LDISKFS2 = 4, LDD_MT_ZFS = 5, + LDD_MT_WBCFS = 6, LDD_MT_LAST }; diff --git a/lustre/mgs/lproc_mgs.c b/lustre/mgs/lproc_mgs.c index 6be3812..bac5fe0 100644 --- a/lustre/mgs/lproc_mgs.c +++ b/lustre/mgs/lproc_mgs.c @@ -307,6 +307,11 @@ int lproc_mgs_setup(struct mgs_device *mgs, const char *osd_name) debugfs_create_file("clear", 0644, obd->obd_debugfs_exports, obd, &mgs_nid_stats_clear_fops); + /* TODO: OSD wbcfs does not have 
lprocfs. Add it later... */ + osd_obd = mgs->mgs_bottom->dd_lu_dev.ld_obd; + if (obd_is_osd_wbcfs(osd_obd)) + return 0; + rc = sysfs_create_link(&obd->obd_kset.kobj, &mgs->mgs_bottom->dd_kobj, "osd"); + if (rc) { @@ -323,7 +328,6 @@ int lproc_mgs_setup(struct mgs_device *mgs, const char *osd_name) attr = get_attr_by_name(bottom_type, "mntdev"); if (attr) mgs->mgs_fstype = mgs->mgs_mntdev; - osd_obd = mgs->mgs_bottom->dd_lu_dev.ld_obd; mgs->mgs_proc_osd = lprocfs_add_symlink("osd", obd->obd_proc_entry, "../../%s/%.*s", diff --git a/lustre/osd-wbcfs/Makefile.in b/lustre/osd-wbcfs/Makefile.in new file mode 100644 index 0000000..b3ff041 --- /dev/null +++ b/lustre/osd-wbcfs/Makefile.in @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0 + +# +# Copyright (c) 2025-2026, DDN/Whamcloud, Inc +# + +MODULES := osd_wbcfs +osd_wbcfs-objs := osd_handler.o osd_object.o osd_hash.o osd_index_hash.o +osd_wbcfs-objs += osd_io.o osd_dirent.o wbcfs.o + +@INCLUDE_RULES@ diff --git a/lustre/osd-wbcfs/TODO b/lustre/osd-wbcfs/TODO new file mode 100644 index 0000000..42b184b --- /dev/null +++ b/lustre/osd-wbcfs/TODO @@ -0,0 +1,32 @@ +BACKGROUND +---------- + +Implement a MemFS-based OSD device with writeback support for Lustre. +It borrows lots of design from memory-based file systems such as tmpfs/ramfs. +The data is first written into the memory-based file system (called MemFS in +short). And then, the data can be persisted to the permanent storage in a delayed +writeback manner. + + +---------------------------------------------------------+ + | This is experimental! Do NOT use for important data! | + | Only bugs and data corruption lie ahead! Turn back now! | + +---------------------------------------------------------+ + +For questions, please contact: +- Yingjin Qian +- Timothy Day + +TODO +---- +- Inode and space usage accounting for statfs() system call. +- Limiting for inodes and blocks. +- Refine the mount command support for MemFS-based OSD. +- lprocfs support. 
Track OSD stats and access them via lprocfs. +- Use Maple Tree in new kernel to manage and access entries within a directory. +- Implement the functionality needed by LFSCK. +- Quota support. +- Swap space support for large files. +- Metadata on MemFS; Data on Persistent storage + (just like PCC naming with FID for data). +- Writeback support with ldiskfs/ZFS or KV store as persistent backends. +- Add transaction support. diff --git a/lustre/osd-wbcfs/autoMakefile.am b/lustre/osd-wbcfs/autoMakefile.am new file mode 100644 index 0000000..ba4574e --- /dev/null +++ b/lustre/osd-wbcfs/autoMakefile.am @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0 + +# +# Copyright (c) 2025-2026, DDN/Whamcloud, Inc. +# + +if MODULES +modulefs_DATA = osd_wbcfs.ko +endif + +MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ +EXTRA_DIST := $(osd_wbcfs-objs:%.o=%.c) osd_internal.h wbcfs.h index.h diff --git a/lustre/osd-wbcfs/index.h b/lustre/osd-wbcfs/index.h new file mode 100644 index 0000000..867c85c --- /dev/null +++ b/lustre/osd-wbcfs/index.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Copyright (c) 2025-2026, DDN/Whamcloud, Inc. + */ + +/* + * Index Access Module. + * + * Author: Yingjin Qian + */ + +#ifndef __OSD_INDEX_H_ +#define __OSD_INDEX_H_ + +#include + +/* Store key and value together in @he_buf. */ +struct hash_index_entry { + struct rhash_head he_hash; + struct list_head he_list_item; + __u64 he_offset; + size_t he_len; + size_t he_keylen; + char he_buf[]; +}; + +/* Index access via @rhashtable. 
*/ +struct hash_index { + struct rhashtable hi_htbl; + struct rhashtable_params hi_htbl_params; + struct list_head hi_list; + size_t hi_reclen; + __u64 hi_next_offset; +}; + +int hash_index_init(struct hash_index *hind, size_t keylen, size_t reclen); +void hash_index_fini(struct hash_index *hind); +struct hash_index_entry *hash_index_lookup_entry(struct hash_index *hind, + const void *key); +int hash_index_lookup(struct hash_index *hind, const void *key, void *rec); +int hash_index_insert(struct hash_index *hind, void *key, size_t keylen, + void *rec, size_t reclen); +void hash_index_remove(struct hash_index *hind, const void *key); + +/* TODO: Index access via Maple Tree. Only supported in newer kernels. */ + +#endif /* __OSD_INDEX_H_ */ diff --git a/lustre/osd-wbcfs/osd_dirent.c b/lustre/osd-wbcfs/osd_dirent.c new file mode 100644 index 0000000..2926494 --- /dev/null +++ b/lustre/osd-wbcfs/osd_dirent.c @@ -0,0 +1,821 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (c) 2025-2026, DDN/Whamcloud, Inc. + */ + +/* + * Author: Yingjin Qian + */ + +#define DEBUG_SUBSYSTEM S_OSD + +#include + +#include "osd_internal.h" +#include "wbcfs.h" + +/* Lookup the directory entry (dentry) specified by @key. */ +static int osd_index_dir_lookup(const struct lu_env *env, struct dt_object *dt, + struct dt_rec *rec, const struct dt_key *key) +{ + struct osd_object *pobj = osd_dt_obj(dt); + struct inode *dir = pobj->oo_inode; + struct lu_fid *fid = (struct lu_fid *)rec; + char *name = (char *)key; + struct dentry *parent; + struct dentry *dchild; + struct qstr qstr; + int rc = 0; + + ENTRY; + + LASSERT(S_ISDIR(dir->i_mode)); + parent = d_find_any_alias(dir); + if (IS_ERR(parent)) + RETURN(PTR_ERR(parent)); + + /* FIXME: more checking for ".." lookup. 
*/ + if (strcmp(name, "..") == 0) { + *fid = MEMFS_I(d_inode(parent->d_parent))->mei_fid; + GOTO(out, rc = 1); + } + + qstr.name = name; + qstr.len = strlen(name); + qstr.hash = ll_full_name_hash(parent, qstr.name, qstr.len); + dchild = d_lookup(parent, &qstr); + if (dchild) { + *fid = MEMFS_I(d_inode(dchild))->mei_fid; + dput(dchild); + rc = 1; + } + +out: + CDEBUG(D_CACHE, "%s: lookup '%s' from parent %pd@%pK "DFID": rc=%d\n", + osd_name(osd_obj2dev(pobj)), name, parent, parent, + PFID(fid), rc); + dput(parent); + RETURN(rc); +} + +/** + * osd_index_dir_insert() - Index add function. + * @key: it is key i.e. file entry to be inserted + * @record: it is value of given key i.e. fid + * + * It will add the directory entry.This entry is needed to + * maintain name->fid mapping. + * + * Return: + * * %0 - on success + * * %-ve - on error + */ +static int osd_index_dir_insert(const struct lu_env *env, struct dt_object *dt, + const struct dt_rec *record, + const struct dt_key *key, + struct thandle *th) +{ + struct osd_object *pobj = osd_dt_obj(dt); + struct osd_device *osd = osd_dev(dt->do_lu.lo_dev); + struct dt_insert_rec *rec = (struct dt_insert_rec *)record; + const struct lu_fid *fid = rec->rec_fid; + const char *name = (const char *)key; + struct inode *dir = pobj->oo_inode; + struct dentry *parent; + struct dentry *dentry; + struct dentry *dchild = NULL; + struct inode *inode; + struct qstr dname; + bool nedir_rename = false; + int rc = 0; + + ENTRY; + + if (!dt_object_exists(dt)) + RETURN(-ENOENT); + + LASSERT(!dt_object_remote(dt)); + LASSERTF(fid_is_sane(fid), "fid "DFID" is insane!\n", PFID(fid)); + + /* Skip "." and ".." in MemFS. */ + if (name[0] == '.' && (name[1] == '\0' || + (name[1] == '.' && name[2] == '\0'))) + RETURN(0); + + /* FIXME: handle remote object in DNE environment. */ + /* TODO: Store inode in @osd_thread_info? 
*/ + inode = ilookup5(osd_sb(osd), lu_fid_build_ino(fid, 0), + memfs_test_inode_by_fid, (void *)fid); + if (!inode) { + rc = -EINVAL; + CERROR("%s: lookup "DFID" from icache failed: rc=%d\n", + osd_name(osd_obj2dev(pobj)), PFID(fid), rc); + RETURN(rc); + } + + parent = d_find_any_alias(dir); + if (parent == NULL) { + rc = -ENOENT; + CERROR("%s: Cannot find dentry for inode@%pK "DFID": rc=%d\n", + osd_name(osd_obj2dev(pobj)), dir, + PFID(lu_object_fid(&pobj->oo_dt.do_lu)), rc); + GOTO(out_iput, rc); + } + + dname.name = name; + dname.len = strlen(name); + dname.hash = ll_full_name_hash(parent, dname.name, dname.len); + + dentry = d_alloc(parent, &dname); + if (!dentry) + GOTO(out_dput, rc = -ENOMEM); + + switch (inode->i_mode & S_IFMT) { + case S_IFDIR: + /* + * TODO: Store these info into OSD thread info @osd_thread_info, + * thus we can do undo (recovery) operations upon failure. + */ + dchild = d_find_any_alias(inode); + /* mv (rename) a non-empty directory. */ + if (dchild && !simple_empty(dchild)) + nedir_rename = true; + fallthrough; + case S_IFREG: + dir->i_size += BOGO_DIRENT_SIZE; + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); + break; + case S_IFLNK: + /* FIXME: symlink support. */ + CERROR("%s: symlink does not support\n", + osd_name(osd_obj2dev(pobj))); + break; + default: + LBUG(); + } + + inode_inc_iversion(dir); + if (nedir_rename) { + d_move(dchild, dentry); + /* Put the refcount obtained by @d_find_any_alias() */ + dput(dchild); + /* Finally release the @dentry. */ + dput(dentry); + } else { + /* Add dentry into dentry hashtable for VFS lookup. 
*/ + d_add(dentry, inode); + ihold(inode); + } + /* Extra count (already obtain in @d_alloc) - pin the dentry in core */ + /* dget(dentry); */ + + CDEBUG(D_CACHE, + "%s: Insert dirent "DFID"/%pd@%pK inode@%pK nlink=%d\n", + osd_name(osd_obj2dev(pobj)), PFID(fid), dentry, dentry, + inode, inode->i_nlink); +out_dput: + + dput(parent); +out_iput: + iput(inode); + + RETURN(rc); +} + +/* + * Index delete function. + * It will remove the directory entry added by index insert. + * This entry is needed to maintain name->fid mapping. + */ +static int osd_index_dir_delete(const struct lu_env *env, struct dt_object *dt, + const struct dt_key *key, struct thandle *th) +{ + struct osd_object *pobj = osd_dt_obj(dt); + struct inode *dir = pobj->oo_inode; + char *name = (char *)key; + struct dentry *parent; + struct dentry *dentry; + struct inode *inode; + struct qstr qstr; + bool nedir_rename = false; + int rc = 0; + + ENTRY; + + /* Skip "." and ".." in MemFS. */ + if (name[0] == '.' && (name[1] == '\0' || + (name[1] == '.' 
&& name[2] == '\0'))) + RETURN(0); + + parent = d_find_any_alias(dir); + if (parent == NULL && strcmp(name, "..") == 0) { + CDEBUG(D_CACHE, "%s: delete name %s from an empty dir@%pK\n", + osd_name(osd_obj2dev(pobj)), name, dir); + RETURN(0); + } + + if (parent == NULL) { + CDEBUG(D_CACHE, "%s: delete name %s from an empty dir@%pK\n", + osd_name(osd_obj2dev(pobj)), name, dir); + RETURN(-ENOENT); + } + + LASSERTF(parent != NULL, "dir@%pK name %s\n", dir, name); + + qstr.name = name; + qstr.len = strlen(name); + qstr.hash = ll_full_name_hash(parent, qstr.name, qstr.len); + dentry = d_lookup(parent, &qstr); + if (dentry == NULL) { + CDEBUG(D_CACHE, "%s: cannot find %s from parent@%pK %pd\n", + osd_name(osd_obj2dev(pobj)), name, dir, parent); + GOTO(out_dput_parent, rc = -ENOENT); + } + + LASSERT(dentry != NULL); + inode = d_inode(dentry); + + switch (inode->i_mode & S_IFMT) { + case S_IFDIR: + /* + * FIXME: rename() operation, @dentry may be not empty: + * (sanity/214). + * TODO: Put @dir_rename and @dentry into OSD thread info. + */ + if (!simple_empty(dentry)) + nedir_rename = true; + + /* + * MDD layer drops @nlink later via @dt_ref_del(). + * drop_nlink(inode); + * drop_nlink(dir); + */ + fallthrough; + case S_IFREG: + case S_IFLNK: + dir->i_size -= BOGO_DIRENT_SIZE; + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); + inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, + inode_set_ctime_current(inode))); + inode_inc_iversion(dir); + /* MDD layer drops @nlink later via @dt_ref_del(). */ + /* drop_nlink(inode); */ + /* + * Undo the count from "create". + * Unhash the dentry from the parent dentry hashtable which is + * add by @d_add(), so that it would not be found through a VFS + * lookup anymore. + * Unpin/drop the dentry from dcache. 
+ */ + if (!nedir_rename) + dput(dentry); + break; + default: + LBUG(); + } + + CDEBUG(D_CACHE, + "%s: Delete %s from dir@%pK %pd inode@%pK nlink=%d %d: rc=%d.\n", + osd_name(osd_obj2dev(pobj)), name, dir, parent, inode, + inode->i_nlink, dentry->d_lockref.count, rc); + dput(dentry); +out_dput_parent: + dput(parent); + RETURN(rc); +} + +static struct osd_it * +__osd_dir_it_init(const struct lu_env *env, struct osd_device *dev, + struct inode *inode, u32 attr) +{ + struct osd_it *oit; + struct file *file; + int rc; + + ENTRY; + + OBD_SLAB_ALLOC_PTR_GFP(oit, osd_it_cachep, GFP_NOFS); + if (oit == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + /* TODO: store buffer as thread context data @osd_thread_info. */ + OBD_ALLOC(oit->oit_buf, OSD_IT_BUFSIZE); + if (!oit->oit_buf) + GOTO(out_free, rc = -ENOMEM); + + oit->oit_obj = NULL; + file = &oit->oit_file; + /* Only FMODE_64BITHASH or FMODE_32BITHASH should be set, NOT both. */ + if (attr & LUDA_64BITHASH) + file->f_mode |= FMODE_64BITHASH; + else + file->f_mode |= FMODE_32BITHASH; + file->f_path.dentry = d_find_any_alias(inode); + file->f_flags = O_NOATIME | __FMODE_NONOTIFY; + file->f_mapping = inode->i_mapping; + file->f_op = inode->i_fop; + file->f_inode = inode; + + if (file->f_op->open) { + rc = file->f_op->open(inode, file); + if (rc) { + dput(file->f_path.dentry); + GOTO(out_free, rc); + } + } + + RETURN(oit); + +out_free: + OBD_SLAB_FREE_PTR(oit, osd_it_cachep); + return ERR_PTR(rc); +} + +/** + * osd_dir_it_init() - Creates or initializes iterator context. 
+ * + * Returns: struct osd_it, iterator structure on success + */ +static struct dt_it *osd_dir_it_init(const struct lu_env *env, + struct dt_object *dt, __u32 attr) +{ + struct osd_object *obj = osd_dt_obj(dt); + struct osd_device *dev = osd_obj2dev(obj); + struct lu_object *lo = &dt->do_lu; + struct osd_it *oit; + + ENTRY; + + if (!dt_object_exists(dt) || obj->oo_destroyed) + RETURN(ERR_PTR(-ENOENT)); + + oit = __osd_dir_it_init(env, dev, obj->oo_inode, attr); + if (IS_ERR(oit)) + RETURN(ERR_CAST(oit)); + + oit->oit_obj = obj; + lu_object_get(lo); + RETURN((struct dt_it *)oit); +} + +/** + * osd_dir_it_fini() - Destroy or finishes iterator context. + * @di: iterator structure to be destroyed + */ +static void osd_dir_it_fini(const struct lu_env *env, struct dt_it *di) +{ + struct osd_it *oit = (struct osd_it *)di; + struct osd_object *obj = oit->oit_obj; + struct inode *inode = obj->oo_inode; + + ENTRY; + + dput(oit->oit_file.f_path.dentry); + oit->oit_file.f_op->release(inode, &oit->oit_file); + OBD_FREE(oit->oit_buf, OSD_IT_BUFSIZE); + OBD_SLAB_FREE_PTR(oit, osd_it_cachep); + + osd_object_put(env, obj); + + EXIT; +} + + +/* + * It position the iterator at given key, so that next lookup continues from + * that key Or it is similar to dio_it->load() but based on a key, + * rather than file position. + * + * As a special convention, osd_it_ea_get(env, di, "") has to rewind iterator + * to the beginning. + * + * TODO: Presently return 1 considering it is only used by mdd_dir_is_empty(). 
+ */ +static int osd_dir_it_get(const struct lu_env *env, + struct dt_it *di, const struct dt_key *key) +{ + struct osd_it *it = (struct osd_it *)di; + struct file *file = &it->oit_file; + + ENTRY; + + LASSERT(((const char *)key)[0] == '\0'); + if (file->f_op->llseek) { + loff_t offset; + + offset = file->f_op->llseek(file, 0, 0); + if (offset != 0) + CWARN("Failed to llseek(): offset %lld != 0\n", offset); + } else { + it->oit_file.f_pos = 0; + } + + it->oit_rd_dirent = 0; + it->oit_it_dirent = 0; + it->oit_dirent = NULL; + + RETURN(1); +} + +/* Does nothing */ +static void osd_dir_it_put(const struct lu_env *env, struct dt_it *di) +{ +} + +/** + * osd_memfs_filldir() - It is called internally by ->iterate*() + * @buf: in which information to be filled in. + * @name: name of the file in given dir + * + * It fills the iterator's in-memory data structure with required + * information i.e. name, namelen, rec_size etc. + * + * Returns: + * * %0 - on success + * * %1 - on buffer full + */ +#ifdef HAVE_FILLDIR_USE_CTX +static FILLDIR_TYPE do_osd_memfs_filldir(struct dir_context *ctx, +#else +static int osd_memfs_filldir(void *ctx, +#endif + const char *name, int namelen, + loff_t offset, __u64 ino, unsigned int d_type) +{ + struct memfs_dir_context *mctx = (struct memfs_dir_context *)ctx; + struct osd_it *oit = (struct osd_it *)mctx->cbdata; + struct osd_object *obj = oit->oit_obj; + struct osd_it_dirent *ent = oit->oit_dirent; + struct lu_fid *fid = &ent->oitd_fid; + char *buf = oit->oit_buf; + + ENTRY; + + /* This should never happen */ + if (unlikely(namelen == 0 || namelen > NAME_MAX)) { + CERROR("MemFS return invalid namelen %d\n", namelen); + RETURN(-EIO); + } + + /* Check for enough space. Note oitd_name is not NUL terminated. */ + if (&ent->oitd_name[namelen] > buf + OSD_IT_BUFSIZE) + RETURN(1); + + /* "." is just the object itself. 
*/ + if (namelen == 1 && name[0] == '.') { + if (obj != NULL) + *fid = obj->oo_dt.do_lu.lo_header->loh_fid; + } else if (namelen == 2 && name[0] == '.' && name[1] == '.') { + if (obj != NULL) { + struct inode *inode = obj->oo_inode; + struct dentry *dentry; + struct dentry *parent; + + LASSERT(S_ISDIR(inode->i_mode)); + dentry = d_find_any_alias(inode); + parent = dentry->d_parent; + *fid = MEMFS_I(d_inode(parent))->mei_fid; + dput(dentry); + } + } else if (mctx->dentry) { + *fid = MEMFS_I(d_inode(mctx->dentry))->mei_fid; + } else { + fid_zero(fid); + } + + /* NOT export local root. */ + if (obj != NULL && + unlikely(osd_sb(osd_obj2dev(obj))->s_root->d_inode->i_ino == ino)) { + ino = obj->oo_inode->i_ino; + *fid = obj->oo_dt.do_lu.lo_header->loh_fid; + } + + if (obj == NULL || !(obj->oo_lma_flags & LUSTRE_ENCRYPT_FL)) { + ent->oitd_namelen = namelen; + memcpy(ent->oitd_name, name, namelen); + } else { + int encoded_namelen = critical_chars(name, namelen); + + /* Check again for enough space. */ + if (&ent->oitd_name[encoded_namelen] > buf + OSD_IT_BUFSIZE) + RETURN(1); + + ent->oitd_namelen = encoded_namelen; + + if (encoded_namelen == namelen) + memcpy(ent->oitd_name, name, namelen); + else + critical_encode(name, namelen, ent->oitd_name); + } + + ent->oitd_ino = ino; + ent->oitd_off = offset; + ent->oitd_type = d_type; + + oit->oit_rd_dirent++; + oit->oit_dirent = (void *)ent + + round_up(sizeof(*ent) + ent->oitd_namelen, 8); + CDEBUG(D_DENTRY, "Filldir: fid="DFID" name=%s off=%llu rd_dirent=%u\n", + PFID(fid), name, offset, oit->oit_rd_dirent); + RETURN(0); +} + +WRAP_FILLDIR_FN(do_, osd_memfs_filldir) + +/** + * osd_memfs_it_fill() - Calls ->iterate*() to load a directory entry at + * a time and stored it in iterator's in-memory data structure. 
+ * @di: iterator's in memory structure + * + * Returns: + * * %0 - on success + * * %-ve - on error + * * %1 - reach the end of entry + */ +static int osd_memfs_it_fill(const struct lu_env *env, const struct dt_it *di) +{ + struct osd_it *it = (struct osd_it *)di; + struct file *filp = &it->oit_file; + struct inode *dir = file_inode(filp); + struct memfs_dir_context mctx = { + .super.actor = osd_memfs_filldir, + .dentry = NULL, + .cbdata = it + }; + int rc = 0; + + ENTRY; + + it->oit_dirent = it->oit_buf; + it->oit_rd_dirent = 0; + +#ifdef HAVE_FOP_ITERATE_SHARED + inode_lock_shared(dir); +#else + inode_lock(dir); +#endif + if (!IS_DEADDIR(dir)) { + if (filp->f_op->iterate_shared) { + mctx.super.pos = filp->f_pos; + rc = filp->f_op->iterate_shared(filp, &mctx.super); + filp->f_pos = mctx.super.pos; + } else { +#ifdef HAVE_FOP_READDIR + rc = filp->f_op->readdir(filp, &mctx.super, + mctx.super.actor); + mctx.super.pos = filp->f_pos; +#else + rc = -ENOTDIR; +#endif + } + } +#ifdef HAVE_FOP_ITERATE_SHARED + inode_unlock_shared(dir); +#else + inode_unlock(dir); +#endif + if (rc) + RETURN(rc); + + if (it->oit_rd_dirent == 0) { + /* + * If it does not get any dirent, it means it has been reached + * to the end of the dir + */ + it->oit_file.f_pos = MEMFS_DIR_EOF; + rc = 1; + } else { + it->oit_dirent = it->oit_buf; + it->oit_it_dirent = 1; + } + + RETURN(rc); +} + +/** + * osd_dir_it_next() - It calls osd_memfs_it_fill() which will use + * ->iterate*() to load a directory entry at a time and stored it in + * iterator's in-memory data structure. 
+ * @di: iterator's in memory structure + * + * Returns: + * * %ve - iterator reached to end + * * %0 - iterator not reached to end + * * %-ve - on error + */ +static int osd_dir_it_next(const struct lu_env *env, struct dt_it *di) +{ + struct osd_it *it = (struct osd_it *)di; + int rc; + + ENTRY; + + if (it->oit_it_dirent < it->oit_rd_dirent) { + it->oit_dirent = + (void *)it->oit_dirent + + round_up(sizeof(struct osd_it_dirent) + + it->oit_dirent->oitd_namelen, 8); + it->oit_it_dirent++; + rc = 0; + } else { + if (it->oit_file.f_pos == MEMFS_DIR_EOF) + rc = 1; + else + rc = osd_memfs_it_fill(env, di); + } + + RETURN(rc); +} + +/** + * osd_dir_it_key() - Returns the key at current position from + * iterator's in memory structure. + * @di: iterator's in memory structure + * + * Returns: key i.e. struct dt_key on success + */ +static struct dt_key *osd_dir_it_key(const struct lu_env *env, + const struct dt_it *di) +{ + struct osd_it *it = (struct osd_it *)di; + + return (struct dt_key *)it->oit_dirent->oitd_name; +} + +/** + * osd_dir_it_key_size() - Returns key's size at current position + * from iterator's in memory structure. + * @di: iterator's in memory structure + * + * Returns: key_size i.e. struct dt_key on success + */ +static int osd_dir_it_key_size(const struct lu_env *env, const struct dt_it *di) +{ + struct osd_it *it = (struct osd_it *)di; + + return it->oit_dirent->oitd_namelen; +} + +static inline void +osd_it_append_attrs(struct lu_dirent *ent, int len, __u16 type) +{ + /* check if file type is required */ + if (ent->lde_attrs & LUDA_TYPE) { + struct luda_type *lt; + int align = sizeof(*lt) - 1; + + len = (len + align) & ~align; + lt = (struct luda_type *)(ent->lde_name + len); + lt->lt_type = cpu_to_le16(DTTOIF(type)); + } + + ent->lde_attrs = cpu_to_le32(ent->lde_attrs); +} + +/* + * build lu direct from backend fs dirent. 
+ */ +static inline void +osd_it_pack_dirent(struct lu_dirent *ent, struct lu_fid *fid, __u64 offset, + char *name, __u16 namelen, __u16 type, __u32 attr) +{ + ent->lde_attrs = attr | LUDA_FID; + fid_cpu_to_le(&ent->lde_fid, fid); + + ent->lde_hash = cpu_to_le64(offset); + ent->lde_reclen = cpu_to_le16(lu_dirent_calc_size(namelen, attr)); + + strncpy(ent->lde_name, name, namelen); + ent->lde_name[namelen] = '\0'; + ent->lde_namelen = cpu_to_le16(namelen); + + /* append lustre attributes */ + osd_it_append_attrs(ent, namelen, type); +} + +/** + * osd_dir_it_rec() - Returns the value at current position from + * iterator's in memory structure. + * @di: struct osd_it, iterator's in memory structure + * @dtrec: lustre dirent + * @attr: attr requested for dirent. + * + * Returns: + * %0 - no error and \param lde has correct lustre dirent. + * %-ve - on error + */ +static inline int osd_dir_it_rec(const struct lu_env *env, + const struct dt_it *di, + struct dt_rec *dtrec, __u32 attr) +{ + struct osd_it *it = (struct osd_it *)di; + struct lu_fid *fid = &it->oit_dirent->oitd_fid; + struct lu_dirent *lde = (struct lu_dirent *)dtrec; + + ENTRY; + + /* TODO: lfsck checking support.*/ + + attr &= ~LU_DIRENT_ATTRS_MASK; + /* Pack the entry anyway, at least the offset is right. */ + osd_it_pack_dirent(lde, fid, it->oit_dirent->oitd_off, + it->oit_dirent->oitd_name, + it->oit_dirent->oitd_namelen, + it->oit_dirent->oitd_type, attr); + + RETURN(0); +} + +/** + * osd_dir_it_rec_size() - Returns the record size at current position. + * @env: execution environment + * @di: iterator's in memory structure + * @attr: attribute of the entry, only requires LUDA_TYPE to + * calculate the lu_dirent size. + * + * This function will return record(lu_dirent) size in bytes. + * + * Returns: record size(in bytes & in memory) of the current lu_dirent + * entry. 
+ */ +static int osd_dir_it_rec_size(const struct lu_env *env, const struct dt_it *di, + __u32 attr) +{ + struct osd_it *it = (struct osd_it *)di; + + return lu_dirent_calc_size(it->oit_dirent->oitd_namelen, attr); +} + +/** + * osd_dir_it_store() - Returns a cookie for current position of the iterator + * head, so that user can use this cookie to load/start the iterator next + * time. + * @di: iterator's in memory structure + * + * Returns: cookie for current position, on success + */ +static __u64 osd_dir_it_store(const struct lu_env *env, const struct dt_it *di) +{ + struct osd_it *it = (struct osd_it *)di; + + return it->oit_dirent->oitd_off; +} + +/** + * osd_dir_it_load() - It calls osd_memfs_it_fill() which will use + * ->iterate*() to load a directory entry at a time and stored it + * in iterator's in-memory data structure. + * @di: struct osd_it, iterator's in memory structure + * + * Returns: + * * %ve - on success + * * %-ve - on error + */ +static int osd_dir_it_load(const struct lu_env *env, + const struct dt_it *di, __u64 hash) +{ + struct osd_it *it = (struct osd_it *)di; + struct file *file = &it->oit_file; + loff_t offset; + int rc; + + ENTRY; + + if (file->f_op->llseek) { + offset = file->f_op->llseek(file, hash, 0); + if (offset != hash) + CWARN("Failed to llseek(): offset %lld != hash %llu\n", + offset, hash); + } else { + it->oit_file.f_pos = hash; + } + + rc = osd_memfs_it_fill(env, di); + if (rc > 0) + rc = -ENODATA; + + if (rc == 0) + rc = 1; + + RETURN(rc); +} + +const struct dt_index_operations osd_dir_ops = { + .dio_lookup = osd_index_dir_lookup, + .dio_insert = osd_index_dir_insert, + .dio_delete = osd_index_dir_delete, + .dio_it = { + .init = osd_dir_it_init, + .fini = osd_dir_it_fini, + .get = osd_dir_it_get, + .put = osd_dir_it_put, + .next = osd_dir_it_next, + .key = osd_dir_it_key, + .key_size = osd_dir_it_key_size, + .rec = osd_dir_it_rec, + .rec_size = osd_dir_it_rec_size, + .store = osd_dir_it_store, + .load = osd_dir_it_load + } 
+}; diff --git a/lustre/osd-wbcfs/osd_handler.c b/lustre/osd-wbcfs/osd_handler.c new file mode 100644 index 0000000..9404211 --- /dev/null +++ b/lustre/osd-wbcfs/osd_handler.c @@ -0,0 +1,680 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * wbcFS OSD module + * + * Author: Yingjin Qian + */ + +#define DEBUG_SUBSYSTEM S_OSD + +#include +#include +#include +#include + +#include "osd_internal.h" +#include "wbcfs.h" + +struct kmem_cache *osd_it_cachep; +struct kmem_cache *osd_hash_it_cachep; + +static struct lu_kmem_descr wbcfs_caches[] = { + { + .ckd_cache = &osd_it_cachep, + .ckd_name = "osd_it_cache", + .ckd_size = sizeof(struct osd_it) + }, + { + .ckd_cache = &osd_hash_it_cachep, + .ckd_name = "osd_hash_it_cache", + .ckd_size = sizeof(struct osd_hash_it) + }, + { + .ckd_cache = NULL + } +}; + +/* Copied from osd-ldiskfs to open/put file handles in kernel. */ +struct work_struct flush_fput; +atomic_t descriptors_cnt; +unsigned int wbcfs_flush_descriptors_cnt = 5000; + +#ifdef HAVE_FLUSH_DELAYED_FPUT +# define cfs_flush_delayed_fput() flush_delayed_fput() +#else +void (*cfs_flush_delayed_fput)(void); +#endif /* HAVE_FLUSH_DELAYED_FPUT */ + +/* Work item: drain delayed fputs once too many pseudo files accumulate. */ +static void osd_flush_fput(struct work_struct *work) +{ + /* flush file descriptors when too many files */ + CDEBUG_LIMIT(D_HA, "Flushing file descriptors limit %d\n", + wbcfs_flush_descriptors_cnt); + + /* descriptors_cnt triggers the threshold when a flush is started, + * but all pending descriptors will be flushed each time, so it + * doesn't need to exactly match the number of descriptors.
+ */ + atomic_set(&descriptors_cnt, 0); + cfs_flush_delayed_fput(); +} + +static struct lu_object *osd_object_alloc(const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *d) +{ + struct osd_object *obj; + struct lu_object *l; + + OBD_ALLOC_PTR(obj); + if (!obj) + return NULL; + + l = &obj->oo_dt.do_lu; + dt_object_init(&obj->oo_dt, NULL, d); + obj->oo_header = NULL; + obj->oo_dt.do_ops = &osd_obj_ops; + l->lo_ops = &osd_lu_obj_ops; + spin_lock_init(&obj->oo_guard); + init_rwsem(&obj->oo_dt.dd_sem); + init_rwsem(&obj->oo_sem); + return l; +} + +static int osd_shutdown(const struct lu_env *env, struct osd_device *osd) +{ + seq_target_fini(env, &osd->od_dt_dev); + return 0; +} + +static int osd_mount(const struct lu_env *env, + struct osd_device *osd, struct lustre_cfg *cfg) +{ + struct file_system_type *type; + struct inode *inode; + unsigned long flags = 0; + struct lu_fid fid; + int rc = 0; + + ENTRY; + + if (osd->od_mnt != NULL) + RETURN(0); + + type = get_fs_type("wbcfs"); + if (type == NULL) { + CERROR("%s: Cannot find wbcfs FS type.\n", osd_name(osd)); + RETURN(-ENODEV); + } + + flags |= SB_KERNMOUNT; + osd->od_mnt = vfs_kern_mount(type, flags, NULL, NULL); + module_put(type->owner); + + if (IS_ERR(osd->od_mnt)) { + rc = PTR_ERR(osd->od_mnt); + osd->od_mnt = NULL; + CERROR("%s: Failed to mount wbcfs in kernel: rc=%d\n", + osd_name(osd), rc); + RETURN(rc); + } + + inode = osd_sb(osd)->s_root->d_inode; + lu_local_obj_fid(&fid, OSD_FS_ROOT_OID); + inode->i_ino = lu_fid_build_ino(&fid, 0); + inode->i_generation = lu_fid_build_gen(&fid); + MEMFS_I(inode)->mei_fid = fid; + __insert_inode_hash(inode, inode->i_ino); + + RETURN(rc); +} + +static int osd_process_config(const struct lu_env *env, + struct lu_device *d, struct lustre_cfg *cfg) +{ + struct osd_device *osd = osd_dev(d); + int count; + int rc; + + ENTRY; + + switch (cfg->lcfg_command) { + case LCFG_SETUP: + rc = osd_mount(env, osd, cfg); + break; + case LCFG_CLEANUP: + /* + * For 
the case LCFG_PRE_CLEANUP is not called in advance, + * that may happen if hit failure during mount process. + */ + lu_dev_del_linkage(d->ld_site, d); + rc = osd_shutdown(env, osd); + break; + case LCFG_PARAM: + LASSERT(&osd->od_dt_dev); + count = class_modify_config(cfg, PARAM_OSD, + &osd->od_dt_dev.dd_kobj); + if (count < 0) + count = class_modify_config(cfg, PARAM_OST, + &osd->od_dt_dev.dd_kobj); + rc = count > 0 ? 0 : count; + break; + case LCFG_PRE_CLEANUP: + rc = 0; + break; + default: + rc = -EOPNOTSUPP; + } + + RETURN(rc); +} + +static int osd_recovery_complete(const struct lu_env *env, struct lu_device *d) +{ + RETURN(0); +} + +static int osd_prepare(const struct lu_env *env, struct lu_device *pdev, + struct lu_device *dev) +{ + struct osd_device *osd = osd_dev(dev); + int rc = 0; + + rc = seq_target_init(env, &osd->od_dt_dev, osd->od_svname, + osd->od_is_ost); + + RETURN(rc); +} + +const struct lu_device_operations osd_lu_ops = { + .ldo_object_alloc = osd_object_alloc, + .ldo_process_config = osd_process_config, + .ldo_recovery_complete = osd_recovery_complete, + .ldo_prepare = osd_prepare, + .ldo_fid_alloc = fid_alloc_generic, +}; + +static int osd_root_get(const struct lu_env *env, + struct dt_device *dev, struct lu_fid *f) +{ + lu_local_obj_fid(f, OSD_FS_ROOT_OID); + return 0; +} + +static int osd_statfs(const struct lu_env *env, struct dt_device *d, + struct obd_statfs *sfs, struct obd_statfs_info *info) +{ + struct osd_device *osd = osd_dt_dev(d); + struct super_block *sb = osd_sb(osd); + struct kstatfs ksfs; + int rc; + + if (unlikely(!sb)) + return -EINPROGRESS; + + memset(&ksfs, 0, sizeof(ksfs)); + rc = sb->s_op->statfs(sb->s_root, &ksfs); + if (rc) + RETURN(rc); + + statfs_pack(sfs, &ksfs); + if (unlikely(sb->s_flags & SB_RDONLY)) + sfs->os_state |= OS_STATFS_READONLY; + + if (sfs->os_blocks == 0) { + sfs->os_blocks = memfs_default_max_blocks(); + sfs->os_bfree = sfs->os_blocks; + sfs->os_bavail = sfs->os_bfree; + } + + if (sfs->os_files == 0) { 
+ sfs->os_files = memfs_default_max_inodes(); + sfs->os_ffree = sfs->os_files; + } + + sfs->os_state |= OS_STATFS_NONROT; + sfs->os_namelen = NAME_MAX; + sfs->os_maxbytes = sb->s_maxbytes; + + return 0; +} + +static struct thandle *osd_trans_create(const struct lu_env *env, + struct dt_device *d) +{ + struct osd_thandle *oh; + struct thandle *th; + + ENTRY; + + if (d->dd_rdonly) { + CERROR("%s: someone try to start transaction under readonly mode, should be disabled.\n", + osd_name(osd_dt_dev(d))); + dump_stack(); + RETURN(ERR_PTR(-EROFS)); + } + + sb_start_write(osd_sb(osd_dt_dev(d))); + + OBD_ALLOC_PTR(oh); + if (!oh) { + sb_end_write(osd_sb(osd_dt_dev(d))); + RETURN(ERR_PTR(-ENOMEM)); + } + + th = &oh->ot_super; + th->th_dev = d; + th->th_result = 0; + INIT_LIST_HEAD(&oh->ot_commit_dcb_list); + INIT_LIST_HEAD(&oh->ot_stop_dcb_list); + + RETURN(th); +} + +static int osd_trans_start(const struct lu_env *env, struct dt_device *d, + struct thandle *th) +{ + int rc; + + ENTRY; + + rc = dt_txn_hook_start(env, d, th); + RETURN(rc); +} + +static void osd_trans_commit_cb(struct osd_thandle *oh, int result) +{ + struct thandle *th = &oh->ot_super; + struct dt_txn_commit_cb *dcb, *tmp; + + /* call per-transaction callbacks if any */ + list_for_each_entry_safe(dcb, tmp, &oh->ot_commit_dcb_list, + dcb_linkage) { + LASSERTF(dcb->dcb_magic == TRANS_COMMIT_CB_MAGIC, + "commit callback entry: magic=%x name='%s'\n", + dcb->dcb_magic, dcb->dcb_name); + list_del_init(&dcb->dcb_linkage); + dcb->dcb_func(NULL, th, dcb, result); + } +} + +static void osd_trans_stop_cb(struct osd_thandle *oh, int result) +{ + struct thandle *th = &oh->ot_super; + struct dt_txn_commit_cb *dcb, *tmp; + + /* call per-transaction stop callbacks if any */ + list_for_each_entry_safe(dcb, tmp, &oh->ot_stop_dcb_list, + dcb_linkage) { + LASSERTF(dcb->dcb_magic == TRANS_COMMIT_CB_MAGIC, + "commit callback entry: magic=%x name='%s'\n", + dcb->dcb_magic, dcb->dcb_name); + list_del_init(&dcb->dcb_linkage); + 
dcb->dcb_func(NULL, th, dcb, result); + } +} + +/* + * Stop a transaction: run the dt txn stop hooks, fire the per-transaction + * stop and commit callbacks (MemFS has no real journal, so "commit" happens + * at stop time), release the sb write reference and free the handle. + */ +static int osd_trans_stop(const struct lu_env *env, struct dt_device *dt, + struct thandle *th) +{ + struct osd_device *osd = osd_dt_dev(th->th_dev); + struct osd_thandle *oh; + int rc = 0; + + ENTRY; + oh = container_of(th, struct osd_thandle, ot_super); + + rc = dt_txn_hook_stop(env, th); + if (rc) + CERROR("%s: failed in transaction hook: rc=%d\n", + osd_name(osd), rc); + + osd_trans_stop_cb(oh, rc); + /* FIXME: using th->th_result? */ + osd_trans_commit_cb(oh, rc); + sb_end_write(osd_sb(osd)); + + th->th_dev = NULL; + OBD_FREE_PTR(oh); + RETURN(rc); +} + +/* Register a commit (or stop, if DCB_TRANS_STOP) callback on a transaction. */ +static int osd_trans_cb_add(struct thandle *th, struct dt_txn_commit_cb *dcb) +{ + struct osd_thandle *oh = container_of(th, struct osd_thandle, + ot_super); + + LASSERT(dcb->dcb_magic == TRANS_COMMIT_CB_MAGIC); + /* Check the function pointer itself; taking the address of the + * member (&dcb->dcb_func) can never be NULL, making the old + * assertion vacuous. + */ + LASSERT(dcb->dcb_func != NULL); + + if (dcb->dcb_flags & DCB_TRANS_STOP) + list_add(&dcb->dcb_linkage, &oh->ot_stop_dcb_list); + else + list_add(&dcb->dcb_linkage, &oh->ot_commit_dcb_list); + + return 0; +} + +/* Report the static tunables/limits of this OSD back to the upper layers. */ +static void osd_conf_get(const struct lu_env *env, + const struct dt_device *dev, + struct dt_device_param *param) +{ + struct osd_device *osd = osd_dt_dev(dev); + struct super_block *sb = osd_sb(osd); + + param->ddp_max_name_len = NAME_MAX; + /* 1 << 31 on a signed int shifts into the sign bit (UB); + * use an unsigned constant for the link-count limit. + */ + param->ddp_max_nlink = 1U << 31; + param->ddp_symlink_max = sb->s_blocksize; + param->ddp_mount_type = LDD_MT_WBCFS; + param->ddp_maxbytes = sb->s_maxbytes; + param->ddp_max_extent_blks = 1024; + param->ddp_extent_tax = 1024; + + param->ddp_mntopts = MNTOPT_USERXATTR; + + /* TODO: Add support for MNTOPT_ACL.
*/ + + param->ddp_max_ea_size = OBD_MAX_EA_SIZE; + param->ddp_inodespace = 1024; + param->ddp_brw_size = DT_DEF_BRW_SIZE; + + param->ddp_has_lseek_data_hole = true; +} + +static int osd_ro(const struct lu_env *env, struct dt_device *d) +{ + int rc = -EOPNOTSUPP; + + ENTRY; + + CERROR("%s: cannot be set readonly: rc=%d\n", + osd_dt_dev(d)->od_svname, rc); + + RETURN(rc); +} + +static int osd_reserve_or_free_quota(const struct lu_env *env, + struct dt_device *dev, + struct lquota_id_info *qi) +{ + RETURN(0); +} + +static int osd_sync(const struct lu_env *env, struct dt_device *d) +{ + RETURN(0); +} + +static int osd_commit_async(const struct lu_env *env, struct dt_device *dev) +{ + RETURN(0); +} + +static const struct dt_device_operations osd_dt_ops = { + .dt_root_get = osd_root_get, + .dt_statfs = osd_statfs, + .dt_trans_create = osd_trans_create, + .dt_trans_start = osd_trans_start, + .dt_trans_stop = osd_trans_stop, + .dt_trans_cb_add = osd_trans_cb_add, + .dt_conf_get = osd_conf_get, + .dt_ro = osd_ro, + .dt_reserve_or_free_quota = osd_reserve_or_free_quota, + .dt_sync = osd_sync, + .dt_commit_async = osd_commit_async, +}; + +static void osd_umount(const struct lu_env *env, struct osd_device *dev) +{ + ENTRY; + + if (dev->od_mnt) { + shrink_dcache_sb(osd_sb(dev)); + mntput(dev->od_mnt); + dev->od_mnt = NULL; + } + + /* to be sure all delayed fput are finished. */ + cfs_flush_delayed_fput(); + + EXIT; +} + +static int __osd_device_init(const struct lu_env *env, struct osd_device *osd, + struct lustre_cfg *cfg) +{ + struct lu_device *ld = osd2lu_dev(osd); + int cplen = 0; + int rc; + + rc = lu_env_refill((struct lu_env *)env); + if (rc) + RETURN(rc); + + ld->ld_ops = &osd_lu_ops; + osd->od_dt_dev.dd_ops = &osd_dt_ops; + + cplen = strscpy(osd->od_svname, lustre_cfg_string(cfg, 4), + sizeof(osd->od_svname)); + if (cplen < 0) + GOTO(out, rc = cplen); + + /* -1 means that index is invalid. 
*/ + osd->od_index = -1; + rc = server_name2index(osd->od_svname, &osd->od_index, NULL); + if (rc == LDD_F_SV_TYPE_OST) + osd->od_is_ost = 1; + + rc = osd_mount(env, osd, cfg); + if (rc) + GOTO(out, rc); + + rc = lu_site_init(&osd->od_site, ld); + if (rc) + GOTO(out_mnt, rc); + osd->od_site.ls_bottom_dev = ld; + + rc = lu_site_init_finish(&osd->od_site); + if (rc) + GOTO(out_site, rc); + + RETURN(0); + +out_site: + lu_site_fini(&osd->od_site); +out_mnt: + osd_umount(env, osd); +out: + return rc; +} + +static struct lu_device *osd_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg) +{ + struct osd_device *osd; + int rc; + + ENTRY; + + OBD_ALLOC_PTR(osd); + if (osd == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + rc = dt_device_init(&osd->od_dt_dev, t); + if (unlikely(rc)) { + OBD_FREE_PTR(osd); + GOTO(out, rc); + } + + rc = __osd_device_init(env, osd, cfg); +out: + RETURN(rc == 0 ? osd2lu_dev(osd) : ERR_PTR(rc)); +} + +static struct lu_device *osd_device_free(const struct lu_env *env, + struct lu_device *d) +{ + struct osd_device *osd = osd_dev(d); + + ENTRY; + + /* XXX: make osd top device in order to release reference */ + d->ld_site->ls_top_dev = d; + lu_site_purge(env, d->ld_site, -1); + lu_site_print(env, d->ld_site, &d->ld_site->ls_obj_hash.nelems, + D_ERROR, lu_cdebug_printer); + + lu_site_fini(&osd->od_site); + dt_device_fini(&osd->od_dt_dev); + OBD_FREE_PTR(osd); + + RETURN(NULL); +} + +static int osd_device_init(const struct lu_env *env, struct lu_device *d, + const char *name, struct lu_device *next) +{ + return 0; +} + +static struct lu_device *osd_device_fini(const struct lu_env *env, + struct lu_device *d) +{ + struct osd_device *osd = osd_dev(d); + + ENTRY; + + osd_shutdown(env, osd); + osd_umount(env, osd); + RETURN(NULL); +} + +static const struct lu_device_type_operations osd_device_type_ops = { + .ldto_device_alloc = osd_device_alloc, + .ldto_device_free = osd_device_free, + .ldto_device_init = osd_device_init, + 
.ldto_device_fini = osd_device_fini +}; + +static struct lu_device_type osd_device_type = { + .ldt_tags = LU_DEVICE_DT, + .ldt_name = LUSTRE_OSD_WBCFS_NAME, + .ldt_ops = &osd_device_type_ops, + .ldt_ctx_tags = LCT_LOCAL +}; + +/* We use exports to track all osd users. */ +static int osd_obd_connect(const struct lu_env *env, struct obd_export **exp, + struct obd_device *obd, struct obd_uuid *cluuid, + struct obd_connect_data *data, void *localdata) +{ + struct osd_device *osd = osd_dev(obd->obd_lu_dev); + struct lustre_handle conn; + int rc; + + ENTRY; + + CDEBUG(D_CONFIG, "connect #%d\n", atomic_read(&osd->od_connects)); + + rc = class_connect(&conn, obd, cluuid); + if (rc) + RETURN(rc); + + *exp = class_conn2export(&conn); + atomic_inc(&osd->od_connects); + + RETURN(0); +} + +/* + * Once last export (we do not count self-export) disappeared, + * OSD can be released. + */ +static int osd_obd_disconnect(struct obd_export *exp) +{ + struct obd_device *obd = exp->exp_obd; + struct osd_device *osd = osd_dev(obd->obd_lu_dev); + int rc, release = 0; + + ENTRY; + + /* Only disconnect the underlying layers on the final disconnect. 
*/ + release = atomic_dec_and_test(&osd->od_connects); + rc = class_disconnect(exp); + + if (rc == 0 && release) + class_manual_cleanup(obd); + + RETURN(rc); +} + +static int osd_health_check(const struct lu_env *env, struct obd_device *obd) +{ + struct osd_device *osd = osd_dev(obd->obd_lu_dev); + struct super_block *sb = osd_sb(osd); + + return (!sb || sb->s_flags & SB_RDONLY); +} + +static const struct obd_ops osd_obd_device_ops = { + .o_owner = THIS_MODULE, + .o_connect = osd_obd_connect, + .o_disconnect = osd_obd_disconnect, + .o_health_check = osd_health_check, +}; + +static int __init osd_init(void) +{ + int rc; + + rc = libcfs_setup(); + if (rc) + return rc; + + rc = lu_kmem_init(wbcfs_caches); + if (rc) + return rc; + + rc = memfs_init(); + if (rc) + GOTO(out_kmem, rc); + + rc = class_register_type(&osd_obd_device_ops, NULL, true, + LUSTRE_OSD_WBCFS_NAME, &osd_device_type); + if (rc) + GOTO(out_memfs, rc); + +#ifndef HAVE_FLUSH_DELAYED_FPUT + if (unlikely(cfs_flush_delayed_fput == NULL)) + cfs_flush_delayed_fput = + cfs_kallsyms_lookup_name("flush_delayed_fput"); +#endif + + INIT_WORK(&flush_fput, osd_flush_fput); + + return 0; + +out_memfs: + memfs_fini(); +out_kmem: + lu_kmem_fini(wbcfs_caches); + return rc; +} + +static void __exit osd_exit(void) +{ + cancel_work_sync(&flush_fput); + class_unregister_type(LUSTRE_OSD_WBCFS_NAME); + memfs_fini(); + lu_kmem_fini(wbcfs_caches); +} + +MODULE_AUTHOR("Yingjin Qian "); +MODULE_DESCRIPTION("Lustre Object Storage Device ("LUSTRE_OSD_WBCFS_NAME")"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(osd_init); +module_exit(osd_exit); diff --git a/lustre/osd-wbcfs/osd_hash.c b/lustre/osd-wbcfs/osd_hash.c new file mode 100644 index 0000000..1398f81 --- /dev/null +++ b/lustre/osd-wbcfs/osd_hash.c @@ -0,0 +1,177 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (c) 2024-2025, Amazon and/or its affiliates. All rights reserved. + * Use is subject to license terms. 
+ * + * Copyright (c) 2025-2026, DDN/Whamcloud, Inc. + */ + +/* + * Hash index with FIXED key length. + * Traverse the index via linear list scanning. + * + * Author: Timothy Day + * Author: Yingjin Qian + */ + +#define DEBUG_SUBSYSTEM S_OSD + +#include +#include + +#include "index.h" + +static u32 hash_index_keyhash(const void *data, u32 len, u32 seed) +{ + return jhash(data, len, seed); +} + +static u32 hash_index_entry_keyhash(const void *data, u32 len, u32 seed) +{ + struct hash_index_entry *entry = (struct hash_index_entry *)data; + + return hash_index_keyhash(&entry->he_buf, entry->he_keylen, seed); +} + +/* + * rhashtable compare callback: return 0 when the stored key matches the + * lookup key. Must use memcmp() here: memcpy() returns its destination + * pointer (never NULL), so "!memcpy(...)" was always false - every lookup + * failed with -ESRCH - and, worse, it overwrote the stored key with the + * probe key, corrupting the table entry. + */ +static int hash_index_keycmp(struct rhashtable_compare_arg *arg, + const void *obj) +{ + struct hash_index_entry *entry = (struct hash_index_entry *)obj; + + LASSERT(arg->ht->key_len == entry->he_keylen); + + if (!memcmp(entry->he_buf, arg->key, entry->he_keylen)) + return 0; + + /* ESRCH is typical for rhashtable */ + return -ESRCH; +} + +static const struct rhashtable_params hash_index_params = { + .head_offset = offsetof(struct hash_index_entry, he_hash), + .hashfn = hash_index_keyhash, + .obj_hashfn = hash_index_entry_keyhash, + .obj_cmpfn = hash_index_keycmp, + .automatic_shrinking = true, +}; + +int hash_index_init(struct hash_index *hind, size_t keylen, size_t reclen) +{ + int rc; + + LASSERT(keylen > 0); + INIT_LIST_HEAD(&hind->hi_list); + hind->hi_htbl_params = hash_index_params; + hind->hi_htbl_params.key_len = keylen; + hind->hi_reclen = reclen; + rc = rhashtable_init(&hind->hi_htbl, &hind->hi_htbl_params); + return rc; +} + +void hash_index_fini(struct hash_index *hind) +{ + struct hash_index_entry *entry, *tmp; + + if (!hind) + return; + + list_for_each_entry_safe(entry, tmp, &hind->hi_list, he_list_item) { + rhashtable_remove_fast(&hind->hi_htbl, &entry->he_hash, + hind->hi_htbl_params); + list_del(&entry->he_list_item); + OBD_FREE(entry, entry->he_len); + } + + rhashtable_destroy(&hind->hi_htbl); +} + +struct hash_index_entry *
+hash_index_lookup_entry(struct hash_index *hind, const void *key) +{ + struct hash_index_entry *entry; + + entry = rhashtable_lookup_fast(&hind->hi_htbl, key, + hind->hi_htbl_params); + return entry; +} + +int hash_index_lookup(struct hash_index *hind, const void *key, void *rec) +{ + struct hash_index_entry *entry; + int rc = 0; + + entry = rhashtable_lookup_fast(&hind->hi_htbl, key, + hind->hi_htbl_params); + if (entry) { + size_t reclen; + + reclen = entry->he_len - sizeof(*entry) - entry->he_keylen; + LASSERT(ergo(hind->hi_reclen, hind->hi_reclen == reclen)); + memcpy(rec, entry->he_buf + entry->he_keylen, reclen); + return 1; + } + + return rc; +} + +int hash_index_insert(struct hash_index *hind, void *key, size_t keylen, + void *rec, size_t reclen) +{ + struct hash_index_entry *entry; + size_t len; + int rc = 0; + + ENTRY; + + if (!keylen) + keylen = hind->hi_htbl_params.key_len; + else + LASSERT(keylen == hind->hi_htbl_params.key_len); + if (!reclen) + reclen = hind->hi_reclen; + else + LASSERT(reclen == hind->hi_reclen); + + len = sizeof(*entry) + keylen + reclen; + OBD_ALLOC(entry, len); + if (!entry) + RETURN(-ENOMEM); + + entry->he_len = len; + entry->he_keylen = keylen; + memcpy(entry->he_buf, key, keylen); + memcpy(entry->he_buf + keylen, rec, reclen); + + rc = rhashtable_insert_fast(&hind->hi_htbl, &entry->he_hash, + hind->hi_htbl_params); + LASSERT(rc != -EBUSY); + if (rc) + GOTO(out_free, rc); + + list_add_tail(&entry->he_list_item, &hind->hi_list); + + /* TODO: Rollover? Should at least add detection... 
*/ + entry->he_offset = hind->hi_next_offset++; + RETURN(0); + +out_free: + OBD_FREE(entry, len); + RETURN(rc); +} + +void hash_index_remove(struct hash_index *hind, const void *key) +{ + struct hash_index_entry *entry; + + entry = rhashtable_lookup_fast(&hind->hi_htbl, key, + hind->hi_htbl_params); + if (!entry) + return; + + rhashtable_remove_fast(&hind->hi_htbl, &entry->he_hash, + hind->hi_htbl_params); + /* FIXME: use RCU for list insert/remove. */ + list_del(&entry->he_list_item); + OBD_FREE(entry, entry->he_len); +} diff --git a/lustre/osd-wbcfs/osd_index_hash.c b/lustre/osd-wbcfs/osd_index_hash.c new file mode 100644 index 0000000..886e9a2 --- /dev/null +++ b/lustre/osd-wbcfs/osd_index_hash.c @@ -0,0 +1,299 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (c) 2024-2025, Amazon and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2025-2026, DDN/Whamcloud, Inc. + */ + +/* + * Index Access Module. + * + * Author: Timothy Day + * Author: Yingjin Qian + */ + +#define DEBUG_SUBSYSTEM S_OSD + +#include +#include +#include +#include + +#include "osd_internal.h" +#include "wbcfs.h" + +static int osd_hash_index_lookup(const struct lu_env *env, struct dt_object *dt, + struct dt_rec *rec, const struct dt_key *key) +{ + struct osd_object *obj = osd_dt_obj(dt); + struct hash_index *hind = &MEMFS_I(obj->oo_inode)->mei_hash_index; + int rc; + + ENTRY; + + down_read(&obj->oo_sem); + rc = hash_index_lookup(hind, (void *)key, rec); + up_read(&obj->oo_sem); + + RETURN(rc); +} + +static int +osd_hash_index_insert(const struct lu_env *env, struct dt_object *dt, + const struct dt_rec *rec, const struct dt_key *key, + struct thandle *th) +{ + struct osd_object *obj = osd_dt_obj(dt); + struct hash_index *hind = &MEMFS_I(obj->oo_inode)->mei_hash_index; + int rc; + + ENTRY; + + down_write(&obj->oo_sem); + rc = hash_index_insert(hind, (void *)key, 0, (void *)rec, 0); + up_write(&obj->oo_sem); + RETURN(rc); +} + +static int 
osd_hash_index_delete(const struct lu_env *env, struct dt_object *dt, + const struct dt_key *key, struct thandle *th) +{ + struct osd_object *obj = osd_dt_obj(dt); + struct hash_index *hind = &MEMFS_I(obj->oo_inode)->mei_hash_index; + + ENTRY; + + down_write(&obj->oo_sem); + hash_index_remove(hind, (void *)key); + up_write(&obj->oo_sem); + + RETURN(0); +} + +static struct dt_it *osd_hash_index_it_init(const struct lu_env *env, + struct dt_object *dt, __u32 unused) +{ + struct osd_object *obj = osd_dt_obj(dt); + struct hash_index *hind = &MEMFS_I(obj->oo_inode)->mei_hash_index; + struct osd_hash_it *it; + + ENTRY; + + if (obj->oo_destroyed) + RETURN(ERR_PTR(-ENOENT)); + + OBD_SLAB_ALLOC_PTR(it, osd_hash_it_cachep); + if (!it) + RETURN(ERR_PTR(-ENOMEM)); + + /* FIXME: race between concurrent iterating and deleting */ + it->hit_cursor = &hind->hi_list; + it->hit_obj = obj; + + RETURN((struct dt_it *)it); +} + +static void osd_hash_index_it_fini(const struct lu_env *env, + struct dt_it *di) +{ + struct osd_hash_it *it = (struct osd_hash_it *)di; + + ENTRY; + OBD_SLAB_FREE_PTR(it, osd_hash_it_cachep); + EXIT; +} + +static int osd_hash_index_it_get(const struct lu_env *env, struct dt_it *di, + const struct dt_key *key) +{ + struct osd_hash_it *it = (struct osd_hash_it *)di; + struct osd_object *obj = it->hit_obj; + struct hash_index_entry *entry; + struct hash_index *hind; + size_t keylen; + int rc = -EIO; + + ENTRY; + + if (obj->oo_destroyed) + RETURN(-ENOENT); + + hind = &MEMFS_I(obj->oo_inode)->mei_hash_index; + keylen = hind->hi_htbl_params.key_len; + + down_read(&obj->oo_sem); + list_for_each_entry(entry, &hind->hi_list, he_list_item) { + if (memcmp(key, entry->he_buf, keylen) == 0) { + it->hit_cursor = &entry->he_list_item; + rc = 0; + break; + } + } + up_read(&obj->oo_sem); + + RETURN(rc); +} + +/* TODO: remove and make fp optional. 
*/ +static void osd_hash_index_it_put(const struct lu_env *env, struct dt_it *di) +{ +} + +static int osd_hash_index_it_next(const struct lu_env *env, struct dt_it *di) +{ + struct osd_hash_it *it = (struct osd_hash_it *)di; + struct osd_object *obj = it->hit_obj; + struct hash_index *hind; + int rc = 0; + + ENTRY; + + if (obj->oo_destroyed) + RETURN(-ENOENT); + + hind = &MEMFS_I(obj->oo_inode)->mei_hash_index; + down_read(&obj->oo_sem); + it->hit_cursor = it->hit_cursor->next; + if (it->hit_cursor == &hind->hi_list) + rc = 1; + up_read(&obj->oo_sem); + RETURN(rc); +} + +static struct dt_key *osd_hash_index_it_key(const struct lu_env *env, + const struct dt_it *di) +{ + struct osd_hash_it *it = (struct osd_hash_it *)di; + struct osd_object *obj = it->hit_obj; + struct hash_index_entry *entry; + + ENTRY; + + if (obj->oo_destroyed) + RETURN(ERR_PTR(-ENOENT)); + + entry = container_of(it->hit_cursor, struct hash_index_entry, + he_list_item); + RETURN((struct dt_key *)entry->he_buf); +} + +static int osd_hash_index_it_key_size(const struct lu_env *env, + const struct dt_it *di) +{ + struct osd_hash_it *it = (struct osd_hash_it *)di; + struct osd_object *obj = it->hit_obj; + + RETURN(MEMFS_I(obj->oo_inode)->mei_hash_index.hi_htbl_params.key_len); +} + +static int osd_hash_index_it_rec(const struct lu_env *env, + const struct dt_it *di, struct dt_rec *rec, + __u32 attr) +{ + struct osd_hash_it *it = (struct osd_hash_it *)di; + struct osd_object *obj = it->hit_obj; + struct hash_index_entry *entry; + struct hash_index *hind; + size_t reclen; + + ENTRY; + + hind = &MEMFS_I(obj->oo_inode)->mei_hash_index; + /* FIXME: use RCU to avoid concurrent operations on the list. 
*/ + entry = container_of(it->hit_cursor, struct hash_index_entry, + he_list_item); + reclen = entry->he_len - sizeof(*entry) - entry->he_keylen; + LASSERT(ergo(hind->hi_reclen, hind->hi_reclen == reclen)); + memcpy(rec, entry->he_buf + entry->he_keylen, reclen); + RETURN(0); +} + +static int osd_hash_index_it_rec_size(const struct lu_env *env, + const struct dt_it *di, __u32 attr) +{ + struct osd_hash_it *it = (struct osd_hash_it *)di; + struct osd_object *obj = it->hit_obj; + struct hash_index_entry *entry; + struct hash_index *hind; + size_t reclen; + + ENTRY; + + hind = &MEMFS_I(obj->oo_inode)->mei_hash_index; + if (hind->hi_reclen == 0) { + entry = container_of(it->hit_cursor, struct hash_index_entry, + he_list_item); + reclen = entry->he_len - sizeof(*entry) - entry->he_keylen; + } else { + reclen = hind->hi_reclen; + } + + RETURN(reclen); +} + +static __u64 osd_hash_index_it_store(const struct lu_env *env, + const struct dt_it *di) +{ + struct osd_hash_it *it = (struct osd_hash_it *)di; + struct hash_index_entry *entry; + + ENTRY; + + entry = container_of(it->hit_cursor, struct hash_index_entry, + he_list_item); + RETURN(entry->he_offset); +} + +static int osd_hash_index_it_load(const struct lu_env *env, + const struct dt_it *di, __u64 hash) +{ + struct osd_hash_it *it = (struct osd_hash_it *)di; + struct osd_object *obj = it->hit_obj; + struct hash_index_entry *entry; + struct hash_index *hind; + int rc = 1; + + ENTRY; + + hind = &MEMFS_I(obj->oo_inode)->mei_hash_index; + if (hash == 0) { + it->hit_cursor = &hind->hi_list; + it->hit_cursor = it->hit_cursor->next; + if (it->hit_cursor == &hind->hi_list) + rc = 0; + + RETURN(rc); + } + + /* TODO: A linear scan is not efficient, will use Maple Tree instead. 
*/ + list_for_each_entry(entry, &hind->hi_list, he_list_item) { + if (entry->he_offset == hash) { + it->hit_cursor = &entry->he_list_item; + rc = 1; + break; + } + } + + RETURN(rc); +} + +const struct dt_index_operations osd_hash_index_ops = { + .dio_lookup = osd_hash_index_lookup, + .dio_insert = osd_hash_index_insert, + .dio_delete = osd_hash_index_delete, + .dio_it = { + .init = osd_hash_index_it_init, + .fini = osd_hash_index_it_fini, + .get = osd_hash_index_it_get, + .put = osd_hash_index_it_put, + .next = osd_hash_index_it_next, + .key = osd_hash_index_it_key, + .key_size = osd_hash_index_it_key_size, + .rec = osd_hash_index_it_rec, + .rec_size = osd_hash_index_it_rec_size, + .store = osd_hash_index_it_store, + .load = osd_hash_index_it_load + } +}; diff --git a/lustre/osd-wbcfs/osd_internal.h b/lustre/osd-wbcfs/osd_internal.h new file mode 100644 index 0000000..5a8cff7 --- /dev/null +++ b/lustre/osd-wbcfs/osd_internal.h @@ -0,0 +1,254 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Copyright (c) 2025-2026, DDN/Whamcloud, Inc. + */ + +/* + * Author: Yingjin Qian + */ + +#ifndef _OSD_INTERNAL_H +#define _OSD_INTERNAL_H + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +struct osd_object { + struct dt_object oo_dt; + /* + * Inode in the memory FS for file system object represented by this + * osd_object. This inode is pinned for the whole duration of the file + * life. + */ + struct inode *oo_inode; + /* Used to implement osd_{read|write}_{lock|unlock}. */ + struct rw_semaphore oo_sem; + /* protects inode attributes. */ + spinlock_t oo_guard; + /* the i_flags in LMA */ + __u32 oo_lma_flags; + __u32 oo_destroyed:1; + struct lu_object_header *oo_header; +}; + +struct osd_device { + /* Super-class */ + struct dt_device od_dt_dev; + /* Information about underlying memory file system */ + struct vfsmount *od_mnt; + /* Service name associated with the OSD device. 
*/ + char od_svname[MAX_OBD_NAME]; + char od_mntdev[MAX_OBD_NAME]; + int od_index; + atomic_t od_connects; + struct lu_site od_site; + /* + * Enable to write back the data in the memory FS into the + * persistent storage. + */ + unsigned int od_writeback_enabled:1; + unsigned int od_is_ost:1; +}; + +struct osd_thandle { + struct thandle ot_super; + struct list_head ot_commit_dcb_list; + struct list_head ot_stop_dcb_list; +}; + +struct osd_it_dirent { + struct lu_fid oitd_fid; + __u64 oitd_ino; + __u64 oitd_off; + unsigned short oitd_namelen; + unsigned int oitd_type; + char oitd_name[]; +} __attribute__((packed)); + +/* + * As @osd_it_dirent (in memory dirent struct for osd) is greater + * than lu_dirent struct. osd readdir reads less number of dirent than + * required for mdd dir page. so buffer size need to be increased so that + * there would be one MemFS readdir for every mdd readdir page. + */ + +#define OSD_IT_BUFSIZE (PAGE_SIZE + PAGE_SIZE/4) + +struct osd_it { + struct osd_object *oit_obj; + struct file oit_file; + /* How many entries have been read-cached from storage */ + int oit_rd_dirent; + /* Current entry is being iterated by caller */ + int oit_it_dirent; + /* Current processing entry */ + struct osd_it_dirent *oit_dirent; + /* Buffer to hold entries, size == OSD_IT_BUFSIZE */ + void *oit_buf; +}; + +extern atomic_t descriptors_cnt; +extern unsigned int wbcfs_flush_descriptors_cnt; +extern struct work_struct flush_fput; +#define osd_alloc_file_pseudo(inode, mnt, name, flags, fops) \ +({ \ + struct file *__f; \ + int __descriptors_cnt; \ + __f = alloc_file_pseudo(inode, mnt, name, flags, fops); \ + __descriptors_cnt = atomic_inc_return(&descriptors_cnt); \ + if (unlikely(__descriptors_cnt >= wbcfs_flush_descriptors_cnt)) {\ + /* drop here to skip queue_work */ \ + atomic_set(&descriptors_cnt, 0); \ + queue_work(system_long_wq, &flush_fput); \ + } \ + __f; \ +}) + +/* Slab to allocate osd_it */ +extern struct kmem_cache *osd_it_cachep; + +struct 
osd_hash_it { + struct list_head *hit_cursor; + struct osd_object *hit_obj; +}; + +extern struct kmem_cache *osd_hash_it_cachep; + +extern const struct dt_body_operations osd_body_ops; +extern const struct dt_object_operations osd_obj_ops; +extern const struct lu_object_operations osd_lu_obj_ops; +extern const struct lu_device_operations osd_lu_ops; +extern const struct dt_index_operations osd_dir_ops; +extern const struct dt_index_operations osd_hash_index_ops; + +static inline int lu_device_is_osd(const struct lu_device *d) +{ + return ergo(d != NULL && d->ld_ops != NULL, d->ld_ops == &osd_lu_ops); +} + +static inline struct osd_device *osd_dt_dev(const struct dt_device *d) +{ + LASSERT(lu_device_is_osd(&d->dd_lu_dev)); + return container_of(d, struct osd_device, od_dt_dev); +} + +static inline struct osd_device *osd_dev(const struct lu_device *d) +{ + LASSERT(lu_device_is_osd(d)); + return osd_dt_dev(container_of(d, struct dt_device, dd_lu_dev)); +} + +static inline struct osd_device *osd_obj2dev(const struct osd_object *o) +{ + return osd_dev(o->oo_dt.do_lu.lo_dev); +} + +static inline struct super_block *osd_sb(const struct osd_device *dev) +{ + if (!dev->od_mnt) + return NULL; + + return dev->od_mnt->mnt_sb; +} + +static inline char *osd_name(struct osd_device *osd) +{ + return osd->od_svname; +} + +static inline struct lu_device *osd2lu_dev(struct osd_device *osd) +{ + return &osd->od_dt_dev.dd_lu_dev; +} + +static inline struct osd_object *osd_obj(const struct lu_object *o) +{ + LASSERT(lu_device_is_osd(o->lo_dev)); + return container_of(o, struct osd_object, oo_dt.do_lu); +} + +/* + * Put the osd object once done with it. 
+ * + * \param obj osd object that needs to be put + */ +static inline void osd_object_put(const struct lu_env *env, + struct osd_object *obj) +{ + dt_object_put(env, &obj->oo_dt); +} + +static inline struct osd_object *osd_dt_obj(const struct dt_object *d) +{ + return osd_obj(&d->do_lu); +} + +#if defined HAVE_INODE_TIMESPEC64 || defined HAVE_INODE_GET_MTIME_SEC +#define osd_timespec timespec64 +#else +#define osd_timespec timespec +#endif + +static inline struct osd_timespec osd_inode_time(struct inode *inode, + s64 seconds) +{ + struct osd_timespec ts = { .tv_sec = seconds }; + + return ts; +} + +#ifdef HAVE_FILLDIR_USE_CTX_RETURN_BOOL +#define WRAP_FILLDIR_FN(prefix, fill_fn) \ +static bool fill_fn(struct dir_context *buf, const char *name, int namelen, \ + loff_t offset, __u64 ino, unsigned int d_type) \ +{ \ + return !prefix##fill_fn(buf, name, namelen, offset, ino, d_type); \ +} +#elif defined(HAVE_FILLDIR_USE_CTX) +#define WRAP_FILLDIR_FN(prefix, fill_fn) \ +static int fill_fn(struct dir_context *buf, const char *name, int namelen, \ + loff_t offset, __u64 ino, unsigned int d_type) \ +{ \ + return prefix##fill_fn(buf, name, namelen, offset, ino, d_type); \ +} +#else +#define WRAP_FILLDIR_FN(prefix, fill_fn) +#endif + +/* + * Build inode number from passed @fid. + * + * For 32-bit systems or syscalls limit the inode number to a 32-bit value + * to avoid EOVERFLOW errors. This will inevitably result in inode number + * collisions, but fid_flatten32() tries hard to avoid this if possible. + */ +static inline __u64 lu_fid_build_ino(const struct lu_fid *fid, int api32) +{ + if (BITS_PER_LONG == 32 || api32) + RETURN(fid_flatten32(fid)); + + RETURN(fid_flatten64(fid)); +} + +/* + * Build inode generation from passed @fid. If our FID overflows the 32-bit + * inode number then return a non-zero generation to distinguish them. 
+ */ +static inline __u32 lu_fid_build_gen(const struct lu_fid *fid) +{ + if (fid_is_igif(fid)) + RETURN(lu_igif_gen(fid)); + + RETURN(fid_flatten64(fid) >> 32); +} + +#endif /* _OSD_INTERNAL_H */ diff --git a/lustre/osd-wbcfs/osd_io.c b/lustre/osd-wbcfs/osd_io.c new file mode 100644 index 0000000..5413647 --- /dev/null +++ b/lustre/osd-wbcfs/osd_io.c @@ -0,0 +1,438 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (c) 2025-2026, DDN/Whamcloud, Inc. + */ + +/* + * Author: Yingjin Qian + */ + +#define DEBUG_SUBSYSTEM S_OSD + +#include +#include + +#include +#include + +#include "osd_internal.h" + +/* Copied from osd-ldiskfs */ +static int osd_map_remote_to_local(loff_t offset, ssize_t len, int *nrpages, + struct niobuf_local *lnb, int maxlnb) +{ + int rc = 0; + + ENTRY; + + *nrpages = 0; + + while (len > 0) { + int poff = offset & (PAGE_SIZE - 1); + int plen = PAGE_SIZE - poff; + + if (*nrpages >= maxlnb) { + rc = -EOVERFLOW; + break; + } + + if (plen > len) + plen = len; + lnb->lnb_file_offset = offset; + lnb->lnb_page_offset = poff; + lnb->lnb_len = plen; + lnb->lnb_flags = 0; + lnb->lnb_page = NULL; + lnb->lnb_rc = 0; + lnb->lnb_guard_rpc = 0; + lnb->lnb_guard_disk = 0; + lnb->lnb_locked = 0; + lnb->lnb_hole = 0; + + LASSERTF(plen <= len, "plen %u, len %lld\n", plen, + (long long) len); + offset += plen; + len -= plen; + lnb++; + (*nrpages)++; + } + + RETURN(rc); +} + +static int osd_get_page(const struct lu_env *env, struct dt_object *dt, + struct niobuf_local *lnb, gfp_t gfp_mask, bool write) +{ + struct inode *inode = osd_dt_obj(dt)->oo_inode; + struct page *page; + pgoff_t index; + + LASSERT(inode); + index = lnb->lnb_file_offset >> PAGE_SHIFT; + if (write) { + page = find_or_create_page(inode->i_mapping, index, gfp_mask); + if (page == NULL) + return -ENOMEM; + + LASSERT(!PagePrivate2(page)); + } else { + /* + * Specially handling for hole in the memory FS during read. 
+ * It does not allocate pages for holes, just records them and + * free them after reading. + * Otherwise, reading on a large sparse file may hit OOM. + */ + page = find_lock_page(inode->i_mapping, index); + /* fallocated page? */ + if (page && !PageUptodate(page)) { + unlock_page(page); + put_page(page); + page = NULL; + } + + if (page == NULL) { + page = alloc_page(gfp_mask); + if (!page) + return -ENOMEM; + + SetPagePrivate2(page); + lock_page(page); + ClearPageUptodate(page); + page->index = index; + lnb->lnb_hole = 1; + } + } + + lnb->lnb_page = page; + lnb->lnb_locked = 1; + if (!lnb->lnb_hole) + mark_page_accessed(page); + + return 0; +} + +/* + * Unlock and release pages loaded by @osd_bufs_get(). + * + * Unlock \a npages pages from \a lnb and drop the refcount on them. + */ +static int osd_bufs_put(const struct lu_env *env, struct dt_object *dt, + struct niobuf_local *lnb, int npages) +{ + struct folio_batch fbatch; + int i; + + ll_folio_batch_init(&fbatch, 0); + for (i = 0; i < npages; i++) { + struct page *page = lnb[i].lnb_page; + + if (page == NULL) + continue; + + /* If the page is not cached in the memory FS, then free it. */ + if (PagePrivate2(page)) { + LASSERT(lnb[i].lnb_hole); + LASSERT(PageLocked(page)); + ClearPagePrivate2(page); + unlock_page(page); + __free_page(page); + } else { + if (lnb[i].lnb_locked) + unlock_page(page); + if (folio_batch_add_page(&fbatch, page) == 0) + folio_batch_release(&fbatch); + } + + lnb[i].lnb_page = NULL; + } + + folio_batch_release(&fbatch); + return 0; +} + +/** + * osd_bufs_get() - Load and lock pages undergoing IO + * @env: thread execution environment + * @dt: dt object undergoing IO (OSD object + methods) + * @pos: byte offset of IO start + * @len: number of bytes of IO + * @lnb: array of extents undergoing IO + * @maxlnb: maximum lnb + * @rw: read or write operation, and other flags + * + * Pages as described in the \a lnb array are fetched (from disk or cache) + * and locked for IO by the caller. 
+ * + * Returns: + * %pages - (zero or more) loaded successfully + * %-ENOMEM - on memory/page allocation error + */ +static int osd_bufs_get(const struct lu_env *env, struct dt_object *dt, + loff_t pos, ssize_t len, struct niobuf_local *lnb, + int maxlnb, enum dt_bufs_type rw) +{ + struct osd_object *obj = osd_dt_obj(dt); + gfp_t gfp_mask; + int npages; + int rc; + int i; + + LASSERT(obj->oo_inode); + + if (unlikely(obj->oo_destroyed)) + RETURN(-ENOENT); + + rc = osd_map_remote_to_local(pos, len, &npages, lnb, maxlnb); + if (rc) + RETURN(rc); + + /* this could also try less hard for DT_BUFS_TYPE_READAHEAD pages */ + gfp_mask = rw & DT_BUFS_TYPE_LOCAL ? (GFP_NOFS | __GFP_HIGHMEM) : + GFP_HIGHUSER; + for (i = 0; i < npages; i++, lnb++) { + rc = osd_get_page(env, dt, lnb, gfp_mask, + rw & DT_BUFS_TYPE_WRITE); + if (rc) + GOTO(cleanup, rc); + } + + RETURN(i); + +cleanup: + if (i > 0) + osd_bufs_put(env, dt, lnb - i, i); + return rc; +} + +static ssize_t osd_read(const struct lu_env *env, struct dt_object *dt, + struct lu_buf *buf, loff_t *pos) +{ + struct osd_object *obj = osd_dt_obj(dt); + struct osd_device *dev = osd_obj2dev(obj); + struct inode *inode = obj->oo_inode; + struct file *file; + ssize_t result; + + ENTRY; + + /* TODO: Specially handling for symlink. */ + if (S_ISLNK(dt->do_lu.lo_header->loh_attr)) + RETURN(-EOPNOTSUPP); + + file = osd_alloc_file_pseudo(inode, dev->od_mnt, "/", + O_NOATIME | O_RDONLY, inode->i_fop); + if (IS_ERR(file)) + RETURN(PTR_ERR(file)); + + result = cfs_kernel_read(file, buf->lb_buf, buf->lb_len, pos); + ihold(inode); + fput(file); + RETURN(result); +} + +static ssize_t osd_write(const struct lu_env *env, struct dt_object *dt, + const struct lu_buf *buf, loff_t *pos, + struct thandle *th) +{ + struct osd_object *obj = osd_dt_obj(dt); + struct osd_device *dev = osd_obj2dev(obj); + struct inode *inode = obj->oo_inode; + struct file *file; + ssize_t result; + + ENTRY; + + /* TODO: Specially handling for symlink. 
*/
+	if (S_ISLNK(dt->do_lu.lo_header->loh_attr))
+		RETURN(-EOPNOTSUPP);
+
+	file = osd_alloc_file_pseudo(inode, dev->od_mnt, "/",
+				     O_NOATIME | O_WRONLY, inode->i_fop);
+	if (IS_ERR(file))
+		RETURN(PTR_ERR(file));
+
+	result = cfs_kernel_write(file, buf->lb_buf, buf->lb_len, pos);
+	/*
+	 * NOTE(review): ihold() is called after fput(). fput() releases the
+	 * inode reference held by the pseudo file, so if that was the last
+	 * reference the inode may already be freed when ihold() runs. It
+	 * looks like the hold should be taken before fput() (or before
+	 * osd_alloc_file_pseudo()) — TODO confirm against VFS refcounting.
+	 */
+	ihold(inode);
+	fput(file);
+	RETURN(result);
+}
+
+/* Can we move all osd_read_prep() code into osd_bufs_get()? */
+static int osd_read_prep(const struct lu_env *env, struct dt_object *dt,
+			 struct niobuf_local *lnb, int npages)
+{
+	struct inode *inode = osd_dt_obj(dt)->oo_inode;
+	loff_t isize;
+	int i;
+
+	ENTRY;
+
+	LASSERT(inode);
+	isize = i_size_read(inode);
+
+	for (i = 0; i < npages; i++) {
+		/*
+		 * If there is no more data, abort early.
+		 * lnb->lnb_rc == 0, so it is easy to detect later.
+		 */
+		if (isize <= lnb[i].lnb_file_offset)
+			break;
+
+		/*
+		 * Instead of looking if we go beyond isize, send complete
+		 * pages all the time.
+		 */
+		lnb[i].lnb_rc = lnb[i].lnb_len;
+		if (lnb[i].lnb_hole) {
+			void *kaddr;
+
+			LASSERT(PagePrivate2(lnb[i].lnb_page));
+			kaddr = kmap(lnb[i].lnb_page);
+			memset(kaddr, 0, PAGE_SIZE);
+			kunmap(lnb[i].lnb_page);
+			SetPageUptodate(lnb[i].lnb_page);
+		} else {
+			/*
+			 * The page in cache for MemFS should be always
+			 * in uptodate state.
+			 */
+			LASSERT(PageUptodate(lnb[i].lnb_page));
+			unlock_page(lnb[i].lnb_page);
+			/*
+			 * No need to unlock in osd_bufs_put(). The sooner page
+			 * is unlocked, the earlier another client can access
+			 * it.
+ */ + lnb[i].lnb_locked = 0; + } + } + + RETURN(0); +} + +static int osd_write_prep(const struct lu_env *env, struct dt_object *dt, + struct niobuf_local *lnb, int npages) +{ + struct inode *inode = osd_dt_obj(dt)->oo_inode; + ssize_t isize; + __s64 maxidx; + int i; + + ENTRY; + + LASSERT(inode); + + isize = i_size_read(inode); + maxidx = ((isize + PAGE_SIZE - 1) >> PAGE_SHIFT) - 1; + for (i = 0; i < npages; i++) { + /* + * Till commit the content of the page is undefined + * we will set it uptodate once bulk is done. Otherwise + * subsequent reads can access non-stable data. + */ + ClearPageUptodate(lnb[i].lnb_page); + + if (lnb[i].lnb_len == PAGE_SIZE) + continue; + + if (maxidx < lnb[i].lnb_page->index) { + long off; + char *p = kmap(lnb[i].lnb_page); + + off = lnb[i].lnb_page_offset; + if (off) + memset(p, 0, off); + off = (lnb[i].lnb_page_offset + lnb[i].lnb_len) & + ~PAGE_MASK; + if (off) + memset(p + off, 0, PAGE_SIZE - off); + kunmap(lnb[i].lnb_page); + } + } + + RETURN(0); +} + + +static int osd_write_commit(const struct lu_env *env, struct dt_object *dt, + struct niobuf_local *lnb, int npages, + struct thandle *th, __u64 user_size) +{ + struct inode *inode = osd_dt_obj(dt)->oo_inode; + struct address_space *mapping = inode->i_mapping; + size_t isize; + int i; + + ENTRY; + + LASSERT(inode); + + for (i = 0; i < npages; i++) { + if (lnb[i].lnb_rc) { /* ENOSPC, network RPC error, etc. */ + LASSERT(lnb[i].lnb_page); + generic_error_remove_folio(inode->i_mapping, + page_folio(lnb[i].lnb_page)); + continue; + } + + /* + * TODO: @lnb array is a sorted array according to the file + * offset, thus it just needs to check the last @lnb for + * file size. 
*/
+		if (user_size < lnb[i].lnb_file_offset + lnb[i].lnb_len)
+			user_size = lnb[i].lnb_file_offset + lnb[i].lnb_len;
+
+		LASSERT(PageLocked(lnb[i].lnb_page));
+		LASSERT(!PageWriteback(lnb[i].lnb_page));
+		/* LASSERT(!PageDirty(lnb[i].lnb_page)); */
+
+		SetPageUptodate(lnb[i].lnb_page);
+#ifdef HAVE_DIRTY_FOLIO
+		mapping->a_ops->dirty_folio(mapping,
+					    page_folio(lnb[i].lnb_page));
+#else
+		mapping->a_ops->set_page_dirty(lnb[i].lnb_page);
+#endif
+	}
+
+	spin_lock(&inode->i_lock);
+	isize = i_size_read(inode);
+	if (isize < user_size)
+		i_size_write(inode, user_size);
+	spin_unlock(&inode->i_lock);
+
+	CDEBUG(D_INFO, "Size after write: i_size=%lld user_size=%llu\n",
+	       i_size_read(inode), user_size);
+	/* No transno is needed for in-memory FS. */
+	th->th_local = 1;
+	RETURN(0);
+}
+
+/* TODO: Implement punch operation. */
+static int osd_punch(const struct lu_env *env, struct dt_object *dt,
+		     __u64 start, __u64 end, struct thandle *th)
+{
+	RETURN(0);
+}
+
+/* TODO: Implement lseek operation. */
+static loff_t osd_lseek(const struct lu_env *env, struct dt_object *dt,
+			loff_t offset, int whence)
+{
+	RETURN(0);
+}
+
+const struct dt_body_operations osd_body_ops = {
+	.dbo_read = osd_read,
+	.dbo_write = osd_write,
+	.dbo_bufs_get = osd_bufs_get,
+	.dbo_bufs_put = osd_bufs_put,
+	.dbo_write_prep = osd_write_prep,
+	.dbo_write_commit = osd_write_commit,
+	.dbo_read_prep = osd_read_prep,
+	.dbo_punch = osd_punch,
+	.dbo_lseek = osd_lseek,
+};
+
diff --git a/lustre/osd-wbcfs/osd_object.c b/lustre/osd-wbcfs/osd_object.c
new file mode 100644
index 0000000..4856f28
--- /dev/null
+++ b/lustre/osd-wbcfs/osd_object.c
@@ -0,0 +1,848 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (c) 2025-2026, DDN/Whamcloud, Inc.
+ */
+
+/*
+ * Author: Yingjin Qian
+ */
+
+#define DEBUG_SUBSYSTEM S_OSD
+
+#include
+
+#include
+
+#include "osd_internal.h"
+#include "wbcfs.h"
+
+/* Concurrency: no external locking is necessary.
*/ +static int osd_index_try(const struct lu_env *env, struct dt_object *dt, + const struct dt_index_features *feat) +{ + int rc; + + if (likely(feat == &dt_directory_features)) { + dt->do_index_ops = &osd_dir_ops; + rc = 0; + } else if (unlikely(feat == &dt_acct_features)) { + /* TODO: Add quota support. */ + rc = -ENOTSUPP; + } else if (unlikely(feat == &dt_otable_features)) { + /* TODO: Add scrub support. */ + dt->do_index_ops = &osd_hash_index_ops; + rc = 0; + } else { + dt->do_index_ops = &osd_hash_index_ops; + rc = 0; + } + + return rc; +} + +static int osd_otable_it_attr_get(const struct lu_env *env, + struct dt_object *dt, + struct lu_attr *attr) +{ + attr->la_valid = 0; + return 0; +} + +static const struct dt_object_operations osd_obj_otable_it_ops = { + .do_attr_get = osd_otable_it_attr_get, + .do_index_try = osd_index_try, +}; + +static void __osd_object_init(struct osd_object *obj) +{ + LASSERT(obj->oo_inode != NULL); + obj->oo_dt.do_body_ops = &osd_body_ops; + obj->oo_dt.do_lu.lo_header->loh_attr |= + (LOHA_EXISTS | (obj->oo_inode->i_mode & S_IFMT)); +} + +/* + * Concurrency: No concurrent access is possible that early in object + * life cycle. + */ +static int osd_object_init(const struct lu_env *env, struct lu_object *l, + const struct lu_object_conf *conf) +{ + struct osd_object *obj = osd_obj(l); + struct osd_device *osd = osd_obj2dev(obj); + const struct lu_fid *fid = lu_object_fid(l); + struct inode *inode = NULL; + __u64 hash; + + if (fid_is_otable_it(&l->lo_header->loh_fid)) { + obj->oo_dt.do_ops = &osd_obj_otable_it_ops; + l->lo_header->loh_attr |= LOHA_EXISTS; + return 0; + } + + hash = lu_fid_build_ino(fid, 0); + inode = ilookup5(osd_sb(osd), hash, memfs_test_inode_by_fid, + (void *)fid); + obj->oo_dt.do_body_ops = &osd_body_ops; + if (inode) { + obj->oo_inode = inode; + __osd_object_init(obj); + + /* + * TODO: check LMA EA and convert LMAI flags to lustre + * LMA flags and cache it in object. 
+ */ + } + + CDEBUG(D_INODE, "%s: object init for fid="DFID" inode@%pK nlink=%d\n", + osd_name(osd), PFID(fid), inode, inode ? inode->i_nlink : 0); + + return 0; +} + +static void osd_object_free(const struct lu_env *env, struct lu_object *l) +{ + struct osd_object *obj = osd_obj(l); + struct lu_object_header *h = obj->oo_header; + + dt_object_fini(&obj->oo_dt); + OBD_FREE_PTR(obj); + if (unlikely(h)) + lu_object_header_free(h); +} + +/* + * Called just before the object is freed. Releases all resources except for + * object itself (that is released by osd_object_free()). + * + * Concurrency: no concurrent access is possible that late in object + * life-cycle. + */ +static void osd_object_delete(const struct lu_env *env, struct lu_object *l) +{ + struct osd_object *obj = osd_obj(l); + struct inode *inode = obj->oo_inode; + + if (!inode) + return; + + obj->oo_inode = NULL; + CDEBUG(D_INODE, + "%s: object "DFID" delete: inode@%pK nlink=%u count=%d\n", + osd_name(osd_obj2dev(obj)), PFID(lu_object_fid(l)), + inode, inode->i_nlink, atomic_read(&inode->i_count)); + iput(inode); +} + +/* Concurrency: ->loo_object_release() is called under site spin-lock. */ +static void osd_object_release(const struct lu_env *env, struct lu_object *l) +{ + struct osd_object *o = osd_obj(l); + + /* + * Nobody should be releasing a non-destroyed object with nlink=0 + * the API allows this, but wbcfs does not like and then report + * this inode as deleted. + */ + if (o->oo_destroyed == 0 && o->oo_inode && o->oo_inode->i_nlink == 0) + CERROR("%s: Object "DFID" wrong: %d inode@%pK nlink=%u\n", + osd_name(osd_obj2dev(o)), PFID(lu_object_fid(l)), + o->oo_destroyed, o->oo_inode, + o->oo_inode ? 
o->oo_inode->i_nlink : 0); + + LASSERT(!(o->oo_destroyed == 0 && o->oo_inode && + o->oo_inode->i_nlink == 0)); +} + +static int osd_object_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *l) +{ + struct osd_object *o = osd_obj(l); + + return (*p)(env, cookie, + LUSTRE_OSD_WBCFS_NAME"-object@%p(i:%p:%lu/%u)", + o, o->oo_inode, + o->oo_inode ? o->oo_inode->i_ino : 0UL, + o->oo_inode ? o->oo_inode->i_generation : 0); +} + +static void osd_inode_getattr(const struct lu_env *env, + struct inode *inode, struct lu_attr *attr) +{ + attr->la_valid |= LA_ATIME | LA_MTIME | LA_CTIME | LA_MODE | + LA_SIZE | LA_BLOCKS | LA_UID | LA_GID | + LA_PROJID | LA_FLAGS | LA_NLINK | LA_RDEV | + LA_BLKSIZE | LA_TYPE | LA_BTIME; + + attr->la_atime = inode_get_atime_sec(inode); + attr->la_mtime = inode_get_mtime_sec(inode); + attr->la_ctime = inode_get_ctime_sec(inode); + attr->la_btime = memfs_get_btime(inode); + attr->la_mode = inode->i_mode; + attr->la_size = i_size_read(inode); + attr->la_blocks = inode->i_blocks; + attr->la_uid = i_uid_read(inode); + attr->la_gid = i_gid_read(inode); + attr->la_projid = i_projid_read(inode); + attr->la_flags = ll_inode_to_ext_flags(inode->i_flags); + attr->la_nlink = inode->i_nlink; + attr->la_rdev = inode->i_rdev; + attr->la_blksize = 1 << inode->i_blkbits; + attr->la_blkbits = inode->i_blkbits; + /* + * MemFS did not transfer inherit flags from raw inode + * to inode flags, and MemFS internally test raw inode + * @i_flags directly. Instead of patching ext4, we do it here. 
+ */ + if (memfs_get_flags(inode) & LUSTRE_PROJINHERIT_FL) + attr->la_flags |= LUSTRE_PROJINHERIT_FL; +} + +static int osd_attr_get(const struct lu_env *env, struct dt_object *dt, + struct lu_attr *attr) +{ + struct osd_object *obj = osd_dt_obj(dt); + + if (unlikely(!dt_object_exists(dt))) + return -ENOENT; + if (unlikely(obj->oo_destroyed)) + return -ENOENT; + + LASSERT(!dt_object_remote(dt)); + + spin_lock(&obj->oo_guard); + osd_inode_getattr(env, obj->oo_inode, attr); + if (obj->oo_lma_flags & LUSTRE_ORPHAN_FL) { + attr->la_valid |= LA_FLAGS; + attr->la_flags |= LUSTRE_ORPHAN_FL; + } + if (obj->oo_lma_flags & LUSTRE_ENCRYPT_FL) { + attr->la_valid |= LA_FLAGS; + attr->la_flags |= LUSTRE_ENCRYPT_FL; + } + spin_unlock(&obj->oo_guard); + CDEBUG(D_INFO, "%s: getattr "DFID" inode@%pK nlink=%d\n", + osd_name(osd_obj2dev(obj)), PFID(lu_object_fid(&dt->do_lu)), + obj->oo_inode, obj->oo_inode->i_nlink); + return 0; +} + +static int osd_inode_setattr(const struct lu_env *env, + struct inode *inode, const struct lu_attr *attr) +{ + __u64 bits = attr->la_valid; + + /* Only allow set size for regular file */ + if (!S_ISREG(inode->i_mode)) + bits &= ~(LA_SIZE | LA_BLOCKS); + + if (bits == 0) + return 0; + + if (bits & LA_ATIME) + inode_set_atime_to_ts(inode, + osd_inode_time(inode, attr->la_atime)); + if (bits & LA_CTIME) + inode_set_ctime_to_ts(inode, + osd_inode_time(inode, attr->la_ctime)); + if (bits & LA_MTIME) + inode_set_mtime_to_ts(inode, + osd_inode_time(inode, attr->la_mtime)); + if (bits & LA_SIZE) { + spin_lock(&inode->i_lock); + i_size_write(inode, attr->la_size); + spin_unlock(&inode->i_lock); + } + + /* + * OSD should not change "i_blocks" which is used by quota. + * "i_blocks" should be changed by ldiskfs only. 
*/
+	if (bits & LA_MODE)
+		inode->i_mode = (inode->i_mode & S_IFMT) |
+				(attr->la_mode & ~S_IFMT);
+	if (bits & LA_UID)
+		i_uid_write(inode, attr->la_uid);
+	if (bits & LA_GID)
+		i_gid_write(inode, attr->la_gid);
+	if (bits & LA_PROJID)
+		i_projid_write(inode, attr->la_projid);
+	if (bits & LA_NLINK)
+		set_nlink(inode, attr->la_nlink);
+	if (bits & LA_RDEV)
+		inode->i_rdev = attr->la_rdev;
+
+	if (bits & LA_FLAGS) {
+		/* always keep S_NOCMTIME */
+		inode->i_flags = ll_ext_to_inode_flags(attr->la_flags) |
+				 S_NOCMTIME;
+#if defined(S_ENCRYPTED)
+		/* Always remove S_ENCRYPTED, because ldiskfs must not be
+		 * aware of encryption status. It is just stored into LMA
+		 * so that it can be forwarded to client side.
+		 */
+		inode->i_flags &= ~S_ENCRYPTED;
+#endif
+		/*
+		 * MemFS did not transfer inherit flags from
+		 * @inode->i_flags to raw inode i_flags when writing
+		 * flags, we do it explicitly here.
+		 */
+		if (attr->la_flags & LUSTRE_PROJINHERIT_FL)
+			MEMFS_I(inode)->mei_flags |= LUSTRE_PROJINHERIT_FL;
+		else
+			MEMFS_I(inode)->mei_flags &= ~LUSTRE_PROJINHERIT_FL;
+	}
+	return 0;
+}
+
+static int osd_attr_set(const struct lu_env *env, struct dt_object *dt,
+			const struct lu_attr *attr, struct thandle *handle)
+{
+	struct osd_object *obj = osd_dt_obj(dt);
+	struct inode *inode;
+	int rc;
+
+	if (!dt_object_exists(dt))
+		return -ENOENT;
+
+	LASSERT(!dt_object_remote(dt));
+	inode = obj->oo_inode;
+	spin_lock(&obj->oo_guard);
+	rc = osd_inode_setattr(env, inode, attr);
+	spin_unlock(&obj->oo_guard);
+	if (rc)
+		RETURN(rc);
+
+	/* TODO: extra flags for LUSTRE_LMA_FL_MASKS */
+
+	return 0;
+}
+
+static int osd_mkfile(const struct lu_env *env, struct osd_object *obj,
+		      umode_t mode, struct dt_allocation_hint *hint,
+		      struct thandle *th, struct lu_attr *attr)
+{
+	struct osd_device *osd = osd_obj2dev(obj);
+	struct dt_object *parent = NULL;
+	struct inode *inode;
+	struct iattr iattr = {
+		.ia_valid = ATTR_UID | ATTR_GID |
+			    ATTR_CTIME | ATTR_MTIME | ATTR_ATIME,
+
.ia_ctime.tv_sec = attr->la_ctime, + .ia_mtime.tv_sec = attr->la_mtime, + .ia_atime.tv_sec = attr->la_atime, + .ia_uid = GLOBAL_ROOT_UID, + .ia_gid = GLOBAL_ROOT_GID, + }; + const struct osd_timespec omit = { .tv_nsec = UTIME_OMIT }; + const struct lu_fid *fid = lu_object_fid(&obj->oo_dt.do_lu); + + if (attr->la_valid & LA_UID) + iattr.ia_uid = make_kuid(&init_user_ns, attr->la_uid); + if (attr->la_valid & LA_GID) + iattr.ia_gid = make_kgid(&init_user_ns, attr->la_gid); + + LASSERT(obj->oo_inode == NULL); + + if (hint != NULL && hint->dah_parent != NULL && + !dt_object_remote(hint->dah_parent)) + parent = hint->dah_parent; + + /* if a time component is not valid set it to UTIME_OMIT */ + if (!(attr->la_valid & LA_CTIME)) + iattr.ia_ctime = omit; + if (!(attr->la_valid & LA_MTIME)) + iattr.ia_mtime = omit; + if (!(attr->la_valid & LA_ATIME)) + iattr.ia_atime = omit; + + inode = memfs_create_inode(osd_sb(osd), + parent ? osd_dt_obj(parent)->oo_inode : + osd_sb(osd)->s_root->d_inode, + mode, &iattr, 0, false); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + /* Do not update file c/mtime in MemFS. 
*/ + inode->i_flags |= S_NOCMTIME; + inode->i_ino = lu_fid_build_ino(fid, 0); + inode->i_generation = lu_fid_build_gen(fid); + MEMFS_I(inode)->mei_fid = *fid; + if (unlikely(insert_inode_locked(inode) < 0)) { + CERROR("%s: Failed to insert inode %lu "DFID": doubly allocated?\n", + osd_name(osd), inode->i_ino, PFID(fid)); + iput(inode); + RETURN(-EIO); + } + + CDEBUG(D_INODE, + "%s: create object "DFID": inode@%pK nlink=%d mode=%#o\n", + osd_name(osd), PFID(fid), inode, inode->i_nlink, inode->i_mode); + obj->oo_inode = inode; + RETURN(0); +} + +static int osd_mkdir(const struct lu_env *env, struct osd_object *obj, + struct lu_attr *attr, + struct dt_allocation_hint *hint, + struct dt_object_format *dof, + struct thandle *th) +{ + __u32 mode = (attr->la_mode & (S_IFMT | S_IRWXUGO | S_ISVTX | S_ISGID)); + + LASSERT(S_ISDIR(attr->la_mode)); + + return osd_mkfile(env, obj, mode, hint, th, attr); +} + +static int osd_mk_index(const struct lu_env *env, struct osd_object *obj, + struct lu_attr *attr, + struct dt_allocation_hint *hint, + struct dt_object_format *dof, + struct thandle *th) +{ + __u32 mode = (attr->la_mode & (S_IFMT | S_IALLUGO | S_ISVTX)); + const struct dt_index_features *feat = dof->u.dof_idx.di_feat; + struct memfs_inode_info *mei; + size_t keylen = 0; + size_t reclen = 0; + int rc; + + ENTRY; + + LASSERT(S_ISREG(attr->la_mode)); + + /* Only support index with fixed key length. 
*/ + if (feat->dif_flags & DT_IND_VARKEY) + RETURN(-EINVAL); + + keylen = feat->dif_keysize_max; + if (!(feat->dif_flags & DT_IND_VARREC)) + reclen = feat->dif_recsize_max; + + rc = osd_mkfile(env, obj, mode, hint, th, attr); + if (rc) + GOTO(out, rc); + + LASSERT(obj->oo_inode != NULL); + mei = MEMFS_I(obj->oo_inode); + mei->mei_index_type = INDEX_TYPE_HASH; + rc = hash_index_init(&mei->mei_hash_index, keylen, reclen); + if (rc) { + CERROR("%s: failed to create index for FID="DFID": rc=%d\n", + osd_name(osd_obj2dev(obj)), + PFID(lu_object_fid(&obj->oo_dt.do_lu)), rc); + /* TODO: cleanup @oo_inode... */ + } +out: + RETURN(rc); +} + +static int osd_mkreg(const struct lu_env *env, struct osd_object *obj, + struct lu_attr *attr, + struct dt_allocation_hint *hint, + struct dt_object_format *dof, + struct thandle *th) +{ + LASSERT(S_ISREG(attr->la_mode)); + return osd_mkfile(env, obj, (attr->la_mode & + (S_IFMT | S_IALLUGO | S_ISVTX)), hint, th, + attr); +} + +static int osd_mksym(const struct lu_env *env, struct osd_object *obj, + struct lu_attr *attr, + struct dt_allocation_hint *hint, + struct dt_object_format *dof, + struct thandle *th) +{ + LASSERT(S_ISLNK(attr->la_mode)); + /* TODO: symlink support. */ + RETURN(-EOPNOTSUPP); +} + +static int osd_mknod(const struct lu_env *env, struct osd_object *obj, + struct lu_attr *attr, + struct dt_allocation_hint *hint, + struct dt_object_format *dof, + struct thandle *th) +{ + umode_t mode = attr->la_mode & (S_IFMT | S_IALLUGO | S_ISVTX); + int result; + + LASSERT(obj->oo_inode == NULL); + LASSERT(S_ISCHR(mode) || S_ISBLK(mode) || + S_ISFIFO(mode) || S_ISSOCK(mode)); + + result = osd_mkfile(env, obj, mode, hint, th, attr); + if (result == 0) { + LASSERT(obj->oo_inode != NULL); + /* + * This inode should be marked dirty for i_rdev. Currently + * that is done in the osd_attr_init(). 
+ */ + init_special_inode(obj->oo_inode, obj->oo_inode->i_mode, + attr->la_rdev); + } + return result; +} + +typedef int (*osd_obj_type_f)(const struct lu_env *env, + struct osd_object *obj, + struct lu_attr *attr, + struct dt_allocation_hint *hint, + struct dt_object_format *dof, + struct thandle *th); + +static osd_obj_type_f osd_create_type_f(enum dt_format_type type) +{ + osd_obj_type_f result; + + switch (type) { + case DFT_DIR: + result = osd_mkdir; + break; + case DFT_REGULAR: + result = osd_mkreg; + break; + case DFT_SYM: + result = osd_mksym; + break; + case DFT_NODE: + result = osd_mknod; + break; + case DFT_INDEX: + result = osd_mk_index; + break; + default: + LBUG(); + break; + } + return result; +} + +static void osd_attr_init(const struct lu_env *env, struct osd_object *obj, + struct lu_attr *attr, struct dt_object_format *dof, + struct thandle *handle) +{ + struct inode *inode = obj->oo_inode; + __u64 valid = attr->la_valid; + int result; + + attr->la_valid &= ~(LA_TYPE | LA_MODE); + + if (dof->dof_type != DFT_NODE) + attr->la_valid &= ~LA_RDEV; + if ((valid & LA_ATIME) && + (attr->la_atime == inode_get_atime_sec(inode))) + attr->la_valid &= ~LA_ATIME; + if ((valid & LA_CTIME) && + (attr->la_ctime == inode_get_ctime_sec(inode))) + attr->la_valid &= ~LA_CTIME; + if ((valid & LA_MTIME) && + (attr->la_mtime == inode_get_mtime_sec(inode))) + attr->la_valid &= ~LA_MTIME; + + /* TODO: Perform quota transfer. */ + + if (attr->la_valid != 0) { + result = osd_inode_setattr(env, inode, attr); + /* + * The osd_inode_setattr() should always succeed here. The + * only error that could be returned is EDQUOT when we are + * trying to change the UID or GID of the inode. However, this + * should not happen since quota enforcement is no longer + * enabled on MemFS (lquota is supported and takes care of it). + */ + LASSERTF(result == 0, "%d\n", result); + } + + attr->la_valid = valid; +} + +/* Helper function for osd_create(). 
*/
+static int __osd_create(const struct lu_env *env, struct osd_object *obj,
+			struct lu_attr *attr, struct dt_allocation_hint *hint,
+			struct dt_object_format *dof, struct thandle *th)
+{
+	int result;
+	__u32 umask;
+
+	/* we drop umask so that permissions we pass are not affected */
+	umask = current->fs->umask;
+	current->fs->umask = 0;
+
+	result = osd_create_type_f(dof->dof_type)(env, obj, attr, hint, dof,
+						  th);
+	if (likely(obj->oo_inode && result == 0)) {
+		LASSERT(obj->oo_inode->i_state & I_NEW);
+
+		/*
+		 * Unlock the inode before attr initialization to avoid
+		 * unnecessary dqget operations. LU-6378
+		 */
+		unlock_new_inode(obj->oo_inode);
+		osd_attr_init(env, obj, attr, dof, th);
+		__osd_object_init(obj);
+	}
+
+	/* restore previous umask value */
+	current->fs->umask = umask;
+
+	return result;
+}
+
+static void osd_ah_init(const struct lu_env *env, struct dt_allocation_hint *ah,
+			struct dt_object *parent, struct dt_object *child,
+			umode_t child_mode)
+{
+	LASSERT(ah);
+
+	ah->dah_parent = parent;
+}
+
+/* OSD layer object creation function for OST objects. */
+static int osd_create(const struct lu_env *env, struct dt_object *dt,
+		      struct lu_attr *attr, struct dt_allocation_hint *hint,
+		      struct dt_object_format *dof, struct thandle *th)
+{
+	const struct lu_fid *fid = lu_object_fid(&dt->do_lu);
+	struct osd_object *obj = osd_dt_obj(dt);
+	int rc;
+
+	ENTRY;
+
+	if (dt_object_exists(dt))
+		RETURN(-EEXIST);
+
+	LASSERT(!dt_object_remote(dt));
+	LASSERT(dt_write_locked(env, dt));
+
+	/* Quota files cannot be created from the kernel any more */
+	if (unlikely(fid_is_acct(fid)))
+		RETURN(-EPERM);
+
+	rc = __osd_create(env, obj, attr, hint, dof, th);
+	/* TODO: Update LMA EA with @fid.
*/ + LASSERT(ergo(rc == 0, + dt_object_exists(dt) && !dt_object_remote(dt))); + RETURN(rc); +} + +static int osd_destroy(const struct lu_env *env, struct dt_object *dt, + struct thandle *th) +{ + const struct lu_fid *fid = lu_object_fid(&dt->do_lu); + struct osd_object *obj = osd_dt_obj(dt); + struct inode *inode = obj->oo_inode; + struct osd_device *osd = osd_obj2dev(obj); + + ENTRY; + + LASSERT(inode); + LASSERT(!lu_object_is_dying(dt->do_lu.lo_header)); + + if (unlikely(fid_is_acct(fid))) + RETURN(-EPERM); + + /* TODO: Agent entry remvoal... */ + if (S_ISDIR(inode->i_mode)) { + if (inode->i_nlink > 2) + CERROR("%s: dir "DFID" ino %lu nlink %u at unlink.\n", + osd_name(osd), PFID(fid), inode->i_ino, + inode->i_nlink); + + spin_lock(&obj->oo_guard); + clear_nlink(inode); + spin_unlock(&obj->oo_guard); + } + + set_bit(LU_OBJECT_HEARD_BANSHEE, &dt->do_lu.lo_header->loh_flags); + obj->oo_destroyed = 1; + CDEBUG(D_INODE, + "%s: Object "DFID" destroyed: inode@%pK nlink=%d mode=%#o\n", + osd_name(osd), PFID(lu_object_fid(&dt->do_lu)), inode, + inode->i_nlink, inode->i_mode); + + RETURN(0); +} + +/* + * Concurrency: @dt is write locked. + */ +static int osd_ref_add(const struct lu_env *env, struct dt_object *dt, + struct thandle *th) +{ + struct osd_object *obj = osd_dt_obj(dt); + struct inode *inode = obj->oo_inode; + int rc = 0; + + if (!dt_object_exists(dt) || obj->oo_destroyed) + return -ENOENT; + + LASSERT(!dt_object_remote(dt)); + LASSERT(dt_write_locked(env, dt)); + + CDEBUG(D_INODE, "%s:"DFID" increase nlink %d inode@%pK\n", + osd_name(osd_obj2dev(obj)), PFID(lu_object_fid(&dt->do_lu)), + inode->i_nlink, inode); + /* + * The DIR_NLINK feature allows directories to exceed LDISKFS_LINK_MAX + * (65000) subdirectories by storing "1" in i_nlink if the link count + * would otherwise overflow. 
Directory tranversal tools understand + * that (st_nlink == 1) indicates that the filesystem dose not track + * hard links count on the directory, and will not abort subdirectory + * scanning early once (st_nlink - 2) subdirs have been found. + * + * This also has to properly handle the case of inodes with nlink == 0 + * in case they are being linked into the PENDING directory + */ + spin_lock(&obj->oo_guard); + if (unlikely(inode->i_nlink == 0)) + /* inc_nlink from 0 may cause WARN_ON */ + set_nlink(inode, 1); + else + inc_nlink(inode); + spin_unlock(&obj->oo_guard); + + return rc; +} + +/* + * Concurrency: @dt is write locked. + */ +static int osd_ref_del(const struct lu_env *env, struct dt_object *dt, + struct thandle *th) +{ + struct osd_object *obj = osd_dt_obj(dt); + struct inode *inode = obj->oo_inode; + struct osd_device *osd = osd_dev(dt->do_lu.lo_dev); + + if (!dt_object_exists(dt)) + return -ENOENT; + + LASSERT(!dt_object_remote(dt)); + LASSERT(dt_write_locked(env, dt)); + + if (CFS_FAIL_CHECK(OBD_FAIL_OSD_REF_DEL)) + return -EIO; + + spin_lock(&obj->oo_guard); + if (inode->i_nlink == 0) { + CDEBUG_LIMIT(fid_is_norm(lu_object_fid(&dt->do_lu)) ? + D_ERROR : D_INODE, "%s: nlink == 0 on "DFID".\n", + osd_name(osd), PFID(lu_object_fid(&dt->do_lu))); + spin_unlock(&obj->oo_guard); + return 0; + } + + CDEBUG(D_INODE, DFID" decrease nlink %d inode@%pK\n", + PFID(lu_object_fid(&dt->do_lu)), inode->i_nlink, inode); + + if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2) + drop_nlink(inode); + spin_unlock(&obj->oo_guard); + + return 0; +} + +/* Concurrency: @dt is write locked. 
*/ +static int osd_xattr_set(const struct lu_env *env, struct dt_object *dt, + const struct lu_buf *buf, const char *name, int fl, + struct thandle *handle) +{ + struct inode *inode = osd_dt_obj(dt)->oo_inode; + int flags = 0; + int rc; + + ENTRY; + + LASSERT(inode); + LASSERT(buf); + + if (fl & LU_XATTR_REPLACE) + flags |= XATTR_REPLACE; + if (fl & LU_XATTR_CREATE) + flags |= XATTR_CREATE; + + /* FIXME: using VFS i_op->setxattr()? */ + rc = memfs_xattr_set(inode, buf->lb_buf, buf->lb_len, name, flags); + + RETURN(rc); +} + +/* Concurrency: @dt is read locked. */ +static int osd_xattr_get(const struct lu_env *env, struct dt_object *dt, + struct lu_buf *buf, const char *name) +{ + struct inode *inode = osd_dt_obj(dt)->oo_inode; + int rc; + + ENTRY; + LASSERT(buf); + + if (!dt_object_exists(dt)) + RETURN(-ENOENT); + + LASSERT(!dt_object_remote(dt)); + + /* FIXME: using VFS i_op->getxattr()? */ + rc = memfs_xattr_get(inode, buf->lb_buf, buf->lb_len, name); + RETURN(rc); +} + +/* Concurrency: @dt is write locked. */ +static int osd_xattr_del(const struct lu_env *env, struct dt_object *dt, + const char *name, struct thandle *handle) +{ + struct inode *inode = osd_dt_obj(dt)->oo_inode; + + if (!dt_object_exists(dt)) + return -ENOENT; + + LASSERT(!dt_object_remote(dt)); + /* FIXME: using VFS i_op->removexattr() */ + memfs_xattr_del(inode, name); + + return 0; +} + +/* TODO: Implement xattr listing. */ +static int osd_xattr_list(const struct lu_env *env, struct dt_object *dt, + const struct lu_buf *buf) +{ + RETURN(0); +} + +/* MemFS does not support object sync, return zero to ignore the error. 
*/ +static int osd_object_sync(const struct lu_env *env, struct dt_object *dt, + __u64 start, __u64 end) +{ + RETURN(0); +} + +const struct dt_object_operations osd_obj_ops = { + .do_attr_get = osd_attr_get, + .do_attr_set = osd_attr_set, + .do_ah_init = osd_ah_init, + .do_create = osd_create, + .do_destroy = osd_destroy, + .do_index_try = osd_index_try, + .do_ref_add = osd_ref_add, + .do_ref_del = osd_ref_del, + .do_xattr_get = osd_xattr_get, + .do_xattr_set = osd_xattr_set, + .do_xattr_del = osd_xattr_del, + .do_xattr_list = osd_xattr_list, + .do_object_sync = osd_object_sync, +}; + +const struct lu_object_operations osd_lu_obj_ops = { + .loo_object_init = osd_object_init, + .loo_object_delete = osd_object_delete, + .loo_object_release = osd_object_release, + .loo_object_free = osd_object_free, + .loo_object_print = osd_object_print, +}; diff --git a/lustre/osd-wbcfs/wbcfs.c b/lustre/osd-wbcfs/wbcfs.c new file mode 100644 index 0000000..152c296 --- /dev/null +++ b/lustre/osd-wbcfs/wbcfs.c @@ -0,0 +1,1335 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * lustre/osd-wbcfs/osd_wbcfs.c + * + * Author: Yingjin Qian + */ + +#define DEBUG_SUBSYSTEM S_OSD + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef HAVE_FS_CONTEXT_H +#include +#endif + +#include + +#include "wbcfs.h" + +#ifndef HAVE_USER_NAMESPACE_ARG +#define inode_init_owner(ns, inode, dir, mode) \ + inode_init_owner(inode, dir, mode) +#define memfs_mknod(ns, dir, dch, mode, rd) memfs_mknod(dir, dch, mode, rd) +#define memfs_mkdir(ns, dir, dch, mode) memfs_mkdir(dir, dch, mode) +#define memfs_create_nd(ns, dir, de, mode, ex) \ + memfs_create_nd(dir, de, mode, ex) +#endif /* HAVE_USER_NAMESPCE_ARG */ + +/* + * In-memory xattr entry. + * Borrowed from osd-ldiskfs @osd_xattr_entry and @simple_xattrs in Linux + * kernel. This part of codes in-memory XATTRs should put into libcfs module. + * The first part of @mxe_buf is XATTR name, and is '\0' terminated. 
+ * The left part is for value, binary mode. + */ +struct mem_xattr_entry { + struct list_head mxe_list; + size_t mxe_len; + size_t mxe_namelen; + bool mxe_exist; + struct rcu_head mxe_rcu; + char mxe_buf[]; +}; + +static int mem_xattr_get(struct mem_xattrs *xattrs, const char *name, + void *buf, size_t len) +{ + struct mem_xattr_entry *mxe = NULL; + struct mem_xattr_entry *tmp; + size_t namelen = strlen(name); + int rc; + + ENTRY; + + rcu_read_lock(); + list_for_each_entry_rcu(tmp, &xattrs->mex_xattr_list, mxe_list) { + if (namelen == tmp->mxe_namelen && + strncmp(name, tmp->mxe_buf, namelen) == 0) { + mxe = tmp; + break; + } + } + + if (mxe == NULL) + GOTO(out, rc = -ENODATA); + + if (!mxe->mxe_exist) + GOTO(out, rc = -ENODATA); + + /* Value length */ + rc = mxe->mxe_len - sizeof(*mxe) - mxe->mxe_namelen - 1; + LASSERT(rc > 0); + + if (buf == NULL) + GOTO(out, rc); + + if (len < rc) + GOTO(out, rc = -ERANGE); + + memcpy(buf, &mxe->mxe_buf[namelen + 1], rc); +out: + rcu_read_unlock(); + RETURN(rc); +} + +static void mem_xattr_free(struct rcu_head *head) +{ + struct mem_xattr_entry *mxe; + + mxe = container_of(head, struct mem_xattr_entry, mxe_rcu); + OBD_FREE(mxe, mxe->mxe_len); +} + +static int mem_xattr_add(struct mem_xattrs *xattrs, const char *name, + const char *buf, int buflen) +{ + struct mem_xattr_entry *mxe; + struct mem_xattr_entry *old = NULL; + struct mem_xattr_entry *tmp; + size_t namelen = strlen(name); + size_t len = sizeof(*mxe) + namelen + 1 + buflen; + + ENTRY; + + OBD_ALLOC(mxe, len); + if (mxe == NULL) + RETURN(-ENOMEM); + + INIT_LIST_HEAD(&mxe->mxe_list); + mxe->mxe_len = len; + mxe->mxe_namelen = namelen; + memcpy(mxe->mxe_buf, name, namelen); + if (buflen > 0) { + LASSERT(buf != NULL); + memcpy(mxe->mxe_buf + namelen + 1, buf, buflen); + mxe->mxe_exist = true; + } else { + mxe->mxe_exist = false; + } + + /* This should be rarely called, just remove old and add new */ + spin_lock(&xattrs->mex_lock); + list_for_each_entry(tmp, 
&xattrs->mex_xattr_list, mxe_list) { + if (namelen == tmp->mxe_namelen && + strncmp(name, tmp->mxe_buf, namelen) == 0) { + old = tmp; + break; + } + } + if (old != NULL) { + list_replace_rcu(&old->mxe_list, &mxe->mxe_list); + call_rcu(&old->mxe_rcu, mem_xattr_free); + } else { + list_add_tail_rcu(&mxe->mxe_list, &xattrs->mex_xattr_list); + } + spin_unlock(&xattrs->mex_lock); + + RETURN(0); +} + +static void mem_xattr_del(struct mem_xattrs *xattrs, const char *name) +{ + struct mem_xattr_entry *mxe; + size_t namelen = strlen(name); + + spin_lock(&xattrs->mex_lock); + list_for_each_entry(mxe, &xattrs->mex_xattr_list, mxe_list) { + if (namelen == mxe->mxe_namelen && + strncmp(name, mxe->mxe_buf, namelen) == 0) { + list_del_rcu(&mxe->mxe_list); + call_rcu(&mxe->mxe_rcu, mem_xattr_free); + break; + } + } + spin_unlock(&xattrs->mex_lock); +} + +static inline void mem_xattrs_init(struct mem_xattrs *xattrs) +{ + INIT_LIST_HEAD(&xattrs->mex_xattr_list); + spin_lock_init(&xattrs->mex_lock); +} + +static void mem_xattrs_fini(struct mem_xattrs *xattrs) +{ + struct mem_xattr_entry *mxe, *next; + + list_for_each_entry_safe(mxe, next, &xattrs->mex_xattr_list, mxe_list) { + list_del(&mxe->mxe_list); + OBD_FREE(mxe, mxe->mxe_len); + } +} + +int memfs_xattr_get(struct inode *inode, void *buf, size_t len, + const char *name) +{ + return mem_xattr_get(&MEMFS_I(inode)->mei_xattrs, name, buf, len); +} + +int memfs_xattr_set(struct inode *inode, void *buf, size_t len, + const char *name, int flags) +{ + return mem_xattr_add(&MEMFS_I(inode)->mei_xattrs, name, buf, len); +} + +void memfs_xattr_del(struct inode *inode, const char *name) +{ + mem_xattr_del(&MEMFS_I(inode)->mei_xattrs, name); +} + +static const struct super_operations memfs_ops; +static const struct address_space_operations memfs_aops; +static const struct file_operations memfs_file_operations; +static const struct inode_operations memfs_inode_operations; +static const struct file_operations memfs_dir_operations; +static 
const struct inode_operations memfs_dir_inode_operations; +static struct file_system_type memfs_fstype; + +static inline struct memfs_sb_info *MEMFS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static int memfs_reserve_inode(struct super_block *sb) +{ + return 0; +} + +static void memfs_free_inode(struct super_block *sb) +{ +} + +struct inode *memfs_create_inode(struct super_block *sb, struct inode *dir, + umode_t mode, struct iattr *iattr, dev_t dev, + bool update_link) +{ + struct memfs_sb_info *sbinfo = MEMFS_SB(sb); + struct memfs_inode_info *mei; + struct inode *inode; + + ENTRY; + + inode = new_inode(sb); + if (!inode) + RETURN(ERR_PTR(-ENOMEM)); + + if (iattr) { + uid_t owner[2] = { 0, 0 }; + + if (iattr->ia_valid & ATTR_UID) + owner[0] = from_kuid(&init_user_ns, iattr->ia_uid); + if (iattr->ia_valid & ATTR_GID) + owner[1] = from_kgid(&init_user_ns, iattr->ia_gid); + + inode->i_mode = mode; + i_uid_write(inode, owner[0]); + i_gid_write(inode, owner[1]); + } else { + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); + } + + if (iattr) { + if (iattr->ia_valid & ATTR_CTIME) + inode_set_ctime_to_ts(inode, iattr->ia_ctime); + if (iattr->ia_valid & ATTR_MTIME) + inode_set_mtime_to_ts(inode, iattr->ia_mtime); + if (iattr->ia_valid & ATTR_ATIME) + inode_set_atime_to_ts(inode, iattr->ia_atime); + } + + inode->i_blocks = 0; + + mei = MEMFS_I(inode); + mei->mei_crtime = inode_get_mtime(inode); + mem_xattrs_init(&mei->mei_xattrs); + mei->mei_index_type = INDEX_TYPE_NONE; + cache_no_acl(inode); + + if (sbinfo->msi_noswap) + mapping_set_unevictable(inode->i_mapping); + + switch (mode & S_IFMT) { + case S_IFREG: + inode->i_mapping->a_ops = &memfs_aops; + inode->i_op = &memfs_inode_operations; + inode->i_fop = &memfs_file_operations; + break; + case S_IFDIR: + if (update_link) + inc_nlink(inode); + /* Some things misbehave if size == 0 on a directory */ + inode->i_size = 2 * BOGO_DIRENT_SIZE; + inode->i_op = &memfs_dir_inode_operations; + inode->i_fop = 
&memfs_dir_operations; + break; + case S_IFLNK: + break; + default: + CERROR("Unsupport file mode %#o\n", mode); + iput(inode); + /* + * TODO: Add support for other file types. + * Fix the error in sanity/test_28. + */ + RETURN(ERR_PTR(-EOPNOTSUPP)); + } + + return inode; +} + +static int memfs_mknod(struct mnt_idmap *map, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t dev) +{ + struct inode *inode; + + ENTRY; + + inode = memfs_create_inode(dir->i_sb, dir, mode, NULL, dev, true); + if (IS_ERR(inode)) + RETURN(PTR_ERR(inode)); + + dir->i_size += BOGO_DIRENT_SIZE; + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); + d_instantiate(dentry, inode); + dget(dentry); /* Extra count - pin the dentry in core */ + + RETURN(0); +} + +static int memfs_mkdir(struct mnt_idmap *map, struct inode *dir, + struct dentry *dchild, umode_t mode) +{ + int rc; + + rc = memfs_mknod(map, dir, dchild, mode | S_IFDIR, 0); + if (rc) + return rc; + + inc_nlink(dir); + return 0; +} + +static int memfs_create_nd(struct mnt_idmap *map, struct inode *dir, + struct dentry *dentry, umode_t mode, bool want_excl) +{ + return memfs_mknod(map, dir, dentry, mode | S_IFREG, 0); +} + +static int memfs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + + if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) + memfs_free_inode(inode->i_sb); + + dir->i_size -= BOGO_DIRENT_SIZE; + inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, + inode_set_ctime_current(inode))); + inode_inc_iversion(dir); + drop_nlink(inode); + dput(dentry); + return 0; +} + +static int memfs_rmdir(struct inode *dir, struct dentry *dchild) +{ + if (!simple_empty(dchild)) + return -ENOTEMPTY; + + drop_nlink(d_inode(dchild)); + drop_nlink(dir); + return memfs_unlink(dir, dchild); +} + +static int memfs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct inode *inode = d_inode(old_dentry); + + ENTRY; + + /* + * No ordinary (disk based) 
filesystem counts links as inodes; + * but each new link needs a new dentry, pinning lowmem, and + * tmpfs dentries cannot be pruned until they are unlinked. + * But if an O_TMPFILE file is linked into the tmpfs, the + * first link must skip that, to get the accounting right. + */ + if (inode->i_nlink) { + int rc = 0; + + rc = memfs_reserve_inode(inode->i_sb); + if (rc) + RETURN(rc); + } + + dir->i_size += BOGO_DIRENT_SIZE; + inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, + inode_set_ctime_current(inode))); + inode_inc_iversion(dir); + inc_nlink(inode); + ihold(inode); /* New dentry reference */ + dget(dentry); /* Extra pinning count for the created dentry */ + d_instantiate(dentry, inode); + return 0; +} + +#ifdef HAVE_DENTRY_D_CHILDREN +/* parent is locked at least shared */ +/* + * Returns an element of siblings' list. + * We are looking for th positive after
<p>
; if + * found, dentry is grabbed and returned to caller. + * If no such element exists, NULL is returned. + */ +static struct dentry *scan_positives(struct dentry *cursor, + struct hlist_node **p, + loff_t count, + struct dentry *last) +{ + struct dentry *dentry = cursor->d_parent, *found = NULL; + + spin_lock(&dentry->d_lock); + while (*p) { + struct dentry *d = hlist_entry(*p, struct dentry, d_sib); + + p = &d->d_sib.next; + // we must at least skip cursors, to avoid livelocks + if (d->d_flags & DCACHE_DENTRY_CURSOR) + continue; + if (simple_positive(d) && !--count) { + spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); + if (simple_positive(d)) + found = dget_dlock(d); + spin_unlock(&d->d_lock); + if (likely(found)) + break; + count = 1; + } + if (need_resched()) { + if (!hlist_unhashed(&cursor->d_sib)) + __hlist_del(&cursor->d_sib); + hlist_add_behind(&cursor->d_sib, &d->d_sib); + p = &cursor->d_sib.next; + spin_unlock(&dentry->d_lock); + cond_resched(); + spin_lock(&dentry->d_lock); + } + } + spin_unlock(&dentry->d_lock); + dput(last); + return found; +} + +/* + * Directory is locked and all positive dentries in it are safe, since + * for ramfs-type trees they can't go away without unlink() or rmdir(), + * both impossible due to the lock on directory. 
+ */ + +static int memfs_dcache_readdir(struct file *file, struct dir_context *ctx) +{ + struct dentry *dentry = file->f_path.dentry; + struct dentry *cursor = file->private_data; + struct memfs_dir_context *mctx = (struct memfs_dir_context *)ctx; + struct dentry *next = NULL; + struct hlist_node **p; + + if (!dir_emit_dots(file, ctx)) + return 0; + + if (ctx->pos == 2) + p = &dentry->d_children.first; + else + p = &cursor->d_sib.next; + + while ((next = scan_positives(cursor, p, 1, next)) != NULL) { + mctx->dentry = next; + if (!dir_emit(ctx, next->d_name.name, next->d_name.len, + d_inode(next)->i_ino, + fs_umode_to_dtype(d_inode(next)->i_mode))) + break; + ctx->pos++; + p = &next->d_sib.next; + } + spin_lock(&dentry->d_lock); + hlist_del_init(&cursor->d_sib); + if (next) + hlist_add_before(&cursor->d_sib, &next->d_sib); + spin_unlock(&dentry->d_lock); + dput(next); + + return 0; +} + +#else /* !HAVE_DENTRY_D_CHILDREN */ + +/* Relationship between i_mode and the DT_xxx types */ +static inline unsigned char dt_type(struct inode *inode) +{ + return (inode->i_mode >> 12) & 15; +} + +/* + * linux/fs/libfs.c: simple_positive() + * Public in linux/include/linux/dcache.h + * kernel 4.1-rc3 commit dc3f4198eac14e52a98dfc79cd84b45e280f59cd + */ +static inline int __simple_positive(struct dentry *dentry) +{ + return dentry->d_inode && !d_unhashed(dentry); +} + +/* + * Returns an element of siblings' list. + * We are looking for th positive after
<p>
; if + * found, dentry is grabbed and returned to caller. + * If no such element exists, NULL is returned. + */ +/* parent is locked at least shared */ +static struct dentry *scan_positives(struct dentry *cursor, + struct list_head *p, + loff_t count, + struct dentry *last) +{ + struct dentry *dentry = cursor->d_parent, *found = NULL; + + spin_lock(&dentry->d_lock); + while ((p = p->next) != &dentry->d_subdirs) { + struct dentry *d = list_entry(p, struct dentry, d_child); + /* We must at least skip cursors, to avoid livelocks */ + if (d->d_flags & DCACHE_DENTRY_CURSOR) + continue; + if (__simple_positive(d) && !--count) { + spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); + if (__simple_positive(d)) + found = dget_dlock(d); + spin_unlock(&d->d_lock); + if (likely(found)) + break; + count = 1; + } + if (need_resched()) { + list_move(&cursor->d_child, p); + p = &cursor->d_child; + spin_unlock(&dentry->d_lock); + cond_resched(); + spin_lock(&dentry->d_lock); + } + } + spin_unlock(&dentry->d_lock); + dput(last); + return found; +} + +/* linux/fs/libfs.c: dcache_readdir() */ +/* + * Directory is locked and all positive dentries in it are safe, since + * for ramfs-type trees they can't go away without unlink() or rmdir(), + * both impossible due to the lock on directory. + */ +static int memfs_dcache_readdir(struct file *file, struct dir_context *ctx) +{ + struct dentry *dentry = file->f_path.dentry; + struct dentry *cursor = file->private_data; + struct list_head *anchor = &dentry->d_subdirs; + struct memfs_dir_context *mctx = (struct memfs_dir_context *)ctx; + struct dentry *next = NULL; + struct list_head *p; + + if (!dir_emit_dots(file, ctx)) + return 0; + + if (ctx->pos == 2) + p = anchor; + else if (!list_empty(&cursor->d_child)) + p = &cursor->d_child; + else + return 0; + + while ((next = scan_positives(cursor, p, 1, next)) != NULL) { + /* + * TODO: Add a new f_flags O_HAVE_DIR_CONTEXT_EXT to + * distinguish the normal readdir() access from the user space. 
+ */ + mctx->dentry = next; + if (!dir_emit(ctx, next->d_name.name, next->d_name.len, + d_inode(next)->i_ino, dt_type(d_inode(next)))) + break; + ctx->pos++; + p = &next->d_child; + } + spin_lock(&dentry->d_lock); + if (next) + list_move_tail(&cursor->d_child, &next->d_child); + else + list_del_init(&cursor->d_child); + spin_unlock(&dentry->d_lock); + dput(next); + + return 0; +} +#endif /* HAVE_DENTRY_D_CHILDREN */ + +/* + * Copied from @simple_write_end in the kernel. + * It does not export on the new kernel such as rhel9. + */ +static int memfs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned int len, unsigned int copied, + struct page *page, void *fsdata) +{ + struct inode *inode = page->mapping->host; + loff_t last_pos = pos + copied; + + /* zero the stale part of the page if we did a short copy */ + if (!PageUptodate(page)) { + if (copied < len) { + unsigned int from = pos & (PAGE_SIZE - 1); + + zero_user(page, from + copied, len - copied); + } + SetPageUptodate(page); + } + /* + * No need to use i_size_read() here, the i_size + * cannot change under us because we hold the i_mutex. + */ + if (last_pos > inode->i_size) + i_size_write(inode, last_pos); + + set_page_dirty(page); + unlock_page(page); + put_page(page); + + return copied; +} + +/* TODO: implement file splice read/write interface for MemFS. */ +static ssize_t memfs_file_splice_read(struct file *in_file, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t count, unsigned int flags) +{ + RETURN(0); +} + +/* + * linux/mm/shmem.c + * TODO: mmap support. + */ +static int memfs_getpage(struct inode *inode, pgoff_t index, + struct page **pagep) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page; + + if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT)) + return -EFBIG; + + page = find_lock_page(mapping, index); + /* fallocated page? 
*/ + if (page && !PageUptodate(page)) { + unlock_page(page); + put_page(page); + page = NULL; + } + + *pagep = page; + return 0; +} + +#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER +/* linux/mm/shmem.c shmem_file_read_iter() */ +static ssize_t memfs_file_read_iter(struct kiocb *iocb, + struct iov_iter *to) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct address_space *mapping = inode->i_mapping; + loff_t *ppos = &iocb->ki_pos; + unsigned long offset; + ssize_t retval = 0; + pgoff_t index; + int error = 0; + + ENTRY; + + /* + * Might this read be for a stacking filesystem? Then when reading + * holes of a sparse file, we actually need to allocate those pages, + * and even mark them dirty, so it cannot exceed the max_blocks limit. + */ + + index = *ppos >> PAGE_SHIFT; + offset = *ppos & ~PAGE_MASK; + + for (;;) { + struct page *page = NULL; + pgoff_t end_index; + unsigned long nr, ret; + loff_t i_size = i_size_read(inode); + + end_index = i_size >> PAGE_SHIFT; + if (index > end_index) + break; + if (index == end_index) { + nr = i_size & ~PAGE_MASK; + if (nr <= offset) + break; + } + + error = memfs_getpage(inode, index, &page); + if (error) { + if (error == -EINVAL) + error = 0; + break; + } + if (page) + unlock_page(page); + + /* + * We must evaluate after, since reads (unlike writes) + * are called without i_mutex protection against truncate + */ + nr = PAGE_SIZE; + i_size = i_size_read(inode); + end_index = i_size >> PAGE_SHIFT; + if (index == end_index) { + nr = i_size & ~PAGE_MASK; + if (nr <= offset) { + if (page) + put_page(page); + break; + } + } + nr -= offset; + + if (page) { + /* + * If users can be writing to this page using arbitrary + * virtual addresses, take care about potential aliasing + * before reading the page on the kernel side. + */ + if (mapping_writably_mapped(mapping)) + flush_dcache_page(page); + /* + * Mark the page accessed if we read the beginning. 
+ */ + if (!offset) + mark_page_accessed(page); + } else { + page = ZERO_PAGE(0); + get_page(page); + } + + /* + * Ok, we have the page, and it's up-to-date, so + * now we can copy it to user space... + */ + ret = copy_page_to_iter(page, offset, nr, to); + retval += ret; + offset += ret; + index += offset >> PAGE_SHIFT; + offset &= ~PAGE_MASK; + + put_page(page); + if (!iov_iter_count(to)) + break; + if (ret < nr) { + error = -EFAULT; + break; + } + cond_resched(); + } + + *ppos = ((loff_t) index << PAGE_SHIFT) + offset; + file_accessed(file); + return retval ? retval : error; +} + +/* TODO: space limiting for write. */ +static ssize_t memfs_file_write_iter(struct kiocb *iocb, + struct iov_iter *iter) +{ + RETURN(generic_file_write_iter(iocb, iter)); +} + +#else + +/* + * It can not use simple_readpage() directly in Linux ramfs especially when + * there are holes in the file which is cached MemFS. It must rewrite the read + * VFS interface similar to Linux tmpfs. + */ +/* linux/mm/filemap.c */ +static int memfs_file_read_actor(read_descriptor_t *desc, struct page *page, + unsigned long offset, unsigned long size) +{ + char *kaddr; + unsigned long left, count = desc->count; + + if (size > count) + size = count; + + /* + * Faults on the destination of a read are common, so do it before + * taking the kmap. 
+ */ + if (IS_ENABLED(CONFIG_HIGHMEM) && + !fault_in_pages_writeable(desc->arg.buf, size)) { + kaddr = kmap_atomic(page); + left = __copy_to_user_inatomic(desc->arg.buf, + kaddr + offset, size); + kunmap_atomic(kaddr); + if (left == 0) + goto success; + } + + /* Do it the slow way */ + kaddr = kmap(page); + left = __copy_to_user(desc->arg.buf, kaddr + offset, size); + kunmap(page); + + if (left) { + size -= left; + desc->error = -EFAULT; + } +success: + desc->count = count - size; + desc->written += size; + desc->arg.buf += size; + return size; +} + +/* linux/mm/shmem.c do_shmem_file_read() */ +static void do_memfs_file_read(struct file *filp, + loff_t *ppos, read_descriptor_t *desc, + read_actor_t actor) +{ + struct inode *inode = file_inode(filp); + struct address_space *mapping = inode->i_mapping; + pgoff_t index; + unsigned long offset; + + /* + * Might this read be for a stacking filesystem? Then when reading + * holes of a sparse file, we actually need to allocate those pages, + * and even mark them dirty, so it cannot exceed the max_blocks limit. 
+ */ + + index = *ppos >> PAGE_SHIFT; + offset = *ppos & ~PAGE_MASK; + + for (;;) { + struct page *page = NULL; + pgoff_t end_index; + unsigned long nr, ret; + loff_t i_size = i_size_read(inode); + + end_index = i_size >> PAGE_SHIFT; + if (index > end_index) + break; + if (index == end_index) { + nr = i_size & ~PAGE_MASK; + if (nr <= offset) + break; + } + + desc->error = memfs_getpage(inode, index, &page); + if (desc->error) { + if (desc->error == -EINVAL) + desc->error = 0; + break; + } + if (page) + unlock_page(page); + + /* + * We must evaluate after, since reads (unlike writes) + * are called without i_mutex protection against truncate + */ + nr = PAGE_SIZE; + i_size = i_size_read(inode); + end_index = i_size >> PAGE_SHIFT; + if (index == end_index) { + nr = i_size & ~PAGE_MASK; + if (nr <= offset) { + if (page) + put_page(page); + break; + } + } + nr -= offset; + + if (page) { + /* + * If users can be writing to this page using arbitrary + * virtual addresses, take care about potential aliasing + * before reading the page on the kernel side. + */ + if (mapping_writably_mapped(mapping)) + flush_dcache_page(page); + /* + * Mark the page accessed if we read the beginning. + */ + if (!offset) + mark_page_accessed(page); + } else { + page = ZERO_PAGE(0); + get_page(page); + } + + /* + * Ok, we have the page, and it's up-to-date, so + * now we can copy it to user space... + * + * The actor routine returns how many bytes were actually used.. + * NOTE! This may not be the same as how much of a user buffer + * we filled up (we may be padding etc), so we can only update + * "pos" here (the actor routine has to update the user buffer + * pointers and the remaining count). 
+ */ + ret = actor(desc, page, offset, nr); + offset += ret; + index += offset >> PAGE_SHIFT; + offset &= ~PAGE_MASK; + + put_page(page); + if (ret != nr || !desc->count) + break; + + cond_resched(); + } + + *ppos = ((loff_t) index << PAGE_SHIFT) + offset; + file_accessed(filp); +} + +static ssize_t memfs_file_aio_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct file *filp = iocb->ki_filp; + ssize_t retval; + unsigned long seg; + size_t count; + loff_t *ppos = &iocb->ki_pos; + + retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); + if (retval) + return retval; + + for (seg = 0; seg < nr_segs; seg++) { + read_descriptor_t desc; + + desc.written = 0; + desc.arg.buf = iov[seg].iov_base; + desc.count = iov[seg].iov_len; + if (desc.count == 0) + continue; + desc.error = 0; + do_memfs_file_read(filp, ppos, &desc, memfs_file_read_actor); + retval += desc.written; + if (desc.error) { + retval = retval ?: desc.error; + break; + } + if (desc.count > 0) + break; + } + return retval; +} + +static ssize_t memfs_file_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + RETURN(do_sync_read(file, buf, count, ppos)); +} + +static ssize_t memfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + RETURN(generic_file_aio_write(iocb, iov, nr_segs, pos)); +} + +static ssize_t memfs_file_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + RETURN(do_sync_write(file, buf, count, ppos)); +} +#endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */ + +static void memfs_put_super(struct super_block *sb) +{ + struct memfs_sb_info *sbinfo = MEMFS_SB(sb); + + OBD_FREE_PTR(sbinfo); + sb->s_fs_info = NULL; +} + +#ifdef HAVE_FS_CONTEXT_H +static int memfs_fill_super(struct super_block *sb, struct fs_context *fc) +{ + struct memfs_options *ctx = fc->fs_private; + struct memfs_sb_info *sbinfo; + struct inode *inode; + int rc; + + 
ENTRY; + + OBD_ALLOC_PTR(sbinfo); + if (!sbinfo) + return -ENOMEM; + + sb->s_fs_info = sbinfo; + sb->s_flags |= SB_NOUSER | SB_NOSEC; + + sbinfo->msi_uid = ctx->meo_uid; + sbinfo->msi_gid = ctx->meo_gid; + sbinfo->msi_mode = ctx->meo_mode; + sbinfo->msi_max_blocks = ctx->meo_blocks; + sbinfo->msi_free_inodes = sbinfo->msi_max_inodes = ctx->meo_inodes; + /* Swap space for the larger capacity is not supported. */ + sbinfo->msi_noswap = true; + + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_blocksize = PAGE_SIZE; + sb->s_blocksize_bits = PAGE_SHIFT; + sb->s_magic = WBCFS_MAGIC; + sb->s_op = &memfs_ops; + sb->s_d_op = &simple_dentry_operations; + sb->s_time_gran = 1; + uuid_gen(&sb->s_uuid); + + inode = memfs_create_inode(sb, NULL, S_IFDIR | sbinfo->msi_mode, + NULL, 0, true); + if (IS_ERR(inode)) + GOTO(out_fail, rc = PTR_ERR(inode)); + + inode->i_uid = sbinfo->msi_uid; + inode->i_gid = sbinfo->msi_gid; + sb->s_root = d_make_root(inode); + if (!sb->s_root) + GOTO(out_fail, rc = -ENOMEM); + + RETURN(0); +out_fail: + memfs_put_super(sb); + RETURN(rc); +} + +static int memfs_get_tree(struct fs_context *fc) +{ + return get_tree_nodev(fc, memfs_fill_super); +} + +static void memfs_free_fc(struct fs_context *fc) +{ + struct memfs_options *ctx = fc->fs_private; + + if (ctx) + OBD_FREE_PTR(ctx); +} + +static const struct fs_context_operations memfs_context_ops = { + .free = memfs_free_fc, + .get_tree = memfs_get_tree, +}; + +static int memfs_init_fs_context(struct fs_context *fc) +{ + struct memfs_options *ctx; + + OBD_ALLOC_PTR(ctx); + if (!ctx) + return -ENOMEM; + + ctx->meo_mode = 0777 | S_ISVTX; + ctx->meo_uid = current_fsuid(); + ctx->meo_gid = current_fsgid(); + + fc->fs_private = ctx; + fc->ops = &memfs_context_ops; + return 0; +} + +#else /* !HAVE_FS_CONTEXT_H */ + +static int memfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct memfs_sb_info *sbinfo; + struct inode *inode; + int rc; + + /* Round up to L1_CACHE_BYTES to resist false sharing */ + 
OBD_ALLOC_PTR(sbinfo); + if (!sbinfo) + return -ENOMEM; + + sbinfo->msi_mode = S_IRWXUGO | S_ISVTX; + sbinfo->msi_uid = current_fsuid(); + sbinfo->msi_gid = current_fsgid(); + sb->s_fs_info = sbinfo; + + /* + * Per default we only allow half of the physical ram per + * tmpfs instance, limiting inodes to one per page of lowmem; + * but the internal instance is left unlimited. + */ + if (!(sb->s_flags & MS_KERNMOUNT)) { + sbinfo->msi_max_blocks = memfs_default_max_blocks(); + sbinfo->msi_max_inodes = memfs_default_max_inodes(); + } else { + sb->s_flags |= MS_NOUSER; + } + + sb->s_flags |= MS_NOSEC | MS_NOUSER; + sbinfo->msi_free_inodes = sbinfo->msi_max_inodes; + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_blocksize = PAGE_SIZE; + sb->s_blocksize_bits = PAGE_SHIFT; + sb->s_magic = WBCFS_MAGIC; + sb->s_op = &memfs_ops; + sb->s_d_op = &simple_dentry_operations; + sb->s_time_gran = 1; + + inode = memfs_create_inode(sb, NULL, S_IFDIR | sbinfo->msi_mode, NULL, + 0, true); + if (IS_ERR(inode)) + GOTO(out_fail, rc = PTR_ERR(inode)); + + inode->i_uid = sbinfo->msi_uid; + inode->i_gid = sbinfo->msi_gid; + sb->s_root = d_make_root(inode); + if (!sb->s_root) + GOTO(out_fail, rc = -ENOMEM); + return 0; +out_fail: + memfs_put_super(sb); + return rc; +} + +static struct dentry *memfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_nodev(fs_type, flags, data, memfs_fill_super); +} +#endif /* HAVE_FS_CONTEXT_H */ + +static struct kmem_cache *memfs_inode_cachep; + +static struct inode *memfs_alloc_inode(struct super_block *sb) +{ + struct memfs_inode_info *mei; + + mei = kmem_cache_alloc(memfs_inode_cachep, GFP_KERNEL); + if (!mei) + return NULL; + + return &mei->mei_vfs_inode; +} + +static void memfs_destroy_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + + ENTRY; + /* TOOD: free symlink name. 
*/ + kmem_cache_free(memfs_inode_cachep, MEMFS_I(inode)); + EXIT; +} + +static void memfs_destroy_inode(struct inode *inode) +{ + struct memfs_inode_info *mei = MEMFS_I(inode); + + if (mei->mei_index_type == INDEX_TYPE_HASH) + hash_index_fini(&mei->mei_hash_index); + + call_rcu(&inode->i_rcu, memfs_destroy_callback); +} + +static void memfs_init_inode(void *foo) +{ + struct memfs_inode_info *mei = (struct memfs_inode_info *)foo; + + inode_init_once(&mei->mei_vfs_inode); +} + +static void memfs_init_inodecache(void) +{ + memfs_inode_cachep = kmem_cache_create("memfs_inode_cache", + sizeof(struct memfs_inode_info), + 0, SLAB_PANIC | SLAB_ACCOUNT, + memfs_init_inode); +} + +static void memfs_destroy_inodecache(void) +{ + kmem_cache_destroy(memfs_inode_cachep); +} + +static inline bool memfs_mapping(struct address_space *mapping) +{ + return mapping->a_ops == &memfs_aops; +} + +static void memfs_evict_inode(struct inode *inode) +{ + struct memfs_inode_info *mei = MEMFS_I(inode); + + if (memfs_mapping(inode->i_mapping)) { + inode->i_size = 0; + mapping_set_exiting(inode->i_mapping); + truncate_inode_pages_range(inode->i_mapping, 0, (loff_t)-1); + } + + mem_xattrs_fini(&mei->mei_xattrs); + memfs_free_inode(inode->i_sb); + clear_inode(inode); +} + +static int memfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct memfs_sb_info *sbinfo = MEMFS_SB(dentry->d_sb); + + buf->f_type = WBCFS_MAGIC; + buf->f_bsize = PAGE_SIZE; + buf->f_namelen = NAME_MAX; + if (sbinfo->msi_max_blocks) { + buf->f_blocks = sbinfo->msi_max_blocks; + buf->f_bavail = + buf->f_bfree = sbinfo->msi_max_blocks - + percpu_counter_sum(&sbinfo->msi_used_blocks); + } + if (sbinfo->msi_max_inodes) { + buf->f_files = sbinfo->msi_max_inodes; + buf->f_ffree = sbinfo->msi_free_inodes; + } + /* else leave those fields 0 like simple_statfs */ + + return 0; +} + +static const struct super_operations memfs_ops = { + .alloc_inode = memfs_alloc_inode, + .destroy_inode = memfs_destroy_inode, + .statfs = 
memfs_statfs, + .evict_inode = memfs_evict_inode, + .drop_inode = generic_delete_inode, + .put_super = memfs_put_super, +}; + +/* + * TODO: Using the new kernel data structure Maple Tree: + * @simple_offset_dir_operations to manage and access the dentries + * within a directory. It is much more efficient than a linear list. + */ +static const struct file_operations memfs_dir_operations = { + .open = dcache_dir_open, + .release = dcache_dir_close, + .llseek = dcache_dir_lseek, + .read = generic_read_dir, + .iterate_shared = memfs_dcache_readdir, + .fsync = noop_fsync, +}; + +static const struct inode_operations memfs_dir_inode_operations = { + .mknod = memfs_mknod, + .lookup = simple_lookup, + .create = memfs_create_nd, + .unlink = memfs_unlink, + .mkdir = memfs_mkdir, + .rmdir = memfs_rmdir, + .link = memfs_link, + .setattr = simple_setattr, + .getattr = simple_getattr, +}; + +static const struct file_operations memfs_file_operations = { +#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER +# ifdef HAVE_SYNC_READ_WRITE + .read = new_sync_read, + .write = new_sync_write, +# endif + .read_iter = memfs_file_read_iter, + .write_iter = memfs_file_write_iter, +#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */ + .read = memfs_file_read, + .aio_read = memfs_file_aio_read, + .write = memfs_file_write, + .aio_write = memfs_file_aio_write, +#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */ + .mmap = generic_file_mmap, + .llseek = generic_file_llseek, + .splice_read = memfs_file_splice_read, + .fsync = noop_fsync, +}; + +static const struct address_space_operations memfs_aops = { +#ifdef HAVE_DIRTY_FOLIO + .dirty_folio = noop_dirty_folio, +#else + /* + * TODO: reimplement ->set_page_dirty() interface. + * - The call __set_page_dirty_nobuffers will mark the inode dirty and + * put the inode into the writeback control list. Instead, it would + * be better to call mark_inode_dirty() only one time when closing the file + * once the file data was modified. 
+ * - Here it can be optimized to use the lightweight function: + * __set_page_dirty_no_writeback(); The writeback related data + * structure can be delayed to initialize during data assimilation. + */ + .set_page_dirty = __set_page_dirty_nobuffers, +#endif + .write_begin = simple_write_begin, + .write_end = memfs_write_end, +}; + +static struct file_system_type memfs_fstype = { + .owner = THIS_MODULE, + .name = "wbcfs", +#ifdef HAVE_FS_CONTEXT_H + .init_fs_context = memfs_init_fs_context, +#else + .mount = memfs_mount, +#endif + .kill_sb = kill_litter_super, + .fs_flags = FS_USERNS_MOUNT, +}; + +int memfs_init(void) +{ + int rc; + + memfs_init_inodecache(); + rc = register_filesystem(&memfs_fstype); + if (rc) + memfs_destroy_inodecache(); + + return rc; +} + +void memfs_fini(void) +{ + unregister_filesystem(&memfs_fstype); + memfs_destroy_inodecache(); +} diff --git a/lustre/osd-wbcfs/wbcfs.h b/lustre/osd-wbcfs/wbcfs.h new file mode 100644 index 0000000..fdcf746 --- /dev/null +++ b/lustre/osd-wbcfs/wbcfs.h @@ -0,0 +1,183 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Copyright (c) 2025-2026, DDN/Whamcloud, Inc. + */ + +/* + * Embedded memory file system with writeback support that is used for the OSD. 
+ * + * Author: Yingjin Qian + */ + +#ifndef _OSD_WBCFS_H_ +#define _OSD_WBCFS_H_ + +#include +#include +#include +#ifdef HAVE_INODE_IVERSION +#include +#else +#define inode_peek_iversion(__inode) ((__inode)->i_version) +#define inode_inc_iversion(__inode) +#endif + +#include + +#include "index.h" + +/* Pretend that each entry is of this size in directory's i_size */ +#define BOGO_DIRENT_SIZE 20 + +/* Pretend that one inode + its dentry occupy this much memory */ +#define BOGO_INODE_SIZE 1024 + +#define WBCFS_MAGIC 0xbdacbd05 + +/* In-memory xattr list */ +struct mem_xattrs { + spinlock_t mex_lock; + struct list_head mex_xattr_list; +}; + +struct memfs_options { + unsigned long long meo_blocks; + unsigned long long meo_inodes; + kuid_t meo_uid; + kgid_t meo_gid; + umode_t meo_mode; + bool meo_noswap; +}; + +struct memfs_sb_info { + /* How many blocks are allowed. */ + unsigned long msi_max_blocks; + /* How many blocks are allocated. */ + struct percpu_counter msi_used_blocks; + /* How many inodes are allowed. */ + unsigned long msi_max_inodes; + /* How much ispace left for allocation. */ + unsigned long msi_free_inodes; + /* Serialize memfs_sb_info changes. */ + spinlock_t msi_stat_lock; + /* Mount mode for root directory */ + umode_t msi_mode; + /* Mount uid for root directory */ + kuid_t msi_uid; + /* Mount gid for root directory */ + kgid_t msi_gid; + /* Whether enable swap with much larger capacity. */ + bool msi_noswap; + /* Whether there is backing persistent store. */ + bool msi_no_backing; + /* TODO: Quota limits support for MemFS. */ +}; + +enum index_type { + INDEX_TYPE_NONE = 0, + INDEX_TYPE_HASH, + INDEX_TYPE_MTREE, +}; + +/* MemFS inode in-kernel data */ +struct memfs_inode_info { + __u32 mei_flags; + struct mem_xattrs mei_xattrs; + struct lu_fid mei_fid; +#ifdef HAVE_PROJECT_QUOTA + /* Project ID */ + kprojid_t mei_projid; +#endif + /* File creation time. */ + struct timespec64 mei_crtime; + /* + * Index access for dir dentry or indexing KV store. 
+ * Currently only support hash index with linear iterating. + * Next step add Maple Tree index. + * TODO: use maple tree to manage dir entries under this dir. + */ + enum index_type mei_index_type; + struct hash_index mei_hash_index; + /* Stack backing inode with the persistent storage. */ + struct inode *mei_backing; + struct inode mei_vfs_inode; +}; + +#define MEMFS_I(inode) (container_of(inode, struct memfs_inode_info, \ + mei_vfs_inode)) + +#define MEMFS_DIR_EOF ((1ULL << (64 - 1)) - 1) + +struct memfs_dir_context { + struct dir_context super; + struct dentry *dentry; + void *cbdata; +}; + +#ifdef HAVE_PROJECT_QUOTA +static inline __u32 i_projid_read(struct inode *inode) +{ + return (__u32)from_kprojid(&init_user_ns, MEMFS_I(inode)->mei_projid); +} + +static inline void i_projid_write(struct inode *inode, __u32 projid) +{ + kprojid_t kprojid; + + kprojid = make_kprojid(&init_user_ns, (projid_t)projid); + MEMFS_I(inode)->mei_projid = kprojid; +} +#else +static inline uid_t i_projid_read(struct inode *inode) +{ + return 0; +} +static inline void i_projid_write(struct inode *inode, __u32 projid) +{ +} +#endif + +static inline int memfs_test_inode_by_fid(struct inode *inode, void *opaque) +{ + return lu_fid_eq(&MEMFS_I(inode)->mei_fid, opaque); +} + +static inline __u64 memfs_get_btime(struct inode *inode) +{ + return MEMFS_I(inode)->mei_crtime.tv_sec; +} + +static inline __u32 memfs_get_flags(struct inode *inode) +{ + return MEMFS_I(inode)->mei_flags; +} + +static inline unsigned long memfs_default_max_blocks(void) +{ + return cfs_totalram_pages() / 2; +} + +static inline unsigned long memfs_default_max_inodes(void) +{ + unsigned long nr_pages = cfs_totalram_pages(); + + /* + * return min(nr_pages - totalhigh_pages(), nr_pages / 2); + */ + return nr_pages / 2; +} + +int memfs_xattr_get(struct inode *inode, void *buf, size_t len, + const char *name); +int memfs_xattr_set(struct inode *inode, void *buf, size_t len, + const char *name, int flags); +void 
memfs_xattr_del(struct inode *inode, const char *name); + +struct inode *memfs_create_inode(struct super_block *sb, struct inode *dir, + umode_t mode, struct iattr *iattr, dev_t dev, + bool update_link); + +int memfs_init(void); +void memfs_fini(void); +#endif /* _OSD_WBCFS_H_ */ diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 0c8d90a..2d6238f 100755 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -86,6 +86,105 @@ if [[ "$CLIENT_OS_ID_LIKE" =~ "rhel" ]]; then fi fi +if [[ "$FSTYPE" = "wbcfs" ]]; then + # Lack of lprocfs support + always_except LU-18813 0f 27A 53 66 270a + # lack of lprocfs: osd.*.nonrotational + always_except LU-18813 119e 119f 119g 119h + # No stats (similar to openZFS) + always_except LU-18813 156 + # MemFS-based OSD (wbcfs) cannot recovery from a server restart + always_except LU-18813 17o 27oo 27z 27F 60a 64i 232 257 + always_except LU-18813 278 280 427 801c 818 820 + # Symlink/CHR/SOCK/FIFO/BLK file types do not support + always_except LU-18813 17a 17b 17e 17g 17i 17p 21 25a + always_except LU-18813 25b 26a 26c 26d 26e 26f 27ga 27Q + always_except LU-18813 28 32e 32f 32g 32h 32m 32n 32o + always_except LU-18813 32p 48a 54a 54c 54d 56l 56m 56n 56rd + always_except LU-18813 56xb 56eb 56eg 56eh 56ei 133a 140 170b + always_except LU-18813 162a 226a + # Truncate operation is not supported yet. + always_except LU-18813 27p 27q 34a + # cross directory hardlink in DNE env + always_except LU-18813 31g 31l 31m + # FMD not expired: cannot reproduce on local testing + always_except LU-18813 36g + # Filemap is not supported yet. + always_except LU-18813 44f 130a 130b 130c 130d 130e 130i 430a + # inodes/blocks space usage accounting and statfs() is not supported + always_except LU-18813 51b 56ab 81b 220 413 418 806 + # lsattr: append-only/immutable flags + always_except LU-18813 52a 52b + # xattr_list() is not implemented yet + always_except LU-18813 102a 102h 102i 102r 102t + # linkea and fid2path wrong... 
+ always_except LU-18813 154B 154f 154g + # changelog related failures: wbcfs-target device label is not correct + always_except LU-18813 160 161c 161d 205a 65k 807 808 812 + # DNE does not work well + always_except LU-18813 56 65e 65a 406 + # user.job XATTR + always_except LU-18813 205h + # Exclusive open timeout + always_except LU-18813 208 + # OFD access log failure + always_except LU-18813 165 + # rename() operations: the source may not be empty + # always_except LU-18813 214 + # Data page cache has been updated during bulk write + always_except LU-18813 224d + # fid2path failure + always_except LU-18813 226d + # ladvise failure + always_except LU-18813 255 + # sec related failure + always_except LU-18813 258 + # DoM migration failure + always_except LU-18813 272 + # Unknown reason timeout! + always_except LU-18813 275 277 311 410 414 419 831 + # last_rcvd should fail + always_except LU-18813 313 314 315 + # block accounting is wrong... + always_except LU-18813 317 + # Other timeouts + always_except LU-18813 200 350 398 399 403 404 408 432 433 + # DIO locking issue? + always_except LU-18813 398a + # Layout swap is not working + always_except LU-18813 405 + # Memory pressure under memcg control + always_except LU-18813 411 + # rmfid in DNE and in large numbers + always_except LU-18813 421 + # local testing passed but Maloo testing failed! + always_except LU-18813 27Cg 27U 422 424 425 426 428 429 434 442 + # OOM failure + always_except LU-18813 430b 430c 431 814 833 850 + # Expired barrier + always_except LU-18813 801a 801b + # ro is not implemented yet + always_except LU-18813 802b + # openZFS related partial page write + always_except LU-18813 810 + # Quota is not supported yet... 
+ always_except LU-18813 812b + # ldlm kunit test + always_except LU-18813 842 + # fanotify does not work + always_except LU-18813 851 + # MGC locks and client umount + always_except LU-18813 901 + # destroy takes too much time + always_except LU-18813 903 +fi + +# Although every sanity.sh test has been run, we stop sooner for +# stability reasons. As we get farther, increment the STOP_AT value. +if [[ "$FSTYPE" = "wbcfs" ]]; then + export STOP_AT=${STOP_AT:-"440"} +fi + build_test_filter FAIL_ON_ERROR=false @@ -15637,7 +15736,7 @@ test_123a_base() { # was test 123, statahead(b=11401) log "$lsx done" stime=$SECONDS - rm -r $DIR/$tdir + rm -r $DIR/$tdir || error "failed to rm $DIR/$tdir" sync etime=$SECONDS delta=$((etime - stime)) @@ -25773,6 +25872,9 @@ test_250() { [ "$(facet_fstype ost$(($($LFS getstripe -i $DIR/$tfile) + 1)))" = "zfs" ] \ && skip "no 16TB file size limit on ZFS" + [ "$(facet_fstype ost$(($($LFS getstripe -i $DIR/$tfile) + 1)))" = "wbcfs" ] \ + && skip "no 16TB file size limit on wbcfs" + $LFS setstripe -c 1 $DIR/$tfile # ldiskfs extent file size limit is (16TB - 4KB - 1) bytes local size=$((16 * 1024 * 1024 * 1024 * 1024 - 4096 - 1)) diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index 8c24bcc..e392963 100755 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -1135,6 +1135,8 @@ load_modules_local() { elif [[ $(node_fstypes $HOSTNAME) == *ldiskfs* ]]; then load_module ../ldiskfs/ldiskfs load_module osd-ldiskfs/osd_ldiskfs + elif [[ $(node_fstypes $HOSTNAME) == *wbcfs* ]]; then + load_module osd-wbcfs/osd_wbcfs fi load_module mgs/mgs load_module mdd/mdd @@ -1759,6 +1761,8 @@ devicelabel() { zfs) label=$(do_facet ${facet} "$ZFS get -H -o value lustre:svname \ ${dev} 2>/dev/null");; + wbcfs) + label="wbcfs-target";; *) error "unknown fstype!";; esac @@ -2497,6 +2501,8 @@ mount_facet() { local fstype=$(facet_fstype $facet) local devicelabel local dm_dev=${!dev} + local index=$(facet_index 
$facet) + local node_type=$(facet_type $facet) [[ $dev == "mgsfailover_dev" ]] && combined_mgs_mds && dev=mds1failover_dev @@ -2519,21 +2525,63 @@ mount_facet() { devicelabel=$(do_facet ${facet} "$ZFS get -H -o value \ lustre:svname $dm_dev");; + wbcfs) + :;; *) error "unknown fstype!";; esac - echo "Starting ${facet}: $opts $dm_dev $mntpt" # for testing LU-482 error handling in mount_facets() and test_0a() if [ -f $TMP/test-lu482-trigger ]; then RC=2 else local seq_width=$(($OSTSEQWIDTH / $OSTCOUNT)) (( $seq_width >= 16384 )) || seq_width=16384 - do_facet ${facet} \ - "mkdir -p $mntpt; $MOUNT_CMD $opts $dm_dev $mntpt" + + case $fstype in + wbcfs) + echo "Start ${facet}: $MOUNT_CMD -v lustre-wbcfs $mntpt" + + export OSD_WBC_FSNAME="$FSNAME" + export OSD_WBC_INDEX="$index" + export OSD_WBC_MGS_NID="$MGSNID" + + case $node_type in + OST) + export OSD_WBC_TGT_TYPE="OST" + ;; + MDS) + export OSD_WBC_TGT_TYPE="MDT" + if (( $index == 0 )) && + [[ "$mds_HOST" == "$mgs_HOST" ]]; then + export OSD_WBC_PRIMARY_MDT="1" + else + export OSD_WBC_PRIMARY_MDT="0" + fi + ;; + MGS) + export OSD_WBC_TGT_TYPE="MGT" + ;; + *) + error "Unhandled node_type!" + esac + + do_facet ${facet} "mkdir -p $mntpt; \ + OSD_WBC_TGT_TYPE=$OSD_WBC_TGT_TYPE \ + OSD_WBC_INDEX=$OSD_WBC_INDEX \ + OSD_WBC_MGS_NID=$OSD_WBC_MGS_NID \ + OSD_WBC_PRIMARY_MDT=$OSD_WBC_PRIMARY_MDT \ + OSD_WBC_FSNAME=$OSD_WBC_FSNAME \ + $MOUNT_CMD -v lustre-wbcfs $mntpt" + ;; + *) + echo "Start ${facet}: $MOUNT_CMD $opts $dm_dev $mntpt" + do_facet ${facet} \ + "mkdir -p $mntpt; $MOUNT_CMD $opts $dm_dev $mntpt" + esac + RC=${PIPESTATUS[0]} - if [[ ${facet} =~ ost ]]; then + if [[ ${facet} =~ ost ]] && [[ ! 
"$fstype" == "wbcfs" ]]; then do_facet ${facet} "$LCTL set_param \ seq.cli-$(devicelabel $facet $dm_dev)-super.width=$seq_width" fi @@ -2566,6 +2614,8 @@ mount_facet() { grep -E ':[a-zA-Z]{3}[0-9]{4}'" "" || error "$dm_dev failed to initialize!";; + wbcfs) + :;; *) error "unknown fstype!";; esac @@ -5019,6 +5069,8 @@ ostdevname() { #try $OSTZFSDEVn - independent of vdev DEVNAME=OSTZFSDEV$num eval DEVPTR=${!DEVNAME:=${FSNAME}-ost${num}/ost${num}};; + wbcfs ) + :;; * ) error "unknown fstype!";; esac @@ -5043,6 +5095,8 @@ ostvdevname() { # Device formatted by zfs DEVNAME=OSTDEV$num eval VDEVPTR=${!DEVNAME:=${OSTDEVBASE}${num}};; + wbcfs ) + :;; * ) error "unknown fstype!";; esac @@ -5067,6 +5121,8 @@ mdsdevname() { # try $MDSZFSDEVn - independent of vdev DEVNAME=MDSZFSDEV$num eval DEVPTR=${!DEVNAME:=${FSNAME}-mdt${num}/mdt${num}};; + wbcfs ) + :;; * ) error "unknown fstype!";; esac @@ -5089,6 +5145,8 @@ mdsvdevname() { # Device formatted by ZFS local DEVNAME=MDSDEV$num eval VDEVPTR=${!DEVNAME:=${MDSDEVBASE}${num}};; + wbcfs ) + :;; * ) error "unknown fstype!";; esac @@ -5117,6 +5175,8 @@ mgsdevname() { else DEVPTR=${MGSZFSDEV:-${FSNAME}-mgs/mgs} fi;; + wbcfs ) + :;; * ) error "unknown fstype!";; esac @@ -5141,6 +5201,8 @@ mgsvdevname() { elif [ -n "$MGSDEV" ]; then VDEVPTR=$MGSDEV fi;; + wbcfs ) + :;; * ) error "unknown fstype!";; esac @@ -5546,6 +5608,9 @@ __touch_device() format_mgs() { local quiet + local fstype=$(facet_fstype mgs) + + [[ "$fstype" == "wbcfs" ]] && return if ! $VERBOSE; then quiet=yes @@ -5565,6 +5630,9 @@ format_mgs() { format_mdt() { local num=$1 local quiet + local fstype=$(facet_fstype mdt$num) + + [[ "$fstype" == "wbcfs" ]] && return if ! $VERBOSE; then quiet=yes @@ -5581,6 +5649,9 @@ format_mdt() { format_ost() { local num=$1 + local fstype=$(facet_fstype ost$num) + + [[ "$fstype" == "wbcfs" ]] && return if ! 
$VERBOSE; then quiet=yes @@ -6640,6 +6711,11 @@ do_check_and_cleanup_lustre() { run_lfsck fi + # FIXME: The cleanup takes too long, times out... + if [[ "$FSTYPE" == "wbcfs" ]]; then + DO_CLEANUP=false + fi + if is_mounted $MOUNT; then if $DO_CLEANUP; then [[ -n "$DIR" ]] && rm -rf $DIR/[Rdfs][0-9]* || @@ -7513,6 +7589,10 @@ run_test() { return 0 else run_one_logged $testnum "$testmsg" + # TODO: Avoid running out of space!? + if [[ "$FSTYPE" == "wbcfs" ]]; then + rm -rf "$MOUNT/*" + fi return $? fi } @@ -8715,7 +8795,13 @@ convert_facet2label() { if [ -n "${!varsvc}" ]; then echo ${!varsvc} else - error "No label for $facet!" + # FIXME: Cannot find label correctly for some reason. + # Just assume wbcfs OSD and continue... + if [[ "$FSTYPE" == "wbcfs" ]]; then + echo "wbcfs-target" + else + error "No label for $facet!" + fi fi } diff --git a/lustre/utils/Makefile.am b/lustre/utils/Makefile.am index 0f775c6..d95c416 100644 --- a/lustre/utils/Makefile.am +++ b/lustre/utils/Makefile.am @@ -128,6 +128,9 @@ endif if ZFS_ENABLED LIB_TARGETS += mount_osd_zfs.so endif +if SERVER +LIB_TARGETS += mount_osd_wbcfs.so +endif endif install-exec-hook: @@ -214,6 +217,24 @@ PLUGIN_LIB += libmount_utils_ldiskfs.a endif # PLUGINS endif # LDISKFS_ENABLED +if SERVER +noinst_LIBRARIES += libmount_utils_wbcfs.a + +libmount_utils_wbcfs_a_SOURCES = libmount_utils_wbcfs.c +libmount_utils_wbcfs_a_CPPFLAGS := + +if PLUGINS +lib_LTLIBRARIES += libmount_utils_wbcfs.la +libmount_utils_wbcfs.la : libmount_utils_wbcfs.a + $(CC) $(LDFLAGS) $(MNTMODLDFLAGS) -shared -Wl,--export-dynamic \ + -o mount_osd_wbcfs.so \ + `$(AR) -t libmount_utils_wbcfs.a` \ + $(MNTMODLIBS) +else +PLUGIN_LIB += libmount_utils_wbcfs.a +endif # PLUGINS +endif # SERVER + mount_lustre_SOURCES = mount_lustre.c mount_utils.c mount_utils.h $(GSSSRC) \ lustre_param.c mount_lustre_CPPFLAGS := ${MNTMODCFLAGS} diff --git a/lustre/utils/libmount_utils_wbcfs.c b/lustre/utils/libmount_utils_wbcfs.c new file mode 100644 index 
0000000..41cdf08 --- /dev/null +++ b/lustre/utils/libmount_utils_wbcfs.c @@ -0,0 +1,219 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (c) 2024, Amazon and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Author: Timothy Day + */ + +#include "mount_utils.h" + +#define VAR_SIZE 64 + +enum osd_tgt_type { + MGT, + MDT, + OST, + INVALID +}; + +int wbcfs_write_ldd(struct mkfs_opts *mop) +{ + return 0; +} + +int wbcfs_erase_ldd(struct mkfs_opts *mop, char *param) +{ + return 0; +} + +static int get_wbcfs_env(char *out, char *env) +{ + if (!getenv(env)) { + fprintf(stderr, "%s is undefined\n", env); + return -EINVAL; + } + + strscpy(out, getenv(env), VAR_SIZE); + fprintf(stderr, "%s=%s\n", env, out); + + return 0; +} + +int wbcfs_read_ldd(char *ds, struct lustre_disk_data *ldd) +{ + enum osd_tgt_type tgt_type = INVALID; + char tgt_type_var[VAR_SIZE]; + char name_var[VAR_SIZE]; + char params[2 * VAR_SIZE]; + char svname[2 * VAR_SIZE]; + int rc = 0; + + memset(ldd, 0, sizeof(struct lustre_disk_data)); + ldd->ldd_magic = LDD_MAGIC; + ldd->ldd_config_ver = 1; + ldd->ldd_mount_type = LDD_MT_WBCFS; + + rc = get_wbcfs_env(tgt_type_var, "OSD_WBC_TGT_TYPE"); + if (rc) + return rc; + + if (!strcmp(tgt_type_var, "OST")) { + ldd->ldd_flags = LDD_F_UPDATE | LDD_F_VIRGIN | + LDD_F_SV_TYPE_OST; + tgt_type = OST; + } + + if (!strcmp(tgt_type_var, "MGT")) { + ldd->ldd_flags = LDD_F_UPDATE | LDD_F_VIRGIN | + LDD_F_SV_TYPE_MGS; + tgt_type = MGT; + } + + if (!strcmp(tgt_type_var, "MDT")) { + rc = get_wbcfs_env(tgt_type_var, "OSD_WBC_PRIMARY_MDT"); + if (rc) + return rc; + + if (!strcmp(tgt_type_var, "1")) { + ldd->ldd_flags = LDD_F_UPDATE | LDD_F_VIRGIN | + LDD_F_SV_TYPE_MDT | LDD_F_SV_TYPE_MGS; + } else { + ldd->ldd_flags = LDD_F_UPDATE | LDD_F_VIRGIN | + LDD_F_SV_TYPE_MDT; + } + + tgt_type = MDT; + } + + if (tgt_type == INVALID) { + fprintf(stderr, "OSD_WBC_TGT_TYPE is invalid\n"); + return -EINVAL; + } + + rc = 
get_wbcfs_env(name_var, "OSD_WBC_FSNAME"); + if (rc) + return rc; + + strscpy(ldd->ldd_fsname, name_var, VAR_SIZE); + + if (!getenv("OSD_WBC_INDEX")) { + fprintf(stderr, "OSD_WBC_INDEX is undefined\n"); + return -EINVAL; + } + + rc = get_wbcfs_env(tgt_type_var, "OSD_WBC_INDEX"); + if (rc) + return rc; + + ldd->ldd_svindex = strtol(tgt_type_var, + NULL, 0); + + if (tgt_type == MGT) + snprintf(svname, 2 * VAR_SIZE, "%s:%s%04x", + ldd->ldd_fsname, "MGS", + ldd->ldd_svindex); + + if (tgt_type == MDT) + snprintf(svname, 2 * VAR_SIZE, "%s:%s%04x", + ldd->ldd_fsname, "MDT", + ldd->ldd_svindex); + + if (tgt_type == OST) + snprintf(svname, 2 * VAR_SIZE, "%s:%s%04x", + ldd->ldd_fsname, "OST", + ldd->ldd_svindex); + + strscpy(ldd->ldd_svname, svname, VAR_SIZE); + + fprintf(stderr, "svname -> %s\n", svname); + + rc = get_wbcfs_env(tgt_type_var, "OSD_WBC_MGS_NID"); + if (rc) + return rc; + + if (tgt_type != MGT) { + snprintf(params, 2 * VAR_SIZE, "mgsnode=%s", + tgt_type_var); + strscpy(ldd->ldd_params, params, VAR_SIZE); + fprintf(stderr, "params -> %s\n", params); + } + + return 0; +} + +void wbcfs_print_ldd_params(struct mkfs_opts *mop) +{ +} + +int wbcfs_is_lustre(char *ds, unsigned int *mount_type) +{ + if (!strcmp(ds, OSD_WBCFS_DEV)) { + fprintf(stderr, "Lustre is using wbcfs as backend\n"); + *mount_type = LDD_MT_WBCFS; + return 1; + } + + return 0; +} + +int wbcfs_make_lustre(struct mkfs_opts *mop) +{ + return 0; +} + +int wbcfs_enable_quota(struct mkfs_opts *mop) +{ + return -EOPNOTSUPP; +} + +int wbcfs_prepare_lustre(struct mkfs_opts *mop, + char *wanted_mountopts, size_t len) +{ + return 0; +} + +int wbcfs_tune_lustre(char *dev, struct mount_opts *mop) +{ + return 0; +} + +int wbcfs_label_lustre(struct mount_opts *mop) +{ + return 0; +} + +int wbcfs_rename_fsname(struct mkfs_opts *mop, const char *oldname) +{ + return 0; +} + +int wbcfs_init(void) +{ + return 0; +} + +void wbcfs_fini(void) +{ +} + +#ifndef PLUGIN_DIR +struct module_backfs_ops wbcfs_ops = { + .init = 
wbcfs_init, + .fini = wbcfs_fini, + .read_ldd = wbcfs_read_ldd, + .write_ldd = wbcfs_write_ldd, + .erase_ldd = wbcfs_erase_ldd, + .print_ldd_params = wbcfs_print_ldd_params, + .is_lustre = wbcfs_is_lustre, + .make_lustre = wbcfs_make_lustre, + .prepare_lustre = wbcfs_prepare_lustre, + .tune_lustre = wbcfs_tune_lustre, + .label_lustre = wbcfs_label_lustre, + .enable_quota = wbcfs_enable_quota, + .rename_fsname = wbcfs_rename_fsname, +}; +#endif /* PLUGIN_DIR */ diff --git a/lustre/utils/mount_lustre.c b/lustre/utils/mount_lustre.c index 257b528..ad7be84 100644 --- a/lustre/utils/mount_lustre.c +++ b/lustre/utils/mount_lustre.c @@ -732,6 +732,22 @@ static int parse_opts(int argc, char *const argv[], struct mount_opts *mop) if (!mop->mo_usource) usage(stderr); +#ifdef HAVE_SERVER_SUPPORT + /* osd-wbcfs lustre_tgt */ + if (strcmp(mop->mo_usource, OSD_WBCFS_DEV) == 0) { + mop->mo_ldd.ldd_mount_type = LDD_MT_WBCFS; + mop->mo_source = strdup(mop->mo_usource); + if (!realpath(argv[optind + 1], mop->mo_target)) { + rc = errno; + fprintf(stderr, "warning: %s: cannot resolve: %s\n", + argv[optind], strerror(errno)); + return rc; + } + + return 0; + } +#endif + /** * Try to get the real path to the device, in case it is a * symbolic link for instance diff --git a/lustre/utils/mount_utils.c b/lustre/utils/mount_utils.c index 8224929..08014fa 100644 --- a/lustre/utils/mount_utils.c +++ b/lustre/utils/mount_utils.c @@ -597,6 +597,9 @@ struct module_backfs_ops *load_backfs_module(enum ldd_mount_type mount_type) ops = &zfs_ops; break; #endif /* HAVE_ZFS_OSD */ + case LDD_MT_WBCFS: + ops = &wbcfs_ops; + break; default: ops = NULL; break; diff --git a/lustre/utils/mount_utils.h b/lustre/utils/mount_utils.h index 2ebf615..0f04381 100644 --- a/lustre/utils/mount_utils.h +++ b/lustre/utils/mount_utils.h @@ -139,6 +139,7 @@ static inline const char *mt_str(enum ldd_mount_type mt) "reiserfs", "ldiskfs2", "zfs", + "wbcfs", }; return mount_type_string[mt]; @@ -156,10 +157,13 @@ static 
inline const char *mt_type(enum ldd_mount_type mt) "osd-reiserfs", "osd-ldiskfs", "osd-zfs", + "osd-wbcfs", }; return mount_type_string[mt]; } + +#define OSD_WBCFS_DEV "lustre-wbcfs" #endif /* HAVE_SERVER_SUPPORT */ #define MT_STR(data) mt_str((data)->ldd_mount_type) @@ -241,6 +245,7 @@ struct module_backfs_ops { extern struct module_backfs_ops zfs_ops; extern struct module_backfs_ops ldiskfs_ops; +extern struct module_backfs_ops wbcfs_ops; struct module_backfs_ops *load_backfs_module(enum ldd_mount_type mount_type); void unload_backfs_ops(struct module_backfs_ops *ops); diff --git a/rpm/kmp-lustre-osd-wbcfs.files b/rpm/kmp-lustre-osd-wbcfs.files new file mode 100644 index 0000000..f567b09 --- /dev/null +++ b/rpm/kmp-lustre-osd-wbcfs.files @@ -0,0 +1,4 @@ +%defattr(-,root,root) +%dir %{modules_fs_path}/%{lustre_name}-osd-wbcfs +%dir %{modules_fs_path}/%{lustre_name}-osd-wbcfs/fs +%{modules_fs_path}/%{lustre_name}-osd-wbcfs/fs/osd_wbcfs.ko diff --git a/rpm/kmp-lustre-osd-wbcfs.preamble b/rpm/kmp-lustre-osd-wbcfs.preamble new file mode 100644 index 0000000..0933568 --- /dev/null +++ b/rpm/kmp-lustre-osd-wbcfs.preamble @@ -0,0 +1,8 @@ +License: GPL-2.0-only +%if 0%{?suse_version} > 1 +Requires: kernel-%1 +%endif +Requires: %{name}-osd-wbcfs-mount = %{version} +Provides: %{name}-osd = %{version} +Provides: %{name}-osd-wbcfs = %{version} +Obsoletes: %{name}-osd-wbcfs < %{version} -- 1.8.3.1