Whamcloud - gitweb
LU-10092 First phase of persistent client cache project merging 14/35214/1
authorOleg Drokin <green@whamcloud.com>
Thu, 13 Jun 2019 04:36:36 +0000 (00:36 -0400)
committerOleg Drokin <green@whamcloud.com>
Thu, 13 Jun 2019 04:37:28 +0000 (00:37 -0400)
Merge remote-tracking branch 'origin/pcc'

Change-Id: I87a681c54712926d336c983dd8e56b58ebf4b612
Signed-off-by: Oleg Drokin <green@whamcloud.com>
63 files changed:
lustre/autoconf/lustre-core.m4
lustre/doc/Makefile.am
lustre/doc/lctl-pcc.8 [new file with mode: 0644]
lustre/doc/lctl.8
lustre/doc/lfs-pcc-detach.1 [new file with mode: 0644]
lustre/doc/lfs-pcc.1 [new file with mode: 0644]
lustre/doc/lfs.1
lustre/doc/llapi_pcc_attach.3 [new file with mode: 0644]
lustre/doc/llapi_pcc_attach_fid.3 [new file with mode: 0644]
lustre/doc/llapi_pcc_attach_fid_str.3 [new file with mode: 0644]
lustre/doc/llapi_pcc_detach_fid.3 [new file with mode: 0644]
lustre/doc/llapi_pcc_detach_fid_fd.3 [new file with mode: 0644]
lustre/doc/llapi_pcc_detach_fid_str.3 [new file with mode: 0644]
lustre/doc/llapi_pcc_detach_file.3 [new file with mode: 0644]
lustre/doc/llapi_pcc_state_get.3 [new file with mode: 0644]
lustre/doc/llapi_pcc_state_get_fd.3 [new file with mode: 0644]
lustre/doc/llapi_pccdev_get.3 [new file with mode: 0644]
lustre/doc/llapi_pccdev_set.3 [new file with mode: 0644]
lustre/include/cl_object.h
lustre/include/lustre/lustreapi.h
lustre/include/lustre_compat.h
lustre/include/md_object.h
lustre/include/obd.h
lustre/include/obd_support.h
lustre/include/uapi/linux/lustre/lustre_idl.h
lustre/include/uapi/linux/lustre/lustre_user.h
lustre/llite/Makefile.in
lustre/llite/dir.c
lustre/llite/file.c
lustre/llite/llite_internal.h
lustre/llite/llite_lib.c
lustre/llite/llite_mmap.c
lustre/llite/lproc_llite.c
lustre/llite/namei.c
lustre/llite/pcc.c [new file with mode: 0644]
lustre/llite/pcc.h [new file with mode: 0644]
lustre/llite/super25.c
lustre/llite/vvp_object.c
lustre/llite/xattr.c
lustre/llite/xattr26.c
lustre/lmv/lmv_intent.c
lustre/lmv/lmv_obd.c
lustre/lov/lov_object.c
lustre/mdc/mdc_lib.c
lustre/mdd/mdd_dir.c
lustre/mdt/mdt_lib.c
lustre/mdt/mdt_open.c
lustre/tests/Makefile.am
lustre/tests/multiop.c
lustre/tests/replay-vbr.sh
lustre/tests/sanity-hsm.sh
lustre/tests/sanity-pcc.sh [new file with mode: 0644]
lustre/tests/sanity-quota.sh
lustre/tests/test-framework.sh
lustre/tests/test-groups/regression
lustre/utils/Makefile.am
lustre/utils/lctl.c
lustre/utils/lfs.c
lustre/utils/lhsmtool_posix.c
lustre/utils/liblustreapi_hsm.c
lustre/utils/liblustreapi_pcc.c [new file with mode: 0644]
lustre/utils/obd.c
lustre/utils/obdctl.h

index cf9a753..cc4e8b6 100644 (file)
@@ -638,6 +638,23 @@ fs_struct_seqcount, [
 ])
 ]) # LC_FS_STRUCT_SEQCOUNT
 
+# LC_DENTRY_PATH_RAW
+#
+# Kernel version 2.6.37 commit ec2447c278ee973d35f38e53ca16ba7f965ae33d
+# dentry_path_raw is exported
+#
+AC_DEFUN([LC_DENTRY_PATH_RAW], [
+LB_CHECK_COMPILE([if 'dentry_path_raw' exist],
+dentry_path_raw, [
+       #include <linux/dcache.h>
+],[
+       dentry_path_raw(NULL, NULL, 0);
+],[
+       AC_DEFINE(HAVE_DENTRY_PATH_RAW, 1,
+               ['dentry_path_raw' is available])
+])
+]) # LC_DENTRY_PATH_RAW
+
 #
 # LC_D_COMPARE_7ARGS
 #
@@ -1001,6 +1018,23 @@ security_inode_init_security_callback, [
 ]) # LC_HAVE_SECURITY_IINITSEC
 
 #
+# 2.6.39 vfs_create takes a 'struct nameidata' parameter
+#
+AC_DEFUN([LC_VFS_CREATE_USE_NAMEIDATA], [
+LB_CHECK_COMPILE([if vfs_create takes a struct nameidata parameter],
+vfs_create, [
+       #include <linux/namei.h>
+       #include <linux/fs.h>
+],[
+       struct nameidata *nd;
+       vfs_create(NULL, NULL, 0, nd);
+],[
+       AC_DEFINE(HAVE_VFS_CREATE_USE_NAMEIDATA, 1,
+               [vfs_create use nameidata as parameter])
+])
+]) # LC_VFS_CREATE_USE_NAMEIDATA
+
+#
 # LC_HAVE_MIGRATE_HEADER
 #
 # 3.3 introduces migrate_mode.h and migratepage has 4 args
@@ -1430,6 +1464,23 @@ is_sxid, [
 ]) # LC_HAVE_IS_SXID
 
 #
+# LC_HAVE_VFS_GETATTR_2ARGS
+#
+AC_DEFUN([LC_HAVE_VFS_GETATTR_2ARGS], [
+LB_CHECK_COMPILE([if vfs_getattr takes 2 args],
+vfs_getattr, [
+       #include <linux/fs.h>
+],[
+       struct path path;
+
+       vfs_getattr(&path, NULL);
+],[
+       AC_DEFINE(HAVE_VFS_GETATTR_2ARGS, 1,
+               [vfs_getattr takes 2 args])
+])
+]) # LC_HAVE_VFS_GETATTR_2ARGS
+
+#
 # LC_HAVE_REMOVE_PROC_SUBTREE
 #
 # 3.10 introduced remove_proc_subtree
@@ -1769,6 +1820,23 @@ vfs_unlink_3args, [
 ])
 ]) # LC_VFS_UNLINK_3ARGS
 
+# LC_HAVE_D_IS_POSITIVE
+#
+# Kernel version 3.13 b18825a7c8e37a7cf6abb97a12a6ad71af160de7
+# d_is_positive is added
+#
+AC_DEFUN([LC_HAVE_D_IS_POSITIVE], [
+LB_CHECK_COMPILE([if 'd_is_positive' exist],
+d_is_positive, [
+       #include <linux/dcache.h>
+],[
+       d_is_positive(NULL);
+],[
+       AC_DEFINE(HAVE_D_IS_POSITIVE, 1,
+               ['d_is_positive' is available])
+])
+]) # LC_HAVE_D_IS_POSITIVE
+
 #
 # LC_HAVE_BVEC_ITER
 #
@@ -3141,6 +3209,7 @@ AC_DEFUN([LC_PROG_LINUX], [
        # 2.6.37
        LC_KERNEL_LOCKED
        LC_FS_STRUCT_SEQCOUNT
+       LC_DENTRY_PATH_RAW
 
        # 2.6.38
        LC_BLKDEV_GET_BY_DEV
@@ -3156,6 +3225,7 @@ AC_DEFUN([LC_PROG_LINUX], [
        LC_HAVE_FSTYPE_MOUNT
        LC_HAVE_INODE_OWNER_OR_CAPABLE
        LC_HAVE_SECURITY_IINITSEC
+       LC_VFS_CREATE_USE_NAMEIDATA
 
        # 3.0
        LC_DIRTY_INODE_WITH_FLAG
@@ -3207,6 +3277,7 @@ AC_DEFUN([LC_PROG_LINUX], [
        LC_HAVE_HLIST_FOR_EACH_3ARG
        LC_HAVE_BIO_END_SECTOR
        LC_HAVE_IS_SXID
+       LC_HAVE_VFS_GETATTR_2ARGS
 
        # 3.10
        LC_HAVE_REMOVE_PROC_SUBTREE
@@ -3233,6 +3304,7 @@ AC_DEFUN([LC_PROG_LINUX], [
        # 3.13
        LC_VFS_RENAME_5ARGS
        LC_VFS_UNLINK_3ARGS
+       LC_HAVE_D_IS_POSITIVE
 
        # 3.14
        LC_HAVE_BVEC_ITER
index b18525c..987b25f 100644 (file)
@@ -53,6 +53,7 @@ MANFILES =                                    \
        lfs-mirror-split.1                      \
        lfs-mirror-verify.1                     \
        lfs-mkdir.1                             \
+       lfs-pcc.1                               \
        lfs-setdirstripe.1                      \
        lfs-setstripe.1                         \
        lfs-setquota.1                          \
diff --git a/lustre/doc/lctl-pcc.8 b/lustre/doc/lctl-pcc.8
new file mode 100644 (file)
index 0000000..759a670
--- /dev/null
@@ -0,0 +1,52 @@
+.TH lctl-pcc 8 2019-04-15 "Lustre" "configuration Utilities"
+.SH NAME
+lctl-pcc \- commands used to interact with PCC features
+.SH SYNOPSIS
+.B lctl pcc add \fR<\fImntpath\fR> <\fIpccpath\fR> [\fB--param\fR|\fB-p\fR <\fIparam\fR>]
+.br
+.B lctl pcc del <\fImntpath\fR> <\fIpccpath\fR>
+.br
+.B lctl pcc clear <\fImntpath\fR>
+.br
+.B lctl pcc list <\fImntpath\fR>
+.SH DESCRIPTION
+.TP
+.B lctl pcc add \fR<\fImntpath\fR> <\fIpccpath\fR> [\fB--param\fR|\fB-p\fR <\fIparam\fR>]
+Add a PCC backend specified by HSM root path
+.IR pccpath
+on a Lustre filesystem client instance with the mount point referenced by
+.IR mntpath .
+The parameter
+.IR param
+is a string of name-value pairs used to configure the PCC backend, such as the
+read-write attach ID (archive ID), the read-only attach ID, and the auto
+caching rule. For example, in the string "projid={500}&fname={*.h5} rwid=2",
+the first substring of the config parameter is the auto caching rule, where
+"&" represents the logical conjunction operator and "," represents the logical
+disjunction operator. The example rule means that new files are only auto
+cached if the project ID is 500 and the suffix of the file name is "h5".
+"rwid" represents the read-write attach ID (2), whose value is the same as the
+archive ID of the copytool agent running on this PCC node.
+.TP
+.B lctl pcc del <\fImntpath\fR> <\fIpccpath\fR>
+Delete a PCC backend specified by path
+.IR pccpath
+on a Lustre client referenced by the mount point of
+.IR mntpath .
+.TP
+.B lctl pcc clear <\fImntpath\fR>
+Remove all PCC backends on a Lustre client referenced by the mount point of
+.IR mntpath .
+.TP
+.B lctl pcc list <\fImntpath\fR>
+List all PCC backends on a Lustre client referenced by the mount point of
+.IR mntpath .
+.SH OPTIONS
+.TP
+.B --param | -p
+Specifies the configuration parameters for a PCC backend.
+.TP
+.SH SEE ALSO
+.BR lfs (1),
+.BR lfs-hsm (1),
+.BR lfs-pcc (1)
index 84e6920..b3be0e8 100644 (file)
@@ -593,4 +593,5 @@ filesystem package.
 .BR lctl-nodemap-del-range (8),
 .BR lctl-nodemap-del (8),
 .BR lctl-nodemap-modify (8),
+.BR lctl-pcc (8),
 .BR lfs (1)
diff --git a/lustre/doc/lfs-pcc-detach.1 b/lustre/doc/lfs-pcc-detach.1
new file mode 100644 (file)
index 0000000..7ac48c9
--- /dev/null
@@ -0,0 +1,44 @@
+.TH LFS-PCC-DETACH 1 2019-04-15 "Lustre" "Lustre Utilities"
+.SH NAME
+lfs pcc detach|detach_fid \- Detach given files from PCC
+.SH SYNOPSIS
+.B lfs pcc detach [\fB--keep\fR|\fB-k\fR] <\fIfile \fR...>
+.br
+.B lfs pcc detach_fid [\fB--keep\fR|\fB-k\fR] <\fImntpath\fR> <\fIfid \fR...>
+.SH DESCRIPTION
+.TP
+.B lfs pcc detach [\fB--keep\fR|\fB-k\fR] <\fIfile \fR...>
+Detach given files from the persistent client cache.
+.TP
+.B lfs pcc detach_fid [\fB--keep\fR|\fB-k\fR] <\fImntpath\fR> <\fIfid \fR...>
+Detach files from the persistent client cache by FID(s).
+.SH OPTIONS
+.TP
+.B --keep | -k
+By default, the detach command will detach the file from PCC permanently and
+remove the PCC copy after detach. This option will only detach the file, but
+keep the PCC copy in cache. This allows the detached file to be attached again
+automatically at the next open if the cached copy of the file is still valid.
+.SH EXAMPLES
+.TP
+.B $ lfs pcc detach /mnt/lustre/test
+Detach the file permanently from PCC. The cached file on PCC will be removed
+after detach. I/O to the file will go to the Lustre OSTs after this command.
+.TP
+.B $ lfs pcc detach_fid /mnt/lustre 0x200000401:0x1:0x0
+Detach the file referenced by FID "0x200000401:0x1:0x0" from PCC permanently, and
+the cached file on PCC will be removed after detach.
+.TP
+.B $ lfs pcc detach -k /mnt/lustre/test
+Detach the file "/mnt/lustre/test" from PCC. The client will try to attach
+this file again at the next open if the cached copy is still valid.
+.TP
+.B $ lfs pcc detach_fid -k /mnt/lustre 0x200000401:0x1:0x0
+Detach the file referenced by FID "0x200000401:0x1:0x0" from PCC. The client
+will try to attach this file again at the next open if the cached copy is still
+valid.
+.SH SEE ALSO
+.BR lfs (1),
+.BR lfs-hsm (1),
+.BR lfs-pcc (1),
+.BR lctl-pcc (8)
diff --git a/lustre/doc/lfs-pcc.1 b/lustre/doc/lfs-pcc.1
new file mode 100644 (file)
index 0000000..886a96d
--- /dev/null
@@ -0,0 +1,90 @@
+.TH LFS-PCC 1 2019-04-15 "Lustre" "Lustre Utilities"
+.SH NAME
+lfs-pcc \- commands used to interact with the Persistent Client Cache (PCC)
+.SH SYNOPSIS
+.B lfs pcc attach <\fB--id\fR|\fB-i\fR \fINUM\fR>  <\fIfile \fR...>
+.br
+.B lfs pcc attach_fid <\fB--id\fR|\fB-i\fR \fINUM\fR>  <\fB--mnt\fR|\fB-m\fR \fImntpath\fR> <\fIfid \fR...>
+.br
+.B lfs pcc state <\fIfile \fR...>
+.SH DESCRIPTION
+.TP
+.B lfs pcc attach <\fB--id\fR|\fB-i\fR \fINUM\fR>  <\fIfile \fR...>
+Attach given files into the persistent client cache. Use
+.B lfs pcc detach
+to remove the cached files from PCC either manually, or through automatic
+mechanisms for the purpose of the cache space management.
+.TP
+.B lfs pcc attach_fid <\fB--id\fR|\fB-i\fR \fINUM\fR>  <\fB--mnt\fR|\fB-m\fR \fImntpath\fR> <\fIfid \fR...>
+Attach given files into the persistent client cache by FID(s).
+.TP
+.B lfs pcc state <\fIfile \fR...>
+Display the PCC state for given files.
+.TP
+.SH OPTIONS
+.TP
+.B --id | -i
+For RW-PCC, this is the HSM archive ID that selects which backend caches the files.
+.TP
+.B --mnt | -m
+Specify the Lustre mount point.
+.TP
+Before using RW-PCC, you need to configure HSM root and Archive ID mapping properly:
+.TP
+.B lctl pcc add $MNTPATH $PCCPATH -p\ "$PARAM"
+Add one PCC backend to the Lustre client. For RW-PCC, when a file is being
+created, a rule-based policy is used to determine whether it will be cached.
+The rule expression supports logical conditional conjunction and disjunction
+operations among different users, groups, projects, or filenames including
+wildcards. You need to specify the auto-create caching rule and archive ID in
+.B $PARAM.
+On this client, any subsequently created files matching the condition of the
+auto caching rule will be persistently cached automatically.
+.TP
+.B lctl pcc del $MNTPATH $PCCPATH
+Delete one PCC backend.
+.TP
+.B lctl pcc clear $MNTPATH
+Clear and remove all PCC backends for the client.
+.TP
+.SH EXAMPLES
+.TP
+.B # lctl set_param mdt.$FSNAME-MDT0000.hsm_control=enabled
+Enable HSM on the appropriate MDT.
+.TP
+.B # lhsmtool_posix --daemon --hsm-root /mnt/pcc/ --archive=1 /mnt/lustre
+Launch one copytool on client node to connect cache storage.
+.TP
+.B # lctl pcc add /mnt/lustre /mnt/pcc -p\ "projid={500,1000}&fname={*.h5},uid=1001 rwid=1"
+Add HSM root and Archive ID (referenced by
+.IB rwid
+name-value pair) mapping for RW-PCC. Here "&" represents the logical
+conjunction operator while "," represents the logical disjunction operator.
+The example rule means that new files are only auto cached if the project ID is
+either 500 or 1000 and the suffix of the file name is "h5" or the user ID is
+1001.
+.TP
+.B $ lfs pcc attach -i 1 /mnt/lustre/file
+Attach an existing file into PCC and migrate its data from Lustre to the cache
+device; any subsequent I/O to the Lustre file is directed to the RW-PCC copy.
+.TP
+.B $ lfs pcc attach_fid -i 1 -m /mnt/lustre 0x200000401:0x1:0x0
+Attach an existing file referenced by FID "0x200000401:0x1:0x0" into PCC.
+.TP
+.B $ lfs pcc state /mnt/lustre/file
+.br
+file: /mnt/lustre/file, type: readwrite, PCC file: /mnt/pcc/0004/0000/0bd1/0000/0002/0000/0x200000bd1:0x4:0x0, user number: 1, flags: 6
+.br
+Display the PCC state of the file "/mnt/lustre/file".
+.TP
+.B $ lfs pcc state /mnt/lustre/file
+.br
+file: /mnt/lustre/file, type: readwrite, PCC file: /mnt/pcc/0004/0000/0bd1/0000/0002/0000/0x200000bd1:0x4:0x0, user number: 1, flags: 6
+.br
+Display the PCC state of the file "/mnt/lustre/file".
+.TP
+.SH SEE ALSO
+.BR lfs (1),
+.BR lfs-hsm (1),
+.BR lfs-pcc-detach (1),
+.BR lctl-pcc (8)
index 5b1177b..d508e0d 100644 (file)
@@ -319,4 +319,5 @@ The lfs command is part of the Lustre filesystem.
 .BR lfs-setdirstripe (1),
 .BR lfs-setquota (1),
 .BR lfs-setstripe (1),
+.BR lfs-pcc (1),
 .BR lustre (7)
diff --git a/lustre/doc/llapi_pcc_attach.3 b/lustre/doc/llapi_pcc_attach.3
new file mode 100644 (file)
index 0000000..a283559
--- /dev/null
@@ -0,0 +1,74 @@
+.TH llapi_pcc_attach 3 "2019 April 20" "Lustre User API"
+.SH NAME
+llapi_pcc_attach, llapi_pcc_attach_fid, llapi_pcc_attach_fid_str \- attach a file into PCC
+.SH SYNOPSIS
+.nf
+.B #include <lustre/lustreapi.h>
+.PP
+.BI "int llapi_pcc_attach(const char *" path ", __u32 " id ,
+.BI "                     enum lu_pcc_type " type ");"
+.PP
+.BI "int llapi_pcc_attach_fid(const char *" mntpath ", const struct lu_fid *" fid ,
+.BI "                         __u32 " id ", enum lu_pcc_type " type ");"
+.PP
+.BI "int llapi_pcc_attach_fid_str(const char *" mntpath ", const char *" fidstr ,
+.BI "                             __u32 " id ", enum lu_pcc_type " type ");"
+.fi
+.SH DESCRIPTION
+.PP
+The functions
+.BR llapi_pcc_attach() ,
+.BR llapi_pcc_attach_fid() ,
+and
+.BR llapi_pcc_attach_fid_str()
+try to attach the file referenced by
+.IR path ,
+.IR fid ,
+or
+.IR fidstr
+into PCC backend. PCC provides a group of local caches and works in two modes:
+RW-PCC enables a read-write cache on the local SSDs of a single client; RO-PCC
+provides a read-only cache on the local SSDs of multiple clients. For RW-PCC,
+the argument
+.I id
+is the archive ID of the copytool agent running on this client. By default,
+the RO-PCC attach ID is set to the same value as the RW-PCC attach ID for a PCC
+backend that is also used for read-only caching. The attach mode is specified by
+.I type
+argument, which is a
+.B enum lu_pcc_type
+data structure, which contains the following values:
+.nf
+.LP
+       LU_PCC_NONE
+       LU_PCC_READWRITE
+       LU_PCC_READONLY
+.fi
+.TP
+LU_PCC_NONE
+means that the file is not cached on PCC.
+.TP
+LU_PCC_READWRITE
+means RW-PCC mode.
+.TP
+LU_PCC_READONLY
+means RO-PCC mode.
+.SH RETURN VALUES
+.PP
+.B llapi_pcc_attach()
+returns 0 on success or a negative errno value on failure.
+.SH ERRORS
+.TP 15
+.SM -ENOMEM
+Insufficient memory to complete operation.
+.TP
+.SM -EFAULT
+Memory region is not properly mapped.
+.TP
+.SM -EINVAL
+One or more invalid arguments are given.
+.TP
+.SM -EOPNOTSUPP
+PCC attach operation is not supported.
+.SH "SEE ALSO"
+.BR lustreapi (7)
diff --git a/lustre/doc/llapi_pcc_attach_fid.3 b/lustre/doc/llapi_pcc_attach_fid.3
new file mode 100644 (file)
index 0000000..2719276
--- /dev/null
@@ -0,0 +1 @@
+.so man3/llapi_pcc_attach.3
diff --git a/lustre/doc/llapi_pcc_attach_fid_str.3 b/lustre/doc/llapi_pcc_attach_fid_str.3
new file mode 100644 (file)
index 0000000..2719276
--- /dev/null
@@ -0,0 +1 @@
+.so man3/llapi_pcc_attach.3
diff --git a/lustre/doc/llapi_pcc_detach_fid.3 b/lustre/doc/llapi_pcc_detach_fid.3
new file mode 100644 (file)
index 0000000..1330fa6
--- /dev/null
@@ -0,0 +1 @@
+.so man3/llapi_pcc_detach_fid_fd.3
diff --git a/lustre/doc/llapi_pcc_detach_fid_fd.3 b/lustre/doc/llapi_pcc_detach_fid_fd.3
new file mode 100644 (file)
index 0000000..0afe243
--- /dev/null
@@ -0,0 +1,65 @@
+.TH llapi_pcc_detach_fid_fd 3 "2019 April 20" "Lustre User API"
+.SH NAME
+llapi_pcc_detach_fid_fd, llapi_pcc_detach_fid, llapi_pcc_detach_fid_str,
+llapi_pcc_detach_file \- detach the given file from PCC
+.SH SYNOPSIS
+.nf
+.B #include <lustre/lustreapi.h>
+.PP
+.BI "int llapi_pcc_detach_fid_fd(int " dirfd ", const struct lu_fid *" fid ");"
+.PP
+.BI "int llapi_pcc_detach_fid(const char *" mntpath ", const struct lu_fid *" fid ");"
+.PP
+.BI "int llapi_pcc_detach_fid_str(const char *" mntpath ", const char *" fidstr ");"
+.PP
+.BI "int llapi_pcc_detach_file(const char *" path ");"
+.fi
+.SH DESCRIPTION
+.PP
+.BR llapi_pcc_detach_fid_fd() ,
+.BR llapi_pcc_detach_fid() ,
+.BR llapi_pcc_detach_fid_str() ,
+and
+.BR llapi_pcc_detach_file()
+detach a cached file from PCC by an ioctl on the directory. The file is referenced
+by
+.IR fid ,
+.IR fidstr ,
+or
+.IR path .
+The directory, which is usually a mount point that the copytool already has open,
+is referenced by
+.IR dirfd ,
+.IR mntpath ", or"
+.IR path .
+.SH RETURN VALUES
+.LP
+.BR llapi_pcc_detach_fid_fd() ,
+.BR llapi_pcc_detach_fid() ,
+.BR llapi_pcc_detach_fid_str() ,
+and
+.B llapi_pcc_detach_file()
+return 0 on success or a negative errno value on failure.
+.SH ERRORS
+.TP 15
+.SM -ENOMEM
+Insufficient memory to complete operation.
+.TP
+.SM -EFAULT
+Memory region is not properly mapped.
+.TP
+.SM -EINVAL
+One or more invalid arguments are given.
+.TP
+.SM -EOPNOTSUPP
+PCC state operation is not supported.
+.TP
+.SM -ENOTTY
+File does not reside on a Lustre filesystem.
+.TP
+.SM -ENOENT
+.I path
+does not exist.
+.SH "SEE ALSO"
+.BR llapi_pcc_attach (3),
+.BR lustreapi (7)
diff --git a/lustre/doc/llapi_pcc_detach_fid_str.3 b/lustre/doc/llapi_pcc_detach_fid_str.3
new file mode 100644 (file)
index 0000000..1330fa6
--- /dev/null
@@ -0,0 +1 @@
+.so man3/llapi_pcc_detach_fid_fd.3
diff --git a/lustre/doc/llapi_pcc_detach_file.3 b/lustre/doc/llapi_pcc_detach_file.3
new file mode 100644 (file)
index 0000000..1330fa6
--- /dev/null
@@ -0,0 +1 @@
+.so man3/llapi_pcc_detach_fid_fd.3
diff --git a/lustre/doc/llapi_pcc_state_get.3 b/lustre/doc/llapi_pcc_state_get.3
new file mode 100644 (file)
index 0000000..2d2a9d2
--- /dev/null
@@ -0,0 +1,73 @@
+.TH llapi_pcc_state_get 3 "2019 April 20" "Lustre User API"
+.SH NAME
+llapi_pcc_state_get, llapi_pcc_state_get_fd \- get the current PCC state
+related to a file
+.SH SYNOPSIS
+.nf
+.B #include <lustre/lustreapi.h>
+.PP
+.BI "int llapi_pcc_state_get(const char *" path ", struct lu_pcc_state *" state ");"
+.PP
+.BI "int llapi_pcc_state_get_fd(int " fd ", struct lu_pcc_state *" state ");"
+.fi
+.SH DESCRIPTION
+.PP
+The functions
+.BR llapi_pcc_state_get()
+and
+.BR llapi_pcc_state_get_fd()
+return the PCC state information for the file referenced by
+.IR path
+or
+.IR fd .
+Information is returned in the
+.IR state
+argument which should be already allocated, which is a
+.B lu_pcc_state
+data structure, which contains the following fields:
+.nf
+.LP
+struct lu_pcc_state {
+       __u32   pccs_type; /* enum lu_pcc_type */
+       __u32   pccs_open_count;
+       __u32   pccs_flags; /* enum lu_pcc_state_flags */
+       __u32   pccs_padding;
+       char    pccs_path[PATH_MAX];
+};
+.fi
+.TP
+.I pccs_type
+specifies the PCC mode for the given file, which is actually an
+.B lu_pcc_type
+data structure.
+.TP
+.I pccs_open_count
+indicates the opener count for the given file on the client.
+.TP
+.I pccs_flags
+contains the PCC flags for the given file; not used currently.
+.TP
+.I pccs_path
+is the full path of the cached file on the PCC backend.
+.SH RETURN VALUES
+.PP
+.B llapi_pcc_state_get()
+and
+.B llapi_pcc_state_get_fd()
+return 0 on success or a negative errno value on failure.
+.SH ERRORS
+.TP 15
+.SM -ENOMEM
+Insufficient memory to complete operation.
+.TP
+.SM -EFAULT
+Memory region is not properly mapped.
+.TP
+.SM -EINVAL
+One or more invalid arguments are given.
+.TP
+.SM -EOPNOTSUPP
+PCC state operation is not supported.
+.SH "SEE ALSO"
+.BR llapi_pcc_attach (3),
+.BR lustreapi (7)
diff --git a/lustre/doc/llapi_pcc_state_get_fd.3 b/lustre/doc/llapi_pcc_state_get_fd.3
new file mode 100644 (file)
index 0000000..da4b39f
--- /dev/null
@@ -0,0 +1 @@
+.so man3/llapi_pcc_state_get.3
diff --git a/lustre/doc/llapi_pccdev_get.3 b/lustre/doc/llapi_pccdev_get.3
new file mode 100644 (file)
index 0000000..b317243
--- /dev/null
@@ -0,0 +1,36 @@
+.TH llapi_pccdev_get 3 "2019 April 20" "Lustre User API"
+.SH NAME
+llapi_pccdev_get \- List all PCC backends on a client
+.SH SYNOPSIS
+.nf
+.B #include <lustre/lustreapi.h>
+.PP
+.BI "int llapi_pccdev_get(const char *" path ");"
+.fi
+.SH DESCRIPTION
+.PP
+The function
+.BR llapi_pccdev_get()
+lists all PCC backends on the client with the mount point referenced by
+.IR path ,
+and outputs the results to stdout in YAML format.
+.SH RETURN VALUES
+.PP
+.B llapi_pccdev_get()
+returns 0 on success or a negative errno value on failure.
+.SH ERRORS
+.TP 15
+.SM -ENOMEM
+Insufficient memory to complete operation.
+.TP
+.SM -EFAULT
+Memory region is not properly mapped.
+.TP
+.SM -EINVAL
+One or more invalid arguments are given.
+.TP
+.SM -EOPNOTSUPP
+PCC backend operation is not supported.
+.SH "SEE ALSO"
+.BR llapi_pccdev_set (3),
+.BR lustreapi (7)
diff --git a/lustre/doc/llapi_pccdev_set.3 b/lustre/doc/llapi_pccdev_set.3
new file mode 100644 (file)
index 0000000..14f1010
--- /dev/null
@@ -0,0 +1,48 @@
+.TH llapi_pccdev_set 3 "2019 April 20" "Lustre User API"
+.SH NAME
+llapi_pccdev_set \- Add/delete a PCC backend on a client
+.SH SYNOPSIS
+.nf
+.B #include <lustre/lustreapi.h>
+.PP
+.BI "int llapi_pccdev_set(const char *" path ", const char *" cmd ");"
+.fi
+.SH DESCRIPTION
+.PP
+The function
+.BR llapi_pccdev_set()
+adds or deletes a PCC backend on the client with the mount point referenced by
+.IR path .
+The input argument
+.IR cmd
+could be in the following forms:
+.TP
+.B \ "add\ $PCCPATH\ $PARAM"
+Add a PCC backend referenced by the HSM root path
+.IR $PCCPATH .
+.TP
+.B \ "del\ $PCCPATH"
+Delete a PCC backend referenced by the HSM root path
+.IR $PCCPATH .
+.TP
+.B \ "clear"
+Clear and remove all PCC backends on a client.
+.SH RETURN VALUES
+.PP
+.B llapi_pccdev_set()
+returns 0 on success or a negative errno value on failure.
+.SH ERRORS
+.TP 15
+.SM -ENOMEM
+Insufficient memory to complete operation.
+.TP
+.SM -EFAULT
+Memory region is not properly mapped.
+.TP
+.SM -EINVAL
+One or more invalid arguments are given.
+.TP
+.SM -EOPNOTSUPP
+PCC backend operation is not supported.
+.SH "SEE ALSO"
+.BR lustreapi (7)
index 413c5f9..b4e5571 100644 (file)
@@ -296,6 +296,8 @@ struct cl_layout {
        u32             cl_layout_gen;
        /** whether layout is a composite one */
        bool            cl_is_composite;
+       /** Whether the layout is an HSM-released one */
+       bool            cl_is_released;
 };
 
 /**
index 2ae985c..420b606 100644 (file)
@@ -518,6 +518,23 @@ int llapi_group_unlock(int fd, int gid);
 /* Ladvise */
 int llapi_ladvise(int fd, unsigned long long flags, int num_advise,
                  struct llapi_lu_ladvise *ladvise);
+
+/* PCC */
+int llapi_pcc_attach(const char *path, __u32 id, enum lu_pcc_type type);
+int llapi_pcc_attach_fid(const char *mntpath, const struct lu_fid *fid,
+                        __u32 id, enum lu_pcc_type type);
+int llapi_pcc_attach_fid_str(const char *mntpath, const char *fidstr,
+                            __u32 id, enum lu_pcc_type type);
+int llapi_pcc_detach_fd(int fd, __u32 option);
+int llapi_pcc_detach_fid(const char *mntpath, const struct lu_fid *fid,
+                        __u32 option);
+int llapi_pcc_detach_fid_str(const char *mntpath, const char *fidstr,
+                            __u32 option);
+int llapi_pcc_detach_file(const char *path, __u32 option);
+int llapi_pcc_state_get_fd(int fd, struct lu_pcc_state *state);
+int llapi_pcc_state_get(const char *path, struct lu_pcc_state *state);
+int llapi_pccdev_set(const char *mntpath, const char *cmd);
+int llapi_pccdev_get(const char *mntpath);
 /** @} llapi */
 
 /* llapi_layout user interface */
index 1b74aa4..284f4ee 100644 (file)
@@ -378,6 +378,37 @@ static inline struct inode *file_inode(const struct file *file)
 #define ll_vfs_unlink(a, b) vfs_unlink(a, b)
 #endif
 
+#ifndef HAVE_INODE_OWNER_OR_CAPABLE
+#define inode_owner_or_capable(inode) is_owner_or_cap(inode)
+#endif
+
+static inline int ll_vfs_getattr(struct path *path, struct kstat *st)
+{
+       int rc;
+
+#ifdef HAVE_INODEOPS_ENHANCED_GETATTR
+       rc = vfs_getattr(path, st, STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT);
+#elif defined HAVE_VFS_GETATTR_2ARGS
+       rc = vfs_getattr(path, st);
+#else
+       rc = vfs_getattr(path->mnt, path->dentry, st);
+#endif
+       return rc;
+}
+
+#ifndef HAVE_D_IS_POSITIVE
+static inline bool d_is_positive(const struct dentry *dentry)
+{
+       return dentry->d_inode != NULL;
+}
+#endif
+
+#ifdef HAVE_VFS_CREATE_USE_NAMEIDATA
+# define LL_VFS_CREATE_FALSE NULL
+#else
+# define LL_VFS_CREATE_FALSE false
+#endif
+
 #ifndef HAVE_INODE_LOCK
 # define inode_lock(inode) mutex_lock(&(inode)->i_mutex)
 # define inode_unlock(inode) mutex_unlock(&(inode)->i_mutex)
index d245dd1..c43539a 100644 (file)
@@ -158,6 +158,9 @@ struct md_op_spec {
        void            *sp_cr_file_secctx; /* xattr value */
        size_t           sp_cr_file_secctx_size; /* xattr value size */
 
+       /* Archive ID used for auto PCC attach when creating new files. */
+       __u32            sp_archive_id;
+
        /** don't create lov objects or llog cookie - this replay */
        unsigned int no_create:1,
                     sp_cr_lookup:1, /* do lookup sanity check or not. */
index cc15c9f..5627685 100644 (file)
@@ -903,6 +903,8 @@ struct md_op_data {
        bool                    op_post_migrate;
        /* used to access dir with bash hash */
        __u32                   op_stripe_index;
+       /* Archive ID for PCC attach */
+       __u32                   op_archive_id;
 };
 
 struct md_callback {
index 0aea7c2..d488249 100644 (file)
@@ -568,6 +568,10 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_LLITE_IMUTEX_SEC                  0x140e
 #define OBD_FAIL_LLITE_IMUTEX_NOSEC                0x140f
 #define OBD_FAIL_LLITE_OPEN_BY_NAME                0x1410
+#define OBD_FAIL_LLITE_PCC_FAKE_ERROR              0x1411
+#define OBD_FAIL_LLITE_PCC_DETACH_MKWRITE          0x1412
+#define OBD_FAIL_LLITE_PCC_MKWRITE_PAUSE           0x1413
+#define OBD_FAIL_LLITE_PCC_ATTACH_PAUSE                    0x1414
 
 #define OBD_FAIL_FID_INDIR     0x1501
 #define OBD_FAIL_FID_INLMA     0x1502
index e628f43..85705d0 100644 (file)
@@ -898,7 +898,8 @@ struct ptlrpc_body_v2 {
                                OBD_CONNECT2_ARCHIVE_ID_ARRAY | \
                                OBD_CONNECT2_SELINUX_POLICY | \
                                OBD_CONNECT2_LSOM | \
-                               OBD_CONNECT2_ASYNC_DISCARD)
+                               OBD_CONNECT2_ASYNC_DISCARD | \
+                               OBD_CONNECT2_PCC)
 
 #define OST_CONNECT_SUPPORTED  (OBD_CONNECT_SRVLOCK | OBD_CONNECT_GRANT | \
                                OBD_CONNECT_REQPORTAL | OBD_CONNECT_VERSION | \
@@ -1952,6 +1953,7 @@ enum mds_op_bias {
        MDS_CLOSE_RESYNC_DONE   = 1 << 16,
        MDS_CLOSE_LAYOUT_SPLIT  = 1 << 17,
        MDS_TRUNC_KEEP_LEASE    = 1 << 18,
+       MDS_PCC_ATTACH          = 1 << 19,
 };
 
 #define MDS_CLOSE_INTENT (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP |         \
@@ -1974,7 +1976,10 @@ struct mdt_rec_create {
        struct lu_fid   cr_fid2;
        struct lustre_handle cr_open_handle_old; /* in case of open replay */
        __s64           cr_time;
-       __u64           cr_rdev;
+       union {
+               __u64           cr_rdev;
+               __u32           cr_archive_id;
+       };
        __u64           cr_ioepoch;
        __u64           cr_padding_1;   /* rr_blocks */
        __u32           cr_mode;
@@ -3547,6 +3552,8 @@ struct close_data {
                struct close_data_resync_done   cd_resync;
                /* split close */
                __u16                           cd_mirror_id;
+               /* PCC release */
+               __u32                           cd_archive_id;
        };
 };
 
index 0bad5c7..7d321f8 100644 (file)
@@ -393,6 +393,7 @@ enum ll_lease_flags {
        LL_LEASE_RESYNC_DONE    = 0x2,
        LL_LEASE_LAYOUT_MERGE   = 0x4,
        LL_LEASE_LAYOUT_SPLIT   = 0x8,
+       LL_LEASE_PCC_ATTACH     = 0x10,
 };
 
 #define IOC_IDS_MAX    4096
@@ -481,6 +482,9 @@ struct ll_ioc_lease_id {
 #define LL_IOC_LADVISE                 _IOR('f', 250, struct llapi_lu_ladvise)
 #define LL_IOC_HEAT_GET                        _IOWR('f', 251, struct lu_heat)
 #define LL_IOC_HEAT_SET                        _IOW('f', 251, __u64)
+#define LL_IOC_PCC_DETACH              _IOW('f', 252, struct lu_pcc_detach)
+#define LL_IOC_PCC_DETACH_BY_FID       _IOW('f', 252, struct lu_pcc_detach_fid)
+#define LL_IOC_PCC_STATE               _IOR('f', 252, struct lu_pcc_state)
 
 #ifndef        FS_IOC_FSGETXATTR
 /*
@@ -1214,12 +1218,15 @@ enum la_valid {
 #define MDS_OPEN_RELEASE   02000000000000ULL /* Open the file for HSM release */
 
 #define MDS_OPEN_RESYNC    04000000000000ULL /* FLR: file resync */
+#define MDS_OPEN_PCC      010000000000000ULL /* PCC: auto RW-PCC cache attach
+                                             * for newly created file */
 
 /* lustre internal open flags, which should not be set from user space */
 #define MDS_OPEN_FL_INTERNAL (MDS_OPEN_HAS_EA | MDS_OPEN_HAS_OBJS |    \
                              MDS_OPEN_OWNEROVERRIDE | MDS_OPEN_LOCK |  \
                              MDS_OPEN_BY_FID | MDS_OPEN_LEASE |        \
-                             MDS_OPEN_RELEASE | MDS_OPEN_RESYNC)
+                             MDS_OPEN_RELEASE | MDS_OPEN_RESYNC |      \
+                             MDS_OPEN_PCC)
 
 
 /********* Changelogs **********/
@@ -2294,6 +2301,62 @@ struct lu_heat {
        __u64 lh_heat[0];
 };
 
+enum lu_pcc_type {
+       LU_PCC_NONE = 0,
+       LU_PCC_READWRITE,
+       LU_PCC_MAX
+};
+
+static inline const char *pcc_type2string(enum lu_pcc_type type)
+{
+       switch (type) {
+       case LU_PCC_NONE:
+               return "none";
+       case LU_PCC_READWRITE:
+               return "readwrite";
+       default:
+               return "fault";
+       }
+}
+
+struct lu_pcc_attach {
+       __u32 pcca_type; /* PCC type */
+       __u32 pcca_id; /* archive ID for readwrite, group ID for readonly */
+};
+
+enum lu_pcc_detach_opts {
+       PCC_DETACH_OPT_NONE = 0, /* Detach only, keep the PCC copy */
+       PCC_DETACH_OPT_UNCACHE, /* Remove the cached file after detach */
+};
+
+struct lu_pcc_detach_fid {
+       /* fid of the file to detach */
+       struct lu_fid   pccd_fid;
+       __u32           pccd_opt;
+};
+
+struct lu_pcc_detach {
+       __u32           pccd_opt;
+};
+
+enum lu_pcc_state_flags {
+       PCC_STATE_FL_NONE               = 0x0,
+       /* The inode attr is cached locally */
+       PCC_STATE_FL_ATTR_VALID         = 0x01,
+       /* The file is being attached into PCC */
+       PCC_STATE_FL_ATTACHING          = 0x02,
+       /* Allow to auto attach at open */
+       PCC_STATE_FL_OPEN_ATTACH        = 0x04,
+};
+
+struct lu_pcc_state {
+       __u32   pccs_type; /* enum lu_pcc_type */
+       __u32   pccs_open_count;
+       __u32   pccs_flags; /* enum lu_pcc_state_flags */
+       __u32   pccs_padding;
+       char    pccs_path[PATH_MAX];
+};
+
 #if defined(__cplusplus)
 }
 #endif
index a2414bd..98dd203 100644 (file)
@@ -9,10 +9,10 @@ lustre-objs += glimpse.o
 lustre-objs += lcommon_cl.o
 lustre-objs += lcommon_misc.o
 lustre-objs += vvp_dev.o vvp_page.o vvp_io.o vvp_object.o
-lustre-objs += range_lock.o
+lustre-objs += range_lock.o pcc.o
 
 EXTRA_DIST := $(lustre-objs:.o=.c) llite_internal.h rw26.c super25.c
-EXTRA_DIST += vvp_internal.h range_lock.h
+EXTRA_DIST += vvp_internal.h range_lock.h pcc.h
 
 @XATTR_HANDLER_TRUE@EXTRA_DIST += xattr26.c
 @XATTR_HANDLER_FALSE@EXTRA_DIST += xattr.c
index 06f092e..a2d3d4b 100644 (file)
@@ -1982,6 +1982,43 @@ migrate_free:
                RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
        case LL_IOC_FSSETXATTR:
                RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
+       case LL_IOC_PCC_DETACH_BY_FID: {
+               struct lu_pcc_detach_fid *detach;
+               struct lu_fid *fid;
+               struct inode *inode2;
+               unsigned long ino;
+
+               OBD_ALLOC_PTR(detach);
+               if (detach == NULL)
+                       RETURN(-ENOMEM);
+
+               if (copy_from_user(detach,
+                                  (const struct lu_pcc_detach_fid __user *)arg,
+                                  sizeof(*detach)))
+                       GOTO(out_detach, rc = -EFAULT);
+
+               fid = &detach->pccd_fid;
+               ino = cl_fid_build_ino(fid, ll_need_32bit_api(sbi));
+               inode2 = ilookup5(inode->i_sb, ino, ll_test_inode_by_fid, fid);
+               if (inode2 == NULL)
+                       /* Target inode is not in inode cache, and PCC file
+                        * has already been released, return immediately.
+                        */
+                       GOTO(out_detach, rc = 0);
+
+               if (!S_ISREG(inode2->i_mode))
+                       GOTO(out_iput, rc = -EINVAL);
+
+               if (!inode_owner_or_capable(inode2))
+                       GOTO(out_iput, rc = -EPERM);
+
+               rc = pcc_ioctl_detach(inode2, detach->pccd_opt);
+out_iput:
+               iput(inode2);
+out_detach:
+               OBD_FREE_PTR(detach);
+               RETURN(rc);
+       }
        default:
                RETURN(obd_iocontrol(cmd, sbi->ll_dt_exp, 0, NULL,
                                     (void __user *)arg));
index 4420486..c45090e 100644 (file)
@@ -58,6 +58,12 @@ struct split_param {
        __u16           sp_mirror_id;
 };
 
+struct pcc_param {
+       __u64   pa_data_version;
+       __u32   pa_archive_id;
+       __u32   pa_layout_gen;
+};
+
 static int
 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
 
@@ -73,6 +79,7 @@ static struct ll_file_data *ll_file_data_get(void)
                return NULL;
 
        fd->fd_write_failed = false;
+       pcc_file_init(&fd->fd_pcc_file);
 
        return fd;
 }
@@ -190,6 +197,17 @@ static int ll_close_inode_openhandle(struct inode *inode,
                break;
        }
 
+       case MDS_PCC_ATTACH: {
+               struct pcc_param *param = data;
+
+               LASSERT(data != NULL);
+               op_data->op_bias |= MDS_HSM_RELEASE | MDS_PCC_ATTACH;
+               op_data->op_archive_id = param->pa_archive_id;
+               op_data->op_data_version = param->pa_data_version;
+               op_data->op_lease_handle = och->och_lease_handle;
+               break;
+       }
+
        case MDS_HSM_RELEASE:
                LASSERT(data != NULL);
                op_data->op_bias |= MDS_HSM_RELEASE;
@@ -220,6 +238,12 @@ static int ll_close_inode_openhandle(struct inode *inode,
                body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
                if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
                        rc = -EBUSY;
+
+               if (bias & MDS_PCC_ATTACH) {
+                       struct pcc_param *param = data;
+
+                       param->pa_layout_gen = body->mbo_layout_gen;
+               }
        }
 
        ll_finish_md_op_data(op_data);
@@ -372,6 +396,8 @@ int ll_file_release(struct inode *inode, struct file *file)
                RETURN(0);
        }
 
+       pcc_file_release(inode, file);
+
        if (!S_ISDIR(inode->i_mode)) {
                if (lli->lli_clob != NULL)
                        lov_read_and_clear_async_rc(lli->lli_clob);
@@ -815,6 +841,11 @@ restart:
                if (rc)
                        GOTO(out_och_free, rc);
        }
+
+       rc = pcc_file_open(inode, file);
+       if (rc)
+               GOTO(out_och_free, rc);
+
        mutex_unlock(&lli->lli_och_mutex);
         fd = NULL;
 
@@ -839,6 +870,7 @@ out_och_free:
 out_openerr:
                if (lli->lli_opendir_key == fd)
                        ll_deauthorize_statahead(inode, fd);
+
                if (fd != NULL)
                        ll_file_data_put(fd);
         } else {
@@ -1619,6 +1651,22 @@ static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
        ssize_t result;
        ssize_t rc2;
        __u16 refcheck;
+       bool cached;
+
+       /**
+        * Currently when PCC read failed, we do not fall back to the
+        * normal read path, just return the error.
+        * The reason is that: for RW-PCC, the file data may be modified
+        * in the PCC and inconsistent with the data on OSTs (or file
+        * data has been removed from the Lustre file system), at this
+        * time, fallback to the normal read path may read the wrong
+        * data.
+        * TODO: for RO-PCC (readonly PCC), fall back to normal read
+        * path: read data from data copy on OSTs.
+        */
+       result = pcc_file_read_iter(iocb, to, &cached);
+       if (cached)
+               return result;
 
        ll_ras_enter(iocb->ki_filp);
 
@@ -1714,9 +1762,25 @@ static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
        struct lu_env *env;
        ssize_t rc_tiny = 0, rc_normal;
        __u16 refcheck;
+       bool cached;
+       int result;
 
        ENTRY;
 
+       /**
+        * When PCC write failed, we usually do not fall back to the normal
+        * write path, just return the error. But there is a special case when
+        * returned error code is -ENOSPC due to running out of space on PCC HSM
+        * backend. At this time, it will fall back to normal I/O path and
+        * retry the I/O. As the file is in HSM released state, it will restore
+        * the file data to OSTs first and redo the write again. And the
+        * restore process will revoke the layout lock and detach the file
+        * from PCC cache automatically.
+        */
+       result = pcc_file_write_iter(iocb, from, &cached);
+       if (cached && result != -ENOSPC && result != -EDQUOT)
+               return result;
+
        /* NB: we can't do direct IO for tiny writes because they use the page
         * cache, we can't do sync writes because tiny writes can't flush
         * pages, and we can't do append writes because we can't guarantee the
@@ -1893,15 +1957,22 @@ static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
                                    struct pipe_inode_info *pipe, size_t count,
                                    unsigned int flags)
 {
-        struct lu_env      *env;
-        struct vvp_io_args *args;
-        ssize_t             result;
-       __u16               refcheck;
+       struct lu_env *env;
+       struct vvp_io_args *args;
+       ssize_t result;
+       __u16 refcheck;
+       bool cached;
+
         ENTRY;
 
+       result = pcc_file_splice_read(in_file, ppos, pipe,
+                                     count, flags, &cached);
+       if (cached)
+               RETURN(result);
+
        ll_ras_enter(in_file);
 
-        env = cl_env_get(&refcheck);
+       env = cl_env_get(&refcheck);
         if (IS_ERR(env))
                 RETURN(PTR_ERR(env));
 
@@ -3107,13 +3178,16 @@ static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
        struct ll_inode_info    *lli = ll_i2info(inode);
        struct obd_client_handle *och = NULL;
        struct split_param sp;
-       bool lease_broken;
+       struct pcc_param param;
+       bool lease_broken = false;
        fmode_t fmode = 0;
        enum mds_op_bias bias = 0;
        struct file *layout_file = NULL;
        void *data = NULL;
        size_t data_size = 0;
-       long rc;
+       bool attached = false;
+       long rc, rc2 = 0;
+
        ENTRY;
 
        mutex_lock(&lli->lli_och_mutex);
@@ -3124,22 +3198,22 @@ static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
        mutex_unlock(&lli->lli_och_mutex);
 
        if (och == NULL)
-               GOTO(out, rc = -ENOLCK);
+               RETURN(-ENOLCK);
 
        fmode = och->och_flags;
 
        switch (ioc->lil_flags) {
        case LL_LEASE_RESYNC_DONE:
                if (ioc->lil_count > IOC_IDS_MAX)
-                       GOTO(out, rc = -EINVAL);
+                       GOTO(out_lease_close, rc = -EINVAL);
 
                data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
                OBD_ALLOC(data, data_size);
                if (!data)
-                       GOTO(out, rc = -ENOMEM);
+                       GOTO(out_lease_close, rc = -ENOMEM);
 
                if (copy_from_user(data, (void __user *)arg, data_size))
-                       GOTO(out, rc = -EFAULT);
+                       GOTO(out_lease_close, rc = -EFAULT);
 
                bias = MDS_CLOSE_RESYNC_DONE;
                break;
@@ -3147,19 +3221,19 @@ static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
                int fd;
 
                if (ioc->lil_count != 1)
-                       GOTO(out, rc = -EINVAL);
+                       GOTO(out_lease_close, rc = -EINVAL);
 
                arg += sizeof(*ioc);
                if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
-                       GOTO(out, rc = -EFAULT);
+                       GOTO(out_lease_close, rc = -EFAULT);
 
                layout_file = fget(fd);
                if (!layout_file)
-                       GOTO(out, rc = -EBADF);
+                       GOTO(out_lease_close, rc = -EBADF);
 
                if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
                                (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
-                       GOTO(out, rc = -EPERM);
+                       GOTO(out_lease_close, rc = -EPERM);
 
                data = file_inode(layout_file);
                bias = MDS_CLOSE_LAYOUT_MERGE;
@@ -3170,20 +3244,20 @@ static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
                int mirror_id;
 
                if (ioc->lil_count != 2)
-                       GOTO(out, rc = -EINVAL);
+                       GOTO(out_lease_close, rc = -EINVAL);
 
                arg += sizeof(*ioc);
                if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
-                       GOTO(out, rc = -EFAULT);
+                       GOTO(out_lease_close, rc = -EFAULT);
 
                arg += sizeof(__u32);
                if (copy_from_user(&mirror_id, (void __user *)arg,
                                   sizeof(__u32)))
-                       GOTO(out, rc = -EFAULT);
+                       GOTO(out_lease_close, rc = -EFAULT);
 
                layout_file = fget(fdv);
                if (!layout_file)
-                       GOTO(out, rc = -EBADF);
+                       GOTO(out_lease_close, rc = -EBADF);
 
                sp.sp_inode = file_inode(layout_file);
                sp.sp_mirror_id = (__u16)mirror_id;
@@ -3191,11 +3265,35 @@ static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
                bias = MDS_CLOSE_LAYOUT_SPLIT;
                break;
        }
+       case LL_LEASE_PCC_ATTACH:
+               if (ioc->lil_count != 1)
+                       RETURN(-EINVAL);
+
+               arg += sizeof(*ioc);
+               if (copy_from_user(&param.pa_archive_id, (void __user *)arg,
+                                  sizeof(__u32)))
+                       GOTO(out_lease_close, rc2 = -EFAULT);
+
+               rc2 = pcc_readwrite_attach(file, inode, param.pa_archive_id);
+               if (rc2)
+                       GOTO(out_lease_close, rc2);
+
+               attached = true;
+               /* Grab latest data version */
+               rc2 = ll_data_version(inode, &param.pa_data_version,
+                                    LL_DV_WR_FLUSH);
+               if (rc2)
+                       GOTO(out_lease_close, rc2);
+
+               data = &param;
+               bias = MDS_PCC_ATTACH;
+               break;
        default:
                /* without close intent */
                break;
        }
 
+out_lease_close:
        rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
        if (rc < 0)
                GOTO(out, rc);
@@ -3219,6 +3317,14 @@ out:
                if (layout_file)
                        fput(layout_file);
                break;
+       case LL_LEASE_PCC_ATTACH:
+               if (!rc)
+                       rc = rc2;
+               rc = pcc_readwrite_attach_fini(file, inode,
+                                              param.pa_layout_gen,
+                                              lease_broken, rc,
+                                              attached);
+               break;
        }
 
        if (!rc)
@@ -3742,6 +3848,52 @@ out_ladvise:
                rc = ll_heat_set(inode, flags);
                RETURN(rc);
        }
+       case LL_IOC_PCC_DETACH: {
+               struct lu_pcc_detach *detach;
+
+               OBD_ALLOC_PTR(detach);
+               if (detach == NULL)
+                       RETURN(-ENOMEM);
+
+               if (copy_from_user(detach,
+                                  (const struct lu_pcc_detach __user *)arg,
+                                  sizeof(*detach)))
+                       GOTO(out_detach_free, rc = -EFAULT);
+
+               if (!S_ISREG(inode->i_mode))
+                       GOTO(out_detach_free, rc = -EINVAL);
+
+               if (!inode_owner_or_capable(inode))
+                       GOTO(out_detach_free, rc = -EPERM);
+
+               rc = pcc_ioctl_detach(inode, detach->pccd_opt);
+out_detach_free:
+               OBD_FREE_PTR(detach);
+               RETURN(rc);
+       }
+       case LL_IOC_PCC_STATE: {
+               struct lu_pcc_state __user *ustate =
+                       (struct lu_pcc_state __user *)arg;
+               struct lu_pcc_state *state;
+
+               OBD_ALLOC_PTR(state);
+               if (state == NULL)
+                       RETURN(-ENOMEM);
+
+               if (copy_from_user(state, ustate, sizeof(*state)))
+                       GOTO(out_state, rc = -EFAULT);
+
+               rc = pcc_ioctl_state(file, inode, state);
+               if (rc)
+                       GOTO(out_state, rc);
+
+               if (copy_to_user(ustate, state, sizeof(*state)))
+                       GOTO(out_state, rc = -EFAULT);
+
+out_state:
+               OBD_FREE_PTR(state);
+               RETURN(rc);
+       }
        default:
                RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
                                     (void __user *)arg));
@@ -3940,6 +4092,7 @@ int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
        struct ll_inode_info *lli = ll_i2info(inode);
        struct ptlrpc_request *req;
        int rc, err;
+
        ENTRY;
 
        CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
@@ -3977,8 +4130,15 @@ int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
 
        if (S_ISREG(inode->i_mode)) {
                struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+               bool cached;
 
-               err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
+               /* Sync metadata on MDT first, and then sync the cached data
+                * on PCC.
+                */
+               err = pcc_fsync(file, start, end, datasync, &cached);
+               if (!cached)
+                       err = cl_sync_file_range(inode, start, end,
+                                                CL_FSYNC_ALL, 0);
                if (rc == 0 && err < 0)
                        rc = err;
                if (rc < 0)
@@ -4503,27 +4663,8 @@ static int ll_merge_md_attr(struct inode *inode)
        RETURN(0);
 }
 
-static inline dev_t ll_compat_encode_dev(dev_t dev)
+int ll_getattr_dentry(struct dentry *de, struct kstat *stat)
 {
-       /* The compat_sys_*stat*() syscalls will fail unless the
-        * device majors and minors are both less than 256. Note that
-        * the value returned here will be passed through
-        * old_encode_dev() in cp_compat_stat(). And so we are not
-        * trying to return a valid compat (u16) device number, just
-        * one that will pass the old_valid_dev() check. */
-
-       return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
-}
-
-#ifdef HAVE_INODEOPS_ENHANCED_GETATTR
-int ll_getattr(const struct path *path, struct kstat *stat,
-              u32 request_mask, unsigned int flags)
-{
-       struct dentry *de = path->dentry;
-#else
-int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
-{
-#endif
        struct inode *inode = de->d_inode;
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct ll_inode_info *lli = ll_i2info(inode);
@@ -4536,6 +4677,12 @@ int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
                RETURN(rc);
 
        if (S_ISREG(inode->i_mode)) {
+               bool cached;
+
+               rc = pcc_inode_getattr(inode, &cached);
+               if (cached && rc < 0)
+                       RETURN(rc);
+
                /* In case of restore, the MDT has the right size and has
                 * already send it back without granting the layout lock,
                 * inode is up-to-date so glimpse is useless.
@@ -4543,7 +4690,7 @@ int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
                 * restore the MDT holds the layout lock so the glimpse will
                 * block up to the end of restore (getattr will block)
                 */
-               if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
+               if (!cached && !ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
                        rc = ll_glimpse_size(inode);
                        if (rc < 0)
                                RETURN(rc);
@@ -4588,6 +4735,18 @@ int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
         return 0;
 }
 
+#ifdef HAVE_INODEOPS_ENHANCED_GETATTR
+int ll_getattr(const struct path *path, struct kstat *stat,
+              u32 request_mask, unsigned int flags)
+{
+       struct dentry *de = path->dentry;
+#else
+int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
+{
+#endif
+       return ll_getattr_dentry(de, stat);
+}
+
 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                     __u64 start, __u64 len)
 {
index 9ff4cc0..1e86edd 100644 (file)
@@ -48,6 +48,7 @@
 
 #include "vvp_internal.h"
 #include "range_lock.h"
+#include "pcc.h"
 
 #ifndef FMODE_EXEC
 #define FMODE_EXEC 0
@@ -209,6 +210,10 @@ struct ll_inode_info {
                         * accurate if the file is shared by different jobs.
                         */
                        char                    lli_jobid[LUSTRE_JOBID_SIZE];
+
+                       struct mutex             lli_pcc_lock;
+                       enum lu_pcc_state_flags  lli_pcc_state;
+                       struct pcc_inode        *lli_pcc_inode;
                };
        };
 
@@ -339,6 +344,11 @@ static inline struct ll_inode_info *ll_i2info(struct inode *inode)
        return container_of(inode, struct ll_inode_info, lli_vfs_inode);
 }
 
+static inline struct pcc_inode *ll_i2pcci(struct inode *inode)
+{
+       return ll_i2info(inode)->lli_pcc_inode;
+}
+
 /* default to about 64M of readahead on a given system. */
 #define SBI_DEFAULT_READAHEAD_MAX              MiB_TO_PAGES(64UL)
 
@@ -579,6 +589,9 @@ struct ll_sb_info {
 
        /* filesystem fsname */
        char                      ll_fsname[LUSTRE_MAXFSNAME + 1];
+
+       /* Persistent Client Cache */
+       struct pcc_super          ll_pcc_super;
 };
 
 #define SBI_DEFAULT_HEAT_DECAY_WEIGHT  ((80 * 256 + 50) / 100)
@@ -694,6 +707,7 @@ struct ll_file_data {
        /* The layout version when resync starts. Resync I/O should carry this
         * layout version for verification to OST objects */
        __u32 fd_layout_version;
+       struct pcc_file fd_pcc_file;
 };
 
 void llite_tunables_unregister(void);
@@ -878,6 +892,7 @@ int ll_getattr(const struct path *path, struct kstat *stat,
 #else
 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat);
 #endif
+int ll_getattr_dentry(struct dentry *de, struct kstat *stat);
 struct posix_acl *ll_get_acl(struct inode *inode, int type);
 #ifdef HAVE_IOP_SET_ACL
 #ifdef CONFIG_FS_POSIX_ACL
@@ -1473,6 +1488,18 @@ static inline void d_lustre_revalidate(struct dentry *dentry)
        spin_unlock(&dentry->d_lock);
 }
 
+static inline dev_t ll_compat_encode_dev(dev_t dev)
+{
+       /* The compat_sys_*stat*() syscalls will fail unless the
+        * device majors and minors are both less than 256. Note that
+        * the value returned here will be passed through
+        * old_encode_dev() in cp_compat_stat(). And so we are not
+        * trying to return a valid compat (u16) device number, just
+        * one that will pass the old_valid_dev() check. */
+
+       return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
+}
+
 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf);
 int ll_layout_refresh(struct inode *inode, __u32 *gen);
 int ll_layout_restore(struct inode *inode, loff_t start, __u64 length);
index cc84e32..4a94d99 100644 (file)
@@ -73,12 +73,18 @@ static struct ll_sb_info *ll_init_sbi(void)
        unsigned long pages;
        unsigned long lru_page_max;
        struct sysinfo si;
+       int rc;
        int i;
+
        ENTRY;
 
        OBD_ALLOC_PTR(sbi);
        if (sbi == NULL)
-               RETURN(NULL);
+               RETURN(ERR_PTR(-ENOMEM));
+
+       rc = pcc_super_init(&sbi->ll_pcc_super);
+       if (rc < 0)
+               GOTO(out_sbi, rc);
 
        spin_lock_init(&sbi->ll_lock);
        mutex_init(&sbi->ll_lco.lco_lock);
@@ -92,10 +98,8 @@ static struct ll_sb_info *ll_init_sbi(void)
 
        /* initialize ll_cache data */
        sbi->ll_cache = cl_cache_init(lru_page_max);
-       if (sbi->ll_cache == NULL) {
-               OBD_FREE(sbi, sizeof(*sbi));
-               RETURN(NULL);
-       }
+       if (sbi->ll_cache == NULL)
+               GOTO(out_pcc, rc = -ENOMEM);
 
        sbi->ll_ra_info.ra_max_pages_per_file = min(pages / 32,
                                           SBI_DEFAULT_READAHEAD_MAX);
@@ -143,6 +147,11 @@ static struct ll_sb_info *ll_init_sbi(void)
        sbi->ll_heat_decay_weight = SBI_DEFAULT_HEAT_DECAY_WEIGHT;
        sbi->ll_heat_period_second = SBI_DEFAULT_HEAT_PERIOD_SECOND;
        RETURN(sbi);
+out_pcc:
+       pcc_super_fini(&sbi->ll_pcc_super);
+out_sbi:
+       OBD_FREE_PTR(sbi);
+       RETURN(ERR_PTR(rc));
 }
 
 static void ll_free_sbi(struct super_block *sb)
@@ -157,6 +166,7 @@ static void ll_free_sbi(struct super_block *sb)
                        cl_cache_decref(sbi->ll_cache);
                        sbi->ll_cache = NULL;
                }
+               pcc_super_fini(&sbi->ll_pcc_super);
                OBD_FREE(sbi, sizeof(*sbi));
        }
        EXIT;
@@ -228,7 +238,8 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt,
                                   OBD_CONNECT2_LOCK_CONVERT |
                                   OBD_CONNECT2_ARCHIVE_ID_ARRAY |
                                   OBD_CONNECT2_LSOM |
-                                  OBD_CONNECT2_ASYNC_DISCARD;
+                                  OBD_CONNECT2_ASYNC_DISCARD |
+                                  OBD_CONNECT2_PCC;
 
 #ifdef HAVE_LRU_RESIZE_SUPPORT
         if (sbi->ll_flags & LL_SBI_LRU_RESIZE)
@@ -979,6 +990,9 @@ void ll_lli_init(struct ll_inode_info *lli)
                spin_lock_init(&lli->lli_heat_lock);
                obd_heat_clear(lli->lli_heat_instances, OBD_HEAT_COUNT);
                lli->lli_heat_flags = 0;
+               mutex_init(&lli->lli_pcc_lock);
+               lli->lli_pcc_state = PCC_STATE_FL_NONE;
+               lli->lli_pcc_inode = NULL;
        }
        mutex_init(&lli->lli_layout_mutex);
        memset(lli->lli_jobid, 0, sizeof(lli->lli_jobid));
@@ -1052,8 +1066,8 @@ int ll_fill_super(struct super_block *sb, struct vfsmount *mnt)
 
        /* client additional sb info */
        lsi->lsi_llsbi = sbi = ll_init_sbi();
-       if (!sbi)
-               GOTO(out_free_cfg, err = -ENOMEM);
+       if (IS_ERR(sbi))
+               GOTO(out_free_cfg, err = PTR_ERR(sbi));
 
        err = ll_options(lsi->lsi_lmd->lmd_opts, sbi);
        if (err)
@@ -1183,7 +1197,7 @@ void ll_put_super(struct super_block *sb)
        int next, force = 1, rc = 0;
        ENTRY;
 
-       if (!sbi)
+       if (IS_ERR(sbi))
                GOTO(out_no_sbi, 0);
 
        /* Should replace instance_id with something better for ASLR */
@@ -1556,17 +1570,20 @@ void ll_clear_inode(struct inode *inode)
 {
         struct ll_inode_info *lli = ll_i2info(inode);
         struct ll_sb_info *sbi = ll_i2sbi(inode);
+
         ENTRY;
 
        CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
               PFID(ll_inode2fid(inode)), inode);
 
-        if (S_ISDIR(inode->i_mode)) {
-                /* these should have been cleared in ll_file_release */
-                LASSERT(lli->lli_opendir_key == NULL);
-                LASSERT(lli->lli_sai == NULL);
-                LASSERT(lli->lli_opendir_pid == 0);
-        }
+       if (S_ISDIR(inode->i_mode)) {
+               /* these should have been cleared in ll_file_release */
+               LASSERT(lli->lli_opendir_key == NULL);
+               LASSERT(lli->lli_sai == NULL);
+               LASSERT(lli->lli_opendir_pid == 0);
+       } else {
+               pcc_inode_free(inode);
+       }
 
        md_null_inode(sbi->ll_md_exp, ll_inode2fid(inode));
 
@@ -1692,6 +1709,7 @@ int ll_setattr_raw(struct dentry *dentry, struct iattr *attr,
         struct ll_inode_info *lli = ll_i2info(inode);
         struct md_op_data *op_data = NULL;
        int rc = 0;
+
        ENTRY;
 
        CDEBUG(D_VFSTRACE, "%s: setattr inode "DFID"(%p) from %llu to %llu, "
@@ -1790,14 +1808,28 @@ int ll_setattr_raw(struct dentry *dentry, struct iattr *attr,
        if (attr->ia_valid & (ATTR_SIZE | ATTR_ATIME | ATTR_ATIME_SET |
                              ATTR_MTIME | ATTR_MTIME_SET | ATTR_CTIME) ||
            xvalid & OP_XVALID_CTIME_SET) {
-               /* For truncate and utimes sending attributes to OSTs, setting
-                * mtime/atime to the past will be performed under PW [0:EOF]
-                * extent lock (new_size:EOF for truncate).  It may seem
-                * excessive to send mtime/atime updates to OSTs when not
-                * setting times to past, but it is necessary due to possible
-                * time de-synchronization between MDT inode and OST objects
-                */
-               rc = cl_setattr_ost(lli->lli_clob, attr, xvalid, 0);
+               bool cached = false;
+
+               rc = pcc_inode_setattr(inode, attr, &cached);
+               if (cached) {
+                       if (rc) {
+                               CERROR("%s: PCC inode "DFID" setattr failed: "
+                                      "rc = %d\n",
+                                      ll_i2sbi(inode)->ll_fsname,
+                                      PFID(&lli->lli_fid), rc);
+                               GOTO(out, rc);
+                       }
+               } else {
+                       /* For truncate and utimes sending attributes to OSTs,
+                        * setting mtime/atime to the past will be performed
+                        * under PW [0:EOF] extent lock (new_size:EOF for
+                        * truncate). It may seem excessive to send mtime/atime
+                        * updates to OSTs when not setting times to past, but
+                        * it is necessary due to possible time
+                        * de-synchronization between MDT inode and OST objects
+                        */
+                       rc = cl_setattr_ost(lli->lli_clob, attr, xvalid, 0);
+               }
        }
 
        /* If the file was restored, it needs to set dirty flag.
index 989a53b..e9552ae 100644 (file)
@@ -349,17 +349,22 @@ static int ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 #endif
        int count = 0;
        bool printed = false;
+       bool cached;
        int result;
        sigset_t set;
 
+       ll_stats_ops_tally(ll_i2sbi(file_inode(vma->vm_file)),
+                          LPROC_LL_FAULT, 1);
+
+       result = pcc_fault(vma, vmf, &cached);
+       if (cached)
+               return result;
+
        /* Only SIGKILL and SIGTERM is allowed for fault/nopage/mkwrite
         * so that it can be killed by admin but not cause segfault by
         * other signals. */
        set = cfs_block_sigsinv(sigmask(SIGKILL) | sigmask(SIGTERM));
 
-       ll_stats_ops_tally(ll_i2sbi(file_inode(vma->vm_file)),
-                          LPROC_LL_FAULT, 1);
-
        /* make sure offset is not a negative number */
        if (vmf->pgoff > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
                return VM_FAULT_SIGBUS;
@@ -403,11 +408,16 @@ static int ll_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
        int count = 0;
        bool printed = false;
        bool retry;
+       bool cached;
        int result;
 
        ll_stats_ops_tally(ll_i2sbi(file_inode(vma->vm_file)),
                           LPROC_LL_MKWRITE, 1);
 
+       result = pcc_page_mkwrite(vma, vmf, &cached);
+       if (cached)
+               return result;
+
        file_update_time(vma->vm_file);
         do {
                 retry = false;
@@ -459,6 +469,7 @@ static void ll_vm_open(struct vm_area_struct * vma)
        ENTRY;
        LASSERT(atomic_read(&vob->vob_mmap_cnt) >= 0);
        atomic_inc(&vob->vob_mmap_cnt);
+       pcc_vm_open(vma);
        EXIT;
 }
 
@@ -473,6 +484,7 @@ static void ll_vm_close(struct vm_area_struct *vma)
        ENTRY;
        atomic_dec(&vob->vob_mmap_cnt);
        LASSERT(atomic_read(&vob->vob_mmap_cnt) >= 0);
+       pcc_vm_close(vma);
        EXIT;
 }
 
@@ -487,7 +499,7 @@ int ll_teardown_mmaps(struct address_space *mapping, __u64 first, __u64 last)
         if (mapping_mapped(mapping)) {
                 rc = 0;
                unmap_mapping_range(mapping, first + PAGE_SIZE - 1,
-                                    last - first + 1, 0);
+                                   last - first + 1, 1);
         }
 
         RETURN(rc);
@@ -503,19 +515,26 @@ static const struct vm_operations_struct ll_file_vm_ops = {
 int ll_file_mmap(struct file *file, struct vm_area_struct * vma)
 {
        struct inode *inode = file_inode(file);
+       bool cached;
         int rc;
+
         ENTRY;
 
         if (ll_file_nolock(file))
                 RETURN(-EOPNOTSUPP);
 
+       rc = pcc_file_mmap(file, vma, &cached);
+       if (cached && rc != 0)
+               RETURN(rc);
+
         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_MAP, 1);
         rc = generic_file_mmap(file, vma);
         if (rc == 0) {
-                vma->vm_ops = &ll_file_vm_ops;
+               vma->vm_ops = &ll_file_vm_ops;
                 vma->vm_ops->open(vma);
                 /* update the inode's size and mtime */
-                rc = ll_glimpse_size(inode);
+               if (!cached)
+                       rc = ll_glimpse_size(inode);
         }
 
         RETURN(rc);
index 5030e0f..ed66cd8 100644 (file)
@@ -1311,6 +1311,43 @@ static ssize_t ll_nosquash_nids_seq_write(struct file *file,
 
 LDEBUGFS_SEQ_FOPS(ll_nosquash_nids);
 
+/*
+ * Show handler for the llite "pcc" entry: hands the PCC state embedded in
+ * this mount's ll_sb_info to pcc_super_dump() for printing into @m.
+ * The super_block comes from the seq_file private pointer; @v is unused.
+ */
+static int ll_pcc_seq_show(struct seq_file *m, void *v)
+{
+       struct super_block *lsb = m->private;
+
+       return pcc_super_dump(&ll_s2sbi(lsb)->ll_pcc_super, m);
+}
+
+/*
+ * Write handler for the llite "pcc" entry.
+ *
+ * Copies @count bytes of command text from user space into a kernel buffer
+ * that is one byte larger than the input and passes it to pcc_cmd_handle().
+ *
+ * Returns @count on success.  Fails with -EINVAL when the command is
+ * LPROCFS_WR_PCC_MAX_CMD bytes or longer, -EOPNOTSUPP when the MD export
+ * does not advertise OBD_CONNECT2_PCC, -ENOMEM / -EFAULT when the buffer
+ * cannot be allocated / filled, or with the non-zero result of
+ * pcc_cmd_handle().
+ */
+static ssize_t ll_pcc_seq_write(struct file *file, const char __user *buffer,
+                               size_t count, loff_t *off)
+{
+       struct seq_file *seq = file->private_data;
+       struct super_block *lsb = seq->private;
+       struct ll_sb_info *sbi = ll_s2sbi(lsb);
+       char *kernbuf;
+       int rc;
+
+       if (count >= LPROCFS_WR_PCC_MAX_CMD)
+               return -EINVAL;
+
+       if ((exp_connect_flags2(sbi->ll_md_exp) & OBD_CONNECT2_PCC) == 0)
+               return -EOPNOTSUPP;
+
+       OBD_ALLOC(kernbuf, count + 1);
+       if (kernbuf == NULL)
+               return -ENOMEM;
+
+       if (copy_from_user(kernbuf, buffer, count) != 0)
+               GOTO(out_free_kernbuff, rc = -EFAULT);
+
+       rc = pcc_cmd_handle(kernbuf, count, &sbi->ll_pcc_super);
+out_free_kernbuff:
+       OBD_FREE(kernbuf, count + 1);
+       if (rc != 0)
+               return rc;
+
+       return count;
+}
+LPROC_SEQ_FOPS(ll_pcc);
+
 struct lprocfs_vars lprocfs_llite_obd_vars[] = {
        { .name =       "site",
          .fops =       &ll_site_stats_fops                     },
@@ -1332,6 +1369,8 @@ struct lprocfs_vars lprocfs_llite_obd_vars[] = {
          .fops =       &ll_root_squash_fops                    },
        { .name =       "nosquash_nids",
          .fops =       &ll_nosquash_nids_fops                  },
+       { .name =       "pcc",
+         .fops =       &ll_pcc_fops,                           },
        { NULL }
 };
 
index 4734210..eb5b5f9 100644 (file)
@@ -725,14 +725,21 @@ out:
        return rc;
 }
 
+/*
+ * PCC create-time attach context handed from ll_atomic_open() to
+ * ll_lookup_it().
+ */
+struct pcc_create_attach {
+       /* dataset picked by pcc_dataset_match_get(); NULL: no PCC create */
+       struct pcc_dataset *pca_dataset;
+       /* set by pcc_inode_create(), consumed by pcc_inode_create_fini() */
+       struct dentry *pca_dentry;
+};
+
 static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry,
                                   struct lookup_intent *it,
-                                  void **secctx, __u32 *secctxlen)
+                                  void **secctx, __u32 *secctxlen,
+                                  struct pcc_create_attach *pca)
 {
        struct lookup_intent lookup_it = { .it_op = IT_LOOKUP };
        struct dentry *save = dentry, *retval;
        struct ptlrpc_request *req = NULL;
        struct md_op_data *op_data = NULL;
+       struct lov_user_md *lum = NULL;
        __u32 opc;
        int rc;
        char secctx_name[XATTR_NAME_MAX + 1];
@@ -813,6 +820,32 @@ static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry,
                }
        }
 
+       if (pca && pca->pca_dataset) {
+               struct pcc_dataset *dataset = pca->pca_dataset;
+
+               OBD_ALLOC_PTR(lum);
+               if (lum == NULL)
+                       GOTO(out, retval = ERR_PTR(-ENOMEM));
+
+               lum->lmm_magic = LOV_USER_MAGIC_V1;
+               lum->lmm_pattern = LOV_PATTERN_F_RELEASED | LOV_PATTERN_RAID0;
+               op_data->op_data = lum;
+               op_data->op_data_size = sizeof(*lum);
+               op_data->op_archive_id = dataset->pccd_rwid;
+
+               rc = obd_fid_alloc(NULL, ll_i2mdexp(parent), &op_data->op_fid2,
+                                  op_data);
+               if (rc)
+                       GOTO(out, retval = ERR_PTR(rc));
+
+               rc = pcc_inode_create(parent->i_sb, dataset, &op_data->op_fid2,
+                                     &pca->pca_dentry);
+               if (rc)
+                       GOTO(out, retval = ERR_PTR(rc));
+
+               it->it_flags |= MDS_OPEN_PCC;
+       }
+
        rc = md_intent_lock(ll_i2mdexp(parent), op_data, it, &req,
                            &ll_md_blocking_ast, 0);
        /* If the MDS allows the client to chgrp (CFS_SETGRP_PERM), but the
@@ -874,6 +907,9 @@ out:
                ll_finish_md_op_data(op_data);
        }
 
+       if (lum != NULL)
+               OBD_FREE_PTR(lum);
+
        ptlrpc_req_finished(req);
        return retval;
 }
@@ -902,7 +938,7 @@ static struct dentry *ll_lookup_nd(struct inode *parent, struct dentry *dentry,
                itp = NULL;
        else
                itp = &it;
-       de = ll_lookup_it(parent, dentry, itp, NULL, NULL);
+       de = ll_lookup_it(parent, dentry, itp, NULL, NULL, NULL);
 
        if (itp != NULL)
                ll_intent_release(itp);
@@ -923,6 +959,9 @@ static int ll_atomic_open(struct inode *dir, struct dentry *dentry,
        long long lookup_flags = LOOKUP_OPEN;
        void *secctx = NULL;
        __u32 secctxlen = 0;
+       struct ll_sb_info *sbi;
+       struct pcc_create_attach pca = {NULL, NULL};
+       struct pcc_dataset *dataset = NULL;
        int rc = 0;
        ENTRY;
 
@@ -957,13 +996,27 @@ static int ll_atomic_open(struct inode *dir, struct dentry *dentry,
        if (open_flags & O_CREAT) {
                it->it_op |= IT_CREAT;
                lookup_flags |= LOOKUP_CREATE;
+               sbi = ll_i2sbi(dir);
+               /* Volatile file is used for HSM restore, so do not use PCC */
+               if (!filename_is_volatile(dentry->d_name.name,
+                                         dentry->d_name.len, NULL)) {
+                       struct pcc_matcher item;
+
+                       item.pm_uid = from_kuid(&init_user_ns, current_uid());
+                       item.pm_gid = from_kgid(&init_user_ns, current_gid());
+                       item.pm_projid = ll_i2info(dir)->lli_projid;
+                       item.pm_name = &dentry->d_name;
+                       dataset = pcc_dataset_match_get(&sbi->ll_pcc_super,
+                                                       &item);
+                       pca.pca_dataset = dataset;
+               }
        }
        it->it_create_mode = (mode & S_IALLUGO) | S_IFREG;
        it->it_flags = (open_flags & ~O_ACCMODE) | OPEN_FMODE(open_flags);
        it->it_flags &= ~MDS_OPEN_FL_INTERNAL;
 
        /* Dentry added to dcache tree in ll_lookup_it */
-       de = ll_lookup_it(dir, dentry, it, &secctx, &secctxlen);
+       de = ll_lookup_it(dir, dentry, it, &secctx, &secctxlen, &pca);
        if (IS_ERR(de))
                rc = PTR_ERR(de);
        else if (de != NULL)
@@ -982,9 +1035,20 @@ static int ll_atomic_open(struct inode *dir, struct dentry *dentry,
                                        dput(de);
                                goto out_release;
                        }
+                       if (dataset != NULL && dentry->d_inode) {
+                               rc = pcc_inode_create_fini(dataset,
+                                                          dentry->d_inode,
+                                                          pca.pca_dentry);
+                               if (rc) {
+                                       if (de != NULL)
+                                               dput(de);
+                                       GOTO(out_release, rc);
+                               }
+                       }
 
                        *opened |= FILE_CREATED;
                }
+
                if (dentry->d_inode && it_disposition(it, DISP_OPEN_OPEN)) {
                        /* Open dentry. */
                        if (S_ISFIFO(dentry->d_inode->i_mode)) {
@@ -1007,6 +1071,8 @@ static int ll_atomic_open(struct inode *dir, struct dentry *dentry,
        }
 
 out_release:
+       if (dataset != NULL)
+               pcc_dataset_put(dataset);
        ll_intent_release(it);
        OBD_FREE(it, sizeof(*it));
 
@@ -1071,7 +1137,7 @@ static struct dentry *ll_lookup_nd(struct inode *parent, struct dentry *dentry,
                                RETURN((struct dentry *)it);
                }
 
-               de = ll_lookup_it(parent, dentry, it, NULL, NULL);
+               de = ll_lookup_it(parent, dentry, it, NULL, NULL, NULL);
                if (de)
                        dentry = de;
                if ((nd->flags & LOOKUP_OPEN) && !IS_ERR(dentry)) { /* Open */
@@ -1111,7 +1177,7 @@ static struct dentry *ll_lookup_nd(struct inode *parent, struct dentry *dentry,
                        OBD_FREE(it, sizeof(*it));
                }
        } else {
-               de = ll_lookup_it(parent, dentry, NULL, NULL, NULL);
+               de = ll_lookup_it(parent, dentry, NULL, NULL, NULL, NULL);
        }
 
        RETURN(de);
diff --git a/lustre/llite/pcc.c b/lustre/llite/pcc.c
new file mode 100644 (file)
index 0000000..fcebed4
--- /dev/null
@@ -0,0 +1,2514 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2017, DDN Storage Corporation.
+ */
+/*
+ * Persistent Client Cache
+ *
+ * PCC is a new framework which provides a group of local caches on Lustre
+ * client side. It works in two modes: RW-PCC enables a read-write cache on the
+ * local SSDs of a single client; RO-PCC provides a read-only cache on the
+ * local SSDs of multiple clients. Less overhead is visible to the applications
+ * and network latencies and lock conflicts can be significantly reduced.
+ *
+ * For RW-PCC, no global namespace will be provided. Each client uses its own
+ * local storage as a cache for itself. Local file system is used to manage
+ * the data on local caches. Cached I/O is directed to local file system while
+ * normal I/O is directed to OSTs. RW-PCC uses HSM for data synchronization.
+ * It uses HSM copytool to restore file from local caches to Lustre OSTs. Each
+ * PCC has a copytool instance running with unique archive number. Any remote
+ * access from another Lustre client would trigger the data synchronization. If
+ * a client with RW-PCC goes offline, the cached data becomes inaccessible for
+ * other client temporarily. And after the RW-PCC client reboots and the
+ * copytool restarts, the data will be accessible again.
+ *
+ * Following is what will happen in different conditions for RW-PCC:
+ *
+ * > When file is being created on RW-PCC
+ *
+ * A normal HSM released file is created on MDT;
+ * An empty mirror file is created on local cache;
+ * The HSM status of the Lustre file will be set to archived and released;
+ * The archive number will be set to the proper value.
+ *
+ * > When file is being prefetched to RW-PCC
+ *
+ * A file is copied to the local cache;
+ * The HSM status of the Lustre file will be set to archived and released;
+ * The archive number will be set to the proper value.
+ *
+ * > When file is being accessed from PCC
+ *
+ * Data will be read directly from local cache;
+ * Metadata will be read from MDT, except file size;
+ * File size will be got from local cache.
+ *
+ * > When PCC cached file is being accessed on another client
+ *
+ * RW-PCC cached files are automatically restored when a process on another
+ * client tries to read or modify them. The corresponding I/O will block
+ * waiting for the released file to be restored. This is transparent to the
+ * process.
+ *
+ * For RW-PCC, when a file is being created, a rule-based policy is used to
+ * determine whether it will be cached. Rule-based caching of newly created
+ * files can determine which file can use a cache on PCC directly without any
+ * admission control.
+ *
+ * RW-PCC design can accelerate I/O intensive applications with one-to-one
+ * mappings between files and accessing clients. However, in several use cases,
+ * files will never be updated, but need to be read simultaneously from many
+ * clients. RO-PCC implements a read-only caching on Lustre clients using
+ * SSDs. RO-PCC is based on the same framework as RW-PCC, except
+ * that no HSM mechanism is used.
+ *
+ * The main advantages of using this SSD cache on the Lustre clients via PCC
+ * are that:
+ * - The I/O stack becomes much simpler for the cached data, as there is no
+ *   interference with I/Os from other clients, which enables easier
+ *   performance optimizations;
+ * - The requirements on the HW inside the client nodes are small, any kind of
+ *   SSDs or even HDDs can be used as cache devices;
+ * - Caching reduces the pressure on the object storage targets (OSTs), as
+ *   small or random I/Os can be regularized to big sequential I/Os and
+ *   temporary files do not even need to be flushed to OSTs.
+ *
+ * PCC can accelerate applications with certain I/O patterns:
+ * - small-sized random writes (< 1MB) from a single client
+ * - repeated read of data that is larger than RAM
+ * - clients with high network latency
+ *
+ * Author: Li Xi <lixi@ddn.com>
+ * Author: Qian Yingjin <qian@ddn.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include "pcc.h"
+#include <linux/namei.h>
+#include <linux/file.h>
+#include <lustre_compat.h>
+#include "llite_internal.h"
+
+struct kmem_cache *pcc_inode_slab;
+
+/*
+ * Initialise the PCC state of a superblock: a private credential set with
+ * CAP_SYS_RESOURCE removed, the dataset list and the rw-semaphore that
+ * guards it.
+ *
+ * Returns 0 on success or -ENOMEM when no credentials could be prepared.
+ */
+int pcc_super_init(struct pcc_super *super)
+{
+       struct cred *pcc_cred = prepare_creds();
+
+       super->pccs_cred = pcc_cred;
+       if (pcc_cred == NULL)
+               return -ENOMEM;
+
+       /* Never override disk quota limits or use reserved space */
+       cap_lower(pcc_cred->cap_effective, CAP_SYS_RESOURCE);
+       init_rwsem(&super->pccs_rw_sem);
+       INIT_LIST_HEAD(&super->pccs_datasets);
+
+       return 0;
+}
+
+/* Rule based auto caching */
+/* Unlink and free every pcc_match_id entry queued on @id_list. */
+static void pcc_id_list_free(struct list_head *id_list)
+{
+       struct pcc_match_id *id;
+
+       while (!list_empty(id_list)) {
+               id = list_first_entry(id_list, struct pcc_match_id,
+                                     pmi_linkage);
+               list_del_init(&id->pmi_linkage);
+               OBD_FREE_PTR(id);
+       }
+}
+
+/*
+ * Unlink and free every pcc_match_fname entry queued on @fname_list,
+ * including the pattern string owned by each entry.
+ */
+static void pcc_fname_list_free(struct list_head *fname_list)
+{
+       struct pcc_match_fname *fname;
+
+       while (!list_empty(fname_list)) {
+               fname = list_first_entry(fname_list, struct pcc_match_fname,
+                                        pmf_linkage);
+               /* the string is sized by its own length plus terminator */
+               OBD_FREE(fname->pmf_name, strlen(fname->pmf_name) + 1);
+               list_del_init(&fname->pmf_linkage);
+               OBD_FREE_PTR(fname);
+       }
+}
+
+/*
+ * Free one rule expression together with its condition list.  The list
+ * element type depends on the field: ids for uid/gid/projid, name patterns
+ * for fname.  Any other field value is treated as a fatal bug.
+ */
+static void pcc_expression_free(struct pcc_expression *expr)
+{
+       LASSERT(expr->pe_field >= PCC_FIELD_UID &&
+               expr->pe_field < PCC_FIELD_MAX);
+
+       if (expr->pe_field == PCC_FIELD_FNAME)
+               pcc_fname_list_free(&expr->pe_cond);
+       else if (expr->pe_field == PCC_FIELD_UID ||
+                expr->pe_field == PCC_FIELD_GID ||
+                expr->pe_field == PCC_FIELD_PROJID)
+               pcc_id_list_free(&expr->pe_cond);
+       else
+               LBUG();
+
+       OBD_FREE_PTR(expr);
+}
+
+/*
+ * Free a conjunction and all expressions it holds.  The conjunction must
+ * already have been unlinked from its rule list (asserted).
+ */
+static void pcc_conjunction_free(struct pcc_conjunction *conjunction)
+{
+       struct list_head *exprs = &conjunction->pc_expressions;
+       struct pcc_expression *cur;
+
+       LASSERT(list_empty(&conjunction->pc_linkage));
+       while (!list_empty(exprs)) {
+               cur = list_first_entry(exprs, struct pcc_expression,
+                                      pe_linkage);
+               list_del_init(&cur->pe_linkage);
+               pcc_expression_free(cur);
+       }
+       OBD_FREE_PTR(conjunction);
+}
+
+/* Unlink and free every conjunction queued on @cond_list. */
+static void pcc_rule_conds_free(struct list_head *cond_list)
+{
+       struct pcc_conjunction *cur;
+
+       while (!list_empty(cond_list)) {
+               cur = list_first_entry(cond_list, struct pcc_conjunction,
+                                      pc_linkage);
+               /* pcc_conjunction_free() asserts the entry is unlinked */
+               list_del_init(&cur->pc_linkage);
+               pcc_conjunction_free(cur);
+       }
+}
+
+/*
+ * Release what a parsed command owns.  Only PCC_ADD_DATASET commands carry
+ * resources: the parsed rule conditions and the copy of the rule string.
+ */
+static void pcc_cmd_fini(struct pcc_cmd *cmd)
+{
+       if (cmd->pccc_cmd != PCC_ADD_DATASET)
+               return;
+
+       if (!list_empty(&cmd->u.pccc_add.pccc_conds))
+               pcc_rule_conds_free(&cmd->u.pccc_add.pccc_conds);
+
+       if (cmd->u.pccc_add.pccc_conds_str != NULL)
+               OBD_FREE(cmd->u.pccc_add.pccc_conds_str,
+                        strlen(cmd->u.pccc_add.pccc_conds_str) + 1);
+}
+
+/*
+ * Delimiters of a rule string: ',' separates alternative conjunctions,
+ * '&' separates the expressions of one conjunction and '=' separates an
+ * expression's field name from its "{...}" value list.
+ */
+#define PCC_DISJUNCTION_DELIM  (',')
+#define PCC_CONJUNCTION_DELIM  ('&')
+#define PCC_EXPRESSION_DELIM   ('=')
+
+/*
+ * Append a copy of the token @id to @fname_list as a file name pattern.
+ *
+ * Returns 0 on success or -ENOMEM if either allocation fails.
+ *
+ * NOTE(review): only ls_len bytes are copied into the ls_len + 1 byte
+ * buffer; the terminating NUL is presumably provided by OBD_ALLOC()
+ * zero-filling the allocation - confirm.
+ */
+static int
+pcc_fname_list_add(struct cfs_lstr *id, struct list_head *fname_list)
+{
+       struct pcc_match_fname *fname;
+
+       OBD_ALLOC(fname, sizeof(*fname));
+       if (!fname)
+               return -ENOMEM;
+
+       OBD_ALLOC(fname->pmf_name, id->ls_len + 1);
+       if (!fname->pmf_name) {
+               OBD_FREE(fname, sizeof(*fname));
+               return -ENOMEM;
+       }
+
+       memcpy(fname->pmf_name, id->ls_str, id->ls_len);
+       list_add_tail(&fname->pmf_linkage, fname_list);
+
+       return 0;
+}
+
+/*
+ * Split the @len bytes at @str on spaces and queue each token on
+ * @fname_list as a file name pattern.  @fname_list is (re)initialised
+ * first and is emptied again if anything fails.
+ *
+ * Returns 0 on success, -EINVAL when the tokenizer yields no token, or the
+ * error reported by pcc_fname_list_add().
+ */
+static int
+pcc_fname_list_parse(char *str, int len, struct list_head *fname_list)
+{
+       struct cfs_lstr src;
+       struct cfs_lstr res;
+       int rc = 0;
+
+       ENTRY;
+
+       INIT_LIST_HEAD(fname_list);
+       src.ls_str = str;
+       src.ls_len = len;
+       while (src.ls_str != NULL && rc == 0) {
+               if (cfs_gettok(&src, ' ', &res) == 0)
+                       rc = -EINVAL;
+               else
+                       rc = pcc_fname_list_add(&res, fname_list);
+       }
+
+       if (rc != 0)
+               pcc_fname_list_free(fname_list);
+       RETURN(rc);
+}
+
+/*
+ * Split the @len bytes at @str on spaces, convert every token to a 32-bit
+ * id and queue it on @id_list.  @type must be one of the id fields (uid,
+ * gid or projid).  @id_list is (re)initialised first and is emptied again
+ * if anything fails.
+ *
+ * Returns 0 on success, -EINVAL for a non-id @type, an empty token or a
+ * token that is not a number in range, and -ENOMEM on allocation failure.
+ */
+static int
+pcc_id_list_parse(char *str, int len, struct list_head *id_list,
+                 enum pcc_field type)
+{
+       struct cfs_lstr src;
+       struct cfs_lstr res;
+       int rc = 0;
+
+       ENTRY;
+
+       switch (type) {
+       case PCC_FIELD_UID:
+       case PCC_FIELD_GID:
+       case PCC_FIELD_PROJID:
+               break;
+       default:
+               RETURN(-EINVAL);
+       }
+
+       INIT_LIST_HEAD(id_list);
+       src.ls_str = str;
+       src.ls_len = len;
+       while (src.ls_str != NULL) {
+               struct pcc_match_id *id;
+               __u32 id_val;
+
+               if (!cfs_gettok(&src, ' ', &res))
+                       GOTO(out, rc = -EINVAL);
+
+               if (!cfs_str2num_check(res.ls_str, res.ls_len,
+                                      &id_val, 0, (u32)~0U))
+                       GOTO(out, rc = -EINVAL);
+
+               OBD_ALLOC_PTR(id);
+               if (!id)
+                       GOTO(out, rc = -ENOMEM);
+
+               id->pmi_id = id_val;
+               list_add_tail(&id->pmi_linkage, id_list);
+       }
+out:
+       if (rc != 0)
+               pcc_id_list_free(id_list);
+       RETURN(rc);
+}
+
+/*
+ * Return true when the token @field spells exactly the NUL-terminated
+ * keyword @str (same length, same bytes).
+ */
+static inline bool
+pcc_check_field(struct cfs_lstr *field, char *str)
+{
+       int len = strlen(str);
+
+       if (field->ls_len != len)
+               return false;
+
+       return strncmp(field->ls_str, str, len) == 0;
+}
+
+/*
+ * Parse one "<field>={<values>}" expression from @src and append it to
+ * @cond_list.
+ *
+ * <field> is one of "uid", "gid", "projid" (space separated numeric ids) or
+ * "fname" (space separated name patterns).  On return @src has been
+ * advanced past the field name and trimmed to the text between the braces.
+ *
+ * Returns 0 on success, -ENOMEM if the expression cannot be allocated and
+ * -EINVAL for any malformed input, including a failure to parse the value
+ * list.
+ */
+static int
+pcc_expression_parse(struct cfs_lstr *src, struct list_head *cond_list)
+{
+       struct pcc_expression *expr;
+       struct cfs_lstr field;
+       enum pcc_field type;
+       int rc = 0;
+
+       OBD_ALLOC(expr, sizeof(struct pcc_expression));
+       if (expr == NULL)
+               return -ENOMEM;
+
+       rc = cfs_gettok(src, PCC_EXPRESSION_DELIM, &field);
+       /*
+        * The tokenizer reports "nothing left behind this token" by
+        * clearing src->ls_str (every parse loop in this file terminates
+        * on that), so an expression without '=' has to be rejected before
+        * src->ls_str is dereferenced.
+        */
+       if (rc == 0 || src->ls_str == NULL || src->ls_len <= 2 ||
+           src->ls_str[0] != '{' || src->ls_str[src->ls_len - 1] != '}')
+               GOTO(out, rc = -EINVAL);
+
+       /* Skip '{' and '}' */
+       src->ls_str++;
+       src->ls_len -= 2;
+
+       if (pcc_check_field(&field, "uid"))
+               type = PCC_FIELD_UID;
+       else if (pcc_check_field(&field, "gid"))
+               type = PCC_FIELD_GID;
+       else if (pcc_check_field(&field, "projid"))
+               type = PCC_FIELD_PROJID;
+       else if (pcc_check_field(&field, "fname"))
+               type = PCC_FIELD_FNAME;
+       else
+               GOTO(out, rc = -EINVAL);
+
+       /* both list parsers release their partial list on failure */
+       if (type == PCC_FIELD_FNAME)
+               rc = pcc_fname_list_parse(src->ls_str, src->ls_len,
+                                         &expr->pe_cond);
+       else
+               rc = pcc_id_list_parse(src->ls_str, src->ls_len,
+                                      &expr->pe_cond, type);
+       if (rc < 0)
+               GOTO(out, rc = -EINVAL);
+
+       expr->pe_field = type;
+       list_add_tail(&expr->pe_linkage, cond_list);
+       return 0;
+out:
+       OBD_FREE_PTR(expr);
+       return rc;
+}
+
+/*
+ * Parse one conjunction ('&' separated expressions) from @src.
+ *
+ * The new conjunction is linked onto @cond_list before its expressions are
+ * parsed and is not removed again on failure, so the owner of @cond_list
+ * is responsible for releasing it.
+ *
+ * Returns 0 on success, -ENOMEM if the conjunction cannot be allocated,
+ * -EINVAL when the tokenizer yields no token, or the error reported by
+ * pcc_expression_parse().
+ */
+static int
+pcc_conjunction_parse(struct cfs_lstr *src, struct list_head *cond_list)
+{
+       struct pcc_conjunction *conjunction;
+       struct cfs_lstr expr;
+       int rc = 0;
+
+       OBD_ALLOC(conjunction, sizeof(*conjunction));
+       if (!conjunction)
+               return -ENOMEM;
+
+       INIT_LIST_HEAD(&conjunction->pc_expressions);
+       list_add_tail(&conjunction->pc_linkage, cond_list);
+
+       while (src->ls_str != NULL && rc == 0) {
+               if (cfs_gettok(src, PCC_CONJUNCTION_DELIM, &expr) == 0)
+                       rc = -EINVAL;
+               else
+                       rc = pcc_expression_parse(&expr,
+                                               &conjunction->pc_expressions);
+       }
+
+       return rc;
+}
+
+/*
+ * Parse a whole rule string (',' separated conjunctions) of @len bytes at
+ * @str into @cond_list, which is (re)initialised first.  Nothing is freed
+ * here on failure; conjunctions parsed so far stay on @cond_list.
+ *
+ * Returns 0 on success, -EINVAL when the tokenizer yields no token, or the
+ * error reported by pcc_conjunction_parse().
+ */
+static int pcc_conds_parse(char *str, int len, struct list_head *cond_list)
+{
+       struct cfs_lstr rest;
+       struct cfs_lstr tok;
+       int rc = 0;
+
+       INIT_LIST_HEAD(cond_list);
+       rest.ls_str = str;
+       rest.ls_len = len;
+       while (rest.ls_str != NULL && rc == 0) {
+               if (cfs_gettok(&rest, PCC_DISJUNCTION_DELIM, &tok) == 0)
+                       rc = -EINVAL;
+               else
+                       rc = pcc_conjunction_parse(&tok, cond_list);
+       }
+
+       return rc;
+}
+
+/*
+ * Store a private copy of the rule string @id in @cmd and parse it into
+ * the command's condition list.  If parsing fails the command's resources
+ * are released through pcc_cmd_fini().
+ *
+ * Returns 0 on success, -ENOMEM if the copy cannot be allocated, or the
+ * error reported by pcc_conds_parse().
+ */
+static int pcc_id_parse(struct pcc_cmd *cmd, const char *id)
+{
+       size_t id_len = strlen(id);
+       int rc;
+
+       OBD_ALLOC(cmd->u.pccc_add.pccc_conds_str, id_len + 1);
+       if (!cmd->u.pccc_add.pccc_conds_str)
+               return -ENOMEM;
+
+       memcpy(cmd->u.pccc_add.pccc_conds_str, id, id_len);
+
+       rc = pcc_conds_parse(cmd->u.pccc_add.pccc_conds_str,
+                            strlen(cmd->u.pccc_add.pccc_conds_str),
+                            &cmd->u.pccc_add.pccc_conds);
+       if (rc != 0)
+               pcc_cmd_fini(cmd);
+
+       return rc;
+}
+
+/*
+ * Parse one "key=value" option in @buffer (modified in place) and record
+ * it in the add-dataset part of @cmd.
+ *
+ * Keys: "rwid" and "roid" take a non-zero decimal id; "open_attach",
+ * "rwpcc" and "ropcc" set the matching dataset flag when the decimal value
+ * is non-zero.
+ *
+ * Returns 0 on success, -EINVAL for a missing/empty value, an unknown key
+ * or a zero id, or the error reported by kstrtoul().
+ */
+static int
+pcc_parse_value_pair(struct pcc_cmd *cmd, char *buffer)
+{
+       enum {
+               KEY_RWID,
+               KEY_ROID,
+               KEY_OPEN_ATTACH,
+               KEY_RWPCC,
+               KEY_ROPCC,
+       } kind;
+       unsigned long num;
+       char *value = buffer;
+       char *name;
+       int rc;
+
+       name = strsep(&value, "=");
+       if (value == NULL || *value == '\0')
+               return -EINVAL;
+
+       /* Key of the value pair */
+       if (strcmp(name, "rwid") == 0)
+               kind = KEY_RWID;
+       else if (strcmp(name, "roid") == 0)
+               kind = KEY_ROID;
+       else if (strcmp(name, "open_attach") == 0)
+               kind = KEY_OPEN_ATTACH;
+       else if (strcmp(name, "rwpcc") == 0)
+               kind = KEY_RWPCC;
+       else if (strcmp(name, "ropcc") == 0)
+               kind = KEY_ROPCC;
+       else
+               return -EINVAL;
+
+       /* every known key carries a decimal number */
+       rc = kstrtoul(value, 10, &num);
+       if (rc)
+               return rc;
+
+       switch (kind) {
+       case KEY_RWID:
+               if (num == 0)
+                       return -EINVAL;
+               cmd->u.pccc_add.pccc_rwid = num;
+               break;
+       case KEY_ROID:
+               if (num == 0)
+                       return -EINVAL;
+               cmd->u.pccc_add.pccc_roid = num;
+               break;
+       case KEY_OPEN_ATTACH:
+               if (num != 0)
+                       cmd->u.pccc_add.pccc_flags |= PCC_DATASET_OPEN_ATTACH;
+               break;
+       case KEY_RWPCC:
+               if (num != 0)
+                       cmd->u.pccc_add.pccc_flags |= PCC_DATASET_RWPCC;
+               break;
+       case KEY_ROPCC:
+               if (num != 0)
+                       cmd->u.pccc_add.pccc_flags |= PCC_DATASET_ROPCC;
+               break;
+       }
+
+       return 0;
+}
+
+/*
+ * Parse the space separated "key=value" options in @buffer (modified in
+ * place) into @cmd, then validate the result against the command type.
+ *
+ * For PCC_ADD_DATASET, requesting both rwpcc and ropcc is rejected, and a
+ * command that requests neither gets all PCC_DATASET_PCC_ALL bits.
+ *
+ * Returns 0 on success, the error of the first option that fails to parse,
+ * or -EINVAL for a conflicting flag pair or an unknown command type.
+ */
+static int
+pcc_parse_value_pairs(struct pcc_cmd *cmd, char *buffer)
+{
+       char *rest = buffer;
+       char *pair;
+       int rc;
+
+       while (rest != NULL && *rest != '\0') {
+               pair = strsep(&rest, " ");
+               rc = pcc_parse_value_pair(cmd, pair);
+               if (rc != 0)
+                       return rc;
+       }
+
+       if (cmd->pccc_cmd == PCC_DEL_DATASET ||
+           cmd->pccc_cmd == PCC_CLEAR_ALL)
+               return 0;
+
+       if (cmd->pccc_cmd != PCC_ADD_DATASET)
+               return -EINVAL;
+
+       if ((cmd->u.pccc_add.pccc_flags & PCC_DATASET_RWPCC) &&
+           (cmd->u.pccc_add.pccc_flags & PCC_DATASET_ROPCC))
+               return -EINVAL;
+
+       /*
+        * By default, a PCC backend can provide caching service for
+        * both RW-PCC and RO-PCC.
+        */
+       if (!(cmd->u.pccc_add.pccc_flags & PCC_DATASET_PCC_ALL))
+               cmd->u.pccc_add.pccc_flags |= PCC_DATASET_PCC_ALL;
+
+       return 0;
+}
+
+/*
+ * Release a dataset rule: its parsed conditions (if any) and its copy of
+ * the rule string, which must be present (asserted).
+ */
+static void
+pcc_dataset_rule_fini(struct pcc_match_rule *rule)
+{
+       struct list_head *conds = &rule->pmr_conds;
+
+       if (!list_empty(conds))
+               pcc_rule_conds_free(conds);
+
+       LASSERT(rule->pmr_conds_str != NULL);
+       OBD_FREE(rule->pmr_conds_str, strlen(rule->pmr_conds_str) + 1);
+}
+
+/*
+ * Initialise @rule from an add-dataset command: copy the command's rule
+ * string and, when the command carries parsed conditions, parse the copy
+ * again into the rule's own condition list.  On parse failure the rule is
+ * torn down again via pcc_dataset_rule_fini().
+ *
+ * Returns 0 on success, -ENOMEM if the copy cannot be allocated, or the
+ * error reported by pcc_conds_parse().
+ */
+static int
+pcc_dataset_rule_init(struct pcc_match_rule *rule, struct pcc_cmd *cmd)
+{
+       size_t str_len;
+       int rc;
+
+       LASSERT(cmd->u.pccc_add.pccc_conds_str);
+       str_len = strlen(cmd->u.pccc_add.pccc_conds_str);
+
+       OBD_ALLOC(rule->pmr_conds_str, str_len + 1);
+       if (!rule->pmr_conds_str)
+               return -ENOMEM;
+
+       memcpy(rule->pmr_conds_str, cmd->u.pccc_add.pccc_conds_str, str_len);
+
+       INIT_LIST_HEAD(&rule->pmr_conds);
+       if (list_empty(&cmd->u.pccc_add.pccc_conds))
+               return 0;
+
+       rc = pcc_conds_parse(rule->pmr_conds_str,
+                            strlen(rule->pmr_conds_str),
+                            &rule->pmr_conds);
+       if (rc != 0)
+               pcc_dataset_rule_fini(rule);
+
+       return rc;
+}
+
+/* Rule Matching */
+/* Return 1 if @id_val equals any id queued on @id_list, otherwise 0. */
+static int
+pcc_id_list_match(struct list_head *id_list, __u32 id_val)
+{
+       struct pcc_match_id *entry;
+       int found = 0;
+
+       list_for_each_entry(entry, id_list, pmi_linkage) {
+               if (entry->pmi_id != id_val)
+                       continue;
+               found = 1;
+               break;
+       }
+
+       return found;
+}
+
+/*
+ * Match the NUL-terminated string @content against @pattern, in which
+ * every '*' stands for any run of characters (including none) and every
+ * other character matches only itself.
+ *
+ * The match is anchored at both ends.  It is done iteratively with a
+ * single backtrack point, so it needs no recursion and runs in at most
+ * O(strlen(pattern) * strlen(content)) steps.  A '*' in @pattern is always
+ * treated as a wildcard, even when @content has a '*' at the same spot,
+ * and consecutive '*' behave like a single one.
+ *
+ * Returns true when the whole of @content matches the whole of @pattern.
+ */
+static bool
+cfs_match_wildcard(const char *pattern, const char *content)
+{
+       const char *star = NULL;        /* most recent '*' in the pattern */
+       const char *resume = NULL;      /* content position that '*' covers */
+
+       while (*content != '\0') {
+               if (*pattern == '*') {
+                       /* first let the '*' match nothing at all */
+                       star = pattern++;
+                       resume = content;
+               } else if (*pattern == *content) {
+                       pattern++;
+                       content++;
+               } else if (star != NULL) {
+                       /* mismatch: let the last '*' eat one more char */
+                       pattern = star + 1;
+                       content = ++resume;
+               } else {
+                       return false;
+               }
+       }
+
+       /* content is used up: only trailing '*' may remain */
+       while (*pattern == '*')
+               pattern++;
+
+       return *pattern == '\0';
+}
+
+/*
+ * Return 1 if @name matches any wildcard pattern queued on @fname_list,
+ * otherwise 0.
+ */
+static int
+pcc_fname_list_match(struct list_head *fname_list, const char *name)
+{
+       struct pcc_match_fname *entry;
+       int found = 0;
+
+       list_for_each_entry(entry, fname_list, pmf_linkage) {
+               if (!cfs_match_wildcard(entry->pmf_name, name))
+                       continue;
+               found = 1;
+               break;
+       }
+
+       return found;
+}
+
+/*
+ * Evaluate one expression against @matcher: the matcher value selected by
+ * the expression's field is looked up in the expression's condition list.
+ * Returns 1 on a match, 0 otherwise (including for an unknown field).
+ */
+static int
+pcc_expression_match(struct pcc_expression *expr, struct pcc_matcher *matcher)
+{
+       __u32 wanted;
+
+       switch (expr->pe_field) {
+       case PCC_FIELD_FNAME:
+               return pcc_fname_list_match(&expr->pe_cond,
+                                           matcher->pm_name->name);
+       case PCC_FIELD_UID:
+               wanted = matcher->pm_uid;
+               break;
+       case PCC_FIELD_GID:
+               wanted = matcher->pm_gid;
+               break;
+       case PCC_FIELD_PROJID:
+               wanted = matcher->pm_projid;
+               break;
+       default:
+               return 0;
+       }
+
+       return pcc_id_list_match(&expr->pe_cond, wanted);
+}
+
+/*
+ * Return 1 only if every expression of @conjunction matches @matcher
+ * (an empty conjunction therefore matches), otherwise 0.
+ */
+static int
+pcc_conjunction_match(struct pcc_conjunction *conjunction,
+                     struct pcc_matcher *matcher)
+{
+       struct pcc_expression *cur;
+
+       list_for_each_entry(cur, &conjunction->pc_expressions, pe_linkage) {
+               if (pcc_expression_match(cur, matcher) == 0)
+                       return 0;
+       }
+
+       return 1;
+}
+
+/*
+ * Return 1 if at least one conjunction of @rule matches @matcher,
+ * otherwise 0 (a rule without conjunctions never matches).
+ */
+static int
+pcc_cond_match(struct pcc_match_rule *rule, struct pcc_matcher *matcher)
+{
+       struct pcc_conjunction *cur;
+
+       list_for_each_entry(cur, &rule->pmr_conds, pc_linkage) {
+               if (pcc_conjunction_match(cur, matcher) != 0)
+                       return 1;
+       }
+
+       return 0;
+}
+
+/*
+ * Walk the dataset list under the read lock and pick the first dataset
+ * that has PCC_DATASET_RWPCC set and whose rule matches @matcher.
+ * A reference is taken on the dataset that is returned; NULL is returned
+ * when nothing matches.
+ */
+struct pcc_dataset*
+pcc_dataset_match_get(struct pcc_super *super, struct pcc_matcher *matcher)
+{
+       struct pcc_dataset *found = NULL;
+       struct pcc_dataset *pos;
+
+       down_read(&super->pccs_rw_sem);
+       list_for_each_entry(pos, &super->pccs_datasets, pccd_linkage) {
+               if ((pos->pccd_flags & PCC_DATASET_RWPCC) &&
+                   pcc_cond_match(&pos->pccd_rule, matcher)) {
+                       /* Pin the dataset before the lock is dropped. */
+                       atomic_inc(&pos->pccd_refcount);
+                       found = pos;
+                       break;
+               }
+       }
+       up_read(&super->pccs_rw_sem);
+
+       if (found == NULL)
+               return NULL;
+
+       CDEBUG(D_CACHE, "PCC create, matched %s - %d:%d:%d:%s\n",
+              found->pccd_rule.pmr_conds_str,
+              matcher->pm_uid, matcher->pm_gid,
+              matcher->pm_projid, matcher->pm_name->name);
+
+       return found;
+}
+
+/**
+ * pcc_dataset_add - Add a Cache policy to control which files need be
+ * cached and where it will be cached.
+ *
+ * @super:     superblock of pcc
+ * @cmd:       pcc command
+ *
+ * Return: 0 on success, -ENOMEM if the dataset cannot be allocated,
+ * -EEXIST if a dataset with the same pathname or the same non-zero
+ * rwid/roid is already on the list, or the error returned by kern_path()
+ * or pcc_dataset_rule_init().
+ */
+static int
+pcc_dataset_add(struct pcc_super *super, struct pcc_cmd *cmd)
+{
+       char *pathname = cmd->pccc_pathname;
+       struct pcc_dataset *dataset;
+       struct pcc_dataset *tmp;
+       bool found = false;
+       int rc;
+
+       OBD_ALLOC_PTR(dataset);
+       if (dataset == NULL)
+               return -ENOMEM;
+
+       /* Resolve the cache root; it must be a directory. */
+       rc = kern_path(pathname, LOOKUP_DIRECTORY, &dataset->pccd_path);
+       if (unlikely(rc)) {
+               /* No path reference is held yet, so free directly. */
+               OBD_FREE_PTR(dataset);
+               return rc;
+       }
+       strncpy(dataset->pccd_pathname, pathname, PATH_MAX);
+       dataset->pccd_rwid = cmd->u.pccc_add.pccc_rwid;
+       dataset->pccd_roid = cmd->u.pccc_add.pccc_roid;
+       dataset->pccd_flags = cmd->u.pccc_add.pccc_flags;
+       /* Initial reference; dropped via pcc_dataset_put(). */
+       atomic_set(&dataset->pccd_refcount, 1);
+
+       rc = pcc_dataset_rule_init(&dataset->pccd_rule, cmd);
+       if (rc) {
+               pcc_dataset_put(dataset);
+               return rc;
+       }
+
+       /* Reject duplicates by pathname or by non-zero rwid/roid. */
+       down_write(&super->pccs_rw_sem);
+       list_for_each_entry(tmp, &super->pccs_datasets, pccd_linkage) {
+               if (strcmp(tmp->pccd_pathname, pathname) == 0 ||
+                   (dataset->pccd_rwid != 0 &&
+                    dataset->pccd_rwid == tmp->pccd_rwid) ||
+                   (dataset->pccd_roid != 0 &&
+                    dataset->pccd_roid == tmp->pccd_roid)) {
+                       found = true;
+                       break;
+               }
+       }
+       if (!found)
+               list_add(&dataset->pccd_linkage, &super->pccs_datasets);
+       up_write(&super->pccs_rw_sem);
+
+       if (found) {
+               /* The new dataset was never linked; release it. */
+               pcc_dataset_put(dataset);
+               rc = -EEXIST;
+       }
+
+       return rc;
+}
+
+/*
+ * Look up a dataset by @id for the given PCC @type and return it with an
+ * extra reference taken, or NULL when @id is 0 or nothing is selected.
+ *
+ * NOTE(review): for any @type other than LU_PCC_READWRITE the filter in
+ * the loop never skips an entry, so the first dataset on the list is
+ * returned without comparing @id - confirm this is intended.
+ */
+struct pcc_dataset *
+pcc_dataset_get(struct pcc_super *super, enum lu_pcc_type type, __u32 id)
+{
+       struct pcc_dataset *dataset;
+       struct pcc_dataset *selected = NULL;
+
+       if (id == 0)
+               return NULL;
+
+       /*
+        * archive ID (read-write ID) or read-only ID is unique in the list,
+        * we just return last added one as first priority.
+        */
+       down_read(&super->pccs_rw_sem);
+       list_for_each_entry(dataset, &super->pccs_datasets, pccd_linkage) {
+               /* For RW-PCC, skip entries with another rwid or no RWPCC flag */
+               if (type == LU_PCC_READWRITE && (dataset->pccd_rwid != id ||
+                   !(dataset->pccd_flags & PCC_DATASET_RWPCC)))
+                       continue;
+               atomic_inc(&dataset->pccd_refcount);
+               selected = dataset;
+               break;
+       }
+       up_read(&super->pccs_rw_sem);
+       if (selected)
+               CDEBUG(D_CACHE, "matched id %u, PCC mode %d\n", id, type);
+
+       return selected;
+}
+
+/*
+ * Drop one reference on @dataset.  When the last reference goes away the
+ * rule is torn down, the path reference is released and the dataset is
+ * freed.
+ */
+void
+pcc_dataset_put(struct pcc_dataset *dataset)
+{
+       if (!atomic_dec_and_test(&dataset->pccd_refcount))
+               return;
+
+       pcc_dataset_rule_fini(&dataset->pccd_rule);
+       path_put(&dataset->pccd_path);
+       OBD_FREE_PTR(dataset);
+}
+
+/*
+ * Unlink the dataset whose pathname equals @pathname and drop the
+ * reference it held on the list.  Returns 0 when a dataset was removed
+ * and -ENOENT when no dataset has that pathname.
+ */
+static int
+pcc_dataset_del(struct pcc_super *super, char *pathname)
+{
+       struct pcc_dataset *dataset, *next;
+       int rc = -ENOENT;
+
+       down_write(&super->pccs_rw_sem);
+       list_for_each_entry_safe(dataset, next,
+                                &super->pccs_datasets, pccd_linkage) {
+               if (strcmp(dataset->pccd_pathname, pathname) != 0)
+                       continue;
+
+               list_del_init(&dataset->pccd_linkage);
+               pcc_dataset_put(dataset);
+               rc = 0;
+               break;
+       }
+       up_write(&super->pccs_rw_sem);
+
+       return rc;
+}
+
+/*
+ * Print one dataset to @m: its pathname, followed by the indented rwid,
+ * flags and rule condition string.
+ */
+static void
+pcc_dataset_dump(struct pcc_dataset *dataset, struct seq_file *m)
+{
+       seq_printf(m,
+                  "%s:\n"
+                  "  rwid: %u\n"
+                  "  flags: %x\n"
+                  "  autocache: %s\n",
+                  dataset->pccd_pathname,
+                  dataset->pccd_rwid,
+                  dataset->pccd_flags,
+                  dataset->pccd_rule.pmr_conds_str);
+}
+
+/*
+ * Dump every dataset of @super to @m while holding the dataset list
+ * read lock.  Always returns 0.
+ */
+int
+pcc_super_dump(struct pcc_super *super, struct seq_file *m)
+{
+       struct pcc_dataset *pos;
+
+       down_read(&super->pccs_rw_sem);
+       list_for_each_entry(pos, &super->pccs_datasets, pccd_linkage)
+               pcc_dataset_dump(pos, m);
+       up_read(&super->pccs_rw_sem);
+
+       return 0;
+}
+
+/*
+ * Empty the dataset list of @super under the write lock, dropping the
+ * reference each dataset held while it was linked.
+ */
+static void pcc_remove_datasets(struct pcc_super *super)
+{
+       struct list_head *head = &super->pccs_datasets;
+
+       down_write(&super->pccs_rw_sem);
+       while (!list_empty(head)) {
+               struct pcc_dataset *dataset;
+
+               dataset = list_entry(head->next, struct pcc_dataset,
+                                    pccd_linkage);
+               list_del(&dataset->pccd_linkage);
+               pcc_dataset_put(dataset);
+       }
+       up_write(&super->pccs_rw_sem);
+}
+
+/*
+ * Tear down @super: remove all datasets from its list, then drop the
+ * reference on the credentials stored in pccs_cred.
+ */
+void pcc_super_fini(struct pcc_super *super)
+{
+       pcc_remove_datasets(super);
+       put_cred(super->pccs_cred);
+}
+
+/*
+ * A dataset pathname is accepted when it is non-NULL, non-empty,
+ * shorter than PATH_MAX and absolute (starts with '/').
+ */
+static bool pathname_is_valid(const char *pathname)
+{
+       size_t len;
+
+       if (pathname == NULL)
+               return false;
+
+       len = strlen(pathname);
+       if (len == 0 || len >= PATH_MAX)
+               return false;
+
+       /* Needs to be absolute path */
+       return pathname[0] == '/';
+}
+
+/*
+ * Parse a PCC control command from @buffer into a newly allocated
+ * struct pcc_cmd.
+ *
+ * Accepted forms, as implemented below:
+ *   "clear..."                     - any buffer starting with "clear"
+ *   "del <pathname>"
+ *   "add <pathname> <id list ending in '}'>[ <value pairs>]"
+ *
+ * @buffer is modified in place (tokens are NUL-terminated inside it) and
+ * cmd->pccc_pathname points into @buffer, so @buffer must outlive the
+ * returned command.  @count is currently unused.
+ *
+ * Return: the parsed command, which the caller releases with
+ * pcc_cmd_fini() and OBD_FREE_PTR(), or an ERR_PTR() carrying -ENOMEM,
+ * -EINVAL or the error from pcc_id_parse().
+ */
+static struct pcc_cmd *
+pcc_cmd_parse(char *buffer, unsigned long count)
+{
+       /*
+        * Must be an automatic variable: a function-static pointer would
+        * be shared by concurrent callers, letting one overwrite (and
+        * later free) the command another is still parsing.
+        */
+       struct pcc_cmd *cmd;
+       char *token;
+       char *val;
+       int rc = 0;
+
+       OBD_ALLOC_PTR(cmd);
+       if (cmd == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       /* clear all setting */
+       if (strncmp(buffer, "clear", 5) == 0) {
+               cmd->pccc_cmd = PCC_CLEAR_ALL;
+               GOTO(out, rc = 0);
+       }
+
+       val = buffer;
+       token = strsep(&val, " ");
+       if (val == NULL || strlen(val) == 0)
+               GOTO(out_free_cmd, rc = -EINVAL);
+
+       /* Type of the command */
+       if (strcmp(token, "add") == 0)
+               cmd->pccc_cmd = PCC_ADD_DATASET;
+       else if (strcmp(token, "del") == 0)
+               cmd->pccc_cmd = PCC_DEL_DATASET;
+       else
+               GOTO(out_free_cmd, rc = -EINVAL);
+
+       /* Pathname of the dataset; only "del" may end right after it */
+       token = strsep(&val, " ");
+       if ((val == NULL && cmd->pccc_cmd != PCC_DEL_DATASET) ||
+           !pathname_is_valid(token))
+               GOTO(out_free_cmd, rc = -EINVAL);
+       cmd->pccc_pathname = token;
+
+       if (cmd->pccc_cmd == PCC_ADD_DATASET) {
+               /* List of ID */
+               LASSERT(val);
+               token = val;
+               /* The ID list extends up to the last '}' in the buffer */
+               val = strrchr(token, '}');
+               if (!val)
+                       GOTO(out_free_cmd, rc = -EINVAL);
+
+               /* Skip '}' */
+               val++;
+               if (*val == '\0') {
+                       /* Nothing follows the ID list */
+                       val = NULL;
+               } else if (*val == ' ') {
+                       /* Terminate the ID list; the rest is value pairs */
+                       *val = '\0';
+                       val++;
+               } else {
+                       GOTO(out_free_cmd, rc = -EINVAL);
+               }
+
+               rc = pcc_id_parse(cmd, token);
+               if (rc)
+                       GOTO(out_free_cmd, rc);
+
+               rc = pcc_parse_value_pairs(cmd, val);
+               if (rc)
+                       GOTO(out_cmd_fini, rc = -EINVAL);
+       }
+       goto out;
+out_cmd_fini:
+       pcc_cmd_fini(cmd);
+out_free_cmd:
+       OBD_FREE_PTR(cmd);
+out:
+       if (rc)
+               cmd = ERR_PTR(rc);
+       return cmd;
+}
+
+/*
+ * Parse the command text in @buffer and apply it to @super: add a
+ * dataset, delete one by pathname, or clear them all.  The parsed
+ * command is always released before returning.
+ *
+ * Returns the parse error, the result of the add/delete operation,
+ * 0 for a clear, or -EINVAL for an unknown command type.
+ */
+int pcc_cmd_handle(char *buffer, unsigned long count,
+                  struct pcc_super *super)
+{
+       struct pcc_cmd *cmd = pcc_cmd_parse(buffer, count);
+       int rc;
+
+       if (IS_ERR(cmd))
+               return PTR_ERR(cmd);
+
+       if (cmd->pccc_cmd == PCC_ADD_DATASET) {
+               rc = pcc_dataset_add(super, cmd);
+       } else if (cmd->pccc_cmd == PCC_DEL_DATASET) {
+               rc = pcc_dataset_del(super, cmd->pccc_pathname);
+       } else if (cmd->pccc_cmd == PCC_CLEAR_ALL) {
+               pcc_remove_datasets(super);
+               rc = 0;
+       } else {
+               rc = -EINVAL;
+       }
+
+       pcc_cmd_fini(cmd);
+       OBD_FREE_PTR(cmd);
+
+       return rc;
+}
+
+/* Take the per-inode PCC mutex (lli_pcc_lock) of @inode. */
+static inline void pcc_inode_lock(struct inode *inode)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+
+       mutex_lock(&lli->lli_pcc_lock);
+}
+
+/* Release the per-inode PCC mutex (lli_pcc_lock) of @inode. */
+static inline void pcc_inode_unlock(struct inode *inode)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+
+       mutex_unlock(&lli->lli_pcc_lock);
+}
+
+/*
+ * Bind a fresh @pcci to @lli and reset it to the detached state:
+ * no PCC state flags, zero references, no PCC type, no layout
+ * generation, no active I/O, and an initialised wait queue.
+ */
+static void pcc_inode_init(struct pcc_inode *pcci, struct ll_inode_info *lli)
+{
+       /* Link both directions between the PCC inode and the llite inode */
+       pcci->pcci_lli = lli;
+       lli->lli_pcc_inode = pcci;
+       lli->lli_pcc_state = PCC_STATE_FL_NONE;
+       atomic_set(&pcci->pcci_refcount, 0);
+       pcci->pcci_type = LU_PCC_NONE;
+       pcci->pcci_layout_gen = CL_LAYOUT_GEN_NONE;
+       atomic_set(&pcci->pcci_active_ios, 0);
+       init_waitqueue_head(&pcci->pcci_waitq);
+}
+
+/*
+ * Destroy @pcci: release its path reference, return it to
+ * pcc_inode_slab and clear the back pointer in the owning llite inode.
+ */
+static void pcc_inode_fini(struct pcc_inode *pcci)
+{
+       /* Saved up front because @pcci is freed before lli is updated */
+       struct ll_inode_info *lli = pcci->pcci_lli;
+
+       path_put(&pcci->pcci_path);
+       pcci->pcci_type = LU_PCC_NONE;
+       OBD_SLAB_FREE_PTR(pcci, pcc_inode_slab);
+       lli->lli_pcc_inode = NULL;
+}
+
+/* Take an additional reference on @pcci. */
+static void pcc_inode_get(struct pcc_inode *pcci)
+{
+       atomic_inc(&pcci->pcci_refcount);
+}
+
+/* Drop a reference on @pcci and destroy it when it was the last one. */
+static void pcc_inode_put(struct pcc_inode *pcci)
+{
+       if (!atomic_dec_and_test(&pcci->pcci_refcount))
+               return;
+
+       pcc_inode_fini(pcci);
+}
+
+/*
+ * Drop one reference on the PCC inode attached to @inode, if there is
+ * one.  Warns when more than one reference is still outstanding.
+ */
+void pcc_inode_free(struct inode *inode)
+{
+       struct pcc_inode *pcci = ll_i2pcci(inode);
+
+       if (pcci == NULL)
+               return;
+
+       WARN_ON(atomic_read(&pcci->pcci_refcount) > 1);
+       pcc_inode_put(pcci);
+}
+
+/*
+ * TODO:
+ * As Andreas suggested, we should use a new directory layout to
+ * reduce overhead:
+ * (fid->f_oid >> 16 & 0xFFFF)/FID
+ */
+/* Six "xxxx/" directory components, the FID without braces, and a NUL. */
+#define MAX_PCC_DATABASE_PATH (6 * 5 + FID_NOBRACE_LEN + 1)
+
+/*
+ * Format the dataset-relative path of @fid into @buf (at most @sz bytes):
+ * six directory levels built from 16-bit slices of f_oid and f_seq,
+ * followed by the FID itself.  Returns the snprintf() result.
+ */
+static int pcc_fid2dataset_path(char *buf, int sz, struct lu_fid *fid)
+{
+       return snprintf(buf, sz,
+                       "%04x/%04x/%04x/%04x/%04x/%04x/" DFID_NOBRACE,
+                       fid->f_oid & 0xFFFF,
+                       (fid->f_oid >> 16) & 0xFFFF,
+                       (unsigned int)(fid->f_seq & 0xFFFF),
+                       (unsigned int)((fid->f_seq >> 16) & 0xFFFF),
+                       (unsigned int)((fid->f_seq >> 32) & 0xFFFF),
+                       (unsigned int)((fid->f_seq >> 48) & 0xFFFF),
+                       PFID(fid));
+}
+
+/* Return the credentials stored in the PCC super of superblock @sb. */
+static inline const struct cred *pcc_super_cred(struct super_block *sb)
+{
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+
+       return sbi->ll_pcc_super.pccs_cred;
+}
+
+/* Reset @pccf to "no PCC backing file": NULL file and LU_PCC_NONE type. */
+void pcc_file_init(struct pcc_file *pccf)
+{
+       pccf->pccf_file = NULL;
+       pccf->pccf_type = LU_PCC_NONE;
+}
+
+/* True when @dataset has the PCC_DATASET_OPEN_ATTACH flag set. */
+static inline bool pcc_open_attach_enabled(struct pcc_dataset *dataset)
+{
+       return (dataset->pccd_flags & PCC_DATASET_OPEN_ATTACH) != 0;
+}
+
+/* Name of the xattr on the PCC copy that holds the layout generation. */
+static const char pcc_xattr_layout[] = XATTR_USER_PREFIX "PCC.layout";
+
+/*
+ * Record layout generation @gen in the pcc_xattr_layout extended
+ * attribute of the PCC copy behind @pcci.
+ *
+ * Does nothing and returns 0 unless PCC_STATE_FL_OPEN_ATTACH is set in
+ * the inode's PCC state.  Otherwise returns the result of the setxattr
+ * call, or -ENOTSUPP when the backing inode has no ->setxattr method
+ * (builds without HAVE_VFS_SETXATTR only).
+ */
+static int pcc_layout_xattr_set(struct pcc_inode *pcci, __u32 gen)
+{
+       struct dentry *pcc_dentry = pcci->pcci_path.dentry;
+       struct ll_inode_info *lli = pcci->pcci_lli;
+       int rc;
+
+       ENTRY;
+
+       if (!(lli->lli_pcc_state & PCC_STATE_FL_OPEN_ATTACH))
+               RETURN(0);
+
+#ifndef HAVE_VFS_SETXATTR
+       /* Call the inode operation directly when it exists */
+       if (!pcc_dentry->d_inode->i_op->setxattr)
+               RETURN(-ENOTSUPP);
+
+       rc = pcc_dentry->d_inode->i_op->setxattr(pcc_dentry, pcc_xattr_layout,
+                                                &gen, sizeof(gen), 0);
+#else
+       rc = __vfs_setxattr(pcc_dentry, pcc_dentry->d_inode, pcc_xattr_layout,
+                           &gen, sizeof(gen), 0);
+#endif
+       RETURN(rc);
+}
+
+/*
+ * Fetch the layout description of @inode into @clt through its
+ * cl_object.
+ *
+ * Returns -EINVAL when the inode has no cl_object, the error from
+ * cl_env_get() when no environment can be obtained, and otherwise the
+ * result of cl_object_layout_get().
+ */
+static int pcc_get_layout_info(struct inode *inode, struct cl_layout *clt)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct lu_env *env;
+       __u16 refcheck;
+       int rc;
+
+       ENTRY;
+
+       if (lli->lli_clob == NULL)
+               RETURN(-EINVAL);
+
+       env = cl_env_get(&refcheck);
+       if (IS_ERR(env))
+               RETURN(PTR_ERR(env));
+
+       rc = cl_object_layout_get(env, lli->lli_clob, clt);
+       if (rc != 0)
+               CDEBUG(D_INODE, "Cannot get layout for "DFID"\n",
+                      PFID(ll_inode2fid(inode)));
+
+       /* The environment is released on both success and failure */
+       cl_env_put(env, &refcheck);
+       RETURN(rc);
+}
+
+/*
+ * Format the full path of the PCC copy of @fid inside @dataset into @buf
+ * (at most @sz bytes): the dataset pathname, six directory levels built
+ * from 16-bit slices of f_oid and f_seq, then the FID itself.
+ * Returns the snprintf() result.
+ */
+static int pcc_fid2dataset_fullpath(char *buf, int sz, struct lu_fid *fid,
+                                   struct pcc_dataset *dataset)
+{
+       return snprintf(buf, sz,
+                       "%s/%04x/%04x/%04x/%04x/%04x/%04x/" DFID_NOBRACE,
+                       dataset->pccd_pathname,
+                       fid->f_oid & 0xFFFF,
+                       (fid->f_oid >> 16) & 0xFFFF,
+                       (unsigned int)(fid->f_seq & 0xFFFF),
+                       (unsigned int)((fid->f_seq >> 16) & 0xFFFF),
+                       (unsigned int)((fid->f_seq >> 32) & 0xFFFF),
+                       (unsigned int)((fid->f_seq >> 48) & 0xFFFF),
+                       PFID(fid));
+}
+
+/*
+ * Mark @pcci as attached to @dataset: take a reference on the dataset's
+ * mount, store @dentry as the PCC copy (no dentry reference is taken
+ * here), set the first reference and the PCC @type, and invalidate
+ * cached attributes.  When the dataset enables open-attach, the flag is
+ * also recorded in the llite inode's PCC state.
+ *
+ * Must be called with pcci->pcci_lock held
+ */
+static void pcc_inode_attach_init(struct pcc_dataset *dataset,
+                                 struct pcc_inode *pcci,
+                                 struct dentry *dentry,
+                                 enum lu_pcc_type type)
+{
+       pcci->pcci_path.mnt = mntget(dataset->pccd_path.mnt);
+       pcci->pcci_path.dentry = dentry;
+       /* Only a not-yet-referenced PCC inode may be attached */
+       LASSERT(atomic_read(&pcci->pcci_refcount) == 0);
+       atomic_set(&pcci->pcci_refcount, 1);
+       pcci->pcci_type = type;
+       pcci->pcci_attr_valid = false;
+
+       if (pcc_open_attach_enabled(dataset)) {
+               struct ll_inode_info *lli = pcci->pcci_lli;
+
+               lli->lli_pcc_state |= PCC_STATE_FL_OPEN_ATTACH;
+       }
+}
+
+/* Record @gen as the layout generation of @pcci. */
+static inline void pcc_layout_gen_set(struct pcc_inode *pcci,
+                                     __u32 gen)
+{
+       pcci->pcci_layout_gen = gen;
+}
+
+/* True when @pcci carries a layout generation other than CL_LAYOUT_GEN_NONE. */
+static inline bool pcc_inode_has_layout(struct pcc_inode *pcci)
+{
+       return pcci->pcci_layout_gen != CL_LAYOUT_GEN_NONE;
+}
+
+/*
+ * Try to re-attach @inode to an existing PCC copy inside @dataset.
+ *
+ * The copy is looked up by FID under the PCC super credentials and its
+ * pcc_xattr_layout xattr is compared with layout generation @gen.  On a
+ * match the PCC inode is created (or re-referenced when one already
+ * exists), its layout generation is set and *@cached becomes true.
+ *
+ * A missing copy, a missing or unreadable xattr, a truncated xattr or a
+ * generation mismatch are not errors: 0 is returned and *@cached is left
+ * untouched.  Returns -ENOMEM on allocation failure.
+ */
+static int pcc_try_dataset_attach(struct inode *inode, __u32 gen,
+                                 enum lu_pcc_type type,
+                                 struct pcc_dataset *dataset,
+                                 bool *cached)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct pcc_inode *pcci = lli->lli_pcc_inode;
+       const struct cred *old_cred;
+       struct dentry *pcc_dentry;
+       struct path path;
+       char *pathname;
+       __u32 pcc_gen;
+       int rc;
+
+       ENTRY;
+
+       if (type == LU_PCC_READWRITE &&
+           !(dataset->pccd_flags & PCC_DATASET_RWPCC))
+               RETURN(0);
+
+       OBD_ALLOC(pathname, PATH_MAX);
+       if (pathname == NULL)
+               RETURN(-ENOMEM);
+
+       pcc_fid2dataset_fullpath(pathname, PATH_MAX, &lli->lli_fid, dataset);
+
+       old_cred = override_creds(pcc_super_cred(inode->i_sb));
+       rc = kern_path(pathname, LOOKUP_FOLLOW, &path);
+       if (rc)
+               /* ignore this error */
+               GOTO(out, rc = 0);
+
+       pcc_dentry = path.dentry;
+#ifndef HAVE_VFS_SETXATTR
+       if (!pcc_dentry->d_inode->i_op->getxattr)
+               /* ignore this error */
+               GOTO(out_put_path, rc = 0);
+
+       rc = pcc_dentry->d_inode->i_op->getxattr(pcc_dentry, pcc_xattr_layout,
+                                                &pcc_gen, sizeof(pcc_gen));
+#else
+       rc = __vfs_getxattr(pcc_dentry, pcc_dentry->d_inode, pcc_xattr_layout,
+                           &pcc_gen, sizeof(pcc_gen));
+#endif
+
+       /*
+        * Ignore errors, and also ignore an xattr shorter than the
+        * generation: pcc_gen would then be (partly) uninitialized and
+        * must not be compared with @gen.
+        */
+       if (rc < (int)sizeof(pcc_gen))
+               GOTO(out_put_path, rc = 0);
+
+       rc = 0;
+       /* The file is still valid cached in PCC, attach it immediately. */
+       if (pcc_gen == gen) {
+               CDEBUG(D_CACHE, DFID" L.Gen (%d) consistent, auto attached.\n",
+                      PFID(&lli->lli_fid), gen);
+               if (!pcci) {
+                       OBD_SLAB_ALLOC_PTR_GFP(pcci, pcc_inode_slab, GFP_NOFS);
+                       if (pcci == NULL)
+                               GOTO(out_put_path, rc = -ENOMEM);
+
+                       pcc_inode_init(pcci, lli);
+                       /* Extra dentry reference owned by the PCC inode */
+                       dget(pcc_dentry);
+                       pcc_inode_attach_init(dataset, pcci, pcc_dentry, type);
+               } else {
+                       /*
+                        * This happened when a file was once attached into
+                        * PCC, and some processes keep this file opened
+                        * (pcci->refcount > 1) and corresponding PCC file
+                        * without any I/O activity, and then this file was
+                        * detached by the manual detach command or the
+                        * revocation of the layout lock (i.e. cached LRU lock
+                        * shrinking).
+                        */
+                       pcc_inode_get(pcci);
+                       pcci->pcci_type = type;
+               }
+               pcc_layout_gen_set(pcci, gen);
+               *cached = true;
+       }
+out_put_path:
+       path_put(&path);
+out:
+       revert_creds(old_cred);
+       OBD_FREE(pathname, PATH_MAX);
+       RETURN(rc);
+}
+
+/*
+ * Offer @inode to every dataset that has open-attach enabled, in list
+ * order, while holding the dataset list read lock.  The walk stops at
+ * the first error or as soon as one dataset sets *@cached.
+ * Returns the result of the last attach attempt (0 if none was made).
+ */
+static int pcc_try_datasets_attach(struct inode *inode, __u32 gen,
+                                  enum lu_pcc_type type, bool *cached)
+{
+       struct pcc_super *super = &ll_i2sbi(inode)->ll_pcc_super;
+       struct pcc_dataset *dataset;
+       int rc = 0;
+
+       ENTRY;
+
+       down_read(&super->pccs_rw_sem);
+       list_for_each_entry(dataset, &super->pccs_datasets, pccd_linkage) {
+               if (pcc_open_attach_enabled(dataset)) {
+                       rc = pcc_try_dataset_attach(inode, gen, type,
+                                                   dataset, cached);
+                       if (rc < 0 || (rc == 0 && *cached))
+                               break;
+               }
+       }
+       up_read(&super->pccs_rw_sem);
+
+       RETURN(rc);
+}
+
+/*
+ * At open time, try to attach @inode to a PCC copy that is still valid.
+ * An attach is only attempted when datasets exist, the inode has a
+ * layout version, and the layout reports the file as released.
+ * Returns 0 when nothing was attempted, otherwise the error from the
+ * layout query or the result of pcc_try_datasets_attach().
+ */
+static int pcc_try_open_attach(struct inode *inode, bool *cached)
+{
+       struct pcc_super *super = &ll_i2sbi(inode)->ll_pcc_super;
+       struct cl_layout clt = {
+               .cl_layout_gen = 0,
+               .cl_is_released = false,
+       };
+       int rc;
+
+       ENTRY;
+
+       /* Quick check whether there is PCC device. */
+       if (list_empty(&super->pccs_datasets))
+               RETURN(0);
+
+       /*
+        * The file layout lock was cancelled. And this open does not
+        * obtain valid layout lock from MDT (i.e. the file is being
+        * HSM restoring).
+        */
+       if (ll_layout_version_get(ll_i2info(inode)) == CL_LAYOUT_GEN_NONE)
+               RETURN(0);
+
+       rc = pcc_get_layout_info(inode, &clt);
+       if (rc != 0)
+               RETURN(rc);
+
+       if (!clt.cl_is_released)
+               RETURN(0);
+
+       rc = pcc_try_datasets_attach(inode, clt.cl_layout_gen,
+                                    LU_PCC_READWRITE, cached);
+       RETURN(rc);
+}
+
+/*
+ * Open the PCC copy that backs @inode, if there is one, and remember it
+ * in the PCC file of @file's private data.
+ *
+ * Non-regular files and inodes in the PCC_STATE_FL_ATTACHING state are
+ * left alone (0 is returned).  When the inode has no PCC inode with a
+ * valid layout, an open-time attach is tried first.  On success the PCC
+ * inode keeps one extra reference for this open; it is dropped again if
+ * the open of the PCC copy fails.
+ *
+ * Returns 0 on success or when the file is not cached, a negative errno
+ * otherwise.
+ */
+int pcc_file_open(struct inode *inode, struct file *file)
+{
+       struct pcc_inode *pcci;
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+       struct pcc_file *pccf = &fd->fd_pcc_file;
+       struct file *pcc_file;
+       struct path *path;
+       struct qstr *dname;
+       bool cached = false;
+       int rc = 0;
+
+       ENTRY;
+
+       if (!S_ISREG(inode->i_mode))
+               RETURN(0);
+
+       pcc_inode_lock(inode);
+       pcci = ll_i2pcci(inode);
+
+       if (lli->lli_pcc_state & PCC_STATE_FL_ATTACHING)
+               GOTO(out_unlock, rc = 0);
+
+       if (!pcci || !pcc_inode_has_layout(pcci)) {
+               rc = pcc_try_open_attach(inode, &cached);
+               if (rc < 0 || !cached)
+                       GOTO(out_unlock, rc);
+
+               /* The attach may have just created the PCC inode */
+               if (!pcci)
+                       pcci = ll_i2pcci(inode);
+       }
+
+       /* Reference held for the lifetime of this open file */
+       pcc_inode_get(pcci);
+       WARN_ON(pccf->pccf_file);
+
+       path = &pcci->pcci_path;
+       dname = &path->dentry->d_name;
+       CDEBUG(D_CACHE, "opening pcc file '%.*s'\n", dname->len,
+              dname->name);
+
+#ifdef HAVE_DENTRY_OPEN_USE_PATH
+       pcc_file = dentry_open(path, file->f_flags,
+                              pcc_super_cred(inode->i_sb));
+#else
+       pcc_file = dentry_open(path->dentry, path->mnt, file->f_flags,
+                              pcc_super_cred(inode->i_sb));
+#endif
+       if (IS_ERR_OR_NULL(pcc_file)) {
+               rc = pcc_file == NULL ? -EINVAL : PTR_ERR(pcc_file);
+               pcc_inode_put(pcci);
+       } else {
+               pccf->pccf_file = pcc_file;
+               pccf->pccf_type = pcci->pcci_type;
+       }
+
+out_unlock:
+       pcc_inode_unlock(inode);
+       RETURN(rc);
+}
+
+/*
+ * Undo pcc_file_open() for @file: drop the PCC inode reference taken at
+ * open time and release the PCC backing file.  Does nothing for
+ * non-regular files, files without private data, or files that have no
+ * PCC backing file.
+ */
+void pcc_file_release(struct inode *inode, struct file *file)
+{
+       struct pcc_inode *pcci;
+       struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+       struct pcc_file *pccf;
+       struct path *path;
+       struct qstr *dname;
+
+       ENTRY;
+
+       if (!S_ISREG(inode->i_mode) || fd == NULL)
+               RETURN_EXIT;
+
+       pccf = &fd->fd_pcc_file;
+       pcc_inode_lock(inode);
+       if (pccf->pccf_file == NULL)
+               goto out;
+
+       pcci = ll_i2pcci(inode);
+       /* A backing file implies the PCC inode still exists */
+       LASSERT(pcci);
+       path = &pcci->pcci_path;
+       dname = &path->dentry->d_name;
+       CDEBUG(D_CACHE, "releasing pcc file \"%.*s\"\n", dname->len,
+              dname->name);
+       pcc_inode_put(pcci);
+       fput(pccf->pccf_file);
+       pccf->pccf_file = NULL;
+out:
+       pcc_inode_unlock(inode);
+       RETURN_EXIT;
+}
+
+/*
+ * Begin an I/O on @inode.  Under the PCC inode lock, *@cached is set to
+ * true and the active I/O counter is raised when the inode has a PCC
+ * inode with a valid layout; otherwise *@cached is set to false.
+ * A successful call must be paired with pcc_io_fini().
+ */
+static void pcc_io_init(struct inode *inode, bool *cached)
+{
+       struct pcc_inode *pcci;
+       bool usable;
+
+       pcc_inode_lock(inode);
+       pcci = ll_i2pcci(inode);
+       usable = pcci != NULL && pcc_inode_has_layout(pcci);
+       if (usable) {
+               LASSERT(atomic_read(&pcci->pcci_refcount) > 0);
+               atomic_inc(&pcci->pcci_active_ios);
+       }
+       *cached = usable;
+       pcc_inode_unlock(inode);
+}
+
+/*
+ * End an I/O started by pcc_io_init(): lower the active I/O counter and
+ * wake everyone sleeping on pcci_waitq once it reaches zero.
+ */
+static void pcc_io_fini(struct inode *inode)
+{
+       struct pcc_inode *pcci = ll_i2pcci(inode);
+
+       LASSERT(pcci && atomic_read(&pcci->pcci_active_ios) > 0);
+       if (!atomic_dec_and_test(&pcci->pcci_active_ios))
+               return;
+
+       wake_up_all(&pcci->pcci_waitq);
+}
+
+
+/*
+ * Read through the file currently set in @iocb->ki_filp.
+ *
+ * With HAVE_FILE_OPERATIONS_READ_WRITE_ITER the file's ->read_iter is
+ * called directly.  Otherwise each iovec segment of @iter is passed to
+ * ->aio_read in turn; the loop stops on an error, on end of data, or on
+ * a short read, and @iter is advanced by the bytes read.
+ *
+ * Returns the number of bytes read, or the first result (<= 0) when
+ * nothing was read.
+ */
+static ssize_t
+__pcc_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+       struct file *file = iocb->ki_filp;
+
+#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
+       return file->f_op->read_iter(iocb, iter);
+#else
+       struct iovec iov;
+       struct iov_iter i;
+       ssize_t bytes = 0;
+
+       iov_for_each(iov, i, *iter) {
+               ssize_t res;
+
+               res = file->f_op->aio_read(iocb, &iov, 1, iocb->ki_pos);
+               /* Wait for a queued request to complete */
+               if (-EIOCBQUEUED == res)
+                       res = wait_on_sync_kiocb(iocb);
+               if (res <= 0) {
+                       /* Report the error only if nothing was read yet */
+                       if (bytes == 0)
+                               bytes = res;
+                       break;
+               }
+
+               bytes += res;
+               /* Short read: do not continue with the next segment */
+               if (res < iov.iov_len)
+                       break;
+       }
+
+       if (bytes > 0)
+               iov_iter_advance(iter, bytes);
+       return bytes;
+#endif
+}
+
+/*
+ * Serve a read from the PCC copy when the file is cached.
+ *
+ * *@cached tells the caller whether the read was handled here; when it
+ * is false the return value is 0.  While reading, @iocb->ki_filp is
+ * temporarily switched to the PCC backing file and restored afterwards.
+ */
+ssize_t pcc_file_read_iter(struct kiocb *iocb,
+                          struct iov_iter *iter, bool *cached)
+{
+       struct file *file = iocb->ki_filp;
+       struct inode *inode = file_inode(file);
+       struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+       struct pcc_file *pccf = &fd->fd_pcc_file;
+       ssize_t nread;
+
+       ENTRY;
+
+       if (!pccf->pccf_file) {
+               *cached = false;
+               RETURN(0);
+       }
+
+       pcc_io_init(inode, cached);
+       if (*cached == false)
+               RETURN(0);
+
+       /* generic_file_aio_read does not support ext4-dax,
+        * __pcc_file_read_iter uses ->aio_read hook directly
+        * to add support for ext4-dax.
+        */
+       iocb->ki_filp = pccf->pccf_file;
+       nread = __pcc_file_read_iter(iocb, iter);
+       iocb->ki_filp = file;
+
+       pcc_io_fini(inode);
+       RETURN(nread);
+}
+
+/*
+ * Write through the file currently set in @iocb->ki_filp.
+ *
+ * With HAVE_FILE_OPERATIONS_READ_WRITE_ITER the file's ->write_iter is
+ * called directly.  Otherwise each iovec segment of @iter is passed to
+ * ->aio_write in turn; the loop stops on an error, a zero-length result,
+ * or a short write, and @iter is advanced by the bytes written.
+ *
+ * Returns the number of bytes written, or the first result (<= 0) when
+ * nothing was written.
+ */
+static ssize_t
+__pcc_file_write_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+       struct file *file = iocb->ki_filp;
+
+#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
+       return file->f_op->write_iter(iocb, iter);
+#else
+       struct iovec iov;
+       struct iov_iter i;
+       ssize_t bytes = 0;
+
+       iov_for_each(iov, i, *iter) {
+               ssize_t res;
+
+               res = file->f_op->aio_write(iocb, &iov, 1, iocb->ki_pos);
+               /* Wait for a queued request to complete */
+               if (-EIOCBQUEUED == res)
+                       res = wait_on_sync_kiocb(iocb);
+               if (res <= 0) {
+                       /* Report the error only if nothing was written yet */
+                       if (bytes == 0)
+                               bytes = res;
+                       break;
+               }
+
+               bytes += res;
+               /* Short write: do not continue with the next segment */
+               if (res < iov.iov_len)
+                       break;
+       }
+
+       if (bytes > 0)
+               iov_iter_advance(iter, bytes);
+       return bytes;
+#endif
+}
+
+/*
+ * Serve a write from the PCC copy when the file is cached read-write.
+ *
+ * *@cached tells the caller whether the write was handled here.  When
+ * there is no PCC backing file, or the inode is no longer cached, 0 is
+ * returned with *@cached false.  When the backing file is not of type
+ * LU_PCC_READWRITE, -EAGAIN is returned with *@cached false.
+ * While writing, @iocb->ki_filp is temporarily switched to the PCC
+ * backing file and restored afterwards.
+ */
+ssize_t pcc_file_write_iter(struct kiocb *iocb,
+                           struct iov_iter *iter, bool *cached)
+{
+       struct file *file = iocb->ki_filp;
+       struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+       struct pcc_file *pccf = &fd->fd_pcc_file;
+       struct inode *inode = file_inode(file);
+       ssize_t result;
+
+       ENTRY;
+
+       if (pccf->pccf_file == NULL) {
+               *cached = false;
+               RETURN(0);
+       }
+
+       if (pccf->pccf_type != LU_PCC_READWRITE) {
+               *cached = false;
+               RETURN(-EAGAIN);
+       }
+
+       pcc_io_init(inode, cached);
+       if (!*cached)
+               RETURN(0);
+
+       /* Fault injection: report -ENOSPC without touching the PCC copy */
+       if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_PCC_FAKE_ERROR))
+               GOTO(out, result = -ENOSPC);
+
+       iocb->ki_filp = pccf->pccf_file;
+
+       /* Since __pcc_file_write_iter makes write calls via
+        * the normal vfs interface to the local PCC file system,
+        * the inode lock is not needed.
+        */
+       result = __pcc_file_write_iter(iocb, iter);
+       iocb->ki_filp = file;
+out:
+       pcc_io_fini(inode);
+       RETURN(result);
+}
+
+/*
+ * Apply @attr to the PCC copy of @inode when the file is cached.
+ *
+ * Only size, time and owner/group bits of ia_valid are passed on; all
+ * other bits are masked out of the private copy of @attr.  The call is
+ * made with the PCC inode's lock held and under the PCC super
+ * credentials.
+ *
+ * *@cached is false (and 0 is returned) for non-regular files and for
+ * files that are not cached; otherwise the result of the backing
+ * inode's ->setattr is returned.
+ */
+int pcc_inode_setattr(struct inode *inode, struct iattr *attr,
+                     bool *cached)
+{
+       int rc;
+       const struct cred *old_cred;
+       struct iattr attr2 = *attr;
+       struct dentry *pcc_dentry;
+       struct pcc_inode *pcci;
+
+       ENTRY;
+
+       if (!S_ISREG(inode->i_mode)) {
+               *cached = false;
+               RETURN(0);
+       }
+
+       pcc_io_init(inode, cached);
+       if (!*cached)
+               RETURN(0);
+
+       /* Forward only the attribute kinds listed here */
+       attr2.ia_valid = attr->ia_valid & (ATTR_SIZE | ATTR_ATIME |
+                        ATTR_ATIME_SET | ATTR_MTIME | ATTR_MTIME_SET |
+                        ATTR_CTIME | ATTR_UID | ATTR_GID);
+       pcci = ll_i2pcci(inode);
+       pcc_dentry = pcci->pcci_path.dentry;
+       inode_lock(pcc_dentry->d_inode);
+       old_cred = override_creds(pcc_super_cred(inode->i_sb));
+       rc = pcc_dentry->d_inode->i_op->setattr(pcc_dentry, &attr2);
+       revert_creds(old_cred);
+       inode_unlock(pcc_dentry->d_inode);
+
+       pcc_io_fini(inode);
+       RETURN(rc);
+}
+
+/*
+ * Refresh size, block count and timestamps of @inode from its PCC copy
+ * when the file is cached.
+ *
+ * The PCC copy is stat'ed under the PCC super credentials.  Under the
+ * inode size lock, the inode's times are first reloaded from the llite
+ * inode info, then each of atime/mtime/ctime is set to the later of that
+ * value and the PCC copy's value; size and blocks are taken from the
+ * PCC copy.
+ *
+ * *@cached is false (and 0 is returned) for non-regular files and for
+ * files that are not cached; otherwise 0 or the getattr error is
+ * returned.
+ */
+int pcc_inode_getattr(struct inode *inode, bool *cached)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       const struct cred *old_cred;
+       struct kstat stat;
+       s64 atime;
+       s64 mtime;
+       s64 ctime;
+       int rc;
+
+       ENTRY;
+
+       if (!S_ISREG(inode->i_mode)) {
+               *cached = false;
+               RETURN(0);
+       }
+
+       pcc_io_init(inode, cached);
+       if (!*cached)
+               RETURN(0);
+
+       old_cred = override_creds(pcc_super_cred(inode->i_sb));
+       rc = ll_vfs_getattr(&ll_i2pcci(inode)->pcci_path, &stat);
+       revert_creds(old_cred);
+       if (rc)
+               GOTO(out, rc);
+
+       ll_inode_size_lock(inode);
+       /* Take atime from lli when it is newer or an update is pending */
+       if (inode->i_atime.tv_sec < lli->lli_atime ||
+           lli->lli_update_atime) {
+               inode->i_atime.tv_sec = lli->lli_atime;
+               lli->lli_update_atime = 0;
+       }
+       inode->i_mtime.tv_sec = lli->lli_mtime;
+       inode->i_ctime.tv_sec = lli->lli_ctime;
+
+       atime = inode->i_atime.tv_sec;
+       mtime = inode->i_mtime.tv_sec;
+       ctime = inode->i_ctime.tv_sec;
+
+       /* Keep the later of the inode's and the PCC copy's timestamps */
+       if (atime < stat.atime.tv_sec)
+               atime = stat.atime.tv_sec;
+
+       if (ctime < stat.ctime.tv_sec)
+               ctime = stat.ctime.tv_sec;
+
+       if (mtime < stat.mtime.tv_sec)
+               mtime = stat.mtime.tv_sec;
+
+       i_size_write(inode, stat.size);
+       inode->i_blocks = stat.blocks;
+
+       inode->i_atime.tv_sec = atime;
+       inode->i_mtime.tv_sec = mtime;
+       inode->i_ctime.tv_sec = ctime;
+
+       ll_inode_size_unlock(inode);
+out:
+       pcc_io_fini(inode);
+       RETURN(rc);
+}
+
+/*
+ * Serve a splice read from the PCC copy when the file is cached.
+ *
+ * *@cached tells the caller whether the request was handled here.
+ * Returns 0 with *@cached false when there is no PCC backing file or the
+ * inode is not cached, -ENOTSUPP (also with *@cached false) when the
+ * backing file has no ->splice_read, and otherwise the result of the
+ * backing file's ->splice_read.
+ */
+ssize_t pcc_file_splice_read(struct file *in_file, loff_t *ppos,
+                            struct pipe_inode_info *pipe,
+                            size_t count, unsigned int flags,
+                            bool *cached)
+{
+       struct ll_file_data *fd = LUSTRE_FPRIVATE(in_file);
+       struct file *pcc_file = fd->fd_pcc_file.pccf_file;
+       struct inode *inode = file_inode(in_file);
+       ssize_t nbytes;
+
+       ENTRY;
+
+       *cached = false;
+       if (pcc_file == NULL)
+               RETURN(0);
+
+       if (file_inode(pcc_file)->i_fop->splice_read == NULL)
+               RETURN(-ENOTSUPP);
+
+       pcc_io_init(inode, cached);
+       if (*cached == false)
+               RETURN(0);
+
+       nbytes = file_inode(pcc_file)->i_fop->splice_read(pcc_file, ppos,
+                                                         pipe, count, flags);
+
+       pcc_io_fini(inode);
+       RETURN(nbytes);
+}
+
+/*
+ * Forward an fsync to the PCC copy when the file is cached.
+ *
+ * *@cached tells the caller whether the request was handled here; when
+ * it is false, 0 is returned.  Otherwise the result of the backing
+ * file's ->fsync is returned.  @start and @end are only used when the
+ * kernel's ->fsync takes a range (HAVE_FILE_FSYNC_4ARGS).
+ */
+int pcc_fsync(struct file *file, loff_t start, loff_t end,
+             int datasync, bool *cached)
+{
+       struct inode *inode = file_inode(file);
+       struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+       struct file *pcc_file = fd->fd_pcc_file.pccf_file;
+       int rc;
+
+       ENTRY;
+
+       if (!pcc_file) {
+               *cached = false;
+               RETURN(0);
+       }
+
+       pcc_io_init(inode, cached);
+       if (!*cached)
+               RETURN(0);
+
+#ifdef HAVE_FILE_FSYNC_4ARGS
+       rc = file_inode(pcc_file)->i_fop->fsync(pcc_file,
+                                               start, end, datasync);
+#elif defined(HAVE_FILE_FSYNC_2ARGS)
+       rc = file_inode(pcc_file)->i_fop->fsync(pcc_file, datasync);
+#else
+       /* Three-argument ->fsync: pass the dentry of the PCC file itself */
+       rc = file_inode(pcc_file)->i_fop->fsync(pcc_file,
+                               file_dentry(pcc_file), datasync);
+#endif
+
+       pcc_io_fini(inode);
+       RETURN(rc);
+}
+
+/*
+ * mmap() @file through its PCC backend copy when the inode currently has a
+ * PCC layout.
+ *
+ * *cached is true on return when the backend's ->mmap() was invoked; the
+ * return value is then the backend's result. Otherwise 0 is returned with
+ * *cached set to false.
+ */
+int pcc_file_mmap(struct file *file, struct vm_area_struct *vma,
+                 bool *cached)
+{
+       struct ll_file_data *lfd = LUSTRE_FPRIVATE(file);
+       struct file *backend = lfd->fd_pcc_file.pccf_file;
+       struct inode *lu_inode = file_inode(file);
+       struct pcc_inode *pcci;
+       int rc = 0;
+
+       ENTRY;
+
+       *cached = false;
+       if (backend == NULL || file_inode(backend)->i_fop->mmap == NULL)
+               RETURN(0);
+
+       pcc_inode_lock(lu_inode);
+       pcci = ll_i2pcci(lu_inode);
+       if (pcci != NULL && pcc_inode_has_layout(pcci)) {
+               LASSERT(atomic_read(&pcci->pcci_refcount) > 1);
+               *cached = true;
+               /* Present the backend file to the backend's ->mmap() */
+               vma->vm_file = backend;
+               rc = file_inode(backend)->i_fop->mmap(backend, vma);
+               vma->vm_file = file;
+               /* Keep the backend vm ops around for the pcc_vm_*() hooks */
+               vma->vm_private_data = (void *)vma->vm_ops;
+       }
+       pcc_inode_unlock(lu_inode);
+
+       RETURN(rc);
+}
+
+/*
+ * vm_ops->open() wrapper: forward to the open() hook of the backend vm ops
+ * saved in vma->vm_private_data, but only while the inode still has a PCC
+ * layout.
+ */
+void pcc_vm_open(struct vm_area_struct *vma)
+{
+       struct vm_operations_struct *backend_ops = vma->vm_private_data;
+       struct file *lu_file = vma->vm_file;
+       struct ll_file_data *lfd = LUSTRE_FPRIVATE(lu_file);
+       struct file *backend = lfd->fd_pcc_file.pccf_file;
+       struct inode *lu_inode = file_inode(lu_file);
+       struct pcc_inode *pcci;
+
+       ENTRY;
+
+       if (backend == NULL || backend_ops == NULL)
+               RETURN_EXIT;
+       if (backend_ops->open == NULL)
+               RETURN_EXIT;
+
+       pcc_inode_lock(lu_inode);
+       pcci = ll_i2pcci(lu_inode);
+       if (pcci != NULL && pcc_inode_has_layout(pcci)) {
+               /* Swap in the backend file for the call, then restore it */
+               vma->vm_file = backend;
+               backend_ops->open(vma);
+               vma->vm_file = lu_file;
+       }
+       pcc_inode_unlock(lu_inode);
+       EXIT;
+}
+
+/*
+ * vm_ops->close() wrapper: forward to the close() hook of the backend vm ops
+ * saved in vma->vm_private_data. Unlike pcc_vm_open(), the PCC layout is not
+ * checked here.
+ */
+void pcc_vm_close(struct vm_area_struct *vma)
+{
+       struct vm_operations_struct *backend_ops = vma->vm_private_data;
+       struct file *lu_file = vma->vm_file;
+       struct ll_file_data *lfd = LUSTRE_FPRIVATE(lu_file);
+       struct file *backend = lfd->fd_pcc_file.pccf_file;
+       struct inode *lu_inode = file_inode(lu_file);
+
+       ENTRY;
+
+       if (backend == NULL || backend_ops == NULL)
+               RETURN_EXIT;
+       if (backend_ops->close == NULL)
+               RETURN_EXIT;
+
+       pcc_inode_lock(lu_inode);
+       /* Layout lock maybe revoked here */
+       vma->vm_file = backend;
+       backend_ops->close(vma);
+       vma->vm_file = lu_file;
+       pcc_inode_unlock(lu_inode);
+       EXIT;
+}
+
+/*
+ * ->page_mkwrite() wrapper for a vma whose backend vm ops were saved in
+ * vma->vm_private_data.
+ *
+ * Return value: 0 with *cached == false means the fault was not handled
+ * here. Otherwise the value is either the result of the backend's
+ * ->page_mkwrite(), or VM_FAULT_RETRY | VM_FAULT_NOPAGE, which is returned
+ * after mm->mmap_sem has been released with up_read().
+ */
+int pcc_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
+                    bool *cached)
+{
+       struct page *page = vmf->page;
+       struct mm_struct *mm = vma->vm_mm;
+       struct file *file = vma->vm_file;
+       struct inode *inode = file_inode(file);
+       struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+       struct file *pcc_file = fd->fd_pcc_file.pccf_file;
+       struct vm_operations_struct *pcc_vm_ops = vma->vm_private_data;
+       int rc;
+
+       ENTRY;
+
+       if (!pcc_file || !pcc_vm_ops) {
+               *cached = false;
+               RETURN(0);
+       }
+
+       /*
+        * The fault page belongs to the PCC copy but the backend cannot make
+        * it writable: detach the file from PCC and have the fault retried.
+        */
+       if (!pcc_vm_ops->page_mkwrite &&
+           page->mapping == pcc_file->f_mapping) {
+               CDEBUG(D_MMAP,
+                      "%s: PCC backend fs not support ->page_mkwrite()\n",
+                      ll_i2sbi(inode)->ll_fsname);
+               pcc_ioctl_detach(inode, PCC_DETACH_OPT_NONE);
+               up_read(&mm->mmap_sem);
+               *cached = true;
+               RETURN(VM_FAULT_RETRY | VM_FAULT_NOPAGE);
+       }
+       /* Pause to allow for a race with concurrent detach */
+       OBD_FAIL_TIMEOUT(OBD_FAIL_LLITE_PCC_MKWRITE_PAUSE, cfs_fail_val);
+
+       pcc_io_init(inode, cached);
+       if (!*cached) {
+               /* This happens when the file is detached from PCC after got
+                * the fault page via ->fault() on the inode of the PCC copy.
+                * Here it can not simply fall back to normal Lustre I/O path.
+                * The reason is that the address space of fault page used by
+                * ->page_mkwrite() is still the one of PCC inode. In the
+                * normal Lustre ->page_mkwrite() I/O path, it will be wrongly
+                * handled as the address space of the fault page is not
+                * consistent with the one of the Lustre inode (though the
+                * fault page was truncated).
+                * As the file is detached from PCC, the fault page must
+                * be released first, and the mmap write (->fault() and
+                * ->page_mkwrite) retried.
+                * We use an ugly and tricky method by returning
+                * VM_FAULT_NOPAGE | VM_FAULT_RETRY to the caller
+                * __do_page_fault and retry the memory fault handling.
+                */
+               if (page->mapping == pcc_file->f_mapping) {
+                       *cached = true;
+                       up_read(&mm->mmap_sem);
+                       RETURN(VM_FAULT_RETRY | VM_FAULT_NOPAGE);
+               }
+
+               RETURN(0);
+       }
+
+       /*
+        * This fault injection can also be used to simulate -ENOSPC and
+        * -EDQUOT failure of underlying PCC backend fs.
+        */
+       if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_PCC_DETACH_MKWRITE)) {
+               pcc_io_fini(inode);
+               pcc_ioctl_detach(inode, PCC_DETACH_OPT_NONE);
+               up_read(&mm->mmap_sem);
+               RETURN(VM_FAULT_RETRY | VM_FAULT_NOPAGE);
+       }
+
+       /* Swap in the backend file for the backend call, then restore it */
+       vma->vm_file = pcc_file;
+#ifdef HAVE_VM_OPS_USE_VM_FAULT_ONLY
+       rc = pcc_vm_ops->page_mkwrite(vmf);
+#else
+       rc = pcc_vm_ops->page_mkwrite(vma, vmf);
+#endif
+       vma->vm_file = file;
+
+       pcc_io_fini(inode);
+       RETURN(rc);
+}
+
+int pcc_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+             bool *cached)
+{
+       struct file *file = vma->vm_file;
+       struct inode *inode = file_inode(file);
+       struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+       struct file *pcc_file = fd->fd_pcc_file.pccf_file;
+       struct vm_operations_struct *pcc_vm_ops = vma->vm_private_data;
+       int rc;
+
+       ENTRY;
+
+       if (!pcc_file || !pcc_vm_ops || !pcc_vm_ops->fault) {
+               *cached = false;
+               RETURN(0);
+       }
+
+       pcc_io_init(inode, cached);
+       if (!*cached)
+               RETURN(0);
+
+       vma->vm_file = pcc_file;
+#ifdef HAVE_VM_OPS_USE_VM_FAULT_ONLY
+       rc = pcc_vm_ops->fault(vmf);
+#else
+       rc = pcc_vm_ops->fault(vma, vmf);
+#endif
+       vma->vm_file = file;
+
+       pcc_io_fini(inode);
+       RETURN(rc);
+}
+
+/* Block until no PCC I/O is active on @pcci (pcci_active_ios drops to 0). */
+static void pcc_layout_wait(struct pcc_inode *pcci)
+{
+       struct l_wait_info lwi = { 0 };
+       atomic_t *active = &pcci->pcci_active_ios;
+
+       for (;;) {
+               if (atomic_read(active) <= 0)
+                       break;
+
+               CDEBUG(D_CACHE, "Waiting for IO completion: %d\n",
+                      atomic_read(active));
+               l_wait_event(pcci->pcci_waitq, atomic_read(active) == 0,
+                            &lwi);
+       }
+}
+
+/*
+ * Mark @pcci as no longer having a PCC copy: reset its type to LU_PCC_NONE
+ * and its layout generation to CL_LAYOUT_GEN_NONE, then wait until all
+ * active PCC I/O on it has finished. Both callers visible in this file hold
+ * the PCC inode lock around this call.
+ */
+static void __pcc_layout_invalidate(struct pcc_inode *pcci)
+{
+       pcci->pcci_type = LU_PCC_NONE;
+       pcc_layout_gen_set(pcci, CL_LAYOUT_GEN_NONE);
+       pcc_layout_wait(pcci);
+}
+
+/*
+ * Invalidate the PCC layout of @inode, if it has one, and drop one reference
+ * on its PCC inode. Does nothing when the inode has no PCC layout.
+ */
+void pcc_layout_invalidate(struct inode *inode)
+{
+       struct pcc_inode *pcci;
+
+       ENTRY;
+
+       pcc_inode_lock(inode);
+       pcci = ll_i2pcci(inode);
+       if (pcci == NULL || !pcc_inode_has_layout(pcci))
+               goto unlock;
+
+       LASSERT(atomic_read(&pcci->pcci_refcount) > 0);
+       __pcc_layout_invalidate(pcci);
+
+       /* Logged after invalidation, so this shows the reset generation */
+       CDEBUG(D_CACHE, "Invalidate "DFID" layout gen %d\n",
+              PFID(&ll_i2info(inode)->lli_fid), pcci->pcci_layout_gen);
+
+       pcc_inode_put(pcci);
+unlock:
+       pcc_inode_unlock(inode);
+
+       EXIT;
+}
+
+/*
+ * Unlink the PCC backend file @pcc_dentry from its parent directory.
+ * A failure is logged with CWARN() and returned to the caller.
+ */
+static int pcc_inode_remove(struct inode *inode, struct dentry *pcc_dentry)
+{
+       struct inode *parent_dir = pcc_dentry->d_parent->d_inode;
+       int rc = ll_vfs_unlink(parent_dir, pcc_dentry);
+
+       if (rc == 0)
+               return 0;
+
+       CWARN("%s: failed to unlink PCC file %.*s, rc = %d\n",
+             ll_i2sbi(inode)->ll_fsname, pcc_dentry->d_name.len,
+             pcc_dentry->d_name.name, rc);
+       return rc;
+}
+
+/*
+ * Look up @name under @base and create it as a directory if it does not
+ * exist yet; an already existing entry is returned as is. Returns the
+ * dentry from the lookup, or an ERR_PTR() when lookup or mkdir fails.
+ */
+static struct dentry *
+pcc_mkdir(struct dentry *base, const char *name, umode_t mode)
+{
+       struct inode *parent_inode = base->d_inode;
+       struct dentry *entry;
+       int err;
+
+       inode_lock(parent_inode);
+       entry = lookup_one_len(name, base, strlen(name));
+       if (!IS_ERR(entry) && !d_is_positive(entry)) {
+               /* Negative dentry: the directory has to be created */
+               err = vfs_mkdir(parent_inode, entry, mode);
+               if (err != 0) {
+                       dput(entry);
+                       entry = ERR_PTR(err);
+               }
+       }
+       inode_unlock(parent_inode);
+
+       return entry;
+}
+
+/*
+ * Create, below @root, every directory component of @path that is followed
+ * by a '/'. Leading slashes are skipped and whatever follows the last '/' is
+ * not treated as a directory.
+ *
+ * @path is modified temporarily while walking (each '/' is replaced by a
+ * NUL and then restored).
+ *
+ * Returns the dentry of the deepest directory (the caller must dput() it),
+ * the ERR_PTR() from the failing pcc_mkdir(), or ERR_PTR(-EINVAL) when @path
+ * contains no directory component at all.
+ */
+static struct dentry *
+pcc_mkdir_p(struct dentry *root, char *path, umode_t mode)
+{
+       char *ptr, *entry_name;
+       struct dentry *parent;
+       struct dentry *child = ERR_PTR(-EINVAL);
+
+       ptr = path;
+       while (*ptr == '/')
+               ptr++;
+
+       /*
+        * Nothing to create: bail out before taking a reference on @root,
+        * otherwise that reference would never be dropped.
+        */
+       if (strchr(ptr, '/') == NULL)
+               return child;
+
+       entry_name = ptr;
+       parent = dget(root);
+       while ((ptr = strchr(ptr, '/')) != NULL) {
+               *ptr = '\0';
+               child = pcc_mkdir(parent, entry_name, mode);
+               *ptr = '/';
+               /* The reference on the previous level is no longer needed */
+               dput(parent);
+               if (IS_ERR(child))
+                       break;
+
+               parent = child;
+               ptr++;
+               entry_name = ptr;
+       }
+
+       return child;
+}
+
+/*
+ * Create file @name under @base.
+ *
+ * NOTE(review): if an entry with that name already exists (positive dentry),
+ * the code below returns that existing dentry rather than an error, so the
+ * caller cannot tell "created" from "already there" by the return value.
+ * Returns an ERR_PTR() when the lookup or vfs_create() fails.
+ */
+static struct dentry *
+pcc_create(struct dentry *base, const char *name, umode_t mode)
+{
+       int rc;
+       struct dentry *dentry;
+       struct inode *dir = base->d_inode;
+
+       inode_lock(dir);
+       dentry = lookup_one_len(name, base, strlen(name));
+       if (IS_ERR(dentry))
+               goto out;
+
+       /* Existing entry: hand it back unchanged */
+       if (d_is_positive(dentry))
+               goto out;
+
+       rc = vfs_create(dir, dentry, mode, LL_VFS_CREATE_FALSE);
+       if (rc) {
+               dput(dentry);
+               dentry = ERR_PTR(rc);
+               goto out;
+       }
+out:
+       inode_unlock(dir);
+       return dentry;
+}
+
+/*
+ * Create the PCC backend file for @fid inside @dataset: first the directory
+ * chain produced by pcc_fid2dataset_path(), then a file named after the FID.
+ * On success *dentry receives the dentry of that file. Returns 0 or a
+ * negative errno.
+ */
+static int __pcc_inode_create(struct pcc_dataset *dataset,
+                             struct lu_fid *fid,
+                             struct dentry **dentry)
+{
+       struct dentry *parent_dir;
+       struct dentry *cache_file;
+       char *name_buf;
+       int rc = 0;
+
+       OBD_ALLOC(name_buf, MAX_PCC_DATABASE_PATH);
+       if (!name_buf)
+               return -ENOMEM;
+
+       pcc_fid2dataset_path(name_buf, MAX_PCC_DATABASE_PATH, fid);
+
+       parent_dir = pcc_mkdir_p(dataset->pccd_path.dentry, name_buf, 0);
+       if (IS_ERR(parent_dir)) {
+               rc = PTR_ERR(parent_dir);
+               GOTO(out_free, rc);
+       }
+
+       /* The same buffer is reused for the file name */
+       snprintf(name_buf, MAX_PCC_DATABASE_PATH, DFID_NOBRACE, PFID(fid));
+       cache_file = pcc_create(parent_dir, name_buf, 0);
+       if (IS_ERR(cache_file)) {
+               rc = PTR_ERR(cache_file);
+               GOTO(out_parent, rc);
+       }
+       *dentry = cache_file;
+
+out_parent:
+       dput(parent_dir);
+out_free:
+       OBD_FREE(name_buf, MAX_PCC_DATABASE_PATH);
+       return rc;
+}
+
+/*
+ * Set the owner (@uid) and group (@gid) of the PCC copy behind @dentry.
+ * TODO: Set the project ID for PCC copy
+ */
+int pcc_inode_store_ugpid(struct dentry *dentry, kuid_t uid, kgid_t gid)
+{
+       struct inode *pcc_inode = dentry->d_inode;
+       struct iattr owner = {
+               .ia_valid = ATTR_UID | ATTR_GID,
+               .ia_uid = uid,
+               .ia_gid = gid,
+       };
+       int rc;
+
+       ENTRY;
+
+       inode_lock(pcc_inode);
+       rc = notify_change(dentry, &owner, NULL);
+       inode_unlock(pcc_inode);
+
+       RETURN(rc);
+}
+
+/*
+ * Create the PCC backend file for @fid in @dataset while running with the
+ * credentials returned by pcc_super_cred(@sb). See __pcc_inode_create().
+ */
+int pcc_inode_create(struct super_block *sb, struct pcc_dataset *dataset,
+                    struct lu_fid *fid, struct dentry **pcc_dentry)
+{
+       const struct cred *saved_cred = override_creds(pcc_super_cred(sb));
+       int rc = __pcc_inode_create(dataset, fid, pcc_dentry);
+
+       revert_creds(saved_cred);
+
+       return rc;
+}
+
+/*
+ * Second half of creating a file directly in PCC: allocate the pcc_inode for
+ * @inode, give the backend file @pcc_dentry the caller's saved uid/gid,
+ * attach it as LU_PCC_READWRITE and record layout generation 0.
+ *
+ * Runs under the PCC inode lock and with the pcc_super_cred() credentials.
+ * On failure the backend file is unlinked. Returns 0 or a negative errno.
+ */
+int pcc_inode_create_fini(struct pcc_dataset *dataset, struct inode *inode,
+                         struct dentry *pcc_dentry)
+{
+       const struct cred *old_cred;
+       struct pcc_inode *pcci;
+       int rc = 0;
+
+       ENTRY;
+
+       old_cred = override_creds(pcc_super_cred(inode->i_sb));
+       pcc_inode_lock(inode);
+       LASSERT(ll_i2pcci(inode) == NULL);
+       OBD_SLAB_ALLOC_PTR_GFP(pcci, pcc_inode_slab, GFP_NOFS);
+       if (pcci == NULL)
+               GOTO(out_put, rc = -ENOMEM);
+
+       /* old_cred is the caller's original credential set */
+       rc = pcc_inode_store_ugpid(pcc_dentry, old_cred->suid,
+                                  old_cred->sgid);
+       if (rc)
+               GOTO(out_put, rc);
+
+       pcc_inode_init(pcci, ll_i2info(inode));
+       pcc_inode_attach_init(dataset, pcci, pcc_dentry, LU_PCC_READWRITE);
+
+       rc = pcc_layout_xattr_set(pcci, 0);
+       if (rc) {
+               /*
+                * After attach_init the cleanup goes through pcc_inode_put()
+                * and skips out_put - presumably because the put releases
+                * the dentry and the pcci itself; TODO confirm.
+                */
+               (void) pcc_inode_remove(inode, pcci->pcci_path.dentry);
+               pcc_inode_put(pcci);
+               GOTO(out_unlock, rc);
+       }
+
+       /* Set the layout generation of newly created file with 0 */
+       pcc_layout_gen_set(pcci, 0);
+
+out_put:
+       /* Failure before the pcci was attached: undo everything by hand */
+       if (rc) {
+               (void) pcc_inode_remove(inode, pcc_dentry);
+               dput(pcc_dentry);
+
+               if (pcci)
+                       OBD_SLAB_FREE_PTR(pcci, pcc_inode_slab);
+       }
+out_unlock:
+       pcc_inode_unlock(inode);
+       revert_creds(old_cred);
+       RETURN(rc);
+}
+
+/*
+ * Write all @count bytes of @buf to @filp at *@offset, repeating vfs_write()
+ * after short writes. Returns 0 on success or the negative vfs_write() error.
+ */
+static int pcc_filp_write(struct file *filp, const void *buf, ssize_t count,
+                         loff_t *offset)
+{
+       const char *cursor = buf;
+       ssize_t remaining = count;
+
+       while (remaining > 0) {
+               ssize_t written = vfs_write(filp,
+                                           (const void __user *)cursor,
+                                           remaining, offset);
+
+               if (written < 0)
+                       return written;
+
+               cursor += written;
+               remaining -= written;
+       }
+       return 0;
+}
+
+/*
+ * Copy the whole content of @src into @dst, starting at offset 0 in both
+ * files, through a 1 MiB bounce buffer. The copy runs with the address limit
+ * set to KERNEL_DS so that vfs_read()/vfs_write() accept the kernel buffer.
+ * Returns 0 on success or a negative errno.
+ */
+static int pcc_copy_data(struct file *src, struct file *dst)
+{
+       size_t chunk = 1048576;
+       loff_t copied = 0;
+       loff_t rd_pos, wr_pos;
+       mm_segment_t saved_fs;
+       ssize_t nread;
+       void *chunk_buf;
+       int rc = 0;
+
+       ENTRY;
+
+       OBD_ALLOC_LARGE(chunk_buf, chunk);
+       if (!chunk_buf)
+               RETURN(-ENOMEM);
+
+       saved_fs = get_fs();
+       set_fs(KERNEL_DS);
+       for (;;) {
+               rd_pos = copied;
+               nread = vfs_read(src, (void __user *)chunk_buf, chunk,
+                                &rd_pos);
+               if (nread < 0)
+                       GOTO(out_fs, rc = nread);
+               if (nread == 0)
+                       break;  /* end of source file */
+
+               wr_pos = copied;
+               rc = pcc_filp_write(dst, chunk_buf, nread, &wr_pos);
+               if (rc < 0)
+                       GOTO(out_fs, rc);
+               copied += nread;
+       }
+
+out_fs:
+       set_fs(saved_fs);
+       OBD_FREE_LARGE(chunk_buf, chunk);
+       RETURN(rc);
+}
+
+/*
+ * Check whether @inode may be attached to PCC now and, if so, claim the
+ * attach by setting PCC_STATE_FL_ATTACHING.
+ *
+ * Returns 0 when the flag was set, -EBUSY when another attach is already in
+ * progress, or -EEXIST when the inode already has a PCC layout.
+ */
+static int pcc_attach_allowed_check(struct inode *inode)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct pcc_inode *cur;
+       int rc = 0;
+
+       ENTRY;
+
+       pcc_inode_lock(inode);
+       if ((lli->lli_pcc_state & PCC_STATE_FL_ATTACHING) != 0)
+               GOTO(out_unlock, rc = -EBUSY);
+
+       cur = ll_i2pcci(inode);
+       if (cur != NULL && pcc_inode_has_layout(cur))
+               GOTO(out_unlock, rc = -EEXIST);
+
+       lli->lli_pcc_state |= PCC_STATE_FL_ATTACHING;
+out_unlock:
+       pcc_inode_unlock(inode);
+       RETURN(rc);
+}
+
+/*
+ * First phase of attaching @inode to a read-write PCC backend:
+ *  - claim the attach (PCC_STATE_FL_ATTACHING) via pcc_attach_allowed_check()
+ *  - look up the dataset for @archive_id
+ *  - create and open the backend file with the pcc_super_cred() credentials
+ *  - give the backend file the caller's uid/gid
+ *  - copy the data of @file into it
+ *  - allocate the pcc_inode and attach it as LU_PCC_READWRITE
+ *
+ * Returns 0 or a negative errno. On failure after the backend file was
+ * created, that file is unlinked again.
+ *
+ * NOTE(review): PCC_STATE_FL_ATTACHING is not cleared on any path in this
+ * function; pcc_readwrite_attach_fini() clears it. Confirm that callers
+ * always call the fini function, including after an error return from here.
+ */
+int pcc_readwrite_attach(struct file *file, struct inode *inode,
+                        __u32 archive_id)
+{
+       struct pcc_dataset *dataset;
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct pcc_inode *pcci;
+       const struct cred *old_cred;
+       struct dentry *dentry;
+       struct file *pcc_filp;
+       struct path path;
+       int rc;
+
+       ENTRY;
+
+       rc = pcc_attach_allowed_check(inode);
+       if (rc)
+               RETURN(rc);
+
+       dataset = pcc_dataset_get(&ll_i2sbi(inode)->ll_pcc_super,
+                                 LU_PCC_READWRITE, archive_id);
+       if (dataset == NULL)
+               RETURN(-ENOENT);
+
+       /* Backend file system operations run with the PCC super creds */
+       old_cred = override_creds(pcc_super_cred(inode->i_sb));
+       rc = __pcc_inode_create(dataset, &lli->lli_fid, &dentry);
+       if (rc) {
+               revert_creds(old_cred);
+               GOTO(out_dataset_put, rc);
+       }
+
+       path.mnt = dataset->pccd_path.mnt;
+       path.dentry = dentry;
+#ifdef HAVE_DENTRY_OPEN_USE_PATH
+       pcc_filp = dentry_open(&path, O_TRUNC | O_WRONLY | O_LARGEFILE,
+                              current_cred());
+#else
+       pcc_filp = dentry_open(path.dentry, path.mnt,
+                              O_TRUNC | O_WRONLY | O_LARGEFILE,
+                              current_cred());
+#endif
+       if (IS_ERR_OR_NULL(pcc_filp)) {
+               rc = pcc_filp == NULL ? -EINVAL : PTR_ERR(pcc_filp);
+               revert_creds(old_cred);
+               GOTO(out_dentry, rc);
+       }
+
+       /* old_cred is the caller's original credential set */
+       rc = pcc_inode_store_ugpid(dentry, old_cred->uid, old_cred->gid);
+       revert_creds(old_cred);
+       if (rc)
+               GOTO(out_fput, rc);
+
+       rc = pcc_copy_data(file, pcc_filp);
+       if (rc)
+               GOTO(out_fput, rc);
+
+       /* Pause to allow for a race with concurrent HSM remove */
+       OBD_FAIL_TIMEOUT(OBD_FAIL_LLITE_PCC_ATTACH_PAUSE, cfs_fail_val);
+
+       pcc_inode_lock(inode);
+       pcci = ll_i2pcci(inode);
+       LASSERT(!pcci);
+       OBD_SLAB_ALLOC_PTR_GFP(pcci, pcc_inode_slab, GFP_NOFS);
+       if (pcci == NULL)
+               GOTO(out_unlock, rc = -ENOMEM);
+
+       pcc_inode_init(pcci, lli);
+       pcc_inode_attach_init(dataset, pcci, dentry, LU_PCC_READWRITE);
+out_unlock:
+       pcc_inode_unlock(inode);
+out_fput:
+       fput(pcc_filp);
+out_dentry:
+       /* On failure remove the backend file, again with the PCC creds */
+       if (rc) {
+               old_cred = override_creds(pcc_super_cred(inode->i_sb));
+               (void) pcc_inode_remove(inode, dentry);
+               revert_creds(old_cred);
+               dput(dentry);
+       }
+out_dataset_put:
+       pcc_dataset_put(dataset);
+       RETURN(rc);
+}
+
+/*
+ * Second phase of a read-write PCC attach.
+ *
+ * Always clears PCC_STATE_FL_ATTACHING. When the first phase failed (@rc) or
+ * @lease_broken is set, the reference on an already attached pcc_inode is
+ * dropped and @rc is returned unchanged. Otherwise layout generation @gen is
+ * stored in the backend file's xattr and, if a layout refresh still reports
+ * @gen, set on the pcc_inode; a changed generation yields -ESTALE.
+ *
+ * Runs under the PCC inode lock with the pcc_super_cred() credentials.
+ */
+int pcc_readwrite_attach_fini(struct file *file, struct inode *inode,
+                             __u32 gen, bool lease_broken, int rc,
+                             bool attached)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       const struct cred *old_cred;
+       struct pcc_inode *pcci;
+       __u32 gen2;
+
+       ENTRY;
+
+       old_cred = override_creds(pcc_super_cred(inode->i_sb));
+       pcc_inode_lock(inode);
+       pcci = ll_i2pcci(inode);
+       lli->lli_pcc_state &= ~PCC_STATE_FL_ATTACHING;
+       if (rc || lease_broken) {
+               if (attached && pcci)
+                       pcc_inode_put(pcci);
+
+               GOTO(out_unlock, rc);
+       }
+
+       /* PCC inode may be released due to layout lock revocation */
+       if (!pcci)
+               GOTO(out_unlock, rc = -ESTALE);
+
+       LASSERT(attached);
+       rc = pcc_layout_xattr_set(pcci, gen);
+       if (rc)
+               GOTO(out_put, rc);
+
+       /* Only adopt @gen if the layout did not change in the meantime */
+       rc = ll_layout_refresh(inode, &gen2);
+       if (!rc) {
+               if (gen2 == gen) {
+                       pcc_layout_gen_set(pcci, gen);
+               } else {
+                       CDEBUG(D_CACHE,
+                              DFID" layout changed from %d to %d.\n",
+                              PFID(ll_inode2fid(inode)), gen, gen2);
+                       GOTO(out_put, rc = -ESTALE);
+               }
+       }
+
+out_put:
+       /* Any failure past this point discards the backend copy */
+       if (rc) {
+               (void) pcc_inode_remove(inode, pcci->pcci_path.dentry);
+               pcc_inode_put(pcci);
+       }
+out_unlock:
+       pcc_inode_unlock(inode);
+       revert_creds(old_cred);
+       RETURN(rc);
+}
+
+/*
+ * Restore the layout of @inode and then submit an HSM REMOVE request
+ * (LL_IOC_HSM_REQUEST on the MD export) for the inode's FID, covering the
+ * whole file extent. Returns 0 or a negative errno.
+ */
+static int pcc_hsm_remove(struct inode *inode)
+{
+       struct lu_fid *fid = &ll_i2info(inode)->lli_fid;
+       int len = sizeof(struct hsm_user_request) +
+                 sizeof(struct hsm_user_item);
+       struct hsm_user_request *hur;
+       struct hsm_user_item *item;
+       __u32 gen;
+       int rc;
+
+       ENTRY;
+
+       rc = ll_layout_restore(inode, 0, OBD_OBJECT_EOF);
+       if (rc != 0) {
+               CDEBUG(D_CACHE, DFID" RESTORE failure: %d\n",
+                      PFID(fid), rc);
+               RETURN(rc);
+       }
+
+       /* The result of the refresh is not checked here */
+       ll_layout_refresh(inode, &gen);
+
+       OBD_ALLOC(hur, len);
+       if (!hur)
+               RETURN(-ENOMEM);
+
+       /* Request header: one REMOVE item */
+       hur->hur_request.hr_action = HUA_REMOVE;
+       hur->hur_request.hr_archive_id = 0;
+       hur->hur_request.hr_flags = 0;
+       hur->hur_request.hr_itemcount = 1;
+
+       /* The single item: this file, from offset 0 to EOF */
+       item = &hur->hur_user_item[0];
+       memcpy(&item->hui_fid, fid, sizeof(item->hui_fid));
+       item->hui_extent.offset = 0;
+       item->hui_extent.length = OBD_OBJECT_EOF;
+
+       rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,
+                          len, hur, NULL);
+       if (rc != 0)
+               CDEBUG(D_CACHE, DFID" HSM REMOVE failure: %d\n",
+                      PFID(fid), rc);
+
+       OBD_FREE(hur, len);
+       RETURN(rc);
+}
+
+/*
+ * Detach @inode from PCC.
+ *
+ * Nothing is done (and 0 is returned) when the inode has no PCC inode, an
+ * attach is in progress, or there is no PCC layout. For a read-write PCC
+ * inode the layout is invalidated and a reference dropped; with
+ * @opt == PCC_DETACH_OPT_UNCACHE an HSM remove is issued afterwards, outside
+ * the PCC inode lock, and its result is returned.
+ */
+int pcc_ioctl_detach(struct inode *inode, __u32 opt)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       const struct cred *saved_cred;
+       struct pcc_inode *pcci;
+       bool need_hsm_remove = false;
+       int rc = 0;
+
+       ENTRY;
+
+       pcc_inode_lock(inode);
+       pcci = lli->lli_pcc_inode;
+       if (pcci == NULL || (lli->lli_pcc_state & PCC_STATE_FL_ATTACHING) ||
+           !pcc_inode_has_layout(pcci))
+               GOTO(out_unlock, rc = 0);
+
+       LASSERT(atomic_read(&pcci->pcci_refcount) > 0);
+
+       if (pcci->pcci_type == LU_PCC_READWRITE) {
+               need_hsm_remove = (opt == PCC_DETACH_OPT_UNCACHE);
+
+               __pcc_layout_invalidate(pcci);
+               pcc_inode_put(pcci);
+       }
+
+out_unlock:
+       pcc_inode_unlock(inode);
+       if (!need_hsm_remove)
+               RETURN(rc);
+
+       saved_cred = override_creds(pcc_super_cred(inode->i_sb));
+       rc = pcc_hsm_remove(inode);
+       revert_creds(saved_cred);
+
+       RETURN(rc);
+}
+
+/*
+ * Fill @state with the PCC state of @inode as seen through @file: PCC type,
+ * open count, state flags and the path of the backend file.
+ *
+ * Returns 0, -ENOMEM, -ENAMETOOLONG when the path does not fit into
+ * state->pccs_path, or the error from dentry_path_raw().
+ *
+ * NOTE(review): when the inode has no pcc_inode only pccs_type is written;
+ * the other fields keep whatever the caller put there.
+ */
+int pcc_ioctl_state(struct file *file, struct inode *inode,
+                   struct lu_pcc_state *state)
+{
+       int rc = 0;
+       int count;
+       char *buf;
+       char *path;
+       int buf_len = sizeof(state->pccs_path);
+       struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+       struct pcc_file *pccf = &fd->fd_pcc_file;
+       struct pcc_inode *pcci;
+
+       ENTRY;
+
+       if (buf_len <= 0)
+               RETURN(-EINVAL);
+
+       /* Scratch buffer in which dentry_path_raw() builds the path */
+       OBD_ALLOC(buf, buf_len);
+       if (buf == NULL)
+               RETURN(-ENOMEM);
+
+       pcc_inode_lock(inode);
+       pcci = ll_i2pcci(inode);
+       if (pcci == NULL) {
+               state->pccs_type = LU_PCC_NONE;
+               GOTO(out_unlock, rc = 0);
+       }
+
+       count = atomic_read(&pcci->pcci_refcount);
+       if (count == 0) {
+               state->pccs_type = LU_PCC_NONE;
+               state->pccs_open_count = 0;
+               GOTO(out_unlock, rc = 0);
+       }
+
+       /*
+        * Do not report the reference accounted to a valid layout, nor the
+        * one accounted to this very open file, as "open count".
+        */
+       if (pcc_inode_has_layout(pcci))
+               count--;
+       if (pccf->pccf_file != NULL)
+               count--;
+       state->pccs_type = pcci->pcci_type;
+       state->pccs_open_count = count;
+       state->pccs_flags = ll_i2info(inode)->lli_pcc_state;
+#ifdef HAVE_DENTRY_PATH_RAW
+       path = dentry_path_raw(pcci->pcci_path.dentry, buf, buf_len);
+       if (IS_ERR(path))
+               GOTO(out_unlock, rc = PTR_ERR(path));
+#else
+       path = "UNKNOWN";
+#endif
+
+       /* strlcpy() returns the source length; >= buf_len means truncated */
+       if (strlcpy(state->pccs_path, path, buf_len) >= buf_len)
+               GOTO(out_unlock, rc = -ENAMETOOLONG);
+
+out_unlock:
+       pcc_inode_unlock(inode);
+       OBD_FREE(buf, buf_len);
+       RETURN(rc);
+}
diff --git a/lustre/llite/pcc.h b/lustre/llite/pcc.h
new file mode 100644 (file)
index 0000000..d2c8512
--- /dev/null
@@ -0,0 +1,222 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2017, DDN Storage Corporation.
+ */
+/*
+ *
+ * Persistent Client Cache
+ *
+ * Author: Li Xi <lixi@ddn.com>
+ */
+
+#ifndef LLITE_PCC_H
+#define LLITE_PCC_H
+
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/seq_file.h>
+#include <linux/mm.h>
+#include <uapi/linux/lustre/lustre_user.h>
+
+extern struct kmem_cache *pcc_inode_slab;
+
+#define LPROCFS_WR_PCC_MAX_CMD 4096
+
+/* User/Group/Project ID */
+struct pcc_match_id {
+       /* Numeric ID value */
+       __u32                   pmi_id;
+       /* List linkage; presumably on pcc_expression::pe_cond - TODO confirm */
+       struct list_head        pmi_linkage;
+};
+
+/* wildcard file name */
+struct pcc_match_fname {
+       /* File name pattern string */
+       char                    *pmf_name;
+       /* List linkage; presumably on pcc_expression::pe_cond - TODO confirm */
+       struct list_head         pmf_linkage;
+};
+
+/* Attribute selected by a pcc_expression (see pcc_expression::pe_field) */
+enum pcc_field {
+       PCC_FIELD_UID,
+       PCC_FIELD_GID,
+       PCC_FIELD_PROJID,
+       PCC_FIELD_FNAME,
+       /* Number of fields above; not a field itself */
+       PCC_FIELD_MAX
+};
+
+/* One expression of a match rule: a field plus a list of conditions */
+struct pcc_expression {
+       /* Which attribute this expression refers to */
+       enum pcc_field          pe_field;
+       /*
+        * List of conditions; presumably pcc_match_id or pcc_match_fname
+        * entries depending on pe_field - TODO confirm
+        */
+       struct list_head        pe_cond;
+       /* Presumably links into pcc_conjunction::pc_expressions - confirm */
+       struct list_head        pe_linkage;
+};
+
+/* A group of expressions combined by logical conjunction */
+struct pcc_conjunction {
+       /* link to disjunction */
+       struct list_head        pc_linkage;
+       /* list of logical conjunction */
+       struct list_head        pc_expressions;
+};
+
+/**
+ * Match rule for auto PCC-cached files.
+ */
+struct pcc_match_rule {
+       /* Conditions in string form */
+       char                    *pmr_conds_str;
+       /* Parsed conditions; presumably pcc_conjunction entries - confirm */
+       struct list_head         pmr_conds;
+};
+
+/*
+ * User/group/project IDs and name that are matched against dataset rules;
+ * this is the input of pcc_dataset_match_get().
+ */
+struct pcc_matcher {
+       __u32            pm_uid;
+       __u32            pm_gid;
+       __u32            pm_projid;
+       struct qstr     *pm_name;
+};
+
+/* Flags of a PCC backend, stored in pcc_dataset::pccd_flags */
+enum pcc_dataset_flags {
+       PCC_DATASET_NONE        = 0x0,
+       /* Try auto attach at open, disabled by default */
+       PCC_DATASET_OPEN_ATTACH = 0x1,
+       /* PCC backend is only used for RW-PCC */
+       PCC_DATASET_RWPCC       = 0x2,
+       /* PCC backend is only used for RO-PCC */
+       PCC_DATASET_ROPCC       = 0x4,
+       /* PCC backend provides caching services for both RW-PCC and RO-PCC */
+       PCC_DATASET_PCC_ALL     = PCC_DATASET_RWPCC | PCC_DATASET_ROPCC,
+};
+
+/*
+ * One PCC backend. pccd_path is the root under which the backend files are
+ * created (its dentry is the base for the per-FID directories and its mount
+ * is used when the backend file is opened).
+ */
+struct pcc_dataset {
+       __u32                   pccd_rwid;       /* Archive ID */
+       __u32                   pccd_roid;       /* Readonly ID */
+       struct pcc_match_rule   pccd_rule;       /* Match rule */
+       enum pcc_dataset_flags  pccd_flags;      /* flags of PCC backend */
+       char                    pccd_pathname[PATH_MAX]; /* full path */
+       struct path             pccd_path;       /* Root path */
+       struct list_head        pccd_linkage;  /* Linked to pccs_datasets */
+       atomic_t                pccd_refcount; /* Reference count */
+};
+
+/* PCC state of one mount; embedded in struct ll_sb_info as ll_pcc_super */
+struct pcc_super {
+       /* Protect pccs_datasets */
+       struct rw_semaphore      pccs_rw_sem;
+       /* List of datasets */
+       struct list_head         pccs_datasets;
+       /* creds of process who forced instantiation of super block */
+       const struct cred       *pccs_cred;
+};
+
+/*
+ * Per-inode PCC state, reached from the Lustre inode through ll_i2pcci()
+ * and allocated from pcc_inode_slab.
+ */
+struct pcc_inode {
+       /* Lustre inode info this PCC inode was initialised with */
+       struct ll_inode_info    *pcci_lli;
+       /* Cache path on local file system */
+       struct path              pcci_path;
+       /*
+        * If reference count is 0, then the cache is not inited, if 1, then
+        * no one is using it.
+        */
+       atomic_t                 pcci_refcount;
+       /* Whether readonly or readwrite PCC */
+       enum lu_pcc_type         pcci_type;
+       /* Whether the inode attr is cached locally */
+       bool                     pcci_attr_valid;
+       /* Layout generation */
+       __u32                    pcci_layout_gen;
+       /*
+        * How many IOs are on going on this cached object. Layout can be
+        * changed only if there is no active IO.
+        */
+       atomic_t                 pcci_active_ios;
+       /* Waitq - wait for PCC I/O completion. */
+       wait_queue_head_t        pcci_waitq;
+};
+
+/* Per-open-file PCC state; embedded in struct ll_file_data as fd_pcc_file */
+struct pcc_file {
+       /* Opened cache file; NULL when this open file has no PCC backend file */
+       struct file             *pccf_file;
+       /* Whether readonly or readwrite PCC */
+       enum lu_pcc_type         pccf_type;
+};
+
+/* Kind of PCC configuration command carried by struct pcc_cmd */
+enum pcc_cmd_type {
+       PCC_ADD_DATASET = 0,
+       PCC_DEL_DATASET,
+       PCC_CLEAR_ALL,
+};
+
+/*
+ * A PCC configuration command; presumably the parsed form of the text
+ * handed to pcc_cmd_handle() - TODO confirm. The union member in use is
+ * presumably selected by pccc_cmd.
+ */
+struct pcc_cmd {
+       enum pcc_cmd_type                        pccc_cmd;
+       char                                    *pccc_pathname;
+       union {
+               /* Arguments of PCC_ADD_DATASET (names mirror pcc_dataset) */
+               struct pcc_cmd_add {
+                       __u32                    pccc_rwid;
+                       __u32                    pccc_roid;
+                       struct list_head         pccc_conds;
+                       char                    *pccc_conds_str;
+                       enum pcc_dataset_flags   pccc_flags;
+               } pccc_add;
+               struct pcc_cmd_del {
+                       __u32                    pccc_pad;
+               } pccc_del;
+       } u;
+};
+
+/*
+ * PCC entry points used by the rest of llite. The functions taking a
+ * "bool *cached" argument report through it whether the request was handled
+ * by the PCC backend.
+ */
+int pcc_super_init(struct pcc_super *super);
+void pcc_super_fini(struct pcc_super *super);
+int pcc_cmd_handle(char *buffer, unsigned long count,
+                  struct pcc_super *super);
+int pcc_super_dump(struct pcc_super *super, struct seq_file *m);
+int pcc_readwrite_attach(struct file *file, struct inode *inode,
+                        __u32 archive_id);
+int pcc_readwrite_attach_fini(struct file *file, struct inode *inode,
+                             __u32 gen, bool lease_broken, int rc,
+                             bool attached);
+int pcc_ioctl_detach(struct inode *inode, __u32 opt);
+int pcc_ioctl_state(struct file *file, struct inode *inode,
+                   struct lu_pcc_state *state);
+void pcc_file_init(struct pcc_file *pccf);
+int pcc_file_open(struct inode *inode, struct file *file);
+void pcc_file_release(struct inode *inode, struct file *file);
+ssize_t pcc_file_read_iter(struct kiocb *iocb, struct iov_iter *iter,
+                          bool *cached);
+ssize_t pcc_file_write_iter(struct kiocb *iocb, struct iov_iter *iter,
+                           bool *cached);
+int pcc_inode_getattr(struct inode *inode, bool *cached);
+int pcc_inode_setattr(struct inode *inode, struct iattr *attr, bool *cached);
+ssize_t pcc_file_splice_read(struct file *in_file, loff_t *ppos,
+                            struct pipe_inode_info *pipe, size_t count,
+                            unsigned int flags, bool *cached);
+int pcc_fsync(struct file *file, loff_t start, loff_t end,
+             int datasync, bool *cached);
+int pcc_file_mmap(struct file *file, struct vm_area_struct *vma, bool *cached);
+void pcc_vm_open(struct vm_area_struct *vma);
+void pcc_vm_close(struct vm_area_struct *vma);
+int pcc_fault(struct vm_area_struct *vma, struct vm_fault *vmf, bool *cached);
+int pcc_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
+                    bool *cached);
+int pcc_inode_create(struct super_block *sb, struct pcc_dataset *dataset,
+                    struct lu_fid *fid, struct dentry **pcc_dentry);
+int pcc_inode_create_fini(struct pcc_dataset *dataset, struct inode *inode,
+                         struct dentry *pcc_dentry);
+struct pcc_dataset *pcc_dataset_match_get(struct pcc_super *super,
+                                         struct pcc_matcher *matcher);
+void pcc_dataset_put(struct pcc_dataset *dataset);
+void pcc_inode_free(struct inode *inode);
+void pcc_layout_invalidate(struct inode *inode);
+
+#endif /* LLITE_PCC_H */
index 5158d0a..c2e88cc 100644 (file)
@@ -121,6 +121,12 @@ static int __init lustre_init(void)
        if (ll_file_data_slab == NULL)
                GOTO(out_cache, rc = -ENOMEM);
 
+       pcc_inode_slab = kmem_cache_create("ll_pcc_inode",
+                                          sizeof(struct pcc_inode), 0,
+                                          SLAB_HWCACHE_ALIGN, NULL);
+       if (pcc_inode_slab == NULL)
+               GOTO(out_cache, rc = -ENOMEM);
+
        rc = llite_tunables_register();
        if (rc)
                GOTO(out_cache, rc);
@@ -163,6 +169,7 @@ out_tunables:
 out_cache:
        kmem_cache_destroy(ll_inode_cachep);
        kmem_cache_destroy(ll_file_data_slab);
+       kmem_cache_destroy(pcc_inode_slab);
        return rc;
 }
 
@@ -186,6 +193,7 @@ static void __exit lustre_exit(void)
 #endif
        kmem_cache_destroy(ll_inode_cachep);
        kmem_cache_destroy(ll_file_data_slab);
+       kmem_cache_destroy(pcc_inode_slab);
 }
 
 MODULE_AUTHOR("OpenSFS, Inc. <http://www.lustre.org/>");
index c3bf715..7412a06 100644 (file)
@@ -150,7 +150,8 @@ static int vvp_conf_set(const struct lu_env *env, struct cl_object *obj,
                 * This operation is expensive but mmap processes have to pay
                 * a price themselves. */
                unmap_mapping_range(conf->coc_inode->i_mapping,
-                                   0, OBD_OBJECT_EOF, 0);
+                                   0, OBD_OBJECT_EOF, 1);
+               pcc_layout_invalidate(conf->coc_inode);
        }
        return 0;
 }
index c98a852..7abc814 100644 (file)
@@ -124,11 +124,7 @@ static int ll_xattr_set_common(const struct xattr_handler *handler,
 
        if ((handler->flags == XATTR_ACL_ACCESS_T ||
             handler->flags == XATTR_ACL_DEFAULT_T) &&
-#ifdef HAVE_INODE_OWNER_OR_CAPABLE
            !inode_owner_or_capable(inode))
-#else
-           !is_owner_or_cap(inode))
-#endif
                RETURN(-EPERM);
 
        /* b10667: ignore lustre special xattr for now */
index 152eb7f..d0cd4a1 100644 (file)
@@ -124,11 +124,7 @@ int ll_setxattr_common(struct inode *inode, const char *name,
 
        if ((xattr_type == XATTR_ACL_ACCESS_T ||
             xattr_type == XATTR_ACL_DEFAULT_T) &&
-#ifdef HAVE_INODE_OWNER_OR_CAPABLE
            !inode_owner_or_capable(inode))
-#else
-           !is_owner_or_cap(inode))
-#endif
                return -EPERM;
 
        /* b10667: ignore lustre special xattr for now */
index a7a1903..92a2ff4 100644 (file)
@@ -353,7 +353,8 @@ retry:
                op_data->op_mds = tgt->ltd_index;
        } else {
                LASSERT(fid_is_sane(&op_data->op_fid1));
-               LASSERT(fid_is_zero(&op_data->op_fid2));
+               LASSERT(it->it_flags & MDS_OPEN_PCC ||
+                       fid_is_zero(&op_data->op_fid2));
                LASSERT(op_data->op_name != NULL);
 
                tgt = lmv_locate_tgt(lmv, op_data);
@@ -363,7 +364,8 @@ retry:
 
        /* If it is ready to open the file by FID, do not need
         * allocate FID at all, otherwise it will confuse MDT */
-       if ((it->it_op & IT_CREAT) && !(it->it_flags & MDS_OPEN_BY_FID)) {
+       if ((it->it_op & IT_CREAT) && !(it->it_flags & MDS_OPEN_BY_FID ||
+                                       it->it_flags & MDS_OPEN_PCC)) {
                /*
                 * For lookup(IT_CREATE) cases allocate new fid and setup FLD
                 * for it.
index 3d73e22..f10ca5a 100644 (file)
@@ -3550,6 +3550,7 @@ struct obd_ops lmv_obd_ops = {
         .o_set_info_async       = lmv_set_info_async,
         .o_notify               = lmv_notify,
         .o_get_uuid             = lmv_get_uuid,
+       .o_fid_alloc            = lmv_fid_alloc,
         .o_iocontrol            = lmv_iocontrol,
         .o_quotactl             = lmv_quotactl
 };
index 08e9dd1..ae535c0 100644 (file)
@@ -2044,6 +2044,7 @@ static int lov_object_layout_get(const struct lu_env *env,
        cl->cl_size = lov_comp_md_size(lsm);
        cl->cl_layout_gen = lsm->lsm_layout_gen;
        cl->cl_dom_comp_size = 0;
+       cl->cl_is_released = lsm->lsm_is_released;
        if (lsm_is_composite(lsm->lsm_magic)) {
                struct lov_stripe_md_entry *lsme = lsm->lsm_entries[0];
 
index cd6f2ca..efa5469 100644 (file)
@@ -304,6 +304,10 @@ void mdc_open_pack(struct ptlrpc_request *req, struct md_op_data *op_data,
                cr_flags |= MDS_OPEN_HAS_EA;
                tmp = req_capsule_client_get(&req->rq_pill, &RMF_EADATA);
                memcpy(tmp, lmm, lmmlen);
+               if (cr_flags & MDS_OPEN_PCC) {
+                       LASSERT(op_data != NULL);
+                       rec->cr_archive_id = op_data->op_archive_id;
+               }
        }
        set_mrc_cr_flags(rec, cr_flags);
 }
@@ -514,6 +518,8 @@ static void mdc_close_intent_pack(struct ptlrpc_request *req,
                        memcpy(req_capsule_client_get(&req->rq_pill, &RMF_U32),
                                op_data->op_data, count * sizeof(__u32));
                }
+       } else if (bias & MDS_PCC_ATTACH) {
+               data->cd_archive_id = op_data->op_archive_id;
        }
 }
 
index df65484..3a24e21 100644 (file)
@@ -2109,6 +2109,7 @@ static int mdd_declare_create_object(const struct lu_env *env,
                                     const struct md_op_spec *spec,
                                     struct lu_buf *def_acl_buf,
                                     struct lu_buf *acl_buf,
+                                    struct lu_buf *hsm_buf,
                                     struct dt_allocation_hint *hint)
 {
        const struct lu_buf *buf;
@@ -2155,6 +2156,14 @@ static int mdd_declare_create_object(const struct lu_env *env,
                                           0, handle);
                if (rc)
                        GOTO(out, rc);
+
+               if (spec->sp_cr_flags & MDS_OPEN_PCC) {
+                       rc = mdo_declare_xattr_set(env, c, hsm_buf,
+                                                  XATTR_NAME_HSM,
+                                                  0, handle);
+                       if (rc)
+                               GOTO(out, rc);
+               }
        }
 
        if (S_ISLNK(attr->la_mode)) {
@@ -2191,12 +2200,13 @@ static int mdd_declare_create(const struct lu_env *env, struct mdd_device *mdd,
                              struct linkea_data *ldata,
                              struct lu_buf *def_acl_buf,
                              struct lu_buf *acl_buf,
+                             struct lu_buf *hsm_buf,
                              struct dt_allocation_hint *hint)
 {
        int rc;
 
        rc = mdd_declare_create_object(env, mdd, p, c, attr, handle, spec,
-                                      def_acl_buf, acl_buf, hint);
+                                      def_acl_buf, acl_buf, hsm_buf, hint);
        if (rc)
                GOTO(out, rc);
 
@@ -2291,6 +2301,7 @@ static int mdd_create_object(const struct lu_env *env, struct mdd_object *pobj,
                             struct mdd_object *son, struct lu_attr *attr,
                             struct md_op_spec *spec, struct lu_buf *acl_buf,
                             struct lu_buf *def_acl_buf,
+                            struct lu_buf *hsm_buf,
                             struct dt_allocation_hint *hint,
                             struct thandle *handle)
 {
@@ -2339,6 +2350,19 @@ static int mdd_create_object(const struct lu_env *env, struct mdd_object *pobj,
                        GOTO(err_destroy, rc);
        }
 
+       if (S_ISREG(attr->la_mode) && spec->sp_cr_flags & MDS_OPEN_PCC) {
+               struct md_hsm mh;
+
+               memset(&mh, 0, sizeof(mh));
+               mh.mh_flags = HS_EXISTS | HS_ARCHIVED | HS_RELEASED;
+               mh.mh_arch_id = spec->sp_archive_id;
+               lustre_hsm2buf(hsm_buf->lb_buf, &mh);
+               rc = mdo_xattr_set(env, son, hsm_buf, XATTR_NAME_HSM,
+                                  0, handle);
+               if (rc != 0)
+                       GOTO(err_destroy, rc);
+       }
+
 #ifdef CONFIG_FS_POSIX_ACL
        if (def_acl_buf != NULL && def_acl_buf->lb_len > 0 &&
            S_ISDIR(attr->la_mode)) {
@@ -2501,6 +2525,7 @@ static int mdd_create(const struct lu_env *env, struct md_object *pobj,
        struct lu_attr          *pattr = &info->mti_pattr;
        struct lu_buf           acl_buf;
        struct lu_buf           def_acl_buf;
+       struct lu_buf           hsm_buf;
        struct linkea_data      *ldata = &info->mti_link_data;
        const char              *name = lname->ln_name;
        struct dt_allocation_hint *hint = &mdd_env_info(env)->mti_hint;
@@ -2562,9 +2587,18 @@ static int mdd_create(const struct lu_env *env, struct md_object *pobj,
                                        lname, 1, 0, ldata);
        }
 
+       if (spec->sp_cr_flags & MDS_OPEN_PCC) {
+               LASSERT(spec->sp_cr_flags & MDS_OPEN_HAS_EA);
+
+               memset(&hsm_buf, 0, sizeof(hsm_buf));
+               lu_buf_alloc(&hsm_buf, sizeof(struct hsm_attrs));
+               if (hsm_buf.lb_buf == NULL)
+                       GOTO(out_stop, rc = -ENOMEM);
+       }
+
        rc = mdd_declare_create(env, mdd, mdd_pobj, son, lname, attr,
                                handle, spec, ldata, &def_acl_buf, &acl_buf,
-                               hint);
+                               &hsm_buf, hint);
        if (rc)
                GOTO(out_stop, rc);
 
@@ -2573,7 +2607,7 @@ static int mdd_create(const struct lu_env *env, struct md_object *pobj,
                GOTO(out_stop, rc);
 
        rc = mdd_create_object(env, mdd_pobj, son, attr, spec, &acl_buf,
-                              &def_acl_buf, hint, handle);
+                              &def_acl_buf, &hsm_buf, hint, handle);
        if (rc != 0)
                GOTO(out_stop, rc);
 
@@ -2664,6 +2698,9 @@ out_free:
                /* if we vmalloced a large buffer drop it */
                lu_buf_free(ldata->ld_buf);
 
+       if (spec->sp_cr_flags & MDS_OPEN_PCC)
+               lu_buf_free(&hsm_buf);
+
        /* The child object shouldn't be cached anymore */
        if (rc)
                set_bit(LU_OBJECT_HEARD_BANSHEE,
@@ -3905,7 +3942,7 @@ static int mdd_declare_migrate_create(const struct lu_env *env,
 
        rc = mdd_declare_create(env, mdo2mdd(&tpobj->mod_obj), tpobj, tobj,
                                lname, attr, handle, spec, ldata, NULL, NULL,
-                               hint);
+                               NULL, hint);
        if (rc)
                return rc;
 
@@ -4064,8 +4101,8 @@ static int mdd_migrate_create(const struct lu_env *env,
        /* don't set nlink from sobj */
        attr->la_valid &= ~LA_NLINK;
 
-       rc = mdd_create_object(env, tpobj, tobj, attr, spec, NULL, NULL, hint,
-                               handle);
+       rc = mdd_create_object(env, tpobj, tobj, attr, spec, NULL, NULL, NULL,
+                              hint, handle);
        if (rc)
                RETURN(rc);
 
index a7d5130..020ff59 100644 (file)
@@ -1131,7 +1131,8 @@ static int mdt_setattr_unpack_rec(struct mdt_thread_info *info)
        ma->ma_valid = MA_INODE;
 
        ma->ma_attr_flags |= rec->sa_bias & (MDS_CLOSE_INTENT |
-                               MDS_DATA_MODIFIED | MDS_TRUNC_KEEP_LEASE);
+                               MDS_DATA_MODIFIED | MDS_TRUNC_KEEP_LEASE |
+                               MDS_PCC_ATTACH);
        RETURN(0);
 }
 
@@ -1619,6 +1620,7 @@ static int mdt_open_unpack(struct mdt_thread_info *info)
                                                                &RMF_EADATA);
                         sp->u.sp_ea.eadatalen = rr->rr_eadatalen;
                         sp->u.sp_ea.eadata = rr->rr_eadata;
+                       sp->sp_archive_id = rec->cr_archive_id;
                         sp->no_create = !!req_is_replay(req);
                        mdt_fix_lov_magic(info, rr->rr_eadata);
                 }
index 43d891c..fe0cb34 100644 (file)
@@ -1746,6 +1746,34 @@ static inline int mdt_hsm_set_released(struct lov_mds_md *lmm)
        return 0;
 }
 
+/**
+ * Extract the layout generation from an on-disk (little-endian) LOV EA.
+ *
+ * \param[in]  lmm     layout buffer; for LOV_MAGIC_COMP_V1 it is
+ *                     reinterpreted as struct lov_comp_md_v1
+ * \param[out] gen     layout generation in CPU byte order
+ *
+ * \retval 0           on success
+ * \retval -EINVAL     if lmm_magic is not COMP_V1, V1 or V3
+ */
+static inline int mdt_get_lmm_gen(struct lov_mds_md *lmm, __u32 *gen)
+{
+       /* Convert the magic once; each branch compares the CPU-order value. */
+       __u32 magic = le32_to_cpu(lmm->lmm_magic);
+
+       if (magic == LOV_MAGIC_COMP_V1) {
+               struct lov_comp_md_v1 *comp_v1 = (struct lov_comp_md_v1 *)lmm;
+
+               *gen = le32_to_cpu(comp_v1->lcm_layout_gen);
+       } else if (magic == LOV_MAGIC_V1 || magic == LOV_MAGIC_V3) {
+               /* lmm_layout_gen is read as a 16-bit little-endian field */
+               *gen = le16_to_cpu(lmm->lmm_layout_gen);
+       } else {
+               return -EINVAL;
+       }
+       return 0;
+}
+
 static int mdt_hsm_release(struct mdt_thread_info *info, struct mdt_object *o,
                           struct md_attr *ma)
 {
@@ -1805,19 +1821,66 @@ static int mdt_hsm_release(struct mdt_thread_info *info, struct mdt_object *o,
        if (rc != 0)
                GOTO(out_unlock, rc);
 
-       if (!mdt_hsm_release_allow(ma))
-               GOTO(out_unlock, rc = -EPERM);
-
-       /* already released? */
-       if (ma->ma_hsm.mh_flags & HS_RELEASED)
-               GOTO(out_unlock, rc = 0);
+       if (ma->ma_attr_flags & MDS_PCC_ATTACH) {
+               if (ma->ma_valid & MA_HSM) {
+                       if (ma->ma_hsm.mh_flags & HS_RELEASED)
+                               GOTO(out_unlock, rc = -EALREADY);
+
+                       if (ma->ma_hsm.mh_arch_id != data->cd_archive_id)
+                               CDEBUG(D_CACHE,
+                                      DFID" archive id diff: %llu:%u\n",
+                                      PFID(mdt_object_fid(o)),
+                                      ma->ma_hsm.mh_arch_id,
+                                      data->cd_archive_id);
+
+                       if (!(ma->ma_hsm.mh_flags & HS_DIRTY) &&
+                           ma->ma_hsm.mh_arch_ver == data->cd_data_version) {
+                               CDEBUG(D_CACHE,
+                                      DFID" data version matches: packed=%llu "
+                                      "and on-disk=%llu\n",
+                                      PFID(mdt_object_fid(o)),
+                                      data->cd_data_version,
+                                      ma->ma_hsm.mh_arch_ver);
+                               ma->ma_hsm.mh_flags = HS_ARCHIVED | HS_EXISTS;
+                       }
 
-       /* Compare on-disk and packed data_version */
-       if (data->cd_data_version != ma->ma_hsm.mh_arch_ver) {
-               CDEBUG(D_HSM, DFID" data_version mismatches: packed=%llu"
-                      " and on-disk=%llu\n", PFID(mdt_object_fid(o)),
-                      data->cd_data_version, ma->ma_hsm.mh_arch_ver);
-               GOTO(out_unlock, rc = -EPERM);
+                       if (ma->ma_hsm.mh_flags & HS_DIRTY)
+                               ma->ma_hsm.mh_flags = HS_ARCHIVED | HS_EXISTS;
+               } else {
+                       /* Set up HSM attribute for PCC archived object */
+                       CLASSERT(sizeof(struct hsm_attrs) <=
+                                sizeof(info->mti_xattr_buf));
+                       buf = &info->mti_buf;
+                       buf->lb_buf = info->mti_xattr_buf;
+                       buf->lb_len = sizeof(struct hsm_attrs);
+                       memset(&ma->ma_hsm, 0, sizeof(ma->ma_hsm));
+                       ma->ma_hsm.mh_flags = HS_ARCHIVED | HS_EXISTS;
+                       ma->ma_hsm.mh_arch_id = data->cd_archive_id;
+                       ma->ma_hsm.mh_arch_ver = data->cd_data_version;
+                       lustre_hsm2buf(buf->lb_buf, &ma->ma_hsm);
+
+                       rc = mo_xattr_set(info->mti_env, mdt_object_child(o),
+                                         buf, XATTR_NAME_HSM, 0);
+                       if (rc)
+                               GOTO(out_unlock, rc);
+               }
+       } else {
+               if (!mdt_hsm_release_allow(ma))
+                       GOTO(out_unlock, rc = -EPERM);
+
+               /* already released? */
+               if (ma->ma_hsm.mh_flags & HS_RELEASED)
+                       GOTO(out_unlock, rc = 0);
+
+               /* Compare on-disk and packed data_version */
+               if (data->cd_data_version != ma->ma_hsm.mh_arch_ver) {
+                       CDEBUG(D_HSM, DFID" data_version mismatches: "
+                              "packed=%llu and on-disk=%llu\n",
+                              PFID(mdt_object_fid(o)),
+                              data->cd_data_version,
+                              ma->ma_hsm.mh_arch_ver);
+                       GOTO(out_unlock, rc = -EPERM);
+               }
        }
 
        ma->ma_valid = MA_INODE;
@@ -1913,6 +1976,12 @@ static int mdt_hsm_release(struct mdt_thread_info *info, struct mdt_object *o,
        rc = mo_swap_layouts(info->mti_env, mdt_object_child(o),
                             mdt_object_child(orphan),
                             SWAP_LAYOUTS_MDS_HSM);
+
+       if (!rc && ma->ma_attr_flags & MDS_PCC_ATTACH) {
+               ma->ma_need = MA_LOV;
+               rc = mdt_attr_get_complex(info, o, ma);
+       }
+
        EXIT;
 
 out_layout_lock:
@@ -1939,6 +2008,13 @@ out_unlock:
                repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
                LASSERT(repbody != NULL);
                repbody->mbo_valid |= OBD_MD_CLOSE_INTENT_EXECED;
+               if (ma->ma_attr_flags & MDS_PCC_ATTACH) {
+                       LASSERT(ma->ma_valid & MA_LOV);
+                       rc = mdt_get_lmm_gen(ma->ma_lmm,
+                                            &repbody->mbo_layout_gen);
+                       if (!rc)
+                               repbody->mbo_valid |= OBD_MD_LAYOUT_VERSION;
+               }
        }
 
 out_reprocess:
index 68fd6ae..f65c116 100644 (file)
@@ -39,7 +39,7 @@ noinst_SCRIPTS += posix.sh sanity-scrub.sh scrub-performance.sh ha.sh
 noinst_SCRIPTS += sanity-lfsck.sh lfsck-performance.sh
 noinst_SCRIPTS += resolveip
 noinst_SCRIPTS += sanity-hsm.sh sanity-lsnapshot.sh sanity-pfl.sh sanity-flr.sh
-noinst_SCRIPTS += sanity-dom.sh dom-performance.sh
+noinst_SCRIPTS += sanity-dom.sh sanity-pcc.sh dom-performance.sh
 nobase_noinst_SCRIPTS = cfg/local.sh
 nobase_noinst_SCRIPTS += test-groups/regression test-groups/regression-mpi
 nobase_noinst_SCRIPTS += acl/make-tree acl/run cfg/ncli.sh
index 6f4169c..c361ae2 100644 (file)
@@ -213,7 +213,7 @@ int main(int argc, char **argv)
        struct stat              st;
        struct statfs            stfs;
        size_t                   mmap_len = 0, i;
-       unsigned char           *mmap_ptr = NULL, junk = 0;
+       unsigned char           *mmap_ptr = NULL, junk = 1;
        int                      len, fd = -1;
        int                      flags;
        int                      save_errno;
index 46a96d6..b759e49 100755 (executable)
@@ -42,60 +42,6 @@ CLIENT2=${CLIENT2:-$CLIENT1}
 
 is_mounted $MOUNT2 || error "MOUNT2 is not mounted"
 
-rmultiop_start() {
-       local client=$1
-       local file=$2
-       local cmds=$3
-       local WAIT_MAX=${4:-60}
-       local wait_time=0
-
-       # We need to run do_node in bg, because pdsh does not exit
-       # if child process of run script exists.
-       # I.e. pdsh does not exit when runmultiop_bg_pause exited,
-       # because of multiop_bg_pause -> $MULTIOP_PROG &
-       # By the same reason we need sleep a bit after do_nodes starts
-       # to let runmultiop_bg_pause start muliop and
-       # update /tmp/multiop_bg.pid ;
-       # The rm /tmp/multiop_bg.pid guarantees here that
-       # we have the updated by runmultiop_bg_pause
-       # /tmp/multiop_bg.pid file
-
-       local pid_file=$TMP/multiop_bg.pid.$$
-       do_node $client "MULTIOP_PID_FILE=$pid_file LUSTRE= \
-                       runmultiop_bg_pause $file $cmds" &
-       local pid=$!
-       local multiop_pid
-
-       while [[ $wait_time -lt $WAIT_MAX ]]; do
-               sleep 3
-               wait_time=$((wait_time + 3))
-               multiop_pid=$(do_node $client cat $pid_file)
-               if [ -n "$multiop_pid" ]; then
-                       break
-               fi
-       done
-
-       [ -n "$multiop_pid" ] ||
-               error "$client : Can not get multiop_pid from $pid_file "
-
-       eval export $(node_var_name $client)_multiop_pid=$multiop_pid
-       eval export $(node_var_name $client)_do_node_pid=$pid
-       local var=$(node_var_name $client)_multiop_pid
-       echo client $client multiop_bg started multiop_pid=${!var}
-       return $?
-}
-
-rmultiop_stop() {
-    local client=$1
-    local multiop_pid=$(node_var_name $client)_multiop_pid
-    local do_node_pid=$(node_var_name $client)_do_node_pid
-
-    echo "Stopping multiop_pid=${!multiop_pid} (kill ${!multiop_pid} on $client)"
-    do_node $client kill -USR1 ${!multiop_pid}
-
-    wait ${!do_node_pid}
-}
-
 #
 # get_version(): Gets the version of an object on servers
 # Parameter1: Client/Machine Name
index 97a5129..7e5a0b9 100755 (executable)
@@ -226,27 +226,6 @@ copytool_monitor_setup() {
        fi
 }
 
-copytool_monitor_cleanup() {
-       local facet=${1:-$SINGLEAGT}
-       local agent=$(facet_active_host $facet)
-
-       if [ -n "$HSMTOOL_MONITOR_DIR" ]; then
-               # Should die when the copytool dies, but just in case.
-               local cmd="kill \\\$(cat $HSMTOOL_MONITOR_DIR/monitor_pid)"
-               cmd+=" 2>/dev/null || true"
-               do_node $agent "$cmd"
-               do_node $agent "rm -fr $HSMTOOL_MONITOR_DIR"
-               export HSMTOOL_MONITOR_DIR=
-       fi
-
-       # The pdsh should die on its own when the monitor dies. Just
-       # in case, though, try to clean up to avoid any cruft.
-       if [ -n "$HSMTOOL_MONITOR_PDSH" ]; then
-               kill $HSMTOOL_MONITOR_PDSH 2>/dev/null || true
-               export HSMTOOL_MONITOR_PDSH=
-       fi
-}
-
 fid2archive()
 {
        local fid="$1"
@@ -258,134 +237,6 @@ fid2archive()
        esac
 }
 
-copytool_logfile()
-{
-       local host="$(facet_host "$1")"
-       local prefix=$TESTLOG_PREFIX
-       [ -n "$TESTNAME" ] && prefix+=.$TESTNAME
-
-       printf "${prefix}.copytool${archive_id}_log.${host}.log"
-}
-
-__lhsmtool_rebind()
-{
-       do_facet $facet $HSMTOOL -p "$hsm_root" --rebind "$@" "$mountpoint"
-}
-
-__lhsmtool_import()
-{
-       mkdir -p "$(dirname "$2")" ||
-               error "cannot create directory '$(dirname "$2")'"
-       do_facet $facet $HSMTOOL -p "$hsm_root" --import "$@" "$mountpoint"
-}
-
-__lhsmtool_setup()
-{
-       local cmd="$HSMTOOL $HSMTOOL_VERBOSE --daemon --hsm-root \"$hsm_root\""
-       [ -n "$bandwidth" ] && cmd+=" --bandwidth $bandwidth"
-       [ -n "$archive_id" ] && cmd+=" --archive $archive_id"
-       [ ${#misc_options[@]} -gt 0 ] &&
-               cmd+=" $(IFS=" " echo "$@")"
-       cmd+=" \"$mountpoint\""
-
-       echo "Starting copytool $facet on $(facet_host $facet)"
-       stack_trap "do_facet $facet libtool execute pkill -x '$HSMTOOL' || true" EXIT
-       do_facet $facet "$cmd < /dev/null > \"$(copytool_logfile $facet)\" 2>&1"
-}
-
-hsm_root() {
-       local facet="${1:-$SINGLEAGT}"
-
-       printf "$(copytool_device "$facet")/${TESTSUITE}.${TESTNAME}/"
-}
-
-# Main entry point to perform copytool related operations
-#
-# Sub-commands:
-#
-#      setup   setup a copytool to run in the background, that copytool will be
-#              killed on EXIT
-#      import  import a file from an HSM backend
-#      rebind  rebind an archived file to a new fid
-#
-# Although the semantics might suggest otherwise, one does not need to 'setup'
-# a copytool before a call to 'copytool import' or 'copytool rebind'.
-#
-copytool()
-{
-       local action=$1
-       shift
-
-       # Parse arguments
-       local fail_on_error=true
-       local -a misc_options
-       while [ $# -gt 0 ]; do
-               case "$1" in
-               -f|--facet)
-                       shift
-                       local facet="$1"
-                       ;;
-               -m|--mountpoint)
-                       shift
-                       local mountpoint="$1"
-                       ;;
-               -a|--archive-id)
-                       shift
-                       local archive_id="$1"
-                       ;;
-               -b|--bwlimit)
-                       shift
-                       local bandwidth="$1" # in MB/s
-                       ;;
-               -n|--no-fail)
-                       local fail_on_error=false
-                       ;;
-               *)
-                       # Uncommon(/copytool dependent) option
-                       misc_options+=("$1")
-                       ;;
-               esac
-               shift
-       done
-
-       # Use default values if needed
-       local facet=${facet:-$SINGLEAGT}
-       local mountpoint="${mountpoint:-${MOUNT2:-$MOUNT}}"
-       local hsm_root="$(hsm_root "$facet")"
-
-       stack_trap "do_facet $facet rm -rf '$hsm_root'" EXIT
-       do_facet $facet mkdir -p "$hsm_root" ||
-               error "mkdir '$hsm_root' failed"
-
-       case "$HSMTOOL" in
-       lhsmtool_posix)
-               local copytool=lhsmtool
-               ;;
-       esac
-
-       __${copytool}_${action} "${misc_options[@]}"
-       if [ $? -ne 0 ]; then
-               local error_msg
-
-               case $action in
-               setup)
-                       local host="$(facet_host $facet)"
-                       error_msg="Failed to start copytool $facet on '$host'"
-                       ;;
-               import)
-                       local src="${misc_options[0]}"
-                       local dest="${misc_options[1]}"
-                       error_msg="Failed to import '$src' to '$dest'"
-                       ;;
-               rebind)
-                       error_msg="could not rebind file"
-                       ;;
-               esac
-
-               $fail_on_error && error "$error_msg" || echo "$error_msg"
-       fi
-}
-
 get_copytool_event_log() {
        local facet=${1:-$SINGLEAGT}
        local agent=$(facet_active_host $facet)
@@ -508,55 +359,12 @@ copy2archive() {
                error "cannot copy '$1' to '$file'"
 }
 
-mdts_set_param() {
-       local arg=$1
-       local key=$2
-       local value=$3
-       local mdtno
-       local rc=0
-       if [[ "$value" != "" ]]; then
-               value="=$value"
-       fi
-       for mdtno in $(seq 1 $MDSCOUNT); do
-               local idx=$(($mdtno - 1))
-               local facet=mds${mdtno}
-               # if $arg include -P option, run 1 set_param per MDT on the MGS
-               # else, run set_param on each MDT
-               [[ $arg = *"-P"* ]] && facet=mgs
-               do_facet $facet $LCTL set_param $arg mdt.${MDT[$idx]}.$key$value
-               [[ $? != 0 ]] && rc=1
-       done
-       return $rc
-}
-
-mdts_check_param() {
-       local key="$1"
-       local target="$2"
-       local timeout="$3"
-       local mdtno
-       for mdtno in $(seq 1 $MDSCOUNT); do
-               local idx=$(($mdtno - 1))
-               wait_result mds${mdtno} \
-                       "$LCTL get_param -n $MDT_PREFIX${idx}.$key" "$target" \
-                       $timeout ||
-                       error "$key state is not '$target' on mds${mdtno}"
-       done
-}
-
 get_hsm_param() {
        local param=$1
        local val=$(do_facet $SINGLEMDS $LCTL get_param -n $HSM_PARAM.$param)
        echo $val
 }
 
-set_hsm_param() {
-       local param=$1
-       local value=$2
-       local opt=$3
-       mdts_set_param "$opt -n" "hsm.$param" "$value"
-       return $?
-}
-
 set_test_state() {
        local cmd=$1
        local target=$2
@@ -564,15 +372,6 @@ set_test_state() {
        mdts_check_param hsm_control "$target" 10
 }
 
-cdt_set_sanity_policy() {
-       if [[ "$CDT_POLICY_HAD_CHANGED" ]]
-       then
-               # clear all
-               mdts_set_param "" hsm.policy "+NRA"
-               mdts_set_param "" hsm.policy "-NBR"
-               CDT_POLICY_HAD_CHANGED=
-       fi
-}
 
 cdt_set_no_retry() {
        mdts_set_param "" hsm.policy "+NRA"
@@ -598,21 +397,6 @@ cdt_clear_mount_state() {
        mdts_set_param "-P -d" hsm_control ""
 }
 
-cdt_set_mount_state() {
-       mdts_set_param "-P" hsm_control "$1"
-       # set_param -P is asynchronous operation and could race with set_param.
-       # In such case configs could be retrieved and applied at mgc after
-       # set_param -P completion. Sleep here to avoid race with set_param.
-       # We need at least 20 seconds. 10 for mgc_requeue_thread to wake up
-       # MGC_TIMEOUT_MIN_SECONDS + MGC_TIMEOUT_RAND_CENTISEC(5 + 5)
-       # and 10 seconds to retrieve config from server.
-       sleep 20
-}
-
-cdt_check_state() {
-       mdts_check_param hsm_control "$1" 20
-}
-
 cdt_disable() {
        set_test_state disabled disabled
 }
@@ -635,37 +419,6 @@ cdt_restart() {
        cdt_set_sanity_policy
 }
 
-needclients() {
-       local client_count=$1
-       if [[ $CLIENTCOUNT -lt $client_count ]]; then
-               skip "Need $client_count or more clients, have $CLIENTCOUNT"
-               return 1
-       fi
-       return 0
-}
-
-path2fid() {
-       $LFS path2fid $1 | tr -d '[]'
-       return ${PIPESTATUS[0]}
-}
-
-get_hsm_flags() {
-       local f=$1
-       local u=$2
-       local st
-
-       if [[ $u == "user" ]]; then
-               st=$($RUNAS $LFS hsm_state $f)
-       else
-               u=root
-               st=$($LFS hsm_state $f)
-       fi
-
-       [[ $? == 0 ]] || error "$LFS hsm_state $f failed (run as $u)"
-
-       st=$(echo $st | cut -f 2 -d" " | tr -d "()," )
-       echo $st
-}
 
 get_hsm_archive_id() {
        local f=$1
@@ -677,14 +430,6 @@ get_hsm_archive_id() {
        echo $ar
 }
 
-check_hsm_flags() {
-       local f=$1
-       local fl=$2
-
-       local st=$(get_hsm_flags $f)
-       [[ $st == $fl ]] || error "hsm flags on $f are $st != $fl"
-}
-
 check_hsm_flags_user() {
        local f=$1
        local fl=$2
@@ -721,27 +466,6 @@ delete_large_files() {
        wait_delete_completed
 }
 
-wait_result() {
-       local facet=$1
-       shift
-       wait_update --verbose $(facet_active_host $facet) "$@"
-}
-
-wait_request_state() {
-       local fid=$1
-       local request=$2
-       local state=$3
-       # 4th arg (mdt index) is optional
-       local mdtidx=${4:-0}
-       local mds=mds$(($mdtidx + 1))
-
-       local cmd="$LCTL get_param -n ${MDT_PREFIX}${mdtidx}.hsm.actions"
-       cmd+=" | awk '/'$fid'.*action='$request'/ {print \\\$13}' | cut -f2 -d="
-
-       wait_result $mds "$cmd" "$state" 200 ||
-               error "request on $fid is not $state on $mds"
-}
-
 get_request_state() {
        local fid=$1
        local request=$2
diff --git a/lustre/tests/sanity-pcc.sh b/lustre/tests/sanity-pcc.sh
new file mode 100644 (file)
index 0000000..e18ad79
--- /dev/null
@@ -0,0 +1,1278 @@
+#!/bin/bash
+#
+# Run select tests by setting ONLY, or as arguments to the script.
+# Skip specific tests by setting EXCEPT.
+#
+# exit on error
+set -e
+set +o monitor
+
+SRCDIR=$(dirname $0)
+export PATH=$PWD/$SRCDIR:$SRCDIR:$PWD/$SRCDIR/utils:$PATH:/sbin:/usr/sbin
+
+ONLY=${ONLY:-"$*"}
+# bug number for skipped test:
+ALWAYS_EXCEPT=""
+# UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
+
+ENABLE_PROJECT_QUOTAS=${ENABLE_PROJECT_QUOTAS:-true}
+
+LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
+
+. $LUSTRE/tests/test-framework.sh
+init_test_env $@
+. ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
+init_logging
+
+MULTIOP=${MULTIOP:-multiop}
+OPENFILE=${OPENFILE:-openfile}
+MMAP_CAT=${MMAP_CAT:-mmap_cat}
+MOUNT_2=${MOUNT_2:-"yes"}
+FAIL_ON_ERROR=false
+
+# script only handles up to 10 MDTs (because of MDT_PREFIX)
+[ $MDSCOUNT -gt 9 ] &&
+       error "script cannot handle more than 9 MDTs, please fix" && exit
+
+check_and_setup_lustre
+
+if [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.12.52) ]]; then
+       skip_env "Need MDS version at least 2.12.52" && exit
+fi
+
+# $RUNAS_ID may get set incorrectly somewhere else
+if [[ $UID -eq 0 && $RUNAS_ID -eq 0 ]]; then
+       skip_env "\$RUNAS_ID set to 0, but \$UID is also 0!" && exit
+fi
+check_runas_id $RUNAS_ID $RUNAS_GID $RUNAS
+if getent group nobody; then
+       GROUP=nobody
+elif getent group nogroup; then
+       GROUP=nogroup
+else
+       error "No generic nobody group"
+fi
+
+build_test_filter
+
+# if there is no CLIENT1 defined, some tests can be run on localhost
+CLIENT1=${CLIENT1:-$HOSTNAME}
+# If CLIENT2 doesn't exist then use CLIENT1 instead.
+# All tests should use CLIENT2 with MOUNT2 only, so that they still work when
+# $CLIENT2 == $CLIENT1.
+# The exception is any test which needs two separate nodes.
+CLIENT2=${CLIENT2:-$CLIENT1}
+
+# Verify that a file has the expected size when examined from a facet.
+#   $1 - facet on which stat is executed (handed to do_facet)
+#   $2 - path of the file to examine
+#   $3 - expected size as printed by stat's %s format
+# Calls error() when the reported size differs from the expected one.
+check_file_size()
+{
+       local client="$1"
+       local fpath="$2"
+       local expected_size="$3"
+       # declared local so the result neither leaks into nor clobbers a
+       # variable of the same name in the caller
+       local size
+
+       size=$(do_facet $client stat "--printf=%s" $fpath)
+       [[ $size == "$expected_size" ]] || error \
+               "expected $fpath size: $expected_size got: $size"
+}
+
+# Check that both the PCC copy and the Lustre file report the same size.
+#   $1 - facet used for the checks
+#   $2 - path of the PCC copy
+#   $3 - path of the file inside Lustre
+#   $4 - size both paths are expected to have
+check_lpcc_sizes()
+{
+       local node="$1"
+       local cached="$2"
+       local origin="$3"
+       local want="$4"
+
+       # PCC copy first, then the Lustre path
+       check_file_size $node $cached $want
+       check_file_size $node $origin $want
+}
+
+# Verify that a file holds exactly the expected content.
+#   $1 - facet on which the file is read with cat (handed to do_facet)
+#   $2 - path of the file to read
+#   $3 - expected content
+# Calls error() when the content read differs from the expected one.
+check_file_data()
+{
+       local client="$1"
+       local path="$2"
+       local expected_data="$3"
+       # declared local so the content read here does not leak into the
+       # caller's scope
+       local path_data
+
+       path_data=$(do_facet $client cat $path)
+       [[ "x$path_data" == "x$expected_data" ]] || error \
+               "expected $path: $expected_data, got: $path_data"
+}
+
+# Check that both the PCC copy and the Lustre file hold the same content.
+#   $1 - facet used for the checks
+#   $2 - path of the PCC copy
+#   $3 - path of the file inside Lustre
+#   $4 - content both paths are expected to hold
+check_lpcc_data()
+{
+       local node="$1"
+       local cached="$2"
+       local origin="$3"
+       local want="$4"
+
+       # PCC copy first, then the Lustre path
+       check_file_data "$node" "$cached" "$want"
+       check_file_data "$node" "$origin" "$want"
+}
+
+# Print the path under which the PCC copy of a Lustre file is stored.
+#   $1 - root directory of the PCC/HSM backend
+#   $2 - path of the file inside Lustre; its FID is obtained with path2fid
+# The printed path is
+#   <root>/<oid 0-15>/<oid 16-31>/<seq 0-15>/<seq 16-31>/<seq 32-47>/<seq 48-63>/<fid>
+# where every directory component is a 16-bit slice printed as four
+# hexadecimal digits.
+lpcc_fid2path()
+{
+       local hsm_root="$1"
+       local lustre_path="$2"
+       local fid=$(path2fid $lustre_path)
+       local f_seq
+       local f_oid
+       local f_ver
+
+       # a FID reads seq:oid:ver; only the sequence and the object id are
+       # used to build the directory components
+       IFS=':' read -r f_seq f_oid f_ver <<< "$fid"
+
+       printf "%s/%04x/%04x/%04x/%04x/%04x/%04x/%s" \
+               $hsm_root $(($f_oid & 0xFFFF)) \
+               $(($f_oid >> 16 & 0xFFFF)) \
+               $(($f_seq & 0xFFFF)) \
+               $(($f_seq >> 16 & 0xFFFF)) \
+               $(($f_seq >> 32 & 0xFFFF)) \
+               $(($f_seq >> 48 & 0xFFFF)) $fid
+}
+
+# Compare the PCC state reported by "lfs pcc state" with an expected value.
+#   $1 - path of the file inside Lustre
+#   $2 - expected state, i.e. the text that follows "type: " up to the next
+#        comma in the command output
+#   $3 - facet to run the command on (default: $SINGLEAGT)
+#   $4 - optional command prefix placed in front of $LFS
+# Calls error() when the reported state does not match.
+check_lpcc_state()
+{
+       local lustre_path="$1"
+       local expected_state="$2"
+       local facet=${3:-$SINGLEAGT}
+       local myRUNAS="$4"
+       # first cut away everything up to "type: ", then everything from the
+       # first comma on
+       local state=$(do_facet $facet $myRUNAS $LFS pcc state $lustre_path |
+                       awk -F 'type: ' '{print $2}' | awk -F ',' '{print $1}')
+
+       if [[ "x$state" != "x$expected_state" ]]; then
+               error \
+               "$lustre_path expected pcc state: $expected_state, but got: $state"
+       fi
+}
+
+# initiate variables
+init_agt_vars
+
+# populate MDT device array
+get_mdt_devices
+
+# cleanup from previous bad setup
+kill_copytools
+
+# for recovery tests, coordinator needs to be started at mount
+# so force it
+# the lustre conf must be without hsm on (like for sanity.sh)
+echo "Set HSM on and start"
+cdt_set_mount_state enabled
+cdt_check_state enabled
+
+echo "Set sanity-hsm HSM policy"
+cdt_set_sanity_policy
+
+# finished requests are quickly removed from list
+set_hsm_param grace_delay 10
+
+# Run "lctl pcc clear" for $MOUNT on a facet.
+#   $1 - facet to run on (default: $SINGLEAGT)
+cleanup_pcc_mapping() {
+       local agt=${1:-$SINGLEAGT}
+
+       do_facet $agt $LCTL pcc clear $MOUNT
+}
+
+# Run "lctl pcc add" for $MOUNT on a facet and schedule the matching cleanup.
+#   $1 - facet to run on (default: $SINGLEAGT)
+#   $2 - parameter string passed after -p
+#        (default: "projid={100}\ rwid=$HSM_ARCHIVE_NUMBER")
+# An hsm_root variable already set by the caller is used as the backend
+# directory; otherwise the result of the hsm_root helper for the facet is used.
+setup_pcc_mapping() {
+       local facet=${1:-$SINGLEAGT}
+       local hsm_root=${hsm_root:-$(hsm_root "$facet")}
+       local param="$2"
+
+       if [ -z "$param" ]; then
+               param="projid={100}\ rwid=$HSM_ARCHIVE_NUMBER"
+       fi
+
+       # register the cleanup before adding, so it is in place whatever
+       # happens afterwards
+       stack_trap "cleanup_pcc_mapping $facet" EXIT
+       do_facet $facet $LCTL pcc add $MOUNT $hsm_root -p $param
+}
+
+# Common body of the RW-PCC read/write tests 1a-1d.
+#   $1 - restore: "true" to bring the file back with an explicit
+#        "lfs hsm_restore"; "false" to trigger the restore by reading the
+#        file from $CLIENT2 (only done when at least two clients exist)
+#   $2 - project: "true" to set project ID 100 on the test directory and
+#        expect the file to be cached without an explicit attach; "false" to
+#        attach it manually with "lfs pcc attach"
+# The test attaches a file, checks data/size/state through write and
+# truncate, restores it, and finally checks an attach followed by a detach.
+lpcc_rw_test() {
+       local restore="$1"
+       local project="$2"
+       local project_id=100
+       local agt_facet=$SINGLEAGT
+       local hsm_root=$(hsm_root)
+       local file=$DIR/$tdir/$tfile
+       local -a state
+       local -a lpcc_path
+       local -a size
+
+       $project && enable_project_quota
+
+       do_facet $SINGLEAGT rm -rf $hsm_root
+       copytool setup -m "$MOUNT" -a "$HSM_ARCHIVE_NUMBER"
+
+       # fall back to manual attach when project quota is unavailable
+       is_project_quota_supported || project=false
+
+       do_facet $SINGLEAGT mkdir -p $DIR/$tdir
+       setup_pcc_mapping
+       $project && lfs project -sp $project_id $DIR/$tdir
+
+       do_facet $SINGLEAGT "echo -n attach_origin > $file"
+       if ! $project; then
+               check_lpcc_state $file "none"
+               do_facet $SINGLEAGT $LFS pcc attach -i \
+                       $HSM_ARCHIVE_NUMBER $file ||
+                       error "pcc attach $file failed"
+       fi
+
+       check_lpcc_state $file "readwrite"
+       # HSM released exists archived status
+       check_hsm_flags $file "0x0000000d"
+       lpcc_path=$(lpcc_fid2path $hsm_root $file)
+       check_lpcc_data $SINGLEAGT $lpcc_path $file "attach_origin"
+
+       do_facet $SINGLEAGT dd if=/dev/zero of=$file bs=7654321 count=1
+       check_lpcc_sizes $SINGLEAGT $lpcc_path $file 7654321
+
+       do_facet $SINGLEAGT $TRUNCATE $file 1234567 ||
+               error "truncate failed"
+       check_lpcc_sizes $SINGLEAGT $lpcc_path $file 1234567
+       check_lpcc_state $file "readwrite"
+
+       do_facet $SINGLEAGT "echo -n file_data > $file"
+       check_lpcc_state $file "readwrite"
+       # HSM released exists archived status
+       check_hsm_flags $file "0x0000000d"
+       check_lpcc_data $SINGLEAGT $lpcc_path $file "file_data"
+
+       echo "Restore testing..."
+       # $restore holds the word "true" or "false" and must be executed as a
+       # command: inside [ ... -o $restore ] it is merely a non-empty string,
+       # which is always true and would make the else branch unreachable.
+       if [ $CLIENTCOUNT -lt 2 ] || $restore; then
+               $LFS hsm_restore $file || error \
+                       "failed to restore $file"
+               wait_request_state $(path2fid $file) RESTORE SUCCEED
+       else
+               path_data=$(do_node $CLIENT2 cat $file)
+               [[ "x$path_data" == "xfile_data" ]] || error \
+                       "expected file_data, got: $path_data"
+       fi
+
+       check_lpcc_state $file "none"
+       # HSM exists archived status
+       check_hsm_flags $file "0x00000009"
+
+       echo -n "new_data" > $file
+       check_lpcc_state $file "none"
+       # HSM exists dirty archived status
+       check_hsm_flags $file "0x0000000b"
+       check_file_data $SINGLEAGT $file "new_data"
+
+       echo "Attach and detach testing"
+       rm -f $file
+       do_facet $SINGLEAGT "echo -n new_data2 > $file"
+       if ! $project; then
+               check_lpcc_state $file "none"
+               do_facet $SINGLEAGT $LFS pcc attach -i \
+                       $HSM_ARCHIVE_NUMBER $file ||
+                       error "PCC attach $file failed"
+       fi
+       check_lpcc_state $file "readwrite"
+       # HSM released exists archived status
+       check_hsm_flags $file "0x0000000d"
+       do_facet $SINGLEAGT "echo -n attach_detach > $file"
+       echo "Start to detach the $file"
+       do_facet $SINGLEAGT $LFS pcc detach $file ||
+               error "PCC detach $file failed"
+       wait_request_state $(path2fid $file) REMOVE SUCCEED
+
+       check_lpcc_state $file "none"
+       # The file is removed from PCC
+       check_hsm_flags $file "0x00000000"
+       check_file_data $SINGLEAGT $file "attach_detach"
+}
+
+# Runs lpcc_rw_test with restore=true and project=false.
+test_1a() {
+       lpcc_rw_test true false
+}
+run_test 1a "Test manual lfs pcc attach with manual HSM restore"
+
+# Runs lpcc_rw_test with restore=false and project=false.
+test_1b() {
+       lpcc_rw_test false false
+}
+run_test 1b "Test manual lfs pcc attach with restore on remote access"
+
+# Runs lpcc_rw_test with restore=true and project=true.
+test_1c() {
+       lpcc_rw_test true true
+}
+run_test 1c "Test automated attach using Project ID with manual HSM restore"
+
+# Runs lpcc_rw_test with restore=false and project=true.
+test_1d() {
+       lpcc_rw_test false true
+}
+run_test 1d "Test Project ID with remote access"
+
+# RW-PCC as a non-root user: create, attach, read, truncate and write a file
+# through $RUNAS, detach it with -k, check that the same user cannot touch,
+# read or write the PCC copy directly and that its permission bits are 0,
+# then attach and detach once more.
+test_1e() {
+       local file=$DIR/$tdir/$tfile
+       local hsm_root=$(hsm_root)
+       local -a lpcc_path
+
+       copytool setup -m "$MOUNT" -a "$HSM_ARCHIVE_NUMBER"
+       setup_pcc_mapping
+       $LCTL pcc list $MOUNT
+       mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed"
+       # world-writable so that the $RUNAS user can create the file
+       chmod 777 $DIR/$tdir || error "chmod 777 $DIR/$tdir failed"
+
+       do_facet $SINGLEAGT $RUNAS dd if=/dev/zero of=$file bs=1024 count=1 ||
+               error "failed to dd write to $file"
+       do_facet $SINGLEAGT $RUNAS $LFS pcc attach -i $HSM_ARCHIVE_NUMBER \
+               $file || error "failed to attach file $file"
+       check_lpcc_state $file "readwrite"
+       do_facet $SINGLEAGT $RUNAS dd if=$file of=/dev/null bs=1024 count=1 ||
+               error "failed to dd read from $file"
+       do_facet $SINGLEAGT $RUNAS $TRUNCATE $file 256 ||
+               error "failed to truncate $file"
+       do_facet $SINGLEAGT $RUNAS $TRUNCATE $file 2048 ||
+               error "failed to truncate $file"
+       do_facet $SINGLEAGT $RUNAS dd if=/dev/zero of=$file bs=1024 count=1 ||
+               error "failed to dd write to $file"
+       check_lpcc_state $file "readwrite"
+
+       do_facet $SINGLEAGT $RUNAS $LFS pcc detach -k $file ||
+               error "failed to detach file $file"
+       check_lpcc_state $file "none"
+
+       # non-root user is forbidden to access PCC file directly
+       lpcc_path=$(lpcc_fid2path $hsm_root $file)
+       do_facet $SINGLEAGT $RUNAS touch $lpcc_path &&
+               error "non-root user can touch access PCC file $lpcc_path"
+       do_facet $SINGLEAGT $RUNAS dd if=$lpcc_path of=/dev/null bs=1024 \
+               count=1 && error "non-root user can read PCC file $lpcc_path"
+       do_facet $SINGLEAGT $RUNAS dd if=/dev/zero of=$lpcc_path bs=1024 \
+               count=1 && error "non-root user can write PCC file $lpcc_path"
+
+       local perm=$(do_facet $SINGLEAGT stat -c %a $lpcc_path)
+
+       [[ $perm == "0" ]] || error "PCC file permission ($perm) is not zero"
+
+       do_facet $SINGLEAGT $RUNAS $LFS pcc attach -i $HSM_ARCHIVE_NUMBER \
+               $file || error "failed to attach file $file"
+       check_lpcc_state $file "readwrite"
+
+       do_facet $SINGLEAGT $RUNAS $LFS pcc detach $file ||
+               error "failed to detach file $file"
+       check_lpcc_state $file "none"
+       wait_request_state $(path2fid $file) REMOVE SUCCEED
+}
+run_test 1e "Test RW-PCC with non-root user"
+
+# Automatic RW-PCC caching as a non-root user: the test directory gets
+# project ID 100, so a file created through $RUNAS is expected to be in
+# "readwrite" state without an explicit attach.  The file is then read,
+# truncated and rewritten, direct access to the PCC copy is checked to be
+# refused, and the file is finally detached.
+test_1f() {
+       local project_id=100
+       local agt_facet=$SINGLEAGT
+       local hsm_root=$(hsm_root)
+       local file=$DIR/$tdir/$tfile
+       local lpcc_path
+
+       # return explicitly after skip, as the other project quota tests in
+       # this script do
+       ! is_project_quota_supported &&
+               skip "project quota is not supported" && return
+
+       enable_project_quota
+       copytool setup -m "$MOUNT" -a "$HSM_ARCHIVE_NUMBER"
+       setup_pcc_mapping
+       do_facet $SINGLEAGT mkdir -p $DIR/$tdir
+       # world-writable so that the $RUNAS user can create the file
+       chmod 777 $DIR/$tdir || error "chmod 0777 $DIR/$tdir failed"
+       $LFS project -sp $project_id $DIR/$tdir ||
+               error "failed to set project for $DIR/$tdir"
+
+       do_facet $SINGLEAGT $RUNAS dd if=/dev/zero of=$file bs=1024 count=1 ||
+               error "failed to dd write to $file"
+
+       check_lpcc_state $file "readwrite"
+       do_facet $SINGLEAGT $RUNAS dd if=$file of=/dev/null bs=1024 count=1 ||
+               error "failed to dd read from $file"
+       do_facet $SINGLEAGT $RUNAS $TRUNCATE $file 256 ||
+               error "failed to truncate $file"
+       do_facet $SINGLEAGT $RUNAS $TRUNCATE $file 2048 ||
+               error "failed to truncate $file"
+       do_facet $SINGLEAGT $RUNAS dd if=/dev/zero of=$file bs=1024 count=1 ||
+               error "failed to dd write from $file"
+       check_lpcc_state $file "readwrite"
+
+       # non-root user is forbidden to access PCC file directly
+       lpcc_path=$(lpcc_fid2path $hsm_root $file)
+       do_facet $SINGLEAGT $RUNAS touch $lpcc_path &&
+               error "non-root user can touch access PCC file $lpcc_path"
+       do_facet $SINGLEAGT $RUNAS dd if=$lpcc_path of=/dev/null bs=1024 \
+               count=1 && error "non-root user can read PCC file $lpcc_path"
+       do_facet $SINGLEAGT $RUNAS dd if=/dev/zero of=$lpcc_path bs=1024 \
+               count=1 && error "non-root user can write PCC file $lpcc_path"
+
+       do_facet $SINGLEAGT $RUNAS $LFS pcc detach $file ||
+               error "failed to detach file $file"
+       check_lpcc_state $file "none"
+       wait_request_state $(path2fid $file) REMOVE SUCCEED
+}
+run_test 1f "Test auto RW-PCC cache with non-root user"
+
+# Permission checks for RW-PCC on a file created by the invoking user:
+#  - the $RUNAS user cannot write it, before or after it is attached;
+#  - after chmod 777 the $RUNAS user can write it;
+#  - the $RUNAS user cannot detach it until chown makes that user the owner;
+#  - after the detach the $RUNAS user can still read it.
+test_1g() {
+       local file=$DIR/$tfile
+
+       copytool setup -m "$MOUNT" -a "$HSM_ARCHIVE_NUMBER"
+       setup_pcc_mapping
+
+       dd if=/dev/zero of=$file bs=1024 count=1 ||
+               error "failed to dd write to $file"
+       do_facet $SINGLEAGT $RUNAS dd if=/dev/zero of=$file bs=1024 count=1 &&
+               error "non-root user can dd write to $file"
+       do_facet $SINGLEAGT $LFS pcc attach -i $HSM_ARCHIVE_NUMBER $file ||
+               error "failed to attach file $file"
+       check_lpcc_state $file "readwrite"
+       do_facet $SINGLEAGT $RUNAS dd if=/dev/zero of=$file bs=1024 count=1 &&
+               error "non-root user can dd write to $file"
+       chmod 777 $file || error "chmod 777 $file failed"
+       do_facet $SINGLEAGT $RUNAS dd if=/dev/zero of=$file bs=1024 count=1 ||
+               error "non-root user cannot write $file with permission (777)"
+
+       do_facet $SINGLEAGT $RUNAS $LFS pcc detach $file &&
+               error "non-root user or non owner can detach $file"
+       chown $RUNAS_ID $file || error "chown $RUNAS_ID $file failed"
+       do_facet $SINGLEAGT $RUNAS $LFS pcc detach $file ||
+               error "failed to detach file $file"
+       check_lpcc_state $file "none"
+       wait_request_state $(path2fid $file) REMOVE SUCCEED
+       do_facet $SINGLEAGT $RUNAS dd if=$file of=/dev/null bs=1024 count=1 ||
+               error "non-root user cannot read $file with permission (777)"
+}
+run_test 1g "General permission test for RW-PCC"
+
+#
+# While one process has created an LPCC file and still holds it open,
+# another process on the same client should be able to open the file.
+#
+# The file is created by a background multiop (O_c) in a directory carrying
+# project ID 100; it is then written from the same facet and the PCC copy,
+# the Lustre file and the HSM flags are checked before multiop is stopped.
+#
+test_2a() {
+       local project_id=100
+       local agt_facet=$SINGLEAGT
+       local hsm_root=$(hsm_root)
+       local agt_host=$(facet_active_host $SINGLEAGT)
+
+       ! is_project_quota_supported &&
+               skip "project quota is not supported" && return
+
+       enable_project_quota
+       copytool setup -m "$MOUNT" -a "$HSM_ARCHIVE_NUMBER"
+       setup_pcc_mapping
+       file=$DIR/$tdir/multiop
+       mkdir -p $DIR/$tdir
+       rm -f $file
+
+       do_facet $SINGLEAGT $LFS project -sp $project_id $DIR/$tdir ||
+               error "failed to set project quota"
+       # create the file and keep it open in the background
+       rmultiop_start $agt_host $file O_c || error "open $file failed"
+       # HSM released exists archived status
+       check_hsm_flags $file "0x0000000d"
+       do_facet $SINGLEAGT "echo -n multiopen_data > $file" ||
+               error "failed to echo multiopen_data to $file"
+
+       lpcc_path=$(lpcc_fid2path $hsm_root $file)
+       do_facet $SINGLEAGT ls -l $lpcc_path ||
+               error "failed to ls $lpcc_path"
+       check_lpcc_data $SINGLEAGT $lpcc_path $file "multiopen_data"
+       # HSM released exists archived status
+       check_hsm_flags $file "0x0000000d"
+
+       rmultiop_stop $agt_host || error "close $file failed"
+}
+run_test 2a "Test multi open when creating"
+
+# Print the first node listed in $CLIENTS (comma separated) whose hostname
+# differs from the hostname reported on the $SINGLEAGT facet.  Nothing is
+# printed when no such node is found.
+get_remote_client() {
+       # keep the working variables out of the global scope
+       local current_id
+       local r_id
+       local client
+
+       current_id=$(do_facet $SINGLEAGT hostname)
+       for client in ${CLIENTS//,/ }
+       do
+               r_id=$(do_node $client hostname)
+               if [ $r_id != $current_id ]; then
+                       echo $client
+                       return
+               fi
+       done
+}
+
+#
+# While one process holds an LPCC file open, another process on a different
+# client should be able to open the file and perform IO on it.
+#
+# Needs at least two clients.  The file is attached on $SINGLEAGT and kept
+# open there by a background multiop; a write from the remote client is then
+# expected to leave the file in PCC state "none".
+#
+test_2b() {
+       local agt_facet=$SINGLEAGT
+       local hsm_root=$(hsm_root)
+       local agt_host=$(facet_active_host $SINGLEAGT)
+
+       needclients 2 || return 0
+
+       remote_client=$(get_remote_client)
+
+       enable_project_quota
+       copytool setup -m "$MOUNT" -a "$HSM_ARCHIVE_NUMBER"
+       setup_pcc_mapping
+       file=$DIR/$tdir/multiop
+       mkdir -p $DIR/$tdir
+       rm -f $file
+
+       do_facet $SINGLEAGT "echo -n file_data > $file"
+       do_facet $SINGLEAGT lfs pcc attach -i $HSM_ARCHIVE_NUMBER \
+               $file || error "PCC attach $file failed"
+       check_lpcc_state $file "readwrite"
+
+       rmultiop_start $agt_host $file O_c || error "open $file failed"
+
+       do_node $remote_client "echo -n multiopen_data > $file"
+
+       # PCC cached file should be automatically detached
+       check_lpcc_state $file "none"
+
+       check_file_data $SINGLEAGT $file "multiopen_data"
+       rmultiop_stop $agt_host || error "close $file failed"
+       check_file_data $SINGLEAGT $file "multiopen_data"
+
+       do_node $remote_client cat $file || error \
+               "cat $file on remote client failed"
+       # the whole command is quoted so that the redirection is performed
+       # on the remote client rather than by the local shell
+       do_node $remote_client "echo -n multiopen_data > $file" ||
+               error "write $file on remote client failed"
+}
+run_test 2b "Test multi remote open when creating"
+
+# Same scenario as a remote open, but using the second mount point: the file
+# is attached and held open through $DIR, then written through $DIR2, after
+# which it is expected to be in PCC state "none".
+test_2c() {
+       local agt_host=$(facet_active_host $SINGLEAGT)
+       local file=$DIR/$tdir/$tfile
+       local file2=$DIR2/$tdir/$tfile
+
+       enable_project_quota
+       copytool setup -m "$MOUNT" -a "$HSM_ARCHIVE_NUMBER"
+       setup_pcc_mapping
+       mkdir -p $DIR/$tdir
+       rm -f $file
+
+       do_facet $SINGLEAGT "echo -n file_data > $file"
+       do_facet $SINGLEAGT lfs pcc attach -i $HSM_ARCHIVE_NUMBER \
+               $file || error "PCC attach $file failed"
+       check_lpcc_state $file "readwrite"
+
+       # keep the file open in the background while it is written elsewhere
+       rmultiop_start $agt_host $file O_c || error "open $file failed"
+
+       echo -n multiopen_data > $file2
+
+       # PCC cached file should be automatically detached
+       check_lpcc_state $file "none"
+
+       check_file_data $SINGLEAGT $file "multiopen_data"
+       rmultiop_stop $agt_host || error "close $file failed"
+       check_file_data $SINGLEAGT $file "multiopen_data"
+
+       cat $file2 || error "cat $file on mount $MOUNT2 failed"
+       echo -n "multiopen_data" > $file2 ||
+               error "write $file on mount $MOUNT2 failed"
+}
+run_test 2c "Test multi open on different mount points when creating"
+
+# Attach a file and detach it with -k twice in a row on the same agent,
+# checking the PCC state after every step.
+test_3a() {
+       local file=$DIR/$tdir/$tfile
+
+       copytool setup -m "$MOUNT" -a "$HSM_ARCHIVE_NUMBER"
+       setup_pcc_mapping
+
+       mkdir -p $DIR/$tdir || error "mkdir $DIR/$tdir failed"
+       dd if=/dev/zero of=$file bs=1024 count=1 ||
+               error "failed to dd write to $file"
+
+       echo "Start to attach/detach the file: $file"
+       do_facet $SINGLEAGT $LFS pcc attach -i $HSM_ARCHIVE_NUMBER $file ||
+               error "failed to attach file $file"
+       check_lpcc_state $file "readwrite"
+       do_facet $SINGLEAGT $LFS pcc detach -k $file ||
+               error "failed to detach file $file"
+       check_lpcc_state $file "none"
+
+       echo "Repeat to attach/detach the same file: $file"
+       do_facet $SINGLEAGT $LFS pcc attach -i $HSM_ARCHIVE_NUMBER $file ||
+               error "failed to attach file $file"
+       check_lpcc_state $file "readwrite"
+       do_facet $SINGLEAGT $LFS pcc detach -k $file ||
+               error "failed to detach file $file"
+       check_lpcc_state $file "none"
+}
+run_test 3a "Repeat attach/detach operations"
+
+# Attach/detach the same file from several agents.  Needs at least three
+# clients.  Every agent agt<n> gets its own copytool and a PCC mapping with
+# rwid=<n>.  The file is attached and detached on agt1, then on agt2, and
+# finally attached on agt1 and agt2 in turn, after which agt1 is expected to
+# report state "none".
+test_3b() {
+       local n
+       local file=$DIR/$tdir/$tfile
+
+       needclients 3 || return 0
+
+       # Start all of the copytools and setup PCC
+       for n in $(seq $AGTCOUNT); do
+               copytool setup -f agt$n -a $n -m $MOUNT
+               setup_pcc_mapping agt$n "projid={100}\ rwid=$n"
+       done
+
+       mkdir -p $DIR/$tdir || error "mkdir $DIR/$tdir failed"
+       dd if=/dev/zero of=$file bs=1024 count=1 ||
+               error "failed to dd write to $file"
+
+       echo "Start to attach/detach $file on $agt1_HOST"
+       do_facet agt1 $LFS pcc attach -i 1 $file ||
+               error "failed to attach file $file"
+       check_lpcc_state $file "readwrite" agt1
+       do_facet agt1 $LFS pcc detach -k $file ||
+               error "failed to detach file $file"
+       check_lpcc_state $file "none" agt1
+
+       echo "Repeat to attach/detach $file on $agt2_HOST"
+       do_facet agt2 $LFS pcc attach -i 2 $file ||
+               error "failed to attach file $file"
+       check_lpcc_state $file "readwrite" agt2
+       do_facet agt2 $LFS pcc detach -k $file ||
+               error "failed to detach file $file"
+       check_lpcc_state $file "none" agt2
+
+       echo "Try attach on two agents"
+       do_facet agt1 $LFS pcc attach -i 1 $file ||
+               error "failed to attach file $file"
+       check_lpcc_state $file "readwrite" agt1
+       do_facet agt2 $LFS pcc attach -i 2 $file ||
+               error "failed to attach file $file"
+       check_lpcc_state $file "readwrite" agt2
+       # The later attach PCC agent should succeed,
+       # the former agent should be detached automatically.
+       check_lpcc_state $file "none" agt1
+       do_facet agt2 $LFS pcc detach -k $file ||
+               error "failed to detach file $file"
+       check_lpcc_state $file "none" agt2
+}
+run_test 3b "Repeat attach/detach operations on multiple clients"
+
+# Run the mmap_sanity program (all of its tests except number 7) in a
+# directory that carries project ID 100, using $DIR and $DIR2 as the two
+# mount points.  Skipped when project quota is not supported.
+test_4() {
+       local project_id=100
+
+       ! is_project_quota_supported &&
+               skip "project quota is not supported" && return
+
+       enable_project_quota
+       copytool setup -m "$MOUNT" -a "$HSM_ARCHIVE_NUMBER"
+       setup_pcc_mapping
+
+       mkdir -p $DIR/$tdir || error "mkdir $DIR/$tdir failed"
+       lfs project -sp $project_id $DIR/$tdir ||
+               error "lfs project -sp $project_id $DIR/$tdir failed"
+
+       # mmap_sanity tst7 fails on the local ext4 filesystem.
+       # It seems that the Lustre filesystem handles tst7 specially.
+       # Thus, we exclude tst7 from the PCC testing.
+       $LUSTRE/tests/mmap_sanity -d $DIR/$tdir -m $DIR2/$tdir -e 7 ||
+               error "mmap_sanity test failed"
+       sync; sleep 1; sync
+}
+run_test 4 "Auto cache test for mmap"
+
+# Read an attached file through $MMAP_CAT, restore it with "lfs hsm_restore"
+# (after which its PCC state must be "none") and read it through $MMAP_CAT
+# again; the content must be the same both times.
+test_5() {
+       local file=$DIR/$tfile
+
+       copytool setup -m "$MOUNT" -a "$HSM_ARCHIVE_NUMBER"
+       setup_pcc_mapping
+
+       do_facet $SINGLEAGT "echo -n attach_mmap_data > $file" ||
+               error "echo $file failed"
+
+       do_facet $SINGLEAGT $LFS pcc attach -i $HSM_ARCHIVE_NUMBER $file ||
+               error "failed to attach file $file"
+       check_lpcc_state $file "readwrite"
+
+       local content=$($MMAP_CAT $file)
+
+       [[ $content == "attach_mmap_data" ]] ||
+               error "mmap cat data mismatch: $content"
+
+       $LFS hsm_restore $file || error "failed to restore $file"
+       wait_request_state $(path2fid $file) RESTORE SUCCEED
+       check_lpcc_state $file "none"
+
+       content=$($MMAP_CAT $file)
+       [[ $content == "attach_mmap_data" ]] ||
+               error "mmap cat data mismatch: $content"
+}
+run_test 5 "Mmap & cat a RW-PCC cached file"
+
+# Create a file-backed ext4 filesystem on a facet and mount it through a
+# loop device with user and group quota options.
+#   $1 - facet to run on
+#   $2 - backing file to create
+#   $3 - mount point
+#   $4 - size of the backing file in units of 1M dd blocks (default: 50)
+# The unmount and the removal of the backing file and of the mount point
+# are registered with stack_trap.
+setup_loopdev() {
+       local facet=$1
+       local file=$2
+       local mntpt=$3
+       local size=${4:-50}
+
+       # report the directory that was actually being created
+       do_facet $facet mkdir -p $mntpt || error "mkdir -p $mntpt failed"
+       stack_trap "do_facet $facet rm -rf $mntpt" EXIT
+       do_facet $facet dd if=/dev/zero of=$file bs=1M count=$size
+       stack_trap "do_facet $facet rm -f $file" EXIT
+       do_facet $facet mkfs.ext4 $file ||
+               error "mkfs.ext4 $file failed"
+       do_facet $facet file $file
+       do_facet $facet mount -t ext4 -o loop,usrquota,grpquota $file $mntpt ||
+               error "mount -o loop,usrquota,grpquota $file $mntpt failed"
+       stack_trap "do_facet $facet $UMOUNT $mntpt" EXIT
+}
+
+# mmap write on an attached file, with hsm_root placed on a 50M loop-mounted
+# ext4 filesystem: after the multiop mmap write the file must still be in
+# "readwrite" state and show the modified first character, and the content
+# must be unchanged after a detach with -k.
+test_6() {
+       local loopfile="$TMP/$tfile"
+       local mntpt="/mnt/pcc.$tdir"
+       local hsm_root="$mntpt/$tdir"
+       local file=$DIR/$tfile
+       local content
+
+       setup_loopdev $SINGLEAGT $loopfile $mntpt 50
+       copytool setup -m "$MOUNT" -a "$HSM_ARCHIVE_NUMBER"
+       setup_pcc_mapping
+
+       echo -n mmap_write_data > $file || error "echo write $file failed"
+       do_facet $SINGLEAGT $LFS pcc attach -i $HSM_ARCHIVE_NUMBER $file ||
+               error "failed to attach file $file"
+       check_lpcc_state $file "readwrite"
+
+       do_facet $SINGLEAGT $MULTIOP $file OSMWUc ||
+               error "could not mmap $file"
+       check_lpcc_state $file "readwrite"
+       content=$(do_facet $SINGLEAGT $MMAP_CAT $file)
+       # After the mmap write via multiop, the first character of each page
+       # is increased by 1 ("m" becomes "n").
+       [[ $content == "nmap_write_data" ]] ||
+               error "mmap write data mismatch: $content"
+       check_lpcc_state $file "readwrite"
+
+       do_facet $SINGLEAGT $LFS pcc detach -k $file ||
+               error "failed to detach file $file"
+
+       content=$(do_facet $SINGLEAGT $MMAP_CAT $file)
+       [[ $content == "nmap_write_data" ]] ||
+               error "mmap write data mismatch: $content"
+}
+run_test 6 "Test mmap write on RW-PCC "
+
+# mmap write on an attached file with fail_loc 0x1412
+# (OBD_FAIL_LLITE_PCC_DETACH_MKWRITE) set on the agent: the write must
+# succeed, the file must end up in PCC state "none" and its first character
+# must have been increased by the mmap write.
+test_7a() {
+       local loopfile="$TMP/$tfile"
+       local mntpt="/mnt/pcc.$tdir"
+       local hsm_root="$mntpt/$tdir"
+       local file=$DIR/$tfile
+       local content
+
+       setup_loopdev $SINGLEAGT $loopfile $mntpt 50
+       copytool setup -m "$MOUNT" -a "$HSM_ARCHIVE_NUMBER"
+       setup_pcc_mapping
+
+       echo "QQQQQ" > $file
+       do_facet $SINGLEAGT $LFS pcc attach -i $HSM_ARCHIVE_NUMBER $file ||
+               error "failed to attach file $file"
+       check_lpcc_state $file "readwrite"
+       check_file_data $SINGLEAGT $file "QQQQQ"
+       # define OBD_FAIL_LLITE_PCC_DETACH_MKWRITE      0x1412
+       do_facet $SINGLEAGT $LCTL set_param fail_loc=0x1412
+       # HSM released exists archived status
+       check_hsm_flags $file "0x0000000d"
+
+       # the multiop mmap write increases the first character of each page
+       # by 1
+       do_facet $SINGLEAGT $MULTIOP $file OSMWUc ||
+               error "mmap write $file failed"
+       check_lpcc_state $file "none"
+       content=$(do_facet $SINGLEAGT $MMAP_CAT $file)
+       [[ $content == "RQQQQ" ]] || error "data mismatch: $content"
+}
+run_test 7a "Fake file detached between fault() and page_mkwrite() for RW-PCC"
+
+# Race between an mmap write and a detach: with fail_loc 0x1413
+# (OBD_FAIL_LLITE_PCC_MKWRITE_PAUSE) and fail_val=20 set on the agent, a
+# multiop mmap write is started in the background and the file is detached
+# with -k while it runs.  Both must succeed, the file must end up in PCC
+# state "none" and carry the modified first character.
+test_7b() {
+       local loopfile="$TMP/$tfile"
+       local mntpt="/mnt/pcc.$tdir"
+       local hsm_root="$mntpt/$tdir"
+       local file=$DIR/$tfile
+       local content
+       local pid
+
+       setup_loopdev $SINGLEAGT $loopfile $mntpt 50
+       copytool setup -m "$MOUNT" -a "$HSM_ARCHIVE_NUMBER"
+       setup_pcc_mapping
+
+       echo "QQQQQ" > $file
+       do_facet $SINGLEAGT $LFS pcc attach -i $HSM_ARCHIVE_NUMBER $file ||
+               error "failed to attach file $file"
+       check_lpcc_state $file "readwrite"
+       check_file_data $SINGLEAGT $file "QQQQQ"
+       # define OBD_FAIL_LLITE_PCC_MKWRITE_PAUSE       0x1413
+       do_facet $SINGLEAGT $LCTL set_param fail_loc=0x1413 fail_val=20
+       # HSM released exists archived status
+       check_hsm_flags $file "0x0000000d"
+
+       # the multiop mmap write increases the first character of each page
+       # by 1; it runs in the background so the detach can overlap with it
+       do_facet $SINGLEAGT $MULTIOP $file OSMWUc &
+       pid=$!
+
+       do_facet $SINGLEAGT $LFS pcc detach -k $file ||
+               error "failed to detach file $file"
+
+       wait $pid || error "multiop mmap write failed"
+       check_lpcc_state $file "none"
+       content=$(do_facet $SINGLEAGT $MMAP_CAT $file)
+       [[ $content == "RQQQQ" ]] || error "data mismatch: $content"
+}
+run_test 7b "Test the race with concurrent mkwrite and detach"
+
+# Write to an attached file with fail_loc 0x1411
+# (OBD_FAIL_LLITE_PCC_FAKE_ERROR) set on the agent: the file must end up in
+# PCC state "none" and contain the newly written data.
+test_8() {
+       local file=$DIR/$tfile
+
+       copytool setup -m "$MOUNT" -a "$HSM_ARCHIVE_NUMBER"
+       setup_pcc_mapping
+
+       echo "QQQQQ" > $file
+       do_facet $SINGLEAGT $LFS pcc attach -i $HSM_ARCHIVE_NUMBER $file ||
+               error "failed to attach file $file"
+       check_lpcc_state $file "readwrite"
+       check_file_data $SINGLEAGT $file "QQQQQ"
+
+       # define OBD_FAIL_LLITE_PCC_FAKE_ERROR  0x1411
+       do_facet $SINGLEAGT $LCTL set_param fail_loc=0x1411
+       do_facet $SINGLEAGT "echo -n ENOSPC_write > $file"
+       # The write above gets an -ENOSPC failure and the IO is retried on the
+       # normal IO path, which restores the HSM released file.
+       check_lpcc_state $file "none"
+       check_file_data $SINGLEAGT $file "ENOSPC_write"
+}
+run_test 8 "Test fake -ENOSPC tolerance for RW-PCC"
+
+# -ENOSPC tolerance with a real, small PCC backend: hsm_root lives on a 50M
+# loop-mounted ext4 filesystem and 60M are written to an attached file.  The
+# write must succeed, the file must end up in PCC state "none" and have the
+# full size of 62914560 bytes.
+test_9() {
+       local loopfile="$TMP/$tfile"
+       local mntpt="/mnt/pcc.9a"
+       local hsm_root="$mntpt/$tdir"
+       local file=$DIR/$tfile
+
+       setup_loopdev $SINGLEAGT $loopfile $mntpt 50
+
+       # use the same archive number for the copytool as for the attach below
+       copytool setup -m "$MOUNT" -a "$HSM_ARCHIVE_NUMBER" -h "$hsm_root"
+       setup_pcc_mapping
+       do_facet $SINGLEAGT $LCTL pcc list $MOUNT
+
+       touch $file || error "touch $file failed"
+       do_facet $SINGLEAGT $LFS pcc attach -i $HSM_ARCHIVE_NUMBER $file ||
+               error "fail to attach $file"
+       check_lpcc_state $file "readwrite"
+       # write 60M data, it is larger than the capacity of PCC backend
+       do_facet $SINGLEAGT dd if=/dev/zero of=$file bs=1M count=60 ||
+               error "fail to dd write $file"
+       check_lpcc_state $file "none"
+       check_file_size $SINGLEAGT $file 62914560
+}
+run_test 9 "Test -ENOSPC tolerance on loop PCC device for RW-PCC"
+
+# Common body of the user/group quota tests on a loop-mounted PCC backend.
+#   $1 - "u" to test user quota (ID $RUNAS_ID), "g" to test group quota
+#        (ID $RUNAS_GID)
+# A limit of 20480 is set as the second limit argument of setquota (the block
+# hard limit -- NOTE(review): confirm the unit against setquota(8)).  Two 15M
+# files owned by $RUNAS_ID:$RUNAS_GID are created; attaching the first must
+# succeed and attaching the second must fail.  Growing the first file to 30M
+# must still succeed and leave it in PCC state "none".
+test_usrgrp_quota() {
+       local loopfile="$TMP/$tfile"
+       local mntpt="/mnt/pcc.$tdir"
+       local hsm_root="$mntpt/$tdir"
+       local ug=$1
+       local id=$RUNAS_ID
+
+       [[ $ug == "g" ]] && id=$RUNAS_GID
+
+       setup_loopdev $SINGLEAGT $loopfile $mntpt 50
+       do_facet $SINGLEAGT quotacheck -c$ug $mntpt ||
+               error "quotacheck -c$ug $mntpt failed"
+       do_facet $SINGLEAGT quotaon -$ug $mntpt ||
+               error "quotaon -$ug $mntpt failed"
+       do_facet $SINGLEAGT setquota -$ug $id 0 20480 0 0 $mntpt ||
+               error "setquota -$ug $id on $mntpt failed"
+       do_facet $SINGLEAGT repquota -${ug}vs $mntpt
+
+       # use the same archive number for the copytool as for the attach below
+       copytool setup -m "$MOUNT" -a "$HSM_ARCHIVE_NUMBER" -h "$hsm_root"
+       setup_pcc_mapping
+       do_facet $SINGLEAGT $LCTL pcc list $MOUNT
+
+       mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed"
+
+       local file1=$DIR/$tdir/${ug}quotaA
+       local file2=$DIR/$tdir/${ug}quotaB
+
+       dd if=/dev/zero of=$file1 bs=1M count=15 ||
+               error "dd write $file1 failed"
+       dd if=/dev/zero of=$file2 bs=1M count=15 ||
+               error "dd write $file2 failed"
+       chown $RUNAS_ID:$RUNAS_GID $file1 ||
+               error "chown $RUNAS_ID:$RUNAS_GID $file1 failed"
+       chown $RUNAS_ID:$RUNAS_GID $file2 ||
+               error "chown $RUNAS_ID:$RUNAS_GID $file2 failed"
+       do_facet $SINGLEAGT $RUNAS $LFS pcc attach -i $HSM_ARCHIVE_NUMBER \
+               $file1 || error "attach $file1 failed"
+       do_facet $SINGLEAGT $RUNAS $LFS pcc attach -i $HSM_ARCHIVE_NUMBER \
+               $file2 && error "attach $file2 should fail due to quota limit"
+       check_lpcc_state $file1 "readwrite"
+       check_lpcc_state $file2 "none"
+
+       do_facet $SINGLEAGT $RUNAS dd if=/dev/zero of=$file1 bs=1M count=30 ||
+               error "dd write $file1 failed"
+       # -EDQUOT error should be tolerated via fallback to normal Lustre path.
+       check_lpcc_state $file1 "none"
+       do_facet $SINGLEAGT $LFS pcc detach -k $file1 ||
+               error "failed to detach file $file1"
+       rm $file1 $file2
+}
+
+# Run the shared quota scenario with a user ("u") quota.
+test_10a() {
+       test_usrgrp_quota "u"
+}
+run_test 10a "Test RW-PCC with user quota on loop PCC device"
+
+# Run the shared quota scenario with a group ("g") quota.
+test_10b() {
+       test_usrgrp_quota "g"
+}
+run_test 10b "Test RW-PCC with group quota on loop PCC device"
+
+# Attach a file to RW-PCC while the path of its PCC copy has been prepared in
+# three different ways on the backend: parent directory already present
+# (attach must succeed), parent directory immutable (attach must fail), and
+# the PCC file path itself being a directory (attach must fail).
+test_11() {
+       local loopfile="$TMP/$tfile"
+       local mntpt="/mnt/pcc.$tdir"
+       local hsm_root="$mntpt/$tdir"
+       local file=$DIR/$tfile
+       local -a lpcc_path
+       local lpcc_dir
+
+       setup_loopdev $SINGLEAGT $loopfile $mntpt 50
+       copytool setup -m "$MOUNT" -a "$HSM_ARCHIVE_NUMBER"
+       setup_pcc_mapping
+
+       # The parent directory of the PCC file already exists
+       do_facet $SINGLEAGT "echo -n QQQQQ > $file"
+       lpcc_path=$(lpcc_fid2path $hsm_root $file)
+       lpcc_dir=$(dirname $lpcc_path)
+       echo "Lustre file: $file LPCC dir: $lpcc_dir"
+       do_facet $SINGLEAGT mkdir -p $lpcc_dir ||
+               error "mkdir -p $lpcc_dir failed"
+       do_facet $SINGLEAGT $LFS pcc attach -i $HSM_ARCHIVE_NUMBER $file ||
+               error "failed to attach $file"
+       check_lpcc_state $file "readwrite"
+       check_file_data $SINGLEAGT $file "QQQQQ"
+       do_facet $SINGLEAGT $LFS pcc detach -k $file ||
+               error "failed to detach $file"
+       rm $file || error "rm $file failed"
+
+       # The parent directory of the PCC file is immutable
+       do_facet $SINGLEAGT "echo -n immutable_dir > $file"
+       lpcc_path=$(lpcc_fid2path $hsm_root $file)
+       lpcc_dir=$(dirname $lpcc_path)
+       echo "Lustre file: $file LPCC dir: $lpcc_dir"
+       do_facet $SINGLEAGT mkdir -p $lpcc_dir ||
+               error "mkdir -p $lpcc_dir failed"
+       do_facet $SINGLEAGT chattr +i $lpcc_dir ||
+               error "chattr +i $lpcc_dir failed"
+       do_facet $SINGLEAGT $LFS pcc attach -i $HSM_ARCHIVE_NUMBER $file &&
+               error "attach $file with immutable directory should be failed"
+       # Drop the immutable flag again before moving on
+       do_facet $SINGLEAGT chattr -i $lpcc_dir ||
+               error "chattr -i $lpcc_dir failed"
+       rm $file || error "rm $file failed"
+
+       # The PCC file path is set to a directory
+       do_facet $SINGLEAGT "echo -n pcc_file_path_is_dir > $file"
+       lpcc_path=$(lpcc_fid2path $hsm_root $file)
+       do_facet $SINGLEAGT mkdir -p $lpcc_path ||
+               error "mkdir -p $lpcc_path failed"
+       do_facet $SINGLEAGT $LFS pcc attach -i $HSM_ARCHIVE_NUMBER $file &&
+               error "attach $file should fail as PCC path is a directory"
+       rm $file || error "rm $file failed"
+}
+run_test 11 "Test attach fault injection with simulated PCC file path"
+
+# Start an RW-PCC attach in the background with fail_loc 0x1414
+# (OBD_FAIL_LLITE_PCC_ATTACH_PAUSE) armed, run "lfs hsm_remove" on the same
+# file meanwhile, and expect the attach to fail without leaving a PCC copy.
+test_12() {
+       local file=$DIR/$tfile
+       local hsm_root=$(hsm_root)
+       local -a lpcc_path
+       local pid
+
+       copytool setup -m "$MOUNT" -a "$HSM_ARCHIVE_NUMBER"
+       setup_pcc_mapping
+
+       echo  -n race_rw_attach_hsmremove > $file
+       lpcc_path=$(lpcc_fid2path $hsm_root $file)
+       do_facet $SINGLEAGT $LFS pcc attach -i $HSM_ARCHIVE_NUMBER $file ||
+               error "attach $file failed"
+       do_facet $SINGLEAGT $LFS pcc detach -k $file ||
+               error "detach $file failed"
+       # HSM released exists archived status
+       check_hsm_flags $file "0x0000000d"
+       # define OBD_FAIL_LLITE_PCC_ATTACH_PAUSE        0x1414
+       do_facet $SINGLEAGT $LCTL set_param fail_loc=0x1414 fail_val=20
+       # Attach runs in the background; its exit status is collected by the
+       # "wait $pid" below.
+       do_facet $SINGLEAGT $LFS pcc attach -i $HSM_ARCHIVE_NUMBER $file &
+       pid=$!
+       $LFS hsm_state $file
+       sleep 3
+       wait_request_state $(path2fid $file) RESTORE SUCCEED
+       $LFS hsm_remove $file || error "hsm remove $file failed"
+       wait $pid && error "RW-PCC attach $file should fail"
+       do_facet $SINGLEAGT "[ -f $lpcc_path ]" &&
+               error "RW-PCC cached file '$lpcc_path' should be removed"
+
+       # The last check above is expected to be false; do not let its status
+       # become the status of the test.
+       return 0
+}
+run_test 12 "RW-PCC attach races with concurrent HSM remove"
+
+# Install a PCC rule "<kind>id={<value>}" and check that a file created by a
+# matching identity ends up in "readwrite" PCC state, stays there across
+# read/truncate/write, and goes to "none" after "lfs pcc detach -k".
+#
+# $1: id kind prefix; "id" is appended, so "u" gives "uid", "g" gives "gid"
+# $2: id value placed inside the braces of the rule
+# $3: command prefix used to run the I/O under the matching identity
+test_rule_id() {
+       local idstr="${1}id"
+       local rule="${idstr}={$2}"
+       local myRUNAS="$3"
+       local file=$DIR/$tdir/$tfile
+
+       setup_pcc_mapping $SINGLEAGT "$rule\ rwid=$HSM_ARCHIVE_NUMBER"
+       $LCTL pcc list $MOUNT
+
+       do_facet $SINGLEAGT mkdir -p $DIR/$tdir
+       chmod 777 $DIR/$tdir || error "chmod 0777 $DIR/$tdir failed"
+
+       rm -f $file || error "rm $file failed"
+       do_facet $SINGLEAGT $myRUNAS dd if=/dev/zero of=$file bs=1024 count=1 ||
+               error "failed to dd write to $file"
+       check_lpcc_state $file "readwrite"
+       do_facet $SINGLEAGT $myRUNAS dd if=$file of=/dev/null bs=1024 count=1 ||
+               error "failed to dd read from $file"
+       do_facet $SINGLEAGT $myRUNAS $TRUNCATE $file 256 ||
+               error "failed to truncate $file"
+       do_facet $SINGLEAGT $myRUNAS $TRUNCATE $file 2048 ||
+               error "failed to truncate $file"
+       do_facet $SINGLEAGT $myRUNAS dd if=/dev/zero of=$file bs=1024 count=1 ||
+               error "failed to dd write from $file"
+       check_lpcc_state $file "readwrite"
+
+       do_facet $SINGLEAGT $myRUNAS $LFS pcc detach -k $file ||
+               error "failed to detach file $file"
+       check_lpcc_state $file "none"
+}
+
+# Run the id-rule scenario once with a uid rule and once with a gid rule,
+# both using the id value 500.
+test_13a() {
+       copytool setup -m "$MOUNT" -a "$HSM_ARCHIVE_NUMBER"
+       test_rule_id "u" "500" "runas -u 500"
+       test_rule_id "g" "500" "runas -u 500 -g 500"
+}
+run_test 13a "Test auto RW-PCC create caching for UID/GID rule"
+
+test_13b() {
+       local file
+
+       copytool setup -m "$MOUNT" -a "$HSM_ARCHIVE_NUMBER"
+       setup_pcc_mapping $SINGLEAGT \
+               "fname={*.h5\ suffix.*\ Mid*dle}\ rwid=$HSM_ARCHIVE_NUMBER"
+       $LCTL pcc list $MOUNT
+
+       do_facet $SINGLEAGT mkdir -p $DIR/$tdir
+       chmod 777 $DIR/$tdir || error "chmod 0777 $DIR/$tdir failed"
+
+       file=$DIR/$tdir/prefix.h5
+       do_facet $SINGLEAGT dd if=/dev/zero of=$file bs=1024 count=1 ||
+               error "failed to dd write to $file"
+       check_lpcc_state $file "readwrite"
+       do_facet $SINGLEAGT $myRUNAS $LFS pcc detach -k $file ||
+               error "failed to detach file $file"
+       check_lpcc_state $file "none"
+       rm $file || error "rm $file failed"
+
+       file=$DIR/$tdir/suffix.doc
+       do_facet $SINGLEAGT $RUNAS dd if=/dev/zero of=$file bs=1024 count=1 ||
+               error "failed to dd write to $file"
+       check_lpcc_state $file "readwrite"
+       do_facet $SINGLEAGT $myRUNAS $LFS pcc detach -k $file ||
+               error "failed to detach file $file"
+       check_lpcc_state $file "none"
+       rm $file || error "rm $file failed"
+
+       file=$DIR/$tdir/MidPADdle
+       do_facet $SINGLEAGT $RUNAS dd if=/dev/zero of=$file bs=1024 count=1 ||
+               error "failed to dd write to $file"
+       check_lpcc_state $file "readwrite"
+       do_facet $SINGLEAGT $myRUNAS $LFS pcc detach -k $file ||
+               error "failed to detach file $file"
+       check_lpcc_state $file "none"
+       rm $file || error "rm $file failed"
+
+       file=$DIR/$tdir/Midpad
+       do_facet $SINGLEAGT $RUNAS dd if=/dev/zero of=$file bs=1024 count=1 ||
+               error "failed to dd write to $file"
+       check_lpcc_state $file "none"
+       rm $file || error "rm $file failed"
+}
+run_test 13b "Test auto RW-PCC create caching for file name with wildcard"
+
+# Check auto caching with a combined rule. The expectations below treat the
+# rule as "(projid 100 or 200) and fname *.h5" or "uid 500 and gid 1000":
+# only *.h5 files inside the two project directories and a file created as
+# uid 500/gid 1000 are expected in "readwrite" state.
+# Returns 0 without testing anything when project quota is not supported.
+test_13c() {
+       local file
+       local myRUNAS
+
+       ! is_project_quota_supported &&
+               echo "Skip project quota is not supported" && return 0
+
+       enable_project_quota
+       copytool setup -m "$MOUNT" -a "$HSM_ARCHIVE_NUMBER"
+       setup_pcc_mapping $SINGLEAGT \
+               "projid={100\ 200}\&fname={*.h5},uid={500}\&gid={1000}\ rwid=$HSM_ARCHIVE_NUMBER"
+       $LCTL pcc list $MOUNT
+       do_facet $SINGLEAGT mkdir -p $DIR/$tdir
+       chmod 777 $DIR/$tdir || error "chmod 0777 $DIR/$tdir failed"
+
+       # One directory per project ID named in the rule
+       mkdir -p $DIR/$tdir/proj || error "mkdir $DIR/$tdir/proj failed"
+       mkdir -p $DIR/$tdir/proj2 || error "mkdir $DIR/$tdir/proj2 failed"
+       $LFS project -sp 100 $DIR/$tdir/proj ||
+               error "failed to set project for $DIR/$tdir/proj"
+       $LFS project -sp 200 $DIR/$tdir/proj2 ||
+               error "failed to set project for $DIR/$tdir/proj2"
+
+       # Project 100, name does not match "*.h5"
+       file=$DIR/$tdir/proj/notcache
+       do_facet $SINGLEAGT dd if=/dev/zero of=$file bs=1024 count=1 ||
+               error "failed to dd write to $file"
+       check_lpcc_state $file "none"
+       rm $file || error "rm $file failed"
+
+       # Project 100, name matches "*.h5"
+       file=$DIR/$tdir/proj/autocache.h5
+       do_facet $SINGLEAGT dd if=/dev/zero of=$file bs=1024 count=1 ||
+               error "failed to dd write to $file"
+       check_lpcc_state $file "readwrite"
+       do_facet $SINGLEAGT $LFS pcc detach -k $file ||
+               error "failed to detach $file"
+       rm $file || error "rm $file failed"
+
+       # Project 200, name does not match "*.h5"
+       file=$DIR/$tdir/proj2/notcache
+       do_facet $SINGLEAGT dd if=/dev/zero of=$file bs=1024 count=1 ||
+               error "failed to dd write to $file"
+       check_lpcc_state $file "none"
+       rm $file || error "rm $file failed"
+
+       # Project 200, name matches "*.h5"
+       file=$DIR/$tdir/proj2/autocache.h5
+       do_facet $SINGLEAGT dd if=/dev/zero of=$file bs=1024 count=1 ||
+               error "failed to dd write to $file"
+       check_lpcc_state $file "readwrite"
+       do_facet $SINGLEAGT $LFS pcc detach -k $file ||
+               error "failed to detach $file"
+       rm $file || error "rm $file failed"
+
+       # Outside the project directories, created as uid 500 / gid 1000
+       file=$DIR/$tdir/ugidcache
+       myRUNAS="runas -u 500 -g 1000"
+       do_facet $SINGLEAGT $myRUNAS dd if=/dev/zero of=$file bs=1024 count=1 ||
+               error "failed to dd write to $file"
+       check_lpcc_state $file "readwrite"
+       do_facet $SINGLEAGT $LFS pcc detach -k $file ||
+               error "failed to detach $file"
+       rm $file || error "rm $file failed"
+}
+run_test 13c "Check auto RW-PCC create caching for UID/GID/ProjID/fname rule"
+
+# Attach a file to RW-PCC, clear the MDC lock LRU on the agent, then check
+# that the file content is intact and that the PCC state has become "none".
+test_14() {
+       local file=$DIR/$tdir/$tfile
+
+       copytool setup -m "$MOUNT" -a "$HSM_ARCHIVE_NUMBER"
+       setup_pcc_mapping
+
+       mkdir -p $DIR/$tdir || error "mkdir -p $DIR/$tdir failed"
+       do_facet $SINGLEAGT "echo -n autodetach_data > $file"
+       do_facet $SINGLEAGT $LFS pcc attach -i $HSM_ARCHIVE_NUMBER \
+               $file || error "PCC attach $file failed"
+       check_lpcc_state $file "readwrite"
+
+       # Revoke the layout lock, the PCC-cached file will be
+       # detached automatically.
+       do_facet $SINGLEAGT $LCTL \
+               set_param ldlm.namespaces.*mdc*.lru_size=clear
+       check_file_data $SINGLEAGT $file "autodetach_data"
+       check_lpcc_state $file "none"
+}
+run_test 14 "Revocation of the layout lock should detach the file automatically"
+
+# With a PCC mapping that sets open_attach=1, check (first as $RUNAS, then as
+# root) that an attached file is still reported as "readwrite" after the MDC
+# lock LRU is cleared and after "lfs pcc detach -k", and that it is reported
+# as "none" once "lfs hsm_restore" has completed.
+# NOTE(review): presumably check_lpcc_state opens the file and open_attach
+# re-attaches it at that point - confirm against check_lpcc_state.
+test_15() {
+       local loopfile="$TMP/$tfile"
+       local mntpt="/mnt/pcc.$tdir"
+       local hsm_root="$mntpt/$tdir"
+       local file=$DIR/$tdir/$tfile
+
+       setup_loopdev $SINGLEAGT $loopfile $mntpt 50
+       copytool setup -m "$MOUNT" -a "$HSM_ARCHIVE_NUMBER"
+       setup_pcc_mapping $SINGLEAGT \
+               "projid={100}\ rwid=$HSM_ARCHIVE_NUMBER\ open_attach=1"
+
+       mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed"
+       chmod 777 $DIR/$tdir || error "chmod 777 $DIR/$tdir failed"
+
+       echo "Check open attach for non-root user"
+       do_facet $SINGLEAGT $RUNAS dd if=/dev/zero of=$file bs=1024 count=1 ||
+               error "failed to dd write to $file"
+       do_facet $SINGLEAGT $RUNAS $LFS pcc attach -i $HSM_ARCHIVE_NUMBER \
+               $file || error "failed to attach file $file"
+       do_facet $SINGLEAGT $RUNAS $LFS pcc state $file
+       check_lpcc_state $file "readwrite" $SINGLEAGT "$RUNAS"
+       # Revoke the layout lock, the PCC-cached file will be
+       # detached automatically.
+       do_facet $SINGLEAGT $LCTL \
+               set_param ldlm.namespaces.*mdc*.lru_size=clear
+       check_lpcc_state $file "readwrite" $SINGLEAGT "$RUNAS"
+       # Detach the file but keep the cache, as the file layout generation
+       # is not changed, so the file is still valid cached in PCC, and can
+       # be reused from PCC cache directly.
+       do_facet $SINGLEAGT $RUNAS $LFS pcc detach -k $file ||
+               error "PCC detach $file failed"
+       check_lpcc_state $file "readwrite" $SINGLEAGT "$RUNAS"
+       # Detach without -k before removing the file
+       do_facet $SINGLEAGT $RUNAS $LFS pcc detach $file ||
+               error "PCC detach $file failed"
+       rm $file || error "rm $file failed"
+
+       echo "check open attach for root user"
+       do_facet $SINGLEAGT "echo -n autoattach_data > $file"
+       do_facet $SINGLEAGT $LFS pcc attach -i $HSM_ARCHIVE_NUMBER \
+               $file || error "PCC attach $file failed"
+       check_lpcc_state $file "readwrite"
+
+       # Revoke the layout lock, the PCC-cached file will be
+       # detached automatically.
+       do_facet $SINGLEAGT $LCTL \
+               set_param ldlm.namespaces.*mdc*.lru_size=clear
+       check_file_data $SINGLEAGT $file "autoattach_data"
+       check_lpcc_state $file "readwrite"
+
+       # Detach the file with -k option, as the file layout generation
+       # is not changed, so the file is still valid cached in PCC,
+       # and can be reused from PCC cache directly.
+       do_facet $SINGLEAGT $LFS pcc detach -k $file ||
+               error "PCC detach $file failed"
+       check_lpcc_state $file "readwrite"
+       # HSM released exists archived status
+       check_hsm_flags $file "0x0000000d"
+       check_file_data $SINGLEAGT $file "autoattach_data"
+
+       # HSM restore the PCC cached file, the layout generation
+       # was changed, so the file can not be auto attached.
+       $LFS hsm_restore $file || error "failed to restore $file"
+       wait_request_state $(path2fid $file) RESTORE SUCCEED
+       check_lpcc_state $file "none"
+       # HSM exists archived status
+       check_hsm_flags $file "0x00000009"
+
+}
+run_test 15 "Test auto attach at open when file is still valid cached"
+
+# Compare "lfs pcc detach -k" with the default "lfs pcc detach": after -k the
+# file is still expected in "readwrite" state with HSM flags 0x0000000d,
+# after the default detach it is expected in "none" state with HSM flags
+# 0x00000000 and without a copy at its PCC path.
+test_16() {
+       local loopfile="$TMP/$tfile"
+       local mntpt="/mnt/pcc.$tdir"
+       local hsm_root="$mntpt/$tdir"
+       local file=$DIR/$tfile
+       local -a lpcc_path
+
+       setup_loopdev $SINGLEAGT $loopfile $mntpt 50
+       copytool setup -m "$MOUNT" -a "$HSM_ARCHIVE_NUMBER"
+       setup_pcc_mapping $SINGLEAGT \
+               "projid={100}\ rwid=$HSM_ARCHIVE_NUMBER\ open_attach=1"
+
+       do_facet $SINGLEAGT "echo -n detach_data > $file"
+       lpcc_path=$(lpcc_fid2path $hsm_root $file)
+       do_facet $SINGLEAGT $LFS pcc attach -i $HSM_ARCHIVE_NUMBER \
+               $file || error "PCC attach $file failed"
+       check_lpcc_state $file "readwrite"
+       # HSM released exists archived status
+       check_hsm_flags $file "0x0000000d"
+
+       echo "Test for reusing valid PCC cache"
+       # Valid PCC cache can be reused
+       do_facet $SINGLEAGT $LFS pcc detach -k $file ||
+               error "PCC detach $file failed"
+       check_lpcc_state $file "readwrite"
+       # HSM released exists archived status
+       check_hsm_flags $file "0x0000000d"
+
+       echo "Test for the default detach"
+       # Permanent detach by default, it will remove the PCC copy
+       do_facet $SINGLEAGT $LFS pcc detach $file ||
+               error "PCC detach $file failed"
+       wait_request_state $(path2fid $file) REMOVE SUCCEED
+       check_lpcc_state $file "none"
+       # File is removed from PCC backend
+       check_hsm_flags $file "0x00000000"
+       do_facet $SINGLEAGT "[ -f $lpcc_path ]" &&
+               error "RW-PCC cached file '$lpcc_path' should be removed"
+
+       # The last check above is expected to be false; do not let its status
+       # become the status of the test.
+       return 0
+}
+run_test 16 "Test detach with different options"
+
+complete $SECONDS
+check_and_cleanup_lustre
+exit_status
index 3336688..ee01ef1 100755 (executable)
@@ -73,24 +73,6 @@ export QUOTA_AUTO=0
 check_and_setup_lustre
 
 ENABLE_PROJECT_QUOTAS=${ENABLE_PROJECT_QUOTAS:-true}
-is_project_quota_supported() {
-       $ENABLE_PROJECT_QUOTAS || return 1
-       [ "$(facet_fstype $SINGLEMDS)" == "ldiskfs" ] &&
-               [ $(lustre_version_code $SINGLEMDS) -gt \
-               $(version_code 2.9.55) ] &&
-               lfs --help | grep project >&/dev/null &&
-               egrep -q "7." /etc/redhat-release && return 0
-
-       if [ "$(facet_fstype $SINGLEMDS)" == "zfs" ]; then
-               [ $(lustre_version_code $SINGLEMDS) -le \
-                       $(version_code 2.10.53) ] && return 1
-
-               do_facet mds1 $ZPOOL upgrade -v |
-                       grep project_quota && return 0
-       fi
-
-       return 1
-}
 
 SHOW_QUOTA_USER="$LFS quota -v -u $TSTUSR $DIR"
 SHOW_QUOTA_USERID="$LFS quota -v -u $TSTID $DIR"
@@ -351,25 +333,6 @@ wait_ost_reint() {
        return 0
 }
 
-disable_project_quota() {
-       is_project_quota_supported || return 0
-       [ "$(facet_fstype $SINGLEMDS)" != "ldiskfs" ] && return 0
-       stopall || error "failed to stopall (1)"
-
-       for num in $(seq $MDSCOUNT); do
-               do_facet mds$num $TUNE2FS -Q ^prj $(mdsdevname $num) ||
-                       error "tune2fs $(mdsdevname $num) failed"
-       done
-
-       for num in $(seq $OSTCOUNT); do
-               do_facet ost$num $TUNE2FS -Q ^prj $(ostdevname $num) ||
-                       error "tune2fs $(ostdevname $num) failed"
-       done
-
-       mount
-       setupall
-}
-
 setup_quota_test() {
        wait_delete_completed
        echo "Creating test directory"
@@ -421,25 +384,6 @@ quota_show_check() {
        fi
 }
 
-enable_project_quota() {
-       is_project_quota_supported || return 0
-       [ "$(facet_fstype $SINGLEMDS)" != "ldiskfs" ] && return 0
-       stopall || error "failed to stopall (1)"
-
-       for num in $(seq $MDSCOUNT); do
-               do_facet mds$num $TUNE2FS -O project $(mdsdevname $num) ||
-                       error "tune2fs $(mdsdevname $num) failed"
-       done
-
-       for num in $(seq $OSTCOUNT); do
-               do_facet ost$num $TUNE2FS -O project $(ostdevname $num) ||
-                       error "tune2fs $(ostdevname $num) failed"
-       done
-
-       mount
-       setupall
-}
-
 project_quota_enabled () {
        local rc=0
        for num in $(seq $MDSCOUNT); do
index 5b78d09..7a1a050 100755 (executable)
@@ -9476,3 +9476,522 @@ verify_yaml_layout() {
        [ "$layout1" == "$layout2" ] ||
                error "$msg_prefix $src/$dst layouts are not equal"
 }
+
+is_project_quota_supported() {
+       $ENABLE_PROJECT_QUOTAS || return 1
+       [ "$(facet_fstype $SINGLEMDS)" == "ldiskfs" ] &&
+               [ $(lustre_version_code $SINGLEMDS) -gt \
+               $(version_code 2.9.55) ] &&
+               lfs --help | grep project >&/dev/null &&
+               egrep -q "7." /etc/redhat-release && return 0
+
+       if [ "$(facet_fstype $SINGLEMDS)" == "zfs" ]; then
+               [ $(lustre_version_code $SINGLEMDS) -le \
+                       $(version_code 2.10.53) ] && return 1
+
+               do_fact mds1 $ZPOOL upgrade -v |
+                       grep project_quota && return 0
+       fi
+
+       return 1
+}
+
+enable_project_quota() {
+       is_project_quota_supported || return 0
+       [ "$(facet_fstype $SINGLEMDS)" != "ldiskfs" ] && return 0
+       stopall || error "failed to stopall (1)"
+
+       for num in $(seq $MDSCOUNT); do
+               do_facet mds$num $TUNE2FS -O project $(mdsdevname $num) ||
+                       error "tune2fs $(mdsdevname $num) failed"
+       done
+
+       for num in $(seq $OSTCOUNT); do
+               do_facet ost$num $TUNE2FS -O project $(ostdevname $num) ||
+                       error "tune2fs $(ostdevname $num) failed"
+       done
+
+       mount
+       setupall
+}
+
+disable_project_quota() {
+       is_project_quota_supported || return 0
+       [ "$(facet_fstype $SINGLEMDS)" != "ldiskfs" ] && return 0
+       stopall || error "failed to stopall (1)"
+
+       for num in $(seq $MDSCOUNT); do
+               do_facet mds$num $TUNE2FS -Q ^prj $(mdsdevname $num) ||
+                       error "tune2fs $(mdsdevname $num) failed"
+       done
+
+       for num in $(seq $OSTCOUNT); do
+               do_facet ost$num $TUNE2FS -Q ^prj $(ostdevname $num) ||
+                       error "tune2fs $(ostdevname $num) failed"
+       done
+
+       mount
+       setupall
+}
+
+#
+# In order to test multiple remote HSM agents, a new facet type named "AGT" and
+# the following associated variables are added:
+#
+# AGTCOUNT: number of agents
+# AGTDEV{N}: target HSM mount point (root path of the backend)
+# agt{N}_HOST: hostname of the agent agt{N}
+# SINGLEAGT: facet of the single agent
+#
+# The number of agents is initialized as the number of remote client nodes.
+# By default, only single copytool is started on a remote client/agent. If there
+# was no remote client, then the copytool will be started on the local client.
+#
+# Besides the agent variables, this function also sets the copytool related
+# defaults (HSMTOOL*, HSM_ARCHIVE_NUMBER, MDT_PREFIX, HSM_PARAM,
+# HSM_ARCHIVE_PURGE, HSMTOOL_NOERROR). Values already present in the
+# environment are kept wherever a ${VAR:-default} expansion is used.
+#
+init_agt_vars() {
+       local n
+       local agent
+
+       # At least one agent, even when there is no remote client
+       export AGTCOUNT=${AGTCOUNT:-$((CLIENTCOUNT - 1))}
+       [[ $AGTCOUNT -gt 0 ]] || AGTCOUNT=1
+
+       export SHARED_DIRECTORY=${SHARED_DIRECTORY:-$TMP}
+       if [[ $CLIENTCOUNT -gt 1 ]] &&
+               ! check_shared_dir $SHARED_DIRECTORY $CLIENTS; then
+               skip_env "SHARED_DIRECTORY should be accessible"\
+                        "on all client nodes"
+               exit 0
+       fi
+
+       # We used to put the HSM archive in $SHARED_DIRECTORY but that
+       # meant NFS issues could hose sanity-hsm sessions. So now we
+       # use $TMP instead.
+       for n in $(seq $AGTCOUNT); do
+               eval export AGTDEV$n=\$\{AGTDEV$n:-"$TMP/arc$n"\}
+               # Agent N defaults to the host in CLIENT(N+1); when that
+               # variable is empty, fall back to CLIENT1 (single client)
+               # or CLIENT2.
+               agent=CLIENT$((n + 1))
+               if [[ -z "${!agent}" ]]; then
+                       [[ $CLIENTCOUNT -eq 1 ]] && agent=CLIENT1 ||
+                               agent=CLIENT2
+               fi
+               eval export agt${n}_HOST=\$\{agt${n}_HOST:-${!agent}\}
+               local var=agt${n}_HOST
+               [[ ! -z "${!var}" ]] || error "agt${n}_HOST is empty!"
+       done
+
+       export SINGLEAGT=${SINGLEAGT:-agt1}
+
+       export HSMTOOL=${HSMTOOL:-"lhsmtool_posix"}
+       export HSMTOOL_VERBOSE=${HSMTOOL_VERBOSE:-""}
+       export HSMTOOL_UPDATE_INTERVAL=${HSMTOOL_UPDATE_INTERVAL:=""}
+       export HSMTOOL_EVENT_FIFO=${HSMTOOL_EVENT_FIFO:=""}
+       export HSMTOOL_TESTDIR
+       # First word of the basename of $HSMTOOL
+       export HSMTOOL_BASE=$(basename "$HSMTOOL" | cut -f1 -d" ")
+
+       HSM_ARCHIVE_NUMBER=2
+
+       # The test only support up to 10 MDTs
+       MDT_PREFIX="mdt.$FSNAME-MDT000"
+       HSM_PARAM="${MDT_PREFIX}0.hsm"
+
+       # archive is purged at copytool setup
+       HSM_ARCHIVE_PURGE=true
+
+       # Don't allow copytool error upon start/setup
+       HSMTOOL_NOERROR=false
+}
+
+# Get the backend root path for the given agent facet.
+#
+# $1: agent facet name
+# Prints, without a trailing newline, the value of the AGTDEV<N> variable,
+# where <N> is what facet_number reports for the facet.
+copytool_device() {
+       local facet=$1
+       local dev=AGTDEV$(facet_number $facet)
+
+       echo -n ${!dev}
+}
+
+# Fill the global array MDT[] (indexes 0 .. MDSCOUNT-1) from the local
+# "mds_server_uuid" parameter of the MDC device of each MDT, with "_UUID"
+# stripped. The parameter name is built as MDT000<idx>, i.e. with a single
+# digit index.
+get_mdt_devices() {
+       local mdtno
+       # get MDT device for each mdc
+       for mdtno in $(seq 1 $MDSCOUNT); do
+               local idx=$(($mdtno - 1))
+               MDT[$idx]=$($LCTL get_param -n \
+                       mdc.$FSNAME-MDT000${idx}-mdc-*.mds_server_uuid |
+                       awk '{gsub(/_UUID/,""); print $1}' | head -n1)
+       done
+}
+
+# Run "pgrep -x $HSMTOOL_BASE" on the given hosts.
+#
+# $1: hosts to search (default: active host of $SINGLEAGT)
+# The return status is the one of do_nodesv.
+search_copytools() {
+       local hosts=${1:-$(facet_active_host $SINGLEAGT)}
+       do_nodesv $hosts "pgrep -x $HSMTOOL_BASE"
+}
+
+# Run "killall -q $HSMTOOL_BASE" on the given hosts; a failure of that
+# command is ignored.
+#
+# $1: hosts to act on (default: active host of $SINGLEAGT)
+kill_copytools() {
+       local hosts=${1:-$(facet_active_host $SINGLEAGT)}
+
+       echo "Killing existing copytools on $hosts"
+       do_nodesv $hosts "killall -q $HSMTOOL_BASE" || true
+}
+
+wait_copytools() {
+       local hosts=${1:-$(facet_active_host $SINGLEAGT)}
+       local wait_timeout=200
+       local wait_start=$SECONDS
+       local wait_end=$((wait_start + wait_timeout))
+       local sleep_time=100000 # 0.1 second
+
+       while ((SECONDS < wait_end)); do
+               if ! search_copytools $hosts; then
+                       echo "copytools stopped in $((SECONDS - wait_start))s"
+                       return 0
+               fi
+
+               echo "copytools still running on $hosts"
+               usleep $sleep_time
+               [ $sleep_time -lt 32000000 ] && # 3.2 seconds
+                       sleep_time=$(bc <<< "$sleep_time * 2")
+       done
+
+       # try to dump Copytool's stack
+       do_nodesv $hosts "echo 1 >/proc/sys/kernel/sysrq ; " \
+                        "echo t >/proc/sysrq-trigger"
+
+       echo "copytools failed to stop in ${wait_timeout}s"
+
+       return 1
+}
+
+# Clean up what is recorded in HSMTOOL_MONITOR_DIR and HSMTOOL_MONITOR_PDSH.
+#
+# $1: agent facet (default: $SINGLEAGT)
+#
+# When HSMTOOL_MONITOR_DIR is set, the process whose pid is stored in its
+# "monitor_pid" file is killed on the agent host and the directory is
+# removed. When HSMTOOL_MONITOR_PDSH is set, that local pid is killed.
+# Both variables are emptied afterwards; kill failures are ignored.
+copytool_monitor_cleanup() {
+       local facet=${1:-$SINGLEAGT}
+       local agent=$(facet_active_host $facet)
+
+       if [ -n "$HSMTOOL_MONITOR_DIR" ]; then
+               # Should die when the copytool dies, but just in case.
+               local cmd="kill \\\$(cat $HSMTOOL_MONITOR_DIR/monitor_pid)"
+               cmd+=" 2>/dev/null || true"
+               do_node $agent "$cmd"
+               do_node $agent "rm -fr $HSMTOOL_MONITOR_DIR"
+               export HSMTOOL_MONITOR_DIR=
+       fi
+
+       # The pdsh should die on its own when the monitor dies. Just
+       # in case, though, try to clean up to avoid any cruft.
+       if [ -n "$HSMTOOL_MONITOR_PDSH" ]; then
+               kill $HSMTOOL_MONITOR_PDSH 2>/dev/null || true
+               export HSMTOOL_MONITOR_PDSH=
+       fi
+}
+
+# Print the path of the copytool log file for a facet.
+#
+# $1: agent facet
+#
+# The name is built from $TESTLOG_PREFIX, $TESTNAME (when set), $archive_id
+# and the facet host. $archive_id is not set in this function; it is taken
+# from whatever the caller has in scope.
+# NOTE(review): the path is used as the printf format string, so a "%" in it
+# would be interpreted by printf.
+copytool_logfile()
+{
+       local host="$(facet_host "$1")"
+       local prefix=$TESTLOG_PREFIX
+       [ -n "$TESTNAME" ] && prefix+=.$TESTNAME
+
+       printf "${prefix}.copytool${archive_id}_log.${host}.log"
+}
+
+# "rebind" action of copytool() for lhsmtool: run $HSMTOOL with --rebind.
+#
+# $@: arguments placed after --rebind
+# $facet, $hsm_root and $mountpoint are not set here; they are read from the
+# caller's scope.
+__lhsmtool_rebind()
+{
+       do_facet $facet $HSMTOOL -p "$hsm_root" --rebind "$@" "$mountpoint"
+}
+
+# "import" action of copytool() for lhsmtool: create the parent directory of
+# the second argument locally, then run $HSMTOOL with --import.
+#
+# $@: arguments placed after --import
+# $facet, $hsm_root and $mountpoint are not set here; they are read from the
+# caller's scope.
+__lhsmtool_import()
+{
+       mkdir -p "$(dirname "$2")" ||
+               error "cannot create directory '$(dirname "$2")'"
+       do_facet $facet $HSMTOOL -p "$hsm_root" --import "$@" "$mountpoint"
+}
+
+# "setup" action of copytool() for lhsmtool: start $HSMTOOL with --daemon on
+# $facet, with its output redirected to the file named by copytool_logfile.
+#
+# $@: extra options appended to the command line
+# $hsm_root, $bandwidth, $archive_id, $misc_options, $mountpoint and $facet
+# are not set here; they are read from the caller's scope.
+# A stack_trap is registered that runs pkill for the copytool on EXIT.
+__lhsmtool_setup()
+{
+       local cmd="$HSMTOOL $HSMTOOL_VERBOSE --daemon --hsm-root \"$hsm_root\""
+       [ -n "$bandwidth" ] && cmd+=" --bandwidth $bandwidth"
+       [ -n "$archive_id" ] && cmd+=" --archive $archive_id"
+       [ ${#misc_options[@]} -gt 0 ] &&
+               cmd+=" $(IFS=" " echo "$@")"
+       cmd+=" \"$mountpoint\""
+
+       echo "Starting copytool $facet on $(facet_host $facet)"
+       stack_trap "do_facet $facet libtool execute pkill -x '$HSMTOOL' || true" EXIT
+       do_facet $facet "$cmd < /dev/null > \"$(copytool_logfile $facet)\" 2>&1"
+}
+
+# Print the default HSM root for a facet: the path reported by
+# copytool_device followed by "/${TESTSUITE}.${TESTNAME}/".
+#
+# $1: agent facet (default: $SINGLEAGT)
+hsm_root() {
+       local facet="${1:-$SINGLEAGT}"
+
+       printf "$(copytool_device "$facet")/${TESTSUITE}.${TESTNAME}/"
+}
+
+# Main entry point to perform copytool related operations
+#
+# Sub-commands:
+#
+#      setup   setup a copytool to run in the background, that copytool will be
+#              killed on EXIT
+#      import  import a file from an HSM backend
+#      rebind  rebind an archived file to a new fid
+#
+# Although the semantics might suggest otherwise, one does not need to 'setup'
+# a copytool before a call to 'copytool import' or 'copytool rebind'.
+#
+# Options (after the sub-command):
+#
+#      -f|--facet FACET        agent facet (default: $SINGLEAGT)
+#      -m|--mountpoint PATH    Lustre mount point (default: $MOUNT2, or
+#                              $MOUNT when MOUNT2 is empty)
+#      -a|--archive-id ID      archive id
+#      -h|--hsm-root PATH      HSM root (default: an hsm_root variable that
+#                              is already set in the caller's scope, else
+#                              the output of hsm_root for the facet)
+#      -b|--bwlimit MB         bandwidth limit in MB/s
+#      -n|--no-fail            print the error message instead of calling
+#                              error when the sub-command fails
+#
+# Any other argument is collected and handed to the sub-command. The locals
+# declared here (facet, mountpoint, archive_id, hsm_root, bandwidth,
+# misc_options) are read by the __lhsmtool_* helpers.
+#
+copytool()
+{
+       local action=$1
+       shift
+
+       # Parse arguments
+       local fail_on_error=true
+       local -a misc_options
+       while [ $# -gt 0 ]; do
+               case "$1" in
+               -f|--facet)
+                       shift
+                       local facet="$1"
+                       ;;
+               -m|--mountpoint)
+                       shift
+                       local mountpoint="$1"
+                       ;;
+               -a|--archive-id)
+                       shift
+                       local archive_id="$1"
+                       ;;
+               -h|--hsm-root)
+                       shift
+                       local hsm_root="$1"
+                       ;;
+               -b|--bwlimit)
+                       shift
+                       local bandwidth="$1" # in MB/s
+                       ;;
+               -n|--no-fail)
+                       local fail_on_error=false
+                       ;;
+               *)
+                       # Uncommon(/copytool dependent) option
+                       misc_options+=("$1")
+                       ;;
+               esac
+               shift
+       done
+
+       # Use default values if needed
+       local facet=${facet:-$SINGLEAGT}
+       local mountpoint="${mountpoint:-${MOUNT2:-$MOUNT}}"
+       local hsm_root="${hsm_root:-$(hsm_root "$facet")}"
+
+       # The HSM root is created now and removed again on EXIT
+       stack_trap "do_facet $facet rm -rf '$hsm_root'" EXIT
+       do_facet $facet mkdir -p "$hsm_root" ||
+               error "mkdir '$hsm_root' failed"
+
+       # NOTE(review): there is no default branch, so for any other value of
+       # $HSMTOOL the variable "copytool" stays unset here.
+       case "$HSMTOOL" in
+       lhsmtool_posix)
+               local copytool=lhsmtool
+               ;;
+       esac
+
+       # Dispatch to __<copytool>_<action>, e.g. __lhsmtool_setup
+       __${copytool}_${action} "${misc_options[@]}"
+       if [ $? -ne 0 ]; then
+               local error_msg
+
+               case $action in
+               setup)
+                       local host="$(facet_host $facet)"
+                       error_msg="Failed to start copytool $facet on '$host'"
+                       ;;
+               import)
+                       local src="${misc_options[0]}"
+                       local dest="${misc_options[1]}"
+                       error_msg="Failed to import '$src' to '$dest'"
+                       ;;
+               rebind)
+                       error_msg="could not rebind file"
+                       ;;
+               esac
+
+               $fail_on_error && error "$error_msg" || echo "$error_msg"
+       fi
+}
+
+needclients() {
+       local client_count=$1
+       if [[ $CLIENTCOUNT -lt $client_count ]]; then
+               skip "Need $client_count or more clients, have $CLIENTCOUNT"
+               return 1
+       fi
+       return 0
+}
+
+# Print the output of "lfs path2fid" for a path with the "[" and "]"
+# characters removed.
+#
+# $1: file path
+# Returns the exit status of "lfs path2fid", not the one of tr.
+path2fid() {
+       $LFS path2fid $1 | tr -d '[]'
+       return ${PIPESTATUS[0]}
+}
+
+# Print the flags field of "lfs hsm_state" for a file.
+#
+# $1: file path
+# $2: "user" to run the command through $RUNAS; anything else runs it as is
+#     (reported as root in the error message)
+#
+# The printed value is the second space-separated word of the command output
+# with the characters "(", ")" and "," removed. Calls error when hsm_state
+# fails.
+get_hsm_flags() {
+       local f=$1
+       local u=$2
+       local st
+
+       if [[ $u == "user" ]]; then
+               st=$($RUNAS $LFS hsm_state $f)
+       else
+               u=root
+               st=$($LFS hsm_state $f)
+       fi
+
+       # $? still holds the status of the command substitution assigned to
+       # st in the branch that was taken.
+       [[ $? == 0 ]] || error "$LFS hsm_state $f failed (run as $u)"
+
+       st=$(echo $st | cut -f 2 -d" " | tr -d "()," )
+       echo $st
+}
+
+# Call error unless the HSM flags of a file, as printed by get_hsm_flags,
+# match the expected value.
+#
+# $1: file path
+# $2: expected flags; unquoted on the right-hand side of "==" inside [[ ]],
+#     so it is matched as a pattern
+check_hsm_flags() {
+       local f=$1
+       local fl=$2
+
+       local st=$(get_hsm_flags $f)
+       [[ $st == $fl ]] || error "hsm flags on $f are $st != $fl"
+}
+
+# Run "lctl set_param" for one parameter of every MDT.
+#
+# $1: options handed to set_param (may be empty)
+# $2: parameter name below mdt.<MDT name>.
+# $3: value; when empty, the parameter is passed without "=<value>"
+#
+# The MDT names are read from the MDT[] array.
+# Returns 1 when set_param failed for at least one MDT, 0 otherwise.
+mdts_set_param() {
+       local arg=$1
+       local key=$2
+       local value=$3
+       local mdtno
+       local rc=0
+       if [[ "$value" != "" ]]; then
+               value="=$value"
+       fi
+       for mdtno in $(seq 1 $MDSCOUNT); do
+               local idx=$(($mdtno - 1))
+               local facet=mds${mdtno}
+               # if $arg include -P option, run 1 set_param per MDT on the MGS
+               # else, run set_param on each MDT
+               [[ $arg = *"-P"* ]] && facet=mgs
+               do_facet $facet $LCTL set_param $arg mdt.${MDT[$idx]}.$key$value
+               [[ $? != 0 ]] && rc=1
+       done
+       return $rc
+}
+
+wait_result() {
+       local facet=$1
+       shift
+       wait_update --verbose $(facet_active_host $facet) "$@"
+}
+
+mdts_check_param() {
+       local key="$1"
+       local target="$2"
+       local timeout="$3"
+       local mdtno
+       for mdtno in $(seq 1 $MDSCOUNT); do
+               local idx=$(($mdtno - 1))
+               wait_result mds${mdtno} \
+                       "$LCTL get_param -n $MDT_PREFIX${idx}.$key" "$target" \
+                       $timeout ||
+                       error "$key state is not '$target' on mds${mdtno}"
+       done
+}
+
+cdt_set_mount_state() {
+       mdts_set_param "-P" hsm_control "$1"
+       # set_param -P is asynchronous operation and could race with set_param.
+       # In such case configs could be retrieved and applied at mgc after
+       # set_param -P completion. Sleep here to avoid race with set_param.
+       # We need at least 20 seconds. 10 for mgc_requeue_thread to wake up
+       # MGC_TIMEOUT_MIN_SECONDS + MGC_TIMEOUT_RAND_CENTISEC(5 + 5)
+       # and 10 seconds to retrieve config from server.
+       sleep 20
+}
+
+cdt_check_state() {
+       mdts_check_param hsm_control "$1" 20
+}
+
+cdt_set_sanity_policy() {
+       if [[ "$CDT_POLICY_HAD_CHANGED" ]]
+       then
+               # clear all
+               mdts_set_param "" hsm.policy "+NRA"
+               mdts_set_param "" hsm.policy "-NBR"
+               CDT_POLICY_HAD_CHANGED=
+       fi
+}
+
+set_hsm_param() {
+       local param=$1
+       local value=$2
+       local opt=$3
+       mdts_set_param "$opt -n" "hsm.$param" "$value"
+       return $?
+}
+
+# Wait until the HSM action $2 recorded for FID $1 is in state $3, calling
+# error if wait_result fails.
+#   $1: FID to look for in hsm.actions
+#   $2: request/action name matched against "action=<request>"
+#   $3: expected state
+#   $4: optional MDT index (defaults to 0)
+wait_request_state() {
+       local fid=$1
+       local request=$2
+       local state=$3
+       # 4th arg (mdt index) is optional
+       local mdtidx=${4:-0}
+       local mds=mds$(($mdtidx + 1))
+
+       # Build the command string handed to wait_result: select the
+       # hsm.actions line for this FID and action, take awk field 13 and keep
+       # what follows its "=".  The \\\$13 escaping keeps "$13" from being
+       # expanded before awk sees it.
+       local cmd="$LCTL get_param -n ${MDT_PREFIX}${mdtidx}.hsm.actions"
+       cmd+=" | awk '/'$fid'.*action='$request'/ {print \\\$13}' | cut -f2 -d="
+
+       # 200 is passed as the final wait_result argument
+       wait_result $mds "$cmd" "$state" 200 ||
+               error "request on $fid is not $state on $mds"
+}
+
+
+# Start runmultiop_bg_pause in the background on a remote client and record
+# the resulting pids in exported, per-node variables.
+#   $1: client node
+#   $2: file passed to runmultiop_bg_pause
+#   $3: multiop commands passed to runmultiop_bg_pause
+#   $4: optional maximum wait in seconds for the pid file (default 60)
+# Exports <node_var_name>_multiop_pid and <node_var_name>_do_node_pid, which
+# rmultiop_stop reads.  Calls error when no pid could be read in time.
+rmultiop_start() {
+       local client=$1
+       local file=$2
+       local cmds=$3
+       local WAIT_MAX=${4:-60}
+       local wait_time=0
+
+       # We need to run do_node in bg, because pdsh does not exit
+       # if a child process of the run script exists.
+       # I.e. pdsh does not exit when runmultiop_bg_pause exited,
+       # because of multiop_bg_pause -> $MULTIOP_PROG &
+       # For the same reason we need to sleep a bit after do_node starts
+       # to let runmultiop_bg_pause start multiop and
+       # update /tmp/multiop_bg.pid ;
+       # The rm /tmp/multiop_bg.pid guarantees here that
+       # we have the /tmp/multiop_bg.pid file updated by
+       # runmultiop_bg_pause
+
+       # the pid file name is made unique with this shell's pid ($$)
+       local pid_file=$TMP/multiop_bg.pid.$$
+
+       do_node $client "MULTIOP_PID_FILE=$pid_file LUSTRE= \
+                       runmultiop_bg_pause $file $cmds" &
+       # pid of the backgrounded do_node job, waited on by rmultiop_stop
+       local pid=$!
+       local multiop_pid
+
+       # poll every 3 seconds until the remote pid file has content
+       while [[ $wait_time -lt $WAIT_MAX ]]; do
+               sleep 3
+               wait_time=$((wait_time + 3))
+               multiop_pid=$(do_node $client cat $pid_file)
+               if [ -n "$multiop_pid" ]; then
+                       break
+               fi
+       done
+
+       [ -n "$multiop_pid" ] ||
+               error "$client : Can not get multiop_pid from $pid_file "
+
+       eval export $(node_var_name $client)_multiop_pid=$multiop_pid
+       eval export $(node_var_name $client)_do_node_pid=$pid
+       local var=$(node_var_name $client)_multiop_pid
+       echo client $client multiop_bg started multiop_pid=${!var}
+       # return status is that of the echo above
+       return $?
+}
+
+rmultiop_stop() {
+       local client=$1
+       local multiop_pid=$(node_var_name $client)_multiop_pid
+       local do_node_pid=$(node_var_name $client)_do_node_pid
+
+       echo "Stopping multiop_pid=${!multiop_pid} (kill ${!multiop_pid} on $client)"
+       do_node $client kill -USR1 ${!multiop_pid}
+
+       wait ${!do_node_pid}
+}
index 86942e5..fb1d17c 100644 (file)
@@ -25,3 +25,4 @@ sanity-lfsck
 sanity-hsm
 sanity-lsnapshot
 sanity-pfl
+sanity-pcc
index 7726f27..75e6018 100644 (file)
@@ -106,7 +106,7 @@ liblustreapi_la_SOURCES = liblustreapi.c liblustreapi_hsm.c \
                          liblustreapi_kernelconn.c liblustreapi_param.c \
                          liblustreapi_mirror.c \
                          liblustreapi_ladvise.c liblustreapi_chlg.c \
-                         liblustreapi_heat.c
+                         liblustreapi_heat.c liblustreapi_pcc.c
 liblustreapi_la_LDFLAGS = $(LIBREADLINE) -version-info 1:0:0 \
                          -Wl,--version-script=liblustreapi.map
 liblustreapi_la_LIBADD = $(top_builddir)/libcfs/libcfs/libcfs.la
index 8f330d9..8504c72 100644 (file)
@@ -54,6 +54,36 @@ static int jt_opt_ignore_errors(int argc, char **argv) {
         return 0;
 }
 
+static int jt_pcc_list_commands(int argc, char **argv);
+static int jt_pcc(int argc, char **argv);
+
+/**
+ * command_t pccdev_cmdlist - lctl pcc sub-commands.
+ *
+ * Table handed to the parser by jt_pcc(); each entry pairs a sub-command
+ * name with its handler and help text.  The list ends with an entry that
+ * has no name and a NULL help string.
+ */
+command_t pccdev_cmdlist[] = {
+       { .pc_name = "add", .pc_func = jt_pcc_add,
+         .pc_help = "Add a PCC backend to a client.\n"
+               "usage: lctl pcc add <mntpath> <pccpath> [--param|-p <param>]\n"
+               "\tmntpath: Lustre mount point.\n"
+               "\tpccpath: Path of the PCC backend.\n"
+               "\tparam:   Setting parameters for PCC backend.\n" },
+       { .pc_name = "del", .pc_func = jt_pcc_del,
+         .pc_help = "Delete the specified PCC backend on a client.\n"
+               "usage: lctl pcc del <mntpath> <pccpath>\n" },
+       { .pc_name = "clear", .pc_func = jt_pcc_clear,
+         .pc_help = "Remove all PCC backend on a client.\n"
+               "usage: lctl pcc clear <mntpath>\n" },
+       { .pc_name = "list", .pc_func = jt_pcc_list,
+         .pc_help = "List all PCC backends on a client.\n"
+               "usage: lctl pcc list <mntpath>\n" },
+       { .pc_name = "list-commands", .pc_func = jt_pcc_list_commands,
+         .pc_help = "list commands supported by lctl pcc"},
+       { .pc_name = "help", .pc_func = Parser_help, .pc_help = "help" },
+       { .pc_name = "exit", .pc_func = Parser_quit, .pc_help = "quit" },
+       { .pc_name = "quit", .pc_func = Parser_quit, .pc_help = "quit" },
+       { .pc_help = NULL }
+};
+
 command_t cmdlist[] = {
        /* Metacommands */
        {"===== metacommands =======", NULL, 0, "metacommands"},
@@ -349,6 +379,15 @@ command_t cmdlist[] = {
         "deregister an existing changelog user\n"
         "usage: --device <mdtname> changelog_deregister <id>"},
 
+       /* Persistent Client Cache (PCC) commands; "pcc" dispatches to the
+        * sub-commands listed in pccdev_cmdlist[] through jt_pcc().
+        */
+       {"=== Persistent Client Cache ===", NULL, 0, "PCC user management"},
+       {"pcc", jt_pcc, pccdev_cmdlist,
+        "lctl commands used to interact with PCC features:\n"
+        "lctl pcc add    - add a PCC backend to a client\n"
+        "lctl pcc del    - delete a PCC backend on a client\n"
+        "lctl pcc clear  - remove all PCC backends on a client\n"
+        "lctl pcc list   - list all PCC backends on a client\n"},
+
        /* Device configuration commands */
        {"== device setup (these are not normally used post 1.4) ==",
                NULL, 0, "device config"},
@@ -534,6 +573,55 @@ command_t cmdlist[] = {
        { 0, 0, 0, NULL }
 };
 
+/**
+ * jt_pcc_list_commands() - Print the names of the lctl pcc sub-commands.
+ * @argc: The count of command line arguments (not used).
+ * @argv: Array of strings for command line arguments (not used).
+ *
+ * Hands pccdev_cmdlist[] to Parser_list_commands() together with an
+ * 81-byte scratch line buffer.
+ *
+ * Return: always 0.
+ */
+static int jt_pcc_list_commands(int argc, char **argv)
+{
+       char line[81] = { '\0' };
+
+       Parser_list_commands(pccdev_cmdlist, line, sizeof(line), NULL, 0, 4);
+       return 0;
+}
+
+/**
+ * jt_pcc() - Parse and execute lctl pcc commands.
+ * @argc: The count of lctl pcc command line arguments.
+ * @argv: Array of strings for lctl pcc command line arguments.
+ *
+ * This function parses lctl pcc commands and performs the
+ * corresponding functions specified in pccdev_cmdlist[].  When no
+ * sub-command is given, Parser_commands() is called instead.
+ *
+ * While the sub-command runs, program_invocation_short_name is set to
+ * "<program> <argv[0]>".  That string lives in a local buffer, so the
+ * previous name is restored before returning to avoid leaving the global
+ * pointing into a dead stack frame.
+ *
+ * Return: 0 on success or a positive error code on failure.
+ */
+static int jt_pcc(int argc, char **argv)
+{
+       char *saved_name = program_invocation_short_name;
+       char cmd[PATH_MAX];
+       int rc = 0;
+
+       setlinebuf(stdout);
+
+       Parser_init("lctl-pcc > ", pccdev_cmdlist);
+
+       snprintf(cmd, sizeof(cmd), "%s %s", saved_name, argv[0]);
+       program_invocation_short_name = cmd;
+       if (argc > 1)
+               rc = Parser_execarg(argc - 1, argv + 1, pccdev_cmdlist);
+       else
+               rc = Parser_commands();
+
+       /* cmd goes out of scope on return: drop the reference to it */
+       program_invocation_short_name = saved_name;
+
+       return rc < 0 ? -rc : rc;
+}
+
 int lctl_main(int argc, char **argv)
 {
         int rc;
index 123afed..bf23b67 100644 (file)
@@ -127,6 +127,13 @@ static inline int lfs_mirror_verify(int argc, char **argv);
 static inline int lfs_mirror_read(int argc, char **argv);
 static inline int lfs_mirror_write(int argc, char **argv);
 static inline int lfs_mirror_copy(int argc, char **argv);
+static int lfs_pcc_attach(int argc, char **argv);
+static int lfs_pcc_attach_fid(int argc, char **argv);
+static int lfs_pcc_detach(int argc, char **argv);
+static int lfs_pcc_detach_fid(int argc, char **argv);
+static int lfs_pcc_state(int argc, char **argv);
+static int lfs_pcc(int argc, char **argv);
+static int lfs_pcc_list_commands(int argc, char **argv);
 
 enum setstripe_origin {
        SO_SETSTRIPE,
@@ -325,6 +332,37 @@ command_t mirror_cmdlist[] = {
        { .pc_help = NULL }
 };
 
+/**
+ * command_t pcc_cmdlist - lfs pcc sub-commands.
+ *
+ * Table handed to the parser by lfs_pcc(); each entry pairs a sub-command
+ * name with its handler and help text.  The list ends with an entry that
+ * has no name and a NULL help string.
+ */
+command_t pcc_cmdlist[] = {
+       { .pc_name = "attach", .pc_func = lfs_pcc_attach,
+         .pc_help = "Attach given files to the Persistent Client Cache.\n"
+               "usage: lfs pcc attach <--id|-i NUM> <file> ...\n"
+               "\t-i: archive id for RW-PCC\n" },
+       { .pc_name = "attach_fid", .pc_func = lfs_pcc_attach_fid,
+         .pc_help = "Attach given files into PCC by FID(s).\n"
+               "usage: lfs pcc attach_fid <--id|-i NUM> <--mnt|-m mnt> "
+               "<fid> ...\n"
+               "\t-i: archive id for RW-PCC\n"
+               "\t-m: Lustre mount point\n" },
+       { .pc_name = "state", .pc_func = lfs_pcc_state,
+         .pc_help = "Display the PCC state for given files.\n"
+               "usage: lfs pcc state <file> ...\n" },
+       { .pc_name = "detach", .pc_func = lfs_pcc_detach,
+         .pc_help = "Detach given files from the Persistent Client Cache.\n"
+               "usage: lfs pcc detach [--keep|-k] <file> ...\n" },
+       { .pc_name = "detach_fid", .pc_func = lfs_pcc_detach_fid,
+         .pc_help = "Detach given files from PCC by FID(s).\n"
+               "usage: lfs pcc detach_fid [--keep|-k] <mntpath> <fid>...\n" },
+       { .pc_name = "list-commands", .pc_func = lfs_pcc_list_commands,
+         .pc_help = "list commands supported by lfs pcc"},
+       { .pc_name = "help", .pc_func = Parser_help, .pc_help = "help" },
+       { .pc_name = "exit", .pc_func = Parser_quit, .pc_help = "quit" },
+       { .pc_name = "quit", .pc_func = Parser_quit, .pc_help = "quit" },
+       { .pc_help = NULL }
+};
+
 /* all available commands */
 command_t cmdlist[] = {
        {"setstripe", lfs_setstripe, 0,
@@ -630,6 +668,13 @@ command_t cmdlist[] = {
         "\t--clear|-c: Clear file heat for given files\n"
         "\t--off|-o:   Turn off file heat for given files\n"
         "\t--on|-O:    Turn on file heat for given files\n"},
+       {"pcc", lfs_pcc, pcc_cmdlist,
+        "lfs commands used to interact with PCC features:\n"
+        "lfs pcc attach - attach given files to Persistent Client Cache\n"
+        "lfs pcc attach_fid - attach given files into PCC by FID(s)\n"
+        "lfs pcc state  - display the PCC state for given files\n"
+        "lfs pcc detach - detach given files from Persistent Client Cache\n"
+        "lfs pcc detach_fid - detach given files from PCC by FID(s)\n"},
        {"help", Parser_help, 0, "help"},
        {"exit", Parser_quit, 0, "quit"},
        {"quit", Parser_quit, 0, "quit"},
@@ -10330,6 +10375,343 @@ static int lfs_mirror_list_commands(int argc, char **argv)
        return 0;
 }
 
+/**
+ * lfs_pcc_attach() - Attach files, given by path, to PCC.
+ * @argc: The count of command line arguments.
+ * @argv: Array of strings for command line arguments.
+ *
+ * Accepts "--id|-i NUM" followed by one or more file names.  Each path is
+ * resolved with realpath() and attached with llapi_pcc_attach() using the
+ * LU_PCC_READWRITE type.  Processing continues after a per-file failure.
+ *
+ * Return: 0 on success, CMD_HELP on a usage error, otherwise the first
+ * negative error code encountered.
+ */
+static int lfs_pcc_attach(int argc, char **argv)
+{
+       struct option long_opts[] = {
+       { .val = 'i',   .name = "id",   .has_arg = required_argument },
+       { .name = NULL } };
+       int c;
+       int rc = 0;
+       __u32 archive_id = 0;
+       const char *path;
+       char *end;
+       char fullpath[PATH_MAX];
+       enum lu_pcc_type type = LU_PCC_READWRITE;
+
+       optind = 0;
+       while ((c = getopt_long(argc, argv, "i:",
+                               long_opts, NULL)) != -1) {
+               switch (c) {
+               case 'i':
+                       archive_id = strtoul(optarg, &end, 0);
+                       if (*end != '\0' || archive_id == 0) {
+                               fprintf(stderr, "error: %s: bad archive ID "
+                                       "'%s'\n", argv[0], optarg);
+                               return CMD_HELP;
+                       }
+                       break;
+               case '?':
+                       return CMD_HELP;
+               default:
+                       fprintf(stderr, "%s: option '%s' unrecognized\n",
+                               argv[0], argv[optind - 1]);
+                       return CMD_HELP;
+               }
+       }
+
+       /* -i is mandatory: without it archive_id silently stays 0, the very
+        * value that "-i 0" is rejected for above.
+        */
+       if (archive_id == 0) {
+               fprintf(stderr, "%s: must specify an archive ID\n", argv[0]);
+               return CMD_HELP;
+       }
+
+       if (argc <= optind) {
+               fprintf(stderr, "%s: must specify one or more file names\n",
+                       argv[0]);
+               return CMD_HELP;
+       }
+
+       while (optind < argc) {
+               int rc2;
+
+               path = argv[optind++];
+               if (realpath(path, fullpath) == NULL) {
+                       fprintf(stderr, "%s: could not find path '%s': %s\n",
+                               argv[0], path, strerror(errno));
+                       if (rc == 0)
+                               rc = -EINVAL;
+                       continue;
+               }
+
+               rc2 = llapi_pcc_attach(fullpath, archive_id, type);
+               if (rc2 < 0) {
+                       fprintf(stderr, "%s: cannot attach '%s' to PCC "
+                               "with archive ID '%u': %s\n", argv[0],
+                               path, archive_id, strerror(-rc2));
+                       /* remember only the first failure */
+                       if (rc == 0)
+                               rc = rc2;
+               }
+       }
+       return rc;
+}
+
+/**
+ * lfs_pcc_attach_fid() - Attach files, given by FID, to PCC.
+ * @argc: The count of command line arguments.
+ * @argv: Array of strings for command line arguments.
+ *
+ * Accepts "--id|-i NUM" and "--mnt|-m mnt" followed by one or more FID
+ * strings.  Each FID is attached with llapi_pcc_attach_fid_str() using the
+ * LU_PCC_READWRITE type.  Processing continues after a per-FID failure.
+ *
+ * Return: 0 on success, CMD_HELP on a usage error, otherwise the first
+ * negative error code encountered.
+ */
+static int lfs_pcc_attach_fid(int argc, char **argv)
+{
+       struct option long_opts[] = {
+       { .val = 'i',   .name = "id",   .has_arg = required_argument },
+       { .val = 'm',   .name = "mnt",  .has_arg = required_argument },
+       { .name = NULL } };
+       char                     short_opts[] = "i:m:";
+       int                      c;
+       int                      rc = 0;
+       __u32                    archive_id = 0;
+       char                    *end;
+       const char              *mntpath = NULL;
+       const char              *fidstr;
+       enum lu_pcc_type         type = LU_PCC_READWRITE;
+
+       optind = 0;
+       while ((c = getopt_long(argc, argv, short_opts,
+                               long_opts, NULL)) != -1) {
+               switch (c) {
+               case 'i':
+                       archive_id = strtoul(optarg, &end, 0);
+                       if (*end != '\0') {
+                               fprintf(stderr, "error: %s: bad archive ID "
+                                       "'%s'\n", argv[0], optarg);
+                               return CMD_HELP;
+                       }
+                       break;
+               case 'm':
+                       mntpath = optarg;
+                       break;
+               case '?':
+                       return CMD_HELP;
+               default:
+                       fprintf(stderr, "%s: option '%s' unrecognized\n",
+                               argv[0], argv[optind - 1]);
+                       return CMD_HELP;
+               }
+       }
+
+       /* covers both a missing -i and an explicit "-i 0" */
+       if (archive_id == 0) {
+               fprintf(stderr, "%s: must specify an archive ID\n", argv[0]);
+               return CMD_HELP;
+       }
+
+       if (mntpath == NULL) {
+               fprintf(stderr, "%s: must specify Lustre mount point\n",
+                       argv[0]);
+               return CMD_HELP;
+       }
+
+       if (argc <= optind) {
+               fprintf(stderr, "%s: must specify one or more fids\n", argv[0]);
+               return CMD_HELP;
+       }
+
+       while (optind < argc) {
+               int rc2;
+
+               fidstr = argv[optind++];
+
+               rc2 = llapi_pcc_attach_fid_str(mntpath, fidstr,
+                                              archive_id, type);
+               if (rc2 < 0) {
+                       /* rc2 is a negative errno: negate it for strerror() */
+                       fprintf(stderr, "%s: cannot attach '%s' on '%s' to PCC "
+                               "with archive ID '%u': %s\n", argv[0],
+                               fidstr, mntpath, archive_id, strerror(-rc2));
+                       /* remember only the first failure */
+                       if (rc == 0)
+                               rc = rc2;
+               }
+       }
+       return rc;
+}
+
+/**
+ * lfs_pcc_detach() - Detach files, given by path, from PCC.
+ * @argc: The count of command line arguments.
+ * @argv: Array of strings for command line arguments.
+ *
+ * Accepts an optional "--keep|-k" followed by one or more file names.
+ * Files are detached with PCC_DETACH_OPT_UNCACHE unless -k is given, in
+ * which case PCC_DETACH_OPT_NONE is used.  Processing continues after a
+ * per-file failure.
+ *
+ * Return: 0 on success, CMD_HELP on a usage error, otherwise the first
+ * negative error code encountered.
+ */
+static int lfs_pcc_detach(int argc, char **argv)
+{
+       struct option long_opts[] = {
+       { .val = 'k',   .name = "keep", .has_arg = no_argument },
+       { .name = NULL } };
+       char                     short_opts[] = "k";
+       int                      c;
+       int                      rc = 0;
+       const char              *path;
+       char                     fullpath[PATH_MAX];
+       __u32                    detach_opt = PCC_DETACH_OPT_UNCACHE;
+
+       optind = 0;
+       while ((c = getopt_long(argc, argv, short_opts,
+                               long_opts, NULL)) != -1) {
+               switch (c) {
+               case 'k':
+                       detach_opt = PCC_DETACH_OPT_NONE;
+                       break;
+               case '?':
+                       return CMD_HELP;
+               default:
+                       fprintf(stderr, "%s: option '%s' unrecognized\n",
+                               argv[0], argv[optind - 1]);
+                       return CMD_HELP;
+               }
+       }
+
+       /* like attach and state, refuse to do nothing silently */
+       if (argc <= optind) {
+               fprintf(stderr, "%s: must specify one or more file names\n",
+                       argv[0]);
+               return CMD_HELP;
+       }
+
+       while (optind < argc) {
+               int rc2;
+
+               path = argv[optind++];
+               if (realpath(path, fullpath) == NULL) {
+                       fprintf(stderr, "%s: could not find path '%s': %s\n",
+                               argv[0], path, strerror(errno));
+                       if (rc == 0)
+                               rc = -EINVAL;
+                       continue;
+               }
+
+               rc2 = llapi_pcc_detach_file(fullpath, detach_opt);
+               if (rc2 < 0) {
+                       /* the failure reason is taken from errno here */
+                       rc2 = -errno;
+                       fprintf(stderr, "%s: cannot detach '%s' from PCC: "
+                               "%s\n", argv[0], path, strerror(errno));
+                       /* remember only the first failure */
+                       if (rc == 0)
+                               rc = rc2;
+               }
+       }
+       return rc;
+}
+
+/**
+ * lfs_pcc_detach_fid() - Detach files, given by FID, from PCC.
+ * @argc: The count of command line arguments.
+ * @argv: Array of strings for command line arguments.
+ *
+ * Accepts an optional "--keep|-k", then the Lustre mount point, then one
+ * or more FID strings.  FIDs are detached with PCC_DETACH_OPT_UNCACHE
+ * unless -k is given, in which case PCC_DETACH_OPT_NONE is used.
+ * Processing continues after a per-FID failure.
+ *
+ * Return: 0 on success, CMD_HELP on a usage error, otherwise the first
+ * negative error code encountered.
+ */
+static int lfs_pcc_detach_fid(int argc, char **argv)
+{
+       struct option long_opts[] = {
+       { .val = 'k',   .name = "keep", .has_arg = no_argument },
+       { .name = NULL } };
+       char             short_opts[] = "k";
+       int              c;
+       int              rc = 0;
+       const char      *fid;
+       const char      *mntpath;
+       __u32            detach_opt = PCC_DETACH_OPT_UNCACHE;
+
+       optind = 0;
+       while ((c = getopt_long(argc, argv, short_opts,
+                               long_opts, NULL)) != -1) {
+               switch (c) {
+               case 'k':
+                       detach_opt = PCC_DETACH_OPT_NONE;
+                       break;
+               case '?':
+                       return CMD_HELP;
+               default:
+                       fprintf(stderr, "%s: option '%s' unrecognized\n",
+                               argv[0], argv[optind - 1]);
+                       return CMD_HELP;
+               }
+       }
+
+       /* need the mount point plus at least one FID; this also keeps
+        * argv[optind++] below from stepping past argc
+        */
+       if (argc - optind < 2) {
+               fprintf(stderr, "%s: must specify Lustre mount point and "
+                       "one or more fids\n", argv[0]);
+               return CMD_HELP;
+       }
+
+       mntpath = argv[optind++];
+
+       while (optind < argc) {
+               int rc2;
+
+               fid = argv[optind++];
+
+               rc2 = llapi_pcc_detach_fid_str(mntpath, fid, detach_opt);
+               if (rc2 < 0) {
+                       fprintf(stderr, "%s: cannot detach '%s' on '%s' "
+                               "from PCC: %s\n", argv[0], fid, mntpath,
+                               strerror(-rc2));
+                       /* remember only the first failure */
+                       if (rc == 0)
+                               rc = rc2;
+               }
+       }
+       return rc;
+}
+
+/**
+ * lfs_pcc_state() - Print the PCC state of the given files.
+ * @argc: The count of command line arguments.
+ * @argv: Array of strings for command line arguments.
+ *
+ * Every argument after argv[0] is treated as a file name.  Each path is
+ * resolved with realpath() and queried with llapi_pcc_state_get(); one
+ * line is printed per file.  The PCC file, user number and flags fields
+ * are omitted when the type is LU_PCC_NONE and the open count is zero.
+ *
+ * Return: 0 on success, CMD_HELP when no file is given, otherwise the
+ * first negative error code encountered.
+ */
+static int lfs_pcc_state(int argc, char **argv)
+{
+       struct lu_pcc_state      pcc_state;
+       char                     resolved[PATH_MAX];
+       const char              *name;
+       int                      first_err = 0;
+
+       optind = 1;
+
+       if (argc <= 1) {
+               fprintf(stderr, "%s: must specify one or more file names\n",
+                       argv[0]);
+               return CMD_HELP;
+       }
+
+       while (optind < argc) {
+               int err;
+
+               name = argv[optind++];
+               if (realpath(name, resolved) == NULL) {
+                       fprintf(stderr, "%s: could not find path '%s': %s\n",
+                               argv[0], name, strerror(errno));
+                       if (first_err == 0)
+                               first_err = -EINVAL;
+                       continue;
+               }
+
+               err = llapi_pcc_state_get(resolved, &pcc_state);
+               if (err < 0) {
+                       if (first_err == 0)
+                               first_err = err;
+                       fprintf(stderr, "%s: cannot get PCC state of '%s': "
+                               "%s\n", argv[0], name, strerror(-err));
+                       continue;
+               }
+
+               printf("file: %s, type: %s", name,
+                      pcc_type2string(pcc_state.pccs_type));
+
+               /* not cached and not open: nothing more to report */
+               if (pcc_state.pccs_type == LU_PCC_NONE &&
+                   pcc_state.pccs_open_count == 0) {
+                       printf("\n");
+                       continue;
+               }
+
+               printf(", PCC file: %s, user number: %u, flags: %x\n",
+                      pcc_state.pccs_path, pcc_state.pccs_open_count,
+                      pcc_state.pccs_flags);
+       }
+       return first_err;
+}
+
+/**
+ * lfs_pcc_list_commands() - Print the names of the lfs pcc sub-commands.
+ * @argc: The count of command line arguments (not used).
+ * @argv: Array of strings for command line arguments (not used).
+ *
+ * Hands pcc_cmdlist[] to Parser_list_commands() together with an 81-byte
+ * scratch line buffer.
+ *
+ * Return: always 0.
+ */
+static int lfs_pcc_list_commands(int argc, char **argv)
+{
+       char line[81] = { '\0' };
+
+       Parser_list_commands(pcc_cmdlist, line, sizeof(line), NULL, 0, 4);
+       return 0;
+}
+
+/**
+ * lfs_pcc() - Parse and execute lfs pcc commands.
+ * @argc: The count of lfs pcc command line arguments.
+ * @argv: Array of strings for lfs pcc command line arguments.
+ *
+ * This function parses lfs pcc commands and performs the
+ * corresponding functions specified in pcc_cmdlist[].  When no
+ * sub-command is given, Parser_commands() is called instead.
+ *
+ * While the sub-command runs, progname and program_invocation_short_name
+ * are set to "<progname> <argv[0]>".  That string lives in a local buffer,
+ * so both are restored before returning to avoid leaving them pointing
+ * into a dead stack frame.
+ *
+ * Return: 0 on success or a positive error code on failure.
+ */
+static int lfs_pcc(int argc, char **argv)
+{
+       __typeof__(progname) saved_progname = progname;
+       char *saved_short_name = program_invocation_short_name;
+       char cmd[PATH_MAX];
+       int rc = 0;
+
+       setlinebuf(stdout);
+
+       Parser_init("lfs-pcc > ", pcc_cmdlist);
+
+       snprintf(cmd, sizeof(cmd), "%s %s", progname, argv[0]);
+       progname = cmd;
+       program_invocation_short_name = cmd;
+       if (argc > 1)
+               rc = Parser_execarg(argc - 1, argv + 1, pcc_cmdlist);
+       else
+               rc = Parser_commands();
+
+       /* cmd goes out of scope on return: drop the references to it */
+       progname = saved_progname;
+       program_invocation_short_name = saved_short_name;
+
+       return rc < 0 ? -rc : rc;
+}
+
 static int lfs_list_commands(int argc, char **argv)
 {
        char buffer[81] = ""; /* 80 printable chars + terminating NUL */
index 93e72be..a0c8a77 100644 (file)
@@ -1321,7 +1321,12 @@ static int ct_remove(const struct hsm_action_item *hai, const long hal_flags)
                rc = -errno;
                CT_ERROR(rc, "cannot unlink '%s'", attr);
                err_minor++;
-               goto fini;
+
+               /* ignore the error when lov file does not exist. */
+               if (rc == -ENOENT)
+                       rc = 0;
+               else
+                       goto fini;
        }
 
 fini:
index 39624f2..c8a68ef 100644 (file)
@@ -1140,7 +1140,37 @@ int llapi_hsm_action_begin(struct hsm_copyaction_private **phcp,
                        goto err_out;
        } else if (hai->hai_action == HSMA_REMOVE) {
                /* Since remove is atomic there is no need to send an
-                * initial MDS_HSM_PROGRESS RPC. */
+                * initial MDS_HSM_PROGRESS RPC.
+                * RW-PCC uses Lustre HSM mechanism for data synchronization.
+                * At the beginning of RW-PCC attach, the client tries to
+                * exclusively open the file by using a lease lock. A
+                * successful lease open ensures that the current attach
+                * process is the unique opener for the file.
+                * After taking the lease, the file data is then copied from
+                * OSTs into PCC and then the client closes the lease
+                * with a PCC attach intent.
+                * However, for a file in the HSM exists, archived state (i.e.
+                * a cached file that was just detached from PCC and restored
+                * into OSTs), an HSM REMOVE request may wrongly delete the
+                * above PCC copy during RW-PCC attach.
+                * Thus, an open/close on the corresponding Lustre file is added
+                * for HSMA_REMOVE here to solve this conflict.
+                */
+               fd = ct_open_by_fid(hcp->ct_priv, &hai->hai_fid,
+                               O_RDONLY | O_NOATIME | O_NOFOLLOW | O_NONBLOCK);
+               if (fd < 0) {
+                       rc = fd;
+                       /* ignore the error in case of Remove Archive on Last
+                        * Unlink (RAoLU).
+                        */
+                       if (rc == -ENOENT) {
+                               rc = 0;
+                               goto out_log;
+                       }
+                       goto err_out;
+               }
+
+               hcp->source_fd = fd;
                goto out_log;
        }
 
diff --git a/lustre/utils/liblustreapi_pcc.c b/lustre/utils/liblustreapi_pcc.c
new file mode 100644 (file)
index 0000000..398239d
--- /dev/null
@@ -0,0 +1,452 @@
+/*
+ * LGPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * All rights reserved. This program and the accompanying materials
+ * are made available under the terms of the GNU Lesser General Public License
+ * (LGPL) version 2.1 or (at your discretion) any later version.
+ * (LGPL) version 2.1 accompanies this distribution, and is available at
+ * http://www.gnu.org/licenses/lgpl-2.1.html
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * LGPL HEADER END
+ */
+/*
+ * Copyright (c) 2017, DDN Storage Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ */
+/*
+ *
+ * lustreapi library for Persistent Client Cache.
+ *
+ * Author: Li Xi <lixi@ddn.com>
+ * Author: Qian Yingjin <qian@ddn.com>
+ */
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <lustre/lustreapi.h>
+#include <linux/lustre/lustre_user.h>
+#include <linux/lustre/lustre_fid.h>
+#include <errno.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/ioctl.h>
+#include "lustreapi_internal.h"
+
+/**
+ * Fetch and attach a file to readwrite PCC.
+ *
+ * Acquires a write lease on \a fd, then calls llapi_lease_set() with mode
+ * LL_LEASE_UNLCK, the LL_LEASE_PCC_ATTACH flag and \a archive_id as the
+ * only ID.
+ *
+ * \param fd           File handle of the file to attach.
+ * \param archive_id   Archive ID stored in lil_ids[0] of the request.
+ *
+ * \return 0 on success, a negative error code otherwise (-ENOMEM if the
+ *         request cannot be allocated, -EBUSY if llapi_lease_set()
+ *         returns 0, i.e. the lease was lost).
+ */
+static int llapi_readwrite_pcc_attach_fd(int fd, __u32 archive_id)
+{
+       struct ll_ioc_lease *data;
+       int rc;
+
+       /* Allocate the request before taking the lease so that an
+        * allocation failure cannot return with the lease still held.
+        */
+       data = malloc(offsetof(typeof(*data), lil_ids[1]));
+       if (!data) {
+               llapi_err_noerrno(LLAPI_MSG_ERROR,
+                                 "failed to allocate memory");
+               return -ENOMEM;
+       }
+
+       rc = llapi_lease_acquire(fd, LL_LEASE_WRLCK);
+       if (rc < 0) {
+               llapi_error(LLAPI_MSG_ERROR, rc, "cannot get lease");
+               goto out_free;
+       }
+
+       data->lil_mode = LL_LEASE_UNLCK;
+       data->lil_flags = LL_LEASE_PCC_ATTACH;
+       data->lil_count = 1;
+       data->lil_ids[0] = archive_id;
+       rc = llapi_lease_set(fd, data);
+       if (rc <= 0) {
+               if (rc == 0) /* lost lease lock */
+                       rc = -EBUSY;
+               llapi_error(LLAPI_MSG_ERROR, rc,
+                           "cannot attach with ID: %u", archive_id);
+       } else {
+               rc = 0;
+       }
+
+out_free:
+       free(data);
+       return rc;
+}
+
+/**
+ * Attach the file at \a path to readwrite PCC.
+ *
+ * Opens the file with O_RDWR | O_NONBLOCK, attaches it through
+ * llapi_readwrite_pcc_attach_fd() and closes the descriptor again.
+ *
+ * \param path         Path of the file to attach.
+ * \param archive_id   Archive ID forwarded to the attach call.
+ *
+ * \return 0 on success, a negative error code otherwise.
+ */
+static int llapi_readwrite_pcc_attach(const char *path, __u32 archive_id)
+{
+       int rc;
+       int fd = open(path, O_RDWR | O_NONBLOCK);
+
+       if (fd >= 0) {
+               rc = llapi_readwrite_pcc_attach_fd(fd, archive_id);
+               close(fd);
+               return rc;
+       }
+
+       rc = -errno;
+       llapi_error(LLAPI_MSG_ERROR, rc, "cannot open '%s'", path);
+       return rc;
+}
+
+/**
+ * Attach a file, given by path, to PCC.
+ *
+ * \param path Path of the file to attach.
+ * \param id   Archive ID forwarded to the attach call.
+ * \param type PCC type; only LU_PCC_READWRITE is handled.
+ *
+ * \return 0 on success, a negative error code otherwise (-EINVAL for any
+ *         other \a type).
+ */
+int llapi_pcc_attach(const char *path, __u32 id, enum lu_pcc_type type)
+{
+       if (type == LU_PCC_READWRITE)
+               return llapi_readwrite_pcc_attach(path, id);
+
+       return -EINVAL;
+}
+
+/**
+ * Attach the file identified by \a fid to readwrite PCC.
+ *
+ * Opens the file with llapi_open_by_fid() using O_RDWR | O_NONBLOCK,
+ * attaches it through llapi_readwrite_pcc_attach_fd() and closes the
+ * descriptor again.
+ *
+ * \param mntpath      Path passed to llapi_open_by_fid() to locate the
+ *                     file system.
+ * \param fid          FID of the file.
+ * \param id           Archive ID forwarded to the attach call.
+ *
+ * \return 0 on success, a negative error code otherwise.
+ */
+static int llapi_readwrite_pcc_attach_fid(const char *mntpath,
+                                         const struct lu_fid *fid,
+                                         __u32 id)
+{
+       int rc;
+       int fd;
+
+       fd = llapi_open_by_fid(mntpath, fid, O_RDWR | O_NONBLOCK);
+       if (fd < 0) {
+               rc = -errno;
+               /* keep a blank between the FID and "failed" in the message */
+               llapi_error(LLAPI_MSG_ERROR, rc,
+                           "llapi_open_by_fid for " DFID " failed",
+                           PFID(fid));
+               return rc;
+       }
+
+       rc = llapi_readwrite_pcc_attach_fd(fd, id);
+
+       close(fd);
+       return rc;
+}
+
+/**
+ * Attach a file, given by FID, to PCC.
+ *
+ * \param mntpath      Path forwarded to the FID-based attach helper.
+ * \param fid          FID of the file.
+ * \param id           Archive ID forwarded to the attach call.
+ * \param type         PCC type; only LU_PCC_READWRITE is handled.
+ *
+ * \return 0 on success, a negative error code otherwise (-EINVAL for any
+ *         other \a type).
+ */
+int llapi_pcc_attach_fid(const char *mntpath, const struct lu_fid *fid,
+                        __u32 id, enum lu_pcc_type type)
+{
+       if (type == LU_PCC_READWRITE)
+               return llapi_readwrite_pcc_attach_fid(mntpath, fid, id);
+
+       return -EINVAL;
+}
+
+
+/**
+ * Attach a file, given by a FID string, to PCC.
+ *
+ * Leading '[' characters of \a fidstr are skipped before it is parsed.
+ * As in llapi_pcc_detach_fid_str(), a FID that parses but is not sane is
+ * rejected up front.
+ *
+ * \param mntpath      Path forwarded to llapi_pcc_attach_fid().
+ * \param fidstr       FID string of the file.
+ * \param id           Archive ID forwarded to the attach call.
+ * \param type         PCC type forwarded to llapi_pcc_attach_fid().
+ *
+ * \return 0 on success, a negative error code otherwise (-EINVAL for a
+ *         malformed or insane FID).
+ */
+int llapi_pcc_attach_fid_str(const char *mntpath, const char *fidstr,
+                            __u32 id, enum lu_pcc_type type)
+{
+       int rc;
+       struct lu_fid fid;
+       const char *fidstr_orig = fidstr;
+
+       while (*fidstr == '[')
+               fidstr++;
+       rc = sscanf(fidstr, SFID, RFID(&fid));
+       if (rc != 3 || !fid_is_sane(&fid)) {
+               llapi_err_noerrno(LLAPI_MSG_ERROR,
+                                 "bad FID format '%s', should be [seq:oid:ver]"
+                                 " (e.g. "DFID")\n", fidstr_orig,
+                                 (unsigned long long)FID_SEQ_NORMAL, 2, 0);
+               return -EINVAL;
+       }
+
+       rc = llapi_pcc_attach_fid(mntpath, &fid, id, type);
+
+       return rc;
+}
+
+/**
+ * detach PCC cache of a file by using fd.
+ *
+ * \param fd           File handle.
+ * \param option       Detach option
+ *
+ * \return 0 on success, a negative errno value otherwise.
+ */
+int llapi_pcc_detach_fd(int fd, __u32 option)
+{
+       struct lu_pcc_detach detach;
+       int rc;
+
+       detach.pccd_opt = option;
+       rc = ioctl(fd, LL_IOC_PCC_DETACH, &detach);
+       /* ioctl() reports failure as -1 with errno set: hand back the
+        * actual error code instead of a bare -1
+        */
+       if (rc < 0)
+               rc = -errno;
+
+       return rc;
+}
+
+/**
+ * detach PCC cache of a file via FID.
+ *
+ * \param mntpath      Fullpath to the client mount point.
+ * \param fid          FID of the file.
+ * \param option       Detach option.
+ *
+ * \return 0 on success, the get_root_path() error if the mount point cannot
+ *         be opened, or a negative errno value if the ioctl fails.
+ */
+int llapi_pcc_detach_fid(const char *mntpath, const struct lu_fid *fid,
+                        __u32 option)
+{
+       int rc;
+       int fd;
+       struct lu_pcc_detach_fid detach;
+
+       rc = get_root_path(WANT_FD, NULL, &fd, (char *)mntpath, -1);
+       if (rc) {
+               llapi_error(LLAPI_MSG_ERROR, rc, "cannot get root path: %s",
+                           mntpath);
+               return rc;
+       }
+
+       /*
+        * PCC prefetching algorithm scans Lustre OPEN/CLOSE changelogs
+        * to determine the candidate files needing to prefetch into
+        * PCC. To avoid generation of unnecessary open/close changelogs,
+        * we implement a new dir ioctl LL_IOC_PCC_DETACH_BY_FID to detach
+        * files.
+        */
+       detach.pccd_fid = *fid;
+       detach.pccd_opt = option;
+       rc = ioctl(fd, LL_IOC_PCC_DETACH_BY_FID, &detach);
+       /*
+        * Convert the -1/errno convention of ioctl() into -errno, and do it
+        * before close() gets a chance to overwrite errno.
+        */
+       rc = rc ? -errno : 0;
+
+       close(fd);
+       return rc;
+}
+
+/**
+ * detach PCC cache of a file via its FID string.
+ *
+ * \param mntpath      Fullpath to the client mount point.
+ * \param fidstr       FID string of the file; leading '[' are skipped.
+ * \param option       Detach option.
+ *
+ * \return -EINVAL when \a fidstr does not parse into a sane FID, otherwise
+ *         the result of llapi_pcc_detach_fid().
+ */
+int llapi_pcc_detach_fid_str(const char *mntpath, const char *fidstr,
+                            __u32 option)
+{
+       const char *cursor = fidstr;
+       struct lu_fid parsed;
+       int nfields;
+
+       /* step over any opening brackets before handing off to sscanf() */
+       for (; *cursor == '['; cursor++)
+               ;
+
+       nfields = sscanf(cursor, SFID, RFID(&parsed));
+       if (nfields == 3 && fid_is_sane(&parsed))
+               return llapi_pcc_detach_fid(mntpath, &parsed, option);
+
+       /* report the string exactly as the caller supplied it */
+       llapi_err_noerrno(LLAPI_MSG_ERROR,
+                         "bad FID format '%s', should be [seq:oid:ver]"
+                         " (e.g. "DFID")\n", fidstr,
+                         (unsigned long long)FID_SEQ_NORMAL, 2, 0);
+       return -EINVAL;
+}
+
+/**
+ * detach PCC cache of a file given by path.
+ *
+ * The file is opened with O_RDWR | O_NONBLOCK, passed to
+ * llapi_pcc_detach_fd() and closed again.
+ *
+ * \param path         Fullpath to the file to operate on.
+ * \param option       Detach option.
+ *
+ * \return -errno if the file cannot be opened, otherwise the result of
+ *         llapi_pcc_detach_fd().
+ */
+int llapi_pcc_detach_file(const char *path, __u32 option)
+{
+       int result;
+       int filedes = open(path, O_RDWR | O_NONBLOCK);
+
+       if (filedes >= 0) {
+               result = llapi_pcc_detach_fd(filedes, option);
+               close(filedes);
+               return result;
+       }
+
+       result = -errno;
+       llapi_error(LLAPI_MSG_ERROR, result, "cannot open '%s'",
+                   path);
+       return result;
+}
+
+/**
+ * Query the current PCC state of an open file.
+ *
+ * \param fd   File handle.
+ * \param state        Buffer handed to the LL_IOC_PCC_STATE ioctl.
+ *
+ * \return 0 on success, -errno if the ioctl fails.
+ */
+int llapi_pcc_state_get_fd(int fd, struct lu_pcc_state *state)
+{
+       /* any non-zero ioctl result is reported as the negated errno */
+       if (ioctl(fd, LL_IOC_PCC_STATE, state) != 0)
+               return -errno;
+
+       return 0;
+}
+
+/**
+ * Query the current PCC state of the file found at \a path.
+ *
+ * The file is opened with O_RDONLY | O_NONBLOCK for the duration of the
+ * query; see llapi_pcc_state_get_fd() for \a state and the return value.
+ *
+ * \return -errno if the file cannot be opened, otherwise the result of
+ *         llapi_pcc_state_get_fd().
+ */
+int llapi_pcc_state_get(const char *path, struct lu_pcc_state *state)
+{
+       int result;
+       int filedes = open(path, O_RDONLY | O_NONBLOCK);
+
+       if (filedes < 0)
+               return -errno;
+
+       result = llapi_pcc_state_get_fd(filedes, state);
+       close(filedes);
+
+       return result;
+}
+
+/**
+ * Add/delete a PCC backend on a client.
+ *
+ * Writes \a cmd to the first parameter file matching "llite/<name>/pcc",
+ * where <name> is what llapi_getname() returns for \a mntpath.
+ *
+ * \param mntpath      Path handed to llapi_getname().
+ * \param cmd          Command string written verbatim to the parameter file.
+ *
+ * \return 0 on success, a negative value otherwise (-EINVAL when the
+ *         command was only partially written).
+ */
+int llapi_pccdev_set(const char *mntpath, const char *cmd)
+{
+       char buf[sizeof(struct obd_uuid)];
+       glob_t path;
+       ssize_t count;
+       size_t len;
+       int fd;
+       int rc;
+
+       rc = llapi_getname(mntpath, buf, sizeof(buf));
+       if (rc < 0) {
+               llapi_error(LLAPI_MSG_ERROR, rc,
+                           "cannot get name for '%s'\n", mntpath);
+               return rc;
+       }
+
+       rc = cfs_get_param_paths(&path, "llite/%s/pcc", buf);
+       if (rc != 0)
+               return -errno;
+
+       fd = open(path.gl_pathv[0], O_WRONLY);
+       if (fd < 0) {
+               rc = -errno;
+               llapi_error(LLAPI_MSG_ERROR, rc, "error opening %s",
+                           path.gl_pathv[0]);
+               goto out;
+       }
+
+       len = strlen(cmd);
+       count = write(fd, cmd, len);
+       if (count < 0) {
+               /*
+                * Negate errno: callers test for rc < 0, so a positive
+                * value would make a failed write look like success.
+                */
+               rc = -errno;
+               if (rc != -EIO)
+                       llapi_error(LLAPI_MSG_ERROR, rc,
+                                   "error: setting llite.%s.pcc=\"%s\"\n",
+                                   buf, cmd);
+       } else if ((size_t)count < len) { /* Truncate case */
+               rc = -EINVAL;
+               llapi_error(LLAPI_MSG_ERROR, rc,
+                           "setting llite.%s.pcc=\"%s\": wrote only %zd\n",
+                           buf, cmd, count);
+       }
+       close(fd);
+out:
+       cfs_free_param_data(&path);
+       return rc;
+}
+
+/**
+ * List all PCC backend devices on a client.
+ *
+ * Copies the contents of the first parameter file matching
+ * "llite/<name>/pcc" to stdout, where <name> is what llapi_getname()
+ * returns for \a mntpath.
+ *
+ * \param mntpath      Path handed to llapi_getname().
+ *
+ * \return 0 on success, a negative value otherwise.
+ */
+int llapi_pccdev_get(const char *mntpath)
+{
+       long page_size = sysconf(_SC_PAGESIZE);
+       char pathbuf[sizeof(struct obd_uuid)];
+       glob_t path;
+       char *buf;
+       int fd;
+       int rc;
+
+       rc = llapi_getname(mntpath, pathbuf, sizeof(pathbuf));
+       if (rc < 0) {
+               llapi_error(LLAPI_MSG_ERROR, rc,
+                           "cannot get name for '%s'\n", mntpath);
+               return rc;
+       }
+
+       rc = cfs_get_param_paths(&path, "llite/%s/pcc", pathbuf);
+       if (rc != 0)
+               return -errno;
+
+       /* Read the contents of file to stdout */
+       fd = open(path.gl_pathv[0], O_RDONLY);
+       if (fd < 0) {
+               rc = -errno;
+               llapi_error(LLAPI_MSG_ERROR, rc,
+                           "error: pccdev_get: opening '%s'\n",
+                           path.gl_pathv[0]);
+               goto out_free_param;
+       }
+
+       buf = calloc(1, page_size);
+       if (buf == NULL) {
+               rc = -ENOMEM;
+               llapi_error(LLAPI_MSG_ERROR, rc,
+                           "error: pccdev_get: allocating '%s' buffer\n",
+                           path.gl_pathv[0]);
+               /* buf is NULL here, so the free() below is a no-op */
+               goto out_close;
+       }
+
+       /* copy one page-sized chunk at a time until EOF or an error */
+       while (1) {
+               ssize_t count = read(fd, buf, page_size);
+
+               if (count == 0)
+                       break;
+               if (count < 0) {
+                       rc = -errno;
+                       if (rc != -EIO) {
+                               llapi_error(LLAPI_MSG_ERROR, rc,
+                                           "error: pccdev_get: "
+                                           "reading failed\n");
+                       }
+                       break;
+               }
+
+               /* count is positive here, so the cast is lossless */
+               if (fwrite(buf, 1, count, stdout) != (size_t)count) {
+                       rc = -errno;
+                       llapi_error(LLAPI_MSG_ERROR, rc,
+                                   "error: pccdev_get: write to stdout\n");
+                       break;
+               }
+       }
+out_close:
+       close(fd);
+       free(buf);
+out_free_param:
+       cfs_free_param_data(&path);
+       return rc;
+}
index 2f81597..15a63b5 100644 (file)
@@ -4861,3 +4861,119 @@ int jt_changelog_deregister(int argc, char **argv)
 
        return 0;
 }
+
+/**
+ * lctl handler: add a PCC backend.
+ *
+ * Expects "-p|--param <param>" followed by the mount path and the PCC path,
+ * builds the command "add <pccpath> <param>" and passes it to
+ * llapi_pccdev_set().
+ *
+ * \return CMD_HELP on a usage error, -E2BIG if the command does not fit in
+ *         the buffer, otherwise the result of llapi_pccdev_set().
+ */
+int jt_pcc_add(int argc, char **argv)
+{
+       struct option long_opts[] = {
+               { .val = 'p', .name = "param", .has_arg = required_argument },
+               { .name = NULL } };
+       const char *mntpath;
+       const char *pccpath;
+       char *param = NULL;
+       char cmd[PATH_MAX];
+       int rc;
+
+       optind = 1;
+       while ((rc = getopt_long(argc, argv, "p:",
+               long_opts, NULL)) != -1) {
+               switch (rc) {
+               case 'p':
+                       param = optarg;
+                       break;
+               default:
+                       return CMD_HELP;
+               }
+       }
+
+       if (!param) {
+               fprintf(stderr, "%s: must specify the config param for PCC\n",
+                       jt_cmdname(argv[0]));
+               return CMD_HELP;
+       }
+
+       if (optind + 2 != argc) {
+               fprintf(stderr,
+                       "%s: must specify mount path and PCC path %d:%d\n",
+                       jt_cmdname(argv[0]), optind, argc);
+               return CMD_HELP;
+       }
+
+       mntpath = argv[optind++];
+       pccpath = argv[optind];
+
+       /* never hand a silently truncated command to the PCC interface */
+       rc = snprintf(cmd, sizeof(cmd), "add %s %s", pccpath, param);
+       if (rc < 0 || rc >= (int)sizeof(cmd)) {
+               fprintf(stderr, "%s: PCC command is too long\n",
+                       jt_cmdname(argv[0]));
+               return -E2BIG;
+       }
+
+       rc = llapi_pccdev_set(mntpath, cmd);
+       if (rc < 0)
+               fprintf(stderr, "%s: failed to run '%s' on %s\n",
+                       jt_cmdname(argv[0]), cmd, mntpath);
+
+       return rc;
+}
+
+/**
+ * lctl handler: delete a PCC backend.
+ *
+ * Takes the mount path and the PCC path, builds the command
+ * "del <pccpath>" and passes it to llapi_pccdev_set().
+ *
+ * \return CMD_HELP on a usage error, -E2BIG if the command does not fit in
+ *         the buffer, otherwise the result of llapi_pccdev_set().
+ */
+int jt_pcc_del(int argc, char **argv)
+{
+       const char *mntpath;
+       const char *pccpath;
+       char cmd[PATH_MAX];
+       int rc;
+
+       optind = 1;
+       if (argc != 3) {
+               fprintf(stderr, "%s: require 3 arguments\n",
+                       jt_cmdname(argv[0]));
+               return CMD_HELP;
+       }
+
+       mntpath = argv[optind++];
+       pccpath = argv[optind++];
+
+       /*
+        * A truncated path could name a different backend, so refuse to
+        * continue rather than delete the wrong one.
+        */
+       rc = snprintf(cmd, sizeof(cmd), "del %s", pccpath);
+       if (rc < 0 || rc >= (int)sizeof(cmd)) {
+               fprintf(stderr, "%s: PCC path is too long\n",
+                       jt_cmdname(argv[0]));
+               return -E2BIG;
+       }
+
+       rc = llapi_pccdev_set(mntpath, cmd);
+       if (rc < 0)
+               fprintf(stderr, "%s: failed to run '%s' on %s\n",
+                       jt_cmdname(argv[0]), cmd, mntpath);
+
+       return rc;
+}
+
+/**
+ * lctl handler: send the "clear" command for the given mount path to
+ * llapi_pccdev_set().
+ *
+ * \return CMD_HELP on a usage error, otherwise the result of
+ *         llapi_pccdev_set().
+ */
+int jt_pcc_clear(int argc, char **argv)
+{
+       const char *mount;
+       int result;
+
+       optind = 1;
+       if (argc == 2) {
+               mount = argv[optind];
+               result = llapi_pccdev_set(mount, "clear");
+               if (result < 0)
+                       fprintf(stderr, "%s: failed to run 'clear' on %s\n",
+                               jt_cmdname(argv[0]), mount);
+
+               return result;
+       }
+
+       fprintf(stderr, "%s: require 2 arguments\n",
+               jt_cmdname(argv[0]));
+       return CMD_HELP;
+}
+
+/**
+ * lctl handler: print the PCC backends of the given mount path through
+ * llapi_pccdev_get().
+ *
+ * \return CMD_HELP on a usage error, otherwise the result of
+ *         llapi_pccdev_get().
+ */
+int jt_pcc_list(int argc, char **argv)
+{
+       const char *mount;
+       int result;
+
+       optind = 1;
+       if (argc == 2) {
+               mount = argv[optind];
+               result = llapi_pccdev_get(mount);
+               if (result < 0)
+                       fprintf(stderr, "%s: failed to run 'pcc list' on %s\n",
+                               jt_cmdname(argv[0]), mount);
+
+               return result;
+       }
+
+       fprintf(stderr, "%s: require 2 arguments\n",
+               jt_cmdname(argv[0]));
+       return CMD_HELP;
+}
index 8ca38a5..cf06edd 100644 (file)
@@ -194,6 +194,10 @@ int jt_nodemap_set_sepol(int argc, char **argv);
 int jt_nodemap_info(int argc, char **argv);
 int jt_changelog_register(int argc, char **argv);
 int jt_changelog_deregister(int argc, char **argv);
+int jt_pcc_add(int argc, char **argv);
+int jt_pcc_del(int argc, char **argv);
+int jt_pcc_clear(int argc, char **argv);
+int jt_pcc_list(int argc, char **argv);
 
 #ifdef HAVE_SERVER_SUPPORT
 /* lustre_lfsck.c */