From 58d744e3eaab358ef346e51ff4aa17e9f08efbb3 Mon Sep 17 00:00:00 2001 From: Qian Yingjin Date: Thu, 5 Jul 2018 14:43:46 +0800 Subject: [PATCH] LU-10092 pcc: Non-blocking PCC caching Current PCC uses refcount of PCC inode to determine whether a previous PCC-attached file can be detached. If a file is open (refcount > 1), the detaching will return -EBUSY. When another client accesses the PCC-cached file, it will trigger the restore process as the file is HSM released. During restore, the Agent needs to detach the PCC-cached file. Thus, if a PCC-attached file is keeping opened but not closed for a long time, the restore request will always return failure. In this patch, we implement a non-blocking PCC caching mechanism for Lustre. After attaching the file into PCC, the client acquires the layout lock for the file, and the layout generation is maintained in the PCC inode. Under the layout lock protection, the PCC caching state is valid and all I/O will direct into PCC. When the layout lock is revoked, in the blocking AST it will invalidate the PCC caching state and detach the file automatically. This patch is also helpful to handle the ENOSPC error for PCC write by fallback to normal I/O path which will restore the file data into OSTs (The file is in HSM released state) and redo the write again. 
Change-Id: I9130c04dc0e6eae879ea2ff3fdda65726e74d177 Test-Parameters: clientcount=3 testlist=sanity-pcc,sanity-pcc,sanity-pcc Signed-off-by: Qian Yingjin Reviewed-on: https://review.whamcloud.com/32966 Tested-by: Jenkins Reviewed-by: Wang Shilong Tested-by: Maloo Reviewed-by: Patrick Farrell Reviewed-by: Oleg Drokin --- lustre/doc/lfs-pcc.1 | 24 +- lustre/doc/llapi_pcc_attach.3 | 18 +- lustre/doc/llapi_pcc_attach_fid.3 | 1 + lustre/doc/llapi_pcc_attach_fid_str.3 | 1 + lustre/include/lustre/lustreapi.h | 6 +- lustre/include/obd_support.h | 4 + lustre/include/uapi/linux/lustre/lustre_user.h | 10 +- lustre/llite/dir.c | 31 +- lustre/llite/file.c | 93 ++-- lustre/llite/llite_internal.h | 1 + lustre/llite/llite_lib.c | 2 + lustre/llite/llite_mmap.c | 36 +- lustre/llite/namei.c | 4 - lustre/llite/pcc.c | 623 +++++++++++++++++++++---- lustre/llite/pcc.h | 51 +- lustre/llite/vvp_object.c | 3 +- lustre/mdt/mdt_open.c | 32 ++ lustre/tests/multiop.c | 2 +- lustre/tests/sanity-pcc.sh | 477 ++++++++++++++++++- lustre/tests/test-framework.sh | 6 +- lustre/utils/lfs.c | 88 +++- lustre/utils/lhsmtool_posix.c | 17 +- lustre/utils/liblustreapi_hsm.c | 32 +- lustre/utils/liblustreapi_pcc.c | 154 ++++-- 24 files changed, 1447 insertions(+), 269 deletions(-) create mode 100644 lustre/doc/llapi_pcc_attach_fid.3 create mode 100644 lustre/doc/llapi_pcc_attach_fid_str.3 diff --git a/lustre/doc/lfs-pcc.1 b/lustre/doc/lfs-pcc.1 index e4746f5..c709c8d 100644 --- a/lustre/doc/lfs-pcc.1 +++ b/lustre/doc/lfs-pcc.1 @@ -2,7 +2,9 @@ .SH NAME lfs pcc commands used to interact with PCC features. .SH SYNOPSIS -.B lfs pcc attach <\fB--id\fR|\fB-i\fR NUM> <\fIfile \fR...> +.B lfs pcc attach <\fB--id\fR|\fB-i\fR \fINUM\fR> <\fIfile \fR...> +.br +.B lfs pcc attach <\fB--id\fR|\fB-i\fR \fINUM\fR> <\fB--mnt\fR|\fB-m\fR \fImntpath\fR> <\fIfid \fR...> .br .B lfs pcc detach <\fIfile \fR...> .br @@ -11,9 +13,12 @@ lfs pcc commands used to interact with PCC features. 
.B lfs pcc state <\fIfile \fR...> .SH DESCRIPTION .TP -.B lfs pcc attach <\fB--id\fR|\fB-i\fR NUM> <\fIfile \fR...> +.B lfs pcc attach <\fB--id\fR|\fB-i\fR \fINUM\fR> <\fIfile \fR...> Attach given files on the persistent client cache. .TP +.B lfs pcc attach <\fB--id\fR|\fB-i\fR \fINUM\fR> <\fB--mnt\fR|\fB-m\fR \fImntpath\fR> <\fIfid \fR...> +Attach given files into the persistent client cache by FID(s). +.TP .B lfs pcc detach <\fIfile \fR...> Detach given files from the persistent client cache. .TP @@ -28,9 +33,12 @@ Display the PCC state for given files. .B --id | -i For RW-PCC, it is HSM ARCHIVE ID to choose which backend for cache files. .TP +.B --mnt | -m +Specifies Lustre mount point. +.TP Before using RW-PCC, you need to configure HSM root and Archive ID mapping properly: .TP -.B lfs pcc add $MNTPATH $PCCPATH "$ARCHIVE_ID $PROJID" +.B lfs pcc add $MNTPATH $PCCPATH \ "$ARCHIVE_ID $PROJID" Add one PCC backend to the Lustre client, you need to specify hsm root, archive ID, and project ID. On this client any subsequently created files with this project ID will be persistently cached automatically. @@ -49,15 +57,21 @@ Enable HSM on the appropriate MDT. .B # lhsmtool_posix --daemon --hsm-root /mnt/pcc/ --archive=1 /mnt/lustre Launch one copytool on client node to connect cache storage. .TP -.B # lfs pcc add /mnt/lustre /mnt/pcc \ "1\ 1" +.B # lfs pcc add /mnt/lustre /mnt/pcc \ "1\ 100" Add HSM root and Archive ID mapping for RW-PCC. .TP .B $ lfs pcc attach -i 1 /mnt/lustre/file Attach an existing file into PCC and migrate data from lustre to Cache Device, any I/O to the Lustre file will direct to the RW-PCC copy. .TP +.B $ lfs pcc attach_fid -i 1 -m /mnt/lustre 0x200000401:0x1:0x0 +Attach an existing file referenced by FID "0x200000401:0x1:0x0" into PCC. +.TP .B $ lfs pcc detach /mnt/lustre/file -Detach the file from RW-PCC, IO to the file will come to Lustre after this command. +Detach the file from RW-PCC, IO to the file will come to Lustre after this +command. 
+.B $ lfs pcc detach_fid /mnt/lustre 0x200000401:0x1:0x0 +Detach the file referenced by FID "0x200000401:0x1:0x0" from PCC. .TP .B $ lfs pcc state /mnt/lustre/file .br diff --git a/lustre/doc/llapi_pcc_attach.3 b/lustre/doc/llapi_pcc_attach.3 index 87a9202..a283559 100644 --- a/lustre/doc/llapi_pcc_attach.3 +++ b/lustre/doc/llapi_pcc_attach.3 @@ -1,19 +1,31 @@ .TH llapi_pcc_attach 3 "2019 April 20" "Lustre User API" .SH NAME -llapi_pcc_attach \- attach a file into PCC +llapi_pcc_attach, llapi_pcc_attach_fid, llapi_pcc_attach_fid_str \- attach a file into PCC .SH SYNOPSIS .nf .B #include .PP .BI "int llapi_pcc_attach(const char *" path ", __u32 " id , .BI " enum lu_pcc_type " type ");" +.PP +.BI "int llapi_pcc_attach_fid(const char *" mntpath ", const struct lu_fid *" fid , +.BI " __u32 " id ", enum lu_pcc_type " type ");" +.PP +.BI "int llapi_pcc_attach_fid_str(const char *" mntpath ", const char *" fidstr , +.BI " __u32 " id ", enum lu_pcc_type " type ");" .fi .SH DESCRIPTION .PP The function -.B llapi_pcc_attach() +.BR llapi_pcc_attach() , +.BR llapi_pcc_attach_fid() , +and +.BR llapi_pcc_attach_fid_str() tries to attach the file referenced by -.BR path +.IR path , +.IR fid , +or +.IR fidstr into PCC backend. PCC provides a group of local caches and works in two modes: RW-PCC enables a read-write cache on the local SSDs of a single client; RO-PCC provides a read-only cache on the local SSDs of multiple clients. 
For RW-PCC, diff --git a/lustre/doc/llapi_pcc_attach_fid.3 b/lustre/doc/llapi_pcc_attach_fid.3 new file mode 100644 index 0000000..2719276 --- /dev/null +++ b/lustre/doc/llapi_pcc_attach_fid.3 @@ -0,0 +1 @@ +.so man3/llapi_pcc_attach.3 diff --git a/lustre/doc/llapi_pcc_attach_fid_str.3 b/lustre/doc/llapi_pcc_attach_fid_str.3 new file mode 100644 index 0000000..2719276 --- /dev/null +++ b/lustre/doc/llapi_pcc_attach_fid_str.3 @@ -0,0 +1 @@ +.so man3/llapi_pcc_attach.3 diff --git a/lustre/include/lustre/lustreapi.h b/lustre/include/lustre/lustreapi.h index 9b58d53..08cfba0 100644 --- a/lustre/include/lustre/lustreapi.h +++ b/lustre/include/lustre/lustreapi.h @@ -521,7 +521,11 @@ int llapi_ladvise(int fd, unsigned long long flags, int num_advise, /* PCC */ int llapi_pcc_attach(const char *path, __u32 id, enum lu_pcc_type type); -int llapi_pcc_detach_fid_fd(int fd, const struct lu_fid *fid); +int llapi_pcc_attach_fid(const char *mntpath, const struct lu_fid *fid, + __u32 id, enum lu_pcc_type type); +int llapi_pcc_attach_fid_str(const char *mntpath, const char *fidstr, + __u32 id, enum lu_pcc_type type); +int llapi_pcc_detach_fd(int fd); int llapi_pcc_detach_fid(const char *mntpath, const struct lu_fid *fid); int llapi_pcc_detach_fid_str(const char *mntpath, const char *fidstr); int llapi_pcc_detach_file(const char *path); diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 0aea7c2..d488249 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -568,6 +568,10 @@ extern char obd_jobid_var[]; #define OBD_FAIL_LLITE_IMUTEX_SEC 0x140e #define OBD_FAIL_LLITE_IMUTEX_NOSEC 0x140f #define OBD_FAIL_LLITE_OPEN_BY_NAME 0x1410 +#define OBD_FAIL_LLITE_PCC_FAKE_ERROR 0x1411 +#define OBD_FAIL_LLITE_PCC_DETACH_MKWRITE 0x1412 +#define OBD_FAIL_LLITE_PCC_MKWRITE_PAUSE 0x1413 +#define OBD_FAIL_LLITE_PCC_ATTACH_PAUSE 0x1414 #define OBD_FAIL_FID_INDIR 0x1501 #define OBD_FAIL_FID_INLMA 0x1502 diff --git 
a/lustre/include/uapi/linux/lustre/lustre_user.h b/lustre/include/uapi/linux/lustre/lustre_user.h index d9567f7..ccccdac 100644 --- a/lustre/include/uapi/linux/lustre/lustre_user.h +++ b/lustre/include/uapi/linux/lustre/lustre_user.h @@ -482,7 +482,8 @@ struct ll_ioc_lease_id { #define LL_IOC_LADVISE _IOR('f', 250, struct llapi_lu_ladvise) #define LL_IOC_HEAT_GET _IOWR('f', 251, struct lu_heat) #define LL_IOC_HEAT_SET _IOW('f', 251, __u64) -#define LL_IOC_PCC_DETACH _IOW('f', 252, struct lu_pcc_detach) +#define LL_IOC_PCC_DETACH _IO('f', 252) +#define LL_IOC_PCC_DETACH_BY_FID _IOW('f', 252, struct lu_pcc_detach) #define LL_IOC_PCC_STATE _IOR('f', 252, struct lu_pcc_state) #ifndef FS_IOC_FSGETXATTR @@ -2329,8 +2330,11 @@ struct lu_pcc_detach { }; enum lu_pcc_state_flags { - /* Whether the inode attr is cached locally */ - PCC_STATE_FLAG_ATTR_VALID = 0x1, + PCC_STATE_FL_NONE = 0x0, + /* The inode attr is cached locally */ + PCC_STATE_FL_ATTR_VALID = 0x01, + /* The file is being attached into PCC */ + PCC_STATE_FL_ATTACHING = 0x02, }; struct lu_pcc_state { diff --git a/lustre/llite/dir.c b/lustre/llite/dir.c index c70cd20..b5cbb7e 100644 --- a/lustre/llite/dir.c +++ b/lustre/llite/dir.c @@ -1983,41 +1983,12 @@ migrate_free: RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg)); case LL_IOC_FSSETXATTR: RETURN(ll_ioctl_fssetxattr(inode, cmd, arg)); - case LL_IOC_PCC_DETACH: { + case LL_IOC_PCC_DETACH_BY_FID: { struct lu_pcc_detach *detach; struct lu_fid *fid; struct inode *inode2; unsigned long ino; - /* - * The reason why a dir IOCTL is used to detach a PCC-cached - * file rather than making it a file IOCTL is: - * When PCC caching a file, it will attach the file firstly, - * and increase the refcount of PCC inode (pcci->pcci_refcount) - * from 0 to 1. - * When detaching a PCC-cached file, it will check whether the - * refcount is 1. If so, the file can be detached successfully. 
- * Otherwise, it means there are some users opened and using - * the file currently, and it will return -EBUSY. - * Each open on the PCC-cached file will increase the refcount - * of the PCC inode; - * Each close on the PCC-cached file will decrease the refcount - * of the PCC inode; - * When used a file IOCTL to detach a PCC-cached file, it needs - * to open it at first, which will increase the refcount. So - * during the process of the detach IOCTL, it will return - * -EBUSY as the PCC inode refcount is larger than 1. Someone - * might argue that here it can just decrease the refcount - * of the PCC inode, return succeed and make the close of - * IOCTL file handle to perform the real detach. But this - * may result in inconsistent state of a PCC file. i.e. Process - * A got a successful return form the detach IOCTL; Process B - * opens the file before Process A finally closed the IOCTL - * file handle. It makes the following I/O of Process B will - * direct into PCC although the file was already detached from - * the view of Process A. - * Using a dir IOCTL does not exist the problem above. 
- */ OBD_ALLOC_PTR(detach); if (detach == NULL) RETURN(-ENOMEM); diff --git a/lustre/llite/file.c b/lustre/llite/file.c index e340c71..c0941d6 100644 --- a/lustre/llite/file.c +++ b/lustre/llite/file.c @@ -61,6 +61,7 @@ struct split_param { struct pcc_param { __u64 pa_data_version; __u32 pa_archive_id; + __u32 pa_layout_gen; }; static int @@ -237,6 +238,12 @@ static int ll_close_inode_openhandle(struct inode *inode, body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED)) rc = -EBUSY; + + if (bias & MDS_PCC_ATTACH) { + struct pcc_param *param = data; + + param->pa_layout_gen = body->mbo_layout_gen; + } } ll_finish_md_op_data(op_data); @@ -1641,7 +1648,7 @@ static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to) ssize_t result; ssize_t rc2; __u16 refcheck; - bool cached = false; + bool cached; /** * Currently when PCC read failed, we do not fall back to the @@ -1752,22 +1759,23 @@ static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct lu_env *env; ssize_t rc_tiny = 0, rc_normal; __u16 refcheck; - bool cached = false; + bool cached; int result; ENTRY; /** - * When PCC write failed, we do not fall back to the normal - * write path, just return the error. The reason is that: - * PCC is actually a HSM device, and HSM does not handle the - * failure especially -ENOSPC due to space used out; Moreover, - * the fallback to normal I/O path for ENOSPC failure, needs - * to restore the file data to OSTs first and redo the write - * again, making the logic of PCC very complex. + * When PCC write failed, we usually do not fall back to the normal + * write path, just return the error. But there is a special case when + * returned error code is -ENOSPC due to running out of space on PCC HSM + * backend. At this time, it will fall back to normal I/O path and + * retry the I/O. 
As the file is in HSM released state, it will restore + * the file data to OSTs first and redo the write again. And the + * restore process will revoke the layout lock and detach the file + * from PCC cache automatically. */ result = pcc_file_write_iter(iocb, from, &cached); - if (cached) + if (cached && result != -ENOSPC) return result; /* NB: we can't do direct IO for tiny writes because they use the page @@ -1946,23 +1954,22 @@ static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos, struct pipe_inode_info *pipe, size_t count, unsigned int flags) { - struct lu_env *env; - struct vvp_io_args *args; - ssize_t result; - __u16 refcheck; - struct ll_file_data *fd = LUSTRE_FPRIVATE(in_file); - struct file *pcc_file = fd->fd_pcc_file.pccf_file; + struct lu_env *env; + struct vvp_io_args *args; + ssize_t result; + __u16 refcheck; + bool cached; ENTRY; - /* pcc cache path */ - if (pcc_file && file_inode(pcc_file)->i_fop->splice_read) - return file_inode(pcc_file)->i_fop->splice_read(pcc_file, - ppos, pipe, count, flags); + result = pcc_file_splice_read(in_file, ppos, pipe, + count, flags, &cached); + if (cached) + RETURN(result); ll_ras_enter(in_file); - env = cl_env_get(&refcheck); + env = cl_env_get(&refcheck); if (IS_ERR(env)) RETURN(PTR_ERR(env)); @@ -3310,8 +3317,10 @@ out: case LL_LEASE_PCC_ATTACH: if (!rc) rc = rc2; - rc = pcc_readwrite_attach_fini(file, inode, lease_broken, - rc, attached); + rc = pcc_readwrite_attach_fini(file, inode, + param.pa_layout_gen, + lease_broken, rc, + attached); break; } @@ -3836,6 +3845,14 @@ out_ladvise: rc = ll_heat_set(inode, flags); RETURN(rc); } + case LL_IOC_PCC_DETACH: + if (!S_ISREG(inode->i_mode)) + RETURN(-EINVAL); + + if (!inode_owner_or_capable(inode)) + RETURN(-EPERM); + + RETURN(pcc_ioctl_detach(inode)); case LL_IOC_PCC_STATE: { struct lu_pcc_state __user *ustate = (struct lu_pcc_state __user *)arg; @@ -3848,7 +3865,7 @@ out_ladvise: if (copy_from_user(state, ustate, sizeof(*state))) GOTO(out_state, rc = 
-EFAULT); - rc = pcc_ioctl_state(inode, state); + rc = pcc_ioctl_state(file, inode, state); if (rc) GOTO(out_state, rc); @@ -4055,29 +4072,15 @@ int ll_fsync(struct file *file, struct dentry *dentry, int datasync) #endif struct inode *inode = dentry->d_inode; struct ll_inode_info *lli = ll_i2info(inode); - struct ll_file_data *fd = LUSTRE_FPRIVATE(file); struct ptlrpc_request *req; - struct file *pcc_file = fd->fd_pcc_file.pccf_file; int rc, err; + ENTRY; CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n", PFID(ll_inode2fid(inode)), inode); ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1); - /* pcc cache path */ - if (pcc_file) -#ifdef HAVE_FILE_FSYNC_4ARGS - return file_inode(pcc_file)->i_fop->fsync(pcc_file, - start, end, datasync); -#elif defined(HAVE_FILE_FSYNC_2ARGS) - return file_inode(pcc_file)->i_fop->fsync(pcc_file, - datasync); -#else - return file_inode(pcc_file)->i_fop->fsync(pcc_file, - dentry, datasync); -#endif - #ifdef HAVE_FILE_FSYNC_4ARGS rc = filemap_write_and_wait_range(inode->i_mapping, start, end); inode_lock(inode); @@ -4109,8 +4112,15 @@ int ll_fsync(struct file *file, struct dentry *dentry, int datasync) if (S_ISREG(inode->i_mode)) { struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + bool cached; - err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0); + /* Sync metadata on MDT first, and then sync the cached data + * on PCC. + */ + err = pcc_fsync(file, start, end, datasync, &cached); + if (!cached) + err = cl_sync_file_range(inode, start, end, + CL_FSYNC_ALL, 0); if (rc == 0 && err < 0) rc = err; if (rc < 0) @@ -4651,11 +4661,12 @@ int ll_getattr_dentry(struct dentry *de, struct kstat *stat) RETURN(rc); if (S_ISREG(inode->i_mode)) { - bool cached = false; + bool cached; rc = pcc_inode_getattr(inode, &cached); if (cached && rc < 0) RETURN(rc); + /* In case of restore, the MDT has the right size and has * already send it back without granting the layout lock, * inode is up-to-date so glimpse is useless. 
diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h index 5f7aea2..0b31f0f 100644 --- a/lustre/llite/llite_internal.h +++ b/lustre/llite/llite_internal.h @@ -216,6 +216,7 @@ struct ll_inode_info { char lli_jobid[LUSTRE_JOBID_SIZE]; struct mutex lli_pcc_lock; + enum lu_pcc_state_flags lli_pcc_state; struct pcc_inode *lli_pcc_inode; }; }; diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c index 5853551..7f696c8 100644 --- a/lustre/llite/llite_lib.c +++ b/lustre/llite/llite_lib.c @@ -984,6 +984,7 @@ void ll_lli_init(struct ll_inode_info *lli) obd_heat_clear(lli->lli_heat_instances, OBD_HEAT_COUNT); lli->lli_heat_flags = 0; mutex_init(&lli->lli_pcc_lock); + lli->lli_pcc_state = PCC_STATE_FL_NONE; lli->lli_pcc_inode = NULL; } mutex_init(&lli->lli_layout_mutex); @@ -1656,6 +1657,7 @@ int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, struct ll_inode_info *lli = ll_i2info(inode); struct md_op_data *op_data = NULL; int rc = 0; + ENTRY; CDEBUG(D_VFSTRACE, "%s: setattr inode "DFID"(%p) from %llu to %llu, " diff --git a/lustre/llite/llite_mmap.c b/lustre/llite/llite_mmap.c index 65e4af6..e9552ae 100644 --- a/lustre/llite/llite_mmap.c +++ b/lustre/llite/llite_mmap.c @@ -349,17 +349,22 @@ static int ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf) #endif int count = 0; bool printed = false; + bool cached; int result; sigset_t set; + ll_stats_ops_tally(ll_i2sbi(file_inode(vma->vm_file)), + LPROC_LL_FAULT, 1); + + result = pcc_fault(vma, vmf, &cached); + if (cached) + return result; + /* Only SIGKILL and SIGTERM is allowed for fault/nopage/mkwrite * so that it can be killed by admin but not cause segfault by * other signals. 
*/ set = cfs_block_sigsinv(sigmask(SIGKILL) | sigmask(SIGTERM)); - ll_stats_ops_tally(ll_i2sbi(file_inode(vma->vm_file)), - LPROC_LL_FAULT, 1); - /* make sure offset is not a negative number */ if (vmf->pgoff > (MAX_LFS_FILESIZE >> PAGE_SHIFT)) return VM_FAULT_SIGBUS; @@ -403,11 +408,16 @@ static int ll_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) int count = 0; bool printed = false; bool retry; + bool cached; int result; ll_stats_ops_tally(ll_i2sbi(file_inode(vma->vm_file)), LPROC_LL_MKWRITE, 1); + result = pcc_page_mkwrite(vma, vmf, &cached); + if (cached) + return result; + file_update_time(vma->vm_file); do { retry = false; @@ -459,6 +469,7 @@ static void ll_vm_open(struct vm_area_struct * vma) ENTRY; LASSERT(atomic_read(&vob->vob_mmap_cnt) >= 0); atomic_inc(&vob->vob_mmap_cnt); + pcc_vm_open(vma); EXIT; } @@ -473,6 +484,7 @@ static void ll_vm_close(struct vm_area_struct *vma) ENTRY; atomic_dec(&vob->vob_mmap_cnt); LASSERT(atomic_read(&vob->vob_mmap_cnt) >= 0); + pcc_vm_close(vma); EXIT; } @@ -487,7 +499,7 @@ int ll_teardown_mmaps(struct address_space *mapping, __u64 first, __u64 last) if (mapping_mapped(mapping)) { rc = 0; unmap_mapping_range(mapping, first + PAGE_SIZE - 1, - last - first + 1, 0); + last - first + 1, 1); } RETURN(rc); @@ -503,28 +515,26 @@ static const struct vm_operations_struct ll_file_vm_ops = { int ll_file_mmap(struct file *file, struct vm_area_struct * vma) { struct inode *inode = file_inode(file); + bool cached; int rc; - struct ll_file_data *fd = LUSTRE_FPRIVATE(file); - struct file *pcc_file = fd->fd_pcc_file.pccf_file; ENTRY; - /* pcc cache path */ - if (pcc_file) { - vma->vm_file = pcc_file; - return file_inode(pcc_file)->i_fop->mmap(pcc_file, vma); - } - if (ll_file_nolock(file)) RETURN(-EOPNOTSUPP); + rc = pcc_file_mmap(file, vma, &cached); + if (cached && rc != 0) + RETURN(rc); + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_MAP, 1); rc = generic_file_mmap(file, vma); if (rc == 0) { vma->vm_ops = &ll_file_vm_ops; 
vma->vm_ops->open(vma); /* update the inode's size and mtime */ - rc = ll_glimpse_size(inode); + if (!cached) + rc = ll_glimpse_size(inode); } RETURN(rc); diff --git a/lustre/llite/namei.c b/lustre/llite/namei.c index afed81a..e0dc6c5 100644 --- a/lustre/llite/namei.c +++ b/lustre/llite/namei.c @@ -833,10 +833,6 @@ static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry, lum->lmm_magic = LOV_USER_MAGIC_V1; lum->lmm_pattern = LOV_PATTERN_F_RELEASED | LOV_PATTERN_RAID0; - lum->lmm_stripe_size = 0; - lum->lmm_stripe_count = 0; - lum->lmm_stripe_offset = 0; - op_data->op_data = lum; op_data->op_data_size = sizeof(*lum); op_data->op_archive_id = dataset->pccd_id; diff --git a/lustre/llite/pcc.c b/lustre/llite/pcc.c index 2de2715..be3a9db 100644 --- a/lustre/llite/pcc.c +++ b/lustre/llite/pcc.c @@ -382,17 +382,25 @@ static inline void pcc_inode_unlock(struct inode *inode) mutex_unlock(&ll_i2info(inode)->lli_pcc_lock); } -static void pcc_inode_init(struct pcc_inode *pcci) +static void pcc_inode_init(struct pcc_inode *pcci, struct ll_inode_info *lli) { + pcci->pcci_lli = lli; + lli->lli_pcc_inode = pcci; atomic_set(&pcci->pcci_refcount, 0); pcci->pcci_type = LU_PCC_NONE; + pcci->pcci_layout_gen = CL_LAYOUT_GEN_NONE; + atomic_set(&pcci->pcci_active_ios, 0); + init_waitqueue_head(&pcci->pcci_waitq); } static void pcc_inode_fini(struct pcc_inode *pcci) { + struct ll_inode_info *lli = pcci->pcci_lli; + path_put(&pcci->pcci_path); pcci->pcci_type = LU_PCC_NONE; OBD_SLAB_FREE_PTR(pcci, pcc_inode_slab); + lli->lli_pcc_inode = NULL; } static void pcc_inode_get(struct pcc_inode *pcci) @@ -408,13 +416,11 @@ static void pcc_inode_put(struct pcc_inode *pcci) void pcc_inode_free(struct inode *inode) { - struct ll_inode_info *lli = ll_i2info(inode); - struct pcc_inode *pcci = lli->lli_pcc_inode; + struct pcc_inode *pcci = ll_i2pcci(inode); if (pcci) { WARN_ON(atomic_read(&pcci->pcci_refcount) > 1); pcc_inode_put(pcci); - lli->lli_pcc_inode = NULL; } } @@ -444,6 
+450,11 @@ void pcc_file_init(struct pcc_file *pccf) pccf->pccf_type = LU_PCC_NONE; } +static inline bool pcc_inode_has_layout(struct pcc_inode *pcci) +{ + return pcci->pcci_layout_gen != CL_LAYOUT_GEN_NONE; +} + int pcc_file_open(struct inode *inode, struct file *file) { struct pcc_inode *pcci; @@ -464,7 +475,8 @@ int pcc_file_open(struct inode *inode, struct file *file) if (!pcci) GOTO(out_unlock, rc = 0); - if (atomic_read(&pcci->pcci_refcount) == 0) + if (atomic_read(&pcci->pcci_refcount) == 0 || + !pcc_inode_has_layout(pcci)) GOTO(out_unlock, rc = 0); pcc_inode_get(pcci); @@ -522,6 +534,74 @@ void pcc_file_release(struct inode *inode, struct file *file) pccf->pccf_file = NULL; out: pcc_inode_unlock(inode); + RETURN_EXIT; +} + +static inline void pcc_layout_gen_set(struct pcc_inode *pcci, + __u32 gen) +{ + pcci->pcci_layout_gen = gen; +} + +static void pcc_io_init(struct inode *inode, bool *cached) +{ + struct pcc_inode *pcci; + + pcc_inode_lock(inode); + pcci = ll_i2pcci(inode); + if (pcci && pcc_inode_has_layout(pcci)) { + LASSERT(atomic_read(&pcci->pcci_refcount) > 0); + atomic_inc(&pcci->pcci_active_ios); + *cached = true; + } else { + *cached = false; + } + pcc_inode_unlock(inode); +} + +static void pcc_io_fini(struct inode *inode) +{ + struct pcc_inode *pcci = ll_i2pcci(inode); + + LASSERT(pcci && atomic_read(&pcci->pcci_active_ios) > 0); + if (atomic_dec_and_test(&pcci->pcci_active_ios)) + wake_up_all(&pcci->pcci_waitq); +} + + +static ssize_t +__pcc_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + +#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER + return file->f_op->read_iter(iocb, iter); +#else + struct iovec iov; + struct iov_iter i; + ssize_t bytes = 0; + + iov_for_each(iov, i, *iter) { + ssize_t res; + + res = file->f_op->aio_read(iocb, &iov, 1, iocb->ki_pos); + if (-EIOCBQUEUED == res) + res = wait_on_sync_kiocb(iocb); + if (res <= 0) { + if (bytes == 0) + bytes = res; + break; + } + + bytes += res; + 
if (res < iov.iov_len) + break; + } + + if (bytes > 0) + iov_iter_advance(iter, bytes); + return bytes; +#endif } ssize_t pcc_file_read_iter(struct kiocb *iocb, @@ -530,6 +610,7 @@ ssize_t pcc_file_read_iter(struct kiocb *iocb, struct file *file = iocb->ki_filp; struct ll_file_data *fd = LUSTRE_FPRIVATE(file); struct pcc_file *pccf = &fd->fd_pcc_file; + struct inode *inode = file_inode(file); ssize_t result; ENTRY; @@ -538,12 +619,20 @@ ssize_t pcc_file_read_iter(struct kiocb *iocb, *cached = false; RETURN(0); } - *cached = true; - iocb->ki_filp = pccf->pccf_file; - result = generic_file_read_iter(iocb, iter); + pcc_io_init(inode, cached); + if (!*cached) + RETURN(0); + + iocb->ki_filp = pccf->pccf_file; + /* generic_file_aio_read does not support ext4-dax, + * __pcc_file_read_iter uses ->aio_read hook directly + * to add support for ext4-dax. + */ + result = __pcc_file_read_iter(iocb, iter); iocb->ki_filp = file; + pcc_io_fini(inode); RETURN(result); } @@ -588,6 +677,7 @@ ssize_t pcc_file_write_iter(struct kiocb *iocb, struct file *file = iocb->ki_filp; struct ll_file_data *fd = LUSTRE_FPRIVATE(file); struct pcc_file *pccf = &fd->fd_pcc_file; + struct inode *inode = file_inode(file); ssize_t result; ENTRY; @@ -596,10 +686,18 @@ ssize_t pcc_file_write_iter(struct kiocb *iocb, *cached = false; RETURN(0); } - *cached = true; - if (pccf->pccf_type != LU_PCC_READWRITE) - RETURN(-EWOULDBLOCK); + if (pccf->pccf_type != LU_PCC_READWRITE) { + *cached = false; + RETURN(-EAGAIN); + } + + pcc_io_init(inode, cached); + if (!*cached) + RETURN(0); + + if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_PCC_FAKE_ERROR)) + GOTO(out, result = -ENOSPC); iocb->ki_filp = pccf->pccf_file; @@ -609,6 +707,8 @@ ssize_t pcc_file_write_iter(struct kiocb *iocb, */ result = __pcc_file_write_iter(iocb, iter); iocb->ki_filp = file; +out: + pcc_io_fini(inode); RETURN(result); } @@ -616,9 +716,9 @@ int pcc_inode_setattr(struct inode *inode, struct iattr *attr, bool *cached) { int rc; - struct pcc_inode *pcci; 
struct iattr attr2 = *attr; struct dentry *pcc_dentry; + struct pcc_inode *pcci; ENTRY; @@ -627,28 +727,26 @@ int pcc_inode_setattr(struct inode *inode, struct iattr *attr, RETURN(0); } - pcc_inode_lock(inode); - pcci = ll_i2pcci(inode); - if (pcci == NULL || atomic_read(&pcci->pcci_refcount) == 0) - GOTO(out_unlock, rc = 0); + pcc_io_init(inode, cached); + if (!*cached) + RETURN(0); - *cached = true; attr2.ia_valid = attr->ia_valid & (ATTR_SIZE | ATTR_ATIME | ATTR_ATIME_SET | ATTR_MTIME | ATTR_MTIME_SET | ATTR_CTIME); + pcci = ll_i2pcci(inode); pcc_dentry = pcci->pcci_path.dentry; inode_lock(pcc_dentry->d_inode); rc = pcc_dentry->d_inode->i_op->setattr(pcc_dentry, &attr2); inode_unlock(pcc_dentry->d_inode); -out_unlock: - pcc_inode_unlock(inode); + + pcc_io_fini(inode); RETURN(rc); } int pcc_inode_getattr(struct inode *inode, bool *cached) { struct ll_inode_info *lli = ll_i2info(inode); - struct pcc_inode *pcci; struct kstat stat; s64 atime; s64 mtime; @@ -662,15 +760,13 @@ int pcc_inode_getattr(struct inode *inode, bool *cached) RETURN(0); } - pcc_inode_lock(inode); - pcci = ll_i2pcci(inode); - if (pcci == NULL || atomic_read(&pcci->pcci_refcount) == 0) - GOTO(out_unlock, rc = 0); + pcc_io_init(inode, cached); + if (!*cached) + RETURN(0); - *cached = true; - rc = ll_vfs_getattr(&pcci->pcci_path, &stat); + rc = ll_vfs_getattr(&ll_i2pcci(inode)->pcci_path, &stat); if (rc) - GOTO(out_unlock, rc); + GOTO(out, rc); ll_inode_size_lock(inode); if (inode->i_atime.tv_sec < lli->lli_atime || @@ -702,12 +798,311 @@ int pcc_inode_getattr(struct inode *inode, bool *cached) inode->i_ctime.tv_sec = ctime; ll_inode_size_unlock(inode); +out: + pcc_io_fini(inode); + RETURN(rc); +} -out_unlock: +ssize_t pcc_file_splice_read(struct file *in_file, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t count, unsigned int flags, + bool *cached) +{ + struct inode *inode = file_inode(in_file); + struct ll_file_data *fd = LUSTRE_FPRIVATE(in_file); + struct file *pcc_file = 
fd->fd_pcc_file.pccf_file; + ssize_t result; + + ENTRY; + + *cached = false; + if (!pcc_file) + RETURN(0); + + if (!file_inode(pcc_file)->i_fop->splice_read) + RETURN(-ENOTSUPP); + + pcc_io_init(inode, cached); + if (!*cached) + RETURN(0); + + result = file_inode(pcc_file)->i_fop->splice_read(pcc_file, + ppos, pipe, count, + flags); + + pcc_io_fini(inode); + RETURN(result); +} + +int pcc_fsync(struct file *file, loff_t start, loff_t end, + int datasync, bool *cached) +{ + struct inode *inode = file_inode(file); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct file *pcc_file = fd->fd_pcc_file.pccf_file; + int rc; + + ENTRY; + + if (!pcc_file) { + *cached = false; + RETURN(0); + } + + pcc_io_init(inode, cached); + if (!*cached) + RETURN(0); + +#ifdef HAVE_FILE_FSYNC_4ARGS + rc = file_inode(pcc_file)->i_fop->fsync(pcc_file, + start, end, datasync); +#elif defined(HAVE_FILE_FSYNC_2ARGS) + rc = file_inode(pcc_file)->i_fop->fsync(pcc_file, datasync); +#else + rc = file_inode(pcc_file)->i_fop->fsync(pcc_file, + file_dentry(dentry), datasync); +#endif + + pcc_io_fini(inode); + RETURN(rc); +} + +int pcc_file_mmap(struct file *file, struct vm_area_struct *vma, + bool *cached) +{ + struct inode *inode = file_inode(file); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct file *pcc_file = fd->fd_pcc_file.pccf_file; + struct pcc_inode *pcci; + int rc = 0; + + ENTRY; + + if (!pcc_file || !file_inode(pcc_file)->i_fop->mmap) { + *cached = false; + RETURN(0); + } + + pcc_inode_lock(inode); + pcci = ll_i2pcci(inode); + if (pcci && pcc_inode_has_layout(pcci)) { + LASSERT(atomic_read(&pcci->pcci_refcount) > 1); + *cached = true; + vma->vm_file = pcc_file; + rc = file_inode(pcc_file)->i_fop->mmap(pcc_file, vma); + vma->vm_file = file; + /* Save the vm ops of backend PCC */ + vma->vm_private_data = (void *)vma->vm_ops; + } else { + *cached = false; + } + pcc_inode_unlock(inode); + + RETURN(rc); +} + +void pcc_vm_open(struct vm_area_struct *vma) +{ + struct pcc_inode 
*pcci; + struct file *file = vma->vm_file; + struct inode *inode = file_inode(file); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct file *pcc_file = fd->fd_pcc_file.pccf_file; + struct vm_operations_struct *pcc_vm_ops = vma->vm_private_data; + + ENTRY; + + if (!pcc_file || !pcc_vm_ops || !pcc_vm_ops->open) + RETURN_EXIT; + + pcc_inode_lock(inode); + pcci = ll_i2pcci(inode); + if (pcci && pcc_inode_has_layout(pcci)) { + vma->vm_file = pcc_file; + pcc_vm_ops->open(vma); + vma->vm_file = file; + } + pcc_inode_unlock(inode); + EXIT; +} + +void pcc_vm_close(struct vm_area_struct *vma) +{ + struct file *file = vma->vm_file; + struct inode *inode = file_inode(file); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct file *pcc_file = fd->fd_pcc_file.pccf_file; + struct vm_operations_struct *pcc_vm_ops = vma->vm_private_data; + + ENTRY; + + if (!pcc_file || !pcc_vm_ops || !pcc_vm_ops->close) + RETURN_EXIT; + + pcc_inode_lock(inode); + /* Layout lock maybe revoked here */ + vma->vm_file = pcc_file; + pcc_vm_ops->close(vma); + vma->vm_file = file; pcc_inode_unlock(inode); + EXIT; +} + +int pcc_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, + bool *cached) +{ + struct page *page = vmf->page; + struct mm_struct *mm = vma->vm_mm; + struct file *file = vma->vm_file; + struct inode *inode = file_inode(file); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct file *pcc_file = fd->fd_pcc_file.pccf_file; + struct vm_operations_struct *pcc_vm_ops = vma->vm_private_data; + int rc; + + ENTRY; + + if (!pcc_file || !pcc_vm_ops || !pcc_vm_ops->page_mkwrite) { + *cached = false; + RETURN(0); + } + + /* Pause to allow for a race with concurrent detach */ + OBD_FAIL_TIMEOUT(OBD_FAIL_LLITE_PCC_MKWRITE_PAUSE, cfs_fail_val); + + pcc_io_init(inode, cached); + if (!*cached) { + /* This happens when the file is detached from PCC after got + * the fault page via ->fault() on the inode of the PCC copy. 
+ * Here it can not simply fall back to normal Lustre I/O path. + * The reason is that the address space of fault page used by + * ->page_mkwrite() is still the one of PCC inode. In the + * normal Lustre ->page_mkwrite() I/O path, it will be wrongly + * handled as the address space of the fault page is not + * consistent with the one of the Lustre inode (though the + * fault page was truncated). + * As the file is detached from PCC, the fault page must + * be released first, and retry the mmap write (->fault() and + * ->page_mkwrite). + * We use an ugly and tricky method by returning + * VM_FAULT_NOPAGE | VM_FAULT_RETRY to the caller + * __do_page_fault and retry the memory fault handling. + */ + if (page->mapping == file_inode(pcc_file)->i_mapping) { + *cached = true; + up_read(&mm->mmap_sem); + RETURN(VM_FAULT_RETRY | VM_FAULT_NOPAGE); + } + + RETURN(0); + } + + /* + * This fault injection can also be used to simulate -ENOSPC and + * -EDQUOT failure of underlying PCC backend fs. + */ + if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_PCC_DETACH_MKWRITE)) { + pcc_io_fini(inode); + pcc_ioctl_detach(inode); + up_read(&mm->mmap_sem); + RETURN(VM_FAULT_RETRY | VM_FAULT_NOPAGE); + } + + vma->vm_file = pcc_file; +#ifdef HAVE_VM_OPS_USE_VM_FAULT_ONLY + rc = pcc_vm_ops->page_mkwrite(vmf); +#else + rc = pcc_vm_ops->page_mkwrite(vma, vmf); +#endif + vma->vm_file = file; + + pcc_io_fini(inode); + RETURN(rc); +} + +int pcc_fault(struct vm_area_struct *vma, struct vm_fault *vmf, + bool *cached) +{ + struct file *file = vma->vm_file; + struct inode *inode = file_inode(file); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct file *pcc_file = fd->fd_pcc_file.pccf_file; + struct vm_operations_struct *pcc_vm_ops = vma->vm_private_data; + int rc; + + ENTRY; + + if (!pcc_file || !pcc_vm_ops || !pcc_vm_ops->fault) { + *cached = false; + RETURN(0); + } + + pcc_io_init(inode, cached); + if (!*cached) + RETURN(0); + + vma->vm_file = pcc_file; +#ifdef HAVE_VM_OPS_USE_VM_FAULT_ONLY + rc =
pcc_vm_ops->fault(vmf); +#else + rc = pcc_vm_ops->fault(vma, vmf); +#endif + vma->vm_file = file; + + pcc_io_fini(inode); RETURN(rc); } +static void pcc_layout_wait(struct pcc_inode *pcci) +{ + struct l_wait_info lwi = { 0 }; + + while (atomic_read(&pcci->pcci_active_ios) > 0) { + CDEBUG(D_CACHE, "Waiting for IO completion: %d\n", + atomic_read(&pcci->pcci_active_ios)); + l_wait_event(pcci->pcci_waitq, + atomic_read(&pcci->pcci_active_ios) == 0, &lwi); + } +} + +static void __pcc_layout_invalidate(struct pcc_inode *pcci) +{ + pcci->pcci_type = LU_PCC_NONE; + pcc_layout_gen_set(pcci, CL_LAYOUT_GEN_NONE); + pcc_layout_wait(pcci); +} + +void pcc_layout_invalidate(struct inode *inode) +{ + struct pcc_inode *pcci; + + pcc_inode_lock(inode); + pcci = ll_i2pcci(inode); + if (pcci && pcc_inode_has_layout(pcci)) { + LASSERT(atomic_read(&pcci->pcci_refcount) > 0); + __pcc_layout_invalidate(pcci); + + CDEBUG(D_CACHE, "Invalidate "DFID" layout gen %d\n", + PFID(&ll_i2info(inode)->lli_fid), pcci->pcci_layout_gen); + + pcc_inode_put(pcci); + } + pcc_inode_unlock(inode); +} + +static int pcc_inode_remove(struct pcc_inode *pcci) +{ + struct dentry *dentry; + int rc; + + dentry = pcci->pcci_path.dentry; + rc = ll_vfs_unlink(dentry->d_parent->d_inode, dentry); + if (rc) + CWARN("failed to unlink cached file, rc = %d\n", rc); + + return rc; +} + /* Create directory under base if directory does not exist */ static struct dentry * pcc_mkdir(struct dentry *base, const char *name, umode_t mode) @@ -752,9 +1147,10 @@ pcc_mkdir_p(struct dentry *root, char *path, umode_t mode) *ptr = '\0'; child = pcc_mkdir(parent, entry_name, mode); *ptr = '/'; + dput(parent); if (IS_ERR(child)) break; - dput(parent); + parent = child; ptr++; entry_name = ptr; @@ -849,23 +1245,35 @@ int pcc_inode_create(struct pcc_dataset *dataset, struct lu_fid *fid, int pcc_inode_create_fini(struct pcc_dataset *dataset, struct inode *inode, struct dentry *pcc_dentry) { - struct ll_inode_info *lli = ll_i2info(inode); 
struct pcc_inode *pcci; + int rc = 0; ENTRY; + pcc_inode_lock(inode); LASSERT(ll_i2pcci(inode) == NULL); OBD_SLAB_ALLOC_PTR_GFP(pcci, pcc_inode_slab, GFP_NOFS); if (pcci == NULL) - RETURN(-ENOMEM); + GOTO(out_unlock, rc = -ENOMEM); - pcc_inode_init(pcci); - pcc_inode_lock(inode); + pcc_inode_init(pcci, ll_i2info(inode)); pcc_inode_attach_init(dataset, pcci, pcc_dentry, LU_PCC_READWRITE); - lli->lli_pcc_inode = pcci; - pcc_inode_unlock(inode); + /* Set the layout generation of newly created file with 0 */ + pcc_layout_gen_set(pcci, 0); + +out_unlock: + if (rc) { + int rc2; - RETURN(0); + rc2 = ll_vfs_unlink(pcc_dentry->d_parent->d_inode, pcc_dentry); + if (rc2) + CWARN("failed to unlink PCC file, rc = %d\n", rc2); + + dput(pcc_dentry); + } + + pcc_inode_unlock(inode); + RETURN(rc); } static int pcc_filp_write(struct file *filp, const void *buf, ssize_t count, @@ -921,6 +1329,28 @@ out_fs: RETURN(rc); } +static int pcc_attach_allowed_check(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct pcc_inode *pcci; + int rc = 0; + + ENTRY; + + pcc_inode_lock(inode); + if (lli->lli_pcc_state & PCC_STATE_FL_ATTACHING) + GOTO(out_unlock, rc = -EBUSY); + + pcci = ll_i2pcci(inode); + if (pcci && pcc_inode_has_layout(pcci)) + GOTO(out_unlock, rc = -EEXIST); + + lli->lli_pcc_state |= PCC_STATE_FL_ATTACHING; +out_unlock: + pcc_inode_unlock(inode); + RETURN(rc); +} + int pcc_readwrite_attach(struct file *file, struct inode *inode, __u32 archive_id) { @@ -934,26 +1364,14 @@ int pcc_readwrite_attach(struct file *file, struct inode *inode, ENTRY; - pcc_inode_lock(inode); - pcci = ll_i2pcci(inode); - if (pcci == NULL) { - OBD_SLAB_ALLOC_PTR_GFP(pcci, pcc_inode_slab, GFP_NOFS); - if (pcci == NULL) { - pcc_inode_unlock(inode); - RETURN(-ENOMEM); - } - - pcc_inode_init(pcci); - } else if (atomic_read(&pcci->pcci_refcount) > 0) { - pcc_inode_unlock(inode); - RETURN(-EEXIST); - } - pcc_inode_unlock(inode); + rc = pcc_attach_allowed_check(inode); + if (rc) + 
RETURN(rc); dataset = pcc_dataset_get(&ll_i2sbi(inode)->ll_pcc_super, 0, archive_id); if (dataset == NULL) - GOTO(out_free_pcci, rc = -ENOENT); + RETURN(-ENOENT); rc = __pcc_inode_create(dataset, &lli->lli_fid, &dentry); if (rc) @@ -978,74 +1396,116 @@ int pcc_readwrite_attach(struct file *file, struct inode *inode, if (rc) GOTO(out_fput, rc); + /* Pause to allow for a race with concurrent HSM remove */ + OBD_FAIL_TIMEOUT(OBD_FAIL_LLITE_PCC_ATTACH_PAUSE, cfs_fail_val); + pcc_inode_lock(inode); - if (lli->lli_pcc_inode) - GOTO(out_unlock, rc = -EEXIST); + pcci = ll_i2pcci(inode); + LASSERT(!pcci); + OBD_SLAB_ALLOC_PTR_GFP(pcci, pcc_inode_slab, GFP_NOFS); + if (pcci == NULL) + GOTO(out_unlock, rc = -ENOMEM); + + pcc_inode_init(pcci, lli); pcc_inode_attach_init(dataset, pcci, dentry, LU_PCC_READWRITE); - lli->lli_pcc_inode = pcci; out_unlock: pcc_inode_unlock(inode); out_fput: fput(pcc_filp); out_dentry: - if (rc) + if (rc) { + int rc2; + + rc2 = ll_vfs_unlink(dentry->d_parent->d_inode, dentry); + if (rc2) + CWARN("failed to unlink PCC file, rc = %d\n", rc2); + dput(dentry); + } out_dataset_put: pcc_dataset_put(dataset); -out_free_pcci: - if (rc) - OBD_SLAB_FREE_PTR(pcci, pcc_inode_slab); RETURN(rc); - } int pcc_readwrite_attach_fini(struct file *file, struct inode *inode, - bool lease_broken, int rc, bool attached) + __u32 gen, bool lease_broken, int rc, + bool attached) { - struct pcc_inode *pcci = ll_i2pcci(inode); + struct ll_inode_info *lli = ll_i2info(inode); + struct pcc_inode *pcci; + __u32 gen2; ENTRY; - if ((rc || lease_broken) && attached && pcci) - pcc_inode_put(pcci); + pcc_inode_lock(inode); + pcci = ll_i2pcci(inode); + lli->lli_pcc_state &= ~PCC_STATE_FL_ATTACHING; + if ((rc || lease_broken)) { + if (attached && pcci) + pcc_inode_put(pcci); + GOTO(out_unlock, rc); + } + + /* PCC inode may be released due to layout lock revocation */ + if (!pcci) + GOTO(out_unlock, rc = -ESTALE); + + LASSERT(attached); + rc = ll_layout_refresh(inode, &gen2); + if (!rc)
{ + if (gen2 == gen) { + pcc_layout_gen_set(pcci, gen); + } else { + CDEBUG(D_CACHE, + DFID" layout changed from %d to %d.\n", + PFID(ll_inode2fid(inode)), gen, gen2); + GOTO(out_put, rc = -ESTALE); + } + } + +out_put: + if (rc) { + pcc_inode_remove(pcci); + pcc_inode_put(pcci); + } +out_unlock: + pcc_inode_unlock(inode); RETURN(rc); } int pcc_ioctl_detach(struct inode *inode) { struct ll_inode_info *lli = ll_i2info(inode); - struct pcc_inode *pcci = lli->lli_pcc_inode; + struct pcc_inode *pcci; int rc = 0; - int count; ENTRY; pcc_inode_lock(inode); - if (pcci == NULL) - GOTO(out_unlock, rc = 0); - - count = atomic_read(&pcci->pcci_refcount); - if (count > 1) - GOTO(out_unlock, rc = -EBUSY); - else if (count == 0) + pcci = lli->lli_pcc_inode; + if (!pcci || lli->lli_pcc_state & PCC_STATE_FL_ATTACHING || + !pcc_inode_has_layout(pcci)) GOTO(out_unlock, rc = 0); + __pcc_layout_invalidate(pcci); pcc_inode_put(pcci); - lli->lli_pcc_inode = NULL; + out_unlock: pcc_inode_unlock(inode); - RETURN(rc); } -int pcc_ioctl_state(struct inode *inode, struct lu_pcc_state *state) +int pcc_ioctl_state(struct file *file, struct inode *inode, + struct lu_pcc_state *state) { int rc = 0; int count; char *buf; char *path; int buf_len = sizeof(state->pccs_path); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct pcc_file *pccf = &fd->fd_pcc_file; struct pcc_inode *pcci; ENTRY; @@ -1067,12 +1527,17 @@ int pcc_ioctl_state(struct inode *inode, struct lu_pcc_state *state) count = atomic_read(&pcci->pcci_refcount); if (count == 0) { state->pccs_type = LU_PCC_NONE; + state->pccs_open_count = 0; GOTO(out_unlock, rc = 0); } + + if (pcc_inode_has_layout(pcci)) + count--; + if (pccf->pccf_file != NULL) + count--; state->pccs_type = pcci->pcci_type; - state->pccs_open_count = count - 1; - state->pccs_flags = pcci->pcci_attr_valid ? 
- PCC_STATE_FLAG_ATTR_VALID : 0; + state->pccs_open_count = count; + state->pccs_flags = ll_i2info(inode)->lli_pcc_state; #ifdef HAVE_DENTRY_PATH_RAW path = dentry_path_raw(pcci->pcci_path.dentry, buf, buf_len); if (IS_ERR(path)) diff --git a/lustre/llite/pcc.h b/lustre/llite/pcc.h index 7d3e8b4..d79fe19 100644 --- a/lustre/llite/pcc.h +++ b/lustre/llite/pcc.h @@ -35,6 +35,7 @@ #include #include #include +#include #include extern struct kmem_cache *pcc_inode_slab; @@ -56,17 +57,27 @@ struct pcc_super { }; struct pcc_inode { + struct ll_inode_info *pcci_lli; /* Cache path on local file system */ - struct path pcci_path; + struct path pcci_path; /* * If reference count is 0, then the cache is not inited, if 1, then * no one is using it. */ - atomic_t pcci_refcount; + atomic_t pcci_refcount; /* Whether readonly or readwrite PCC */ - enum lu_pcc_type pcci_type; - /* Whether the inode is cached locally */ - bool pcci_attr_valid; + enum lu_pcc_type pcci_type; + /* Whether the inode attr is cached locally */ + bool pcci_attr_valid; + /* Layout generation */ + __u32 pcci_layout_gen; + /* + * How many IOs are on going on this cached object. Layout can be + * changed only if there is no active IO. + */ + atomic_t pcci_active_ios; + /* Waitq - wait for PCC I/O completion. 
*/ + wait_queue_head_t pcci_waitq; }; struct pcc_file { @@ -100,14 +111,15 @@ void pcc_super_init(struct pcc_super *super); void pcc_super_fini(struct pcc_super *super); int pcc_cmd_handle(char *buffer, unsigned long count, struct pcc_super *super); -int -pcc_super_dump(struct pcc_super *super, struct seq_file *m); -int pcc_readwrite_attach(struct file *file, - struct inode *inode, __u32 arch_id); +int pcc_super_dump(struct pcc_super *super, struct seq_file *m); +int pcc_readwrite_attach(struct file *file, struct inode *inode, + __u32 arch_id); int pcc_readwrite_attach_fini(struct file *file, struct inode *inode, - bool lease_broken, int rc, bool attached); + __u32 gen, bool lease_broken, int rc, + bool attached); int pcc_ioctl_detach(struct inode *inode); -int pcc_ioctl_state(struct inode *inode, struct lu_pcc_state *state); +int pcc_ioctl_state(struct file *file, struct inode *inode, + struct lu_pcc_state *state); void pcc_file_init(struct pcc_file *pccf); int pcc_file_open(struct inode *inode, struct file *file); void pcc_file_release(struct inode *inode, struct file *file); @@ -117,12 +129,25 @@ ssize_t pcc_file_write_iter(struct kiocb *iocb, struct iov_iter *iter, bool *cached); int pcc_inode_getattr(struct inode *inode, bool *cached); int pcc_inode_setattr(struct inode *inode, struct iattr *attr, bool *cached); +ssize_t pcc_file_splice_read(struct file *in_file, loff_t *ppos, + struct pipe_inode_info *pipe, size_t count, + unsigned int flags, bool *cached); +int pcc_fsync(struct file *file, loff_t start, loff_t end, + int datasync, bool *cached); +int pcc_file_mmap(struct file *file, struct vm_area_struct *vma, bool *cached); +void pcc_vm_open(struct vm_area_struct *vma); +void pcc_vm_close(struct vm_area_struct *vma); +int pcc_fault(struct vm_area_struct *mva, struct vm_fault *vmf, bool *cached); +int pcc_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, + bool *cached); int pcc_inode_create(struct pcc_dataset *dataset, struct lu_fid *fid, 
struct dentry **pcc_dentry); int pcc_inode_create_fini(struct pcc_dataset *dataset, struct inode *inode, struct dentry *pcc_dentry); -struct pcc_dataset * -pcc_dataset_get(struct pcc_super *super, __u32 projid, __u32 archive_id); +struct pcc_dataset *pcc_dataset_get(struct pcc_super *super, __u32 projid, + __u32 archive_id); void pcc_dataset_put(struct pcc_dataset *dataset); void pcc_inode_free(struct inode *inode); +void pcc_layout_invalidate(struct inode *inode); + #endif /* LLITE_PCC_H */ diff --git a/lustre/llite/vvp_object.c b/lustre/llite/vvp_object.c index c3bf715..7412a06 100644 --- a/lustre/llite/vvp_object.c +++ b/lustre/llite/vvp_object.c @@ -150,7 +150,8 @@ static int vvp_conf_set(const struct lu_env *env, struct cl_object *obj, * This operation is expensive but mmap processes have to pay * a price themselves. */ unmap_mapping_range(conf->coc_inode->i_mapping, - 0, OBD_OBJECT_EOF, 0); + 0, OBD_OBJECT_EOF, 1); + pcc_layout_invalidate(conf->coc_inode); } return 0; } diff --git a/lustre/mdt/mdt_open.c b/lustre/mdt/mdt_open.c index bd20a5f..fe0cb34 100644 --- a/lustre/mdt/mdt_open.c +++ b/lustre/mdt/mdt_open.c @@ -1746,6 +1746,22 @@ static inline int mdt_hsm_set_released(struct lov_mds_md *lmm) return 0; } +static inline int mdt_get_lmm_gen(struct lov_mds_md *lmm, __u32 *gen) +{ + struct lov_comp_md_v1 *comp_v1; + + if (le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC_COMP_V1) { + comp_v1 = (struct lov_comp_md_v1 *)lmm; + *gen = le32_to_cpu(comp_v1->lcm_layout_gen); + } else if (le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC_V1 || + le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC_V3) { + *gen = le16_to_cpu(lmm->lmm_layout_gen); + } else { + return -EINVAL; + } + return 0; +} + static int mdt_hsm_release(struct mdt_thread_info *info, struct mdt_object *o, struct md_attr *ma) { @@ -1827,6 +1843,9 @@ static int mdt_hsm_release(struct mdt_thread_info *info, struct mdt_object *o, ma->ma_hsm.mh_arch_ver); ma->ma_hsm.mh_flags = HS_ARCHIVED | HS_EXISTS; } + + if (ma->ma_hsm.mh_flags &
HS_DIRTY) + ma->ma_hsm.mh_flags = HS_ARCHIVED | HS_EXISTS; } else { /* Set up HSM attribte for PCC archived object */ CLASSERT(sizeof(struct hsm_attrs) <= @@ -1957,6 +1976,12 @@ static int mdt_hsm_release(struct mdt_thread_info *info, struct mdt_object *o, rc = mo_swap_layouts(info->mti_env, mdt_object_child(o), mdt_object_child(orphan), SWAP_LAYOUTS_MDS_HSM); + + if (!rc && ma->ma_attr_flags & MDS_PCC_ATTACH) { + ma->ma_need = MA_LOV; + rc = mdt_attr_get_complex(info, o, ma); + } + EXIT; out_layout_lock: @@ -1983,6 +2008,13 @@ out_unlock: repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY); LASSERT(repbody != NULL); repbody->mbo_valid |= OBD_MD_CLOSE_INTENT_EXECED; + if (ma->ma_attr_flags & MDS_PCC_ATTACH) { + LASSERT(ma->ma_valid & MA_LOV); + rc = mdt_get_lmm_gen(ma->ma_lmm, + &repbody->mbo_layout_gen); + if (!rc) + repbody->mbo_valid |= OBD_MD_LAYOUT_VERSION; + } } out_reprocess: diff --git a/lustre/tests/multiop.c b/lustre/tests/multiop.c index 6f4169c..c361ae2 100644 --- a/lustre/tests/multiop.c +++ b/lustre/tests/multiop.c @@ -213,7 +213,7 @@ int main(int argc, char **argv) struct stat st; struct statfs stfs; size_t mmap_len = 0, i; - unsigned char *mmap_ptr = NULL, junk = 0; + unsigned char *mmap_ptr = NULL, junk = 1; int len, fd = -1; int flags; int save_errno; diff --git a/lustre/tests/sanity-pcc.sh b/lustre/tests/sanity-pcc.sh index 205c0eb..ed2ec57 100644 --- a/lustre/tests/sanity-pcc.sh +++ b/lustre/tests/sanity-pcc.sh @@ -26,6 +26,7 @@ init_logging MULTIOP=${MULTIOP:-multiop} OPENFILE=${OPENFILE:-openfile} +MMAP_CAT=${MMAP_CAT:-mmap_cat} MOUNT_2=${MOUNT_2:-"yes"} FAIL_ON_ERROR=false @@ -133,7 +134,8 @@ check_lpcc_state() { local lustre_path="$1" local expected_state="$2" - local state=$(do_facet $SINGLEAGT $LFS pcc state $lustre_path | + local facet=${3:-$SINGLEAGT} + local state=$(do_facet $facet $LFS pcc state $lustre_path | awk -F 'type: ' '{print $2}' | awk -F ',' '{print $1}') [[ "x$state" == "x$expected_state" ]] || error \ @@ -163,15 
+165,19 @@ cdt_set_sanity_policy set_hsm_param grace_delay 10 cleanup_pcc_mapping() { - do_facet $SINGLEAGT $LCTL pcc clear $MOUNT + local facet=${1:-$SINGLEAGT} + + do_facet $facet $LCTL pcc clear $MOUNT } setup_pcc_mapping() { - local hsm_root=$(hsm_root) + local facet=${1:-$SINGLEAGT} + local hsm_root=${hsm_root:-$(hsm_root "$facet")} + local param="$2" - cleanup_pcc_mapping - do_facet $SINGLEAGT $LCTL pcc add $MOUNT $hsm_root \ - -p "$HSM_ARCHIVE_NUMBER\ 100" + [ -z "$param" ] && param="$HSM_ARCHIVE_NUMBER\ 100" + cleanup_pcc_mapping $facet + do_facet $facet $LCTL pcc add $MOUNT $hsm_root -p $param } lpcc_rw_test() { @@ -257,6 +263,7 @@ lpcc_rw_test() { # HSM released exists archived status check_hsm_flags $file "0x0000000d" do_facet $SINGLEAGT "echo -n attach_detach > $file" + echo "Start to detach the $file" do_facet $SINGLEAGT $LFS pcc detach $file || error "PCC detach $file failed" check_lpcc_state $file "none" @@ -287,12 +294,11 @@ test_1d() { } run_test 1d "Test Project ID with remote access" - # # When a process created a LPCC file and holding the open, # another process on the same client should be able to open the file. # -test_2() { +test_2a() { local project_id=100 local agt_facet=$SINGLEAGT local hsm_root=$(hsm_root) @@ -326,9 +332,101 @@ test_2() { rmultiop_stop $agt_host || error "close $file failed" cleanup_pcc_mapping } -run_test 2 "Test multi open when creating" +run_test 2a "Test multi open when creating" + +get_remote_client() { + current_id=$(do_facet $SINGLEAGT hostname) + for client in ${CLIENTS//,/ } + do + r_id=$(do_node $client hostname) + if [ $r_id != $current_id ]; then + echo $client + return + fi + done +} -test_3() { +# +# When a process created a LPCC file and holding the open, another +# process on the different client should be able to open the file +# and perform IO on the file. 
+# +test_2b() { + local agt_facet=$SINGLEAGT + local hsm_root=$(hsm_root) + local agt_host=$(facet_active_host $SINGLEAGT) + + needclients 2 || return 0 + + remote_client=$(get_remote_client) + + enable_project_quota + copytool setup -m "$MOUNT" -a "$HSM_ARCHIVE_NUMBER" + setup_pcc_mapping + file=$DIR/$tdir/multiop + mkdir -p $DIR/$tdir + rm -f $file + + do_facet $SINGLEAGT "echo -n file_data > $file" + do_facet $SINGLEAGT lfs pcc attach -i $HSM_ARCHIVE_NUMBER \ + $file || error "PCC attach $file failed" + check_lpcc_state $file "readwrite" + + rmultiop_start $agt_host $file O_c || error "open $file failed" + + do_node $remote_client "echo -n multiopen_data > $file" + + # PCC cached file should be automatically detached + check_lpcc_state $file "none" + + check_file_data $SINGLEAGT $file "multiopen_data" + rmultiop_stop $agt_host || error "close $file failed" + check_file_data $SINGLEAGT $file "multiopen_data" + + do_node $remote_client cat $file || error \ + "cat $file on remote client failed" + do_node $remote_client echo -n "multiopen_data" > $file \ + || error "write $file on remote client failed" + cleanup_pcc_mapping +} +run_test 2b "Test multi remote open when creating" + +test_2c() { + local agt_host=$(facet_active_host $SINGLEAGT) + local file=$DIR/$tdir/$tfile + local file2=$DIR2/$tdir/$tfile + + enable_project_quota + copytool setup -m "$MOUNT" -a "$HSM_ARCHIVE_NUMBER" + setup_pcc_mapping + mkdir -p $DIR/$tdir + rm -f $file + + do_facet $SINGLEAGT "echo -n file_data > $file" + do_facet $SINGLEAGT lfs pcc attach -i $HSM_ARCHIVE_NUMBER \ + $file || error "PCC attach $file failed" + check_lpcc_state $file "readwrite" + + rmultiop_start $agt_host $file O_c || error "open $file failed" + + echo -n multiopen_data > $file2 + + # PCC cached file should be automatically detached + check_lpcc_state $file "none" + + check_file_data $SINGLEAGT $file "multiopen_data" + rmultiop_stop $agt_host || error "close $file failed" + check_file_data $SINGLEAGT $file 
"multiopen_data" + + cat $file2 || error "cat $file on mount $MOUNT2 failed" + echo -n "multiopen_data" > $file2 || + error "write $file on mount $MOUNT2 failed" + + cleanup_pcc_mapping +} +run_test 2c "Test multi open on different mount points when creating" + +test_3a() { local file=$DIR/$tdir/$tfile copytool setup -m "$MOUNT" -a "$HSM_ARCHIVE_NUMBER" @@ -356,7 +454,364 @@ test_3() { cleanup_pcc_mapping } -run_test 3 "Repeat attach/detach operations" +run_test 3a "Repeat attach/detach operations" + +test_3b() { + local n + local file=$DIR/$tdir/$tfile + + needclients 3 || return 0 + + # Start all of the copytools and setup PCC + for n in $(seq $AGTCOUNT); do + copytool setup -f agt$n -a $n -m $MOUNT + setup_pcc_mapping agt$n "$n\ 100" + done + + mkdir -p $DIR/$tdir || error "mkdir $DIR/$tdir failed" + dd if=/dev/zero of=$file bs=1024 count=1 || + error "failed to dd write to $file" + + echo "Start to attach/detach $file on $agt1_HOST" + do_facet agt1 $LFS pcc attach -i 1 $file || + error "failed to attach file $file" + check_lpcc_state $file "readwrite" agt1 + do_facet agt1 $LFS pcc detach $file || + error "failed to detach file $file" + check_lpcc_state $file "none" agt1 + + echo "Repeat to attach/detach $file on $agt2_HOST" + do_facet agt2 $LFS pcc attach -i 2 $file || + error "failed to attach file $file" + check_lpcc_state $file "readwrite" agt2 + do_facet agt2 $LFS pcc detach $file || + error "failed to detach file $file" + check_lpcc_state $file "none" agt2 + + echo "Try attach on two agents" + do_facet agt1 $LFS pcc attach -i 1 $file || + error "failed to attach file $file" + check_lpcc_state $file "readwrite" agt1 + do_facet agt2 $LFS pcc attach -i 2 $file || + error "failed to attach file $file" + check_lpcc_state $file "readwrite" agt2 + # The later attach PCC agent should succeed, + # the former agent should be detached automatically. 
+ check_lpcc_state $file "none" agt1 + do_facet agt2 $LFS pcc detach $file || + error "failed to detach file $file" + check_lpcc_state $file "none" agt2 + + for n in $(seq $AGTCOUNT); do + cleanup_pcc_mapping agt$n + done +} +run_test 3b "Repeat attach/detach operations on multiple clients" + +test_4() { + local project_id=100 + + ! is_project_quota_supported && + skip "project quota is not supported" && return + + enable_project_quota + copytool setup -m "$MOUNT" -a "$HSM_ARCHIVE_NUMBER" + setup_pcc_mapping + + mkdir -p $DIR/$tdir || error "mkdir $DIR/$tdir failed" + lfs project -sp $project_id $DIR/$tdir || + error "lfs project -sp $project_id $DIR/$tdir failed" + + # mmap_sanity tst7 failed on the local ext4 filesystem. + # It seems that Lustre filesystem does special process for tst 7. + # Thus, we exclude tst7 from the PCC testing. + $LUSTRE/tests/mmap_sanity -d $DIR/$tdir -m $DIR2/$tdir -e 7 || + error "mmap_sanity test failed" + sync; sleep 1; sync + + cleanup_pcc_mapping +} +run_test 4 "Auto cache test for mmap" + +test_5() { + local file=$DIR/$tdir/$tfile + + copytool setup -m "$MOUNT" -a "$HSM_ARCHIVE_NUMBER" + setup_pcc_mapping + + mkdir -p $DIR/$tdir || error "mkdir $DIR/$tdir failed" + do_facet $SINGLEAGT "echo -n attach_mmap_data > $file" || + error "echo $file failed" + + do_facet $SINGLEAGT $LFS pcc attach -i $HSM_ARCHIVE_NUMBER $file || + error "failed to attach file $file" + check_lpcc_state $file "readwrite" + + local content=$($MMAP_CAT $file) + + [[ $content == "attach_mmap_data" ]] || + error "mmap cat data mismatch: $content" + + $LFS hsm_restore $file || error "failed to restore $file" + wait_request_state $(path2fid $file) RESTORE SUCCEED + check_lpcc_state $file "none" + + content=$($MMAP_CAT $file) + [[ $content == "attach_mmap_data" ]] || + error "mmap cat data mismatch: $content" + + cleanup_pcc_mapping +} +run_test 5 "Mmap & cat a RW-PCC cached file" + +test_6() { + local file=$DIR/$tdir/$tfile + local content + + copytool setup -m 
"$MOUNT" -a "$HSM_ARCHIVE_NUMBER" + setup_pcc_mapping + + mkdir -p $DIR/$tdir || error "mkdir $DIR/$tdir failed" + + echo -n mmap_write_data > $file || error "echo write $file failed" + do_facet $SINGLEAGT $LFS pcc attach -i $HSM_ARCHIVE_NUMBER $file || + error "failed to attach file $file" + check_lpcc_state $file "readwrite" + + do_facet $SINGLEAGT $MULTIOP $file OSMWUc || + error "could not mmap $file" + check_lpcc_state $file "readwrite" + content=$(do_facet $SINGLEAGT $MMAP_CAT $file) + # After mmap write via multiop, the first character of each page + # increases with 1. + [[ $content == "nmap_write_data" ]] || + error "mmap write data mismatch: $content" + check_lpcc_state $file "readwrite" + + do_facet $SINGLEAGT $LFS pcc detach $file || + error "failed to detach file $file" + + content=$(do_facet $SINGLEAGT $MMAP_CAT $file) + [[ $content == "nmap_write_data" ]] || + error "mmap write data mismatch: $content" + + cleanup_pcc_mapping +} +run_test 6 "Test mmap write on RW-PCC " + +test_7a() { + local file=$DIR/$tdir/$tfile + local content + + copytool setup -m "$MOUNT" -a "$HSM_ARCHIVE_NUMBER" + setup_pcc_mapping + + mkdir -p $DIR/$tdir || error "mkdir $DIR/$tdir failed" + echo "QQQQQ" > $file + do_facet $SINGLEAGT $LFS pcc attach -i $HSM_ARCHIVE_NUMBER $file || + error "failed to attach file $file" + check_lpcc_state $file "readwrite" + check_file_data $SINGLEAGT $file "QQQQQ" + # define OBD_FAIL_LLITE_PCC_DETACH_MKWRITE 0x1412 + do_facet $SINGLEAGT $LCTL set_param fail_loc=0x1412 + # HSM released exists archived status + check_hsm_flags $file "0x0000000d" + + # multiop mmap write increase the first character of each page with 1 + do_facet $SINGLEAGT $MULTIOP $file OSMWUc || + error "mmap write $file failed" + check_lpcc_state $file "none" + content=$(do_facet $SINGLEAGT $MMAP_CAT $file) + [[ $content == "RQQQQ" ]] || error "data mismatch: $content" + + cleanup_pcc_mapping +} +run_test 7a "Fake file detached between fault() and page_mkwrite() for RW-PCC" + 
+test_7b() { + local file=$DIR/$tdir/$tfile + local content + local pid + + copytool setup -m "$MOUNT" -a "$HSM_ARCHIVE_NUMBER" + setup_pcc_mapping + + mkdir -p $DIR/$tdir || error "mkdir $DIR/$tdir failed" + echo "QQQQQ" > $file + do_facet $SINGLEAGT $LFS pcc attach -i $HSM_ARCHIVE_NUMBER $file || + error "failed to attach file $file" + check_lpcc_state $file "readwrite" + check_file_data $SINGLEAGT $file "QQQQQ" + # define OBD_FAIL_LLITE_PCC_MKWRITE_PAUSE 0x1413 + do_facet $SINGLEAGT $LCTL set_param fail_loc=0x1413 fail_val=20 + # HSM released exists archived status + check_hsm_flags $file "0x0000000d" + + # multiop mmap write increase the first character of each page with 1 + do_facet $SINGLEAGT $MULTIOP $file OSMWUc & + pid=$! + + do_facet $SINGLEAGT $LFS pcc detach $file || + error "failed to detach file $file" + + wait $pid || error "multiop mmap write failed" + check_lpcc_state $file "none" + content=$(do_facet $SINGLEAGT $MMAP_CAT $file) + [[ $content == "RQQQQ" ]] || error "data mismatch: $content" + + cleanup_pcc_mapping +} +run_test 7b "Test the race with concurrent mkwrite and detach" + +test_8() { + local file=$DIR/$tdir/$tfile + + copytool setup -m "$MOUNT" -a "$HSM_ARCHIVE_NUMBER" + setup_pcc_mapping + + mkdir -p $DIR/$tdir || error "mkdir $DIR/$tdir failed" + + echo "QQQQQ" > $file + do_facet $SINGLEAGT $LFS pcc attach -i $HSM_ARCHIVE_NUMBER $file || + error "failed to attach file $file" + check_lpcc_state $file "readwrite" + check_file_data $SINGLEAGT $file "QQQQQ" + + # define OBD_FAIL_LLITE_PCC_FAKE_ERROR 0x1411 + do_facet $SINGLEAGT $LCTL set_param fail_loc=0x1411 + do_facet $SINGLEAGT "echo -n ENOSPC_write > $file" + # Above write will return -ENOSPC failure and retry the IO on normal + # IO path. It will restore the HSM released file. 
+ check_lpcc_state $file "none" + check_file_data $SINGLEAGT $file "ENOSPC_write" + + cleanup_pcc_mapping +} +run_test 8 "Test fake -ENOSPC tolerance for RW-PCC" + +setup_loopdev() { + local facet=$1 + local file=$2 + local mntpt=$3 + local size=${4:-50} + + do_facet $facet mkdir -p $mntpt || error "mkdir -p $mntpt failed" + stack_trap "do_facet $facet rm -rf $mntpt" EXIT + do_facet $facet dd if=/dev/zero of=$file bs=1M count=$size + stack_trap "do_facet $facet rm -f $file" EXIT + do_facet $facet mkfs.ext4 $file || + error "mkfs.ext4 $file failed" + do_facet $facet file $file + do_facet $facet mount -t ext4 -o loop,usrquota,grpquota $file $mntpt || + error "mount -o loop,usrquota,grpquota $file $mntpt failed" + stack_trap "do_facet $facet $UMOUNT $mntpt" EXIT +} + +test_9() { + local loopfile="$TMP/$tfile" + local mntpt="/mnt/pcc.9a" + local hsm_root="$mntpt/$tdir" + local file=$DIR/$tfile + + setup_loopdev $SINGLEAGT $loopfile $mntpt 50 + + copytool setup -m "$MOUNT" -a "$HSM_ARCHIVE_NUMBER" -h "$hsm_root" + setup_pcc_mapping + do_facet $SINGLEAGT $LCTL pcc list $MOUNT + + touch $file || error "touch $file failed" + do_facet $SINGLEAGT $LFS pcc attach -i $HSM_ARCHIVE_NUMBER $file || + error "fail to attach $file" + check_lpcc_state $file "readwrite" + # write 60M data, it is larger than the capacity of PCC backend + do_facet $SINGLEAGT dd if=/dev/zero of=$file bs=1M count=60 || + error "fail to dd write $file" + check_lpcc_state $file "none" + check_file_size $SINGLEAGT $file 62914560 + + cleanup_pcc_mapping +} +run_test 9 "Test -ENOSPC tolerance on loop PCC device for RW-PCC" + +test_10() { + local file=$DIR/$tdir/$tfile + local hsm_root=$(hsm_root) + local file=$DIR/$tdir/$tfile + local -a lpcc_path + local lpcc_dir + + copytool setup -m "$MOUNT" -a "$HSM_ARCHIVE_NUMBER" + setup_pcc_mapping + + mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed" + do_facet $SINGLEAGT "echo -n QQQQQ > $file" + lpcc_path=$(lpcc_fid2path $hsm_root $file) + lpcc_dir=$(dirname
$lpcc_path) + echo "Lustre file: $file LPCC dir: $lpcc_dir" + do_facet $SINGLEAGT mkdir -p $lpcc_dir || + error "mkdir -p $lpcc_dir failed" + do_facet $SINGLEAGT $LFS pcc attach -i $HSM_ARCHIVE_NUMBER $file || + error "failed to attach $file" + check_lpcc_state $file "readwrite" + check_file_data $SINGLEAGT $file "QQQQQ" + do_facet $SINGLEAGT $LFS pcc detach $file || + error "failed to detach $file" + rm $file || error "rm $file failed" + + # The parent directory of the PCC file is immutable + do_facet $SINGLEAGT "echo -n immutable_dir > $file" + lpcc_path=$(lpcc_fid2path $hsm_root $file) + lpcc_dir=$(dirname $lpcc_path) + echo "Lustre file: $file LPCC dir: $lpcc_dir" + do_facet $SINGLEAGT mkdir -p $lpcc_dir || + error "mkdir -p $lpcc_dir failed" + do_facet $SINGLEAGT chattr +i $lpcc_dir || + error "chattr +i $lpcc_dir failed" + do_facet $SINGLEAGT $LFS pcc attach -i $HSM_ARCHIVE_NUMBER $file && + error "attach $file with immutable directory should be failed" + do_facet $SINGLEAGT chattr -i $lpcc_dir || + error "chattr -i $lpcc_dir failed" + rm $file || error "rm $file failed" + + # The PCC file path is set to a directory + do_facet $SINGLEAGT "echo -n pcc_file_path_is_dir > $file" + lpcc_path=$(lpcc_fid2path $hsm_root $file) + do_facet $SINGLEAGT mkdir -p $lpcc_path || + error "mkdir -p $lpcc_path failed" + do_facet $SINGLEAGT $LFS pcc attach -i $HSM_ARCHIVE_NUMBER $file && + error "attach $file should fail as PCC path is a directory" + rm $file || error "rm $file failed" + + cleanup_pcc_mapping +} +run_test 10 "Test attach fault injection with simulated PCC file path" + +test_11() { + local file=$DIR/$tfile + local pid + + copytool setup -m "$MOUNT" -a "$HSM_ARCHIVE_NUMBER" + setup_pcc_mapping + + echo -n race_rw_attach_hsmremove > $file + do_facet $SINGLEAGT $LFS pcc attach -i $HSM_ARCHIVE_NUMBER $file || + error "attach $file failed" + do_facet $SINGLEAGT $LFS pcc detach $file || error "detach $file failed" + # HSM released exists archived status + 
check_hsm_flags $file "0x0000000d" + # define OBD_FAIL_LLITE_PCC_ATTACH_PAUSE 0x1414 + do_facet $SINGLEAGT $LCTL set_param fail_loc=0x1414 fail_val=20 + do_facet $SINGLEAGT $LFS pcc attach -i $HSM_ARCHIVE_NUMBER $file & + pid=$! + $LFS hsm_state $file + sleep 3 + wait_request_state $(path2fid $file) RESTORE SUCCEED + $LFS hsm_remove $file || error "hsm remove $file failed" + wait $pid && error "RW-PCC attach $file should fail" + + cleanup_pcc_mapping +} +run_test 11 "RW-PCC attach races with concurrent HSM remove" complete $SECONDS check_and_cleanup_lustre diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index e749838..7a1a050 100755 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -9754,6 +9754,10 @@ copytool() shift local archive_id="$1" ;; + -h|--hsm-root) + shift + local hsm_root="$1" + ;; -b|--bwlimit) shift local bandwidth="$1" # in MB/s @@ -9772,7 +9776,7 @@ copytool() # Use default values if needed local facet=${facet:-$SINGLEAGT} local mountpoint="${mountpoint:-${MOUNT2:-$MOUNT}}" - local hsm_root="$(hsm_root "$facet")" + local hsm_root="${hsm_root:-$(hsm_root "$facet")}" stack_trap "do_facet $facet rm -rf '$hsm_root'" EXIT do_facet $facet mkdir -p "$hsm_root" || diff --git a/lustre/utils/lfs.c b/lustre/utils/lfs.c index 02bad77..d11c61c 100644 --- a/lustre/utils/lfs.c +++ b/lustre/utils/lfs.c @@ -128,6 +128,7 @@ static inline int lfs_mirror_read(int argc, char **argv); static inline int lfs_mirror_write(int argc, char **argv); static inline int lfs_mirror_copy(int argc, char **argv); static int lfs_pcc_attach(int argc, char **argv); +static int lfs_pcc_attach_fid(int argc, char **argv); static int lfs_pcc_detach(int argc, char **argv); static int lfs_pcc_detach_fid(int argc, char **argv); static int lfs_pcc_state(int argc, char **argv); @@ -339,6 +340,12 @@ command_t pcc_cmdlist[] = { .pc_help = "Attach given files to the Persistent Client Cache.\n" "usage: lfs pcc attach <--id|-i NUM> ...\n" "\t-i: 
archive id for RW-PCC\n" }, + { .pc_name = "attach_fid", .pc_func = lfs_pcc_attach_fid, + .pc_help = "Attach given files into PCC by FID(s).\n" + "usage: lfs pcc attach_fid <--id|-i NUM> <--mnt|-m mnt> " + "<fid> ...\n" + "\t-i: archive id for RW-PCC\n" + "\t-m: Lustre mount point\n" }, { .pc_name = "state", .pc_func = lfs_pcc_state, .pc_help = "Display the PCC state for given files.\n" "usage: lfs pcc state ...\n" }, @@ -664,6 +671,7 @@ command_t cmdlist[] = { {"pcc", lfs_pcc, pcc_cmdlist, "lfs commands used to interact with PCC features:\n" "lfs pcc attach - attach given files to Persistent Client Cache\n" + "lfs pcc attach_fid - attach given files into PCC by FID(s)\n" "lfs pcc state - display the PCC state for given files\n" "lfs pcc detach - detach given files from Persistent Client Cache\n" "lfs pcc detach_fid - detach given files from PCC by FID(s)\n"}, @@ -10433,6 +10441,79 @@ static int lfs_pcc_attach(int argc, char **argv) return rc; } +static int lfs_pcc_attach_fid(int argc, char **argv) +{ + struct option long_opts[] = { + { .val = 'i', .name = "id", .has_arg = required_argument }, + { .val = 'm', .name = "mnt", .has_arg = required_argument }, + { .name = NULL } }; + char short_opts[] = "i:m:"; + int c; + int rc = 0; + __u32 archive_id = 0; + char *end; + const char *mntpath = NULL; + const char *fidstr; + enum lu_pcc_type type = LU_PCC_READWRITE; + + optind = 0; + while ((c = getopt_long(argc, argv, short_opts, + long_opts, NULL)) != -1) { + switch (c) { + case 'i': + archive_id = strtoul(optarg, &end, 0); + if (*end != '\0') { + fprintf(stderr, "error: %s: bad archive ID " + "'%s'\n", argv[0], optarg); + return CMD_HELP; + } + break; + case 'm': + mntpath = optarg; + break; + case '?': + return CMD_HELP; + default: + fprintf(stderr, "%s: option '%s' unrecognized\n", + argv[0], argv[optind - 1]); + return CMD_HELP; + } + } + + if (archive_id == 0) { + fprintf(stderr, "%s: must specify an archive ID\n", argv[0]); + return CMD_HELP; + } + + if (mntpath ==
NULL) { + fprintf(stderr, "%s: must specify Lustre mount point\n", + argv[0]); + return CMD_HELP; + } + + if (argc <= optind) { + fprintf(stderr, "%s: must specify one or more fids\n", argv[0]); + return CMD_HELP; + } + + while (optind < argc) { + int rc2; + + fidstr = argv[optind++]; + + rc2 = llapi_pcc_attach_fid_str(mntpath, fidstr, + archive_id, type); + if (rc2 < 0) { + fprintf(stderr, "%s: cannot attach '%s' on '%s' to PCC " + "with archive ID '%u': %s\n", argv[0], + fidstr, mntpath, archive_id, strerror(-rc2)); + } + if (rc == 0 && rc2 < 0) + rc = rc2; + } + return rc; +} + static int lfs_pcc_detach(int argc, char **argv) { int rc = 0; @@ -10461,8 +10542,9 @@ static int lfs_pcc_detach(int argc, char **argv) rc2 = llapi_pcc_detach_file(fullpath); if (rc2 < 0) { + rc2 = -errno; fprintf(stderr, "%s: cannot detach '%s' from PCC: " - "%s\n", argv[0], path, strerror(-rc2)); + "%s\n", argv[0], path, strerror(errno)); if (rc == 0) rc = rc2; } @@ -10549,9 +10631,7 @@ static int lfs_pcc_state(int argc, char **argv) printf(", PCC file: %s", state.pccs_path); printf(", user number: %u", state.pccs_open_count); - printf(", attr cached: %s", - state.pccs_flags & PCC_STATE_FLAG_ATTR_VALID ? - "true" : "false"); + printf(", flags: %x", state.pccs_flags); printf("\n"); } return rc; diff --git a/lustre/utils/lhsmtool_posix.c b/lustre/utils/lhsmtool_posix.c index ce1e48e..a0c8a77 100644 --- a/lustre/utils/lhsmtool_posix.c +++ b/lustre/utils/lhsmtool_posix.c @@ -1243,16 +1243,6 @@ static int ct_restore(const struct hsm_action_item *hai, const long hal_flags) goto fini; } - /* When restore request for a file triggered by read/write/ - * truncate operation from another client, it needs to detach - * the file first if it is PCC-attached.
- */ - rc = llapi_pcc_detach_fid_fd(opt.o_mnt_fd, &hai->hai_fid); - if (rc) { - CT_ERROR(rc, "cannot detach pcc for file '%s'", dst); - goto fini; - } - dst_fd = llapi_hsm_action_get_fd(hcp); if (dst_fd < 0) { rc = dst_fd; @@ -1331,7 +1321,12 @@ static int ct_remove(const struct hsm_action_item *hai, const long hal_flags) rc = -errno; CT_ERROR(rc, "cannot unlink '%s'", attr); err_minor++; - goto fini; + + /* ignore the error when lov file does not exist. */ + if (rc == -ENOENT) + rc = 0; + else + goto fini; } fini: diff --git a/lustre/utils/liblustreapi_hsm.c b/lustre/utils/liblustreapi_hsm.c index 39624f2..c8a68ef 100644 --- a/lustre/utils/liblustreapi_hsm.c +++ b/lustre/utils/liblustreapi_hsm.c @@ -1140,7 +1140,37 @@ int llapi_hsm_action_begin(struct hsm_copyaction_private **phcp, goto err_out; } else if (hai->hai_action == HSMA_REMOVE) { /* Since remove is atomic there is no need to send an - * initial MDS_HSM_PROGRESS RPC. */ + * initial MDS_HSM_PROGRESS RPC. + * RW-PCC uses Lustre HSM mechanism for data synchronization. + * At the beginning of RW-PCC attach, the client tries to + * exclusively open the file by using a lease lock. A + * successful lease open ensures that the current attach + * process is the unique opener for the file. + * After taking the lease, the file data is then copied from + * OSTs into PCC and then the client closes the lease + * with a PCC attach intent. + * However, for a file with HSM exists, archived state (i.e. a + * cached file was just detached from PCC and restored into + * OST), an HSM REMOVE request may wrongly delete the above PCC + * copy during RW-PCC attach. + * Thus, an open/close on the corresponding Lustre file is + * added for HSMA_REMOVE here to solve this conflict. + */ + fd = ct_open_by_fid(hcp->ct_priv, &hai->hai_fid, + O_RDONLY | O_NOATIME | O_NOFOLLOW | O_NONBLOCK); + if (fd < 0) { + rc = fd; + /* ignore the error in case of Remove Archive on Last + * Unlink (RAoLU).
+ */ + if (rc == -ENOENT) { + rc = 0; + goto out_log; + } + goto err_out; + } + + hcp->source_fd = fd; goto out_log; } diff --git a/lustre/utils/liblustreapi_pcc.c b/lustre/utils/liblustreapi_pcc.c index 8d23bf4..8e52b8f 100644 --- a/lustre/utils/liblustreapi_pcc.c +++ b/lustre/utils/liblustreapi_pcc.c @@ -45,25 +45,15 @@ * Fetch and attach a file to readwrite PCC. * */ -static int llapi_readwrite_pcc_attach(const char *path, __u32 archive_id) +static int llapi_readwrite_pcc_attach_fd(int fd, __u32 archive_id) { - int fd; int rc; struct ll_ioc_lease *data; - fd = open(path, O_RDWR | O_NONBLOCK); - if (fd < 0) { - rc = -errno; - llapi_error(LLAPI_MSG_ERROR, rc, "cannot open '%s'", - path); - return rc; - } - rc = llapi_lease_acquire(fd, LL_LEASE_WRLCK); if (rc < 0) { - llapi_error(LLAPI_MSG_ERROR, rc, - "cannot get lease for '%s'", path); - goto out_close; + llapi_error(LLAPI_MSG_ERROR, rc, "cannot get lease"); + return rc; } data = malloc(offsetof(typeof(*data), lil_ids[1])); @@ -71,7 +61,7 @@ static int llapi_readwrite_pcc_attach(const char *path, __u32 archive_id) rc = -ENOMEM; llapi_err_noerrno(LLAPI_MSG_ERROR, "failed to allocate memory"); - goto out_close; + return rc; } data->lil_mode = LL_LEASE_UNLCK; @@ -83,14 +73,30 @@ static int llapi_readwrite_pcc_attach(const char *path, __u32 archive_id) if (rc == 0) /* lost lease lock */ rc = -EBUSY; llapi_error(LLAPI_MSG_ERROR, rc, - "cannot attach '%s' with ID: %u", - path, archive_id); + "cannot attach with ID: %u", archive_id); } else { rc = 0; } free(data); -out_close: + return rc; +} + +static int llapi_readwrite_pcc_attach(const char *path, __u32 archive_id) +{ + int fd; + int rc; + + fd = open(path, O_RDWR | O_NONBLOCK); + if (fd < 0) { + rc = -errno; + llapi_error(LLAPI_MSG_ERROR, rc, "cannot open '%s'", + path); + return rc; + } + + rc = llapi_readwrite_pcc_attach_fd(fd, archive_id); + close(fd); return rc; } @@ -110,41 +116,85 @@ int llapi_pcc_attach(const char *path, __u32 id, enum lu_pcc_type type) return 
rc; } +static int llapi_readwrite_pcc_attach_fid(const char *mntpath, + const struct lu_fid *fid, + __u32 id) +{ + int rc; + int fd; + + fd = llapi_open_by_fid(mntpath, fid, O_RDWR | O_NONBLOCK); + if (fd < 0) { + rc = -errno; + llapi_error(LLAPI_MSG_ERROR, rc, + "llapi_open_by_fid for " DFID " failed", + PFID(fid)); + return rc; + } + + rc = llapi_readwrite_pcc_attach_fd(fd, id); + + close(fd); + return rc; +} + +int llapi_pcc_attach_fid(const char *mntpath, const struct lu_fid *fid, + __u32 id, enum lu_pcc_type type) +{ + int rc; + + switch (type) { + case LU_PCC_READWRITE: + rc = llapi_readwrite_pcc_attach_fid(mntpath, fid, id); + break; + default: + rc = -EINVAL; + break; + } + return rc; +} + + +int llapi_pcc_attach_fid_str(const char *mntpath, const char *fidstr, + __u32 id, enum lu_pcc_type type) +{ + int rc; + struct lu_fid fid; + const char *fidstr_orig = fidstr; + + while (*fidstr == '[') + fidstr++; + rc = sscanf(fidstr, SFID, RFID(&fid)); + if (rc != 3) { + llapi_err_noerrno(LLAPI_MSG_ERROR, + "bad FID format '%s', should be [seq:oid:ver]" + " (e.g. "DFID")\n", fidstr_orig, + (unsigned long long)FID_SEQ_NORMAL, 2, 0); + return -EINVAL; + } + + rc = llapi_pcc_attach_fid(mntpath, &fid, id, type); + + return rc; +} /** - * detach PCC cache of a file by an ioctl on the dir fd (usually a mount - * point fd that the copytool already has open). - * - * If the file is being used, the detaching will return -EBUSY immediately. - * Thus, if a PCC-attached file is kept open for a long time, the restore - * request will always return failure. + * detach PCC cache of a file by using fd. * - * \param fd Directory file descriptor. - * \param fid FID of the file. + * \param fd File handle. * * \return 0 on success, an error code otherwise.
*/ -int llapi_pcc_detach_fid_fd(int fd, const struct lu_fid *fid) +int llapi_pcc_detach_fd(int fd) { int rc; - struct lu_pcc_detach detach; - - detach.pccd_fid = *fid; - rc = ioctl(fd, LL_IOC_PCC_DETACH, &detach); - if (rc == -EAGAIN) - llapi_error(LLAPI_MSG_ERROR, rc, - "FID "DFID" may be in the attaching state, " - "or you may need to re-run the pcc_attach " - "to finish the attach process.", PFID(fid)); - else if (rc) - llapi_error(LLAPI_MSG_ERROR, rc, - "cannot detach FID "DFID" from PCC", PFID(fid)); + rc = ioctl(fd, LL_IOC_PCC_DETACH); return rc; } /** - * detach PCC cache of a file. + * detach PCC cache of a file via FID. * * \param mntpath Fullpath to the client mount point. * \param fid FID of the file. @@ -155,6 +205,7 @@ int llapi_pcc_detach_fid(const char *mntpath, const struct lu_fid *fid) { int rc; int fd; + struct lu_pcc_detach detach; rc = get_root_path(WANT_FD, NULL, &fd, (char *)mntpath, -1); if (rc) { @@ -163,14 +214,21 @@ int llapi_pcc_detach_fid(const char *mntpath, const struct lu_fid *fid) return rc; } - rc = llapi_pcc_detach_fid_fd(fd, fid); - + /* + * PCC prefetching algorithm scans Lustre OPEN/CLOSE changelogs + * to determine the candidate files to be prefetched into + * PCC. To avoid generation of unnecessary open/close changelogs, + * we implement a new dir ioctl LL_IOC_PCC_DETACH_BY_FID to detach + * files. + */ + detach.pccd_fid = *fid; + rc = ioctl(fd, LL_IOC_PCC_DETACH_BY_FID, &detach); close(fd); return rc; } /** - * detach PCC cache of a file. + * detach PCC cache of a file via FID. * * \param mntpath Fullpath to the client mount point. * \param fid FID string of the file.
@@ -209,16 +267,18 @@ int llapi_pcc_detach_fid_str(const char *mntpath, const char *fidstr) int llapi_pcc_detach_file(const char *path) { int rc; - lustre_fid fid; + int fd; - rc = llapi_path2fid(path, &fid); - if (rc) { - llapi_error(LLAPI_MSG_ERROR, rc, "cannot get FID of '%s'", + fd = open(path, O_RDWR | O_NONBLOCK); + if (fd < 0) { + rc = -errno; + llapi_error(LLAPI_MSG_ERROR, rc, "cannot open '%s'", path); return rc; } - rc = llapi_pcc_detach_fid(path, &fid); + rc = llapi_pcc_detach_fd(fd); + close(fd); return rc; } -- 1.8.3.1