4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2017, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
49 #include <uapi/linux/lustre/lustre_ioctl.h>
50 #include <lustre_swab.h>
52 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
57 struct inode *sp_inode;
62 __u64 pa_data_version;
67 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
69 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
72 static struct ll_file_data *ll_file_data_get(void)
74 struct ll_file_data *fd;
76 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
80 fd->fd_write_failed = false;
81 pcc_file_init(&fd->fd_pcc_file);
86 static void ll_file_data_put(struct ll_file_data *fd)
89 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
93 * Packs all the attributes into @op_data for the CLOSE rpc.
95 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
96 struct obd_client_handle *och)
100 ll_prep_md_op_data(op_data, inode, NULL, NULL,
101 0, 0, LUSTRE_OPC_ANY, NULL);
103 op_data->op_attr.ia_mode = inode->i_mode;
104 op_data->op_attr.ia_atime = inode->i_atime;
105 op_data->op_attr.ia_mtime = inode->i_mtime;
106 op_data->op_attr.ia_ctime = inode->i_ctime;
107 op_data->op_attr.ia_size = i_size_read(inode);
108 op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
109 ATTR_MTIME | ATTR_MTIME_SET |
111 op_data->op_xvalid |= OP_XVALID_CTIME_SET;
112 op_data->op_attr_blocks = inode->i_blocks;
113 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
114 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
115 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
116 op_data->op_open_handle = och->och_open_handle;
118 if (och->och_flags & FMODE_WRITE &&
119 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
120 /* For HSM: if inode data has been modified, pack it so that
121 * MDT can set data dirty flag in the archive. */
122 op_data->op_bias |= MDS_DATA_MODIFIED;
128 * Perform a close, possibly with a bias.
129 * The meaning of "data" depends on the value of "bias".
131 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
132 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
 * swap layouts with.
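 *
 * Summary (reading aid only, derived from the switch cases below): for
 * MDS_CLOSE_LAYOUT_MERGE and MDS_CLOSE_LAYOUT_SWAP \a data is the victim
 * inode, for MDS_CLOSE_LAYOUT_SPLIT it is a struct split_param, for
 * MDS_CLOSE_RESYNC_DONE a struct ll_ioc_lease, for MDS_PCC_ATTACH a
 * struct pcc_param, for MDS_HSM_RELEASE a __u64 data version, and for a
 * plain close it must be NULL.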
135 static int ll_close_inode_openhandle(struct inode *inode,
136 struct obd_client_handle *och,
137 enum mds_op_bias bias, void *data)
139 struct obd_export *md_exp = ll_i2mdexp(inode);
140 const struct ll_inode_info *lli = ll_i2info(inode);
141 struct md_op_data *op_data;
142 struct ptlrpc_request *req = NULL;
146 if (class_exp2obd(md_exp) == NULL) {
147 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
148 ll_i2sbi(inode)->ll_fsname, PFID(&lli->lli_fid));
152 OBD_ALLOC_PTR(op_data);
153 /* We leak the openhandle and request here on error, but there is not much to
154 * be done in the OOM case since the app won't retry the close on error either. */
156 GOTO(out, rc = -ENOMEM);
158 ll_prepare_close(inode, op_data, och);
160 case MDS_CLOSE_LAYOUT_MERGE:
161 /* merge blocks from the victim inode */
162 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
163 op_data->op_attr.ia_valid |= ATTR_SIZE;
164 op_data->op_xvalid |= OP_XVALID_BLOCKS;
/* fallthrough */
165 case MDS_CLOSE_LAYOUT_SPLIT:
166 case MDS_CLOSE_LAYOUT_SWAP: {
167 struct split_param *sp = data;
169 LASSERT(data != NULL);
170 op_data->op_bias |= bias;
171 op_data->op_data_version = 0;
172 op_data->op_lease_handle = och->och_lease_handle;
173 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
174 op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
175 op_data->op_mirror_id = sp->sp_mirror_id;
177 op_data->op_fid2 = *ll_inode2fid(data);
182 case MDS_CLOSE_RESYNC_DONE: {
183 struct ll_ioc_lease *ioc = data;
185 LASSERT(data != NULL);
186 op_data->op_attr_blocks +=
187 ioc->lil_count * op_data->op_attr_blocks;
188 op_data->op_attr.ia_valid |= ATTR_SIZE;
189 op_data->op_xvalid |= OP_XVALID_BLOCKS;
190 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
192 op_data->op_lease_handle = och->och_lease_handle;
193 op_data->op_data = &ioc->lil_ids[0];
194 op_data->op_data_size =
195 ioc->lil_count * sizeof(ioc->lil_ids[0]);
199 case MDS_PCC_ATTACH: {
200 struct pcc_param *param = data;
202 LASSERT(data != NULL);
203 op_data->op_bias |= MDS_HSM_RELEASE | MDS_PCC_ATTACH;
204 op_data->op_archive_id = param->pa_archive_id;
205 op_data->op_data_version = param->pa_data_version;
206 op_data->op_lease_handle = och->och_lease_handle;
210 case MDS_HSM_RELEASE:
211 LASSERT(data != NULL);
212 op_data->op_bias |= MDS_HSM_RELEASE;
213 op_data->op_data_version = *(__u64 *)data;
214 op_data->op_lease_handle = och->och_lease_handle;
215 op_data->op_attr.ia_valid |= ATTR_SIZE;
216 op_data->op_xvalid |= OP_XVALID_BLOCKS;
220 LASSERT(data == NULL);
224 if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
225 op_data->op_xvalid |= OP_XVALID_LAZYSIZE;
226 if (!(op_data->op_xvalid & OP_XVALID_BLOCKS))
227 op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS;
229 rc = md_close(md_exp, op_data, och->och_mod, &req);
230 if (rc != 0 && rc != -EINTR)
231 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
232 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
234 if (rc == 0 && op_data->op_bias & bias) {
235 struct mdt_body *body;
237 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
238 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
242 ll_finish_md_op_data(op_data);
246 md_clear_open_replay_data(md_exp, och);
247 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
250 ptlrpc_req_finished(req); /* This is close request */
254 int ll_md_real_close(struct inode *inode, fmode_t fmode)
256 struct ll_inode_info *lli = ll_i2info(inode);
257 struct obd_client_handle **och_p;
258 struct obd_client_handle *och;
263 if (fmode & FMODE_WRITE) {
264 och_p = &lli->lli_mds_write_och;
265 och_usecount = &lli->lli_open_fd_write_count;
266 } else if (fmode & FMODE_EXEC) {
267 och_p = &lli->lli_mds_exec_och;
268 och_usecount = &lli->lli_open_fd_exec_count;
270 LASSERT(fmode & FMODE_READ);
271 och_p = &lli->lli_mds_read_och;
272 och_usecount = &lli->lli_open_fd_read_count;
275 mutex_lock(&lli->lli_och_mutex);
276 if (*och_usecount > 0) {
277 /* There are still users of this handle, so skip
 * freeing it. */
279 mutex_unlock(&lli->lli_och_mutex);
285 mutex_unlock(&lli->lli_och_mutex);
288 /* There might be a race and this handle may already
 * be closed. */
290 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
296 static int ll_md_close(struct inode *inode, struct file *file)
298 union ldlm_policy_data policy = {
299 .l_inodebits = { MDS_INODELOCK_OPEN },
301 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
302 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
303 struct ll_inode_info *lli = ll_i2info(inode);
304 struct lustre_handle lockh;
305 enum ldlm_mode lockmode;
309 /* clear group lock, if present */
310 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
311 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
313 if (fd->fd_lease_och != NULL) {
316 /* Usually the lease is not released when the
317 * application crashes, so we need to release it here. */
318 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
319 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
320 PFID(&lli->lli_fid), rc, lease_broken);
322 fd->fd_lease_och = NULL;
325 if (fd->fd_och != NULL) {
326 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
331 /* Let's see if we have a good enough OPEN lock on the file and whether
332 we can skip talking to the MDS */
333 mutex_lock(&lli->lli_och_mutex);
334 if (fd->fd_omode & FMODE_WRITE) {
336 LASSERT(lli->lli_open_fd_write_count);
337 lli->lli_open_fd_write_count--;
338 } else if (fd->fd_omode & FMODE_EXEC) {
340 LASSERT(lli->lli_open_fd_exec_count);
341 lli->lli_open_fd_exec_count--;
344 LASSERT(lli->lli_open_fd_read_count);
345 lli->lli_open_fd_read_count--;
347 mutex_unlock(&lli->lli_och_mutex);
349 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
350 LDLM_IBITS, &policy, lockmode, &lockh))
351 rc = ll_md_real_close(inode, fd->fd_omode);
354 LUSTRE_FPRIVATE(file) = NULL;
355 ll_file_data_put(fd);
360 /* While this returns an error code, the caller (fput()) ignores it, so we need
361 * to make every effort to clean up all of our state here. Also, applications
362 * rarely check close errors and even if an error is returned they will not
363 * re-try the close call.
365 int ll_file_release(struct inode *inode, struct file *file)
367 struct ll_file_data *fd;
368 struct ll_sb_info *sbi = ll_i2sbi(inode);
369 struct ll_inode_info *lli = ll_i2info(inode);
373 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
374 PFID(ll_inode2fid(inode)), inode);
376 if (inode->i_sb->s_root != file_dentry(file))
377 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
378 fd = LUSTRE_FPRIVATE(file);
381 /* This is the last ref on @file, but it may not belong to the pid that owns
382 * statahead, because parent and child processes can share the same file handle. */
383 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
384 ll_deauthorize_statahead(inode, fd);
386 if (inode->i_sb->s_root == file_dentry(file)) {
387 LUSTRE_FPRIVATE(file) = NULL;
388 ll_file_data_put(fd);
392 pcc_file_release(inode, file);
394 if (!S_ISDIR(inode->i_mode)) {
395 if (lli->lli_clob != NULL)
396 lov_read_and_clear_async_rc(lli->lli_clob);
397 lli->lli_async_rc = 0;
400 rc = ll_md_close(inode, file);
402 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
403 libcfs_debug_dumplog();
408 static inline int ll_dom_readpage(void *data, struct page *page)
410 struct niobuf_local *lnb = data;
413 kaddr = ll_kmap_atomic(page, KM_USER0);
414 memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
415 if (lnb->lnb_len < PAGE_SIZE)
416 memset(kaddr + lnb->lnb_len, 0,
417 PAGE_SIZE - lnb->lnb_len);
418 flush_dcache_page(page);
419 SetPageUptodate(page);
420 ll_kunmap_atomic(kaddr, KM_USER0);
426 void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
427 struct lookup_intent *it)
429 struct ll_inode_info *lli = ll_i2info(inode);
430 struct cl_object *obj = lli->lli_clob;
431 struct address_space *mapping = inode->i_mapping;
433 struct niobuf_remote *rnb;
435 unsigned long index, start;
436 struct niobuf_local lnb;
443 if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
447 rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
448 if (rnb == NULL || rnb->rnb_len == 0)
451 /* LU-11595: the server may return the whole file, which is always usable, or
452 * it may return just the file tail, whose offset must be aligned with the
453 * client PAGE_SIZE to be usable on this client; if the server's PAGE_SIZE is
454 * smaller, the offset may be unaligned and that data is simply ignored.
456 if (rnb->rnb_offset % PAGE_SIZE)
459 /* The server returns the whole file or just the file tail, depending on what
460 * fits in the reply buffer; in both cases the returned range should reach the inode size.
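 *
 * For example (illustration only): with a 3 MiB file, the server may
 * return the last 1 MiB inline, so rnb_offset (2 MiB) + rnb_len (1 MiB)
 * reaches i_size; a reply that falls short of i_size is rejected below.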
462 if (rnb->rnb_offset + rnb->rnb_len < i_size_read(inode)) {
463 CERROR("%s: server returns off/len %llu/%u < i_size %llu\n",
464 ll_i2sbi(inode)->ll_fsname, rnb->rnb_offset,
465 rnb->rnb_len, i_size_read(inode));
469 CDEBUG(D_INFO, "Get data along with open at %llu len %i, i_size %llu\n",
470 rnb->rnb_offset, rnb->rnb_len, i_size_read(inode));
472 data = (char *)rnb + sizeof(*rnb);
474 lnb.lnb_file_offset = rnb->rnb_offset;
475 start = lnb.lnb_file_offset / PAGE_SIZE;
477 LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
478 lnb.lnb_page_offset = 0;
480 lnb.lnb_data = data + (index << PAGE_SHIFT);
481 lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
482 if (lnb.lnb_len > PAGE_SIZE)
483 lnb.lnb_len = PAGE_SIZE;
485 vmpage = read_cache_page(mapping, index + start,
486 ll_dom_readpage, &lnb);
487 if (IS_ERR(vmpage)) {
488 CWARN("%s: cannot fill page %lu for "DFID
489 " with data: rc = %li\n",
490 ll_i2sbi(inode)->ll_fsname, index + start,
491 PFID(lu_object_fid(&obj->co_lu)),
497 } while (rnb->rnb_len > (index << PAGE_SHIFT));
501 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
502 struct lookup_intent *itp)
504 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
505 struct dentry *parent = de->d_parent;
508 struct md_op_data *op_data;
509 struct ptlrpc_request *req = NULL;
513 LASSERT(parent != NULL);
514 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
516 /* If the server supports open-by-FID, or the file name is invalid, don't pack
517 * the name in the open request */
518 if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_OPEN_BY_NAME) ||
519 !(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID)) {
521 len = de->d_name.len;
522 name = kmalloc(len + 1, GFP_NOFS);
527 spin_lock(&de->d_lock);
528 if (len != de->d_name.len) {
529 spin_unlock(&de->d_lock);
533 memcpy(name, de->d_name.name, len);
535 spin_unlock(&de->d_lock);
537 if (!lu_name_is_valid_2(name, len)) {
543 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
544 name, len, 0, LUSTRE_OPC_ANY, NULL);
545 if (IS_ERR(op_data)) {
547 RETURN(PTR_ERR(op_data));
549 op_data->op_data = lmm;
550 op_data->op_data_size = lmmsize;
552 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
553 &ll_md_blocking_ast, 0);
555 ll_finish_md_op_data(op_data);
557 /* The reason for keeping our own exit path is to avoid flooding the log
558 * with -ESTALE error messages.
560 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
561 it_open_error(DISP_OPEN_OPEN, itp))
563 ll_release_openhandle(de, itp);
567 if (it_disposition(itp, DISP_LOOKUP_NEG))
568 GOTO(out, rc = -ENOENT);
570 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
571 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
572 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
576 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
578 if (!rc && itp->it_lock_mode) {
579 struct lustre_handle handle = {.cookie = itp->it_lock_handle};
580 struct ldlm_lock *lock;
581 bool has_dom_bit = false;
583 /* If we got a lock back and it has a LOOKUP bit set,
584 * make sure the dentry is marked as valid so we can find it.
585 * We don't need to care about actual hashing since other parts
586 * of the kernel will deal with that later.
588 lock = ldlm_handle2lock(&handle);
590 has_dom_bit = ldlm_has_dom(lock);
591 if (lock->l_policy_data.l_inodebits.bits &
592 MDS_INODELOCK_LOOKUP)
593 d_lustre_revalidate(de);
597 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
599 ll_dom_finish_open(de->d_inode, req, itp);
603 ptlrpc_req_finished(req);
604 ll_intent_drop_lock(itp);
606 /* We did open by fid, but by the time we got to the server,
607 * the object disappeared. If this is a create, we cannot really
608 * tell the userspace that the file it was trying to create
609 * does not exist. Instead let's return -ESTALE, and the VFS will
610 * retry the create with LOOKUP_REVAL that we are going to catch
611 * in ll_revalidate_dentry() and use lookup then.
613 if (rc == -ENOENT && itp->it_op & IT_CREAT)
619 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
620 struct obd_client_handle *och)
622 struct mdt_body *body;
624 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
625 och->och_open_handle = body->mbo_open_handle;
626 och->och_fid = body->mbo_fid1;
627 och->och_lease_handle.cookie = it->it_lock_handle;
628 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
629 och->och_flags = it->it_flags;
631 return md_set_open_replay_data(md_exp, och, it);
634 static int ll_local_open(struct file *file, struct lookup_intent *it,
635 struct ll_file_data *fd, struct obd_client_handle *och)
637 struct inode *inode = file_inode(file);
640 LASSERT(!LUSTRE_FPRIVATE(file));
647 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
652 LUSTRE_FPRIVATE(file) = fd;
653 ll_readahead_init(inode, &fd->fd_ras);
654 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
656 /* ll_cl_context initialize */
657 rwlock_init(&fd->fd_lock);
658 INIT_LIST_HEAD(&fd->fd_lccs);
663 /* Open a file, and (for the very first open) create objects on the OSTs at
664 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
665 * creation or open until ll_lov_setstripe() ioctl is called.
667 * If we already have the stripe MD locally then we don't request it in
668 * md_open(), by passing a lmm_size = 0.
670 * It is up to the application to ensure no other processes open this file
671 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
672 * used. We might be able to avoid races of that sort by getting lli_open_sem
673 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
674 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
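 *
 * A rough userspace sketch of the O_LOV_DELAY_CREATE flow described above
 * (illustrative only; O_LOV_DELAY_CREATE, LL_IOC_LOV_SETSTRIPE and
 * struct lov_user_md_v1 come from lustre_user.h, not from this file):
 *
 *	fd = open(path, O_CREAT | O_RDWR | O_LOV_DELAY_CREATE, 0644);
 *	lum.lmm_magic = LOV_USER_MAGIC_V1;
 *	lum.lmm_stripe_count = 4;
 *	ioctl(fd, LL_IOC_LOV_SETSTRIPE, &lum);	(objects are created here)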
676 int ll_file_open(struct inode *inode, struct file *file)
678 struct ll_inode_info *lli = ll_i2info(inode);
679 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
680 .it_flags = file->f_flags };
681 struct obd_client_handle **och_p = NULL;
682 __u64 *och_usecount = NULL;
683 struct ll_file_data *fd;
687 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
688 PFID(ll_inode2fid(inode)), inode, file->f_flags);
690 it = file->private_data; /* XXX: compat macro */
691 file->private_data = NULL; /* prevent ll_local_open assertion */
693 fd = ll_file_data_get();
695 GOTO(out_nofiledata, rc = -ENOMEM);
698 if (S_ISDIR(inode->i_mode))
699 ll_authorize_statahead(inode, fd);
701 if (inode->i_sb->s_root == file_dentry(file)) {
702 LUSTRE_FPRIVATE(file) = fd;
706 if (!it || !it->it_disposition) {
707 /* Convert f_flags into access mode. We cannot use file->f_mode,
708 * because everything but O_ACCMODE mask was stripped from
710 if ((oit.it_flags + 1) & O_ACCMODE)
712 if (file->f_flags & O_TRUNC)
713 oit.it_flags |= FMODE_WRITE;
715 /* The kernel only calls f_op->open in dentry_open(). filp_open() calls
716 * dentry_open() after a call to open_namei() that checks permissions.
717 * Only nfsd_open() calls dentry_open() directly without checking
718 * permissions, and because of that the code below is safe.
720 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
721 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
723 /* We do not want O_EXCL here, presumably we opened the file
724 * already? XXX - NFS implications? */
725 oit.it_flags &= ~O_EXCL;
727 /* bug20584: if "it_flags" contains O_CREAT, the file will be
728 * created if necessary, so "IT_CREAT" should be set to stay
729 * consistent with it */
730 if (oit.it_flags & O_CREAT)
731 oit.it_op |= IT_CREAT;
737 /* Let's see if we have file open on MDS already. */
738 if (it->it_flags & FMODE_WRITE) {
739 och_p = &lli->lli_mds_write_och;
740 och_usecount = &lli->lli_open_fd_write_count;
741 } else if (it->it_flags & FMODE_EXEC) {
742 och_p = &lli->lli_mds_exec_och;
743 och_usecount = &lli->lli_open_fd_exec_count;
745 och_p = &lli->lli_mds_read_och;
746 och_usecount = &lli->lli_open_fd_read_count;
749 mutex_lock(&lli->lli_och_mutex);
750 if (*och_p) { /* Open handle is present */
751 if (it_disposition(it, DISP_OPEN_OPEN)) {
752 /* Well, there's an extra open request that we do not need;
753 let's close it somehow. This will decref the request. */
754 rc = it_open_error(DISP_OPEN_OPEN, it);
756 mutex_unlock(&lli->lli_och_mutex);
757 GOTO(out_openerr, rc);
760 ll_release_openhandle(file_dentry(file), it);
764 rc = ll_local_open(file, it, fd, NULL);
767 mutex_unlock(&lli->lli_och_mutex);
768 GOTO(out_openerr, rc);
771 LASSERT(*och_usecount == 0);
772 if (!it->it_disposition) {
773 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
774 /* We cannot just request a lock handle now; the new ELC code
775 means that one of the other OPEN locks for this file
776 could be cancelled, and since the blocking AST handler
777 would attempt to grab och_mutex as well, that would
778 result in a deadlock */
779 mutex_unlock(&lli->lli_och_mutex);
781 * Normally called under two situations:
 * 1. NFS export.
783 * 2. A race/condition on the MDS resulting in no open
784 * handle to be returned from LOOKUP|OPEN request,
785 * for example if the target entry was a symlink.
787 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
788 * marked by a bit set in ll_iget_for_nfs. Clear the
789 * bit so that it's not confusing later callers.
791 * NB: when ldd is NULL, it must have come via the normal
792 * lookup path only, since ll_iget_for_nfs always calls
795 if (ldd && ldd->lld_nfs_dentry) {
796 ldd->lld_nfs_dentry = 0;
797 it->it_flags |= MDS_OPEN_LOCK;
801 * Always specify MDS_OPEN_BY_FID because we don't want
802 * to get a file with a different fid.
804 it->it_flags |= MDS_OPEN_BY_FID;
805 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
808 GOTO(out_openerr, rc);
812 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
814 GOTO(out_och_free, rc = -ENOMEM);
818 /* md_intent_lock() didn't get a request ref if there was an
819 * open error, so don't do cleanup on the request here
821 /* XXX (green): Shouldn't we bail out on any error here, not
822 * just an open error? */
823 rc = it_open_error(DISP_OPEN_OPEN, it);
825 GOTO(out_och_free, rc);
827 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
828 "inode %p: disposition %x, status %d\n", inode,
829 it_disposition(it, ~0), it->it_status);
831 rc = ll_local_open(file, it, fd, *och_p);
833 GOTO(out_och_free, rc);
835 rc = pcc_file_open(inode, file);
837 GOTO(out_och_free, rc);
839 mutex_unlock(&lli->lli_och_mutex);
842 /* Must do this outside the lli_och_mutex lock to prevent a deadlock where a
843 different kind of OPEN lock for this same inode gets cancelled
844 by ldlm_cancel_lru */
845 if (!S_ISREG(inode->i_mode))
846 GOTO(out_och_free, rc);
848 cl_lov_delay_create_clear(&file->f_flags);
849 GOTO(out_och_free, rc);
853 if (och_p && *och_p) {
854 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
855 *och_p = NULL; /* OBD_FREE writes some magic there */
858 mutex_unlock(&lli->lli_och_mutex);
861 if (lli->lli_opendir_key == fd)
862 ll_deauthorize_statahead(inode, fd);
865 ll_file_data_put(fd);
867 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
871 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
872 ptlrpc_req_finished(it->it_request);
873 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
879 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
880 struct ldlm_lock_desc *desc, void *data, int flag)
883 struct lustre_handle lockh;
887 case LDLM_CB_BLOCKING:
888 ldlm_lock2handle(lock, &lockh);
889 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
891 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
895 case LDLM_CB_CANCELING:
903 * When setting a lease on a file, we take ownership of the lli_mds_*_och
904 * and save it as fd->fd_och so as to force the client to reopen the file even
905 * if it has an open lock in cache already.
907 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
908 struct lustre_handle *old_open_handle)
910 struct ll_inode_info *lli = ll_i2info(inode);
911 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
912 struct obd_client_handle **och_p;
917 /* Get the openhandle of the file */
918 mutex_lock(&lli->lli_och_mutex);
919 if (fd->fd_lease_och != NULL)
920 GOTO(out_unlock, rc = -EBUSY);
922 if (fd->fd_och == NULL) {
923 if (file->f_mode & FMODE_WRITE) {
924 LASSERT(lli->lli_mds_write_och != NULL);
925 och_p = &lli->lli_mds_write_och;
926 och_usecount = &lli->lli_open_fd_write_count;
928 LASSERT(lli->lli_mds_read_och != NULL);
929 och_p = &lli->lli_mds_read_och;
930 och_usecount = &lli->lli_open_fd_read_count;
933 if (*och_usecount > 1)
934 GOTO(out_unlock, rc = -EBUSY);
941 *old_open_handle = fd->fd_och->och_open_handle;
945 mutex_unlock(&lli->lli_och_mutex);
950 * Release ownership on lli_mds_*_och when putting back a file lease.
952 static int ll_lease_och_release(struct inode *inode, struct file *file)
954 struct ll_inode_info *lli = ll_i2info(inode);
955 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
956 struct obd_client_handle **och_p;
957 struct obd_client_handle *old_och = NULL;
962 mutex_lock(&lli->lli_och_mutex);
963 if (file->f_mode & FMODE_WRITE) {
964 och_p = &lli->lli_mds_write_och;
965 och_usecount = &lli->lli_open_fd_write_count;
967 och_p = &lli->lli_mds_read_och;
968 och_usecount = &lli->lli_open_fd_read_count;
971 /* The file may have been opened by another process (broken lease) so
972 * *och_p is not NULL. In this case we should simply increase the usecount
975 if (*och_p != NULL) {
976 old_och = fd->fd_och;
983 mutex_unlock(&lli->lli_och_mutex);
986 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
992 * Acquire a lease and open the file.
994 static struct obd_client_handle *
995 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
998 struct lookup_intent it = { .it_op = IT_OPEN };
999 struct ll_sb_info *sbi = ll_i2sbi(inode);
1000 struct md_op_data *op_data;
1001 struct ptlrpc_request *req = NULL;
1002 struct lustre_handle old_open_handle = { 0 };
1003 struct obd_client_handle *och = NULL;
1008 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
1009 RETURN(ERR_PTR(-EINVAL));
1012 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
1013 RETURN(ERR_PTR(-EPERM));
1015 rc = ll_lease_och_acquire(inode, file, &old_open_handle);
1017 RETURN(ERR_PTR(rc));
1022 RETURN(ERR_PTR(-ENOMEM));
1024 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
1025 LUSTRE_OPC_ANY, NULL);
1026 if (IS_ERR(op_data))
1027 GOTO(out, rc = PTR_ERR(op_data));
1029 /* To tell the MDT this openhandle is from the same owner */
1030 op_data->op_open_handle = old_open_handle;
1032 it.it_flags = fmode | open_flags;
1033 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
1034 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
1035 &ll_md_blocking_lease_ast,
1036 /* LDLM_FL_NO_LRU: do not put the lease lock into the LRU list, otherwise
1037 * it can be cancelled, which may mislead applications into thinking the lease is
 * broken;
1039 * LDLM_FL_EXCL: set this flag so that it won't be matched by a normal
1040 * open in ll_md_blocking_ast(). Otherwise, since ll_md_blocking_lease_ast()
1041 * doesn't deal with the openhandle, a normal openhandle would be leaked. */
1042 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
1043 ll_finish_md_op_data(op_data);
1044 ptlrpc_req_finished(req);
1046 GOTO(out_release_it, rc);
1048 if (it_disposition(&it, DISP_LOOKUP_NEG))
1049 GOTO(out_release_it, rc = -ENOENT);
1051 rc = it_open_error(DISP_OPEN_OPEN, &it);
1053 GOTO(out_release_it, rc);
1055 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
1056 ll_och_fill(sbi->ll_md_exp, &it, och);
1058 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
1059 GOTO(out_close, rc = -EOPNOTSUPP);
1061 /* lease already acquired; handle the lease lock */
1062 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
1063 if (it.it_lock_mode == 0 ||
1064 it.it_lock_bits != MDS_INODELOCK_OPEN) {
1065 /* an open lock must be returned for a lease */
1066 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
1067 PFID(ll_inode2fid(inode)), it.it_lock_mode,
1069 GOTO(out_close, rc = -EPROTO);
1072 ll_intent_release(&it);
1076 /* Cancel open lock */
1077 if (it.it_lock_mode != 0) {
1078 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
1080 it.it_lock_mode = 0;
1081 och->och_lease_handle.cookie = 0ULL;
1083 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
1085 CERROR("%s: error closing file "DFID": %d\n",
1086 sbi->ll_fsname, PFID(&ll_i2info(inode)->lli_fid), rc2);
1087 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
1089 ll_intent_release(&it);
1093 RETURN(ERR_PTR(rc));
1097 * Check whether a layout swap can be done between two inodes.
1099 * \param[in] inode1 First inode to check
1100 * \param[in] inode2 Second inode to check
1102 * \retval 0 on success, layout swap can be performed between both inodes
1103 * \retval negative error code if requirements are not met
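 *
 * As checked below, a swap is only allowed when both inodes are regular
 * files, the caller has write permission on both, and both live in the
 * same Lustre filesystem (same superblock).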
1105 static int ll_check_swap_layouts_validity(struct inode *inode1,
1106 struct inode *inode2)
1108 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
1111 if (inode_permission(inode1, MAY_WRITE) ||
1112 inode_permission(inode2, MAY_WRITE))
1115 if (inode1->i_sb != inode2->i_sb)
1121 static int ll_swap_layouts_close(struct obd_client_handle *och,
1122 struct inode *inode, struct inode *inode2)
1124 const struct lu_fid *fid1 = ll_inode2fid(inode);
1125 const struct lu_fid *fid2;
1129 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
1130 ll_i2sbi(inode)->ll_fsname, PFID(fid1));
1132 rc = ll_check_swap_layouts_validity(inode, inode2);
1134 GOTO(out_free_och, rc);
1136 /* We now know that inode2 is a lustre inode */
1137 fid2 = ll_inode2fid(inode2);
1139 rc = lu_fid_cmp(fid1, fid2);
1141 GOTO(out_free_och, rc = -EINVAL);
1143 /* Close the file and {swap,merge} layouts between inode & inode2.
1144 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
1145 * because we still need it to pack l_remote_handle to MDT. */
1146 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
1149 och = NULL; /* freed in ll_close_inode_openhandle() */
1159 * Release the lease and close the file.
1160 * It also checks whether the lease was ever broken.
1162 static int ll_lease_close_intent(struct obd_client_handle *och,
1163 struct inode *inode,
1164 bool *lease_broken, enum mds_op_bias bias,
1167 struct ldlm_lock *lock;
1168 bool cancelled = true;
1172 lock = ldlm_handle2lock(&och->och_lease_handle);
1174 lock_res_and_lock(lock);
1175 cancelled = ldlm_is_cancel(lock);
1176 unlock_res_and_lock(lock);
1177 LDLM_LOCK_PUT(lock);
1180 CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1181 PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1183 if (lease_broken != NULL)
1184 *lease_broken = cancelled;
1186 if (!cancelled && !bias)
1187 ldlm_cli_cancel(&och->och_lease_handle, 0);
1189 if (cancelled) { /* no need to execute the intent */
1194 rc = ll_close_inode_openhandle(inode, och, bias, data);
1198 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1201 return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1205 * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
1207 static int ll_lease_file_resync(struct obd_client_handle *och,
1208 struct inode *inode, unsigned long arg)
1210 struct ll_sb_info *sbi = ll_i2sbi(inode);
1211 struct md_op_data *op_data;
1212 struct ll_ioc_lease_id ioc;
1213 __u64 data_version_unused;
1217 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1218 LUSTRE_OPC_ANY, NULL);
1219 if (IS_ERR(op_data))
1220 RETURN(PTR_ERR(op_data));
1222 if (copy_from_user(&ioc, (struct ll_ioc_lease_id __user *)arg,
1226 /* before starting file resync, it's necessary to clean up the page cache
1227 * in client memory, otherwise once the layout version is increased,
1228 * writing back cached data will be denied by the OSTs. */
1229 rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1233 op_data->op_lease_handle = och->och_lease_handle;
1234 op_data->op_mirror_id = ioc.lil_mirror_id;
1235 rc = md_file_resync(sbi->ll_md_exp, op_data);
1241 ll_finish_md_op_data(op_data);
1245 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1247 struct ll_inode_info *lli = ll_i2info(inode);
1248 struct cl_object *obj = lli->lli_clob;
1249 struct cl_attr *attr = vvp_env_thread_attr(env);
1257 ll_inode_size_lock(inode);
1259 /* Merge the timestamps most recently obtained from the MDS with the
1260 * timestamps obtained from the OSTs.
1262 * Do not overwrite atime of inode because it may be refreshed
1263 * by file_accessed() function. If the read was served by cache
1264 * data, there is no RPC to be sent so that atime may not be
1265 * transferred to OSTs at all. MDT only updates atime at close time
1266 * if it's at least 'mdd.*.atime_diff' older.
1267 * All in all, the atime in Lustre does not strictly comply with
1268 * POSIX. Solving this problem would require sending an RPC to the MDT for each
1269 * read, which would hurt performance.
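 *
 * In short, the code below keeps the newer of the already-merged MDS
 * timestamps and the OST-reported timestamps, and takes size and blocks
 * from the OSTs.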
1271 if (inode->i_atime.tv_sec < lli->lli_atime ||
1272 lli->lli_update_atime) {
1273 inode->i_atime.tv_sec = lli->lli_atime;
1274 lli->lli_update_atime = 0;
1276 inode->i_mtime.tv_sec = lli->lli_mtime;
1277 inode->i_ctime.tv_sec = lli->lli_ctime;
1279 mtime = inode->i_mtime.tv_sec;
1280 atime = inode->i_atime.tv_sec;
1281 ctime = inode->i_ctime.tv_sec;
1283 cl_object_attr_lock(obj);
1284 if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1287 rc = cl_object_attr_get(env, obj, attr);
1288 cl_object_attr_unlock(obj);
1291 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1293 if (atime < attr->cat_atime)
1294 atime = attr->cat_atime;
1296 if (ctime < attr->cat_ctime)
1297 ctime = attr->cat_ctime;
1299 if (mtime < attr->cat_mtime)
1300 mtime = attr->cat_mtime;
1302 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1303 PFID(&lli->lli_fid), attr->cat_size);
1305 i_size_write(inode, attr->cat_size);
1306 inode->i_blocks = attr->cat_blocks;
1308 inode->i_mtime.tv_sec = mtime;
1309 inode->i_atime.tv_sec = atime;
1310 inode->i_ctime.tv_sec = ctime;
1313 ll_inode_size_unlock(inode);
1319 * Set designated mirror for I/O.
1321 * So far only read, write, and truncate support issuing I/O to a
1322 * designated mirror.
1324 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1326 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1328 /* clear layout version for generic(non-resync) I/O in case it carries
1329 * stale layout version due to I/O restart */
1330 io->ci_layout_version = 0;
1332 /* FLR: disable non-delay for designated mirror I/O because obviously
1333 * only one mirror is available */
1334 if (fd->fd_designated_mirror > 0) {
1336 io->ci_designated_mirror = fd->fd_designated_mirror;
1337 io->ci_layout_version = fd->fd_layout_version;
1340 CDEBUG(D_VFSTRACE, "%s: designated mirror: %d\n",
1341 file->f_path.dentry->d_name.name, io->ci_designated_mirror);
1344 static bool file_is_noatime(const struct file *file)
1346 const struct vfsmount *mnt = file->f_path.mnt;
1347 const struct inode *inode = file_inode((struct file *)file);
1349 /* Adapted from file_accessed() and touch_atime().*/
1350 if (file->f_flags & O_NOATIME)
1353 if (inode->i_flags & S_NOATIME)
1356 if (IS_NOATIME(inode))
1359 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1362 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1365 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1371 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1373 struct inode *inode = file_inode(file);
1374 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1376 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1377 io->ci_lock_no_expand = fd->ll_lock_no_expand;
1379 if (iot == CIT_WRITE) {
1380 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1381 io->u.ci_wr.wr_sync = !!(file->f_flags & O_SYNC ||
1382 file->f_flags & O_DIRECT ||
1385 io->ci_obj = ll_i2info(inode)->lli_clob;
1386 io->ci_lockreq = CILR_MAYBE;
1387 if (ll_file_nolock(file)) {
1388 io->ci_lockreq = CILR_NEVER;
1389 io->ci_no_srvlock = 1;
1390 } else if (file->f_flags & O_APPEND) {
1391 io->ci_lockreq = CILR_MANDATORY;
1393 io->ci_noatime = file_is_noatime(file);
1395 /* FLR: only use non-delay I/O for read as there is only one
1396 * available mirror for write. */
1397 io->ci_ndelay = !(iot == CIT_WRITE);
1399 ll_io_set_mirror(io, file);
1402 static void ll_heat_add(struct inode *inode, enum cl_io_type iot,
1405 struct ll_inode_info *lli = ll_i2info(inode);
1406 struct ll_sb_info *sbi = ll_i2sbi(inode);
1407 enum obd_heat_type sample_type;
1408 enum obd_heat_type iobyte_type;
1409 __u64 now = ktime_get_real_seconds();
1411 if (!ll_sbi_has_file_heat(sbi) ||
1412 lli->lli_heat_flags & LU_HEAT_FLAG_OFF)
1415 if (iot == CIT_READ) {
1416 sample_type = OBD_HEAT_READSAMPLE;
1417 iobyte_type = OBD_HEAT_READBYTE;
1418 } else if (iot == CIT_WRITE) {
1419 sample_type = OBD_HEAT_WRITESAMPLE;
1420 iobyte_type = OBD_HEAT_WRITEBYTE;
1425 spin_lock(&lli->lli_heat_lock);
1426 obd_heat_add(&lli->lli_heat_instances[sample_type], now, 1,
1427 sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1428 obd_heat_add(&lli->lli_heat_instances[iobyte_type], now, count,
1429 sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1430 spin_unlock(&lli->lli_heat_lock);
1434 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1435 struct file *file, enum cl_io_type iot,
1436 loff_t *ppos, size_t count)
1438 struct vvp_io *vio = vvp_env_io(env);
1439 struct inode *inode = file_inode(file);
1440 struct ll_inode_info *lli = ll_i2info(inode);
1441 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1442 struct range_lock range;
1446 unsigned retried = 0;
1447 bool restarted = false;
1451 CDEBUG(D_VFSTRACE, "%s: %s ppos: %llu, count: %zu\n",
1452 file_dentry(file)->d_name.name,
1453 iot == CIT_READ ? "read" : "write", *ppos, count);
1456 io = vvp_env_thread_io(env);
1457 ll_io_init(io, file, iot);
1458 io->ci_ndelay_tried = retried;
1460 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1461 bool range_locked = false;
1463 if (file->f_flags & O_APPEND)
1464 range_lock_init(&range, 0, LUSTRE_EOF);
1466 range_lock_init(&range, *ppos, *ppos + count - 1);
1468 vio->vui_fd = LUSTRE_FPRIVATE(file);
1469 vio->vui_io_subtype = args->via_io_subtype;
1471 switch (vio->vui_io_subtype) {
1473 vio->vui_iter = args->u.normal.via_iter;
1474 vio->vui_iocb = args->u.normal.via_iocb;
1475 /* Direct IO reads must also take the range lock,
1476 * or multiple reads will try to work on the same pages.
1477 * See LU-6227 for details. */
1478 if (((iot == CIT_WRITE) ||
1479 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1480 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1481 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1483 rc = range_lock(&lli->lli_write_tree, &range);
1487 range_locked = true;
1491 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1492 vio->u.splice.vui_flags = args->u.splice.via_flags;
1495 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1499 ll_cl_add(file, env, io, LCC_RW);
1500 rc = cl_io_loop(env, io);
1501 ll_cl_remove(file, env);
1504 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1506 range_unlock(&lli->lli_write_tree, &range);
1509 /* cl_io_rw_init() handled IO */
1513 if (io->ci_nob > 0) {
1514 result += io->ci_nob;
1515 count -= io->ci_nob;
1516 *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1518 /* prepare IO restart */
1519 if (count > 0 && args->via_io_subtype == IO_NORMAL)
1520 args->u.normal.via_iter = vio->vui_iter;
1523 cl_io_fini(env, io);
1526 "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1527 file->f_path.dentry->d_name.name,
1528 iot, rc, result, io->ci_need_restart);
1530 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1532 "%s: restart %s from %lld, count: %zu, ret: %zd, rc: %d\n",
1533 file_dentry(file)->d_name.name,
1534 iot == CIT_READ ? "read" : "write",
1535 *ppos, count, result, rc);
1536 /* preserve the tried count for FLR */
1537 retried = io->ci_ndelay_tried;
1542 if (iot == CIT_READ) {
1544 ll_stats_ops_tally(ll_i2sbi(inode),
1545 LPROC_LL_READ_BYTES, result);
1546 } else if (iot == CIT_WRITE) {
1548 ll_stats_ops_tally(ll_i2sbi(inode),
1549 LPROC_LL_WRITE_BYTES, result);
1550 fd->fd_write_failed = false;
1551 } else if (result == 0 && rc == 0) {
1554 fd->fd_write_failed = true;
1556 fd->fd_write_failed = false;
1557 } else if (rc != -ERESTARTSYS) {
1558 fd->fd_write_failed = true;
1562 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1564 ll_heat_add(inode, iot, result);
1566 RETURN(result > 0 ? result : rc);
1570 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1571 * especially for small I/O.
1573 * To serve a read request, CLIO has to create and initialize a cl_io and
1574 * then request a DLM lock. This has turned out to have significant overhead
1575 * and affects the performance of small I/O dramatically.
1577 * It's not necessary to create a cl_io for each I/O. With the help of read
1578 * ahead, most of the pages being read are already in the memory cache and we can
1579 * read those pages directly because if the pages exist, the corresponding DLM
1580 * lock must exist so that page content must be valid.
1582 * In fast read implementation, the llite speculatively finds and reads pages
1583 * in memory cache. There are three scenarios for fast read:
1584 * - If the page exists and is uptodate, the kernel VM will provide the data and
1585 * CLIO won't be involved;
1586 * - If the page was brought into memory by read ahead, it will be exported
1587 * and read ahead parameters will be updated;
1588 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1589 * it will go back and invoke normal read, i.e., a cl_io will be created
1590 * and DLM lock will be requested.
1592 * POSIX compliance: posix standard states that read is intended to be atomic.
1593 * Lustre read implementation is in line with Linux kernel read implementation
1594 * and neither of them complies with POSIX standard in this matter. Fast read
1595 * doesn't make the situation worse on single node but it may interleave write
1596 * results from multiple nodes due to short read handling in ll_file_aio_read().
1598 * \param env - lu_env
1599 * \param iocb - kiocb from kernel
1600 * \param iter - user space buffers where the data will be copied
1602 * \retval - number of bytes read, or an error code if an error occurred.
1605 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1609 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1612 /* NB: we can't do direct IO for fast read because it will need a lock
1613 * to make IO engine happy. */
1614 if (iocb->ki_filp->f_flags & O_DIRECT)
1617 result = generic_file_read_iter(iocb, iter);
1619 /* If the first page is not in the cache, generic_file_read_iter() will
1620 * return -ENODATA.
1621 * See the corresponding code in ll_readpage(). */
1622 if (result == -ENODATA)
1626 ll_heat_add(file_inode(iocb->ki_filp), CIT_READ, result);
1627 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1628 LPROC_LL_READ_BYTES, result);
1635 * Read from a file (through the page cache).
1637 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1640 struct vvp_io_args *args;
1644 bool cached = false;
1647 * Currently, when a PCC read fails we do not fall back to the
1648 * normal read path; we just return the error.
1649 * The reason is that for RW-PCC the file data may have been modified
1650 * in the PCC and be inconsistent with the data on the OSTs (or the file
1651 * data may have been removed from the Lustre file system); in that
1652 * case, falling back to the normal read path may read the wrong
 * data.
1654 * TODO: for RO-PCC (readonly PCC), fall back to the normal read
1655 * path: read the data from the data copy on the OSTs.
1657 result = pcc_file_read_iter(iocb, to, &cached);
1661 ll_ras_enter(iocb->ki_filp);
1663 result = ll_do_fast_read(iocb, to);
1664 if (result < 0 || iov_iter_count(to) == 0)
1667 env = cl_env_get(&refcheck);
1669 return PTR_ERR(env);
1671 args = ll_env_args(env, IO_NORMAL);
1672 args->u.normal.via_iter = to;
1673 args->u.normal.via_iocb = iocb;
1675 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1676 &iocb->ki_pos, iov_iter_count(to));
1679 else if (result == 0)
1682 cl_env_put(env, &refcheck);
1688 * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1689 * If a page is already in the page cache and dirty (and some other things -
1690 * See ll_tiny_write_begin for the instantiation of these rules), then we can
1691 * write to it without doing a full I/O, because Lustre already knows about it
1692 * and will write it out. This saves a lot of processing time.
1694 * All writes here are within one page, so exclusion is handled by the page
1695 * lock on the vm page. We do not do tiny writes for writes which touch
1696 * multiple pages because it's very unlikely that multiple sequential pages
1697 * are already dirty.
1699 * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1700 * and are unlikely to go to already-dirty pages.
1702 * Attribute updates are important here, we do them in ll_tiny_write_end.
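 *
 * Worked example of the single-page restriction enforced below (assuming
 * PAGE_SIZE == 4096): a 512-byte write at pos 1024 stays within one page
 * ((1024 & 4095) + 512 <= 4096) and may be handled here, while a 512-byte
 * write at pos 3900 crosses a page boundary ((3900 & 4095) + 512 > 4096)
 * and falls back to the normal write path.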
1704 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1706 ssize_t count = iov_iter_count(iter);
1707 struct file *file = iocb->ki_filp;
1708 struct inode *inode = file_inode(file);
1709 bool lock_inode = !IS_NOSEC(inode);
1714 /* Restrict writes to a single page and < PAGE_SIZE. See the comment at the top
1715 * of the function for why.
1717 if (count >= PAGE_SIZE ||
1718 (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
1721 if (unlikely(lock_inode))
1723 result = __generic_file_write_iter(iocb, iter);
1725 if (unlikely(lock_inode))
1726 inode_unlock(inode);
1728 /* If the page is not already dirty, ll_tiny_write_begin returns
1729 * -ENODATA. We continue on to normal write.
1731 if (result == -ENODATA)
1735 ll_heat_add(inode, CIT_WRITE, result);
1736 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1738 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1741 CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1747 * Write to a file (through the page cache).
1749 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1751 struct vvp_io_args *args;
1753 ssize_t rc_tiny = 0, rc_normal;
1755 bool cached = false;
1761 * When a PCC write fails, we do not fall back to the normal
1762 * write path; we just return the error. The reason is that
1763 * PCC is actually an HSM device, and HSM does not handle
1764 * failures well, especially -ENOSPC when space runs out; moreover,
1765 * falling back to the normal I/O path on an ENOSPC failure would need
1766 * to restore the file data to the OSTs first and redo the write
1767 * again, making the PCC logic very complex.
1769 result = pcc_file_write_iter(iocb, from, &cached);
1773 /* NB: we can't do direct IO for tiny writes because they use the page
1774 * cache, we can't do sync writes because tiny writes can't flush
1775 * pages, and we can't do append writes because we can't guarantee the
1776 * required DLM locks are held to protect file size.
1778 if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(iocb->ki_filp))) &&
1779 !(iocb->ki_filp->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1780 rc_tiny = ll_do_tiny_write(iocb, from);
1782 /* In case of error, go on and try a normal write. Only stop if the tiny
1783 * write completed the I/O.
1785 if (iov_iter_count(from) == 0)
1786 GOTO(out, rc_normal = rc_tiny);
1788 env = cl_env_get(&refcheck);
1790 return PTR_ERR(env);
1792 args = ll_env_args(env, IO_NORMAL);
1793 args->u.normal.via_iter = from;
1794 args->u.normal.via_iocb = iocb;
1796 rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1797 &iocb->ki_pos, iov_iter_count(from));
1799 /* On success, combine bytes written. */
1800 if (rc_tiny >= 0 && rc_normal > 0)
1801 rc_normal += rc_tiny;
1802 /* On error, only return error from normal write if tiny write did not
1803 * write any bytes. Otherwise return bytes written by tiny write.
1805 else if (rc_tiny > 0)
1806 rc_normal = rc_tiny;
1808 cl_env_put(env, &refcheck);
1813 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1815 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1817 static int ll_file_get_iov_count(const struct iovec *iov,
1818 unsigned long *nr_segs, size_t *count)
1823 for (seg = 0; seg < *nr_segs; seg++) {
1824 const struct iovec *iv = &iov[seg];
1827 * If any segment has a negative length, or the cumulative
1828 * length ever wraps negative then return -EINVAL.
1831 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1833 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1838 cnt -= iv->iov_len; /* This segment is no good */
1845 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1846 unsigned long nr_segs, loff_t pos)
1853 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1857 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1858 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1859 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1860 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1861 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1863 result = ll_file_read_iter(iocb, &to);
1868 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1871 struct iovec iov = { .iov_base = buf, .iov_len = count };
1876 init_sync_kiocb(&kiocb, file);
1877 kiocb.ki_pos = *ppos;
1878 #ifdef HAVE_KIOCB_KI_LEFT
1879 kiocb.ki_left = count;
1880 #elif defined(HAVE_KI_NBYTES)
1881 kiocb.ki_nbytes = count;
1884 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1885 *ppos = kiocb.ki_pos;
1891 * Write to a file (through the page cache).
1894 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1895 unsigned long nr_segs, loff_t pos)
1897 struct iov_iter from;
1902 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1906 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1907 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1908 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1909 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1910 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1912 result = ll_file_write_iter(iocb, &from);
1917 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1918 size_t count, loff_t *ppos)
1920 struct iovec iov = { .iov_base = (void __user *)buf,
1927 init_sync_kiocb(&kiocb, file);
1928 kiocb.ki_pos = *ppos;
1929 #ifdef HAVE_KIOCB_KI_LEFT
1930 kiocb.ki_left = count;
1931 #elif defined(HAVE_KI_NBYTES)
1932 kiocb.ki_nbytes = count;
1935 result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1936 *ppos = kiocb.ki_pos;
1940 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1943 * Send file content (through pagecache) somewhere with helper
1945 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1946 struct pipe_inode_info *pipe, size_t count,
1950 struct vvp_io_args *args;
1953 struct ll_file_data *fd = LUSTRE_FPRIVATE(in_file);
1954 struct file *pcc_file = fd->fd_pcc_file.pccf_file;
1958 /* pcc cache path */
1959 if (pcc_file && file_inode(pcc_file)->i_fop->splice_read)
1960 return file_inode(pcc_file)->i_fop->splice_read(pcc_file,
1961 ppos, pipe, count, flags);
1963 ll_ras_enter(in_file);
1965 env = cl_env_get(&refcheck);
1967 RETURN(PTR_ERR(env));
1969 args = ll_env_args(env, IO_SPLICE);
1970 args->u.splice.via_pipe = pipe;
1971 args->u.splice.via_flags = flags;
1973 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1974 cl_env_put(env, &refcheck);
1978 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1979 __u64 flags, struct lov_user_md *lum, int lum_size)
1981 struct lookup_intent oit = {
1983 .it_flags = flags | MDS_OPEN_BY_FID,
1988 ll_inode_size_lock(inode);
1989 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1991 GOTO(out_unlock, rc);
1993 ll_release_openhandle(dentry, &oit);
1996 ll_inode_size_unlock(inode);
1997 ll_intent_release(&oit);
2002 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
2003 struct lov_mds_md **lmmp, int *lmm_size,
2004 struct ptlrpc_request **request)
2006 struct ll_sb_info *sbi = ll_i2sbi(inode);
2007 struct mdt_body *body;
2008 struct lov_mds_md *lmm = NULL;
2009 struct ptlrpc_request *req = NULL;
2010 struct md_op_data *op_data;
2013 rc = ll_get_default_mdsize(sbi, &lmmsize);
2017 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
2018 strlen(filename), lmmsize,
2019 LUSTRE_OPC_ANY, NULL);
2020 if (IS_ERR(op_data))
2021 RETURN(PTR_ERR(op_data));
2023 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
2024 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
2025 ll_finish_md_op_data(op_data);
2027 CDEBUG(D_INFO, "md_getattr_name failed "
2028 "on %s: rc %d\n", filename, rc);
2032 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
2033 LASSERT(body != NULL); /* checked by mdc_getattr_name */
2035 lmmsize = body->mbo_eadatasize;
2037 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
2039 GOTO(out, rc = -ENODATA);
2042 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
2043 LASSERT(lmm != NULL);
2045 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
2046 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
2047 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1) &&
2048 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_FOREIGN))
2049 GOTO(out, rc = -EPROTO);
2052 * This is coming from the MDS, so is probably in
2053 * little endian. We convert it to host endian before
2054 * passing it to userspace.
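 *
 * Note (explanatory only): LOV_MAGIC != cpu_to_le32(LOV_MAGIC) can only
 * be true on a big-endian host, so the swabbing below is skipped entirely
 * on little-endian clients.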
2056 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
2059 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
2060 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2061 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
2062 if (le32_to_cpu(lmm->lmm_pattern) &
2063 LOV_PATTERN_F_RELEASED)
2067 /* if the function is called for a directory, we should
2068 * avoid swabbing non-existent lsm objects */
2069 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
2070 lustre_swab_lov_user_md_v1(
2071 (struct lov_user_md_v1 *)lmm);
2072 if (S_ISREG(body->mbo_mode))
2073 lustre_swab_lov_user_md_objects(
2074 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
2076 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2077 lustre_swab_lov_user_md_v3(
2078 (struct lov_user_md_v3 *)lmm);
2079 if (S_ISREG(body->mbo_mode))
2080 lustre_swab_lov_user_md_objects(
2081 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
2083 } else if (lmm->lmm_magic ==
2084 cpu_to_le32(LOV_MAGIC_COMP_V1)) {
2085 lustre_swab_lov_comp_md_v1(
2086 (struct lov_comp_md_v1 *)lmm);
2087 } else if (lmm->lmm_magic ==
2088 cpu_to_le32(LOV_MAGIC_FOREIGN)) {
2089 struct lov_foreign_md *lfm;
2091 lfm = (struct lov_foreign_md *)lmm;
2092 __swab32s(&lfm->lfm_magic);
2093 __swab32s(&lfm->lfm_length);
2094 __swab32s(&lfm->lfm_type);
2095 __swab32s(&lfm->lfm_flags);
2101 *lmm_size = lmmsize;
2106 static int ll_lov_setea(struct inode *inode, struct file *file,
2109 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2110 struct lov_user_md *lump;
2111 int lum_size = sizeof(struct lov_user_md) +
2112 sizeof(struct lov_user_ost_data);
2116 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2119 OBD_ALLOC_LARGE(lump, lum_size);
2123 if (copy_from_user(lump, arg, lum_size))
2124 GOTO(out_lump, rc = -EFAULT);
2126 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
2128 cl_lov_delay_create_clear(&file->f_flags);
2131 OBD_FREE_LARGE(lump, lum_size);
2135 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
2142 env = cl_env_get(&refcheck);
2144 RETURN(PTR_ERR(env));
2146 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2147 cl_env_put(env, &refcheck);
2151 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2154 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2155 struct lov_user_md *klum;
2157 __u64 flags = FMODE_WRITE;
2160 rc = ll_copy_user_md(lum, &klum);
2165 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
2170 rc = put_user(0, &lum->lmm_stripe_count);
2174 rc = ll_layout_refresh(inode, &gen);
2178 rc = ll_file_getstripe(inode, arg, lum_size);
2180 cl_lov_delay_create_clear(&file->f_flags);
2183 OBD_FREE(klum, lum_size);
2188 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2190 struct ll_inode_info *lli = ll_i2info(inode);
2191 struct cl_object *obj = lli->lli_clob;
2192 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2193 struct ll_grouplock grouplock;
2198 CWARN("group id for group lock must not be 0\n");
2202 if (ll_file_nolock(file))
2203 RETURN(-EOPNOTSUPP);
2205 spin_lock(&lli->lli_lock);
2206 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2207 CWARN("group lock already existed with gid %lu\n",
2208 fd->fd_grouplock.lg_gid);
2209 spin_unlock(&lli->lli_lock);
2212 LASSERT(fd->fd_grouplock.lg_lock == NULL);
2213 spin_unlock(&lli->lli_lock);
2216 * XXX: the group lock needs to protect all OST objects while PFL
2217 * can add new OST objects during the IO, so we instantiate
2218 * all OST objects before taking the group lock.
2223 struct cl_layout cl = {
2224 .cl_is_composite = false,
2226 struct lu_extent ext = {
2228 .e_end = OBD_OBJECT_EOF,
2231 env = cl_env_get(&refcheck);
2233 RETURN(PTR_ERR(env));
2235 rc = cl_object_layout_get(env, obj, &cl);
2236 if (!rc && cl.cl_is_composite)
2237 rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2240 cl_env_put(env, &refcheck);
2245 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2246 arg, (file->f_flags & O_NONBLOCK), &grouplock);
2250 spin_lock(&lli->lli_lock);
2251 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2252 spin_unlock(&lli->lli_lock);
2253 CERROR("another thread just won the race\n");
2254 cl_put_grouplock(&grouplock);
2258 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2259 fd->fd_grouplock = grouplock;
2260 spin_unlock(&lli->lli_lock);
2262 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
2266 static int ll_put_grouplock(struct inode *inode, struct file *file,
2269 struct ll_inode_info *lli = ll_i2info(inode);
2270 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2271 struct ll_grouplock grouplock;
2274 spin_lock(&lli->lli_lock);
2275 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2276 spin_unlock(&lli->lli_lock);
2277 CWARN("no group lock held\n");
2281 LASSERT(fd->fd_grouplock.lg_lock != NULL);
2283 if (fd->fd_grouplock.lg_gid != arg) {
2284 CWARN("group lock %lu doesn't match current id %lu\n",
2285 arg, fd->fd_grouplock.lg_gid);
2286 spin_unlock(&lli->lli_lock);
2290 grouplock = fd->fd_grouplock;
2291 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2292 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2293 spin_unlock(&lli->lli_lock);
2295 cl_put_grouplock(&grouplock);
2296 CDEBUG(D_INFO, "group lock %lu released\n", arg);
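/*
 * ll_get_grouplock() and ll_put_grouplock() are reached through the
 * LL_IOC_GROUP_LOCK and LL_IOC_GROUP_UNLOCK ioctls.  Illustrative
 * userspace sketch only ("fd" and the non-zero "gid" are assumptions,
 * not taken from this file):
 *
 *	if (ioctl(fd, LL_IOC_GROUP_LOCK, gid) == 0) {
 *		... I/O done here is covered by the group lock ...
 *		ioctl(fd, LL_IOC_GROUP_UNLOCK, gid);
 *	}
 */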
2301 * Close an inode's open handle.
2303 * \param dentry [in] dentry which contains the inode
2304 * \param it [in,out] intent which contains open info and result
2307 * \retval <0 failure
2309 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2311 struct inode *inode = dentry->d_inode;
2312 struct obd_client_handle *och;
2318 /* Root? Do nothing. */
2319 if (dentry->d_inode->i_sb->s_root == dentry)
2322 /* No open handle to close? Move away */
2323 if (!it_disposition(it, DISP_OPEN_OPEN))
2326 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2328 OBD_ALLOC(och, sizeof(*och));
2330 GOTO(out, rc = -ENOMEM);
2332 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2334 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2336 /* this one is in place of ll_file_open */
2337 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2338 ptlrpc_req_finished(it->it_request);
2339 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2345 * Get the size of the inode for which the FIEMAP mapping is requested.
2346 * Make the FIEMAP get_info call and return the result.
2347 * \param fiemap	kernel buffer to hold the extents
2348 * \param num_bytes	kernel buffer size
2350 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2356 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2359 /* Checks for fiemap flags */
2360 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2361 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2365 /* Check for FIEMAP_FLAG_SYNC */
2366 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2367 rc = filemap_fdatawrite(inode->i_mapping);
2372 env = cl_env_get(&refcheck);
2374 RETURN(PTR_ERR(env));
2376 if (i_size_read(inode) == 0) {
2377 rc = ll_glimpse_size(inode);
2382 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2383 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2384 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2386 /* If the file size is 0, then there are no objects to map */
2387 if (fmkey.lfik_oa.o_size == 0) {
2388 fiemap->fm_mapped_extents = 0;
2392 fmkey.lfik_fiemap = *fiemap;
2394 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2395 &fmkey, fiemap, &num_bytes);
2397 cl_env_put(env, &refcheck);
2401 int ll_fid2path(struct inode *inode, void __user *arg)
2403 struct obd_export *exp = ll_i2mdexp(inode);
2404 const struct getinfo_fid2path __user *gfin = arg;
2406 struct getinfo_fid2path *gfout;
2412 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2413 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2416 /* Only need to get the buflen */
2417 if (get_user(pathlen, &gfin->gf_pathlen))
2420 if (pathlen > PATH_MAX)
2423 outsize = sizeof(*gfout) + pathlen;
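/* gfout is variable-length: a path buffer of gf_pathlen bytes follows the
 * fixed getinfo_fid2path header, so allocate header and path in one chunk */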
2424 OBD_ALLOC(gfout, outsize);
2428 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2429 GOTO(gf_free, rc = -EFAULT);
2430 /* append the root FID after gfout to let the MDT know the root FID so
2431 * that it can look up the correct path; this is mainly for filesets.
2432 * Old servers without fileset mount support will ignore this. */
2433 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2435 /* Call mdc_iocontrol */
2436 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2440 if (copy_to_user(arg, gfout, outsize))
2444 OBD_FREE(gfout, outsize);
2449 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2451 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2459 ioc->idv_version = 0;
2460 ioc->idv_layout_version = UINT_MAX;
2462 /* If no file object has been initialized, we consider its version to be 0. */
2466 env = cl_env_get(&refcheck);
2468 RETURN(PTR_ERR(env));
2470 io = vvp_env_thread_io(env);
2472 io->u.ci_data_version.dv_data_version = 0;
2473 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2474 io->u.ci_data_version.dv_flags = ioc->idv_flags;
2477 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2478 result = cl_io_loop(env, io);
2480 result = io->ci_result;
2482 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2483 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2485 cl_io_fini(env, io);
2487 if (unlikely(io->ci_need_restart))
2490 cl_env_put(env, &refcheck);
2496 * Read the data_version for an inode.
2498 * This value is computed using the stripe object versions on the OSTs.
2499 * The version is computed using server-side locking.
2501 * @param flags whether to sync on the OST side:
2503 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2504 * LL_DV_WR_FLUSH: drop all cached pages, LCK_PW on OSTs
2506 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2508 struct ioc_data_version ioc = { .idv_flags = flags };
2511 rc = ll_ioc_data_version(inode, &ioc);
2513 *data_version = ioc.idv_version;
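	/*
	 * Illustrative in-kernel usage (a sketch, not a quote from this
	 * file):
	 *
	 *	__u64 dv;
	 *	rc = ll_data_version(inode, &dv, LL_DV_RD_FLUSH);
	 *
	 * ll_hsm_release() below uses LL_DV_WR_FLUSH instead, so that
	 * cached pages are dropped before the version is sampled.
	 */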
2519 * Trigger a HSM release request for the provided inode.
2521 int ll_hsm_release(struct inode *inode)
2524 struct obd_client_handle *och = NULL;
2525 __u64 data_version = 0;
2530 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2531 ll_i2sbi(inode)->ll_fsname,
2532 PFID(&ll_i2info(inode)->lli_fid));
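	/* Release sequence: take a write lease with MDS_OPEN_RELEASE, flush
	 * and grab the latest data_version, merge the [am]time/size
	 * attributes, then close the handle with the MDS_HSM_RELEASE bias so
	 * the MDT can perform the actual release. */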
2534 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2536 GOTO(out, rc = PTR_ERR(och));
2538 /* Grab latest data_version and [am]time values */
2539 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2543 env = cl_env_get(&refcheck);
2545 GOTO(out, rc = PTR_ERR(env));
2547 rc = ll_merge_attr(env, inode);
2548 cl_env_put(env, &refcheck);
2550 /* If an error happened, we have the wrong size for the file.
2556 /* Release the file.
2557 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2558 * we still need it to pack l_remote_handle to MDT. */
2559 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2565 if (och != NULL && !IS_ERR(och)) /* close the file */
2566 ll_lease_close(och, inode, NULL);
2571 struct ll_swap_stack {
2574 struct inode *inode1;
2575 struct inode *inode2;
2580 static int ll_swap_layouts(struct file *file1, struct file *file2,
2581 struct lustre_swap_layouts *lsl)
2583 struct mdc_swap_layouts msl;
2584 struct md_op_data *op_data;
2587 struct ll_swap_stack *llss = NULL;
2590 OBD_ALLOC_PTR(llss);
2594 llss->inode1 = file_inode(file1);
2595 llss->inode2 = file_inode(file2);
2597 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2601 /* we use 2 bools because they are easier to swap than 2 bits */
2602 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2603 llss->check_dv1 = true;
2605 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2606 llss->check_dv2 = true;
2608 /* we cannot use lsl->sl_dvX directly because we may swap them */
2609 llss->dv1 = lsl->sl_dv1;
2610 llss->dv2 = lsl->sl_dv2;
2612 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2613 if (rc == 0) /* same file, done! */
2616 if (rc < 0) { /* sequentialize it */
2617 swap(llss->inode1, llss->inode2);
2619 swap(llss->dv1, llss->dv2);
2620 swap(llss->check_dv1, llss->check_dv2);
2624 if (gid != 0) { /* application asks to flush dirty cache */
2625 rc = ll_get_grouplock(llss->inode1, file1, gid);
2629 rc = ll_get_grouplock(llss->inode2, file2, gid);
2631 ll_put_grouplock(llss->inode1, file1, gid);
2636 /* final check: before swapping the layouts we check whether the
2637 * data version has changed (if requested) */
2638 if (llss->check_dv1) {
2639 rc = ll_data_version(llss->inode1, &dv, 0);
2642 if (dv != llss->dv1)
2643 GOTO(putgl, rc = -EAGAIN);
2646 if (llss->check_dv2) {
2647 rc = ll_data_version(llss->inode2, &dv, 0);
2650 if (dv != llss->dv2)
2651 GOTO(putgl, rc = -EAGAIN);
2654 /* struct md_op_data is used to send the swap args to the mdt;
2655 * only the flags are missing, so we pass struct mdc_swap_layouts
2656 * through md_op_data->op_data */
2657 /* flags from user space have to be converted before they are sent to
2658 * the server; no flag is sent today, they are only used on the client */
2661 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2662 0, LUSTRE_OPC_ANY, &msl);
2663 if (IS_ERR(op_data))
2664 GOTO(free, rc = PTR_ERR(op_data));
2666 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2667 sizeof(*op_data), op_data, NULL);
2668 ll_finish_md_op_data(op_data);
2675 ll_put_grouplock(llss->inode2, file2, gid);
2676 ll_put_grouplock(llss->inode1, file1, gid);
2686 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2688 struct obd_export *exp = ll_i2mdexp(inode);
2689 struct md_op_data *op_data;
2693 /* Detect out-of-range masks */
2694 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2697 /* Non-root users are forbidden from setting or clearing flags which are
2698 * NOT defined in HSM_USER_MASK. */
2699 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2700 !cfs_capable(CFS_CAP_SYS_ADMIN))
2703 if (!exp_connect_archive_id_array(exp)) {
2704 /* Detect out-of-range archive id */
2705 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2706 (hss->hss_archive_id > LL_HSM_ORIGIN_MAX_ARCHIVE))
2710 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2711 LUSTRE_OPC_ANY, hss);
2712 if (IS_ERR(op_data))
2713 RETURN(PTR_ERR(op_data));
2715 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, exp, sizeof(*op_data),
2718 ll_finish_md_op_data(op_data);
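/*
 * Illustrative userspace sketch of LL_IOC_HSM_STATE_SET (not code from
 * this file; HS_DIRTY is assumed here to be one of the user-settable
 * HSM_USER_MASK flags, so no CAP_SYS_ADMIN would be needed):
 *
 *	struct hsm_state_set hss = {
 *		.hss_valid   = HSS_SETMASK,
 *		.hss_setmask = HS_DIRTY,
 *	};
 *	rc = ioctl(fd, LL_IOC_HSM_STATE_SET, &hss);
 */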
2723 static int ll_hsm_import(struct inode *inode, struct file *file,
2724 struct hsm_user_import *hui)
2726 struct hsm_state_set *hss = NULL;
2727 struct iattr *attr = NULL;
2731 if (!S_ISREG(inode->i_mode))
2737 GOTO(out, rc = -ENOMEM);
2739 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2740 hss->hss_archive_id = hui->hui_archive_id;
2741 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2742 rc = ll_hsm_state_set(inode, hss);
2746 OBD_ALLOC_PTR(attr);
2748 GOTO(out, rc = -ENOMEM);
2750 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2751 attr->ia_mode |= S_IFREG;
2752 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2753 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2754 attr->ia_size = hui->hui_size;
2755 attr->ia_mtime.tv_sec = hui->hui_mtime;
2756 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2757 attr->ia_atime.tv_sec = hui->hui_atime;
2758 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2760 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2761 ATTR_UID | ATTR_GID |
2762 ATTR_MTIME | ATTR_MTIME_SET |
2763 ATTR_ATIME | ATTR_ATIME_SET;
2767 rc = ll_setattr_raw(file_dentry(file), attr, 0, true);
2771 inode_unlock(inode);
2783 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2785 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2786 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
2789 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2791 struct inode *inode = file_inode(file);
2793 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2794 ATTR_MTIME | ATTR_MTIME_SET |
2797 .tv_sec = lfu->lfu_atime_sec,
2798 .tv_nsec = lfu->lfu_atime_nsec,
2801 .tv_sec = lfu->lfu_mtime_sec,
2802 .tv_nsec = lfu->lfu_mtime_nsec,
2805 .tv_sec = lfu->lfu_ctime_sec,
2806 .tv_nsec = lfu->lfu_ctime_nsec,
2812 if (!capable(CAP_SYS_ADMIN))
2815 if (!S_ISREG(inode->i_mode))
2819 rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET,
2821 inode_unlock(inode);
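/*
 * Illustrative userspace sketch of LL_IOC_FUTIMES_3 (timestamp values are
 * made up); it sets atime, mtime and ctime in one call and requires
 * CAP_SYS_ADMIN:
 *
 *	struct ll_futimes_3 lfu = {
 *		.lfu_atime_sec = 1577836800, .lfu_atime_nsec = 0,
 *		.lfu_mtime_sec = 1577836800, .lfu_mtime_nsec = 0,
 *		.lfu_ctime_sec = 1577836800, .lfu_ctime_nsec = 0,
 *	};
 *	rc = ioctl(fd, LL_IOC_FUTIMES_3, &lfu);
 */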
2826 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2829 case MODE_READ_USER:
2831 case MODE_WRITE_USER:
2838 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2840 /* Used to allow the upper layers of the client to request an LDLM lock
2841 * without doing an actual read or write.
2843 * Used for ladvise lockahead to manually request specific locks.
2845 * \param[in] file file this ladvise lock request is on
2846 * \param[in] ladvise ladvise struct describing this lock request
2848 * \retval 0 success, no detailed result available (sync requests
2849 * and requests sent to the server [not handled locally]
2850 * cannot return detailed results)
2851 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2852 * see definitions for details.
2853 * \retval negative negative errno on error
2855 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2857 struct lu_env *env = NULL;
2858 struct cl_io *io = NULL;
2859 struct cl_lock *lock = NULL;
2860 struct cl_lock_descr *descr = NULL;
2861 struct dentry *dentry = file->f_path.dentry;
2862 struct inode *inode = dentry->d_inode;
2863 enum cl_lock_mode cl_mode;
2864 off_t start = ladvise->lla_start;
2865 off_t end = ladvise->lla_end;
2871 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2872 "start=%llu, end=%llu\n", dentry->d_name.len,
2873 dentry->d_name.name, dentry->d_inode,
2874 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2877 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2879 GOTO(out, result = cl_mode);
2881 /* Get IO environment */
2882 result = cl_io_get(inode, &env, &io, &refcheck);
2886 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2889 * nothing to do for this io. This currently happens when
2890 * the stripe sub-objects are not yet created.
2892 result = io->ci_result;
2893 } else if (result == 0) {
2894 lock = vvp_env_lock(env);
2895 descr = &lock->cll_descr;
2897 descr->cld_obj = io->ci_obj;
2898 /* Convert byte offsets to pages */
2899 descr->cld_start = cl_index(io->ci_obj, start);
2900 descr->cld_end = cl_index(io->ci_obj, end);
2901 descr->cld_mode = cl_mode;
2902 /* CEF_MUST is used because we do not want to convert a
2903 * lockahead request to a lockless lock */
2904 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2907 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2908 descr->cld_enq_flags |= CEF_SPECULATIVE;
2910 result = cl_lock_request(env, io, lock);
2912 /* On success, we need to release the lock */
2914 cl_lock_release(env, lock);
2916 cl_io_fini(env, io);
2917 cl_env_put(env, &refcheck);
2919 /* -ECANCELED indicates a matching lock with a different extent
2920 * was already present, and -EEXIST indicates a matching lock
2921 * on exactly the same extent was already present.
2922 * We convert them to positive values for userspace to make
2923 * recognizing true errors easier.
2924 * Note we can only return these detailed results on async requests,
2925 * as sync requests look the same as i/o requests for locking. */
2926 if (result == -ECANCELED)
2927 result = LLA_RESULT_DIFFERENT;
2928 else if (result == -EEXIST)
2929 result = LLA_RESULT_SAME;
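	/*
	 * Illustrative userspace sketch of an async lockahead request sent
	 * through LL_IOC_LADVISE (buffer sizing and the extent values are
	 * assumptions, not taken from this file):
	 *
	 *	struct llapi_ladvise_hdr *hdr;	   header plus one advice
	 *
	 *	hdr->lah_magic = LADVISE_MAGIC;
	 *	hdr->lah_count = 1;
	 *	hdr->lah_advise[0].lla_advice = LU_LADVISE_LOCKAHEAD;
	 *	hdr->lah_advise[0].lla_lockahead_mode = MODE_WRITE_USER;
	 *	hdr->lah_advise[0].lla_peradvice_flags = LF_ASYNC;
	 *	hdr->lah_advise[0].lla_start = 0;
	 *	hdr->lah_advise[0].lla_end = (1 << 20) - 1;
	 *	rc = ioctl(fd, LL_IOC_LADVISE, hdr);
	 *
	 * On success the detailed result is written back to
	 * lla_lockahead_result.
	 */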
2934 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
2936 static int ll_ladvise_sanity(struct inode *inode,
2937 struct llapi_lu_ladvise *ladvise)
2939 struct ll_sb_info *sbi = ll_i2sbi(inode);
2940 enum lu_ladvise_type advice = ladvise->lla_advice;
2941 /* Note the per-advice flags field is 32 bits wide, so per-advice flags
2942 * must be in the first 32 bits of enum ladvise_flags */
2943 __u32 flags = ladvise->lla_peradvice_flags;
2944 /* 3 lines at 80 characters per line, should be plenty */
2947 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2949 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized, "
2950 "last supported advice is %s (value '%d'): rc = %d\n",
2951 sbi->ll_fsname, advice,
2952 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2956 /* Per-advice checks */
2958 case LU_LADVISE_LOCKNOEXPAND:
2959 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2961 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2962 "rc = %d\n", sbi->ll_fsname, flags,
2963 ladvise_names[advice], rc);
2967 case LU_LADVISE_LOCKAHEAD:
2968 /* Currently only READ and WRITE modes can be requested */
2969 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2970 ladvise->lla_lockahead_mode == 0) {
2972 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2973 "rc = %d\n", sbi->ll_fsname,
2974 ladvise->lla_lockahead_mode,
2975 ladvise_names[advice], rc);
2978 case LU_LADVISE_WILLREAD:
2979 case LU_LADVISE_DONTNEED:
2981 /* Note the fall-through above - these checks apply to all advice
2982 * types except LOCKNOEXPAND */
2983 if (flags & ~LF_DEFAULT_MASK) {
2985 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2986 "rc = %d\n", sbi->ll_fsname, flags,
2987 ladvise_names[advice], rc);
2990 if (ladvise->lla_start >= ladvise->lla_end) {
2992 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2993 "for %s: rc = %d\n", sbi->ll_fsname,
2994 ladvise->lla_start, ladvise->lla_end,
2995 ladvise_names[advice], rc);
3007 * Give file access advices
3009 * The ladvise interface is similar to the Linux fadvise() system call, except
3010 * it forwards the advice directly from the Lustre client to the server. The
3011 * server-side code will apply the appropriate read-ahead and caching
3012 * techniques for the corresponding files.
3014 * A typical workload for ladvise is e.g. a bunch of different clients doing
3015 * small random reads of a file, so prefetching pages into the OSS cache with
3016 * big linear reads before the random IO is a net benefit. Fetching all that
3017 * data into each client cache with fadvise() may not be, due to much more
3018 * data being sent to the client.
3020 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
3021 struct llapi_lu_ladvise *ladvise)
3025 struct cl_ladvise_io *lio;
3030 env = cl_env_get(&refcheck);
3032 RETURN(PTR_ERR(env));
3034 io = vvp_env_thread_io(env);
3035 io->ci_obj = ll_i2info(inode)->lli_clob;
3037 /* initialize parameters for ladvise */
3038 lio = &io->u.ci_ladvise;
3039 lio->li_start = ladvise->lla_start;
3040 lio->li_end = ladvise->lla_end;
3041 lio->li_fid = ll_inode2fid(inode);
3042 lio->li_advice = ladvise->lla_advice;
3043 lio->li_flags = flags;
3045 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
3046 rc = cl_io_loop(env, io);
3050 cl_io_fini(env, io);
3051 cl_env_put(env, &refcheck);
3055 static int ll_lock_noexpand(struct file *file, int flags)
3057 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3059 fd->ll_lock_no_expand = !(flags & LF_UNSET);
3064 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
3067 struct fsxattr fsxattr;
3069 if (copy_from_user(&fsxattr,
3070 (const struct fsxattr __user *)arg,
3074 fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
3075 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
3076 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
3077 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
3078 if (copy_to_user((struct fsxattr __user *)arg,
3079 &fsxattr, sizeof(fsxattr)))
3085 int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
3088 * Project Quota ID state is only allowed to change from within the init
3089 * namespace. Enforce that restriction only if we are trying to change
3090 * the quota ID state. Everything else is allowed in user namespaces.
3092 if (current_user_ns() == &init_user_ns)
3095 if (ll_i2info(inode)->lli_projid != fa->fsx_projid)
3098 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) {
3099 if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
3102 if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
3109 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3113 struct md_op_data *op_data;
3114 struct ptlrpc_request *req = NULL;
3116 struct fsxattr fsxattr;
3117 struct cl_object *obj;
3121 if (copy_from_user(&fsxattr,
3122 (const struct fsxattr __user *)arg,
3126 rc = ll_ioctl_check_project(inode, &fsxattr);
3130 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3131 LUSTRE_OPC_ANY, NULL);
3132 if (IS_ERR(op_data))
3133 RETURN(PTR_ERR(op_data));
3135 flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3136 op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3137 if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3138 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3139 op_data->op_projid = fsxattr.fsx_projid;
3140 op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS;
3141 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3143 ptlrpc_req_finished(req);
3145 GOTO(out_fsxattr, rc);
3146 ll_update_inode_flags(inode, op_data->op_attr_flags);
3147 obj = ll_i2info(inode)->lli_clob;
3149 GOTO(out_fsxattr, rc);
3151 OBD_ALLOC_PTR(attr);
3153 GOTO(out_fsxattr, rc = -ENOMEM);
3155 rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS,
3156 fsxattr.fsx_xflags);
3159 ll_finish_md_op_data(op_data);
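/*
 * Illustrative userspace sketch of the project quota ioctls (the project
 * id value is an assumption): change the project id of a directory and
 * turn on inheritance for new children:
 *
 *	struct fsxattr fsx;
 *
 *	ioctl(fd, LL_IOC_FSGETXATTR, &fsx);
 *	fsx.fsx_projid = 1000;
 *	fsx.fsx_xflags |= FS_XFLAG_PROJINHERIT;
 *	rc = ioctl(fd, LL_IOC_FSSETXATTR, &fsx);
 */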
3163 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3166 struct inode *inode = file_inode(file);
3167 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3168 struct ll_inode_info *lli = ll_i2info(inode);
3169 struct obd_client_handle *och = NULL;
3170 struct split_param sp;
3171 struct pcc_param param;
3172 bool lease_broken = false;
3174 enum mds_op_bias bias = 0;
3175 struct file *layout_file = NULL;
3177 size_t data_size = 0;
3178 bool attached = false;
3183 mutex_lock(&lli->lli_och_mutex);
3184 if (fd->fd_lease_och != NULL) {
3185 och = fd->fd_lease_och;
3186 fd->fd_lease_och = NULL;
3188 mutex_unlock(&lli->lli_och_mutex);
3193 fmode = och->och_flags;
3195 switch (ioc->lil_flags) {
3196 case LL_LEASE_RESYNC_DONE:
3197 if (ioc->lil_count > IOC_IDS_MAX)
3198 GOTO(out_lease_close, rc = -EINVAL);
3200 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3201 OBD_ALLOC(data, data_size);
3203 GOTO(out_lease_close, rc = -ENOMEM);
3205 if (copy_from_user(data, (void __user *)arg, data_size))
3206 GOTO(out_lease_close, rc = -EFAULT);
3208 bias = MDS_CLOSE_RESYNC_DONE;
3210 case LL_LEASE_LAYOUT_MERGE: {
3213 if (ioc->lil_count != 1)
3214 GOTO(out_lease_close, rc = -EINVAL);
3216 arg += sizeof(*ioc);
3217 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3218 GOTO(out_lease_close, rc = -EFAULT);
3220 layout_file = fget(fd);
3222 GOTO(out_lease_close, rc = -EBADF);
3224 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3225 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3226 GOTO(out_lease_close, rc = -EPERM);
3228 data = file_inode(layout_file);
3229 bias = MDS_CLOSE_LAYOUT_MERGE;
3232 case LL_LEASE_LAYOUT_SPLIT: {
3236 if (ioc->lil_count != 2)
3237 GOTO(out_lease_close, rc = -EINVAL);
3239 arg += sizeof(*ioc);
3240 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3241 GOTO(out_lease_close, rc = -EFAULT);
3243 arg += sizeof(__u32);
3244 if (copy_from_user(&mirror_id, (void __user *)arg,
3246 GOTO(out_lease_close, rc = -EFAULT);
3248 layout_file = fget(fdv);
3250 GOTO(out_lease_close, rc = -EBADF);
3252 sp.sp_inode = file_inode(layout_file);
3253 sp.sp_mirror_id = (__u16)mirror_id;
3255 bias = MDS_CLOSE_LAYOUT_SPLIT;
3258 case LL_LEASE_PCC_ATTACH:
3259 if (ioc->lil_count != 1)
3262 arg += sizeof(*ioc);
3263 if (copy_from_user(&param.pa_archive_id, (void __user *)arg,
3265 GOTO(out_lease_close, rc2 = -EFAULT);
3267 rc2 = pcc_readwrite_attach(file, inode, param.pa_archive_id);
3269 GOTO(out_lease_close, rc2);
3272 /* Grab latest data version */
3273 rc2 = ll_data_version(inode, &param.pa_data_version,
3276 GOTO(out_lease_close, rc2);
3279 bias = MDS_PCC_ATTACH;
3282 /* without close intent */
3287 rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3291 rc = ll_lease_och_release(inode, file);
3300 switch (ioc->lil_flags) {
3301 case LL_LEASE_RESYNC_DONE:
3303 OBD_FREE(data, data_size);
3305 case LL_LEASE_LAYOUT_MERGE:
3306 case LL_LEASE_LAYOUT_SPLIT:
3310 case LL_LEASE_PCC_ATTACH:
3313 rc = pcc_readwrite_attach_fini(file, inode, lease_broken,
3319 rc = ll_lease_type_from_fmode(fmode);
3323 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3326 struct inode *inode = file_inode(file);
3327 struct ll_inode_info *lli = ll_i2info(inode);
3328 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3329 struct obd_client_handle *och = NULL;
3330 __u64 open_flags = 0;
3336 switch (ioc->lil_mode) {
3337 case LL_LEASE_WRLCK:
3338 if (!(file->f_mode & FMODE_WRITE))
3340 fmode = FMODE_WRITE;
3342 case LL_LEASE_RDLCK:
3343 if (!(file->f_mode & FMODE_READ))
3347 case LL_LEASE_UNLCK:
3348 RETURN(ll_file_unlock_lease(file, ioc, arg));
3353 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3355 /* apply for lease */
3356 if (ioc->lil_flags & LL_LEASE_RESYNC)
3357 open_flags = MDS_OPEN_RESYNC;
3358 och = ll_lease_open(inode, file, fmode, open_flags);
3360 RETURN(PTR_ERR(och));
3362 if (ioc->lil_flags & LL_LEASE_RESYNC) {
3363 rc = ll_lease_file_resync(och, inode, arg);
3365 ll_lease_close(och, inode, NULL);
3368 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3370 ll_lease_close(och, inode, NULL);
3376 mutex_lock(&lli->lli_och_mutex);
3377 if (fd->fd_lease_och == NULL) {
3378 fd->fd_lease_och = och;
3381 mutex_unlock(&lli->lli_och_mutex);
3383 /* should not happen, since only exclusive leases are supported for now */
3384 ll_lease_close(och, inode, &lease_broken);
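/*
 * Illustrative userspace sketch (not a quote from this file): take a write
 * lease on a file descriptor opened with write access and later give it
 * back through the unlock mode:
 *
 *	struct ll_ioc_lease ioc = { .lil_mode = LL_LEASE_WRLCK };
 *
 *	rc = ioctl(fd, LL_IOC_SET_LEASE, &ioc);
 *	...
 *	ioc.lil_mode = LL_LEASE_UNLCK;
 *	rc = ioctl(fd, LL_IOC_SET_LEASE, &ioc);
 */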
3390 static void ll_heat_get(struct inode *inode, struct lu_heat *heat)
3392 struct ll_inode_info *lli = ll_i2info(inode);
3393 struct ll_sb_info *sbi = ll_i2sbi(inode);
3394 __u64 now = ktime_get_real_seconds();
3397 spin_lock(&lli->lli_heat_lock);
3398 heat->lh_flags = lli->lli_heat_flags;
3399 for (i = 0; i < heat->lh_count; i++)
3400 heat->lh_heat[i] = obd_heat_get(&lli->lli_heat_instances[i],
3401 now, sbi->ll_heat_decay_weight,
3402 sbi->ll_heat_period_second);
3403 spin_unlock(&lli->lli_heat_lock);
3406 static int ll_heat_set(struct inode *inode, enum lu_heat_flag flags)
3408 struct ll_inode_info *lli = ll_i2info(inode);
3411 spin_lock(&lli->lli_heat_lock);
3412 if (flags & LU_HEAT_FLAG_CLEAR)
3413 obd_heat_clear(lli->lli_heat_instances, OBD_HEAT_COUNT);
3415 if (flags & LU_HEAT_FLAG_OFF)
3416 lli->lli_heat_flags |= LU_HEAT_FLAG_OFF;
3418 lli->lli_heat_flags &= ~LU_HEAT_FLAG_OFF;
3420 spin_unlock(&lli->lli_heat_lock);
3426 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3428 struct inode *inode = file_inode(file);
3429 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3433 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3434 PFID(ll_inode2fid(inode)), inode, cmd);
3435 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3437 /* asm-ppc{,64} declares TCGETS, et al. as type 't' not 'T' */
3438 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3442 case LL_IOC_GETFLAGS:
3443 /* Get the current value of the file flags */
3444 return put_user(fd->fd_flags, (int __user *)arg);
3445 case LL_IOC_SETFLAGS:
3446 case LL_IOC_CLRFLAGS:
3447 /* Set or clear specific file flags */
3448 /* XXX This probably needs checks to ensure the flags are
3449 * not abused, and to handle any flag side effects.
3451 if (get_user(flags, (int __user *) arg))
3454 if (cmd == LL_IOC_SETFLAGS) {
3455 if ((flags & LL_FILE_IGNORE_LOCK) &&
3456 !(file->f_flags & O_DIRECT)) {
3457 CERROR("%s: unable to disable locking on "
3458 "non-O_DIRECT file\n", current->comm);
3462 fd->fd_flags |= flags;
3464 fd->fd_flags &= ~flags;
3467 case LL_IOC_LOV_SETSTRIPE:
3468 case LL_IOC_LOV_SETSTRIPE_NEW:
3469 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3470 case LL_IOC_LOV_SETEA:
3471 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3472 case LL_IOC_LOV_SWAP_LAYOUTS: {
3474 struct lustre_swap_layouts lsl;
3476 if (copy_from_user(&lsl, (char __user *)arg,
3477 sizeof(struct lustre_swap_layouts)))
3480 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3483 file2 = fget(lsl.sl_fd);
3487 /* O_WRONLY or O_RDWR */
3488 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3489 GOTO(out, rc = -EPERM);
3491 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3492 struct inode *inode2;
3493 struct ll_inode_info *lli;
3494 struct obd_client_handle *och = NULL;
3496 lli = ll_i2info(inode);
3497 mutex_lock(&lli->lli_och_mutex);
3498 if (fd->fd_lease_och != NULL) {
3499 och = fd->fd_lease_och;
3500 fd->fd_lease_och = NULL;
3502 mutex_unlock(&lli->lli_och_mutex);
3504 GOTO(out, rc = -ENOLCK);
3505 inode2 = file_inode(file2);
3506 rc = ll_swap_layouts_close(och, inode, inode2);
3508 rc = ll_swap_layouts(file, file2, &lsl);
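		/*
		 * Illustrative userspace sketch of reaching this path (not a
		 * quote from this file): fd1/fd2 are two Lustre files opened
		 * with write access and dv1/dv2 were sampled earlier with
		 * LL_IOC_DATA_VERSION:
		 *
		 *	struct lustre_swap_layouts lsl = {
		 *		.sl_fd	  = fd2,
		 *		.sl_flags = SWAP_LAYOUTS_CHECK_DV1 |
		 *			    SWAP_LAYOUTS_CHECK_DV2,
		 *		.sl_dv1	  = dv1,
		 *		.sl_dv2	  = dv2,
		 *	};
		 *	rc = ioctl(fd1, LL_IOC_LOV_SWAP_LAYOUTS, &lsl);
		 */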
3514 case LL_IOC_LOV_GETSTRIPE:
3515 case LL_IOC_LOV_GETSTRIPE_NEW:
3516 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3517 case FS_IOC_GETFLAGS:
3518 case FS_IOC_SETFLAGS:
3519 RETURN(ll_iocontrol(inode, file, cmd, arg));
3520 case FSFILT_IOC_GETVERSION:
3521 case FS_IOC_GETVERSION:
3522 RETURN(put_user(inode->i_generation, (int __user *)arg));
3523 /* We need to special-case any other ioctls we want to handle,
3524 * to send them to the MDS/OST as appropriate and to properly
3525 * network-encode the arg field. */
3526 case FS_IOC_SETVERSION:
3529 case LL_IOC_GROUP_LOCK:
3530 RETURN(ll_get_grouplock(inode, file, arg));
3531 case LL_IOC_GROUP_UNLOCK:
3532 RETURN(ll_put_grouplock(inode, file, arg));
3533 case IOC_OBD_STATFS:
3534 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3536 case LL_IOC_FLUSHCTX:
3537 RETURN(ll_flush_ctx(inode));
3538 case LL_IOC_PATH2FID: {
3539 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3540 sizeof(struct lu_fid)))
3545 case LL_IOC_GETPARENT:
3546 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3548 case OBD_IOC_FID2PATH:
3549 RETURN(ll_fid2path(inode, (void __user *)arg));
3550 case LL_IOC_DATA_VERSION: {
3551 struct ioc_data_version idv;
3554 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
3557 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3558 rc = ll_ioc_data_version(inode, &idv);
3561 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3567 case LL_IOC_GET_MDTIDX: {
3570 mdtidx = ll_get_mdt_idx(inode);
3574 if (put_user((int)mdtidx, (int __user *)arg))
3579 case OBD_IOC_GETDTNAME:
3580 case OBD_IOC_GETMDNAME:
3581 RETURN(ll_get_obd_name(inode, cmd, arg));
3582 case LL_IOC_HSM_STATE_GET: {
3583 struct md_op_data *op_data;
3584 struct hsm_user_state *hus;
3591 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3592 LUSTRE_OPC_ANY, hus);
3593 if (IS_ERR(op_data)) {
3595 RETURN(PTR_ERR(op_data));
3598 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3601 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3604 ll_finish_md_op_data(op_data);
3608 case LL_IOC_HSM_STATE_SET: {
3609 struct hsm_state_set *hss;
3616 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3621 rc = ll_hsm_state_set(inode, hss);
3626 case LL_IOC_HSM_ACTION: {
3627 struct md_op_data *op_data;
3628 struct hsm_current_action *hca;
3635 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3636 LUSTRE_OPC_ANY, hca);
3637 if (IS_ERR(op_data)) {
3639 RETURN(PTR_ERR(op_data));
3642 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3645 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3648 ll_finish_md_op_data(op_data);
3652 case LL_IOC_SET_LEASE_OLD: {
3653 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3655 RETURN(ll_file_set_lease(file, &ioc, 0));
3657 case LL_IOC_SET_LEASE: {
3658 struct ll_ioc_lease ioc;
3660 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3663 RETURN(ll_file_set_lease(file, &ioc, arg));
3665 case LL_IOC_GET_LEASE: {
3666 struct ll_inode_info *lli = ll_i2info(inode);
3667 struct ldlm_lock *lock = NULL;
3670 mutex_lock(&lli->lli_och_mutex);
3671 if (fd->fd_lease_och != NULL) {
3672 struct obd_client_handle *och = fd->fd_lease_och;
3674 lock = ldlm_handle2lock(&och->och_lease_handle);
3676 lock_res_and_lock(lock);
3677 if (!ldlm_is_cancel(lock))
3678 fmode = och->och_flags;
3680 unlock_res_and_lock(lock);
3681 LDLM_LOCK_PUT(lock);
3684 mutex_unlock(&lli->lli_och_mutex);
3686 RETURN(ll_lease_type_from_fmode(fmode));
3688 case LL_IOC_HSM_IMPORT: {
3689 struct hsm_user_import *hui;
3695 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3700 rc = ll_hsm_import(inode, file, hui);
3705 case LL_IOC_FUTIMES_3: {
3706 struct ll_futimes_3 lfu;
3708 if (copy_from_user(&lfu,
3709 (const struct ll_futimes_3 __user *)arg,
3713 RETURN(ll_file_futimes_3(file, &lfu));
3715 case LL_IOC_LADVISE: {
3716 struct llapi_ladvise_hdr *k_ladvise_hdr;
3717 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3720 int alloc_size = sizeof(*k_ladvise_hdr);
3723 u_ladvise_hdr = (void __user *)arg;
3724 OBD_ALLOC_PTR(k_ladvise_hdr);
3725 if (k_ladvise_hdr == NULL)
3728 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3729 GOTO(out_ladvise, rc = -EFAULT);
3731 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3732 k_ladvise_hdr->lah_count < 1)
3733 GOTO(out_ladvise, rc = -EINVAL);
3735 num_advise = k_ladvise_hdr->lah_count;
3736 if (num_advise >= LAH_COUNT_MAX)
3737 GOTO(out_ladvise, rc = -EFBIG);
3739 OBD_FREE_PTR(k_ladvise_hdr);
3740 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3741 lah_advise[num_advise]);
3742 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3743 if (k_ladvise_hdr == NULL)
3747 * TODO: submit multiple advices to one server in a single RPC
3749 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3750 GOTO(out_ladvise, rc = -EFAULT);
3752 for (i = 0; i < num_advise; i++) {
3753 struct llapi_lu_ladvise *k_ladvise =
3754 &k_ladvise_hdr->lah_advise[i];
3755 struct llapi_lu_ladvise __user *u_ladvise =
3756 &u_ladvise_hdr->lah_advise[i];
3758 rc = ll_ladvise_sanity(inode, k_ladvise);
3760 GOTO(out_ladvise, rc);
3762 switch (k_ladvise->lla_advice) {
3763 case LU_LADVISE_LOCKNOEXPAND:
3764 rc = ll_lock_noexpand(file,
3765 k_ladvise->lla_peradvice_flags);
3766 GOTO(out_ladvise, rc);
3767 case LU_LADVISE_LOCKAHEAD:
3769 rc = ll_file_lock_ahead(file, k_ladvise);
3772 GOTO(out_ladvise, rc);
3775 &u_ladvise->lla_lockahead_result))
3776 GOTO(out_ladvise, rc = -EFAULT);
3779 rc = ll_ladvise(inode, file,
3780 k_ladvise_hdr->lah_flags,
3783 GOTO(out_ladvise, rc);
3790 OBD_FREE(k_ladvise_hdr, alloc_size);
3793 case LL_IOC_FLR_SET_MIRROR: {
3794 /* mirror I/O must be direct to avoid polluting page cache
3796 if (!(file->f_flags & O_DIRECT))
3799 fd->fd_designated_mirror = (__u32)arg;
3802 case LL_IOC_FSGETXATTR:
3803 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3804 case LL_IOC_FSSETXATTR:
3805 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3807 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3808 case LL_IOC_HEAT_GET: {
3809 struct lu_heat uheat;
3810 struct lu_heat *heat;
3813 if (copy_from_user(&uheat, (void __user *)arg, sizeof(uheat)))
3816 if (uheat.lh_count > OBD_HEAT_COUNT)
3817 uheat.lh_count = OBD_HEAT_COUNT;
3819 size = offsetof(typeof(uheat), lh_heat[uheat.lh_count]);
3820 OBD_ALLOC(heat, size);
3824 heat->lh_count = uheat.lh_count;
3825 ll_heat_get(inode, heat);
3826 rc = copy_to_user((char __user *)arg, heat, size);
3827 OBD_FREE(heat, size);
3828 RETURN(rc ? -EFAULT : 0);
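	/*
	 * Illustrative userspace sketch of LL_IOC_HEAT_GET (a sketch only;
	 * the element size is written via sizeof(heat->lh_heat[0]) to avoid
	 * assuming its type):
	 *
	 *	struct lu_heat *heat;
	 *
	 *	heat = malloc(sizeof(*heat) +
	 *		      OBD_HEAT_COUNT * sizeof(heat->lh_heat[0]));
	 *	heat->lh_count = OBD_HEAT_COUNT;
	 *	rc = ioctl(fd, LL_IOC_HEAT_GET, heat);
	 */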
3830 case LL_IOC_HEAT_SET: {
3833 if (copy_from_user(&flags, (void __user *)arg, sizeof(flags)))
3836 rc = ll_heat_set(inode, flags);
3839 case LL_IOC_PCC_STATE: {
3840 struct lu_pcc_state __user *ustate =
3841 (struct lu_pcc_state __user *)arg;
3842 struct lu_pcc_state *state;
3844 OBD_ALLOC_PTR(state);
3848 if (copy_from_user(state, ustate, sizeof(*state)))
3849 GOTO(out_state, rc = -EFAULT);
3851 rc = pcc_ioctl_state(inode, state);
3853 GOTO(out_state, rc);
3855 if (copy_to_user(ustate, state, sizeof(*state)))
3856 GOTO(out_state, rc = -EFAULT);
3859 OBD_FREE_PTR(state);
3863 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3864 (void __user *)arg));
3868 #ifndef HAVE_FILE_LLSEEK_SIZE
3869 static inline loff_t
3870 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3872 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3874 if (offset > maxsize)
3877 if (offset != file->f_pos) {
3878 file->f_pos = offset;
3879 file->f_version = 0;
3885 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3886 loff_t maxsize, loff_t eof)
3888 struct inode *inode = file_inode(file);
3896 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3897 * position-querying operation. Avoid rewriting the "same"
3898 * f_pos value back to the file because a concurrent read(),
3899 * write() or lseek() might have altered it
3904 * f_lock protects against read/modify/write race with other
3905 * SEEK_CURs. Note that parallel writes and reads behave
3909 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3910 inode_unlock(inode);
3914 * In the generic case the entire file is data, so as long as
3915 * offset isn't at the end of the file then the offset is data.
3922 * There is a virtual hole at the end of the file, so as long as
3923 * offset isn't i_size or larger, return i_size.
3931 return llseek_execute(file, offset, maxsize);
3935 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3937 struct inode *inode = file_inode(file);
3938 loff_t retval, eof = 0;
3941 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3942 (origin == SEEK_CUR) ? file->f_pos : 0);
3943 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3944 PFID(ll_inode2fid(inode)), inode, retval, retval,
3946 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3948 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3949 retval = ll_glimpse_size(inode);
3952 eof = i_size_read(inode);
3955 retval = ll_generic_file_llseek_size(file, offset, origin,
3956 ll_file_maxbytes(inode), eof);
3960 static int ll_flush(struct file *file, fl_owner_t id)
3962 struct inode *inode = file_inode(file);
3963 struct ll_inode_info *lli = ll_i2info(inode);
3964 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3967 LASSERT(!S_ISDIR(inode->i_mode));
3969 /* catch async errors that were recorded back when async writeback
3970 * failed for pages in this mapping. */
3971 rc = lli->lli_async_rc;
3972 lli->lli_async_rc = 0;
3973 if (lli->lli_clob != NULL) {
3974 err = lov_read_and_clear_async_rc(lli->lli_clob);
3979 /* The application has already been told about the write failure.
3980 * Do not report the failure again. */
3981 if (fd->fd_write_failed)
3983 return rc ? -EIO : 0;
3987 * Called to make sure a portion of the file has been written out.
3988 * If @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to the OST.
3990 * Return how many pages have been written.
3992 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3993 enum cl_fsync_mode mode, int ignore_layout)
3997 struct cl_fsync_io *fio;
4002 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
4003 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
4006 env = cl_env_get(&refcheck);
4008 RETURN(PTR_ERR(env));
4010 io = vvp_env_thread_io(env);
4011 io->ci_obj = ll_i2info(inode)->lli_clob;
4012 io->ci_ignore_layout = ignore_layout;
4014 /* initialize parameters for sync */
4015 fio = &io->u.ci_fsync;
4016 fio->fi_start = start;
4018 fio->fi_fid = ll_inode2fid(inode);
4019 fio->fi_mode = mode;
4020 fio->fi_nr_written = 0;
4022 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
4023 result = cl_io_loop(env, io);
4025 result = io->ci_result;
4027 result = fio->fi_nr_written;
4028 cl_io_fini(env, io);
4029 cl_env_put(env, &refcheck);
4035 * When dentry is provided (the 'else' case), file_dentry() may be
4036 * null and dentry must be used directly rather than pulled from
4037 * file_dentry() as is done otherwise.
4040 #ifdef HAVE_FILE_FSYNC_4ARGS
4041 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
4043 struct dentry *dentry = file_dentry(file);
4044 #elif defined(HAVE_FILE_FSYNC_2ARGS)
4045 int ll_fsync(struct file *file, int datasync)
4047 struct dentry *dentry = file_dentry(file);
4049 loff_t end = LLONG_MAX;
4051 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
4054 loff_t end = LLONG_MAX;
4056 struct inode *inode = dentry->d_inode;
4057 struct ll_inode_info *lli = ll_i2info(inode);
4058 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
4059 struct ptlrpc_request *req;
4060 struct file *pcc_file = fd->fd_pcc_file.pccf_file;
4064 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
4065 PFID(ll_inode2fid(inode)), inode);
4066 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
4068 /* pcc cache path */
4070 #ifdef HAVE_FILE_FSYNC_4ARGS
4071 return file_inode(pcc_file)->i_fop->fsync(pcc_file,
4072 start, end, datasync);
4073 #elif defined(HAVE_FILE_FSYNC_2ARGS)
4074 return file_inode(pcc_file)->i_fop->fsync(pcc_file,
4077 return file_inode(pcc_file)->i_fop->fsync(pcc_file,
4081 #ifdef HAVE_FILE_FSYNC_4ARGS
4082 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
4085 /* fsync's caller has already called _fdata{sync,write}, we want
4086 * that IO to finish before calling the osc and mdc sync methods */
4087 rc = filemap_fdatawait(inode->i_mapping);
4090 /* catch async errors that were recorded back when async writeback
4091 * failed for pages in this mapping. */
4092 if (!S_ISDIR(inode->i_mode)) {
4093 err = lli->lli_async_rc;
4094 lli->lli_async_rc = 0;
4097 if (lli->lli_clob != NULL) {
4098 err = lov_read_and_clear_async_rc(lli->lli_clob);
4104 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
4108 ptlrpc_req_finished(req);
4110 if (S_ISREG(inode->i_mode)) {
4111 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
4113 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
4114 if (rc == 0 && err < 0)
4117 fd->fd_write_failed = true;
4119 fd->fd_write_failed = false;
4122 #ifdef HAVE_FILE_FSYNC_4ARGS
4123 inode_unlock(inode);
4129 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
4131 struct inode *inode = file_inode(file);
4132 struct ll_sb_info *sbi = ll_i2sbi(inode);
4133 struct ldlm_enqueue_info einfo = {
4134 .ei_type = LDLM_FLOCK,
4135 .ei_cb_cp = ldlm_flock_completion_ast,
4136 .ei_cbdata = file_lock,
4138 struct md_op_data *op_data;
4139 struct lustre_handle lockh = { 0 };
4140 union ldlm_policy_data flock = { { 0 } };
4141 int fl_type = file_lock->fl_type;
4147 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
4148 PFID(ll_inode2fid(inode)), file_lock);
4150 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
4152 if (file_lock->fl_flags & FL_FLOCK) {
4153 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
4154 /* flocks are whole-file locks */
4155 flock.l_flock.end = OFFSET_MAX;
4156 /* For flocks the owner is determined by the local file descriptor */
4157 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
4158 } else if (file_lock->fl_flags & FL_POSIX) {
4159 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
4160 flock.l_flock.start = file_lock->fl_start;
4161 flock.l_flock.end = file_lock->fl_end;
4165 flock.l_flock.pid = file_lock->fl_pid;
4167 /* Somewhat ugly workaround for svc lockd.
4168 * lockd installs a custom fl_lmops->lm_compare_owner that checks
4169 * that the fl_owner is the same (which it always is on the local node,
4170 * I guess, between lockd processes) and then compares the pid.
4171 * As such we assign the pid to the owner field to make it all work;
4172 * conflict with normal locks is unlikely since the pid space and the
4173 * pointer space for current->files do not intersect */
4174 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
4175 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
4179 einfo.ei_mode = LCK_PR;
4182 /* An unlock request may or may not have any relation to
4183 * existing locks so we may not be able to pass a lock handle
4184 * via a normal ldlm_lock_cancel() request. The request may even
4185 * unlock a byte range in the middle of an existing lock. In
4186 * order to process an unlock request we need all of the same
4187 * information that is given with a normal read or write record
4188 * lock request. To avoid creating another ldlm unlock (cancel)
4189 * message we'll treat a LCK_NL flock request as an unlock. */
4190 einfo.ei_mode = LCK_NL;
4193 einfo.ei_mode = LCK_PW;
4196 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
4211 flags = LDLM_FL_BLOCK_NOWAIT;
4217 flags = LDLM_FL_TEST_LOCK;
4220 CERROR("unknown fcntl lock command: %d\n", cmd);
4224 /* Save the old mode so that if the mode in the lock changes we
4225 * can decrement the appropriate reader or writer refcount. */
4226 file_lock->fl_type = einfo.ei_mode;
4228 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4229 LUSTRE_OPC_ANY, NULL);
4230 if (IS_ERR(op_data))
4231 RETURN(PTR_ERR(op_data));
4233 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4234 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4235 flock.l_flock.pid, flags, einfo.ei_mode,
4236 flock.l_flock.start, flock.l_flock.end);
4238 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4241 /* Restore the file lock type if not TEST lock. */
4242 if (!(flags & LDLM_FL_TEST_LOCK))
4243 file_lock->fl_type = fl_type;
4245 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
4246 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4247 !(flags & LDLM_FL_TEST_LOCK))
4248 rc2 = locks_lock_file_wait(file, file_lock);
4250 if ((file_lock->fl_flags & FL_FLOCK) &&
4251 (rc == 0 || file_lock->fl_type == F_UNLCK))
4252 rc2 = flock_lock_file_wait(file, file_lock);
4253 if ((file_lock->fl_flags & FL_POSIX) &&
4254 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4255 !(flags & LDLM_FL_TEST_LOCK))
4256 rc2 = posix_lock_file_wait(file, file_lock);
4257 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
4259 if (rc2 && file_lock->fl_type != F_UNLCK) {
4260 einfo.ei_mode = LCK_NL;
4261 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4266 ll_finish_md_op_data(op_data);
4271 int ll_get_fid_by_name(struct inode *parent, const char *name,
4272 int namelen, struct lu_fid *fid,
4273 struct inode **inode)
4275 struct md_op_data *op_data = NULL;
4276 struct mdt_body *body;
4277 struct ptlrpc_request *req;
4281 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4282 LUSTRE_OPC_ANY, NULL);
4283 if (IS_ERR(op_data))
4284 RETURN(PTR_ERR(op_data));
4286 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4287 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4288 ll_finish_md_op_data(op_data);
4292 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4294 GOTO(out_req, rc = -EFAULT);
4296 *fid = body->mbo_fid1;
4299 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4301 ptlrpc_req_finished(req);
4305 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4308 struct dentry *dchild = NULL;
4309 struct inode *child_inode = NULL;
4310 struct md_op_data *op_data;
4311 struct ptlrpc_request *request = NULL;
4312 struct obd_client_handle *och = NULL;
4314 struct mdt_body *body;
4315 __u64 data_version = 0;
4316 size_t namelen = strlen(name);
4317 int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4321 CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4322 PFID(ll_inode2fid(parent)), name,
4323 lum->lum_stripe_offset, lum->lum_stripe_count);
4325 if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4326 lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4327 lustre_swab_lmv_user_md(lum);
4329 /* Get child FID first */
4330 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4333 dchild = d_lookup(file_dentry(file), &qstr);
4335 if (dchild->d_inode)
4336 child_inode = igrab(dchild->d_inode);
4341 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
4350 if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
4351 OBD_CONNECT2_DIR_MIGRATE)) {
4352 if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
4353 ll_i2info(child_inode)->lli_lsm_md) {
4354 CERROR("%s: MDT doesn't support striped directory "
4355 "migration!\n", ll_i2sbi(parent)->ll_fsname);
4356 GOTO(out_iput, rc = -EOPNOTSUPP);
4361 * lfs migrate command needs to be blocked on the client
4362 * by checking the migrate FID against the FID of the
4365 if (child_inode == parent->i_sb->s_root->d_inode)
4366 GOTO(out_iput, rc = -EINVAL);
4368 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4369 child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4370 if (IS_ERR(op_data))
4371 GOTO(out_iput, rc = PTR_ERR(op_data));
4373 inode_lock(child_inode);
4374 op_data->op_fid3 = *ll_inode2fid(child_inode);
4375 if (!fid_is_sane(&op_data->op_fid3)) {
4376 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4377 ll_i2sbi(parent)->ll_fsname, name,
4378 PFID(&op_data->op_fid3));
4379 GOTO(out_unlock, rc = -EINVAL);
4382 op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4383 op_data->op_data = lum;
4384 op_data->op_data_size = lumlen;
4387 if (S_ISREG(child_inode->i_mode)) {
4388 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4392 GOTO(out_unlock, rc);
4395 rc = ll_data_version(child_inode, &data_version,
4398 GOTO(out_close, rc);
4400 op_data->op_open_handle = och->och_open_handle;
4401 op_data->op_data_version = data_version;
4402 op_data->op_lease_handle = och->och_lease_handle;
4403 op_data->op_bias |= MDS_CLOSE_MIGRATE;
4405 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4406 och->och_mod->mod_open_req->rq_replay = 0;
4407 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
4410 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4411 name, namelen, &request);
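	/* Migration is expressed as a rename of the entry onto itself (the
	 * same name is used for source and target) with CLI_MIGRATE set in
	 * op_cli_flags; the target MDT index and the new striping travel in
	 * op_data->op_data (the lmv_user_md passed in by the caller). */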
4413 LASSERT(request != NULL);
4414 ll_update_times(request, parent);
4417 if (rc == 0 || rc == -EAGAIN) {
4418 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4419 LASSERT(body != NULL);
4421 /* If the server does release the layout lock, then we clean up
4422 * the client och here; otherwise release it in out_close: */
4423 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4424 obd_mod_put(och->och_mod);
4425 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4427 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
4433 if (request != NULL) {
4434 ptlrpc_req_finished(request);
4438 /* Try again if the lease was cancelled. */
4439 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4444 ll_lease_close(och, child_inode, NULL);
4446 clear_nlink(child_inode);
4448 inode_unlock(child_inode);
4449 ll_finish_md_op_data(op_data);
4456 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4464 * test if some locks matching bits and l_req_mode are acquired
4465 * - bits can be in different locks
4466 * - if found, clear the common lock bits in *bits
4467 * - the bits not found are kept in *bits
4469 * \param bits		[IN] searched lock bits
4470 * \param l_req_mode	[IN] searched lock mode
4471 * \retval boolean, true iff all bits are found
4473 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4475 struct lustre_handle lockh;
4476 union ldlm_policy_data policy;
4477 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4478 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4487 fid = &ll_i2info(inode)->lli_fid;
4488 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4489 ldlm_lockname[mode]);
4491 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4492 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4493 policy.l_inodebits.bits = *bits & (1 << i);
4494 if (policy.l_inodebits.bits == 0)
4497 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4498 &policy, mode, &lockh)) {
4499 struct ldlm_lock *lock;
4501 lock = ldlm_handle2lock(&lockh);
4504 ~(lock->l_policy_data.l_inodebits.bits);
4505 LDLM_LOCK_PUT(lock);
4507 *bits &= ~policy.l_inodebits.bits;
4514 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4515 struct lustre_handle *lockh, __u64 flags,
4516 enum ldlm_mode mode)
4518 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4523 fid = &ll_i2info(inode)->lli_fid;
4524 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4526 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4527 fid, LDLM_IBITS, &policy, mode, lockh);
4532 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4534 /* Already unlinked. Just update nlink and return success */
4535 if (rc == -ENOENT) {
4537 /* If it is a striped directory and there is a bad stripe,
4538 * let's revalidate the dentry again instead of returning
4540 if (S_ISDIR(inode->i_mode) &&
4541 ll_i2info(inode)->lli_lsm_md != NULL)
4544 /* This path cannot be hit for regular files unless in
4545 * case of obscure races, so no need to validate
4547 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4549 } else if (rc != 0) {
4550 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4551 "%s: revalidate FID "DFID" error: rc = %d\n",
4552 ll_i2sbi(inode)->ll_fsname,
4553 PFID(ll_inode2fid(inode)), rc);
4559 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4561 struct inode *inode = dentry->d_inode;
4562 struct obd_export *exp = ll_i2mdexp(inode);
4563 struct lookup_intent oit = {
4566 struct ptlrpc_request *req = NULL;
4567 struct md_op_data *op_data;
4571 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4572 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4574 /* Call getattr by fid, so do not provide name at all. */
4575 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4576 LUSTRE_OPC_ANY, NULL);
4577 if (IS_ERR(op_data))
4578 RETURN(PTR_ERR(op_data));
4580 rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4581 ll_finish_md_op_data(op_data);
4583 rc = ll_inode_revalidate_fini(inode, rc);
4587 rc = ll_revalidate_it_finish(req, &oit, dentry);
4589 ll_intent_release(&oit);
4593 /* Unlinked? Unhash dentry, so it is not picked up later by
4594 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4595 * here to preserve get_cwd functionality on 2.6.
4597 if (!dentry->d_inode->i_nlink) {
4598 ll_lock_dcache(inode);
4599 d_lustre_invalidate(dentry, 0);
4600 ll_unlock_dcache(inode);
4603 ll_lookup_finish_locks(&oit, dentry);
4605 ptlrpc_req_finished(req);
4610 static int ll_merge_md_attr(struct inode *inode)
4612 struct ll_inode_info *lli = ll_i2info(inode);
4613 struct cl_attr attr = { 0 };
4616 LASSERT(lli->lli_lsm_md != NULL);
4618 /* foreign dir is not striped dir */
4619 if (lli->lli_lsm_md->lsm_md_magic == LMV_MAGIC_FOREIGN)
4622 down_read(&lli->lli_lsm_sem);
4623 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4624 &attr, ll_md_blocking_ast);
4625 up_read(&lli->lli_lsm_sem);
4629 set_nlink(inode, attr.cat_nlink);
4630 inode->i_blocks = attr.cat_blocks;
4631 i_size_write(inode, attr.cat_size);
4633 ll_i2info(inode)->lli_atime = attr.cat_atime;
4634 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4635 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
4640 int ll_getattr_dentry(struct dentry *de, struct kstat *stat)
4642 struct inode *inode = de->d_inode;
4643 struct ll_sb_info *sbi = ll_i2sbi(inode);
4644 struct ll_inode_info *lli = ll_i2info(inode);
4647 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4649 rc = ll_inode_revalidate(de, IT_GETATTR);
4653 if (S_ISREG(inode->i_mode)) {
4654 bool cached = false;
4656 rc = pcc_inode_getattr(inode, &cached);
4657 if (cached && rc < 0)
4659 /* In case of restore, the MDT has the right size and has
4660 * already sent it back without granting the layout lock;
4661 * the inode is up-to-date, so a glimpse is useless.
4662 * Also, to glimpse we need the layout; in case of a running
4663 * restore the MDT holds the layout lock, so the glimpse will
4664 * block up to the end of the restore (getattr will block)
4666 if (!cached && !ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4667 rc = ll_glimpse_size(inode);
4672 /* If the object isn't a regular file then don't validate its size. */
4673 if (S_ISDIR(inode->i_mode) &&
4674 lli->lli_lsm_md != NULL) {
4675 rc = ll_merge_md_attr(inode);
4680 inode->i_atime.tv_sec = lli->lli_atime;
4681 inode->i_mtime.tv_sec = lli->lli_mtime;
4682 inode->i_ctime.tv_sec = lli->lli_ctime;
4685 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4687 if (ll_need_32bit_api(sbi)) {
4688 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4689 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4690 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4692 stat->ino = inode->i_ino;
4693 stat->dev = inode->i_sb->s_dev;
4694 stat->rdev = inode->i_rdev;
4697 stat->mode = inode->i_mode;
4698 stat->uid = inode->i_uid;
4699 stat->gid = inode->i_gid;
4700 stat->atime = inode->i_atime;
4701 stat->mtime = inode->i_mtime;
4702 stat->ctime = inode->i_ctime;
4703 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4705 stat->nlink = inode->i_nlink;
4706 stat->size = i_size_read(inode);
4707 stat->blocks = inode->i_blocks;
4712 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4713 int ll_getattr(const struct path *path, struct kstat *stat,
4714 u32 request_mask, unsigned int flags)
4716 struct dentry *de = path->dentry;
4718 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4721 return ll_getattr_dentry(de, stat);
4724 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4725 __u64 start, __u64 len)
4729 struct fiemap *fiemap;
4730 unsigned int extent_count = fieinfo->fi_extents_max;
4732 num_bytes = sizeof(*fiemap) + (extent_count *
4733 sizeof(struct fiemap_extent));
4734 OBD_ALLOC_LARGE(fiemap, num_bytes);
4739 fiemap->fm_flags = fieinfo->fi_flags;
4740 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4741 fiemap->fm_start = start;
4742 fiemap->fm_length = len;
4743 if (extent_count > 0 &&
4744 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4745 sizeof(struct fiemap_extent)) != 0)
4746 GOTO(out, rc = -EFAULT);
4748 rc = ll_do_fiemap(inode, fiemap, num_bytes);
4750 fieinfo->fi_flags = fiemap->fm_flags;
4751 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4752 if (extent_count > 0 &&
4753 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4754 fiemap->fm_mapped_extents *
4755 sizeof(struct fiemap_extent)) != 0)
4756 GOTO(out, rc = -EFAULT);
4758 OBD_FREE_LARGE(fiemap, num_bytes);
4762 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4764 struct ll_inode_info *lli = ll_i2info(inode);
4765 struct posix_acl *acl = NULL;
4768 spin_lock(&lli->lli_lock);
4769 /* VFS' acl_permission_check->check_acl will release the refcount */
4770 acl = posix_acl_dup(lli->lli_posix_acl);
4771 spin_unlock(&lli->lli_lock);
4776 #ifdef HAVE_IOP_SET_ACL
4777 #ifdef CONFIG_FS_POSIX_ACL
4778 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4780 struct ll_sb_info *sbi = ll_i2sbi(inode);
4781 struct ptlrpc_request *req = NULL;
4782 const char *name = NULL;
4784 size_t value_size = 0;
4789 case ACL_TYPE_ACCESS:
4790 name = XATTR_NAME_POSIX_ACL_ACCESS;
4792 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4795 case ACL_TYPE_DEFAULT:
4796 name = XATTR_NAME_POSIX_ACL_DEFAULT;
4797 if (!S_ISDIR(inode->i_mode))
4798 rc = acl ? -EACCES : 0;
4809 value_size = posix_acl_xattr_size(acl->a_count);
4810 value = kmalloc(value_size, GFP_NOFS);
4812 GOTO(out, rc = -ENOMEM);
4814 rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
4816 GOTO(out_value, rc);
4819 rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4820 value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
4821 name, value, value_size, 0, 0, &req);
4823 ptlrpc_req_finished(req);
4828 forget_cached_acl(inode, type);
4830 set_cached_acl(inode, type, acl);
4833 #endif /* CONFIG_FS_POSIX_ACL */
4834 #endif /* HAVE_IOP_SET_ACL */
4836 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4838 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4839 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4841 ll_check_acl(struct inode *inode, int mask)
4844 # ifdef CONFIG_FS_POSIX_ACL
4845 struct posix_acl *acl;
4849 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4850 if (flags & IPERM_FLAG_RCU)
4853 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4858 rc = posix_acl_permission(inode, acl, mask);
4859 posix_acl_release(acl);
4862 # else /* !CONFIG_FS_POSIX_ACL */
4864 # endif /* CONFIG_FS_POSIX_ACL */
4866 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
4868 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4869 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4871 # ifdef HAVE_INODE_PERMISION_2ARGS
4872 int ll_inode_permission(struct inode *inode, int mask)
4874 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4879 struct ll_sb_info *sbi;
4880 struct root_squash_info *squash;
4881 struct cred *cred = NULL;
4882 const struct cred *old_cred = NULL;
4884 bool squash_id = false;
4887 #ifdef MAY_NOT_BLOCK
4888 if (mask & MAY_NOT_BLOCK)
4890 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4891 if (flags & IPERM_FLAG_RCU)
4895 /* as the root inode is NOT validated in the lookup operation,
4896 * we need to do it before the permission check. */
4898 if (inode == inode->i_sb->s_root->d_inode) {
4899 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4904 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4905 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4907 /* squash fsuid/fsgid if needed */
4908 sbi = ll_i2sbi(inode);
4909 squash = &sbi->ll_squash;
4910 if (unlikely(squash->rsi_uid != 0 &&
4911 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4912 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4916 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4917 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4918 squash->rsi_uid, squash->rsi_gid);
4920 /* update the current process's credentials
4921 * and FS capabilities */
4922 cred = prepare_creds();
4926 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4927 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
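/* drop all filesystem-related capabilities (CFS_CAP_FS_MASK) from the
 * squashed credentials */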
4928 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4929 if ((1 << cap) & CFS_CAP_FS_MASK)
4930 cap_lower(cred->cap_effective, cap);
4932 old_cred = override_creds(cred);
4935 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4936 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4937 /* restore current process's credentials and FS capability */
4939 revert_creds(old_cred);
4946 /* -o localflock - only provides locally consistent flock locks */
4947 struct file_operations ll_file_operations = {
4948 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4949 # ifdef HAVE_SYNC_READ_WRITE
4950 .read = new_sync_read,
4951 .write = new_sync_write,
4953 .read_iter = ll_file_read_iter,
4954 .write_iter = ll_file_write_iter,
4955 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4956 .read = ll_file_read,
4957 .aio_read = ll_file_aio_read,
4958 .write = ll_file_write,
4959 .aio_write = ll_file_aio_write,
4960 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4961 .unlocked_ioctl = ll_file_ioctl,
4962 .open = ll_file_open,
4963 .release = ll_file_release,
4964 .mmap = ll_file_mmap,
4965 .llseek = ll_file_seek,
4966 .splice_read = ll_file_splice_read,
4971 struct file_operations ll_file_operations_flock = {
4972 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4973 # ifdef HAVE_SYNC_READ_WRITE
4974 .read = new_sync_read,
4975 .write = new_sync_write,
4976 # endif /* HAVE_SYNC_READ_WRITE */
4977 .read_iter = ll_file_read_iter,
4978 .write_iter = ll_file_write_iter,
4979 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4980 .read = ll_file_read,
4981 .aio_read = ll_file_aio_read,
4982 .write = ll_file_write,
4983 .aio_write = ll_file_aio_write,
4984 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4985 .unlocked_ioctl = ll_file_ioctl,
4986 .open = ll_file_open,
4987 .release = ll_file_release,
4988 .mmap = ll_file_mmap,
4989 .llseek = ll_file_seek,
4990 .splice_read = ll_file_splice_read,
4993 .flock = ll_file_flock,
4994 .lock = ll_file_flock
4997 /* These are for -o noflock - to return ENOSYS on flock calls */
4998 struct file_operations ll_file_operations_noflock = {
4999 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
5000 # ifdef HAVE_SYNC_READ_WRITE
5001 .read = new_sync_read,
5002 .write = new_sync_write,
5003 # endif /* HAVE_SYNC_READ_WRITE */
5004 .read_iter = ll_file_read_iter,
5005 .write_iter = ll_file_write_iter,
5006 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5007 .read = ll_file_read,
5008 .aio_read = ll_file_aio_read,
5009 .write = ll_file_write,
5010 .aio_write = ll_file_aio_write,
5011 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5012 .unlocked_ioctl = ll_file_ioctl,
5013 .open = ll_file_open,
5014 .release = ll_file_release,
5015 .mmap = ll_file_mmap,
5016 .llseek = ll_file_seek,
5017 .splice_read = ll_file_splice_read,
5020 .flock = ll_file_noflock,
5021 .lock = ll_file_noflock
5024 struct inode_operations ll_file_inode_operations = {
5025 .setattr = ll_setattr,
5026 .getattr = ll_getattr,
5027 .permission = ll_inode_permission,
5028 #ifdef HAVE_IOP_XATTR
5029 .setxattr = ll_setxattr,
5030 .getxattr = ll_getxattr,
5031 .removexattr = ll_removexattr,
5033 .listxattr = ll_listxattr,
5034 .fiemap = ll_fiemap,
5035 #ifdef HAVE_IOP_GET_ACL
5036 .get_acl = ll_get_acl,
5038 #ifdef HAVE_IOP_SET_ACL
5039 .set_acl = ll_set_acl,
5043 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
5045 struct ll_inode_info *lli = ll_i2info(inode);
5046 struct cl_object *obj = lli->lli_clob;
5055 env = cl_env_get(&refcheck);
5057 RETURN(PTR_ERR(env));
5059 rc = cl_conf_set(env, lli->lli_clob, conf);
5063 if (conf->coc_opc == OBJECT_CONF_SET) {
5064 struct ldlm_lock *lock = conf->coc_lock;
5065 struct cl_layout cl = {
5069 LASSERT(lock != NULL);
5070 LASSERT(ldlm_has_layout(lock));
5072 /* the lock can only be allowed to match after the layout
5073 * is applied to the inode; otherwise a wrong layout could
5074 * be seen. Applying the layout should happen before
5075 * dropping the intent lock. */
5076 ldlm_lock_allow_match(lock);
5078 rc = cl_object_layout_get(env, obj, &cl);
5083 DFID": layout version change: %u -> %u\n",
5084 PFID(&lli->lli_fid), ll_layout_version_get(lli),
5086 ll_layout_version_set(lli, cl.cl_layout_gen);
5090 cl_env_put(env, &refcheck);
5095 /* Fetch the layout from the MDT with a getxattr request if it is not ready yet */
5096 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
5099 struct ll_sb_info *sbi = ll_i2sbi(inode);
5100 struct ptlrpc_request *req;
5107 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
5108 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
5109 lock->l_lvb_data, lock->l_lvb_len);
5111 if (lock->l_lvb_data != NULL)
5114 /* if the layout lock was granted right away, the layout is returned
5115 * within the DLM_LVB of the DLM reply; otherwise, if the lock was
5116 * ever blocked and then granted via the completion AST, we have to
5117 * fetch the layout here. Note that we can't use the LVB buffer in
5118 * the completion AST because it is not large enough */
5119 rc = ll_get_default_mdsize(sbi, &lmmsize);
5123 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), OBD_MD_FLXATTR,
5124 XATTR_NAME_LOV, lmmsize, &req);
5127 GOTO(out, rc = 0); /* empty layout */
5134 if (lmmsize == 0) /* empty layout */
5137 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
5139 GOTO(out, rc = -EFAULT);
5141 OBD_ALLOC_LARGE(lvbdata, lmmsize);
5142 if (lvbdata == NULL)
5143 GOTO(out, rc = -ENOMEM);
5145 memcpy(lvbdata, lmm, lmmsize);
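/* attach the fetched layout as the lock's LVB only if no other thread has
 * installed one concurrently; otherwise our copy is freed below */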
5146 lock_res_and_lock(lock);
5147 if (unlikely(lock->l_lvb_data == NULL)) {
5148 lock->l_lvb_type = LVB_T_LAYOUT;
5149 lock->l_lvb_data = lvbdata;
5150 lock->l_lvb_len = lmmsize;
5153 unlock_res_and_lock(lock);
5156 OBD_FREE_LARGE(lvbdata, lmmsize);
5161 ptlrpc_req_finished(req);
5166 * Apply the layout to the inode. Layout lock is held and will be released
5169 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
5170 struct inode *inode)
5172 struct ll_inode_info *lli = ll_i2info(inode);
5173 struct ll_sb_info *sbi = ll_i2sbi(inode);
5174 struct ldlm_lock *lock;
5175 struct cl_object_conf conf;
5178 bool wait_layout = false;
5181 LASSERT(lustre_handle_is_used(lockh));
5183 lock = ldlm_handle2lock(lockh);
5184 LASSERT(lock != NULL);
5185 LASSERT(ldlm_has_layout(lock));
5187 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
5188 PFID(&lli->lli_fid), inode);
5190 /* in case this is a cached lock, reinstate it with the new inode */
5191 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
5193 lock_res_and_lock(lock);
5194 lvb_ready = ldlm_is_lvb_ready(lock);
5195 unlock_res_and_lock(lock);
5197 /* checking lvb_ready is racy, but this is okay. The worst case is
5198 * that multiple processes may configure the file at the same time. */
5202 rc = ll_layout_fetch(inode, lock);
5206 /* for layout lock, the lmm is stored in the lock's LVB.
5207 * lvb_data is immutable while the lock is held, so it is safe to access it
5210 * set the layout on the file. This is unlikely to fail, as the old layout
5211 * has surely been eliminated */
5212 memset(&conf, 0, sizeof conf);
5213 conf.coc_opc = OBJECT_CONF_SET;
5214 conf.coc_inode = inode;
5215 conf.coc_lock = lock;
5216 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
5217 conf.u.coc_layout.lb_len = lock->l_lvb_len;
5218 rc = ll_layout_conf(inode, &conf);
5220 /* refresh layout failed, need to wait */
5221 wait_layout = rc == -EBUSY;
5224 LDLM_LOCK_PUT(lock);
5225 ldlm_lock_decref(lockh, mode);
5227 /* wait for IO to complete if the layout is still being used. */
5229 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
5230 sbi->ll_fsname, PFID(&lli->lli_fid), inode);
5232 memset(&conf, 0, sizeof conf);
5233 conf.coc_opc = OBJECT_CONF_WAIT;
5234 conf.coc_inode = inode;
5235 rc = ll_layout_conf(inode, &conf);
5239 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
5240 sbi->ll_fsname, PFID(&lli->lli_fid), rc);
5246 * Issue layout intent RPC to MDS.
5247 * \param inode [in] file inode
5248 * \param intent [in] layout intent
5250 * \retval 0 on success
5251 * \retval < 0 error code
5253 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
5255 struct ll_inode_info *lli = ll_i2info(inode);
5256 struct ll_sb_info *sbi = ll_i2sbi(inode);
5257 struct md_op_data *op_data;
5258 struct lookup_intent it;
5259 struct ptlrpc_request *req;
5263 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
5264 0, 0, LUSTRE_OPC_ANY, NULL);
5265 if (IS_ERR(op_data))
5266 RETURN(PTR_ERR(op_data));
5268 op_data->op_data = intent;
5269 op_data->op_data_size = sizeof(*intent);
5271 memset(&it, 0, sizeof(it));
5272 it.it_op = IT_LAYOUT;
5273 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
5274 intent->li_opc == LAYOUT_INTENT_TRUNC)
5275 it.it_flags = FMODE_WRITE;
5277 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
5278 sbi->ll_fsname, PFID(&lli->lli_fid), inode);
5280 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
5281 &ll_md_blocking_ast, 0);
5282 if (it.it_request != NULL)
5283 ptlrpc_req_finished(it.it_request);
5284 it.it_request = NULL;
5286 ll_finish_md_op_data(op_data);
5288 /* set lock data in case this is a new lock */
5290 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
5292 ll_intent_drop_lock(&it);
5298 * This function checks if there exists a LAYOUT lock on the client side,
5299 * or enqueues it if it doesn't have one in cache.
5301 * This function does not hold the layout lock, so it may be revoked any time
5302 * after this function returns. Any operation that depends on the layout should be redone
5305 * This function should be called before lov_io_init() to get an up-to-date
5306 * layout version; the caller should save the version number, and after the IO
5307 * is finished this function should be called again to verify that the layout
5308 * was not changed during the IO (a usage sketch follows this function).
5310 int ll_layout_refresh(struct inode *inode, __u32 *gen)
5312 struct ll_inode_info *lli = ll_i2info(inode);
5313 struct ll_sb_info *sbi = ll_i2sbi(inode);
5314 struct lustre_handle lockh;
5315 struct layout_intent intent = {
5316 .li_opc = LAYOUT_INTENT_ACCESS,
5318 enum ldlm_mode mode;
5322 *gen = ll_layout_version_get(lli);
5323 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
5327 LASSERT(fid_is_sane(ll_inode2fid(inode)));
5328 LASSERT(S_ISREG(inode->i_mode));
5330 /* take layout lock mutex to enqueue layout lock exclusively. */
5331 mutex_lock(&lli->lli_layout_mutex);
5334 /* the layout lock is mostly cached on the local side, so try to
5335 * match it before grabbing the layout lock mutex. */
5336 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5337 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
5338 if (mode != 0) { /* hit cached lock */
5339 rc = ll_layout_lock_set(&lockh, mode, inode);
5345 rc = ll_layout_intent(inode, &intent);
5351 *gen = ll_layout_version_get(lli);
5352 mutex_unlock(&lli->lli_layout_mutex);
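/*
 * Usage sketch (illustrative, not part of the original code): a caller
 * following the protocol described above refreshes the layout before starting
 * IO, remembers the generation, and verifies it again once the IO completes.
 * ll_do_io() is a hypothetical helper used only for illustration.
 */
#if 0
static int ll_layout_refresh_example(struct inode *inode)
{
	__u32 gen_before, gen_after;
	int rc;

	/* get an up-to-date layout generation before lov_io_init() */
	rc = ll_layout_refresh(inode, &gen_before);
	if (rc < 0)
		return rc;

	rc = ll_do_io(inode);	/* hypothetical IO under this layout */
	if (rc < 0)
		return rc;

	/* verify the layout did not change while the IO was in flight */
	rc = ll_layout_refresh(inode, &gen_after);
	if (rc < 0)
		return rc;

	return gen_before == gen_after ? 0 : -ESTALE;
}
#endif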
5358 * Issue layout intent RPC indicating where in a file an IO is about to write.
5360 * \param[in] inode file inode.
5361 * \param[in] ext write range with start offset of file in bytes where
5362 * an IO is about to write, and exclusive end offset in
5365 * \retval 0 on success
5366 * \retval < 0 error code
5368 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5369 struct lu_extent *ext)
5371 struct layout_intent intent = {
5373 .li_extent.e_start = ext->e_start,
5374 .li_extent.e_end = ext->e_end,
5379 rc = ll_layout_intent(inode, &intent);
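/*
 * Illustrative sketch (not part of the original code): before writing the
 * byte range [pos, pos + count) of a file, a caller could issue a write
 * intent so that the MDS instantiates the matching layout components.
 */
#if 0
static int ll_write_intent_example(struct inode *inode, loff_t pos,
				   size_t count)
{
	struct lu_extent ext = {
		.e_start = pos,
		.e_end	 = pos + count,
	};

	return ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE, &ext);
}
#endif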
5387 * This function sends a restore request to the MDT
5387 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5389 struct hsm_user_request *hur;
5393 len = sizeof(struct hsm_user_request) +
5394 sizeof(struct hsm_user_item);
5395 OBD_ALLOC(hur, len);
5399 hur->hur_request.hr_action = HUA_RESTORE;
5400 hur->hur_request.hr_archive_id = 0;
5401 hur->hur_request.hr_flags = 0;
5402 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5403 sizeof(hur->hur_user_item[0].hui_fid));
5404 hur->hur_user_item[0].hui_extent.offset = offset;
5405 hur->hur_user_item[0].hui_extent.length = length;
5406 hur->hur_request.hr_itemcount = 1;
5407 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,