4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2017, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
49 #include <uapi/linux/lustre/lustre_ioctl.h>
50 #include <lustre_swab.h>
52 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
57 struct inode *sp_inode;
62 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
64 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
/*
 * Allocate one struct ll_file_data (per-open-file private state) from
 * the dedicated slab cache.  GFP_NOFS keeps reclaim from re-entering
 * the filesystem while we allocate.
 * NOTE(review): interior lines (NULL check, remaining field init,
 * return) are missing from this extract -- confirm against full source.
 */
67 static struct ll_file_data *ll_file_data_get(void)
69 struct ll_file_data *fd;
71 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
/* Start with a clean write-error state for this descriptor. */
75 fd->fd_write_failed = false;
/* Return a ll_file_data to its slab cache; pairs with ll_file_data_get(). */
80 static void ll_file_data_put(struct ll_file_data *fd)
83 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
87 * Packs all the attributes into @op_data for the CLOSE rpc.
/*
 * Pack the inode's current attributes (mode, times, size, blocks,
 * flags) and the open handle into @op_data for the CLOSE RPC, so the
 * MDT sees the client's final view of the file at close time.
 */
89 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
90 struct obd_client_handle *och)
94 ll_prep_md_op_data(op_data, inode, NULL, NULL,
95 0, 0, LUSTRE_OPC_ANY, NULL);
/* Snapshot mode, timestamps and size from the VFS inode. */
97 op_data->op_attr.ia_mode = inode->i_mode;
98 op_data->op_attr.ia_atime = inode->i_atime;
99 op_data->op_attr.ia_mtime = inode->i_mtime;
100 op_data->op_attr.ia_ctime = inode->i_ctime;
101 op_data->op_attr.ia_size = i_size_read(inode);
102 op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
103 ATTR_MTIME | ATTR_MTIME_SET |
105 op_data->op_xvalid |= OP_XVALID_CTIME_SET;
106 op_data->op_attr_blocks = inode->i_blocks;
/* Translate kernel inode flags to Lustre's on-wire flag encoding. */
107 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
108 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
109 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
110 op_data->op_open_handle = och->och_open_handle;
112 if (och->och_flags & FMODE_WRITE &&
113 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
114 /* For HSM: if inode data has been modified, pack it so that
115 * MDT can set data dirty flag in the archive. */
116 op_data->op_bias |= MDS_DATA_MODIFIED;
122 * Perform a close, possibly with a bias.
123 * The meaning of "data" depends on the value of "bias".
125 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
126 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
/*
 * Send the MDS close RPC for @och, optionally carrying a close intent
 * ("bias"): HSM release, layout swap/merge/split, or resync-done.
 * The meaning of @data depends on @bias (data version pointer for
 * MDS_HSM_RELEASE, victim inode for layout swap/merge, split_param for
 * split, ll_ioc_lease for resync-done).
 */
129 static int ll_close_inode_openhandle(struct inode *inode,
130 struct obd_client_handle *och,
131 enum mds_op_bias bias, void *data)
133 struct obd_export *md_exp = ll_i2mdexp(inode);
134 const struct ll_inode_info *lli = ll_i2info(inode);
135 struct md_op_data *op_data;
136 struct ptlrpc_request *req = NULL;
/* Export already disconnected: nothing we can send the close to. */
140 if (class_exp2obd(md_exp) == NULL) {
141 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
142 ll_get_fsname(inode->i_sb, NULL, 0),
143 PFID(&lli->lli_fid));
147 OBD_ALLOC_PTR(op_data);
148 /* We leak openhandle and request here on error, but not much to be
149 * done in OOM case since app won't retry close on error either. */
151 GOTO(out, rc = -ENOMEM);
153 ll_prepare_close(inode, op_data, och);
155 case MDS_CLOSE_LAYOUT_MERGE:
156 /* merge blocks from the victim inode */
157 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
158 op_data->op_attr.ia_valid |= ATTR_SIZE;
159 op_data->op_xvalid |= OP_XVALID_BLOCKS;
/* NOTE(review): MERGE appears to fall through to share the
 * SPLIT/SWAP packing below -- confirm against full source. */
160 case MDS_CLOSE_LAYOUT_SPLIT:
161 case MDS_CLOSE_LAYOUT_SWAP: {
162 struct split_param *sp = data;
164 LASSERT(data != NULL);
165 op_data->op_bias |= bias;
166 op_data->op_data_version = 0;
167 op_data->op_lease_handle = och->och_lease_handle;
/* fid2 identifies the second file taking part in the layout op. */
168 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
169 op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
170 op_data->op_mirror_id = sp->sp_mirror_id;
172 op_data->op_fid2 = *ll_inode2fid(data);
177 case MDS_CLOSE_RESYNC_DONE: {
178 struct ll_ioc_lease *ioc = data;
180 LASSERT(data != NULL);
/* Scale blocks by the mirror count so the MDT sees total usage. */
181 op_data->op_attr_blocks +=
182 ioc->lil_count * op_data->op_attr_blocks;
183 op_data->op_attr.ia_valid |= ATTR_SIZE;
184 op_data->op_xvalid |= OP_XVALID_BLOCKS;
185 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
187 op_data->op_lease_handle = och->och_lease_handle;
188 op_data->op_data = &ioc->lil_ids[0];
189 op_data->op_data_size =
190 ioc->lil_count * sizeof(ioc->lil_ids[0]);
194 case MDS_HSM_RELEASE:
195 LASSERT(data != NULL);
/* Pack the data version so the MDT can verify nothing changed
 * between the copy to the archive and this release. */
196 op_data->op_bias |= MDS_HSM_RELEASE;
197 op_data->op_data_version = *(__u64 *)data;
198 op_data->op_lease_handle = och->och_lease_handle;
199 op_data->op_attr.ia_valid |= ATTR_SIZE;
200 op_data->op_xvalid |= OP_XVALID_BLOCKS;
204 LASSERT(data == NULL);
/* If size/blocks were not explicitly packed, mark them lazy so the
 * MDT does not treat stale client values as authoritative. */
208 if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
209 op_data->op_xvalid |= OP_XVALID_LAZYSIZE;
210 if (!(op_data->op_xvalid & OP_XVALID_BLOCKS))
211 op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS;
213 rc = md_close(md_exp, op_data, och->och_mod, &req);
214 if (rc != 0 && rc != -EINTR)
215 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
216 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
/* For biased closes, check whether the server actually executed
 * the close intent; the reply body carries the result flag. */
218 if (rc == 0 && op_data->op_bias & bias) {
219 struct mdt_body *body;
221 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
222 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
226 ll_finish_md_op_data(op_data);
230 md_clear_open_replay_data(md_exp, och);
/* Poison the handle so later use of this och is detectable. */
231 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
234 ptlrpc_req_finished(req); /* This is close request */
/*
 * Really close the MDS open handle of the given mode (read/write/exec)
 * once its last user is gone.  Selects the per-mode och slot and
 * usecount under lli_och_mutex; if users remain, the close is skipped.
 */
238 int ll_md_real_close(struct inode *inode, fmode_t fmode)
240 struct ll_inode_info *lli = ll_i2info(inode);
241 struct obd_client_handle **och_p;
242 struct obd_client_handle *och;
/* Pick the och slot / refcount matching the open mode. */
247 if (fmode & FMODE_WRITE) {
248 och_p = &lli->lli_mds_write_och;
249 och_usecount = &lli->lli_open_fd_write_count;
250 } else if (fmode & FMODE_EXEC) {
251 och_p = &lli->lli_mds_exec_och;
252 och_usecount = &lli->lli_open_fd_exec_count;
254 LASSERT(fmode & FMODE_READ);
255 och_p = &lli->lli_mds_read_och;
256 och_usecount = &lli->lli_open_fd_read_count;
259 mutex_lock(&lli->lli_och_mutex);
260 if (*och_usecount > 0) {
261 /* There are still users of this handle, so skip
263 mutex_unlock(&lli->lli_och_mutex);
269 mutex_unlock(&lli->lli_och_mutex);
272 /* There might be a race and this handle may already
274 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/*
 * Per-file-descriptor close: drop group lock and lease if held, drop
 * this fd's reference on the per-mode MDS open handle, and only talk
 * to the MDS (ll_md_real_close) when no cached OPEN DLM lock lets us
 * skip the RPC.  Frees the ll_file_data last.
 */
280 static int ll_md_close(struct inode *inode, struct file *file)
282 union ldlm_policy_data policy = {
283 .l_inodebits = { MDS_INODELOCK_OPEN },
285 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
286 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
287 struct ll_inode_info *lli = ll_i2info(inode);
288 struct lustre_handle lockh;
289 enum ldlm_mode lockmode;
293 /* clear group lock, if present */
294 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
295 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
297 if (fd->fd_lease_och != NULL) {
300 /* Usually the lease is not released when the
301 * application crashed, we need to release here. */
302 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
303 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
304 PFID(&lli->lli_fid), rc, lease_broken);
306 fd->fd_lease_och = NULL;
/* fd_och is set when a lease took ownership of the open handle;
 * close it directly here. */
309 if (fd->fd_och != NULL) {
310 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
315 /* Let's see if we have good enough OPEN lock on the file and if
316 we can skip talking to MDS */
317 mutex_lock(&lli->lli_och_mutex);
318 if (fd->fd_omode & FMODE_WRITE) {
320 LASSERT(lli->lli_open_fd_write_count);
321 lli->lli_open_fd_write_count--;
322 } else if (fd->fd_omode & FMODE_EXEC) {
324 LASSERT(lli->lli_open_fd_exec_count);
325 lli->lli_open_fd_exec_count--;
328 LASSERT(lli->lli_open_fd_read_count);
329 lli->lli_open_fd_read_count--;
331 mutex_unlock(&lli->lli_och_mutex);
/* No matching cached OPEN ibits lock (LDLM_FL_TEST_LOCK: probe
 * only) -> must send the real close to the MDS. */
333 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
334 LDLM_IBITS, &policy, lockmode, &lockh))
335 rc = ll_md_real_close(inode, fd->fd_omode);
338 LUSTRE_FPRIVATE(file) = NULL;
339 ll_file_data_put(fd);
344 /* While this returns an error code, fput() the caller does not, so we need
345 * to make every effort to clean up all of our state here. Also, applications
346 * rarely check close errors and even if an error is returned they will not
347 * re-try the close call.
/*
 * VFS ->release() entry point.  Deauthorizes statahead for directory
 * opens, short-circuits the root dentry, clears async write errors on
 * regular files, then funnels into ll_md_close().  Errors are mostly
 * best-effort since callers rarely check close status (see comment
 * above this function in the original file).
 */
349 int ll_file_release(struct inode *inode, struct file *file)
351 struct ll_file_data *fd;
352 struct ll_sb_info *sbi = ll_i2sbi(inode);
353 struct ll_inode_info *lli = ll_i2info(inode);
357 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
358 PFID(ll_inode2fid(inode)), inode);
/* Root releases are not counted in the per-op stats. */
360 if (inode->i_sb->s_root != file_dentry(file))
361 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
362 fd = LUSTRE_FPRIVATE(file);
365 /* The last ref on @file, maybe not the the owner pid of statahead,
366 * because parent and child process can share the same file handle. */
367 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
368 ll_deauthorize_statahead(inode, fd);
/* Root dentry: nothing was opened on the MDS, just free fd. */
370 if (inode->i_sb->s_root == file_dentry(file)) {
371 LUSTRE_FPRIVATE(file) = NULL;
372 ll_file_data_put(fd);
376 if (!S_ISDIR(inode->i_mode)) {
/* Fold any async write error from the cl_object into lli and
 * reset it so this close reports it exactly once. */
377 if (lli->lli_clob != NULL)
378 lov_read_and_clear_async_rc(lli->lli_clob);
379 lli->lli_async_rc = 0;
382 rc = ll_md_close(inode, file);
/* Fault-injection hook: optionally dump debug log on close. */
384 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
385 libcfs_debug_dumplog();
/*
 * read_cache_page() filler: copy inline Data-on-MDT reply bytes
 * (described by the niobuf_local passed as @data) into @page,
 * zero-fill the tail past lnb_len, and mark the page uptodate.
 */
390 static inline int ll_dom_readpage(void *data, struct page *page)
392 struct niobuf_local *lnb = data;
395 kaddr = ll_kmap_atomic(page, KM_USER0);
396 memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
/* Partial page: clear the remainder so no stale data leaks. */
397 if (lnb->lnb_len < PAGE_SIZE)
398 memset(kaddr + lnb->lnb_len, 0,
399 PAGE_SIZE - lnb->lnb_len);
400 flush_dcache_page(page);
401 SetPageUptodate(page);
402 ll_kunmap_atomic(kaddr, KM_USER0);
/*
 * If the open reply carries inline Data-on-MDT file data (and a DOM
 * lock was granted with the intent), populate the page cache from the
 * RMF_NIOBUF_INLINE buffer so the first read needs no extra RPC.
 * Data must be PAGE_SIZE-aligned on this client or it is ignored.
 */
408 void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
409 struct lookup_intent *it)
411 struct ll_inode_info *lli = ll_i2info(inode);
412 struct cl_object *obj = lli->lli_clob;
413 struct address_space *mapping = inode->i_mapping;
415 struct niobuf_remote *rnb;
417 struct lustre_handle lockh;
418 struct ldlm_lock *lock;
419 unsigned long index, start;
420 struct niobuf_local lnb;
421 bool dom_lock = false;
/* Only use inline data when the intent granted a DOM lock. */
428 if (it->it_lock_mode != 0) {
429 lockh.cookie = it->it_lock_handle;
430 lock = ldlm_handle2lock(&lockh);
432 dom_lock = ldlm_has_dom(lock);
438 if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
442 rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
443 if (rnb == NULL || rnb->rnb_len == 0)
446 /* LU-11595: Server may return whole file and that is OK always or
447 * it may return just file tail and its offset must be aligned with
448 * client PAGE_SIZE to be used on that client, if server's PAGE_SIZE is
449 * smaller then offset may be not aligned and that data is just ignored.
451 if (rnb->rnb_offset % PAGE_SIZE)
454 /* Server returns whole file or just file tail if it fills in
455 * reply buffer, in both cases total size should be inode size.
457 if (rnb->rnb_offset + rnb->rnb_len < i_size_read(inode)) {
458 CERROR("%s: server returns off/len %llu/%u < i_size %llu\n",
459 ll_get_fsname(inode->i_sb, NULL, 0), rnb->rnb_offset,
460 rnb->rnb_len, i_size_read(inode));
464 CDEBUG(D_INFO, "Get data along with open at %llu len %i, i_size %llu\n",
465 rnb->rnb_offset, rnb->rnb_len, i_size_read(inode));
/* Inline payload immediately follows the niobuf_remote header. */
467 data = (char *)rnb + sizeof(*rnb);
469 lnb.lnb_file_offset = rnb->rnb_offset;
470 start = lnb.lnb_file_offset / PAGE_SIZE;
472 LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
473 lnb.lnb_page_offset = 0;
/* Walk the buffer one page at a time; the final page may be short. */
475 lnb.lnb_data = data + (index << PAGE_SHIFT);
476 lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
477 if (lnb.lnb_len > PAGE_SIZE)
478 lnb.lnb_len = PAGE_SIZE;
480 vmpage = read_cache_page(mapping, index + start,
481 ll_dom_readpage, &lnb);
482 if (IS_ERR(vmpage)) {
483 CWARN("%s: cannot fill page %lu for "DFID
484 " with data: rc = %li\n",
485 ll_get_fsname(inode->i_sb, NULL, 0),
486 index + start, PFID(lu_object_fid(&obj->co_lu)),
492 } while (rnb->rnb_len > (index << PAGE_SHIFT));
/*
 * Send an intent-OPEN (by FID) to the MDS for @de.  If the server does
 * not support open-by-fid, snapshot the dentry name under d_lock (it
 * may change concurrently) and pack it instead.  On success, updates
 * the inode from the reply and pulls in any inline DoM data.
 */
496 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
497 struct lookup_intent *itp)
499 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
500 struct dentry *parent = de->d_parent;
503 struct md_op_data *op_data;
504 struct ptlrpc_request *req = NULL;
508 LASSERT(parent != NULL);
509 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
511 /* if server supports open-by-fid, or file name is invalid, don't pack
512 * name in open request */
513 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID)) {
515 len = de->d_name.len;
516 name = kmalloc(len, GFP_NOFS);
/* Re-check length under d_lock: a concurrent rename may have
 * changed the name between the two reads. */
520 spin_lock(&de->d_lock);
521 if (len != de->d_name.len) {
522 spin_unlock(&de->d_lock);
526 memcpy(name, de->d_name.name, len);
527 spin_unlock(&de->d_lock);
529 if (!lu_name_is_valid_2(name, len)) {
536 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
537 name, len, 0, LUSTRE_OPC_ANY, NULL);
538 if (IS_ERR(op_data)) {
540 RETURN(PTR_ERR(op_data));
542 op_data->op_data = lmm;
543 op_data->op_data_size = lmmsize;
545 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
546 &ll_md_blocking_ast, 0);
548 ll_finish_md_op_data(op_data);
550 /* reason for keep own exit path - don`t flood log
551 * with messages with -ESTALE errors.
553 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
554 it_open_error(DISP_OPEN_OPEN, itp))
/* Drop the server-side open handle we won't be using. */
556 ll_release_openhandle(de, itp);
560 if (it_disposition(itp, DISP_LOOKUP_NEG))
561 GOTO(out, rc = -ENOENT);
563 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
564 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
565 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
569 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
/* With a granted lock, consume inline DoM data and attach the
 * lock to the inode. */
571 if (!rc && itp->it_lock_mode) {
572 ll_dom_finish_open(de->d_inode, req, itp);
573 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
577 ptlrpc_req_finished(req);
578 ll_intent_drop_lock(itp);
580 /* We did open by fid, but by the time we got to the server,
581 * the object disappeared. If this is a create, we cannot really
582 * tell the userspace that the file it was trying to create
583 * does not exist. Instead let's return -ESTALE, and the VFS will
584 * retry the create with LOOKUP_REVAL that we are going to catch
585 * in ll_revalidate_dentry() and use lookup then.
587 if (rc == -ENOENT && itp->it_op & IT_CREAT)
/*
 * Populate an obd_client_handle from the MDT reply body of a completed
 * intent open, then register it for open replay (so the open can be
 * resent after MDS recovery).
 */
593 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
594 struct obd_client_handle *och)
596 struct mdt_body *body;
598 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
599 och->och_open_handle = body->mbo_open_handle;
600 och->och_fid = body->mbo_fid1;
/* The lease handle doubles as the intent's lock handle. */
601 och->och_lease_handle.cookie = it->it_lock_handle;
602 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
603 och->och_flags = it->it_flags;
605 return md_set_open_replay_data(md_exp, och, it);
/*
 * Finish the client-side part of an open: optionally fill @och from
 * the intent reply, then attach @fd as the file's private data and
 * initialize its readahead state, open mode, and cl_io context lists.
 */
608 static int ll_local_open(struct file *file, struct lookup_intent *it,
609 struct ll_file_data *fd, struct obd_client_handle *och)
611 struct inode *inode = file_inode(file);
/* The caller must have cleared private_data (see ll_file_open). */
614 LASSERT(!LUSTRE_FPRIVATE(file));
621 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
626 LUSTRE_FPRIVATE(file) = fd;
627 ll_readahead_init(inode, &fd->fd_ras);
/* Remember only the access-mode bits for later close handling. */
628 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
630 /* ll_cl_context initialize */
631 rwlock_init(&fd->fd_lock);
632 INIT_LIST_HEAD(&fd->fd_lccs);
637 /* Open a file, and (for the very first open) create objects on the OSTs at
638 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
639 * creation or open until ll_lov_setstripe() ioctl is called.
641 * If we already have the stripe MD locally then we don't request it in
642 * md_open(), by passing a lmm_size = 0.
644 * It is up to the application to ensure no other processes open this file
645 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
646 * used. We might be able to avoid races of that sort by getting lli_open_sem
647 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
648 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/*
 * VFS ->open() entry point.  Either reuses an intent prepared during
 * lookup (it->it_disposition set) or builds a fresh IT_OPEN intent
 * from f_flags, shares a cached per-mode MDS open handle when one
 * exists, or performs the intent open by FID and records the new
 * handle.  Statahead is authorized for directory opens.
 */
650 int ll_file_open(struct inode *inode, struct file *file)
652 struct ll_inode_info *lli = ll_i2info(inode);
653 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
654 .it_flags = file->f_flags };
655 struct obd_client_handle **och_p = NULL;
656 __u64 *och_usecount = NULL;
657 struct ll_file_data *fd;
661 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
662 PFID(ll_inode2fid(inode)), inode, file->f_flags);
664 it = file->private_data; /* XXX: compat macro */
665 file->private_data = NULL; /* prevent ll_local_open assertion */
667 fd = ll_file_data_get();
669 GOTO(out_nofiledata, rc = -ENOMEM);
672 if (S_ISDIR(inode->i_mode))
673 ll_authorize_statahead(inode, fd);
/* Root dentry needs no MDS open; just stash the fd. */
675 if (inode->i_sb->s_root == file_dentry(file)) {
676 LUSTRE_FPRIVATE(file) = fd;
680 if (!it || !it->it_disposition) {
681 /* Convert f_flags into access mode. We cannot use file->f_mode,
682 * because everything but O_ACCMODE mask was stripped from
684 if ((oit.it_flags + 1) & O_ACCMODE)
686 if (file->f_flags & O_TRUNC)
687 oit.it_flags |= FMODE_WRITE;
689 /* kernel only call f_op->open in dentry_open. filp_open calls
690 * dentry_open after call to open_namei that checks permissions.
691 * Only nfsd_open call dentry_open directly without checking
692 * permissions and because of that this code below is safe.
694 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
695 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
697 /* We do not want O_EXCL here, presumably we opened the file
698 * already? XXX - NFS implications? */
699 oit.it_flags &= ~O_EXCL;
701 /* bug20584, if "it_flags" contains O_CREAT, the file will be
702 * created if necessary, then "IT_CREAT" should be set to keep
703 * consistent with it */
704 if (oit.it_flags & O_CREAT)
705 oit.it_op |= IT_CREAT;
711 /* Let's see if we have file open on MDS already. */
712 if (it->it_flags & FMODE_WRITE) {
713 och_p = &lli->lli_mds_write_och;
714 och_usecount = &lli->lli_open_fd_write_count;
715 } else if (it->it_flags & FMODE_EXEC) {
716 och_p = &lli->lli_mds_exec_och;
717 och_usecount = &lli->lli_open_fd_exec_count;
719 och_p = &lli->lli_mds_read_och;
720 och_usecount = &lli->lli_open_fd_read_count;
723 mutex_lock(&lli->lli_och_mutex);
724 if (*och_p) { /* Open handle is present */
725 if (it_disposition(it, DISP_OPEN_OPEN)) {
726 /* Well, there's extra open request that we do not need,
727 let's close it somehow. This will decref request. */
728 rc = it_open_error(DISP_OPEN_OPEN, it);
730 mutex_unlock(&lli->lli_och_mutex);
731 GOTO(out_openerr, rc);
734 ll_release_openhandle(file_dentry(file), it);
/* Reuse the cached handle: local bookkeeping only. */
738 rc = ll_local_open(file, it, fd, NULL);
741 mutex_unlock(&lli->lli_och_mutex);
742 GOTO(out_openerr, rc);
745 LASSERT(*och_usecount == 0);
746 if (!it->it_disposition) {
747 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
748 /* We cannot just request lock handle now, new ELC code
749 means that one of other OPEN locks for this file
750 could be cancelled, and since blocking ast handler
751 would attempt to grab och_mutex as well, that would
752 result in a deadlock */
753 mutex_unlock(&lli->lli_och_mutex);
755 * Normally called under two situations:
757 * 2. A race/condition on MDS resulting in no open
758 * handle to be returned from LOOKUP|OPEN request,
759 * for example if the target entry was a symlink.
761 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
762 * marked by a bit set in ll_iget_for_nfs. Clear the
763 * bit so that it's not confusing later callers.
765 * NB; when ldd is NULL, it must have come via normal
766 * lookup path only, since ll_iget_for_nfs always calls
769 if (ldd && ldd->lld_nfs_dentry) {
770 ldd->lld_nfs_dentry = 0;
771 it->it_flags |= MDS_OPEN_LOCK;
775 * Always specify MDS_OPEN_BY_FID because we don't want
776 * to get file with different fid.
778 it->it_flags |= MDS_OPEN_BY_FID;
779 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
782 GOTO(out_openerr, rc);
786 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
788 GOTO(out_och_free, rc = -ENOMEM);
792 /* md_intent_lock() didn't get a request ref if there was an
793 * open error, so don't do cleanup on the request here
795 /* XXX (green): Should not we bail out on any error here, not
796 * just open error? */
797 rc = it_open_error(DISP_OPEN_OPEN, it);
799 GOTO(out_och_free, rc);
801 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
802 "inode %p: disposition %x, status %d\n", inode,
803 it_disposition(it, ~0), it->it_status);
805 rc = ll_local_open(file, it, fd, *och_p);
807 GOTO(out_och_free, rc);
809 mutex_unlock(&lli->lli_och_mutex);
812 /* Must do this outside lli_och_mutex lock to prevent deadlock where
813 different kind of OPEN lock for this same inode gets cancelled
814 by ldlm_cancel_lru */
815 if (!S_ISREG(inode->i_mode))
816 GOTO(out_och_free, rc);
818 cl_lov_delay_create_clear(&file->f_flags);
819 GOTO(out_och_free, rc);
/* Error path: free the och slot and poison the pointer. */
823 if (och_p && *och_p) {
824 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
825 *och_p = NULL; /* OBD_FREE writes some magic there */
828 mutex_unlock(&lli->lli_och_mutex);
831 if (lli->lli_opendir_key == fd)
832 ll_deauthorize_statahead(inode, fd);
834 ll_file_data_put(fd);
836 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* Drop the extra request ref held by the intent, if any. */
840 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
841 ptlrpc_req_finished(it->it_request);
842 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for lease locks: on conflict, cancel the lease lock
 * asynchronously (no openhandle handling here -- see the LDLM_FL_EXCL
 * note in ll_lease_open).  The CANCELING case needs no extra work.
 */
848 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
849 struct ldlm_lock_desc *desc, void *data, int flag)
852 struct lustre_handle lockh;
856 case LDLM_CB_BLOCKING:
857 ldlm_lock2handle(lock, &lockh);
858 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
860 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
864 case LDLM_CB_CANCELING:
872 * When setting a lease on a file, we take ownership of the lli_mds_*_och
873 * and save it as fd->fd_och so as to force client to reopen the file even
874 * if it has an open lock in cache already.
/*
 * Take ownership of the cached lli_mds_{read,write}_och for this file
 * as fd->fd_och, so the client must reopen even if it has a cached
 * open lock; returns the open handle in @old_open_handle.  Fails with
 * -EBUSY if a lease already exists or the handle has other users.
 */
876 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
877 struct lustre_handle *old_open_handle)
879 struct ll_inode_info *lli = ll_i2info(inode);
880 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
881 struct obd_client_handle **och_p;
886 /* Get the openhandle of the file */
887 mutex_lock(&lli->lli_och_mutex);
888 if (fd->fd_lease_och != NULL)
889 GOTO(out_unlock, rc = -EBUSY);
891 if (fd->fd_och == NULL) {
892 if (file->f_mode & FMODE_WRITE) {
893 LASSERT(lli->lli_mds_write_och != NULL);
894 och_p = &lli->lli_mds_write_och;
895 och_usecount = &lli->lli_open_fd_write_count;
897 LASSERT(lli->lli_mds_read_och != NULL);
898 och_p = &lli->lli_mds_read_och;
899 och_usecount = &lli->lli_open_fd_read_count;
/* Other descriptors still share this handle: can't take it. */
902 if (*och_usecount > 1)
903 GOTO(out_unlock, rc = -EBUSY);
910 *old_open_handle = fd->fd_och->och_open_handle;
914 mutex_unlock(&lli->lli_och_mutex);
919 * Release ownership on lli_mds_*_och when putting back a file lease.
/*
 * Give ownership of fd->fd_och back to lli_mds_{read,write}_och when a
 * lease is put back.  If another process re-opened the file meanwhile
 * (broken lease) the slot is occupied: keep the existing handle and
 * close our now-redundant one instead.
 */
921 static int ll_lease_och_release(struct inode *inode, struct file *file)
923 struct ll_inode_info *lli = ll_i2info(inode);
924 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
925 struct obd_client_handle **och_p;
926 struct obd_client_handle *old_och = NULL;
931 mutex_lock(&lli->lli_och_mutex);
932 if (file->f_mode & FMODE_WRITE) {
933 och_p = &lli->lli_mds_write_och;
934 och_usecount = &lli->lli_open_fd_write_count;
936 och_p = &lli->lli_mds_read_och;
937 och_usecount = &lli->lli_open_fd_read_count;
940 /* The file may have been open by another process (broken lease) so
941 * *och_p is not NULL. In this case we should simply increase usecount
944 if (*och_p != NULL) {
945 old_och = fd->fd_och;
952 mutex_unlock(&lli->lli_och_mutex);
/* Close the redundant handle outside the mutex. */
955 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
961 * Acquire a lease and open the file.
/*
 * Acquire a file lease from the MDT by doing an intent open with
 * MDS_OPEN_LEASE, and return the resulting obd_client_handle.  @fmode
 * must be exactly FMODE_READ or FMODE_WRITE and be compatible with how
 * the file is already open.  On failure the open lock (if any) is
 * cancelled and the handle closed; returns ERR_PTR(rc).
 */
963 static struct obd_client_handle *
964 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
967 struct lookup_intent it = { .it_op = IT_OPEN };
968 struct ll_sb_info *sbi = ll_i2sbi(inode);
969 struct md_op_data *op_data;
970 struct ptlrpc_request *req = NULL;
971 struct lustre_handle old_open_handle = { 0 };
972 struct obd_client_handle *och = NULL;
977 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
978 RETURN(ERR_PTR(-EINVAL));
/* Lease mode must match the file's open mode; exec opens excluded. */
981 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
982 RETURN(ERR_PTR(-EPERM));
984 rc = ll_lease_och_acquire(inode, file, &old_open_handle);
991 RETURN(ERR_PTR(-ENOMEM));
993 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
994 LUSTRE_OPC_ANY, NULL);
996 GOTO(out, rc = PTR_ERR(op_data));
998 /* To tell the MDT this openhandle is from the same owner */
999 op_data->op_open_handle = old_open_handle;
1001 it.it_flags = fmode | open_flags;
1002 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
1003 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
1004 &ll_md_blocking_lease_ast,
1005 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
1006 * it can be cancelled which may mislead applications that the lease is
1008 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
1009 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
1010 * doesn't deal with openhandle, so normal openhandle will be leaked. */
1011 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
1012 ll_finish_md_op_data(op_data);
1013 ptlrpc_req_finished(req);
1015 GOTO(out_release_it, rc);
1017 if (it_disposition(&it, DISP_LOOKUP_NEG))
1018 GOTO(out_release_it, rc = -ENOENT);
1020 rc = it_open_error(DISP_OPEN_OPEN, &it);
1022 GOTO(out_release_it, rc);
1024 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
1025 ll_och_fill(sbi->ll_md_exp, &it, och);
/* Server without lease support returns no DISP_OPEN_LEASE. */
1027 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
1028 GOTO(out_close, rc = -EOPNOTSUPP);
1030 /* already get lease, handle lease lock */
1031 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
1032 if (it.it_lock_mode == 0 ||
1033 it.it_lock_bits != MDS_INODELOCK_OPEN) {
1034 /* open lock must return for lease */
1035 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
1036 PFID(ll_inode2fid(inode)), it.it_lock_mode,
1038 GOTO(out_close, rc = -EPROTO);
1041 ll_intent_release(&it);
1045 /* Cancel open lock */
1046 if (it.it_lock_mode != 0) {
1047 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
1049 it.it_lock_mode = 0;
1050 och->och_lease_handle.cookie = 0ULL;
1052 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
1054 CERROR("%s: error closing file "DFID": %d\n",
1055 ll_get_fsname(inode->i_sb, NULL, 0),
1056 PFID(&ll_i2info(inode)->lli_fid), rc2);
1057 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
1059 ll_intent_release(&it);
1063 RETURN(ERR_PTR(rc));
1067 * Check whether a layout swap can be done between two inodes.
1069 * \param[in] inode1 First inode to check
1070 * \param[in] inode2 Second inode to check
1072 * \retval 0 on success, layout swap can be performed between both inodes
1073 * \retval negative error code if requirements are not met
/*
 * Validate that a layout swap between @inode1 and @inode2 is allowed:
 * both must be regular files on the same superblock and the caller
 * must have write permission on both.
 */
1075 static int ll_check_swap_layouts_validity(struct inode *inode1,
1076 struct inode *inode2)
1078 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
1081 if (inode_permission(inode1, MAY_WRITE) ||
1082 inode_permission(inode2, MAY_WRITE))
1085 if (inode1->i_sb != inode2->i_sb)
/*
 * Close @och with the MDS_CLOSE_LAYOUT_SWAP bias so the MDT atomically
 * swaps layouts between @inode and @inode2 at close time.  Rejects
 * identical FIDs with -EINVAL.
 */
1091 static int ll_swap_layouts_close(struct obd_client_handle *och,
1092 struct inode *inode, struct inode *inode2)
1094 const struct lu_fid *fid1 = ll_inode2fid(inode);
1095 const struct lu_fid *fid2;
1099 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
1100 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
1102 rc = ll_check_swap_layouts_validity(inode, inode2);
1104 GOTO(out_free_och, rc);
1106 /* We now know that inode2 is a lustre inode */
1107 fid2 = ll_inode2fid(inode2);
/* Swapping a file's layout with itself makes no sense. */
1109 rc = lu_fid_cmp(fid1, fid2);
1111 GOTO(out_free_och, rc = -EINVAL);
1113 /* Close the file and {swap,merge} layouts between inode & inode2.
1114 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
1115 * because we still need it to pack l_remote_handle to MDT. */
1116 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
1119 och = NULL; /* freed in ll_close_inode_openhandle() */
1129 * Release lease and close the file.
1130 * It will check if the lease has ever broken.
/*
 * Release a lease and close the open handle, optionally executing a
 * close intent (@bias/@data).  Reports via *lease_broken whether the
 * lease lock was already cancelled (lease broken); if so, the intent
 * is not executed.
 */
1132 static int ll_lease_close_intent(struct obd_client_handle *och,
1133 struct inode *inode,
1134 bool *lease_broken, enum mds_op_bias bias,
1137 struct ldlm_lock *lock;
1138 bool cancelled = true;
/* Check under the lock's resource lock whether it was cancelled. */
1142 lock = ldlm_handle2lock(&och->och_lease_handle);
1144 lock_res_and_lock(lock);
1145 cancelled = ldlm_is_cancel(lock);
1146 unlock_res_and_lock(lock);
1147 LDLM_LOCK_PUT(lock);
1150 CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1151 PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1153 if (lease_broken != NULL)
1154 *lease_broken = cancelled;
/* Unbroken lease with no intent: cancel the lease lock ourselves. */
1156 if (!cancelled && !bias)
1157 ldlm_cli_cancel(&och->och_lease_handle, 0);
1159 if (cancelled) { /* no need to excute intent */
1164 rc = ll_close_inode_openhandle(inode, och, bias, data);
/* Plain lease release: ll_lease_close_intent() with no close intent. */
1168 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1171 return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1175 * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
/*
 * With a lease held, start FLR mirror resync by sending
 * MDS_REINT_RESYNC to the MDT.  @arg is a userspace pointer to a
 * struct ll_ioc_lease_id carrying the mirror id.  Dirty pages are
 * flushed first (LL_DV_WR_FLUSH) so a layout version bump cannot
 * strand cached writes.
 */
1177 static int ll_lease_file_resync(struct obd_client_handle *och,
1178 struct inode *inode, unsigned long arg)
1180 struct ll_sb_info *sbi = ll_i2sbi(inode);
1181 struct md_op_data *op_data;
1182 struct ll_ioc_lease_id ioc;
1183 __u64 data_version_unused;
1187 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1188 LUSTRE_OPC_ANY, NULL);
1189 if (IS_ERR(op_data))
1190 RETURN(PTR_ERR(op_data));
1192 if (copy_from_user(&ioc, (struct ll_ioc_lease_id __user *)arg,
1196 /* before starting file resync, it's necessary to clean up page cache
1197 * in client memory, otherwise once the layout version is increased,
1198 * writing back cached data will be denied the OSTs. */
1199 rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1203 op_data->op_lease_handle = och->och_lease_handle;
1204 op_data->op_mirror_id = ioc.lil_mirror_id;
1205 rc = md_file_resync(sbi->ll_md_exp, op_data);
1211 ll_finish_md_op_data(op_data);
/*
 * Merge MDS-sourced inode attributes with OST-sourced cl_object
 * attributes (size, blocks, timestamps), under the inode size lock.
 * Timestamps take the newer of the two sources; atime is only moved
 * forward (see the long POSIX-compliance note below).
 */
1215 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1217 struct ll_inode_info *lli = ll_i2info(inode);
1218 struct cl_object *obj = lli->lli_clob;
1219 struct cl_attr *attr = vvp_env_thread_attr(env);
1227 ll_inode_size_lock(inode);
1229 /* Merge timestamps the most recently obtained from MDS with
1230 * timestamps obtained from OSTs.
1232 * Do not overwrite atime of inode because it may be refreshed
1233 * by file_accessed() function. If the read was served by cache
1234 * data, there is no RPC to be sent so that atime may not be
1235 * transferred to OSTs at all. MDT only updates atime at close time
1236 * if it's at least 'mdd.*.atime_diff' older.
1237 * All in all, the atime in Lustre does not strictly comply with
1238 * POSIX. Solving this problem needs to send an RPC to MDT for each
1239 * read, this will hurt performance.
1241 if (inode->i_atime.tv_sec < lli->lli_atime ||
1242 lli->lli_update_atime) {
1243 inode->i_atime.tv_sec = lli->lli_atime;
1244 lli->lli_update_atime = 0;
1246 inode->i_mtime.tv_sec = lli->lli_mtime;
1247 inode->i_ctime.tv_sec = lli->lli_ctime;
1249 mtime = inode->i_mtime.tv_sec;
1250 atime = inode->i_atime.tv_sec;
1251 ctime = inode->i_ctime.tv_sec;
1253 cl_object_attr_lock(obj);
/* Fault-injection point for testing the merge path. */
1254 if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1257 rc = cl_object_attr_get(env, obj, attr);
1258 cl_object_attr_unlock(obj);
/* -ENODATA (no OST attributes) is not an error for the merge. */
1261 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1263 if (atime < attr->cat_atime)
1264 atime = attr->cat_atime;
1266 if (ctime < attr->cat_ctime)
1267 ctime = attr->cat_ctime;
1269 if (mtime < attr->cat_mtime)
1270 mtime = attr->cat_mtime;
1272 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1273 PFID(&lli->lli_fid), attr->cat_size);
1275 i_size_write(inode, attr->cat_size);
1276 inode->i_blocks = attr->cat_blocks;
1278 inode->i_mtime.tv_sec = mtime;
1279 inode->i_atime.tv_sec = atime;
1280 inode->i_ctime.tv_sec = ctime;
1283 ll_inode_size_unlock(inode);
1289 * Set designated mirror for I/O.
1291 * So far only read, write, and truncated can support to issue I/O to
1292 * designated mirror.
1294 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1296 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1298 /* clear layout version for generic(non-resync) I/O in case it carries
1299 * stale layout version due to I/O restart */
1300 io->ci_layout_version = 0;
1302 /* FLR: disable non-delay for designated mirror I/O because obviously
1303 * only one mirror is available */
/* A per-fd designated mirror (set via lease/resync ioctls) pins the I/O
 * to that mirror and the fd's recorded layout version. */
1304 if (fd->fd_designated_mirror > 0) {
1306 io->ci_designated_mirror = fd->fd_designated_mirror;
1307 io->ci_layout_version = fd->fd_layout_version;
/* NOTE(review): "desiginated" typo below is in a runtime debug string;
 * left untouched here since this edit changes comments only. */
1310 CDEBUG(D_VFSTRACE, "%s: desiginated mirror: %d\n",
1311 file->f_path.dentry->d_name.name, io->ci_designated_mirror);
/* Return true if atime updates are disabled for this open file,
 * checking (in order) the open flags, the inode flags, the mount
 * flags, and the per-sb/directory noatime settings.  Mirrors the
 * kernel's file_accessed()/touch_atime() logic. */
1314 static bool file_is_noatime(const struct file *file)
1316 const struct vfsmount *mnt = file->f_path.mnt;
1317 const struct inode *inode = file_inode((struct file *)file);
1319 /* Adapted from file_accessed() and touch_atime().*/
1320 if (file->f_flags & O_NOATIME)
1323 if (inode->i_flags & S_NOATIME)
1326 if (IS_NOATIME(inode))
1329 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1332 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1335 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/* Initialize a cl_io for a read or write (@iot) on @file: propagate the
 * open flags (O_NONBLOCK/O_APPEND/O_SYNC/O_DIRECT) into the io, choose
 * the DLM locking mode, and set FLR mirror parameters. */
1341 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1343 struct inode *inode = file_inode(file);
1344 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1346 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1347 io->ci_lock_no_expand = fd->ll_lock_no_expand;
1349 if (iot == CIT_WRITE) {
1350 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1351 io->u.ci_wr.wr_sync = !!(file->f_flags & O_SYNC ||
1352 file->f_flags & O_DIRECT ||
1355 io->ci_obj = ll_i2info(inode)->lli_clob;
1356 io->ci_lockreq = CILR_MAYBE;
/* nolock mounts never take DLM locks; O_APPEND always must. */
1357 if (ll_file_nolock(file)) {
1358 io->ci_lockreq = CILR_NEVER;
1359 io->ci_no_srvlock = 1;
1360 } else if (file->f_flags & O_APPEND) {
1361 io->ci_lockreq = CILR_MANDATORY;
1363 io->ci_noatime = file_is_noatime(file);
1365 /* FLR: only use non-delay I/O for read as there is only one
1366 * available mirror for write. */
1367 io->ci_ndelay = !(iot == CIT_WRITE);
1369 ll_io_set_mirror(io, file);
/* Common engine for all llite reads and writes (normal and splice).
 * Initializes a cl_io, takes the per-file range lock where required,
 * runs the CLIO loop, and restarts the whole I/O when the layout
 * changed mid-flight (io->ci_need_restart).  Returns bytes transferred
 * or a negative errno.  The restart loop is order-sensitive; code is
 * left byte-identical here. */
1373 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1374 struct file *file, enum cl_io_type iot,
1375 loff_t *ppos, size_t count)
1377 struct vvp_io *vio = vvp_env_io(env);
1378 struct inode *inode = file_inode(file);
1379 struct ll_inode_info *lli = ll_i2info(inode);
1380 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1381 struct range_lock range;
1385 unsigned retried = 0;
1386 bool restarted = false;
1390 CDEBUG(D_VFSTRACE, "%s: %s ppos: %llu, count: %zu\n",
1391 file_dentry(file)->d_name.name,
1392 iot == CIT_READ ? "read" : "write", *ppos, count);
1395 io = vvp_env_thread_io(env);
1396 ll_io_init(io, file, iot);
/* Preserve the FLR non-delay retry count across restarts. */
1397 io->ci_ndelay_tried = retried;
1399 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1400 bool range_locked = false;
/* O_APPEND writes must lock to EOF since the start offset is unknown. */
1402 if (file->f_flags & O_APPEND)
1403 range_lock_init(&range, 0, LUSTRE_EOF);
1405 range_lock_init(&range, *ppos, *ppos + count - 1);
1407 vio->vui_fd = LUSTRE_FPRIVATE(file);
1408 vio->vui_io_subtype = args->via_io_subtype;
1410 switch (vio->vui_io_subtype) {
1412 vio->vui_iter = args->u.normal.via_iter;
1413 vio->vui_iocb = args->u.normal.via_iocb;
1414 /* Direct IO reads must also take range lock,
1415 * or multiple reads will try to work on the same pages
1416 * See LU-6227 for details. */
1417 if (((iot == CIT_WRITE) ||
1418 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1419 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1420 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1422 rc = range_lock(&lli->lli_write_tree, &range);
1426 range_locked = true;
1430 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1431 vio->u.splice.vui_flags = args->u.splice.via_flags;
1434 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1438 ll_cl_add(file, env, io, LCC_RW);
1439 rc = cl_io_loop(env, io);
1440 ll_cl_remove(file, env);
1443 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1445 range_unlock(&lli->lli_write_tree, &range);
1448 /* cl_io_rw_init() handled IO */
/* Accumulate progress and advance position even on a partial I/O. */
1452 if (io->ci_nob > 0) {
1453 result += io->ci_nob;
1454 count -= io->ci_nob;
1455 *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1457 /* prepare IO restart */
1458 if (count > 0 && args->via_io_subtype == IO_NORMAL)
1459 args->u.normal.via_iter = vio->vui_iter;
1462 cl_io_fini(env, io);
1465 "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1466 file->f_path.dentry->d_name.name,
1467 iot, rc, result, io->ci_need_restart);
/* Layout changed under us: loop and redo the remaining bytes. */
1469 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1471 "%s: restart %s from %lld, count: %zu, ret: %zd, rc: %d\n",
1472 file_dentry(file)->d_name.name,
1473 iot == CIT_READ ? "read" : "write",
1474 *ppos, count, result, rc);
1475 /* preserve the tried count for FLR */
1476 retried = io->ci_ndelay_tried;
/* Account stats and track fd_write_failed for fsync error reporting. */
1481 if (iot == CIT_READ) {
1483 ll_stats_ops_tally(ll_i2sbi(inode),
1484 LPROC_LL_READ_BYTES, result);
1485 } else if (iot == CIT_WRITE) {
1487 ll_stats_ops_tally(ll_i2sbi(inode),
1488 LPROC_LL_WRITE_BYTES, result);
1489 fd->fd_write_failed = false;
1490 } else if (result == 0 && rc == 0) {
1493 fd->fd_write_failed = true;
1495 fd->fd_write_failed = false;
1496 } else if (rc != -ERESTARTSYS) {
1497 fd->fd_write_failed = true;
1501 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1503 RETURN(result > 0 ? result : rc);
1507 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1508 * especially for small I/O.
1510 * To serve a read request, CLIO has to create and initialize a cl_io and
1511 * then request DLM lock. This has turned out to have significant overhead
1512 * and affects the performance of small I/O dramatically.
1514 * It's not necessary to create a cl_io for each I/O. Under the help of read
1515 * ahead, most of the pages being read are already in memory cache and we can
1516 * read those pages directly because if the pages exist, the corresponding DLM
1517 * lock must exist so that page content must be valid.
1519 * In fast read implementation, the llite speculatively finds and reads pages
1520 * in memory cache. There are three scenarios for fast read:
1521 * - If the page exists and is uptodate, kernel VM will provide the data and
1522 * CLIO won't be intervened;
1523 * - If the page was brought into memory by read ahead, it will be exported
1524 * and read ahead parameters will be updated;
1525 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1526 * it will go back and invoke normal read, i.e., a cl_io will be created
1527 * and DLM lock will be requested.
1529 * POSIX compliance: posix standard states that read is intended to be atomic.
1530 * Lustre read implementation is in line with Linux kernel read implementation
1531 * and neither of them complies with POSIX standard in this matter. Fast read
1532 * doesn't make the situation worse on single node but it may interleave write
1533 * results from multiple nodes due to short read handling in ll_file_aio_read().
1535 * \param env - lu_env
1536 * \param iocb - kiocb from kernel
1537 * \param iter - user space buffers where the data will be copied
1539 * \retval - number of bytes have been read, or error code if error occurred.
/* Fast-read path: serve the read straight from the page cache via
 * generic_file_read_iter(), bypassing cl_io/DLM setup.  Returns bytes
 * read, or an error; -ENODATA from ll_readpage() means the page was not
 * cached and is translated by the caller into a normal read. */
1542 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1546 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1549 /* NB: we can't do direct IO for fast read because it will need a lock
1550 * to make IO engine happy. */
1551 if (iocb->ki_filp->f_flags & O_DIRECT)
1554 result = generic_file_read_iter(iocb, iter)
1556 /* If the first page is not in cache, generic_file_aio_read() will be
1557 * returned with -ENODATA.
1558 * See corresponding code in ll_readpage(). */
1559 if (result == -ENODATA)
1563 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1564 LPROC_LL_READ_BYTES, result);
1570 * Read from a file (through the page cache).
/* .read_iter handler: try the fast-read path first; if bytes remain
 * (or fast read was not possible) fall back to the generic cl_io path
 * via ll_file_io_generic(). */
1572 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1575 struct vvp_io_args *args;
1580 result = ll_do_fast_read(iocb, to);
/* Fast read consumed everything or hit a hard error: done. */
1581 if (result < 0 || iov_iter_count(to) == 0)
1584 env = cl_env_get(&refcheck);
1586 return PTR_ERR(env);
1588 args = ll_env_args(env, IO_NORMAL);
1589 args->u.normal.via_iter = to;
1590 args->u.normal.via_iocb = iocb;
1592 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1593 &iocb->ki_pos, iov_iter_count(to));
1596 else if (result == 0)
1599 cl_env_put(env, &refcheck);
1605 * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1606 * If a page is already in the page cache and dirty (and some other things -
1607 * See ll_tiny_write_begin for the instantiation of these rules), then we can
1608 * write to it without doing a full I/O, because Lustre already knows about it
1609 * and will write it out. This saves a lot of processing time.
1611 * All writes here are within one page, so exclusion is handled by the page
1612 * lock on the vm page. We do not do tiny writes for writes which touch
1613 * multiple pages because it's very unlikely multiple sequential pages are
1614 * already dirty.
1616 * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1617 * and are unlikely to be to already dirty pages.
1619 * Attribute updates are important here, we do them in ll_tiny_write_end.
/* Tiny-write path: for sub-page writes, write directly through the page
 * cache with __generic_file_write_iter() (see ll_tiny_write_begin for
 * the eligibility rules).  -ENODATA means the page was not already
 * dirty and the caller falls back to a normal write. */
1621 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1623 ssize_t count = iov_iter_count(iter);
1624 struct file *file = iocb->ki_filp;
1625 struct inode *inode = file_inode(file);
/* SUID/SGID stripping may be needed, which requires i_mutex. */
1626 bool lock_inode = !IS_NOSEC(inode);
1631 /* Restrict writes to single page and < PAGE_SIZE. See comment at top
1632 * of function for why.
1634 if (count >= PAGE_SIZE ||
1635 (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
1638 if (unlikely(lock_inode))
1640 result = __generic_file_write_iter(iocb, iter);
1642 if (unlikely(lock_inode))
1643 inode_unlock(inode);
1645 /* If the page is not already dirty, ll_tiny_write_begin returns
1646 * -ENODATA. We continue on to normal write.
1648 if (result == -ENODATA)
1652 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1654 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1657 CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1663 * Write to a file (through the page cache).
/* .write_iter handler: attempt the tiny-write fast path, then fall back
 * to the generic cl_io write for any remaining bytes, combining the two
 * byte counts on success. */
1665 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1667 struct vvp_io_args *args;
1669 ssize_t rc_tiny = 0, rc_normal;
1674 /* NB: we can't do direct IO for tiny writes because they use the page
1675 * cache, we can't do sync writes because tiny writes can't flush
1676 * pages, and we can't do append writes because we can't guarantee the
1677 * required DLM locks are held to protect file size.
1679 if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(iocb->ki_filp))) &&
1680 !(iocb->ki_filp->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1681 rc_tiny = ll_do_tiny_write(iocb, from);
1683 /* In case of error, go on and try normal write - Only stop if tiny
1684 * write completed I/O.
1686 if (iov_iter_count(from) == 0)
1687 GOTO(out, rc_normal = rc_tiny);
1689 env = cl_env_get(&refcheck);
1691 return PTR_ERR(env);
1693 args = ll_env_args(env, IO_NORMAL);
1694 args->u.normal.via_iter = from;
1695 args->u.normal.via_iocb = iocb;
1697 rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1698 &iocb->ki_pos, iov_iter_count(from));
1700 /* On success, combine bytes written. */
1701 if (rc_tiny >= 0 && rc_normal > 0)
1702 rc_normal += rc_tiny;
1703 /* On error, only return error from normal write if tiny write did not
1704 * write any bytes. Otherwise return bytes written by tiny write.
1706 else if (rc_tiny > 0)
1707 rc_normal = rc_tiny;
1709 cl_env_put(env, &refcheck);
1714 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1716 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/* Validate an iovec array and compute its total byte count into *count,
 * trimming *nr_segs at the first inaccessible segment.  Returns 0 or
 * -EINVAL on negative/overflowing lengths. */
1718 static int ll_file_get_iov_count(const struct iovec *iov,
1719 unsigned long *nr_segs, size_t *count)
1724 for (seg = 0; seg < *nr_segs; seg++) {
1725 const struct iovec *iv = &iov[seg];
1728 * If any segment has a negative length, or the cumulative
1729 * length ever wraps negative then return -EINVAL.
1732 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1734 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1739 cnt -= iv->iov_len; /* This segment is no good */
/* Legacy .aio_read entry (kernels without read_iter): validate the
 * iovec, wrap it in an iov_iter, and forward to ll_file_read_iter(). */
1746 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1747 unsigned long nr_segs, loff_t pos)
1754 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1758 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1759 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1760 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1761 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1762 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1764 result = ll_file_read_iter(iocb, &to);
/* Legacy synchronous .read entry: build a single-segment iovec and a
 * sync kiocb, delegate to ll_file_aio_read(), then propagate the
 * updated file position back to *ppos. */
1769 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1772 struct iovec iov = { .iov_base = buf, .iov_len = count };
1777 init_sync_kiocb(&kiocb, file);
1778 kiocb.ki_pos = *ppos;
1779 #ifdef HAVE_KIOCB_KI_LEFT
1780 kiocb.ki_left = count;
1781 #elif defined(HAVE_KI_NBYTES)
/* Fixed: struct kiocb's member is ki_nbytes (as ll_file_write uses under
 * the same HAVE_KI_NBYTES guard); "i_nbytes" was a typo that would not
 * compile on kernels defining HAVE_KI_NBYTES. */
1782 kiocb.ki_nbytes = count;
1785 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1786 *ppos = kiocb.ki_pos;
1792 * Write to a file (through the page cache).
/* Legacy .aio_write entry (kernels without write_iter): validate the
 * iovec, wrap it in an iov_iter, and forward to ll_file_write_iter(). */
1795 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1796 unsigned long nr_segs, loff_t pos)
1798 struct iov_iter from;
1803 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1807 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1808 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1809 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1810 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1811 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1813 result = ll_file_write_iter(iocb, &from);
/* Legacy synchronous .write entry: build a single-segment iovec and a
 * sync kiocb, delegate to ll_file_aio_write(), then propagate the
 * updated file position back to *ppos. */
1818 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1819 size_t count, loff_t *ppos)
1821 struct iovec iov = { .iov_base = (void __user *)buf,
1828 init_sync_kiocb(&kiocb, file);
1829 kiocb.ki_pos = *ppos;
1830 #ifdef HAVE_KIOCB_KI_LEFT
1831 kiocb.ki_left = count;
1832 #elif defined(HAVE_KI_NBYTES)
1833 kiocb.ki_nbytes = count;
1836 result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1837 *ppos = kiocb.ki_pos;
1841 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1844 * Send file content (through pagecache) somewhere with helper
/* .splice_read handler: run a CIT_READ through ll_file_io_generic()
 * with the IO_SPLICE subtype so vvp copies pages into @pipe. */
1846 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1847 struct pipe_inode_info *pipe, size_t count,
1851 struct vvp_io_args *args;
1856 env = cl_env_get(&refcheck);
1858 RETURN(PTR_ERR(env));
1860 args = ll_env_args(env, IO_SPLICE);
1861 args->u.splice.via_pipe = pipe;
1862 args->u.splice.via_flags = flags;
1864 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1865 cl_env_put(env, &refcheck);
/* Set the LOV striping EA on @inode by re-opening it by FID with the
 * given lov_user_md (@lum) attached to the open intent, then releasing
 * the transient open handle.  Runs under the inode size lock. */
1869 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1870 __u64 flags, struct lov_user_md *lum, int lum_size)
1872 struct lookup_intent oit = {
1874 .it_flags = flags | MDS_OPEN_BY_FID,
1879 ll_inode_size_lock(inode);
1880 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1882 GOTO(out_unlock, rc);
/* The open was only needed to carry the EA; close it immediately. */
1884 ll_release_openhandle(dentry, &oit);
1887 ll_inode_size_unlock(inode);
1888 ll_intent_release(&oit);
/* Fetch the LOV striping EA of @filename (a child of directory @inode)
 * from the MDS.  On success *lmmp points into the reply buffer of
 * *request (caller must keep/free the request), *lmm_size is its size.
 * The EA is byte-swapped to host endianness when needed. */
1893 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1894 struct lov_mds_md **lmmp, int *lmm_size,
1895 struct ptlrpc_request **request)
1897 struct ll_sb_info *sbi = ll_i2sbi(inode);
1898 struct mdt_body *body;
1899 struct lov_mds_md *lmm = NULL;
1900 struct ptlrpc_request *req = NULL;
1901 struct md_op_data *op_data;
1904 rc = ll_get_default_mdsize(sbi, &lmmsize);
1908 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1909 strlen(filename), lmmsize,
1910 LUSTRE_OPC_ANY, NULL);
1911 if (IS_ERR(op_data))
1912 RETURN(PTR_ERR(op_data));
1914 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1915 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1916 ll_finish_md_op_data(op_data);
1918 CDEBUG(D_INFO, "md_getattr_name failed "
1919 "on %s: rc %d\n", filename, rc);
1923 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1924 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1926 lmmsize = body->mbo_eadatasize;
/* No striping EA present on the target: report -ENODATA. */
1928 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1930 GOTO(out, rc = -ENODATA);
1933 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1934 LASSERT(lmm != NULL);
/* Only plain V1/V3 and composite layouts are understood here. */
1936 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
1937 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
1938 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
1939 GOTO(out, rc = -EPROTO);
1942 * This is coming from the MDS, so is probably in
1943 * little endian. We convert it to host endian before
1944 * passing it to userspace.
/* Swab only on big-endian hosts (where LOV_MAGIC != its LE form). */
1946 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1949 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
1950 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1951 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1952 if (le32_to_cpu(lmm->lmm_pattern) &
1953 LOV_PATTERN_F_RELEASED)
1957 /* if called for a directory - avoid swabbing the
1958 * lsm objects, which do not exist there */
1959 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1960 lustre_swab_lov_user_md_v1(
1961 (struct lov_user_md_v1 *)lmm);
1962 if (S_ISREG(body->mbo_mode))
1963 lustre_swab_lov_user_md_objects(
1964 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1966 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1967 lustre_swab_lov_user_md_v3(
1968 (struct lov_user_md_v3 *)lmm);
1969 if (S_ISREG(body->mbo_mode))
1970 lustre_swab_lov_user_md_objects(
1971 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1973 } else if (lmm->lmm_magic ==
1974 cpu_to_le32(LOV_MAGIC_COMP_V1)) {
1975 lustre_swab_lov_comp_md_v1(
1976 (struct lov_comp_md_v1 *)lmm);
1982 *lmm_size = lmmsize;
/* LL_IOC_LOV_SETEA handler: admin-only.  Copy a lov_user_md (with one
 * ost_data entry) from userspace and apply it as the file's striping EA
 * via ll_lov_setstripe_ea_info(). */
1987 static int ll_lov_setea(struct inode *inode, struct file *file,
1990 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1991 struct lov_user_md *lump;
1992 int lum_size = sizeof(struct lov_user_md) +
1993 sizeof(struct lov_user_ost_data);
1997 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2000 OBD_ALLOC_LARGE(lump, lum_size);
2004 if (copy_from_user(lump, arg, lum_size))
2005 GOTO(out_lump, rc = -EFAULT);
2007 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
/* Clear the O_LOV_DELAY_CREATE state now that the layout is set. */
2009 cl_lov_delay_create_clear(&file->f_flags);
2012 OBD_FREE_LARGE(lump, lum_size);
/* Copy the file's layout (striping) into the userspace buffer @lum of
 * @size bytes, via cl_object_getstripe(). */
2016 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
2023 env = cl_env_get(&refcheck);
2025 RETURN(PTR_ERR(env));
2027 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2028 cl_env_put(env, &refcheck);
/* LL_IOC_LOV_SETSTRIPE handler: copy the user layout, apply it, refresh
 * the layout generation, and echo the resulting stripe info back to the
 * user buffer. */
2032 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2035 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2036 struct lov_user_md *klum;
2038 __u64 flags = FMODE_WRITE;
2041 rc = ll_copy_user_md(lum, &klum);
2046 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
/* Tell userspace the kernel chose the stripe count. */
2051 rc = put_user(0, &lum->lmm_stripe_count);
2055 rc = ll_layout_refresh(inode, &gen);
2059 rc = ll_file_getstripe(inode, arg, lum_size);
2061 cl_lov_delay_create_clear(&file->f_flags);
2064 OBD_FREE(klum, lum_size);
/* Take a group lock with gid @arg on @inode for this open file.
 * Instantiates all OST objects of a composite (PFL) layout first, then
 * acquires the group lock and records it in the file descriptor under
 * lli_lock; handles a race with a concurrent taker. */
2069 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2071 struct ll_inode_info *lli = ll_i2info(inode);
2072 struct cl_object *obj = lli->lli_clob;
2073 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2074 struct ll_grouplock grouplock;
2079 CWARN("group id for group lock must not be 0\n");
2083 if (ll_file_nolock(file))
2084 RETURN(-EOPNOTSUPP);
2086 spin_lock(&lli->lli_lock);
2087 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2088 CWARN("group lock already existed with gid %lu\n",
2089 fd->fd_grouplock.lg_gid);
2090 spin_unlock(&lli->lli_lock);
2093 LASSERT(fd->fd_grouplock.lg_lock == NULL);
2094 spin_unlock(&lli->lli_lock);
2097 * XXX: group lock needs to protect all OST objects while PFL
2098 * can add new OST objects during the IO, so we'd instantiate
2099 * all OST objects before getting its group lock.
2104 struct cl_layout cl = {
2105 .cl_is_composite = false,
2107 struct lu_extent ext = {
2109 .e_end = OBD_OBJECT_EOF,
2112 env = cl_env_get(&refcheck);
2114 RETURN(PTR_ERR(env));
2116 rc = cl_object_layout_get(env, obj, &cl);
/* Composite layout: force instantiation of every component. */
2117 if (!rc && cl.cl_is_composite)
2118 rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2121 cl_env_put(env, &refcheck);
2126 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2127 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* Re-check under the spinlock: another thread may have won the race. */
2131 spin_lock(&lli->lli_lock);
2132 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2133 spin_unlock(&lli->lli_lock);
2134 CERROR("another thread just won the race\n");
2135 cl_put_grouplock(&grouplock);
2139 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2140 fd->fd_grouplock = grouplock;
2141 spin_unlock(&lli->lli_lock);
2143 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/* Release the group lock with gid @arg held on this open file.  Fails
 * (with a warning) if no group lock is held or the gid does not match
 * the one recorded in the file descriptor. */
2147 static int ll_put_grouplock(struct inode *inode, struct file *file,
2150 struct ll_inode_info *lli = ll_i2info(inode);
2151 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2152 struct ll_grouplock grouplock;
2155 spin_lock(&lli->lli_lock);
2156 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2157 spin_unlock(&lli->lli_lock);
2158 CWARN("no group lock held\n");
2162 LASSERT(fd->fd_grouplock.lg_lock != NULL);
2164 if (fd->fd_grouplock.lg_gid != arg) {
2165 CWARN("group lock %lu doesn't match current id %lu\n",
2166 arg, fd->fd_grouplock.lg_gid);
2167 spin_unlock(&lli->lli_lock);
/* Detach the lock from the fd under the spinlock, release it after. */
2171 grouplock = fd->fd_grouplock;
2172 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2173 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2174 spin_unlock(&lli->lli_lock);
2176 cl_put_grouplock(&grouplock);
2177 CDEBUG(D_INFO, "group lock %lu released\n", arg);
2182 * Close inode open handle
2184 * \param dentry [in] dentry which contains the inode
2185 * \param it [in,out] intent which contains open info and result
2188 * \retval <0 failure
2190 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2192 struct inode *inode = dentry->d_inode;
2193 struct obd_client_handle *och;
2199 /* Root ? Do nothing. */
2200 if (dentry->d_inode->i_sb->s_root == dentry)
2203 /* No open handle to close? Move away */
2204 if (!it_disposition(it, DISP_OPEN_OPEN))
2207 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2209 OBD_ALLOC(och, sizeof(*och));
2211 GOTO(out, rc = -ENOMEM);
/* Fill the client handle from the intent, then close it on the MDS. */
2213 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2215 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2217 /* this one is in place of ll_file_open */
2218 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2219 ptlrpc_req_finished(it->it_request);
2220 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2226 * Get size for inode for which FIEMAP mapping is requested.
2227 * Make the FIEMAP get_info call and returns the result.
2228 * \param fiemap kernel buffer to hold extents
2229 * \param num_bytes kernel buffer size
2231 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2237 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2240 /* Checks for fiemap flags */
2241 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* Report the unsupported flags back to the caller (FIEMAP protocol). */
2242 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2246 /* Check for FIEMAP_FLAG_SYNC */
2247 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2248 rc = filemap_fdatawrite(inode->i_mapping);
2253 env = cl_env_get(&refcheck);
2255 RETURN(PTR_ERR(env));
/* A zero cached size may just be stale; glimpse to get the real one. */
2257 if (i_size_read(inode) == 0) {
2258 rc = ll_glimpse_size(inode);
2263 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2264 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2265 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2267 /* If filesize is 0, then there would be no objects for mapping */
2268 if (fmkey.lfik_oa.o_size == 0) {
2269 fiemap->fm_mapped_extents = 0;
2273 fmkey.lfik_fiemap = *fiemap;
2275 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2276 &fmkey, fiemap, &num_bytes);
2278 cl_env_put(env, &refcheck);
/* OBD_IOC_FID2PATH handler: resolve a FID to a pathname via the MDC.
 * @arg points to a userspace getinfo_fid2path; the reply (including the
 * path) is copied back into the same buffer. */
2282 int ll_fid2path(struct inode *inode, void __user *arg)
2284 struct obd_export *exp = ll_i2mdexp(inode);
2285 const struct getinfo_fid2path __user *gfin = arg;
2287 struct getinfo_fid2path *gfout;
/* Needs DAC_READ_SEARCH unless the fs allows user fid2path. */
2293 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2294 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2297 /* Only need to get the buflen */
2298 if (get_user(pathlen, &gfin->gf_pathlen))
2301 if (pathlen > PATH_MAX)
2304 outsize = sizeof(*gfout) + pathlen;
2305 OBD_ALLOC(gfout, outsize);
2309 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2310 GOTO(gf_free, rc = -EFAULT);
2311 /* append root FID after gfout to let MDT know the root FID so that it
2312 * can lookup the correct path, this is mainly for fileset.
2313 * old server without fileset mount support will ignore this. */
2314 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2316 /* Call mdc_iocontrol */
2317 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2321 if (copy_to_user(arg, gfout, outsize))
2325 OBD_FREE(gfout, outsize);
/* Run a CIT_DATA_VERSION cl_io on the file's object to obtain the data
 * version (and layout version) according to ioc->idv_flags.  An inode
 * with no file object is reported as version 0. */
2330 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2332 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2340 ioc->idv_version = 0;
2341 ioc->idv_layout_version = UINT_MAX;
2343 /* If no file object initialized, we consider its version is 0. */
2347 env = cl_env_get(&refcheck);
2349 RETURN(PTR_ERR(env));
2351 io = vvp_env_thread_io(env);
2353 io->u.ci_data_version.dv_data_version = 0;
2354 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2355 io->u.ci_data_version.dv_flags = ioc->idv_flags;
2358 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2359 result = cl_io_loop(env, io);
2361 result = io->ci_result;
2363 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2364 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2366 cl_io_fini(env, io);
/* The layout may have changed during the io; redo from scratch. */
2368 if (unlikely(io->ci_need_restart))
2371 cl_env_put(env, &refcheck);
2377 * Read the data_version for inode.
2379 * This value is computed using stripe object version on OST.
2380 * Version is computed using server side locking.
2382 * @param flags if do sync on the OST side;
2384 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2385 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/* Convenience wrapper over ll_ioc_data_version() returning only the
 * data version in *data_version. */
2387 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2389 struct ioc_data_version ioc = { .idv_flags = flags };
2392 rc = ll_ioc_data_version(inode, &ioc);
2394 *data_version = ioc.idv_version;
2400 * Trigger a HSM release request for the provided inode.
2402 int ll_hsm_release(struct inode *inode)
2405 struct obd_client_handle *och = NULL;
2406 __u64 data_version = 0;
2411 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2412 ll_get_fsname(inode->i_sb, NULL, 0),
2413 PFID(&ll_i2info(inode)->lli_fid));
/* Take a write lease so no other client can modify during release. */
2415 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2417 GOTO(out, rc = PTR_ERR(och));
2419 /* Grab latest data_version and [am]time values */
2420 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2424 env = cl_env_get(&refcheck);
2426 GOTO(out, rc = PTR_ERR(env));
2428 rc = ll_merge_attr(env, inode);
2429 cl_env_put(env, &refcheck);
2431 /* If error happen, we have the wrong size for a file.
2437 /* Release the file.
2438 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2439 * we still need it to pack l_remote_handle to MDT. */
2440 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2446 if (och != NULL && !IS_ERR(och)) /* close the file */
2447 ll_lease_close(och, inode, NULL);
/* State bundle for ll_swap_layouts(): the two inodes being swapped plus
 * (on elided lines) their data versions and check flags. */
2452 struct ll_swap_stack {
2455 struct inode *inode1;
2456 struct inode *inode2;
/* LL_IOC_LOV_SWAP_LAYOUTS handler: atomically swap the layouts of the
 * two open files.  Orders the pair by FID, optionally takes group locks
 * to flush dirty cache, verifies requested data versions, then sends
 * the swap to the MDT. */
2461 static int ll_swap_layouts(struct file *file1, struct file *file2,
2462 struct lustre_swap_layouts *lsl)
2464 struct mdc_swap_layouts msl;
2465 struct md_op_data *op_data;
2468 struct ll_swap_stack *llss = NULL;
2471 OBD_ALLOC_PTR(llss);
2475 llss->inode1 = file_inode(file1);
2476 llss->inode2 = file_inode(file2);
2478 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2482 /* we use 2 bool because it is easier to swap than 2 bits */
2483 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2484 llss->check_dv1 = true;
2486 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2487 llss->check_dv2 = true;
2489 /* we cannot use lsl->sl_dvX directly because we may swap them */
2490 llss->dv1 = lsl->sl_dv1;
2491 llss->dv2 = lsl->sl_dv2;
2493 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2494 if (rc == 0) /* same file, done! */
2497 if (rc < 0) { /* sequentialize it */
2498 swap(llss->inode1, llss->inode2);
2500 swap(llss->dv1, llss->dv2);
2501 swap(llss->check_dv1, llss->check_dv2);
2505 if (gid != 0) { /* application asks to flush dirty cache */
2506 rc = ll_get_grouplock(llss->inode1, file1, gid);
2510 rc = ll_get_grouplock(llss->inode2, file2, gid);
2512 ll_put_grouplock(llss->inode1, file1, gid);
2517 /* ultimate check, before swapping the layouts we check if
2518 * dataversion has changed (if requested) */
2519 if (llss->check_dv1) {
2520 rc = ll_data_version(llss->inode1, &dv, 0);
2523 if (dv != llss->dv1)
2524 GOTO(putgl, rc = -EAGAIN);
2527 if (llss->check_dv2) {
2528 rc = ll_data_version(llss->inode2, &dv, 0);
2531 if (dv != llss->dv2)
2532 GOTO(putgl, rc = -EAGAIN);
2535 /* struct md_op_data is used to send the swap args to the mdt
2536 * only flags is missing, so we use struct mdc_swap_layouts
2537 * through the md_op_data->op_data */
2538 /* flags from user space have to be converted before they are send to
2539 * server, no flag is sent today, they are only used on the client */
2542 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2543 0, LUSTRE_OPC_ANY, &msl);
2544 if (IS_ERR(op_data))
2545 GOTO(free, rc = PTR_ERR(op_data));
2547 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2548 sizeof(*op_data), op_data, NULL);
2549 ll_finish_md_op_data(op_data);
/* Release in reverse acquisition order. */
2556 ll_put_grouplock(llss->inode2, file2, gid);
2557 ll_put_grouplock(llss->inode1, file1, gid);
/* Set/clear HSM state flags on @inode via LL_IOC_HSM_STATE_SET.
 * Validates the masks (non-root may only touch HSM_USER_MASK flags) and
 * the archive id range for servers without archive-id arrays. */
2567 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2569 struct obd_export *exp = ll_i2mdexp(inode);
2570 struct md_op_data *op_data;
2574 /* Detect out-of range masks */
2575 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2578 /* Non-root users are forbidden to set or clear flags which are
2579 * NOT defined in HSM_USER_MASK. */
2580 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2581 !cfs_capable(CFS_CAP_SYS_ADMIN))
2584 if (!exp_connect_archive_id_array(exp)) {
2585 /* Detect out-of range archive id */
2586 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2587 (hss->hss_archive_id > LL_HSM_ORIGIN_MAX_ARCHIVE))
2591 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2592 LUSTRE_OPC_ANY, hss);
2593 if (IS_ERR(op_data))
2594 RETURN(PTR_ERR(op_data));
2596 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, exp, sizeof(*op_data),
2599 ll_finish_md_op_data(op_data);
2604 static int ll_hsm_import(struct inode *inode, struct file *file,
2605 struct hsm_user_import *hui)
2607 struct hsm_state_set *hss = NULL;
2608 struct iattr *attr = NULL;
2612 if (!S_ISREG(inode->i_mode))
2618 GOTO(out, rc = -ENOMEM);
2620 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2621 hss->hss_archive_id = hui->hui_archive_id;
2622 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2623 rc = ll_hsm_state_set(inode, hss);
2627 OBD_ALLOC_PTR(attr);
2629 GOTO(out, rc = -ENOMEM);
2631 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2632 attr->ia_mode |= S_IFREG;
2633 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2634 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2635 attr->ia_size = hui->hui_size;
2636 attr->ia_mtime.tv_sec = hui->hui_mtime;
2637 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2638 attr->ia_atime.tv_sec = hui->hui_atime;
2639 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2641 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2642 ATTR_UID | ATTR_GID |
2643 ATTR_MTIME | ATTR_MTIME_SET |
2644 ATTR_ATIME | ATTR_ATIME_SET;
2648 rc = ll_setattr_raw(file_dentry(file), attr, 0, true);
2652 inode_unlock(inode);
2664 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2666 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2667 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
2670 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2672 struct inode *inode = file_inode(file);
2674 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2675 ATTR_MTIME | ATTR_MTIME_SET |
2678 .tv_sec = lfu->lfu_atime_sec,
2679 .tv_nsec = lfu->lfu_atime_nsec,
2682 .tv_sec = lfu->lfu_mtime_sec,
2683 .tv_nsec = lfu->lfu_mtime_nsec,
2686 .tv_sec = lfu->lfu_ctime_sec,
2687 .tv_nsec = lfu->lfu_ctime_nsec,
2693 if (!capable(CAP_SYS_ADMIN))
2696 if (!S_ISREG(inode->i_mode))
2700 rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET,
2702 inode_unlock(inode);
2707 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2710 case MODE_READ_USER:
2712 case MODE_WRITE_USER:
2719 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2721 /* Used to allow the upper layers of the client to request an LDLM lock
2722 * without doing an actual read or write.
2724 * Used for ladvise lockahead to manually request specific locks.
2726 * \param[in] file file this ladvise lock request is on
2727 * \param[in] ladvise ladvise struct describing this lock request
2729 * \retval 0 success, no detailed result available (sync requests
2730 * and requests sent to the server [not handled locally]
2731 * cannot return detailed results)
2732 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2733 * see definitions for details.
2734 * \retval negative negative errno on error
2736 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2738 struct lu_env *env = NULL;
2739 struct cl_io *io = NULL;
2740 struct cl_lock *lock = NULL;
2741 struct cl_lock_descr *descr = NULL;
2742 struct dentry *dentry = file->f_path.dentry;
2743 struct inode *inode = dentry->d_inode;
2744 enum cl_lock_mode cl_mode;
2745 off_t start = ladvise->lla_start;
2746 off_t end = ladvise->lla_end;
2752 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2753 "start=%llu, end=%llu\n", dentry->d_name.len,
2754 dentry->d_name.name, dentry->d_inode,
2755 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2758 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2760 GOTO(out, result = cl_mode);
2762 /* Get IO environment */
2763 result = cl_io_get(inode, &env, &io, &refcheck);
2767 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2770 * nothing to do for this io. This currently happens when
2771 * stripe sub-object's are not yet created.
2773 result = io->ci_result;
2774 } else if (result == 0) {
2775 lock = vvp_env_lock(env);
2776 descr = &lock->cll_descr;
2778 descr->cld_obj = io->ci_obj;
2779 /* Convert byte offsets to pages */
2780 descr->cld_start = cl_index(io->ci_obj, start);
2781 descr->cld_end = cl_index(io->ci_obj, end);
2782 descr->cld_mode = cl_mode;
2783 /* CEF_MUST is used because we do not want to convert a
2784 * lockahead request to a lockless lock */
2785 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2788 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2789 descr->cld_enq_flags |= CEF_SPECULATIVE;
2791 result = cl_lock_request(env, io, lock);
2793 /* On success, we need to release the lock */
2795 cl_lock_release(env, lock);
2797 cl_io_fini(env, io);
2798 cl_env_put(env, &refcheck);
2800 /* -ECANCELED indicates a matching lock with a different extent
2801 * was already present, and -EEXIST indicates a matching lock
2802 * on exactly the same extent was already present.
2803 * We convert them to positive values for userspace to make
2804 * recognizing true errors easier.
2805 * Note we can only return these detailed results on async requests,
2806 * as sync requests look the same as i/o requests for locking. */
2807 if (result == -ECANCELED)
2808 result = LLA_RESULT_DIFFERENT;
2809 else if (result == -EEXIST)
2810 result = LLA_RESULT_SAME;
2815 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
2817 static int ll_ladvise_sanity(struct inode *inode,
2818 struct llapi_lu_ladvise *ladvise)
2820 enum lu_ladvise_type advice = ladvise->lla_advice;
2821 /* Note the peradvice flags is a 32 bit field, so per advice flags must
2822 * be in the first 32 bits of enum ladvise_flags */
2823 __u32 flags = ladvise->lla_peradvice_flags;
2824 /* 3 lines at 80 characters per line, should be plenty */
2827 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2829 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
2830 "last supported advice is %s (value '%d'): rc = %d\n",
2831 ll_get_fsname(inode->i_sb, NULL, 0), advice,
2832 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2836 /* Per-advice checks */
2838 case LU_LADVISE_LOCKNOEXPAND:
2839 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2841 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2843 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2844 ladvise_names[advice], rc);
2848 case LU_LADVISE_LOCKAHEAD:
2849 /* Currently only READ and WRITE modes can be requested */
2850 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2851 ladvise->lla_lockahead_mode == 0) {
2853 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2855 ll_get_fsname(inode->i_sb, NULL, 0),
2856 ladvise->lla_lockahead_mode,
2857 ladvise_names[advice], rc);
2860 case LU_LADVISE_WILLREAD:
2861 case LU_LADVISE_DONTNEED:
2863 /* Note fall through above - These checks apply to all advices
2864 * except LOCKNOEXPAND */
2865 if (flags & ~LF_DEFAULT_MASK) {
2867 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2869 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2870 ladvise_names[advice], rc);
2873 if (ladvise->lla_start >= ladvise->lla_end) {
2875 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2876 "for %s: rc = %d\n",
2877 ll_get_fsname(inode->i_sb, NULL, 0),
2878 ladvise->lla_start, ladvise->lla_end,
2879 ladvise_names[advice], rc);
2891 * Give file access advices
2893 * The ladvise interface is similar to Linux fadvise() system call, except it
2894 * forwards the advices directly from Lustre client to server. The server side
2895 * codes will apply appropriate read-ahead and caching techniques for the
2896 * corresponding files.
2898 * A typical workload for ladvise is e.g. a bunch of different clients are
2899 * doing small random reads of a file, so prefetching pages into OSS cache
2900 * with big linear reads before the random IO is a net benefit. Fetching
2901 * all that data into each client cache with fadvise() may not be, due to
2902 * much more data being sent to the client.
2904 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2905 struct llapi_lu_ladvise *ladvise)
2909 struct cl_ladvise_io *lio;
2914 env = cl_env_get(&refcheck);
2916 RETURN(PTR_ERR(env));
2918 io = vvp_env_thread_io(env);
2919 io->ci_obj = ll_i2info(inode)->lli_clob;
2921 /* initialize parameters for ladvise */
2922 lio = &io->u.ci_ladvise;
2923 lio->li_start = ladvise->lla_start;
2924 lio->li_end = ladvise->lla_end;
2925 lio->li_fid = ll_inode2fid(inode);
2926 lio->li_advice = ladvise->lla_advice;
2927 lio->li_flags = flags;
2929 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2930 rc = cl_io_loop(env, io);
2934 cl_io_fini(env, io);
2935 cl_env_put(env, &refcheck);
2939 static int ll_lock_noexpand(struct file *file, int flags)
2941 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2943 fd->ll_lock_no_expand = !(flags & LF_UNSET);
2948 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
2951 struct fsxattr fsxattr;
2953 if (copy_from_user(&fsxattr,
2954 (const struct fsxattr __user *)arg,
2958 fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
2959 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
2960 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
2961 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
2962 if (copy_to_user((struct fsxattr __user *)arg,
2963 &fsxattr, sizeof(fsxattr)))
2969 int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
2972 * Project Quota ID state is only allowed to change from within the init
2973 * namespace. Enforce that restriction only if we are trying to change
2974 * the quota ID state. Everything else is allowed in user namespaces.
2976 if (current_user_ns() == &init_user_ns)
2979 if (ll_i2info(inode)->lli_projid != fa->fsx_projid)
2982 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) {
2983 if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
2986 if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
2993 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
2997 struct md_op_data *op_data;
2998 struct ptlrpc_request *req = NULL;
3000 struct fsxattr fsxattr;
3001 struct cl_object *obj;
3005 if (copy_from_user(&fsxattr,
3006 (const struct fsxattr __user *)arg,
3010 rc = ll_ioctl_check_project(inode, &fsxattr);
3014 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3015 LUSTRE_OPC_ANY, NULL);
3016 if (IS_ERR(op_data))
3017 RETURN(PTR_ERR(op_data));
3019 flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3020 op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3021 if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3022 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3023 op_data->op_projid = fsxattr.fsx_projid;
3024 op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS;
3025 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3027 ptlrpc_req_finished(req);
3029 GOTO(out_fsxattr, rc);
3030 ll_update_inode_flags(inode, op_data->op_attr_flags);
3031 obj = ll_i2info(inode)->lli_clob;
3033 GOTO(out_fsxattr, rc);
3035 OBD_ALLOC_PTR(attr);
3037 GOTO(out_fsxattr, rc = -ENOMEM);
3039 rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS,
3040 fsxattr.fsx_xflags);
3043 ll_finish_md_op_data(op_data);
3047 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3050 struct inode *inode = file_inode(file);
3051 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3052 struct ll_inode_info *lli = ll_i2info(inode);
3053 struct obd_client_handle *och = NULL;
3054 struct split_param sp;
3057 enum mds_op_bias bias = 0;
3058 struct file *layout_file = NULL;
3060 size_t data_size = 0;
3064 mutex_lock(&lli->lli_och_mutex);
3065 if (fd->fd_lease_och != NULL) {
3066 och = fd->fd_lease_och;
3067 fd->fd_lease_och = NULL;
3069 mutex_unlock(&lli->lli_och_mutex);
3072 GOTO(out, rc = -ENOLCK);
3074 fmode = och->och_flags;
3076 switch (ioc->lil_flags) {
3077 case LL_LEASE_RESYNC_DONE:
3078 if (ioc->lil_count > IOC_IDS_MAX)
3079 GOTO(out, rc = -EINVAL);
3081 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3082 OBD_ALLOC(data, data_size);
3084 GOTO(out, rc = -ENOMEM);
3086 if (copy_from_user(data, (void __user *)arg, data_size))
3087 GOTO(out, rc = -EFAULT);
3089 bias = MDS_CLOSE_RESYNC_DONE;
3091 case LL_LEASE_LAYOUT_MERGE: {
3094 if (ioc->lil_count != 1)
3095 GOTO(out, rc = -EINVAL);
3097 arg += sizeof(*ioc);
3098 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3099 GOTO(out, rc = -EFAULT);
3101 layout_file = fget(fd);
3103 GOTO(out, rc = -EBADF);
3105 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3106 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3107 GOTO(out, rc = -EPERM);
3109 data = file_inode(layout_file);
3110 bias = MDS_CLOSE_LAYOUT_MERGE;
3113 case LL_LEASE_LAYOUT_SPLIT: {
3117 if (ioc->lil_count != 2)
3118 GOTO(out, rc = -EINVAL);
3120 arg += sizeof(*ioc);
3121 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3122 GOTO(out, rc = -EFAULT);
3124 arg += sizeof(__u32);
3125 if (copy_from_user(&mirror_id, (void __user *)arg,
3127 GOTO(out, rc = -EFAULT);
3129 layout_file = fget(fdv);
3131 GOTO(out, rc = -EBADF);
3133 sp.sp_inode = file_inode(layout_file);
3134 sp.sp_mirror_id = (__u16)mirror_id;
3136 bias = MDS_CLOSE_LAYOUT_SPLIT;
3140 /* without close intent */
3144 rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3148 rc = ll_lease_och_release(inode, file);
3157 switch (ioc->lil_flags) {
3158 case LL_LEASE_RESYNC_DONE:
3160 OBD_FREE(data, data_size);
3162 case LL_LEASE_LAYOUT_MERGE:
3163 case LL_LEASE_LAYOUT_SPLIT:
3170 rc = ll_lease_type_from_fmode(fmode);
3174 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3177 struct inode *inode = file_inode(file);
3178 struct ll_inode_info *lli = ll_i2info(inode);
3179 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3180 struct obd_client_handle *och = NULL;
3181 __u64 open_flags = 0;
3187 switch (ioc->lil_mode) {
3188 case LL_LEASE_WRLCK:
3189 if (!(file->f_mode & FMODE_WRITE))
3191 fmode = FMODE_WRITE;
3193 case LL_LEASE_RDLCK:
3194 if (!(file->f_mode & FMODE_READ))
3198 case LL_LEASE_UNLCK:
3199 RETURN(ll_file_unlock_lease(file, ioc, arg));
3204 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3206 /* apply for lease */
3207 if (ioc->lil_flags & LL_LEASE_RESYNC)
3208 open_flags = MDS_OPEN_RESYNC;
3209 och = ll_lease_open(inode, file, fmode, open_flags);
3211 RETURN(PTR_ERR(och));
3213 if (ioc->lil_flags & LL_LEASE_RESYNC) {
3214 rc = ll_lease_file_resync(och, inode, arg);
3216 ll_lease_close(och, inode, NULL);
3219 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3221 ll_lease_close(och, inode, NULL);
3227 mutex_lock(&lli->lli_och_mutex);
3228 if (fd->fd_lease_och == NULL) {
3229 fd->fd_lease_och = och;
3232 mutex_unlock(&lli->lli_och_mutex);
3234 /* impossible now that only excl is supported for now */
3235 ll_lease_close(och, inode, &lease_broken);
3242 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3244 struct inode *inode = file_inode(file);
3245 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3249 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3250 PFID(ll_inode2fid(inode)), inode, cmd);
3251 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3253 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
3254 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3258 case LL_IOC_GETFLAGS:
3259 /* Get the current value of the file flags */
3260 return put_user(fd->fd_flags, (int __user *)arg);
3261 case LL_IOC_SETFLAGS:
3262 case LL_IOC_CLRFLAGS:
3263 /* Set or clear specific file flags */
3264 /* XXX This probably needs checks to ensure the flags are
3265 * not abused, and to handle any flag side effects.
3267 if (get_user(flags, (int __user *) arg))
3270 if (cmd == LL_IOC_SETFLAGS) {
3271 if ((flags & LL_FILE_IGNORE_LOCK) &&
3272 !(file->f_flags & O_DIRECT)) {
3273 CERROR("%s: unable to disable locking on "
3274 "non-O_DIRECT file\n", current->comm);
3278 fd->fd_flags |= flags;
3280 fd->fd_flags &= ~flags;
3283 case LL_IOC_LOV_SETSTRIPE:
3284 case LL_IOC_LOV_SETSTRIPE_NEW:
3285 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3286 case LL_IOC_LOV_SETEA:
3287 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3288 case LL_IOC_LOV_SWAP_LAYOUTS: {
3290 struct lustre_swap_layouts lsl;
3292 if (copy_from_user(&lsl, (char __user *)arg,
3293 sizeof(struct lustre_swap_layouts)))
3296 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3299 file2 = fget(lsl.sl_fd);
3303 /* O_WRONLY or O_RDWR */
3304 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3305 GOTO(out, rc = -EPERM);
3307 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3308 struct inode *inode2;
3309 struct ll_inode_info *lli;
3310 struct obd_client_handle *och = NULL;
3312 lli = ll_i2info(inode);
3313 mutex_lock(&lli->lli_och_mutex);
3314 if (fd->fd_lease_och != NULL) {
3315 och = fd->fd_lease_och;
3316 fd->fd_lease_och = NULL;
3318 mutex_unlock(&lli->lli_och_mutex);
3320 GOTO(out, rc = -ENOLCK);
3321 inode2 = file_inode(file2);
3322 rc = ll_swap_layouts_close(och, inode, inode2);
3324 rc = ll_swap_layouts(file, file2, &lsl);
3330 case LL_IOC_LOV_GETSTRIPE:
3331 case LL_IOC_LOV_GETSTRIPE_NEW:
3332 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3333 case FS_IOC_GETFLAGS:
3334 case FS_IOC_SETFLAGS:
3335 RETURN(ll_iocontrol(inode, file, cmd, arg));
3336 case FSFILT_IOC_GETVERSION:
3337 case FS_IOC_GETVERSION:
3338 RETURN(put_user(inode->i_generation, (int __user *)arg));
3339 /* We need to special case any other ioctls we want to handle,
3340 * to send them to the MDS/OST as appropriate and to properly
3341 * network encode the arg field. */
3342 case FS_IOC_SETVERSION:
3345 case LL_IOC_GROUP_LOCK:
3346 RETURN(ll_get_grouplock(inode, file, arg));
3347 case LL_IOC_GROUP_UNLOCK:
3348 RETURN(ll_put_grouplock(inode, file, arg));
3349 case IOC_OBD_STATFS:
3350 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3352 case LL_IOC_FLUSHCTX:
3353 RETURN(ll_flush_ctx(inode));
3354 case LL_IOC_PATH2FID: {
3355 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3356 sizeof(struct lu_fid)))
3361 case LL_IOC_GETPARENT:
3362 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3364 case OBD_IOC_FID2PATH:
3365 RETURN(ll_fid2path(inode, (void __user *)arg));
3366 case LL_IOC_DATA_VERSION: {
3367 struct ioc_data_version idv;
3370 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
3373 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3374 rc = ll_ioc_data_version(inode, &idv);
3377 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3383 case LL_IOC_GET_MDTIDX: {
3386 mdtidx = ll_get_mdt_idx(inode);
3390 if (put_user((int)mdtidx, (int __user *)arg))
3395 case OBD_IOC_GETDTNAME:
3396 case OBD_IOC_GETMDNAME:
3397 RETURN(ll_get_obd_name(inode, cmd, arg));
3398 case LL_IOC_HSM_STATE_GET: {
3399 struct md_op_data *op_data;
3400 struct hsm_user_state *hus;
3407 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3408 LUSTRE_OPC_ANY, hus);
3409 if (IS_ERR(op_data)) {
3411 RETURN(PTR_ERR(op_data));
3414 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3417 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3420 ll_finish_md_op_data(op_data);
3424 case LL_IOC_HSM_STATE_SET: {
3425 struct hsm_state_set *hss;
3432 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3437 rc = ll_hsm_state_set(inode, hss);
3442 case LL_IOC_HSM_ACTION: {
3443 struct md_op_data *op_data;
3444 struct hsm_current_action *hca;
3451 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3452 LUSTRE_OPC_ANY, hca);
3453 if (IS_ERR(op_data)) {
3455 RETURN(PTR_ERR(op_data));
3458 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3461 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3464 ll_finish_md_op_data(op_data);
3468 case LL_IOC_SET_LEASE_OLD: {
3469 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3471 RETURN(ll_file_set_lease(file, &ioc, 0));
3473 case LL_IOC_SET_LEASE: {
3474 struct ll_ioc_lease ioc;
3476 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3479 RETURN(ll_file_set_lease(file, &ioc, arg));
3481 case LL_IOC_GET_LEASE: {
3482 struct ll_inode_info *lli = ll_i2info(inode);
3483 struct ldlm_lock *lock = NULL;
3486 mutex_lock(&lli->lli_och_mutex);
3487 if (fd->fd_lease_och != NULL) {
3488 struct obd_client_handle *och = fd->fd_lease_och;
3490 lock = ldlm_handle2lock(&och->och_lease_handle);
3492 lock_res_and_lock(lock);
3493 if (!ldlm_is_cancel(lock))
3494 fmode = och->och_flags;
3496 unlock_res_and_lock(lock);
3497 LDLM_LOCK_PUT(lock);
3500 mutex_unlock(&lli->lli_och_mutex);
3502 RETURN(ll_lease_type_from_fmode(fmode));
3504 case LL_IOC_HSM_IMPORT: {
3505 struct hsm_user_import *hui;
3511 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3516 rc = ll_hsm_import(inode, file, hui);
3521 case LL_IOC_FUTIMES_3: {
3522 struct ll_futimes_3 lfu;
3524 if (copy_from_user(&lfu,
3525 (const struct ll_futimes_3 __user *)arg,
3529 RETURN(ll_file_futimes_3(file, &lfu));
3531 case LL_IOC_LADVISE: {
3532 struct llapi_ladvise_hdr *k_ladvise_hdr;
3533 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3536 int alloc_size = sizeof(*k_ladvise_hdr);
3539 u_ladvise_hdr = (void __user *)arg;
3540 OBD_ALLOC_PTR(k_ladvise_hdr);
3541 if (k_ladvise_hdr == NULL)
3544 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3545 GOTO(out_ladvise, rc = -EFAULT);
3547 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3548 k_ladvise_hdr->lah_count < 1)
3549 GOTO(out_ladvise, rc = -EINVAL);
3551 num_advise = k_ladvise_hdr->lah_count;
3552 if (num_advise >= LAH_COUNT_MAX)
3553 GOTO(out_ladvise, rc = -EFBIG);
3555 OBD_FREE_PTR(k_ladvise_hdr);
3556 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3557 lah_advise[num_advise]);
3558 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3559 if (k_ladvise_hdr == NULL)
3563 * TODO: submit multiple advices to one server in a single RPC
3565 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3566 GOTO(out_ladvise, rc = -EFAULT);
3568 for (i = 0; i < num_advise; i++) {
3569 struct llapi_lu_ladvise *k_ladvise =
3570 &k_ladvise_hdr->lah_advise[i];
3571 struct llapi_lu_ladvise __user *u_ladvise =
3572 &u_ladvise_hdr->lah_advise[i];
3574 rc = ll_ladvise_sanity(inode, k_ladvise);
3576 GOTO(out_ladvise, rc);
3578 switch (k_ladvise->lla_advice) {
3579 case LU_LADVISE_LOCKNOEXPAND:
3580 rc = ll_lock_noexpand(file,
3581 k_ladvise->lla_peradvice_flags);
3582 GOTO(out_ladvise, rc);
3583 case LU_LADVISE_LOCKAHEAD:
3585 rc = ll_file_lock_ahead(file, k_ladvise);
3588 GOTO(out_ladvise, rc);
3591 &u_ladvise->lla_lockahead_result))
3592 GOTO(out_ladvise, rc = -EFAULT);
3595 rc = ll_ladvise(inode, file,
3596 k_ladvise_hdr->lah_flags,
3599 GOTO(out_ladvise, rc);
3606 OBD_FREE(k_ladvise_hdr, alloc_size);
3609 case LL_IOC_FLR_SET_MIRROR: {
3610 /* mirror I/O must be direct to avoid polluting page cache
3612 if (!(file->f_flags & O_DIRECT))
3615 fd->fd_designated_mirror = (__u32)arg;
3618 case LL_IOC_FSGETXATTR:
3619 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3620 case LL_IOC_FSSETXATTR:
3621 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3623 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3625 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3626 (void __user *)arg));
3630 #ifndef HAVE_FILE_LLSEEK_SIZE
3631 static inline loff_t
3632 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3634 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3636 if (offset > maxsize)
3639 if (offset != file->f_pos) {
3640 file->f_pos = offset;
3641 file->f_version = 0;
3647 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3648 loff_t maxsize, loff_t eof)
3650 struct inode *inode = file_inode(file);
3658 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3659 * position-querying operation. Avoid rewriting the "same"
3660 * f_pos value back to the file because a concurrent read(),
3661 * write() or lseek() might have altered it
3666 * f_lock protects against read/modify/write race with other
3667 * SEEK_CURs. Note that parallel writes and reads behave
3671 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3672 inode_unlock(inode);
3676 * In the generic case the entire file is data, so as long as
3677 * offset isn't at the end of the file then the offset is data.
3684 * There is a virtual hole at the end of the file, so as long as
3685 * offset isn't i_size or larger, return i_size.
3693 return llseek_execute(file, offset, maxsize);
3697 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3699 struct inode *inode = file_inode(file);
3700 loff_t retval, eof = 0;
3703 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3704 (origin == SEEK_CUR) ? file->f_pos : 0);
3705 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3706 PFID(ll_inode2fid(inode)), inode, retval, retval,
3708 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3710 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3711 retval = ll_glimpse_size(inode);
3714 eof = i_size_read(inode);
3717 retval = ll_generic_file_llseek_size(file, offset, origin,
3718 ll_file_maxbytes(inode), eof);
3722 static int ll_flush(struct file *file, fl_owner_t id)
3724 struct inode *inode = file_inode(file);
3725 struct ll_inode_info *lli = ll_i2info(inode);
3726 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3729 LASSERT(!S_ISDIR(inode->i_mode));
3731 /* catch async errors that were recorded back when async writeback
3732 * failed for pages in this mapping. */
3733 rc = lli->lli_async_rc;
3734 lli->lli_async_rc = 0;
3735 if (lli->lli_clob != NULL) {
3736 err = lov_read_and_clear_async_rc(lli->lli_clob);
3741 /* The application has been told write failure already.
3742 * Do not report failure again. */
3743 if (fd->fd_write_failed)
3745 return rc ? -EIO : 0;
3749 * Called to make sure a portion of file has been written out.
3750 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
3752 * Return how many pages have been written.
3754 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3755 enum cl_fsync_mode mode, int ignore_layout)
3759 struct cl_fsync_io *fio;
3764 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3765 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3768 env = cl_env_get(&refcheck);
3770 RETURN(PTR_ERR(env));
3772 io = vvp_env_thread_io(env);
3773 io->ci_obj = ll_i2info(inode)->lli_clob;
3774 io->ci_ignore_layout = ignore_layout;
3776 /* initialize parameters for sync */
3777 fio = &io->u.ci_fsync;
3778 fio->fi_start = start;
3780 fio->fi_fid = ll_inode2fid(inode);
3781 fio->fi_mode = mode;
3782 fio->fi_nr_written = 0;
3784 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3785 result = cl_io_loop(env, io);
3787 result = io->ci_result;
3789 result = fio->fi_nr_written;
3790 cl_io_fini(env, io);
3791 cl_env_put(env, &refcheck);
3797 * When dentry is provided (the 'else' case), file_dentry() may be
3798 * null and dentry must be used directly rather than pulled from
3799 * file_dentry() as is done otherwise.
3802 #ifdef HAVE_FILE_FSYNC_4ARGS
3803 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3805 struct dentry *dentry = file_dentry(file);
3806 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3807 int ll_fsync(struct file *file, int datasync)
3809 struct dentry *dentry = file_dentry(file);
3811 loff_t end = LLONG_MAX;
3813 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3816 loff_t end = LLONG_MAX;
3818 struct inode *inode = dentry->d_inode;
3819 struct ll_inode_info *lli = ll_i2info(inode);
3820 struct ptlrpc_request *req;
3824 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3825 PFID(ll_inode2fid(inode)), inode);
3826 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3828 #ifdef HAVE_FILE_FSYNC_4ARGS
3829 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3832 /* fsync's caller has already called _fdata{sync,write}, we want
3833 * that IO to finish before calling the osc and mdc sync methods */
3834 rc = filemap_fdatawait(inode->i_mapping);
3837 /* catch async errors that were recorded back when async writeback
3838 * failed for pages in this mapping. */
3839 if (!S_ISDIR(inode->i_mode)) {
3840 err = lli->lli_async_rc;
3841 lli->lli_async_rc = 0;
3844 if (lli->lli_clob != NULL) {
3845 err = lov_read_and_clear_async_rc(lli->lli_clob);
3851 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3855 ptlrpc_req_finished(req);
3857 if (S_ISREG(inode->i_mode)) {
3858 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3860 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3861 if (rc == 0 && err < 0)
3864 fd->fd_write_failed = true;
3866 fd->fd_write_failed = false;
3869 #ifdef HAVE_FILE_FSYNC_4ARGS
3870 inode_unlock(inode);
3876 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3878 struct inode *inode = file_inode(file);
3879 struct ll_sb_info *sbi = ll_i2sbi(inode);
3880 struct ldlm_enqueue_info einfo = {
3881 .ei_type = LDLM_FLOCK,
3882 .ei_cb_cp = ldlm_flock_completion_ast,
3883 .ei_cbdata = file_lock,
3885 struct md_op_data *op_data;
3886 struct lustre_handle lockh = { 0 };
3887 union ldlm_policy_data flock = { { 0 } };
3888 int fl_type = file_lock->fl_type;
3894 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3895 PFID(ll_inode2fid(inode)), file_lock);
3897 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3899 if (file_lock->fl_flags & FL_FLOCK) {
3900 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3901 /* flocks are whole-file locks */
3902 flock.l_flock.end = OFFSET_MAX;
3903 /* For flocks owner is determined by the local file desctiptor*/
3904 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3905 } else if (file_lock->fl_flags & FL_POSIX) {
3906 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3907 flock.l_flock.start = file_lock->fl_start;
3908 flock.l_flock.end = file_lock->fl_end;
3912 flock.l_flock.pid = file_lock->fl_pid;
3914 /* Somewhat ugly workaround for svc lockd.
3915 * lockd installs custom fl_lmops->lm_compare_owner that checks
3916 * for the fl_owner to be the same (which it always is on local node
3917 * I guess between lockd processes) and then compares pid.
3918 * As such we assign pid to the owner field to make it all work,
3919 * conflict with normal locks is unlikely since pid space and
3920 * pointer space for current->files are not intersecting */
3921 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3922 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
3926 einfo.ei_mode = LCK_PR;
3929 /* An unlock request may or may not have any relation to
3930 * existing locks so we may not be able to pass a lock handle
3931 * via a normal ldlm_lock_cancel() request. The request may even
3932 * unlock a byte range in the middle of an existing lock. In
3933 * order to process an unlock request we need all of the same
3934 * information that is given with a normal read or write record
3935 * lock request. To avoid creating another ldlm unlock (cancel)
3936 * message we'll treat a LCK_NL flock request as an unlock. */
3937 einfo.ei_mode = LCK_NL;
3940 einfo.ei_mode = LCK_PW;
3943 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
3958 flags = LDLM_FL_BLOCK_NOWAIT;
3964 flags = LDLM_FL_TEST_LOCK;
3967 CERROR("unknown fcntl lock command: %d\n", cmd);
3971 /* Save the old mode so that if the mode in the lock changes we
3972 * can decrement the appropriate reader or writer refcount. */
3973 file_lock->fl_type = einfo.ei_mode;
3975 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3976 LUSTRE_OPC_ANY, NULL);
3977 if (IS_ERR(op_data))
3978 RETURN(PTR_ERR(op_data));
3980 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
3981 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
3982 flock.l_flock.pid, flags, einfo.ei_mode,
3983 flock.l_flock.start, flock.l_flock.end);
3985 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
3988 /* Restore the file lock type if not TEST lock. */
3989 if (!(flags & LDLM_FL_TEST_LOCK))
3990 file_lock->fl_type = fl_type;
3992 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
3993 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
3994 !(flags & LDLM_FL_TEST_LOCK))
3995 rc2 = locks_lock_file_wait(file, file_lock);
3997 if ((file_lock->fl_flags & FL_FLOCK) &&
3998 (rc == 0 || file_lock->fl_type == F_UNLCK))
3999 rc2 = flock_lock_file_wait(file, file_lock);
4000 if ((file_lock->fl_flags & FL_POSIX) &&
4001 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4002 !(flags & LDLM_FL_TEST_LOCK))
4003 rc2 = posix_lock_file_wait(file, file_lock);
4004 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
4006 if (rc2 && file_lock->fl_type != F_UNLCK) {
4007 einfo.ei_mode = LCK_NL;
4008 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4013 ll_finish_md_op_data(op_data);
/*
 * Resolve the FID of entry @name under directory @parent with a
 * getattr-by-name RPC to the MDS.  On success *fid is copied from the
 * reply body; when @inode is non-NULL the inode is instantiated from
 * the same reply via ll_prep_inode().
 *
 * NOTE(review): this listing is elided — the rc declaration, the error
 * check after md_getattr_name(), the NULL-body check before the -EFAULT
 * GOTO and the out_req label are not visible in this excerpt.
 */
4018 int ll_get_fid_by_name(struct inode *parent, const char *name,
4019 int namelen, struct lu_fid *fid,
4020 struct inode **inode)
4022 struct md_op_data *op_data = NULL;
4023 struct mdt_body *body;
4024 struct ptlrpc_request *req;
4028 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4029 LUSTRE_OPC_ANY, NULL);
4030 if (IS_ERR(op_data))
4031 RETURN(PTR_ERR(op_data));
/* Only the FID and type are needed from the server for the lookup. */
4033 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4034 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4035 ll_finish_md_op_data(op_data);
4039 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4041 GOTO(out_req, rc = -EFAULT);
4043 *fid = body->mbo_fid1;
4046 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4048 ptlrpc_req_finished(req);
/*
 * Migrate the child entry @name of directory @parent to the MDT(s)
 * described by @lum.  Implemented as a same-name md_rename() carrying
 * CLI_MIGRATE; for regular files a write lease is taken first and its
 * data version is sent so the server can detect concurrent writes.
 *
 * NOTE(review): the listing is elided — declarations (rc, qstr, the
 * retry loop label), several closing braces and the out_iput/out_unlock/
 * out_close label lines are not visible in this excerpt.
 */
4052 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4055 struct dentry *dchild = NULL;
4056 struct inode *child_inode = NULL;
4057 struct md_op_data *op_data;
4058 struct ptlrpc_request *request = NULL;
4059 struct obd_client_handle *och = NULL;
4061 struct mdt_body *body;
4062 __u64 data_version = 0;
4063 size_t namelen = strlen(name);
4064 int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4068 CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4069 PFID(ll_inode2fid(parent)), name,
4070 lum->lum_stripe_offset, lum->lum_stripe_count);
/* Normalize the user-supplied LMV to little-endian wire order. */
4072 if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4073 lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4074 lustre_swab_lmv_user_md(lum);
4076 /* Get child FID first */
4077 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4080 dchild = d_lookup(file_dentry(file), &qstr)
4082 if (dchild->d_inode)
4083 child_inode = igrab(dchild->d_inode);
/* dcache miss: fall back to asking the MDS for the child's FID. */
4088 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
4097 if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
4098 OBD_CONNECT2_DIR_MIGRATE)) {
4099 if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
4100 ll_i2info(child_inode)->lli_lsm_md) {
4101 CERROR("%s: MDT doesn't support stripe directory "
4103 ll_get_fsname(parent->i_sb, NULL, 0));
4104 GOTO(out_iput, rc = -EOPNOTSUPP);
4109 * lfs migrate command needs to be blocked on the client
4110 * by checking the migrate FID against the FID of the
4113 if (child_inode == parent->i_sb->s_root->d_inode)
4114 GOTO(out_iput, rc = -EINVAL);
4116 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4117 child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4118 if (IS_ERR(op_data))
4119 GOTO(out_iput, rc = PTR_ERR(op_data));
4121 inode_lock(child_inode);
4122 op_data->op_fid3 = *ll_inode2fid(child_inode);
4123 if (!fid_is_sane(&op_data->op_fid3)) {
4124 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4125 ll_get_fsname(parent->i_sb, NULL, 0), name,
4126 PFID(&op_data->op_fid3));
4127 GOTO(out_unlock, rc = -EINVAL);
4130 op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4131 op_data->op_data = lum;
4132 op_data->op_data_size = lumlen;
/* Regular file: take a write lease and record the data version so the
 * server can reject the migration if the file changes concurrently. */
4135 if (S_ISREG(child_inode->i_mode)) {
4136 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4140 GOTO(out_unlock, rc);
4143 rc = ll_data_version(child_inode, &data_version,
4146 GOTO(out_close, rc);
4148 op_data->op_open_handle = och->och_open_handle;
4149 op_data->op_data_version = data_version;
4150 op_data->op_lease_handle = och->och_lease_handle;
4151 op_data->op_bias |= MDS_CLOSE_MIGRATE;
/* The open request must not be replayed once migration is in flight. */
4153 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4154 och->och_mod->mod_open_req->rq_replay = 0;
4155 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
/* Migration is carried as a rename to the same name. */
4158 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4159 name, namelen, &request);
4161 LASSERT(request != NULL);
4162 ll_update_times(request, parent);
4165 if (rc == 0 || rc == -EAGAIN) {
4166 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4167 LASSERT(body != NULL);
4169 /* If the server does release layout lock, then we cleanup
4170 * the client och here, otherwise release it in out_close: */
4171 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4172 obd_mod_put(och->och_mod);
4173 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4175 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
4181 if (request != NULL) {
4182 ptlrpc_req_finished(request);
4186 /* Try again if the lease has cancelled. */
4187 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4192 ll_lease_close(och, child_inode, NULL);
4194 clear_nlink(child_inode);
4196 inode_unlock(child_inode);
4197 ll_finish_md_op_data(op_data);
4204 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4212 * test if some locks matching bits and l_req_mode are acquired
4213 * - bits can be in different locks
4214 * - if found clear the common lock bits in *bits
4215 * - the bits not found, are kept in *bits
4217 * \param bits [IN] searched lock bits
4218 * \param l_req_mode [IN] searched lock mode
4219 * \retval boolean, true iff all bits are found
4221 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4223 struct lustre_handle lockh;
4224 union ldlm_policy_data policy;
/* LCK_MINMODE means "any mode": match against all four normal modes. */
4225 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4226 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4235 fid = &ll_i2info(inode)->lli_fid;
4236 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4237 ldlm_lockname[mode]);
/* TEST_LOCK: only probe for a match, never take a reference. */
4239 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4240 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
/* NOTE(review): `1 << i` is int-width; safe only while
 * MDS_INODELOCK_MAXSHIFT < 31 — confirm if the shift range grows. */
4241 policy.l_inodebits.bits = *bits & (1 << i);
4242 if (policy.l_inodebits.bits == 0)
4245 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4246 &policy, mode, &lockh)) {
4247 struct ldlm_lock *lock;
4249 lock = ldlm_handle2lock(&lockh);
4252 ~(lock->l_policy_data.l_inodebits.bits);
4253 LDLM_LOCK_PUT(lock);
4255 *bits &= ~policy.l_inodebits.bits;
/*
 * Try to match (and take a reference on) a cached MD lock on @inode
 * covering inodebits @bits in one of @mode's modes.  On a hit the lock
 * handle is returned through @lockh; the returned mode is 0 on miss.
 */
4262 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4263 struct lustre_handle *lockh, __u64 flags,
4264 enum ldlm_mode mode)
4266 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4271 fid = &ll_i2info(inode)->lli_fid;
4272 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4274 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4275 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the result of an inode revalidation RPC: translate
 * -ENOENT into a usable state for already-unlinked inodes and log all
 * other failures.
 */
4280 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4282 /* Already unlinked. Just update nlink and return success */
4283 if (rc == -ENOENT) {
4285 /* If it is striped directory, and there is bad stripe
4286 * Let's revalidate the dentry again, instead of returning
4288 if (S_ISDIR(inode->i_mode) &&
4289 ll_i2info(inode)->lli_lsm_md != NULL)
4292 /* This path cannot be hit for regular files unless in
4293 * case of obscure races, so no need to validate
4295 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4297 } else if (rc != 0) {
/* EACCES/EIDRM are expected (permission/identity) — log quietly. */
4298 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4299 "%s: revalidate FID "DFID" error: rc = %d\n",
4300 ll_get_fsname(inode->i_sb, NULL, 0),
4301 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate @dentry's inode against the MDS with an intent lock RPC
 * (@op is IT_GETATTR or IT_LOOKUP).  Refreshes cached attributes and
 * invalidates the dentry if the file was unlinked server-side.
 */
4307 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4309 struct inode *inode = dentry->d_inode;
4310 struct obd_export *exp = ll_i2mdexp(inode);
4311 struct lookup_intent oit = {
4314 struct ptlrpc_request *req = NULL;
4315 struct md_op_data *op_data;
4319 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4320 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4322 /* Call getattr by fid, so do not provide name at all. */
4323 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4324 LUSTRE_OPC_ANY, NULL);
4325 if (IS_ERR(op_data))
4326 RETURN(PTR_ERR(op_data));
4328 rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4329 ll_finish_md_op_data(op_data);
4331 rc = ll_inode_revalidate_fini(inode, rc);
4335 rc = ll_revalidate_it_finish(req, &oit, dentry);
4337 ll_intent_release(&oit);
4341 /* Unlinked? Unhash dentry, so it is not picked up later by
4342 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4343 * here to preserve get_cwd functionality on 2.6.
4345 if (!dentry->d_inode->i_nlink) {
4346 ll_lock_dcache(inode);
4347 d_lustre_invalidate(dentry, 0);
4348 ll_unlock_dcache(inode);
4351 ll_lookup_finish_locks(&oit, dentry);
4353 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge the per-stripe attributes (nlink,
 * blocks, size, times) from all MDT objects into the master inode.
 * lli_lsm_sem is held for read around md_merge_attr() to keep the
 * striping description stable.
 */
4358 static int ll_merge_md_attr(struct inode *inode)
4360 struct ll_inode_info *lli = ll_i2info(inode);
4361 struct cl_attr attr = { 0 };
4364 LASSERT(lli->lli_lsm_md != NULL);
4365 down_read(&lli->lli_lsm_sem);
4366 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4367 &attr, ll_md_blocking_ast);
4368 up_read(&lli->lli_lsm_sem);
4372 set_nlink(inode, attr.cat_nlink);
4373 inode->i_blocks = attr.cat_blocks;
4374 i_size_write(inode, attr.cat_size);
4376 ll_i2info(inode)->lli_atime = attr.cat_atime;
4377 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4378 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/* Squash a device number so it survives the 32-bit compat stat path. */
4383 static inline dev_t ll_compat_encode_dev(dev_t dev)
4385 /* The compat_sys_*stat*() syscalls will fail unless the
4386 * device majors and minors are both less than 256. Note that
4387 * the value returned here will be passed through
4388 * old_encode_dev() in cp_compat_stat(). And so we are not
4389 * trying to return a valid compat (u16) device number, just
4390 * one that will pass the old_valid_dev() check. */
4392 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
/*
 * ->getattr() for Lustre files.  Two signatures are compiled depending
 * on the kernel: the modern path-based one and the legacy
 * (vfsmount, dentry) one.  Revalidates the inode with the MDS, glimpses
 * the size from the OSTs for regular files (unless an HSM restore is in
 * progress, when the MDT size is authoritative), merges striped-dir
 * attributes, then fills *stat from the inode.
 */
4395 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4396 int ll_getattr(const struct path *path, struct kstat *stat,
4397 u32 request_mask, unsigned int flags)
4399 struct dentry *de = path->dentry;
4401 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4404 struct inode *inode = de->d_inode;
4405 struct ll_sb_info *sbi = ll_i2sbi(inode);
4406 struct ll_inode_info *lli = ll_i2info(inode);
4409 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4411 rc = ll_inode_revalidate(de, IT_GETATTR);
4415 if (S_ISREG(inode->i_mode)) {
4416 /* In case of restore, the MDT has the right size and has
4417 * already send it back without granting the layout lock,
4418 * inode is up-to-date so glimpse is useless.
4419 * Also to glimpse we need the layout, in case of a running
4420 * restore the MDT holds the layout lock so the glimpse will
4421 * block up to the end of restore (getattr will block)
4423 if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4424 rc = ll_glimpse_size(inode);
4429 /* If object isn't regular a file then don't validate size. */
4430 if (S_ISDIR(inode->i_mode) &&
4431 lli->lli_lsm_md != NULL) {
4432 rc = ll_merge_md_attr(inode);
4437 inode->i_atime.tv_sec = lli->lli_atime;
4438 inode->i_mtime.tv_sec = lli->lli_mtime;
4439 inode->i_ctime.tv_sec = lli->lli_ctime;
4442 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
/* 32-bit userspace on a 64-bit kernel needs squashed ino/dev values. */
4444 if (ll_need_32bit_api(sbi)) {
4445 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4446 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4447 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4449 stat->ino = inode->i_ino;
4450 stat->dev = inode->i_sb->s_dev;
4451 stat->rdev = inode->i_rdev;
4454 stat->mode = inode->i_mode;
4455 stat->uid = inode->i_uid;
4456 stat->gid = inode->i_gid;
4457 stat->atime = inode->i_atime;
4458 stat->mtime = inode->i_mtime;
4459 stat->ctime = inode->i_ctime;
4460 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4462 stat->nlink = inode->i_nlink;
4463 stat->size = i_size_read(inode);
4464 stat->blocks = inode->i_blocks;
/*
 * ->fiemap() handler: marshal the kernel's fiemap_extent_info into a
 * single struct fiemap buffer, run the Lustre fiemap (ll_do_fiemap),
 * and copy the mapped extents back to userspace.
 */
4469 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4470 __u64 start, __u64 len)
4474 struct fiemap *fiemap;
4475 unsigned int extent_count = fieinfo->fi_extents_max;
4477 num_bytes = sizeof(*fiemap) + (extent_count *
4478 sizeof(struct fiemap_extent));
4479 OBD_ALLOC_LARGE(fiemap, num_bytes);
4484 fiemap->fm_flags = fieinfo->fi_flags;
4485 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4486 fiemap->fm_start = start;
4487 fiemap->fm_length = len;
/* Only the first user extent is copied in: presumably it carries
 * continuation state (e.g. fe_logical of a resumed mapping) — confirm
 * against ll_do_fiemap(). */
4488 if (extent_count > 0 &&
4489 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4490 sizeof(struct fiemap_extent)) != 0)
4491 GOTO(out, rc = -EFAULT);
4493 rc = ll_do_fiemap(inode, fiemap, num_bytes);
4495 fieinfo->fi_flags = fiemap->fm_flags;
4496 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4497 if (extent_count > 0 &&
4498 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4499 fiemap->fm_mapped_extents *
4500 sizeof(struct fiemap_extent)) != 0)
4501 GOTO(out, rc = -EFAULT);
4503 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * ->get_acl(): return a referenced copy of the cached POSIX ACL.
 * lli_lock protects lli_posix_acl; the caller (VFS permission check)
 * drops the reference taken by posix_acl_dup().
 */
4507 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4509 struct ll_inode_info *lli = ll_i2info(inode);
4510 struct posix_acl *acl = NULL;
4513 spin_lock(&lli->lli_lock);
4514 /* VFS' acl_permission_check->check_acl will release the refcount */
4515 acl = posix_acl_dup(lli->lli_posix_acl);
4516 spin_unlock(&lli->lli_lock);
/*
 * ->set_acl(): serialize @acl to its xattr representation and push it
 * to the MDS with md_setxattr(); a NULL @acl removes the xattr.  The
 * local ACL cache is updated (or dropped) to match.
 */
4521 #ifdef HAVE_IOP_SET_ACL
4522 #ifdef CONFIG_FS_POSIX_ACL
4523 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4525 struct ll_sb_info *sbi = ll_i2sbi(inode);
4526 struct ptlrpc_request *req = NULL;
4527 const char *name = NULL;
4529 size_t value_size = 0;
4534 case ACL_TYPE_ACCESS:
4535 name = XATTR_NAME_POSIX_ACL_ACCESS;
/* Setting an access ACL may also rewrite the file mode bits. */
4537 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4540 case ACL_TYPE_DEFAULT:
4541 name = XATTR_NAME_POSIX_ACL_DEFAULT;
/* Default ACLs are only meaningful on directories. */
4542 if (!S_ISDIR(inode->i_mode))
4543 rc = acl ? -EACCES : 0;
4554 value_size = posix_acl_xattr_size(acl->a_count);
4555 value = kmalloc(value_size, GFP_NOFS);
4557 GOTO(out, rc = -ENOMEM);
4559 rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
4561 GOTO(out_value, rc);
/* NULL value means remove the ACL xattr on the server. */
4564 rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4565 value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
4566 name, value, value_size, 0, 0, &req);
4568 ptlrpc_req_finished(req);
4573 forget_cached_acl(inode, type);
4575 set_cached_acl(inode, type, acl);
4578 #endif /* CONFIG_FS_POSIX_ACL */
4579 #endif /* HAVE_IOP_SET_ACL */
/*
 * Compat check_acl callback for kernels whose generic_permission()
 * takes an ACL checker: fetch the cached access ACL and test @mask
 * against it.  Under RCU walk (IPERM_FLAG_RCU) we cannot block, so the
 * visible early-exit path bails out (elided line).
 */
4581 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4583 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4584 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4586 ll_check_acl(struct inode *inode, int mask)
4589 # ifdef CONFIG_FS_POSIX_ACL
4590 struct posix_acl *acl;
4594 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4595 if (flags & IPERM_FLAG_RCU)
4598 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4603 rc = posix_acl_permission(inode, acl, mask);
4604 posix_acl_release(acl);
4607 # else /* !CONFIG_FS_POSIX_ACL */
4609 # endif /* CONFIG_FS_POSIX_ACL */
4611 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * ->permission() for Lustre inodes (three kernel-dependent signatures).
 * Revalidates the root inode on first access, applies root-squash by
 * temporarily overriding the task credentials (fsuid/fsgid and FS
 * capabilities) before delegating to generic_permission().
 */
4613 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4614 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4616 # ifdef HAVE_INODE_PERMISION_2ARGS
4617 int ll_inode_permission(struct inode *inode, int mask)
4619 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4624 struct ll_sb_info *sbi;
4625 struct root_squash_info *squash;
4626 struct cred *cred = NULL;
4627 const struct cred *old_cred = NULL;
4629 bool squash_id = false;
/* Cannot block in RCU-walk mode; bail out (elided return path). */
4632 #ifdef MAY_NOT_BLOCK
4633 if (mask & MAY_NOT_BLOCK)
4635 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4636 if (flags & IPERM_FLAG_RCU)
4640 /* as root inode are NOT getting validated in lookup operation,
4641 * need to do it before permission check. */
4643 if (inode == inode->i_sb->s_root->d_inode) {
4644 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4649 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4650 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4652 /* squash fsuid/fsgid if needed */
4653 sbi = ll_i2sbi(inode);
4654 squash = &sbi->ll_squash;
4655 if (unlikely(squash->rsi_uid != 0 &&
4656 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4657 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4661 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4662 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4663 squash->rsi_uid, squash->rsi_gid);
4665 /* update current process's credentials
4666 * and FS capability */
4667 cred = prepare_creds();
4671 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4672 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* Drop every filesystem-related capability from the squashed creds. */
4673 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4674 if ((1 << cap) & CFS_CAP_FS_MASK)
4675 cap_lower(cred->cap_effective, cap);
4677 old_cred = override_creds(cred);
4680 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4681 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4682 /* restore current process's credentials and FS capability */
4684 revert_creds(old_cred);
4691 /* -o localflock - only provides locally consistent flock locks:
 * no .flock/.lock entries, so the VFS falls back to local-only locks. */
4692 struct file_operations ll_file_operations = {
4693 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4694 # ifdef HAVE_SYNC_READ_WRITE
4695 .read = new_sync_read,
4696 .write = new_sync_write,
4698 .read_iter = ll_file_read_iter,
4699 .write_iter = ll_file_write_iter,
4700 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4701 .read = ll_file_read,
4702 .aio_read = ll_file_aio_read,
4703 .write = ll_file_write,
4704 .aio_write = ll_file_aio_write,
4705 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4706 .unlocked_ioctl = ll_file_ioctl,
4707 .open = ll_file_open,
4708 .release = ll_file_release,
4709 .mmap = ll_file_mmap,
4710 .llseek = ll_file_seek,
4711 .splice_read = ll_file_splice_read,
/* Default (-o flock): cluster-coherent flock/posix locks handled by
 * ll_file_flock via the DLM. */
4716 struct file_operations ll_file_operations_flock = {
4717 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4718 # ifdef HAVE_SYNC_READ_WRITE
4719 .read = new_sync_read,
4720 .write = new_sync_write,
4721 # endif /* HAVE_SYNC_READ_WRITE */
4722 .read_iter = ll_file_read_iter,
4723 .write_iter = ll_file_write_iter,
4724 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4725 .read = ll_file_read,
4726 .aio_read = ll_file_aio_read,
4727 .write = ll_file_write,
4728 .aio_write = ll_file_aio_write,
4729 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4730 .unlocked_ioctl = ll_file_ioctl,
4731 .open = ll_file_open,
4732 .release = ll_file_release,
4733 .mmap = ll_file_mmap,
4734 .llseek = ll_file_seek,
4735 .splice_read = ll_file_splice_read,
4738 .flock = ll_file_flock,
4739 .lock = ll_file_flock
4742 /* These are for -o noflock - to return ENOSYS on flock calls */
4743 struct file_operations ll_file_operations_noflock = {
4744 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4745 # ifdef HAVE_SYNC_READ_WRITE
4746 .read = new_sync_read,
4747 .write = new_sync_write,
4748 # endif /* HAVE_SYNC_READ_WRITE */
4749 .read_iter = ll_file_read_iter,
4750 .write_iter = ll_file_write_iter,
4751 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4752 .read = ll_file_read,
4753 .aio_read = ll_file_aio_read,
4754 .write = ll_file_write,
4755 .aio_write = ll_file_aio_write,
4756 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4757 .unlocked_ioctl = ll_file_ioctl,
4758 .open = ll_file_open,
4759 .release = ll_file_release,
4760 .mmap = ll_file_mmap,
4761 .llseek = ll_file_seek,
4762 .splice_read = ll_file_splice_read,
/* Both entries route to ll_file_noflock, which rejects the request. */
4765 .flock = ll_file_noflock,
4766 .lock = ll_file_noflock
/* inode_operations for regular Lustre files; xattr and ACL entries are
 * compiled in only on kernels that still expose those hooks. */
4769 struct inode_operations ll_file_inode_operations = {
4770 .setattr = ll_setattr,
4771 .getattr = ll_getattr,
4772 .permission = ll_inode_permission,
4773 #ifdef HAVE_IOP_XATTR
4774 .setxattr = ll_setxattr,
4775 .getxattr = ll_getxattr,
4776 .removexattr = ll_removexattr,
4778 .listxattr = ll_listxattr,
4779 .fiemap = ll_fiemap,
4780 #ifdef HAVE_IOP_GET_ACL
4781 .get_acl = ll_get_acl,
4783 #ifdef HAVE_IOP_SET_ACL
4784 .set_acl = ll_set_acl,
/*
 * Apply a layout configuration to the inode's cl_object.  For
 * OBJECT_CONF_SET the DLM layout lock is made matchable only after the
 * layout is installed, and the cached layout generation is refreshed.
 */
4788 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4790 struct ll_inode_info *lli = ll_i2info(inode);
4791 struct cl_object *obj = lli->lli_clob;
4800 env = cl_env_get(&refcheck);
4802 RETURN(PTR_ERR(env));
4804 rc = cl_conf_set(env, lli->lli_clob, conf);
4808 if (conf->coc_opc == OBJECT_CONF_SET) {
4809 struct ldlm_lock *lock = conf->coc_lock;
4810 struct cl_layout cl = {
4814 LASSERT(lock != NULL);
4815 LASSERT(ldlm_has_layout(lock));
4817 /* it can only be allowed to match after layout is
4818 * applied to inode otherwise false layout would be
4819 * seen. Applying layout shoud happen before dropping
4820 * the intent lock. */
4821 ldlm_lock_allow_match(lock);
4823 rc = cl_object_layout_get(env, obj, &cl);
4828 DFID": layout version change: %u -> %u\n",
4829 PFID(&lli->lli_fid), ll_layout_version_get(lli),
4831 ll_layout_version_set(lli, cl.cl_layout_gen);
4835 cl_env_put(env, &refcheck);
4840 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
4841 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4844 struct ll_sb_info *sbi = ll_i2sbi(inode);
4845 struct ptlrpc_request *req;
4852 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4853 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4854 lock->l_lvb_data, lock->l_lvb_len);
/* Layout already attached to the lock: nothing to fetch. */
4856 if (lock->l_lvb_data != NULL)
4859 /* if layout lock was granted right away, the layout is returned
4860 * within DLM_LVB of dlm reply; otherwise if the lock was ever
4861 * blocked and then granted via completion ast, we have to fetch
4862 * layout here. Please note that we can't use the LVB buffer in
4863 * completion AST because it doesn't have a large enough buffer */
4864 rc = ll_get_default_mdsize(sbi, &lmmsize);
4868 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), OBD_MD_FLXATTR,
4869 XATTR_NAME_LOV, lmmsize, &req);
4872 GOTO(out, rc = 0); /* empty layout */
4879 if (lmmsize == 0) /* empty layout */
4882 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4884 GOTO(out, rc = -EFAULT);
/* Copy the LOV EA into a buffer we can hand over to the lock's LVB. */
4886 OBD_ALLOC_LARGE(lvbdata, lmmsize);
4887 if (lvbdata == NULL)
4888 GOTO(out, rc = -ENOMEM);
4890 memcpy(lvbdata, lmm, lmmsize);
4891 lock_res_and_lock(lock);
/* Only install our buffer if nobody raced us in; otherwise free it. */
4892 if (unlikely(lock->l_lvb_data == NULL)) {
4893 lock->l_lvb_type = LVB_T_LAYOUT;
4894 lock->l_lvb_data = lvbdata;
4895 lock->l_lvb_len = lmmsize;
4898 unlock_res_and_lock(lock);
4901 OBD_FREE_LARGE(lvbdata, lmmsize);
4906 ptlrpc_req_finished(req);
4911 * Apply the layout to the inode. Layout lock is held and will be released
4914 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4915 struct inode *inode)
4917 struct ll_inode_info *lli = ll_i2info(inode);
4918 struct ll_sb_info *sbi = ll_i2sbi(inode);
4919 struct ldlm_lock *lock;
4920 struct cl_object_conf conf;
4923 bool wait_layout = false;
4926 LASSERT(lustre_handle_is_used(lockh));
4928 lock = ldlm_handle2lock(lockh);
4929 LASSERT(lock != NULL);
4930 LASSERT(ldlm_has_layout(lock));
4932 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4933 PFID(&lli->lli_fid), inode);
4935 /* in case this is a caching lock and reinstate with new inode */
4936 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
4938 lock_res_and_lock(lock);
4939 lvb_ready = ldlm_is_lvb_ready(lock);
4940 unlock_res_and_lock(lock);
4942 /* checking lvb_ready is racy but this is okay. The worst case is
4943 * that multi processes may configure the file on the same time. */
4947 rc = ll_layout_fetch(inode, lock);
4951 /* for layout lock, lmm is stored in lock's lvb.
4952 * lvb_data is immutable if the lock is held so it's safe to access it
4955 * set layout to file. Unlikely this will fail as old layout was
4956 * surely eliminated */
4957 memset(&conf, 0, sizeof conf);
4958 conf.coc_opc = OBJECT_CONF_SET;
4959 conf.coc_inode = inode;
4960 conf.coc_lock = lock;
4961 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
4962 conf.u.coc_layout.lb_len = lock->l_lvb_len;
4963 rc = ll_layout_conf(inode, &conf);
4965 /* refresh layout failed, need to wait */
4966 wait_layout = rc == -EBUSY;
/* Drop our lock reference before potentially blocking below. */
4969 LDLM_LOCK_PUT(lock);
4970 ldlm_lock_decref(lockh, mode);
4972 /* wait for IO to complete if it's still being used. */
4974 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
4975 ll_get_fsname(inode->i_sb, NULL, 0),
4976 PFID(&lli->lli_fid), inode);
4978 memset(&conf, 0, sizeof conf);
4979 conf.coc_opc = OBJECT_CONF_WAIT;
4980 conf.coc_inode = inode;
4981 rc = ll_layout_conf(inode, &conf);
4985 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4986 ll_get_fsname(inode->i_sb, NULL, 0),
4987 PFID(&lli->lli_fid), rc);
4993 * Issue layout intent RPC to MDS.
4994 * \param inode [in] file inode
4995 * \param intent [in] layout intent
4997 * \retval 0 on success
4998 * \retval < 0 error code
5000 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
5002 struct ll_inode_info *lli = ll_i2info(inode);
5003 struct ll_sb_info *sbi = ll_i2sbi(inode);
5004 struct md_op_data *op_data;
5005 struct lookup_intent it;
5006 struct ptlrpc_request *req;
5010 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
5011 0, 0, LUSTRE_OPC_ANY, NULL);
5012 if (IS_ERR(op_data))
5013 RETURN(PTR_ERR(op_data));
/* The layout_intent structure rides along in op_data. */
5015 op_data->op_data = intent;
5016 op_data->op_data_size = sizeof(*intent);
5018 memset(&it, 0, sizeof(it));
5019 it.it_op = IT_LAYOUT;
/* Write/truncate intents need the lock in write mode. */
5020 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
5021 intent->li_opc == LAYOUT_INTENT_TRUNC)
5022 it.it_flags = FMODE_WRITE;
5024 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
5025 ll_get_fsname(inode->i_sb, NULL, 0),
5026 PFID(&lli->lli_fid), inode);
5028 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
5029 &ll_md_blocking_ast, 0);
5030 if (it.it_request != NULL)
5031 ptlrpc_req_finished(it.it_request);
5032 it.it_request = NULL;
5034 ll_finish_md_op_data(op_data);
5036 /* set lock data in case this is a new lock */
5038 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
5040 ll_intent_drop_lock(&it);
5046 * This function checks if there exists a LAYOUT lock on the client side,
5047 * or enqueues it if it doesn't have one in cache.
5049 * This function will not hold layout lock so it may be revoked any time after
5050 * this function returns. Any operations depend on layout should be redone
5053 * This function should be called before lov_io_init() to get an uptodate
5054 * layout version, the caller should save the version number and after IO
5055 * is finished, this function should be called again to verify that layout
5056 * is not changed during IO time.
5058 int ll_layout_refresh(struct inode *inode, __u32 *gen)
5060 struct ll_inode_info *lli = ll_i2info(inode);
5061 struct ll_sb_info *sbi = ll_i2sbi(inode);
5062 struct lustre_handle lockh;
5063 struct layout_intent intent = {
5064 .li_opc = LAYOUT_INTENT_ACCESS,
5066 enum ldlm_mode mode;
/* Fast path: if layout locks are off or we already have a generation,
 * return the cached version without any RPC. */
5070 *gen = ll_layout_version_get(lli);
5071 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
5075 LASSERT(fid_is_sane(ll_inode2fid(inode)));
5076 LASSERT(S_ISREG(inode->i_mode));
5078 /* take layout lock mutex to enqueue layout lock exclusively. */
5079 mutex_lock(&lli->lli_layout_mutex);
5082 /* mostly layout lock is caching on the local side, so try to
5083 * match it before grabbing layout lock mutex. */
5084 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5085 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
5086 if (mode != 0) { /* hit cached lock */
5087 rc = ll_layout_lock_set(&lockh, mode, inode);
/* Cache miss: enqueue a LAYOUT_INTENT_ACCESS intent to the MDS. */
5093 rc = ll_layout_intent(inode, &intent);
5099 *gen = ll_layout_version_get(lli);
5100 mutex_unlock(&lli->lli_layout_mutex);
5106 * Issue layout intent RPC indicating where in a file an IO is about to write.
5108 * \param[in] inode file inode.
5109 * \param[in] ext write range with start offset of file in bytes where
5110 * an IO is about to write, and exclusive end offset in
5113 * \retval 0 on success
5114 * \retval < 0 error code
5116 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5117 struct lu_extent *ext)
5119 struct layout_intent intent = {
5121 .li_extent.e_start = ext->e_start,
5122 .li_extent.e_end = ext->e_end,
5127 rc = ll_layout_intent(inode, &intent);
5133 * This function sends a restore request to the MDT
5135 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5137 struct hsm_user_request *hur;
5141 len = sizeof(struct hsm_user_request) +
5142 sizeof(struct hsm_user_item);
5143 OBD_ALLOC(hur, len);
5147 hur->hur_request.hr_action = HUA_RESTORE;
5148 hur->hur_request.hr_archive_id = 0;
5149 hur->hur_request.hr_flags = 0;
5150 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5151 sizeof(hur->hur_user_item[0].hui_fid));
5152 hur->hur_user_item[0].hui_extent.offset = offset;
5153 hur->hur_user_item[0].hui_extent.length = length;
5154 hur->hur_request.hr_itemcount = 1;
5155 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,