/*
 * GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.gnu.org/licenses/gpl-2.0.html
 *
 * GPL HEADER END
 */
/*
 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2011, 2017, Intel Corporation.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 *
 * Author: Peter Braam <braam@clusterfs.com>
 * Author: Phil Schwan <phil@clusterfs.com>
 * Author: Andreas Dilger <adilger@clusterfs.com>
 */
#define DEBUG_SUBSYSTEM S_LLITE

#include <lustre_dlm.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/sched.h>
#include <linux/user_namespace.h>
#ifdef HAVE_UIDGID_HEADER
# include <linux/uidgid.h>
#endif

#include <uapi/linux/lustre/lustre_ioctl.h>
#include <lustre_swab.h>

#include "cl_object.h"
#include "llite_internal.h"
#include "vvp_internal.h"
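
/* Payload passed as @data to ll_close_inode_openhandle() for a
 * MDS_CLOSE_LAYOUT_SPLIT biased close: the victim inode plus the mirror id
 * to split out (see the switch in ll_close_inode_openhandle() below). */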
struct split_param {
	struct inode	*sp_inode;
	__u16		 sp_mirror_id;
};
static int
ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);

static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
			  bool *lease_broken);
static struct ll_file_data *ll_file_data_get(void)
{
	struct ll_file_data *fd;

	OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
	if (fd == NULL)
		return NULL;

	fd->fd_write_failed = false;

	return fd;
}
static void ll_file_data_put(struct ll_file_data *fd)
{
	if (fd != NULL)
		OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
}
/**
 * Packs all the attributes into @op_data for the CLOSE rpc.
 */
static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
			     struct obd_client_handle *och)
{
	ll_prep_md_op_data(op_data, inode, NULL, NULL,
			   0, 0, LUSTRE_OPC_ANY, NULL);

	op_data->op_attr.ia_mode = inode->i_mode;
	op_data->op_attr.ia_atime = inode->i_atime;
	op_data->op_attr.ia_mtime = inode->i_mtime;
	op_data->op_attr.ia_ctime = inode->i_ctime;
	op_data->op_attr.ia_size = i_size_read(inode);
	op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
				      ATTR_MTIME | ATTR_MTIME_SET |
				      ATTR_CTIME);
	op_data->op_xvalid |= OP_XVALID_CTIME_SET;
	op_data->op_attr_blocks = inode->i_blocks;
	op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
	if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
		op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
	op_data->op_open_handle = och->och_open_handle;

	if (och->och_flags & FMODE_WRITE &&
	    ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
		/* For HSM: if inode data has been modified, pack it so that
		 * the MDT can set the data dirty flag in the archive. */
		op_data->op_bias |= MDS_DATA_MODIFIED;
}
/**
 * Perform a close, possibly with a bias.
 * The meaning of "data" depends on the value of "bias".
 *
 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode
 * to swap layouts with.
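 *
 * A sketch of the bias/data pairings used by the callers in this file:
 *   MDS_HSM_RELEASE        - data is a __u64 * holding the data version
 *                            (see ll_hsm_release())
 *   MDS_CLOSE_LAYOUT_SWAP/
 *   MDS_CLOSE_LAYOUT_MERGE - data is the second struct inode *
 *   MDS_CLOSE_LAYOUT_SPLIT - data is a struct split_param *
 *   MDS_CLOSE_RESYNC_DONE  - data is a struct ll_ioc_lease *
 *   0 (plain close)        - data must be NULL
 */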
static int ll_close_inode_openhandle(struct inode *inode,
				     struct obd_client_handle *och,
				     enum mds_op_bias bias, void *data)
{
	struct obd_export *md_exp = ll_i2mdexp(inode);
	const struct ll_inode_info *lli = ll_i2info(inode);
	struct md_op_data *op_data;
	struct ptlrpc_request *req = NULL;
	int rc;
	ENTRY;

	if (class_exp2obd(md_exp) == NULL) {
		CERROR("%s: invalid MDC connection handle closing "DFID"\n",
		       ll_get_fsname(inode->i_sb, NULL, 0),
		       PFID(&lli->lli_fid));
		GOTO(out, rc = 0);
	}

	OBD_ALLOC_PTR(op_data);
	/* We leak openhandle and request here on error, but not much to be
	 * done in OOM case since app won't retry close on error either. */
	if (op_data == NULL)
		GOTO(out, rc = -ENOMEM);

	ll_prepare_close(inode, op_data, och);
	switch (bias) {
	case MDS_CLOSE_LAYOUT_MERGE:
		/* merge blocks from the victim inode */
		op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
		op_data->op_attr.ia_valid |= ATTR_SIZE;
		op_data->op_xvalid |= OP_XVALID_BLOCKS;
		/* fallthrough */
	case MDS_CLOSE_LAYOUT_SPLIT:
	case MDS_CLOSE_LAYOUT_SWAP: {
		struct split_param *sp = data;

		LASSERT(data != NULL);
		op_data->op_bias |= bias;
		op_data->op_data_version = 0;
		op_data->op_lease_handle = och->och_lease_handle;
		if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
			op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
			op_data->op_mirror_id = sp->sp_mirror_id;
		} else {
			op_data->op_fid2 = *ll_inode2fid(data);
		}
		break;
	}

	case MDS_CLOSE_RESYNC_DONE: {
		struct ll_ioc_lease *ioc = data;

		LASSERT(data != NULL);
		op_data->op_attr_blocks +=
			ioc->lil_count * op_data->op_attr_blocks;
		op_data->op_attr.ia_valid |= ATTR_SIZE;
		op_data->op_xvalid |= OP_XVALID_BLOCKS;
		op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;

		op_data->op_lease_handle = och->och_lease_handle;
		op_data->op_data = &ioc->lil_ids[0];
		op_data->op_data_size =
			ioc->lil_count * sizeof(ioc->lil_ids[0]);
		break;
	}

	case MDS_HSM_RELEASE:
		LASSERT(data != NULL);
		op_data->op_bias |= MDS_HSM_RELEASE;
		op_data->op_data_version = *(__u64 *)data;
		op_data->op_lease_handle = och->och_lease_handle;
		op_data->op_attr.ia_valid |= ATTR_SIZE;
		op_data->op_xvalid |= OP_XVALID_BLOCKS;
		break;

	default:
		LASSERT(data == NULL);
		break;
	}

	if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
		op_data->op_xvalid |= OP_XVALID_LAZYSIZE;
	if (!(op_data->op_xvalid & OP_XVALID_BLOCKS))
		op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS;

	rc = md_close(md_exp, op_data, och->och_mod, &req);
	if (rc != 0 && rc != -EINTR)
		CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
		       md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);

	if (rc == 0 && op_data->op_bias & bias) {
		struct mdt_body *body;

		body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
		if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
			rc = -EBUSY;
	}

	ll_finish_md_op_data(op_data);
	EXIT;
out:
	md_clear_open_replay_data(md_exp, och);
	och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;

	if (req != NULL) /* This is close request */
		ptlrpc_req_finished(req);
	return rc;
}
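
/* The client caches a single MDS open handle per open mode (read, write,
 * exec) in ll_inode_info; ll_md_real_close() drops the usecount for the
 * given mode and sends the close RPC only once the last user is gone. */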
int ll_md_real_close(struct inode *inode, fmode_t fmode)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct obd_client_handle **och_p;
	struct obd_client_handle *och;
	__u64 *och_usecount;
	int rc = 0;
	ENTRY;

	if (fmode & FMODE_WRITE) {
		och_p = &lli->lli_mds_write_och;
		och_usecount = &lli->lli_open_fd_write_count;
	} else if (fmode & FMODE_EXEC) {
		och_p = &lli->lli_mds_exec_och;
		och_usecount = &lli->lli_open_fd_exec_count;
	} else {
		LASSERT(fmode & FMODE_READ);
		och_p = &lli->lli_mds_read_och;
		och_usecount = &lli->lli_open_fd_read_count;
	}

	mutex_lock(&lli->lli_och_mutex);
	if (*och_usecount > 0) {
		/* There are still users of this handle, so skip
		 * freeing it. */
		mutex_unlock(&lli->lli_och_mutex);
		RETURN(0);
	}

	och = *och_p;
	*och_p = NULL;
	mutex_unlock(&lli->lli_och_mutex);

	if (och != NULL) {
		/* There might be a race and this handle may already
		 * be closed. */
		rc = ll_close_inode_openhandle(inode, och, 0, NULL);
	}

	RETURN(rc);
}
static int ll_md_close(struct inode *inode, struct file *file)
{
	union ldlm_policy_data policy = {
		.l_inodebits	= { MDS_INODELOCK_OPEN },
	};
	__u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct ll_inode_info *lli = ll_i2info(inode);
	struct lustre_handle lockh;
	enum ldlm_mode lockmode;
	int rc = 0;
	ENTRY;

	/* clear group lock, if present */
	if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
		ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);

	if (fd->fd_lease_och != NULL) {
		bool lease_broken;

		/* Usually the lease is not released when the
		 * application crashed, we need to release here. */
		rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
		CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
		       PFID(&lli->lli_fid), rc, lease_broken);

		fd->fd_lease_och = NULL;
	}

	if (fd->fd_och != NULL) {
		rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
		fd->fd_och = NULL;
		GOTO(out, rc);
	}

	/* Let's see if we have a good enough OPEN lock on the file and if
	 * we can skip talking to the MDS */
	mutex_lock(&lli->lli_och_mutex);
	if (fd->fd_omode & FMODE_WRITE) {
		lockmode = LCK_CW;
		LASSERT(lli->lli_open_fd_write_count);
		lli->lli_open_fd_write_count--;
	} else if (fd->fd_omode & FMODE_EXEC) {
		lockmode = LCK_PR;
		LASSERT(lli->lli_open_fd_exec_count);
		lli->lli_open_fd_exec_count--;
	} else {
		lockmode = LCK_CR;
		LASSERT(lli->lli_open_fd_read_count);
		lli->lli_open_fd_read_count--;
	}
	mutex_unlock(&lli->lli_och_mutex);

	if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
			   LDLM_IBITS, &policy, lockmode, &lockh))
		rc = ll_md_real_close(inode, fd->fd_omode);

out:
	LUSTRE_FPRIVATE(file) = NULL;
	ll_file_data_put(fd);

	RETURN(rc);
}
/* While this returns an error code, fput() the caller does not, so we need
 * to make every effort to clean up all of our state here.  Also, applications
 * rarely check close errors and even if an error is returned they will not
 * re-try the close call.
 */
int ll_file_release(struct inode *inode, struct file *file)
{
	struct ll_file_data *fd;
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ll_inode_info *lli = ll_i2info(inode);
	int rc;
	ENTRY;

	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
	       PFID(ll_inode2fid(inode)), inode);

	if (inode->i_sb->s_root != file_dentry(file))
		ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
	fd = LUSTRE_FPRIVATE(file);
	LASSERT(fd != NULL);

	/* The last ref on @file, maybe not the owner pid of statahead,
	 * because parent and child process can share the same file handle. */
	if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
		ll_deauthorize_statahead(inode, fd);

	if (inode->i_sb->s_root == file_dentry(file)) {
		LUSTRE_FPRIVATE(file) = NULL;
		ll_file_data_put(fd);
		RETURN(0);
	}

	if (!S_ISDIR(inode->i_mode)) {
		if (lli->lli_clob != NULL)
			lov_read_and_clear_async_rc(lli->lli_clob);
		lli->lli_async_rc = 0;
	}

	rc = ll_md_close(inode, file);

	if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
		libcfs_debug_dumplog();

	RETURN(rc);
}
static inline int ll_dom_readpage(void *data, struct page *page)
{
	struct niobuf_local *lnb = data;
	void *kaddr;

	kaddr = ll_kmap_atomic(page, KM_USER0);
	memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
	if (lnb->lnb_len < PAGE_SIZE)
		memset(kaddr + lnb->lnb_len, 0,
		       PAGE_SIZE - lnb->lnb_len);
	flush_dcache_page(page);
	SetPageUptodate(page);
	ll_kunmap_atomic(kaddr, KM_USER0);
	unlock_page(page);

	return 0;
}
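
/* Data-on-MDT: if the open reply carries inline file data (RMF_NIOBUF_INLINE)
 * under a DOM lock, copy it into the page cache here so that a subsequent
 * read can be served from cache without any further RPC. */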
void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
			struct lookup_intent *it)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct cl_object *obj = lli->lli_clob;
	struct address_space *mapping = inode->i_mapping;
	struct page *vmpage;
	struct niobuf_remote *rnb;
	char *data;
	struct lustre_handle lockh;
	struct ldlm_lock *lock;
	unsigned long index, start;
	struct niobuf_local lnb;
	bool dom_lock = false;
	ENTRY;

	if (obj == NULL)
		RETURN_EXIT;

	if (it->it_lock_mode != 0) {
		lockh.cookie = it->it_lock_handle;
		lock = ldlm_handle2lock(&lockh);
		if (lock != NULL)
			dom_lock = ldlm_has_dom(lock);
		LDLM_LOCK_PUT(lock);
	}
	if (!dom_lock)
		RETURN_EXIT;

	if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
				   RCL_SERVER))
		RETURN_EXIT;

	rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
	if (rnb == NULL || rnb->rnb_len == 0)
		RETURN_EXIT;

	/* LU-11595: the server may return the whole file, which is always OK,
	 * or it may return just the file tail, whose offset must be aligned
	 * with the client PAGE_SIZE to be usable on this client; if the
	 * server's PAGE_SIZE is smaller, the offset may be unaligned and then
	 * the data is just ignored. */
	if (rnb->rnb_offset % PAGE_SIZE)
		RETURN_EXIT;

	/* Server returns whole file or just file tail if it fills in
	 * reply buffer, in both cases total size should be inode size.
	 */
	if (rnb->rnb_offset + rnb->rnb_len < i_size_read(inode)) {
		CERROR("%s: server returns off/len %llu/%u < i_size %llu\n",
		       ll_get_fsname(inode->i_sb, NULL, 0), rnb->rnb_offset,
		       rnb->rnb_len, i_size_read(inode));
		RETURN_EXIT;
	}

	CDEBUG(D_INFO, "Get data along with open at %llu len %i, i_size %llu\n",
	       rnb->rnb_offset, rnb->rnb_len, i_size_read(inode));

	data = (char *)rnb + sizeof(*rnb);

	lnb.lnb_file_offset = rnb->rnb_offset;
	start = lnb.lnb_file_offset / PAGE_SIZE;
	index = 0;
	LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
	lnb.lnb_page_offset = 0;
	do {
		lnb.lnb_data = data + (index << PAGE_SHIFT);
		lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
		if (lnb.lnb_len > PAGE_SIZE)
			lnb.lnb_len = PAGE_SIZE;

		vmpage = read_cache_page(mapping, index + start,
					 ll_dom_readpage, &lnb);
		if (IS_ERR(vmpage)) {
			CWARN("%s: cannot fill page %lu for "DFID
			      " with data: rc = %li\n",
			      ll_get_fsname(inode->i_sb, NULL, 0),
			      index + start, PFID(lu_object_fid(&obj->co_lu)),
			      PTR_ERR(vmpage));
			break;
		}
		put_page(vmpage);
		index++;
	} while (rnb->rnb_len > (index << PAGE_SHIFT));

	EXIT;
}
static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
			       struct lookup_intent *itp)
{
	struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
	struct dentry *parent = de->d_parent;
	const char *name = NULL;
	int len = 0;
	struct md_op_data *op_data;
	struct ptlrpc_request *req = NULL;
	int rc;
	ENTRY;

	LASSERT(parent != NULL);
	LASSERT(itp->it_flags & MDS_OPEN_BY_FID);

	/* if server supports open-by-fid, or file name is invalid, don't pack
	 * name in open request */
	if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
	    lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
		name = de->d_name.name;
		len = de->d_name.len;
	}

	op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
				     name, len, 0, LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data))
		RETURN(PTR_ERR(op_data));
	op_data->op_data = lmm;
	op_data->op_data_size = lmmsize;

	rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
			    &ll_md_blocking_ast, 0);
	ll_finish_md_op_data(op_data);
	if (rc == -ESTALE) {
		/* reason to keep our own exit path: don't flood the log
		 * with -ESTALE error messages.
		 */
		if (!it_disposition(itp, DISP_OPEN_OPEN) ||
		    it_open_error(DISP_OPEN_OPEN, itp))
			GOTO(out, rc);
		ll_release_openhandle(de, itp);
		GOTO(out, rc);
	}

	if (it_disposition(itp, DISP_LOOKUP_NEG))
		GOTO(out, rc = -ENOENT);

	if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
		rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
		CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
		GOTO(out, rc);
	}

	rc = ll_prep_inode(&de->d_inode, req, NULL, itp);

	if (!rc && itp->it_lock_mode) {
		ll_dom_finish_open(de->d_inode, req, itp);
		ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
	}

out:
	ptlrpc_req_finished(req);
	ll_intent_drop_lock(itp);

	/* We did open by fid, but by the time we got to the server, the object
	 * disappeared. If this is a create, we cannot really tell the
	 * userspace that the file it was trying to create does not exist.
	 * Instead let's return -ESTALE, and the VFS will retry the create
	 * with LOOKUP_REVAL that we are going to catch in
	 * ll_revalidate_dentry() and use lookup then.
	 */
	if (rc == -ENOENT && itp->it_op & IT_CREAT)
		rc = -ESTALE;

	RETURN(rc);
}
static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
		       struct obd_client_handle *och)
{
	struct mdt_body *body;

	body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
	och->och_open_handle = body->mbo_open_handle;
	och->och_fid = body->mbo_fid1;
	och->och_lease_handle.cookie = it->it_lock_handle;
	och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
	och->och_flags = it->it_flags;

	return md_set_open_replay_data(md_exp, och, it);
}
static int ll_local_open(struct file *file, struct lookup_intent *it,
			 struct ll_file_data *fd, struct obd_client_handle *och)
{
	struct inode *inode = file_inode(file);
	ENTRY;

	LASSERT(!LUSTRE_FPRIVATE(file));
	LASSERT(fd != NULL);

	if (och) {
		int rc;

		rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
		if (rc != 0)
			RETURN(rc);
	}

	LUSTRE_FPRIVATE(file) = fd;
	ll_readahead_init(inode, &fd->fd_ras);
	fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);

	/* ll_cl_context initialization */
	rwlock_init(&fd->fd_lock);
	INIT_LIST_HEAD(&fd->fd_lccs);

	RETURN(0);
}
/* Open a file, and (for the very first open) create objects on the OSTs at
 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
 * creation or open until ll_lov_setstripe() ioctl is called.
 *
 * If we already have the stripe MD locally then we don't request it in
 * md_open(), by passing a lmm_size = 0.
 *
 * It is up to the application to ensure no other processes open this file
 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 * used.  We might be able to avoid races of that sort by getting lli_open_sem
 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 */
int ll_file_open(struct inode *inode, struct file *file)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct lookup_intent *it, oit = { .it_op = IT_OPEN,
					  .it_flags = file->f_flags };
	struct obd_client_handle **och_p = NULL;
	__u64 *och_usecount = NULL;
	struct ll_file_data *fd;
	int rc = 0;
	ENTRY;

	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
	       PFID(ll_inode2fid(inode)), inode, file->f_flags);

	it = file->private_data; /* XXX: compat macro */
	file->private_data = NULL; /* prevent ll_local_open assertion */

	fd = ll_file_data_get();
	if (fd == NULL)
		GOTO(out_nofiledata, rc = -ENOMEM);

	if (S_ISDIR(inode->i_mode))
		ll_authorize_statahead(inode, fd);

	if (inode->i_sb->s_root == file_dentry(file)) {
		LUSTRE_FPRIVATE(file) = fd;
		RETURN(0);
	}

	if (!it || !it->it_disposition) {
		/* Convert f_flags into access mode. We cannot use file->f_mode,
		 * because everything but O_ACCMODE mask was stripped from
		 * there */
		if ((oit.it_flags + 1) & O_ACCMODE)
			oit.it_flags++;
		if (file->f_flags & O_TRUNC)
			oit.it_flags |= FMODE_WRITE;

		/* The kernel only calls f_op->open() from dentry_open();
		 * filp_open() calls dentry_open() after open_namei() has
		 * checked permissions.  Only nfsd_open() calls dentry_open()
		 * directly without checking permissions, and because of that
		 * the code below is safe.
		 */
		if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
			oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;

		/* We do not want O_EXCL here, presumably we opened the file
		 * already? XXX - NFS implications? */
		oit.it_flags &= ~O_EXCL;

		/* bug20584, if "it_flags" contains O_CREAT, the file will be
		 * created if necessary, then "IT_CREAT" should be set to keep
		 * consistent with it */
		if (oit.it_flags & O_CREAT)
			oit.it_op |= IT_CREAT;

		it = &oit;
	}

restart:
	/* Let's see if we have file open on MDS already. */
	if (it->it_flags & FMODE_WRITE) {
		och_p = &lli->lli_mds_write_och;
		och_usecount = &lli->lli_open_fd_write_count;
	} else if (it->it_flags & FMODE_EXEC) {
		och_p = &lli->lli_mds_exec_och;
		och_usecount = &lli->lli_open_fd_exec_count;
	} else {
		och_p = &lli->lli_mds_read_och;
		och_usecount = &lli->lli_open_fd_read_count;
	}

	mutex_lock(&lli->lli_och_mutex);
	if (*och_p) { /* Open handle is present */
		if (it_disposition(it, DISP_OPEN_OPEN)) {
			/* Well, there's an extra open request that we do not
			 * need; let's close it somehow. This will decref the
			 * request. */
			rc = it_open_error(DISP_OPEN_OPEN, it);
			if (rc) {
				mutex_unlock(&lli->lli_och_mutex);
				GOTO(out_openerr, rc);
			}

			ll_release_openhandle(file_dentry(file), it);
		}
		(*och_usecount)++;

		rc = ll_local_open(file, it, fd, NULL);
		if (rc) {
			(*och_usecount)--;
			mutex_unlock(&lli->lli_och_mutex);
			GOTO(out_openerr, rc);
		}
	} else {
		LASSERT(*och_usecount == 0);
		if (!it->it_disposition) {
			struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
			/* We cannot just request lock handle now, new ELC code
			 * means that one of other OPEN locks for this file
			 * could be cancelled, and since blocking ast handler
			 * would attempt to grab och_mutex as well, that would
			 * result in a deadlock
			 */
			mutex_unlock(&lli->lli_och_mutex);
			/*
			 * Normally called under two situations:
			 * 1. NFS export.
			 * 2. A race/condition on MDS resulting in no open
			 *    handle to be returned from LOOKUP|OPEN request,
			 *    for example if the target entry was a symlink.
			 *
			 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
			 * marked by a bit set in ll_iget_for_nfs. Clear the
			 * bit so that it's not confusing later callers.
			 *
			 * NB; when ldd is NULL, it must have come via normal
			 * lookup path only, since ll_iget_for_nfs always calls
			 * ll_d_init().
			 */
			if (ldd && ldd->lld_nfs_dentry) {
				ldd->lld_nfs_dentry = 0;
				it->it_flags |= MDS_OPEN_LOCK;
			}

			/*
			 * Always specify MDS_OPEN_BY_FID because we don't want
			 * to get a file with a different fid.
			 */
			it->it_flags |= MDS_OPEN_BY_FID;
			rc = ll_intent_file_open(file_dentry(file), NULL, 0,
						 it);
			if (rc)
				GOTO(out_openerr, rc);

			goto restart;
		}
		OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
		if (!*och_p)
			GOTO(out_och_free, rc = -ENOMEM);

		(*och_usecount)++;

		/* md_intent_lock() didn't get a request ref if there was an
		 * open error, so don't do cleanup on the request here */
		/* XXX (green): Should not we bail out on any error here, not
		 * just open error? */
		rc = it_open_error(DISP_OPEN_OPEN, it);
		if (rc != 0)
			GOTO(out_och_free, rc);

		LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
			 "inode %p: disposition %x, status %d\n", inode,
			 it_disposition(it, ~0), it->it_status);

		rc = ll_local_open(file, it, fd, *och_p);
		if (rc)
			GOTO(out_och_free, rc);
	}
	mutex_unlock(&lli->lli_och_mutex);

	/* Must do this outside lli_och_mutex lock to prevent deadlock where
	 * a different kind of OPEN lock for this same inode gets cancelled
	 * by ldlm_cancel_lru
	 */
	if (!S_ISREG(inode->i_mode))
		GOTO(out_och_free, rc);

	cl_lov_delay_create_clear(&file->f_flags);
	GOTO(out_och_free, rc);

out_och_free:
	if (rc) {
		if (och_p && *och_p) {
			OBD_FREE(*och_p, sizeof (struct obd_client_handle));
			*och_p = NULL; /* OBD_FREE writes some magic there */
			*och_usecount = 0;
		}
		mutex_unlock(&lli->lli_och_mutex);

out_openerr:
		if (lli->lli_opendir_key == fd)
			ll_deauthorize_statahead(inode, fd);

		ll_file_data_put(fd);
	} else {
		ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
	}

out_nofiledata:
	if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
		ptlrpc_req_finished(it->it_request);
		it_clear_disposition(it, DISP_ENQ_OPEN_REF);
	}

	RETURN(rc);
}
static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
				    struct ldlm_lock_desc *desc,
				    void *data, int flag)
{
	int rc;
	struct lustre_handle lockh;
	ENTRY;

	switch (flag) {
	case LDLM_CB_BLOCKING:
		ldlm_lock2handle(lock, &lockh);
		rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
		if (rc < 0) {
			CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
			RETURN(rc);
		}
		break;
	case LDLM_CB_CANCELING:
		/* do nothing */
		break;
	}
	RETURN(0);
}
/**
 * When setting a lease on a file, we take ownership of the lli_mds_*_och
 * and save it as fd->fd_och so as to force the client to reopen the file even
 * if it has an open lock in cache already.
 */
static int ll_lease_och_acquire(struct inode *inode, struct file *file,
				struct lustre_handle *old_open_handle)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct obd_client_handle **och_p;
	__u64 *och_usecount;
	int rc = 0;
	ENTRY;

	/* Get the openhandle of the file */
	mutex_lock(&lli->lli_och_mutex);
	if (fd->fd_lease_och != NULL)
		GOTO(out_unlock, rc = -EBUSY);

	if (fd->fd_och == NULL) {
		if (file->f_mode & FMODE_WRITE) {
			LASSERT(lli->lli_mds_write_och != NULL);
			och_p = &lli->lli_mds_write_och;
			och_usecount = &lli->lli_open_fd_write_count;
		} else {
			LASSERT(lli->lli_mds_read_och != NULL);
			och_p = &lli->lli_mds_read_och;
			och_usecount = &lli->lli_open_fd_read_count;
		}

		if (*och_usecount > 1)
			GOTO(out_unlock, rc = -EBUSY);

		fd->fd_och = *och_p;
		*och_p = NULL;
		*och_usecount = 0;
	}

	*old_open_handle = fd->fd_och->och_open_handle;

	EXIT;
out_unlock:
	mutex_unlock(&lli->lli_och_mutex);
	RETURN(rc);
}
/**
 * Release ownership on lli_mds_*_och when putting back a file lease.
 */
static int ll_lease_och_release(struct inode *inode, struct file *file)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct obd_client_handle **och_p;
	struct obd_client_handle *old_och = NULL;
	__u64 *och_usecount;
	int rc = 0;
	ENTRY;

	mutex_lock(&lli->lli_och_mutex);
	if (file->f_mode & FMODE_WRITE) {
		och_p = &lli->lli_mds_write_och;
		och_usecount = &lli->lli_open_fd_write_count;
	} else {
		och_p = &lli->lli_mds_read_och;
		och_usecount = &lli->lli_open_fd_read_count;
	}

	/* The file may have been open by another process (broken lease) so
	 * *och_p is not NULL. In this case we should simply increase usecount
	 * and close fd_och.
	 */
	if (*och_p != NULL) {
		old_och = fd->fd_och;
		(*och_usecount)++;
	} else {
		*och_p = fd->fd_och;
		*och_usecount = 1;
	}
	fd->fd_och = NULL;
	mutex_unlock(&lli->lli_och_mutex);

	if (old_och != NULL)
		rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);

	RETURN(rc);
}
/**
 * Acquire a lease and open the file.
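 *
 * A minimal usage sketch (mirrors ll_hsm_release() below):
 *
 *	och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
 *	if (IS_ERR(och))
 *		return PTR_ERR(och);
 *	...
 *	ll_lease_close(och, inode, NULL);
 */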
static struct obd_client_handle *
ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
	      __u64 open_flags)
{
	struct lookup_intent it = { .it_op = IT_OPEN };
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct md_op_data *op_data;
	struct ptlrpc_request *req = NULL;
	struct lustre_handle old_open_handle = { 0 };
	struct obd_client_handle *och = NULL;
	int rc;
	int rc2;
	ENTRY;

	if (fmode != FMODE_WRITE && fmode != FMODE_READ)
		RETURN(ERR_PTR(-EINVAL));

	if (file != NULL) {
		if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
			RETURN(ERR_PTR(-EPERM));

		rc = ll_lease_och_acquire(inode, file, &old_open_handle);
		if (rc)
			RETURN(ERR_PTR(rc));
	}

	OBD_ALLOC_PTR(och);
	if (och == NULL)
		RETURN(ERR_PTR(-ENOMEM));

	op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
				     LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data))
		GOTO(out, rc = PTR_ERR(op_data));

	/* To tell the MDT this openhandle is from the same owner */
	op_data->op_open_handle = old_open_handle;

	it.it_flags = fmode | open_flags;
	it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
	rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
			    &ll_md_blocking_lease_ast,
	/* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
	 * it can be cancelled which may mislead applications that the lease is
	 * broken;
	 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
	 * open in ll_md_blocking_ast(). Otherwise, since
	 * ll_md_blocking_lease_ast() doesn't deal with the openhandle, the
	 * normal openhandle would be leaked. */
			    LDLM_FL_NO_LRU | LDLM_FL_EXCL);
	ll_finish_md_op_data(op_data);
	ptlrpc_req_finished(req);
	if (rc < 0)
		GOTO(out_release_it, rc);

	if (it_disposition(&it, DISP_LOOKUP_NEG))
		GOTO(out_release_it, rc = -ENOENT);

	rc = it_open_error(DISP_OPEN_OPEN, &it);
	if (rc)
		GOTO(out_release_it, rc);

	LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
	ll_och_fill(sbi->ll_md_exp, &it, och);

	if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
		GOTO(out_close, rc = -EOPNOTSUPP);

	/* lease granted, handle the lease lock */
	ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
	if (it.it_lock_mode == 0 ||
	    it.it_lock_bits != MDS_INODELOCK_OPEN) {
		/* open lock must return for lease */
		CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
		       PFID(ll_inode2fid(inode)), it.it_lock_mode,
		       it.it_lock_bits);
		GOTO(out_close, rc = -EPROTO);
	}

	ll_intent_release(&it);
	RETURN(och);

out_close:
	/* Cancel open lock */
	if (it.it_lock_mode != 0) {
		ldlm_lock_decref_and_cancel(&och->och_lease_handle,
					    it.it_lock_mode);
		it.it_lock_mode = 0;
		och->och_lease_handle.cookie = 0ULL;
	}
	rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
	if (rc2 < 0)
		CERROR("%s: error closing file "DFID": %d\n",
		       ll_get_fsname(inode->i_sb, NULL, 0),
		       PFID(&ll_i2info(inode)->lli_fid), rc2);
	och = NULL; /* och has been freed in ll_close_inode_openhandle() */
out_release_it:
	ll_intent_release(&it);
out:
	if (och != NULL)
		OBD_FREE_PTR(och);
	RETURN(ERR_PTR(rc));
}
/**
 * Check whether a layout swap can be done between two inodes.
 *
 * \param[in] inode1  First inode to check
 * \param[in] inode2  Second inode to check
 *
 * \retval 0 on success, layout swap can be performed between both inodes
 * \retval negative error code if requirements are not met
 */
static int ll_check_swap_layouts_validity(struct inode *inode1,
					  struct inode *inode2)
{
	if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
		return -EINVAL;

	if (inode_permission(inode1, MAY_WRITE) ||
	    inode_permission(inode2, MAY_WRITE))
		return -EPERM;

	if (inode1->i_sb != inode2->i_sb)
		return -EXDEV;

	return 0;
}
static int ll_swap_layouts_close(struct obd_client_handle *och,
				 struct inode *inode, struct inode *inode2)
{
	const struct lu_fid *fid1 = ll_inode2fid(inode);
	const struct lu_fid *fid2;
	int rc;
	ENTRY;

	CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
	       ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));

	rc = ll_check_swap_layouts_validity(inode, inode2);
	if (rc < 0)
		GOTO(out_free_och, rc);

	/* We now know that inode2 is a lustre inode */
	fid2 = ll_inode2fid(inode2);

	rc = lu_fid_cmp(fid1, fid2);
	if (rc == 0)
		GOTO(out_free_och, rc = -EINVAL);

	/* Close the file and {swap,merge} layouts between inode & inode2.
	 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
	 * because we still need it to pack l_remote_handle to MDT. */
	rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
				       inode2);

	och = NULL; /* freed in ll_close_inode_openhandle() */

out_free_och:
	if (och != NULL)
		OBD_FREE_PTR(och);

	RETURN(rc);
}
/**
 * Release lease and close the file.
 * It will check if the lease has ever been broken.
 */
static int ll_lease_close_intent(struct obd_client_handle *och,
				 struct inode *inode,
				 bool *lease_broken, enum mds_op_bias bias,
				 void *data)
{
	struct ldlm_lock *lock;
	bool cancelled = true;
	int rc;
	ENTRY;

	lock = ldlm_handle2lock(&och->och_lease_handle);
	if (lock != NULL) {
		lock_res_and_lock(lock);
		cancelled = ldlm_is_cancel(lock);
		unlock_res_and_lock(lock);
		LDLM_LOCK_PUT(lock);
	}

	CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
	       PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);

	if (lease_broken != NULL)
		*lease_broken = cancelled;

	if (!cancelled && !bias)
		ldlm_cli_cancel(&och->och_lease_handle, 0);

	if (cancelled) { /* no need to execute the intent */
		bias = 0;
		data = NULL;
	}

	rc = ll_close_inode_openhandle(inode, och, bias, data);
	RETURN(rc);
}

static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
			  bool *lease_broken)
{
	return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
}
/**
 * After the lease is taken, send the RPC MDS_REINT_RESYNC to the MDT.
 */
static int ll_lease_file_resync(struct obd_client_handle *och,
				struct inode *inode, unsigned long arg)
{
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct md_op_data *op_data;
	struct ll_ioc_lease_id ioc;
	__u64 data_version_unused;
	int rc;
	ENTRY;

	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
				     LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data))
		RETURN(PTR_ERR(op_data));

	if (copy_from_user(&ioc, (struct ll_ioc_lease_id __user *)arg,
			   sizeof(ioc)))
		GOTO(out, rc = -EFAULT);

	/* before starting file resync, it's necessary to clean up the page
	 * cache in client memory, otherwise once the layout version is
	 * increased, writing back cached data will be denied by the OSTs. */
	rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
	if (rc)
		GOTO(out, rc);

	op_data->op_lease_handle = och->och_lease_handle;
	op_data->op_mirror_id = ioc.lil_mirror_id;
	rc = md_file_resync(sbi->ll_md_exp, op_data);
	if (rc)
		GOTO(out, rc);

	EXIT;
out:
	ll_finish_md_op_data(op_data);
	return rc;
}
int ll_merge_attr(const struct lu_env *env, struct inode *inode)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct cl_object *obj = lli->lli_clob;
	struct cl_attr *attr = vvp_env_thread_attr(env);
	s64 atime;
	s64 mtime;
	s64 ctime;
	int rc = 0;
	ENTRY;

	ll_inode_size_lock(inode);

	/* Merge timestamps the most recently obtained from MDS with
	 * timestamps obtained from OSTs.
	 *
	 * Do not overwrite atime of inode because it may be refreshed
	 * by file_accessed() function. If the read was served by cache
	 * data, there is no RPC to be sent so that atime may not be
	 * transferred to OSTs at all. MDT only updates atime at close time
	 * if it's at least 'mdd.*.atime_diff' older.
	 * All in all, the atime in Lustre does not strictly comply with
	 * POSIX. Solving this problem needs to send an RPC to MDT for each
	 * read, and this will hurt performance.
	 */
	if (inode->i_atime.tv_sec < lli->lli_atime ||
	    lli->lli_update_atime) {
		inode->i_atime.tv_sec = lli->lli_atime;
		lli->lli_update_atime = 0;
	}
	inode->i_mtime.tv_sec = lli->lli_mtime;
	inode->i_ctime.tv_sec = lli->lli_ctime;

	mtime = inode->i_mtime.tv_sec;
	atime = inode->i_atime.tv_sec;
	ctime = inode->i_ctime.tv_sec;

	cl_object_attr_lock(obj);
	if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
		rc = -EINVAL;
	else
		rc = cl_object_attr_get(env, obj, attr);
	cl_object_attr_unlock(obj);

	if (rc != 0)
		GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));

	if (atime < attr->cat_atime)
		atime = attr->cat_atime;

	if (ctime < attr->cat_ctime)
		ctime = attr->cat_ctime;

	if (mtime < attr->cat_mtime)
		mtime = attr->cat_mtime;

	CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
	       PFID(&lli->lli_fid), attr->cat_size);

	i_size_write(inode, attr->cat_size);
	inode->i_blocks = attr->cat_blocks;

	inode->i_mtime.tv_sec = mtime;
	inode->i_atime.tv_sec = atime;
	inode->i_ctime.tv_sec = ctime;

out_size_unlock:
	ll_inode_size_unlock(inode);

	RETURN(rc);
}
/**
 * Set designated mirror for I/O.
 *
 * So far only read, write, and truncate support issuing I/O to a
 * designated mirror.
 */
void ll_io_set_mirror(struct cl_io *io, const struct file *file)
{
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);

	/* clear layout version for generic(non-resync) I/O in case it carries
	 * stale layout version due to I/O restart */
	io->ci_layout_version = 0;

	/* FLR: disable non-delay for designated mirror I/O because obviously
	 * only one mirror is available */
	if (fd->fd_designated_mirror > 0) {
		io->ci_ndelay = 0;
		io->ci_designated_mirror = fd->fd_designated_mirror;
		io->ci_layout_version = fd->fd_layout_version;
	}

	CDEBUG(D_VFSTRACE, "%s: designated mirror: %d\n",
	       file->f_path.dentry->d_name.name, io->ci_designated_mirror);
}
static bool file_is_noatime(const struct file *file)
{
	const struct vfsmount *mnt = file->f_path.mnt;
	const struct inode *inode = file_inode((struct file *)file);

	/* Adapted from file_accessed() and touch_atime(). */
	if (file->f_flags & O_NOATIME)
		return true;

	if (inode->i_flags & S_NOATIME)
		return true;

	if (IS_NOATIME(inode))
		return true;

	if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
		return true;

	if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
		return true;

	if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
		return true;

	return false;
}
static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
{
	struct inode *inode = file_inode(file);
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);

	io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
	io->ci_lock_no_expand = fd->ll_lock_no_expand;

	if (iot == CIT_WRITE) {
		io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
		io->u.ci_wr.wr_sync   = !!(file->f_flags & O_SYNC ||
					   file->f_flags & O_DIRECT ||
					   IS_SYNC(inode));
	}
	io->ci_obj = ll_i2info(inode)->lli_clob;
	io->ci_lockreq = CILR_MAYBE;
	if (ll_file_nolock(file)) {
		io->ci_lockreq = CILR_NEVER;
		io->ci_no_srvlock = 1;
	} else if (file->f_flags & O_APPEND) {
		io->ci_lockreq = CILR_MANDATORY;
	}
	io->ci_noatime = file_is_noatime(file);

	/* FLR: only use non-delay I/O for read as there is only one
	 * available mirror for write. */
	io->ci_ndelay = !(iot == CIT_WRITE);

	ll_io_set_mirror(io, file);
}
static ssize_t
ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
		   struct file *file, enum cl_io_type iot,
		   loff_t *ppos, size_t count)
{
	struct vvp_io *vio = vvp_env_io(env);
	struct inode *inode = file_inode(file);
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct range_lock range;
	struct cl_io *io;
	ssize_t result = 0;
	int rc = 0;
	unsigned retried = 0;
	bool restarted = false;
	ENTRY;

	CDEBUG(D_VFSTRACE, "%s: %s ppos: %llu, count: %zu\n",
	       file_dentry(file)->d_name.name,
	       iot == CIT_READ ? "read" : "write", *ppos, count);

restart:
	io = vvp_env_thread_io(env);
	ll_io_init(io, file, iot);
	io->ci_ndelay_tried = retried;

	if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
		bool range_locked = false;

		if (file->f_flags & O_APPEND)
			range_lock_init(&range, 0, LUSTRE_EOF);
		else
			range_lock_init(&range, *ppos, *ppos + count - 1);

		vio->vui_fd = LUSTRE_FPRIVATE(file);
		vio->vui_io_subtype = args->via_io_subtype;

		switch (vio->vui_io_subtype) {
		case IO_NORMAL:
			vio->vui_iter = args->u.normal.via_iter;
			vio->vui_iocb = args->u.normal.via_iocb;
			/* Direct IO reads must also take range lock,
			 * or multiple reads will try to work on the same pages
			 * See LU-6227 for details. */
			if (((iot == CIT_WRITE) ||
			     (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
			    !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
				CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
				       RL_PARA(&range));
				rc = range_lock(&lli->lli_write_tree, &range);
				if (rc < 0)
					GOTO(out, rc);

				range_locked = true;
			}
			break;
		case IO_SPLICE:
			vio->u.splice.vui_pipe = args->u.splice.via_pipe;
			vio->u.splice.vui_flags = args->u.splice.via_flags;
			break;
		default:
			CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
			LBUG();
		}

		ll_cl_add(file, env, io, LCC_RW);
		rc = cl_io_loop(env, io);
		ll_cl_remove(file, env);

		if (range_locked) {
			CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
			       RL_PARA(&range));
			range_unlock(&lli->lli_write_tree, &range);
		}
	} else {
		/* cl_io_rw_init() handled IO */
		rc = io->ci_result;
	}

	if (io->ci_nob > 0) {
		result += io->ci_nob;
		count -= io->ci_nob;
		*ppos = io->u.ci_wr.wr.crw_pos; /* for splice */

		/* prepare IO restart */
		if (count > 0 && args->via_io_subtype == IO_NORMAL)
			args->u.normal.via_iter = vio->vui_iter;
	}
out:
	cl_io_fini(env, io);

	CDEBUG(D_VFSTRACE,
	       "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
	       file->f_path.dentry->d_name.name,
	       iot, rc, result, io->ci_need_restart);

	if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
		CDEBUG(D_VFSTRACE,
		       "%s: restart %s from %lld, count: %zu, ret: %zd, rc: %d\n",
		       file_dentry(file)->d_name.name,
		       iot == CIT_READ ? "read" : "write",
		       *ppos, count, result, rc);
		/* preserve the tried count for FLR */
		retried = io->ci_ndelay_tried;
		restarted = true;
		goto restart;
	}

	if (iot == CIT_READ) {
		if (result > 0)
			ll_stats_ops_tally(ll_i2sbi(inode),
					   LPROC_LL_READ_BYTES, result);
	} else if (iot == CIT_WRITE) {
		if (result > 0) {
			ll_stats_ops_tally(ll_i2sbi(inode),
					   LPROC_LL_WRITE_BYTES, result);
			fd->fd_write_failed = false;
		} else if (result == 0 && rc == 0) {
			rc = io->ci_result;
			if (rc < 0)
				fd->fd_write_failed = true;
			else
				fd->fd_write_failed = false;
		} else if (rc != -ERESTARTSYS) {
			fd->fd_write_failed = true;
		}
	}

	CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);

	RETURN(result > 0 ? result : rc);
}
/**
 * The purpose of fast read is to overcome per-I/O overhead and improve IOPS,
 * especially for small I/O.
 *
 * To serve a read request, CLIO has to create and initialize a cl_io and
 * then request a DLM lock. This has turned out to have significant overhead
 * and affects the performance of small I/O dramatically.
 *
 * It's not necessary to create a cl_io for each I/O. Under the help of read
 * ahead, most of the pages being read are already in memory cache and we can
 * read those pages directly because if the pages exist, the corresponding DLM
 * lock must exist so that page content must be valid.
 *
 * In the fast read implementation, the llite speculatively finds and reads
 * pages in memory cache. There are three scenarios for fast read:
 *   - If the page exists and is uptodate, kernel VM will provide the data and
 *     CLIO won't be intervened;
 *   - If the page was brought into memory by read ahead, it will be exported
 *     and read ahead parameters will be updated;
 *   - Otherwise the page is not in memory, we can't do fast read. Therefore,
 *     it will go back and invoke normal read, i.e., a cl_io will be created
 *     and DLM lock will be requested.
 *
 * POSIX compliance: the POSIX standard states that read is intended to be
 * atomic. The Lustre read implementation is in line with the Linux kernel
 * read implementation and neither of them complies with the POSIX standard
 * in this matter. Fast read doesn't make the situation worse on a single
 * node but it may interleave write results from multiple nodes due to the
 * short read handling in ll_file_aio_read().
 *
 * \param env - lu_env
 * \param iocb - kiocb from kernel
 * \param iter - user space buffers where the data will be copied
 *
 * \retval - number of bytes that have been read, or error code on failure.
 */
static ssize_t
ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
{
	ssize_t result;

	if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
		return 0;

	/* NB: we can't do direct IO for fast read because it will need a lock
	 * to make IO engine happy. */
	if (iocb->ki_filp->f_flags & O_DIRECT)
		return 0;

	result = generic_file_read_iter(iocb, iter);

	/* If the first page is not in cache, generic_file_aio_read() will be
	 * returned with -ENODATA.
	 * See corresponding code in ll_readpage(). */
	if (result == -ENODATA)
		result = 0;

	if (result > 0)
		ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
				   LPROC_LL_READ_BYTES, result);

	return result;
}
/*
 * Read from a file (through the page cache).
 */
static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct lu_env *env;
	struct vvp_io_args *args;
	ssize_t result;
	ssize_t rc2;
	__u16 refcheck;

	result = ll_do_fast_read(iocb, to);
	if (result < 0 || iov_iter_count(to) == 0)
		GOTO(out, result);

	env = cl_env_get(&refcheck);
	if (IS_ERR(env))
		return PTR_ERR(env);

	args = ll_env_args(env, IO_NORMAL);
	args->u.normal.via_iter = to;
	args->u.normal.via_iocb = iocb;

	rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
				 &iocb->ki_pos, iov_iter_count(to));
	if (rc2 > 0)
		result += rc2;
	else if (result == 0)
		result = rc2;

	cl_env_put(env, &refcheck);
out:
	return result;
}
/**
 * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
 * If a page is already in the page cache and dirty (and some other things -
 * see ll_tiny_write_begin for the instantiation of these rules), then we can
 * write to it without doing a full I/O, because Lustre already knows about it
 * and will write it out.  This saves a lot of processing time.
 *
 * All writes here are within one page, so exclusion is handled by the page
 * lock on the vm page.  We do not do tiny writes for writes which touch
 * multiple pages because it's very unlikely multiple sequential pages are
 * already dirty.
 *
 * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
 * and are unlikely to be to already dirty pages.
 *
 * Attribute updates are important here, we do them in ll_tiny_write_end.
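 *
 * For example, with 4KB pages a 100-byte write at offset 0 of an
 * already-dirty cached page qualifies, while a PAGE_SIZE write, or a write
 * crossing a page boundary, falls back to the normal cl_io path below.
 */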
static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
{
	ssize_t count = iov_iter_count(iter);
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	ssize_t result = 0;
	ENTRY;

	/* Restrict writes to single page and < PAGE_SIZE.  See comment at top
	 * of function for why.
	 */
	if (count >= PAGE_SIZE ||
	    (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
		RETURN(0);

	result = __generic_file_write_iter(iocb, iter);

	/* If the page is not already dirty, ll_tiny_write_begin returns
	 * -ENODATA.  We continue on to normal write.
	 */
	if (result == -ENODATA)
		RETURN(0);

	if (result > 0) {
		ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
				   result);
		ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
	}

	CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);

	RETURN(result);
}
/*
 * Write to a file (through the page cache).
 */
static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct vvp_io_args *args;
	struct lu_env *env;
	ssize_t rc_tiny = 0, rc_normal;
	__u16 refcheck;
	ENTRY;

	/* NB: we can't do direct IO for tiny writes because they use the page
	 * cache, we can't do sync writes because tiny writes can't flush
	 * pages, and we can't do append writes because we can't guarantee the
	 * required DLM locks are held to protect file size.
	 */
	if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(iocb->ki_filp))) &&
	    !(iocb->ki_filp->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
		rc_tiny = ll_do_tiny_write(iocb, from);

	/* In case of error, go on and try normal write - Only stop if tiny
	 * write completed I/O.
	 */
	if (iov_iter_count(from) == 0)
		GOTO(out, rc_normal = rc_tiny);

	env = cl_env_get(&refcheck);
	if (IS_ERR(env))
		return PTR_ERR(env);

	args = ll_env_args(env, IO_NORMAL);
	args->u.normal.via_iter = from;
	args->u.normal.via_iocb = iocb;

	rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
				       &iocb->ki_pos, iov_iter_count(from));

	/* On success, combine bytes written. */
	if (rc_tiny >= 0 && rc_normal > 0)
		rc_normal += rc_tiny;
	/* On error, only return error from normal write if tiny write did not
	 * write any bytes.  Otherwise return bytes written by tiny write.
	 */
	else if (rc_tiny > 0)
		rc_normal = rc_tiny;

	cl_env_put(env, &refcheck);
out:
	RETURN(rc_normal);
}
#ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
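/* Compat shims for kernels without f_op->read_iter/->write_iter: validate the
 * iovec array, build an iov_iter from it and forward to the *_iter paths
 * above. */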
/*
 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
 */
static int ll_file_get_iov_count(const struct iovec *iov,
				 unsigned long *nr_segs, size_t *count)
{
	size_t cnt = 0;
	unsigned long seg;

	for (seg = 0; seg < *nr_segs; seg++) {
		const struct iovec *iv = &iov[seg];

		/*
		 * If any segment has a negative length, or the cumulative
		 * length ever wraps negative then return -EINVAL.
		 */
		cnt += iv->iov_len;
		if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
			return -EINVAL;
		if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
			continue;
		if (seg == 0)
			return -EFAULT;
		*nr_segs = seg;
		cnt -= iv->iov_len;	/* This segment is no good */
		break;
	}
	*count = cnt;
	return 0;
}
static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
				unsigned long nr_segs, loff_t pos)
{
	struct iov_iter to;
	size_t iov_count;
	ssize_t result;
	ENTRY;

	result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
	if (result)
		RETURN(result);

# ifdef HAVE_IOV_ITER_INIT_DIRECTION
	iov_iter_init(&to, READ, iov, nr_segs, iov_count);
# else /* !HAVE_IOV_ITER_INIT_DIRECTION */
	iov_iter_init(&to, iov, nr_segs, iov_count, 0);
# endif /* HAVE_IOV_ITER_INIT_DIRECTION */

	result = ll_file_read_iter(iocb, &to);

	RETURN(result);
}

static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
			    loff_t *ppos)
{
	struct iovec iov = { .iov_base = buf, .iov_len = count };
	struct kiocb kiocb;
	ssize_t result;
	ENTRY;

	init_sync_kiocb(&kiocb, file);
	kiocb.ki_pos = *ppos;
#ifdef HAVE_KIOCB_KI_LEFT
	kiocb.ki_left = count;
#elif defined(HAVE_KI_NBYTES)
	kiocb.ki_nbytes = count;
#endif

	result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
	*ppos = kiocb.ki_pos;

	RETURN(result);
}
/*
 * Write to a file (through the page cache).
 */
static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
				 unsigned long nr_segs, loff_t pos)
{
	struct iov_iter from;
	size_t iov_count;
	ssize_t result;
	ENTRY;

	result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
	if (result)
		RETURN(result);

# ifdef HAVE_IOV_ITER_INIT_DIRECTION
	iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
# else /* !HAVE_IOV_ITER_INIT_DIRECTION */
	iov_iter_init(&from, iov, nr_segs, iov_count, 0);
# endif /* HAVE_IOV_ITER_INIT_DIRECTION */

	result = ll_file_write_iter(iocb, &from);

	RETURN(result);
}

static ssize_t ll_file_write(struct file *file, const char __user *buf,
			     size_t count, loff_t *ppos)
{
	struct iovec iov = { .iov_base = (void __user *)buf,
			     .iov_len = count };
	struct kiocb kiocb;
	ssize_t result;
	ENTRY;

	init_sync_kiocb(&kiocb, file);
	kiocb.ki_pos = *ppos;
#ifdef HAVE_KIOCB_KI_LEFT
	kiocb.ki_left = count;
#elif defined(HAVE_KI_NBYTES)
	kiocb.ki_nbytes = count;
#endif

	result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
	*ppos = kiocb.ki_pos;

	RETURN(result);
}
#endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
/*
 * Send file content (through pagecache) somewhere with helper
 */
static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
				   struct pipe_inode_info *pipe, size_t count,
				   unsigned int flags)
{
	struct lu_env *env;
	struct vvp_io_args *args;
	ssize_t result;
	__u16 refcheck;
	ENTRY;

	env = cl_env_get(&refcheck);
	if (IS_ERR(env))
		RETURN(PTR_ERR(env));

	args = ll_env_args(env, IO_SPLICE);
	args->u.splice.via_pipe = pipe;
	args->u.splice.via_flags = flags;

	result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
	cl_env_put(env, &refcheck);
	RETURN(result);
}
int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
			     __u64 flags, struct lov_user_md *lum, int lum_size)
{
	struct lookup_intent oit = {
		.it_op = IT_OPEN,
		.it_flags = flags | MDS_OPEN_BY_FID,
	};
	int rc;
	ENTRY;

	ll_inode_size_lock(inode);
	rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
	if (rc < 0)
		GOTO(out_unlock, rc);

	ll_release_openhandle(dentry, &oit);

out_unlock:
	ll_inode_size_unlock(inode);
	ll_intent_release(&oit);

	RETURN(rc);
}
int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
			     struct lov_mds_md **lmmp, int *lmm_size,
			     struct ptlrpc_request **request)
{
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct mdt_body *body;
	struct lov_mds_md *lmm = NULL;
	struct ptlrpc_request *req = NULL;
	struct md_op_data *op_data;
	int rc, lmmsize;

	rc = ll_get_default_mdsize(sbi, &lmmsize);
	if (rc)
		RETURN(rc);

	op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
				     strlen(filename), lmmsize,
				     LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data))
		RETURN(PTR_ERR(op_data));

	op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
	rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
	ll_finish_md_op_data(op_data);
	if (rc < 0) {
		CDEBUG(D_INFO, "md_getattr_name failed on %s: rc %d\n",
		       filename, rc);
		GOTO(out, rc);
	}

	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
	LASSERT(body != NULL); /* checked by mdc_getattr_name */

	lmmsize = body->mbo_eadatasize;

	if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
	    lmmsize == 0)
		GOTO(out, rc = -ENODATA);

	lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
	LASSERT(lmm != NULL);

	if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
	    lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
	    lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
		GOTO(out, rc = -EPROTO);

	/*
	 * This is coming from the MDS, so is probably in
	 * little endian. We convert it to host endian before
	 * passing it to userspace.
	 */
	if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
		int stripe_count = 0;

		if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
		    lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
			stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
			if (le32_to_cpu(lmm->lmm_pattern) &
			    LOV_PATTERN_F_RELEASED)
				stripe_count = 0;
		}

		/* if the function is called for a directory, we should
		 * avoid swabbing nonexistent lsm objects */
		if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
			lustre_swab_lov_user_md_v1(
					(struct lov_user_md_v1 *)lmm);
			if (S_ISREG(body->mbo_mode))
				lustre_swab_lov_user_md_objects(
				    ((struct lov_user_md_v1 *)lmm)->lmm_objects,
				    stripe_count);
		} else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
			lustre_swab_lov_user_md_v3(
					(struct lov_user_md_v3 *)lmm);
			if (S_ISREG(body->mbo_mode))
				lustre_swab_lov_user_md_objects(
				    ((struct lov_user_md_v3 *)lmm)->lmm_objects,
				    stripe_count);
		} else if (lmm->lmm_magic ==
			   cpu_to_le32(LOV_MAGIC_COMP_V1)) {
			lustre_swab_lov_comp_md_v1(
					(struct lov_comp_md_v1 *)lmm);
		}
	}

out:
	*lmmp = lmm;
	*lmm_size = lmmsize;
	*request = req;
	return rc;
}
static int ll_lov_setea(struct inode *inode, struct file *file,
			void __user *arg)
{
	__u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
	struct lov_user_md *lump;
	int lum_size = sizeof(struct lov_user_md) +
		       sizeof(struct lov_user_ost_data);
	int rc;
	ENTRY;

	if (!cfs_capable(CFS_CAP_SYS_ADMIN))
		RETURN(-EPERM);

	OBD_ALLOC_LARGE(lump, lum_size);
	if (lump == NULL)
		RETURN(-ENOMEM);

	if (copy_from_user(lump, arg, lum_size))
		GOTO(out_lump, rc = -EFAULT);

	rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
				      lum_size);
	cl_lov_delay_create_clear(&file->f_flags);

out_lump:
	OBD_FREE_LARGE(lump, lum_size);
	RETURN(rc);
}
static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
{
	struct lu_env *env;
	__u16 refcheck;
	int rc;
	ENTRY;

	env = cl_env_get(&refcheck);
	if (IS_ERR(env))
		RETURN(PTR_ERR(env));

	rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
	cl_env_put(env, &refcheck);
	RETURN(rc);
}
static int ll_lov_setstripe(struct inode *inode, struct file *file,
			    void __user *arg)
{
	struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
	struct lov_user_md *klum;
	int lum_size, rc;
	__u64 flags = FMODE_WRITE;
	ENTRY;

	rc = ll_copy_user_md(lum, &klum);
	if (rc < 0)
		RETURN(rc);

	lum_size = rc;
	rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
				      lum_size);
	if (!rc) {
		__u32 gen;

		rc = put_user(0, &lum->lmm_stripe_count);
		if (rc)
			GOTO(out, rc);

		rc = ll_layout_refresh(inode, &gen);
		if (rc)
			GOTO(out, rc);

		rc = ll_file_getstripe(inode, arg, lum_size);
	}
	cl_lov_delay_create_clear(&file->f_flags);

out:
	OBD_FREE(klum, lum_size);
	RETURN(rc);
}
static int
ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct cl_object *obj = lli->lli_clob;
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct ll_grouplock grouplock;
	int rc;
	ENTRY;

	if (arg == 0) {
		CWARN("group id for group lock must not be 0\n");
		RETURN(-EINVAL);
	}

	if (ll_file_nolock(file))
		RETURN(-EOPNOTSUPP);

	spin_lock(&lli->lli_lock);
	if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
		CWARN("group lock already exists with gid %lu\n",
		      fd->fd_grouplock.lg_gid);
		spin_unlock(&lli->lli_lock);
		RETURN(-EINVAL);
	}
	LASSERT(fd->fd_grouplock.lg_lock == NULL);
	spin_unlock(&lli->lli_lock);

	/**
	 * XXX: group lock needs to protect all OST objects while PFL
	 * can add new OST objects during the IO, so we'd instantiate
	 * all OST objects before getting its group lock.
	 */
	if (obj) {
		struct lu_env *env;
		__u16 refcheck;
		struct cl_layout cl = {
			.cl_is_composite = false,
		};
		struct lu_extent ext = {
			.e_start = 0,
			.e_end = OBD_OBJECT_EOF,
		};

		env = cl_env_get(&refcheck);
		if (IS_ERR(env))
			RETURN(PTR_ERR(env));

		rc = cl_object_layout_get(env, obj, &cl);
		if (!rc && cl.cl_is_composite)
			rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
						    &ext);

		cl_env_put(env, &refcheck);
		if (rc)
			RETURN(rc);
	}

	rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
			      arg, (file->f_flags & O_NONBLOCK), &grouplock);
	if (rc)
		RETURN(rc);

	spin_lock(&lli->lli_lock);
	if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
		spin_unlock(&lli->lli_lock);
		CERROR("another thread just won the race\n");
		cl_put_grouplock(&grouplock);
		RETURN(-EINVAL);
	}

	fd->fd_flags |= LL_FILE_GROUP_LOCKED;
	fd->fd_grouplock = grouplock;
	spin_unlock(&lli->lli_lock);

	CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
	RETURN(0);
}
static int ll_put_grouplock(struct inode *inode, struct file *file,
			    unsigned long arg)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct ll_grouplock grouplock;
	ENTRY;

	spin_lock(&lli->lli_lock);
	if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
		spin_unlock(&lli->lli_lock);
		CWARN("no group lock held\n");
		RETURN(-EINVAL);
	}

	LASSERT(fd->fd_grouplock.lg_lock != NULL);

	if (fd->fd_grouplock.lg_gid != arg) {
		CWARN("group lock %lu doesn't match current id %lu\n",
		      arg, fd->fd_grouplock.lg_gid);
		spin_unlock(&lli->lli_lock);
		RETURN(-EINVAL);
	}

	grouplock = fd->fd_grouplock;
	memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
	fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
	spin_unlock(&lli->lli_lock);

	cl_put_grouplock(&grouplock);
	CDEBUG(D_INFO, "group lock %lu released\n", arg);
	RETURN(0);
}
/**
 * Close inode open handle
 *
 * \param dentry [in]     dentry which contains the inode
 * \param it     [in,out] intent which contains open info and result
 *
 * \retval 0     success
 * \retval <0    failure
 */
int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
{
	struct inode *inode = dentry->d_inode;
	struct obd_client_handle *och;
	int rc;
	ENTRY;

	LASSERT(inode);

	/* Root ? Do nothing. */
	if (dentry->d_inode->i_sb->s_root == dentry)
		RETURN(0);

	/* No open handle to close? Move away */
	if (!it_disposition(it, DISP_OPEN_OPEN))
		RETURN(0);

	LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);

	OBD_ALLOC(och, sizeof(*och));
	if (!och)
		GOTO(out, rc = -ENOMEM);

	ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);

	rc = ll_close_inode_openhandle(inode, och, 0, NULL);
out:
	/* this one is in place of ll_file_open */
	if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
		ptlrpc_req_finished(it->it_request);
		it_clear_disposition(it, DISP_ENQ_OPEN_REF);
	}
	RETURN(rc);
}
/**
 * Get size for inode for which FIEMAP mapping is requested.
 * Make the FIEMAP get_info call and return the result.
 *
 * \param fiemap	kernel buffer to hold extents
 * \param num_bytes	kernel buffer size
 */
static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
			size_t num_bytes)
{
	struct lu_env *env;
	__u16 refcheck;
	int rc = 0;
	struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
	ENTRY;

	/* Checks for fiemap flags */
	if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
		fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
		return -EBADR;
	}

	/* Check for FIEMAP_FLAG_SYNC */
	if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
		rc = filemap_fdatawrite(inode->i_mapping);
		if (rc)
			return rc;
	}

	env = cl_env_get(&refcheck);
	if (IS_ERR(env))
		RETURN(PTR_ERR(env));

	if (i_size_read(inode) == 0) {
		rc = ll_glimpse_size(inode);
		if (rc)
			GOTO(out, rc);
	}

	fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
	obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
	obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);

	/* If filesize is 0, then there would be no objects for mapping */
	if (fmkey.lfik_oa.o_size == 0) {
		fiemap->fm_mapped_extents = 0;
		GOTO(out, rc = 0);
	}

	fmkey.lfik_fiemap = *fiemap;

	rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
			      &fmkey, fiemap, &num_bytes);
out:
	cl_env_put(env, &refcheck);
	RETURN(rc);
}
2256 int ll_fid2path(struct inode *inode, void __user *arg)
2258 struct obd_export *exp = ll_i2mdexp(inode);
2259 const struct getinfo_fid2path __user *gfin = arg;
2261 struct getinfo_fid2path *gfout;
2267 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2268 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2271 /* Only need to get the buflen */
2272 if (get_user(pathlen, &gfin->gf_pathlen))
2275 if (pathlen > PATH_MAX)
2278 outsize = sizeof(*gfout) + pathlen;
2279 OBD_ALLOC(gfout, outsize);
2283 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2284 GOTO(gf_free, rc = -EFAULT);
2285 /* Append the root FID after gfout so that the MDT knows the root FID
2286 * and can look up the correct path; this is mainly for fileset support.
2287 * An old server without fileset mount support will ignore this. */
2288 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2290 /* Call mdc_iocontrol */
2291 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2295 if (copy_to_user(arg, gfout, outsize))
2299 OBD_FREE(gfout, outsize);
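/*
 * Example (hypothetical userspace sketch): the handler above is driven
 * by OBD_IOC_FID2PATH on any fd within the mount.  The caller places
 * the path buffer right after the header and sets gf_pathlen to match;
 * the gf_recno/gf_linkno values shown are assumptions for illustration:
 *
 *	struct getinfo_fid2path *gf;
 *	int len = PATH_MAX;
 *
 *	gf = malloc(sizeof(*gf) + len);
 *	gf->gf_fid = fid;		// FID to resolve
 *	gf->gf_recno = -1;		// latest record
 *	gf->gf_linkno = 0;		// first hard link
 *	gf->gf_pathlen = len;
 *	if (ioctl(fd, OBD_IOC_FID2PATH, gf) == 0)
 *		printf("%s\n", gf->gf_u.gf_path);
 *	free(gf);
 */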
2304 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2306 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2314 ioc->idv_version = 0;
2315 ioc->idv_layout_version = UINT_MAX;
2317 /* If no file object has been initialized, we consider its version to be 0. */
2321 env = cl_env_get(&refcheck);
2323 RETURN(PTR_ERR(env));
2325 io = vvp_env_thread_io(env);
2327 io->u.ci_data_version.dv_data_version = 0;
2328 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2329 io->u.ci_data_version.dv_flags = ioc->idv_flags;
2332 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2333 result = cl_io_loop(env, io);
2335 result = io->ci_result;
2337 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2338 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2340 cl_io_fini(env, io);
2342 if (unlikely(io->ci_need_restart))
2345 cl_env_put(env, &refcheck);
2351 * Read the data_version for an inode.
2353 * This value is computed using stripe object versions on the OSTs.
2354 * The version is computed using server-side locking.
2356 * @param flags whether to sync on the OST side;
2358 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2359 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
2361 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2363 struct ioc_data_version ioc = { .idv_flags = flags };
2366 rc = ll_ioc_data_version(inode, &ioc);
2368 *data_version = ioc.idv_version;
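/*
 * Example (sketch): LL_IOC_DATA_VERSION lands in ll_ioc_data_version()
 * above.  A backup tool can sample the version before and after a copy
 * to detect concurrent modification:
 *
 *	struct ioc_data_version idv = { .idv_flags = LL_DV_RD_FLUSH };
 *	__u64 before;
 *
 *	ioctl(fd, LL_IOC_DATA_VERSION, &idv);
 *	before = idv.idv_version;
 *	// ... copy the file ...
 *	ioctl(fd, LL_IOC_DATA_VERSION, &idv);
 *	if (idv.idv_version != before)
 *		;	// the file changed while it was being copied
 */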
2374 * Trigger an HSM release request for the provided inode.
2376 int ll_hsm_release(struct inode *inode)
2379 struct obd_client_handle *och = NULL;
2380 __u64 data_version = 0;
2385 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2386 ll_get_fsname(inode->i_sb, NULL, 0),
2387 PFID(&ll_i2info(inode)->lli_fid));
2389 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2391 GOTO(out, rc = PTR_ERR(och));
2393 /* Grab latest data_version and [am]time values */
2394 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2398 env = cl_env_get(&refcheck);
2400 GOTO(out, rc = PTR_ERR(env));
2402 rc = ll_merge_attr(env, inode);
2403 cl_env_put(env, &refcheck);
2405 /* If an error happened, we have the wrong size for the file.
2411 /* Release the file.
2412 * NB: the lease lock handle is released in mdc_hsm_release_pack() because
2413 * we still need it to pack the l_remote_handle for the MDT. */
2414 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2420 if (och != NULL && !IS_ERR(och)) /* close the file */
2421 ll_lease_close(och, inode, NULL);
2426 struct ll_swap_stack {
2429 struct inode *inode1;
2430 struct inode *inode2;
2435 static int ll_swap_layouts(struct file *file1, struct file *file2,
2436 struct lustre_swap_layouts *lsl)
2438 struct mdc_swap_layouts msl;
2439 struct md_op_data *op_data;
2442 struct ll_swap_stack *llss = NULL;
2445 OBD_ALLOC_PTR(llss);
2449 llss->inode1 = file_inode(file1);
2450 llss->inode2 = file_inode(file2);
2452 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2456 /* we use 2 bools because they are easier to swap than 2 bits */
2457 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2458 llss->check_dv1 = true;
2460 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2461 llss->check_dv2 = true;
2463 /* we cannot use lsl->sl_dvX directly because we may swap them */
2464 llss->dv1 = lsl->sl_dv1;
2465 llss->dv2 = lsl->sl_dv2;
2467 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2468 if (rc == 0) /* same file, done! */
2471 if (rc < 0) { /* sequentialize it */
2472 swap(llss->inode1, llss->inode2);
2474 swap(llss->dv1, llss->dv2);
2475 swap(llss->check_dv1, llss->check_dv2);
2479 if (gid != 0) { /* application asks to flush dirty cache */
2480 rc = ll_get_grouplock(llss->inode1, file1, gid);
2484 rc = ll_get_grouplock(llss->inode2, file2, gid);
2486 ll_put_grouplock(llss->inode1, file1, gid);
2491 /* ultimate check: before swapping the layouts we verify that the
2492 * data versions have not changed (if requested) */
2493 if (llss->check_dv1) {
2494 rc = ll_data_version(llss->inode1, &dv, 0);
2497 if (dv != llss->dv1)
2498 GOTO(putgl, rc = -EAGAIN);
2501 if (llss->check_dv2) {
2502 rc = ll_data_version(llss->inode2, &dv, 0);
2505 if (dv != llss->dv2)
2506 GOTO(putgl, rc = -EAGAIN);
2509 /* struct md_op_data is used to send the swap args to the MDT;
2510 * only the flags are missing, so we pass struct mdc_swap_layouts
2511 * through md_op_data->op_data */
2512 /* flags from user space have to be converted before they are sent to
2513 * the server; no flag is sent today, they are only used on the client */
2516 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2517 0, LUSTRE_OPC_ANY, &msl);
2518 if (IS_ERR(op_data))
2519 GOTO(free, rc = PTR_ERR(op_data));
2521 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2522 sizeof(*op_data), op_data, NULL);
2523 ll_finish_md_op_data(op_data);
2530 ll_put_grouplock(llss->inode2, file2, gid);
2531 ll_put_grouplock(llss->inode1, file1, gid);
2541 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2543 struct obd_export *exp = ll_i2mdexp(inode);
2544 struct md_op_data *op_data;
2548 /* Detect out-of-range masks */
2549 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2552 /* Non-root users are forbidden to set or clear flags which are
2553 * NOT defined in HSM_USER_MASK. */
2554 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2555 !cfs_capable(CFS_CAP_SYS_ADMIN))
2558 if (!exp_connect_archive_id_array(exp)) {
2559 /* Detect an out-of-range archive id */
2560 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2561 (hss->hss_archive_id > LL_HSM_ORIGIN_MAX_ARCHIVE))
2565 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2566 LUSTRE_OPC_ANY, hss);
2567 if (IS_ERR(op_data))
2568 RETURN(PTR_ERR(op_data));
2570 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, exp, sizeof(*op_data),
2573 ll_finish_md_op_data(op_data);
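/*
 * Example (sketch): marking a file dirty from userspace goes through
 * LL_IOC_HSM_STATE_SET and the checks above.  HS_DIRTY is part of
 * HSM_USER_MASK, so no special capability is required:
 *
 *	struct hsm_state_set hss = {
 *		.hss_valid = HSS_SETMASK,
 *		.hss_setmask = HS_DIRTY,
 *	};
 *
 *	rc = ioctl(fd, LL_IOC_HSM_STATE_SET, &hss);
 */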
2578 static int ll_hsm_import(struct inode *inode, struct file *file,
2579 struct hsm_user_import *hui)
2581 struct hsm_state_set *hss = NULL;
2582 struct iattr *attr = NULL;
2586 if (!S_ISREG(inode->i_mode))
2592 GOTO(out, rc = -ENOMEM);
2594 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2595 hss->hss_archive_id = hui->hui_archive_id;
2596 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2597 rc = ll_hsm_state_set(inode, hss);
2601 OBD_ALLOC_PTR(attr);
2603 GOTO(out, rc = -ENOMEM);
2605 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2606 attr->ia_mode |= S_IFREG;
2607 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2608 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2609 attr->ia_size = hui->hui_size;
2610 attr->ia_mtime.tv_sec = hui->hui_mtime;
2611 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2612 attr->ia_atime.tv_sec = hui->hui_atime;
2613 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2615 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2616 ATTR_UID | ATTR_GID |
2617 ATTR_MTIME | ATTR_MTIME_SET |
2618 ATTR_ATIME | ATTR_ATIME_SET;
2622 rc = ll_setattr_raw(file_dentry(file), attr, 0, true);
2626 inode_unlock(inode);
2638 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2640 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2641 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
2644 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2646 struct inode *inode = file_inode(file);
2648 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2649 ATTR_MTIME | ATTR_MTIME_SET |
2652 .tv_sec = lfu->lfu_atime_sec,
2653 .tv_nsec = lfu->lfu_atime_nsec,
2656 .tv_sec = lfu->lfu_mtime_sec,
2657 .tv_nsec = lfu->lfu_mtime_nsec,
2660 .tv_sec = lfu->lfu_ctime_sec,
2661 .tv_nsec = lfu->lfu_ctime_nsec,
2667 if (!capable(CAP_SYS_ADMIN))
2670 if (!S_ISREG(inode->i_mode))
2674 rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET,
2676 inode_unlock(inode);
2681 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2684 case MODE_READ_USER:
2686 case MODE_WRITE_USER:
2693 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2695 /* Used to allow the upper layers of the client to request an LDLM lock
2696 * without doing an actual read or write.
2698 * Used for ladvise lockahead to manually request specific locks.
2700 * \param[in] file file this ladvise lock request is on
2701 * \param[in] ladvise ladvise struct describing this lock request
2703 * \retval 0 success, no detailed result available (sync requests
2704 * and requests sent to the server [not handled locally]
2705 * cannot return detailed results)
2706 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2707 * see definitions for details.
2708 * \retval negative negative errno on error
2710 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2712 struct lu_env *env = NULL;
2713 struct cl_io *io = NULL;
2714 struct cl_lock *lock = NULL;
2715 struct cl_lock_descr *descr = NULL;
2716 struct dentry *dentry = file->f_path.dentry;
2717 struct inode *inode = dentry->d_inode;
2718 enum cl_lock_mode cl_mode;
2719 off_t start = ladvise->lla_start;
2720 off_t end = ladvise->lla_end;
2726 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2727 "start=%llu, end=%llu\n", dentry->d_name.len,
2728 dentry->d_name.name, dentry->d_inode,
2729 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2732 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2734 GOTO(out, result = cl_mode);
2736 /* Get IO environment */
2737 result = cl_io_get(inode, &env, &io, &refcheck);
2741 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2744 * nothing to do for this IO. This currently happens when
2745 * stripe sub-objects are not yet created.
2747 result = io->ci_result;
2748 } else if (result == 0) {
2749 lock = vvp_env_lock(env);
2750 descr = &lock->cll_descr;
2752 descr->cld_obj = io->ci_obj;
2753 /* Convert byte offsets to pages */
2754 descr->cld_start = cl_index(io->ci_obj, start);
2755 descr->cld_end = cl_index(io->ci_obj, end);
2756 descr->cld_mode = cl_mode;
2757 /* CEF_MUST is used because we do not want to convert a
2758 * lockahead request to a lockless lock */
2759 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2762 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2763 descr->cld_enq_flags |= CEF_SPECULATIVE;
2765 result = cl_lock_request(env, io, lock);
2767 /* On success, we need to release the lock */
2769 cl_lock_release(env, lock);
2771 cl_io_fini(env, io);
2772 cl_env_put(env, &refcheck);
2774 /* -ECANCELED indicates a matching lock with a different extent
2775 * was already present, and -EEXIST indicates a matching lock
2776 * on exactly the same extent was already present.
2777 * We convert them to positive values for userspace to make
2778 * recognizing true errors easier.
2779 * Note we can only return these detailed results on async requests,
2780 * as sync requests look the same as i/o requests for locking. */
2781 if (result == -ECANCELED)
2782 result = LLA_RESULT_DIFFERENT;
2783 else if (result == -EEXIST)
2784 result = LLA_RESULT_SAME;
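/*
 * Example (sketch): an asynchronous lockahead caller separates the
 * positive detail codes produced above from genuine errors:
 *
 *	rc = ll_file_lock_ahead(file, ladvise);
 *	if (rc < 0)
 *		;	// enqueue failed
 *	else if (rc == LLA_RESULT_DIFFERENT)
 *		;	// a matching lock exists on a different extent
 *	else if (rc == LLA_RESULT_SAME)
 *		;	// a lock already covers exactly this extent
 *	else
 *		;	// rc == 0: no detailed result available
 */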
2789 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
2791 static int ll_ladvise_sanity(struct inode *inode,
2792 struct llapi_lu_ladvise *ladvise)
2794 enum lu_ladvise_type advice = ladvise->lla_advice;
2795 /* Note lla_peradvice_flags is a 32-bit field, so per-advice flags must
2796 * be in the first 32 bits of enum ladvise_flags */
2797 __u32 flags = ladvise->lla_peradvice_flags;
2798 /* 3 lines at 80 characters per line, should be plenty */
2801 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2803 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized, "
2804 "last supported advice is %s (value '%d'): rc = %d\n",
2805 ll_get_fsname(inode->i_sb, NULL, 0), advice,
2806 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2810 /* Per-advice checks */
2812 case LU_LADVISE_LOCKNOEXPAND:
2813 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2815 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2817 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2818 ladvise_names[advice], rc);
2822 case LU_LADVISE_LOCKAHEAD:
2823 /* Currently only READ and WRITE modes can be requested */
2824 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2825 ladvise->lla_lockahead_mode == 0) {
2827 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2829 ll_get_fsname(inode->i_sb, NULL, 0),
2830 ladvise->lla_lockahead_mode,
2831 ladvise_names[advice], rc);
2834 case LU_LADVISE_WILLREAD:
2835 case LU_LADVISE_DONTNEED:
2837 /* Note fall through above - These checks apply to all advices
2838 * except LOCKNOEXPAND */
2839 if (flags & ~LF_DEFAULT_MASK) {
2841 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2843 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2844 ladvise_names[advice], rc);
2847 if (ladvise->lla_start >= ladvise->lla_end) {
2849 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2850 "for %s: rc = %d\n",
2851 ll_get_fsname(inode->i_sb, NULL, 0),
2852 ladvise->lla_start, ladvise->lla_end,
2853 ladvise_names[advice], rc);
2865 * Give file access advice
2867 * The ladvise interface is similar to the Linux fadvise() system call, except
2868 * it forwards the advice directly from the Lustre client to the server. The
2869 * server-side code will apply appropriate read-ahead and caching techniques
2870 * for the corresponding files.
2872 * A typical workload for ladvise is e.g. a bunch of different clients all
2873 * doing small random reads of a file, so prefetching pages into the OSS cache
2874 * with big linear reads before the random IO is a net benefit. Fetching
2875 * all that data into each client cache with fadvise() may not be, due to
2876 * much more data being sent to the client.
2878 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2879 struct llapi_lu_ladvise *ladvise)
2883 struct cl_ladvise_io *lio;
2888 env = cl_env_get(&refcheck);
2890 RETURN(PTR_ERR(env));
2892 io = vvp_env_thread_io(env);
2893 io->ci_obj = ll_i2info(inode)->lli_clob;
2895 /* initialize parameters for ladvise */
2896 lio = &io->u.ci_ladvise;
2897 lio->li_start = ladvise->lla_start;
2898 lio->li_end = ladvise->lla_end;
2899 lio->li_fid = ll_inode2fid(inode);
2900 lio->li_advice = ladvise->lla_advice;
2901 lio->li_flags = flags;
2903 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2904 rc = cl_io_loop(env, io);
2908 cl_io_fini(env, io);
2909 cl_env_put(env, &refcheck);
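/*
 * Example (hypothetical userspace sketch): a prefetch request arrives
 * here via LL_IOC_LADVISE.  A single WILLREAD advice covering the
 * first megabyte of the file might be built as:
 *
 *	struct llapi_ladvise_hdr *hdr;
 *
 *	hdr = calloc(1, offsetof(typeof(*hdr), lah_advise[1]));
 *	hdr->lah_magic = LADVISE_MAGIC;
 *	hdr->lah_count = 1;
 *	hdr->lah_advise[0].lla_advice = LU_LADVISE_WILLREAD;
 *	hdr->lah_advise[0].lla_start = 0;
 *	hdr->lah_advise[0].lla_end = 1024 * 1024;
 *	rc = ioctl(fd, LL_IOC_LADVISE, hdr);
 *	free(hdr);
 */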
2913 static int ll_lock_noexpand(struct file *file, int flags)
2915 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2917 fd->ll_lock_no_expand = !(flags & LF_UNSET);
2922 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
2925 struct fsxattr fsxattr;
2927 if (copy_from_user(&fsxattr,
2928 (const struct fsxattr __user *)arg,
2932 fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
2933 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
2934 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
2935 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
2936 if (copy_to_user((struct fsxattr __user *)arg,
2937 &fsxattr, sizeof(fsxattr)))
2943 int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
2946 * Project Quota ID state is only allowed to change from within the init
2947 * namespace. Enforce that restriction only if we are trying to change
2948 * the quota ID state. Everything else is allowed in user namespaces.
2950 if (current_user_ns() == &init_user_ns)
2953 if (ll_i2info(inode)->lli_projid != fa->fsx_projid)
2956 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) {
2957 if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
2960 if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
2967 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
2971 struct md_op_data *op_data;
2972 struct ptlrpc_request *req = NULL;
2974 struct fsxattr fsxattr;
2975 struct cl_object *obj;
2979 if (copy_from_user(&fsxattr,
2980 (const struct fsxattr __user *)arg,
2984 rc = ll_ioctl_check_project(inode, &fsxattr);
2988 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2989 LUSTRE_OPC_ANY, NULL);
2990 if (IS_ERR(op_data))
2991 RETURN(PTR_ERR(op_data));
2993 flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
2994 op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
2995 if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
2996 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
2997 op_data->op_projid = fsxattr.fsx_projid;
2998 op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS;
2999 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3001 ptlrpc_req_finished(req);
3003 GOTO(out_fsxattr, rc);
3004 ll_update_inode_flags(inode, op_data->op_attr_flags);
3005 obj = ll_i2info(inode)->lli_clob;
3007 GOTO(out_fsxattr, rc);
3009 OBD_ALLOC_PTR(attr);
3011 GOTO(out_fsxattr, rc = -ENOMEM);
3013 rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS,
3014 fsxattr.fsx_xflags);
3017 ll_finish_md_op_data(op_data);
3021 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3024 struct inode *inode = file_inode(file);
3025 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3026 struct ll_inode_info *lli = ll_i2info(inode);
3027 struct obd_client_handle *och = NULL;
3028 struct split_param sp;
3031 enum mds_op_bias bias = 0;
3032 struct file *layout_file = NULL;
3034 size_t data_size = 0;
3038 mutex_lock(&lli->lli_och_mutex);
3039 if (fd->fd_lease_och != NULL) {
3040 och = fd->fd_lease_och;
3041 fd->fd_lease_och = NULL;
3043 mutex_unlock(&lli->lli_och_mutex);
3046 GOTO(out, rc = -ENOLCK);
3048 fmode = och->och_flags;
3050 switch (ioc->lil_flags) {
3051 case LL_LEASE_RESYNC_DONE:
3052 if (ioc->lil_count > IOC_IDS_MAX)
3053 GOTO(out, rc = -EINVAL);
3055 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3056 OBD_ALLOC(data, data_size);
3058 GOTO(out, rc = -ENOMEM);
3060 if (copy_from_user(data, (void __user *)arg, data_size))
3061 GOTO(out, rc = -EFAULT);
3063 bias = MDS_CLOSE_RESYNC_DONE;
3065 case LL_LEASE_LAYOUT_MERGE: {
3068 if (ioc->lil_count != 1)
3069 GOTO(out, rc = -EINVAL);
3071 arg += sizeof(*ioc);
3072 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3073 GOTO(out, rc = -EFAULT);
3075 layout_file = fget(fd);
3077 GOTO(out, rc = -EBADF);
3079 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3080 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3081 GOTO(out, rc = -EPERM);
3083 data = file_inode(layout_file);
3084 bias = MDS_CLOSE_LAYOUT_MERGE;
3087 case LL_LEASE_LAYOUT_SPLIT: {
3091 if (ioc->lil_count != 2)
3092 GOTO(out, rc = -EINVAL);
3094 arg += sizeof(*ioc);
3095 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3096 GOTO(out, rc = -EFAULT);
3098 arg += sizeof(__u32);
3099 if (copy_from_user(&mirror_id, (void __user *)arg,
3101 GOTO(out, rc = -EFAULT);
3103 layout_file = fget(fdv);
3105 GOTO(out, rc = -EBADF);
3107 sp.sp_inode = file_inode(layout_file);
3108 sp.sp_mirror_id = (__u16)mirror_id;
3110 bias = MDS_CLOSE_LAYOUT_SPLIT;
3114 /* without close intent */
3118 rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3122 rc = ll_lease_och_release(inode, file);
3131 switch (ioc->lil_flags) {
3132 case LL_LEASE_RESYNC_DONE:
3134 OBD_FREE(data, data_size);
3136 case LL_LEASE_LAYOUT_MERGE:
3137 case LL_LEASE_LAYOUT_SPLIT:
3144 rc = ll_lease_type_from_fmode(fmode);
3148 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3151 struct inode *inode = file_inode(file);
3152 struct ll_inode_info *lli = ll_i2info(inode);
3153 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3154 struct obd_client_handle *och = NULL;
3155 __u64 open_flags = 0;
3161 switch (ioc->lil_mode) {
3162 case LL_LEASE_WRLCK:
3163 if (!(file->f_mode & FMODE_WRITE))
3165 fmode = FMODE_WRITE;
3167 case LL_LEASE_RDLCK:
3168 if (!(file->f_mode & FMODE_READ))
3172 case LL_LEASE_UNLCK:
3173 RETURN(ll_file_unlock_lease(file, ioc, arg));
3178 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3180 /* apply for lease */
3181 if (ioc->lil_flags & LL_LEASE_RESYNC)
3182 open_flags = MDS_OPEN_RESYNC;
3183 och = ll_lease_open(inode, file, fmode, open_flags);
3185 RETURN(PTR_ERR(och));
3187 if (ioc->lil_flags & LL_LEASE_RESYNC) {
3188 rc = ll_lease_file_resync(och, inode, arg);
3190 ll_lease_close(och, inode, NULL);
3193 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3195 ll_lease_close(och, inode, NULL);
3201 mutex_lock(&lli->lli_och_mutex);
3202 if (fd->fd_lease_och == NULL) {
3203 fd->fd_lease_och = och;
3206 mutex_unlock(&lli->lli_och_mutex);
3208 /* impossible now, since only exclusive leases are supported */
3209 ll_lease_close(och, inode, &lease_broken);
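/*
 * Example (sketch): the lease ioctls implemented above are used in
 * pairs from userspace: take a lease, do the work, then verify or
 * release it:
 *
 *	struct ll_ioc_lease ioc = { .lil_mode = LL_LEASE_RDLCK };
 *
 *	ioctl(fd, LL_IOC_SET_LEASE, &ioc);	// acquire a read lease
 *	// ... read-only processing ...
 *	if (ioctl(fd, LL_IOC_GET_LEASE) != LL_LEASE_RDLCK)
 *		;	// the lease was broken by a conflicting access
 *	ioc.lil_mode = LL_LEASE_UNLCK;
 *	ioctl(fd, LL_IOC_SET_LEASE, &ioc);	// release the lease
 */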
3216 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3218 struct inode *inode = file_inode(file);
3219 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3223 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3224 PFID(ll_inode2fid(inode)), inode, cmd);
3225 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3227 /* asm-ppc{,64} declares TCGETS, et al. as type 't' not 'T' */
3228 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3232 case LL_IOC_GETFLAGS:
3233 /* Get the current value of the file flags */
3234 return put_user(fd->fd_flags, (int __user *)arg);
3235 case LL_IOC_SETFLAGS:
3236 case LL_IOC_CLRFLAGS:
3237 /* Set or clear specific file flags */
3238 /* XXX This probably needs checks to ensure the flags are
3239 * not abused, and to handle any flag side effects.
3241 if (get_user(flags, (int __user *) arg))
3244 if (cmd == LL_IOC_SETFLAGS) {
3245 if ((flags & LL_FILE_IGNORE_LOCK) &&
3246 !(file->f_flags & O_DIRECT)) {
3247 CERROR("%s: unable to disable locking on "
3248 "non-O_DIRECT file\n", current->comm);
3252 fd->fd_flags |= flags;
3254 fd->fd_flags &= ~flags;
3257 case LL_IOC_LOV_SETSTRIPE:
3258 case LL_IOC_LOV_SETSTRIPE_NEW:
3259 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3260 case LL_IOC_LOV_SETEA:
3261 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3262 case LL_IOC_LOV_SWAP_LAYOUTS: {
3264 struct lustre_swap_layouts lsl;
3266 if (copy_from_user(&lsl, (char __user *)arg,
3267 sizeof(struct lustre_swap_layouts)))
3270 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3273 file2 = fget(lsl.sl_fd);
3277 /* O_WRONLY or O_RDWR */
3278 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3279 GOTO(out, rc = -EPERM);
3281 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3282 struct inode *inode2;
3283 struct ll_inode_info *lli;
3284 struct obd_client_handle *och = NULL;
3286 lli = ll_i2info(inode);
3287 mutex_lock(&lli->lli_och_mutex);
3288 if (fd->fd_lease_och != NULL) {
3289 och = fd->fd_lease_och;
3290 fd->fd_lease_och = NULL;
3292 mutex_unlock(&lli->lli_och_mutex);
3294 GOTO(out, rc = -ENOLCK);
3295 inode2 = file_inode(file2);
3296 rc = ll_swap_layouts_close(och, inode, inode2);
3298 rc = ll_swap_layouts(file, file2, &lsl);
3304 case LL_IOC_LOV_GETSTRIPE:
3305 case LL_IOC_LOV_GETSTRIPE_NEW:
3306 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3307 case FS_IOC_GETFLAGS:
3308 case FS_IOC_SETFLAGS:
3309 RETURN(ll_iocontrol(inode, file, cmd, arg));
3310 case FSFILT_IOC_GETVERSION:
3311 case FS_IOC_GETVERSION:
3312 RETURN(put_user(inode->i_generation, (int __user *)arg));
3313 /* We need to special case any other ioctls we want to handle,
3314 * to send them to the MDS/OST as appropriate and to properly
3315 * network encode the arg field. */
3316 case FS_IOC_SETVERSION:
3319 case LL_IOC_GROUP_LOCK:
3320 RETURN(ll_get_grouplock(inode, file, arg));
3321 case LL_IOC_GROUP_UNLOCK:
3322 RETURN(ll_put_grouplock(inode, file, arg));
3323 case IOC_OBD_STATFS:
3324 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3326 case LL_IOC_FLUSHCTX:
3327 RETURN(ll_flush_ctx(inode));
3328 case LL_IOC_PATH2FID: {
3329 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3330 sizeof(struct lu_fid)))
3335 case LL_IOC_GETPARENT:
3336 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3338 case OBD_IOC_FID2PATH:
3339 RETURN(ll_fid2path(inode, (void __user *)arg));
3340 case LL_IOC_DATA_VERSION: {
3341 struct ioc_data_version idv;
3344 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
3347 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3348 rc = ll_ioc_data_version(inode, &idv);
3351 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3357 case LL_IOC_GET_MDTIDX: {
3360 mdtidx = ll_get_mdt_idx(inode);
3364 if (put_user((int)mdtidx, (int __user *)arg))
3369 case OBD_IOC_GETDTNAME:
3370 case OBD_IOC_GETMDNAME:
3371 RETURN(ll_get_obd_name(inode, cmd, arg));
3372 case LL_IOC_HSM_STATE_GET: {
3373 struct md_op_data *op_data;
3374 struct hsm_user_state *hus;
3381 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3382 LUSTRE_OPC_ANY, hus);
3383 if (IS_ERR(op_data)) {
3385 RETURN(PTR_ERR(op_data));
3388 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3391 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3394 ll_finish_md_op_data(op_data);
3398 case LL_IOC_HSM_STATE_SET: {
3399 struct hsm_state_set *hss;
3406 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3411 rc = ll_hsm_state_set(inode, hss);
3416 case LL_IOC_HSM_ACTION: {
3417 struct md_op_data *op_data;
3418 struct hsm_current_action *hca;
3425 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3426 LUSTRE_OPC_ANY, hca);
3427 if (IS_ERR(op_data)) {
3429 RETURN(PTR_ERR(op_data));
3432 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3435 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3438 ll_finish_md_op_data(op_data);
3442 case LL_IOC_SET_LEASE_OLD: {
3443 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3445 RETURN(ll_file_set_lease(file, &ioc, 0));
3447 case LL_IOC_SET_LEASE: {
3448 struct ll_ioc_lease ioc;
3450 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3453 RETURN(ll_file_set_lease(file, &ioc, arg));
3455 case LL_IOC_GET_LEASE: {
3456 struct ll_inode_info *lli = ll_i2info(inode);
3457 struct ldlm_lock *lock = NULL;
3460 mutex_lock(&lli->lli_och_mutex);
3461 if (fd->fd_lease_och != NULL) {
3462 struct obd_client_handle *och = fd->fd_lease_och;
3464 lock = ldlm_handle2lock(&och->och_lease_handle);
3466 lock_res_and_lock(lock);
3467 if (!ldlm_is_cancel(lock))
3468 fmode = och->och_flags;
3470 unlock_res_and_lock(lock);
3471 LDLM_LOCK_PUT(lock);
3474 mutex_unlock(&lli->lli_och_mutex);
3476 RETURN(ll_lease_type_from_fmode(fmode));
3478 case LL_IOC_HSM_IMPORT: {
3479 struct hsm_user_import *hui;
3485 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3490 rc = ll_hsm_import(inode, file, hui);
3495 case LL_IOC_FUTIMES_3: {
3496 struct ll_futimes_3 lfu;
3498 if (copy_from_user(&lfu,
3499 (const struct ll_futimes_3 __user *)arg,
3503 RETURN(ll_file_futimes_3(file, &lfu));
3505 case LL_IOC_LADVISE: {
3506 struct llapi_ladvise_hdr *k_ladvise_hdr;
3507 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3510 int alloc_size = sizeof(*k_ladvise_hdr);
3513 u_ladvise_hdr = (void __user *)arg;
3514 OBD_ALLOC_PTR(k_ladvise_hdr);
3515 if (k_ladvise_hdr == NULL)
3518 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3519 GOTO(out_ladvise, rc = -EFAULT);
3521 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3522 k_ladvise_hdr->lah_count < 1)
3523 GOTO(out_ladvise, rc = -EINVAL);
3525 num_advise = k_ladvise_hdr->lah_count;
3526 if (num_advise >= LAH_COUNT_MAX)
3527 GOTO(out_ladvise, rc = -EFBIG);
3529 OBD_FREE_PTR(k_ladvise_hdr);
3530 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3531 lah_advise[num_advise]);
3532 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3533 if (k_ladvise_hdr == NULL)
3537 * TODO: submit multiple advices to one server in a single RPC
3539 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3540 GOTO(out_ladvise, rc = -EFAULT);
3542 for (i = 0; i < num_advise; i++) {
3543 struct llapi_lu_ladvise *k_ladvise =
3544 &k_ladvise_hdr->lah_advise[i];
3545 struct llapi_lu_ladvise __user *u_ladvise =
3546 &u_ladvise_hdr->lah_advise[i];
3548 rc = ll_ladvise_sanity(inode, k_ladvise);
3550 GOTO(out_ladvise, rc);
3552 switch (k_ladvise->lla_advice) {
3553 case LU_LADVISE_LOCKNOEXPAND:
3554 rc = ll_lock_noexpand(file,
3555 k_ladvise->lla_peradvice_flags);
3556 GOTO(out_ladvise, rc);
3557 case LU_LADVISE_LOCKAHEAD:
3559 rc = ll_file_lock_ahead(file, k_ladvise);
3562 GOTO(out_ladvise, rc);
3565 &u_ladvise->lla_lockahead_result))
3566 GOTO(out_ladvise, rc = -EFAULT);
3569 rc = ll_ladvise(inode, file,
3570 k_ladvise_hdr->lah_flags,
3573 GOTO(out_ladvise, rc);
3580 OBD_FREE(k_ladvise_hdr, alloc_size);
3583 case LL_IOC_FLR_SET_MIRROR: {
3584 /* mirror I/O must be direct to avoid polluting the page cache */
3586 if (!(file->f_flags & O_DIRECT))
3589 fd->fd_designated_mirror = (__u32)arg;
3592 case LL_IOC_FSGETXATTR:
3593 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3594 case LL_IOC_FSSETXATTR:
3595 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3597 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3599 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3600 (void __user *)arg));
3604 #ifndef HAVE_FILE_LLSEEK_SIZE
3605 static inline loff_t
3606 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3608 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3610 if (offset > maxsize)
3613 if (offset != file->f_pos) {
3614 file->f_pos = offset;
3615 file->f_version = 0;
3621 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3622 loff_t maxsize, loff_t eof)
3624 struct inode *inode = file_inode(file);
3632 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3633 * position-querying operation. Avoid rewriting the "same"
3634 * f_pos value back to the file because a concurrent read(),
3635 * write() or lseek() might have altered it
3640 * f_lock protects against read/modify/write race with other
3641 * SEEK_CURs. Note that parallel writes and reads behave
3645 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3646 inode_unlock(inode);
3650 * In the generic case the entire file is data, so as long as
3651 * offset isn't at the end of the file then the offset is data.
3658 * There is a virtual hole at the end of the file, so as long as
3659 * offset isn't i_size or larger, return i_size.
3667 return llseek_execute(file, offset, maxsize);
3671 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3673 struct inode *inode = file_inode(file);
3674 loff_t retval, eof = 0;
3677 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3678 (origin == SEEK_CUR) ? file->f_pos : 0);
3679 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3680 PFID(ll_inode2fid(inode)), inode, retval, retval,
3682 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3684 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3685 retval = ll_glimpse_size(inode);
3688 eof = i_size_read(inode);
3691 retval = ll_generic_file_llseek_size(file, offset, origin,
3692 ll_file_maxbytes(inode), eof);
3696 static int ll_flush(struct file *file, fl_owner_t id)
3698 struct inode *inode = file_inode(file);
3699 struct ll_inode_info *lli = ll_i2info(inode);
3700 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3703 LASSERT(!S_ISDIR(inode->i_mode));
3705 /* catch async errors that were recorded back when async writeback
3706 * failed for pages in this mapping. */
3707 rc = lli->lli_async_rc;
3708 lli->lli_async_rc = 0;
3709 if (lli->lli_clob != NULL) {
3710 err = lov_read_and_clear_async_rc(lli->lli_clob);
3715 /* The application has already been told about the write failure.
3716 * Do not report it again. */
3717 if (fd->fd_write_failed)
3719 return rc ? -EIO : 0;
3723 * Called to make sure a portion of the file has been written out.
3724 * If @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to the OSTs.
3726 * Return how many pages have been written.
3728 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3729 enum cl_fsync_mode mode, int ignore_layout)
3733 struct cl_fsync_io *fio;
3738 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3739 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3742 env = cl_env_get(&refcheck);
3744 RETURN(PTR_ERR(env));
3746 io = vvp_env_thread_io(env);
3747 io->ci_obj = ll_i2info(inode)->lli_clob;
3748 io->ci_ignore_layout = ignore_layout;
3750 /* initialize parameters for sync */
3751 fio = &io->u.ci_fsync;
3752 fio->fi_start = start;
3754 fio->fi_fid = ll_inode2fid(inode);
3755 fio->fi_mode = mode;
3756 fio->fi_nr_written = 0;
3758 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3759 result = cl_io_loop(env, io);
3761 result = io->ci_result;
3763 result = fio->fi_nr_written;
3764 cl_io_fini(env, io);
3765 cl_env_put(env, &refcheck);
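/*
 * Example (sketch, assuming the usual OBD_OBJECT_EOF end-of-object
 * constant): other llite paths use this helper to flush a whole file
 * locally, without sending OST_SYNC RPCs, e.g.:
 *
 *	rc = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
 *				CL_FSYNC_LOCAL, 1);
 *	if (rc > 0)	// number of pages written
 *		rc = 0;
 */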
3771 * When dentry is provided (the 'else' case), file_dentry() may be
3772 * null and dentry must be used directly rather than pulled from
3773 * file_dentry() as is done otherwise.
3776 #ifdef HAVE_FILE_FSYNC_4ARGS
3777 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3779 struct dentry *dentry = file_dentry(file);
3780 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3781 int ll_fsync(struct file *file, int datasync)
3783 struct dentry *dentry = file_dentry(file);
3785 loff_t end = LLONG_MAX;
3787 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3790 loff_t end = LLONG_MAX;
3792 struct inode *inode = dentry->d_inode;
3793 struct ll_inode_info *lli = ll_i2info(inode);
3794 struct ptlrpc_request *req;
3798 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3799 PFID(ll_inode2fid(inode)), inode);
3800 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3802 #ifdef HAVE_FILE_FSYNC_4ARGS
3803 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3806 /* fsync's caller has already called _fdata{sync,write}, we want
3807 * that IO to finish before calling the osc and mdc sync methods */
3808 rc = filemap_fdatawait(inode->i_mapping);
3811 /* catch async errors that were recorded back when async writeback
3812 * failed for pages in this mapping. */
3813 if (!S_ISDIR(inode->i_mode)) {
3814 err = lli->lli_async_rc;
3815 lli->lli_async_rc = 0;
3818 if (lli->lli_clob != NULL) {
3819 err = lov_read_and_clear_async_rc(lli->lli_clob);
3825 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3829 ptlrpc_req_finished(req);
3831 if (S_ISREG(inode->i_mode)) {
3832 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3834 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3835 if (rc == 0 && err < 0)
3838 fd->fd_write_failed = true;
3840 fd->fd_write_failed = false;
3843 #ifdef HAVE_FILE_FSYNC_4ARGS
3844 inode_unlock(inode);
3850 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3852 struct inode *inode = file_inode(file);
3853 struct ll_sb_info *sbi = ll_i2sbi(inode);
3854 struct ldlm_enqueue_info einfo = {
3855 .ei_type = LDLM_FLOCK,
3856 .ei_cb_cp = ldlm_flock_completion_ast,
3857 .ei_cbdata = file_lock,
3859 struct md_op_data *op_data;
3860 struct lustre_handle lockh = { 0 };
3861 union ldlm_policy_data flock = { { 0 } };
3862 int fl_type = file_lock->fl_type;
3868 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3869 PFID(ll_inode2fid(inode)), file_lock);
3871 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3873 if (file_lock->fl_flags & FL_FLOCK) {
3874 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3875 /* flocks are whole-file locks */
3876 flock.l_flock.end = OFFSET_MAX;
3877 /* For flocks the owner is determined by the local file descriptor */
3878 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3879 } else if (file_lock->fl_flags & FL_POSIX) {
3880 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3881 flock.l_flock.start = file_lock->fl_start;
3882 flock.l_flock.end = file_lock->fl_end;
3886 flock.l_flock.pid = file_lock->fl_pid;
3888 /* Somewhat ugly workaround for svc lockd.
3889 * lockd installs a custom fl_lmops->lm_compare_owner that checks
3890 * that the fl_owner is the same (which it always is on the local node,
3891 * I guess, between lockd processes) and then compares pids.
3892 * As such we assign the pid to the owner field to make it all work;
3893 * conflict with normal locks is unlikely since the pid space and the
3894 * pointer space for current->files do not intersect */
3895 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3896 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
3900 einfo.ei_mode = LCK_PR;
3903 /* An unlock request may or may not have any relation to
3904 * existing locks so we may not be able to pass a lock handle
3905 * via a normal ldlm_lock_cancel() request. The request may even
3906 * unlock a byte range in the middle of an existing lock. In
3907 * order to process an unlock request we need all of the same
3908 * information that is given with a normal read or write record
3909 * lock request. To avoid creating another ldlm unlock (cancel)
3910 * message we'll treat a LCK_NL flock request as an unlock. */
3911 einfo.ei_mode = LCK_NL;
3914 einfo.ei_mode = LCK_PW;
3917 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
3932 flags = LDLM_FL_BLOCK_NOWAIT;
3938 flags = LDLM_FL_TEST_LOCK;
3941 CERROR("unknown fcntl lock command: %d\n", cmd);
3945 /* Save the old mode so that if the mode in the lock changes we
3946 * can decrement the appropriate reader or writer refcount. */
3947 file_lock->fl_type = einfo.ei_mode;
3949 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3950 LUSTRE_OPC_ANY, NULL);
3951 if (IS_ERR(op_data))
3952 RETURN(PTR_ERR(op_data));
3954 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
3955 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
3956 flock.l_flock.pid, flags, einfo.ei_mode,
3957 flock.l_flock.start, flock.l_flock.end);
3959 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
3962 /* Restore the file lock type if not TEST lock. */
3963 if (!(flags & LDLM_FL_TEST_LOCK))
3964 file_lock->fl_type = fl_type;
3966 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
3967 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
3968 !(flags & LDLM_FL_TEST_LOCK))
3969 rc2 = locks_lock_file_wait(file, file_lock);
3971 if ((file_lock->fl_flags & FL_FLOCK) &&
3972 (rc == 0 || file_lock->fl_type == F_UNLCK))
3973 rc2 = flock_lock_file_wait(file, file_lock);
3974 if ((file_lock->fl_flags & FL_POSIX) &&
3975 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3976 !(flags & LDLM_FL_TEST_LOCK))
3977 rc2 = posix_lock_file_wait(file, file_lock);
3978 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
3980 if (rc2 && file_lock->fl_type != F_UNLCK) {
3981 einfo.ei_mode = LCK_NL;
3982 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
3987 ll_finish_md_op_data(op_data);
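/*
 * Example (sketch): a whole-file POSIX write lock taken with fcntl()
 * reaches this function as cmd = F_SETLKW and fl_type = F_WRLCK, and
 * is enqueued on the MDS as an LDLM_FLOCK lock in LCK_PW mode:
 *
 *	struct flock fl = {
 *		.l_type = F_WRLCK,
 *		.l_whence = SEEK_SET,
 *		.l_start = 0,
 *		.l_len = 0,	// 0 means "to end of file"
 *	};
 *
 *	rc = fcntl(fd, F_SETLKW, &fl);
 */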
3992 int ll_get_fid_by_name(struct inode *parent, const char *name,
3993 int namelen, struct lu_fid *fid,
3994 struct inode **inode)
3996 struct md_op_data *op_data = NULL;
3997 struct mdt_body *body;
3998 struct ptlrpc_request *req;
4002 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4003 LUSTRE_OPC_ANY, NULL);
4004 if (IS_ERR(op_data))
4005 RETURN(PTR_ERR(op_data));
4007 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4008 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4009 ll_finish_md_op_data(op_data);
4013 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4015 GOTO(out_req, rc = -EFAULT);
4017 *fid = body->mbo_fid1;
4020 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4022 ptlrpc_req_finished(req);
4026 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4029 struct dentry *dchild = NULL;
4030 struct inode *child_inode = NULL;
4031 struct md_op_data *op_data;
4032 struct ptlrpc_request *request = NULL;
4033 struct obd_client_handle *och = NULL;
4035 struct mdt_body *body;
4036 __u64 data_version = 0;
4037 size_t namelen = strlen(name);
4038 int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4042 CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4043 PFID(ll_inode2fid(parent)), name,
4044 lum->lum_stripe_offset, lum->lum_stripe_count);
4046 if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4047 lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4048 lustre_swab_lmv_user_md(lum);
4050 /* Get child FID first */
4051 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4054 dchild = d_lookup(file_dentry(file), &qstr);
4056 if (dchild->d_inode)
4057 child_inode = igrab(dchild->d_inode);
4062 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
4071 if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
4072 OBD_CONNECT2_DIR_MIGRATE)) {
4073 if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
4074 ll_i2info(child_inode)->lli_lsm_md) {
4075 CERROR("%s: MDT doesn't support stripe directory "
4077 ll_get_fsname(parent->i_sb, NULL, 0));
4078 GOTO(out_iput, rc = -EOPNOTSUPP);
4083 * lfs migrate command needs to be blocked on the client
4084 * by checking the migrate FID against the FID of the
4087 if (child_inode == parent->i_sb->s_root->d_inode)
4088 GOTO(out_iput, rc = -EINVAL);
4090 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4091 child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4092 if (IS_ERR(op_data))
4093 GOTO(out_iput, rc = PTR_ERR(op_data));
4095 inode_lock(child_inode);
4096 op_data->op_fid3 = *ll_inode2fid(child_inode);
4097 if (!fid_is_sane(&op_data->op_fid3)) {
4098 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4099 ll_get_fsname(parent->i_sb, NULL, 0), name,
4100 PFID(&op_data->op_fid3));
4101 GOTO(out_unlock, rc = -EINVAL);
4104 op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4105 op_data->op_data = lum;
4106 op_data->op_data_size = lumlen;
4109 if (S_ISREG(child_inode->i_mode)) {
4110 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4114 GOTO(out_unlock, rc);
4117 rc = ll_data_version(child_inode, &data_version,
4120 GOTO(out_close, rc);
4122 op_data->op_open_handle = och->och_open_handle;
4123 op_data->op_data_version = data_version;
4124 op_data->op_lease_handle = och->och_lease_handle;
4125 op_data->op_bias |= MDS_CLOSE_MIGRATE;
4127 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4128 och->och_mod->mod_open_req->rq_replay = 0;
4129 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
4132 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4133 name, namelen, &request);
4135 LASSERT(request != NULL);
4136 ll_update_times(request, parent);
4138 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4139 LASSERT(body != NULL);
4141 /* If the server does release the layout lock, then we clean up
4142 * the client och here; otherwise release it in out_close: */
4143 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4144 obd_mod_put(och->och_mod);
4145 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4147 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
4153 if (request != NULL) {
4154 ptlrpc_req_finished(request);
4158 /* Try again if the file layout has changed. */
4159 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4164 ll_lease_close(och, child_inode, NULL);
4166 clear_nlink(child_inode);
4168 inode_unlock(child_inode);
4169 ll_finish_md_op_data(op_data);
4176 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4184 * test if some locks matching bits and l_req_mode are acquired
4185 * - bits can be in different locks
4186 * - if found, clear the common lock bits in *bits
4187 * - the bits not found are kept in *bits
4189 * \param bits [IN] searched lock bits
4190 * \param l_req_mode [IN] searched lock mode
4191 * \retval boolean, true iff all bits are found
4193 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4195 struct lustre_handle lockh;
4196 union ldlm_policy_data policy;
4197 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4198 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4207 fid = &ll_i2info(inode)->lli_fid;
4208 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4209 ldlm_lockname[mode]);
4211 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4212 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4213 policy.l_inodebits.bits = *bits & (1 << i);
4214 if (policy.l_inodebits.bits == 0)
4217 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4218 &policy, mode, &lockh)) {
4219 struct ldlm_lock *lock;
4221 lock = ldlm_handle2lock(&lockh);
4224 ~(lock->l_policy_data.l_inodebits.bits);
4225 LDLM_LOCK_PUT(lock);
4227 *bits &= ~policy.l_inodebits.bits;
4234 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4235 struct lustre_handle *lockh, __u64 flags,
4236 enum ldlm_mode mode)
4238 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4243 fid = &ll_i2info(inode)->lli_fid;
4244 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4246 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4247 fid, LDLM_IBITS, &policy, mode, lockh);
4252 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4254 /* Already unlinked. Just update nlink and return success */
4255 if (rc == -ENOENT) {
4257 /* If it is a striped directory and there is a bad stripe,
4258 * let's revalidate the dentry again instead of returning
4260 if (S_ISDIR(inode->i_mode) &&
4261 ll_i2info(inode)->lli_lsm_md != NULL)
4264 /* This path cannot be hit for regular files except in
4265 * case of obscure races, so no need to validate
4267 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4269 } else if (rc != 0) {
4270 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4271 "%s: revalidate FID "DFID" error: rc = %d\n",
4272 ll_get_fsname(inode->i_sb, NULL, 0),
4273 PFID(ll_inode2fid(inode)), rc);
4279 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4281 struct inode *inode = dentry->d_inode;
4282 struct obd_export *exp = ll_i2mdexp(inode);
4283 struct lookup_intent oit = {
4286 struct ptlrpc_request *req = NULL;
4287 struct md_op_data *op_data;
4291 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4292 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4294 /* Call getattr by fid, so do not provide name at all. */
4295 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4296 LUSTRE_OPC_ANY, NULL);
4297 if (IS_ERR(op_data))
4298 RETURN(PTR_ERR(op_data));
4300 rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4301 ll_finish_md_op_data(op_data);
4303 rc = ll_inode_revalidate_fini(inode, rc);
4307 rc = ll_revalidate_it_finish(req, &oit, dentry);
4309 ll_intent_release(&oit);
4313 /* Unlinked? Unhash dentry, so it is not picked up later by
4314 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4315 * here to preserve get_cwd functionality on 2.6.
4317 if (!dentry->d_inode->i_nlink) {
4318 ll_lock_dcache(inode);
4319 d_lustre_invalidate(dentry, 0);
4320 ll_unlock_dcache(inode);
4323 ll_lookup_finish_locks(&oit, dentry);
4325 ptlrpc_req_finished(req);
4330 static int ll_merge_md_attr(struct inode *inode)
4332 struct ll_inode_info *lli = ll_i2info(inode);
4333 struct cl_attr attr = { 0 };
4336 LASSERT(lli->lli_lsm_md != NULL);
4337 down_read(&lli->lli_lsm_sem);
4338 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4339 &attr, ll_md_blocking_ast);
4340 up_read(&lli->lli_lsm_sem);
4344 set_nlink(inode, attr.cat_nlink);
4345 inode->i_blocks = attr.cat_blocks;
4346 i_size_write(inode, attr.cat_size);
4348 ll_i2info(inode)->lli_atime = attr.cat_atime;
4349 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4350 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
4355 static inline dev_t ll_compat_encode_dev(dev_t dev)
4357 /* The compat_sys_*stat*() syscalls will fail unless the
4358 * device majors and minors are both less than 256. Note that
4359 * the value returned here will be passed through
4360 * old_encode_dev() in cp_compat_stat(). And so we are not
4361 * trying to return a valid compat (u16) device number, just
4362 * one that will pass the old_valid_dev() check. */
4364 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
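/* For example (illustrative): a device MKDEV(0x130, 0x05) comes back as
 * MKDEV(0x30, 0x05); the truncated major still passes old_valid_dev(). */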
4367 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4368 int ll_getattr(const struct path *path, struct kstat *stat,
4369 u32 request_mask, unsigned int flags)
4371 struct dentry *de = path->dentry;
4373 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4376 struct inode *inode = de->d_inode;
4377 struct ll_sb_info *sbi = ll_i2sbi(inode);
4378 struct ll_inode_info *lli = ll_i2info(inode);
4381 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4383 rc = ll_inode_revalidate(de, IT_GETATTR);
4387 if (S_ISREG(inode->i_mode)) {
4388 /* In case of restore, the MDT has the right size and has
4389 * already sent it back without granting the layout lock;
4390 * the inode is up-to-date, so glimpse is useless.
4391 * Also, to glimpse we need the layout; in case of a running
4392 * restore the MDT holds the layout lock, so the glimpse will
4393 * block until the end of the restore (getattr will block)
4395 if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4396 rc = ll_glimpse_size(inode);
4401 /* If the object isn't a regular file then don't validate its size. */
4402 if (S_ISDIR(inode->i_mode) &&
4403 lli->lli_lsm_md != NULL) {
4404 rc = ll_merge_md_attr(inode);
4409 inode->i_atime.tv_sec = lli->lli_atime;
4410 inode->i_mtime.tv_sec = lli->lli_mtime;
4411 inode->i_ctime.tv_sec = lli->lli_ctime;
4414 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4416 if (ll_need_32bit_api(sbi)) {
4417 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4418 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4419 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4421 stat->ino = inode->i_ino;
4422 stat->dev = inode->i_sb->s_dev;
4423 stat->rdev = inode->i_rdev;
4426 stat->mode = inode->i_mode;
4427 stat->uid = inode->i_uid;
4428 stat->gid = inode->i_gid;
4429 stat->atime = inode->i_atime;
4430 stat->mtime = inode->i_mtime;
4431 stat->ctime = inode->i_ctime;
4432 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4434 stat->nlink = inode->i_nlink;
4435 stat->size = i_size_read(inode);
4436 stat->blocks = inode->i_blocks;
4441 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4442 __u64 start, __u64 len)
4446 struct fiemap *fiemap;
4447 unsigned int extent_count = fieinfo->fi_extents_max;
4449 num_bytes = sizeof(*fiemap) + (extent_count *
4450 sizeof(struct fiemap_extent));
4451 OBD_ALLOC_LARGE(fiemap, num_bytes);
4456 fiemap->fm_flags = fieinfo->fi_flags;
4457 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4458 fiemap->fm_start = start;
4459 fiemap->fm_length = len;
4460 if (extent_count > 0 &&
4461 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4462 sizeof(struct fiemap_extent)) != 0)
4463 GOTO(out, rc = -EFAULT);
4465 rc = ll_do_fiemap(inode, fiemap, num_bytes);
4467 fieinfo->fi_flags = fiemap->fm_flags;
4468 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4469 if (extent_count > 0 &&
4470 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4471 fiemap->fm_mapped_extents *
4472 sizeof(struct fiemap_extent)) != 0)
4473 GOTO(out, rc = -EFAULT);
4475 OBD_FREE_LARGE(fiemap, num_bytes);
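/*
 * Example (sketch): the VFS reaches ll_fiemap() through
 * inode_operations.fiemap when userspace issues the generic ioctl:
 *
 *	struct fiemap *fm;
 *	unsigned int n = 32;
 *
 *	fm = calloc(1, sizeof(*fm) + n * sizeof(struct fiemap_extent));
 *	fm->fm_start = 0;
 *	fm->fm_length = FIEMAP_MAX_OFFSET;	// map the whole file
 *	fm->fm_extent_count = n;
 *	if (ioctl(fd, FS_IOC_FIEMAP, fm) == 0)
 *		;	// fm->fm_mapped_extents extents in fm->fm_extents[]
 *	free(fm);
 */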
4479 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4481 struct ll_inode_info *lli = ll_i2info(inode);
4482 struct posix_acl *acl = NULL;
4485 spin_lock(&lli->lli_lock);
4486 /* VFS' acl_permission_check->check_acl will release the refcount */
4487 acl = posix_acl_dup(lli->lli_posix_acl);
4488 spin_unlock(&lli->lli_lock);
4493 #ifdef HAVE_IOP_SET_ACL
4494 #ifdef CONFIG_FS_POSIX_ACL
4495 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4497 struct ll_sb_info *sbi = ll_i2sbi(inode);
4498 struct ptlrpc_request *req = NULL;
4499 const char *name = NULL;
4501 size_t value_size = 0;
4506 case ACL_TYPE_ACCESS:
4507 name = XATTR_NAME_POSIX_ACL_ACCESS;
4509 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4512 case ACL_TYPE_DEFAULT:
4513 name = XATTR_NAME_POSIX_ACL_DEFAULT;
4514 if (!S_ISDIR(inode->i_mode))
4515 rc = acl ? -EACCES : 0;
4526 value_size = posix_acl_xattr_size(acl->a_count);
4527 value = kmalloc(value_size, GFP_NOFS);
4529 GOTO(out, rc = -ENOMEM);
4531 rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
4533 GOTO(out_value, rc);
4536 rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4537 value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
4538 name, value, value_size, 0, 0, &req);
4540 ptlrpc_req_finished(req);
4545 forget_cached_acl(inode, type);
4547 set_cached_acl(inode, type, acl);
4550 #endif /* CONFIG_FS_POSIX_ACL */
4551 #endif /* HAVE_IOP_SET_ACL */
4553 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4555 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4556 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4558 ll_check_acl(struct inode *inode, int mask)
4561 # ifdef CONFIG_FS_POSIX_ACL
4562 struct posix_acl *acl;
4566 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4567 if (flags & IPERM_FLAG_RCU)
4570 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4575 rc = posix_acl_permission(inode, acl, mask);
4576 posix_acl_release(acl);
4579 # else /* !CONFIG_FS_POSIX_ACL */
4581 # endif /* CONFIG_FS_POSIX_ACL */
4583 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
4585 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4586 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4588 # ifdef HAVE_INODE_PERMISION_2ARGS
4589 int ll_inode_permission(struct inode *inode, int mask)
4591 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4596 struct ll_sb_info *sbi;
4597 struct root_squash_info *squash;
4598 struct cred *cred = NULL;
4599 const struct cred *old_cred = NULL;
4601 bool squash_id = false;
4604 #ifdef MAY_NOT_BLOCK
4605 if (mask & MAY_NOT_BLOCK)
4607 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4608 if (flags & IPERM_FLAG_RCU)
4612 /* as the root inode is NOT validated in the lookup operation,
4613 * we need to do it before the permission check. */
4615 if (inode == inode->i_sb->s_root->d_inode) {
4616 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4621 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4622 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4624 /* squash fsuid/fsgid if needed */
4625 sbi = ll_i2sbi(inode);
4626 squash = &sbi->ll_squash;
4627 if (unlikely(squash->rsi_uid != 0 &&
4628 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4629 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4633 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4634 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4635 squash->rsi_uid, squash->rsi_gid);
4637 /* update current process's credentials
4638 * and FS capability */
4639 cred = prepare_creds();
4643 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4644 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
4645 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4646 if ((1 << cap) & CFS_CAP_FS_MASK)
4647 cap_lower(cred->cap_effective, cap);
4649 old_cred = override_creds(cred);
4652 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4653 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4654 /* restore current process's credentials and FS capability */
4656 revert_creds(old_cred);
4663 /* -o localflock - only provides locally consistent flock locks */
4664 struct file_operations ll_file_operations = {
4665 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4666 # ifdef HAVE_SYNC_READ_WRITE
4667 .read = new_sync_read,
4668 .write = new_sync_write,
4670 .read_iter = ll_file_read_iter,
4671 .write_iter = ll_file_write_iter,
4672 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4673 .read = ll_file_read,
4674 .aio_read = ll_file_aio_read,
4675 .write = ll_file_write,
4676 .aio_write = ll_file_aio_write,
4677 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4678 .unlocked_ioctl = ll_file_ioctl,
4679 .open = ll_file_open,
4680 .release = ll_file_release,
4681 .mmap = ll_file_mmap,
4682 .llseek = ll_file_seek,
4683 .splice_read = ll_file_splice_read,

struct file_operations ll_file_operations_flock = {
#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
# ifdef HAVE_SYNC_READ_WRITE
	.read		= new_sync_read,
	.write		= new_sync_write,
# endif /* HAVE_SYNC_READ_WRITE */
	.read_iter	= ll_file_read_iter,
	.write_iter	= ll_file_write_iter,
#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
	.read		= ll_file_read,
	.aio_read	= ll_file_aio_read,
	.write		= ll_file_write,
	.aio_write	= ll_file_aio_write,
#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
	.unlocked_ioctl	= ll_file_ioctl,
	.open		= ll_file_open,
	.release	= ll_file_release,
	.mmap		= ll_file_mmap,
	.llseek		= ll_file_seek,
	.splice_read	= ll_file_splice_read,
	.fsync		= ll_fsync,
	.flush		= ll_flush,
	.flock		= ll_file_flock,
	.lock		= ll_file_flock
};

/* These are for "-o noflock" - to return ENOSYS on flock calls */
struct file_operations ll_file_operations_noflock = {
#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
# ifdef HAVE_SYNC_READ_WRITE
	.read		= new_sync_read,
	.write		= new_sync_write,
# endif /* HAVE_SYNC_READ_WRITE */
	.read_iter	= ll_file_read_iter,
	.write_iter	= ll_file_write_iter,
#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
	.read		= ll_file_read,
	.aio_read	= ll_file_aio_read,
	.write		= ll_file_write,
	.aio_write	= ll_file_aio_write,
#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
	.unlocked_ioctl	= ll_file_ioctl,
	.open		= ll_file_open,
	.release	= ll_file_release,
	.mmap		= ll_file_mmap,
	.llseek		= ll_file_seek,
	.splice_read	= ll_file_splice_read,
	.fsync		= ll_fsync,
	.flush		= ll_flush,
	.flock		= ll_file_noflock,
	.lock		= ll_file_noflock
};

struct inode_operations ll_file_inode_operations = {
	.setattr	= ll_setattr,
	.getattr	= ll_getattr,
	.permission	= ll_inode_permission,
#ifdef HAVE_IOP_XATTR
	.setxattr	= ll_setxattr,
	.getxattr	= ll_getxattr,
	.removexattr	= ll_removexattr,
#endif
	.listxattr	= ll_listxattr,
	.fiemap		= ll_fiemap,
#ifdef HAVE_IOP_GET_ACL
	.get_acl	= ll_get_acl,
#endif
#ifdef HAVE_IOP_SET_ACL
	.set_acl	= ll_set_acl,
#endif
};
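
/*
 * Note (descriptive, hedged): this table covers regular files only;
 * directories and symlinks get their own inode_operations elsewhere in
 * llite. The HAVE_IOP_* guards track kernel API drift: older kernels expose
 * per-inode {set,get,remove}xattr methods, while newer ones route ACL
 * handling through the get_acl/set_acl hooks instead.
 */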

int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct cl_object *obj = lli->lli_clob;
	struct lu_env *env;
	int rc;
	__u16 refcheck;
	ENTRY;

	if (obj == NULL)
		RETURN(0);

	env = cl_env_get(&refcheck);
	if (IS_ERR(env))
		RETURN(PTR_ERR(env));

	rc = cl_conf_set(env, lli->lli_clob, conf);
	if (rc < 0)
		GOTO(out, rc);

	if (conf->coc_opc == OBJECT_CONF_SET) {
		struct ldlm_lock *lock = conf->coc_lock;
		struct cl_layout cl = {
			.cl_layout_gen = 0,
		};

		LASSERT(lock != NULL);
		LASSERT(ldlm_has_layout(lock));

		/* it can only be allowed to match after the layout is
		 * applied to the inode; otherwise a false layout would be
		 * seen. Applying the layout should happen before dropping
		 * the intent lock. */
		ldlm_lock_allow_match(lock);

		rc = cl_object_layout_get(env, obj, &cl);
		if (rc < 0)
			GOTO(out, rc);

		CDEBUG(D_VFSTRACE,
		       DFID": layout version change: %u -> %u\n",
		       PFID(&lli->lli_fid), ll_layout_version_get(lli),
		       cl.cl_layout_gen);
		ll_layout_version_set(lli, cl.cl_layout_gen);
	}

out:
	cl_env_put(env, &refcheck);
	RETURN(rc);
}

/* Fetch layout from MDT with getxattr request, if it's not ready yet */
static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
{
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ptlrpc_request *req;
	void *lvbdata;
	void *lmm;
	int lmmsize;
	int rc;
	ENTRY;

	CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
	       PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
	       lock->l_lvb_data, lock->l_lvb_len);

	if (lock->l_lvb_data != NULL)
		RETURN(0);

	/* if the layout lock was granted right away, the layout is returned
	 * within the DLM LVB of the DLM reply; otherwise, if the lock was ever
	 * blocked and then granted via completion AST, we have to fetch the
	 * layout here. Note that we can't use the LVB buffer in the
	 * completion AST because it doesn't have a large enough buffer */
	rc = ll_get_default_mdsize(sbi, &lmmsize);
	if (rc < 0)
		RETURN(rc);

	rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), OBD_MD_FLXATTR,
			 XATTR_NAME_LOV, lmmsize, &req);
	if (rc < 0) {
		if (rc == -ENODATA)
			GOTO(out, rc = 0); /* empty layout */
		else
			RETURN(rc);
	}

	lmmsize = rc;
	rc = 0;
	if (lmmsize == 0) /* empty layout */
		GOTO(out, rc = 0);

	lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
	if (lmm == NULL)
		GOTO(out, rc = -EFAULT);

	OBD_ALLOC_LARGE(lvbdata, lmmsize);
	if (lvbdata == NULL)
		GOTO(out, rc = -ENOMEM);

	memcpy(lvbdata, lmm, lmmsize);
	lock_res_and_lock(lock);
	if (unlikely(lock->l_lvb_data == NULL)) {
		lock->l_lvb_type = LVB_T_LAYOUT;
		lock->l_lvb_data = lvbdata;
		lock->l_lvb_len = lmmsize;
		lvbdata = NULL;
	}
	unlock_res_and_lock(lock);

	if (lvbdata != NULL)
		OBD_FREE_LARGE(lvbdata, lmmsize);

	EXIT;

out:
	ptlrpc_req_finished(req);
	return rc;
}
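
/*
 * Ownership note (descriptive, follows from the locking above): once
 * lvbdata has been published as lock->l_lvb_data under the resource lock,
 * the buffer belongs to the DLM layer and is freed together with the lock;
 * the local copy is freed here only when another thread won the race and
 * attached its own buffer first.
 */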
/**
 * Apply the layout to the inode. Layout lock is held and will be released
 * in ll_layout_conf().
 */
static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
			      struct inode *inode)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ldlm_lock *lock;
	struct cl_object_conf conf;
	int rc = 0;
	bool lvb_ready;
	bool wait_layout = false;
	ENTRY;

	LASSERT(lustre_handle_is_used(lockh));

	lock = ldlm_handle2lock(lockh);
	LASSERT(lock != NULL);
	LASSERT(ldlm_has_layout(lock));

	LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
		   PFID(&lli->lli_fid), inode);

	/* in case this is a cached lock, reinstate it with the new inode */
	md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);

	lock_res_and_lock(lock);
	lvb_ready = ldlm_is_lvb_ready(lock);
	unlock_res_and_lock(lock);

	/* checking lvb_ready is racy, but that is okay. The worst case is
	 * that multiple processes may configure the file at the same time. */
	if (lvb_ready)
		GOTO(out, rc = 0);

	rc = ll_layout_fetch(inode, lock);
	if (rc < 0)
		GOTO(out, rc);

	/* for a layout lock, lmm is stored in the lock's LVB.
	 * lvb_data is immutable while the lock is held, so it's safe to
	 * access it without the resource lock.
	 *
	 * set layout to file. This is unlikely to fail, as the old layout
	 * was surely eliminated */
	memset(&conf, 0, sizeof conf);
	conf.coc_opc = OBJECT_CONF_SET;
	conf.coc_inode = inode;
	conf.coc_lock = lock;
	conf.u.coc_layout.lb_buf = lock->l_lvb_data;
	conf.u.coc_layout.lb_len = lock->l_lvb_len;
	rc = ll_layout_conf(inode, &conf);

	/* refresh layout failed, need to wait */
	wait_layout = rc == -EBUSY;
	EXIT;
out:
	LDLM_LOCK_PUT(lock);
	ldlm_lock_decref(lockh, mode);

	/* wait for IO to complete if it's still being used. */
	if (wait_layout) {
		CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
		       ll_get_fsname(inode->i_sb, NULL, 0),
		       PFID(&lli->lli_fid), inode);

		memset(&conf, 0, sizeof conf);
		conf.coc_opc = OBJECT_CONF_WAIT;
		conf.coc_inode = inode;
		rc = ll_layout_conf(inode, &conf);
		if (rc == 0)
			rc = -EAGAIN;

		CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
		       ll_get_fsname(inode->i_sb, NULL, 0),
		       PFID(&lli->lli_fid), rc);
	}
	RETURN(rc);
}

/**
 * Issue a layout intent RPC to the MDS.
 *
 * \param inode  [in]	file inode
 * \param intent [in]	layout intent
 *
 * \retval 0	on success
 * \retval < 0	error code
 */
static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct md_op_data *op_data;
	struct lookup_intent it;
	struct ptlrpc_request *req;
	int rc;
	ENTRY;

	op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
				     0, 0, LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data))
		RETURN(PTR_ERR(op_data));

	op_data->op_data = intent;
	op_data->op_data_size = sizeof(*intent);

	memset(&it, 0, sizeof(it));
	it.it_op = IT_LAYOUT;
	if (intent->li_opc == LAYOUT_INTENT_WRITE ||
	    intent->li_opc == LAYOUT_INTENT_TRUNC)
		it.it_flags = FMODE_WRITE;

	LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
			  ll_get_fsname(inode->i_sb, NULL, 0),
			  PFID(&lli->lli_fid), inode);

	rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
			    &ll_md_blocking_ast, 0);
	if (it.it_request != NULL)
		ptlrpc_req_finished(it.it_request);
	it.it_request = NULL;

	ll_finish_md_op_data(op_data);

	/* set lock data in case this is a new lock */
	if (rc == 0)
		ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);

	ll_intent_drop_lock(&it);

	RETURN(rc);
}

/**
 * This function checks if there exists a LAYOUT lock on the client side,
 * or enqueues one if the client doesn't have it cached yet.
 *
 * This function does not hold the layout lock, so the lock may be revoked
 * at any time after this function returns. Any operation that depends on
 * the layout should be redone in that case.
 *
 * This function should be called before lov_io_init() to get an up-to-date
 * layout version; the caller should save the version number, and once the
 * IO has finished, call this function again to verify that the layout was
 * not changed while the IO was in flight.
 */
int ll_layout_refresh(struct inode *inode, __u32 *gen)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct lustre_handle lockh;
	struct layout_intent intent = {
		.li_opc = LAYOUT_INTENT_ACCESS,
	};
	enum ldlm_mode mode;
	int rc;
	ENTRY;

	*gen = ll_layout_version_get(lli);
	if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
		RETURN(0);

	/* sanity checks */
	LASSERT(fid_is_sane(ll_inode2fid(inode)));
	LASSERT(S_ISREG(inode->i_mode));

	/* take layout lock mutex to enqueue layout lock exclusively. */
	mutex_lock(&lli->lli_layout_mutex);

	while (1) {
		/* the layout lock is usually cached on the local side, so
		 * try to match an existing lock before enqueueing a new
		 * one. */
		mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
				       LCK_CR | LCK_CW | LCK_PR | LCK_PW);
		if (mode != 0) { /* hit cached lock */
			rc = ll_layout_lock_set(&lockh, mode, inode);
			if (rc == -EAGAIN)
				continue;
			break;
		}

		rc = ll_layout_intent(inode, &intent);
		if (rc != 0)
			break;
	}

	if (rc == 0)
		*gen = ll_layout_version_get(lli);
	mutex_unlock(&lli->lli_layout_mutex);

	RETURN(rc);
}
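
/*
 * Usage sketch for ll_layout_refresh() (illustrative caller, not part of
 * this file; the real callers sit in the cl_io initialization paths):
 *
 *	__u32 gen;
 *	int rc;
 *
 *	rc = ll_layout_refresh(inode, &gen);	(before lov_io_init())
 *	if (rc == 0) {
 *		submit the IO, remembering "gen";
 *		when the IO completes, call ll_layout_refresh() again and
 *		compare generations to detect a layout change that raced
 *		with the IO.
 *	}
 */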

/**
 * Issue a layout intent RPC indicating where in a file an IO is about to
 * write.
 *
 * \param[in] inode	file inode.
 * \param[in] opc	layout intent opcode (LAYOUT_INTENT_WRITE or
 *			LAYOUT_INTENT_TRUNC).
 * \param[in] ext	write range with start offset of file in bytes where
 *			an IO is about to write, and exclusive end offset in
 *			bytes.
 *
 * \retval 0	on success
 * \retval < 0	error code
 */
int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
			   struct lu_extent *ext)
{
	struct layout_intent intent = {
		.li_opc = opc,
		.li_extent.e_start = ext->e_start,
		.li_extent.e_end = ext->e_end,
	};
	int rc;
	ENTRY;

	rc = ll_layout_intent(inode, &intent);

	RETURN(rc);
}
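
/*
 * Example (hedged, values illustrative): a caller about to write "count"
 * bytes at offset "pos" announces the half-open byte range like this:
 *
 *	struct lu_extent ext = {
 *		.e_start = pos,
 *		.e_end   = pos + count,	(exclusive end offset)
 *	};
 *
 *	rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE, &ext);
 */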

/**
 * This function sends a restore request to the MDT.
 */
int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
{
	struct hsm_user_request *hur;
	int len, rc;
	ENTRY;

	len = sizeof(struct hsm_user_request) +
	      sizeof(struct hsm_user_item);
	OBD_ALLOC(hur, len);
	if (hur == NULL)
		RETURN(-ENOMEM);

	hur->hur_request.hr_action = HUA_RESTORE;
	hur->hur_request.hr_archive_id = 0;
	hur->hur_request.hr_flags = 0;
	memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
	       sizeof(hur->hur_user_item[0].hui_fid));
	hur->hur_user_item[0].hui_extent.offset = offset;
	hur->hur_user_item[0].hui_extent.length = length;
	hur->hur_request.hr_itemcount = 1;
	rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,