4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2017, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
49 #include <uapi/linux/lustre/lustre_ioctl.h>
50 #include <lustre_swab.h>
52 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
57 struct inode *sp_inode;
62 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
64 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
67 static struct ll_file_data *ll_file_data_get(void)
69 struct ll_file_data *fd;
71 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
75 fd->fd_write_failed = false;
80 static void ll_file_data_put(struct ll_file_data *fd)
83 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
87 * Pack all the attributes into @op_data for the CLOSE RPC.
89 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
90 struct obd_client_handle *och)
94 ll_prep_md_op_data(op_data, inode, NULL, NULL,
95 0, 0, LUSTRE_OPC_ANY, NULL);
97 op_data->op_attr.ia_mode = inode->i_mode;
98 op_data->op_attr.ia_atime = inode->i_atime;
99 op_data->op_attr.ia_mtime = inode->i_mtime;
100 op_data->op_attr.ia_ctime = inode->i_ctime;
101 op_data->op_attr.ia_size = i_size_read(inode);
102 op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
103 ATTR_MTIME | ATTR_MTIME_SET |
105 op_data->op_xvalid |= OP_XVALID_CTIME_SET;
106 op_data->op_attr_blocks = inode->i_blocks;
107 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
108 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
109 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
110 op_data->op_open_handle = och->och_open_handle;
112 if (och->och_flags & FMODE_WRITE &&
113 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
114 /* For HSM: if inode data has been modified, pack it so that
115  * the MDT can set the data-dirty flag in the archive. */
116 op_data->op_bias |= MDS_DATA_MODIFIED;
122 * Perform a close, possibly with a bias.
123 * The meaning of "data" depends on the value of "bias".
125 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
126 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to swap layouts with.
129 static int ll_close_inode_openhandle(struct inode *inode,
130 struct obd_client_handle *och,
131 enum mds_op_bias bias, void *data)
133 struct obd_export *md_exp = ll_i2mdexp(inode);
134 const struct ll_inode_info *lli = ll_i2info(inode);
135 struct md_op_data *op_data;
136 struct ptlrpc_request *req = NULL;
140 if (class_exp2obd(md_exp) == NULL) {
141 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
142 ll_get_fsname(inode->i_sb, NULL, 0),
143 PFID(&lli->lli_fid));
147 OBD_ALLOC_PTR(op_data);
148 /* We leak the openhandle and request here on error, but not much can be
149  * done in the OOM case since the app won't retry the close on error either. */
151 GOTO(out, rc = -ENOMEM);
153 ll_prepare_close(inode, op_data, och);
155 case MDS_CLOSE_LAYOUT_MERGE:
156 /* merge blocks from the victim inode */
157 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
158 op_data->op_attr.ia_valid |= ATTR_SIZE;
159 op_data->op_xvalid |= OP_XVALID_BLOCKS;
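/* fallthrough: MERGE reuses the SPLIT/SWAP handling below */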
160 case MDS_CLOSE_LAYOUT_SPLIT:
161 case MDS_CLOSE_LAYOUT_SWAP: {
162 struct split_param *sp = data;
164 LASSERT(data != NULL);
165 op_data->op_bias |= bias;
166 op_data->op_data_version = 0;
167 op_data->op_lease_handle = och->och_lease_handle;
168 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
169 op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
170 op_data->op_mirror_id = sp->sp_mirror_id;
172 op_data->op_fid2 = *ll_inode2fid(data);
177 case MDS_CLOSE_RESYNC_DONE: {
178 struct ll_ioc_lease *ioc = data;
180 LASSERT(data != NULL);
181 op_data->op_attr_blocks +=
182 ioc->lil_count * op_data->op_attr_blocks;
183 op_data->op_attr.ia_valid |= ATTR_SIZE;
184 op_data->op_xvalid |= OP_XVALID_BLOCKS;
185 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
187 op_data->op_lease_handle = och->och_lease_handle;
188 op_data->op_data = &ioc->lil_ids[0];
189 op_data->op_data_size =
190 ioc->lil_count * sizeof(ioc->lil_ids[0]);
194 case MDS_HSM_RELEASE:
195 LASSERT(data != NULL);
196 op_data->op_bias |= MDS_HSM_RELEASE;
197 op_data->op_data_version = *(__u64 *)data;
198 op_data->op_lease_handle = och->och_lease_handle;
199 op_data->op_attr.ia_valid |= ATTR_SIZE;
200 op_data->op_xvalid |= OP_XVALID_BLOCKS;
204 LASSERT(data == NULL);
208 if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
209 op_data->op_xvalid |= OP_XVALID_LAZYSIZE;
210 if (!(op_data->op_xvalid & OP_XVALID_BLOCKS))
211 op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS;
213 rc = md_close(md_exp, op_data, och->och_mod, &req);
214 if (rc != 0 && rc != -EINTR)
215 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
216 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
218 if (rc == 0 && op_data->op_bias & bias) {
219 struct mdt_body *body;
221 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
222 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
226 ll_finish_md_op_data(op_data);
230 md_clear_open_replay_data(md_exp, och);
231 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
234 ptlrpc_req_finished(req); /* This is the close request */
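/* Close the MDS open handle that matches @fmode (read, write or exec).
 * If other users of this inode still hold the handle open in that mode,
 * only drop the use count; otherwise send the close to the MDS via
 * ll_close_inode_openhandle(). */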
238 int ll_md_real_close(struct inode *inode, fmode_t fmode)
240 struct ll_inode_info *lli = ll_i2info(inode);
241 struct obd_client_handle **och_p;
242 struct obd_client_handle *och;
247 if (fmode & FMODE_WRITE) {
248 och_p = &lli->lli_mds_write_och;
249 och_usecount = &lli->lli_open_fd_write_count;
250 } else if (fmode & FMODE_EXEC) {
251 och_p = &lli->lli_mds_exec_och;
252 och_usecount = &lli->lli_open_fd_exec_count;
254 LASSERT(fmode & FMODE_READ);
255 och_p = &lli->lli_mds_read_och;
256 och_usecount = &lli->lli_open_fd_read_count;
259 mutex_lock(&lli->lli_och_mutex);
260 if (*och_usecount > 0) {
261 /* There are still users of this handle, so skip freeing it. */
263 mutex_unlock(&lli->lli_och_mutex);
269 mutex_unlock(&lli->lli_och_mutex);
272 /* There might be a race and this handle may already be closed. */
274 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
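/* Per-descriptor close: drop the group lock and lease if held, close
 * fd_och, decrement the per-mode open count, and skip ll_md_real_close()
 * when a matching OPEN lock is still cached locally. */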
280 static int ll_md_close(struct inode *inode, struct file *file)
282 union ldlm_policy_data policy = {
283 .l_inodebits = { MDS_INODELOCK_OPEN },
285 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
286 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
287 struct ll_inode_info *lli = ll_i2info(inode);
288 struct lustre_handle lockh;
289 enum ldlm_mode lockmode;
293 /* clear group lock, if present */
294 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
295 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
297 if (fd->fd_lease_och != NULL) {
300 /* Usually the lease is not released when the
301  * application crashes; we need to release it here. */
302 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
303 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
304 PFID(&lli->lli_fid), rc, lease_broken);
306 fd->fd_lease_och = NULL;
309 if (fd->fd_och != NULL) {
310 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
315 /* Let's see if we have a good enough OPEN lock on the file and
316    whether we can skip talking to the MDS */
317 mutex_lock(&lli->lli_och_mutex);
318 if (fd->fd_omode & FMODE_WRITE) {
320 LASSERT(lli->lli_open_fd_write_count);
321 lli->lli_open_fd_write_count--;
322 } else if (fd->fd_omode & FMODE_EXEC) {
324 LASSERT(lli->lli_open_fd_exec_count);
325 lli->lli_open_fd_exec_count--;
328 LASSERT(lli->lli_open_fd_read_count);
329 lli->lli_open_fd_read_count--;
331 mutex_unlock(&lli->lli_och_mutex);
333 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
334 LDLM_IBITS, &policy, lockmode, &lockh))
335 rc = ll_md_real_close(inode, fd->fd_omode);
338 LUSTRE_FPRIVATE(file) = NULL;
339 ll_file_data_put(fd);
344 /* While this returns an error code, the caller (fput()) does not check it,
345  * so we need to make every effort to clean up all of our state here. Also,
346  * applications rarely check close errors and even if an error is returned
347  * they will not re-try the close call.
349 int ll_file_release(struct inode *inode, struct file *file)
351 struct ll_file_data *fd;
352 struct ll_sb_info *sbi = ll_i2sbi(inode);
353 struct ll_inode_info *lli = ll_i2info(inode);
357 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
358 PFID(ll_inode2fid(inode)), inode);
360 if (inode->i_sb->s_root != file_dentry(file))
361 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
362 fd = LUSTRE_FPRIVATE(file);
365 /* This is the last ref on @file, but it may not be held by the owner pid of
366  * statahead, because parent and child processes can share the same file handle. */
367 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
368 ll_deauthorize_statahead(inode, fd);
370 if (inode->i_sb->s_root == file_dentry(file)) {
371 LUSTRE_FPRIVATE(file) = NULL;
372 ll_file_data_put(fd);
376 if (!S_ISDIR(inode->i_mode)) {
377 if (lli->lli_clob != NULL)
378 lov_read_and_clear_async_rc(lli->lli_clob);
379 lli->lli_async_rc = 0;
382 rc = ll_md_close(inode, file);
384 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
385 libcfs_debug_dumplog();
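/* Filler for read_cache_page(): copy the reply data described by the
 * niobuf_local into @page, zero-fill any tail beyond lnb_len and mark
 * the page uptodate. */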
390 static inline int ll_dom_readpage(void *data, struct page *page)
392 struct niobuf_local *lnb = data;
395 kaddr = ll_kmap_atomic(page, KM_USER0);
396 memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
397 if (lnb->lnb_len < PAGE_SIZE)
398 memset(kaddr + lnb->lnb_len, 0,
399 PAGE_SIZE - lnb->lnb_len);
400 flush_dcache_page(page);
401 SetPageUptodate(page);
402 ll_kunmap_atomic(kaddr, KM_USER0);
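/* If the open reply carried inline file data (Data-on-MDT lock granted),
 * copy it into the page cache so that subsequent reads can be served
 * from cache. */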
408 void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
409 struct lookup_intent *it)
411 struct ll_inode_info *lli = ll_i2info(inode);
412 struct cl_object *obj = lli->lli_clob;
413 struct address_space *mapping = inode->i_mapping;
415 struct niobuf_remote *rnb;
417 struct lustre_handle lockh;
418 struct ldlm_lock *lock;
419 unsigned long index, start;
420 struct niobuf_local lnb;
421 bool dom_lock = false;
428 if (it->it_lock_mode != 0) {
429 lockh.cookie = it->it_lock_handle;
430 lock = ldlm_handle2lock(&lockh);
432 dom_lock = ldlm_has_dom(lock);
438 if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
442 rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
443 if (rnb == NULL || rnb->rnb_len == 0)
446 /* LU-11595: The server may return the whole file, which is always OK, or
447  * it may return just the file tail, whose offset must be aligned with the
448  * client PAGE_SIZE to be usable on that client; if the server's PAGE_SIZE
449  * is smaller, the offset may not be aligned and that data is just ignored.
451 if (rnb->rnb_offset % PAGE_SIZE)
454 /* The server returns the whole file, or just the file tail if it fills the
455  * reply buffer; in both cases the total size should be the inode size.
457 if (rnb->rnb_offset + rnb->rnb_len < i_size_read(inode)) {
458 CERROR("%s: server returns off/len %llu/%u < i_size %llu\n",
459 ll_get_fsname(inode->i_sb, NULL, 0), rnb->rnb_offset,
460 rnb->rnb_len, i_size_read(inode));
464 CDEBUG(D_INFO, "Get data along with open at %llu len %i, i_size %llu\n",
465 rnb->rnb_offset, rnb->rnb_len, i_size_read(inode));
467 data = (char *)rnb + sizeof(*rnb);
469 lnb.lnb_file_offset = rnb->rnb_offset;
470 start = lnb.lnb_file_offset / PAGE_SIZE;
472 LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
473 lnb.lnb_page_offset = 0;
475 lnb.lnb_data = data + (index << PAGE_SHIFT);
476 lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
477 if (lnb.lnb_len > PAGE_SIZE)
478 lnb.lnb_len = PAGE_SIZE;
480 vmpage = read_cache_page(mapping, index + start,
481 ll_dom_readpage, &lnb);
482 if (IS_ERR(vmpage)) {
483 CWARN("%s: cannot fill page %lu for "DFID
484 " with data: rc = %li\n",
485 ll_get_fsname(inode->i_sb, NULL, 0),
486 index + start, PFID(lu_object_fid(&obj->co_lu)),
492 } while (rnb->rnb_len > (index << PAGE_SHIFT));
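/* Send an IT_OPEN intent to the MDS (packing the name only when the server
 * does not support open-by-fid), then set up the inode from the reply and
 * consume any inline DoM data returned along with the open. */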
496 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
497 struct lookup_intent *itp)
499 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
500 struct dentry *parent = de->d_parent;
501 const char *name = NULL;
503 struct md_op_data *op_data;
504 struct ptlrpc_request *req = NULL;
508 LASSERT(parent != NULL);
509 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
511 /* if the server supports open-by-fid, or the file name is invalid, don't
512  * pack the name in the open request */
513 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
514 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
515 name = de->d_name.name;
516 len = de->d_name.len;
519 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
520 name, len, 0, LUSTRE_OPC_ANY, NULL);
522 RETURN(PTR_ERR(op_data));
523 op_data->op_data = lmm;
524 op_data->op_data_size = lmmsize;
526 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
527 &ll_md_blocking_ast, 0);
528 ll_finish_md_op_data(op_data);
530 /* The reason for keeping our own exit path is to avoid flooding the log
531  * with -ESTALE error messages.
533 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
534 it_open_error(DISP_OPEN_OPEN, itp))
536 ll_release_openhandle(de, itp);
540 if (it_disposition(itp, DISP_LOOKUP_NEG))
541 GOTO(out, rc = -ENOENT);
543 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
544 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
545 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
549 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
551 if (!rc && itp->it_lock_mode) {
552 ll_dom_finish_open(de->d_inode, req, itp);
553 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
557 ptlrpc_req_finished(req);
558 ll_intent_drop_lock(itp);
560 /* We did an open by fid, but by the time we got to the server,
561  * the object had disappeared. If this is a create, we cannot really
562  * tell userspace that the file it was trying to create
563  * does not exist. Instead, return -ESTALE so that the VFS will
564  * retry the create with LOOKUP_REVAL, which we catch
565  * in ll_revalidate_dentry() and fall back to lookup there.
567 if (rc == -ENOENT && itp->it_op & IT_CREAT)
573 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
574 struct obd_client_handle *och)
576 struct mdt_body *body;
578 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
579 och->och_open_handle = body->mbo_open_handle;
580 och->och_fid = body->mbo_fid1;
581 och->och_lease_handle.cookie = it->it_lock_handle;
582 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
583 och->och_flags = it->it_flags;
585 return md_set_open_replay_data(md_exp, och, it);
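/* Record the open locally: fill the open handle from the intent reply when
 * one was granted, attach the ll_file_data to the file, and initialize
 * readahead state and the ll_cl_context fields for this descriptor. */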
588 static int ll_local_open(struct file *file, struct lookup_intent *it,
589 struct ll_file_data *fd, struct obd_client_handle *och)
591 struct inode *inode = file_inode(file);
594 LASSERT(!LUSTRE_FPRIVATE(file));
601 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
606 LUSTRE_FPRIVATE(file) = fd;
607 ll_readahead_init(inode, &fd->fd_ras);
608 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
610 /* initialize the ll_cl_context */
611 rwlock_init(&fd->fd_lock);
612 INIT_LIST_HEAD(&fd->fd_lccs);
617 /* Open a file, and (for the very first open) create objects on the OSTs at
618 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
619 * creation or open until ll_lov_setstripe() ioctl is called.
621 * If we already have the stripe MD locally then we don't request it in
622 * md_open(), by passing lmm_size = 0.
624 * It is up to the application to ensure no other processes open this file
625 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
626 * used. We might be able to avoid races of that sort by getting lli_open_sem
627 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
628 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
630 int ll_file_open(struct inode *inode, struct file *file)
632 struct ll_inode_info *lli = ll_i2info(inode);
633 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
634 .it_flags = file->f_flags };
635 struct obd_client_handle **och_p = NULL;
636 __u64 *och_usecount = NULL;
637 struct ll_file_data *fd;
641 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
642 PFID(ll_inode2fid(inode)), inode, file->f_flags);
644 it = file->private_data; /* XXX: compat macro */
645 file->private_data = NULL; /* prevent ll_local_open assertion */
647 fd = ll_file_data_get();
649 GOTO(out_nofiledata, rc = -ENOMEM);
652 if (S_ISDIR(inode->i_mode))
653 ll_authorize_statahead(inode, fd);
655 if (inode->i_sb->s_root == file_dentry(file)) {
656 LUSTRE_FPRIVATE(file) = fd;
660 if (!it || !it->it_disposition) {
661 /* Convert f_flags into access mode. We cannot use file->f_mode,
662  * because everything but the O_ACCMODE mask was stripped from it. */
664 if ((oit.it_flags + 1) & O_ACCMODE)
666 if (file->f_flags & O_TRUNC)
667 oit.it_flags |= FMODE_WRITE;
669 /* The kernel only calls f_op->open in dentry_open(). filp_open() calls
670  * dentry_open() after open_namei() has checked permissions.
671  * Only nfsd_open() calls dentry_open() directly without checking
672  * permissions, and because of that the code below is safe.
674 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
675 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
677 /* We do not want O_EXCL here, presumably we opened the file
678 * already? XXX - NFS implications? */
679 oit.it_flags &= ~O_EXCL;
681 /* bug20584: if "it_flags" contains O_CREAT, the file will be
682  * created if necessary, so "IT_CREAT" should be set to stay
683  * consistent with it */
684 if (oit.it_flags & O_CREAT)
685 oit.it_op |= IT_CREAT;
691 /* Let's see if we have file open on MDS already. */
692 if (it->it_flags & FMODE_WRITE) {
693 och_p = &lli->lli_mds_write_och;
694 och_usecount = &lli->lli_open_fd_write_count;
695 } else if (it->it_flags & FMODE_EXEC) {
696 och_p = &lli->lli_mds_exec_och;
697 och_usecount = &lli->lli_open_fd_exec_count;
699 och_p = &lli->lli_mds_read_och;
700 och_usecount = &lli->lli_open_fd_read_count;
703 mutex_lock(&lli->lli_och_mutex);
704 if (*och_p) { /* Open handle is present */
705 if (it_disposition(it, DISP_OPEN_OPEN)) {
706 /* Well, there's an extra open request that we do not need;
707    let's close it somehow. This will decref the request. */
708 rc = it_open_error(DISP_OPEN_OPEN, it);
710 mutex_unlock(&lli->lli_och_mutex);
711 GOTO(out_openerr, rc);
714 ll_release_openhandle(file_dentry(file), it);
718 rc = ll_local_open(file, it, fd, NULL);
721 mutex_unlock(&lli->lli_och_mutex);
722 GOTO(out_openerr, rc);
725 LASSERT(*och_usecount == 0);
726 if (!it->it_disposition) {
727 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
728 /* We cannot just request a lock handle now; the new ELC code
729    means that one of the other OPEN locks for this file
730    could be cancelled, and since the blocking AST handler
731    would attempt to grab och_mutex as well, that would
732    result in a deadlock */
733 mutex_unlock(&lli->lli_och_mutex);
735 * Normally called under two situations:
737 * 2. A race/condition on MDS resulting in no open
738 * handle to be returned from LOOKUP|OPEN request,
739 * for example if the target entry was a symlink.
741 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
742 * marked by a bit set in ll_iget_for_nfs. Clear the
743 * bit so that it's not confusing later callers.
745 * NB: when ldd is NULL, it must have come via the normal
746 * lookup path only, since ll_iget_for_nfs always calls ll_d_init().
749 if (ldd && ldd->lld_nfs_dentry) {
750 ldd->lld_nfs_dentry = 0;
751 it->it_flags |= MDS_OPEN_LOCK;
755 * Always specify MDS_OPEN_BY_FID because we don't want
756 * to get file with different fid.
758 it->it_flags |= MDS_OPEN_BY_FID;
759 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
762 GOTO(out_openerr, rc);
766 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
768 GOTO(out_och_free, rc = -ENOMEM);
772 /* md_intent_lock() didn't get a request ref if there was an
773 * open error, so don't do cleanup on the request here
775 /* XXX (green): Shouldn't we bail out on any error here, not
776  * just an open error? */
777 rc = it_open_error(DISP_OPEN_OPEN, it);
779 GOTO(out_och_free, rc);
781 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
782 "inode %p: disposition %x, status %d\n", inode,
783 it_disposition(it, ~0), it->it_status);
785 rc = ll_local_open(file, it, fd, *och_p);
787 GOTO(out_och_free, rc);
789 mutex_unlock(&lli->lli_och_mutex);
792 /* Must do this outside the lli_och_mutex lock to prevent a deadlock where
793    a different kind of OPEN lock for this same inode gets cancelled
794    by ldlm_cancel_lru */
795 if (!S_ISREG(inode->i_mode))
796 GOTO(out_och_free, rc);
798 cl_lov_delay_create_clear(&file->f_flags);
799 GOTO(out_och_free, rc);
803 if (och_p && *och_p) {
804 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
805 *och_p = NULL; /* OBD_FREE writes some magic there */
808 mutex_unlock(&lli->lli_och_mutex);
811 if (lli->lli_opendir_key == fd)
812 ll_deauthorize_statahead(inode, fd);
814 ll_file_data_put(fd);
816 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
820 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
821 ptlrpc_req_finished(it->it_request);
822 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
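/* Blocking AST used for lease locks: on a blocking callback the lease lock
 * is cancelled asynchronously; nothing needs to be done at cancel time. */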
828 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
829 struct ldlm_lock_desc *desc, void *data, int flag)
832 struct lustre_handle lockh;
836 case LDLM_CB_BLOCKING:
837 ldlm_lock2handle(lock, &lockh);
838 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
840 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
844 case LDLM_CB_CANCELING:
852 * When setting a lease on a file, we take ownership of the lli_mds_*_och
853 * and save it as fd->fd_och so as to force client to reopen the file even
854 * if it has an open lock in cache already.
856 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
857 struct lustre_handle *old_open_handle)
859 struct ll_inode_info *lli = ll_i2info(inode);
860 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
861 struct obd_client_handle **och_p;
866 /* Get the openhandle of the file */
867 mutex_lock(&lli->lli_och_mutex);
868 if (fd->fd_lease_och != NULL)
869 GOTO(out_unlock, rc = -EBUSY);
871 if (fd->fd_och == NULL) {
872 if (file->f_mode & FMODE_WRITE) {
873 LASSERT(lli->lli_mds_write_och != NULL);
874 och_p = &lli->lli_mds_write_och;
875 och_usecount = &lli->lli_open_fd_write_count;
877 LASSERT(lli->lli_mds_read_och != NULL);
878 och_p = &lli->lli_mds_read_och;
879 och_usecount = &lli->lli_open_fd_read_count;
882 if (*och_usecount > 1)
883 GOTO(out_unlock, rc = -EBUSY);
890 *old_open_handle = fd->fd_och->och_open_handle;
894 mutex_unlock(&lli->lli_och_mutex);
899 * Release ownership on lli_mds_*_och when putting back a file lease.
901 static int ll_lease_och_release(struct inode *inode, struct file *file)
903 struct ll_inode_info *lli = ll_i2info(inode);
904 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
905 struct obd_client_handle **och_p;
906 struct obd_client_handle *old_och = NULL;
911 mutex_lock(&lli->lli_och_mutex);
912 if (file->f_mode & FMODE_WRITE) {
913 och_p = &lli->lli_mds_write_och;
914 och_usecount = &lli->lli_open_fd_write_count;
916 och_p = &lli->lli_mds_read_och;
917 och_usecount = &lli->lli_open_fd_read_count;
920 /* The file may have been opened by another process (broken lease), so
921  * *och_p is not NULL. In this case we should simply increase the usecount
922  * and close fd_och.
923  */
924 if (*och_p != NULL) {
925 old_och = fd->fd_och;
932 mutex_unlock(&lli->lli_och_mutex);
935 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
941 * Acquire a lease and open the file.
943 static struct obd_client_handle *
944 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
947 struct lookup_intent it = { .it_op = IT_OPEN };
948 struct ll_sb_info *sbi = ll_i2sbi(inode);
949 struct md_op_data *op_data;
950 struct ptlrpc_request *req = NULL;
951 struct lustre_handle old_open_handle = { 0 };
952 struct obd_client_handle *och = NULL;
957 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
958 RETURN(ERR_PTR(-EINVAL));
961 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
962 RETURN(ERR_PTR(-EPERM));
964 rc = ll_lease_och_acquire(inode, file, &old_open_handle);
971 RETURN(ERR_PTR(-ENOMEM));
973 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
974 LUSTRE_OPC_ANY, NULL);
976 GOTO(out, rc = PTR_ERR(op_data));
978 /* To tell the MDT this openhandle is from the same owner */
979 op_data->op_open_handle = old_open_handle;
981 it.it_flags = fmode | open_flags;
982 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
983 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
984 &ll_md_blocking_lease_ast,
985 /* LDLM_FL_NO_LRU: do not put the lease lock into the LRU list, otherwise
986  * it can be cancelled, which may mislead applications into thinking the
987  * lease is broken;
988  * LDLM_FL_EXCL: set this flag so that it won't be matched by a normal
989  * open in ll_md_blocking_ast(). Otherwise, since ll_md_blocking_lease_ast()
990  * doesn't deal with the openhandle, the normal openhandle would be leaked. */
991 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
992 ll_finish_md_op_data(op_data);
993 ptlrpc_req_finished(req);
995 GOTO(out_release_it, rc);
997 if (it_disposition(&it, DISP_LOOKUP_NEG))
998 GOTO(out_release_it, rc = -ENOENT);
1000 rc = it_open_error(DISP_OPEN_OPEN, &it);
1002 GOTO(out_release_it, rc);
1004 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
1005 ll_och_fill(sbi->ll_md_exp, &it, och);
1007 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
1008 GOTO(out_close, rc = -EOPNOTSUPP);
1010 /* the lease is already acquired; handle the lease lock */
1011 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
1012 if (it.it_lock_mode == 0 ||
1013 it.it_lock_bits != MDS_INODELOCK_OPEN) {
1014 /* an open lock must be returned for a lease */
1015 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
1016 PFID(ll_inode2fid(inode)), it.it_lock_mode,
1018 GOTO(out_close, rc = -EPROTO);
1021 ll_intent_release(&it);
1025 /* Cancel open lock */
1026 if (it.it_lock_mode != 0) {
1027 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
1029 it.it_lock_mode = 0;
1030 och->och_lease_handle.cookie = 0ULL;
1032 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
1034 CERROR("%s: error closing file "DFID": %d\n",
1035 ll_get_fsname(inode->i_sb, NULL, 0),
1036 PFID(&ll_i2info(inode)->lli_fid), rc2);
1037 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
1039 ll_intent_release(&it);
1043 RETURN(ERR_PTR(rc));
1047 * Check whether a layout swap can be done between two inodes.
1049 * \param[in] inode1 First inode to check
1050 * \param[in] inode2 Second inode to check
1052 * \retval 0 on success, layout swap can be performed between both inodes
1053 * \retval negative error code if requirements are not met
1055 static int ll_check_swap_layouts_validity(struct inode *inode1,
1056 struct inode *inode2)
1058 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
1061 if (inode_permission(inode1, MAY_WRITE) ||
1062 inode_permission(inode2, MAY_WRITE))
1065 if (inode1->i_sb != inode2->i_sb)
1071 static int ll_swap_layouts_close(struct obd_client_handle *och,
1072 struct inode *inode, struct inode *inode2)
1074 const struct lu_fid *fid1 = ll_inode2fid(inode);
1075 const struct lu_fid *fid2;
1079 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
1080 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
1082 rc = ll_check_swap_layouts_validity(inode, inode2);
1084 GOTO(out_free_och, rc);
1086 /* We now know that inode2 is a lustre inode */
1087 fid2 = ll_inode2fid(inode2);
1089 rc = lu_fid_cmp(fid1, fid2);
1091 GOTO(out_free_och, rc = -EINVAL);
1093 /* Close the file and {swap,merge} layouts between inode & inode2.
1094 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
1095 * because we still need it to pack l_remote_handle to MDT. */
1096 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
1099 och = NULL; /* freed in ll_close_inode_openhandle() */
1109 * Release the lease and close the file.
1110 * It will check whether the lease was ever broken.
1112 static int ll_lease_close_intent(struct obd_client_handle *och,
1113 struct inode *inode,
1114 bool *lease_broken, enum mds_op_bias bias,
1117 struct ldlm_lock *lock;
1118 bool cancelled = true;
1122 lock = ldlm_handle2lock(&och->och_lease_handle);
1124 lock_res_and_lock(lock);
1125 cancelled = ldlm_is_cancel(lock);
1126 unlock_res_and_lock(lock);
1127 LDLM_LOCK_PUT(lock);
1130 CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1131 PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1133 if (lease_broken != NULL)
1134 *lease_broken = cancelled;
1136 if (!cancelled && !bias)
1137 ldlm_cli_cancel(&och->och_lease_handle, 0);
1139 if (cancelled) { /* no need to execute intent */
1144 rc = ll_close_inode_openhandle(inode, och, bias, data);
1148 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1151 return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1155 * After the lease is taken, send the MDS_REINT_RESYNC RPC to the MDT
1157 static int ll_lease_file_resync(struct obd_client_handle *och,
1158 struct inode *inode, unsigned long arg)
1160 struct ll_sb_info *sbi = ll_i2sbi(inode);
1161 struct md_op_data *op_data;
1162 struct ll_ioc_lease_id ioc;
1163 __u64 data_version_unused;
1167 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1168 LUSTRE_OPC_ANY, NULL);
1169 if (IS_ERR(op_data))
1170 RETURN(PTR_ERR(op_data));
1172 if (copy_from_user(&ioc, (struct ll_ioc_lease_id __user *)arg,
1176 /* before starting file resync, it's necessary to clean up the page cache
1177  * in client memory, otherwise once the layout version is increased,
1178  * writing back cached data will be denied by the OSTs. */
1179 rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1183 op_data->op_lease_handle = och->och_lease_handle;
1184 op_data->op_mirror_id = ioc.lil_mirror_id;
1185 rc = md_file_resync(sbi->ll_md_exp, op_data);
1191 ll_finish_md_op_data(op_data);
1195 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1197 struct ll_inode_info *lli = ll_i2info(inode);
1198 struct cl_object *obj = lli->lli_clob;
1199 struct cl_attr *attr = vvp_env_thread_attr(env);
1207 ll_inode_size_lock(inode);
1209 /* Merge the timestamps most recently obtained from the MDS with
1210  * the timestamps obtained from the OSTs.
1212  * Do not overwrite the inode's atime because it may be refreshed
1213  * by the file_accessed() function. If the read was served from cached
1214  * data, there is no RPC to send, so the atime may not be
1215  * transferred to the OSTs at all. The MDT only updates the atime at close
1216  * time if it's at least 'mdd.*.atime_diff' older.
1217  * All in all, atime in Lustre does not strictly comply with
1218  * POSIX. Solving this problem would require sending an RPC to the MDT for
1219  * each read, which would hurt performance. */
1220 if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
1221 LTIME_S(inode->i_atime) = lli->lli_atime;
1222 lli->lli_update_atime = 0;
1224 LTIME_S(inode->i_mtime) = lli->lli_mtime;
1225 LTIME_S(inode->i_ctime) = lli->lli_ctime;
1227 atime = LTIME_S(inode->i_atime);
1228 mtime = LTIME_S(inode->i_mtime);
1229 ctime = LTIME_S(inode->i_ctime);
1231 cl_object_attr_lock(obj);
1232 if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1235 rc = cl_object_attr_get(env, obj, attr);
1236 cl_object_attr_unlock(obj);
1239 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1241 if (atime < attr->cat_atime)
1242 atime = attr->cat_atime;
1244 if (ctime < attr->cat_ctime)
1245 ctime = attr->cat_ctime;
1247 if (mtime < attr->cat_mtime)
1248 mtime = attr->cat_mtime;
1250 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1251 PFID(&lli->lli_fid), attr->cat_size);
1253 i_size_write(inode, attr->cat_size);
1254 inode->i_blocks = attr->cat_blocks;
1256 LTIME_S(inode->i_atime) = atime;
1257 LTIME_S(inode->i_mtime) = mtime;
1258 LTIME_S(inode->i_ctime) = ctime;
1261 ll_inode_size_unlock(inode);
1267 * Set the designated mirror for I/O.
1269 * So far only read, write, and truncate can issue I/O to a
1270 * designated mirror.
1272 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1274 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1276 /* clear the layout version for generic (non-resync) I/O in case it carries
1277  * a stale layout version due to an I/O restart */
1278 io->ci_layout_version = 0;
1280 /* FLR: disable non-delay for designated mirror I/O because obviously
1281 * only one mirror is available */
1282 if (fd->fd_designated_mirror > 0) {
1284 io->ci_designated_mirror = fd->fd_designated_mirror;
1285 io->ci_layout_version = fd->fd_layout_version;
1288 CDEBUG(D_VFSTRACE, "%s: designated mirror: %d\n",
1289 file->f_path.dentry->d_name.name, io->ci_designated_mirror);
1292 static bool file_is_noatime(const struct file *file)
1294 const struct vfsmount *mnt = file->f_path.mnt;
1295 const struct inode *inode = file_inode((struct file *)file);
1297 /* Adapted from file_accessed() and touch_atime().*/
1298 if (file->f_flags & O_NOATIME)
1301 if (inode->i_flags & S_NOATIME)
1304 if (IS_NOATIME(inode))
1307 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1310 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1313 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1319 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1321 struct inode *inode = file_inode(file);
1322 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1324 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1325 io->ci_lock_no_expand = fd->ll_lock_no_expand;
1327 if (iot == CIT_WRITE) {
1328 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1329 io->u.ci_wr.wr_sync = !!(file->f_flags & O_SYNC ||
1330 file->f_flags & O_DIRECT ||
1333 io->ci_obj = ll_i2info(inode)->lli_clob;
1334 io->ci_lockreq = CILR_MAYBE;
1335 if (ll_file_nolock(file)) {
1336 io->ci_lockreq = CILR_NEVER;
1337 io->ci_no_srvlock = 1;
1338 } else if (file->f_flags & O_APPEND) {
1339 io->ci_lockreq = CILR_MANDATORY;
1341 io->ci_noatime = file_is_noatime(file);
1343 /* FLR: only use non-delay I/O for read, as there is only one
1344  * available mirror for write. */
1345 io->ci_ndelay = !(iot == CIT_WRITE);
1347 ll_io_set_mirror(io, file);
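/* Common back end for reads and writes: set up a cl_io for the range,
 * take the range lock for writes and for O_DIRECT reads (unless the file
 * is group-locked), run the cl_io loop, and restart the IO as long as
 * ci_need_restart is set (e.g. FLR mirror retry), accumulating the result. */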
1351 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1352 struct file *file, enum cl_io_type iot,
1353 loff_t *ppos, size_t count)
1355 struct vvp_io *vio = vvp_env_io(env);
1356 struct inode *inode = file_inode(file);
1357 struct ll_inode_info *lli = ll_i2info(inode);
1358 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1359 struct range_lock range;
1363 unsigned retried = 0;
1364 bool restarted = false;
1368 CDEBUG(D_VFSTRACE, "%s: %s ppos: %llu, count: %zu\n",
1369 file_dentry(file)->d_name.name,
1370 iot == CIT_READ ? "read" : "write", *ppos, count);
1373 io = vvp_env_thread_io(env);
1374 ll_io_init(io, file, iot);
1375 io->ci_ndelay_tried = retried;
1377 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1378 bool range_locked = false;
1380 if (file->f_flags & O_APPEND)
1381 range_lock_init(&range, 0, LUSTRE_EOF);
1383 range_lock_init(&range, *ppos, *ppos + count - 1);
1385 vio->vui_fd = LUSTRE_FPRIVATE(file);
1386 vio->vui_io_subtype = args->via_io_subtype;
1388 switch (vio->vui_io_subtype) {
1390 vio->vui_iter = args->u.normal.via_iter;
1391 vio->vui_iocb = args->u.normal.via_iocb;
1392 /* Direct IO reads must also take the range lock,
1393  * or multiple reads will try to work on the same pages.
1394  * See LU-6227 for details. */
1395 if (((iot == CIT_WRITE) ||
1396 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1397 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1398 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1400 rc = range_lock(&lli->lli_write_tree, &range);
1404 range_locked = true;
1408 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1409 vio->u.splice.vui_flags = args->u.splice.via_flags;
1412 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1416 ll_cl_add(file, env, io, LCC_RW);
1417 rc = cl_io_loop(env, io);
1418 ll_cl_remove(file, env);
1421 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1423 range_unlock(&lli->lli_write_tree, &range);
1426 /* cl_io_rw_init() handled IO */
1430 if (io->ci_nob > 0) {
1431 result += io->ci_nob;
1432 count -= io->ci_nob;
1433 *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1435 /* prepare IO restart */
1436 if (count > 0 && args->via_io_subtype == IO_NORMAL)
1437 args->u.normal.via_iter = vio->vui_iter;
1440 cl_io_fini(env, io);
1443 "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1444 file->f_path.dentry->d_name.name,
1445 iot, rc, result, io->ci_need_restart);
1447 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1449 "%s: restart %s from %lld, count: %zu, ret: %zd, rc: %d\n",
1450 file_dentry(file)->d_name.name,
1451 iot == CIT_READ ? "read" : "write",
1452 *ppos, count, result, rc);
1453 /* preserve the tried count for FLR */
1454 retried = io->ci_ndelay_tried;
1459 if (iot == CIT_READ) {
1461 ll_stats_ops_tally(ll_i2sbi(inode),
1462 LPROC_LL_READ_BYTES, result);
1463 } else if (iot == CIT_WRITE) {
1465 ll_stats_ops_tally(ll_i2sbi(inode),
1466 LPROC_LL_WRITE_BYTES, result);
1467 fd->fd_write_failed = false;
1468 } else if (result == 0 && rc == 0) {
1471 fd->fd_write_failed = true;
1473 fd->fd_write_failed = false;
1474 } else if (rc != -ERESTARTSYS) {
1475 fd->fd_write_failed = true;
1479 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1481 RETURN(result > 0 ? result : rc);
1485 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1486 * especially for small I/O.
1488 * To serve a read request, CLIO has to create and initialize a cl_io and
1489 * then request a DLM lock. This has turned out to have significant overhead
1490 * and affects the performance of small I/O dramatically.
1492 * It's not necessary to create a cl_io for each I/O. Under the help of read
1493 * ahead, most of the pages being read are already in memory cache and we can
1494 * read those pages directly because if the pages exist, the corresponding DLM
1495 * lock must exist so that page content must be valid.
1497 * In the fast read implementation, llite speculatively finds and reads pages
1498 * in memory cache. There are three scenarios for fast read:
1499 * - If the page exists and is uptodate, kernel VM will provide the data and
1500 * CLIO won't be intervened;
1501 * - If the page was brought into memory by read ahead, it will be exported
1502 * and read ahead parameters will be updated;
1503 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1504 * it will go back and invoke normal read, i.e., a cl_io will be created
1505 * and DLM lock will be requested.
1507 * POSIX compliance: posix standard states that read is intended to be atomic.
1508 * Lustre read implementation is in line with Linux kernel read implementation
1509 * and neither of them complies with POSIX standard in this matter. Fast read
1510 * doesn't make the situation worse on single node but it may interleave write
1511 * results from multiple nodes due to short read handling in ll_file_aio_read().
1513 * \param env - lu_env
1514 * \param iocb - kiocb from kernel
1515 * \param iter - user space buffers where the data will be copied
1517 * \retval - number of bytes that have been read, or an error code if an error occurred.
1520 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1524 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1527 /* NB: we can't do direct IO for fast read because it will need a lock
1528 * to make IO engine happy. */
1529 if (iocb->ki_filp->f_flags & O_DIRECT)
1532 result = generic_file_read_iter(iocb, iter);
1534 /* If the first page is not in cache, generic_file_read_iter() will
1535  * return -ENODATA.
1536  * See the corresponding code in ll_readpage(). */
1537 if (result == -ENODATA)
1541 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1542 LPROC_LL_READ_BYTES, result);
1548 * Read from a file (through the page cache).
1550 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1553 struct vvp_io_args *args;
1558 result = ll_do_fast_read(iocb, to);
1559 if (result < 0 || iov_iter_count(to) == 0)
1562 env = cl_env_get(&refcheck);
1564 return PTR_ERR(env);
1566 args = ll_env_args(env, IO_NORMAL);
1567 args->u.normal.via_iter = to;
1568 args->u.normal.via_iocb = iocb;
1570 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1571 &iocb->ki_pos, iov_iter_count(to));
1574 else if (result == 0)
1577 cl_env_put(env, &refcheck);
1583 * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1584 * If a page is already in the page cache and dirty (and some other things -
1585 * See ll_tiny_write_begin for the instantiation of these rules), then we can
1586 * write to it without doing a full I/O, because Lustre already knows about it
1587 * and will write it out. This saves a lot of processing time.
1589 * All writes here are within one page, so exclusion is handled by the page
1590 * lock on the vm page. We do not do tiny writes for writes which touch
1591 * multiple pages because it's very unlikely that multiple sequential pages
1592 * are already dirty.
1594 * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1595 * and are unlikely to target already-dirty pages.
1597 * Attribute updates are important here, we do them in ll_tiny_write_end.
1599 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1601 ssize_t count = iov_iter_count(iter);
1602 struct file *file = iocb->ki_filp;
1603 struct inode *inode = file_inode(file);
1608 /* Restrict writes to single page and < PAGE_SIZE. See comment at top
1609 * of function for why.
1611 if (count >= PAGE_SIZE ||
1612 (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
1615 result = __generic_file_write_iter(iocb, iter);
1617 /* If the page is not already dirty, ll_tiny_write_begin returns
1618 * -ENODATA. We continue on to normal write.
1620 if (result == -ENODATA)
1624 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1626 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1629 CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1635 * Write to a file (through the page cache).
1637 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1639 struct vvp_io_args *args;
1641 ssize_t rc_tiny = 0, rc_normal;
1646 /* NB: we can't do direct IO for tiny writes because they use the page
1647 * cache, we can't do sync writes because tiny writes can't flush
1648 * pages, and we can't do append writes because we can't guarantee the
1649 * required DLM locks are held to protect file size.
1651 if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(iocb->ki_filp))) &&
1652 !(iocb->ki_filp->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1653 rc_tiny = ll_do_tiny_write(iocb, from);
1655 /* In case of error, go on and try the normal write; only stop if the tiny
1656  * write completed the I/O.
1658 if (iov_iter_count(from) == 0)
1659 GOTO(out, rc_normal = rc_tiny);
1661 env = cl_env_get(&refcheck);
1663 return PTR_ERR(env);
1665 args = ll_env_args(env, IO_NORMAL);
1666 args->u.normal.via_iter = from;
1667 args->u.normal.via_iocb = iocb;
1669 rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1670 &iocb->ki_pos, iov_iter_count(from));
1672 /* On success, combine bytes written. */
1673 if (rc_tiny >= 0 && rc_normal > 0)
1674 rc_normal += rc_tiny;
1675 /* On error, only return error from normal write if tiny write did not
1676 * write any bytes. Otherwise return bytes written by tiny write.
1678 else if (rc_tiny > 0)
1679 rc_normal = rc_tiny;
1681 cl_env_put(env, &refcheck);
1686 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1688 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1690 static int ll_file_get_iov_count(const struct iovec *iov,
1691 unsigned long *nr_segs, size_t *count)
1696 for (seg = 0; seg < *nr_segs; seg++) {
1697 const struct iovec *iv = &iov[seg];
1700 * If any segment has a negative length, or the cumulative
1701 * length ever wraps negative then return -EINVAL.
1704 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1706 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1711 cnt -= iv->iov_len; /* This segment is no good */
1718 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1719 unsigned long nr_segs, loff_t pos)
1726 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1730 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1731 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1732 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1733 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1734 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1736 result = ll_file_read_iter(iocb, &to);
1741 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1744 struct iovec iov = { .iov_base = buf, .iov_len = count };
1749 init_sync_kiocb(&kiocb, file);
1750 kiocb.ki_pos = *ppos;
1751 #ifdef HAVE_KIOCB_KI_LEFT
1752 kiocb.ki_left = count;
1753 #elif defined(HAVE_KI_NBYTES)
1754 kiocb.ki_nbytes = count;
1757 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1758 *ppos = kiocb.ki_pos;
1764 * Write to a file (through the page cache).
1767 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1768 unsigned long nr_segs, loff_t pos)
1770 struct iov_iter from;
1775 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1779 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1780 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1781 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1782 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1783 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1785 result = ll_file_write_iter(iocb, &from);
1790 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1791 size_t count, loff_t *ppos)
1793 struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count };
1800 init_sync_kiocb(&kiocb, file);
1801 kiocb.ki_pos = *ppos;
1802 #ifdef HAVE_KIOCB_KI_LEFT
1803 kiocb.ki_left = count;
1804 #elif defined(HAVE_KI_NBYTES)
1805 kiocb.ki_nbytes = count;
1808 result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1809 *ppos = kiocb.ki_pos;
1813 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1816 * Send file content (through pagecache) somewhere with helper
1818 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1819 struct pipe_inode_info *pipe, size_t count,
1823 struct vvp_io_args *args;
1828 env = cl_env_get(&refcheck);
1830 RETURN(PTR_ERR(env));
1832 args = ll_env_args(env, IO_SPLICE);
1833 args->u.splice.via_pipe = pipe;
1834 args->u.splice.via_flags = flags;
1836 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1837 cl_env_put(env, &refcheck);
1841 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1842 __u64 flags, struct lov_user_md *lum, int lum_size)
1844 struct lookup_intent oit = {
1846 .it_flags = flags | MDS_OPEN_BY_FID,
1851 ll_inode_size_lock(inode);
1852 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1854 GOTO(out_unlock, rc);
1856 ll_release_openhandle(dentry, &oit);
1859 ll_inode_size_unlock(inode);
1860 ll_intent_release(&oit);
1865 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1866 struct lov_mds_md **lmmp, int *lmm_size,
1867 struct ptlrpc_request **request)
1869 struct ll_sb_info *sbi = ll_i2sbi(inode);
1870 struct mdt_body *body;
1871 struct lov_mds_md *lmm = NULL;
1872 struct ptlrpc_request *req = NULL;
1873 struct md_op_data *op_data;
1876 rc = ll_get_default_mdsize(sbi, &lmmsize);
1880 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1881 strlen(filename), lmmsize,
1882 LUSTRE_OPC_ANY, NULL);
1883 if (IS_ERR(op_data))
1884 RETURN(PTR_ERR(op_data));
1886 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1887 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1888 ll_finish_md_op_data(op_data);
1890 CDEBUG(D_INFO, "md_getattr_name failed "
1891 "on %s: rc %d\n", filename, rc);
1895 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1896 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1898 lmmsize = body->mbo_eadatasize;
1900 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1902 GOTO(out, rc = -ENODATA);
1905 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1906 LASSERT(lmm != NULL);
1908 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
1909 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
1910 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
1911 GOTO(out, rc = -EPROTO);
1914 * This is coming from the MDS, so is probably in
1915 * little endian. We convert it to host endian before
1916 * passing it to userspace.
1918 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1921 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
1922 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1923 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1924 if (le32_to_cpu(lmm->lmm_pattern) &
1925 LOV_PATTERN_F_RELEASED)
1929 /* if the function is called for a directory, we should
1930  * avoid swabbing non-existent lsm objects */
1931 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1932 lustre_swab_lov_user_md_v1(
1933 (struct lov_user_md_v1 *)lmm);
1934 if (S_ISREG(body->mbo_mode))
1935 lustre_swab_lov_user_md_objects(
1936 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1938 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1939 lustre_swab_lov_user_md_v3(
1940 (struct lov_user_md_v3 *)lmm);
1941 if (S_ISREG(body->mbo_mode))
1942 lustre_swab_lov_user_md_objects(
1943 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1945 } else if (lmm->lmm_magic ==
1946 cpu_to_le32(LOV_MAGIC_COMP_V1)) {
1947 lustre_swab_lov_comp_md_v1(
1948 (struct lov_comp_md_v1 *)lmm);
1954 *lmm_size = lmmsize;
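/* Admin-only variant of setstripe: pass a raw LOV EA, which may reference
 * already existing objects (MDS_OPEN_HAS_OBJS), to ll_lov_setstripe_ea_info(). */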
1959 static int ll_lov_setea(struct inode *inode, struct file *file,
1962 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1963 struct lov_user_md *lump;
1964 int lum_size = sizeof(struct lov_user_md) +
1965 sizeof(struct lov_user_ost_data);
1969 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1972 OBD_ALLOC_LARGE(lump, lum_size);
1976 if (copy_from_user(lump, arg, lum_size))
1977 GOTO(out_lump, rc = -EFAULT);
1979 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
1981 cl_lov_delay_create_clear(&file->f_flags);
1984 OBD_FREE_LARGE(lump, lum_size);
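/* Return the file's striping information to the user buffer @lum via the
 * cl_object layer. */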
1988 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
1995 env = cl_env_get(&refcheck);
1997 RETURN(PTR_ERR(env));
1999 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2000 cl_env_put(env, &refcheck);
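/* Set the file's striping from a user-supplied lov_user_md: apply it through
 * ll_lov_setstripe_ea_info(), refresh the layout generation, and copy the
 * resulting striping back to user space. */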
2004 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2007 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2008 struct lov_user_md *klum;
2010 __u64 flags = FMODE_WRITE;
2013 rc = ll_copy_user_md(lum, &klum);
2018 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
2023 rc = put_user(0, &lum->lmm_stripe_count);
2027 rc = ll_layout_refresh(inode, &gen);
2031 rc = ll_file_getstripe(inode, arg, lum_size);
2033 cl_lov_delay_create_clear(&file->f_flags);
2036 OBD_FREE(klum, lum_size);
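/* Take a group lock with group id @arg for this open file. Only one group
 * lock may be held per descriptor, and a composite (PFL) layout is fully
 * instantiated first so that the lock covers all OST objects. */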
2041 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2043 struct ll_inode_info *lli = ll_i2info(inode);
2044 struct cl_object *obj = lli->lli_clob;
2045 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2046 struct ll_grouplock grouplock;
2051 CWARN("group id for group lock must not be 0\n");
2055 if (ll_file_nolock(file))
2056 RETURN(-EOPNOTSUPP);
2058 spin_lock(&lli->lli_lock);
2059 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2060 CWARN("group lock already exists with gid %lu\n",
2061 fd->fd_grouplock.lg_gid);
2062 spin_unlock(&lli->lli_lock);
2065 LASSERT(fd->fd_grouplock.lg_lock == NULL);
2066 spin_unlock(&lli->lli_lock);
2069 * XXX: the group lock needs to protect all OST objects, while PFL
2070 * can add new OST objects during the IO, so we instantiate
2071 * all OST objects before taking the group lock.
2076 struct cl_layout cl = {
2077 .cl_is_composite = false,
2079 struct lu_extent ext = {
2081 .e_end = OBD_OBJECT_EOF,
2084 env = cl_env_get(&refcheck);
2086 RETURN(PTR_ERR(env));
2088 rc = cl_object_layout_get(env, obj, &cl);
2089 if (!rc && cl.cl_is_composite)
2090 rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2093 cl_env_put(env, &refcheck);
2098 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2099 arg, (file->f_flags & O_NONBLOCK), &grouplock);
2103 spin_lock(&lli->lli_lock);
2104 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2105 spin_unlock(&lli->lli_lock);
2106 CERROR("another thread just won the race\n");
2107 cl_put_grouplock(&grouplock);
2111 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2112 fd->fd_grouplock = grouplock;
2113 spin_unlock(&lli->lli_lock);
2115 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
2119 static int ll_put_grouplock(struct inode *inode, struct file *file,
2122 struct ll_inode_info *lli = ll_i2info(inode);
2123 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2124 struct ll_grouplock grouplock;
2127 spin_lock(&lli->lli_lock);
2128 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2129 spin_unlock(&lli->lli_lock);
2130 CWARN("no group lock held\n");
2134 LASSERT(fd->fd_grouplock.lg_lock != NULL);
2136 if (fd->fd_grouplock.lg_gid != arg) {
2137 CWARN("group lock %lu doesn't match current id %lu\n",
2138 arg, fd->fd_grouplock.lg_gid);
2139 spin_unlock(&lli->lli_lock);
2143 grouplock = fd->fd_grouplock;
2144 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2145 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2146 spin_unlock(&lli->lli_lock);
2148 cl_put_grouplock(&grouplock);
2149 CDEBUG(D_INFO, "group lock %lu released\n", arg);
2154 * Close inode open handle
2156 * \param dentry [in] dentry which contains the inode
2157 * \param it [in,out] intent which contains open info and result
2160 * \retval <0 failure
2162 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2164 struct inode *inode = dentry->d_inode;
2165 struct obd_client_handle *och;
2171 /* Root ? Do nothing. */
2172 if (dentry->d_inode->i_sb->s_root == dentry)
2175 /* No open handle to close? Move away */
2176 if (!it_disposition(it, DISP_OPEN_OPEN))
2179 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2181 OBD_ALLOC(och, sizeof(*och));
2183 GOTO(out, rc = -ENOMEM);
2185 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2187 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2189 /* this one is in place of ll_file_open */
2190 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2191 ptlrpc_req_finished(it->it_request);
2192 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2198 * Get the size of the inode for which the FIEMAP mapping is requested.
2199 * Make the FIEMAP get_info call and return the result.
2200 * \param fiemap kernel buffer to hold extents
2201 * \param num_bytes kernel buffer size
2203 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2209 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2212 /* Checks for fiemap flags */
2213 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2214 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2218 /* Check for FIEMAP_FLAG_SYNC */
2219 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2220 rc = filemap_fdatawrite(inode->i_mapping);
2225 env = cl_env_get(&refcheck);
2227 RETURN(PTR_ERR(env));
2229 if (i_size_read(inode) == 0) {
2230 rc = ll_glimpse_size(inode);
2235 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2236 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2237 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2239 /* If filesize is 0, then there would be no objects for mapping */
2240 if (fmkey.lfik_oa.o_size == 0) {
2241 fiemap->fm_mapped_extents = 0;
2245 fmkey.lfik_fiemap = *fiemap;
2247 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2248 &fmkey, fiemap, &num_bytes);
2250 cl_env_put(env, &refcheck);
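/* Resolve this inode's FID to a path name on the MDT (OBD_IOC_FID2PATH) and
 * copy the result back to the user buffer. Restricted to privileged users
 * unless the filesystem allows user fid2path. */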
2254 int ll_fid2path(struct inode *inode, void __user *arg)
2256 struct obd_export *exp = ll_i2mdexp(inode);
2257 const struct getinfo_fid2path __user *gfin = arg;
2259 struct getinfo_fid2path *gfout;
2265 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2266 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2269 /* Only need to get the buflen */
2270 if (get_user(pathlen, &gfin->gf_pathlen))
2273 if (pathlen > PATH_MAX)
2276 outsize = sizeof(*gfout) + pathlen;
2277 OBD_ALLOC(gfout, outsize);
2281 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2282 GOTO(gf_free, rc = -EFAULT);
2283 /* append the root FID after gfout to let the MDT know the root FID so
2284 * that it can look up the correct path; this is mainly for fileset.
2285 * An old server without fileset mount support will ignore this. */
2286 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2288 /* Call mdc_iocontrol */
2289 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2293 if (copy_to_user(arg, gfout, outsize))
2297 OBD_FREE(gfout, outsize);
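/*
 * Fetch the current data version and layout version of the file by running
 * a CIT_DATA_VERSION cl_io against its object, honouring the flush flags in
 * ioc->idv_flags; the io is retried if it asks for a restart.
 */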
2302 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2304 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2312 ioc->idv_version = 0;
2313 ioc->idv_layout_version = UINT_MAX;
2315 /* If no file object has been initialized, we consider its version to be 0. */
2319 env = cl_env_get(&refcheck);
2321 RETURN(PTR_ERR(env));
2323 io = vvp_env_thread_io(env);
2325 io->u.ci_data_version.dv_data_version = 0;
2326 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2327 io->u.ci_data_version.dv_flags = ioc->idv_flags;
2330 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2331 result = cl_io_loop(env, io);
2333 result = io->ci_result;
2335 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2336 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2338 cl_io_fini(env, io);
2340 if (unlikely(io->ci_need_restart))
2343 cl_env_put(env, &refcheck);
2349 * Read the data_version for the inode.
2351 * This value is computed using the stripe object version on the OST.
2352 * The version is computed using server side locking.
2354 * @param flags whether to sync on the OST side;
2356 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2357 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
2359 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2361 struct ioc_data_version ioc = { .idv_flags = flags };
2364 rc = ll_ioc_data_version(inode, &ioc);
2366 *data_version = ioc.idv_version;
2372 * Trigger a HSM release request for the provided inode.
2374 int ll_hsm_release(struct inode *inode)
2377 struct obd_client_handle *och = NULL;
2378 __u64 data_version = 0;
2383 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2384 ll_get_fsname(inode->i_sb, NULL, 0),
2385 PFID(&ll_i2info(inode)->lli_fid));
2387 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2389 GOTO(out, rc = PTR_ERR(och));
2391 /* Grab latest data_version and [am]time values */
2392 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2396 env = cl_env_get(&refcheck);
2398 GOTO(out, rc = PTR_ERR(env));
2400 rc = ll_merge_attr(env, inode);
2401 cl_env_put(env, &refcheck);
2403 /* If an error happens, we have the wrong size for the file.
2409 /* Release the file.
2410 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2411 * we still need it to pack l_remote_handle to MDT. */
2412 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2418 if (och != NULL && !IS_ERR(och)) /* close the file */
2419 ll_lease_close(och, inode, NULL);
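/*
 * State kept while swapping the layouts of two files: the inodes involved,
 * the data versions to compare and whether each of them has to be checked.
 */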
2424 struct ll_swap_stack {
2427 struct inode *inode1;
2428 struct inode *inode2;
2433 static int ll_swap_layouts(struct file *file1, struct file *file2,
2434 struct lustre_swap_layouts *lsl)
2436 struct mdc_swap_layouts msl;
2437 struct md_op_data *op_data;
2440 struct ll_swap_stack *llss = NULL;
2443 OBD_ALLOC_PTR(llss);
2447 llss->inode1 = file_inode(file1);
2448 llss->inode2 = file_inode(file2);
2450 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2454 /* we use 2 bools because they are easier to swap than 2 bits */
2455 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2456 llss->check_dv1 = true;
2458 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2459 llss->check_dv2 = true;
2461 /* we cannot use lsl->sl_dvX directly because we may swap them */
2462 llss->dv1 = lsl->sl_dv1;
2463 llss->dv2 = lsl->sl_dv2;
2465 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2466 if (rc == 0) /* same file, done! */
2469 if (rc < 0) { /* sequentialize it */
2470 swap(llss->inode1, llss->inode2);
2472 swap(llss->dv1, llss->dv2);
2473 swap(llss->check_dv1, llss->check_dv2);
2477 if (gid != 0) { /* application asks to flush dirty cache */
2478 rc = ll_get_grouplock(llss->inode1, file1, gid);
2482 rc = ll_get_grouplock(llss->inode2, file2, gid);
2484 ll_put_grouplock(llss->inode1, file1, gid);
2489 /* ultimate check: before swapping the layouts we check whether the
2490 * data version has changed (if requested) */
2491 if (llss->check_dv1) {
2492 rc = ll_data_version(llss->inode1, &dv, 0);
2495 if (dv != llss->dv1)
2496 GOTO(putgl, rc = -EAGAIN);
2499 if (llss->check_dv2) {
2500 rc = ll_data_version(llss->inode2, &dv, 0);
2503 if (dv != llss->dv2)
2504 GOTO(putgl, rc = -EAGAIN);
2507 /* struct md_op_data is used to send the swap args to the mdt;
2508 * only the flags are missing, so we use struct mdc_swap_layouts
2509 * through md_op_data->op_data */
2510 /* flags from user space have to be converted before they are sent to
2511 * the server; no flag is sent today, they are only used on the client */
2514 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2515 0, LUSTRE_OPC_ANY, &msl);
2516 if (IS_ERR(op_data))
2517 GOTO(free, rc = PTR_ERR(op_data));
2519 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2520 sizeof(*op_data), op_data, NULL);
2521 ll_finish_md_op_data(op_data);
2528 ll_put_grouplock(llss->inode2, file2, gid);
2529 ll_put_grouplock(llss->inode1, file1, gid);
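/**
 * Set and/or clear HSM flags on a file.
 *
 * The set/clear masks and the archive id are validated before the request
 * is sent to the MDT through the LL_IOC_HSM_STATE_SET path.
 */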
2539 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2541 struct obd_export *exp = ll_i2mdexp(inode);
2542 struct md_op_data *op_data;
2546 /* Detect out-of-range masks */
2547 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2550 /* Non-root users are forbidden to set or clear flags which are
2551 * NOT defined in HSM_USER_MASK. */
2552 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2553 !cfs_capable(CFS_CAP_SYS_ADMIN))
2556 if (!exp_connect_archive_id_array(exp)) {
2557 /* Detect an out-of-range archive id */
2558 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2559 (hss->hss_archive_id > LL_HSM_ORIGIN_MAX_ARCHIVE))
2563 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2564 LUSTRE_OPC_ANY, hss);
2565 if (IS_ERR(op_data))
2566 RETURN(PTR_ERR(op_data));
2568 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, exp, sizeof(*op_data),
2571 ll_finish_md_op_data(op_data);
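/**
 * Import an already archived file: mark it HS_ARCHIVED | HS_EXISTS |
 * HS_RELEASED and restore its mode, owner, size and timestamps from the
 * hsm_user_import request. Only regular files are accepted.
 */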
2576 static int ll_hsm_import(struct inode *inode, struct file *file,
2577 struct hsm_user_import *hui)
2579 struct hsm_state_set *hss = NULL;
2580 struct iattr *attr = NULL;
2584 if (!S_ISREG(inode->i_mode))
2590 GOTO(out, rc = -ENOMEM);
2592 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2593 hss->hss_archive_id = hui->hui_archive_id;
2594 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2595 rc = ll_hsm_state_set(inode, hss);
2599 OBD_ALLOC_PTR(attr);
2601 GOTO(out, rc = -ENOMEM);
2603 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2604 attr->ia_mode |= S_IFREG;
2605 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2606 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2607 attr->ia_size = hui->hui_size;
2608 attr->ia_mtime.tv_sec = hui->hui_mtime;
2609 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2610 attr->ia_atime.tv_sec = hui->hui_atime;
2611 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2613 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2614 ATTR_UID | ATTR_GID |
2615 ATTR_MTIME | ATTR_MTIME_SET |
2616 ATTR_ATIME | ATTR_ATIME_SET;
2620 rc = ll_setattr_raw(file_dentry(file), attr, 0, true);
2624 inode_unlock(inode);
2636 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2638 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2639 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
2642 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2644 struct inode *inode = file_inode(file);
2646 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2647 ATTR_MTIME | ATTR_MTIME_SET |
2650 .tv_sec = lfu->lfu_atime_sec,
2651 .tv_nsec = lfu->lfu_atime_nsec,
2654 .tv_sec = lfu->lfu_mtime_sec,
2655 .tv_nsec = lfu->lfu_mtime_nsec,
2658 .tv_sec = lfu->lfu_ctime_sec,
2659 .tv_nsec = lfu->lfu_ctime_nsec,
2665 if (!capable(CAP_SYS_ADMIN))
2668 if (!S_ISREG(inode->i_mode))
2672 rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET,
2674 inode_unlock(inode);
2679 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2682 case MODE_READ_USER:
2684 case MODE_WRITE_USER:
2691 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2693 /* Used to allow the upper layers of the client to request an LDLM lock
2694 * without doing an actual read or write.
2696 * Used for ladvise lockahead to manually request specific locks.
2698 * \param[in] file file this ladvise lock request is on
2699 * \param[in] ladvise ladvise struct describing this lock request
2701 * \retval 0 success, no detailed result available (sync requests
2702 * and requests sent to the server [not handled locally]
2703 * cannot return detailed results)
2704 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2705 * see definitions for details.
2706 * \retval negative negative errno on error
2708 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2710 struct lu_env *env = NULL;
2711 struct cl_io *io = NULL;
2712 struct cl_lock *lock = NULL;
2713 struct cl_lock_descr *descr = NULL;
2714 struct dentry *dentry = file->f_path.dentry;
2715 struct inode *inode = dentry->d_inode;
2716 enum cl_lock_mode cl_mode;
2717 off_t start = ladvise->lla_start;
2718 off_t end = ladvise->lla_end;
2724 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2725 "start=%llu, end=%llu\n", dentry->d_name.len,
2726 dentry->d_name.name, dentry->d_inode,
2727 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2730 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2732 GOTO(out, result = cl_mode);
2734 /* Get IO environment */
2735 result = cl_io_get(inode, &env, &io, &refcheck);
2739 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2742 * nothing to do for this io. This currently happens when
2743 * stripe sub-objects are not yet created.
2745 result = io->ci_result;
2746 } else if (result == 0) {
2747 lock = vvp_env_lock(env);
2748 descr = &lock->cll_descr;
2750 descr->cld_obj = io->ci_obj;
2751 /* Convert byte offsets to pages */
2752 descr->cld_start = cl_index(io->ci_obj, start);
2753 descr->cld_end = cl_index(io->ci_obj, end);
2754 descr->cld_mode = cl_mode;
2755 /* CEF_MUST is used because we do not want to convert a
2756 * lockahead request to a lockless lock */
2757 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2760 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2761 descr->cld_enq_flags |= CEF_SPECULATIVE;
2763 result = cl_lock_request(env, io, lock);
2765 /* On success, we need to release the lock */
2767 cl_lock_release(env, lock);
2769 cl_io_fini(env, io);
2770 cl_env_put(env, &refcheck);
2772 /* -ECANCELED indicates a matching lock with a different extent
2773 * was already present, and -EEXIST indicates a matching lock
2774 * on exactly the same extent was already present.
2775 * We convert them to positive values for userspace to make
2776 * recognizing true errors easier.
2777 * Note we can only return these detailed results on async requests,
2778 * as sync requests look the same as i/o requests for locking. */
2779 if (result == -ECANCELED)
2780 result = LLA_RESULT_DIFFERENT;
2781 else if (result == -EEXIST)
2782 result = LLA_RESULT_SAME;
2787 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
2789 static int ll_ladvise_sanity(struct inode *inode,
2790 struct llapi_lu_ladvise *ladvise)
2792 enum lu_ladvise_type advice = ladvise->lla_advice;
2793 /* Note the per-advice flags field is 32 bits wide, so per-advice flags
2794 * must be in the first 32 bits of enum ladvise_flags */
2795 __u32 flags = ladvise->lla_peradvice_flags;
2796 /* 3 lines at 80 characters per line, should be plenty */
2799 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2801 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized, "
2802 "last supported advice is %s (value '%d'): rc = %d\n",
2803 ll_get_fsname(inode->i_sb, NULL, 0), advice,
2804 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2808 /* Per-advice checks */
2810 case LU_LADVISE_LOCKNOEXPAND:
2811 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2813 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2815 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2816 ladvise_names[advice], rc);
2820 case LU_LADVISE_LOCKAHEAD:
2821 /* Currently only READ and WRITE modes can be requested */
2822 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2823 ladvise->lla_lockahead_mode == 0) {
2825 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2827 ll_get_fsname(inode->i_sb, NULL, 0),
2828 ladvise->lla_lockahead_mode,
2829 ladvise_names[advice], rc);
2832 case LU_LADVISE_WILLREAD:
2833 case LU_LADVISE_DONTNEED:
2835 /* Note fall through above - These checks apply to all advices
2836 * except LOCKNOEXPAND */
2837 if (flags & ~LF_DEFAULT_MASK) {
2839 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2841 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2842 ladvise_names[advice], rc);
2845 if (ladvise->lla_start >= ladvise->lla_end) {
2847 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2848 "for %s: rc = %d\n",
2849 ll_get_fsname(inode->i_sb, NULL, 0),
2850 ladvise->lla_start, ladvise->lla_end,
2851 ladvise_names[advice], rc);
2863 * Give file access advices
2865 * The ladvise interface is similar to Linux fadvise() system call, except it
2866 * forwards the advices directly from Lustre client to server. The server side
2867 * code will apply appropriate read-ahead and caching techniques for the
2868 * corresponding files.
2870 * A typical workload for ladvise is e.g. a bunch of different clients are
2871 * doing small random reads of a file, so prefetching pages into OSS cache
2872 * with big linear reads before the random IO is a net benefit. Fetching
2873 * all that data into each client cache with fadvise() may not be, due to
2874 * much more data being sent to the client.
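 *
 * User space reaches this path through the LL_IOC_LADVISE ioctl handled in
 * ll_file_ioctl() below: the caller passes a llapi_ladvise_hdr with
 * lah_magic set to LADVISE_MAGIC, lah_count advices and an array of
 * llapi_lu_ladvise entries describing the ranges and advice types.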
2876 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2877 struct llapi_lu_ladvise *ladvise)
2881 struct cl_ladvise_io *lio;
2886 env = cl_env_get(&refcheck);
2888 RETURN(PTR_ERR(env));
2890 io = vvp_env_thread_io(env);
2891 io->ci_obj = ll_i2info(inode)->lli_clob;
2893 /* initialize parameters for ladvise */
2894 lio = &io->u.ci_ladvise;
2895 lio->li_start = ladvise->lla_start;
2896 lio->li_end = ladvise->lla_end;
2897 lio->li_fid = ll_inode2fid(inode);
2898 lio->li_advice = ladvise->lla_advice;
2899 lio->li_flags = flags;
2901 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2902 rc = cl_io_loop(env, io);
2906 cl_io_fini(env, io);
2907 cl_env_put(env, &refcheck);
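/*
 * Record whether DLM lock expansion is suppressed for this file; passing
 * LF_UNSET clears the no-expand state set by a previous LOCKNOEXPAND advice.
 */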
2911 static int ll_lock_noexpand(struct file *file, int flags)
2913 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2915 fd->ll_lock_no_expand = !(flags & LF_UNSET);
2920 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
2923 struct fsxattr fsxattr;
2925 if (copy_from_user(&fsxattr,
2926 (const struct fsxattr __user *)arg,
2930 fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
2931 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
2932 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
2933 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
2934 if (copy_to_user((struct fsxattr __user *)arg,
2935 &fsxattr, sizeof(fsxattr)))
2941 int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
2944 * Project Quota ID state is only allowed to change from within the init
2945 * namespace. Enforce that restriction only if we are trying to change
2946 * the quota ID state. Everything else is allowed in user namespaces.
2948 if (current_user_ns() == &init_user_ns)
2951 if (ll_i2info(inode)->lli_projid != fa->fsx_projid)
2954 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) {
2955 if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
2958 if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
2965 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
2969 struct md_op_data *op_data;
2970 struct ptlrpc_request *req = NULL;
2972 struct fsxattr fsxattr;
2973 struct cl_object *obj;
2977 if (copy_from_user(&fsxattr,
2978 (const struct fsxattr __user *)arg,
2982 rc = ll_ioctl_check_project(inode, &fsxattr);
2986 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2987 LUSTRE_OPC_ANY, NULL);
2988 if (IS_ERR(op_data))
2989 RETURN(PTR_ERR(op_data));
2991 flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
2992 op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
2993 if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
2994 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
2995 op_data->op_projid = fsxattr.fsx_projid;
2996 op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS;
2997 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
2999 ptlrpc_req_finished(req);
3001 GOTO(out_fsxattr, rc);
3002 ll_update_inode_flags(inode, op_data->op_attr_flags);
3003 obj = ll_i2info(inode)->lli_clob;
3005 GOTO(out_fsxattr, rc);
3007 OBD_ALLOC_PTR(attr);
3009 GOTO(out_fsxattr, rc = -ENOMEM);
3011 rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS,
3012 fsxattr.fsx_xflags);
3015 ll_finish_md_op_data(op_data);
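/*
 * Give up the lease open handle held on the file and, depending on
 * ioc->lil_flags, attach a close intent: resync done, layout merge with
 * another file, or layout split towards a given mirror id.
 */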
3019 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3022 struct inode *inode = file_inode(file);
3023 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3024 struct ll_inode_info *lli = ll_i2info(inode);
3025 struct obd_client_handle *och = NULL;
3026 struct split_param sp;
3029 enum mds_op_bias bias = 0;
3030 struct file *layout_file = NULL;
3032 size_t data_size = 0;
3036 mutex_lock(&lli->lli_och_mutex);
3037 if (fd->fd_lease_och != NULL) {
3038 och = fd->fd_lease_och;
3039 fd->fd_lease_och = NULL;
3041 mutex_unlock(&lli->lli_och_mutex);
3044 GOTO(out, rc = -ENOLCK);
3046 fmode = och->och_flags;
3048 switch (ioc->lil_flags) {
3049 case LL_LEASE_RESYNC_DONE:
3050 if (ioc->lil_count > IOC_IDS_MAX)
3051 GOTO(out, rc = -EINVAL);
3053 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3054 OBD_ALLOC(data, data_size);
3056 GOTO(out, rc = -ENOMEM);
3058 if (copy_from_user(data, (void __user *)arg, data_size))
3059 GOTO(out, rc = -EFAULT);
3061 bias = MDS_CLOSE_RESYNC_DONE;
3063 case LL_LEASE_LAYOUT_MERGE: {
3066 if (ioc->lil_count != 1)
3067 GOTO(out, rc = -EINVAL);
3069 arg += sizeof(*ioc);
3070 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3071 GOTO(out, rc = -EFAULT);
3073 layout_file = fget(fd);
3075 GOTO(out, rc = -EBADF);
3077 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3078 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3079 GOTO(out, rc = -EPERM);
3081 data = file_inode(layout_file);
3082 bias = MDS_CLOSE_LAYOUT_MERGE;
3085 case LL_LEASE_LAYOUT_SPLIT: {
3089 if (ioc->lil_count != 2)
3090 GOTO(out, rc = -EINVAL);
3092 arg += sizeof(*ioc);
3093 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3094 GOTO(out, rc = -EFAULT);
3096 arg += sizeof(__u32);
3097 if (copy_from_user(&mirror_id, (void __user *)arg,
3099 GOTO(out, rc = -EFAULT);
3101 layout_file = fget(fdv);
3103 GOTO(out, rc = -EBADF);
3105 sp.sp_inode = file_inode(layout_file);
3106 sp.sp_mirror_id = (__u16)mirror_id;
3108 bias = MDS_CLOSE_LAYOUT_SPLIT;
3112 /* without close intent */
3116 rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3120 rc = ll_lease_och_release(inode, file);
3129 switch (ioc->lil_flags) {
3130 case LL_LEASE_RESYNC_DONE:
3132 OBD_FREE(data, data_size);
3134 case LL_LEASE_LAYOUT_MERGE:
3135 case LL_LEASE_LAYOUT_SPLIT:
3142 rc = ll_lease_type_from_fmode(fmode);
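/*
 * Take a read or write lease on the file (LL_LEASE_RDLCK/WRLCK) and stash
 * the open handle in fd_lease_och; LL_LEASE_UNLCK is forwarded to
 * ll_file_unlock_lease(). With LL_LEASE_RESYNC the lease is opened with
 * MDS_OPEN_RESYNC and ll_lease_file_resync() is called on it.
 */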
3146 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3149 struct inode *inode = file_inode(file);
3150 struct ll_inode_info *lli = ll_i2info(inode);
3151 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3152 struct obd_client_handle *och = NULL;
3153 __u64 open_flags = 0;
3159 switch (ioc->lil_mode) {
3160 case LL_LEASE_WRLCK:
3161 if (!(file->f_mode & FMODE_WRITE))
3163 fmode = FMODE_WRITE;
3165 case LL_LEASE_RDLCK:
3166 if (!(file->f_mode & FMODE_READ))
3170 case LL_LEASE_UNLCK:
3171 RETURN(ll_file_unlock_lease(file, ioc, arg));
3176 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3178 /* apply for lease */
3179 if (ioc->lil_flags & LL_LEASE_RESYNC)
3180 open_flags = MDS_OPEN_RESYNC;
3181 och = ll_lease_open(inode, file, fmode, open_flags);
3183 RETURN(PTR_ERR(och));
3185 if (ioc->lil_flags & LL_LEASE_RESYNC) {
3186 rc = ll_lease_file_resync(och, inode, arg);
3188 ll_lease_close(och, inode, NULL);
3191 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3193 ll_lease_close(och, inode, NULL);
3199 mutex_lock(&lli->lli_och_mutex);
3200 if (fd->fd_lease_och == NULL) {
3201 fd->fd_lease_och = och;
3204 mutex_unlock(&lli->lli_och_mutex);
3206 /* cannot happen since only an exclusive lease is supported for now */
3207 ll_lease_close(och, inode, &lease_broken);
3214 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3216 struct inode *inode = file_inode(file);
3217 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3221 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3222 PFID(ll_inode2fid(inode)), inode, cmd);
3223 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3225 /* asm-ppc{,64} declares TCGETS, et al. as type 't' not 'T' */
3226 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3230 case LL_IOC_GETFLAGS:
3231 /* Get the current value of the file flags */
3232 return put_user(fd->fd_flags, (int __user *)arg);
3233 case LL_IOC_SETFLAGS:
3234 case LL_IOC_CLRFLAGS:
3235 /* Set or clear specific file flags */
3236 /* XXX This probably needs checks to ensure the flags are
3237 * not abused, and to handle any flag side effects.
3239 if (get_user(flags, (int __user *) arg))
3242 if (cmd == LL_IOC_SETFLAGS) {
3243 if ((flags & LL_FILE_IGNORE_LOCK) &&
3244 !(file->f_flags & O_DIRECT)) {
3245 CERROR("%s: unable to disable locking on "
3246 "non-O_DIRECT file\n", current->comm);
3250 fd->fd_flags |= flags;
3252 fd->fd_flags &= ~flags;
3255 case LL_IOC_LOV_SETSTRIPE:
3256 case LL_IOC_LOV_SETSTRIPE_NEW:
3257 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3258 case LL_IOC_LOV_SETEA:
3259 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3260 case LL_IOC_LOV_SWAP_LAYOUTS: {
3262 struct lustre_swap_layouts lsl;
3264 if (copy_from_user(&lsl, (char __user *)arg,
3265 sizeof(struct lustre_swap_layouts)))
3268 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3271 file2 = fget(lsl.sl_fd);
3275 /* O_WRONLY or O_RDWR */
3276 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3277 GOTO(out, rc = -EPERM);
3279 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3280 struct inode *inode2;
3281 struct ll_inode_info *lli;
3282 struct obd_client_handle *och = NULL;
3284 lli = ll_i2info(inode);
3285 mutex_lock(&lli->lli_och_mutex);
3286 if (fd->fd_lease_och != NULL) {
3287 och = fd->fd_lease_och;
3288 fd->fd_lease_och = NULL;
3290 mutex_unlock(&lli->lli_och_mutex);
3292 GOTO(out, rc = -ENOLCK);
3293 inode2 = file_inode(file2);
3294 rc = ll_swap_layouts_close(och, inode, inode2);
3296 rc = ll_swap_layouts(file, file2, &lsl);
3302 case LL_IOC_LOV_GETSTRIPE:
3303 case LL_IOC_LOV_GETSTRIPE_NEW:
3304 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3305 case FS_IOC_GETFLAGS:
3306 case FS_IOC_SETFLAGS:
3307 RETURN(ll_iocontrol(inode, file, cmd, arg));
3308 case FSFILT_IOC_GETVERSION:
3309 case FS_IOC_GETVERSION:
3310 RETURN(put_user(inode->i_generation, (int __user *)arg));
3311 /* We need to special case any other ioctls we want to handle,
3312 * to send them to the MDS/OST as appropriate and to properly
3313 * network encode the arg field. */
3314 case FS_IOC_SETVERSION:
3317 case LL_IOC_GROUP_LOCK:
3318 RETURN(ll_get_grouplock(inode, file, arg));
3319 case LL_IOC_GROUP_UNLOCK:
3320 RETURN(ll_put_grouplock(inode, file, arg));
3321 case IOC_OBD_STATFS:
3322 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3324 case LL_IOC_FLUSHCTX:
3325 RETURN(ll_flush_ctx(inode));
3326 case LL_IOC_PATH2FID: {
3327 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3328 sizeof(struct lu_fid)))
3333 case LL_IOC_GETPARENT:
3334 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3336 case OBD_IOC_FID2PATH:
3337 RETURN(ll_fid2path(inode, (void __user *)arg));
3338 case LL_IOC_DATA_VERSION: {
3339 struct ioc_data_version idv;
3342 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
3345 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3346 rc = ll_ioc_data_version(inode, &idv);
3349 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3355 case LL_IOC_GET_MDTIDX: {
3358 mdtidx = ll_get_mdt_idx(inode);
3362 if (put_user((int)mdtidx, (int __user *)arg))
3367 case OBD_IOC_GETDTNAME:
3368 case OBD_IOC_GETMDNAME:
3369 RETURN(ll_get_obd_name(inode, cmd, arg));
3370 case LL_IOC_HSM_STATE_GET: {
3371 struct md_op_data *op_data;
3372 struct hsm_user_state *hus;
3379 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3380 LUSTRE_OPC_ANY, hus);
3381 if (IS_ERR(op_data)) {
3383 RETURN(PTR_ERR(op_data));
3386 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3389 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3392 ll_finish_md_op_data(op_data);
3396 case LL_IOC_HSM_STATE_SET: {
3397 struct hsm_state_set *hss;
3404 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3409 rc = ll_hsm_state_set(inode, hss);
3414 case LL_IOC_HSM_ACTION: {
3415 struct md_op_data *op_data;
3416 struct hsm_current_action *hca;
3423 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3424 LUSTRE_OPC_ANY, hca);
3425 if (IS_ERR(op_data)) {
3427 RETURN(PTR_ERR(op_data));
3430 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3433 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3436 ll_finish_md_op_data(op_data);
3440 case LL_IOC_SET_LEASE_OLD: {
3441 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3443 RETURN(ll_file_set_lease(file, &ioc, 0));
3445 case LL_IOC_SET_LEASE: {
3446 struct ll_ioc_lease ioc;
3448 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3451 RETURN(ll_file_set_lease(file, &ioc, arg));
3453 case LL_IOC_GET_LEASE: {
3454 struct ll_inode_info *lli = ll_i2info(inode);
3455 struct ldlm_lock *lock = NULL;
3458 mutex_lock(&lli->lli_och_mutex);
3459 if (fd->fd_lease_och != NULL) {
3460 struct obd_client_handle *och = fd->fd_lease_och;
3462 lock = ldlm_handle2lock(&och->och_lease_handle);
3464 lock_res_and_lock(lock);
3465 if (!ldlm_is_cancel(lock))
3466 fmode = och->och_flags;
3468 unlock_res_and_lock(lock);
3469 LDLM_LOCK_PUT(lock);
3472 mutex_unlock(&lli->lli_och_mutex);
3474 RETURN(ll_lease_type_from_fmode(fmode));
3476 case LL_IOC_HSM_IMPORT: {
3477 struct hsm_user_import *hui;
3483 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3488 rc = ll_hsm_import(inode, file, hui);
3493 case LL_IOC_FUTIMES_3: {
3494 struct ll_futimes_3 lfu;
3496 if (copy_from_user(&lfu,
3497 (const struct ll_futimes_3 __user *)arg,
3501 RETURN(ll_file_futimes_3(file, &lfu));
3503 case LL_IOC_LADVISE: {
3504 struct llapi_ladvise_hdr *k_ladvise_hdr;
3505 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3508 int alloc_size = sizeof(*k_ladvise_hdr);
3511 u_ladvise_hdr = (void __user *)arg;
3512 OBD_ALLOC_PTR(k_ladvise_hdr);
3513 if (k_ladvise_hdr == NULL)
3516 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3517 GOTO(out_ladvise, rc = -EFAULT);
3519 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3520 k_ladvise_hdr->lah_count < 1)
3521 GOTO(out_ladvise, rc = -EINVAL);
3523 num_advise = k_ladvise_hdr->lah_count;
3524 if (num_advise >= LAH_COUNT_MAX)
3525 GOTO(out_ladvise, rc = -EFBIG);
3527 OBD_FREE_PTR(k_ladvise_hdr);
3528 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3529 lah_advise[num_advise]);
3530 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3531 if (k_ladvise_hdr == NULL)
3535 * TODO: submit multiple advices to one server in a single RPC
3537 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3538 GOTO(out_ladvise, rc = -EFAULT);
3540 for (i = 0; i < num_advise; i++) {
3541 struct llapi_lu_ladvise *k_ladvise =
3542 &k_ladvise_hdr->lah_advise[i];
3543 struct llapi_lu_ladvise __user *u_ladvise =
3544 &u_ladvise_hdr->lah_advise[i];
3546 rc = ll_ladvise_sanity(inode, k_ladvise);
3548 GOTO(out_ladvise, rc);
3550 switch (k_ladvise->lla_advice) {
3551 case LU_LADVISE_LOCKNOEXPAND:
3552 rc = ll_lock_noexpand(file,
3553 k_ladvise->lla_peradvice_flags);
3554 GOTO(out_ladvise, rc);
3555 case LU_LADVISE_LOCKAHEAD:
3557 rc = ll_file_lock_ahead(file, k_ladvise);
3560 GOTO(out_ladvise, rc);
3563 &u_ladvise->lla_lockahead_result))
3564 GOTO(out_ladvise, rc = -EFAULT);
3567 rc = ll_ladvise(inode, file,
3568 k_ladvise_hdr->lah_flags,
3571 GOTO(out_ladvise, rc);
3578 OBD_FREE(k_ladvise_hdr, alloc_size);
3581 case LL_IOC_FLR_SET_MIRROR: {
3582 /* mirror I/O must be direct to avoid polluting page cache
3584 if (!(file->f_flags & O_DIRECT))
3587 fd->fd_designated_mirror = (__u32)arg;
3590 case LL_IOC_FSGETXATTR:
3591 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3592 case LL_IOC_FSSETXATTR:
3593 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3595 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3597 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3598 (void __user *)arg));
3602 #ifndef HAVE_FILE_LLSEEK_SIZE
3603 static inline loff_t
3604 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3606 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3608 if (offset > maxsize)
3611 if (offset != file->f_pos) {
3612 file->f_pos = offset;
3613 file->f_version = 0;
3619 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3620 loff_t maxsize, loff_t eof)
3622 struct inode *inode = file_inode(file);
3630 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3631 * position-querying operation. Avoid rewriting the "same"
3632 * f_pos value back to the file because a concurrent read(),
3633 * write() or lseek() might have altered it
3638 * f_lock protects against read/modify/write race with other
3639 * SEEK_CURs. Note that parallel writes and reads behave
3643 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3644 inode_unlock(inode);
3648 * In the generic case the entire file is data, so as long as
3649 * offset isn't at the end of the file then the offset is data.
3656 * There is a virtual hole at the end of the file, so as long as
3657 * offset isn't i_size or larger, return i_size.
3665 return llseek_execute(file, offset, maxsize);
3669 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3671 struct inode *inode = file_inode(file);
3672 loff_t retval, eof = 0;
3675 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3676 (origin == SEEK_CUR) ? file->f_pos : 0);
3677 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3678 PFID(ll_inode2fid(inode)), inode, retval, retval,
3680 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3682 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3683 retval = ll_glimpse_size(inode);
3686 eof = i_size_read(inode);
3689 retval = ll_generic_file_llseek_size(file, offset, origin,
3690 ll_file_maxbytes(inode), eof);
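/*
 * flush callback: pick up any asynchronous write error recorded against the
 * inode or its data object and report it as -EIO, unless the application
 * has already been told about the failure.
 */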
3694 static int ll_flush(struct file *file, fl_owner_t id)
3696 struct inode *inode = file_inode(file);
3697 struct ll_inode_info *lli = ll_i2info(inode);
3698 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3701 LASSERT(!S_ISDIR(inode->i_mode));
3703 /* catch async errors that were recorded back when async writeback
3704 * failed for pages in this mapping. */
3705 rc = lli->lli_async_rc;
3706 lli->lli_async_rc = 0;
3707 if (lli->lli_clob != NULL) {
3708 err = lov_read_and_clear_async_rc(lli->lli_clob);
3713 /* The application has already been told about the write failure.
3714 * Do not report the failure again. */
3715 if (fd->fd_write_failed)
3717 return rc ? -EIO : 0;
3721 * Called to make sure a portion of the file has been written out.
3722 * If @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to the OST.
3724 * Return how many pages have been written.
3726 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3727 enum cl_fsync_mode mode, int ignore_layout)
3731 struct cl_fsync_io *fio;
3736 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3737 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3740 env = cl_env_get(&refcheck);
3742 RETURN(PTR_ERR(env));
3744 io = vvp_env_thread_io(env);
3745 io->ci_obj = ll_i2info(inode)->lli_clob;
3746 io->ci_ignore_layout = ignore_layout;
3748 /* initialize parameters for sync */
3749 fio = &io->u.ci_fsync;
3750 fio->fi_start = start;
3752 fio->fi_fid = ll_inode2fid(inode);
3753 fio->fi_mode = mode;
3754 fio->fi_nr_written = 0;
3756 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3757 result = cl_io_loop(env, io);
3759 result = io->ci_result;
3761 result = fio->fi_nr_written;
3762 cl_io_fini(env, io);
3763 cl_env_put(env, &refcheck);
3769 * When dentry is provided (the 'else' case), file_dentry() may be
3770 * null and dentry must be used directly rather than pulled from
3771 * file_dentry() as is done otherwise.
3774 #ifdef HAVE_FILE_FSYNC_4ARGS
3775 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3777 struct dentry *dentry = file_dentry(file);
3778 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3779 int ll_fsync(struct file *file, int datasync)
3781 struct dentry *dentry = file_dentry(file);
3783 loff_t end = LLONG_MAX;
3785 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3788 loff_t end = LLONG_MAX;
3790 struct inode *inode = dentry->d_inode;
3791 struct ll_inode_info *lli = ll_i2info(inode);
3792 struct ptlrpc_request *req;
3796 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3797 PFID(ll_inode2fid(inode)), inode);
3798 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3800 #ifdef HAVE_FILE_FSYNC_4ARGS
3801 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3804 /* fsync's caller has already called _fdata{sync,write}, we want
3805 * that IO to finish before calling the osc and mdc sync methods */
3806 rc = filemap_fdatawait(inode->i_mapping);
3809 /* catch async errors that were recorded back when async writeback
3810 * failed for pages in this mapping. */
3811 if (!S_ISDIR(inode->i_mode)) {
3812 err = lli->lli_async_rc;
3813 lli->lli_async_rc = 0;
3816 if (lli->lli_clob != NULL) {
3817 err = lov_read_and_clear_async_rc(lli->lli_clob);
3823 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3827 ptlrpc_req_finished(req);
3829 if (S_ISREG(inode->i_mode)) {
3830 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3832 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3833 if (rc == 0 && err < 0)
3836 fd->fd_write_failed = true;
3838 fd->fd_write_failed = false;
3841 #ifdef HAVE_FILE_FSYNC_4ARGS
3842 inode_unlock(inode);
3848 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3850 struct inode *inode = file_inode(file);
3851 struct ll_sb_info *sbi = ll_i2sbi(inode);
3852 struct ldlm_enqueue_info einfo = {
3853 .ei_type = LDLM_FLOCK,
3854 .ei_cb_cp = ldlm_flock_completion_ast,
3855 .ei_cbdata = file_lock,
3857 struct md_op_data *op_data;
3858 struct lustre_handle lockh = { 0 };
3859 union ldlm_policy_data flock = { { 0 } };
3860 int fl_type = file_lock->fl_type;
3866 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3867 PFID(ll_inode2fid(inode)), file_lock);
3869 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3871 if (file_lock->fl_flags & FL_FLOCK) {
3872 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3873 /* flocks are whole-file locks */
3874 flock.l_flock.end = OFFSET_MAX;
3875 /* For flocks the owner is determined by the local file descriptor */
3876 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3877 } else if (file_lock->fl_flags & FL_POSIX) {
3878 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3879 flock.l_flock.start = file_lock->fl_start;
3880 flock.l_flock.end = file_lock->fl_end;
3884 flock.l_flock.pid = file_lock->fl_pid;
3886 /* Somewhat ugly workaround for svc lockd.
3887 * lockd installs custom fl_lmops->lm_compare_owner that checks
3888 * for the fl_owner to be the same (which it always is on local node
3889 * I guess between lockd processes) and then compares pid.
3890 * As such we assign pid to the owner field to make it all work,
3891 * conflict with normal locks is unlikely since pid space and
3892 * pointer space for current->files are not intersecting */
3893 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3894 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
3898 einfo.ei_mode = LCK_PR;
3901 /* An unlock request may or may not have any relation to
3902 * existing locks so we may not be able to pass a lock handle
3903 * via a normal ldlm_lock_cancel() request. The request may even
3904 * unlock a byte range in the middle of an existing lock. In
3905 * order to process an unlock request we need all of the same
3906 * information that is given with a normal read or write record
3907 * lock request. To avoid creating another ldlm unlock (cancel)
3908 * message we'll treat a LCK_NL flock request as an unlock. */
3909 einfo.ei_mode = LCK_NL;
3912 einfo.ei_mode = LCK_PW;
3915 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
3930 flags = LDLM_FL_BLOCK_NOWAIT;
3936 flags = LDLM_FL_TEST_LOCK;
3939 CERROR("unknown fcntl lock command: %d\n", cmd);
3943 /* Save the old mode so that if the mode in the lock changes we
3944 * can decrement the appropriate reader or writer refcount. */
3945 file_lock->fl_type = einfo.ei_mode;
3947 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3948 LUSTRE_OPC_ANY, NULL);
3949 if (IS_ERR(op_data))
3950 RETURN(PTR_ERR(op_data));
3952 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
3953 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
3954 flock.l_flock.pid, flags, einfo.ei_mode,
3955 flock.l_flock.start, flock.l_flock.end);
3957 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
3960 /* Restore the file lock type if not TEST lock. */
3961 if (!(flags & LDLM_FL_TEST_LOCK))
3962 file_lock->fl_type = fl_type;
3964 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
3965 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
3966 !(flags & LDLM_FL_TEST_LOCK))
3967 rc2 = locks_lock_file_wait(file, file_lock);
3969 if ((file_lock->fl_flags & FL_FLOCK) &&
3970 (rc == 0 || file_lock->fl_type == F_UNLCK))
3971 rc2 = flock_lock_file_wait(file, file_lock);
3972 if ((file_lock->fl_flags & FL_POSIX) &&
3973 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3974 !(flags & LDLM_FL_TEST_LOCK))
3975 rc2 = posix_lock_file_wait(file, file_lock);
3976 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
3978 if (rc2 && file_lock->fl_type != F_UNLCK) {
3979 einfo.ei_mode = LCK_NL;
3980 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
3985 ll_finish_md_op_data(op_data);
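/*
 * Look up the FID of @name under @parent with a getattr-by-name request to
 * the MDS, optionally instantiating the inode from the reply as well.
 */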
3990 int ll_get_fid_by_name(struct inode *parent, const char *name,
3991 int namelen, struct lu_fid *fid,
3992 struct inode **inode)
3994 struct md_op_data *op_data = NULL;
3995 struct mdt_body *body;
3996 struct ptlrpc_request *req;
4000 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4001 LUSTRE_OPC_ANY, NULL);
4002 if (IS_ERR(op_data))
4003 RETURN(PTR_ERR(op_data));
4005 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4006 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4007 ll_finish_md_op_data(op_data);
4011 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4013 GOTO(out_req, rc = -EFAULT);
4015 *fid = body->mbo_fid1;
4018 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4020 ptlrpc_req_finished(req);
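/*
 * Migrate the child entry @name of @parent to the MDT described by @lum.
 * The child FID is looked up first; for regular files a write lease is
 * taken and the data version recorded, then the migration is sent to the
 * MDT as a rename with CLI_MIGRATE set.
 */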
4024 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4027 struct dentry *dchild = NULL;
4028 struct inode *child_inode = NULL;
4029 struct md_op_data *op_data;
4030 struct ptlrpc_request *request = NULL;
4031 struct obd_client_handle *och = NULL;
4033 struct mdt_body *body;
4034 __u64 data_version = 0;
4035 size_t namelen = strlen(name);
4036 int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4040 CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4041 PFID(ll_inode2fid(parent)), name,
4042 lum->lum_stripe_offset, lum->lum_stripe_count);
4044 if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4045 lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4046 lustre_swab_lmv_user_md(lum);
4048 /* Get child FID first */
4049 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4052 dchild = d_lookup(file_dentry(file), &qstr);
4054 if (dchild->d_inode)
4055 child_inode = igrab(dchild->d_inode);
4060 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
4069 if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
4070 OBD_CONNECT2_DIR_MIGRATE)) {
4071 if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
4072 ll_i2info(child_inode)->lli_lsm_md) {
4073 CERROR("%s: MDT doesn't support stripe directory "
4075 ll_get_fsname(parent->i_sb, NULL, 0));
4076 GOTO(out_iput, rc = -EOPNOTSUPP);
4081 * lfs migrate command needs to be blocked on the client
4082 * by checking the migrate FID against the FID of the
4085 if (child_inode == parent->i_sb->s_root->d_inode)
4086 GOTO(out_iput, rc = -EINVAL);
4088 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4089 child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4090 if (IS_ERR(op_data))
4091 GOTO(out_iput, rc = PTR_ERR(op_data));
4093 inode_lock(child_inode);
4094 op_data->op_fid3 = *ll_inode2fid(child_inode);
4095 if (!fid_is_sane(&op_data->op_fid3)) {
4096 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4097 ll_get_fsname(parent->i_sb, NULL, 0), name,
4098 PFID(&op_data->op_fid3));
4099 GOTO(out_unlock, rc = -EINVAL);
4102 op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4103 op_data->op_data = lum;
4104 op_data->op_data_size = lumlen;
4107 if (S_ISREG(child_inode->i_mode)) {
4108 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4112 GOTO(out_unlock, rc);
4115 rc = ll_data_version(child_inode, &data_version,
4118 GOTO(out_close, rc);
4120 op_data->op_open_handle = och->och_open_handle;
4121 op_data->op_data_version = data_version;
4122 op_data->op_lease_handle = och->och_lease_handle;
4123 op_data->op_bias |= MDS_CLOSE_MIGRATE;
4125 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4126 och->och_mod->mod_open_req->rq_replay = 0;
4127 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
4130 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4131 name, namelen, &request);
4133 LASSERT(request != NULL);
4134 ll_update_times(request, parent);
4136 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4137 LASSERT(body != NULL);
4139 /* If the server does release the layout lock, then we clean up
4140 * the client och here, otherwise release it in out_close: */
4141 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4142 obd_mod_put(och->och_mod);
4143 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4145 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
4151 if (request != NULL) {
4152 ptlrpc_req_finished(request);
4156 /* Try again if the file layout has changed. */
4157 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4162 ll_lease_close(och, child_inode, NULL);
4164 clear_nlink(child_inode);
4166 inode_unlock(child_inode);
4167 ll_finish_md_op_data(op_data);
4174 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4182 * Test whether some locks matching bits and l_req_mode are acquired
4183 * - bits can be in different locks
4184 * - if found, clear the common lock bits in *bits
4185 * - the bits not found are kept in *bits
4187 * \param bits [IN] searched lock bits
4188 * \param l_req_mode [IN] searched lock mode
4189 * \retval boolean, true iff all bits are found
4191 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4193 struct lustre_handle lockh;
4194 union ldlm_policy_data policy;
4195 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4196 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4205 fid = &ll_i2info(inode)->lli_fid;
4206 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4207 ldlm_lockname[mode]);
4209 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4210 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4211 policy.l_inodebits.bits = *bits & (1 << i);
4212 if (policy.l_inodebits.bits == 0)
4215 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4216 &policy, mode, &lockh)) {
4217 struct ldlm_lock *lock;
4219 lock = ldlm_handle2lock(&lockh);
4222 ~(lock->l_policy_data.l_inodebits.bits);
4223 LDLM_LOCK_PUT(lock);
4225 *bits &= ~policy.l_inodebits.bits;
4232 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4233 struct lustre_handle *lockh, __u64 flags,
4234 enum ldlm_mode mode)
4236 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4241 fid = &ll_i2info(inode)->lli_fid;
4242 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4244 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4245 fid, LDLM_IBITS, &policy, mode, lockh);
4250 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4252 /* Already unlinked. Just update nlink and return success */
4253 if (rc == -ENOENT) {
4255 /* If it is a striped directory and there is a bad stripe,
4256 * let's revalidate the dentry again instead of returning
4258 if (S_ISDIR(inode->i_mode) &&
4259 ll_i2info(inode)->lli_lsm_md != NULL)
4262 /* This path cannot be hit for regular files, except in
4263 * case of obscure races, so there is no need to validate
4265 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4267 } else if (rc != 0) {
4268 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4269 "%s: revalidate FID "DFID" error: rc = %d\n",
4270 ll_get_fsname(inode->i_sb, NULL, 0),
4271 PFID(ll_inode2fid(inode)), rc);
4277 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4279 struct inode *inode = dentry->d_inode;
4280 struct obd_export *exp = ll_i2mdexp(inode);
4281 struct lookup_intent oit = {
4284 struct ptlrpc_request *req = NULL;
4285 struct md_op_data *op_data;
4289 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4290 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4292 /* Call getattr by fid, so do not provide name at all. */
4293 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4294 LUSTRE_OPC_ANY, NULL);
4295 if (IS_ERR(op_data))
4296 RETURN(PTR_ERR(op_data));
4298 rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4299 ll_finish_md_op_data(op_data);
4301 rc = ll_inode_revalidate_fini(inode, rc);
4305 rc = ll_revalidate_it_finish(req, &oit, dentry);
4307 ll_intent_release(&oit);
4311 /* Unlinked? Unhash dentry, so it is not picked up later by
4312 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4313 * here to preserve get_cwd functionality on 2.6.
4315 if (!dentry->d_inode->i_nlink) {
4316 ll_lock_dcache(inode);
4317 d_lustre_invalidate(dentry, 0);
4318 ll_unlock_dcache(inode);
4321 ll_lookup_finish_locks(&oit, dentry);
4323 ptlrpc_req_finished(req);
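/*
 * For a striped directory, merge the attributes of all stripes (nlink,
 * blocks, size and timestamps) into the master inode.
 */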
4328 static int ll_merge_md_attr(struct inode *inode)
4330 struct ll_inode_info *lli = ll_i2info(inode);
4331 struct cl_attr attr = { 0 };
4334 LASSERT(lli->lli_lsm_md != NULL);
4335 down_read(&lli->lli_lsm_sem);
4336 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4337 &attr, ll_md_blocking_ast);
4338 up_read(&lli->lli_lsm_sem);
4342 set_nlink(inode, attr.cat_nlink);
4343 inode->i_blocks = attr.cat_blocks;
4344 i_size_write(inode, attr.cat_size);
4346 ll_i2info(inode)->lli_atime = attr.cat_atime;
4347 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4348 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
4353 static inline dev_t ll_compat_encode_dev(dev_t dev)
4355 /* The compat_sys_*stat*() syscalls will fail unless the
4356 * device majors and minors are both less than 256. Note that
4357 * the value returned here will be passed through
4358 * old_encode_dev() in cp_compat_stat(). And so we are not
4359 * trying to return a valid compat (u16) device number, just
4360 * one that will pass the old_valid_dev() check. */
4362 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
4365 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4366 int ll_getattr(const struct path *path, struct kstat *stat,
4367 u32 request_mask, unsigned int flags)
4369 struct dentry *de = path->dentry;
4371 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4374 struct inode *inode = de->d_inode;
4375 struct ll_sb_info *sbi = ll_i2sbi(inode);
4376 struct ll_inode_info *lli = ll_i2info(inode);
4379 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4381 rc = ll_inode_revalidate(de, IT_GETATTR);
4385 if (S_ISREG(inode->i_mode)) {
4386 /* In case of restore, the MDT has the right size and has
4387 * already sent it back without granting the layout lock,
4388 * inode is up-to-date so glimpse is useless.
4389 * Also to glimpse we need the layout, in case of a running
4390 * restore the MDT holds the layout lock so the glimpse will
4391 * block up to the end of restore (getattr will block)
4393 if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4394 rc = ll_glimpse_size(inode);
4399 /* If the object isn't a regular file then don't validate its size. */
4400 if (S_ISDIR(inode->i_mode) &&
4401 lli->lli_lsm_md != NULL) {
4402 rc = ll_merge_md_attr(inode);
4407 LTIME_S(inode->i_atime) = lli->lli_atime;
4408 LTIME_S(inode->i_mtime) = lli->lli_mtime;
4409 LTIME_S(inode->i_ctime) = lli->lli_ctime;
4412 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4414 if (ll_need_32bit_api(sbi)) {
4415 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4416 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4417 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4419 stat->ino = inode->i_ino;
4420 stat->dev = inode->i_sb->s_dev;
4421 stat->rdev = inode->i_rdev;
4424 stat->mode = inode->i_mode;
4425 stat->uid = inode->i_uid;
4426 stat->gid = inode->i_gid;
4427 stat->atime = inode->i_atime;
4428 stat->mtime = inode->i_mtime;
4429 stat->ctime = inode->i_ctime;
4430 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4432 stat->nlink = inode->i_nlink;
4433 stat->size = i_size_read(inode);
4434 stat->blocks = inode->i_blocks;
4439 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4440 __u64 start, __u64 len)
4444 struct fiemap *fiemap;
4445 unsigned int extent_count = fieinfo->fi_extents_max;
4447 num_bytes = sizeof(*fiemap) + (extent_count *
4448 sizeof(struct fiemap_extent));
4449 OBD_ALLOC_LARGE(fiemap, num_bytes);
4454 fiemap->fm_flags = fieinfo->fi_flags;
4455 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4456 fiemap->fm_start = start;
4457 fiemap->fm_length = len;
4458 if (extent_count > 0 &&
4459 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4460 sizeof(struct fiemap_extent)) != 0)
4461 GOTO(out, rc = -EFAULT);
4463 rc = ll_do_fiemap(inode, fiemap, num_bytes);
4465 fieinfo->fi_flags = fiemap->fm_flags;
4466 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4467 if (extent_count > 0 &&
4468 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4469 fiemap->fm_mapped_extents *
4470 sizeof(struct fiemap_extent)) != 0)
4471 GOTO(out, rc = -EFAULT);
4473 OBD_FREE_LARGE(fiemap, num_bytes);
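/*
 * Return a reference on the cached POSIX ACL of the inode; the VFS caller
 * releases it after the permission check.
 */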
4477 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4479 struct ll_inode_info *lli = ll_i2info(inode);
4480 struct posix_acl *acl = NULL;
4483 spin_lock(&lli->lli_lock);
4484 /* VFS' acl_permission_check->check_acl will release the refcount */
4485 acl = posix_acl_dup(lli->lli_posix_acl);
4486 spin_unlock(&lli->lli_lock);
4491 #ifdef HAVE_IOP_SET_ACL
4492 #ifdef CONFIG_FS_POSIX_ACL
4493 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4495 struct ll_sb_info *sbi = ll_i2sbi(inode);
4496 struct ptlrpc_request *req = NULL;
4497 const char *name = NULL;
4499 size_t value_size = 0;
4504 case ACL_TYPE_ACCESS:
4505 name = XATTR_NAME_POSIX_ACL_ACCESS;
4507 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4510 case ACL_TYPE_DEFAULT:
4511 name = XATTR_NAME_POSIX_ACL_DEFAULT;
4512 if (!S_ISDIR(inode->i_mode))
4513 rc = acl ? -EACCES : 0;
4524 value_size = posix_acl_xattr_size(acl->a_count);
4525 value = kmalloc(value_size, GFP_NOFS);
4527 GOTO(out, rc = -ENOMEM);
4529 rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
4531 GOTO(out_value, rc);
4534 rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4535 value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
4536 name, value, value_size, 0, 0, &req);
4538 ptlrpc_req_finished(req);
4543 forget_cached_acl(inode, type);
4545 set_cached_acl(inode, type, acl);
4548 #endif /* CONFIG_FS_POSIX_ACL */
4549 #endif /* HAVE_IOP_SET_ACL */
4551 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4553 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4554 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4556 ll_check_acl(struct inode *inode, int mask)
4559 # ifdef CONFIG_FS_POSIX_ACL
4560 struct posix_acl *acl;
4564 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4565 if (flags & IPERM_FLAG_RCU)
4568 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4573 rc = posix_acl_permission(inode, acl, mask);
4574 posix_acl_release(acl);
4577 # else /* !CONFIG_FS_POSIX_ACL */
4579 # endif /* CONFIG_FS_POSIX_ACL */
4581 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
4583 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4584 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4586 # ifdef HAVE_INODE_PERMISION_2ARGS
4587 int ll_inode_permission(struct inode *inode, int mask)
4589 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4594 struct ll_sb_info *sbi;
4595 struct root_squash_info *squash;
4596 struct cred *cred = NULL;
4597 const struct cred *old_cred = NULL;
4599 bool squash_id = false;
4602 #ifdef MAY_NOT_BLOCK
4603 if (mask & MAY_NOT_BLOCK)
4605 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4606 if (flags & IPERM_FLAG_RCU)
4610 /* as the root inode is NOT validated in the lookup operation,
4611 * we need to do it before the permission check. */
4613 if (inode == inode->i_sb->s_root->d_inode) {
4614 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4619 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4620 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4622 /* squash fsuid/fsgid if needed */
4623 sbi = ll_i2sbi(inode);
4624 squash = &sbi->ll_squash;
4625 if (unlikely(squash->rsi_uid != 0 &&
4626 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4627 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4631 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4632 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4633 squash->rsi_uid, squash->rsi_gid);
4635 /* update current process's credentials
4636 * and FS capability */
4637 cred = prepare_creds();
4641 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4642 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
4643 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4644 if ((1 << cap) & CFS_CAP_FS_MASK)
4645 cap_lower(cred->cap_effective, cap);
4647 old_cred = override_creds(cred);
4650 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4651 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4652 /* restore current process's credentials and FS capability */
4654 revert_creds(old_cred);
4661 /* -o localflock - only provides locally consistent flock locks */
4662 struct file_operations ll_file_operations = {
4663 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4664 # ifdef HAVE_SYNC_READ_WRITE
4665 .read = new_sync_read,
4666 .write = new_sync_write,
4668 .read_iter = ll_file_read_iter,
4669 .write_iter = ll_file_write_iter,
4670 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4671 .read = ll_file_read,
4672 .aio_read = ll_file_aio_read,
4673 .write = ll_file_write,
4674 .aio_write = ll_file_aio_write,
4675 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4676 .unlocked_ioctl = ll_file_ioctl,
4677 .open = ll_file_open,
4678 .release = ll_file_release,
4679 .mmap = ll_file_mmap,
4680 .llseek = ll_file_seek,
4681 .splice_read = ll_file_splice_read,
4686 struct file_operations ll_file_operations_flock = {
4687 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4688 # ifdef HAVE_SYNC_READ_WRITE
4689 .read = new_sync_read,
4690 .write = new_sync_write,
4691 # endif /* HAVE_SYNC_READ_WRITE */
4692 .read_iter = ll_file_read_iter,
4693 .write_iter = ll_file_write_iter,
4694 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4695 .read = ll_file_read,
4696 .aio_read = ll_file_aio_read,
4697 .write = ll_file_write,
4698 .aio_write = ll_file_aio_write,
4699 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4700 .unlocked_ioctl = ll_file_ioctl,
4701 .open = ll_file_open,
4702 .release = ll_file_release,
4703 .mmap = ll_file_mmap,
4704 .llseek = ll_file_seek,
4705 .splice_read = ll_file_splice_read,
4708 .flock = ll_file_flock,
4709 .lock = ll_file_flock
4712 /* These are for -o noflock - to return ENOSYS on flock calls */
4713 struct file_operations ll_file_operations_noflock = {
4714 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4715 # ifdef HAVE_SYNC_READ_WRITE
4716 .read = new_sync_read,
4717 .write = new_sync_write,
4718 # endif /* HAVE_SYNC_READ_WRITE */
4719 .read_iter = ll_file_read_iter,
4720 .write_iter = ll_file_write_iter,
4721 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4722 .read = ll_file_read,
4723 .aio_read = ll_file_aio_read,
4724 .write = ll_file_write,
4725 .aio_write = ll_file_aio_write,
4726 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4727 .unlocked_ioctl = ll_file_ioctl,
4728 .open = ll_file_open,
4729 .release = ll_file_release,
4730 .mmap = ll_file_mmap,
4731 .llseek = ll_file_seek,
4732 .splice_read = ll_file_splice_read,
4735 .flock = ll_file_noflock,
4736 .lock = ll_file_noflock
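/*
 * Illustrative sketch (not part of this file): roughly how the three
 * file_operations tables above are expected to be selected when an inode's
 * operations are set up, depending on the flock-related mount options.
 * The flag names and the exact location of this assignment are assumptions,
 * shown only to make the relationship between the tables concrete:
 *
 *	if (sbi->ll_flags & LL_SBI_FLOCK)
 *		inode->i_fop = &ll_file_operations_flock;
 *	else if (sbi->ll_flags & LL_SBI_LOCALFLOCK)
 *		inode->i_fop = &ll_file_operations;
 *	else
 *		inode->i_fop = &ll_file_operations_noflock;
 */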
4739 struct inode_operations ll_file_inode_operations = {
4740 .setattr = ll_setattr,
4741 .getattr = ll_getattr,
4742 .permission = ll_inode_permission,
4743 #ifdef HAVE_IOP_XATTR
4744 .setxattr = ll_setxattr,
4745 .getxattr = ll_getxattr,
4746 .removexattr = ll_removexattr,
4748 .listxattr = ll_listxattr,
4749 .fiemap = ll_fiemap,
4750 #ifdef HAVE_IOP_GET_ACL
4751 .get_acl = ll_get_acl,
4753 #ifdef HAVE_IOP_SET_ACL
4754 .set_acl = ll_set_acl,
4758 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4760 struct ll_inode_info *lli = ll_i2info(inode);
4761 struct cl_object *obj = lli->lli_clob;
4770 env = cl_env_get(&refcheck);
4772 RETURN(PTR_ERR(env));
4774 rc = cl_conf_set(env, lli->lli_clob, conf);
4778 if (conf->coc_opc == OBJECT_CONF_SET) {
4779 struct ldlm_lock *lock = conf->coc_lock;
4780 struct cl_layout cl = {
4784 LASSERT(lock != NULL);
4785 LASSERT(ldlm_has_layout(lock));
4787 		/* the lock can only be allowed to match after the layout has
4788 		 * been applied to the inode, otherwise a stale layout would be
4789 		 * seen. Applying the layout should happen before dropping
4790 		 * the intent lock. */
4791 ldlm_lock_allow_match(lock);
4793 rc = cl_object_layout_get(env, obj, &cl);
4798 DFID": layout version change: %u -> %u\n",
4799 PFID(&lli->lli_fid), ll_layout_version_get(lli),
4801 ll_layout_version_set(lli, cl.cl_layout_gen);
4805 cl_env_put(env, &refcheck);
4810 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
4811 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4814 struct ll_sb_info *sbi = ll_i2sbi(inode);
4815 struct ptlrpc_request *req;
4822 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4823 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4824 lock->l_lvb_data, lock->l_lvb_len);
4826 if (lock->l_lvb_data != NULL)
4829 	/* if the layout lock was granted right away, the layout is returned
4830 	 * within the DLM_LVB of the DLM reply; otherwise, if the lock was ever
4831 	 * blocked and then granted via a completion AST, we have to fetch the
4832 	 * layout here. Note that we can't use the LVB buffer in the
4833 	 * completion AST because it is not large enough. */
4834 rc = ll_get_default_mdsize(sbi, &lmmsize);
4838 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), OBD_MD_FLXATTR,
4839 XATTR_NAME_LOV, lmmsize, &req);
4842 GOTO(out, rc = 0); /* empty layout */
4849 if (lmmsize == 0) /* empty layout */
4852 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4854 GOTO(out, rc = -EFAULT);
4856 OBD_ALLOC_LARGE(lvbdata, lmmsize);
4857 if (lvbdata == NULL)
4858 GOTO(out, rc = -ENOMEM);
4860 memcpy(lvbdata, lmm, lmmsize);
4861 lock_res_and_lock(lock);
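	/* install the fetched layout as the lock's LVB, unless another
	 * thread has already done so */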
4862 if (unlikely(lock->l_lvb_data == NULL)) {
4863 lock->l_lvb_type = LVB_T_LAYOUT;
4864 lock->l_lvb_data = lvbdata;
4865 lock->l_lvb_len = lmmsize;
4868 unlock_res_and_lock(lock);
4871 OBD_FREE_LARGE(lvbdata, lmmsize);
4876 ptlrpc_req_finished(req);
4881 * Apply the layout to the inode. Layout lock is held and will be released
4884 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4885 struct inode *inode)
4887 struct ll_inode_info *lli = ll_i2info(inode);
4888 struct ll_sb_info *sbi = ll_i2sbi(inode);
4889 struct ldlm_lock *lock;
4890 struct cl_object_conf conf;
4893 bool wait_layout = false;
4896 LASSERT(lustre_handle_is_used(lockh));
4898 lock = ldlm_handle2lock(lockh);
4899 LASSERT(lock != NULL);
4900 LASSERT(ldlm_has_layout(lock));
4902 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4903 PFID(&lli->lli_fid), inode);
4905 	/* in case this is a cached lock, reinstate it with the new inode */
4906 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
4908 lock_res_and_lock(lock);
4909 lvb_ready = ldlm_is_lvb_ready(lock);
4910 unlock_res_and_lock(lock);
4912 	/* checking lvb_ready is racy, but this is okay. The worst case is
4913 	 * that multiple processes may configure the file at the same time. */
4917 rc = ll_layout_fetch(inode, lock);
4921 	/* for a layout lock, the lmm is stored in the lock's LVB.
4922 	 * lvb_data is immutable while the lock is held, so it is safe to access it
4925 	 * set the layout on the file. This is unlikely to fail, as the old layout
4926 	 * was surely eliminated */
4927 memset(&conf, 0, sizeof conf);
4928 conf.coc_opc = OBJECT_CONF_SET;
4929 conf.coc_inode = inode;
4930 conf.coc_lock = lock;
4931 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
4932 conf.u.coc_layout.lb_len = lock->l_lvb_len;
4933 rc = ll_layout_conf(inode, &conf);
4935 /* refresh layout failed, need to wait */
4936 wait_layout = rc == -EBUSY;
4939 LDLM_LOCK_PUT(lock);
4940 ldlm_lock_decref(lockh, mode);
4942 	/* wait for in-flight IO to complete if the old layout is still in use. */
4944 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
4945 ll_get_fsname(inode->i_sb, NULL, 0),
4946 PFID(&lli->lli_fid), inode);
4948 memset(&conf, 0, sizeof conf);
4949 conf.coc_opc = OBJECT_CONF_WAIT;
4950 conf.coc_inode = inode;
4951 rc = ll_layout_conf(inode, &conf);
4955 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4956 ll_get_fsname(inode->i_sb, NULL, 0),
4957 PFID(&lli->lli_fid), rc);
4963 * Issue layout intent RPC to MDS.
4964 * \param inode [in] file inode
4965 * \param intent [in] layout intent
4967 * \retval 0 on success
4968 * \retval < 0 error code
4970 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
4972 struct ll_inode_info *lli = ll_i2info(inode);
4973 struct ll_sb_info *sbi = ll_i2sbi(inode);
4974 struct md_op_data *op_data;
4975 struct lookup_intent it;
4976 struct ptlrpc_request *req;
4980 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4981 0, 0, LUSTRE_OPC_ANY, NULL);
4982 if (IS_ERR(op_data))
4983 RETURN(PTR_ERR(op_data));
4985 op_data->op_data = intent;
4986 op_data->op_data_size = sizeof(*intent);
4988 memset(&it, 0, sizeof(it));
4989 it.it_op = IT_LAYOUT;
4990 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
4991 intent->li_opc == LAYOUT_INTENT_TRUNC)
4992 it.it_flags = FMODE_WRITE;
4994 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4995 ll_get_fsname(inode->i_sb, NULL, 0),
4996 PFID(&lli->lli_fid), inode);
4998 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
4999 &ll_md_blocking_ast, 0);
5000 if (it.it_request != NULL)
5001 ptlrpc_req_finished(it.it_request);
5002 it.it_request = NULL;
5004 ll_finish_md_op_data(op_data);
5006 /* set lock data in case this is a new lock */
5008 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
5010 ll_intent_drop_lock(&it);
5016  * This function checks whether a LAYOUT lock exists on the client side,
5017  * and enqueues one if it is not already cached.
5019  * This function does not hold the layout lock, so it may be revoked at
5020  * any time after it returns; layout-dependent operations must then be redone.
5023  * This function should be called before lov_io_init() to get an up-to-date
5024  * layout version; the caller should save that version and, after the IO
5025  * completes, call this function again to verify that the layout has not
5026  * changed in the meantime (see the illustrative sketch after this function).
5028 int ll_layout_refresh(struct inode *inode, __u32 *gen)
5030 struct ll_inode_info *lli = ll_i2info(inode);
5031 struct ll_sb_info *sbi = ll_i2sbi(inode);
5032 struct lustre_handle lockh;
5033 struct layout_intent intent = {
5034 .li_opc = LAYOUT_INTENT_ACCESS,
5036 enum ldlm_mode mode;
5040 *gen = ll_layout_version_get(lli);
5041 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
5045 LASSERT(fid_is_sane(ll_inode2fid(inode)));
5046 LASSERT(S_ISREG(inode->i_mode));
5048 /* take layout lock mutex to enqueue layout lock exclusively. */
5049 mutex_lock(&lli->lli_layout_mutex);
5052 	/* the layout lock is usually cached on the local side, so try to
5053 	 * match an existing lock before enqueueing a new one. */
5054 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5055 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
5056 if (mode != 0) { /* hit cached lock */
5057 rc = ll_layout_lock_set(&lockh, mode, inode);
5063 rc = ll_layout_intent(inode, &intent);
5069 *gen = ll_layout_version_get(lli);
5070 mutex_unlock(&lli->lli_layout_mutex);
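/*
 * Illustrative caller-side sketch (not part of this file) of the
 * save-version / do-IO / re-verify pattern described above.  The helper
 * do_io() and the -ESTALE retry policy are hypothetical, shown only to make
 * the calling convention concrete:
 *
 *	__u32 gen_before, gen_after;
 *	int rc;
 *
 *	rc = ll_layout_refresh(inode, &gen_before);
 *	if (rc != 0)
 *		return rc;
 *
 *	rc = do_io(inode);
 *	if (rc == 0) {
 *		rc = ll_layout_refresh(inode, &gen_after);
 *		if (rc == 0 && gen_after != gen_before)
 *			rc = -ESTALE;
 *	}
 */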
5076 * Issue layout intent RPC indicating where in a file an IO is about to write.
5078 * \param[in] inode file inode.
5079  * \param[in] ext	write range with start offset of the file in bytes where
5080  *			an IO is about to write, and exclusive end offset in bytes.
5083 * \retval 0 on success
5084 * \retval < 0 error code
5086 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5087 struct lu_extent *ext)
5089 struct layout_intent intent = {
5091 .li_extent.e_start = ext->e_start,
5092 .li_extent.e_end = ext->e_end,
5097 rc = ll_layout_intent(inode, &intent);
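/*
 * Illustrative caller-side sketch (not part of this file): declaring the
 * write range before an IO starts.  pos and count are hypothetical values
 * standing in for the byte offset and length of the write about to be issued:
 *
 *	struct lu_extent ext = {
 *		.e_start = pos,
 *		.e_end	 = pos + count,
 *	};
 *
 *	rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE, &ext);
 */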
5103  * This function sends a restore request to the MDT.
5105 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5107 struct hsm_user_request *hur;
5111 len = sizeof(struct hsm_user_request) +
5112 sizeof(struct hsm_user_item);
5113 OBD_ALLOC(hur, len);
5117 hur->hur_request.hr_action = HUA_RESTORE;
5118 hur->hur_request.hr_archive_id = 0;
5119 hur->hur_request.hr_flags = 0;
5120 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5121 sizeof(hur->hur_user_item[0].hui_fid));
5122 hur->hur_user_item[0].hui_extent.offset = offset;
5123 hur->hur_user_item[0].hui_extent.length = length;
5124 hur->hur_request.hr_itemcount = 1;
5125 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,