4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2017, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
49 #include <uapi/linux/lustre/lustre_ioctl.h>
50 #include <lustre_swab.h>
52 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
/* NOTE(review): fragment — in the full source this field belongs to
 * struct split_param (used by the MDS_CLOSE_LAYOUT_SPLIT close path);
 * the struct header is not visible in this chunk. */
57 struct inode *sp_inode;
/* Forward declarations for helpers defined later in this file. */
62 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
64 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
/* Allocate a per-open-file ll_file_data from its slab cache.
 * GFP_NOFS avoids recursing into the filesystem under memory pressure.
 * NOTE(review): NULL check and return statement fall in lines missing
 * from this chunk — confirm against full source. */
67 static struct ll_file_data *ll_file_data_get(void)
69 struct ll_file_data *fd;
71 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
75 fd->fd_write_failed = false;
/* Release a ll_file_data back to its slab cache (counterpart of
 * ll_file_data_get()). */
80 static void ll_file_data_put(struct ll_file_data *fd)
83 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Fill @op_data with the inode's current attributes (mode, times, size,
 * blocks, flags) and the open handle, in preparation for the CLOSE RPC.
 */
87 * Packs all the attributes into @op_data for the CLOSE rpc.
89 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
90 struct obd_client_handle *och)
94 ll_prep_md_op_data(op_data, inode, NULL, NULL,
95 0, 0, LUSTRE_OPC_ANY, NULL);
97 op_data->op_attr.ia_mode = inode->i_mode;
98 op_data->op_attr.ia_atime = inode->i_atime;
99 op_data->op_attr.ia_mtime = inode->i_mtime;
100 op_data->op_attr.ia_ctime = inode->i_ctime;
101 op_data->op_attr.ia_size = i_size_read(inode);
/* Mark which attributes are valid so the MDT applies only these. */
102 op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
103 ATTR_MTIME | ATTR_MTIME_SET |
105 op_data->op_xvalid |= OP_XVALID_CTIME_SET;
106 op_data->op_attr_blocks = inode->i_blocks;
/* Convert kernel inode flags to Lustre's on-wire flag format. */
107 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
108 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
109 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
110 op_data->op_open_handle = och->och_open_handle;
112 if (och->och_flags & FMODE_WRITE &&
113 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
114 /* For HSM: if inode data has been modified, pack it so that
115 * MDT can set data dirty flag in the archive. */
116 op_data->op_bias |= MDS_DATA_MODIFIED;
/*
 * Send the MDS_CLOSE RPC for an open handle, optionally carrying a bias
 * (HSM release, layout swap/merge/split, resync-done) whose payload is
 * passed via @data.  Frees @och state on completion.
 */
122 * Perform a close, possibly with a bias.
123 * The meaning of "data" depends on the value of "bias".
125 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
126 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
129 static int ll_close_inode_openhandle(struct inode *inode,
130 struct obd_client_handle *och,
131 enum mds_op_bias bias, void *data)
133 struct obd_export *md_exp = ll_i2mdexp(inode);
134 const struct ll_inode_info *lli = ll_i2info(inode);
135 struct md_op_data *op_data;
136 struct ptlrpc_request *req = NULL;
/* Without a valid MDC connection the close cannot be sent at all. */
140 if (class_exp2obd(md_exp) == NULL) {
141 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
142 ll_get_fsname(inode->i_sb, NULL, 0),
143 PFID(&lli->lli_fid));
147 OBD_ALLOC_PTR(op_data);
148 /* We leak openhandle and request here on error, but not much to be
149 * done in OOM case since app won't retry close on error either. */
151 GOTO(out, rc = -ENOMEM);
153 ll_prepare_close(inode, op_data, och);
/* NOTE(review): no break is visible between the MERGE case and the
 * SPLIT/SWAP cases — presumably an intentional fallthrough so MERGE
 * shares the lease-handle/fid2 packing below; confirm in full source. */
155 case MDS_CLOSE_LAYOUT_MERGE:
156 /* merge blocks from the victim inode */
157 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
158 op_data->op_attr.ia_valid |= ATTR_SIZE;
159 op_data->op_xvalid |= OP_XVALID_BLOCKS;
160 case MDS_CLOSE_LAYOUT_SPLIT:
161 case MDS_CLOSE_LAYOUT_SWAP: {
162 struct split_param *sp = data;
164 LASSERT(data != NULL);
165 op_data->op_bias |= bias;
166 op_data->op_data_version = 0;
167 op_data->op_lease_handle = och->och_lease_handle;
168 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
/* SPLIT passes a struct split_param; SWAP/MERGE pass the inode. */
169 op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
170 op_data->op_mirror_id = sp->sp_mirror_id;
172 op_data->op_fid2 = *ll_inode2fid(data);
177 case MDS_CLOSE_RESYNC_DONE: {
178 struct ll_ioc_lease *ioc = data;
180 LASSERT(data != NULL);
/* NOTE(review): this scales op_attr_blocks by lil_count — looks like a
 * mirror-count adjustment; verify intent against full source. */
181 op_data->op_attr_blocks +=
182 ioc->lil_count * op_data->op_attr_blocks;
183 op_data->op_attr.ia_valid |= ATTR_SIZE;
184 op_data->op_xvalid |= OP_XVALID_BLOCKS;
185 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
187 op_data->op_lease_handle = och->och_lease_handle;
188 op_data->op_data = &ioc->lil_ids[0];
189 op_data->op_data_size =
190 ioc->lil_count * sizeof(ioc->lil_ids[0]);
194 case MDS_HSM_RELEASE:
195 LASSERT(data != NULL);
196 op_data->op_bias |= MDS_HSM_RELEASE;
/* @data is the data version captured before the release. */
197 op_data->op_data_version = *(__u64 *)data;
198 op_data->op_lease_handle = och->och_lease_handle;
199 op_data->op_attr.ia_valid |= ATTR_SIZE;
200 op_data->op_xvalid |= OP_XVALID_BLOCKS;
204 LASSERT(data == NULL);
/* If size/blocks were not explicitly set, let the MDT treat them lazily. */
208 if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
209 op_data->op_xvalid |= OP_XVALID_LAZYSIZE;
210 if (!(op_data->op_xvalid & OP_XVALID_BLOCKS))
211 op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS;
213 rc = md_close(md_exp, op_data, och->och_mod, &req);
214 if (rc != 0 && rc != -EINTR)
215 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
216 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
/* A biased close succeeded: check the server actually executed the intent. */
218 if (rc == 0 && op_data->op_bias & bias) {
219 struct mdt_body *body;
221 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
222 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
226 ll_finish_md_op_data(op_data);
230 md_clear_open_replay_data(md_exp, och);
/* Poison the handle so stale users are detectable. */
231 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
234 ptlrpc_req_finished(req); /* This is close request */
/*
 * Drop one reference on the per-mode (read/write/exec) MDS open handle
 * for @inode; when the last user goes away, close the handle on the MDS.
 */
238 int ll_md_real_close(struct inode *inode, fmode_t fmode)
240 struct ll_inode_info *lli = ll_i2info(inode);
241 struct obd_client_handle **och_p;
242 struct obd_client_handle *och;
/* Select the handle/usecount pair matching the open mode. */
247 if (fmode & FMODE_WRITE) {
248 och_p = &lli->lli_mds_write_och;
249 och_usecount = &lli->lli_open_fd_write_count;
250 } else if (fmode & FMODE_EXEC) {
251 och_p = &lli->lli_mds_exec_och;
252 och_usecount = &lli->lli_open_fd_exec_count;
254 LASSERT(fmode & FMODE_READ);
255 och_p = &lli->lli_mds_read_och;
256 och_usecount = &lli->lli_open_fd_read_count;
259 mutex_lock(&lli->lli_och_mutex);
260 if (*och_usecount > 0) {
261 /* There are still users of this handle, so skip
263 mutex_unlock(&lli->lli_och_mutex);
269 mutex_unlock(&lli->lli_och_mutex);
272 /* There might be a race and this handle may already
274 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/*
 * Per-file-descriptor close: release group lock and any lease, drop the
 * open-mode refcount, and talk to the MDS only if we do not hold a
 * matching OPEN DLM lock that lets us skip the RPC.  Always frees @fd.
 */
280 static int ll_md_close(struct inode *inode, struct file *file)
282 union ldlm_policy_data policy = {
283 .l_inodebits = { MDS_INODELOCK_OPEN },
285 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
286 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
287 struct ll_inode_info *lli = ll_i2info(inode);
288 struct lustre_handle lockh;
289 enum ldlm_mode lockmode;
293 /* clear group lock, if present */
294 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
295 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
297 if (fd->fd_lease_och != NULL) {
300 /* Usually the lease is not released when the
301 * application crashed, we need to release here. */
302 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
303 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
304 PFID(&lli->lli_fid), rc, lease_broken);
306 fd->fd_lease_och = NULL;
/* fd_och is the handle taken over by a lease (see ll_lease_och_acquire). */
309 if (fd->fd_och != NULL) {
310 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
315 /* Let's see if we have good enough OPEN lock on the file and if
316 we can skip talking to MDS */
317 mutex_lock(&lli->lli_och_mutex);
318 if (fd->fd_omode & FMODE_WRITE) {
320 LASSERT(lli->lli_open_fd_write_count);
321 lli->lli_open_fd_write_count--;
322 } else if (fd->fd_omode & FMODE_EXEC) {
324 LASSERT(lli->lli_open_fd_exec_count);
325 lli->lli_open_fd_exec_count--;
328 LASSERT(lli->lli_open_fd_read_count);
329 lli->lli_open_fd_read_count--;
331 mutex_unlock(&lli->lli_och_mutex);
/* No matching cached OPEN lock: must do the real close against the MDS. */
333 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
334 LDLM_IBITS, &policy, lockmode, &lockh))
335 rc = ll_md_real_close(inode, fd->fd_omode);
338 LUSTRE_FPRIVATE(file) = NULL;
339 ll_file_data_put(fd);
/*
 * VFS ->release() entry point: last ref on the struct file is gone.
 * Cleans up statahead authorization, async write errors, and the MDS
 * open handle.  The root dentry is special-cased (no MDS close needed).
 */
344 /* While this returns an error code, fput() the caller does not, so we need
345 * to make every effort to clean up all of our state here. Also, applications
346 * rarely check close errors and even if an error is returned they will not
347 * re-try the close call.
349 int ll_file_release(struct inode *inode, struct file *file)
351 struct ll_file_data *fd;
352 struct ll_sb_info *sbi = ll_i2sbi(inode);
353 struct ll_inode_info *lli = ll_i2info(inode);
357 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
358 PFID(ll_inode2fid(inode)), inode);
/* Don't count releases of the filesystem root in stats. */
360 if (inode->i_sb->s_root != file_dentry(file))
361 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
362 fd = LUSTRE_FPRIVATE(file);
365 /* The last ref on @file, maybe not the the owner pid of statahead,
366 * because parent and child process can share the same file handle. */
367 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
368 ll_deauthorize_statahead(inode, fd);
/* Root dentry: just free the fd, no MDS close required. */
370 if (inode->i_sb->s_root == file_dentry(file)) {
371 LUSTRE_FPRIVATE(file) = NULL;
372 ll_file_data_put(fd);
/* Propagate any deferred async write error to this close. */
376 if (!S_ISDIR(inode->i_mode)) {
377 if (lli->lli_clob != NULL)
378 lov_read_and_clear_async_rc(lli->lli_clob);
379 lli->lli_async_rc = 0;
382 rc = ll_md_close(inode, file);
384 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
385 libcfs_debug_dumplog();
/*
 * read_cache_page() filler: copy Data-on-MDT payload from the niobuf in
 * @data into @page, zero-fill the tail, and mark the page up to date.
 */
390 static inline int ll_dom_readpage(void *data, struct page *page)
392 struct niobuf_local *lnb = data;
395 kaddr = ll_kmap_atomic(page, KM_USER0);
396 memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
/* Partial page: zero the remainder so no stale data is exposed. */
397 if (lnb->lnb_len < PAGE_SIZE)
398 memset(kaddr + lnb->lnb_len, 0,
399 PAGE_SIZE - lnb->lnb_len);
400 flush_dcache_page(page);
401 SetPageUptodate(page);
402 ll_kunmap_atomic(kaddr, KM_USER0);
/*
 * Data-on-MDT open optimization: when the open reply carries inline file
 * data (RMF_NIOBUF_INLINE) and a DOM lock was granted, populate the page
 * cache from the reply buffer so subsequent reads avoid RPCs.
 */
408 void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
409 struct lookup_intent *it)
411 struct ll_inode_info *lli = ll_i2info(inode);
412 struct cl_object *obj = lli->lli_clob;
413 struct address_space *mapping = inode->i_mapping;
415 struct niobuf_remote *rnb;
417 struct lustre_handle lockh;
418 struct ldlm_lock *lock;
419 unsigned long index, start;
420 struct niobuf_local lnb;
421 bool dom_lock = false;
/* Only use the inline data if the granted lock has the DOM bit. */
428 if (it->it_lock_mode != 0) {
429 lockh.cookie = it->it_lock_handle;
430 lock = ldlm_handle2lock(&lockh);
432 dom_lock = ldlm_has_dom(lock);
438 if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
442 rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
443 if (rnb == NULL || rnb->rnb_len == 0)
446 /* LU-11595: Server may return whole file and that is OK always or
447 * it may return just file tail and its offset must be aligned with
448 * client PAGE_SIZE to be used on that client, if server's PAGE_SIZE is
449 * smaller then offset may be not aligned and that data is just ignored.
451 if (rnb->rnb_offset % PAGE_SIZE)
454 /* Server returns whole file or just file tail if it fills in
455 * reply buffer, in both cases total size should be inode size.
457 if (rnb->rnb_offset + rnb->rnb_len < i_size_read(inode)) {
458 CERROR("%s: server returns off/len %llu/%u < i_size %llu\n",
459 ll_get_fsname(inode->i_sb, NULL, 0), rnb->rnb_offset,
460 rnb->rnb_len, i_size_read(inode));
464 CDEBUG(D_INFO, "Get data along with open at %llu len %i, i_size %llu\n",
465 rnb->rnb_offset, rnb->rnb_len, i_size_read(inode));
/* Payload follows the niobuf_remote header in the reply buffer. */
467 data = (char *)rnb + sizeof(*rnb);
469 lnb.lnb_file_offset = rnb->rnb_offset;
470 start = lnb.lnb_file_offset / PAGE_SIZE;
472 LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
473 lnb.lnb_page_offset = 0;
/* Walk the payload one PAGE_SIZE slice at a time, filling the cache. */
475 lnb.lnb_data = data + (index << PAGE_SHIFT);
476 lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
477 if (lnb.lnb_len > PAGE_SIZE)
478 lnb.lnb_len = PAGE_SIZE;
480 vmpage = read_cache_page(mapping, index + start,
481 ll_dom_readpage, &lnb);
482 if (IS_ERR(vmpage)) {
483 CWARN("%s: cannot fill page %lu for "DFID
484 " with data: rc = %li\n",
485 ll_get_fsname(inode->i_sb, NULL, 0),
486 index + start, PFID(lu_object_fid(&obj->co_lu)),
492 } while (rnb->rnb_len > (index << PAGE_SHIFT));
/*
 * Issue an intent-OPEN enqueue to the MDS for @de, packing the name only
 * when the server lacks open-by-fid support.  On success, refreshes the
 * inode from the reply and finishes DOM open handling.
 */
496 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
497 struct lookup_intent *itp)
499 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
500 struct dentry *parent = de->d_parent;
501 const char *name = NULL;
503 struct md_op_data *op_data;
504 struct ptlrpc_request *req = NULL;
508 LASSERT(parent != NULL);
509 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
511 /* if server supports open-by-fid, or file name is invalid, don't pack
512 * name in open request */
513 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
514 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
515 name = de->d_name.name;
516 len = de->d_name.len;
519 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
520 name, len, 0, LUSTRE_OPC_ANY, NULL);
522 RETURN(PTR_ERR(op_data));
523 op_data->op_data = lmm;
524 op_data->op_data_size = lmmsize;
526 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
527 &ll_md_blocking_ast, 0);
528 ll_finish_md_op_data(op_data);
530 /* reason for keep own exit path - don`t flood log
531 * with messages with -ESTALE errors.
/* Enqueue failed: if the server granted an open anyway, release it. */
533 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
534 it_open_error(DISP_OPEN_OPEN, itp))
536 ll_release_openhandle(de, itp);
540 if (it_disposition(itp, DISP_LOOKUP_NEG))
541 GOTO(out, rc = -ENOENT);
543 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
544 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
545 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
549 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
551 if (!rc && itp->it_lock_mode) {
/* DOM inline data, if any, is consumed here while @req is alive. */
552 ll_dom_finish_open(de->d_inode, req, itp);
553 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
557 ptlrpc_req_finished(req);
558 ll_intent_drop_lock(itp);
560 /* We did open by fid, but by the time we got to the server,
561 * the object disappeared. If this is a create, we cannot really
562 * tell the userspace that the file it was trying to create
563 * does not exist. Instead let's return -ESTALE, and the VFS will
564 * retry the create with LOOKUP_REVAL that we are going to catch
565 * in ll_revalidate_dentry() and use lookup then.
567 if (rc == -ENOENT && itp->it_op & IT_CREAT)
/*
 * Populate @och from the mdt_body in the intent's reply (open handle,
 * fid, lease cookie, flags) and register it for open replay.
 */
573 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
574 struct obd_client_handle *och)
576 struct mdt_body *body;
578 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
579 och->och_open_handle = body->mbo_open_handle;
580 och->och_fid = body->mbo_fid1;
581 och->och_lease_handle.cookie = it->it_lock_handle;
582 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
583 och->och_flags = it->it_flags;
/* Register so the open can be replayed after MDS recovery. */
585 return md_set_open_replay_data(md_exp, och, it);
/*
 * Complete the client-side half of an open: optionally fill @och from
 * the intent, then attach @fd to the struct file and initialize its
 * readahead and cl_io context state.
 */
588 static int ll_local_open(struct file *file, struct lookup_intent *it,
589 struct ll_file_data *fd, struct obd_client_handle *och)
591 struct inode *inode = file_inode(file);
594 LASSERT(!LUSTRE_FPRIVATE(file));
601 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
606 LUSTRE_FPRIVATE(file) = fd;
607 ll_readahead_init(inode, &fd->fd_ras);
/* Remember the effective open mode for the matching close. */
608 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
610 /* ll_cl_context initialize */
611 rwlock_init(&fd->fd_lock);
612 INIT_LIST_HEAD(&fd->fd_lccs);
/*
 * VFS ->open() entry point.  Reuses an existing per-mode MDS open handle
 * when one is cached; otherwise builds an IT_OPEN intent (converting
 * f_flags to an access mode) and enqueues it, then finishes locally.
 */
617 /* Open a file, and (for the very first open) create objects on the OSTs at
618 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
619 * creation or open until ll_lov_setstripe() ioctl is called.
621 * If we already have the stripe MD locally then we don't request it in
622 * md_open(), by passing a lmm_size = 0.
624 * It is up to the application to ensure no other processes open this file
625 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
626 * used. We might be able to avoid races of that sort by getting lli_open_sem
627 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
628 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
630 int ll_file_open(struct inode *inode, struct file *file)
632 struct ll_inode_info *lli = ll_i2info(inode);
633 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
634 .it_flags = file->f_flags };
635 struct obd_client_handle **och_p = NULL;
636 __u64 *och_usecount = NULL;
637 struct ll_file_data *fd;
641 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
642 PFID(ll_inode2fid(inode)), inode, file->f_flags);
/* An intent may have been stashed here by the lookup path. */
644 it = file->private_data; /* XXX: compat macro */
645 file->private_data = NULL; /* prevent ll_local_open assertion */
647 fd = ll_file_data_get();
649 GOTO(out_nofiledata, rc = -ENOMEM);
652 if (S_ISDIR(inode->i_mode))
653 ll_authorize_statahead(inode, fd);
/* The root dentry needs no MDS open; attach fd and we are done. */
655 if (inode->i_sb->s_root == file_dentry(file)) {
656 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent from lookup: build our own IT_OPEN here. */
660 if (!it || !it->it_disposition) {
661 /* Convert f_flags into access mode. We cannot use file->f_mode,
662 * because everything but O_ACCMODE mask was stripped from
/* O_ACCMODE trick: O_RDONLY/O_WRONLY/O_RDWR + 1 == FMODE_READ/WRITE bits. */
664 if ((oit.it_flags + 1) & O_ACCMODE)
666 if (file->f_flags & O_TRUNC)
667 oit.it_flags |= FMODE_WRITE;
669 /* kernel only call f_op->open in dentry_open. filp_open calls
670 * dentry_open after call to open_namei that checks permissions.
671 * Only nfsd_open call dentry_open directly without checking
672 * permissions and because of that this code below is safe.
674 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
675 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
677 /* We do not want O_EXCL here, presumably we opened the file
678 * already? XXX - NFS implications? */
679 oit.it_flags &= ~O_EXCL;
681 /* bug20584, if "it_flags" contains O_CREAT, the file will be
682 * created if necessary, then "IT_CREAT" should be set to keep
683 * consistent with it */
684 if (oit.it_flags & O_CREAT)
685 oit.it_op |= IT_CREAT;
691 /* Let's see if we have file open on MDS already. */
692 if (it->it_flags & FMODE_WRITE) {
693 och_p = &lli->lli_mds_write_och;
694 och_usecount = &lli->lli_open_fd_write_count;
695 } else if (it->it_flags & FMODE_EXEC) {
696 och_p = &lli->lli_mds_exec_och;
697 och_usecount = &lli->lli_open_fd_exec_count;
699 och_p = &lli->lli_mds_read_och;
700 och_usecount = &lli->lli_open_fd_read_count;
703 mutex_lock(&lli->lli_och_mutex);
704 if (*och_p) { /* Open handle is present */
705 if (it_disposition(it, DISP_OPEN_OPEN)) {
706 /* Well, there's extra open request that we do not need,
707 let's close it somehow. This will decref request. */
708 rc = it_open_error(DISP_OPEN_OPEN, it);
710 mutex_unlock(&lli->lli_och_mutex);
711 GOTO(out_openerr, rc);
714 ll_release_openhandle(file_dentry(file), it);
/* Reuse the cached handle: open locally, no new MDS handle needed. */
718 rc = ll_local_open(file, it, fd, NULL);
721 mutex_unlock(&lli->lli_och_mutex);
722 GOTO(out_openerr, rc);
725 LASSERT(*och_usecount == 0);
726 if (!it->it_disposition) {
727 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
728 /* We cannot just request lock handle now, new ELC code
729 means that one of other OPEN locks for this file
730 could be cancelled, and since blocking ast handler
731 would attempt to grab och_mutex as well, that would
732 result in a deadlock */
733 mutex_unlock(&lli->lli_och_mutex);
735 * Normally called under two situations:
737 * 2. A race/condition on MDS resulting in no open
738 * handle to be returned from LOOKUP|OPEN request,
739 * for example if the target entry was a symlink.
741 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
742 * marked by a bit set in ll_iget_for_nfs. Clear the
743 * bit so that it's not confusing later callers.
745 * NB; when ldd is NULL, it must have come via normal
746 * lookup path only, since ll_iget_for_nfs always calls
749 if (ldd && ldd->lld_nfs_dentry) {
750 ldd->lld_nfs_dentry = 0;
751 it->it_flags |= MDS_OPEN_LOCK;
755 * Always specify MDS_OPEN_BY_FID because we don't want
756 * to get file with different fid.
758 it->it_flags |= MDS_OPEN_BY_FID;
759 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
762 GOTO(out_openerr, rc);
766 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
768 GOTO(out_och_free, rc = -ENOMEM);
772 /* md_intent_lock() didn't get a request ref if there was an
773 * open error, so don't do cleanup on the request here
775 /* XXX (green): Should not we bail out on any error here, not
776 * just open error? */
777 rc = it_open_error(DISP_OPEN_OPEN, it);
779 GOTO(out_och_free, rc);
781 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
782 "inode %p: disposition %x, status %d\n", inode,
783 it_disposition(it, ~0), it->it_status);
785 rc = ll_local_open(file, it, fd, *och_p);
787 GOTO(out_och_free, rc);
789 mutex_unlock(&lli->lli_och_mutex);
792 /* Must do this outside lli_och_mutex lock to prevent deadlock where
793 different kind of OPEN lock for this same inode gets cancelled
794 by ldlm_cancel_lru */
795 if (!S_ISREG(inode->i_mode))
796 GOTO(out_och_free, rc);
798 cl_lov_delay_create_clear(&file->f_flags);
799 GOTO(out_och_free, rc);
/* Error unwind: free the half-constructed open handle. */
803 if (och_p && *och_p) {
804 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
805 *och_p = NULL; /* OBD_FREE writes some magic there */
808 mutex_unlock(&lli->lli_och_mutex);
811 if (lli->lli_opendir_key == fd)
812 ll_deauthorize_statahead(inode, fd);
814 ll_file_data_put(fd);
816 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* Drop the enqueue's request reference if we still hold it. */
820 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
821 ptlrpc_req_finished(it->it_request);
822 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for a lease lock: on conflict, cancel the lease lock
 * asynchronously (the lease is thereby considered broken).
 */
828 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
829 struct ldlm_lock_desc *desc, void *data, int flag)
832 struct lustre_handle lockh;
836 case LDLM_CB_BLOCKING:
837 ldlm_lock2handle(lock, &lockh);
838 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
840 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
844 case LDLM_CB_CANCELING:
/*
 * Transfer ownership of the cached per-mode open handle into
 * fd->fd_och for lease use, returning the old open handle cookie in
 * @old_open_handle.  Fails with -EBUSY if a lease already exists or the
 * handle is shared by more than one opener.
 */
852 * When setting a lease on a file, we take ownership of the lli_mds_*_och
853 * and save it as fd->fd_och so as to force client to reopen the file even
854 * if it has an open lock in cache already.
856 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
857 struct lustre_handle *old_open_handle)
859 struct ll_inode_info *lli = ll_i2info(inode);
860 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
861 struct obd_client_handle **och_p;
866 /* Get the openhandle of the file */
867 mutex_lock(&lli->lli_och_mutex);
868 if (fd->fd_lease_och != NULL)
869 GOTO(out_unlock, rc = -EBUSY);
871 if (fd->fd_och == NULL) {
872 if (file->f_mode & FMODE_WRITE) {
873 LASSERT(lli->lli_mds_write_och != NULL);
874 och_p = &lli->lli_mds_write_och;
875 och_usecount = &lli->lli_open_fd_write_count;
877 LASSERT(lli->lli_mds_read_och != NULL);
878 och_p = &lli->lli_mds_read_och;
879 och_usecount = &lli->lli_open_fd_read_count;
/* Cannot steal a handle that other file descriptors still use. */
882 if (*och_usecount > 1)
883 GOTO(out_unlock, rc = -EBUSY);
890 *old_open_handle = fd->fd_och->och_open_handle;
894 mutex_unlock(&lli->lli_och_mutex);
/*
 * Return the lease-owned fd->fd_och to the per-mode cache slot, or close
 * it if another open (e.g. after a broken lease) already repopulated the
 * slot.
 */
899 * Release ownership on lli_mds_*_och when putting back a file lease.
901 static int ll_lease_och_release(struct inode *inode, struct file *file)
903 struct ll_inode_info *lli = ll_i2info(inode);
904 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
905 struct obd_client_handle **och_p;
906 struct obd_client_handle *old_och = NULL;
911 mutex_lock(&lli->lli_och_mutex);
912 if (file->f_mode & FMODE_WRITE) {
913 och_p = &lli->lli_mds_write_och;
914 och_usecount = &lli->lli_open_fd_write_count;
916 och_p = &lli->lli_mds_read_och;
917 och_usecount = &lli->lli_open_fd_read_count;
920 /* The file may have been open by another process (broken lease) so
921 * *och_p is not NULL. In this case we should simply increase usecount
924 if (*och_p != NULL) {
925 old_och = fd->fd_och;
932 mutex_unlock(&lli->lli_och_mutex);
/* Close the redundant handle outside the mutex. */
935 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
/*
 * Take a lease on @file: acquire the existing open handle, then reopen
 * the file by FID with MDS_OPEN_LEASE so the MDS grants a lease lock.
 * Returns the new obd_client_handle or an ERR_PTR on failure.
 */
941 * Acquire a lease and open the file.
943 static struct obd_client_handle *
944 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
947 struct lookup_intent it = { .it_op = IT_OPEN };
948 struct ll_sb_info *sbi = ll_i2sbi(inode);
949 struct md_op_data *op_data;
950 struct ptlrpc_request *req = NULL;
951 struct lustre_handle old_open_handle = { 0 };
952 struct obd_client_handle *och = NULL;
/* A lease must be exactly read or exactly write, never both/neither. */
957 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
958 RETURN(ERR_PTR(-EINVAL));
961 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
962 RETURN(ERR_PTR(-EPERM));
964 rc = ll_lease_och_acquire(inode, file, &old_open_handle);
971 RETURN(ERR_PTR(-ENOMEM));
973 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
974 LUSTRE_OPC_ANY, NULL);
976 GOTO(out, rc = PTR_ERR(op_data));
978 /* To tell the MDT this openhandle is from the same owner */
979 op_data->op_open_handle = old_open_handle;
981 it.it_flags = fmode | open_flags;
982 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
983 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
984 &ll_md_blocking_lease_ast,
985 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
986 * it can be cancelled which may mislead applications that the lease is
988 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
989 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
990 * doesn't deal with openhandle, so normal openhandle will be leaked. */
991 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
992 ll_finish_md_op_data(op_data);
993 ptlrpc_req_finished(req);
995 GOTO(out_release_it, rc);
997 if (it_disposition(&it, DISP_LOOKUP_NEG))
998 GOTO(out_release_it, rc = -ENOENT);
1000 rc = it_open_error(DISP_OPEN_OPEN, &it);
1002 GOTO(out_release_it, rc);
1004 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
1005 ll_och_fill(sbi->ll_md_exp, &it, och);
/* Servers that predate leases never set DISP_OPEN_LEASE. */
1007 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
1008 GOTO(out_close, rc = -EOPNOTSUPP);
1010 /* already get lease, handle lease lock */
1011 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
1012 if (it.it_lock_mode == 0 ||
1013 it.it_lock_bits != MDS_INODELOCK_OPEN) {
1014 /* open lock must return for lease */
1015 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
1016 PFID(ll_inode2fid(inode)), it.it_lock_mode,
1018 GOTO(out_close, rc = -EPROTO);
1021 ll_intent_release(&it);
1025 /* Cancel open lock */
1026 if (it.it_lock_mode != 0) {
1027 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
1029 it.it_lock_mode = 0;
1030 och->och_lease_handle.cookie = 0ULL;
1032 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
1034 CERROR("%s: error closing file "DFID": %d\n",
1035 ll_get_fsname(inode->i_sb, NULL, 0),
1036 PFID(&ll_i2info(inode)->lli_fid), rc2);
1037 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
1039 ll_intent_release(&it);
1043 RETURN(ERR_PTR(rc));
/*
 * Validate that a layout swap is permitted: both inodes are regular
 * files, writable by the caller, and on the same filesystem.
 */
1047 * Check whether a layout swap can be done between two inodes.
1049 * \param[in] inode1 First inode to check
1050 * \param[in] inode2 Second inode to check
1052 * \retval 0 on success, layout swap can be performed between both inodes
1053 * \retval negative error code if requirements are not met
1055 static int ll_check_swap_layouts_validity(struct inode *inode1,
1056 struct inode *inode2)
1058 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
1061 if (inode_permission(inode1, MAY_WRITE) ||
1062 inode_permission(inode2, MAY_WRITE))
1065 if (inode1->i_sb != inode2->i_sb)
/*
 * Close @och with the MDS_CLOSE_LAYOUT_SWAP bias, swapping layouts
 * between @inode and @inode2 in the same close RPC.  Consumes @och.
 */
1071 static int ll_swap_layouts_close(struct obd_client_handle *och,
1072 struct inode *inode, struct inode *inode2)
1074 const struct lu_fid *fid1 = ll_inode2fid(inode);
1075 const struct lu_fid *fid2;
1079 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
1080 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
1082 rc = ll_check_swap_layouts_validity(inode, inode2);
1084 GOTO(out_free_och, rc);
1086 /* We now know that inode2 is a lustre inode */
1087 fid2 = ll_inode2fid(inode2);
/* Swapping a file's layout with itself is rejected. */
1089 rc = lu_fid_cmp(fid1, fid2);
1091 GOTO(out_free_och, rc = -EINVAL);
1093 /* Close the file and {swap,merge} layouts between inode & inode2.
1094 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
1095 * because we still need it to pack l_remote_handle to MDT. */
1096 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
1099 och = NULL; /* freed in ll_close_inode_openhandle() */
/*
 * Release a lease, reporting via @lease_broken whether the lease lock
 * was already cancelled (broken).  If not broken and no bias is given,
 * cancel the lease lock explicitly before the close.
 */
1109 * Release lease and close the file.
1110 * It will check if the lease has ever broken.
1112 static int ll_lease_close_intent(struct obd_client_handle *och,
1113 struct inode *inode,
1114 bool *lease_broken, enum mds_op_bias bias,
1117 struct ldlm_lock *lock;
1118 bool cancelled = true;
1122 lock = ldlm_handle2lock(&och->och_lease_handle);
1124 lock_res_and_lock(lock);
1125 cancelled = ldlm_is_cancel(lock);
1126 unlock_res_and_lock(lock);
1127 LDLM_LOCK_PUT(lock);
1130 CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1131 PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1133 if (lease_broken != NULL)
1134 *lease_broken = cancelled;
1136 if (!cancelled && !bias)
1137 ldlm_cli_cancel(&och->och_lease_handle, 0);
/* A broken lease means the intent must not be executed. */
1139 if (cancelled) { /* no need to excute intent */
1144 rc = ll_close_inode_openhandle(inode, och, bias, data);
/* Plain lease release: ll_lease_close_intent() with no bias or data. */
1148 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1151 return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
/*
 * Handle the LL_IOC_LEASE resync request: flush cached pages (so writes
 * aren't rejected after the layout version bumps), then send
 * MDS_REINT_RESYNC carrying the lease handle and requested mirror id.
 * @arg is a userspace pointer to a struct ll_ioc_lease_id.
 */
1155 * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
1157 static int ll_lease_file_resync(struct obd_client_handle *och,
1158 struct inode *inode, unsigned long arg)
1160 struct ll_sb_info *sbi = ll_i2sbi(inode);
1161 struct md_op_data *op_data;
1162 struct ll_ioc_lease_id ioc;
1163 __u64 data_version_unused;
1167 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1168 LUSTRE_OPC_ANY, NULL);
1169 if (IS_ERR(op_data))
1170 RETURN(PTR_ERR(op_data));
1172 if (copy_from_user(&ioc, (struct ll_ioc_lease_id __user *)arg,
1176 /* before starting file resync, it's necessary to clean up page cache
1177 * in client memory, otherwise once the layout version is increased,
1178 * writing back cached data will be denied the OSTs. */
1179 rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1183 op_data->op_lease_handle = och->och_lease_handle;
1184 op_data->op_mirror_id = ioc.lil_mirror_id;
1185 rc = md_file_resync(sbi->ll_md_exp, op_data);
1191 ll_finish_md_op_data(op_data);
/*
 * Merge MDS-provided inode attributes with OST-side attributes (size,
 * blocks, timestamps) under the inode size lock.  Each timestamp keeps
 * the newer of the MDS and OST values.
 */
1195 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1197 struct ll_inode_info *lli = ll_i2info(inode);
1198 struct cl_object *obj = lli->lli_clob;
1199 struct cl_attr *attr = vvp_env_thread_attr(env);
1207 ll_inode_size_lock(inode);
1209 /* Merge timestamps the most recently obtained from MDS with
1210 * timestamps obtained from OSTs.
1212 * Do not overwrite atime of inode because it may be refreshed
1213 * by file_accessed() function. If the read was served by cache
1214 * data, there is no RPC to be sent so that atime may not be
1215 * transferred to OSTs at all. MDT only updates atime at close time
1216 * if it's at least 'mdd.*.atime_diff' older.
1217 * All in all, the atime in Lustre does not strictly comply with
1218 * POSIX. Solving this problem needs to send an RPC to MDT for each
1219 * read, this will hurt performance.
1221 if (inode->i_atime.tv_sec < lli->lli_atime ||
1222 lli->lli_update_atime) {
1223 inode->i_atime.tv_sec = lli->lli_atime;
1224 lli->lli_update_atime = 0;
1226 inode->i_mtime.tv_sec = lli->lli_mtime;
1227 inode->i_ctime.tv_sec = lli->lli_ctime;
/* Snapshot current (MDS-derived) times before comparing with OST's. */
1229 mtime = inode->i_mtime.tv_sec;
1230 atime = inode->i_atime.tv_sec;
1231 ctime = inode->i_ctime.tv_sec;
1233 cl_object_attr_lock(obj);
1234 if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1237 rc = cl_object_attr_get(env, obj, attr);
1238 cl_object_attr_unlock(obj);
/* -ENODATA (no OST objects yet) is not an error for the caller. */
1241 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1243 if (atime < attr->cat_atime)
1244 atime = attr->cat_atime;
1246 if (ctime < attr->cat_ctime)
1247 ctime = attr->cat_ctime;
1249 if (mtime < attr->cat_mtime)
1250 mtime = attr->cat_mtime;
1252 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1253 PFID(&lli->lli_fid), attr->cat_size);
1255 i_size_write(inode, attr->cat_size);
1256 inode->i_blocks = attr->cat_blocks;
1258 inode->i_mtime.tv_sec = mtime;
1259 inode->i_atime.tv_sec = atime;
1260 inode->i_ctime.tv_sec = ctime;
1263 ll_inode_size_unlock(inode);
1269 * Set designated mirror for I/O.
1271 * So far only read, write, and truncated can support to issue I/O to
1272 * designated mirror.
1274 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1276 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1278 /* clear layout version for generic(non-resync) I/O in case it carries
1279 * stale layout version due to I/O restart */
1280 io->ci_layout_version = 0;
1282 /* FLR: disable non-delay for designated mirror I/O because obviously
1283 * only one mirror is available */
1284 if (fd->fd_designated_mirror > 0) {
1286 io->ci_designated_mirror = fd->fd_designated_mirror;
1287 io->ci_layout_version = fd->fd_layout_version;
1290 CDEBUG(D_VFSTRACE, "%s: desiginated mirror: %d\n",
1291 file->f_path.dentry->d_name.name, io->ci_designated_mirror);
/*
 * file_is_noatime() - decide whether this open file should skip atime
 * updates, checking (in order) the open flags, inode flags, filesystem
 * IS_NOATIME state, mount flags, and the nodiratime cases for directories.
 * NOTE(review): truncated chunk — the "return true"/"return false" lines
 * between the checks are not visible here.
 */
1294 static bool file_is_noatime(const struct file *file)
1296 const struct vfsmount *mnt = file->f_path.mnt;
1297 const struct inode *inode = file_inode((struct file *)file);
1299 /* Adapted from file_accessed() and touch_atime().*/
1300 if (file->f_flags & O_NOATIME)
1303 if (inode->i_flags & S_NOATIME)
1306 if (IS_NOATIME(inode))
1309 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1312 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1315 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/*
 * ll_io_init() - initialize a cl_io for a read or write on @file:
 * propagates O_NONBLOCK/O_APPEND/O_SYNC/O_DIRECT semantics, chooses the
 * DLM locking mode (never / mandatory / maybe), and sets noatime and the
 * FLR non-delay/mirror parameters.
 */
1321 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1323 struct inode *inode = file_inode(file);
1324 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1326 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1327 io->ci_lock_no_expand = fd->ll_lock_no_expand;
1329 if (iot == CIT_WRITE) {
1330 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
/* sync write if O_SYNC/O_DIRECT (third condition on missing line 1333) */
1331 io->u.ci_wr.wr_sync = !!(file->f_flags & O_SYNC ||
1332 file->f_flags & O_DIRECT ||
1335 io->ci_obj = ll_i2info(inode)->lli_clob;
1336 io->ci_lockreq = CILR_MAYBE;
1337 if (ll_file_nolock(file)) {
1338 io->ci_lockreq = CILR_NEVER;
1339 io->ci_no_srvlock = 1;
1340 } else if (file->f_flags & O_APPEND) {
/* appends must extend the file consistently -> always take the lock */
1341 io->ci_lockreq = CILR_MANDATORY;
1343 io->ci_noatime = file_is_noatime(file);
1345 /* FLR: only use non-delay I/O for read as there is only one
1346 * avaliable mirror for write. */
1347 io->ci_ndelay = !(iot == CIT_WRITE);
1349 ll_io_set_mirror(io, file);
/*
 * ll_file_io_generic() - common engine for read/write/splice I/O.
 *
 * Initializes a cl_io, takes the per-inode range lock when required
 * (writes, and O_DIRECT reads - see LU-6227), runs cl_io_loop(), and
 * restarts the whole I/O when the layout changed underneath it
 * (io->ci_need_restart), preserving the FLR non-delay retry count.
 * Accounts read/write byte statistics and tracks fd_write_failed.
 * NOTE(review): truncated chunk — io/rc/result declarations, the restart
 * loop back-edge, and several braces/labels are not visible here.
 */
1353 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1354 struct file *file, enum cl_io_type iot,
1355 loff_t *ppos, size_t count)
1357 struct vvp_io *vio = vvp_env_io(env);
1358 struct inode *inode = file_inode(file);
1359 struct ll_inode_info *lli = ll_i2info(inode);
1360 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1361 struct range_lock range;
1365 unsigned retried = 0;
1366 bool restarted = false;
1370 CDEBUG(D_VFSTRACE, "%s: %s ppos: %llu, count: %zu\n",
1371 file_dentry(file)->d_name.name,
1372 iot == CIT_READ ? "read" : "write", *ppos, count);
1375 io = vvp_env_thread_io(env);
1376 ll_io_init(io, file, iot);
/* carry the FLR retry count across layout-triggered restarts */
1377 io->ci_ndelay_tried = retried;
1379 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1380 bool range_locked = false;
/* O_APPEND writes the end of file: lock the whole range */
1382 if (file->f_flags & O_APPEND)
1383 range_lock_init(&range, 0, LUSTRE_EOF);
1385 range_lock_init(&range, *ppos, *ppos + count - 1);
1387 vio->vui_fd = LUSTRE_FPRIVATE(file);
1388 vio->vui_io_subtype = args->via_io_subtype;
1390 switch (vio->vui_io_subtype) {
1392 vio->vui_iter = args->u.normal.via_iter;
1393 vio->vui_iocb = args->u.normal.via_iocb;
1394 /* Direct IO reads must also take range lock,
1395 * or multiple reads will try to work on the same pages
1396 * See LU-6227 for details. */
1397 if (((iot == CIT_WRITE) ||
1398 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1399 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1400 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1402 rc = range_lock(&lli->lli_write_tree, &range);
1406 range_locked = true;
1410 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1411 vio->u.splice.vui_flags = args->u.splice.via_flags;
1414 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1418 ll_cl_add(file, env, io, LCC_RW);
1419 rc = cl_io_loop(env, io);
1420 ll_cl_remove(file, env);
1423 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1425 range_unlock(&lli->lli_write_tree, &range);
1428 /* cl_io_rw_init() handled IO */
1432 if (io->ci_nob > 0) {
1433 result += io->ci_nob;
1434 count -= io->ci_nob;
1435 *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1437 /* prepare IO restart */
1438 if (count > 0 && args->via_io_subtype == IO_NORMAL)
1439 args->u.normal.via_iter = vio->vui_iter;
1442 cl_io_fini(env, io);
1445 "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1446 file->f_path.dentry->d_name.name,
1447 iot, rc, result, io->ci_need_restart);
/* layout changed mid-I/O: loop again with remaining count */
1449 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1451 "%s: restart %s from %lld, count: %zu, ret: %zd, rc: %d\n",
1452 file_dentry(file)->d_name.name,
1453 iot == CIT_READ ? "read" : "write",
1454 *ppos, count, result, rc);
1455 /* preserve the tried count for FLR */
1456 retried = io->ci_ndelay_tried;
1461 if (iot == CIT_READ) {
1463 ll_stats_ops_tally(ll_i2sbi(inode),
1464 LPROC_LL_READ_BYTES, result);
1465 } else if (iot == CIT_WRITE) {
1467 ll_stats_ops_tally(ll_i2sbi(inode),
1468 LPROC_LL_WRITE_BYTES, result);
1469 fd->fd_write_failed = false;
1470 } else if (result == 0 && rc == 0) {
1473 fd->fd_write_failed = true;
1475 fd->fd_write_failed = false;
/* -ERESTARTSYS (signal) is not treated as a write failure */
1476 } else if (rc != -ERESTARTSYS) {
1477 fd->fd_write_failed = true;
1481 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1483 RETURN(result > 0 ? result : rc);
1487 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1488 * especially for small I/O.
1490 * To serve a read request, CLIO has to create and initialize a cl_io and
1491 * then request DLM lock. This has turned out to have siginificant overhead
1492 * and affects the performance of small I/O dramatically.
1494 * It's not necessary to create a cl_io for each I/O. Under the help of read
1495 * ahead, most of the pages being read are already in memory cache and we can
1496 * read those pages directly because if the pages exist, the corresponding DLM
1497 * lock must exist so that page content must be valid.
1499 * In fast read implementation, the llite speculatively finds and reads pages
1500 * in memory cache. There are three scenarios for fast read:
1501 * - If the page exists and is uptodate, kernel VM will provide the data and
1502 * CLIO won't be intervened;
1503 * - If the page was brought into memory by read ahead, it will be exported
1504 * and read ahead parameters will be updated;
1505 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1506 * it will go back and invoke normal read, i.e., a cl_io will be created
1507 * and DLM lock will be requested.
1509 * POSIX compliance: posix standard states that read is intended to be atomic.
1510 * Lustre read implementation is in line with Linux kernel read implementation
1511 * and neither of them complies with POSIX standard in this matter. Fast read
1512 * doesn't make the situation worse on single node but it may interleave write
1513 * results from multiple nodes due to short read handling in ll_file_aio_read().
1515 * \param env - lu_env
1516 * \param iocb - kiocb from kernel
1517 * \param iter - user space buffers where the data will be copied
1519 * \retval - number of bytes have been read, or error code if error occurred.
/* NOTE(review): truncated chunk — result declaration and the early-return
 * statements after the feature/O_DIRECT checks are not visible here. */
1522 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1526 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1529 /* NB: we can't do direct IO for fast read because it will need a lock
1530 * to make IO engine happy. */
1531 if (iocb->ki_filp->f_flags & O_DIRECT)
1534 result = generic_file_read_iter(iocb, iter)
1536 /* If the first page is not in cache, generic_file_aio_read() will be
1537 * returned with -ENODATA.
1538 * See corresponding code in ll_readpage(). */
1539 if (result == -ENODATA)
1543 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1544 LPROC_LL_READ_BYTES, result);
1550 * Read from a file (through the page cache).
/* Tries the lockless fast-read path first; only falls back to the full
 * cl_io machinery (ll_file_io_generic) for the bytes fast read could not
 * serve.  NOTE(review): truncated — result/rc2 declarations and the
 * result-combining lines after the generic call are not visible here. */
1552 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1555 struct vvp_io_args *args;
1560 result = ll_do_fast_read(iocb, to);
/* fast read consumed everything (or failed hard): no cl_io needed */
1561 if (result < 0 || iov_iter_count(to) == 0)
1564 env = cl_env_get(&refcheck);
1566 return PTR_ERR(env);
1568 args = ll_env_args(env, IO_NORMAL);
1569 args->u.normal.via_iter = to;
1570 args->u.normal.via_iocb = iocb;
1572 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1573 &iocb->ki_pos, iov_iter_count(to));
1576 else if (result == 0)
1579 cl_env_put(env, &refcheck);
1585 * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1586 * If a page is already in the page cache and dirty (and some other things -
1587 * See ll_tiny_write_begin for the instantiation of these rules), then we can
1588 * write to it without doing a full I/O, because Lustre already knows about it
1589 * and will write it out. This saves a lot of processing time.
1591 * All writes here are within one page, so exclusion is handled by the page
1592 * lock on the vm page. We do not do tiny writes for writes which touch
1593 * multiple pages because it's very unlikely multiple sequential pages are
1594 * are already dirty.
1596 * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1597 * and are unlikely to be to already dirty pages.
1599 * Attribute updates are important here, we do them in ll_tiny_write_end.
1601 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1603 ssize_t count = iov_iter_count(iter);
1604 struct file *file = iocb->ki_filp;
1605 struct inode *inode = file_inode(file);
1606 bool lock_inode = !IS_NOSEC(inode);
1611 /* Restrict writes to single page and < PAGE_SIZE. See comment at top
1612 * of function for why.
1614 if (count >= PAGE_SIZE ||
1615 (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
1618 if (unlikely(lock_inode))
1620 result = __generic_file_write_iter(iocb, iter);
1622 if (unlikely(lock_inode))
1623 inode_unlock(inode);
1625 /* If the page is not already dirty, ll_tiny_write_begin returns
1626 * -ENODATA. We continue on to normal write.
1628 if (result == -ENODATA)
1632 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1634 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1637 CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1643 * Write to a file (through the page cache).
/* Attempts the tiny-write shortcut first (not for O_DIRECT/O_SYNC/O_APPEND),
 * then runs the remaining bytes through ll_file_io_generic, combining the
 * two byte counts on success.  NOTE(review): truncated — env/refcheck
 * declarations and the final RETURN are not visible here. */
1645 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1647 struct vvp_io_args *args;
1649 ssize_t rc_tiny = 0, rc_normal;
1654 /* NB: we can't do direct IO for tiny writes because they use the page
1655 * cache, we can't do sync writes because tiny writes can't flush
1656 * pages, and we can't do append writes because we can't guarantee the
1657 * required DLM locks are held to protect file size.
1659 if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(iocb->ki_filp))) &&
1660 !(iocb->ki_filp->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1661 rc_tiny = ll_do_tiny_write(iocb, from);
1663 /* In case of error, go on and try normal write - Only stop if tiny
1664 * write completed I/O.
1666 if (iov_iter_count(from) == 0)
1667 GOTO(out, rc_normal = rc_tiny);
1669 env = cl_env_get(&refcheck);
1671 return PTR_ERR(env);
1673 args = ll_env_args(env, IO_NORMAL);
1674 args->u.normal.via_iter = from;
1675 args->u.normal.via_iocb = iocb;
1677 rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1678 &iocb->ki_pos, iov_iter_count(from));
1680 /* On success, combine bytes written. */
1681 if (rc_tiny >= 0 && rc_normal > 0)
1682 rc_normal += rc_tiny;
1683 /* On error, only return error from normal write if tiny write did not
1684 * write any bytes. Otherwise return bytes written by tiny write.
1686 else if (rc_tiny > 0)
1687 rc_normal = rc_tiny;
1689 cl_env_put(env, &refcheck);
1694 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1696 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/* Validates an iovec array: rejects negative / wrapping lengths, and on the
 * first inaccessible segment truncates *nr_segs and *count to the prefix
 * that can be used (partial transfer semantics).  NOTE(review): truncated —
 * the cnt declaration, continue/break statements and the -EFAULT path are
 * not visible here. */
1698 static int ll_file_get_iov_count(const struct iovec *iov,
1699 unsigned long *nr_segs, size_t *count)
1704 for (seg = 0; seg < *nr_segs; seg++) {
1705 const struct iovec *iv = &iov[seg];
1708 * If any segment has a negative length, or the cumulative
1709 * length ever wraps negative then return -EINVAL.
1712 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1714 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1719 cnt -= iv->iov_len; /* This segment is no good */
/* Compat wrapper for kernels without read_iter: validates the iovec array,
 * builds an iov_iter (API differs with HAVE_IOV_ITER_INIT_DIRECTION) and
 * delegates to ll_file_read_iter().  NOTE(review): truncated — struct
 * iov_iter/result declarations and the RETURN are not visible here. */
1726 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1727 unsigned long nr_segs, loff_t pos)
1734 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1738 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1739 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1740 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1741 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1742 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1744 result = ll_file_read_iter(iocb, &to);
1749 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1752 struct iovec iov = { .iov_base = buf, .iov_len = count };
1757 init_sync_kiocb(&kiocb, file);
1758 kiocb.ki_pos = *ppos;
1759 #ifdef HAVE_KIOCB_KI_LEFT
1760 kiocb.ki_left = count;
1761 #elif defined(HAVE_KI_NBYTES)
1762 kiocb.i_nbytes = count;
1765 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1766 *ppos = kiocb.ki_pos;
1772 * Write to a file (through the page cache).
/* Compat wrapper for kernels without write_iter: mirrors ll_file_aio_read
 * but builds a WRITE-direction iov_iter and delegates to
 * ll_file_write_iter().  NOTE(review): truncated — iov_count/result
 * declarations and the RETURN are not visible here. */
1775 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1776 unsigned long nr_segs, loff_t pos)
1778 struct iov_iter from;
1783 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1787 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1788 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1789 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1790 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1791 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1793 result = ll_file_write_iter(iocb, &from);
/* Compat write entry point (kernels without write_iter): synchronous kiocb
 * + single iovec around the user buffer, delegating to ll_file_aio_write()
 * and propagating the updated position.  NOTE(review): truncated — kiocb
 * declaration, #endif and RETURN are not visible here. */
1798 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1799 size_t count, loff_t *ppos)
1801 struct iovec iov = { .iov_base = (void __user *)buf,
1808 init_sync_kiocb(&kiocb, file);
1809 kiocb.ki_pos = *ppos;
1810 #ifdef HAVE_KIOCB_KI_LEFT
1811 kiocb.ki_left = count;
1812 #elif defined(HAVE_KI_NBYTES)
1813 kiocb.ki_nbytes = count;
1816 result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1817 *ppos = kiocb.ki_pos;
1821 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1824 * Send file content (through pagecache) somewhere with helper
/* splice_read implementation: routes through ll_file_io_generic with the
 * IO_SPLICE subtype so the pipe/flags reach vvp_io.  NOTE(review):
 * truncated — result/refcheck declarations and RETURN not visible. */
1826 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1827 struct pipe_inode_info *pipe, size_t count,
1831 struct vvp_io_args *args;
1836 env = cl_env_get(&refcheck);
1838 RETURN(PTR_ERR(env));
1840 args = ll_env_args(env, IO_SPLICE);
1841 args->u.splice.via_pipe = pipe;
1842 args->u.splice.via_flags = flags;
1844 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1845 cl_env_put(env, &refcheck);
/* Apply a striping layout (lum) to @inode by re-opening it by FID with the
 * layout attached to the open intent, under the inode size lock; the open
 * handle is released immediately afterwards.  NOTE(review): truncated —
 * oit initializer tail, rc declaration and RETURN are not visible here. */
1849 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1850 __u64 flags, struct lov_user_md *lum, int lum_size)
1852 struct lookup_intent oit = {
1854 .it_flags = flags | MDS_OPEN_BY_FID,
1859 ll_inode_size_lock(inode);
1860 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1862 GOTO(out_unlock, rc);
/* the open was only needed to carry the layout; close it right away */
1864 ll_release_openhandle(dentry, &oit);
1867 ll_inode_size_unlock(inode);
1868 ll_intent_release(&oit);
/*
 * ll_lov_getstripe_ea_info() - fetch the LOV EA (striping metadata) of
 * @filename (child of @inode) via md_getattr_name, validate its magic,
 * and byte-swap it to host endianness for userspace on big-endian hosts.
 * The reply request is returned to the caller via @request (caller frees).
 * NOTE(review): truncated chunk — rc/lmmsize declarations, several GOTO
 * targets, stripe_count swab arguments and out label are not visible.
 */
1873 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1874 struct lov_mds_md **lmmp, int *lmm_size,
1875 struct ptlrpc_request **request)
1877 struct ll_sb_info *sbi = ll_i2sbi(inode);
1878 struct mdt_body *body;
1879 struct lov_mds_md *lmm = NULL;
1880 struct ptlrpc_request *req = NULL;
1881 struct md_op_data *op_data;
1884 rc = ll_get_default_mdsize(sbi, &lmmsize);
1888 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1889 strlen(filename), lmmsize,
1890 LUSTRE_OPC_ANY, NULL);
1891 if (IS_ERR(op_data))
1892 RETURN(PTR_ERR(op_data));
1894 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1895 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1896 ll_finish_md_op_data(op_data);
1898 CDEBUG(D_INFO, "md_getattr_name failed "
1899 "on %s: rc %d\n", filename, rc);
1903 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1904 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1906 lmmsize = body->mbo_eadatasize;
1908 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1910 GOTO(out, rc = -ENODATA);
1913 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1914 LASSERT(lmm != NULL);
/* only plain v1/v3 and composite (PFL) layouts are expected here */
1916 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
1917 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
1918 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
1919 GOTO(out, rc = -EPROTO);
1922 * This is coming from the MDS, so is probably in
1923 * little endian. We convert it to host endian before
1924 * passing it to userspace.
/* no-op on little-endian hosts: swab only when host order differs */
1926 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1929 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
1930 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1931 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1932 if (le32_to_cpu(lmm->lmm_pattern) &
1933 LOV_PATTERN_F_RELEASED)
1937 /* if function called for directory - we should
1938 * avoid swab not existent lsm objects */
1939 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1940 lustre_swab_lov_user_md_v1(
1941 (struct lov_user_md_v1 *)lmm);
1942 if (S_ISREG(body->mbo_mode))
1943 lustre_swab_lov_user_md_objects(
1944 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1946 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1947 lustre_swab_lov_user_md_v3(
1948 (struct lov_user_md_v3 *)lmm);
1949 if (S_ISREG(body->mbo_mode))
1950 lustre_swab_lov_user_md_objects(
1951 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1953 } else if (lmm->lmm_magic ==
1954 cpu_to_le32(LOV_MAGIC_COMP_V1)) {
1955 lustre_swab_lov_comp_md_v1(
1956 (struct lov_comp_md_v1 *)lmm);
1962 *lmm_size = lmmsize;
/* LL_IOC_LOV_SETEA handler: privileged (CAP_SYS_ADMIN) path that copies a
 * lov_user_md (+ one OST object entry) from userspace and applies it via
 * ll_lov_setstripe_ea_info with MDS_OPEN_HAS_OBJS, i.e. with the object
 * array pre-filled.  NOTE(review): truncated — rc declaration, allocation
 * failure branch and RETURN are not visible here. */
1967 static int ll_lov_setea(struct inode *inode, struct file *file,
1970 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1971 struct lov_user_md *lump;
1972 int lum_size = sizeof(struct lov_user_md) +
1973 sizeof(struct lov_user_ost_data);
1977 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1980 OBD_ALLOC_LARGE(lump, lum_size);
1984 if (copy_from_user(lump, arg, lum_size))
1985 GOTO(out_lump, rc = -EFAULT);
1987 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
/* delayed-create flag must be cleared whether or not the set succeeded */
1989 cl_lov_delay_create_clear(&file->f_flags);
1992 OBD_FREE_LARGE(lump, lum_size);
/* Copy the file's striping description to the userspace buffer @lum of
 * @size bytes via cl_object_getstripe().  NOTE(review): truncated — rc and
 * refcheck declarations and the RETURN are not visible here. */
1996 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
2003 env = cl_env_get(&refcheck);
2005 RETURN(PTR_ERR(env));
2007 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2008 cl_env_put(env, &refcheck);
/* LL_IOC_LOV_SETSTRIPE handler: copy the user layout, apply it, then
 * refresh the layout generation and echo the instantiated layout back to
 * the user buffer (stripe_count first zeroed so old tools see progress).
 * NOTE(review): truncated — rc/gen/lum_size declarations and the
 * intermediate error checks are not visible here. */
2012 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2015 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2016 struct lov_user_md *klum;
2018 __u64 flags = FMODE_WRITE;
2021 rc = ll_copy_user_md(lum, &klum);
2026 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
2031 rc = put_user(0, &lum->lmm_stripe_count);
2035 rc = ll_layout_refresh(inode, &gen);
2039 rc = ll_file_getstripe(inode, arg, lum_size);
2041 cl_lov_delay_create_clear(&file->f_flags);
2044 OBD_FREE(klum, lum_size);
/*
 * ll_get_grouplock() - LL_IOC_GROUP_LOCK handler: take a group (GID) lock
 * on the file.  Rejects gid 0, nolock mounts and double-locking; for
 * composite (PFL) layouts it first instantiates all components so the
 * group lock covers every OST object; rechecks for a racing locker before
 * publishing the lock in the ll_file_data under lli_lock.
 * NOTE(review): truncated chunk — rc declarations, several RETURNs and
 * closing braces are not visible here.
 */
2049 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2051 struct ll_inode_info *lli = ll_i2info(inode);
2052 struct cl_object *obj = lli->lli_clob;
2053 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2054 struct ll_grouplock grouplock;
2059 CWARN("group id for group lock must not be 0\n");
2063 if (ll_file_nolock(file))
2064 RETURN(-EOPNOTSUPP);
2066 spin_lock(&lli->lli_lock);
2067 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2068 CWARN("group lock already existed with gid %lu\n",
2069 fd->fd_grouplock.lg_gid);
2070 spin_unlock(&lli->lli_lock);
2073 LASSERT(fd->fd_grouplock.lg_lock == NULL);
2074 spin_unlock(&lli->lli_lock);
2077 * XXX: group lock needs to protect all OST objects while PFL
2078 * can add new OST objects during the IO, so we'd instantiate
2079 * all OST objects before getting its group lock.
2084 struct cl_layout cl = {
2085 .cl_is_composite = false,
2087 struct lu_extent ext = {
2089 .e_end = OBD_OBJECT_EOF,
2092 env = cl_env_get(&refcheck);
2094 RETURN(PTR_ERR(env));
2096 rc = cl_object_layout_get(env, obj, &cl);
/* composite layout: force instantiation of all components first */
2097 if (!rc && cl.cl_is_composite)
2098 rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2101 cl_env_put(env, &refcheck);
2106 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2107 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* recheck under lli_lock: another thread may have locked meanwhile */
2111 spin_lock(&lli->lli_lock);
2112 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2113 spin_unlock(&lli->lli_lock);
2114 CERROR("another thread just won the race\n");
2115 cl_put_grouplock(&grouplock);
2119 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2120 fd->fd_grouplock = grouplock;
2121 spin_unlock(&lli->lli_lock);
2123 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/* LL_IOC_GROUP_UNLOCK handler: verify a group lock is held and that @arg
 * matches its gid, detach it from the ll_file_data under lli_lock, then
 * release the DLM lock outside the spinlock.  NOTE(review): truncated —
 * RETURN statements after the warnings are not visible here. */
2127 static int ll_put_grouplock(struct inode *inode, struct file *file,
2130 struct ll_inode_info *lli = ll_i2info(inode);
2131 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2132 struct ll_grouplock grouplock;
2135 spin_lock(&lli->lli_lock);
2136 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2137 spin_unlock(&lli->lli_lock);
2138 CWARN("no group lock held\n");
2142 LASSERT(fd->fd_grouplock.lg_lock != NULL);
2144 if (fd->fd_grouplock.lg_gid != arg) {
2145 CWARN("group lock %lu doesn't match current id %lu\n",
2146 arg, fd->fd_grouplock.lg_gid);
2147 spin_unlock(&lli->lli_lock);
/* detach under the spinlock, release the lock itself outside it */
2151 grouplock = fd->fd_grouplock;
2152 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2153 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2154 spin_unlock(&lli->lli_lock);
2156 cl_put_grouplock(&grouplock);
2157 CDEBUG(D_INFO, "group lock %lu released\n", arg);
2162 * Close inode open handle
2164 * \param dentry [in] dentry which contains the inode
2165 * \param it [in,out] intent which contains open info and result
2168 * \retval <0 failure
/* Closes the MDS open handle carried by a lookup intent (no-op for the
 * root dentry or when the intent holds no open).  NOTE(review): truncated
 * — rc declaration, GOTO out targets and RETURN are not visible here. */
2170 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2172 struct inode *inode = dentry->d_inode;
2173 struct obd_client_handle *och;
2179 /* Root ? Do nothing. */
2180 if (dentry->d_inode->i_sb->s_root == dentry)
2183 /* No open handle to close? Move away */
2184 if (!it_disposition(it, DISP_OPEN_OPEN))
2187 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2189 OBD_ALLOC(och, sizeof(*och));
2191 GOTO(out, rc = -ENOMEM);
2193 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2195 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2197 /* this one is in place of ll_file_open */
2198 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2199 ptlrpc_req_finished(it->it_request);
2200 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2206 * Get size for inode for which FIEMAP mapping is requested.
2207 * Make the FIEMAP get_info call and returns the result.
2208 * \param fiemap kernel buffer to hold extens
2209 * \param num_bytes kernel buffer size
/* NOTE(review): truncated chunk — env/rc/refcheck declarations, the
 * "unsupported flags" return path and GOTO targets are not visible. */
2211 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2217 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2220 /* Checks for fiemap flags */
2221 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* report back which flags are unsupported by stripping the known ones */
2222 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2226 /* Check for FIEMAP_FLAG_SYNC */
2227 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2228 rc = filemap_fdatawrite(inode->i_mapping);
2233 env = cl_env_get(&refcheck);
2235 RETURN(PTR_ERR(env));
/* size unknown yet: glimpse from the OSTs before building the key */
2237 if (i_size_read(inode) == 0) {
2238 rc = ll_glimpse_size(inode);
2243 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2244 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2245 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2247 /* If filesize is 0, then there would be no objects for mapping */
2248 if (fmkey.lfik_oa.o_size == 0) {
2249 fiemap->fm_mapped_extents = 0;
2253 fmkey.lfik_fiemap = *fiemap;
2255 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2256 &fmkey, fiemap, &num_bytes);
2258 cl_env_put(env, &refcheck);
/*
 * ll_fid2path() - OBD_IOC_FID2PATH ioctl: resolve a FID to a path via the
 * MDC.  Requires CAP_DAC_READ_SEARCH unless the user_fid2path mount flag
 * is set.  The user-supplied gf_pathlen bounds the reply buffer; the root
 * FID is appended so the MDT resolves relative to the right fileset.
 * NOTE(review): truncated — rc/pathlen/outsize declarations and several
 * error returns / gf_free label are not visible here.
 */
2262 int ll_fid2path(struct inode *inode, void __user *arg)
2264 struct obd_export *exp = ll_i2mdexp(inode);
2265 const struct getinfo_fid2path __user *gfin = arg;
2267 struct getinfo_fid2path *gfout;
2273 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2274 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2277 /* Only need to get the buflen */
2278 if (get_user(pathlen, &gfin->gf_pathlen))
/* bound the kernel allocation by what a path can legally be */
2281 if (pathlen > PATH_MAX)
2284 outsize = sizeof(*gfout) + pathlen;
2285 OBD_ALLOC(gfout, outsize);
2289 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2290 GOTO(gf_free, rc = -EFAULT);
2291 /* append root FID after gfout to let MDT know the root FID so that it
2292 * can lookup the correct path, this is mainly for fileset.
2293 * old server without fileset mount support will ignore this. */
2294 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2296 /* Call mdc_iocontrol */
2297 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2301 if (copy_to_user(arg, gfout, outsize))
2305 OBD_FREE(gfout, outsize);
/* Run a CIT_DATA_VERSION cl_io to obtain the file's data version and
 * layout version; retries transparently when the io reports
 * ci_need_restart (layout change).  A file with no object yields version
 * 0 / layout UINT_MAX.  NOTE(review): truncated — env/io/result/refcheck
 * declarations, the restart back-edge and RETURN are not visible here. */
2310 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2312 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2320 ioc->idv_version = 0;
2321 ioc->idv_layout_version = UINT_MAX;
2323 /* If no file object initialized, we consider its version is 0. */
2327 env = cl_env_get(&refcheck);
2329 RETURN(PTR_ERR(env));
2331 io = vvp_env_thread_io(env);
2333 io->u.ci_data_version.dv_data_version = 0;
2334 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2335 io->u.ci_data_version.dv_flags = ioc->idv_flags;
2338 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2339 result = cl_io_loop(env, io);
2341 result = io->ci_result;
2343 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2344 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2346 cl_io_fini(env, io);
/* layout changed while sampling: redo the whole data-version io */
2348 if (unlikely(io->ci_need_restart))
2351 cl_env_put(env, &refcheck);
2357 * Read the data_version for inode.
2359 * This value is computed using stripe object version on OST.
2360 * Version is computed using server side locking.
2362 * @param flags if do sync on the OST side;
2364 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2365 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/* Thin wrapper over ll_ioc_data_version() that extracts only the data
 * version.  NOTE(review): truncated — the rc check before the assignment
 * and the RETURN are not visible here. */
2367 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2369 struct ioc_data_version ioc = { .idv_flags = flags };
2372 rc = ll_ioc_data_version(inode, &ioc);
2374 *data_version = ioc.idv_version;
2380 * Trigger a HSM release request for the provided inode.
/* Takes a write lease with MDS_OPEN_RELEASE, flushes and samples the data
 * version, merges attributes so size/times are correct on the MDT, then
 * closes the handle with MDS_HSM_RELEASE packed in; the lease is closed on
 * the error path.  NOTE(review): truncated — rc/refcheck declarations, the
 * IS_ERR(och) check and intermediate GOTOs are not visible here. */
2382 int ll_hsm_release(struct inode *inode)
2385 struct obd_client_handle *och = NULL;
2386 __u64 data_version = 0;
2391 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2392 ll_get_fsname(inode->i_sb, NULL, 0),
2393 PFID(&ll_i2info(inode)->lli_fid));
2395 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2397 GOTO(out, rc = PTR_ERR(och));
2399 /* Grab latest data_version and [am]time values */
2400 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2404 env = cl_env_get(&refcheck);
2406 GOTO(out, rc = PTR_ERR(env));
2408 rc = ll_merge_attr(env, inode);
2409 cl_env_put(env, &refcheck);
2411 /* If error happen, we have the wrong size for a file.
2417 /* Release the file.
2418 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2419 * we still need it to pack l_remote_handle to MDT. */
2420 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2426 if (och != NULL && !IS_ERR(och)) /* close the file */
2427 ll_lease_close(och, inode, NULL);
/* Working state for ll_swap_layouts(): the two inodes being swapped.
 * NOTE(review): truncated — the dv1/dv2/check_dv1/check_dv2 members used
 * by ll_swap_layouts() below are not visible in this chunk. */
2432 struct ll_swap_stack {
2435 struct inode *inode1;
2436 struct inode *inode2;
/*
 * ll_swap_layouts() - LL_IOC_LOV_SWAP_LAYOUTS: atomically exchange the
 * layouts of two files on the MDT.  Orders the pair by FID to avoid lock
 * inversion, optionally takes group locks (gid != 0) to flush dirty cache,
 * optionally verifies the user-supplied data versions did not change, and
 * sends the swap through mdc via obd_iocontrol.
 * NOTE(review): truncated chunk — rc/gid/dv declarations, msl.msl_flags
 * setup, GOTO targets (free/putgl) and RETURN are not visible here.
 */
2441 static int ll_swap_layouts(struct file *file1, struct file *file2,
2442 struct lustre_swap_layouts *lsl)
2444 struct mdc_swap_layouts msl;
2445 struct md_op_data *op_data;
2448 struct ll_swap_stack *llss = NULL;
2451 OBD_ALLOC_PTR(llss);
2455 llss->inode1 = file_inode(file1);
2456 llss->inode2 = file_inode(file2);
2458 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2462 /* we use 2 bool because it is easier to swap than 2 bits */
2463 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2464 llss->check_dv1 = true;
2466 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2467 llss->check_dv2 = true;
2469 /* we cannot use lsl->sl_dvX directly because we may swap them */
2470 llss->dv1 = lsl->sl_dv1;
2471 llss->dv2 = lsl->sl_dv2;
2473 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2474 if (rc == 0) /* same file, done! */
/* always lock/operate in FID order to avoid deadlock between clients */
2477 if (rc < 0) { /* sequentialize it */
2478 swap(llss->inode1, llss->inode2);
2480 swap(llss->dv1, llss->dv2);
2481 swap(llss->check_dv1, llss->check_dv2);
2485 if (gid != 0) { /* application asks to flush dirty cache */
2486 rc = ll_get_grouplock(llss->inode1, file1, gid);
2490 rc = ll_get_grouplock(llss->inode2, file2, gid);
2492 ll_put_grouplock(llss->inode1, file1, gid);
2497 /* ultimate check, before swaping the layouts we check if
2498 * dataversion has changed (if requested) */
2499 if (llss->check_dv1) {
2500 rc = ll_data_version(llss->inode1, &dv, 0);
2503 if (dv != llss->dv1)
2504 GOTO(putgl, rc = -EAGAIN);
2507 if (llss->check_dv2) {
2508 rc = ll_data_version(llss->inode2, &dv, 0);
2511 if (dv != llss->dv2)
2512 GOTO(putgl, rc = -EAGAIN);
2515 /* struct md_op_data is used to send the swap args to the mdt
2516 * only flags is missing, so we use struct mdc_swap_layouts
2517 * through the md_op_data->op_data */
2518 /* flags from user space have to be converted before they are send to
2519 * server, no flag is sent today, they are only used on the client */
2522 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2523 0, LUSTRE_OPC_ANY, &msl);
2524 if (IS_ERR(op_data))
2525 GOTO(free, rc = PTR_ERR(op_data));
2527 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2528 sizeof(*op_data), op_data, NULL);
2529 ll_finish_md_op_data(op_data);
/* release group locks in reverse acquisition order */
2536 ll_put_grouplock(llss->inode2, file2, gid);
2537 ll_put_grouplock(llss->inode1, file1, gid);
/* Set/clear HSM state flags on @inode.  Rejects flags outside
 * HSM_FLAGS_MASK, restricts non-HSM_USER_MASK flags to CAP_SYS_ADMIN, and
 * (for servers without the archive-id-array feature) bounds the archive
 * id; the request is forwarded via obd_iocontrol.  NOTE(review):
 * truncated — rc declaration, RETURN(-EINVAL/-EPERM) lines and the final
 * RETURN are not visible here. */
2547 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2549 struct obd_export *exp = ll_i2mdexp(inode);
2550 struct md_op_data *op_data;
2554 /* Detect out-of range masks */
2555 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2558 /* Non-root users are forbidden to set or clear flags which are
2559 * NOT defined in HSM_USER_MASK. */
2560 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2561 !cfs_capable(CFS_CAP_SYS_ADMIN))
2564 if (!exp_connect_archive_id_array(exp)) {
2565 /* Detect out-of range archive id */
2566 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2567 (hss->hss_archive_id > LL_HSM_ORIGIN_MAX_ARCHIVE))
2571 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2572 LUSTRE_OPC_ANY, hss);
2573 if (IS_ERR(op_data))
2574 RETURN(PTR_ERR(op_data));
2576 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, exp, sizeof(*op_data),
2579 ll_finish_md_op_data(op_data);
/*
 * ll_hsm_import() - import an HSM-archived file: mark it
 * ARCHIVED|EXISTS|RELEASED via ll_hsm_state_set(), then force the
 * size/mode/ownership/timestamps supplied in @hui onto the inode through
 * ll_setattr_raw() under the inode lock.  Regular files only.
 * NOTE(review): truncated — hss allocation, rc declaration, inode_lock()
 * before the setattr, and the out label/frees are not visible here.
 */
2584 static int ll_hsm_import(struct inode *inode, struct file *file,
2585 struct hsm_user_import *hui)
2587 struct hsm_state_set *hss = NULL;
2588 struct iattr *attr = NULL;
2592 if (!S_ISREG(inode->i_mode))
2598 GOTO(out, rc = -ENOMEM);
2600 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2601 hss->hss_archive_id = hui->hui_archive_id;
2602 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2603 rc = ll_hsm_state_set(inode, hss);
2607 OBD_ALLOC_PTR(attr);
2609 GOTO(out, rc = -ENOMEM);
/* only permission bits from userspace; force S_IFREG for the type */
2611 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2612 attr->ia_mode |= S_IFREG;
2613 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2614 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2615 attr->ia_size = hui->hui_size;
2616 attr->ia_mtime.tv_sec = hui->hui_mtime;
2617 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2618 attr->ia_atime.tv_sec = hui->hui_atime;
2619 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2621 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2622 ATTR_UID | ATTR_GID |
2623 ATTR_MTIME | ATTR_MTIME_SET |
2624 ATTR_ATIME | ATTR_ATIME_SET;
2628 rc = ll_setattr_raw(file_dentry(file), attr, 0, true);
2632 inode_unlock(inode);
2644 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2646 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2647 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * Set atime/mtime/ctime of a regular file from a ll_futimes_3 request
 * (the LL_IOC_FUTIMES_3 ioctl).  Unlike utimes(2) this can also set
 * ctime, hence the CAP_SYS_ADMIN requirement and OP_XVALID_CTIME_SET.
 *
 * \param file  open file whose timestamps are changed
 * \param lfu   second/nanosecond values for all three timestamps
 *
 * \retval 0 on success, negative errno on failure
 */
2650 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2652 struct inode *inode = file_inode(file);
/* iattr initializer: all three timestamps are set explicitly */
2654 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2655 ATTR_MTIME | ATTR_MTIME_SET |
2658 .tv_sec = lfu->lfu_atime_sec,
2659 .tv_nsec = lfu->lfu_atime_nsec,
2662 .tv_sec = lfu->lfu_mtime_sec,
2663 .tv_nsec = lfu->lfu_mtime_nsec,
2666 .tv_sec = lfu->lfu_ctime_sec,
2667 .tv_nsec = lfu->lfu_ctime_nsec,
/* setting ctime directly is root-only */
2673 if (!capable(CAP_SYS_ADMIN))
2676 if (!S_ISREG(inode->i_mode))
2680 rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET,
2682 inode_unlock(inode);
/*
 * Map a userspace lockahead mode (MODE_READ_USER/MODE_WRITE_USER) to the
 * corresponding cl_lock_mode.  NOTE(review): the return statements for
 * each case (and the default/error path) are elided in this excerpt.
 */
2687 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2690 case MODE_READ_USER:
2692 case MODE_WRITE_USER:
/* printable names for userspace lock modes, used in debug messages */
2699 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2701 /* Used to allow the upper layers of the client to request an LDLM lock
2702 * without doing an actual read or write.
2704 * Used for ladvise lockahead to manually request specific locks.
2706 * \param[in] file file this ladvise lock request is on
2707 * \param[in] ladvise ladvise struct describing this lock request
2709 * \retval 0 success, no detailed result available (sync requests
2710 * and requests sent to the server [not handled locally]
2711 * cannot return detailed results)
2712 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2713 * see definitions for details.
2714 * \retval negative negative errno on error
/*
 * Request an LDLM extent lock ahead of I/O (ladvise lockahead).
 *
 * Builds a CIT_MISC io context, fills a cl_lock_descr covering the byte
 * range [lla_start, lla_end] converted to page indices, and enqueues it
 * with CEF_MUST so the request is never downgraded to a lockless lock.
 *
 * \retval 0                          success (sync request)
 * \retval LLA_RESULT_SAME/DIFFERENT  detailed result for async requests
 * \retval negative errno             on error
 */
2716 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2718 struct lu_env *env = NULL;
2719 struct cl_io *io = NULL;
2720 struct cl_lock *lock = NULL;
2721 struct cl_lock_descr *descr = NULL;
2722 struct dentry *dentry = file->f_path.dentry;
2723 struct inode *inode = dentry->d_inode;
2724 enum cl_lock_mode cl_mode;
2725 off_t start = ladvise->lla_start;
2726 off_t end = ladvise->lla_end;
2732 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2733 "start=%llu, end=%llu\n", dentry->d_name.len,
2734 dentry->d_name.name, dentry->d_inode,
2735 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
/* a negative cl_mode here is an errno from the mode translation */
2738 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2740 GOTO(out, result = cl_mode);
2742 /* Get IO environment */
2743 result = cl_io_get(inode, &env, &io, &refcheck);
2747 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2750 * nothing to do for this io. This currently happens when
2751 * stripe sub-object's are not yet created.
2753 result = io->ci_result;
2754 } else if (result == 0) {
2755 lock = vvp_env_lock(env);
2756 descr = &lock->cll_descr;
2758 descr->cld_obj = io->ci_obj;
2759 /* Convert byte offsets to pages */
2760 descr->cld_start = cl_index(io->ci_obj, start);
2761 descr->cld_end = cl_index(io->ci_obj, end);
2762 descr->cld_mode = cl_mode;
2763 /* CEF_MUST is used because we do not want to convert a
2764 * lockahead request to a lockless lock */
2765 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
/* LF_ASYNC requests are speculative: enqueue without waiting */
2768 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2769 descr->cld_enq_flags |= CEF_SPECULATIVE;
2771 result = cl_lock_request(env, io, lock);
2773 /* On success, we need to release the lock */
2775 cl_lock_release(env, lock);
/* always tear down the io context and env reference */
2777 cl_io_fini(env, io);
2778 cl_env_put(env, &refcheck);
2780 /* -ECANCELED indicates a matching lock with a different extent
2781 * was already present, and -EEXIST indicates a matching lock
2782 * on exactly the same extent was already present.
2783 * We convert them to positive values for userspace to make
2784 * recognizing true errors easier.
2785 * Note we can only return these detailed results on async requests,
2786 * as sync requests look the same as i/o requests for locking. */
2787 if (result == -ECANCELED)
2788 result = LLA_RESULT_DIFFERENT;
2789 else if (result == -EEXIST)
2790 result = LLA_RESULT_SAME;
/* printable names for ladvise advice types, used in sanity-check logs */
2795 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
/*
 * Validate one ladvise entry before it is acted on: the advice value,
 * its per-advice flags, and (for range-based advices) the byte range.
 * Emits a D_VFSTRACE debug message for each rejection.
 *
 * \retval 0 if the advice is acceptable, negative errno otherwise
 * (NOTE(review): the rc assignments and returns are elided in this
 * excerpt; each CDEBUG below logs the rc chosen just before it.)
 */
2797 static int ll_ladvise_sanity(struct inode *inode,
2798 struct llapi_lu_ladvise *ladvise)
2800 enum lu_ladvise_type advice = ladvise->lla_advice;
2801 /* Note the peradvice flags is a 32 bit field, so per advice flags must
2802 * be in the first 32 bits of enum ladvise_flags */
2803 __u32 flags = ladvise->lla_peradvice_flags;
2804 /* 3 lines at 80 characters per line, should be plenty */
2807 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2809 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
2810 "last supported advice is %s (value '%d'): rc = %d\n",
2811 ll_get_fsname(inode->i_sb, NULL, 0), advice,
2812 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2816 /* Per-advice checks */
2818 case LU_LADVISE_LOCKNOEXPAND:
2819 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2821 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2823 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2824 ladvise_names[advice], rc);
2828 case LU_LADVISE_LOCKAHEAD:
2829 /* Currently only READ and WRITE modes can be requested */
2830 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2831 ladvise->lla_lockahead_mode == 0) {
2833 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2835 ll_get_fsname(inode->i_sb, NULL, 0),
2836 ladvise->lla_lockahead_mode,
2837 ladvise_names[advice], rc);
2840 case LU_LADVISE_WILLREAD:
2841 case LU_LADVISE_DONTNEED:
2843 /* Note fall through above - These checks apply to all advices
2844 * except LOCKNOEXPAND */
2845 if (flags & ~LF_DEFAULT_MASK) {
2847 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2849 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2850 ladvise_names[advice], rc);
/* the range must be non-empty: start strictly below end */
2853 if (ladvise->lla_start >= ladvise->lla_end) {
2855 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2856 "for %s: rc = %d\n",
2857 ll_get_fsname(inode->i_sb, NULL, 0),
2858 ladvise->lla_start, ladvise->lla_end,
2859 ladvise_names[advice], rc);
2871 * Give file access advices
2873 * The ladvise interface is similar to Linux fadvise() system call, except it
2874 * forwards the advices directly from Lustre client to server. The server side
2875 * codes will apply appropriate read-ahead and caching techniques for the
2876 * corresponding files.
2878 * A typical workload for ladvise is e.g. a bunch of different clients are
2879 * doing small random reads of a file, so prefetching pages into OSS cache
2880 * with big linear reads before the random IO is a net benefit. Fetching
2881 * all that data into each client cache with fadvise() may not be, due to
2882 * much more data being sent to the client.
/*
 * Execute one ladvise advice by running a CIT_LADVISE cl_io loop, which
 * forwards the advice (range, type, flags) to the servers.
 *
 * \param flags    header-level ladvise flags
 * \param ladvise  the single advice entry to apply
 *
 * \retval 0 on success, negative errno on failure
 */
2884 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2885 struct llapi_lu_ladvise *ladvise)
2889 struct cl_ladvise_io *lio;
2894 env = cl_env_get(&refcheck);
2896 RETURN(PTR_ERR(env));
2898 io = vvp_env_thread_io(env);
2899 io->ci_obj = ll_i2info(inode)->lli_clob;
2901 /* initialize parameters for ladvise */
2902 lio = &io->u.ci_ladvise;
2903 lio->li_start = ladvise->lla_start;
2904 lio->li_end = ladvise->lla_end;
2905 lio->li_fid = ll_inode2fid(inode);
2906 lio->li_advice = ladvise->lla_advice;
2907 lio->li_flags = flags;
/* run the io loop only if init succeeded; otherwise the error path
 * (elided here) presumably picks up io->ci_result */
2909 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2910 rc = cl_io_loop(env, io);
2914 cl_io_fini(env, io);
2915 cl_env_put(env, &refcheck);
/*
 * Enable or disable lock expansion on this open file: LF_UNSET in
 * @flags clears the no-expand setting, otherwise it is turned on.
 * The flag is per-file-descriptor state (ll_file_data).
 */
2919 static int ll_lock_noexpand(struct file *file, int flags)
2921 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2923 fd->ll_lock_no_expand = !(flags & LF_UNSET);
/*
 * FS_IOC_FSGETXATTR handler: report the inode's xflags (derived from
 * i_flags, plus PROJINHERIT from the llite inode state) and project id
 * back to userspace in a struct fsxattr.
 *
 * \retval 0 on success, -EFAULT (elided) on copy failures
 */
2928 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
2931 struct fsxattr fsxattr;
/* read the user struct first so unknown fields are preserved */
2933 if (copy_from_user(&fsxattr,
2934 (const struct fsxattr __user *)arg,
2938 fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
2939 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
2940 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
2941 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
2942 if (copy_to_user((struct fsxattr __user *)arg,
2943 &fsxattr, sizeof(fsxattr)))
/*
 * Permission check for FS_IOC_FSSETXATTR: outside the init user
 * namespace, refuse any request that would change the project quota id
 * or toggle the PROJINHERIT flag; everything else is allowed.
 *
 * \retval 0 if the change is permitted, negative errno (elided)
 *           otherwise
 */
2949 int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
2952 * Project Quota ID state is only allowed to change from within the init
2953 * namespace. Enforce that restriction only if we are trying to change
2954 * the quota ID state. Everything else is allowed in user namespaces.
2956 if (current_user_ns() == &init_user_ns)
/* project id must be unchanged ... */
2959 if (ll_i2info(inode)->lli_projid != fa->fsx_projid)
/* ... and PROJINHERIT must keep its current value either way */
2962 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) {
2963 if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
2966 if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
/*
 * FS_IOC_FSSETXATTR handler: apply new xflags and project id.
 * The change is first sent to the MDT via md_setattr(), then mirrored
 * into the local inode flags and, when the file has a data object,
 * pushed to the OSTs with cl_setattr_ost().
 *
 * \retval 0 on success, negative errno on failure
 */
2973 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
2977 struct md_op_data *op_data;
2978 struct ptlrpc_request *req = NULL;
2980 struct fsxattr fsxattr;
2981 struct cl_object *obj;
2985 if (copy_from_user(&fsxattr,
2986 (const struct fsxattr __user *)arg,
/* namespace/project-id permission check before doing anything */
2990 rc = ll_ioctl_check_project(inode, &fsxattr);
2994 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2995 LUSTRE_OPC_ANY, NULL);
2996 if (IS_ERR(op_data))
2997 RETURN(PTR_ERR(op_data));
/* translate xflags to on-disk inode flags for the MDT */
2999 flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3000 op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3001 if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3002 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3003 op_data->op_projid = fsxattr.fsx_projid;
3004 op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS;
3005 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3007 ptlrpc_req_finished(req);
3009 GOTO(out_fsxattr, rc);
3010 ll_update_inode_flags(inode, op_data->op_attr_flags);
/* no data object (e.g. no layout yet): nothing to update on OSTs */
3011 obj = ll_i2info(inode)->lli_clob;
3013 GOTO(out_fsxattr, rc);
3015 OBD_ALLOC_PTR(attr);
3017 GOTO(out_fsxattr, rc = -ENOMEM);
3019 rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS,
3020 fsxattr.fsx_xflags);
3023 ll_finish_md_op_data(op_data);
/*
 * Release a file lease (LL_LEASE_UNLCK), optionally carrying a close
 * intent: RESYNC_DONE (with a list of mirror ids), LAYOUT_MERGE (with a
 * victim fd) or LAYOUT_SPLIT (with a victim fd and mirror id).  The
 * lease handle is detached from the fd under lli_och_mutex, then closed
 * with ll_lease_close_intent().
 *
 * \retval lease type (from fmode) on clean release, negative errno on
 *         failure
 */
3027 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3030 struct inode *inode = file_inode(file);
3031 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3032 struct ll_inode_info *lli = ll_i2info(inode);
3033 struct obd_client_handle *och = NULL;
3034 struct split_param sp;
3037 enum mds_op_bias bias = 0;
3038 struct file *layout_file = NULL;
3040 size_t data_size = 0;
/* atomically take ownership of the lease handle, if any */
3044 mutex_lock(&lli->lli_och_mutex);
3045 if (fd->fd_lease_och != NULL) {
3046 och = fd->fd_lease_och;
3047 fd->fd_lease_och = NULL;
3049 mutex_unlock(&lli->lli_och_mutex);
3052 GOTO(out, rc = -ENOLCK);
3054 fmode = och->och_flags;
3056 switch (ioc->lil_flags) {
3057 case LL_LEASE_RESYNC_DONE:
3058 if (ioc->lil_count > IOC_IDS_MAX)
3059 GOTO(out, rc = -EINVAL);
/* copy the trailing lil_ids[] array along with the header */
3061 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3062 OBD_ALLOC(data, data_size);
3064 GOTO(out, rc = -ENOMEM);
3066 if (copy_from_user(data, (void __user *)arg, data_size))
3067 GOTO(out, rc = -EFAULT);
3069 bias = MDS_CLOSE_RESYNC_DONE;
3071 case LL_LEASE_LAYOUT_MERGE: {
3074 if (ioc->lil_count != 1)
3075 GOTO(out, rc = -EINVAL);
/* the victim fd follows the ioc header in userspace memory;
 * NOTE(review): &fd here refers to a local __u32 declared in an
 * elided line of this case block, not the ll_file_data above */
3077 arg += sizeof(*ioc);
3078 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3079 GOTO(out, rc = -EFAULT);
3081 layout_file = fget(fd);
3083 GOTO(out, rc = -EBADF);
/* both files must be open for write to merge layouts */
3085 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3086 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3087 GOTO(out, rc = -EPERM);
3089 data = file_inode(layout_file);
3090 bias = MDS_CLOSE_LAYOUT_MERGE;
3093 case LL_LEASE_LAYOUT_SPLIT: {
3097 if (ioc->lil_count != 2)
3098 GOTO(out, rc = -EINVAL);
/* arguments after the header: victim fd, then mirror id */
3100 arg += sizeof(*ioc);
3101 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3102 GOTO(out, rc = -EFAULT);
3104 arg += sizeof(__u32);
3105 if (copy_from_user(&mirror_id, (void __user *)arg,
3107 GOTO(out, rc = -EFAULT);
3109 layout_file = fget(fdv);
3111 GOTO(out, rc = -EBADF);
3113 sp.sp_inode = file_inode(layout_file);
3114 sp.sp_mirror_id = (__u16)mirror_id;
3116 bias = MDS_CLOSE_LAYOUT_SPLIT;
3120 /* without close intent */
3124 rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3128 rc = ll_lease_och_release(inode, file);
/* per-intent cleanup of the resources taken above */
3137 switch (ioc->lil_flags) {
3138 case LL_LEASE_RESYNC_DONE:
3140 OBD_FREE(data, data_size);
3142 case LL_LEASE_LAYOUT_MERGE:
3143 case LL_LEASE_LAYOUT_SPLIT:
3150 rc = ll_lease_type_from_fmode(fmode);
/*
 * LL_IOC_SET_LEASE handler: acquire a read or write lease on the file
 * (mode from ioc->lil_mode), optionally starting a mirror resync.
 * LL_LEASE_UNLCK is delegated to ll_file_unlock_lease().  On success
 * the open handle is stashed in fd->fd_lease_och.
 *
 * \retval lease type on success, negative errno on failure
 */
3154 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3157 struct inode *inode = file_inode(file);
3158 struct ll_inode_info *lli = ll_i2info(inode);
3159 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3160 struct obd_client_handle *och = NULL;
3161 __u64 open_flags = 0;
/* the requested lease mode must be compatible with the open mode */
3167 switch (ioc->lil_mode) {
3168 case LL_LEASE_WRLCK:
3169 if (!(file->f_mode & FMODE_WRITE))
3171 fmode = FMODE_WRITE;
3173 case LL_LEASE_RDLCK:
3174 if (!(file->f_mode & FMODE_READ))
3178 case LL_LEASE_UNLCK:
3179 RETURN(ll_file_unlock_lease(file, ioc, arg));
3184 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3186 /* apply for lease */
3187 if (ioc->lil_flags & LL_LEASE_RESYNC)
3188 open_flags = MDS_OPEN_RESYNC;
3189 och = ll_lease_open(inode, file, fmode, open_flags);
3191 RETURN(PTR_ERR(och));
/* for resync, notify the server and refresh the layout before the
 * lease is handed to userspace; drop the lease on any failure */
3193 if (ioc->lil_flags & LL_LEASE_RESYNC) {
3194 rc = ll_lease_file_resync(och, inode, arg);
3196 ll_lease_close(och, inode, NULL);
3199 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3201 ll_lease_close(och, inode, NULL);
/* publish the lease handle on this fd unless one already exists */
3207 mutex_lock(&lli->lli_och_mutex);
3208 if (fd->fd_lease_och == NULL) {
3209 fd->fd_lease_och = och;
3212 mutex_unlock(&lli->lli_och_mutex);
3214 /* impossible now that only excl is supported for now */
3215 ll_lease_close(och, inode, &lease_broken);
/*
 * Main ioctl dispatcher for regular files: striping/layout commands,
 * HSM state and import, leases, ladvise, flags, statfs, FID helpers,
 * flock/project-xattr passthroughs, and a fallthrough to the OSC/MDC
 * device ioctl for anything unrecognized.
 *
 * Each case either handles the command inline or delegates to the
 * dedicated helper (ll_lov_setstripe, ll_hsm_state_set, ll_ladvise, ...).
 */
3222 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3224 struct inode *inode = file_inode(file);
3225 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3229 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3230 PFID(ll_inode2fid(inode)), inode, cmd);
3231 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3233 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
3234 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3238 case LL_IOC_GETFLAGS:
3239 /* Get the current value of the file flags */
3240 return put_user(fd->fd_flags, (int __user *)arg);
3241 case LL_IOC_SETFLAGS:
3242 case LL_IOC_CLRFLAGS:
3243 /* Set or clear specific file flags */
3244 /* XXX This probably needs checks to ensure the flags are
3245 * not abused, and to handle any flag side effects.
3247 if (get_user(flags, (int __user *) arg))
/* LL_FILE_IGNORE_LOCK only makes sense with O_DIRECT I/O */
3250 if (cmd == LL_IOC_SETFLAGS) {
3251 if ((flags & LL_FILE_IGNORE_LOCK) &&
3252 !(file->f_flags & O_DIRECT)) {
3253 CERROR("%s: unable to disable locking on "
3254 "non-O_DIRECT file\n", current->comm);
3258 fd->fd_flags |= flags;
3260 fd->fd_flags &= ~flags;
3263 case LL_IOC_LOV_SETSTRIPE:
3264 case LL_IOC_LOV_SETSTRIPE_NEW:
3265 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3266 case LL_IOC_LOV_SETEA:
3267 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3268 case LL_IOC_LOV_SWAP_LAYOUTS: {
3270 struct lustre_swap_layouts lsl;
3272 if (copy_from_user(&lsl, (char __user *)arg,
3273 sizeof(struct lustre_swap_layouts)))
/* both files must be writable to swap their layouts */
3276 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3279 file2 = fget(lsl.sl_fd);
3283 /* O_WRONLY or O_RDWR */
3284 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3285 GOTO(out, rc = -EPERM);
/* SWAP_LAYOUTS_CLOSE consumes the lease held on this fd */
3287 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3288 struct inode *inode2;
3289 struct ll_inode_info *lli;
3290 struct obd_client_handle *och = NULL;
3292 lli = ll_i2info(inode);
3293 mutex_lock(&lli->lli_och_mutex);
3294 if (fd->fd_lease_och != NULL) {
3295 och = fd->fd_lease_och;
3296 fd->fd_lease_och = NULL;
3298 mutex_unlock(&lli->lli_och_mutex);
3300 GOTO(out, rc = -ENOLCK);
3301 inode2 = file_inode(file2);
3302 rc = ll_swap_layouts_close(och, inode, inode2);
3304 rc = ll_swap_layouts(file, file2, &lsl);
3310 case LL_IOC_LOV_GETSTRIPE:
3311 case LL_IOC_LOV_GETSTRIPE_NEW:
3312 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3313 case FS_IOC_GETFLAGS:
3314 case FS_IOC_SETFLAGS:
3315 RETURN(ll_iocontrol(inode, file, cmd, arg));
3316 case FSFILT_IOC_GETVERSION:
3317 case FS_IOC_GETVERSION:
3318 RETURN(put_user(inode->i_generation, (int __user *)arg));
3319 /* We need to special case any other ioctls we want to handle,
3320 * to send them to the MDS/OST as appropriate and to properly
3321 * network encode the arg field. */
3322 case FS_IOC_SETVERSION:
3325 case LL_IOC_GROUP_LOCK:
3326 RETURN(ll_get_grouplock(inode, file, arg));
3327 case LL_IOC_GROUP_UNLOCK:
3328 RETURN(ll_put_grouplock(inode, file, arg));
3329 case IOC_OBD_STATFS:
3330 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3332 case LL_IOC_FLUSHCTX:
3333 RETURN(ll_flush_ctx(inode));
3334 case LL_IOC_PATH2FID: {
3335 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3336 sizeof(struct lu_fid)))
3341 case LL_IOC_GETPARENT:
3342 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3344 case OBD_IOC_FID2PATH:
3345 RETURN(ll_fid2path(inode, (void __user *)arg));
3346 case LL_IOC_DATA_VERSION: {
3347 struct ioc_data_version idv;
3350 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
/* mask out anything but the supported flush flags */
3353 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3354 rc = ll_ioc_data_version(inode, &idv);
3357 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3363 case LL_IOC_GET_MDTIDX: {
3366 mdtidx = ll_get_mdt_idx(inode);
3370 if (put_user((int)mdtidx, (int __user *)arg))
3375 case OBD_IOC_GETDTNAME:
3376 case OBD_IOC_GETMDNAME:
3377 RETURN(ll_get_obd_name(inode, cmd, arg));
3378 case LL_IOC_HSM_STATE_GET: {
3379 struct md_op_data *op_data;
3380 struct hsm_user_state *hus;
3387 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3388 LUSTRE_OPC_ANY, hus);
3389 if (IS_ERR(op_data)) {
3391 RETURN(PTR_ERR(op_data));
3394 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3397 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3400 ll_finish_md_op_data(op_data);
3404 case LL_IOC_HSM_STATE_SET: {
3405 struct hsm_state_set *hss;
3412 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3417 rc = ll_hsm_state_set(inode, hss);
3422 case LL_IOC_HSM_ACTION: {
3423 struct md_op_data *op_data;
3424 struct hsm_current_action *hca;
3431 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3432 LUSTRE_OPC_ANY, hca);
3433 if (IS_ERR(op_data)) {
3435 RETURN(PTR_ERR(op_data));
3438 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3441 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3444 ll_finish_md_op_data(op_data);
/* legacy lease ioctl: mode passed directly in arg */
3448 case LL_IOC_SET_LEASE_OLD: {
3449 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3451 RETURN(ll_file_set_lease(file, &ioc, 0));
3453 case LL_IOC_SET_LEASE: {
3454 struct ll_ioc_lease ioc;
3456 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3459 RETURN(ll_file_set_lease(file, &ioc, arg));
3461 case LL_IOC_GET_LEASE: {
3462 struct ll_inode_info *lli = ll_i2info(inode);
3463 struct ldlm_lock *lock = NULL;
/* report the lease mode only if the lease lock isn't cancelled */
3466 mutex_lock(&lli->lli_och_mutex);
3467 if (fd->fd_lease_och != NULL) {
3468 struct obd_client_handle *och = fd->fd_lease_och;
3470 lock = ldlm_handle2lock(&och->och_lease_handle);
3472 lock_res_and_lock(lock);
3473 if (!ldlm_is_cancel(lock))
3474 fmode = och->och_flags;
3476 unlock_res_and_lock(lock);
3477 LDLM_LOCK_PUT(lock);
3480 mutex_unlock(&lli->lli_och_mutex);
3482 RETURN(ll_lease_type_from_fmode(fmode));
3484 case LL_IOC_HSM_IMPORT: {
3485 struct hsm_user_import *hui;
3491 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3496 rc = ll_hsm_import(inode, file, hui);
3501 case LL_IOC_FUTIMES_3: {
3502 struct ll_futimes_3 lfu;
3504 if (copy_from_user(&lfu,
3505 (const struct ll_futimes_3 __user *)arg,
3509 RETURN(ll_file_futimes_3(file, &lfu));
3511 case LL_IOC_LADVISE: {
3512 struct llapi_ladvise_hdr *k_ladvise_hdr;
3513 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3516 int alloc_size = sizeof(*k_ladvise_hdr);
/* first pass: copy only the header to learn lah_count */
3519 u_ladvise_hdr = (void __user *)arg;
3520 OBD_ALLOC_PTR(k_ladvise_hdr);
3521 if (k_ladvise_hdr == NULL)
3524 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3525 GOTO(out_ladvise, rc = -EFAULT);
3527 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3528 k_ladvise_hdr->lah_count < 1)
3529 GOTO(out_ladvise, rc = -EINVAL);
3531 num_advise = k_ladvise_hdr->lah_count;
3532 if (num_advise >= LAH_COUNT_MAX)
3533 GOTO(out_ladvise, rc = -EFBIG);
/* second pass: reallocate for the full advice array and recopy */
3535 OBD_FREE_PTR(k_ladvise_hdr);
3536 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3537 lah_advise[num_advise]);
3538 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3539 if (k_ladvise_hdr == NULL)
3543 * TODO: submit multiple advices to one server in a single RPC
3545 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3546 GOTO(out_ladvise, rc = -EFAULT);
3548 for (i = 0; i < num_advise; i++) {
3549 struct llapi_lu_ladvise *k_ladvise =
3550 &k_ladvise_hdr->lah_advise[i];
3551 struct llapi_lu_ladvise __user *u_ladvise =
3552 &u_ladvise_hdr->lah_advise[i];
3554 rc = ll_ladvise_sanity(inode, k_ladvise);
3556 GOTO(out_ladvise, rc);
3558 switch (k_ladvise->lla_advice) {
3559 case LU_LADVISE_LOCKNOEXPAND:
3560 rc = ll_lock_noexpand(file,
3561 k_ladvise->lla_peradvice_flags);
3562 GOTO(out_ladvise, rc);
3563 case LU_LADVISE_LOCKAHEAD:
3565 rc = ll_file_lock_ahead(file, k_ladvise);
3568 GOTO(out_ladvise, rc);
/* lockahead reports its detailed result back per-advice */
3571 &u_ladvise->lla_lockahead_result))
3572 GOTO(out_ladvise, rc = -EFAULT);
3575 rc = ll_ladvise(inode, file,
3576 k_ladvise_hdr->lah_flags,
3579 GOTO(out_ladvise, rc);
3586 OBD_FREE(k_ladvise_hdr, alloc_size);
3589 case LL_IOC_FLR_SET_MIRROR: {
3590 /* mirror I/O must be direct to avoid polluting page cache
3592 if (!(file->f_flags & O_DIRECT))
3595 fd->fd_designated_mirror = (__u32)arg;
3598 case LL_IOC_FSGETXATTR:
3599 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3600 case LL_IOC_FSSETXATTR:
3601 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3603 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
/* default: forward unknown commands to the data device */
3605 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3606 (void __user *)arg));
#ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Compat helper (older kernels without generic_file_llseek_size):
 * validate @offset against sign/size limits and commit it to f_pos,
 * resetting f_version so readdir-style users notice the seek.
 * Error-return lines (-EINVAL for the range checks) are elided here.
 */
3611 static inline loff_t
3612 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3614 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3616 if (offset > maxsize)
3619 if (offset != file->f_pos) {
3620 file->f_pos = offset;
3621 file->f_version = 0;
/*
 * Compat copy of the kernel's generic_file_llseek_size(): handle
 * SEEK_SET/CUR/END plus SEEK_DATA/SEEK_HOLE against the caller-supplied
 * @maxsize and @eof.  Only the SEEK_CUR fast path and the DATA/HOLE
 * comments survive in this excerpt; the switch framing is elided.
 */
3627 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3628 loff_t maxsize, loff_t eof)
3630 struct inode *inode = file_inode(file);
3638 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3639 * position-querying operation. Avoid rewriting the "same"
3640 * f_pos value back to the file because a concurrent read(),
3641 * write() or lseek() might have altered it
3646 * f_lock protects against read/modify/write race with other
3647 * SEEK_CURs. Note that parallel writes and reads behave
3651 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3652 inode_unlock(inode);
3656 * In the generic case the entire file is data, so as long as
3657 * offset isn't at the end of the file then the offset is data.
3664 * There is a virtual hole at the end of the file, so as long as
3665 * offset isn't i_size or larger, return i_size.
3673 return llseek_execute(file, offset, maxsize);
/*
 * llseek entry point: for SEEK_END/SEEK_HOLE/SEEK_DATA the file size
 * must be current, so glimpse it from the OSTs first, then delegate the
 * actual positioning to ll_generic_file_llseek_size() bounded by the
 * filesystem's max byte offset.
 */
3677 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3679 struct inode *inode = file_inode(file);
3680 loff_t retval, eof = 0;
/* compute the absolute target only for the trace message below */
3683 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3684 (origin == SEEK_CUR) ? file->f_pos : 0);
3685 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3686 PFID(ll_inode2fid(inode)), inode, retval, retval,
3688 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3690 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3691 retval = ll_glimpse_size(inode);
3694 eof = i_size_read(inode);
3697 retval = ll_generic_file_llseek_size(file, offset, origin,
3698 ll_file_maxbytes(inode), eof);
/*
 * flush(2)/close-time hook: surface any asynchronous writeback errors
 * recorded against this inode.  The error is reported at most once per
 * fd (fd_write_failed gates repeat reporting); reading the async rc
 * also clears it.
 *
 * \retval 0 if no unreported async error, -EIO otherwise
 */
3702 static int ll_flush(struct file *file, fl_owner_t id)
3704 struct inode *inode = file_inode(file);
3705 struct ll_inode_info *lli = ll_i2info(inode);
3706 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3709 LASSERT(!S_ISDIR(inode->i_mode));
3711 /* catch async errors that were recorded back when async writeback
3712 * failed for pages in this mapping. */
3713 rc = lli->lli_async_rc;
3714 lli->lli_async_rc = 0;
3715 if (lli->lli_clob != NULL) {
3716 err = lov_read_and_clear_async_rc(lli->lli_clob);
3721 /* The application has been told write failure already.
3722 * Do not report failure again. */
3723 if (fd->fd_write_failed)
3725 return rc ? -EIO : 0;
 * Called to make sure a portion of file has been written out.
 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
 *
 * Return how many pages have been written.
3734 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3735 enum cl_fsync_mode mode, int ignore_layout)
3739 struct cl_fsync_io *fio;
/* only the four defined fsync modes are accepted */
3744 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3745 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3748 env = cl_env_get(&refcheck);
3750 RETURN(PTR_ERR(env));
3752 io = vvp_env_thread_io(env);
3753 io->ci_obj = ll_i2info(inode)->lli_clob;
3754 io->ci_ignore_layout = ignore_layout;
3756 /* initialize parameters for sync */
3757 fio = &io->u.ci_fsync;
3758 fio->fi_start = start;
3760 fio->fi_fid = ll_inode2fid(inode);
3761 fio->fi_mode = mode;
3762 fio->fi_nr_written = 0;
3764 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3765 result = cl_io_loop(env, io);
3767 result = io->ci_result;
/* on success report the page count accumulated by the io */
3769 result = fio->fi_nr_written;
3770 cl_io_fini(env, io);
3771 cl_env_put(env, &refcheck);
 * When dentry is provided (the 'else' case), file_dentry() may be
 * null and dentry must be used directly rather than pulled from
 * file_dentry() as is done otherwise.
/*
 * fsync(2) entry point.  Three prototypes are supported depending on
 * the kernel: 4-arg (start/end range), 2-arg, and the old dentry-based
 * form; the last two sync the whole file (0..LLONG_MAX).
 * Sequence: flush dirty pages, collect recorded async errors, fsync
 * metadata on the MDT, then OST_SYNC the data range for regular files,
 * updating fd_write_failed accordingly.
 */
3782 #ifdef HAVE_FILE_FSYNC_4ARGS
3783 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3785 struct dentry *dentry = file_dentry(file);
3786 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3787 int ll_fsync(struct file *file, int datasync)
3789 struct dentry *dentry = file_dentry(file);
3791 loff_t end = LLONG_MAX;
3793 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3796 loff_t end = LLONG_MAX;
3798 struct inode *inode = dentry->d_inode;
3799 struct ll_inode_info *lli = ll_i2info(inode);
3800 struct ptlrpc_request *req;
3804 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3805 PFID(ll_inode2fid(inode)), inode);
3806 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3808 #ifdef HAVE_FILE_FSYNC_4ARGS
3809 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3812 /* fsync's caller has already called _fdata{sync,write}, we want
3813 * that IO to finish before calling the osc and mdc sync methods */
3814 rc = filemap_fdatawait(inode->i_mapping);
3817 /* catch async errors that were recorded back when async writeback
3818 * failed for pages in this mapping. */
3819 if (!S_ISDIR(inode->i_mode)) {
3820 err = lli->lli_async_rc;
3821 lli->lli_async_rc = 0;
3824 if (lli->lli_clob != NULL) {
3825 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* sync metadata on the MDT */
3831 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3835 ptlrpc_req_finished(req);
/* sync file data on the OSTs; remember/clear the per-fd failure flag */
3837 if (S_ISREG(inode->i_mode)) {
3838 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3840 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3841 if (rc == 0 && err < 0)
3844 fd->fd_write_failed = true;
3846 fd->fd_write_failed = false;
3849 #ifdef HAVE_FILE_FSYNC_4ARGS
3850 inode_unlock(inode);
/*
 * flock(2)/fcntl(2) byte-range and whole-file lock handler.  Translates
 * the VFS file_lock into an LDLM_FLOCK enqueue on the MDT, then mirrors
 * the result into the local VFS lock bookkeeping, undoing the remote
 * lock if the local step fails.
 */
3856 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3858 struct inode *inode = file_inode(file);
3859 struct ll_sb_info *sbi = ll_i2sbi(inode);
3860 struct ldlm_enqueue_info einfo = {
3861 .ei_type = LDLM_FLOCK,
3862 .ei_cb_cp = ldlm_flock_completion_ast,
3863 .ei_cbdata = file_lock,
3865 struct md_op_data *op_data;
3866 struct lustre_handle lockh = { 0 };
3867 union ldlm_policy_data flock = { { 0 } };
3868 int fl_type = file_lock->fl_type;
3874 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3875 PFID(ll_inode2fid(inode)), file_lock);
3877 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3879 if (file_lock->fl_flags & FL_FLOCK) {
3880 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3881 /* flocks are whole-file locks */
3882 flock.l_flock.end = OFFSET_MAX;
3883 /* For flocks owner is determined by the local file desctiptor*/
3884 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3885 } else if (file_lock->fl_flags & FL_POSIX) {
3886 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3887 flock.l_flock.start = file_lock->fl_start;
3888 flock.l_flock.end = file_lock->fl_end;
3892 flock.l_flock.pid = file_lock->fl_pid;
3894 /* Somewhat ugly workaround for svc lockd.
3895 * lockd installs custom fl_lmops->lm_compare_owner that checks
3896 * for the fl_owner to be the same (which it always is on local node
3897 * I guess between lockd processes) and then compares pid.
3898 * As such we assign pid to the owner field to make it all work,
3899 * conflict with normal locks is unlikely since pid space and
3900 * pointer space for current->files are not intersecting */
3901 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3902 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* map fcntl lock type to an LDLM mode (switch framing elided) */
3906 einfo.ei_mode = LCK_PR;
3909 /* An unlock request may or may not have any relation to
3910 * existing locks so we may not be able to pass a lock handle
3911 * via a normal ldlm_lock_cancel() request. The request may even
3912 * unlock a byte range in the middle of an existing lock. In
3913 * order to process an unlock request we need all of the same
3914 * information that is given with a normal read or write record
3915 * lock request. To avoid creating another ldlm unlock (cancel)
3916 * message we'll treat a LCK_NL flock request as an unlock. */
3917 einfo.ei_mode = LCK_NL;
3920 einfo.ei_mode = LCK_PW;
3923 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* map fcntl command to enqueue flags (blocking vs non-blocking) */
3938 flags = LDLM_FL_BLOCK_NOWAIT;
3944 flags = LDLM_FL_TEST_LOCK;
3947 CERROR("unknown fcntl lock command: %d\n", cmd);
3951 /* Save the old mode so that if the mode in the lock changes we
3952 * can decrement the appropriate reader or writer refcount. */
3953 file_lock->fl_type = einfo.ei_mode;
3955 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3956 LUSTRE_OPC_ANY, NULL);
3957 if (IS_ERR(op_data))
3958 RETURN(PTR_ERR(op_data));
3960 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
3961 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
3962 flock.l_flock.pid, flags, einfo.ei_mode,
3963 flock.l_flock.start, flock.l_flock.end);
3965 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
3968 /* Restore the file lock type if not TEST lock. */
3969 if (!(flags & LDLM_FL_TEST_LOCK))
3970 file_lock->fl_type = fl_type;
/* record the lock locally so the VFS sees it too */
3972 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
3973 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
3974 !(flags & LDLM_FL_TEST_LOCK))
3975 rc2 = locks_lock_file_wait(file, file_lock);
3977 if ((file_lock->fl_flags & FL_FLOCK) &&
3978 (rc == 0 || file_lock->fl_type == F_UNLCK))
3979 rc2 = flock_lock_file_wait(file, file_lock);
3980 if ((file_lock->fl_flags & FL_POSIX) &&
3981 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3982 !(flags & LDLM_FL_TEST_LOCK))
3983 rc2 = posix_lock_file_wait(file, file_lock);
3984 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
/* local bookkeeping failed: release the server-side lock again */
3986 if (rc2 && file_lock->fl_type != F_UNLCK) {
3987 einfo.ei_mode = LCK_NL;
3988 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
3993 ll_finish_md_op_data(op_data);
/*
 * Resolve @name under directory @parent to its FID via an MDT
 * getattr-by-name RPC; when @inode is non-NULL the inode is also
 * instantiated from the reply.
 * NOTE(review): interior lines are elided in this listing, so some
 * declarations and error checks are not visible here.
 * Returns 0 on success or a negative errno.
 */
3998 int ll_get_fid_by_name(struct inode *parent, const char *name,
3999 int namelen, struct lu_fid *fid,
4000 struct inode **inode)
4002 struct md_op_data *op_data = NULL;
4003 struct mdt_body *body;
4004 struct ptlrpc_request *req;
/* Build op_data describing the parent/name pair for the RPC. */
4008 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4009 LUSTRE_OPC_ANY, NULL);
4010 if (IS_ERR(op_data))
4011 RETURN(PTR_ERR(op_data));
/* Only the FID and file type are requested from the MDT. */
4013 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4014 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4015 ll_finish_md_op_data(op_data);
4019 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
/* Malformed reply without an MDT body. */
4021 GOTO(out_req, rc = -EFAULT);
4023 *fid = body->mbo_fid1;
/* Caller asked for the inode too — build it from the reply. */
4026 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4028 ptlrpc_req_finished(req);
/*
 * Migrate the directory entry @name under @parent to another MDT as
 * described by @lum (target MDT index / striping).  Implemented as a
 * rename of the entry onto itself with CLI_MIGRATE set; for regular
 * files a write lease is taken and the data version recorded so the
 * MDT can detect concurrent modification.
 * NOTE(review): interior lines are elided in this listing, so some
 * declarations and error paths are not visible here.
 */
4032 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4035 struct dentry *dchild = NULL;
4036 struct inode *child_inode = NULL;
4037 struct md_op_data *op_data;
4038 struct ptlrpc_request *request = NULL;
4039 struct obd_client_handle *och = NULL;
4041 struct mdt_body *body;
4042 __u64 data_version = 0;
4043 size_t namelen = strlen(name);
4044 int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4048 CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4049 PFID(ll_inode2fid(parent)), name,
4050 lum->lum_stripe_offset, lum->lum_stripe_count);
/* Normalize the user md to little-endian wire format if needed. */
4052 if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4053 lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4054 lustre_swab_lmv_user_md(lum);
4056 /* Get child FID first */
4057 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4060 dchild = d_lookup(file_dentry(file), &qstr);
4062 if (dchild->d_inode)
4063 child_inode = igrab(dchild->d_inode);
/* Not in the dcache — ask the MDT for the child's FID/inode. */
4068 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
/* Striped-dir migration needs OBD_CONNECT2_DIR_MIGRATE support. */
4077 if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
4078 OBD_CONNECT2_DIR_MIGRATE)) {
4079 if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
4080 ll_i2info(child_inode)->lli_lsm_md) {
4081 CERROR("%s: MDT doesn't support stripe directory "
4083 ll_get_fsname(parent->i_sb, NULL, 0));
4084 GOTO(out_iput, rc = -EOPNOTSUPP);
4089 * lfs migrate command needs to be blocked on the client
4090 * by checking the migrate FID against the FID of the
/* Refuse to migrate the filesystem root itself. */
4093 if (child_inode == parent->i_sb->s_root->d_inode)
4094 GOTO(out_iput, rc = -EINVAL);
4096 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4097 child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4098 if (IS_ERR(op_data))
4099 GOTO(out_iput, rc = PTR_ERR(op_data));
4101 inode_lock(child_inode);
4102 op_data->op_fid3 = *ll_inode2fid(child_inode);
4103 if (!fid_is_sane(&op_data->op_fid3)) {
4104 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4105 ll_get_fsname(parent->i_sb, NULL, 0), name,
4106 PFID(&op_data->op_fid3));
4107 GOTO(out_unlock, rc = -EINVAL);
4110 op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4111 op_data->op_data = lum;
4112 op_data->op_data_size = lumlen;
/* Regular file: take a write lease and snapshot the data version so
 * the MDT can fail the migration if the file changes meanwhile. */
4115 if (S_ISREG(child_inode->i_mode)) {
4116 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4120 GOTO(out_unlock, rc);
4123 rc = ll_data_version(child_inode, &data_version,
4126 GOTO(out_close, rc);
4128 op_data->op_open_handle = och->och_open_handle;
4129 op_data->op_data_version = data_version;
4130 op_data->op_lease_handle = och->och_lease_handle;
4131 op_data->op_bias |= MDS_CLOSE_MIGRATE;
/* The open request must not be replayed once migration starts. */
4133 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4134 och->och_mod->mod_open_req->rq_replay = 0;
4135 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
/* Migration is expressed as a rename of the entry onto itself. */
4138 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4139 name, namelen, &request);
4141 LASSERT(request != NULL);
4142 ll_update_times(request, parent);
4144 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4145 LASSERT(body != NULL);
4147 /* If the server does release layout lock, then we cleanup
4148 * the client och here, otherwise release it in out_close: */
4149 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4150 obd_mod_put(och->och_mod);
4151 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4153 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
4159 if (request != NULL) {
4160 ptlrpc_req_finished(request);
4164 /* Try again if the file layout has changed. */
4165 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
/* Cleanup paths: drop lease, nlink, child lock and op_data. */
4170 ll_lease_close(och, child_inode, NULL);
4172 clear_nlink(child_inode);
4174 inode_unlock(child_inode);
4175 ll_finish_md_op_data(op_data);
/* -o noflock handler: rejects file locking requests (body elided in
 * this listing — presumably returns -ENOSYS; TODO confirm). */
4182 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4190 * test if some locks matching bits and l_req_mode are acquired
4191 * - bits can be in different locks
4192 * - if found clear the common lock bits in *bits
4193 * - the bits not found, are kept in *bits
4195 * \param bits [IN] searched lock bits [IN]
4196 * \param l_req_mode [IN] searched lock mode
4197 * \retval boolean, true iff all bits are found
4199 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4201 struct lustre_handle lockh;
4202 union ldlm_policy_data policy;
/* LCK_MINMODE means "any mode": accept any of CR/CW/PR/PW. */
4203 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4204 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4213 fid = &ll_i2info(inode)->lli_fid;
4214 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4215 ldlm_lockname[mode]);
/* TEST_LOCK: only check for a match, do not take a reference. */
4217 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* Probe each inodebit separately since the searched bits may be
 * granted by several different locks.
 * NOTE(review): `1 << i` is int-width; fine while
 * MDS_INODELOCK_MAXSHIFT < 31, would need 1ULL otherwise. */
4218 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4219 policy.l_inodebits.bits = *bits & (1 << i);
4220 if (policy.l_inodebits.bits == 0)
4223 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4224 &policy, mode, &lockh)) {
4225 struct ldlm_lock *lock;
4227 lock = ldlm_handle2lock(&lockh);
/* Clear every bit the matched lock actually covers. */
4230 ~(lock->l_policy_data.l_inodebits.bits);
4231 LDLM_LOCK_PUT(lock);
4233 *bits &= ~policy.l_inodebits.bits;
/*
 * Try to match (and reference) a cached MDC inodebits lock on @inode
 * covering @bits with one of the modes in @mode.  On success the
 * handle is stored in @lockh and the matched mode is returned; the
 * caller is responsible for dropping the reference.
 */
4240 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4241 struct lustre_handle *lockh, __u64 flags,
4242 enum ldlm_mode mode)
4244 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4249 fid = &ll_i2info(inode)->lli_fid;
4250 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4252 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4253 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the result of an inode revalidation RPC: translate
 * "already unlinked" (-ENOENT) into success where appropriate and log
 * other failures.
 */
4258 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4260 /* Already unlinked. Just update nlink and return success */
4261 if (rc == -ENOENT) {
4263 /* If it is striped directory, and there is bad stripe
4264 * Let's revalidate the dentry again, instead of returning
4266 if (S_ISDIR(inode->i_mode) &&
4267 ll_i2info(inode)->lli_lsm_md != NULL)
4270 /* This path cannot be hit for regular files unless in
4271 * case of obscure races, so no need to to validate
4273 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
/* Expected failures (access denied, identity revoked) are logged
 * quietly at D_INFO; anything else is a real error. */
4275 } else if (rc != 0) {
4276 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4277 "%s: revalidate FID "DFID" error: rc = %d\n",
4278 ll_get_fsname(inode->i_sb, NULL, 0),
4279 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate @dentry's inode with the MDT via an intent lock RPC
 * (@op is e.g. IT_GETATTR or IT_LOOKUP).  Unhashes the dentry if the
 * file turns out to be unlinked on the server.
 */
4285 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4287 struct inode *inode = dentry->d_inode;
4288 struct obd_export *exp = ll_i2mdexp(inode);
4289 struct lookup_intent oit = {
4292 struct ptlrpc_request *req = NULL;
4293 struct md_op_data *op_data;
4297 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4298 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4300 /* Call getattr by fid, so do not provide name at all. */
4301 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4302 LUSTRE_OPC_ANY, NULL);
4303 if (IS_ERR(op_data))
4304 RETURN(PTR_ERR(op_data));
4306 rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4307 ll_finish_md_op_data(op_data);
/* Map -ENOENT and friends to the right caller-visible result. */
4309 rc = ll_inode_revalidate_fini(inode, rc);
4313 rc = ll_revalidate_it_finish(req, &oit, dentry);
4315 ll_intent_release(&oit);
4319 /* Unlinked? Unhash dentry, so it is not picked up later by
4320 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4321 * here to preserve get_cwd functionality on 2.6.
4323 if (!dentry->d_inode->i_nlink) {
4324 ll_lock_dcache(inode);
4325 d_lustre_invalidate(dentry, 0);
4326 ll_unlock_dcache(inode);
4329 ll_lookup_finish_locks(&oit, dentry);
4331 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge the attributes of all stripes
 * (nlink, blocks, size, a/m/ctime) into the master inode.
 */
4336 static int ll_merge_md_attr(struct inode *inode)
4338 struct ll_inode_info *lli = ll_i2info(inode);
4339 struct cl_attr attr = { 0 };
4342 LASSERT(lli->lli_lsm_md != NULL);
/* lli_lsm_sem protects lli_lsm_md against concurrent layout change. */
4343 down_read(&lli->lli_lsm_sem);
4344 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4345 &attr, ll_md_blocking_ast);
4346 up_read(&lli->lli_lsm_sem);
/* Publish the merged attributes on the VFS inode and lli cache. */
4350 set_nlink(inode, attr.cat_nlink);
4351 inode->i_blocks = attr.cat_blocks;
4352 i_size_write(inode, attr.cat_size);
4354 ll_i2info(inode)->lli_atime = attr.cat_atime;
4355 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4356 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
4361 static inline dev_t ll_compat_encode_dev(dev_t dev)
4363 /* The compat_sys_*stat*() syscalls will fail unless the
4364 * device majors and minors are both less than 256. Note that
4365 * the value returned here will be passed through
4366 * old_encode_dev() in cp_compat_stat(). And so we are not
4367 * trying to return a valid compat (u16) device number, just
4368 * one that will pass the old_valid_dev() check. */
4370 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
/*
 * VFS ->getattr(): revalidate with the MDT, glimpse the size from the
 * OSTs for regular files, merge stripe attributes for striped
 * directories, then fill in *stat (squashing ino/dev for 32-bit API
 * users when required).
 */
4373 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4374 int ll_getattr(const struct path *path, struct kstat *stat,
4375 u32 request_mask, unsigned int flags)
4377 struct dentry *de = path->dentry;
4379 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4382 struct inode *inode = de->d_inode;
4383 struct ll_sb_info *sbi = ll_i2sbi(inode);
4384 struct ll_inode_info *lli = ll_i2info(inode);
4387 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4389 rc = ll_inode_revalidate(de, IT_GETATTR);
4393 if (S_ISREG(inode->i_mode)) {
4394 /* In case of restore, the MDT has the right size and has
4395 * already send it back without granting the layout lock,
4396 * inode is up-to-date so glimpse is useless.
4397 * Also to glimpse we need the layout, in case of a running
4398 * restore the MDT holds the layout lock so the glimpse will
4399 * block up to the end of restore (getattr will block)
4401 if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4402 rc = ll_glimpse_size(inode);
4407 /* If object isn't regular a file then don't validate size. */
4408 if (S_ISDIR(inode->i_mode) &&
4409 lli->lli_lsm_md != NULL) {
4410 rc = ll_merge_md_attr(inode);
/* Otherwise trust the timestamps cached in lli. */
4415 inode->i_atime.tv_sec = lli->lli_atime;
4416 inode->i_mtime.tv_sec = lli->lli_mtime;
4417 inode->i_ctime.tv_sec = lli->lli_ctime;
4420 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
/* 32-bit userspace on a 64-bit kernel needs squashed ino/dev. */
4422 if (ll_need_32bit_api(sbi)) {
4423 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4424 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4425 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4427 stat->ino = inode->i_ino;
4428 stat->dev = inode->i_sb->s_dev;
4429 stat->rdev = inode->i_rdev;
4432 stat->mode = inode->i_mode;
4433 stat->uid = inode->i_uid;
4434 stat->gid = inode->i_gid;
4435 stat->atime = inode->i_atime;
4436 stat->mtime = inode->i_mtime;
4437 stat->ctime = inode->i_ctime;
/* Prefer the tunable stat blocksize when one is configured. */
4438 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4440 stat->nlink = inode->i_nlink;
4441 stat->size = i_size_read(inode);
4442 stat->blocks = inode->i_blocks;
/*
 * VFS ->fiemap() glue: marshal fiemap_extent_info into an on-heap
 * struct fiemap, run the mapping request, and copy the resulting
 * extents back to userspace.
 * NOTE(review): only the first extent is copied in from userspace —
 * this looks intentional (fm_extents[0] may carry input state), but
 * confirm against the fiemap ioctl API before changing.
 */
4447 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4448 __u64 start, __u64 len)
4452 struct fiemap *fiemap;
4453 unsigned int extent_count = fieinfo->fi_extents_max;
4455 num_bytes = sizeof(*fiemap) + (extent_count *
4456 sizeof(struct fiemap_extent));
4457 OBD_ALLOC_LARGE(fiemap, num_bytes);
4462 fiemap->fm_flags = fieinfo->fi_flags;
4463 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4464 fiemap->fm_start = start;
4465 fiemap->fm_length = len;
4466 if (extent_count > 0 &&
4467 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4468 sizeof(struct fiemap_extent)) != 0)
4469 GOTO(out, rc = -EFAULT);
4471 rc = ll_do_fiemap(inode, fiemap, num_bytes);
/* Propagate flags and mapped extents back to the caller. */
4473 fieinfo->fi_flags = fiemap->fm_flags;
4474 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4475 if (extent_count > 0 &&
4476 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4477 fiemap->fm_mapped_extents *
4478 sizeof(struct fiemap_extent)) != 0)
4479 GOTO(out, rc = -EFAULT);
4481 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * VFS ->get_acl(): return a referenced copy of the cached POSIX ACL
 * for @inode; the caller (VFS) drops the reference.
 */
4485 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4487 struct ll_inode_info *lli = ll_i2info(inode);
4488 struct posix_acl *acl = NULL;
/* lli_lock guards lli_posix_acl while the reference is taken. */
4491 spin_lock(&lli->lli_lock);
4492 /* VFS' acl_permission_check->check_acl will release the refcount */
4493 acl = posix_acl_dup(lli->lli_posix_acl);
4494 spin_unlock(&lli->lli_lock);
/*
 * VFS ->set_acl(): serialize @acl into an xattr value and store it on
 * the MDT (or remove the xattr when @acl is NULL), then refresh the
 * kernel's cached ACL.
 */
4499 #ifdef HAVE_IOP_SET_ACL
4500 #ifdef CONFIG_FS_POSIX_ACL
4501 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4503 struct ll_sb_info *sbi = ll_i2sbi(inode);
4504 struct ptlrpc_request *req = NULL;
4505 const char *name = NULL;
4507 size_t value_size = 0;
4512 case ACL_TYPE_ACCESS:
4513 name = XATTR_NAME_POSIX_ACL_ACCESS;
/* An access ACL may also rewrite the file mode bits. */
4515 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4518 case ACL_TYPE_DEFAULT:
4519 name = XATTR_NAME_POSIX_ACL_DEFAULT;
/* Default ACLs are only meaningful on directories. */
4520 if (!S_ISDIR(inode->i_mode))
4521 rc = acl ? -EACCES : 0;
4532 value_size = posix_acl_xattr_size(acl->a_count);
4533 value = kmalloc(value_size, GFP_NOFS);
4535 GOTO(out, rc = -ENOMEM);
4537 rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
4539 GOTO(out_value, rc);
/* A NULL value removes the xattr instead of setting it. */
4542 rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4543 value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
4544 name, value, value_size, 0, 0, &req);
4546 ptlrpc_req_finished(req);
/* Keep the VFS ACL cache coherent with the new server state. */
4551 forget_cached_acl(inode, type);
4553 set_cached_acl(inode, type, acl);
4556 #endif /* CONFIG_FS_POSIX_ACL */
4557 #endif /* HAVE_IOP_SET_ACL */
/*
 * ACL callback for generic permission checking on kernels without the
 * 2-argument generic_permission(): check @mask against the inode's
 * access ACL.  NOTE(review): the RCU-walk bail-out return and the
 * no-ACL fallbacks are on lines elided from this listing.
 */
4559 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4561 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4562 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4564 ll_check_acl(struct inode *inode, int mask)
4567 # ifdef CONFIG_FS_POSIX_ACL
4568 struct posix_acl *acl;
4572 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
/* Cannot touch the ACL during RCU path walk. */
4573 if (flags & IPERM_FLAG_RCU)
4576 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4581 rc = posix_acl_permission(inode, acl, mask);
4582 posix_acl_release(acl);
4585 # else /* !CONFIG_FS_POSIX_ACL */
4587 # endif /* CONFIG_FS_POSIX_ACL */
4589 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * VFS ->permission(): revalidate the root inode when needed, apply
 * root squash by temporarily overriding the process credentials, then
 * defer to generic permission checking (with ll_check_acl).
 */
4591 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4592 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4594 # ifdef HAVE_INODE_PERMISION_2ARGS
4595 int ll_inode_permission(struct inode *inode, int mask)
4597 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4602 struct ll_sb_info *sbi;
4603 struct root_squash_info *squash;
4604 struct cred *cred = NULL;
4605 const struct cred *old_cred = NULL;
4607 bool squash_id = false;
/* This path may block (RPCs), so bail out of RCU path walk. */
4610 #ifdef MAY_NOT_BLOCK
4611 if (mask & MAY_NOT_BLOCK)
4613 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4614 if (flags & IPERM_FLAG_RCU)
4618 /* as root inode are NOT getting validated in lookup operation,
4619 * need to do it before permission check. */
4621 if (inode == inode->i_sb->s_root->d_inode) {
4622 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4627 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4628 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4630 /* squash fsuid/fsgid if needed */
4631 sbi = ll_i2sbi(inode);
4632 squash = &sbi->ll_squash;
4633 if (unlikely(squash->rsi_uid != 0 &&
4634 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4635 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4639 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4640 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4641 squash->rsi_uid, squash->rsi_gid);
4643 /* update current process's credentials
4644 * and FS capability */
4645 cred = prepare_creds();
4649 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4650 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* Drop all filesystem-related capabilities while squashed. */
4651 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4652 if ((1 << cap) & CFS_CAP_FS_MASK)
4653 cap_lower(cred->cap_effective, cap);
4655 old_cred = override_creds(cred);
4658 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4659 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4660 /* restore current process's credentials and FS capability */
4662 revert_creds(old_cred);
4669 /* -o localflock - only provides locally consistent flock locks */
/* Default file operations: no .flock/.lock methods are set, so the
 * VFS handles flock purely on this client (no cluster coherency). */
4670 struct file_operations ll_file_operations = {
4671 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4672 # ifdef HAVE_SYNC_READ_WRITE
4673 .read = new_sync_read,
4674 .write = new_sync_write,
4676 .read_iter = ll_file_read_iter,
4677 .write_iter = ll_file_write_iter,
4678 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4679 .read = ll_file_read,
4680 .aio_read = ll_file_aio_read,
4681 .write = ll_file_write,
4682 .aio_write = ll_file_aio_write,
4683 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4684 .unlocked_ioctl = ll_file_ioctl,
4685 .open = ll_file_open,
4686 .release = ll_file_release,
4687 .mmap = ll_file_mmap,
4688 .llseek = ll_file_seek,
4689 .splice_read = ll_file_splice_read,
/* -o flock variant: routes flock/posix locking through ll_file_flock
 * for cluster-coherent locking across clients. */
4694 struct file_operations ll_file_operations_flock = {
4695 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4696 # ifdef HAVE_SYNC_READ_WRITE
4697 .read = new_sync_read,
4698 .write = new_sync_write,
4699 # endif /* HAVE_SYNC_READ_WRITE */
4700 .read_iter = ll_file_read_iter,
4701 .write_iter = ll_file_write_iter,
4702 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4703 .read = ll_file_read,
4704 .aio_read = ll_file_aio_read,
4705 .write = ll_file_write,
4706 .aio_write = ll_file_aio_write,
4707 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4708 .unlocked_ioctl = ll_file_ioctl,
4709 .open = ll_file_open,
4710 .release = ll_file_release,
4711 .mmap = ll_file_mmap,
4712 .llseek = ll_file_seek,
4713 .splice_read = ll_file_splice_read,
4716 .flock = ll_file_flock,
4717 .lock = ll_file_flock
4720 /* These are for -o noflock - to return ENOSYS on flock calls */
4721 struct file_operations ll_file_operations_noflock = {
4722 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4723 # ifdef HAVE_SYNC_READ_WRITE
4724 .read = new_sync_read,
4725 .write = new_sync_write,
4726 # endif /* HAVE_SYNC_READ_WRITE */
4727 .read_iter = ll_file_read_iter,
4728 .write_iter = ll_file_write_iter,
4729 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4730 .read = ll_file_read,
4731 .aio_read = ll_file_aio_read,
4732 .write = ll_file_write,
4733 .aio_write = ll_file_aio_write,
4734 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4735 .unlocked_ioctl = ll_file_ioctl,
4736 .open = ll_file_open,
4737 .release = ll_file_release,
4738 .mmap = ll_file_mmap,
4739 .llseek = ll_file_seek,
4740 .splice_read = ll_file_splice_read,
/* Explicitly reject locking rather than falling back to local flock. */
4743 .flock = ll_file_noflock,
4744 .lock = ll_file_noflock
/* Inode operations for regular files. */
4747 struct inode_operations ll_file_inode_operations = {
4748 .setattr = ll_setattr,
4749 .getattr = ll_getattr,
4750 .permission = ll_inode_permission,
4751 #ifdef HAVE_IOP_XATTR
4752 .setxattr = ll_setxattr,
4753 .getxattr = ll_getxattr,
4754 .removexattr = ll_removexattr,
4756 .listxattr = ll_listxattr,
4757 .fiemap = ll_fiemap,
4758 #ifdef HAVE_IOP_GET_ACL
4759 .get_acl = ll_get_acl,
4761 #ifdef HAVE_IOP_SET_ACL
4762 .set_acl = ll_set_acl,
/*
 * Push a layout configuration change down to the cl_object stack.
 * For OBJECT_CONF_SET the new layout is applied and the inode's
 * layout generation is updated from the resulting cl_layout.
 */
4766 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4768 struct ll_inode_info *lli = ll_i2info(inode);
4769 struct cl_object *obj = lli->lli_clob;
4778 env = cl_env_get(&refcheck);
4780 RETURN(PTR_ERR(env));
4782 rc = cl_conf_set(env, lli->lli_clob, conf);
4786 if (conf->coc_opc == OBJECT_CONF_SET) {
4787 struct ldlm_lock *lock = conf->coc_lock;
4788 struct cl_layout cl = {
4792 LASSERT(lock != NULL);
4793 LASSERT(ldlm_has_layout(lock));
4795 /* it can only be allowed to match after layout is
4796 * applied to inode otherwise false layout would be
4797 * seen. Applying layout shoud happen before dropping
4798 * the intent lock. */
4799 ldlm_lock_allow_match(lock);
4801 rc = cl_object_layout_get(env, obj, &cl);
/* Record the new layout generation on the inode. */
4806 DFID": layout version change: %u -> %u\n",
4807 PFID(&lli->lli_fid), ll_layout_version_get(lli),
4809 ll_layout_version_set(lli, cl.cl_layout_gen);
4813 cl_env_put(env, &refcheck);
4818 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
4819 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4822 struct ll_sb_info *sbi = ll_i2sbi(inode);
4823 struct ptlrpc_request *req;
4830 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4831 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4832 lock->l_lvb_data, lock->l_lvb_len);
/* LVB already populated (lock granted right away) — nothing to do. */
4834 if (lock->l_lvb_data != NULL)
4837 /* if layout lock was granted right away, the layout is returned
4838 * within DLM_LVB of dlm reply; otherwise if the lock was ever
4839 * blocked and then granted via completion ast, we have to fetch
4840 * layout here. Please note that we can't use the LVB buffer in
4841 * completion AST because it doesn't have a large enough buffer */
4842 rc = ll_get_default_mdsize(sbi, &lmmsize);
4846 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), OBD_MD_FLXATTR,
4847 XATTR_NAME_LOV, lmmsize, &req);
4850 GOTO(out, rc = 0); /* empty layout */
4857 if (lmmsize == 0) /* empty layout */
4860 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4862 GOTO(out, rc = -EFAULT);
/* Copy the layout into a buffer that can live in the lock's LVB. */
4864 OBD_ALLOC_LARGE(lvbdata, lmmsize);
4865 if (lvbdata == NULL)
4866 GOTO(out, rc = -ENOMEM);
4868 memcpy(lvbdata, lmm, lmmsize);
4869 lock_res_and_lock(lock);
/* Install our copy only if nobody raced us; otherwise free it. */
4870 if (unlikely(lock->l_lvb_data == NULL)) {
4871 lock->l_lvb_type = LVB_T_LAYOUT;
4872 lock->l_lvb_data = lvbdata;
4873 lock->l_lvb_len = lmmsize;
4876 unlock_res_and_lock(lock);
4879 OBD_FREE_LARGE(lvbdata, lmmsize);
4884 ptlrpc_req_finished(req);
4889 * Apply the layout to the inode. Layout lock is held and will be released
4892 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4893 struct inode *inode)
4895 struct ll_inode_info *lli = ll_i2info(inode);
4896 struct ll_sb_info *sbi = ll_i2sbi(inode);
4897 struct ldlm_lock *lock;
4898 struct cl_object_conf conf;
4901 bool wait_layout = false;
4904 LASSERT(lustre_handle_is_used(lockh));
4906 lock = ldlm_handle2lock(lockh);
4907 LASSERT(lock != NULL);
4908 LASSERT(ldlm_has_layout(lock));
4910 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4911 PFID(&lli->lli_fid), inode);
4913 /* in case this is a caching lock and reinstate with new inode */
4914 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
4916 lock_res_and_lock(lock);
4917 lvb_ready = ldlm_is_lvb_ready(lock);
4918 unlock_res_and_lock(lock);
4920 /* checking lvb_ready is racy but this is okay. The worst case is
4921 * that multi processes may configure the file on the same time. */
/* Ensure the layout blob is present in the lock's LVB. */
4925 rc = ll_layout_fetch(inode, lock);
4929 /* for layout lock, lmm is stored in lock's lvb.
4930 * lvb_data is immutable if the lock is held so it's safe to access it
4933 * set layout to file. Unlikely this will fail as old layout was
4934 * surely eliminated */
4935 memset(&conf, 0, sizeof conf);
4936 conf.coc_opc = OBJECT_CONF_SET;
4937 conf.coc_inode = inode;
4938 conf.coc_lock = lock;
4939 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
4940 conf.u.coc_layout.lb_len = lock->l_lvb_len;
4941 rc = ll_layout_conf(inode, &conf);
4943 /* refresh layout failed, need to wait */
4944 wait_layout = rc == -EBUSY;
/* Drop our references; the lock itself stays cached. */
4947 LDLM_LOCK_PUT(lock);
4948 ldlm_lock_decref(lockh, mode);
4950 /* wait for IO to complete if it's still being used. */
4952 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
4953 ll_get_fsname(inode->i_sb, NULL, 0),
4954 PFID(&lli->lli_fid), inode);
/* OBJECT_CONF_WAIT blocks until outstanding IO lets go of the
 * old layout so reconfiguration can proceed. */
4956 memset(&conf, 0, sizeof conf);
4957 conf.coc_opc = OBJECT_CONF_WAIT;
4958 conf.coc_inode = inode;
4959 rc = ll_layout_conf(inode, &conf);
4963 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4964 ll_get_fsname(inode->i_sb, NULL, 0),
4965 PFID(&lli->lli_fid), rc);
4971 * Issue layout intent RPC to MDS.
4972 * \param inode [in] file inode
4973 * \param intent [in] layout intent
4975 * \retval 0 on success
4976 * \retval < 0 error code
4978 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
4980 struct ll_inode_info *lli = ll_i2info(inode);
4981 struct ll_sb_info *sbi = ll_i2sbi(inode);
4982 struct md_op_data *op_data;
4983 struct lookup_intent it;
4984 struct ptlrpc_request *req;
4988 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4989 0, 0, LUSTRE_OPC_ANY, NULL);
4990 if (IS_ERR(op_data))
4991 RETURN(PTR_ERR(op_data));
/* The intent payload rides along in op_data. */
4993 op_data->op_data = intent;
4994 op_data->op_data_size = sizeof(*intent);
4996 memset(&it, 0, sizeof(it));
4997 it.it_op = IT_LAYOUT;
/* Write and truncate intents need a write-mode layout lock. */
4998 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
4999 intent->li_opc == LAYOUT_INTENT_TRUNC)
5000 it.it_flags = FMODE_WRITE;
5002 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
5003 ll_get_fsname(inode->i_sb, NULL, 0),
5004 PFID(&lli->lli_fid), inode);
5006 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
5007 &ll_md_blocking_ast, 0);
5008 if (it.it_request != NULL)
5009 ptlrpc_req_finished(it.it_request);
5010 it.it_request = NULL;
5012 ll_finish_md_op_data(op_data);
5014 /* set lock data in case this is a new lock */
5016 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
5018 ll_intent_drop_lock(&it);
5024 * This function checks if there exists a LAYOUT lock on the client side,
5025 * or enqueues it if it doesn't have one in cache.
5027 * This function will not hold layout lock so it may be revoked any time after
5028 * this function returns. Any operations depend on layout should be redone
5031 * This function should be called before lov_io_init() to get an uptodate
5032 * layout version, the caller should save the version number and after IO
5033 * is finished, this function should be called again to verify that layout
5034 * is not changed during IO time.
5036 int ll_layout_refresh(struct inode *inode, __u32 *gen)
5038 struct ll_inode_info *lli = ll_i2info(inode);
5039 struct ll_sb_info *sbi = ll_i2sbi(inode);
5040 struct lustre_handle lockh;
5041 struct layout_intent intent = {
5042 .li_opc = LAYOUT_INTENT_ACCESS,
5044 enum ldlm_mode mode;
5048 *gen = ll_layout_version_get(lli);
/* Nothing to do when layout locks are disabled or the cached
 * generation is already valid. */
5049 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
5053 LASSERT(fid_is_sane(ll_inode2fid(inode)));
5054 LASSERT(S_ISREG(inode->i_mode));
5056 /* take layout lock mutex to enqueue layout lock exclusively. */
5057 mutex_lock(&lli->lli_layout_mutex);
5060 /* mostly layout lock is caching on the local side, so try to
5061 * match it before grabbing layout lock mutex. */
5062 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5063 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
5064 if (mode != 0) { /* hit cached lock */
5065 rc = ll_layout_lock_set(&lockh, mode, inode);
/* Cache miss: enqueue a fresh layout lock via an intent RPC. */
5071 rc = ll_layout_intent(inode, &intent);
5077 *gen = ll_layout_version_get(lli);
5078 mutex_unlock(&lli->lli_layout_mutex);
5084 * Issue layout intent RPC indicating where in a file an IO is about to write.
5086 * \param[in] inode file inode.
5087 * \param[in] ext write range with start offset of fille in bytes where
5088 * an IO is about to write, and exclusive end offset in
5091 * \retval 0 on success
5092 * \retval < 0 error code
5094 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5095 struct lu_extent *ext)
/* Wrap @opc/@ext into a layout_intent and reuse the generic path. */
5097 struct layout_intent intent = {
5099 .li_extent.e_start = ext->e_start,
5100 .li_extent.e_end = ext->e_end,
5105 rc = ll_layout_intent(inode, &intent);
5111 * This function send a restore request to the MDT
5113 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5115 struct hsm_user_request *hur;
5119 len = sizeof(struct hsm_user_request) +
5120 sizeof(struct hsm_user_item);
5121 OBD_ALLOC(hur, len);
5125 hur->hur_request.hr_action = HUA_RESTORE;
5126 hur->hur_request.hr_archive_id = 0;
5127 hur->hur_request.hr_flags = 0;
5128 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5129 sizeof(hur->hur_user_item[0].hui_fid));
5130 hur->hur_user_item[0].hui_extent.offset = offset;
5131 hur->hur_user_item[0].hui_extent.length = length;
5132 hur->hur_request.hr_itemcount = 1;
5133 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,