4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2017, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
49 #include <uapi/linux/lustre/lustre_ioctl.h>
50 #include <lustre_swab.h>
52 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
57 struct inode *sp_inode;
62 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
64 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
67 static struct ll_file_data *ll_file_data_get(void)
69 struct ll_file_data *fd;
71 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
75 fd->fd_write_failed = false;
80 static void ll_file_data_put(struct ll_file_data *fd)
83 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
87 * Packs all the attributes into @op_data for the CLOSE rpc.
89 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
90 struct obd_client_handle *och)
94 ll_prep_md_op_data(op_data, inode, NULL, NULL,
95 0, 0, LUSTRE_OPC_ANY, NULL);
97 op_data->op_attr.ia_mode = inode->i_mode;
98 op_data->op_attr.ia_atime = inode->i_atime;
99 op_data->op_attr.ia_mtime = inode->i_mtime;
100 op_data->op_attr.ia_ctime = inode->i_ctime;
101 op_data->op_attr.ia_size = i_size_read(inode);
102 op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
103 ATTR_MTIME | ATTR_MTIME_SET |
105 op_data->op_xvalid |= OP_XVALID_CTIME_SET;
106 op_data->op_attr_blocks = inode->i_blocks;
107 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
108 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
109 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
110 op_data->op_open_handle = och->och_open_handle;
112 if (och->och_flags & FMODE_WRITE &&
113 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
114 /* For HSM: if inode data has been modified, pack it so that
115 * MDT can set data dirty flag in the archive. */
116 op_data->op_bias |= MDS_DATA_MODIFIED;
122 * Perform a close, possibly with a bias.
123 * The meaning of "data" depends on the value of "bias".
125 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
126 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
129 static int ll_close_inode_openhandle(struct inode *inode,
130 struct obd_client_handle *och,
131 enum mds_op_bias bias, void *data)
133 struct obd_export *md_exp = ll_i2mdexp(inode);
134 const struct ll_inode_info *lli = ll_i2info(inode);
135 struct md_op_data *op_data;
136 struct ptlrpc_request *req = NULL;
140 if (class_exp2obd(md_exp) == NULL) {
141 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
142 ll_i2sbi(inode)->ll_fsname, PFID(&lli->lli_fid));
146 OBD_ALLOC_PTR(op_data);
/* We leak the openhandle and the request here on error, but there is not
 * much to be done in the OOM case since the app won't retry the close on
 * error either. */
150 GOTO(out, rc = -ENOMEM);
152 ll_prepare_close(inode, op_data, och);
154 case MDS_CLOSE_LAYOUT_MERGE:
155 /* merge blocks from the victim inode */
156 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
157 op_data->op_attr.ia_valid |= ATTR_SIZE;
158 op_data->op_xvalid |= OP_XVALID_BLOCKS;
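/* fall through: MERGE shares the lease/FID handling of SPLIT/SWAP below */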
159 case MDS_CLOSE_LAYOUT_SPLIT:
160 case MDS_CLOSE_LAYOUT_SWAP: {
161 struct split_param *sp = data;
163 LASSERT(data != NULL);
164 op_data->op_bias |= bias;
165 op_data->op_data_version = 0;
166 op_data->op_lease_handle = och->och_lease_handle;
167 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
168 op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
169 op_data->op_mirror_id = sp->sp_mirror_id;
171 op_data->op_fid2 = *ll_inode2fid(data);
176 case MDS_CLOSE_RESYNC_DONE: {
177 struct ll_ioc_lease *ioc = data;
179 LASSERT(data != NULL);
180 op_data->op_attr_blocks +=
181 ioc->lil_count * op_data->op_attr_blocks;
182 op_data->op_attr.ia_valid |= ATTR_SIZE;
183 op_data->op_xvalid |= OP_XVALID_BLOCKS;
184 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
186 op_data->op_lease_handle = och->och_lease_handle;
187 op_data->op_data = &ioc->lil_ids[0];
188 op_data->op_data_size =
189 ioc->lil_count * sizeof(ioc->lil_ids[0]);
193 case MDS_HSM_RELEASE:
194 LASSERT(data != NULL);
195 op_data->op_bias |= MDS_HSM_RELEASE;
196 op_data->op_data_version = *(__u64 *)data;
197 op_data->op_lease_handle = och->och_lease_handle;
198 op_data->op_attr.ia_valid |= ATTR_SIZE;
199 op_data->op_xvalid |= OP_XVALID_BLOCKS;
203 LASSERT(data == NULL);
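/* Size/blocks that were not explicitly packed by a biased close above are
 * only passed along as lazy hints, so the MDT does not treat them as
 * authoritative. */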
207 if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
208 op_data->op_xvalid |= OP_XVALID_LAZYSIZE;
209 if (!(op_data->op_xvalid & OP_XVALID_BLOCKS))
210 op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS;
212 rc = md_close(md_exp, op_data, och->och_mod, &req);
213 if (rc != 0 && rc != -EINTR)
214 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
215 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
217 if (rc == 0 && op_data->op_bias & bias) {
218 struct mdt_body *body;
220 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
221 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
225 ll_finish_md_op_data(op_data);
229 md_clear_open_replay_data(md_exp, och);
230 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
233 ptlrpc_req_finished(req); /* This is close request */
237 int ll_md_real_close(struct inode *inode, fmode_t fmode)
239 struct ll_inode_info *lli = ll_i2info(inode);
240 struct obd_client_handle **och_p;
241 struct obd_client_handle *och;
246 if (fmode & FMODE_WRITE) {
247 och_p = &lli->lli_mds_write_och;
248 och_usecount = &lli->lli_open_fd_write_count;
249 } else if (fmode & FMODE_EXEC) {
250 och_p = &lli->lli_mds_exec_och;
251 och_usecount = &lli->lli_open_fd_exec_count;
253 LASSERT(fmode & FMODE_READ);
254 och_p = &lli->lli_mds_read_och;
255 och_usecount = &lli->lli_open_fd_read_count;
258 mutex_lock(&lli->lli_och_mutex);
259 if (*och_usecount > 0) {
/* There are still users of this handle, so skip
 * freeing it. */
262 mutex_unlock(&lli->lli_och_mutex);
268 mutex_unlock(&lli->lli_och_mutex);
/* There might be a race and this handle may already
 * be closed. */
273 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
279 static int ll_md_close(struct inode *inode, struct file *file)
281 union ldlm_policy_data policy = {
282 .l_inodebits = { MDS_INODELOCK_OPEN },
284 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
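/* LDLM_FL_TEST_LOCK: only probe for an existing OPEN ibits lock,
 * without taking a new reference on it */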
285 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
286 struct ll_inode_info *lli = ll_i2info(inode);
287 struct lustre_handle lockh;
288 enum ldlm_mode lockmode;
292 /* clear group lock, if present */
293 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
294 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
296 if (fd->fd_lease_och != NULL) {
/* Usually the lease is not released when the
 * application crashes, so we need to release it here. */
301 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
302 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
303 PFID(&lli->lli_fid), rc, lease_broken);
305 fd->fd_lease_och = NULL;
308 if (fd->fd_och != NULL) {
309 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
/* Let's see if we have a good enough OPEN lock on the file and
 * whether we can skip talking to the MDS */
316 mutex_lock(&lli->lli_och_mutex);
317 if (fd->fd_omode & FMODE_WRITE) {
319 LASSERT(lli->lli_open_fd_write_count);
320 lli->lli_open_fd_write_count--;
321 } else if (fd->fd_omode & FMODE_EXEC) {
323 LASSERT(lli->lli_open_fd_exec_count);
324 lli->lli_open_fd_exec_count--;
327 LASSERT(lli->lli_open_fd_read_count);
328 lli->lli_open_fd_read_count--;
330 mutex_unlock(&lli->lli_och_mutex);
332 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
333 LDLM_IBITS, &policy, lockmode, &lockh))
334 rc = ll_md_real_close(inode, fd->fd_omode);
337 LUSTRE_FPRIVATE(file) = NULL;
338 ll_file_data_put(fd);
/* While this returns an error code, the caller (fput()) ignores it, so we
 * need to make every effort to clean up all of our state here. Also,
 * applications rarely check close errors, and even if an error is returned
 * they will not retry the close call. */
348 int ll_file_release(struct inode *inode, struct file *file)
350 struct ll_file_data *fd;
351 struct ll_sb_info *sbi = ll_i2sbi(inode);
352 struct ll_inode_info *lli = ll_i2info(inode);
356 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
357 PFID(ll_inode2fid(inode)), inode);
359 if (inode->i_sb->s_root != file_dentry(file))
360 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
361 fd = LUSTRE_FPRIVATE(file);
/* The last ref on @file, maybe not the owner pid of statahead,
 * because parent and child processes can share the same file handle. */
366 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
367 ll_deauthorize_statahead(inode, fd);
369 if (inode->i_sb->s_root == file_dentry(file)) {
370 LUSTRE_FPRIVATE(file) = NULL;
371 ll_file_data_put(fd);
375 if (!S_ISDIR(inode->i_mode)) {
376 if (lli->lli_clob != NULL)
377 lov_read_and_clear_async_rc(lli->lli_clob);
378 lli->lli_async_rc = 0;
381 rc = ll_md_close(inode, file);
383 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
384 libcfs_debug_dumplog();
389 static inline int ll_dom_readpage(void *data, struct page *page)
391 struct niobuf_local *lnb = data;
394 kaddr = ll_kmap_atomic(page, KM_USER0);
395 memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
396 if (lnb->lnb_len < PAGE_SIZE)
397 memset(kaddr + lnb->lnb_len, 0,
398 PAGE_SIZE - lnb->lnb_len);
399 flush_dcache_page(page);
400 SetPageUptodate(page);
401 ll_kunmap_atomic(kaddr, KM_USER0);
407 void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
408 struct lookup_intent *it)
410 struct ll_inode_info *lli = ll_i2info(inode);
411 struct cl_object *obj = lli->lli_clob;
412 struct address_space *mapping = inode->i_mapping;
414 struct niobuf_remote *rnb;
416 struct lustre_handle lockh;
417 struct ldlm_lock *lock;
418 unsigned long index, start;
419 struct niobuf_local lnb;
420 bool dom_lock = false;
427 if (it->it_lock_mode != 0) {
428 lockh.cookie = it->it_lock_handle;
429 lock = ldlm_handle2lock(&lockh);
431 dom_lock = ldlm_has_dom(lock);
437 if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
441 rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
442 if (rnb == NULL || rnb->rnb_len == 0)
/* LU-11595: the server may return the whole file (which is always OK) or
 * just the file tail, whose offset must be aligned with the client
 * PAGE_SIZE to be usable on this client; if the server's PAGE_SIZE is
 * smaller, the offset may be unaligned and that data is simply ignored. */
450 if (rnb->rnb_offset % PAGE_SIZE)
/* The server returns the whole file or just the file tail if it fits in the
 * reply buffer; in both cases the returned range should extend to the inode size. */
456 if (rnb->rnb_offset + rnb->rnb_len < i_size_read(inode)) {
457 CERROR("%s: server returns off/len %llu/%u < i_size %llu\n",
458 ll_i2sbi(inode)->ll_fsname, rnb->rnb_offset,
459 rnb->rnb_len, i_size_read(inode));
463 CDEBUG(D_INFO, "Get data along with open at %llu len %i, i_size %llu\n",
464 rnb->rnb_offset, rnb->rnb_len, i_size_read(inode));
466 data = (char *)rnb + sizeof(*rnb);
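/* the inline file data follows the niobuf_remote descriptor in the reply */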
468 lnb.lnb_file_offset = rnb->rnb_offset;
469 start = lnb.lnb_file_offset / PAGE_SIZE;
471 LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
472 lnb.lnb_page_offset = 0;
474 lnb.lnb_data = data + (index << PAGE_SHIFT);
475 lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
476 if (lnb.lnb_len > PAGE_SIZE)
477 lnb.lnb_len = PAGE_SIZE;
479 vmpage = read_cache_page(mapping, index + start,
480 ll_dom_readpage, &lnb);
481 if (IS_ERR(vmpage)) {
482 CWARN("%s: cannot fill page %lu for "DFID
483 " with data: rc = %li\n",
484 ll_i2sbi(inode)->ll_fsname, index + start,
485 PFID(lu_object_fid(&obj->co_lu)),
491 } while (rnb->rnb_len > (index << PAGE_SHIFT));
495 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
496 struct lookup_intent *itp)
498 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
499 struct dentry *parent = de->d_parent;
502 struct md_op_data *op_data;
503 struct ptlrpc_request *req = NULL;
507 LASSERT(parent != NULL);
508 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
510 /* if server supports open-by-fid, or file name is invalid, don't pack
511 * name in open request */
512 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID)) {
514 len = de->d_name.len;
515 name = kmalloc(len, GFP_NOFS);
519 spin_lock(&de->d_lock);
520 if (len != de->d_name.len) {
521 spin_unlock(&de->d_lock);
525 memcpy(name, de->d_name.name, len);
526 spin_unlock(&de->d_lock);
528 if (!lu_name_is_valid_2(name, len)) {
535 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
536 name, len, 0, LUSTRE_OPC_ANY, NULL);
537 if (IS_ERR(op_data)) {
539 RETURN(PTR_ERR(op_data));
541 op_data->op_data = lmm;
542 op_data->op_data_size = lmmsize;
544 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
545 &ll_md_blocking_ast, 0);
547 ll_finish_md_op_data(op_data);
/* reason for keeping our own exit path: don't flood the log
 * with -ESTALE error messages. */
552 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
553 it_open_error(DISP_OPEN_OPEN, itp))
555 ll_release_openhandle(de, itp);
559 if (it_disposition(itp, DISP_LOOKUP_NEG))
560 GOTO(out, rc = -ENOENT);
562 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
563 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
564 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
568 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
570 if (!rc && itp->it_lock_mode) {
571 ll_dom_finish_open(de->d_inode, req, itp);
572 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
576 ptlrpc_req_finished(req);
577 ll_intent_drop_lock(itp);
579 /* We did open by fid, but by the time we got to the server,
580 * the object disappeared. If this is a create, we cannot really
581 * tell the userspace that the file it was trying to create
582 * does not exist. Instead let's return -ESTALE, and the VFS will
583 * retry the create with LOOKUP_REVAL that we are going to catch
584 * in ll_revalidate_dentry() and use lookup then.
586 if (rc == -ENOENT && itp->it_op & IT_CREAT)
592 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
593 struct obd_client_handle *och)
595 struct mdt_body *body;
597 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
598 och->och_open_handle = body->mbo_open_handle;
599 och->och_fid = body->mbo_fid1;
600 och->och_lease_handle.cookie = it->it_lock_handle;
601 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
602 och->och_flags = it->it_flags;
604 return md_set_open_replay_data(md_exp, och, it);
607 static int ll_local_open(struct file *file, struct lookup_intent *it,
608 struct ll_file_data *fd, struct obd_client_handle *och)
610 struct inode *inode = file_inode(file);
613 LASSERT(!LUSTRE_FPRIVATE(file));
620 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
625 LUSTRE_FPRIVATE(file) = fd;
626 ll_readahead_init(inode, &fd->fd_ras);
627 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
629 /* ll_cl_context initialize */
630 rwlock_init(&fd->fd_lock);
631 INIT_LIST_HEAD(&fd->fd_lccs);
636 /* Open a file, and (for the very first open) create objects on the OSTs at
637 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
638 * creation or open until ll_lov_setstripe() ioctl is called.
640 * If we already have the stripe MD locally then we don't request it in
641 * md_open(), by passing a lmm_size = 0.
643 * It is up to the application to ensure no other processes open this file
644 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
645 * used. We might be able to avoid races of that sort by getting lli_open_sem
646 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
647 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
649 int ll_file_open(struct inode *inode, struct file *file)
651 struct ll_inode_info *lli = ll_i2info(inode);
652 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
653 .it_flags = file->f_flags };
654 struct obd_client_handle **och_p = NULL;
655 __u64 *och_usecount = NULL;
656 struct ll_file_data *fd;
660 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
661 PFID(ll_inode2fid(inode)), inode, file->f_flags);
663 it = file->private_data; /* XXX: compat macro */
664 file->private_data = NULL; /* prevent ll_local_open assertion */
666 fd = ll_file_data_get();
668 GOTO(out_nofiledata, rc = -ENOMEM);
671 if (S_ISDIR(inode->i_mode))
672 ll_authorize_statahead(inode, fd);
674 if (inode->i_sb->s_root == file_dentry(file)) {
675 LUSTRE_FPRIVATE(file) = fd;
679 if (!it || !it->it_disposition) {
/* Convert f_flags into access mode. We cannot use file->f_mode,
 * because everything but the O_ACCMODE mask was stripped from it. */
if ((oit.it_flags + 1) & O_ACCMODE)
	oit.it_flags++;
685 if (file->f_flags & O_TRUNC)
686 oit.it_flags |= FMODE_WRITE;
/* The kernel only calls f_op->open() from dentry_open(); filp_open() calls
 * dentry_open() after open_namei() has checked permissions. Only nfsd_open()
 * calls dentry_open() directly without checking permissions, which is why
 * the code below is safe. */
693 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
694 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
696 /* We do not want O_EXCL here, presumably we opened the file
697 * already? XXX - NFS implications? */
698 oit.it_flags &= ~O_EXCL;
/* bug 20584: if "it_flags" contains O_CREAT, the file will be
 * created if necessary, so "IT_CREAT" should be set to stay
 * consistent with it */
703 if (oit.it_flags & O_CREAT)
704 oit.it_op |= IT_CREAT;
710 /* Let's see if we have file open on MDS already. */
711 if (it->it_flags & FMODE_WRITE) {
712 och_p = &lli->lli_mds_write_och;
713 och_usecount = &lli->lli_open_fd_write_count;
714 } else if (it->it_flags & FMODE_EXEC) {
715 och_p = &lli->lli_mds_exec_och;
716 och_usecount = &lli->lli_open_fd_exec_count;
718 och_p = &lli->lli_mds_read_och;
719 och_usecount = &lli->lli_open_fd_read_count;
722 mutex_lock(&lli->lli_och_mutex);
723 if (*och_p) { /* Open handle is present */
724 if (it_disposition(it, DISP_OPEN_OPEN)) {
/* Well, there's an extra open request that we do not need;
 * let's close it somehow. This will decref the request. */
727 rc = it_open_error(DISP_OPEN_OPEN, it);
729 mutex_unlock(&lli->lli_och_mutex);
730 GOTO(out_openerr, rc);
733 ll_release_openhandle(file_dentry(file), it);
737 rc = ll_local_open(file, it, fd, NULL);
740 mutex_unlock(&lli->lli_och_mutex);
741 GOTO(out_openerr, rc);
744 LASSERT(*och_usecount == 0);
745 if (!it->it_disposition) {
746 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
/* We cannot just request a lock handle now; the new ELC code
 * means that one of the other OPEN locks for this file could be
 * cancelled, and since the blocking AST handler would attempt to
 * grab och_mutex as well, that would result in a deadlock */
752 mutex_unlock(&lli->lli_och_mutex);
754 * Normally called under two situations:
756 * 2. A race/condition on MDS resulting in no open
757 * handle to be returned from LOOKUP|OPEN request,
758 * for example if the target entry was a symlink.
760 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
761 * marked by a bit set in ll_iget_for_nfs. Clear the
762 * bit so that it's not confusing later callers.
* NB: when ldd is NULL, it must have come via the normal
765 * lookup path only, since ll_iget_for_nfs always calls
768 if (ldd && ldd->lld_nfs_dentry) {
769 ldd->lld_nfs_dentry = 0;
770 it->it_flags |= MDS_OPEN_LOCK;
774 * Always specify MDS_OPEN_BY_FID because we don't want
775 * to get file with different fid.
777 it->it_flags |= MDS_OPEN_BY_FID;
778 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
781 GOTO(out_openerr, rc);
785 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
787 GOTO(out_och_free, rc = -ENOMEM);
791 /* md_intent_lock() didn't get a request ref if there was an
792 * open error, so don't do cleanup on the request here
/* XXX (green): Shouldn't we bail out on any error here, not
 * just on open error? */
796 rc = it_open_error(DISP_OPEN_OPEN, it);
798 GOTO(out_och_free, rc);
800 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
801 "inode %p: disposition %x, status %d\n", inode,
802 it_disposition(it, ~0), it->it_status);
804 rc = ll_local_open(file, it, fd, *och_p);
806 GOTO(out_och_free, rc);
808 mutex_unlock(&lli->lli_och_mutex);
/* Must do this outside the lli_och_mutex lock to prevent a deadlock where
 * a different kind of OPEN lock for this same inode gets cancelled by
 * ldlm_cancel_lru */
814 if (!S_ISREG(inode->i_mode))
815 GOTO(out_och_free, rc);
817 cl_lov_delay_create_clear(&file->f_flags);
818 GOTO(out_och_free, rc);
822 if (och_p && *och_p) {
823 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
824 *och_p = NULL; /* OBD_FREE writes some magic there */
827 mutex_unlock(&lli->lli_och_mutex);
830 if (lli->lli_opendir_key == fd)
831 ll_deauthorize_statahead(inode, fd);
833 ll_file_data_put(fd);
835 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
839 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
840 ptlrpc_req_finished(it->it_request);
841 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
847 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
848 struct ldlm_lock_desc *desc, void *data, int flag)
851 struct lustre_handle lockh;
855 case LDLM_CB_BLOCKING:
856 ldlm_lock2handle(lock, &lockh);
857 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
859 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
863 case LDLM_CB_CANCELING:
* When setting a lease on a file, we take ownership of the lli_mds_*_och
* and save it as fd->fd_och so as to force the client to reopen the file
* even if it already has an open lock cached.
875 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
876 struct lustre_handle *old_open_handle)
878 struct ll_inode_info *lli = ll_i2info(inode);
879 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
880 struct obd_client_handle **och_p;
885 /* Get the openhandle of the file */
886 mutex_lock(&lli->lli_och_mutex);
887 if (fd->fd_lease_och != NULL)
888 GOTO(out_unlock, rc = -EBUSY);
890 if (fd->fd_och == NULL) {
891 if (file->f_mode & FMODE_WRITE) {
892 LASSERT(lli->lli_mds_write_och != NULL);
893 och_p = &lli->lli_mds_write_och;
894 och_usecount = &lli->lli_open_fd_write_count;
896 LASSERT(lli->lli_mds_read_och != NULL);
897 och_p = &lli->lli_mds_read_och;
898 och_usecount = &lli->lli_open_fd_read_count;
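/* the lease steals the MDS open handle, so it must not be shared
 * with any other open file descriptor */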
901 if (*och_usecount > 1)
902 GOTO(out_unlock, rc = -EBUSY);
909 *old_open_handle = fd->fd_och->och_open_handle;
913 mutex_unlock(&lli->lli_och_mutex);
918 * Release ownership on lli_mds_*_och when putting back a file lease.
920 static int ll_lease_och_release(struct inode *inode, struct file *file)
922 struct ll_inode_info *lli = ll_i2info(inode);
923 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
924 struct obd_client_handle **och_p;
925 struct obd_client_handle *old_och = NULL;
930 mutex_lock(&lli->lli_och_mutex);
931 if (file->f_mode & FMODE_WRITE) {
932 och_p = &lli->lli_mds_write_och;
933 och_usecount = &lli->lli_open_fd_write_count;
935 och_p = &lli->lli_mds_read_och;
936 och_usecount = &lli->lli_open_fd_read_count;
/* The file may have been opened by another process (broken lease), so
 * *och_p is not NULL. In this case we should simply increase the usecount
943 if (*och_p != NULL) {
944 old_och = fd->fd_och;
951 mutex_unlock(&lli->lli_och_mutex);
954 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
960 * Acquire a lease and open the file.
962 static struct obd_client_handle *
963 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
966 struct lookup_intent it = { .it_op = IT_OPEN };
967 struct ll_sb_info *sbi = ll_i2sbi(inode);
968 struct md_op_data *op_data;
969 struct ptlrpc_request *req = NULL;
970 struct lustre_handle old_open_handle = { 0 };
971 struct obd_client_handle *och = NULL;
976 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
977 RETURN(ERR_PTR(-EINVAL));
980 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
981 RETURN(ERR_PTR(-EPERM));
983 rc = ll_lease_och_acquire(inode, file, &old_open_handle);
990 RETURN(ERR_PTR(-ENOMEM));
992 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
993 LUSTRE_OPC_ANY, NULL);
995 GOTO(out, rc = PTR_ERR(op_data));
997 /* To tell the MDT this openhandle is from the same owner */
998 op_data->op_open_handle = old_open_handle;
1000 it.it_flags = fmode | open_flags;
1001 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
1002 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
1003 &ll_md_blocking_lease_ast,
/* LDLM_FL_NO_LRU: do not put the lease lock into the LRU list, otherwise
 * it can be cancelled, which may mislead applications into thinking the
 * lease is broken;
 * LDLM_FL_EXCL: set this flag so that it won't be matched by a normal
 * open in ll_md_blocking_ast(); otherwise, since ll_md_blocking_lease_ast()
 * does not deal with the openhandle, a normal openhandle would be leaked. */
1010 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
1011 ll_finish_md_op_data(op_data);
1012 ptlrpc_req_finished(req);
1014 GOTO(out_release_it, rc);
1016 if (it_disposition(&it, DISP_LOOKUP_NEG))
1017 GOTO(out_release_it, rc = -ENOENT);
1019 rc = it_open_error(DISP_OPEN_OPEN, &it);
1021 GOTO(out_release_it, rc);
1023 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
1024 ll_och_fill(sbi->ll_md_exp, &it, och);
1026 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
1027 GOTO(out_close, rc = -EOPNOTSUPP);
/* lease already acquired, handle the lease lock */
1030 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
1031 if (it.it_lock_mode == 0 ||
1032 it.it_lock_bits != MDS_INODELOCK_OPEN) {
/* an open lock must be returned for a lease */
1034 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
1035 PFID(ll_inode2fid(inode)), it.it_lock_mode,
1037 GOTO(out_close, rc = -EPROTO);
1040 ll_intent_release(&it);
1044 /* Cancel open lock */
1045 if (it.it_lock_mode != 0) {
1046 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
1048 it.it_lock_mode = 0;
1049 och->och_lease_handle.cookie = 0ULL;
1051 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
1053 CERROR("%s: error closing file "DFID": %d\n",
1054 sbi->ll_fsname, PFID(&ll_i2info(inode)->lli_fid), rc2);
1055 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
1057 ll_intent_release(&it);
1061 RETURN(ERR_PTR(rc));
1065 * Check whether a layout swap can be done between two inodes.
1067 * \param[in] inode1 First inode to check
1068 * \param[in] inode2 Second inode to check
1070 * \retval 0 on success, layout swap can be performed between both inodes
1071 * \retval negative error code if requirements are not met
1073 static int ll_check_swap_layouts_validity(struct inode *inode1,
1074 struct inode *inode2)
1076 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
1079 if (inode_permission(inode1, MAY_WRITE) ||
1080 inode_permission(inode2, MAY_WRITE))
1083 if (inode1->i_sb != inode2->i_sb)
1089 static int ll_swap_layouts_close(struct obd_client_handle *och,
1090 struct inode *inode, struct inode *inode2)
1092 const struct lu_fid *fid1 = ll_inode2fid(inode);
1093 const struct lu_fid *fid2;
1097 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
1098 ll_i2sbi(inode)->ll_fsname, PFID(fid1));
1100 rc = ll_check_swap_layouts_validity(inode, inode2);
1102 GOTO(out_free_och, rc);
1104 /* We now know that inode2 is a lustre inode */
1105 fid2 = ll_inode2fid(inode2);
1107 rc = lu_fid_cmp(fid1, fid2);
1109 GOTO(out_free_och, rc = -EINVAL);
1111 /* Close the file and {swap,merge} layouts between inode & inode2.
1112 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
1113 * because we still need it to pack l_remote_handle to MDT. */
1114 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
1117 och = NULL; /* freed in ll_close_inode_openhandle() */
* Release the lease and close the file.
* It will check whether the lease has ever been broken.
1130 static int ll_lease_close_intent(struct obd_client_handle *och,
1131 struct inode *inode,
1132 bool *lease_broken, enum mds_op_bias bias,
1135 struct ldlm_lock *lock;
1136 bool cancelled = true;
1140 lock = ldlm_handle2lock(&och->och_lease_handle);
1142 lock_res_and_lock(lock);
1143 cancelled = ldlm_is_cancel(lock);
1144 unlock_res_and_lock(lock);
1145 LDLM_LOCK_PUT(lock);
1148 CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1149 PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1151 if (lease_broken != NULL)
1152 *lease_broken = cancelled;
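/* lease still intact and no close intent to pack: just drop the lease lock */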
1154 if (!cancelled && !bias)
1155 ldlm_cli_cancel(&och->och_lease_handle, 0);
if (cancelled) { /* no need to execute intent */
1162 rc = ll_close_inode_openhandle(inode, och, bias, data);
1166 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1169 return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
* After a lease is taken, send an MDS_REINT_RESYNC RPC to the MDT
1175 static int ll_lease_file_resync(struct obd_client_handle *och,
1176 struct inode *inode, unsigned long arg)
1178 struct ll_sb_info *sbi = ll_i2sbi(inode);
1179 struct md_op_data *op_data;
1180 struct ll_ioc_lease_id ioc;
1181 __u64 data_version_unused;
1185 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1186 LUSTRE_OPC_ANY, NULL);
1187 if (IS_ERR(op_data))
1188 RETURN(PTR_ERR(op_data));
1190 if (copy_from_user(&ioc, (struct ll_ioc_lease_id __user *)arg,
/* before starting a file resync, it's necessary to clean up the page cache
 * in client memory, otherwise once the layout version is increased,
 * writing back cached data will be denied by the OSTs. */
1197 rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1201 op_data->op_lease_handle = och->och_lease_handle;
1202 op_data->op_mirror_id = ioc.lil_mirror_id;
1203 rc = md_file_resync(sbi->ll_md_exp, op_data);
1209 ll_finish_md_op_data(op_data);
1213 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1215 struct ll_inode_info *lli = ll_i2info(inode);
1216 struct cl_object *obj = lli->lli_clob;
1217 struct cl_attr *attr = vvp_env_thread_attr(env);
1225 ll_inode_size_lock(inode);
/* Merge the timestamps most recently obtained from the MDS with the
 * timestamps obtained from the OSTs.
 *
 * Do not overwrite the atime of the inode, because it may be refreshed
 * by file_accessed(). If the read was served from cached
 * data, there is no RPC to be sent, so the atime may not be
 * transferred to the OSTs at all. The MDT only updates atime at close time
 * if it is at least 'mdd.*.atime_diff' older.
 * All in all, atime in Lustre does not strictly comply with
 * POSIX. Solving this would require sending an RPC to the MDT for each
 * read, which would hurt performance. */
1239 if (inode->i_atime.tv_sec < lli->lli_atime ||
1240 lli->lli_update_atime) {
1241 inode->i_atime.tv_sec = lli->lli_atime;
1242 lli->lli_update_atime = 0;
1244 inode->i_mtime.tv_sec = lli->lli_mtime;
1245 inode->i_ctime.tv_sec = lli->lli_ctime;
1247 mtime = inode->i_mtime.tv_sec;
1248 atime = inode->i_atime.tv_sec;
1249 ctime = inode->i_ctime.tv_sec;
1251 cl_object_attr_lock(obj);
1252 if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1255 rc = cl_object_attr_get(env, obj, attr);
1256 cl_object_attr_unlock(obj);
1259 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1261 if (atime < attr->cat_atime)
1262 atime = attr->cat_atime;
1264 if (ctime < attr->cat_ctime)
1265 ctime = attr->cat_ctime;
1267 if (mtime < attr->cat_mtime)
1268 mtime = attr->cat_mtime;
1270 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1271 PFID(&lli->lli_fid), attr->cat_size);
1273 i_size_write(inode, attr->cat_size);
1274 inode->i_blocks = attr->cat_blocks;
1276 inode->i_mtime.tv_sec = mtime;
1277 inode->i_atime.tv_sec = atime;
1278 inode->i_ctime.tv_sec = ctime;
1281 ll_inode_size_unlock(inode);
* Set the designated mirror for I/O.
*
* So far only read, write, and truncate can issue I/O to a
* designated mirror.
1292 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1294 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1296 /* clear layout version for generic(non-resync) I/O in case it carries
1297 * stale layout version due to I/O restart */
1298 io->ci_layout_version = 0;
1300 /* FLR: disable non-delay for designated mirror I/O because obviously
1301 * only one mirror is available */
1302 if (fd->fd_designated_mirror > 0) {
1304 io->ci_designated_mirror = fd->fd_designated_mirror;
1305 io->ci_layout_version = fd->fd_layout_version;
CDEBUG(D_VFSTRACE, "%s: designated mirror: %d\n",
1309 file->f_path.dentry->d_name.name, io->ci_designated_mirror);
1312 static bool file_is_noatime(const struct file *file)
1314 const struct vfsmount *mnt = file->f_path.mnt;
1315 const struct inode *inode = file_inode((struct file *)file);
1317 /* Adapted from file_accessed() and touch_atime().*/
1318 if (file->f_flags & O_NOATIME)
1321 if (inode->i_flags & S_NOATIME)
1324 if (IS_NOATIME(inode))
1327 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1330 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1333 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1339 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1341 struct inode *inode = file_inode(file);
1342 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1344 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1345 io->ci_lock_no_expand = fd->ll_lock_no_expand;
1347 if (iot == CIT_WRITE) {
1348 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1349 io->u.ci_wr.wr_sync = !!(file->f_flags & O_SYNC ||
1350 file->f_flags & O_DIRECT ||
1353 io->ci_obj = ll_i2info(inode)->lli_clob;
1354 io->ci_lockreq = CILR_MAYBE;
1355 if (ll_file_nolock(file)) {
1356 io->ci_lockreq = CILR_NEVER;
1357 io->ci_no_srvlock = 1;
1358 } else if (file->f_flags & O_APPEND) {
1359 io->ci_lockreq = CILR_MANDATORY;
1361 io->ci_noatime = file_is_noatime(file);
/* FLR: only use non-delay I/O for reads, as there is only one
 * available mirror for writes. */
1365 io->ci_ndelay = !(iot == CIT_WRITE);
1367 ll_io_set_mirror(io, file);
1370 static void ll_heat_add(struct inode *inode, enum cl_io_type iot,
1373 struct ll_inode_info *lli = ll_i2info(inode);
1374 struct ll_sb_info *sbi = ll_i2sbi(inode);
1375 enum obd_heat_type sample_type;
1376 enum obd_heat_type iobyte_type;
1377 __u64 now = ktime_get_real_seconds();
1379 if (!ll_sbi_has_file_heat(sbi) ||
1380 lli->lli_heat_flags & LU_HEAT_FLAG_OFF)
1383 if (iot == CIT_READ) {
1384 sample_type = OBD_HEAT_READSAMPLE;
1385 iobyte_type = OBD_HEAT_READBYTE;
1386 } else if (iot == CIT_WRITE) {
1387 sample_type = OBD_HEAT_WRITESAMPLE;
1388 iobyte_type = OBD_HEAT_WRITEBYTE;
1393 spin_lock(&lli->lli_heat_lock);
1394 obd_heat_add(&lli->lli_heat_instances[sample_type], now, 1,
1395 sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1396 obd_heat_add(&lli->lli_heat_instances[iobyte_type], now, count,
1397 sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1398 spin_unlock(&lli->lli_heat_lock);
1402 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1403 struct file *file, enum cl_io_type iot,
1404 loff_t *ppos, size_t count)
1406 struct vvp_io *vio = vvp_env_io(env);
1407 struct inode *inode = file_inode(file);
1408 struct ll_inode_info *lli = ll_i2info(inode);
1409 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1410 struct range_lock range;
1414 unsigned retried = 0;
1415 bool restarted = false;
1419 CDEBUG(D_VFSTRACE, "%s: %s ppos: %llu, count: %zu\n",
1420 file_dentry(file)->d_name.name,
1421 iot == CIT_READ ? "read" : "write", *ppos, count);
1424 io = vvp_env_thread_io(env);
1425 ll_io_init(io, file, iot);
1426 io->ci_ndelay_tried = retried;
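/* carry the FLR non-delay retry count over from any previous restart */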
1428 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1429 bool range_locked = false;
1431 if (file->f_flags & O_APPEND)
range_lock_init(&range, 0, LUSTRE_EOF);
else
range_lock_init(&range, *ppos, *ppos + count - 1);
1436 vio->vui_fd = LUSTRE_FPRIVATE(file);
1437 vio->vui_io_subtype = args->via_io_subtype;
1439 switch (vio->vui_io_subtype) {
1441 vio->vui_iter = args->u.normal.via_iter;
1442 vio->vui_iocb = args->u.normal.via_iocb;
/* Direct IO reads must also take the range lock, or multiple reads
 * will try to work on the same pages.
 * See LU-6227 for details. */
1446 if (((iot == CIT_WRITE) ||
1447 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1448 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1449 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1451 rc = range_lock(&lli->lli_write_tree, &range);
1455 range_locked = true;
1459 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1460 vio->u.splice.vui_flags = args->u.splice.via_flags;
1463 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1467 ll_cl_add(file, env, io, LCC_RW);
1468 rc = cl_io_loop(env, io);
1469 ll_cl_remove(file, env);
1472 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1474 range_unlock(&lli->lli_write_tree, &range);
1477 /* cl_io_rw_init() handled IO */
1481 if (io->ci_nob > 0) {
1482 result += io->ci_nob;
1483 count -= io->ci_nob;
1484 *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1486 /* prepare IO restart */
1487 if (count > 0 && args->via_io_subtype == IO_NORMAL)
1488 args->u.normal.via_iter = vio->vui_iter;
1491 cl_io_fini(env, io);
1494 "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1495 file->f_path.dentry->d_name.name,
1496 iot, rc, result, io->ci_need_restart);
1498 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1500 "%s: restart %s from %lld, count: %zu, ret: %zd, rc: %d\n",
1501 file_dentry(file)->d_name.name,
1502 iot == CIT_READ ? "read" : "write",
1503 *ppos, count, result, rc);
1504 /* preserve the tried count for FLR */
1505 retried = io->ci_ndelay_tried;
1510 if (iot == CIT_READ) {
1512 ll_stats_ops_tally(ll_i2sbi(inode),
1513 LPROC_LL_READ_BYTES, result);
1514 } else if (iot == CIT_WRITE) {
1516 ll_stats_ops_tally(ll_i2sbi(inode),
1517 LPROC_LL_WRITE_BYTES, result);
1518 fd->fd_write_failed = false;
1519 } else if (result == 0 && rc == 0) {
1522 fd->fd_write_failed = true;
1524 fd->fd_write_failed = false;
1525 } else if (rc != -ERESTARTSYS) {
1526 fd->fd_write_failed = true;
1530 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1532 ll_heat_add(inode, iot, result);
1534 RETURN(result > 0 ? result : rc);
1538 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1539 * especially for small I/O.
1541 * To serve a read request, CLIO has to create and initialize a cl_io and
* then request a DLM lock. This has turned out to have significant overhead
1543 * and affects the performance of small I/O dramatically.
1545 * It's not necessary to create a cl_io for each I/O. Under the help of read
1546 * ahead, most of the pages being read are already in memory cache and we can
1547 * read those pages directly because if the pages exist, the corresponding DLM
1548 * lock must exist so that page content must be valid.
* In the fast read implementation, llite speculatively finds and reads pages
1551 * in memory cache. There are three scenarios for fast read:
1552 * - If the page exists and is uptodate, kernel VM will provide the data and
1553 * CLIO won't be intervened;
1554 * - If the page was brought into memory by read ahead, it will be exported
1555 * and read ahead parameters will be updated;
1556 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1557 * it will go back and invoke normal read, i.e., a cl_io will be created
1558 * and DLM lock will be requested.
1560 * POSIX compliance: posix standard states that read is intended to be atomic.
1561 * Lustre read implementation is in line with Linux kernel read implementation
1562 * and neither of them complies with POSIX standard in this matter. Fast read
1563 * doesn't make the situation worse on single node but it may interleave write
1564 * results from multiple nodes due to short read handling in ll_file_aio_read().
1566 * \param env - lu_env
1567 * \param iocb - kiocb from kernel
1568 * \param iter - user space buffers where the data will be copied
* \retval - number of bytes read, or error code if an error occurred.
1573 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1577 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1580 /* NB: we can't do direct IO for fast read because it will need a lock
1581 * to make IO engine happy. */
1582 if (iocb->ki_filp->f_flags & O_DIRECT)
1585 result = generic_file_read_iter(iocb, iter);
/* If the first page is not in cache, generic_file_read_iter() will
 * return -ENODATA.
 * See the corresponding code in ll_readpage(). */
1590 if (result == -ENODATA)
1594 ll_heat_add(file_inode(iocb->ki_filp), CIT_READ, result);
1595 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1596 LPROC_LL_READ_BYTES, result);
1603 * Read from a file (through the page cache).
1605 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1608 struct vvp_io_args *args;
1613 result = ll_do_fast_read(iocb, to);
1614 if (result < 0 || iov_iter_count(to) == 0)
1617 env = cl_env_get(&refcheck);
1619 return PTR_ERR(env);
1621 args = ll_env_args(env, IO_NORMAL);
1622 args->u.normal.via_iter = to;
1623 args->u.normal.via_iocb = iocb;
1625 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1626 &iocb->ki_pos, iov_iter_count(to));
1629 else if (result == 0)
1632 cl_env_put(env, &refcheck);
1638 * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1639 * If a page is already in the page cache and dirty (and some other things -
1640 * See ll_tiny_write_begin for the instantiation of these rules), then we can
1641 * write to it without doing a full I/O, because Lustre already knows about it
1642 * and will write it out. This saves a lot of processing time.
1644 * All writes here are within one page, so exclusion is handled by the page
1645 * lock on the vm page. We do not do tiny writes for writes which touch
* multiple pages, because it's very unlikely that multiple sequential pages
* are already dirty.
1649 * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1650 * and are unlikely to be to already dirty pages.
1652 * Attribute updates are important here, we do them in ll_tiny_write_end.
1654 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1656 ssize_t count = iov_iter_count(iter);
1657 struct file *file = iocb->ki_filp;
1658 struct inode *inode = file_inode(file);
1659 bool lock_inode = !IS_NOSEC(inode);
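/* lock the inode for __generic_file_write_iter() unless IS_NOSEC() indicates
 * there are no security attributes (e.g. SUID) to strip during the write */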
1664 /* Restrict writes to single page and < PAGE_SIZE. See comment at top
1665 * of function for why.
1667 if (count >= PAGE_SIZE ||
1668 (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
if (unlikely(lock_inode))
	inode_lock(inode);
1673 result = __generic_file_write_iter(iocb, iter);
1675 if (unlikely(lock_inode))
1676 inode_unlock(inode);
1678 /* If the page is not already dirty, ll_tiny_write_begin returns
1679 * -ENODATA. We continue on to normal write.
1681 if (result == -ENODATA)
1685 ll_heat_add(inode, CIT_WRITE, result);
1686 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1688 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1691 CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1697 * Write to a file (through the page cache).
1699 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1701 struct vvp_io_args *args;
1703 ssize_t rc_tiny = 0, rc_normal;
1708 /* NB: we can't do direct IO for tiny writes because they use the page
1709 * cache, we can't do sync writes because tiny writes can't flush
1710 * pages, and we can't do append writes because we can't guarantee the
1711 * required DLM locks are held to protect file size.
1713 if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(iocb->ki_filp))) &&
1714 !(iocb->ki_filp->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1715 rc_tiny = ll_do_tiny_write(iocb, from);
/* In case of error, go on and try a normal write; only stop if the
 * tiny write completed the I/O. */
1720 if (iov_iter_count(from) == 0)
1721 GOTO(out, rc_normal = rc_tiny);
1723 env = cl_env_get(&refcheck);
1725 return PTR_ERR(env);
1727 args = ll_env_args(env, IO_NORMAL);
1728 args->u.normal.via_iter = from;
1729 args->u.normal.via_iocb = iocb;
1731 rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1732 &iocb->ki_pos, iov_iter_count(from));
1734 /* On success, combine bytes written. */
1735 if (rc_tiny >= 0 && rc_normal > 0)
1736 rc_normal += rc_tiny;
1737 /* On error, only return error from normal write if tiny write did not
1738 * write any bytes. Otherwise return bytes written by tiny write.
1740 else if (rc_tiny > 0)
1741 rc_normal = rc_tiny;
1743 cl_env_put(env, &refcheck);
1748 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1750 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1752 static int ll_file_get_iov_count(const struct iovec *iov,
1753 unsigned long *nr_segs, size_t *count)
1758 for (seg = 0; seg < *nr_segs; seg++) {
1759 const struct iovec *iv = &iov[seg];
1762 * If any segment has a negative length, or the cumulative
1763 * length ever wraps negative then return -EINVAL.
1766 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1768 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1773 cnt -= iv->iov_len; /* This segment is no good */
1780 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1781 unsigned long nr_segs, loff_t pos)
1788 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1792 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1793 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1794 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1795 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1796 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1798 result = ll_file_read_iter(iocb, &to);
1803 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1806 struct iovec iov = { .iov_base = buf, .iov_len = count };
1811 init_sync_kiocb(&kiocb, file);
1812 kiocb.ki_pos = *ppos;
1813 #ifdef HAVE_KIOCB_KI_LEFT
1814 kiocb.ki_left = count;
1815 #elif defined(HAVE_KI_NBYTES)
kiocb.ki_nbytes = count;
1819 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1820 *ppos = kiocb.ki_pos;
1826 * Write to a file (through the page cache).
1829 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1830 unsigned long nr_segs, loff_t pos)
1832 struct iov_iter from;
1837 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1841 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1842 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1843 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1844 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1845 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1847 result = ll_file_write_iter(iocb, &from);
1852 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1853 size_t count, loff_t *ppos)
1855 struct iovec iov = { .iov_base = (void __user *)buf,
1862 init_sync_kiocb(&kiocb, file);
1863 kiocb.ki_pos = *ppos;
1864 #ifdef HAVE_KIOCB_KI_LEFT
1865 kiocb.ki_left = count;
1866 #elif defined(HAVE_KI_NBYTES)
1867 kiocb.ki_nbytes = count;
1870 result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1871 *ppos = kiocb.ki_pos;
1875 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1878 * Send file content (through pagecache) somewhere with helper
1880 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1881 struct pipe_inode_info *pipe, size_t count,
1885 struct vvp_io_args *args;
1890 env = cl_env_get(&refcheck);
1892 RETURN(PTR_ERR(env));
1894 args = ll_env_args(env, IO_SPLICE);
1895 args->u.splice.via_pipe = pipe;
1896 args->u.splice.via_flags = flags;
1898 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1899 cl_env_put(env, &refcheck);
1903 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1904 __u64 flags, struct lov_user_md *lum, int lum_size)
1906 struct lookup_intent oit = {
1908 .it_flags = flags | MDS_OPEN_BY_FID,
1913 ll_inode_size_lock(inode);
1914 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1916 GOTO(out_unlock, rc);
1918 ll_release_openhandle(dentry, &oit);
1921 ll_inode_size_unlock(inode);
1922 ll_intent_release(&oit);
1927 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1928 struct lov_mds_md **lmmp, int *lmm_size,
1929 struct ptlrpc_request **request)
1931 struct ll_sb_info *sbi = ll_i2sbi(inode);
1932 struct mdt_body *body;
1933 struct lov_mds_md *lmm = NULL;
1934 struct ptlrpc_request *req = NULL;
1935 struct md_op_data *op_data;
1938 rc = ll_get_default_mdsize(sbi, &lmmsize);
1942 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1943 strlen(filename), lmmsize,
1944 LUSTRE_OPC_ANY, NULL);
1945 if (IS_ERR(op_data))
1946 RETURN(PTR_ERR(op_data));
1948 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1949 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1950 ll_finish_md_op_data(op_data);
CDEBUG(D_INFO, "md_getattr_name failed on %s: rc %d\n",
       filename, rc);
1957 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1958 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1960 lmmsize = body->mbo_eadatasize;
1962 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1964 GOTO(out, rc = -ENODATA);
1967 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1968 LASSERT(lmm != NULL);
1970 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
1971 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
1972 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
1973 GOTO(out, rc = -EPROTO);
1976 * This is coming from the MDS, so is probably in
1977 * little endian. We convert it to host endian before
1978 * passing it to userspace.
1980 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1983 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
1984 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1985 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1986 if (le32_to_cpu(lmm->lmm_pattern) &
1987 LOV_PATTERN_F_RELEASED)
/* if the function is called for a directory, we should
 * avoid swabbing nonexistent lsm objects */
1993 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1994 lustre_swab_lov_user_md_v1(
1995 (struct lov_user_md_v1 *)lmm);
1996 if (S_ISREG(body->mbo_mode))
1997 lustre_swab_lov_user_md_objects(
1998 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
2000 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2001 lustre_swab_lov_user_md_v3(
2002 (struct lov_user_md_v3 *)lmm);
2003 if (S_ISREG(body->mbo_mode))
2004 lustre_swab_lov_user_md_objects(
2005 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
2007 } else if (lmm->lmm_magic ==
2008 cpu_to_le32(LOV_MAGIC_COMP_V1)) {
2009 lustre_swab_lov_comp_md_v1(
2010 (struct lov_comp_md_v1 *)lmm);
2016 *lmm_size = lmmsize;
2021 static int ll_lov_setea(struct inode *inode, struct file *file,
2024 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2025 struct lov_user_md *lump;
2026 int lum_size = sizeof(struct lov_user_md) +
2027 sizeof(struct lov_user_ost_data);
2031 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2034 OBD_ALLOC_LARGE(lump, lum_size);
2038 if (copy_from_user(lump, arg, lum_size))
2039 GOTO(out_lump, rc = -EFAULT);
2041 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
2043 cl_lov_delay_create_clear(&file->f_flags);
2046 OBD_FREE_LARGE(lump, lum_size);
2050 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
2057 env = cl_env_get(&refcheck);
2059 RETURN(PTR_ERR(env));
2061 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2062 cl_env_put(env, &refcheck);
2066 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2069 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2070 struct lov_user_md *klum;
2072 __u64 flags = FMODE_WRITE;
2075 rc = ll_copy_user_md(lum, &klum);
2080 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
2085 rc = put_user(0, &lum->lmm_stripe_count);
2089 rc = ll_layout_refresh(inode, &gen);
2093 rc = ll_file_getstripe(inode, arg, lum_size);
2095 cl_lov_delay_create_clear(&file->f_flags);
2098 OBD_FREE(klum, lum_size);
2103 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2105 struct ll_inode_info *lli = ll_i2info(inode);
2106 struct cl_object *obj = lli->lli_clob;
2107 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2108 struct ll_grouplock grouplock;
2113 CWARN("group id for group lock must not be 0\n");
2117 if (ll_file_nolock(file))
2118 RETURN(-EOPNOTSUPP);
2120 spin_lock(&lli->lli_lock);
2121 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2122 CWARN("group lock already existed with gid %lu\n",
2123 fd->fd_grouplock.lg_gid);
2124 spin_unlock(&lli->lli_lock);
2127 LASSERT(fd->fd_grouplock.lg_lock == NULL);
2128 spin_unlock(&lli->lli_lock);
* XXX: a group lock needs to protect all OST objects, while PFL
* can add new OST objects during the IO, so we instantiate
* all OST objects before taking the group lock.
2138 struct cl_layout cl = {
2139 .cl_is_composite = false,
2141 struct lu_extent ext = {
2143 .e_end = OBD_OBJECT_EOF,
2146 env = cl_env_get(&refcheck);
2148 RETURN(PTR_ERR(env));
2150 rc = cl_object_layout_get(env, obj, &cl);
2151 if (!rc && cl.cl_is_composite)
2152 rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2155 cl_env_put(env, &refcheck);
2160 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2161 arg, (file->f_flags & O_NONBLOCK), &grouplock);
2165 spin_lock(&lli->lli_lock);
2166 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2167 spin_unlock(&lli->lli_lock);
2168 CERROR("another thread just won the race\n");
2169 cl_put_grouplock(&grouplock);
2173 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2174 fd->fd_grouplock = grouplock;
2175 spin_unlock(&lli->lli_lock);
2177 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
2181 static int ll_put_grouplock(struct inode *inode, struct file *file,
2184 struct ll_inode_info *lli = ll_i2info(inode);
2185 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2186 struct ll_grouplock grouplock;
2189 spin_lock(&lli->lli_lock);
2190 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2191 spin_unlock(&lli->lli_lock);
2192 CWARN("no group lock held\n");
2196 LASSERT(fd->fd_grouplock.lg_lock != NULL);
2198 if (fd->fd_grouplock.lg_gid != arg) {
2199 CWARN("group lock %lu doesn't match current id %lu\n",
2200 arg, fd->fd_grouplock.lg_gid);
2201 spin_unlock(&lli->lli_lock);
2205 grouplock = fd->fd_grouplock;
2206 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2207 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2208 spin_unlock(&lli->lli_lock);
2210 cl_put_grouplock(&grouplock);
2211 CDEBUG(D_INFO, "group lock %lu released\n", arg);
2216 * Close inode open handle
2218 * \param dentry [in] dentry which contains the inode
2219 * \param it [in,out] intent which contains open info and result
2222 * \retval <0 failure
2224 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2226 struct inode *inode = dentry->d_inode;
2227 struct obd_client_handle *och;
2233 /* Root ? Do nothing. */
2234 if (dentry->d_inode->i_sb->s_root == dentry)
2237 /* No open handle to close? Move away */
2238 if (!it_disposition(it, DISP_OPEN_OPEN))
2241 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2243 OBD_ALLOC(och, sizeof(*och));
2245 GOTO(out, rc = -ENOMEM);
2247 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2249 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2251 /* this one is in place of ll_file_open */
2252 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2253 ptlrpc_req_finished(it->it_request);
2254 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2260 * Get the size of the inode for which the FIEMAP mapping is requested.
2261 * Make the FIEMAP get_info call and return the result.
2262 * \param fiemap kernel buffer to hold the extents
2263 * \param num_bytes kernel buffer size
2265 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2271 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2274 /* Checks for fiemap flags */
2275 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2276 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2280 /* Check for FIEMAP_FLAG_SYNC */
2281 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2282 rc = filemap_fdatawrite(inode->i_mapping);
2287 env = cl_env_get(&refcheck);
2289 RETURN(PTR_ERR(env));
2291 if (i_size_read(inode) == 0) {
2292 rc = ll_glimpse_size(inode);
2297 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2298 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2299 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2301 /* If the file size is 0, then there are no objects to map */
2302 if (fmkey.lfik_oa.o_size == 0) {
2303 fiemap->fm_mapped_extents = 0;
2307 fmkey.lfik_fiemap = *fiemap;
2309 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2310 &fmkey, fiemap, &num_bytes);
2312 cl_env_put(env, &refcheck);
2316 int ll_fid2path(struct inode *inode, void __user *arg)
2318 struct obd_export *exp = ll_i2mdexp(inode);
2319 const struct getinfo_fid2path __user *gfin = arg;
2321 struct getinfo_fid2path *gfout;
2327 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2328 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2331 /* Only need to get the buflen */
2332 if (get_user(pathlen, &gfin->gf_pathlen))
2335 if (pathlen > PATH_MAX)
2338 outsize = sizeof(*gfout) + pathlen;
2339 OBD_ALLOC(gfout, outsize);
2343 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2344 GOTO(gf_free, rc = -EFAULT);
2345 /* Append the root FID after gfout to let the MDT know the root FID so
2346 * that it can look up the correct path; this is mainly for filesets.
2347 * An old server without fileset mount support will ignore it. */
2348 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2350 /* Call mdc_iocontrol */
2351 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2355 if (copy_to_user(arg, gfout, outsize))
2359 OBD_FREE(gfout, outsize);
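/*
 * Usage sketch (illustration only): OBD_IOC_FID2PATH is driven from user
 * space with a struct getinfo_fid2path followed by a path buffer of
 * gf_pathlen bytes; gf_fid and gf_u.gf_path are assumptions based on the
 * copy_from_user()/copy_to_user() handling above.
 *
 *	struct getinfo_fid2path *gf;
 *	size_t plen = PATH_MAX;
 *
 *	gf = calloc(1, sizeof(*gf) + plen);
 *	gf->gf_fid = fid;		// e.g. obtained via LL_IOC_PATH2FID
 *	gf->gf_pathlen = plen;
 *	if (ioctl(fd, OBD_IOC_FID2PATH, gf) == 0)
 *		printf("path: %s\n", gf->gf_u.gf_path);
 *	free(gf);
 */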
2364 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2366 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2374 ioc->idv_version = 0;
2375 ioc->idv_layout_version = UINT_MAX;
2377 /* If no file object has been initialized, consider its version to be 0. */
2381 env = cl_env_get(&refcheck);
2383 RETURN(PTR_ERR(env));
2385 io = vvp_env_thread_io(env);
2387 io->u.ci_data_version.dv_data_version = 0;
2388 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2389 io->u.ci_data_version.dv_flags = ioc->idv_flags;
2392 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2393 result = cl_io_loop(env, io);
2395 result = io->ci_result;
2397 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2398 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2400 cl_io_fini(env, io);
2402 if (unlikely(io->ci_need_restart))
2405 cl_env_put(env, &refcheck);
2411 * Read the data_version for the inode.
2413 * This value is computed using the stripe object versions on the OSTs.
2414 * The version is computed using server-side locking.
2416 * @param flags whether to sync on the OST side:
2418 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2419 * LL_DV_WR_FLUSH: drop all cached pages, LCK_PW on OSTs
2421 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2423 struct ioc_data_version ioc = { .idv_flags = flags };
2426 rc = ll_ioc_data_version(inode, &ioc);
2428 *data_version = ioc.idv_version;
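/*
 * Usage sketch (illustration only): LL_IOC_DATA_VERSION, handled in
 * ll_file_ioctl() below, fills struct ioc_data_version for an open file.
 * Setting LL_DV_WR_FLUSH asks the OSTs to flush and drop cached pages first
 * so that the returned version is stable, e.g. for HSM archiving.
 *
 *	struct ioc_data_version idv = { .idv_flags = LL_DV_RD_FLUSH };
 *
 *	if (ioctl(fd, LL_IOC_DATA_VERSION, &idv) == 0)
 *		printf("data version %llu\n",
 *		       (unsigned long long)idv.idv_version);
 */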
2434 * Trigger a HSM release request for the provided inode.
2436 int ll_hsm_release(struct inode *inode)
2439 struct obd_client_handle *och = NULL;
2440 __u64 data_version = 0;
2445 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2446 ll_i2sbi(inode)->ll_fsname,
2447 PFID(&ll_i2info(inode)->lli_fid));
2449 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2451 GOTO(out, rc = PTR_ERR(och));
2453 /* Grab latest data_version and [am]time values */
2454 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2458 env = cl_env_get(&refcheck);
2460 GOTO(out, rc = PTR_ERR(env));
2462 rc = ll_merge_attr(env, inode);
2463 cl_env_put(env, &refcheck);
2465 /* If an error happens, we have the wrong size for the file.
2471 /* Release the file.
2472 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2473 * we still need it to pack l_remote_handle to MDT. */
2474 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2480 if (och != NULL && !IS_ERR(och)) /* close the file */
2481 ll_lease_close(och, inode, NULL);
2486 struct ll_swap_stack {
2489 struct inode *inode1;
2490 struct inode *inode2;
2495 static int ll_swap_layouts(struct file *file1, struct file *file2,
2496 struct lustre_swap_layouts *lsl)
2498 struct mdc_swap_layouts msl;
2499 struct md_op_data *op_data;
2502 struct ll_swap_stack *llss = NULL;
2505 OBD_ALLOC_PTR(llss);
2509 llss->inode1 = file_inode(file1);
2510 llss->inode2 = file_inode(file2);
2512 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2516 /* we use two bools because they are easier to swap than two bits */
2517 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2518 llss->check_dv1 = true;
2520 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2521 llss->check_dv2 = true;
2523 /* we cannot use lsl->sl_dvX directly because we may swap them */
2524 llss->dv1 = lsl->sl_dv1;
2525 llss->dv2 = lsl->sl_dv2;
2527 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2528 if (rc == 0) /* same file, done! */
2531 if (rc < 0) { /* sequentialize it */
2532 swap(llss->inode1, llss->inode2);
2534 swap(llss->dv1, llss->dv2);
2535 swap(llss->check_dv1, llss->check_dv2);
2539 if (gid != 0) { /* application asks to flush dirty cache */
2540 rc = ll_get_grouplock(llss->inode1, file1, gid);
2544 rc = ll_get_grouplock(llss->inode2, file2, gid);
2546 ll_put_grouplock(llss->inode1, file1, gid);
2551 /* ultimate check: before swapping the layouts we check whether the
2552 * data version has changed (if requested) */
2553 if (llss->check_dv1) {
2554 rc = ll_data_version(llss->inode1, &dv, 0);
2557 if (dv != llss->dv1)
2558 GOTO(putgl, rc = -EAGAIN);
2561 if (llss->check_dv2) {
2562 rc = ll_data_version(llss->inode2, &dv, 0);
2565 if (dv != llss->dv2)
2566 GOTO(putgl, rc = -EAGAIN);
2569 /* struct md_op_data is used to send the swap args to the MDT;
2570 * only the flags are missing, so we pass struct mdc_swap_layouts
2571 * through md_op_data->op_data */
2572 /* flags from user space have to be converted before they are sent to
2573 * the server; no flag is sent today, they are only used on the client */
2576 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2577 0, LUSTRE_OPC_ANY, &msl);
2578 if (IS_ERR(op_data))
2579 GOTO(free, rc = PTR_ERR(op_data));
2581 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2582 sizeof(*op_data), op_data, NULL);
2583 ll_finish_md_op_data(op_data);
2590 ll_put_grouplock(llss->inode2, file2, gid);
2591 ll_put_grouplock(llss->inode1, file1, gid);
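/*
 * Usage sketch (illustration only): LL_IOC_LOV_SWAP_LAYOUTS, dispatched from
 * ll_file_ioctl() below, atomically swaps the layouts of two open files.
 * Only the fields visible in the handling above are used here; any field
 * carrying the group-lock id would be an additional assumption.
 *
 *	struct lustre_swap_layouts lsl = {
 *		.sl_fd    = fd2,			// second file descriptor
 *		.sl_flags = SWAP_LAYOUTS_CHECK_DV1,	// fail if dv1 changed
 *		.sl_dv1   = dv1,			// from LL_IOC_DATA_VERSION
 *	};
 *
 *	rc = ioctl(fd1, LL_IOC_LOV_SWAP_LAYOUTS, &lsl);
 */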
2601 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2603 struct obd_export *exp = ll_i2mdexp(inode);
2604 struct md_op_data *op_data;
2608 /* Detect out-of-range masks */
2609 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2612 /* Non-root users are forbidden to set or clear flags which are
2613 * NOT defined in HSM_USER_MASK. */
2614 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2615 !cfs_capable(CFS_CAP_SYS_ADMIN))
2618 if (!exp_connect_archive_id_array(exp)) {
2619 /* Detect out-of-range archive id */
2620 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2621 (hss->hss_archive_id > LL_HSM_ORIGIN_MAX_ARCHIVE))
2625 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2626 LUSTRE_OPC_ANY, hss);
2627 if (IS_ERR(op_data))
2628 RETURN(PTR_ERR(op_data));
2630 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, exp, sizeof(*op_data),
2633 ll_finish_md_op_data(op_data);
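/*
 * Usage sketch (illustration only): user space changes the HSM flags through
 * LL_IOC_HSM_STATE_SET in ll_file_ioctl() below, which lands here.  For
 * example, marking a file dirty so a copytool re-archives it (HS_DIRTY is
 * assumed to be one of the HSM_USER_MASK flags):
 *
 *	struct hsm_state_set hss = {
 *		.hss_valid   = HSS_SETMASK,
 *		.hss_setmask = HS_DIRTY,
 *	};
 *
 *	rc = ioctl(fd, LL_IOC_HSM_STATE_SET, &hss);
 */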
2638 static int ll_hsm_import(struct inode *inode, struct file *file,
2639 struct hsm_user_import *hui)
2641 struct hsm_state_set *hss = NULL;
2642 struct iattr *attr = NULL;
2646 if (!S_ISREG(inode->i_mode))
2652 GOTO(out, rc = -ENOMEM);
2654 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2655 hss->hss_archive_id = hui->hui_archive_id;
2656 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2657 rc = ll_hsm_state_set(inode, hss);
2661 OBD_ALLOC_PTR(attr);
2663 GOTO(out, rc = -ENOMEM);
2665 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2666 attr->ia_mode |= S_IFREG;
2667 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2668 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2669 attr->ia_size = hui->hui_size;
2670 attr->ia_mtime.tv_sec = hui->hui_mtime;
2671 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2672 attr->ia_atime.tv_sec = hui->hui_atime;
2673 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2675 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2676 ATTR_UID | ATTR_GID |
2677 ATTR_MTIME | ATTR_MTIME_SET |
2678 ATTR_ATIME | ATTR_ATIME_SET;
2682 rc = ll_setattr_raw(file_dentry(file), attr, 0, true);
2686 inode_unlock(inode);
2698 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2700 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2701 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
2704 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2706 struct inode *inode = file_inode(file);
2708 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2709 ATTR_MTIME | ATTR_MTIME_SET |
2712 .tv_sec = lfu->lfu_atime_sec,
2713 .tv_nsec = lfu->lfu_atime_nsec,
2716 .tv_sec = lfu->lfu_mtime_sec,
2717 .tv_nsec = lfu->lfu_mtime_nsec,
2720 .tv_sec = lfu->lfu_ctime_sec,
2721 .tv_nsec = lfu->lfu_ctime_nsec,
2727 if (!capable(CAP_SYS_ADMIN))
2730 if (!S_ISREG(inode->i_mode))
2734 rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET,
2736 inode_unlock(inode);
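/*
 * Usage sketch (illustration only): LL_IOC_FUTIMES_3 in ll_file_ioctl()
 * below lets a privileged tool (CAP_SYS_ADMIN, checked above) restore all
 * three timestamps, including ctime, e.g. when replaying a backup:
 *
 *	struct ll_futimes_3 lfu = {
 *		.lfu_atime_sec = st.st_atime,
 *		.lfu_mtime_sec = st.st_mtime,
 *		.lfu_ctime_sec = st.st_ctime,
 *	};
 *
 *	rc = ioctl(fd, LL_IOC_FUTIMES_3, &lfu);
 */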
2741 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2744 case MODE_READ_USER:
2746 case MODE_WRITE_USER:
2753 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2755 /* Used to allow the upper layers of the client to request an LDLM lock
2756 * without doing an actual read or write.
2758 * Used for ladvise lockahead to manually request specific locks.
2760 * \param[in] file file this ladvise lock request is on
2761 * \param[in] ladvise ladvise struct describing this lock request
2763 * \retval 0 success, no detailed result available (sync requests
2764 * and requests sent to the server [not handled locally]
2765 * cannot return detailed results)
2766 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2767 * see definitions for details.
2768 * \retval negative negative errno on error
2770 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2772 struct lu_env *env = NULL;
2773 struct cl_io *io = NULL;
2774 struct cl_lock *lock = NULL;
2775 struct cl_lock_descr *descr = NULL;
2776 struct dentry *dentry = file->f_path.dentry;
2777 struct inode *inode = dentry->d_inode;
2778 enum cl_lock_mode cl_mode;
2779 off_t start = ladvise->lla_start;
2780 off_t end = ladvise->lla_end;
2786 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2787 "start=%llu, end=%llu\n", dentry->d_name.len,
2788 dentry->d_name.name, dentry->d_inode,
2789 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2792 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2794 GOTO(out, result = cl_mode);
2796 /* Get IO environment */
2797 result = cl_io_get(inode, &env, &io, &refcheck);
2801 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2804 * nothing to do for this io. This currently happens when
2805 * stripe sub-objects are not yet created.
2807 result = io->ci_result;
2808 } else if (result == 0) {
2809 lock = vvp_env_lock(env);
2810 descr = &lock->cll_descr;
2812 descr->cld_obj = io->ci_obj;
2813 /* Convert byte offsets to pages */
2814 descr->cld_start = cl_index(io->ci_obj, start);
2815 descr->cld_end = cl_index(io->ci_obj, end);
2816 descr->cld_mode = cl_mode;
2817 /* CEF_MUST is used because we do not want to convert a
2818 * lockahead request to a lockless lock */
2819 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2822 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2823 descr->cld_enq_flags |= CEF_SPECULATIVE;
2825 result = cl_lock_request(env, io, lock);
2827 /* On success, we need to release the lock */
2829 cl_lock_release(env, lock);
2831 cl_io_fini(env, io);
2832 cl_env_put(env, &refcheck);
2834 /* -ECANCELED indicates a matching lock with a different extent
2835 * was already present, and -EEXIST indicates a matching lock
2836 * on exactly the same extent was already present.
2837 * We convert them to positive values for userspace to make
2838 * recognizing true errors easier.
2839 * Note we can only return these detailed results on async requests,
2840 * as sync requests look the same as i/o requests for locking. */
2841 if (result == -ECANCELED)
2842 result = LLA_RESULT_DIFFERENT;
2843 else if (result == -EEXIST)
2844 result = LLA_RESULT_SAME;
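/*
 * Usage sketch (illustration only): a lockahead request is one
 * llapi_lu_ladvise entry submitted through LL_IOC_LADVISE like any other
 * advice (see the header sketch after ll_ladvise() below).  With LF_ASYNC
 * set, the LLA_RESULT_{SAME,DIFFERENT} codes above are copied back into
 * lla_lockahead_result.
 *
 *	struct llapi_lu_ladvise la = {
 *		.lla_advice          = LU_LADVISE_LOCKAHEAD,
 *		.lla_lockahead_mode  = MODE_WRITE_USER,
 *		.lla_peradvice_flags = LF_ASYNC,
 *		.lla_start           = 0,
 *		.lla_end             = 1 << 20,		// first 1 MiB
 *	};
 */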
2849 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
2851 static int ll_ladvise_sanity(struct inode *inode,
2852 struct llapi_lu_ladvise *ladvise)
2854 struct ll_sb_info *sbi = ll_i2sbi(inode);
2855 enum lu_ladvise_type advice = ladvise->lla_advice;
2856 /* Note the per-advice flags are a 32-bit field, so per-advice flags must
2857 * be in the first 32 bits of enum ladvise_flags */
2858 __u32 flags = ladvise->lla_peradvice_flags;
2859 /* 3 lines at 80 characters per line, should be plenty */
2862 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2864 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
2865 "last supported advice is %s (value '%d'): rc = %d\n",
2866 sbi->ll_fsname, advice,
2867 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2871 /* Per-advice checks */
2873 case LU_LADVISE_LOCKNOEXPAND:
2874 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2876 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2877 "rc = %d\n", sbi->ll_fsname, flags,
2878 ladvise_names[advice], rc);
2882 case LU_LADVISE_LOCKAHEAD:
2883 /* Currently only READ and WRITE modes can be requested */
2884 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2885 ladvise->lla_lockahead_mode == 0) {
2887 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2888 "rc = %d\n", sbi->ll_fsname,
2889 ladvise->lla_lockahead_mode,
2890 ladvise_names[advice], rc);
2893 case LU_LADVISE_WILLREAD:
2894 case LU_LADVISE_DONTNEED:
2896 /* Note the fall through above - these checks apply to all advice types
2897 * except LOCKNOEXPAND */
2898 if (flags & ~LF_DEFAULT_MASK) {
2900 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2901 "rc = %d\n", sbi->ll_fsname, flags,
2902 ladvise_names[advice], rc);
2905 if (ladvise->lla_start >= ladvise->lla_end) {
2907 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2908 "for %s: rc = %d\n", sbi->ll_fsname,
2909 ladvise->lla_start, ladvise->lla_end,
2910 ladvise_names[advice], rc);
2922 * Give file access advice
2924 * The ladvise interface is similar to the Linux fadvise() system call,
2925 * except it forwards the advice directly from the Lustre client to the
2926 * server. The server-side code will apply appropriate read-ahead and
2927 * caching techniques for the corresponding files.
2929 * A typical workload for ladvise is e.g. a bunch of different clients
2930 * doing small random reads of a file, so prefetching pages into OSS cache
2931 * with big linear reads before the random IO is a net benefit. Fetching
2932 * all that data into each client cache with fadvise() may not be, due to
2933 * much more data being sent to the client.
2935 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2936 struct llapi_lu_ladvise *ladvise)
2940 struct cl_ladvise_io *lio;
2945 env = cl_env_get(&refcheck);
2947 RETURN(PTR_ERR(env));
2949 io = vvp_env_thread_io(env);
2950 io->ci_obj = ll_i2info(inode)->lli_clob;
2952 /* initialize parameters for ladvise */
2953 lio = &io->u.ci_ladvise;
2954 lio->li_start = ladvise->lla_start;
2955 lio->li_end = ladvise->lla_end;
2956 lio->li_fid = ll_inode2fid(inode);
2957 lio->li_advice = ladvise->lla_advice;
2958 lio->li_flags = flags;
2960 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2961 rc = cl_io_loop(env, io);
2965 cl_io_fini(env, io);
2966 cl_env_put(env, &refcheck);
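/*
 * Usage sketch (illustration only): the ioctl path (LL_IOC_LADVISE in
 * ll_file_ioctl() below) wraps one or more llapi_lu_ladvise entries in a
 * llapi_ladvise_hdr; the layout follows the copy_from_user() handling below.
 * A minimal WILLREAD request over the first 16 MiB:
 *
 *	struct llapi_ladvise_hdr *hdr;
 *
 *	hdr = calloc(1, sizeof(*hdr) + sizeof(struct llapi_lu_ladvise));
 *	hdr->lah_magic = LADVISE_MAGIC;
 *	hdr->lah_count = 1;
 *	hdr->lah_advise[0].lla_advice = LU_LADVISE_WILLREAD;
 *	hdr->lah_advise[0].lla_start  = 0;
 *	hdr->lah_advise[0].lla_end    = 16 << 20;
 *	rc = ioctl(fd, LL_IOC_LADVISE, hdr);
 *	free(hdr);
 */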
2970 static int ll_lock_noexpand(struct file *file, int flags)
2972 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2974 fd->ll_lock_no_expand = !(flags & LF_UNSET);
2979 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
2982 struct fsxattr fsxattr;
2984 if (copy_from_user(&fsxattr,
2985 (const struct fsxattr __user *)arg,
2989 fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
2990 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
2991 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
2992 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
2993 if (copy_to_user((struct fsxattr __user *)arg,
2994 &fsxattr, sizeof(fsxattr)))
3000 int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
3003 * Project Quota ID state is only allowed to change from within the init
3004 * namespace. Enforce that restriction only if we are trying to change
3005 * the quota ID state. Everything else is allowed in user namespaces.
3007 if (current_user_ns() == &init_user_ns)
3010 if (ll_i2info(inode)->lli_projid != fa->fsx_projid)
3013 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) {
3014 if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
3017 if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
3024 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3028 struct md_op_data *op_data;
3029 struct ptlrpc_request *req = NULL;
3031 struct fsxattr fsxattr;
3032 struct cl_object *obj;
3036 if (copy_from_user(&fsxattr,
3037 (const struct fsxattr __user *)arg,
3041 rc = ll_ioctl_check_project(inode, &fsxattr);
3045 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3046 LUSTRE_OPC_ANY, NULL);
3047 if (IS_ERR(op_data))
3048 RETURN(PTR_ERR(op_data));
3050 flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3051 op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3052 if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3053 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3054 op_data->op_projid = fsxattr.fsx_projid;
3055 op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS;
3056 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3058 ptlrpc_req_finished(req);
3060 GOTO(out_fsxattr, rc);
3061 ll_update_inode_flags(inode, op_data->op_attr_flags);
3062 obj = ll_i2info(inode)->lli_clob;
3064 GOTO(out_fsxattr, rc);
3066 OBD_ALLOC_PTR(attr);
3068 GOTO(out_fsxattr, rc = -ENOMEM);
3070 rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS,
3071 fsxattr.fsx_xflags);
3074 ll_finish_md_op_data(op_data);
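/*
 * Usage sketch (illustration only): the project quota id and the
 * PROJINHERIT flag are changed through the FSSETXATTR path above,
 * dispatched as LL_IOC_FSGETXATTR/LL_IOC_FSSETXATTR from ll_file_ioctl()
 * below and mirroring the generic FS_IOC_FSSETXATTR interface:
 *
 *	struct fsxattr fsx;
 *
 *	ioctl(fd, LL_IOC_FSGETXATTR, &fsx);		// read current state
 *	fsx.fsx_projid  = 1000;				// new project id
 *	fsx.fsx_xflags |= FS_XFLAG_PROJINHERIT;		// inherit on children
 *	rc = ioctl(fd, LL_IOC_FSSETXATTR, &fsx);
 */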
3078 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3081 struct inode *inode = file_inode(file);
3082 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3083 struct ll_inode_info *lli = ll_i2info(inode);
3084 struct obd_client_handle *och = NULL;
3085 struct split_param sp;
3088 enum mds_op_bias bias = 0;
3089 struct file *layout_file = NULL;
3091 size_t data_size = 0;
3095 mutex_lock(&lli->lli_och_mutex);
3096 if (fd->fd_lease_och != NULL) {
3097 och = fd->fd_lease_och;
3098 fd->fd_lease_och = NULL;
3100 mutex_unlock(&lli->lli_och_mutex);
3103 GOTO(out, rc = -ENOLCK);
3105 fmode = och->och_flags;
3107 switch (ioc->lil_flags) {
3108 case LL_LEASE_RESYNC_DONE:
3109 if (ioc->lil_count > IOC_IDS_MAX)
3110 GOTO(out, rc = -EINVAL);
3112 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3113 OBD_ALLOC(data, data_size);
3115 GOTO(out, rc = -ENOMEM);
3117 if (copy_from_user(data, (void __user *)arg, data_size))
3118 GOTO(out, rc = -EFAULT);
3120 bias = MDS_CLOSE_RESYNC_DONE;
3122 case LL_LEASE_LAYOUT_MERGE: {
3125 if (ioc->lil_count != 1)
3126 GOTO(out, rc = -EINVAL);
3128 arg += sizeof(*ioc);
3129 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3130 GOTO(out, rc = -EFAULT);
3132 layout_file = fget(fd);
3134 GOTO(out, rc = -EBADF);
3136 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3137 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3138 GOTO(out, rc = -EPERM);
3140 data = file_inode(layout_file);
3141 bias = MDS_CLOSE_LAYOUT_MERGE;
3144 case LL_LEASE_LAYOUT_SPLIT: {
3148 if (ioc->lil_count != 2)
3149 GOTO(out, rc = -EINVAL);
3151 arg += sizeof(*ioc);
3152 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3153 GOTO(out, rc = -EFAULT);
3155 arg += sizeof(__u32);
3156 if (copy_from_user(&mirror_id, (void __user *)arg,
3158 GOTO(out, rc = -EFAULT);
3160 layout_file = fget(fdv);
3162 GOTO(out, rc = -EBADF);
3164 sp.sp_inode = file_inode(layout_file);
3165 sp.sp_mirror_id = (__u16)mirror_id;
3167 bias = MDS_CLOSE_LAYOUT_SPLIT;
3171 /* without close intent */
3175 rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3179 rc = ll_lease_och_release(inode, file);
3188 switch (ioc->lil_flags) {
3189 case LL_LEASE_RESYNC_DONE:
3191 OBD_FREE(data, data_size);
3193 case LL_LEASE_LAYOUT_MERGE:
3194 case LL_LEASE_LAYOUT_SPLIT:
3201 rc = ll_lease_type_from_fmode(fmode);
3205 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3208 struct inode *inode = file_inode(file);
3209 struct ll_inode_info *lli = ll_i2info(inode);
3210 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3211 struct obd_client_handle *och = NULL;
3212 __u64 open_flags = 0;
3218 switch (ioc->lil_mode) {
3219 case LL_LEASE_WRLCK:
3220 if (!(file->f_mode & FMODE_WRITE))
3222 fmode = FMODE_WRITE;
3224 case LL_LEASE_RDLCK:
3225 if (!(file->f_mode & FMODE_READ))
3229 case LL_LEASE_UNLCK:
3230 RETURN(ll_file_unlock_lease(file, ioc, arg));
3235 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3237 /* apply for lease */
3238 if (ioc->lil_flags & LL_LEASE_RESYNC)
3239 open_flags = MDS_OPEN_RESYNC;
3240 och = ll_lease_open(inode, file, fmode, open_flags);
3242 RETURN(PTR_ERR(och));
3244 if (ioc->lil_flags & LL_LEASE_RESYNC) {
3245 rc = ll_lease_file_resync(och, inode, arg);
3247 ll_lease_close(och, inode, NULL);
3250 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3252 ll_lease_close(och, inode, NULL);
3258 mutex_lock(&lli->lli_och_mutex);
3259 if (fd->fd_lease_och == NULL) {
3260 fd->fd_lease_och = och;
3263 mutex_unlock(&lli->lli_och_mutex);
3265 /* impossible now, since only exclusive leases are supported */
3266 ll_lease_close(och, inode, &lease_broken);
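/*
 * Usage sketch (illustration only): leases are requested with
 * LL_IOC_SET_LEASE and queried with LL_IOC_GET_LEASE in ll_file_ioctl()
 * below; LL_IOC_GET_LEASE returns the value of ll_lease_type_from_fmode(),
 * and the return value of a successful set is assumed to be the granted
 * lease type as well.
 *
 *	struct ll_ioc_lease ioc = { .lil_mode = LL_LEASE_WRLCK };
 *
 *	rc = ioctl(fd, LL_IOC_SET_LEASE, &ioc);
 *	...
 *	rc = ioctl(fd, LL_IOC_GET_LEASE, NULL);	// LL_LEASE_WRLCK while unbroken
 */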
3272 static void ll_heat_get(struct inode *inode, struct lu_heat *heat)
3274 struct ll_inode_info *lli = ll_i2info(inode);
3275 struct ll_sb_info *sbi = ll_i2sbi(inode);
3276 __u64 now = ktime_get_real_seconds();
3279 spin_lock(&lli->lli_heat_lock);
3280 heat->lh_flags = lli->lli_heat_flags;
3281 for (i = 0; i < heat->lh_count; i++)
3282 heat->lh_heat[i] = obd_heat_get(&lli->lli_heat_instances[i],
3283 now, sbi->ll_heat_decay_weight,
3284 sbi->ll_heat_period_second);
3285 spin_unlock(&lli->lli_heat_lock);
3288 static int ll_heat_set(struct inode *inode, __u64 flags)
3290 struct ll_inode_info *lli = ll_i2info(inode);
3293 spin_lock(&lli->lli_heat_lock);
3294 if (flags & LU_HEAT_FLAG_CLEAR)
3295 obd_heat_clear(lli->lli_heat_instances, OBD_HEAT_COUNT);
3297 if (flags & LU_HEAT_FLAG_OFF)
3298 lli->lli_heat_flags |= LU_HEAT_FLAG_OFF;
3300 lli->lli_heat_flags &= ~LU_HEAT_FLAG_OFF;
3302 spin_unlock(&lli->lli_heat_lock);
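/*
 * Usage sketch (illustration only): the per-file heat counters above are
 * read with LL_IOC_HEAT_GET and cleared with LL_IOC_HEAT_SET in
 * ll_file_ioctl() below.  The caller sizes lh_heat[] via lh_count; 64-bit
 * heat entries are an assumption.
 *
 *	struct lu_heat *heat;
 *
 *	heat = calloc(1, sizeof(*heat) + OBD_HEAT_COUNT * sizeof(__u64));
 *	heat->lh_count = OBD_HEAT_COUNT;
 *	if (ioctl(fd, LL_IOC_HEAT_GET, heat) == 0)
 *		printf("heat[0] = %llu\n", (unsigned long long)heat->lh_heat[0]);
 *
 *	__u64 flags = LU_HEAT_FLAG_CLEAR;	// reset the counters
 *	ioctl(fd, LL_IOC_HEAT_SET, &flags);
 */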
3308 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3310 struct inode *inode = file_inode(file);
3311 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3315 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3316 PFID(ll_inode2fid(inode)), inode, cmd);
3317 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3319 /* asm-ppc{,64} declares TCGETS, et al. as type 't' not 'T' */
3320 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3324 case LL_IOC_GETFLAGS:
3325 /* Get the current value of the file flags */
3326 return put_user(fd->fd_flags, (int __user *)arg);
3327 case LL_IOC_SETFLAGS:
3328 case LL_IOC_CLRFLAGS:
3329 /* Set or clear specific file flags */
3330 /* XXX This probably needs checks to ensure the flags are
3331 * not abused, and to handle any flag side effects.
3333 if (get_user(flags, (int __user *) arg))
3336 if (cmd == LL_IOC_SETFLAGS) {
3337 if ((flags & LL_FILE_IGNORE_LOCK) &&
3338 !(file->f_flags & O_DIRECT)) {
3339 CERROR("%s: unable to disable locking on "
3340 "non-O_DIRECT file\n", current->comm);
3344 fd->fd_flags |= flags;
3346 fd->fd_flags &= ~flags;
3349 case LL_IOC_LOV_SETSTRIPE:
3350 case LL_IOC_LOV_SETSTRIPE_NEW:
3351 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3352 case LL_IOC_LOV_SETEA:
3353 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3354 case LL_IOC_LOV_SWAP_LAYOUTS: {
3356 struct lustre_swap_layouts lsl;
3358 if (copy_from_user(&lsl, (char __user *)arg,
3359 sizeof(struct lustre_swap_layouts)))
3362 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3365 file2 = fget(lsl.sl_fd);
3369 /* O_WRONLY or O_RDWR */
3370 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3371 GOTO(out, rc = -EPERM);
3373 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3374 struct inode *inode2;
3375 struct ll_inode_info *lli;
3376 struct obd_client_handle *och = NULL;
3378 lli = ll_i2info(inode);
3379 mutex_lock(&lli->lli_och_mutex);
3380 if (fd->fd_lease_och != NULL) {
3381 och = fd->fd_lease_och;
3382 fd->fd_lease_och = NULL;
3384 mutex_unlock(&lli->lli_och_mutex);
3386 GOTO(out, rc = -ENOLCK);
3387 inode2 = file_inode(file2);
3388 rc = ll_swap_layouts_close(och, inode, inode2);
3390 rc = ll_swap_layouts(file, file2, &lsl);
3396 case LL_IOC_LOV_GETSTRIPE:
3397 case LL_IOC_LOV_GETSTRIPE_NEW:
3398 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3399 case FS_IOC_GETFLAGS:
3400 case FS_IOC_SETFLAGS:
3401 RETURN(ll_iocontrol(inode, file, cmd, arg));
3402 case FSFILT_IOC_GETVERSION:
3403 case FS_IOC_GETVERSION:
3404 RETURN(put_user(inode->i_generation, (int __user *)arg));
3405 /* We need to special case any other ioctls we want to handle,
3406 * to send them to the MDS/OST as appropriate and to properly
3407 * network encode the arg field. */
3408 case FS_IOC_SETVERSION:
3411 case LL_IOC_GROUP_LOCK:
3412 RETURN(ll_get_grouplock(inode, file, arg));
3413 case LL_IOC_GROUP_UNLOCK:
3414 RETURN(ll_put_grouplock(inode, file, arg));
3415 case IOC_OBD_STATFS:
3416 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3418 case LL_IOC_FLUSHCTX:
3419 RETURN(ll_flush_ctx(inode));
3420 case LL_IOC_PATH2FID: {
3421 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3422 sizeof(struct lu_fid)))
3427 case LL_IOC_GETPARENT:
3428 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3430 case OBD_IOC_FID2PATH:
3431 RETURN(ll_fid2path(inode, (void __user *)arg));
3432 case LL_IOC_DATA_VERSION: {
3433 struct ioc_data_version idv;
3436 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
3439 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3440 rc = ll_ioc_data_version(inode, &idv);
3443 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3449 case LL_IOC_GET_MDTIDX: {
3452 mdtidx = ll_get_mdt_idx(inode);
3456 if (put_user((int)mdtidx, (int __user *)arg))
3461 case OBD_IOC_GETDTNAME:
3462 case OBD_IOC_GETMDNAME:
3463 RETURN(ll_get_obd_name(inode, cmd, arg));
3464 case LL_IOC_HSM_STATE_GET: {
3465 struct md_op_data *op_data;
3466 struct hsm_user_state *hus;
3473 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3474 LUSTRE_OPC_ANY, hus);
3475 if (IS_ERR(op_data)) {
3477 RETURN(PTR_ERR(op_data));
3480 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3483 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3486 ll_finish_md_op_data(op_data);
3490 case LL_IOC_HSM_STATE_SET: {
3491 struct hsm_state_set *hss;
3498 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3503 rc = ll_hsm_state_set(inode, hss);
3508 case LL_IOC_HSM_ACTION: {
3509 struct md_op_data *op_data;
3510 struct hsm_current_action *hca;
3517 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3518 LUSTRE_OPC_ANY, hca);
3519 if (IS_ERR(op_data)) {
3521 RETURN(PTR_ERR(op_data));
3524 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3527 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3530 ll_finish_md_op_data(op_data);
3534 case LL_IOC_SET_LEASE_OLD: {
3535 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3537 RETURN(ll_file_set_lease(file, &ioc, 0));
3539 case LL_IOC_SET_LEASE: {
3540 struct ll_ioc_lease ioc;
3542 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3545 RETURN(ll_file_set_lease(file, &ioc, arg));
3547 case LL_IOC_GET_LEASE: {
3548 struct ll_inode_info *lli = ll_i2info(inode);
3549 struct ldlm_lock *lock = NULL;
3552 mutex_lock(&lli->lli_och_mutex);
3553 if (fd->fd_lease_och != NULL) {
3554 struct obd_client_handle *och = fd->fd_lease_och;
3556 lock = ldlm_handle2lock(&och->och_lease_handle);
3558 lock_res_and_lock(lock);
3559 if (!ldlm_is_cancel(lock))
3560 fmode = och->och_flags;
3562 unlock_res_and_lock(lock);
3563 LDLM_LOCK_PUT(lock);
3566 mutex_unlock(&lli->lli_och_mutex);
3568 RETURN(ll_lease_type_from_fmode(fmode));
3570 case LL_IOC_HSM_IMPORT: {
3571 struct hsm_user_import *hui;
3577 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3582 rc = ll_hsm_import(inode, file, hui);
3587 case LL_IOC_FUTIMES_3: {
3588 struct ll_futimes_3 lfu;
3590 if (copy_from_user(&lfu,
3591 (const struct ll_futimes_3 __user *)arg,
3595 RETURN(ll_file_futimes_3(file, &lfu));
3597 case LL_IOC_LADVISE: {
3598 struct llapi_ladvise_hdr *k_ladvise_hdr;
3599 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3602 int alloc_size = sizeof(*k_ladvise_hdr);
3605 u_ladvise_hdr = (void __user *)arg;
3606 OBD_ALLOC_PTR(k_ladvise_hdr);
3607 if (k_ladvise_hdr == NULL)
3610 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3611 GOTO(out_ladvise, rc = -EFAULT);
3613 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3614 k_ladvise_hdr->lah_count < 1)
3615 GOTO(out_ladvise, rc = -EINVAL);
3617 num_advise = k_ladvise_hdr->lah_count;
3618 if (num_advise >= LAH_COUNT_MAX)
3619 GOTO(out_ladvise, rc = -EFBIG);
3621 OBD_FREE_PTR(k_ladvise_hdr);
3622 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3623 lah_advise[num_advise]);
3624 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3625 if (k_ladvise_hdr == NULL)
3629 * TODO: submit multiple advices to one server in a single RPC
3631 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3632 GOTO(out_ladvise, rc = -EFAULT);
3634 for (i = 0; i < num_advise; i++) {
3635 struct llapi_lu_ladvise *k_ladvise =
3636 &k_ladvise_hdr->lah_advise[i];
3637 struct llapi_lu_ladvise __user *u_ladvise =
3638 &u_ladvise_hdr->lah_advise[i];
3640 rc = ll_ladvise_sanity(inode, k_ladvise);
3642 GOTO(out_ladvise, rc);
3644 switch (k_ladvise->lla_advice) {
3645 case LU_LADVISE_LOCKNOEXPAND:
3646 rc = ll_lock_noexpand(file,
3647 k_ladvise->lla_peradvice_flags);
3648 GOTO(out_ladvise, rc);
3649 case LU_LADVISE_LOCKAHEAD:
3651 rc = ll_file_lock_ahead(file, k_ladvise);
3654 GOTO(out_ladvise, rc);
3657 &u_ladvise->lla_lockahead_result))
3658 GOTO(out_ladvise, rc = -EFAULT);
3661 rc = ll_ladvise(inode, file,
3662 k_ladvise_hdr->lah_flags,
3665 GOTO(out_ladvise, rc);
3672 OBD_FREE(k_ladvise_hdr, alloc_size);
3675 case LL_IOC_FLR_SET_MIRROR: {
3676 /* mirror I/O must be direct to avoid polluting page cache
3678 if (!(file->f_flags & O_DIRECT))
3681 fd->fd_designated_mirror = (__u32)arg;
3684 case LL_IOC_FSGETXATTR:
3685 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3686 case LL_IOC_FSSETXATTR:
3687 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3689 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3690 case LL_IOC_HEAT_GET: {
3691 struct lu_heat uheat;
3692 struct lu_heat *heat;
3695 if (copy_from_user(&uheat, (void __user *)arg, sizeof(uheat)))
3698 if (uheat.lh_count > OBD_HEAT_COUNT)
3699 uheat.lh_count = OBD_HEAT_COUNT;
3701 size = offsetof(typeof(uheat), lh_heat[uheat.lh_count]);
3702 OBD_ALLOC(heat, size);
3706 heat->lh_count = uheat.lh_count;
3707 ll_heat_get(inode, heat);
3708 rc = copy_to_user((char __user *)arg, heat, size);
3709 OBD_FREE(heat, size);
3710 RETURN(rc ? -EFAULT : 0);
3712 case LL_IOC_HEAT_SET: {
3715 if (copy_from_user(&flags, (void __user *)arg, sizeof(flags)))
3718 rc = ll_heat_set(inode, flags);
3722 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3723 (void __user *)arg));
3727 #ifndef HAVE_FILE_LLSEEK_SIZE
3728 static inline loff_t
3729 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3731 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3733 if (offset > maxsize)
3736 if (offset != file->f_pos) {
3737 file->f_pos = offset;
3738 file->f_version = 0;
3744 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3745 loff_t maxsize, loff_t eof)
3747 struct inode *inode = file_inode(file);
3755 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3756 * position-querying operation. Avoid rewriting the "same"
3757 * f_pos value back to the file because a concurrent read(),
3758 * write() or lseek() might have altered it
3763 * f_lock protects against read/modify/write race with other
3764 * SEEK_CURs. Note that parallel writes and reads behave
3768 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3769 inode_unlock(inode);
3773 * In the generic case the entire file is data, so as long as
3774 * offset isn't at the end of the file then the offset is data.
3781 * There is a virtual hole at the end of the file, so as long as
3782 * offset isn't i_size or larger, return i_size.
3790 return llseek_execute(file, offset, maxsize);
3794 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3796 struct inode *inode = file_inode(file);
3797 loff_t retval, eof = 0;
3800 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3801 (origin == SEEK_CUR) ? file->f_pos : 0);
3802 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3803 PFID(ll_inode2fid(inode)), inode, retval, retval,
3805 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3807 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3808 retval = ll_glimpse_size(inode);
3811 eof = i_size_read(inode);
3814 retval = ll_generic_file_llseek_size(file, offset, origin,
3815 ll_file_maxbytes(inode), eof);
3819 static int ll_flush(struct file *file, fl_owner_t id)
3821 struct inode *inode = file_inode(file);
3822 struct ll_inode_info *lli = ll_i2info(inode);
3823 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3826 LASSERT(!S_ISDIR(inode->i_mode));
3828 /* catch async errors that were recorded back when async writeback
3829 * failed for pages in this mapping. */
3830 rc = lli->lli_async_rc;
3831 lli->lli_async_rc = 0;
3832 if (lli->lli_clob != NULL) {
3833 err = lov_read_and_clear_async_rc(lli->lli_clob);
3838 /* The application has already been told about the write failure.
3839 * Do not report the failure again. */
3840 if (fd->fd_write_failed)
3842 return rc ? -EIO : 0;
3846 * Called to make sure a portion of the file has been written out.
3847 * If @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to the OST.
3849 * Return how many pages have been written.
3851 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3852 enum cl_fsync_mode mode, int ignore_layout)
3856 struct cl_fsync_io *fio;
3861 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3862 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3865 env = cl_env_get(&refcheck);
3867 RETURN(PTR_ERR(env));
3869 io = vvp_env_thread_io(env);
3870 io->ci_obj = ll_i2info(inode)->lli_clob;
3871 io->ci_ignore_layout = ignore_layout;
3873 /* initialize parameters for sync */
3874 fio = &io->u.ci_fsync;
3875 fio->fi_start = start;
3877 fio->fi_fid = ll_inode2fid(inode);
3878 fio->fi_mode = mode;
3879 fio->fi_nr_written = 0;
3881 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3882 result = cl_io_loop(env, io);
3884 result = io->ci_result;
3886 result = fio->fi_nr_written;
3887 cl_io_fini(env, io);
3888 cl_env_put(env, &refcheck);
3894 * When dentry is provided (the 'else' case), file_dentry() may be
3895 * null and dentry must be used directly rather than pulled from
3896 * file_dentry() as is done otherwise.
3899 #ifdef HAVE_FILE_FSYNC_4ARGS
3900 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3902 struct dentry *dentry = file_dentry(file);
3903 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3904 int ll_fsync(struct file *file, int datasync)
3906 struct dentry *dentry = file_dentry(file);
3908 loff_t end = LLONG_MAX;
3910 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3913 loff_t end = LLONG_MAX;
3915 struct inode *inode = dentry->d_inode;
3916 struct ll_inode_info *lli = ll_i2info(inode);
3917 struct ptlrpc_request *req;
3921 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3922 PFID(ll_inode2fid(inode)), inode);
3923 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3925 #ifdef HAVE_FILE_FSYNC_4ARGS
3926 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3929 /* fsync's caller has already called _fdata{sync,write}; we want
3930 * that IO to finish before calling the osc and mdc sync methods */
3931 rc = filemap_fdatawait(inode->i_mapping);
3934 /* catch async errors that were recorded back when async writeback
3935 * failed for pages in this mapping. */
3936 if (!S_ISDIR(inode->i_mode)) {
3937 err = lli->lli_async_rc;
3938 lli->lli_async_rc = 0;
3941 if (lli->lli_clob != NULL) {
3942 err = lov_read_and_clear_async_rc(lli->lli_clob);
3948 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3952 ptlrpc_req_finished(req);
3954 if (S_ISREG(inode->i_mode)) {
3955 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3957 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3958 if (rc == 0 && err < 0)
3961 fd->fd_write_failed = true;
3963 fd->fd_write_failed = false;
3966 #ifdef HAVE_FILE_FSYNC_4ARGS
3967 inode_unlock(inode);
3973 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3975 struct inode *inode = file_inode(file);
3976 struct ll_sb_info *sbi = ll_i2sbi(inode);
3977 struct ldlm_enqueue_info einfo = {
3978 .ei_type = LDLM_FLOCK,
3979 .ei_cb_cp = ldlm_flock_completion_ast,
3980 .ei_cbdata = file_lock,
3982 struct md_op_data *op_data;
3983 struct lustre_handle lockh = { 0 };
3984 union ldlm_policy_data flock = { { 0 } };
3985 int fl_type = file_lock->fl_type;
3991 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3992 PFID(ll_inode2fid(inode)), file_lock);
3994 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3996 if (file_lock->fl_flags & FL_FLOCK) {
3997 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3998 /* flocks are whole-file locks */
3999 flock.l_flock.end = OFFSET_MAX;
4000 /* For flocks the owner is determined by the local file descriptor */
4001 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
4002 } else if (file_lock->fl_flags & FL_POSIX) {
4003 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
4004 flock.l_flock.start = file_lock->fl_start;
4005 flock.l_flock.end = file_lock->fl_end;
4009 flock.l_flock.pid = file_lock->fl_pid;
4011 /* Somewhat ugly workaround for svc lockd.
4012 * lockd installs a custom fl_lmops->lm_compare_owner that checks
4013 * that the fl_owner is the same (which it always is between lockd
4014 * processes on the local node) and then compares the pid.
4015 * As such we assign the pid to the owner field to make it all work;
4016 * a conflict with normal locks is unlikely since the pid space and
4017 * the pointer space for current->files do not intersect */
4018 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
4019 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
4023 einfo.ei_mode = LCK_PR;
4026 /* An unlock request may or may not have any relation to
4027 * existing locks so we may not be able to pass a lock handle
4028 * via a normal ldlm_lock_cancel() request. The request may even
4029 * unlock a byte range in the middle of an existing lock. In
4030 * order to process an unlock request we need all of the same
4031 * information that is given with a normal read or write record
4032 * lock request. To avoid creating another ldlm unlock (cancel)
4033 * message we'll treat a LCK_NL flock request as an unlock. */
4034 einfo.ei_mode = LCK_NL;
4037 einfo.ei_mode = LCK_PW;
4040 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
4055 flags = LDLM_FL_BLOCK_NOWAIT;
4061 flags = LDLM_FL_TEST_LOCK;
4064 CERROR("unknown fcntl lock command: %d\n", cmd);
4068 /* Save the old mode so that if the mode in the lock changes we
4069 * can decrement the appropriate reader or writer refcount. */
4070 file_lock->fl_type = einfo.ei_mode;
4072 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4073 LUSTRE_OPC_ANY, NULL);
4074 if (IS_ERR(op_data))
4075 RETURN(PTR_ERR(op_data));
4077 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4078 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4079 flock.l_flock.pid, flags, einfo.ei_mode,
4080 flock.l_flock.start, flock.l_flock.end);
4082 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4085 /* Restore the file lock type if not TEST lock. */
4086 if (!(flags & LDLM_FL_TEST_LOCK))
4087 file_lock->fl_type = fl_type;
4089 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
4090 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4091 !(flags & LDLM_FL_TEST_LOCK))
4092 rc2 = locks_lock_file_wait(file, file_lock);
4094 if ((file_lock->fl_flags & FL_FLOCK) &&
4095 (rc == 0 || file_lock->fl_type == F_UNLCK))
4096 rc2 = flock_lock_file_wait(file, file_lock);
4097 if ((file_lock->fl_flags & FL_POSIX) &&
4098 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4099 !(flags & LDLM_FL_TEST_LOCK))
4100 rc2 = posix_lock_file_wait(file, file_lock);
4101 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
4103 if (rc2 && file_lock->fl_type != F_UNLCK) {
4104 einfo.ei_mode = LCK_NL;
4105 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4110 ll_finish_md_op_data(op_data);
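/*
 * Usage sketch (illustration only): this handler is reached through the
 * normal VFS ->lock/->flock paths, so ordinary POSIX byte-range locks map
 * onto LDLM flock locks when the client is mounted with -o flock:
 *
 *	struct flock fl = {
 *		.l_type   = F_WRLCK,
 *		.l_whence = SEEK_SET,
 *		.l_start  = 0,
 *		.l_len    = 4096,		// lock the first 4 KiB
 *	};
 *
 *	rc = fcntl(fd, F_SETLKW, &fl);		// blocks until granted
 */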
4115 int ll_get_fid_by_name(struct inode *parent, const char *name,
4116 int namelen, struct lu_fid *fid,
4117 struct inode **inode)
4119 struct md_op_data *op_data = NULL;
4120 struct mdt_body *body;
4121 struct ptlrpc_request *req;
4125 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4126 LUSTRE_OPC_ANY, NULL);
4127 if (IS_ERR(op_data))
4128 RETURN(PTR_ERR(op_data));
4130 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4131 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4132 ll_finish_md_op_data(op_data);
4136 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4138 GOTO(out_req, rc = -EFAULT);
4140 *fid = body->mbo_fid1;
4143 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4145 ptlrpc_req_finished(req);
4149 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4152 struct dentry *dchild = NULL;
4153 struct inode *child_inode = NULL;
4154 struct md_op_data *op_data;
4155 struct ptlrpc_request *request = NULL;
4156 struct obd_client_handle *och = NULL;
4158 struct mdt_body *body;
4159 __u64 data_version = 0;
4160 size_t namelen = strlen(name);
4161 int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4165 CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4166 PFID(ll_inode2fid(parent)), name,
4167 lum->lum_stripe_offset, lum->lum_stripe_count);
4169 if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4170 lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4171 lustre_swab_lmv_user_md(lum);
4173 /* Get child FID first */
4174 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4177 dchild = d_lookup(file_dentry(file), &qstr);
4179 if (dchild->d_inode)
4180 child_inode = igrab(dchild->d_inode);
4185 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
4194 if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
4195 OBD_CONNECT2_DIR_MIGRATE)) {
4196 if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
4197 ll_i2info(child_inode)->lli_lsm_md) {
4198 CERROR("%s: MDT doesn't support stripe directory "
4199 "migration!\n", ll_i2sbi(parent)->ll_fsname);
4200 GOTO(out_iput, rc = -EOPNOTSUPP);
4205 * The lfs migrate command needs to be blocked on the client
4206 * by checking the migrate FID against the FID of the filesystem root.
4209 if (child_inode == parent->i_sb->s_root->d_inode)
4210 GOTO(out_iput, rc = -EINVAL);
4212 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4213 child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4214 if (IS_ERR(op_data))
4215 GOTO(out_iput, rc = PTR_ERR(op_data));
4217 inode_lock(child_inode);
4218 op_data->op_fid3 = *ll_inode2fid(child_inode);
4219 if (!fid_is_sane(&op_data->op_fid3)) {
4220 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4221 ll_i2sbi(parent)->ll_fsname, name,
4222 PFID(&op_data->op_fid3));
4223 GOTO(out_unlock, rc = -EINVAL);
4226 op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4227 op_data->op_data = lum;
4228 op_data->op_data_size = lumlen;
4231 if (S_ISREG(child_inode->i_mode)) {
4232 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4236 GOTO(out_unlock, rc);
4239 rc = ll_data_version(child_inode, &data_version,
4242 GOTO(out_close, rc);
4244 op_data->op_open_handle = och->och_open_handle;
4245 op_data->op_data_version = data_version;
4246 op_data->op_lease_handle = och->och_lease_handle;
4247 op_data->op_bias |= MDS_CLOSE_MIGRATE;
4249 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4250 och->och_mod->mod_open_req->rq_replay = 0;
4251 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
4254 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4255 name, namelen, &request);
4257 LASSERT(request != NULL);
4258 ll_update_times(request, parent);
4261 if (rc == 0 || rc == -EAGAIN) {
4262 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4263 LASSERT(body != NULL);
4265 /* If the server does release the layout lock, then we clean up
4266 * the client och here; otherwise release it in out_close: */
4267 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4268 obd_mod_put(och->och_mod);
4269 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4271 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
4277 if (request != NULL) {
4278 ptlrpc_req_finished(request);
4282 /* Try again if the lease has been cancelled. */
4283 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4288 ll_lease_close(och, child_inode, NULL);
4290 clear_nlink(child_inode);
4292 inode_unlock(child_inode);
4293 ll_finish_md_op_data(op_data);
4300 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4308 * test if some locks matching bits and l_req_mode are acquired
4309 * - bits can be in different locks
4310 * - if found, clear the common lock bits in *bits
4311 * - the bits not found are kept in *bits
4313 * \param bits [IN] searched lock bits
4314 * \param l_req_mode [IN] searched lock mode
4315 * \retval boolean, true iff all bits are found
4317 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4319 struct lustre_handle lockh;
4320 union ldlm_policy_data policy;
4321 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4322 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4331 fid = &ll_i2info(inode)->lli_fid;
4332 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4333 ldlm_lockname[mode]);
4335 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4336 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4337 policy.l_inodebits.bits = *bits & (1 << i);
4338 if (policy.l_inodebits.bits == 0)
4341 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4342 &policy, mode, &lockh)) {
4343 struct ldlm_lock *lock;
4345 lock = ldlm_handle2lock(&lockh);
4348 ~(lock->l_policy_data.l_inodebits.bits);
4349 LDLM_LOCK_PUT(lock);
4351 *bits &= ~policy.l_inodebits.bits;
4358 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4359 struct lustre_handle *lockh, __u64 flags,
4360 enum ldlm_mode mode)
4362 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4367 fid = &ll_i2info(inode)->lli_fid;
4368 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4370 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4371 fid, LDLM_IBITS, &policy, mode, lockh);
4376 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4378 /* Already unlinked. Just update nlink and return success */
4379 if (rc == -ENOENT) {
4381 /* If it is a striped directory and there is a bad stripe,
4382 * let's revalidate the dentry again, instead of returning
4384 if (S_ISDIR(inode->i_mode) &&
4385 ll_i2info(inode)->lli_lsm_md != NULL)
4388 /* This path cannot be hit for regular files unless in
4389 * case of obscure races, so no need to validate
4391 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4393 } else if (rc != 0) {
4394 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4395 "%s: revalidate FID "DFID" error: rc = %d\n",
4396 ll_i2sbi(inode)->ll_fsname,
4397 PFID(ll_inode2fid(inode)), rc);
4403 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4405 struct inode *inode = dentry->d_inode;
4406 struct obd_export *exp = ll_i2mdexp(inode);
4407 struct lookup_intent oit = {
4410 struct ptlrpc_request *req = NULL;
4411 struct md_op_data *op_data;
4415 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4416 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4418 /* Call getattr by fid, so do not provide name at all. */
4419 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4420 LUSTRE_OPC_ANY, NULL);
4421 if (IS_ERR(op_data))
4422 RETURN(PTR_ERR(op_data));
4424 rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4425 ll_finish_md_op_data(op_data);
4427 rc = ll_inode_revalidate_fini(inode, rc);
4431 rc = ll_revalidate_it_finish(req, &oit, dentry);
4433 ll_intent_release(&oit);
4437 /* Unlinked? Unhash dentry, so it is not picked up later by
4438 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4439 * here to preserve get_cwd functionality on 2.6.
4441 if (!dentry->d_inode->i_nlink) {
4442 ll_lock_dcache(inode);
4443 d_lustre_invalidate(dentry, 0);
4444 ll_unlock_dcache(inode);
4447 ll_lookup_finish_locks(&oit, dentry);
4449 ptlrpc_req_finished(req);
4454 static int ll_merge_md_attr(struct inode *inode)
4456 struct ll_inode_info *lli = ll_i2info(inode);
4457 struct cl_attr attr = { 0 };
4460 LASSERT(lli->lli_lsm_md != NULL);
4461 down_read(&lli->lli_lsm_sem);
4462 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4463 &attr, ll_md_blocking_ast);
4464 up_read(&lli->lli_lsm_sem);
4468 set_nlink(inode, attr.cat_nlink);
4469 inode->i_blocks = attr.cat_blocks;
4470 i_size_write(inode, attr.cat_size);
4472 ll_i2info(inode)->lli_atime = attr.cat_atime;
4473 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4474 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
4479 static inline dev_t ll_compat_encode_dev(dev_t dev)
4481 /* The compat_sys_*stat*() syscalls will fail unless the
4482 * device majors and minors are both less than 256. Note that
4483 * the value returned here will be passed through
4484 * old_encode_dev() in cp_compat_stat(). And so we are not
4485 * trying to return a valid compat (u16) device number, just
4486 * one that will pass the old_valid_dev() check. */
4488 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
4491 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4492 int ll_getattr(const struct path *path, struct kstat *stat,
4493 u32 request_mask, unsigned int flags)
4495 struct dentry *de = path->dentry;
4497 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4500 struct inode *inode = de->d_inode;
4501 struct ll_sb_info *sbi = ll_i2sbi(inode);
4502 struct ll_inode_info *lli = ll_i2info(inode);
4505 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4507 rc = ll_inode_revalidate(de, IT_GETATTR);
4511 if (S_ISREG(inode->i_mode)) {
4512 /* In case of restore, the MDT has the right size and has
4513 * already sent it back without granting the layout lock;
4514 * the inode is up-to-date so a glimpse is useless.
4515 * Also, to glimpse we need the layout; in case of a running
4516 * restore the MDT holds the layout lock so the glimpse will
4517 * block until the end of restore (getattr will block)
4519 if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4520 rc = ll_glimpse_size(inode);
4525 /* If the object isn't a regular file then don't validate its size. */
4526 if (S_ISDIR(inode->i_mode) &&
4527 lli->lli_lsm_md != NULL) {
4528 rc = ll_merge_md_attr(inode);
4533 inode->i_atime.tv_sec = lli->lli_atime;
4534 inode->i_mtime.tv_sec = lli->lli_mtime;
4535 inode->i_ctime.tv_sec = lli->lli_ctime;
4538 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4540 if (ll_need_32bit_api(sbi)) {
4541 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4542 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4543 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4545 stat->ino = inode->i_ino;
4546 stat->dev = inode->i_sb->s_dev;
4547 stat->rdev = inode->i_rdev;
4550 stat->mode = inode->i_mode;
4551 stat->uid = inode->i_uid;
4552 stat->gid = inode->i_gid;
4553 stat->atime = inode->i_atime;
4554 stat->mtime = inode->i_mtime;
4555 stat->ctime = inode->i_ctime;
4556 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4558 stat->nlink = inode->i_nlink;
4559 stat->size = i_size_read(inode);
4560 stat->blocks = inode->i_blocks;
4565 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4566 __u64 start, __u64 len)
4570 struct fiemap *fiemap;
4571 unsigned int extent_count = fieinfo->fi_extents_max;
4573 num_bytes = sizeof(*fiemap) + (extent_count *
4574 sizeof(struct fiemap_extent));
4575 OBD_ALLOC_LARGE(fiemap, num_bytes);
4580 fiemap->fm_flags = fieinfo->fi_flags;
4581 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4582 fiemap->fm_start = start;
4583 fiemap->fm_length = len;
4584 if (extent_count > 0 &&
4585 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4586 sizeof(struct fiemap_extent)) != 0)
4587 GOTO(out, rc = -EFAULT);
4589 rc = ll_do_fiemap(inode, fiemap, num_bytes);
4591 fieinfo->fi_flags = fiemap->fm_flags;
4592 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4593 if (extent_count > 0 &&
4594 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4595 fiemap->fm_mapped_extents *
4596 sizeof(struct fiemap_extent)) != 0)
4597 GOTO(out, rc = -EFAULT);
4599 OBD_FREE_LARGE(fiemap, num_bytes);
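/*
 * Usage sketch (illustration only): ll_fiemap() is reached through the
 * standard FS_IOC_FIEMAP ioctl, so the usual linux/fiemap.h interface
 * applies:
 *
 *	struct fiemap *fm;
 *	unsigned int n = 32;
 *
 *	fm = calloc(1, sizeof(*fm) + n * sizeof(struct fiemap_extent));
 *	fm->fm_length       = FIEMAP_MAX_OFFSET;	// map the whole file
 *	fm->fm_extent_count = n;
 *	if (ioctl(fd, FS_IOC_FIEMAP, fm) == 0)
 *		printf("%u extents mapped\n", fm->fm_mapped_extents);
 *	free(fm);
 */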
4603 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4605 struct ll_inode_info *lli = ll_i2info(inode);
4606 struct posix_acl *acl = NULL;
4609 spin_lock(&lli->lli_lock);
4610 /* VFS' acl_permission_check->check_acl will release the refcount */
4611 acl = posix_acl_dup(lli->lli_posix_acl);
4612 spin_unlock(&lli->lli_lock);
4617 #ifdef HAVE_IOP_SET_ACL
4618 #ifdef CONFIG_FS_POSIX_ACL
4619 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4621 struct ll_sb_info *sbi = ll_i2sbi(inode);
4622 struct ptlrpc_request *req = NULL;
4623 const char *name = NULL;
4625 size_t value_size = 0;
4630 case ACL_TYPE_ACCESS:
4631 name = XATTR_NAME_POSIX_ACL_ACCESS;
4633 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4636 case ACL_TYPE_DEFAULT:
4637 name = XATTR_NAME_POSIX_ACL_DEFAULT;
4638 if (!S_ISDIR(inode->i_mode))
4639 rc = acl ? -EACCES : 0;
4650 value_size = posix_acl_xattr_size(acl->a_count);
4651 value = kmalloc(value_size, GFP_NOFS);
4653 GOTO(out, rc = -ENOMEM);
4655 rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
4657 GOTO(out_value, rc);
4660 rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4661 value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
4662 name, value, value_size, 0, 0, &req);
4664 ptlrpc_req_finished(req);
4669 forget_cached_acl(inode, type);
4671 set_cached_acl(inode, type, acl);
4674 #endif /* CONFIG_FS_POSIX_ACL */
4675 #endif /* HAVE_IOP_SET_ACL */
4677 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4679 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4680 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4682 ll_check_acl(struct inode *inode, int mask)
4685 # ifdef CONFIG_FS_POSIX_ACL
4686 struct posix_acl *acl;
4690 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4691 if (flags & IPERM_FLAG_RCU)
4694 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4699 rc = posix_acl_permission(inode, acl, mask);
4700 posix_acl_release(acl);
4703 # else /* !CONFIG_FS_POSIX_ACL */
4705 # endif /* CONFIG_FS_POSIX_ACL */
4707 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
4709 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4710 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4712 # ifdef HAVE_INODE_PERMISION_2ARGS
4713 int ll_inode_permission(struct inode *inode, int mask)
4715 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4720 struct ll_sb_info *sbi;
4721 struct root_squash_info *squash;
4722 struct cred *cred = NULL;
4723 const struct cred *old_cred = NULL;
4725 bool squash_id = false;
4728 #ifdef MAY_NOT_BLOCK
4729 if (mask & MAY_NOT_BLOCK)
4731 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4732 if (flags & IPERM_FLAG_RCU)
4736 /* as the root inode is NOT validated during lookup, we
4737 * need to do it before the permission check. */
4739 if (inode == inode->i_sb->s_root->d_inode) {
4740 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4745 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4746 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4748 /* squash fsuid/fsgid if needed */
4749 sbi = ll_i2sbi(inode);
4750 squash = &sbi->ll_squash;
4751 if (unlikely(squash->rsi_uid != 0 &&
4752 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4753 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4757 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4758 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4759 squash->rsi_uid, squash->rsi_gid);
4761 /* update current process's credentials
4762 * and FS capability */
4763 cred = prepare_creds();
4767 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4768 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
4769 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4770 if ((1 << cap) & CFS_CAP_FS_MASK)
4771 cap_lower(cred->cap_effective, cap);
4773 old_cred = override_creds(cred);
4776 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4777 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4778 /* restore current process's credentials and FS capability */
4780 revert_creds(old_cred);
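/*
 * Root squash is configured on the server side; the exact tunable names below
 * are an assumption and may vary by release, but are typically something like:
 *
 *	lctl conf_param <fsname>.mdt.root_squash="1000:1000"
 *	lctl conf_param <fsname>.mdt.nosquash_nids="10.0.0.2@tcp"
 *
 * With squashing active, a root process on a non-exempt client runs the
 * permission check above with the squashed fsuid/fsgid, so e.g. open(2) on a
 * root-only file fails with EACCES.
 */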
4787 /* -o localflock - only provides locally consistent flock locks */
4788 struct file_operations ll_file_operations = {
4789 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4790 # ifdef HAVE_SYNC_READ_WRITE
4791 .read = new_sync_read,
4792 .write = new_sync_write,
4793 # endif /* HAVE_SYNC_READ_WRITE */
4794 .read_iter = ll_file_read_iter,
4795 .write_iter = ll_file_write_iter,
4796 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4797 .read = ll_file_read,
4798 .aio_read = ll_file_aio_read,
4799 .write = ll_file_write,
4800 .aio_write = ll_file_aio_write,
4801 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4802 .unlocked_ioctl = ll_file_ioctl,
4803 .open = ll_file_open,
4804 .release = ll_file_release,
4805 .mmap = ll_file_mmap,
4806 .llseek = ll_file_seek,
4807 .splice_read = ll_file_splice_read,
4812 struct file_operations ll_file_operations_flock = {
4813 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4814 # ifdef HAVE_SYNC_READ_WRITE
4815 .read = new_sync_read,
4816 .write = new_sync_write,
4817 # endif /* HAVE_SYNC_READ_WRITE */
4818 .read_iter = ll_file_read_iter,
4819 .write_iter = ll_file_write_iter,
4820 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4821 .read = ll_file_read,
4822 .aio_read = ll_file_aio_read,
4823 .write = ll_file_write,
4824 .aio_write = ll_file_aio_write,
4825 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4826 .unlocked_ioctl = ll_file_ioctl,
4827 .open = ll_file_open,
4828 .release = ll_file_release,
4829 .mmap = ll_file_mmap,
4830 .llseek = ll_file_seek,
4831 .splice_read = ll_file_splice_read,
4834 .flock = ll_file_flock,
4835 .lock = ll_file_flock
4838 /* These are for -o noflock - to return ENOSYS on flock calls */
4839 struct file_operations ll_file_operations_noflock = {
4840 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4841 # ifdef HAVE_SYNC_READ_WRITE
4842 .read = new_sync_read,
4843 .write = new_sync_write,
4844 # endif /* HAVE_SYNC_READ_WRITE */
4845 .read_iter = ll_file_read_iter,
4846 .write_iter = ll_file_write_iter,
4847 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4848 .read = ll_file_read,
4849 .aio_read = ll_file_aio_read,
4850 .write = ll_file_write,
4851 .aio_write = ll_file_aio_write,
4852 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4853 .unlocked_ioctl = ll_file_ioctl,
4854 .open = ll_file_open,
4855 .release = ll_file_release,
4856 .mmap = ll_file_mmap,
4857 .llseek = ll_file_seek,
4858 .splice_read = ll_file_splice_read,
4861 .flock = ll_file_noflock,
4862 .lock = ll_file_noflock
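/*
 * With "-o noflock" the methods above fail user-space file locking with
 * ENOSYS, e.g. (sketch; the path is hypothetical):
 *
 *	int fd = open("/mnt/lustre/somefile", O_RDWR);
 *
 *	if (flock(fd, LOCK_EX) < 0 && errno == ENOSYS)
 *		fprintf(stderr, "mount with -o flock or -o localflock "
 *			"to enable flock()\n");
 */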
4865 struct inode_operations ll_file_inode_operations = {
4866 .setattr = ll_setattr,
4867 .getattr = ll_getattr,
4868 .permission = ll_inode_permission,
4869 #ifdef HAVE_IOP_XATTR
4870 .setxattr = ll_setxattr,
4871 .getxattr = ll_getxattr,
4872 .removexattr = ll_removexattr,
4874 .listxattr = ll_listxattr,
4875 .fiemap = ll_fiemap,
4876 #ifdef HAVE_IOP_GET_ACL
4877 .get_acl = ll_get_acl,
4879 #ifdef HAVE_IOP_SET_ACL
4880 .set_acl = ll_set_acl,
4884 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4886 struct ll_inode_info *lli = ll_i2info(inode);
4887 struct cl_object *obj = lli->lli_clob;
4896 env = cl_env_get(&refcheck);
4898 RETURN(PTR_ERR(env));
4900 rc = cl_conf_set(env, lli->lli_clob, conf);
4904 if (conf->coc_opc == OBJECT_CONF_SET) {
4905 struct ldlm_lock *lock = conf->coc_lock;
4906 struct cl_layout cl = {
4910 LASSERT(lock != NULL);
4911 LASSERT(ldlm_has_layout(lock));
4913 /* it can only be allowed to match after the layout has been
4914 * applied to the inode, otherwise a wrong layout would be
4915 * seen. Applying the layout should happen before dropping
4916 * the intent lock. */
4917 ldlm_lock_allow_match(lock);
4919 rc = cl_object_layout_get(env, obj, &cl);
4924 DFID": layout version change: %u -> %u\n",
4925 PFID(&lli->lli_fid), ll_layout_version_get(lli),
4927 ll_layout_version_set(lli, cl.cl_layout_gen);
4931 cl_env_put(env, &refcheck);
4936 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
4937 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4940 struct ll_sb_info *sbi = ll_i2sbi(inode);
4941 struct ptlrpc_request *req;
4948 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4949 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4950 lock->l_lvb_data, lock->l_lvb_len);
4952 if (lock->l_lvb_data != NULL)
4955 /* if the layout lock was granted right away, the layout is returned
4956 * within the DLM LVB of the reply; otherwise, if the lock was ever
4957 * blocked and then granted via a completion AST, we have to fetch the
4958 * layout here. Note that we can't use the LVB buffer in the
4959 * completion AST because it may not be large enough. */
4960 rc = ll_get_default_mdsize(sbi, &lmmsize);
4964 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), OBD_MD_FLXATTR,
4965 XATTR_NAME_LOV, lmmsize, &req);
4968 GOTO(out, rc = 0); /* empty layout */
4975 if (lmmsize == 0) /* empty layout */
4978 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4980 GOTO(out, rc = -EFAULT);
4982 OBD_ALLOC_LARGE(lvbdata, lmmsize);
4983 if (lvbdata == NULL)
4984 GOTO(out, rc = -ENOMEM);
4986 memcpy(lvbdata, lmm, lmmsize);
4987 lock_res_and_lock(lock);
4988 if (unlikely(lock->l_lvb_data == NULL)) {
4989 lock->l_lvb_type = LVB_T_LAYOUT;
4990 lock->l_lvb_data = lvbdata;
4991 lock->l_lvb_len = lmmsize;
4994 unlock_res_and_lock(lock);
4997 OBD_FREE_LARGE(lvbdata, lmmsize);
5002 ptlrpc_req_finished(req);
5007 * Apply the layout to the inode. Layout lock is held and will be released
5010 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
5011 struct inode *inode)
5013 struct ll_inode_info *lli = ll_i2info(inode);
5014 struct ll_sb_info *sbi = ll_i2sbi(inode);
5015 struct ldlm_lock *lock;
5016 struct cl_object_conf conf;
5019 bool wait_layout = false;
5022 LASSERT(lustre_handle_is_used(lockh));
5024 lock = ldlm_handle2lock(lockh);
5025 LASSERT(lock != NULL);
5026 LASSERT(ldlm_has_layout(lock));
5028 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
5029 PFID(&lli->lli_fid), inode);
5031 /* in case this is a cached lock, reinstate it with the new inode */
5032 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
5034 lock_res_and_lock(lock);
5035 lvb_ready = ldlm_is_lvb_ready(lock);
5036 unlock_res_and_lock(lock);
5038 /* checking lvb_ready is racy, but that is okay. The worst case is
5039 * that multiple processes configure the file at the same time. */
5043 rc = ll_layout_fetch(inode, lock);
5047 /* for layout lock, lmm is stored in lock's lvb.
5048 * lvb_data is immutable if the lock is held so it's safe to access it
5051 * set the layout on the file. This is unlikely to fail, as the old
5052 * layout has surely been eliminated. */
5053 memset(&conf, 0, sizeof conf);
5054 conf.coc_opc = OBJECT_CONF_SET;
5055 conf.coc_inode = inode;
5056 conf.coc_lock = lock;
5057 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
5058 conf.u.coc_layout.lb_len = lock->l_lvb_len;
5059 rc = ll_layout_conf(inode, &conf);
5061 /* refresh layout failed, need to wait */
5062 wait_layout = rc == -EBUSY;
5065 LDLM_LOCK_PUT(lock);
5066 ldlm_lock_decref(lockh, mode);
5068 /* wait for IO to complete if the layout is still being used. */
5070 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
5071 sbi->ll_fsname, PFID(&lli->lli_fid), inode);
5073 memset(&conf, 0, sizeof conf);
5074 conf.coc_opc = OBJECT_CONF_WAIT;
5075 conf.coc_inode = inode;
5076 rc = ll_layout_conf(inode, &conf);
5080 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
5081 sbi->ll_fsname, PFID(&lli->lli_fid), rc);
5087 * Issue layout intent RPC to MDS.
5088 * \param inode [in] file inode
5089 * \param intent [in] layout intent
5091 * \retval 0 on success
5092 * \retval < 0 error code
5094 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
5096 struct ll_inode_info *lli = ll_i2info(inode);
5097 struct ll_sb_info *sbi = ll_i2sbi(inode);
5098 struct md_op_data *op_data;
5099 struct lookup_intent it;
5100 struct ptlrpc_request *req;
5104 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
5105 0, 0, LUSTRE_OPC_ANY, NULL);
5106 if (IS_ERR(op_data))
5107 RETURN(PTR_ERR(op_data));
5109 op_data->op_data = intent;
5110 op_data->op_data_size = sizeof(*intent);
5112 memset(&it, 0, sizeof(it));
5113 it.it_op = IT_LAYOUT;
5114 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
5115 intent->li_opc == LAYOUT_INTENT_TRUNC)
5116 it.it_flags = FMODE_WRITE;
5118 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
5119 sbi->ll_fsname, PFID(&lli->lli_fid), inode);
5121 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
5122 &ll_md_blocking_ast, 0);
5123 if (it.it_request != NULL)
5124 ptlrpc_req_finished(it.it_request);
5125 it.it_request = NULL;
5127 ll_finish_md_op_data(op_data);
5129 /* set lock data in case this is a new lock */
5131 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
5133 ll_intent_drop_lock(&it);
5139 * This function checks whether a LAYOUT lock exists on the client side,
5140 * and enqueues one if it is not already cached.
5142 * This function does not hold the layout lock, so it may be revoked at any time
5143 * after this function returns. Any operation that depends on the layout should be redone
5146 * This function should be called before lov_io_init() to get an up-to-date
5147 * layout version; the caller should save the version number, and after the IO
5148 * is finished call this function again to verify that the layout
5149 * has not changed during the IO.
5151 int ll_layout_refresh(struct inode *inode, __u32 *gen)
5153 struct ll_inode_info *lli = ll_i2info(inode);
5154 struct ll_sb_info *sbi = ll_i2sbi(inode);
5155 struct lustre_handle lockh;
5156 struct layout_intent intent = {
5157 .li_opc = LAYOUT_INTENT_ACCESS,
5159 enum ldlm_mode mode;
5163 *gen = ll_layout_version_get(lli);
5164 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
5168 LASSERT(fid_is_sane(ll_inode2fid(inode)));
5169 LASSERT(S_ISREG(inode->i_mode));
5171 /* take layout lock mutex to enqueue layout lock exclusively. */
5172 mutex_lock(&lli->lli_layout_mutex);
5175 /* the layout lock is usually cached on the local side, so try to
5176 * match it before grabbing the layout lock mutex. */
5177 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5178 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
5179 if (mode != 0) { /* hit cached lock */
5180 rc = ll_layout_lock_set(&lockh, mode, inode);
5186 rc = ll_layout_intent(inode, &intent);
5192 *gen = ll_layout_version_get(lli);
5193 mutex_unlock(&lli->lli_layout_mutex);
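/*
 * Sketch of the calling convention described above (a caller verifying that
 * the layout generation did not change across its IO):
 *
 *	__u32 gen_before, gen_after;
 *
 *	rc = ll_layout_refresh(inode, &gen_before);
 *	... submit IO against the layout tagged with gen_before ...
 *	rc = ll_layout_refresh(inode, &gen_after);
 *	if (rc == 0 && gen_before != gen_after)
 *		... the layout changed while the IO was in flight; redo it ...
 */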
5199 * Issue layout intent RPC indicating where in a file an IO is about to write.
5201 * \param[in] inode file inode.
5202 * \param[in] ext write range with start offset of the file in bytes where
5203 * an IO is about to write, and exclusive end offset in
5206 * \retval 0 on success
5207 * \retval < 0 error code
5209 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5210 struct lu_extent *ext)
5212 struct layout_intent intent = {
5214 .li_extent.e_start = ext->e_start,
5215 .li_extent.e_end = ext->e_end,
5220 rc = ll_layout_intent(inode, &intent);
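/*
 * A caller about to write the byte range [pos, pos + count) would typically
 * build the extent as below ("pos" and "count" are illustrative):
 *
 *	struct lu_extent ext = {
 *		.e_start = pos,
 *		.e_end	 = pos + count,
 *	};
 *
 *	rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE, &ext);
 */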
5226 * This function sends a restore request to the MDT
5228 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5230 struct hsm_user_request *hur;
5234 len = sizeof(struct hsm_user_request) +
5235 sizeof(struct hsm_user_item);
5236 OBD_ALLOC(hur, len);
5240 hur->hur_request.hr_action = HUA_RESTORE;
5241 hur->hur_request.hr_archive_id = 0;
5242 hur->hur_request.hr_flags = 0;
5243 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5244 sizeof(hur->hur_user_item[0].hui_fid));
5245 hur->hur_user_item[0].hui_extent.offset = offset;
5246 hur->hur_user_item[0].hui_extent.length = length;
5247 hur->hur_request.hr_itemcount = 1;
5248 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,