4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2017, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
49 #include <uapi/linux/lustre/lustre_ioctl.h>
50 #include <lustre_swab.h>
52 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
57 struct inode *sp_inode;
62 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
64 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
67 static struct ll_file_data *ll_file_data_get(void)
69 struct ll_file_data *fd;
71 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
75 fd->fd_write_failed = false;
80 static void ll_file_data_put(struct ll_file_data *fd)
83 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
87 * Packs all the attributes into @op_data for the CLOSE RPC.
89 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
90 struct obd_client_handle *och)
94 ll_prep_md_op_data(op_data, inode, NULL, NULL,
95 0, 0, LUSTRE_OPC_ANY, NULL);
97 op_data->op_attr.ia_mode = inode->i_mode;
98 op_data->op_attr.ia_atime = inode->i_atime;
99 op_data->op_attr.ia_mtime = inode->i_mtime;
100 op_data->op_attr.ia_ctime = inode->i_ctime;
101 op_data->op_attr.ia_size = i_size_read(inode);
102 op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
103 ATTR_MTIME | ATTR_MTIME_SET |
105 op_data->op_xvalid |= OP_XVALID_CTIME_SET;
106 op_data->op_attr_blocks = inode->i_blocks;
107 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
108 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
109 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
110 op_data->op_open_handle = och->och_open_handle;
112 if (och->och_flags & FMODE_WRITE &&
113 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
114 /* For HSM: if inode data has been modified, pack it so that
115 * MDT can set data dirty flag in the archive. */
116 op_data->op_bias |= MDS_DATA_MODIFIED;
122 * Perform a close, possibly with a bias.
123 * The meaning of "data" depends on the value of "bias".
125 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
126 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
129 static int ll_close_inode_openhandle(struct inode *inode,
130 struct obd_client_handle *och,
131 enum mds_op_bias bias, void *data)
133 struct obd_export *md_exp = ll_i2mdexp(inode);
134 const struct ll_inode_info *lli = ll_i2info(inode);
135 struct md_op_data *op_data;
136 struct ptlrpc_request *req = NULL;
140 if (class_exp2obd(md_exp) == NULL) {
141 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
142 ll_i2sbi(inode)->ll_fsname, PFID(&lli->lli_fid));
146 OBD_ALLOC_PTR(op_data);
147 /* We leak the openhandle and request here on error, but there is not much
148 * to be done in the OOM case since the app won't retry the close on error either. */
150 GOTO(out, rc = -ENOMEM);
152 ll_prepare_close(inode, op_data, och);
154 case MDS_CLOSE_LAYOUT_MERGE:
155 /* merge blocks from the victim inode */
156 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
157 op_data->op_attr.ia_valid |= ATTR_SIZE;
158 op_data->op_xvalid |= OP_XVALID_BLOCKS;
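/* fallthrough: the merge case also needs the swap/split handling below */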
159 case MDS_CLOSE_LAYOUT_SPLIT:
160 case MDS_CLOSE_LAYOUT_SWAP: {
161 struct split_param *sp = data;
163 LASSERT(data != NULL);
164 op_data->op_bias |= bias;
165 op_data->op_data_version = 0;
166 op_data->op_lease_handle = och->och_lease_handle;
167 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
168 op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
169 op_data->op_mirror_id = sp->sp_mirror_id;
171 op_data->op_fid2 = *ll_inode2fid(data);
176 case MDS_CLOSE_RESYNC_DONE: {
177 struct ll_ioc_lease *ioc = data;
179 LASSERT(data != NULL);
180 op_data->op_attr_blocks +=
181 ioc->lil_count * op_data->op_attr_blocks;
182 op_data->op_attr.ia_valid |= ATTR_SIZE;
183 op_data->op_xvalid |= OP_XVALID_BLOCKS;
184 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
186 op_data->op_lease_handle = och->och_lease_handle;
187 op_data->op_data = &ioc->lil_ids[0];
188 op_data->op_data_size =
189 ioc->lil_count * sizeof(ioc->lil_ids[0]);
193 case MDS_HSM_RELEASE:
194 LASSERT(data != NULL);
195 op_data->op_bias |= MDS_HSM_RELEASE;
196 op_data->op_data_version = *(__u64 *)data;
197 op_data->op_lease_handle = och->och_lease_handle;
198 op_data->op_attr.ia_valid |= ATTR_SIZE;
199 op_data->op_xvalid |= OP_XVALID_BLOCKS;
203 LASSERT(data == NULL);
207 if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
208 op_data->op_xvalid |= OP_XVALID_LAZYSIZE;
209 if (!(op_data->op_xvalid & OP_XVALID_BLOCKS))
210 op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS;
212 rc = md_close(md_exp, op_data, och->och_mod, &req);
213 if (rc != 0 && rc != -EINTR)
214 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
215 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
217 if (rc == 0 && op_data->op_bias & bias) {
218 struct mdt_body *body;
220 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
221 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
225 ll_finish_md_op_data(op_data);
229 md_clear_open_replay_data(md_exp, och);
230 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
233 ptlrpc_req_finished(req); /* This is close request */
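/* Close the cached MDS open handle for the given open mode (read, write or
 * exec), but only if this was the last user of that mode on the inode. */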
237 int ll_md_real_close(struct inode *inode, fmode_t fmode)
239 struct ll_inode_info *lli = ll_i2info(inode);
240 struct obd_client_handle **och_p;
241 struct obd_client_handle *och;
246 if (fmode & FMODE_WRITE) {
247 och_p = &lli->lli_mds_write_och;
248 och_usecount = &lli->lli_open_fd_write_count;
249 } else if (fmode & FMODE_EXEC) {
250 och_p = &lli->lli_mds_exec_och;
251 och_usecount = &lli->lli_open_fd_exec_count;
253 LASSERT(fmode & FMODE_READ);
254 och_p = &lli->lli_mds_read_och;
255 och_usecount = &lli->lli_open_fd_read_count;
258 mutex_lock(&lli->lli_och_mutex);
259 if (*och_usecount > 0) {
260 /* There are still users of this handle, so skip
262 mutex_unlock(&lli->lli_och_mutex);
268 mutex_unlock(&lli->lli_och_mutex);
271 /* There might be a race and this handle may already
273 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
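/* Per file descriptor close: drop the group lock and lease (if any) held on
 * this fd, decrement the per-mode open count, and close the MDS open handle
 * unless a cached OPEN lock lets us keep it for later reuse. */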
279 static int ll_md_close(struct inode *inode, struct file *file)
281 union ldlm_policy_data policy = {
282 .l_inodebits = { MDS_INODELOCK_OPEN },
284 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
285 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
286 struct ll_inode_info *lli = ll_i2info(inode);
287 struct lustre_handle lockh;
288 enum ldlm_mode lockmode;
292 /* clear group lock, if present */
293 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
294 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
296 if (fd->fd_lease_och != NULL) {
299 /* Usually the lease is not released when the
300 * application crashes, so we need to release it here. */
301 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
302 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
303 PFID(&lli->lli_fid), rc, lease_broken);
305 fd->fd_lease_och = NULL;
308 if (fd->fd_och != NULL) {
309 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
314 /* Let's see if we have a good enough OPEN lock on the file and if
315 we can skip talking to the MDS */
316 mutex_lock(&lli->lli_och_mutex);
317 if (fd->fd_omode & FMODE_WRITE) {
319 LASSERT(lli->lli_open_fd_write_count);
320 lli->lli_open_fd_write_count--;
321 } else if (fd->fd_omode & FMODE_EXEC) {
323 LASSERT(lli->lli_open_fd_exec_count);
324 lli->lli_open_fd_exec_count--;
327 LASSERT(lli->lli_open_fd_read_count);
328 lli->lli_open_fd_read_count--;
330 mutex_unlock(&lli->lli_och_mutex);
332 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
333 LDLM_IBITS, &policy, lockmode, &lockh))
334 rc = ll_md_real_close(inode, fd->fd_omode);
337 LUSTRE_FPRIVATE(file) = NULL;
338 ll_file_data_put(fd);
343 /* While this returns an error code, the caller of fput() does not check it,
344 * so we need to make every effort to clean up all of our state here. Also,
345 * applications rarely check close errors, and even if an error is returned
346 * they will not retry the close call.
348 int ll_file_release(struct inode *inode, struct file *file)
350 struct ll_file_data *fd;
351 struct ll_sb_info *sbi = ll_i2sbi(inode);
352 struct ll_inode_info *lli = ll_i2info(inode);
356 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
357 PFID(ll_inode2fid(inode)), inode);
359 if (inode->i_sb->s_root != file_dentry(file))
360 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
361 fd = LUSTRE_FPRIVATE(file);
364 /* The last ref on @file may not belong to the owner pid of statahead,
365 * because parent and child processes can share the same file handle. */
366 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
367 ll_deauthorize_statahead(inode, fd);
369 if (inode->i_sb->s_root == file_dentry(file)) {
370 LUSTRE_FPRIVATE(file) = NULL;
371 ll_file_data_put(fd);
375 if (!S_ISDIR(inode->i_mode)) {
376 if (lli->lli_clob != NULL)
377 lov_read_and_clear_async_rc(lli->lli_clob);
378 lli->lli_async_rc = 0;
381 rc = ll_md_close(inode, file);
383 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
384 libcfs_debug_dumplog();
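/* Filler for read_cache_page(): copy the inline (Data-on-MDT) reply data into
 * the page, zero the tail beyond lnb_len, and mark the page up to date. */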
389 static inline int ll_dom_readpage(void *data, struct page *page)
391 struct niobuf_local *lnb = data;
394 kaddr = ll_kmap_atomic(page, KM_USER0);
395 memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
396 if (lnb->lnb_len < PAGE_SIZE)
397 memset(kaddr + lnb->lnb_len, 0,
398 PAGE_SIZE - lnb->lnb_len);
399 flush_dcache_page(page);
400 SetPageUptodate(page);
401 ll_kunmap_atomic(kaddr, KM_USER0);
407 void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
408 struct lookup_intent *it)
410 struct ll_inode_info *lli = ll_i2info(inode);
411 struct cl_object *obj = lli->lli_clob;
412 struct address_space *mapping = inode->i_mapping;
414 struct niobuf_remote *rnb;
416 unsigned long index, start;
417 struct niobuf_local lnb;
424 if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
428 rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
429 if (rnb == NULL || rnb->rnb_len == 0)
432 /* LU-11595: The server may return the whole file, which is always OK, or
433 * it may return just the file tail, whose offset must be aligned with the
434 * client PAGE_SIZE to be usable on this client. If the server's PAGE_SIZE
435 * is smaller, the offset may be unaligned and that data is simply ignored.
437 if (rnb->rnb_offset % PAGE_SIZE)
440 /* The server returns the whole file or just the file tail, whichever fits
441 * in the reply buffer; in both cases the returned data should extend to the inode size.
443 if (rnb->rnb_offset + rnb->rnb_len < i_size_read(inode)) {
444 CERROR("%s: server returns off/len %llu/%u < i_size %llu\n",
445 ll_i2sbi(inode)->ll_fsname, rnb->rnb_offset,
446 rnb->rnb_len, i_size_read(inode));
450 CDEBUG(D_INFO, "Get data along with open at %llu len %i, i_size %llu\n",
451 rnb->rnb_offset, rnb->rnb_len, i_size_read(inode));
453 data = (char *)rnb + sizeof(*rnb);
455 lnb.lnb_file_offset = rnb->rnb_offset;
456 start = lnb.lnb_file_offset / PAGE_SIZE;
458 LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
459 lnb.lnb_page_offset = 0;
461 lnb.lnb_data = data + (index << PAGE_SHIFT);
462 lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
463 if (lnb.lnb_len > PAGE_SIZE)
464 lnb.lnb_len = PAGE_SIZE;
466 vmpage = read_cache_page(mapping, index + start,
467 ll_dom_readpage, &lnb);
468 if (IS_ERR(vmpage)) {
469 CWARN("%s: cannot fill page %lu for "DFID
470 " with data: rc = %li\n",
471 ll_i2sbi(inode)->ll_fsname, index + start,
472 PFID(lu_object_fid(&obj->co_lu)),
478 } while (rnb->rnb_len > (index << PAGE_SHIFT));
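/* Send an intent OPEN request to the MDS for @de, packing the name only when
 * the server cannot open by FID, and set up the returned lock and any inline
 * data on success. */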
482 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
483 struct lookup_intent *itp)
485 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
486 struct dentry *parent = de->d_parent;
489 struct md_op_data *op_data;
490 struct ptlrpc_request *req = NULL;
494 LASSERT(parent != NULL);
495 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
497 /* if the server supports open-by-fid, or the file name is invalid, don't
498 * pack the name in the open request */
499 if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_OPEN_BY_NAME) ||
500 !(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID)) {
502 len = de->d_name.len;
503 name = kmalloc(len + 1, GFP_NOFS);
508 spin_lock(&de->d_lock);
509 if (len != de->d_name.len) {
510 spin_unlock(&de->d_lock);
514 memcpy(name, de->d_name.name, len);
516 spin_unlock(&de->d_lock);
518 if (!lu_name_is_valid_2(name, len)) {
524 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
525 name, len, 0, LUSTRE_OPC_ANY, NULL);
526 if (IS_ERR(op_data)) {
528 RETURN(PTR_ERR(op_data));
530 op_data->op_data = lmm;
531 op_data->op_data_size = lmmsize;
533 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
534 &ll_md_blocking_ast, 0);
536 ll_finish_md_op_data(op_data);
538 /* The reason for keeping our own exit path is to avoid flooding
539 * the log with -ESTALE error messages.
541 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
542 it_open_error(DISP_OPEN_OPEN, itp))
544 ll_release_openhandle(de, itp);
548 if (it_disposition(itp, DISP_LOOKUP_NEG))
549 GOTO(out, rc = -ENOENT);
551 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
552 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
553 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
557 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
559 if (!rc && itp->it_lock_mode) {
560 struct lustre_handle handle = {.cookie = itp->it_lock_handle};
561 struct ldlm_lock *lock;
562 bool has_dom_bit = false;
564 /* If we got a lock back and it has a LOOKUP bit set,
565 * make sure the dentry is marked as valid so we can find it.
566 * We don't need to care about actual hashing since other parts
567 * of the kernel will deal with that later.
569 lock = ldlm_handle2lock(&handle);
571 has_dom_bit = ldlm_has_dom(lock);
572 if (lock->l_policy_data.l_inodebits.bits &
573 MDS_INODELOCK_LOOKUP)
574 d_lustre_revalidate(de);
578 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
580 ll_dom_finish_open(de->d_inode, req, itp);
584 ptlrpc_req_finished(req);
585 ll_intent_drop_lock(itp);
587 /* We did open by fid, but by the time we got to the server,
588 * the object disappeared. If this is a create, we cannot really
589 * tell the userspace that the file it was trying to create
590 * does not exist. Instead let's return -ESTALE, and the VFS will
591 * retry the create with LOOKUP_REVAL that we are going to catch
592 * in ll_revalidate_dentry() and use lookup then.
594 if (rc == -ENOENT && itp->it_op & IT_CREAT)
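/* Fill the client open handle from the MDS reply (open handle, FID, lease/lock
 * handle, flags) and register it for open replay. */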
600 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
601 struct obd_client_handle *och)
603 struct mdt_body *body;
605 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
606 och->och_open_handle = body->mbo_open_handle;
607 och->och_fid = body->mbo_fid1;
608 och->och_lease_handle.cookie = it->it_lock_handle;
609 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
610 och->och_flags = it->it_flags;
612 return md_set_open_replay_data(md_exp, och, it);
615 static int ll_local_open(struct file *file, struct lookup_intent *it,
616 struct ll_file_data *fd, struct obd_client_handle *och)
618 struct inode *inode = file_inode(file);
621 LASSERT(!LUSTRE_FPRIVATE(file));
628 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
633 LUSTRE_FPRIVATE(file) = fd;
634 ll_readahead_init(inode, &fd->fd_ras);
635 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
637 /* ll_cl_context initialize */
638 rwlock_init(&fd->fd_lock);
639 INIT_LIST_HEAD(&fd->fd_lccs);
644 /* Open a file, and (for the very first open) create objects on the OSTs at
645 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
646 * creation or open until ll_lov_setstripe() ioctl is called.
648 * If we already have the stripe MD locally then we don't request it in
649 * md_open(), by passing a lmm_size = 0.
651 * It is up to the application to ensure no other processes open this file
652 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
653 * used. We might be able to avoid races of that sort by getting lli_open_sem
654 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
655 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
657 int ll_file_open(struct inode *inode, struct file *file)
659 struct ll_inode_info *lli = ll_i2info(inode);
660 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
661 .it_flags = file->f_flags };
662 struct obd_client_handle **och_p = NULL;
663 __u64 *och_usecount = NULL;
664 struct ll_file_data *fd;
668 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
669 PFID(ll_inode2fid(inode)), inode, file->f_flags);
671 it = file->private_data; /* XXX: compat macro */
672 file->private_data = NULL; /* prevent ll_local_open assertion */
674 fd = ll_file_data_get();
676 GOTO(out_nofiledata, rc = -ENOMEM);
679 if (S_ISDIR(inode->i_mode))
680 ll_authorize_statahead(inode, fd);
682 if (inode->i_sb->s_root == file_dentry(file)) {
683 LUSTRE_FPRIVATE(file) = fd;
687 if (!it || !it->it_disposition) {
688 /* Convert f_flags into access mode. We cannot use file->f_mode,
689 * because everything but O_ACCMODE mask was stripped from
691 if ((oit.it_flags + 1) & O_ACCMODE)
693 if (file->f_flags & O_TRUNC)
694 oit.it_flags |= FMODE_WRITE;
696 /* The kernel only calls f_op->open in dentry_open. filp_open calls
697 * dentry_open after a call to open_namei that checks permissions.
698 * Only nfsd_open calls dentry_open directly without checking
699 * permissions, and because of that the code below is safe.
701 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
702 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
704 /* We do not want O_EXCL here, presumably we opened the file
705 * already? XXX - NFS implications? */
706 oit.it_flags &= ~O_EXCL;
708 /* bug20584: if "it_flags" contains O_CREAT, the file will be
709 * created if necessary, so "IT_CREAT" should be set to stay
710 * consistent with it */
711 if (oit.it_flags & O_CREAT)
712 oit.it_op |= IT_CREAT;
718 /* Let's see if we have file open on MDS already. */
719 if (it->it_flags & FMODE_WRITE) {
720 och_p = &lli->lli_mds_write_och;
721 och_usecount = &lli->lli_open_fd_write_count;
722 } else if (it->it_flags & FMODE_EXEC) {
723 och_p = &lli->lli_mds_exec_och;
724 och_usecount = &lli->lli_open_fd_exec_count;
726 och_p = &lli->lli_mds_read_och;
727 och_usecount = &lli->lli_open_fd_read_count;
730 mutex_lock(&lli->lli_och_mutex);
731 if (*och_p) { /* Open handle is present */
732 if (it_disposition(it, DISP_OPEN_OPEN)) {
733 /* Well, there's an extra open request that we do not need;
734 let's close it somehow. This will decref the request. */
735 rc = it_open_error(DISP_OPEN_OPEN, it);
737 mutex_unlock(&lli->lli_och_mutex);
738 GOTO(out_openerr, rc);
741 ll_release_openhandle(file_dentry(file), it);
745 rc = ll_local_open(file, it, fd, NULL);
748 mutex_unlock(&lli->lli_och_mutex);
749 GOTO(out_openerr, rc);
752 LASSERT(*och_usecount == 0);
753 if (!it->it_disposition) {
754 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
755 /* We cannot just request a lock handle now; the new ELC code
756 means that one of the other OPEN locks for this file
757 could be cancelled, and since the blocking AST handler
758 would attempt to grab och_mutex as well, that would
759 result in a deadlock */
760 mutex_unlock(&lli->lli_och_mutex);
762 * Normally called under two situations:
764 * 2. A race/condition on MDS resulting in no open
765 * handle to be returned from LOOKUP|OPEN request,
766 * for example if the target entry was a symlink.
768 * Only fetch MDS_OPEN_LOCK if this is in the NFS path,
769 * marked by a bit set in ll_iget_for_nfs. Clear the
770 * bit so that it does not confuse later callers.
772 * NB: when ldd is NULL, it must have come via the normal
773 * lookup path only, since ll_iget_for_nfs always calls
776 if (ldd && ldd->lld_nfs_dentry) {
777 ldd->lld_nfs_dentry = 0;
778 it->it_flags |= MDS_OPEN_LOCK;
782 * Always specify MDS_OPEN_BY_FID because we don't want
783 * to get file with different fid.
785 it->it_flags |= MDS_OPEN_BY_FID;
786 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
789 GOTO(out_openerr, rc);
793 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
795 GOTO(out_och_free, rc = -ENOMEM);
799 /* md_intent_lock() didn't get a request ref if there was an
800 * open error, so don't do cleanup on the request here
802 /* XXX (green): Shouldn't we bail out on any error here, not
803 * just an open error? */
804 rc = it_open_error(DISP_OPEN_OPEN, it);
806 GOTO(out_och_free, rc);
808 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
809 "inode %p: disposition %x, status %d\n", inode,
810 it_disposition(it, ~0), it->it_status);
812 rc = ll_local_open(file, it, fd, *och_p);
814 GOTO(out_och_free, rc);
816 mutex_unlock(&lli->lli_och_mutex);
819 /* Must do this outside the lli_och_mutex lock to prevent a deadlock where
820 a different kind of OPEN lock for this same inode gets cancelled
821 by ldlm_cancel_lru */
822 if (!S_ISREG(inode->i_mode))
823 GOTO(out_och_free, rc);
825 cl_lov_delay_create_clear(&file->f_flags);
826 GOTO(out_och_free, rc);
830 if (och_p && *och_p) {
831 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
832 *och_p = NULL; /* OBD_FREE writes some magic there */
835 mutex_unlock(&lli->lli_och_mutex);
838 if (lli->lli_opendir_key == fd)
839 ll_deauthorize_statahead(inode, fd);
841 ll_file_data_put(fd);
843 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
847 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
848 ptlrpc_req_finished(it->it_request);
849 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
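/* Blocking AST for lease DLM locks: cancel the lock when it blocks another
 * request; nothing extra is needed on the CANCELING callback. */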
855 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
856 struct ldlm_lock_desc *desc, void *data, int flag)
859 struct lustre_handle lockh;
863 case LDLM_CB_BLOCKING:
864 ldlm_lock2handle(lock, &lockh);
865 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
867 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
871 case LDLM_CB_CANCELING:
879 * When setting a lease on a file, we take ownership of the lli_mds_*_och
880 * and save it as fd->fd_och so as to force the client to reopen the file
881 * even if it already has an open lock in cache.
883 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
884 struct lustre_handle *old_open_handle)
886 struct ll_inode_info *lli = ll_i2info(inode);
887 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
888 struct obd_client_handle **och_p;
893 /* Get the openhandle of the file */
894 mutex_lock(&lli->lli_och_mutex);
895 if (fd->fd_lease_och != NULL)
896 GOTO(out_unlock, rc = -EBUSY);
898 if (fd->fd_och == NULL) {
899 if (file->f_mode & FMODE_WRITE) {
900 LASSERT(lli->lli_mds_write_och != NULL);
901 och_p = &lli->lli_mds_write_och;
902 och_usecount = &lli->lli_open_fd_write_count;
904 LASSERT(lli->lli_mds_read_och != NULL);
905 och_p = &lli->lli_mds_read_och;
906 och_usecount = &lli->lli_open_fd_read_count;
909 if (*och_usecount > 1)
910 GOTO(out_unlock, rc = -EBUSY);
917 *old_open_handle = fd->fd_och->och_open_handle;
921 mutex_unlock(&lli->lli_och_mutex);
926 * Release ownership on lli_mds_*_och when putting back a file lease.
928 static int ll_lease_och_release(struct inode *inode, struct file *file)
930 struct ll_inode_info *lli = ll_i2info(inode);
931 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
932 struct obd_client_handle **och_p;
933 struct obd_client_handle *old_och = NULL;
938 mutex_lock(&lli->lli_och_mutex);
939 if (file->f_mode & FMODE_WRITE) {
940 och_p = &lli->lli_mds_write_och;
941 och_usecount = &lli->lli_open_fd_write_count;
943 och_p = &lli->lli_mds_read_och;
944 och_usecount = &lli->lli_open_fd_read_count;
947 /* The file may have been opened by another process (broken lease) so
948 * *och_p is not NULL. In this case we should simply increase the usecount
951 if (*och_p != NULL) {
952 old_och = fd->fd_och;
959 mutex_unlock(&lli->lli_och_mutex);
962 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
968 * Acquire a lease and open the file.
970 static struct obd_client_handle *
971 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
974 struct lookup_intent it = { .it_op = IT_OPEN };
975 struct ll_sb_info *sbi = ll_i2sbi(inode);
976 struct md_op_data *op_data;
977 struct ptlrpc_request *req = NULL;
978 struct lustre_handle old_open_handle = { 0 };
979 struct obd_client_handle *och = NULL;
984 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
985 RETURN(ERR_PTR(-EINVAL));
988 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
989 RETURN(ERR_PTR(-EPERM));
991 rc = ll_lease_och_acquire(inode, file, &old_open_handle);
998 RETURN(ERR_PTR(-ENOMEM));
1000 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
1001 LUSTRE_OPC_ANY, NULL);
1002 if (IS_ERR(op_data))
1003 GOTO(out, rc = PTR_ERR(op_data));
1005 /* To tell the MDT this openhandle is from the same owner */
1006 op_data->op_open_handle = old_open_handle;
1008 it.it_flags = fmode | open_flags;
1009 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
1010 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
1011 &ll_md_blocking_lease_ast,
1012 /* LDLM_FL_NO_LRU: Do not put the lease lock into the LRU list, otherwise
1013 * it can be cancelled, which may mislead applications into thinking the lease is
1015 * LDLM_FL_EXCL: Set this flag so that it won't be matched by a normal
1016 * open in ll_md_blocking_ast(). Otherwise, as ll_md_blocking_lease_ast()
1017 * doesn't deal with the openhandle, a normal openhandle would be leaked. */
1018 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
1019 ll_finish_md_op_data(op_data);
1020 ptlrpc_req_finished(req);
1022 GOTO(out_release_it, rc);
1024 if (it_disposition(&it, DISP_LOOKUP_NEG))
1025 GOTO(out_release_it, rc = -ENOENT);
1027 rc = it_open_error(DISP_OPEN_OPEN, &it);
1029 GOTO(out_release_it, rc);
1031 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
1032 ll_och_fill(sbi->ll_md_exp, &it, och);
1034 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
1035 GOTO(out_close, rc = -EOPNOTSUPP);
1037 /* lease already acquired, handle the lease lock */
1038 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
1039 if (it.it_lock_mode == 0 ||
1040 it.it_lock_bits != MDS_INODELOCK_OPEN) {
1041 /* an open lock must be returned for a lease */
1042 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
1043 PFID(ll_inode2fid(inode)), it.it_lock_mode,
1045 GOTO(out_close, rc = -EPROTO);
1048 ll_intent_release(&it);
1052 /* Cancel open lock */
1053 if (it.it_lock_mode != 0) {
1054 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
1056 it.it_lock_mode = 0;
1057 och->och_lease_handle.cookie = 0ULL;
1059 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
1061 CERROR("%s: error closing file "DFID": %d\n",
1062 sbi->ll_fsname, PFID(&ll_i2info(inode)->lli_fid), rc2);
1063 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
1065 ll_intent_release(&it);
1069 RETURN(ERR_PTR(rc));
1073 * Check whether a layout swap can be done between two inodes.
1075 * \param[in] inode1 First inode to check
1076 * \param[in] inode2 Second inode to check
1078 * \retval 0 on success, layout swap can be performed between both inodes
1079 * \retval negative error code if requirements are not met
1081 static int ll_check_swap_layouts_validity(struct inode *inode1,
1082 struct inode *inode2)
1084 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
1087 if (inode_permission(inode1, MAY_WRITE) ||
1088 inode_permission(inode2, MAY_WRITE))
1091 if (inode1->i_sb != inode2->i_sb)
1097 static int ll_swap_layouts_close(struct obd_client_handle *och,
1098 struct inode *inode, struct inode *inode2)
1100 const struct lu_fid *fid1 = ll_inode2fid(inode);
1101 const struct lu_fid *fid2;
1105 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
1106 ll_i2sbi(inode)->ll_fsname, PFID(fid1));
1108 rc = ll_check_swap_layouts_validity(inode, inode2);
1110 GOTO(out_free_och, rc);
1112 /* We now know that inode2 is a lustre inode */
1113 fid2 = ll_inode2fid(inode2);
1115 rc = lu_fid_cmp(fid1, fid2);
1117 GOTO(out_free_och, rc = -EINVAL);
1119 /* Close the file and {swap,merge} layouts between inode & inode2.
1120 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
1121 * because we still need it to pack l_remote_handle to MDT. */
1122 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
1125 och = NULL; /* freed in ll_close_inode_openhandle() */
1135 * Release the lease and close the file.
1136 * It will check whether the lease has ever been broken.
1138 static int ll_lease_close_intent(struct obd_client_handle *och,
1139 struct inode *inode,
1140 bool *lease_broken, enum mds_op_bias bias,
1143 struct ldlm_lock *lock;
1144 bool cancelled = true;
1148 lock = ldlm_handle2lock(&och->och_lease_handle);
1150 lock_res_and_lock(lock);
1151 cancelled = ldlm_is_cancel(lock);
1152 unlock_res_and_lock(lock);
1153 LDLM_LOCK_PUT(lock);
1156 CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1157 PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1159 if (lease_broken != NULL)
1160 *lease_broken = cancelled;
1162 if (!cancelled && !bias)
1163 ldlm_cli_cancel(&och->och_lease_handle, 0);
1165 if (cancelled) { /* no need to execute the intent */
1170 rc = ll_close_inode_openhandle(inode, och, bias, data);
1174 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1177 return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1181 * After the lease is taken, send the MDS_REINT_RESYNC RPC to the MDT
1183 static int ll_lease_file_resync(struct obd_client_handle *och,
1184 struct inode *inode, unsigned long arg)
1186 struct ll_sb_info *sbi = ll_i2sbi(inode);
1187 struct md_op_data *op_data;
1188 struct ll_ioc_lease_id ioc;
1189 __u64 data_version_unused;
1193 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1194 LUSTRE_OPC_ANY, NULL);
1195 if (IS_ERR(op_data))
1196 RETURN(PTR_ERR(op_data));
1198 if (copy_from_user(&ioc, (struct ll_ioc_lease_id __user *)arg,
1202 /* Before starting file resync, it's necessary to clean up the page cache
1203 * in client memory; otherwise, once the layout version is increased,
1204 * writing back cached data will be denied by the OSTs. */
1205 rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1209 op_data->op_lease_handle = och->och_lease_handle;
1210 op_data->op_mirror_id = ioc.lil_mirror_id;
1211 rc = md_file_resync(sbi->ll_md_exp, op_data);
1217 ll_finish_md_op_data(op_data);
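/* Merge the MDS-cached attributes in ll_inode_info with the size, blocks and
 * timestamps obtained from the OSTs, and store the result in the VFS inode. */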
1221 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1223 struct ll_inode_info *lli = ll_i2info(inode);
1224 struct cl_object *obj = lli->lli_clob;
1225 struct cl_attr *attr = vvp_env_thread_attr(env);
1233 ll_inode_size_lock(inode);
1235 /* Merge the timestamps most recently obtained from the MDS with
1236 * the timestamps obtained from the OSTs.
1238 * Do not overwrite the inode's atime, because it may be refreshed
1239 * by the file_accessed() function. If the read was served by cached
1240 * data, there is no RPC to be sent, so the atime may not be
1241 * transferred to the OSTs at all. The MDT only updates atime at close
1242 * time if it is at least 'mdd.*.atime_diff' older.
1243 * All in all, atime in Lustre does not strictly comply with
1244 * POSIX. Solving this would require sending an RPC to the MDT for each
1245 * read, which would hurt performance.
1247 if (inode->i_atime.tv_sec < lli->lli_atime ||
1248 lli->lli_update_atime) {
1249 inode->i_atime.tv_sec = lli->lli_atime;
1250 lli->lli_update_atime = 0;
1252 inode->i_mtime.tv_sec = lli->lli_mtime;
1253 inode->i_ctime.tv_sec = lli->lli_ctime;
1255 mtime = inode->i_mtime.tv_sec;
1256 atime = inode->i_atime.tv_sec;
1257 ctime = inode->i_ctime.tv_sec;
1259 cl_object_attr_lock(obj);
1260 if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1263 rc = cl_object_attr_get(env, obj, attr);
1264 cl_object_attr_unlock(obj);
1267 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1269 if (atime < attr->cat_atime)
1270 atime = attr->cat_atime;
1272 if (ctime < attr->cat_ctime)
1273 ctime = attr->cat_ctime;
1275 if (mtime < attr->cat_mtime)
1276 mtime = attr->cat_mtime;
1278 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1279 PFID(&lli->lli_fid), attr->cat_size);
1281 i_size_write(inode, attr->cat_size);
1282 inode->i_blocks = attr->cat_blocks;
1284 inode->i_mtime.tv_sec = mtime;
1285 inode->i_atime.tv_sec = atime;
1286 inode->i_ctime.tv_sec = ctime;
1289 ll_inode_size_unlock(inode);
1295 * Set the designated mirror for I/O.
1297 * So far only read, write, and truncate support issuing I/O to a
1298 * designated mirror.
1300 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1302 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1304 /* clear the layout version for generic (non-resync) I/O in case it carries
1305 * a stale layout version due to an I/O restart */
1306 io->ci_layout_version = 0;
1308 /* FLR: disable non-delay for designated mirror I/O because obviously
1309 * only one mirror is available */
1310 if (fd->fd_designated_mirror > 0) {
1312 io->ci_designated_mirror = fd->fd_designated_mirror;
1313 io->ci_layout_version = fd->fd_layout_version;
1316 CDEBUG(D_VFSTRACE, "%s: designated mirror: %d\n",
1317 file->f_path.dentry->d_name.name, io->ci_designated_mirror);
1320 static bool file_is_noatime(const struct file *file)
1322 const struct vfsmount *mnt = file->f_path.mnt;
1323 const struct inode *inode = file_inode((struct file *)file);
1325 /* Adapted from file_accessed() and touch_atime().*/
1326 if (file->f_flags & O_NOATIME)
1329 if (inode->i_flags & S_NOATIME)
1332 if (IS_NOATIME(inode))
1335 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1338 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1341 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
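/* Initialize a cl_io from the file's open flags: nonblock/append/sync hints,
 * lock policy, noatime handling and FLR mirror selection. */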
1347 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1349 struct inode *inode = file_inode(file);
1350 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1352 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1353 io->ci_lock_no_expand = fd->ll_lock_no_expand;
1355 if (iot == CIT_WRITE) {
1356 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1357 io->u.ci_wr.wr_sync = !!(file->f_flags & O_SYNC ||
1358 file->f_flags & O_DIRECT ||
1361 io->ci_obj = ll_i2info(inode)->lli_clob;
1362 io->ci_lockreq = CILR_MAYBE;
1363 if (ll_file_nolock(file)) {
1364 io->ci_lockreq = CILR_NEVER;
1365 io->ci_no_srvlock = 1;
1366 } else if (file->f_flags & O_APPEND) {
1367 io->ci_lockreq = CILR_MANDATORY;
1369 io->ci_noatime = file_is_noatime(file);
1371 /* FLR: only use non-delay I/O for read, as there is only one
1372 * available mirror for write. */
1373 io->ci_ndelay = !(iot == CIT_WRITE);
1375 ll_io_set_mirror(io, file);
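/* Record this I/O in the per-inode file heat statistics (one sample plus the
 * byte count), unless file heat is disabled for the filesystem or the inode. */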
1378 static void ll_heat_add(struct inode *inode, enum cl_io_type iot,
1381 struct ll_inode_info *lli = ll_i2info(inode);
1382 struct ll_sb_info *sbi = ll_i2sbi(inode);
1383 enum obd_heat_type sample_type;
1384 enum obd_heat_type iobyte_type;
1385 __u64 now = ktime_get_real_seconds();
1387 if (!ll_sbi_has_file_heat(sbi) ||
1388 lli->lli_heat_flags & LU_HEAT_FLAG_OFF)
1391 if (iot == CIT_READ) {
1392 sample_type = OBD_HEAT_READSAMPLE;
1393 iobyte_type = OBD_HEAT_READBYTE;
1394 } else if (iot == CIT_WRITE) {
1395 sample_type = OBD_HEAT_WRITESAMPLE;
1396 iobyte_type = OBD_HEAT_WRITEBYTE;
1401 spin_lock(&lli->lli_heat_lock);
1402 obd_heat_add(&lli->lli_heat_instances[sample_type], now, 1,
1403 sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1404 obd_heat_add(&lli->lli_heat_instances[iobyte_type], now, count,
1405 sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1406 spin_unlock(&lli->lli_heat_lock);
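/* Common body for read/write/splice: build the cl_io, take the range lock for
 * writes and direct reads, run the I/O loop, and restart the I/O when cl_io
 * requests it (io->ci_need_restart). */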
1410 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1411 struct file *file, enum cl_io_type iot,
1412 loff_t *ppos, size_t count)
1414 struct vvp_io *vio = vvp_env_io(env);
1415 struct inode *inode = file_inode(file);
1416 struct ll_inode_info *lli = ll_i2info(inode);
1417 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1418 struct range_lock range;
1422 unsigned retried = 0;
1423 bool restarted = false;
1427 CDEBUG(D_VFSTRACE, "%s: %s ppos: %llu, count: %zu\n",
1428 file_dentry(file)->d_name.name,
1429 iot == CIT_READ ? "read" : "write", *ppos, count);
1432 io = vvp_env_thread_io(env);
1433 ll_io_init(io, file, iot);
1434 io->ci_ndelay_tried = retried;
1436 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1437 bool range_locked = false;
1439 if (file->f_flags & O_APPEND)
1440 range_lock_init(&range, 0, LUSTRE_EOF);
1442 range_lock_init(&range, *ppos, *ppos + count - 1);
1444 vio->vui_fd = LUSTRE_FPRIVATE(file);
1445 vio->vui_io_subtype = args->via_io_subtype;
1447 switch (vio->vui_io_subtype) {
1449 vio->vui_iter = args->u.normal.via_iter;
1450 vio->vui_iocb = args->u.normal.via_iocb;
1451 /* Direct IO reads must also take the range lock,
1452 * or multiple reads will try to work on the same pages.
1453 * See LU-6227 for details. */
1454 if (((iot == CIT_WRITE) ||
1455 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1456 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1457 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1459 rc = range_lock(&lli->lli_write_tree, &range);
1463 range_locked = true;
1467 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1468 vio->u.splice.vui_flags = args->u.splice.via_flags;
1471 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1475 ll_cl_add(file, env, io, LCC_RW);
1476 rc = cl_io_loop(env, io);
1477 ll_cl_remove(file, env);
1480 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1482 range_unlock(&lli->lli_write_tree, &range);
1485 /* cl_io_rw_init() handled IO */
1489 if (io->ci_nob > 0) {
1490 result += io->ci_nob;
1491 count -= io->ci_nob;
1492 *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1494 /* prepare IO restart */
1495 if (count > 0 && args->via_io_subtype == IO_NORMAL)
1496 args->u.normal.via_iter = vio->vui_iter;
1499 cl_io_fini(env, io);
1502 "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1503 file->f_path.dentry->d_name.name,
1504 iot, rc, result, io->ci_need_restart);
1506 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1508 "%s: restart %s from %lld, count: %zu, ret: %zd, rc: %d\n",
1509 file_dentry(file)->d_name.name,
1510 iot == CIT_READ ? "read" : "write",
1511 *ppos, count, result, rc);
1512 /* preserve the tried count for FLR */
1513 retried = io->ci_ndelay_tried;
1518 if (iot == CIT_READ) {
1520 ll_stats_ops_tally(ll_i2sbi(inode),
1521 LPROC_LL_READ_BYTES, result);
1522 } else if (iot == CIT_WRITE) {
1524 ll_stats_ops_tally(ll_i2sbi(inode),
1525 LPROC_LL_WRITE_BYTES, result);
1526 fd->fd_write_failed = false;
1527 } else if (result == 0 && rc == 0) {
1530 fd->fd_write_failed = true;
1532 fd->fd_write_failed = false;
1533 } else if (rc != -ERESTARTSYS) {
1534 fd->fd_write_failed = true;
1538 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1540 ll_heat_add(inode, iot, result);
1542 RETURN(result > 0 ? result : rc);
1546 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1547 * especially for small I/O.
1549 * To serve a read request, CLIO has to create and initialize a cl_io and
1550 * then request a DLM lock. This has turned out to have significant overhead
1551 * and dramatically affects the performance of small I/O.
1553 * It's not necessary to create a cl_io for each I/O. With the help of read
1554 * ahead, most of the pages being read are already in the memory cache and we
1555 * can read those pages directly: if the pages exist, the corresponding DLM
1556 * lock must exist, so the page content must be valid.
1558 * In the fast read implementation, llite speculatively finds and reads pages
1559 * in the memory cache. There are three scenarios for fast read:
1560 * - If the page exists and is uptodate, the kernel VM will provide the data
1561 * and CLIO won't be involved;
1562 * - If the page was brought into memory by read ahead, it will be exported
1563 * and the read ahead parameters will be updated;
1564 * - Otherwise the page is not in memory and we can't do fast read. Therefore,
1565 * it will fall back and invoke a normal read, i.e., a cl_io will be created
1566 * and a DLM lock will be requested.
1568 * POSIX compliance: the POSIX standard states that read is intended to be atomic.
1569 * The Lustre read implementation is in line with the Linux kernel read
1570 * implementation and neither of them complies with the POSIX standard in this
1571 * matter. Fast read doesn't make the situation worse on a single node, but it may
1572 * interleave write results from multiple nodes due to the short read handling in ll_file_aio_read().
1574 * \param env - lu_env
1575 * \param iocb - kiocb from kernel
1576 * \param iter - user space buffers where the data will be copied
1578 * \retval - number of bytes read, or error code if an error occurred.
1581 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1585 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1588 /* NB: we can't do direct IO for fast read because it will need a lock
1589 * to make IO engine happy. */
1590 if (iocb->ki_filp->f_flags & O_DIRECT)
1593 result = generic_file_read_iter(iocb, iter);
1595 /* If the first page is not in the cache, generic_file_read_iter() will
1596 * return -ENODATA.
1597 * See the corresponding code in ll_readpage(). */
1598 if (result == -ENODATA)
1602 ll_heat_add(file_inode(iocb->ki_filp), CIT_READ, result);
1603 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1604 LPROC_LL_READ_BYTES, result);
1611 * Read from a file (through the page cache).
1613 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1616 struct vvp_io_args *args;
1621 ll_ras_enter(iocb->ki_filp);
1623 result = ll_do_fast_read(iocb, to);
1624 if (result < 0 || iov_iter_count(to) == 0)
1627 env = cl_env_get(&refcheck);
1629 return PTR_ERR(env);
1631 args = ll_env_args(env, IO_NORMAL);
1632 args->u.normal.via_iter = to;
1633 args->u.normal.via_iocb = iocb;
1635 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1636 &iocb->ki_pos, iov_iter_count(to));
1639 else if (result == 0)
1642 cl_env_put(env, &refcheck);
1648 * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1649 * If a page is already in the page cache and dirty (and some other things -
1650 * see ll_tiny_write_begin for the instantiation of these rules), then we can
1651 * write to it without doing a full I/O, because Lustre already knows about it
1652 * and will write it out. This saves a lot of processing time.
1654 * All writes here are within one page, so exclusion is handled by the page
1655 * lock on the vm page. We do not do tiny writes for writes which touch
1656 * multiple pages, because it's very unlikely that multiple sequential pages
1657 * are already dirty.
1659 * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1660 * and are unlikely to be to already dirty pages.
1662 * Attribute updates are important here, we do them in ll_tiny_write_end.
1664 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1666 ssize_t count = iov_iter_count(iter);
1667 struct file *file = iocb->ki_filp;
1668 struct inode *inode = file_inode(file);
1669 bool lock_inode = !IS_NOSEC(inode);
1674 /* Restrict writes to single page and < PAGE_SIZE. See comment at top
1675 * of function for why.
1677 if (count >= PAGE_SIZE ||
1678 (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
1681 if (unlikely(lock_inode))
1683 result = __generic_file_write_iter(iocb, iter);
1685 if (unlikely(lock_inode))
1686 inode_unlock(inode);
1688 /* If the page is not already dirty, ll_tiny_write_begin returns
1689 * -ENODATA. We continue on to normal write.
1691 if (result == -ENODATA)
1695 ll_heat_add(inode, CIT_WRITE, result);
1696 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1698 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1701 CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1707 * Write to a file (through the page cache).
1709 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1711 struct vvp_io_args *args;
1713 ssize_t rc_tiny = 0, rc_normal;
1718 /* NB: we can't do direct IO for tiny writes because they use the page
1719 * cache, we can't do sync writes because tiny writes can't flush
1720 * pages, and we can't do append writes because we can't guarantee the
1721 * required DLM locks are held to protect file size.
1723 if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(iocb->ki_filp))) &&
1724 !(iocb->ki_filp->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1725 rc_tiny = ll_do_tiny_write(iocb, from);
1727 /* In case of error, go on and try the normal write - only stop if the tiny
1728 * write completed the I/O.
1730 if (iov_iter_count(from) == 0)
1731 GOTO(out, rc_normal = rc_tiny);
1733 env = cl_env_get(&refcheck);
1735 return PTR_ERR(env);
1737 args = ll_env_args(env, IO_NORMAL);
1738 args->u.normal.via_iter = from;
1739 args->u.normal.via_iocb = iocb;
1741 rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1742 &iocb->ki_pos, iov_iter_count(from));
1744 /* On success, combine bytes written. */
1745 if (rc_tiny >= 0 && rc_normal > 0)
1746 rc_normal += rc_tiny;
1747 /* On error, only return the error from the normal write if the tiny write
1748 * did not write any bytes. Otherwise return the bytes written by the tiny write.
1750 else if (rc_tiny > 0)
1751 rc_normal = rc_tiny;
1753 cl_env_put(env, &refcheck);
1758 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1760 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1762 static int ll_file_get_iov_count(const struct iovec *iov,
1763 unsigned long *nr_segs, size_t *count)
1768 for (seg = 0; seg < *nr_segs; seg++) {
1769 const struct iovec *iv = &iov[seg];
1772 * If any segment has a negative length, or the cumulative
1773 * length ever wraps negative then return -EINVAL.
1776 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1778 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1783 cnt -= iv->iov_len; /* This segment is no good */
1790 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1791 unsigned long nr_segs, loff_t pos)
1798 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1802 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1803 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1804 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1805 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1806 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1808 result = ll_file_read_iter(iocb, &to);
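/* read() entry point for kernels without ->read_iter: wrap the user buffer in
 * a synchronous kiocb and a single iovec, then reuse the aio read path. */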
1813 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1816 struct iovec iov = { .iov_base = buf, .iov_len = count };
1821 init_sync_kiocb(&kiocb, file);
1822 kiocb.ki_pos = *ppos;
1823 #ifdef HAVE_KIOCB_KI_LEFT
1824 kiocb.ki_left = count;
1825 #elif defined(HAVE_KI_NBYTES)
1826 kiocb.ki_nbytes = count;
1829 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1830 *ppos = kiocb.ki_pos;
1836 * Write to a file (through the page cache).
1839 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1840 unsigned long nr_segs, loff_t pos)
1842 struct iov_iter from;
1847 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1851 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1852 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1853 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1854 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1855 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1857 result = ll_file_write_iter(iocb, &from);
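/* write() entry point for kernels without ->write_iter: wrap the user buffer
 * in a synchronous kiocb and a single iovec, then reuse the aio write path. */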
1862 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1863 size_t count, loff_t *ppos)
1865 struct iovec iov = { .iov_base = (void __user *)buf,
1872 init_sync_kiocb(&kiocb, file);
1873 kiocb.ki_pos = *ppos;
1874 #ifdef HAVE_KIOCB_KI_LEFT
1875 kiocb.ki_left = count;
1876 #elif defined(HAVE_KI_NBYTES)
1877 kiocb.ki_nbytes = count;
1880 result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1881 *ppos = kiocb.ki_pos;
1885 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1888 * Send file content (through pagecache) somewhere with helper
1890 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1891 struct pipe_inode_info *pipe, size_t count,
1895 struct vvp_io_args *args;
1900 ll_ras_enter(in_file);
1902 env = cl_env_get(&refcheck);
1904 RETURN(PTR_ERR(env));
1906 args = ll_env_args(env, IO_SPLICE);
1907 args->u.splice.via_pipe = pipe;
1908 args->u.splice.via_flags = flags;
1910 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1911 cl_env_put(env, &refcheck);
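/* Set the striping of a file by packing the layout into an intent OPEN sent to
 * the MDS, then release the resulting open handle. */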
1915 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1916 __u64 flags, struct lov_user_md *lum, int lum_size)
1918 struct lookup_intent oit = {
1920 .it_flags = flags | MDS_OPEN_BY_FID,
1925 ll_inode_size_lock(inode);
1926 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1928 GOTO(out_unlock, rc);
1930 ll_release_openhandle(dentry, &oit);
1933 ll_inode_size_unlock(inode);
1934 ll_intent_release(&oit);
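/* Fetch the LOV EA of @filename via md_getattr_name(), byte-swap it to host
 * endianness if needed, and return a pointer into the reply buffer along with
 * its size and the request that owns it. */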
1939 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1940 struct lov_mds_md **lmmp, int *lmm_size,
1941 struct ptlrpc_request **request)
1943 struct ll_sb_info *sbi = ll_i2sbi(inode);
1944 struct mdt_body *body;
1945 struct lov_mds_md *lmm = NULL;
1946 struct ptlrpc_request *req = NULL;
1947 struct md_op_data *op_data;
1950 rc = ll_get_default_mdsize(sbi, &lmmsize);
1954 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1955 strlen(filename), lmmsize,
1956 LUSTRE_OPC_ANY, NULL);
1957 if (IS_ERR(op_data))
1958 RETURN(PTR_ERR(op_data));
1960 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1961 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1962 ll_finish_md_op_data(op_data);
1964 CDEBUG(D_INFO, "md_getattr_name failed "
1965 "on %s: rc %d\n", filename, rc);
1969 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1970 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1972 lmmsize = body->mbo_eadatasize;
1974 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1976 GOTO(out, rc = -ENODATA);
1979 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1980 LASSERT(lmm != NULL);
1982 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
1983 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
1984 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1) &&
1985 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_FOREIGN))
1986 GOTO(out, rc = -EPROTO);
1989 * This is coming from the MDS, so is probably in
1990 * little endian. We convert it to host endian before
1991 * passing it to userspace.
1993 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1996 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
1997 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1998 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1999 if (le32_to_cpu(lmm->lmm_pattern) &
2000 LOV_PATTERN_F_RELEASED)
2004 /* if the function is called for a directory, we should
2005 * avoid swabbing non-existent lsm objects */
2006 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
2007 lustre_swab_lov_user_md_v1(
2008 (struct lov_user_md_v1 *)lmm);
2009 if (S_ISREG(body->mbo_mode))
2010 lustre_swab_lov_user_md_objects(
2011 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
2013 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2014 lustre_swab_lov_user_md_v3(
2015 (struct lov_user_md_v3 *)lmm);
2016 if (S_ISREG(body->mbo_mode))
2017 lustre_swab_lov_user_md_objects(
2018 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
2020 } else if (lmm->lmm_magic ==
2021 cpu_to_le32(LOV_MAGIC_COMP_V1)) {
2022 lustre_swab_lov_comp_md_v1(
2023 (struct lov_comp_md_v1 *)lmm);
2024 } else if (lmm->lmm_magic ==
2025 cpu_to_le32(LOV_MAGIC_FOREIGN)) {
2026 struct lov_foreign_md *lfm;
2028 lfm = (struct lov_foreign_md *)lmm;
2029 __swab32s(&lfm->lfm_magic);
2030 __swab32s(&lfm->lfm_length);
2031 __swab32s(&lfm->lfm_type);
2032 __swab32s(&lfm->lfm_flags);
2038 *lmm_size = lmmsize;
2043 static int ll_lov_setea(struct inode *inode, struct file *file,
2046 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2047 struct lov_user_md *lump;
2048 int lum_size = sizeof(struct lov_user_md) +
2049 sizeof(struct lov_user_ost_data);
2053 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2056 OBD_ALLOC_LARGE(lump, lum_size);
2060 if (copy_from_user(lump, arg, lum_size))
2061 GOTO(out_lump, rc = -EFAULT);
2063 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
2065 cl_lov_delay_create_clear(&file->f_flags);
2068 OBD_FREE_LARGE(lump, lum_size);
2072 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
2079 env = cl_env_get(&refcheck);
2081 RETURN(PTR_ERR(env));
2083 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2084 cl_env_put(env, &refcheck);
2088 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2091 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2092 struct lov_user_md *klum;
2094 __u64 flags = FMODE_WRITE;
2097 rc = ll_copy_user_md(lum, &klum);
2102 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
2107 rc = put_user(0, &lum->lmm_stripe_count);
2111 rc = ll_layout_refresh(inode, &gen);
2115 rc = ll_file_getstripe(inode, arg, lum_size);
2117 cl_lov_delay_create_clear(&file->f_flags);
2120 OBD_FREE(klum, lum_size);
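/* Take a group lock with group id @arg on the file, instantiating the whole
 * layout first for composite (PFL) files so every OST object is covered. */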
2125 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2127 struct ll_inode_info *lli = ll_i2info(inode);
2128 struct cl_object *obj = lli->lli_clob;
2129 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2130 struct ll_grouplock grouplock;
2135 CWARN("group id for group lock must not be 0\n");
2139 if (ll_file_nolock(file))
2140 RETURN(-EOPNOTSUPP);
2142 spin_lock(&lli->lli_lock);
2143 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2144 CWARN("group lock already existed with gid %lu\n",
2145 fd->fd_grouplock.lg_gid);
2146 spin_unlock(&lli->lli_lock);
2149 LASSERT(fd->fd_grouplock.lg_lock == NULL);
2150 spin_unlock(&lli->lli_lock);
2153 * XXX: the group lock needs to protect all OST objects while PFL
2154 * can add new OST objects during the IO, so we should instantiate
2155 * all OST objects before taking the group lock.
2160 struct cl_layout cl = {
2161 .cl_is_composite = false,
2163 struct lu_extent ext = {
2165 .e_end = OBD_OBJECT_EOF,
2168 env = cl_env_get(&refcheck);
2170 RETURN(PTR_ERR(env));
2172 rc = cl_object_layout_get(env, obj, &cl);
2173 if (!rc && cl.cl_is_composite)
2174 rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2177 cl_env_put(env, &refcheck);
2182 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2183 arg, (file->f_flags & O_NONBLOCK), &grouplock);
2187 spin_lock(&lli->lli_lock);
2188 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2189 spin_unlock(&lli->lli_lock);
2190 CERROR("another thread just won the race\n");
2191 cl_put_grouplock(&grouplock);
2195 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2196 fd->fd_grouplock = grouplock;
2197 spin_unlock(&lli->lli_lock);
2199 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
2203 static int ll_put_grouplock(struct inode *inode, struct file *file,
2206 struct ll_inode_info *lli = ll_i2info(inode);
2207 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2208 struct ll_grouplock grouplock;
2211 spin_lock(&lli->lli_lock);
2212 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2213 spin_unlock(&lli->lli_lock);
2214 CWARN("no group lock held\n");
2218 LASSERT(fd->fd_grouplock.lg_lock != NULL);
2220 if (fd->fd_grouplock.lg_gid != arg) {
2221 CWARN("group lock %lu doesn't match current id %lu\n",
2222 arg, fd->fd_grouplock.lg_gid);
2223 spin_unlock(&lli->lli_lock);
2227 grouplock = fd->fd_grouplock;
2228 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2229 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2230 spin_unlock(&lli->lli_lock);
2232 cl_put_grouplock(&grouplock);
2233 CDEBUG(D_INFO, "group lock %lu released\n", arg);
2238 * Close inode open handle
2240 * \param dentry [in] dentry which contains the inode
2241 * \param it [in,out] intent which contains open info and result
2244 * \retval <0 failure
2246 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2248 struct inode *inode = dentry->d_inode;
2249 struct obd_client_handle *och;
2255 /* Root ? Do nothing. */
2256 if (dentry->d_inode->i_sb->s_root == dentry)
2259 /* No open handle to close? Move away */
2260 if (!it_disposition(it, DISP_OPEN_OPEN))
2263 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2265 OBD_ALLOC(och, sizeof(*och));
2267 GOTO(out, rc = -ENOMEM);
2269 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2271 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2273 /* this one is in place of ll_file_open */
2274 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2275 ptlrpc_req_finished(it->it_request);
2276 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2282 * Get the size of the inode for which the FIEMAP mapping is requested.
2283 * Make the FIEMAP get_info call and return the result.
2284 * \param fiemap kernel buffer to hold the extents
2285 * \param num_bytes kernel buffer size
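/*
 * Illustrative sketch (user-space view; fd, n and fm are placeholders):
 * this helper is reached from ll_fiemap() further below, i.e. through the
 * standard fiemap(2)/FS_IOC_FIEMAP interface, so a caller would do roughly:
 *
 *	struct fiemap *fm = calloc(1, sizeof(*fm) +
 *				   n * sizeof(struct fiemap_extent));
 *
 *	fm->fm_start = 0;
 *	fm->fm_length = FIEMAP_MAX_OFFSET;
 *	fm->fm_extent_count = n;
 *	ioctl(fd, FS_IOC_FIEMAP, fm);
 *
 * FS_IOC_FIEMAP and FIEMAP_MAX_OFFSET come from <linux/fs.h> and
 * <linux/fiemap.h>.
 */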
2287 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2293 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2296 /* Checks for fiemap flags */
2297 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2298 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2302 /* Check for FIEMAP_FLAG_SYNC */
2303 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2304 rc = filemap_fdatawrite(inode->i_mapping);
2309 env = cl_env_get(&refcheck);
2311 RETURN(PTR_ERR(env));
2313 if (i_size_read(inode) == 0) {
2314 rc = ll_glimpse_size(inode);
2319 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2320 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2321 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2323 /* If the file size is 0, then there are no objects to map */
2324 if (fmkey.lfik_oa.o_size == 0) {
2325 fiemap->fm_mapped_extents = 0;
2329 fmkey.lfik_fiemap = *fiemap;
2331 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2332 &fmkey, fiemap, &num_bytes);
2334 cl_env_put(env, &refcheck);
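/*
 * Illustrative note (non-authoritative): ll_fid2path() below backs the
 * OBD_IOC_FID2PATH ioctl (dispatched from ll_file_ioctl()). The caller
 * passes a struct getinfo_fid2path with the FID to resolve and gf_pathlen
 * set to the size of the path buffer that follows the header; on success
 * the resolved path is copied back after the header. This is what the
 * "lfs fid2path" command uses underneath.
 */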
2338 int ll_fid2path(struct inode *inode, void __user *arg)
2340 struct obd_export *exp = ll_i2mdexp(inode);
2341 const struct getinfo_fid2path __user *gfin = arg;
2343 struct getinfo_fid2path *gfout;
2349 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2350 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2353 /* Only need to get the buflen */
2354 if (get_user(pathlen, &gfin->gf_pathlen))
2357 if (pathlen > PATH_MAX)
2360 outsize = sizeof(*gfout) + pathlen;
2361 OBD_ALLOC(gfout, outsize);
2365 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2366 GOTO(gf_free, rc = -EFAULT);
2367 /* Append the root FID after gfout to let the MDT know the root FID so
2368 * that it can look up the correct path; this is mainly for filesets.
2369 * Old servers without fileset mount support will ignore this. */
2370 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2372 /* Call mdc_iocontrol */
2373 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2377 if (copy_to_user(arg, gfout, outsize))
2381 OBD_FREE(gfout, outsize);
2386 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2388 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2396 ioc->idv_version = 0;
2397 ioc->idv_layout_version = UINT_MAX;
2399 /* If no file object has been initialized, consider its version to be 0. */
2403 env = cl_env_get(&refcheck);
2405 RETURN(PTR_ERR(env));
2407 io = vvp_env_thread_io(env);
2409 io->u.ci_data_version.dv_data_version = 0;
2410 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2411 io->u.ci_data_version.dv_flags = ioc->idv_flags;
2414 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2415 result = cl_io_loop(env, io);
2417 result = io->ci_result;
2419 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2420 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2422 cl_io_fini(env, io);
2424 if (unlikely(io->ci_need_restart))
2427 cl_env_put(env, &refcheck);
2433 * Read the data_version for an inode.
2435 * This value is computed using the stripe object versions on the OSTs.
2436 * The version is computed using server-side locking.
2438 * @param flags whether to sync on the OST side;
2440 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2441 * LL_DV_WR_FLUSH: drop all cached pages, LCK_PW on OSTs
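/*
 * Illustrative sketch (user-space view; fd is a placeholder): this path is
 * reached through the LL_IOC_DATA_VERSION ioctl (see ll_file_ioctl() below),
 * e.g.:
 *
 *	struct ioc_data_version idv = { .idv_flags = LL_DV_RD_FLUSH };
 *
 *	ioctl(fd, LL_IOC_DATA_VERSION, &idv);
 *	printf("data version %llu\n", (unsigned long long)idv.idv_version);
 *
 * Only the LL_DV_RD_FLUSH and LL_DV_WR_FLUSH bits of idv_flags are honoured;
 * the ioctl handler masks off everything else.
 */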
2443 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2445 struct ioc_data_version ioc = { .idv_flags = flags };
2448 rc = ll_ioc_data_version(inode, &ioc);
2450 *data_version = ioc.idv_version;
2456 * Trigger a HSM release request for the provided inode.
2458 int ll_hsm_release(struct inode *inode)
2461 struct obd_client_handle *och = NULL;
2462 __u64 data_version = 0;
2467 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2468 ll_i2sbi(inode)->ll_fsname,
2469 PFID(&ll_i2info(inode)->lli_fid));
2471 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2473 GOTO(out, rc = PTR_ERR(och));
2475 /* Grab latest data_version and [am]time values */
2476 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2480 env = cl_env_get(&refcheck);
2482 GOTO(out, rc = PTR_ERR(env));
2484 rc = ll_merge_attr(env, inode);
2485 cl_env_put(env, &refcheck);
2487 /* If an error happened, we have the wrong size for the file.
2493 /* Release the file.
2494 * NB: the lease lock handle is released in mdc_hsm_release_pack() because
2495 * we still need it to pack l_remote_handle for the MDT. */
2496 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2502 if (och != NULL && !IS_ERR(och)) /* close the file */
2503 ll_lease_close(och, inode, NULL);
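/*
 * Illustrative summary of the release flow above (non-authoritative):
 * ll_hsm_release() takes a write lease with MDS_OPEN_RELEASE, flushes dirty
 * data and samples the data version with LL_DV_WR_FLUSH, merges the
 * up-to-date attributes into the inode, and then closes the open handle with
 * the MDS_HSM_RELEASE bias so the MDT can complete the release of the
 * file's data.
 */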
2508 struct ll_swap_stack {
2511 struct inode *inode1;
2512 struct inode *inode2;
2517 static int ll_swap_layouts(struct file *file1, struct file *file2,
2518 struct lustre_swap_layouts *lsl)
2520 struct mdc_swap_layouts msl;
2521 struct md_op_data *op_data;
2524 struct ll_swap_stack *llss = NULL;
2527 OBD_ALLOC_PTR(llss);
2531 llss->inode1 = file_inode(file1);
2532 llss->inode2 = file_inode(file2);
2534 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2538 /* we use 2 bools because they are easier to swap than 2 bits */
2539 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2540 llss->check_dv1 = true;
2542 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2543 llss->check_dv2 = true;
2545 /* we cannot use lsl->sl_dvX directly because we may swap them */
2546 llss->dv1 = lsl->sl_dv1;
2547 llss->dv2 = lsl->sl_dv2;
2549 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2550 if (rc == 0) /* same file, done! */
2553 if (rc < 0) { /* sequentialize it */
2554 swap(llss->inode1, llss->inode2);
2556 swap(llss->dv1, llss->dv2);
2557 swap(llss->check_dv1, llss->check_dv2);
2561 if (gid != 0) { /* application asks to flush dirty cache */
2562 rc = ll_get_grouplock(llss->inode1, file1, gid);
2566 rc = ll_get_grouplock(llss->inode2, file2, gid);
2568 ll_put_grouplock(llss->inode1, file1, gid);
2573 /* ultimate check: before swapping the layouts we check whether the
2574 * data version has changed (if requested) */
2575 if (llss->check_dv1) {
2576 rc = ll_data_version(llss->inode1, &dv, 0);
2579 if (dv != llss->dv1)
2580 GOTO(putgl, rc = -EAGAIN);
2583 if (llss->check_dv2) {
2584 rc = ll_data_version(llss->inode2, &dv, 0);
2587 if (dv != llss->dv2)
2588 GOTO(putgl, rc = -EAGAIN);
2591 /* struct md_op_data is used to send the swap args to the MDT;
2592 * only the flags are missing, so we pass struct mdc_swap_layouts
2593 * through md_op_data->op_data */
2594 /* flags from user space have to be converted before they are sent to
2595 * the server; no flag is sent today, they are only used on the client */
2598 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2599 0, LUSTRE_OPC_ANY, &msl);
2600 if (IS_ERR(op_data))
2601 GOTO(free, rc = PTR_ERR(op_data));
2603 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2604 sizeof(*op_data), op_data, NULL);
2605 ll_finish_md_op_data(op_data);
2612 ll_put_grouplock(llss->inode2, file2, gid);
2613 ll_put_grouplock(llss->inode1, file1, gid);
2623 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2625 struct obd_export *exp = ll_i2mdexp(inode);
2626 struct md_op_data *op_data;
2630 /* Detect out-of-range masks */
2631 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2634 /* Non-root users are forbidden from setting or clearing flags that are
2635 * NOT defined in HSM_USER_MASK. */
2636 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2637 !cfs_capable(CFS_CAP_SYS_ADMIN))
2640 if (!exp_connect_archive_id_array(exp)) {
2641 /* Detect out-of-range archive id */
2642 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2643 (hss->hss_archive_id > LL_HSM_ORIGIN_MAX_ARCHIVE))
2647 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2648 LUSTRE_OPC_ANY, hss);
2649 if (IS_ERR(op_data))
2650 RETURN(PTR_ERR(op_data));
2652 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, exp, sizeof(*op_data),
2655 ll_finish_md_op_data(op_data);
2660 static int ll_hsm_import(struct inode *inode, struct file *file,
2661 struct hsm_user_import *hui)
2663 struct hsm_state_set *hss = NULL;
2664 struct iattr *attr = NULL;
2668 if (!S_ISREG(inode->i_mode))
2674 GOTO(out, rc = -ENOMEM);
2676 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2677 hss->hss_archive_id = hui->hui_archive_id;
2678 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2679 rc = ll_hsm_state_set(inode, hss);
2683 OBD_ALLOC_PTR(attr);
2685 GOTO(out, rc = -ENOMEM);
2687 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2688 attr->ia_mode |= S_IFREG;
2689 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2690 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2691 attr->ia_size = hui->hui_size;
2692 attr->ia_mtime.tv_sec = hui->hui_mtime;
2693 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2694 attr->ia_atime.tv_sec = hui->hui_atime;
2695 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2697 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2698 ATTR_UID | ATTR_GID |
2699 ATTR_MTIME | ATTR_MTIME_SET |
2700 ATTR_ATIME | ATTR_ATIME_SET;
2704 rc = ll_setattr_raw(file_dentry(file), attr, 0, true);
2708 inode_unlock(inode);
2720 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2722 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2723 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
2726 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2728 struct inode *inode = file_inode(file);
2730 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2731 ATTR_MTIME | ATTR_MTIME_SET |
2734 .tv_sec = lfu->lfu_atime_sec,
2735 .tv_nsec = lfu->lfu_atime_nsec,
2738 .tv_sec = lfu->lfu_mtime_sec,
2739 .tv_nsec = lfu->lfu_mtime_nsec,
2742 .tv_sec = lfu->lfu_ctime_sec,
2743 .tv_nsec = lfu->lfu_ctime_nsec,
2749 if (!capable(CAP_SYS_ADMIN))
2752 if (!S_ISREG(inode->i_mode))
2756 rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET,
2758 inode_unlock(inode);
2763 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2766 case MODE_READ_USER:
2768 case MODE_WRITE_USER:
2775 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2777 /* Used to allow the upper layers of the client to request an LDLM lock
2778 * without doing an actual read or write.
2780 * Used for ladvise lockahead to manually request specific locks.
2782 * \param[in] file file this ladvise lock request is on
2783 * \param[in] ladvise ladvise struct describing this lock request
2785 * \retval 0 success, no detailed result available (sync requests
2786 * and requests sent to the server [not handled locally]
2787 * cannot return detailed results)
2788 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2789 * see definitions for details.
2790 * \retval negative negative errno on error
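/*
 * Illustrative sketch (user-space view; advise is a placeholder variable):
 * lockahead requests arrive here via LL_IOC_LADVISE with
 * lla_advice = LU_LADVISE_LOCKAHEAD (see the LADVISE branch of
 * ll_file_ioctl() below). A single advice filled in by the caller might look
 * roughly like:
 *
 *	advise.lla_advice = LU_LADVISE_LOCKAHEAD;
 *	advise.lla_lockahead_mode = MODE_WRITE_USER;
 *	advise.lla_peradvice_flags = LF_ASYNC;
 *	advise.lla_start = 0;
 *	advise.lla_end = 1 << 20;
 *
 * With LF_ASYNC set, the detailed LLA_RESULT_{SAME,DIFFERENT} codes below
 * can be reported back in lla_lockahead_result.
 */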
2792 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2794 struct lu_env *env = NULL;
2795 struct cl_io *io = NULL;
2796 struct cl_lock *lock = NULL;
2797 struct cl_lock_descr *descr = NULL;
2798 struct dentry *dentry = file->f_path.dentry;
2799 struct inode *inode = dentry->d_inode;
2800 enum cl_lock_mode cl_mode;
2801 off_t start = ladvise->lla_start;
2802 off_t end = ladvise->lla_end;
2808 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2809 "start=%llu, end=%llu\n", dentry->d_name.len,
2810 dentry->d_name.name, dentry->d_inode,
2811 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2814 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2816 GOTO(out, result = cl_mode);
2818 /* Get IO environment */
2819 result = cl_io_get(inode, &env, &io, &refcheck);
2823 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2826 * nothing to do for this io. This currently happens when
2827 * the stripe sub-objects are not yet created.
2829 result = io->ci_result;
2830 } else if (result == 0) {
2831 lock = vvp_env_lock(env);
2832 descr = &lock->cll_descr;
2834 descr->cld_obj = io->ci_obj;
2835 /* Convert byte offsets to pages */
2836 descr->cld_start = cl_index(io->ci_obj, start);
2837 descr->cld_end = cl_index(io->ci_obj, end);
2838 descr->cld_mode = cl_mode;
2839 /* CEF_MUST is used because we do not want to convert a
2840 * lockahead request to a lockless lock */
2841 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2844 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2845 descr->cld_enq_flags |= CEF_SPECULATIVE;
2847 result = cl_lock_request(env, io, lock);
2849 /* On success, we need to release the lock */
2851 cl_lock_release(env, lock);
2853 cl_io_fini(env, io);
2854 cl_env_put(env, &refcheck);
2856 /* -ECANCELED indicates a matching lock with a different extent
2857 * was already present, and -EEXIST indicates a matching lock
2858 * on exactly the same extent was already present.
2859 * We convert them to positive values for userspace to make
2860 * recognizing true errors easier.
2861 * Note we can only return these detailed results on async requests,
2862 * as sync requests look the same as i/o requests for locking. */
2863 if (result == -ECANCELED)
2864 result = LLA_RESULT_DIFFERENT;
2865 else if (result == -EEXIST)
2866 result = LLA_RESULT_SAME;
2871 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
2873 static int ll_ladvise_sanity(struct inode *inode,
2874 struct llapi_lu_ladvise *ladvise)
2876 struct ll_sb_info *sbi = ll_i2sbi(inode);
2877 enum lu_ladvise_type advice = ladvise->lla_advice;
2878 /* Note the per-advice flags field is 32 bits wide, so per-advice flags
2879 * must be in the first 32 bits of enum ladvise_flags */
2880 __u32 flags = ladvise->lla_peradvice_flags;
2881 /* 3 lines at 80 characters per line, should be plenty */
2884 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2886 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized, "
2887 "last supported advice is %s (value '%d'): rc = %d\n",
2888 sbi->ll_fsname, advice,
2889 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2893 /* Per-advice checks */
2895 case LU_LADVISE_LOCKNOEXPAND:
2896 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2898 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2899 "rc = %d\n", sbi->ll_fsname, flags,
2900 ladvise_names[advice], rc);
2904 case LU_LADVISE_LOCKAHEAD:
2905 /* Currently only READ and WRITE modes can be requested */
2906 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2907 ladvise->lla_lockahead_mode == 0) {
2909 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2910 "rc = %d\n", sbi->ll_fsname,
2911 ladvise->lla_lockahead_mode,
2912 ladvise_names[advice], rc);
2915 case LU_LADVISE_WILLREAD:
2916 case LU_LADVISE_DONTNEED:
2918 /* Note the fall-through above - these checks apply to all advices
2919 * except LOCKNOEXPAND */
2920 if (flags & ~LF_DEFAULT_MASK) {
2922 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2923 "rc = %d\n", sbi->ll_fsname, flags,
2924 ladvise_names[advice], rc);
2927 if (ladvise->lla_start >= ladvise->lla_end) {
2929 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2930 "for %s: rc = %d\n", sbi->ll_fsname,
2931 ladvise->lla_start, ladvise->lla_end,
2932 ladvise_names[advice], rc);
2944 * Give file access advices
2946 * The ladvise interface is similar to the Linux fadvise() system call, except
2947 * it forwards the advices directly from the Lustre client to the server. The
2948 * server-side code will apply appropriate read-ahead and caching techniques
2949 * for the corresponding files.
2951 * A typical workload for ladvise is e.g. a number of different clients
2952 * doing small random reads of a file, so prefetching pages into the OSS cache
2953 * with big linear reads before the random IO is a net benefit. Fetching
2954 * all that data into each client cache with fadvise() may not be, due to
2955 * much more data being sent to the client.
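/*
 * Illustrative sketch (user-space view; fd, hdr and file_size are
 * placeholders): advices are submitted through LL_IOC_LADVISE as a
 * struct llapi_ladvise_hdr followed by lah_count advices, roughly:
 *
 *	hdr->lah_magic = LADVISE_MAGIC;
 *	hdr->lah_count = 1;
 *	hdr->lah_advise[0].lla_advice = LU_LADVISE_WILLREAD;
 *	hdr->lah_advise[0].lla_start = 0;
 *	hdr->lah_advise[0].lla_end = file_size;
 *	ioctl(fd, LL_IOC_LADVISE, hdr);
 */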
2957 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2958 struct llapi_lu_ladvise *ladvise)
2962 struct cl_ladvise_io *lio;
2967 env = cl_env_get(&refcheck);
2969 RETURN(PTR_ERR(env));
2971 io = vvp_env_thread_io(env);
2972 io->ci_obj = ll_i2info(inode)->lli_clob;
2974 /* initialize parameters for ladvise */
2975 lio = &io->u.ci_ladvise;
2976 lio->li_start = ladvise->lla_start;
2977 lio->li_end = ladvise->lla_end;
2978 lio->li_fid = ll_inode2fid(inode);
2979 lio->li_advice = ladvise->lla_advice;
2980 lio->li_flags = flags;
2982 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2983 rc = cl_io_loop(env, io);
2987 cl_io_fini(env, io);
2988 cl_env_put(env, &refcheck);
2992 static int ll_lock_noexpand(struct file *file, int flags)
2994 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2996 fd->ll_lock_no_expand = !(flags & LF_UNSET);
3001 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
3004 struct fsxattr fsxattr;
3006 if (copy_from_user(&fsxattr,
3007 (const struct fsxattr __user *)arg,
3011 fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
3012 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
3013 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
3014 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
3015 if (copy_to_user((struct fsxattr __user *)arg,
3016 &fsxattr, sizeof(fsxattr)))
3022 int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
3025 * Project Quota ID state is only allowed to change from within the init
3026 * namespace. Enforce that restriction only if we are trying to change
3027 * the quota ID state. Everything else is allowed in user namespaces.
3029 if (current_user_ns() == &init_user_ns)
3032 if (ll_i2info(inode)->lli_projid != fa->fsx_projid)
3035 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) {
3036 if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
3039 if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
3046 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3050 struct md_op_data *op_data;
3051 struct ptlrpc_request *req = NULL;
3053 struct fsxattr fsxattr;
3054 struct cl_object *obj;
3058 if (copy_from_user(&fsxattr,
3059 (const struct fsxattr __user *)arg,
3063 rc = ll_ioctl_check_project(inode, &fsxattr);
3067 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3068 LUSTRE_OPC_ANY, NULL);
3069 if (IS_ERR(op_data))
3070 RETURN(PTR_ERR(op_data));
3072 flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3073 op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3074 if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3075 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3076 op_data->op_projid = fsxattr.fsx_projid;
3077 op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS;
3078 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3080 ptlrpc_req_finished(req);
3082 GOTO(out_fsxattr, rc);
3083 ll_update_inode_flags(inode, op_data->op_attr_flags);
3084 obj = ll_i2info(inode)->lli_clob;
3086 GOTO(out_fsxattr, rc);
3088 OBD_ALLOC_PTR(attr);
3090 GOTO(out_fsxattr, rc = -ENOMEM);
3092 rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS,
3093 fsxattr.fsx_xflags);
3096 ll_finish_md_op_data(op_data);
3100 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3103 struct inode *inode = file_inode(file);
3104 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3105 struct ll_inode_info *lli = ll_i2info(inode);
3106 struct obd_client_handle *och = NULL;
3107 struct split_param sp;
3110 enum mds_op_bias bias = 0;
3111 struct file *layout_file = NULL;
3113 size_t data_size = 0;
3117 mutex_lock(&lli->lli_och_mutex);
3118 if (fd->fd_lease_och != NULL) {
3119 och = fd->fd_lease_och;
3120 fd->fd_lease_och = NULL;
3122 mutex_unlock(&lli->lli_och_mutex);
3125 GOTO(out, rc = -ENOLCK);
3127 fmode = och->och_flags;
3129 switch (ioc->lil_flags) {
3130 case LL_LEASE_RESYNC_DONE:
3131 if (ioc->lil_count > IOC_IDS_MAX)
3132 GOTO(out, rc = -EINVAL);
3134 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3135 OBD_ALLOC(data, data_size);
3137 GOTO(out, rc = -ENOMEM);
3139 if (copy_from_user(data, (void __user *)arg, data_size))
3140 GOTO(out, rc = -EFAULT);
3142 bias = MDS_CLOSE_RESYNC_DONE;
3144 case LL_LEASE_LAYOUT_MERGE: {
3147 if (ioc->lil_count != 1)
3148 GOTO(out, rc = -EINVAL);
3150 arg += sizeof(*ioc);
3151 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3152 GOTO(out, rc = -EFAULT);
3154 layout_file = fget(fd);
3156 GOTO(out, rc = -EBADF);
3158 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3159 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3160 GOTO(out, rc = -EPERM);
3162 data = file_inode(layout_file);
3163 bias = MDS_CLOSE_LAYOUT_MERGE;
3166 case LL_LEASE_LAYOUT_SPLIT: {
3170 if (ioc->lil_count != 2)
3171 GOTO(out, rc = -EINVAL);
3173 arg += sizeof(*ioc);
3174 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3175 GOTO(out, rc = -EFAULT);
3177 arg += sizeof(__u32);
3178 if (copy_from_user(&mirror_id, (void __user *)arg,
3180 GOTO(out, rc = -EFAULT);
3182 layout_file = fget(fdv);
3184 GOTO(out, rc = -EBADF);
3186 sp.sp_inode = file_inode(layout_file);
3187 sp.sp_mirror_id = (__u16)mirror_id;
3189 bias = MDS_CLOSE_LAYOUT_SPLIT;
3193 /* without close intent */
3197 rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3201 rc = ll_lease_och_release(inode, file);
3210 switch (ioc->lil_flags) {
3211 case LL_LEASE_RESYNC_DONE:
3213 OBD_FREE(data, data_size);
3215 case LL_LEASE_LAYOUT_MERGE:
3216 case LL_LEASE_LAYOUT_SPLIT:
3223 rc = ll_lease_type_from_fmode(fmode);
3227 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3230 struct inode *inode = file_inode(file);
3231 struct ll_inode_info *lli = ll_i2info(inode);
3232 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3233 struct obd_client_handle *och = NULL;
3234 __u64 open_flags = 0;
3240 switch (ioc->lil_mode) {
3241 case LL_LEASE_WRLCK:
3242 if (!(file->f_mode & FMODE_WRITE))
3244 fmode = FMODE_WRITE;
3246 case LL_LEASE_RDLCK:
3247 if (!(file->f_mode & FMODE_READ))
3251 case LL_LEASE_UNLCK:
3252 RETURN(ll_file_unlock_lease(file, ioc, arg));
3257 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3259 /* apply for lease */
3260 if (ioc->lil_flags & LL_LEASE_RESYNC)
3261 open_flags = MDS_OPEN_RESYNC;
3262 och = ll_lease_open(inode, file, fmode, open_flags);
3264 RETURN(PTR_ERR(och));
3266 if (ioc->lil_flags & LL_LEASE_RESYNC) {
3267 rc = ll_lease_file_resync(och, inode, arg);
3269 ll_lease_close(och, inode, NULL);
3272 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3274 ll_lease_close(och, inode, NULL);
3280 mutex_lock(&lli->lli_och_mutex);
3281 if (fd->fd_lease_och == NULL) {
3282 fd->fd_lease_och = och;
3285 mutex_unlock(&lli->lli_och_mutex);
3287 /* should not happen for now, since only exclusive leases are supported */
3288 ll_lease_close(och, inode, &lease_broken);
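/*
 * Illustrative note (non-authoritative): the lease ioctls form a small
 * lifecycle around the two helpers above. LL_IOC_SET_LEASE with
 * LL_LEASE_RDLCK or LL_LEASE_WRLCK acquires a lease on the open file,
 * LL_IOC_GET_LEASE reports which mode (if any) is still held, and
 * LL_IOC_SET_LEASE with LL_LEASE_UNLCK releases it, optionally carrying a
 * close intent (resync done, layout merge or layout split) as handled in
 * ll_file_unlock_lease() above.
 */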
3294 static void ll_heat_get(struct inode *inode, struct lu_heat *heat)
3296 struct ll_inode_info *lli = ll_i2info(inode);
3297 struct ll_sb_info *sbi = ll_i2sbi(inode);
3298 __u64 now = ktime_get_real_seconds();
3301 spin_lock(&lli->lli_heat_lock);
3302 heat->lh_flags = lli->lli_heat_flags;
3303 for (i = 0; i < heat->lh_count; i++)
3304 heat->lh_heat[i] = obd_heat_get(&lli->lli_heat_instances[i],
3305 now, sbi->ll_heat_decay_weight,
3306 sbi->ll_heat_period_second);
3307 spin_unlock(&lli->lli_heat_lock);
3310 static int ll_heat_set(struct inode *inode, enum lu_heat_flag flags)
3312 struct ll_inode_info *lli = ll_i2info(inode);
3315 spin_lock(&lli->lli_heat_lock);
3316 if (flags & LU_HEAT_FLAG_CLEAR)
3317 obd_heat_clear(lli->lli_heat_instances, OBD_HEAT_COUNT);
3319 if (flags & LU_HEAT_FLAG_OFF)
3320 lli->lli_heat_flags |= LU_HEAT_FLAG_OFF;
3322 lli->lli_heat_flags &= ~LU_HEAT_FLAG_OFF;
3324 spin_unlock(&lli->lli_heat_lock);
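/*
 * Illustrative sketch (user-space view; fd and heat are placeholders): file
 * heat is read and controlled through LL_IOC_HEAT_GET / LL_IOC_HEAT_SET (see
 * ll_file_ioctl() below). A reader passes a struct lu_heat with lh_count set
 * to the number of heat values it can hold (capped at OBD_HEAT_COUNT by the
 * handler), roughly:
 *
 *	heat->lh_count = OBD_HEAT_COUNT;
 *	ioctl(fd, LL_IOC_HEAT_GET, heat);
 *
 * while LL_IOC_HEAT_SET takes the LU_HEAT_FLAG_CLEAR / LU_HEAT_FLAG_OFF
 * flags interpreted by ll_heat_set() above.
 */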
3330 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3332 struct inode *inode = file_inode(file);
3333 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3337 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3338 PFID(ll_inode2fid(inode)), inode, cmd);
3339 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3341 /* asm-ppc{,64} declares TCGETS, et al. as type 't' not 'T' */
3342 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3346 case LL_IOC_GETFLAGS:
3347 /* Get the current value of the file flags */
3348 return put_user(fd->fd_flags, (int __user *)arg);
3349 case LL_IOC_SETFLAGS:
3350 case LL_IOC_CLRFLAGS:
3351 /* Set or clear specific file flags */
3352 /* XXX This probably needs checks to ensure the flags are
3353 * not abused, and to handle any flag side effects.
3355 if (get_user(flags, (int __user *) arg))
3358 if (cmd == LL_IOC_SETFLAGS) {
3359 if ((flags & LL_FILE_IGNORE_LOCK) &&
3360 !(file->f_flags & O_DIRECT)) {
3361 CERROR("%s: unable to disable locking on "
3362 "non-O_DIRECT file\n", current->comm);
3366 fd->fd_flags |= flags;
3368 fd->fd_flags &= ~flags;
3371 case LL_IOC_LOV_SETSTRIPE:
3372 case LL_IOC_LOV_SETSTRIPE_NEW:
3373 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3374 case LL_IOC_LOV_SETEA:
3375 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3376 case LL_IOC_LOV_SWAP_LAYOUTS: {
3378 struct lustre_swap_layouts lsl;
3380 if (copy_from_user(&lsl, (char __user *)arg,
3381 sizeof(struct lustre_swap_layouts)))
3384 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3387 file2 = fget(lsl.sl_fd);
3391 /* O_WRONLY or O_RDWR */
3392 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3393 GOTO(out, rc = -EPERM);
3395 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3396 struct inode *inode2;
3397 struct ll_inode_info *lli;
3398 struct obd_client_handle *och = NULL;
3400 lli = ll_i2info(inode);
3401 mutex_lock(&lli->lli_och_mutex);
3402 if (fd->fd_lease_och != NULL) {
3403 och = fd->fd_lease_och;
3404 fd->fd_lease_och = NULL;
3406 mutex_unlock(&lli->lli_och_mutex);
3408 GOTO(out, rc = -ENOLCK);
3409 inode2 = file_inode(file2);
3410 rc = ll_swap_layouts_close(och, inode, inode2);
3412 rc = ll_swap_layouts(file, file2, &lsl);
3418 case LL_IOC_LOV_GETSTRIPE:
3419 case LL_IOC_LOV_GETSTRIPE_NEW:
3420 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3421 case FS_IOC_GETFLAGS:
3422 case FS_IOC_SETFLAGS:
3423 RETURN(ll_iocontrol(inode, file, cmd, arg));
3424 case FSFILT_IOC_GETVERSION:
3425 case FS_IOC_GETVERSION:
3426 RETURN(put_user(inode->i_generation, (int __user *)arg));
3427 /* We need to special case any other ioctls we want to handle,
3428 * to send them to the MDS/OST as appropriate and to properly
3429 * network encode the arg field. */
3430 case FS_IOC_SETVERSION:
3433 case LL_IOC_GROUP_LOCK:
3434 RETURN(ll_get_grouplock(inode, file, arg));
3435 case LL_IOC_GROUP_UNLOCK:
3436 RETURN(ll_put_grouplock(inode, file, arg));
3437 case IOC_OBD_STATFS:
3438 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3440 case LL_IOC_FLUSHCTX:
3441 RETURN(ll_flush_ctx(inode));
3442 case LL_IOC_PATH2FID: {
3443 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3444 sizeof(struct lu_fid)))
3449 case LL_IOC_GETPARENT:
3450 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3452 case OBD_IOC_FID2PATH:
3453 RETURN(ll_fid2path(inode, (void __user *)arg));
3454 case LL_IOC_DATA_VERSION: {
3455 struct ioc_data_version idv;
3458 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
3461 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3462 rc = ll_ioc_data_version(inode, &idv);
3465 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3471 case LL_IOC_GET_MDTIDX: {
3474 mdtidx = ll_get_mdt_idx(inode);
3478 if (put_user((int)mdtidx, (int __user *)arg))
3483 case OBD_IOC_GETDTNAME:
3484 case OBD_IOC_GETMDNAME:
3485 RETURN(ll_get_obd_name(inode, cmd, arg));
3486 case LL_IOC_HSM_STATE_GET: {
3487 struct md_op_data *op_data;
3488 struct hsm_user_state *hus;
3495 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3496 LUSTRE_OPC_ANY, hus);
3497 if (IS_ERR(op_data)) {
3499 RETURN(PTR_ERR(op_data));
3502 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3505 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3508 ll_finish_md_op_data(op_data);
3512 case LL_IOC_HSM_STATE_SET: {
3513 struct hsm_state_set *hss;
3520 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3525 rc = ll_hsm_state_set(inode, hss);
3530 case LL_IOC_HSM_ACTION: {
3531 struct md_op_data *op_data;
3532 struct hsm_current_action *hca;
3539 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3540 LUSTRE_OPC_ANY, hca);
3541 if (IS_ERR(op_data)) {
3543 RETURN(PTR_ERR(op_data));
3546 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3549 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3552 ll_finish_md_op_data(op_data);
3556 case LL_IOC_SET_LEASE_OLD: {
3557 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3559 RETURN(ll_file_set_lease(file, &ioc, 0));
3561 case LL_IOC_SET_LEASE: {
3562 struct ll_ioc_lease ioc;
3564 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3567 RETURN(ll_file_set_lease(file, &ioc, arg));
3569 case LL_IOC_GET_LEASE: {
3570 struct ll_inode_info *lli = ll_i2info(inode);
3571 struct ldlm_lock *lock = NULL;
3574 mutex_lock(&lli->lli_och_mutex);
3575 if (fd->fd_lease_och != NULL) {
3576 struct obd_client_handle *och = fd->fd_lease_och;
3578 lock = ldlm_handle2lock(&och->och_lease_handle);
3580 lock_res_and_lock(lock);
3581 if (!ldlm_is_cancel(lock))
3582 fmode = och->och_flags;
3584 unlock_res_and_lock(lock);
3585 LDLM_LOCK_PUT(lock);
3588 mutex_unlock(&lli->lli_och_mutex);
3590 RETURN(ll_lease_type_from_fmode(fmode));
3592 case LL_IOC_HSM_IMPORT: {
3593 struct hsm_user_import *hui;
3599 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3604 rc = ll_hsm_import(inode, file, hui);
3609 case LL_IOC_FUTIMES_3: {
3610 struct ll_futimes_3 lfu;
3612 if (copy_from_user(&lfu,
3613 (const struct ll_futimes_3 __user *)arg,
3617 RETURN(ll_file_futimes_3(file, &lfu));
3619 case LL_IOC_LADVISE: {
3620 struct llapi_ladvise_hdr *k_ladvise_hdr;
3621 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3624 int alloc_size = sizeof(*k_ladvise_hdr);
3627 u_ladvise_hdr = (void __user *)arg;
3628 OBD_ALLOC_PTR(k_ladvise_hdr);
3629 if (k_ladvise_hdr == NULL)
3632 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3633 GOTO(out_ladvise, rc = -EFAULT);
3635 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3636 k_ladvise_hdr->lah_count < 1)
3637 GOTO(out_ladvise, rc = -EINVAL);
3639 num_advise = k_ladvise_hdr->lah_count;
3640 if (num_advise >= LAH_COUNT_MAX)
3641 GOTO(out_ladvise, rc = -EFBIG);
3643 OBD_FREE_PTR(k_ladvise_hdr);
3644 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3645 lah_advise[num_advise]);
3646 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3647 if (k_ladvise_hdr == NULL)
3651 * TODO: submit multiple advices to one server in a single RPC
3653 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3654 GOTO(out_ladvise, rc = -EFAULT);
3656 for (i = 0; i < num_advise; i++) {
3657 struct llapi_lu_ladvise *k_ladvise =
3658 &k_ladvise_hdr->lah_advise[i];
3659 struct llapi_lu_ladvise __user *u_ladvise =
3660 &u_ladvise_hdr->lah_advise[i];
3662 rc = ll_ladvise_sanity(inode, k_ladvise);
3664 GOTO(out_ladvise, rc);
3666 switch (k_ladvise->lla_advice) {
3667 case LU_LADVISE_LOCKNOEXPAND:
3668 rc = ll_lock_noexpand(file,
3669 k_ladvise->lla_peradvice_flags);
3670 GOTO(out_ladvise, rc);
3671 case LU_LADVISE_LOCKAHEAD:
3673 rc = ll_file_lock_ahead(file, k_ladvise);
3676 GOTO(out_ladvise, rc);
3679 &u_ladvise->lla_lockahead_result))
3680 GOTO(out_ladvise, rc = -EFAULT);
3683 rc = ll_ladvise(inode, file,
3684 k_ladvise_hdr->lah_flags,
3687 GOTO(out_ladvise, rc);
3694 OBD_FREE(k_ladvise_hdr, alloc_size);
3697 case LL_IOC_FLR_SET_MIRROR: {
3698 /* mirror I/O must be direct to avoid polluting page cache
3700 if (!(file->f_flags & O_DIRECT))
3703 fd->fd_designated_mirror = (__u32)arg;
3706 case LL_IOC_FSGETXATTR:
3707 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3708 case LL_IOC_FSSETXATTR:
3709 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3711 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3712 case LL_IOC_HEAT_GET: {
3713 struct lu_heat uheat;
3714 struct lu_heat *heat;
3717 if (copy_from_user(&uheat, (void __user *)arg, sizeof(uheat)))
3720 if (uheat.lh_count > OBD_HEAT_COUNT)
3721 uheat.lh_count = OBD_HEAT_COUNT;
3723 size = offsetof(typeof(uheat), lh_heat[uheat.lh_count]);
3724 OBD_ALLOC(heat, size);
3728 heat->lh_count = uheat.lh_count;
3729 ll_heat_get(inode, heat);
3730 rc = copy_to_user((char __user *)arg, heat, size);
3731 OBD_FREE(heat, size);
3732 RETURN(rc ? -EFAULT : 0);
3734 case LL_IOC_HEAT_SET: {
3737 if (copy_from_user(&flags, (void __user *)arg, sizeof(flags)))
3740 rc = ll_heat_set(inode, flags);
3744 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3745 (void __user *)arg));
3749 #ifndef HAVE_FILE_LLSEEK_SIZE
3750 static inline loff_t
3751 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3753 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3755 if (offset > maxsize)
3758 if (offset != file->f_pos) {
3759 file->f_pos = offset;
3760 file->f_version = 0;
3766 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3767 loff_t maxsize, loff_t eof)
3769 struct inode *inode = file_inode(file);
3777 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3778 * position-querying operation. Avoid rewriting the "same"
3779 * f_pos value back to the file because a concurrent read(),
3780 * write() or lseek() might have altered it
3785 * f_lock protects against read/modify/write race with other
3786 * SEEK_CURs. Note that parallel writes and reads behave
3790 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3791 inode_unlock(inode);
3795 * In the generic case the entire file is data, so as long as
3796 * offset isn't at the end of the file then the offset is data.
3803 * There is a virtual hole at the end of the file, so as long as
3804 * offset isn't i_size or larger, return i_size.
3812 return llseek_execute(file, offset, maxsize);
3816 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3818 struct inode *inode = file_inode(file);
3819 loff_t retval, eof = 0;
3822 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3823 (origin == SEEK_CUR) ? file->f_pos : 0);
3824 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3825 PFID(ll_inode2fid(inode)), inode, retval, retval,
3827 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3829 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3830 retval = ll_glimpse_size(inode);
3833 eof = i_size_read(inode);
3836 retval = ll_generic_file_llseek_size(file, offset, origin,
3837 ll_file_maxbytes(inode), eof);
3841 static int ll_flush(struct file *file, fl_owner_t id)
3843 struct inode *inode = file_inode(file);
3844 struct ll_inode_info *lli = ll_i2info(inode);
3845 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3848 LASSERT(!S_ISDIR(inode->i_mode));
3850 /* catch async errors that were recorded back when async writeback
3851 * failed for pages in this mapping. */
3852 rc = lli->lli_async_rc;
3853 lli->lli_async_rc = 0;
3854 if (lli->lli_clob != NULL) {
3855 err = lov_read_and_clear_async_rc(lli->lli_clob);
3860 /* The application has already been told about the write failure.
3861 * Do not report the failure again. */
3862 if (fd->fd_write_failed)
3864 return rc ? -EIO : 0;
3868 * Called to make sure a portion of the file has been written out.
3869 * If @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to the OSTs.
3871 * Return how many pages have been written.
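/*
 * Illustrative note (based only on the code visible in this file): the
 * caller here is ll_fsync() below, which issues
 * cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0) so that dirty pages
 * are written back and an OST sync is performed; CL_FSYNC_LOCAL, per the
 * comment above, only writes pages out without sending OST_SYNC RPCs.
 */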
3873 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3874 enum cl_fsync_mode mode, int ignore_layout)
3878 struct cl_fsync_io *fio;
3883 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3884 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3887 env = cl_env_get(&refcheck);
3889 RETURN(PTR_ERR(env));
3891 io = vvp_env_thread_io(env);
3892 io->ci_obj = ll_i2info(inode)->lli_clob;
3893 io->ci_ignore_layout = ignore_layout;
3895 /* initialize parameters for sync */
3896 fio = &io->u.ci_fsync;
3897 fio->fi_start = start;
3899 fio->fi_fid = ll_inode2fid(inode);
3900 fio->fi_mode = mode;
3901 fio->fi_nr_written = 0;
3903 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3904 result = cl_io_loop(env, io);
3906 result = io->ci_result;
3908 result = fio->fi_nr_written;
3909 cl_io_fini(env, io);
3910 cl_env_put(env, &refcheck);
3916 * When dentry is provided (the 'else' case), file_dentry() may be
3917 * null and dentry must be used directly rather than pulled from
3918 * file_dentry() as is done otherwise.
3921 #ifdef HAVE_FILE_FSYNC_4ARGS
3922 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3924 struct dentry *dentry = file_dentry(file);
3925 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3926 int ll_fsync(struct file *file, int datasync)
3928 struct dentry *dentry = file_dentry(file);
3930 loff_t end = LLONG_MAX;
3932 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3935 loff_t end = LLONG_MAX;
3937 struct inode *inode = dentry->d_inode;
3938 struct ll_inode_info *lli = ll_i2info(inode);
3939 struct ptlrpc_request *req;
3943 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3944 PFID(ll_inode2fid(inode)), inode);
3945 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3947 #ifdef HAVE_FILE_FSYNC_4ARGS
3948 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3951 /* fsync's caller has already called _fdata{sync,write}, we want
3952 * that IO to finish before calling the osc and mdc sync methods */
3953 rc = filemap_fdatawait(inode->i_mapping);
3956 /* catch async errors that were recorded back when async writeback
3957 * failed for pages in this mapping. */
3958 if (!S_ISDIR(inode->i_mode)) {
3959 err = lli->lli_async_rc;
3960 lli->lli_async_rc = 0;
3963 if (lli->lli_clob != NULL) {
3964 err = lov_read_and_clear_async_rc(lli->lli_clob);
3970 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3974 ptlrpc_req_finished(req);
3976 if (S_ISREG(inode->i_mode)) {
3977 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3979 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3980 if (rc == 0 && err < 0)
3983 fd->fd_write_failed = true;
3985 fd->fd_write_failed = false;
3988 #ifdef HAVE_FILE_FSYNC_4ARGS
3989 inode_unlock(inode);
3995 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3997 struct inode *inode = file_inode(file);
3998 struct ll_sb_info *sbi = ll_i2sbi(inode);
3999 struct ldlm_enqueue_info einfo = {
4000 .ei_type = LDLM_FLOCK,
4001 .ei_cb_cp = ldlm_flock_completion_ast,
4002 .ei_cbdata = file_lock,
4004 struct md_op_data *op_data;
4005 struct lustre_handle lockh = { 0 };
4006 union ldlm_policy_data flock = { { 0 } };
4007 int fl_type = file_lock->fl_type;
4013 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
4014 PFID(ll_inode2fid(inode)), file_lock);
4016 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
4018 if (file_lock->fl_flags & FL_FLOCK) {
4019 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
4020 /* flocks are whole-file locks */
4021 flock.l_flock.end = OFFSET_MAX;
4022 /* For flocks the owner is determined by the local file descriptor */
4023 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
4024 } else if (file_lock->fl_flags & FL_POSIX) {
4025 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
4026 flock.l_flock.start = file_lock->fl_start;
4027 flock.l_flock.end = file_lock->fl_end;
4031 flock.l_flock.pid = file_lock->fl_pid;
4033 /* Somewhat ugly workaround for svc lockd.
4034 * lockd installs a custom fl_lmops->lm_compare_owner that checks
4035 * that the fl_owner is the same (which it always is on the local node,
4036 * presumably between lockd processes) and then compares the pid.
4037 * As such we assign the pid to the owner field to make it all work;
4038 * conflicts with normal locks are unlikely since the pid space and the
4039 * pointer space for current->files do not intersect */
4040 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
4041 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
4045 einfo.ei_mode = LCK_PR;
4048 /* An unlock request may or may not have any relation to
4049 * existing locks so we may not be able to pass a lock handle
4050 * via a normal ldlm_lock_cancel() request. The request may even
4051 * unlock a byte range in the middle of an existing lock. In
4052 * order to process an unlock request we need all of the same
4053 * information that is given with a normal read or write record
4054 * lock request. To avoid creating another ldlm unlock (cancel)
4055 * message we'll treat a LCK_NL flock request as an unlock. */
4056 einfo.ei_mode = LCK_NL;
4059 einfo.ei_mode = LCK_PW;
4062 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
4077 flags = LDLM_FL_BLOCK_NOWAIT;
4083 flags = LDLM_FL_TEST_LOCK;
4086 CERROR("unknown fcntl lock command: %d\n", cmd);
4090 /* Save the old mode so that if the mode in the lock changes we
4091 * can decrement the appropriate reader or writer refcount. */
4092 file_lock->fl_type = einfo.ei_mode;
4094 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4095 LUSTRE_OPC_ANY, NULL);
4096 if (IS_ERR(op_data))
4097 RETURN(PTR_ERR(op_data));
4099 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4100 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4101 flock.l_flock.pid, flags, einfo.ei_mode,
4102 flock.l_flock.start, flock.l_flock.end);
4104 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4107 /* Restore the file lock type if not TEST lock. */
4108 if (!(flags & LDLM_FL_TEST_LOCK))
4109 file_lock->fl_type = fl_type;
4111 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
4112 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4113 !(flags & LDLM_FL_TEST_LOCK))
4114 rc2 = locks_lock_file_wait(file, file_lock);
4116 if ((file_lock->fl_flags & FL_FLOCK) &&
4117 (rc == 0 || file_lock->fl_type == F_UNLCK))
4118 rc2 = flock_lock_file_wait(file, file_lock);
4119 if ((file_lock->fl_flags & FL_POSIX) &&
4120 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4121 !(flags & LDLM_FL_TEST_LOCK))
4122 rc2 = posix_lock_file_wait(file, file_lock);
4123 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
4125 if (rc2 && file_lock->fl_type != F_UNLCK) {
4126 einfo.ei_mode = LCK_NL;
4127 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4132 ll_finish_md_op_data(op_data);
4137 int ll_get_fid_by_name(struct inode *parent, const char *name,
4138 int namelen, struct lu_fid *fid,
4139 struct inode **inode)
4141 struct md_op_data *op_data = NULL;
4142 struct mdt_body *body;
4143 struct ptlrpc_request *req;
4147 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4148 LUSTRE_OPC_ANY, NULL);
4149 if (IS_ERR(op_data))
4150 RETURN(PTR_ERR(op_data));
4152 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4153 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4154 ll_finish_md_op_data(op_data);
4158 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4160 GOTO(out_req, rc = -EFAULT);
4162 *fid = body->mbo_fid1;
4165 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4167 ptlrpc_req_finished(req);
4171 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4174 struct dentry *dchild = NULL;
4175 struct inode *child_inode = NULL;
4176 struct md_op_data *op_data;
4177 struct ptlrpc_request *request = NULL;
4178 struct obd_client_handle *och = NULL;
4180 struct mdt_body *body;
4181 __u64 data_version = 0;
4182 size_t namelen = strlen(name);
4183 int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4187 CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4188 PFID(ll_inode2fid(parent)), name,
4189 lum->lum_stripe_offset, lum->lum_stripe_count);
4191 if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4192 lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4193 lustre_swab_lmv_user_md(lum);
4195 /* Get child FID first */
4196 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4199 dchild = d_lookup(file_dentry(file), &qstr);
4201 if (dchild->d_inode)
4202 child_inode = igrab(dchild->d_inode);
4207 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
4216 if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
4217 OBD_CONNECT2_DIR_MIGRATE)) {
4218 if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
4219 ll_i2info(child_inode)->lli_lsm_md) {
4220 CERROR("%s: MDT doesn't support striped directory "
4221 "migration!\n", ll_i2sbi(parent)->ll_fsname);
4222 GOTO(out_iput, rc = -EOPNOTSUPP);
4227 * lfs migrate command needs to be blocked on the client
4228 * by checking the migrate FID against the FID of the
4231 if (child_inode == parent->i_sb->s_root->d_inode)
4232 GOTO(out_iput, rc = -EINVAL);
4234 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4235 child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4236 if (IS_ERR(op_data))
4237 GOTO(out_iput, rc = PTR_ERR(op_data));
4239 inode_lock(child_inode);
4240 op_data->op_fid3 = *ll_inode2fid(child_inode);
4241 if (!fid_is_sane(&op_data->op_fid3)) {
4242 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4243 ll_i2sbi(parent)->ll_fsname, name,
4244 PFID(&op_data->op_fid3));
4245 GOTO(out_unlock, rc = -EINVAL);
4248 op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4249 op_data->op_data = lum;
4250 op_data->op_data_size = lumlen;
4253 if (S_ISREG(child_inode->i_mode)) {
4254 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4258 GOTO(out_unlock, rc);
4261 rc = ll_data_version(child_inode, &data_version,
4264 GOTO(out_close, rc);
4266 op_data->op_open_handle = och->och_open_handle;
4267 op_data->op_data_version = data_version;
4268 op_data->op_lease_handle = och->och_lease_handle;
4269 op_data->op_bias |= MDS_CLOSE_MIGRATE;
4271 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4272 och->och_mod->mod_open_req->rq_replay = 0;
4273 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
4276 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4277 name, namelen, &request);
4279 LASSERT(request != NULL);
4280 ll_update_times(request, parent);
4283 if (rc == 0 || rc == -EAGAIN) {
4284 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4285 LASSERT(body != NULL);
4287 /* If the server does release the layout lock, then we clean up
4288 * the client och here, otherwise release it in out_close: */
4289 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4290 obd_mod_put(och->och_mod);
4291 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4293 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
4299 if (request != NULL) {
4300 ptlrpc_req_finished(request);
4304 /* Try again if the lease has been cancelled. */
4305 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4310 ll_lease_close(och, child_inode, NULL);
4312 clear_nlink(child_inode);
4314 inode_unlock(child_inode);
4315 ll_finish_md_op_data(op_data);
4322 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4330 * Test whether some locks matching the bits and l_req_mode are acquired
4331 * - the bits can be spread across different locks
4332 * - if found, clear the common lock bits in *bits
4333 * - the bits not found are kept in *bits
4335 * \param bits [IN] searched lock bits
4336 * \param l_req_mode [IN] searched lock mode
4337 * \retval boolean, true iff all bits are found
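/*
 * Illustrative example (the variable names are placeholders):
 *
 *	__u64 bits = MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE;
 *
 *	if (ll_have_md_lock(inode, &bits, LCK_MINMODE))
 *		... both bits are covered by locally granted locks ...
 *	else
 *		... 'bits' now holds only the bits that were NOT found ...
 */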
4339 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4341 struct lustre_handle lockh;
4342 union ldlm_policy_data policy;
4343 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4344 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4353 fid = &ll_i2info(inode)->lli_fid;
4354 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4355 ldlm_lockname[mode]);
4357 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4358 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4359 policy.l_inodebits.bits = *bits & (1 << i);
4360 if (policy.l_inodebits.bits == 0)
4363 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4364 &policy, mode, &lockh)) {
4365 struct ldlm_lock *lock;
4367 lock = ldlm_handle2lock(&lockh);
4370 ~(lock->l_policy_data.l_inodebits.bits);
4371 LDLM_LOCK_PUT(lock);
4373 *bits &= ~policy.l_inodebits.bits;
4380 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4381 struct lustre_handle *lockh, __u64 flags,
4382 enum ldlm_mode mode)
4384 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4389 fid = &ll_i2info(inode)->lli_fid;
4390 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4392 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4393 fid, LDLM_IBITS, &policy, mode, lockh);
4398 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4400 /* Already unlinked. Just update nlink and return success */
4401 if (rc == -ENOENT) {
4403 /* If it is a striped directory and there is a bad stripe,
4404 * let's revalidate the dentry again instead of returning
4406 if (S_ISDIR(inode->i_mode) &&
4407 ll_i2info(inode)->lli_lsm_md != NULL)
4410 /* This path cannot be hit for regular files unless in
4411 * case of obscure races, so no need to validate
4413 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4415 } else if (rc != 0) {
4416 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4417 "%s: revalidate FID "DFID" error: rc = %d\n",
4418 ll_i2sbi(inode)->ll_fsname,
4419 PFID(ll_inode2fid(inode)), rc);
4425 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4427 struct inode *inode = dentry->d_inode;
4428 struct obd_export *exp = ll_i2mdexp(inode);
4429 struct lookup_intent oit = {
4432 struct ptlrpc_request *req = NULL;
4433 struct md_op_data *op_data;
4437 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4438 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4440 /* Call getattr by fid, so do not provide name at all. */
4441 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4442 LUSTRE_OPC_ANY, NULL);
4443 if (IS_ERR(op_data))
4444 RETURN(PTR_ERR(op_data));
4446 rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4447 ll_finish_md_op_data(op_data);
4449 rc = ll_inode_revalidate_fini(inode, rc);
4453 rc = ll_revalidate_it_finish(req, &oit, dentry);
4455 ll_intent_release(&oit);
4459 /* Unlinked? Unhash dentry, so it is not picked up later by
4460 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4461 * here to preserve get_cwd functionality on 2.6.
4463 if (!dentry->d_inode->i_nlink) {
4464 ll_lock_dcache(inode);
4465 d_lustre_invalidate(dentry, 0);
4466 ll_unlock_dcache(inode);
4469 ll_lookup_finish_locks(&oit, dentry);
4471 ptlrpc_req_finished(req);
4476 static int ll_merge_md_attr(struct inode *inode)
4478 struct ll_inode_info *lli = ll_i2info(inode);
4479 struct cl_attr attr = { 0 };
4482 LASSERT(lli->lli_lsm_md != NULL);
4484 /* foreign dir is not striped dir */
4485 if (lli->lli_lsm_md->lsm_md_magic == LMV_MAGIC_FOREIGN)
4488 down_read(&lli->lli_lsm_sem);
4489 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4490 &attr, ll_md_blocking_ast);
4491 up_read(&lli->lli_lsm_sem);
4495 set_nlink(inode, attr.cat_nlink);
4496 inode->i_blocks = attr.cat_blocks;
4497 i_size_write(inode, attr.cat_size);
4499 ll_i2info(inode)->lli_atime = attr.cat_atime;
4500 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4501 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
4506 static inline dev_t ll_compat_encode_dev(dev_t dev)
4508 /* The compat_sys_*stat*() syscalls will fail unless the
4509 * device majors and minors are both less than 256. Note that
4510 * the value returned here will be passed through
4511 * old_encode_dev() in cp_compat_stat(). And so we are not
4512 * trying to return a valid compat (u16) device number, just
4513 * one that will pass the old_valid_dev() check. */
4515 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
4518 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4519 int ll_getattr(const struct path *path, struct kstat *stat,
4520 u32 request_mask, unsigned int flags)
4522 struct dentry *de = path->dentry;
4524 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4527 struct inode *inode = de->d_inode;
4528 struct ll_sb_info *sbi = ll_i2sbi(inode);
4529 struct ll_inode_info *lli = ll_i2info(inode);
4532 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4534 rc = ll_inode_revalidate(de, IT_GETATTR);
4538 if (S_ISREG(inode->i_mode)) {
4539 /* In case of restore, the MDT has the right size and has
4540 * already sent it back without granting the layout lock,
4541 * so the inode is up-to-date and a glimpse is useless.
4542 * Also, to glimpse we need the layout; in case of a running
4543 * restore the MDT holds the layout lock, so the glimpse will
4544 * block until the end of the restore (getattr will block)
4546 if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4547 rc = ll_glimpse_size(inode);
4552 /* If the object isn't a regular file then don't validate its size. */
4553 if (S_ISDIR(inode->i_mode) &&
4554 lli->lli_lsm_md != NULL) {
4555 rc = ll_merge_md_attr(inode);
4560 inode->i_atime.tv_sec = lli->lli_atime;
4561 inode->i_mtime.tv_sec = lli->lli_mtime;
4562 inode->i_ctime.tv_sec = lli->lli_ctime;
4565 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4567 if (ll_need_32bit_api(sbi)) {
4568 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4569 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4570 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4572 stat->ino = inode->i_ino;
4573 stat->dev = inode->i_sb->s_dev;
4574 stat->rdev = inode->i_rdev;
4577 stat->mode = inode->i_mode;
4578 stat->uid = inode->i_uid;
4579 stat->gid = inode->i_gid;
4580 stat->atime = inode->i_atime;
4581 stat->mtime = inode->i_mtime;
4582 stat->ctime = inode->i_ctime;
4583 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4585 stat->nlink = inode->i_nlink;
4586 stat->size = i_size_read(inode);
4587 stat->blocks = inode->i_blocks;
4592 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4593 __u64 start, __u64 len)
4597 struct fiemap *fiemap;
4598 unsigned int extent_count = fieinfo->fi_extents_max;
4600 num_bytes = sizeof(*fiemap) + (extent_count *
4601 sizeof(struct fiemap_extent));
4602 OBD_ALLOC_LARGE(fiemap, num_bytes);
4607 fiemap->fm_flags = fieinfo->fi_flags;
4608 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4609 fiemap->fm_start = start;
4610 fiemap->fm_length = len;
4611 if (extent_count > 0 &&
4612 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4613 sizeof(struct fiemap_extent)) != 0)
4614 GOTO(out, rc = -EFAULT);
4616 rc = ll_do_fiemap(inode, fiemap, num_bytes);
4618 fieinfo->fi_flags = fiemap->fm_flags;
4619 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4620 if (extent_count > 0 &&
4621 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4622 fiemap->fm_mapped_extents *
4623 sizeof(struct fiemap_extent)) != 0)
4624 GOTO(out, rc = -EFAULT);
4626 OBD_FREE_LARGE(fiemap, num_bytes);
4630 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4632 struct ll_inode_info *lli = ll_i2info(inode);
4633 struct posix_acl *acl = NULL;
4636 spin_lock(&lli->lli_lock);
4637 /* VFS' acl_permission_check->check_acl will release the refcount */
4638 acl = posix_acl_dup(lli->lli_posix_acl);
4639 spin_unlock(&lli->lli_lock);
4644 #ifdef HAVE_IOP_SET_ACL
4645 #ifdef CONFIG_FS_POSIX_ACL
4646 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4648 struct ll_sb_info *sbi = ll_i2sbi(inode);
4649 struct ptlrpc_request *req = NULL;
4650 const char *name = NULL;
4652 size_t value_size = 0;
4657 case ACL_TYPE_ACCESS:
4658 name = XATTR_NAME_POSIX_ACL_ACCESS;
4660 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4663 case ACL_TYPE_DEFAULT:
4664 name = XATTR_NAME_POSIX_ACL_DEFAULT;
4665 if (!S_ISDIR(inode->i_mode))
4666 rc = acl ? -EACCES : 0;
4677 value_size = posix_acl_xattr_size(acl->a_count);
4678 value = kmalloc(value_size, GFP_NOFS);
4680 GOTO(out, rc = -ENOMEM);
4682 rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
4684 GOTO(out_value, rc);
4687 rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4688 value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
4689 name, value, value_size, 0, 0, &req);
4691 ptlrpc_req_finished(req);
4696 forget_cached_acl(inode, type);
4698 set_cached_acl(inode, type, acl);
4701 #endif /* CONFIG_FS_POSIX_ACL */
4702 #endif /* HAVE_IOP_SET_ACL */
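/*
 * For context, a sketch of the userspace side, assuming libacl
 * (<sys/acl.h>) and an illustrative path: libacl converts the ACL into the
 * system.posix_acl_access xattr, which reaches ll_set_acl() above as a
 * setxattr destined for the MDS.
 *
 *	acl_t acl = acl_from_text("u::rw-,g::r--,o::r--");
 *	if (acl != NULL) {
 *		acl_set_file("/mnt/lustre/somefile", ACL_TYPE_ACCESS, acl);
 *		acl_free(acl);
 *	}
 */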
4704 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4706 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4707 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4709 ll_check_acl(struct inode *inode, int mask)
4712 # ifdef CONFIG_FS_POSIX_ACL
4713 struct posix_acl *acl;
4717 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4718 if (flags & IPERM_FLAG_RCU)
4721 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4726 rc = posix_acl_permission(inode, acl, mask);
4727 posix_acl_release(acl);
4730 # else /* !CONFIG_FS_POSIX_ACL */
4732 # endif /* CONFIG_FS_POSIX_ACL */
4734 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
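/* the HAVE_* macros around the permission code are set by Lustre's
 * configure-time kernel probes; they track how generic_permission() and
 * the inode ->permission() prototypes changed across kernel versions */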
4736 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4737 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4739 # ifdef HAVE_INODE_PERMISION_2ARGS
4740 int ll_inode_permission(struct inode *inode, int mask)
4742 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4747 struct ll_sb_info *sbi;
4748 struct root_squash_info *squash;
4749 struct cred *cred = NULL;
4750 const struct cred *old_cred = NULL;
4752 bool squash_id = false;
4755 #ifdef MAY_NOT_BLOCK
4756 if (mask & MAY_NOT_BLOCK)
4758 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4759 if (flags & IPERM_FLAG_RCU)
4763 /* as the root inode is NOT validated in the lookup operation,
4764 * it needs to be revalidated here before the permission check. */
4766 if (inode == inode->i_sb->s_root->d_inode) {
4767 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4772 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4773 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4775 /* squash fsuid/fsgid if needed */
4776 sbi = ll_i2sbi(inode);
4777 squash = &sbi->ll_squash;
4778 if (unlikely(squash->rsi_uid != 0 &&
4779 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4780 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4784 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4785 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4786 squash->rsi_uid, squash->rsi_gid);
4788 /* update the current process's credentials
4789 * and FS capabilities */
4790 cred = prepare_creds();
4794 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4795 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
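/* drop the filesystem-related capabilities (those in CFS_CAP_FS_MASK,
 * e.g. CAP_DAC_OVERRIDE and CAP_CHOWN) so a squashed root cannot bypass
 * the permission check via capabilities */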
4796 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4797 if ((1 << cap) & CFS_CAP_FS_MASK)
4798 cap_lower(cred->cap_effective, cap);
4800 old_cred = override_creds(cred);
4803 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4804 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4805 /* restore the current process's credentials and FS capabilities */
4807 revert_creds(old_cred);
4814 /* -o localflock - only provides locally consistent flock locks */
4815 struct file_operations ll_file_operations = {
4816 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4817 # ifdef HAVE_SYNC_READ_WRITE
4818 .read = new_sync_read,
4819 .write = new_sync_write,
4821 .read_iter = ll_file_read_iter,
4822 .write_iter = ll_file_write_iter,
4823 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4824 .read = ll_file_read,
4825 .aio_read = ll_file_aio_read,
4826 .write = ll_file_write,
4827 .aio_write = ll_file_aio_write,
4828 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4829 .unlocked_ioctl = ll_file_ioctl,
4830 .open = ll_file_open,
4831 .release = ll_file_release,
4832 .mmap = ll_file_mmap,
4833 .llseek = ll_file_seek,
4834 .splice_read = ll_file_splice_read,
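/* -o flock - provides cluster-coherent flock and POSIX locks via the DLM */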
4839 struct file_operations ll_file_operations_flock = {
4840 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4841 # ifdef HAVE_SYNC_READ_WRITE
4842 .read = new_sync_read,
4843 .write = new_sync_write,
4844 # endif /* HAVE_SYNC_READ_WRITE */
4845 .read_iter = ll_file_read_iter,
4846 .write_iter = ll_file_write_iter,
4847 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4848 .read = ll_file_read,
4849 .aio_read = ll_file_aio_read,
4850 .write = ll_file_write,
4851 .aio_write = ll_file_aio_write,
4852 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4853 .unlocked_ioctl = ll_file_ioctl,
4854 .open = ll_file_open,
4855 .release = ll_file_release,
4856 .mmap = ll_file_mmap,
4857 .llseek = ll_file_seek,
4858 .splice_read = ll_file_splice_read,
4861 .flock = ll_file_flock,
4862 .lock = ll_file_flock
4865 /* These are for -o noflock - to return ENOSYS on flock calls */
4866 struct file_operations ll_file_operations_noflock = {
4867 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4868 # ifdef HAVE_SYNC_READ_WRITE
4869 .read = new_sync_read,
4870 .write = new_sync_write,
4871 # endif /* HAVE_SYNC_READ_WRITE */
4872 .read_iter = ll_file_read_iter,
4873 .write_iter = ll_file_write_iter,
4874 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4875 .read = ll_file_read,
4876 .aio_read = ll_file_aio_read,
4877 .write = ll_file_write,
4878 .aio_write = ll_file_aio_write,
4879 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4880 .unlocked_ioctl = ll_file_ioctl,
4881 .open = ll_file_open,
4882 .release = ll_file_release,
4883 .mmap = ll_file_mmap,
4884 .llseek = ll_file_seek,
4885 .splice_read = ll_file_splice_read,
4888 .flock = ll_file_noflock,
4889 .lock = ll_file_noflock
4892 struct inode_operations ll_file_inode_operations = {
4893 .setattr = ll_setattr,
4894 .getattr = ll_getattr,
4895 .permission = ll_inode_permission,
4896 #ifdef HAVE_IOP_XATTR
4897 .setxattr = ll_setxattr,
4898 .getxattr = ll_getxattr,
4899 .removexattr = ll_removexattr,
4901 .listxattr = ll_listxattr,
4902 .fiemap = ll_fiemap,
4903 #ifdef HAVE_IOP_GET_ACL
4904 .get_acl = ll_get_acl,
4906 #ifdef HAVE_IOP_SET_ACL
4907 .set_acl = ll_set_acl,
4911 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4913 struct ll_inode_info *lli = ll_i2info(inode);
4914 struct cl_object *obj = lli->lli_clob;
4923 env = cl_env_get(&refcheck);
4925 RETURN(PTR_ERR(env));
4927 rc = cl_conf_set(env, lli->lli_clob, conf);
4931 if (conf->coc_opc == OBJECT_CONF_SET) {
4932 struct ldlm_lock *lock = conf->coc_lock;
4933 struct cl_layout cl = {
4937 LASSERT(lock != NULL);
4938 LASSERT(ldlm_has_layout(lock));
4940 /* the lock can only be allowed to match after the layout has been
4941 * applied to the inode, otherwise a stale layout could be
4942 * seen. Applying the layout should happen before dropping
4943 * the intent lock. */
4944 ldlm_lock_allow_match(lock);
4946 rc = cl_object_layout_get(env, obj, &cl);
4951 DFID": layout version change: %u -> %u\n",
4952 PFID(&lli->lli_fid), ll_layout_version_get(lli),
4954 ll_layout_version_set(lli, cl.cl_layout_gen);
4958 cl_env_put(env, &refcheck);
4963 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
4964 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4967 struct ll_sb_info *sbi = ll_i2sbi(inode);
4968 struct ptlrpc_request *req;
4975 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4976 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4977 lock->l_lvb_data, lock->l_lvb_len);
4979 if (lock->l_lvb_data != NULL)
4982 /* if layout lock was granted right away, the layout is returned
4983 * within DLM_LVB of dlm reply; otherwise if the lock was ever
4984 * blocked and then granted via completion ast, we have to fetch
4985 * the layout here. Note that we can't use the LVB buffer in the
4986 * completion AST because it is not large enough. */
4987 rc = ll_get_default_mdsize(sbi, &lmmsize);
4991 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), OBD_MD_FLXATTR,
4992 XATTR_NAME_LOV, lmmsize, &req);
4995 GOTO(out, rc = 0); /* empty layout */
5002 if (lmmsize == 0) /* empty layout */
5005 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
5007 GOTO(out, rc = -EFAULT);
5009 OBD_ALLOC_LARGE(lvbdata, lmmsize);
5010 if (lvbdata == NULL)
5011 GOTO(out, rc = -ENOMEM);
5013 memcpy(lvbdata, lmm, lmmsize);
5014 lock_res_and_lock(lock);
5015 if (unlikely(lock->l_lvb_data == NULL)) {
5016 lock->l_lvb_type = LVB_T_LAYOUT;
5017 lock->l_lvb_data = lvbdata;
5018 lock->l_lvb_len = lmmsize;
5021 unlock_res_and_lock(lock);
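/* if another thread attached its own LVB while the layout was being
 * fetched, the local copy was not consumed above and is freed here */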
5024 OBD_FREE_LARGE(lvbdata, lmmsize);
5029 ptlrpc_req_finished(req);
5034 * Apply the layout to the inode. Layout lock is held and will be released
5037 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
5038 struct inode *inode)
5040 struct ll_inode_info *lli = ll_i2info(inode);
5041 struct ll_sb_info *sbi = ll_i2sbi(inode);
5042 struct ldlm_lock *lock;
5043 struct cl_object_conf conf;
5046 bool wait_layout = false;
5049 LASSERT(lustre_handle_is_used(lockh));
5051 lock = ldlm_handle2lock(lockh);
5052 LASSERT(lock != NULL);
5053 LASSERT(ldlm_has_layout(lock));
5055 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
5056 PFID(&lli->lli_fid), inode);
5058 /* in case this is a cached lock, reinstate it with the new inode */
5059 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
5061 lock_res_and_lock(lock);
5062 lvb_ready = ldlm_is_lvb_ready(lock);
5063 unlock_res_and_lock(lock);
5065 /* checking lvb_ready is racy but this is okay. The worst case is
5066 * that multiple processes may configure the file at the same time. */
5070 rc = ll_layout_fetch(inode, lock);
5074 /* for layout lock, lmm is stored in lock's lvb.
5075 * lvb_data is immutable if the lock is held so it's safe to access it
5078 * set layout to file. This is unlikely to fail as the old layout
5079 * was surely eliminated */
5080 memset(&conf, 0, sizeof conf);
5081 conf.coc_opc = OBJECT_CONF_SET;
5082 conf.coc_inode = inode;
5083 conf.coc_lock = lock;
5084 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
5085 conf.u.coc_layout.lb_len = lock->l_lvb_len;
5086 rc = ll_layout_conf(inode, &conf);
5088 /* refresh layout failed, need to wait */
5089 wait_layout = rc == -EBUSY;
5092 LDLM_LOCK_PUT(lock);
5093 ldlm_lock_decref(lockh, mode);
5095 /* wait for IO to complete if the old layout is still being used. */
5097 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
5098 sbi->ll_fsname, PFID(&lli->lli_fid), inode);
5100 memset(&conf, 0, sizeof conf);
5101 conf.coc_opc = OBJECT_CONF_WAIT;
5102 conf.coc_inode = inode;
5103 rc = ll_layout_conf(inode, &conf);
5107 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
5108 sbi->ll_fsname, PFID(&lli->lli_fid), rc);
5114 * Issue layout intent RPC to MDS.
5115 * \param inode [in] file inode
5116 * \param intent [in] layout intent
5118 * \retval 0 on success
5119 * \retval < 0 error code
5121 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
5123 struct ll_inode_info *lli = ll_i2info(inode);
5124 struct ll_sb_info *sbi = ll_i2sbi(inode);
5125 struct md_op_data *op_data;
5126 struct lookup_intent it;
5127 struct ptlrpc_request *req;
5131 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
5132 0, 0, LUSTRE_OPC_ANY, NULL);
5133 if (IS_ERR(op_data))
5134 RETURN(PTR_ERR(op_data));
5136 op_data->op_data = intent;
5137 op_data->op_data_size = sizeof(*intent);
5139 memset(&it, 0, sizeof(it));
5140 it.it_op = IT_LAYOUT;
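/* write and truncate intents are sent in write mode so that the MDT may
 * instantiate layout components covering the range being modified */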
5141 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
5142 intent->li_opc == LAYOUT_INTENT_TRUNC)
5143 it.it_flags = FMODE_WRITE;
5145 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
5146 sbi->ll_fsname, PFID(&lli->lli_fid), inode);
5148 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
5149 &ll_md_blocking_ast, 0);
5150 if (it.it_request != NULL)
5151 ptlrpc_req_finished(it.it_request);
5152 it.it_request = NULL;
5154 ll_finish_md_op_data(op_data);
5156 /* set lock data in case this is a new lock */
5158 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
5160 ll_intent_drop_lock(&it);
5166 * This function checks whether a LAYOUT lock exists on the client side,
5167 * and enqueues one if it is not already cached.
5169 * This function does not hold the layout lock, so the lock may be revoked at
5170 * any time after it returns. Any operations that depend on the layout should be redone
5173 * This function should be called before lov_io_init() to get an up-to-date
5174 * layout version; the caller should save the version number, and after the IO
5175 * is finished, call this function again to verify that the layout
5176 * was not changed during the IO.
5178 int ll_layout_refresh(struct inode *inode, __u32 *gen)
5180 struct ll_inode_info *lli = ll_i2info(inode);
5181 struct ll_sb_info *sbi = ll_i2sbi(inode);
5182 struct lustre_handle lockh;
5183 struct layout_intent intent = {
5184 .li_opc = LAYOUT_INTENT_ACCESS,
5186 enum ldlm_mode mode;
5190 *gen = ll_layout_version_get(lli);
5191 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
5195 LASSERT(fid_is_sane(ll_inode2fid(inode)));
5196 LASSERT(S_ISREG(inode->i_mode));
5198 /* take layout lock mutex to enqueue layout lock exclusively. */
5199 mutex_lock(&lli->lli_layout_mutex);
5202 /* usually the layout lock is cached on the local side, so try to
5203 * match it before grabbing the layout lock mutex. */
5204 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5205 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
5206 if (mode != 0) { /* hit cached lock */
5207 rc = ll_layout_lock_set(&lockh, mode, inode);
5213 rc = ll_layout_intent(inode, &intent);
5219 *gen = ll_layout_version_get(lli);
5220 mutex_unlock(&lli->lli_layout_mutex);
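/*
 * Typical calling pattern implied by the description above (a sketch only;
 * rc, gen_before and gen_after are illustrative local names):
 *
 *	__u32 gen_before, gen_after;
 *	int rc;
 *
 *	rc = ll_layout_refresh(inode, &gen_before);
 *	... initialize and run the IO (e.g. via lov_io_init()) ...
 *	rc = ll_layout_refresh(inode, &gen_after);
 *	if (rc == 0 && gen_before != gen_after)
 *		... the layout changed during the IO, redo the operation ...
 */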
5226 * Issue layout intent RPC indicating where in a file an IO is about to write.
5228 * \param[in] inode file inode.
5229 * \param[in] ext write range, with the start offset in bytes where an
5230 * IO is about to write, and the exclusive end offset in
5233 * \retval 0 on success
5234 * \retval < 0 error code
5236 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5237 struct lu_extent *ext)
5239 struct layout_intent intent = {
5241 .li_extent.e_start = ext->e_start,
5242 .li_extent.e_end = ext->e_end,
5247 rc = ll_layout_intent(inode, &intent);
5253 * This function sends a restore request to the MDT
5255 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5257 struct hsm_user_request *hur;
5261 len = sizeof(struct hsm_user_request) +
5262 sizeof(struct hsm_user_item);
5263 OBD_ALLOC(hur, len);
5267 hur->hur_request.hr_action = HUA_RESTORE;
5268 hur->hur_request.hr_archive_id = 0;
5269 hur->hur_request.hr_flags = 0;
5270 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5271 sizeof(hur->hur_user_item[0].hui_fid));
5272 hur->hur_user_item[0].hui_extent.offset = offset;
5273 hur->hur_user_item[0].hui_extent.length = length;
5274 hur->hur_request.hr_itemcount = 1;
5275 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,