4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2017, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
49 #include <uapi/linux/lustre/lustre_ioctl.h>
50 #include <lustre_swab.h>
52 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
57 struct inode *sp_inode;
62 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
64 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
67 static struct ll_file_data *ll_file_data_get(void)
69 struct ll_file_data *fd;
71 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
75 fd->fd_write_failed = false;
80 static void ll_file_data_put(struct ll_file_data *fd)
83 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
87 * Packs all the attributes into @op_data for the CLOSE rpc.
89 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
90 struct obd_client_handle *och)
94 ll_prep_md_op_data(op_data, inode, NULL, NULL,
95 0, 0, LUSTRE_OPC_ANY, NULL);
97 op_data->op_attr.ia_mode = inode->i_mode;
98 op_data->op_attr.ia_atime = inode->i_atime;
99 op_data->op_attr.ia_mtime = inode->i_mtime;
100 op_data->op_attr.ia_ctime = inode->i_ctime;
101 op_data->op_attr.ia_size = i_size_read(inode);
102 op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
103 ATTR_MTIME | ATTR_MTIME_SET |
105 op_data->op_xvalid |= OP_XVALID_CTIME_SET;
106 op_data->op_attr_blocks = inode->i_blocks;
107 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
108 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
109 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
110 op_data->op_open_handle = och->och_open_handle;
112 if (och->och_flags & FMODE_WRITE &&
113 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
114 /* For HSM: if inode data has been modified, pack it so that
115 * MDT can set data dirty flag in the archive. */
116 op_data->op_bias |= MDS_DATA_MODIFIED;
122 * Perform a close, possibly with a bias.
123 * The meaning of "data" depends on the value of "bias".
125 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
126 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode
127 * to swap the layout with.
129 static int ll_close_inode_openhandle(struct inode *inode,
130 struct obd_client_handle *och,
131 enum mds_op_bias bias, void *data)
133 struct obd_export *md_exp = ll_i2mdexp(inode);
134 const struct ll_inode_info *lli = ll_i2info(inode);
135 struct md_op_data *op_data;
136 struct ptlrpc_request *req = NULL;
140 if (class_exp2obd(md_exp) == NULL) {
141 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
142 ll_i2sbi(inode)->ll_fsname, PFID(&lli->lli_fid));
146 OBD_ALLOC_PTR(op_data);
147 /* We leak the openhandle and request here on error, but not much can be
148 * done in the OOM case since the app won't retry the close on error either. */
150 GOTO(out, rc = -ENOMEM);
152 ll_prepare_close(inode, op_data, och);
154 case MDS_CLOSE_LAYOUT_MERGE:
155 /* merge blocks from the victim inode */
156 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
157 op_data->op_attr.ia_valid |= ATTR_SIZE;
158 op_data->op_xvalid |= OP_XVALID_BLOCKS;
159 case MDS_CLOSE_LAYOUT_SPLIT:
160 case MDS_CLOSE_LAYOUT_SWAP: {
161 struct split_param *sp = data;
163 LASSERT(data != NULL);
164 op_data->op_bias |= bias;
165 op_data->op_data_version = 0;
166 op_data->op_lease_handle = och->och_lease_handle;
167 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
168 op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
169 op_data->op_mirror_id = sp->sp_mirror_id;
171 op_data->op_fid2 = *ll_inode2fid(data);
176 case MDS_CLOSE_RESYNC_DONE: {
177 struct ll_ioc_lease *ioc = data;
179 LASSERT(data != NULL);
180 op_data->op_attr_blocks +=
181 ioc->lil_count * op_data->op_attr_blocks;
182 op_data->op_attr.ia_valid |= ATTR_SIZE;
183 op_data->op_xvalid |= OP_XVALID_BLOCKS;
184 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
186 op_data->op_lease_handle = och->och_lease_handle;
187 op_data->op_data = &ioc->lil_ids[0];
188 op_data->op_data_size =
189 ioc->lil_count * sizeof(ioc->lil_ids[0]);
193 case MDS_HSM_RELEASE:
194 LASSERT(data != NULL);
195 op_data->op_bias |= MDS_HSM_RELEASE;
196 op_data->op_data_version = *(__u64 *)data;
197 op_data->op_lease_handle = och->och_lease_handle;
198 op_data->op_attr.ia_valid |= ATTR_SIZE;
199 op_data->op_xvalid |= OP_XVALID_BLOCKS;
203 LASSERT(data == NULL);
207 if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
208 op_data->op_xvalid |= OP_XVALID_LAZYSIZE;
209 if (!(op_data->op_xvalid & OP_XVALID_BLOCKS))
210 op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS;
212 rc = md_close(md_exp, op_data, och->och_mod, &req);
213 if (rc != 0 && rc != -EINTR)
214 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
215 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
217 if (rc == 0 && op_data->op_bias & bias) {
218 struct mdt_body *body;
220 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
221 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
225 ll_finish_md_op_data(op_data);
229 md_clear_open_replay_data(md_exp, och);
230 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
233 ptlrpc_req_finished(req); /* This is close request */
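/*
 * Close the MDS open handle of the given kind (read, write or exec) for this
 * inode if no other users remain, dropping the matching lli_open_fd_*_count
 * reference under lli_och_mutex.
 */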
237 int ll_md_real_close(struct inode *inode, fmode_t fmode)
239 struct ll_inode_info *lli = ll_i2info(inode);
240 struct obd_client_handle **och_p;
241 struct obd_client_handle *och;
246 if (fmode & FMODE_WRITE) {
247 och_p = &lli->lli_mds_write_och;
248 och_usecount = &lli->lli_open_fd_write_count;
249 } else if (fmode & FMODE_EXEC) {
250 och_p = &lli->lli_mds_exec_och;
251 och_usecount = &lli->lli_open_fd_exec_count;
253 LASSERT(fmode & FMODE_READ);
254 och_p = &lli->lli_mds_read_och;
255 och_usecount = &lli->lli_open_fd_read_count;
258 mutex_lock(&lli->lli_och_mutex);
259 if (*och_usecount > 0) {
260 /* There are still users of this handle, so skip freeing it. */
262 mutex_unlock(&lli->lli_och_mutex);
268 mutex_unlock(&lli->lli_och_mutex);
271 /* There might be a race and this handle may already be closed. */
273 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
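/*
 * Per-file-descriptor close: drop any group lock or lease still attached to
 * the file data, decrement the open counts, and only talk to the MDS when no
 * matching OPEN lock is cached locally.
 */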
279 static int ll_md_close(struct inode *inode, struct file *file)
281 union ldlm_policy_data policy = {
282 .l_inodebits = { MDS_INODELOCK_OPEN },
284 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
285 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
286 struct ll_inode_info *lli = ll_i2info(inode);
287 struct lustre_handle lockh;
288 enum ldlm_mode lockmode;
292 /* clear group lock, if present */
293 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
294 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
296 if (fd->fd_lease_och != NULL) {
299 /* Usually the lease is not released when the
300 * application crashes; release it here. */
301 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
302 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
303 PFID(&lli->lli_fid), rc, lease_broken);
305 fd->fd_lease_och = NULL;
308 if (fd->fd_och != NULL) {
309 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
314 /* Let's see if we have a good enough OPEN lock on the file and if
315 we can skip talking to the MDS */
316 mutex_lock(&lli->lli_och_mutex);
317 if (fd->fd_omode & FMODE_WRITE) {
319 LASSERT(lli->lli_open_fd_write_count);
320 lli->lli_open_fd_write_count--;
321 } else if (fd->fd_omode & FMODE_EXEC) {
323 LASSERT(lli->lli_open_fd_exec_count);
324 lli->lli_open_fd_exec_count--;
327 LASSERT(lli->lli_open_fd_read_count);
328 lli->lli_open_fd_read_count--;
330 mutex_unlock(&lli->lli_och_mutex);
332 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
333 LDLM_IBITS, &policy, lockmode, &lockh))
334 rc = ll_md_real_close(inode, fd->fd_omode);
337 LUSTRE_FPRIVATE(file) = NULL;
338 ll_file_data_put(fd);
343 /* Although this returns an error code, fput() in the caller ignores it, so we
344 * need to make every effort to clean up all of our state here. Also,
345 * applications rarely check close errors, and even if an error is returned
346 * they will not retry the close call.
348 int ll_file_release(struct inode *inode, struct file *file)
350 struct ll_file_data *fd;
351 struct ll_sb_info *sbi = ll_i2sbi(inode);
352 struct ll_inode_info *lli = ll_i2info(inode);
356 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
357 PFID(ll_inode2fid(inode)), inode);
359 if (inode->i_sb->s_root != file_dentry(file))
360 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
361 fd = LUSTRE_FPRIVATE(file);
364 /* The last ref on @file, but maybe not the owner pid of statahead,
365 * because parent and child processes can share the same file handle. */
366 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
367 ll_deauthorize_statahead(inode, fd);
369 if (inode->i_sb->s_root == file_dentry(file)) {
370 LUSTRE_FPRIVATE(file) = NULL;
371 ll_file_data_put(fd);
375 if (!S_ISDIR(inode->i_mode)) {
376 if (lli->lli_clob != NULL)
377 lov_read_and_clear_async_rc(lli->lli_clob);
378 lli->lli_async_rc = 0;
381 rc = ll_md_close(inode, file);
383 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
384 libcfs_debug_dumplog();
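/*
 * Copy one page worth of inline (Data-on-MDT) reply data into a page cache
 * page, zero-filling the tail if the buffer is shorter than PAGE_SIZE, and
 * mark the page up to date.
 */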
389 static inline int ll_dom_readpage(void *data, struct page *page)
391 struct niobuf_local *lnb = data;
394 kaddr = ll_kmap_atomic(page, KM_USER0);
395 memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
396 if (lnb->lnb_len < PAGE_SIZE)
397 memset(kaddr + lnb->lnb_len, 0,
398 PAGE_SIZE - lnb->lnb_len);
399 flush_dcache_page(page);
400 SetPageUptodate(page);
401 ll_kunmap_atomic(kaddr, KM_USER0);
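/*
 * If the open reply carried inline file data (Data-on-MDT) and a DOM lock was
 * granted, pre-populate the page cache with it so a subsequent read needs no
 * extra RPC.
 */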
407 void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
408 struct lookup_intent *it)
410 struct ll_inode_info *lli = ll_i2info(inode);
411 struct cl_object *obj = lli->lli_clob;
412 struct address_space *mapping = inode->i_mapping;
414 struct niobuf_remote *rnb;
416 struct lustre_handle lockh;
417 struct ldlm_lock *lock;
418 unsigned long index, start;
419 struct niobuf_local lnb;
420 bool dom_lock = false;
427 if (it->it_lock_mode != 0) {
428 lockh.cookie = it->it_lock_handle;
429 lock = ldlm_handle2lock(&lockh);
431 dom_lock = ldlm_has_dom(lock);
437 if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
441 rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
442 if (rnb == NULL || rnb->rnb_len == 0)
445 /* LU-11595: The server may return the whole file, which is always usable,
446 * or it may return just the file tail, whose offset must be aligned to the
447 * client PAGE_SIZE to be usable on this client; if the server's PAGE_SIZE
448 * is smaller, the offset may be unaligned and that data is simply ignored.
450 if (rnb->rnb_offset % PAGE_SIZE)
453 /* The server returns the whole file if it fits in the reply buffer, or just
454 * the file tail otherwise; in both cases the returned data should extend to
455 * the end of the file (i_size).
456 if (rnb->rnb_offset + rnb->rnb_len < i_size_read(inode)) {
457 CERROR("%s: server returns off/len %llu/%u < i_size %llu\n",
458 ll_i2sbi(inode)->ll_fsname, rnb->rnb_offset,
459 rnb->rnb_len, i_size_read(inode));
463 CDEBUG(D_INFO, "Get data along with open at %llu len %i, i_size %llu\n",
464 rnb->rnb_offset, rnb->rnb_len, i_size_read(inode));
466 data = (char *)rnb + sizeof(*rnb);
468 lnb.lnb_file_offset = rnb->rnb_offset;
469 start = lnb.lnb_file_offset / PAGE_SIZE;
471 LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
472 lnb.lnb_page_offset = 0;
474 lnb.lnb_data = data + (index << PAGE_SHIFT);
475 lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
476 if (lnb.lnb_len > PAGE_SIZE)
477 lnb.lnb_len = PAGE_SIZE;
479 vmpage = read_cache_page(mapping, index + start,
480 ll_dom_readpage, &lnb);
481 if (IS_ERR(vmpage)) {
482 CWARN("%s: cannot fill page %lu for "DFID
483 " with data: rc = %li\n",
484 ll_i2sbi(inode)->ll_fsname, index + start,
485 PFID(lu_object_fid(&obj->co_lu)),
491 } while (rnb->rnb_len > (index << PAGE_SHIFT));
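/*
 * Send an intent OPEN request to the MDT. The name is packed only when the
 * server does not support open-by-FID; on success the inode is set up from
 * the reply and any inline DOM data is consumed.
 */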
495 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
496 struct lookup_intent *itp)
498 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
499 struct dentry *parent = de->d_parent;
502 struct md_op_data *op_data;
503 struct ptlrpc_request *req = NULL;
507 LASSERT(parent != NULL);
508 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
510 /* if server supports open-by-fid, or file name is invalid, don't pack
511 * name in open request */
512 if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_OPEN_BY_NAME) ||
513 !(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID)) {
515 len = de->d_name.len;
516 name = kmalloc(len + 1, GFP_NOFS);
521 spin_lock(&de->d_lock);
522 if (len != de->d_name.len) {
523 spin_unlock(&de->d_lock);
527 memcpy(name, de->d_name.name, len);
529 spin_unlock(&de->d_lock);
531 if (!lu_name_is_valid_2(name, len)) {
537 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
538 name, len, 0, LUSTRE_OPC_ANY, NULL);
539 if (IS_ERR(op_data)) {
541 RETURN(PTR_ERR(op_data));
543 op_data->op_data = lmm;
544 op_data->op_data_size = lmmsize;
546 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
547 &ll_md_blocking_ast, 0);
549 ll_finish_md_op_data(op_data);
551 /* Reason for keeping our own exit path: don't flood the log
552 * with -ESTALE error messages.
554 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
555 it_open_error(DISP_OPEN_OPEN, itp))
557 ll_release_openhandle(de, itp);
561 if (it_disposition(itp, DISP_LOOKUP_NEG))
562 GOTO(out, rc = -ENOENT);
564 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
565 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
566 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
570 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
572 if (!rc && itp->it_lock_mode) {
573 ll_dom_finish_open(de->d_inode, req, itp);
574 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
578 ptlrpc_req_finished(req);
579 ll_intent_drop_lock(itp);
581 /* We did open by fid, but by the time we got to the server,
582 * the object disappeared. If this is a create, we cannot really
583 * tell the userspace that the file it was trying to create
584 * does not exist. Instead let's return -ESTALE, and the VFS will
585 * retry the create with LOOKUP_REVAL that we are going to catch
586 * in ll_revalidate_dentry() and use lookup then.
588 if (rc == -ENOENT && itp->it_op & IT_CREAT)
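/*
 * Fill an obd_client_handle from the MDT reply body and register it for open
 * replay.
 */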
594 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
595 struct obd_client_handle *och)
597 struct mdt_body *body;
599 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
600 och->och_open_handle = body->mbo_open_handle;
601 och->och_fid = body->mbo_fid1;
602 och->och_lease_handle.cookie = it->it_lock_handle;
603 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
604 och->och_flags = it->it_flags;
606 return md_set_open_replay_data(md_exp, och, it);
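/*
 * Finish the client-side part of an open: fill the MDS open handle (if any),
 * attach the ll_file_data to the file and record the open mode.
 */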
609 static int ll_local_open(struct file *file, struct lookup_intent *it,
610 struct ll_file_data *fd, struct obd_client_handle *och)
612 struct inode *inode = file_inode(file);
615 LASSERT(!LUSTRE_FPRIVATE(file));
622 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
627 LUSTRE_FPRIVATE(file) = fd;
628 ll_readahead_init(inode, &fd->fd_ras);
629 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
631 /* ll_cl_context initialize */
632 rwlock_init(&fd->fd_lock);
633 INIT_LIST_HEAD(&fd->fd_lccs);
638 /* Open a file, and (for the very first open) create objects on the OSTs at
639 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
640 * creation or open until ll_lov_setstripe() ioctl is called.
642 * If we already have the stripe MD locally then we don't request it in
643 * md_open(), by passing a lmm_size = 0.
645 * It is up to the application to ensure no other processes open this file
646 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
647 * used. We might be able to avoid races of that sort by getting lli_open_sem
648 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
649 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
651 int ll_file_open(struct inode *inode, struct file *file)
653 struct ll_inode_info *lli = ll_i2info(inode);
654 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
655 .it_flags = file->f_flags };
656 struct obd_client_handle **och_p = NULL;
657 __u64 *och_usecount = NULL;
658 struct ll_file_data *fd;
662 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
663 PFID(ll_inode2fid(inode)), inode, file->f_flags);
665 it = file->private_data; /* XXX: compat macro */
666 file->private_data = NULL; /* prevent ll_local_open assertion */
668 fd = ll_file_data_get();
670 GOTO(out_nofiledata, rc = -ENOMEM);
673 if (S_ISDIR(inode->i_mode))
674 ll_authorize_statahead(inode, fd);
676 if (inode->i_sb->s_root == file_dentry(file)) {
677 LUSTRE_FPRIVATE(file) = fd;
681 if (!it || !it->it_disposition) {
682 /* Convert f_flags into access mode. We cannot use file->f_mode,
683 * because everything but the O_ACCMODE mask was stripped from it. */
685 if ((oit.it_flags + 1) & O_ACCMODE)
687 if (file->f_flags & O_TRUNC)
688 oit.it_flags |= FMODE_WRITE;
690 /* The kernel only calls f_op->open() in dentry_open(). filp_open() calls
691 * dentry_open() after a call to open_namei() that checks permissions.
692 * Only nfsd_open() calls dentry_open() directly without checking
693 * permissions, and because of that the code below is safe.
695 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
696 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
698 /* We do not want O_EXCL here, presumably we opened the file
699 * already? XXX - NFS implications? */
700 oit.it_flags &= ~O_EXCL;
702 /* bug20584: if "it_flags" contains O_CREAT, the file will be
703 * created if necessary, so "IT_CREAT" should be set to stay
704 * consistent with it */
705 if (oit.it_flags & O_CREAT)
706 oit.it_op |= IT_CREAT;
712 /* Let's see if we have file open on MDS already. */
713 if (it->it_flags & FMODE_WRITE) {
714 och_p = &lli->lli_mds_write_och;
715 och_usecount = &lli->lli_open_fd_write_count;
716 } else if (it->it_flags & FMODE_EXEC) {
717 och_p = &lli->lli_mds_exec_och;
718 och_usecount = &lli->lli_open_fd_exec_count;
720 och_p = &lli->lli_mds_read_och;
721 och_usecount = &lli->lli_open_fd_read_count;
724 mutex_lock(&lli->lli_och_mutex);
725 if (*och_p) { /* Open handle is present */
726 if (it_disposition(it, DISP_OPEN_OPEN)) {
727 /* Well, there's an extra open request that we do not need;
728 let's close it somehow. This will decref the request. */
729 rc = it_open_error(DISP_OPEN_OPEN, it);
731 mutex_unlock(&lli->lli_och_mutex);
732 GOTO(out_openerr, rc);
735 ll_release_openhandle(file_dentry(file), it);
739 rc = ll_local_open(file, it, fd, NULL);
742 mutex_unlock(&lli->lli_och_mutex);
743 GOTO(out_openerr, rc);
746 LASSERT(*och_usecount == 0);
747 if (!it->it_disposition) {
748 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
749 /* We cannot just request the lock handle now; the new ELC code
750 means that one of the other OPEN locks for this file
751 could be cancelled, and since the blocking ast handler
752 would attempt to grab och_mutex as well, that would
753 result in a deadlock */
754 mutex_unlock(&lli->lli_och_mutex);
756 * Normally called under two situations:
757 * 1. NFS export.
758 * 2. A race/condition on MDS resulting in no open
759 * handle to be returned from LOOKUP|OPEN request,
760 * for example if the target entry was a symlink.
762 * Only fetch MDS_OPEN_LOCK if this is in the NFS path,
763 * marked by a bit set in ll_iget_for_nfs. Clear the
764 * bit so that it does not confuse later callers.
766 * NB: when ldd is NULL, it must have come via the normal
767 * lookup path only, since ll_iget_for_nfs always calls
768 * ll_d_init().
770 if (ldd && ldd->lld_nfs_dentry) {
771 ldd->lld_nfs_dentry = 0;
772 it->it_flags |= MDS_OPEN_LOCK;
776 * Always specify MDS_OPEN_BY_FID because we don't want
777 * to get file with different fid.
779 it->it_flags |= MDS_OPEN_BY_FID;
780 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
783 GOTO(out_openerr, rc);
787 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
789 GOTO(out_och_free, rc = -ENOMEM);
793 /* md_intent_lock() didn't get a request ref if there was an
794 * open error, so don't do cleanup on the request here
796 /* XXX (green): Shouldn't we bail out on any error here, not
797 * just an open error? */
798 rc = it_open_error(DISP_OPEN_OPEN, it);
800 GOTO(out_och_free, rc);
802 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
803 "inode %p: disposition %x, status %d\n", inode,
804 it_disposition(it, ~0), it->it_status);
806 rc = ll_local_open(file, it, fd, *och_p);
808 GOTO(out_och_free, rc);
810 mutex_unlock(&lli->lli_och_mutex);
813 /* Must do this outside the lli_och_mutex lock to prevent a deadlock where a
814 different kind of OPEN lock for this same inode gets cancelled
815 by ldlm_cancel_lru */
816 if (!S_ISREG(inode->i_mode))
817 GOTO(out_och_free, rc);
819 cl_lov_delay_create_clear(&file->f_flags);
820 GOTO(out_och_free, rc);
824 if (och_p && *och_p) {
825 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
826 *och_p = NULL; /* OBD_FREE writes some magic there */
829 mutex_unlock(&lli->lli_och_mutex);
832 if (lli->lli_opendir_key == fd)
833 ll_deauthorize_statahead(inode, fd);
835 ll_file_data_put(fd);
837 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
841 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
842 ptlrpc_req_finished(it->it_request);
843 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
849 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
850 struct ldlm_lock_desc *desc, void *data, int flag)
853 struct lustre_handle lockh;
857 case LDLM_CB_BLOCKING:
858 ldlm_lock2handle(lock, &lockh);
859 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
861 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
865 case LDLM_CB_CANCELING:
873 * When setting a lease on a file, we take ownership of the lli_mds_*_och
874 * and save it as fd->fd_och so as to force client to reopen the file even
875 * if it has an open lock in cache already.
877 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
878 struct lustre_handle *old_open_handle)
880 struct ll_inode_info *lli = ll_i2info(inode);
881 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
882 struct obd_client_handle **och_p;
887 /* Get the openhandle of the file */
888 mutex_lock(&lli->lli_och_mutex);
889 if (fd->fd_lease_och != NULL)
890 GOTO(out_unlock, rc = -EBUSY);
892 if (fd->fd_och == NULL) {
893 if (file->f_mode & FMODE_WRITE) {
894 LASSERT(lli->lli_mds_write_och != NULL);
895 och_p = &lli->lli_mds_write_och;
896 och_usecount = &lli->lli_open_fd_write_count;
898 LASSERT(lli->lli_mds_read_och != NULL);
899 och_p = &lli->lli_mds_read_och;
900 och_usecount = &lli->lli_open_fd_read_count;
903 if (*och_usecount > 1)
904 GOTO(out_unlock, rc = -EBUSY);
911 *old_open_handle = fd->fd_och->och_open_handle;
915 mutex_unlock(&lli->lli_och_mutex);
920 * Release ownership on lli_mds_*_och when putting back a file lease.
922 static int ll_lease_och_release(struct inode *inode, struct file *file)
924 struct ll_inode_info *lli = ll_i2info(inode);
925 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
926 struct obd_client_handle **och_p;
927 struct obd_client_handle *old_och = NULL;
932 mutex_lock(&lli->lli_och_mutex);
933 if (file->f_mode & FMODE_WRITE) {
934 och_p = &lli->lli_mds_write_och;
935 och_usecount = &lli->lli_open_fd_write_count;
937 och_p = &lli->lli_mds_read_och;
938 och_usecount = &lli->lli_open_fd_read_count;
941 /* The file may have been opened by another process (broken lease), so
942 * *och_p is not NULL. In this case we should simply increase the usecount
943 * and close fd_och. */
945 if (*och_p != NULL) {
946 old_och = fd->fd_och;
953 mutex_unlock(&lli->lli_och_mutex);
956 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
962 * Acquire a lease and open the file.
964 static struct obd_client_handle *
965 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
968 struct lookup_intent it = { .it_op = IT_OPEN };
969 struct ll_sb_info *sbi = ll_i2sbi(inode);
970 struct md_op_data *op_data;
971 struct ptlrpc_request *req = NULL;
972 struct lustre_handle old_open_handle = { 0 };
973 struct obd_client_handle *och = NULL;
978 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
979 RETURN(ERR_PTR(-EINVAL));
982 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
983 RETURN(ERR_PTR(-EPERM));
985 rc = ll_lease_och_acquire(inode, file, &old_open_handle);
992 RETURN(ERR_PTR(-ENOMEM));
994 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
995 LUSTRE_OPC_ANY, NULL);
997 GOTO(out, rc = PTR_ERR(op_data));
999 /* To tell the MDT this openhandle is from the same owner */
1000 op_data->op_open_handle = old_open_handle;
1002 it.it_flags = fmode | open_flags;
1003 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
1004 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
1005 &ll_md_blocking_lease_ast,
1006 /* LDLM_FL_NO_LRU: do not put the lease lock into the LRU list, otherwise
1007 * it can be cancelled, which may mislead applications into thinking the
1008 * lease is broken;
1009 * LDLM_FL_EXCL: set this flag so that it won't be matched by a normal
1010 * open in ll_md_blocking_ast(). Otherwise, since ll_md_blocking_lease_ast()
1011 * does not deal with the openhandle, a normal openhandle would be leaked. */
1012 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
1013 ll_finish_md_op_data(op_data);
1014 ptlrpc_req_finished(req);
1016 GOTO(out_release_it, rc);
1018 if (it_disposition(&it, DISP_LOOKUP_NEG))
1019 GOTO(out_release_it, rc = -ENOENT);
1021 rc = it_open_error(DISP_OPEN_OPEN, &it);
1023 GOTO(out_release_it, rc);
1025 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
1026 ll_och_fill(sbi->ll_md_exp, &it, och);
1028 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
1029 GOTO(out_close, rc = -EOPNOTSUPP);
1031 /* we already got the lease; handle the lease lock */
1032 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
1033 if (it.it_lock_mode == 0 ||
1034 it.it_lock_bits != MDS_INODELOCK_OPEN) {
1035 /* open lock must return for lease */
1036 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
1037 PFID(ll_inode2fid(inode)), it.it_lock_mode,
1039 GOTO(out_close, rc = -EPROTO);
1042 ll_intent_release(&it);
1046 /* Cancel open lock */
1047 if (it.it_lock_mode != 0) {
1048 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
1050 it.it_lock_mode = 0;
1051 och->och_lease_handle.cookie = 0ULL;
1053 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
1055 CERROR("%s: error closing file "DFID": %d\n",
1056 sbi->ll_fsname, PFID(&ll_i2info(inode)->lli_fid), rc2);
1057 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
1059 ll_intent_release(&it);
1063 RETURN(ERR_PTR(rc));
1067 * Check whether a layout swap can be done between two inodes.
1069 * \param[in] inode1 First inode to check
1070 * \param[in] inode2 Second inode to check
1072 * \retval 0 on success, layout swap can be performed between both inodes
1073 * \retval negative error code if requirements are not met
1075 static int ll_check_swap_layouts_validity(struct inode *inode1,
1076 struct inode *inode2)
1078 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
1081 if (inode_permission(inode1, MAY_WRITE) ||
1082 inode_permission(inode2, MAY_WRITE))
1085 if (inode1->i_sb != inode2->i_sb)
1091 static int ll_swap_layouts_close(struct obd_client_handle *och,
1092 struct inode *inode, struct inode *inode2)
1094 const struct lu_fid *fid1 = ll_inode2fid(inode);
1095 const struct lu_fid *fid2;
1099 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
1100 ll_i2sbi(inode)->ll_fsname, PFID(fid1));
1102 rc = ll_check_swap_layouts_validity(inode, inode2);
1104 GOTO(out_free_och, rc);
1106 /* We now know that inode2 is a lustre inode */
1107 fid2 = ll_inode2fid(inode2);
1109 rc = lu_fid_cmp(fid1, fid2);
1111 GOTO(out_free_och, rc = -EINVAL);
1113 /* Close the file and {swap,merge} layouts between inode & inode2.
1114 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
1115 * because we still need it to pack l_remote_handle to MDT. */
1116 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
1119 och = NULL; /* freed in ll_close_inode_openhandle() */
1129 * Release the lease and close the file.
1130 * It will check whether the lease was ever broken.
1132 static int ll_lease_close_intent(struct obd_client_handle *och,
1133 struct inode *inode,
1134 bool *lease_broken, enum mds_op_bias bias,
1137 struct ldlm_lock *lock;
1138 bool cancelled = true;
1142 lock = ldlm_handle2lock(&och->och_lease_handle);
1144 lock_res_and_lock(lock);
1145 cancelled = ldlm_is_cancel(lock);
1146 unlock_res_and_lock(lock);
1147 LDLM_LOCK_PUT(lock);
1150 CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1151 PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1153 if (lease_broken != NULL)
1154 *lease_broken = cancelled;
1156 if (!cancelled && !bias)
1157 ldlm_cli_cancel(&och->och_lease_handle, 0);
1159 if (cancelled) { /* no need to execute intent */
1164 rc = ll_close_inode_openhandle(inode, och, bias, data);
1168 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1171 return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1175 * After the lease is taken, send the MDS_REINT_RESYNC RPC to the MDT
1177 static int ll_lease_file_resync(struct obd_client_handle *och,
1178 struct inode *inode, unsigned long arg)
1180 struct ll_sb_info *sbi = ll_i2sbi(inode);
1181 struct md_op_data *op_data;
1182 struct ll_ioc_lease_id ioc;
1183 __u64 data_version_unused;
1187 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1188 LUSTRE_OPC_ANY, NULL);
1189 if (IS_ERR(op_data))
1190 RETURN(PTR_ERR(op_data));
1192 if (copy_from_user(&ioc, (struct ll_ioc_lease_id __user *)arg,
1196 /* Before starting file resync, it's necessary to clean up the page cache
1197 * in client memory, otherwise once the layout version is increased,
1198 * writing back cached data will be denied by the OSTs. */
1199 rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1203 op_data->op_lease_handle = och->och_lease_handle;
1204 op_data->op_mirror_id = ioc.lil_mirror_id;
1205 rc = md_file_resync(sbi->ll_md_exp, op_data);
1211 ll_finish_md_op_data(op_data);
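/*
 * Merge the attributes most recently obtained from the MDS with the
 * size/blocks/timestamps kept by the OST objects, under the inode size lock.
 */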
1215 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1217 struct ll_inode_info *lli = ll_i2info(inode);
1218 struct cl_object *obj = lli->lli_clob;
1219 struct cl_attr *attr = vvp_env_thread_attr(env);
1227 ll_inode_size_lock(inode);
1229 /* Merge timestamps the most recently obtained from MDS with
1230 * timestamps obtained from OSTs.
1232 * Do not overwrite atime of inode because it may be refreshed
1233 * by file_accessed() function. If the read was served by cache
1234 * data, there is no RPC to be sent so that atime may not be
1235 * transferred to OSTs at all. MDT only updates atime at close time
1236 * if it's at least 'mdd.*.atime_diff' older.
1237 * All in all, the atime in Lustre does not strictly comply with
1238 * POSIX. Solving this properly would require sending an RPC to the MDT for
1239 * each read, which would hurt performance.
1241 if (inode->i_atime.tv_sec < lli->lli_atime ||
1242 lli->lli_update_atime) {
1243 inode->i_atime.tv_sec = lli->lli_atime;
1244 lli->lli_update_atime = 0;
1246 inode->i_mtime.tv_sec = lli->lli_mtime;
1247 inode->i_ctime.tv_sec = lli->lli_ctime;
1249 mtime = inode->i_mtime.tv_sec;
1250 atime = inode->i_atime.tv_sec;
1251 ctime = inode->i_ctime.tv_sec;
1253 cl_object_attr_lock(obj);
1254 if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1257 rc = cl_object_attr_get(env, obj, attr);
1258 cl_object_attr_unlock(obj);
1261 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1263 if (atime < attr->cat_atime)
1264 atime = attr->cat_atime;
1266 if (ctime < attr->cat_ctime)
1267 ctime = attr->cat_ctime;
1269 if (mtime < attr->cat_mtime)
1270 mtime = attr->cat_mtime;
1272 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1273 PFID(&lli->lli_fid), attr->cat_size);
1275 i_size_write(inode, attr->cat_size);
1276 inode->i_blocks = attr->cat_blocks;
1278 inode->i_mtime.tv_sec = mtime;
1279 inode->i_atime.tv_sec = atime;
1280 inode->i_ctime.tv_sec = ctime;
1283 ll_inode_size_unlock(inode);
1289 * Set the designated mirror for I/O.
1291 * So far only read, write, and truncate can issue I/O to a
1292 * designated mirror.
1294 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1296 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1298 /* clear the layout version for generic (non-resync) I/O in case it carries
1299 * a stale layout version due to I/O restart */
1300 io->ci_layout_version = 0;
1302 /* FLR: disable non-delay for designated mirror I/O because obviously
1303 * only one mirror is available */
1304 if (fd->fd_designated_mirror > 0) {
1306 io->ci_designated_mirror = fd->fd_designated_mirror;
1307 io->ci_layout_version = fd->fd_layout_version;
1310 CDEBUG(D_VFSTRACE, "%s: designated mirror: %d\n",
1311 file->f_path.dentry->d_name.name, io->ci_designated_mirror);
1314 static bool file_is_noatime(const struct file *file)
1316 const struct vfsmount *mnt = file->f_path.mnt;
1317 const struct inode *inode = file_inode((struct file *)file);
1319 /* Adapted from file_accessed() and touch_atime().*/
1320 if (file->f_flags & O_NOATIME)
1323 if (inode->i_flags & S_NOATIME)
1326 if (IS_NOATIME(inode))
1329 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1332 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1335 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
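/*
 * Initialize a cl_io from the file state: append/sync flags for writes, the
 * lock policy (no cl locks when file locking is disabled, mandatory locking
 * for O_APPEND), noatime handling and FLR mirror selection.
 */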
1341 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1343 struct inode *inode = file_inode(file);
1344 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1346 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1347 io->ci_lock_no_expand = fd->ll_lock_no_expand;
1349 if (iot == CIT_WRITE) {
1350 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1351 io->u.ci_wr.wr_sync = !!(file->f_flags & O_SYNC ||
1352 file->f_flags & O_DIRECT ||
1355 io->ci_obj = ll_i2info(inode)->lli_clob;
1356 io->ci_lockreq = CILR_MAYBE;
1357 if (ll_file_nolock(file)) {
1358 io->ci_lockreq = CILR_NEVER;
1359 io->ci_no_srvlock = 1;
1360 } else if (file->f_flags & O_APPEND) {
1361 io->ci_lockreq = CILR_MANDATORY;
1363 io->ci_noatime = file_is_noatime(file);
1365 /* FLR: only use non-delay I/O for read, as there is only one
1366 * available mirror for write. */
1367 io->ci_ndelay = !(iot == CIT_WRITE);
1369 ll_io_set_mirror(io, file);
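/*
 * Account one read or write sample plus the number of bytes moved in the
 * per-inode "file heat" instances, unless file heat is disabled.
 */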
1372 static void ll_heat_add(struct inode *inode, enum cl_io_type iot,
1375 struct ll_inode_info *lli = ll_i2info(inode);
1376 struct ll_sb_info *sbi = ll_i2sbi(inode);
1377 enum obd_heat_type sample_type;
1378 enum obd_heat_type iobyte_type;
1379 __u64 now = ktime_get_real_seconds();
1381 if (!ll_sbi_has_file_heat(sbi) ||
1382 lli->lli_heat_flags & LU_HEAT_FLAG_OFF)
1385 if (iot == CIT_READ) {
1386 sample_type = OBD_HEAT_READSAMPLE;
1387 iobyte_type = OBD_HEAT_READBYTE;
1388 } else if (iot == CIT_WRITE) {
1389 sample_type = OBD_HEAT_WRITESAMPLE;
1390 iobyte_type = OBD_HEAT_WRITEBYTE;
1395 spin_lock(&lli->lli_heat_lock);
1396 obd_heat_add(&lli->lli_heat_instances[sample_type], now, 1,
1397 sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1398 obd_heat_add(&lli->lli_heat_instances[iobyte_type], now, count,
1399 sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1400 spin_unlock(&lli->lli_heat_lock);
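/*
 * Common back end for read and write: build a cl_io, take the range lock for
 * writes and direct I/O reads, run the cl_io loop, and restart the I/O when
 * the layout changed or another FLR mirror should be tried.
 */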
1404 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1405 struct file *file, enum cl_io_type iot,
1406 loff_t *ppos, size_t count)
1408 struct vvp_io *vio = vvp_env_io(env);
1409 struct inode *inode = file_inode(file);
1410 struct ll_inode_info *lli = ll_i2info(inode);
1411 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1412 struct range_lock range;
1416 unsigned retried = 0;
1417 bool restarted = false;
1421 CDEBUG(D_VFSTRACE, "%s: %s ppos: %llu, count: %zu\n",
1422 file_dentry(file)->d_name.name,
1423 iot == CIT_READ ? "read" : "write", *ppos, count);
1426 io = vvp_env_thread_io(env);
1427 ll_io_init(io, file, iot);
1428 io->ci_ndelay_tried = retried;
1430 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1431 bool range_locked = false;
1433 if (file->f_flags & O_APPEND)
1434 range_lock_init(&range, 0, LUSTRE_EOF);
1436 range_lock_init(&range, *ppos, *ppos + count - 1);
1438 vio->vui_fd = LUSTRE_FPRIVATE(file);
1439 vio->vui_io_subtype = args->via_io_subtype;
1441 switch (vio->vui_io_subtype) {
1443 vio->vui_iter = args->u.normal.via_iter;
1444 vio->vui_iocb = args->u.normal.via_iocb;
1445 /* Direct IO reads must also take the range lock,
1446 * or multiple reads will try to work on the same pages.
1447 * See LU-6227 for details. */
1448 if (((iot == CIT_WRITE) ||
1449 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1450 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1451 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1453 rc = range_lock(&lli->lli_write_tree, &range);
1457 range_locked = true;
1461 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1462 vio->u.splice.vui_flags = args->u.splice.via_flags;
1465 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1469 ll_cl_add(file, env, io, LCC_RW);
1470 rc = cl_io_loop(env, io);
1471 ll_cl_remove(file, env);
1474 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1476 range_unlock(&lli->lli_write_tree, &range);
1479 /* cl_io_rw_init() handled IO */
1483 if (io->ci_nob > 0) {
1484 result += io->ci_nob;
1485 count -= io->ci_nob;
1486 *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1488 /* prepare IO restart */
1489 if (count > 0 && args->via_io_subtype == IO_NORMAL)
1490 args->u.normal.via_iter = vio->vui_iter;
1493 cl_io_fini(env, io);
1496 "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1497 file->f_path.dentry->d_name.name,
1498 iot, rc, result, io->ci_need_restart);
1500 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1502 "%s: restart %s from %lld, count: %zu, ret: %zd, rc: %d\n",
1503 file_dentry(file)->d_name.name,
1504 iot == CIT_READ ? "read" : "write",
1505 *ppos, count, result, rc);
1506 /* preserve the tried count for FLR */
1507 retried = io->ci_ndelay_tried;
1512 if (iot == CIT_READ) {
1514 ll_stats_ops_tally(ll_i2sbi(inode),
1515 LPROC_LL_READ_BYTES, result);
1516 } else if (iot == CIT_WRITE) {
1518 ll_stats_ops_tally(ll_i2sbi(inode),
1519 LPROC_LL_WRITE_BYTES, result);
1520 fd->fd_write_failed = false;
1521 } else if (result == 0 && rc == 0) {
1524 fd->fd_write_failed = true;
1526 fd->fd_write_failed = false;
1527 } else if (rc != -ERESTARTSYS) {
1528 fd->fd_write_failed = true;
1532 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1534 ll_heat_add(inode, iot, result);
1536 RETURN(result > 0 ? result : rc);
1540 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1541 * especially for small I/O.
1543 * To serve a read request, CLIO has to create and initialize a cl_io and
1544 * then request a DLM lock. This has turned out to have significant overhead
1545 * and affects the performance of small I/O dramatically.
1547 * It's not necessary to create a cl_io for each I/O. Under the help of read
1548 * ahead, most of the pages being read are already in memory cache and we can
1549 * read those pages directly because if the pages exist, the corresponding DLM
1550 * lock must exist so that page content must be valid.
1552 * In the fast read implementation, llite speculatively finds and reads pages
1553 * in memory cache. There are three scenarios for fast read:
1554 * - If the page exists and is uptodate, kernel VM will provide the data and
1555 * CLIO won't be intervened;
1556 * - If the page was brought into memory by read ahead, it will be exported
1557 * and read ahead parameters will be updated;
1558 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1559 * it will go back and invoke normal read, i.e., a cl_io will be created
1560 * and DLM lock will be requested.
1562 * POSIX compliance: posix standard states that read is intended to be atomic.
1563 * Lustre read implementation is in line with Linux kernel read implementation
1564 * and neither of them complies with POSIX standard in this matter. Fast read
1565 * doesn't make the situation worse on single node but it may interleave write
1566 * results from multiple nodes due to short read handling in ll_file_aio_read().
1568 * \param env - lu_env
1569 * \param iocb - kiocb from kernel
1570 * \param iter - user space buffers where the data will be copied
1572 * \retval - number of bytes that have been read, or an error code if an error occurred.
1575 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1579 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1582 /* NB: we can't do direct IO for fast read because it will need a lock
1583 * to make IO engine happy. */
1584 if (iocb->ki_filp->f_flags & O_DIRECT)
1587 result = generic_file_read_iter(iocb, iter);
1589 /* If the first page is not in the cache, generic_file_read_iter() will
1590 * return -ENODATA.
1591 * See the corresponding code in ll_readpage(). */
1592 if (result == -ENODATA)
1596 ll_heat_add(file_inode(iocb->ki_filp), CIT_READ, result);
1597 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1598 LPROC_LL_READ_BYTES, result);
1605 * Read from a file (through the page cache).
1607 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1610 struct vvp_io_args *args;
1615 ll_ras_enter(iocb->ki_filp);
1617 result = ll_do_fast_read(iocb, to);
1618 if (result < 0 || iov_iter_count(to) == 0)
1621 env = cl_env_get(&refcheck);
1623 return PTR_ERR(env);
1625 args = ll_env_args(env, IO_NORMAL);
1626 args->u.normal.via_iter = to;
1627 args->u.normal.via_iocb = iocb;
1629 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1630 &iocb->ki_pos, iov_iter_count(to));
1633 else if (result == 0)
1636 cl_env_put(env, &refcheck);
1642 * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1643 * If a page is already in the page cache and dirty (and some other things -
1644 * See ll_tiny_write_begin for the instantiation of these rules), then we can
1645 * write to it without doing a full I/O, because Lustre already knows about it
1646 * and will write it out. This saves a lot of processing time.
1648 * All writes here are within one page, so exclusion is handled by the page
1649 * lock on the vm page. We do not do tiny writes for writes which touch
1650 * multiple pages because it's very unlikely multiple sequential pages are
1651 * already dirty.
1653 * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1654 * and are unlikely to target already dirty pages.
1656 * Attribute updates are important here, we do them in ll_tiny_write_end.
1658 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1660 ssize_t count = iov_iter_count(iter);
1661 struct file *file = iocb->ki_filp;
1662 struct inode *inode = file_inode(file);
1663 bool lock_inode = !IS_NOSEC(inode);
1668 /* Restrict writes to single page and < PAGE_SIZE. See comment at top
1669 * of function for why.
1671 if (count >= PAGE_SIZE ||
1672 (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
1675 if (unlikely(lock_inode))
1677 result = __generic_file_write_iter(iocb, iter);
1679 if (unlikely(lock_inode))
1680 inode_unlock(inode);
1682 /* If the page is not already dirty, ll_tiny_write_begin returns
1683 * -ENODATA. We continue on to normal write.
1685 if (result == -ENODATA)
1689 ll_heat_add(inode, CIT_WRITE, result);
1690 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1692 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1695 CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1701 * Write to a file (through the page cache).
1703 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1705 struct vvp_io_args *args;
1707 ssize_t rc_tiny = 0, rc_normal;
1712 /* NB: we can't do direct IO for tiny writes because they use the page
1713 * cache, we can't do sync writes because tiny writes can't flush
1714 * pages, and we can't do append writes because we can't guarantee the
1715 * required DLM locks are held to protect file size.
1717 if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(iocb->ki_filp))) &&
1718 !(iocb->ki_filp->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1719 rc_tiny = ll_do_tiny_write(iocb, from);
1721 /* In case of error, go on and try the normal write; only stop if the tiny
1722 * write completed the I/O.
1724 if (iov_iter_count(from) == 0)
1725 GOTO(out, rc_normal = rc_tiny);
1727 env = cl_env_get(&refcheck);
1729 return PTR_ERR(env);
1731 args = ll_env_args(env, IO_NORMAL);
1732 args->u.normal.via_iter = from;
1733 args->u.normal.via_iocb = iocb;
1735 rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1736 &iocb->ki_pos, iov_iter_count(from));
1738 /* On success, combine bytes written. */
1739 if (rc_tiny >= 0 && rc_normal > 0)
1740 rc_normal += rc_tiny;
1741 /* On error, only return error from normal write if tiny write did not
1742 * write any bytes. Otherwise return bytes written by tiny write.
1744 else if (rc_tiny > 0)
1745 rc_normal = rc_tiny;
1747 cl_env_put(env, &refcheck);
1752 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1754 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1756 static int ll_file_get_iov_count(const struct iovec *iov,
1757 unsigned long *nr_segs, size_t *count)
1762 for (seg = 0; seg < *nr_segs; seg++) {
1763 const struct iovec *iv = &iov[seg];
1766 * If any segment has a negative length, or the cumulative
1767 * length ever wraps negative then return -EINVAL.
1770 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1772 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1777 cnt -= iv->iov_len; /* This segment is no good */
1784 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1785 unsigned long nr_segs, loff_t pos)
1792 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1796 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1797 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1798 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1799 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1800 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1802 result = ll_file_read_iter(iocb, &to);
1807 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1810 struct iovec iov = { .iov_base = buf, .iov_len = count };
1815 init_sync_kiocb(&kiocb, file);
1816 kiocb.ki_pos = *ppos;
1817 #ifdef HAVE_KIOCB_KI_LEFT
1818 kiocb.ki_left = count;
1819 #elif defined(HAVE_KI_NBYTES)
1820 kiocb.ki_nbytes = count;
1823 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1824 *ppos = kiocb.ki_pos;
1830 * Write to a file (through the page cache).
1833 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1834 unsigned long nr_segs, loff_t pos)
1836 struct iov_iter from;
1841 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1845 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1846 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1847 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1848 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1849 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1851 result = ll_file_write_iter(iocb, &from);
1856 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1857 size_t count, loff_t *ppos)
1859 struct iovec iov = { .iov_base = (void __user *)buf,
1866 init_sync_kiocb(&kiocb, file);
1867 kiocb.ki_pos = *ppos;
1868 #ifdef HAVE_KIOCB_KI_LEFT
1869 kiocb.ki_left = count;
1870 #elif defined(HAVE_KI_NBYTES)
1871 kiocb.ki_nbytes = count;
1874 result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1875 *ppos = kiocb.ki_pos;
1879 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1882 * Send file content (through pagecache) somewhere with helper
1884 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1885 struct pipe_inode_info *pipe, size_t count,
1889 struct vvp_io_args *args;
1894 ll_ras_enter(in_file);
1896 env = cl_env_get(&refcheck);
1898 RETURN(PTR_ERR(env));
1900 args = ll_env_args(env, IO_SPLICE);
1901 args->u.splice.via_pipe = pipe;
1902 args->u.splice.via_flags = flags;
1904 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1905 cl_env_put(env, &refcheck);
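/*
 * Set the striping of a file by opening it with the supplied layout EA and
 * immediately releasing the open handle again.
 */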
1909 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1910 __u64 flags, struct lov_user_md *lum, int lum_size)
1912 struct lookup_intent oit = {
1914 .it_flags = flags | MDS_OPEN_BY_FID,
1919 ll_inode_size_lock(inode);
1920 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1922 GOTO(out_unlock, rc);
1924 ll_release_openhandle(dentry, &oit);
1927 ll_inode_size_unlock(inode);
1928 ll_intent_release(&oit);
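/*
 * Fetch the layout (LOV EA) of a file by name from the MDS and, on a
 * big-endian client, swab it into host order before it is copied to user
 * space.
 */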
1933 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1934 struct lov_mds_md **lmmp, int *lmm_size,
1935 struct ptlrpc_request **request)
1937 struct ll_sb_info *sbi = ll_i2sbi(inode);
1938 struct mdt_body *body;
1939 struct lov_mds_md *lmm = NULL;
1940 struct ptlrpc_request *req = NULL;
1941 struct md_op_data *op_data;
1944 rc = ll_get_default_mdsize(sbi, &lmmsize);
1948 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1949 strlen(filename), lmmsize,
1950 LUSTRE_OPC_ANY, NULL);
1951 if (IS_ERR(op_data))
1952 RETURN(PTR_ERR(op_data));
1954 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1955 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1956 ll_finish_md_op_data(op_data);
1958 CDEBUG(D_INFO, "md_getattr_name failed "
1959 "on %s: rc %d\n", filename, rc);
1963 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1964 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1966 lmmsize = body->mbo_eadatasize;
1968 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1970 GOTO(out, rc = -ENODATA);
1973 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1974 LASSERT(lmm != NULL);
1976 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
1977 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
1978 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
1979 GOTO(out, rc = -EPROTO);
1982 * This is coming from the MDS, so is probably in
1983 * little endian. We convert it to host endian before
1984 * passing it to userspace.
1986 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1989 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
1990 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1991 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1992 if (le32_to_cpu(lmm->lmm_pattern) &
1993 LOV_PATTERN_F_RELEASED)
1997 /* if the function is called for a directory, we should
1998 * avoid swabbing non-existent lsm objects */
1999 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
2000 lustre_swab_lov_user_md_v1(
2001 (struct lov_user_md_v1 *)lmm);
2002 if (S_ISREG(body->mbo_mode))
2003 lustre_swab_lov_user_md_objects(
2004 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
2006 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2007 lustre_swab_lov_user_md_v3(
2008 (struct lov_user_md_v3 *)lmm);
2009 if (S_ISREG(body->mbo_mode))
2010 lustre_swab_lov_user_md_objects(
2011 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
2013 } else if (lmm->lmm_magic ==
2014 cpu_to_le32(LOV_MAGIC_COMP_V1)) {
2015 lustre_swab_lov_comp_md_v1(
2016 (struct lov_comp_md_v1 *)lmm);
2022 *lmm_size = lmmsize;
2027 static int ll_lov_setea(struct inode *inode, struct file *file,
2030 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2031 struct lov_user_md *lump;
2032 int lum_size = sizeof(struct lov_user_md) +
2033 sizeof(struct lov_user_ost_data);
2037 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2040 OBD_ALLOC_LARGE(lump, lum_size);
2044 if (copy_from_user(lump, arg, lum_size))
2045 GOTO(out_lump, rc = -EFAULT);
2047 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
2049 cl_lov_delay_create_clear(&file->f_flags);
2052 OBD_FREE_LARGE(lump, lum_size);
2056 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
2063 env = cl_env_get(&refcheck);
2065 RETURN(PTR_ERR(env));
2067 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2068 cl_env_put(env, &refcheck);
2072 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2075 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2076 struct lov_user_md *klum;
2078 __u64 flags = FMODE_WRITE;
2081 rc = ll_copy_user_md(lum, &klum);
2086 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
2091 rc = put_user(0, &lum->lmm_stripe_count);
2095 rc = ll_layout_refresh(inode, &gen);
2099 rc = ll_file_getstripe(inode, arg, lum_size);
2101 cl_lov_delay_create_clear(&file->f_flags);
2104 OBD_FREE(klum, lum_size);
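/*
 * Take a Lustre group lock on the file with group id @arg (which must be
 * non-zero). For composite (PFL) layouts all components are instantiated
 * first so the group lock covers every OST object.
 *
 * A minimal user-space sketch of how this is normally driven, assuming the
 * LL_IOC_GROUP_LOCK/LL_IOC_GROUP_UNLOCK ioctls from lustre_user.h (an
 * illustration only, not taken from this file):
 *
 *	int gid = 1234;
 *	ioctl(fd, LL_IOC_GROUP_LOCK, gid);
 *	... perform I/O ...
 *	ioctl(fd, LL_IOC_GROUP_UNLOCK, gid);
 */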
2109 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2111 struct ll_inode_info *lli = ll_i2info(inode);
2112 struct cl_object *obj = lli->lli_clob;
2113 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2114 struct ll_grouplock grouplock;
2119 CWARN("group id for group lock must not be 0\n");
2123 if (ll_file_nolock(file))
2124 RETURN(-EOPNOTSUPP);
2126 spin_lock(&lli->lli_lock);
2127 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2128 CWARN("group lock already existed with gid %lu\n",
2129 fd->fd_grouplock.lg_gid);
2130 spin_unlock(&lli->lli_lock);
2133 LASSERT(fd->fd_grouplock.lg_lock == NULL);
2134 spin_unlock(&lli->lli_lock);
2137 * XXX: group lock needs to protect all OST objects while PFL
2138 * can add new OST objects during the IO, so we'd instantiate
2139 * all OST objects before getting its group lock.
2144 struct cl_layout cl = {
2145 .cl_is_composite = false,
2147 struct lu_extent ext = {
2149 .e_end = OBD_OBJECT_EOF,
2152 env = cl_env_get(&refcheck);
2154 RETURN(PTR_ERR(env));
2156 rc = cl_object_layout_get(env, obj, &cl);
2157 if (!rc && cl.cl_is_composite)
2158 rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2161 cl_env_put(env, &refcheck);
2166 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2167 arg, (file->f_flags & O_NONBLOCK), &grouplock);
2171 spin_lock(&lli->lli_lock);
2172 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2173 spin_unlock(&lli->lli_lock);
2174 CERROR("another thread just won the race\n");
2175 cl_put_grouplock(&grouplock);
2179 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2180 fd->fd_grouplock = grouplock;
2181 spin_unlock(&lli->lli_lock);
2183 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
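/*
 * Drop a previously taken group lock; the group id passed in @arg must match
 * the one the lock was taken with.
 */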
2187 static int ll_put_grouplock(struct inode *inode, struct file *file,
2190 struct ll_inode_info *lli = ll_i2info(inode);
2191 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2192 struct ll_grouplock grouplock;
2195 spin_lock(&lli->lli_lock);
2196 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2197 spin_unlock(&lli->lli_lock);
2198 CWARN("no group lock held\n");
2202 LASSERT(fd->fd_grouplock.lg_lock != NULL);
2204 if (fd->fd_grouplock.lg_gid != arg) {
2205 CWARN("group lock %lu doesn't match current id %lu\n",
2206 arg, fd->fd_grouplock.lg_gid);
2207 spin_unlock(&lli->lli_lock);
2211 grouplock = fd->fd_grouplock;
2212 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2213 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2214 spin_unlock(&lli->lli_lock);
2216 cl_put_grouplock(&grouplock);
2217 CDEBUG(D_INFO, "group lock %lu released\n", arg);
2222 * Close the inode's open handle
2224 * \param dentry [in] dentry which contains the inode
2225 * \param it [in,out] intent which contains open info and result
2227 * \retval 0 success
2228 * \retval <0 failure
2230 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2232 struct inode *inode = dentry->d_inode;
2233 struct obd_client_handle *och;
2239 /* Root ? Do nothing. */
2240 if (dentry->d_inode->i_sb->s_root == dentry)
2243 /* No open handle to close? Move away */
2244 if (!it_disposition(it, DISP_OPEN_OPEN))
2247 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2249 OBD_ALLOC(och, sizeof(*och));
2251 GOTO(out, rc = -ENOMEM);
2253 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2255 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2257 /* this one is in place of ll_file_open */
2258 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2259 ptlrpc_req_finished(it->it_request);
2260 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2266 * Get the size of the inode for which the FIEMAP mapping is requested.
2267 * Make the FIEMAP get_info call and return the result.
2268 * \param fiemap kernel buffer to hold extents
2269 * \param num_bytes kernel buffer size
2271 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2277 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2280 /* Checks for fiemap flags */
2281 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2282 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2286 /* Check for FIEMAP_FLAG_SYNC */
2287 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2288 rc = filemap_fdatawrite(inode->i_mapping);
2293 env = cl_env_get(&refcheck);
2295 RETURN(PTR_ERR(env));
2297 if (i_size_read(inode) == 0) {
2298 rc = ll_glimpse_size(inode);
2303 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2304 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2305 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2307 /* If filesize is 0, then there would be no objects for mapping */
2308 if (fmkey.lfik_oa.o_size == 0) {
2309 fiemap->fm_mapped_extents = 0;
2313 fmkey.lfik_fiemap = *fiemap;
2315 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2316 &fmkey, fiemap, &num_bytes);
2318 cl_env_put(env, &refcheck);
2322 int ll_fid2path(struct inode *inode, void __user *arg)
2324 struct obd_export *exp = ll_i2mdexp(inode);
2325 const struct getinfo_fid2path __user *gfin = arg;
2327 struct getinfo_fid2path *gfout;
2333 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2334 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2337 /* Only need to get the buflen */
2338 if (get_user(pathlen, &gfin->gf_pathlen))
2341 if (pathlen > PATH_MAX)
2344 outsize = sizeof(*gfout) + pathlen;
2345 OBD_ALLOC(gfout, outsize);
2349 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2350 GOTO(gf_free, rc = -EFAULT);
2351 /* Append the root FID after gfout to let the MDT know the root FID so
2352 * that it can look up the correct path; this is mainly for fileset.
2353 * Old servers without fileset mount support will ignore this. */
2354 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2356 /* Call mdc_iocontrol */
2357 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2361 if (copy_to_user(arg, gfout, outsize))
2365 OBD_FREE(gfout, outsize);
2370 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2372 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2380 ioc->idv_version = 0;
2381 ioc->idv_layout_version = UINT_MAX;
2383 /* If no file object has been initialized, consider its version to be 0. */
2387 env = cl_env_get(&refcheck);
2389 RETURN(PTR_ERR(env));
2391 io = vvp_env_thread_io(env);
2393 io->u.ci_data_version.dv_data_version = 0;
2394 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2395 io->u.ci_data_version.dv_flags = ioc->idv_flags;
2398 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2399 result = cl_io_loop(env, io);
2401 result = io->ci_result;
2403 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2404 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2406 cl_io_fini(env, io);
2408 if (unlikely(io->ci_need_restart))
2411 cl_env_put(env, &refcheck);
2417 * Read the data_version for the inode.
2419 * This value is computed from the stripe object versions on the OSTs.
2420 * The version is computed using server-side locking.
2422 * @param flags whether to sync on the OST side;
2424 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2425 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
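 *
 * A minimal usage sketch (illustrative only, error handling omitted):
 *
 *	__u64 dv = 0;
 *
 *	rc = ll_data_version(inode, &dv, LL_DV_RD_FLUSH);
 *	if (rc == 0)
 *		CDEBUG(D_INODE, "data version %llu\n", dv);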
2427 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2429 struct ioc_data_version ioc = { .idv_flags = flags };
2432 rc = ll_ioc_data_version(inode, &ioc);
2434 *data_version = ioc.idv_version;
2440 * Trigger a HSM release request for the provided inode.
2442 int ll_hsm_release(struct inode *inode)
2445 struct obd_client_handle *och = NULL;
2446 __u64 data_version = 0;
2451 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2452 ll_i2sbi(inode)->ll_fsname,
2453 PFID(&ll_i2info(inode)->lli_fid));
2455 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2457 GOTO(out, rc = PTR_ERR(och));
2459 /* Grab latest data_version and [am]time values */
2460 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2464 env = cl_env_get(&refcheck);
2466 GOTO(out, rc = PTR_ERR(env));
2468 rc = ll_merge_attr(env, inode);
2469 cl_env_put(env, &refcheck);
2471 /* If an error occurred, we have the wrong size for the file.
2477 /* Release the file.
2478 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2479 * we still need it to pack l_remote_handle to MDT. */
2480 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2486 if (och != NULL && !IS_ERR(och)) /* close the file */
2487 ll_lease_close(och, inode, NULL);
2492 struct ll_swap_stack {
2495 struct inode *inode1;
2496 struct inode *inode2;
2501 static int ll_swap_layouts(struct file *file1, struct file *file2,
2502 struct lustre_swap_layouts *lsl)
2504 struct mdc_swap_layouts msl;
2505 struct md_op_data *op_data;
2508 struct ll_swap_stack *llss = NULL;
2511 OBD_ALLOC_PTR(llss);
2515 llss->inode1 = file_inode(file1);
2516 llss->inode2 = file_inode(file2);
2518 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2522 /* we use 2 bools because they are easier to swap than 2 bits */
2523 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2524 llss->check_dv1 = true;
2526 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2527 llss->check_dv2 = true;
2529 /* we cannot use lsl->sl_dvX directly because we may swap them */
2530 llss->dv1 = lsl->sl_dv1;
2531 llss->dv2 = lsl->sl_dv2;
2533 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2534 if (rc == 0) /* same file, done! */
2537 if (rc < 0) { /* sequentialize it */
2538 swap(llss->inode1, llss->inode2);
2540 swap(llss->dv1, llss->dv2);
2541 swap(llss->check_dv1, llss->check_dv2);
2545 if (gid != 0) { /* application asks to flush dirty cache */
2546 rc = ll_get_grouplock(llss->inode1, file1, gid);
2550 rc = ll_get_grouplock(llss->inode2, file2, gid);
2552 ll_put_grouplock(llss->inode1, file1, gid);
2557 /* ultimate check: before swapping the layouts, check whether the
2558 * data version has changed (if requested) */
2559 if (llss->check_dv1) {
2560 rc = ll_data_version(llss->inode1, &dv, 0);
2563 if (dv != llss->dv1)
2564 GOTO(putgl, rc = -EAGAIN);
2567 if (llss->check_dv2) {
2568 rc = ll_data_version(llss->inode2, &dv, 0);
2571 if (dv != llss->dv2)
2572 GOTO(putgl, rc = -EAGAIN);
2575 /* struct md_op_data is used to send the swap args to the MDT;
2576 * only the flags are missing, so we pass struct mdc_swap_layouts
2577 * through md_op_data->op_data */
2578 /* flags from user space have to be converted before they are sent to
2579 * the server; no flags are sent today, they are only used on the client */
2582 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2583 0, LUSTRE_OPC_ANY, &msl);
2584 if (IS_ERR(op_data))
2585 GOTO(free, rc = PTR_ERR(op_data));
2587 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2588 sizeof(*op_data), op_data, NULL);
2589 ll_finish_md_op_data(op_data);
2596 ll_put_grouplock(llss->inode2, file2, gid);
2597 ll_put_grouplock(llss->inode1, file1, gid);
2607 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2609 struct obd_export *exp = ll_i2mdexp(inode);
2610 struct md_op_data *op_data;
2614 /* Detect out-of-range masks */
2615 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2618 /* Non-root users are forbidden to set or clear flags which are
2619 * NOT defined in HSM_USER_MASK. */
2620 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2621 !cfs_capable(CFS_CAP_SYS_ADMIN))
2624 if (!exp_connect_archive_id_array(exp)) {
2625 /* Detect out-of-range archive id */
2626 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2627 (hss->hss_archive_id > LL_HSM_ORIGIN_MAX_ARCHIVE))
2631 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2632 LUSTRE_OPC_ANY, hss);
2633 if (IS_ERR(op_data))
2634 RETURN(PTR_ERR(op_data));
2636 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, exp, sizeof(*op_data),
2639 ll_finish_md_op_data(op_data);
2644 static int ll_hsm_import(struct inode *inode, struct file *file,
2645 struct hsm_user_import *hui)
2647 struct hsm_state_set *hss = NULL;
2648 struct iattr *attr = NULL;
2652 if (!S_ISREG(inode->i_mode))
2658 GOTO(out, rc = -ENOMEM);
2660 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2661 hss->hss_archive_id = hui->hui_archive_id;
2662 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2663 rc = ll_hsm_state_set(inode, hss);
2667 OBD_ALLOC_PTR(attr);
2669 GOTO(out, rc = -ENOMEM);
2671 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2672 attr->ia_mode |= S_IFREG;
2673 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2674 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2675 attr->ia_size = hui->hui_size;
2676 attr->ia_mtime.tv_sec = hui->hui_mtime;
2677 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2678 attr->ia_atime.tv_sec = hui->hui_atime;
2679 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2681 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2682 ATTR_UID | ATTR_GID |
2683 ATTR_MTIME | ATTR_MTIME_SET |
2684 ATTR_ATIME | ATTR_ATIME_SET;
2688 rc = ll_setattr_raw(file_dentry(file), attr, 0, true);
2692 inode_unlock(inode);
2704 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2706 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2707 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
2710 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2712 struct inode *inode = file_inode(file);
2714 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2715 ATTR_MTIME | ATTR_MTIME_SET |
2718 .tv_sec = lfu->lfu_atime_sec,
2719 .tv_nsec = lfu->lfu_atime_nsec,
2722 .tv_sec = lfu->lfu_mtime_sec,
2723 .tv_nsec = lfu->lfu_mtime_nsec,
2726 .tv_sec = lfu->lfu_ctime_sec,
2727 .tv_nsec = lfu->lfu_ctime_nsec,
2733 if (!capable(CAP_SYS_ADMIN))
2736 if (!S_ISREG(inode->i_mode))
2740 rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET,
2742 inode_unlock(inode);
2747 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2750 case MODE_READ_USER:
2752 case MODE_WRITE_USER:
2759 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2761 /* Used to allow the upper layers of the client to request an LDLM lock
2762 * without doing an actual read or write.
2764 * Used for ladvise lockahead to manually request specific locks.
2766 * \param[in] file file this ladvise lock request is on
2767 * \param[in] ladvise ladvise struct describing this lock request
2769 * \retval 0 success, no detailed result available (sync requests
2770 * and requests sent to the server [not handled locally]
2771 * cannot return detailed results)
2772 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2773 * see definitions for details.
2774 * \retval negative negative errno on error
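 *
 * Illustrative flow (an assumption, not spelled out in this file): userspace
 * fills a struct llapi_lu_ladvise with lla_advice = LU_LADVISE_LOCKAHEAD, a
 * lockahead mode and an lla_start/lla_end byte range, and the LL_IOC_LADVISE
 * handler in ll_file_ioctl() below dispatches it here, where it becomes a
 * CIT_MISC cl_lock request.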
2776 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2778 struct lu_env *env = NULL;
2779 struct cl_io *io = NULL;
2780 struct cl_lock *lock = NULL;
2781 struct cl_lock_descr *descr = NULL;
2782 struct dentry *dentry = file->f_path.dentry;
2783 struct inode *inode = dentry->d_inode;
2784 enum cl_lock_mode cl_mode;
2785 off_t start = ladvise->lla_start;
2786 off_t end = ladvise->lla_end;
2792 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2793 "start=%llu, end=%llu\n", dentry->d_name.len,
2794 dentry->d_name.name, dentry->d_inode,
2795 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2798 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2800 GOTO(out, result = cl_mode);
2802 /* Get IO environment */
2803 result = cl_io_get(inode, &env, &io, &refcheck);
2807 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2810 * nothing to do for this io. This currently happens when the
2811 * stripe sub-objects are not yet created.
2813 result = io->ci_result;
2814 } else if (result == 0) {
2815 lock = vvp_env_lock(env);
2816 descr = &lock->cll_descr;
2818 descr->cld_obj = io->ci_obj;
2819 /* Convert byte offsets to pages */
2820 descr->cld_start = cl_index(io->ci_obj, start);
2821 descr->cld_end = cl_index(io->ci_obj, end);
2822 descr->cld_mode = cl_mode;
2823 /* CEF_MUST is used because we do not want to convert a
2824 * lockahead request to a lockless lock */
2825 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2828 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2829 descr->cld_enq_flags |= CEF_SPECULATIVE;
2831 result = cl_lock_request(env, io, lock);
2833 /* On success, we need to release the lock */
2835 cl_lock_release(env, lock);
2837 cl_io_fini(env, io);
2838 cl_env_put(env, &refcheck);
2840 /* -ECANCELED indicates a matching lock with a different extent
2841 * was already present, and -EEXIST indicates a matching lock
2842 * on exactly the same extent was already present.
2843 * We convert them to positive values for userspace to make
2844 * recognizing true errors easier.
2845 * Note we can only return these detailed results on async requests,
2846 * as sync requests look the same as i/o requests for locking. */
2847 if (result == -ECANCELED)
2848 result = LLA_RESULT_DIFFERENT;
2849 else if (result == -EEXIST)
2850 result = LLA_RESULT_SAME;
2855 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
2857 static int ll_ladvise_sanity(struct inode *inode,
2858 struct llapi_lu_ladvise *ladvise)
2860 struct ll_sb_info *sbi = ll_i2sbi(inode);
2861 enum lu_ladvise_type advice = ladvise->lla_advice;
2862 /* Note lla_peradvice_flags is a 32-bit field, so per-advice flags must
2863 * be in the first 32 bits of enum ladvise_flags */
2864 __u32 flags = ladvise->lla_peradvice_flags;
2865 /* 3 lines at 80 characters per line, should be plenty */
2868 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2870 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized, "
2871 "last supported advice is %s (value '%d'): rc = %d\n",
2872 sbi->ll_fsname, advice,
2873 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2877 /* Per-advice checks */
2879 case LU_LADVISE_LOCKNOEXPAND:
2880 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2882 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2883 "rc = %d\n", sbi->ll_fsname, flags,
2884 ladvise_names[advice], rc);
2888 case LU_LADVISE_LOCKAHEAD:
2889 /* Currently only READ and WRITE modes can be requested */
2890 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2891 ladvise->lla_lockahead_mode == 0) {
2893 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2894 "rc = %d\n", sbi->ll_fsname,
2895 ladvise->lla_lockahead_mode,
2896 ladvise_names[advice], rc);
2899 case LU_LADVISE_WILLREAD:
2900 case LU_LADVISE_DONTNEED:
2902 /* Note fall through above - These checks apply to all advices
2903 * except LOCKNOEXPAND */
2904 if (flags & ~LF_DEFAULT_MASK) {
2906 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2907 "rc = %d\n", sbi->ll_fsname, flags,
2908 ladvise_names[advice], rc);
2911 if (ladvise->lla_start >= ladvise->lla_end) {
2913 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2914 "for %s: rc = %d\n", sbi->ll_fsname,
2915 ladvise->lla_start, ladvise->lla_end,
2916 ladvise_names[advice], rc);
2928 * Give file access advices
2930 * The ladvise interface is similar to the Linux fadvise() system call,
2931 * except it forwards the advice directly from the Lustre client to the
2932 * server. The server-side code will apply appropriate read-ahead and
2933 * caching techniques for the corresponding files.
2935 * A typical workload for ladvise is, e.g., many different clients doing
2936 * small random reads of a file, so prefetching pages into the OSS cache
2937 * with big linear reads before the random IO is a net benefit. Fetching
2938 * all that data into each client's cache with fadvise() may not be, due
2939 * to much more data being sent to the clients.
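 *
 * An illustrative userspace sketch (an assumption, not part of this file):
 * allocate a struct llapi_ladvise_hdr with room for lah_count advices, set
 * lah_magic = LADVISE_MAGIC and lah_count, fill each advice's
 * lla_advice/lla_start/lla_end, and issue ioctl(fd, LL_IOC_LADVISE, hdr);
 * the LL_IOC_LADVISE handler below then validates each advice with
 * ll_ladvise_sanity() and applies it via ll_ladvise().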
2941 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2942 struct llapi_lu_ladvise *ladvise)
2946 struct cl_ladvise_io *lio;
2951 env = cl_env_get(&refcheck);
2953 RETURN(PTR_ERR(env));
2955 io = vvp_env_thread_io(env);
2956 io->ci_obj = ll_i2info(inode)->lli_clob;
2958 /* initialize parameters for ladvise */
2959 lio = &io->u.ci_ladvise;
2960 lio->li_start = ladvise->lla_start;
2961 lio->li_end = ladvise->lla_end;
2962 lio->li_fid = ll_inode2fid(inode);
2963 lio->li_advice = ladvise->lla_advice;
2964 lio->li_flags = flags;
2966 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2967 rc = cl_io_loop(env, io);
2971 cl_io_fini(env, io);
2972 cl_env_put(env, &refcheck);
2976 static int ll_lock_noexpand(struct file *file, int flags)
2978 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2980 fd->ll_lock_no_expand = !(flags & LF_UNSET);
2985 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
2988 struct fsxattr fsxattr;
2990 if (copy_from_user(&fsxattr,
2991 (const struct fsxattr __user *)arg,
2995 fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
2996 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
2997 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
2998 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
2999 if (copy_to_user((struct fsxattr __user *)arg,
3000 &fsxattr, sizeof(fsxattr)))
3006 int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
3009 * Project Quota ID state is only allowed to change from within the init
3010 * namespace. Enforce that restriction only if we are trying to change
3011 * the quota ID state. Everything else is allowed in user namespaces.
3013 if (current_user_ns() == &init_user_ns)
3016 if (ll_i2info(inode)->lli_projid != fa->fsx_projid)
3019 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) {
3020 if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
3023 if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
3030 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3034 struct md_op_data *op_data;
3035 struct ptlrpc_request *req = NULL;
3037 struct fsxattr fsxattr;
3038 struct cl_object *obj;
3042 if (copy_from_user(&fsxattr,
3043 (const struct fsxattr __user *)arg,
3047 rc = ll_ioctl_check_project(inode, &fsxattr);
3051 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3052 LUSTRE_OPC_ANY, NULL);
3053 if (IS_ERR(op_data))
3054 RETURN(PTR_ERR(op_data));
3056 flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3057 op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3058 if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3059 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3060 op_data->op_projid = fsxattr.fsx_projid;
3061 op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS;
3062 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3064 ptlrpc_req_finished(req);
3066 GOTO(out_fsxattr, rc);
3067 ll_update_inode_flags(inode, op_data->op_attr_flags);
3068 obj = ll_i2info(inode)->lli_clob;
3070 GOTO(out_fsxattr, rc);
3072 OBD_ALLOC_PTR(attr);
3074 GOTO(out_fsxattr, rc = -ENOMEM);
3076 rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS,
3077 fsxattr.fsx_xflags);
3080 ll_finish_md_op_data(op_data);
3084 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3087 struct inode *inode = file_inode(file);
3088 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3089 struct ll_inode_info *lli = ll_i2info(inode);
3090 struct obd_client_handle *och = NULL;
3091 struct split_param sp;
3094 enum mds_op_bias bias = 0;
3095 struct file *layout_file = NULL;
3097 size_t data_size = 0;
3101 mutex_lock(&lli->lli_och_mutex);
3102 if (fd->fd_lease_och != NULL) {
3103 och = fd->fd_lease_och;
3104 fd->fd_lease_och = NULL;
3106 mutex_unlock(&lli->lli_och_mutex);
3109 GOTO(out, rc = -ENOLCK);
3111 fmode = och->och_flags;
3113 switch (ioc->lil_flags) {
3114 case LL_LEASE_RESYNC_DONE:
3115 if (ioc->lil_count > IOC_IDS_MAX)
3116 GOTO(out, rc = -EINVAL);
3118 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3119 OBD_ALLOC(data, data_size);
3121 GOTO(out, rc = -ENOMEM);
3123 if (copy_from_user(data, (void __user *)arg, data_size))
3124 GOTO(out, rc = -EFAULT);
3126 bias = MDS_CLOSE_RESYNC_DONE;
3128 case LL_LEASE_LAYOUT_MERGE: {
3131 if (ioc->lil_count != 1)
3132 GOTO(out, rc = -EINVAL);
3134 arg += sizeof(*ioc);
3135 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3136 GOTO(out, rc = -EFAULT);
3138 layout_file = fget(fd);
3140 GOTO(out, rc = -EBADF);
3142 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3143 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3144 GOTO(out, rc = -EPERM);
3146 data = file_inode(layout_file);
3147 bias = MDS_CLOSE_LAYOUT_MERGE;
3150 case LL_LEASE_LAYOUT_SPLIT: {
3154 if (ioc->lil_count != 2)
3155 GOTO(out, rc = -EINVAL);
3157 arg += sizeof(*ioc);
3158 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3159 GOTO(out, rc = -EFAULT);
3161 arg += sizeof(__u32);
3162 if (copy_from_user(&mirror_id, (void __user *)arg,
3164 GOTO(out, rc = -EFAULT);
3166 layout_file = fget(fdv);
3168 GOTO(out, rc = -EBADF);
3170 sp.sp_inode = file_inode(layout_file);
3171 sp.sp_mirror_id = (__u16)mirror_id;
3173 bias = MDS_CLOSE_LAYOUT_SPLIT;
3177 /* without close intent */
3181 rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3185 rc = ll_lease_och_release(inode, file);
3194 switch (ioc->lil_flags) {
3195 case LL_LEASE_RESYNC_DONE:
3197 OBD_FREE(data, data_size);
3199 case LL_LEASE_LAYOUT_MERGE:
3200 case LL_LEASE_LAYOUT_SPLIT:
3207 rc = ll_lease_type_from_fmode(fmode);
3211 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3214 struct inode *inode = file_inode(file);
3215 struct ll_inode_info *lli = ll_i2info(inode);
3216 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3217 struct obd_client_handle *och = NULL;
3218 __u64 open_flags = 0;
3224 switch (ioc->lil_mode) {
3225 case LL_LEASE_WRLCK:
3226 if (!(file->f_mode & FMODE_WRITE))
3228 fmode = FMODE_WRITE;
3230 case LL_LEASE_RDLCK:
3231 if (!(file->f_mode & FMODE_READ))
3235 case LL_LEASE_UNLCK:
3236 RETURN(ll_file_unlock_lease(file, ioc, arg));
3241 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3243 /* apply for lease */
3244 if (ioc->lil_flags & LL_LEASE_RESYNC)
3245 open_flags = MDS_OPEN_RESYNC;
3246 och = ll_lease_open(inode, file, fmode, open_flags);
3248 RETURN(PTR_ERR(och));
3250 if (ioc->lil_flags & LL_LEASE_RESYNC) {
3251 rc = ll_lease_file_resync(och, inode, arg);
3253 ll_lease_close(och, inode, NULL);
3256 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3258 ll_lease_close(och, inode, NULL);
3264 mutex_lock(&lli->lli_och_mutex);
3265 if (fd->fd_lease_och == NULL) {
3266 fd->fd_lease_och = och;
3269 mutex_unlock(&lli->lli_och_mutex);
3271 /* should not happen, since only exclusive leases are supported for now */
3272 ll_lease_close(och, inode, &lease_broken);
3278 static void ll_heat_get(struct inode *inode, struct lu_heat *heat)
3280 struct ll_inode_info *lli = ll_i2info(inode);
3281 struct ll_sb_info *sbi = ll_i2sbi(inode);
3282 __u64 now = ktime_get_real_seconds();
3285 spin_lock(&lli->lli_heat_lock);
3286 heat->lh_flags = lli->lli_heat_flags;
3287 for (i = 0; i < heat->lh_count; i++)
3288 heat->lh_heat[i] = obd_heat_get(&lli->lli_heat_instances[i],
3289 now, sbi->ll_heat_decay_weight,
3290 sbi->ll_heat_period_second);
3291 spin_unlock(&lli->lli_heat_lock);
3294 static int ll_heat_set(struct inode *inode, __u64 flags)
3296 struct ll_inode_info *lli = ll_i2info(inode);
3299 spin_lock(&lli->lli_heat_lock);
3300 if (flags & LU_HEAT_FLAG_CLEAR)
3301 obd_heat_clear(lli->lli_heat_instances, OBD_HEAT_COUNT);
3303 if (flags & LU_HEAT_FLAG_OFF)
3304 lli->lli_heat_flags |= LU_HEAT_FLAG_OFF;
3306 lli->lli_heat_flags &= ~LU_HEAT_FLAG_OFF;
3308 spin_unlock(&lli->lli_heat_lock);
3314 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3316 struct inode *inode = file_inode(file);
3317 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3321 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3322 PFID(ll_inode2fid(inode)), inode, cmd);
3323 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3325 /* asm-ppc{,64} declares TCGETS, et al. as type 't' not 'T' */
3326 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3330 case LL_IOC_GETFLAGS:
3331 /* Get the current value of the file flags */
3332 return put_user(fd->fd_flags, (int __user *)arg);
3333 case LL_IOC_SETFLAGS:
3334 case LL_IOC_CLRFLAGS:
3335 /* Set or clear specific file flags */
3336 /* XXX This probably needs checks to ensure the flags are
3337 * not abused, and to handle any flag side effects.
3339 if (get_user(flags, (int __user *) arg))
3342 if (cmd == LL_IOC_SETFLAGS) {
3343 if ((flags & LL_FILE_IGNORE_LOCK) &&
3344 !(file->f_flags & O_DIRECT)) {
3345 CERROR("%s: unable to disable locking on "
3346 "non-O_DIRECT file\n", current->comm);
3350 fd->fd_flags |= flags;
3352 fd->fd_flags &= ~flags;
3355 case LL_IOC_LOV_SETSTRIPE:
3356 case LL_IOC_LOV_SETSTRIPE_NEW:
3357 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3358 case LL_IOC_LOV_SETEA:
3359 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3360 case LL_IOC_LOV_SWAP_LAYOUTS: {
3362 struct lustre_swap_layouts lsl;
3364 if (copy_from_user(&lsl, (char __user *)arg,
3365 sizeof(struct lustre_swap_layouts)))
3368 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3371 file2 = fget(lsl.sl_fd);
3375 /* O_WRONLY or O_RDWR */
3376 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3377 GOTO(out, rc = -EPERM);
3379 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3380 struct inode *inode2;
3381 struct ll_inode_info *lli;
3382 struct obd_client_handle *och = NULL;
3384 lli = ll_i2info(inode);
3385 mutex_lock(&lli->lli_och_mutex);
3386 if (fd->fd_lease_och != NULL) {
3387 och = fd->fd_lease_och;
3388 fd->fd_lease_och = NULL;
3390 mutex_unlock(&lli->lli_och_mutex);
3392 GOTO(out, rc = -ENOLCK);
3393 inode2 = file_inode(file2);
3394 rc = ll_swap_layouts_close(och, inode, inode2);
3396 rc = ll_swap_layouts(file, file2, &lsl);
3402 case LL_IOC_LOV_GETSTRIPE:
3403 case LL_IOC_LOV_GETSTRIPE_NEW:
3404 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3405 case FS_IOC_GETFLAGS:
3406 case FS_IOC_SETFLAGS:
3407 RETURN(ll_iocontrol(inode, file, cmd, arg));
3408 case FSFILT_IOC_GETVERSION:
3409 case FS_IOC_GETVERSION:
3410 RETURN(put_user(inode->i_generation, (int __user *)arg));
3411 /* We need to special case any other ioctls we want to handle,
3412 * to send them to the MDS/OST as appropriate and to properly
3413 * network encode the arg field. */
3414 case FS_IOC_SETVERSION:
3417 case LL_IOC_GROUP_LOCK:
3418 RETURN(ll_get_grouplock(inode, file, arg));
3419 case LL_IOC_GROUP_UNLOCK:
3420 RETURN(ll_put_grouplock(inode, file, arg));
3421 case IOC_OBD_STATFS:
3422 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3424 case LL_IOC_FLUSHCTX:
3425 RETURN(ll_flush_ctx(inode));
3426 case LL_IOC_PATH2FID: {
3427 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3428 sizeof(struct lu_fid)))
3433 case LL_IOC_GETPARENT:
3434 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3436 case OBD_IOC_FID2PATH:
3437 RETURN(ll_fid2path(inode, (void __user *)arg));
3438 case LL_IOC_DATA_VERSION: {
3439 struct ioc_data_version idv;
3442 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
3445 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3446 rc = ll_ioc_data_version(inode, &idv);
3449 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3455 case LL_IOC_GET_MDTIDX: {
3458 mdtidx = ll_get_mdt_idx(inode);
3462 if (put_user((int)mdtidx, (int __user *)arg))
3467 case OBD_IOC_GETDTNAME:
3468 case OBD_IOC_GETMDNAME:
3469 RETURN(ll_get_obd_name(inode, cmd, arg));
3470 case LL_IOC_HSM_STATE_GET: {
3471 struct md_op_data *op_data;
3472 struct hsm_user_state *hus;
3479 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3480 LUSTRE_OPC_ANY, hus);
3481 if (IS_ERR(op_data)) {
3483 RETURN(PTR_ERR(op_data));
3486 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3489 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3492 ll_finish_md_op_data(op_data);
3496 case LL_IOC_HSM_STATE_SET: {
3497 struct hsm_state_set *hss;
3504 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3509 rc = ll_hsm_state_set(inode, hss);
3514 case LL_IOC_HSM_ACTION: {
3515 struct md_op_data *op_data;
3516 struct hsm_current_action *hca;
3523 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3524 LUSTRE_OPC_ANY, hca);
3525 if (IS_ERR(op_data)) {
3527 RETURN(PTR_ERR(op_data));
3530 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3533 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3536 ll_finish_md_op_data(op_data);
3540 case LL_IOC_SET_LEASE_OLD: {
3541 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3543 RETURN(ll_file_set_lease(file, &ioc, 0));
3545 case LL_IOC_SET_LEASE: {
3546 struct ll_ioc_lease ioc;
3548 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3551 RETURN(ll_file_set_lease(file, &ioc, arg));
3553 case LL_IOC_GET_LEASE: {
3554 struct ll_inode_info *lli = ll_i2info(inode);
3555 struct ldlm_lock *lock = NULL;
3558 mutex_lock(&lli->lli_och_mutex);
3559 if (fd->fd_lease_och != NULL) {
3560 struct obd_client_handle *och = fd->fd_lease_och;
3562 lock = ldlm_handle2lock(&och->och_lease_handle);
3564 lock_res_and_lock(lock);
3565 if (!ldlm_is_cancel(lock))
3566 fmode = och->och_flags;
3568 unlock_res_and_lock(lock);
3569 LDLM_LOCK_PUT(lock);
3572 mutex_unlock(&lli->lli_och_mutex);
3574 RETURN(ll_lease_type_from_fmode(fmode));
3576 case LL_IOC_HSM_IMPORT: {
3577 struct hsm_user_import *hui;
3583 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3588 rc = ll_hsm_import(inode, file, hui);
3593 case LL_IOC_FUTIMES_3: {
3594 struct ll_futimes_3 lfu;
3596 if (copy_from_user(&lfu,
3597 (const struct ll_futimes_3 __user *)arg,
3601 RETURN(ll_file_futimes_3(file, &lfu));
3603 case LL_IOC_LADVISE: {
3604 struct llapi_ladvise_hdr *k_ladvise_hdr;
3605 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3608 int alloc_size = sizeof(*k_ladvise_hdr);
3611 u_ladvise_hdr = (void __user *)arg;
3612 OBD_ALLOC_PTR(k_ladvise_hdr);
3613 if (k_ladvise_hdr == NULL)
3616 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3617 GOTO(out_ladvise, rc = -EFAULT);
3619 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3620 k_ladvise_hdr->lah_count < 1)
3621 GOTO(out_ladvise, rc = -EINVAL);
3623 num_advise = k_ladvise_hdr->lah_count;
3624 if (num_advise >= LAH_COUNT_MAX)
3625 GOTO(out_ladvise, rc = -EFBIG);
3627 OBD_FREE_PTR(k_ladvise_hdr);
3628 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3629 lah_advise[num_advise]);
3630 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3631 if (k_ladvise_hdr == NULL)
3635 * TODO: submit multiple advices to one server in a single RPC
3637 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3638 GOTO(out_ladvise, rc = -EFAULT);
3640 for (i = 0; i < num_advise; i++) {
3641 struct llapi_lu_ladvise *k_ladvise =
3642 &k_ladvise_hdr->lah_advise[i];
3643 struct llapi_lu_ladvise __user *u_ladvise =
3644 &u_ladvise_hdr->lah_advise[i];
3646 rc = ll_ladvise_sanity(inode, k_ladvise);
3648 GOTO(out_ladvise, rc);
3650 switch (k_ladvise->lla_advice) {
3651 case LU_LADVISE_LOCKNOEXPAND:
3652 rc = ll_lock_noexpand(file,
3653 k_ladvise->lla_peradvice_flags);
3654 GOTO(out_ladvise, rc);
3655 case LU_LADVISE_LOCKAHEAD:
3657 rc = ll_file_lock_ahead(file, k_ladvise);
3660 GOTO(out_ladvise, rc);
3663 &u_ladvise->lla_lockahead_result))
3664 GOTO(out_ladvise, rc = -EFAULT);
3667 rc = ll_ladvise(inode, file,
3668 k_ladvise_hdr->lah_flags,
3671 GOTO(out_ladvise, rc);
3678 OBD_FREE(k_ladvise_hdr, alloc_size);
3681 case LL_IOC_FLR_SET_MIRROR: {
3682 /* mirror I/O must be direct to avoid polluting page cache
3684 if (!(file->f_flags & O_DIRECT))
3687 fd->fd_designated_mirror = (__u32)arg;
3690 case LL_IOC_FSGETXATTR:
3691 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3692 case LL_IOC_FSSETXATTR:
3693 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3695 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3696 case LL_IOC_HEAT_GET: {
3697 struct lu_heat uheat;
3698 struct lu_heat *heat;
3701 if (copy_from_user(&uheat, (void __user *)arg, sizeof(uheat)))
3704 if (uheat.lh_count > OBD_HEAT_COUNT)
3705 uheat.lh_count = OBD_HEAT_COUNT;
3707 size = offsetof(typeof(uheat), lh_heat[uheat.lh_count]);
3708 OBD_ALLOC(heat, size);
3712 heat->lh_count = uheat.lh_count;
3713 ll_heat_get(inode, heat);
3714 rc = copy_to_user((char __user *)arg, heat, size);
3715 OBD_FREE(heat, size);
3716 RETURN(rc ? -EFAULT : 0);
3718 case LL_IOC_HEAT_SET: {
3721 if (copy_from_user(&flags, (void __user *)arg, sizeof(flags)))
3724 rc = ll_heat_set(inode, flags);
3728 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3729 (void __user *)arg));
3733 #ifndef HAVE_FILE_LLSEEK_SIZE
3734 static inline loff_t
3735 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3737 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3739 if (offset > maxsize)
3742 if (offset != file->f_pos) {
3743 file->f_pos = offset;
3744 file->f_version = 0;
3750 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3751 loff_t maxsize, loff_t eof)
3753 struct inode *inode = file_inode(file);
3761 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3762 * position-querying operation. Avoid rewriting the "same"
3763 * f_pos value back to the file because a concurrent read(),
3764 * write() or lseek() might have altered it
3769 * f_lock protects against read/modify/write race with other
3770 * SEEK_CURs. Note that parallel writes and reads behave
3774 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3775 inode_unlock(inode);
3779 * In the generic case the entire file is data, so as long as
3780 * offset isn't at the end of the file then the offset is data.
3787 * There is a virtual hole at the end of the file, so as long as
3788 * offset isn't i_size or larger, return i_size.
3796 return llseek_execute(file, offset, maxsize);
3800 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3802 struct inode *inode = file_inode(file);
3803 loff_t retval, eof = 0;
3806 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3807 (origin == SEEK_CUR) ? file->f_pos : 0);
3808 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3809 PFID(ll_inode2fid(inode)), inode, retval, retval,
3811 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3813 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3814 retval = ll_glimpse_size(inode);
3817 eof = i_size_read(inode);
3820 retval = ll_generic_file_llseek_size(file, offset, origin,
3821 ll_file_maxbytes(inode), eof);
3825 static int ll_flush(struct file *file, fl_owner_t id)
3827 struct inode *inode = file_inode(file);
3828 struct ll_inode_info *lli = ll_i2info(inode);
3829 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3832 LASSERT(!S_ISDIR(inode->i_mode));
3834 /* catch async errors that were recorded back when async writeback
3835 * failed for pages in this mapping. */
3836 rc = lli->lli_async_rc;
3837 lli->lli_async_rc = 0;
3838 if (lli->lli_clob != NULL) {
3839 err = lov_read_and_clear_async_rc(lli->lli_clob);
3844 /* The application has already been told about the write failure.
3845 * Do not report it again. */
3846 if (fd->fd_write_failed)
3848 return rc ? -EIO : 0;
3852 * Called to make sure a portion of the file has been written out.
3853 * If @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to the OSTs.
3855 * Return how many pages have been written.
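 *
 * For example, ll_fsync() below calls
 * cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0) to flush and wait
 * on the whole requested range.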
3857 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3858 enum cl_fsync_mode mode, int ignore_layout)
3862 struct cl_fsync_io *fio;
3867 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3868 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3871 env = cl_env_get(&refcheck);
3873 RETURN(PTR_ERR(env));
3875 io = vvp_env_thread_io(env);
3876 io->ci_obj = ll_i2info(inode)->lli_clob;
3877 io->ci_ignore_layout = ignore_layout;
3879 /* initialize parameters for sync */
3880 fio = &io->u.ci_fsync;
3881 fio->fi_start = start;
3883 fio->fi_fid = ll_inode2fid(inode);
3884 fio->fi_mode = mode;
3885 fio->fi_nr_written = 0;
3887 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3888 result = cl_io_loop(env, io);
3890 result = io->ci_result;
3892 result = fio->fi_nr_written;
3893 cl_io_fini(env, io);
3894 cl_env_put(env, &refcheck);
3900 * When dentry is provided (the 'else' case), file_dentry() may be
3901 * null and dentry must be used directly rather than pulled from
3902 * file_dentry() as is done otherwise.
3905 #ifdef HAVE_FILE_FSYNC_4ARGS
3906 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3908 struct dentry *dentry = file_dentry(file);
3909 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3910 int ll_fsync(struct file *file, int datasync)
3912 struct dentry *dentry = file_dentry(file);
3914 loff_t end = LLONG_MAX;
3916 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3919 loff_t end = LLONG_MAX;
3921 struct inode *inode = dentry->d_inode;
3922 struct ll_inode_info *lli = ll_i2info(inode);
3923 struct ptlrpc_request *req;
3927 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3928 PFID(ll_inode2fid(inode)), inode);
3929 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3931 #ifdef HAVE_FILE_FSYNC_4ARGS
3932 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3935 /* fsync's caller has already called _fdata{sync,write}, we want
3936 * that IO to finish before calling the osc and mdc sync methods */
3937 rc = filemap_fdatawait(inode->i_mapping);
3940 /* catch async errors that were recorded back when async writeback
3941 * failed for pages in this mapping. */
3942 if (!S_ISDIR(inode->i_mode)) {
3943 err = lli->lli_async_rc;
3944 lli->lli_async_rc = 0;
3947 if (lli->lli_clob != NULL) {
3948 err = lov_read_and_clear_async_rc(lli->lli_clob);
3954 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3958 ptlrpc_req_finished(req);
3960 if (S_ISREG(inode->i_mode)) {
3961 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3963 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3964 if (rc == 0 && err < 0)
3967 fd->fd_write_failed = true;
3969 fd->fd_write_failed = false;
3972 #ifdef HAVE_FILE_FSYNC_4ARGS
3973 inode_unlock(inode);
3979 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3981 struct inode *inode = file_inode(file);
3982 struct ll_sb_info *sbi = ll_i2sbi(inode);
3983 struct ldlm_enqueue_info einfo = {
3984 .ei_type = LDLM_FLOCK,
3985 .ei_cb_cp = ldlm_flock_completion_ast,
3986 .ei_cbdata = file_lock,
3988 struct md_op_data *op_data;
3989 struct lustre_handle lockh = { 0 };
3990 union ldlm_policy_data flock = { { 0 } };
3991 int fl_type = file_lock->fl_type;
3997 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3998 PFID(ll_inode2fid(inode)), file_lock);
4000 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
4002 if (file_lock->fl_flags & FL_FLOCK) {
4003 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
4004 /* flocks are whole-file locks */
4005 flock.l_flock.end = OFFSET_MAX;
4006 /* For flocks the owner is determined by the local file descriptor */
4007 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
4008 } else if (file_lock->fl_flags & FL_POSIX) {
4009 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
4010 flock.l_flock.start = file_lock->fl_start;
4011 flock.l_flock.end = file_lock->fl_end;
4015 flock.l_flock.pid = file_lock->fl_pid;
4017 /* Somewhat ugly workaround for svc lockd.
4018 * lockd installs a custom fl_lmops->lm_compare_owner that checks
4019 * that the fl_owner is the same (which it always is between lockd
4020 * processes on the local node) and then compares pids.
4021 * As such we assign the pid to the owner field to make it all work;
4022 * a conflict with normal locks is unlikely since the pid space and
4023 * the pointer space for current->files do not intersect */
4024 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
4025 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
4029 einfo.ei_mode = LCK_PR;
4032 /* An unlock request may or may not have any relation to
4033 * existing locks so we may not be able to pass a lock handle
4034 * via a normal ldlm_lock_cancel() request. The request may even
4035 * unlock a byte range in the middle of an existing lock. In
4036 * order to process an unlock request we need all of the same
4037 * information that is given with a normal read or write record
4038 * lock request. To avoid creating another ldlm unlock (cancel)
4039 * message we'll treat a LCK_NL flock request as an unlock. */
4040 einfo.ei_mode = LCK_NL;
4043 einfo.ei_mode = LCK_PW;
4046 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
4061 flags = LDLM_FL_BLOCK_NOWAIT;
4067 flags = LDLM_FL_TEST_LOCK;
4070 CERROR("unknown fcntl lock command: %d\n", cmd);
4074 /* Save the old mode so that if the mode in the lock changes we
4075 * can decrement the appropriate reader or writer refcount. */
4076 file_lock->fl_type = einfo.ei_mode;
4078 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4079 LUSTRE_OPC_ANY, NULL);
4080 if (IS_ERR(op_data))
4081 RETURN(PTR_ERR(op_data));
4083 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4084 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4085 flock.l_flock.pid, flags, einfo.ei_mode,
4086 flock.l_flock.start, flock.l_flock.end);
4088 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4091 /* Restore the file lock type if not TEST lock. */
4092 if (!(flags & LDLM_FL_TEST_LOCK))
4093 file_lock->fl_type = fl_type;
4095 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
4096 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4097 !(flags & LDLM_FL_TEST_LOCK))
4098 rc2 = locks_lock_file_wait(file, file_lock);
4100 if ((file_lock->fl_flags & FL_FLOCK) &&
4101 (rc == 0 || file_lock->fl_type == F_UNLCK))
4102 rc2 = flock_lock_file_wait(file, file_lock);
4103 if ((file_lock->fl_flags & FL_POSIX) &&
4104 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4105 !(flags & LDLM_FL_TEST_LOCK))
4106 rc2 = posix_lock_file_wait(file, file_lock);
4107 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
4109 if (rc2 && file_lock->fl_type != F_UNLCK) {
4110 einfo.ei_mode = LCK_NL;
4111 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4116 ll_finish_md_op_data(op_data);
4121 int ll_get_fid_by_name(struct inode *parent, const char *name,
4122 int namelen, struct lu_fid *fid,
4123 struct inode **inode)
4125 struct md_op_data *op_data = NULL;
4126 struct mdt_body *body;
4127 struct ptlrpc_request *req;
4131 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4132 LUSTRE_OPC_ANY, NULL);
4133 if (IS_ERR(op_data))
4134 RETURN(PTR_ERR(op_data));
4136 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4137 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4138 ll_finish_md_op_data(op_data);
4142 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4144 GOTO(out_req, rc = -EFAULT);
4146 *fid = body->mbo_fid1;
4149 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4151 ptlrpc_req_finished(req);
4155 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4158 struct dentry *dchild = NULL;
4159 struct inode *child_inode = NULL;
4160 struct md_op_data *op_data;
4161 struct ptlrpc_request *request = NULL;
4162 struct obd_client_handle *och = NULL;
4164 struct mdt_body *body;
4165 __u64 data_version = 0;
4166 size_t namelen = strlen(name);
4167 int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4171 CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4172 PFID(ll_inode2fid(parent)), name,
4173 lum->lum_stripe_offset, lum->lum_stripe_count);
4175 if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4176 lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4177 lustre_swab_lmv_user_md(lum);
4179 /* Get child FID first */
4180 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4183 dchild = d_lookup(file_dentry(file), &qstr);
4185 if (dchild->d_inode)
4186 child_inode = igrab(dchild->d_inode);
4191 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
4200 if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
4201 OBD_CONNECT2_DIR_MIGRATE)) {
4202 if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
4203 ll_i2info(child_inode)->lli_lsm_md) {
4204 CERROR("%s: MDT doesn't support stripe directory "
4205 "migration!\n", ll_i2sbi(parent)->ll_fsname);
4206 GOTO(out_iput, rc = -EOPNOTSUPP);
4211 * lfs migrate command needs to be blocked on the client
4212 * by checking the migrate FID against the FID of the
4215 if (child_inode == parent->i_sb->s_root->d_inode)
4216 GOTO(out_iput, rc = -EINVAL);
4218 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4219 child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4220 if (IS_ERR(op_data))
4221 GOTO(out_iput, rc = PTR_ERR(op_data));
4223 inode_lock(child_inode);
4224 op_data->op_fid3 = *ll_inode2fid(child_inode);
4225 if (!fid_is_sane(&op_data->op_fid3)) {
4226 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4227 ll_i2sbi(parent)->ll_fsname, name,
4228 PFID(&op_data->op_fid3));
4229 GOTO(out_unlock, rc = -EINVAL);
4232 op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4233 op_data->op_data = lum;
4234 op_data->op_data_size = lumlen;
4237 if (S_ISREG(child_inode->i_mode)) {
4238 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4242 GOTO(out_unlock, rc);
4245 rc = ll_data_version(child_inode, &data_version,
4248 GOTO(out_close, rc);
4250 op_data->op_open_handle = och->och_open_handle;
4251 op_data->op_data_version = data_version;
4252 op_data->op_lease_handle = och->och_lease_handle;
4253 op_data->op_bias |= MDS_CLOSE_MIGRATE;
4255 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4256 och->och_mod->mod_open_req->rq_replay = 0;
4257 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
4260 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4261 name, namelen, &request);
4263 LASSERT(request != NULL);
4264 ll_update_times(request, parent);
4267 if (rc == 0 || rc == -EAGAIN) {
4268 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4269 LASSERT(body != NULL);
4271 /* If the server did release the layout lock, then we clean up
4272 * the client och here; otherwise release it in out_close: */
4273 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4274 obd_mod_put(och->och_mod);
4275 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4277 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
4283 if (request != NULL) {
4284 ptlrpc_req_finished(request);
4288 /* Try again if the lease has been cancelled. */
4289 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4294 ll_lease_close(och, child_inode, NULL);
4296 clear_nlink(child_inode);
4298 inode_unlock(child_inode);
4299 ll_finish_md_op_data(op_data);
4306 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4314 * Test whether locks matching *bits and l_req_mode are acquired:
4315 * - the bits can be spread across different locks
4316 * - if found, clear the common lock bits in *bits
4317 * - the bits not found are kept in *bits
4319 * \param bits [IN] searched lock bits
4320 * \param l_req_mode [IN] searched lock mode
4321 * \retval boolean, true iff all bits are found
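 *
 * Illustrative use (hypothetical caller): pass a mask such as
 * *bits = MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE with LCK_MINMODE;
 * on return, any bit still set in *bits is not covered by a granted lock.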
4323 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4325 struct lustre_handle lockh;
4326 union ldlm_policy_data policy;
4327 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4328 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4337 fid = &ll_i2info(inode)->lli_fid;
4338 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4339 ldlm_lockname[mode]);
4341 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4342 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4343 policy.l_inodebits.bits = *bits & (1 << i);
4344 if (policy.l_inodebits.bits == 0)
4347 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4348 &policy, mode, &lockh)) {
4349 struct ldlm_lock *lock;
4351 lock = ldlm_handle2lock(&lockh);
4354 ~(lock->l_policy_data.l_inodebits.bits);
4355 LDLM_LOCK_PUT(lock);
4357 *bits &= ~policy.l_inodebits.bits;
4364 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4365 struct lustre_handle *lockh, __u64 flags,
4366 enum ldlm_mode mode)
4368 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4373 fid = &ll_i2info(inode)->lli_fid;
4374 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4376 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4377 fid, LDLM_IBITS, &policy, mode, lockh);
4382 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4384 /* Already unlinked. Just update nlink and return success */
4385 if (rc == -ENOENT) {
4387 /* If it is a striped directory and there is a bad stripe,
4388 * let's revalidate the dentry again, instead of returning
4390 if (S_ISDIR(inode->i_mode) &&
4391 ll_i2info(inode)->lli_lsm_md != NULL)
4394 /* This path cannot be hit for regular files unless in
4395 * case of obscure races, so no need to validate
4397 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4399 } else if (rc != 0) {
4400 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4401 "%s: revalidate FID "DFID" error: rc = %d\n",
4402 ll_i2sbi(inode)->ll_fsname,
4403 PFID(ll_inode2fid(inode)), rc);
4409 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4411 struct inode *inode = dentry->d_inode;
4412 struct obd_export *exp = ll_i2mdexp(inode);
4413 struct lookup_intent oit = {
4416 struct ptlrpc_request *req = NULL;
4417 struct md_op_data *op_data;
4421 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4422 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4424 /* Call getattr by fid, so do not provide name at all. */
4425 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4426 LUSTRE_OPC_ANY, NULL);
4427 if (IS_ERR(op_data))
4428 RETURN(PTR_ERR(op_data));
4430 rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4431 ll_finish_md_op_data(op_data);
4433 rc = ll_inode_revalidate_fini(inode, rc);
4437 rc = ll_revalidate_it_finish(req, &oit, dentry);
4439 ll_intent_release(&oit);
4443 /* Unlinked? Unhash dentry, so it is not picked up later by
4444 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4445 * here to preserve get_cwd functionality on 2.6.
4447 if (!dentry->d_inode->i_nlink) {
4448 ll_lock_dcache(inode);
4449 d_lustre_invalidate(dentry, 0);
4450 ll_unlock_dcache(inode);
4453 ll_lookup_finish_locks(&oit, dentry);
4455 ptlrpc_req_finished(req);
4460 static int ll_merge_md_attr(struct inode *inode)
4462 struct ll_inode_info *lli = ll_i2info(inode);
4463 struct cl_attr attr = { 0 };
4466 LASSERT(lli->lli_lsm_md != NULL);
4467 down_read(&lli->lli_lsm_sem);
4468 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4469 &attr, ll_md_blocking_ast);
4470 up_read(&lli->lli_lsm_sem);
4474 set_nlink(inode, attr.cat_nlink);
4475 inode->i_blocks = attr.cat_blocks;
4476 i_size_write(inode, attr.cat_size);
4478 ll_i2info(inode)->lli_atime = attr.cat_atime;
4479 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4480 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
4485 static inline dev_t ll_compat_encode_dev(dev_t dev)
4487 /* The compat_sys_*stat*() syscalls will fail unless the
4488 * device majors and minors are both less than 256. Note that
4489 * the value returned here will be passed through
4490 * old_encode_dev() in cp_compat_stat(). And so we are not
4491 * trying to return a valid compat (u16) device number, just
4492 * one that will pass the old_valid_dev() check. */
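	/* Illustrative example (assumption): a device with major 300 would be
	 * masked to 300 & 0xff = 44 below; the result no longer identifies
	 * the real device, but it passes old_valid_dev(). */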
4494 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
4497 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4498 int ll_getattr(const struct path *path, struct kstat *stat,
4499 u32 request_mask, unsigned int flags)
4501 struct dentry *de = path->dentry;
4503 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4506 struct inode *inode = de->d_inode;
4507 struct ll_sb_info *sbi = ll_i2sbi(inode);
4508 struct ll_inode_info *lli = ll_i2info(inode);
4511 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4513 rc = ll_inode_revalidate(de, IT_GETATTR);
4517 if (S_ISREG(inode->i_mode)) {
4518 /* In case of restore, the MDT has the right size and has
4519 * already sent it back without granting the layout lock,
4520 * inode is up-to-date so glimpse is useless.
4521 * Also to glimpse we need the layout, in case of a running
4522 * restore the MDT holds the layout lock so the glimpse will
4523 * block up to the end of restore (getattr will block)
4525 if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4526 rc = ll_glimpse_size(inode);
4531 /* If the object isn't a regular file then don't validate its size. */
4532 if (S_ISDIR(inode->i_mode) &&
4533 lli->lli_lsm_md != NULL) {
4534 rc = ll_merge_md_attr(inode);
4539 inode->i_atime.tv_sec = lli->lli_atime;
4540 inode->i_mtime.tv_sec = lli->lli_mtime;
4541 inode->i_ctime.tv_sec = lli->lli_ctime;
4544 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4546 if (ll_need_32bit_api(sbi)) {
4547 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4548 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4549 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4551 stat->ino = inode->i_ino;
4552 stat->dev = inode->i_sb->s_dev;
4553 stat->rdev = inode->i_rdev;
4556 stat->mode = inode->i_mode;
4557 stat->uid = inode->i_uid;
4558 stat->gid = inode->i_gid;
4559 stat->atime = inode->i_atime;
4560 stat->mtime = inode->i_mtime;
4561 stat->ctime = inode->i_ctime;
4562 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4564 stat->nlink = inode->i_nlink;
4565 stat->size = i_size_read(inode);
4566 stat->blocks = inode->i_blocks;
4571 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4572 __u64 start, __u64 len)
4576 struct fiemap *fiemap;
4577 unsigned int extent_count = fieinfo->fi_extents_max;
4579 num_bytes = sizeof(*fiemap) + (extent_count *
4580 sizeof(struct fiemap_extent));
4581 OBD_ALLOC_LARGE(fiemap, num_bytes);
4586 fiemap->fm_flags = fieinfo->fi_flags;
4587 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4588 fiemap->fm_start = start;
4589 fiemap->fm_length = len;
4590 if (extent_count > 0 &&
4591 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4592 sizeof(struct fiemap_extent)) != 0)
4593 GOTO(out, rc = -EFAULT);
4595 rc = ll_do_fiemap(inode, fiemap, num_bytes);
4597 fieinfo->fi_flags = fiemap->fm_flags;
4598 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4599 if (extent_count > 0 &&
4600 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4601 fiemap->fm_mapped_extents *
4602 sizeof(struct fiemap_extent)) != 0)
4603 GOTO(out, rc = -EFAULT);
4605 OBD_FREE_LARGE(fiemap, num_bytes);
4609 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4611 struct ll_inode_info *lli = ll_i2info(inode);
4612 struct posix_acl *acl = NULL;
4615 spin_lock(&lli->lli_lock);
4616 /* VFS' acl_permission_check->check_acl will release the refcount */
4617 acl = posix_acl_dup(lli->lli_posix_acl);
4618 spin_unlock(&lli->lli_lock);
4623 #ifdef HAVE_IOP_SET_ACL
4624 #ifdef CONFIG_FS_POSIX_ACL
4625 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4627 struct ll_sb_info *sbi = ll_i2sbi(inode);
4628 struct ptlrpc_request *req = NULL;
4629 const char *name = NULL;
4631 size_t value_size = 0;
4636 case ACL_TYPE_ACCESS:
4637 name = XATTR_NAME_POSIX_ACL_ACCESS;
4639 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4642 case ACL_TYPE_DEFAULT:
4643 name = XATTR_NAME_POSIX_ACL_DEFAULT;
4644 if (!S_ISDIR(inode->i_mode))
4645 rc = acl ? -EACCES : 0;
4656 value_size = posix_acl_xattr_size(acl->a_count);
4657 value = kmalloc(value_size, GFP_NOFS);
4659 GOTO(out, rc = -ENOMEM);
4661 rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
4663 GOTO(out_value, rc);
4666 rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4667 value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
4668 name, value, value_size, 0, 0, &req);
4670 ptlrpc_req_finished(req);
4675 forget_cached_acl(inode, type);
4677 set_cached_acl(inode, type, acl);
4680 #endif /* CONFIG_FS_POSIX_ACL */
4681 #endif /* HAVE_IOP_SET_ACL */
4683 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4685 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4686 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4688 ll_check_acl(struct inode *inode, int mask)
4691 # ifdef CONFIG_FS_POSIX_ACL
4692 struct posix_acl *acl;
4696 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4697 if (flags & IPERM_FLAG_RCU)
4700 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4705 rc = posix_acl_permission(inode, acl, mask);
4706 posix_acl_release(acl);
4709 # else /* !CONFIG_FS_POSIX_ACL */
4711 # endif /* CONFIG_FS_POSIX_ACL */
4713 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
4715 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4716 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4718 # ifdef HAVE_INODE_PERMISION_2ARGS
4719 int ll_inode_permission(struct inode *inode, int mask)
4721 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4726 struct ll_sb_info *sbi;
4727 struct root_squash_info *squash;
4728 struct cred *cred = NULL;
4729 const struct cred *old_cred = NULL;
4731 bool squash_id = false;
4734 #ifdef MAY_NOT_BLOCK
4735 if (mask & MAY_NOT_BLOCK)
4737 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4738 if (flags & IPERM_FLAG_RCU)
4742 /* as the root inode is NOT validated during lookup, we need to
4743  * revalidate it here before the permission check. */
4745 if (inode == inode->i_sb->s_root->d_inode) {
4746 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4751 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4752 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4754 /* squash fsuid/fsgid if needed */
4755 sbi = ll_i2sbi(inode);
4756 squash = &sbi->ll_squash;
4757 if (unlikely(squash->rsi_uid != 0 &&
4758 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4759 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4763 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4764 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4765 squash->rsi_uid, squash->rsi_gid);
4767 /* update current process's credentials
4768 * and FS capability */
4769 cred = prepare_creds();
4773 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4774 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
4775 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4776 if ((1 << cap) & CFS_CAP_FS_MASK)
4777 cap_lower(cred->cap_effective, cap);
4779 old_cred = override_creds(cred);
4782 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4783 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4784 /* restore current process's credentials and FS capability */
4786 revert_creds(old_cred);
4793 /* -o localflock - only provides locally consistent flock locks */
4794 struct file_operations ll_file_operations = {
4795 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4796 # ifdef HAVE_SYNC_READ_WRITE
4797 .read = new_sync_read,
4798 .write = new_sync_write,
4800 .read_iter = ll_file_read_iter,
4801 .write_iter = ll_file_write_iter,
4802 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4803 .read = ll_file_read,
4804 .aio_read = ll_file_aio_read,
4805 .write = ll_file_write,
4806 .aio_write = ll_file_aio_write,
4807 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4808 .unlocked_ioctl = ll_file_ioctl,
4809 .open = ll_file_open,
4810 .release = ll_file_release,
4811 .mmap = ll_file_mmap,
4812 .llseek = ll_file_seek,
4813 .splice_read = ll_file_splice_read,
4818 struct file_operations ll_file_operations_flock = {
4819 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4820 # ifdef HAVE_SYNC_READ_WRITE
4821 .read = new_sync_read,
4822 .write = new_sync_write,
4823 # endif /* HAVE_SYNC_READ_WRITE */
4824 .read_iter = ll_file_read_iter,
4825 .write_iter = ll_file_write_iter,
4826 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4827 .read = ll_file_read,
4828 .aio_read = ll_file_aio_read,
4829 .write = ll_file_write,
4830 .aio_write = ll_file_aio_write,
4831 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4832 .unlocked_ioctl = ll_file_ioctl,
4833 .open = ll_file_open,
4834 .release = ll_file_release,
4835 .mmap = ll_file_mmap,
4836 .llseek = ll_file_seek,
4837 .splice_read = ll_file_splice_read,
4840 .flock = ll_file_flock,
4841 .lock = ll_file_flock
4844 /* These are for -o noflock - to return ENOSYS on flock calls */
4845 struct file_operations ll_file_operations_noflock = {
4846 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4847 # ifdef HAVE_SYNC_READ_WRITE
4848 .read = new_sync_read,
4849 .write = new_sync_write,
4850 # endif /* HAVE_SYNC_READ_WRITE */
4851 .read_iter = ll_file_read_iter,
4852 .write_iter = ll_file_write_iter,
4853 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4854 .read = ll_file_read,
4855 .aio_read = ll_file_aio_read,
4856 .write = ll_file_write,
4857 .aio_write = ll_file_aio_write,
4858 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4859 .unlocked_ioctl = ll_file_ioctl,
4860 .open = ll_file_open,
4861 .release = ll_file_release,
4862 .mmap = ll_file_mmap,
4863 .llseek = ll_file_seek,
4864 .splice_read = ll_file_splice_read,
4867 .flock = ll_file_noflock,
4868 .lock = ll_file_noflock
4871 struct inode_operations ll_file_inode_operations = {
4872 .setattr = ll_setattr,
4873 .getattr = ll_getattr,
4874 .permission = ll_inode_permission,
4875 #ifdef HAVE_IOP_XATTR
4876 .setxattr = ll_setxattr,
4877 .getxattr = ll_getxattr,
4878 .removexattr = ll_removexattr,
4880 .listxattr = ll_listxattr,
4881 .fiemap = ll_fiemap,
4882 #ifdef HAVE_IOP_GET_ACL
4883 .get_acl = ll_get_acl,
4885 #ifdef HAVE_IOP_SET_ACL
4886 .set_acl = ll_set_acl,
4890 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4892 struct ll_inode_info *lli = ll_i2info(inode);
4893 struct cl_object *obj = lli->lli_clob;
4902 env = cl_env_get(&refcheck);
4904 RETURN(PTR_ERR(env));
4906 rc = cl_conf_set(env, lli->lli_clob, conf);
4910 if (conf->coc_opc == OBJECT_CONF_SET) {
4911 struct ldlm_lock *lock = conf->coc_lock;
4912 struct cl_layout cl = {
4916 LASSERT(lock != NULL);
4917 LASSERT(ldlm_has_layout(lock));
4919 /* the lock can only be allowed to match after the layout has
4920  * been applied to the inode; otherwise a stale layout could be
4921  * seen. Applying the layout should happen before dropping
4922  * the intent lock. */
4923 ldlm_lock_allow_match(lock);
4925 rc = cl_object_layout_get(env, obj, &cl);
4930 DFID": layout version change: %u -> %u\n",
4931 PFID(&lli->lli_fid), ll_layout_version_get(lli),
4933 ll_layout_version_set(lli, cl.cl_layout_gen);
4937 cl_env_put(env, &refcheck);
4942 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
4943 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4946 struct ll_sb_info *sbi = ll_i2sbi(inode);
4947 struct ptlrpc_request *req;
4954 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4955 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4956 lock->l_lvb_data, lock->l_lvb_len);
4958 if (lock->l_lvb_data != NULL)
4961 /* if the layout lock was granted right away, the layout is returned
4962  * within the DLM LVB of the DLM reply; otherwise, if the lock was ever
4963  * blocked and then granted via a completion AST, we have to fetch the
4964  * layout here. Note that we can't use the LVB buffer from the
4965  * completion AST because it is not large enough. */
4966 rc = ll_get_default_mdsize(sbi, &lmmsize);
4970 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), OBD_MD_FLXATTR,
4971 XATTR_NAME_LOV, lmmsize, &req);
4974 GOTO(out, rc = 0); /* empty layout */
4981 if (lmmsize == 0) /* empty layout */
4984 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4986 GOTO(out, rc = -EFAULT);
4988 OBD_ALLOC_LARGE(lvbdata, lmmsize);
4989 if (lvbdata == NULL)
4990 GOTO(out, rc = -ENOMEM);
4992 memcpy(lvbdata, lmm, lmmsize);
4993 lock_res_and_lock(lock);
4994 if (unlikely(lock->l_lvb_data == NULL)) {
4995 lock->l_lvb_type = LVB_T_LAYOUT;
4996 lock->l_lvb_data = lvbdata;
4997 lock->l_lvb_len = lmmsize;
5000 unlock_res_and_lock(lock);
5003 OBD_FREE_LARGE(lvbdata, lmmsize);
5008 ptlrpc_req_finished(req);
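/*
 * Illustrative userspace sketch (not built with this file): the layout
 * fetched above via an OBD_MD_FLXATTR getxattr of XATTR_NAME_LOV can also
 * be read from userspace as the "lustre.lov" extended attribute, which is
 * roughly what "lfs getstripe" does.  The path and buffer size are
 * assumptions for the example.
 */
#if 0
#include <stdio.h>
#include <sys/xattr.h>

int main(void)
{
	char buf[65536];	/* assumed large enough for the layout */
	ssize_t len;

	len = getxattr("/mnt/lustre/somefile", "lustre.lov", buf, sizeof(buf));
	if (len < 0) {
		perror("getxattr");
		return 1;
	}
	printf("layout is %zd bytes\n", len);
	return 0;
}
#endif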
5013 * Apply the layout to the inode. Layout lock is held and will be released
5016 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
5017 struct inode *inode)
5019 struct ll_inode_info *lli = ll_i2info(inode);
5020 struct ll_sb_info *sbi = ll_i2sbi(inode);
5021 struct ldlm_lock *lock;
5022 struct cl_object_conf conf;
5025 bool wait_layout = false;
5028 LASSERT(lustre_handle_is_used(lockh));
5030 lock = ldlm_handle2lock(lockh);
5031 LASSERT(lock != NULL);
5032 LASSERT(ldlm_has_layout(lock));
5034 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
5035 PFID(&lli->lli_fid), inode);
5037 /* in case this is a cached lock, reinstate it with the new inode */
5038 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
5040 lock_res_and_lock(lock);
5041 lvb_ready = ldlm_is_lvb_ready(lock);
5042 unlock_res_and_lock(lock);
5044 /* checking lvb_ready is racy, but that is okay. The worst case is
5045  * that multiple processes may configure the file at the same time. */
5049 rc = ll_layout_fetch(inode, lock);
5053 /* for a layout lock, the lmm is stored in the lock's LVB.
5054  * lvb_data is immutable while the lock is held, so it is safe to access it
5057  * apply the layout to the file. This is unlikely to fail, as the old
5058  * layout has surely been eliminated */
5059 memset(&conf, 0, sizeof conf);
5060 conf.coc_opc = OBJECT_CONF_SET;
5061 conf.coc_inode = inode;
5062 conf.coc_lock = lock;
5063 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
5064 conf.u.coc_layout.lb_len = lock->l_lvb_len;
5065 rc = ll_layout_conf(inode, &conf);
5067 /* refresh layout failed, need to wait */
5068 wait_layout = rc == -EBUSY;
5071 LDLM_LOCK_PUT(lock);
5072 ldlm_lock_decref(lockh, mode);
5074 /* wait for in-flight IO to complete if the old layout is still in use. */
5076 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
5077 sbi->ll_fsname, PFID(&lli->lli_fid), inode);
5079 memset(&conf, 0, sizeof conf);
5080 conf.coc_opc = OBJECT_CONF_WAIT;
5081 conf.coc_inode = inode;
5082 rc = ll_layout_conf(inode, &conf);
5086 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
5087 sbi->ll_fsname, PFID(&lli->lli_fid), rc);
5093 * Issue layout intent RPC to MDS.
5094 * \param inode [in] file inode
5095 * \param intent [in] layout intent
5097 * \retval 0 on success
5098 * \retval < 0 error code
5100 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
5102 struct ll_inode_info *lli = ll_i2info(inode);
5103 struct ll_sb_info *sbi = ll_i2sbi(inode);
5104 struct md_op_data *op_data;
5105 struct lookup_intent it;
5106 struct ptlrpc_request *req;
5110 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
5111 0, 0, LUSTRE_OPC_ANY, NULL);
5112 if (IS_ERR(op_data))
5113 RETURN(PTR_ERR(op_data));
5115 op_data->op_data = intent;
5116 op_data->op_data_size = sizeof(*intent);
5118 memset(&it, 0, sizeof(it));
5119 it.it_op = IT_LAYOUT;
5120 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
5121 intent->li_opc == LAYOUT_INTENT_TRUNC)
5122 it.it_flags = FMODE_WRITE;
5124 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
5125 sbi->ll_fsname, PFID(&lli->lli_fid), inode);
5127 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
5128 &ll_md_blocking_ast, 0);
5129 if (it.it_request != NULL)
5130 ptlrpc_req_finished(it.it_request);
5131 it.it_request = NULL;
5133 ll_finish_md_op_data(op_data);
5135 /* set lock data in case this is a new lock */
5137 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
5139 ll_intent_drop_lock(&it);
5145 * This function checks if there exists a LAYOUT lock on the client side,
5146 * or enqueues it if it doesn't have one in cache.
5148 * This function does not hold the layout lock, so the lock may be revoked any
5149 * time after it returns. Any operation that depends on the layout should be redone
5152 * This function should be called before lov_io_init() to get an up-to-date
5153 * layout version; the caller should save the version number, and after the IO
5154 * has finished, call this function again to verify that the layout
5155 * was not changed while the IO was in flight.
5157 int ll_layout_refresh(struct inode *inode, __u32 *gen)
5159 struct ll_inode_info *lli = ll_i2info(inode);
5160 struct ll_sb_info *sbi = ll_i2sbi(inode);
5161 struct lustre_handle lockh;
5162 struct layout_intent intent = {
5163 .li_opc = LAYOUT_INTENT_ACCESS,
5165 enum ldlm_mode mode;
5169 *gen = ll_layout_version_get(lli);
5170 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
5174 LASSERT(fid_is_sane(ll_inode2fid(inode)));
5175 LASSERT(S_ISREG(inode->i_mode));
5177 /* take the layout lock mutex so the layout lock is enqueued exclusively. */
5178 mutex_lock(&lli->lli_layout_mutex);
5181 /* the layout lock is usually cached on the local side, so try to
5182  * match it before grabbing the layout lock mutex. */
5183 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5184 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
5185 if (mode != 0) { /* hit cached lock */
5186 rc = ll_layout_lock_set(&lockh, mode, inode);
5192 rc = ll_layout_intent(inode, &intent);
5198 *gen = ll_layout_version_get(lli);
5199 mutex_unlock(&lli->lli_layout_mutex);
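/*
 * Hedged sketch of the caller pattern described above ll_layout_refresh():
 * fetch the layout generation before starting IO, and verify it again once
 * the IO has finished.  do_one_io() is a hypothetical stand-in for the real
 * IO path; only ll_layout_refresh() is taken from this file.
 */
#if 0
static int io_with_layout_check(struct inode *inode)
{
	__u32 gen_before, gen_after;
	int rc;

	rc = ll_layout_refresh(inode, &gen_before);
	if (rc != 0)
		return rc;

	rc = do_one_io(inode);		/* hypothetical IO step */
	if (rc != 0)
		return rc;

	rc = ll_layout_refresh(inode, &gen_after);
	if (rc != 0)
		return rc;

	/* if the layout changed while the IO ran, the caller must redo it */
	return gen_before == gen_after ? 0 : -EAGAIN;
}
#endif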
5205 * Issue layout intent RPC indicating where in a file an IO is about to write.
5207 * \param[in] inode file inode.
5208 * \param[in] ext write range, with the start offset in bytes of the file where
5209 * an IO is about to write, and the exclusive end offset in bytes
5212 * \retval 0 on success
5213 * \retval < 0 error code
5215 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5216 struct lu_extent *ext)
5218 struct layout_intent intent = {
5220 .li_extent.e_start = ext->e_start,
5221 .li_extent.e_end = ext->e_end,
5226 rc = ll_layout_intent(inode, &intent);
5232 * This function sends a restore request to the MDT
5234 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5236 struct hsm_user_request *hur;
5240 len = sizeof(struct hsm_user_request) +
5241 sizeof(struct hsm_user_item);
5242 OBD_ALLOC(hur, len);
5246 hur->hur_request.hr_action = HUA_RESTORE;
5247 hur->hur_request.hr_archive_id = 0;
5248 hur->hur_request.hr_flags = 0;
5249 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5250 sizeof(hur->hur_user_item[0].hui_fid));
5251 hur->hur_user_item[0].hui_extent.offset = offset;
5252 hur->hur_user_item[0].hui_extent.length = length;
5253 hur->hur_request.hr_itemcount = 1;
5254 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,