/*
 * GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.gnu.org/licenses/gpl-2.0.html
 *
 * GPL HEADER END
 */
/*
 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2011, 2017, Intel Corporation.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 *
 * lustre/llite/file.c
 *
 * Author: Peter Braam <braam@clusterfs.com>
 * Author: Phil Schwan <phil@clusterfs.com>
 * Author: Andreas Dilger <adilger@clusterfs.com>
 */

#define DEBUG_SUBSYSTEM S_LLITE
#include <lustre_dlm.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/sched.h>
#include <linux/user_namespace.h>
#ifdef HAVE_UIDGID_HEADER
# include <linux/uidgid.h>
#endif

#include <uapi/linux/lustre/lustre_ioctl.h>
#include <lustre_swab.h>

#include "cl_object.h"
#include "llite_internal.h"
#include "vvp_internal.h"

/* (The enclosing declaration was lost in this excerpt; sp_inode is the
 * victim-inode member of the layout-split parameter block, reconstructed
 * here from its use in ll_close_inode_openhandle().) */
struct split_param {
	struct inode	*sp_inode;
	__u16		 sp_mirror_id;
};

static int
ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);

static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
			  bool *lease_broken);

static struct ll_file_data *ll_file_data_get(void)
{
	struct ll_file_data *fd;

	OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
	if (fd == NULL)
		return NULL;

	fd->fd_write_failed = false;

	return fd;
}

static void ll_file_data_put(struct ll_file_data *fd)
{
	if (fd != NULL)
		OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
}
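/* (Note added in this excerpt: ll_file_data is the per-open private state
 * kept in file->private_data; the get/put helpers above allocate and free
 * it from the ll_file_data_slab cache.) */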
/**
 * Packs all the attributes into @op_data for the CLOSE rpc.
 */
static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
			     struct obd_client_handle *och)
{
	ll_prep_md_op_data(op_data, inode, NULL, NULL,
			   0, 0, LUSTRE_OPC_ANY, NULL);

	op_data->op_attr.ia_mode = inode->i_mode;
	op_data->op_attr.ia_atime = inode->i_atime;
	op_data->op_attr.ia_mtime = inode->i_mtime;
	op_data->op_attr.ia_ctime = inode->i_ctime;
	op_data->op_attr.ia_size = i_size_read(inode);
	op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
				      ATTR_MTIME | ATTR_MTIME_SET |
				      ATTR_CTIME);
	op_data->op_xvalid |= OP_XVALID_CTIME_SET;
	op_data->op_attr_blocks = inode->i_blocks;
	op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
	if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
		op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
	op_data->op_open_handle = och->och_open_handle;

	if (och->och_flags & FMODE_WRITE &&
	    ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
		/* For HSM: if the inode data has been modified, pack it so
		 * that the MDT can set the data-dirty flag in the archive. */
		op_data->op_bias |= MDS_DATA_MODIFIED;
}

/**
 * Perform a close, possibly with a bias.
 * The meaning of "data" depends on the value of "bias".
 *
 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
 * swap layouts with.
 */
static int ll_close_inode_openhandle(struct inode *inode,
				     struct obd_client_handle *och,
				     enum mds_op_bias bias, void *data)
{
	struct obd_export *md_exp = ll_i2mdexp(inode);
	const struct ll_inode_info *lli = ll_i2info(inode);
	struct md_op_data *op_data;
	struct ptlrpc_request *req = NULL;
	int rc;

	if (class_exp2obd(md_exp) == NULL) {
		CERROR("%s: invalid MDC connection handle closing "DFID"\n",
		       ll_i2sbi(inode)->ll_fsname, PFID(&lli->lli_fid));
		GOTO(out, rc = 0);
	}

	OBD_ALLOC_PTR(op_data);
	/* We leak the openhandle and request here on error, but there is not
	 * much to be done in the OOM case since the app won't retry the close
	 * on error either. */
	if (op_data == NULL)
		GOTO(out, rc = -ENOMEM);

	ll_prepare_close(inode, op_data, och);
	switch (bias) {
	case MDS_CLOSE_LAYOUT_MERGE:
		/* merge blocks from the victim inode */
		op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
		op_data->op_attr.ia_valid |= ATTR_SIZE;
		op_data->op_xvalid |= OP_XVALID_BLOCKS;
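		/* fallthrough: MERGE shares the lease/bias setup below with
		 * SPLIT and SWAP (there is no break here in the original) */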
	case MDS_CLOSE_LAYOUT_SPLIT:
	case MDS_CLOSE_LAYOUT_SWAP: {
		struct split_param *sp = data;

		LASSERT(data != NULL);
		op_data->op_bias |= bias;
		op_data->op_data_version = 0;
		op_data->op_lease_handle = och->och_lease_handle;
		if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
			op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
			op_data->op_mirror_id = sp->sp_mirror_id;
		} else {
			op_data->op_fid2 = *ll_inode2fid(data);
		}
		break;
	}

	case MDS_CLOSE_RESYNC_DONE: {
		struct ll_ioc_lease *ioc = data;

		LASSERT(data != NULL);
		op_data->op_attr_blocks +=
			ioc->lil_count * op_data->op_attr_blocks;
		op_data->op_attr.ia_valid |= ATTR_SIZE;
		op_data->op_xvalid |= OP_XVALID_BLOCKS;
		op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;

		op_data->op_lease_handle = och->och_lease_handle;
		op_data->op_data = &ioc->lil_ids[0];
		op_data->op_data_size =
			ioc->lil_count * sizeof(ioc->lil_ids[0]);
		break;
	}

	case MDS_HSM_RELEASE:
		LASSERT(data != NULL);
		op_data->op_bias |= MDS_HSM_RELEASE;
		op_data->op_data_version = *(__u64 *)data;
		op_data->op_lease_handle = och->och_lease_handle;
		op_data->op_attr.ia_valid |= ATTR_SIZE;
		op_data->op_xvalid |= OP_XVALID_BLOCKS;
		break;

	default:
		LASSERT(data == NULL);
		break;
	}

	if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
		op_data->op_xvalid |= OP_XVALID_LAZYSIZE;
	if (!(op_data->op_xvalid & OP_XVALID_BLOCKS))
		op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS;
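	/* (Note added in this excerpt: when no authoritative size/blocks was
	 * packed above, the LAZY flags let the MDT obtain those attributes
	 * lazily instead of trusting possibly stale client values.) */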
	rc = md_close(md_exp, op_data, och->och_mod, &req);
	if (rc != 0 && rc != -EINTR)
		CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
		       md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);

	if (rc == 0 && op_data->op_bias & bias) {
		struct mdt_body *body;

		body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
		if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
			rc = -EBUSY;
	}

	ll_finish_md_op_data(op_data);

out:
	md_clear_open_replay_data(md_exp, och);
	och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
	OBD_FREE_PTR(och);

	ptlrpc_req_finished(req); /* This is the close request */
	return rc;
}
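/* (Summary added in this excerpt.) Send the real close RPC for the cached
 * MDS open handle of the given open mode, unless that handle still has
 * other local users. */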
int ll_md_real_close(struct inode *inode, fmode_t fmode)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct obd_client_handle **och_p;
	struct obd_client_handle *och;
	__u64 *och_usecount;
	int rc = 0;

	if (fmode & FMODE_WRITE) {
		och_p = &lli->lli_mds_write_och;
		och_usecount = &lli->lli_open_fd_write_count;
	} else if (fmode & FMODE_EXEC) {
		och_p = &lli->lli_mds_exec_och;
		och_usecount = &lli->lli_open_fd_exec_count;
	} else {
		LASSERT(fmode & FMODE_READ);
		och_p = &lli->lli_mds_read_och;
		och_usecount = &lli->lli_open_fd_read_count;
	}

	mutex_lock(&lli->lli_och_mutex);
	if (*och_usecount > 0) {
		/* There are still users of this handle, so skip
		 * freeing it. */
		mutex_unlock(&lli->lli_och_mutex);
		RETURN(0);
	}

	och = *och_p;
	*och_p = NULL;
	mutex_unlock(&lli->lli_och_mutex);

	if (och != NULL) {
		/* There might be a race and this handle may already
		 * be closed. */
		rc = ll_close_inode_openhandle(inode, och, 0, NULL);
	}

	RETURN(rc);
}

static int ll_md_close(struct inode *inode, struct file *file)
{
	union ldlm_policy_data policy = {
		.l_inodebits = { MDS_INODELOCK_OPEN },
	};
	__u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct ll_inode_info *lli = ll_i2info(inode);
	struct lustre_handle lockh;
	enum ldlm_mode lockmode;
	int rc = 0;

	/* clear group lock, if present */
	if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
		ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);

	if (fd->fd_lease_och != NULL) {
		bool lease_broken;

		/* Usually the lease is not released when the application
		 * crashes, so we need to release it here. */
		rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
		CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
		       PFID(&lli->lli_fid), rc, lease_broken);

		fd->fd_lease_och = NULL;
	}

	if (fd->fd_och != NULL) {
		rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
		fd->fd_och = NULL;
		GOTO(out, rc);
	}

	/* Let's see if we have a good enough OPEN lock on the file and
	 * if we can skip talking to the MDS */
	mutex_lock(&lli->lli_och_mutex);
	if (fd->fd_omode & FMODE_WRITE) {
		lockmode = LCK_CW;
		LASSERT(lli->lli_open_fd_write_count);
		lli->lli_open_fd_write_count--;
	} else if (fd->fd_omode & FMODE_EXEC) {
		lockmode = LCK_PR;
		LASSERT(lli->lli_open_fd_exec_count);
		lli->lli_open_fd_exec_count--;
	} else {
		lockmode = LCK_CR;
		LASSERT(lli->lli_open_fd_read_count);
		lli->lli_open_fd_read_count--;
	}
	mutex_unlock(&lli->lli_och_mutex);

	if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
			   LDLM_IBITS, &policy, lockmode, &lockh))
		rc = ll_md_real_close(inode, fd->fd_omode);

out:
	LUSTRE_FPRIVATE(file) = NULL;
	ll_file_data_put(fd);

	RETURN(rc);
}

/* While this returns an error code, the caller's fput() does not, so we
 * need to make every effort to clean up all of our state here. Also,
 * applications rarely check close errors, and even if an error is
 * returned they will not retry the close call.
 */
int ll_file_release(struct inode *inode, struct file *file)
{
	struct ll_file_data *fd;
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ll_inode_info *lli = ll_i2info(inode);
	int rc;

	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
	       PFID(ll_inode2fid(inode)), inode);

	if (inode->i_sb->s_root != file_dentry(file))
		ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
	fd = LUSTRE_FPRIVATE(file);
	LASSERT(fd != NULL);

	/* The last ref on @file may not be the owner pid of statahead,
	 * because parent and child processes can share the same file
	 * handle. */
	if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
		ll_deauthorize_statahead(inode, fd);

	if (inode->i_sb->s_root == file_dentry(file)) {
		LUSTRE_FPRIVATE(file) = NULL;
		ll_file_data_put(fd);
		RETURN(0);
	}

	if (!S_ISDIR(inode->i_mode)) {
		if (lli->lli_clob != NULL)
			lov_read_and_clear_async_rc(lli->lli_clob);
		lli->lli_async_rc = 0;
	}

	rc = ll_md_close(inode, file);

	if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
		libcfs_debug_dumplog();

	RETURN(rc);
}

static inline int ll_dom_readpage(void *data, struct page *page)
{
	struct niobuf_local *lnb = data;
	void *kaddr;

	kaddr = ll_kmap_atomic(page, KM_USER0);
	memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
	if (lnb->lnb_len < PAGE_SIZE)
		memset(kaddr + lnb->lnb_len, 0,
		       PAGE_SIZE - lnb->lnb_len);
	flush_dcache_page(page);
	SetPageUptodate(page);
	ll_kunmap_atomic(kaddr, KM_USER0);
	unlock_page(page);

	return 0;
}
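/* (Summary added in this excerpt.) For Data-on-MDT files the open reply may
 * carry the file data inline (RMF_NIOBUF_INLINE). If a DOM lock was granted,
 * copy that data into the page cache so subsequent reads are served locally
 * without an extra RPC. */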
void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
			struct lookup_intent *it)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct cl_object *obj = lli->lli_clob;
	struct address_space *mapping = inode->i_mapping;
	struct page *vmpage;
	struct niobuf_remote *rnb;
	char *data;
	struct lustre_handle lockh;
	struct ldlm_lock *lock;
	unsigned long index, start;
	struct niobuf_local lnb;
	bool dom_lock = false;

	if (obj == NULL)
		RETURN_EXIT;

	if (it->it_lock_mode != 0) {
		lockh.cookie = it->it_lock_handle;
		lock = ldlm_handle2lock(&lockh);
		if (lock != NULL) {
			dom_lock = ldlm_has_dom(lock);
			LDLM_LOCK_PUT(lock);
		}
	}
	if (!dom_lock)
		RETURN_EXIT;

	if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
				   RCL_SERVER))
		RETURN_EXIT;

	rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
	if (rnb == NULL || rnb->rnb_len == 0)
		RETURN_EXIT;

	/* LU-11595: the server may return the whole file (which is always OK)
	 * or just the file tail, whose offset must be aligned to the client
	 * PAGE_SIZE to be usable on this client; if the server's PAGE_SIZE is
	 * smaller, the offset may be unaligned and that data is just ignored.
	 */
	if (rnb->rnb_offset % PAGE_SIZE)
		RETURN_EXIT;

	/* Server returns whole file or just file tail if it fills in
	 * reply buffer, in both cases total size should be inode size.
	 */
	if (rnb->rnb_offset + rnb->rnb_len < i_size_read(inode)) {
		CERROR("%s: server returns off/len %llu/%u < i_size %llu\n",
		       ll_i2sbi(inode)->ll_fsname, rnb->rnb_offset,
		       rnb->rnb_len, i_size_read(inode));
		RETURN_EXIT;
	}

	CDEBUG(D_INFO, "Get data along with open at %llu len %i, i_size %llu\n",
	       rnb->rnb_offset, rnb->rnb_len, i_size_read(inode));

	data = (char *)rnb + sizeof(*rnb);

	lnb.lnb_file_offset = rnb->rnb_offset;
	start = lnb.lnb_file_offset / PAGE_SIZE;
	index = 0;
	LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
	lnb.lnb_page_offset = 0;
	do {
		lnb.lnb_data = data + (index << PAGE_SHIFT);
		lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
		if (lnb.lnb_len > PAGE_SIZE)
			lnb.lnb_len = PAGE_SIZE;

		vmpage = read_cache_page(mapping, index + start,
					 ll_dom_readpage, &lnb);
		if (IS_ERR(vmpage)) {
			CWARN("%s: cannot fill page %lu for "DFID
			      " with data: rc = %li\n",
			      ll_i2sbi(inode)->ll_fsname, index + start,
			      PFID(lu_object_fid(&obj->co_lu)),
			      PTR_ERR(vmpage));
			break;
		}
		put_page(vmpage);
		index++;
	} while (rnb->rnb_len > (index << PAGE_SHIFT));
}

static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
			       struct lookup_intent *itp)
{
	struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
	struct dentry *parent = de->d_parent;
	char *name = NULL;
	int len = 0;
	struct md_op_data *op_data;
	struct ptlrpc_request *req = NULL;
	int rc;

	LASSERT(parent != NULL);
	LASSERT(itp->it_flags & MDS_OPEN_BY_FID);

	/* if the server supports open-by-fid, or the file name is invalid,
	 * don't pack the name in the open request */
	if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_OPEN_BY_NAME) ||
	    !(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID)) {
retry:
		len = de->d_name.len;
		name = kmalloc(len + 1, GFP_NOFS);
		if (name == NULL)
			RETURN(-ENOMEM);

		/* race here */
		spin_lock(&de->d_lock);
		if (len != de->d_name.len) {
			spin_unlock(&de->d_lock);
			kfree(name);
			goto retry;
		}
		memcpy(name, de->d_name.name, len);
		name[len] = '\0';
		spin_unlock(&de->d_lock);

		if (!lu_name_is_valid_2(name, len)) {
			kfree(name);
			RETURN(-ESTALE);
		}
	}

	op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
				     name, len, 0, LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data)) {
		kfree(name);
		RETURN(PTR_ERR(op_data));
	}
	op_data->op_data = lmm;
	op_data->op_data_size = lmmsize;

	rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
			    &ll_md_blocking_ast, 0);
	kfree(name);
	ll_finish_md_op_data(op_data);
	if (rc == -ESTALE) {
		/* reason to keep our own exit path: don't flood the log
		 * with -ESTALE error messages.
		 */
		if (!it_disposition(itp, DISP_OPEN_OPEN) ||
		    it_open_error(DISP_OPEN_OPEN, itp))
			GOTO(out, rc);
		ll_release_openhandle(de, itp);
		GOTO(out, rc);
	}

	if (it_disposition(itp, DISP_LOOKUP_NEG))
		GOTO(out, rc = -ENOENT);

	if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
		rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
		CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
		GOTO(out, rc);
	}

	rc = ll_prep_inode(&de->d_inode, req, NULL, itp);

	if (!rc && itp->it_lock_mode) {
		ll_dom_finish_open(de->d_inode, req, itp);
		ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
	}

out:
	ptlrpc_req_finished(req);
	ll_intent_drop_lock(itp);

	/* We did the open by fid, but by the time we got to the server,
	 * the object had disappeared. If this is a create, we cannot really
	 * tell userspace that the file it was trying to create does not
	 * exist. Instead return -ESTALE, and the VFS will retry the create
	 * with LOOKUP_REVAL, which we are going to catch in
	 * ll_revalidate_dentry() and use lookup then.
	 */
	if (rc == -ENOENT && itp->it_op & IT_CREAT)
		rc = -ESTALE;

	RETURN(rc);
}
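/* (Summary added in this excerpt.) Fill a client open handle from the MDT
 * reply carried in @it and register it for open replay in case of MDS
 * recovery. */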
static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
		       struct obd_client_handle *och)
{
	struct mdt_body *body;

	body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
	och->och_open_handle = body->mbo_open_handle;
	och->och_fid = body->mbo_fid1;
	och->och_lease_handle.cookie = it->it_lock_handle;
	och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
	och->och_flags = it->it_flags;

	return md_set_open_replay_data(md_exp, och, it);
}

static int ll_local_open(struct file *file, struct lookup_intent *it,
			 struct ll_file_data *fd, struct obd_client_handle *och)
{
	struct inode *inode = file_inode(file);

	LASSERT(!LUSTRE_FPRIVATE(file));

	if (och != NULL) {
		int rc;

		rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
		if (rc != 0)
			RETURN(rc);
	}

	LUSTRE_FPRIVATE(file) = fd;
	ll_readahead_init(inode, &fd->fd_ras);
	fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);

	/* ll_cl_context initialization */
	rwlock_init(&fd->fd_lock);
	INIT_LIST_HEAD(&fd->fd_lccs);

	RETURN(0);
}

/* Open a file, and (for the very first open) create objects on the OSTs at
 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
 * creation or open until the ll_lov_setstripe() ioctl is called.
 *
 * If we already have the stripe MD locally then we don't request it in
 * md_open(), by passing lmm_size = 0.
 *
 * It is up to the application to ensure no other processes open this file
 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 * used. We might be able to avoid races of that sort by getting lli_open_sem
 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 */
int ll_file_open(struct inode *inode, struct file *file)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct lookup_intent *it, oit = { .it_op = IT_OPEN,
					  .it_flags = file->f_flags };
	struct obd_client_handle **och_p = NULL;
	__u64 *och_usecount = NULL;
	struct ll_file_data *fd;
	int rc = 0;

	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
	       PFID(ll_inode2fid(inode)), inode, file->f_flags);

	it = file->private_data; /* XXX: compat macro */
	file->private_data = NULL; /* prevent ll_local_open assertion */

	fd = ll_file_data_get();
	if (fd == NULL)
		GOTO(out_nofiledata, rc = -ENOMEM);

	if (S_ISDIR(inode->i_mode))
		ll_authorize_statahead(inode, fd);

	if (inode->i_sb->s_root == file_dentry(file)) {
		LUSTRE_FPRIVATE(file) = fd;
		RETURN(0);
	}

	if (!it || !it->it_disposition) {
		/* Convert f_flags into access mode. We cannot use file->f_mode,
		 * because everything but the O_ACCMODE mask was stripped from
		 * there */
		if ((oit.it_flags + 1) & O_ACCMODE)
			oit.it_flags++;
		if (file->f_flags & O_TRUNC)
			oit.it_flags |= FMODE_WRITE;

		/* The kernel only calls f_op->open() in dentry_open();
		 * filp_open() calls dentry_open() after open_namei() has
		 * checked permissions. Only nfsd_open() calls dentry_open()
		 * directly without checking permissions, and because of that
		 * the code below is safe.
		 */
		if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
			oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;

		/* We do not want O_EXCL here, presumably we opened the file
		 * already? XXX - NFS implications? */
		oit.it_flags &= ~O_EXCL;

		/* bug20584: if "it_flags" contains O_CREAT, the file will be
		 * created if necessary, so "IT_CREAT" should be set to stay
		 * consistent with it */
		if (oit.it_flags & O_CREAT)
			oit.it_op |= IT_CREAT;

		it = &oit;
	}

restart:
	/* Let's see if we have the file open on the MDS already. */
	if (it->it_flags & FMODE_WRITE) {
		och_p = &lli->lli_mds_write_och;
		och_usecount = &lli->lli_open_fd_write_count;
	} else if (it->it_flags & FMODE_EXEC) {
		och_p = &lli->lli_mds_exec_och;
		och_usecount = &lli->lli_open_fd_exec_count;
	} else {
		och_p = &lli->lli_mds_read_och;
		och_usecount = &lli->lli_open_fd_read_count;
	}

	mutex_lock(&lli->lli_och_mutex);
	if (*och_p) { /* Open handle is present */
		if (it_disposition(it, DISP_OPEN_OPEN)) {
			/* Well, there's an extra open request that we do not
			 * need; let's close it somehow. This will decref the
			 * request. */
			rc = it_open_error(DISP_OPEN_OPEN, it);
			if (rc) {
				mutex_unlock(&lli->lli_och_mutex);
				GOTO(out_openerr, rc);
			}

			ll_release_openhandle(file_dentry(file), it);
		}
		(*och_usecount)++;

		rc = ll_local_open(file, it, fd, NULL);
		if (rc) {
			(*och_usecount)--;
			mutex_unlock(&lli->lli_och_mutex);
			GOTO(out_openerr, rc);
		}
	} else {
		LASSERT(*och_usecount == 0);
		if (!it->it_disposition) {
			struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);

			/* We cannot just request the lock handle now; the new
			 * ELC code means that one of the other OPEN locks for
			 * this file could be cancelled, and since the blocking
			 * AST handler would attempt to grab och_mutex as well,
			 * that would result in a deadlock */
			mutex_unlock(&lli->lli_och_mutex);
			/*
			 * Normally called under two situations:
			 * 1. NFS export.
			 * 2. A race/condition on MDS resulting in no open
			 *    handle to be returned from LOOKUP|OPEN request,
			 *    for example if the target entry was a symlink.
			 *
			 * Only fetch MDS_OPEN_LOCK if this is in the NFS path,
			 * marked by a bit set in ll_iget_for_nfs. Clear the
			 * bit so that it's not confusing later callers.
			 *
			 * NB: when ldd is NULL, it must have come via the
			 * normal lookup path only, since ll_iget_for_nfs
			 * always calls ll_d_init().
			 */
			if (ldd && ldd->lld_nfs_dentry) {
				ldd->lld_nfs_dentry = 0;
				it->it_flags |= MDS_OPEN_LOCK;
			}

			/*
			 * Always specify MDS_OPEN_BY_FID because we don't want
			 * to get a file with a different fid.
			 */
			it->it_flags |= MDS_OPEN_BY_FID;
			rc = ll_intent_file_open(file_dentry(file), NULL, 0,
						 it);
			if (rc)
				GOTO(out_openerr, rc);

			goto restart;
		}
		OBD_ALLOC(*och_p, sizeof(struct obd_client_handle));
		if (!*och_p)
			GOTO(out_och_free, rc = -ENOMEM);

		(*och_usecount)++;

		/* md_intent_lock() didn't get a request ref if there was an
		 * open error, so don't do cleanup on the request here
		 */
		/* XXX (green): Should we not bail out on any error here, not
		 * just an open error? */
		rc = it_open_error(DISP_OPEN_OPEN, it);
		if (rc != 0)
			GOTO(out_och_free, rc);

		LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
			 "inode %p: disposition %x, status %d\n", inode,
			 it_disposition(it, ~0), it->it_status);

		rc = ll_local_open(file, it, fd, *och_p);
		if (rc)
			GOTO(out_och_free, rc);
	}
	mutex_unlock(&lli->lli_och_mutex);

	/* Must do this outside the lli_och_mutex lock to prevent a deadlock
	 * where a different kind of OPEN lock for this same inode gets
	 * cancelled by ldlm_cancel_lru */
	if (!S_ISREG(inode->i_mode))
		GOTO(out_och_free, rc);

	cl_lov_delay_create_clear(&file->f_flags);
	GOTO(out_och_free, rc);

out_och_free:
	if (rc) {
		if (och_p && *och_p) {
			OBD_FREE(*och_p, sizeof(struct obd_client_handle));
			*och_p = NULL; /* OBD_FREE writes some magic there */
			(*och_usecount)--;
		}
		mutex_unlock(&lli->lli_och_mutex);

out_openerr:
		if (lli->lli_opendir_key == fd)
			ll_deauthorize_statahead(inode, fd);

		if (fd != NULL)
			ll_file_data_put(fd);
	} else {
		ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
	}

out_nofiledata:
	if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
		ptlrpc_req_finished(it->it_request);
		it_clear_disposition(it, DISP_ENQ_OPEN_REF);
	}

	RETURN(rc);
}

static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
				    struct ldlm_lock_desc *desc,
				    void *data, int flag)
{
	struct lustre_handle lockh;
	int rc;

	switch (flag) {
	case LDLM_CB_BLOCKING:
		ldlm_lock2handle(lock, &lockh);
		rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
		if (rc < 0) {
			CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
			RETURN(rc);
		}
		break;
	case LDLM_CB_CANCELING:
		/* do nothing */
		break;
	}
	RETURN(0);
}

/**
 * When setting a lease on a file, we take ownership of the lli_mds_*_och
 * and save it as fd->fd_och so as to force the client to reopen the file
 * even if it has an open lock in cache already.
 */
static int ll_lease_och_acquire(struct inode *inode, struct file *file,
				struct lustre_handle *old_open_handle)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct obd_client_handle **och_p;
	__u64 *och_usecount;
	int rc = 0;

	/* Get the openhandle of the file */
	mutex_lock(&lli->lli_och_mutex);
	if (fd->fd_lease_och != NULL)
		GOTO(out_unlock, rc = -EBUSY);

	if (fd->fd_och == NULL) {
		if (file->f_mode & FMODE_WRITE) {
			LASSERT(lli->lli_mds_write_och != NULL);
			och_p = &lli->lli_mds_write_och;
			och_usecount = &lli->lli_open_fd_write_count;
		} else {
			LASSERT(lli->lli_mds_read_och != NULL);
			och_p = &lli->lli_mds_read_och;
			och_usecount = &lli->lli_open_fd_read_count;
		}

		if (*och_usecount > 1)
			GOTO(out_unlock, rc = -EBUSY);

		fd->fd_och = *och_p;
		*och_p = NULL;
		*och_usecount = 0;
	}

	*old_open_handle = fd->fd_och->och_open_handle;

out_unlock:
	mutex_unlock(&lli->lli_och_mutex);
	RETURN(rc);
}

/**
 * Release ownership on lli_mds_*_och when putting back a file lease.
 */
static int ll_lease_och_release(struct inode *inode, struct file *file)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct obd_client_handle **och_p;
	struct obd_client_handle *old_och = NULL;
	__u64 *och_usecount;
	int rc = 0;

	mutex_lock(&lli->lli_och_mutex);
	if (file->f_mode & FMODE_WRITE) {
		och_p = &lli->lli_mds_write_och;
		och_usecount = &lli->lli_open_fd_write_count;
	} else {
		och_p = &lli->lli_mds_read_och;
		och_usecount = &lli->lli_open_fd_read_count;
	}

	/* The file may have been opened by another process (broken lease), so
	 * *och_p is not NULL. In this case we should simply increase the
	 * usecount and close fd_och.
	 */
	if (*och_p != NULL) {
		old_och = fd->fd_och;
		(*och_usecount)++;
	} else {
		*och_p = fd->fd_och;
		*och_usecount = 1;
	}
	fd->fd_och = NULL;
	mutex_unlock(&lli->lli_och_mutex);

	if (old_och != NULL)
		rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);

	RETURN(rc);
}

/**
 * Acquire a lease and open the file.
 */
static struct obd_client_handle *
ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
	      __u64 open_flags)
{
	struct lookup_intent it = { .it_op = IT_OPEN };
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct md_op_data *op_data;
	struct ptlrpc_request *req = NULL;
	struct lustre_handle old_open_handle = { 0 };
	struct obd_client_handle *och = NULL;
	int rc;
	int rc2;

	if (fmode != FMODE_WRITE && fmode != FMODE_READ)
		RETURN(ERR_PTR(-EINVAL));

	if (file != NULL) {
		if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
			RETURN(ERR_PTR(-EPERM));

		rc = ll_lease_och_acquire(inode, file, &old_open_handle);
		if (rc)
			RETURN(ERR_PTR(rc));
	}

	OBD_ALLOC_PTR(och);
	if (och == NULL)
		RETURN(ERR_PTR(-ENOMEM));

	op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
				     LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data))
		GOTO(out, rc = PTR_ERR(op_data));

	/* To tell the MDT this openhandle is from the same owner */
	op_data->op_open_handle = old_open_handle;

	it.it_flags = fmode | open_flags;
	it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
	rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
			    &ll_md_blocking_lease_ast,
	/* LDLM_FL_NO_LRU: do not put the lease lock into the LRU list,
	 * otherwise it can be cancelled, which may mislead applications
	 * into believing the lease is still valid.
	 * LDLM_FL_EXCL: set this flag so that the lock won't be matched by a
	 * normal open in ll_md_blocking_ast(). Otherwise, since
	 * ll_md_blocking_lease_ast() doesn't deal with the openhandle, a
	 * normal openhandle would be leaked. */
			    LDLM_FL_NO_LRU | LDLM_FL_EXCL);
	ll_finish_md_op_data(op_data);
	ptlrpc_req_finished(req);
	if (rc < 0)
		GOTO(out_release_it, rc);

	if (it_disposition(&it, DISP_LOOKUP_NEG))
		GOTO(out_release_it, rc = -ENOENT);

	rc = it_open_error(DISP_OPEN_OPEN, &it);
	if (rc)
		GOTO(out_release_it, rc);

	LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
	ll_och_fill(sbi->ll_md_exp, &it, och);

	if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
		GOTO(out_close, rc = -EOPNOTSUPP);

	/* lease already acquired; handle the lease lock */
	ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
	if (it.it_lock_mode == 0 ||
	    it.it_lock_bits != MDS_INODELOCK_OPEN) {
		/* an open lock must be returned for a lease */
		CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
		       PFID(ll_inode2fid(inode)), it.it_lock_mode,
		       it.it_lock_bits);
		GOTO(out_close, rc = -EPROTO);
	}

	ll_intent_release(&it);

	RETURN(och);

out_close:
	/* Cancel open lock */
	if (it.it_lock_mode != 0) {
		ldlm_lock_decref_and_cancel(&och->och_lease_handle,
					    it.it_lock_mode);
		it.it_lock_mode = 0;
		och->och_lease_handle.cookie = 0ULL;
	}
	rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
	if (rc2 < 0)
		CERROR("%s: error closing file "DFID": %d\n",
		       sbi->ll_fsname, PFID(&ll_i2info(inode)->lli_fid), rc2);
	och = NULL; /* och has been freed in ll_close_inode_openhandle() */

out_release_it:
	ll_intent_release(&it);
out:
	if (och != NULL)
		OBD_FREE_PTR(och);
	RETURN(ERR_PTR(rc));
}
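/* (Note added in this excerpt: on success ll_lease_open() returns an
 * obd_client_handle that owns both the MDS open handle and the lease lock
 * handle; it backs the file-lease ioctls, and the handle is later put back
 * via ll_lease_close()/ll_lease_och_release().) */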
/**
 * Check whether a layout swap can be done between two inodes.
 *
 * \param[in] inode1	First inode to check
 * \param[in] inode2	Second inode to check
 *
 * \retval 0 on success, layout swap can be performed between both inodes
 * \retval negative error code if requirements are not met
 */
static int ll_check_swap_layouts_validity(struct inode *inode1,
					  struct inode *inode2)
{
	if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
		return -EINVAL;

	if (inode_permission(inode1, MAY_WRITE) ||
	    inode_permission(inode2, MAY_WRITE))
		return -EPERM;

	if (inode1->i_sb != inode2->i_sb)
		return -EXDEV;

	return 0;
}

static int ll_swap_layouts_close(struct obd_client_handle *och,
				 struct inode *inode, struct inode *inode2)
{
	const struct lu_fid *fid1 = ll_inode2fid(inode);
	const struct lu_fid *fid2;
	int rc;

	CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
	       ll_i2sbi(inode)->ll_fsname, PFID(fid1));

	rc = ll_check_swap_layouts_validity(inode, inode2);
	if (rc < 0)
		GOTO(out_free_och, rc);

	/* We now know that inode2 is a lustre inode */
	fid2 = ll_inode2fid(inode2);

	rc = lu_fid_cmp(fid1, fid2);
	if (rc == 0)
		GOTO(out_free_och, rc = -EINVAL);

	/* Close the file and {swap,merge} layouts between inode & inode2.
	 * NB: the lease lock handle is released in
	 * mdc_close_layout_swap_pack() because we still need it to pack
	 * l_remote_handle to the MDT. */
	rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
				       inode2);

	och = NULL; /* freed in ll_close_inode_openhandle() */

out_free_och:
	if (och != NULL)
		OBD_FREE_PTR(och);

	RETURN(rc);
}

/**
 * Release lease and close the file.
 * It will check if the lease has ever been broken.
 */
static int ll_lease_close_intent(struct obd_client_handle *och,
				 struct inode *inode,
				 bool *lease_broken, enum mds_op_bias bias,
				 void *data)
{
	struct ldlm_lock *lock;
	bool cancelled = true;
	int rc;

	lock = ldlm_handle2lock(&och->och_lease_handle);
	if (lock != NULL) {
		lock_res_and_lock(lock);
		cancelled = ldlm_is_cancel(lock);
		unlock_res_and_lock(lock);
		LDLM_LOCK_PUT(lock);
	}

	CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
	       PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);

	if (lease_broken != NULL)
		*lease_broken = cancelled;

	if (!cancelled && !bias)
		ldlm_cli_cancel(&och->och_lease_handle, 0);

	if (cancelled) { /* no need to execute intent */
		bias = 0;
		data = NULL;
	}

	rc = ll_close_inode_openhandle(inode, och, bias, data);
	RETURN(rc);
}

static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
			  bool *lease_broken)
{
	return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
}

/**
 * After the lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
 */
static int ll_lease_file_resync(struct obd_client_handle *och,
				struct inode *inode, unsigned long arg)
{
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct md_op_data *op_data;
	struct ll_ioc_lease_id ioc;
	__u64 data_version_unused;
	int rc;

	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
				     LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data))
		RETURN(PTR_ERR(op_data));

	if (copy_from_user(&ioc, (struct ll_ioc_lease_id __user *)arg,
			   sizeof(ioc)))
		GOTO(out, rc = -EFAULT);

	/* before starting file resync, it's necessary to clean up the page
	 * cache in client memory, otherwise once the layout version is
	 * increased, writing back cached data will be denied by the OSTs. */
	rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
	if (rc)
		GOTO(out, rc);

	op_data->op_lease_handle = och->och_lease_handle;
	op_data->op_mirror_id = ioc.lil_mirror_id;
	rc = md_file_resync(sbi->ll_md_exp, op_data);

out:
	ll_finish_md_op_data(op_data);
	return rc;
}
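/* (Note added in this excerpt: this is reached from the file ioctl path with
 * @arg pointing at a userspace struct ll_ioc_lease_id; the usual caller is
 * the "lfs mirror resync" tool after it has taken the lease.) */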
int ll_merge_attr(const struct lu_env *env, struct inode *inode)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct cl_object *obj = lli->lli_clob;
	struct cl_attr *attr = vvp_env_thread_attr(env);
	s64 atime;
	s64 mtime;
	s64 ctime;
	int rc = 0;

	ll_inode_size_lock(inode);

	/* Merge the timestamps most recently obtained from the MDS with
	 * the timestamps obtained from the OSTs.
	 *
	 * Do not overwrite the inode's atime because it may be refreshed
	 * by file_accessed(). If the read was served from cache, no RPC is
	 * sent, so atime may not be transferred to the OSTs at all. The MDT
	 * only updates atime at close time if it's at least
	 * 'mdd.*.atime_diff' older.
	 * All in all, atime in Lustre does not strictly comply with
	 * POSIX. Solving this problem would require sending an RPC to the
	 * MDT for each read, which would hurt performance.
	 */
	if (inode->i_atime.tv_sec < lli->lli_atime ||
	    lli->lli_update_atime) {
		inode->i_atime.tv_sec = lli->lli_atime;
		lli->lli_update_atime = 0;
	}
	inode->i_mtime.tv_sec = lli->lli_mtime;
	inode->i_ctime.tv_sec = lli->lli_ctime;

	mtime = inode->i_mtime.tv_sec;
	atime = inode->i_atime.tv_sec;
	ctime = inode->i_ctime.tv_sec;

	cl_object_attr_lock(obj);
	if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
		rc = -EINVAL;
	else
		rc = cl_object_attr_get(env, obj, attr);
	cl_object_attr_unlock(obj);

	if (rc != 0)
		GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));

	if (atime < attr->cat_atime)
		atime = attr->cat_atime;

	if (ctime < attr->cat_ctime)
		ctime = attr->cat_ctime;

	if (mtime < attr->cat_mtime)
		mtime = attr->cat_mtime;

	CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
	       PFID(&lli->lli_fid), attr->cat_size);

	i_size_write(inode, attr->cat_size);
	inode->i_blocks = attr->cat_blocks;

	inode->i_mtime.tv_sec = mtime;
	inode->i_atime.tv_sec = atime;
	inode->i_ctime.tv_sec = ctime;

out_size_unlock:
	ll_inode_size_unlock(inode);

	RETURN(rc);
}

/**
 * Set designated mirror for I/O.
 *
 * So far only read, write, and truncate can issue I/O to a
 * designated mirror.
 */
void ll_io_set_mirror(struct cl_io *io, const struct file *file)
{
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);

	/* clear the layout version for generic (non-resync) I/O in case it
	 * carries a stale layout version due to I/O restart */
	io->ci_layout_version = 0;

	/* FLR: disable non-delay for designated mirror I/O because obviously
	 * only one mirror is available */
	if (fd->fd_designated_mirror > 0) {
		io->ci_ndelay = 0;
		io->ci_designated_mirror = fd->fd_designated_mirror;
		io->ci_layout_version = fd->fd_layout_version;
	}

	CDEBUG(D_VFSTRACE, "%s: designated mirror: %d\n",
	       file->f_path.dentry->d_name.name, io->ci_designated_mirror);
}

static bool file_is_noatime(const struct file *file)
{
	const struct vfsmount *mnt = file->f_path.mnt;
	const struct inode *inode = file_inode((struct file *)file);

	/* Adapted from file_accessed() and touch_atime(). */
	if (file->f_flags & O_NOATIME)
		return true;

	if (inode->i_flags & S_NOATIME)
		return true;

	if (IS_NOATIME(inode))
		return true;

	if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
		return true;

	if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
		return true;

	if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
		return true;

	return false;
}

static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
{
	struct inode *inode = file_inode(file);
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);

	io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
	io->ci_lock_no_expand = fd->ll_lock_no_expand;

	if (iot == CIT_WRITE) {
		io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
		io->u.ci_wr.wr_sync = !!(file->f_flags & O_SYNC ||
					 file->f_flags & O_DIRECT ||
					 IS_SYNC(inode));
	}
	io->ci_obj = ll_i2info(inode)->lli_clob;
	io->ci_lockreq = CILR_MAYBE;
	if (ll_file_nolock(file)) {
		io->ci_lockreq = CILR_NEVER;
		io->ci_no_srvlock = 1;
	} else if (file->f_flags & O_APPEND) {
		io->ci_lockreq = CILR_MANDATORY;
	}
	io->ci_noatime = file_is_noatime(file);

	/* FLR: only use non-delay I/O for read, as there is only one
	 * available mirror for write. */
	io->ci_ndelay = !(iot == CIT_WRITE);

	ll_io_set_mirror(io, file);
}
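/* (Summary added in this excerpt.) Account one I/O sample and its byte count
 * in the per-inode "file heat" instances, decayed over ll_heat_period_second
 * with ll_heat_decay_weight, unless file heat is disabled for the filesystem
 * or for this inode. */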
static void ll_heat_add(struct inode *inode, enum cl_io_type iot,
			ssize_t count)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	enum obd_heat_type sample_type;
	enum obd_heat_type iobyte_type;
	__u64 now = ktime_get_real_seconds();

	if (!ll_sbi_has_file_heat(sbi) ||
	    lli->lli_heat_flags & LU_HEAT_FLAG_OFF)
		return;

	if (iot == CIT_READ) {
		sample_type = OBD_HEAT_READSAMPLE;
		iobyte_type = OBD_HEAT_READBYTE;
	} else if (iot == CIT_WRITE) {
		sample_type = OBD_HEAT_WRITESAMPLE;
		iobyte_type = OBD_HEAT_WRITEBYTE;
	} else {
		return;
	}

	spin_lock(&lli->lli_heat_lock);
	obd_heat_add(&lli->lli_heat_instances[sample_type], now, 1,
		     sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
	obd_heat_add(&lli->lli_heat_instances[iobyte_type], now, count,
		     sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
	spin_unlock(&lli->lli_heat_lock);
}

static ssize_t
ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
		   struct file *file, enum cl_io_type iot,
		   loff_t *ppos, size_t count)
{
	struct vvp_io *vio = vvp_env_io(env);
	struct inode *inode = file_inode(file);
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct range_lock range;
	struct cl_io *io;
	ssize_t result = 0;
	int rc = 0;
	unsigned retried = 0;
	bool restarted = false;

	CDEBUG(D_VFSTRACE, "%s: %s ppos: %llu, count: %zu\n",
	       file_dentry(file)->d_name.name,
	       iot == CIT_READ ? "read" : "write", *ppos, count);

restart:
	io = vvp_env_thread_io(env);
	ll_io_init(io, file, iot);
	io->ci_ndelay_tried = retried;

	if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
		bool range_locked = false;

		if (file->f_flags & O_APPEND)
			range_lock_init(&range, 0, LUSTRE_EOF);
		else
			range_lock_init(&range, *ppos, *ppos + count - 1);

		vio->vui_fd = LUSTRE_FPRIVATE(file);
		vio->vui_io_subtype = args->via_io_subtype;

		switch (vio->vui_io_subtype) {
		case IO_NORMAL:
			vio->vui_iter = args->u.normal.via_iter;
			vio->vui_iocb = args->u.normal.via_iocb;
			/* Direct IO reads must also take the range lock,
			 * or multiple reads will try to work on the same
			 * pages. See LU-6227 for details. */
			if (((iot == CIT_WRITE) ||
			     (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
			    !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
				CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
				       RL_PARA(&range));
				rc = range_lock(&lli->lli_write_tree, &range);
				if (rc < 0)
					GOTO(out, rc);

				range_locked = true;
			}
			break;
		case IO_SPLICE:
			vio->u.splice.vui_pipe = args->u.splice.via_pipe;
			vio->u.splice.vui_flags = args->u.splice.via_flags;
			break;
		default:
			CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
			LBUG();
		}

		ll_cl_add(file, env, io, LCC_RW);
		rc = cl_io_loop(env, io);
		ll_cl_remove(file, env);

		if (range_locked) {
			CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
			       RL_PARA(&range));
			range_unlock(&lli->lli_write_tree, &range);
		}
	} else {
		/* cl_io_rw_init() handled IO */
		rc = io->ci_result;
	}

	if (io->ci_nob > 0) {
		result += io->ci_nob;
		count -= io->ci_nob;
		*ppos = io->u.ci_wr.wr.crw_pos; /* for splice */

		/* prepare IO restart */
		if (count > 0 && args->via_io_subtype == IO_NORMAL)
			args->u.normal.via_iter = vio->vui_iter;
	}
out:
	cl_io_fini(env, io);

	CDEBUG(D_VFSTRACE,
	       "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
	       file->f_path.dentry->d_name.name,
	       iot, rc, result, io->ci_need_restart);

	if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
		CDEBUG(D_VFSTRACE,
		       "%s: restart %s from %lld, count: %zu, ret: %zd, rc: %d\n",
		       file_dentry(file)->d_name.name,
		       iot == CIT_READ ? "read" : "write",
		       *ppos, count, result, rc);
		/* preserve the tried count for FLR */
		retried = io->ci_ndelay_tried;
		restarted = true;
		goto restart;
	}

	if (iot == CIT_READ) {
		if (result > 0)
			ll_stats_ops_tally(ll_i2sbi(inode),
					   LPROC_LL_READ_BYTES, result);
	} else if (iot == CIT_WRITE) {
		if (result > 0) {
			ll_stats_ops_tally(ll_i2sbi(inode),
					   LPROC_LL_WRITE_BYTES, result);
			fd->fd_write_failed = false;
		} else if (result == 0 && rc == 0) {
			rc = io->ci_result;
			if (rc < 0)
				fd->fd_write_failed = true;
			else
				fd->fd_write_failed = false;
		} else if (rc != -ERESTARTSYS) {
			fd->fd_write_failed = true;
		}
	}

	CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
	if (result > 0)
		ll_heat_add(inode, iot, result);

	RETURN(result > 0 ? result : rc);
}

/**
 * The purpose of fast read is to overcome the per-I/O overhead and improve
 * IOPS, especially for small I/O.
 *
 * To serve a read request, CLIO has to create and initialize a cl_io and
 * then request a DLM lock. This has turned out to have significant overhead
 * and affects the performance of small I/O dramatically.
 *
 * It's not necessary to create a cl_io for each I/O. With the help of read
 * ahead, most of the pages being read are already in the memory cache and we
 * can read those pages directly: if the pages exist, the corresponding DLM
 * lock must exist too, so the page content is valid.
 *
 * In the fast read implementation, llite speculatively finds and reads pages
 * in the memory cache. There are three scenarios for fast read:
 *   - If the page exists and is uptodate, the kernel VM will provide the data
 *     and CLIO won't be intervened;
 *   - If the page was brought into memory by read ahead, it will be exported
 *     and the read ahead parameters will be updated;
 *   - Otherwise the page is not in memory and we can't do fast read.
 *     Therefore, it will go back and invoke normal read, i.e., a cl_io will
 *     be created and a DLM lock will be requested.
 *
 * POSIX compliance: the POSIX standard states that read is intended to be
 * atomic. The Lustre read implementation is in line with the Linux kernel
 * read implementation and neither of them complies with the POSIX standard
 * in this matter. Fast read doesn't make the situation worse on a single
 * node but it may interleave write results from multiple nodes due to the
 * short read handling in ll_file_aio_read().
 *
 * \param iocb - kiocb from kernel
 * \param iter - user space buffers where the data will be copied
 *
 * \retval - number of bytes read, or error code on failure.
 */
static ssize_t
ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
{
	ssize_t result;

	if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
		return 0;

	/* NB: we can't do direct IO for fast read because it will need a lock
	 * to make the IO engine happy. */
	if (iocb->ki_filp->f_flags & O_DIRECT)
		return 0;

	result = generic_file_read_iter(iocb, iter);

	/* If the first page is not in cache, generic_file_read_iter() will
	 * return -ENODATA.
	 * See corresponding code in ll_readpage(). */
	if (result == -ENODATA)
		result = 0;

	if (result > 0) {
		ll_heat_add(file_inode(iocb->ki_filp), CIT_READ, result);
		ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
				   LPROC_LL_READ_BYTES, result);
	}

	return result;
}

/*
 * Read from a file (through the page cache).
 */
static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct lu_env *env;
	struct vvp_io_args *args;
	ssize_t result;
	ssize_t rc2;
	__u16 refcheck;

	ll_ras_enter(iocb->ki_filp);

	result = ll_do_fast_read(iocb, to);
	if (result < 0 || iov_iter_count(to) == 0)
		GOTO(out, result);

	env = cl_env_get(&refcheck);
	if (IS_ERR(env))
		return PTR_ERR(env);

	args = ll_env_args(env, IO_NORMAL);
	args->u.normal.via_iter = to;
	args->u.normal.via_iocb = iocb;

	rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
				 &iocb->ki_pos, iov_iter_count(to));
	if (rc2 > 0)
		result += rc2;
	else if (result == 0)
		result = rc2;

	cl_env_put(env, &refcheck);
out:
	return result;
}

/**
 * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
 * If a page is already in the page cache and dirty (and some other things -
 * see ll_tiny_write_begin for the instantiation of these rules), then we can
 * write to it without doing a full I/O, because Lustre already knows about it
 * and will write it out. This saves a lot of processing time.
 *
 * All writes here are within one page, so exclusion is handled by the page
 * lock on the vm page. We do not do tiny writes for writes which touch
 * multiple pages because it's very unlikely multiple sequential pages are
 * already dirty.
 *
 * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
 * and are unlikely to be to already dirty pages.
 *
 * Attribute updates are important here, we do them in ll_tiny_write_end.
 */
static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
{
	ssize_t count = iov_iter_count(iter);
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	bool lock_inode = !IS_NOSEC(inode);
	ssize_t result = 0;

	/* Restrict writes to single page and < PAGE_SIZE. See comment at top
	 * of function for why.
	 */
	if (count >= PAGE_SIZE ||
	    (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
		return 0;

	if (unlikely(lock_inode))
		inode_lock(inode);
	result = __generic_file_write_iter(iocb, iter);

	if (unlikely(lock_inode))
		inode_unlock(inode);

	/* If the page is not already dirty, ll_tiny_write_begin returns
	 * -ENODATA. We continue on to normal write.
	 */
	if (result == -ENODATA)
		result = 0;

	if (result > 0) {
		ll_heat_add(inode, CIT_WRITE, result);
		ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
				   result);
		ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
	}

	CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);

	return result;
}

/*
 * Write to a file (through the page cache).
 */
static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct vvp_io_args *args;
	struct lu_env *env;
	ssize_t rc_tiny = 0, rc_normal;
	__u16 refcheck;

	/* NB: we can't do direct IO for tiny writes because they use the page
	 * cache, we can't do sync writes because tiny writes can't flush
	 * pages, and we can't do append writes because we can't guarantee the
	 * required DLM locks are held to protect file size.
	 */
	if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(iocb->ki_filp))) &&
	    !(iocb->ki_filp->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
		rc_tiny = ll_do_tiny_write(iocb, from);

	/* In case of error, go on and try the normal write - only stop if the
	 * tiny write completed the I/O.
	 */
	if (iov_iter_count(from) == 0)
		GOTO(out, rc_normal = rc_tiny);

	env = cl_env_get(&refcheck);
	if (IS_ERR(env))
		return PTR_ERR(env);

	args = ll_env_args(env, IO_NORMAL);
	args->u.normal.via_iter = from;
	args->u.normal.via_iocb = iocb;

	rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
				       &iocb->ki_pos, iov_iter_count(from));

	/* On success, combine bytes written. */
	if (rc_tiny >= 0 && rc_normal > 0)
		rc_normal += rc_tiny;
	/* On error, only return error from normal write if tiny write did not
	 * write any bytes. Otherwise return bytes written by tiny write.
	 */
	else if (rc_tiny > 0)
		rc_normal = rc_tiny;

	cl_env_put(env, &refcheck);
out:
	RETURN(rc_normal);
}

#ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
/*
 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
 */
static int ll_file_get_iov_count(const struct iovec *iov,
				 unsigned long *nr_segs, size_t *count)
{
	size_t cnt = 0;
	unsigned long seg;

	for (seg = 0; seg < *nr_segs; seg++) {
		const struct iovec *iv = &iov[seg];

		/*
		 * If any segment has a negative length, or the cumulative
		 * length ever wraps negative then return -EINVAL.
		 */
		cnt += iv->iov_len;
		if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
			return -EINVAL;
		if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
			continue;
		if (seg == 0)
			return -EFAULT;
		*nr_segs = seg;
		cnt -= iv->iov_len;	/* This segment is no good */
		break;
	}
	*count = cnt;
	return 0;
}

static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
				unsigned long nr_segs, loff_t pos)
{
	struct iov_iter to;
	size_t iov_count;
	ssize_t result;

	result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
	if (result)
		RETURN(result);

# ifdef HAVE_IOV_ITER_INIT_DIRECTION
	iov_iter_init(&to, READ, iov, nr_segs, iov_count);
# else /* !HAVE_IOV_ITER_INIT_DIRECTION */
	iov_iter_init(&to, iov, nr_segs, iov_count, 0);
# endif /* HAVE_IOV_ITER_INIT_DIRECTION */

	result = ll_file_read_iter(iocb, &to);

	RETURN(result);
}

static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
			    loff_t *ppos)
{
	struct iovec iov = { .iov_base = buf, .iov_len = count };
	struct kiocb kiocb;
	ssize_t result;

	init_sync_kiocb(&kiocb, file);
	kiocb.ki_pos = *ppos;
#ifdef HAVE_KIOCB_KI_LEFT
	kiocb.ki_left = count;
#elif defined(HAVE_KI_NBYTES)
	kiocb.ki_nbytes = count;
#endif

	result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
	*ppos = kiocb.ki_pos;

	RETURN(result);
}

/*
 * Write to a file (through the page cache).
 */
static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
				 unsigned long nr_segs, loff_t pos)
{
	struct iov_iter from;
	size_t iov_count;
	ssize_t result;

	result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
	if (result)
		RETURN(result);

# ifdef HAVE_IOV_ITER_INIT_DIRECTION
	iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
# else /* !HAVE_IOV_ITER_INIT_DIRECTION */
	iov_iter_init(&from, iov, nr_segs, iov_count, 0);
# endif /* HAVE_IOV_ITER_INIT_DIRECTION */

	result = ll_file_write_iter(iocb, &from);

	RETURN(result);
}

static ssize_t ll_file_write(struct file *file, const char __user *buf,
			     size_t count, loff_t *ppos)
{
	struct iovec iov = { .iov_base = (void __user *)buf,
			     .iov_len = count };
	struct kiocb kiocb;
	ssize_t result;

	init_sync_kiocb(&kiocb, file);
	kiocb.ki_pos = *ppos;
#ifdef HAVE_KIOCB_KI_LEFT
	kiocb.ki_left = count;
#elif defined(HAVE_KI_NBYTES)
	kiocb.ki_nbytes = count;
#endif

	result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
	*ppos = kiocb.ki_pos;

	RETURN(result);
}
#endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */

/**
 * Send file content (through pagecache) somewhere with helper
 */
static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
				   struct pipe_inode_info *pipe, size_t count,
				   unsigned int flags)
{
	struct lu_env *env;
	struct vvp_io_args *args;
	ssize_t result;
	__u16 refcheck;

	ll_ras_enter(in_file);

	env = cl_env_get(&refcheck);
	if (IS_ERR(env))
		RETURN(PTR_ERR(env));

	args = ll_env_args(env, IO_SPLICE);
	args->u.splice.via_pipe = pipe;
	args->u.splice.via_flags = flags;

	result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
	cl_env_put(env, &refcheck);

	RETURN(result);
}
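/* (Summary added in this excerpt.) Set the striping/layout EA for a file by
 * packing @lum into an MDS open intent; the resulting open handle is
 * released immediately, since only the layout side effect is wanted. */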
int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
			     __u64 flags, struct lov_user_md *lum, int lum_size)
{
	struct lookup_intent oit = {
		.it_op = IT_OPEN,
		.it_flags = flags | MDS_OPEN_BY_FID,
	};
	int rc;

	ll_inode_size_lock(inode);
	rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
	if (rc < 0)
		GOTO(out_unlock, rc);

	ll_release_openhandle(dentry, &oit);

out_unlock:
	ll_inode_size_unlock(inode);
	ll_intent_release(&oit);

	RETURN(rc);
}

int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
			     struct lov_mds_md **lmmp, int *lmm_size,
			     struct ptlrpc_request **request)
{
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct mdt_body *body;
	struct lov_mds_md *lmm = NULL;
	struct ptlrpc_request *req = NULL;
	struct md_op_data *op_data;
	int lmmsize;
	int rc;

	rc = ll_get_default_mdsize(sbi, &lmmsize);
	if (rc)
		RETURN(rc);

	op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
				     strlen(filename), lmmsize,
				     LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data))
		RETURN(PTR_ERR(op_data));

	op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
	rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
	ll_finish_md_op_data(op_data);
	if (rc < 0) {
		CDEBUG(D_INFO, "md_getattr_name failed on %s: rc %d\n",
		       filename, rc);
		GOTO(out, rc);
	}

	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
	LASSERT(body != NULL); /* checked by mdc_getattr_name */

	lmmsize = body->mbo_eadatasize;

	if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
	    lmmsize == 0)
		GOTO(out, rc = -ENODATA);

	lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
	LASSERT(lmm != NULL);

	if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
	    lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
	    lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1) &&
	    lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_FOREIGN))
		GOTO(out, rc = -EPROTO);

	/*
	 * This is coming from the MDS, so it is probably in
	 * little endian. We convert it to host endian before
	 * passing it to userspace.
	 */
	if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
		int stripe_count = 0;

		if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
		    lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
			stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
			if (le32_to_cpu(lmm->lmm_pattern) &
			    LOV_PATTERN_F_RELEASED)
				stripe_count = 0;
		}

		/* if this function is called for a directory, we should
		 * avoid swabbing non-existent lsm objects */
		if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
			lustre_swab_lov_user_md_v1(
					(struct lov_user_md_v1 *)lmm);
			if (S_ISREG(body->mbo_mode))
				lustre_swab_lov_user_md_objects(
				    ((struct lov_user_md_v1 *)lmm)->lmm_objects,
				    stripe_count);
		} else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
			lustre_swab_lov_user_md_v3(
					(struct lov_user_md_v3 *)lmm);
			if (S_ISREG(body->mbo_mode))
				lustre_swab_lov_user_md_objects(
				    ((struct lov_user_md_v3 *)lmm)->lmm_objects,
				    stripe_count);
		} else if (lmm->lmm_magic ==
			   cpu_to_le32(LOV_MAGIC_COMP_V1)) {
			lustre_swab_lov_comp_md_v1(
					(struct lov_comp_md_v1 *)lmm);
		} else if (lmm->lmm_magic ==
			   cpu_to_le32(LOV_MAGIC_FOREIGN)) {
			struct lov_foreign_md *lfm;

			lfm = (struct lov_foreign_md *)lmm;
			__swab32s(&lfm->lfm_magic);
			__swab32s(&lfm->lfm_length);
			__swab32s(&lfm->lfm_type);
			__swab32s(&lfm->lfm_flags);
		}
	}

out:
	*lmmp = lmm;
	*lmm_size = lmmsize;
	*request = req;
	return rc;
}

static int ll_lov_setea(struct inode *inode, struct file *file,
			void __user *arg)
{
	__u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
	struct lov_user_md *lump;
	int lum_size = sizeof(struct lov_user_md) +
		       sizeof(struct lov_user_ost_data);
	int rc;

	if (!cfs_capable(CFS_CAP_SYS_ADMIN))
		RETURN(-EPERM);

	OBD_ALLOC_LARGE(lump, lum_size);
	if (lump == NULL)
		RETURN(-ENOMEM);

	if (copy_from_user(lump, arg, lum_size))
		GOTO(out_lump, rc = -EFAULT);

	rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
				      lum_size);
	cl_lov_delay_create_clear(&file->f_flags);

out_lump:
	OBD_FREE_LARGE(lump, lum_size);
	RETURN(rc);
}

static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
{
	struct lu_env *env;
	__u16 refcheck;
	int rc;

	env = cl_env_get(&refcheck);
	if (IS_ERR(env))
		RETURN(PTR_ERR(env));

	rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
	cl_env_put(env, &refcheck);
	RETURN(rc);
}

static int ll_lov_setstripe(struct inode *inode, struct file *file,
			    void __user *arg)
{
	struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
	struct lov_user_md *klum;
	int lum_size, rc;
	__u64 flags = FMODE_WRITE;

	rc = ll_copy_user_md(lum, &klum);
	if (rc < 0)
		RETURN(rc);

	lum_size = rc;
	rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
				      lum_size);
	if (!rc) {
		__u32 gen;

		rc = put_user(0, &lum->lmm_stripe_count);
		if (rc)
			GOTO(out, rc);

		rc = ll_layout_refresh(inode, &gen);
		if (rc)
			GOTO(out, rc);

		rc = ll_file_getstripe(inode, arg, lum_size);
	}
	cl_lov_delay_create_clear(&file->f_flags);

out:
	OBD_FREE(klum, lum_size);
	RETURN(rc);
}

static int
ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct cl_object *obj = lli->lli_clob;
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct ll_grouplock grouplock;
	int rc;

	if (arg == 0) {
		CWARN("group id for group lock must not be 0\n");
		RETURN(-EINVAL);
	}

	if (ll_file_nolock(file))
		RETURN(-EOPNOTSUPP);

	spin_lock(&lli->lli_lock);
	if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
		CWARN("group lock already exists with gid %lu\n",
		      fd->fd_grouplock.lg_gid);
		spin_unlock(&lli->lli_lock);
		RETURN(-EINVAL);
	}
	LASSERT(fd->fd_grouplock.lg_lock == NULL);
	spin_unlock(&lli->lli_lock);

	/**
	 * XXX: the group lock needs to protect all OST objects while PFL
	 * can add new OST objects during the IO, so we'd instantiate
	 * all OST objects before getting its group lock.
	 */
	if (obj) {
		struct lu_env *env;
		__u16 refcheck;
		struct cl_layout cl = {
			.cl_is_composite = false,
		};
		struct lu_extent ext = {
			.e_start = 0,
			.e_end = OBD_OBJECT_EOF,
		};

		env = cl_env_get(&refcheck);
		if (IS_ERR(env))
			RETURN(PTR_ERR(env));

		rc = cl_object_layout_get(env, obj, &cl);
		if (!rc && cl.cl_is_composite)
			rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
						    &ext);

		cl_env_put(env, &refcheck);
		if (rc)
			RETURN(rc);
	}

	rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
			      arg, (file->f_flags & O_NONBLOCK), &grouplock);
	if (rc)
		RETURN(rc);

	spin_lock(&lli->lli_lock);
	if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
		spin_unlock(&lli->lli_lock);
		CERROR("another thread just won the race\n");
		cl_put_grouplock(&grouplock);
		RETURN(-EINVAL);
	}

	fd->fd_flags |= LL_FILE_GROUP_LOCKED;
	fd->fd_grouplock = grouplock;
	spin_unlock(&lli->lli_lock);

	CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
	RETURN(0);
}

static int ll_put_grouplock(struct inode *inode, struct file *file,
			    unsigned long arg)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct ll_grouplock grouplock;

	spin_lock(&lli->lli_lock);
	if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
		spin_unlock(&lli->lli_lock);
		CWARN("no group lock held\n");
		RETURN(-EINVAL);
	}

	LASSERT(fd->fd_grouplock.lg_lock != NULL);

	if (fd->fd_grouplock.lg_gid != arg) {
		CWARN("group lock %lu doesn't match current id %lu\n",
		      arg, fd->fd_grouplock.lg_gid);
		spin_unlock(&lli->lli_lock);
		RETURN(-EINVAL);
	}

	grouplock = fd->fd_grouplock;
	memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
	fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
	spin_unlock(&lli->lli_lock);

	cl_put_grouplock(&grouplock);
	CDEBUG(D_INFO, "group lock %lu released\n", arg);
	RETURN(0);
}
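/* (Illustrative note added in this excerpt, assuming the standard Lustre
 * user API: applications reach ll_get_grouplock()/ll_put_grouplock() via
 * the group-lock ioctls, roughly:
 *
 *	int gid = 1234;                        // cooperating group id
 *	ioctl(fd, LL_IOC_GROUP_LOCK, gid);     // take the group lock
 *	... cooperating processes do their I/O ...
 *	ioctl(fd, LL_IOC_GROUP_UNLOCK, gid);   // drop it
 *
 * All holders must pass the same non-zero gid, matching the checks above.) */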
2232 * Close inode open handle
2234 * \param dentry [in] dentry which contains the inode
2235 * \param it [in,out] intent which contains open info and result
2238 * \retval <0 failure
2240 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2242 struct inode *inode = dentry->d_inode;
2243 struct obd_client_handle *och;
2249 /* Root ? Do nothing. */
2250 if (dentry->d_inode->i_sb->s_root == dentry)
2253 /* No open handle to close? Move away */
2254 if (!it_disposition(it, DISP_OPEN_OPEN))
2257 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2259 OBD_ALLOC(och, sizeof(*och));
2261 GOTO(out, rc = -ENOMEM);
2263 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2265 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2267 /* this one is in place of ll_file_open */
2268 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2269 ptlrpc_req_finished(it->it_request);
2270 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2276 * Get the size of the inode for which the FIEMAP mapping is requested.
2277 * Make the FIEMAP get_info call and return the result.
2278 * \param fiemap    kernel buffer to hold the extents
2279 * \param num_bytes kernel buffer size
2281 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2287 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2290 /* Checks for fiemap flags */
2291 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2292 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2296 /* Check for FIEMAP_FLAG_SYNC */
2297 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2298 rc = filemap_fdatawrite(inode->i_mapping);
2303 env = cl_env_get(&refcheck);
2305 RETURN(PTR_ERR(env));
2307 if (i_size_read(inode) == 0) {
2308 rc = ll_glimpse_size(inode);
2313 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2314 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2315 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2317 /* If the file size is 0, then there are no objects to map */
2318 if (fmkey.lfik_oa.o_size == 0) {
2319 fiemap->fm_mapped_extents = 0;
2323 fmkey.lfik_fiemap = *fiemap;
2325 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2326 &fmkey, fiemap, &num_bytes);
2328 cl_env_put(env, &refcheck);
2332 int ll_fid2path(struct inode *inode, void __user *arg)
2334 struct obd_export *exp = ll_i2mdexp(inode);
2335 const struct getinfo_fid2path __user *gfin = arg;
2337 struct getinfo_fid2path *gfout;
2343 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2344 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2347 /* Only need to get the buflen */
2348 if (get_user(pathlen, &gfin->gf_pathlen))
2351 if (pathlen > PATH_MAX)
2354 outsize = sizeof(*gfout) + pathlen;
2355 OBD_ALLOC(gfout, outsize);
2359 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2360 GOTO(gf_free, rc = -EFAULT);
2361 /* Append the root FID after gfout so that the MDT knows the root FID and
2362  * can look up the correct path; this is mainly for fileset support.
2363  * An old server without fileset mount support will ignore this. */
2364 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2366 /* Call mdc_iocontrol */
2367 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2371 if (copy_to_user(arg, gfout, outsize))
2375 OBD_FREE(gfout, outsize);
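/*
 * Illustrative use of the OBD_IOC_FID2PATH ioctl served above (a sketch;
 * userspace normally goes through llapi_fid2path() instead):
 *
 *	struct getinfo_fid2path *gf;
 *	size_t len = sizeof(*gf) + PATH_MAX;
 *
 *	gf = calloc(1, len);		/* gf_recno/gf_linkno start at 0 */
 *	gf->gf_fid = fid;		/* FID to resolve */
 *	gf->gf_pathlen = PATH_MAX;	/* buffer space after the header */
 *	if (ioctl(fd, OBD_IOC_FID2PATH, gf) == 0)
 *		printf("%s\n", gf->gf_u.gf_path);
 */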
2380 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2382 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2390 ioc->idv_version = 0;
2391 ioc->idv_layout_version = UINT_MAX;
2393 /* If no file object has been initialized, we consider its version to be 0. */
2397 env = cl_env_get(&refcheck);
2399 RETURN(PTR_ERR(env));
2401 io = vvp_env_thread_io(env);
2403 io->u.ci_data_version.dv_data_version = 0;
2404 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2405 io->u.ci_data_version.dv_flags = ioc->idv_flags;
2408 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2409 result = cl_io_loop(env, io);
2411 result = io->ci_result;
2413 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2414 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2416 cl_io_fini(env, io);
2418 if (unlikely(io->ci_need_restart))
2421 cl_env_put(env, &refcheck);
2427 * Read the data_version for the inode.
2429 * This value is computed using the stripe object versions on the OSTs.
2430 * The version is computed using server-side locking.
2432 * @param flags whether to sync on the OST side;
2434 *              LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2435 *              LL_DV_WR_FLUSH: drop all cached pages, LCK_PW on OSTs
2437 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2439 struct ioc_data_version ioc = { .idv_flags = flags };
2442 rc = ll_ioc_data_version(inode, &ioc);
2444 *data_version = ioc.idv_version;
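/*
 * Example of an in-kernel caller (a sketch): grab a stable version of the
 * file data, flushing dirty pages first so the OSTs see them:
 *
 *	__u64 dv = 0;
 *	int rc = ll_data_version(inode, &dv, LL_DV_RD_FLUSH);
 *	if (rc == 0)
 *		CDEBUG(D_INODE, "data version %llu\n", dv);
 *
 * ll_hsm_release() below uses LL_DV_WR_FLUSH so that cached pages are
 * dropped and a later modification is reflected in a new version.
 */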
2450 * Trigger a HSM release request for the provided inode.
2452 int ll_hsm_release(struct inode *inode)
2455 struct obd_client_handle *och = NULL;
2456 __u64 data_version = 0;
2461 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2462 ll_i2sbi(inode)->ll_fsname,
2463 PFID(&ll_i2info(inode)->lli_fid));
2465 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2467 GOTO(out, rc = PTR_ERR(och));
2469 /* Grab latest data_version and [am]time values */
2470 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2474 env = cl_env_get(&refcheck);
2476 GOTO(out, rc = PTR_ERR(env));
2478 rc = ll_merge_attr(env, inode);
2479 cl_env_put(env, &refcheck);
2481 /* If an error occurs, we have the wrong size for the file.
2487 /* Release the file.
2488  * NB: the lease lock handle is released in mdc_hsm_release_pack() because
2489  * we still need it to pack l_remote_handle for the MDT. */
2490 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2496 if (och != NULL && !IS_ERR(och)) /* close the file */
2497 ll_lease_close(och, inode, NULL);
2502 struct ll_swap_stack {
2505 struct inode *inode1;
2506 struct inode *inode2;
2511 static int ll_swap_layouts(struct file *file1, struct file *file2,
2512 struct lustre_swap_layouts *lsl)
2514 struct mdc_swap_layouts msl;
2515 struct md_op_data *op_data;
2518 struct ll_swap_stack *llss = NULL;
2521 OBD_ALLOC_PTR(llss);
2525 llss->inode1 = file_inode(file1);
2526 llss->inode2 = file_inode(file2);
2528 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2532 /* we use two bools because they are easier to swap than two bits */
2533 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2534 llss->check_dv1 = true;
2536 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2537 llss->check_dv2 = true;
2539 /* we cannot use lsl->sl_dvX directly because we may swap them */
2540 llss->dv1 = lsl->sl_dv1;
2541 llss->dv2 = lsl->sl_dv2;
2543 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2544 if (rc == 0) /* same file, done! */
2547 if (rc < 0) { /* sequentialize it */
2548 swap(llss->inode1, llss->inode2);
2550 swap(llss->dv1, llss->dv2);
2551 swap(llss->check_dv1, llss->check_dv2);
2555 if (gid != 0) { /* application asks to flush dirty cache */
2556 rc = ll_get_grouplock(llss->inode1, file1, gid);
2560 rc = ll_get_grouplock(llss->inode2, file2, gid);
2562 ll_put_grouplock(llss->inode1, file1, gid);
2567 /* final check: before swapping the layouts, verify that the
2568  * data version has not changed (if requested) */
2569 if (llss->check_dv1) {
2570 rc = ll_data_version(llss->inode1, &dv, 0);
2573 if (dv != llss->dv1)
2574 GOTO(putgl, rc = -EAGAIN);
2577 if (llss->check_dv2) {
2578 rc = ll_data_version(llss->inode2, &dv, 0);
2581 if (dv != llss->dv2)
2582 GOTO(putgl, rc = -EAGAIN);
2585 /* struct md_op_data is used to send the swap args to the MDT;
2586  * only the flags are missing, so we pass struct mdc_swap_layouts
2587  * through md_op_data->op_data */
2588 /* flags from user space have to be converted before they are sent to
2589  * the server; no flag is sent today, they are only used on the client */
2592 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2593 0, LUSTRE_OPC_ANY, &msl);
2594 if (IS_ERR(op_data))
2595 GOTO(free, rc = PTR_ERR(op_data));
2597 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2598 sizeof(*op_data), op_data, NULL);
2599 ll_finish_md_op_data(op_data);
2606 ll_put_grouplock(llss->inode2, file2, gid);
2607 ll_put_grouplock(llss->inode1, file1, gid);
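/*
 * Illustrative userspace trigger for the layout swap above (a sketch;
 * llapi_fswap_layouts() wraps this). Both files must be open for write,
 * and a non-zero sl_gid requests dirty-cache flushing via group locks:
 *
 *	struct lustre_swap_layouts lsl = {
 *		.sl_fd    = fd2,
 *		.sl_flags = SWAP_LAYOUTS_CHECK_DV1,
 *		.sl_gid   = 4711,
 *		.sl_dv1   = dv1,	/* expected data version of file 1 */
 *	};
 *
 *	ioctl(fd1, LL_IOC_LOV_SWAP_LAYOUTS, &lsl);
 */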
2617 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2619 struct obd_export *exp = ll_i2mdexp(inode);
2620 struct md_op_data *op_data;
2624 /* Detect out-of-range masks */
2625 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2628 /* Non-root users may not set or clear flags that are
2629  * NOT defined in HSM_USER_MASK. */
2630 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2631 !cfs_capable(CFS_CAP_SYS_ADMIN))
2634 if (!exp_connect_archive_id_array(exp)) {
2635 /* Detect an out-of-range archive id */
2636 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2637 (hss->hss_archive_id > LL_HSM_ORIGIN_MAX_ARCHIVE))
2641 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2642 LUSTRE_OPC_ANY, hss);
2643 if (IS_ERR(op_data))
2644 RETURN(PTR_ERR(op_data));
2646 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, exp, sizeof(*op_data),
2649 ll_finish_md_op_data(op_data);
2654 static int ll_hsm_import(struct inode *inode, struct file *file,
2655 struct hsm_user_import *hui)
2657 struct hsm_state_set *hss = NULL;
2658 struct iattr *attr = NULL;
2662 if (!S_ISREG(inode->i_mode))
2668 GOTO(out, rc = -ENOMEM);
2670 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2671 hss->hss_archive_id = hui->hui_archive_id;
2672 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2673 rc = ll_hsm_state_set(inode, hss);
2677 OBD_ALLOC_PTR(attr);
2679 GOTO(out, rc = -ENOMEM);
2681 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2682 attr->ia_mode |= S_IFREG;
2683 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2684 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2685 attr->ia_size = hui->hui_size;
2686 attr->ia_mtime.tv_sec = hui->hui_mtime;
2687 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2688 attr->ia_atime.tv_sec = hui->hui_atime;
2689 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2691 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2692 ATTR_UID | ATTR_GID |
2693 ATTR_MTIME | ATTR_MTIME_SET |
2694 ATTR_ATIME | ATTR_ATIME_SET;
2698 rc = ll_setattr_raw(file_dentry(file), attr, 0, true);
2702 inode_unlock(inode);
2714 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2716 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2717 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
2720 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2722 struct inode *inode = file_inode(file);
2724 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2725 ATTR_MTIME | ATTR_MTIME_SET |
2728 .tv_sec = lfu->lfu_atime_sec,
2729 .tv_nsec = lfu->lfu_atime_nsec,
2732 .tv_sec = lfu->lfu_mtime_sec,
2733 .tv_nsec = lfu->lfu_mtime_nsec,
2736 .tv_sec = lfu->lfu_ctime_sec,
2737 .tv_nsec = lfu->lfu_ctime_nsec,
2743 if (!capable(CAP_SYS_ADMIN))
2746 if (!S_ISREG(inode->i_mode))
2750 rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET,
2752 inode_unlock(inode);
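/*
 * Illustrative userspace usage (a sketch): LL_IOC_FUTIMES_3 is like
 * futimes() but also sets ctime, which is why CAP_SYS_ADMIN is required:
 *
 *	struct ll_futimes_3 lfu = {
 *		.lfu_atime_sec = st.st_atime,
 *		.lfu_mtime_sec = st.st_mtime,
 *		.lfu_ctime_sec = st.st_ctime,
 *	};
 *
 *	ioctl(fd, LL_IOC_FUTIMES_3, &lfu);
 */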
2757 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2760 case MODE_READ_USER:
2762 case MODE_WRITE_USER:
2769 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2771 /* Used to allow the upper layers of the client to request an LDLM lock
2772 * without doing an actual read or write.
2774 * Used for ladvise lockahead to manually request specific locks.
2776 * \param[in] file file this ladvise lock request is on
2777 * \param[in] ladvise ladvise struct describing this lock request
2779 * \retval 0 success, no detailed result available (sync requests
2780 * and requests sent to the server [not handled locally]
2781 * cannot return detailed results)
2782 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2783 * see definitions for details.
2784 * \retval negative negative errno on error
2786 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2788 struct lu_env *env = NULL;
2789 struct cl_io *io = NULL;
2790 struct cl_lock *lock = NULL;
2791 struct cl_lock_descr *descr = NULL;
2792 struct dentry *dentry = file->f_path.dentry;
2793 struct inode *inode = dentry->d_inode;
2794 enum cl_lock_mode cl_mode;
2795 off_t start = ladvise->lla_start;
2796 off_t end = ladvise->lla_end;
2802 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2803 "start=%llu, end=%llu\n", dentry->d_name.len,
2804 dentry->d_name.name, dentry->d_inode,
2805 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2808 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2810 GOTO(out, result = cl_mode);
2812 /* Get IO environment */
2813 result = cl_io_get(inode, &env, &io, &refcheck);
2817 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2820 * nothing to do for this io. This currently happens when
2821 * stripe sub-objects are not yet created.
2823 result = io->ci_result;
2824 } else if (result == 0) {
2825 lock = vvp_env_lock(env);
2826 descr = &lock->cll_descr;
2828 descr->cld_obj = io->ci_obj;
2829 /* Convert byte offsets to pages */
2830 descr->cld_start = cl_index(io->ci_obj, start);
2831 descr->cld_end = cl_index(io->ci_obj, end);
2832 descr->cld_mode = cl_mode;
2833 /* CEF_MUST is used because we do not want to convert a
2834 * lockahead request to a lockless lock */
2835 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2838 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2839 descr->cld_enq_flags |= CEF_SPECULATIVE;
2841 result = cl_lock_request(env, io, lock);
2843 /* On success, we need to release the lock */
2845 cl_lock_release(env, lock);
2847 cl_io_fini(env, io);
2848 cl_env_put(env, &refcheck);
2850 /* -ECANCELED indicates a matching lock with a different extent
2851 * was already present, and -EEXIST indicates a matching lock
2852 * on exactly the same extent was already present.
2853 * We convert them to positive values for userspace to make
2854 * recognizing true errors easier.
2855 * Note we can only return these detailed results on async requests,
2856 * as sync requests look the same as i/o requests for locking. */
2857 if (result == -ECANCELED)
2858 result = LLA_RESULT_DIFFERENT;
2859 else if (result == -EEXIST)
2860 result = LLA_RESULT_SAME;
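/*
 * A sketch of how userspace interprets the per-advice result written back
 * into lla_lockahead_result by the LL_IOC_LADVISE handler further below:
 *
 *	if (result == LLA_RESULT_SAME)
 *		;	/* identical lock was already present */
 *	else if (result == LLA_RESULT_DIFFERENT)
 *		;	/* overlapping lock with a different extent existed */
 *	else if (result < 0)
 *		;	/* a true error (negative errno) */
 */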
2865 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
2867 static int ll_ladvise_sanity(struct inode *inode,
2868 struct llapi_lu_ladvise *ladvise)
2870 struct ll_sb_info *sbi = ll_i2sbi(inode);
2871 enum lu_ladvise_type advice = ladvise->lla_advice;
2872 /* Note that the per-advice flags field is 32 bits wide, so per-advice
2873  * flags must be in the first 32 bits of enum ladvise_flags */
2874 __u32 flags = ladvise->lla_peradvice_flags;
2875 /* 3 lines at 80 characters per line, should be plenty */
2878 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2880 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized, "
2881        "last supported advice is %s (value '%d'): rc = %d\n",
2882 sbi->ll_fsname, advice,
2883 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2887 /* Per-advice checks */
2889 case LU_LADVISE_LOCKNOEXPAND:
2890 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2892 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2893 "rc = %d\n", sbi->ll_fsname, flags,
2894 ladvise_names[advice], rc);
2898 case LU_LADVISE_LOCKAHEAD:
2899 /* Currently only READ and WRITE modes can be requested */
2900 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2901 ladvise->lla_lockahead_mode == 0) {
2903 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2904 "rc = %d\n", sbi->ll_fsname,
2905 ladvise->lla_lockahead_mode,
2906 ladvise_names[advice], rc);
2909 case LU_LADVISE_WILLREAD:
2910 case LU_LADVISE_DONTNEED:
2912 /* Note the fall-through above - these checks apply to all advice
2913  * types except LOCKNOEXPAND */
2914 if (flags & ~LF_DEFAULT_MASK) {
2916 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2917 "rc = %d\n", sbi->ll_fsname, flags,
2918 ladvise_names[advice], rc);
2921 if (ladvise->lla_start >= ladvise->lla_end) {
2923 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2924 "for %s: rc = %d\n", sbi->ll_fsname,
2925 ladvise->lla_start, ladvise->lla_end,
2926 ladvise_names[advice], rc);
2938 * Give file access advice
2940 * The ladvise interface is similar to the Linux fadvise() system call,
2941 * except it forwards the advice directly from the Lustre client to the
2942 * server. The server-side code will apply appropriate read-ahead and
2943 * caching techniques for the corresponding files.
2945 * A typical workload for ladvise is e.g. a bunch of different clients
2946 * doing small random reads of a file, where prefetching pages into the
2947 * OSS cache with big linear reads before the random IO is a net benefit.
2948 * Fetching all that data into each client's cache with fadvise() may
2949 * not be, due to much more data being sent to the client.
2951 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2952 struct llapi_lu_ladvise *ladvise)
2956 struct cl_ladvise_io *lio;
2961 env = cl_env_get(&refcheck);
2963 RETURN(PTR_ERR(env));
2965 io = vvp_env_thread_io(env);
2966 io->ci_obj = ll_i2info(inode)->lli_clob;
2968 /* initialize parameters for ladvise */
2969 lio = &io->u.ci_ladvise;
2970 lio->li_start = ladvise->lla_start;
2971 lio->li_end = ladvise->lla_end;
2972 lio->li_fid = ll_inode2fid(inode);
2973 lio->li_advice = ladvise->lla_advice;
2974 lio->li_flags = flags;
2976 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2977 rc = cl_io_loop(env, io);
2981 cl_io_fini(env, io);
2982 cl_env_put(env, &refcheck);
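/*
 * Illustrative userspace invocation (a sketch; llapi_ladvise() wraps this):
 * ask the OSS to prefetch the first megabyte of the file:
 *
 *	struct llapi_ladvise_hdr *hdr;
 *	size_t len = offsetof(struct llapi_ladvise_hdr, lah_advise[1]);
 *
 *	hdr = calloc(1, len);
 *	hdr->lah_magic = LADVISE_MAGIC;
 *	hdr->lah_count = 1;
 *	hdr->lah_advise[0].lla_advice = LU_LADVISE_WILLREAD;
 *	hdr->lah_advise[0].lla_start = 0;
 *	hdr->lah_advise[0].lla_end = 1024 * 1024;
 *	ioctl(fd, LL_IOC_LADVISE, hdr);
 */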
2986 static int ll_lock_noexpand(struct file *file, int flags)
2988 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2990 fd->ll_lock_no_expand = !(flags & LF_UNSET);
2995 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
2998 struct fsxattr fsxattr;
3000 if (copy_from_user(&fsxattr,
3001 (const struct fsxattr __user *)arg,
3005 fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
3006 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
3007 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
3008 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
3009 if (copy_to_user((struct fsxattr __user *)arg,
3010 &fsxattr, sizeof(fsxattr)))
3016 int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
3019 * Project Quota ID state is only allowed to change from within the init
3020 * namespace. Enforce that restriction only if we are trying to change
3021 * the quota ID state. Everything else is allowed in user namespaces.
3023 if (current_user_ns() == &init_user_ns)
3026 if (ll_i2info(inode)->lli_projid != fa->fsx_projid)
3029 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) {
3030 if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
3033 if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
3040 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3044 struct md_op_data *op_data;
3045 struct ptlrpc_request *req = NULL;
3047 struct fsxattr fsxattr;
3048 struct cl_object *obj;
3052 if (copy_from_user(&fsxattr,
3053 (const struct fsxattr __user *)arg,
3057 rc = ll_ioctl_check_project(inode, &fsxattr);
3061 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3062 LUSTRE_OPC_ANY, NULL);
3063 if (IS_ERR(op_data))
3064 RETURN(PTR_ERR(op_data));
3066 flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3067 op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3068 if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3069 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3070 op_data->op_projid = fsxattr.fsx_projid;
3071 op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS;
3072 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3074 ptlrpc_req_finished(req);
3076 GOTO(out_fsxattr, rc);
3077 ll_update_inode_flags(inode, op_data->op_attr_flags);
3078 obj = ll_i2info(inode)->lli_clob;
3080 GOTO(out_fsxattr, rc);
3082 OBD_ALLOC_PTR(attr);
3084 GOTO(out_fsxattr, rc = -ENOMEM);
3086 rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS,
3087 fsxattr.fsx_xflags);
3090 ll_finish_md_op_data(op_data);
3094 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3097 struct inode *inode = file_inode(file);
3098 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3099 struct ll_inode_info *lli = ll_i2info(inode);
3100 struct obd_client_handle *och = NULL;
3101 struct split_param sp;
3104 enum mds_op_bias bias = 0;
3105 struct file *layout_file = NULL;
3107 size_t data_size = 0;
3111 mutex_lock(&lli->lli_och_mutex);
3112 if (fd->fd_lease_och != NULL) {
3113 och = fd->fd_lease_och;
3114 fd->fd_lease_och = NULL;
3116 mutex_unlock(&lli->lli_och_mutex);
3119 GOTO(out, rc = -ENOLCK);
3121 fmode = och->och_flags;
3123 switch (ioc->lil_flags) {
3124 case LL_LEASE_RESYNC_DONE:
3125 if (ioc->lil_count > IOC_IDS_MAX)
3126 GOTO(out, rc = -EINVAL);
3128 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3129 OBD_ALLOC(data, data_size);
3131 GOTO(out, rc = -ENOMEM);
3133 if (copy_from_user(data, (void __user *)arg, data_size))
3134 GOTO(out, rc = -EFAULT);
3136 bias = MDS_CLOSE_RESYNC_DONE;
3138 case LL_LEASE_LAYOUT_MERGE: {
3141 if (ioc->lil_count != 1)
3142 GOTO(out, rc = -EINVAL);
3144 arg += sizeof(*ioc);
3145 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3146 GOTO(out, rc = -EFAULT);
3148 layout_file = fget(fd);
3150 GOTO(out, rc = -EBADF);
3152 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3153 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3154 GOTO(out, rc = -EPERM);
3156 data = file_inode(layout_file);
3157 bias = MDS_CLOSE_LAYOUT_MERGE;
3160 case LL_LEASE_LAYOUT_SPLIT: {
3164 if (ioc->lil_count != 2)
3165 GOTO(out, rc = -EINVAL);
3167 arg += sizeof(*ioc);
3168 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3169 GOTO(out, rc = -EFAULT);
3171 arg += sizeof(__u32);
3172 if (copy_from_user(&mirror_id, (void __user *)arg,
3174 GOTO(out, rc = -EFAULT);
3176 layout_file = fget(fdv);
3178 GOTO(out, rc = -EBADF);
3180 sp.sp_inode = file_inode(layout_file);
3181 sp.sp_mirror_id = (__u16)mirror_id;
3183 bias = MDS_CLOSE_LAYOUT_SPLIT;
3187 /* without close intent */
3191 rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3195 rc = ll_lease_och_release(inode, file);
3204 switch (ioc->lil_flags) {
3205 case LL_LEASE_RESYNC_DONE:
3207 OBD_FREE(data, data_size);
3209 case LL_LEASE_LAYOUT_MERGE:
3210 case LL_LEASE_LAYOUT_SPLIT:
3217 rc = ll_lease_type_from_fmode(fmode);
3221 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3224 struct inode *inode = file_inode(file);
3225 struct ll_inode_info *lli = ll_i2info(inode);
3226 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3227 struct obd_client_handle *och = NULL;
3228 __u64 open_flags = 0;
3234 switch (ioc->lil_mode) {
3235 case LL_LEASE_WRLCK:
3236 if (!(file->f_mode & FMODE_WRITE))
3238 fmode = FMODE_WRITE;
3240 case LL_LEASE_RDLCK:
3241 if (!(file->f_mode & FMODE_READ))
3245 case LL_LEASE_UNLCK:
3246 RETURN(ll_file_unlock_lease(file, ioc, arg));
3251 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3253 /* apply for lease */
3254 if (ioc->lil_flags & LL_LEASE_RESYNC)
3255 open_flags = MDS_OPEN_RESYNC;
3256 och = ll_lease_open(inode, file, fmode, open_flags);
3258 RETURN(PTR_ERR(och));
3260 if (ioc->lil_flags & LL_LEASE_RESYNC) {
3261 rc = ll_lease_file_resync(och, inode, arg);
3263 ll_lease_close(och, inode, NULL);
3266 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3268 ll_lease_close(och, inode, NULL);
3274 mutex_lock(&lli->lli_och_mutex);
3275 if (fd->fd_lease_och == NULL) {
3276 fd->fd_lease_och = och;
3279 mutex_unlock(&lli->lli_och_mutex);
3281 /* cannot happen for now, since only exclusive leases are supported */
3282 ll_lease_close(och, inode, &lease_broken);
3288 static void ll_heat_get(struct inode *inode, struct lu_heat *heat)
3290 struct ll_inode_info *lli = ll_i2info(inode);
3291 struct ll_sb_info *sbi = ll_i2sbi(inode);
3292 __u64 now = ktime_get_real_seconds();
3295 spin_lock(&lli->lli_heat_lock);
3296 heat->lh_flags = lli->lli_heat_flags;
3297 for (i = 0; i < heat->lh_count; i++)
3298 heat->lh_heat[i] = obd_heat_get(&lli->lli_heat_instances[i],
3299 now, sbi->ll_heat_decay_weight,
3300 sbi->ll_heat_period_second);
3301 spin_unlock(&lli->lli_heat_lock);
3304 static int ll_heat_set(struct inode *inode, enum lu_heat_flag flags)
3306 struct ll_inode_info *lli = ll_i2info(inode);
3309 spin_lock(&lli->lli_heat_lock);
3310 if (flags & LU_HEAT_FLAG_CLEAR)
3311 obd_heat_clear(lli->lli_heat_instances, OBD_HEAT_COUNT);
3313 if (flags & LU_HEAT_FLAG_OFF)
3314 lli->lli_heat_flags |= LU_HEAT_FLAG_OFF;
3316 lli->lli_heat_flags &= ~LU_HEAT_FLAG_OFF;
3318 spin_unlock(&lli->lli_heat_lock);
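/*
 * Illustrative retrieval of the heat values maintained above (a sketch,
 * matching the LL_IOC_HEAT_GET handler further below):
 *
 *	struct lu_heat *heat;
 *	size_t len = offsetof(struct lu_heat, lh_heat[OBD_HEAT_COUNT]);
 *
 *	heat = calloc(1, len);
 *	heat->lh_count = OBD_HEAT_COUNT;
 *	ioctl(fd, LL_IOC_HEAT_GET, heat);
 */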
3324 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3326 struct inode *inode = file_inode(file);
3327 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3331 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3332 PFID(ll_inode2fid(inode)), inode, cmd);
3333 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3335 /* asm-ppc{,64} declares TCGETS, et al. as type 't', not 'T' */
3336 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3340 case LL_IOC_GETFLAGS:
3341 /* Get the current value of the file flags */
3342 return put_user(fd->fd_flags, (int __user *)arg);
3343 case LL_IOC_SETFLAGS:
3344 case LL_IOC_CLRFLAGS:
3345 /* Set or clear specific file flags */
3346 /* XXX This probably needs checks to ensure the flags are
3347 * not abused, and to handle any flag side effects.
3349 if (get_user(flags, (int __user *) arg))
3352 if (cmd == LL_IOC_SETFLAGS) {
3353 if ((flags & LL_FILE_IGNORE_LOCK) &&
3354 !(file->f_flags & O_DIRECT)) {
3355 CERROR("%s: unable to disable locking on "
3356 "non-O_DIRECT file\n", current->comm);
3360 fd->fd_flags |= flags;
3362 fd->fd_flags &= ~flags;
3365 case LL_IOC_LOV_SETSTRIPE:
3366 case LL_IOC_LOV_SETSTRIPE_NEW:
3367 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3368 case LL_IOC_LOV_SETEA:
3369 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3370 case LL_IOC_LOV_SWAP_LAYOUTS: {
3372 struct lustre_swap_layouts lsl;
3374 if (copy_from_user(&lsl, (char __user *)arg,
3375 sizeof(struct lustre_swap_layouts)))
3378 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3381 file2 = fget(lsl.sl_fd);
3385 /* O_WRONLY or O_RDWR */
3386 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3387 GOTO(out, rc = -EPERM);
3389 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3390 struct inode *inode2;
3391 struct ll_inode_info *lli;
3392 struct obd_client_handle *och = NULL;
3394 lli = ll_i2info(inode);
3395 mutex_lock(&lli->lli_och_mutex);
3396 if (fd->fd_lease_och != NULL) {
3397 och = fd->fd_lease_och;
3398 fd->fd_lease_och = NULL;
3400 mutex_unlock(&lli->lli_och_mutex);
3402 GOTO(out, rc = -ENOLCK);
3403 inode2 = file_inode(file2);
3404 rc = ll_swap_layouts_close(och, inode, inode2);
3406 rc = ll_swap_layouts(file, file2, &lsl);
3412 case LL_IOC_LOV_GETSTRIPE:
3413 case LL_IOC_LOV_GETSTRIPE_NEW:
3414 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3415 case FS_IOC_GETFLAGS:
3416 case FS_IOC_SETFLAGS:
3417 RETURN(ll_iocontrol(inode, file, cmd, arg));
3418 case FSFILT_IOC_GETVERSION:
3419 case FS_IOC_GETVERSION:
3420 RETURN(put_user(inode->i_generation, (int __user *)arg));
3421 /* We need to special-case any other ioctls we want to handle,
3422 * to send them to the MDS/OST as appropriate and to properly
3423 * network encode the arg field. */
3424 case FS_IOC_SETVERSION:
3427 case LL_IOC_GROUP_LOCK:
3428 RETURN(ll_get_grouplock(inode, file, arg));
3429 case LL_IOC_GROUP_UNLOCK:
3430 RETURN(ll_put_grouplock(inode, file, arg));
3431 case IOC_OBD_STATFS:
3432 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3434 case LL_IOC_FLUSHCTX:
3435 RETURN(ll_flush_ctx(inode));
3436 case LL_IOC_PATH2FID: {
3437 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3438 sizeof(struct lu_fid)))
3443 case LL_IOC_GETPARENT:
3444 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3446 case OBD_IOC_FID2PATH:
3447 RETURN(ll_fid2path(inode, (void __user *)arg));
3448 case LL_IOC_DATA_VERSION: {
3449 struct ioc_data_version idv;
3452 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
3455 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3456 rc = ll_ioc_data_version(inode, &idv);
3459 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3465 case LL_IOC_GET_MDTIDX: {
3468 mdtidx = ll_get_mdt_idx(inode);
3472 if (put_user((int)mdtidx, (int __user *)arg))
3477 case OBD_IOC_GETDTNAME:
3478 case OBD_IOC_GETMDNAME:
3479 RETURN(ll_get_obd_name(inode, cmd, arg));
3480 case LL_IOC_HSM_STATE_GET: {
3481 struct md_op_data *op_data;
3482 struct hsm_user_state *hus;
3489 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3490 LUSTRE_OPC_ANY, hus);
3491 if (IS_ERR(op_data)) {
3493 RETURN(PTR_ERR(op_data));
3496 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3499 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3502 ll_finish_md_op_data(op_data);
3506 case LL_IOC_HSM_STATE_SET: {
3507 struct hsm_state_set *hss;
3514 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3519 rc = ll_hsm_state_set(inode, hss);
3524 case LL_IOC_HSM_ACTION: {
3525 struct md_op_data *op_data;
3526 struct hsm_current_action *hca;
3533 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3534 LUSTRE_OPC_ANY, hca);
3535 if (IS_ERR(op_data)) {
3537 RETURN(PTR_ERR(op_data));
3540 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3543 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3546 ll_finish_md_op_data(op_data);
3550 case LL_IOC_SET_LEASE_OLD: {
3551 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3553 RETURN(ll_file_set_lease(file, &ioc, 0));
3555 case LL_IOC_SET_LEASE: {
3556 struct ll_ioc_lease ioc;
3558 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3561 RETURN(ll_file_set_lease(file, &ioc, arg));
3563 case LL_IOC_GET_LEASE: {
3564 struct ll_inode_info *lli = ll_i2info(inode);
3565 struct ldlm_lock *lock = NULL;
3568 mutex_lock(&lli->lli_och_mutex);
3569 if (fd->fd_lease_och != NULL) {
3570 struct obd_client_handle *och = fd->fd_lease_och;
3572 lock = ldlm_handle2lock(&och->och_lease_handle);
3574 lock_res_and_lock(lock);
3575 if (!ldlm_is_cancel(lock))
3576 fmode = och->och_flags;
3578 unlock_res_and_lock(lock);
3579 LDLM_LOCK_PUT(lock);
3582 mutex_unlock(&lli->lli_och_mutex);
3584 RETURN(ll_lease_type_from_fmode(fmode));
3586 case LL_IOC_HSM_IMPORT: {
3587 struct hsm_user_import *hui;
3593 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3598 rc = ll_hsm_import(inode, file, hui);
3603 case LL_IOC_FUTIMES_3: {
3604 struct ll_futimes_3 lfu;
3606 if (copy_from_user(&lfu,
3607 (const struct ll_futimes_3 __user *)arg,
3611 RETURN(ll_file_futimes_3(file, &lfu));
3613 case LL_IOC_LADVISE: {
3614 struct llapi_ladvise_hdr *k_ladvise_hdr;
3615 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3618 int alloc_size = sizeof(*k_ladvise_hdr);
3621 u_ladvise_hdr = (void __user *)arg;
3622 OBD_ALLOC_PTR(k_ladvise_hdr);
3623 if (k_ladvise_hdr == NULL)
3626 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3627 GOTO(out_ladvise, rc = -EFAULT);
3629 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3630 k_ladvise_hdr->lah_count < 1)
3631 GOTO(out_ladvise, rc = -EINVAL);
3633 num_advise = k_ladvise_hdr->lah_count;
3634 if (num_advise >= LAH_COUNT_MAX)
3635 GOTO(out_ladvise, rc = -EFBIG);
3637 OBD_FREE_PTR(k_ladvise_hdr);
3638 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3639 lah_advise[num_advise]);
3640 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3641 if (k_ladvise_hdr == NULL)
3645 * TODO: submit multiple advices to one server in a single RPC
3647 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3648 GOTO(out_ladvise, rc = -EFAULT);
3650 for (i = 0; i < num_advise; i++) {
3651 struct llapi_lu_ladvise *k_ladvise =
3652 &k_ladvise_hdr->lah_advise[i];
3653 struct llapi_lu_ladvise __user *u_ladvise =
3654 &u_ladvise_hdr->lah_advise[i];
3656 rc = ll_ladvise_sanity(inode, k_ladvise);
3658 GOTO(out_ladvise, rc);
3660 switch (k_ladvise->lla_advice) {
3661 case LU_LADVISE_LOCKNOEXPAND:
3662 rc = ll_lock_noexpand(file,
3663 k_ladvise->lla_peradvice_flags);
3664 GOTO(out_ladvise, rc);
3665 case LU_LADVISE_LOCKAHEAD:
3667 rc = ll_file_lock_ahead(file, k_ladvise);
3670 GOTO(out_ladvise, rc);
3673 &u_ladvise->lla_lockahead_result))
3674 GOTO(out_ladvise, rc = -EFAULT);
3677 rc = ll_ladvise(inode, file,
3678 k_ladvise_hdr->lah_flags,
3681 GOTO(out_ladvise, rc);
3688 OBD_FREE(k_ladvise_hdr, alloc_size);
3691 case LL_IOC_FLR_SET_MIRROR: {
3692 /* mirror I/O must be direct to avoid polluting page cache
3694 if (!(file->f_flags & O_DIRECT))
3697 fd->fd_designated_mirror = (__u32)arg;
3700 case LL_IOC_FSGETXATTR:
3701 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3702 case LL_IOC_FSSETXATTR:
3703 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3705 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3706 case LL_IOC_HEAT_GET: {
3707 struct lu_heat uheat;
3708 struct lu_heat *heat;
3711 if (copy_from_user(&uheat, (void __user *)arg, sizeof(uheat)))
3714 if (uheat.lh_count > OBD_HEAT_COUNT)
3715 uheat.lh_count = OBD_HEAT_COUNT;
3717 size = offsetof(typeof(uheat), lh_heat[uheat.lh_count]);
3718 OBD_ALLOC(heat, size);
3722 heat->lh_count = uheat.lh_count;
3723 ll_heat_get(inode, heat);
3724 rc = copy_to_user((char __user *)arg, heat, size);
3725 OBD_FREE(heat, size);
3726 RETURN(rc ? -EFAULT : 0);
3728 case LL_IOC_HEAT_SET: {
3731 if (copy_from_user(&flags, (void __user *)arg, sizeof(flags)))
3734 rc = ll_heat_set(inode, flags);
3738 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3739 (void __user *)arg));
3743 #ifndef HAVE_FILE_LLSEEK_SIZE
3744 static inline loff_t
3745 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3747 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3749 if (offset > maxsize)
3752 if (offset != file->f_pos) {
3753 file->f_pos = offset;
3754 file->f_version = 0;
3760 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3761 loff_t maxsize, loff_t eof)
3763 struct inode *inode = file_inode(file);
3771 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3772 * position-querying operation. Avoid rewriting the "same"
3773 * f_pos value back to the file because a concurrent read(),
3774 * write() or lseek() might have altered it
3779 * f_lock protects against read/modify/write race with other
3780 * SEEK_CURs. Note that parallel writes and reads behave
3784 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3785 inode_unlock(inode);
3789 * In the generic case the entire file is data, so as long as
3790 * offset isn't at the end of the file then the offset is data.
3797 * There is a virtual hole at the end of the file, so as long as
3798 * offset isn't i_size or larger, return i_size.
3806 return llseek_execute(file, offset, maxsize);
3810 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3812 struct inode *inode = file_inode(file);
3813 loff_t retval, eof = 0;
3816 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3817 (origin == SEEK_CUR) ? file->f_pos : 0);
3818 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3819 PFID(ll_inode2fid(inode)), inode, retval, retval,
3821 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3823 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3824 retval = ll_glimpse_size(inode);
3827 eof = i_size_read(inode);
3830 retval = ll_generic_file_llseek_size(file, offset, origin,
3831 ll_file_maxbytes(inode), eof);
3835 static int ll_flush(struct file *file, fl_owner_t id)
3837 struct inode *inode = file_inode(file);
3838 struct ll_inode_info *lli = ll_i2info(inode);
3839 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3842 LASSERT(!S_ISDIR(inode->i_mode));
3844 /* catch async errors that were recorded back when async writeback
3845 * failed for pages in this mapping. */
3846 rc = lli->lli_async_rc;
3847 lli->lli_async_rc = 0;
3848 if (lli->lli_clob != NULL) {
3849 err = lov_read_and_clear_async_rc(lli->lli_clob);
3854 /* The application has already been told about the write failure.
3855  * Do not report it again. */
3856 if (fd->fd_write_failed)
3858 return rc ? -EIO : 0;
3862 * Called to make sure a portion of the file has been written out.
3863 * If @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to the OST.
3865 * Return how many pages have been written.
3867 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3868 enum cl_fsync_mode mode, int ignore_layout)
3872 struct cl_fsync_io *fio;
3877 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3878 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3881 env = cl_env_get(&refcheck);
3883 RETURN(PTR_ERR(env));
3885 io = vvp_env_thread_io(env);
3886 io->ci_obj = ll_i2info(inode)->lli_clob;
3887 io->ci_ignore_layout = ignore_layout;
3889 /* initialize parameters for sync */
3890 fio = &io->u.ci_fsync;
3891 fio->fi_start = start;
3893 fio->fi_fid = ll_inode2fid(inode);
3894 fio->fi_mode = mode;
3895 fio->fi_nr_written = 0;
3897 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3898 result = cl_io_loop(env, io);
3900 result = io->ci_result;
3902 result = fio->fi_nr_written;
3903 cl_io_fini(env, io);
3904 cl_env_put(env, &refcheck);
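/*
 * Example caller (a sketch): ll_fsync() below syncs the whole range with
 * CL_FSYNC_ALL, while a local writeback-only flush would look like:
 *
 *	rc = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF, CL_FSYNC_LOCAL, 0);
 *	if (rc > 0)	/* number of pages written */
 *		rc = 0;
 */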
3910 * When dentry is provided (the 'else' case), file_dentry() may be
3911 * null and dentry must be used directly rather than pulled from
3912 * file_dentry() as is done otherwise.
3915 #ifdef HAVE_FILE_FSYNC_4ARGS
3916 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3918 struct dentry *dentry = file_dentry(file);
3919 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3920 int ll_fsync(struct file *file, int datasync)
3922 struct dentry *dentry = file_dentry(file);
3924 loff_t end = LLONG_MAX;
3926 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3929 loff_t end = LLONG_MAX;
3931 struct inode *inode = dentry->d_inode;
3932 struct ll_inode_info *lli = ll_i2info(inode);
3933 struct ptlrpc_request *req;
3937 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3938 PFID(ll_inode2fid(inode)), inode);
3939 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3941 #ifdef HAVE_FILE_FSYNC_4ARGS
3942 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3945 /* fsync's caller has already called _fdata{sync,write}, we want
3946 * that IO to finish before calling the osc and mdc sync methods */
3947 rc = filemap_fdatawait(inode->i_mapping);
3950 /* catch async errors that were recorded back when async writeback
3951 * failed for pages in this mapping. */
3952 if (!S_ISDIR(inode->i_mode)) {
3953 err = lli->lli_async_rc;
3954 lli->lli_async_rc = 0;
3957 if (lli->lli_clob != NULL) {
3958 err = lov_read_and_clear_async_rc(lli->lli_clob);
3964 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3968 ptlrpc_req_finished(req);
3970 if (S_ISREG(inode->i_mode)) {
3971 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3973 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3974 if (rc == 0 && err < 0)
3977 fd->fd_write_failed = true;
3979 fd->fd_write_failed = false;
3982 #ifdef HAVE_FILE_FSYNC_4ARGS
3983 inode_unlock(inode);
3989 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3991 struct inode *inode = file_inode(file);
3992 struct ll_sb_info *sbi = ll_i2sbi(inode);
3993 struct ldlm_enqueue_info einfo = {
3994 .ei_type = LDLM_FLOCK,
3995 .ei_cb_cp = ldlm_flock_completion_ast,
3996 .ei_cbdata = file_lock,
3998 struct md_op_data *op_data;
3999 struct lustre_handle lockh = { 0 };
4000 union ldlm_policy_data flock = { { 0 } };
4001 int fl_type = file_lock->fl_type;
4007 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
4008 PFID(ll_inode2fid(inode)), file_lock);
4010 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
4012 if (file_lock->fl_flags & FL_FLOCK) {
4013 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
4014 /* flocks are whole-file locks */
4015 flock.l_flock.end = OFFSET_MAX;
4016 /* For flocks the owner is determined by the local file descriptor */
4017 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
4018 } else if (file_lock->fl_flags & FL_POSIX) {
4019 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
4020 flock.l_flock.start = file_lock->fl_start;
4021 flock.l_flock.end = file_lock->fl_end;
4025 flock.l_flock.pid = file_lock->fl_pid;
4027 /* Somewhat ugly workaround for svc lockd.
4028  * lockd installs a custom fl_lmops->lm_compare_owner that checks
4029  * that the fl_owner is the same (which it always is between lockd
4030  * processes on the local node) and then compares the pid.
4031  * As such we assign the pid to the owner field to make it all work;
4032  * conflict with normal locks is unlikely since the pid space and
4033  * the pointer space for current->files do not intersect */
4034 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
4035 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
4039 einfo.ei_mode = LCK_PR;
4042 /* An unlock request may or may not have any relation to
4043 * existing locks so we may not be able to pass a lock handle
4044 * via a normal ldlm_lock_cancel() request. The request may even
4045 * unlock a byte range in the middle of an existing lock. In
4046 * order to process an unlock request we need all of the same
4047 * information that is given with a normal read or write record
4048 * lock request. To avoid creating another ldlm unlock (cancel)
4049 * message we'll treat a LCK_NL flock request as an unlock. */
4050 einfo.ei_mode = LCK_NL;
4053 einfo.ei_mode = LCK_PW;
4056 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
4071 flags = LDLM_FL_BLOCK_NOWAIT;
4077 flags = LDLM_FL_TEST_LOCK;
4080 CERROR("unknown fcntl lock command: %d\n", cmd);
4084 /* Save the old mode so that if the mode in the lock changes we
4085 * can decrement the appropriate reader or writer refcount. */
4086 file_lock->fl_type = einfo.ei_mode;
4088 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4089 LUSTRE_OPC_ANY, NULL);
4090 if (IS_ERR(op_data))
4091 RETURN(PTR_ERR(op_data));
4093 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4094 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4095 flock.l_flock.pid, flags, einfo.ei_mode,
4096 flock.l_flock.start, flock.l_flock.end);
4098 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4101 /* Restore the file lock type if not TEST lock. */
4102 if (!(flags & LDLM_FL_TEST_LOCK))
4103 file_lock->fl_type = fl_type;
4105 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
4106 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4107 !(flags & LDLM_FL_TEST_LOCK))
4108 rc2 = locks_lock_file_wait(file, file_lock);
4110 if ((file_lock->fl_flags & FL_FLOCK) &&
4111 (rc == 0 || file_lock->fl_type == F_UNLCK))
4112 rc2 = flock_lock_file_wait(file, file_lock);
4113 if ((file_lock->fl_flags & FL_POSIX) &&
4114 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4115 !(flags & LDLM_FL_TEST_LOCK))
4116 rc2 = posix_lock_file_wait(file, file_lock);
4117 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
4119 if (rc2 && file_lock->fl_type != F_UNLCK) {
4120 einfo.ei_mode = LCK_NL;
4121 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4126 ll_finish_md_op_data(op_data);
4131 int ll_get_fid_by_name(struct inode *parent, const char *name,
4132 int namelen, struct lu_fid *fid,
4133 struct inode **inode)
4135 struct md_op_data *op_data = NULL;
4136 struct mdt_body *body;
4137 struct ptlrpc_request *req;
4141 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4142 LUSTRE_OPC_ANY, NULL);
4143 if (IS_ERR(op_data))
4144 RETURN(PTR_ERR(op_data));
4146 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4147 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4148 ll_finish_md_op_data(op_data);
4152 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4154 GOTO(out_req, rc = -EFAULT);
4156 *fid = body->mbo_fid1;
4159 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4161 ptlrpc_req_finished(req);
4165 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4168 struct dentry *dchild = NULL;
4169 struct inode *child_inode = NULL;
4170 struct md_op_data *op_data;
4171 struct ptlrpc_request *request = NULL;
4172 struct obd_client_handle *och = NULL;
4174 struct mdt_body *body;
4175 __u64 data_version = 0;
4176 size_t namelen = strlen(name);
4177 int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4181 CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4182 PFID(ll_inode2fid(parent)), name,
4183 lum->lum_stripe_offset, lum->lum_stripe_count);
4185 if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4186 lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4187 lustre_swab_lmv_user_md(lum);
4189 /* Get child FID first */
4190 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4193 dchild = d_lookup(file_dentry(file), &qstr);
4195 if (dchild->d_inode)
4196 child_inode = igrab(dchild->d_inode);
4201 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
4210 if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
4211 OBD_CONNECT2_DIR_MIGRATE)) {
4212 if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
4213 ll_i2info(child_inode)->lli_lsm_md) {
4214 CERROR("%s: MDT doesn't support striped directory "
4215        "migration!\n", ll_i2sbi(parent)->ll_fsname);
4216 GOTO(out_iput, rc = -EOPNOTSUPP);
4221 * lfs migrate command needs to be blocked on the client
4222 * by checking the migrate FID against the FID of the
4225 if (child_inode == parent->i_sb->s_root->d_inode)
4226 GOTO(out_iput, rc = -EINVAL);
4228 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4229 child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4230 if (IS_ERR(op_data))
4231 GOTO(out_iput, rc = PTR_ERR(op_data));
4233 inode_lock(child_inode);
4234 op_data->op_fid3 = *ll_inode2fid(child_inode);
4235 if (!fid_is_sane(&op_data->op_fid3)) {
4236 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4237 ll_i2sbi(parent)->ll_fsname, name,
4238 PFID(&op_data->op_fid3));
4239 GOTO(out_unlock, rc = -EINVAL);
4242 op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4243 op_data->op_data = lum;
4244 op_data->op_data_size = lumlen;
4247 if (S_ISREG(child_inode->i_mode)) {
4248 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4252 GOTO(out_unlock, rc);
4255 rc = ll_data_version(child_inode, &data_version,
4258 GOTO(out_close, rc);
4260 op_data->op_open_handle = och->och_open_handle;
4261 op_data->op_data_version = data_version;
4262 op_data->op_lease_handle = och->och_lease_handle;
4263 op_data->op_bias |= MDS_CLOSE_MIGRATE;
4265 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4266 och->och_mod->mod_open_req->rq_replay = 0;
4267 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
4270 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4271 name, namelen, &request);
4273 LASSERT(request != NULL);
4274 ll_update_times(request, parent);
4277 if (rc == 0 || rc == -EAGAIN) {
4278 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4279 LASSERT(body != NULL);
4281 /* If the server does release the layout lock, then we clean up
4282  * the client och here; otherwise release it in out_close: */
4283 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4284 obd_mod_put(och->och_mod);
4285 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4287 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
4293 if (request != NULL) {
4294 ptlrpc_req_finished(request);
4298 /* Try again if the lease has been cancelled. */
4299 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4304 ll_lease_close(och, child_inode, NULL);
4306 clear_nlink(child_inode);
4308 inode_unlock(child_inode);
4309 ll_finish_md_op_data(op_data);
4316 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4324 * Test whether locks matching bits and l_req_mode are acquired:
4325 * - the bits can be spread across different locks
4326 * - if found, clear the common lock bits in *bits
4327 * - the bits not found are kept in *bits
4329 * \param bits       [IN] searched lock bits
4330 * \param l_req_mode [IN] searched lock mode
4331 * \retval boolean, true iff all bits are found
4333 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4335 struct lustre_handle lockh;
4336 union ldlm_policy_data policy;
4337 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4338 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4347 fid = &ll_i2info(inode)->lli_fid;
4348 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4349 ldlm_lockname[mode]);
4351 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4352 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4353 policy.l_inodebits.bits = *bits & (1 << i);
4354 if (policy.l_inodebits.bits == 0)
4357 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4358 &policy, mode, &lockh)) {
4359 struct ldlm_lock *lock;
4361 lock = ldlm_handle2lock(&lockh);
4364 ~(lock->l_policy_data.l_inodebits.bits);
4365 LDLM_LOCK_PUT(lock);
4367 *bits &= ~policy.l_inodebits.bits;
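/*
 * Example (a sketch, assuming the usual MDS_INODELOCK_* ibit names): test
 * whether both LOOKUP and UPDATE ibits are covered by granted MDS locks,
 * in any mode:
 *
 *	__u64 bits = MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE;
 *
 *	if (ll_have_md_lock(inode, &bits, LCK_MINMODE))
 *		;	/* all bits found; *bits is now 0 */
 *	else
 *		;	/* *bits holds the ibits that were not found */
 */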
4374 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4375 struct lustre_handle *lockh, __u64 flags,
4376 enum ldlm_mode mode)
4378 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4383 fid = &ll_i2info(inode)->lli_fid;
4384 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4386 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4387 fid, LDLM_IBITS, &policy, mode, lockh);
4392 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4394 /* Already unlinked. Just update nlink and return success */
4395 if (rc == -ENOENT) {
4397 /* If it is a striped directory and there is a bad stripe,
4398  * let's revalidate the dentry again instead of returning
4400 if (S_ISDIR(inode->i_mode) &&
4401 ll_i2info(inode)->lli_lsm_md != NULL)
4404 /* This path cannot be hit for regular files, except in
4405  * the case of obscure races, so there is no need to validate
4407 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4409 } else if (rc != 0) {
4410 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4411 "%s: revalidate FID "DFID" error: rc = %d\n",
4412 ll_i2sbi(inode)->ll_fsname,
4413 PFID(ll_inode2fid(inode)), rc);
4419 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4421 struct inode *inode = dentry->d_inode;
4422 struct obd_export *exp = ll_i2mdexp(inode);
4423 struct lookup_intent oit = {
4426 struct ptlrpc_request *req = NULL;
4427 struct md_op_data *op_data;
4431 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4432 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4434 /* Call getattr by fid, so do not provide name at all. */
4435 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4436 LUSTRE_OPC_ANY, NULL);
4437 if (IS_ERR(op_data))
4438 RETURN(PTR_ERR(op_data));
4440 rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4441 ll_finish_md_op_data(op_data);
4443 rc = ll_inode_revalidate_fini(inode, rc);
4447 rc = ll_revalidate_it_finish(req, &oit, dentry);
4449 ll_intent_release(&oit);
4453 /* Unlinked? Unhash dentry, so it is not picked up later by
4454 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4455 * here to preserve get_cwd functionality on 2.6.
4457 if (!dentry->d_inode->i_nlink) {
4458 ll_lock_dcache(inode);
4459 d_lustre_invalidate(dentry, 0);
4460 ll_unlock_dcache(inode);
4463 ll_lookup_finish_locks(&oit, dentry);
4465 ptlrpc_req_finished(req);
4470 static int ll_merge_md_attr(struct inode *inode)
4472 struct ll_inode_info *lli = ll_i2info(inode);
4473 struct cl_attr attr = { 0 };
4476 LASSERT(lli->lli_lsm_md != NULL);
4478 /* a foreign dir is not a striped dir */
4479 if (lli->lli_lsm_md->lsm_md_magic == LMV_MAGIC_FOREIGN)
4482 down_read(&lli->lli_lsm_sem);
4483 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4484 &attr, ll_md_blocking_ast);
4485 up_read(&lli->lli_lsm_sem);
4489 set_nlink(inode, attr.cat_nlink);
4490 inode->i_blocks = attr.cat_blocks;
4491 i_size_write(inode, attr.cat_size);
4493 ll_i2info(inode)->lli_atime = attr.cat_atime;
4494 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4495 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
4500 static inline dev_t ll_compat_encode_dev(dev_t dev)
4502 /* The compat_sys_*stat*() syscalls will fail unless the
4503 * device majors and minors are both less than 256. Note that
4504 * the value returned here will be passed through
4505 * old_encode_dev() in cp_compat_stat(). And so we are not
4506 * trying to return a valid compat (u16) device number, just
4507 * one that will pass the old_valid_dev() check. */
4509 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
4512 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4513 int ll_getattr(const struct path *path, struct kstat *stat,
4514 u32 request_mask, unsigned int flags)
4516 struct dentry *de = path->dentry;
4518 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4521 struct inode *inode = de->d_inode;
4522 struct ll_sb_info *sbi = ll_i2sbi(inode);
4523 struct ll_inode_info *lli = ll_i2info(inode);
4526 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4528 rc = ll_inode_revalidate(de, IT_GETATTR);
4532 if (S_ISREG(inode->i_mode)) {
4533 /* In case of a restore, the MDT has the right size and has
4534  * already sent it back without granting the layout lock;
4535  * the inode is up-to-date, so a glimpse is useless.
4536  * Also, to glimpse we need the layout; in case of a running
4537  * restore the MDT holds the layout lock, so the glimpse will
4538  * block until the end of the restore (getattr will block)
4540 if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4541 rc = ll_glimpse_size(inode);
4546 /* If the object isn't a regular file then don't validate its size. */
4547 if (S_ISDIR(inode->i_mode) &&
4548 lli->lli_lsm_md != NULL) {
4549 rc = ll_merge_md_attr(inode);
4554 inode->i_atime.tv_sec = lli->lli_atime;
4555 inode->i_mtime.tv_sec = lli->lli_mtime;
4556 inode->i_ctime.tv_sec = lli->lli_ctime;
4559 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4561 if (ll_need_32bit_api(sbi)) {
4562 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4563 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4564 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4566 stat->ino = inode->i_ino;
4567 stat->dev = inode->i_sb->s_dev;
4568 stat->rdev = inode->i_rdev;
4571 stat->mode = inode->i_mode;
4572 stat->uid = inode->i_uid;
4573 stat->gid = inode->i_gid;
4574 stat->atime = inode->i_atime;
4575 stat->mtime = inode->i_mtime;
4576 stat->ctime = inode->i_ctime;
4577 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4579 stat->nlink = inode->i_nlink;
4580 stat->size = i_size_read(inode);
4581 stat->blocks = inode->i_blocks;
4586 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4587 __u64 start, __u64 len)
4591 struct fiemap *fiemap;
4592 unsigned int extent_count = fieinfo->fi_extents_max;
4594 num_bytes = sizeof(*fiemap) + (extent_count *
4595 sizeof(struct fiemap_extent));
4596 OBD_ALLOC_LARGE(fiemap, num_bytes);
4601 fiemap->fm_flags = fieinfo->fi_flags;
4602 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4603 fiemap->fm_start = start;
4604 fiemap->fm_length = len;
4605 if (extent_count > 0 &&
4606 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4607 sizeof(struct fiemap_extent)) != 0)
4608 GOTO(out, rc = -EFAULT);
4610 rc = ll_do_fiemap(inode, fiemap, num_bytes);
4612 fieinfo->fi_flags = fiemap->fm_flags;
4613 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4614 if (extent_count > 0 &&
4615 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4616 fiemap->fm_mapped_extents *
4617 sizeof(struct fiemap_extent)) != 0)
4618 GOTO(out, rc = -EFAULT);
4620 OBD_FREE_LARGE(fiemap, num_bytes);
4624 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4626 struct ll_inode_info *lli = ll_i2info(inode);
4627 struct posix_acl *acl = NULL;
4630 spin_lock(&lli->lli_lock);
4631 /* VFS' acl_permission_check->check_acl will release the refcount */
4632 acl = posix_acl_dup(lli->lli_posix_acl);
4633 spin_unlock(&lli->lli_lock);
4638 #ifdef HAVE_IOP_SET_ACL
4639 #ifdef CONFIG_FS_POSIX_ACL
4640 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4642 struct ll_sb_info *sbi = ll_i2sbi(inode);
4643 struct ptlrpc_request *req = NULL;
4644 const char *name = NULL;
4646 size_t value_size = 0;
4651 case ACL_TYPE_ACCESS:
4652 name = XATTR_NAME_POSIX_ACL_ACCESS;
4654 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4657 case ACL_TYPE_DEFAULT:
4658 name = XATTR_NAME_POSIX_ACL_DEFAULT;
4659 if (!S_ISDIR(inode->i_mode))
4660 rc = acl ? -EACCES : 0;
4671 value_size = posix_acl_xattr_size(acl->a_count);
4672 value = kmalloc(value_size, GFP_NOFS);
4674 GOTO(out, rc = -ENOMEM);
4676 rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
4678 GOTO(out_value, rc);
4681 rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4682 value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
4683 name, value, value_size, 0, 0, &req);
4685 ptlrpc_req_finished(req);
4690 forget_cached_acl(inode, type);
4692 set_cached_acl(inode, type, acl);
4695 #endif /* CONFIG_FS_POSIX_ACL */
4696 #endif /* HAVE_IOP_SET_ACL */
#ifndef HAVE_GENERIC_PERMISSION_2ARGS
static int
# ifdef HAVE_GENERIC_PERMISSION_4ARGS
ll_check_acl(struct inode *inode, int mask, unsigned int flags)
# else
ll_check_acl(struct inode *inode, int mask)
# endif
{
# ifdef CONFIG_FS_POSIX_ACL
	struct posix_acl *acl;
	int rc;
	ENTRY;

#  ifdef HAVE_GENERIC_PERMISSION_4ARGS
	if (flags & IPERM_FLAG_RCU)
		return -ECHILD;
#  endif
	acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
	if (acl == NULL)
		RETURN(-EAGAIN);

	rc = posix_acl_permission(inode, acl, mask);
	posix_acl_release(acl);
	RETURN(rc);
# else /* !CONFIG_FS_POSIX_ACL */
	return -EAGAIN;
# endif /* CONFIG_FS_POSIX_ACL */
}
#endif /* HAVE_GENERIC_PERMISSION_2ARGS */
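/*
 * Added commentary: the prototype of the VFS ->permission() method has
 * changed over kernel versions (4-argument form with flags, nameidata-based
 * form, 2-argument form), so the HAVE_* conditionals below select the
 * signature matching the running kernel.
 */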
#ifdef HAVE_GENERIC_PERMISSION_4ARGS
int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
#else
# ifdef HAVE_INODE_PERMISION_2ARGS
int ll_inode_permission(struct inode *inode, int mask)
# else
int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
# endif
#endif
{
	int rc = 0;
	struct ll_sb_info *sbi;
	struct root_squash_info *squash;
	struct cred *cred = NULL;
	const struct cred *old_cred = NULL;
	cfs_cap_t cap;
	bool squash_id = false;
	ENTRY;

#ifdef MAY_NOT_BLOCK
	if (mask & MAY_NOT_BLOCK)
		return -ECHILD;
#elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
	if (flags & IPERM_FLAG_RCU)
		return -ECHILD;
#endif
	/* as the root inode is NOT validated during lookup, it needs to be
	 * revalidated here, before the permission check. */
	if (inode == inode->i_sb->s_root->d_inode) {
		rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
		if (rc)
			RETURN(rc);
	}

	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
	       PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);

	/* squash fsuid/fsgid if needed */
	sbi = ll_i2sbi(inode);
	squash = &sbi->ll_squash;
	if (unlikely(squash->rsi_uid != 0 &&
		     uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
		     !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
		squash_id = true;
	}
	if (squash_id) {
		CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
		       __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
		       squash->rsi_uid, squash->rsi_gid);
		/* update the current process's credentials
		 * and FS capabilities */
		cred = prepare_creds();
		if (cred == NULL)
			RETURN(-ENOMEM);

		cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
		cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
		for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
			if ((1 << cap) & CFS_CAP_FS_MASK)
				cap_lower(cred->cap_effective, cap);
		}
		old_cred = override_creds(cred);
	}

	ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
	rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
	/* restore the current process's credentials and FS capabilities */
	if (squash_id) {
		revert_creds(old_cred);
		put_cred(cred);
	}

	RETURN(rc);
}
/* -o localflock - only provides locally consistent flock locks */
struct file_operations ll_file_operations = {
#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
# ifdef HAVE_SYNC_READ_WRITE
	.read		= new_sync_read,
	.write		= new_sync_write,
# endif /* HAVE_SYNC_READ_WRITE */
	.read_iter	= ll_file_read_iter,
	.write_iter	= ll_file_write_iter,
#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
	.read		= ll_file_read,
	.aio_read	= ll_file_aio_read,
	.write		= ll_file_write,
	.aio_write	= ll_file_aio_write,
#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
	.unlocked_ioctl	= ll_file_ioctl,
	.open		= ll_file_open,
	.release	= ll_file_release,
	.mmap		= ll_file_mmap,
	.llseek		= ll_file_seek,
	.splice_read	= ll_file_splice_read,
	.fsync		= ll_fsync,
	.flush		= ll_flush
};
struct file_operations ll_file_operations_flock = {
#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
# ifdef HAVE_SYNC_READ_WRITE
	.read		= new_sync_read,
	.write		= new_sync_write,
# endif /* HAVE_SYNC_READ_WRITE */
	.read_iter	= ll_file_read_iter,
	.write_iter	= ll_file_write_iter,
#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
	.read		= ll_file_read,
	.aio_read	= ll_file_aio_read,
	.write		= ll_file_write,
	.aio_write	= ll_file_aio_write,
#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
	.unlocked_ioctl	= ll_file_ioctl,
	.open		= ll_file_open,
	.release	= ll_file_release,
	.mmap		= ll_file_mmap,
	.llseek		= ll_file_seek,
	.splice_read	= ll_file_splice_read,
	.fsync		= ll_fsync,
	.flush		= ll_flush,
	.flock		= ll_file_flock,
	.lock		= ll_file_flock
};
/* These are for -o noflock - to return ENOSYS on flock calls */
struct file_operations ll_file_operations_noflock = {
#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
# ifdef HAVE_SYNC_READ_WRITE
	.read		= new_sync_read,
	.write		= new_sync_write,
# endif /* HAVE_SYNC_READ_WRITE */
	.read_iter	= ll_file_read_iter,
	.write_iter	= ll_file_write_iter,
#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
	.read		= ll_file_read,
	.aio_read	= ll_file_aio_read,
	.write		= ll_file_write,
	.aio_write	= ll_file_aio_write,
#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
	.unlocked_ioctl	= ll_file_ioctl,
	.open		= ll_file_open,
	.release	= ll_file_release,
	.mmap		= ll_file_mmap,
	.llseek		= ll_file_seek,
	.splice_read	= ll_file_splice_read,
	.fsync		= ll_fsync,
	.flush		= ll_flush,
	.flock		= ll_file_noflock,
	.lock		= ll_file_noflock
};
struct inode_operations ll_file_inode_operations = {
	.setattr	= ll_setattr,
	.getattr	= ll_getattr,
	.permission	= ll_inode_permission,
#ifdef HAVE_IOP_XATTR
	.setxattr	= ll_setxattr,
	.getxattr	= ll_getxattr,
	.removexattr	= ll_removexattr,
#endif
	.listxattr	= ll_listxattr,
	.fiemap		= ll_fiemap,
#ifdef HAVE_IOP_GET_ACL
	.get_acl	= ll_get_acl,
#endif
#ifdef HAVE_IOP_SET_ACL
	.set_acl	= ll_set_acl,
#endif
};
int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct cl_object *obj = lli->lli_clob;
	struct lu_env *env;
	int rc;
	__u16 refcheck;
	ENTRY;

	if (obj == NULL)
		RETURN(0);

	env = cl_env_get(&refcheck);
	if (IS_ERR(env))
		RETURN(PTR_ERR(env));

	rc = cl_conf_set(env, lli->lli_clob, conf);
	if (rc < 0)
		GOTO(out, rc);

	if (conf->coc_opc == OBJECT_CONF_SET) {
		struct ldlm_lock *lock = conf->coc_lock;
		struct cl_layout cl = {
			.cl_layout_gen = 0,
		};
		LASSERT(lock != NULL);
		LASSERT(ldlm_has_layout(lock));

		/* the lock may only be allowed to match after the layout is
		 * applied to the inode, otherwise a false layout would be
		 * seen. Applying the layout should happen before dropping
		 * the intent lock. */
		ldlm_lock_allow_match(lock);

		rc = cl_object_layout_get(env, obj, &cl);
		if (rc < 0)
			GOTO(out, rc);

		CDEBUG(D_VFSTRACE,
		       DFID": layout version change: %u -> %u\n",
		       PFID(&lli->lli_fid), ll_layout_version_get(lli),
		       cl.cl_layout_gen);
		ll_layout_version_set(lli, cl.cl_layout_gen);
	}
out:
	cl_env_put(env, &refcheck);
	RETURN(rc);
}
/* Fetch layout from MDT with getxattr request, if it's not ready yet */
static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
{
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ptlrpc_request *req;
	void *lvbdata;
	void *lmm;
	int lmmsize;
	int rc;
	ENTRY;

	CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
	       PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
	       lock->l_lvb_data, lock->l_lvb_len);

	if (lock->l_lvb_data != NULL)
		RETURN(0);

	/* if the layout lock was granted right away, the layout is returned
	 * within the DLM_LVB of the DLM reply; otherwise if the lock was ever
	 * blocked and then granted via completion AST, we have to fetch the
	 * layout here. Please note that we can't use the LVB buffer in the
	 * completion AST because it doesn't have a large enough buffer. */
	rc = ll_get_default_mdsize(sbi, &lmmsize);
	if (rc)
		RETURN(rc);

	rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), OBD_MD_FLXATTR,
			 XATTR_NAME_LOV, lmmsize, &req);
	if (rc < 0) {
		if (rc == -ENODATA)
			GOTO(out, rc = 0); /* empty layout */
		else
			RETURN(rc);
	}

	lmmsize = rc;
	rc = 0;
	if (lmmsize == 0) /* empty layout */
		GOTO(out, rc = 0);

	lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
	if (lmm == NULL)
		GOTO(out, rc = -EFAULT);

	OBD_ALLOC_LARGE(lvbdata, lmmsize);
	if (lvbdata == NULL)
		GOTO(out, rc = -ENOMEM);

	memcpy(lvbdata, lmm, lmmsize);
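	/* Added commentary: concurrent enqueues may race to attach the
	 * fetched layout to this lock. Only the thread that still finds
	 * l_lvb_data unset installs its buffer; a loser keeps lvbdata
	 * non-NULL and frees its copy after dropping the resource lock. */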
	lock_res_and_lock(lock);
	if (unlikely(lock->l_lvb_data == NULL)) {
		lock->l_lvb_type = LVB_T_LAYOUT;
		lock->l_lvb_data = lvbdata;
		lock->l_lvb_len = lmmsize;
		lvbdata = NULL;
	}
	unlock_res_and_lock(lock);

	if (lvbdata)
		OBD_FREE_LARGE(lvbdata, lmmsize);
	EXIT;
out:
	ptlrpc_req_finished(req);
	return rc;
}

/**
 * Apply the layout to the inode. The layout lock is held and will be
 * released in this function.
 */
static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
			      struct inode *inode)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ldlm_lock *lock;
	struct cl_object_conf conf;
	int rc = 0;
	bool lvb_ready;
	bool wait_layout = false;
	ENTRY;

	LASSERT(lustre_handle_is_used(lockh));

	lock = ldlm_handle2lock(lockh);
	LASSERT(lock != NULL);
	LASSERT(ldlm_has_layout(lock));

	LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
		   PFID(&lli->lli_fid), inode);

	/* in case this is a cached lock, reinstate it with the new inode */
	md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);

	lock_res_and_lock(lock);
	lvb_ready = ldlm_is_lvb_ready(lock);
	unlock_res_and_lock(lock);

	/* checking lvb_ready is racy but this is okay. The worst case is
	 * that multiple processes may configure the file at the same time. */
	if (lvb_ready)
		GOTO(out, rc = 0);

	rc = ll_layout_fetch(inode, lock);
	if (rc < 0)
		GOTO(out, rc);

	/* for a layout lock, lmm is stored in the lock's LVB.
	 * lvb_data is immutable while the lock is held, so it's safe to
	 * access it without the res lock.
	 *
	 * set the layout to the file. This is unlikely to fail, as the old
	 * layout has surely been eliminated. */
	memset(&conf, 0, sizeof conf);
	conf.coc_opc = OBJECT_CONF_SET;
	conf.coc_inode = inode;
	conf.coc_lock = lock;
	conf.u.coc_layout.lb_buf = lock->l_lvb_data;
	conf.u.coc_layout.lb_len = lock->l_lvb_len;
	rc = ll_layout_conf(inode, &conf);

	/* refreshing the layout failed, need to wait */
	wait_layout = rc == -EBUSY;
	EXIT;
out:
	LDLM_LOCK_PUT(lock);
	ldlm_lock_decref(lockh, mode);

	/* wait for IO to complete if it's still being used. */
	if (wait_layout) {
		CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
		       sbi->ll_fsname, PFID(&lli->lli_fid), inode);

		memset(&conf, 0, sizeof conf);
		conf.coc_opc = OBJECT_CONF_WAIT;
		conf.coc_inode = inode;
		rc = ll_layout_conf(inode, &conf);
		if (rc == 0)
			rc = -EAGAIN;

		CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
		       sbi->ll_fsname, PFID(&lli->lli_fid), rc);
	}
	RETURN(rc);
}

/**
 * Issue a layout intent RPC to the MDS.
 * \param inode [in] file inode
 * \param intent [in] layout intent
 *
 * \retval 0 on success
 * \retval < 0 error code
 */
static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct md_op_data *op_data;
	struct lookup_intent it;
	struct ptlrpc_request *req;
	int rc;
	ENTRY;

	op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
				     0, 0, LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data))
		RETURN(PTR_ERR(op_data));

	op_data->op_data = intent;
	op_data->op_data_size = sizeof(*intent);

	memset(&it, 0, sizeof(it));
	it.it_op = IT_LAYOUT;
	if (intent->li_opc == LAYOUT_INTENT_WRITE ||
	    intent->li_opc == LAYOUT_INTENT_TRUNC)
		it.it_flags = FMODE_WRITE;

	LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
			  sbi->ll_fsname, PFID(&lli->lli_fid), inode);

	rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
			    &ll_md_blocking_ast, 0);
	if (it.it_request != NULL)
		ptlrpc_req_finished(it.it_request);
	it.it_request = NULL;

	ll_finish_md_op_data(op_data);

	/* set the lock data in case this is a new lock */
	if (rc == 0)
		ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);

	ll_intent_drop_lock(&it);

	RETURN(rc);
}

/**
 * This function checks if there exists a LAYOUT lock on the client side,
 * or enqueues it if it doesn't have one in cache.
 *
 * This function will not hold the layout lock, so it may be revoked any time
 * after this function returns. Any operation that depends on the layout
 * should be redone in that case.
 *
 * This function should be called before lov_io_init() to get an up-to-date
 * layout version; the caller should save the version number, and after the
 * IO is finished, this function should be called again to verify that the
 * layout was not changed while the IO was in flight.
 */
int ll_layout_refresh(struct inode *inode, __u32 *gen)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct lustre_handle lockh;
	struct layout_intent intent = {
		.li_opc = LAYOUT_INTENT_ACCESS,
	};
	enum ldlm_mode mode;
	int rc;
	ENTRY;

	*gen = ll_layout_version_get(lli);
	if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
		RETURN(0);

	/* sanity checks */
	LASSERT(fid_is_sane(ll_inode2fid(inode)));
	LASSERT(S_ISREG(inode->i_mode));

	/* take the layout lock mutex to enqueue the layout lock exclusively. */
	mutex_lock(&lli->lli_layout_mutex);

	while (1) {
		/* usually the layout lock is cached on the local side, so try
		 * to match it before grabbing the layout lock mutex. */
		mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
				       LCK_CR | LCK_CW | LCK_PR | LCK_PW);
		if (mode != 0) { /* hit cached lock */
			rc = ll_layout_lock_set(&lockh, mode, inode);
			if (rc == -EAGAIN)
				continue;
			break;
		}

		rc = ll_layout_intent(inode, &intent);
		if (rc != 0)
			break;
	}

	if (rc == 0)
		*gen = ll_layout_version_get(lli);
	mutex_unlock(&lli->lli_layout_mutex);

	RETURN(rc);
}
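/*
 * Illustrative sketch (not part of the original source) of the usage pattern
 * described in the comment above ll_layout_refresh(): save the layout
 * generation before the IO and re-check it afterwards. The helper name and
 * the ll_do_io() callback are hypothetical.
 */
#if 0	/* example only, never compiled */
static int ll_io_with_layout_check(struct inode *inode,
				   int (*ll_do_io)(struct inode *))
{
	__u32 gen_before, gen_after;
	int rc;

	rc = ll_layout_refresh(inode, &gen_before);
	if (rc != 0)
		return rc;

	rc = ll_do_io(inode);
	if (rc != 0)
		return rc;

	/* verify the layout was not changed while the IO was in flight */
	rc = ll_layout_refresh(inode, &gen_after);
	if (rc == 0 && gen_after != gen_before)
		rc = -EAGAIN;	/* layout changed; the caller should redo the IO */
	return rc;
}
#endif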
/**
 * Issue a layout intent RPC indicating where in a file an IO is about to
 * write.
 *
 * \param[in] inode	file inode.
 * \param[in] ext	write range with start offset of file in bytes where
 *			an IO is about to write, and exclusive end offset in
 *			bytes.
 *
 * \retval 0	on success
 * \retval < 0	error code
 */
int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
			   struct lu_extent *ext)
{
	struct layout_intent intent = {
		.li_opc = opc,
		.li_extent.e_start = ext->e_start,
		.li_extent.e_end = ext->e_end,
	};
	int rc;
	ENTRY;

	rc = ll_layout_intent(inode, &intent);

	RETURN(rc);
}
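/*
 * Illustrative sketch (not part of the original source): a writer preparing
 * to dirty [pos, pos + count) would declare the range like this before
 * submitting the IO. The helper name and values are hypothetical.
 */
#if 0	/* example only, never compiled */
static int ll_declare_write_range(struct inode *inode, loff_t pos,
				  size_t count)
{
	struct lu_extent ext = {
		.e_start = pos,
		.e_end	 = pos + count,	/* exclusive end offset */
	};

	return ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE, &ext);
}
#endif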
/**
 * This function sends a restore request to the MDT.
 */
int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
{
	struct hsm_user_request *hur;
	int len, rc;
	ENTRY;

	len = sizeof(struct hsm_user_request) +
	      sizeof(struct hsm_user_item);
	OBD_ALLOC(hur, len);
	if (hur == NULL)
		RETURN(-ENOMEM);

	hur->hur_request.hr_action = HUA_RESTORE;
	hur->hur_request.hr_archive_id = 0;
	hur->hur_request.hr_flags = 0;
	memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
	       sizeof(hur->hur_user_item[0].hui_fid));
	hur->hur_user_item[0].hui_extent.offset = offset;
	hur->hur_user_item[0].hui_extent.length = length;
	hur->hur_request.hr_itemcount = 1;
	rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,