4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2016, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
49 #include <uapi/linux/lustre/lustre_ioctl.h>
50 #include <lustre_swab.h>
52 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
57 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
59 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
/* Allocate a per-open-file ll_file_data from its slab cache.  GFP_NOFS is
 * used to avoid filesystem re-entry during memory reclaim.  The write-failed
 * flag starts clear; it is latched on async write errors and reported at
 * close.  NOTE(review): the NULL-allocation branch is elided in this chunk. */
62 static struct ll_file_data *ll_file_data_get(void)
64 struct ll_file_data *fd;
66 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
70 fd->fd_write_failed = false;
/* Return a ll_file_data to its slab cache; counterpart of ll_file_data_get(). */
75 static void ll_file_data_put(struct ll_file_data *fd)
78 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
82 * Packs all the attributes into @op_data for the CLOSE rpc.
/* Snapshot the inode's cached attributes (mode, a/m/ctime, size, blocks,
 * flags) into @op_data for the CLOSE RPC, and record the open handle being
 * closed.  Also raises MDS_DATA_MODIFIED for HSM when a write-open dirtied
 * the data, so the MDT can mark the archive copy stale. */
84 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
85 struct obd_client_handle *och)
89 ll_prep_md_op_data(op_data, inode, NULL, NULL,
90 0, 0, LUSTRE_OPC_ANY, NULL);
92 op_data->op_attr.ia_mode = inode->i_mode;
93 op_data->op_attr.ia_atime = inode->i_atime;
94 op_data->op_attr.ia_mtime = inode->i_mtime;
95 op_data->op_attr.ia_ctime = inode->i_ctime;
96 op_data->op_attr.ia_size = i_size_read(inode);
/* Mark every timestamp as explicitly set so the MDT takes the client's
 * values instead of its own clock. */
97 op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
98 ATTR_MTIME | ATTR_MTIME_SET |
99 ATTR_CTIME | ATTR_CTIME_SET;
100 op_data->op_attr_blocks = inode->i_blocks;
101 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
102 op_data->op_handle = och->och_fh;
/* LLIF_DATA_MODIFIED is test-and-clear: only the first closer packs it. */
104 if (och->och_flags & FMODE_WRITE &&
105 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
106 /* For HSM: if inode data has been modified, pack it so that
107 * MDT can set data dirty flag in the archive. */
108 op_data->op_bias |= MDS_DATA_MODIFIED;
114 * Perform a close, possibly with a bias.
115 * The meaning of "data" depends on the value of "bias".
117 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
118 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
/* Send the CLOSE RPC for @och, optionally carrying a close intent (@bias).
 * @data is bias-dependent: a data-version pointer for MDS_HSM_RELEASE, the
 * victim inode for MDS_CLOSE_LAYOUT_SWAP/MERGE, and NULL otherwise.  The
 * och itself is torn down regardless of RPC outcome. */
121 static int ll_close_inode_openhandle(struct inode *inode,
122 struct obd_client_handle *och,
123 enum mds_op_bias bias, void *data)
125 struct obd_export *md_exp = ll_i2mdexp(inode);
126 const struct ll_inode_info *lli = ll_i2info(inode);
127 struct md_op_data *op_data;
128 struct ptlrpc_request *req = NULL;
/* Guard against a torn-down MDC export (e.g. during umount/eviction). */
132 if (class_exp2obd(md_exp) == NULL) {
133 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
134 ll_get_fsname(inode->i_sb, NULL, 0),
135 PFID(&lli->lli_fid));
139 OBD_ALLOC_PTR(op_data);
140 /* We leak openhandle and request here on error, but not much to be
141 * done in OOM case since app won't retry close on error either. */
143 GOTO(out, rc = -ENOMEM);
145 ll_prepare_close(inode, op_data, och);
147 case MDS_CLOSE_LAYOUT_MERGE:
148 /* merge blocks from the victim inode */
149 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
150 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* fallthrough: merge reuses the swap packing (bias, lease, victim fid) */
151 case MDS_CLOSE_LAYOUT_SWAP:
152 LASSERT(data != NULL);
153 op_data->op_bias |= bias;
154 op_data->op_data_version = 0;
155 op_data->op_lease_handle = och->och_lease_handle;
156 op_data->op_fid2 = *ll_inode2fid(data);
159 case MDS_HSM_RELEASE:
160 LASSERT(data != NULL);
161 op_data->op_bias |= MDS_HSM_RELEASE;
/* the data version the release was computed against */
162 op_data->op_data_version = *(__u64 *)data;
163 op_data->op_lease_handle = och->och_lease_handle;
164 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
168 LASSERT(data == NULL);
172 rc = md_close(md_exp, op_data, och->och_mod, &req);
/* -EINTR is expected when the caller was signalled; don't spam the log */
173 if (rc != 0 && rc != -EINTR)
174 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
175 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
/* When an intent was requested, check whether the server executed it. */
177 if (rc == 0 && op_data->op_bias & bias) {
178 struct mdt_body *body;
180 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
181 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
185 ll_finish_md_op_data(op_data);
/* Invalidate the handle so a stale och cannot be replayed or reused. */
189 md_clear_open_replay_data(md_exp, och);
190 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
193 ptlrpc_req_finished(req); /* This is close request */
/* Drop one mode-specific MDS open handle (write/exec/read) for @inode.
 * Only actually sends the close when the matching use count has reached
 * zero; otherwise the shared handle stays open for the remaining users. */
197 int ll_md_real_close(struct inode *inode, fmode_t fmode)
199 struct ll_inode_info *lli = ll_i2info(inode);
200 struct obd_client_handle **och_p;
201 struct obd_client_handle *och;
/* Pick the och slot/usecount matching the open mode. */
206 if (fmode & FMODE_WRITE) {
207 och_p = &lli->lli_mds_write_och;
208 och_usecount = &lli->lli_open_fd_write_count;
209 } else if (fmode & FMODE_EXEC) {
210 och_p = &lli->lli_mds_exec_och;
211 och_usecount = &lli->lli_open_fd_exec_count;
213 LASSERT(fmode & FMODE_READ);
214 och_p = &lli->lli_mds_read_och;
215 och_usecount = &lli->lli_open_fd_read_count;
218 mutex_lock(&lli->lli_och_mutex);
219 if (*och_usecount > 0) {
220 /* There are still users of this handle, so skip
222 mutex_unlock(&lli->lli_och_mutex);
228 mutex_unlock(&lli->lli_och_mutex);
231 /* There might be a race and this handle may already
233 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/* Per-file-descriptor close: release group lock, leftover lease and fd_och
 * if present, decrement the mode use count, and only talk to the MDS when
 * no cached OPEN lock can stand in for the close.  Frees @fd at the end. */
239 static int ll_md_close(struct inode *inode, struct file *file)
241 union ldlm_policy_data policy = {
242 .l_inodebits = { MDS_INODELOCK_OPEN },
/* TEST_LOCK: only probe for a granted OPEN lock, do not take a reference */
244 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
245 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
246 struct ll_inode_info *lli = ll_i2info(inode);
247 struct lustre_handle lockh;
248 enum ldlm_mode lockmode;
252 /* clear group lock, if present */
253 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
254 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
256 if (fd->fd_lease_och != NULL) {
259 /* Usually the lease is not released when the
260 * application crashed, we need to release here. */
261 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
262 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
263 PFID(&lli->lli_fid), rc, lease_broken);
265 fd->fd_lease_och = NULL;
/* fd_och holds handle ownership taken for a lease; close it directly. */
268 if (fd->fd_och != NULL) {
269 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
274 /* Let's see if we have good enough OPEN lock on the file and if
275 we can skip talking to MDS */
276 mutex_lock(&lli->lli_och_mutex);
277 if (fd->fd_omode & FMODE_WRITE) {
279 LASSERT(lli->lli_open_fd_write_count);
280 lli->lli_open_fd_write_count--;
281 } else if (fd->fd_omode & FMODE_EXEC) {
283 LASSERT(lli->lli_open_fd_exec_count);
284 lli->lli_open_fd_exec_count--;
287 LASSERT(lli->lli_open_fd_read_count);
288 lli->lli_open_fd_read_count--;
290 mutex_unlock(&lli->lli_och_mutex);
/* No matching cached OPEN lock -> must send the real close to the MDS. */
292 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
293 LDLM_IBITS, &policy, lockmode, &lockh))
294 rc = ll_md_real_close(inode, fd->fd_omode);
297 LUSTRE_FPRIVATE(file) = NULL;
298 ll_file_data_put(fd);
303 /* While this returns an error code, fput() the caller does not, so we need
304 * to make every effort to clean up all of our state here. Also, applications
305 * rarely check close errors and even if an error is returned they will not
306 * re-try the close call.
308 int ll_file_release(struct inode *inode, struct file *file)
310 struct ll_file_data *fd;
311 struct ll_sb_info *sbi = ll_i2sbi(inode);
312 struct ll_inode_info *lli = ll_i2info(inode);
316 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
317 PFID(ll_inode2fid(inode)), inode);
/* Don't count releases of the root dentry in the per-sb stats. */
319 if (inode->i_sb->s_root != file_dentry(file))
320 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
321 fd = LUSTRE_FPRIVATE(file);
324 /* The last ref on @file, maybe not the the owner pid of statahead,
325 * because parent and child process can share the same file handle. */
326 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
327 ll_deauthorize_statahead(inode, fd);
/* Root dir never has an MDS open handle; just drop the fd and be done. */
329 if (inode->i_sb->s_root == file_dentry(file)) {
330 LUSTRE_FPRIVATE(file) = NULL;
331 ll_file_data_put(fd);
/* Surface any deferred async write error to this close, then reset it. */
335 if (!S_ISDIR(inode->i_mode)) {
336 if (lli->lli_clob != NULL)
337 lov_read_and_clear_async_rc(lli->lli_clob);
338 lli->lli_async_rc = 0;
341 rc = ll_md_close(inode, file);
/* fault-injection hook: dump debug log on close when requested */
343 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
344 libcfs_debug_dumplog();
/* Send an intent-OPEN (by fid) to the MDS for @de, carrying optional striping
 * data @lmm/@lmmsize.  On success, updates the inode from the reply and sets
 * the returned lock data.  Keeps a quiet exit path for -ESTALE to avoid log
 * flooding, and maps ENOENT-on-create to -ESTALE so the VFS revalidates. */
349 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
350 struct lookup_intent *itp)
352 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
353 struct dentry *parent = de->d_parent;
354 const char *name = NULL;
356 struct md_op_data *op_data;
357 struct ptlrpc_request *req = NULL;
361 LASSERT(parent != NULL);
362 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
364 /* if server supports open-by-fid, or file name is invalid, don't pack
365 * name in open request */
366 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
367 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
368 name = de->d_name.name;
369 len = de->d_name.len;
372 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
373 name, len, 0, LUSTRE_OPC_ANY, NULL);
375 RETURN(PTR_ERR(op_data));
376 op_data->op_data = lmm;
377 op_data->op_data_size = lmmsize;
379 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
380 &ll_md_blocking_ast, 0);
381 ll_finish_md_op_data(op_data);
383 /* reason for keep own exit path - don`t flood log
384 * with messages with -ESTALE errors.
/* Server opened the file but we can't use it: release the open handle. */
386 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
387 it_open_error(DISP_OPEN_OPEN, itp))
389 ll_release_openhandle(de, itp);
393 if (it_disposition(itp, DISP_LOOKUP_NEG))
394 GOTO(out, rc = -ENOENT);
396 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
397 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
398 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
402 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
403 if (!rc && itp->it_lock_mode)
404 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
407 ptlrpc_req_finished(req);
408 ll_intent_drop_lock(itp);
410 /* We did open by fid, but by the time we got to the server,
411 * the object disappeared. If this is a create, we cannot really
412 * tell the userspace that the file it was trying to create
413 * does not exist. Instead let's return -ESTALE, and the VFS will
414 * retry the create with LOOKUP_REVAL that we are going to catch
415 * in ll_revalidate_dentry() and use lookup then.
417 if (rc == -ENOENT && itp->it_op & IT_CREAT)
/* Populate @och (handle, fid, lease cookie, magic, flags) from the server's
 * reply body in @it, then register the open for replay after recovery. */
423 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
424 struct obd_client_handle *och)
426 struct mdt_body *body;
428 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
429 och->och_fh = body->mbo_handle;
430 och->och_fid = body->mbo_fid1;
431 och->och_lease_handle.cookie = it->it_lock_handle;
432 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
433 och->och_flags = it->it_flags;
435 return md_set_open_replay_data(md_exp, och, it);
/* Finish the client-side part of an open: optionally fill @och from the
 * intent reply, attach @fd as the file's private data, and initialize the
 * per-fd readahead state, cl_io context lock, and open mode. */
438 static int ll_local_open(struct file *file, struct lookup_intent *it,
439 struct ll_file_data *fd, struct obd_client_handle *och)
441 struct inode *inode = file_inode(file);
444 LASSERT(!LUSTRE_FPRIVATE(file));
451 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
456 LUSTRE_FPRIVATE(file) = fd;
457 ll_readahead_init(inode, &fd->fd_ras);
458 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
460 /* ll_cl_context initialize */
461 rwlock_init(&fd->fd_lock);
462 INIT_LIST_HEAD(&fd->fd_lccs);
467 /* Open a file, and (for the very first open) create objects on the OSTs at
468 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
469 * creation or open until ll_lov_setstripe() ioctl is called.
471 * If we already have the stripe MD locally then we don't request it in
472 * md_open(), by passing a lmm_size = 0.
474 * It is up to the application to ensure no other processes open this file
475 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
476 * used. We might be able to avoid races of that sort by getting lli_open_sem
477 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
478 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
480 int ll_file_open(struct inode *inode, struct file *file)
482 struct ll_inode_info *lli = ll_i2info(inode);
483 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
484 .it_flags = file->f_flags };
485 struct obd_client_handle **och_p = NULL;
486 __u64 *och_usecount = NULL;
487 struct ll_file_data *fd;
491 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
492 PFID(ll_inode2fid(inode)), inode, file->f_flags);
/* An intent stashed by lookup may carry an already-completed MDS open. */
494 it = file->private_data; /* XXX: compat macro */
495 file->private_data = NULL; /* prevent ll_local_open assertion */
497 fd = ll_file_data_get();
499 GOTO(out_openerr, rc = -ENOMEM);
502 if (S_ISDIR(inode->i_mode))
503 ll_authorize_statahead(inode, fd);
/* Root dentry: no MDS open handle needed, attach fd and return early. */
505 if (inode->i_sb->s_root == file_dentry(file)) {
506 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent (plain open / NFS path): build our own IT_OPEN. */
510 if (!it || !it->it_disposition) {
511 /* Convert f_flags into access mode. We cannot use file->f_mode,
512 * because everything but O_ACCMODE mask was stripped from
/* O_ACCMODE trick: O_RDONLY..O_RDWR + 1 maps to FMODE_READ/WRITE bits */
514 if ((oit.it_flags + 1) & O_ACCMODE)
516 if (file->f_flags & O_TRUNC)
517 oit.it_flags |= FMODE_WRITE;
519 /* kernel only call f_op->open in dentry_open. filp_open calls
520 * dentry_open after call to open_namei that checks permissions.
521 * Only nfsd_open call dentry_open directly without checking
522 * permissions and because of that this code below is safe. */
523 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
524 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
526 /* We do not want O_EXCL here, presumably we opened the file
527 * already? XXX - NFS implications? */
528 oit.it_flags &= ~O_EXCL;
530 /* bug20584, if "it_flags" contains O_CREAT, the file will be
531 * created if necessary, then "IT_CREAT" should be set to keep
532 * consistent with it */
533 if (oit.it_flags & O_CREAT)
534 oit.it_op |= IT_CREAT;
540 /* Let's see if we have file open on MDS already. */
541 if (it->it_flags & FMODE_WRITE) {
542 och_p = &lli->lli_mds_write_och;
543 och_usecount = &lli->lli_open_fd_write_count;
544 } else if (it->it_flags & FMODE_EXEC) {
545 och_p = &lli->lli_mds_exec_och;
546 och_usecount = &lli->lli_open_fd_exec_count;
548 och_p = &lli->lli_mds_read_och;
549 och_usecount = &lli->lli_open_fd_read_count;
552 mutex_lock(&lli->lli_och_mutex);
553 if (*och_p) { /* Open handle is present */
554 if (it_disposition(it, DISP_OPEN_OPEN)) {
555 /* Well, there's extra open request that we do not need,
556 let's close it somehow. This will decref request. */
557 rc = it_open_error(DISP_OPEN_OPEN, it);
559 mutex_unlock(&lli->lli_och_mutex);
560 GOTO(out_openerr, rc);
563 ll_release_openhandle(file_dentry(file), it);
/* Reuse the existing MDS handle: local open only, no och to fill. */
567 rc = ll_local_open(file, it, fd, NULL);
570 mutex_unlock(&lli->lli_och_mutex);
571 GOTO(out_openerr, rc);
574 LASSERT(*och_usecount == 0);
575 if (!it->it_disposition) {
576 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
577 /* We cannot just request lock handle now, new ELC code
578 means that one of other OPEN locks for this file
579 could be cancelled, and since blocking ast handler
580 would attempt to grab och_mutex as well, that would
581 result in a deadlock */
582 mutex_unlock(&lli->lli_och_mutex);
584 * Normally called under two situations:
586 * 2. A race/condition on MDS resulting in no open
587 * handle to be returned from LOOKUP|OPEN request,
588 * for example if the target entry was a symlink.
590 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
591 * marked by a bit set in ll_iget_for_nfs. Clear the
592 * bit so that it's not confusing later callers.
594 * NB; when ldd is NULL, it must have come via normal
595 * lookup path only, since ll_iget_for_nfs always calls
598 if (ldd && ldd->lld_nfs_dentry) {
599 ldd->lld_nfs_dentry = 0;
600 it->it_flags |= MDS_OPEN_LOCK;
604 * Always specify MDS_OPEN_BY_FID because we don't want
605 * to get file with different fid.
607 it->it_flags |= MDS_OPEN_BY_FID;
608 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
611 GOTO(out_openerr, rc);
615 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
617 GOTO(out_och_free, rc = -ENOMEM);
621 /* md_intent_lock() didn't get a request ref if there was an
622 * open error, so don't do cleanup on the request here
624 /* XXX (green): Should not we bail out on any error here, not
625 * just open error? */
626 rc = it_open_error(DISP_OPEN_OPEN, it);
628 GOTO(out_och_free, rc);
630 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
631 "inode %p: disposition %x, status %d\n", inode,
632 it_disposition(it, ~0), it->it_status);
634 rc = ll_local_open(file, it, fd, *och_p);
636 GOTO(out_och_free, rc);
638 mutex_unlock(&lli->lli_och_mutex);
641 /* Must do this outside lli_och_mutex lock to prevent deadlock where
642 different kind of OPEN lock for this same inode gets cancelled
643 by ldlm_cancel_lru */
644 if (!S_ISREG(inode->i_mode))
645 GOTO(out_och_free, rc);
647 cl_lov_delay_create_clear(&file->f_flags);
648 GOTO(out_och_free, rc);
/* error/cleanup path: free a half-initialized och slot */
652 if (och_p && *och_p) {
653 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
654 *och_p = NULL; /* OBD_FREE writes some magic there */
657 mutex_unlock(&lli->lli_och_mutex);
660 if (lli->lli_opendir_key == fd)
661 ll_deauthorize_statahead(inode, fd);
663 ll_file_data_put(fd);
665 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* drop the extra request reference taken by the OPEN enqueue */
668 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
669 ptlrpc_req_finished(it->it_request);
670 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* LDLM blocking AST for lease locks: on a blocking callback, asynchronously
 * cancel the lease lock.  Unlike ll_md_blocking_ast this does not manage an
 * open handle (see the LDLM_FL_EXCL note at the ll_lease_open() call site). */
676 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
677 struct ldlm_lock_desc *desc, void *data, int flag)
680 struct lustre_handle lockh;
684 case LDLM_CB_BLOCKING:
685 ldlm_lock2handle(lock, &lockh);
686 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
688 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
692 case LDLM_CB_CANCELING:
700 * When setting a lease on a file, we take ownership of the lli_mds_*_och
701 * and save it as fd->fd_och so as to force client to reopen the file even
702 * if it has an open lock in cache already.
704 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
705 struct lustre_handle *old_handle)
707 struct ll_inode_info *lli = ll_i2info(inode);
708 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
709 struct obd_client_handle **och_p;
714 /* Get the openhandle of the file */
715 mutex_lock(&lli->lli_och_mutex);
/* A lease already exists on this fd; refuse a second one. */
716 if (fd->fd_lease_och != NULL)
717 GOTO(out_unlock, rc = -EBUSY);
719 if (fd->fd_och == NULL) {
720 if (file->f_mode & FMODE_WRITE) {
721 LASSERT(lli->lli_mds_write_och != NULL);
722 och_p = &lli->lli_mds_write_och;
723 och_usecount = &lli->lli_open_fd_write_count;
725 LASSERT(lli->lli_mds_read_och != NULL);
726 och_p = &lli->lli_mds_read_och;
727 och_usecount = &lli->lli_open_fd_read_count;
/* Handle is shared by other opens; can't take exclusive ownership. */
730 if (*och_usecount > 1)
731 GOTO(out_unlock, rc = -EBUSY);
/* Report the existing open handle so the MDT can match the owner. */
738 *old_handle = fd->fd_och->och_fh;
742 mutex_unlock(&lli->lli_och_mutex);
747 * Release ownership on lli_mds_*_och when putting back a file lease.
749 static int ll_lease_och_release(struct inode *inode, struct file *file)
751 struct ll_inode_info *lli = ll_i2info(inode);
752 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
753 struct obd_client_handle **och_p;
754 struct obd_client_handle *old_och = NULL;
759 mutex_lock(&lli->lli_och_mutex);
/* Select the mode-matching shared och slot to hand fd_och back to. */
760 if (file->f_mode & FMODE_WRITE) {
761 och_p = &lli->lli_mds_write_och;
762 och_usecount = &lli->lli_open_fd_write_count;
764 och_p = &lli->lli_mds_read_och;
765 och_usecount = &lli->lli_open_fd_read_count;
768 /* The file may have been open by another process (broken lease) so
769 * *och_p is not NULL. In this case we should simply increase usecount
772 if (*och_p != NULL) {
/* slot occupied: the private handle becomes redundant, close it below */
773 old_och = fd->fd_och;
780 mutex_unlock(&lli->lli_och_mutex);
/* Close the redundant handle outside the mutex. */
783 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
789 * Acquire a lease and open the file.
791 static struct obd_client_handle *
792 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
795 struct lookup_intent it = { .it_op = IT_OPEN };
796 struct ll_sb_info *sbi = ll_i2sbi(inode);
797 struct md_op_data *op_data;
798 struct ptlrpc_request *req = NULL;
799 struct lustre_handle old_handle = { 0 };
800 struct obd_client_handle *och = NULL;
/* A lease must be exactly read or exactly write, nothing else. */
805 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
806 RETURN(ERR_PTR(-EINVAL));
/* The fd must already be open in a compatible (non-exec) mode. */
809 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
810 RETURN(ERR_PTR(-EPERM));
812 rc = ll_lease_och_acquire(inode, file, &old_handle);
819 RETURN(ERR_PTR(-ENOMEM));
821 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
822 LUSTRE_OPC_ANY, NULL);
824 GOTO(out, rc = PTR_ERR(op_data));
826 /* To tell the MDT this openhandle is from the same owner */
827 op_data->op_handle = old_handle;
829 it.it_flags = fmode | open_flags;
830 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
831 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
832 &ll_md_blocking_lease_ast,
833 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
834 * it can be cancelled which may mislead applications that the lease is
836 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
837 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
838 * doesn't deal with openhandle, so normal openhandle will be leaked. */
839 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
840 ll_finish_md_op_data(op_data);
841 ptlrpc_req_finished(req);
843 GOTO(out_release_it, rc);
845 if (it_disposition(&it, DISP_LOOKUP_NEG))
846 GOTO(out_release_it, rc = -ENOENT);
848 rc = it_open_error(DISP_OPEN_OPEN, &it);
850 GOTO(out_release_it, rc);
852 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
853 ll_och_fill(sbi->ll_md_exp, &it, och);
855 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
856 GOTO(out_close, rc = -EOPNOTSUPP);
858 /* already get lease, handle lease lock */
859 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
/* A lease must be backed by an OPEN ibits lock; anything else is broken. */
860 if (it.it_lock_mode == 0 ||
861 it.it_lock_bits != MDS_INODELOCK_OPEN) {
862 /* open lock must return for lease */
863 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
864 PFID(ll_inode2fid(inode)), it.it_lock_mode,
866 GOTO(out_close, rc = -EPROTO);
869 ll_intent_release(&it);
873 /* Cancel open lock */
874 if (it.it_lock_mode != 0) {
875 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
878 och->och_lease_handle.cookie = 0ULL;
880 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
882 CERROR("%s: error closing file "DFID": %d\n",
883 ll_get_fsname(inode->i_sb, NULL, 0),
884 PFID(&ll_i2info(inode)->lli_fid), rc2);
885 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
887 ll_intent_release(&it);
895 * Check whether a layout swap can be done between two inodes.
897 * \param[in] inode1 First inode to check
898 * \param[in] inode2 Second inode to check
900 * \retval 0 on success, layout swap can be performed between both inodes
901 * \retval negative error code if requirements are not met
/* Both inodes must be regular files, writable by the caller, and live on
 * the same filesystem for a layout swap to be permitted. */
903 static int ll_check_swap_layouts_validity(struct inode *inode1,
904 struct inode *inode2)
906 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
909 if (inode_permission(inode1, MAY_WRITE) ||
910 inode_permission(inode2, MAY_WRITE))
913 if (inode1->i_sb != inode2->i_sb)
/* Close @och on @inode with a layout-swap or layout-merge intent against
 * @inode2, after validating the pair.  Swapping an inode with itself is
 * rejected (fid compare).  @och is consumed in all paths. */
919 static int ll_swap_layouts_close(struct obd_client_handle *och,
920 struct inode *inode, struct inode *inode2,
923 const struct lu_fid *fid1 = ll_inode2fid(inode);
924 const struct lu_fid *fid2;
925 enum mds_op_bias bias;
929 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
930 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
932 rc = ll_check_swap_layouts_validity(inode, inode2);
934 GOTO(out_free_och, rc);
936 /* We now know that inode2 is a lustre inode */
937 fid2 = ll_inode2fid(inode2);
/* identical fids => swapping a file with itself, reject */
939 rc = lu_fid_cmp(fid1, fid2);
941 GOTO(out_free_och, rc = -EINVAL);
944 case SWAP_LAYOUTS_CLOSE:
945 bias = MDS_CLOSE_LAYOUT_SWAP;
947 case MERGE_LAYOUTS_CLOSE:
948 bias = MDS_CLOSE_LAYOUT_MERGE;
951 GOTO(out_free_och, rc = -EOPNOTSUPP);
954 /* Close the file and {swap,merge} layouts between inode & inode2.
955 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
956 * because we still need it to pack l_remote_handle to MDT. */
957 rc = ll_close_inode_openhandle(inode, och, bias, inode2);
959 och = NULL; /* freed in ll_close_inode_openhandle() */
969 * Release lease and close the file.
970 * It will check if the lease has ever broken.
972 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
975 struct ldlm_lock *lock;
/* Default to "broken" if the lease lock is no longer findable. */
976 bool cancelled = true;
980 lock = ldlm_handle2lock(&och->och_lease_handle);
982 lock_res_and_lock(lock);
983 cancelled = ldlm_is_cancel(lock);
984 unlock_res_and_lock(lock);
988 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
989 PFID(&ll_i2info(inode)->lli_fid), cancelled);
/* Lease still intact: cancel the lease lock ourselves before closing. */
992 ldlm_cli_cancel(&och->och_lease_handle, 0);
994 if (lease_broken != NULL)
995 *lease_broken = cancelled;
997 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/* Merge MDS-provided attributes cached in lli with OST-side attributes from
 * the cl_object: take the newest of each timestamp and adopt the OST size
 * and block count.  Runs under the inode size lock. */
1001 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1003 struct ll_inode_info *lli = ll_i2info(inode);
1004 struct cl_object *obj = lli->lli_clob;
1005 struct cl_attr *attr = vvp_env_thread_attr(env);
1013 ll_inode_size_lock(inode);
1015 /* Merge timestamps the most recently obtained from MDS with
1016 * timestamps obtained from OSTs.
1018 * Do not overwrite atime of inode because it may be refreshed
1019 * by file_accessed() function. If the read was served by cache
1020 * data, there is no RPC to be sent so that atime may not be
1021 * transferred to OSTs at all. MDT only updates atime at close time
1022 * if it's at least 'mdd.*.atime_diff' older.
1023 * All in all, the atime in Lustre does not strictly comply with
1024 * POSIX. Solving this problem needs to send an RPC to MDT for each
1025 * read, this will hurt performance. */
1026 if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
1027 LTIME_S(inode->i_atime) = lli->lli_atime;
1028 lli->lli_update_atime = 0;
1030 LTIME_S(inode->i_mtime) = lli->lli_mtime;
1031 LTIME_S(inode->i_ctime) = lli->lli_ctime;
/* Snapshot the MDS-merged times before folding in OST values. */
1033 atime = LTIME_S(inode->i_atime);
1034 mtime = LTIME_S(inode->i_mtime);
1035 ctime = LTIME_S(inode->i_ctime);
1037 cl_object_attr_lock(obj);
1038 rc = cl_object_attr_get(env, obj, attr);
1039 cl_object_attr_unlock(obj);
/* -ENODATA (e.g. no OST objects yet) is not an error for the caller. */
1042 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1044 if (atime < attr->cat_atime)
1045 atime = attr->cat_atime;
1047 if (ctime < attr->cat_ctime)
1048 ctime = attr->cat_ctime;
1050 if (mtime < attr->cat_mtime)
1051 mtime = attr->cat_mtime;
1053 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1054 PFID(&lli->lli_fid), attr->cat_size);
1056 i_size_write(inode, attr->cat_size);
1057 inode->i_blocks = attr->cat_blocks;
1059 LTIME_S(inode->i_atime) = atime;
1060 LTIME_S(inode->i_mtime) = mtime;
1061 LTIME_S(inode->i_ctime) = ctime;
1064 ll_inode_size_unlock(inode);
1070 * Set designated mirror for I/O.
1072 * So far only read, write, and truncated can support to issue I/O to
1073 * designated mirror.
/* Propagate a per-fd designated FLR mirror into the cl_io, disabling
 * parallel I/O (ci_pio) since the mirror id cannot be passed to ptasks. */
1075 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1077 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1079 /* FLR: disable non-delay for designated mirror I/O because obviously
1080 * only one mirror is available */
1081 if (fd->fd_designated_mirror > 0) {
1083 io->ci_designated_mirror = fd->fd_designated_mirror;
1084 io->ci_pio = 0; /* doesn't have a mechanism to pass mirror
1088 CDEBUG(D_VFSTRACE, "%s: desiginated mirror: %d\n",
1089 file->f_path.dentry->d_name.name, io->ci_designated_mirror);
/* Decide whether atime updates are suppressed for this open, checking the
 * O_NOATIME flag, inode/superblock noatime state, and mount flags. */
1092 static bool file_is_noatime(const struct file *file)
1094 const struct vfsmount *mnt = file->f_path.mnt;
1095 const struct inode *inode = file_inode((struct file *)file);
1097 /* Adapted from file_accessed() and touch_atime().*/
1098 if (file->f_flags & O_NOATIME)
1101 if (inode->i_flags & S_NOATIME)
1104 if (IS_NOATIME(inode))
1107 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1110 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1113 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1119 static int ll_file_io_ptask(struct cfs_ptask *ptask);
/* Initialize a cl_io for a read or write on @file: set up the rw iter/iocb,
 * locking policy (never for nolock mounts, mandatory for O_APPEND), noatime,
 * parallel-I/O eligibility, FLR non-delay, and the designated mirror. */
1121 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1123 struct inode *inode = file_inode(file);
1124 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1126 memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter));
1127 init_sync_kiocb(&io->u.ci_rw.rw_iocb, file);
1128 io->u.ci_rw.rw_file = file;
1129 io->u.ci_rw.rw_ptask = ll_file_io_ptask;
1130 io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK);
1131 io->ci_lock_no_expand = fd->ll_lock_no_expand;
1133 if (iot == CIT_WRITE) {
1134 io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND);
1135 io->u.ci_rw.rw_sync = !!(file->f_flags & O_SYNC ||
1136 file->f_flags & O_DIRECT ||
1139 io->ci_obj = ll_i2info(inode)->lli_clob;
1140 io->ci_lockreq = CILR_MAYBE;
1141 if (ll_file_nolock(file)) {
1142 io->ci_lockreq = CILR_NEVER;
1143 io->ci_no_srvlock = 1;
1144 } else if (file->f_flags & O_APPEND) {
1145 io->ci_lockreq = CILR_MANDATORY;
1147 io->ci_noatime = file_is_noatime(file);
/* parallel I/O is opt-in via the PIO mount flag and unusable for append */
1148 if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO)
1149 io->ci_pio = !io->u.ci_rw.rw_append;
1153 /* FLR: only use non-delay I/O for read as there is only one
1154 * avaliable mirror for write. */
1155 io->ci_ndelay = !(iot == CIT_WRITE);
1157 ll_io_set_mirror(io, file);
/* Parallel-task worker for a slice of a read/write: rebuild a cl_io from the
 * cl_io_pt descriptor, run the cl_io loop, and accumulate the byte count and
 * iter/iocb progress back into the descriptor.  ci_pio is cleared because
 * this code already runs inside a parallel task. */
1160 static int ll_file_io_ptask(struct cfs_ptask *ptask)
1162 struct cl_io_pt *pt = ptask->pt_cbdata;
1163 struct file *file = pt->cip_file;
1166 loff_t pos = pt->cip_pos;
1171 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1172 file_dentry(file)->d_name.name,
1173 pt->cip_iot == CIT_READ ? "read" : "write",
1174 pos, pos + pt->cip_count);
1176 env = cl_env_get(&refcheck);
1178 RETURN(PTR_ERR(env));
1180 io = vvp_env_thread_io(env);
1181 ll_io_init(io, file, pt->cip_iot);
1182 io->u.ci_rw.rw_iter = pt->cip_iter;
1183 io->u.ci_rw.rw_iocb = pt->cip_iocb;
1184 io->ci_pio = 0; /* It's already in parallel task */
/* cip_result tracks bytes done so far in case this slice is restarted */
1186 rc = cl_io_rw_init(env, io, pt->cip_iot, pos,
1187 pt->cip_count - pt->cip_result);
1189 struct vvp_io *vio = vvp_env_io(env);
1191 vio->vui_io_subtype = IO_NORMAL;
1192 vio->vui_fd = LUSTRE_FPRIVATE(file);
1194 ll_cl_add(file, env, io, LCC_RW);
1195 rc = cl_io_loop(env, io);
1196 ll_cl_remove(file, env);
1198 /* cl_io_rw_init() handled IO */
/* fault-injection point for testing the ptask error path */
1202 if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) {
1208 if (io->ci_nob > 0) {
1209 pt->cip_result += io->ci_nob;
1210 iov_iter_advance(&pt->cip_iter, io->ci_nob);
1212 pt->cip_iocb.ki_pos = pos;
1213 #ifdef HAVE_KIOCB_KI_LEFT
1214 pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result;
1215 #elif defined(HAVE_KI_NBYTES)
1216 pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result;
1220 cl_io_fini(env, io);
1221 cl_env_put(env, &refcheck);
1223 pt->cip_need_restart = io->ci_need_restart;
1225 CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
1226 file_dentry(file)->d_name.name,
1227 pt->cip_iot == CIT_READ ? "read" : "write",
1228 pt->cip_result, rc);
1230 RETURN(pt->cip_result > 0 ? 0 : rc);
/*
 * Common entry for all buffered/direct/splice read and write paths.
 *
 * Initializes a cl_io for the [pos, pos + count) range, takes the llite
 * range lock where required, drives the I/O via cl_io_loop() (restarting
 * on layout change), and maintains per-fd write-failure state and stats.
 * Returns bytes transferred, or a negative errno.
 * NOTE(review): this excerpt elides several lines (pos/result/rc/io
 * declarations, some braces and labels).
 */
1234 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1235 struct file *file, enum cl_io_type iot,
1236 loff_t *ppos, size_t count)
1238 struct range_lock range;
1239 struct vvp_io *vio = vvp_env_io(env);
1240 struct inode *inode = file_inode(file);
1241 struct ll_inode_info *lli = ll_i2info(inode);
1242 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
/* retried is carried across restarts for FLR mirror selection. */
1247 unsigned retried = 0;
1248 bool restarted = false;
1252 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1253 file_dentry(file)->d_name.name,
1254 iot == CIT_READ ? "read" : "write", pos, pos + count);
1257 io = vvp_env_thread_io(env);
1258 ll_io_init(io, file, iot);
1259 if (args->via_io_subtype == IO_NORMAL) {
1260 io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
1261 io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;
1263 if (args->via_io_subtype != IO_NORMAL || restarted)
1265 io->ci_ndelay_tried = retried;
1267 if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
1268 bool range_locked = false;
/* O_APPEND writes must serialize against the whole file. */
1270 if (file->f_flags & O_APPEND)
1271 range_lock_init(&range, 0, LUSTRE_EOF);
1273 range_lock_init(&range, pos, pos + count - 1);
1275 vio->vui_fd = LUSTRE_FPRIVATE(file);
1276 vio->vui_io_subtype = args->via_io_subtype;
1278 switch (vio->vui_io_subtype) {
1280 /* Direct IO reads must also take range lock,
1281 * or multiple reads will try to work on the same pages
1282 * See LU-6227 for details. */
1283 if (((iot == CIT_WRITE) ||
1284 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1285 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1286 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1288 rc = range_lock(&lli->lli_write_tree, &range);
1292 range_locked = true;
1296 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1297 vio->u.splice.vui_flags = args->u.splice.via_flags;
1300 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1304 ll_cl_add(file, env, io, LCC_RW);
/* Non-pio path normally holds i_mutex via VFS; for parallel writes on
 * SEC-sensitive inodes take it here, flagged through lli_inode_locked. */
1305 if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) &&
1306 !lli->lli_inode_locked) {
1308 lli->lli_inode_locked = 1;
1310 rc = cl_io_loop(env, io);
1311 if (lli->lli_inode_locked) {
1312 lli->lli_inode_locked = 0;
1313 inode_unlock(inode);
1315 ll_cl_remove(file, env);
1318 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1320 range_unlock(&lli->lli_write_tree, &range);
1323 /* cl_io_rw_init() handled IO */
/* Fold this pass's byte count into the running total and advance the
 * caller's iterator/iocb so a restart resumes at the right spot. */
1327 if (io->ci_nob > 0) {
1328 result += io->ci_nob;
1329 count -= io->ci_nob;
1331 if (args->via_io_subtype == IO_NORMAL) {
1332 iov_iter_advance(args->u.normal.via_iter, io->ci_nob);
1334 args->u.normal.via_iocb->ki_pos = pos;
1335 #ifdef HAVE_KIOCB_KI_LEFT
1336 args->u.normal.via_iocb->ki_left = count;
1337 #elif defined(HAVE_KI_NBYTES)
1338 args->u.normal.via_iocb->ki_nbytes = count;
1342 pos = io->u.ci_rw.rw_range.cir_pos;
1346 cl_io_fini(env, io);
1349 "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1350 file->f_path.dentry->d_name.name,
1351 iot, rc, result, io->ci_need_restart);
/* Layout changed mid-I/O: loop again for the remaining bytes. */
1353 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1355 "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1356 file_dentry(file)->d_name.name,
1357 iot == CIT_READ ? "read" : "write",
1358 pos, pos + count, result, rc);
1359 /* preserve the tried count for FLR */
1360 retried = io->ci_ndelay_tried;
/* Per-mount statistics and sticky write-failure flag for fsync. */
1365 if (iot == CIT_READ) {
1367 ll_stats_ops_tally(ll_i2sbi(inode),
1368 LPROC_LL_READ_BYTES, result);
1369 } else if (iot == CIT_WRITE) {
1371 ll_stats_ops_tally(ll_i2sbi(inode),
1372 LPROC_LL_WRITE_BYTES, result);
1373 fd->fd_write_failed = false;
1374 } else if (result == 0 && rc == 0) {
1377 fd->fd_write_failed = true;
1379 fd->fd_write_failed = false;
1380 } else if (rc != -ERESTARTSYS) {
1381 fd->fd_write_failed = true;
1385 CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n",
1386 file_dentry(file)->d_name.name,
1387 iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc);
1391 RETURN(result > 0 ? result : rc);
1395 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1396 * especially for small I/O.
1398 * To serve a read request, CLIO has to create and initialize a cl_io and
1399 * then request DLM lock. This has turned out to have significant overhead
1400 * and affects the performance of small I/O dramatically.
1402 * It's not necessary to create a cl_io for each I/O. Under the help of read
1403 * ahead, most of the pages being read are already in memory cache and we can
1404 * read those pages directly because if the pages exist, the corresponding DLM
1405 * lock must exist so that page content must be valid.
1407 * In fast read implementation, the llite speculatively finds and reads pages
1408 * in memory cache. There are three scenarios for fast read:
1409 * - If the page exists and is uptodate, kernel VM will provide the data and
1410 * CLIO won't be intervened;
1411 * - If the page was brought into memory by read ahead, it will be exported
1412 * and read ahead parameters will be updated;
1413 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1414 * it will go back and invoke normal read, i.e., a cl_io will be created
1415 * and DLM lock will be requested.
1417 * POSIX compliance: posix standard states that read is intended to be atomic.
1418 * Lustre read implementation is in line with Linux kernel read implementation
1419 * and neither of them complies with POSIX standard in this matter. Fast read
1420 * doesn't make the situation worse on single node but it may interleave write
1421 * results from multiple nodes due to short read handling in ll_file_aio_read().
1423 * \param env - lu_env
1424 * \param iocb - kiocb from kernel
1425 * \param iter - user space buffers where the data will be copied
1427 * \retval - number of bytes have been read, or error code if error occurred.
/*
 * Fast-read path: serve the read straight from the page cache via
 * generic_file_read_iter(), bypassing cl_io setup entirely.  -ENODATA
 * from ll_readpage() means the page is not cached and the caller must
 * fall back to the normal CLIO read path.
 */
1430 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
/* Respect the per-superblock fast_read tunable. */
1434 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1437 /* NB: we can't do direct IO for fast read because it will need a lock
1438 * to make IO engine happy. */
1439 if (iocb->ki_filp->f_flags & O_DIRECT)
1442 result = generic_file_read_iter(iocb, iter);
1444 /* If the first page is not in cache, generic_file_aio_read() will be
1445 * returned with -ENODATA.
1446 * See corresponding code in ll_readpage(). */
1447 if (result == -ENODATA)
1451 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1452 LPROC_LL_READ_BYTES, result);
1458 * Read from a file (through the page cache).
/*
 * read_iter file operation: try the fast (page-cache) read first, then
 * run any remainder through the generic CLIO path.  The partial fast-read
 * result is combined with the generic result before returning.
 */
1460 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1463 struct vvp_io_args *args;
1468 result = ll_do_fast_read(iocb, to);
/* Fully satisfied (or failed) by fast read: no CLIO round needed. */
1469 if (result < 0 || iov_iter_count(to) == 0)
1472 env = cl_env_get(&refcheck);
1474 return PTR_ERR(env);
1476 args = ll_env_args(env, IO_NORMAL);
1477 args->u.normal.via_iter = to;
1478 args->u.normal.via_iocb = iocb;
1480 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1481 &iocb->ki_pos, iov_iter_count(to));
1484 else if (result == 0)
1487 cl_env_put(env, &refcheck);
1493 * Write to a file (through the page cache).
/*
 * write_iter file operation: forward the whole write through the common
 * CLIO path (ll_file_io_generic) with IO_NORMAL args.
 */
1495 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1497 struct vvp_io_args *args;
1502 env = cl_env_get(&refcheck);
1504 return PTR_ERR(env);
1506 args = ll_env_args(env, IO_NORMAL);
1507 args->u.normal.via_iter = from;
1508 args->u.normal.via_iocb = iocb;
1510 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1511 &iocb->ki_pos, iov_iter_count(from));
1512 cl_env_put(env, &refcheck);
1516 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1518 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array and compute the total byte count, trimming
 * *nr_segs at the first segment whose user address fails access_ok().
 * Copied from the kernel's __generic_file_aio_write_nolock (see the XXX
 * note above); only built when read_iter/write_iter are unavailable.
 */
1520 static int ll_file_get_iov_count(const struct iovec *iov,
1521 unsigned long *nr_segs, size_t *count)
1526 for (seg = 0; seg < *nr_segs; seg++) {
1527 const struct iovec *iv = &iov[seg];
1530 * If any segment has a negative length, or the cumulative
1531 * length ever wraps negative then return -EINVAL.
1534 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1536 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1541 cnt -= iv->iov_len; /* This segment is no good */
/*
 * Legacy aio_read entry: validate the iovec, wrap it in an iov_iter
 * (API differs by kernel version) and delegate to ll_file_read_iter().
 */
1548 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1549 unsigned long nr_segs, loff_t pos)
1556 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1560 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1561 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1562 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1563 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1564 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1566 result = ll_file_read_iter(iocb, &to);
1571 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1574 struct iovec iov = { .iov_base = buf, .iov_len = count };
1579 init_sync_kiocb(&kiocb, file);
1580 kiocb.ki_pos = *ppos;
1581 #ifdef HAVE_KIOCB_KI_LEFT
1582 kiocb.ki_left = count;
1583 #elif defined(HAVE_KI_NBYTES)
1584 kiocb.i_nbytes = count;
1587 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1588 *ppos = kiocb.ki_pos;
1594 * Write to a file (through the page cache).
/*
 * Legacy aio_write entry: validate the iovec, build an iov_iter
 * (version-dependent API) and delegate to ll_file_write_iter().
 */
1597 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1598 unsigned long nr_segs, loff_t pos)
1600 struct iov_iter from;
1605 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1609 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1610 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1611 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1612 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1613 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1615 result = ll_file_write_iter(iocb, &from);
/*
 * Legacy synchronous write entry: uses the per-env kiocb from lu_env
 * (lti_kiocb) rather than a stack kiocb, then delegates to
 * ll_file_aio_write() and propagates the new file position to *ppos.
 */
1620 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1621 size_t count, loff_t *ppos)
1624 struct iovec iov = { .iov_base = (void __user *)buf,
1626 struct kiocb *kiocb;
1631 env = cl_env_get(&refcheck);
1633 RETURN(PTR_ERR(env));
1635 kiocb = &ll_env_info(env)->lti_kiocb;
1636 init_sync_kiocb(kiocb, file);
1637 kiocb->ki_pos = *ppos;
1638 #ifdef HAVE_KIOCB_KI_LEFT
1639 kiocb->ki_left = count;
1640 #elif defined(HAVE_KI_NBYTES)
1641 kiocb->ki_nbytes = count;
1644 result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos);
1645 *ppos = kiocb->ki_pos;
1647 cl_env_put(env, &refcheck);
1650 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1653 * Send file content (through pagecache) somewhere with helper
/*
 * splice_read file operation: route the pagecache-to-pipe transfer
 * through ll_file_io_generic() with IO_SPLICE args so it gets the same
 * locking/restart handling as regular reads.
 */
1655 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1656 struct pipe_inode_info *pipe, size_t count,
1660 struct vvp_io_args *args;
1665 env = cl_env_get(&refcheck);
1667 RETURN(PTR_ERR(env));
1669 args = ll_env_args(env, IO_SPLICE);
1670 args->u.splice.via_pipe = pipe;
1671 args->u.splice.via_flags = flags;
1673 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1674 cl_env_put(env, &refcheck);
/*
 * Apply a striping EA (lov_user_md) to an inode by re-opening it by FID
 * with the layout attached, then immediately releasing the open handle.
 * Runs under the inode size lock; releases the intent before returning.
 */
1678 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1679 __u64 flags, struct lov_user_md *lum, int lum_size)
1681 struct lookup_intent oit = {
1683 .it_flags = flags | MDS_OPEN_BY_FID,
1688 ll_inode_size_lock(inode);
1689 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1691 GOTO(out_unlock, rc);
/* The open was only a vehicle for the layout; close it right away. */
1693 ll_release_openhandle(dentry, &oit);
1696 ll_inode_size_unlock(inode);
1697 ll_intent_release(&oit);
/*
 * Fetch the LOV striping EA for @filename (a child of @inode) from the
 * MDS.  On success *lmmp/*lmm_size point into the reply buffer, whose
 * ptlrpc request is handed back via *request for the caller to release.
 * Byte-swaps the layout to host endianness on big-endian clients.
 */
1702 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1703 struct lov_mds_md **lmmp, int *lmm_size,
1704 struct ptlrpc_request **request)
1706 struct ll_sb_info *sbi = ll_i2sbi(inode);
1707 struct mdt_body *body;
1708 struct lov_mds_md *lmm = NULL;
1709 struct ptlrpc_request *req = NULL;
1710 struct md_op_data *op_data;
1713 rc = ll_get_default_mdsize(sbi, &lmmsize);
1717 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1718 strlen(filename), lmmsize,
1719 LUSTRE_OPC_ANY, NULL);
1720 if (IS_ERR(op_data))
1721 RETURN(PTR_ERR(op_data));
1723 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1724 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1725 ll_finish_md_op_data(op_data);
1727 CDEBUG(D_INFO, "md_getattr_name failed "
1728 "on %s: rc %d\n", filename, rc);
1732 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1733 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1735 lmmsize = body->mbo_eadatasize;
/* No striping EA present: report -ENODATA rather than garbage. */
1737 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1739 GOTO(out, rc = -ENODATA);
1742 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1743 LASSERT(lmm != NULL);
/* Only plain V1/V3 and composite layouts are understood here. */
1745 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
1746 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
1747 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
1748 GOTO(out, rc = -EPROTO);
1751 * This is coming from the MDS, so is probably in
1752 * little endian. We convert it to host endian before
1753 * passing it to userspace.
1755 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1758 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
1759 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1760 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1761 if (le32_to_cpu(lmm->lmm_pattern) &
1762 LOV_PATTERN_F_RELEASED)
1766 /* if function called for directory - we should
1767 * avoid swab not existent lsm objects */
1768 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1769 lustre_swab_lov_user_md_v1(
1770 (struct lov_user_md_v1 *)lmm);
1771 if (S_ISREG(body->mbo_mode))
1772 lustre_swab_lov_user_md_objects(
1773 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1775 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1776 lustre_swab_lov_user_md_v3(
1777 (struct lov_user_md_v3 *)lmm);
1778 if (S_ISREG(body->mbo_mode))
1779 lustre_swab_lov_user_md_objects(
1780 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1782 } else if (lmm->lmm_magic ==
1783 cpu_to_le32(LOV_MAGIC_COMP_V1)) {
1784 lustre_swab_lov_comp_md_v1(
1785 (struct lov_comp_md_v1 *)lmm);
1791 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: copy a lov_user_md (with one ost_data slot)
 * from userspace and apply it via ll_lov_setstripe_ea_info().
 * Requires CAP_SYS_ADMIN since MDS_OPEN_HAS_OBJS trusts the object list.
 */
1796 static int ll_lov_setea(struct inode *inode, struct file *file,
1799 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1800 struct lov_user_md *lump;
1801 int lum_size = sizeof(struct lov_user_md) +
1802 sizeof(struct lov_user_ost_data);
1806 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1809 OBD_ALLOC_LARGE(lump, lum_size);
1813 if (copy_from_user(lump, arg, lum_size))
1814 GOTO(out_lump, rc = -EFAULT);
1816 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
/* Clear the delayed-create flag regardless of success. */
1818 cl_lov_delay_create_clear(&file->f_flags);
1821 OBD_FREE_LARGE(lump, lum_size);
/*
 * Copy the inode's layout to the userspace lov_user_md buffer @lum of
 * @size bytes via cl_object_getstripe().
 */
1825 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
1832 env = cl_env_get(&refcheck);
1834 RETURN(PTR_ERR(env));
1836 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
1837 cl_env_put(env, &refcheck);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy the user layout, apply it, then
 * refresh the layout generation and copy the instantiated stripe info
 * back to userspace so tools see the server-assigned values.
 */
1841 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1844 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1845 struct lov_user_md *klum;
1847 __u64 flags = FMODE_WRITE;
1850 rc = ll_copy_user_md(lum, &klum);
1855 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
/* Zero stripe_count first so a short getstripe reply is well defined. */
1860 rc = put_user(0, &lum->lmm_stripe_count);
1864 rc = ll_layout_refresh(inode, &gen);
1868 rc = ll_file_getstripe(inode, arg, lum_size);
1870 cl_lov_delay_create_clear(&file->f_flags);
1873 OBD_FREE(klum, lum_size);
/*
 * LL_IOC_GROUP_LOCK handler: take a cluster-wide group lock with group
 * id @arg on the file's object and record it in the per-fd state.
 * Fails if the fd already holds a group lock or the file is nolock.
 */
1878 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1880 struct ll_inode_info *lli = ll_i2info(inode);
1881 struct cl_object *obj = lli->lli_clob;
1882 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1883 struct ll_grouplock grouplock;
1888 CWARN("group id for group lock must not be 0\n");
1892 if (ll_file_nolock(file))
1893 RETURN(-EOPNOTSUPP);
/* lli_lock guards the per-fd group-lock flag/state. */
1895 spin_lock(&lli->lli_lock);
1896 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1897 CWARN("group lock already existed with gid %lu\n",
1898 fd->fd_grouplock.lg_gid);
1899 spin_unlock(&lli->lli_lock);
1902 LASSERT(fd->fd_grouplock.lg_lock == NULL);
1903 spin_unlock(&lli->lli_lock);
1906 * XXX: group lock needs to protect all OST objects while PFL
1907 * can add new OST objects during the IO, so we'd instantiate
1908 * all OST objects before getting its group lock.
1913 struct cl_layout cl = {
1914 .cl_is_composite = false,
1917 env = cl_env_get(&refcheck);
1919 RETURN(PTR_ERR(env));
1921 rc = cl_object_layout_get(env, obj, &cl);
/* Composite (PFL) layout: force full instantiation first. */
1922 if (!rc && cl.cl_is_composite)
1923 rc = ll_layout_write_intent(inode, 0, OBD_OBJECT_EOF);
1925 cl_env_put(env, &refcheck);
1930 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
1931 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* Re-check under the lock: another thread may have raced us here. */
1935 spin_lock(&lli->lli_lock);
1936 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1937 spin_unlock(&lli->lli_lock);
1938 CERROR("another thread just won the race\n");
1939 cl_put_grouplock(&grouplock);
1943 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1944 fd->fd_grouplock = grouplock;
1945 spin_unlock(&lli->lli_lock);
1947 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK handler: drop the group lock with gid @arg held
 * by this fd.  Errors if no group lock is held or the gid mismatches.
 */
1951 static int ll_put_grouplock(struct inode *inode, struct file *file,
1954 struct ll_inode_info *lli = ll_i2info(inode);
1955 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1956 struct ll_grouplock grouplock;
1959 spin_lock(&lli->lli_lock);
1960 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1961 spin_unlock(&lli->lli_lock);
1962 CWARN("no group lock held\n");
1966 LASSERT(fd->fd_grouplock.lg_lock != NULL);
1968 if (fd->fd_grouplock.lg_gid != arg) {
1969 CWARN("group lock %lu doesn't match current id %lu\n",
1970 arg, fd->fd_grouplock.lg_gid);
1971 spin_unlock(&lli->lli_lock);
/* Detach the state under lli_lock, release the lock outside it. */
1975 grouplock = fd->fd_grouplock;
1976 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1977 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1978 spin_unlock(&lli->lli_lock);
1980 cl_put_grouplock(&grouplock);
1981 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1986 * Close inode open handle
1988 * \param dentry [in] dentry which contains the inode
1989 * \param it [in,out] intent which contains open info and result
1992 * \retval <0 failure
/* See the doc comment above: close the MDS open handle carried in @it. */
1994 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1996 struct inode *inode = dentry->d_inode;
1997 struct obd_client_handle *och;
2003 /* Root ? Do nothing. */
2004 if (dentry->d_inode->i_sb->s_root == dentry)
2007 /* No open handle to close? Move away */
2008 if (!it_disposition(it, DISP_OPEN_OPEN))
2011 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2013 OBD_ALLOC(och, sizeof(*och));
2015 GOTO(out, rc = -ENOMEM);
/* Transfer the open handle from the intent into och, then close it. */
2017 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2019 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2021 /* this one is in place of ll_file_open */
2022 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2023 ptlrpc_req_finished(it->it_request);
2024 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2030 * Get size for inode for which FIEMAP mapping is requested.
2031 * Make the FIEMAP get_info call and returns the result.
2032 * \param fiemap kernel buffer to hold extens
2033 * \param num_bytes kernel buffer size
/*
 * Execute a FIEMAP request: validate flags, optionally sync dirty data,
 * glimpse the size if unknown, and forward the mapping request to the
 * object layer via cl_object_fiemap().
 */
2035 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2041 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2044 /* Checks for fiemap flags */
2045 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2046 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2050 /* Check for FIEMAP_FLAG_SYNC */
2051 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2052 rc = filemap_fdatawrite(inode->i_mapping);
2057 env = cl_env_get(&refcheck);
2059 RETURN(PTR_ERR(env));
/* A zero cached size may just mean we never glimpsed; refresh it. */
2061 if (i_size_read(inode) == 0) {
2062 rc = ll_glimpse_size(inode);
2067 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2068 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2069 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2071 /* If filesize is 0, then there would be no objects for mapping */
2072 if (fmkey.lfik_oa.o_size == 0) {
2073 fiemap->fm_mapped_extents = 0;
2077 fmkey.lfik_fiemap = *fiemap;
2079 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2080 &fmkey, fiemap, &num_bytes);
2082 cl_env_put(env, &refcheck);
/*
 * LL_IOC_FID2PATH handler: resolve this inode's FID to a path via the
 * MDC.  Copies getinfo_fid2path in/out of userspace; gated on
 * CAP_DAC_READ_SEARCH unless the mount allows user fid2path.
 */
2086 int ll_fid2path(struct inode *inode, void __user *arg)
2088 struct obd_export *exp = ll_i2mdexp(inode);
2089 const struct getinfo_fid2path __user *gfin = arg;
2091 struct getinfo_fid2path *gfout;
2097 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2098 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2101 /* Only need to get the buflen */
2102 if (get_user(pathlen, &gfin->gf_pathlen))
/* Bound the allocation: user controls pathlen. */
2105 if (pathlen > PATH_MAX)
2108 outsize = sizeof(*gfout) + pathlen;
2109 OBD_ALLOC(gfout, outsize);
2113 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2114 GOTO(gf_free, rc = -EFAULT);
2115 /* append root FID after gfout to let MDT know the root FID so that it
2116 * can lookup the correct path, this is mainly for fileset.
2117 * old server without fileset mount support will ignore this. */
2118 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2120 /* Call mdc_iocontrol */
2121 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2125 if (copy_to_user(arg, gfout, outsize))
2129 OBD_FREE(gfout, outsize);
/*
 * Run a CIT_DATA_VERSION cl_io to obtain the file's data version and
 * layout version, honoring the flush flags in ioc->idv_flags.  Retries
 * while the io reports ci_need_restart (layout change).
 */
2134 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2136 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2144 ioc->idv_version = 0;
2145 ioc->idv_layout_version = UINT_MAX;
2147 /* If no file object initialized, we consider its version is 0. */
2151 env = cl_env_get(&refcheck);
2153 RETURN(PTR_ERR(env));
2155 io = vvp_env_thread_io(env);
2157 io->u.ci_data_version.dv_data_version = 0;
2158 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2159 io->u.ci_data_version.dv_flags = ioc->idv_flags;
2162 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2163 result = cl_io_loop(env, io);
2165 result = io->ci_result;
2167 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2168 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2170 cl_io_fini(env, io);
/* Layout changed mid-io: run the whole query again. */
2172 if (unlikely(io->ci_need_restart))
2175 cl_env_put(env, &refcheck);
2181 * Read the data_version for inode.
2183 * This value is computed using stripe object version on OST.
2184 * Version is computed using server side locking.
2186 * @param flags if do sync on the OST side;
2188 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2189 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/* Thin wrapper: see doc comment above; returns only the data version. */
2191 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2193 struct ioc_data_version ioc = { .idv_flags = flags };
2196 rc = ll_ioc_data_version(inode, &ioc);
2198 *data_version = ioc.idv_version;
2204 * Trigger a HSM release request for the provided inode.
/*
 * HSM release: take a write lease, flush and fetch the data version,
 * merge attributes, then close the handle with MDS_HSM_RELEASE so the
 * MDT can drop the file's OST objects.  The lease guards against
 * concurrent modification during the release.
 */
2206 int ll_hsm_release(struct inode *inode)
2209 struct obd_client_handle *och = NULL;
2210 __u64 data_version = 0;
2215 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2216 ll_get_fsname(inode->i_sb, NULL, 0),
2217 PFID(&ll_i2info(inode)->lli_fid));
2219 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2221 GOTO(out, rc = PTR_ERR(och));
2223 /* Grab latest data_version and [am]time values */
2224 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2228 env = cl_env_get(&refcheck);
2230 GOTO(out, rc = PTR_ERR(env));
2232 ll_merge_attr(env, inode);
2233 cl_env_put(env, &refcheck);
2235 /* Release the file.
2236 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2237 * we still need it to pack l_remote_handle to MDT. */
2238 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2244 if (och != NULL && !IS_ERR(och)) /* close the file */
2245 ll_lease_close(och, inode, NULL);
/* Working state for ll_swap_layouts(): the two inodes being swapped
 * plus (elided in this excerpt) their data versions and check flags. */
2250 struct ll_swap_stack {
2253 struct inode *inode1;
2254 struct inode *inode2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS handler: atomically exchange the layouts of
 * two files via the MDT.  Orders the pair by FID to avoid deadlocks,
 * optionally takes group locks to flush dirty cache, and re-checks data
 * versions (-EAGAIN if they moved) before issuing the swap.
 */
2259 static int ll_swap_layouts(struct file *file1, struct file *file2,
2260 struct lustre_swap_layouts *lsl)
2262 struct mdc_swap_layouts msl;
2263 struct md_op_data *op_data;
2266 struct ll_swap_stack *llss = NULL;
2269 OBD_ALLOC_PTR(llss);
2273 llss->inode1 = file_inode(file1);
2274 llss->inode2 = file_inode(file2);
2276 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2280 /* we use 2 bool because it is easier to swap than 2 bits */
2281 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2282 llss->check_dv1 = true;
2284 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2285 llss->check_dv2 = true;
2287 /* we cannot use lsl->sl_dvX directly because we may swap them */
2288 llss->dv1 = lsl->sl_dv1;
2289 llss->dv2 = lsl->sl_dv2;
2291 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2292 if (rc == 0) /* same file, done! */
2295 if (rc < 0) { /* sequentialize it */
2296 swap(llss->inode1, llss->inode2);
2298 swap(llss->dv1, llss->dv2);
2299 swap(llss->check_dv1, llss->check_dv2);
2303 if (gid != 0) { /* application asks to flush dirty cache */
2304 rc = ll_get_grouplock(llss->inode1, file1, gid);
2308 rc = ll_get_grouplock(llss->inode2, file2, gid);
2310 ll_put_grouplock(llss->inode1, file1, gid);
2315 /* ultimate check, before swaping the layouts we check if
2316 * dataversion has changed (if requested) */
2317 if (llss->check_dv1) {
2318 rc = ll_data_version(llss->inode1, &dv, 0);
2321 if (dv != llss->dv1)
2322 GOTO(putgl, rc = -EAGAIN);
2325 if (llss->check_dv2) {
2326 rc = ll_data_version(llss->inode2, &dv, 0);
2329 if (dv != llss->dv2)
2330 GOTO(putgl, rc = -EAGAIN);
2333 /* struct md_op_data is used to send the swap args to the mdt
2334 * only flags is missing, so we use struct mdc_swap_layouts
2335 * through the md_op_data->op_data */
2336 /* flags from user space have to be converted before they are send to
2337 * server, no flag is sent today, they are only used on the client */
2340 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2341 0, LUSTRE_OPC_ANY, &msl);
2342 if (IS_ERR(op_data))
2343 GOTO(free, rc = PTR_ERR(op_data));
2345 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2346 sizeof(*op_data), op_data, NULL);
2347 ll_finish_md_op_data(op_data);
/* Drop the group locks in reverse acquisition order. */
2354 ll_put_grouplock(llss->inode2, file2, gid);
2355 ll_put_grouplock(llss->inode1, file1, gid);
/*
 * LL_IOC_HSM_STATE_SET handler: validate the set/clear masks and the
 * archive id, then forward the state change to the MDT via the MDC.
 * Non-root callers may only touch flags in HSM_USER_MASK.
 */
2365 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2367 struct md_op_data *op_data;
2371 /* Detect out-of range masks */
2372 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2375 /* Non-root users are forbidden to set or clear flags which are
2376 * NOT defined in HSM_USER_MASK. */
2377 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2378 !cfs_capable(CFS_CAP_SYS_ADMIN))
2381 /* Detect out-of range archive id */
2382 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2383 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2386 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2387 LUSTRE_OPC_ANY, hss);
2388 if (IS_ERR(op_data))
2389 RETURN(PTR_ERR(op_data));
2391 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2392 sizeof(*op_data), op_data, NULL);
2394 ll_finish_md_op_data(op_data);
/*
 * HSM import: mark a regular file as archived+exists+released, then
 * restore the attributes (mode/uid/gid/size/times) recorded by the
 * archive tool so the released file looks like the archived copy.
 */
2399 static int ll_hsm_import(struct inode *inode, struct file *file,
2400 struct hsm_user_import *hui)
2402 struct hsm_state_set *hss = NULL;
2403 struct iattr *attr = NULL;
2407 if (!S_ISREG(inode->i_mode))
2413 GOTO(out, rc = -ENOMEM);
2415 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2416 hss->hss_archive_id = hui->hui_archive_id;
2417 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2418 rc = ll_hsm_state_set(inode, hss);
2422 OBD_ALLOC_PTR(attr);
2424 GOTO(out, rc = -ENOMEM);
/* Force a regular-file mode; only permission bits come from the user. */
2426 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2427 attr->ia_mode |= S_IFREG;
2428 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2429 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2430 attr->ia_size = hui->hui_size;
2431 attr->ia_mtime.tv_sec = hui->hui_mtime;
2432 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2433 attr->ia_atime.tv_sec = hui->hui_atime;
2434 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2436 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2437 ATTR_UID | ATTR_GID |
2438 ATTR_MTIME | ATTR_MTIME_SET |
2439 ATTR_ATIME | ATTR_ATIME_SET;
2443 rc = ll_setattr_raw(file_dentry(file), attr, true);
2447 inode_unlock(inode);
/* Map an fmode_t open mode to the LL_LEASE_{RD,WR}LCK bit mask. */
2459 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2461 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2462 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * LL_IOC_FUTIMES_3 handler: set atime/mtime/ctime (including ctime,
 * unlike utimes(2)) from the ll_futimes_3 payload.  Root-only, regular
 * files only; applied under the inode lock via ll_setattr_raw().
 */
2465 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2467 struct inode *inode = file_inode(file);
2469 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2470 ATTR_MTIME | ATTR_MTIME_SET |
2471 ATTR_CTIME | ATTR_CTIME_SET,
2473 .tv_sec = lfu->lfu_atime_sec,
2474 .tv_nsec = lfu->lfu_atime_nsec,
2477 .tv_sec = lfu->lfu_mtime_sec,
2478 .tv_nsec = lfu->lfu_mtime_nsec,
2481 .tv_sec = lfu->lfu_ctime_sec,
2482 .tv_nsec = lfu->lfu_ctime_nsec,
2488 if (!capable(CAP_SYS_ADMIN))
2491 if (!S_ISREG(inode->i_mode))
2495 rc = ll_setattr_raw(file_dentry(file), &ia, false);
2496 inode_unlock(inode);
/* Translate a userspace lockahead mode to the internal cl_lock_mode. */
2501 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2504 case MODE_READ_USER:
2506 case MODE_WRITE_USER:
2513 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2515 /* Used to allow the upper layers of the client to request an LDLM lock
2516 * without doing an actual read or write.
2518 * Used for ladvise lockahead to manually request specific locks.
2520 * \param[in] file file this ladvise lock request is on
2521 * \param[in] ladvise ladvise struct describing this lock request
2523 * \retval 0 success, no detailed result available (sync requests
2524 * and requests sent to the server [not handled locally]
2525 * cannot return detailed results)
2526 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2527 * see definitions for details.
2528 * \retval negative negative errno on error
/* See the doc comment above: request an LDLM extent lock without I/O. */
2530 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2532 struct lu_env *env = NULL;
2533 struct cl_io *io = NULL;
2534 struct cl_lock *lock = NULL;
2535 struct cl_lock_descr *descr = NULL;
2536 struct dentry *dentry = file->f_path.dentry;
2537 struct inode *inode = dentry->d_inode;
2538 enum cl_lock_mode cl_mode;
2539 off_t start = ladvise->lla_start;
2540 off_t end = ladvise->lla_end;
2546 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2547 "start=%llu, end=%llu\n", dentry->d_name.len,
2548 dentry->d_name.name, dentry->d_inode,
2549 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2552 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2554 GOTO(out, result = cl_mode);
2556 /* Get IO environment */
2557 result = cl_io_get(inode, &env, &io, &refcheck);
2561 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2564 * nothing to do for this io. This currently happens when
2565 * stripe sub-object's are not yet created.
2567 result = io->ci_result;
2568 } else if (result == 0) {
2569 lock = vvp_env_lock(env);
2570 descr = &lock->cll_descr;
2572 descr->cld_obj = io->ci_obj;
2573 /* Convert byte offsets to pages */
2574 descr->cld_start = cl_index(io->ci_obj, start);
2575 descr->cld_end = cl_index(io->ci_obj, end);
2576 descr->cld_mode = cl_mode;
2577 /* CEF_MUST is used because we do not want to convert a
2578 * lockahead request to a lockless lock */
2579 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
/* LF_ASYNC requests a speculative (non-blocking, server-side) enqueue. */
2582 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2583 descr->cld_enq_flags |= CEF_SPECULATIVE;
2585 result = cl_lock_request(env, io, lock);
2587 /* On success, we need to release the lock */
2589 cl_lock_release(env, lock);
2591 cl_io_fini(env, io);
2592 cl_env_put(env, &refcheck);
2594 /* -ECANCELED indicates a matching lock with a different extent
2595 * was already present, and -EEXIST indicates a matching lock
2596 * on exactly the same extent was already present.
2597 * We convert them to positive values for userspace to make
2598 * recognizing true errors easier.
2599 * Note we can only return these detailed results on async requests,
2600 * as sync requests look the same as i/o requests for locking. */
2601 if (result == -ECANCELED)
2602 result = LLA_RESULT_DIFFERENT;
2603 else if (result == -EEXIST)
2604 result = LLA_RESULT_SAME;
2609 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
/*
 * Validate one llapi_lu_ladvise entry: known advice, per-advice flag
 * mask, lockahead mode, and a non-empty [start, end) range.  Logs a
 * D_VFSTRACE message describing each rejection.
 */
2611 static int ll_ladvise_sanity(struct inode *inode,
2612 struct llapi_lu_ladvise *ladvise)
2614 enum lu_ladvise_type advice = ladvise->lla_advice;
2615 /* Note the peradvice flags is a 32 bit field, so per advice flags must
2616 * be in the first 32 bits of enum ladvise_flags */
2617 __u32 flags = ladvise->lla_peradvice_flags;
2618 /* 3 lines at 80 characters per line, should be plenty */
2621 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2623 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
2624 "last supported advice is %s (value '%d'): rc = %d\n",
2625 ll_get_fsname(inode->i_sb, NULL, 0), advice,
2626 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2630 /* Per-advice checks */
2632 case LU_LADVISE_LOCKNOEXPAND:
2633 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2635 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2637 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2638 ladvise_names[advice], rc);
2642 case LU_LADVISE_LOCKAHEAD:
2643 /* Currently only READ and WRITE modes can be requested */
2644 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2645 ladvise->lla_lockahead_mode == 0) {
2647 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2649 ll_get_fsname(inode->i_sb, NULL, 0),
2650 ladvise->lla_lockahead_mode,
2651 ladvise_names[advice], rc);
2654 case LU_LADVISE_WILLREAD:
2655 case LU_LADVISE_DONTNEED:
2657 /* Note fall through above - These checks apply to all advices
2658 * except LOCKNOEXPAND */
2659 if (flags & ~LF_DEFAULT_MASK) {
2661 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2663 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2664 ladvise_names[advice], rc);
2667 if (ladvise->lla_start >= ladvise->lla_end) {
2669 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2670 "for %s: rc = %d\n",
2671 ll_get_fsname(inode->i_sb, NULL, 0),
2672 ladvise->lla_start, ladvise->lla_end,
2673 ladvise_names[advice], rc);
2685 * Give file access advices
2687 * The ladvise interface is similar to Linux fadvise() system call, except it
2688 * forwards the advices directly from Lustre client to server. The server side
2689 * codes will apply appropriate read-ahead and caching techniques for the
2690 * corresponding files.
2692 * A typical workload for ladvise is e.g. a bunch of different clients are
2693 * doing small random reads of a file, so prefetching pages into OSS cache
2694 * with big linear reads before the random IO is a net benefit. Fetching
2695 * all that data into each client cache with fadvise() may not be, due to
2696 * much more data being sent to the client.
/* Forward one ladvise request to the server through the cl_io machinery:
 * builds a CIT_LADVISE io on the inode's clio object, copies the advice
 * parameters (range, fid, advice type, header flags) into it, and runs
 * cl_io_loop() if initialization succeeds.
 * NOTE(review): interior lines (local declarations, RETURN of rc, braces)
 * are missing from this extract. */
2698 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2699 struct llapi_lu_ladvise *ladvise)
2703 struct cl_ladvise_io *lio;
2708 env = cl_env_get(&refcheck);
2710 RETURN(PTR_ERR(env));
2712 io = vvp_env_thread_io(env);
2713 io->ci_obj = ll_i2info(inode)->lli_clob;
2715 /* initialize parameters for ladvise */
2716 lio = &io->u.ci_ladvise;
2717 lio->li_start = ladvise->lla_start;
2718 lio->li_end = ladvise->lla_end;
2719 lio->li_fid = ll_inode2fid(inode);
2720 lio->li_advice = ladvise->lla_advice;
2721 lio->li_flags = flags;
2723 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2724 rc = cl_io_loop(env, io);
/* Always tear down the io and release the cl env, success or not. */
2728 cl_io_fini(env, io);
2729 cl_env_put(env, &refcheck);
/* Record the LOCKNOEXPAND advice on this file descriptor: set (or clear,
 * when LF_UNSET is passed) the per-fd "do not expand DLM lock extents"
 * flag consumed elsewhere by the lock request path. */
2733 static int ll_lock_noexpand(struct file *file, int flags)
2735 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2737 fd->ll_lock_no_expand = !(flags & LF_UNSET);
/* FS_IOC_FSGETXATTR-style handler: copy the user's struct fsxattr in,
 * fill in the project ID cached in the Lustre inode info, and copy the
 * structure back out. Returns -EFAULT on a failed user copy (error lines
 * are outside this extract). */
2742 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
2745 struct fsxattr fsxattr;
2747 if (copy_from_user(&fsxattr,
2748 (const struct fsxattr __user *)arg,
2752 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
2753 if (copy_to_user((struct fsxattr __user *)arg,
2754 &fsxattr, sizeof(fsxattr)))
/* FS_IOC_FSSETXATTR-style handler: change the file's project ID on the
 * MDS. Requires CAP_SYS_ADMIN ("only root"), copies struct fsxattr from
 * userspace, and issues an md_setattr with MDS_ATTR_PROJID set.
 * The op_data is released on all paths via the out_fsxattr1 label. */
2760 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
2764 struct md_op_data *op_data;
2765 struct ptlrpc_request *req = NULL;
2767 struct fsxattr fsxattr;
2769 /* only root could change project ID */
2770 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2773 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2774 LUSTRE_OPC_ANY, NULL);
2775 if (IS_ERR(op_data))
2776 RETURN(PTR_ERR(op_data));
2778 if (copy_from_user(&fsxattr,
2779 (const struct fsxattr __user *)arg,
2781 GOTO(out_fsxattr1, rc = -EFAULT);
2783 op_data->op_projid = fsxattr.fsx_projid;
2784 op_data->op_attr.ia_valid |= MDS_ATTR_PROJID;
2785 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
2787 ptlrpc_req_finished(req);
2790 ll_finish_md_op_data(op_data);
/* Main per-file ioctl dispatcher for the Lustre client. Tallies the op,
 * rejects tty ioctls, then switches on cmd: fd flag get/set, striping
 * set/get, layout swap, group locks, FID/path conversion, data version,
 * HSM state/import, lease get/set, futimes, ladvise, FLR mirror selection,
 * and project-xattr get/set, falling through to obd_iocontrol for
 * anything unhandled.
 * NOTE(review): this extract is missing many interior lines (the switch
 * opener, several RETURN/break statements, error labels and braces);
 * comments below describe only what the visible lines establish. */
2797 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2799 struct inode *inode = file_inode(file);
2800 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2804 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2805 PFID(ll_inode2fid(inode)), inode, cmd);
2806 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2808 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2809 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2813 case LL_IOC_GETFLAGS:
2814 /* Get the current value of the file flags */
2815 return put_user(fd->fd_flags, (int __user *)arg);
2816 case LL_IOC_SETFLAGS:
2817 case LL_IOC_CLRFLAGS:
2818 /* Set or clear specific file flags */
2819 /* XXX This probably needs checks to ensure the flags are
2820 * not abused, and to handle any flag side effects.
2822 if (get_user(flags, (int __user *) arg))
2825 if (cmd == LL_IOC_SETFLAGS) {
/* Locking may only be disabled on O_DIRECT files. */
2826 if ((flags & LL_FILE_IGNORE_LOCK) &&
2827 !(file->f_flags & O_DIRECT)) {
2828 CERROR("%s: unable to disable locking on "
2829 "non-O_DIRECT file\n", current->comm);
2833 fd->fd_flags |= flags;
2835 fd->fd_flags &= ~flags;
2838 case LL_IOC_LOV_SETSTRIPE:
2839 case LL_IOC_LOV_SETSTRIPE_NEW:
2840 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
2841 case LL_IOC_LOV_SETEA:
2842 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
2843 case LL_IOC_LOV_SWAP_LAYOUTS: {
2845 struct lustre_swap_layouts lsl;
2848 if (copy_from_user(&lsl, (char __user *)arg,
2849 sizeof(struct lustre_swap_layouts)))
/* Both files involved in a layout swap must be writable. */
2852 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
2855 file2 = fget(lsl.sl_fd);
2859 /* O_WRONLY or O_RDWR */
2860 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
2861 GOTO(out, rc = -EPERM);
2863 intent = lsl.sl_flags & INTENT_LAYOUTS_CLOSE;
2865 struct inode *inode2;
2866 struct ll_inode_info *lli;
2867 struct obd_client_handle *och = NULL;
/* Close-intent swap consumes this fd's lease handle, taken under
 * lli_och_mutex; -ENOLCK if no lease was held. */
2869 lli = ll_i2info(inode);
2870 mutex_lock(&lli->lli_och_mutex);
2871 if (fd->fd_lease_och != NULL) {
2872 och = fd->fd_lease_och;
2873 fd->fd_lease_och = NULL;
2875 mutex_unlock(&lli->lli_och_mutex);
2877 GOTO(out, rc = -ENOLCK);
2878 inode2 = file_inode(file2);
2879 rc = ll_swap_layouts_close(och, inode, inode2, intent);
2881 rc = ll_swap_layouts(file, file2, &lsl);
2887 case LL_IOC_LOV_GETSTRIPE:
2888 case LL_IOC_LOV_GETSTRIPE_NEW:
2889 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
2890 case FSFILT_IOC_GETFLAGS:
2891 case FSFILT_IOC_SETFLAGS:
2892 RETURN(ll_iocontrol(inode, file, cmd, arg));
2893 case FSFILT_IOC_GETVERSION_OLD:
2894 case FSFILT_IOC_GETVERSION:
2895 RETURN(put_user(inode->i_generation, (int __user *)arg));
2896 case LL_IOC_GROUP_LOCK:
2897 RETURN(ll_get_grouplock(inode, file, arg));
2898 case LL_IOC_GROUP_UNLOCK:
2899 RETURN(ll_put_grouplock(inode, file, arg));
2900 case IOC_OBD_STATFS:
2901 RETURN(ll_obd_statfs(inode, (void __user *)arg));
2903 /* We need to special case any other ioctls we want to handle,
2904 * to send them to the MDS/OST as appropriate and to properly
2905 * network encode the arg field.
2906 case FSFILT_IOC_SETVERSION_OLD:
2907 case FSFILT_IOC_SETVERSION:
2909 case LL_IOC_FLUSHCTX:
2910 RETURN(ll_flush_ctx(inode));
2911 case LL_IOC_PATH2FID: {
2912 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2913 sizeof(struct lu_fid)))
2918 case LL_IOC_GETPARENT:
2919 RETURN(ll_getparent(file, (struct getparent __user *)arg));
2921 case OBD_IOC_FID2PATH:
2922 RETURN(ll_fid2path(inode, (void __user *)arg));
2923 case LL_IOC_DATA_VERSION: {
2924 struct ioc_data_version idv;
2927 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
/* Mask to the supported flush flags before querying. */
2930 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2931 rc = ll_ioc_data_version(inode, &idv);
2934 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2940 case LL_IOC_GET_MDTIDX: {
2943 mdtidx = ll_get_mdt_idx(inode);
2947 if (put_user((int)mdtidx, (int __user *)arg))
2952 case OBD_IOC_GETDTNAME:
2953 case OBD_IOC_GETMDNAME:
2954 RETURN(ll_get_obd_name(inode, cmd, arg));
2955 case LL_IOC_HSM_STATE_GET: {
2956 struct md_op_data *op_data;
2957 struct hsm_user_state *hus;
2964 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2965 LUSTRE_OPC_ANY, hus);
2966 if (IS_ERR(op_data)) {
2968 RETURN(PTR_ERR(op_data));
2971 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2974 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2977 ll_finish_md_op_data(op_data);
2981 case LL_IOC_HSM_STATE_SET: {
2982 struct hsm_state_set *hss;
2989 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2994 rc = ll_hsm_state_set(inode, hss);
2999 case LL_IOC_HSM_ACTION: {
3000 struct md_op_data *op_data;
3001 struct hsm_current_action *hca;
3008 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3009 LUSTRE_OPC_ANY, hca);
3010 if (IS_ERR(op_data)) {
3012 RETURN(PTR_ERR(op_data));
3015 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3018 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3021 ll_finish_md_op_data(op_data);
3025 case LL_IOC_SET_LEASE: {
3026 struct ll_inode_info *lli = ll_i2info(inode);
3027 struct obd_client_handle *och = NULL;
/* Requested lease mode must be compatible with the open mode. */
3032 case LL_LEASE_WRLCK:
3033 if (!(file->f_mode & FMODE_WRITE))
3035 fmode = FMODE_WRITE;
3037 case LL_LEASE_RDLCK:
3038 if (!(file->f_mode & FMODE_READ))
3042 case LL_LEASE_UNLCK:
3043 mutex_lock(&lli->lli_och_mutex);
3044 if (fd->fd_lease_och != NULL) {
3045 och = fd->fd_lease_och;
3046 fd->fd_lease_och = NULL;
3048 mutex_unlock(&lli->lli_och_mutex);
3053 fmode = och->och_flags;
3054 rc = ll_lease_close(och, inode, &lease_broken);
3058 rc = ll_lease_och_release(inode, file);
3065 RETURN(ll_lease_type_from_fmode(fmode));
3070 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3072 /* apply for lease */
3073 och = ll_lease_open(inode, file, fmode, 0);
3075 RETURN(PTR_ERR(och));
3078 mutex_lock(&lli->lli_och_mutex);
3079 if (fd->fd_lease_och == NULL) {
3080 fd->fd_lease_och = och;
3083 mutex_unlock(&lli->lli_och_mutex);
3085 /* impossible now that only excl is supported for now */
3086 ll_lease_close(och, inode, &lease_broken);
3091 case LL_IOC_GET_LEASE: {
3092 struct ll_inode_info *lli = ll_i2info(inode);
3093 struct ldlm_lock *lock = NULL;
3096 mutex_lock(&lli->lli_och_mutex);
3097 if (fd->fd_lease_och != NULL) {
3098 struct obd_client_handle *och = fd->fd_lease_och;
/* Only report the lease mode if the backing DLM lock is still
 * granted (not being cancelled). */
3100 lock = ldlm_handle2lock(&och->och_lease_handle);
3102 lock_res_and_lock(lock);
3103 if (!ldlm_is_cancel(lock))
3104 fmode = och->och_flags;
3106 unlock_res_and_lock(lock);
3107 LDLM_LOCK_PUT(lock);
3110 mutex_unlock(&lli->lli_och_mutex);
3112 RETURN(ll_lease_type_from_fmode(fmode));
3114 case LL_IOC_HSM_IMPORT: {
3115 struct hsm_user_import *hui;
3121 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3126 rc = ll_hsm_import(inode, file, hui);
3131 case LL_IOC_FUTIMES_3: {
3132 struct ll_futimes_3 lfu;
3134 if (copy_from_user(&lfu,
3135 (const struct ll_futimes_3 __user *)arg,
3139 RETURN(ll_file_futimes_3(file, &lfu));
3141 case LL_IOC_LADVISE: {
3142 struct llapi_ladvise_hdr *k_ladvise_hdr;
3143 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3146 int alloc_size = sizeof(*k_ladvise_hdr);
/* Two-pass copy: first just the header to learn lah_count, then
 * reallocate and copy header plus all advice entries. */
3149 u_ladvise_hdr = (void __user *)arg;
3150 OBD_ALLOC_PTR(k_ladvise_hdr);
3151 if (k_ladvise_hdr == NULL)
3154 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3155 GOTO(out_ladvise, rc = -EFAULT);
3157 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3158 k_ladvise_hdr->lah_count < 1)
3159 GOTO(out_ladvise, rc = -EINVAL);
3161 num_advise = k_ladvise_hdr->lah_count;
3162 if (num_advise >= LAH_COUNT_MAX)
3163 GOTO(out_ladvise, rc = -EFBIG);
3165 OBD_FREE_PTR(k_ladvise_hdr);
3166 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3167 lah_advise[num_advise]);
3168 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3169 if (k_ladvise_hdr == NULL)
3173 * TODO: submit multiple advices to one server in a single RPC
3175 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3176 GOTO(out_ladvise, rc = -EFAULT);
3178 for (i = 0; i < num_advise; i++) {
3179 struct llapi_lu_ladvise *k_ladvise =
3180 &k_ladvise_hdr->lah_advise[i];
3181 struct llapi_lu_ladvise __user *u_ladvise =
3182 &u_ladvise_hdr->lah_advise[i];
3184 rc = ll_ladvise_sanity(inode, k_ladvise);
3186 GOTO(out_ladvise, rc);
3188 switch (k_ladvise->lla_advice) {
3189 case LU_LADVISE_LOCKNOEXPAND:
3190 rc = ll_lock_noexpand(file,
3191 k_ladvise->lla_peradvice_flags);
3192 GOTO(out_ladvise, rc);
3193 case LU_LADVISE_LOCKAHEAD:
3195 rc = ll_file_lock_ahead(file, k_ladvise);
3198 GOTO(out_ladvise, rc);
/* Lockahead reports its per-advice result back into the
 * user's advise entry. */
3201 &u_ladvise->lla_lockahead_result))
3202 GOTO(out_ladvise, rc = -EFAULT);
3205 rc = ll_ladvise(inode, file,
3206 k_ladvise_hdr->lah_flags,
3209 GOTO(out_ladvise, rc);
3216 OBD_FREE(k_ladvise_hdr, alloc_size);
3219 case LL_IOC_FLR_SET_MIRROR: {
3220 /* mirror I/O must be direct to avoid polluting page cache
3222 if (!(file->f_flags & O_DIRECT))
3225 fd->fd_designated_mirror = (__u32)arg;
3228 case LL_IOC_FSGETXATTR:
3229 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3230 case LL_IOC_FSSETXATTR:
3231 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3233 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
/* Default: pass unrecognized commands down to the data export. */
3235 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3236 (void __user *)arg));
/* Compat fallback (kernels without generic_file_llseek_size): validate the
 * computed offset against [0, maxsize] and install it as the new f_pos,
 * resetting f_version when the position actually changes. */
3240 #ifndef HAVE_FILE_LLSEEK_SIZE
3241 static inline loff_t
3242 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3244 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3246 if (offset > maxsize)
3249 if (offset != file->f_pos) {
3250 file->f_pos = offset;
3251 file->f_version = 0;
/* Compat implementation of generic_file_llseek_size(): handle the common
 * llseek origins against an explicit maxsize and eof. SEEK_CUR with a
 * non-zero offset must serialize against concurrent seeks; SEEK_DATA and
 * SEEK_HOLE treat the whole file as data with a virtual hole at eof.
 * NOTE(review): the switch statement and several branch lines are missing
 * from this extract. */
3257 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3258 loff_t maxsize, loff_t eof)
3260 struct inode *inode = file_inode(file);
3268 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3269 * position-querying operation. Avoid rewriting the "same"
3270 * f_pos value back to the file because a concurrent read(),
3271 * write() or lseek() might have altered it
3276 * f_lock protects against read/modify/write race with other
3277 * SEEK_CURs. Note that parallel writes and reads behave
3281 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3282 inode_unlock(inode);
3286 * In the generic case the entire file is data, so as long as
3287 * offset isn't at the end of the file then the offset is data.
3294 * There is a virtual hole at the end of the file, so as long as
3295 * offset isn't i_size or larger, return i_size.
3303 return llseek_execute(file, offset, maxsize);
/* llseek handler: for SEEK_END/SEEK_HOLE/SEEK_DATA the client must first
 * glimpse the file size from the OSTs so i_size is current, then delegate
 * to the generic size-aware llseek with Lustre's maxbytes limit. */
3307 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3309 struct inode *inode = file_inode(file);
3310 loff_t retval, eof = 0;
/* retval here is only the target offset computed for the trace message. */
3313 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3314 (origin == SEEK_CUR) ? file->f_pos : 0);
3315 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3316 PFID(ll_inode2fid(inode)), inode, retval, retval,
3318 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3320 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3321 retval = ll_glimpse_size(inode);
3324 eof = i_size_read(inode);
3327 retval = ll_generic_file_llseek_size(file, offset, origin,
3328 ll_file_maxbytes(inode), eof);
/* flush handler: surface any async writeback error recorded against this
 * inode (and its clio object) as -EIO at close time, but only once —
 * if this fd already reported a write failure the error is suppressed. */
3332 static int ll_flush(struct file *file, fl_owner_t id)
3334 struct inode *inode = file_inode(file);
3335 struct ll_inode_info *lli = ll_i2info(inode);
3336 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3339 LASSERT(!S_ISDIR(inode->i_mode));
3341 /* catch async errors that were recorded back when async writeback
3342 * failed for pages in this mapping. */
3343 rc = lli->lli_async_rc;
3344 lli->lli_async_rc = 0;
3345 if (lli->lli_clob != NULL) {
3346 err = lov_read_and_clear_async_rc(lli->lli_clob);
3351 /* The application has been told write failure already.
3352 * Do not report failure again. */
3353 if (fd->fd_write_failed)
3355 return rc ? -EIO : 0;
3359 * Called to make sure a portion of file has been written out.
3360 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
3362 * Return how many pages have been written.
/* Sync a byte range of a file through a CIT_FSYNC cl_io. mode selects the
 * fsync semantics (NONE/LOCAL/DISCARD/ALL; invalid modes are rejected up
 * front). On success the number of pages written (fi_nr_written) is
 * returned; on failure the io result is returned. */
3364 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3365 enum cl_fsync_mode mode, int ignore_layout)
3369 struct cl_fsync_io *fio;
3374 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3375 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3378 env = cl_env_get(&refcheck);
3380 RETURN(PTR_ERR(env));
3382 io = vvp_env_thread_io(env);
3383 io->ci_obj = ll_i2info(inode)->lli_clob;
3384 io->ci_ignore_layout = ignore_layout;
3386 /* initialize parameters for sync */
3387 fio = &io->u.ci_fsync;
3388 fio->fi_start = start;
3390 fio->fi_fid = ll_inode2fid(inode);
3391 fio->fi_mode = mode;
3392 fio->fi_nr_written = 0;
3394 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3395 result = cl_io_loop(env, io);
3397 result = io->ci_result;
3399 result = fio->fi_nr_written;
3400 cl_io_fini(env, io);
3401 cl_env_put(env, &refcheck);
3407 * When dentry is provided (the 'else' case), file_dentry() may be
3408 * null and dentry must be used directly rather than pulled from
3409 * file_dentry() as is done otherwise.
/* fsync handler, with three signature variants for different kernel APIs
 * (4-arg range fsync, 2-arg, and the old dentry-taking form). Flow:
 * flush/wait dirty pages, fold in any recorded async writeback error,
 * fsync the MD server state, then for regular files run a CL_FSYNC_ALL
 * range sync and update fd_write_failed accordingly.
 * NOTE(review): several interior lines (error folding, GOTO/RETURN,
 * inode_lock on the 4-arg path) are missing from this extract. */
3412 #ifdef HAVE_FILE_FSYNC_4ARGS
3413 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3415 struct dentry *dentry = file_dentry(file);
3417 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3418 int ll_fsync(struct file *file, int datasync)
3420 struct dentry *dentry = file_dentry(file);
3422 loff_t end = LLONG_MAX;
3424 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3427 loff_t end = LLONG_MAX;
3429 struct inode *inode = dentry->d_inode;
3430 struct ll_inode_info *lli = ll_i2info(inode);
3431 struct ptlrpc_request *req;
3435 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3436 PFID(ll_inode2fid(inode)), inode);
3437 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3439 #ifdef HAVE_FILE_FSYNC_4ARGS
3440 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
/* Avoid double-locking when the caller already holds the inode lock. */
3441 lock_inode = !lli->lli_inode_locked;
3445 /* fsync's caller has already called _fdata{sync,write}, we want
3446 * that IO to finish before calling the osc and mdc sync methods */
3447 rc = filemap_fdatawait(inode->i_mapping);
3450 /* catch async errors that were recorded back when async writeback
3451 * failed for pages in this mapping. */
3452 if (!S_ISDIR(inode->i_mode)) {
3453 err = lli->lli_async_rc;
3454 lli->lli_async_rc = 0;
3457 if (lli->lli_clob != NULL) {
3458 err = lov_read_and_clear_async_rc(lli->lli_clob);
3464 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3468 ptlrpc_req_finished(req);
3470 if (S_ISREG(inode->i_mode)) {
3471 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3473 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3474 if (rc == 0 && err < 0)
3477 fd->fd_write_failed = true;
3479 fd->fd_write_failed = false;
3482 #ifdef HAVE_FILE_FSYNC_4ARGS
3484 inode_unlock(inode);
/* flock/fcntl lock handler: translate a kernel file_lock (FL_FLOCK or
 * FL_POSIX) into an LDLM flock enqueue on the MDS, then mirror the result
 * into the local kernel lock tables, rolling back the server lock if the
 * local bookkeeping fails.
 * NOTE(review): the switch statements mapping fl_type->ei_mode and
 * cmd->flags are partially missing from this extract. */
3490 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3492 struct inode *inode = file_inode(file);
3493 struct ll_sb_info *sbi = ll_i2sbi(inode);
3494 struct ldlm_enqueue_info einfo = {
3495 .ei_type = LDLM_FLOCK,
3496 .ei_cb_cp = ldlm_flock_completion_ast,
3497 .ei_cbdata = file_lock,
3499 struct md_op_data *op_data;
3500 struct lustre_handle lockh = { 0 };
3501 union ldlm_policy_data flock = { { 0 } };
3502 int fl_type = file_lock->fl_type;
3508 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3509 PFID(ll_inode2fid(inode)), file_lock);
3511 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3513 if (file_lock->fl_flags & FL_FLOCK) {
3514 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3515 /* flocks are whole-file locks */
3516 flock.l_flock.end = OFFSET_MAX;
3517 /* For flocks owner is determined by the local file desctiptor*/
3518 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3519 } else if (file_lock->fl_flags & FL_POSIX) {
3520 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3521 flock.l_flock.start = file_lock->fl_start;
3522 flock.l_flock.end = file_lock->fl_end;
3526 flock.l_flock.pid = file_lock->fl_pid;
3528 /* Somewhat ugly workaround for svc lockd.
3529 * lockd installs custom fl_lmops->lm_compare_owner that checks
3530 * for the fl_owner to be the same (which it always is on local node
3531 * I guess between lockd processes) and then compares pid.
3532 * As such we assign pid to the owner field to make it all work,
3533 * conflict with normal locks is unlikely since pid space and
3534 * pointer space for current->files are not intersecting */
3535 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3536 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
3540 einfo.ei_mode = LCK_PR;
3543 /* An unlock request may or may not have any relation to
3544 * existing locks so we may not be able to pass a lock handle
3545 * via a normal ldlm_lock_cancel() request. The request may even
3546 * unlock a byte range in the middle of an existing lock. In
3547 * order to process an unlock request we need all of the same
3548 * information that is given with a normal read or write record
3549 * lock request. To avoid creating another ldlm unlock (cancel)
3550 * message we'll treat a LCK_NL flock request as an unlock. */
3551 einfo.ei_mode = LCK_NL;
3554 einfo.ei_mode = LCK_PW;
3557 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
3572 flags = LDLM_FL_BLOCK_NOWAIT;
3578 flags = LDLM_FL_TEST_LOCK;
3581 CERROR("unknown fcntl lock command: %d\n", cmd);
3585 /* Save the old mode so that if the mode in the lock changes we
3586 * can decrement the appropriate reader or writer refcount. */
3587 file_lock->fl_type = einfo.ei_mode;
3589 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3590 LUSTRE_OPC_ANY, NULL);
3591 if (IS_ERR(op_data))
3592 RETURN(PTR_ERR(op_data));
3594 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
3595 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
3596 flock.l_flock.pid, flags, einfo.ei_mode,
3597 flock.l_flock.start, flock.l_flock.end);
3599 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
3602 /* Restore the file lock type if not TEST lock. */
3603 if (!(flags & LDLM_FL_TEST_LOCK))
3604 file_lock->fl_type = fl_type;
/* Mirror the granted/released lock into the kernel's lock tables;
 * API differs across kernel versions. */
3606 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
3607 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
3608 !(flags & LDLM_FL_TEST_LOCK))
3609 rc2 = locks_lock_file_wait(file, file_lock);
3611 if ((file_lock->fl_flags & FL_FLOCK) &&
3612 (rc == 0 || file_lock->fl_type == F_UNLCK))
3613 rc2 = flock_lock_file_wait(file, file_lock);
3614 if ((file_lock->fl_flags & FL_POSIX) &&
3615 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3616 !(flags & LDLM_FL_TEST_LOCK))
3617 rc2 = posix_lock_file_wait(file, file_lock);
3618 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
/* Local bookkeeping failed: release the server-side lock again by
 * enqueueing an LCK_NL (unlock) request. */
3620 if (rc2 && file_lock->fl_type != F_UNLCK) {
3621 einfo.ei_mode = LCK_NL;
3622 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
3627 ll_finish_md_op_data(op_data);
/* Look up a child's FID by name under @parent via md_getattr_name, storing
 * it in *fid; when @inode is non-NULL the reply is additionally turned
 * into an in-memory inode via ll_prep_inode. The request is released at
 * out_req on all paths after a successful enqueue. */
3632 int ll_get_fid_by_name(struct inode *parent, const char *name,
3633 int namelen, struct lu_fid *fid,
3634 struct inode **inode)
3636 struct md_op_data *op_data = NULL;
3637 struct mdt_body *body;
3638 struct ptlrpc_request *req;
3642 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3643 LUSTRE_OPC_ANY, NULL);
3644 if (IS_ERR(op_data))
3645 RETURN(PTR_ERR(op_data));
3647 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
3648 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3649 ll_finish_md_op_data(op_data);
3653 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3655 GOTO(out_req, rc = -EFAULT);
3657 *fid = body->mbo_fid1;
3660 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
3662 ptlrpc_req_finished(req);
/* Migrate the entry @name under @parent to MDT @mdtidx via a
 * MDS_RENAME_MIGRATE rename onto itself. Resolves the child inode from
 * the dcache (or by FID lookup), refuses to migrate the filesystem root,
 * short-circuits when the child already lives on the target MDT, and for
 * regular files takes a write lease plus data version so the server can
 * detect concurrent modification; retries on -EAGAIN layout changes.
 * NOTE(review): several interior lines (declarations, igrab error paths,
 * GOTO labels, the retry jump) are missing from this extract. */
3666 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3667 const char *name, int namelen)
3669 struct dentry *dchild = NULL;
3670 struct inode *child_inode = NULL;
3671 struct md_op_data *op_data;
3672 struct ptlrpc_request *request = NULL;
3673 struct obd_client_handle *och = NULL;
3675 struct mdt_body *body;
3677 __u64 data_version = 0;
3680 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3681 name, PFID(ll_inode2fid(parent)), mdtidx);
3683 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3684 0, LUSTRE_OPC_ANY, NULL);
3685 if (IS_ERR(op_data))
3686 RETURN(PTR_ERR(op_data));
3688 /* Get child FID first */
3689 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
3692 dchild = d_lookup(file_dentry(file), &qstr);
3693 if (dchild != NULL) {
3694 if (dchild->d_inode != NULL)
3695 child_inode = igrab(dchild->d_inode);
/* Not in the dcache: resolve the FID (and inode) from the MDS. */
3699 if (child_inode == NULL) {
3700 rc = ll_get_fid_by_name(parent, name, namelen,
3701 &op_data->op_fid3, &child_inode);
3706 if (child_inode == NULL)
3707 GOTO(out_free, rc = -EINVAL);
3710 * lfs migrate command needs to be blocked on the client
3711 * by checking the migrate FID against the FID of the
3714 if (child_inode == parent->i_sb->s_root->d_inode)
3715 GOTO(out_iput, rc = -EINVAL);
3717 inode_lock(child_inode);
3718 op_data->op_fid3 = *ll_inode2fid(child_inode);
3719 if (!fid_is_sane(&op_data->op_fid3)) {
3720 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
3721 ll_get_fsname(parent->i_sb, NULL, 0), name,
3722 PFID(&op_data->op_fid3));
3723 GOTO(out_unlock, rc = -EINVAL);
3726 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3728 GOTO(out_unlock, rc);
/* Nothing to do if the child is already on the requested MDT. */
3731 CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name,
3732 PFID(&op_data->op_fid3), mdtidx);
3733 GOTO(out_unlock, rc = 0);
3736 if (S_ISREG(child_inode->i_mode)) {
3737 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
3741 GOTO(out_unlock, rc);
3744 rc = ll_data_version(child_inode, &data_version,
3747 GOTO(out_close, rc);
3749 op_data->op_handle = och->och_fh;
3750 op_data->op_data = och->och_mod;
3751 op_data->op_data_version = data_version;
3752 op_data->op_lease_handle = och->och_lease_handle;
3753 op_data->op_bias |= MDS_RENAME_MIGRATE;
3756 op_data->op_mds = mdtidx;
3757 op_data->op_cli_flags = CLI_MIGRATE;
/* Migration is expressed as a rename of the entry onto itself. */
3758 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3759 namelen, name, namelen, &request);
3761 LASSERT(request != NULL);
3762 ll_update_times(request, parent);
3764 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
3765 LASSERT(body != NULL);
3767 /* If the server does release layout lock, then we cleanup
3768 * the client och here, otherwise release it in out_close: */
3770 body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
3771 obd_mod_put(och->och_mod);
3772 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
3774 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
3780 if (request != NULL) {
3781 ptlrpc_req_finished(request);
3785 /* Try again if the file layout has changed. */
3786 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
3790 if (och != NULL) /* close the file */
3791 ll_lease_close(och, child_inode, NULL);
3793 clear_nlink(child_inode);
3795 inode_unlock(child_inode);
3799 ll_finish_md_op_data(op_data);
/* flock handler for mounts without flock support; presumably returns an
 * error unconditionally — NOTE(review): return type and body are outside
 * this extract, confirm against the full source. */
3804 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3812 * test if some locks matching bits and l_req_mode are acquired
3813 * - bits can be in different locks
3814 * - if found clear the common lock bits in *bits
3815 * - the bits not found, are kept in *bits
3817 * \param bits [IN] searched lock bits [IN]
3818 * \param l_req_mode [IN] searched lock mode
3819 * \retval boolean, true iff all bits are found
/* Test whether MD ibits locks covering *bits are already held on @inode
 * (in l_req_mode, or any of CR/CW/PR/PW for LCK_MINMODE). Each matched
 * bit is cleared from *bits; unmatched bits remain. Matching uses
 * LDLM_FL_TEST_LOCK so no references are kept. Returns true iff all
 * requested bits were found (return lines are outside this extract). */
3821 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
3823 struct lustre_handle lockh;
3824 union ldlm_policy_data policy;
3825 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
3826 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
3835 fid = &ll_i2info(inode)->lli_fid;
3836 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3837 ldlm_lockname[mode]);
3839 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* Probe each requested inodebit individually; a single held lock may
 * cover several bits, which are all cleared at once. */
3840 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3841 policy.l_inodebits.bits = *bits & (1 << i);
3842 if (policy.l_inodebits.bits == 0)
3845 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3846 &policy, mode, &lockh)) {
3847 struct ldlm_lock *lock;
3849 lock = ldlm_handle2lock(&lockh);
3852 ~(lock->l_policy_data.l_inodebits.bits);
3853 LDLM_LOCK_PUT(lock);
3855 *bits &= ~policy.l_inodebits.bits;
/* Match (and take a reference on, via @lockh) an existing granted MD
 * ibits lock on @inode covering @bits in @mode; returns the matched mode
 * from md_lock_match (return line outside this extract). */
3862 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
3863 struct lustre_handle *lockh, __u64 flags,
3864 enum ldlm_mode mode)
3866 union ldlm_policy_data policy = { .l_inodebits = { bits } };
3871 fid = &ll_i2info(inode)->lli_fid;
3872 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3874 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3875 fid, LDLM_IBITS, &policy, mode, lockh);
/* Post-process the rc of a revalidate RPC: -ENOENT on an already-unlinked
 * inode is tolerated (nlink updated, success returned) except for striped
 * directories with a bad stripe, which must be revalidated again; other
 * errors are logged (EACCES/EIDRM at D_INFO, the rest at D_ERROR). */
3880 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3882 /* Already unlinked. Just update nlink and return success */
3883 if (rc == -ENOENT) {
3885 /* If it is striped directory, and there is bad stripe
3886 * Let's revalidate the dentry again, instead of returning
3888 if (S_ISDIR(inode->i_mode) &&
3889 ll_i2info(inode)->lli_lsm_md != NULL)
3892 /* This path cannot be hit for regular files unless in
3893 * case of obscure races, so no need to to validate
3895 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3897 } else if (rc != 0) {
3898 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3899 "%s: revalidate FID "DFID" error: rc = %d\n",
3900 ll_get_fsname(inode->i_sb, NULL, 0),
3901 PFID(ll_inode2fid(inode)), rc);
/* Revalidate a dentry's MD attributes against the MDS. Two paths:
 * with OBD_CONNECT_ATTRFID an intent getattr-by-FID (IT_GETATTR, or
 * IT_LOOKUP when only the LOOKUP bit is wanted) is issued and the dentry
 * is invalidated if the inode turns out unlinked; otherwise, if no
 * matching MD ibits lock is already held, a plain md_getattr is sent
 * (with EA size for regular files) and the reply fed to ll_prep_inode.
 * NOTE(review): several interior lines (GOTO labels, rc checks, request
 * cleanup ordering) are missing from this extract. */
3907 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3909 struct inode *inode = dentry->d_inode;
3910 struct ptlrpc_request *req = NULL;
3911 struct obd_export *exp;
3915 LASSERT(inode != NULL);
3917 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3918 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3920 exp = ll_i2mdexp(inode);
3922 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3923 * But under CMD case, it caused some lock issues, should be fixed
3924 * with new CMD ibits lock. See bug 12718 */
3925 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3926 struct lookup_intent oit = { .it_op = IT_GETATTR };
3927 struct md_op_data *op_data;
3929 if (ibits == MDS_INODELOCK_LOOKUP)
3930 oit.it_op = IT_LOOKUP;
3932 /* Call getattr by fid, so do not provide name at all. */
3933 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3934 dentry->d_inode, NULL, 0, 0,
3935 LUSTRE_OPC_ANY, NULL);
3936 if (IS_ERR(op_data))
3937 RETURN(PTR_ERR(op_data));
3939 rc = md_intent_lock(exp, op_data, &oit, &req,
3940 &ll_md_blocking_ast, 0);
3941 ll_finish_md_op_data(op_data);
3943 rc = ll_inode_revalidate_fini(inode, rc);
3947 rc = ll_revalidate_it_finish(req, &oit, dentry);
3949 ll_intent_release(&oit);
3953 /* Unlinked? Unhash dentry, so it is not picked up later by
3954 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3955 here to preserve get_cwd functionality on 2.6.
3957 if (!dentry->d_inode->i_nlink) {
3958 ll_lock_dcache(inode);
3959 d_lustre_invalidate(dentry, 0);
3960 ll_unlock_dcache(inode);
3963 ll_lookup_finish_locks(&oit, dentry);
3964 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3965 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3966 u64 valid = OBD_MD_FLGETATTR;
3967 struct md_op_data *op_data;
/* For regular files also request striping EA, sized from the
 * default MD size. */
3970 if (S_ISREG(inode->i_mode)) {
3971 rc = ll_get_default_mdsize(sbi, &ealen);
3974 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3977 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3978 0, ealen, LUSTRE_OPC_ANY,
3980 if (IS_ERR(op_data))
3981 RETURN(PTR_ERR(op_data));
3983 op_data->op_valid = valid;
3984 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3985 ll_finish_md_op_data(op_data);
3987 rc = ll_inode_revalidate_fini(inode, rc);
3991 rc = ll_prep_inode(&inode, req, NULL, NULL);
3994 ptlrpc_req_finished(req);
/* For a striped directory, merge the per-stripe MD attributes (nlink,
 * blocks, size, a/m/ctime) into the master inode via md_merge_attr().
 * Caller must ensure lli_lsm_md is set (asserted). */
3998 static int ll_merge_md_attr(struct inode *inode)
4000 struct cl_attr attr = { 0 };
4003 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
4004 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4005 &attr, ll_md_blocking_ast);
4009 set_nlink(inode, attr.cat_nlink);
4010 inode->i_blocks = attr.cat_blocks;
4011 i_size_write(inode, attr.cat_size);
/* Cache merged timestamps in lli; copied into the VFS inode by
 * ll_inode_revalidate(). */
4013 ll_i2info(inode)->lli_atime = attr.cat_atime;
4014 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4015 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/*
 * Revalidate inode attributes of @dentry under the MDS inodebit locks
 * named by @ibits, then refresh size/time information:
 * - non-regular files copy the cached MDS times into the inode; a
 *   striped directory first merges attributes from all stripes;
 * - regular files glimpse the file size from the OSTs, unless an HSM
 *   restore is in progress (see comment below).
 * NOTE(review): extraction dropped some original lines (return type,
 * error-path returns, closing braces); kept lines are byte-identical.
 */
4021 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
4023 struct inode *inode = dentry->d_inode;
4027 rc = __ll_inode_revalidate(dentry, ibits);
4031 /* if object isn't regular file, don't validate size */
4032 if (!S_ISREG(inode->i_mode)) {
4033 if (S_ISDIR(inode->i_mode) &&
4034 ll_i2info(inode)->lli_lsm_md != NULL) {
/* striped directory: merge per-stripe attrs before using cached times */
4035 rc = ll_merge_md_attr(inode);
4040 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
4041 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
4042 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
4044 /* In case of restore, the MDT has the right size and has
4045 * already sent it back without granting the layout lock,
4046 * inode is up-to-date so glimpse is useless.
4047 * Also to glimpse we need the layout, in case of a running
4048 * restore the MDT holds the layout lock so the glimpse will
4049 * block up to the end of restore (getattr will block)
4051 if (!ll_file_test_flag(ll_i2info(inode), LLIF_FILE_RESTORING))
4052 rc = ll_glimpse_size(inode);
4057 static inline dev_t ll_compat_encode_dev(dev_t dev)
4059 /* The compat_sys_*stat*() syscalls will fail unless the
4060 * device majors and minors are both less than 256. Note that
4061 * the value returned here will be passed through
4062 * old_encode_dev() in cp_compat_stat(). And so we are not
4063 * trying to return a valid compat (u16) device number, just
4064 * one that will pass the old_valid_dev() check. */
4066 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
/*
 * ->getattr() handler.  Two prototypes are provided: the post-4.11
 * (struct path, request_mask, flags) form when the kernel defines
 * HAVE_INODEOPS_ENHANCED_GETATTR, and the legacy (vfsmount, dentry)
 * form otherwise.  Both revalidate the inode via ll_inode_revalidate()
 * under UPDATE|LOOKUP ibits, then fill *stat from the (now fresh)
 * inode fields.  For 32-bit API users, the inode number is built from
 * the FID and device numbers are compat-encoded.
 * NOTE(review): extraction dropped some original lines (#else/#endif
 * of the prototype block, error returns, closing braces); kept lines
 * are byte-identical.
 */
4069 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4070 int ll_getattr(const struct path *path, struct kstat *stat,
4071 u32 request_mask, unsigned int flags)
4074 struct dentry *de = path->dentry;
4076 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4079 struct inode *inode = de->d_inode;
4080 struct ll_sb_info *sbi = ll_i2sbi(inode);
4081 struct ll_inode_info *lli = ll_i2info(inode);
/* refresh attributes from the MDS before reporting them */
4084 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
4085 MDS_INODELOCK_LOOKUP);
4086 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
/* fault-injection hook for testing delayed getattr */
4091 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4093 if (ll_need_32bit_api(sbi)) {
/* 32-bit caller: build a 32-bit ino from the FID, compat-encode devs */
4094 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4095 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4096 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4098 stat->ino = inode->i_ino;
4099 stat->dev = inode->i_sb->s_dev;
4100 stat->rdev = inode->i_rdev;
4103 stat->mode = inode->i_mode;
4104 stat->uid = inode->i_uid;
4105 stat->gid = inode->i_gid;
4106 stat->atime = inode->i_atime;
4107 stat->mtime = inode->i_mtime;
4108 stat->ctime = inode->i_ctime;
/* prefer the admin-tuned stat blocksize when set */
4109 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4111 stat->nlink = inode->i_nlink;
4112 stat->size = i_size_read(inode);
4113 stat->blocks = inode->i_blocks;
/*
 * ->fiemap() handler: translate the kernel's fiemap_extent_info into a
 * Lustre struct fiemap, run ll_do_fiemap(), then copy the mapped
 * extents back to userspace.
 * NOTE(review): only the FIRST extent descriptor is copied in from
 * userspace — presumably to seed ll_do_fiemap() with restart state;
 * confirm against ll_do_fiemap()'s contract.
 * NOTE(review): extraction dropped some original lines (allocation
 * failure check, error-path free, return); kept lines byte-identical.
 */
4118 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4119 __u64 start, __u64 len)
4123 struct fiemap *fiemap;
4124 unsigned int extent_count = fieinfo->fi_extents_max;
/* one header plus room for all requested extents */
4126 num_bytes = sizeof(*fiemap) + (extent_count *
4127 sizeof(struct fiemap_extent));
4128 OBD_ALLOC_LARGE(fiemap, num_bytes);
4133 fiemap->fm_flags = fieinfo->fi_flags;
4134 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4135 fiemap->fm_start = start;
4136 fiemap->fm_length = len;
4137 if (extent_count > 0 &&
4138 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4139 sizeof(struct fiemap_extent)) != 0)
4140 GOTO(out, rc = -EFAULT);
4142 rc = ll_do_fiemap(inode, fiemap, num_bytes);
/* propagate results (flags, mapped count, extents) back to the caller */
4144 fieinfo->fi_flags = fiemap->fm_flags;
4145 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4146 if (extent_count > 0 &&
4147 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4148 fiemap->fm_mapped_extents *
4149 sizeof(struct fiemap_extent)) != 0)
4150 GOTO(out, rc = -EFAULT);
4152 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * ->get_acl() handler: return a referenced copy of the cached POSIX
 * ACL held in the Lustre inode info.  The lli_lock spinlock guards
 * lli_posix_acl while the reference is taken.
 * NOTE(review): extraction dropped the trailing lines (return of @acl,
 * closing brace); kept lines are byte-identical.
 */
4156 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4158 struct ll_inode_info *lli = ll_i2info(inode);
4159 struct posix_acl *acl = NULL;
4162 spin_lock(&lli->lli_lock);
4163 /* VFS' acl_permission_check->check_acl will release the refcount */
4164 acl = posix_acl_dup(lli->lli_posix_acl);
4165 spin_unlock(&lli->lli_lock);
4170 #ifdef HAVE_IOP_SET_ACL
4171 #ifdef CONFIG_FS_POSIX_ACL
/*
 * ->set_acl() handler: serialize @acl to xattr form and store it as the
 * corresponding POSIX ACL xattr, then update the local ACL cache.
 * For ACL_TYPE_ACCESS, posix_acl_update_mode() folds permission bits
 * into i_mode and may drop the ACL entirely.  Default ACLs are only
 * valid on directories.
 * NOTE(review): extraction dropped some original lines (locals, switch
 * header, error checks, out label/return); kept lines byte-identical.
 */
4172 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4174 const char *name = NULL;
4181 case ACL_TYPE_ACCESS:
/* may modify i_mode and/or NULL out acl */
4183 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4187 name = XATTR_NAME_POSIX_ACL_ACCESS;
4189 case ACL_TYPE_DEFAULT:
4190 if (!S_ISDIR(inode->i_mode))
4191 GOTO(out, rc = acl ? -EACCES : 0);
4192 name = XATTR_NAME_POSIX_ACL_DEFAULT;
4195 GOTO(out, rc = -EINVAL);
4199 size = posix_acl_xattr_size(acl->a_count);
4200 value = kmalloc(size, GFP_NOFS);
4202 GOTO(out, rc = -ENOMEM);
4204 rc = posix_acl_to_xattr(&init_user_ns, acl, value, size);
4209 /* dentry is only used for *.lov attributes so it's safe to be NULL */
4210 rc = __vfs_setxattr(NULL, inode, name, value, size, XATTR_CREATE);
/* keep the in-memory ACL cache consistent with the stored xattr */
4215 set_cached_acl(inode, type, acl);
4217 forget_cached_acl(inode, type);
4220 #endif /* CONFIG_FS_POSIX_ACL */
4221 #endif /* HAVE_IOP_SET_ACL */
4223 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
/*
 * ACL check callback passed to generic_permission() on older kernels
 * (only built when generic_permission() does not do the ACL check
 * itself).  Looks up the access ACL and evaluates @mask against it;
 * in RCU walk mode (IPERM_FLAG_RCU) it must not block.
 * NOTE(review): extraction dropped some original lines (return type,
 * NULL-acl path, returns, closing brace); kept lines byte-identical.
 */
4225 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4226 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4228 ll_check_acl(struct inode *inode, int mask)
4231 # ifdef CONFIG_FS_POSIX_ACL
4232 struct posix_acl *acl;
4236 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4237 if (flags & IPERM_FLAG_RCU)
4240 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4245 rc = posix_acl_permission(inode, acl, mask);
4246 posix_acl_release(acl);
4249 # else /* !CONFIG_FS_POSIX_ACL */
4251 # endif /* CONFIG_FS_POSIX_ACL */
4253 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * ->permission() handler.  Three prototypes cover the kernel API
 * variants (4-arg with flags, 2-arg, and the nameidata form).
 * Responsibilities:
 * 1. revalidate the root inode before checking it (it is not
 *    revalidated during lookup);
 * 2. apply root squash: if configured and the caller is root, switch
 *    to squashed fsuid/fsgid credentials and drop FS capabilities for
 *    the duration of the check;
 * 3. delegate to ll_generic_permission() with ll_check_acl.
 * NOTE(review): extraction dropped some original lines (locals such as
 * rc/cap, RCU early-returns, error checks, cred cleanup, return);
 * kept lines are byte-identical.
 */
4255 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4256 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4258 # ifdef HAVE_INODE_PERMISION_2ARGS
4259 int ll_inode_permission(struct inode *inode, int mask)
4261 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4266 struct ll_sb_info *sbi;
4267 struct root_squash_info *squash;
4268 struct cred *cred = NULL;
4269 const struct cred *old_cred = NULL;
4271 bool squash_id = false;
/* cannot block in RCU-walk mode; bail out so VFS retries in ref-walk */
4274 #ifdef MAY_NOT_BLOCK
4275 if (mask & MAY_NOT_BLOCK)
4277 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4278 if (flags & IPERM_FLAG_RCU)
4282 /* the root inode is NOT validated during lookup, so revalidate it
4283 * here before the permission check. */
4285 if (inode == inode->i_sb->s_root->d_inode) {
4286 rc = __ll_inode_revalidate(inode->i_sb->s_root,
4287 MDS_INODELOCK_LOOKUP);
4292 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4293 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4295 /* squash fsuid/fsgid if needed */
4296 sbi = ll_i2sbi(inode);
4297 squash = &sbi->ll_squash;
4298 if (unlikely(squash->rsi_uid != 0 &&
4299 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4300 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4304 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4305 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4306 squash->rsi_uid, squash->rsi_gid);
4308 /* update current process's credentials
4309 * and FS capability */
4310 cred = prepare_creds();
4314 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4315 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* drop every filesystem-related capability from the squashed creds */
4316 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4317 if ((1 << cap) & CFS_CAP_FS_MASK)
4318 cap_lower(cred->cap_effective, cap);
4320 old_cred = override_creds(cred);
4323 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4324 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4325 /* restore current process's credentials and FS capability */
4327 revert_creds(old_cred);
4334 /* -o localflock - only provides locally consistent flock locks */
/*
 * Default file operations table: no .flock/.lock entries, so flock()
 * falls back to the kernel's local (single-node) implementation.
 * The #ifdef pairs select between the read/write_iter API and the
 * older aio_read/aio_write API depending on kernel support.
 * NOTE(review): extraction dropped some entries present in the
 * original table (e.g. fsync/flush and the closing brace).
 */
4335 struct file_operations ll_file_operations = {
4336 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4337 # ifdef HAVE_SYNC_READ_WRITE
4338 .read = new_sync_read,
4339 .write = new_sync_write,
4341 .read_iter = ll_file_read_iter,
4342 .write_iter = ll_file_write_iter,
4343 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4344 .read = ll_file_read,
4345 .aio_read = ll_file_aio_read,
4346 .write = ll_file_write,
4347 .aio_write = ll_file_aio_write,
4348 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4349 .unlocked_ioctl = ll_file_ioctl,
4350 .open = ll_file_open,
4351 .release = ll_file_release,
4352 .mmap = ll_file_mmap,
4353 .llseek = ll_file_seek,
4354 .splice_read = ll_file_splice_read,
/*
 * File operations for -o flock mounts: identical to the default table
 * but wires .flock and .lock to ll_file_flock for cluster-coherent
 * (DLM-backed) flock()/fcntl() locking.
 * NOTE(review): extraction dropped some entries present in the
 * original table (e.g. fsync/flush and the closing brace).
 */
4359 struct file_operations ll_file_operations_flock = {
4360 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4361 # ifdef HAVE_SYNC_READ_WRITE
4362 .read = new_sync_read,
4363 .write = new_sync_write,
4364 # endif /* HAVE_SYNC_READ_WRITE */
4365 .read_iter = ll_file_read_iter,
4366 .write_iter = ll_file_write_iter,
4367 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4368 .read = ll_file_read,
4369 .aio_read = ll_file_aio_read,
4370 .write = ll_file_write,
4371 .aio_write = ll_file_aio_write,
4372 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4373 .unlocked_ioctl = ll_file_ioctl,
4374 .open = ll_file_open,
4375 .release = ll_file_release,
4376 .mmap = ll_file_mmap,
4377 .llseek = ll_file_seek,
4378 .splice_read = ll_file_splice_read,
4381 .flock = ll_file_flock,
4382 .lock = ll_file_flock
4385 /* These are for -o noflock - to return ENOSYS on flock calls */
/*
 * File operations for -o noflock mounts: .flock and .lock point at
 * ll_file_noflock so file-locking calls fail explicitly instead of
 * appearing to succeed without cluster coherence.
 * NOTE(review): extraction dropped some entries present in the
 * original table (e.g. fsync/flush and the closing brace).
 */
4386 struct file_operations ll_file_operations_noflock = {
4387 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4388 # ifdef HAVE_SYNC_READ_WRITE
4389 .read = new_sync_read,
4390 .write = new_sync_write,
4391 # endif /* HAVE_SYNC_READ_WRITE */
4392 .read_iter = ll_file_read_iter,
4393 .write_iter = ll_file_write_iter,
4394 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4395 .read = ll_file_read,
4396 .aio_read = ll_file_aio_read,
4397 .write = ll_file_write,
4398 .aio_write = ll_file_aio_write,
4399 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4400 .unlocked_ioctl = ll_file_ioctl,
4401 .open = ll_file_open,
4402 .release = ll_file_release,
4403 .mmap = ll_file_mmap,
4404 .llseek = ll_file_seek,
4405 .splice_read = ll_file_splice_read,
4408 .flock = ll_file_noflock,
4409 .lock = ll_file_noflock
/*
 * Inode operations for regular files.  xattr, get_acl and set_acl
 * entries are only present when the running kernel exposes those
 * inode_operations hooks (HAVE_IOP_* compile-time checks).
 * NOTE(review): extraction dropped the #endif lines and closing brace
 * of the original table.
 */
4412 struct inode_operations ll_file_inode_operations = {
4413 .setattr = ll_setattr,
4414 .getattr = ll_getattr,
4415 .permission = ll_inode_permission,
4416 #ifdef HAVE_IOP_XATTR
4417 .setxattr = ll_setxattr,
4418 .getxattr = ll_getxattr,
4419 .removexattr = ll_removexattr,
4421 .listxattr = ll_listxattr,
4422 .fiemap = ll_fiemap,
4423 #ifdef HAVE_IOP_GET_ACL
4424 .get_acl = ll_get_acl,
4426 #ifdef HAVE_IOP_SET_ACL
4427 .set_acl = ll_set_acl,
/*
 * Push a layout configuration into the cl_object behind @inode via
 * cl_conf_set().  For OBJECT_CONF_SET, once the layout has been
 * applied the DLM layout lock is allowed to match and the cached
 * layout generation is refreshed from the object.
 * NOTE(review): extraction dropped some original lines (locals, early
 * NULL-object return, error checks, CDEBUG header, return); kept
 * lines are byte-identical.
 */
4431 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4433 struct ll_inode_info *lli = ll_i2info(inode);
4434 struct cl_object *obj = lli->lli_clob;
4443 env = cl_env_get(&refcheck);
4445 RETURN(PTR_ERR(env));
4447 rc = cl_conf_set(env, lli->lli_clob, conf);
4451 if (conf->coc_opc == OBJECT_CONF_SET) {
4452 struct ldlm_lock *lock = conf->coc_lock;
4453 struct cl_layout cl = {
4457 LASSERT(lock != NULL);
4458 LASSERT(ldlm_has_layout(lock));
4460 /* it can only be allowed to match after layout is
4461 * applied to inode otherwise false layout would be
4462 * seen. Applying layout shoud happen before dropping
4463 * the intent lock. */
4464 ldlm_lock_allow_match(lock);
4466 rc = cl_object_layout_get(env, obj, &cl);
4471 DFID": layout version change: %u -> %u\n",
4472 PFID(&lli->lli_fid), ll_layout_version_get(lli),
4474 ll_layout_version_set(lli, cl.cl_layout_gen);
4478 cl_env_put(env, &refcheck);
4483 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
/*
 * If the layout lock was granted via a blocked/completion-AST path its
 * LVB buffer may be missing or too small, so fetch the LOV EA from the
 * MDT with an explicit getxattr and install it as the lock's LVB.
 * If another thread installed an LVB concurrently, the local copy is
 * freed (see the unlikely(l_lvb_data == NULL) check under the lock).
 * NOTE(review): extraction dropped some original lines (locals, early
 * RETURN for an existing LVB, error checks, out label, return); kept
 * lines are byte-identical.
 */
4484 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4487 struct ll_sb_info *sbi = ll_i2sbi(inode);
4488 struct ptlrpc_request *req;
4489 struct mdt_body *body;
4496 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4497 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4498 lock->l_lvb_data, lock->l_lvb_len);
4500 if (lock->l_lvb_data != NULL)
4503 /* if layout lock was granted right away, the layout is returned
4504 * within DLM_LVB of dlm reply; otherwise if the lock was ever
4505 * blocked and then granted via completion ast, we have to fetch
4506 * layout here. Please note that we can't use the LVB buffer in
4507 * completion AST because it doesn't have a large enough buffer */
4508 rc = ll_get_default_mdsize(sbi, &lmmsize);
4510 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4511 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
4516 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4518 GOTO(out, rc = -EPROTO);
4520 lmmsize = body->mbo_eadatasize;
4521 if (lmmsize == 0) /* empty layout */
4524 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4526 GOTO(out, rc = -EFAULT);
/* stage the EA into a private buffer, then publish it as the LVB */
4528 OBD_ALLOC_LARGE(lvbdata, lmmsize);
4529 if (lvbdata == NULL)
4530 GOTO(out, rc = -ENOMEM);
4532 memcpy(lvbdata, lmm, lmmsize);
4533 lock_res_and_lock(lock);
4534 if (unlikely(lock->l_lvb_data == NULL)) {
4535 lock->l_lvb_type = LVB_T_LAYOUT;
4536 lock->l_lvb_data = lvbdata;
4537 lock->l_lvb_len = lmmsize;
4540 unlock_res_and_lock(lock);
/* lost the race: another thread installed an LVB first */
4543 OBD_FREE_LARGE(lvbdata, lmmsize);
4548 ptlrpc_req_finished(req);
4553 * Apply the layout to the inode. Layout lock is held and will be released
/*
 * Given a granted layout lock handle, make sure the lock's LVB holds
 * the layout (fetching from the MDT if needed), apply that layout to
 * the cl_object via ll_layout_conf(OBJECT_CONF_SET), then drop the
 * lock reference.  If applying failed with -EBUSY (IO still using the
 * old layout), wait for in-flight IO via OBJECT_CONF_WAIT.
 * NOTE(review): extraction dropped some original lines (locals,
 * lvb_ready fast path, out label, returns); kept lines byte-identical.
 */
4556 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4557 struct inode *inode)
4559 struct ll_inode_info *lli = ll_i2info(inode);
4560 struct ll_sb_info *sbi = ll_i2sbi(inode);
4561 struct ldlm_lock *lock;
4562 struct cl_object_conf conf;
4565 bool wait_layout = false;
4568 LASSERT(lustre_handle_is_used(lockh));
4570 lock = ldlm_handle2lock(lockh);
4571 LASSERT(lock != NULL);
4572 LASSERT(ldlm_has_layout(lock));
4574 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4575 PFID(&lli->lli_fid), inode);
4577 /* in case this is a caching lock and reinstate with new inode */
4578 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
4580 lock_res_and_lock(lock);
4581 lvb_ready = ldlm_is_lvb_ready(lock);
4582 unlock_res_and_lock(lock);
4584 /* checking lvb_ready is racy but this is okay. The worst case is
4585 * that multi processes may configure the file on the same time. */
4589 rc = ll_layout_fetch(inode, lock);
4593 /* for layout lock, lmm is stored in lock's lvb.
4594 * lvb_data is immutable if the lock is held so it's safe to access it
4597 * set layout to file. Unlikely this will fail as old layout was
4598 * surely eliminated */
4599 memset(&conf, 0, sizeof conf);
4600 conf.coc_opc = OBJECT_CONF_SET;
4601 conf.coc_inode = inode;
4602 conf.coc_lock = lock;
4603 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
4604 conf.u.coc_layout.lb_len = lock->l_lvb_len;
4605 rc = ll_layout_conf(inode, &conf);
4607 /* refresh layout failed, need to wait */
4608 wait_layout = rc == -EBUSY;
4611 LDLM_LOCK_PUT(lock);
4612 ldlm_lock_decref(lockh, mode);
4614 /* wait for IO to complete if it's still being used. */
4616 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
4617 ll_get_fsname(inode->i_sb, NULL, 0),
4618 PFID(&lli->lli_fid), inode);
4620 memset(&conf, 0, sizeof conf);
4621 conf.coc_opc = OBJECT_CONF_WAIT;
4622 conf.coc_inode = inode;
4623 rc = ll_layout_conf(inode, &conf);
4627 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4628 ll_get_fsname(inode->i_sb, NULL, 0),
4629 PFID(&lli->lli_fid), rc);
4635 * Issue layout intent RPC to MDS.
4636 * \param inode [in] file inode
4637 * \param intent [in] layout intent
4639 * \retval 0 on success
4640 * \retval < 0 error code
/*
 * Send an IT_LAYOUT intent to the MDS carrying @intent as opaque
 * intent data.  For write/truncate intents the enqueue requests
 * FMODE_WRITE so the MDS can instantiate the layout.  On success the
 * resulting lock is attached to the inode via ll_set_lock_data().
 * NOTE(review): extraction dropped some original lines (rc local,
 * error check after md_intent_lock, RETURN); kept lines byte-identical.
 */
4642 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
4644 struct ll_inode_info *lli = ll_i2info(inode);
4645 struct ll_sb_info *sbi = ll_i2sbi(inode);
4646 struct md_op_data *op_data;
4647 struct lookup_intent it;
4648 struct ptlrpc_request *req;
4652 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4653 0, 0, LUSTRE_OPC_ANY, NULL);
4654 if (IS_ERR(op_data))
4655 RETURN(PTR_ERR(op_data));
4657 op_data->op_data = intent;
4658 op_data->op_data_size = sizeof(*intent);
4660 memset(&it, 0, sizeof(it));
4661 it.it_op = IT_LAYOUT;
4662 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
4663 intent->li_opc == LAYOUT_INTENT_TRUNC)
4664 it.it_flags = FMODE_WRITE;
4666 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4667 ll_get_fsname(inode->i_sb, NULL, 0),
4668 PFID(&lli->lli_fid), inode);
4670 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
4671 &ll_md_blocking_ast, 0);
/* the intent holds its own reference to the reply; drop ours */
4672 if (it.it_request != NULL)
4673 ptlrpc_req_finished(it.it_request);
4674 it.it_request = NULL;
4676 ll_finish_md_op_data(op_data);
4678 /* set lock data in case this is a new lock */
4680 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4682 ll_intent_drop_lock(&it);
4688 * This function checks if there exists a LAYOUT lock on the client side,
4689 * or enqueues it if it doesn't have one in cache.
4691 * This function will not hold layout lock so it may be revoked any time after
4692 * this function returns. Any operations depend on layout should be redone
4695 * This function should be called before lov_io_init() to get an uptodate
4696 * layout version, the caller should save the version number and after IO
4697 * is finished, this function should be called again to verify that layout
4698 * is not changed during IO time.
/*
 * Implementation notes: fast path returns the cached generation when
 * layout locks are disabled or a generation is already known.  Under
 * lli_layout_mutex (serializes enqueues) it first tries to match a
 * cached DLM lock in any of CR/CW/PR/PW modes; on a miss it issues a
 * LAYOUT_INTENT_ACCESS intent and retries.  @gen receives the current
 * layout generation.
 * NOTE(review): extraction dropped some original lines (rc local, the
 * retry loop back-edge, error unlock path, RETURN); kept lines are
 * byte-identical.
 */
4700 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4702 struct ll_inode_info *lli = ll_i2info(inode);
4703 struct ll_sb_info *sbi = ll_i2sbi(inode);
4704 struct lustre_handle lockh;
4705 struct layout_intent intent = {
4706 .li_opc = LAYOUT_INTENT_ACCESS,
4708 enum ldlm_mode mode;
4712 *gen = ll_layout_version_get(lli);
4713 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
4717 LASSERT(fid_is_sane(ll_inode2fid(inode)));
4718 LASSERT(S_ISREG(inode->i_mode));
4720 /* take layout lock mutex to enqueue layout lock exclusively. */
4721 mutex_lock(&lli->lli_layout_mutex);
4724 /* mostly layout lock is caching on the local side, so try to
4725 * match it before grabbing layout lock mutex. */
4726 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4727 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
4728 if (mode != 0) { /* hit cached lock */
4729 rc = ll_layout_lock_set(&lockh, mode, inode);
4735 rc = ll_layout_intent(inode, &intent);
4741 *gen = ll_layout_version_get(lli);
4742 mutex_unlock(&lli->lli_layout_mutex);
4748 * Issue layout intent RPC indicating where in a file an IO is about to write.
4750 * \param[in] inode file inode.
4751 * \param[in] start start offset of file in bytes where an IO is about to
4753 * \param[in] end exclusive end offset in bytes of the write range.
4755 * \retval 0 on success
4756 * \retval < 0 error code
/*
 * Thin wrapper around ll_layout_intent(): builds a LAYOUT_INTENT_WRITE
 * intent covering [start, end) and sends it to the MDS so the layout
 * components for that range can be instantiated before the write.
 * NOTE(review): extraction dropped the closing lines (rc local, the
 * RETURN, closing brace); kept lines are byte-identical.
 */
4758 int ll_layout_write_intent(struct inode *inode, __u64 start, __u64 end)
4760 struct layout_intent intent = {
4761 .li_opc = LAYOUT_INTENT_WRITE,
4762 .li_extent.e_start = start,
4763 .li_extent.e_end = end,
4768 rc = ll_layout_intent(inode, &intent);
4774 * This function send a restore request to the MDT
4776 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4778 struct hsm_user_request *hur;
4782 len = sizeof(struct hsm_user_request) +
4783 sizeof(struct hsm_user_item);
4784 OBD_ALLOC(hur, len);
4788 hur->hur_request.hr_action = HUA_RESTORE;
4789 hur->hur_request.hr_archive_id = 0;
4790 hur->hur_request.hr_flags = 0;
4791 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4792 sizeof(hur->hur_user_item[0].hui_fid));
4793 hur->hur_user_item[0].hui_extent.offset = offset;
4794 hur->hur_user_item[0].hui_extent.length = length;
4795 hur->hur_request.hr_itemcount = 1;
4796 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,