4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2016, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
49 #include <uapi/linux/lustre/lustre_ioctl.h>
50 #include <lustre_swab.h>
52 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
57 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
59 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
/* Allocate a per-open-file ll_file_data from its dedicated slab cache.
 * GFP_NOFS is used to avoid re-entering the filesystem during reclaim.
 * NOTE(review): excerpt elides the NULL-check/return lines. */
62 static struct ll_file_data *ll_file_data_get(void)
64 struct ll_file_data *fd;
66 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
/* Start with a clean write-error state for this open. */
70 fd->fd_write_failed = false;
/* Return a ll_file_data to the slab cache (counterpart of
 * ll_file_data_get()). */
75 static void ll_file_data_put(struct ll_file_data *fd)
78 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
82 * Packs all the attributes into @op_data for the CLOSE rpc.
84 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
85 struct obd_client_handle *och)
89 ll_prep_md_op_data(op_data, inode, NULL, NULL,
90 0, 0, LUSTRE_OPC_ANY, NULL);
/* Snapshot the current VFS inode attributes into the close request so
 * the MDT sees the client's final view of the file. */
92 op_data->op_attr.ia_mode = inode->i_mode;
93 op_data->op_attr.ia_atime = inode->i_atime;
94 op_data->op_attr.ia_mtime = inode->i_mtime;
95 op_data->op_attr.ia_ctime = inode->i_ctime;
96 op_data->op_attr.ia_size = i_size_read(inode);
97 op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
98 ATTR_MTIME | ATTR_MTIME_SET |
99 ATTR_CTIME | ATTR_CTIME_SET;
100 op_data->op_attr_blocks = inode->i_blocks;
101 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
/* Identify which open handle on the MDT this close applies to. */
102 op_data->op_handle = och->och_fh;
104 if (och->och_flags & FMODE_WRITE &&
105 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
106 /* For HSM: if inode data has been modified, pack it so that
107 * MDT can set data dirty flag in the archive. */
108 op_data->op_bias |= MDS_DATA_MODIFIED;
114 * Perform a close, possibly with a bias.
115 * The meaning of "data" depends on the value of "bias".
117 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
118 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
121 static int ll_close_inode_openhandle(struct inode *inode,
122 struct obd_client_handle *och,
123 enum mds_op_bias bias, void *data)
125 struct obd_export *md_exp = ll_i2mdexp(inode);
126 const struct ll_inode_info *lli = ll_i2info(inode);
127 struct md_op_data *op_data;
128 struct ptlrpc_request *req = NULL;
/* Without a live MDC connection there is nobody to send the close to. */
132 if (class_exp2obd(md_exp) == NULL) {
133 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
134 ll_get_fsname(inode->i_sb, NULL, 0),
135 PFID(&lli->lli_fid));
139 OBD_ALLOC_PTR(op_data);
140 /* We leak openhandle and request here on error, but not much to be
141 * done in OOM case since app won't retry close on error either. */
143 GOTO(out, rc = -ENOMEM);
145 ll_prepare_close(inode, op_data, och);
147 case MDS_CLOSE_LAYOUT_MERGE:
148 /* merge blocks from the victim inode */
149 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
150 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* fallthrough: MERGE shares the SWAP packing below (lease handle,
 * second fid), it only adds the block-count merge above. */
151 case MDS_CLOSE_LAYOUT_SWAP:
152 LASSERT(data != NULL);
153 op_data->op_bias |= bias;
154 op_data->op_data_version = 0;
155 op_data->op_lease_handle = och->och_lease_handle;
156 op_data->op_fid2 = *ll_inode2fid(data);
159 case MDS_HSM_RELEASE:
160 LASSERT(data != NULL);
161 op_data->op_bias |= MDS_HSM_RELEASE;
/* For HSM release, "data" carries the data version to validate. */
162 op_data->op_data_version = *(__u64 *)data;
163 op_data->op_lease_handle = och->och_lease_handle;
164 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
168 LASSERT(data == NULL);
172 rc = md_close(md_exp, op_data, och->och_mod, &req);
/* -EINTR is an expected interruption, not worth a console error. */
173 if (rc != 0 && rc != -EINTR)
174 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
175 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
/* On a biased close, check whether the MDT actually executed the
 * requested close intent. */
177 if (rc == 0 && op_data->op_bias & bias) {
178 struct mdt_body *body;
180 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
181 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
185 ll_finish_md_op_data(op_data);
189 md_clear_open_replay_data(md_exp, och);
/* Poison the handle so later use of a stale och is detectable. */
190 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
193 ptlrpc_req_finished(req); /* This is close request */
/* Close the cached MDS open handle matching @fmode (write/exec/read)
 * for @inode, unless other local users still reference it. */
197 int ll_md_real_close(struct inode *inode, fmode_t fmode)
199 struct ll_inode_info *lli = ll_i2info(inode);
200 struct obd_client_handle **och_p;
201 struct obd_client_handle *och;
/* Select the per-mode open handle and its use count. */
206 if (fmode & FMODE_WRITE) {
207 och_p = &lli->lli_mds_write_och;
208 och_usecount = &lli->lli_open_fd_write_count;
209 } else if (fmode & FMODE_EXEC) {
210 och_p = &lli->lli_mds_exec_och;
211 och_usecount = &lli->lli_open_fd_exec_count;
213 LASSERT(fmode & FMODE_READ);
214 och_p = &lli->lli_mds_read_och;
215 och_usecount = &lli->lli_open_fd_read_count;
218 mutex_lock(&lli->lli_och_mutex);
219 if (*och_usecount > 0) {
220 /* There are still users of this handle, so skip
222 mutex_unlock(&lli->lli_och_mutex);
228 mutex_unlock(&lli->lli_och_mutex);
231 /* There might be a race and this handle may already
233 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/* Per-file-descriptor close: drop group lock and lease if held, close a
 * private open handle, decrement the per-mode open counts, and only talk
 * to the MDS when no matching OPEN lock lets us skip the RPC. */
239 static int ll_md_close(struct inode *inode, struct file *file)
241 union ldlm_policy_data policy = {
242 .l_inodebits = { MDS_INODELOCK_OPEN },
/* LDLM_FL_TEST_LOCK: only probe for a matching lock, don't take a ref. */
244 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
245 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
246 struct ll_inode_info *lli = ll_i2info(inode);
247 struct lustre_handle lockh;
248 enum ldlm_mode lockmode;
252 /* clear group lock, if present */
253 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
254 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
256 if (fd->fd_lease_och != NULL) {
259 /* Usually the lease is not released when the
260 * application crashed, we need to release here. */
261 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
262 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
263 PFID(&lli->lli_fid), rc, lease_broken);
265 fd->fd_lease_och = NULL;
/* A lease acquisition moved the MDS handle into fd->fd_och; close it
 * directly rather than via the shared lli handles. */
268 if (fd->fd_och != NULL) {
269 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
274 /* Let's see if we have good enough OPEN lock on the file and if
275 we can skip talking to MDS */
276 mutex_lock(&lli->lli_och_mutex);
277 if (fd->fd_omode & FMODE_WRITE) {
279 LASSERT(lli->lli_open_fd_write_count);
280 lli->lli_open_fd_write_count--;
281 } else if (fd->fd_omode & FMODE_EXEC) {
283 LASSERT(lli->lli_open_fd_exec_count);
284 lli->lli_open_fd_exec_count--;
287 LASSERT(lli->lli_open_fd_read_count);
288 lli->lli_open_fd_read_count--;
290 mutex_unlock(&lli->lli_och_mutex);
/* No cached OPEN ibits lock -> must do the real close to the MDS. */
292 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
293 LDLM_IBITS, &policy, lockmode, &lockh))
294 rc = ll_md_real_close(inode, fd->fd_omode);
297 LUSTRE_FPRIVATE(file) = NULL;
298 ll_file_data_put(fd);
303 /* While this returns an error code, fput() the caller does not, so we need
304 * to make every effort to clean up all of our state here. Also, applications
305 * rarely check close errors and even if an error is returned they will not
306 * re-try the close call.
308 int ll_file_release(struct inode *inode, struct file *file)
310 struct ll_file_data *fd;
311 struct ll_sb_info *sbi = ll_i2sbi(inode);
312 struct ll_inode_info *lli = ll_i2info(inode);
316 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
317 PFID(ll_inode2fid(inode)), inode);
/* Don't count releases of the filesystem root in per-op stats. */
319 if (inode->i_sb->s_root != file_dentry(file))
320 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
321 fd = LUSTRE_FPRIVATE(file);
324 /* The last ref on @file, maybe not the owner pid of statahead,
325 * because parent and child process can share the same file handle. */
326 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
327 ll_deauthorize_statahead(inode, fd);
/* The root dentry took a shortcut in ll_file_open(); mirror it here
 * and skip the full MDS close path. */
329 if (inode->i_sb->s_root == file_dentry(file)) {
330 LUSTRE_FPRIVATE(file) = NULL;
331 ll_file_data_put(fd);
/* Surface any deferred async write errors from the CLIO object. */
335 if (!S_ISDIR(inode->i_mode)) {
336 if (lli->lli_clob != NULL)
337 lov_read_and_clear_async_rc(lli->lli_clob);
338 lli->lli_async_rc = 0;
341 rc = ll_md_close(inode, file);
343 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
344 libcfs_debug_dumplog();
/* Send an intent OPEN (by fid) to the MDS for @de, optionally packing
 * striping data @lmm/@lmmsize, and install the resulting lock/inode
 * state on success. */
349 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
350 struct lookup_intent *itp)
352 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
353 struct dentry *parent = de->d_parent;
354 const char *name = NULL;
356 struct md_op_data *op_data;
357 struct ptlrpc_request *req = NULL;
361 LASSERT(parent != NULL);
362 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
364 /* if server supports open-by-fid, or file name is invalid, don't pack
365 * name in open request */
366 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
367 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
368 name = de->d_name.name;
369 len = de->d_name.len;
372 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
373 name, len, 0, LUSTRE_OPC_ANY, NULL);
375 RETURN(PTR_ERR(op_data));
376 op_data->op_data = lmm;
377 op_data->op_data_size = lmmsize;
379 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
380 &ll_md_blocking_ast, 0);
381 ll_finish_md_op_data(op_data);
383 /* reason for keep own exit path - don`t flood log
384 * with messages with -ESTALE errors.
/* The open succeeded but we won't use the handle -> release it. */
386 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
387 it_open_error(DISP_OPEN_OPEN, itp))
389 ll_release_openhandle(de, itp);
393 if (it_disposition(itp, DISP_LOOKUP_NEG))
394 GOTO(out, rc = -ENOENT);
396 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
397 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
398 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* Refresh the VFS inode from the reply and attach the DLM lock. */
402 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
403 if (!rc && itp->it_lock_mode)
404 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
407 ptlrpc_req_finished(req);
408 ll_intent_drop_lock(itp);
410 /* We did open by fid, but by the time we got to the server,
411 * the object disappeared. If this is a create, we cannot really
412 * tell the userspace that the file it was trying to create
413 * does not exist. Instead let's return -ESTALE, and the VFS will
414 * retry the create with LOOKUP_REVAL that we are going to catch
415 * in ll_revalidate_dentry() and use lookup then.
417 if (rc == -ENOENT && itp->it_op & IT_CREAT)
/* Populate an obd_client_handle from the MDT reply carried by @it
 * (open handle, fid, lease cookie, flags) and register it for open
 * replay on recovery. */
423 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
424 struct obd_client_handle *och)
426 struct mdt_body *body;
428 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
429 och->och_fh = body->mbo_handle;
430 och->och_fid = body->mbo_fid1;
431 och->och_lease_handle.cookie = it->it_lock_handle;
432 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
433 och->och_flags = it->it_flags;
435 return md_set_open_replay_data(md_exp, och, it);
/* Finish the client-local part of an open: optionally fill @och from the
 * intent reply, then attach @fd to the struct file and initialize its
 * readahead and cl_io context state. */
438 static int ll_local_open(struct file *file, struct lookup_intent *it,
439 struct ll_file_data *fd, struct obd_client_handle *och)
441 struct inode *inode = file_inode(file);
444 LASSERT(!LUSTRE_FPRIVATE(file));
451 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
456 LUSTRE_FPRIVATE(file) = fd;
457 ll_readahead_init(inode, &fd->fd_ras);
/* Remember the effective open mode for the matching close path. */
458 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
460 /* ll_cl_context initialize */
461 rwlock_init(&fd->fd_lock);
462 INIT_LIST_HEAD(&fd->fd_lccs);
467 /* Open a file, and (for the very first open) create objects on the OSTs at
468 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
469 * creation or open until ll_lov_setstripe() ioctl is called.
471 * If we already have the stripe MD locally then we don't request it in
472 * md_open(), by passing a lmm_size = 0.
474 * It is up to the application to ensure no other processes open this file
475 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
476 * used. We might be able to avoid races of that sort by getting lli_open_sem
477 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
478 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
480 int ll_file_open(struct inode *inode, struct file *file)
482 struct ll_inode_info *lli = ll_i2info(inode);
483 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
484 .it_flags = file->f_flags };
485 struct obd_client_handle **och_p = NULL;
486 __u64 *och_usecount = NULL;
487 struct ll_file_data *fd;
491 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
492 PFID(ll_inode2fid(inode)), inode, file->f_flags);
/* An intent from lookup may have been stashed in private_data. */
494 it = file->private_data; /* XXX: compat macro */
495 file->private_data = NULL; /* prevent ll_local_open assertion */
497 fd = ll_file_data_get();
499 GOTO(out_openerr, rc = -ENOMEM);
502 if (S_ISDIR(inode->i_mode))
503 ll_authorize_statahead(inode, fd);
/* Root dentry: no MDS open handle needed, just attach fd. */
505 if (inode->i_sb->s_root == file_dentry(file)) {
506 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent -> build our own IT_OPEN intent from f_flags. */
510 if (!it || !it->it_disposition) {
511 /* Convert f_flags into access mode. We cannot use file->f_mode,
512 * because everything but O_ACCMODE mask was stripped from
/* (O_ACCMODE trick: O_RDONLY/O_WRONLY/O_RDWR + 1 gives FMODE bits.) */
514 if ((oit.it_flags + 1) & O_ACCMODE)
516 if (file->f_flags & O_TRUNC)
517 oit.it_flags |= FMODE_WRITE;
519 /* kernel only call f_op->open in dentry_open. filp_open calls
520 * dentry_open after call to open_namei that checks permissions.
521 * Only nfsd_open call dentry_open directly without checking
522 * permissions and because of that this code below is safe. */
523 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
524 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
526 /* We do not want O_EXCL here, presumably we opened the file
527 * already? XXX - NFS implications? */
528 oit.it_flags &= ~O_EXCL;
530 /* bug20584, if "it_flags" contains O_CREAT, the file will be
531 * created if necessary, then "IT_CREAT" should be set to keep
532 * consistent with it */
533 if (oit.it_flags & O_CREAT)
534 oit.it_op |= IT_CREAT;
540 /* Let's see if we have file open on MDS already. */
541 if (it->it_flags & FMODE_WRITE) {
542 och_p = &lli->lli_mds_write_och;
543 och_usecount = &lli->lli_open_fd_write_count;
544 } else if (it->it_flags & FMODE_EXEC) {
545 och_p = &lli->lli_mds_exec_och;
546 och_usecount = &lli->lli_open_fd_exec_count;
548 och_p = &lli->lli_mds_read_och;
549 och_usecount = &lli->lli_open_fd_read_count;
552 mutex_lock(&lli->lli_och_mutex);
553 if (*och_p) { /* Open handle is present */
554 if (it_disposition(it, DISP_OPEN_OPEN)) {
555 /* Well, there's extra open request that we do not need,
556 let's close it somehow. This will decref request. */
557 rc = it_open_error(DISP_OPEN_OPEN, it);
559 mutex_unlock(&lli->lli_och_mutex);
560 GOTO(out_openerr, rc);
563 ll_release_openhandle(file_dentry(file), it);
/* Reuse the cached handle: och == NULL means "shared handle". */
567 rc = ll_local_open(file, it, fd, NULL);
570 mutex_unlock(&lli->lli_och_mutex);
571 GOTO(out_openerr, rc);
574 LASSERT(*och_usecount == 0);
575 if (!it->it_disposition) {
576 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
577 /* We cannot just request lock handle now, new ELC code
578 means that one of other OPEN locks for this file
579 could be cancelled, and since blocking ast handler
580 would attempt to grab och_mutex as well, that would
581 result in a deadlock */
582 mutex_unlock(&lli->lli_och_mutex);
584 * Normally called under two situations:
586 * 2. A race/condition on MDS resulting in no open
587 * handle to be returned from LOOKUP|OPEN request,
588 * for example if the target entry was a symlink.
590 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
591 * marked by a bit set in ll_iget_for_nfs. Clear the
592 * bit so that it's not confusing later callers.
594 * NB; when ldd is NULL, it must have come via normal
595 * lookup path only, since ll_iget_for_nfs always calls
598 if (ldd && ldd->lld_nfs_dentry) {
599 ldd->lld_nfs_dentry = 0;
600 it->it_flags |= MDS_OPEN_LOCK;
604 * Always specify MDS_OPEN_BY_FID because we don't want
605 * to get file with different fid.
607 it->it_flags |= MDS_OPEN_BY_FID;
608 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
611 GOTO(out_openerr, rc);
615 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
617 GOTO(out_och_free, rc = -ENOMEM);
621 /* md_intent_lock() didn't get a request ref if there was an
622 * open error, so don't do cleanup on the request here
624 /* XXX (green): Should not we bail out on any error here, not
625 * just open error? */
626 rc = it_open_error(DISP_OPEN_OPEN, it);
628 GOTO(out_och_free, rc);
630 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
631 "inode %p: disposition %x, status %d\n", inode,
632 it_disposition(it, ~0), it->it_status);
634 rc = ll_local_open(file, it, fd, *och_p);
636 GOTO(out_och_free, rc);
638 mutex_unlock(&lli->lli_och_mutex);
641 /* Must do this outside lli_och_mutex lock to prevent deadlock where
642 different kind of OPEN lock for this same inode gets cancelled
643 by ldlm_cancel_lru */
644 if (!S_ISREG(inode->i_mode))
645 GOTO(out_och_free, rc);
647 cl_lov_delay_create_clear(&file->f_flags);
648 GOTO(out_och_free, rc);
/* Error/cleanup paths below: free the handle, undo statahead
 * authorization, and drop the intent's open-request reference. */
652 if (och_p && *och_p) {
653 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
654 *och_p = NULL; /* OBD_FREE writes some magic there */
657 mutex_unlock(&lli->lli_och_mutex);
660 if (lli->lli_opendir_key == fd)
661 ll_deauthorize_statahead(inode, fd);
663 ll_file_data_put(fd);
665 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
668 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
669 ptlrpc_req_finished(it->it_request);
670 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* DLM blocking AST for lease locks: on a blocking callback, cancel the
 * lease lock asynchronously; the CANCELING phase is handled separately
 * (lines elided in this excerpt). */
676 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
677 struct ldlm_lock_desc *desc, void *data, int flag)
680 struct lustre_handle lockh;
684 case LDLM_CB_BLOCKING:
685 ldlm_lock2handle(lock, &lockh);
686 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
688 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
692 case LDLM_CB_CANCELING:
700 * When setting a lease on a file, we take ownership of the lli_mds_*_och
701 * and save it as fd->fd_och so as to force client to reopen the file even
702 * if it has an open lock in cache already.
704 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
705 struct lustre_handle *old_handle)
707 struct ll_inode_info *lli = ll_i2info(inode);
708 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
709 struct obd_client_handle **och_p;
714 /* Get the openhandle of the file */
715 mutex_lock(&lli->lli_och_mutex);
/* Only one lease per file descriptor at a time. */
716 if (fd->fd_lease_och != NULL)
717 GOTO(out_unlock, rc = -EBUSY);
719 if (fd->fd_och == NULL) {
720 if (file->f_mode & FMODE_WRITE) {
721 LASSERT(lli->lli_mds_write_och != NULL);
722 och_p = &lli->lli_mds_write_och;
723 och_usecount = &lli->lli_open_fd_write_count;
725 LASSERT(lli->lli_mds_read_och != NULL);
726 och_p = &lli->lli_mds_read_och;
727 och_usecount = &lli->lli_open_fd_read_count;
/* Can't take sole ownership while other opens share the handle. */
730 if (*och_usecount > 1)
731 GOTO(out_unlock, rc = -EBUSY);
/* Report the existing open handle so the MDT can match owners. */
738 *old_handle = fd->fd_och->och_fh;
742 mutex_unlock(&lli->lli_och_mutex);
747 * Release ownership on lli_mds_*_och when putting back a file lease.
749 static int ll_lease_och_release(struct inode *inode, struct file *file)
751 struct ll_inode_info *lli = ll_i2info(inode);
752 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
753 struct obd_client_handle **och_p;
754 struct obd_client_handle *old_och = NULL;
759 mutex_lock(&lli->lli_och_mutex);
760 if (file->f_mode & FMODE_WRITE) {
761 och_p = &lli->lli_mds_write_och;
762 och_usecount = &lli->lli_open_fd_write_count;
764 och_p = &lli->lli_mds_read_och;
765 och_usecount = &lli->lli_open_fd_read_count;
768 /* The file may have been open by another process (broken lease) so
769 * *och_p is not NULL. In this case we should simply increase usecount
772 if (*och_p != NULL) {
773 old_och = fd->fd_och;
780 mutex_unlock(&lli->lli_och_mutex);
/* Close the superseded handle outside the mutex. */
783 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
789 * Acquire a lease and open the file.
791 static struct obd_client_handle *
792 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
795 struct lookup_intent it = { .it_op = IT_OPEN };
796 struct ll_sb_info *sbi = ll_i2sbi(inode);
797 struct md_op_data *op_data;
798 struct ptlrpc_request *req = NULL;
799 struct lustre_handle old_handle = { 0 };
800 struct obd_client_handle *och = NULL;
/* Leases are exclusively read or write, never both or exec. */
805 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
806 RETURN(ERR_PTR(-EINVAL));
/* Requested lease mode must be covered by the file's open mode. */
809 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
810 RETURN(ERR_PTR(-EPERM));
812 rc = ll_lease_och_acquire(inode, file, &old_handle);
819 RETURN(ERR_PTR(-ENOMEM));
821 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
822 LUSTRE_OPC_ANY, NULL);
824 GOTO(out, rc = PTR_ERR(op_data));
826 /* To tell the MDT this openhandle is from the same owner */
827 op_data->op_handle = old_handle;
829 it.it_flags = fmode | open_flags;
830 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
831 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
832 &ll_md_blocking_lease_ast,
833 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
834 * it can be cancelled which may mislead applications that the lease is
836 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
837 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
838 * doesn't deal with openhandle, so normal openhandle will be leaked. */
839 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
840 ll_finish_md_op_data(op_data);
841 ptlrpc_req_finished(req);
843 GOTO(out_release_it, rc);
845 if (it_disposition(&it, DISP_LOOKUP_NEG))
846 GOTO(out_release_it, rc = -ENOENT);
848 rc = it_open_error(DISP_OPEN_OPEN, &it);
850 GOTO(out_release_it, rc);
852 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
853 ll_och_fill(sbi->ll_md_exp, &it, och);
855 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
856 GOTO(out_close, rc = -EOPNOTSUPP);
858 /* already get lease, handle lease lock */
859 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
860 if (it.it_lock_mode == 0 ||
861 it.it_lock_bits != MDS_INODELOCK_OPEN) {
862 /* open lock must return for lease */
863 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
864 PFID(ll_inode2fid(inode)), it.it_lock_mode,
866 GOTO(out_close, rc = -EPROTO);
869 ll_intent_release(&it);
/* Error path: drop the lease lock (if granted) and close the handle. */
873 /* Cancel open lock */
874 if (it.it_lock_mode != 0) {
875 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
878 och->och_lease_handle.cookie = 0ULL;
880 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
882 CERROR("%s: error closing file "DFID": %d\n",
883 ll_get_fsname(inode->i_sb, NULL, 0),
884 PFID(&ll_i2info(inode)->lli_fid), rc2);
885 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
887 ll_intent_release(&it);
895 * Check whether a layout swap can be done between two inodes.
897 * \param[in] inode1 First inode to check
898 * \param[in] inode2 Second inode to check
900 * \retval 0 on success, layout swap can be performed between both inodes
901 * \retval negative error code if requirements are not met
903 static int ll_check_swap_layouts_validity(struct inode *inode1,
904 struct inode *inode2)
/* Layout swap only makes sense for two regular files... */
906 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
/* ...that the caller may write... */
909 if (inode_permission(inode1, MAY_WRITE) ||
910 inode_permission(inode2, MAY_WRITE))
/* ...living on the same filesystem. */
913 if (inode1->i_sb != inode2->i_sb)
/* Close @inode's lease open handle with a layout-swap or layout-merge
 * bias against @inode2, after validating both inodes are eligible and
 * distinct. */
919 static int ll_swap_layouts_close(struct obd_client_handle *och,
920 struct inode *inode, struct inode *inode2,
923 const struct lu_fid *fid1 = ll_inode2fid(inode);
924 const struct lu_fid *fid2;
925 enum mds_op_bias bias;
929 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
930 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
932 rc = ll_check_swap_layouts_validity(inode, inode2);
934 GOTO(out_free_och, rc);
936 /* We now know that inode2 is a lustre inode */
937 fid2 = ll_inode2fid(inode2);
/* Swapping a layout with itself is meaningless. */
939 rc = lu_fid_cmp(fid1, fid2);
941 GOTO(out_free_och, rc = -EINVAL);
944 case SWAP_LAYOUTS_CLOSE:
945 bias = MDS_CLOSE_LAYOUT_SWAP;
947 case MERGE_LAYOUTS_CLOSE:
948 bias = MDS_CLOSE_LAYOUT_MERGE;
951 GOTO(out_free_och, rc = -EOPNOTSUPP);
954 /* Close the file and {swap,merge} layouts between inode & inode2.
955 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
956 * because we still need it to pack l_remote_handle to MDT. */
957 rc = ll_close_inode_openhandle(inode, och, bias, inode2);
959 och = NULL; /* freed in ll_close_inode_openhandle() */
969 * Release lease and close the file.
970 * It will check if the lease has ever broken.
972 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
975 struct ldlm_lock *lock;
/* Default to "cancelled" when the lock can no longer be found. */
976 bool cancelled = true;
980 lock = ldlm_handle2lock(&och->och_lease_handle);
982 lock_res_and_lock(lock);
983 cancelled = ldlm_is_cancel(lock);
984 unlock_res_and_lock(lock);
988 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
989 PFID(&ll_i2info(inode)->lli_fid), cancelled);
992 ldlm_cli_cancel(&och->och_lease_handle, 0);
994 if (lease_broken != NULL)
995 *lease_broken = cancelled;
997 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/* Merge MDS-provided inode attributes with OST-side attributes (size,
 * blocks, timestamps) under the inode size lock, keeping the newest
 * timestamp from either source. */
1001 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1003 struct ll_inode_info *lli = ll_i2info(inode);
1004 struct cl_object *obj = lli->lli_clob;
1005 struct cl_attr *attr = vvp_env_thread_attr(env);
1013 ll_inode_size_lock(inode);
1015 /* Merge timestamps the most recently obtained from MDS with
1016 * timestamps obtained from OSTs.
1018 * Do not overwrite atime of inode because it may be refreshed
1019 * by file_accessed() function. If the read was served by cache
1020 * data, there is no RPC to be sent so that atime may not be
1021 * transferred to OSTs at all. MDT only updates atime at close time
1022 * if it's at least 'mdd.*.atime_diff' older.
1023 * All in all, the atime in Lustre does not strictly comply with
1024 * POSIX. Solving this problem needs to send an RPC to MDT for each
1025 * read, this will hurt performance. */
1026 if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
1027 LTIME_S(inode->i_atime) = lli->lli_atime;
1028 lli->lli_update_atime = 0;
1030 LTIME_S(inode->i_mtime) = lli->lli_mtime;
1031 LTIME_S(inode->i_ctime) = lli->lli_ctime;
/* Capture the MDS view before folding in the OST attributes. */
1033 atime = LTIME_S(inode->i_atime);
1034 mtime = LTIME_S(inode->i_mtime);
1035 ctime = LTIME_S(inode->i_ctime);
1037 cl_object_attr_lock(obj);
1038 rc = cl_object_attr_get(env, obj, attr);
1039 cl_object_attr_unlock(obj);
/* -ENODATA (no OST attributes, e.g. released file) is not an error. */
1042 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1044 if (atime < attr->cat_atime)
1045 atime = attr->cat_atime;
1047 if (ctime < attr->cat_ctime)
1048 ctime = attr->cat_ctime;
1050 if (mtime < attr->cat_mtime)
1051 mtime = attr->cat_mtime;
1053 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1054 PFID(&lli->lli_fid), attr->cat_size);
1056 i_size_write(inode, attr->cat_size);
1057 inode->i_blocks = attr->cat_blocks;
1059 LTIME_S(inode->i_atime) = atime;
1060 LTIME_S(inode->i_mtime) = mtime;
1061 LTIME_S(inode->i_ctime) = ctime;
1064 ll_inode_size_unlock(inode);
/* Return true when atime updates are disabled for this file, checking
 * the open flags, the inode, the mount, and the superblock in turn. */
1069 static bool file_is_noatime(const struct file *file)
1071 const struct vfsmount *mnt = file->f_path.mnt;
1072 const struct inode *inode = file_inode((struct file *)file);
1074 /* Adapted from file_accessed() and touch_atime().*/
1075 if (file->f_flags & O_NOATIME)
1078 if (inode->i_flags & S_NOATIME)
1081 if (IS_NOATIME(inode))
1084 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1087 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1090 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1096 static int ll_file_io_ptask(struct cfs_ptask *ptask);
/* Initialize a cl_io for a read or write on @file: set up the embedded
 * kiocb, locking policy, noatime/nonblock flags, and parallel-IO /
 * non-delay options derived from the open flags and sb settings. */
1098 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1100 struct inode *inode = file_inode(file);
1101 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1103 memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter));
1104 init_sync_kiocb(&io->u.ci_rw.rw_iocb, file);
1105 io->u.ci_rw.rw_file = file;
1106 io->u.ci_rw.rw_ptask = ll_file_io_ptask;
1107 io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK);
1108 io->ci_lock_no_expand = fd->ll_lock_no_expand;
1110 if (iot == CIT_WRITE) {
1111 io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND);
1112 io->u.ci_rw.rw_sync = !!(file->f_flags & O_SYNC ||
1113 file->f_flags & O_DIRECT ||
1116 io->ci_obj = ll_i2info(inode)->lli_clob;
1117 io->ci_lockreq = CILR_MAYBE;
1118 if (ll_file_nolock(file)) {
1119 io->ci_lockreq = CILR_NEVER;
1120 io->ci_no_srvlock = 1;
1121 } else if (file->f_flags & O_APPEND) {
/* Appends need a mandatory extent lock to serialize EOF writes. */
1122 io->ci_lockreq = CILR_MANDATORY;
1124 io->ci_noatime = file_is_noatime(file);
/* Parallel IO is incompatible with append writes. */
1125 if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO)
1126 io->ci_pio = !io->u.ci_rw.rw_append;
1130 /* FLR: only use non-delay I/O for read as there is only one
1131 * avaliable mirror for write. */
1132 io->ci_ndelay = !(iot == CIT_WRITE);
/* Parallel-task worker: run one slice of a split read/write described by
 * the cl_io_pt in @ptask->pt_cbdata through the CLIO stack, accumulating
 * progress into pt->cip_result and updating the saved iov/kiocb state. */
1135 static int ll_file_io_ptask(struct cfs_ptask *ptask)
1137 struct cl_io_pt *pt = ptask->pt_cbdata;
1138 struct file *file = pt->cip_file;
1141 loff_t pos = pt->cip_pos;
1146 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1147 file_dentry(file)->d_name.name,
1148 pt->cip_iot == CIT_READ ? "read" : "write",
1149 pos, pos + pt->cip_count);
1151 env = cl_env_get(&refcheck);
1153 RETURN(PTR_ERR(env));
1155 io = vvp_env_thread_io(env);
1156 ll_io_init(io, file, pt->cip_iot);
1157 io->u.ci_rw.rw_iter = pt->cip_iter;
1158 io->u.ci_rw.rw_iocb = pt->cip_iocb;
1159 io->ci_pio = 0; /* It's already in parallel task */
/* Resume from wherever previous restarts of this slice got to. */
1161 rc = cl_io_rw_init(env, io, pt->cip_iot, pos,
1162 pt->cip_count - pt->cip_result);
1164 struct vvp_io *vio = vvp_env_io(env);
1166 vio->vui_io_subtype = IO_NORMAL;
1167 vio->vui_fd = LUSTRE_FPRIVATE(file);
1169 ll_cl_add(file, env, io, LCC_RW);
1170 rc = cl_io_loop(env, io);
1171 ll_cl_remove(file, env);
1173 /* cl_io_rw_init() handled IO */
/* Fault-injection hook for testing partial parallel-IO failures. */
1177 if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) {
1183 if (io->ci_nob > 0) {
1184 pt->cip_result += io->ci_nob;
1185 iov_iter_advance(&pt->cip_iter, io->ci_nob);
1187 pt->cip_iocb.ki_pos = pos;
1188 #ifdef HAVE_KIOCB_KI_LEFT
1189 pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result;
1190 #elif defined(HAVE_KI_NBYTES)
1191 pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result;
1195 cl_io_fini(env, io);
1196 cl_env_put(env, &refcheck);
/* Propagate restart requests (e.g. layout change) to the caller. */
1198 pt->cip_need_restart = io->ci_need_restart;
1200 CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
1201 file_dentry(file)->d_name.name,
1202 pt->cip_iot == CIT_READ ? "read" : "write",
1203 pt->cip_result, rc);
1205 RETURN(pt->cip_result > 0 ? 0 : rc);
/*
 * Common engine behind read/write/splice: build a cl_io for \a iot, take
 * the per-inode range lock where required, run the client IO loop, and
 * restart the whole IO when the layout changes underneath us.
 *
 * \param env	per-thread lu_env
 * \param args	IO arguments (normal iov_iter/kiocb or splice pipe)
 * \param file	file the IO is against
 * \param iot	CIT_READ or CIT_WRITE
 * \param ppos	in/out file position
 * \param count	number of bytes to transfer
 *
 * \retval	bytes transferred if > 0, otherwise error code
 */
static ssize_t
ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
		   struct file *file, enum cl_io_type iot,
		   loff_t *ppos, size_t count)
{
	struct range_lock	range;
	struct vvp_io		*vio = vvp_env_io(env);
	struct inode		*inode = file_inode(file);
	struct ll_inode_info	*lli = ll_i2info(inode);
	struct ll_file_data	*fd  = LUSTRE_FPRIVATE(file);
	struct cl_io		*io;
	loff_t			pos = *ppos;
	ssize_t			result = 0;
	int			rc = 0;
	unsigned		retried = 0;
	bool			restarted = false;

	ENTRY;

	CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
		file_dentry(file)->d_name.name,
		iot == CIT_READ ? "read" : "write", pos, pos + count);

restart:
	io = vvp_env_thread_io(env);
	ll_io_init(io, file, iot);
	if (args->via_io_subtype == IO_NORMAL) {
		io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
		io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;
	}

	/* Parallel IO only applies to a first-pass normal IO; a restarted
	 * or splice IO runs single-threaded. */
	if (args->via_io_subtype != IO_NORMAL || restarted)
		io->ci_pio = 0;
	/* preserve the nonblocking-retry count across restarts (FLR) */
	io->ci_ndelay_tried = retried;

	if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
		bool range_locked = false;

		/* O_APPEND writes do not know their final extent in
		 * advance, so lock to EOF */
		if (file->f_flags & O_APPEND)
			range_lock_init(&range, 0, LUSTRE_EOF);
		else
			range_lock_init(&range, pos, pos + count - 1);

		vio->vui_fd  = LUSTRE_FPRIVATE(file);
		vio->vui_io_subtype = args->via_io_subtype;

		switch (vio->vui_io_subtype) {
		case IO_NORMAL:
			/* Direct IO reads must also take range lock,
			 * or multiple reads will try to work on the same pages
			 * See LU-6227 for details. */
			if (((iot == CIT_WRITE) ||
			    (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
			    !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
				CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
				       RL_PARA(&range));
				rc = range_lock(&lli->lli_write_tree, &range);
				if (rc < 0)
					GOTO(out, rc);

				range_locked = true;
			}
			break;
		case IO_SPLICE:
			vio->u.splice.vui_pipe = args->u.splice.via_pipe;
			vio->u.splice.vui_flags = args->u.splice.via_flags;
			break;
		default:
			CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
			LBUG();
		}

		ll_cl_add(file, env, io, LCC_RW);
		/* parallel write needs the inode lock held across the whole
		 * IO since sub-tasks cannot take it themselves */
		if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) &&
		    !lli->lli_inode_locked) {
			inode_lock(inode);
			lli->lli_inode_locked = 1;
		}
		rc = cl_io_loop(env, io);
		if (lli->lli_inode_locked) {
			lli->lli_inode_locked = 0;
			inode_unlock(inode);
		}
		ll_cl_remove(file, env);

		if (range_locked) {
			CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
			       RL_PARA(&range));
			range_unlock(&lli->lli_write_tree, &range);
		}
	} else {
		/* cl_io_rw_init() handled IO */
		rc = io->ci_result;
	}

	if (io->ci_nob > 0) {
		result += io->ci_nob;
		count -= io->ci_nob;

		/* advance iterator/iocb so a possible restart resumes at
		 * the right place */
		if (args->via_io_subtype == IO_NORMAL) {
			iov_iter_advance(args->u.normal.via_iter, io->ci_nob);

			pos = io->u.ci_rw.rw_range.cir_pos;
			args->u.normal.via_iocb->ki_pos = pos;
#ifdef HAVE_KIOCB_KI_LEFT
			args->u.normal.via_iocb->ki_left = count;
#elif defined(HAVE_KI_NBYTES)
			args->u.normal.via_iocb->ki_nbytes = count;
#endif
		} else {
			/* splice does not carry a kiocb */
			pos = io->u.ci_rw.rw_range.cir_pos;
		}
	}
out:
	cl_io_fini(env, io);

	CDEBUG(D_VFSTRACE,
	       "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
	       file->f_path.dentry->d_name.name,
	       iot, rc, result, io->ci_need_restart);

	if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
		CDEBUG(D_VFSTRACE,
		       "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
		       file_dentry(file)->d_name.name,
		       iot == CIT_READ ? "read" : "write",
		       pos, pos + count, result, rc);
		/* preserve the tried count for FLR */
		retried = io->ci_ndelay_tried;
		restarted = true;
		goto restart;
	}

	if (iot == CIT_READ) {
		if (result > 0)
			ll_stats_ops_tally(ll_i2sbi(inode),
					   LPROC_LL_READ_BYTES, result);
	} else if (iot == CIT_WRITE) {
		if (result > 0) {
			ll_stats_ops_tally(ll_i2sbi(inode),
					   LPROC_LL_WRITE_BYTES, result);
			fd->fd_write_failed = false;
		} else if (result == 0 && rc == 0) {
			rc = io->ci_result;
			if (rc < 0)
				fd->fd_write_failed = true;
			else
				fd->fd_write_failed = false;
		} else if (rc != -ERESTARTSYS) {
			fd->fd_write_failed = true;
		}
	}

	CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n",
		file_dentry(file)->d_name.name,
		iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc);

	*ppos = pos;

	RETURN(result > 0 ? result : rc);
}
1370 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1371 * especially for small I/O.
1373 * To serve a read request, CLIO has to create and initialize a cl_io and
 * then request DLM lock. This has turned out to have significant overhead
1375 * and affects the performance of small I/O dramatically.
1377 * It's not necessary to create a cl_io for each I/O. Under the help of read
1378 * ahead, most of the pages being read are already in memory cache and we can
1379 * read those pages directly because if the pages exist, the corresponding DLM
1380 * lock must exist so that page content must be valid.
1382 * In fast read implementation, the llite speculatively finds and reads pages
1383 * in memory cache. There are three scenarios for fast read:
1384 * - If the page exists and is uptodate, kernel VM will provide the data and
1385 * CLIO won't be intervened;
1386 * - If the page was brought into memory by read ahead, it will be exported
1387 * and read ahead parameters will be updated;
1388 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1389 * it will go back and invoke normal read, i.e., a cl_io will be created
1390 * and DLM lock will be requested.
1392 * POSIX compliance: posix standard states that read is intended to be atomic.
1393 * Lustre read implementation is in line with Linux kernel read implementation
1394 * and neither of them complies with POSIX standard in this matter. Fast read
1395 * doesn't make the situation worse on single node but it may interleave write
1396 * results from multiple nodes due to short read handling in ll_file_aio_read().
1398 * \param env - lu_env
1399 * \param iocb - kiocb from kernel
1400 * \param iter - user space buffers where the data will be copied
1402 * \retval - number of bytes have been read, or error code if error occurred.
/*
 * Try to serve the read directly from the page cache without building a
 * cl_io (see the "fast read" design comment above).
 *
 * \retval	bytes read; 0 when fast read is unavailable/inapplicable so
 *		the caller falls back to the normal read path
 */
static ssize_t
ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
{
	ssize_t result;

	if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
		return 0;

	/* NB: we can't do direct IO for fast read because it will need a lock
	 * to make IO engine happy. */
	if (iocb->ki_filp->f_flags & O_DIRECT)
		return 0;

	result = generic_file_read_iter(iocb, iter);

	/* If the first page is not in cache, generic_file_aio_read() will be
	 * returned with -ENODATA.
	 * See corresponding code in ll_readpage(). */
	if (result == -ENODATA)
		result = 0;

	if (result > 0)
		ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
				   LPROC_LL_READ_BYTES, result);

	return result;
}
1433 * Read from a file (through the page cache).
/*
 * Read from a file (through the page cache).
 *
 * Attempts the lock-free fast-read path first; any remaining bytes are
 * read through the normal cl_io machinery.
 */
static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct lu_env *env;
	struct vvp_io_args *args;
	ssize_t result;
	ssize_t rc2;
	__u16 refcheck;

	result = ll_do_fast_read(iocb, to);
	/* fast read satisfied everything, or failed outright */
	if (result < 0 || iov_iter_count(to) == 0)
		GOTO(out, result);

	env = cl_env_get(&refcheck);
	if (IS_ERR(env))
		return PTR_ERR(env);

	args = ll_env_args(env, IO_NORMAL);
	args->u.normal.via_iter = to;
	args->u.normal.via_iocb = iocb;

	rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
				 &iocb->ki_pos, iov_iter_count(to));
	/* combine fast-read bytes with slow-path result */
	if (rc2 > 0)
		result += rc2;
	else if (result == 0)
		result = rc2;

	cl_env_put(env, &refcheck);
out:
	return result;
}
1468 * Write to a file (through the page cache).
/*
 * Write to a file (through the page cache).
 */
static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct vvp_io_args *args;
	struct lu_env *env;
	ssize_t result;
	__u16 refcheck;

	env = cl_env_get(&refcheck);
	if (IS_ERR(env))
		return PTR_ERR(env);

	args = ll_env_args(env, IO_NORMAL);
	args->u.normal.via_iter = from;
	args->u.normal.via_iocb = iocb;

	result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
				    &iocb->ki_pos, iov_iter_count(from));
	cl_env_put(env, &refcheck);
	return result;
}
1491 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1493 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
 *
 * Validate an iovec array and compute the total byte count, truncating
 * \a nr_segs at the first segment that fails access_ok().
 */
static int ll_file_get_iov_count(const struct iovec *iov,
				 unsigned long *nr_segs, size_t *count)
{
	size_t cnt = 0;
	unsigned long seg;

	for (seg = 0; seg < *nr_segs; seg++) {
		const struct iovec *iv = &iov[seg];

		/*
		 * If any segment has a negative length, or the cumulative
		 * length ever wraps negative then return -EINVAL.
		 */
		cnt += iv->iov_len;
		if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
			return -EINVAL;
		if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
			continue;
		/* first segment already unreadable: hard failure */
		if (seg == 0)
			return -EFAULT;
		*nr_segs = seg;
		cnt -= iv->iov_len;	/* This segment is no good */
		break;
	}
	*count = cnt;
	return 0;
}
/*
 * Legacy aio_read entry point: build an iov_iter from the iovec array
 * and forward to ll_file_read_iter().
 */
static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
				unsigned long nr_segs, loff_t pos)
{
	struct iov_iter	to;
	size_t iov_count;
	ssize_t result;
	ENTRY;

	result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
	if (result)
		RETURN(result);

# ifdef HAVE_IOV_ITER_INIT_DIRECTION
	iov_iter_init(&to, READ, iov, nr_segs, iov_count);
# else /* !HAVE_IOV_ITER_INIT_DIRECTION */
	iov_iter_init(&to, iov, nr_segs, iov_count, 0);
# endif /* HAVE_IOV_ITER_INIT_DIRECTION */

	result = ll_file_read_iter(iocb, &to);

	RETURN(result);
}
1546 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1549 struct iovec iov = { .iov_base = buf, .iov_len = count };
1554 init_sync_kiocb(&kiocb, file);
1555 kiocb.ki_pos = *ppos;
1556 #ifdef HAVE_KIOCB_KI_LEFT
1557 kiocb.ki_left = count;
1558 #elif defined(HAVE_KI_NBYTES)
1559 kiocb.i_nbytes = count;
1562 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1563 *ppos = kiocb.ki_pos;
1569 * Write to a file (through the page cache).
/*
 * Write to a file (through the page cache).
 *
 * Legacy aio_write entry point: build an iov_iter from the iovec array
 * and forward to ll_file_write_iter().
 */
static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
				 unsigned long nr_segs, loff_t pos)
{
	struct iov_iter from;
	size_t iov_count;
	ssize_t result;
	ENTRY;

	result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
	if (result)
		RETURN(result);

# ifdef HAVE_IOV_ITER_INIT_DIRECTION
	iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
# else /* !HAVE_IOV_ITER_INIT_DIRECTION */
	iov_iter_init(&from, iov, nr_segs, iov_count, 0);
# endif /* HAVE_IOV_ITER_INIT_DIRECTION */

	result = ll_file_write_iter(iocb, &from);

	RETURN(result);
}
/*
 * Classic write(2) entry point: wraps a plain user buffer in an
 * iovec and a per-env kiocb, then calls ll_file_aio_write().
 */
static ssize_t ll_file_write(struct file *file, const char __user *buf,
			     size_t count, loff_t *ppos)
{
	struct iovec	iov = { .iov_base = (void __user *)buf,
				.iov_len = count };
	struct kiocb   *kiocb;
	ssize_t		result;
	__u16		refcheck;
	struct lu_env  *env;
	ENTRY;

	env = cl_env_get(&refcheck);
	if (IS_ERR(env))
		RETURN(PTR_ERR(env));

	/* kiocb lives in the env to avoid a large stack frame */
	kiocb = &ll_env_info(env)->lti_kiocb;
	init_sync_kiocb(kiocb, file);
	kiocb->ki_pos = *ppos;
#ifdef HAVE_KIOCB_KI_LEFT
	kiocb->ki_left = count;
#elif defined(HAVE_KI_NBYTES)
	kiocb->ki_nbytes = count;
#endif

	result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos);
	*ppos = kiocb->ki_pos;

	cl_env_put(env, &refcheck);
	RETURN(result);
}
1628 * Send file content (through pagecache) somewhere with helper
/*
 * Send file content (through the page cache) into a pipe for splice(2).
 */
static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
				   struct pipe_inode_info *pipe, size_t count,
				   unsigned int flags)
{
	struct lu_env *env;
	struct vvp_io_args *args;
	ssize_t result;
	__u16 refcheck;
	ENTRY;

	env = cl_env_get(&refcheck);
	if (IS_ERR(env))
		RETURN(PTR_ERR(env));

	args = ll_env_args(env, IO_SPLICE);
	args->u.splice.via_pipe = pipe;
	args->u.splice.via_flags = flags;

	result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
	cl_env_put(env, &refcheck);
	RETURN(result);
}
/*
 * Set the striping EA on \a inode by re-opening it by FID with the
 * given layout; the transient open handle is released immediately.
 *
 * \param lum		user-supplied layout (lov_user_md)
 * \param lum_size	size of \a lum in bytes
 */
int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
			     __u64 flags, struct lov_user_md *lum, int lum_size)
{
	struct lookup_intent oit = {
		.it_op = IT_OPEN,
		.it_flags = flags | MDS_OPEN_BY_FID,
	};
	int rc;
	ENTRY;

	ll_inode_size_lock(inode);
	rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
	if (rc < 0)
		GOTO(out_unlock, rc);

	/* only the open handle was wanted for the EA update; close it */
	ll_release_openhandle(dentry, &oit);

out_unlock:
	ll_inode_size_unlock(inode);
	ll_intent_release(&oit);

	RETURN(rc);
}
/*
 * Fetch the LOV EA (striping information) of \a filename (a child of
 * \a inode) from the MDS, byte-swapping it to host endianness for
 * userspace consumption.
 *
 * \param[out] lmmp		points into the reply buffer on success
 * \param[out] lmm_size		size of the returned EA
 * \param[out] request		reply request; caller must release it
 */
int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
			     struct lov_mds_md **lmmp, int *lmm_size,
			     struct ptlrpc_request **request)
{
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct mdt_body *body;
	struct lov_mds_md *lmm = NULL;
	struct ptlrpc_request *req = NULL;
	struct md_op_data *op_data;
	int rc, lmmsize;

	rc = ll_get_default_mdsize(sbi, &lmmsize);
	if (rc)
		RETURN(rc);

	op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
				     strlen(filename), lmmsize,
				     LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data))
		RETURN(PTR_ERR(op_data));

	op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
	rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
	ll_finish_md_op_data(op_data);
	if (rc < 0) {
		CDEBUG(D_INFO, "md_getattr_name failed "
		       "on %s: rc %d\n", filename, rc);
		GOTO(out, rc);
	}

	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
	LASSERT(body != NULL); /* checked by mdc_getattr_name */

	lmmsize = body->mbo_eadatasize;

	if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
	    lmmsize == 0) {
		GOTO(out, rc = -ENODATA);
	}

	lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
	LASSERT(lmm != NULL);

	if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
	    lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
	    lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
		GOTO(out, rc = -EPROTO);

	/*
	 * This is coming from the MDS, so is probably in
	 * little endian. We convert it to host endian before
	 * passing it to userspace.
	 */
	if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
		int stripe_count = 0;

		if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
		    lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
			stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
			if (le32_to_cpu(lmm->lmm_pattern) &
			    LOV_PATTERN_F_RELEASED)
				stripe_count = 0;
		}

		/* if function called for directory - we should
		 * avoid swab not existent lsm objects */
		if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
			lustre_swab_lov_user_md_v1(
					(struct lov_user_md_v1 *)lmm);
			if (S_ISREG(body->mbo_mode))
				lustre_swab_lov_user_md_objects(
				 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
				 stripe_count);
		} else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
			lustre_swab_lov_user_md_v3(
					(struct lov_user_md_v3 *)lmm);
			if (S_ISREG(body->mbo_mode))
				lustre_swab_lov_user_md_objects(
				 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
				 stripe_count);
		} else if (lmm->lmm_magic ==
			   cpu_to_le32(LOV_MAGIC_COMP_V1)) {
			lustre_swab_lov_comp_md_v1(
					(struct lov_comp_md_v1 *)lmm);
		}
	}

out:
	*lmmp = lmm;
	*lmm_size = lmmsize;
	*request = req;
	RETURN(rc);
}
/*
 * LL_IOC_LOV_SETEA handler: set a raw striping EA supplied by a
 * privileged user (requires CAP_SYS_ADMIN).
 */
static int ll_lov_setea(struct inode *inode, struct file *file,
			void __user *arg)
{
	__u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
	struct lov_user_md *lump;
	int lum_size = sizeof(struct lov_user_md) +
		       sizeof(struct lov_user_ost_data);
	int rc;
	ENTRY;

	if (!cfs_capable(CFS_CAP_SYS_ADMIN))
		RETURN(-EPERM);

	OBD_ALLOC_LARGE(lump, lum_size);
	if (lump == NULL)
		RETURN(-ENOMEM);

	if (copy_from_user(lump, arg, lum_size))
		GOTO(out_lump, rc = -EFAULT);

	rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
				      lum_size);
	/* layout has been (attempted to be) created; drop the delay flag */
	cl_lov_delay_create_clear(&file->f_flags);

out_lump:
	OBD_FREE_LARGE(lump, lum_size);
	RETURN(rc);
}
/*
 * Copy the file's striping information into the userspace buffer
 * \a lum of \a size bytes.
 */
static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
{
	struct lu_env *env;
	__u16 refcheck;
	int rc;
	ENTRY;

	env = cl_env_get(&refcheck);
	if (IS_ERR(env))
		RETURN(PTR_ERR(env));

	rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
	cl_env_put(env, &refcheck);
	RETURN(rc);
}
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy the layout from userspace, apply
 * it, then hand the instantiated layout back to the caller.
 */
static int ll_lov_setstripe(struct inode *inode, struct file *file,
			    void __user *arg)
{
	struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
	struct lov_user_md *klum;
	int lum_size, rc;
	__u64 flags = FMODE_WRITE;
	ENTRY;

	rc = ll_copy_user_md(lum, &klum);
	if (rc < 0)
		RETURN(rc);

	lum_size = rc;
	rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
				      lum_size);
	if (!rc) {
		__u32 gen;

		/* stripe_count 0 tells getstripe to return the full layout */
		rc = put_user(0, &lum->lmm_stripe_count);
		if (rc)
			GOTO(out, rc);

		rc = ll_layout_refresh(inode, &gen);
		if (rc)
			GOTO(out, rc);

		rc = ll_file_getstripe(inode, arg, lum_size);
	}
	cl_lov_delay_create_clear(&file->f_flags);

out:
	OBD_FREE(klum, lum_size);
	RETURN(rc);
}
/*
 * Acquire a group lock with group id \a arg on the whole file and stash
 * it in the per-open-file data.  Fails if a group lock is already held
 * on this file descriptor.
 */
static int
ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct cl_object *obj = lli->lli_clob;
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct ll_grouplock grouplock;
	int rc;
	ENTRY;

	if (arg == 0) {
		CWARN("group id for group lock must not be 0\n");
		RETURN(-EINVAL);
	}

	if (ll_file_nolock(file))
		RETURN(-EOPNOTSUPP);

	spin_lock(&lli->lli_lock);
	if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
		CWARN("group lock already existed with gid %lu\n",
		      fd->fd_grouplock.lg_gid);
		spin_unlock(&lli->lli_lock);
		RETURN(-EINVAL);
	}
	LASSERT(fd->fd_grouplock.lg_lock == NULL);
	spin_unlock(&lli->lli_lock);

	/**
	 * XXX: group lock needs to protect all OST objects while PFL
	 * can add new OST objects during the IO, so we'd instantiate
	 * all OST objects before getting its group lock.
	 */
	if (obj) {
		struct lu_env *env;
		__u16 refcheck;
		struct cl_layout cl = {
			.cl_is_composite = false,
		};

		env = cl_env_get(&refcheck);
		if (IS_ERR(env))
			RETURN(PTR_ERR(env));

		rc = cl_object_layout_get(env, obj, &cl);
		if (!rc && cl.cl_is_composite)
			rc = ll_layout_write_intent(inode, 0, OBD_OBJECT_EOF);

		cl_env_put(env, &refcheck);
		if (rc)
			RETURN(rc);
	}

	rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
			      arg, (file->f_flags & O_NONBLOCK), &grouplock);
	if (rc)
		RETURN(rc);

	/* re-check under the lock: another thread may have raced us */
	spin_lock(&lli->lli_lock);
	if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
		spin_unlock(&lli->lli_lock);
		CERROR("another thread just won the race\n");
		cl_put_grouplock(&grouplock);
		RETURN(-EINVAL);
	}

	fd->fd_flags |= LL_FILE_GROUP_LOCKED;
	fd->fd_grouplock = grouplock;
	spin_unlock(&lli->lli_lock);

	CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
	RETURN(0);
}
/*
 * Release the group lock with group id \a arg previously taken via
 * ll_get_grouplock() on this file descriptor.
 */
static int ll_put_grouplock(struct inode *inode, struct file *file,
			    unsigned long arg)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct ll_grouplock grouplock;
	ENTRY;

	spin_lock(&lli->lli_lock);
	if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
		spin_unlock(&lli->lli_lock);
		CWARN("no group lock held\n");
		RETURN(-EINVAL);
	}

	LASSERT(fd->fd_grouplock.lg_lock != NULL);

	if (fd->fd_grouplock.lg_gid != arg) {
		CWARN("group lock %lu doesn't match current id %lu\n",
		      arg, fd->fd_grouplock.lg_gid);
		spin_unlock(&lli->lli_lock);
		RETURN(-EINVAL);
	}

	/* detach the lock from fd under the spinlock, release it outside */
	grouplock = fd->fd_grouplock;
	memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
	fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
	spin_unlock(&lli->lli_lock);

	cl_put_grouplock(&grouplock);
	CDEBUG(D_INFO, "group lock %lu released\n", arg);
	RETURN(0);
}
1961 * Close inode open handle
1963 * \param dentry [in] dentry which contains the inode
1964 * \param it [in,out] intent which contains open info and result
1967 * \retval <0 failure
/**
 * Close inode open handle
 *
 * \param dentry [in]     dentry which contains the inode
 * \param it     [in,out] intent which contains open info and result
 *
 * \retval 0     success
 * \retval <0    failure
 */
int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
{
	struct inode *inode = dentry->d_inode;
	struct obd_client_handle *och;
	int rc;
	ENTRY;

	LASSERT(inode);

	/* Root ? Do nothing. */
	if (dentry->d_inode->i_sb->s_root == dentry)
		RETURN(0);

	/* No open handle to close? Move away */
	if (!it_disposition(it, DISP_OPEN_OPEN))
		RETURN(0);

	LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);

	OBD_ALLOC(och, sizeof(*och));
	if (!och)
		GOTO(out, rc = -ENOMEM);

	ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);

	rc = ll_close_inode_openhandle(inode, och, 0, NULL);
out:
	/* this one is in place of ll_file_open */
	if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
		ptlrpc_req_finished(it->it_request);
		it_clear_disposition(it, DISP_ENQ_OPEN_REF);
	}
	RETURN(rc);
}
2005 * Get size for inode for which FIEMAP mapping is requested.
2006 * Make the FIEMAP get_info call and returns the result.
 * \param fiemap	kernel buffer to hold extents
2008 * \param num_bytes kernel buffer size
/*
 * Validate the FIEMAP request flags, glimpse/refresh the file size, and
 * forward the mapping request down the cl_object stack.
 *
 * \param fiemap	kernel buffer holding the request and the extents
 * \param num_bytes	size of \a fiemap in bytes
 */
static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
			size_t num_bytes)
{
	struct lu_env			*env;
	__u16				refcheck;
	int				rc = 0;
	struct ll_fiemap_info_key	fmkey = { .lfik_name = KEY_FIEMAP, };
	ENTRY;

	/* Checks for fiemap flags */
	if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
		/* report back to caller which flags we do support */
		fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
		return -EBADR;
	}

	/* Check for FIEMAP_FLAG_SYNC */
	if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
		rc = filemap_fdatawrite(inode->i_mapping);
		if (rc)
			return rc;
	}

	env = cl_env_get(&refcheck);
	if (IS_ERR(env))
		RETURN(PTR_ERR(env));

	/* local size may be stale; refresh it from the OSTs */
	if (i_size_read(inode) == 0) {
		rc = ll_glimpse_size(inode);
		if (rc)
			GOTO(out, rc);
	}

	fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
	obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
	obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);

	/* If filesize is 0, then there would be no objects for mapping */
	if (fmkey.lfik_oa.o_size == 0) {
		fiemap->fm_mapped_extents = 0;
		GOTO(out, rc = 0);
	}

	fmkey.lfik_fiemap = *fiemap;

	rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
			      &fmkey, fiemap, &num_bytes);
out:
	cl_env_put(env, &refcheck);
	RETURN(rc);
}
/*
 * OBD_IOC_FID2PATH handler: resolve a FID into a pathname via the MDC
 * and copy the result back to userspace.
 */
int ll_fid2path(struct inode *inode, void __user *arg)
{
	struct obd_export *exp = ll_i2mdexp(inode);
	const struct getinfo_fid2path __user *gfin = arg;
	__u32 pathlen;
	struct getinfo_fid2path *gfout;
	size_t outsize;
	int rc;

	ENTRY;

	if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
	    !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
		RETURN(-EPERM);

	/* Only need to get the buflen */
	if (get_user(pathlen, &gfin->gf_pathlen))
		RETURN(-EFAULT);

	if (pathlen > PATH_MAX)
		RETURN(-EINVAL);

	outsize = sizeof(*gfout) + pathlen;
	OBD_ALLOC(gfout, outsize);
	if (gfout == NULL)
		RETURN(-ENOMEM);

	if (copy_from_user(gfout, arg, sizeof(*gfout)))
		GOTO(gf_free, rc = -EFAULT);
	/* append root FID after gfout to let MDT know the root FID so that it
	 * can lookup the correct path, this is mainly for fileset.
	 * old server without fileset mount support will ignore this. */
	*gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);

	/* Call mdc_iocontrol */
	rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
	if (rc != 0)
		GOTO(gf_free, rc);

	if (copy_to_user(arg, gfout, outsize))
		rc = -EFAULT;

gf_free:
	OBD_FREE(gfout, outsize);
	RETURN(rc);
}
/*
 * Run a CIT_DATA_VERSION cl_io to fetch the file's data version and
 * layout version from the OSTs, retrying on layout change.
 */
static int
ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
{
	struct cl_object *obj = ll_i2info(inode)->lli_clob;
	struct lu_env *env;
	struct cl_io *io;
	__u16 refcheck;
	int result;

	ENTRY;

	ioc->idv_version = 0;
	ioc->idv_layout_version = UINT_MAX;

	/* If no file object initialized, we consider its version is 0. */
	if (obj == NULL)
		RETURN(0);

	env = cl_env_get(&refcheck);
	if (IS_ERR(env))
		RETURN(PTR_ERR(env));

	io = vvp_env_thread_io(env);
	io->ci_obj = obj;
	io->u.ci_data_version.dv_data_version = 0;
	io->u.ci_data_version.dv_layout_version = UINT_MAX;
	io->u.ci_data_version.dv_flags = ioc->idv_flags;

restart:
	if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
		result = cl_io_loop(env, io);
	else
		result = io->ci_result;

	ioc->idv_version = io->u.ci_data_version.dv_data_version;
	ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;

	cl_io_fini(env, io);

	/* layout changed mid-IO: re-run against the new layout */
	if (unlikely(io->ci_need_restart))
		goto restart;

	cl_env_put(env, &refcheck);

	RETURN(result);
}
2156 * Read the data_version for inode.
2158 * This value is computed using stripe object version on OST.
2159 * Version is computed using server side locking.
2161 * @param flags if do sync on the OST side;
2163 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2164 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/*
 * Read the data_version for inode.
 *
 * This value is computed using stripe object version on OST.
 * Version is computed using server side locking.
 *
 * \param flags if do sync on the OST side;
 *		0: no sync
 *		LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
 *		LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
 */
int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
{
	struct ioc_data_version ioc = { .idv_flags = flags };
	int rc;

	rc = ll_ioc_data_version(inode, &ioc);
	if (!rc)
		*data_version = ioc.idv_version;

	return rc;
}
2179 * Trigger a HSM release request for the provided inode.
/*
 * Trigger a HSM release request for the provided inode: take a write
 * lease, flush and record the data version, then close with
 * MDS_HSM_RELEASE so the MDT can drop the OST objects.
 */
int ll_hsm_release(struct inode *inode)
{
	struct lu_env *env;
	struct obd_client_handle *och = NULL;
	__u64 data_version = 0;
	int rc;
	__u16 refcheck;
	ENTRY;

	CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
	       ll_get_fsname(inode->i_sb, NULL, 0),
	       PFID(&ll_i2info(inode)->lli_fid));

	och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
	if (IS_ERR(och))
		GOTO(out, rc = PTR_ERR(och));

	/* Grab latest data_version and [am]time values */
	rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
	if (rc != 0)
		GOTO(out, rc);

	env = cl_env_get(&refcheck);
	if (IS_ERR(env))
		GOTO(out, rc = PTR_ERR(env));

	ll_merge_attr(env, inode);
	cl_env_put(env, &refcheck);

	/* Release the file.
	 * NB: lease lock handle is released in mdc_hsm_release_pack() because
	 * we still need it to pack l_remote_handle to MDT. */
	rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
				       &data_version);
	och = NULL;

out:
	if (och != NULL && !IS_ERR(och)) /* close the file */
		ll_lease_close(och, inode, NULL);

	RETURN(rc);
}
/* State for ll_swap_layouts(): the pair of inodes plus the data
 * versions to verify; kept in one struct so the fields can be swapped
 * together when the two files are reordered. */
struct ll_swap_stack {
	__u64		 dv1;		/* expected data version of inode1 */
	__u64		 dv2;		/* expected data version of inode2 */
	struct inode	*inode1;
	struct inode	*inode2;
	bool		 check_dv1;	/* verify dv1 before swapping */
	bool		 check_dv2;	/* verify dv2 before swapping */
};
/*
 * Atomically swap the layouts of two files (LL_IOC_LOV_SWAP_LAYOUTS).
 * Optionally flushes both files via a group lock and verifies their
 * data versions have not changed before performing the swap on the MDT.
 */
static int ll_swap_layouts(struct file *file1, struct file *file2,
			   struct lustre_swap_layouts *lsl)
{
	struct mdc_swap_layouts	 msl;
	struct md_op_data	*op_data;
	__u32			 gid;
	__u64			 dv;
	struct ll_swap_stack	*llss = NULL;
	int			 rc;

	OBD_ALLOC_PTR(llss);
	if (llss == NULL)
		RETURN(-ENOMEM);

	llss->inode1 = file_inode(file1);
	llss->inode2 = file_inode(file2);

	rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
	if (rc < 0)
		GOTO(free, rc);

	/* we use 2 bool because it is easier to swap than 2 bits */
	if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
		llss->check_dv1 = true;

	if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
		llss->check_dv2 = true;

	/* we cannot use lsl->sl_dvX directly because we may swap them */
	llss->dv1 = lsl->sl_dv1;
	llss->dv2 = lsl->sl_dv2;

	rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
	if (rc == 0) /* same file, done! */
		GOTO(free, rc);

	if (rc < 0) { /* sequentialize it */
		swap(llss->inode1, llss->inode2);
		swap(file1, file2);
		swap(llss->dv1, llss->dv2);
		swap(llss->check_dv1, llss->check_dv2);
	}

	gid = lsl->sl_gid;
	if (gid != 0) { /* application asks to flush dirty cache */
		rc = ll_get_grouplock(llss->inode1, file1, gid);
		if (rc < 0)
			GOTO(free, rc);

		rc = ll_get_grouplock(llss->inode2, file2, gid);
		if (rc < 0) {
			ll_put_grouplock(llss->inode1, file1, gid);
			GOTO(free, rc);
		}
	}

	/* ultimate check, before swaping the layouts we check if
	 * dataversion has changed (if requested) */
	if (llss->check_dv1) {
		rc = ll_data_version(llss->inode1, &dv, 0);
		if (rc)
			GOTO(putgl, rc);
		if (dv != llss->dv1)
			GOTO(putgl, rc = -EAGAIN);
	}

	if (llss->check_dv2) {
		rc = ll_data_version(llss->inode2, &dv, 0);
		if (rc)
			GOTO(putgl, rc);
		if (dv != llss->dv2)
			GOTO(putgl, rc = -EAGAIN);
	}

	/* struct md_op_data is used to send the swap args to the mdt
	 * only flags is missing, so we use struct mdc_swap_layouts
	 * through the md_op_data->op_data */
	/* flags from user space have to be converted before they are send to
	 * server, no flag is sent today, they are only used on the client */
	msl.msl_flags = 0;
	rc = -ENOMEM;
	op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
				     0, LUSTRE_OPC_ANY, &msl);
	if (IS_ERR(op_data))
		GOTO(free, rc = PTR_ERR(op_data));

	rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
			   sizeof(*op_data), op_data, NULL);
	ll_finish_md_op_data(op_data);

putgl:
	if (gid != 0) {
		ll_put_grouplock(llss->inode2, file2, gid);
		ll_put_grouplock(llss->inode1, file1, gid);
	}

free:
	if (llss != NULL)
		OBD_FREE_PTR(llss);

	RETURN(rc);
}
/*
 * Set/clear HSM state flags on \a inode after validating the masks and
 * archive id, forwarding the request to the MDT.
 */
int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
{
	struct md_op_data *op_data;
	int rc;
	ENTRY;

	/* Detect out-of range masks */
	if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
		RETURN(-EINVAL);

	/* Non-root users are forbidden to set or clear flags which are
	 * NOT defined in HSM_USER_MASK. */
	if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
	    !cfs_capable(CFS_CAP_SYS_ADMIN))
		RETURN(-EPERM);

	/* Detect out-of range archive id */
	if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
	    (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
		RETURN(-EINVAL);

	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
				     LUSTRE_OPC_ANY, hss);
	if (IS_ERR(op_data))
		RETURN(PTR_ERR(op_data));

	rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
			   sizeof(*op_data), op_data, NULL);

	ll_finish_md_op_data(op_data);

	RETURN(rc);
}
/*
 * Import a file into HSM: mark it archived+exists+released and restore
 * the attributes (mode, owner, size, times) recorded by the copytool.
 */
static int ll_hsm_import(struct inode *inode, struct file *file,
			 struct hsm_user_import *hui)
{
	struct hsm_state_set *hss = NULL;
	struct iattr *attr = NULL;
	int rc;
	ENTRY;

	if (!S_ISREG(inode->i_mode))
		RETURN(-EINVAL);

	/* set HSM flags */
	OBD_ALLOC_PTR(hss);
	if (hss == NULL)
		GOTO(out, rc = -ENOMEM);

	hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
	hss->hss_archive_id = hui->hui_archive_id;
	hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
	rc = ll_hsm_state_set(inode, hss);
	if (rc != 0)
		GOTO(out, rc);

	OBD_ALLOC_PTR(attr);
	if (attr == NULL)
		GOTO(out, rc = -ENOMEM);

	/* only regular-file permission bits are honoured */
	attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
	attr->ia_mode |= S_IFREG;
	attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
	attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
	attr->ia_size = hui->hui_size;
	attr->ia_mtime.tv_sec = hui->hui_mtime;
	attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
	attr->ia_atime.tv_sec = hui->hui_atime;
	attr->ia_atime.tv_nsec = hui->hui_atime_ns;

	attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
			 ATTR_UID | ATTR_GID |
			 ATTR_MTIME | ATTR_MTIME_SET |
			 ATTR_ATIME | ATTR_ATIME_SET;

	inode_lock(inode);

	rc = ll_setattr_raw(file_dentry(file), attr, true);
	if (rc == -ENODATA)
		rc = 0;

	inode_unlock(inode);

out:
	if (hss != NULL)
		OBD_FREE_PTR(hss);

	if (attr != NULL)
		OBD_FREE_PTR(attr);

	RETURN(rc);
}
2434 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2436 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2437 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * LL_IOC_FUTIMES_3 handler: set atime/mtime *and ctime* explicitly
 * (unlike utimes(2), which cannot set ctime).  Admin only.
 */
static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
{
	struct inode *inode = file_inode(file);
	struct iattr ia = {
		.ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
			    ATTR_MTIME | ATTR_MTIME_SET |
			    ATTR_CTIME | ATTR_CTIME_SET,
		.ia_atime = {
			.tv_sec = lfu->lfu_atime_sec,
			.tv_nsec = lfu->lfu_atime_nsec,
		},
		.ia_mtime = {
			.tv_sec = lfu->lfu_mtime_sec,
			.tv_nsec = lfu->lfu_mtime_nsec,
		},
		.ia_ctime = {
			.tv_sec = lfu->lfu_ctime_sec,
			.tv_nsec = lfu->lfu_ctime_nsec,
		},
	};
	int rc;
	ENTRY;

	if (!capable(CAP_SYS_ADMIN))
		RETURN(-EPERM);

	if (!S_ISREG(inode->i_mode))
		RETURN(-EINVAL);

	inode_lock(inode);
	rc = ll_setattr_raw(file_dentry(file), &ia, false);
	inode_unlock(inode);

	RETURN(rc);
}
/* Translate a userspace lockahead mode into the client lock mode;
 * negative errno for unknown modes. */
static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
{
	switch (mode) {
	case MODE_READ_USER:
		return CLM_READ;
	case MODE_WRITE_USER:
		return CLM_WRITE;
	default:
		return -EINVAL;
	}
}
2488 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2490 /* Used to allow the upper layers of the client to request an LDLM lock
2491 * without doing an actual read or write.
2493 * Used for ladvise lockahead to manually request specific locks.
2495 * \param[in] file file this ladvise lock request is on
2496 * \param[in] ladvise ladvise struct describing this lock request
2498 * \retval 0 success, no detailed result available (sync requests
2499 * and requests sent to the server [not handled locally]
2500 * cannot return detailed results)
2501 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2502 * see definitions for details.
2503 * \retval negative negative errno on error
/* Used to allow the upper layers of the client to request an LDLM lock
 * without doing an actual read or write.
 *
 * Used for ladvise lockahead to manually request specific locks.
 *
 * \param[in] file	file this ladvise lock request is on
 * \param[in] ladvise	ladvise struct describing this lock request
 *
 * \retval 0		success, no detailed result available (sync requests
 *			and requests sent to the server [not handled locally]
 *			cannot return detailed results)
 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
 *					 see definitions for details.
 * \retval negative	negative errno on error
 */
int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
{
	struct lu_env *env = NULL;
	struct cl_io *io  = NULL;
	struct cl_lock *lock = NULL;
	struct cl_lock_descr *descr = NULL;
	struct dentry *dentry = file->f_path.dentry;
	struct inode *inode = dentry->d_inode;
	enum cl_lock_mode cl_mode;
	off_t start = ladvise->lla_start;
	off_t end = ladvise->lla_end;
	int result;
	__u16 refcheck;

	ENTRY;

	CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
	       "start=%llu, end=%llu\n", dentry->d_name.len,
	       dentry->d_name.name, dentry->d_inode,
	       user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
	       (__u64) end);

	cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
	if (cl_mode < 0)
		GOTO(out, result = cl_mode);

	/* Get IO environment */
	result = cl_io_get(inode, &env, &io, &refcheck);
	if (result <= 0)
		GOTO(out, result);

	result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
	if (result > 0) {
		/*
		 * nothing to do for this io. This currently happens when
		 * stripe sub-object's are not yet created.
		 */
		result = io->ci_result;
	} else if (result == 0) {
		lock = vvp_env_lock(env);
		descr = &lock->cll_descr;

		descr->cld_obj   = io->ci_obj;
		/* Convert byte offsets to pages */
		descr->cld_start = cl_index(io->ci_obj, start);
		descr->cld_end   = cl_index(io->ci_obj, end);
		descr->cld_mode  = cl_mode;
		/* CEF_MUST is used because we do not want to convert a
		 * lockahead request to a lockless lock */
		descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
				       CEF_NONBLOCK;

		if (ladvise->lla_peradvice_flags & LF_ASYNC)
			descr->cld_enq_flags |= CEF_SPECULATIVE;

		result = cl_lock_request(env, io, lock);

		/* On success, we need to release the lock */
		if (result >= 0)
			cl_lock_release(env, lock);
	}
	cl_io_fini(env, io);
	cl_env_put(env, &refcheck);

	/* -ECANCELED indicates a matching lock with a different extent
	 * was already present, and -EEXIST indicates a matching lock
	 * on exactly the same extent was already present.
	 * We convert them to positive values for userspace to make
	 * recognizing true errors easier.
	 * Note we can only return these detailed results on async requests,
	 * as sync requests look the same as i/o requests for locking. */
	if (result == -ECANCELED)
		result = LLA_RESULT_DIFFERENT;
	else if (result == -EEXIST)
		result = LLA_RESULT_SAME;

out:
	RETURN(result);
}
2584 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
/*
 * Validate a single ladvise request: check the advice value, the
 * per-advice flags, and (for most advices) that the byte range is
 * non-empty.
 */
static int ll_ladvise_sanity(struct inode *inode,
			     struct llapi_lu_ladvise *ladvise)
{
	enum lu_ladvise_type advice = ladvise->lla_advice;
	/* Note the peradvice flags is a 32 bit field, so per advice flags must
	 * be in the first 32 bits of enum ladvise_flags */
	__u32 flags = ladvise->lla_peradvice_flags;
	/* 3 lines at 80 characters per line, should be plenty */
	int rc = 0;

	if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
		rc = -EINVAL;
		CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
		       "last supported advice is %s (value '%d'): rc = %d\n",
		       ll_get_fsname(inode->i_sb, NULL, 0), advice,
		       ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
		GOTO(out, rc);
	}

	/* Per-advice checks */
	switch (advice) {
	case LU_LADVISE_LOCKNOEXPAND:
		if (flags & ~LF_LOCKNOEXPAND_MASK) {
			rc = -EINVAL;
			CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
			       "rc = %d\n",
			       ll_get_fsname(inode->i_sb, NULL, 0), flags,
			       ladvise_names[advice], rc);
			GOTO(out, rc);
		}
		break;
	case LU_LADVISE_LOCKAHEAD:
		/* Currently only READ and WRITE modes can be requested */
		if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
		    ladvise->lla_lockahead_mode == 0) {
			rc = -EINVAL;
			CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
			       "rc = %d\n",
			       ll_get_fsname(inode->i_sb, NULL, 0),
			       ladvise->lla_lockahead_mode,
			       ladvise_names[advice], rc);
			GOTO(out, rc);
		}
		/* fallthrough */
	case LU_LADVISE_WILLREAD:
	case LU_LADVISE_DONTNEED:
	default:
		/* Note fall through above - These checks apply to all advices
		 * except LOCKNOEXPAND */
		if (flags & ~LF_DEFAULT_MASK) {
			rc = -EINVAL;
			CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
			       "rc = %d\n",
			       ll_get_fsname(inode->i_sb, NULL, 0), flags,
			       ladvise_names[advice], rc);
			GOTO(out, rc);
		}
		if (ladvise->lla_start >= ladvise->lla_end) {
			rc = -EINVAL;
			CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
			       "for %s: rc = %d\n",
			       ll_get_fsname(inode->i_sb, NULL, 0),
			       ladvise->lla_start, ladvise->lla_end,
			       ladvise_names[advice], rc);
			GOTO(out, rc);
		}
		break;
	}

out:
	return rc;
}
2660 * Give file access advices
2662 * The ladvise interface is similar to Linux fadvise() system call, except it
2663 * forwards the advices directly from Lustre client to server. The server side
2664 * codes will apply appropriate read-ahead and caching techniques for the
2665 * corresponding files.
2667 * A typical workload for ladvise is e.g. a bunch of different clients are
2668 * doing small random reads of a file, so prefetching pages into OSS cache
2669 * with big linear reads before the random IO is a net benefit. Fetching
2670 * all that data into each client cache with fadvise() may not be, due to
2671 * much more data being sent to the client.
2673 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2674 struct llapi_lu_ladvise *ladvise)
2678 struct cl_ladvise_io *lio;
/* Acquire a cl environment; the advice is carried by a CIT_LADVISE io. */
2683 env = cl_env_get(&refcheck);
2685 RETURN(PTR_ERR(env));
2687 io = vvp_env_thread_io(env);
2688 io->ci_obj = ll_i2info(inode)->lli_clob;
2690 /* initialize parameters for ladvise */
2691 lio = &io->u.ci_ladvise;
2692 lio->li_start = ladvise->lla_start;
2693 lio->li_end = ladvise->lla_end;
2694 lio->li_fid = ll_inode2fid(inode);
2695 lio->li_advice = ladvise->lla_advice;
2696 lio->li_flags = flags;
2698 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2699 rc = cl_io_loop(env, io);
/* Always tear down the io and release the cl environment. */
2703 cl_io_fini(env, io);
2704 cl_env_put(env, &refcheck);
/*
 * Record the "lock no-expand" preference on this file descriptor:
 * LF_UNSET in @flags clears the flag, otherwise it is set.
 */
2708 static int ll_lock_noexpand(struct file *file, int flags)
2710 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2712 fd->ll_lock_no_expand = !(flags & LF_UNSET);
/*
 * LL_IOC_FSGETXATTR handler: copy the caller's struct fsxattr in from
 * userspace, fill in the project ID kept on the Lustre inode, and copy
 * the result back out.
 */
2717 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
2720 struct fsxattr fsxattr;
2722 if (copy_from_user(&fsxattr,
2723 (const struct fsxattr __user *)arg,
2727 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
2728 if (copy_to_user((struct fsxattr __user *)arg,
2729 &fsxattr, sizeof(fsxattr)))
/*
 * LL_IOC_FSSETXATTR handler: change the file's project ID by sending a
 * setattr (MDS_ATTR_PROJID) to the MDS. Only a CAP_SYS_ADMIN caller may
 * do this.
 */
2735 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
2739 struct md_op_data *op_data;
2740 struct ptlrpc_request *req = NULL;
2742 struct fsxattr fsxattr;
2744 /* only root could change project ID */
2745 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2748 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2749 LUSTRE_OPC_ANY, NULL);
2750 if (IS_ERR(op_data))
2751 RETURN(PTR_ERR(op_data));
2753 if (copy_from_user(&fsxattr,
2754 (const struct fsxattr __user *)arg,
2756 GOTO(out_fsxattr1, rc = -EFAULT);
/* Push the new project ID to the MDS as a setattr. */
2758 op_data->op_projid = fsxattr.fsx_projid;
2759 op_data->op_attr.ia_valid |= MDS_ATTR_PROJID;
2760 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
2762 ptlrpc_req_finished(req);
2765 ll_finish_md_op_data(op_data);
/*
 * Main ioctl dispatcher for regular Lustre files: striping control,
 * layout swap, group locks, HSM operations, leases, data version,
 * ladvise, project xattrs and various informational queries. Unhandled
 * commands fall through to obd_iocontrol() at the bottom.
 */
2772 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2774 struct inode *inode = file_inode(file);
2775 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2779 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2780 PFID(ll_inode2fid(inode)), inode, cmd);
2781 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2783 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2784 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
/* --- per-fd flag get/set --- */
2788 case LL_IOC_GETFLAGS:
2789 /* Get the current value of the file flags */
2790 return put_user(fd->fd_flags, (int __user *)arg);
2791 case LL_IOC_SETFLAGS:
2792 case LL_IOC_CLRFLAGS:
2793 /* Set or clear specific file flags */
2794 /* XXX This probably needs checks to ensure the flags are
2795 * not abused, and to handle any flag side effects.
2797 if (get_user(flags, (int __user *) arg))
2800 if (cmd == LL_IOC_SETFLAGS) {
2801 if ((flags & LL_FILE_IGNORE_LOCK) &&
2802 !(file->f_flags & O_DIRECT)) {
2803 CERROR("%s: unable to disable locking on "
2804 "non-O_DIRECT file\n", current->comm);
2808 fd->fd_flags |= flags;
2810 fd->fd_flags &= ~flags;
/* --- striping / layout operations --- */
2813 case LL_IOC_LOV_SETSTRIPE:
2814 case LL_IOC_LOV_SETSTRIPE_NEW:
2815 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
2816 case LL_IOC_LOV_SETEA:
2817 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
2818 case LL_IOC_LOV_SWAP_LAYOUTS: {
2820 struct lustre_swap_layouts lsl;
2823 if (copy_from_user(&lsl, (char __user *)arg,
2824 sizeof(struct lustre_swap_layouts)))
/* Both files must be writable for a layout swap. */
2827 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
2830 file2 = fget(lsl.sl_fd);
2834 /* O_WRONLY or O_RDWR */
2835 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
2836 GOTO(out, rc = -EPERM);
2838 intent = lsl.sl_flags & INTENT_LAYOUTS_CLOSE;
2840 struct inode *inode2;
2841 struct ll_inode_info *lli;
2842 struct obd_client_handle *och = NULL;
/* Close-on-swap: consume this fd's lease handle, if any. */
2844 lli = ll_i2info(inode);
2845 mutex_lock(&lli->lli_och_mutex);
2846 if (fd->fd_lease_och != NULL) {
2847 och = fd->fd_lease_och;
2848 fd->fd_lease_och = NULL;
2850 mutex_unlock(&lli->lli_och_mutex);
2852 GOTO(out, rc = -ENOLCK);
2853 inode2 = file_inode(file2);
2854 rc = ll_swap_layouts_close(och, inode, inode2, intent);
2856 rc = ll_swap_layouts(file, file2, &lsl);
2862 case LL_IOC_LOV_GETSTRIPE:
2863 case LL_IOC_LOV_GETSTRIPE_NEW:
2864 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
2865 case FSFILT_IOC_GETFLAGS:
2866 case FSFILT_IOC_SETFLAGS:
2867 RETURN(ll_iocontrol(inode, file, cmd, arg));
2868 case FSFILT_IOC_GETVERSION_OLD:
2869 case FSFILT_IOC_GETVERSION:
2870 RETURN(put_user(inode->i_generation, (int __user *)arg));
/* --- group locks and statfs --- */
2871 case LL_IOC_GROUP_LOCK:
2872 RETURN(ll_get_grouplock(inode, file, arg));
2873 case LL_IOC_GROUP_UNLOCK:
2874 RETURN(ll_put_grouplock(inode, file, arg));
2875 case IOC_OBD_STATFS:
2876 RETURN(ll_obd_statfs(inode, (void __user *)arg));
2878 /* We need to special case any other ioctls we want to handle,
2879 * to send them to the MDS/OST as appropriate and to properly
2880 * network encode the arg field.
2881 case FSFILT_IOC_SETVERSION_OLD:
2882 case FSFILT_IOC_SETVERSION:
2884 case LL_IOC_FLUSHCTX:
2885 RETURN(ll_flush_ctx(inode));
2886 case LL_IOC_PATH2FID: {
2887 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2888 sizeof(struct lu_fid)))
2893 case LL_IOC_GETPARENT:
2894 RETURN(ll_getparent(file, (struct getparent __user *)arg));
2896 case OBD_IOC_FID2PATH:
2897 RETURN(ll_fid2path(inode, (void __user *)arg));
2898 case LL_IOC_DATA_VERSION: {
2899 struct ioc_data_version idv;
2902 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
/* Only flush flags are honoured from userspace. */
2905 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2906 rc = ll_ioc_data_version(inode, &idv);
2909 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2915 case LL_IOC_GET_MDTIDX: {
2918 mdtidx = ll_get_mdt_idx(inode);
2922 if (put_user((int)mdtidx, (int __user *)arg))
2927 case OBD_IOC_GETDTNAME:
2928 case OBD_IOC_GETMDNAME:
2929 RETURN(ll_get_obd_name(inode, cmd, arg));
/* --- HSM (hierarchical storage management) --- */
2930 case LL_IOC_HSM_STATE_GET: {
2931 struct md_op_data *op_data;
2932 struct hsm_user_state *hus;
2939 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2940 LUSTRE_OPC_ANY, hus);
2941 if (IS_ERR(op_data)) {
2943 RETURN(PTR_ERR(op_data));
2946 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2949 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2952 ll_finish_md_op_data(op_data);
2956 case LL_IOC_HSM_STATE_SET: {
2957 struct hsm_state_set *hss;
2964 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2969 rc = ll_hsm_state_set(inode, hss);
2974 case LL_IOC_HSM_ACTION: {
2975 struct md_op_data *op_data;
2976 struct hsm_current_action *hca;
2983 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2984 LUSTRE_OPC_ANY, hca);
2985 if (IS_ERR(op_data)) {
2987 RETURN(PTR_ERR(op_data));
2990 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2993 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2996 ll_finish_md_op_data(op_data);
/* --- file leases --- */
3000 case LL_IOC_SET_LEASE: {
3001 struct ll_inode_info *lli = ll_i2info(inode);
3002 struct obd_client_handle *och = NULL;
3007 case LL_LEASE_WRLCK:
3008 if (!(file->f_mode & FMODE_WRITE))
3010 fmode = FMODE_WRITE;
3012 case LL_LEASE_RDLCK:
3013 if (!(file->f_mode & FMODE_READ))
3017 case LL_LEASE_UNLCK:
/* Release the lease held on this fd, if any. */
3018 mutex_lock(&lli->lli_och_mutex);
3019 if (fd->fd_lease_och != NULL) {
3020 och = fd->fd_lease_och;
3021 fd->fd_lease_och = NULL;
3023 mutex_unlock(&lli->lli_och_mutex);
3028 fmode = och->och_flags;
3029 rc = ll_lease_close(och, inode, &lease_broken);
3033 rc = ll_lease_och_release(inode, file);
3040 RETURN(ll_lease_type_from_fmode(fmode));
3045 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3047 /* apply for lease */
3048 och = ll_lease_open(inode, file, fmode, 0);
3050 RETURN(PTR_ERR(och));
3053 mutex_lock(&lli->lli_och_mutex);
3054 if (fd->fd_lease_och == NULL) {
3055 fd->fd_lease_och = och;
3058 mutex_unlock(&lli->lli_och_mutex);
3060 /* impossible now that only excl is supported for now */
3061 ll_lease_close(och, inode, &lease_broken);
3066 case LL_IOC_GET_LEASE: {
3067 struct ll_inode_info *lli = ll_i2info(inode);
3068 struct ldlm_lock *lock = NULL;
3071 mutex_lock(&lli->lli_och_mutex);
3072 if (fd->fd_lease_och != NULL) {
3073 struct obd_client_handle *och = fd->fd_lease_och;
3075 lock = ldlm_handle2lock(&och->och_lease_handle);
3077 lock_res_and_lock(lock);
/* Report the lease mode only while the lock is not cancelled. */
3078 if (!ldlm_is_cancel(lock))
3079 fmode = och->och_flags;
3081 unlock_res_and_lock(lock);
3082 LDLM_LOCK_PUT(lock);
3085 mutex_unlock(&lli->lli_och_mutex);
3087 RETURN(ll_lease_type_from_fmode(fmode));
3089 case LL_IOC_HSM_IMPORT: {
3090 struct hsm_user_import *hui;
3096 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3101 rc = ll_hsm_import(inode, file, hui);
3106 case LL_IOC_FUTIMES_3: {
3107 struct ll_futimes_3 lfu;
3109 if (copy_from_user(&lfu,
3110 (const struct ll_futimes_3 __user *)arg,
3114 RETURN(ll_file_futimes_3(file, &lfu));
/* --- ladvise: validate header, then apply each advice in turn --- */
3116 case LL_IOC_LADVISE: {
3117 struct llapi_ladvise_hdr *k_ladvise_hdr;
3118 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3121 int alloc_size = sizeof(*k_ladvise_hdr);
3124 u_ladvise_hdr = (void __user *)arg;
3125 OBD_ALLOC_PTR(k_ladvise_hdr);
3126 if (k_ladvise_hdr == NULL)
3129 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3130 GOTO(out_ladvise, rc = -EFAULT);
3132 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3133 k_ladvise_hdr->lah_count < 1)
3134 GOTO(out_ladvise, rc = -EINVAL);
3136 num_advise = k_ladvise_hdr->lah_count;
3137 if (num_advise >= LAH_COUNT_MAX)
3138 GOTO(out_ladvise, rc = -EFBIG);
/* Re-allocate at full size now that the advice count is known. */
3140 OBD_FREE_PTR(k_ladvise_hdr);
3141 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3142 lah_advise[num_advise]);
3143 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3144 if (k_ladvise_hdr == NULL)
3148 * TODO: submit multiple advices to one server in a single RPC
3150 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3151 GOTO(out_ladvise, rc = -EFAULT);
3153 for (i = 0; i < num_advise; i++) {
3154 struct llapi_lu_ladvise *k_ladvise =
3155 &k_ladvise_hdr->lah_advise[i];
3156 struct llapi_lu_ladvise __user *u_ladvise =
3157 &u_ladvise_hdr->lah_advise[i];
3159 rc = ll_ladvise_sanity(inode, k_ladvise);
3161 GOTO(out_ladvise, rc);
3163 switch (k_ladvise->lla_advice) {
3164 case LU_LADVISE_LOCKNOEXPAND:
3165 rc = ll_lock_noexpand(file,
3166 k_ladvise->lla_peradvice_flags);
3167 GOTO(out_ladvise, rc);
3168 case LU_LADVISE_LOCKAHEAD:
3170 rc = ll_file_lock_ahead(file, k_ladvise);
3173 GOTO(out_ladvise, rc);
3176 &u_ladvise->lla_lockahead_result))
3177 GOTO(out_ladvise, rc = -EFAULT);
3180 rc = ll_ladvise(inode, file,
3181 k_ladvise_hdr->lah_flags,
3184 GOTO(out_ladvise, rc);
3191 OBD_FREE(k_ladvise_hdr, alloc_size);
3194 case LL_IOC_FSGETXATTR:
3195 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3196 case LL_IOC_FSSETXATTR:
3197 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3199 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
/* Unknown command: pass through to the data export. */
3201 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3202 (void __user *)arg));
3206 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Clamp and apply a new file offset: reject negative offsets (unless the
 * file allows unsigned offsets) and offsets beyond @maxsize; reset
 * f_version whenever f_pos actually changes.
 */
3207 static inline loff_t
3208 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3210 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3212 if (offset > maxsize)
3215 if (offset != file->f_pos) {
3216 file->f_pos = offset;
3217 file->f_version = 0;
/*
 * Local fallback for kernels lacking generic_file_llseek_size(): handle
 * SEEK_CUR/SEEK_DATA/SEEK_HOLE against @maxsize and the supplied @eof,
 * delegating the final clamp-and-store to llseek_execute().
 */
3223 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3224 loff_t maxsize, loff_t eof)
3226 struct inode *inode = file_inode(file);
3234 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3235 * position-querying operation. Avoid rewriting the "same"
3236 * f_pos value back to the file because a concurrent read(),
3237 * write() or lseek() might have altered it
3242 * f_lock protects against read/modify/write race with other
3243 * SEEK_CURs. Note that parallel writes and reads behave
3247 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3248 inode_unlock(inode);
3252 * In the generic case the entire file is data, so as long as
3253 * offset isn't at the end of the file then the offset is data.
3260 * There is a virtual hole at the end of the file, so as long as
3261 * offset isn't i_size or larger, return i_size.
3269 return llseek_execute(file, offset, maxsize);
/*
 * llseek entry point. For SEEK_END/SEEK_HOLE/SEEK_DATA a glimpse RPC is
 * issued first so i_size is current before the generic llseek logic runs.
 */
3273 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3275 struct inode *inode = file_inode(file);
3276 loff_t retval, eof = 0;
3279 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3280 (origin == SEEK_CUR) ? file->f_pos : 0);
3281 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3282 PFID(ll_inode2fid(inode)), inode, retval, retval,
3284 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
/* Need an up-to-date size for end/hole/data-relative seeks. */
3286 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3287 retval = ll_glimpse_size(inode);
3290 eof = i_size_read(inode);
3293 retval = ll_generic_file_llseek_size(file, offset, origin,
3294 ll_file_maxbytes(inode), eof);
/*
 * flush() handler (called on close(2)): report any async writeback error
 * recorded on the inode or its cl object, but only once per fd - if this
 * fd already saw a write failure the error is not repeated.
 */
3298 static int ll_flush(struct file *file, fl_owner_t id)
3300 struct inode *inode = file_inode(file);
3301 struct ll_inode_info *lli = ll_i2info(inode);
3302 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3305 LASSERT(!S_ISDIR(inode->i_mode));
3307 /* catch async errors that were recorded back when async writeback
3308 * failed for pages in this mapping. */
3309 rc = lli->lli_async_rc;
3310 lli->lli_async_rc = 0;
3311 if (lli->lli_clob != NULL) {
3312 err = lov_read_and_clear_async_rc(lli->lli_clob);
3317 /* The application has been told write failure already.
3318 * Do not report failure again. */
3319 if (fd->fd_write_failed)
3321 return rc ? -EIO : 0;
3325 * Called to make sure a portion of file has been written out.
3326 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
3328 * Return how many pages have been written.
3330 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3331 enum cl_fsync_mode mode, int ignore_layout)
3335 struct cl_fsync_io *fio;
/* Validate the requested fsync mode before building the io. */
3340 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3341 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3344 env = cl_env_get(&refcheck);
3346 RETURN(PTR_ERR(env));
3348 io = vvp_env_thread_io(env);
3349 io->ci_obj = ll_i2info(inode)->lli_clob;
3350 io->ci_ignore_layout = ignore_layout;
3352 /* initialize parameters for sync */
3353 fio = &io->u.ci_fsync;
3354 fio->fi_start = start;
3356 fio->fi_fid = ll_inode2fid(inode);
3357 fio->fi_mode = mode;
3358 fio->fi_nr_written = 0;
3360 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3361 result = cl_io_loop(env, io);
3363 result = io->ci_result;
/* On success report the page count accumulated by the io. */
3365 result = fio->fi_nr_written;
3366 cl_io_fini(env, io);
3367 cl_env_put(env, &refcheck);
3373 * When dentry is provided (the 'else' case), file_dentry() may be
3374 * null and dentry must be used directly rather than pulled from
3375 * file_dentry() as is done otherwise.
/*
 * fsync() handler; three prototypes are supported depending on the
 * kernel API (4-arg range fsync, 2-arg, or legacy dentry-based). Flushes
 * dirty pages, collects recorded async write errors, syncs metadata via
 * md_fsync() and data via cl_sync_file_range(CL_FSYNC_ALL).
 */
3378 #ifdef HAVE_FILE_FSYNC_4ARGS
3379 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3381 struct dentry *dentry = file_dentry(file);
3383 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3384 int ll_fsync(struct file *file, int datasync)
3386 struct dentry *dentry = file_dentry(file);
3388 loff_t end = LLONG_MAX;
3390 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3393 loff_t end = LLONG_MAX;
3395 struct inode *inode = dentry->d_inode;
3396 struct ll_inode_info *lli = ll_i2info(inode);
3397 struct ptlrpc_request *req;
3401 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3402 PFID(ll_inode2fid(inode)), inode);
3403 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3405 #ifdef HAVE_FILE_FSYNC_4ARGS
3406 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3407 lock_inode = !lli->lli_inode_locked;
3411 /* fsync's caller has already called _fdata{sync,write}, we want
3412 * that IO to finish before calling the osc and mdc sync methods */
3413 rc = filemap_fdatawait(inode->i_mapping);
3416 /* catch async errors that were recorded back when async writeback
3417 * failed for pages in this mapping. */
3418 if (!S_ISDIR(inode->i_mode)) {
3419 err = lli->lli_async_rc;
3420 lli->lli_async_rc = 0;
3423 if (lli->lli_clob != NULL) {
3424 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* Sync metadata on the MDS first, then file data on the OSTs. */
3430 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3434 ptlrpc_req_finished(req);
3436 if (S_ISREG(inode->i_mode)) {
3437 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3439 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3440 if (rc == 0 && err < 0)
/* Track the outcome so ll_flush() does not double-report. */
3443 fd->fd_write_failed = true;
3445 fd->fd_write_failed = false;
3448 #ifdef HAVE_FILE_FSYNC_4ARGS
3450 inode_unlock(inode);
/*
 * flock()/fcntl() byte-range lock handler: translate the kernel
 * file_lock into an LDLM_FLOCK enqueue sent to the MDS, then mirror the
 * result into the local lock lists. F_UNLCK maps to LCK_NL; a local
 * bookkeeping failure triggers a compensating LCK_NL enqueue.
 */
3456 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3458 struct inode *inode = file_inode(file);
3459 struct ll_sb_info *sbi = ll_i2sbi(inode);
3460 struct ldlm_enqueue_info einfo = {
3461 .ei_type = LDLM_FLOCK,
3462 .ei_cb_cp = ldlm_flock_completion_ast,
3463 .ei_cbdata = file_lock,
3465 struct md_op_data *op_data;
3466 struct lustre_handle lockh = { 0 };
3467 union ldlm_policy_data flock = { { 0 } };
3468 int fl_type = file_lock->fl_type;
3474 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3475 PFID(ll_inode2fid(inode)), file_lock);
3477 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
/* BSD flock vs POSIX record lock determines owner and range. */
3479 if (file_lock->fl_flags & FL_FLOCK) {
3480 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3481 /* flocks are whole-file locks */
3482 flock.l_flock.end = OFFSET_MAX;
3483 /* For flocks owner is determined by the local file desctiptor*/
3484 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3485 } else if (file_lock->fl_flags & FL_POSIX) {
3486 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3487 flock.l_flock.start = file_lock->fl_start;
3488 flock.l_flock.end = file_lock->fl_end;
3492 flock.l_flock.pid = file_lock->fl_pid;
3494 /* Somewhat ugly workaround for svc lockd.
3495 * lockd installs custom fl_lmops->lm_compare_owner that checks
3496 * for the fl_owner to be the same (which it always is on local node
3497 * I guess between lockd processes) and then compares pid.
3498 * As such we assign pid to the owner field to make it all work,
3499 * conflict with normal locks is unlikely since pid space and
3500 * pointer space for current->files are not intersecting */
3501 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3502 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* Map the fcntl lock type to an LDLM mode. */
3506 einfo.ei_mode = LCK_PR;
3509 /* An unlock request may or may not have any relation to
3510 * existing locks so we may not be able to pass a lock handle
3511 * via a normal ldlm_lock_cancel() request. The request may even
3512 * unlock a byte range in the middle of an existing lock. In
3513 * order to process an unlock request we need all of the same
3514 * information that is given with a normal read or write record
3515 * lock request. To avoid creating another ldlm unlock (cancel)
3516 * message we'll treat a LCK_NL flock request as an unlock. */
3517 einfo.ei_mode = LCK_NL;
3520 einfo.ei_mode = LCK_PW;
3523 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* Map the fcntl command to enqueue flags (nonblock / test-only). */
3538 flags = LDLM_FL_BLOCK_NOWAIT;
3544 flags = LDLM_FL_TEST_LOCK;
3547 CERROR("unknown fcntl lock command: %d\n", cmd);
3551 /* Save the old mode so that if the mode in the lock changes we
3552 * can decrement the appropriate reader or writer refcount. */
3553 file_lock->fl_type = einfo.ei_mode;
3555 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3556 LUSTRE_OPC_ANY, NULL);
3557 if (IS_ERR(op_data))
3558 RETURN(PTR_ERR(op_data));
3560 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
3561 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
3562 flock.l_flock.pid, flags, einfo.ei_mode,
3563 flock.l_flock.start, flock.l_flock.end);
3565 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
3568 /* Restore the file lock type if not TEST lock. */
3569 if (!(flags & LDLM_FL_TEST_LOCK))
3570 file_lock->fl_type = fl_type;
/* Mirror the server's decision into the kernel's local lock lists. */
3572 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
3573 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
3574 !(flags & LDLM_FL_TEST_LOCK))
3575 rc2 = locks_lock_file_wait(file, file_lock);
3577 if ((file_lock->fl_flags & FL_FLOCK) &&
3578 (rc == 0 || file_lock->fl_type == F_UNLCK))
3579 rc2 = flock_lock_file_wait(file, file_lock);
3580 if ((file_lock->fl_flags & FL_POSIX) &&
3581 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3582 !(flags & LDLM_FL_TEST_LOCK))
3583 rc2 = posix_lock_file_wait(file, file_lock);
3584 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
/* Local bookkeeping failed: drop the server-side lock again. */
3586 if (rc2 && file_lock->fl_type != F_UNLCK) {
3587 einfo.ei_mode = LCK_NL;
3588 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
3593 ll_finish_md_op_data(op_data);
/*
 * Look up the FID of @name under @parent via an MDS getattr-by-name.
 * On success *fid is filled in, and if @inode is non-NULL an inode is
 * also instantiated from the reply.
 */
3598 int ll_get_fid_by_name(struct inode *parent, const char *name,
3599 int namelen, struct lu_fid *fid,
3600 struct inode **inode)
3602 struct md_op_data *op_data = NULL;
3603 struct mdt_body *body;
3604 struct ptlrpc_request *req;
3608 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3609 LUSTRE_OPC_ANY, NULL);
3610 if (IS_ERR(op_data))
3611 RETURN(PTR_ERR(op_data));
3613 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
3614 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3615 ll_finish_md_op_data(op_data);
3619 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3621 GOTO(out_req, rc = -EFAULT);
3623 *fid = body->mbo_fid1;
3626 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
3628 ptlrpc_req_finished(req);
/*
 * Migrate @name under @parent to MDT @mdtidx, implemented as a special
 * rename (CLI_MIGRATE / MDS_RENAME_MIGRATE) onto itself. For regular
 * files a write lease is taken and the data version recorded so the
 * server can detect concurrent modification; -EAGAIN retries the lease.
 */
3632 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3633 const char *name, int namelen)
3635 struct dentry *dchild = NULL;
3636 struct inode *child_inode = NULL;
3637 struct md_op_data *op_data;
3638 struct ptlrpc_request *request = NULL;
3639 struct obd_client_handle *och = NULL;
3641 struct mdt_body *body;
3643 __u64 data_version = 0;
3646 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3647 name, PFID(ll_inode2fid(parent)), mdtidx);
3649 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3650 0, LUSTRE_OPC_ANY, NULL);
3651 if (IS_ERR(op_data))
3652 RETURN(PTR_ERR(op_data));
3654 /* Get child FID first */
3655 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
/* Prefer the cached dentry; fall back to an MDS lookup by name. */
3658 dchild = d_lookup(file_dentry(file), &qstr);
3659 if (dchild != NULL) {
3660 if (dchild->d_inode != NULL)
3661 child_inode = igrab(dchild->d_inode);
3665 if (child_inode == NULL) {
3666 rc = ll_get_fid_by_name(parent, name, namelen,
3667 &op_data->op_fid3, &child_inode);
3672 if (child_inode == NULL)
3673 GOTO(out_free, rc = -EINVAL);
3676 * lfs migrate command needs to be blocked on the client
3677 * by checking the migrate FID against the FID of the
3680 if (child_inode == parent->i_sb->s_root->d_inode)
3681 GOTO(out_iput, rc = -EINVAL);
3683 inode_lock(child_inode);
3684 op_data->op_fid3 = *ll_inode2fid(child_inode);
3685 if (!fid_is_sane(&op_data->op_fid3)) {
3686 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
3687 ll_get_fsname(parent->i_sb, NULL, 0), name,
3688 PFID(&op_data->op_fid3));
3689 GOTO(out_unlock, rc = -EINVAL);
/* Nothing to do if the file already lives on the target MDT. */
3692 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3694 GOTO(out_unlock, rc);
3697 CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name,
3698 PFID(&op_data->op_fid3), mdtidx);
3699 GOTO(out_unlock, rc = 0);
/* Regular file: take a write lease and pin the data version. */
3702 if (S_ISREG(child_inode->i_mode)) {
3703 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
3707 GOTO(out_unlock, rc);
3710 rc = ll_data_version(child_inode, &data_version,
3713 GOTO(out_close, rc);
3715 op_data->op_handle = och->och_fh;
3716 op_data->op_data = och->och_mod;
3717 op_data->op_data_version = data_version;
3718 op_data->op_lease_handle = och->och_lease_handle;
3719 op_data->op_bias |= MDS_RENAME_MIGRATE;
3722 op_data->op_mds = mdtidx;
3723 op_data->op_cli_flags = CLI_MIGRATE;
/* The migrate itself is a rename of the entry onto itself. */
3724 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3725 namelen, name, namelen, &request);
3727 LASSERT(request != NULL);
3728 ll_update_times(request, parent);
3730 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
3731 LASSERT(body != NULL);
3733 /* If the server does release layout lock, then we cleanup
3734 * the client och here, otherwise release it in out_close: */
3736 body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
3737 obd_mod_put(och->och_mod);
3738 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
3740 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
3746 if (request != NULL) {
3747 ptlrpc_req_finished(request);
3751 /* Try again if the file layout has changed. */
3752 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
3756 if (och != NULL) /* close the file */
3757 ll_lease_close(och, child_inode, NULL);
3759 clear_nlink(child_inode);
3761 inode_unlock(child_inode);
3765 ll_finish_md_op_data(op_data);
/* flock handler used when the client mounts without flock support. */
3770 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3778 * test if some locks matching bits and l_req_mode are acquired
3779 * - bits can be in different locks
3780 * - if found clear the common lock bits in *bits
3781 * - the bits not found, are kept in *bits
3783 * \param bits [IN] searched lock bits [IN]
3784 * \param l_req_mode [IN] searched lock mode
3785 * \retval boolean, true iff all bits are found
3787 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
3789 struct lustre_handle lockh;
3790 union ldlm_policy_data policy;
/* LCK_MINMODE means "any mode": match all four standard modes. */
3791 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
3792 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
3801 fid = &ll_i2info(inode)->lli_fid;
3802 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3803 ldlm_lockname[mode]);
3805 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* Probe each requested inodebit individually against cached locks. */
3806 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3807 policy.l_inodebits.bits = *bits & (1 << i);
3808 if (policy.l_inodebits.bits == 0)
3811 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3812 &policy, mode, &lockh)) {
3813 struct ldlm_lock *lock;
3815 lock = ldlm_handle2lock(&lockh);
3818 ~(lock->l_policy_data.l_inodebits.bits);
3819 LDLM_LOCK_PUT(lock);
3821 *bits &= ~policy.l_inodebits.bits;
/*
 * Try to match an existing granted MD lock on @inode covering @bits with
 * mode @mode; on success the handle is returned through @lockh. Thin
 * wrapper around md_lock_match() with LDLM_FL_BLOCK_GRANTED.
 */
3828 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
3829 struct lustre_handle *lockh, __u64 flags,
3830 enum ldlm_mode mode)
3832 union ldlm_policy_data policy = { .l_inodebits = { bits } };
3837 fid = &ll_i2info(inode)->lli_fid;
3838 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3840 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3841 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the result of a revalidation RPC: -ENOENT on a plain
 * file/dir means "already unlinked" and is forgiven (striped dirs with a
 * bad stripe are re-validated instead); any other error is logged.
 */
3846 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3848 /* Already unlinked. Just update nlink and return success */
3849 if (rc == -ENOENT) {
3851 /* If it is striped directory, and there is bad stripe
3852 * Let's revalidate the dentry again, instead of returning
3854 if (S_ISDIR(inode->i_mode) &&
3855 ll_i2info(inode)->lli_lsm_md != NULL)
3858 /* This path cannot be hit for regular files unless in
3859 * case of obscure races, so no need to to validate
3861 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3863 } else if (rc != 0) {
3864 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3865 "%s: revalidate FID "DFID" error: rc = %d\n",
3866 ll_get_fsname(inode->i_sb, NULL, 0),
3867 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate the attributes of @dentry's inode for the given ibits.
 * If the server supports OBD_CONNECT_ATTRFID, a getattr-by-FID intent
 * lock is used; otherwise, unless a matching MD lock is already cached,
 * a plain md_getattr RPC refreshes the inode.
 */
3873 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3875 struct inode *inode = dentry->d_inode;
3876 struct ptlrpc_request *req = NULL;
3877 struct obd_export *exp;
3881 LASSERT(inode != NULL);
3883 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3884 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3886 exp = ll_i2mdexp(inode);
3888 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3889 * But under CMD case, it caused some lock issues, should be fixed
3890 * with new CMD ibits lock. See bug 12718 */
3891 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3892 struct lookup_intent oit = { .it_op = IT_GETATTR };
3893 struct md_op_data *op_data;
3895 if (ibits == MDS_INODELOCK_LOOKUP)
3896 oit.it_op = IT_LOOKUP;
3898 /* Call getattr by fid, so do not provide name at all. */
3899 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3900 dentry->d_inode, NULL, 0, 0,
3901 LUSTRE_OPC_ANY, NULL);
3902 if (IS_ERR(op_data))
3903 RETURN(PTR_ERR(op_data));
3905 rc = md_intent_lock(exp, op_data, &oit, &req,
3906 &ll_md_blocking_ast, 0);
3907 ll_finish_md_op_data(op_data);
3909 rc = ll_inode_revalidate_fini(inode, rc);
3913 rc = ll_revalidate_it_finish(req, &oit, dentry);
3915 ll_intent_release(&oit);
3919 /* Unlinked? Unhash dentry, so it is not picked up later by
3920 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3921 here to preserve get_cwd functionality on 2.6.
3923 if (!dentry->d_inode->i_nlink) {
3924 ll_lock_dcache(inode);
3925 d_lustre_invalidate(dentry, 0);
3926 ll_unlock_dcache(inode);
3929 ll_lookup_finish_locks(&oit, dentry);
/* No ATTRFID: only issue a getattr if no matching lock is cached. */
3930 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3931 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3932 u64 valid = OBD_MD_FLGETATTR;
3933 struct md_op_data *op_data;
/* Regular files also fetch the layout EA, sized to the default. */
3936 if (S_ISREG(inode->i_mode)) {
3937 rc = ll_get_default_mdsize(sbi, &ealen);
3940 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3943 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3944 0, ealen, LUSTRE_OPC_ANY,
3946 if (IS_ERR(op_data))
3947 RETURN(PTR_ERR(op_data));
3949 op_data->op_valid = valid;
3950 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3951 ll_finish_md_op_data(op_data);
3953 rc = ll_inode_revalidate_fini(inode, rc);
3957 rc = ll_prep_inode(&inode, req, NULL, NULL);
3960 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge the per-stripe metadata attributes
 * (nlink, blocks, size, and the a/m/ctime triple) fetched via
 * md_merge_attr() into the master inode.
 */
3964 static int ll_merge_md_attr(struct inode *inode)
3966 struct cl_attr attr = { 0 };
3969 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3970 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3971 &attr, ll_md_blocking_ast);
3975 set_nlink(inode, attr.cat_nlink);
3976 inode->i_blocks = attr.cat_blocks;
3977 i_size_write(inode, attr.cat_size);
3979 ll_i2info(inode)->lli_atime = attr.cat_atime;
3980 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
3981 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/*
 * Revalidate metadata, then refresh timestamps and (for regular files)
 * the size: striped directories merge per-stripe attributes, non-regular
 * inodes copy the cached times, regular files glimpse the size from the
 * OSTs unless an HSM restore is in progress.
 */
3987 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3989 struct inode *inode = dentry->d_inode;
3993 rc = __ll_inode_revalidate(dentry, ibits);
3997 /* if object isn't regular file, don't validate size */
3998 if (!S_ISREG(inode->i_mode)) {
3999 if (S_ISDIR(inode->i_mode) &&
4000 ll_i2info(inode)->lli_lsm_md != NULL) {
4001 rc = ll_merge_md_attr(inode);
4006 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
4007 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
4008 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
4010 /* In case of restore, the MDT has the right size and has
4011 * already send it back without granting the layout lock,
4012 * inode is up-to-date so glimpse is useless.
4013 * Also to glimpse we need the layout, in case of a running
4014 * restore the MDT holds the layout lock so the glimpse will
4015 * block up to the end of restore (getattr will block)
4017 if (!ll_file_test_flag(ll_i2info(inode), LLIF_FILE_RESTORING))
4018 rc = ll_glimpse_size(inode);
/*
 * Squash a device number so that both major and minor fit in 8 bits,
 * purely to satisfy old_valid_dev() in the 32-bit compat stat path.
 * The result is not a meaningful device number.
 */
4023 static inline dev_t ll_compat_encode_dev(dev_t dev)
4025 /* The compat_sys_*stat*() syscalls will fail unless the
4026 * device majors and minors are both less than 256. Note that
4027 * the value returned here will be passed through
4028 * old_encode_dev() in cp_compat_stat(). And so we are not
4029 * trying to return a valid compat (u16) device number, just
4030 * one that will pass the old_valid_dev() check. */
4032 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
/*
 * ->getattr() for Lustre files.  Two prototypes are compiled depending on
 * the kernel API (path-based "enhanced" getattr vs. the older
 * mnt/dentry form).  Revalidates LOOKUP|UPDATE inodebits with the MDS,
 * then fills *stat from the (now current) VFS inode.  For 32-bit clients
 * (ll_need_32bit_api) the inode/device numbers are squashed so the compat
 * stat syscalls do not fail.
 *
 * NOTE(review): numbering gaps hide the #else/#endif lines of the
 * prototype selection and the error-return after revalidation — confirm
 * against the full source.
 */
4035 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4036 int ll_getattr(const struct path *path, struct kstat *stat,
4037 u32 request_mask, unsigned int flags)
4040 struct dentry *de = path->dentry;
4042 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4045 struct inode *inode = de->d_inode;
4046 struct ll_sb_info *sbi = ll_i2sbi(inode);
4047 struct ll_inode_info *lli = ll_i2info(inode);
/* refresh attributes from the MDS before reporting them */
4050 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
4051 MDS_INODELOCK_LOOKUP);
4052 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
/* fault-injection hook for getattr delay testing */
4057 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4059 if (ll_need_32bit_api(sbi)) {
/* 32-bit userland: build a 32-bit ino from the FID and squash devs */
4060 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4061 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4062 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4064 stat->ino = inode->i_ino;
4065 stat->dev = inode->i_sb->s_dev;
4066 stat->rdev = inode->i_rdev;
4069 stat->mode = inode->i_mode;
4070 stat->uid = inode->i_uid;
4071 stat->gid = inode->i_gid;
4072 stat->atime = inode->i_atime;
4073 stat->mtime = inode->i_mtime;
4074 stat->ctime = inode->i_ctime;
/* prefer the tunable stat blocksize when set, else the inode's */
4075 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4077 stat->nlink = inode->i_nlink;
4078 stat->size = i_size_read(inode);
4079 stat->blocks = inode->i_blocks;
/*
 * FIEMAP ioctl backend: translate the kernel's fiemap_extent_info into a
 * contiguous struct fiemap buffer (header + extent array), run the mapping
 * via ll_do_fiemap(), and copy flags/extents back to userspace.
 *
 * Note: only the FIRST extent is copied in from userspace before the call
 * (it may carry input hints), while all mapped extents are copied out.
 *
 * NOTE(review): numbering gaps hide the declarations of rc/num_bytes, the
 * allocation-failure check after OBD_ALLOC_LARGE(), and the error check
 * after ll_do_fiemap() — confirm against the full source.
 */
4084 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4085 __u64 start, __u64 len)
4089 struct fiemap *fiemap;
4090 unsigned int extent_count = fieinfo->fi_extents_max;
/* one header plus room for every requested extent */
4092 num_bytes = sizeof(*fiemap) + (extent_count *
4093 sizeof(struct fiemap_extent));
4094 OBD_ALLOC_LARGE(fiemap, num_bytes);
4099 fiemap->fm_flags = fieinfo->fi_flags;
4100 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4101 fiemap->fm_start = start;
4102 fiemap->fm_length = len;
/* only the first extent can carry input data (e.g. continuation) */
4103 if (extent_count > 0 &&
4104 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4105 sizeof(struct fiemap_extent)) != 0)
4106 GOTO(out, rc = -EFAULT);
4108 rc = ll_do_fiemap(inode, fiemap, num_bytes);
/* propagate results and mapped extents back to the caller */
4110 fieinfo->fi_flags = fiemap->fm_flags;
4111 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4112 if (extent_count > 0 &&
4113 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4114 fiemap->fm_mapped_extents *
4115 sizeof(struct fiemap_extent)) != 0)
4116 GOTO(out, rc = -EFAULT);
4118 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * ->get_acl(): return a referenced copy of the cached POSIX ACL.
 * The cache (lli_posix_acl) is read under lli_lock; posix_acl_dup() takes
 * the reference that the VFS permission check later releases.
 *
 * NOTE(review): the 'type' parameter is not visibly consulted in the
 * lines shown (numbering gaps) — presumably only ACL_TYPE_ACCESS is
 * cached; confirm against the full source.
 */
4122 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4124 struct ll_inode_info *lli = ll_i2info(inode);
4125 struct posix_acl *acl = NULL;
4128 spin_lock(&lli->lli_lock);
4129 /* VFS' acl_permission_check->check_acl will release the refcount */
4130 acl = posix_acl_dup(lli->lli_posix_acl);
4131 spin_unlock(&lli->lli_lock);
/*
 * ->set_acl(): store a POSIX ACL as the corresponding system xattr.
 *  - ACL_TYPE_ACCESS: first fold the ACL into i_mode via
 *    posix_acl_update_mode() (may also reduce/NULL the acl);
 *  - ACL_TYPE_DEFAULT: only valid on directories.
 * The ACL is serialized with posix_acl_to_xattr() and pushed to the MDS
 * through __vfs_setxattr(); on success the local cache is updated,
 * otherwise the cached entry is dropped.
 *
 * NOTE(review): numbering gaps hide the switch statement head, several
 * break/error checks, the acl==NULL (removal) path, and the function's
 * return — confirm against the full source.
 */
4136 #ifdef HAVE_IOP_SET_ACL
4137 #ifdef CONFIG_FS_POSIX_ACL
4138 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4140 const char *name = NULL;
4147 case ACL_TYPE_ACCESS:
/* keep i_mode in sync with the access ACL's owner/group/other bits */
4149 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4153 name = XATTR_NAME_POSIX_ACL_ACCESS;
4155 case ACL_TYPE_DEFAULT:
4156 if (!S_ISDIR(inode->i_mode))
4157 GOTO(out, rc = acl ? -EACCES : 0);
4158 name = XATTR_NAME_POSIX_ACL_DEFAULT;
4161 GOTO(out, rc = -EINVAL);
4165 size = posix_acl_xattr_size(acl->a_count);
4166 value = kmalloc(size, GFP_NOFS);
4168 GOTO(out, rc = -ENOMEM);
4170 rc = posix_acl_to_xattr(&init_user_ns, acl, value, size);
4175 /* dentry is only used for *.lov attributes so it's safe to be NULL */
4176 rc = __vfs_setxattr(NULL, inode, name, value, size, XATTR_CREATE);
/* keep the local ACL cache coherent with the xattr we just set */
4181 set_cached_acl(inode, type, acl);
4183 forget_cached_acl(inode, type);
4186 #endif /* CONFIG_FS_POSIX_ACL */
4187 #endif /* HAVE_IOP_SET_ACL */
/*
 * ACL callback for generic_permission() on kernels whose 2-arg variant
 * does not look up ACLs itself.  Fetches the cached access ACL and runs
 * posix_acl_permission() against the requested mask; compiled out (or
 * trivial) without CONFIG_FS_POSIX_ACL.  Under the 4-arg API, RCU-walk
 * lookups (IPERM_FLAG_RCU) are refused since taking the ACL may block.
 *
 * NOTE(review): numbering gaps hide the return type, the acl==NULL path
 * and the return statements — confirm against the full source.
 */
4189 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4191 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4192 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4194 ll_check_acl(struct inode *inode, int mask)
4197 # ifdef CONFIG_FS_POSIX_ACL
4198 struct posix_acl *acl;
4202 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
/* cannot block during RCU path walk; bail out to ref-walk */
4203 if (flags & IPERM_FLAG_RCU)
4206 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4211 rc = posix_acl_permission(inode, acl, mask);
4212 posix_acl_release(acl);
4215 # else /* !CONFIG_FS_POSIX_ACL */
4217 # endif /* CONFIG_FS_POSIX_ACL */
4219 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * ->permission() for Lustre inodes (three prototypes, selected by kernel
 * API).  Steps:
 *  1. refuse non-blocking RCU-walk lookups (MAY_NOT_BLOCK/IPERM_FLAG_RCU)
 *     because revalidation may sleep;
 *  2. revalidate the root inode, which is not validated during lookup;
 *  3. root squash: if configured (rsi_uid != 0) and the caller is root,
 *     override creds with the squashed fsuid/fsgid and drop the
 *     filesystem capabilities in CFS_CAP_FS_MASK;
 *  4. delegate the actual check to ll_generic_permission(), then restore
 *     the original credentials.
 *
 * NOTE(review): numbering gaps hide the early-return statements, the
 * prepare_creds() failure check, put_cred()/cleanup and the final return
 * — confirm against the full source.
 */
4221 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4222 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4224 # ifdef HAVE_INODE_PERMISION_2ARGS
4225 int ll_inode_permission(struct inode *inode, int mask)
4227 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4232 struct ll_sb_info *sbi;
4233 struct root_squash_info *squash;
4234 struct cred *cred = NULL;
4235 const struct cred *old_cred = NULL;
4237 bool squash_id = false;
/* cannot sleep in RCU walk mode; force the caller to ref-walk */
4240 #ifdef MAY_NOT_BLOCK
4241 if (mask & MAY_NOT_BLOCK)
4243 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4244 if (flags & IPERM_FLAG_RCU)
4248 /* as root inode are NOT getting validated in lookup operation,
4249 * need to do it before permission check. */
4251 if (inode == inode->i_sb->s_root->d_inode) {
4252 rc = __ll_inode_revalidate(inode->i_sb->s_root,
4253 MDS_INODELOCK_LOOKUP);
4258 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4259 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4261 /* squash fsuid/fsgid if needed */
4262 sbi = ll_i2sbi(inode);
4263 squash = &sbi->ll_squash;
4264 if (unlikely(squash->rsi_uid != 0 &&
4265 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4266 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4270 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4271 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4272 squash->rsi_uid, squash->rsi_gid);
4274 /* update current process's credentials
4275 * and FS capability */
4276 cred = prepare_creds();
4280 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4281 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* strip every filesystem-related capability bit */
4282 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4283 if ((1 << cap) & CFS_CAP_FS_MASK)
4284 cap_lower(cred->cap_effective, cap);
4286 old_cred = override_creds(cred);
4289 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4290 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4291 /* restore current process's credentials and FS capability */
4293 revert_creds(old_cred);
/*
 * Default file_operations (mount option -o localflock): no .flock/.lock
 * entries, so flock locks are only locally consistent.  Read/write entry
 * points depend on whether the kernel uses the iter-based API.
 */
4300 /* -o localflock - only provides locally consistent flock locks */
4301 struct file_operations ll_file_operations = {
4302 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4303 # ifdef HAVE_SYNC_READ_WRITE
4304 .read = new_sync_read,
4305 .write = new_sync_write,
4307 .read_iter = ll_file_read_iter,
4308 .write_iter = ll_file_write_iter,
4309 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4310 .read = ll_file_read,
4311 .aio_read = ll_file_aio_read,
4312 .write = ll_file_write,
4313 .aio_write = ll_file_aio_write,
4314 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4315 .unlocked_ioctl = ll_file_ioctl,
4316 .open = ll_file_open,
4317 .release = ll_file_release,
4318 .mmap = ll_file_mmap,
4319 .llseek = ll_file_seek,
4320 .splice_read = ll_file_splice_read,
/*
 * file_operations for -o flock: identical to ll_file_operations except
 * that .flock and .lock route through ll_file_flock for cluster-wide
 * coherent file locking.
 */
4325 struct file_operations ll_file_operations_flock = {
4326 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4327 # ifdef HAVE_SYNC_READ_WRITE
4328 .read = new_sync_read,
4329 .write = new_sync_write,
4330 # endif /* HAVE_SYNC_READ_WRITE */
4331 .read_iter = ll_file_read_iter,
4332 .write_iter = ll_file_write_iter,
4333 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4334 .read = ll_file_read,
4335 .aio_read = ll_file_aio_read,
4336 .write = ll_file_write,
4337 .aio_write = ll_file_aio_write,
4338 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4339 .unlocked_ioctl = ll_file_ioctl,
4340 .open = ll_file_open,
4341 .release = ll_file_release,
4342 .mmap = ll_file_mmap,
4343 .llseek = ll_file_seek,
4344 .splice_read = ll_file_splice_read,
/* distributed (server-backed) flock/posix locking */
4347 .flock = ll_file_flock,
4348 .lock = ll_file_flock
/*
 * file_operations for -o noflock: same table again, but .flock/.lock are
 * wired to ll_file_noflock so locking calls fail (ENOSYS) instead of
 * silently being local-only.
 */
4351 /* These are for -o noflock - to return ENOSYS on flock calls */
4352 struct file_operations ll_file_operations_noflock = {
4353 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4354 # ifdef HAVE_SYNC_READ_WRITE
4355 .read = new_sync_read,
4356 .write = new_sync_write,
4357 # endif /* HAVE_SYNC_READ_WRITE */
4358 .read_iter = ll_file_read_iter,
4359 .write_iter = ll_file_write_iter,
4360 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4361 .read = ll_file_read,
4362 .aio_read = ll_file_aio_read,
4363 .write = ll_file_write,
4364 .aio_write = ll_file_aio_write,
4365 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4366 .unlocked_ioctl = ll_file_ioctl,
4367 .open = ll_file_open,
4368 .release = ll_file_release,
4369 .mmap = ll_file_mmap,
4370 .llseek = ll_file_seek,
4371 .splice_read = ll_file_splice_read,
/* explicit refusal of locking requests */
4374 .flock = ll_file_noflock,
4375 .lock = ll_file_noflock
/*
 * inode_operations for regular Lustre files.  xattr/ACL entries are
 * conditional on the kernel providing the corresponding iop hooks.
 */
4378 struct inode_operations ll_file_inode_operations = {
4379 .setattr = ll_setattr,
4380 .getattr = ll_getattr,
4381 .permission = ll_inode_permission,
4382 #ifdef HAVE_IOP_XATTR
4383 .setxattr = ll_setxattr,
4384 .getxattr = ll_getxattr,
4385 .removexattr = ll_removexattr,
4387 .listxattr = ll_listxattr,
4388 .fiemap = ll_fiemap,
4389 #ifdef HAVE_IOP_GET_ACL
4390 .get_acl = ll_get_acl,
4392 #ifdef HAVE_IOP_SET_ACL
4393 .set_acl = ll_set_acl,
/*
 * Apply a layout configuration to the inode's cl_object via cl_conf_set().
 * For OBJECT_CONF_SET the configuration carries the granted layout DLM
 * lock: only after the layout is installed is the lock allowed to match
 * (ldlm_lock_allow_match), so no reader can see a stale layout; the
 * recorded layout generation is then refreshed from the object.
 *
 * NOTE(review): numbering gaps hide the error checks after cl_conf_set()/
 * cl_object_layout_get(), the CDEBUG head at line 4437 and the return —
 * confirm against the full source.
 */
4397 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4399 struct ll_inode_info *lli = ll_i2info(inode);
4400 struct cl_object *obj = lli->lli_clob;
4409 env = cl_env_get(&refcheck);
4411 RETURN(PTR_ERR(env));
4413 rc = cl_conf_set(env, lli->lli_clob, conf);
4417 if (conf->coc_opc == OBJECT_CONF_SET) {
4418 struct ldlm_lock *lock = conf->coc_lock;
4419 struct cl_layout cl = {
4423 LASSERT(lock != NULL);
4424 LASSERT(ldlm_has_layout(lock));
4426 /* it can only be allowed to match after layout is
4427 * applied to inode otherwise false layout would be
4428 * seen. Applying layout shoud happen before dropping
4429 * the intent lock. */
4430 ldlm_lock_allow_match(lock);
4432 rc = cl_object_layout_get(env, obj, &cl);
/* record the new layout generation for ll_layout_refresh() users */
4437 DFID": layout version change: %u -> %u\n",
4438 PFID(&lli->lli_fid), ll_layout_version_get(lli),
4440 ll_layout_version_set(lli, cl.cl_layout_gen);
4444 cl_env_put(env, &refcheck);
/*
 * If the layout lock's LVB buffer was not filled at grant time (lock was
 * blocked and granted via completion AST), fetch the LOV layout from the
 * MDT with a getxattr RPC and attach it to the lock as its LVB.  Racing
 * fillers are handled under the resource lock: only the first thread's
 * buffer is installed, the loser frees its copy.
 *
 * NOTE(review): numbering gaps hide the lmm/lvbdata/lmmsize declarations,
 * several early-return/error checks and the final return — confirm
 * against the full source.
 */
4449 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
4450 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4453 struct ll_sb_info *sbi = ll_i2sbi(inode);
4454 struct ptlrpc_request *req;
4455 struct mdt_body *body;
4462 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4463 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4464 lock->l_lvb_data, lock->l_lvb_len);
/* LVB already populated — nothing to fetch */
4466 if (lock->l_lvb_data != NULL)
4469 /* if layout lock was granted right away, the layout is returned
4470 * within DLM_LVB of dlm reply; otherwise if the lock was ever
4471 * blocked and then granted via completion ast, we have to fetch
4472 * layout here. Please note that we can't use the LVB buffer in
4473 * completion AST because it doesn't have a large enough buffer */
4474 rc = ll_get_default_mdsize(sbi, &lmmsize);
4476 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4477 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
4482 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4484 GOTO(out, rc = -EPROTO);
4486 lmmsize = body->mbo_eadatasize;
4487 if (lmmsize == 0) /* empty layout */
4490 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4492 GOTO(out, rc = -EFAULT);
/* copy out of the RPC buffer; the lock may outlive the request */
4494 OBD_ALLOC_LARGE(lvbdata, lmmsize);
4495 if (lvbdata == NULL)
4496 GOTO(out, rc = -ENOMEM);
4498 memcpy(lvbdata, lmm, lmmsize);
4499 lock_res_and_lock(lock);
4500 if (unlikely(lock->l_lvb_data == NULL)) {
4501 lock->l_lvb_type = LVB_T_LAYOUT;
4502 lock->l_lvb_data = lvbdata;
4503 lock->l_lvb_len = lmmsize;
4506 unlock_res_and_lock(lock);
/* lost the race to install the LVB — drop our copy */
4509 OBD_FREE_LARGE(lvbdata, lmmsize);
4514 ptlrpc_req_finished(req);
/*
 * Apply the layout carried by a granted layout lock (handle 'lockh',
 * mode 'mode') to the inode; the lock reference is released before
 * returning.  Flow:
 *  - bind the caching lock to this inode (md_set_lock_data);
 *  - if the LVB is not ready, fetch the layout (ll_layout_fetch);
 *  - configure the cl_object with OBJECT_CONF_SET; an -EBUSY result
 *    means outstanding IO still uses the old layout, so after dropping
 *    the lock we issue OBJECT_CONF_WAIT to block until IO completes.
 *
 * NOTE(review): numbering gaps hide the ENTRY/goto labels, the checks on
 * lvb_ready and fetch failure, and the final return — confirm against
 * the full source.
 */
4519 * Apply the layout to the inode. Layout lock is held and will be released
4522 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4523 struct inode *inode)
4525 struct ll_inode_info *lli = ll_i2info(inode);
4526 struct ll_sb_info *sbi = ll_i2sbi(inode);
4527 struct ldlm_lock *lock;
4528 struct cl_object_conf conf;
4531 bool wait_layout = false;
4534 LASSERT(lustre_handle_is_used(lockh));
4536 lock = ldlm_handle2lock(lockh);
4537 LASSERT(lock != NULL);
4538 LASSERT(ldlm_has_layout(lock));
4540 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4541 PFID(&lli->lli_fid), inode);
4543 /* in case this is a caching lock and reinstate with new inode */
4544 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
4546 lock_res_and_lock(lock);
4547 lvb_ready = ldlm_is_lvb_ready(lock);
4548 unlock_res_and_lock(lock);
4550 /* checking lvb_ready is racy but this is okay. The worst case is
4551 * that multi processes may configure the file on the same time. */
4555 rc = ll_layout_fetch(inode, lock);
4559 /* for layout lock, lmm is stored in lock's lvb.
4560 * lvb_data is immutable if the lock is held so it's safe to access it
4563 * set layout to file. Unlikely this will fail as old layout was
4564 * surely eliminated */
4565 memset(&conf, 0, sizeof conf);
4566 conf.coc_opc = OBJECT_CONF_SET;
4567 conf.coc_inode = inode;
4568 conf.coc_lock = lock;
4569 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
4570 conf.u.coc_layout.lb_len = lock->l_lvb_len;
4571 rc = ll_layout_conf(inode, &conf);
4573 /* refresh layout failed, need to wait */
4574 wait_layout = rc == -EBUSY;
4577 LDLM_LOCK_PUT(lock);
4578 ldlm_lock_decref(lockh, mode);
4580 /* wait for IO to complete if it's still being used. */
4582 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
4583 ll_get_fsname(inode->i_sb, NULL, 0),
4584 PFID(&lli->lli_fid), inode);
4586 memset(&conf, 0, sizeof conf);
4587 conf.coc_opc = OBJECT_CONF_WAIT;
4588 conf.coc_inode = inode;
4589 rc = ll_layout_conf(inode, &conf);
4593 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4594 ll_get_fsname(inode->i_sb, NULL, 0),
4595 PFID(&lli->lli_fid), rc);
/*
 * Issue a layout intent RPC (IT_LAYOUT) to the MDS, carrying 'intent' as
 * opaque op_data.  WRITE/TRUNC intents request FMODE_WRITE so the server
 * can instantiate components.  Any lock returned with the intent is bound
 * to the inode (ll_set_lock_data) and the intent reference is dropped —
 * the caller re-matches the lock from cache afterwards
 * (see ll_layout_refresh()).
 *
 * NOTE(review): numbering gaps hide the rc declaration/ENTRY, the error
 * check after md_intent_lock() and the final return — confirm against
 * the full source.
 */
4601 * Issue layout intent RPC to MDS.
4602 * \param inode [in] file inode
4603 * \param intent [in] layout intent
4605 * \retval 0 on success
4606 * \retval < 0 error code
4608 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
4610 struct ll_inode_info *lli = ll_i2info(inode);
4611 struct ll_sb_info *sbi = ll_i2sbi(inode);
4612 struct md_op_data *op_data;
4613 struct lookup_intent it;
4614 struct ptlrpc_request *req;
4618 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4619 0, 0, LUSTRE_OPC_ANY, NULL);
4620 if (IS_ERR(op_data))
4621 RETURN(PTR_ERR(op_data));
/* the layout intent travels as opaque payload in op_data */
4623 op_data->op_data = intent;
4624 op_data->op_data_size = sizeof(*intent);
4626 memset(&it, 0, sizeof(it));
4627 it.it_op = IT_LAYOUT;
4628 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
4629 intent->li_opc == LAYOUT_INTENT_TRUNC)
4630 it.it_flags = FMODE_WRITE;
4632 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4633 ll_get_fsname(inode->i_sb, NULL, 0),
4634 PFID(&lli->lli_fid), inode);
4636 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
4637 &ll_md_blocking_ast, 0);
4638 if (it.it_request != NULL)
4639 ptlrpc_req_finished(it.it_request);
4640 it.it_request = NULL;
4642 ll_finish_md_op_data(op_data);
4644 /* set lock data in case this is a new lock */
4646 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4648 ll_intent_drop_lock(&it);
/*
 * Ensure the client holds an up-to-date layout for a regular file and
 * return its generation in *gen.  Fast path: if layout locks are disabled
 * or the cached generation is already valid, just report it.  Otherwise,
 * under lli_layout_mutex, try to match a cached layout DLM lock in any
 * mode and apply it (ll_layout_lock_set); on a miss, enqueue via an
 * ACCESS layout intent (ll_layout_intent) — the visible structure
 * suggests the match is then retried, though the loop/goto lines fall in
 * a numbering gap.
 *
 * The lock is NOT held on return, so the layout may be revoked at any
 * time; callers compare generations after IO to detect that.
 */
4654 * This function checks if there exists a LAYOUT lock on the client side,
4655 * or enqueues it if it doesn't have one in cache.
4657 * This function will not hold layout lock so it may be revoked any time after
4658 * this function returns. Any operations depend on layout should be redone
4661 * This function should be called before lov_io_init() to get an uptodate
4662 * layout version, the caller should save the version number and after IO
4663 * is finished, this function should be called again to verify that layout
4664 * is not changed during IO time.
4666 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4668 struct ll_inode_info *lli = ll_i2info(inode);
4669 struct ll_sb_info *sbi = ll_i2sbi(inode);
4670 struct lustre_handle lockh;
4671 struct layout_intent intent = {
4672 .li_opc = LAYOUT_INTENT_ACCESS,
4674 enum ldlm_mode mode;
/* fast path: feature disabled or generation already valid */
4678 *gen = ll_layout_version_get(lli);
4679 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
4683 LASSERT(fid_is_sane(ll_inode2fid(inode)));
4684 LASSERT(S_ISREG(inode->i_mode));
4686 /* take layout lock mutex to enqueue layout lock exclusively. */
4687 mutex_lock(&lli->lli_layout_mutex);
4690 /* mostly layout lock is caching on the local side, so try to
4691 * match it before grabbing layout lock mutex. */
4692 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4693 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
4694 if (mode != 0) { /* hit cached lock */
4695 rc = ll_layout_lock_set(&lockh, mode, inode);
/* cache miss: ask the MDS for the layout via an intent enqueue */
4701 rc = ll_layout_intent(inode, &intent);
4707 *gen = ll_layout_version_get(lli);
4708 mutex_unlock(&lli->lli_layout_mutex);
/*
 * Notify the MDS, via a LAYOUT_INTENT_WRITE intent, of the byte range
 * [start, end) an upcoming write will touch, so missing layout components
 * can be instantiated.  Thin wrapper around ll_layout_intent().
 *
 * NOTE(review): the rc declaration and the function's return fall in the
 * paste's numbering gaps.
 */
4714 * Issue layout intent RPC indicating where in a file an IO is about to write.
4716 * \param[in] inode file inode.
4717 * \param[in] start start offset of fille in bytes where an IO is about to
4719 * \param[in] end exclusive end offset in bytes of the write range.
4721 * \retval 0 on success
4722 * \retval < 0 error code
4724 int ll_layout_write_intent(struct inode *inode, __u64 start, __u64 end)
4726 struct layout_intent intent = {
4727 .li_opc = LAYOUT_INTENT_WRITE,
4728 .li_extent.e_start = start,
4729 .li_extent.e_end = end,
4734 rc = ll_layout_intent(inode, &intent);
4740 * This function send a restore request to the MDT
4742 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4744 struct hsm_user_request *hur;
4748 len = sizeof(struct hsm_user_request) +
4749 sizeof(struct hsm_user_item);
4750 OBD_ALLOC(hur, len);
4754 hur->hur_request.hr_action = HUA_RESTORE;
4755 hur->hur_request.hr_archive_id = 0;
4756 hur->hur_request.hr_flags = 0;
4757 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4758 sizeof(hur->hur_user_item[0].hui_fid));
4759 hur->hur_user_item[0].hui_extent.offset = offset;
4760 hur->hur_user_item[0].hui_extent.length = length;
4761 hur->hur_request.hr_itemcount = 1;
4762 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,