4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2016, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
49 #include <uapi/linux/lustre/lustre_ioctl.h>
50 #include <lustre_swab.h>
52 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
57 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
59 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
/* Allocate a per-open ll_file_data from its slab cache.
 * GFP_NOFS avoids recursing into the filesystem during reclaim.
 * NOTE(review): NULL-check of the allocation is in lines elided from
 * this view — confirm against the full file. */
62 static struct ll_file_data *ll_file_data_get(void)
64 	struct ll_file_data *fd;
66 	OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
	/* start each open with a clean write-failure state */
70 	fd->fd_write_failed = false;
/* Return a ll_file_data to the slab cache (counterpart of
 * ll_file_data_get()). */
75 static void ll_file_data_put(struct ll_file_data *fd)
78 	OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
82 * Packs all the attributes into @op_data for the CLOSE rpc.
/* Pack the inode's current attributes (mode, times, size, blocks,
 * flags) plus the open handle into @op_data for the CLOSE RPC. */
84 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
85 struct obd_client_handle *och)
89 ll_prep_md_op_data(op_data, inode, NULL, NULL,
90 0, 0, LUSTRE_OPC_ANY, NULL);
/* snapshot client-side attributes so the MDT sees the final state */
92 op_data->op_attr.ia_mode = inode->i_mode;
93 op_data->op_attr.ia_atime = inode->i_atime;
94 op_data->op_attr.ia_mtime = inode->i_mtime;
95 op_data->op_attr.ia_ctime = inode->i_ctime;
96 op_data->op_attr.ia_size = i_size_read(inode);
97 op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
98 ATTR_MTIME | ATTR_MTIME_SET |
99 ATTR_CTIME | ATTR_CTIME_SET;
100 op_data->op_attr_blocks = inode->i_blocks;
101 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
102 op_data->op_handle = och->och_fh;
104 if (och->och_flags & FMODE_WRITE &&
105 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
106 /* For HSM: if inode data has been modified, pack it so that
107 * MDT can set data dirty flag in the archive. */
108 op_data->op_bias |= MDS_DATA_MODIFIED;
114 * Perform a close, possibly with a bias.
115 * The meaning of "data" depends on the value of "bias".
117 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
118 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
/* Send the CLOSE RPC for @och, optionally with an intent @bias
 * (HSM release, layout swap/merge).  The meaning of @data depends on
 * @bias — see the block comment above this function in the full file. */
121 static int ll_close_inode_openhandle(struct inode *inode,
122 struct obd_client_handle *och,
123 enum mds_op_bias bias, void *data)
125 struct obd_export *md_exp = ll_i2mdexp(inode);
126 const struct ll_inode_info *lli = ll_i2info(inode);
127 struct md_op_data *op_data;
128 struct ptlrpc_request *req = NULL;
/* sanity: MDC export must still be connected to an obd device */
132 if (class_exp2obd(md_exp) == NULL) {
133 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
134 ll_get_fsname(inode->i_sb, NULL, 0),
135 PFID(&lli->lli_fid));
139 OBD_ALLOC_PTR(op_data);
140 /* We leak openhandle and request here on error, but not much to be
141 * done in OOM case since app won't retry close on error either. */
143 GOTO(out, rc = -ENOMEM);
145 ll_prepare_close(inode, op_data, och);
147 case MDS_CLOSE_LAYOUT_MERGE:
148 /* merge blocks from the victim inode */
149 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
150 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* fallthrough: MERGE shares the SWAP packing below */
151 case MDS_CLOSE_LAYOUT_SWAP:
152 LASSERT(data != NULL);
153 op_data->op_bias |= bias;
154 op_data->op_data_version = 0;
155 op_data->op_lease_handle = och->och_lease_handle;
/* fid2 identifies the second inode involved in the swap/merge */
156 op_data->op_fid2 = *ll_inode2fid(data);
159 case MDS_HSM_RELEASE:
160 LASSERT(data != NULL);
161 op_data->op_bias |= MDS_HSM_RELEASE;
/* @data carries the data version captured before release */
162 op_data->op_data_version = *(__u64 *)data;
163 op_data->op_lease_handle = och->och_lease_handle;
164 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
168 LASSERT(data == NULL);
172 rc = md_close(md_exp, op_data, och->och_mod, &req);
/* -EINTR close failures are expected and not worth a console error */
173 if (rc != 0 && rc != -EINTR)
174 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
175 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
/* on success, check whether the MDT actually executed the intent */
177 if (rc == 0 && op_data->op_bias & bias) {
178 struct mdt_body *body;
180 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
181 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
185 ll_finish_md_op_data(op_data);
189 md_clear_open_replay_data(md_exp, och);
/* poison the handle so stale users are detectable */
190 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
193 ptlrpc_req_finished(req); /* This is close request */
/* Drop one MDS open handle of the given @fmode class (write/exec/read)
 * and, when no users remain, actually close it on the MDT. */
197 int ll_md_real_close(struct inode *inode, fmode_t fmode)
199 struct ll_inode_info *lli = ll_i2info(inode);
200 struct obd_client_handle **och_p;
201 struct obd_client_handle *och;
/* pick the per-mode open handle and its use counter */
206 if (fmode & FMODE_WRITE) {
207 och_p = &lli->lli_mds_write_och;
208 och_usecount = &lli->lli_open_fd_write_count;
209 } else if (fmode & FMODE_EXEC) {
210 och_p = &lli->lli_mds_exec_och;
211 och_usecount = &lli->lli_open_fd_exec_count;
213 LASSERT(fmode & FMODE_READ);
214 och_p = &lli->lli_mds_read_och;
215 och_usecount = &lli->lli_open_fd_read_count;
218 mutex_lock(&lli->lli_och_mutex);
219 if (*och_usecount > 0) {
220 /* There are still users of this handle, so skip
222 mutex_unlock(&lli->lli_och_mutex);
228 mutex_unlock(&lli->lli_och_mutex);
231 /* There might be a race and this handle may already
233 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/* Per-struct-file close: release group lock/lease if held, drop this
 * fd's contribution to the per-mode open counts, and talk to the MDS
 * only when no matching OPEN lock is cached locally. */
239 static int ll_md_close(struct inode *inode, struct file *file)
241 union ldlm_policy_data policy = {
242 .l_inodebits = { MDS_INODELOCK_OPEN },
/* TEST_LOCK: probe for a cached granted OPEN lock without taking it */
244 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
245 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
246 struct ll_inode_info *lli = ll_i2info(inode);
247 struct lustre_handle lockh;
248 enum ldlm_mode lockmode;
252 /* clear group lock, if present */
253 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
254 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
256 if (fd->fd_lease_och != NULL) {
259 /* Usually the lease is not released when the
260 * application crashed, we need to release here. */
261 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
262 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
263 PFID(&lli->lli_fid), rc, lease_broken);
265 fd->fd_lease_och = NULL;
/* fd_och is set when this fd took ownership of the MDS open handle
 * for a lease; close it directly */
268 if (fd->fd_och != NULL) {
269 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
274 /* Let's see if we have good enough OPEN lock on the file and if
275 we can skip talking to MDS */
276 mutex_lock(&lli->lli_och_mutex);
277 if (fd->fd_omode & FMODE_WRITE) {
279 LASSERT(lli->lli_open_fd_write_count);
280 lli->lli_open_fd_write_count--;
281 } else if (fd->fd_omode & FMODE_EXEC) {
283 LASSERT(lli->lli_open_fd_exec_count);
284 lli->lli_open_fd_exec_count--;
287 LASSERT(lli->lli_open_fd_read_count);
288 lli->lli_open_fd_read_count--;
290 mutex_unlock(&lli->lli_och_mutex);
/* no cached OPEN ibits lock -> must do the real close on the MDS */
292 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
293 LDLM_IBITS, &policy, lockmode, &lockh))
294 rc = ll_md_real_close(inode, fd->fd_omode);
297 LUSTRE_FPRIVATE(file) = NULL;
298 ll_file_data_put(fd);
303 /* While this returns an error code, fput() the caller does not, so we need
304 * to make every effort to clean up all of our state here. Also, applications
305 * rarely check close errors and even if an error is returned they will not
306 * re-try the close call.
/* VFS ->release() hook: final cleanup for a struct file.  Must clean
 * up all client state even on error, since close errors are rarely
 * checked by applications (see comment above in the full file). */
308 int ll_file_release(struct inode *inode, struct file *file)
310 struct ll_file_data *fd;
311 struct ll_sb_info *sbi = ll_i2sbi(inode);
312 struct ll_inode_info *lli = ll_i2info(inode);
316 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
317 PFID(ll_inode2fid(inode)), inode);
/* do not count releases of the root dentry in the stats */
319 if (inode->i_sb->s_root != file_dentry(file))
320 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
321 fd = LUSTRE_FPRIVATE(file);
324 /* The last ref on @file, maybe not the owner pid of statahead,
325 * because parent and child process can share the same file handle. */
326 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
327 ll_deauthorize_statahead(inode, fd);
/* root dentry: no MDS close needed, just free the fd */
329 if (inode->i_sb->s_root == file_dentry(file)) {
330 LUSTRE_FPRIVATE(file) = NULL;
331 ll_file_data_put(fd);
335 if (!S_ISDIR(inode->i_mode)) {
/* propagate deferred async write errors into this close's rc */
336 if (lli->lli_clob != NULL)
337 lov_read_and_clear_async_rc(lli->lli_clob);
338 lli->lli_async_rc = 0;
341 rc = ll_md_close(inode, file);
343 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
344 libcfs_debug_dumplog();
/* Enqueue an open intent on the MDS for dentry @de, then finish local
 * setup (inode attributes, lock data) from the reply.  @lmm/@lmmsize
 * optionally carry striping to pack into the request. */
349 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
350 struct lookup_intent *itp)
352 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
353 struct dentry *parent = de->d_parent;
354 const char *name = NULL;
356 struct md_op_data *op_data;
357 struct ptlrpc_request *req = NULL;
361 LASSERT(parent != NULL);
362 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
364 /* if server supports open-by-fid, or file name is invalid, don't pack
365 * name in open request */
366 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
367 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
368 name = de->d_name.name;
369 len = de->d_name.len;
372 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
373 name, len, 0, LUSTRE_OPC_ANY, NULL);
375 RETURN(PTR_ERR(op_data));
376 op_data->op_data = lmm;
377 op_data->op_data_size = lmmsize;
379 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
380 &ll_md_blocking_ast, 0);
381 ll_finish_md_op_data(op_data);
383 /* reason for keep own exit path - don't flood log
384 * with messages with -ESTALE errors.
/* if the open didn't succeed, drop the openhandle we may have got */
386 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
387 it_open_error(DISP_OPEN_OPEN, itp))
389 ll_release_openhandle(de, itp);
393 if (it_disposition(itp, DISP_LOOKUP_NEG))
394 GOTO(out, rc = -ENOENT);
396 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
397 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
398 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* refresh the inode from the MDT reply and attach lock data */
402 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
403 if (!rc && itp->it_lock_mode)
404 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
407 ptlrpc_req_finished(req);
408 ll_intent_drop_lock(itp);
410 /* We did open by fid, but by the time we got to the server,
411 * the object disappeared. If this is a create, we cannot really
412 * tell the userspace that the file it was trying to create
413 * does not exist. Instead let's return -ESTALE, and the VFS will
414 * retry the create with LOOKUP_REVAL that we are going to catch
415 * in ll_revalidate_dentry() and use lookup then.
417 if (rc == -ENOENT && itp->it_op & IT_CREAT)
/* Initialize an obd_client_handle from the MDT reply carried by the
 * intent @it, and register it for open replay. */
423 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
424 struct obd_client_handle *och)
426 struct mdt_body *body;
428 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
429 och->och_fh = body->mbo_handle;
430 och->och_fid = body->mbo_fid1;
431 och->och_lease_handle.cookie = it->it_lock_handle;
432 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
433 och->och_flags = it->it_flags;
/* remember open parameters so the open can be replayed after recovery */
435 return md_set_open_replay_data(md_exp, och, it);
/* Client-local part of open: fill @och (if given) from the intent and
 * attach the per-open @fd to the struct file. */
438 static int ll_local_open(struct file *file, struct lookup_intent *it,
439 struct ll_file_data *fd, struct obd_client_handle *och)
441 struct inode *inode = file_inode(file);
444 LASSERT(!LUSTRE_FPRIVATE(file));
451 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
456 LUSTRE_FPRIVATE(file) = fd;
457 ll_readahead_init(inode, &fd->fd_ras);
/* keep only the access-mode bits this fd was opened with */
458 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
460 /* ll_cl_context initialize */
461 rwlock_init(&fd->fd_lock);
462 INIT_LIST_HEAD(&fd->fd_lccs);
467 /* Open a file, and (for the very first open) create objects on the OSTs at
468 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
469 * creation or open until ll_lov_setstripe() ioctl is called.
471 * If we already have the stripe MD locally then we don't request it in
472 * md_open(), by passing a lmm_size = 0.
474 * It is up to the application to ensure no other processes open this file
475 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
476 * used. We might be able to avoid races of that sort by getting lli_open_sem
477 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
478 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/* VFS ->open() hook.  Either reuses an intent prepared at lookup time
 * (it->it_disposition set) or builds a fresh IT_OPEN intent, then
 * installs/reuses the per-mode MDS open handle under lli_och_mutex. */
480 int ll_file_open(struct inode *inode, struct file *file)
482 struct ll_inode_info *lli = ll_i2info(inode);
483 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
484 .it_flags = file->f_flags };
485 struct obd_client_handle **och_p = NULL;
486 __u64 *och_usecount = NULL;
487 struct ll_file_data *fd;
491 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
492 PFID(ll_inode2fid(inode)), inode, file->f_flags);
494 it = file->private_data; /* XXX: compat macro */
495 file->private_data = NULL; /* prevent ll_local_open assertion */
497 fd = ll_file_data_get();
499 GOTO(out_openerr, rc = -ENOMEM);
502 if (S_ISDIR(inode->i_mode))
503 ll_authorize_statahead(inode, fd);
/* opening the root dentry needs no MDS open handle */
505 if (inode->i_sb->s_root == file_dentry(file)) {
506 LUSTRE_FPRIVATE(file) = fd;
/* no prepared intent: build one from f_flags */
510 if (!it || !it->it_disposition) {
511 /* Convert f_flags into access mode. We cannot use file->f_mode,
512 * because everything but O_ACCMODE mask was stripped from
/* O_ACCMODE trick: O_RDONLY/O_WRONLY/O_RDWR + 1 == FMODE_READ/WRITE bits */
514 if ((oit.it_flags + 1) & O_ACCMODE)
516 if (file->f_flags & O_TRUNC)
517 oit.it_flags |= FMODE_WRITE;
519 /* kernel only call f_op->open in dentry_open. filp_open calls
520 * dentry_open after call to open_namei that checks permissions.
521 * Only nfsd_open call dentry_open directly without checking
522 * permissions and because of that this code below is safe. */
523 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
524 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
526 /* We do not want O_EXCL here, presumably we opened the file
527 * already? XXX - NFS implications? */
528 oit.it_flags &= ~O_EXCL;
530 /* bug20584, if "it_flags" contains O_CREAT, the file will be
531 * created if necessary, then "IT_CREAT" should be set to keep
532 * consistent with it */
533 if (oit.it_flags & O_CREAT)
534 oit.it_op |= IT_CREAT;
540 /* Let's see if we have file open on MDS already. */
541 if (it->it_flags & FMODE_WRITE) {
542 och_p = &lli->lli_mds_write_och;
543 och_usecount = &lli->lli_open_fd_write_count;
544 } else if (it->it_flags & FMODE_EXEC) {
545 och_p = &lli->lli_mds_exec_och;
546 och_usecount = &lli->lli_open_fd_exec_count;
548 och_p = &lli->lli_mds_read_och;
549 och_usecount = &lli->lli_open_fd_read_count;
552 mutex_lock(&lli->lli_och_mutex);
553 if (*och_p) { /* Open handle is present */
554 if (it_disposition(it, DISP_OPEN_OPEN)) {
555 /* Well, there's extra open request that we do not need,
556 let's close it somehow. This will decref request. */
557 rc = it_open_error(DISP_OPEN_OPEN, it);
559 mutex_unlock(&lli->lli_och_mutex);
560 GOTO(out_openerr, rc);
563 ll_release_openhandle(file_dentry(file), it);
/* reuse the existing MDS handle; NULL och means "don't fill" */
567 rc = ll_local_open(file, it, fd, NULL);
570 mutex_unlock(&lli->lli_och_mutex);
571 GOTO(out_openerr, rc);
574 LASSERT(*och_usecount == 0);
575 if (!it->it_disposition) {
576 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
577 /* We cannot just request lock handle now, new ELC code
578 means that one of other OPEN locks for this file
579 could be cancelled, and since blocking ast handler
580 would attempt to grab och_mutex as well, that would
581 result in a deadlock */
582 mutex_unlock(&lli->lli_och_mutex);
584 * Normally called under two situations:
586 * 2. A race/condition on MDS resulting in no open
587 * handle to be returned from LOOKUP|OPEN request,
588 * for example if the target entry was a symlink.
590 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
591 * marked by a bit set in ll_iget_for_nfs. Clear the
592 * bit so that it's not confusing later callers.
594 * NB; when ldd is NULL, it must have come via normal
595 * lookup path only, since ll_iget_for_nfs always calls
598 if (ldd && ldd->lld_nfs_dentry) {
599 ldd->lld_nfs_dentry = 0;
600 it->it_flags |= MDS_OPEN_LOCK;
604 * Always specify MDS_OPEN_BY_FID because we don't want
605 * to get file with different fid.
607 it->it_flags |= MDS_OPEN_BY_FID;
608 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
611 GOTO(out_openerr, rc);
615 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
617 GOTO(out_och_free, rc = -ENOMEM);
621 /* md_intent_lock() didn't get a request ref if there was an
622 * open error, so don't do cleanup on the request here
624 /* XXX (green): Should not we bail out on any error here, not
625 * just open error? */
626 rc = it_open_error(DISP_OPEN_OPEN, it);
628 GOTO(out_och_free, rc);
630 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
631 "inode %p: disposition %x, status %d\n", inode,
632 it_disposition(it, ~0), it->it_status);
634 rc = ll_local_open(file, it, fd, *och_p);
636 GOTO(out_och_free, rc);
638 mutex_unlock(&lli->lli_och_mutex);
641 /* Must do this outside lli_och_mutex lock to prevent deadlock where
642 different kind of OPEN lock for this same inode gets cancelled
643 by ldlm_cancel_lru */
644 if (!S_ISREG(inode->i_mode))
645 GOTO(out_och_free, rc);
647 cl_lov_delay_create_clear(&file->f_flags);
648 GOTO(out_och_free, rc);
/* error path: free a half-installed open handle */
652 if (och_p && *och_p) {
653 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
654 *och_p = NULL; /* OBD_FREE writes some magic there */
657 mutex_unlock(&lli->lli_och_mutex);
660 if (lli->lli_opendir_key == fd)
661 ll_deauthorize_statahead(inode, fd);
663 ll_file_data_put(fd);
665 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* drop the request reference taken by the successful enqueue */
668 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
669 ptlrpc_req_finished(it->it_request);
670 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* Blocking AST for lease locks: on conflict, cancel the lease lock
 * asynchronously.  (Remainder of the switch is outside this view.) */
676 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
677 struct ldlm_lock_desc *desc, void *data, int flag)
680 struct lustre_handle lockh;
684 case LDLM_CB_BLOCKING:
685 ldlm_lock2handle(lock, &lockh);
686 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
688 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
692 case LDLM_CB_CANCELING:
700 * When setting a lease on a file, we take ownership of the lli_mds_*_och
701 * and save it as fd->fd_och so as to force client to reopen the file even
702 * if it has an open lock in cache already.
/* Take ownership of the per-inode lli_mds_*_och open handle for this
 * fd (saved as fd->fd_och), so a lease forces the client to reopen the
 * file even with an OPEN lock cached.  Returns the handle's cookie in
 * @old_handle.  Fails with -EBUSY if a lease exists or the handle is
 * shared by other opens. */
704 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
705 struct lustre_handle *old_handle)
707 struct ll_inode_info *lli = ll_i2info(inode);
708 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
709 struct obd_client_handle **och_p;
714 /* Get the openhandle of the file */
715 mutex_lock(&lli->lli_och_mutex);
716 if (fd->fd_lease_och != NULL)
717 GOTO(out_unlock, rc = -EBUSY);
719 if (fd->fd_och == NULL) {
720 if (file->f_mode & FMODE_WRITE) {
721 LASSERT(lli->lli_mds_write_och != NULL);
722 och_p = &lli->lli_mds_write_och;
723 och_usecount = &lli->lli_open_fd_write_count;
725 LASSERT(lli->lli_mds_read_och != NULL);
726 och_p = &lli->lli_mds_read_och;
727 och_usecount = &lli->lli_open_fd_read_count;
/* handle shared with other opens: cannot take exclusive ownership */
730 if (*och_usecount > 1)
731 GOTO(out_unlock, rc = -EBUSY);
738 *old_handle = fd->fd_och->och_fh;
742 mutex_unlock(&lli->lli_och_mutex);
747 * Release ownership on lli_mds_*_och when putting back a file lease.
/* Give ownership of fd->fd_och back to the inode's lli_mds_*_och slot
 * when a file lease is put back; closes the redundant handle if the
 * slot was re-populated meanwhile (broken lease). */
749 static int ll_lease_och_release(struct inode *inode, struct file *file)
751 struct ll_inode_info *lli = ll_i2info(inode);
752 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
753 struct obd_client_handle **och_p;
754 struct obd_client_handle *old_och = NULL;
759 mutex_lock(&lli->lli_och_mutex);
760 if (file->f_mode & FMODE_WRITE) {
761 och_p = &lli->lli_mds_write_och;
762 och_usecount = &lli->lli_open_fd_write_count;
764 och_p = &lli->lli_mds_read_och;
765 och_usecount = &lli->lli_open_fd_read_count;
768 /* The file may have been open by another process (broken lease) so
769 * *och_p is not NULL. In this case we should simply increase usecount
772 if (*och_p != NULL) {
773 old_och = fd->fd_och;
780 mutex_unlock(&lli->lli_och_mutex);
/* close the now-redundant handle outside the mutex */
783 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
789 * Acquire a lease and open the file.
/* Acquire a lease of the given @fmode on @inode and (re)open the file
 * under it.  Returns the new obd_client_handle or an ERR_PTR. */
791 static struct obd_client_handle *
792 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
795 struct lookup_intent it = { .it_op = IT_OPEN };
796 struct ll_sb_info *sbi = ll_i2sbi(inode);
797 struct md_op_data *op_data;
798 struct ptlrpc_request *req = NULL;
799 struct lustre_handle old_handle = { 0 };
800 struct obd_client_handle *och = NULL;
/* leases are exactly read or exactly write, never both/exec */
805 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
806 RETURN(ERR_PTR(-EINVAL));
809 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
810 RETURN(ERR_PTR(-EPERM));
812 rc = ll_lease_och_acquire(inode, file, &old_handle);
819 RETURN(ERR_PTR(-ENOMEM));
821 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
822 LUSTRE_OPC_ANY, NULL);
824 GOTO(out, rc = PTR_ERR(op_data));
826 /* To tell the MDT this openhandle is from the same owner */
827 op_data->op_handle = old_handle;
829 it.it_flags = fmode | open_flags;
830 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
831 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
832 &ll_md_blocking_lease_ast,
833 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
834 * it can be cancelled which may mislead applications that the lease is
836 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
837 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
838 * doesn't deal with openhandle, so normal openhandle will be leaked. */
839 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
840 ll_finish_md_op_data(op_data);
841 ptlrpc_req_finished(req);
843 GOTO(out_release_it, rc);
845 if (it_disposition(&it, DISP_LOOKUP_NEG))
846 GOTO(out_release_it, rc = -ENOENT);
848 rc = it_open_error(DISP_OPEN_OPEN, &it);
850 GOTO(out_release_it, rc);
852 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
853 ll_och_fill(sbi->ll_md_exp, &it, och);
855 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
856 GOTO(out_close, rc = -EOPNOTSUPP);
858 /* already get lease, handle lease lock */
859 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
860 if (it.it_lock_mode == 0 ||
861 it.it_lock_bits != MDS_INODELOCK_OPEN) {
862 /* open lock must return for lease */
863 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
864 PFID(ll_inode2fid(inode)), it.it_lock_mode,
866 GOTO(out_close, rc = -EPROTO);
869 ll_intent_release(&it);
873 /* Cancel open lock */
874 if (it.it_lock_mode != 0) {
875 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
878 och->och_lease_handle.cookie = 0ULL;
880 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
882 CERROR("%s: error closing file "DFID": %d\n",
883 ll_get_fsname(inode->i_sb, NULL, 0),
884 PFID(&ll_i2info(inode)->lli_fid), rc2);
885 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
887 ll_intent_release(&it);
895 * Check whether a layout swap can be done between two inodes.
897 * \param[in] inode1 First inode to check
898 * \param[in] inode2 Second inode to check
900 * \retval 0 on success, layout swap can be performed between both inodes
901 * \retval negative error code if requirements are not met
/* Validate that a layout swap is permitted between two inodes: both
 * must be regular files on the same superblock and writable by the
 * caller.  Returns 0 when the swap may proceed. */
903 static int ll_check_swap_layouts_validity(struct inode *inode1,
904 struct inode *inode2)
906 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
909 if (inode_permission(inode1, MAY_WRITE) ||
910 inode_permission(inode2, MAY_WRITE))
913 if (inode1->i_sb != inode2->i_sb)
/* Close @och with a layout swap/merge bias so the MDT atomically
 * exchanges (or merges) layouts between @inode and @inode2. */
919 static int ll_swap_layouts_close(struct obd_client_handle *och,
920 struct inode *inode, struct inode *inode2,
923 const struct lu_fid *fid1 = ll_inode2fid(inode);
924 const struct lu_fid *fid2;
925 enum mds_op_bias bias;
929 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
930 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
932 rc = ll_check_swap_layouts_validity(inode, inode2);
934 GOTO(out_free_och, rc);
936 /* We now know that inode2 is a lustre inode */
937 fid2 = ll_inode2fid(inode2);
/* swapping a file with itself makes no sense */
939 rc = lu_fid_cmp(fid1, fid2);
941 GOTO(out_free_och, rc = -EINVAL);
944 case SWAP_LAYOUTS_CLOSE:
945 bias = MDS_CLOSE_LAYOUT_SWAP;
947 case MERGE_LAYOUTS_CLOSE:
948 bias = MDS_CLOSE_LAYOUT_MERGE;
951 GOTO(out_free_och, rc = -EOPNOTSUPP);
954 /* Close the file and {swap,merge} layouts between inode & inode2.
955 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
956 * because we still need it to pack l_remote_handle to MDT. */
957 rc = ll_close_inode_openhandle(inode, och, bias, inode2);
959 och = NULL; /* freed in ll_close_inode_openhandle() */
969 * Release lease and close the file.
970 * It will check if the lease has ever broken.
/* Release a lease and close the file; reports through @lease_broken
 * whether the lease lock had already been cancelled (broken). */
972 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
975 struct ldlm_lock *lock;
976 bool cancelled = true;
980 lock = ldlm_handle2lock(&och->och_lease_handle);
982 lock_res_and_lock(lock);
983 cancelled = ldlm_is_cancel(lock);
984 unlock_res_and_lock(lock);
988 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
989 PFID(&ll_i2info(inode)->lli_fid), cancelled);
/* lease still intact: cancel the lock ourselves before closing */
992 ldlm_cli_cancel(&och->och_lease_handle, 0);
994 if (lease_broken != NULL)
995 *lease_broken = cancelled;
997 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/* Merge MDS-sourced inode attributes with OST-sourced ones (size,
 * blocks, timestamps), keeping the most recent timestamps.  Runs
 * under the inode size lock. */
1001 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1003 struct ll_inode_info *lli = ll_i2info(inode);
1004 struct cl_object *obj = lli->lli_clob;
1005 struct cl_attr *attr = vvp_env_thread_attr(env);
1013 ll_inode_size_lock(inode);
1015 /* Merge timestamps the most recently obtained from MDS with
1016 * timestamps obtained from OSTs.
1018 * Do not overwrite atime of inode because it may be refreshed
1019 * by file_accessed() function. If the read was served by cache
1020 * data, there is no RPC to be sent so that atime may not be
1021 * transferred to OSTs at all. MDT only updates atime at close time
1022 * if it's at least 'mdd.*.atime_diff' older.
1023 * All in all, the atime in Lustre does not strictly comply with
1024 * POSIX. Solving this problem needs to send an RPC to MDT for each
1025 * read, this will hurt performance. */
1026 if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
1027 LTIME_S(inode->i_atime) = lli->lli_atime;
1028 lli->lli_update_atime = 0;
1030 LTIME_S(inode->i_mtime) = lli->lli_mtime;
1031 LTIME_S(inode->i_ctime) = lli->lli_ctime;
/* snapshot current (MDS-merged) times before consulting the OSTs */
1033 atime = LTIME_S(inode->i_atime);
1034 mtime = LTIME_S(inode->i_mtime);
1035 ctime = LTIME_S(inode->i_ctime);
1037 cl_object_attr_lock(obj);
1038 rc = cl_object_attr_get(env, obj, attr);
1039 cl_object_attr_unlock(obj);
/* -ENODATA (no layout/objects yet) is not an error for the caller */
1042 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
/* keep the newer of MDS vs OST timestamps */
1044 if (atime < attr->cat_atime)
1045 atime = attr->cat_atime;
1047 if (ctime < attr->cat_ctime)
1048 ctime = attr->cat_ctime;
1050 if (mtime < attr->cat_mtime)
1051 mtime = attr->cat_mtime;
1053 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1054 PFID(&lli->lli_fid), attr->cat_size);
1056 i_size_write(inode, attr->cat_size);
1057 inode->i_blocks = attr->cat_blocks;
1059 LTIME_S(inode->i_atime) = atime;
1060 LTIME_S(inode->i_mtime) = mtime;
1061 LTIME_S(inode->i_ctime) = ctime;
1064 ll_inode_size_unlock(inode);
/* Decide whether atime updates should be suppressed for this file,
 * mirroring the kernel's file_accessed()/touch_atime() checks
 * (O_NOATIME, S_NOATIME, mount flags, nodiratime for directories). */
1069 static bool file_is_noatime(const struct file *file)
1071 const struct vfsmount *mnt = file->f_path.mnt;
1072 const struct inode *inode = file_inode((struct file *)file);
1074 /* Adapted from file_accessed() and touch_atime().*/
1075 if (file->f_flags & O_NOATIME)
1078 if (inode->i_flags & S_NOATIME)
1081 if (IS_NOATIME(inode))
1084 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1087 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1090 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1096 static int ll_file_io_ptask(struct cfs_ptask *ptask);
/* Initialize a cl_io for a read or write on @file: kiocb, locking
 * policy (CILR_*), noatime, parallel-IO eligibility and FLR non-delay
 * mode. */
1098 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1100 struct inode *inode = file_inode(file);
1101 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1103 memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter));
1104 init_sync_kiocb(&io->u.ci_rw.rw_iocb, file);
1105 io->u.ci_rw.rw_file = file;
1106 io->u.ci_rw.rw_ptask = ll_file_io_ptask;
1107 io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK);
1108 io->ci_lock_no_expand = fd->ll_lock_no_expand;
1110 if (iot == CIT_WRITE) {
1111 io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND);
1112 io->u.ci_rw.rw_sync = !!(file->f_flags & O_SYNC ||
1113 file->f_flags & O_DIRECT ||
1116 io->ci_obj = ll_i2info(inode)->lli_clob;
1117 io->ci_lockreq = CILR_MAYBE;
1118 if (ll_file_nolock(file)) {
1119 io->ci_lockreq = CILR_NEVER;
1120 io->ci_no_srvlock = 1;
1121 } else if (file->f_flags & O_APPEND) {
/* appends must be serialized cluster-wide */
1122 io->ci_lockreq = CILR_MANDATORY;
1124 io->ci_noatime = file_is_noatime(file);
1125 if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO)
1126 io->ci_pio = !io->u.ci_rw.rw_append;
1130 /* FLR: only use non-delay I/O for read as there is only one
1131 * available mirror for write. */
1132 io->ci_ndelay = !(iot == CIT_WRITE);
/* Parallel-task body for a slice of a read/write: runs one cl_io over
 * the sub-range described by the cl_io_pt carried in @ptask, updating
 * the task's result/iterator state for the parent IO. */
1135 static int ll_file_io_ptask(struct cfs_ptask *ptask)
1137 struct cl_io_pt *pt = ptask->pt_cbdata;
1138 struct file *file = pt->cip_file;
1141 loff_t pos = pt->cip_pos;
1146 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1147 file_dentry(file)->d_name.name,
1148 pt->cip_iot == CIT_READ ? "read" : "write",
1149 pos, pos + pt->cip_count);
1151 env = cl_env_get(&refcheck);
1153 RETURN(PTR_ERR(env));
1155 io = vvp_env_thread_io(env);
1156 ll_io_init(io, file, pt->cip_iot);
1157 io->u.ci_rw.rw_iter = pt->cip_iter;
1158 io->u.ci_rw.rw_iocb = pt->cip_iocb;
1159 io->ci_pio = 0; /* It's already in parallel task */
1161 rc = cl_io_rw_init(env, io, pt->cip_iot, pos,
1162 pt->cip_count - pt->cip_result);
1164 struct vvp_io *vio = vvp_env_io(env);
1166 vio->vui_io_subtype = IO_NORMAL;
1167 vio->vui_fd = LUSTRE_FPRIVATE(file);
1169 ll_cl_add(file, env, io, LCC_RW);
1170 rc = cl_io_loop(env, io);
1171 ll_cl_remove(file, env);
1173 /* cl_io_rw_init() handled IO */
1177 if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) {
/* account partial progress and advance the iov iterator/kiocb */
1183 if (io->ci_nob > 0) {
1184 pt->cip_result += io->ci_nob;
1185 iov_iter_advance(&pt->cip_iter, io->ci_nob);
1187 pt->cip_iocb.ki_pos = pos;
1188 #ifdef HAVE_KIOCB_KI_LEFT
1189 pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result;
1190 #elif defined(HAVE_KI_NBYTES)
1191 pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result;
1195 cl_io_fini(env, io);
1196 cl_env_put(env, &refcheck);
1198 pt->cip_need_restart = io->ci_need_restart;
1200 CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
1201 file_dentry(file)->d_name.name,
1202 pt->cip_iot == CIT_READ ? "read" : "write",
1203 pt->cip_result, rc);
/* any partial success wins over the error code */
1205 RETURN(pt->cip_result > 0 ? 0 : rc);
/*
 * Common back end for all llite read/write paths (read_iter/write_iter,
 * splice): builds a cl_io for the [pos, pos + count) range, runs the CLIO
 * loop, and retries while the lower layers request a restart
 * (io->ci_need_restart), e.g. after a layout change.
 *
 * \param env   client environment for this thread
 * \param args  per-call arguments: normal iter/iocb pair or splice pipe
 * \param file  file the IO is against
 * \param iot   CIT_READ or CIT_WRITE
 * \param ppos  in/out file position
 * \param count number of bytes requested
 *
 * \retval bytes transferred on success, negative errno on failure.
 *
 * NOTE(review): many original lines are elided in this extract (ENTRY,
 * declarations of pos/result/rc/io, loop labels, several braces); only the
 * surviving lines are annotated below.
 */
1209 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1210 struct file *file, enum cl_io_type iot,
1211 loff_t *ppos, size_t count)
1213 struct range_lock range;
1214 struct vvp_io *vio = vvp_env_io(env);
1215 struct inode *inode = file_inode(file);
1216 struct ll_inode_info *lli = ll_i2info(inode);
1217 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1222 unsigned retried = 0;
1223 bool restarted = false;
1227 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1228 file_dentry(file)->d_name.name,
1229 iot == CIT_READ ? "read" : "write", pos, pos + count);
1232 io = vvp_env_thread_io(env);
1233 ll_io_init(io, file, iot);
1234 if (args->via_io_subtype == IO_NORMAL) {
/* snapshot the caller's iterator/iocb so a restart can replay the IO */
1235 io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
1236 io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;
1238 if (args->via_io_subtype != IO_NORMAL || restarted)
1240 io->ci_ndelay_tried = retried;
1242 if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
1243 bool range_locked = false;
/* O_APPEND writes land at an unknown offset, so lock to EOF */
1245 if (file->f_flags & O_APPEND)
1246 range_lock_init(&range, 0, LUSTRE_EOF);
1248 range_lock_init(&range, pos, pos + count - 1);
1250 vio->vui_fd = LUSTRE_FPRIVATE(file);
1251 vio->vui_io_subtype = args->via_io_subtype;
1253 switch (vio->vui_io_subtype) {
1255 /* Direct IO reads must also take range lock,
1256 * or multiple reads will try to work on the same pages
1257 * See LU-6227 for details. */
1258 if (((iot == CIT_WRITE) ||
1259 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1260 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1261 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1263 rc = range_lock(&lli->lli_write_tree, &range);
1267 range_locked = true;
1271 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1272 vio->u.splice.vui_flags = args->u.splice.via_flags;
1275 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1279 ll_cl_add(file, env, io, LCC_RW);
/* non-parallel secure write: take i_rwsem here; track it in lli so the
 * unlock below only fires if this thread actually locked the inode */
1280 if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) &&
1281 !lli->lli_inode_locked) {
1283 lli->lli_inode_locked = 1;
1285 rc = cl_io_loop(env, io);
1286 if (lli->lli_inode_locked) {
1287 lli->lli_inode_locked = 0;
1288 inode_unlock(inode);
1290 ll_cl_remove(file, env);
1293 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1295 range_unlock(&lli->lli_write_tree, &range);
1298 /* cl_io_rw_init() handled IO */
/* account partial progress and advance the caller's iterator/position
 * before a possible restart */
1302 if (io->ci_nob > 0) {
1303 result += io->ci_nob;
1304 count -= io->ci_nob;
1306 if (args->via_io_subtype == IO_NORMAL) {
1307 iov_iter_advance(args->u.normal.via_iter, io->ci_nob);
1309 args->u.normal.via_iocb->ki_pos = pos;
1310 #ifdef HAVE_KIOCB_KI_LEFT
1311 args->u.normal.via_iocb->ki_left = count;
1312 #elif defined(HAVE_KI_NBYTES)
1313 args->u.normal.via_iocb->ki_nbytes = count;
1317 pos = io->u.ci_rw.rw_range.cir_pos;
1321 cl_io_fini(env, io);
1324 "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1325 file->f_path.dentry->d_name.name,
1326 iot, rc, result, io->ci_need_restart);
/* restart loop: -ENODATA from a short fast-read is also restartable */
1328 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1330 "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1331 file_dentry(file)->d_name.name,
1332 iot == CIT_READ ? "read" : "write",
1333 pos, pos + count, result, rc);
1334 /* preserve the tried count for FLR */
1335 retried = io->ci_ndelay_tried;
1340 if (iot == CIT_READ) {
1342 ll_stats_ops_tally(ll_i2sbi(inode),
1343 LPROC_LL_READ_BYTES, result);
1344 } else if (iot == CIT_WRITE) {
1346 ll_stats_ops_tally(ll_i2sbi(inode),
1347 LPROC_LL_WRITE_BYTES, result);
1348 fd->fd_write_failed = false;
1349 } else if (result == 0 && rc == 0) {
1352 fd->fd_write_failed = true;
1354 fd->fd_write_failed = false;
/* -ERESTARTSYS is a signal, not a write failure */
1355 } else if (rc != -ERESTARTSYS) {
1356 fd->fd_write_failed = true;
1360 CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n",
1361 file_dentry(file)->d_name.name,
1362 iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc);
1366 RETURN(result > 0 ? result : rc);
1370 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1371 * especially for small I/O.
1373 * To serve a read request, CLIO has to create and initialize a cl_io and
1374 * then request DLM lock. This has turned out to have siginificant overhead
1375 * and affects the performance of small I/O dramatically.
1377 * It's not necessary to create a cl_io for each I/O. Under the help of read
1378 * ahead, most of the pages being read are already in memory cache and we can
1379 * read those pages directly because if the pages exist, the corresponding DLM
1380 * lock must exist so that page content must be valid.
1382 * In fast read implementation, the llite speculatively finds and reads pages
1383 * in memory cache. There are three scenarios for fast read:
1384 * - If the page exists and is uptodate, kernel VM will provide the data and
1385 * CLIO won't be intervened;
1386 * - If the page was brought into memory by read ahead, it will be exported
1387 * and read ahead parameters will be updated;
1388 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1389 * it will go back and invoke normal read, i.e., a cl_io will be created
1390 * and DLM lock will be requested.
1392 * POSIX compliance: posix standard states that read is intended to be atomic.
1393 * Lustre read implementation is in line with Linux kernel read implementation
1394 * and neither of them complies with POSIX standard in this matter. Fast read
1395 * doesn't make the situation worse on single node but it may interleave write
1396 * results from multiple nodes due to short read handling in ll_file_aio_read().
1398 * \param env - lu_env
1399 * \param iocb - kiocb from kernel
1400 * \param iter - user space buffers where the data will be copied
1402 * \retval - number of bytes have been read, or error code if error occurred.
1405 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
/* bail out early when the superblock has fast read disabled */
1409 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1412 /* NB: we can't do direct IO for fast read because it will need a lock
1413 * to make IO engine happy. */
1414 if (iocb->ki_filp->f_flags & O_DIRECT)
/* delegate to the kernel page-cache read path; pages not in cache make
 * ll_readpage() fail the fast path with -ENODATA */
1417 result = generic_file_read_iter(iocb, iter);
1419 /* If the first page is not in cache, generic_file_aio_read() will be
1420 * returned with -ENODATA.
1421 * See corresponding code in ll_readpage(). */
1422 if (result == -ENODATA)
1426 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1427 LPROC_LL_READ_BYTES, result);
1433 * Read from a file (through the page cache).
/* .read_iter handler: try the lockless fast-read path first, then fall
 * back to the full CLIO path via ll_file_io_generic() for whatever the
 * fast read did not satisfy.  NOTE(review): lines combining the fast-read
 * result with rc2 are elided in this extract. */
1435 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1438 struct vvp_io_args *args;
1443 result = ll_do_fast_read(iocb, to);
/* fast read fully served the request (or errored) -- done */
1444 if (result < 0 || iov_iter_count(to) == 0)
1447 env = cl_env_get(&refcheck);
1449 return PTR_ERR(env);
1451 args = ll_env_args(env, IO_NORMAL);
1452 args->u.normal.via_iter = to;
1453 args->u.normal.via_iocb = iocb;
1455 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1456 &iocb->ki_pos, iov_iter_count(to));
1459 else if (result == 0)
1462 cl_env_put(env, &refcheck);
1468 * Write to a file (through the page cache).
/* .write_iter handler: thin wrapper that packages the iterator/iocb into
 * vvp_io_args and delegates to ll_file_io_generic() with CIT_WRITE. */
1470 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1472 struct vvp_io_args *args;
1477 env = cl_env_get(&refcheck);
1479 return PTR_ERR(env);
1481 args = ll_env_args(env, IO_NORMAL);
1482 args->u.normal.via_iter = from;
1483 args->u.normal.via_iocb = iocb;
1485 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1486 &iocb->ki_pos, iov_iter_count(from));
1487 cl_env_put(env, &refcheck);
1491 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1493 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/* Validate an iovec array and compute the total byte count, mirroring the
 * kernel's own segment checks: reject negative lengths / signed-overflowing
 * totals, and stop at the first inaccessible segment.
 * NOTE(review): the `continue`/truncation lines between the access_ok()
 * check and the final subtraction are elided in this extract. */
1495 static int ll_file_get_iov_count(const struct iovec *iov,
1496 unsigned long *nr_segs, size_t *count)
1501 for (seg = 0; seg < *nr_segs; seg++) {
1502 const struct iovec *iv = &iov[seg];
1505 * If any segment has a negative length, or the cumulative
1506 * length ever wraps negative then return -EINVAL.
1509 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1511 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1516 cnt -= iv->iov_len; /* This segment is no good */
/* Legacy .aio_read entry point (pre read_iter kernels): validate the iovec
 * array, wrap it in an iov_iter, and forward to ll_file_read_iter(). */
1523 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1524 unsigned long nr_segs, loff_t pos)
1531 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
/* iov_iter_init() gained a direction argument in newer kernels */
1535 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1536 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1537 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1538 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1539 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1541 result = ll_file_read_iter(iocb, &to);
/* Legacy synchronous .read entry point: wrap the user buffer in a one-entry
 * iovec plus a sync kiocb and forward to ll_file_aio_read(), propagating the
 * updated file position back through *ppos. */
1546 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1549 struct iovec iov = { .iov_base = buf, .iov_len = count };
1554 init_sync_kiocb(&kiocb, file);
1555 kiocb.ki_pos = *ppos;
1556 #ifdef HAVE_KIOCB_KI_LEFT
1557 kiocb.ki_left = count;
1558 #elif defined(HAVE_KI_NBYTES)
/* FIX: the struct kiocb field is ki_nbytes, not i_nbytes -- the old
 * spelling cannot compile when HAVE_KI_NBYTES is defined; this also
 * matches ll_file_write() below and ll_file_io_ptask() above. */
1559 kiocb.ki_nbytes = count;
1562 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1563 *ppos = kiocb.ki_pos;
1569 * Write to a file (through the page cache).
/* Legacy .aio_write entry point: validate the iovec array, build an
 * iov_iter over it, and forward to ll_file_write_iter(). */
1572 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1573 unsigned long nr_segs, loff_t pos)
1575 struct iov_iter from;
1580 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
/* iov_iter_init() gained a direction argument in newer kernels */
1584 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1585 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1586 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1587 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1588 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1590 result = ll_file_write_iter(iocb, &from);
/* Legacy synchronous .write entry point: uses the per-thread kiocb kept in
 * the lu_env info (lti_kiocb) rather than a stack kiocb, wraps the user
 * buffer in a one-entry iovec, and forwards to ll_file_aio_write(). */
1595 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1596 size_t count, loff_t *ppos)
1599 struct iovec iov = { .iov_base = (void __user *)buf,
1601 struct kiocb *kiocb;
1606 env = cl_env_get(&refcheck);
1608 RETURN(PTR_ERR(env));
1610 kiocb = &ll_env_info(env)->lti_kiocb;
1611 init_sync_kiocb(kiocb, file);
1612 kiocb->ki_pos = *ppos;
1613 #ifdef HAVE_KIOCB_KI_LEFT
1614 kiocb->ki_left = count;
1615 #elif defined(HAVE_KI_NBYTES)
1616 kiocb->ki_nbytes = count;
/* write back the position advanced by the IO */
1619 result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos);
1620 *ppos = kiocb->ki_pos;
1622 cl_env_put(env, &refcheck);
1628 * Send file content (through pagecache) somewhere with helper
/* .splice_read handler: package the pipe and flags as an IO_SPLICE subtype
 * and run the common read path via ll_file_io_generic(). */
1630 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1631 struct pipe_inode_info *pipe, size_t count,
1635 struct vvp_io_args *args;
1640 env = cl_env_get(&refcheck);
1642 RETURN(PTR_ERR(env));
1644 args = ll_env_args(env, IO_SPLICE);
1645 args->u.splice.via_pipe = pipe;
1646 args->u.splice.via_flags = flags;
1648 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1649 cl_env_put(env, &refcheck);
/* Set striping information (LOV EA) on an inode by re-opening it by FID
 * with the given lov_user_md attached, then releasing the open handle.
 * The inode size lock serializes against concurrent size/layout updates.
 * NOTE(review): declarations and intermediate error-handling lines are
 * elided in this extract. */
1653 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1654 __u64 flags, struct lov_user_md *lum, int lum_size)
1656 struct lookup_intent oit = {
/* open by FID so the MDT applies the EA to this exact inode */
1658 .it_flags = flags | MDS_OPEN_BY_FID,
1663 ll_inode_size_lock(inode);
1664 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1666 GOTO(out_unlock, rc);
/* the open was only a vehicle for the setstripe; close it right away */
1668 ll_release_openhandle(dentry, &oit);
1671 ll_inode_size_unlock(inode);
1672 ll_intent_release(&oit);
/* Fetch the LOV EA (striping metadata) for @filename under @inode from the
 * MDS via md_getattr_name(), validate the magic, and byte-swap it to host
 * endianness for userspace when needed.  On success *lmmp points into the
 * still-referenced ptlrpc reply (*request) which the caller must release.
 * NOTE(review): several lines (declarations, GOTOs, swab byte counts) are
 * elided in this extract. */
1677 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1678 struct lov_mds_md **lmmp, int *lmm_size,
1679 struct ptlrpc_request **request)
1681 struct ll_sb_info *sbi = ll_i2sbi(inode);
1682 struct mdt_body *body;
1683 struct lov_mds_md *lmm = NULL;
1684 struct ptlrpc_request *req = NULL;
1685 struct md_op_data *op_data;
/* size the getattr buffer to the MDT's default EA size */
1688 rc = ll_get_default_mdsize(sbi, &lmmsize);
1692 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1693 strlen(filename), lmmsize,
1694 LUSTRE_OPC_ANY, NULL);
1695 if (IS_ERR(op_data))
1696 RETURN(PTR_ERR(op_data));
1698 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1699 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1700 ll_finish_md_op_data(op_data);
1702 CDEBUG(D_INFO, "md_getattr_name failed "
1703 "on %s: rc %d\n", filename, rc);
1707 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1708 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1710 lmmsize = body->mbo_eadatasize;
/* no striping EA present -> nothing to return */
1712 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1714 GOTO(out, rc = -ENODATA);
1717 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1718 LASSERT(lmm != NULL);
/* accept plain v1/v3 layouts and composite (PFL) layouts only */
1720 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
1721 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
1722 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
1723 GOTO(out, rc = -EPROTO);
1726 * This is coming from the MDS, so is probably in
1727 * little endian. We convert it to host endian before
1728 * passing it to userspace.
/* only swab on big-endian hosts, where LOV_MAGIC differs from its LE form */
1730 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1733 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
1734 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1735 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1736 if (le32_to_cpu(lmm->lmm_pattern) &
1737 LOV_PATTERN_F_RELEASED)
1741 /* if function called for directory - we should
1742 * avoid swab not existent lsm objects */
1743 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1744 lustre_swab_lov_user_md_v1(
1745 (struct lov_user_md_v1 *)lmm);
1746 if (S_ISREG(body->mbo_mode))
1747 lustre_swab_lov_user_md_objects(
1748 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1750 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1751 lustre_swab_lov_user_md_v3(
1752 (struct lov_user_md_v3 *)lmm);
1753 if (S_ISREG(body->mbo_mode))
1754 lustre_swab_lov_user_md_objects(
1755 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1757 } else if (lmm->lmm_magic ==
1758 cpu_to_le32(LOV_MAGIC_COMP_V1)) {
1759 lustre_swab_lov_comp_md_v1(
1760 (struct lov_comp_md_v1 *)lmm);
1766 *lmm_size = lmmsize;
/* LL_IOC_LOV_SETEA handler (admin-only): copy a lov_user_md with one OST
 * object descriptor from userspace and apply it via
 * ll_lov_setstripe_ea_info(); MDS_OPEN_HAS_OBJS tells the MDT the objects
 * already exist. */
1771 static int ll_lov_setea(struct inode *inode, struct file *file,
1774 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1775 struct lov_user_md *lump;
1776 int lum_size = sizeof(struct lov_user_md) +
1777 sizeof(struct lov_user_ost_data);
/* setting explicit objects is a privileged operation */
1781 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1784 OBD_ALLOC_LARGE(lump, lum_size);
1788 if (copy_from_user(lump, arg, lum_size))
1789 GOTO(out_lump, rc = -EFAULT);
1791 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
/* clear O_LOV_DELAY_CREATE now that the layout has been set */
1793 cl_lov_delay_create_clear(&file->f_flags);
1796 OBD_FREE_LARGE(lump, lum_size);
/* Copy the file's striping information into the userspace buffer @lum of
 * @size bytes via cl_object_getstripe() on the inode's cl_object. */
1800 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
1807 env = cl_env_get(&refcheck);
1809 RETURN(PTR_ERR(env));
1811 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
1812 cl_env_put(env, &refcheck);
/* LL_IOC_LOV_SETSTRIPE handler: copy the user's layout request into a
 * kernel lov_user_md, apply it, then refresh the layout generation and
 * echo the resulting striping back to userspace.
 * NOTE(review): lines between the put_user() and the layout refresh
 * (error checks, lum_size handling) are elided in this extract. */
1816 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1819 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1820 struct lov_user_md *klum;
1822 __u64 flags = FMODE_WRITE;
1825 rc = ll_copy_user_md(lum, &klum);
1830 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
/* zero the user's stripe_count so the getstripe below fills it in */
1835 rc = put_user(0, &lum->lmm_stripe_count);
1839 rc = ll_layout_refresh(inode, &gen);
1843 rc = ll_file_getstripe(inode, arg, lum_size);
1845 cl_lov_delay_create_clear(&file->f_flags);
1848 OBD_FREE(klum, lum_size);
/* Acquire a Lustre group lock with group id @arg on behalf of this open
 * file.  For composite (PFL) layouts all OST objects are instantiated
 * first, since the group lock must cover every object and new ones cannot
 * be added while it is held.  The lli_lock spinlock guards the per-fd
 * grouplock state against racing lockers.
 * NOTE(review): error-return lines (RETURN(-EINVAL) etc.) between several
 * checks are elided in this extract. */
1853 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1855 struct ll_inode_info *lli = ll_i2info(inode);
1856 struct cl_object *obj = lli->lli_clob;
1857 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1858 struct ll_grouplock grouplock;
/* gid 0 is reserved as "no group lock" */
1863 CWARN("group id for group lock must not be 0\n");
1867 if (ll_file_nolock(file))
1868 RETURN(-EOPNOTSUPP);
1870 spin_lock(&lli->lli_lock);
/* only one group lock per file descriptor */
1871 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1872 CWARN("group lock already existed with gid %lu\n",
1873 fd->fd_grouplock.lg_gid);
1874 spin_unlock(&lli->lli_lock);
1877 LASSERT(fd->fd_grouplock.lg_lock == NULL);
1878 spin_unlock(&lli->lli_lock);
1881 * XXX: group lock needs to protect all OST objects while PFL
1882 * can add new OST objects during the IO, so we'd instantiate
1883 * all OST objects before getting its group lock.
1888 struct cl_layout cl = {
1889 .cl_is_composite = false,
1892 env = cl_env_get(&refcheck);
1894 RETURN(PTR_ERR(env));
1896 rc = cl_object_layout_get(env, obj, &cl);
/* composite layout: force instantiation of the whole file range */
1897 if (!rc && cl.cl_is_composite)
1898 rc = ll_layout_write_intent(inode, 0, OBD_OBJECT_EOF);
1900 cl_env_put(env, &refcheck);
1905 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
1906 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* re-check under the lock: another thread may have won the race while we
 * were enqueueing; if so drop our lock and back off */
1910 spin_lock(&lli->lli_lock);
1911 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1912 spin_unlock(&lli->lli_lock);
1913 CERROR("another thread just won the race\n");
1914 cl_put_grouplock(&grouplock);
1918 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1919 fd->fd_grouplock = grouplock;
1920 spin_unlock(&lli->lli_lock);
1922 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/* Release the group lock with group id @arg held by this file descriptor.
 * The per-fd grouplock state is detached under lli_lock first, then the
 * DLM lock is dropped outside the spinlock. */
1926 static int ll_put_grouplock(struct inode *inode, struct file *file,
1929 struct ll_inode_info *lli = ll_i2info(inode);
1930 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1931 struct ll_grouplock grouplock;
1934 spin_lock(&lli->lli_lock);
1935 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1936 spin_unlock(&lli->lli_lock);
1937 CWARN("no group lock held\n");
1941 LASSERT(fd->fd_grouplock.lg_lock != NULL);
/* the caller must release the same gid it acquired */
1943 if (fd->fd_grouplock.lg_gid != arg) {
1944 CWARN("group lock %lu doesn't match current id %lu\n",
1945 arg, fd->fd_grouplock.lg_gid);
1946 spin_unlock(&lli->lli_lock);
/* detach the state while still under the spinlock... */
1950 grouplock = fd->fd_grouplock;
1951 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1952 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1953 spin_unlock(&lli->lli_lock);
/* ...and drop the DLM lock outside it */
1955 cl_put_grouplock(&grouplock);
1956 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1961 * Close inode open handle
1963 * \param dentry [in] dentry which contains the inode
1964 * \param it [in,out] intent which contains open info and result
1967 * \retval <0 failure
1969 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1971 struct inode *inode = dentry->d_inode;
1972 struct obd_client_handle *och;
1978 /* Root ? Do nothing. */
1979 if (dentry->d_inode->i_sb->s_root == dentry)
1982 /* No open handle to close? Move away */
1983 if (!it_disposition(it, DISP_OPEN_OPEN))
1986 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1988 OBD_ALLOC(och, sizeof(*och));
1990 GOTO(out, rc = -ENOMEM);
/* fill the client handle from the intent's open reply, then close it;
 * ll_close_inode_openhandle() consumes och */
1992 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1994 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
1996 /* this one is in place of ll_file_open */
1997 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1998 ptlrpc_req_finished(it->it_request);
1999 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2005 * Get size for inode for which FIEMAP mapping is requested.
2006 * Make the FIEMAP get_info call and returns the result.
2007 * \param fiemap kernel buffer to hold extens
2008 * \param num_bytes kernel buffer size
2010 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2016 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2019 /* Checks for fiemap flags */
/* strip unsupported flags and report them back to the caller */
2020 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2021 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2025 /* Check for FIEMAP_FLAG_SYNC */
2026 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2027 rc = filemap_fdatawrite(inode->i_mapping);
2032 env = cl_env_get(&refcheck);
2034 RETURN(PTR_ERR(env));
/* a cached size of 0 may be stale -- glimpse the OSTs for the real size */
2036 if (i_size_read(inode) == 0) {
2037 rc = ll_glimpse_size(inode);
2042 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2043 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2044 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2046 /* If filesize is 0, then there would be no objects for mapping */
2047 if (fmkey.lfik_oa.o_size == 0) {
2048 fiemap->fm_mapped_extents = 0;
2052 fmkey.lfik_fiemap = *fiemap;
2054 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2055 &fmkey, fiemap, &num_bytes);
2057 cl_env_put(env, &refcheck);
/* LL_IOC_FID2PATH handler: resolve a FID to a pathname by forwarding the
 * getinfo_fid2path request to the MDC via obd_iocontrol().  The output
 * buffer is sized from the user-supplied gf_pathlen. */
2061 int ll_fid2path(struct inode *inode, void __user *arg)
2063 struct obd_export *exp = ll_i2mdexp(inode);
2064 const struct getinfo_fid2path __user *gfin = arg;
2066 struct getinfo_fid2path *gfout;
/* fid2path reveals namespace information; restrict it unless the mount
 * explicitly allows unprivileged use */
2072 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2073 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2076 /* Only need to get the buflen */
2077 if (get_user(pathlen, &gfin->gf_pathlen))
2080 if (pathlen > PATH_MAX)
2083 outsize = sizeof(*gfout) + pathlen;
2084 OBD_ALLOC(gfout, outsize);
2088 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2089 GOTO(gf_free, rc = -EFAULT);
2090 /* append root FID after gfout to let MDT know the root FID so that it
2091 * can lookup the correct path, this is mainly for fileset.
2092 * old server without fileset mount support will ignore this. */
2093 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2095 /* Call mdc_iocontrol */
2096 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2100 if (copy_to_user(arg, gfout, outsize))
2104 OBD_FREE(gfout, outsize);
2109 * Read the data_version for inode.
2111 * This value is computed using stripe object version on OST.
2112 * Version is computed using server side locking.
2114 * @param flags if do sync on the OST side;
2116 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2117 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
2119 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2121 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2129 /* If no file object initialized, we consider its version is 0. */
2135 env = cl_env_get(&refcheck);
2137 RETURN(PTR_ERR(env));
/* run a CIT_DATA_VERSION io; the OSC layers fill in dv_data_version */
2139 io = vvp_env_thread_io(env);
2141 io->u.ci_data_version.dv_data_version = 0;
2142 io->u.ci_data_version.dv_flags = flags;
2145 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2146 result = cl_io_loop(env, io);
2148 result = io->ci_result;
2150 *data_version = io->u.ci_data_version.dv_data_version;
2152 cl_io_fini(env, io);
/* a layout change mid-IO invalidates the version; retry from the top */
2154 if (unlikely(io->ci_need_restart))
2157 cl_env_put(env, &refcheck);
2163 * Trigger a HSM release request for the provided inode.
2165 int ll_hsm_release(struct inode *inode)
2168 struct obd_client_handle *och = NULL;
2169 __u64 data_version = 0;
2174 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2175 ll_get_fsname(inode->i_sb, NULL, 0),
2176 PFID(&ll_i2info(inode)->lli_fid));
/* take a write lease so no other client modifies the file while its
 * OST objects are being released */
2178 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2180 GOTO(out, rc = PTR_ERR(och));
2182 /* Grab latest data_version and [am]time values */
2183 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2187 env = cl_env_get(&refcheck);
2189 GOTO(out, rc = PTR_ERR(env));
/* fold OST attributes into the inode before the MDT close */
2191 ll_merge_attr(env, inode);
2192 cl_env_put(env, &refcheck);
2194 /* Release the file.
2195 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2196 * we still need it to pack l_remote_handle to MDT. */
2197 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
/* on the error path the lease open handle must still be closed */
2203 if (och != NULL && !IS_ERR(och)) /* close the file */
2204 ll_lease_close(och, inode, NULL);
/* Scratch state for ll_swap_layouts(): holds both inodes (sequentialized
 * by FID order) plus the data-version checks requested by the caller.
 * NOTE(review): the remaining members (dv1/dv2, check_dv1/check_dv2, and
 * the closing brace) are elided in this extract. */
2209 struct ll_swap_stack {
2212 struct inode *inode1;
2213 struct inode *inode2;
/* LL_IOC_LOV_SWAP_LAYOUTS handler: atomically swap the layouts of the two
 * files on the MDT.  Inodes are ordered by FID to avoid lock inversion,
 * dirty caches are optionally flushed via a group lock, and the data
 * versions are re-checked (if requested) so the swap fails with -EAGAIN
 * when either file changed under us.
 * NOTE(review): declarations (gid, dv, rc) and several GOTO/error lines
 * are elided in this extract. */
2218 static int ll_swap_layouts(struct file *file1, struct file *file2,
2219 struct lustre_swap_layouts *lsl)
2221 struct mdc_swap_layouts msl;
2222 struct md_op_data *op_data;
2225 struct ll_swap_stack *llss = NULL;
2228 OBD_ALLOC_PTR(llss);
2232 llss->inode1 = file_inode(file1);
2233 llss->inode2 = file_inode(file2);
2235 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2239 /* we use 2 bool because it is easier to swap than 2 bits */
2240 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2241 llss->check_dv1 = true;
2243 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2244 llss->check_dv2 = true;
2246 /* we cannot use lsl->sl_dvX directly because we may swap them */
2247 llss->dv1 = lsl->sl_dv1;
2248 llss->dv2 = lsl->sl_dv2;
2250 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2251 if (rc == 0) /* same file, done! */
/* order by FID so every caller locks in the same sequence */
2254 if (rc < 0) { /* sequentialize it */
2255 swap(llss->inode1, llss->inode2);
2257 swap(llss->dv1, llss->dv2);
2258 swap(llss->check_dv1, llss->check_dv2);
2262 if (gid != 0) { /* application asks to flush dirty cache */
2263 rc = ll_get_grouplock(llss->inode1, file1, gid);
2267 rc = ll_get_grouplock(llss->inode2, file2, gid);
2269 ll_put_grouplock(llss->inode1, file1, gid);
2274 /* ultimate check, before swaping the layouts we check if
2275 * dataversion has changed (if requested) */
2276 if (llss->check_dv1) {
2277 rc = ll_data_version(llss->inode1, &dv, 0);
2280 if (dv != llss->dv1)
2281 GOTO(putgl, rc = -EAGAIN);
2284 if (llss->check_dv2) {
2285 rc = ll_data_version(llss->inode2, &dv, 0);
2288 if (dv != llss->dv2)
2289 GOTO(putgl, rc = -EAGAIN);
2292 /* struct md_op_data is used to send the swap args to the mdt
2293 * only flags is missing, so we use struct mdc_swap_layouts
2294 * through the md_op_data->op_data */
2295 /* flags from user space have to be converted before they are send to
2296 * server, no flag is sent today, they are only used on the client */
2299 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2300 0, LUSTRE_OPC_ANY, &msl);
2301 if (IS_ERR(op_data))
2302 GOTO(free, rc = PTR_ERR(op_data));
2304 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2305 sizeof(*op_data), op_data, NULL);
2306 ll_finish_md_op_data(op_data);
/* release group locks in reverse acquisition order */
2313 ll_put_grouplock(llss->inode2, file2, gid);
2314 ll_put_grouplock(llss->inode1, file1, gid);
/* Set/clear HSM state flags on @inode after validating the masks and the
 * archive id, then forward the request to the MDT via obd_iocontrol(). */
2324 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2326 struct md_op_data *op_data;
2330 /* Detect out-of range masks */
2331 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2334 /* Non-root users are forbidden to set or clear flags which are
2335 * NOT defined in HSM_USER_MASK. */
2336 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2337 !cfs_capable(CFS_CAP_SYS_ADMIN))
2340 /* Detect out-of range archive id */
2341 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2342 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2345 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2346 LUSTRE_OPC_ANY, hss);
2347 if (IS_ERR(op_data))
2348 RETURN(PTR_ERR(op_data));
2350 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2351 sizeof(*op_data), op_data, NULL);
2353 ll_finish_md_op_data(op_data);
/* HSM import: mark a regular file as archived+released on the MDT, then
 * restore its user-visible attributes (mode, ownership, size, times) from
 * the hsm_user_import descriptor via ll_setattr_raw(). */
2358 static int ll_hsm_import(struct inode *inode, struct file *file,
2359 struct hsm_user_import *hui)
2361 struct hsm_state_set *hss = NULL;
2362 struct iattr *attr = NULL;
2366 if (!S_ISREG(inode->i_mode))
2372 GOTO(out, rc = -ENOMEM);
/* an imported file exists in the archive but has no local data yet */
2374 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2375 hss->hss_archive_id = hui->hui_archive_id;
2376 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2377 rc = ll_hsm_state_set(inode, hss);
2381 OBD_ALLOC_PTR(attr);
2383 GOTO(out, rc = -ENOMEM);
/* only permission bits from userspace; force the regular-file type bit */
2385 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2386 attr->ia_mode |= S_IFREG;
2387 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2388 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2389 attr->ia_size = hui->hui_size;
2390 attr->ia_mtime.tv_sec = hui->hui_mtime;
2391 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2392 attr->ia_atime.tv_sec = hui->hui_atime;
2393 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2395 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2396 ATTR_UID | ATTR_GID |
2397 ATTR_MTIME | ATTR_MTIME_SET |
2398 ATTR_ATIME | ATTR_ATIME_SET;
2402 rc = ll_setattr_raw(file_dentry(file), attr, true);
2406 inode_unlock(inode);
/* Translate an open fmode (FMODE_READ/FMODE_WRITE) into the corresponding
 * LL_LEASE_RDLCK/LL_LEASE_WRLCK lease-type bits. */
2418 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2420 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2421 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/* LL_IOC_FUTIMES_3 handler (admin-only): set atime, mtime AND ctime on a
 * regular file in one ll_setattr_raw() call -- unlike utimes(2), ctime can
 * be set explicitly here, which is needed by HSM/migration tools. */
2424 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2426 struct inode *inode = file_inode(file);
2428 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2429 ATTR_MTIME | ATTR_MTIME_SET |
2430 ATTR_CTIME | ATTR_CTIME_SET,
2432 .tv_sec = lfu->lfu_atime_sec,
2433 .tv_nsec = lfu->lfu_atime_nsec,
2436 .tv_sec = lfu->lfu_mtime_sec,
2437 .tv_nsec = lfu->lfu_mtime_nsec,
2440 .tv_sec = lfu->lfu_ctime_sec,
2441 .tv_nsec = lfu->lfu_ctime_nsec,
/* setting ctime arbitrarily is privileged */
2447 if (!capable(CAP_SYS_ADMIN))
2450 if (!S_ISREG(inode->i_mode))
2454 rc = ll_setattr_raw(file_dentry(file), &ia, false);
2455 inode_unlock(inode);
/* Map a userspace lockahead mode (MODE_READ_USER/MODE_WRITE_USER) to the
 * kernel cl_lock_mode.  NOTE(review): the return statements and default
 * case of this switch are elided in this extract. */
2460 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2463 case MODE_READ_USER:
2465 case MODE_WRITE_USER:
/* printable names for the userspace lock modes, used in debug output */
2472 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2474 /* Used to allow the upper layers of the client to request an LDLM lock
2475 * without doing an actual read or write.
2477 * Used for ladvise lockahead to manually request specific locks.
2479 * \param[in] file file this ladvise lock request is on
2480 * \param[in] ladvise ladvise struct describing this lock request
2482 * \retval 0 success, no detailed result available (sync requests
2483 * and requests sent to the server [not handled locally]
2484 * cannot return detailed results)
2485 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2486 * see definitions for details.
2487 * \retval negative negative errno on error
2489 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2491 struct lu_env *env = NULL;
2492 struct cl_io *io = NULL;
2493 struct cl_lock *lock = NULL;
2494 struct cl_lock_descr *descr = NULL;
2495 struct dentry *dentry = file->f_path.dentry;
2496 struct inode *inode = dentry->d_inode;
2497 enum cl_lock_mode cl_mode;
2498 off_t start = ladvise->lla_start;
2499 off_t end = ladvise->lla_end;
2505 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2506 "start=%llu, end=%llu\n", dentry->d_name.len,
2507 dentry->d_name.name, dentry->d_inode,
2508 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2511 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2513 GOTO(out, result = cl_mode);
2515 /* Get IO environment */
2516 result = cl_io_get(inode, &env, &io, &refcheck);
2520 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2523 * nothing to do for this io. This currently happens when
2524 * stripe sub-object's are not yet created.
2526 result = io->ci_result;
2527 } else if (result == 0) {
2528 lock = vvp_env_lock(env);
2529 descr = &lock->cll_descr;
2531 descr->cld_obj = io->ci_obj;
2532 /* Convert byte offsets to pages */
2533 descr->cld_start = cl_index(io->ci_obj, start);
2534 descr->cld_end = cl_index(io->ci_obj, end);
2535 descr->cld_mode = cl_mode;
2536 /* CEF_MUST is used because we do not want to convert a
2537 * lockahead request to a lockless lock */
2538 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
/* async requests enqueue speculatively without blocking the caller */
2541 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2542 descr->cld_enq_flags |= CEF_SPECULATIVE;
2544 result = cl_lock_request(env, io, lock);
2546 /* On success, we need to release the lock */
2548 cl_lock_release(env, lock);
2550 cl_io_fini(env, io);
2551 cl_env_put(env, &refcheck);
2553 /* -ECANCELED indicates a matching lock with a different extent
2554 * was already present, and -EEXIST indicates a matching lock
2555 * on exactly the same extent was already present.
2556 * We convert them to positive values for userspace to make
2557 * recognizing true errors easier.
2558 * Note we can only return these detailed results on async requests,
2559 * as sync requests look the same as i/o requests for locking. */
2560 if (result == -ECANCELED)
2561 result = LLA_RESULT_DIFFERENT;
2562 else if (result == -EEXIST)
2563 result = LLA_RESULT_SAME;
/* printable names for the ladvise advice types, used in error messages */
2568 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
/* Validate a single llapi_lu_ladvise request: known advice type, flags
 * legal for that advice, and (for range-based advices) a non-empty range.
 * Returns 0 or a negative errno.
 * NOTE(review): the rc assignments and some case/GOTO lines are elided in
 * this extract. */
2570 static int ll_ladvise_sanity(struct inode *inode,
2571 struct llapi_lu_ladvise *ladvise)
2573 enum lu_ladvise_type advice = ladvise->lla_advice;
2574 /* Note the peradvice flags is a 32 bit field, so per advice flags must
2575 * be in the first 32 bits of enum ladvise_flags */
2576 __u32 flags = ladvise->lla_peradvice_flags;
2577 /* 3 lines at 80 characters per line, should be plenty */
2580 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2582 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
2583 "last supported advice is %s (value '%d'): rc = %d\n",
2584 ll_get_fsname(inode->i_sb, NULL, 0), advice,
2585 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2589 /* Per-advice checks */
2591 case LU_LADVISE_LOCKNOEXPAND:
2592 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2594 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2596 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2597 ladvise_names[advice], rc);
2601 case LU_LADVISE_LOCKAHEAD:
2602 /* Currently only READ and WRITE modes can be requested */
2603 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2604 ladvise->lla_lockahead_mode == 0) {
2606 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2608 ll_get_fsname(inode->i_sb, NULL, 0),
2609 ladvise->lla_lockahead_mode,
2610 ladvise_names[advice], rc);
2613 case LU_LADVISE_WILLREAD:
2614 case LU_LADVISE_DONTNEED:
2616 /* Note fall through above - These checks apply to all advices
2617 * except LOCKNOEXPAND */
2618 if (flags & ~LF_DEFAULT_MASK) {
2620 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2622 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2623 ladvise_names[advice], rc);
2626 if (ladvise->lla_start >= ladvise->lla_end) {
2628 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2629 "for %s: rc = %d\n",
2630 ll_get_fsname(inode->i_sb, NULL, 0),
2631 ladvise->lla_start, ladvise->lla_end,
2632 ladvise_names[advice], rc);
2644 * Give file access advices
2646 * The ladvise interface is similar to Linux fadvise() system call, except it
2647 * forwards the advices directly from Lustre client to server. The server side
2648 * code will apply appropriate read-ahead and caching techniques for the
2649 * corresponding files.
2651 * A typical workload for ladvise is e.g. a bunch of different clients are
2652 * doing small random reads of a file, so prefetching pages into OSS cache
2653 * with big linear reads before the random IO is a net benefit. Fetching
2654 * all that data into each client cache with fadvise() may not be, due to
2655 * much more data being sent to the client.
2657 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2658 struct llapi_lu_ladvise *ladvise)
2662 struct cl_ladvise_io *lio;
/* Take a cl_env reference for the duration of the cl_io. */
2667 env = cl_env_get(&refcheck);
2669 RETURN(PTR_ERR(env));
2671 io = vvp_env_thread_io(env);
2672 io->ci_obj = ll_i2info(inode)->lli_clob;
2674 /* initialize parameters for ladvise */
2675 lio = &io->u.ci_ladvise;
2676 lio->li_start = ladvise->lla_start;
2677 lio->li_end = ladvise->lla_end;
2678 lio->li_fid = ll_inode2fid(inode);
2679 lio->li_advice = ladvise->lla_advice;
2680 lio->li_flags = flags;
/* Run the CIT_LADVISE io through the cl_io state machine; on init
 * failure the result is taken from io->ci_result (elided here). */
2682 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2683 rc = cl_io_loop(env, io);
2687 cl_io_fini(env, io);
2688 cl_env_put(env, &refcheck);
/*
 * Enable or disable DLM lock expansion for this open file.
 * LF_UNSET in @flags clears the no-expand bit; otherwise it is set.
 */
2692 static int ll_lock_noexpand(struct file *file, int flags)
2694 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2696 fd->ll_lock_no_expand = !(flags & LF_UNSET);
/*
 * FS_IOC_FSGETXATTR-style handler: copy the fsxattr structure from
 * userspace, fill in the file's project ID from the llite inode info,
 * and copy the result back.  Returns 0 or a negative errno (-EFAULT on
 * a failed user copy; the returns are on lines elided here).
 */
2701 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
2704 struct fsxattr fsxattr;
2706 if (copy_from_user(&fsxattr,
2707 (const struct fsxattr __user *)arg,
2711 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
2712 if (copy_to_user((struct fsxattr __user *)arg,
2713 &fsxattr, sizeof(fsxattr)))
/*
 * FS_IOC_FSSETXATTR-style handler: set the file's project ID.
 * Requires CAP_SYS_ADMIN; the new ID is sent to the MDS via md_setattr()
 * with MDS_ATTR_PROJID in ia_valid.  Uses goto-style cleanup through
 * out_fsxattr1 (label elided here) to release op_data on all paths.
 */
2719 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
2723 struct md_op_data *op_data;
2724 struct ptlrpc_request *req = NULL;
2726 struct fsxattr fsxattr;
2728 /* only root could change project ID */
2729 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2732 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2733 LUSTRE_OPC_ANY, NULL);
2734 if (IS_ERR(op_data))
2735 RETURN(PTR_ERR(op_data));
2737 if (copy_from_user(&fsxattr,
2738 (const struct fsxattr __user *)arg,
2740 GOTO(out_fsxattr1, rc = -EFAULT);
2742 op_data->op_projid = fsxattr.fsx_projid;
2743 op_data->op_attr.ia_valid |= MDS_ATTR_PROJID;
2744 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
2746 ptlrpc_req_finished(req);
2749 ll_finish_md_op_data(op_data);
/*
 * Main ioctl dispatcher for regular files on a Lustre client.
 *
 * Handles llite-specific commands (file flags, striping, layout swap,
 * group locks, FID/path translation, data version, HSM state/import,
 * leases, ladvise, project xattrs) and forwards unrecognized commands
 * to the OBD layer.  Each case RETURNs on its own; there is no common
 * exit path.
 */
2756 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2758 struct inode *inode = file_inode(file);
2759 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2763 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2764 PFID(ll_inode2fid(inode)), inode, cmd);
2765 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2767 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2768 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2772 case LL_IOC_GETFLAGS:
2773 /* Get the current value of the file flags */
2774 return put_user(fd->fd_flags, (int __user *)arg);
2775 case LL_IOC_SETFLAGS:
2776 case LL_IOC_CLRFLAGS:
2777 /* Set or clear specific file flags */
2778 /* XXX This probably needs checks to ensure the flags are
2779 * not abused, and to handle any flag side effects.
2781 if (get_user(flags, (int __user *) arg))
2784 if (cmd == LL_IOC_SETFLAGS) {
/* LL_FILE_IGNORE_LOCK is only safe with O_DIRECT i/o. */
2785 if ((flags & LL_FILE_IGNORE_LOCK) &&
2786 !(file->f_flags & O_DIRECT)) {
2787 CERROR("%s: unable to disable locking on "
2788 "non-O_DIRECT file\n", current->comm);
2792 fd->fd_flags |= flags;
2794 fd->fd_flags &= ~flags;
2797 case LL_IOC_LOV_SETSTRIPE:
2798 case LL_IOC_LOV_SETSTRIPE_NEW:
2799 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
2800 case LL_IOC_LOV_SETEA:
2801 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
2802 case LL_IOC_LOV_SWAP_LAYOUTS: {
2804 struct lustre_swap_layouts lsl;
2807 if (copy_from_user(&lsl, (char __user *)arg,
2808 sizeof(struct lustre_swap_layouts)))
/* Both files must be open for write to swap their layouts. */
2811 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
2814 file2 = fget(lsl.sl_fd);
2818 /* O_WRONLY or O_RDWR */
2819 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
2820 GOTO(out, rc = -EPERM);
2822 intent = lsl.sl_flags & INTENT_LAYOUTS_CLOSE;
2824 struct inode *inode2;
2825 struct ll_inode_info *lli;
2826 struct obd_client_handle *och = NULL;
/* Close-on-swap: consume this fd's lease handle under
 * lli_och_mutex and do the swap as part of the close. */
2828 lli = ll_i2info(inode);
2829 mutex_lock(&lli->lli_och_mutex);
2830 if (fd->fd_lease_och != NULL) {
2831 och = fd->fd_lease_och;
2832 fd->fd_lease_och = NULL;
2834 mutex_unlock(&lli->lli_och_mutex);
2836 GOTO(out, rc = -ENOLCK);
2837 inode2 = file_inode(file2);
2838 rc = ll_swap_layouts_close(och, inode, inode2, intent);
2840 rc = ll_swap_layouts(file, file2, &lsl);
2846 case LL_IOC_LOV_GETSTRIPE:
2847 case LL_IOC_LOV_GETSTRIPE_NEW:
2848 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
2849 case FSFILT_IOC_GETFLAGS:
2850 case FSFILT_IOC_SETFLAGS:
2851 RETURN(ll_iocontrol(inode, file, cmd, arg));
2852 case FSFILT_IOC_GETVERSION_OLD:
2853 case FSFILT_IOC_GETVERSION:
2854 RETURN(put_user(inode->i_generation, (int __user *)arg));
2855 case LL_IOC_GROUP_LOCK:
2856 RETURN(ll_get_grouplock(inode, file, arg));
2857 case LL_IOC_GROUP_UNLOCK:
2858 RETURN(ll_put_grouplock(inode, file, arg));
2859 case IOC_OBD_STATFS:
2860 RETURN(ll_obd_statfs(inode, (void __user *)arg));
2862 /* We need to special case any other ioctls we want to handle,
2863 * to send them to the MDS/OST as appropriate and to properly
2864 * network encode the arg field.
2865 case FSFILT_IOC_SETVERSION_OLD:
2866 case FSFILT_IOC_SETVERSION:
2868 case LL_IOC_FLUSHCTX:
2869 RETURN(ll_flush_ctx(inode));
2870 case LL_IOC_PATH2FID: {
2871 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2872 sizeof(struct lu_fid)))
2877 case LL_IOC_GETPARENT:
2878 RETURN(ll_getparent(file, (struct getparent __user *)arg));
2880 case OBD_IOC_FID2PATH:
2881 RETURN(ll_fid2path(inode, (void __user *)arg));
2882 case LL_IOC_DATA_VERSION: {
2883 struct ioc_data_version idv;
2886 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
/* Only the flush flags are honored from userspace. */
2889 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2890 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2893 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2899 case LL_IOC_GET_MDTIDX: {
2902 mdtidx = ll_get_mdt_idx(inode);
2906 if (put_user((int)mdtidx, (int __user *)arg))
2911 case OBD_IOC_GETDTNAME:
2912 case OBD_IOC_GETMDNAME:
2913 RETURN(ll_get_obd_name(inode, cmd, arg));
2914 case LL_IOC_HSM_STATE_GET: {
2915 struct md_op_data *op_data;
2916 struct hsm_user_state *hus;
2923 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2924 LUSTRE_OPC_ANY, hus);
2925 if (IS_ERR(op_data)) {
2927 RETURN(PTR_ERR(op_data));
/* Ask the MDT for the HSM state, then copy it to the user. */
2930 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2933 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2936 ll_finish_md_op_data(op_data);
2940 case LL_IOC_HSM_STATE_SET: {
2941 struct hsm_state_set *hss;
2948 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2953 rc = ll_hsm_state_set(inode, hss);
2958 case LL_IOC_HSM_ACTION: {
2959 struct md_op_data *op_data;
2960 struct hsm_current_action *hca;
2967 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2968 LUSTRE_OPC_ANY, hca);
2969 if (IS_ERR(op_data)) {
2971 RETURN(PTR_ERR(op_data));
2974 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2977 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2980 ll_finish_md_op_data(op_data);
2984 case LL_IOC_SET_LEASE: {
2985 struct ll_inode_info *lli = ll_i2info(inode);
2986 struct obd_client_handle *och = NULL;
/* Requested lease mode must match the file's open mode. */
2991 case LL_LEASE_WRLCK:
2992 if (!(file->f_mode & FMODE_WRITE))
2994 fmode = FMODE_WRITE;
2996 case LL_LEASE_RDLCK:
2997 if (!(file->f_mode & FMODE_READ))
3001 case LL_LEASE_UNLCK:
3002 mutex_lock(&lli->lli_och_mutex);
3003 if (fd->fd_lease_och != NULL) {
3004 och = fd->fd_lease_och;
3005 fd->fd_lease_och = NULL;
3007 mutex_unlock(&lli->lli_och_mutex);
3012 fmode = och->och_flags;
3013 rc = ll_lease_close(och, inode, &lease_broken);
3017 rc = ll_lease_och_release(inode, file);
3024 RETURN(ll_lease_type_from_fmode(fmode));
3029 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3031 /* apply for lease */
3032 och = ll_lease_open(inode, file, fmode, 0);
3034 RETURN(PTR_ERR(och));
/* Publish the new lease handle unless one already exists. */
3037 mutex_lock(&lli->lli_och_mutex);
3038 if (fd->fd_lease_och == NULL) {
3039 fd->fd_lease_och = och;
3042 mutex_unlock(&lli->lli_och_mutex);
3044 /* impossible now that only excl is supported for now */
3045 ll_lease_close(och, inode, &lease_broken);
3050 case LL_IOC_GET_LEASE: {
3051 struct ll_inode_info *lli = ll_i2info(inode);
3052 struct ldlm_lock *lock = NULL;
3055 mutex_lock(&lli->lli_och_mutex);
3056 if (fd->fd_lease_och != NULL) {
3057 struct obd_client_handle *och = fd->fd_lease_och;
/* Report the lease mode only while its DLM lock is not
 * being cancelled. */
3059 lock = ldlm_handle2lock(&och->och_lease_handle);
3061 lock_res_and_lock(lock);
3062 if (!ldlm_is_cancel(lock))
3063 fmode = och->och_flags;
3065 unlock_res_and_lock(lock);
3066 LDLM_LOCK_PUT(lock);
3069 mutex_unlock(&lli->lli_och_mutex);
3071 RETURN(ll_lease_type_from_fmode(fmode));
3073 case LL_IOC_HSM_IMPORT: {
3074 struct hsm_user_import *hui;
3080 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3085 rc = ll_hsm_import(inode, file, hui);
3090 case LL_IOC_FUTIMES_3: {
3091 struct ll_futimes_3 lfu;
3093 if (copy_from_user(&lfu,
3094 (const struct ll_futimes_3 __user *)arg,
3098 RETURN(ll_file_futimes_3(file, &lfu));
3100 case LL_IOC_LADVISE: {
3101 struct llapi_ladvise_hdr *k_ladvise_hdr;
3102 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3105 int alloc_size = sizeof(*k_ladvise_hdr);
/* First copy just the header to learn lah_count, validate it,
 * then reallocate with room for the advice array and copy the
 * whole request. */
3108 u_ladvise_hdr = (void __user *)arg;
3109 OBD_ALLOC_PTR(k_ladvise_hdr);
3110 if (k_ladvise_hdr == NULL)
3113 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3114 GOTO(out_ladvise, rc = -EFAULT);
3116 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3117 k_ladvise_hdr->lah_count < 1)
3118 GOTO(out_ladvise, rc = -EINVAL);
3120 num_advise = k_ladvise_hdr->lah_count;
3121 if (num_advise >= LAH_COUNT_MAX)
3122 GOTO(out_ladvise, rc = -EFBIG);
3124 OBD_FREE_PTR(k_ladvise_hdr);
3125 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3126 lah_advise[num_advise]);
3127 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3128 if (k_ladvise_hdr == NULL)
3132 * TODO: submit multiple advices to one server in a single RPC
3134 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3135 GOTO(out_ladvise, rc = -EFAULT);
3137 for (i = 0; i < num_advise; i++) {
3138 struct llapi_lu_ladvise *k_ladvise =
3139 &k_ladvise_hdr->lah_advise[i];
3140 struct llapi_lu_ladvise __user *u_ladvise =
3141 &u_ladvise_hdr->lah_advise[i];
3143 rc = ll_ladvise_sanity(inode, k_ladvise);
3145 GOTO(out_ladvise, rc);
3147 switch (k_ladvise->lla_advice) {
3148 case LU_LADVISE_LOCKNOEXPAND:
3149 rc = ll_lock_noexpand(file,
3150 k_ladvise->lla_peradvice_flags);
3151 GOTO(out_ladvise, rc);
3152 case LU_LADVISE_LOCKAHEAD:
3154 rc = ll_file_lock_ahead(file, k_ladvise);
3157 GOTO(out_ladvise, rc);
3160 &u_ladvise->lla_lockahead_result))
3161 GOTO(out_ladvise, rc = -EFAULT);
3164 rc = ll_ladvise(inode, file,
3165 k_ladvise_hdr->lah_flags,
3168 GOTO(out_ladvise, rc);
3175 OBD_FREE(k_ladvise_hdr, alloc_size);
3178 case LL_IOC_FSGETXATTR:
3179 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3180 case LL_IOC_FSSETXATTR:
3181 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3183 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
/* Default: forward unknown commands to the data (OST) export. */
3185 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3186 (void __user *)arg));
3190 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Commit a validated seek offset to file->f_pos (compat helper for
 * kernels without generic_file_llseek_size).  Rejects negative offsets
 * without FMODE_UNSIGNED_OFFSET and offsets beyond @maxsize; resets
 * f_version when the position actually changes.
 */
3191 static inline loff_t
3192 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3194 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3196 if (offset > maxsize)
3199 if (offset != file->f_pos) {
3200 file->f_pos = offset;
3201 file->f_version = 0;
/*
 * Compat implementation of generic_file_llseek_size() for older kernels:
 * computes the target position for SEEK_SET/CUR/END (and the virtual
 * SEEK_DATA/SEEK_HOLE semantics against @eof) bounded by @maxsize, then
 * applies it via llseek_execute().
 */
3207 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3208 loff_t maxsize, loff_t eof)
3210 struct inode *inode = file_inode(file);
3218 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3219 * position-querying operation. Avoid rewriting the "same"
3220 * f_pos value back to the file because a concurrent read(),
3221 * write() or lseek() might have altered it
3226 * f_lock protects against read/modify/write race with other
3227 * SEEK_CURs. Note that parallel writes and reads behave
3231 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3232 inode_unlock(inode);
3236 * In the generic case the entire file is data, so as long as
3237 * offset isn't at the end of the file then the offset is data.
3244 * There is a virtual hole at the end of the file, so as long as
3245 * offset isn't i_size or larger, return i_size.
3253 return llseek_execute(file, offset, maxsize);
/*
 * llseek entry point.  For SEEK_END/SEEK_HOLE/SEEK_DATA a glimpse RPC is
 * needed first so i_size is current before the generic seek logic runs
 * against ll_file_maxbytes(inode).
 */
3257 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3259 struct inode *inode = file_inode(file);
3260 loff_t retval, eof = 0;
3263 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3264 (origin == SEEK_CUR) ? file->f_pos : 0);
3265 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3266 PFID(ll_inode2fid(inode)), inode, retval, retval,
3268 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3270 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3271 retval = ll_glimpse_size(inode);
3274 eof = i_size_read(inode);
3277 retval = ll_generic_file_llseek_size(file, offset, origin,
3278 ll_file_maxbytes(inode), eof);
/*
 * flush() file operation (called on every close of a file descriptor).
 * Reports previously recorded async writeback errors exactly once:
 * collects lli_async_rc and any per-object async rc, but suppresses the
 * error if this fd was already told about a write failure.
 */
3282 static int ll_flush(struct file *file, fl_owner_t id)
3284 struct inode *inode = file_inode(file);
3285 struct ll_inode_info *lli = ll_i2info(inode);
3286 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3289 LASSERT(!S_ISDIR(inode->i_mode));
3291 /* catch async errors that were recorded back when async writeback
3292 * failed for pages in this mapping. */
3293 rc = lli->lli_async_rc;
3294 lli->lli_async_rc = 0;
3295 if (lli->lli_clob != NULL) {
3296 err = lov_read_and_clear_async_rc(lli->lli_clob);
3301 /* The application has been told write failure already.
3302 * Do not report failure again. */
3303 if (fd->fd_write_failed)
3305 return rc ? -EIO : 0;
3309 * Called to make sure a portion of file has been written out.
3310 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
3312 * Return how many pages have been written.
3314 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3315 enum cl_fsync_mode mode, int ignore_layout)
3319 struct cl_fsync_io *fio;
/* Reject any mode not in the known cl_fsync_mode set. */
3324 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3325 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3328 env = cl_env_get(&refcheck);
3330 RETURN(PTR_ERR(env));
3332 io = vvp_env_thread_io(env);
3333 io->ci_obj = ll_i2info(inode)->lli_clob;
3334 io->ci_ignore_layout = ignore_layout;
3336 /* initialize parameters for sync */
3337 fio = &io->u.ci_fsync;
3338 fio->fi_start = start;
3340 fio->fi_fid = ll_inode2fid(inode);
3341 fio->fi_mode = mode;
3342 fio->fi_nr_written = 0;
/* Run the CIT_FSYNC io; on success the page count written is
 * reported back via fi_nr_written. */
3344 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3345 result = cl_io_loop(env, io);
3347 result = io->ci_result;
3349 result = fio->fi_nr_written;
3350 cl_io_fini(env, io);
3351 cl_env_put(env, &refcheck);
3357 * When dentry is provided (the 'else' case), file_dentry() may be
3358 * null and dentry must be used directly rather than pulled from
3359 * file_dentry() as is done otherwise.
/*
 * fsync() entry point.  Three kernel-API variants are selected at build
 * time; all converge on: wait for in-flight page i/o, surface recorded
 * async writeback errors, md_fsync() to the MDS, and for regular files
 * a CL_FSYNC_ALL range sync to the OSTs, updating fd_write_failed so
 * ll_flush() does not report the same failure twice.
 */
3362 #ifdef HAVE_FILE_FSYNC_4ARGS
3363 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3365 struct dentry *dentry = file_dentry(file);
3367 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3368 int ll_fsync(struct file *file, int datasync)
3370 struct dentry *dentry = file_dentry(file);
3372 loff_t end = LLONG_MAX;
3374 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3377 loff_t end = LLONG_MAX;
3379 struct inode *inode = dentry->d_inode;
3380 struct ll_inode_info *lli = ll_i2info(inode);
3381 struct ptlrpc_request *req;
3385 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3386 PFID(ll_inode2fid(inode)), inode);
3387 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3389 #ifdef HAVE_FILE_FSYNC_4ARGS
3390 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3391 lock_inode = !lli->lli_inode_locked;
3395 /* fsync's caller has already called _fdata{sync,write}, we want
3396 * that IO to finish before calling the osc and mdc sync methods */
3397 rc = filemap_fdatawait(inode->i_mapping);
3400 /* catch async errors that were recorded back when async writeback
3401 * failed for pages in this mapping. */
3402 if (!S_ISDIR(inode->i_mode)) {
3403 err = lli->lli_async_rc;
3404 lli->lli_async_rc = 0;
3407 if (lli->lli_clob != NULL) {
3408 err = lov_read_and_clear_async_rc(lli->lli_clob);
3414 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3418 ptlrpc_req_finished(req);
3420 if (S_ISREG(inode->i_mode)) {
3421 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3423 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3424 if (rc == 0 && err < 0)
3427 fd->fd_write_failed = true;
3429 fd->fd_write_failed = false;
3432 #ifdef HAVE_FILE_FSYNC_4ARGS
3434 inode_unlock(inode);
/*
 * flock()/fcntl() byte-range and whole-file lock handler.
 *
 * Translates the kernel's struct file_lock into an LDLM_FLOCK enqueue
 * against the MDS: BSD flocks lock the whole file keyed by the struct
 * file pointer, POSIX locks use the fl_owner and byte range.  After the
 * server enqueue succeeds, the lock is mirrored into the local VFS lock
 * lists (locks_lock_file_wait() or the older flock/posix variants); if
 * that local step fails the server lock is rolled back with an LCK_NL
 * enqueue.
 */
3440 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3442 struct inode *inode = file_inode(file);
3443 struct ll_sb_info *sbi = ll_i2sbi(inode);
3444 struct ldlm_enqueue_info einfo = {
3445 .ei_type = LDLM_FLOCK,
3446 .ei_cb_cp = ldlm_flock_completion_ast,
3447 .ei_cbdata = file_lock,
3449 struct md_op_data *op_data;
3450 struct lustre_handle lockh = { 0 };
3451 union ldlm_policy_data flock = { { 0 } };
3452 int fl_type = file_lock->fl_type;
3458 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3459 PFID(ll_inode2fid(inode)), file_lock);
3461 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3463 if (file_lock->fl_flags & FL_FLOCK) {
3464 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3465 /* flocks are whole-file locks */
3466 flock.l_flock.end = OFFSET_MAX;
3467 /* For flocks owner is determined by the local file descriptor*/
3468 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3469 } else if (file_lock->fl_flags & FL_POSIX) {
3470 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3471 flock.l_flock.start = file_lock->fl_start;
3472 flock.l_flock.end = file_lock->fl_end;
3476 flock.l_flock.pid = file_lock->fl_pid;
3478 /* Somewhat ugly workaround for svc lockd.
3479 * lockd installs custom fl_lmops->lm_compare_owner that checks
3480 * for the fl_owner to be the same (which it always is on local node
3481 * I guess between lockd processes) and then compares pid.
3482 * As such we assign pid to the owner field to make it all work,
3483 * conflict with normal locks is unlikely since pid space and
3484 * pointer space for current->files are not intersecting */
3485 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3486 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* Map the fcntl lock type onto a DLM mode: read -> PR, write -> PW,
 * unlock -> NL (see the comment below). */
3490 einfo.ei_mode = LCK_PR;
3493 /* An unlock request may or may not have any relation to
3494 * existing locks so we may not be able to pass a lock handle
3495 * via a normal ldlm_lock_cancel() request. The request may even
3496 * unlock a byte range in the middle of an existing lock. In
3497 * order to process an unlock request we need all of the same
3498 * information that is given with a normal read or write record
3499 * lock request. To avoid creating another ldlm unlock (cancel)
3500 * message we'll treat a LCK_NL flock request as an unlock. */
3501 einfo.ei_mode = LCK_NL;
3504 einfo.ei_mode = LCK_PW;
3507 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* Non-blocking set requests enqueue with BLOCK_NOWAIT; GETLK-style
 * queries use TEST_LOCK. */
3522 flags = LDLM_FL_BLOCK_NOWAIT;
3528 flags = LDLM_FL_TEST_LOCK;
3531 CERROR("unknown fcntl lock command: %d\n", cmd);
3535 /* Save the old mode so that if the mode in the lock changes we
3536 * can decrement the appropriate reader or writer refcount. */
3537 file_lock->fl_type = einfo.ei_mode;
3539 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3540 LUSTRE_OPC_ANY, NULL);
3541 if (IS_ERR(op_data))
3542 RETURN(PTR_ERR(op_data));
3544 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
3545 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
3546 flock.l_flock.pid, flags, einfo.ei_mode,
3547 flock.l_flock.start, flock.l_flock.end);
3549 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
3552 /* Restore the file lock type if not TEST lock. */
3553 if (!(flags & LDLM_FL_TEST_LOCK))
3554 file_lock->fl_type = fl_type;
3556 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
3557 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
3558 !(flags & LDLM_FL_TEST_LOCK))
3559 rc2 = locks_lock_file_wait(file, file_lock);
3561 if ((file_lock->fl_flags & FL_FLOCK) &&
3562 (rc == 0 || file_lock->fl_type == F_UNLCK))
3563 rc2 = flock_lock_file_wait(file, file_lock);
3564 if ((file_lock->fl_flags & FL_POSIX) &&
3565 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3566 !(flags & LDLM_FL_TEST_LOCK))
3567 rc2 = posix_lock_file_wait(file, file_lock);
3568 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
/* Local bookkeeping failed: undo the server-side lock by enqueueing
 * the same range with mode LCK_NL (i.e. unlock, per comment above). */
3570 if (rc2 && file_lock->fl_type != F_UNLCK) {
3571 einfo.ei_mode = LCK_NL;
3572 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
3577 ll_finish_md_op_data(op_data);
/*
 * Look up the FID of @name under @parent via an MDS getattr-by-name RPC.
 * On success *fid is filled from the reply body; when @inode is non-NULL
 * the reply is additionally used to instantiate the inode via
 * ll_prep_inode().  The request is released through out_req (label
 * elided here).
 */
3582 int ll_get_fid_by_name(struct inode *parent, const char *name,
3583 int namelen, struct lu_fid *fid,
3584 struct inode **inode)
3586 struct md_op_data *op_data = NULL;
3587 struct mdt_body *body;
3588 struct ptlrpc_request *req;
3592 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3593 LUSTRE_OPC_ANY, NULL);
3594 if (IS_ERR(op_data))
3595 RETURN(PTR_ERR(op_data));
3597 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
3598 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3599 ll_finish_md_op_data(op_data);
3603 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3605 GOTO(out_req, rc = -EFAULT);
3607 *fid = body->mbo_fid1;
3610 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
3612 ptlrpc_req_finished(req);
/*
 * Migrate the entry @name under @parent to MDT @mdtidx.
 *
 * Resolves the child inode (dcache first, then by-name getattr RPC),
 * refuses to migrate the filesystem root, and skips the work if the
 * child already lives on the target MDT.  For regular files a write
 * lease is taken and the data version recorded so the MDS can detect
 * concurrent modification; the migration itself is issued as a rename
 * RPC with CLI_MIGRATE/MDS_RENAME_MIGRATE.  Cleanup runs through the
 * out_close/out_unlock/out_iput/out_free labels (partially elided here).
 */
3616 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3617 const char *name, int namelen)
3619 struct dentry *dchild = NULL;
3620 struct inode *child_inode = NULL;
3621 struct md_op_data *op_data;
3622 struct ptlrpc_request *request = NULL;
3623 struct obd_client_handle *och = NULL;
3625 struct mdt_body *body;
3627 __u64 data_version = 0;
3630 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3631 name, PFID(ll_inode2fid(parent)), mdtidx);
3633 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3634 0, LUSTRE_OPC_ANY, NULL);
3635 if (IS_ERR(op_data))
3636 RETURN(PTR_ERR(op_data));
3638 /* Get child FID first */
3639 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
3642 dchild = d_lookup(file_dentry(file), &qstr);
3643 if (dchild != NULL) {
3644 if (dchild->d_inode != NULL)
3645 child_inode = igrab(dchild->d_inode);
/* Not in the dcache: resolve the FID (and inode) over the wire. */
3649 if (child_inode == NULL) {
3650 rc = ll_get_fid_by_name(parent, name, namelen,
3651 &op_data->op_fid3, &child_inode);
3656 if (child_inode == NULL)
3657 GOTO(out_free, rc = -EINVAL);
3660 * lfs migrate command needs to be blocked on the client
3661 * by checking the migrate FID against the FID of the
3664 if (child_inode == parent->i_sb->s_root->d_inode)
3665 GOTO(out_iput, rc = -EINVAL);
3667 inode_lock(child_inode);
3668 op_data->op_fid3 = *ll_inode2fid(child_inode);
3669 if (!fid_is_sane(&op_data->op_fid3)) {
3670 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
3671 ll_get_fsname(parent->i_sb, NULL, 0), name,
3672 PFID(&op_data->op_fid3));
3673 GOTO(out_unlock, rc = -EINVAL);
3676 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3678 GOTO(out_unlock, rc);
/* Already on the target MDT: nothing to migrate. */
3681 CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name,
3682 PFID(&op_data->op_fid3), mdtidx);
3683 GOTO(out_unlock, rc = 0);
3686 if (S_ISREG(child_inode->i_mode)) {
3687 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
3691 GOTO(out_unlock, rc);
3694 rc = ll_data_version(child_inode, &data_version,
3697 GOTO(out_close, rc);
3699 op_data->op_handle = och->och_fh;
3700 op_data->op_data = och->och_mod;
3701 op_data->op_data_version = data_version;
3702 op_data->op_lease_handle = och->och_lease_handle;
3703 op_data->op_bias |= MDS_RENAME_MIGRATE;
3706 op_data->op_mds = mdtidx;
3707 op_data->op_cli_flags = CLI_MIGRATE;
3708 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3709 namelen, name, namelen, &request);
3711 LASSERT(request != NULL);
3712 ll_update_times(request, parent);
3714 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
3715 LASSERT(body != NULL);
3717 /* If the server does release layout lock, then we cleanup
3718 * the client och here, otherwise release it in out_close: */
3720 body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
3721 obd_mod_put(och->och_mod);
3722 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
3724 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
3730 if (request != NULL) {
3731 ptlrpc_req_finished(request);
3735 /* Try again if the file layout has changed. */
3736 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
3740 if (och != NULL) /* close the file */
3741 ll_lease_close(och, child_inode, NULL);
3743 clear_nlink(child_inode);
3745 inode_unlock(child_inode);
3749 ll_finish_md_op_data(op_data);
/* Lock operation used when the client is mounted with flock disabled;
 * body (elided here) does not perform any locking. */
3754 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3762 * test if some locks matching bits and l_req_mode are acquired
3763 * - bits can be in different locks
3764 * - if found clear the common lock bits in *bits
3765 * - the bits not found, are kept in *bits
3767 * \param bits [IN] searched lock bits [IN]
3768 * \param l_req_mode [IN] searched lock mode
3769 * \retval boolean, true iff all bits are found
3771 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
3773 struct lustre_handle lockh;
3774 union ldlm_policy_data policy;
/* LCK_MINMODE means "any mode": match CR|CW|PR|PW. */
3775 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
3776 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
3785 fid = &ll_i2info(inode)->lli_fid;
3786 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3787 ldlm_lockname[mode]);
3789 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* Probe each requested inodebit individually; a matching lock may
 * carry additional bits, all of which are cleared from *bits. */
3790 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3791 policy.l_inodebits.bits = *bits & (1 << i);
3792 if (policy.l_inodebits.bits == 0)
3795 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3796 &policy, mode, &lockh)) {
3797 struct ldlm_lock *lock;
3799 lock = ldlm_handle2lock(&lockh);
3802 ~(lock->l_policy_data.l_inodebits.bits);
3803 LDLM_LOCK_PUT(lock);
3805 *bits &= ~policy.l_inodebits.bits;
/*
 * Try to match (and take a reference on) an existing MDC lock on this
 * inode covering @bits with mode @mode.  On a match the handle is
 * returned through @lockh; the md_lock_match() result (matched mode or
 * 0) is the return value.
 */
3812 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
3813 struct lustre_handle *lockh, __u64 flags,
3814 enum ldlm_mode mode)
3816 union ldlm_policy_data policy = { .l_inodebits = { bits } };
3821 fid = &ll_i2info(inode)->lli_fid;
3822 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3824 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3825 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the rc of a revalidate getattr.  -ENOENT on a live inode
 * means it was unlinked remotely: striped directories with a bad stripe
 * still ask for another revalidation, and non-regular/non-directory
 * inodes get their nlink cleared.  Other errors are logged (EACCES and
 * EIDRM quietly, everything else loudly).
 */
3830 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3832 /* Already unlinked. Just update nlink and return success */
3833 if (rc == -ENOENT) {
3835 /* If it is striped directory, and there is bad stripe
3836 * Let's revalidate the dentry again, instead of returning
3838 if (S_ISDIR(inode->i_mode) &&
3839 ll_i2info(inode)->lli_lsm_md != NULL)
3842 /* This path cannot be hit for regular files unless in
3843 * case of obscure races, so no need to to validate
3845 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3847 } else if (rc != 0) {
3848 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3849 "%s: revalidate FID "DFID" error: rc = %d\n",
3850 ll_get_fsname(inode->i_sb, NULL, 0),
3851 PFID(ll_inode2fid(inode)), rc);
/*
 * Refresh the inode's metadata from the MDS if no covering UPDATE/LOOKUP
 * ibits lock is held.
 *
 * With OBD_CONNECT_ATTRFID the refresh is an intent getattr/lookup by
 * FID (no name), which also revalidates the dentry and unhashes it if
 * the file was unlinked.  Without it, a plain md_getattr() is issued
 * only when ll_have_md_lock() finds no matching lock, requesting EA
 * sizes for regular files.
 */
3857 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3859 struct inode *inode = dentry->d_inode;
3860 struct ptlrpc_request *req = NULL;
3861 struct obd_export *exp;
3865 LASSERT(inode != NULL);
3867 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3868 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3870 exp = ll_i2mdexp(inode);
3872 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3873 * But under CMD case, it caused some lock issues, should be fixed
3874 * with new CMD ibits lock. See bug 12718 */
3875 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3876 struct lookup_intent oit = { .it_op = IT_GETATTR };
3877 struct md_op_data *op_data;
3879 if (ibits == MDS_INODELOCK_LOOKUP)
3880 oit.it_op = IT_LOOKUP;
3882 /* Call getattr by fid, so do not provide name at all. */
3883 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3884 dentry->d_inode, NULL, 0, 0,
3885 LUSTRE_OPC_ANY, NULL);
3886 if (IS_ERR(op_data))
3887 RETURN(PTR_ERR(op_data));
3889 rc = md_intent_lock(exp, op_data, &oit, &req,
3890 &ll_md_blocking_ast, 0);
3891 ll_finish_md_op_data(op_data);
3893 rc = ll_inode_revalidate_fini(inode, rc);
3897 rc = ll_revalidate_it_finish(req, &oit, dentry);
3899 ll_intent_release(&oit);
3903 /* Unlinked? Unhash dentry, so it is not picked up later by
3904 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3905 here to preserve get_cwd functionality on 2.6.
3907 if (!dentry->d_inode->i_nlink) {
3908 ll_lock_dcache(inode);
3909 d_lustre_invalidate(dentry, 0);
3910 ll_unlock_dcache(inode);
3913 ll_lookup_finish_locks(&oit, dentry);
3914 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3915 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3916 u64 valid = OBD_MD_FLGETATTR;
3917 struct md_op_data *op_data;
/* For regular files also fetch the striping EA so the layout
 * can be refreshed along with the attributes. */
3920 if (S_ISREG(inode->i_mode)) {
3921 rc = ll_get_default_mdsize(sbi, &ealen);
3924 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3927 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3928 0, ealen, LUSTRE_OPC_ANY,
3930 if (IS_ERR(op_data))
3931 RETURN(PTR_ERR(op_data));
3933 op_data->op_valid = valid;
3934 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3935 ll_finish_md_op_data(op_data);
3937 rc = ll_inode_revalidate_fini(inode, rc);
3941 rc = ll_prep_inode(&inode, req, NULL, NULL);
3944 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge the attributes of all stripes (via
 * md_merge_attr()) into this master inode: nlink, blocks, size and the
 * cached a/m/ctime in the llite inode info.
 */
3948 static int ll_merge_md_attr(struct inode *inode)
3950 struct cl_attr attr = { 0 };
3953 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3954 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3955 &attr, ll_md_blocking_ast);
3959 set_nlink(inode, attr.cat_nlink);
3960 inode->i_blocks = attr.cat_blocks;
3961 i_size_write(inode, attr.cat_size);
3963 ll_i2info(inode)->lli_atime = attr.cat_atime;
3964 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
3965 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/*
 * Full revalidation wrapper: refresh MDS metadata via
 * __ll_inode_revalidate(), then bring size/timestamps up to date —
 * striped directories merge stripe attributes, other non-regular files
 * copy the cached lli_* times into the inode, and regular files glimpse
 * the size from the OSTs unless an HSM restore is in progress.
 */
3971 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3973 struct inode *inode = dentry->d_inode;
3977 rc = __ll_inode_revalidate(dentry, ibits);
3981 /* if object isn't regular file, don't validate size */
3982 if (!S_ISREG(inode->i_mode)) {
3983 if (S_ISDIR(inode->i_mode) &&
3984 ll_i2info(inode)->lli_lsm_md != NULL) {
3985 rc = ll_merge_md_attr(inode);
3990 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
3991 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
3992 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
3994 /* In case of restore, the MDT has the right size and has
3995 * already send it back without granting the layout lock,
3996 * inode is up-to-date so glimpse is useless.
3997 * Also to glimpse we need the layout, in case of a running
3998 * restore the MDT holds the layout lock so the glimpse will
3999 * block up to the end of restore (getattr will block)
4001 if (!ll_file_test_flag(ll_i2info(inode), LLIF_FILE_RESTORING))
4002 rc = ll_glimpse_size(inode);
/*
 * Squeeze a device number into an old-style (8-bit major/minor) value
 * for 32-bit compat stat syscalls.
 */
4007 static inline dev_t ll_compat_encode_dev(dev_t dev)
4009 /* The compat_sys_*stat*() syscalls will fail unless the
4010 * device majors and minors are both less than 256. Note that
4011 * the value returned here will be passed through
4012 * old_encode_dev() in cp_compat_stat(). And so we are not
4013 * trying to return a valid compat (u16) device number, just
4014 * one that will pass the old_valid_dev() check. */
4016 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
/*
 * ->getattr() for Lustre files.  Two prototypes are compiled depending
 * on the kernel: the modern path-based variant (HAVE_INODEOPS_ENHANCED_
 * GETATTR) and the legacy (vfsmount, dentry) variant.  Revalidates
 * LOOKUP|UPDATE metadata from the MDS, then fills *stat from the inode.
 */
4019 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4020 int ll_getattr(const struct path *path, struct kstat *stat,
4021 u32 request_mask, unsigned int flags)
4024 struct dentry *de = path->dentry;
4026 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4029 struct inode *inode = de->d_inode;
4030 struct ll_sb_info *sbi = ll_i2sbi(inode);
4031 struct ll_inode_info *lli = ll_i2info(inode);
4034 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
4035 MDS_INODELOCK_LOOKUP);
4036 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
/* fault-injection hook for testing delayed getattr */
4041 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
/* 32-bit userspace needs an inode number and device that fit the
 * old compat stat structures */
4043 if (ll_need_32bit_api(sbi)) {
4044 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4045 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4046 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4048 stat->ino = inode->i_ino;
4049 stat->dev = inode->i_sb->s_dev;
4050 stat->rdev = inode->i_rdev;
4053 stat->mode = inode->i_mode;
4054 stat->uid = inode->i_uid;
4055 stat->gid = inode->i_gid;
4056 stat->atime = inode->i_atime;
4057 stat->mtime = inode->i_mtime;
4058 stat->ctime = inode->i_ctime;
/* prefer the tunable stat blocksize if the admin configured one */
4059 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4061 stat->nlink = inode->i_nlink;
4062 stat->size = i_size_read(inode);
4063 stat->blocks = inode->i_blocks;
/*
 * ->fiemap() handler: translate the kernel's fiemap_extent_info into an
 * on-the-wire struct fiemap, run ll_do_fiemap(), and copy the mapped
 * extents back to the userspace buffer referenced by fieinfo.
 */
4068 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4069 __u64 start, __u64 len)
4073 struct fiemap *fiemap;
4074 unsigned int extent_count = fieinfo->fi_extents_max;
/* allocate header plus room for all requested extents */
4076 num_bytes = sizeof(*fiemap) + (extent_count *
4077 sizeof(struct fiemap_extent));
4078 OBD_ALLOC_LARGE(fiemap, num_bytes);
4083 fiemap->fm_flags = fieinfo->fi_flags;
4084 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4085 fiemap->fm_start = start;
4086 fiemap->fm_length = len;
/* only the first user extent is copied in: it may carry continuation
 * state (e.g. fe_device) for a resumed FIEMAP call */
4087 if (extent_count > 0 &&
4088 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4089 sizeof(struct fiemap_extent)) != 0)
4090 GOTO(out, rc = -EFAULT);
4092 rc = ll_do_fiemap(inode, fiemap, num_bytes);
/* mirror results back into the VFS structure and the user buffer */
4094 fieinfo->fi_flags = fiemap->fm_flags;
4095 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4096 if (extent_count > 0 &&
4097 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4098 fiemap->fm_mapped_extents *
4099 sizeof(struct fiemap_extent)) != 0)
4100 GOTO(out, rc = -EFAULT);
4102 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * ->get_acl(): return a referenced copy of the cached POSIX ACL.
 * The lli_lock spinlock protects lli_posix_acl against concurrent
 * update; the caller (VFS permission checking) drops the reference.
 */
4106 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4108 struct ll_inode_info *lli = ll_i2info(inode);
4109 struct posix_acl *acl = NULL;
4112 spin_lock(&lli->lli_lock);
4113 /* VFS' acl_permission_check->check_acl will release the refcount */
4114 acl = posix_acl_dup(lli->lli_posix_acl);
4115 spin_unlock(&lli->lli_lock);
4120 #ifdef HAVE_IOP_SET_ACL
4121 #ifdef CONFIG_FS_POSIX_ACL
/*
 * ->set_acl(): store a POSIX ACL as the corresponding system xattr on
 * the MDS and refresh the local ACL cache.  ACL_TYPE_ACCESS may also
 * rewrite the file mode via posix_acl_update_mode(); ACL_TYPE_DEFAULT
 * is only meaningful on directories.
 */
4122 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4124 const char *name = NULL;
4131 case ACL_TYPE_ACCESS:
/* may clear the ACL pointer and fold it into i_mode instead */
4133 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4137 name = XATTR_NAME_POSIX_ACL_ACCESS;
4139 case ACL_TYPE_DEFAULT:
4140 if (!S_ISDIR(inode->i_mode))
4141 GOTO(out, rc = acl ? -EACCES : 0);
4142 name = XATTR_NAME_POSIX_ACL_DEFAULT;
4145 GOTO(out, rc = -EINVAL);
/* serialize the ACL into xattr format for the wire */
4149 size = posix_acl_xattr_size(acl->a_count);
4150 value = kmalloc(size, GFP_NOFS);
4152 GOTO(out, rc = -ENOMEM);
4154 rc = posix_acl_to_xattr(&init_user_ns, acl, value, size);
4159 /* dentry is only used for *.lov attributes so it's safe to be NULL */
4160 rc = __vfs_setxattr(NULL, inode, name, value, size, XATTR_CREATE);
/* keep the in-memory ACL cache coherent with what was stored */
4165 set_cached_acl(inode, type, acl);
4167 forget_cached_acl(inode, type);
4170 #endif /* CONFIG_FS_POSIX_ACL */
4171 #endif /* HAVE_IOP_SET_ACL */
/*
 * ACL callback for generic_permission() on older kernels (those without
 * the 2-argument generic_permission()).  Checks @mask against the
 * inode's access ACL; compiled out when CONFIG_FS_POSIX_ACL is off.
 */
4173 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4175 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4176 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4178 ll_check_acl(struct inode *inode, int mask)
4181 # ifdef CONFIG_FS_POSIX_ACL
4182 struct posix_acl *acl;
/* cannot take the spinlock/refcount path under RCU walk; bail out */
4186 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4187 if (flags & IPERM_FLAG_RCU)
4190 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4195 rc = posix_acl_permission(inode, acl, mask);
4196 posix_acl_release(acl);
4199 # else /* !CONFIG_FS_POSIX_ACL */
4201 # endif /* CONFIG_FS_POSIX_ACL */
4203 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * ->permission() for Lustre inodes.  Three prototypes cover the kernel
 * API variants.  Beyond the generic permission check this: (1) forces
 * revalidation of the root inode, which lookup never validates; and
 * (2) applies root squash by temporarily overriding the task's
 * fsuid/fsgid and dropping filesystem capabilities when a root process
 * accesses a squashed mount.
 */
4205 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4206 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4208 # ifdef HAVE_INODE_PERMISION_2ARGS
4209 int ll_inode_permission(struct inode *inode, int mask)
4211 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4216 struct ll_sb_info *sbi;
4217 struct root_squash_info *squash;
4218 struct cred *cred = NULL;
4219 const struct cred *old_cred = NULL;
4221 bool squash_id = false;
/* revalidation below may block; refuse RCU-walk mode and let the VFS
 * retry in ref-walk mode */
4224 #ifdef MAY_NOT_BLOCK
4225 if (mask & MAY_NOT_BLOCK)
4227 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4228 if (flags & IPERM_FLAG_RCU)
4232 /* as root inode are NOT getting validated in lookup operation,
4233 * need to do it before permission check. */
4235 if (inode == inode->i_sb->s_root->d_inode) {
4236 rc = __ll_inode_revalidate(inode->i_sb->s_root,
4237 MDS_INODELOCK_LOOKUP);
4242 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4243 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4245 /* squash fsuid/fsgid if needed */
4246 sbi = ll_i2sbi(inode);
4247 squash = &sbi->ll_squash;
4248 if (unlikely(squash->rsi_uid != 0 &&
4249 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4250 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4254 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4255 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4256 squash->rsi_uid, squash->rsi_gid);
4258 /* update current process's credentials
4259 * and FS capability */
4260 cred = prepare_creds();
4264 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4265 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* drop every filesystem-related capability bit from the squashed creds */
4266 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4267 if ((1 << cap) & CFS_CAP_FS_MASK)
4268 cap_lower(cred->cap_effective, cap);
4270 old_cred = override_creds(cred);
4273 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4274 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4275 /* restore current process's credentials and FS capability */
4277 revert_creds(old_cred);
4284 /* -o localflock - only provides locally consistent flock locks */
/* Default file operations: no .flock/.lock methods, so the VFS falls
 * back to local (single-client) flock semantics. */
4285 struct file_operations ll_file_operations = {
4286 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4287 # ifdef HAVE_SYNC_READ_WRITE
4288 .read = new_sync_read,
4289 .write = new_sync_write,
4291 .read_iter = ll_file_read_iter,
4292 .write_iter = ll_file_write_iter,
4293 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4294 .read = ll_file_read,
4295 .aio_read = ll_file_aio_read,
4296 .write = ll_file_write,
4297 .aio_write = ll_file_aio_write,
4298 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4299 .unlocked_ioctl = ll_file_ioctl,
4300 .open = ll_file_open,
4301 .release = ll_file_release,
4302 .mmap = ll_file_mmap,
4303 .llseek = ll_file_seek,
4304 .splice_read = ll_file_splice_read,
/* File operations for -o flock mounts: identical to the default table
 * except .flock/.lock route through ll_file_flock for cluster-wide
 * coherent locking. */
4309 struct file_operations ll_file_operations_flock = {
4310 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4311 # ifdef HAVE_SYNC_READ_WRITE
4312 .read = new_sync_read,
4313 .write = new_sync_write,
4314 # endif /* HAVE_SYNC_READ_WRITE */
4315 .read_iter = ll_file_read_iter,
4316 .write_iter = ll_file_write_iter,
4317 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4318 .read = ll_file_read,
4319 .aio_read = ll_file_aio_read,
4320 .write = ll_file_write,
4321 .aio_write = ll_file_aio_write,
4322 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4323 .unlocked_ioctl = ll_file_ioctl,
4324 .open = ll_file_open,
4325 .release = ll_file_release,
4326 .mmap = ll_file_mmap,
4327 .llseek = ll_file_seek,
4328 .splice_read = ll_file_splice_read,
4331 .flock = ll_file_flock,
4332 .lock = ll_file_flock
4335 /* These are for -o noflock - to return ENOSYS on flock calls */
4336 struct file_operations ll_file_operations_noflock = {
4337 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4338 # ifdef HAVE_SYNC_READ_WRITE
4339 .read = new_sync_read,
4340 .write = new_sync_write,
4341 # endif /* HAVE_SYNC_READ_WRITE */
4342 .read_iter = ll_file_read_iter,
4343 .write_iter = ll_file_write_iter,
4344 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4345 .read = ll_file_read,
4346 .aio_read = ll_file_aio_read,
4347 .write = ll_file_write,
4348 .aio_write = ll_file_aio_write,
4349 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4350 .unlocked_ioctl = ll_file_ioctl,
4351 .open = ll_file_open,
4352 .release = ll_file_release,
4353 .mmap = ll_file_mmap,
4354 .llseek = ll_file_seek,
4355 .splice_read = ll_file_splice_read,
/* ll_file_noflock rejects the request so applications fail fast
 * instead of silently getting non-coherent locks */
4358 .flock = ll_file_noflock,
4359 .lock = ll_file_noflock
/* Inode operations for regular Lustre files; xattr and ACL entries are
 * conditional on the kernel providing the corresponding iop hooks. */
4362 struct inode_operations ll_file_inode_operations = {
4363 .setattr = ll_setattr,
4364 .getattr = ll_getattr,
4365 .permission = ll_inode_permission,
4366 #ifdef HAVE_IOP_XATTR
4367 .setxattr = ll_setxattr,
4368 .getxattr = ll_getxattr,
4369 .removexattr = ll_removexattr,
4371 .listxattr = ll_listxattr,
4372 .fiemap = ll_fiemap,
4373 #ifdef HAVE_IOP_GET_ACL
4374 .get_acl = ll_get_acl,
4376 #ifdef HAVE_IOP_SET_ACL
4377 .set_acl = ll_set_acl,
/*
 * Push a layout configuration (@conf) down to the cl_object stack for
 * @inode via cl_conf_set().  For OBJECT_CONF_SET, additionally allow
 * the layout DLM lock to be matched and record the new layout
 * generation in the inode.
 */
4381 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4383 struct ll_inode_info *lli = ll_i2info(inode);
4384 struct cl_object *obj = lli->lli_clob;
4393 env = cl_env_get(&refcheck);
4395 RETURN(PTR_ERR(env));
4397 rc = cl_conf_set(env, lli->lli_clob, conf);
4401 if (conf->coc_opc == OBJECT_CONF_SET) {
4402 struct ldlm_lock *lock = conf->coc_lock;
4403 struct cl_layout cl = {
4407 LASSERT(lock != NULL);
4408 LASSERT(ldlm_has_layout(lock));
4410 /* it can only be allowed to match after layout is
4411 * applied to inode otherwise false layout would be
4412 * seen. Applying layout should happen before dropping
4413 * the intent lock. */
4414 ldlm_lock_allow_match(lock);
/* read back the layout generation just installed and cache it */
4416 rc = cl_object_layout_get(env, obj, &cl);
4421 DFID": layout version change: %u -> %u\n",
4422 PFID(&lli->lli_fid), ll_layout_version_get(lli),
4424 ll_layout_version_set(lli, cl.cl_layout_gen);
4428 cl_env_put(env, &refcheck);
4433 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
/*
 * Populate @lock->l_lvb_data with the file's LOV layout.  When the
 * layout lock was granted via a blocked/completion-AST path the LVB
 * buffer was too small to carry the layout, so it is re-fetched here
 * with a trusted.lov getxattr RPC and attached to the lock under the
 * resource lock (another thread may race to attach it first, in which
 * case our copy is freed).
 */
4434 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4437 struct ll_sb_info *sbi = ll_i2sbi(inode);
4438 struct ptlrpc_request *req;
4439 struct mdt_body *body;
4446 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4447 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4448 lock->l_lvb_data, lock->l_lvb_len);
/* layout already attached to the lock; nothing to fetch */
4450 if (lock->l_lvb_data != NULL)
4453 /* if layout lock was granted right away, the layout is returned
4454 * within DLM_LVB of dlm reply; otherwise if the lock was ever
4455 * blocked and then granted via completion ast, we have to fetch
4456 * layout here. Please note that we can't use the LVB buffer in
4457 * completion AST because it doesn't have a large enough buffer */
4458 rc = ll_get_default_mdsize(sbi, &lmmsize);
4460 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4461 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
4466 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4468 GOTO(out, rc = -EPROTO);
4470 lmmsize = body->mbo_eadatasize;
4471 if (lmmsize == 0) /* empty layout */
4474 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4476 GOTO(out, rc = -EFAULT);
/* copy out of the RPC reply buffer; the lock outlives the request */
4478 OBD_ALLOC_LARGE(lvbdata, lmmsize);
4479 if (lvbdata == NULL)
4480 GOTO(out, rc = -ENOMEM);
4482 memcpy(lvbdata, lmm, lmmsize);
4483 lock_res_and_lock(lock);
4484 if (unlikely(lock->l_lvb_data == NULL)) {
4485 lock->l_lvb_type = LVB_T_LAYOUT;
4486 lock->l_lvb_data = lvbdata;
4487 lock->l_lvb_len = lmmsize;
4490 unlock_res_and_lock(lock);
/* lost the race: someone else attached an LVB; drop our copy */
4493 OBD_FREE_LARGE(lvbdata, lmmsize);
4498 ptlrpc_req_finished(req);
4503 * Apply the layout to the inode. Layout lock is held and will be released
/*
 * Take the layout carried in @lockh's LVB and configure it into the
 * inode's cl_object.  The layout lock reference (mode @mode) is dropped
 * before returning.  If the object is still busy with IO using the old
 * layout (cl_conf_set returns -EBUSY), wait for that IO to drain via an
 * OBJECT_CONF_WAIT pass; the elided tail presumably retries -- confirm
 * against the full source.
 */
4506 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4507 struct inode *inode)
4509 struct ll_inode_info *lli = ll_i2info(inode);
4510 struct ll_sb_info *sbi = ll_i2sbi(inode);
4511 struct ldlm_lock *lock;
4512 struct cl_object_conf conf;
4515 bool wait_layout = false;
4518 LASSERT(lustre_handle_is_used(lockh));
4520 lock = ldlm_handle2lock(lockh);
4521 LASSERT(lock != NULL);
4522 LASSERT(ldlm_has_layout(lock));
4524 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4525 PFID(&lli->lli_fid), inode);
4527 /* in case this is a caching lock and reinstate with new inode */
4528 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
4530 lock_res_and_lock(lock);
4531 lvb_ready = ldlm_is_lvb_ready(lock);
4532 unlock_res_and_lock(lock);
4534 /* checking lvb_ready is racy but this is okay. The worst case is
4535 * that multi processes may configure the file on the same time. */
/* make sure the layout blob is attached to the lock before using it */
4539 rc = ll_layout_fetch(inode, lock);
4543 /* for layout lock, lmm is stored in lock's lvb.
4544 * lvb_data is immutable if the lock is held so it's safe to access it
4547 * set layout to file. Unlikely this will fail as old layout was
4548 * surely eliminated */
4549 memset(&conf, 0, sizeof conf);
4550 conf.coc_opc = OBJECT_CONF_SET;
4551 conf.coc_inode = inode;
4552 conf.coc_lock = lock;
4553 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
4554 conf.u.coc_layout.lb_len = lock->l_lvb_len;
4555 rc = ll_layout_conf(inode, &conf);
4557 /* refresh layout failed, need to wait */
4558 wait_layout = rc == -EBUSY;
4561 LDLM_LOCK_PUT(lock);
4562 ldlm_lock_decref(lockh, mode);
4564 /* wait for IO to complete if it's still being used. */
4566 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
4567 ll_get_fsname(inode->i_sb, NULL, 0),
4568 PFID(&lli->lli_fid), inode);
4570 memset(&conf, 0, sizeof conf);
4571 conf.coc_opc = OBJECT_CONF_WAIT;
4572 conf.coc_inode = inode;
4573 rc = ll_layout_conf(inode, &conf);
4577 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4578 ll_get_fsname(inode->i_sb, NULL, 0),
4579 PFID(&lli->lli_fid), rc);
4585 * Issue layout intent RPC to MDS.
4586 * \param inode [in] file inode
4587 * \param intent [in] layout intent
4589 * \retval 0 on success
4590 * \retval < 0 error code
4592 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
4594 struct ll_inode_info *lli = ll_i2info(inode);
4595 struct ll_sb_info *sbi = ll_i2sbi(inode);
4596 struct md_op_data *op_data;
4597 struct lookup_intent it;
4598 struct ptlrpc_request *req;
4602 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4603 0, 0, LUSTRE_OPC_ANY, NULL);
4604 if (IS_ERR(op_data))
4605 RETURN(PTR_ERR(op_data));
/* the layout intent travels as opaque op_data on the intent RPC */
4607 op_data->op_data = intent;
4608 op_data->op_data_size = sizeof(*intent);
4610 memset(&it, 0, sizeof(it));
4611 it.it_op = IT_LAYOUT;
/* write/truncate intents need a write-mode layout lock */
4612 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
4613 intent->li_opc == LAYOUT_INTENT_TRUNC)
4614 it.it_flags = FMODE_WRITE;
4616 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4617 ll_get_fsname(inode->i_sb, NULL, 0),
4618 PFID(&lli->lli_fid), inode);
4620 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
4621 &ll_md_blocking_ast, 0);
4622 if (it.it_request != NULL)
4623 ptlrpc_req_finished(it.it_request);
4624 it.it_request = NULL;
4626 ll_finish_md_op_data(op_data);
4628 /* set lock data in case this is a new lock */
4630 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4632 ll_intent_drop_lock(&it);
4638 * This function checks if there exists a LAYOUT lock on the client side,
4639 * or enqueues it if it doesn't have one in cache.
4641 * This function will not hold layout lock so it may be revoked any time after
4642 * this function returns. Any operations depend on layout should be redone
4645 * This function should be called before lov_io_init() to get an uptodate
4646 * layout version, the caller should save the version number and after IO
4647 * is finished, this function should be called again to verify that layout
4648 * is not changed during IO time.
4650 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4652 struct ll_inode_info *lli = ll_i2info(inode);
4653 struct ll_sb_info *sbi = ll_i2sbi(inode);
4654 struct lustre_handle lockh;
4655 struct layout_intent intent = {
4656 .li_opc = LAYOUT_INTENT_ACCESS,
4658 enum ldlm_mode mode;
/* fast path: layout lock disabled or generation already valid */
4662 *gen = ll_layout_version_get(lli);
4663 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
4667 LASSERT(fid_is_sane(ll_inode2fid(inode)));
4668 LASSERT(S_ISREG(inode->i_mode));
4670 /* take layout lock mutex to enqueue layout lock exclusively. */
4671 mutex_lock(&lli->lli_layout_mutex);
4674 /* mostly layout lock is caching on the local side, so try to
4675 * match it before grabbing layout lock mutex. */
4676 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4677 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
4678 if (mode != 0) { /* hit cached lock */
4679 rc = ll_layout_lock_set(&lockh, mode, inode);
/* no cached lock: enqueue one via a layout intent RPC */
4685 rc = ll_layout_intent(inode, &intent);
/* report the (possibly updated) layout generation to the caller */
4691 *gen = ll_layout_version_get(lli);
4692 mutex_unlock(&lli->lli_layout_mutex);
4698 * Issue layout intent RPC indicating where in a file an IO is about to write.
4700 * \param[in] inode file inode.
4701 * \param[in] start start offset of file in bytes where an IO is about to
4703 * \param[in] end exclusive end offset in bytes of the write range.
4705 * \retval 0 on success
4706 * \retval < 0 error code
4708 int ll_layout_write_intent(struct inode *inode, __u64 start, __u64 end)
4710 struct layout_intent intent = {
4711 .li_opc = LAYOUT_INTENT_WRITE,
/* forward the write range to the MDS so it can instantiate the
 * needed layout components */
4718 rc = ll_layout_intent(inode, &intent);
4724 * This function sends a restore request to the MDT
4726 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4728 struct hsm_user_request *hur;
4732 len = sizeof(struct hsm_user_request) +
4733 sizeof(struct hsm_user_item);
4734 OBD_ALLOC(hur, len);
4738 hur->hur_request.hr_action = HUA_RESTORE;
4739 hur->hur_request.hr_archive_id = 0;
4740 hur->hur_request.hr_flags = 0;
4741 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4742 sizeof(hur->hur_user_item[0].hui_fid));
4743 hur->hur_user_item[0].hui_extent.offset = offset;
4744 hur->hur_user_item[0].hui_extent.length = length;
4745 hur->hur_request.hr_itemcount = 1;
4746 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,