4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2016, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
49 #include <uapi/linux/lustre/lustre_ioctl.h>
50 #include <lustre_swab.h>
52 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
57 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
59 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
/* Allocate a per-open ll_file_data from its slab cache.
 * GFP_NOFS avoids filesystem re-entry during reclaim.
 * NOTE(review): this view elides lines (orig 63, 65, 67-69, 71+) — the
 * allocation-failure check and the return statement are not visible here. */
62 static struct ll_file_data *ll_file_data_get(void)
64 struct ll_file_data *fd;
66 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
/* Start each descriptor with a clean write-failure state. */
70 fd->fd_write_failed = false;
/* Release a ll_file_data back to the slab cache; counterpart of
 * ll_file_data_get(). */
75 static void ll_file_data_put(struct ll_file_data *fd)
78 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
82 * Packs all the attributes into @op_data for the CLOSE rpc.
84 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
85 struct obd_client_handle *och)
/* Snapshot the in-core inode attributes (mode, times, size, blocks,
 * flags) into op_data so the MDT receives them with the close. */
89 ll_prep_md_op_data(op_data, inode, NULL, NULL,
90 0, 0, LUSTRE_OPC_ANY, NULL);
92 op_data->op_attr.ia_mode = inode->i_mode;
93 op_data->op_attr.ia_atime = inode->i_atime;
94 op_data->op_attr.ia_mtime = inode->i_mtime;
95 op_data->op_attr.ia_ctime = inode->i_ctime;
96 op_data->op_attr.ia_size = i_size_read(inode);
97 op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
98 ATTR_MTIME | ATTR_MTIME_SET |
99 ATTR_CTIME | ATTR_CTIME_SET;
100 op_data->op_attr_blocks = inode->i_blocks;
101 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
/* Identify which open handle is being closed on the MDT. */
102 op_data->op_handle = och->och_fh;
104 if (och->och_flags & FMODE_WRITE &&
105 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
106 /* For HSM: if inode data has been modified, pack it so that
107 * MDT can set data dirty flag in the archive. */
108 op_data->op_bias |= MDS_DATA_MODIFIED;
114 * Perform a close, possibly with a bias.
115 * The meaning of "data" depends on the value of "bias".
117 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
118 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
121 static int ll_close_inode_openhandle(struct inode *inode,
122 struct obd_client_handle *och,
123 enum mds_op_bias bias, void *data)
125 struct obd_export *md_exp = ll_i2mdexp(inode);
126 const struct ll_inode_info *lli = ll_i2info(inode);
127 struct md_op_data *op_data;
128 struct ptlrpc_request *req = NULL;
/* Sanity: the MDC export must still be connected to issue the close. */
132 if (class_exp2obd(md_exp) == NULL) {
133 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
134 ll_get_fsname(inode->i_sb, NULL, 0),
135 PFID(&lli->lli_fid));
139 OBD_ALLOC_PTR(op_data);
140 /* We leak openhandle and request here on error, but not much to be
141 * done in OOM case since app won't retry close on error either. */
143 GOTO(out, rc = -ENOMEM);
145 ll_prepare_close(inode, op_data, och);
/* NOTE(review): the switch(bias) header (orig line 146) is elided from
 * this view; the cases below dispatch on the close bias. */
147 case MDS_CLOSE_LAYOUT_SWAP:
148 LASSERT(data != NULL);
149 op_data->op_bias |= MDS_CLOSE_LAYOUT_SWAP;
150 op_data->op_data_version = 0;
151 op_data->op_lease_handle = och->och_lease_handle;
/* data is the second inode whose layout we swap with. */
152 op_data->op_fid2 = *ll_inode2fid(data);
155 case MDS_HSM_RELEASE:
156 LASSERT(data != NULL);
157 op_data->op_bias |= MDS_HSM_RELEASE;
/* data is the data version the MDT must verify before releasing. */
158 op_data->op_data_version = *(__u64 *)data;
159 op_data->op_lease_handle = och->och_lease_handle;
160 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
164 LASSERT(data == NULL);
168 rc = md_close(md_exp, op_data, och->och_mod, &req);
/* -EINTR is an expected interruption; don't spam the log for it. */
169 if (rc != 0 && rc != -EINTR)
170 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
171 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
/* For biased closes, check whether the server actually executed the
 * close intent (HSM release / layout swap). */
174 op_data->op_bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)) {
175 struct mdt_body *body;
177 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
178 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
182 ll_finish_md_op_data(op_data);
/* Drop replay data and poison the file handle so reuse is caught. */
186 md_clear_open_replay_data(md_exp, och);
187 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
190 ptlrpc_req_finished(req); /* This is close request */
/* Really close the MDS open handle of the given mode (read/write/exec)
 * once its last local user is gone. Returns 0 or a negative errno from
 * the close RPC. */
194 int ll_md_real_close(struct inode *inode, fmode_t fmode)
196 struct ll_inode_info *lli = ll_i2info(inode);
197 struct obd_client_handle **och_p;
198 struct obd_client_handle *och;
/* Select the per-inode handle slot and use count matching fmode. */
203 if (fmode & FMODE_WRITE) {
204 och_p = &lli->lli_mds_write_och;
205 och_usecount = &lli->lli_open_fd_write_count;
206 } else if (fmode & FMODE_EXEC) {
207 och_p = &lli->lli_mds_exec_och;
208 och_usecount = &lli->lli_open_fd_exec_count;
210 LASSERT(fmode & FMODE_READ);
211 och_p = &lli->lli_mds_read_och;
212 och_usecount = &lli->lli_open_fd_read_count;
215 mutex_lock(&lli->lli_och_mutex);
216 if (*och_usecount > 0) {
217 /* There are still users of this handle, so skip
219 mutex_unlock(&lli->lli_och_mutex);
/* NOTE(review): lines elided here (orig 220-224) — presumably the handle
 * is detached from the inode under the mutex before closing; confirm
 * against the full source. */
225 mutex_unlock(&lli->lli_och_mutex);
228 /* There might be a race and this handle may already
230 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/* Per-descriptor close: release group lock, lease and private open
 * handle if present, drop the use count for this descriptor's open
 * mode, and only talk to the MDS when no cached OPEN lock covers us. */
236 static int ll_md_close(struct inode *inode, struct file *file)
238 union ldlm_policy_data policy = {
239 .l_inodebits = { MDS_INODELOCK_OPEN },
/* TEST_LOCK: only probe for a matching granted lock, don't take a ref. */
241 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
242 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
243 struct ll_inode_info *lli = ll_i2info(inode);
244 struct lustre_handle lockh;
245 enum ldlm_mode lockmode;
249 /* clear group lock, if present */
250 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
251 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
253 if (fd->fd_lease_och != NULL) {
256 /* Usually the lease is not released when the
257 * application crashed, we need to release here. */
258 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
259 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
260 PFID(&lli->lli_fid), rc, lease_broken);
262 fd->fd_lease_och = NULL;
/* fd_och holds an open handle private to this descriptor (taken over
 * when a lease was set up); close it directly. */
265 if (fd->fd_och != NULL) {
266 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
271 /* Let's see if we have good enough OPEN lock on the file and if
272 we can skip talking to MDS */
273 mutex_lock(&lli->lli_och_mutex);
274 if (fd->fd_omode & FMODE_WRITE) {
/* NOTE(review): elided lines likely set lockmode per branch (PW/PR);
 * not visible in this view. */
276 LASSERT(lli->lli_open_fd_write_count);
277 lli->lli_open_fd_write_count--;
278 } else if (fd->fd_omode & FMODE_EXEC) {
280 LASSERT(lli->lli_open_fd_exec_count);
281 lli->lli_open_fd_exec_count--;
284 LASSERT(lli->lli_open_fd_read_count);
285 lli->lli_open_fd_read_count--;
287 mutex_unlock(&lli->lli_och_mutex);
/* No cached OPEN ibits lock -> must send the real close to the MDS. */
289 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
290 LDLM_IBITS, &policy, lockmode, &lockh))
291 rc = ll_md_real_close(inode, fd->fd_omode);
294 LUSTRE_FPRIVATE(file) = NULL;
295 ll_file_data_put(fd);
300 /* While this returns an error code, fput() the caller does not, so we need
301 * to make every effort to clean up all of our state here. Also, applications
302 * rarely check close errors and even if an error is returned they will not
303 * re-try the close call.
305 int ll_file_release(struct inode *inode, struct file *file)
307 struct ll_file_data *fd;
308 struct ll_sb_info *sbi = ll_i2sbi(inode);
309 struct ll_inode_info *lli = ll_i2info(inode);
313 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
314 PFID(ll_inode2fid(inode)), inode);
/* Don't count releases of the filesystem root in the stats. */
316 if (inode->i_sb->s_root != file_dentry(file))
317 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
318 fd = LUSTRE_FPRIVATE(file);
321 /* The last ref on @file, maybe not the the owner pid of statahead,
322 * because parent and child process can share the same file handle. */
323 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
324 ll_deauthorize_statahead(inode, fd);
/* The root dentry never had an MDS open handle: just free fd and exit. */
326 if (inode->i_sb->s_root == file_dentry(file)) {
327 LUSTRE_FPRIVATE(file) = NULL;
328 ll_file_data_put(fd);
/* Fold any async write errors recorded on the cl_object into
 * lli_async_rc so they are reported to this close. */
332 if (!S_ISDIR(inode->i_mode)) {
333 if (lli->lli_clob != NULL)
334 lov_read_and_clear_async_rc(lli->lli_clob);
335 lli->lli_async_rc = 0;
338 rc = ll_md_close(inode, file);
/* Debug hook: dump logs when the fail-timeout fault is armed. */
340 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
341 libcfs_debug_dumplog();
/* Send an intent-OPEN enqueue to the MDS for @de (open-by-FID path) and
 * update the local inode from the reply. @lmm/@lmmsize optionally carry
 * a striping request. Returns 0 or negative errno; maps ENOENT on
 * IT_CREAT to -ESTALE so the VFS retries with LOOKUP_REVAL. */
346 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
347 struct lookup_intent *itp)
349 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
350 struct dentry *parent = de->d_parent;
351 const char *name = NULL;
353 struct md_op_data *op_data;
354 struct ptlrpc_request *req = NULL;
358 LASSERT(parent != NULL);
359 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
361 /* if server supports open-by-fid, or file name is invalid, don't pack
362 * name in open request */
363 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
364 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
365 name = de->d_name.name;
366 len = de->d_name.len;
369 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
370 name, len, 0, LUSTRE_OPC_ANY, NULL);
371 RETURN(PTR_ERR(op_data));
/* NOTE(review): the IS_ERR(op_data) guard (orig line 371) is elided in
 * this view; RETURN above is its error branch. */
373 op_data->op_data = lmm;
374 op_data->op_data_size = lmmsize;
376 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
377 &ll_md_blocking_ast, 0);
378 ll_finish_md_op_data(op_data);
380 /* reason for keep own exit path - don`t flood log
381 * with messages with -ESTALE errors.
383 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
384 it_open_error(DISP_OPEN_OPEN, itp))
386 ll_release_openhandle(de, itp);
390 if (it_disposition(itp, DISP_LOOKUP_NEG))
391 GOTO(out, rc = -ENOENT);
393 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
394 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
395 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* Refresh the inode from the reply and attach the lock data if the
 * enqueue granted us a lock. */
399 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
400 if (!rc && itp->it_lock_mode)
401 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
404 ptlrpc_req_finished(req);
405 ll_intent_drop_lock(itp);
407 /* We did open by fid, but by the time we got to the server,
408 * the object disappeared. If this is a create, we cannot really
409 * tell the userspace that the file it was trying to create
410 * does not exist. Instead let's return -ESTALE, and the VFS will
411 * retry the create with LOOKUP_REVAL that we are going to catch
412 * in ll_revalidate_dentry() and use lookup then.
414 if (rc == -ENOENT && itp->it_op & IT_CREAT)
/* Populate an obd_client_handle from the MDT reply body of an intent
 * open (file handle, FID, lease lock handle, open flags) and register
 * it for open replay. Returns the md_set_open_replay_data() result. */
420 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
421 struct obd_client_handle *och)
423 struct mdt_body *body;
425 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
426 och->och_fh = body->mbo_handle;
427 och->och_fid = body->mbo_fid1;
/* Remember the lock handle so a later lease open can reference it. */
428 och->och_lease_handle.cookie = it->it_lock_handle;
429 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
430 och->och_flags = it->it_flags;
432 return md_set_open_replay_data(md_exp, och, it);
/* Finish the client-local part of an open: optionally fill @och from
 * the intent reply, attach @fd as the file's private data and
 * initialize readahead and cl-context state. */
435 static int ll_local_open(struct file *file, struct lookup_intent *it,
436 struct ll_file_data *fd, struct obd_client_handle *och)
438 struct inode *inode = file_inode(file);
441 LASSERT(!LUSTRE_FPRIVATE(file));
/* NOTE(review): elided lines (orig 442-447) — presumably the och==NULL
 * case is handled around this call; confirm against full source. */
448 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
453 LUSTRE_FPRIVATE(file) = fd;
454 ll_readahead_init(inode, &fd->fd_ras);
/* Keep only the access-mode bits for later close bookkeeping. */
455 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
457 /* ll_cl_context initialize */
458 rwlock_init(&fd->fd_lock);
459 INIT_LIST_HEAD(&fd->fd_lccs);
464 /* Open a file, and (for the very first open) create objects on the OSTs at
465 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
466 * creation or open until ll_lov_setstripe() ioctl is called.
468 * If we already have the stripe MD locally then we don't request it in
469 * md_open(), by passing a lmm_size = 0.
471 * It is up to the application to ensure no other processes open this file
472 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
473 * used. We might be able to avoid races of that sort by getting lli_open_sem
474 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
475 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
477 int ll_file_open(struct inode *inode, struct file *file)
479 struct ll_inode_info *lli = ll_i2info(inode);
480 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
481 .it_flags = file->f_flags };
482 struct obd_client_handle **och_p = NULL;
483 __u64 *och_usecount = NULL;
484 struct ll_file_data *fd;
488 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
489 PFID(ll_inode2fid(inode)), inode, file->f_flags);
/* An intent may have been stashed in private_data by the lookup path. */
491 it = file->private_data; /* XXX: compat macro */
492 file->private_data = NULL; /* prevent ll_local_open assertion */
494 fd = ll_file_data_get();
496 GOTO(out_openerr, rc = -ENOMEM);
499 if (S_ISDIR(inode->i_mode))
500 ll_authorize_statahead(inode, fd);
/* Root of the mount: no MDS open handle needed, just attach fd. */
502 if (inode->i_sb->s_root == file_dentry(file)) {
503 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent from lookup: build our own IT_OPEN (oit). */
507 if (!it || !it->it_disposition) {
508 /* Convert f_flags into access mode. We cannot use file->f_mode,
509 * because everything but O_ACCMODE mask was stripped from
511 if ((oit.it_flags + 1) & O_ACCMODE)
513 if (file->f_flags & O_TRUNC)
514 oit.it_flags |= FMODE_WRITE;
516 /* kernel only call f_op->open in dentry_open. filp_open calls
517 * dentry_open after call to open_namei that checks permissions.
518 * Only nfsd_open call dentry_open directly without checking
519 * permissions and because of that this code below is safe. */
520 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
521 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
523 /* We do not want O_EXCL here, presumably we opened the file
524 * already? XXX - NFS implications? */
525 oit.it_flags &= ~O_EXCL;
527 /* bug20584, if "it_flags" contains O_CREAT, the file will be
528 * created if necessary, then "IT_CREAT" should be set to keep
529 * consistent with it */
530 if (oit.it_flags & O_CREAT)
531 oit.it_op |= IT_CREAT;
537 /* Let's see if we have file open on MDS already. */
538 if (it->it_flags & FMODE_WRITE) {
539 och_p = &lli->lli_mds_write_och;
540 och_usecount = &lli->lli_open_fd_write_count;
541 } else if (it->it_flags & FMODE_EXEC) {
542 och_p = &lli->lli_mds_exec_och;
543 och_usecount = &lli->lli_open_fd_exec_count;
545 och_p = &lli->lli_mds_read_och;
546 och_usecount = &lli->lli_open_fd_read_count;
549 mutex_lock(&lli->lli_och_mutex);
550 if (*och_p) { /* Open handle is present */
551 if (it_disposition(it, DISP_OPEN_OPEN)) {
552 /* Well, there's extra open request that we do not need,
553 let's close it somehow. This will decref request. */
554 rc = it_open_error(DISP_OPEN_OPEN, it);
556 mutex_unlock(&lli->lli_och_mutex);
557 GOTO(out_openerr, rc);
560 ll_release_openhandle(file_dentry(file), it);
/* Reuse the existing MDS open handle for this descriptor. */
564 rc = ll_local_open(file, it, fd, NULL);
567 mutex_unlock(&lli->lli_och_mutex);
568 GOTO(out_openerr, rc);
571 LASSERT(*och_usecount == 0);
572 if (!it->it_disposition) {
573 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
574 /* We cannot just request lock handle now, new ELC code
575 means that one of other OPEN locks for this file
576 could be cancelled, and since blocking ast handler
577 would attempt to grab och_mutex as well, that would
578 result in a deadlock */
579 mutex_unlock(&lli->lli_och_mutex);
581 * Normally called under two situations:
583 * 2. A race/condition on MDS resulting in no open
584 * handle to be returned from LOOKUP|OPEN request,
585 * for example if the target entry was a symlink.
587 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
588 * marked by a bit set in ll_iget_for_nfs. Clear the
589 * bit so that it's not confusing later callers.
591 * NB; when ldd is NULL, it must have come via normal
592 * lookup path only, since ll_iget_for_nfs always calls
595 if (ldd && ldd->lld_nfs_dentry) {
596 ldd->lld_nfs_dentry = 0;
597 it->it_flags |= MDS_OPEN_LOCK;
601 * Always specify MDS_OPEN_BY_FID because we don't want
602 * to get file with different fid.
604 it->it_flags |= MDS_OPEN_BY_FID;
605 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
608 GOTO(out_openerr, rc);
/* NOTE(review): lines elided around here (orig 609-611, incl. a likely
 * "goto restart" / mutex re-lock); confirm against full source. */
612 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
614 GOTO(out_och_free, rc = -ENOMEM);
618 /* md_intent_lock() didn't get a request ref if there was an
619 * open error, so don't do cleanup on the request here
621 /* XXX (green): Should not we bail out on any error here, not
622 * just open error? */
623 rc = it_open_error(DISP_OPEN_OPEN, it);
625 GOTO(out_och_free, rc);
627 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
628 "inode %p: disposition %x, status %d\n", inode,
629 it_disposition(it, ~0), it->it_status);
631 rc = ll_local_open(file, it, fd, *och_p);
633 GOTO(out_och_free, rc);
635 mutex_unlock(&lli->lli_och_mutex);
638 /* Must do this outside lli_och_mutex lock to prevent deadlock where
639 different kind of OPEN lock for this same inode gets cancelled
640 by ldlm_cancel_lru */
641 if (!S_ISREG(inode->i_mode))
642 GOTO(out_och_free, rc);
644 cl_lov_delay_create_clear(&file->f_flags);
645 GOTO(out_och_free, rc);
/* Error/exit path: free a half-initialized open handle, undo
 * statahead authorization and release fd on open errors. */
649 if (och_p && *och_p) {
650 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
651 *och_p = NULL; /* OBD_FREE writes some magic there */
654 mutex_unlock(&lli->lli_och_mutex);
657 if (lli->lli_opendir_key == fd)
658 ll_deauthorize_statahead(inode, fd);
660 ll_file_data_put(fd);
662 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* Drop the request reference taken by the intent enqueue, if any. */
665 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
666 ptlrpc_req_finished(it->it_request);
667 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* Blocking AST for lease locks: on a blocking callback, cancel the
 * lease lock asynchronously (the lease is thereby broken). Unlike
 * ll_md_blocking_ast() this does not manage any open handle. */
673 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
674 struct ldlm_lock_desc *desc, void *data, int flag)
677 struct lustre_handle lockh;
681 case LDLM_CB_BLOCKING:
682 ldlm_lock2handle(lock, &lockh);
683 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
685 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
689 case LDLM_CB_CANCELING:
697 * When setting a lease on a file, we take ownership of the lli_mds_*_och
698 * and save it as fd->fd_och so as to force client to reopen the file even
699 * if it has an open lock in cache already.
701 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
702 struct lustre_handle *old_handle)
704 struct ll_inode_info *lli = ll_i2info(inode);
705 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
706 struct obd_client_handle **och_p;
711 /* Get the openhandle of the file */
712 mutex_lock(&lli->lli_och_mutex);
/* Only one lease per descriptor at a time. */
713 if (fd->fd_lease_och != NULL)
714 GOTO(out_unlock, rc = -EBUSY);
716 if (fd->fd_och == NULL) {
717 if (file->f_mode & FMODE_WRITE) {
718 LASSERT(lli->lli_mds_write_och != NULL);
719 och_p = &lli->lli_mds_write_och;
720 och_usecount = &lli->lli_open_fd_write_count;
722 LASSERT(lli->lli_mds_read_och != NULL);
723 och_p = &lli->lli_mds_read_och;
724 och_usecount = &lli->lli_open_fd_read_count;
/* Can't take exclusive ownership if other descriptors share it. */
727 if (*och_usecount > 1)
728 GOTO(out_unlock, rc = -EBUSY);
/* NOTE(review): elided lines (orig 729-734) presumably transfer *och_p
 * into fd->fd_och here; confirm against full source. */
735 *old_handle = fd->fd_och->och_fh;
739 mutex_unlock(&lli->lli_och_mutex);
744 * Release ownership on lli_mds_*_och when putting back a file lease.
746 static int ll_lease_och_release(struct inode *inode, struct file *file)
748 struct ll_inode_info *lli = ll_i2info(inode);
749 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
750 struct obd_client_handle **och_p;
751 struct obd_client_handle *old_och = NULL;
756 mutex_lock(&lli->lli_och_mutex);
/* Select the per-inode slot this descriptor's handle belongs in. */
757 if (file->f_mode & FMODE_WRITE) {
758 och_p = &lli->lli_mds_write_och;
759 och_usecount = &lli->lli_open_fd_write_count;
761 och_p = &lli->lli_mds_read_och;
762 och_usecount = &lli->lli_open_fd_read_count;
765 /* The file may have been open by another process (broken lease) so
766 * *och_p is not NULL. In this case we should simply increase usecount
769 if (*och_p != NULL) {
770 old_och = fd->fd_och;
/* NOTE(review): elided lines (orig 771-776) presumably either bump
 * *och_usecount or install fd->fd_och into *och_p; confirm. */
777 mutex_unlock(&lli->lli_och_mutex);
/* The superseded handle, if any, is closed outside the mutex. */
780 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
786 * Acquire a lease and open the file.
788 static struct obd_client_handle *
789 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
792 struct lookup_intent it = { .it_op = IT_OPEN };
793 struct ll_sb_info *sbi = ll_i2sbi(inode);
794 struct md_op_data *op_data;
795 struct ptlrpc_request *req = NULL;
796 struct lustre_handle old_handle = { 0 };
797 struct obd_client_handle *och = NULL;
/* Leases are exclusive to plain read or plain write opens. */
802 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
803 RETURN(ERR_PTR(-EINVAL));
806 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
807 RETURN(ERR_PTR(-EPERM));
809 rc = ll_lease_och_acquire(inode, file, &old_handle);
/* NOTE(review): elided lines (orig 810-815) — error check for the
 * acquire above and allocation of och; -ENOMEM below is its failure. */
816 RETURN(ERR_PTR(-ENOMEM));
818 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
819 LUSTRE_OPC_ANY, NULL);
821 GOTO(out, rc = PTR_ERR(op_data));
823 /* To tell the MDT this openhandle is from the same owner */
824 op_data->op_handle = old_handle;
826 it.it_flags = fmode | open_flags;
827 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
828 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
829 &ll_md_blocking_lease_ast,
830 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
831 * it can be cancelled which may mislead applications that the lease is
833 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
834 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
835 * doesn't deal with openhandle, so normal openhandle will be leaked. */
836 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
837 ll_finish_md_op_data(op_data);
838 ptlrpc_req_finished(req);
840 GOTO(out_release_it, rc);
842 if (it_disposition(&it, DISP_LOOKUP_NEG))
843 GOTO(out_release_it, rc = -ENOENT);
845 rc = it_open_error(DISP_OPEN_OPEN, &it);
847 GOTO(out_release_it, rc);
849 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
850 ll_och_fill(sbi->ll_md_exp, &it, och);
/* Servers predating lease support won't set DISP_OPEN_LEASE. */
852 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
853 GOTO(out_close, rc = -EOPNOTSUPP);
855 /* already get lease, handle lease lock */
856 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
857 if (it.it_lock_mode == 0 ||
858 it.it_lock_bits != MDS_INODELOCK_OPEN) {
859 /* open lock must return for lease */
860 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
861 PFID(ll_inode2fid(inode)), it.it_lock_mode,
863 GOTO(out_close, rc = -EPROTO);
866 ll_intent_release(&it);
/* Error unwinding: cancel the open lock, close the handle and release
 * the intent before returning ERR_PTR(rc). */
870 /* Cancel open lock */
871 if (it.it_lock_mode != 0) {
872 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
875 och->och_lease_handle.cookie = 0ULL;
877 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
879 CERROR("%s: error closing file "DFID": %d\n",
880 ll_get_fsname(inode->i_sb, NULL, 0),
881 PFID(&ll_i2info(inode)->lli_fid), rc2);
882 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
884 ll_intent_release(&it);
892 * Check whether a layout swap can be done between two inodes.
894 * \param[in] inode1 First inode to check
895 * \param[in] inode2 Second inode to check
897 * \retval 0 on success, layout swap can be performed between both inodes
898 * \retval negative error code if requirements are not met
900 static int ll_check_swap_layouts_validity(struct inode *inode1,
901 struct inode *inode2)
/* Both must be regular files, writable by the caller, and live on the
 * same filesystem. */
903 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
906 if (inode_permission(inode1, MAY_WRITE) ||
907 inode_permission(inode2, MAY_WRITE))
910 if (inode1->i_sb != inode2->i_sb)
/* Close @och with the MDS_CLOSE_LAYOUT_SWAP bias, atomically swapping
 * layouts between @inode and @inode2 on the MDT. Validates the pair
 * first and rejects a swap of an inode with itself. */
916 static int ll_swap_layouts_close(struct obd_client_handle *och,
917 struct inode *inode, struct inode *inode2)
919 const struct lu_fid *fid1 = ll_inode2fid(inode);
920 const struct lu_fid *fid2;
924 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
925 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
927 rc = ll_check_swap_layouts_validity(inode, inode2);
929 GOTO(out_free_och, rc);
931 /* We now know that inode2 is a lustre inode */
932 fid2 = ll_inode2fid(inode2);
/* Identical FIDs would mean swapping a file with itself. */
934 rc = lu_fid_cmp(fid1, fid2);
936 GOTO(out_free_och, rc = -EINVAL);
938 /* Close the file and swap layouts between inode & inode2.
939 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
940 * because we still need it to pack l_remote_handle to MDT. */
941 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
944 och = NULL; /* freed in ll_close_inode_openhandle() */
954 * Release lease and close the file.
955 * It will check if the lease has ever broken.
957 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
960 struct ldlm_lock *lock;
961 bool cancelled = true;
965 lock = ldlm_handle2lock(&och->och_lease_handle);
/* Inspect the lease lock under its resource lock to learn whether it
 * was already cancelled (i.e. the lease was broken). */
967 lock_res_and_lock(lock);
968 cancelled = ldlm_is_cancel(lock);
969 unlock_res_and_lock(lock);
973 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
974 PFID(&ll_i2info(inode)->lli_fid), cancelled);
/* Still intact: cancel the lease lock ourselves before closing. */
977 ldlm_cli_cancel(&och->och_lease_handle, 0);
979 if (lease_broken != NULL)
980 *lease_broken = cancelled;
982 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/* Merge MDS-provided timestamps with OST-side attributes (size, blocks,
 * a/m/ctime) into the VFS inode, under the inode size lock. */
986 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
988 struct ll_inode_info *lli = ll_i2info(inode);
989 struct cl_object *obj = lli->lli_clob;
990 struct cl_attr *attr = vvp_env_thread_attr(env);
998 ll_inode_size_lock(inode);
1000 /* Merge timestamps the most recently obtained from MDS with
1001 * timestamps obtained from OSTs.
1003 * Do not overwrite atime of inode because it may be refreshed
1004 * by file_accessed() function. If the read was served by cache
1005 * data, there is no RPC to be sent so that atime may not be
1006 * transferred to OSTs at all. MDT only updates atime at close time
1007 * if it's at least 'mdd.*.atime_diff' older.
1008 * All in all, the atime in Lustre does not strictly comply with
1009 * POSIX. Solving this problem needs to send an RPC to MDT for each
1010 * read, this will hurt performance. */
1011 if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
1012 LTIME_S(inode->i_atime) = lli->lli_atime;
1013 lli->lli_update_atime = 0;
1015 LTIME_S(inode->i_mtime) = lli->lli_mtime;
1016 LTIME_S(inode->i_ctime) = lli->lli_ctime;
1018 atime = LTIME_S(inode->i_atime);
1019 mtime = LTIME_S(inode->i_mtime);
1020 ctime = LTIME_S(inode->i_ctime);
/* Fetch the merged OST attributes for the cl_object. */
1022 cl_object_attr_lock(obj);
1023 if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1026 rc = cl_object_attr_get(env, obj, attr);
1027 cl_object_attr_unlock(obj);
/* -ENODATA (no OST objects yet) is not an error for the caller. */
1030 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
/* Keep whichever timestamp is the most recent, MDS or OST. */
1032 if (atime < attr->cat_atime)
1033 atime = attr->cat_atime;
1035 if (ctime < attr->cat_ctime)
1036 ctime = attr->cat_ctime;
1038 if (mtime < attr->cat_mtime)
1039 mtime = attr->cat_mtime;
1041 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1042 PFID(&lli->lli_fid), attr->cat_size);
1044 i_size_write(inode, attr->cat_size);
1045 inode->i_blocks = attr->cat_blocks;
1047 LTIME_S(inode->i_atime) = atime;
1048 LTIME_S(inode->i_mtime) = mtime;
1049 LTIME_S(inode->i_ctime) = ctime;
1052 ll_inode_size_unlock(inode);
/* Decide whether reads through @file should skip atime updates, by the
 * same criteria the VFS uses (O_NOATIME, S_NOATIME, mount flags,
 * nodiratime on directories). */
1057 static bool file_is_noatime(const struct file *file)
1059 const struct vfsmount *mnt = file->f_path.mnt;
1060 const struct inode *inode = file_inode((struct file *)file);
1062 /* Adapted from file_accessed() and touch_atime().*/
1063 if (file->f_flags & O_NOATIME)
1066 if (inode->i_flags & S_NOATIME)
1069 if (IS_NOATIME(inode))
1072 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1075 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1078 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1084 static int ll_file_io_ptask(struct cfs_ptask *ptask);
/* Initialize a cl_io for a read or write on @file: set up the rw
 * sub-state (iocb, ptask callback, nonblock/append/sync bits), choose
 * the lock requirement policy and enable parallel IO when configured. */
1086 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1088 struct inode *inode = file_inode(file);
1089 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1091 memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter));
1092 init_sync_kiocb(&io->u.ci_rw.rw_iocb, file);
1093 io->u.ci_rw.rw_file = file;
1094 io->u.ci_rw.rw_ptask = ll_file_io_ptask;
1095 io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK);
1096 io->ci_lock_no_expand = fd->ll_lock_no_expand;
1098 if (iot == CIT_WRITE) {
1099 io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND);
/* NOTE(review): a third sync condition (orig line 1102) is elided
 * from this view. */
1100 io->u.ci_rw.rw_sync = !!(file->f_flags & O_SYNC ||
1101 file->f_flags & O_DIRECT ||
1104 io->ci_obj = ll_i2info(inode)->lli_clob;
1105 io->ci_lockreq = CILR_MAYBE;
/* O_APPEND needs a mandatory lock to keep the EOF position stable. */
1106 if (ll_file_nolock(file)) {
1107 io->ci_lockreq = CILR_NEVER;
1108 io->ci_no_srvlock = 1;
1109 } else if (file->f_flags & O_APPEND) {
1110 io->ci_lockreq = CILR_MANDATORY;
1112 io->ci_noatime = file_is_noatime(file);
/* Parallel IO is only possible for non-append IO when enabled. */
1113 if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO)
1114 io->ci_pio = !io->u.ci_rw.rw_append;
/* Parallel-task worker: run one slice of a split read/write described
 * by the cl_io_pt in @ptask, accumulating progress into cip_result.
 * Returns 0 when any bytes were transferred, else the IO's rc. */
1119 static int ll_file_io_ptask(struct cfs_ptask *ptask)
1121 struct cl_io_pt *pt = ptask->pt_cbdata;
1122 struct file *file = pt->cip_file;
1125 loff_t pos = pt->cip_pos;
1130 env = cl_env_get(&refcheck);
1132 RETURN(PTR_ERR(env));
1134 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1135 file_dentry(file)->d_name.name,
1136 pt->cip_iot == CIT_READ ? "read" : "write",
1137 pos, pos + pt->cip_count);
1140 io = vvp_env_thread_io(env);
1141 ll_io_init(io, file, pt->cip_iot);
/* Use the iterator/iocb snapshot captured when the task was split. */
1142 io->u.ci_rw.rw_iter = pt->cip_iter;
1143 io->u.ci_rw.rw_iocb = pt->cip_iocb;
1144 io->ci_pio = 0; /* It's already in parallel task */
1146 rc = cl_io_rw_init(env, io, pt->cip_iot, pos,
1147 pt->cip_count - pt->cip_result);
1149 struct vvp_io *vio = vvp_env_io(env);
1151 vio->vui_io_subtype = IO_NORMAL;
1152 vio->vui_fd = LUSTRE_FPRIVATE(file);
1154 ll_cl_add(file, env, io, LCC_RW);
1155 rc = cl_io_loop(env, io);
1156 ll_cl_remove(file, env);
1158 /* cl_io_rw_init() handled IO */
/* Fault-injection hook for testing partial-task failure. */
1162 if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) {
/* Account transferred bytes and advance the saved iterator/iocb so a
 * restart resumes where this pass stopped. */
1168 if (io->ci_nob > 0) {
1169 pt->cip_result += io->ci_nob;
1170 iov_iter_advance(&pt->cip_iter, io->ci_nob);
1172 pt->cip_iocb.ki_pos = pos;
1173 #ifdef HAVE_KIOCB_KI_LEFT
1174 pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result;
1175 #elif defined(HAVE_KI_NBYTES)
1176 pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result;
1180 cl_io_fini(env, io);
/* Restart the slice if the IO layer asked for it and work remains. */
1182 if ((rc == 0 || rc == -ENODATA) &&
1183 pt->cip_result < pt->cip_count &&
1184 io->ci_need_restart) {
1186 "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1187 file_dentry(file)->d_name.name,
1188 pt->cip_iot == CIT_READ ? "read" : "write",
1189 pos, pos + pt->cip_count - pt->cip_result,
1190 pt->cip_result, rc);
1194 CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
1195 file_dentry(file)->d_name.name,
1196 pt->cip_iot == CIT_READ ? "read" : "write",
1197 pt->cip_result, rc);
1199 cl_env_put(env, &refcheck);
1200 RETURN(pt->cip_result > 0 ? 0 : rc);
/*
 * Common bottom half for llite reads and writes: build a cl_io for @iot
 * from @args (normal iov_iter/kiocb IO or splice IO), take the per-inode
 * range lock where required, run the cl_io loop and account the result.
 *
 * \param env	lu_env for this thread
 * \param args	IO arguments (IO_NORMAL or IO_SPLICE variant)
 * \param file	file this IO applies to
 * \param iot	CIT_READ or CIT_WRITE
 * \param ppos	in/out file position
 * \param count	number of bytes to transfer
 *
 * \retval bytes transferred if any, otherwise the cl_io result code.
 *
 * NOTE(review): this excerpt elides some original lines; comments below
 * describe only the code that is visible here.
 */
1204 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1205 struct file *file, enum cl_io_type iot,
1206 loff_t *ppos, size_t count)
1208 struct range_lock range;
1209 struct vvp_io *vio = vvp_env_io(env);
1210 struct inode *inode = file_inode(file);
1211 struct ll_inode_info *lli = ll_i2info(inode);
1212 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1220 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1221 file_dentry(file)->d_name.name,
1222 iot == CIT_READ ? "read" : "write", pos, pos + count);
1225 io = vvp_env_thread_io(env);
1226 ll_io_init(io, file, iot);
/* Normal IO carries the caller's iterator/iocb by value into the cl_io */
1227 if (args->via_io_subtype == IO_NORMAL) {
1228 io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
1229 io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;
1234 if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
1235 bool range_locked = false;
/* O_APPEND writes lock out to EOF, otherwise just [pos, pos+count-1] */
1237 if (file->f_flags & O_APPEND)
1238 range_lock_init(&range, 0, LUSTRE_EOF);
1240 range_lock_init(&range, pos, pos + count - 1);
1242 vio->vui_fd = LUSTRE_FPRIVATE(file);
1243 vio->vui_io_subtype = args->via_io_subtype;
1245 switch (vio->vui_io_subtype) {
1247 /* Direct IO reads must also take range lock,
1248 * or multiple reads will try to work on the same pages
1249 * See LU-6227 for details. */
1250 if (((iot == CIT_WRITE) ||
1251 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1252 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1253 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1255 rc = range_lock(&lli->lli_write_tree, &range);
1259 range_locked = true;
1263 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1264 vio->u.splice.vui_flags = args->u.splice.via_flags;
1267 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
/* register the in-flight IO (LCC_RW) for this file before looping */
1271 ll_cl_add(file, env, io, LCC_RW);
1272 if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) &&
1273 !lli->lli_inode_locked) {
1275 lli->lli_inode_locked = 1;
1277 rc = cl_io_loop(env, io);
1278 if (lli->lli_inode_locked) {
1279 lli->lli_inode_locked = 0;
1280 inode_unlock(inode);
1282 ll_cl_remove(file, env);
1285 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1287 range_unlock(&lli->lli_write_tree, &range);
1290 /* cl_io_rw_init() handled IO */
/* account bytes moved this pass and advance the caller-visible state */
1294 if (io->ci_nob > 0) {
1295 result += io->ci_nob;
1296 count -= io->ci_nob;
1298 if (args->via_io_subtype == IO_NORMAL) {
1299 iov_iter_advance(args->u.normal.via_iter, io->ci_nob);
1301 args->u.normal.via_iocb->ki_pos = pos;
1302 #ifdef HAVE_KIOCB_KI_LEFT
1303 args->u.normal.via_iocb->ki_left = count;
1304 #elif defined(HAVE_KI_NBYTES)
1305 args->u.normal.via_iocb->ki_nbytes = count;
1309 pos = io->u.ci_rw.rw_range.cir_pos;
1313 cl_io_fini(env, io);
/* retry while bytes remain and the IO engine asked for a restart */
1315 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1317 "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1318 file_dentry(file)->d_name.name,
1319 iot == CIT_READ ? "read" : "write",
1320 pos, pos + count, result, rc);
/* tally per-mount read/write statistics and record write failure state */
1324 if (iot == CIT_READ) {
1326 ll_stats_ops_tally(ll_i2sbi(inode),
1327 LPROC_LL_READ_BYTES, result);
1328 } else if (iot == CIT_WRITE) {
1330 ll_stats_ops_tally(ll_i2sbi(inode),
1331 LPROC_LL_WRITE_BYTES, result);
1332 fd->fd_write_failed = false;
1333 } else if (result == 0 && rc == 0) {
1336 fd->fd_write_failed = true;
1338 fd->fd_write_failed = false;
1339 } else if (rc != -ERESTARTSYS) {
1340 fd->fd_write_failed = true;
1344 CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n",
1345 file_dentry(file)->d_name.name,
1346 iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc);
1350 RETURN(result > 0 ? result : rc);
1354 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1355 * especially for small I/O.
1357 * To serve a read request, CLIO has to create and initialize a cl_io and
1358 * then request DLM lock. This has turned out to have significant overhead
1359 * and affects the performance of small I/O dramatically.
1361 * It's not necessary to create a cl_io for each I/O. Under the help of read
1362 * ahead, most of the pages being read are already in memory cache and we can
1363 * read those pages directly because if the pages exist, the corresponding DLM
1364 * lock must exist so that page content must be valid.
1366 * In fast read implementation, the llite speculatively finds and reads pages
1367 * in memory cache. There are three scenarios for fast read:
1368 * - If the page exists and is uptodate, kernel VM will provide the data and
1369 * CLIO won't be intervened;
1370 * - If the page was brought into memory by read ahead, it will be exported
1371 * and read ahead parameters will be updated;
1372 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1373 * it will go back and invoke normal read, i.e., a cl_io will be created
1374 * and DLM lock will be requested.
1376 * POSIX compliance: posix standard states that read is intended to be atomic.
1377 * Lustre read implementation is in line with Linux kernel read implementation
1378 * and neither of them complies with POSIX standard in this matter. Fast read
1379 * doesn't make the situation worse on single node but it may interleave write
1380 * results from multiple nodes due to short read handling in ll_file_aio_read().
1382 * \param env - lu_env
1383 * \param iocb - kiocb from kernel
1384 * \param iter - user space buffers where the data will be copied
1386 * \retval - number of bytes have been read, or error code if error occurred.
/*
 * Speculative "fast read" through the kernel page cache (see the block
 * comment above): bypasses CLIO when the data is already cached.
 * Returns bytes read; -ENODATA means "fall back to the normal read path".
 */
1389 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
/* fast read may be disabled per-superblock via a tunable */
1393 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1396 /* NB: we can't do direct IO for fast read because it will need a lock
1397 * to make IO engine happy. */
1398 if (iocb->ki_filp->f_flags & O_DIRECT)
1401 result = generic_file_read_iter(iocb, iter);
1403 /* If the first page is not in cache, generic_file_aio_read() will be
1404 * returned with -ENODATA.
1405 * See corresponding code in ll_readpage(). */
1406 if (result == -ENODATA)
1410 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1411 LPROC_LL_READ_BYTES, result);
1417 * Read from a file (through the page cache).
/*
 * read_iter file operation: try the fast (page cache) read first; if
 * bytes remain or the fast path declined, fall back to the full CLIO
 * path via ll_file_io_generic().
 */
1419 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1422 struct vvp_io_args *args;
1427 result = ll_do_fast_read(iocb, to);
/* done if fast read errored or consumed the entire iterator */
1428 if (result < 0 || iov_iter_count(to) == 0)
1431 env = cl_env_get(&refcheck);
1433 return PTR_ERR(env);
1435 args = ll_env_args(env, IO_NORMAL);
1436 args->u.normal.via_iter = to;
1437 args->u.normal.via_iocb = iocb;
1439 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1440 &iocb->ki_pos, iov_iter_count(to));
1443 else if (result == 0)
1446 cl_env_put(env, &refcheck);
1452 * Write to a file (through the page cache).
/*
 * write_iter file operation: package the iterator/iocb into vvp_io_args
 * and run the common CLIO write path.
 */
1454 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1456 struct vvp_io_args *args;
1461 env = cl_env_get(&refcheck);
1463 return PTR_ERR(env);
1465 args = ll_env_args(env, IO_NORMAL);
1466 args->u.normal.via_iter = from;
1467 args->u.normal.via_iocb = iocb;
1469 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1470 &iocb->ki_pos, iov_iter_count(from));
1471 cl_env_put(env, &refcheck);
1475 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1477 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array and compute its total byte count (copied from
 * the kernel's __generic_file_aio_write_nolock, per the comment above).
 * May shrink *nr_segs/the count when a segment fails access_ok().
 */
1479 static int ll_file_get_iov_count(const struct iovec *iov,
1480 unsigned long *nr_segs, size_t *count)
1485 for (seg = 0; seg < *nr_segs; seg++) {
1486 const struct iovec *iv = &iov[seg];
1489 * If any segment has a negative length, or the cumulative
1490 * length ever wraps negative then return -EINVAL.
1493 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1495 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1500 cnt -= iv->iov_len; /* This segment is no good */
/*
 * Legacy aio_read entry (pre read_iter kernels): wrap the iovec array in
 * an iov_iter and delegate to ll_file_read_iter().
 */
1507 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1508 unsigned long nr_segs, loff_t pos)
1515 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1519 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1520 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1521 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1522 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1523 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1525 result = ll_file_read_iter(iocb, &to);
/*
 * Legacy synchronous read entry (pre read_iter kernels): build a
 * single-segment iovec and a sync kiocb, then delegate to
 * ll_file_aio_read(); *ppos is updated from the kiocb on return.
 */
1530 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1533 struct iovec iov = { .iov_base = buf, .iov_len = count };
1538 init_sync_kiocb(&kiocb, file);
1539 kiocb.ki_pos = *ppos;
1540 #ifdef HAVE_KIOCB_KI_LEFT
1541 kiocb.ki_left = count;
1542 #elif defined(HAVE_KI_NBYTES)
/* fixed: was "kiocb.i_nbytes" — struct kiocb has no such member; the
 * write-side twin below uses ki_nbytes under the same config guard */
1543 kiocb.ki_nbytes = count;
1546 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1547 *ppos = kiocb.ki_pos;
1553 * Write to a file (through the page cache).
/*
 * Legacy aio_write entry (pre write_iter kernels): wrap the iovec array
 * in an iov_iter and delegate to ll_file_write_iter().
 */
1556 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1557 unsigned long nr_segs, loff_t pos)
1559 struct iov_iter from;
1564 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1568 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1569 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1570 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1571 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1572 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1574 result = ll_file_write_iter(iocb, &from);
/*
 * Legacy synchronous write entry: builds a single-segment iovec and a
 * sync kiocb (taken from the per-thread lu_env info, unlike the read
 * twin which uses a stack kiocb), then delegates to ll_file_aio_write().
 */
1579 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1580 size_t count, loff_t *ppos)
1583 struct iovec iov = { .iov_base = (void __user *)buf,
1585 struct kiocb *kiocb;
1590 env = cl_env_get(&refcheck);
1592 RETURN(PTR_ERR(env));
1594 kiocb = &ll_env_info(env)->lti_kiocb;
1595 init_sync_kiocb(kiocb, file);
1596 kiocb->ki_pos = *ppos;
1597 #ifdef HAVE_KIOCB_KI_LEFT
1598 kiocb->ki_left = count;
1599 #elif defined(HAVE_KI_NBYTES)
1600 kiocb->ki_nbytes = count;
1603 result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos);
1604 *ppos = kiocb->ki_pos;
1606 cl_env_put(env, &refcheck);
1609 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1612 * Send file content (through pagecache) somewhere with helper
/*
 * splice_read file operation: route pagecache data into @pipe via the
 * common CLIO read path using the IO_SPLICE argument variant.
 */
1614 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1615 struct pipe_inode_info *pipe, size_t count,
1619 struct vvp_io_args *args;
1624 env = cl_env_get(&refcheck);
1626 RETURN(PTR_ERR(env));
1628 args = ll_env_args(env, IO_SPLICE);
1629 args->u.splice.via_pipe = pipe;
1630 args->u.splice.via_flags = flags;
1632 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1633 cl_env_put(env, &refcheck);
/*
 * Set striping (LOV EA) on @inode by re-opening it by FID with the
 * layout attached to the open intent; the transient open handle is
 * released before returning. Runs under the inode size lock.
 */
1637 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1638 __u64 flags, struct lov_user_md *lum, int lum_size)
1640 struct lookup_intent oit = {
1642 .it_flags = flags | MDS_OPEN_BY_FID,
1647 ll_inode_size_lock(inode);
1648 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1650 GOTO(out_unlock, rc);
/* we only needed the open to carry the layout; drop the handle now */
1652 ll_release_openhandle(dentry, &oit);
1655 ll_inode_size_unlock(inode);
1656 ll_intent_release(&oit);
/*
 * Fetch the LOV EA (striping) for @filename under @inode from the MDS.
 * On success *lmmp points into the reply buffer (caller must keep the
 * request in *request alive while using it) and *lmm_size is its size.
 * Recognizes LOV_MAGIC_V1/V3 and composite (PFL) layouts, byte-swapping
 * to host endian on big-endian clients.
 */
1661 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1662 struct lov_mds_md **lmmp, int *lmm_size,
1663 struct ptlrpc_request **request)
1665 struct ll_sb_info *sbi = ll_i2sbi(inode);
1666 struct mdt_body *body;
1667 struct lov_mds_md *lmm = NULL;
1668 struct ptlrpc_request *req = NULL;
1669 struct md_op_data *op_data;
1672 rc = ll_get_default_mdsize(sbi, &lmmsize);
1676 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1677 strlen(filename), lmmsize,
1678 LUSTRE_OPC_ANY, NULL);
1679 if (IS_ERR(op_data))
1680 RETURN(PTR_ERR(op_data));
1682 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1683 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1684 ll_finish_md_op_data(op_data);
1686 CDEBUG(D_INFO, "md_getattr_name failed "
1687 "on %s: rc %d\n", filename, rc);
1691 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1692 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1694 lmmsize = body->mbo_eadatasize;
/* no striping EA on the object means there is nothing to return */
1696 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1698 GOTO(out, rc = -ENODATA);
1701 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1702 LASSERT(lmm != NULL);
1704 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
1705 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
1706 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
1707 GOTO(out, rc = -EPROTO);
1710 * This is coming from the MDS, so is probably in
1711 * little endian. We convert it to host endian before
1712 * passing it to userspace.
1714 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1717 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
1718 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1719 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1720 if (le32_to_cpu(lmm->lmm_pattern) &
1721 LOV_PATTERN_F_RELEASED)
1725 /* if function called for directory - we should
1726 * avoid swab not existent lsm objects */
1727 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1728 lustre_swab_lov_user_md_v1(
1729 (struct lov_user_md_v1 *)lmm);
1730 if (S_ISREG(body->mbo_mode))
1731 lustre_swab_lov_user_md_objects(
1732 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1734 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1735 lustre_swab_lov_user_md_v3(
1736 (struct lov_user_md_v3 *)lmm);
1737 if (S_ISREG(body->mbo_mode))
1738 lustre_swab_lov_user_md_objects(
1739 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1741 } else if (lmm->lmm_magic ==
1742 cpu_to_le32(LOV_MAGIC_COMP_V1)) {
1743 lustre_swab_lov_comp_md_v1(
1744 (struct lov_comp_md_v1 *)lmm);
1750 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: copy a raw LOV EA from userspace and apply
 * it via ll_lov_setstripe_ea_info(). Admin-only (CAP_SYS_ADMIN), since
 * MDS_OPEN_HAS_OBJS lets the caller name existing objects directly.
 */
1755 static int ll_lov_setea(struct inode *inode, struct file *file,
1758 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1759 struct lov_user_md *lump;
/* one ost_data slot follows the header — TODO confirm single-stripe only */
1760 int lum_size = sizeof(struct lov_user_md) +
1761 sizeof(struct lov_user_ost_data);
1765 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1768 OBD_ALLOC_LARGE(lump, lum_size);
1772 if (copy_from_user(lump, arg, lum_size))
1773 GOTO(out_lump, rc = -EFAULT);
1775 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
1777 cl_lov_delay_create_clear(&file->f_flags);
1780 OBD_FREE_LARGE(lump, lum_size);
/*
 * Copy the layout of @inode's cl_object into the userspace buffer @lum
 * (at most @size bytes).
 */
1784 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
1791 env = cl_env_get(&refcheck);
1793 RETURN(PTR_ERR(env));
1795 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
1796 cl_env_put(env, &refcheck);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy and validate the user layout,
 * apply it, then refresh the client layout generation and echo the
 * resulting stripe info back to the user buffer.
 */
1800 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1803 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1804 struct lov_user_md *klum;
1806 __u64 flags = FMODE_WRITE;
1809 rc = ll_copy_user_md(lum, &klum);
1814 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
/* zero stripe_count tells userspace "layout set, details follow" —
 * TODO confirm intent against llapi callers */
1819 rc = put_user(0, &lum->lmm_stripe_count);
1823 rc = ll_layout_refresh(inode, &gen);
1827 rc = ll_file_getstripe(inode, arg, lum_size);
1829 cl_lov_delay_create_clear(&file->f_flags);
1832 OBD_FREE(klum, lum_size);
/*
 * Take a Lustre group lock with group id @arg on behalf of this open
 * file. Re-checks for a racing acquirer under lli_lock both before and
 * after the (blocking) cl_get_grouplock() call; the winning lock is
 * published in fd->fd_grouplock with LL_FILE_GROUP_LOCKED set.
 */
1837 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1839 struct ll_inode_info *lli = ll_i2info(inode);
1840 struct cl_object *obj = lli->lli_clob;
1841 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1842 struct ll_grouplock grouplock;
1847 CWARN("group id for group lock must not be 0\n");
1851 if (ll_file_nolock(file))
1852 RETURN(-EOPNOTSUPP);
1854 spin_lock(&lli->lli_lock);
1855 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1856 CWARN("group lock already existed with gid %lu\n",
1857 fd->fd_grouplock.lg_gid);
1858 spin_unlock(&lli->lli_lock);
1861 LASSERT(fd->fd_grouplock.lg_lock == NULL);
1862 spin_unlock(&lli->lli_lock);
1865 * XXX: group lock needs to protect all OST objects while PFL
1866 * can add new OST objects during the IO, so we'd instantiate
1867 * all OST objects before getting its group lock.
1872 struct cl_layout cl = {
1873 .cl_is_composite = false,
1876 env = cl_env_get(&refcheck);
1878 RETURN(PTR_ERR(env));
1880 rc = cl_object_layout_get(env, obj, &cl);
/* composite (PFL) layout: instantiate all components first (see XXX) */
1881 if (!rc && cl.cl_is_composite)
1882 rc = ll_layout_write_intent(inode, 0, OBD_OBJECT_EOF);
1884 cl_env_put(env, &refcheck);
1889 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
1890 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* lli_lock was dropped while enqueuing; re-check for a racing winner */
1894 spin_lock(&lli->lli_lock);
1895 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1896 spin_unlock(&lli->lli_lock);
1897 CERROR("another thread just won the race\n");
1898 cl_put_grouplock(&grouplock);
1902 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1903 fd->fd_grouplock = grouplock;
1904 spin_unlock(&lli->lli_lock);
1906 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * Release the group lock with id @arg held on this open file. Validates
 * under lli_lock that a lock is held and its gid matches, detaches it
 * from the file data, then drops it outside the spinlock.
 */
1910 static int ll_put_grouplock(struct inode *inode, struct file *file,
1913 struct ll_inode_info *lli = ll_i2info(inode);
1914 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1915 struct ll_grouplock grouplock;
1918 spin_lock(&lli->lli_lock);
1919 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1920 spin_unlock(&lli->lli_lock);
1921 CWARN("no group lock held\n");
1925 LASSERT(fd->fd_grouplock.lg_lock != NULL);
1927 if (fd->fd_grouplock.lg_gid != arg) {
1928 CWARN("group lock %lu doesn't match current id %lu\n",
1929 arg, fd->fd_grouplock.lg_gid);
1930 spin_unlock(&lli->lli_lock);
/* detach under the spinlock; the actual release happens after unlock */
1934 grouplock = fd->fd_grouplock;
1935 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1936 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1937 spin_unlock(&lli->lli_lock);
1939 cl_put_grouplock(&grouplock);
1940 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1945 * Close inode open handle
1947 * \param dentry [in] dentry which contains the inode
1948 * \param it [in,out] intent which contains open info and result
1951 * \retval <0 failure
/*
 * Close the MDS open handle carried by the open intent @it (see the
 * doc comment above). No-op for the filesystem root or when the intent
 * holds no DISP_OPEN_OPEN disposition.
 */
1953 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1955 struct inode *inode = dentry->d_inode;
1956 struct obd_client_handle *och;
1962 /* Root ? Do nothing. */
1963 if (dentry->d_inode->i_sb->s_root == dentry)
1966 /* No open handle to close? Move away */
1967 if (!it_disposition(it, DISP_OPEN_OPEN))
1970 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1972 OBD_ALLOC(och, sizeof(*och));
1974 GOTO(out, rc = -ENOMEM);
1976 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1978 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
1980 /* this one is in place of ll_file_open */
1981 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1982 ptlrpc_req_finished(it->it_request);
1983 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1989 * Get size for inode for which FIEMAP mapping is requested.
1990 * Make the FIEMAP get_info call and returns the result.
1991 * \param fiemap kernel buffer to hold extents
1992 * \param num_bytes kernel buffer size
1994 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2000 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2003 /* Checks for fiemap flags */
2004 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* report back the flags we could not honour */
2005 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2009 /* Check for FIEMAP_FLAG_SYNC */
2010 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2011 rc = filemap_fdatawrite(inode->i_mapping);
2016 env = cl_env_get(&refcheck);
2018 RETURN(PTR_ERR(env));
/* apparent size 0: glimpse OSTs in case the KMS is stale */
2020 if (i_size_read(inode) == 0) {
2021 rc = ll_glimpse_size(inode);
2026 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2027 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2028 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2030 /* If filesize is 0, then there would be no objects for mapping */
2031 if (fmkey.lfik_oa.o_size == 0) {
2032 fiemap->fm_mapped_extents = 0;
2036 fmkey.lfik_fiemap = *fiemap;
2038 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2039 &fmkey, fiemap, &num_bytes);
2041 cl_env_put(env, &refcheck);
/*
 * OBD_IOC_FID2PATH handler: resolve a FID to a path via the MDC.
 * Reads gf_pathlen from the user buffer to size a kernel copy of
 * getinfo_fid2path, forwards it through obd_iocontrol() and copies the
 * filled structure back. Restricted to CAP_DAC_READ_SEARCH unless the
 * mount allows user fid2path.
 */
2045 int ll_fid2path(struct inode *inode, void __user *arg)
2047 struct obd_export *exp = ll_i2mdexp(inode);
2048 const struct getinfo_fid2path __user *gfin = arg;
2050 struct getinfo_fid2path *gfout;
2056 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2057 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2060 /* Only need to get the buflen */
2061 if (get_user(pathlen, &gfin->gf_pathlen))
/* bound the allocation driven by a user-supplied length */
2064 if (pathlen > PATH_MAX)
2067 outsize = sizeof(*gfout) + pathlen;
2068 OBD_ALLOC(gfout, outsize);
2072 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2073 GOTO(gf_free, rc = -EFAULT);
2074 /* append root FID after gfout to let MDT know the root FID so that it
2075 * can lookup the correct path, this is mainly for fileset.
2076 * old server without fileset mount support will ignore this. */
2077 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2079 /* Call mdc_iocontrol */
2080 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2084 if (copy_to_user(arg, gfout, outsize))
2088 OBD_FREE(gfout, outsize);
2093 * Read the data_version for inode.
2095 * This value is computed using stripe object version on OST.
2096 * Version is computed using server side locking.
2098 * @param flags if do sync on the OST side;
2100 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2101 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/*
 * Run a CIT_DATA_VERSION cl_io against the file's cl_object and return
 * the computed version in *data_version (see the doc comment above for
 * the @flags flush semantics).
 */
2103 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2105 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2113 /* If no file object initialized, we consider its version is 0. */
2119 env = cl_env_get(&refcheck);
2121 RETURN(PTR_ERR(env));
2123 io = vvp_env_thread_io(env);
2125 io->u.ci_data_version.dv_data_version = 0;
2126 io->u.ci_data_version.dv_flags = flags;
2129 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2130 result = cl_io_loop(env, io);
2132 result = io->ci_result;
2134 *data_version = io->u.ci_data_version.dv_data_version;
2136 cl_io_fini(env, io);
/* e.g. layout changed mid-IO; the loop above is re-entered */
2138 if (unlikely(io->ci_need_restart))
2141 cl_env_put(env, &refcheck);
2147 * Trigger a HSM release request for the provided inode.
/*
 * Perform an HSM release of @inode: take a write lease opened with
 * MDS_OPEN_RELEASE, flush and capture the latest data_version and
 * [am]time, then close the handle with MDS_HSM_RELEASE so the MDT can
 * drop the file's OST objects.
 */
2149 int ll_hsm_release(struct inode *inode)
2152 struct obd_client_handle *och = NULL;
2153 __u64 data_version = 0;
2158 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2159 ll_get_fsname(inode->i_sb, NULL, 0),
2160 PFID(&ll_i2info(inode)->lli_fid));
2162 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2164 GOTO(out, rc = PTR_ERR(och));
2166 /* Grab latest data_version and [am]time values */
2167 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2171 env = cl_env_get(&refcheck);
2173 GOTO(out, rc = PTR_ERR(env));
2175 rc = ll_merge_attr(env, inode);
2176 cl_env_put(env, &refcheck);
2178 /* If error happen, we have the wrong size for a file.
2184 /* Release the file.
2185 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2186 * we still need it to pack l_remote_handle to MDT. */
2187 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2193 if (och != NULL && !IS_ERR(och)) /* close the file */
2194 ll_lease_close(och, inode, NULL);
/*
 * Scratch state for ll_swap_layouts(): the two inodes involved plus
 * (on elided lines) their data versions and check flags, kept in a
 * struct so they can be reordered/swapped as a unit.
 */
2199 struct ll_swap_stack {
2202 struct inode *inode1;
2203 struct inode *inode2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS handler: atomically exchange the layouts of
 * two files via the MDT. Orders the pair by FID to avoid deadlock,
 * optionally flushes both under a group lock, verifies requested data
 * versions, then issues the swap through obd_iocontrol().
 */
2208 static int ll_swap_layouts(struct file *file1, struct file *file2,
2209 struct lustre_swap_layouts *lsl)
2211 struct mdc_swap_layouts msl;
2212 struct md_op_data *op_data;
2215 struct ll_swap_stack *llss = NULL;
2218 OBD_ALLOC_PTR(llss);
2222 llss->inode1 = file_inode(file1);
2223 llss->inode2 = file_inode(file2);
2225 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2229 /* we use 2 bool because it is easier to swap than 2 bits */
2230 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2231 llss->check_dv1 = true;
2233 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2234 llss->check_dv2 = true;
2236 /* we cannot use lsl->sl_dvX directly because we may swap them */
2237 llss->dv1 = lsl->sl_dv1;
2238 llss->dv2 = lsl->sl_dv2;
2240 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2241 if (rc == 0) /* same file, done! */
2244 if (rc < 0) { /* sequentialize it */
2245 swap(llss->inode1, llss->inode2);
2247 swap(llss->dv1, llss->dv2);
2248 swap(llss->check_dv1, llss->check_dv2);
2252 if (gid != 0) { /* application asks to flush dirty cache */
2253 rc = ll_get_grouplock(llss->inode1, file1, gid);
2257 rc = ll_get_grouplock(llss->inode2, file2, gid);
2259 ll_put_grouplock(llss->inode1, file1, gid);
2264 /* ultimate check, before swapping the layouts we check if
2265 * dataversion has changed (if requested) */
2266 if (llss->check_dv1) {
2267 rc = ll_data_version(llss->inode1, &dv, 0);
2270 if (dv != llss->dv1)
2271 GOTO(putgl, rc = -EAGAIN);
2274 if (llss->check_dv2) {
2275 rc = ll_data_version(llss->inode2, &dv, 0);
2278 if (dv != llss->dv2)
2279 GOTO(putgl, rc = -EAGAIN);
2282 /* struct md_op_data is used to send the swap args to the mdt
2283 * only flags is missing, so we use struct mdc_swap_layouts
2284 * through the md_op_data->op_data */
2285 /* flags from user space have to be converted before they are send to
2286 * server, no flag is sent today, they are only used on the client */
2289 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2290 0, LUSTRE_OPC_ANY, &msl);
2291 if (IS_ERR(op_data))
2292 GOTO(free, rc = PTR_ERR(op_data));
2294 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2295 sizeof(*op_data), op_data, NULL);
2296 ll_finish_md_op_data(op_data);
/* release group locks in reverse acquisition order */
2303 ll_put_grouplock(llss->inode2, file2, gid);
2304 ll_put_grouplock(llss->inode1, file1, gid);
/*
 * Set/clear HSM state flags on @inode via the MDT. Rejects masks
 * outside HSM_FLAGS_MASK, privileged-only flags for non-root callers,
 * and out-of-range archive ids, then forwards the request through
 * obd_iocontrol(LL_IOC_HSM_STATE_SET).
 */
2314 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2316 struct md_op_data *op_data;
2320 /* Detect out-of-range masks */
2321 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2324 /* Non-root users are forbidden to set or clear flags which are
2325 * NOT defined in HSM_USER_MASK. */
2326 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2327 !cfs_capable(CFS_CAP_SYS_ADMIN))
2330 /* Detect out-of-range archive id */
2331 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2332 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2335 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2336 LUSTRE_OPC_ANY, hss);
2337 if (IS_ERR(op_data))
2338 RETURN(PTR_ERR(op_data));
2340 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2341 sizeof(*op_data), op_data, NULL);
2343 ll_finish_md_op_data(op_data);
/*
 * Import a file from HSM: mark it ARCHIVED|EXISTS|RELEASED in the
 * given archive, then force the inode attributes (mode/owner/size/
 * times) from the user-supplied hsm_user_import. Regular files only.
 */
2348 static int ll_hsm_import(struct inode *inode, struct file *file,
2349 struct hsm_user_import *hui)
2351 struct hsm_state_set *hss = NULL;
2352 struct iattr *attr = NULL;
2356 if (!S_ISREG(inode->i_mode))
2362 GOTO(out, rc = -ENOMEM);
/* step 1: set the HSM flags so the file is seen as released */
2364 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2365 hss->hss_archive_id = hui->hui_archive_id;
2366 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2367 rc = ll_hsm_state_set(inode, hss);
2371 OBD_ALLOC_PTR(attr);
2373 GOTO(out, rc = -ENOMEM);
/* step 2: restore the archived attributes; ATTR_FORCE skips the
 * usual permission checks on this privileged restore */
2375 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2376 attr->ia_mode |= S_IFREG;
2377 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2378 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2379 attr->ia_size = hui->hui_size;
2380 attr->ia_mtime.tv_sec = hui->hui_mtime;
2381 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2382 attr->ia_atime.tv_sec = hui->hui_atime;
2383 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2385 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2386 ATTR_UID | ATTR_GID |
2387 ATTR_MTIME | ATTR_MTIME_SET |
2388 ATTR_ATIME | ATTR_ATIME_SET;
2392 rc = ll_setattr_raw(file_dentry(file), attr, true);
2396 inode_unlock(inode);
/* Map a kernel fmode_t (FMODE_READ/FMODE_WRITE) to Lustre lease
 * lock-mode bits (LL_LEASE_RDLCK/LL_LEASE_WRLCK). */
2408 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2410 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2411 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * LL_IOC_FUTIMES_3 handler: set atime, mtime AND ctime of a regular
 * file to the exact values in @lfu (unlike utimes(2), ctime included).
 * Admin-only since arbitrary ctime is otherwise unsettable.
 */
2414 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2416 struct inode *inode = file_inode(file);
2418 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2419 ATTR_MTIME | ATTR_MTIME_SET |
2420 ATTR_CTIME | ATTR_CTIME_SET,
2422 .tv_sec = lfu->lfu_atime_sec,
2423 .tv_nsec = lfu->lfu_atime_nsec,
2426 .tv_sec = lfu->lfu_mtime_sec,
2427 .tv_nsec = lfu->lfu_mtime_nsec,
2430 .tv_sec = lfu->lfu_ctime_sec,
2431 .tv_nsec = lfu->lfu_ctime_nsec,
2437 if (!capable(CAP_SYS_ADMIN))
2440 if (!S_ISREG(inode->i_mode))
2444 rc = ll_setattr_raw(file_dentry(file), &ia, false);
2445 inode_unlock(inode);
/* Translate a userspace lockahead mode (MODE_READ_USER/MODE_WRITE_USER)
 * into the corresponding cl_lock_mode. */
2450 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2453 case MODE_READ_USER:
2455 case MODE_WRITE_USER:
2462 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2464 /* Used to allow the upper layers of the client to request an LDLM lock
2465 * without doing an actual read or write.
2467 * Used for ladvise lockahead to manually request specific locks.
2469 * \param[in] file file this ladvise lock request is on
2470 * \param[in] ladvise ladvise struct describing this lock request
2472 * \retval 0 success, no detailed result available (sync requests
2473 * and requests sent to the server [not handled locally]
2474 * cannot return detailed results)
2475 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2476 * see definitions for details.
2477 * \retval negative negative errno on error
/*
 * Lockahead: request an LDLM extent lock without performing IO (see the
 * doc comment above for parameters and the LLA_RESULT_* return values).
 */
2479 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2481 struct lu_env *env = NULL;
2482 struct cl_io *io = NULL;
2483 struct cl_lock *lock = NULL;
2484 struct cl_lock_descr *descr = NULL;
2485 struct dentry *dentry = file->f_path.dentry;
2486 struct inode *inode = dentry->d_inode;
2487 enum cl_lock_mode cl_mode;
2488 off_t start = ladvise->lla_start;
2489 off_t end = ladvise->lla_end;
2495 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2496 "start=%llu, end=%llu\n", dentry->d_name.len,
2497 dentry->d_name.name, dentry->d_inode,
2498 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2501 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2503 GOTO(out, result = cl_mode);
2505 /* Get IO environment */
2506 result = cl_io_get(inode, &env, &io, &refcheck);
/* a CIT_MISC io carries the lock request without moving any data */
2510 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2513 * nothing to do for this io. This currently happens when
2514 * stripe sub-object's are not yet created.
2516 result = io->ci_result;
2517 } else if (result == 0) {
2518 lock = vvp_env_lock(env);
2519 descr = &lock->cll_descr;
2521 descr->cld_obj = io->ci_obj;
2522 /* Convert byte offsets to pages */
2523 descr->cld_start = cl_index(io->ci_obj, start);
2524 descr->cld_end = cl_index(io->ci_obj, end);
2525 descr->cld_mode = cl_mode;
2526 /* CEF_MUST is used because we do not want to convert a
2527 * lockahead request to a lockless lock */
2528 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2531 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2532 descr->cld_enq_flags |= CEF_SPECULATIVE;
2534 result = cl_lock_request(env, io, lock);
2536 /* On success, we need to release the lock */
2538 cl_lock_release(env, lock);
2540 cl_io_fini(env, io);
2541 cl_env_put(env, &refcheck);
2543 /* -ECANCELED indicates a matching lock with a different extent
2544 * was already present, and -EEXIST indicates a matching lock
2545 * on exactly the same extent was already present.
2546 * We convert them to positive values for userspace to make
2547 * recognizing true errors easier.
2548 * Note we can only return these detailed results on async requests,
2549 * as sync requests look the same as i/o requests for locking. */
2550 if (result == -ECANCELED)
2551 result = LLA_RESULT_DIFFERENT;
2552 else if (result == -EEXIST)
2553 result = LLA_RESULT_SAME;
2558 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
/*
 * Validate one llapi_lu_ladvise entry: known advice type, per-advice
 * flag mask, valid lockahead mode, and a non-empty [start, end) range.
 * Logs a CDEBUG diagnostic and returns a negative errno on failure.
 */
2560 static int ll_ladvise_sanity(struct inode *inode,
2561 struct llapi_lu_ladvise *ladvise)
2563 enum lu_ladvise_type advice = ladvise->lla_advice;
2564 /* Note the peradvice flags is a 32 bit field, so per advice flags must
2565 * be in the first 32 bits of enum ladvise_flags */
2566 __u32 flags = ladvise->lla_peradvice_flags;
2567 /* 3 lines at 80 characters per line, should be plenty */
2570 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2572 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
2573 "last supported advice is %s (value '%d'): rc = %d\n",
2574 ll_get_fsname(inode->i_sb, NULL, 0), advice,
2575 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2579 /* Per-advice checks */
2581 case LU_LADVISE_LOCKNOEXPAND:
2582 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2584 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2586 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2587 ladvise_names[advice], rc);
2591 case LU_LADVISE_LOCKAHEAD:
2592 /* Currently only READ and WRITE modes can be requested */
2593 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2594 ladvise->lla_lockahead_mode == 0) {
2596 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2598 ll_get_fsname(inode->i_sb, NULL, 0),
2599 ladvise->lla_lockahead_mode,
2600 ladvise_names[advice], rc);
2603 case LU_LADVISE_WILLREAD:
2604 case LU_LADVISE_DONTNEED:
2606 /* Note fall through above - These checks apply to all advices
2607 * except LOCKNOEXPAND */
2608 if (flags & ~LF_DEFAULT_MASK) {
2610 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2612 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2613 ladvise_names[advice], rc);
2616 if (ladvise->lla_start >= ladvise->lla_end) {
2618 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2619 "for %s: rc = %d\n",
2620 ll_get_fsname(inode->i_sb, NULL, 0),
2621 ladvise->lla_start, ladvise->lla_end,
2622 ladvise_names[advice], rc);
2634 * Give file access advices
2636 * The ladvise interface is similar to Linux fadvise() system call, except it
2637 forwards the advice directly from the Lustre client to the server. The
2638 server-side code will apply the appropriate read-ahead and caching for the
2639 * corresponding files.
2641 * A typical workload for ladvise is e.g. a bunch of different clients are
2642 * doing small random reads of a file, so prefetching pages into OSS cache
2643 * with big linear reads before the random IO is a net benefit. Fetching
2644 * all that data into each client cache with fadvise() may not be, due to
2645 * much more data being sent to the client.
/*
 * Submit a single ladvise hint to the cl_io stack as a CIT_LADVISE request.
 *
 * Copies the user-supplied advice parameters (range, advice type, flags)
 * into a cl_ladvise_io and runs the cl_io loop so lower layers can forward
 * the hint to the server.  NOTE(review): this view of the source is elided;
 * the rc/env declarations and error-path details are not visible here.
 */
2647 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2648 struct llapi_lu_ladvise *ladvise)
2652 struct cl_ladvise_io *lio;
2657 env = cl_env_get(&refcheck);
2659 RETURN(PTR_ERR(env));
2661 io = vvp_env_thread_io(env);
2662 io->ci_obj = ll_i2info(inode)->lli_clob;
2664 /* initialize parameters for ladvise */
2665 lio = &io->u.ci_ladvise;
2666 lio->li_start = ladvise->lla_start;
2667 lio->li_end = ladvise->lla_end;
2668 lio->li_fid = ll_inode2fid(inode);
2669 lio->li_advice = ladvise->lla_advice;
2670 lio->li_flags = flags;
/* run the IO only if initialization across the layers succeeded */
2672 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2673 rc = cl_io_loop(env, io);
2677 cl_io_fini(env, io);
2678 cl_env_put(env, &refcheck);
/*
 * Toggle DLM lock expansion for this open file: if LF_UNSET is present in
 * @flags the no-expand bit is cleared, otherwise it is set.  State lives in
 * the per-open ll_file_data, so it affects only this file descriptor.
 */
2682 static int ll_lock_noexpand(struct file *file, int flags)
2684 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2686 fd->ll_lock_no_expand = !(flags & LF_UNSET);
/*
 * FS_IOC_FSGETXATTR-style handler: report the inode's extended flags and
 * project ID back to userspace via a struct fsxattr at @arg.
 * NOTE(review): source view is elided; the EFAULT returns for the failed
 * copy_{from,to}_user calls are on lines not visible here.
 */
2691 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
2694 struct fsxattr fsxattr;
2696 if (copy_from_user(&fsxattr,
2697 (const struct fsxattr __user *)arg,
/* translate in-kernel inode flags to the ext-style flag encoding */
2701 fsxattr.fsx_xflags = ll_inode_to_ext_flags(inode->i_flags);
2702 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
2703 if (copy_to_user((struct fsxattr __user *)arg,
2704 &fsxattr, sizeof(fsxattr)))
/*
 * FS_IOC_FSSETXATTR-style handler: set extended flags and project ID from a
 * user-supplied struct fsxattr.  The change is sent to the MDS via
 * md_setattr(), then mirrored on the OSTs through cl_setattr_ost() when the
 * inode has a cl_object.  Requires CAP_SYS_ADMIN (project ID change is
 * root-only).
 */
2710 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
2714 struct md_op_data *op_data;
2715 struct ptlrpc_request *req = NULL;
2717 struct fsxattr fsxattr;
2718 struct cl_object *obj;
2720 /* only root could change project ID */
2721 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2724 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2725 LUSTRE_OPC_ANY, NULL);
2726 if (IS_ERR(op_data))
2727 RETURN(PTR_ERR(op_data));
2729 if (copy_from_user(&fsxattr,
2730 (const struct fsxattr __user *)arg,
2732 GOTO(out_fsxattr1, rc = -EFAULT);
/* forward both the flag bits and the project id to the MDS */
2734 op_data->op_attr_flags = fsxattr.fsx_xflags;
2735 op_data->op_projid = fsxattr.fsx_projid;
2736 op_data->op_attr.ia_valid |= (MDS_ATTR_PROJID | ATTR_ATTR_FLAG);
2737 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
2739 ptlrpc_req_finished(req);
2741 obj = ll_i2info(inode)->lli_clob;
/* update the local inode and push the flag change down to the OSTs */
2745 inode->i_flags = ll_ext_to_inode_flags(fsxattr.fsx_xflags);
2746 OBD_ALLOC_PTR(attr);
2748 GOTO(out_fsxattr1, rc = -ENOMEM);
2749 attr->ia_valid = ATTR_ATTR_FLAG;
2750 rc = cl_setattr_ost(obj, attr, fsxattr.fsx_xflags);
2755 ll_finish_md_op_data(op_data);
/*
 * Main ioctl dispatcher for regular files: one large switch over the LL_IOC_*
 * / OBD_IOC_* / FSFILT_IOC_* command space.  Per-fd flags, striping, layout
 * swap, group locks, HSM, leases, ladvise and project-xattr commands are all
 * routed from here.  NOTE(review): this view of the source is elided; many
 * error paths, declarations and the default case are on lines not shown.
 */
2762 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2764 struct inode *inode = file_inode(file);
2765 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2769 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2770 PFID(ll_inode2fid(inode)), inode, cmd);
2771 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2773 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2774 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2778 case LL_IOC_GETFLAGS:
2779 /* Get the current value of the file flags */
2780 return put_user(fd->fd_flags, (int __user *)arg);
2781 case LL_IOC_SETFLAGS:
2782 case LL_IOC_CLRFLAGS:
2783 /* Set or clear specific file flags */
2784 /* XXX This probably needs checks to ensure the flags are
2785 * not abused, and to handle any flag side effects.
2787 if (get_user(flags, (int __user *) arg))
2790 if (cmd == LL_IOC_SETFLAGS) {
2791 if ((flags & LL_FILE_IGNORE_LOCK) &&
2792 !(file->f_flags & O_DIRECT)) {
2793 CERROR("%s: unable to disable locking on "
2794 "non-O_DIRECT file\n", current->comm);
2798 fd->fd_flags |= flags;
2800 fd->fd_flags &= ~flags;
/* striping / layout management commands */
2803 case LL_IOC_LOV_SETSTRIPE:
2804 case LL_IOC_LOV_SETSTRIPE_NEW:
2805 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
2806 case LL_IOC_LOV_SETEA:
2807 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
2808 case LL_IOC_LOV_SWAP_LAYOUTS: {
2810 struct lustre_swap_layouts lsl;
2812 if (copy_from_user(&lsl, (char __user *)arg,
2813 sizeof(struct lustre_swap_layouts)))
2816 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
2819 file2 = fget(lsl.sl_fd);
2823 /* O_WRONLY or O_RDWR */
2824 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
2825 GOTO(out, rc = -EPERM);
2827 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
2828 struct inode *inode2;
2829 struct ll_inode_info *lli;
2830 struct obd_client_handle *och = NULL;
2832 if (lsl.sl_flags != SWAP_LAYOUTS_CLOSE)
2833 GOTO(out, rc = -EINVAL);
/* swap-on-close requires this fd to hold the lease handle */
2835 lli = ll_i2info(inode);
2836 mutex_lock(&lli->lli_och_mutex);
2837 if (fd->fd_lease_och != NULL) {
2838 och = fd->fd_lease_och;
2839 fd->fd_lease_och = NULL;
2841 mutex_unlock(&lli->lli_och_mutex);
2843 GOTO(out, rc = -ENOLCK);
2844 inode2 = file_inode(file2);
2845 rc = ll_swap_layouts_close(och, inode, inode2);
2847 rc = ll_swap_layouts(file, file2, &lsl);
2853 case LL_IOC_LOV_GETSTRIPE:
2854 case LL_IOC_LOV_GETSTRIPE_NEW:
2855 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
2856 case FSFILT_IOC_GETFLAGS:
2857 case FSFILT_IOC_SETFLAGS:
2858 RETURN(ll_iocontrol(inode, file, cmd, arg));
2859 case FSFILT_IOC_GETVERSION_OLD:
2860 case FSFILT_IOC_GETVERSION:
2861 RETURN(put_user(inode->i_generation, (int __user *)arg));
2862 case LL_IOC_GROUP_LOCK:
2863 RETURN(ll_get_grouplock(inode, file, arg));
2864 case LL_IOC_GROUP_UNLOCK:
2865 RETURN(ll_put_grouplock(inode, file, arg));
2866 case IOC_OBD_STATFS:
2867 RETURN(ll_obd_statfs(inode, (void __user *)arg));
2869 /* We need to special case any other ioctls we want to handle,
2870 * to send them to the MDS/OST as appropriate and to properly
2871 * network encode the arg field.
2872 case FSFILT_IOC_SETVERSION_OLD:
2873 case FSFILT_IOC_SETVERSION:
2875 case LL_IOC_FLUSHCTX:
2876 RETURN(ll_flush_ctx(inode));
2877 case LL_IOC_PATH2FID: {
2878 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2879 sizeof(struct lu_fid)))
2884 case LL_IOC_GETPARENT:
2885 RETURN(ll_getparent(file, (struct getparent __user *)arg));
2887 case OBD_IOC_FID2PATH:
2888 RETURN(ll_fid2path(inode, (void __user *)arg));
2889 case LL_IOC_DATA_VERSION: {
2890 struct ioc_data_version idv;
2893 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
/* only the flush flags are honoured from userspace */
2896 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2897 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2900 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2906 case LL_IOC_GET_MDTIDX: {
2909 mdtidx = ll_get_mdt_idx(inode);
2913 if (put_user((int)mdtidx, (int __user *)arg))
2918 case OBD_IOC_GETDTNAME:
2919 case OBD_IOC_GETMDNAME:
2920 RETURN(ll_get_obd_name(inode, cmd, arg));
/* HSM (hierarchical storage management) commands */
2921 case LL_IOC_HSM_STATE_GET: {
2922 struct md_op_data *op_data;
2923 struct hsm_user_state *hus;
2930 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2931 LUSTRE_OPC_ANY, hus);
2932 if (IS_ERR(op_data)) {
2934 RETURN(PTR_ERR(op_data));
2937 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2940 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2943 ll_finish_md_op_data(op_data);
2947 case LL_IOC_HSM_STATE_SET: {
2948 struct hsm_state_set *hss;
2955 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2960 rc = ll_hsm_state_set(inode, hss);
2965 case LL_IOC_HSM_ACTION: {
2966 struct md_op_data *op_data;
2967 struct hsm_current_action *hca;
2974 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2975 LUSTRE_OPC_ANY, hca);
2976 if (IS_ERR(op_data)) {
2978 RETURN(PTR_ERR(op_data));
2981 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2984 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2987 ll_finish_md_op_data(op_data);
/* lease acquire/release; fd_lease_och is guarded by lli_och_mutex */
2991 case LL_IOC_SET_LEASE: {
2992 struct ll_inode_info *lli = ll_i2info(inode);
2993 struct obd_client_handle *och = NULL;
2998 case LL_LEASE_WRLCK:
2999 if (!(file->f_mode & FMODE_WRITE))
3001 fmode = FMODE_WRITE;
3003 case LL_LEASE_RDLCK:
3004 if (!(file->f_mode & FMODE_READ))
3008 case LL_LEASE_UNLCK:
3009 mutex_lock(&lli->lli_och_mutex);
3010 if (fd->fd_lease_och != NULL) {
3011 och = fd->fd_lease_och;
3012 fd->fd_lease_och = NULL;
3014 mutex_unlock(&lli->lli_och_mutex);
3019 fmode = och->och_flags;
3020 rc = ll_lease_close(och, inode, &lease_broken);
3024 rc = ll_lease_och_release(inode, file);
3031 RETURN(ll_lease_type_from_fmode(fmode));
3036 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3038 /* apply for lease */
3039 och = ll_lease_open(inode, file, fmode, 0);
3041 RETURN(PTR_ERR(och));
3044 mutex_lock(&lli->lli_och_mutex);
3045 if (fd->fd_lease_och == NULL) {
3046 fd->fd_lease_och = och;
3049 mutex_unlock(&lli->lli_och_mutex);
3051 /* impossible now that only excl is supported for now */
3052 ll_lease_close(och, inode, &lease_broken);
3057 case LL_IOC_GET_LEASE: {
3058 struct ll_inode_info *lli = ll_i2info(inode);
3059 struct ldlm_lock *lock = NULL;
3062 mutex_lock(&lli->lli_och_mutex);
3063 if (fd->fd_lease_och != NULL) {
3064 struct obd_client_handle *och = fd->fd_lease_och;
3066 lock = ldlm_handle2lock(&och->och_lease_handle);
3068 lock_res_and_lock(lock);
/* report the lease mode only while the lock is not being cancelled */
3069 if (!ldlm_is_cancel(lock))
3070 fmode = och->och_flags;
3072 unlock_res_and_lock(lock);
3073 LDLM_LOCK_PUT(lock);
3076 mutex_unlock(&lli->lli_och_mutex);
3078 RETURN(ll_lease_type_from_fmode(fmode));
3080 case LL_IOC_HSM_IMPORT: {
3081 struct hsm_user_import *hui;
3087 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3092 rc = ll_hsm_import(inode, file, hui);
3097 case LL_IOC_FUTIMES_3: {
3098 struct ll_futimes_3 lfu;
3100 if (copy_from_user(&lfu,
3101 (const struct ll_futimes_3 __user *)arg,
3105 RETURN(ll_file_futimes_3(file, &lfu));
/* ladvise: fixed header is read first to learn lah_count, then the
 * buffer is reallocated at the exact variable size and re-copied */
3107 case LL_IOC_LADVISE: {
3108 struct llapi_ladvise_hdr *k_ladvise_hdr;
3109 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3112 int alloc_size = sizeof(*k_ladvise_hdr);
3115 u_ladvise_hdr = (void __user *)arg;
3116 OBD_ALLOC_PTR(k_ladvise_hdr);
3117 if (k_ladvise_hdr == NULL)
3120 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3121 GOTO(out_ladvise, rc = -EFAULT);
3123 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3124 k_ladvise_hdr->lah_count < 1)
3125 GOTO(out_ladvise, rc = -EINVAL);
3127 num_advise = k_ladvise_hdr->lah_count;
3128 if (num_advise >= LAH_COUNT_MAX)
3129 GOTO(out_ladvise, rc = -EFBIG);
3131 OBD_FREE_PTR(k_ladvise_hdr);
3132 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3133 lah_advise[num_advise]);
3134 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3135 if (k_ladvise_hdr == NULL)
3139 * TODO: submit multiple advices to one server in a single RPC
3141 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3142 GOTO(out_ladvise, rc = -EFAULT);
3144 for (i = 0; i < num_advise; i++) {
3145 struct llapi_lu_ladvise *k_ladvise =
3146 &k_ladvise_hdr->lah_advise[i];
3147 struct llapi_lu_ladvise __user *u_ladvise =
3148 &u_ladvise_hdr->lah_advise[i];
3150 rc = ll_ladvise_sanity(inode, k_ladvise);
3152 GOTO(out_ladvise, rc);
3154 switch (k_ladvise->lla_advice) {
3155 case LU_LADVISE_LOCKNOEXPAND:
3156 rc = ll_lock_noexpand(file,
3157 k_ladvise->lla_peradvice_flags);
3158 GOTO(out_ladvise, rc);
3159 case LU_LADVISE_LOCKAHEAD:
3161 rc = ll_file_lock_ahead(file, k_ladvise);
3164 GOTO(out_ladvise, rc);
3167 &u_ladvise->lla_lockahead_result))
3168 GOTO(out_ladvise, rc = -EFAULT);
3171 rc = ll_ladvise(inode, file,
3172 k_ladvise_hdr->lah_flags,
3175 GOTO(out_ladvise, rc);
3182 OBD_FREE(k_ladvise_hdr, alloc_size);
3185 case LL_IOC_FSGETXATTR:
3186 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3187 case LL_IOC_FSSETXATTR:
3188 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3190 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3192 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3193 (void __user *)arg));
3197 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Local fallback for kernels without generic llseek-size support: validate
 * @offset against the sign convention and @maxsize, then commit it to
 * file->f_pos, resetting f_version so readdir-style users resync.
 */
3198 static inline loff_t
3199 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3201 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3203 if (offset > maxsize)
3206 if (offset != file->f_pos) {
3207 file->f_pos = offset;
3208 file->f_version = 0;
/*
 * Backport of the kernel's generic_file_llseek_size(): compute the new file
 * position for the given @origin (SEEK_SET/CUR/END/DATA/HOLE), bounded by
 * @maxsize, with @eof supplying the file size for the END/DATA/HOLE cases.
 * NOTE(review): the switch on @origin and several cases are on elided lines;
 * only the SEEK_CUR fast path and the DATA/HOLE commentary are visible here.
 */
3214 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3215 loff_t maxsize, loff_t eof)
3217 struct inode *inode = file_inode(file);
3225 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3226 * position-querying operation. Avoid rewriting the "same"
3227 * f_pos value back to the file because a concurrent read(),
3228 * write() or lseek() might have altered it
3233 * f_lock protects against read/modify/write race with other
3234 * SEEK_CURs. Note that parallel writes and reads behave
3238 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3239 inode_unlock(inode);
3243 * In the generic case the entire file is data, so as long as
3244 * offset isn't at the end of the file then the offset is data.
3251 * There is a virtual hole at the end of the file, so as long as
3252 * offset isn't i_size or larger, return i_size.
3260 return llseek_execute(file, offset, maxsize);
/*
 * llseek() entry point: for SEEK_END/SEEK_HOLE/SEEK_DATA a glimpse RPC is
 * issued first so i_size reflects the current OST state, then delegate the
 * actual position arithmetic to ll_generic_file_llseek_size().
 */
3264 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3266 struct inode *inode = file_inode(file);
3267 loff_t retval, eof = 0;
/* retval here is only the provisional target used for the trace message */
3270 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3271 (origin == SEEK_CUR) ? file->f_pos : 0);
3272 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3273 PFID(ll_inode2fid(inode)), inode, retval, retval,
3275 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3277 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3278 retval = ll_glimpse_size(inode);
3281 eof = i_size_read(inode);
3284 retval = ll_generic_file_llseek_size(file, offset, origin,
3285 ll_file_maxbytes(inode), eof);
/*
 * flush() handler (called on close(2)): does not force writeback itself,
 * only reports asynchronous write errors recorded earlier for this mapping.
 * Errors already reported through fd_write_failed are not reported twice.
 */
3289 static int ll_flush(struct file *file, fl_owner_t id)
3291 struct inode *inode = file_inode(file);
3292 struct ll_inode_info *lli = ll_i2info(inode);
3293 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3296 LASSERT(!S_ISDIR(inode->i_mode));
3298 /* catch async errors that were recorded back when async writeback
3299 * failed for pages in this mapping. */
3300 rc = lli->lli_async_rc;
3301 lli->lli_async_rc = 0;
3302 if (lli->lli_clob != NULL) {
3303 err = lov_read_and_clear_async_rc(lli->lli_clob);
3308 /* The application has been told write failure already.
3309 * Do not report failure again. */
3310 if (fd->fd_write_failed)
/* any recorded async error is collapsed to -EIO for the caller */
3312 return rc ? -EIO : 0;
3316 * Called to make sure a portion of file has been written out.
3317 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
3319 * Return how many pages have been written.
/*
 * Write out [start, end] of @inode through a CIT_FSYNC cl_io.  @mode selects
 * local flush vs OST_SYNC vs discard (see enum cl_fsync_mode); invalid modes
 * are rejected up front.  On success the number of pages written
 * (fi_nr_written) is returned instead of 0.
 */
3321 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3322 enum cl_fsync_mode mode, int ignore_layout)
3326 struct cl_fsync_io *fio;
3331 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3332 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3335 env = cl_env_get(&refcheck);
3337 RETURN(PTR_ERR(env));
3339 io = vvp_env_thread_io(env);
3340 io->ci_obj = ll_i2info(inode)->lli_clob;
3341 io->ci_ignore_layout = ignore_layout;
3343 /* initialize parameters for sync */
3344 fio = &io->u.ci_fsync;
3345 fio->fi_start = start;
3347 fio->fi_fid = ll_inode2fid(inode);
3348 fio->fi_mode = mode;
3349 fio->fi_nr_written = 0;
3351 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3352 result = cl_io_loop(env, io);
3354 result = io->ci_result;
/* report pages written on success */
3356 result = fio->fi_nr_written;
3357 cl_io_fini(env, io);
3358 cl_env_put(env, &refcheck);
3364 * When dentry is provided (the 'else' case), file_dentry() may be
3365 * null and dentry must be used directly rather than pulled from
3366 * file_dentry() as is done otherwise.
/*
 * fsync() entry point; the three #ifdef'd prototypes cover the kernel API
 * variants (4-arg range fsync, 2-arg, and the old dentry-taking form).
 * Sequence: flush/wait dirty pages, harvest recorded async write errors,
 * sync metadata via md_fsync(), then sync file data via cl_sync_file_range()
 * and update fd_write_failed accordingly.
 */
3369 #ifdef HAVE_FILE_FSYNC_4ARGS
3370 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3372 struct dentry *dentry = file_dentry(file);
3374 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3375 int ll_fsync(struct file *file, int datasync)
3377 struct dentry *dentry = file_dentry(file);
3379 loff_t end = LLONG_MAX;
3381 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3384 loff_t end = LLONG_MAX;
3386 struct inode *inode = dentry->d_inode;
3387 struct ll_inode_info *lli = ll_i2info(inode);
3388 struct ptlrpc_request *req;
3392 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3393 PFID(ll_inode2fid(inode)), inode);
3394 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3396 #ifdef HAVE_FILE_FSYNC_4ARGS
3397 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
/* avoid re-locking when the caller already holds the inode lock */
3398 lock_inode = !lli->lli_inode_locked;
3402 /* fsync's caller has already called _fdata{sync,write}, we want
3403 * that IO to finish before calling the osc and mdc sync methods */
3404 rc = filemap_fdatawait(inode->i_mapping);
3407 /* catch async errors that were recorded back when async writeback
3408 * failed for pages in this mapping. */
3409 if (!S_ISDIR(inode->i_mode)) {
3410 err = lli->lli_async_rc;
3411 lli->lli_async_rc = 0;
3414 if (lli->lli_clob != NULL) {
3415 err = lov_read_and_clear_async_rc(lli->lli_clob);
3421 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3425 ptlrpc_req_finished(req);
3427 if (S_ISREG(inode->i_mode)) {
3428 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3430 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3431 if (rc == 0 && err < 0)
3434 fd->fd_write_failed = true;
3436 fd->fd_write_failed = false;
3439 #ifdef HAVE_FILE_FSYNC_4ARGS
3441 inode_unlock(inode);
/*
 * flock()/fcntl() byte-range lock handler: translate the VFS file_lock into
 * an LDLM_FLOCK enqueue against the MDS, then mirror the result into the
 * local VFS lock state (locks_lock_file_wait or the older flock/posix
 * variants).  If the local step fails, the remote lock is rolled back with
 * an LCK_NL (unlock) enqueue.  NOTE(review): source view is elided; the
 * switch headers and some case labels are on lines not shown.
 */
3447 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3449 struct inode *inode = file_inode(file);
3450 struct ll_sb_info *sbi = ll_i2sbi(inode);
3451 struct ldlm_enqueue_info einfo = {
3452 .ei_type = LDLM_FLOCK,
3453 .ei_cb_cp = ldlm_flock_completion_ast,
3454 .ei_cbdata = file_lock,
3456 struct md_op_data *op_data;
3457 struct lustre_handle lockh = { 0 };
3458 union ldlm_policy_data flock = { { 0 } };
3459 int fl_type = file_lock->fl_type;
3465 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3466 PFID(ll_inode2fid(inode)), file_lock);
3468 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3470 if (file_lock->fl_flags & FL_FLOCK) {
3471 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3472 /* flocks are whole-file locks */
3473 flock.l_flock.end = OFFSET_MAX;
3474 /* For flocks owner is determined by the local file descriptor*/
3475 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3476 } else if (file_lock->fl_flags & FL_POSIX) {
3477 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3478 flock.l_flock.start = file_lock->fl_start;
3479 flock.l_flock.end = file_lock->fl_end;
3483 flock.l_flock.pid = file_lock->fl_pid;
3485 /* Somewhat ugly workaround for svc lockd.
3486 * lockd installs custom fl_lmops->lm_compare_owner that checks
3487 * for the fl_owner to be the same (which it always is on local node
3488 * I guess between lockd processes) and then compares pid.
3489 * As such we assign pid to the owner field to make it all work,
3490 * conflict with normal locks is unlikely since pid space and
3491 * pointer space for current->files are not intersecting */
3492 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3493 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
3497 einfo.ei_mode = LCK_PR;
3500 /* An unlock request may or may not have any relation to
3501 * existing locks so we may not be able to pass a lock handle
3502 * via a normal ldlm_lock_cancel() request. The request may even
3503 * unlock a byte range in the middle of an existing lock. In
3504 * order to process an unlock request we need all of the same
3505 * information that is given with a normal read or write record
3506 * lock request. To avoid creating another ldlm unlock (cancel)
3507 * message we'll treat a LCK_NL flock request as an unlock. */
3508 einfo.ei_mode = LCK_NL;
3511 einfo.ei_mode = LCK_PW;
3514 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
3529 flags = LDLM_FL_BLOCK_NOWAIT;
3535 flags = LDLM_FL_TEST_LOCK;
3538 CERROR("unknown fcntl lock command: %d\n", cmd);
3542 /* Save the old mode so that if the mode in the lock changes we
3543 * can decrement the appropriate reader or writer refcount. */
3544 file_lock->fl_type = einfo.ei_mode;
3546 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3547 LUSTRE_OPC_ANY, NULL);
3548 if (IS_ERR(op_data))
3549 RETURN(PTR_ERR(op_data));
3551 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
3552 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
3553 flock.l_flock.pid, flags, einfo.ei_mode,
3554 flock.l_flock.start, flock.l_flock.end);
3556 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
3559 /* Restore the file lock type if not TEST lock. */
3560 if (!(flags & LDLM_FL_TEST_LOCK))
3561 file_lock->fl_type = fl_type;
3563 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
3564 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
3565 !(flags & LDLM_FL_TEST_LOCK))
3566 rc2 = locks_lock_file_wait(file, file_lock);
3568 if ((file_lock->fl_flags & FL_FLOCK) &&
3569 (rc == 0 || file_lock->fl_type == F_UNLCK))
3570 rc2 = flock_lock_file_wait(file, file_lock);
3571 if ((file_lock->fl_flags & FL_POSIX) &&
3572 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3573 !(flags & LDLM_FL_TEST_LOCK))
3574 rc2 = posix_lock_file_wait(file, file_lock);
3575 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
/* local VFS bookkeeping failed: undo the server-side lock */
3577 if (rc2 && file_lock->fl_type != F_UNLCK) {
3578 einfo.ei_mode = LCK_NL;
3579 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
3584 ll_finish_md_op_data(op_data);
/*
 * Look up the FID of @name under @parent via a getattr-by-name RPC to the
 * MDS.  On success *fid is filled in; when @inode is non-NULL the reply is
 * additionally used to instantiate the inode via ll_prep_inode().
 */
3589 int ll_get_fid_by_name(struct inode *parent, const char *name,
3590 int namelen, struct lu_fid *fid,
3591 struct inode **inode)
3593 struct md_op_data *op_data = NULL;
3594 struct mdt_body *body;
3595 struct ptlrpc_request *req;
3599 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3600 LUSTRE_OPC_ANY, NULL);
3601 if (IS_ERR(op_data))
3602 RETURN(PTR_ERR(op_data));
/* only need the FID and file type back from the server */
3604 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
3605 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3606 ll_finish_md_op_data(op_data);
3610 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3612 GOTO(out_req, rc = -EFAULT);
3614 *fid = body->mbo_fid1;
3617 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
3619 ptlrpc_req_finished(req);
/*
 * Migrate directory entry @name under @parent to MDT @mdtidx (lfs migrate).
 * Resolves the child inode (dcache first, then getattr-by-name), refuses to
 * migrate the filesystem root, takes a write lease plus data version for
 * regular files so data stays consistent, and then issues an MDS rename with
 * CLI_MIGRATE/MDS_RENAME_MIGRATE.  Retried by the caller loop on -EAGAIN
 * when the layout changed mid-migration.
 */
3623 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3624 const char *name, int namelen)
3626 struct dentry *dchild = NULL;
3627 struct inode *child_inode = NULL;
3628 struct md_op_data *op_data;
3629 struct ptlrpc_request *request = NULL;
3630 struct obd_client_handle *och = NULL;
3632 struct mdt_body *body;
3634 __u64 data_version = 0;
3637 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3638 name, PFID(ll_inode2fid(parent)), mdtidx);
3640 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3641 0, LUSTRE_OPC_ANY, NULL);
3642 if (IS_ERR(op_data))
3643 RETURN(PTR_ERR(op_data));
3645 /* Get child FID first */
3646 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
3649 dchild = d_lookup(file_dentry(file), &qstr);
3650 if (dchild != NULL) {
3651 if (dchild->d_inode != NULL)
3652 child_inode = igrab(dchild->d_inode);
/* not in the dcache: ask the MDS for the FID/inode */
3656 if (child_inode == NULL) {
3657 rc = ll_get_fid_by_name(parent, name, namelen,
3658 &op_data->op_fid3, &child_inode);
3663 if (child_inode == NULL)
3664 GOTO(out_free, rc = -EINVAL);
3667 * lfs migrate command needs to be blocked on the client
3668 * by checking the migrate FID against the FID of the
3671 if (child_inode == parent->i_sb->s_root->d_inode)
3672 GOTO(out_iput, rc = -EINVAL);
3674 inode_lock(child_inode);
3675 op_data->op_fid3 = *ll_inode2fid(child_inode);
3676 if (!fid_is_sane(&op_data->op_fid3)) {
3677 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
3678 ll_get_fsname(parent->i_sb, NULL, 0), name,
3679 PFID(&op_data->op_fid3));
3680 GOTO(out_unlock, rc = -EINVAL);
3683 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3685 GOTO(out_unlock, rc);
/* nothing to do if the file already lives on the target MDT */
3688 CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name,
3689 PFID(&op_data->op_fid3), mdtidx);
3690 GOTO(out_unlock, rc = 0);
3693 if (S_ISREG(child_inode->i_mode)) {
3694 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
3698 GOTO(out_unlock, rc);
3701 rc = ll_data_version(child_inode, &data_version,
3704 GOTO(out_close, rc);
3706 op_data->op_handle = och->och_fh;
3707 op_data->op_data = och->och_mod;
3708 op_data->op_data_version = data_version;
3709 op_data->op_lease_handle = och->och_lease_handle;
3710 op_data->op_bias |= MDS_RENAME_MIGRATE;
3713 op_data->op_mds = mdtidx;
3714 op_data->op_cli_flags = CLI_MIGRATE;
/* migration is implemented server-side as a rename onto itself */
3715 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3716 namelen, name, namelen, &request);
3718 LASSERT(request != NULL);
3719 ll_update_times(request, parent);
3721 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
3722 LASSERT(body != NULL);
3724 /* If the server does release layout lock, then we cleanup
3725 * the client och here, otherwise release it in out_close: */
3727 body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
3728 obd_mod_put(och->och_mod);
3729 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
3731 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
3737 if (request != NULL) {
3738 ptlrpc_req_finished(request);
3742 /* Try again if the file layout has changed. */
3743 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
3747 if (och != NULL) /* close the file */
3748 ll_lease_close(och, child_inode, NULL);
3750 clear_nlink(child_inode);
3752 inode_unlock(child_inode);
3756 ll_finish_md_op_data(op_data);
3761 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3769 * test if some locks matching bits and l_req_mode are acquired
3770 * - bits can be in different locks
3771 * - if found clear the common lock bits in *bits
3772 * - the bits not found, are kept in *bits
3774 * \param bits [IN] searched lock bits
3775 * \param l_req_mode [IN] searched lock mode
3776 * \retval boolean, true iff all bits are found
/*
 * Test (without taking references) whether cached MDS inodebit locks cover
 * all bits in *bits at mode @l_req_mode; matched bits are cleared from
 * *bits, unmatched bits remain.  LCK_MINMODE means "any of CR/CW/PR/PW".
 * Uses LDLM_FL_TEST_LOCK so no lock state is changed.
 */
3778 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
3780 struct lustre_handle lockh;
3781 union ldlm_policy_data policy;
3782 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
3783 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
3792 fid = &ll_i2info(inode)->lli_fid;
3793 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3794 ldlm_lockname[mode]);
3796 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* probe each requested bit individually; stop once all bits matched */
3797 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3798 policy.l_inodebits.bits = *bits & (1 << i);
3799 if (policy.l_inodebits.bits == 0)
3802 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3803 &policy, mode, &lockh)) {
3804 struct ldlm_lock *lock;
3806 lock = ldlm_handle2lock(&lockh);
3809 ~(lock->l_policy_data.l_inodebits.bits);
3810 LDLM_LOCK_PUT(lock);
3812 *bits &= ~policy.l_inodebits.bits;
/*
 * Match (and take a reference on) a granted MDS inodebit lock for @bits at
 * @mode; on success the handle is returned through @lockh.  Returns the
 * matched mode (md_lock_match() result), 0 when no lock matched.
 */
3819 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
3820 struct lustre_handle *lockh, __u64 flags,
3821 enum ldlm_mode mode)
3823 union ldlm_policy_data policy = { .l_inodebits = { bits } };
3828 fid = &ll_i2info(inode)->lli_fid;
3829 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3831 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3832 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the result of a revalidation RPC: -ENOENT on a still-linked
 * striped directory triggers a re-validation instead of an error, other
 * errors are logged (EACCES/EIDRM at D_INFO only, since they are expected
 * under permission/identity changes).
 */
3837 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3839 /* Already unlinked. Just update nlink and return success */
3840 if (rc == -ENOENT) {
3842 /* If it is striped directory, and there is bad stripe
3843 * Let's revalidate the dentry again, instead of returning
3845 if (S_ISDIR(inode->i_mode) &&
3846 ll_i2info(inode)->lli_lsm_md != NULL)
3849 /* This path cannot be hit for regular files unless in
3850 * case of obscure races, so no need to validate
3852 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3854 } else if (rc != 0) {
3855 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3856 "%s: revalidate FID "DFID" error: rc = %d\n",
3857 ll_get_fsname(inode->i_sb, NULL, 0),
3858 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate @dentry's inode attributes against the MDS for lock bits
 * @ibits.  Two strategies: with OBD_CONNECT_ATTRFID, an intent getattr/
 * lookup by FID (which also refreshes dcache state); otherwise a plain
 * md_getattr() — but only if no matching MD lock is already cached locally.
 */
3864 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3866 struct inode *inode = dentry->d_inode;
3867 struct ptlrpc_request *req = NULL;
3868 struct obd_export *exp;
3872 LASSERT(inode != NULL);
3874 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3875 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3877 exp = ll_i2mdexp(inode);
3879 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3880 * But under CMD case, it caused some lock issues, should be fixed
3881 * with new CMD ibits lock. See bug 12718 */
3882 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3883 struct lookup_intent oit = { .it_op = IT_GETATTR };
3884 struct md_op_data *op_data;
3886 if (ibits == MDS_INODELOCK_LOOKUP)
3887 oit.it_op = IT_LOOKUP;
3889 /* Call getattr by fid, so do not provide name at all. */
3890 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3891 dentry->d_inode, NULL, 0, 0,
3892 LUSTRE_OPC_ANY, NULL);
3893 if (IS_ERR(op_data))
3894 RETURN(PTR_ERR(op_data));
3896 rc = md_intent_lock(exp, op_data, &oit, &req,
3897 &ll_md_blocking_ast, 0);
3898 ll_finish_md_op_data(op_data);
3900 rc = ll_inode_revalidate_fini(inode, rc);
3904 rc = ll_revalidate_it_finish(req, &oit, dentry);
3906 ll_intent_release(&oit);
3910 /* Unlinked? Unhash dentry, so it is not picked up later by
3911 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3912 here to preserve get_cwd functionality on 2.6.
3914 if (!dentry->d_inode->i_nlink) {
3915 ll_lock_dcache(inode);
3916 d_lustre_invalidate(dentry, 0);
3917 ll_unlock_dcache(inode);
3920 ll_lookup_finish_locks(&oit, dentry);
3921 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3922 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3923 u64 valid = OBD_MD_FLGETATTR;
3924 struct md_op_data *op_data;
/* regular files also need the EA (layout) sized buffer */
3927 if (S_ISREG(inode->i_mode)) {
3928 rc = ll_get_default_mdsize(sbi, &ealen);
3931 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3934 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3935 0, ealen, LUSTRE_OPC_ANY,
3937 if (IS_ERR(op_data))
3938 RETURN(PTR_ERR(op_data));
3940 op_data->op_valid = valid;
3941 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3942 ll_finish_md_op_data(op_data);
3944 rc = ll_inode_revalidate_fini(inode, rc);
3948 rc = ll_prep_inode(&inode, req, NULL, NULL);
3951 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge the per-stripe attributes (nlink, blocks,
 * size, a/m/ctime) from all MDT stripes via md_merge_attr() and store the
 * result in the inode / ll_inode_info.
 */
3955 static int ll_merge_md_attr(struct inode *inode)
3957 struct cl_attr attr = { 0 };
3960 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3961 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3962 &attr, ll_md_blocking_ast);
3966 set_nlink(inode, attr.cat_nlink);
3967 inode->i_blocks = attr.cat_blocks;
3968 i_size_write(inode, attr.cat_size);
3970 ll_i2info(inode)->lli_atime = attr.cat_atime;
3971 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
3972 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/*
 * Full revalidation wrapper: refresh MDS attributes via
 * __ll_inode_revalidate(), then for non-regular files merge striped-dir
 * attrs and copy cached timestamps into the inode; for regular files
 * glimpse the size from the OSTs unless an HSM restore is in progress.
 */
3978 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3980 struct inode *inode = dentry->d_inode;
3984 rc = __ll_inode_revalidate(dentry, ibits);
3988 /* if object isn't regular file, don't validate size */
3989 if (!S_ISREG(inode->i_mode)) {
3990 if (S_ISDIR(inode->i_mode) &&
3991 ll_i2info(inode)->lli_lsm_md != NULL) {
3992 rc = ll_merge_md_attr(inode);
3997 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
3998 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
3999 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
4001 /* In case of restore, the MDT has the right size and has
4002 * already send it back without granting the layout lock,
4003 * inode is up-to-date so glimpse is useless.
4004 * Also to glimpse we need the layout, in case of a running
4005 * restore the MDT holds the layout lock so the glimpse will
4006 * block up to the end of restore (getattr will block)
4008 if (!ll_file_test_flag(ll_i2info(inode), LLIF_FILE_RESTORING))
4009 rc = ll_glimpse_size(inode);
/*
 * Squash a dev_t so both major and minor fit in 8 bits, purely to satisfy
 * old_valid_dev() in the compat stat path (see comment below) — the result
 * is not a faithful device number.
 */
4014 static inline dev_t ll_compat_encode_dev(dev_t dev)
4016 /* The compat_sys_*stat*() syscalls will fail unless the
4017 * device majors and minors are both less than 256. Note that
4018 * the value returned here will be passed through
4019 * old_encode_dev() in cp_compat_stat(). And so we are not
4020 * trying to return a valid compat (u16) device number, just
4021 * one that will pass the old_valid_dev() check. */
4023 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
/*
 * ->getattr() for the VFS: revalidate the inode (UPDATE|LOOKUP lock bits)
 * and fill *stat from the (now fresh) inode fields. Two prototypes are
 * supported depending on kernel version (path-based vs mnt/dentry-based).
 * NOTE(review): excerpt is missing lines (#else/#endif pairs, res check,
 * RETURN) dropped during extraction — comments only, code untouched.
 */
4026 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4027 int ll_getattr(const struct path *path, struct kstat *stat,
4028 u32 request_mask, unsigned int flags)
4031 struct dentry *de = path->dentry;
4033 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4036 struct inode *inode = de->d_inode;
4037 struct ll_sb_info *sbi = ll_i2sbi(inode);
4038 struct ll_inode_info *lli = ll_i2info(inode);
/* Refresh attributes from the MDS/OSTs before reporting them. */
4041 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
4042 MDS_INODELOCK_LOOKUP);
4043 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
/* Fault-injection hook used by sanity tests to delay getattr. */
4048 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
/* 32-bit userspace: build a 32-bit-safe ino and squashed dev numbers. */
4050 if (ll_need_32bit_api(sbi)) {
4051 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4052 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4053 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4055 stat->ino = inode->i_ino;
4056 stat->dev = inode->i_sb->s_dev;
4057 stat->rdev = inode->i_rdev;
/* Remaining fields come straight from the revalidated inode. */
4060 stat->mode = inode->i_mode;
4061 stat->uid = inode->i_uid;
4062 stat->gid = inode->i_gid;
4063 stat->atime = inode->i_atime;
4064 stat->mtime = inode->i_mtime;
4065 stat->ctime = inode->i_ctime;
/* Admin-tunable stat blksize overrides the per-inode block size. */
4066 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4068 stat->nlink = inode->i_nlink;
4069 stat->size = i_size_read(inode);
4070 stat->blocks = inode->i_blocks;
/*
 * ->fiemap() handler: marshal the kernel's fiemap_extent_info into a
 * contiguous struct fiemap (header + extent array), run ll_do_fiemap(),
 * and copy the mapped extents back to the user buffer.
 * NOTE(review): excerpt is missing lines (rc/num_bytes declarations,
 * alloc-failure branch, final RETURN) — comments only, code untouched.
 */
4075 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4076 __u64 start, __u64 len)
4080 struct fiemap *fiemap;
4081 unsigned int extent_count = fieinfo->fi_extents_max;
/* Header plus one fiemap_extent per requested extent slot. */
4083 num_bytes = sizeof(*fiemap) + (extent_count *
4084 sizeof(struct fiemap_extent));
4085 OBD_ALLOC_LARGE(fiemap, num_bytes);
/* Fill the request header from the caller-supplied fieinfo. */
4090 fiemap->fm_flags = fieinfo->fi_flags;
4091 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4092 fiemap->fm_start = start;
4093 fiemap->fm_length = len;
/* Only the first user extent is read here — presumably to seed a
 * continuation (fe_flags) for restarted mappings; confirm against
 * ll_do_fiemap() before changing. */
4094 if (extent_count > 0 &&
4095 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4096 sizeof(struct fiemap_extent)) != 0)
4097 GOTO(out, rc = -EFAULT);
4099 rc = ll_do_fiemap(inode, fiemap, num_bytes);
/* Propagate results and copy all mapped extents back to userspace. */
4101 fieinfo->fi_flags = fiemap->fm_flags;
4102 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4103 if (extent_count > 0 &&
4104 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4105 fiemap->fm_mapped_extents *
4106 sizeof(struct fiemap_extent)) != 0)
4107 GOTO(out, rc = -EFAULT);
4109 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * ->get_acl(): return a referenced copy of the cached POSIX ACL.
 * The lli_lock protects lli_posix_acl, which is refreshed from the MDS
 * elsewhere; the caller (VFS permission check) drops the reference.
 */
4113 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4115 struct ll_inode_info *lli = ll_i2info(inode);
4116 struct posix_acl *acl = NULL;
4119 spin_lock(&lli->lli_lock);
4120 /* VFS' acl_permission_check->check_acl will release the refcount */
4121 acl = posix_acl_dup(lli->lli_posix_acl);
4122 spin_unlock(&lli->lli_lock);
/*
 * ->set_acl(): store a POSIX ACL as the matching system. xattr on the MDS.
 * ACCESS ACLs may also rewrite the file mode (posix_acl_update_mode);
 * DEFAULT ACLs are only valid on directories. On success the local ACL
 * cache is updated, otherwise the cached entry is dropped.
 * NOTE(review): excerpt is missing lines (rc/size/value declarations,
 * break statements, kfree/out label) — comments only, code untouched.
 */
4127 #ifdef HAVE_IOP_SET_ACL
4128 #ifdef CONFIG_FS_POSIX_ACL
4129 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4131 const char *name = NULL;
4138 case ACL_TYPE_ACCESS:
/* May clear the ACL and fold it into i_mode instead. */
4140 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4144 name = XATTR_NAME_POSIX_ACL_ACCESS;
4146 case ACL_TYPE_DEFAULT:
/* Default ACLs only make sense on directories. */
4147 if (!S_ISDIR(inode->i_mode))
4148 GOTO(out, rc = acl ? -EACCES : 0);
4149 name = XATTR_NAME_POSIX_ACL_DEFAULT;
4152 GOTO(out, rc = -EINVAL);
/* Serialize the ACL into xattr form for the setxattr RPC. */
4156 size = posix_acl_xattr_size(acl->a_count);
4157 value = kmalloc(size, GFP_NOFS);
4159 GOTO(out, rc = -ENOMEM);
4161 rc = posix_acl_to_xattr(&init_user_ns, acl, value, size);
4166 /* dentry is only used for *.lov attributes so it's safe to be NULL */
4167 rc = __vfs_setxattr(NULL, inode, name, value, size, XATTR_CREATE);
/* Keep the VFS ACL cache coherent with the outcome of the RPC. */
4172 set_cached_acl(inode, type, acl);
4174 forget_cached_acl(inode, type);
4177 #endif /* CONFIG_FS_POSIX_ACL */
4178 #endif /* HAVE_IOP_SET_ACL */
/*
 * ACL callback for generic_permission() on kernels where the generic
 * helper takes a check_acl function. Looks up the cached ACL and runs
 * posix_acl_permission() against @mask; RCU walks bail out early since
 * the lookup may block.
 * NOTE(review): excerpt is missing lines (return type line, rc handling,
 * EAGAIN returns, #else branches) — comments only, code untouched.
 */
4180 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4182 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4183 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4185 ll_check_acl(struct inode *inode, int mask)
4188 # ifdef CONFIG_FS_POSIX_ACL
4189 struct posix_acl *acl;
/* Cannot take spinlocks/sleep under RCU-walk; let VFS retry ref-walk. */
4193 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4194 if (flags & IPERM_FLAG_RCU)
4197 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4202 rc = posix_acl_permission(inode, acl, mask);
4203 posix_acl_release(acl);
4206 # else /* !CONFIG_FS_POSIX_ACL */
4208 # endif /* CONFIG_FS_POSIX_ACL */
4210 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * ->permission(): revalidate the root inode if needed, apply root-squash
 * (temporarily overriding fsuid/fsgid and dropping FS capabilities via
 * override_creds) when the caller is root and squashing is configured,
 * then defer to generic_permission()/ll_check_acl.
 * NOTE(review): excerpt is missing lines (rc declaration, ECHILD returns
 * for RCU walk, cred failure handling, put_cred, RETURN) — comments only.
 */
4212 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4213 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4215 # ifdef HAVE_INODE_PERMISION_2ARGS
4216 int ll_inode_permission(struct inode *inode, int mask)
4218 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4223 struct ll_sb_info *sbi;
4224 struct root_squash_info *squash;
4225 struct cred *cred = NULL;
4226 const struct cred *old_cred = NULL;
4228 bool squash_id = false;
/* RCU-walk cannot block on the revalidate RPC below; bail out early. */
4231 #ifdef MAY_NOT_BLOCK
4232 if (mask & MAY_NOT_BLOCK)
4234 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4235 if (flags & IPERM_FLAG_RCU)
4239 /* as root inode are NOT getting validated in lookup operation,
4240 * need to do it before permission check. */
4242 if (inode == inode->i_sb->s_root->d_inode) {
4243 rc = __ll_inode_revalidate(inode->i_sb->s_root,
4244 MDS_INODELOCK_LOOKUP);
4249 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4250 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4252 /* squash fsuid/fsgid if needed */
4253 sbi = ll_i2sbi(inode);
4254 squash = &sbi->ll_squash;
/* Squash only when configured, caller is root, and not exempted. */
4255 if (unlikely(squash->rsi_uid != 0 &&
4256 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4257 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4261 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4262 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4263 squash->rsi_uid, squash->rsi_gid);
4265 /* update current process's credentials
4266 * and FS capability */
4267 cred = prepare_creds();
4271 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4272 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* Drop all filesystem-related capabilities from the squashed creds. */
4273 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4274 if ((1 << cap) & CFS_CAP_FS_MASK)
4275 cap_lower(cred->cap_effective, cap);
4277 old_cred = override_creds(cred);
4280 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4281 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4282 /* restore current process's credentials and FS capability */
4284 revert_creds(old_cred);
4291 /* -o localflock - only provides locally consistent flock locks */
/* Default file_operations: no cluster-wide flock/lockf support. */
4292 struct file_operations ll_file_operations = {
4293 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4294 # ifdef HAVE_SYNC_READ_WRITE
4295 .read = new_sync_read,
4296 .write = new_sync_write,
4298 .read_iter = ll_file_read_iter,
4299 .write_iter = ll_file_write_iter,
4300 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4301 .read = ll_file_read,
4302 .aio_read = ll_file_aio_read,
4303 .write = ll_file_write,
4304 .aio_write = ll_file_aio_write,
4305 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4306 .unlocked_ioctl = ll_file_ioctl,
4307 .open = ll_file_open,
4308 .release = ll_file_release,
4309 .mmap = ll_file_mmap,
4310 .llseek = ll_file_seek,
4311 .splice_read = ll_file_splice_read,
/* -o flock: same as the default ops plus cluster-coherent
 * flock/posix lock support via ll_file_flock. */
4316 struct file_operations ll_file_operations_flock = {
4317 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4318 # ifdef HAVE_SYNC_READ_WRITE
4319 .read = new_sync_read,
4320 .write = new_sync_write,
4321 # endif /* HAVE_SYNC_READ_WRITE */
4322 .read_iter = ll_file_read_iter,
4323 .write_iter = ll_file_write_iter,
4324 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4325 .read = ll_file_read,
4326 .aio_read = ll_file_aio_read,
4327 .write = ll_file_write,
4328 .aio_write = ll_file_aio_write,
4329 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4330 .unlocked_ioctl = ll_file_ioctl,
4331 .open = ll_file_open,
4332 .release = ll_file_release,
4333 .mmap = ll_file_mmap,
4334 .llseek = ll_file_seek,
4335 .splice_read = ll_file_splice_read,
/* Distributed locking entry points — what distinguishes this table. */
4338 .flock = ll_file_flock,
4339 .lock = ll_file_flock
4342 /* These are for -o noflock - to return ENOSYS on flock calls */
4343 struct file_operations ll_file_operations_noflock = {
4344 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4345 # ifdef HAVE_SYNC_READ_WRITE
4346 .read = new_sync_read,
4347 .write = new_sync_write,
4348 # endif /* HAVE_SYNC_READ_WRITE */
4349 .read_iter = ll_file_read_iter,
4350 .write_iter = ll_file_write_iter,
4351 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4352 .read = ll_file_read,
4353 .aio_read = ll_file_aio_read,
4354 .write = ll_file_write,
4355 .aio_write = ll_file_aio_write,
4356 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4357 .unlocked_ioctl = ll_file_ioctl,
4358 .open = ll_file_open,
4359 .release = ll_file_release,
4360 .mmap = ll_file_mmap,
4361 .llseek = ll_file_seek,
4362 .splice_read = ll_file_splice_read,
/* Locking ops wired to ll_file_noflock so userspace gets an error. */
4365 .flock = ll_file_noflock,
4366 .lock = ll_file_noflock
/* inode_operations for regular files; xattr/ACL entries are gated on
 * kernel-feature macros detected at configure time. */
4369 struct inode_operations ll_file_inode_operations = {
4370 .setattr = ll_setattr,
4371 .getattr = ll_getattr,
4372 .permission = ll_inode_permission,
4373 #ifdef HAVE_IOP_XATTR
4374 .setxattr = ll_setxattr,
4375 .getxattr = ll_getxattr,
4376 .removexattr = ll_removexattr,
4378 .listxattr = ll_listxattr,
4379 .fiemap = ll_fiemap,
4380 #ifdef HAVE_IOP_GET_ACL
4381 .get_acl = ll_get_acl,
4383 #ifdef HAVE_IOP_SET_ACL
4384 .set_acl = ll_set_acl,
/*
 * Push a layout configuration down to the cl_object stack for @inode.
 * For OBJECT_CONF_SET the DLM layout lock is allowed to match only after
 * the layout is applied, and the cached layout generation is refreshed.
 * NOTE(review): excerpt is missing lines (env/refcheck declarations,
 * error gotos, RETURN) dropped in extraction — comments only.
 */
4388 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4390 struct ll_inode_info *lli = ll_i2info(inode);
4391 struct cl_object *obj = lli->lli_clob;
4400 env = cl_env_get(&refcheck);
4402 RETURN(PTR_ERR(env));
/* Hand the new configuration to the cl_object layer. */
4404 rc = cl_conf_set(env, lli->lli_clob, conf);
4408 if (conf->coc_opc == OBJECT_CONF_SET) {
4409 struct ldlm_lock *lock = conf->coc_lock;
4410 struct cl_layout cl = {
4414 LASSERT(lock != NULL);
4415 LASSERT(ldlm_has_layout(lock));
4417 /* it can only be allowed to match after layout is
4418 * applied to inode otherwise false layout would be
4419 * seen. Applying layout shoud happen before dropping
4420 * the intent lock. */
4421 ldlm_lock_allow_match(lock);
/* Read back the generation just applied and cache it in lli. */
4423 rc = cl_object_layout_get(env, obj, &cl);
4428 DFID": layout version change: %u -> %u\n",
4429 PFID(&lli->lli_fid), ll_layout_version_get(lli),
4431 ll_layout_version_set(lli, cl.cl_layout_gen);
4435 cl_env_put(env, &refcheck);
4440 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
/*
 * If the layout lock's LVB buffer is empty (lock granted via completion
 * AST), fetch the LOV EA from the MDS via getxattr and attach it to the
 * lock as its LVB. Frees the local copy if another thread raced us in.
 * NOTE(review): excerpt is missing lines (lmm/lvbdata/lmmsize/rc decls,
 * several error gotos, RETURNs) — comments only, code untouched.
 */
4441 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4444 struct ll_sb_info *sbi = ll_i2sbi(inode);
4445 struct ptlrpc_request *req;
4446 struct mdt_body *body;
4453 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4454 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4455 lock->l_lvb_data, lock->l_lvb_len);
/* Layout already attached to the lock — nothing to fetch. */
4457 if (lock->l_lvb_data != NULL)
4460 /* if layout lock was granted right away, the layout is returned
4461 * within DLM_LVB of dlm reply; otherwise if the lock was ever
4462 * blocked and then granted via completion ast, we have to fetch
4463 * layout here. Please note that we can't use the LVB buffer in
4464 * completion AST because it doesn't have a large enough buffer */
4465 rc = ll_get_default_mdsize(sbi, &lmmsize);
4467 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4468 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
4473 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4475 GOTO(out, rc = -EPROTO);
4477 lmmsize = body->mbo_eadatasize;
4478 if (lmmsize == 0) /* empty layout */
4481 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4483 GOTO(out, rc = -EFAULT);
/* Copy the EA out of the RPC buffer; the lock may outlive the reply. */
4485 OBD_ALLOC_LARGE(lvbdata, lmmsize);
4486 if (lvbdata == NULL)
4487 GOTO(out, rc = -ENOMEM);
4489 memcpy(lvbdata, lmm, lmmsize);
/* Attach under the lock's resource lock; lose the race gracefully. */
4490 lock_res_and_lock(lock);
4491 if (unlikely(lock->l_lvb_data == NULL)) {
4492 lock->l_lvb_type = LVB_T_LAYOUT;
4493 lock->l_lvb_data = lvbdata;
4494 lock->l_lvb_len = lmmsize;
4497 unlock_res_and_lock(lock);
/* Another thread installed an LVB first — free our copy. */
4500 OBD_FREE_LARGE(lvbdata, lmmsize);
4505 ptlrpc_req_finished(req);
4510 * Apply the layout to the inode. Layout lock is held and will be released
/*
 * Fetch the layout (if needed), install it into the cl_object stack, and
 * release the layout lock reference. If the old layout is still busy
 * (-EBUSY), issue OBJECT_CONF_WAIT and block until outstanding IO drains.
 * NOTE(review): excerpt is missing lines (rc/lvb_ready declarations,
 * "if (lvb_ready) GOTO" path, out label, RETURNs) — comments only.
 */
4513 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4514 struct inode *inode)
4516 struct ll_inode_info *lli = ll_i2info(inode);
4517 struct ll_sb_info *sbi = ll_i2sbi(inode);
4518 struct ldlm_lock *lock;
4519 struct cl_object_conf conf;
4522 bool wait_layout = false;
4525 LASSERT(lustre_handle_is_used(lockh));
4527 lock = ldlm_handle2lock(lockh);
4528 LASSERT(lock != NULL);
4529 LASSERT(ldlm_has_layout(lock));
4531 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4532 PFID(&lli->lli_fid), inode);
4534 /* in case this is a caching lock and reinstate with new inode */
4535 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
4537 lock_res_and_lock(lock);
4538 lvb_ready = ldlm_is_lvb_ready(lock);
4539 unlock_res_and_lock(lock);
4541 /* checking lvb_ready is racy but this is okay. The worst case is
4542 * that multi processes may configure the file on the same time. */
/* Make sure the layout blob is attached to the lock's LVB. */
4546 rc = ll_layout_fetch(inode, lock);
4550 /* for layout lock, lmm is stored in lock's lvb.
4551 * lvb_data is immutable if the lock is held so it's safe to access it
4554 * set layout to file. Unlikely this will fail as old layout was
4555 * surely eliminated */
4556 memset(&conf, 0, sizeof conf);
4557 conf.coc_opc = OBJECT_CONF_SET;
4558 conf.coc_inode = inode;
4559 conf.coc_lock = lock;
4560 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
4561 conf.u.coc_layout.lb_len = lock->l_lvb_len;
4562 rc = ll_layout_conf(inode, &conf);
4564 /* refresh layout failed, need to wait */
4565 wait_layout = rc == -EBUSY;
/* Done with the lock: drop our reference and the enqueue ref. */
4568 LDLM_LOCK_PUT(lock);
4569 ldlm_lock_decref(lockh, mode);
4571 /* wait for IO to complete if it's still being used. */
4573 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
4574 ll_get_fsname(inode->i_sb, NULL, 0),
4575 PFID(&lli->lli_fid), inode);
/* OBJECT_CONF_WAIT blocks until users of the old layout finish. */
4577 memset(&conf, 0, sizeof conf);
4578 conf.coc_opc = OBJECT_CONF_WAIT;
4579 conf.coc_inode = inode;
4580 rc = ll_layout_conf(inode, &conf);
4584 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4585 ll_get_fsname(inode->i_sb, NULL, 0),
4586 PFID(&lli->lli_fid), rc);
4592 * Issue layout intent RPC to MDS.
4593 * \param inode [in] file inode
4594 * \param intent [in] layout intent
4596 * \retval 0 on success
4597 * \retval < 0 error code
/*
 * Builds an IT_LAYOUT lookup intent (FMODE_WRITE for write/truncate
 * intents) and enqueues it via md_intent_lock; on success the returned
 * lock is attached to the inode.
 * NOTE(review): excerpt is missing lines (rc declaration, rc==0 guard
 * before ll_set_lock_data, RETURN) — comments only, code untouched.
 */
4599 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
4601 struct ll_inode_info *lli = ll_i2info(inode);
4602 struct ll_sb_info *sbi = ll_i2sbi(inode);
4603 struct md_op_data *op_data;
4604 struct lookup_intent it;
4605 struct ptlrpc_request *req;
4609 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4610 0, 0, LUSTRE_OPC_ANY, NULL);
4611 if (IS_ERR(op_data))
4612 RETURN(PTR_ERR(op_data));
/* The layout_intent payload rides inside the op_data. */
4614 op_data->op_data = intent;
4615 op_data->op_data_size = sizeof(*intent);
4617 memset(&it, 0, sizeof(it));
4618 it.it_op = IT_LAYOUT;
/* Write-type intents need a write-mode layout lock. */
4619 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
4620 intent->li_opc == LAYOUT_INTENT_TRUNC)
4621 it.it_flags = FMODE_WRITE;
4623 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4624 ll_get_fsname(inode->i_sb, NULL, 0),
4625 PFID(&lli->lli_fid), inode);
4627 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
4628 &ll_md_blocking_ast, 0);
/* The intent reply request is no longer needed once processed. */
4629 if (it.it_request != NULL)
4630 ptlrpc_req_finished(it.it_request);
4631 it.it_request = NULL;
4633 ll_finish_md_op_data(op_data);
4635 /* set lock data in case this is a new lock */
4637 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4639 ll_intent_drop_lock(&it);
4645 * This function checks if there exists a LAYOUT lock on the client side,
4646 * or enqueues it if it doesn't have one in cache.
4648 * This function will not hold layout lock so it may be revoked any time after
4649 * this function returns. Any operations depend on layout should be redone
4652 * This function should be called before lov_io_init() to get an uptodate
4653 * layout version, the caller should save the version number and after IO
4654 * is finished, this function should be called again to verify that layout
4655 * is not changed during IO time.
/*
 * NOTE(review): excerpt is missing lines (rc declaration, retry loop that
 * presumably re-enters the match after ll_layout_intent, unlock on the
 * cached-hit path, RETURN) — comments only, code untouched.
 */
4657 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4659 struct ll_inode_info *lli = ll_i2info(inode);
4660 struct ll_sb_info *sbi = ll_i2sbi(inode);
4661 struct lustre_handle lockh;
4662 struct layout_intent intent = {
4663 .li_opc = LAYOUT_INTENT_ACCESS,
4665 enum ldlm_mode mode;
/* Fast path: layout lock disabled, or a generation is already cached. */
4669 *gen = ll_layout_version_get(lli);
4670 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
4674 LASSERT(fid_is_sane(ll_inode2fid(inode)));
4675 LASSERT(S_ISREG(inode->i_mode));
4677 /* take layout lock mutex to enqueue layout lock exclusively. */
4678 mutex_lock(&lli->lli_layout_mutex);
4681 /* mostly layout lock is caching on the local side, so try to
4682 * match it before grabbing layout lock mutex. */
4683 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4684 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
4685 if (mode != 0) { /* hit cached lock */
/* Cached lock found: apply its layout and release it. */
4686 rc = ll_layout_lock_set(&lockh, mode, inode);
/* No cached lock: enqueue a fresh one with an ACCESS intent. */
4692 rc = ll_layout_intent(inode, &intent);
4698 *gen = ll_layout_version_get(lli);
4699 mutex_unlock(&lli->lli_layout_mutex);
4705 * Issue layout intent RPC indicating where in a file an IO is about to write.
4707 * \param[in] inode file inode.
4708 * \param[in] start start offset of fille in bytes where an IO is about to
4710 * \param[in] end exclusive end offset in bytes of the write range.
4712 * \retval 0 on success
4713 * \retval < 0 error code
/*
 * NOTE(review): excerpt is missing lines (extent initializers for
 * start/end, rc declaration, RETURN) — comments only, code untouched.
 */
4715 int ll_layout_write_intent(struct inode *inode, __u64 start, __u64 end)
4717 struct layout_intent intent = {
4718 .li_opc = LAYOUT_INTENT_WRITE,
/* Delegates to the generic intent helper with a WRITE opcode. */
4725 rc = ll_layout_intent(inode, &intent);
4731 * This function send a restore request to the MDT
4733 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4735 struct hsm_user_request *hur;
4739 len = sizeof(struct hsm_user_request) +
4740 sizeof(struct hsm_user_item);
4741 OBD_ALLOC(hur, len);
4745 hur->hur_request.hr_action = HUA_RESTORE;
4746 hur->hur_request.hr_archive_id = 0;
4747 hur->hur_request.hr_flags = 0;
4748 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4749 sizeof(hur->hur_user_item[0].hui_fid));
4750 hur->hur_user_item[0].hui_extent.offset = offset;
4751 hur->hur_user_item[0].hui_extent.length = length;
4752 hur->hur_request.hr_itemcount = 1;
4753 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,