4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2017, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
49 #include <uapi/linux/lustre/lustre_ioctl.h>
50 #include <lustre_swab.h>
52 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
57 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
59 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
/*
 * Allocate a per-open ll_file_data from its slab cache.
 * GFP_NOFS is used because this runs on the file-open path where
 * re-entering the filesystem during reclaim must be avoided.
 * NOTE(review): NULL-check and return lines are elided in this excerpt.
 */
62 static struct ll_file_data *ll_file_data_get(void)
64 struct ll_file_data *fd;
66 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
70 fd->fd_write_failed = false;
/* Release a ll_file_data back to its slab cache (counterpart of
 * ll_file_data_get()). */
75 static void ll_file_data_put(struct ll_file_data *fd)
78 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Snapshot the inode's current attributes (mode, times, size, blocks,
 * flags) into @op_data so the CLOSE RPC carries them to the MDT, and
 * record the open handle being closed.
 */
82 * Packs all the attributes into @op_data for the CLOSE rpc.
84 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
85 struct obd_client_handle *och)
89 ll_prep_md_op_data(op_data, inode, NULL, NULL,
90 0, 0, LUSTRE_OPC_ANY, NULL);
/* Copy the cached VFS inode attributes into the close request. */
92 op_data->op_attr.ia_mode = inode->i_mode;
93 op_data->op_attr.ia_atime = inode->i_atime;
94 op_data->op_attr.ia_mtime = inode->i_mtime;
95 op_data->op_attr.ia_ctime = inode->i_ctime;
96 op_data->op_attr.ia_size = i_size_read(inode);
/* Mark every copied attribute valid so the MDT applies them all. */
97 op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
98 ATTR_MTIME | ATTR_MTIME_SET |
99 ATTR_CTIME | ATTR_CTIME_SET;
100 op_data->op_attr_blocks = inode->i_blocks;
101 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
102 op_data->op_handle = och->och_fh;
/* Only a write-mode handle can have dirtied data; the flag is
 * test-and-clear so the MDT is told at most once per modification. */
104 if (och->och_flags & FMODE_WRITE &&
105 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
106 /* For HSM: if inode data has been modified, pack it so that
107 * MDT can set data dirty flag in the archive. */
108 op_data->op_bias |= MDS_DATA_MODIFIED;
/*
 * Close an MDS open handle, optionally executing a close "intent"
 * (HSM release, layout swap/merge, resync-done) in the same RPC.
 * The interpretation of @data depends on @bias — see the cases below.
 * NOTE(review): the enclosing switch statement, breaks and the
 * error/return paths are elided in this excerpt.
 */
114 * Perform a close, possibly with a bias.
115 * The meaning of "data" depends on the value of "bias".
117 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
118 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
121 static int ll_close_inode_openhandle(struct inode *inode,
122 struct obd_client_handle *och,
123 enum mds_op_bias bias, void *data)
125 struct obd_export *md_exp = ll_i2mdexp(inode);
126 const struct ll_inode_info *lli = ll_i2info(inode);
127 struct md_op_data *op_data;
128 struct ptlrpc_request *req = NULL;
/* No live MDC connection: nothing we can send the close to. */
132 if (class_exp2obd(md_exp) == NULL) {
133 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
134 ll_get_fsname(inode->i_sb, NULL, 0),
135 PFID(&lli->lli_fid));
139 OBD_ALLOC_PTR(op_data);
140 /* We leak openhandle and request here on error, but not much to be
141 * done in OOM case since app won't retry close on error either. */
143 GOTO(out, rc = -ENOMEM);
145 ll_prepare_close(inode, op_data, och);
/* MERGE: victim inode's blocks are added, then the SWAP setup below
 * is shared. NOTE(review): this looks like an intentional fallthrough
 * into MDS_CLOSE_LAYOUT_SWAP — confirm against the full source. */
147 case MDS_CLOSE_LAYOUT_MERGE:
148 /* merge blocks from the victim inode */
149 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
150 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
151 case MDS_CLOSE_LAYOUT_SWAP:
152 LASSERT(data != NULL);
153 op_data->op_bias |= bias;
154 op_data->op_data_version = 0;
155 op_data->op_lease_handle = och->och_lease_handle;
/* For swap/merge, @data is the second inode; send its FID too. */
156 op_data->op_fid2 = *ll_inode2fid(data);
/* RESYNC_DONE: @data is a ll_ioc_lease carrying the mirror ids. */
159 case MDS_CLOSE_RESYNC_DONE: {
160 struct ll_ioc_lease *ioc = data;
162 LASSERT(data != NULL);
163 op_data->op_attr_blocks +=
164 ioc->lil_count * op_data->op_attr_blocks;
165 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
166 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
168 op_data->op_lease_handle = och->och_lease_handle;
169 op_data->op_data = &ioc->lil_ids[0];
170 op_data->op_data_size =
171 ioc->lil_count * sizeof(ioc->lil_ids[0]);
/* HSM release: @data is the expected data version of the file. */
175 case MDS_HSM_RELEASE:
176 LASSERT(data != NULL);
177 op_data->op_bias |= MDS_HSM_RELEASE;
178 op_data->op_data_version = *(__u64 *)data;
179 op_data->op_lease_handle = och->och_lease_handle;
180 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* Plain close (no bias): no auxiliary data expected. */
184 LASSERT(data == NULL);
188 rc = md_close(md_exp, op_data, och->och_mod, &req);
/* -EINTR is a normal interruption, not worth a console error. */
189 if (rc != 0 && rc != -EINTR)
190 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
191 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
/* If a biased close succeeded, verify the server actually executed
 * the close intent (flag in the reply body). */
193 if (rc == 0 && op_data->op_bias & bias) {
194 struct mdt_body *body;
196 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
197 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
201 ll_finish_md_op_data(op_data);
/* Handle is dead from now on; poison it against reuse. */
205 md_clear_open_replay_data(md_exp, och);
206 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
209 ptlrpc_req_finished(req); /* This is close request */
/*
 * Really close the MDS open handle for the given open mode, but only
 * when no other local users of that handle remain. Handles are kept
 * per-mode (write / exec / read) on the ll_inode_info.
 * NOTE(review): the lines that detach *och_p under the mutex are
 * elided in this excerpt.
 */
213 int ll_md_real_close(struct inode *inode, fmode_t fmode)
215 struct ll_inode_info *lli = ll_i2info(inode);
216 struct obd_client_handle **och_p;
217 struct obd_client_handle *och;
/* Select the handle slot and use count matching the open mode. */
222 if (fmode & FMODE_WRITE) {
223 och_p = &lli->lli_mds_write_och;
224 och_usecount = &lli->lli_open_fd_write_count;
225 } else if (fmode & FMODE_EXEC) {
226 och_p = &lli->lli_mds_exec_och;
227 och_usecount = &lli->lli_open_fd_exec_count;
229 LASSERT(fmode & FMODE_READ);
230 och_p = &lli->lli_mds_read_och;
231 och_usecount = &lli->lli_open_fd_read_count;
234 mutex_lock(&lli->lli_och_mutex);
/* Other local opens still reference this handle: keep it. */
235 if (*och_usecount > 0) {
236 /* There are still users of this handle, so skip
238 mutex_unlock(&lli->lli_och_mutex);
244 mutex_unlock(&lli->lli_och_mutex);
247 /* There might be a race and this handle may already
249 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/*
 * Per-file-descriptor close: drop group lock and lease if the
 * application left them behind, decrement the per-mode open count,
 * and talk to the MDS only when no cached OPEN lock lets us skip it.
 * Frees the ll_file_data at the end.
 */
255 static int ll_md_close(struct inode *inode, struct file *file)
257 union ldlm_policy_data policy = {
258 .l_inodebits = { MDS_INODELOCK_OPEN },
/* TEST_LOCK: only probe for a matching granted lock, don't take it. */
260 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
261 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
262 struct ll_inode_info *lli = ll_i2info(inode);
263 struct lustre_handle lockh;
264 enum ldlm_mode lockmode;
268 /* clear group lock, if present */
269 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
270 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
272 if (fd->fd_lease_och != NULL) {
275 /* Usually the lease is not released when the
276 * application crashed, we need to release here. */
277 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
278 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
279 PFID(&lli->lli_fid), rc, lease_broken);
281 fd->fd_lease_och = NULL;
/* fd_och holds ownership of the MDS handle taken for a lease;
 * close it directly. */
284 if (fd->fd_och != NULL) {
285 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
290 /* Let's see if we have good enough OPEN lock on the file and if
291 we can skip talking to MDS */
292 mutex_lock(&lli->lli_och_mutex);
/* Drop this descriptor's reference on the per-mode open count.
 * NOTE(review): the lines setting 'lockmode' per branch are elided. */
293 if (fd->fd_omode & FMODE_WRITE) {
295 LASSERT(lli->lli_open_fd_write_count);
296 lli->lli_open_fd_write_count--;
297 } else if (fd->fd_omode & FMODE_EXEC) {
299 LASSERT(lli->lli_open_fd_exec_count);
300 lli->lli_open_fd_exec_count--;
303 LASSERT(lli->lli_open_fd_read_count);
304 lli->lli_open_fd_read_count--;
306 mutex_unlock(&lli->lli_och_mutex);
/* No cached OPEN ibits lock -> must really close at the MDS. */
308 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
309 LDLM_IBITS, &policy, lockmode, &lockh))
310 rc = ll_md_real_close(inode, fd->fd_omode);
313 LUSTRE_FPRIVATE(file) = NULL;
314 ll_file_data_put(fd);
319 /* While this returns an error code, fput() the caller does not, so we need
320 * to make every effort to clean up all of our state here. Also, applications
321 * rarely check close errors and even if an error is returned they will not
322 * re-try the close call.
/* VFS ->release() hook: tears down statahead authorization, clears
 * async write errors cached on the cl_object, and performs the MDS
 * close via ll_md_close(). The root dentry is special-cased (no
 * stats, no MDS close). */
324 int ll_file_release(struct inode *inode, struct file *file)
326 struct ll_file_data *fd;
327 struct ll_sb_info *sbi = ll_i2sbi(inode);
328 struct ll_inode_info *lli = ll_i2info(inode);
332 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
333 PFID(ll_inode2fid(inode)), inode);
/* Don't count releases of the filesystem root in the stats. */
335 if (inode->i_sb->s_root != file_dentry(file))
336 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
337 fd = LUSTRE_FPRIVATE(file);
340 /* The last ref on @file, maybe not the the owner pid of statahead,
341 * because parent and child process can share the same file handle. */
342 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
343 ll_deauthorize_statahead(inode, fd);
/* Root dentry: just free fd, no MDS close needed. */
345 if (inode->i_sb->s_root == file_dentry(file)) {
346 LUSTRE_FPRIVATE(file) = NULL;
347 ll_file_data_put(fd);
/* Fold any asynchronous write error into this close's return. */
351 if (!S_ISDIR(inode->i_mode)) {
352 if (lli->lli_clob != NULL)
353 lov_read_and_clear_async_rc(lli->lli_clob);
354 lli->lli_async_rc = 0;
357 rc = ll_md_close(inode, file);
359 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
360 libcfs_debug_dumplog();
/*
 * Issue an intent-OPEN enqueue to the MDS for @de. The name is only
 * packed when the server lacks open-by-fid support and the dentry name
 * is valid; otherwise the FID alone identifies the target. On success
 * the reply is used to (re)initialize the inode and its lock data.
 */
365 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
366 struct lookup_intent *itp)
368 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
369 struct dentry *parent = de->d_parent;
370 const char *name = NULL;
372 struct md_op_data *op_data;
373 struct ptlrpc_request *req = NULL;
377 LASSERT(parent != NULL);
378 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
380 /* if server supports open-by-fid, or file name is invalid, don't pack
381 * name in open request */
382 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
383 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
384 name = de->d_name.name;
385 len = de->d_name.len;
388 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
389 name, len, 0, LUSTRE_OPC_ANY, NULL);
391 RETURN(PTR_ERR(op_data));
/* Pass the caller's striping request (may be NULL/0). */
392 op_data->op_data = lmm;
393 op_data->op_data_size = lmmsize;
395 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
396 &ll_md_blocking_ast, 0);
397 ll_finish_md_op_data(op_data);
399 /* reason for keep own exit path - don`t flood log
400 * with messages with -ESTALE errors.
/* If the enqueue failed or the open wasn't granted, release any
 * handle the server may still have created for us. */
402 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
403 it_open_error(DISP_OPEN_OPEN, itp))
405 ll_release_openhandle(de, itp);
409 if (it_disposition(itp, DISP_LOOKUP_NEG))
410 GOTO(out, rc = -ENOENT);
412 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
413 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
414 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* Refresh the inode from the reply; attach lock data if granted. */
418 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
419 if (!rc && itp->it_lock_mode)
420 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
423 ptlrpc_req_finished(req);
424 ll_intent_drop_lock(itp);
426 /* We did open by fid, but by the time we got to the server,
427 * the object disappeared. If this is a create, we cannot really
428 * tell the userspace that the file it was trying to create
429 * does not exist. Instead let's return -ESTALE, and the VFS will
430 * retry the create with LOOKUP_REVAL that we are going to catch
431 * in ll_revalidate_dentry() and use lookup then.
433 if (rc == -ENOENT && itp->it_op & IT_CREAT)
/*
 * Populate an obd_client_handle from the MDT reply carried in @it:
 * server-side open handle, FID, lease-lock cookie, magic and the open
 * flags, then register the handle for open replay on recovery.
 */
439 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
440 struct obd_client_handle *och)
442 struct mdt_body *body;
444 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
445 och->och_fh = body->mbo_handle;
446 och->och_fid = body->mbo_fid1;
447 och->och_lease_handle.cookie = it->it_lock_handle;
448 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
449 och->och_flags = it->it_flags;
451 return md_set_open_replay_data(md_exp, och, it);
/*
 * Finish the client-local part of an open: optionally fill @och from
 * the intent reply, then attach @fd as the file's private data and
 * initialize its readahead state, open mode and cl-context fields.
 */
454 static int ll_local_open(struct file *file, struct lookup_intent *it,
455 struct ll_file_data *fd, struct obd_client_handle *och)
457 struct inode *inode = file_inode(file);
460 LASSERT(!LUSTRE_FPRIVATE(file));
467 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
472 LUSTRE_FPRIVATE(file) = fd;
473 ll_readahead_init(inode, &fd->fd_ras);
/* Remember only the access-mode bits of the open flags. */
474 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
476 /* ll_cl_context initialize */
477 rwlock_init(&fd->fd_lock);
478 INIT_LIST_HEAD(&fd->fd_lccs);
483 /* Open a file, and (for the very first open) create objects on the OSTs at
484 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
485 * creation or open until ll_lov_setstripe() ioctl is called.
487 * If we already have the stripe MD locally then we don't request it in
488 * md_open(), by passing a lmm_size = 0.
490 * It is up to the application to ensure no other processes open this file
491 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
492 * used. We might be able to avoid races of that sort by getting lli_open_sem
493 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
494 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/* VFS ->open() hook. Reuses an existing per-mode MDS open handle when
 * one is cached, otherwise performs an intent open to the MDS.
 * NOTE(review): several control-flow lines (loop/restart, och_usecount
 * increments, some braces) are elided in this excerpt. */
496 int ll_file_open(struct inode *inode, struct file *file)
498 struct ll_inode_info *lli = ll_i2info(inode);
499 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
500 .it_flags = file->f_flags };
501 struct obd_client_handle **och_p = NULL;
502 __u64 *och_usecount = NULL;
503 struct ll_file_data *fd;
507 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
508 PFID(ll_inode2fid(inode)), inode, file->f_flags);
/* An intent may have been stashed by the lookup path. */
510 it = file->private_data; /* XXX: compat macro */
511 file->private_data = NULL; /* prevent ll_local_open assertion */
513 fd = ll_file_data_get();
515 GOTO(out_openerr, rc = -ENOMEM);
518 if (S_ISDIR(inode->i_mode))
519 ll_authorize_statahead(inode, fd);
/* Root dentry: attach fd and skip the MDS open machinery. */
521 if (inode->i_sb->s_root == file_dentry(file)) {
522 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent from lookup: build our own IT_OPEN intent. */
526 if (!it || !it->it_disposition) {
527 /* Convert f_flags into access mode. We cannot use file->f_mode,
528 * because everything but O_ACCMODE mask was stripped from
530 if ((oit.it_flags + 1) & O_ACCMODE)
532 if (file->f_flags & O_TRUNC)
533 oit.it_flags |= FMODE_WRITE;
535 /* kernel only call f_op->open in dentry_open. filp_open calls
536 * dentry_open after call to open_namei that checks permissions.
537 * Only nfsd_open call dentry_open directly without checking
538 * permissions and because of that this code below is safe. */
539 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
540 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
542 /* We do not want O_EXCL here, presumably we opened the file
543 * already? XXX - NFS implications? */
544 oit.it_flags &= ~O_EXCL;
546 /* bug20584, if "it_flags" contains O_CREAT, the file will be
547 * created if necessary, then "IT_CREAT" should be set to keep
548 * consistent with it */
549 if (oit.it_flags & O_CREAT)
550 oit.it_op |= IT_CREAT;
556 /* Let's see if we have file open on MDS already. */
557 if (it->it_flags & FMODE_WRITE) {
558 och_p = &lli->lli_mds_write_och;
559 och_usecount = &lli->lli_open_fd_write_count;
560 } else if (it->it_flags & FMODE_EXEC) {
561 och_p = &lli->lli_mds_exec_och;
562 och_usecount = &lli->lli_open_fd_exec_count;
564 och_p = &lli->lli_mds_read_och;
565 och_usecount = &lli->lli_open_fd_read_count;
568 mutex_lock(&lli->lli_och_mutex);
569 if (*och_p) { /* Open handle is present */
570 if (it_disposition(it, DISP_OPEN_OPEN)) {
571 /* Well, there's extra open request that we do not need,
572 let's close it somehow. This will decref request. */
573 rc = it_open_error(DISP_OPEN_OPEN, it);
575 mutex_unlock(&lli->lli_och_mutex);
576 GOTO(out_openerr, rc);
579 ll_release_openhandle(file_dentry(file), it);
/* Reuse the cached handle: local open only, no och needed. */
583 rc = ll_local_open(file, it, fd, NULL);
586 mutex_unlock(&lli->lli_och_mutex);
587 GOTO(out_openerr, rc);
/* No cached handle: must obtain one from the MDS. */
590 LASSERT(*och_usecount == 0);
591 if (!it->it_disposition) {
592 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
593 /* We cannot just request lock handle now, new ELC code
594 means that one of other OPEN locks for this file
595 could be cancelled, and since blocking ast handler
596 would attempt to grab och_mutex as well, that would
597 result in a deadlock */
598 mutex_unlock(&lli->lli_och_mutex);
600 * Normally called under two situations:
602 * 2. A race/condition on MDS resulting in no open
603 * handle to be returned from LOOKUP|OPEN request,
604 * for example if the target entry was a symlink.
606 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
607 * marked by a bit set in ll_iget_for_nfs. Clear the
608 * bit so that it's not confusing later callers.
610 * NB; when ldd is NULL, it must have come via normal
611 * lookup path only, since ll_iget_for_nfs always calls
614 if (ldd && ldd->lld_nfs_dentry) {
615 ldd->lld_nfs_dentry = 0;
616 it->it_flags |= MDS_OPEN_LOCK;
620 * Always specify MDS_OPEN_BY_FID because we don't want
621 * to get file with different fid.
623 it->it_flags |= MDS_OPEN_BY_FID;
624 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
627 GOTO(out_openerr, rc);
631 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
633 GOTO(out_och_free, rc = -ENOMEM);
637 /* md_intent_lock() didn't get a request ref if there was an
638 * open error, so don't do cleanup on the request here
640 /* XXX (green): Should not we bail out on any error here, not
641 * just open error? */
642 rc = it_open_error(DISP_OPEN_OPEN, it);
644 GOTO(out_och_free, rc);
646 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
647 "inode %p: disposition %x, status %d\n", inode,
648 it_disposition(it, ~0), it->it_status);
650 rc = ll_local_open(file, it, fd, *och_p);
652 GOTO(out_och_free, rc);
654 mutex_unlock(&lli->lli_och_mutex);
657 /* Must do this outside lli_och_mutex lock to prevent deadlock where
658 different kind of OPEN lock for this same inode gets cancelled
659 by ldlm_cancel_lru */
660 if (!S_ISREG(inode->i_mode))
661 GOTO(out_och_free, rc);
663 cl_lov_delay_create_clear(&file->f_flags);
664 GOTO(out_och_free, rc);
/* Error path: free a half-initialized handle slot. */
668 if (och_p && *och_p) {
669 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
670 *och_p = NULL; /* OBD_FREE writes some magic there */
673 mutex_unlock(&lli->lli_och_mutex);
676 if (lli->lli_opendir_key == fd)
677 ll_deauthorize_statahead(inode, fd);
679 ll_file_data_put(fd);
681 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* Drop the intent's request reference if the enqueue took one. */
684 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
685 ptlrpc_req_finished(it->it_request);
686 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for lease locks: on a blocking callback, cancel the
 * lease lock asynchronously. Unlike ll_md_blocking_ast, this does not
 * handle open handles (see LDLM_FL_EXCL usage in ll_lease_open()).
 * NOTE(review): switch header, CANCELING body and return are elided.
 */
692 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
693 struct ldlm_lock_desc *desc, void *data, int flag)
696 struct lustre_handle lockh;
700 case LDLM_CB_BLOCKING:
701 ldlm_lock2handle(lock, &lockh);
702 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
704 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
708 case LDLM_CB_CANCELING:
716 * When setting a lease on a file, we take ownership of the lli_mds_*_och
717 * and save it as fd->fd_och so as to force client to reopen the file even
718 * if it has an open lock in cache already.
/* Transfer the per-mode MDS open handle from the inode to this file
 * descriptor and return its server handle in @old_handle, so the
 * subsequent lease open can identify the same owner. Fails with
 * -EBUSY if a lease is already held or the handle is shared.
 * NOTE(review): the lines that actually move *och_p into fd->fd_och
 * are elided in this excerpt. */
720 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
721 struct lustre_handle *old_handle)
723 struct ll_inode_info *lli = ll_i2info(inode);
724 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
725 struct obd_client_handle **och_p;
730 /* Get the openhandle of the file */
731 mutex_lock(&lli->lli_och_mutex);
732 if (fd->fd_lease_och != NULL)
733 GOTO(out_unlock, rc = -EBUSY);
735 if (fd->fd_och == NULL) {
736 if (file->f_mode & FMODE_WRITE) {
737 LASSERT(lli->lli_mds_write_och != NULL);
738 och_p = &lli->lli_mds_write_och;
739 och_usecount = &lli->lli_open_fd_write_count;
741 LASSERT(lli->lli_mds_read_och != NULL);
742 och_p = &lli->lli_mds_read_och;
743 och_usecount = &lli->lli_open_fd_read_count;
/* Handle shared by more than one descriptor: cannot take it. */
746 if (*och_usecount > 1)
747 GOTO(out_unlock, rc = -EBUSY);
754 *old_handle = fd->fd_och->och_fh;
758 mutex_unlock(&lli->lli_och_mutex);
763 * Release ownership on lli_mds_*_och when putting back a file lease.
/* Return the fd-owned open handle to the inode's per-mode slot, or —
 * if another process re-opened the file meanwhile — close the old
 * handle and just bump the existing slot's use count.
 * NOTE(review): the lines that store fd->fd_och back into *och_p and
 * increment *och_usecount are elided in this excerpt. */
765 static int ll_lease_och_release(struct inode *inode, struct file *file)
767 struct ll_inode_info *lli = ll_i2info(inode);
768 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
769 struct obd_client_handle **och_p;
770 struct obd_client_handle *old_och = NULL;
775 mutex_lock(&lli->lli_och_mutex);
776 if (file->f_mode & FMODE_WRITE) {
777 och_p = &lli->lli_mds_write_och;
778 och_usecount = &lli->lli_open_fd_write_count;
780 och_p = &lli->lli_mds_read_och;
781 och_usecount = &lli->lli_open_fd_read_count;
784 /* The file may have been open by another process (broken lease) so
785 * *och_p is not NULL. In this case we should simply increase usecount
788 if (*och_p != NULL) {
789 old_och = fd->fd_och;
796 mutex_unlock(&lli->lli_och_mutex);
/* Close the superseded handle outside the mutex. */
799 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
805 * Acquire a lease and open the file.
/* Take a read or write lease on @inode via an intent open carrying
 * MDS_OPEN_LEASE. Returns the new obd_client_handle, or an ERR_PTR.
 * The lease lock is kept out of the LRU and marked exclusive so the
 * normal open path cannot match it (see comment at the enqueue).
 * NOTE(review): och allocation, rc checks, and several braces are
 * elided in this excerpt. */
807 static struct obd_client_handle *
808 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
811 struct lookup_intent it = { .it_op = IT_OPEN };
812 struct ll_sb_info *sbi = ll_i2sbi(inode);
813 struct md_op_data *op_data;
814 struct ptlrpc_request *req = NULL;
815 struct lustre_handle old_handle = { 0 };
816 struct obd_client_handle *och = NULL;
/* Exactly one of read/write lease mode is allowed. */
821 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
822 RETURN(ERR_PTR(-EINVAL));
/* Lease mode must be covered by the file's open mode; exec excluded. */
825 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
826 RETURN(ERR_PTR(-EPERM));
828 rc = ll_lease_och_acquire(inode, file, &old_handle);
835 RETURN(ERR_PTR(-ENOMEM));
837 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
838 LUSTRE_OPC_ANY, NULL);
840 GOTO(out, rc = PTR_ERR(op_data));
842 /* To tell the MDT this openhandle is from the same owner */
843 op_data->op_handle = old_handle;
845 it.it_flags = fmode | open_flags;
846 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
847 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
848 &ll_md_blocking_lease_ast,
849 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
850 * it can be cancelled which may mislead applications that the lease is
852 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
853 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
854 * doesn't deal with openhandle, so normal openhandle will be leaked. */
855 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
856 ll_finish_md_op_data(op_data);
857 ptlrpc_req_finished(req);
859 GOTO(out_release_it, rc);
861 if (it_disposition(&it, DISP_LOOKUP_NEG))
862 GOTO(out_release_it, rc = -ENOENT);
864 rc = it_open_error(DISP_OPEN_OPEN, &it);
866 GOTO(out_release_it, rc);
868 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
869 ll_och_fill(sbi->ll_md_exp, &it, och);
/* Server that predates leases returns no lease disposition. */
871 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
872 GOTO(out_close, rc = -EOPNOTSUPP);
874 /* already get lease, handle lease lock */
875 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
876 if (it.it_lock_mode == 0 ||
877 it.it_lock_bits != MDS_INODELOCK_OPEN) {
878 /* open lock must return for lease */
879 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
880 PFID(ll_inode2fid(inode)), it.it_lock_mode,
882 GOTO(out_close, rc = -EPROTO);
885 ll_intent_release(&it);
889 /* Cancel open lock */
890 if (it.it_lock_mode != 0) {
891 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
894 och->och_lease_handle.cookie = 0ULL;
896 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
898 CERROR("%s: error closing file "DFID": %d\n",
899 ll_get_fsname(inode->i_sb, NULL, 0),
900 PFID(&ll_i2info(inode)->lli_fid), rc2);
901 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
903 ll_intent_release(&it);
911 * Check whether a layout swap can be done between two inodes.
913 * \param[in] inode1 First inode to check
914 * \param[in] inode2 Second inode to check
916 * \retval 0 on success, layout swap can be performed between both inodes
917 * \retval negative error code if requirements are not met
/* Both must be regular files, writable by the caller, and on the same
 * filesystem. NOTE(review): the error-return lines are elided. */
919 static int ll_check_swap_layouts_validity(struct inode *inode1,
920 struct inode *inode2)
922 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
925 if (inode_permission(inode1, MAY_WRITE) ||
926 inode_permission(inode2, MAY_WRITE))
929 if (inode1->i_sb != inode2->i_sb)
/*
 * Close @och with a layout-swap or layout-merge bias against @inode2,
 * after validating that the two inodes are eligible and distinct.
 * The och is consumed by ll_close_inode_openhandle() on the main path.
 * NOTE(review): the switch header and out_free_och label are elided.
 */
935 static int ll_swap_layouts_close(struct obd_client_handle *och,
936 struct inode *inode, struct inode *inode2,
939 const struct lu_fid *fid1 = ll_inode2fid(inode);
940 const struct lu_fid *fid2;
941 enum mds_op_bias bias;
945 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
946 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
948 rc = ll_check_swap_layouts_validity(inode, inode2);
950 GOTO(out_free_och, rc);
952 /* We now know that inode2 is a lustre inode */
953 fid2 = ll_inode2fid(inode2);
/* Swapping a file with itself makes no sense. */
955 rc = lu_fid_cmp(fid1, fid2);
957 GOTO(out_free_och, rc = -EINVAL);
960 case SWAP_LAYOUTS_CLOSE:
961 bias = MDS_CLOSE_LAYOUT_SWAP;
963 case MERGE_LAYOUTS_CLOSE:
964 bias = MDS_CLOSE_LAYOUT_MERGE;
967 GOTO(out_free_och, rc = -EOPNOTSUPP);
970 /* Close the file and {swap,merge} layouts between inode & inode2.
971 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
972 * because we still need it to pack l_remote_handle to MDT. */
973 rc = ll_close_inode_openhandle(inode, och, bias, inode2);
975 och = NULL; /* freed in ll_close_inode_openhandle() */
985 * Release lease and close the file.
986 * It will check if the lease has ever broken.
/* Determine whether the lease lock was already cancelled (broken);
 * report that via @lease_broken, cancel it ourselves when still held
 * and no intent is requested, and finally close the open handle —
 * skipping the intent when the lease was broken. */
988 static int ll_lease_close_intent(struct obd_client_handle *och,
990 bool *lease_broken, enum mds_op_bias bias,
993 struct ldlm_lock *lock;
994 bool cancelled = true;
998 lock = ldlm_handle2lock(&och->och_lease_handle);
1000 lock_res_and_lock(lock);
1001 cancelled = ldlm_is_cancel(lock);
1002 unlock_res_and_lock(lock);
1003 LDLM_LOCK_PUT(lock);
1006 CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1007 PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1009 if (lease_broken != NULL)
1010 *lease_broken = cancelled;
/* Still-valid lease with no intent: cancel the lock ourselves. */
1012 if (!cancelled && !bias)
1013 ldlm_cli_cancel(&och->och_lease_handle, 0);
1015 if (cancelled) { /* no need to excute intent */
1020 rc = ll_close_inode_openhandle(inode, och, bias, data);
/* Plain lease close: intent-less wrapper around the above. */
1024 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1027 return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1031 * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
/* Flush and invalidate the client's cached data first (the layout
 * version will change), then ask the MDT to start mirror resync on
 * the file identified by the lease handle. */
1033 static int ll_lease_file_resync(struct obd_client_handle *och,
1034 struct inode *inode)
1036 struct ll_sb_info *sbi = ll_i2sbi(inode);
1037 struct md_op_data *op_data;
1038 __u64 data_version_unused;
1042 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1043 LUSTRE_OPC_ANY, NULL);
1044 if (IS_ERR(op_data))
1045 RETURN(PTR_ERR(op_data));
1047 /* before starting file resync, it's necessary to clean up page cache
1048 * in client memory, otherwise once the layout version is increased,
1049 * writing back cached data will be denied the OSTs. */
1050 rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1054 op_data->op_handle = och->och_lease_handle;
1055 rc = md_file_resync(sbi->ll_md_exp, op_data);
1061 ll_finish_md_op_data(op_data);
/*
 * Merge MDS-provided inode attributes with OST-provided object
 * attributes (size, blocks, times), keeping the newest timestamps.
 * Runs under the inode size lock; -ENODATA from the cl_object (no
 * layout) is treated as success.
 */
1065 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1067 struct ll_inode_info *lli = ll_i2info(inode);
1068 struct cl_object *obj = lli->lli_clob;
1069 struct cl_attr *attr = vvp_env_thread_attr(env);
1077 ll_inode_size_lock(inode);
1079 /* Merge timestamps the most recently obtained from MDS with
1080 * timestamps obtained from OSTs.
1082 * Do not overwrite atime of inode because it may be refreshed
1083 * by file_accessed() function. If the read was served by cache
1084 * data, there is no RPC to be sent so that atime may not be
1085 * transferred to OSTs at all. MDT only updates atime at close time
1086 * if it's at least 'mdd.*.atime_diff' older.
1087 * All in all, the atime in Lustre does not strictly comply with
1088 * POSIX. Solving this problem needs to send an RPC to MDT for each
1089 * read, this will hurt performance. */
1090 if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
1091 LTIME_S(inode->i_atime) = lli->lli_atime;
1092 lli->lli_update_atime = 0;
1094 LTIME_S(inode->i_mtime) = lli->lli_mtime;
1095 LTIME_S(inode->i_ctime) = lli->lli_ctime;
/* Local copies to merge against the OST-side attributes below. */
1097 atime = LTIME_S(inode->i_atime);
1098 mtime = LTIME_S(inode->i_mtime);
1099 ctime = LTIME_S(inode->i_ctime);
1101 cl_object_attr_lock(obj);
1102 if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1105 rc = cl_object_attr_get(env, obj, attr);
1106 cl_object_attr_unlock(obj);
/* -ENODATA: no OST objects yet, nothing to merge — not an error. */
1109 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
/* Keep whichever timestamp is newer, MDS or OST. */
1111 if (atime < attr->cat_atime)
1112 atime = attr->cat_atime;
1114 if (ctime < attr->cat_ctime)
1115 ctime = attr->cat_ctime;
1117 if (mtime < attr->cat_mtime)
1118 mtime = attr->cat_mtime;
1120 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1121 PFID(&lli->lli_fid), attr->cat_size);
1123 i_size_write(inode, attr->cat_size);
1124 inode->i_blocks = attr->cat_blocks;
1126 LTIME_S(inode->i_atime) = atime;
1127 LTIME_S(inode->i_mtime) = mtime;
1128 LTIME_S(inode->i_ctime) = ctime;
1131 ll_inode_size_unlock(inode);
1137 * Set designated mirror for I/O.
1139 * So far only read, write, and truncated can support to issue I/O to
1140 * designated mirror.
/* Copy the fd's designated-mirror settings (if any) into the cl_io,
 * clearing any stale layout version first. Mirrored (FLR) I/O also
 * disables parallel I/O since the mirror id can't be propagated. */
1142 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1144 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1146 /* clear layout version for generic(non-resync) I/O in case it carries
1147 * stale layout version due to I/O restart */
1148 io->ci_layout_version = 0;
1150 /* FLR: disable non-delay for designated mirror I/O because obviously
1151 * only one mirror is available */
1152 if (fd->fd_designated_mirror > 0) {
1154 io->ci_designated_mirror = fd->fd_designated_mirror;
1155 io->ci_layout_version = fd->fd_layout_version;
1156 io->ci_pio = 0; /* doesn't have a mechanism to pass mirror
1160 CDEBUG(D_VFSTRACE, "%s: desiginated mirror: %d\n",
1161 file->f_path.dentry->d_name.name, io->ci_designated_mirror);
/*
 * Decide whether atime updates should be suppressed for this file,
 * checking the open flags, inode flags, mount flags and superblock
 * flags, mirroring the kernel's file_accessed()/touch_atime() logic.
 * NOTE(review): the 'return true/false' lines are elided.
 */
1164 static bool file_is_noatime(const struct file *file)
1166 const struct vfsmount *mnt = file->f_path.mnt;
1167 const struct inode *inode = file_inode((struct file *)file);
1169 /* Adapted from file_accessed() and touch_atime().*/
1170 if (file->f_flags & O_NOATIME)
1173 if (inode->i_flags & S_NOATIME)
1176 if (IS_NOATIME(inode))
1179 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1182 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1185 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1191 static int ll_file_io_ptask(struct cfs_ptask *ptask);
/*
 * Initialize a cl_io for a read or write on @file: set up the kiocb,
 * lock policy (never for nolock files, mandatory for O_APPEND),
 * noatime, parallel-I/O eligibility, FLR non-delay and mirror fields.
 */
1193 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1195 struct inode *inode = file_inode(file);
1196 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1198 memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter));
1199 init_sync_kiocb(&io->u.ci_rw.rw_iocb, file);
1200 io->u.ci_rw.rw_file = file;
1201 io->u.ci_rw.rw_ptask = ll_file_io_ptask;
1202 io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK);
1203 io->ci_lock_no_expand = fd->ll_lock_no_expand;
/* Write-only kiocb settings (append / sync semantics). */
1205 if (iot == CIT_WRITE) {
1206 io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND);
1207 io->u.ci_rw.rw_sync = !!(file->f_flags & O_SYNC ||
1208 file->f_flags & O_DIRECT ||
1211 io->ci_obj = ll_i2info(inode)->lli_clob;
1212 io->ci_lockreq = CILR_MAYBE;
1213 if (ll_file_nolock(file)) {
1214 io->ci_lockreq = CILR_NEVER;
1215 io->ci_no_srvlock = 1;
1216 } else if (file->f_flags & O_APPEND) {
1217 io->ci_lockreq = CILR_MANDATORY;
1219 io->ci_noatime = file_is_noatime(file);
/* Parallel I/O, but append must stay serialized. */
1220 if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO)
1221 io->ci_pio = !io->u.ci_rw.rw_append;
1225 /* FLR: only use non-delay I/O for read as there is only one
1226 * avaliable mirror for write. */
1227 io->ci_ndelay = !(iot == CIT_WRITE);
1229 ll_io_set_mirror(io, file);
/*
 * Parallel-task worker: run one sub-range of a split read/write inside a
 * cfs_ptask.  Rebuilds a cl_io from the saved cl_io_pt state (with
 * ci_pio cleared so it is not split again), runs the I/O loop, and
 * accumulates progress (result, advanced iter, updated iocb position)
 * back into the cl_io_pt.  Returns 0 if any bytes were transferred,
 * otherwise the cl_io error.
 * NOTE(review): listing elides some original lines (error paths, labels);
 * code kept verbatim.
 */
1232 static int ll_file_io_ptask(struct cfs_ptask *ptask)
1234 struct cl_io_pt *pt = ptask->pt_cbdata;
1235 struct file *file = pt->cip_file;
1238 loff_t pos = pt->cip_pos;
1243 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1244 file_dentry(file)->d_name.name,
1245 pt->cip_iot == CIT_READ ? "read" : "write",
1246 pos, pos + pt->cip_count);
1248 env = cl_env_get(&refcheck);
1250 RETURN(PTR_ERR(env));
1252 io = vvp_env_thread_io(env);
1253 ll_io_init(io, file, pt->cip_iot);
1254 io->u.ci_rw.rw_iter = pt->cip_iter;
1255 io->u.ci_rw.rw_iocb = pt->cip_iocb;
1256 io->ci_pio = 0; /* It's already in parallel task */
/* restart from where the previous pass stopped */
1258 rc = cl_io_rw_init(env, io, pt->cip_iot, pos,
1259 pt->cip_count - pt->cip_result);
1261 struct vvp_io *vio = vvp_env_io(env);
1263 vio->vui_io_subtype = IO_NORMAL;
1264 vio->vui_fd = LUSTRE_FPRIVATE(file);
1266 ll_cl_add(file, env, io, LCC_RW);
1267 rc = cl_io_loop(env, io);
1268 ll_cl_remove(file, env);
1270 /* cl_io_rw_init() handled IO */
1274 if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) {
1280 if (io->ci_nob > 0) {
1281 pt->cip_result += io->ci_nob;
1282 iov_iter_advance(&pt->cip_iter, io->ci_nob);
1284 pt->cip_iocb.ki_pos = pos;
1285 #ifdef HAVE_KIOCB_KI_LEFT
1286 pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result;
1287 #elif defined(HAVE_KI_NBYTES)
1288 pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result;
1292 cl_io_fini(env, io);
1293 cl_env_put(env, &refcheck);
/* propagate restart request (e.g. layout change) to the parent I/O */
1295 pt->cip_need_restart = io->ci_need_restart;
1297 CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
1298 file_dentry(file)->d_name.name,
1299 pt->cip_iot == CIT_READ ? "read" : "write",
1300 pt->cip_result, rc);
1302 RETURN(pt->cip_result > 0 ? 0 : rc);
/*
 * Common engine for all llite read/write variants (normal, splice).
 * Builds a cl_io, takes the per-inode range lock for writes (and for
 * O_DIRECT reads, see LU-6227), runs cl_io_loop(), and restarts the
 * whole I/O when the layout changed underneath (io->ci_need_restart),
 * preserving the FLR retry count across restarts.  Also updates
 * read/write stats and fd_write_failed for fsync error reporting.
 * NOTE(review): listing elides some original lines (loop labels, some
 * branches); code kept verbatim.
 */
1306 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1307 struct file *file, enum cl_io_type iot,
1308 loff_t *ppos, size_t count)
1310 struct range_lock range;
1311 struct vvp_io *vio = vvp_env_io(env);
1312 struct inode *inode = file_inode(file);
1313 struct ll_inode_info *lli = ll_i2info(inode);
1314 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1319 unsigned retried = 0;
1320 bool restarted = false;
1324 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1325 file_dentry(file)->d_name.name,
1326 iot == CIT_READ ? "read" : "write", pos, pos + count);
1329 io = vvp_env_thread_io(env);
1330 ll_io_init(io, file, iot);
1331 if (args->via_io_subtype == IO_NORMAL) {
1332 io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
1333 io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;
1335 if (args->via_io_subtype != IO_NORMAL || restarted)
1337 io->ci_ndelay_tried = retried;
1339 if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
1340 bool range_locked = false;
/* O_APPEND writes must lock to EOF since pos is decided server-side */
1342 if (file->f_flags & O_APPEND)
1343 range_lock_init(&range, 0, LUSTRE_EOF);
1345 range_lock_init(&range, pos, pos + count - 1);
1347 vio->vui_fd = LUSTRE_FPRIVATE(file);
1348 vio->vui_io_subtype = args->via_io_subtype;
1350 switch (vio->vui_io_subtype) {
1352 /* Direct IO reads must also take range lock,
1353 * or multiple reads will try to work on the same pages
1354 * See LU-6227 for details. */
1355 if (((iot == CIT_WRITE) ||
1356 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1357 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1358 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1360 rc = range_lock(&lli->lli_write_tree, &range);
1364 range_locked = true;
1368 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1369 vio->u.splice.vui_flags = args->u.splice.via_flags;
1372 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1376 ll_cl_add(file, env, io, LCC_RW);
/* parallel writes take i_mutex here instead of inside each ptask */
1377 if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) &&
1378 !lli->lli_inode_locked) {
1380 lli->lli_inode_locked = 1;
1382 rc = cl_io_loop(env, io);
1383 if (lli->lli_inode_locked) {
1384 lli->lli_inode_locked = 0;
1385 inode_unlock(inode);
1387 ll_cl_remove(file, env);
1390 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1392 range_unlock(&lli->lli_write_tree, &range);
1395 /* cl_io_rw_init() handled IO */
1399 if (io->ci_nob > 0) {
1400 result += io->ci_nob;
1401 count -= io->ci_nob;
1403 if (args->via_io_subtype == IO_NORMAL) {
1404 iov_iter_advance(args->u.normal.via_iter, io->ci_nob);
1406 args->u.normal.via_iocb->ki_pos = pos;
1407 #ifdef HAVE_KIOCB_KI_LEFT
1408 args->u.normal.via_iocb->ki_left = count;
1409 #elif defined(HAVE_KI_NBYTES)
1410 args->u.normal.via_iocb->ki_nbytes = count;
1414 pos = io->u.ci_rw.rw_range.cir_pos;
1418 cl_io_fini(env, io);
1421 "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1422 file->f_path.dentry->d_name.name,
1423 iot, rc, result, io->ci_need_restart);
1425 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1427 "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1428 file_dentry(file)->d_name.name,
1429 iot == CIT_READ ? "read" : "write",
1430 pos, pos + count, result, rc);
1431 /* preserve the tried count for FLR */
1432 retried = io->ci_ndelay_tried;
1437 if (iot == CIT_READ) {
1439 ll_stats_ops_tally(ll_i2sbi(inode),
1440 LPROC_LL_READ_BYTES, result);
1441 } else if (iot == CIT_WRITE) {
1443 ll_stats_ops_tally(ll_i2sbi(inode),
1444 LPROC_LL_WRITE_BYTES, result);
1445 fd->fd_write_failed = false;
1446 } else if (result == 0 && rc == 0) {
/* short or failed write: remember it so fsync can report the error */
1449 fd->fd_write_failed = true;
1451 fd->fd_write_failed = false;
1452 } else if (rc != -ERESTARTSYS) {
1453 fd->fd_write_failed = true;
1457 CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n",
1458 file_dentry(file)->d_name.name,
1459 iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc);
1463 RETURN(result > 0 ? result : rc);
1467 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1468 * especially for small I/O.
1470 * To serve a read request, CLIO has to create and initialize a cl_io and
1471 * then request DLM lock. This has turned out to have significant overhead
1472 * and affects the performance of small I/O dramatically.
1474 * It's not necessary to create a cl_io for each I/O. Under the help of read
1475 * ahead, most of the pages being read are already in memory cache and we can
1476 * read those pages directly because if the pages exist, the corresponding DLM
1477 * lock must exist so that page content must be valid.
1479 * In fast read implementation, the llite speculatively finds and reads pages
1480 * in memory cache. There are three scenarios for fast read:
1481 * - If the page exists and is uptodate, kernel VM will provide the data and
1482 * CLIO won't be intervened;
1483 * - If the page was brought into memory by read ahead, it will be exported
1484 * and read ahead parameters will be updated;
1485 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1486 * it will go back and invoke normal read, i.e., a cl_io will be created
1487 * and DLM lock will be requested.
1489 * POSIX compliance: posix standard states that read is intended to be atomic.
1490 * Lustre read implementation is in line with Linux kernel read implementation
1491 * and neither of them complies with POSIX standard in this matter. Fast read
1492 * doesn't make the situation worse on single node but it may interleave write
1493 * results from multiple nodes due to short read handling in ll_file_aio_read().
1495 * \param env - lu_env
1496 * \param iocb - kiocb from kernel
1497 * \param iter - user space buffers where the data will be copied
1499 * \retval - number of bytes have been read, or error code if error occurred.
/*
 * Attempt the "fast read" path: serve the read straight from the page
 * cache via generic_file_read_iter() without creating a cl_io, relying
 * on the fact that cached pages imply a covering DLM lock (see the big
 * comment above).  Bails out when fast read is disabled for the sb or
 * for O_DIRECT.  -ENODATA from ll_readpage() means "page not cached,
 * fall back to the normal path".
 * NOTE(review): listing elides some original lines; code kept verbatim.
 */
1502 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1506 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1509 /* NB: we can't do direct IO for fast read because it will need a lock
1510 * to make IO engine happy. */
1511 if (iocb->ki_filp->f_flags & O_DIRECT)
1514 result = generic_file_read_iter(iocb, iter);
1516 /* If the first page is not in cache, generic_file_aio_read() will be
1517 * returned with -ENODATA.
1518 * See corresponding code in ll_readpage(). */
1519 if (result == -ENODATA)
1523 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1524 LPROC_LL_READ_BYTES, result);
1530 * Read from a file (through the page cache).
/*
 * read_iter file operation: try the fast-read path first; if data
 * remains (short fast read or fallback), run the normal CLIO read via
 * ll_file_io_generic() and combine the two byte counts.
 * NOTE(review): listing elides some original lines; code kept verbatim.
 */
1532 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1535 struct vvp_io_args *args;
1540 result = ll_do_fast_read(iocb, to);
1541 if (result < 0 || iov_iter_count(to) == 0)
1544 env = cl_env_get(&refcheck);
1546 return PTR_ERR(env);
1548 args = ll_env_args(env, IO_NORMAL);
1549 args->u.normal.via_iter = to;
1550 args->u.normal.via_iocb = iocb;
1552 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1553 &iocb->ki_pos, iov_iter_count(to));
1556 else if (result == 0)
1559 cl_env_put(env, &refcheck);
1565 * Write to a file (through the page cache).
/*
 * write_iter file operation: hand the whole write to the common CLIO
 * engine (ll_file_io_generic) with CIT_WRITE.
 * NOTE(review): listing elides some original lines; code kept verbatim.
 */
1567 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1569 struct vvp_io_args *args;
1574 env = cl_env_get(&refcheck);
1576 return PTR_ERR(env);
1578 args = ll_env_args(env, IO_NORMAL);
1579 args->u.normal.via_iter = from;
1580 args->u.normal.via_iocb = iocb;
1582 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1583 &iocb->ki_pos, iov_iter_count(from));
1584 cl_env_put(env, &refcheck);
1588 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1590 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array and compute the total byte count, guarding
 * against negative lengths, cumulative overflow, and inaccessible user
 * memory (copied from the kernel's __generic_file_aio_write_nolock,
 * only built for pre-read_write_iter kernels).
 * NOTE(review): listing elides some original lines (the continue/trim
 * logic after access_ok); code kept verbatim.
 */
1592 static int ll_file_get_iov_count(const struct iovec *iov,
1593 unsigned long *nr_segs, size_t *count)
1598 for (seg = 0; seg < *nr_segs; seg++) {
1599 const struct iovec *iv = &iov[seg];
1602 * If any segment has a negative length, or the cumulative
1603 * length ever wraps negative then return -EINVAL.
1606 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1608 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1613 cnt -= iv->iov_len; /* This segment is no good */
/*
 * Legacy aio_read entry (pre-read_write_iter kernels): validate the
 * iovec array, wrap it in an iov_iter (API differs per kernel version),
 * and forward to ll_file_read_iter().
 * NOTE(review): listing elides some original lines; code kept verbatim.
 */
1620 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1621 unsigned long nr_segs, loff_t pos)
1628 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1632 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1633 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1634 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1635 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1636 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1638 result = ll_file_read_iter(iocb, &to);
/*
 * Legacy synchronous read(2) entry (pre-read_write_iter kernels): wrap
 * the user buffer in a single iovec plus a sync kiocb and forward to
 * ll_file_aio_read(), propagating the updated file position to *ppos.
 * NOTE(review): listing elides some original lines (kiocb declaration,
 * #endif, return); only the visible defect is fixed.
 */
1643 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1646 struct iovec iov = { .iov_base = buf, .iov_len = count };
1651 init_sync_kiocb(&kiocb, file);
1652 kiocb.ki_pos = *ppos;
1653 #ifdef HAVE_KIOCB_KI_LEFT
1654 kiocb.ki_left = count;
1655 #elif defined(HAVE_KI_NBYTES)
/* fix: the field guarded by HAVE_KI_NBYTES is ki_nbytes, not i_nbytes —
 * see the identical sequences in ll_file_io_ptask() and ll_file_write() */
1656 kiocb.ki_nbytes = count;
1659 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1660 *ppos = kiocb.ki_pos;
1666 * Write to a file (through the page cache).
/*
 * Legacy aio_write entry (pre-read_write_iter kernels): validate the
 * iovec array, build an iov_iter (API varies by kernel), and forward to
 * ll_file_write_iter().
 * NOTE(review): listing elides some original lines; code kept verbatim.
 */
1669 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1670 unsigned long nr_segs, loff_t pos)
1672 struct iov_iter from;
1677 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1681 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1682 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1683 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1684 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1685 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1687 result = ll_file_write_iter(iocb, &from);
/*
 * Legacy synchronous write(2) entry (pre-read_write_iter kernels): wrap
 * the user buffer in an iovec and a per-env kiocb (lti_kiocb, avoiding
 * a large stack object) and forward to ll_file_aio_write().
 * NOTE(review): listing elides some original lines; code kept verbatim.
 */
1692 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1693 size_t count, loff_t *ppos)
1696 struct iovec iov = { .iov_base = (void __user *)buf,
1698 struct kiocb *kiocb;
1703 env = cl_env_get(&refcheck);
1705 RETURN(PTR_ERR(env));
1707 kiocb = &ll_env_info(env)->lti_kiocb;
1708 init_sync_kiocb(kiocb, file);
1709 kiocb->ki_pos = *ppos;
1710 #ifdef HAVE_KIOCB_KI_LEFT
1711 kiocb->ki_left = count;
1712 #elif defined(HAVE_KI_NBYTES)
1713 kiocb->ki_nbytes = count;
1716 result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos);
1717 *ppos = kiocb->ki_pos;
1719 cl_env_put(env, &refcheck);
1722 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1725 * Send file content (through pagecache) somewhere with helper
/*
 * splice_read file operation: route pagecache data into a pipe through
 * the common CLIO engine using the IO_SPLICE sub-type.
 * NOTE(review): listing elides some original lines; code kept verbatim.
 */
1727 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1728 struct pipe_inode_info *pipe, size_t count,
1732 struct vvp_io_args *args;
1737 env = cl_env_get(&refcheck);
1739 RETURN(PTR_ERR(env));
1741 args = ll_env_args(env, IO_SPLICE);
1742 args->u.splice.via_pipe = pipe;
1743 args->u.splice.via_flags = flags;
1745 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1746 cl_env_put(env, &refcheck);
/*
 * Set the striping EA on @inode by re-opening it by FID with the given
 * lov_user_md attached to the open intent, then releasing the handle.
 * Runs under the inode size lock.
 * NOTE(review): listing elides some original lines; code kept verbatim.
 */
1750 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1751 __u64 flags, struct lov_user_md *lum, int lum_size)
1753 struct lookup_intent oit = {
1755 .it_flags = flags | MDS_OPEN_BY_FID,
1760 ll_inode_size_lock(inode);
1761 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1763 GOTO(out_unlock, rc);
1765 ll_release_openhandle(dentry, &oit);
1768 ll_inode_size_unlock(inode);
1769 ll_intent_release(&oit);
/*
 * Fetch the LOV EA for @filename from the MDS (md_getattr_name with
 * OBD_MD_FLEASIZE|OBD_MD_FLDIREA), validate the magic, and byte-swap it
 * to host endianness on big-endian clients before it is handed to
 * userspace.  On success *lmmp/*lmm_size point into the reply buffer,
 * so the caller must keep *request until done with the data.
 * NOTE(review): listing elides some original lines (object counts in the
 * swab calls, released-pattern handling); code kept verbatim.
 */
1774 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1775 struct lov_mds_md **lmmp, int *lmm_size,
1776 struct ptlrpc_request **request)
1778 struct ll_sb_info *sbi = ll_i2sbi(inode);
1779 struct mdt_body *body;
1780 struct lov_mds_md *lmm = NULL;
1781 struct ptlrpc_request *req = NULL;
1782 struct md_op_data *op_data;
1785 rc = ll_get_default_mdsize(sbi, &lmmsize);
1789 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1790 strlen(filename), lmmsize,
1791 LUSTRE_OPC_ANY, NULL);
1792 if (IS_ERR(op_data))
1793 RETURN(PTR_ERR(op_data));
1795 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1796 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1797 ll_finish_md_op_data(op_data);
1799 CDEBUG(D_INFO, "md_getattr_name failed "
1800 "on %s: rc %d\n", filename, rc);
1804 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1805 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1807 lmmsize = body->mbo_eadatasize;
1809 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1811 GOTO(out, rc = -ENODATA);
1814 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1815 LASSERT(lmm != NULL);
/* only plain v1/v3 and composite layouts are expected here */
1817 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
1818 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
1819 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
1820 GOTO(out, rc = -EPROTO);
1823 * This is coming from the MDS, so is probably in
1824 * little endian. We convert it to host endian before
1825 * passing it to userspace.
1827 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1830 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
1831 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1832 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1833 if (le32_to_cpu(lmm->lmm_pattern) &
1834 LOV_PATTERN_F_RELEASED)
1838 /* if function called for directory - we should
1839 * avoid swab not existent lsm objects */
1840 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1841 lustre_swab_lov_user_md_v1(
1842 (struct lov_user_md_v1 *)lmm);
1843 if (S_ISREG(body->mbo_mode))
1844 lustre_swab_lov_user_md_objects(
1845 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1847 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1848 lustre_swab_lov_user_md_v3(
1849 (struct lov_user_md_v3 *)lmm);
1850 if (S_ISREG(body->mbo_mode))
1851 lustre_swab_lov_user_md_objects(
1852 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1854 } else if (lmm->lmm_magic ==
1855 cpu_to_le32(LOV_MAGIC_COMP_V1)) {
1856 lustre_swab_lov_comp_md_v1(
1857 (struct lov_comp_md_v1 *)lmm);
1863 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: copy a lov_user_md (with one OST object
 * entry) from userspace and apply it via ll_lov_setstripe_ea_info().
 * Requires CFS_CAP_SYS_ADMIN since MDS_OPEN_HAS_OBJS names raw objects.
 * NOTE(review): listing elides some original lines; code kept verbatim.
 */
1868 static int ll_lov_setea(struct inode *inode, struct file *file,
1871 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1872 struct lov_user_md *lump;
1873 int lum_size = sizeof(struct lov_user_md) +
1874 sizeof(struct lov_user_ost_data);
1878 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1881 OBD_ALLOC_LARGE(lump, lum_size);
1885 if (copy_from_user(lump, arg, lum_size))
1886 GOTO(out_lump, rc = -EFAULT);
1888 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
1890 cl_lov_delay_create_clear(&file->f_flags);
1893 OBD_FREE_LARGE(lump, lum_size);
/*
 * Copy the file's striping layout to the userspace buffer @lum of @size
 * bytes via cl_object_getstripe().
 * NOTE(review): listing elides some original lines; code kept verbatim.
 */
1897 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
1904 env = cl_env_get(&refcheck);
1906 RETURN(PTR_ERR(env));
1908 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
1909 cl_env_put(env, &refcheck);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy the user layout, apply it, then
 * refresh the layout generation and echo the resulting stripe info back
 * to userspace.
 * NOTE(review): listing elides some original lines; code kept verbatim.
 */
1913 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1916 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1917 struct lov_user_md *klum;
1919 __u64 flags = FMODE_WRITE;
1922 rc = ll_copy_user_md(lum, &klum);
1927 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
1932 rc = put_user(0, &lum->lmm_stripe_count);
1936 rc = ll_layout_refresh(inode, &gen);
1940 rc = ll_file_getstripe(inode, arg, lum_size);
1942 cl_lov_delay_create_clear(&file->f_flags);
1945 OBD_FREE(klum, lum_size);
/*
 * LL_IOC_GROUP_LOCK handler: take a group (GID-keyed) extent lock over
 * the whole file.  For PFL/composite layouts all OST objects are first
 * instantiated via a write-layout intent so the group lock covers them.
 * Racing lockers are resolved under lli_lock; the loser drops its lock.
 * NOTE(review): listing elides some original lines (returns, extent
 * start); code kept verbatim.
 */
1950 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1952 struct ll_inode_info *lli = ll_i2info(inode);
1953 struct cl_object *obj = lli->lli_clob;
1954 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1955 struct ll_grouplock grouplock;
1960 CWARN("group id for group lock must not be 0\n");
1964 if (ll_file_nolock(file))
1965 RETURN(-EOPNOTSUPP);
1967 spin_lock(&lli->lli_lock);
1968 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1969 CWARN("group lock already existed with gid %lu\n",
1970 fd->fd_grouplock.lg_gid);
1971 spin_unlock(&lli->lli_lock);
1974 LASSERT(fd->fd_grouplock.lg_lock == NULL);
1975 spin_unlock(&lli->lli_lock);
1978 * XXX: group lock needs to protect all OST objects while PFL
1979 * can add new OST objects during the IO, so we'd instantiate
1980 * all OST objects before getting its group lock.
1985 struct cl_layout cl = {
1986 .cl_is_composite = false,
1988 struct lu_extent ext = {
1990 .e_end = OBD_OBJECT_EOF,
1993 env = cl_env_get(&refcheck);
1995 RETURN(PTR_ERR(env));
1997 rc = cl_object_layout_get(env, obj, &cl);
1998 if (!rc && cl.cl_is_composite)
1999 rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2002 cl_env_put(env, &refcheck);
2007 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2008 arg, (file->f_flags & O_NONBLOCK), &grouplock);
2012 spin_lock(&lli->lli_lock);
2013 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2014 spin_unlock(&lli->lli_lock);
2015 CERROR("another thread just won the race\n");
2016 cl_put_grouplock(&grouplock);
2020 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2021 fd->fd_grouplock = grouplock;
2022 spin_unlock(&lli->lli_lock);
2024 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK handler: release the group lock held by this open
 * file descriptor, verifying under lli_lock that one is held and that
 * its GID matches @arg before dropping it.
 * NOTE(review): listing elides some original lines; code kept verbatim.
 */
2028 static int ll_put_grouplock(struct inode *inode, struct file *file,
2031 struct ll_inode_info *lli = ll_i2info(inode);
2032 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2033 struct ll_grouplock grouplock;
2036 spin_lock(&lli->lli_lock);
2037 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2038 spin_unlock(&lli->lli_lock);
2039 CWARN("no group lock held\n");
2043 LASSERT(fd->fd_grouplock.lg_lock != NULL);
2045 if (fd->fd_grouplock.lg_gid != arg) {
2046 CWARN("group lock %lu doesn't match current id %lu\n",
2047 arg, fd->fd_grouplock.lg_gid);
2048 spin_unlock(&lli->lli_lock);
/* copy out under the spinlock, release the DLM lock outside it */
2052 grouplock = fd->fd_grouplock;
2053 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2054 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2055 spin_unlock(&lli->lli_lock);
2057 cl_put_grouplock(&grouplock);
2058 CDEBUG(D_INFO, "group lock %lu released\n", arg);
/*
 * Close inode open handle
 *
 * Close the MDS open handle carried by a lookup intent (e.g. one created
 * only to set/get an EA), and drop the intent's open-reference request.
 *
 * \param dentry [in] dentry which contains the inode
 * \param it [in,out] intent which contains open info and result
 *
 * \retval 0 success
 * \retval <0 failure
 *
 * NOTE(review): listing elides some original lines; code kept verbatim.
 */
2071 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2073 struct inode *inode = dentry->d_inode;
2074 struct obd_client_handle *och;
2080 /* Root ? Do nothing. */
2081 if (dentry->d_inode->i_sb->s_root == dentry)
2084 /* No open handle to close? Move away */
2085 if (!it_disposition(it, DISP_OPEN_OPEN))
2088 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2090 OBD_ALLOC(och, sizeof(*och));
2092 GOTO(out, rc = -ENOMEM);
2094 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2096 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2098 /* this one is in place of ll_file_open */
2099 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2100 ptlrpc_req_finished(it->it_request);
2101 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Get size for inode for which FIEMAP mapping is requested.
 * Make the FIEMAP get_info call and returns the result.
 * \param fiemap kernel buffer to hold extents
 * \param num_bytes kernel buffer size
 *
 * Rejects unsupported fiemap flags (reporting the supported set back in
 * fm_flags), honors FIEMAP_FLAG_SYNC by flushing dirty pages, glimpses
 * the size when unknown, and short-circuits zero-length files.
 * NOTE(review): listing elides some original lines; code kept verbatim.
 */
2112 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2118 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2121 /* Checks for fiemap flags */
2122 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2123 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2127 /* Check for FIEMAP_FLAG_SYNC */
2128 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2129 rc = filemap_fdatawrite(inode->i_mapping);
2134 env = cl_env_get(&refcheck);
2136 RETURN(PTR_ERR(env));
2138 if (i_size_read(inode) == 0) {
2139 rc = ll_glimpse_size(inode);
2144 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2145 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2146 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2148 /* If filesize is 0, then there would be no objects for mapping */
2149 if (fmkey.lfik_oa.o_size == 0) {
2150 fiemap->fm_mapped_extents = 0;
2154 fmkey.lfik_fiemap = *fiemap;
2156 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2157 &fmkey, fiemap, &num_bytes);
2159 cl_env_put(env, &refcheck);
/*
 * OBD_IOC_FID2PATH handler: resolve a FID to a path via the MDC.  Reads
 * the requested path buffer length from userspace, allocates the reply
 * buffer, appends the mount's root FID (for fileset-aware servers), and
 * copies the result back.  Gated on CAP_DAC_READ_SEARCH unless the sb
 * allows user fid2path.
 * NOTE(review): listing elides some original lines; code kept verbatim.
 */
2163 int ll_fid2path(struct inode *inode, void __user *arg)
2165 struct obd_export *exp = ll_i2mdexp(inode);
2166 const struct getinfo_fid2path __user *gfin = arg;
2168 struct getinfo_fid2path *gfout;
2174 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2175 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2178 /* Only need to get the buflen */
2179 if (get_user(pathlen, &gfin->gf_pathlen))
2182 if (pathlen > PATH_MAX)
2185 outsize = sizeof(*gfout) + pathlen;
2186 OBD_ALLOC(gfout, outsize);
2190 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2191 GOTO(gf_free, rc = -EFAULT);
2192 /* append root FID after gfout to let MDT know the root FID so that it
2193 * can lookup the correct path, this is mainly for fileset.
2194 * old server without fileset mount support will ignore this. */
2195 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2197 /* Call mdc_iocontrol */
2198 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2202 if (copy_to_user(arg, gfout, outsize))
2206 OBD_FREE(gfout, outsize);
/*
 * Run a CIT_DATA_VERSION cl_io to fetch the file's data version and
 * layout version into @ioc.  A missing clob means version 0.  Retries
 * when the io reports ci_need_restart (layout change mid-flight).
 * NOTE(review): listing elides some original lines (restart loop label);
 * code kept verbatim.
 */
2211 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2213 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2221 ioc->idv_version = 0;
2222 ioc->idv_layout_version = UINT_MAX;
2224 /* If no file object initialized, we consider its version is 0. */
2228 env = cl_env_get(&refcheck);
2230 RETURN(PTR_ERR(env));
2232 io = vvp_env_thread_io(env);
2234 io->u.ci_data_version.dv_data_version = 0;
2235 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2236 io->u.ci_data_version.dv_flags = ioc->idv_flags;
2239 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2240 result = cl_io_loop(env, io);
2242 result = io->ci_result;
2244 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2245 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2247 cl_io_fini(env, io);
2249 if (unlikely(io->ci_need_restart))
2252 cl_env_put(env, &refcheck);
/*
 * Read the data_version for inode.
 *
 * This value is computed using stripe object version on OST.
 * Version is computed using server side locking.
 *
 * Thin wrapper over ll_ioc_data_version().
 *
 * @param flags if do sync on the OST side;
 * 0: no sync
 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
 */
2268 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2270 struct ioc_data_version ioc = { .idv_flags = flags };
2273 rc = ll_ioc_data_version(inode, &ioc);
2275 *data_version = ioc.idv_version;
/*
 * Trigger a HSM release request for the provided inode: take a write
 * lease, flush and record the data version (LL_DV_WR_FLUSH), merge
 * attributes, then close the handle with MDS_HSM_RELEASE so the MDT can
 * punch the objects.  The lease handle itself is released by the MDC
 * during close packing.
 * NOTE(review): listing elides some original lines; code kept verbatim.
 */
2283 int ll_hsm_release(struct inode *inode)
2286 struct obd_client_handle *och = NULL;
2287 __u64 data_version = 0;
2292 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2293 ll_get_fsname(inode->i_sb, NULL, 0),
2294 PFID(&ll_i2info(inode)->lli_fid));
2296 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2298 GOTO(out, rc = PTR_ERR(och));
2300 /* Grab latest data_version and [am]time values */
2301 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2305 env = cl_env_get(&refcheck);
2307 GOTO(out, rc = PTR_ERR(env));
2309 rc = ll_merge_attr(env, inode);
2310 cl_env_put(env, &refcheck);
2312 /* If error happen, we have the wrong size for a file.
2318 /* Release the file.
2319 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2320 * we still need it to pack l_remote_handle to MDT. */
2321 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2327 if (och != NULL && !IS_ERR(och)) /* close the file */
2328 ll_lease_close(och, inode, NULL);
/*
 * Scratch state for ll_swap_layouts().
 * NOTE(review): listing elides some members (dv1/dv2, check_dv1/2 are
 * referenced below but not visible here); code kept verbatim.
 */
2333 struct ll_swap_stack {
2336 struct inode *inode1;
2337 struct inode *inode2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS handler: atomically swap the layouts of two
 * files on the MDT.  Orders the pair by FID to avoid deadlock, takes a
 * group lock on both when a gid is supplied (to flush dirty cache), and
 * optionally verifies the data versions have not changed (-EAGAIN if
 * they have) before issuing the swap through mdc.
 * NOTE(review): listing elides some original lines (labels, gid setup);
 * code kept verbatim.
 */
2342 static int ll_swap_layouts(struct file *file1, struct file *file2,
2343 struct lustre_swap_layouts *lsl)
2345 struct mdc_swap_layouts msl;
2346 struct md_op_data *op_data;
2349 struct ll_swap_stack *llss = NULL;
2352 OBD_ALLOC_PTR(llss);
2356 llss->inode1 = file_inode(file1);
2357 llss->inode2 = file_inode(file2);
2359 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2363 /* we use 2 bool because it is easier to swap than 2 bits */
2364 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2365 llss->check_dv1 = true;
2367 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2368 llss->check_dv2 = true;
2370 /* we cannot use lsl->sl_dvX directly because we may swap them */
2371 llss->dv1 = lsl->sl_dv1;
2372 llss->dv2 = lsl->sl_dv2;
2374 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2375 if (rc == 0) /* same file, done! */
2378 if (rc < 0) { /* sequentialize it */
2379 swap(llss->inode1, llss->inode2);
2381 swap(llss->dv1, llss->dv2);
2382 swap(llss->check_dv1, llss->check_dv2);
2386 if (gid != 0) { /* application asks to flush dirty cache */
2387 rc = ll_get_grouplock(llss->inode1, file1, gid);
2391 rc = ll_get_grouplock(llss->inode2, file2, gid);
2393 ll_put_grouplock(llss->inode1, file1, gid);
2398 /* ultimate check, before swapping the layouts we check if
2399 * dataversion has changed (if requested) */
2400 if (llss->check_dv1) {
2401 rc = ll_data_version(llss->inode1, &dv, 0);
2404 if (dv != llss->dv1)
2405 GOTO(putgl, rc = -EAGAIN);
2408 if (llss->check_dv2) {
2409 rc = ll_data_version(llss->inode2, &dv, 0);
2412 if (dv != llss->dv2)
2413 GOTO(putgl, rc = -EAGAIN);
2416 /* struct md_op_data is used to send the swap args to the mdt
2417 * only flags is missing, so we use struct mdc_swap_layouts
2418 * through the md_op_data->op_data */
2419 /* flags from user space have to be converted before they are send to
2420 * server, no flag is sent today, they are only used on the client */
2423 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2424 0, LUSTRE_OPC_ANY, &msl);
2425 if (IS_ERR(op_data))
2426 GOTO(free, rc = PTR_ERR(op_data));
2428 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2429 sizeof(*op_data), op_data, NULL);
2430 ll_finish_md_op_data(op_data);
2437 ll_put_grouplock(llss->inode2, file2, gid);
2438 ll_put_grouplock(llss->inode1, file1, gid);
/*
 * LL_IOC_HSM_STATE_SET handler: validate the requested HSM flag masks
 * and archive id, then forward the request to the MDT via mdc ioctl.
 * NOTE(review): listing elides some original lines (RETURN statements);
 * code kept verbatim.
 */
2448 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2450 struct md_op_data *op_data;
2454 /* Detect out-of range masks */
2455 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2458 /* Non-root users are forbidden to set or clear flags which are
2459 * NOT defined in HSM_USER_MASK. */
2460 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2461 !cfs_capable(CFS_CAP_SYS_ADMIN))
2464 /* Detect out-of range archive id */
2465 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2466 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2469 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2470 LUSTRE_OPC_ANY, hss);
2471 if (IS_ERR(op_data))
2472 RETURN(PTR_ERR(op_data));
2474 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2475 sizeof(*op_data), op_data, NULL);
2477 ll_finish_md_op_data(op_data);
/*
 * LL_IOC_HSM_IMPORT handler: mark a regular file as an archived,
 * released HSM copy, then force its attributes (mode/uid/gid/size/
 * times) to the values supplied by the copytool via setattr.
 * NOTE(review): listing elides some original lines (alloc checks,
 * inode_lock); code kept verbatim.
 */
2482 static int ll_hsm_import(struct inode *inode, struct file *file,
2483 struct hsm_user_import *hui)
2485 struct hsm_state_set *hss = NULL;
2486 struct iattr *attr = NULL;
2490 if (!S_ISREG(inode->i_mode))
2496 GOTO(out, rc = -ENOMEM);
2498 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2499 hss->hss_archive_id = hui->hui_archive_id;
2500 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2501 rc = ll_hsm_state_set(inode, hss);
2505 OBD_ALLOC_PTR(attr);
2507 GOTO(out, rc = -ENOMEM);
/* only permission bits from the copytool-supplied mode are honored */
2509 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2510 attr->ia_mode |= S_IFREG;
2511 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2512 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2513 attr->ia_size = hui->hui_size;
2514 attr->ia_mtime.tv_sec = hui->hui_mtime;
2515 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2516 attr->ia_atime.tv_sec = hui->hui_atime;
2517 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2519 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2520 ATTR_UID | ATTR_GID |
2521 ATTR_MTIME | ATTR_MTIME_SET |
2522 ATTR_ATIME | ATTR_ATIME_SET;
2526 rc = ll_setattr_raw(file_dentry(file), attr, true);
2530 inode_unlock(inode);
/* Map an open fmode to the LL_LEASE_{RD,WR}LCK bits reported to userspace. */
2542 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2544 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2545 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * LL_IOC_FUTIMES_3 handler: set atime, mtime AND ctime of a regular
 * file to the user-supplied values (unlike utimes(2), ctime too), root
 * only.
 * NOTE(review): listing elides some original lines (ia_atime/ia_mtime/
 * ia_ctime designators, inode_lock); code kept verbatim.
 */
2548 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2550 struct inode *inode = file_inode(file);
2552 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2553 ATTR_MTIME | ATTR_MTIME_SET |
2554 ATTR_CTIME | ATTR_CTIME_SET,
2556 .tv_sec = lfu->lfu_atime_sec,
2557 .tv_nsec = lfu->lfu_atime_nsec,
2560 .tv_sec = lfu->lfu_mtime_sec,
2561 .tv_nsec = lfu->lfu_mtime_nsec,
2564 .tv_sec = lfu->lfu_ctime_sec,
2565 .tv_nsec = lfu->lfu_ctime_nsec,
2571 if (!capable(CAP_SYS_ADMIN))
2574 if (!S_ISREG(inode->i_mode))
2578 rc = ll_setattr_raw(file_dentry(file), &ia, false);
2579 inode_unlock(inode);
/*
 * Translate a userspace lock_mode_user to the kernel cl_lock_mode.
 * NOTE(review): the switch bodies (return values, default case) are
 * elided from this listing; code kept verbatim.
 */
2584 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2587 case MODE_READ_USER:
2589 case MODE_WRITE_USER:
2596 static const char *const user_lockname[] = LOCK_MODE_NAMES;
/* Used to allow the upper layers of the client to request an LDLM lock
 * without doing an actual read or write.
 *
 * Used for ladvise lockahead to manually request specific locks.
 *
 * \param[in] file file this ladvise lock request is on
 * \param[in] ladvise ladvise struct describing this lock request
 *
 * \retval 0 success, no detailed result available (sync requests
 * and requests sent to the server [not handled locally]
 * cannot return detailed results)
 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
 * see definitions for details.
 * \retval negative negative errno on error
 *
 * NOTE(review): listing elides some original lines; code kept verbatim.
 */
2613 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2615 struct lu_env *env = NULL;
2616 struct cl_io *io = NULL;
2617 struct cl_lock *lock = NULL;
2618 struct cl_lock_descr *descr = NULL;
2619 struct dentry *dentry = file->f_path.dentry;
2620 struct inode *inode = dentry->d_inode;
2621 enum cl_lock_mode cl_mode;
2622 off_t start = ladvise->lla_start;
2623 off_t end = ladvise->lla_end;
2629 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2630 "start=%llu, end=%llu\n", dentry->d_name.len,
2631 dentry->d_name.name, dentry->d_inode,
2632 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2635 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2637 GOTO(out, result = cl_mode);
2639 /* Get IO environment */
2640 result = cl_io_get(inode, &env, &io, &refcheck);
2644 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2647 * nothing to do for this io. This currently happens when
2648 * stripe sub-object's are not yet created.
2650 result = io->ci_result;
2651 } else if (result == 0) {
2652 lock = vvp_env_lock(env);
2653 descr = &lock->cll_descr;
2655 descr->cld_obj = io->ci_obj;
2656 /* Convert byte offsets to pages */
2657 descr->cld_start = cl_index(io->ci_obj, start);
2658 descr->cld_end = cl_index(io->ci_obj, end);
2659 descr->cld_mode = cl_mode;
2660 /* CEF_MUST is used because we do not want to convert a
2661 * lockahead request to a lockless lock */
2662 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2665 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2666 descr->cld_enq_flags |= CEF_SPECULATIVE;
2668 result = cl_lock_request(env, io, lock);
2670 /* On success, we need to release the lock */
2672 cl_lock_release(env, lock);
2674 cl_io_fini(env, io);
2675 cl_env_put(env, &refcheck);
2677 /* -ECANCELED indicates a matching lock with a different extent
2678 * was already present, and -EEXIST indicates a matching lock
2679 * on exactly the same extent was already present.
2680 * We convert them to positive values for userspace to make
2681 * recognizing true errors easier.
2682 * Note we can only return these detailed results on async requests,
2683 * as sync requests look the same as i/o requests for locking. */
2684 if (result == -ECANCELED)
2685 result = LLA_RESULT_DIFFERENT;
2686 else if (result == -EEXIST)
2687 result = LLA_RESULT_SAME;
2692 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
/*
 * Validate one llapi_lu_ladvise entry before it is acted on: reject
 * unknown advice values, per-advice flags outside the allowed mask,
 * invalid lockahead modes, and start >= end ranges.
 * NOTE(review): this listing appears elided (switch header, rc assignments,
 * returns and closing braces are not visible); verify against upstream.
 */
2694 static int ll_ladvise_sanity(struct inode *inode,
2695 struct llapi_lu_ladvise *ladvise)
2697 enum lu_ladvise_type advice = ladvise->lla_advice;
2698 /* Note the peradvice flags is a 32 bit field, so per advice flags must
2699 * be in the first 32 bits of enum ladvise_flags */
2700 __u32 flags = ladvise->lla_peradvice_flags;
2701 /* 3 lines at 80 characters per line, should be plenty */
2704 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2706 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
2707 "last supported advice is %s (value '%d'): rc = %d\n",
2708 ll_get_fsname(inode->i_sb, NULL, 0), advice,
2709 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2713 /* Per-advice checks */
2715 case LU_LADVISE_LOCKNOEXPAND:
2716 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2718 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2720 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2721 ladvise_names[advice], rc);
2725 case LU_LADVISE_LOCKAHEAD:
2726 /* Currently only READ and WRITE modes can be requested */
2727 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2728 ladvise->lla_lockahead_mode == 0) {
2730 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2732 ll_get_fsname(inode->i_sb, NULL, 0),
2733 ladvise->lla_lockahead_mode,
2734 ladvise_names[advice], rc);
2737 case LU_LADVISE_WILLREAD:
2738 case LU_LADVISE_DONTNEED:
2740 /* Note fall through above - These checks apply to all advices
2741 * except LOCKNOEXPAND */
2742 if (flags & ~LF_DEFAULT_MASK) {
2744 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2746 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2747 ladvise_names[advice], rc);
2750 if (ladvise->lla_start >= ladvise->lla_end) {
2752 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2753 "for %s: rc = %d\n",
2754 ll_get_fsname(inode->i_sb, NULL, 0),
2755 ladvise->lla_start, ladvise->lla_end,
2756 ladvise_names[advice], rc);
2768 * Give file access advices
2770 * The ladvise interface is similar to Linux fadvise() system call, except it
2771 * forwards the advices directly from Lustre client to server. The server side
2772 * codes will apply appropriate read-ahead and caching techniques for the
2773 * corresponding files.
2775 * A typical workload for ladvise is e.g. a bunch of different clients are
2776 * doing small random reads of a file, so prefetching pages into OSS cache
2777 * with big linear reads before the random IO is a net benefit. Fetching
2778 * all that data into each client cache with fadvise() may not be, due to
2779 * much more data being sent to the client.
/*
 * Builds a CIT_LADVISE cl_io from the ladvise parameters and runs it
 * through the CLIO loop; the advice itself is executed server-side.
 * NOTE(review): listing appears elided (rc declaration, RETURN); verify
 * against the upstream source.
 */
2781 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2782 struct llapi_lu_ladvise *ladvise)
2786 struct cl_ladvise_io *lio;
2791 env = cl_env_get(&refcheck);
2793 RETURN(PTR_ERR(env));
2795 io = vvp_env_thread_io(env);
2796 io->ci_obj = ll_i2info(inode)->lli_clob;
2798 /* initialize parameters for ladvise */
2799 lio = &io->u.ci_ladvise;
2800 lio->li_start = ladvise->lla_start;
2801 lio->li_end = ladvise->lla_end;
2802 lio->li_fid = ll_inode2fid(inode);
2803 lio->li_advice = ladvise->lla_advice;
2804 lio->li_flags = flags;
2806 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2807 rc = cl_io_loop(env, io);
2811 cl_io_fini(env, io);
2812 cl_env_put(env, &refcheck);
/*
 * Record the per-file-descriptor "no lock expansion" setting: set it
 * unless the LF_UNSET flag is present, in which case clear it.
 */
2816 static int ll_lock_noexpand(struct file *file, int flags)
2818 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2820 fd->ll_lock_no_expand = !(flags & LF_UNSET);
/*
 * FS_IOC_FSGETXATTR-style handler: fill a struct fsxattr with the inode's
 * extended flags and project ID and copy it back to userspace at 'arg'.
 * NOTE(review): listing appears elided (EFAULT returns after the copies are
 * not visible); verify against the upstream source.
 */
2825 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
2828 struct fsxattr fsxattr;
2830 if (copy_from_user(&fsxattr,
2831 (const struct fsxattr __user *)arg,
2835 fsxattr.fsx_xflags = ll_inode_to_ext_flags(inode->i_flags);
2836 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
2837 if (copy_to_user((struct fsxattr __user *)arg,
2838 &fsxattr, sizeof(fsxattr)))
/*
 * FS_IOC_FSSETXATTR-style handler: apply user-supplied fsxattr flags and
 * project ID to the MDS via md_setattr(), then (when a cl_object exists)
 * propagate the flag change to the OSTs with cl_setattr_ost().
 * Requires CAP_SYS_ADMIN since only root may change the project ID.
 * NOTE(review): listing appears elided (some error paths/labels are not
 * visible); verify against the upstream source.
 */
2844 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
2848 struct md_op_data *op_data;
2849 struct ptlrpc_request *req = NULL;
2851 struct fsxattr fsxattr;
2852 struct cl_object *obj;
2854 /* only root could change project ID */
2855 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2858 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2859 LUSTRE_OPC_ANY, NULL);
2860 if (IS_ERR(op_data))
2861 RETURN(PTR_ERR(op_data));
2863 if (copy_from_user(&fsxattr,
2864 (const struct fsxattr __user *)arg,
2866 GOTO(out_fsxattr1, rc = -EFAULT);
2868 op_data->op_attr_flags = fsxattr.fsx_xflags;
2869 op_data->op_projid = fsxattr.fsx_projid;
2870 op_data->op_attr.ia_valid |= (MDS_ATTR_PROJID | ATTR_ATTR_FLAG);
2871 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
2873 ptlrpc_req_finished(req);
2875 obj = ll_i2info(inode)->lli_clob;
2879 inode->i_flags = ll_ext_to_inode_flags(fsxattr.fsx_xflags);
2880 OBD_ALLOC_PTR(attr);
2882 GOTO(out_fsxattr1, rc = -ENOMEM);
2883 attr->ia_valid = ATTR_ATTR_FLAG;
2884 rc = cl_setattr_ost(obj, attr, fsxattr.fsx_xflags);
2889 ll_finish_md_op_data(op_data);
/*
 * Release the lease held on this file descriptor (LL_LEASE_UNLCK path).
 * Takes the lease open handle out of fd under lli_och_mutex, optionally
 * carries resync-done ID data to the MDS (LL_LEASE_RESYNC_DONE), closes
 * the lease, and reports the previous lease type to the caller.
 * NOTE(review): listing appears elided (lease_broken handling and some
 * branches are not visible); verify against the upstream source.
 */
2893 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
2896 struct inode *inode = file_inode(file);
2897 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2898 struct ll_inode_info *lli = ll_i2info(inode);
2899 struct obd_client_handle *och = NULL;
2902 enum mds_op_bias bias = 0;
2904 size_t data_size = 0;
2908 mutex_lock(&lli->lli_och_mutex);
2909 if (fd->fd_lease_och != NULL) {
2910 och = fd->fd_lease_och;
2911 fd->fd_lease_och = NULL;
2913 mutex_unlock(&lli->lli_och_mutex);
2916 GOTO(out, rc = -ENOLCK);
2918 fmode = och->och_flags;
2920 if (ioc->lil_flags & LL_LEASE_RESYNC_DONE) {
2921 if (ioc->lil_count > IOC_IDS_MAX)
2922 GOTO(out, rc = -EINVAL);
/* Variable-sized payload: header plus lil_count resync IDs. */
2924 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
2925 OBD_ALLOC(data, data_size);
2927 GOTO(out, rc = -ENOMEM);
2929 if (copy_from_user(data, (void __user *)arg, data_size))
2930 GOTO(out, rc = -EFAULT);
2932 bias = MDS_CLOSE_RESYNC_DONE;
2935 rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
2939 rc = ll_lease_och_release(inode, file);
2949 OBD_FREE(data, data_size);
2951 rc = ll_lease_type_from_fmode(fmode);
/*
 * Acquire (or, for LL_LEASE_UNLCK, release via ll_file_unlock_lease) a
 * lease on this file. The requested lease mode must be compatible with
 * the file's open mode; LL_LEASE_RESYNC additionally asks the MDS for a
 * resync open and refreshes the layout version into the fd.
 * Only one lease per file descriptor is kept (fd_lease_och).
 * NOTE(review): listing appears elided (fmode for RDLCK, fallthrough
 * RETURNs and default case are not visible); verify against upstream.
 */
2955 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
2958 struct inode *inode = file_inode(file);
2959 struct ll_inode_info *lli = ll_i2info(inode);
2960 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2961 struct obd_client_handle *och = NULL;
2962 __u64 open_flags = 0;
2968 switch (ioc->lil_mode) {
2969 case LL_LEASE_WRLCK:
2970 if (!(file->f_mode & FMODE_WRITE))
2972 fmode = FMODE_WRITE;
2974 case LL_LEASE_RDLCK:
2975 if (!(file->f_mode & FMODE_READ))
2979 case LL_LEASE_UNLCK:
2980 RETURN(ll_file_unlock_lease(file, ioc, arg));
2985 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2987 /* apply for lease */
2988 if (ioc->lil_flags & LL_LEASE_RESYNC)
2989 open_flags = MDS_OPEN_RESYNC;
2990 och = ll_lease_open(inode, file, fmode, open_flags);
2992 RETURN(PTR_ERR(och));
2994 if (ioc->lil_flags & LL_LEASE_RESYNC) {
2995 rc = ll_lease_file_resync(och, inode);
2997 ll_lease_close(och, inode, NULL);
3000 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3002 ll_lease_close(och, inode, NULL);
/* Install the new lease handle unless one is already present. */
3008 mutex_lock(&lli->lli_och_mutex);
3009 if (fd->fd_lease_och == NULL) {
3010 fd->fd_lease_och = och;
3013 mutex_unlock(&lli->lli_och_mutex);
3015 /* impossible now that only excl is supported for now */
3016 ll_lease_close(och, inode, &lease_broken);
/*
 * Main ioctl dispatcher for regular Lustre files. Handles Lustre-specific
 * commands (striping, layout swap, group locks, HSM, leases, ladvise,
 * mirrors, project xattrs, data version, FID/path translation, ...) and
 * falls through to obd_iocontrol() for anything unrecognized.
 * NOTE(review): listing appears elided throughout (switch/RETURN lines,
 * braces, some error paths are not visible); verify individual cases
 * against the upstream Lustre source.
 */
3023 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3025 struct inode *inode = file_inode(file);
3026 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3030 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3031 PFID(ll_inode2fid(inode)), inode, cmd);
3032 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3034 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
3035 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3039 case LL_IOC_GETFLAGS:
3040 /* Get the current value of the file flags */
3041 return put_user(fd->fd_flags, (int __user *)arg);
3042 case LL_IOC_SETFLAGS:
3043 case LL_IOC_CLRFLAGS:
3044 /* Set or clear specific file flags */
3045 /* XXX This probably needs checks to ensure the flags are
3046 * not abused, and to handle any flag side effects.
3048 if (get_user(flags, (int __user *) arg))
3051 if (cmd == LL_IOC_SETFLAGS) {
3052 if ((flags & LL_FILE_IGNORE_LOCK) &&
3053 !(file->f_flags & O_DIRECT)) {
3054 CERROR("%s: unable to disable locking on "
3055 "non-O_DIRECT file\n", current->comm);
3059 fd->fd_flags |= flags;
3061 fd->fd_flags &= ~flags;
3064 case LL_IOC_LOV_SETSTRIPE:
3065 case LL_IOC_LOV_SETSTRIPE_NEW:
3066 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3067 case LL_IOC_LOV_SETEA:
3068 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3069 case LL_IOC_LOV_SWAP_LAYOUTS: {
3071 struct lustre_swap_layouts lsl;
3074 if (copy_from_user(&lsl, (char __user *)arg,
3075 sizeof(struct lustre_swap_layouts)))
3078 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3081 file2 = fget(lsl.sl_fd);
3085 /* O_WRONLY or O_RDWR */
3086 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3087 GOTO(out, rc = -EPERM);
3089 intent = lsl.sl_flags & INTENT_LAYOUTS_CLOSE;
3091 struct inode *inode2;
3092 struct ll_inode_info *lli;
3093 struct obd_client_handle *och = NULL;
3095 lli = ll_i2info(inode);
3096 mutex_lock(&lli->lli_och_mutex);
3097 if (fd->fd_lease_och != NULL) {
3098 och = fd->fd_lease_och;
3099 fd->fd_lease_och = NULL;
3101 mutex_unlock(&lli->lli_och_mutex);
3103 GOTO(out, rc = -ENOLCK);
3104 inode2 = file_inode(file2);
3105 rc = ll_swap_layouts_close(och, inode, inode2, intent);
3107 rc = ll_swap_layouts(file, file2, &lsl);
3113 case LL_IOC_LOV_GETSTRIPE:
3114 case LL_IOC_LOV_GETSTRIPE_NEW:
3115 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3116 case FSFILT_IOC_GETFLAGS:
3117 case FSFILT_IOC_SETFLAGS:
3118 RETURN(ll_iocontrol(inode, file, cmd, arg));
3119 case FSFILT_IOC_GETVERSION_OLD:
3120 case FSFILT_IOC_GETVERSION:
3121 RETURN(put_user(inode->i_generation, (int __user *)arg));
3122 case LL_IOC_GROUP_LOCK:
3123 RETURN(ll_get_grouplock(inode, file, arg));
3124 case LL_IOC_GROUP_UNLOCK:
3125 RETURN(ll_put_grouplock(inode, file, arg));
3126 case IOC_OBD_STATFS:
3127 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3129 /* We need to special case any other ioctls we want to handle,
3130 * to send them to the MDS/OST as appropriate and to properly
3131 * network encode the arg field.
3132 case FSFILT_IOC_SETVERSION_OLD:
3133 case FSFILT_IOC_SETVERSION:
3135 case LL_IOC_FLUSHCTX:
3136 RETURN(ll_flush_ctx(inode));
3137 case LL_IOC_PATH2FID: {
3138 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3139 sizeof(struct lu_fid)))
3144 case LL_IOC_GETPARENT:
3145 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3147 case OBD_IOC_FID2PATH:
3148 RETURN(ll_fid2path(inode, (void __user *)arg));
3149 case LL_IOC_DATA_VERSION: {
3150 struct ioc_data_version idv;
3153 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
3156 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3157 rc = ll_ioc_data_version(inode, &idv);
3160 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3166 case LL_IOC_GET_MDTIDX: {
3169 mdtidx = ll_get_mdt_idx(inode);
3173 if (put_user((int)mdtidx, (int __user *)arg))
3178 case OBD_IOC_GETDTNAME:
3179 case OBD_IOC_GETMDNAME:
3180 RETURN(ll_get_obd_name(inode, cmd, arg));
3181 case LL_IOC_HSM_STATE_GET: {
3182 struct md_op_data *op_data;
3183 struct hsm_user_state *hus;
3190 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3191 LUSTRE_OPC_ANY, hus);
3192 if (IS_ERR(op_data)) {
3194 RETURN(PTR_ERR(op_data));
3197 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3200 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3203 ll_finish_md_op_data(op_data);
3207 case LL_IOC_HSM_STATE_SET: {
3208 struct hsm_state_set *hss;
3215 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3220 rc = ll_hsm_state_set(inode, hss);
3225 case LL_IOC_HSM_ACTION: {
3226 struct md_op_data *op_data;
3227 struct hsm_current_action *hca;
3234 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3235 LUSTRE_OPC_ANY, hca);
3236 if (IS_ERR(op_data)) {
3238 RETURN(PTR_ERR(op_data));
3241 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3244 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3247 ll_finish_md_op_data(op_data);
3251 case LL_IOC_SET_LEASE_OLD: {
3252 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3254 RETURN(ll_file_set_lease(file, &ioc, 0));
3256 case LL_IOC_SET_LEASE: {
3257 struct ll_ioc_lease ioc;
3259 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3262 RETURN(ll_file_set_lease(file, &ioc, arg));
3264 case LL_IOC_GET_LEASE: {
3265 struct ll_inode_info *lli = ll_i2info(inode);
3266 struct ldlm_lock *lock = NULL;
3269 mutex_lock(&lli->lli_och_mutex);
3270 if (fd->fd_lease_och != NULL) {
3271 struct obd_client_handle *och = fd->fd_lease_och;
3273 lock = ldlm_handle2lock(&och->och_lease_handle);
3275 lock_res_and_lock(lock);
3276 if (!ldlm_is_cancel(lock))
3277 fmode = och->och_flags;
3279 unlock_res_and_lock(lock);
3280 LDLM_LOCK_PUT(lock);
3283 mutex_unlock(&lli->lli_och_mutex);
3285 RETURN(ll_lease_type_from_fmode(fmode));
3287 case LL_IOC_HSM_IMPORT: {
3288 struct hsm_user_import *hui;
3294 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3299 rc = ll_hsm_import(inode, file, hui);
3304 case LL_IOC_FUTIMES_3: {
3305 struct ll_futimes_3 lfu;
3307 if (copy_from_user(&lfu,
3308 (const struct ll_futimes_3 __user *)arg,
3312 RETURN(ll_file_futimes_3(file, &lfu));
3314 case LL_IOC_LADVISE: {
3315 struct llapi_ladvise_hdr *k_ladvise_hdr;
3316 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3319 int alloc_size = sizeof(*k_ladvise_hdr);
3322 u_ladvise_hdr = (void __user *)arg;
3323 OBD_ALLOC_PTR(k_ladvise_hdr);
3324 if (k_ladvise_hdr == NULL)
/* First copy just the header to learn lah_count, then reallocate
 * with room for the advice array and copy the whole thing. */
3327 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3328 GOTO(out_ladvise, rc = -EFAULT);
3330 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3331 k_ladvise_hdr->lah_count < 1)
3332 GOTO(out_ladvise, rc = -EINVAL);
3334 num_advise = k_ladvise_hdr->lah_count;
3335 if (num_advise >= LAH_COUNT_MAX)
3336 GOTO(out_ladvise, rc = -EFBIG);
3338 OBD_FREE_PTR(k_ladvise_hdr);
3339 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3340 lah_advise[num_advise]);
3341 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3342 if (k_ladvise_hdr == NULL)
3346 * TODO: submit multiple advices to one server in a single RPC
3348 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3349 GOTO(out_ladvise, rc = -EFAULT);
3351 for (i = 0; i < num_advise; i++) {
3352 struct llapi_lu_ladvise *k_ladvise =
3353 &k_ladvise_hdr->lah_advise[i];
3354 struct llapi_lu_ladvise __user *u_ladvise =
3355 &u_ladvise_hdr->lah_advise[i];
3357 rc = ll_ladvise_sanity(inode, k_ladvise);
3359 GOTO(out_ladvise, rc);
3361 switch (k_ladvise->lla_advice) {
3362 case LU_LADVISE_LOCKNOEXPAND:
3363 rc = ll_lock_noexpand(file,
3364 k_ladvise->lla_peradvice_flags);
3365 GOTO(out_ladvise, rc);
3366 case LU_LADVISE_LOCKAHEAD:
3368 rc = ll_file_lock_ahead(file, k_ladvise);
3371 GOTO(out_ladvise, rc);
3374 &u_ladvise->lla_lockahead_result))
3375 GOTO(out_ladvise, rc = -EFAULT);
3378 rc = ll_ladvise(inode, file,
3379 k_ladvise_hdr->lah_flags,
3382 GOTO(out_ladvise, rc);
3389 OBD_FREE(k_ladvise_hdr, alloc_size);
3392 case LL_IOC_FLR_SET_MIRROR: {
3393 /* mirror I/O must be direct to avoid polluting page cache
3395 if (!(file->f_flags & O_DIRECT))
3398 fd->fd_designated_mirror = (__u32)arg;
3401 case LL_IOC_FSGETXATTR:
3402 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3403 case LL_IOC_FSSETXATTR:
3404 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3406 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3408 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3409 (void __user *)arg));
3413 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Compat helper (kernels without generic_file_llseek_size): validate the
 * computed offset against sign/maxsize constraints and commit it to
 * file->f_pos, resetting f_version on change.
 * NOTE(review): listing appears elided (error RETURNs and final return
 * are not visible); verify against the upstream source.
 */
3414 static inline loff_t
3415 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3417 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3419 if (offset > maxsize)
3422 if (offset != file->f_pos) {
3423 file->f_pos = offset;
3424 file->f_version = 0;
/*
 * Compat copy of the kernel's generic_file_llseek_size(): handle the
 * SEEK_SET/CUR/END (and HOLE/DATA) cases against a caller-supplied
 * maxsize and eof, then commit via llseek_execute().
 * NOTE(review): listing appears elided (the switch cases and several
 * returns are not visible); verify against the upstream source.
 */
3430 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3431 loff_t maxsize, loff_t eof)
3433 struct inode *inode = file_inode(file);
3441 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3442 * position-querying operation. Avoid rewriting the "same"
3443 * f_pos value back to the file because a concurrent read(),
3444 * write() or lseek() might have altered it
3449 * f_lock protects against read/modify/write race with other
3450 * SEEK_CURs. Note that parallel writes and reads behave
3454 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3455 inode_unlock(inode);
3459 * In the generic case the entire file is data, so as long as
3460 * offset isn't at the end of the file then the offset is data.
3467 * There is a virtual hole at the end of the file, so as long as
3468 * offset isn't i_size or larger, return i_size.
3476 return llseek_execute(file, offset, maxsize);
/*
 * llseek entry point: for SEEK_END/SEEK_HOLE/SEEK_DATA first glimpse the
 * file size from the OSTs so i_size is current, then delegate to the
 * generic llseek-size helper with the filesystem's max byte limit.
 */
3480 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3482 struct inode *inode = file_inode(file);
3483 loff_t retval, eof = 0;
3486 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3487 (origin == SEEK_CUR) ? file->f_pos : 0);
3488 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3489 PFID(ll_inode2fid(inode)), inode, retval, retval,
3491 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3493 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3494 retval = ll_glimpse_size(inode);
3497 eof = i_size_read(inode);
3500 retval = ll_generic_file_llseek_size(file, offset, origin,
3501 ll_file_maxbytes(inode), eof);
/*
 * .flush handler: report (and clear) any asynchronous writeback errors
 * recorded for this inode. If the application was already told about a
 * write failure (fd_write_failed), do not report it a second time.
 */
3505 static int ll_flush(struct file *file, fl_owner_t id)
3507 struct inode *inode = file_inode(file);
3508 struct ll_inode_info *lli = ll_i2info(inode);
3509 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3512 LASSERT(!S_ISDIR(inode->i_mode));
3514 /* catch async errors that were recorded back when async writeback
3515 * failed for pages in this mapping. */
3516 rc = lli->lli_async_rc;
3517 lli->lli_async_rc = 0;
3518 if (lli->lli_clob != NULL) {
3519 err = lov_read_and_clear_async_rc(lli->lli_clob);
3524 /* The application has been told write failure already.
3525 * Do not report failure again. */
3526 if (fd->fd_write_failed)
3528 return rc ? -EIO : 0;
3532 * Called to make sure a portion of file has been written out.
3533 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
3535 * Return how many pages have been written.
/*
 * Builds a CIT_FSYNC cl_io over [start, end] and runs it through the
 * CLIO loop; on success returns fi_nr_written (pages written).
 */
3537 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3538 enum cl_fsync_mode mode, int ignore_layout)
3542 struct cl_fsync_io *fio;
3547 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3548 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3551 env = cl_env_get(&refcheck);
3553 RETURN(PTR_ERR(env));
3555 io = vvp_env_thread_io(env);
3556 io->ci_obj = ll_i2info(inode)->lli_clob;
3557 io->ci_ignore_layout = ignore_layout;
3559 /* initialize parameters for sync */
3560 fio = &io->u.ci_fsync;
3561 fio->fi_start = start;
3563 fio->fi_fid = ll_inode2fid(inode);
3564 fio->fi_mode = mode;
3565 fio->fi_nr_written = 0;
3567 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3568 result = cl_io_loop(env, io);
3570 result = io->ci_result;
3572 result = fio->fi_nr_written;
3573 cl_io_fini(env, io);
3574 cl_env_put(env, &refcheck);
3580 * When dentry is provided (the 'else' case), file_dentry() may be
3581 * null and dentry must be used directly rather than pulled from
3582 * file_dentry() as is done otherwise.
/*
 * fsync handler, with three prototype variants selected by kernel compat
 * macros. Flushes dirty pages, reaps recorded async writeback errors,
 * syncs metadata via md_fsync(), and for regular files forces data out
 * with CL_FSYNC_ALL, updating fd_write_failed accordingly.
 * NOTE(review): listing appears elided (declarations of rc/err/lock_inode
 * and some compat branches are not visible); verify against upstream.
 */
3585 #ifdef HAVE_FILE_FSYNC_4ARGS
3586 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3588 struct dentry *dentry = file_dentry(file);
3590 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3591 int ll_fsync(struct file *file, int datasync)
3593 struct dentry *dentry = file_dentry(file);
3595 loff_t end = LLONG_MAX;
3597 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3600 loff_t end = LLONG_MAX;
3602 struct inode *inode = dentry->d_inode;
3603 struct ll_inode_info *lli = ll_i2info(inode);
3604 struct ptlrpc_request *req;
3608 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3609 PFID(ll_inode2fid(inode)), inode);
3610 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3612 #ifdef HAVE_FILE_FSYNC_4ARGS
3613 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3614 lock_inode = !lli->lli_inode_locked;
3618 /* fsync's caller has already called _fdata{sync,write}, we want
3619 * that IO to finish before calling the osc and mdc sync methods */
3620 rc = filemap_fdatawait(inode->i_mapping);
3623 /* catch async errors that were recorded back when async writeback
3624 * failed for pages in this mapping. */
3625 if (!S_ISDIR(inode->i_mode)) {
3626 err = lli->lli_async_rc;
3627 lli->lli_async_rc = 0;
3630 if (lli->lli_clob != NULL) {
3631 err = lov_read_and_clear_async_rc(lli->lli_clob);
3637 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3641 ptlrpc_req_finished(req);
3643 if (S_ISREG(inode->i_mode)) {
3644 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3646 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3647 if (rc == 0 && err < 0)
3650 fd->fd_write_failed = true;
3652 fd->fd_write_failed = false;
3655 #ifdef HAVE_FILE_FSYNC_4ARGS
3657 inode_unlock(inode);
/*
 * flock/POSIX-lock handler: translate the VFS file_lock into an LDLM
 * LDLM_FLOCK enqueue against the MDS, then mirror the result into the
 * local lock bookkeeping (locks_lock_file_wait() or the older
 * flock/posix variants). On local bookkeeping failure the remote lock
 * is rolled back with an LCK_NL (unlock) enqueue.
 * NOTE(review): listing appears elided (switch headers, some RETURNs and
 * declarations of rc/rc2/flags are not visible); verify against upstream.
 */
3663 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3665 struct inode *inode = file_inode(file);
3666 struct ll_sb_info *sbi = ll_i2sbi(inode);
3667 struct ldlm_enqueue_info einfo = {
3668 .ei_type = LDLM_FLOCK,
3669 .ei_cb_cp = ldlm_flock_completion_ast,
3670 .ei_cbdata = file_lock,
3672 struct md_op_data *op_data;
3673 struct lustre_handle lockh = { 0 };
3674 union ldlm_policy_data flock = { { 0 } };
3675 int fl_type = file_lock->fl_type;
3681 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3682 PFID(ll_inode2fid(inode)), file_lock);
3684 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3686 if (file_lock->fl_flags & FL_FLOCK) {
3687 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3688 /* flocks are whole-file locks */
3689 flock.l_flock.end = OFFSET_MAX;
3690 /* For flocks owner is determined by the local file desctiptor*/
3691 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3692 } else if (file_lock->fl_flags & FL_POSIX) {
3693 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3694 flock.l_flock.start = file_lock->fl_start;
3695 flock.l_flock.end = file_lock->fl_end;
3699 flock.l_flock.pid = file_lock->fl_pid;
3701 /* Somewhat ugly workaround for svc lockd.
3702 * lockd installs custom fl_lmops->lm_compare_owner that checks
3703 * for the fl_owner to be the same (which it always is on local node
3704 * I guess between lockd processes) and then compares pid.
3705 * As such we assign pid to the owner field to make it all work,
3706 * conflict with normal locks is unlikely since pid space and
3707 * pointer space for current->files are not intersecting */
3708 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3709 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
3713 einfo.ei_mode = LCK_PR;
3716 /* An unlock request may or may not have any relation to
3717 * existing locks so we may not be able to pass a lock handle
3718 * via a normal ldlm_lock_cancel() request. The request may even
3719 * unlock a byte range in the middle of an existing lock. In
3720 * order to process an unlock request we need all of the same
3721 * information that is given with a normal read or write record
3722 * lock request. To avoid creating another ldlm unlock (cancel)
3723 * message we'll treat a LCK_NL flock request as an unlock. */
3724 einfo.ei_mode = LCK_NL;
3727 einfo.ei_mode = LCK_PW;
3730 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
3745 flags = LDLM_FL_BLOCK_NOWAIT;
3751 flags = LDLM_FL_TEST_LOCK;
3754 CERROR("unknown fcntl lock command: %d\n", cmd);
3758 /* Save the old mode so that if the mode in the lock changes we
3759 * can decrement the appropriate reader or writer refcount. */
3760 file_lock->fl_type = einfo.ei_mode;
3762 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3763 LUSTRE_OPC_ANY, NULL);
3764 if (IS_ERR(op_data))
3765 RETURN(PTR_ERR(op_data));
3767 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
3768 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
3769 flock.l_flock.pid, flags, einfo.ei_mode,
3770 flock.l_flock.start, flock.l_flock.end);
3772 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
3775 /* Restore the file lock type if not TEST lock. */
3776 if (!(flags & LDLM_FL_TEST_LOCK))
3777 file_lock->fl_type = fl_type;
3779 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
3780 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
3781 !(flags & LDLM_FL_TEST_LOCK))
3782 rc2 = locks_lock_file_wait(file, file_lock);
3784 if ((file_lock->fl_flags & FL_FLOCK) &&
3785 (rc == 0 || file_lock->fl_type == F_UNLCK))
3786 rc2 = flock_lock_file_wait(file, file_lock);
3787 if ((file_lock->fl_flags & FL_POSIX) &&
3788 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3789 !(flags & LDLM_FL_TEST_LOCK))
3790 rc2 = posix_lock_file_wait(file, file_lock);
3791 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
3793 if (rc2 && file_lock->fl_type != F_UNLCK) {
3794 einfo.ei_mode = LCK_NL;
3795 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
3800 ll_finish_md_op_data(op_data);
/*
 * Look up a child's FID (and optionally instantiate its inode) by name
 * under @parent via md_getattr_name(). On success *fid is filled from
 * the returned mdt_body; when @inode is non-NULL the inode is prepared
 * from the reply with ll_prep_inode().
 * NOTE(review): listing appears elided (rc checks between steps are not
 * visible); verify against the upstream source.
 */
3805 int ll_get_fid_by_name(struct inode *parent, const char *name,
3806 int namelen, struct lu_fid *fid,
3807 struct inode **inode)
3809 struct md_op_data *op_data = NULL;
3810 struct mdt_body *body;
3811 struct ptlrpc_request *req;
3815 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3816 LUSTRE_OPC_ANY, NULL);
3817 if (IS_ERR(op_data))
3818 RETURN(PTR_ERR(op_data));
3820 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
3821 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3822 ll_finish_md_op_data(op_data);
3826 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3828 GOTO(out_req, rc = -EFAULT);
3830 *fid = body->mbo_fid1;
3833 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
3835 ptlrpc_req_finished(req);
/*
 * Migrate the entry @name under @parent to MDT @mdtidx. Resolves the
 * child inode (dcache first, then by-name getattr), rejects migrating
 * the filesystem root, skips work if the child already lives on the
 * target MDT, and for regular files takes a write lease plus data
 * version so the rename-based migration (MDS_RENAME_MIGRATE) is safe.
 * Retries on -EAGAIN when the file layout changed underneath us.
 * NOTE(review): listing appears elided (several rc checks, the again:
 * label and some cleanup labels are not visible); verify against
 * the upstream source.
 */
3839 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3840 const char *name, int namelen)
3842 struct dentry *dchild = NULL;
3843 struct inode *child_inode = NULL;
3844 struct md_op_data *op_data;
3845 struct ptlrpc_request *request = NULL;
3846 struct obd_client_handle *och = NULL;
3848 struct mdt_body *body;
3850 __u64 data_version = 0;
3853 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3854 name, PFID(ll_inode2fid(parent)), mdtidx);
3856 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3857 0, LUSTRE_OPC_ANY, NULL);
3858 if (IS_ERR(op_data))
3859 RETURN(PTR_ERR(op_data));
3861 /* Get child FID first */
3862 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
3865 dchild = d_lookup(file_dentry(file), &qstr);
3866 if (dchild != NULL) {
3867 if (dchild->d_inode != NULL)
3868 child_inode = igrab(dchild->d_inode);
3872 if (child_inode == NULL) {
3873 rc = ll_get_fid_by_name(parent, name, namelen,
3874 &op_data->op_fid3, &child_inode);
3879 if (child_inode == NULL)
3880 GOTO(out_free, rc = -EINVAL);
3883 * lfs migrate command needs to be blocked on the client
3884 * by checking the migrate FID against the FID of the
3887 if (child_inode == parent->i_sb->s_root->d_inode)
3888 GOTO(out_iput, rc = -EINVAL);
3890 inode_lock(child_inode);
3891 op_data->op_fid3 = *ll_inode2fid(child_inode);
3892 if (!fid_is_sane(&op_data->op_fid3)) {
3893 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
3894 ll_get_fsname(parent->i_sb, NULL, 0), name,
3895 PFID(&op_data->op_fid3));
3896 GOTO(out_unlock, rc = -EINVAL);
3899 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3901 GOTO(out_unlock, rc);
3904 CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name,
3905 PFID(&op_data->op_fid3), mdtidx);
3906 GOTO(out_unlock, rc = 0);
3909 if (S_ISREG(child_inode->i_mode)) {
3910 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
3914 GOTO(out_unlock, rc);
3917 rc = ll_data_version(child_inode, &data_version,
3920 GOTO(out_close, rc);
3922 op_data->op_handle = och->och_fh;
3923 op_data->op_data = och->och_mod;
3924 op_data->op_data_version = data_version;
3925 op_data->op_lease_handle = och->och_lease_handle;
3926 op_data->op_bias |= MDS_RENAME_MIGRATE;
3929 op_data->op_mds = mdtidx;
3930 op_data->op_cli_flags = CLI_MIGRATE;
3931 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3932 namelen, name, namelen, &request);
3934 LASSERT(request != NULL);
3935 ll_update_times(request, parent);
3937 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
3938 LASSERT(body != NULL);
3940 /* If the server does release layout lock, then we cleanup
3941 * the client och here, otherwise release it in out_close: */
3943 body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
3944 obd_mod_put(och->och_mod);
3945 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
3947 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
3953 if (request != NULL) {
3954 ptlrpc_req_finished(request);
3958 /* Try again if the file layout has changed. */
3959 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
3963 if (och != NULL) /* close the file */
3964 ll_lease_close(och, child_inode, NULL);
3966 clear_nlink(child_inode);
3968 inode_unlock(child_inode);
3972 ll_finish_md_op_data(op_data);
/*
 * Stub lock handler for mounts where file locking is disabled.
 * NOTE(review): body not visible in this listing; presumably returns
 * -ENOSYS/-EINVAL — confirm against the upstream source.
 */
3977 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3985 * test if some locks matching bits and l_req_mode are acquired
3986 * - bits can be in different locks
3987 * - if found clear the common lock bits in *bits
3988 * - the bits not found, are kept in *bits
3990 * \param bits [IN] searched lock bits [IN]
3991 * \param l_req_mode [IN] searched lock mode
3992 * \retval boolean, true iff all bits are found
/*
 * Probes one inodebit at a time with LDLM_FL_TEST_LOCK (no lock is
 * actually taken); matched bits are removed from *bits.
 */
3994 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
3996 struct lustre_handle lockh;
3997 union ldlm_policy_data policy;
3998 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
3999 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4008 fid = &ll_i2info(inode)->lli_fid;
4009 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4010 ldlm_lockname[mode]);
4012 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4013 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4014 policy.l_inodebits.bits = *bits & (1 << i);
4015 if (policy.l_inodebits.bits == 0)
4018 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4019 &policy, mode, &lockh)) {
4020 struct ldlm_lock *lock;
4022 lock = ldlm_handle2lock(&lockh);
4025 ~(lock->l_policy_data.l_inodebits.bits);
4026 LDLM_LOCK_PUT(lock);
4028 *bits &= ~policy.l_inodebits.bits;
/*
 * Try to match (and, unlike ll_have_md_lock(), actually take a reference on)
 * a cached MD inodebits lock covering \a bits.  On success \a lockh holds the
 * matched lock handle; the returned mode is the granted mode (0 if no match).
 * NOTE(review): return statement and local declarations are elided in this
 * listing.
 */
4035 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4036 struct lustre_handle *lockh, __u64 flags,
4037 enum ldlm_mode mode)
4039 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4044 fid = &ll_i2info(inode)->lli_fid;
4045 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4047 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4048 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the result of an MD revalidation RPC.
 * -ENOENT (object unlinked on the MDS) is mostly benign and is converted to
 * success after fixing up nlink; other errors are logged.
 * NOTE(review): the return statements themselves are elided in this listing.
 */
4053 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4055 /* Already unlinked. Just update nlink and return success */
4056 if (rc == -ENOENT) {
4058 /* If it is striped directory, and there is bad stripe
4059 * Let's revalidate the dentry again, instead of returning
4061 if (S_ISDIR(inode->i_mode) &&
4062 ll_i2info(inode)->lli_lsm_md != NULL)
4065 /* This path cannot be hit for regular files unless in
4066 * case of obscure races, so no need to validate
4068 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4070 } else if (rc != 0) {
/* -EACCES/-EIDRM are expected (permission/identity churn): log quietly. */
4071 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4072 "%s: revalidate FID "DFID" error: rc = %d\n",
4073 ll_get_fsname(inode->i_sb, NULL, 0),
4074 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate an inode against the MDS with an intent lock RPC (getattr by
 * FID, no name), then refresh dentry/lock state from the reply.
 * NOTE(review): several lines (oit initializer fields, error gotos, RETURN)
 * are elided in this listing.
 */
4080 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4082 struct inode *inode = dentry->d_inode;
4083 struct obd_export *exp = ll_i2mdexp(inode);
4084 struct lookup_intent oit = {
4087 struct ptlrpc_request *req = NULL;
4088 struct md_op_data *op_data;
4092 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4093 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4095 /* Call getattr by fid, so do not provide name at all. */
4096 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4097 LUSTRE_OPC_ANY, NULL);
4098 if (IS_ERR(op_data))
4099 RETURN(PTR_ERR(op_data));
4101 rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4102 ll_finish_md_op_data(op_data);
/* Translate RPC result (-ENOENT handling etc.) before using the reply. */
4104 rc = ll_inode_revalidate_fini(inode, rc);
4108 rc = ll_revalidate_it_finish(req, &oit, dentry);
4110 ll_intent_release(&oit);
4114 /* Unlinked? Unhash dentry, so it is not picked up later by
4115 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4116 * here to preserve get_cwd functionality on 2.6.
4118 if (!dentry->d_inode->i_nlink) {
4119 ll_lock_dcache(inode);
4120 d_lustre_invalidate(dentry, 0);
4121 ll_unlock_dcache(inode);
4124 ll_lookup_finish_locks(&oit, dentry);
4126 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge the per-stripe MD attributes (nlink, blocks,
 * size, a/m/ctime) into the master inode.  Requires lli_lsm_md to be set.
 * NOTE(review): error-return lines after md_merge_attr() are elided here.
 */
4131 static int ll_merge_md_attr(struct inode *inode)
4133 struct cl_attr attr = { 0 };
4136 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
4137 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4138 &attr, ll_md_blocking_ast);
/* Push the merged values into the VFS inode and the Lustre inode info. */
4142 set_nlink(inode, attr.cat_nlink);
4143 inode->i_blocks = attr.cat_blocks;
4144 i_size_write(inode, attr.cat_size);
4146 ll_i2info(inode)->lli_atime = attr.cat_atime;
4147 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4148 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/*
 * Squash a device number so it passes old_valid_dev() for 32-bit compat
 * stat syscalls (major and minor each truncated to 8 bits).
 */
4153 static inline dev_t ll_compat_encode_dev(dev_t dev)
4155 /* The compat_sys_*stat*() syscalls will fail unless the
4156 * device majors and minors are both less than 256. Note that
4157 * the value returned here will be passed through
4158 * old_encode_dev() in cp_compat_stat(). And so we are not
4159 * trying to return a valid compat (u16) device number, just
4160 * one that will pass the old_valid_dev() check. */
4162 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
/*
 * ->getattr handler: revalidate the inode with the MDS, glimpse the size for
 * regular files (unless an HSM restore is in progress, where the MDT already
 * sent the right size), merge striped-dir attributes, then fill *stat.
 * Two signatures are compiled depending on kernel API (path-based vs
 * vfsmount-based getattr).
 * NOTE(review): #else/#endif lines and some error-return lines are elided in
 * this listing.
 */
4165 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4166 int ll_getattr(const struct path *path, struct kstat *stat,
4167 u32 request_mask, unsigned int flags)
4169 struct dentry *de = path->dentry;
4171 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4174 struct inode *inode = de->d_inode;
4175 struct ll_sb_info *sbi = ll_i2sbi(inode);
4176 struct ll_inode_info *lli = ll_i2info(inode);
4179 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4181 rc = ll_inode_revalidate(de, IT_GETATTR);
4185 if (S_ISREG(inode->i_mode)) {
4186 /* In case of restore, the MDT has the right size and has
4187 * already send it back without granting the layout lock,
4188 * inode is up-to-date so glimpse is useless.
4189 * Also to glimpse we need the layout, in case of a running
4190 * restore the MDT holds the layout lock so the glimpse will
4191 * block up to the end of restore (getattr will block)
4193 if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4194 rc = ll_glimpse_size(inode);
4199 /* If object isn't regular a file then don't validate size. */
4200 if (S_ISDIR(inode->i_mode) &&
4201 lli->lli_lsm_md != NULL) {
4202 rc = ll_merge_md_attr(inode);
/* Non-striped case: copy cached MD times into the VFS inode. */
4207 LTIME_S(inode->i_atime) = lli->lli_atime;
4208 LTIME_S(inode->i_mtime) = lli->lli_mtime;
4209 LTIME_S(inode->i_ctime) = lli->lli_ctime;
4212 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
/* 32-bit userspace: build a 32-bit-safe ino and squash device numbers. */
4214 if (ll_need_32bit_api(sbi)) {
4215 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4216 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4217 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4219 stat->ino = inode->i_ino;
4220 stat->dev = inode->i_sb->s_dev;
4221 stat->rdev = inode->i_rdev;
4224 stat->mode = inode->i_mode;
4225 stat->uid = inode->i_uid;
4226 stat->gid = inode->i_gid;
4227 stat->atime = inode->i_atime;
4228 stat->mtime = inode->i_mtime;
4229 stat->ctime = inode->i_ctime;
/* Prefer the tunable per-sb stat blocksize when set (?: GNU extension). */
4230 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4232 stat->nlink = inode->i_nlink;
4233 stat->size = i_size_read(inode);
4234 stat->blocks = inode->i_blocks;
/*
 * ->fiemap handler: marshal the kernel's fiemap_extent_info into a struct
 * fiemap buffer, run ll_do_fiemap(), and copy the mapped extents back to
 * userspace.  Only the first extent is copied *in* (it may carry the
 * FIEMAP_EXTENT_LAST continuation cookie); all mapped extents are copied out.
 * NOTE(review): allocation-failure check and final return are elided in this
 * listing.
 */
4239 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4240 __u64 start, __u64 len)
4244 struct fiemap *fiemap;
4245 unsigned int extent_count = fieinfo->fi_extents_max;
4247 num_bytes = sizeof(*fiemap) + (extent_count *
4248 sizeof(struct fiemap_extent));
4249 OBD_ALLOC_LARGE(fiemap, num_bytes);
4254 fiemap->fm_flags = fieinfo->fi_flags;
4255 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4256 fiemap->fm_start = start;
4257 fiemap->fm_length = len;
/* Seed the request with the caller's first extent only. */
4258 if (extent_count > 0 &&
4259 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4260 sizeof(struct fiemap_extent)) != 0)
4261 GOTO(out, rc = -EFAULT);
4263 rc = ll_do_fiemap(inode, fiemap, num_bytes);
4265 fieinfo->fi_flags = fiemap->fm_flags;
4266 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4267 if (extent_count > 0 &&
4268 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4269 fiemap->fm_mapped_extents *
4270 sizeof(struct fiemap_extent)) != 0)
4271 GOTO(out, rc = -EFAULT);
4273 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * ->get_acl handler: return a referenced copy of the cached POSIX ACL held
 * in ll_inode_info, under lli_lock.  The VFS releases the reference.
 */
4277 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4279 struct ll_inode_info *lli = ll_i2info(inode);
4280 struct posix_acl *acl = NULL;
4283 spin_lock(&lli->lli_lock);
4284 /* VFS' acl_permission_check->check_acl will release the refcount */
4285 acl = posix_acl_dup(lli->lli_posix_acl);
4286 spin_unlock(&lli->lli_lock);
/* ->set_acl handler (only on kernels with inode_operations.set_acl and
 * CONFIG_FS_POSIX_ACL): serialize the ACL to an xattr value and store it via
 * __vfs_setxattr(), then refresh the kernel's ACL cache.
 * NOTE(review): switch/case braces, NULL checks and cleanup path are elided
 * in this listing. */
4291 #ifdef HAVE_IOP_SET_ACL
4292 #ifdef CONFIG_FS_POSIX_ACL
4293 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4295 const char *name = NULL;
4302 case ACL_TYPE_ACCESS:
/* May also rewrite inode->i_mode to fold in the ACL's mask bits. */
4304 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4308 name = XATTR_NAME_POSIX_ACL_ACCESS;
4310 case ACL_TYPE_DEFAULT:
/* Default ACLs only make sense on directories. */
4311 if (!S_ISDIR(inode->i_mode))
4312 GOTO(out, rc = acl ? -EACCES : 0);
4313 name = XATTR_NAME_POSIX_ACL_DEFAULT;
4316 GOTO(out, rc = -EINVAL);
4320 size = posix_acl_xattr_size(acl->a_count);
4321 value = kmalloc(size, GFP_NOFS);
4323 GOTO(out, rc = -ENOMEM);
4325 rc = posix_acl_to_xattr(&init_user_ns, acl, value, size);
4330 /* dentry is only used for *.lov attributes so it's safe to be NULL */
4331 rc = __vfs_setxattr(NULL, inode, name, value, size, XATTR_CREATE);
/* Keep the kernel ACL cache coherent with what we just stored. */
4336 set_cached_acl(inode, type, acl);
4338 forget_cached_acl(inode, type);
4341 #endif /* CONFIG_FS_POSIX_ACL */
4342 #endif /* HAVE_IOP_SET_ACL */
/* ACL check callback for ll_generic_permission() on kernels without the
 * 2-arg generic_permission().  Signature varies with kernel API (4-arg
 * variant takes IPERM flags and must bail under RCU walk).
 * NOTE(review): return lines and the CONFIG_FS_POSIX_ACL=n stub body are
 * elided in this listing. */
4344 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4346 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4347 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4349 ll_check_acl(struct inode *inode, int mask)
4352 # ifdef CONFIG_FS_POSIX_ACL
4353 struct posix_acl *acl;
4357 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
/* Cannot take lli_lock in RCU-walk mode; tell the VFS to retry ref-walk. */
4358 if (flags & IPERM_FLAG_RCU)
4361 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4366 rc = posix_acl_permission(inode, acl, mask);
4367 posix_acl_release(acl);
4370 # else /* !CONFIG_FS_POSIX_ACL */
4372 # endif /* CONFIG_FS_POSIX_ACL */
4374 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * ->permission handler.  Revalidates the root inode on first access,
 * optionally squashes root's fsuid/fsgid (and drops FS capabilities) per the
 * root-squash configuration, then defers to generic permission checking.
 * Signature varies with kernel API; RCU-walk is punted back to the VFS.
 * NOTE(review): several lines (rc declaration, -ECHILD returns, cred error
 * handling, final return) are elided in this listing.
 */
4376 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4377 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4379 # ifdef HAVE_INODE_PERMISION_2ARGS
4380 int ll_inode_permission(struct inode *inode, int mask)
4382 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4387 struct ll_sb_info *sbi;
4388 struct root_squash_info *squash;
4389 struct cred *cred = NULL;
4390 const struct cred *old_cred = NULL;
4392 bool squash_id = false;
/* RCU path walk: we may block below (revalidate RPC), so refuse. */
4395 #ifdef MAY_NOT_BLOCK
4396 if (mask & MAY_NOT_BLOCK)
4398 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4399 if (flags & IPERM_FLAG_RCU)
4403 /* as root inode are NOT getting validated in lookup operation,
4404 * need to do it before permission check. */
4406 if (inode == inode->i_sb->s_root->d_inode) {
4407 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4412 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4413 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4415 /* squash fsuid/fsgid if needed */
4416 sbi = ll_i2sbi(inode);
4417 squash = &sbi->ll_squash;
/* Squash only when configured, caller is root, and nosquash not set. */
4418 if (unlikely(squash->rsi_uid != 0 &&
4419 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4420 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4424 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4425 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4426 squash->rsi_uid, squash->rsi_gid);
4428 /* update current process's credentials
4429 * and FS capability */
4430 cred = prepare_creds();
4434 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4435 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* Drop every filesystem-related capability from the squashed creds. */
4436 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4437 if ((1 << cap) & CFS_CAP_FS_MASK)
4438 cap_lower(cred->cap_effective, cap);
4440 old_cred = override_creds(cred);
4443 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4444 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4445 /* restore current process's credentials and FS capability */
4447 revert_creds(old_cred);
/* Default file_operations (-o localflock): no .flock/.lock methods, so the
 * kernel's locally-consistent flock handling applies.  Read/write entries
 * vary with kernel API (iter-based vs aio-based). */
4454 /* -o localflock - only provides locally consistent flock locks */
4455 struct file_operations ll_file_operations = {
4456 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4457 # ifdef HAVE_SYNC_READ_WRITE
4458 .read = new_sync_read,
4459 .write = new_sync_write,
4461 .read_iter = ll_file_read_iter,
4462 .write_iter = ll_file_write_iter,
4463 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4464 .read = ll_file_read,
4465 .aio_read = ll_file_aio_read,
4466 .write = ll_file_write,
4467 .aio_write = ll_file_aio_write,
4468 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4469 .unlocked_ioctl = ll_file_ioctl,
4470 .open = ll_file_open,
4471 .release = ll_file_release,
4472 .mmap = ll_file_mmap,
4473 .llseek = ll_file_seek,
4474 .splice_read = ll_file_splice_read,
/* file_operations for -o flock: identical to the default table but routes
 * both BSD flock and POSIX lock requests to ll_file_flock for cluster-wide
 * consistency. */
4479 struct file_operations ll_file_operations_flock = {
4480 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4481 # ifdef HAVE_SYNC_READ_WRITE
4482 .read = new_sync_read,
4483 .write = new_sync_write,
4484 # endif /* HAVE_SYNC_READ_WRITE */
4485 .read_iter = ll_file_read_iter,
4486 .write_iter = ll_file_write_iter,
4487 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4488 .read = ll_file_read,
4489 .aio_read = ll_file_aio_read,
4490 .write = ll_file_write,
4491 .aio_write = ll_file_aio_write,
4492 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4493 .unlocked_ioctl = ll_file_ioctl,
4494 .open = ll_file_open,
4495 .release = ll_file_release,
4496 .mmap = ll_file_mmap,
4497 .llseek = ll_file_seek,
4498 .splice_read = ll_file_splice_read,
4501 .flock = ll_file_flock,
4502 .lock = ll_file_flock
/* file_operations for -o noflock: same table, but .flock/.lock point at
 * ll_file_noflock so userspace lock requests fail explicitly. */
4505 /* These are for -o noflock - to return ENOSYS on flock calls */
4506 struct file_operations ll_file_operations_noflock = {
4507 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4508 # ifdef HAVE_SYNC_READ_WRITE
4509 .read = new_sync_read,
4510 .write = new_sync_write,
4511 # endif /* HAVE_SYNC_READ_WRITE */
4512 .read_iter = ll_file_read_iter,
4513 .write_iter = ll_file_write_iter,
4514 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4515 .read = ll_file_read,
4516 .aio_read = ll_file_aio_read,
4517 .write = ll_file_write,
4518 .aio_write = ll_file_aio_write,
4519 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4520 .unlocked_ioctl = ll_file_ioctl,
4521 .open = ll_file_open,
4522 .release = ll_file_release,
4523 .mmap = ll_file_mmap,
4524 .llseek = ll_file_seek,
4525 .splice_read = ll_file_splice_read,
4528 .flock = ll_file_noflock,
4529 .lock = ll_file_noflock
/* inode_operations for regular Lustre files; xattr and ACL entries are
 * conditional on the kernel providing those inode_operations members. */
4532 struct inode_operations ll_file_inode_operations = {
4533 .setattr = ll_setattr,
4534 .getattr = ll_getattr,
4535 .permission = ll_inode_permission,
4536 #ifdef HAVE_IOP_XATTR
4537 .setxattr = ll_setxattr,
4538 .getxattr = ll_getxattr,
4539 .removexattr = ll_removexattr,
4541 .listxattr = ll_listxattr,
4542 .fiemap = ll_fiemap,
4543 #ifdef HAVE_IOP_GET_ACL
4544 .get_acl = ll_get_acl,
4546 #ifdef HAVE_IOP_SET_ACL
4547 .set_acl = ll_set_acl,
/*
 * Push a layout configuration into the cl_object stack for this inode.
 * For OBJECT_CONF_SET, the layout must be applied before the DLM layout lock
 * is allowed to match, and the cached layout generation is refreshed.
 * NOTE(review): env/refcheck declarations, error gotos and RETURN are elided
 * in this listing.
 */
4551 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4553 struct ll_inode_info *lli = ll_i2info(inode);
4554 struct cl_object *obj = lli->lli_clob;
4563 env = cl_env_get(&refcheck);
4565 RETURN(PTR_ERR(env));
4567 rc = cl_conf_set(env, lli->lli_clob, conf);
4571 if (conf->coc_opc == OBJECT_CONF_SET) {
4572 struct ldlm_lock *lock = conf->coc_lock;
4573 struct cl_layout cl = {
4577 LASSERT(lock != NULL);
4578 LASSERT(ldlm_has_layout(lock));
4580 /* it can only be allowed to match after layout is
4581 * applied to inode otherwise false layout would be
4582 * seen. Applying layout should happen before dropping
4583 * the intent lock. */
4584 ldlm_lock_allow_match(lock);
4586 rc = cl_object_layout_get(env, obj, &cl);
4591 DFID": layout version change: %u -> %u\n",
4592 PFID(&lli->lli_fid), ll_layout_version_get(lli),
4594 ll_layout_version_set(lli, cl.cl_layout_gen);
4598 cl_env_put(env, &refcheck);
4603 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
4604 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4607 struct ll_sb_info *sbi = ll_i2sbi(inode);
4608 struct ptlrpc_request *req;
4609 struct mdt_body *body;
4616 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4617 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4618 lock->l_lvb_data, lock->l_lvb_len);
/* Layout already attached to the lock: nothing to fetch. */
4620 if (lock->l_lvb_data != NULL)
4623 /* if layout lock was granted right away, the layout is returned
4624 * within DLM_LVB of dlm reply; otherwise if the lock was ever
4625 * blocked and then granted via completion ast, we have to fetch
4626 * layout here. Please note that we can't use the LVB buffer in
4627 * completion AST because it doesn't have a large enough buffer */
4628 rc = ll_get_default_mdsize(sbi, &lmmsize);
4630 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4631 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
4636 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4638 GOTO(out, rc = -EPROTO);
4640 lmmsize = body->mbo_eadatasize;
4641 if (lmmsize == 0) /* empty layout */
4644 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4646 GOTO(out, rc = -EFAULT);
/* Copy the LOV EA into a buffer the lock can own as its LVB. */
4648 OBD_ALLOC_LARGE(lvbdata, lmmsize);
4649 if (lvbdata == NULL)
4650 GOTO(out, rc = -ENOMEM);
4652 memcpy(lvbdata, lmm, lmmsize);
4653 lock_res_and_lock(lock);
/* Attach only if nobody raced us; otherwise free our copy below. */
4654 if (unlikely(lock->l_lvb_data == NULL)) {
4655 lock->l_lvb_type = LVB_T_LAYOUT;
4656 lock->l_lvb_data = lvbdata;
4657 lock->l_lvb_len = lmmsize;
4660 unlock_res_and_lock(lock);
4663 OBD_FREE_LARGE(lvbdata, lmmsize);
4668 ptlrpc_req_finished(req);
4673 * Apply the layout to the inode. Layout lock is held and will be released
4676 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4677 struct inode *inode)
4679 struct ll_inode_info *lli = ll_i2info(inode);
4680 struct ll_sb_info *sbi = ll_i2sbi(inode);
4681 struct ldlm_lock *lock;
4682 struct cl_object_conf conf;
4685 bool wait_layout = false;
4688 LASSERT(lustre_handle_is_used(lockh));
4690 lock = ldlm_handle2lock(lockh);
4691 LASSERT(lock != NULL);
4692 LASSERT(ldlm_has_layout(lock));
4694 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4695 PFID(&lli->lli_fid), inode);
4697 /* in case this is a caching lock and reinstate with new inode */
4698 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
4700 lock_res_and_lock(lock);
4701 lvb_ready = ldlm_is_lvb_ready(lock);
4702 unlock_res_and_lock(lock);
4704 /* checking lvb_ready is racy but this is okay. The worst case is
4705 * that multi processes may configure the file on the same time. */
/* Layout not attached to the lock yet: fetch it from the MDT. */
4709 rc = ll_layout_fetch(inode, lock);
4713 /* for layout lock, lmm is stored in lock's lvb.
4714 * lvb_data is immutable if the lock is held so it's safe to access it
4717 * set layout to file. Unlikely this will fail as old layout was
4718 * surely eliminated */
4719 memset(&conf, 0, sizeof conf);
4720 conf.coc_opc = OBJECT_CONF_SET;
4721 conf.coc_inode = inode;
4722 conf.coc_lock = lock;
4723 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
4724 conf.u.coc_layout.lb_len = lock->l_lvb_len;
4725 rc = ll_layout_conf(inode, &conf);
4727 /* refresh layout failed, need to wait */
4728 wait_layout = rc == -EBUSY;
4731 LDLM_LOCK_PUT(lock);
4732 ldlm_lock_decref(lockh, mode);
4734 /* wait for IO to complete if it's still being used. */
4736 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
4737 ll_get_fsname(inode->i_sb, NULL, 0),
4738 PFID(&lli->lli_fid), inode);
/* OBJECT_CONF_WAIT blocks until in-flight IO under the old layout drains. */
4740 memset(&conf, 0, sizeof conf);
4741 conf.coc_opc = OBJECT_CONF_WAIT;
4742 conf.coc_inode = inode;
4743 rc = ll_layout_conf(inode, &conf);
4747 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4748 ll_get_fsname(inode->i_sb, NULL, 0),
4749 PFID(&lli->lli_fid), rc);
4755 * Issue layout intent RPC to MDS.
4756 * \param inode [in] file inode
4757 * \param intent [in] layout intent
4759 * \retval 0 on success
4760 * \retval < 0 error code
4762 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
4764 struct ll_inode_info *lli = ll_i2info(inode);
4765 struct ll_sb_info *sbi = ll_i2sbi(inode);
4766 struct md_op_data *op_data;
4767 struct lookup_intent it;
4768 struct ptlrpc_request *req;
4772 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4773 0, 0, LUSTRE_OPC_ANY, NULL);
4774 if (IS_ERR(op_data))
4775 RETURN(PTR_ERR(op_data));
/* The layout_intent itself rides in op_data as opaque intent data. */
4777 op_data->op_data = intent;
4778 op_data->op_data_size = sizeof(*intent);
4780 memset(&it, 0, sizeof(it));
4781 it.it_op = IT_LAYOUT;
/* Write/truncate intents need a write-mode layout lock. */
4782 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
4783 intent->li_opc == LAYOUT_INTENT_TRUNC)
4784 it.it_flags = FMODE_WRITE;
4786 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4787 ll_get_fsname(inode->i_sb, NULL, 0),
4788 PFID(&lli->lli_fid), inode);
4790 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
4791 &ll_md_blocking_ast, 0);
4792 if (it.it_request != NULL)
4793 ptlrpc_req_finished(it.it_request);
4794 it.it_request = NULL;
4796 ll_finish_md_op_data(op_data);
4798 /* set lock data in case this is a new lock */
4800 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4802 ll_intent_drop_lock(&it);
4808 * This function checks if there exists a LAYOUT lock on the client side,
4809 * or enqueues it if it doesn't have one in cache.
4811 * This function will not hold layout lock so it may be revoked any time after
4812 * this function returns. Any operations depend on layout should be redone
4815 * This function should be called before lov_io_init() to get an uptodate
4816 * layout version, the caller should save the version number and after IO
4817 * is finished, this function should be called again to verify that layout
4818 * is not changed during IO time.
4820 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4822 struct ll_inode_info *lli = ll_i2info(inode);
4823 struct ll_sb_info *sbi = ll_i2sbi(inode);
4824 struct lustre_handle lockh;
4825 struct layout_intent intent = {
4826 .li_opc = LAYOUT_INTENT_ACCESS,
4828 enum ldlm_mode mode;
/* Fast path: layout lock disabled, or a generation is already cached. */
4832 *gen = ll_layout_version_get(lli);
4833 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
4837 LASSERT(fid_is_sane(ll_inode2fid(inode)));
4838 LASSERT(S_ISREG(inode->i_mode));
4840 /* take layout lock mutex to enqueue layout lock exclusively. */
4841 mutex_lock(&lli->lli_layout_mutex);
4844 /* mostly layout lock is caching on the local side, so try to
4845 * match it before grabbing layout lock mutex. */
4846 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4847 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
4848 if (mode != 0) { /* hit cached lock */
4849 rc = ll_layout_lock_set(&lockh, mode, inode);
/* Cache miss: go to the MDT with an ACCESS layout intent. */
4855 rc = ll_layout_intent(inode, &intent);
4861 *gen = ll_layout_version_get(lli);
4862 mutex_unlock(&lli->lli_layout_mutex);
4868 * Issue layout intent RPC indicating where in a file an IO is about to write.
4870 * \param[in] inode file inode.
4871 * \param[in] ext write range with start offset of file in bytes where
4872 * an IO is about to write, and exclusive end offset in
4875 * \retval 0 on success
4876 * \retval < 0 error code
4878 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
4879 struct lu_extent *ext)
/* Build the intent from the caller's opc/extent and forward it to the MDT.
 * NOTE(review): the .li_opc initializer line is elided in this listing. */
4881 struct layout_intent intent = {
4883 .li_extent.e_start = ext->e_start,
4884 .li_extent.e_end = ext->e_end,
4889 rc = ll_layout_intent(inode, &intent);
4895 * This function send a restore request to the MDT
4897 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4899 struct hsm_user_request *hur;
4903 len = sizeof(struct hsm_user_request) +
4904 sizeof(struct hsm_user_item);
4905 OBD_ALLOC(hur, len);
4909 hur->hur_request.hr_action = HUA_RESTORE;
4910 hur->hur_request.hr_archive_id = 0;
4911 hur->hur_request.hr_flags = 0;
4912 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4913 sizeof(hur->hur_user_item[0].hui_fid));
4914 hur->hur_user_item[0].hui_extent.offset = offset;
4915 hur->hur_user_item[0].hui_extent.length = length;
4916 hur->hur_request.hr_itemcount = 1;
4917 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,