4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2015, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <linux/pagemap.h>
46 #include <linux/file.h>
47 #include <linux/sched.h>
48 #include <linux/user_namespace.h>
49 #ifdef HAVE_UIDGID_HEADER
50 # include <linux/uidgid.h>
52 #include <lustre/ll_fiemap.h>
54 #include <lustre_ioctl.h>
55 #include <lustre_swab.h>
57 #include "cl_object.h"
58 #include "llite_internal.h"
59 #include "vvp_internal.h"
/* Forward declarations of helpers defined later in this file.
 * NOTE(review): this extraction dropped lines — the return type of
 * ll_put_grouplock() and the trailing parameters of ll_lease_close()
 * are missing here; confirm against the complete source. */
62 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
64 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
67 static enum llioc_iter
68 ll_iocontrol_call(struct inode *inode, struct file *file,
69 unsigned int cmd, unsigned long arg, int *rcp);
71 static struct ll_file_data *ll_file_data_get(void)
73 struct ll_file_data *fd;
75 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
79 fd->fd_write_failed = false;
84 static void ll_file_data_put(struct ll_file_data *fd)
87 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
91 * Packs all the attributes into @op_data for the CLOSE rpc.
93 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
94 struct obd_client_handle *och)
98 ll_prep_md_op_data(op_data, inode, NULL, NULL,
99 0, 0, LUSTRE_OPC_ANY, NULL);
101 op_data->op_attr.ia_mode = inode->i_mode;
102 op_data->op_attr.ia_atime = inode->i_atime;
103 op_data->op_attr.ia_mtime = inode->i_mtime;
104 op_data->op_attr.ia_ctime = inode->i_ctime;
105 op_data->op_attr.ia_size = i_size_read(inode);
106 op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
107 ATTR_MTIME | ATTR_MTIME_SET |
108 ATTR_CTIME | ATTR_CTIME_SET;
109 op_data->op_attr_blocks = inode->i_blocks;
110 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
111 op_data->op_handle = och->och_fh;
113 if (och->och_flags & FMODE_WRITE &&
114 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
115 /* For HSM: if inode data has been modified, pack it so that
116 * MDT can set data dirty flag in the archive. */
117 op_data->op_bias |= MDS_DATA_MODIFIED;
/* Send the CLOSE RPC for an open handle to the MDT, optionally biased with
 * an HSM-release or layout-swap intent, then tear down replay data and mark
 * the handle dead.
 * NOTE(review): this extraction dropped structural lines here (the switch
 * statement opener, break/default, rc declaration, out: label, braces) —
 * the tokens below are preserved as-is; confirm against the full source. */
123 * Perform a close, possibly with a bias.
124 * The meaning of "data" depends on the value of "bias".
126 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
127 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
130 static int ll_close_inode_openhandle(struct inode *inode,
131 struct obd_client_handle *och,
132 enum mds_op_bias bias, void *data)
134 struct obd_export *md_exp = ll_i2mdexp(inode);
135 const struct ll_inode_info *lli = ll_i2info(inode);
136 struct md_op_data *op_data;
137 struct ptlrpc_request *req = NULL;
/* Guard against a torn-down MDC connection before building the request. */
141 if (class_exp2obd(md_exp) == NULL) {
142 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
143 ll_get_fsname(inode->i_sb, NULL, 0),
144 PFID(&lli->lli_fid));
148 OBD_ALLOC_PTR(op_data);
149 /* We leak openhandle and request here on error, but not much to be
150 * done in OOM case since app won't retry close on error either. */
152 GOTO(out, rc = -ENOMEM);
154 ll_prepare_close(inode, op_data, och);
/* Layout swap: fid2 identifies the peer inode; no data version used. */
156 case MDS_CLOSE_LAYOUT_SWAP:
157 LASSERT(data != NULL);
158 op_data->op_bias |= MDS_CLOSE_LAYOUT_SWAP;
159 op_data->op_data_version = 0;
160 op_data->op_lease_handle = och->och_lease_handle;
161 op_data->op_fid2 = *ll_inode2fid(data);
/* HSM release: data carries the __u64 data version to pin. */
164 case MDS_HSM_RELEASE:
165 LASSERT(data != NULL);
166 op_data->op_bias |= MDS_HSM_RELEASE;
167 op_data->op_data_version = *(__u64 *)data;
168 op_data->op_lease_handle = och->och_lease_handle;
169 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
173 LASSERT(data == NULL);
177 rc = md_close(md_exp, op_data, och->och_mod, &req);
178 if (rc != 0 && rc != -EINTR)
179 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
180 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
/* For biased closes, check whether the server actually executed the
 * intent (OBD_MD_CLOSE_INTENT_EXECED in the reply body). */
183 op_data->op_bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)) {
184 struct mdt_body *body;
186 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
187 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
191 ll_finish_md_op_data(op_data);
/* Drop replay data and poison the handle so reuse is detectable. */
195 md_clear_open_replay_data(md_exp, och);
196 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
199 ptlrpc_req_finished(req); /* This is close request */
/* Close the MDS open handle matching \a fmode (write/exec/read) if no other
 * opens still reference it.
 * NOTE(review): extraction dropped lines (och swap/clear under the mutex,
 * returns, braces); tokens below are preserved as-is. */
203 int ll_md_real_close(struct inode *inode, fmode_t fmode)
205 struct ll_inode_info *lli = ll_i2info(inode);
206 struct obd_client_handle **och_p;
207 struct obd_client_handle *och;
/* Select which cached open handle (and its refcount) fmode refers to. */
212 if (fmode & FMODE_WRITE) {
213 och_p = &lli->lli_mds_write_och;
214 och_usecount = &lli->lli_open_fd_write_count;
215 } else if (fmode & FMODE_EXEC) {
216 och_p = &lli->lli_mds_exec_och;
217 och_usecount = &lli->lli_open_fd_exec_count;
219 LASSERT(fmode & FMODE_READ);
220 och_p = &lli->lli_mds_read_och;
221 och_usecount = &lli->lli_open_fd_read_count;
224 mutex_lock(&lli->lli_och_mutex);
225 if (*och_usecount > 0) {
226 /* There are still users of this handle, so skip
228 mutex_unlock(&lli->lli_och_mutex);
234 mutex_unlock(&lli->lli_och_mutex);
237 /* There might be a race and this handle may already
239 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/* Per-struct-file close: drop group lock and lease if held, release this
 * fd's reference on the cached MDS open handle, and only talk to the MDS
 * (ll_md_real_close) when no matching OPEN lock lets us skip the RPC.
 * NOTE(review): extraction dropped lines (lockmode selection, braces,
 * return); tokens below are preserved as-is. */
245 static int ll_md_close(struct inode *inode, struct file *file)
247 union ldlm_policy_data policy = {
248 .l_inodebits = { MDS_INODELOCK_OPEN },
/* LDLM_FL_TEST_LOCK: only probe for a matching lock, do not take a ref. */
250 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
251 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
252 struct ll_inode_info *lli = ll_i2info(inode);
253 struct lustre_handle lockh;
254 enum ldlm_mode lockmode;
258 /* clear group lock, if present */
259 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
260 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
262 if (fd->fd_lease_och != NULL) {
265 /* Usually the lease is not released when the
266 * application crashed, we need to release here. */
267 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
268 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
269 PFID(&lli->lli_fid), rc, lease_broken);
271 fd->fd_lease_och = NULL;
/* fd_och is set when a lease took ownership of the cached open handle. */
274 if (fd->fd_och != NULL) {
275 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
280 /* Let's see if we have good enough OPEN lock on the file and if
281 we can skip talking to MDS */
282 mutex_lock(&lli->lli_och_mutex);
283 if (fd->fd_omode & FMODE_WRITE) {
285 LASSERT(lli->lli_open_fd_write_count);
286 lli->lli_open_fd_write_count--;
287 } else if (fd->fd_omode & FMODE_EXEC) {
289 LASSERT(lli->lli_open_fd_exec_count);
290 lli->lli_open_fd_exec_count--;
293 LASSERT(lli->lli_open_fd_read_count);
294 lli->lli_open_fd_read_count--;
296 mutex_unlock(&lli->lli_och_mutex);
/* No cached OPEN ibits lock of the needed mode -> close on the MDS now. */
298 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
299 LDLM_IBITS, &policy, lockmode, &lockh))
300 rc = ll_md_real_close(inode, fd->fd_omode);
303 LUSTRE_FPRIVATE(file) = NULL;
304 ll_file_data_put(fd);
309 /* While this returns an error code, fput() the caller does not, so we need
310 * to make every effort to clean up all of our state here. Also, applications
311 * rarely check close errors and even if an error is returned they will not
312 * re-try the close call.
/* VFS ->release() entry point: deauthorize statahead for directories,
 * short-circuit for the root dentry, flush async write errors for regular
 * files, then perform the metadata close. */
314 int ll_file_release(struct inode *inode, struct file *file)
316 struct ll_file_data *fd;
317 struct ll_sb_info *sbi = ll_i2sbi(inode);
318 struct ll_inode_info *lli = ll_i2info(inode);
322 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
323 PFID(ll_inode2fid(inode)), inode);
/* Root is excluded from release stats (see the matching check below). */
325 if (inode->i_sb->s_root != file_dentry(file))
326 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
327 fd = LUSTRE_FPRIVATE(file)
330 /* The last ref on @file, maybe not the owner pid of statahead,
331 * because parent and child process can share the same file handle. */
332 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
333 ll_deauthorize_statahead(inode, fd);
/* Root dentry: nothing was opened on the MDS, just free fd and return. */
335 if (inode->i_sb->s_root == file_dentry(file)) {
336 LUSTRE_FPRIVATE(file) = NULL;
337 ll_file_data_put(fd);
341 if (!S_ISDIR(inode->i_mode)) {
342 if (lli->lli_clob != NULL)
343 lov_read_and_clear_async_rc(lli->lli_clob);
344 lli->lli_async_rc = 0;
347 rc = ll_md_close(inode, file);
349 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
350 libcfs_debug_dumplog();
/* Send an intent-OPEN to the MDS for \a file. The name is packed only when
 * the server lacks OBD_CONNECT_OPEN_BY_FID and the dentry name is valid;
 * on success the reply is used to refresh the inode and set lock data.
 * NOTE(review): extraction dropped lines (rc/len declarations, braces,
 * labels); tokens below are preserved as-is. */
355 static int ll_intent_file_open(struct file *file, void *lmm, int lmmsize,
356 struct lookup_intent *itp)
358 struct dentry *de = file_dentry(file);
359 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
360 struct dentry *parent = de->d_parent;
361 const char *name = NULL;
363 struct md_op_data *op_data;
364 struct ptlrpc_request *req = NULL;
368 LASSERT(parent != NULL);
369 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
371 /* if server supports open-by-fid, or file name is invalid, don't pack
372 * name in open request */
373 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
374 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
375 name = de->d_name.name;
376 len = de->d_name.len;
379 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
380 name, len, 0, LUSTRE_OPC_ANY, NULL);
382 RETURN(PTR_ERR(op_data));
383 op_data->op_data = lmm;
384 op_data->op_data_size = lmmsize;
386 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
387 &ll_md_blocking_ast, 0);
388 ll_finish_md_op_data(op_data);
390 /* reason for keep own exit path - don`t flood log
391 * with messages with -ESTALE errors.
/* Server opened but the open failed overall: drop the server-side handle. */
393 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
394 it_open_error(DISP_OPEN_OPEN, itp))
396 ll_release_openhandle(de, itp);
400 if (it_disposition(itp, DISP_LOOKUP_NEG))
401 GOTO(out, rc = -ENOENT);
403 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
404 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
405 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
409 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
410 if (!rc && itp->it_lock_mode)
411 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
414 ptlrpc_req_finished(req);
415 ll_intent_drop_lock(itp);
417 /* We did open by fid, but by the time we got to the server,
418 * the object disappeared. If this is a create, we cannot really
419 * tell the userspace that the file it was trying to create
420 * does not exist. Instead let's return -ESTALE, and the VFS will
421 * retry the create with LOOKUP_REVAL that we are going to catch
422 * in ll_revalidate_dentry() and use lookup then.
424 if (rc == -ENOENT && itp->it_op & IT_CREAT)
430 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
431 struct obd_client_handle *och)
433 struct mdt_body *body;
435 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
436 och->och_fh = body->mbo_handle;
437 och->och_fid = body->mbo_fid1;
438 och->och_lease_handle.cookie = it->it_lock_handle;
439 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
440 och->och_flags = it->it_flags;
442 return md_set_open_replay_data(md_exp, och, it);
445 static int ll_local_open(struct file *file, struct lookup_intent *it,
446 struct ll_file_data *fd, struct obd_client_handle *och)
448 struct inode *inode = file_inode(file);
451 LASSERT(!LUSTRE_FPRIVATE(file));
458 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
463 LUSTRE_FPRIVATE(file) = fd;
464 ll_readahead_init(inode, &fd->fd_ras);
465 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
467 /* ll_cl_context initialize */
468 rwlock_init(&fd->fd_lock);
469 INIT_LIST_HEAD(&fd->fd_lccs);
474 /* Open a file, and (for the very first open) create objects on the OSTs at
475 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
476 * creation or open until ll_lov_setstripe() ioctl is called.
478 * If we already have the stripe MD locally then we don't request it in
479 * md_open(), by passing a lmm_size = 0.
481 * It is up to the application to ensure no other processes open this file
482 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
483 * used. We might be able to avoid races of that sort by getting lli_open_sem
484 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
485 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/* VFS ->open() entry point. Reuses a cached MDS open handle when one
 * exists, otherwise sends an intent-OPEN (ll_intent_file_open) and caches
 * the resulting handle under lli_och_mutex.
 * NOTE(review): extraction dropped lines (braces, labels, restart loop,
 * rc declaration); tokens below are preserved as-is. */
487 int ll_file_open(struct inode *inode, struct file *file)
489 struct ll_inode_info *lli = ll_i2info(inode);
490 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
491 .it_flags = file->f_flags };
492 struct obd_client_handle **och_p = NULL;
493 __u64 *och_usecount = NULL;
494 struct ll_file_data *fd;
498 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
499 PFID(ll_inode2fid(inode)), inode, file->f_flags);
501 it = file->private_data; /* XXX: compat macro */
502 file->private_data = NULL; /* prevent ll_local_open assertion */
504 fd = ll_file_data_get();
506 GOTO(out_openerr, rc = -ENOMEM);
509 if (S_ISDIR(inode->i_mode))
510 ll_authorize_statahead(inode, fd);
/* Root dentry: no MDS open needed, just attach fd and return. */
512 if (inode->i_sb->s_root == file_dentry(file)) {
513 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent from lookup: build our own (oit) from f_flags. */
517 if (!it || !it->it_disposition) {
518 /* Convert f_flags into access mode. We cannot use file->f_mode,
519 * because everything but O_ACCMODE mask was stripped from
521 if ((oit.it_flags + 1) & O_ACCMODE)
523 if (file->f_flags & O_TRUNC)
524 oit.it_flags |= FMODE_WRITE;
526 /* kernel only call f_op->open in dentry_open. filp_open calls
527 * dentry_open after call to open_namei that checks permissions.
528 * Only nfsd_open call dentry_open directly without checking
529 * permissions and because of that this code below is safe. */
530 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
531 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
533 /* We do not want O_EXCL here, presumably we opened the file
534 * already? XXX - NFS implications? */
535 oit.it_flags &= ~O_EXCL;
537 /* bug20584, if "it_flags" contains O_CREAT, the file will be
538 * created if necessary, then "IT_CREAT" should be set to keep
539 * consistent with it */
540 if (oit.it_flags & O_CREAT)
541 oit.it_op |= IT_CREAT;
547 /* Let's see if we have file open on MDS already. */
548 if (it->it_flags & FMODE_WRITE) {
549 och_p = &lli->lli_mds_write_och;
550 och_usecount = &lli->lli_open_fd_write_count;
551 } else if (it->it_flags & FMODE_EXEC) {
552 och_p = &lli->lli_mds_exec_och;
553 och_usecount = &lli->lli_open_fd_exec_count;
555 och_p = &lli->lli_mds_read_och;
556 och_usecount = &lli->lli_open_fd_read_count;
559 mutex_lock(&lli->lli_och_mutex);
560 if (*och_p) { /* Open handle is present */
561 if (it_disposition(it, DISP_OPEN_OPEN)) {
562 /* Well, there's extra open request that we do not need,
563 let's close it somehow. This will decref request. */
564 rc = it_open_error(DISP_OPEN_OPEN, it);
566 mutex_unlock(&lli->lli_och_mutex);
567 GOTO(out_openerr, rc);
570 ll_release_openhandle(file_dentry(file), it);
/* Reuse cached handle: och == NULL, only local state is set up. */
574 rc = ll_local_open(file, it, fd, NULL);
577 mutex_unlock(&lli->lli_och_mutex);
578 GOTO(out_openerr, rc);
581 LASSERT(*och_usecount == 0);
582 if (!it->it_disposition) {
583 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
584 /* We cannot just request lock handle now, new ELC code
585 means that one of other OPEN locks for this file
586 could be cancelled, and since blocking ast handler
587 would attempt to grab och_mutex as well, that would
588 result in a deadlock */
589 mutex_unlock(&lli->lli_och_mutex);
591 * Normally called under two situations:
593 * 2. A race/condition on MDS resulting in no open
594 * handle to be returned from LOOKUP|OPEN request,
595 * for example if the target entry was a symlink.
597 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
598 * marked by a bit set in ll_iget_for_nfs. Clear the
599 * bit so that it's not confusing later callers.
601 * NB; when ldd is NULL, it must have come via normal
602 * lookup path only, since ll_iget_for_nfs always calls
605 if (ldd && ldd->lld_nfs_dentry) {
606 ldd->lld_nfs_dentry = 0;
607 it->it_flags |= MDS_OPEN_LOCK;
611 * Always specify MDS_OPEN_BY_FID because we don't want
612 * to get file with different fid.
614 it->it_flags |= MDS_OPEN_BY_FID;
615 rc = ll_intent_file_open(file, NULL, 0, it);
617 GOTO(out_openerr, rc);
621 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
623 GOTO(out_och_free, rc = -ENOMEM);
627 /* md_intent_lock() didn't get a request ref if there was an
628 * open error, so don't do cleanup on the request here
630 /* XXX (green): Should not we bail out on any error here, not
631 * just open error? */
632 rc = it_open_error(DISP_OPEN_OPEN, it);
634 GOTO(out_och_free, rc);
636 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
637 "inode %p: disposition %x, status %d\n", inode,
638 it_disposition(it, ~0), it->it_status);
640 rc = ll_local_open(file, it, fd, *och_p);
642 GOTO(out_och_free, rc);
644 mutex_unlock(&lli->lli_och_mutex);
647 /* Must do this outside lli_och_mutex lock to prevent deadlock where
648 different kind of OPEN lock for this same inode gets cancelled
649 by ldlm_cancel_lru */
650 if (!S_ISREG(inode->i_mode))
651 GOTO(out_och_free, rc);
653 cl_lov_delay_create_clear(&file->f_flags);
654 GOTO(out_och_free, rc);
/* Error path: free the half-installed handle; OBD_FREE poisons memory. */
658 if (och_p && *och_p) {
659 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
660 *och_p = NULL; /* OBD_FREE writes some magic there */
663 mutex_unlock(&lli->lli_och_mutex);
666 if (lli->lli_opendir_key == fd)
667 ll_deauthorize_statahead(inode, fd);
669 ll_file_data_put(fd);
671 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
674 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
675 ptlrpc_req_finished(it->it_request);
676 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
682 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
683 struct ldlm_lock_desc *desc, void *data, int flag)
686 struct lustre_handle lockh;
690 case LDLM_CB_BLOCKING:
691 ldlm_lock2handle(lock, &lockh);
692 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
694 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
698 case LDLM_CB_CANCELING:
/* NOTE(review): extraction dropped lines here (och_usecount/rc
 * declarations, the transfer of *och_p into fd->fd_och, labels, braces);
 * tokens below are preserved as-is. */
706 * When setting a lease on a file, we take ownership of the lli_mds_*_och
707 * and save it as fd->fd_och so as to force client to reopen the file even
708 * if it has an open lock in cache already.
710 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
711 struct lustre_handle *old_handle)
713 struct ll_inode_info *lli = ll_i2info(inode);
714 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
715 struct obd_client_handle **och_p;
720 /* Get the openhandle of the file */
721 mutex_lock(&lli->lli_och_mutex);
/* Only one lease per struct file. */
722 if (fd->fd_lease_och != NULL)
723 GOTO(out_unlock, rc = -EBUSY);
725 if (fd->fd_och == NULL) {
726 if (file->f_mode & FMODE_WRITE) {
727 LASSERT(lli->lli_mds_write_och != NULL);
728 och_p = &lli->lli_mds_write_och;
729 och_usecount = &lli->lli_open_fd_write_count;
731 LASSERT(lli->lli_mds_read_och != NULL);
732 och_p = &lli->lli_mds_read_och;
733 och_usecount = &lli->lli_open_fd_read_count;
/* Another fd still shares this handle; cannot take ownership. */
736 if (*och_usecount > 1)
737 GOTO(out_unlock, rc = -EBUSY);
744 *old_handle = fd->fd_och->och_fh;
748 mutex_unlock(&lli->lli_och_mutex);
/* NOTE(review): extraction dropped lines here (och_usecount/rc
 * declarations, the branch that puts fd->fd_och back into *och_p, braces);
 * tokens below are preserved as-is. */
753 * Release ownership on lli_mds_*_och when putting back a file lease.
755 static int ll_lease_och_release(struct inode *inode, struct file *file)
757 struct ll_inode_info *lli = ll_i2info(inode);
758 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
759 struct obd_client_handle **och_p;
760 struct obd_client_handle *old_och = NULL;
765 mutex_lock(&lli->lli_och_mutex);
766 if (file->f_mode & FMODE_WRITE) {
767 och_p = &lli->lli_mds_write_och;
768 och_usecount = &lli->lli_open_fd_write_count;
770 och_p = &lli->lli_mds_read_och;
771 och_usecount = &lli->lli_open_fd_read_count;
774 /* The file may have been open by another process (broken lease) so
775 * *och_p is not NULL. In this case we should simply increase usecount
778 if (*och_p != NULL) {
779 old_och = fd->fd_och;
786 mutex_unlock(&lli->lli_och_mutex);
/* The superseded handle (if any) is closed outside the mutex. */
789 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
/* NOTE(review): extraction dropped lines here (open_flags parameter tail,
 * rc/rc2 declarations, och allocation, labels, RETURN statements, braces);
 * tokens below are preserved as-is. */
795 * Acquire a lease and open the file.
797 static struct obd_client_handle *
798 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
801 struct lookup_intent it = { .it_op = IT_OPEN };
802 struct ll_sb_info *sbi = ll_i2sbi(inode);
803 struct md_op_data *op_data;
804 struct ptlrpc_request *req = NULL;
805 struct lustre_handle old_handle = { 0 };
806 struct obd_client_handle *och = NULL;
/* Lease mode must be exactly read or exactly write. */
811 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
812 RETURN(ERR_PTR(-EINVAL));
/* The file must already be open in a compatible (non-exec) mode. */
815 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
816 RETURN(ERR_PTR(-EPERM));
818 rc = ll_lease_och_acquire(inode, file, &old_handle);
825 RETURN(ERR_PTR(-ENOMEM));
827 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
828 LUSTRE_OPC_ANY, NULL);
830 GOTO(out, rc = PTR_ERR(op_data));
832 /* To tell the MDT this openhandle is from the same owner */
833 op_data->op_handle = old_handle;
835 it.it_flags = fmode | open_flags;
836 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
837 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
838 &ll_md_blocking_lease_ast,
839 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
840 * it can be cancelled which may mislead applications that the lease is
842 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
843 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
844 * doesn't deal with openhandle, so normal openhandle will be leaked. */
845 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
846 ll_finish_md_op_data(op_data);
847 ptlrpc_req_finished(req);
849 GOTO(out_release_it, rc);
851 if (it_disposition(&it, DISP_LOOKUP_NEG))
852 GOTO(out_release_it, rc = -ENOENT);
854 rc = it_open_error(DISP_OPEN_OPEN, &it);
856 GOTO(out_release_it, rc);
858 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
859 ll_och_fill(sbi->ll_md_exp, &it, och);
/* Server did not grant a lease: close the handle and report no support. */
861 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
862 GOTO(out_close, rc = -EOPNOTSUPP);
864 /* already get lease, handle lease lock */
865 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
866 if (it.it_lock_mode == 0 ||
867 it.it_lock_bits != MDS_INODELOCK_OPEN) {
868 /* open lock must return for lease */
869 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
870 PFID(ll_inode2fid(inode)), it.it_lock_mode,
872 GOTO(out_close, rc = -EPROTO);
875 ll_intent_release(&it);
879 /* Cancel open lock */
880 if (it.it_lock_mode != 0) {
881 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
884 och->och_lease_handle.cookie = 0ULL;
886 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
888 CERROR("%s: error closing file "DFID": %d\n",
889 ll_get_fsname(inode->i_sb, NULL, 0),
890 PFID(&ll_i2info(inode)->lli_fid), rc2);
891 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
893 ll_intent_release(&it);
901 * Check whether a layout swap can be done between two inodes.
903 * \param[in] inode1 First inode to check
904 * \param[in] inode2 Second inode to check
906 * \retval 0 on success, layout swap can be performed between both inodes
907 * \retval negative error code if requirements are not met
909 static int ll_check_swap_layouts_validity(struct inode *inode1,
910 struct inode *inode2)
912 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
915 if (inode_permission(inode1, MAY_WRITE) ||
916 inode_permission(inode2, MAY_WRITE))
919 if (inode1->i_sb != inode2->i_sb)
/* Close \a och with a MDS_CLOSE_LAYOUT_SWAP bias so the MDT atomically
 * swaps the layouts of \a inode and \a inode2 at close time.
 * NOTE(review): extraction dropped lines (rc declaration, the inode2
 * argument of the biased close, out_free_och label, return); tokens below
 * are preserved as-is. */
925 static int ll_swap_layouts_close(struct obd_client_handle *och,
926 struct inode *inode, struct inode *inode2)
928 const struct lu_fid *fid1 = ll_inode2fid(inode);
929 const struct lu_fid *fid2;
933 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
934 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
936 rc = ll_check_swap_layouts_validity(inode, inode2);
938 GOTO(out_free_och, rc);
940 /* We now know that inode2 is a lustre inode */
941 fid2 = ll_inode2fid(inode2);
/* Swapping a file with itself is rejected. */
943 rc = lu_fid_cmp(fid1, fid2);
945 GOTO(out_free_och, rc = -EINVAL);
947 /* Close the file and swap layouts between inode & inode2.
948 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
949 * because we still need it to pack l_remote_handle to MDT. */
950 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
953 och = NULL; /* freed in ll_close_inode_openhandle() */
/* NOTE(review): extraction dropped lines (lease_broken parameter tail,
 * LDLM_LOCK_PUT, braces, return); tokens below are preserved as-is. */
963 * Release lease and close the file.
964 * It will check if the lease has ever broken.
966 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
969 struct ldlm_lock *lock;
/* Default to "broken" when the lock cannot be found any more. */
970 bool cancelled = true;
974 lock = ldlm_handle2lock(&och->och_lease_handle);
976 lock_res_and_lock(lock);
977 cancelled = ldlm_is_cancel(lock);
978 unlock_res_and_lock(lock);
982 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
983 PFID(&ll_i2info(inode)->lli_fid), cancelled);
/* Not yet cancelled: cancel the lease lock ourselves before closing. */
986 ldlm_cli_cancel(&och->och_lease_handle, 0);
988 if (lease_broken != NULL)
989 *lease_broken = cancelled;
991 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/* Merge MDS-provided inode attributes with OST-side attributes (size,
 * blocks, timestamps) under the inode size lock.
 * NOTE(review): extraction dropped lines (rc/atime/mtime/ctime
 * declarations, early-exit for special files, label, return); tokens below
 * are preserved as-is. */
995 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
997 struct ll_inode_info *lli = ll_i2info(inode);
998 struct cl_object *obj = lli->lli_clob;
999 struct cl_attr *attr = vvp_env_thread_attr(env);
1007 ll_inode_size_lock(inode);
1009 /* Merge timestamps the most recently obtained from MDS with
1010 * timestamps obtained from OSTs.
1012 * Do not overwrite atime of inode because it may be refreshed
1013 * by file_accessed() function. If the read was served by cache
1014 * data, there is no RPC to be sent so that atime may not be
1015 * transferred to OSTs at all. MDT only updates atime at close time
1016 * if it's at least 'mdd.*.atime_diff' older.
1017 * All in all, the atime in Lustre does not strictly comply with
1018 * POSIX. Solving this problem needs to send an RPC to MDT for each
1019 * read, this will hurt performance. */
1020 if (LTIME_S(inode->i_atime) < lli->lli_atime)
1021 LTIME_S(inode->i_atime) = lli->lli_atime;
1022 LTIME_S(inode->i_mtime) = lli->lli_mtime;
1023 LTIME_S(inode->i_ctime) = lli->lli_ctime;
1025 atime = LTIME_S(inode->i_atime);
1026 mtime = LTIME_S(inode->i_mtime);
1027 ctime = LTIME_S(inode->i_ctime);
1029 cl_object_attr_lock(obj);
1030 rc = cl_object_attr_get(env, obj, attr);
1031 cl_object_attr_unlock(obj);
1034 GOTO(out_size_unlock, rc);
/* Keep the newer of client- and OST-side timestamps. */
1036 if (atime < attr->cat_atime)
1037 atime = attr->cat_atime;
1039 if (ctime < attr->cat_ctime)
1040 ctime = attr->cat_ctime;
1042 if (mtime < attr->cat_mtime)
1043 mtime = attr->cat_mtime;
1045 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1046 PFID(&lli->lli_fid), attr->cat_size);
1048 i_size_write(inode, attr->cat_size);
1049 inode->i_blocks = attr->cat_blocks;
1051 LTIME_S(inode->i_atime) = atime;
1052 LTIME_S(inode->i_mtime) = mtime;
1053 LTIME_S(inode->i_ctime) = ctime;
1056 ll_inode_size_unlock(inode);
1061 static bool file_is_noatime(const struct file *file)
1063 const struct vfsmount *mnt = file->f_path.mnt;
1064 const struct inode *inode = file_inode((struct file *)file);
1066 /* Adapted from file_accessed() and touch_atime().*/
1067 if (file->f_flags & O_NOATIME)
1070 if (inode->i_flags & S_NOATIME)
1073 if (IS_NOATIME(inode))
1076 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1079 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1082 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1088 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
1090 struct inode *inode = file_inode((struct file *)file);
1092 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1094 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1095 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1096 file->f_flags & O_DIRECT ||
1099 io->ci_obj = ll_i2info(inode)->lli_clob;
1100 io->ci_lockreq = CILR_MAYBE;
1101 if (ll_file_nolock(file)) {
1102 io->ci_lockreq = CILR_NEVER;
1103 io->ci_no_srvlock = 1;
1104 } else if (file->f_flags & O_APPEND) {
1105 io->ci_lockreq = CILR_MANDATORY;
1108 io->ci_noatime = file_is_noatime(file);
/* Common driver for read/write/splice IO: set up a cl_io, take the range
 * lock where required, run cl_io_loop(), account partial progress and
 * restart short IOs, then tally stats and fd_write_failed.
 * NOTE(review): extraction dropped lines (return type line, io/rc/result
 * declarations, case labels, restart goto, braces); tokens below are
 * preserved as-is. */
1112 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1113 struct file *file, enum cl_io_type iot,
1114 loff_t *ppos, size_t count)
1116 struct vvp_io *vio = vvp_env_io(env);
1117 struct inode *inode = file_inode(file);
1118 struct ll_inode_info *lli = ll_i2info(inode);
1119 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1123 struct range_lock range;
1127 CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: %llu, count: %zu\n",
1128 file_dentry(file)->d_name.name, iot, *ppos, count);
1131 io = vvp_env_thread_io(env);
1132 ll_io_init(io, file, iot == CIT_WRITE);
1134 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1135 bool range_locked = false;
/* O_APPEND writes lock to EOF since the final offset is unknown. */
1137 if (file->f_flags & O_APPEND)
1138 range_lock_init(&range, 0, LUSTRE_EOF);
1140 range_lock_init(&range, *ppos, *ppos + count - 1);
1142 vio->vui_fd = LUSTRE_FPRIVATE(file);
1143 vio->vui_io_subtype = args->via_io_subtype;
1145 switch (vio->vui_io_subtype) {
1147 vio->vui_iter = args->u.normal.via_iter;
1148 vio->vui_iocb = args->u.normal.via_iocb;
1149 /* Direct IO reads must also take range lock,
1150 * or multiple reads will try to work on the same pages
1151 * See LU-6227 for details. */
1152 if (((iot == CIT_WRITE) ||
1153 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1154 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1155 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1157 rc = range_lock(&lli->lli_write_tree, &range);
1161 range_locked = true;
1165 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1166 vio->u.splice.vui_flags = args->u.splice.via_flags;
1169 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1173 ll_cl_add(file, env, io, LCC_RW);
1174 rc = cl_io_loop(env, io);
1175 ll_cl_remove(file, env);
1178 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1180 range_unlock(&lli->lli_write_tree, &range);
1183 /* cl_io_rw_init() handled IO */
/* Account whatever progress was made even on error. */
1187 if (io->ci_nob > 0) {
1188 result += io->ci_nob;
1189 count -= io->ci_nob;
1190 *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1192 /* prepare IO restart */
1193 if (count > 0 && args->via_io_subtype == IO_NORMAL)
1194 args->u.normal.via_iter = vio->vui_iter;
1198 cl_io_fini(env, io);
/* Layout change etc. may require re-running the IO for the remainder. */
1200 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1202 "%s: restart %s from %lld, count:%zu, result: %zd\n",
1203 file_dentry(file)->d_name.name,
1204 iot == CIT_READ ? "read" : "write",
1205 *ppos, count, result);
1209 if (iot == CIT_READ) {
1211 ll_stats_ops_tally(ll_i2sbi(inode),
1212 LPROC_LL_READ_BYTES, result);
1213 } else if (iot == CIT_WRITE) {
1215 ll_stats_ops_tally(ll_i2sbi(inode),
1216 LPROC_LL_WRITE_BYTES, result);
1217 fd->fd_write_failed = false;
1218 } else if (result == 0 && rc == 0) {
1221 fd->fd_write_failed = true;
1223 fd->fd_write_failed = false;
1224 } else if (rc != -ERESTARTSYS) {
1225 fd->fd_write_failed = true;
1229 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1231 return result > 0 ? result : rc;
1235 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1236 * especially for small I/O.
1238 * To serve a read request, CLIO has to create and initialize a cl_io and
1239 * then request DLM lock. This has turned out to have siginificant overhead
1240 * and affects the performance of small I/O dramatically.
1242 * It's not necessary to create a cl_io for each I/O. Under the help of read
1243 * ahead, most of the pages being read are already in memory cache and we can
1244 * read those pages directly because if the pages exist, the corresponding DLM
1245 * lock must exist so that page content must be valid.
1247 * In fast read implementation, the llite speculatively finds and reads pages
1248 * in memory cache. There are three scenarios for fast read:
1249 * - If the page exists and is uptodate, kernel VM will provide the data and
1250 * CLIO won't be intervened;
1251 * - If the page was brought into memory by read ahead, it will be exported
1252 * and read ahead parameters will be updated;
1253 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1254 * it will go back and invoke normal read, i.e., a cl_io will be created
1255 * and DLM lock will be requested.
1257 * POSIX compliance: posix standard states that read is intended to be atomic.
1258 * Lustre read implementation is in line with Linux kernel read implementation
1259 * and neither of them complies with POSIX standard in this matter. Fast read
1260 * doesn't make the situation worse on single node but it may interleave write
1261 * results from multiple nodes due to short read handling in ll_file_aio_read().
1263 * \param env - lu_env
1264 * \param iocb - kiocb from kernel
1265 * \param iter - user space buffers where the data will be copied
1267 * \retval - number of bytes have been read, or error code if error occurred.
1270 ll_do_fast_read(const struct lu_env *env, struct kiocb *iocb,
1271 struct iov_iter *iter)
1275 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1278 /* NB: we can't do direct IO for fast read because it will need a lock
1279 * to make IO engine happy. */
1280 if (iocb->ki_filp->f_flags & O_DIRECT)
1283 ll_cl_add(iocb->ki_filp, env, NULL, LCC_RW);
1284 result = generic_file_read_iter(iocb, iter);
1285 ll_cl_remove(iocb->ki_filp, env);
1287 /* If the first page is not in cache, generic_file_aio_read() will be
1288 * returned with -ENODATA.
1289 * See corresponding code in ll_readpage(). */
1290 if (result == -ENODATA)
1294 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1295 LPROC_LL_READ_BYTES, result);
1301 * Read from a file (through the page cache).
/*
 * read_iter file operation: try the fast (page-cache-only) read first,
 * then fall back to the full CLIO path for whatever remains in 'to'.
 * NOTE(review): excerpt is incomplete — declarations of result/rc2/env/
 * refcheck and the combining of result with rc2 are not visible here.
 */
1303 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1306 struct vvp_io_args *args;
1311 env = cl_env_get(&refcheck);
1313 return PTR_ERR(env);
/* fast path; stop if it failed or fully consumed the iterator */
1315 result = ll_do_fast_read(env, iocb, to);
1316 if (result < 0 || iov_iter_count(to) == 0)
/* slow path: hand the remaining bytes to the generic CLIO engine */
1319 args = ll_env_args(env, IO_NORMAL);
1320 args->u.normal.via_iter = to;
1321 args->u.normal.via_iocb = iocb;
1323 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1324 &iocb->ki_pos, iov_iter_count(to));
1327 else if (result == 0)
1331 cl_env_put(env, &refcheck);
1336 * Write to a file (through the page cache).
/*
 * write_iter file operation: package the iterator/iocb into vvp_io_args
 * and run the write through the generic CLIO path. There is no fast path
 * for writes.
 */
1338 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1340 struct vvp_io_args *args;
1345 env = cl_env_get(&refcheck);
1347 return PTR_ERR(env);
1349 args = ll_env_args(env, IO_NORMAL);
1350 args->u.normal.via_iter = from;
1351 args->u.normal.via_iocb = iocb;
1353 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1354 &iocb->ki_pos, iov_iter_count(from));
1355 cl_env_put(env, &refcheck);
1359 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1361 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array and compute the total byte count, trimming
 * *nr_segs at the first inaccessible segment (copy of the kernel's
 * __generic_file_aio_write_nolock helper, per the comment above).
 * NOTE(review): excerpt is incomplete — the 'cnt' declaration, the
 * accumulation of iv->iov_len, and the final *count store are not
 * visible here.
 */
1363 static int ll_file_get_iov_count(const struct iovec *iov,
1364 unsigned long *nr_segs, size_t *count)
1369 for (seg = 0; seg < *nr_segs; seg++) {
1370 const struct iovec *iv = &iov[seg];
1373 * If any segment has a negative length, or the cumulative
1374 * length ever wraps negative then return -EINVAL.
1377 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1379 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1384 cnt -= iv->iov_len; /* This segment is no good */
/*
 * Legacy aio_read entry point for kernels without read_iter: wrap the
 * iovec array in an iov_iter and delegate to ll_file_read_iter().
 */
1391 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1392 unsigned long nr_segs, loff_t pos)
1399 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
/* iov_iter_init() signature changed across kernels; pick by configure */
1403 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1404 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1405 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1406 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1407 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1409 result = ll_file_read_iter(iocb, &to);
/*
 * Legacy synchronous read entry point: build a single-segment iovec and
 * a sync kiocb, delegate to ll_file_aio_read(), then propagate the new
 * file position back through *ppos.
 */
1414 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1417 struct iovec iov = { .iov_base = buf, .iov_len = count };
1418 struct kiocb *kiocb;
1422 OBD_ALLOC_PTR(kiocb);
1426 init_sync_kiocb(kiocb, file);
1427 kiocb->ki_pos = *ppos;
/* the field holding the residual byte count is kernel-version dependent */
1428 #ifdef HAVE_KIOCB_KI_LEFT
1429 kiocb->ki_left = count;
1430 #elif defined(HAVE_KI_NBYTES)
1431 kiocb->ki_nbytes = count;
1434 result = ll_file_aio_read(kiocb, &iov, 1, kiocb->ki_pos);
1435 *ppos = kiocb->ki_pos;
1437 OBD_FREE_PTR(kiocb);
1442 * Write to a file (through the page cache).
/*
 * Legacy aio_write entry point for kernels without write_iter: wrap the
 * iovec array in an iov_iter and delegate to ll_file_write_iter().
 */
1445 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1446 unsigned long nr_segs, loff_t pos)
1448 struct iov_iter from;
1453 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
/* iov_iter_init() signature changed across kernels; pick by configure */
1457 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1458 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1459 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1460 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1461 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1463 result = ll_file_write_iter(iocb, &from);
/*
 * Legacy synchronous write entry point: build a single-segment iovec and
 * a sync kiocb (borrowed from the per-thread lu_env info rather than
 * allocated, unlike ll_file_read), delegate to ll_file_aio_write(), and
 * propagate the new file position back through *ppos.
 */
1468 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1469 size_t count, loff_t *ppos)
1472 struct iovec iov = { .iov_base = (void __user *)buf,
1474 struct kiocb *kiocb;
1479 env = cl_env_get(&refcheck);
1481 RETURN(PTR_ERR(env));
1483 kiocb = &ll_env_info(env)->lti_kiocb;
1484 init_sync_kiocb(kiocb, file);
1485 kiocb->ki_pos = *ppos;
/* the field holding the residual byte count is kernel-version dependent */
1486 #ifdef HAVE_KIOCB_KI_LEFT
1487 kiocb->ki_left = count;
1488 #elif defined(HAVE_KI_NBYTES)
1489 kiocb->ki_nbytes = count;
1492 result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos);
1493 *ppos = kiocb->ki_pos;
1495 cl_env_put(env, &refcheck);
1498 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1501 * Send file content (through pagecache) somewhere with helper
/*
 * splice_read file operation: run the read through the generic CLIO
 * path with IO_SPLICE args so the data lands in the pipe.
 */
1503 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1504 struct pipe_inode_info *pipe, size_t count,
1508 struct vvp_io_args *args;
1513 env = cl_env_get(&refcheck);
1515 RETURN(PTR_ERR(env));
1517 args = ll_env_args(env, IO_SPLICE);
1518 args->u.splice.via_pipe = pipe;
1519 args->u.splice.via_flags = flags;
1521 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1522 cl_env_put(env, &refcheck);
/*
 * Apply striping EA to a file by re-opening it by FID with the supplied
 * lov_user_md, under the inode size lock. The open handle obtained for
 * the intent is released immediately; only the layout change matters.
 * NOTE(review): excerpt is incomplete — intent initialization fields and
 * the out_unlock label body are not fully visible here.
 */
1526 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1527 __u64 flags, struct lov_user_md *lum,
1530 struct lookup_intent oit = {
1532 .it_flags = flags | MDS_OPEN_BY_FID,
1537 ll_inode_size_lock(inode);
1538 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1540 GOTO(out_unlock, rc);
/* close the MDS open handle created by the intent open above */
1542 ll_release_openhandle(file_dentry(file), &oit);
1545 ll_inode_size_unlock(inode);
1546 ll_intent_release(&oit);
/* clear O_LOV_DELAY_CREATE now that the layout has been set */
1547 cl_lov_delay_create_clear(&file->f_flags);
/*
 * Fetch the LOV EA (striping info) of a child 'filename' of 'inode' from
 * the MDS via md_getattr_name(), validate the magic, and byte-swap the
 * lov_user_md to host endianness when needed before returning it to the
 * caller. The ptlrpc request holding 'lmm' is returned via *request so
 * the caller controls its lifetime.
 * NOTE(review): excerpt is incomplete — rc/lmmsize declarations, some
 * error paths and the *lmmp store are not visible here.
 */
1552 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1553 struct lov_mds_md **lmmp, int *lmm_size,
1554 struct ptlrpc_request **request)
1556 struct ll_sb_info *sbi = ll_i2sbi(inode);
1557 struct mdt_body *body;
1558 struct lov_mds_md *lmm = NULL;
1559 struct ptlrpc_request *req = NULL;
1560 struct md_op_data *op_data;
1563 rc = ll_get_default_mdsize(sbi, &lmmsize);
1567 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1568 strlen(filename), lmmsize,
1569 LUSTRE_OPC_ANY, NULL);
1570 if (IS_ERR(op_data))
1571 RETURN(PTR_ERR(op_data));
1573 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1574 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1575 ll_finish_md_op_data(op_data);
1577 CDEBUG(D_INFO, "md_getattr_name failed "
1578 "on %s: rc %d\n", filename, rc);
1582 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1583 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1585 lmmsize = body->mbo_eadatasize;
/* no EA present (or zero-sized) means the file has no striping info */
1587 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1589 GOTO(out, rc = -ENODATA);
1592 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1593 LASSERT(lmm != NULL);
1595 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1596 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1597 GOTO(out, rc = -EPROTO);
1601 * This is coming from the MDS, so is probably in
1602 * little endian. We convert it to host endian before
1603 * passing it to userspace.
1605 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1608 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1609 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1612 /* if this function was called for a directory, avoid swabbing
1613 * lsm objects that do not exist */
1614 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1615 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1616 if (S_ISREG(body->mbo_mode))
1617 lustre_swab_lov_user_md_objects(
1618 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1620 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1621 lustre_swab_lov_user_md_v3(
1622 (struct lov_user_md_v3 *)lmm);
1623 if (S_ISREG(body->mbo_mode))
1624 lustre_swab_lov_user_md_objects(
1625 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1632 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: copy a lov_user_md (with one OST object
 * entry) from userspace and apply it via ll_lov_setstripe_ea_info().
 * Restricted to CFS_CAP_SYS_ADMIN since it sets pre-existing objects.
 */
1637 static int ll_lov_setea(struct inode *inode, struct file *file,
1640 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1641 struct lov_user_md *lump;
1642 int lum_size = sizeof(struct lov_user_md) +
1643 sizeof(struct lov_user_ost_data);
1647 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1650 OBD_ALLOC_LARGE(lump, lum_size);
1654 if (copy_from_user(lump, (struct lov_user_md __user *)arg, lum_size))
1655 GOTO(out_lump, rc = -EFAULT);
1657 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1660 OBD_FREE_LARGE(lump, lum_size);
/*
 * Copy the file's striping information to the userspace lov_user_md
 * buffer via the cl_object layer.
 */
1664 static int ll_file_getstripe(struct inode *inode,
1665 struct lov_user_md __user *lum)
1672 env = cl_env_get(&refcheck);
1674 RETURN(PTR_ERR(env));
1676 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum);
1677 cl_env_put(env, &refcheck);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy the user's lov_user_md into a
 * kernel buffer, apply it, then refresh the layout generation and copy
 * the resulting striping back to the caller's buffer.
 * NOTE(review): excerpt is incomplete — lum_size/gen declarations and
 * the success check before put_user are not visible here.
 */
1681 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1684 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1685 struct lov_user_md *klum;
1687 __u64 flags = FMODE_WRITE;
1690 rc = ll_copy_user_md(lum, &klum);
1695 rc = ll_lov_setstripe_ea_info(inode, file, flags, klum, lum_size);
/* zero the user's stripe count so getstripe below fills actual values */
1699 put_user(0, &lum->lmm_stripe_count);
1701 ll_layout_refresh(inode, &gen);
1702 rc = ll_file_getstripe(inode, (struct lov_user_md __user *)arg);
1705 OBD_FREE(klum, lum_size);
/*
 * LL_IOC_GROUP_LOCK handler: take a group lock with group id 'arg' on
 * behalf of this file descriptor. The grouplock is acquired outside
 * lli_lock (cl_get_grouplock may block), so after acquisition we
 * re-check under the lock whether another thread raced us.
 */
1710 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1712 struct ll_inode_info *lli = ll_i2info(inode);
1713 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1714 struct ll_grouplock grouplock;
/* gid 0 is reserved / invalid for group locks */
1719 CWARN("group id for group lock must not be 0\n");
1723 if (ll_file_nolock(file))
1724 RETURN(-EOPNOTSUPP);
1726 spin_lock(&lli->lli_lock);
1727 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1728 CWARN("group lock already existed with gid %lu\n",
1729 fd->fd_grouplock.lg_gid);
1730 spin_unlock(&lli->lli_lock);
1733 LASSERT(fd->fd_grouplock.lg_lock == NULL);
1734 spin_unlock(&lli->lli_lock);
/* may block (unless O_NONBLOCK), hence done without lli_lock held */
1736 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
1737 arg, (file->f_flags & O_NONBLOCK), &grouplock);
1741 spin_lock(&lli->lli_lock);
1742 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1743 spin_unlock(&lli->lli_lock);
1744 CERROR("another thread just won the race\n");
1745 cl_put_grouplock(&grouplock);
1749 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1750 fd->fd_grouplock = grouplock;
1751 spin_unlock(&lli->lli_lock);
1753 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK handler: release the group lock held by this file
 * descriptor, verifying that one is held and that the caller's group id
 * matches. The grouplock struct is copied out and cleared under
 * lli_lock; the actual release happens after dropping the spinlock.
 */
1757 static int ll_put_grouplock(struct inode *inode, struct file *file,
1760 struct ll_inode_info *lli = ll_i2info(inode);
1761 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1762 struct ll_grouplock grouplock;
1765 spin_lock(&lli->lli_lock);
1766 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1767 spin_unlock(&lli->lli_lock);
1768 CWARN("no group lock held\n");
1772 LASSERT(fd->fd_grouplock.lg_lock != NULL);
1774 if (fd->fd_grouplock.lg_gid != arg) {
1775 CWARN("group lock %lu doesn't match current id %lu\n",
1776 arg, fd->fd_grouplock.lg_gid);
1777 spin_unlock(&lli->lli_lock);
/* detach the grouplock from fd state before releasing it */
1781 grouplock = fd->fd_grouplock;
1782 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1783 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1784 spin_unlock(&lli->lli_lock);
/* cl_put_grouplock() may block, so it is called after the unlock */
1786 cl_put_grouplock(&grouplock);
1787 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1792 * Close inode open handle
1794 * \param dentry [in] dentry which contains the inode
1795 * \param it [in,out] intent which contains open info and result
1798 * \retval <0 failure
/*
 * Close the MDS open handle carried by an open intent (see the doc
 * comment above): allocate an obd_client_handle, fill it from the
 * intent, and close it. Also drops the DISP_ENQ_OPEN_REF request
 * reference taken in place of a real ll_file_open().
 */
1800 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1802 struct inode *inode = dentry->d_inode;
1803 struct obd_client_handle *och;
1809 /* Root ? Do nothing. */
1810 if (dentry->d_inode->i_sb->s_root == dentry)
1813 /* No open handle to close? Move away */
1814 if (!it_disposition(it, DISP_OPEN_OPEN))
1817 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1819 OBD_ALLOC(och, sizeof(*och));
1821 GOTO(out, rc = -ENOMEM);
1823 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1825 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
1827 /* this one is in place of ll_file_open */
1828 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1829 ptlrpc_req_finished(it->it_request);
1830 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1836 * Get size for inode for which FIEMAP mapping is requested.
1837 * Make the FIEMAP get_info call and returns the result.
1838 * \param fiemap kernel buffer to hold extens
1839 * \param num_bytes kernel buffer size
/*
 * Execute a FIEMAP request (see doc comment above): validate flags,
 * optionally flush dirty data for FIEMAP_FLAG_SYNC, glimpse the size if
 * needed, then ask the cl_object layer to fill in the extent mapping.
 * NOTE(review): excerpt is incomplete — env/refcheck/rc declarations and
 * some early-return statements are not visible here.
 */
1841 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
1847 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
1850 /* Checks for fiemap flags */
1851 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* report back which flags we do not support */
1852 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1856 /* Check for FIEMAP_FLAG_SYNC */
1857 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1858 rc = filemap_fdatawrite(inode->i_mapping);
1863 env = cl_env_get(&refcheck);
1865 RETURN(PTR_ERR(env));
/* a zero cached size may just be stale; glimpse OSTs for the truth */
1867 if (i_size_read(inode) == 0) {
1868 rc = ll_glimpse_size(inode);
1873 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1874 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
1875 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
1877 /* If filesize is 0, then there would be no objects for mapping */
1878 if (fmkey.lfik_oa.o_size == 0) {
1879 fiemap->fm_mapped_extents = 0;
1883 fmkey.lfik_fiemap = *fiemap;
1885 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
1886 &fmkey, fiemap, &num_bytes);
1888 cl_env_put(env, &refcheck);
/*
 * OBD_IOC_FID2PATH handler: resolve this inode's FID to a path via the
 * MDC. Copies the getinfo_fid2path header from userspace, appends the
 * filesystem root FID (for fileset-aware servers), runs the ioctl and
 * copies the result, including the path buffer, back to userspace.
 */
1892 int ll_fid2path(struct inode *inode, void __user *arg)
1894 struct obd_export *exp = ll_i2mdexp(inode);
1895 const struct getinfo_fid2path __user *gfin = arg;
1897 struct getinfo_fid2path *gfout;
/* permitted for privileged users, or everyone if the user_fid2path
 * mount option set LL_SBI_USER_FID2PATH */
1903 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1904 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH)
1907 /* Only need to get the buflen */
1908 if (get_user(pathlen, &gfin->gf_pathlen))
1911 if (pathlen > PATH_MAX)
1914 outsize = sizeof(*gfout) + pathlen;
1915 OBD_ALLOC(gfout, outsize);
1919 if (copy_from_user(gfout, arg, sizeof(*gfout)))
1920 GOTO(gf_free, rc = -EFAULT);
1921 /* append root FID after gfout to let MDT know the root FID so that it
1922 * can lookup the correct path, this is mainly for fileset.
1923 * old server without fileset mount support will ignore this. */
1924 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
1926 /* Call mdc_iocontrol */
1927 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1931 if (copy_to_user(arg, gfout, outsize))
1935 OBD_FREE(gfout, outsize);
1940 * Read the data_version for inode.
1942 * This value is computed using stripe object version on OST.
1943 * Version is computed using server side locking.
1945 * @param flags if do sync on the OST side;
1947 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
1948 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/*
 * Compute the file's data version (see doc comment above) by running a
 * CIT_DATA_VERSION cl_io against the object; 'flags' selects how dirty
 * cache is handled on the OSTs (LL_DV_RD_FLUSH / LL_DV_WR_FLUSH).
 * NOTE(review): excerpt is incomplete — the restart loop implied by the
 * ci_need_restart check and the final RETURN are not visible here.
 */
1950 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
1952 struct cl_object *obj = ll_i2info(inode)->lli_clob;
1960 /* If no file object initialized, we consider its version is 0. */
1966 env = cl_env_get(&refcheck);
1968 RETURN(PTR_ERR(env));
1970 io = vvp_env_thread_io(env);
1972 io->u.ci_data_version.dv_data_version = 0;
1973 io->u.ci_data_version.dv_flags = flags;
1976 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
1977 result = cl_io_loop(env, io);
1979 result = io->ci_result;
1981 *data_version = io->u.ci_data_version.dv_data_version;
1983 cl_io_fini(env, io);
/* layout changed mid-IO; the IO must be retried */
1985 if (unlikely(io->ci_need_restart))
1988 cl_env_put(env, &refcheck);
1994 * Trigger a HSM release request for the provided inode.
/*
 * Trigger an HSM release for 'inode': take a write lease opened with
 * MDS_OPEN_RELEASE, flush and capture the latest data version and
 * [am]time, then close the handle with MDS_HSM_RELEASE so the MDT can
 * atomically release the file data.
 */
1996 int ll_hsm_release(struct inode *inode)
1999 struct obd_client_handle *och = NULL;
2000 __u64 data_version = 0;
2005 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2006 ll_get_fsname(inode->i_sb, NULL, 0),
2007 PFID(&ll_i2info(inode)->lli_fid));
2009 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2011 GOTO(out, rc = PTR_ERR(och));
2013 /* Grab latest data_version and [am]time values */
2014 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2018 env = cl_env_get(&refcheck);
2020 GOTO(out, rc = PTR_ERR(env));
2022 ll_merge_attr(env, inode);
2023 cl_env_put(env, &refcheck);
2025 /* Release the file.
2026 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2027 * we still need it to pack l_remote_handle to MDT. */
2028 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2034 if (och != NULL && !IS_ERR(och)) /* close the file */
2035 ll_lease_close(och, inode, NULL);
/* Working state for ll_swap_layouts(): the two inodes (kept in FID
 * order so locking is deterministic) plus, per the excerpt, data-version
 * check fields declared on the lines not visible here. */
2040 struct ll_swap_stack {
2043 struct inode *inode1;
2044 struct inode *inode2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS worker: atomically swap the layouts of the
 * two files on the MDT. Inodes are ordered by FID to serialize against
 * concurrent swaps; an optional group lock (gid) flushes dirty cache,
 * and optional data-version checks abort with -EAGAIN if either file
 * changed since the caller sampled its version.
 * NOTE(review): excerpt is incomplete — msl.msl_flags setup, gid/dv
 * declarations, and the 'free'/'putgl' label bodies are only partially
 * visible here.
 */
2049 static int ll_swap_layouts(struct file *file1, struct file *file2,
2050 struct lustre_swap_layouts *lsl)
2052 struct mdc_swap_layouts msl;
2053 struct md_op_data *op_data;
2056 struct ll_swap_stack *llss = NULL;
2059 OBD_ALLOC_PTR(llss);
2063 llss->inode1 = file_inode(file1);
2064 llss->inode2 = file_inode(file2);
2066 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2070 /* we use 2 bool because it is easier to swap than 2 bits */
2071 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2072 llss->check_dv1 = true;
2074 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2075 llss->check_dv2 = true;
2077 /* we cannot use lsl->sl_dvX directly because we may swap them */
2078 llss->dv1 = lsl->sl_dv1;
2079 llss->dv2 = lsl->sl_dv2;
2081 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2082 if (rc == 0) /* same file, done! */
2085 if (rc < 0) { /* sequentialize it */
2086 swap(llss->inode1, llss->inode2);
2088 swap(llss->dv1, llss->dv2);
2089 swap(llss->check_dv1, llss->check_dv2);
2093 if (gid != 0) { /* application asks to flush dirty cache */
2094 rc = ll_get_grouplock(llss->inode1, file1, gid);
2098 rc = ll_get_grouplock(llss->inode2, file2, gid);
2100 ll_put_grouplock(llss->inode1, file1, gid);
2105 /* ultimate check, before swapping the layouts we check if
2106 * dataversion has changed (if requested) */
2107 if (llss->check_dv1) {
2108 rc = ll_data_version(llss->inode1, &dv, 0);
2111 if (dv != llss->dv1)
2112 GOTO(putgl, rc = -EAGAIN);
2115 if (llss->check_dv2) {
2116 rc = ll_data_version(llss->inode2, &dv, 0);
2119 if (dv != llss->dv2)
2120 GOTO(putgl, rc = -EAGAIN);
2123 /* struct md_op_data is used to send the swap args to the mdt
2124 * only flags is missing, so we use struct mdc_swap_layouts
2125 * through the md_op_data->op_data */
2126 /* flags from user space have to be converted before they are send to
2127 * server, no flag is sent today, they are only used on the client */
2130 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2131 0, LUSTRE_OPC_ANY, &msl);
2132 if (IS_ERR(op_data))
2133 GOTO(free, rc = PTR_ERR(op_data));
2135 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2136 sizeof(*op_data), op_data, NULL);
2137 ll_finish_md_op_data(op_data);
/* release group locks in reverse acquisition order */
2144 ll_put_grouplock(llss->inode2, file2, gid);
2145 ll_put_grouplock(llss->inode1, file1, gid);
/*
 * Set/clear HSM flags on 'inode' via an MDC ioctl. Rejects masks
 * outside HSM_FLAGS_MASK, restricts flags outside HSM_USER_MASK to
 * CFS_CAP_SYS_ADMIN, and bounds the archive id.
 */
2155 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2157 struct md_op_data *op_data;
2161 /* Detect out-of range masks */
2162 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2165 /* Non-root users are forbidden to set or clear flags which are
2166 * NOT defined in HSM_USER_MASK. */
2167 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2168 !cfs_capable(CFS_CAP_SYS_ADMIN))
2171 /* Detect out-of range archive id */
2172 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2173 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2176 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2177 LUSTRE_OPC_ANY, hss);
2178 if (IS_ERR(op_data))
2179 RETURN(PTR_ERR(op_data));
2181 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2182 sizeof(*op_data), op_data, NULL);
2184 ll_finish_md_op_data(op_data);
/*
 * LL_IOC_HSM_IMPORT handler: mark a regular file as an HSM-archived,
 * released copy (ARCHIVED|EXISTS|RELEASED in the given archive), then
 * restore its mode, ownership, size and timestamps from the
 * hsm_user_import record via ll_setattr_raw().
 * NOTE(review): excerpt is incomplete — the hss allocation, the
 * inode_lock() pairing with the visible inode_unlock(), and the 'out'
 * label body are not visible here.
 */
2189 static int ll_hsm_import(struct inode *inode, struct file *file,
2190 struct hsm_user_import *hui)
2192 struct hsm_state_set *hss = NULL;
2193 struct iattr *attr = NULL;
2197 if (!S_ISREG(inode->i_mode))
2203 GOTO(out, rc = -ENOMEM);
2205 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2206 hss->hss_archive_id = hui->hui_archive_id;
2207 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2208 rc = ll_hsm_state_set(inode, hss);
2212 OBD_ALLOC_PTR(attr);
2214 GOTO(out, rc = -ENOMEM);
/* force regular-file mode bits from the import record */
2216 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2217 attr->ia_mode |= S_IFREG;
2218 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2219 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2220 attr->ia_size = hui->hui_size;
2221 attr->ia_mtime.tv_sec = hui->hui_mtime;
2222 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2223 attr->ia_atime.tv_sec = hui->hui_atime;
2224 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2226 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2227 ATTR_UID | ATTR_GID |
2228 ATTR_MTIME | ATTR_MTIME_SET |
2229 ATTR_ATIME | ATTR_ATIME_SET;
2233 rc = ll_setattr_raw(file_dentry(file), attr, true);
2237 inode_unlock(inode);
/* Translate an fmode_t into the LL_LEASE_{RD,WR}LCK bitmask reported to
 * userspace by the lease ioctls. */
2249 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2251 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2252 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * LL_IOC_FUTIMES_3 handler: set atime, mtime AND ctime of a regular
 * file from an ll_futimes_3 record (ctime cannot be set through the
 * normal utimes path, hence this privileged ioctl requiring
 * CAP_SYS_ADMIN).
 * NOTE(review): excerpt is incomplete — the iattr initializer spans
 * lines not all visible here, and the inode_lock() matching the visible
 * inode_unlock() is missing from this view.
 */
2255 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2257 struct inode *inode = file_inode(file);
2259 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2260 ATTR_MTIME | ATTR_MTIME_SET |
2261 ATTR_CTIME | ATTR_CTIME_SET,
2263 .tv_sec = lfu->lfu_atime_sec,
2264 .tv_nsec = lfu->lfu_atime_nsec,
2267 .tv_sec = lfu->lfu_mtime_sec,
2268 .tv_nsec = lfu->lfu_mtime_nsec,
2271 .tv_sec = lfu->lfu_ctime_sec,
2272 .tv_nsec = lfu->lfu_ctime_nsec,
2278 if (!capable(CAP_SYS_ADMIN))
2281 if (!S_ISREG(inode->i_mode))
2285 rc = ll_setattr_raw(file_dentry(file), &ia, false);
2286 inode_unlock(inode);
2292 * Give file access advices
2294 * The ladvise interface is similar to Linux fadvise() system call, except it
2295 * forwards the advices directly from Lustre client to server. The server side
2296 * codes will apply appropriate read-ahead and caching techniques for the
2297 * corresponding files.
2299 * A typical workload for ladvise is e.g. a bunch of different clients are
2300 * doing small random reads of a file, so prefetching pages into OSS cache
2301 * with big linear reads before the random IO is a net benefit. Fetching
2302 * all that data into each client cache with fadvise() may not be, due to
2303 * much more data being sent to the client.
/*
 * Forward one llapi_lu_ladvise advice to the server (see doc comment
 * above) by running a CIT_LADVISE cl_io over the advised byte range.
 */
2305 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2306 struct llapi_lu_ladvise *ladvise)
2310 struct cl_ladvise_io *lio;
2315 env = cl_env_get(&refcheck);
2317 RETURN(PTR_ERR(env));
2319 io = vvp_env_thread_io(env);
2320 io->ci_obj = ll_i2info(inode)->lli_clob;
2322 /* initialize parameters for ladvise */
2323 lio = &io->u.ci_ladvise;
2324 lio->li_start = ladvise->lla_start;
2325 lio->li_end = ladvise->lla_end;
2326 lio->li_fid = ll_inode2fid(inode);
2327 lio->li_advice = ladvise->lla_advice;
2328 lio->li_flags = flags;
2330 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2331 rc = cl_io_loop(env, io);
2335 cl_io_fini(env, io);
2336 cl_env_put(env, &refcheck);
/*
 * unlocked_ioctl dispatcher for regular files: decodes 'cmd' and routes
 * to the per-feature helpers (striping, group locks, HSM, leases,
 * fid2path, data version, ladvise, ...), falling back first to
 * dynamically registered handlers (ll_iocontrol_call) and finally to
 * obd_iocontrol() against the data export.
 * NOTE(review): excerpt is incomplete — the switch statement itself,
 * several RETURN/break statements, allocations (e.g. of hus/hss/hca/hui)
 * and case closers are on lines not visible in this view; the case
 * ordering and fall-through structure below must be preserved exactly.
 */
2341 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2343 struct inode *inode = file_inode(file);
2344 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2348 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2349 PFID(ll_inode2fid(inode)), inode, cmd);
2350 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2352 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2353 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2357 case LL_IOC_GETFLAGS:
2358 /* Get the current value of the file flags */
2359 return put_user(fd->fd_flags, (int __user *)arg);
2360 case LL_IOC_SETFLAGS:
2361 case LL_IOC_CLRFLAGS:
2362 /* Set or clear specific file flags */
2363 /* XXX This probably needs checks to ensure the flags are
2364 * not abused, and to handle any flag side effects.
2366 if (get_user(flags, (int __user *) arg))
2369 if (cmd == LL_IOC_SETFLAGS) {
2370 if ((flags & LL_FILE_IGNORE_LOCK) &&
2371 !(file->f_flags & O_DIRECT)) {
2372 CERROR("%s: unable to disable locking on "
2373 "non-O_DIRECT file\n", current->comm);
2377 fd->fd_flags |= flags;
2379 fd->fd_flags &= ~flags;
2382 case LL_IOC_LOV_SETSTRIPE:
2383 RETURN(ll_lov_setstripe(inode, file, arg));
2384 case LL_IOC_LOV_SETEA:
2385 RETURN(ll_lov_setea(inode, file, arg));
2386 case LL_IOC_LOV_SWAP_LAYOUTS: {
2388 struct lustre_swap_layouts lsl;
2390 if (copy_from_user(&lsl, (char __user *)arg,
2391 sizeof(struct lustre_swap_layouts)))
/* both files must be writable to have their layouts swapped */
2394 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
2397 file2 = fget(lsl.sl_fd);
2401 /* O_WRONLY or O_RDWR */
2402 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
2403 GOTO(out, rc = -EPERM);
2405 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
2406 struct inode *inode2;
2407 struct ll_inode_info *lli;
2408 struct obd_client_handle *och = NULL;
2410 if (lsl.sl_flags != SWAP_LAYOUTS_CLOSE)
2411 GOTO(out, rc = -EINVAL);
/* swap-and-close requires this fd to hold the lease */
2413 lli = ll_i2info(inode);
2414 mutex_lock(&lli->lli_och_mutex);
2415 if (fd->fd_lease_och != NULL) {
2416 och = fd->fd_lease_och;
2417 fd->fd_lease_och = NULL;
2419 mutex_unlock(&lli->lli_och_mutex);
2421 GOTO(out, rc = -ENOLCK);
2422 inode2 = file_inode(file2);
2423 rc = ll_swap_layouts_close(och, inode, inode2);
2425 rc = ll_swap_layouts(file, file2, &lsl);
2431 case LL_IOC_LOV_GETSTRIPE:
2432 RETURN(ll_file_getstripe(inode,
2433 (struct lov_user_md __user *)arg));
2434 case FSFILT_IOC_GETFLAGS:
2435 case FSFILT_IOC_SETFLAGS:
2436 RETURN(ll_iocontrol(inode, file, cmd, arg));
2437 case FSFILT_IOC_GETVERSION_OLD:
2438 case FSFILT_IOC_GETVERSION:
2439 RETURN(put_user(inode->i_generation, (int __user *)arg));
2440 case LL_IOC_GROUP_LOCK:
2441 RETURN(ll_get_grouplock(inode, file, arg));
2442 case LL_IOC_GROUP_UNLOCK:
2443 RETURN(ll_put_grouplock(inode, file, arg));
2444 case IOC_OBD_STATFS:
2445 RETURN(ll_obd_statfs(inode, (void __user *)arg));
2447 /* We need to special case any other ioctls we want to handle,
2448 * to send them to the MDS/OST as appropriate and to properly
2449 * network encode the arg field.
2450 case FSFILT_IOC_SETVERSION_OLD:
2451 case FSFILT_IOC_SETVERSION:
2453 case LL_IOC_FLUSHCTX:
2454 RETURN(ll_flush_ctx(inode));
2455 case LL_IOC_PATH2FID: {
2456 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2457 sizeof(struct lu_fid)))
2462 case LL_IOC_GETPARENT:
2463 RETURN(ll_getparent(file, (struct getparent __user *)arg));
2465 case OBD_IOC_FID2PATH:
2466 RETURN(ll_fid2path(inode, (void __user *)arg));
2467 case LL_IOC_DATA_VERSION: {
2468 struct ioc_data_version idv;
2471 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
/* only flush-mode bits are honoured from userspace */
2474 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2475 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2478 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2484 case LL_IOC_GET_MDTIDX: {
2487 mdtidx = ll_get_mdt_idx(inode);
2491 if (put_user((int)mdtidx, (int __user *)arg))
2496 case OBD_IOC_GETDTNAME:
2497 case OBD_IOC_GETMDNAME:
2498 RETURN(ll_get_obd_name(inode, cmd, arg));
2499 case LL_IOC_HSM_STATE_GET: {
2500 struct md_op_data *op_data;
2501 struct hsm_user_state *hus;
2508 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2509 LUSTRE_OPC_ANY, hus);
2510 if (IS_ERR(op_data)) {
2512 RETURN(PTR_ERR(op_data));
2515 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2518 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2521 ll_finish_md_op_data(op_data);
2525 case LL_IOC_HSM_STATE_SET: {
2526 struct hsm_state_set *hss;
2533 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2538 rc = ll_hsm_state_set(inode, hss);
2543 case LL_IOC_HSM_ACTION: {
2544 struct md_op_data *op_data;
2545 struct hsm_current_action *hca;
2552 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2553 LUSTRE_OPC_ANY, hca);
2554 if (IS_ERR(op_data)) {
2556 RETURN(PTR_ERR(op_data));
2559 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2562 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2565 ll_finish_md_op_data(op_data);
2569 case LL_IOC_SET_LEASE: {
2570 struct ll_inode_info *lli = ll_i2info(inode);
2571 struct obd_client_handle *och = NULL;
2576 case LL_LEASE_WRLCK:
2577 if (!(file->f_mode & FMODE_WRITE))
2579 fmode = FMODE_WRITE;
2581 case LL_LEASE_RDLCK:
2582 if (!(file->f_mode & FMODE_READ))
2586 case LL_LEASE_UNLCK:
/* detach the lease handle from the fd before closing it */
2587 mutex_lock(&lli->lli_och_mutex);
2588 if (fd->fd_lease_och != NULL) {
2589 och = fd->fd_lease_och;
2590 fd->fd_lease_och = NULL;
2592 mutex_unlock(&lli->lli_och_mutex);
2597 fmode = och->och_flags;
2598 rc = ll_lease_close(och, inode, &lease_broken);
2602 rc = ll_lease_och_release(inode, file);
/* report the lease type that was just dropped */
2609 RETURN(ll_lease_type_from_fmode(fmode));
2614 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2616 /* apply for lease */
2617 och = ll_lease_open(inode, file, fmode, 0);
2619 RETURN(PTR_ERR(och));
2622 mutex_lock(&lli->lli_och_mutex);
2623 if (fd->fd_lease_och == NULL) {
2624 fd->fd_lease_och = och;
2627 mutex_unlock(&lli->lli_och_mutex);
2629 /* impossible now that only excl is supported for now */
2630 ll_lease_close(och, inode, &lease_broken);
2635 case LL_IOC_GET_LEASE: {
2636 struct ll_inode_info *lli = ll_i2info(inode);
2637 struct ldlm_lock *lock = NULL;
2640 mutex_lock(&lli->lli_och_mutex);
2641 if (fd->fd_lease_och != NULL) {
2642 struct obd_client_handle *och = fd->fd_lease_och;
/* only report the lease if its DLM lock is not being cancelled */
2644 lock = ldlm_handle2lock(&och->och_lease_handle);
2646 lock_res_and_lock(lock);
2647 if (!ldlm_is_cancel(lock))
2648 fmode = och->och_flags;
2650 unlock_res_and_lock(lock);
2651 LDLM_LOCK_PUT(lock);
2654 mutex_unlock(&lli->lli_och_mutex);
2656 RETURN(ll_lease_type_from_fmode(fmode));
2658 case LL_IOC_HSM_IMPORT: {
2659 struct hsm_user_import *hui;
2665 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
2670 rc = ll_hsm_import(inode, file, hui);
2675 case LL_IOC_FUTIMES_3: {
2676 struct ll_futimes_3 lfu;
2678 if (copy_from_user(&lfu,
2679 (const struct ll_futimes_3 __user *)arg,
2683 RETURN(ll_file_futimes_3(file, &lfu));
2685 case LL_IOC_LADVISE: {
2686 struct llapi_ladvise_hdr *ladvise_hdr;
2689 int alloc_size = sizeof(*ladvise_hdr);
/* first pass: read just the header to learn lah_count */
2692 OBD_ALLOC_PTR(ladvise_hdr);
2693 if (ladvise_hdr == NULL)
2696 if (copy_from_user(ladvise_hdr,
2697 (const struct llapi_ladvise_hdr __user *)arg,
2699 GOTO(out_ladvise, rc = -EFAULT);
2701 if (ladvise_hdr->lah_magic != LADVISE_MAGIC ||
2702 ladvise_hdr->lah_count < 1)
2703 GOTO(out_ladvise, rc = -EINVAL);
2705 num_advise = ladvise_hdr->lah_count;
2706 if (num_advise >= LAH_COUNT_MAX)
2707 GOTO(out_ladvise, rc = -EFBIG);
/* second pass: reallocate sized for the advice array and re-copy */
2709 OBD_FREE_PTR(ladvise_hdr);
2710 alloc_size = offsetof(typeof(*ladvise_hdr),
2711 lah_advise[num_advise]);
2712 OBD_ALLOC(ladvise_hdr, alloc_size);
2713 if (ladvise_hdr == NULL)
2717 * TODO: submit multiple advices to one server in a single RPC
2719 if (copy_from_user(ladvise_hdr,
2720 (const struct llapi_ladvise_hdr __user *)arg,
2722 GOTO(out_ladvise, rc = -EFAULT);
2724 for (i = 0; i < num_advise; i++) {
2725 rc = ll_ladvise(inode, file, ladvise_hdr->lah_flags,
2726 &ladvise_hdr->lah_advise[i]);
2732 OBD_FREE(ladvise_hdr, alloc_size);
/* unknown cmd: try dynamically registered handlers, then the OBD */
2739 ll_iocontrol_call(inode, file, cmd, arg, &err))
2742 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2743 (void __user *)arg));
/*
 * llseek_execute() - validate and apply a new file offset.
 * Compat fallback used only when the kernel lacks
 * generic_file_llseek_size() (no HAVE_FILE_LLSEEK_SIZE).
 * Visible logic: reject negative offsets unless FMODE_UNSIGNED_OFFSET,
 * reject offsets beyond @maxsize, and when the position actually
 * changes update f_pos and reset f_version.
 * NOTE(review): this extraction is missing interior lines (the error
 * returns and closing braces are not visible); comments cover only
 * what is shown.
 */
2748 #ifndef HAVE_FILE_LLSEEK_SIZE
2749 static inline loff_t
2750 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2752 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2754 if (offset > maxsize)
2757 if (offset != file->f_pos) {
2758 file->f_pos = offset;
2759 file->f_version = 0;
/*
 * Local copy of generic_file_llseek_size() for kernels that do not
 * export it.  Handles the llseek origins against a caller-supplied
 * @maxsize limit and @eof (used for SEEK_END/SEEK_DATA/SEEK_HOLE).
 * NOTE(review): the switch/case skeleton is missing from this
 * extraction; only the SEEK_CUR fast path and the DATA/HOLE comments
 * survive.  Presumably mirrors the upstream kernel helper — confirm
 * against the original file before relying on details.
 */
2765 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2766 loff_t maxsize, loff_t eof)
2768 struct inode *inode = file_inode(file);
2776 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2777 * position-querying operation. Avoid rewriting the "same"
2778 * f_pos value back to the file because a concurrent read(),
2779 * write() or lseek() might have altered it
2784 * f_lock protects against read/modify/write race with other
2785 * SEEK_CURs. Note that parallel writes and reads behave
2789 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2790 inode_unlock(inode);
2794 * In the generic case the entire file is data, so as long as
2795 * offset isn't at the end of the file then the offset is data.
2802 * There is a virtual hole at the end of the file, so as long as
2803 * offset isn't i_size or larger, return i_size.
2811 return llseek_execute(file, offset, maxsize);
/*
 * ll_file_seek() - ->llseek file operation.
 * For SEEK_END/SEEK_HOLE/SEEK_DATA the file size must be current, so
 * glimpse the size from the OSTs first, then delegate to
 * ll_generic_file_llseek_size() bounded by ll_file_maxbytes().
 * Also tallies the LPROC_LL_LLSEEK stat.
 * NOTE(review): extraction is missing interior lines (error handling
 * after ll_glimpse_size() and the final RETURN are not visible).
 */
2815 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2817 struct inode *inode = file_inode(file);
2818 loff_t retval, eof = 0;
2821 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2822 (origin == SEEK_CUR) ? file->f_pos : 0);
2823 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2824 PFID(ll_inode2fid(inode)), inode, retval, retval,
2826 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2828 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2829 retval = ll_glimpse_size(inode);
2832 eof = i_size_read(inode);
2835 retval = ll_generic_file_llseek_size(file, offset, origin,
2836 ll_file_maxbytes(inode), eof);
/*
 * ll_flush() - ->flush file operation (called on close(2)).
 * Reports asynchronous writeback errors, recorded earlier in
 * lli_async_rc and in the cl_object layer, back to the application:
 * reads-and-clears the pending error, but suppresses it if this fd
 * already saw a write failure (fd_write_failed), so the error is
 * reported at most once.  Returns -EIO if an unreported error is
 * pending, 0 otherwise.
 */
2840 static int ll_flush(struct file *file, fl_owner_t id)
2842 struct inode *inode = file_inode(file);
2843 struct ll_inode_info *lli = ll_i2info(inode);
2844 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2847 LASSERT(!S_ISDIR(inode->i_mode));
2849 /* catch async errors that were recorded back when async writeback
2850 * failed for pages in this mapping. */
2851 rc = lli->lli_async_rc;
2852 lli->lli_async_rc = 0;
2853 if (lli->lli_clob != NULL) {
2854 err = lov_read_and_clear_async_rc(lli->lli_clob);
2859 /* The application has been told write failure already.
2860 * Do not report failure again. */
2861 if (fd->fd_write_failed)
2863 return rc ? -EIO : 0;
/*
 * cl_sync_file_range() - flush a byte range of @inode through the
 * cl_io layer as a CIT_FSYNC io.
 * @mode selects the fsync semantics (NONE/LOCAL/DISCARD/ALL); any
 * other value is rejected.  On success the visible code returns the
 * number of pages written (fio->fi_nr_written); on io setup failure
 * it returns io->ci_result.
 * NOTE(review): extraction drops some interior lines (fi_end
 * assignment and the mode-validation error return are not visible).
 */
2867 * Called to make sure a portion of file has been written out.
2868 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2870 * Return how many pages have been written.
2872 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2873 enum cl_fsync_mode mode, int ignore_layout)
2877 struct cl_fsync_io *fio;
2882 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2883 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2886 env = cl_env_get(&refcheck)
2888 RETURN(PTR_ERR(env));
2890 io = vvp_env_thread_io(env);
2891 io->ci_obj = ll_i2info(inode)->lli_clob;
2892 io->ci_ignore_layout = ignore_layout;
2894 /* initialize parameters for sync */
2895 fio = &io->u.ci_fsync;
2896 fio->fi_start = start;
2898 fio->fi_fid = ll_inode2fid(inode);
2899 fio->fi_mode = mode;
2900 fio->fi_nr_written = 0;
2902 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2903 result = cl_io_loop(env, io);
2905 result = io->ci_result;
2907 result = fio->fi_nr_written;
2908 cl_io_fini(env, io);
2909 cl_env_put(env, &refcheck);
/*
 * ll_fsync() - ->fsync file operation, with three signature variants
 * selected by kernel-compat macros (4-arg, 2-arg, or dentry-based).
 * Flow visible here: flush/wait the page cache, fold in any recorded
 * async writeback errors, send an MDS_SYNC via md_fsync(), then for
 * regular files sync the data range through cl_sync_file_range()
 * with CL_FSYNC_ALL, tracking per-fd write failure state.
 * NOTE(review): extraction is missing interior lines (ENTRY/RETURN,
 * some error folding and the inode_lock pairing are not visible).
 */
2915 * When dentry is provided (the 'else' case), file_dentry() may be
2916 * null and dentry must be used directly rather than pulled from
2917 * file_dentry() as is done otherwise.
2920 #ifdef HAVE_FILE_FSYNC_4ARGS
2921 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2923 struct dentry *dentry = file_dentry(file);
2924 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2925 int ll_fsync(struct file *file, int datasync)
2927 struct dentry *dentry = file_dentry(file);
2929 loff_t end = LLONG_MAX;
2931 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2934 loff_t end = LLONG_MAX;
2936 struct inode *inode = dentry->d_inode;
2937 struct ll_inode_info *lli = ll_i2info(inode);
2938 struct ptlrpc_request *req;
2942 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2943 PFID(ll_inode2fid(inode)), inode);
2944 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2946 #ifdef HAVE_FILE_FSYNC_4ARGS
2947 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2950 /* fsync's caller has already called _fdata{sync,write}, we want
2951 * that IO to finish before calling the osc and mdc sync methods */
2952 rc = filemap_fdatawait(inode->i_mapping);
2955 /* catch async errors that were recorded back when async writeback
2956 * failed for pages in this mapping. */
2957 if (!S_ISDIR(inode->i_mode)) {
2958 err = lli->lli_async_rc;
2959 lli->lli_async_rc = 0;
2962 err = lov_read_and_clear_async_rc(lli->lli_clob);
2967 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
2971 ptlrpc_req_finished(req);
2973 if (S_ISREG(inode->i_mode)) {
2974 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2976 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2977 if (rc == 0 && err < 0)
2980 fd->fd_write_failed = true;
2982 fd->fd_write_failed = false;
2985 #ifdef HAVE_FILE_FSYNC_4ARGS
2986 inode_unlock(inode);
/*
 * ll_file_flock() - ->lock / ->flock file operation.
 * Translates a VFS file_lock (FL_FLOCK whole-file or FL_POSIX byte
 * range) into an LDLM_FLOCK enqueue to the MDS: lock mode maps
 * read->LCK_PR, write->LCK_PW, unlock->LCK_NL; F_SETLK* variants add
 * LDLM_FL_BLOCK_NOWAIT, F_GETLK* add LDLM_FL_TEST_LOCK.  On success
 * the lock is also registered with the local VFS lock lists
 * (locks_lock_file_wait or the older flock/posix split); if the local
 * step fails the remote lock is torn down by re-enqueueing LCK_NL.
 * NOTE(review): extraction is missing interior lines (several case
 * labels, error returns and the final RETURN are not visible).
 */
2992 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2994 struct inode *inode = file_inode(file);
2995 struct ll_sb_info *sbi = ll_i2sbi(inode);
2996 struct ldlm_enqueue_info einfo = {
2997 .ei_type = LDLM_FLOCK,
2998 .ei_cb_cp = ldlm_flock_completion_ast,
2999 .ei_cbdata = file_lock,
3001 struct md_op_data *op_data;
3002 struct lustre_handle lockh = { 0 };
3003 union ldlm_policy_data flock = { { 0 } };
3004 int fl_type = file_lock->fl_type;
3010 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3011 PFID(ll_inode2fid(inode)), file_lock);
3013 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3015 if (file_lock->fl_flags & FL_FLOCK) {
3016 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3017 /* flocks are whole-file locks */
3018 flock.l_flock.end = OFFSET_MAX;
3019 /* For flocks owner is determined by the local file desctiptor*/
3020 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3021 } else if (file_lock->fl_flags & FL_POSIX) {
3022 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3023 flock.l_flock.start = file_lock->fl_start;
3024 flock.l_flock.end = file_lock->fl_end;
3028 flock.l_flock.pid = file_lock->fl_pid;
3030 /* Somewhat ugly workaround for svc lockd.
3031 * lockd installs custom fl_lmops->lm_compare_owner that checks
3032 * for the fl_owner to be the same (which it always is on local node
3033 * I guess between lockd processes) and then compares pid.
3034 * As such we assign pid to the owner field to make it all work,
3035 * conflict with normal locks is unlikely since pid space and
3036 * pointer space for current->files are not intersecting */
3037 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3038 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
3042 einfo.ei_mode = LCK_PR;
3045 /* An unlock request may or may not have any relation to
3046 * existing locks so we may not be able to pass a lock handle
3047 * via a normal ldlm_lock_cancel() request. The request may even
3048 * unlock a byte range in the middle of an existing lock. In
3049 * order to process an unlock request we need all of the same
3050 * information that is given with a normal read or write record
3051 * lock request. To avoid creating another ldlm unlock (cancel)
3052 * message we'll treat a LCK_NL flock request as an unlock. */
3053 einfo.ei_mode = LCK_NL;
3056 einfo.ei_mode = LCK_PW;
3059 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
3074 flags = LDLM_FL_BLOCK_NOWAIT;
3080 flags = LDLM_FL_TEST_LOCK;
3083 CERROR("unknown fcntl lock command: %d\n", cmd);
3087 /* Save the old mode so that if the mode in the lock changes we
3088 * can decrement the appropriate reader or writer refcount. */
3089 file_lock->fl_type = einfo.ei_mode;
3091 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3092 LUSTRE_OPC_ANY, NULL);
3093 if (IS_ERR(op_data))
3094 RETURN(PTR_ERR(op_data));
3096 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
3097 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
3098 flock.l_flock.pid, flags, einfo.ei_mode,
3099 flock.l_flock.start, flock.l_flock.end);
3101 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data, &lockh,
3104 /* Restore the file lock type if not TEST lock. */
3105 if (!(flags & LDLM_FL_TEST_LOCK))
3106 file_lock->fl_type = fl_type;
3108 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
3109 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
3110 !(flags & LDLM_FL_TEST_LOCK))
3111 rc2 = locks_lock_file_wait(file, file_lock);
3113 if ((file_lock->fl_flags & FL_FLOCK) &&
3114 (rc == 0 || file_lock->fl_type == F_UNLCK))
3115 rc2 = flock_lock_file_wait(file, file_lock);
3116 if ((file_lock->fl_flags & FL_POSIX) &&
3117 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3118 !(flags & LDLM_FL_TEST_LOCK))
3119 rc2 = posix_lock_file_wait(file, file_lock);
3120 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
3122 if (rc2 && file_lock->fl_type != F_UNLCK) {
3123 einfo.ei_mode = LCK_NL;
3124 md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data,
3129 ll_finish_md_op_data(op_data);
/*
 * ll_get_fid_by_name() - look up the FID of @name under @parent via
 * an md_getattr_name RPC (OBD_MD_FLID | OBD_MD_FLTYPE), storing it in
 * @fid.  When @inode is non-NULL the visible code also instantiates
 * the inode from the reply via ll_prep_inode().
 * NOTE(review): extraction is missing interior lines (the rc check
 * after md_getattr_name and the final RETURN are not visible).
 */
3134 int ll_get_fid_by_name(struct inode *parent, const char *name,
3135 int namelen, struct lu_fid *fid,
3136 struct inode **inode)
3138 struct md_op_data *op_data = NULL;
3139 struct mdt_body *body;
3140 struct ptlrpc_request *req;
3144 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3145 LUSTRE_OPC_ANY, NULL);
3146 if (IS_ERR(op_data))
3147 RETURN(PTR_ERR(op_data));
3149 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
3150 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3151 ll_finish_md_op_data(op_data);
3155 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3157 GOTO(out_req, rc = -EFAULT);
3159 *fid = body->mbo_fid1;
3162 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
3164 ptlrpc_req_finished(req);
/*
 * ll_migrate() - migrate entry @name under @parent to MDT @mdtidx
 * (backend of "lfs migrate -m").
 * Visible flow: resolve the child inode (dcache lookup, falling back
 * to ll_get_fid_by_name()), refuse to migrate the filesystem root,
 * skip if the file already lives on the target MDT, take a write
 * lease and data version for regular files so the copy is stable,
 * then issue an md_rename() with CLI_MIGRATE / MDS_RENAME_MIGRATE.
 * The open-handle replay data is cleaned up here when the server
 * indicates the close intent was executed; the whole sequence retries
 * on -EAGAIN (layout changed during migration).
 * NOTE(review): extraction is missing interior lines (IS_ERR checks,
 * qstr.name/len setup, dput, several gotos and the final RETURN are
 * not visible); the cleanup-label structure can only be inferred.
 */
3168 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3169 const char *name, int namelen)
3171 struct dentry *dchild = NULL;
3172 struct inode *child_inode = NULL;
3173 struct md_op_data *op_data;
3174 struct ptlrpc_request *request = NULL;
3175 struct obd_client_handle *och = NULL;
3177 struct mdt_body *body;
3179 __u64 data_version = 0;
3182 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3183 name, PFID(ll_inode2fid(parent)), mdtidx);
3185 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3186 0, LUSTRE_OPC_ANY, NULL);
3187 if (IS_ERR(op_data))
3188 RETURN(PTR_ERR(op_data));
3190 /* Get child FID first */
3191 qstr.hash = full_name_hash(name, namelen);
3194 dchild = d_lookup(file_dentry(file), &qstr);
3195 if (dchild != NULL) {
3196 if (dchild->d_inode != NULL)
3197 child_inode = igrab(dchild->d_inode);
3201 if (child_inode == NULL) {
3202 rc = ll_get_fid_by_name(parent, name, namelen,
3203 &op_data->op_fid3, &child_inode);
3208 if (child_inode == NULL)
3209 GOTO(out_free, rc = -EINVAL);
3212 * lfs migrate command needs to be blocked on the client
3213 * by checking the migrate FID against the FID of the
3216 if (child_inode == parent->i_sb->s_root->d_inode)
3217 GOTO(out_iput, rc = -EINVAL);
3219 inode_lock(child_inode);
3220 op_data->op_fid3 = *ll_inode2fid(child_inode);
3221 if (!fid_is_sane(&op_data->op_fid3)) {
3222 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
3223 ll_get_fsname(parent->i_sb, NULL, 0), name,
3224 PFID(&op_data->op_fid3));
3225 GOTO(out_unlock, rc = -EINVAL);
3228 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3230 GOTO(out_unlock, rc);
3233 CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name,
3234 PFID(&op_data->op_fid3), mdtidx);
3235 GOTO(out_unlock, rc = 0);
3238 if (S_ISREG(child_inode->i_mode)) {
3239 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
3243 GOTO(out_unlock, rc);
3246 rc = ll_data_version(child_inode, &data_version,
3249 GOTO(out_close, rc);
3251 op_data->op_handle = och->och_fh;
3252 op_data->op_data = och->och_mod;
3253 op_data->op_data_version = data_version;
3254 op_data->op_lease_handle = och->och_lease_handle;
3255 op_data->op_bias |= MDS_RENAME_MIGRATE;
3258 op_data->op_mds = mdtidx;
3259 op_data->op_cli_flags = CLI_MIGRATE;
3260 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3261 namelen, name, namelen, &request);
3263 ll_update_times(request, parent);
3265 if (request != NULL) {
3266 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
3268 ptlrpc_req_finished(request);
3269 GOTO(out_close, rc = -EPROTO);
3272 /* If the server does release layout lock, then we cleanup
3273 * the client och here, otherwise release it in out_close: */
3275 body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
3276 obd_mod_put(och->och_mod);
3277 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
3279 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
3283 ptlrpc_req_finished(request);
3286 /* Try again if the file layout has changed. */
3287 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode)) {
3292 if (och != NULL) /* close the file */
3293 ll_lease_close(och, child_inode, NULL);
3295 clear_nlink(child_inode);
3297 inode_unlock(child_inode);
3301 ll_finish_md_op_data(op_data);
3306 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
/*
 * ll_have_md_lock() - test whether the client already holds MDS
 * inodebits locks covering *bits in mode @l_req_mode (any of
 * CR/CW/PR/PW when LCK_MINMODE).  Matching is done bit-by-bit with
 * LDLM_FL_TEST_LOCK so no references are taken; matched bits are
 * cleared from *bits, unmatched bits remain.
 * NOTE(review): extraction is missing interior lines (the final
 * RETURN and parts of the inner if/else are not visible).
 */
3314 * test if some locks matching bits and l_req_mode are acquired
3315 * - bits can be in different locks
3316 * - if found clear the common lock bits in *bits
3317 * - the bits not found, are kept in *bits
3319 * \param bits [IN] searched lock bits [IN]
3320 * \param l_req_mode [IN] searched lock mode
3321 * \retval boolean, true iff all bits are found
3323 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
3325 struct lustre_handle lockh;
3326 union ldlm_policy_data policy;
3327 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
3328 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
3337 fid = &ll_i2info(inode)->lli_fid;
3338 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3339 ldlm_lockname[mode]);
3341 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
3342 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3343 policy.l_inodebits.bits = *bits & (1 << i);
3344 if (policy.l_inodebits.bits == 0)
3347 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3348 &policy, mode, &lockh)) {
3349 struct ldlm_lock *lock;
3351 lock = ldlm_handle2lock(&lockh);
3354 ~(lock->l_policy_data.l_inodebits.bits);
3355 LDLM_LOCK_PUT(lock);
3357 *bits &= ~policy.l_inodebits.bits;
/*
 * ll_take_md_lock() - try to match (and reference) an existing MDS
 * inodebits lock on @inode covering @bits in mode @mode; extra match
 * flags may be passed in @flags.  Returns the matched mode via
 * md_lock_match() with the handle in @lockh (0 if no match).
 */
3364 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
3365 struct lustre_handle *lockh, __u64 flags,
3366 enum ldlm_mode mode)
3368 union ldlm_policy_data policy = { .l_inodebits = { bits } };
3373 fid = &ll_i2info(inode)->lli_fid;
3374 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3376 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3377 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * ll_inode_revalidate_fini() - post-process the result of an inode
 * revalidation RPC.  -ENOENT (already unlinked) is handled specially:
 * striped directories with bad stripes get re-validated, and only
 * regular/directory inodes are treated as expected races.  Other
 * errors are logged (rate-limited to D_INFO for EACCES/EIDRM).
 * NOTE(review): extraction is missing interior lines, so the exact
 * return value in the -ENOENT branches is not visible here.
 */
3382 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3384 /* Already unlinked. Just update nlink and return success */
3385 if (rc == -ENOENT) {
3387 /* If it is striped directory, and there is bad stripe
3388 * Let's revalidate the dentry again, instead of returning
3390 if (S_ISDIR(inode->i_mode) &&
3391 ll_i2info(inode)->lli_lsm_md != NULL)
3394 /* This path cannot be hit for regular files unless in
3395 * case of obscure races, so no need to to validate
3397 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3399 } else if (rc != 0) {
3400 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3401 "%s: revalidate FID "DFID" error: rc = %d\n",
3402 ll_get_fsname(inode->i_sb, NULL, 0),
3403 PFID(ll_inode2fid(inode)), rc);
/*
 * __ll_inode_revalidate() - refresh the MDS attributes of @dentry's
 * inode for the lock bits in @ibits.
 * Two paths are visible: with OBD_CONNECT_ATTRFID the client sends an
 * intent getattr/lookup by FID (md_intent_lock) and finishes the
 * revalidation, invalidating the dentry if the file was unlinked;
 * otherwise, if no matching MD lock is already held, it sends a plain
 * md_getattr (requesting EA sizes for regular files) and rebuilds the
 * inode from the reply.
 * NOTE(review): extraction is missing interior lines (GOTO targets,
 * the 'out:' label and final RETURN are not visible).
 */
3409 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3411 struct inode *inode = dentry->d_inode;
3412 struct ptlrpc_request *req = NULL;
3413 struct obd_export *exp;
3417 LASSERT(inode != NULL);
3419 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3420 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3422 exp = ll_i2mdexp(inode);
3424 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3425 * But under CMD case, it caused some lock issues, should be fixed
3426 * with new CMD ibits lock. See bug 12718 */
3427 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3428 struct lookup_intent oit = { .it_op = IT_GETATTR };
3429 struct md_op_data *op_data;
3431 if (ibits == MDS_INODELOCK_LOOKUP)
3432 oit.it_op = IT_LOOKUP;
3434 /* Call getattr by fid, so do not provide name at all. */
3435 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3436 dentry->d_inode, NULL, 0, 0,
3437 LUSTRE_OPC_ANY, NULL);
3438 if (IS_ERR(op_data))
3439 RETURN(PTR_ERR(op_data));
3441 rc = md_intent_lock(exp, op_data, &oit, &req,
3442 &ll_md_blocking_ast, 0);
3443 ll_finish_md_op_data(op_data);
3445 rc = ll_inode_revalidate_fini(inode, rc);
3449 rc = ll_revalidate_it_finish(req, &oit, dentry);
3451 ll_intent_release(&oit);
3455 /* Unlinked? Unhash dentry, so it is not picked up later by
3456 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3457 here to preserve get_cwd functionality on 2.6.
3459 if (!dentry->d_inode->i_nlink) {
3460 ll_lock_dcache(inode);
3461 d_lustre_invalidate(dentry, 0);
3462 ll_unlock_dcache(inode);
3465 ll_lookup_finish_locks(&oit, dentry);
3466 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3467 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3468 u64 valid = OBD_MD_FLGETATTR;
3469 struct md_op_data *op_data;
3472 if (S_ISREG(inode->i_mode)) {
3473 rc = ll_get_default_mdsize(sbi, &ealen);
3476 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3479 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3480 0, ealen, LUSTRE_OPC_ANY,
3482 if (IS_ERR(op_data))
3483 RETURN(PTR_ERR(op_data));
3485 op_data->op_valid = valid;
3486 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3487 ll_finish_md_op_data(op_data);
3489 rc = ll_inode_revalidate_fini(inode, rc);
3493 rc = ll_prep_inode(&inode, req, NULL, NULL);
3496 ptlrpc_req_finished(req);
/*
 * ll_merge_md_attr() - for a striped directory, merge the attributes
 * of all stripes (via md_merge_attr) and apply the combined nlink,
 * blocks, size and [amc]times to the master inode / lli cache.
 * NOTE(review): extraction is missing interior lines (the rc check
 * after md_merge_attr and the final RETURN are not visible).
 */
3500 static int ll_merge_md_attr(struct inode *inode)
3502 struct cl_attr attr = { 0 };
3505 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3506 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3507 &attr, ll_md_blocking_ast);
3511 set_nlink(inode, attr.cat_nlink);
3512 inode->i_blocks = attr.cat_blocks;
3513 i_size_write(inode, attr.cat_size);
3515 ll_i2info(inode)->lli_atime = attr.cat_atime;
3516 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
3517 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/*
 * ll_inode_revalidate() - full revalidation wrapper: refresh MD
 * attributes via __ll_inode_revalidate(), then refresh size/times.
 * Non-regular files take their [amc]times from the lli cache (striped
 * dirs merge stripe attrs first); regular files glimpse the size from
 * the OSTs unless an HSM restore is running (LLIF_FILE_RESTORING), in
 * which case the MDT-provided size is already correct and a glimpse
 * would block until the restore completes.
 */
3523 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3525 struct inode *inode = dentry->d_inode;
3529 rc = __ll_inode_revalidate(dentry, ibits);
3533 /* if object isn't regular file, don't validate size */
3534 if (!S_ISREG(inode->i_mode)) {
3535 if (S_ISDIR(inode->i_mode) &&
3536 ll_i2info(inode)->lli_lsm_md != NULL) {
3537 rc = ll_merge_md_attr(inode);
3542 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
3543 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
3544 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
3546 /* In case of restore, the MDT has the right size and has
3547 * already send it back without granting the layout lock,
3548 * inode is up-to-date so glimpse is useless.
3549 * Also to glimpse we need the layout, in case of a running
3550 * restore the MDT holds the layout lock so the glimpse will
3551 * block up to the end of restore (getattr will block)
3553 if (!ll_file_test_flag(ll_i2info(inode), LLIF_FILE_RESTORING))
3554 rc = ll_glimpse_size(inode);
/*
 * ll_getattr() - ->getattr inode operation.
 * Revalidates UPDATE|LOOKUP bits, then fills *stat from the (now
 * fresh) inode.  With a 32-bit-API client the ino is rebuilt from the
 * FID via cl_fid_build_ino().  OBD_FAIL_GETATTR_DELAY is a fault-
 * injection point for testing.
 */
3559 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3561 struct inode *inode = de->d_inode;
3562 struct ll_sb_info *sbi = ll_i2sbi(inode);
3563 struct ll_inode_info *lli = ll_i2info(inode);
3566 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3567 MDS_INODELOCK_LOOKUP);
3568 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3573 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
3575 stat->dev = inode->i_sb->s_dev;
3576 if (ll_need_32bit_api(sbi))
3577 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3579 stat->ino = inode->i_ino;
3580 stat->mode = inode->i_mode;
3581 stat->uid = inode->i_uid;
3582 stat->gid = inode->i_gid;
3583 stat->rdev = inode->i_rdev;
3584 stat->atime = inode->i_atime;
3585 stat->mtime = inode->i_mtime;
3586 stat->ctime = inode->i_ctime;
3587 stat->blksize = 1 << inode->i_blkbits;
3589 stat->nlink = inode->i_nlink;
3590 stat->size = i_size_read(inode);
3591 stat->blocks = inode->i_blocks;
/*
 * ll_fiemap() - ->fiemap inode operation.
 * Builds a struct fiemap sized for fi_extents_max extents, copies the
 * first user extent in (for FIEMAP_EXTENT_* continuation semantics),
 * runs ll_do_fiemap(), then copies flags, mapped-extent count and the
 * mapped extents back to the fieinfo user buffer.
 * NOTE(review): extraction is missing interior lines (allocation-
 * failure return, the rc check after ll_do_fiemap and the final
 * RETURN are not visible).
 */
3596 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3597 __u64 start, __u64 len)
3601 struct fiemap *fiemap;
3602 unsigned int extent_count = fieinfo->fi_extents_max;
3604 num_bytes = sizeof(*fiemap) + (extent_count *
3605 sizeof(struct fiemap_extent));
3606 OBD_ALLOC_LARGE(fiemap, num_bytes);
3611 fiemap->fm_flags = fieinfo->fi_flags;
3612 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3613 fiemap->fm_start = start;
3614 fiemap->fm_length = len;
3615 if (extent_count > 0 &&
3616 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3617 sizeof(struct fiemap_extent)) != 0)
3618 GOTO(out, rc = -EFAULT);
3620 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3622 fieinfo->fi_flags = fiemap->fm_flags;
3623 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3624 if (extent_count > 0 &&
3625 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3626 fiemap->fm_mapped_extents *
3627 sizeof(struct fiemap_extent)) != 0)
3628 GOTO(out, rc = -EFAULT);
3630 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * ll_get_acl() - ->get_acl inode operation: return a referenced copy
 * of the cached POSIX ACL under lli_lock.  The VFS releases the
 * reference taken by posix_acl_dup().
 */
3634 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3636 struct ll_inode_info *lli = ll_i2info(inode);
3637 struct posix_acl *acl = NULL;
3640 spin_lock(&lli->lli_lock);
3641 /* VFS' acl_permission_check->check_acl will release the refcount */
3642 acl = posix_acl_dup(lli->lli_posix_acl);
3643 spin_unlock(&lli->lli_lock);
/*
 * ll_check_acl() - ACL callback passed to generic_permission() on
 * kernels without the 2-arg variant.  With CONFIG_FS_POSIX_ACL it
 * fetches the cached access ACL and runs posix_acl_permission();
 * in RCU walk (IPERM_FLAG_RCU) it must not block.
 * NOTE(review): extraction is missing interior lines (the -ECHILD/
 * no-acl returns and the !CONFIG_FS_POSIX_ACL stub body are not
 * visible).
 */
3648 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
3650 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3651 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3653 ll_check_acl(struct inode *inode, int mask)
3656 # ifdef CONFIG_FS_POSIX_ACL
3657 struct posix_acl *acl;
3661 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3662 if (flags & IPERM_FLAG_RCU)
3665 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3670 rc = posix_acl_permission(inode, acl, mask);
3671 posix_acl_release(acl);
3674 # else /* !CONFIG_FS_POSIX_ACL */
3676 # endif /* CONFIG_FS_POSIX_ACL */
3678 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * ll_inode_permission() - ->permission inode operation (three compat
 * signatures).  Bails out of RCU-walk (MAY_NOT_BLOCK/IPERM_FLAG_RCU)
 * since the checks below may block; revalidates the root inode (which
 * lookup never revalidates); then, if root-squash is configured and
 * the caller is root, overrides the task credentials with the squash
 * fsuid/fsgid and drops filesystem capabilities before delegating to
 * generic permission checking, restoring the credentials afterwards.
 * NOTE(review): extraction is missing interior lines (-ECHILD
 * returns, prepare_creds() failure handling and the final RETURN are
 * not visible).
 */
3680 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3681 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3683 # ifdef HAVE_INODE_PERMISION_2ARGS
3684 int ll_inode_permission(struct inode *inode, int mask)
3686 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3691 struct ll_sb_info *sbi;
3692 struct root_squash_info *squash;
3693 struct cred *cred = NULL;
3694 const struct cred *old_cred = NULL;
3696 bool squash_id = false;
3699 #ifdef MAY_NOT_BLOCK
3700 if (mask & MAY_NOT_BLOCK)
3702 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3703 if (flags & IPERM_FLAG_RCU)
3707 /* as root inode are NOT getting validated in lookup operation,
3708 * need to do it before permission check. */
3710 if (inode == inode->i_sb->s_root->d_inode) {
3711 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3712 MDS_INODELOCK_LOOKUP);
3717 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3718 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3720 /* squash fsuid/fsgid if needed */
3721 sbi = ll_i2sbi(inode);
3722 squash = &sbi->ll_squash;
3723 if (unlikely(squash->rsi_uid != 0 &&
3724 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
3725 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
3729 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
3730 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
3731 squash->rsi_uid, squash->rsi_gid);
3733 /* update current process's credentials
3734 * and FS capability */
3735 cred = prepare_creds();
3739 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
3740 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
3741 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
3742 if ((1 << cap) & CFS_CAP_FS_MASK)
3743 cap_lower(cred->cap_effective, cap);
3745 old_cred = override_creds(cred);
3748 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
3749 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3750 /* restore current process's credentials and FS capability */
3752 revert_creds(old_cred);
/*
 * Default file_operations (-o localflock: flock handled locally by
 * the VFS, so no .flock/.lock methods here).  Read/write entries are
 * selected by kernel compat macros: iter-based on new kernels,
 * aio-based on old ones.
 */
3759 /* -o localflock - only provides locally consistent flock locks */
3760 struct file_operations ll_file_operations = {
3761 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
3762 # ifdef HAVE_SYNC_READ_WRITE
3763 .read = new_sync_read,
3764 .write = new_sync_write,
3766 .read_iter = ll_file_read_iter,
3767 .write_iter = ll_file_write_iter,
3768 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3769 .read = ll_file_read,
3770 .aio_read = ll_file_aio_read,
3771 .write = ll_file_write,
3772 .aio_write = ll_file_aio_write,
3773 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3774 .unlocked_ioctl = ll_file_ioctl,
3775 .open = ll_file_open,
3776 .release = ll_file_release,
3777 .mmap = ll_file_mmap,
3778 .llseek = ll_file_seek,
3779 .splice_read = ll_file_splice_read,
/*
 * file_operations for the default mount mode: cluster-coherent
 * flock/posix locking, with both .flock and .lock routed through
 * ll_file_flock() (LDLM_FLOCK enqueues to the MDS).
 */
3784 struct file_operations ll_file_operations_flock = {
3785 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
3786 # ifdef HAVE_SYNC_READ_WRITE
3787 .read = new_sync_read,
3788 .write = new_sync_write,
3789 # endif /* HAVE_SYNC_READ_WRITE */
3790 .read_iter = ll_file_read_iter,
3791 .write_iter = ll_file_write_iter,
3792 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3793 .read = ll_file_read,
3794 .aio_read = ll_file_aio_read,
3795 .write = ll_file_write,
3796 .aio_write = ll_file_aio_write,
3797 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3798 .unlocked_ioctl = ll_file_ioctl,
3799 .open = ll_file_open,
3800 .release = ll_file_release,
3801 .mmap = ll_file_mmap,
3802 .llseek = ll_file_seek,
3803 .splice_read = ll_file_splice_read,
3806 .flock = ll_file_flock,
3807 .lock = ll_file_flock
/*
 * file_operations for -o noflock: identical to the default table
 * except .flock/.lock point at ll_file_noflock so flock/fcntl lock
 * calls are rejected.
 */
3810 /* These are for -o noflock - to return ENOSYS on flock calls */
3811 struct file_operations ll_file_operations_noflock = {
3812 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
3813 # ifdef HAVE_SYNC_READ_WRITE
3814 .read = new_sync_read,
3815 .write = new_sync_write,
3816 # endif /* HAVE_SYNC_READ_WRITE */
3817 .read_iter = ll_file_read_iter,
3818 .write_iter = ll_file_write_iter,
3819 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3820 .read = ll_file_read,
3821 .aio_read = ll_file_aio_read,
3822 .write = ll_file_write,
3823 .aio_write = ll_file_aio_write,
3824 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3825 .unlocked_ioctl = ll_file_ioctl,
3826 .open = ll_file_open,
3827 .release = ll_file_release,
3828 .mmap = ll_file_mmap,
3829 .llseek = ll_file_seek,
3830 .splice_read = ll_file_splice_read,
3833 .flock = ll_file_noflock,
3834 .lock = ll_file_noflock
/*
 * inode_operations for regular Lustre files: attribute get/set,
 * permission checking, the xattr family, fiemap, and (when the kernel
 * supports ->get_acl) cached POSIX ACL retrieval.
 */
3837 struct inode_operations ll_file_inode_operations = {
3838 .setattr = ll_setattr,
3839 .getattr = ll_getattr,
3840 .permission = ll_inode_permission,
3841 .setxattr = ll_setxattr,
3842 .getxattr = ll_getxattr,
3843 .listxattr = ll_listxattr,
3844 .removexattr = ll_removexattr,
3845 .fiemap = ll_fiemap,
3846 #ifdef HAVE_IOP_GET_ACL
3847 .get_acl = ll_get_acl,
/*
 * Registry for dynamically registered ioctl handlers: a global list
 * ('llioc') of llioc_data entries protected by an rwsem.  Each entry
 * holds a callback and the trailing flexible array of ioctl command
 * numbers it handles (iocd_cmd[0] is a pre-C99 flexible array).
 * NOTE(review): extraction is missing interior lines (the 'llioc'
 * variable name/initializer line and the llioc_data struct header
 * are only partially visible).
 */
3851 /* dynamic ioctl number support routins */
3852 static struct llioc_ctl_data {
3853 struct rw_semaphore ioc_sem;
3854 struct list_head ioc_head;
3856 __RWSEM_INITIALIZER(llioc.ioc_sem),
3857 LIST_HEAD_INIT(llioc.ioc_head)
3862 struct list_head iocd_list;
3863 unsigned int iocd_size;
3864 llioc_callback_t iocd_cb;
3865 unsigned int iocd_count;
3866 unsigned int iocd_cmd[0];
/*
 * ll_iocontrol_register() - register callback @cb for @count ioctl
 * command numbers in @cmd.  Allocates an llioc_data entry (copying
 * the command list into the flexible array), appends it to the global
 * llioc list under the write rwsem, and returns the entry as an
 * opaque handle for ll_iocontrol_unregister().
 * NOTE(review): extraction is missing interior lines (the NULL
 * returns on bad args / alloc failure and the final return of the
 * handle are not visible).
 */
3869 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3872 struct llioc_data *in_data = NULL;
3875 if (cb == NULL || cmd == NULL ||
3876 count > LLIOC_MAX_CMD || count < 0)
3879 size = sizeof(*in_data) + count * sizeof(unsigned int);
3880 OBD_ALLOC(in_data, size);
3881 if (in_data == NULL)
3884 memset(in_data, 0, sizeof(*in_data));
3885 in_data->iocd_size = size;
3886 in_data->iocd_cb = cb;
3887 in_data->iocd_count = count;
3888 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3890 down_write(&llioc.ioc_sem);
3891 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3892 up_write(&llioc.ioc_sem);
/*
 * ll_iocontrol_unregister() - remove and free the registration whose
 * handle is @magic (the pointer returned by ll_iocontrol_register()).
 * Searches the global list under the write rwsem; warns if the handle
 * is not found.
 * NOTE(review): extraction is missing interior lines (the magic
 * comparison and returns inside the loop are not visible).
 */
3897 void ll_iocontrol_unregister(void *magic)
3899 struct llioc_data *tmp;
3904 down_write(&llioc.ioc_sem);
3905 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3907 unsigned int size = tmp->iocd_size;
3909 list_del(&tmp->iocd_list);
3910 up_write(&llioc.ioc_sem);
3912 OBD_FREE(tmp, size);
3916 up_write(&llioc.ioc_sem);
3918 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3921 EXPORT_SYMBOL(ll_iocontrol_register);
3922 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * ll_iocontrol_call() - dispatch @cmd to any dynamically registered
 * ioctl handler.  Walks the registry under the read rwsem; the first
 * entry listing @cmd gets its callback invoked, and iteration stops
 * when a callback returns LLIOC_STOP.  The callback's rc is passed
 * back through *rcp; the iteration verdict is the return value.
 * NOTE(review): extraction is missing interior lines (the *rcp
 * assignment and final return are not visible).
 */
3924 static enum llioc_iter
3925 ll_iocontrol_call(struct inode *inode, struct file *file,
3926 unsigned int cmd, unsigned long arg, int *rcp)
3928 enum llioc_iter ret = LLIOC_CONT;
3929 struct llioc_data *data;
3930 int rc = -EINVAL, i;
3932 down_read(&llioc.ioc_sem);
3933 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3934 for (i = 0; i < data->iocd_count; i++) {
3935 if (cmd != data->iocd_cmd[i])
3938 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3942 if (ret == LLIOC_STOP)
3945 up_read(&llioc.ioc_sem);
/*
 * ll_layout_conf() - apply a layout configuration to the inode's
 * cl_object via cl_conf_set().  For OBJECT_CONF_SET the layout comes
 * with a DLM layout lock: matching on that lock is enabled only after
 * the layout is applied (so a stale layout is never observed), and
 * the visible code then reads back the layout generation and records
 * it with ll_layout_version_set().
 * NOTE(review): extraction is missing interior lines (the cl struct
 * initializer, rc checks and the final RETURN are not visible).
 */
3952 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3954 struct ll_inode_info *lli = ll_i2info(inode);
3955 struct cl_object *obj = lli->lli_clob;
3964 env = cl_env_get(&refcheck);
3966 RETURN(PTR_ERR(env));
3968 rc = cl_conf_set(env, lli->lli_clob, conf);
3972 if (conf->coc_opc == OBJECT_CONF_SET) {
3973 struct ldlm_lock *lock = conf->coc_lock;
3974 struct cl_layout cl = {
3978 LASSERT(lock != NULL);
3979 LASSERT(ldlm_has_layout(lock));
3981 /* it can only be allowed to match after layout is
3982 * applied to inode otherwise false layout would be
3983 * seen. Applying layout shoud happen before dropping
3984 * the intent lock. */
3985 ldlm_lock_allow_match(lock);
3987 rc = cl_object_layout_get(env, obj, &cl);
3992 DFID": layout version change: %u -> %u\n",
3993 PFID(&lli->lli_fid), ll_layout_version_get(lli),
3995 ll_layout_version_set(lli, cl.cl_layout_gen);
3999 cl_env_put(env, &refcheck);
/*
 * ll_layout_fetch() - populate the layout lock's LVB when the layout
 * was not delivered with the lock grant (lock granted via completion
 * AST, whose LVB buffer may be too small).  Fetches the LOV xattr
 * from the MDT with md_getxattr(), copies it into a freshly allocated
 * buffer and installs it as l_lvb_data/l_lvb_len under the resource
 * lock — unless another thread installed one first, in which case the
 * local copy is freed.
 * NOTE(review): extraction is missing interior lines (early returns
 * for lvb-already-present/empty layout, some rc checks and the final
 * RETURN are not visible).
 */
4004 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
4005 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4008 struct ll_sb_info *sbi = ll_i2sbi(inode);
4009 struct ptlrpc_request *req;
4010 struct mdt_body *body;
4017 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4018 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4019 lock->l_lvb_data, lock->l_lvb_len);
4021 if (lock->l_lvb_data != NULL)
4024 /* if layout lock was granted right away, the layout is returned
4025 * within DLM_LVB of dlm reply; otherwise if the lock was ever
4026 * blocked and then granted via completion ast, we have to fetch
4027 * layout here. Please note that we can't use the LVB buffer in
4028 * completion AST because it doesn't have a large enough buffer */
4029 rc = ll_get_default_mdsize(sbi, &lmmsize);
4031 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4032 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
4037 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4039 GOTO(out, rc = -EPROTO);
4041 lmmsize = body->mbo_eadatasize;
4042 if (lmmsize == 0) /* empty layout */
4045 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4047 GOTO(out, rc = -EFAULT);
4049 OBD_ALLOC_LARGE(lvbdata, lmmsize);
4050 if (lvbdata == NULL)
4051 GOTO(out, rc = -ENOMEM);
4053 memcpy(lvbdata, lmm, lmmsize);
4054 lock_res_and_lock(lock);
4055 if (unlikely(lock->l_lvb_data == NULL)) {
4056 lock->l_lvb_type = LVB_T_LAYOUT;
4057 lock->l_lvb_data = lvbdata;
4058 lock->l_lvb_len = lmmsize;
4061 unlock_res_and_lock(lock);
4064 OBD_FREE_LARGE(lvbdata, lmmsize);
4069 ptlrpc_req_finished(req);
4074 /*
 * Apply the layout to the inode. Layout lock is held and will be released
 * before this function returns (decref via @lockh/@mode).
 *
 * Fetches the layout into the lock's LVB if needed (ll_layout_fetch),
 * builds an OBJECT_CONF_SET configuration from the LVB and applies it via
 * ll_layout_conf().  If the cl stack reports -EBUSY (layout still in use
 * by in-flight IO), issues an OBJECT_CONF_WAIT after dropping the lock.
 *
 * NOTE(review): several lines are missing from this view (early-exit when
 * lvb_ready, GOTO/out labels, the final RETURN), so the exact unwind
 * ordering cannot be fully confirmed here.
 */
4077 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4078 struct inode *inode)
4080 struct ll_inode_info *lli = ll_i2info(inode);
4081 struct ll_sb_info *sbi = ll_i2sbi(inode);
4082 struct ldlm_lock *lock;
4083 struct cl_object_conf conf;
4086 bool wait_layout = false;
4089 LASSERT(lustre_handle_is_used(lockh));
4091 lock = ldlm_handle2lock(lockh);
4092 LASSERT(lock != NULL);
4093 LASSERT(ldlm_has_layout(lock));
4095 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4096 PFID(&lli->lli_fid), inode);
4098 /* in case this is a caching lock and reinstate with new inode */
4099 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
4101 lock_res_and_lock(lock);
4102 lvb_ready = ldlm_is_lvb_ready(lock);
4103 unlock_res_and_lock(lock);
4104 /* checking lvb_ready is racy but this is okay. The worst case is
4105 * that multi processes may configure the file on the same time. */
/* make sure the layout is in the lock's LVB before applying it */
4110 rc = ll_layout_fetch(inode, lock);
4114 /* for layout lock, lmm is stored in lock's lvb.
4115 * lvb_data is immutable if the lock is held so it's safe to access it
4118 * set layout to file. Unlikely this will fail as old layout was
4119 * surely eliminated */
4120 memset(&conf, 0, sizeof conf);
4121 conf.coc_opc = OBJECT_CONF_SET;
4122 conf.coc_inode = inode;
4123 conf.coc_lock = lock;
4124 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
4125 conf.u.coc_layout.lb_len = lock->l_lvb_len;
4126 rc = ll_layout_conf(inode, &conf);
4128 /* refresh layout failed, need to wait */
4129 wait_layout = rc == -EBUSY;
/* drop our reference and the dlm lock before possibly waiting */
4133 LDLM_LOCK_PUT(lock);
4134 ldlm_lock_decref(lockh, mode);
4136 /* wait for IO to complete if it's still being used. */
4138 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
4139 ll_get_fsname(inode->i_sb, NULL, 0),
4140 PFID(&lli->lli_fid), inode);
4142 memset(&conf, 0, sizeof conf);
4143 conf.coc_opc = OBJECT_CONF_WAIT;
4144 conf.coc_inode = inode;
4145 rc = ll_layout_conf(inode, &conf);
4149 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4150 ll_get_fsname(inode->i_sb, NULL, 0),
4151 PFID(&lli->lli_fid), rc);
/*
 * Acquire a layout lock and apply its layout to the inode; caller must
 * hold lli->lli_layout_mutex (serializes enqueues for this inode).
 *
 * First tries to match an already-cached MDS_INODELOCK_LAYOUT lock in any
 * of CR/CW/PR/PW modes; on a miss, enqueues an IT_LAYOUT intent to the MDT
 * via md_enqueue() and then applies the resulting lock with
 * ll_layout_lock_set().
 *
 * NOTE(review): lines are missing from this view (the "again:" retry label
 * implied by "requeue", parts of the einfo initializer, error handling
 * after md_enqueue, the final RETURN), so retry behavior on enqueue
 * failure is not confirmable here.
 */
4156 static int ll_layout_refresh_locked(struct inode *inode)
4158 struct ll_inode_info *lli = ll_i2info(inode);
4159 struct ll_sb_info *sbi = ll_i2sbi(inode);
4160 struct md_op_data *op_data;
4161 struct lookup_intent it;
4162 struct lustre_handle lockh;
4163 enum ldlm_mode mode;
4164 struct ldlm_enqueue_info einfo = {
4165 .ei_type = LDLM_IBITS,
4167 .ei_cb_bl = &ll_md_blocking_ast,
4168 .ei_cb_cp = &ldlm_completion_ast,
4174 /* mostly layout lock is caching on the local side, so try to match
4175 * it before grabbing layout lock mutex. */
4176 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4177 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
4178 if (mode != 0) { /* hit cached lock */
4179 rc = ll_layout_lock_set(&lockh, mode, inode);
4186 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4187 0, 0, LUSTRE_OPC_ANY, NULL);
4188 if (IS_ERR(op_data))
4189 RETURN(PTR_ERR(op_data));
4191 /* have to enqueue one */
4192 memset(&it, 0, sizeof(it));
4193 it.it_op = IT_LAYOUT;
4194 lockh.cookie = 0ULL;
4196 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4197 ll_get_fsname(inode->i_sb, NULL, 0),
4198 PFID(&lli->lli_fid), inode);
4200 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, &it, op_data, &lockh, 0);
/* the intent reply request is no longer needed once the lock is ours */
4201 if (it.it_request != NULL)
4202 ptlrpc_req_finished(it.it_request);
4203 it.it_request = NULL;
4205 ll_finish_md_op_data(op_data);
/* take lock-mode ownership out of the intent so drop won't decref it */
4207 mode = it.it_lock_mode;
4208 it.it_lock_mode = 0;
4209 ll_intent_drop_lock(&it);
4212 /* set lock data in case this is a new lock */
4213 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4214 rc = ll_layout_lock_set(&lockh, mode, inode);
/**
4223 * This function checks if there exists a LAYOUT lock on the client side,
4224 * or enqueues it if it doesn't have one in cache.
 *
4226 * This function will not hold layout lock so it may be revoked any time after
4227 * this function returns. Any operations depend on layout should be redone
 * in that case.
 *
4230 * This function should be called before lov_io_init() to get an uptodate
4231 * layout version, the caller should save the version number and after IO
4232 * is finished, this function should be called again to verify that layout
4233 * is not changed during IO time.
 *
 * \param[in]  inode  regular file whose layout is refreshed
 * \param[out] gen    current layout generation after the refresh
 * \retval 0 on success (also when layout locking is disabled or the
 *           cached generation is already valid); negative errno otherwise
 */
4235 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4237 struct ll_inode_info *lli = ll_i2info(inode);
4238 struct ll_sb_info *sbi = ll_i2sbi(inode);
4242 *gen = ll_layout_version_get(lli);
/* fast path: layout lock disabled by mount flag, or gen already known */
4243 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
4247 LASSERT(fid_is_sane(ll_inode2fid(inode)));
4248 LASSERT(S_ISREG(inode->i_mode));
4250 /* take layout lock mutex to enqueue layout lock exclusively. */
4251 mutex_lock(&lli->lli_layout_mutex);
4253 rc = ll_layout_refresh_locked(inode);
/* re-read: the refresh above may have installed a new generation */
4257 *gen = ll_layout_version_get(lli);
4259 mutex_unlock(&lli->lli_layout_mutex);
4265 * This function send a restore request to the MDT
4267 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4269 struct hsm_user_request *hur;
4273 len = sizeof(struct hsm_user_request) +
4274 sizeof(struct hsm_user_item);
4275 OBD_ALLOC(hur, len);
4279 hur->hur_request.hr_action = HUA_RESTORE;
4280 hur->hur_request.hr_archive_id = 0;
4281 hur->hur_request.hr_flags = 0;
4282 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4283 sizeof(hur->hur_user_item[0].hui_fid));
4284 hur->hur_user_item[0].hui_extent.offset = offset;
4285 hur->hur_user_item[0].hui_extent.length = length;
4286 hur->hur_request.hr_itemcount = 1;
4287 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,