4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2015, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <linux/pagemap.h>
46 #include <linux/file.h>
47 #include <linux/sched.h>
48 #include <linux/user_namespace.h>
49 #ifdef HAVE_UIDGID_HEADER
50 # include <linux/uidgid.h>
52 #include <lustre/ll_fiemap.h>
54 #include <lustre_ioctl.h>
55 #include <lustre_swab.h>
57 #include "cl_object.h"
58 #include "llite_internal.h"
59 #include "vvp_internal.h"
62 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
64 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
67 static enum llioc_iter
68 ll_iocontrol_call(struct inode *inode, struct file *file,
69 unsigned int cmd, unsigned long arg, int *rcp);
/* Allocate a per-open-file ll_file_data from ll_file_data_slab (GFP_NOFS to
 * avoid fs recursion during reclaim).
 * NOTE(review): extraction elided lines here (numbering jumps 75->79); the
 * NULL check after allocation and the RETURN are presumably among them. */
71 static struct ll_file_data *ll_file_data_get(void)
73 struct ll_file_data *fd;
75 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
/* Fresh descriptor starts with no recorded write failure. */
79 fd->fd_write_failed = false;
/* Return a ll_file_data to its slab cache; pairs with ll_file_data_get(). */
84 static void ll_file_data_put(struct ll_file_data *fd)
87 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
91 * Packs all the attributes into @op_data for the CLOSE rpc.
/* NOTE(review): extraction elided lines (numbering jumps); kept byte-identical,
 * comments only added. */
93 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
94 struct obd_client_handle *och)
98 ll_prep_md_op_data(op_data, inode, NULL, NULL,
99 0, 0, LUSTRE_OPC_ANY, NULL);
/* Snapshot the client's current view of the inode attributes so the MDT
 * gets them on close. */
101 op_data->op_attr.ia_mode = inode->i_mode;
102 op_data->op_attr.ia_atime = inode->i_atime;
103 op_data->op_attr.ia_mtime = inode->i_mtime;
104 op_data->op_attr.ia_ctime = inode->i_ctime;
105 op_data->op_attr.ia_size = i_size_read(inode);
106 op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
107 ATTR_MTIME | ATTR_MTIME_SET |
108 ATTR_CTIME | ATTR_CTIME_SET;
109 op_data->op_attr_blocks = inode->i_blocks;
110 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
/* Identify which open handle this CLOSE applies to. */
111 op_data->op_handle = och->och_fh;
113 if (och->och_flags & FMODE_WRITE &&
114 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
115 /* For HSM: if inode data has been modified, pack it so that
116 * MDT can set data dirty flag in the archive. */
117 op_data->op_bias |= MDS_DATA_MODIFIED;
123 * Perform a close, possibly with a bias.
124 * The meaning of "data" depends on the value of "bias".
126 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
127 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
/* NOTE(review): extraction elided lines (e.g. the switch(bias) statement and
 * several GOTO/label lines); code kept byte-identical, comments only added. */
130 static int ll_close_inode_openhandle(struct inode *inode,
131 struct obd_client_handle *och,
132 enum mds_op_bias bias, void *data)
134 struct obd_export *md_exp = ll_i2mdexp(inode);
135 const struct ll_inode_info *lli = ll_i2info(inode);
136 struct md_op_data *op_data;
137 struct ptlrpc_request *req = NULL;
/* Guard against a dead/invalid MDC export before attempting the RPC. */
141 if (class_exp2obd(md_exp) == NULL) {
142 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
143 ll_get_fsname(inode->i_sb, NULL, 0),
144 PFID(&lli->lli_fid));
148 OBD_ALLOC_PTR(op_data);
149 /* We leak openhandle and request here on error, but not much to be
150 * done in OOM case since app won't retry close on error either. */
152 GOTO(out, rc = -ENOMEM);
154 ll_prepare_close(inode, op_data, och);
/* Bias-specific packing: the meaning of @data depends on @bias (see the
 * function comment above). */
156 case MDS_CLOSE_LAYOUT_SWAP:
157 LASSERT(data != NULL);
158 op_data->op_bias |= MDS_CLOSE_LAYOUT_SWAP;
159 op_data->op_data_version = 0;
160 op_data->op_lease_handle = och->och_lease_handle;
161 op_data->op_fid2 = *ll_inode2fid(data);
164 case MDS_HSM_RELEASE:
165 LASSERT(data != NULL);
166 op_data->op_bias |= MDS_HSM_RELEASE;
167 op_data->op_data_version = *(__u64 *)data;
168 op_data->op_lease_handle = och->och_lease_handle;
169 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
173 LASSERT(data == NULL);
177 rc = md_close(md_exp, op_data, och->och_mod, &req);
/* -EINTR close failures are expected (signal); don't spam the console. */
178 if (rc != 0 && rc != -EINTR)
179 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
180 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
/* For intent closes (HSM release / layout swap) verify the server actually
 * executed the intent via the reply body flag. */
183 op_data->op_bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)) {
184 struct mdt_body *body;
186 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
187 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
191 ll_finish_md_op_data(op_data);
/* Handle is dead from now on; poison it so stale users trip an assert. */
195 md_clear_open_replay_data(md_exp, och);
196 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
199 ptlrpc_req_finished(req); /* This is close request */
/* Close the MDS open handle of the given mode class (write/exec/read) if this
 * was the last user of it.
 * NOTE(review): extraction elided lines (the *och_p fetch/clear under the
 * mutex is not visible); code kept byte-identical, comments only added. */
203 int ll_md_real_close(struct inode *inode, fmode_t fmode)
205 struct ll_inode_info *lli = ll_i2info(inode);
206 struct obd_client_handle **och_p;
207 struct obd_client_handle *och;
/* Pick the handle slot and use counter matching the open mode. */
212 if (fmode & FMODE_WRITE) {
213 och_p = &lli->lli_mds_write_och;
214 och_usecount = &lli->lli_open_fd_write_count;
215 } else if (fmode & FMODE_EXEC) {
216 och_p = &lli->lli_mds_exec_och;
217 och_usecount = &lli->lli_open_fd_exec_count;
219 LASSERT(fmode & FMODE_READ);
220 och_p = &lli->lli_mds_read_och;
221 och_usecount = &lli->lli_open_fd_read_count;
224 mutex_lock(&lli->lli_och_mutex);
225 if (*och_usecount > 0) {
226 /* There are still users of this handle, so skip
228 mutex_unlock(&lli->lli_och_mutex);
234 mutex_unlock(&lli->lli_och_mutex);
237 /* There might be a race and this handle may already
239 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/* Per-struct-file close: drop group lock / lease / extra open handle held by
 * this file descriptor, decrement the mode use count, and close the MDS open
 * handle unless a cached OPEN lock lets us skip the RPC.
 * NOTE(review): extraction elided lines (numbering jumps); code kept
 * byte-identical, comments only added. */
245 static int ll_md_close(struct inode *inode, struct file *file)
247 union ldlm_policy_data policy = {
248 .l_inodebits = { MDS_INODELOCK_OPEN },
/* TEST_LOCK: only probe for a matching granted OPEN lock, don't take a ref. */
250 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
251 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
252 struct ll_inode_info *lli = ll_i2info(inode);
253 struct lustre_handle lockh;
254 enum ldlm_mode lockmode;
258 /* clear group lock, if present */
259 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
260 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
262 if (fd->fd_lease_och != NULL) {
265 /* Usually the lease is not released when the
266 * application crashed, we need to release here. */
267 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
268 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
269 PFID(&lli->lli_fid), rc, lease_broken);
271 fd->fd_lease_och = NULL;
/* fd_och holds ownership of an MDS open handle taken for a lease; close it. */
274 if (fd->fd_och != NULL) {
275 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
280 /* Let's see if we have good enough OPEN lock on the file and if
281 we can skip talking to MDS */
282 mutex_lock(&lli->lli_och_mutex);
283 if (fd->fd_omode & FMODE_WRITE) {
285 LASSERT(lli->lli_open_fd_write_count);
286 lli->lli_open_fd_write_count--;
287 } else if (fd->fd_omode & FMODE_EXEC) {
289 LASSERT(lli->lli_open_fd_exec_count);
290 lli->lli_open_fd_exec_count--;
293 LASSERT(lli->lli_open_fd_read_count);
294 lli->lli_open_fd_read_count--;
296 mutex_unlock(&lli->lli_och_mutex);
/* No cached OPEN ibits lock -> must do a real close RPC to the MDS. */
298 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
299 LDLM_IBITS, &policy, lockmode, &lockh))
300 rc = ll_md_real_close(inode, fd->fd_omode);
303 LUSTRE_FPRIVATE(file) = NULL;
304 ll_file_data_put(fd);
309 /* While this returns an error code, fput() the caller does not, so we need
310 * to make every effort to clean up all of our state here. Also, applications
311 * rarely check close errors and even if an error is returned they will not
312 * re-try the close call.
/* VFS ->release() entry point for regular files and directories.
 * NOTE(review): extraction elided lines (e.g. the fd NULL check and RETURNs);
 * code kept byte-identical, comments only added. */
314 int ll_file_release(struct inode *inode, struct file *file)
316 struct ll_file_data *fd;
317 struct ll_sb_info *sbi = ll_i2sbi(inode);
318 struct ll_inode_info *lli = ll_i2info(inode);
322 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
323 PFID(ll_inode2fid(inode)), inode);
/* Don't account a RELEASE op for the filesystem root itself. */
325 if (inode->i_sb->s_root != file_dentry(file))
326 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
327 fd = LUSTRE_FPRIVATE(file);
330 /* The last ref on @file, maybe not the the owner pid of statahead,
331 * because parent and child process can share the same file handle. */
332 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
333 ll_deauthorize_statahead(inode, fd);
/* Root has no MDS open handle to close; just free the private data. */
335 if (inode->i_sb->s_root == file_dentry(file)) {
336 LUSTRE_FPRIVATE(file) = NULL;
337 ll_file_data_put(fd);
/* For regular files, fold any deferred async write errors into lli so they
 * can be reported from this close. */
341 if (!S_ISDIR(inode->i_mode)) {
342 if (lli->lli_clob != NULL)
343 lov_read_and_clear_async_rc(lli->lli_clob);
344 lli->lli_async_rc = 0;
347 rc = ll_md_close(inode, file);
/* Fault-injection hook: optionally dump the debug log on close. */
349 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
350 libcfs_debug_dumplog();
/* Send an IT_OPEN intent lock request to the MDS for @file, filling @itp with
 * the open disposition/lock and updating the dentry's inode from the reply.
 * NOTE(review): extraction elided lines (numbering jumps); code kept
 * byte-identical, comments only added. */
355 static int ll_intent_file_open(struct file *file, void *lmm, int lmmsize,
356 struct lookup_intent *itp)
358 struct dentry *de = file_dentry(file);
359 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
360 struct dentry *parent = de->d_parent;
361 const char *name = NULL;
363 struct md_op_data *op_data;
364 struct ptlrpc_request *req = NULL;
368 LASSERT(parent != NULL);
369 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
371 /* if server supports open-by-fid, or file name is invalid, don't pack
372 * name in open request */
373 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
374 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
375 name = de->d_name.name;
376 len = de->d_name.len;
379 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
380 name, len, 0, LUSTRE_OPC_ANY, NULL);
382 RETURN(PTR_ERR(op_data));
/* Pass the (optional) striping layout through to md_intent_lock(). */
383 op_data->op_data = lmm;
384 op_data->op_data_size = lmmsize;
386 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
387 &ll_md_blocking_ast, 0);
388 ll_finish_md_op_data(op_data);
390 /* reason for keep own exit path - don`t flood log
391 * with messages with -ESTALE errors.
/* If the MDS granted an open we can't use, release the handle so it isn't
 * leaked on the server. */
393 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
394 it_open_error(DISP_OPEN_OPEN, itp))
396 ll_release_openhandle(de, itp);
400 if (it_disposition(itp, DISP_LOOKUP_NEG))
401 GOTO(out, rc = -ENOENT);
403 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
404 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
405 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* Refresh the inode from the intent reply and attach the granted lock. */
409 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
410 if (!rc && itp->it_lock_mode)
411 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
414 ptlrpc_req_finished(req);
415 ll_intent_drop_lock(itp);
/* Populate an obd_client_handle from the MDT_BODY in the intent reply and
 * register it for open replay (so the open survives MDS recovery). */
420 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
421 struct obd_client_handle *och)
423 struct mdt_body *body;
425 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
426 och->och_fh = body->mbo_handle;
427 och->och_fid = body->mbo_fid1;
/* Remember the lock handle: reused as the lease handle for lease opens. */
428 och->och_lease_handle.cookie = it->it_lock_handle;
429 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
430 och->och_flags = it->it_flags;
432 return md_set_open_replay_data(md_exp, och, it);
/* Finish a local open: optionally fill @och from the intent reply, then
 * install @fd as the file's private data and initialize its readahead,
 * open-mode and cl_context state.
 * NOTE(review): numbering jump 441->448 suggests the "och != NULL" guard
 * around ll_och_fill() was elided by extraction. */
435 static int ll_local_open(struct file *file, struct lookup_intent *it,
436 struct ll_file_data *fd, struct obd_client_handle *och)
438 struct inode *inode = file_inode(file);
441 LASSERT(!LUSTRE_FPRIVATE(file));
448 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
453 LUSTRE_FPRIVATE(file) = fd;
454 ll_readahead_init(inode, &fd->fd_ras);
455 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
457 /* ll_cl_context initialize */
458 rwlock_init(&fd->fd_lock);
459 INIT_LIST_HEAD(&fd->fd_lccs);
464 /* Open a file, and (for the very first open) create objects on the OSTs at
465 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
466 * creation or open until ll_lov_setstripe() ioctl is called.
468 * If we already have the stripe MD locally then we don't request it in
469 * md_open(), by passing a lmm_size = 0.
471 * It is up to the application to ensure no other processes open this file
472 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
473 * used. We might be able to avoid races of that sort by getting lli_open_sem
474 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
475 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/* VFS ->open() entry point.
 * NOTE(review): extraction elided many lines (numbering jumps throughout);
 * code kept byte-identical, comments only added. */
477 int ll_file_open(struct inode *inode, struct file *file)
479 struct ll_inode_info *lli = ll_i2info(inode);
480 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
481 .it_flags = file->f_flags };
482 struct obd_client_handle **och_p = NULL;
483 __u64 *och_usecount = NULL;
484 struct ll_file_data *fd;
488 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
489 PFID(ll_inode2fid(inode)), inode, file->f_flags);
/* An intent may have been stashed by the lookup path (atomic_open). */
491 it = file->private_data; /* XXX: compat macro */
492 file->private_data = NULL; /* prevent ll_local_open assertion */
494 fd = ll_file_data_get();
496 GOTO(out_openerr, rc = -ENOMEM);
499 if (S_ISDIR(inode->i_mode))
500 ll_authorize_statahead(inode, fd);
/* Root is special: no MDS open handle, just attach the private data. */
502 if (inode->i_sb->s_root == file_dentry(file)) {
503 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent -> build our own IT_OPEN intent from f_flags. */
507 if (!it || !it->it_disposition) {
508 /* Convert f_flags into access mode. We cannot use file->f_mode,
509 * because everything but O_ACCMODE mask was stripped from
/* O_ACCMODE trick: (flags + 1) maps O_RDONLY/O_WRONLY/O_RDWR to
 * FMODE_READ/FMODE_WRITE bits. */
511 if ((oit.it_flags + 1) & O_ACCMODE)
513 if (file->f_flags & O_TRUNC)
514 oit.it_flags |= FMODE_WRITE;
516 /* kernel only call f_op->open in dentry_open. filp_open calls
517 * dentry_open after call to open_namei that checks permissions.
518 * Only nfsd_open call dentry_open directly without checking
519 * permissions and because of that this code below is safe. */
520 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
521 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
523 /* We do not want O_EXCL here, presumably we opened the file
524 * already? XXX - NFS implications? */
525 oit.it_flags &= ~O_EXCL;
527 /* bug20584, if "it_flags" contains O_CREAT, the file will be
528 * created if necessary, then "IT_CREAT" should be set to keep
529 * consistent with it */
530 if (oit.it_flags & O_CREAT)
531 oit.it_op |= IT_CREAT;
537 /* Let's see if we have file open on MDS already. */
538 if (it->it_flags & FMODE_WRITE) {
539 och_p = &lli->lli_mds_write_och;
540 och_usecount = &lli->lli_open_fd_write_count;
541 } else if (it->it_flags & FMODE_EXEC) {
542 och_p = &lli->lli_mds_exec_och;
543 och_usecount = &lli->lli_open_fd_exec_count;
545 och_p = &lli->lli_mds_read_och;
546 och_usecount = &lli->lli_open_fd_read_count;
549 mutex_lock(&lli->lli_och_mutex);
550 if (*och_p) { /* Open handle is present */
551 if (it_disposition(it, DISP_OPEN_OPEN)) {
552 /* Well, there's extra open request that we do not need,
553 let's close it somehow. This will decref request. */
554 rc = it_open_error(DISP_OPEN_OPEN, it);
556 mutex_unlock(&lli->lli_och_mutex);
557 GOTO(out_openerr, rc);
560 ll_release_openhandle(file_dentry(file), it);
/* Reuse the cached handle for this open; no new MDS open needed. */
564 rc = ll_local_open(file, it, fd, NULL);
567 mutex_unlock(&lli->lli_och_mutex);
568 GOTO(out_openerr, rc);
571 LASSERT(*och_usecount == 0);
572 if (!it->it_disposition) {
573 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
574 /* We cannot just request lock handle now, new ELC code
575 means that one of other OPEN locks for this file
576 could be cancelled, and since blocking ast handler
577 would attempt to grab och_mutex as well, that would
578 result in a deadlock */
579 mutex_unlock(&lli->lli_och_mutex);
581 * Normally called under two situations:
583 * 2. A race/condition on MDS resulting in no open
584 * handle to be returned from LOOKUP|OPEN request,
585 * for example if the target entry was a symlink.
587 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
588 * marked by a bit set in ll_iget_for_nfs. Clear the
589 * bit so that it's not confusing later callers.
591 * NB; when ldd is NULL, it must have come via normal
592 * lookup path only, since ll_iget_for_nfs always calls
595 if (ldd && ldd->lld_nfs_dentry) {
596 ldd->lld_nfs_dentry = 0;
597 it->it_flags |= MDS_OPEN_LOCK;
601 * Always specify MDS_OPEN_BY_FID because we don't want
602 * to get file with different fid.
604 it->it_flags |= MDS_OPEN_BY_FID;
605 rc = ll_intent_file_open(file, NULL, 0, it);
607 GOTO(out_openerr, rc);
611 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
613 GOTO(out_och_free, rc = -ENOMEM);
617 /* md_intent_lock() didn't get a request ref if there was an
618 * open error, so don't do cleanup on the request here
620 /* XXX (green): Should not we bail out on any error here, not
621 * just open error? */
622 rc = it_open_error(DISP_OPEN_OPEN, it);
624 GOTO(out_och_free, rc);
626 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
627 "inode %p: disposition %x, status %d\n", inode,
628 it_disposition(it, ~0), it->it_status);
630 rc = ll_local_open(file, it, fd, *och_p);
632 GOTO(out_och_free, rc);
634 mutex_unlock(&lli->lli_och_mutex);
637 /* Must do this outside lli_och_mutex lock to prevent deadlock where
638 different kind of OPEN lock for this same inode gets cancelled
639 by ldlm_cancel_lru */
640 if (!S_ISREG(inode->i_mode))
641 GOTO(out_och_free, rc);
643 cl_lov_delay_create_clear(&file->f_flags);
644 GOTO(out_och_free, rc);
/* Error/cleanup path: free an unused handle and undo statahead/fd state. */
648 if (och_p && *och_p) {
649 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
650 *och_p = NULL; /* OBD_FREE writes some magic there */
653 mutex_unlock(&lli->lli_och_mutex);
656 if (lli->lli_opendir_key == fd)
657 ll_deauthorize_statahead(inode, fd);
659 ll_file_data_put(fd);
661 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
664 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
665 ptlrpc_req_finished(it->it_request);
666 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* Blocking AST for lease locks: on conflict, cancel the lease lock
 * asynchronously (no openhandle cleanup here — see ll_lease_open()). */
672 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
673 struct ldlm_lock_desc *desc, void *data, int flag)
676 struct lustre_handle lockh;
680 case LDLM_CB_BLOCKING:
681 ldlm_lock2handle(lock, &lockh);
682 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
684 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
/* LDLM_CB_CANCELING: nothing to do for a lease lock (elided by extraction). */
688 case LDLM_CB_CANCELING:
696 * When setting a lease on a file, we take ownership of the lli_mds_*_och
697 * and save it as fd->fd_och so as to force client to reopen the file even
698 * if it has an open lock in cache already.
/* NOTE(review): extraction elided lines (the transfer of *och_p into
 * fd->fd_och around original lines 728-733 is not visible); code kept
 * byte-identical, comments only added. */
700 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
701 struct lustre_handle *old_handle)
703 struct ll_inode_info *lli = ll_i2info(inode);
704 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
705 struct obd_client_handle **och_p;
710 /* Get the openhandle of the file */
711 mutex_lock(&lli->lli_och_mutex);
/* Only one lease per file descriptor at a time. */
712 if (fd->fd_lease_och != NULL)
713 GOTO(out_unlock, rc = -EBUSY);
715 if (fd->fd_och == NULL) {
716 if (file->f_mode & FMODE_WRITE) {
717 LASSERT(lli->lli_mds_write_och != NULL);
718 och_p = &lli->lli_mds_write_och;
719 och_usecount = &lli->lli_open_fd_write_count;
721 LASSERT(lli->lli_mds_read_och != NULL);
722 och_p = &lli->lli_mds_read_och;
723 och_usecount = &lli->lli_open_fd_read_count;
/* Can't steal the shared handle while other opens still use it. */
726 if (*och_usecount > 1)
727 GOTO(out_unlock, rc = -EBUSY);
/* Report the existing open handle so the MDT can match the lease owner. */
734 *old_handle = fd->fd_och->och_fh;
738 mutex_unlock(&lli->lli_och_mutex);
743 * Release ownership on lli_mds_*_och when putting back a file lease.
/* NOTE(review): extraction elided lines (the branch that re-installs
 * fd->fd_och into *och_p when no other open exists); code kept
 * byte-identical, comments only added. */
745 static int ll_lease_och_release(struct inode *inode, struct file *file)
747 struct ll_inode_info *lli = ll_i2info(inode);
748 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
749 struct obd_client_handle **och_p;
750 struct obd_client_handle *old_och = NULL;
755 mutex_lock(&lli->lli_och_mutex);
756 if (file->f_mode & FMODE_WRITE) {
757 och_p = &lli->lli_mds_write_och;
758 och_usecount = &lli->lli_open_fd_write_count;
760 och_p = &lli->lli_mds_read_och;
761 och_usecount = &lli->lli_open_fd_read_count;
764 /* The file may have been open by another process (broken lease) so
765 * *och_p is not NULL. In this case we should simply increase usecount
768 if (*och_p != NULL) {
769 old_och = fd->fd_och;
776 mutex_unlock(&lli->lli_och_mutex);
/* Close the superseded handle outside the mutex (RPC may block). */
779 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
785 * Acquire a lease and open the file.
/* Returns the lease obd_client_handle on success or an ERR_PTR.
 * NOTE(review): extraction elided lines (och allocation, several rc checks,
 * RETURN statements); code kept byte-identical, comments only added. */
787 static struct obd_client_handle *
788 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
791 struct lookup_intent it = { .it_op = IT_OPEN };
792 struct ll_sb_info *sbi = ll_i2sbi(inode);
793 struct md_op_data *op_data;
794 struct ptlrpc_request *req = NULL;
795 struct lustre_handle old_handle = { 0 };
796 struct obd_client_handle *och = NULL;
/* Leases are exactly read or write, never both or exec. */
801 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
802 RETURN(ERR_PTR(-EINVAL));
805 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
806 RETURN(ERR_PTR(-EPERM));
808 rc = ll_lease_och_acquire(inode, file, &old_handle);
815 RETURN(ERR_PTR(-ENOMEM));
817 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
818 LUSTRE_OPC_ANY, NULL);
820 GOTO(out, rc = PTR_ERR(op_data));
822 /* To tell the MDT this openhandle is from the same owner */
823 op_data->op_handle = old_handle;
825 it.it_flags = fmode | open_flags;
826 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
827 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
828 &ll_md_blocking_lease_ast,
829 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
830 * it can be cancelled which may mislead applications that the lease is
832 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
833 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
834 * doesn't deal with openhandle, so normal openhandle will be leaked. */
835 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
836 ll_finish_md_op_data(op_data);
837 ptlrpc_req_finished(req);
839 GOTO(out_release_it, rc);
841 if (it_disposition(&it, DISP_LOOKUP_NEG))
842 GOTO(out_release_it, rc = -ENOENT);
844 rc = it_open_error(DISP_OPEN_OPEN, &it);
846 GOTO(out_release_it, rc);
848 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
849 ll_och_fill(sbi->ll_md_exp, &it, och);
/* Server must confirm the lease disposition; older servers can't. */
851 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
852 GOTO(out_close, rc = -EOPNOTSUPP);
854 /* already get lease, handle lease lock */
855 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
856 if (it.it_lock_mode == 0 ||
857 it.it_lock_bits != MDS_INODELOCK_OPEN) {
858 /* open lock must return for lease */
859 CERROR(DFID "lease granted but no open lock, %d/"LPU64".\n",
860 PFID(ll_inode2fid(inode)), it.it_lock_mode,
862 GOTO(out_close, rc = -EPROTO);
865 ll_intent_release(&it);
/* Error path: undo lock and open handle acquired so far. */
869 /* Cancel open lock */
870 if (it.it_lock_mode != 0) {
871 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
874 och->och_lease_handle.cookie = 0ULL;
876 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
878 CERROR("%s: error closing file "DFID": %d\n",
879 ll_get_fsname(inode->i_sb, NULL, 0),
880 PFID(&ll_i2info(inode)->lli_fid), rc2);
881 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
883 ll_intent_release(&it);
891 * Check whether a layout swap can be done between two inodes.
893 * \param[in] inode1 First inode to check
894 * \param[in] inode2 Second inode to check
896 * \retval 0 on success, layout swap can be performed between both inodes
897 * \retval negative error code if requirements are not met
/* NOTE(review): the returned error codes for each failed check were elided
 * by extraction; code kept byte-identical, comments only added. */
899 static int ll_check_swap_layouts_validity(struct inode *inode1,
900 struct inode *inode2)
/* Both must be regular files. */
902 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
/* Caller needs write permission on both inodes. */
905 if (inode_permission(inode1, MAY_WRITE) ||
906 inode_permission(inode2, MAY_WRITE))
/* Both must live on the same Lustre filesystem. */
909 if (inode1->i_sb != inode2->i_sb)
/* Close @och with a MDS_CLOSE_LAYOUT_SWAP bias, swapping layouts between
 * @inode and @inode2 atomically with the close.
 * NOTE(review): extraction elided lines (och free label/RETURN); code kept
 * byte-identical, comments only added. */
915 static int ll_swap_layouts_close(struct obd_client_handle *och,
916 struct inode *inode, struct inode *inode2)
918 const struct lu_fid *fid1 = ll_inode2fid(inode);
919 const struct lu_fid *fid2;
923 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
924 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
926 rc = ll_check_swap_layouts_validity(inode, inode2);
928 GOTO(out_free_och, rc);
930 /* We now know that inode2 is a lustre inode */
931 fid2 = ll_inode2fid(inode2);
/* Swapping a file's layout with itself is meaningless. */
933 rc = lu_fid_cmp(fid1, fid2);
935 GOTO(out_free_och, rc = -EINVAL);
937 /* Close the file and swap layouts between inode & inode2.
938 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
939 * because we still need it to pack l_remote_handle to MDT. */
940 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
943 och = NULL; /* freed in ll_close_inode_openhandle() */
953 * Release lease and close the file.
954 * It will check if the lease has ever broken.
956 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
959 struct ldlm_lock *lock;
/* Assume broken if the lock is gone entirely. */
960 bool cancelled = true;
964 lock = ldlm_handle2lock(&och->och_lease_handle);
966 lock_res_and_lock(lock);
967 cancelled = ldlm_is_cancel(lock);
968 unlock_res_and_lock(lock);
972 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
973 PFID(&ll_i2info(inode)->lli_fid), cancelled);
/* If the lease is still intact, cancel its lock ourselves before closing. */
976 ldlm_cli_cancel(&och->och_lease_handle, 0);
978 if (lease_broken != NULL)
979 *lease_broken = cancelled;
981 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/* Merge MDS-sourced attributes cached in lli with OST-sourced attributes
 * (size, blocks, timestamps) obtained from the cl_object, under the inode
 * size lock.
 * NOTE(review): extraction elided lines (atime/mtime/ctime declarations,
 * RETURN); code kept byte-identical, comments only added. */
985 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
987 struct ll_inode_info *lli = ll_i2info(inode);
988 struct cl_object *obj = lli->lli_clob;
989 struct cl_attr *attr = vvp_env_thread_attr(env);
997 ll_inode_size_lock(inode);
999 /* Merge timestamps the most recently obtained from MDS with
1000 * timestamps obtained from OSTs.
1002 * Do not overwrite atime of inode because it may be refreshed
1003 * by file_accessed() function. If the read was served by cache
1004 * data, there is no RPC to be sent so that atime may not be
1005 * transferred to OSTs at all. MDT only updates atime at close time
1006 * if it's at least 'mdd.*.atime_diff' older.
1007 * All in all, the atime in Lustre does not strictly comply with
1008 * POSIX. Solving this problem needs to send an RPC to MDT for each
1009 * read, this will hurt performance. */
1010 if (LTIME_S(inode->i_atime) < lli->lli_atime)
1011 LTIME_S(inode->i_atime) = lli->lli_atime;
1012 LTIME_S(inode->i_mtime) = lli->lli_mtime;
1013 LTIME_S(inode->i_ctime) = lli->lli_ctime;
1015 atime = LTIME_S(inode->i_atime);
1016 mtime = LTIME_S(inode->i_mtime);
1017 ctime = LTIME_S(inode->i_ctime);
/* Fetch OST-side attributes from the cl_object layer. */
1019 cl_object_attr_lock(obj);
1020 rc = cl_object_attr_get(env, obj, attr);
1021 cl_object_attr_unlock(obj);
1024 GOTO(out_size_unlock, rc);
/* Keep the most recent of MDS and OST timestamps. */
1026 if (atime < attr->cat_atime)
1027 atime = attr->cat_atime;
1029 if (ctime < attr->cat_ctime)
1030 ctime = attr->cat_ctime;
1032 if (mtime < attr->cat_mtime)
1033 mtime = attr->cat_mtime;
1035 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
1036 PFID(&lli->lli_fid), attr->cat_size);
1038 i_size_write(inode, attr->cat_size);
1039 inode->i_blocks = attr->cat_blocks;
1041 LTIME_S(inode->i_atime) = atime;
1042 LTIME_S(inode->i_mtime) = mtime;
1043 LTIME_S(inode->i_ctime) = ctime;
1046 ll_inode_size_unlock(inode);
/* Decide whether atime updates should be suppressed for @file, mirroring the
 * kernel's file_accessed()/touch_atime() checks (O_NOATIME, S_NOATIME,
 * mount flags, nodiratime on directories).
 * NOTE(review): the "return true/false" lines were elided by extraction. */
1051 static bool file_is_noatime(const struct file *file)
1053 const struct vfsmount *mnt = file->f_path.mnt;
1054 const struct inode *inode = file_inode((struct file *)file);
1056 /* Adapted from file_accessed() and touch_atime().*/
1057 if (file->f_flags & O_NOATIME)
1060 if (inode->i_flags & S_NOATIME)
1063 if (IS_NOATIME(inode))
1066 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1069 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1072 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/* Initialize a cl_io for a read (@write == 0) or write (@write != 0) on
 * @file: nonblocking/append/sync flags, lock policy and noatime handling. */
1078 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
1080 struct inode *inode = file_inode((struct file *)file);
1082 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
/* Write-specific flags (the enclosing `if (write)` appears elided). */
1084 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1085 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1086 file->f_flags & O_DIRECT ||
1089 io->ci_obj = ll_i2info(inode)->lli_clob;
/* Default lock policy; downgraded/upgraded below per file flags. */
1090 io->ci_lockreq = CILR_MAYBE;
1091 if (ll_file_nolock(file)) {
1092 io->ci_lockreq = CILR_NEVER;
1093 io->ci_no_srvlock = 1;
1094 } else if (file->f_flags & O_APPEND) {
1095 io->ci_lockreq = CILR_MANDATORY;
1098 io->ci_noatime = file_is_noatime(file);
/* Common read/write engine: sets up a cl_io, takes the per-file range lock
 * where needed (writes, and O_DIRECT reads — see LU-6227), runs the IO loop,
 * restarts short IOs, and updates stats and fd_write_failed.
 * NOTE(review): extraction elided lines (return type, IO_NORMAL/IO_SPLICE
 * case labels, restart goto, several braces); code kept byte-identical,
 * comments only added. */
1102 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1103 struct file *file, enum cl_io_type iot,
1104 loff_t *ppos, size_t count)
1106 struct vvp_io *vio = vvp_env_io(env);
1107 struct inode *inode = file_inode(file);
1108 struct ll_inode_info *lli = ll_i2info(inode);
1109 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1113 struct range_lock range;
1117 CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: "LPU64", count: %zu\n",
1118 file_dentry(file)->d_name.name, iot, *ppos, count);
1121 io = vvp_env_thread_io(env);
1122 ll_io_init(io, file, iot == CIT_WRITE);
1124 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1125 bool range_locked = false;
/* O_APPEND writes must lock to EOF since the final offset is unknown. */
1127 if (file->f_flags & O_APPEND)
1128 range_lock_init(&range, 0, LUSTRE_EOF);
1130 range_lock_init(&range, *ppos, *ppos + count - 1);
1132 vio->vui_fd = LUSTRE_FPRIVATE(file);
1133 vio->vui_io_subtype = args->via_io_subtype;
1135 switch (vio->vui_io_subtype) {
1137 vio->vui_iter = args->u.normal.via_iter;
1138 vio->vui_iocb = args->u.normal.via_iocb;
1139 /* Direct IO reads must also take range lock,
1140 * or multiple reads will try to work on the same pages
1141 * See LU-6227 for details. */
1142 if (((iot == CIT_WRITE) ||
1143 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1144 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1145 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1147 rc = range_lock(&lli->lli_write_tree, &range);
1151 range_locked = true;
/* splice subtype: pipe target and flags come from the args. */
1155 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1156 vio->u.splice.vui_flags = args->u.splice.via_flags;
1159 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
/* Register the cl context so page-level code can find this IO. */
1163 ll_cl_add(file, env, io, LCC_RW);
1164 rc = cl_io_loop(env, io);
1165 ll_cl_remove(file, env);
1168 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1170 range_unlock(&lli->lli_write_tree, &range);
1173 /* cl_io_rw_init() handled IO */
1177 if (io->ci_nob > 0) {
/* Accumulate partial progress and advance the position for a restart. */
1178 result += io->ci_nob;
1179 count -= io->ci_nob;
1180 *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1182 /* prepare IO restart */
1183 if (count > 0 && args->via_io_subtype == IO_NORMAL)
1184 args->u.normal.via_iter = vio->vui_iter;
1188 cl_io_fini(env, io);
1190 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1192 "%s: restart %s from %lld, count:%zu, result: %zd\n",
1193 file_dentry(file)->d_name.name,
1194 iot == CIT_READ ? "read" : "write",
1195 *ppos, count, result);
1199 if (iot == CIT_READ) {
1201 ll_stats_ops_tally(ll_i2sbi(inode),
1202 LPROC_LL_READ_BYTES, result);
1203 } else if (iot == CIT_WRITE) {
1205 ll_stats_ops_tally(ll_i2sbi(inode),
1206 LPROC_LL_WRITE_BYTES, result);
1207 fd->fd_write_failed = false;
1208 } else if (result == 0 && rc == 0) {
/* Track write failure state so fsync can report deferred errors;
 * -ERESTARTSYS (signal) is not treated as a failure. */
1211 fd->fd_write_failed = true;
1213 fd->fd_write_failed = false;
1214 } else if (rc != -ERESTARTSYS) {
1215 fd->fd_write_failed = true;
1219 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1221 return result > 0 ? result : rc;
1225 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1226 * especially for small I/O.
1228 * To serve a read request, CLIO has to create and initialize a cl_io and
1229 * then request DLM lock. This has turned out to have siginificant overhead
1230 * and affects the performance of small I/O dramatically.
1232 * It's not necessary to create a cl_io for each I/O. Under the help of read
1233 * ahead, most of the pages being read are already in memory cache and we can
1234 * read those pages directly because if the pages exist, the corresponding DLM
1235 * lock must exist so that page content must be valid.
1237 * In fast read implementation, the llite speculatively finds and reads pages
1238 * in memory cache. There are three scenarios for fast read:
1239 * - If the page exists and is uptodate, kernel VM will provide the data and
1240 * CLIO won't be intervened;
1241 * - If the page was brought into memory by read ahead, it will be exported
1242 * and read ahead parameters will be updated;
1243 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1244 * it will go back and invoke normal read, i.e., a cl_io will be created
1245 * and DLM lock will be requested.
1247 * POSIX compliance: posix standard states that read is intended to be atomic.
1248 * Lustre read implementation is in line with Linux kernel read implementation
1249 * and neither of them complies with POSIX standard in this matter. Fast read
1250 * doesn't make the situation worse on single node but it may interleave write
1251 * results from multiple nodes due to short read handling in ll_file_aio_read().
1253 * \param env - lu_env
1254 * \param iocb - kiocb from kernel
1255 * \param iter - user space buffers where the data will be copied
1257 * \retval - number of bytes that have been read, or an error code if an error occurred.
/*
 * Fast-read path: bypass CLIO and read directly from the page cache via
 * generic_file_read_iter().  Only attempted when the superblock has
 * fast_read enabled and the file is not opened O_DIRECT; otherwise the
 * caller falls back to the normal CLIO read path.
 */
1260 ll_do_fast_read(const struct lu_env *env, struct kiocb *iocb,
1261 struct iov_iter *iter)
1265 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1268 /* NB: we can't do direct IO for fast read because it will need a lock
1269 * to make IO engine happy. */
1270 if (iocb->ki_filp->f_flags & O_DIRECT)
/* Bracket the generic read with ll_cl_add()/ll_cl_remove() so that
 * ll_readpage() can find this thread's env/io context. */
1273 ll_cl_add(iocb->ki_filp, env, NULL, LCC_RW);
1274 result = generic_file_read_iter(iocb, iter);
1275 ll_cl_remove(iocb->ki_filp, env);
1277 /* If the first page is not in cache, generic_file_read_iter() returns
1278 * -ENODATA.
1279 * See corresponding code in ll_readpage(). */
1280 if (result == -ENODATA)
/* account successfully fast-read bytes in the per-sb read stats */
1284 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1285 LPROC_LL_READ_BYTES, result);
1291 * Read from a file (through the page cache).
/*
 * read_iter entry point: read from a file through the page cache.
 * Tries the speculative fast-read path first; if data remains in the
 * iterator, falls back to the CLIO path via ll_file_io_generic(CIT_READ).
 */
1293 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1296 struct vvp_io_args *args;
1301 env = cl_env_get(&refcheck);
1303 return PTR_ERR(env);
/* fast read first; on error or when the iter is fully consumed, done */
1305 result = ll_do_fast_read(env, iocb, to);
1306 if (result < 0 || iov_iter_count(to) == 0)
1309 args = ll_env_args(env, IO_NORMAL);
1310 args->u.normal.via_iter = to;
1311 args->u.normal.via_iocb = iocb;
/* read whatever the fast path did not deliver via the generic IO engine */
1313 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1314 &iocb->ki_pos, iov_iter_count(to));
1317 else if (result == 0)
1321 cl_env_put(env, &refcheck);
1326 * Write to a file (through the page cache).
/*
 * write_iter entry point: write to a file through the page cache by
 * packaging the iocb/iter into vvp_io_args and running
 * ll_file_io_generic(CIT_WRITE).
 */
1328 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1330 struct vvp_io_args *args;
1335 env = cl_env_get(&refcheck);
1337 return PTR_ERR(env);
1339 args = ll_env_args(env, IO_NORMAL);
1340 args->u.normal.via_iter = from;
1341 args->u.normal.via_iocb = iocb;
1343 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1344 &iocb->ki_pos, iov_iter_count(from));
1345 cl_env_put(env, &refcheck);
1349 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1351 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array and compute the total byte count in *count.
 * Mirrors the kernel's __generic_file_aio_write_nolock() checks: reject
 * negative/overflowing lengths, and stop at the first inaccessible
 * segment by shrinking *nr_segs.
 */
1353 static int ll_file_get_iov_count(const struct iovec *iov,
1354 unsigned long *nr_segs, size_t *count)
1359 for (seg = 0; seg < *nr_segs; seg++) {
1360 const struct iovec *iv = &iov[seg];
1363 * If any segment has a negative length, or the cumulative
1364 * length ever wraps negative then return -EINVAL.
1367 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1369 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1374 cnt -= iv->iov_len; /* This segment is no good */
/*
 * aio_read entry for kernels without read_iter file operations: wrap the
 * iovec array in an iov_iter and forward to ll_file_read_iter().
 */
1381 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1382 unsigned long nr_segs, loff_t pos)
1389 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
/* iov_iter_init() signature differs across kernel versions */
1393 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1394 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1395 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1396 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1397 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1399 result = ll_file_read_iter(iocb, &to);
/*
 * Synchronous read(2) entry for kernels without read_iter: build a
 * one-segment iovec and a sync kiocb, then delegate to ll_file_aio_read().
 * The file position is propagated back through *ppos.
 */
1404 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1407 struct iovec iov = { .iov_base = buf, .iov_len = count };
1408 struct kiocb *kiocb;
1412 OBD_ALLOC_PTR(kiocb);
1416 init_sync_kiocb(kiocb, file);
1417 kiocb->ki_pos = *ppos;
/* the field holding the remaining byte count is version-dependent */
1418 #ifdef HAVE_KIOCB_KI_LEFT
1419 kiocb->ki_left = count;
1420 #elif defined(HAVE_KI_NBYTES)
1421 kiocb->ki_nbytes = count;
1424 result = ll_file_aio_read(kiocb, &iov, 1, kiocb->ki_pos);
1425 *ppos = kiocb->ki_pos;
1427 OBD_FREE_PTR(kiocb);
1432 * Write to a file (through the page cache).
/*
 * aio_write entry for kernels without write_iter file operations: wrap
 * the iovec array in an iov_iter and forward to ll_file_write_iter().
 */
1435 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1436 unsigned long nr_segs, loff_t pos)
1438 struct iov_iter from;
1443 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
/* iov_iter_init() signature differs across kernel versions */
1447 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1448 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1449 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1450 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1451 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1453 result = ll_file_write_iter(iocb, &from);
/*
 * Synchronous write(2) entry for kernels without write_iter: build a
 * one-segment iovec and a sync kiocb (taken from the lu_env thread info
 * rather than allocated), then delegate to ll_file_aio_write().
 */
1458 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1459 size_t count, loff_t *ppos)
1462 struct iovec iov = { .iov_base = (void __user *)buf,
1464 struct kiocb *kiocb;
1469 env = cl_env_get(&refcheck);
1471 RETURN(PTR_ERR(env));
/* reuse the per-env kiocb instead of allocating one */
1473 kiocb = &ll_env_info(env)->lti_kiocb;
1474 init_sync_kiocb(kiocb, file);
1475 kiocb->ki_pos = *ppos;
/* the field holding the remaining byte count is version-dependent */
1476 #ifdef HAVE_KIOCB_KI_LEFT
1477 kiocb->ki_left = count;
1478 #elif defined(HAVE_KI_NBYTES)
1479 kiocb->ki_nbytes = count;
1482 result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos);
1483 *ppos = kiocb->ki_pos;
1485 cl_env_put(env, &refcheck);
1488 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1491 * Send file content (through pagecache) somewhere with helper
/*
 * splice_read entry: send file content (through the page cache) into a
 * pipe, using the IO_SPLICE variant of ll_file_io_generic(CIT_READ).
 */
1493 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1494 struct pipe_inode_info *pipe, size_t count,
1498 struct vvp_io_args *args;
1503 env = cl_env_get(&refcheck);
1505 RETURN(PTR_ERR(env));
1507 args = ll_env_args(env, IO_SPLICE);
1508 args->u.splice.via_pipe = pipe;
1509 args->u.splice.via_flags = flags;
1511 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1512 cl_env_put(env, &refcheck);
/*
 * Set the LOV striping EA on a file by re-opening it by FID with the
 * given lov_user_md, under the inode size lock.  The open handle created
 * by the intent open is released before returning.
 */
1516 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1517 __u64 flags, struct lov_user_md *lum,
1520 struct lookup_intent oit = {
1522 .it_flags = flags | MDS_OPEN_BY_FID,
1527 ll_inode_size_lock(inode);
1528 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1530 GOTO(out_unlock, rc);
/* we only needed the open to carry the EA; drop the MDS open handle */
1532 ll_release_openhandle(file_dentry(file), &oit);
1535 ll_inode_size_unlock(inode);
1536 ll_intent_release(&oit);
/* clear the delayed-create flag now that the layout has been set */
1537 cl_lov_delay_create_clear(&file->f_flags);
/*
 * Fetch the LOV striping EA for @filename from the MDS via
 * md_getattr_name().  On success *lmmp points into the request reply
 * (the caller keeps *request alive and frees it) and *lmm_size is set.
 * The EA is converted from little-endian wire format to host endianness
 * before being handed back.
 */
1542 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1543 struct lov_mds_md **lmmp, int *lmm_size,
1544 struct ptlrpc_request **request)
1546 struct ll_sb_info *sbi = ll_i2sbi(inode);
1547 struct mdt_body *body;
1548 struct lov_mds_md *lmm = NULL;
1549 struct ptlrpc_request *req = NULL;
1550 struct md_op_data *op_data;
1553 rc = ll_get_default_mdsize(sbi, &lmmsize);
1557 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1558 strlen(filename), lmmsize,
1559 LUSTRE_OPC_ANY, NULL);
1560 if (IS_ERR(op_data))
1561 RETURN(PTR_ERR(op_data));
1563 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1564 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1565 ll_finish_md_op_data(op_data);
1567 CDEBUG(D_INFO, "md_getattr_name failed "
1568 "on %s: rc %d\n", filename, rc);
1572 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1573 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1575 lmmsize = body->mbo_eadatasize;
/* no striping EA present on this object */
1577 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1579 GOTO(out, rc = -ENODATA);
1582 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1583 LASSERT(lmm != NULL);
/* only plain V1/V3 layouts are understood here */
1585 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1586 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1587 GOTO(out, rc = -EPROTO);
1591 * This is coming from the MDS, so is probably in
1592 * little endian. We convert it to host endian before
1593 * passing it to userspace.
1595 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1598 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1599 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1602 /* if function called for directory - we should
1603 * avoid swabbing non-existent lsm objects */
1604 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1605 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1606 if (S_ISREG(body->mbo_mode))
1607 lustre_swab_lov_user_md_objects(
1608 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1610 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1611 lustre_swab_lov_user_md_v3(
1612 (struct lov_user_md_v3 *)lmm);
1613 if (S_ISREG(body->mbo_mode))
1614 lustre_swab_lov_user_md_objects(
1615 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1622 *lmm_size = lmmsize;
1627 static int ll_lov_setea(struct inode *inode, struct file *file,
1630 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1631 struct lov_user_md *lump;
1632 int lum_size = sizeof(struct lov_user_md) +
1633 sizeof(struct lov_user_ost_data);
1637 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1640 OBD_ALLOC_LARGE(lump, lum_size);
1644 if (copy_from_user(lump, (struct lov_user_md __user *)arg, lum_size))
1645 GOTO(out_lump, rc = -EFAULT);
1647 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1650 OBD_FREE_LARGE(lump, lum_size);
/*
 * Copy the file's striping information to the userspace lov_user_md
 * buffer via cl_object_getstripe().
 */
1654 static int ll_file_getstripe(struct inode *inode,
1655 struct lov_user_md __user *lum)
1662 env = cl_env_get(&refcheck);
1664 RETURN(PTR_ERR(env));
1666 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum);
1667 cl_env_put(env, &refcheck);
1671 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1674 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1675 struct lov_user_md *klum;
1677 __u64 flags = FMODE_WRITE;
1680 rc = ll_copy_user_md(lum, &klum);
1685 rc = ll_lov_setstripe_ea_info(inode, file, flags, klum, lum_size);
1689 put_user(0, &lum->lmm_stripe_count);
1691 ll_layout_refresh(inode, &gen);
1692 rc = ll_file_getstripe(inode, (struct lov_user_md __user *)arg);
1695 OBD_FREE(klum, lum_size);
/*
 * LL_IOC_GROUP_LOCK handler: take a cluster-wide group lock with group
 * id @arg on the file.  Only one group lock per file descriptor is
 * allowed; fd_flags/fd_grouplock are updated under lli_lock, and a race
 * between two threads acquiring concurrently is resolved after the
 * cl_get_grouplock() call.
 */
1700 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1702 struct ll_inode_info *lli = ll_i2info(inode);
1703 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1704 struct ll_grouplock grouplock;
1709 CWARN("group id for group lock must not be 0\n");
1713 if (ll_file_nolock(file))
1714 RETURN(-EOPNOTSUPP);
1716 spin_lock(&lli->lli_lock);
1717 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1718 CWARN("group lock already existed with gid %lu\n",
1719 fd->fd_grouplock.lg_gid);
1720 spin_unlock(&lli->lli_lock);
1723 LASSERT(fd->fd_grouplock.lg_lock == NULL);
1724 spin_unlock(&lli->lli_lock);
/* may block unless O_NONBLOCK was given at open time */
1726 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
1727 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* re-check under the lock: another thread may have won the race */
1731 spin_lock(&lli->lli_lock);
1732 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1733 spin_unlock(&lli->lli_lock);
1734 CERROR("another thread just won the race\n");
1735 cl_put_grouplock(&grouplock);
1739 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1740 fd->fd_grouplock = grouplock;
1741 spin_unlock(&lli->lli_lock);
1743 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK handler: release the group lock held on this file
 * descriptor.  Fails if no group lock is held or if @arg does not match
 * the held group id.  State is detached under lli_lock before the lock
 * itself is dropped outside the spinlock.
 */
1747 static int ll_put_grouplock(struct inode *inode, struct file *file,
1750 struct ll_inode_info *lli = ll_i2info(inode);
1751 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1752 struct ll_grouplock grouplock;
1755 spin_lock(&lli->lli_lock);
1756 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1757 spin_unlock(&lli->lli_lock);
1758 CWARN("no group lock held\n");
1762 LASSERT(fd->fd_grouplock.lg_lock != NULL);
1764 if (fd->fd_grouplock.lg_gid != arg) {
1765 CWARN("group lock %lu doesn't match current id %lu\n",
1766 arg, fd->fd_grouplock.lg_gid);
1767 spin_unlock(&lli->lli_lock);
/* take a local copy and clear fd state before dropping the spinlock */
1771 grouplock = fd->fd_grouplock;
1772 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1773 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1774 spin_unlock(&lli->lli_lock);
1776 cl_put_grouplock(&grouplock);
1777 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1782 * Close inode open handle
1784 * \param dentry [in] dentry which contains the inode
1785 * \param it [in,out] intent which contains open info and result
1788 * \retval <0 failure
/*
 * Close the MDS open handle carried by an open intent.  No-op for the
 * filesystem root or when the intent holds no open disposition; otherwise
 * an obd_client_handle is filled from the intent and closed.
 */
1790 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1792 struct inode *inode = dentry->d_inode;
1793 struct obd_client_handle *och;
1799 /* Root ? Do nothing. */
1800 if (dentry->d_inode->i_sb->s_root == dentry)
1803 /* No open handle to close? Move away */
1804 if (!it_disposition(it, DISP_OPEN_OPEN))
1807 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1809 OBD_ALLOC(och, sizeof(*och));
1811 GOTO(out, rc = -ENOMEM);
1813 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1815 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
1817 /* this one is in place of ll_file_open */
1818 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1819 ptlrpc_req_finished(it->it_request);
1820 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1826 * Get size for inode for which FIEMAP mapping is requested.
1827 * Make the FIEMAP get_info call and returns the result.
1828 * \param fiemap kernel buffer to hold extents
1829 * \param num_bytes kernel buffer size
/*
 * Perform a FIEMAP extent-mapping request for @inode.  Rejects
 * unsupported flags (reporting the supported set back in fm_flags),
 * honours FIEMAP_FLAG_SYNC by flushing dirty pages first, then forwards
 * the request to the object layer via cl_object_fiemap().
 */
1831 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
1837 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
1840 /* Checks for fiemap flags */
1841 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1842 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1846 /* Check for FIEMAP_FLAG_SYNC */
1847 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1848 rc = filemap_fdatawrite(inode->i_mapping);
1853 env = cl_env_get(&refcheck);
1855 RETURN(PTR_ERR(env));
/* make sure we have an up-to-date size before deciding on objects */
1857 if (i_size_read(inode) == 0) {
1858 rc = ll_glimpse_size(inode);
1863 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1864 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
1865 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
1867 /* If filesize is 0, then there would be no objects for mapping */
1868 if (fmkey.lfik_oa.o_size == 0) {
1869 fiemap->fm_mapped_extents = 0;
1873 fmkey.lfik_fiemap = *fiemap;
1875 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
1876 &fmkey, fiemap, &num_bytes);
1878 cl_env_put(env, &refcheck);
/*
 * OBD_IOC_FID2PATH handler: resolve a FID to a path via the MDC.  The
 * user supplies the path buffer length; a getinfo_fid2path plus that many
 * bytes is allocated, filled by obd_iocontrol(), and copied back.
 * Requires CFS_CAP_DAC_READ_SEARCH unless the mount allows user fid2path.
 */
1882 int ll_fid2path(struct inode *inode, void __user *arg)
1884 struct obd_export *exp = ll_i2mdexp(inode);
1885 const struct getinfo_fid2path __user *gfin = arg;
1887 struct getinfo_fid2path *gfout;
1893 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1894 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1897 /* Only need to get the buflen */
1898 if (get_user(pathlen, &gfin->gf_pathlen))
1901 if (pathlen > PATH_MAX)
1904 outsize = sizeof(*gfout) + pathlen;
1905 OBD_ALLOC(gfout, outsize);
1909 if (copy_from_user(gfout, arg, sizeof(*gfout)))
1910 GOTO(gf_free, rc = -EFAULT);
1911 /* append root FID after gfout to let MDT know the root FID so that it
1912 * can lookup the correct path, this is mainly for fileset.
1913 * old server without fileset mount support will ignore this. */
1914 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
1916 /* Call mdc_iocontrol */
1917 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1921 if (copy_to_user(arg, gfout, outsize))
1925 OBD_FREE(gfout, outsize);
1930 * Read the data_version for inode.
1932 * This value is computed using stripe object version on OST.
1933 * Version is computed using server side locking.
1935 * @param flags if do sync on the OST side;
1937 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
1938 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/*
 * Compute the data_version of @inode by running a CIT_DATA_VERSION IO
 * against its cl_object.  @flags selects the OST-side flush behaviour
 * (LL_DV_RD_FLUSH / LL_DV_WR_FLUSH); see the comment block above.
 */
1940 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
1942 struct cl_object *obj = ll_i2info(inode)->lli_clob;
1950 /* If no file object initialized, we consider its version is 0. */
1956 env = cl_env_get(&refcheck);
1958 RETURN(PTR_ERR(env));
1960 io = vvp_env_thread_io(env);
1962 io->u.ci_data_version.dv_data_version = 0;
1963 io->u.ci_data_version.dv_flags = flags;
1966 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
1967 result = cl_io_loop(env, io);
1969 result = io->ci_result;
1971 *data_version = io->u.ci_data_version.dv_data_version;
1973 cl_io_fini(env, io);
/* the IO engine may ask for a restart, e.g. after a layout change */
1975 if (unlikely(io->ci_need_restart))
1978 cl_env_put(env, &refcheck);
1984 * Trigger a HSM release request for the provided inode.
/*
 * Trigger an HSM release for @inode: take a write lease, flush and grab
 * the latest data_version and [am]time, then close the lease handle with
 * MDS_HSM_RELEASE so the MDT can drop the file's OST objects.
 */
1986 int ll_hsm_release(struct inode *inode)
1989 struct obd_client_handle *och = NULL;
1990 __u64 data_version = 0;
1995 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
1996 ll_get_fsname(inode->i_sb, NULL, 0),
1997 PFID(&ll_i2info(inode)->lli_fid));
1999 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2001 GOTO(out, rc = PTR_ERR(och));
2003 /* Grab latest data_version and [am]time values */
2004 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2008 env = cl_env_get(&refcheck);
2010 GOTO(out, rc = PTR_ERR(env));
2012 ll_merge_attr(env, inode);
2013 cl_env_put(env, &refcheck);
2015 /* Release the file.
2016 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2017 * we still need it to pack l_remote_handle to MDT. */
2018 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2024 if (och != NULL && !IS_ERR(och)) /* close the file */
2025 ll_lease_close(och, inode, NULL);
/* Working state for ll_swap_layouts(): the two inodes plus (elsewhere in
 * the struct) the data versions and check flags, kept together so they
 * can be swapped as a unit when the FID order requires sequentializing. */
2030 struct ll_swap_stack {
2033 struct inode *inode1;
2034 struct inode *inode2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS helper: atomically swap the layouts of the two
 * files on the MDT.  Inodes are ordered by FID to avoid deadlock, an
 * optional group lock flushes dirty cache, and optional data-version
 * checks abort with -EAGAIN if either file changed since the caller
 * sampled its version.
 */
2039 static int ll_swap_layouts(struct file *file1, struct file *file2,
2040 struct lustre_swap_layouts *lsl)
2042 struct mdc_swap_layouts msl;
2043 struct md_op_data *op_data;
2046 struct ll_swap_stack *llss = NULL;
2049 OBD_ALLOC_PTR(llss);
2053 llss->inode1 = file_inode(file1);
2054 llss->inode2 = file_inode(file2);
2056 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2060 /* we use 2 bool because it is easier to swap than 2 bits */
2061 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2062 llss->check_dv1 = true;
2064 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2065 llss->check_dv2 = true;
2067 /* we cannot use lsl->sl_dvX directly because we may swap them */
2068 llss->dv1 = lsl->sl_dv1;
2069 llss->dv2 = lsl->sl_dv2;
2071 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2072 if (rc == 0) /* same file, done! */
2075 if (rc < 0) { /* sequentialize it */
2076 swap(llss->inode1, llss->inode2);
2078 swap(llss->dv1, llss->dv2);
2079 swap(llss->check_dv1, llss->check_dv2);
2083 if (gid != 0) { /* application asks to flush dirty cache */
2084 rc = ll_get_grouplock(llss->inode1, file1, gid);
2088 rc = ll_get_grouplock(llss->inode2, file2, gid);
2090 ll_put_grouplock(llss->inode1, file1, gid);
2095 /* ultimate check, before swapping the layouts we check if
2096 * dataversion has changed (if requested) */
2097 if (llss->check_dv1) {
2098 rc = ll_data_version(llss->inode1, &dv, 0);
2101 if (dv != llss->dv1)
2102 GOTO(putgl, rc = -EAGAIN);
2105 if (llss->check_dv2) {
2106 rc = ll_data_version(llss->inode2, &dv, 0);
2109 if (dv != llss->dv2)
2110 GOTO(putgl, rc = -EAGAIN);
2113 /* struct md_op_data is used to send the swap args to the mdt
2114 * only flags is missing, so we use struct mdc_swap_layouts
2115 * through the md_op_data->op_data */
2116 /* flags from user space have to be converted before they are send to
2117 * server, no flag is sent today, they are only used on the client */
2120 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2121 0, LUSTRE_OPC_ANY, &msl);
2122 if (IS_ERR(op_data))
2123 GOTO(free, rc = PTR_ERR(op_data));
2125 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2126 sizeof(*op_data), op_data, NULL);
2127 ll_finish_md_op_data(op_data);
2134 ll_put_grouplock(llss->inode2, file2, gid);
2135 ll_put_grouplock(llss->inode1, file1, gid);
/*
 * Set/clear HSM state flags on @inode via the MDC.  Validates the masks
 * (out-of-range bits rejected; non-HSM_USER_MASK bits require
 * CFS_CAP_SYS_ADMIN) and the archive id before forwarding.
 */
2145 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2147 struct md_op_data *op_data;
2151 /* Detect out-of range masks */
2152 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2155 /* Non-root users are forbidden to set or clear flags which are
2156 * NOT defined in HSM_USER_MASK. */
2157 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2158 !cfs_capable(CFS_CAP_SYS_ADMIN))
2161 /* Detect out-of range archive id */
2162 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2163 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2166 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2167 LUSTRE_OPC_ANY, hss);
2168 if (IS_ERR(op_data))
2169 RETURN(PTR_ERR(op_data));
2171 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2172 sizeof(*op_data), op_data, NULL);
2174 ll_finish_md_op_data(op_data);
/*
 * Import an existing HSM-archived file: mark it ARCHIVED|EXISTS|RELEASED
 * with the given archive id, then restore the saved attributes (mode,
 * owner, size, [am]times) with ll_setattr_raw().  Regular files only.
 */
2179 static int ll_hsm_import(struct inode *inode, struct file *file,
2180 struct hsm_user_import *hui)
2182 struct hsm_state_set *hss = NULL;
2183 struct iattr *attr = NULL;
2187 if (!S_ISREG(inode->i_mode))
2193 GOTO(out, rc = -ENOMEM);
2195 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2196 hss->hss_archive_id = hui->hui_archive_id;
2197 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2198 rc = ll_hsm_state_set(inode, hss);
2202 OBD_ALLOC_PTR(attr);
2204 GOTO(out, rc = -ENOMEM);
/* force the imported mode to a regular-file mode */
2206 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2207 attr->ia_mode |= S_IFREG;
2208 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2209 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2210 attr->ia_size = hui->hui_size;
2211 attr->ia_mtime.tv_sec = hui->hui_mtime;
2212 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2213 attr->ia_atime.tv_sec = hui->hui_atime;
2214 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2216 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2217 ATTR_UID | ATTR_GID |
2218 ATTR_MTIME | ATTR_MTIME_SET |
2219 ATTR_ATIME | ATTR_ATIME_SET;
2223 rc = ll_setattr_raw(file_dentry(file), attr, true);
2227 inode_unlock(inode);
/* Map an fmode_t lease mode to the LL_LEASE_{RD,WR}LCK bit flags
 * returned to userspace by the lease ioctls. */
2239 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2241 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2242 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * LL_IOC_FUTIMES_3 handler: set atime/mtime/ctime on a regular file from
 * the ll_futimes_3 payload.  Requires CAP_SYS_ADMIN since it also sets
 * ctime, which normal utimes cannot do.
 */
2245 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2247 struct inode *inode = file_inode(file);
2249 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2250 ATTR_MTIME | ATTR_MTIME_SET |
2251 ATTR_CTIME | ATTR_CTIME_SET,
2253 .tv_sec = lfu->lfu_atime_sec,
2254 .tv_nsec = lfu->lfu_atime_nsec,
2257 .tv_sec = lfu->lfu_mtime_sec,
2258 .tv_nsec = lfu->lfu_mtime_nsec,
2261 .tv_sec = lfu->lfu_ctime_sec,
2262 .tv_nsec = lfu->lfu_ctime_nsec,
2268 if (!capable(CAP_SYS_ADMIN))
2271 if (!S_ISREG(inode->i_mode))
2275 rc = ll_setattr_raw(file_dentry(file), &ia, false);
2276 inode_unlock(inode);
2282 * Give file access advices
2284 * The ladvise interface is similar to Linux fadvise() system call, except it
2285 * forwards the advices directly from Lustre client to server. The server side
2286 * codes will apply appropriate read-ahead and caching techniques for the
2287 * corresponding files.
2289 * A typical workload for ladvise is e.g. a bunch of different clients are
2290 * doing small random reads of a file, so prefetching pages into OSS cache
2291 * with big linear reads before the random IO is a net benefit. Fetching
2292 * all that data into each client cache with fadvise() may not be, due to
2293 * much more data being sent to the client.
/*
 * Forward one lu_ladvise advice to the server by running a CIT_LADVISE
 * IO against the file's cl_object.  See the comment block above for the
 * rationale (server-side fadvise).
 */
2295 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2296 struct lu_ladvise *ladvise)
2300 struct cl_ladvise_io *lio;
2305 env = cl_env_get(&refcheck);
2307 RETURN(PTR_ERR(env));
2309 io = vvp_env_thread_io(env);
2310 io->ci_obj = ll_i2info(inode)->lli_clob;
2312 /* initialize parameters for ladvise */
2313 lio = &io->u.ci_ladvise;
2314 lio->li_start = ladvise->lla_start;
2315 lio->li_end = ladvise->lla_end;
2316 lio->li_fid = ll_inode2fid(inode);
2317 lio->li_advice = ladvise->lla_advice;
2318 lio->li_flags = flags;
2320 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2321 rc = cl_io_loop(env, io);
2325 cl_io_fini(env, io);
2326 cl_env_put(env, &refcheck);
/*
 * unlocked_ioctl entry point for regular files: dispatch every
 * llite-specific ioctl (striping, group locks, HSM, leases, fid2path,
 * data version, ladvise, ...).  Unknown commands fall through to the
 * dynamic ll_iocontrol_call() registry and finally to the OBD layer.
 */
2331 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2333 struct inode *inode = file_inode(file);
2334 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2338 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2339 PFID(ll_inode2fid(inode)), inode, cmd);
2340 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2342 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2343 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2347 case LL_IOC_GETFLAGS:
2348 /* Get the current value of the file flags */
2349 return put_user(fd->fd_flags, (int __user *)arg);
2350 case LL_IOC_SETFLAGS:
2351 case LL_IOC_CLRFLAGS:
2352 /* Set or clear specific file flags */
2353 /* XXX This probably needs checks to ensure the flags are
2354 * not abused, and to handle any flag side effects.
2356 if (get_user(flags, (int __user *) arg))
2359 if (cmd == LL_IOC_SETFLAGS) {
2360 if ((flags & LL_FILE_IGNORE_LOCK) &&
2361 !(file->f_flags & O_DIRECT)) {
2362 CERROR("%s: unable to disable locking on "
2363 "non-O_DIRECT file\n", current->comm);
2367 fd->fd_flags |= flags;
2369 fd->fd_flags &= ~flags;
2372 case LL_IOC_LOV_SETSTRIPE:
2373 RETURN(ll_lov_setstripe(inode, file, arg));
2374 case LL_IOC_LOV_SETEA:
2375 RETURN(ll_lov_setea(inode, file, arg));
2376 case LL_IOC_LOV_SWAP_LAYOUTS: {
2378 struct lustre_swap_layouts lsl;
2380 if (copy_from_user(&lsl, (char __user *)arg,
2381 sizeof(struct lustre_swap_layouts)))
/* both files must be writable for a swap */
2384 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
2387 file2 = fget(lsl.sl_fd);
2391 /* O_WRONLY or O_RDWR */
2392 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
2393 GOTO(out, rc = -EPERM);
2395 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
2396 struct inode *inode2;
2397 struct ll_inode_info *lli;
2398 struct obd_client_handle *och = NULL;
2400 if (lsl.sl_flags != SWAP_LAYOUTS_CLOSE)
2401 GOTO(out, rc = -EINVAL);
/* swap-and-close requires an existing lease on this fd */
2403 lli = ll_i2info(inode);
2404 mutex_lock(&lli->lli_och_mutex);
2405 if (fd->fd_lease_och != NULL) {
2406 och = fd->fd_lease_och;
2407 fd->fd_lease_och = NULL;
2409 mutex_unlock(&lli->lli_och_mutex);
2411 GOTO(out, rc = -ENOLCK);
2412 inode2 = file_inode(file2);
2413 rc = ll_swap_layouts_close(och, inode, inode2);
2415 rc = ll_swap_layouts(file, file2, &lsl);
2421 case LL_IOC_LOV_GETSTRIPE:
2422 RETURN(ll_file_getstripe(inode,
2423 (struct lov_user_md __user *)arg));
2424 case FSFILT_IOC_GETFLAGS:
2425 case FSFILT_IOC_SETFLAGS:
2426 RETURN(ll_iocontrol(inode, file, cmd, arg));
2427 case FSFILT_IOC_GETVERSION_OLD:
2428 case FSFILT_IOC_GETVERSION:
2429 RETURN(put_user(inode->i_generation, (int __user *)arg));
2430 case LL_IOC_GROUP_LOCK:
2431 RETURN(ll_get_grouplock(inode, file, arg));
2432 case LL_IOC_GROUP_UNLOCK:
2433 RETURN(ll_put_grouplock(inode, file, arg));
2434 case IOC_OBD_STATFS:
2435 RETURN(ll_obd_statfs(inode, (void __user *)arg));
2437 /* We need to special case any other ioctls we want to handle,
2438 * to send them to the MDS/OST as appropriate and to properly
2439 * network encode the arg field.
2440 case FSFILT_IOC_SETVERSION_OLD:
2441 case FSFILT_IOC_SETVERSION:
2443 case LL_IOC_FLUSHCTX:
2444 RETURN(ll_flush_ctx(inode));
2445 case LL_IOC_PATH2FID: {
2446 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2447 sizeof(struct lu_fid)))
2452 case LL_IOC_GETPARENT:
2453 RETURN(ll_getparent(file, (struct getparent __user *)arg));
2455 case OBD_IOC_FID2PATH:
2456 RETURN(ll_fid2path(inode, (void __user *)arg));
2457 case LL_IOC_DATA_VERSION: {
2458 struct ioc_data_version idv;
2461 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
2464 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2465 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2468 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2474 case LL_IOC_GET_MDTIDX: {
2477 mdtidx = ll_get_mdt_idx(inode);
2481 if (put_user((int)mdtidx, (int __user *)arg))
2486 case OBD_IOC_GETDTNAME:
2487 case OBD_IOC_GETMDNAME:
2488 RETURN(ll_get_obd_name(inode, cmd, arg));
2489 case LL_IOC_HSM_STATE_GET: {
2490 struct md_op_data *op_data;
2491 struct hsm_user_state *hus;
2498 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2499 LUSTRE_OPC_ANY, hus);
2500 if (IS_ERR(op_data)) {
2502 RETURN(PTR_ERR(op_data));
2505 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2508 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2511 ll_finish_md_op_data(op_data);
2515 case LL_IOC_HSM_STATE_SET: {
2516 struct hsm_state_set *hss;
2523 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2528 rc = ll_hsm_state_set(inode, hss);
2533 case LL_IOC_HSM_ACTION: {
2534 struct md_op_data *op_data;
2535 struct hsm_current_action *hca;
2542 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2543 LUSTRE_OPC_ANY, hca);
2544 if (IS_ERR(op_data)) {
2546 RETURN(PTR_ERR(op_data));
2549 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2552 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2555 ll_finish_md_op_data(op_data);
2559 case LL_IOC_SET_LEASE: {
2560 struct ll_inode_info *lli = ll_i2info(inode);
2561 struct obd_client_handle *och = NULL;
/* requested lease mode must match how the file was opened */
2566 case LL_LEASE_WRLCK:
2567 if (!(file->f_mode & FMODE_WRITE))
2569 fmode = FMODE_WRITE;
2571 case LL_LEASE_RDLCK:
2572 if (!(file->f_mode & FMODE_READ))
2576 case LL_LEASE_UNLCK:
2577 mutex_lock(&lli->lli_och_mutex);
2578 if (fd->fd_lease_och != NULL) {
2579 och = fd->fd_lease_och;
2580 fd->fd_lease_och = NULL;
2582 mutex_unlock(&lli->lli_och_mutex);
2587 fmode = och->och_flags;
2588 rc = ll_lease_close(och, inode, &lease_broken);
2592 rc = ll_lease_och_release(inode, file);
2599 RETURN(ll_lease_type_from_fmode(fmode));
2604 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2606 /* apply for lease */
2607 och = ll_lease_open(inode, file, fmode, 0);
2609 RETURN(PTR_ERR(och));
2612 mutex_lock(&lli->lli_och_mutex);
2613 if (fd->fd_lease_och == NULL) {
2614 fd->fd_lease_och = och;
2617 mutex_unlock(&lli->lli_och_mutex);
2619 /* impossible now that only excl is supported for now */
2620 ll_lease_close(och, inode, &lease_broken);
2625 case LL_IOC_GET_LEASE: {
2626 struct ll_inode_info *lli = ll_i2info(inode);
2627 struct ldlm_lock *lock = NULL;
2630 mutex_lock(&lli->lli_och_mutex);
2631 if (fd->fd_lease_och != NULL) {
2632 struct obd_client_handle *och = fd->fd_lease_och;
/* report the lease mode only while the lock is not cancelled */
2634 lock = ldlm_handle2lock(&och->och_lease_handle);
2636 lock_res_and_lock(lock);
2637 if (!ldlm_is_cancel(lock))
2638 fmode = och->och_flags;
2640 unlock_res_and_lock(lock);
2641 LDLM_LOCK_PUT(lock);
2644 mutex_unlock(&lli->lli_och_mutex);
2646 RETURN(ll_lease_type_from_fmode(fmode));
2648 case LL_IOC_HSM_IMPORT: {
2649 struct hsm_user_import *hui;
2655 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
2660 rc = ll_hsm_import(inode, file, hui);
2665 case LL_IOC_FUTIMES_3: {
2666 struct ll_futimes_3 lfu;
2668 if (copy_from_user(&lfu,
2669 (const struct ll_futimes_3 __user *)arg,
2673 RETURN(ll_file_futimes_3(file, &lfu));
2675 case LL_IOC_LADVISE: {
2676 struct ladvise_hdr *ladvise_hdr;
2679 int alloc_size = sizeof(*ladvise_hdr);
/* read the fixed header first to learn the advice count */
2682 OBD_ALLOC_PTR(ladvise_hdr);
2683 if (ladvise_hdr == NULL)
2686 if (copy_from_user(ladvise_hdr,
2687 (const struct ladvise_hdr __user *)arg,
2689 GOTO(out_ladvise, rc = -EFAULT);
2691 if (ladvise_hdr->lah_magic != LADVISE_MAGIC ||
2692 ladvise_hdr->lah_count < 1)
2693 GOTO(out_ladvise, rc = -EINVAL);
2695 num_advise = ladvise_hdr->lah_count;
2696 if (num_advise >= LAH_COUNT_MAX)
2697 GOTO(out_ladvise, rc = -EFBIG);
/* reallocate with room for the advice array and copy again */
2699 OBD_FREE_PTR(ladvise_hdr);
2700 alloc_size = offsetof(typeof(*ladvise_hdr),
2701 lah_advise[num_advise]);
2702 OBD_ALLOC(ladvise_hdr, alloc_size);
2703 if (ladvise_hdr == NULL)
2707 * TODO: submit multiple advices to one server in a single RPC
2709 if (copy_from_user(ladvise_hdr,
2710 (const struct ladvise_hdr __user *)arg,
2712 GOTO(out_ladvise, rc = -EFAULT);
2714 for (i = 0; i < num_advise; i++) {
2715 rc = ll_ladvise(inode, file, ladvise_hdr->lah_flags,
2716 &ladvise_hdr->lah_advise[i]);
2722 OBD_FREE(ladvise_hdr, alloc_size);
/* not handled above: try the dynamic ioctl registry, then the OBD */
2729 ll_iocontrol_call(inode, file, cmd, arg, &err))
2732 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2733 (void __user *)arg));
2738 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Validate and commit a new file offset: reject negative offsets unless
 * FMODE_UNSIGNED_OFFSET is set, reject offsets beyond @maxsize, and only
 * write f_pos (clearing f_version) when the position actually changes.
 * Compatibility helper compiled only when the kernel lacks
 * generic_file_llseek_size().
 */
2739 static inline loff_t
2740 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2742 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2744 if (offset > maxsize)
/* avoid a redundant f_pos store that could race with concurrent seekers */
2747 if (offset != file->f_pos) {
2748 file->f_pos = offset;
2749 file->f_version = 0;
/*
 * Local fallback implementation of generic_file_llseek_size() for older
 * kernels.  Handles SEEK_CUR position queries without rewriting f_pos,
 * and treats the whole file as data (virtual hole at EOF) for
 * SEEK_DATA/SEEK_HOLE-style origins.
 *
 * \param maxsize  largest valid offset for this file
 * \param eof      current end-of-file to compare against
 */
2755 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2756 loff_t maxsize, loff_t eof)
2758 struct inode *inode = file_inode(file);
2766 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2767 * position-querying operation. Avoid rewriting the "same"
2768 * f_pos value back to the file because a concurrent read(),
2769 * write() or lseek() might have altered it
2774 * f_lock protects against read/modify/write race with other
2775 * SEEK_CURs. Note that parallel writes and reads behave
2779 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2780 inode_unlock(inode);
2784 * In the generic case the entire file is data, so as long as
2785 * offset isn't at the end of the file then the offset is data.
2792 * There is a virtual hole at the end of the file, so as long as
2793 * offset isn't i_size or larger, return i_size.
2801 return llseek_execute(file, offset, maxsize);
/*
 * ->llseek handler.  For SEEK_END/SEEK_HOLE/SEEK_DATA the current file
 * size must be known, so glimpse the size from the OSTs first, then
 * delegate offset validation/commit to ll_generic_file_llseek_size().
 */
2805 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2807 struct inode *inode = file_inode(file);
2808 loff_t retval, eof = 0;
/* compute the absolute target only for the debug message below */
2811 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2812 (origin == SEEK_CUR) ? file->f_pos : 0);
2813 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2814 PFID(ll_inode2fid(inode)), inode, retval, retval,
2816 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2818 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
/* refresh i_size from the servers before using it as EOF */
2819 retval = ll_glimpse_size(inode);
2822 eof = i_size_read(inode);
2825 retval = ll_generic_file_llseek_size(file, offset, origin,
2826 ll_file_maxbytes(inode), eof);
/*
 * ->flush handler, called on close(2).  Does not force writeback; it only
 * reports (and clears) async write errors recorded earlier, so the
 * application learns about failed background writes at close time.
 * Returns -EIO if an unreported async error is pending, 0 otherwise.
 */
2830 static int ll_flush(struct file *file, fl_owner_t id)
2832 struct inode *inode = file_inode(file);
2833 struct ll_inode_info *lli = ll_i2info(inode);
2834 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2837 LASSERT(!S_ISDIR(inode->i_mode));
2839 /* catch async errors that were recorded back when async writeback
2840 * failed for pages in this mapping. */
2841 rc = lli->lli_async_rc;
/* clear so the same error is not reported twice */
2842 lli->lli_async_rc = 0;
2843 if (lli->lli_clob != NULL) {
2844 err = lov_read_and_clear_async_rc(lli->lli_clob);
2849 /* The application has been told write failure already.
2850 * Do not report failure again. */
2851 if (fd->fd_write_failed)
2853 return rc ? -EIO : 0;
2857 * Called to make sure a portion of the file has been written out.
2858 * If @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to the OSTs.
2860 * Return how many pages have been written.
/*
 * Build and run a CIT_FSYNC cl_io over [start, end] on the inode's
 * cl_object.  On success returns the number of pages written
 * (fio->fi_nr_written); on failure returns the io result.
 *
 * \param mode           one of CL_FSYNC_{NONE,LOCAL,DISCARD,ALL}
 * \param ignore_layout  skip layout validity checks when non-zero
 */
2862 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2863 enum cl_fsync_mode mode, int ignore_layout)
2867 struct cl_fsync_io *fio;
/* reject unknown sync modes before touching the cl environment */
2872 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2873 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2876 env = cl_env_get(&refcheck);
2878 RETURN(PTR_ERR(env));
2880 io = vvp_env_thread_io(env);
2881 io->ci_obj = ll_i2info(inode)->lli_clob;
2882 io->ci_ignore_layout = ignore_layout;
2884 /* initialize parameters for sync */
2885 fio = &io->u.ci_fsync;
2886 fio->fi_start = start;
2888 fio->fi_fid = ll_inode2fid(inode);
2889 fio->fi_mode = mode;
2890 fio->fi_nr_written = 0;
2892 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2893 result = cl_io_loop(env, io);
2895 result = io->ci_result;
2897 result = fio->fi_nr_written;
2898 cl_io_fini(env, io);
2899 cl_env_put(env, &refcheck);
2905 * When dentry is provided (the 'else' case), file_dentry() may be
2906 * null and dentry must be used directly rather than pulled from
2907 * file_dentry() as is done otherwise.
/*
 * ->fsync handler.  Three kernel-API variants are selected by configure:
 * 4-arg (start/end range), 2-arg, and the old 3-arg (dentry) form; the
 * older forms sync the whole file (end = LLONG_MAX).  Waits for dirty
 * page IO, reports/clears recorded async errors, syncs the MDT state via
 * md_fsync(), then flushes OST data with cl_sync_file_range(CL_FSYNC_ALL)
 * and updates fd_write_failed accordingly.
 */
2910 #ifdef HAVE_FILE_FSYNC_4ARGS
2911 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2913 struct dentry *dentry = file_dentry(file);
2914 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2915 int ll_fsync(struct file *file, int datasync)
2917 struct dentry *dentry = file_dentry(file);
2919 loff_t end = LLONG_MAX;
2921 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2924 loff_t end = LLONG_MAX;
2926 struct inode *inode = dentry->d_inode;
2927 struct ll_inode_info *lli = ll_i2info(inode);
2928 struct ptlrpc_request *req;
2932 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2933 PFID(ll_inode2fid(inode)), inode);
2934 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2936 #ifdef HAVE_FILE_FSYNC_4ARGS
2937 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2940 /* fsync's caller has already called _fdata{sync,write}, we want
2941 * that IO to finish before calling the osc and mdc sync methods */
2942 rc = filemap_fdatawait(inode->i_mapping);
2945 /* catch async errors that were recorded back when async writeback
2946 * failed for pages in this mapping. */
2947 if (!S_ISDIR(inode->i_mode)) {
2948 err = lli->lli_async_rc;
2949 lli->lli_async_rc = 0;
2952 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* sync metadata on the MDT */
2957 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
2961 ptlrpc_req_finished(req);
2963 if (S_ISREG(inode->i_mode)) {
2964 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2966 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2967 if (rc == 0 && err < 0)
/* remember outcome so ll_flush() does not double-report */
2970 fd->fd_write_failed = true;
2972 fd->fd_write_failed = false;
2975 #ifdef HAVE_FILE_FSYNC_4ARGS
2976 inode_unlock(inode);
/*
 * ->lock / ->flock handler: translate a VFS file_lock (POSIX fcntl lock
 * or BSD flock) into an LDLM_FLOCK enqueue on the MDT, then mirror the
 * result into the local kernel lock tables.  F_UNLCK is sent as an
 * LCK_NL enqueue rather than a separate cancel message (see comment
 * below).  If the local bookkeeping fails after a successful server
 * enqueue, the server lock is rolled back with a compensating LCK_NL
 * enqueue.
 */
2982 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2984 struct inode *inode = file_inode(file);
2985 struct ll_sb_info *sbi = ll_i2sbi(inode);
2986 struct ldlm_enqueue_info einfo = {
2987 .ei_type = LDLM_FLOCK,
2988 .ei_cb_cp = ldlm_flock_completion_ast,
2989 .ei_cbdata = file_lock,
2991 struct md_op_data *op_data;
2992 struct lustre_handle lockh = { 0 };
2993 union ldlm_policy_data flock = { { 0 } };
2994 int fl_type = file_lock->fl_type;
3000 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3001 PFID(ll_inode2fid(inode)), file_lock);
3003 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3005 if (file_lock->fl_flags & FL_FLOCK) {
3006 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3007 /* flocks are whole-file locks */
3008 flock.l_flock.end = OFFSET_MAX;
3009 /* For flocks owner is determined by the local file descriptor */
3010 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3011 } else if (file_lock->fl_flags & FL_POSIX) {
3012 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3013 flock.l_flock.start = file_lock->fl_start;
3014 flock.l_flock.end = file_lock->fl_end;
3018 flock.l_flock.pid = file_lock->fl_pid;
3020 /* Somewhat ugly workaround for svc lockd.
3021 * lockd installs custom fl_lmops->lm_compare_owner that checks
3022 * for the fl_owner to be the same (which it always is on local node
3023 * I guess between lockd processes) and then compares pid.
3024 * As such we assign pid to the owner field to make it all work,
3025 * conflict with normal locks is unlikely since pid space and
3026 * pointer space for current->files are not intersecting */
3027 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3028 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* map fcntl lock type to an LDLM mode */
3032 einfo.ei_mode = LCK_PR;
3035 /* An unlock request may or may not have any relation to
3036 * existing locks so we may not be able to pass a lock handle
3037 * via a normal ldlm_lock_cancel() request. The request may even
3038 * unlock a byte range in the middle of an existing lock. In
3039 * order to process an unlock request we need all of the same
3040 * information that is given with a normal read or write record
3041 * lock request. To avoid creating another ldlm unlock (cancel)
3042 * message we'll treat a LCK_NL flock request as an unlock. */
3043 einfo.ei_mode = LCK_NL;
3046 einfo.ei_mode = LCK_PW;
3049 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* map fcntl command to enqueue flags */
3064 flags = LDLM_FL_BLOCK_NOWAIT;
3070 flags = LDLM_FL_TEST_LOCK;
3073 CERROR("unknown fcntl lock command: %d\n", cmd);
3077 /* Save the old mode so that if the mode in the lock changes we
3078 * can decrement the appropriate reader or writer refcount. */
3079 file_lock->fl_type = einfo.ei_mode;
3081 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3082 LUSTRE_OPC_ANY, NULL);
3083 if (IS_ERR(op_data))
3084 RETURN(PTR_ERR(op_data));
3086 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags="LPX64", mode=%u, "
3087 "start="LPU64", end="LPU64"\n", PFID(ll_inode2fid(inode)),
3088 flock.l_flock.pid, flags, einfo.ei_mode,
3089 flock.l_flock.start, flock.l_flock.end);
3091 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data, &lockh,
3094 /* Restore the file lock type if not TEST lock. */
3095 if (!(flags & LDLM_FL_TEST_LOCK))
3096 file_lock->fl_type = fl_type;
/* mirror the server's decision into the kernel's local lock tables */
3098 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
3099 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
3100 !(flags & LDLM_FL_TEST_LOCK))
3101 rc2 = locks_lock_file_wait(file, file_lock);
3103 if ((file_lock->fl_flags & FL_FLOCK) &&
3104 (rc == 0 || file_lock->fl_type == F_UNLCK))
3105 rc2 = flock_lock_file_wait(file, file_lock);
3106 if ((file_lock->fl_flags & FL_POSIX) &&
3107 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3108 !(flags & LDLM_FL_TEST_LOCK))
3109 rc2 = posix_lock_file_wait(file, file_lock);
3110 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
/* local bookkeeping failed: undo the server-side lock via LCK_NL */
3112 if (rc2 && file_lock->fl_type != F_UNLCK) {
3113 einfo.ei_mode = LCK_NL;
3114 md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data,
3119 ll_finish_md_op_data(op_data);
/*
 * Look up a child's FID (and optionally instantiate its inode) by name
 * under @parent, via an MDS getattr-by-name RPC.
 *
 * \param fid    [OUT] FID of the named entry
 * \param inode  [OUT] if non-NULL, a new/existing inode for the entry
 */
3124 int ll_get_fid_by_name(struct inode *parent, const char *name,
3125 int namelen, struct lu_fid *fid,
3126 struct inode **inode)
3128 struct md_op_data *op_data = NULL;
3129 struct mdt_body *body;
3130 struct ptlrpc_request *req;
3134 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3135 LUSTRE_OPC_ANY, NULL);
3136 if (IS_ERR(op_data))
3137 RETURN(PTR_ERR(op_data));
/* only FID and type are needed from the server */
3139 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
3140 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3141 ll_finish_md_op_data(op_data);
3145 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3147 GOTO(out_req, rc = -EFAULT);
3149 *fid = body->mbo_fid1;
3152 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
3154 ptlrpc_req_finished(req);
/*
 * Migrate the entry @name under @parent to MDT @mdtidx (lfs migrate of
 * metadata).  Resolves the child inode (dcache first, then by-name RPC),
 * sanity-checks it, and for regular files takes a write lease and the
 * current data version before issuing the MDS_RENAME_MIGRATE rename.
 * Migrating the filesystem root is refused.
 */
3158 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3159 const char *name, int namelen)
3161 struct dentry *dchild = NULL;
3162 struct inode *child_inode = NULL;
3163 struct md_op_data *op_data;
3164 struct ptlrpc_request *request = NULL;
3165 struct obd_client_handle *och = NULL;
3167 struct mdt_body *body;
3169 __u64 data_version = 0;
3172 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3173 name, PFID(ll_inode2fid(parent)), mdtidx);
3175 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3176 0, LUSTRE_OPC_ANY, NULL);
3177 if (IS_ERR(op_data))
3178 RETURN(PTR_ERR(op_data));
3180 /* Get child FID first */
3181 qstr.hash = full_name_hash(name, namelen);
/* try the dcache before falling back to a getattr-by-name RPC */
3184 dchild = d_lookup(file_dentry(file), &qstr);
3185 if (dchild != NULL) {
3186 if (dchild->d_inode != NULL)
3187 child_inode = igrab(dchild->d_inode);
3191 if (child_inode == NULL) {
3192 rc = ll_get_fid_by_name(parent, name, namelen,
3193 &op_data->op_fid3, &child_inode);
3198 if (child_inode == NULL)
3199 GOTO(out_free, rc = -EINVAL);
3202 * lfs migrate command needs to be blocked on the client
3203 * by checking the migrate FID against the FID of the
3206 if (child_inode == parent->i_sb->s_root->d_inode)
3207 GOTO(out_iput, rc = -EINVAL);
3209 inode_lock(child_inode);
3210 op_data->op_fid3 = *ll_inode2fid(child_inode);
3211 if (!fid_is_sane(&op_data->op_fid3)) {
3212 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
3213 ll_get_fsname(parent->i_sb, NULL, 0), name,
3214 PFID(&op_data->op_fid3));
3215 GOTO(out_unlock, rc = -EINVAL);
3218 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3220 GOTO(out_unlock, rc);
/* nothing to do if the file already lives on the target MDT */
3223 CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name,
3224 PFID(&op_data->op_fid3), mdtidx);
3225 GOTO(out_unlock, rc = 0);
3228 if (S_ISREG(child_inode->i_mode)) {
/* take a write lease so data version stays stable during migrate */
3229 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
3233 GOTO(out_unlock, rc);
3236 rc = ll_data_version(child_inode, &data_version,
3239 GOTO(out_close, rc);
3241 op_data->op_handle = och->och_fh;
3242 op_data->op_data = och->och_mod;
3243 op_data->op_data_version = data_version;
3244 op_data->op_lease_handle = och->och_lease_handle;
3245 op_data->op_bias |= MDS_RENAME_MIGRATE;
3248 op_data->op_mds = mdtidx;
3249 op_data->op_cli_flags = CLI_MIGRATE;
/* migration is implemented as a same-name rename to another MDT */
3250 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3251 namelen, name, namelen, &request);
3253 ll_update_times(request, parent);
3255 if (request != NULL) {
3256 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
3258 ptlrpc_req_finished(request);
3259 GOTO(out_close, rc = -EPROTO);
3262 /* If the server does release layout lock, then we cleanup
3263 * the client och here, otherwise release it in out_close: */
3265 body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
3266 obd_mod_put(och->och_mod);
3267 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
3269 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
3273 ptlrpc_req_finished(request);
3276 /* Try again if the file layout has changed. */
3277 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode)) {
3282 if (och != NULL) /* close the file */
3283 ll_lease_close(och, child_inode, NULL);
3285 clear_nlink(child_inode);
3287 inode_unlock(child_inode);
3291 ll_finish_md_op_data(op_data);
3296 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3304 * test if some locks matching bits and l_req_mode are acquired
3305 * - bits can be in different locks
3306 * - if found clear the common lock bits in *bits
3307 * - the bits not found, are kept in *bits
3309 * \param bits [IN] searched lock bits [IN]
3310 * \param l_req_mode [IN] searched lock mode
3311 * \retval boolean, true iff all bits are found
/*
 * Test whether MD (inodebits) locks covering *bits are already held on
 * @inode, possibly spread over several locks.  Matched bits are cleared
 * from *bits; unmatched bits remain set.  LCK_MINMODE means "any mode".
 * Returns non-zero iff all requested bits were found.
 */
3313 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
3315 struct lustre_handle lockh;
3316 union ldlm_policy_data policy;
3317 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
3318 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
3327 fid = &ll_i2info(inode)->lli_fid;
3328 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3329 ldlm_lockname[mode]);
/* TEST_LOCK: match without taking a new reference on the lock */
3331 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
3332 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3333 policy.l_inodebits.bits = *bits & (1 << i);
3334 if (policy.l_inodebits.bits == 0)
3337 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3338 &policy, mode, &lockh)) {
3339 struct ldlm_lock *lock;
3341 lock = ldlm_handle2lock(&lockh);
3344 ~(lock->l_policy_data.l_inodebits.bits);
3345 LDLM_LOCK_PUT(lock);
3347 *bits &= ~policy.l_inodebits.bits;
/*
 * Match (and take a reference on) an MD inodebits lock with the given
 * @bits and @mode on @inode.  On success the matched lock handle is
 * stored in @lockh; the return value is the matched mode (0 if none).
 */
3354 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
3355 struct lustre_handle *lockh, __u64 flags,
3356 enum ldlm_mode mode)
3358 union ldlm_policy_data policy = { .l_inodebits = { bits } };
3363 fid = &ll_i2info(inode)->lli_fid;
3364 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3366 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3367 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the result of an inode revalidation RPC.  -ENOENT on an
 * already-unlinked inode is normally not an error (except for striped
 * directories with a bad stripe, which must be revalidated again);
 * other errors are logged, with EACCES/EIDRM demoted to D_INFO.
 */
3372 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3374 /* Already unlinked. Just update nlink and return success */
3375 if (rc == -ENOENT) {
3377 /* If it is striped directory, and there is bad stripe
3378 * Let's revalidate the dentry again, instead of returning
3380 if (S_ISDIR(inode->i_mode) &&
3381 ll_i2info(inode)->lli_lsm_md != NULL)
3384 /* This path cannot be hit for regular files unless in
3385 * case of obscure races, so no need to validate
3387 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3389 } else if (rc != 0) {
3390 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3391 "%s: revalidate FID "DFID" error: rc = %d\n",
3392 ll_get_fsname(inode->i_sb, NULL, 0),
3393 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate a dentry's inode attributes against the MDS for the lock
 * bits in @ibits.  Two paths:
 *  - OBD_CONNECT_ATTRFID servers: an intent getattr/lookup by FID, which
 *    also refreshes dcache state (invalidating unlinked dentries);
 *  - otherwise: a plain md_getattr(), but only if no matching MD lock is
 *    already held locally.
 */
3399 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3401 struct inode *inode = dentry->d_inode;
3402 struct ptlrpc_request *req = NULL;
3403 struct obd_export *exp;
3407 LASSERT(inode != NULL);
3409 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3410 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3412 exp = ll_i2mdexp(inode);
3414 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3415 * But under CMD case, it caused some lock issues, should be fixed
3416 * with new CMD ibits lock. See bug 12718 */
3417 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3418 struct lookup_intent oit = { .it_op = IT_GETATTR };
3419 struct md_op_data *op_data;
3421 if (ibits == MDS_INODELOCK_LOOKUP)
3422 oit.it_op = IT_LOOKUP;
3424 /* Call getattr by fid, so do not provide name at all. */
3425 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3426 dentry->d_inode, NULL, 0, 0,
3427 LUSTRE_OPC_ANY, NULL);
3428 if (IS_ERR(op_data))
3429 RETURN(PTR_ERR(op_data));
3431 rc = md_intent_lock(exp, op_data, &oit, &req,
3432 &ll_md_blocking_ast, 0);
3433 ll_finish_md_op_data(op_data);
3435 rc = ll_inode_revalidate_fini(inode, rc);
3439 rc = ll_revalidate_it_finish(req, &oit, dentry);
3441 ll_intent_release(&oit);
3445 /* Unlinked? Unhash dentry, so it is not picked up later by
3446 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3447 here to preserve get_cwd functionality on 2.6.
3449 if (!dentry->d_inode->i_nlink) {
3450 ll_lock_dcache(inode);
3451 d_lustre_invalidate(dentry, 0);
3452 ll_unlock_dcache(inode);
3455 ll_lookup_finish_locks(&oit, dentry);
3456 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3457 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3458 u64 valid = OBD_MD_FLGETATTR;
3459 struct md_op_data *op_data;
/* regular files also need striping (EA) information */
3462 if (S_ISREG(inode->i_mode)) {
3463 rc = ll_get_default_mdsize(sbi, &ealen);
3466 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3469 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3470 0, ealen, LUSTRE_OPC_ANY,
3472 if (IS_ERR(op_data))
3473 RETURN(PTR_ERR(op_data));
3475 op_data->op_valid = valid;
3476 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3477 ll_finish_md_op_data(op_data);
3479 rc = ll_inode_revalidate_fini(inode, rc);
3483 rc = ll_prep_inode(&inode, req, NULL, NULL);
3486 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge the attributes of all stripes (via
 * md_merge_attr) into the master inode: nlink, blocks, size and the
 * cached a/m/ctime in ll_inode_info.
 */
3490 static int ll_merge_md_attr(struct inode *inode)
3492 struct cl_attr attr = { 0 };
3495 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3496 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3497 &attr, ll_md_blocking_ast);
3501 set_nlink(inode, attr.cat_nlink);
3502 inode->i_blocks = attr.cat_blocks;
3503 i_size_write(inode, attr.cat_size);
3505 ll_i2info(inode)->lli_atime = attr.cat_atime;
3506 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
3507 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/*
 * Full revalidation: refresh MD attributes via __ll_inode_revalidate(),
 * merge stripe attributes for striped directories, copy the cached
 * timestamps into the inode, and for regular files glimpse the size
 * from the OSTs (unless an HSM restore is in progress, in which case
 * the MDT-provided size is already authoritative).
 */
3513 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3515 struct inode *inode = dentry->d_inode;
3519 rc = __ll_inode_revalidate(dentry, ibits);
3523 /* if object isn't regular file, don't validate size */
3524 if (!S_ISREG(inode->i_mode)) {
3525 if (S_ISDIR(inode->i_mode) &&
3526 ll_i2info(inode)->lli_lsm_md != NULL) {
3527 rc = ll_merge_md_attr(inode);
3532 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
3533 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
3534 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
3536 /* In case of restore, the MDT has the right size and has
3537 * already send it back without granting the layout lock,
3538 * inode is up-to-date so glimpse is useless.
3539 * Also to glimpse we need the layout, in case of a running
3540 * restore the MDT holds the layout lock so the glimpse will
3541 * block up to the end of restore (getattr will block)
3543 if (!ll_file_test_flag(ll_i2info(inode), LLIF_FILE_RESTORING))
3544 rc = ll_glimpse_size(inode);
/*
 * ->getattr handler: revalidate UPDATE|LOOKUP bits, then fill *stat
 * from the (now fresh) inode.  With a 32-bit client API the inode
 * number is rebuilt from the FID.
 */
3549 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3551 struct inode *inode = de->d_inode;
3552 struct ll_sb_info *sbi = ll_i2sbi(inode);
3553 struct ll_inode_info *lli = ll_i2info(inode);
3556 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3557 MDS_INODELOCK_LOOKUP);
3558 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
/* fault-injection point for getattr delay testing */
3563 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
3565 stat->dev = inode->i_sb->s_dev;
3566 if (ll_need_32bit_api(sbi))
3567 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3569 stat->ino = inode->i_ino;
3570 stat->mode = inode->i_mode;
3571 stat->uid = inode->i_uid;
3572 stat->gid = inode->i_gid;
3573 stat->rdev = inode->i_rdev;
3574 stat->atime = inode->i_atime;
3575 stat->mtime = inode->i_mtime;
3576 stat->ctime = inode->i_ctime;
3577 stat->blksize = 1 << inode->i_blkbits;
3579 stat->nlink = inode->i_nlink;
3580 stat->size = i_size_read(inode);
3581 stat->blocks = inode->i_blocks;
/*
 * ->fiemap handler: marshal the kernel's fiemap_extent_info into a
 * struct fiemap buffer, run ll_do_fiemap(), and copy flags and mapped
 * extents back to user space.  Only the first user extent is copied in
 * (it may carry input state such as FIEMAP_EXTENT_LAST continuation).
 */
3586 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3587 __u64 start, __u64 len)
3591 struct fiemap *fiemap;
3592 unsigned int extent_count = fieinfo->fi_extents_max;
3594 num_bytes = sizeof(*fiemap) + (extent_count *
3595 sizeof(struct fiemap_extent));
3596 OBD_ALLOC_LARGE(fiemap, num_bytes);
3601 fiemap->fm_flags = fieinfo->fi_flags;
3602 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3603 fiemap->fm_start = start;
3604 fiemap->fm_length = len;
3605 if (extent_count > 0 &&
3606 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3607 sizeof(struct fiemap_extent)) != 0)
3608 GOTO(out, rc = -EFAULT);
3610 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3612 fieinfo->fi_flags = fiemap->fm_flags;
3613 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3614 if (extent_count > 0 &&
3615 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3616 fiemap->fm_mapped_extents *
3617 sizeof(struct fiemap_extent)) != 0)
3618 GOTO(out, rc = -EFAULT);
3620 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * ->get_acl handler: return a referenced copy of the cached POSIX ACL
 * under lli_lock.  The caller (VFS permission checking) releases the
 * reference.
 */
3624 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3626 struct ll_inode_info *lli = ll_i2info(inode);
3627 struct posix_acl *acl = NULL;
3630 spin_lock(&lli->lli_lock);
3631 /* VFS' acl_permission_check->check_acl will release the refcount */
3632 acl = posix_acl_dup(lli->lli_posix_acl);
3633 spin_unlock(&lli->lli_lock);
/*
 * ACL callback for kernels whose generic_permission() takes an acl
 * check function (pre 2-arg API).  Under RCU walk (IPERM_FLAG_RCU) we
 * cannot block, so bail out; otherwise fetch the cached ACL and run the
 * standard posix_acl_permission() check.  Without CONFIG_FS_POSIX_ACL
 * the function degenerates to a stub.
 */
3638 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
3640 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3641 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3643 ll_check_acl(struct inode *inode, int mask)
3646 # ifdef CONFIG_FS_POSIX_ACL
3647 struct posix_acl *acl;
3651 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3652 if (flags & IPERM_FLAG_RCU)
3655 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3660 rc = posix_acl_permission(inode, acl, mask);
3661 posix_acl_release(acl);
3664 # else /* !CONFIG_FS_POSIX_ACL */
3666 # endif /* CONFIG_FS_POSIX_ACL */
3668 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * ->permission handler (three kernel-API variants).  Revalidates the
 * root inode on first access, then applies root squashing: if the
 * caller is root and the fs is configured with rsi_uid != 0 (and
 * LL_SBI_NOROOTSQUASH is not set), temporarily overrides credentials
 * with the squash uid/gid and drops filesystem capabilities before
 * delegating to generic permission checking.
 */
3670 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3671 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3673 # ifdef HAVE_INODE_PERMISION_2ARGS
3674 int ll_inode_permission(struct inode *inode, int mask)
3676 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3681 struct ll_sb_info *sbi;
3682 struct root_squash_info *squash;
3683 struct cred *cred = NULL;
3684 const struct cred *old_cred = NULL;
3686 bool squash_id = false;
/* cannot block under RCU-walk; ask VFS to retry in ref-walk mode */
3689 #ifdef MAY_NOT_BLOCK
3690 if (mask & MAY_NOT_BLOCK)
3692 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3693 if (flags & IPERM_FLAG_RCU)
3697 /* as root inode are NOT getting validated in lookup operation,
3698 * need to do it before permission check. */
3700 if (inode == inode->i_sb->s_root->d_inode) {
3701 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3702 MDS_INODELOCK_LOOKUP);
3707 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3708 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3710 /* squash fsuid/fsgid if needed */
3711 sbi = ll_i2sbi(inode);
3712 squash = &sbi->ll_squash;
3713 if (unlikely(squash->rsi_uid != 0 &&
3714 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
3715 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
3719 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
3720 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
3721 squash->rsi_uid, squash->rsi_gid);
3723 /* update current process's credentials
3724 * and FS capability */
3725 cred = prepare_creds();
3729 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
3730 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* drop all filesystem-related capabilities from the squashed creds */
3731 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
3732 if ((1 << cap) & CFS_CAP_FS_MASK)
3733 cap_lower(cred->cap_effective, cap);
3735 old_cred = override_creds(cred);
3738 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
3739 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3740 /* restore current process's credentials and FS capability */
3742 revert_creds(old_cred);
3749 /* -o localflock - only provides locally consistent flock locks */
/* Default file operations ("-o localflock"): no ->flock/->lock entries,
 * so flock locks are only locally consistent. */
3750 struct file_operations ll_file_operations = {
3751 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
3752 # ifdef HAVE_SYNC_READ_WRITE
3753 .read = new_sync_read,
3754 .write = new_sync_write,
3756 .read_iter = ll_file_read_iter,
3757 .write_iter = ll_file_write_iter,
3758 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3759 .read = ll_file_read,
3760 .aio_read = ll_file_aio_read,
3761 .write = ll_file_write,
3762 .aio_write = ll_file_aio_write,
3763 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3764 .unlocked_ioctl = ll_file_ioctl,
3765 .open = ll_file_open,
3766 .release = ll_file_release,
3767 .mmap = ll_file_mmap,
3768 .llseek = ll_file_seek,
3769 .splice_read = ll_file_splice_read,
/* File operations with cluster-wide locking ("-o flock"): identical to
 * the default table but routes ->flock and ->lock to ll_file_flock. */
3774 struct file_operations ll_file_operations_flock = {
3775 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
3776 # ifdef HAVE_SYNC_READ_WRITE
3777 .read = new_sync_read,
3778 .write = new_sync_write,
3779 # endif /* HAVE_SYNC_READ_WRITE */
3780 .read_iter = ll_file_read_iter,
3781 .write_iter = ll_file_write_iter,
3782 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3783 .read = ll_file_read,
3784 .aio_read = ll_file_aio_read,
3785 .write = ll_file_write,
3786 .aio_write = ll_file_aio_write,
3787 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3788 .unlocked_ioctl = ll_file_ioctl,
3789 .open = ll_file_open,
3790 .release = ll_file_release,
3791 .mmap = ll_file_mmap,
3792 .llseek = ll_file_seek,
3793 .splice_read = ll_file_splice_read,
3796 .flock = ll_file_flock,
3797 .lock = ll_file_flock
3800 /* These are for -o noflock - to return ENOSYS on flock calls */
3801 struct file_operations ll_file_operations_noflock = {
3802 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
3803 # ifdef HAVE_SYNC_READ_WRITE
3804 .read = new_sync_read,
3805 .write = new_sync_write,
3806 # endif /* HAVE_SYNC_READ_WRITE */
3807 .read_iter = ll_file_read_iter,
3808 .write_iter = ll_file_write_iter,
3809 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3810 .read = ll_file_read,
3811 .aio_read = ll_file_aio_read,
3812 .write = ll_file_write,
3813 .aio_write = ll_file_aio_write,
3814 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3815 .unlocked_ioctl = ll_file_ioctl,
3816 .open = ll_file_open,
3817 .release = ll_file_release,
3818 .mmap = ll_file_mmap,
3819 .llseek = ll_file_seek,
3820 .splice_read = ll_file_splice_read,
/* lock entries reject all lock requests (see ll_file_noflock) */
3823 .flock = ll_file_noflock,
3824 .lock = ll_file_noflock
/* Inode operations for regular files: attribute, xattr, fiemap and
 * (when the kernel supports ->get_acl) POSIX ACL handlers. */
3827 struct inode_operations ll_file_inode_operations = {
3828 .setattr = ll_setattr,
3829 .getattr = ll_getattr,
3830 .permission = ll_inode_permission,
3831 .setxattr = ll_setxattr,
3832 .getxattr = ll_getxattr,
3833 .listxattr = ll_listxattr,
3834 .removexattr = ll_removexattr,
3835 .fiemap = ll_fiemap,
3836 #ifdef HAVE_IOP_GET_ACL
3837 .get_acl = ll_get_acl,
3841 /* dynamic ioctl number support routines */
/* Registry of dynamically registered ioctl handlers: an rwsem-protected
 * list of llioc_data entries. */
3842 static struct llioc_ctl_data {
3843 struct rw_semaphore ioc_sem;
3844 struct list_head ioc_head;
3846 __RWSEM_INITIALIZER(llioc.ioc_sem),
3847 LIST_HEAD_INIT(llioc.ioc_head)
/* One registered handler: callback plus the ioctl commands it serves
 * (iocd_cmd is a flexible trailing array of iocd_count entries). */
3852 struct list_head iocd_list;
3853 unsigned int iocd_size;
3854 llioc_callback_t iocd_cb;
3855 unsigned int iocd_count;
3856 unsigned int iocd_cmd[0];
/*
 * Register a dynamic ioctl handler @cb for the @count command numbers
 * in @cmd.  Returns an opaque cookie for ll_iocontrol_unregister(), or
 * NULL on invalid arguments / allocation failure.
 */
3859 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3862 struct llioc_data *in_data = NULL;
3865 if (cb == NULL || cmd == NULL ||
3866 count > LLIOC_MAX_CMD || count < 0)
3869 size = sizeof(*in_data) + count * sizeof(unsigned int);
3870 OBD_ALLOC(in_data, size);
3871 if (in_data == NULL)
3874 memset(in_data, 0, sizeof(*in_data));
3875 in_data->iocd_size = size;
3876 in_data->iocd_cb = cb;
3877 in_data->iocd_count = count;
3878 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3880 down_write(&llioc.ioc_sem);
3881 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3882 up_write(&llioc.ioc_sem);
/*
 * Unregister a dynamic ioctl handler previously returned by
 * ll_iocontrol_register().  Warns if @magic is not found in the list.
 */
3887 void ll_iocontrol_unregister(void *magic)
3889 struct llioc_data *tmp;
3894 down_write(&llioc.ioc_sem);
3895 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3897 unsigned int size = tmp->iocd_size;
3899 list_del(&tmp->iocd_list);
/* drop the lock before freeing; the entry is already unlinked */
3900 up_write(&llioc.ioc_sem);
3902 OBD_FREE(tmp, size);
3906 up_write(&llioc.ioc_sem);
3908 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3911 EXPORT_SYMBOL(ll_iocontrol_register);
3912 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch @cmd to the registered dynamic ioctl handlers; iteration
 * stops when a handler returns LLIOC_STOP.  The handler's status is
 * passed back through *rcp (default -EINVAL when nothing matched).
 */
3914 static enum llioc_iter
3915 ll_iocontrol_call(struct inode *inode, struct file *file,
3916 unsigned int cmd, unsigned long arg, int *rcp)
3918 enum llioc_iter ret = LLIOC_CONT;
3919 struct llioc_data *data;
3920 int rc = -EINVAL, i;
3922 down_read(&llioc.ioc_sem);
3923 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3924 for (i = 0; i < data->iocd_count; i++) {
3925 if (cmd != data->iocd_cmd[i])
3928 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3932 if (ret == LLIOC_STOP)
3935 up_read(&llioc.ioc_sem);
/*
 * Push a layout configuration into the cl_object stack for @inode.
 * For OBJECT_CONF_SET the layout lock is made matchable only after the
 * layout has been applied (so no stale layout can be observed), and the
 * inode's cached layout generation is refreshed from the object.
 */
3942 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3944 struct ll_inode_info *lli = ll_i2info(inode);
3945 struct cl_object *obj = lli->lli_clob;
3954 env = cl_env_get(&refcheck);
3956 RETURN(PTR_ERR(env));
3958 rc = cl_conf_set(env, lli->lli_clob, conf);
3962 if (conf->coc_opc == OBJECT_CONF_SET) {
3963 struct ldlm_lock *lock = conf->coc_lock;
3964 struct cl_layout cl = {
3968 LASSERT(lock != NULL);
3969 LASSERT(ldlm_has_layout(lock));
3971 /* it can only be allowed to match after layout is
3972 * applied to inode otherwise false layout would be
3973 * seen. Applying layout shoud happen before dropping
3974 * the intent lock. */
3975 ldlm_lock_allow_match(lock);
3977 rc = cl_object_layout_get(env, obj, &cl);
3982 DFID": layout version change: %u -> %u\n",
3983 PFID(&lli->lli_fid), ll_layout_version_get(lli),
3985 ll_layout_version_set(lli, cl.cl_layout_gen);
3989 cl_env_put(env, &refcheck);
3994 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
3995 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3998 struct ll_sb_info *sbi = ll_i2sbi(inode);
3999 struct ptlrpc_request *req;
4000 struct mdt_body *body;
4007 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4008 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4009 lock->l_lvb_data, lock->l_lvb_len);
/* nothing to do if the layout LVB is already attached to the lock */
4011 if (lock->l_lvb_data != NULL)
4014 /* if layout lock was granted right away, the layout is returned
4015 * within DLM_LVB of dlm reply; otherwise if the lock was ever
4016 * blocked and then granted via completion ast, we have to fetch
4017 * layout here. Please note that we can't use the LVB buffer in
4018 * completion AST because it doesn't have a large enough buffer */
4019 rc = ll_get_default_mdsize(sbi, &lmmsize);
4021 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4022 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
4027 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4029 GOTO(out, rc = -EPROTO);
4031 lmmsize = body->mbo_eadatasize;
4032 if (lmmsize == 0) /* empty layout */
4035 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4037 GOTO(out, rc = -EFAULT);
/* copy the LOV EA into a private buffer and attach it as the lock LVB */
4039 OBD_ALLOC_LARGE(lvbdata, lmmsize);
4040 if (lvbdata == NULL)
4041 GOTO(out, rc = -ENOMEM);
4043 memcpy(lvbdata, lmm, lmmsize);
4044 lock_res_and_lock(lock);
/* another thread may have attached an LVB meanwhile; keep theirs */
4045 if (unlikely(lock->l_lvb_data == NULL)) {
4046 lock->l_lvb_type = LVB_T_LAYOUT;
4047 lock->l_lvb_data = lvbdata;
4048 lock->l_lvb_len = lmmsize;
4051 unlock_res_and_lock(lock);
4054 OBD_FREE_LARGE(lvbdata, lmmsize);
4059 ptlrpc_req_finished(req);
4064 * Apply the layout to the inode. Layout lock is held and will be released
/*
 * Makes sure the lock's LVB contains the layout (fetching it from the
 * MDT via ll_layout_fetch() if necessary), applies it to the inode's
 * cl_object with an OBJECT_CONF_SET ll_layout_conf() call, then drops
 * the lock reference.  If applying fails with -EBUSY because IO is
 * still using the old layout, waits for that IO to drain with an
 * OBJECT_CONF_WAIT configuration.
 *
 * \param lockh  handle of the granted layout DLM lock; its reference
 *               is dropped before return
 * \param mode   mode the lock was matched/granted with
 * \param inode  inode to apply the layout to
 *
 * \retval 0 on success, negative errno on failure.
 */
4067 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4068 struct inode *inode)
4070 struct ll_inode_info *lli = ll_i2info(inode);
4071 struct ll_sb_info *sbi = ll_i2sbi(inode);
4072 struct ldlm_lock *lock;
4073 struct cl_object_conf conf;
4076 bool wait_layout = false;
4079 LASSERT(lustre_handle_is_used(lockh));
4081 lock = ldlm_handle2lock(lockh);
4082 LASSERT(lock != NULL);
4083 LASSERT(ldlm_has_layout(lock));
4085 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4086 PFID(&lli->lli_fid), inode);
4088 /* in case this is a caching lock and reinstate with new inode */
4089 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
4091 lock_res_and_lock(lock);
4092 lvb_ready = ldlm_is_lvb_ready(lock);
4093 unlock_res_and_lock(lock);
4094 /* checking lvb_ready is racy but this is okay. The worst case is
4095 * that multi processes may configure the file on the same time. */
/* make sure the lock's LVB actually contains the layout */
4100 rc = ll_layout_fetch(inode, lock);
4104 /* for layout lock, lmm is stored in lock's lvb.
4105 * lvb_data is immutable if the lock is held so it's safe to access it
4108 * set layout to file. Unlikely this will fail as old layout was
4109 * surely eliminated */
4110 memset(&conf, 0, sizeof conf);
4111 conf.coc_opc = OBJECT_CONF_SET;
4112 conf.coc_inode = inode;
4113 conf.coc_lock = lock;
4114 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
4115 conf.u.coc_layout.lb_len = lock->l_lvb_len;
4116 rc = ll_layout_conf(inode, &conf);
4118 /* refresh layout failed, need to wait */
4119 wait_layout = rc == -EBUSY;
/* done with the lock itself; drop both references */
4123 LDLM_LOCK_PUT(lock);
4124 ldlm_lock_decref(lockh, mode);
4126 /* wait for IO to complete if it's still being used. */
4128 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
4129 ll_get_fsname(inode->i_sb, NULL, 0),
4130 PFID(&lli->lli_fid), inode);
4132 memset(&conf, 0, sizeof conf);
4133 conf.coc_opc = OBJECT_CONF_WAIT;
4134 conf.coc_inode = inode;
4135 rc = ll_layout_conf(inode, &conf);
4139 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4140 ll_get_fsname(inode->i_sb, NULL, 0),
4141 PFID(&lli->lli_fid), rc);
/*
 * Refresh the layout of @inode; the caller holds lli_layout_mutex
 * (see ll_layout_refresh()).  First try to match an already-cached
 * layout DLM lock locally; if none exists, enqueue an IT_LAYOUT
 * intent to the MDT and apply the resulting lock with
 * ll_layout_lock_set().
 *
 * \retval 0 on success, negative errno on failure.
 */
4146 static int ll_layout_refresh_locked(struct inode *inode)
4148 struct ll_inode_info *lli = ll_i2info(inode);
4149 struct ll_sb_info *sbi = ll_i2sbi(inode);
4150 struct md_op_data *op_data;
4151 struct lookup_intent it;
4152 struct lustre_handle lockh;
4153 enum ldlm_mode mode;
4154 struct ldlm_enqueue_info einfo = {
4155 .ei_type = LDLM_IBITS,
4157 .ei_cb_bl = &ll_md_blocking_ast,
4158 .ei_cb_cp = &ldlm_completion_ast,
4164 /* mostly layout lock is caching on the local side, so try to match
4165 * it before grabbing layout lock mutex. */
4166 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4167 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
4168 if (mode != 0) { /* hit cached lock */
4169 rc = ll_layout_lock_set(&lockh, mode, inode);
/* no cached lock: prepare an enqueue to the MDT */
4176 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4177 0, 0, LUSTRE_OPC_ANY, NULL);
4178 if (IS_ERR(op_data))
4179 RETURN(PTR_ERR(op_data));
4181 /* have to enqueue one */
4182 memset(&it, 0, sizeof(it));
4183 it.it_op = IT_LAYOUT;
4184 lockh.cookie = 0ULL;
4186 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4187 ll_get_fsname(inode->i_sb, NULL, 0),
4188 PFID(&lli->lli_fid), inode);
4190 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, &it, op_data, &lockh, 0);
/* the intent request itself is no longer needed */
4191 if (it.it_request != NULL)
4192 ptlrpc_req_finished(it.it_request);
4193 it.it_request = NULL;
4195 ll_finish_md_op_data(op_data);
/* take over the lock mode from the intent before dropping its
 * reference; ll_layout_lock_set() will release the lock itself */
4197 mode = it.it_lock_mode;
4198 it.it_lock_mode = 0;
4199 ll_intent_drop_lock(&it);
4202 /* set lock data in case this is a new lock */
4203 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4204 rc = ll_layout_lock_set(&lockh, mode, inode);
4213 * This function checks if there exists a LAYOUT lock on the client side,
4214 * or enqueues it if it doesn't have one in cache.
4216 * This function will not hold layout lock so it may be revoked any time after
4217 * this function returns. Any operations that depend on the layout should be redone
4220 * This function should be called before lov_io_init() to get an uptodate
4221 * layout version, the caller should save the version number and after IO
4222 * is finished, this function should be called again to verify that layout
4223 * is not changed during IO time.
4225 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4227 struct ll_inode_info *lli = ll_i2info(inode);
4228 struct ll_sb_info *sbi = ll_i2sbi(inode);
/* report the current generation; done if layout locking is disabled
 * on this mount or a layout generation is already valid */
4232 *gen = ll_layout_version_get(lli);
4233 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
4237 LASSERT(fid_is_sane(ll_inode2fid(inode)));
4238 LASSERT(S_ISREG(inode->i_mode));
4240 /* take layout lock mutex to enqueue layout lock exclusively. */
4241 mutex_lock(&lli->lli_layout_mutex);
4243 rc = ll_layout_refresh_locked(inode);
/* hand the generation now in effect back to the caller */
4247 *gen = ll_layout_version_get(lli);
4249 mutex_unlock(&lli->lli_layout_mutex);
4255 * This function sends a restore request to the MDT
4257 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4259 struct hsm_user_request *hur;
4263 len = sizeof(struct hsm_user_request) +
4264 sizeof(struct hsm_user_item);
4265 OBD_ALLOC(hur, len);
4269 hur->hur_request.hr_action = HUA_RESTORE;
4270 hur->hur_request.hr_archive_id = 0;
4271 hur->hur_request.hr_flags = 0;
4272 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4273 sizeof(hur->hur_user_item[0].hui_fid));
4274 hur->hur_user_item[0].hui_extent.offset = offset;
4275 hur->hur_user_item[0].hui_extent.length = length;
4276 hur->hur_request.hr_itemcount = 1;
4277 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,