4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2016, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
48 #include <lustre/ll_fiemap.h>
50 #include <lustre_ioctl.h>
51 #include <lustre_swab.h>
53 #include "cl_object.h"
54 #include "llite_internal.h"
55 #include "vvp_internal.h"
/* Forward declarations for helpers defined later in this file. */
58 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
60 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
63 static enum llioc_iter
64 ll_iocontrol_call(struct inode *inode, struct file *file,
65 unsigned int cmd, unsigned long arg, int *rcp);
/* Allocate a per-open-file ll_file_data from ll_file_data_slab.
 * GFP_NOFS avoids recursing into the filesystem under memory pressure.
 * NOTE(review): the NULL-allocation check and return are in lines elided
 * from this excerpt — confirm against the full source. */
67 static struct ll_file_data *ll_file_data_get(void)
69 struct ll_file_data *fd;
71 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
/* Start each open with a clean write-failure state. */
75 fd->fd_write_failed = false;
/* Return a ll_file_data to its slab cache; counterpart of ll_file_data_get(). */
80 static void ll_file_data_put(struct ll_file_data *fd)
83 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
87 * Packs all the attributes into @op_data for the CLOSE rpc.
/* Copies mode, a/m/ctime, size, blocks and flags from the VFS inode into
 * @op_data, marks them valid via ia_valid, and records the open handle
 * (och->och_fh) so the MDT knows which open is being closed. */
89 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
90 struct obd_client_handle *och)
94 ll_prep_md_op_data(op_data, inode, NULL, NULL,
95 0, 0, LUSTRE_OPC_ANY, NULL);
97 op_data->op_attr.ia_mode = inode->i_mode;
98 op_data->op_attr.ia_atime = inode->i_atime;
99 op_data->op_attr.ia_mtime = inode->i_mtime;
100 op_data->op_attr.ia_ctime = inode->i_ctime;
101 op_data->op_attr.ia_size = i_size_read(inode);
102 op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
103 ATTR_MTIME | ATTR_MTIME_SET |
104 ATTR_CTIME | ATTR_CTIME_SET;
105 op_data->op_attr_blocks = inode->i_blocks;
106 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
107 op_data->op_handle = och->och_fh;
/* The flag is test-and-clear: only the first close after a modification
 * carries MDS_DATA_MODIFIED. */
109 if (och->och_flags & FMODE_WRITE &&
110 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
111 /* For HSM: if inode data has been modified, pack it so that
112 * MDT can set data dirty flag in the archive. */
113 op_data->op_bias |= MDS_DATA_MODIFIED;
119 * Perform a close, possibly with a bias.
120 * The meaning of "data" depends on the value of "bias".
122 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
123 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
/* Sends the MDS_CLOSE RPC via md_close(). On success the open handle is
 * invalidated (cookie set to DEAD_HANDLE_MAGIC) and replay data cleared. */
126 static int ll_close_inode_openhandle(struct inode *inode,
127 struct obd_client_handle *och,
128 enum mds_op_bias bias, void *data)
130 struct obd_export *md_exp = ll_i2mdexp(inode);
131 const struct ll_inode_info *lli = ll_i2info(inode);
132 struct md_op_data *op_data;
133 struct ptlrpc_request *req = NULL;
/* A NULL obd means the MDC connection is gone; nothing can be sent. */
137 if (class_exp2obd(md_exp) == NULL) {
138 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
139 ll_get_fsname(inode->i_sb, NULL, 0),
140 PFID(&lli->lli_fid));
144 OBD_ALLOC_PTR(op_data);
145 /* We leak openhandle and request here on error, but not much to be
146 * done in OOM case since app won't retry close on error either. */
148 GOTO(out, rc = -ENOMEM);
150 ll_prepare_close(inode, op_data, och);
/* Bias-specific packing: layout swap carries the peer inode's FID,
 * HSM release carries the data version and size/blocks attributes. */
152 case MDS_CLOSE_LAYOUT_SWAP:
153 LASSERT(data != NULL);
154 op_data->op_bias |= MDS_CLOSE_LAYOUT_SWAP;
155 op_data->op_data_version = 0;
156 op_data->op_lease_handle = och->och_lease_handle;
157 op_data->op_fid2 = *ll_inode2fid(data);
160 case MDS_HSM_RELEASE:
161 LASSERT(data != NULL);
162 op_data->op_bias |= MDS_HSM_RELEASE;
163 op_data->op_data_version = *(__u64 *)data;
164 op_data->op_lease_handle = och->och_lease_handle;
165 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
169 LASSERT(data == NULL);
173 rc = md_close(md_exp, op_data, och->och_mod, &req);
/* -EINTR is an expected interruption, not worth a console error. */
174 if (rc != 0 && rc != -EINTR)
175 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
176 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
/* For biased closes, check whether the MDT actually executed the
 * close intent (OBD_MD_CLOSE_INTENT_EXECED in the reply body). */
179 op_data->op_bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)) {
180 struct mdt_body *body;
182 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
183 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
187 ll_finish_md_op_data(op_data);
191 md_clear_open_replay_data(md_exp, och);
/* Poison the handle so any later use is detectable. */
192 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
195 ptlrpc_req_finished(req); /* This is close request */
/* Close the MDS open handle matching @fmode (write, exec, or read).
 * If the handle still has users (och_usecount > 0) the close is skipped;
 * otherwise the handle is taken and closed via ll_close_inode_openhandle(). */
199 int ll_md_real_close(struct inode *inode, fmode_t fmode)
201 struct ll_inode_info *lli = ll_i2info(inode);
202 struct obd_client_handle **och_p;
203 struct obd_client_handle *och;
/* Pick the per-mode handle/usecount pair; read is the fallback mode. */
208 if (fmode & FMODE_WRITE) {
209 och_p = &lli->lli_mds_write_och;
210 och_usecount = &lli->lli_open_fd_write_count;
211 } else if (fmode & FMODE_EXEC) {
212 och_p = &lli->lli_mds_exec_och;
213 och_usecount = &lli->lli_open_fd_exec_count;
215 LASSERT(fmode & FMODE_READ);
216 och_p = &lli->lli_mds_read_och;
217 och_usecount = &lli->lli_open_fd_read_count;
220 mutex_lock(&lli->lli_och_mutex);
221 if (*och_usecount > 0) {
222 /* There are still users of this handle, so skip
224 mutex_unlock(&lli->lli_och_mutex);
230 mutex_unlock(&lli->lli_och_mutex);
233 /* There might be a race and this handle may already
235 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/* Per-struct-file close: drop group lock and lease if held, decrement the
 * per-mode open counters, and only talk to the MDS (ll_md_real_close) when
 * no cached OPEN lock matches — otherwise the close can be done locally.
 * Always frees the ll_file_data and clears file private data. */
241 static int ll_md_close(struct inode *inode, struct file *file)
243 union ldlm_policy_data policy = {
244 .l_inodebits = { MDS_INODELOCK_OPEN },
/* LDLM_FL_TEST_LOCK: only probe for a matching lock, don't take a ref. */
246 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
247 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
248 struct ll_inode_info *lli = ll_i2info(inode);
249 struct lustre_handle lockh;
250 enum ldlm_mode lockmode;
254 /* clear group lock, if present */
255 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
256 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
258 if (fd->fd_lease_och != NULL) {
261 /* Usually the lease is not released when the
262 * application crashed, we need to release here. */
263 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
264 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
265 PFID(&lli->lli_fid), rc, lease_broken);
267 fd->fd_lease_och = NULL;
/* fd_och is an open handle owned by this fd (taken for a lease);
 * close it directly. */
270 if (fd->fd_och != NULL) {
271 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
276 /* Let's see if we have good enough OPEN lock on the file and if
277 we can skip talking to MDS */
278 mutex_lock(&lli->lli_och_mutex);
279 if (fd->fd_omode & FMODE_WRITE) {
281 LASSERT(lli->lli_open_fd_write_count);
282 lli->lli_open_fd_write_count--;
283 } else if (fd->fd_omode & FMODE_EXEC) {
285 LASSERT(lli->lli_open_fd_exec_count);
286 lli->lli_open_fd_exec_count--;
289 LASSERT(lli->lli_open_fd_read_count);
290 lli->lli_open_fd_read_count--;
292 mutex_unlock(&lli->lli_och_mutex);
/* No matching cached OPEN ibits lock -> must do the real MDS close. */
294 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
295 LDLM_IBITS, &policy, lockmode, &lockh))
296 rc = ll_md_real_close(inode, fd->fd_omode);
299 LUSTRE_FPRIVATE(file) = NULL;
300 ll_file_data_put(fd);
305 /* While this returns an error code, fput() the caller does not, so we need
306 * to make every effort to clean up all of our state here. Also, applications
307 * rarely check close errors and even if an error is returned they will not
308 * re-try the close call.
/* VFS ->release() entry point: statahead deauthorization, async-rc
 * harvesting for regular files, then the MD close via ll_md_close().
 * The filesystem root is special-cased: only the fd is freed. */
310 int ll_file_release(struct inode *inode, struct file *file)
312 struct ll_file_data *fd;
313 struct ll_sb_info *sbi = ll_i2sbi(inode);
314 struct ll_inode_info *lli = ll_i2info(inode);
318 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
319 PFID(ll_inode2fid(inode)), inode);
/* Don't count releases of the root dentry in the stats. */
321 if (inode->i_sb->s_root != file_dentry(file))
322 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
323 fd = LUSTRE_FPRIVATE(file);
326 /* The last ref on @file, maybe not the the owner pid of statahead,
327 * because parent and child process can share the same file handle. */
328 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
329 ll_deauthorize_statahead(inode, fd);
331 if (inode->i_sb->s_root == file_dentry(file)) {
332 LUSTRE_FPRIVATE(file) = NULL;
333 ll_file_data_put(fd);
/* For regular files, fold any asynchronous write errors recorded on
 * the cl_object into lli_async_rc so close can report them. */
337 if (!S_ISDIR(inode->i_mode)) {
338 if (lli->lli_clob != NULL)
339 lov_read_and_clear_async_rc(lli->lli_clob);
340 lli->lli_async_rc = 0;
343 rc = ll_md_close(inode, file);
/* Fault-injection hook: optionally dump the debug log on close. */
345 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
346 libcfs_debug_dumplog();
/* Issue the intent-based open RPC (IT_OPEN) to the MDS for @file.
 * @lmm/@lmmsize optionally carry striping data for the request; @itp is
 * the lookup intent (must have MDS_OPEN_BY_FID set). On success the reply
 * is used to refresh the inode and install the MDS lock data. */
351 static int ll_intent_file_open(struct file *file, void *lmm, int lmmsize,
352 struct lookup_intent *itp)
354 struct dentry *de = file_dentry(file);
355 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
356 struct dentry *parent = de->d_parent;
357 const char *name = NULL;
359 struct md_op_data *op_data;
360 struct ptlrpc_request *req = NULL;
364 LASSERT(parent != NULL);
365 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
367 /* if server supports open-by-fid, or file name is invalid, don't pack
368 * name in open request */
369 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
370 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
371 name = de->d_name.name;
372 len = de->d_name.len;
375 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
376 name, len, 0, LUSTRE_OPC_ANY, NULL);
378 RETURN(PTR_ERR(op_data));
379 op_data->op_data = lmm;
380 op_data->op_data_size = lmmsize;
382 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
383 &ll_md_blocking_ast, 0);
384 ll_finish_md_op_data(op_data);
386 /* reason for keep own exit path - don`t flood log
387 * with messages with -ESTALE errors.
/* If the open was executed on the server but then failed locally,
 * release the server-side open handle so it isn't leaked. */
389 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
390 it_open_error(DISP_OPEN_OPEN, itp))
392 ll_release_openhandle(de, itp);
396 if (it_disposition(itp, DISP_LOOKUP_NEG))
397 GOTO(out, rc = -ENOENT);
399 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
400 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
401 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* Refresh the inode from the reply, then attach the granted lock. */
405 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
406 if (!rc && itp->it_lock_mode)
407 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
410 ptlrpc_req_finished(req);
411 ll_intent_drop_lock(itp);
413 /* We did open by fid, but by the time we got to the server,
414 * the object disappeared. If this is a create, we cannot really
415 * tell the userspace that the file it was trying to create
416 * does not exist. Instead let's return -ESTALE, and the VFS will
417 * retry the create with LOOKUP_REVAL that we are going to catch
418 * in ll_revalidate_dentry() and use lookup then.
420 if (rc == -ENOENT && itp->it_op & IT_CREAT)
/* Fill @och from the MDT reply attached to intent @it (open handle, FID,
 * lease lock handle, flags) and register the open for replay in case of
 * MDS recovery. Returns md_set_open_replay_data()'s result. */
426 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
427 struct obd_client_handle *och)
429 struct mdt_body *body;
431 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
432 och->och_fh = body->mbo_handle;
433 och->och_fid = body->mbo_fid1;
434 och->och_lease_handle.cookie = it->it_lock_handle;
435 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
436 och->och_flags = it->it_flags;
438 return md_set_open_replay_data(md_exp, och, it);
/* Complete the client-local part of an open: optionally fill @och from the
 * intent, install @fd as the file's private data, initialize readahead and
 * the cl_context bookkeeping, and record the effective open mode. */
441 static int ll_local_open(struct file *file, struct lookup_intent *it,
442 struct ll_file_data *fd, struct obd_client_handle *och)
444 struct inode *inode = file_inode(file);
447 LASSERT(!LUSTRE_FPRIVATE(file));
454 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
459 LUSTRE_FPRIVATE(file) = fd;
460 ll_readahead_init(inode, &fd->fd_ras);
/* Remember only the access-mode bits; used at close to pick the och. */
461 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
463 /* ll_cl_context initialize */
464 rwlock_init(&fd->fd_lock);
465 INIT_LIST_HEAD(&fd->fd_lccs);
470 /* Open a file, and (for the very first open) create objects on the OSTs at
471 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
472 * creation or open until ll_lov_setstripe() ioctl is called.
474 * If we already have the stripe MD locally then we don't request it in
475 * md_open(), by passing a lmm_size = 0.
477 * It is up to the application to ensure no other processes open this file
478 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
479 * used. We might be able to avoid races of that sort by getting lli_open_sem
480 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
481 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/* VFS ->open() entry point. Either reuses an intent prepared by lookup
 * (file->private_data) or synthesizes one (oit) from f_flags, then reuses
 * a cached per-mode MDS open handle or performs a new intent open. */
483 int ll_file_open(struct inode *inode, struct file *file)
485 struct ll_inode_info *lli = ll_i2info(inode);
486 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
487 .it_flags = file->f_flags };
488 struct obd_client_handle **och_p = NULL;
489 __u64 *och_usecount = NULL;
490 struct ll_file_data *fd;
494 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
495 PFID(ll_inode2fid(inode)), inode, file->f_flags);
497 it = file->private_data; /* XXX: compat macro */
498 file->private_data = NULL; /* prevent ll_local_open assertion */
500 fd = ll_file_data_get();
502 GOTO(out_openerr, rc = -ENOMEM);
505 if (S_ISDIR(inode->i_mode))
506 ll_authorize_statahead(inode, fd);
/* Root of the filesystem: no MDS open needed, just attach the fd. */
508 if (inode->i_sb->s_root == file_dentry(file)) {
509 LUSTRE_FPRIVATE(file) = fd;
/* No intent from lookup (or it carries no disposition): build our own. */
513 if (!it || !it->it_disposition) {
514 /* Convert f_flags into access mode. We cannot use file->f_mode,
515 * because everything but O_ACCMODE mask was stripped from
/* O_RDONLY=0, O_WRONLY=1, O_RDWR=2 -> +1 maps to FMODE_READ/WRITE bits. */
517 if ((oit.it_flags + 1) & O_ACCMODE)
519 if (file->f_flags & O_TRUNC)
520 oit.it_flags |= FMODE_WRITE;
522 /* kernel only call f_op->open in dentry_open. filp_open calls
523 * dentry_open after call to open_namei that checks permissions.
524 * Only nfsd_open call dentry_open directly without checking
525 * permissions and because of that this code below is safe. */
526 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
527 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
529 /* We do not want O_EXCL here, presumably we opened the file
530 * already? XXX - NFS implications? */
531 oit.it_flags &= ~O_EXCL;
533 /* bug20584, if "it_flags" contains O_CREAT, the file will be
534 * created if necessary, then "IT_CREAT" should be set to keep
535 * consistent with it */
536 if (oit.it_flags & O_CREAT)
537 oit.it_op |= IT_CREAT;
543 /* Let's see if we have file open on MDS already. */
544 if (it->it_flags & FMODE_WRITE) {
545 och_p = &lli->lli_mds_write_och;
546 och_usecount = &lli->lli_open_fd_write_count;
547 } else if (it->it_flags & FMODE_EXEC) {
548 och_p = &lli->lli_mds_exec_och;
549 och_usecount = &lli->lli_open_fd_exec_count;
551 och_p = &lli->lli_mds_read_och;
552 och_usecount = &lli->lli_open_fd_read_count;
555 mutex_lock(&lli->lli_och_mutex);
556 if (*och_p) { /* Open handle is present */
557 if (it_disposition(it, DISP_OPEN_OPEN)) {
558 /* Well, there's extra open request that we do not need,
559 let's close it somehow. This will decref request. */
560 rc = it_open_error(DISP_OPEN_OPEN, it);
562 mutex_unlock(&lli->lli_och_mutex);
563 GOTO(out_openerr, rc);
566 ll_release_openhandle(file_dentry(file), it);
/* Reuse the cached handle: open locally, no och needed here. */
570 rc = ll_local_open(file, it, fd, NULL);
573 mutex_unlock(&lli->lli_och_mutex);
574 GOTO(out_openerr, rc);
577 LASSERT(*och_usecount == 0);
578 if (!it->it_disposition) {
579 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
580 /* We cannot just request lock handle now, new ELC code
581 means that one of other OPEN locks for this file
582 could be cancelled, and since blocking ast handler
583 would attempt to grab och_mutex as well, that would
584 result in a deadlock */
585 mutex_unlock(&lli->lli_och_mutex);
587 * Normally called under two situations:
589 * 2. A race/condition on MDS resulting in no open
590 * handle to be returned from LOOKUP|OPEN request,
591 * for example if the target entry was a symlink.
593 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
594 * marked by a bit set in ll_iget_for_nfs. Clear the
595 * bit so that it's not confusing later callers.
597 * NB; when ldd is NULL, it must have come via normal
598 * lookup path only, since ll_iget_for_nfs always calls
601 if (ldd && ldd->lld_nfs_dentry) {
602 ldd->lld_nfs_dentry = 0;
603 it->it_flags |= MDS_OPEN_LOCK;
607 * Always specify MDS_OPEN_BY_FID because we don't want
608 * to get file with different fid.
610 it->it_flags |= MDS_OPEN_BY_FID;
611 rc = ll_intent_file_open(file, NULL, 0, it);
613 GOTO(out_openerr, rc);
/* First user of this mode: allocate the cached open handle. */
617 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
619 GOTO(out_och_free, rc = -ENOMEM);
623 /* md_intent_lock() didn't get a request ref if there was an
624 * open error, so don't do cleanup on the request here
626 /* XXX (green): Should not we bail out on any error here, not
627 * just open error? */
628 rc = it_open_error(DISP_OPEN_OPEN, it);
630 GOTO(out_och_free, rc);
632 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
633 "inode %p: disposition %x, status %d\n", inode,
634 it_disposition(it, ~0), it->it_status);
636 rc = ll_local_open(file, it, fd, *och_p);
638 GOTO(out_och_free, rc);
640 mutex_unlock(&lli->lli_och_mutex);
643 /* Must do this outside lli_och_mutex lock to prevent deadlock where
644 different kind of OPEN lock for this same inode gets cancelled
645 by ldlm_cancel_lru */
646 if (!S_ISREG(inode->i_mode))
647 GOTO(out_och_free, rc);
649 cl_lov_delay_create_clear(&file->f_flags);
650 GOTO(out_och_free, rc);
/* Error/exit path: free a half-initialized cached handle if we own it. */
654 if (och_p && *och_p) {
655 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
656 *och_p = NULL; /* OBD_FREE writes some magic there */
659 mutex_unlock(&lli->lli_och_mutex);
662 if (lli->lli_opendir_key == fd)
663 ll_deauthorize_statahead(inode, fd);
665 ll_file_data_put(fd);
667 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* Drop the extra request reference held by the intent, if any. */
670 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
671 ptlrpc_req_finished(it->it_request);
672 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* Blocking AST for lease locks: on LDLM_CB_BLOCKING asynchronously cancel
 * the lease lock; LDLM_CB_CANCELING is handled below (lines elided here).
 * Unlike ll_md_blocking_ast this does not manage the open handle. */
678 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
679 struct ldlm_lock_desc *desc, void *data, int flag)
682 struct lustre_handle lockh;
686 case LDLM_CB_BLOCKING:
687 ldlm_lock2handle(lock, &lockh);
688 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
690 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
694 case LDLM_CB_CANCELING:
702 * When setting a lease on a file, we take ownership of the lli_mds_*_och
703 * and save it as fd->fd_och so as to force client to reopen the file even
704 * if it has an open lock in cache already.
/* Returns -EBUSY if a lease already exists on this fd or the cached open
 * handle is shared with other users; on success *old_handle is the open
 * handle to pass to the MDT as proof of same-owner reopen. */
706 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
707 struct lustre_handle *old_handle)
709 struct ll_inode_info *lli = ll_i2info(inode);
710 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
711 struct obd_client_handle **och_p;
716 /* Get the openhandle of the file */
717 mutex_lock(&lli->lli_och_mutex);
718 if (fd->fd_lease_och != NULL)
719 GOTO(out_unlock, rc = -EBUSY);
721 if (fd->fd_och == NULL) {
722 if (file->f_mode & FMODE_WRITE) {
723 LASSERT(lli->lli_mds_write_och != NULL);
724 och_p = &lli->lli_mds_write_och;
725 och_usecount = &lli->lli_open_fd_write_count;
727 LASSERT(lli->lli_mds_read_och != NULL);
728 och_p = &lli->lli_mds_read_och;
729 och_usecount = &lli->lli_open_fd_read_count;
/* Handle shared with other opens: cannot take exclusive ownership. */
732 if (*och_usecount > 1)
733 GOTO(out_unlock, rc = -EBUSY);
740 *old_handle = fd->fd_och->och_fh;
744 mutex_unlock(&lli->lli_och_mutex);
749 * Release ownership on lli_mds_*_och when putting back a file lease.
751 static int ll_lease_och_release(struct inode *inode, struct file *file)
753 struct ll_inode_info *lli = ll_i2info(inode);
754 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
755 struct obd_client_handle **och_p;
756 struct obd_client_handle *old_och = NULL;
761 mutex_lock(&lli->lli_och_mutex);
/* Select the per-mode cached-handle slot matching this file's mode. */
762 if (file->f_mode & FMODE_WRITE) {
763 och_p = &lli->lli_mds_write_och;
764 och_usecount = &lli->lli_open_fd_write_count;
766 och_p = &lli->lli_mds_read_och;
767 och_usecount = &lli->lli_open_fd_read_count;
770 /* The file may have been open by another process (broken lease) so
771 * *och_p is not NULL. In this case we should simply increase usecount
774 if (*och_p != NULL) {
775 old_och = fd->fd_och;
782 mutex_unlock(&lli->lli_och_mutex);
/* Close the displaced handle outside the mutex. */
785 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
791 * Acquire a lease and open the file.
/* Opens @inode with MDS_OPEN_LEASE and returns the resulting open handle,
 * or an ERR_PTR. @fmode must be exactly FMODE_READ or FMODE_WRITE and be
 * compatible with how the file is already open (no exec files). */
793 static struct obd_client_handle *
794 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
797 struct lookup_intent it = { .it_op = IT_OPEN };
798 struct ll_sb_info *sbi = ll_i2sbi(inode);
799 struct md_op_data *op_data;
800 struct ptlrpc_request *req = NULL;
801 struct lustre_handle old_handle = { 0 };
802 struct obd_client_handle *och = NULL;
807 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
808 RETURN(ERR_PTR(-EINVAL));
811 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
812 RETURN(ERR_PTR(-EPERM));
814 rc = ll_lease_och_acquire(inode, file, &old_handle);
821 RETURN(ERR_PTR(-ENOMEM));
823 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
824 LUSTRE_OPC_ANY, NULL);
826 GOTO(out, rc = PTR_ERR(op_data));
828 /* To tell the MDT this openhandle is from the same owner */
829 op_data->op_handle = old_handle;
831 it.it_flags = fmode | open_flags;
832 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
833 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
834 &ll_md_blocking_lease_ast,
835 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
836 * it can be cancelled which may mislead applications that the lease is
838 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
839 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
840 * doesn't deal with openhandle, so normal openhandle will be leaked. */
841 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
842 ll_finish_md_op_data(op_data);
843 ptlrpc_req_finished(req);
845 GOTO(out_release_it, rc);
847 if (it_disposition(&it, DISP_LOOKUP_NEG))
848 GOTO(out_release_it, rc = -ENOENT);
850 rc = it_open_error(DISP_OPEN_OPEN, &it);
852 GOTO(out_release_it, rc);
854 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
855 ll_och_fill(sbi->ll_md_exp, &it, och);
/* Older servers don't understand leases: bail with -EOPNOTSUPP. */
857 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
858 GOTO(out_close, rc = -EOPNOTSUPP);
860 /* already get lease, handle lease lock */
861 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
862 if (it.it_lock_mode == 0 ||
863 it.it_lock_bits != MDS_INODELOCK_OPEN) {
864 /* open lock must return for lease */
865 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
866 PFID(ll_inode2fid(inode)), it.it_lock_mode,
868 GOTO(out_close, rc = -EPROTO);
871 ll_intent_release(&it);
875 /* Cancel open lock */
876 if (it.it_lock_mode != 0) {
877 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
880 och->och_lease_handle.cookie = 0ULL;
882 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
884 CERROR("%s: error closing file "DFID": %d\n",
885 ll_get_fsname(inode->i_sb, NULL, 0),
886 PFID(&ll_i2info(inode)->lli_fid), rc2);
887 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
889 ll_intent_release(&it);
897 * Check whether a layout swap can be done between two inodes.
899 * \param[in] inode1 First inode to check
900 * \param[in] inode2 Second inode to check
902 * \retval 0 on success, layout swap can be performed between both inodes
903 * \retval negative error code if requirements are not met
/* Requirements visible here: both regular files, caller has write
 * permission on both, and both live on the same superblock. */
905 static int ll_check_swap_layouts_validity(struct inode *inode1,
906 struct inode *inode2)
908 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
911 if (inode_permission(inode1, MAY_WRITE) ||
912 inode_permission(inode2, MAY_WRITE))
915 if (inode1->i_sb != inode2->i_sb)
/* Close @och with MDS_CLOSE_LAYOUT_SWAP bias so the MDT atomically swaps
 * the layouts of @inode and @inode2 as part of the close. The two FIDs
 * must differ (rc == 0 from lu_fid_cmp means same file -> -EINVAL). */
921 static int ll_swap_layouts_close(struct obd_client_handle *och,
922 struct inode *inode, struct inode *inode2)
924 const struct lu_fid *fid1 = ll_inode2fid(inode);
925 const struct lu_fid *fid2;
929 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
930 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
932 rc = ll_check_swap_layouts_validity(inode, inode2);
934 GOTO(out_free_och, rc);
936 /* We now know that inode2 is a lustre inode */
937 fid2 = ll_inode2fid(inode2);
939 rc = lu_fid_cmp(fid1, fid2);
941 GOTO(out_free_och, rc = -EINVAL);
943 /* Close the file and swap layouts between inode & inode2.
944 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
945 * because we still need it to pack l_remote_handle to MDT. */
946 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
949 och = NULL; /* freed in ll_close_inode_openhandle() */
959 * Release lease and close the file.
960 * It will check if the lease has ever broken.
/* If the lease lock still exists and was not already cancelled, cancel it
 * here; *lease_broken (optional) reports whether the lease had been broken
 * before this close. Finishes with an unbiased openhandle close. */
962 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
965 struct ldlm_lock *lock;
966 bool cancelled = true;
970 lock = ldlm_handle2lock(&och->och_lease_handle);
972 lock_res_and_lock(lock);
973 cancelled = ldlm_is_cancel(lock);
974 unlock_res_and_lock(lock);
978 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
979 PFID(&ll_i2info(inode)->lli_fid), cancelled);
982 ldlm_cli_cancel(&och->och_lease_handle, 0);
984 if (lease_broken != NULL)
985 *lease_broken = cancelled;
987 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/* Merge MDS-provided timestamps (lli_atime/mtime/ctime) with OST-side
 * attributes from cl_object_attr_get(): the newest of each timestamp wins,
 * and size/blocks are taken from the OST view. Runs under the inode size
 * lock. */
991 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
993 struct ll_inode_info *lli = ll_i2info(inode);
994 struct cl_object *obj = lli->lli_clob;
995 struct cl_attr *attr = vvp_env_thread_attr(env);
1003 ll_inode_size_lock(inode);
1005 /* Merge timestamps the most recently obtained from MDS with
1006 * timestamps obtained from OSTs.
1008 * Do not overwrite atime of inode because it may be refreshed
1009 * by file_accessed() function. If the read was served by cache
1010 * data, there is no RPC to be sent so that atime may not be
1011 * transferred to OSTs at all. MDT only updates atime at close time
1012 * if it's at least 'mdd.*.atime_diff' older.
1013 * All in all, the atime in Lustre does not strictly comply with
1014 * POSIX. Solving this problem needs to send an RPC to MDT for each
1015 * read, this will hurt performance. */
1016 if (LTIME_S(inode->i_atime) < lli->lli_atime)
1017 LTIME_S(inode->i_atime) = lli->lli_atime;
1018 LTIME_S(inode->i_mtime) = lli->lli_mtime;
1019 LTIME_S(inode->i_ctime) = lli->lli_ctime;
/* Snapshot the MDS-merged values before comparing with OST attrs. */
1021 atime = LTIME_S(inode->i_atime);
1022 mtime = LTIME_S(inode->i_mtime);
1023 ctime = LTIME_S(inode->i_ctime);
1025 cl_object_attr_lock(obj);
1026 rc = cl_object_attr_get(env, obj, attr);
1027 cl_object_attr_unlock(obj);
1030 GOTO(out_size_unlock, rc);
/* Keep whichever side has the newer timestamp. */
1032 if (atime < attr->cat_atime)
1033 atime = attr->cat_atime;
1035 if (ctime < attr->cat_ctime)
1036 ctime = attr->cat_ctime;
1038 if (mtime < attr->cat_mtime)
1039 mtime = attr->cat_mtime;
1041 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1042 PFID(&lli->lli_fid), attr->cat_size);
1044 i_size_write(inode, attr->cat_size);
1045 inode->i_blocks = attr->cat_blocks;
1047 LTIME_S(inode->i_atime) = atime;
1048 LTIME_S(inode->i_mtime) = mtime;
1049 LTIME_S(inode->i_ctime) = ctime;
1052 ll_inode_size_unlock(inode);
/* Decide whether access time updates should be skipped for @file, checking
 * O_NOATIME, inode S_NOATIME, IS_NOATIME(), mount noatime/read-only flags,
 * and nodiratime (mount or sb) for directories. Mirrors the kernel's
 * file_accessed()/touch_atime() logic. */
1057 static bool file_is_noatime(const struct file *file)
1059 const struct vfsmount *mnt = file->f_path.mnt;
1060 const struct inode *inode = file_inode((struct file *)file);
1062 /* Adapted from file_accessed() and touch_atime().*/
1063 if (file->f_flags & O_NOATIME)
1066 if (inode->i_flags & S_NOATIME)
1069 if (IS_NOATIME(inode))
1072 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1075 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1078 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/* Initialize a cl_io from the file's open flags: nonblocking, append, and
 * sync-write settings plus the lock-request policy (never for nolock files,
 * mandatory for O_APPEND, maybe otherwise) and noatime handling. */
1084 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
1086 struct inode *inode = file_inode((struct file *)file);
1088 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1090 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1091 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1092 file->f_flags & O_DIRECT ||
1095 io->ci_obj = ll_i2info(inode)->lli_clob;
1096 io->ci_lockreq = CILR_MAYBE;
1097 if (ll_file_nolock(file)) {
1098 io->ci_lockreq = CILR_NEVER;
1099 io->ci_no_srvlock = 1;
1100 } else if (file->f_flags & O_APPEND) {
1101 io->ci_lockreq = CILR_MANDATORY;
1104 io->ci_noatime = file_is_noatime(file);
/* Common read/write engine for normal and splice I/O. Sets up a cl_io for
 * @iot at *ppos/count, takes the per-file range lock where required (writes,
 * and O_DIRECT reads — see LU-6227), runs cl_io_loop(), and restarts the
 * I/O when the layout changed underneath (io->ci_need_restart). Updates
 * *ppos and the read/write byte statistics; tracks fd_write_failed. */
1108 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1109 struct file *file, enum cl_io_type iot,
1110 loff_t *ppos, size_t count)
1112 struct vvp_io *vio = vvp_env_io(env);
1113 struct inode *inode = file_inode(file);
1114 struct ll_inode_info *lli = ll_i2info(inode);
1115 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1119 struct range_lock range;
1123 CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: %llu, count: %zu\n",
1124 file_dentry(file)->d_name.name, iot, *ppos, count);
1127 io = vvp_env_thread_io(env);
1128 ll_io_init(io, file, iot == CIT_WRITE);
1130 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1131 bool range_locked = false;
/* O_APPEND writes can land anywhere: lock the whole file range. */
1133 if (file->f_flags & O_APPEND)
1134 range_lock_init(&range, 0, LUSTRE_EOF);
1136 range_lock_init(&range, *ppos, *ppos + count - 1);
1138 vio->vui_fd = LUSTRE_FPRIVATE(file);
1139 vio->vui_io_subtype = args->via_io_subtype;
1141 switch (vio->vui_io_subtype) {
1143 vio->vui_iter = args->u.normal.via_iter;
1144 vio->vui_iocb = args->u.normal.via_iocb;
1145 /* Direct IO reads must also take range lock,
1146 * or multiple reads will try to work on the same pages
1147 * See LU-6227 for details. */
1148 if (((iot == CIT_WRITE) ||
1149 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1150 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1151 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1153 rc = range_lock(&lli->lli_write_tree, &range);
1157 range_locked = true;
/* Splice subtype: hand the pipe and flags to the vvp layer. */
1161 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1162 vio->u.splice.vui_flags = args->u.splice.via_flags;
1165 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1169 ll_cl_add(file, env, io, LCC_RW);
1170 rc = cl_io_loop(env, io);
1171 ll_cl_remove(file, env);
1174 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1176 range_unlock(&lli->lli_write_tree, &range);
1179 /* cl_io_rw_init() handled IO */
/* Accumulate partial progress and advance the file position. */
1183 if (io->ci_nob > 0) {
1184 result += io->ci_nob;
1185 count -= io->ci_nob;
1186 *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1188 /* prepare IO restart */
1189 if (count > 0 && args->via_io_subtype == IO_NORMAL)
1190 args->u.normal.via_iter = vio->vui_iter;
1194 cl_io_fini(env, io);
1196 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1198 "%s: restart %s from %lld, count:%zu, result: %zd\n",
1199 file_dentry(file)->d_name.name,
1200 iot == CIT_READ ? "read" : "write",
1201 *ppos, count, result);
1205 if (iot == CIT_READ) {
1207 ll_stats_ops_tally(ll_i2sbi(inode),
1208 LPROC_LL_READ_BYTES, result);
1209 } else if (iot == CIT_WRITE) {
1211 ll_stats_ops_tally(ll_i2sbi(inode),
1212 LPROC_LL_WRITE_BYTES, result);
1213 fd->fd_write_failed = false;
1214 } else if (result == 0 && rc == 0) {
/* NOTE(review): error classification for writes — -ERESTARTSYS is
 * treated as non-failure; confirm branch conditions in full source. */
1217 fd->fd_write_failed = true;
1219 fd->fd_write_failed = false;
1220 } else if (rc != -ERESTARTSYS) {
1221 fd->fd_write_failed = true;
1225 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
/* Return bytes transferred if any; otherwise the error code. */
1227 return result > 0 ? result : rc;
1231 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1232 * especially for small I/O.
1234 * To serve a read request, CLIO has to create and initialize a cl_io and
1235 * then request DLM lock. This has turned out to have siginificant overhead
1236 * and affects the performance of small I/O dramatically.
1238 * It's not necessary to create a cl_io for each I/O. Under the help of read
1239 * ahead, most of the pages being read are already in memory cache and we can
1240 * read those pages directly because if the pages exist, the corresponding DLM
1241 * lock must exist so that page content must be valid.
1243 * In fast read implementation, the llite speculatively finds and reads pages
1244 * in memory cache. There are three scenarios for fast read:
1245 * - If the page exists and is uptodate, kernel VM will provide the data and
1246 * CLIO won't be intervened;
1247 * - If the page was brought into memory by read ahead, it will be exported
1248 * and read ahead parameters will be updated;
1249 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1250 * it will go back and invoke normal read, i.e., a cl_io will be created
1251 * and DLM lock will be requested.
1253 * POSIX compliance: posix standard states that read is intended to be atomic.
1254 * Lustre read implementation is in line with Linux kernel read implementation
1255 * and neither of them complies with POSIX standard in this matter. Fast read
1256 * doesn't make the situation worse on single node but it may interleave write
1257 * results from multiple nodes due to short read handling in ll_file_aio_read().
1259 * \param env - lu_env
1260 * \param iocb - kiocb from kernel
1261 * \param iter - user space buffers where the data will be copied
1263 * \retval - number of bytes have been read, or error code if error occurred.
/* Speculatively serve a read directly from the page cache, bypassing the
 * CLIO stack; falls back (caller side) to normal read when not possible.
 * Only tried when the mount has fast-read enabled and the file is not
 * opened O_DIRECT. */
1266 ll_do_fast_read(const struct lu_env *env, struct kiocb *iocb,
1267 struct iov_iter *iter)
1271 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1274 /* NB: we can't do direct IO for fast read because it will need a lock
1275 * to make IO engine happy. */
1276 if (iocb->ki_filp->f_flags & O_DIRECT)
/* Register an IO context so ll_readpage() can recognize the fast-read
 * caller, then let the generic page-cache read do the work. */
1279 ll_cl_add(iocb->ki_filp, env, NULL, LCC_RW);
1280 result = generic_file_read_iter(iocb, iter);
1281 ll_cl_remove(iocb->ki_filp, env);
1283 /* If the first page is not in cache, generic_file_aio_read() will be
1284 * returned with -ENODATA.
1285 * See corresponding code in ll_readpage(). */
1286 if (result == -ENODATA)
/* Fold successfully read bytes into the per-mount read statistics. */
1290 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1291 LPROC_LL_READ_BYTES, result);
1297 * Read from a file (through the page cache).
/* ->read_iter entry point: attempt the lockless fast-read path first;
 * if bytes remain in the iterator (or fast read did not apply), run a
 * normal CIT_READ through ll_file_io_generic(). */
1299 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1302 struct vvp_io_args *args;
1307 env = cl_env_get(&refcheck);
1309 return PTR_ERR(env);
/* Fast path: page-cache read with no cl_io / DLM lock request. */
1311 result = ll_do_fast_read(env, iocb, to);
1312 if (result < 0 || iov_iter_count(to) == 0)
/* Slow path: package the iocb/iterator and go through CLIO. */
1315 args = ll_env_args(env, IO_NORMAL);
1316 args->u.normal.via_iter = to;
1317 args->u.normal.via_iocb = iocb;
1319 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1320 &iocb->ki_pos, iov_iter_count(to));
1323 else if (result == 0)
1327 cl_env_put(env, &refcheck);
1332 * Write to a file (through the page cache).
/* ->write_iter entry point: all writes go through the generic CLIO path
 * (no fast path, unlike reads). */
1334 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1336 struct vvp_io_args *args;
1341 env = cl_env_get(&refcheck);
1343 return PTR_ERR(env);
/* Package the iocb/iterator and run a CIT_WRITE io. */
1345 args = ll_env_args(env, IO_NORMAL);
1346 args->u.normal.via_iter = from;
1347 args->u.normal.via_iocb = iocb;
1349 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1350 &iocb->ki_pos, iov_iter_count(from));
1351 cl_env_put(env, &refcheck);
1355 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1357 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/* Validate an iovec array and compute the total byte count, mirroring the
 * kernel's __generic_file_aio_write_nolock checks: reject negative or
 * wrapping lengths, and truncate at the first inaccessible segment. */
1359 static int ll_file_get_iov_count(const struct iovec *iov,
1360 unsigned long *nr_segs, size_t *count)
1365 for (seg = 0; seg < *nr_segs; seg++) {
1366 const struct iovec *iv = &iov[seg];
1369 * If any segment has a negative length, or the cumulative
1370 * length ever wraps negative then return -EINVAL.
1373 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
/* Accessible segment: keep accumulating (continue path elided here). */
1375 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1380 cnt -= iv->iov_len; /* This segment is no good */
/* aio_read compatibility shim for kernels without ->read_iter: build an
 * iov_iter from the iovec array and forward to ll_file_read_iter(). */
1387 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1388 unsigned long nr_segs, loff_t pos)
1395 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
/* iov_iter_init() signature differs across kernels; both branches build
 * an identical READ iterator. */
1399 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1400 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1401 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1402 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1403 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1405 result = ll_file_read_iter(iocb, &to);
/* ->read compatibility shim: wrap the user buffer in a single iovec and a
 * synchronous kiocb, then reuse the aio path. */
1410 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1414 struct iovec iov = { .iov_base = buf, .iov_len = count };
1415 struct kiocb *kiocb;
1420 env = cl_env_get(&refcheck);
1422 RETURN(PTR_ERR(env));
1424 kiocb = &ll_env_info(env)->lti_kiocb;
1425 init_sync_kiocb(kiocb, file);
1426 kiocb->ki_pos = *ppos;
/* Field name for the remaining-bytes counter varies by kernel version. */
1427 #ifdef HAVE_KIOCB_KI_LEFT
1428 kiocb->ki_left = count;
1429 #elif defined(HAVE_KI_NBYTES)
1430 kiocb->ki_nbytes = count;
1433 result = ll_file_aio_read(kiocb, &iov, 1, kiocb->ki_pos);
/* Propagate the updated file position back to the caller. */
1434 *ppos = kiocb->ki_pos;
1436 cl_env_put(env, &refcheck);
1441 * Write to a file (through the page cache).
/* aio_write compatibility shim: build a WRITE iov_iter from the iovec
 * array and forward to ll_file_write_iter(). */
1444 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1445 unsigned long nr_segs, loff_t pos)
1447 struct iov_iter from;
1452 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
/* iov_iter_init() signature differs across kernels. */
1456 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1457 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1458 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1459 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1460 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1462 result = ll_file_write_iter(iocb, &from);
/* ->write compatibility shim: single-iovec synchronous write via the aio
 * path; mirrors ll_file_read() above. */
1467 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1468 size_t count, loff_t *ppos)
1471 struct iovec iov = { .iov_base = (void __user *)buf,
1473 struct kiocb *kiocb;
1478 env = cl_env_get(&refcheck);
1480 RETURN(PTR_ERR(env));
1482 kiocb = &ll_env_info(env)->lti_kiocb;
1483 init_sync_kiocb(kiocb, file);
1484 kiocb->ki_pos = *ppos;
/* Field name for the remaining-bytes counter varies by kernel version. */
1485 #ifdef HAVE_KIOCB_KI_LEFT
1486 kiocb->ki_left = count;
1487 #elif defined(HAVE_KI_NBYTES)
1488 kiocb->ki_nbytes = count;
1491 result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos);
1492 *ppos = kiocb->ki_pos;
1494 cl_env_put(env, &refcheck);
1497 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1500 * Send file content (through pagecache) somewhere with helper
/* splice_read entry point: route the pipe target through the generic CLIO
 * read path using the IO_SPLICE argument variant. */
1502 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1503 struct pipe_inode_info *pipe, size_t count,
1507 struct vvp_io_args *args;
1512 env = cl_env_get(&refcheck);
1514 RETURN(PTR_ERR(env));
1516 args = ll_env_args(env, IO_SPLICE);
1517 args->u.splice.via_pipe = pipe;
1518 args->u.splice.via_flags = flags;
1520 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1521 cl_env_put(env, &refcheck);
/* Apply striping (LOV EA) to a file by re-opening it by FID with the
 * given layout; the open handle obtained for the intent is released
 * immediately afterwards.  Runs under the inode size lock. */
1525 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1526 __u64 flags, struct lov_user_md *lum,
1529 struct lookup_intent oit = {
1531 .it_flags = flags | MDS_OPEN_BY_FID,
1536 ll_inode_size_lock(inode);
1537 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1539 GOTO(out_unlock, rc);
/* The open was only needed to carry the layout; drop the handle now. */
1541 ll_release_openhandle(file_dentry(file), &oit);
1544 ll_inode_size_unlock(inode);
1545 ll_intent_release(&oit);
/* Clear the O_LOV_DELAY_CREATE marker regardless of outcome. */
1546 cl_lov_delay_create_clear(&file->f_flags);
/* Fetch the LOV EA (striping info) for @filename from the MDS via
 * md_getattr_name(), swab it to host endianness if needed, and return it
 * to the caller through @lmmp/@lmm_size.  The ptlrpc request that owns
 * the lmm buffer is returned via @request; the caller must release it. */
1551 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1552 struct lov_mds_md **lmmp, int *lmm_size,
1553 struct ptlrpc_request **request)
1555 struct ll_sb_info *sbi = ll_i2sbi(inode);
1556 struct mdt_body *body;
1557 struct lov_mds_md *lmm = NULL;
1558 struct ptlrpc_request *req = NULL;
1559 struct md_op_data *op_data;
1562 rc = ll_get_default_mdsize(sbi, &lmmsize);
1566 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1567 strlen(filename), lmmsize,
1568 LUSTRE_OPC_ANY, NULL);
1569 if (IS_ERR(op_data))
1570 RETURN(PTR_ERR(op_data));
1572 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1573 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1574 ll_finish_md_op_data(op_data);
1576 CDEBUG(D_INFO, "md_getattr_name failed "
1577 "on %s: rc %d\n", filename, rc);
1581 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1582 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1584 lmmsize = body->mbo_eadatasize;
/* No striping EA present (or zero-sized): report -ENODATA. */
1586 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1588 GOTO(out, rc = -ENODATA);
1591 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1592 LASSERT(lmm != NULL);
/* Only plain V1/V3 LOV layouts are understood here. */
1594 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1595 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1596 GOTO(out, rc = -EPROTO);
1600 * This is coming from the MDS, so is probably in
1601 * little endian. We convert it to host endian before
1602 * passing it to userspace.
1604 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1607 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1608 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1611 /* if called on a directory (default striping template), avoid
1612 * swabbing per-object entries that do not exist */
1613 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1614 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1615 if (S_ISREG(body->mbo_mode))
1616 lustre_swab_lov_user_md_objects(
1617 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1619 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1620 lustre_swab_lov_user_md_v3(
1621 (struct lov_user_md_v3 *)lmm);
1622 if (S_ISREG(body->mbo_mode))
1623 lustre_swab_lov_user_md_objects(
1624 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1631 *lmm_size = lmmsize;
/* LL_IOC_LOV_SETEA handler: copy a user-supplied LOV EA (one OST entry)
 * and apply it via ll_lov_setstripe_ea_info().  Root (CAP_SYS_ADMIN)
 * only, since MDS_OPEN_HAS_OBJS lets the caller name specific objects. */
1636 static int ll_lov_setea(struct inode *inode, struct file *file,
1639 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1640 struct lov_user_md *lump;
1641 int lum_size = sizeof(struct lov_user_md) +
1642 sizeof(struct lov_user_ost_data);
1646 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1649 OBD_ALLOC_LARGE(lump, lum_size);
1653 if (copy_from_user(lump, (struct lov_user_md __user *)arg, lum_size))
1654 GOTO(out_lump, rc = -EFAULT);
1656 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1659 OBD_FREE_LARGE(lump, lum_size);
/* Copy this file's striping information to the user buffer @lum by asking
 * the cl_object layer (cl_object_getstripe handles the copy_to_user). */
1663 static int ll_file_getstripe(struct inode *inode,
1664 struct lov_user_md __user *lum)
1671 env = cl_env_get(&refcheck);
1673 RETURN(PTR_ERR(env));
1675 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum);
1676 cl_env_put(env, &refcheck);
/* LL_IOC_LOV_SETSTRIPE handler: copy the user layout, apply it, then
 * refresh the layout generation and write the resulting striping back to
 * the caller's buffer. */
1680 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1683 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1684 struct lov_user_md *klum;
1686 __u64 flags = FMODE_WRITE;
1689 rc = ll_copy_user_md(lum, &klum);
1694 rc = ll_lov_setstripe_ea_info(inode, file, flags, klum, lum_size);
/* Zero the stripe count first so a short read by old tools is safe;
 * then return the actual layout obtained. */
1698 put_user(0, &lum->lmm_stripe_count);
1700 ll_layout_refresh(inode, &gen);
1701 rc = ll_file_getstripe(inode, (struct lov_user_md __user *)arg);
1704 OBD_FREE(klum, lum_size);
/* LL_IOC_GROUP_LOCK handler: take a group (GID-based) DLM lock on the
 * file and record it in the per-open-file data.  A gid of 0 is invalid;
 * only one group lock per open file is allowed. */
1709 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1711 struct ll_inode_info *lli = ll_i2info(inode);
1712 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1713 struct ll_grouplock grouplock;
1718 CWARN("group id for group lock must not be 0\n");
1722 if (ll_file_nolock(file))
1723 RETURN(-EOPNOTSUPP);
/* Check under lli_lock that no group lock is already held on this fd. */
1725 spin_lock(&lli->lli_lock);
1726 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1727 CWARN("group lock already existed with gid %lu\n",
1728 fd->fd_grouplock.lg_gid);
1729 spin_unlock(&lli->lli_lock);
1732 LASSERT(fd->fd_grouplock.lg_lock == NULL);
1733 spin_unlock(&lli->lli_lock);
/* Acquire the lock outside the spinlock (cl_get_grouplock may block). */
1735 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
1736 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* Re-check: another thread may have raced us to install a group lock. */
1740 spin_lock(&lli->lli_lock);
1741 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1742 spin_unlock(&lli->lli_lock);
1743 CERROR("another thread just won the race\n");
1744 cl_put_grouplock(&grouplock);
1748 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1749 fd->fd_grouplock = grouplock;
1750 spin_unlock(&lli->lli_lock);
1752 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/* LL_IOC_GROUP_UNLOCK handler: release the group lock recorded on this
 * open file, verifying the caller supplies the matching gid. */
1756 static int ll_put_grouplock(struct inode *inode, struct file *file,
1759 struct ll_inode_info *lli = ll_i2info(inode);
1760 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1761 struct ll_grouplock grouplock;
1764 spin_lock(&lli->lli_lock);
1765 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1766 spin_unlock(&lli->lli_lock);
1767 CWARN("no group lock held\n");
1771 LASSERT(fd->fd_grouplock.lg_lock != NULL);
/* The gid passed to unlock must match the one the lock was taken with. */
1773 if (fd->fd_grouplock.lg_gid != arg) {
1774 CWARN("group lock %lu doesn't match current id %lu\n",
1775 arg, fd->fd_grouplock.lg_gid);
1776 spin_unlock(&lli->lli_lock);
/* Detach the lock from the fd under the spinlock, release it after. */
1780 grouplock = fd->fd_grouplock;
1781 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1782 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1783 spin_unlock(&lli->lli_lock);
1785 cl_put_grouplock(&grouplock);
1786 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1791 * Close inode open handle
1793 * \param dentry [in] dentry which contains the inode
1794 * \param it [in,out] intent which contains open info and result
1797 * \retval <0 failure
/* Close the MDS open handle carried by a lookup intent (see the doc
 * comment above): no-op for the fs root or when the intent holds no
 * DISP_OPEN_OPEN disposition. */
1799 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1801 struct inode *inode = dentry->d_inode;
1802 struct obd_client_handle *och;
1808 /* Root ? Do nothing. */
1809 if (dentry->d_inode->i_sb->s_root == dentry)
1812 /* No open handle to close? Move away */
1813 if (!it_disposition(it, DISP_OPEN_OPEN))
1816 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1818 OBD_ALLOC(och, sizeof(*och));
1820 GOTO(out, rc = -ENOMEM);
/* Fill the client handle from the intent, then close it on the MDS. */
1822 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1824 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
1826 /* this one is in place of ll_file_open */
1827 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1828 ptlrpc_req_finished(it->it_request);
1829 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1835 * Get size for inode for which FIEMAP mapping is requested.
1836 * Make the FIEMAP get_info call and returns the result.
1837 * \param fiemap kernel buffer to hold extens
1838 * \param num_bytes kernel buffer size
/* Perform a FIEMAP extent-mapping request for @inode (see doc comment
 * above): validate flags, optionally flush dirty data, glimpse the size
 * if unknown, then forward the request to the OSTs via cl_object_fiemap. */
1840 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
1846 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
1849 /* Checks for fiemap flags */
1850 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* Report back which flags we do not support. */
1851 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1855 /* Check for FIEMAP_FLAG_SYNC */
1856 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1857 rc = filemap_fdatawrite(inode->i_mapping);
1862 env = cl_env_get(&refcheck);
1864 RETURN(PTR_ERR(env));
/* Size may be stale/unknown on the client; glimpse it from the OSTs. */
1866 if (i_size_read(inode) == 0) {
1867 rc = ll_glimpse_size(inode);
1872 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1873 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
1874 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
1876 /* If filesize is 0, then there would be no objects for mapping */
1877 if (fmkey.lfik_oa.o_size == 0) {
1878 fiemap->fm_mapped_extents = 0;
1882 fmkey.lfik_fiemap = *fiemap;
1884 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
1885 &fmkey, fiemap, &num_bytes);
1887 cl_env_put(env, &refcheck);
/* OBD_IOC_FID2PATH handler: resolve a FID to a path by forwarding the
 * request to the MDC.  Requires CAP_DAC_READ_SEARCH unless the mount
 * allows user fid2path. */
1891 int ll_fid2path(struct inode *inode, void __user *arg)
1893 struct obd_export *exp = ll_i2mdexp(inode);
1894 const struct getinfo_fid2path __user *gfin = arg;
1896 struct getinfo_fid2path *gfout;
1902 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1903 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1906 /* Only need to get the buflen */
1907 if (get_user(pathlen, &gfin->gf_pathlen))
/* Bound the user-supplied length before sizing the allocation. */
1910 if (pathlen > PATH_MAX)
1913 outsize = sizeof(*gfout) + pathlen;
1914 OBD_ALLOC(gfout, outsize);
1918 if (copy_from_user(gfout, arg, sizeof(*gfout)))
1919 GOTO(gf_free, rc = -EFAULT);
1920 /* append root FID after gfout to let MDT know the root FID so that it
1921 * can lookup the correct path, this is mainly for fileset.
1922 * old server without fileset mount support will ignore this. */
1923 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
1925 /* Call mdc_iocontrol */
1926 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1930 if (copy_to_user(arg, gfout, outsize))
1934 OBD_FREE(gfout, outsize);
1939 * Read the data_version for inode.
1941 * This value is computed using stripe object version on OST.
1942 * Version is computed using server side locking.
1944 * @param flags if do sync on the OST side;
1946 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
1947 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/* Fetch the data version of @inode (see doc comment above) by running a
 * CIT_DATA_VERSION io; @flags selects the OST-side flush semantics. */
1949 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
1951 struct cl_object *obj = ll_i2info(inode)->lli_clob;
1959 /* If no file object initialized, we consider its version is 0. */
1965 env = cl_env_get(&refcheck);
1967 RETURN(PTR_ERR(env));
1969 io = vvp_env_thread_io(env);
1971 io->u.ci_data_version.dv_data_version = 0;
1972 io->u.ci_data_version.dv_flags = flags;
1975 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
1976 result = cl_io_loop(env, io);
1978 result = io->ci_result;
1980 *data_version = io->u.ci_data_version.dv_data_version;
1982 cl_io_fini(env, io);
/* NOTE(review): restart handling on layout change appears elided here;
 * ci_need_restart is only tested after cl_io_fini(). */
1984 if (unlikely(io->ci_need_restart))
1987 cl_env_put(env, &refcheck);
1993 * Trigger a HSM release request for the provided inode.
/* HSM release: take a write lease, flush and snapshot the data version,
 * merge attributes, then close the file with MDS_HSM_RELEASE so the MDT
 * can free the OST objects. */
1995 int ll_hsm_release(struct inode *inode)
1998 struct obd_client_handle *och = NULL;
1999 __u64 data_version = 0;
2004 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2005 ll_get_fsname(inode->i_sb, NULL, 0),
2006 PFID(&ll_i2info(inode)->lli_fid));
/* Exclusive write lease guarantees no concurrent modification. */
2008 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2010 GOTO(out, rc = PTR_ERR(och));
2012 /* Grab latest data_version and [am]time values */
2013 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2017 env = cl_env_get(&refcheck);
2019 GOTO(out, rc = PTR_ERR(env));
2021 ll_merge_attr(env, inode);
2022 cl_env_put(env, &refcheck);
2024 /* Release the file.
2025 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2026 * we still need it to pack l_remote_handle to MDT. */
2027 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2033 if (och != NULL && !IS_ERR(och)) /* close the file */
2034 ll_lease_close(och, inode, NULL);
/* Working state for ll_swap_layouts(): the two inodes being swapped
 * (plus data-version bookkeeping fields elided from this view). */
2039 struct ll_swap_stack {
2042 struct inode *inode1;
2043 struct inode *inode2;
/* Swap the layouts of two files: order the pair deterministically by FID,
 * optionally flush caches via a group lock, verify requested data
 * versions are unchanged, then ask the MDT to perform the swap. */
2048 static int ll_swap_layouts(struct file *file1, struct file *file2,
2049 struct lustre_swap_layouts *lsl)
2051 struct mdc_swap_layouts msl;
2052 struct md_op_data *op_data;
2055 struct ll_swap_stack *llss = NULL;
2058 OBD_ALLOC_PTR(llss);
2062 llss->inode1 = file_inode(file1);
2063 llss->inode2 = file_inode(file2);
2065 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2069 /* we use 2 bool because it is easier to swap than 2 bits */
2070 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2071 llss->check_dv1 = true;
2073 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2074 llss->check_dv2 = true;
2076 /* we cannot use lsl->sl_dvX directly because we may swap them */
2077 llss->dv1 = lsl->sl_dv1;
2078 llss->dv2 = lsl->sl_dv2;
/* Order by FID so two concurrent swaps of the same pair cannot deadlock. */
2080 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2081 if (rc == 0) /* same file, done! */
2084 if (rc < 0) { /* sequentialize it */
2085 swap(llss->inode1, llss->inode2);
2087 swap(llss->dv1, llss->dv2);
2088 swap(llss->check_dv1, llss->check_dv2);
2092 if (gid != 0) { /* application asks to flush dirty cache */
2093 rc = ll_get_grouplock(llss->inode1, file1, gid);
2097 rc = ll_get_grouplock(llss->inode2, file2, gid);
2099 ll_put_grouplock(llss->inode1, file1, gid);
2104 /* ultimate check, before swaping the layouts we check if
2105 * dataversion has changed (if requested) */
2106 if (llss->check_dv1) {
2107 rc = ll_data_version(llss->inode1, &dv, 0);
2110 if (dv != llss->dv1)
2111 GOTO(putgl, rc = -EAGAIN);
2114 if (llss->check_dv2) {
2115 rc = ll_data_version(llss->inode2, &dv, 0);
2118 if (dv != llss->dv2)
2119 GOTO(putgl, rc = -EAGAIN);
2122 /* struct md_op_data is used to send the swap args to the mdt
2123 * only flags is missing, so we use struct mdc_swap_layouts
2124 * through the md_op_data->op_data */
2125 /* flags from user space have to be converted before they are send to
2126 * server, no flag is sent today, they are only used on the client */
2129 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2130 0, LUSTRE_OPC_ANY, &msl);
2131 if (IS_ERR(op_data))
2132 GOTO(free, rc = PTR_ERR(op_data));
2134 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2135 sizeof(*op_data), op_data, NULL);
2136 ll_finish_md_op_data(op_data);
/* Drop group locks in reverse acquisition order. */
2143 ll_put_grouplock(llss->inode2, file2, gid);
2144 ll_put_grouplock(llss->inode1, file1, gid);
/* Set/clear HSM state flags on @inode via the MDC after validating the
 * masks and archive id; non-root callers are restricted to HSM_USER_MASK
 * flags. */
2154 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2156 struct md_op_data *op_data;
2160 /* Detect out-of range masks */
2161 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2164 /* Non-root users are forbidden to set or clear flags which are
2165 * NOT defined in HSM_USER_MASK. */
2166 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2167 !cfs_capable(CFS_CAP_SYS_ADMIN))
2170 /* Detect out-of range archive id */
2171 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2172 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2175 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2176 LUSTRE_OPC_ANY, hss);
2177 if (IS_ERR(op_data))
2178 RETURN(PTR_ERR(op_data));
2180 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2181 sizeof(*op_data), op_data, NULL);
2183 ll_finish_md_op_data(op_data);
/* HSM import: mark a (regular) file as archived+released on the MDS, then
 * restore its user-visible attributes (mode, owner, size, times) from the
 * hsm_user_import descriptor via ll_setattr_raw(). */
2188 static int ll_hsm_import(struct inode *inode, struct file *file,
2189 struct hsm_user_import *hui)
2191 struct hsm_state_set *hss = NULL;
2192 struct iattr *attr = NULL;
2196 if (!S_ISREG(inode->i_mode))
2202 GOTO(out, rc = -ENOMEM);
/* Step 1: set HSM flags so the file is seen as released in the archive. */
2204 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2205 hss->hss_archive_id = hui->hui_archive_id;
2206 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2207 rc = ll_hsm_state_set(inode, hss);
2211 OBD_ALLOC_PTR(attr);
2213 GOTO(out, rc = -ENOMEM);
/* Step 2: rebuild the inode attributes from the import descriptor;
 * force S_IFREG and restrict mode to permission bits. */
2215 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2216 attr->ia_mode |= S_IFREG;
2217 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2218 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2219 attr->ia_size = hui->hui_size;
2220 attr->ia_mtime.tv_sec = hui->hui_mtime;
2221 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2222 attr->ia_atime.tv_sec = hui->hui_atime;
2223 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2225 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2226 ATTR_UID | ATTR_GID |
2227 ATTR_MTIME | ATTR_MTIME_SET |
2228 ATTR_ATIME | ATTR_ATIME_SET;
2232 rc = ll_setattr_raw(file_dentry(file), attr, true);
2236 inode_unlock(inode);
/* Translate an open fmode into the LL_LEASE_{RD,WR}LCK bitmask reported
 * to userspace by the lease ioctls. */
2248 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2250 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2251 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/* LL_IOC_FUTIMES_3 handler: set atime/mtime/ctime (including ctime, which
 * plain utimes cannot touch) from the ll_futimes_3 payload.  Root-only
 * and restricted to regular files. */
2254 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2256 struct inode *inode = file_inode(file);
2258 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2259 ATTR_MTIME | ATTR_MTIME_SET |
2260 ATTR_CTIME | ATTR_CTIME_SET,
2262 .tv_sec = lfu->lfu_atime_sec,
2263 .tv_nsec = lfu->lfu_atime_nsec,
2266 .tv_sec = lfu->lfu_mtime_sec,
2267 .tv_nsec = lfu->lfu_mtime_nsec,
2270 .tv_sec = lfu->lfu_ctime_sec,
2271 .tv_nsec = lfu->lfu_ctime_nsec,
2277 if (!capable(CAP_SYS_ADMIN))
2280 if (!S_ISREG(inode->i_mode))
2284 rc = ll_setattr_raw(file_dentry(file), &ia, false);
2285 inode_unlock(inode);
2291 * Give file access advices
2293 * The ladvise interface is similar to Linux fadvise() system call, except it
2294 * forwards the advices directly from Lustre client to server. The server side
2295 * codes will apply appropriate read-ahead and caching techniques for the
2296 * corresponding files.
2298 * A typical workload for ladvise is e.g. a bunch of different clients are
2299 * doing small random reads of a file, so prefetching pages into OSS cache
2300 * with big linear reads before the random IO is a net benefit. Fetching
2301 * all that data into each client cache with fadvise() may not be, due to
2302 * much more data being sent to the client.
/* Forward one llapi_lu_ladvise advice to the server by running a
 * CIT_LADVISE io (see doc comment above for rationale). */
2304 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2305 struct llapi_lu_ladvise *ladvise)
2309 struct cl_ladvise_io *lio;
2314 env = cl_env_get(&refcheck);
2316 RETURN(PTR_ERR(env));
2318 io = vvp_env_thread_io(env);
2319 io->ci_obj = ll_i2info(inode)->lli_clob;
2321 /* initialize parameters for ladvise */
2322 lio = &io->u.ci_ladvise;
2323 lio->li_start = ladvise->lla_start;
2324 lio->li_end = ladvise->lla_end;
2325 lio->li_fid = ll_inode2fid(inode);
2326 lio->li_advice = ladvise->lla_advice;
2327 lio->li_flags = flags;
2329 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2330 rc = cl_io_loop(env, io);
2334 cl_io_fini(env, io);
2335 cl_env_put(env, &refcheck);
/* Main ioctl dispatcher for regular Lustre files.  Handles flag
 * get/set, striping, group locks, HSM, leases, fid2path, data version,
 * ladvise, etc.; unknown commands fall through to the dynamic ioctl
 * registry and finally to the data export via obd_iocontrol(). */
2340 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2342 struct inode *inode = file_inode(file);
2343 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2347 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2348 PFID(ll_inode2fid(inode)), inode, cmd);
2349 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2351 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2352 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2356 case LL_IOC_GETFLAGS:
2357 /* Get the current value of the file flags */
2358 return put_user(fd->fd_flags, (int __user *)arg);
2359 case LL_IOC_SETFLAGS:
2360 case LL_IOC_CLRFLAGS:
2361 /* Set or clear specific file flags */
2362 /* XXX This probably needs checks to ensure the flags are
2363 * not abused, and to handle any flag side effects.
2365 if (get_user(flags, (int __user *) arg))
2368 if (cmd == LL_IOC_SETFLAGS) {
/* Disabling locking is only safe when all IO bypasses the cache. */
2369 if ((flags & LL_FILE_IGNORE_LOCK) &&
2370 !(file->f_flags & O_DIRECT)) {
2371 CERROR("%s: unable to disable locking on "
2372 "non-O_DIRECT file\n", current->comm);
2376 fd->fd_flags |= flags;
2378 fd->fd_flags &= ~flags;
2381 case LL_IOC_LOV_SETSTRIPE:
2382 RETURN(ll_lov_setstripe(inode, file, arg));
2383 case LL_IOC_LOV_SETEA:
2384 RETURN(ll_lov_setea(inode, file, arg));
2385 case LL_IOC_LOV_SWAP_LAYOUTS: {
2387 struct lustre_swap_layouts lsl;
2389 if (copy_from_user(&lsl, (char __user *)arg,
2390 sizeof(struct lustre_swap_layouts)))
/* Both files must be writable to have their layouts exchanged. */
2393 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
2396 file2 = fget(lsl.sl_fd);
2400 /* O_WRONLY or O_RDWR */
2401 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
2402 GOTO(out, rc = -EPERM);
2404 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
2405 struct inode *inode2;
2406 struct ll_inode_info *lli;
2407 struct obd_client_handle *och = NULL;
/* SWAP_LAYOUTS_CLOSE must be the only flag when used. */
2409 if (lsl.sl_flags != SWAP_LAYOUTS_CLOSE)
2410 GOTO(out, rc = -EINVAL);
/* Steal the lease handle: swap-and-close needs an open lease. */
2412 lli = ll_i2info(inode);
2413 mutex_lock(&lli->lli_och_mutex);
2414 if (fd->fd_lease_och != NULL) {
2415 och = fd->fd_lease_och;
2416 fd->fd_lease_och = NULL;
2418 mutex_unlock(&lli->lli_och_mutex);
2420 GOTO(out, rc = -ENOLCK);
2421 inode2 = file_inode(file2);
2422 rc = ll_swap_layouts_close(och, inode, inode2);
2424 rc = ll_swap_layouts(file, file2, &lsl);
2430 case LL_IOC_LOV_GETSTRIPE:
2431 RETURN(ll_file_getstripe(inode,
2432 (struct lov_user_md __user *)arg));
2433 case FSFILT_IOC_GETFLAGS:
2434 case FSFILT_IOC_SETFLAGS:
2435 RETURN(ll_iocontrol(inode, file, cmd, arg));
2436 case FSFILT_IOC_GETVERSION_OLD:
2437 case FSFILT_IOC_GETVERSION:
2438 RETURN(put_user(inode->i_generation, (int __user *)arg));
2439 case LL_IOC_GROUP_LOCK:
2440 RETURN(ll_get_grouplock(inode, file, arg));
2441 case LL_IOC_GROUP_UNLOCK:
2442 RETURN(ll_put_grouplock(inode, file, arg));
2443 case IOC_OBD_STATFS:
2444 RETURN(ll_obd_statfs(inode, (void __user *)arg));
2446 /* We need to special case any other ioctls we want to handle,
2447 * to send them to the MDS/OST as appropriate and to properly
2448 * network encode the arg field.
2449 case FSFILT_IOC_SETVERSION_OLD:
2450 case FSFILT_IOC_SETVERSION:
2452 case LL_IOC_FLUSHCTX:
2453 RETURN(ll_flush_ctx(inode));
2454 case LL_IOC_PATH2FID: {
2455 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2456 sizeof(struct lu_fid)))
2461 case LL_IOC_GETPARENT:
2462 RETURN(ll_getparent(file, (struct getparent __user *)arg));
2464 case OBD_IOC_FID2PATH:
2465 RETURN(ll_fid2path(inode, (void __user *)arg));
2466 case LL_IOC_DATA_VERSION: {
2467 struct ioc_data_version idv;
2470 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
/* Only the documented flush flags are honored from userspace. */
2473 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2474 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2477 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2483 case LL_IOC_GET_MDTIDX: {
2486 mdtidx = ll_get_mdt_idx(inode);
2490 if (put_user((int)mdtidx, (int __user *)arg))
2495 case OBD_IOC_GETDTNAME:
2496 case OBD_IOC_GETMDNAME:
2497 RETURN(ll_get_obd_name(inode, cmd, arg));
2498 case LL_IOC_HSM_STATE_GET: {
2499 struct md_op_data *op_data;
2500 struct hsm_user_state *hus;
2507 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2508 LUSTRE_OPC_ANY, hus);
2509 if (IS_ERR(op_data)) {
2511 RETURN(PTR_ERR(op_data));
2514 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2517 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2520 ll_finish_md_op_data(op_data);
2524 case LL_IOC_HSM_STATE_SET: {
2525 struct hsm_state_set *hss;
2532 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2537 rc = ll_hsm_state_set(inode, hss);
2542 case LL_IOC_HSM_ACTION: {
2543 struct md_op_data *op_data;
2544 struct hsm_current_action *hca;
2551 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2552 LUSTRE_OPC_ANY, hca);
2553 if (IS_ERR(op_data)) {
2555 RETURN(PTR_ERR(op_data));
2558 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2561 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2564 ll_finish_md_op_data(op_data);
2568 case LL_IOC_SET_LEASE: {
2569 struct ll_inode_info *lli = ll_i2info(inode);
2570 struct obd_client_handle *och = NULL;
/* Requested lease mode must be compatible with the open mode. */
2575 case LL_LEASE_WRLCK:
2576 if (!(file->f_mode & FMODE_WRITE))
2578 fmode = FMODE_WRITE;
2580 case LL_LEASE_RDLCK:
2581 if (!(file->f_mode & FMODE_READ))
2585 case LL_LEASE_UNLCK:
2586 mutex_lock(&lli->lli_och_mutex);
2587 if (fd->fd_lease_och != NULL) {
2588 och = fd->fd_lease_och;
2589 fd->fd_lease_och = NULL;
2591 mutex_unlock(&lli->lli_och_mutex);
2596 fmode = och->och_flags;
2597 rc = ll_lease_close(och, inode, &lease_broken);
2601 rc = ll_lease_och_release(inode, file);
2608 RETURN(ll_lease_type_from_fmode(fmode));
2613 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2615 /* apply for lease */
2616 och = ll_lease_open(inode, file, fmode, 0);
2618 RETURN(PTR_ERR(och));
/* Install the new lease handle unless one already exists. */
2621 mutex_lock(&lli->lli_och_mutex);
2622 if (fd->fd_lease_och == NULL) {
2623 fd->fd_lease_och = och;
2626 mutex_unlock(&lli->lli_och_mutex);
2628 /* impossible now that only excl is supported for now */
2629 ll_lease_close(och, inode, &lease_broken);
2634 case LL_IOC_GET_LEASE: {
2635 struct ll_inode_info *lli = ll_i2info(inode);
2636 struct ldlm_lock *lock = NULL;
2639 mutex_lock(&lli->lli_och_mutex);
2640 if (fd->fd_lease_och != NULL) {
2641 struct obd_client_handle *och = fd->fd_lease_och;
/* Report the lease mode only while the DLM lock is not cancelled. */
2643 lock = ldlm_handle2lock(&och->och_lease_handle);
2645 lock_res_and_lock(lock);
2646 if (!ldlm_is_cancel(lock))
2647 fmode = och->och_flags;
2649 unlock_res_and_lock(lock);
2650 LDLM_LOCK_PUT(lock);
2653 mutex_unlock(&lli->lli_och_mutex);
2655 RETURN(ll_lease_type_from_fmode(fmode));
2657 case LL_IOC_HSM_IMPORT: {
2658 struct hsm_user_import *hui;
2664 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
2669 rc = ll_hsm_import(inode, file, hui);
2674 case LL_IOC_FUTIMES_3: {
2675 struct ll_futimes_3 lfu;
2677 if (copy_from_user(&lfu,
2678 (const struct ll_futimes_3 __user *)arg,
2682 RETURN(ll_file_futimes_3(file, &lfu));
2684 case LL_IOC_LADVISE: {
2685 struct llapi_ladvise_hdr *ladvise_hdr;
2688 int alloc_size = sizeof(*ladvise_hdr);
/* First read just the header to learn how many advices follow. */
2691 OBD_ALLOC_PTR(ladvise_hdr);
2692 if (ladvise_hdr == NULL)
2695 if (copy_from_user(ladvise_hdr,
2696 (const struct llapi_ladvise_hdr __user *)arg,
2698 GOTO(out_ladvise, rc = -EFAULT);
2700 if (ladvise_hdr->lah_magic != LADVISE_MAGIC ||
2701 ladvise_hdr->lah_count < 1)
2702 GOTO(out_ladvise, rc = -EINVAL);
2704 num_advise = ladvise_hdr->lah_count;
2705 if (num_advise >= LAH_COUNT_MAX)
2706 GOTO(out_ladvise, rc = -EFBIG);
/* Reallocate at full size and re-copy header plus advice array. */
2708 OBD_FREE_PTR(ladvise_hdr);
2709 alloc_size = offsetof(typeof(*ladvise_hdr),
2710 lah_advise[num_advise]);
2711 OBD_ALLOC(ladvise_hdr, alloc_size);
2712 if (ladvise_hdr == NULL)
2716 * TODO: submit multiple advices to one server in a single RPC
2718 if (copy_from_user(ladvise_hdr,
2719 (const struct llapi_ladvise_hdr __user *)arg,
2721 GOTO(out_ladvise, rc = -EFAULT);
2723 for (i = 0; i < num_advise; i++) {
2724 rc = ll_ladvise(inode, file, ladvise_hdr->lah_flags,
2725 &ladvise_hdr->lah_advise[i]);
2731 OBD_FREE(ladvise_hdr, alloc_size);
/* Not handled above: try dynamically registered handlers, then fall
 * back to the data (OST) export. */
2738 ll_iocontrol_call(inode, file, cmd, arg, &err))
2741 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2742 (void __user *)arg));
2747 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Clamp-and-commit helper for llseek: validate @offset against the file
 * mode and @maxsize, and store it into f_pos (resetting f_version) only
 * when the position actually changes.
 * NOTE(review): the error-return statements and closing brace are not
 * visible in this listing -- confirm against the complete source.
 */
2748 static inline loff_t
2749 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2751 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2753 if (offset > maxsize)
2756 if (offset != file->f_pos) {
2757 file->f_pos = offset;
2758 file->f_version = 0;
/*
 * Local fallback implementation of generic_file_llseek_size() for kernels
 * without HAVE_FILE_LLSEEK_SIZE (see the #ifndef above this function).
 * Handles SEEK_CUR under the inode lock to avoid racing concurrent
 * read()/write()/lseek() calls, and delegates the final bounds check and
 * f_pos update to llseek_execute().
 * NOTE(review): the switch/case labels for the SEEK_* origins are missing
 * from this listing; only the comments and a few statements survive.
 */
2764 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2765 loff_t maxsize, loff_t eof)
2767 struct inode *inode = file_inode(file);
2775 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2776 * position-querying operation. Avoid rewriting the "same"
2777 * f_pos value back to the file because a concurrent read(),
2778 * write() or lseek() might have altered it
2783 * f_lock protects against read/modify/write race with other
2784 * SEEK_CURs. Note that parallel writes and reads behave
2788 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2789 inode_unlock(inode);
2793 * In the generic case the entire file is data, so as long as
2794 * offset isn't at the end of the file then the offset is data.
2801 * There is a virtual hole at the end of the file, so as long as
2802 * offset isn't i_size or larger, return i_size.
2810 return llseek_execute(file, offset, maxsize);
/*
 * ->llseek handler for Lustre files.  For SEEK_END/SEEK_HOLE/SEEK_DATA a
 * glimpse RPC (ll_glimpse_size) refreshes the cluster-wide file size
 * before the seek is resolved; the actual positioning is then done by
 * ll_generic_file_llseek_size() bounded by ll_file_maxbytes().
 */
2814 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2816 struct inode *inode = file_inode(file);
2817 loff_t retval, eof = 0;
/* retval here is only a preliminary target used for the trace message;
 * the real result comes from ll_generic_file_llseek_size() below. */
2820 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2821 (origin == SEEK_CUR) ? file->f_pos : 0);
2822 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2823 PFID(ll_inode2fid(inode)), inode, retval, retval,
2825 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2827 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2828 retval = ll_glimpse_size(inode);
2831 eof = i_size_read(inode);
2834 retval = ll_generic_file_llseek_size(file, offset, origin,
2835 ll_file_maxbytes(inode), eof);
/*
 * ->flush handler: report (once) any asynchronous writeback error that
 * was recorded for this inode.  Reads and clears lli_async_rc and the
 * per-object async rc; returns -EIO if an error was pending, except when
 * fd_write_failed shows the application has already been told.
 * NOTE(review): declarations of rc/err and the rc/err merge lines are
 * missing from this listing.
 */
2839 static int ll_flush(struct file *file, fl_owner_t id)
2841 struct inode *inode = file_inode(file);
2842 struct ll_inode_info *lli = ll_i2info(inode);
2843 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2846 LASSERT(!S_ISDIR(inode->i_mode));
2848 /* catch async errors that were recorded back when async writeback
2849 * failed for pages in this mapping. */
2850 rc = lli->lli_async_rc;
2851 lli->lli_async_rc = 0;
2852 if (lli->lli_clob != NULL) {
2853 err = lov_read_and_clear_async_rc(lli->lli_clob);
2858 /* The application has been told write failure already.
2859 * Do not report failure again. */
2860 if (fd->fd_write_failed)
2862 return rc ? -EIO : 0;
2866 * Called to make sure a portion of file has been written out.
2867 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2869 * Return how many pages have been written.
/*
 * Flush a byte range of @inode via a CIT_FSYNC cl_io.  @mode selects how
 * far the sync goes (see cl_fsync_mode); anything but CL_FSYNC_LOCAL
 * causes OST_SYNC RPCs to be sent.  On success the result is the number
 * of pages written (fi_nr_written); @ignore_layout is passed through to
 * the io so a layout change does not abort the sync.
 */
2871 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2872 enum cl_fsync_mode mode, int ignore_layout)
2876 struct cl_fsync_io *fio;
/* reject unknown sync modes up front */
2881 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2882 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2885 env = cl_env_get(&refcheck);
2887 RETURN(PTR_ERR(env));
2889 io = vvp_env_thread_io(env);
2890 io->ci_obj = ll_i2info(inode)->lli_clob;
2891 io->ci_ignore_layout = ignore_layout;
2893 /* initialize parameters for sync */
2894 fio = &io->u.ci_fsync;
2895 fio->fi_start = start;
2897 fio->fi_fid = ll_inode2fid(inode);
2898 fio->fi_mode = mode;
2899 fio->fi_nr_written = 0;
2901 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2902 result = cl_io_loop(env, io);
2904 result = io->ci_result;
/* on success report how many pages were written out */
2906 result = fio->fi_nr_written;
2907 cl_io_fini(env, io);
2908 cl_env_put(env, &refcheck);
2914 * When dentry is provided (the 'else' case), file_dentry() may be
2915 * null and dentry must be used directly rather than pulled from
2916 * file_dentry() as is done otherwise.
/*
 * ->fsync handler.  Three prototype variants are compiled in depending on
 * the kernel's fsync signature (4-arg range fsync, 2-arg, or the old
 * dentry-taking form); the older variants sync the whole file
 * (start=0 implied, end=LLONG_MAX).  Sequence: wait for in-flight page
 * writeback, surface any recorded async writeback error, md_fsync() the
 * MDT, then cl_sync_file_range(CL_FSYNC_ALL) for regular files, tracking
 * success/failure in fd_write_failed for ll_flush().
 */
2919 #ifdef HAVE_FILE_FSYNC_4ARGS
2920 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2922 struct dentry *dentry = file_dentry(file);
2923 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2924 int ll_fsync(struct file *file, int datasync)
2926 struct dentry *dentry = file_dentry(file);
2928 loff_t end = LLONG_MAX;
2930 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2933 loff_t end = LLONG_MAX;
2935 struct inode *inode = dentry->d_inode;
2936 struct ll_inode_info *lli = ll_i2info(inode);
2937 struct ptlrpc_request *req;
2941 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2942 PFID(ll_inode2fid(inode)), inode);
2943 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
/* 4-arg kernels hand us the range; write and wait it here (the matching
 * inode_lock appears to be taken in a line missing from this listing,
 * released at the end below -- confirm against the full source). */
2945 #ifdef HAVE_FILE_FSYNC_4ARGS
2946 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2949 /* fsync's caller has already called _fdata{sync,write}, we want
2950 * that IO to finish before calling the osc and mdc sync methods */
2951 rc = filemap_fdatawait(inode->i_mapping);
2954 /* catch async errors that were recorded back when async writeback
2955 * failed for pages in this mapping. */
2956 if (!S_ISDIR(inode->i_mode)) {
2957 err = lli->lli_async_rc;
2958 lli->lli_async_rc = 0;
2961 if (lli->lli_clob != NULL) {
2962 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* sync metadata on the MDT */
2968 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
2972 ptlrpc_req_finished(req);
2974 if (S_ISREG(inode->i_mode)) {
2975 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2977 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2978 if (rc == 0 && err < 0)
2981 fd->fd_write_failed = true;
2983 fd->fd_write_failed = false;
2986 #ifdef HAVE_FILE_FSYNC_4ARGS
2987 inode_unlock(inode);
/*
 * Shared handler for ->lock (POSIX fcntl locks) and ->flock (BSD flock):
 * translate the kernel file_lock into an LDLM_FLOCK enqueue against the
 * MDT, then mirror the server's decision into the local lock tables
 * (locks_lock_file_wait() or the older flock/posix variants).  On local
 * bookkeeping failure the server lock is dropped again with an LCK_NL
 * (unlock) enqueue so client and server stay consistent.
 * NOTE(review): the switch statements mapping fl_type/cmd to ei_mode and
 * flags are only partially visible in this listing (case labels missing).
 */
2993 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2995 struct inode *inode = file_inode(file);
2996 struct ll_sb_info *sbi = ll_i2sbi(inode);
2997 struct ldlm_enqueue_info einfo = {
2998 .ei_type = LDLM_FLOCK,
2999 .ei_cb_cp = ldlm_flock_completion_ast,
3000 .ei_cbdata = file_lock,
3002 struct md_op_data *op_data;
3003 struct lustre_handle lockh = { 0 };
3004 union ldlm_policy_data flock = { { 0 } };
3005 int fl_type = file_lock->fl_type;
3011 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3012 PFID(ll_inode2fid(inode)), file_lock);
3014 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3016 if (file_lock->fl_flags & FL_FLOCK) {
3017 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3018 /* flocks are whole-file locks */
3019 flock.l_flock.end = OFFSET_MAX;
3020 /* For flocks owner is determined by the local file desctiptor*/
3021 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3022 } else if (file_lock->fl_flags & FL_POSIX) {
3023 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3024 flock.l_flock.start = file_lock->fl_start;
3025 flock.l_flock.end = file_lock->fl_end;
3029 flock.l_flock.pid = file_lock->fl_pid;
3031 /* Somewhat ugly workaround for svc lockd.
3032 * lockd installs custom fl_lmops->lm_compare_owner that checks
3033 * for the fl_owner to be the same (which it always is on local node
3034 * I guess between lockd processes) and then compares pid.
3035 * As such we assign pid to the owner field to make it all work,
3036 * conflict with normal locks is unlikely since pid space and
3037 * pointer space for current->files are not intersecting */
3038 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3039 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
3043 einfo.ei_mode = LCK_PR;
3046 /* An unlock request may or may not have any relation to
3047 * existing locks so we may not be able to pass a lock handle
3048 * via a normal ldlm_lock_cancel() request. The request may even
3049 * unlock a byte range in the middle of an existing lock. In
3050 * order to process an unlock request we need all of the same
3051 * information that is given with a normal read or write record
3052 * lock request. To avoid creating another ldlm unlock (cancel)
3053 * message we'll treat a LCK_NL flock request as an unlock. */
3054 einfo.ei_mode = LCK_NL;
3057 einfo.ei_mode = LCK_PW;
3060 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
3075 flags = LDLM_FL_BLOCK_NOWAIT;
3081 flags = LDLM_FL_TEST_LOCK;
3084 CERROR("unknown fcntl lock command: %d\n", cmd);
3088 /* Save the old mode so that if the mode in the lock changes we
3089 * can decrement the appropriate reader or writer refcount. */
3090 file_lock->fl_type = einfo.ei_mode;
3092 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3093 LUSTRE_OPC_ANY, NULL);
3094 if (IS_ERR(op_data))
3095 RETURN(PTR_ERR(op_data));
3097 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
3098 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
3099 flock.l_flock.pid, flags, einfo.ei_mode,
3100 flock.l_flock.start, flock.l_flock.end);
3102 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
3105 /* Restore the file lock type if not TEST lock. */
3106 if (!(flags & LDLM_FL_TEST_LOCK))
3107 file_lock->fl_type = fl_type;
3109 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
3110 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
3111 !(flags & LDLM_FL_TEST_LOCK))
3112 rc2 = locks_lock_file_wait(file, file_lock);
3114 if ((file_lock->fl_flags & FL_FLOCK) &&
3115 (rc == 0 || file_lock->fl_type == F_UNLCK))
3116 rc2 = flock_lock_file_wait(file, file_lock);
3117 if ((file_lock->fl_flags & FL_POSIX) &&
3118 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3119 !(flags & LDLM_FL_TEST_LOCK))
3120 rc2 = posix_lock_file_wait(file, file_lock);
3121 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
/* local bookkeeping failed: undo the server-side lock via LCK_NL */
3123 if (rc2 && file_lock->fl_type != F_UNLCK) {
3124 einfo.ei_mode = LCK_NL;
3125 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
3130 ll_finish_md_op_data(op_data);
/*
 * Look up @name (length @namelen) under @parent on the MDT and return its
 * FID in *@fid.  When @inode is non-NULL (hedge: the NULL-check line is
 * missing from this listing -- confirm), the matching inode is also
 * instantiated via ll_prep_inode().  Returns 0 or a negative errno.
 */
3135 int ll_get_fid_by_name(struct inode *parent, const char *name,
3136 int namelen, struct lu_fid *fid,
3137 struct inode **inode)
3139 struct md_op_data *op_data = NULL;
3140 struct mdt_body *body;
3141 struct ptlrpc_request *req;
3145 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3146 LUSTRE_OPC_ANY, NULL);
3147 if (IS_ERR(op_data))
3148 RETURN(PTR_ERR(op_data));
/* only need the FID and file type back from the server */
3150 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
3151 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3152 ll_finish_md_op_data(op_data);
3156 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3158 GOTO(out_req, rc = -EFAULT);
3160 *fid = body->mbo_fid1;
3163 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
3165 ptlrpc_req_finished(req);
/*
 * Migrate the child @name of directory @parent to MDT @mdtidx (backend of
 * "lfs migrate -m").  Steps: resolve the child inode (dcache first, then
 * ll_get_fid_by_name), refuse to migrate the filesystem root, skip if the
 * child already lives on the target MDT, take a write lease plus data
 * version for regular files so data movement can be fenced, then issue a
 * same-name md_rename() with CLI_MIGRATE/MDS_RENAME_MIGRATE.  Cleanup
 * runs through the usual GOTO labels (out_close/out_unlock/out_iput/
 * out_free).
 * NOTE(review): several control-flow lines (the retry loop for -EAGAIN,
 * parts of the lease error path) are missing from this listing.
 */
3169 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3170 const char *name, int namelen)
3172 struct dentry *dchild = NULL;
3173 struct inode *child_inode = NULL;
3174 struct md_op_data *op_data;
3175 struct ptlrpc_request *request = NULL;
3176 struct obd_client_handle *och = NULL;
3178 struct mdt_body *body;
3180 __u64 data_version = 0;
3183 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3184 name, PFID(ll_inode2fid(parent)), mdtidx);
3186 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3187 0, LUSTRE_OPC_ANY, NULL);
3188 if (IS_ERR(op_data))
3189 RETURN(PTR_ERR(op_data));
3191 /* Get child FID first */
3192 qstr.hash = full_name_hash(name, namelen);
3195 dchild = d_lookup(file_dentry(file), &qstr);
3196 if (dchild != NULL) {
3197 if (dchild->d_inode != NULL)
3198 child_inode = igrab(dchild->d_inode);
/* dcache miss: ask the MDT for the FID and inode */
3202 if (child_inode == NULL) {
3203 rc = ll_get_fid_by_name(parent, name, namelen,
3204 &op_data->op_fid3, &child_inode);
3209 if (child_inode == NULL)
3210 GOTO(out_free, rc = -EINVAL);
3213 * lfs migrate command needs to be blocked on the client
3214 * by checking the migrate FID against the FID of the
3217 if (child_inode == parent->i_sb->s_root->d_inode)
3218 GOTO(out_iput, rc = -EINVAL);
3220 inode_lock(child_inode);
3221 op_data->op_fid3 = *ll_inode2fid(child_inode);
3222 if (!fid_is_sane(&op_data->op_fid3)) {
3223 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
3224 ll_get_fsname(parent->i_sb, NULL, 0), name,
3225 PFID(&op_data->op_fid3));
3226 GOTO(out_unlock, rc = -EINVAL);
3229 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3231 GOTO(out_unlock, rc);
/* already on the requested MDT: nothing to do */
3234 CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name,
3235 PFID(&op_data->op_fid3), mdtidx);
3236 GOTO(out_unlock, rc = 0);
/* for regular files take a write lease and snapshot the data version
 * so the server can detect concurrent modification during migration */
3239 if (S_ISREG(child_inode->i_mode)) {
3240 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
3244 GOTO(out_unlock, rc);
3247 rc = ll_data_version(child_inode, &data_version,
3250 GOTO(out_close, rc);
3252 op_data->op_handle = och->och_fh;
3253 op_data->op_data = och->och_mod;
3254 op_data->op_data_version = data_version;
3255 op_data->op_lease_handle = och->och_lease_handle;
3256 op_data->op_bias |= MDS_RENAME_MIGRATE;
3259 op_data->op_mds = mdtidx;
3260 op_data->op_cli_flags = CLI_MIGRATE;
3261 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3262 namelen, name, namelen, &request);
3264 LASSERT(request != NULL);
3265 ll_update_times(request, parent);
3267 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
3268 LASSERT(body != NULL);
3270 /* If the server does release layout lock, then we cleanup
3271 * the client och here, otherwise release it in out_close: */
3273 body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
3274 obd_mod_put(och->och_mod);
3275 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
3277 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
3283 if (request != NULL) {
3284 ptlrpc_req_finished(request);
3288 /* Try again if the file layout has changed. */
3289 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
3293 if (och != NULL) /* close the file */
3294 ll_lease_close(och, child_inode, NULL);
3296 clear_nlink(child_inode);
3298 inode_unlock(child_inode);
3302 ll_finish_md_op_data(op_data);
/*
 * ->lock/->flock stub wired into ll_file_operations_noflock (-o noflock
 * mounts); its body is not visible in this listing.
 */
3307 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3315 * test if some locks matching bits and l_req_mode are acquired
3316 * - bits can be in different locks
3317 * - if found clear the common lock bits in *bits
3318 * - the bits not found, are kept in *bits
3320 * \param bits [IN] searched lock bits [IN]
3321 * \param l_req_mode [IN] searched lock mode
3322 * \retval boolean, true iff all bits are found
/*
 * See the block comment above: test whether MD ibits locks covering
 * *@bits in mode @l_req_mode are already cached.  Iterates the inodebits
 * one shift at a time, matching each against the namespace with
 * LDLM_FL_TEST_LOCK (no reference kept); found bits are cleared from
 * *@bits so the caller can see what remains unlocked.
 */
3324 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
3326 struct lustre_handle lockh;
3327 union ldlm_policy_data policy;
/* LCK_MINMODE means "any mode": match all four read/write modes */
3328 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
3329 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
3338 fid = &ll_i2info(inode)->lli_fid;
3339 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3340 ldlm_lockname[mode]);
3342 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
3343 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3344 policy.l_inodebits.bits = *bits & (1 << i);
3345 if (policy.l_inodebits.bits == 0)
3348 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3349 &policy, mode, &lockh)) {
3350 struct ldlm_lock *lock;
3352 lock = ldlm_handle2lock(&lockh);
3355 ~(lock->l_policy_data.l_inodebits.bits);
3356 LDLM_LOCK_PUT(lock);
3358 *bits &= ~policy.l_inodebits.bits;
/*
 * Try to match an already-granted MD ibits lock on @inode covering @bits
 * in @mode.  Unlike ll_have_md_lock() this does not pass
 * LDLM_FL_TEST_LOCK, so a successful match returns the mode with a
 * reference held via *@lockh; extra match behavior is tuned with @flags.
 */
3365 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
3366 struct lustre_handle *lockh, __u64 flags,
3367 enum ldlm_mode mode)
3369 union ldlm_policy_data policy = { .l_inodebits = { bits } };
3374 fid = &ll_i2info(inode)->lli_fid;
3375 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3377 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3378 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the result of a revalidation RPC.  -ENOENT on a live
 * striped directory (bad stripe) is propagated so the dentry gets
 * revalidated again; for plain unlinked objects it is swallowed after
 * updating nlink.  Other errors are logged (quietly for -EACCES/-EIDRM).
 */
3383 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3385 /* Already unlinked. Just update nlink and return success */
3386 if (rc == -ENOENT) {
3388 /* If it is striped directory, and there is bad stripe
3389 * Let's revalidate the dentry again, instead of returning
3391 if (S_ISDIR(inode->i_mode) &&
3392 ll_i2info(inode)->lli_lsm_md != NULL)
3395 /* This path cannot be hit for regular files unless in
3396 * case of obscure races, so no need to to validate
3398 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3400 } else if (rc != 0) {
3401 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3402 "%s: revalidate FID "DFID" error: rc = %d\n",
3403 ll_get_fsname(inode->i_sb, NULL, 0),
3404 PFID(ll_inode2fid(inode)), rc);
/*
 * Refresh @dentry's inode attributes from the MDT for the given ibits.
 * Two paths: with OBD_CONNECT_ATTRFID, an intent getattr/lookup by FID
 * (md_intent_lock) that also refreshes dcache state; otherwise, if no
 * matching MD lock is cached locally, a plain md_getattr() (including EA
 * size for regular files) followed by ll_prep_inode().
 */
3410 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3412 struct inode *inode = dentry->d_inode;
3413 struct ptlrpc_request *req = NULL;
3414 struct obd_export *exp;
3418 LASSERT(inode != NULL);
3420 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3421 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3423 exp = ll_i2mdexp(inode);
3425 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3426 * But under CMD case, it caused some lock issues, should be fixed
3427 * with new CMD ibits lock. See bug 12718 */
3428 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3429 struct lookup_intent oit = { .it_op = IT_GETATTR };
3430 struct md_op_data *op_data;
3432 if (ibits == MDS_INODELOCK_LOOKUP)
3433 oit.it_op = IT_LOOKUP;
3435 /* Call getattr by fid, so do not provide name at all. */
3436 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3437 dentry->d_inode, NULL, 0, 0,
3438 LUSTRE_OPC_ANY, NULL);
3439 if (IS_ERR(op_data))
3440 RETURN(PTR_ERR(op_data));
3442 rc = md_intent_lock(exp, op_data, &oit, &req,
3443 &ll_md_blocking_ast, 0);
3444 ll_finish_md_op_data(op_data);
3446 rc = ll_inode_revalidate_fini(inode, rc);
3450 rc = ll_revalidate_it_finish(req, &oit, dentry);
3452 ll_intent_release(&oit);
3456 /* Unlinked? Unhash dentry, so it is not picked up later by
3457 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3458 here to preserve get_cwd functionality on 2.6.
3460 if (!dentry->d_inode->i_nlink) {
3461 ll_lock_dcache(inode);
3462 d_lustre_invalidate(dentry, 0);
3463 ll_unlock_dcache(inode);
3466 ll_lookup_finish_locks(&oit, dentry);
3467 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3468 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3469 u64 valid = OBD_MD_FLGETATTR;
3470 struct md_op_data *op_data;
/* regular files also need the striping EA, sized to the MDT default */
3473 if (S_ISREG(inode->i_mode)) {
3474 rc = ll_get_default_mdsize(sbi, &ealen);
3477 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3480 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3481 0, ealen, LUSTRE_OPC_ANY,
3483 if (IS_ERR(op_data))
3484 RETURN(PTR_ERR(op_data));
3486 op_data->op_valid = valid;
3487 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3488 ll_finish_md_op_data(op_data);
3490 rc = ll_inode_revalidate_fini(inode, rc);
3494 rc = ll_prep_inode(&inode, req, NULL, NULL);
3497 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge the per-stripe attributes from all MDTs
 * (md_merge_attr) into the master inode: nlink, blocks, size, and the
 * cached a/m/ctime in ll_inode_info.
 */
3501 static int ll_merge_md_attr(struct inode *inode)
3503 struct cl_attr attr = { 0 };
3506 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3507 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3508 &attr, ll_md_blocking_ast);
3512 set_nlink(inode, attr.cat_nlink);
3513 inode->i_blocks = attr.cat_blocks;
3514 i_size_write(inode, attr.cat_size);
3516 ll_i2info(inode)->lli_atime = attr.cat_atime;
3517 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
3518 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/*
 * Full revalidation: MD attributes via __ll_inode_revalidate(), then the
 * file size.  Non-regular objects copy the cached timestamps into the
 * inode (striped dirs first merging per-stripe attrs); regular files do
 * a glimpse unless a restore is in progress (LLIF_FILE_RESTORING), in
 * which case the MDT-supplied size is already authoritative and a glimpse
 * would block until the restore completes.
 */
3524 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3526 struct inode *inode = dentry->d_inode;
3530 rc = __ll_inode_revalidate(dentry, ibits);
3534 /* if object isn't regular file, don't validate size */
3535 if (!S_ISREG(inode->i_mode)) {
3536 if (S_ISDIR(inode->i_mode) &&
3537 ll_i2info(inode)->lli_lsm_md != NULL) {
3538 rc = ll_merge_md_attr(inode);
3543 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
3544 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
3545 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
3547 /* In case of restore, the MDT has the right size and has
3548 * already send it back without granting the layout lock,
3549 * inode is up-to-date so glimpse is useless.
3550 * Also to glimpse we need the layout, in case of a running
3551 * restore the MDT holds the layout lock so the glimpse will
3552 * block up to the end of restore (getattr will block)
3554 if (!ll_file_test_flag(ll_i2info(inode), LLIF_FILE_RESTORING))
3555 rc = ll_glimpse_size(inode);
/*
 * ->getattr handler: revalidate UPDATE|LOOKUP ibits, then fill *@stat
 * from the (now fresh) inode.  With a 32-bit API client the inode number
 * is rebuilt from the FID via cl_fid_build_ino().
 */
3560 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3562 struct inode *inode = de->d_inode;
3563 struct ll_sb_info *sbi = ll_i2sbi(inode);
3564 struct ll_inode_info *lli = ll_i2info(inode);
3567 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3568 MDS_INODELOCK_LOOKUP);
3569 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
/* fault-injection hook for testing delayed getattr */
3574 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
3576 stat->dev = inode->i_sb->s_dev;
3577 if (ll_need_32bit_api(sbi))
3578 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3580 stat->ino = inode->i_ino;
3581 stat->mode = inode->i_mode;
3582 stat->uid = inode->i_uid;
3583 stat->gid = inode->i_gid;
3584 stat->rdev = inode->i_rdev;
3585 stat->atime = inode->i_atime;
3586 stat->mtime = inode->i_mtime;
3587 stat->ctime = inode->i_ctime;
3588 stat->blksize = 1 << inode->i_blkbits;
3590 stat->nlink = inode->i_nlink;
3591 stat->size = i_size_read(inode);
3592 stat->blocks = inode->i_blocks;
/*
 * ->fiemap handler: marshal the kernel's fiemap_extent_info into a
 * struct fiemap sized for fi_extents_max extents, run ll_do_fiemap(),
 * and copy flags/extent results back to user space.  Only the first
 * extent is copied in (used by FIEMAP_EXTENT_LAST continuation); all
 * mapped extents are copied out.
 */
3597 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3598 __u64 start, __u64 len)
3602 struct fiemap *fiemap;
3603 unsigned int extent_count = fieinfo->fi_extents_max;
3605 num_bytes = sizeof(*fiemap) + (extent_count *
3606 sizeof(struct fiemap_extent));
3607 OBD_ALLOC_LARGE(fiemap, num_bytes);
3612 fiemap->fm_flags = fieinfo->fi_flags;
3613 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3614 fiemap->fm_start = start;
3615 fiemap->fm_length = len;
3616 if (extent_count > 0 &&
3617 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3618 sizeof(struct fiemap_extent)) != 0)
3619 GOTO(out, rc = -EFAULT);
3621 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3623 fieinfo->fi_flags = fiemap->fm_flags;
3624 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3625 if (extent_count > 0 &&
3626 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3627 fiemap->fm_mapped_extents *
3628 sizeof(struct fiemap_extent)) != 0)
3629 GOTO(out, rc = -EFAULT);
3631 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * Return a referenced copy of the cached POSIX ACL for @inode under
 * lli_lock; the caller (VFS permission code) drops the reference.
 * Note @type is not consulted in the visible code -- only the access
 * ACL (lli_posix_acl) appears to be cached.
 */
3635 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3637 struct ll_inode_info *lli = ll_i2info(inode);
3638 struct posix_acl *acl = NULL;
3641 spin_lock(&lli->lli_lock);
3642 /* VFS' acl_permission_check->check_acl will release the refcount */
3643 acl = posix_acl_dup(lli->lli_posix_acl);
3644 spin_unlock(&lli->lli_lock);
/*
 * ACL callback for older kernels lacking generic_permission() with an
 * ACL-check argument (HAVE_GENERIC_PERMISSION_2ARGS).  With
 * CONFIG_FS_POSIX_ACL it fetches the cached access ACL and runs
 * posix_acl_permission(); under RCU walk (IPERM_FLAG_RCU) it bails out
 * rather than block.  Without ACL support it degenerates to a stub
 * (return value line not visible in this listing).
 */
3649 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
3651 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3652 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3654 ll_check_acl(struct inode *inode, int mask)
3657 # ifdef CONFIG_FS_POSIX_ACL
3658 struct posix_acl *acl;
3662 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3663 if (flags & IPERM_FLAG_RCU)
3666 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3671 rc = posix_acl_permission(inode, acl, mask);
3672 posix_acl_release(acl);
3675 # else /* !CONFIG_FS_POSIX_ACL */
3677 # endif /* CONFIG_FS_POSIX_ACL */
3679 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * ->permission handler (three kernel-ABI variants of the prototype).
 * Revalidates the root inode on first access, then optionally applies
 * root squashing: when rsi_uid is configured and the caller is root
 * (and LL_SBI_NOROOTSQUASH is not set), override the task credentials
 * with the squashed fsuid/fsgid and drop filesystem capabilities before
 * delegating to ll_generic_permission(); the original creds are restored
 * afterwards.  Returns -ECHILD (not visible here) for RCU-walk requests
 * it cannot serve without blocking -- confirm against the full source.
 */
3681 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3682 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3684 # ifdef HAVE_INODE_PERMISION_2ARGS
3685 int ll_inode_permission(struct inode *inode, int mask)
3687 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3692 struct ll_sb_info *sbi;
3693 struct root_squash_info *squash;
3694 struct cred *cred = NULL;
3695 const struct cred *old_cred = NULL;
3697 bool squash_id = false;
3700 #ifdef MAY_NOT_BLOCK
3701 if (mask & MAY_NOT_BLOCK)
3703 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3704 if (flags & IPERM_FLAG_RCU)
3708 /* as root inode are NOT getting validated in lookup operation,
3709 * need to do it before permission check. */
3711 if (inode == inode->i_sb->s_root->d_inode) {
3712 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3713 MDS_INODELOCK_LOOKUP);
3718 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3719 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3721 /* squash fsuid/fsgid if needed */
3722 sbi = ll_i2sbi(inode);
3723 squash = &sbi->ll_squash;
3724 if (unlikely(squash->rsi_uid != 0 &&
3725 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
3726 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
3730 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
3731 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
3732 squash->rsi_uid, squash->rsi_gid);
3734 /* update current process's credentials
3735 * and FS capability */
3736 cred = prepare_creds();
3740 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
3741 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* drop every capability in CFS_CAP_FS_MASK from the squashed creds */
3742 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
3743 if ((1 << cap) & CFS_CAP_FS_MASK)
3744 cap_lower(cred->cap_effective, cap);
3746 old_cred = override_creds(cred);
3749 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
3750 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3751 /* restore current process's credentials and FS capability */
3753 revert_creds(old_cred);
3760 /* -o localflock - only provides locally consistent flock locks */
/* Default file_operations ("-o localflock" comment above applies to the
 * trio of tables): no ->flock/->lock entries, so the kernel falls back
 * to locally-consistent locking only. */
3761 struct file_operations ll_file_operations = {
3762 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
3763 # ifdef HAVE_SYNC_READ_WRITE
3764 .read = new_sync_read,
3765 .write = new_sync_write,
3767 .read_iter = ll_file_read_iter,
3768 .write_iter = ll_file_write_iter,
3769 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3770 .read = ll_file_read,
3771 .aio_read = ll_file_aio_read,
3772 .write = ll_file_write,
3773 .aio_write = ll_file_aio_write,
3774 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3775 .unlocked_ioctl = ll_file_ioctl,
3776 .open = ll_file_open,
3777 .release = ll_file_release,
3778 .mmap = ll_file_mmap,
3779 .llseek = ll_file_seek,
3780 .splice_read = ll_file_splice_read,
/* file_operations used with "-o flock": identical to the default table
 * but routes ->flock and ->lock through ll_file_flock for cluster-wide
 * lock coherency. */
3785 struct file_operations ll_file_operations_flock = {
3786 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
3787 # ifdef HAVE_SYNC_READ_WRITE
3788 .read = new_sync_read,
3789 .write = new_sync_write,
3790 # endif /* HAVE_SYNC_READ_WRITE */
3791 .read_iter = ll_file_read_iter,
3792 .write_iter = ll_file_write_iter,
3793 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3794 .read = ll_file_read,
3795 .aio_read = ll_file_aio_read,
3796 .write = ll_file_write,
3797 .aio_write = ll_file_aio_write,
3798 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3799 .unlocked_ioctl = ll_file_ioctl,
3800 .open = ll_file_open,
3801 .release = ll_file_release,
3802 .mmap = ll_file_mmap,
3803 .llseek = ll_file_seek,
3804 .splice_read = ll_file_splice_read,
3807 .flock = ll_file_flock,
3808 .lock = ll_file_flock
3811 /* These are for -o noflock - to return ENOSYS on flock calls */
/* file_operations used with "-o noflock" (see comment above): ->flock
 * and ->lock point at ll_file_noflock, which rejects lock requests. */
3812 struct file_operations ll_file_operations_noflock = {
3813 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
3814 # ifdef HAVE_SYNC_READ_WRITE
3815 .read = new_sync_read,
3816 .write = new_sync_write,
3817 # endif /* HAVE_SYNC_READ_WRITE */
3818 .read_iter = ll_file_read_iter,
3819 .write_iter = ll_file_write_iter,
3820 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3821 .read = ll_file_read,
3822 .aio_read = ll_file_aio_read,
3823 .write = ll_file_write,
3824 .aio_write = ll_file_aio_write,
3825 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3826 .unlocked_ioctl = ll_file_ioctl,
3827 .open = ll_file_open,
3828 .release = ll_file_release,
3829 .mmap = ll_file_mmap,
3830 .llseek = ll_file_seek,
3831 .splice_read = ll_file_splice_read,
3834 .flock = ll_file_noflock,
3835 .lock = ll_file_noflock
/* inode_operations for regular Lustre files: attribute get/set, xattrs,
 * fiemap, permission checking, and (when the kernel supports it) ACL
 * retrieval. */
3838 struct inode_operations ll_file_inode_operations = {
3839 .setattr = ll_setattr,
3840 .getattr = ll_getattr,
3841 .permission = ll_inode_permission,
3842 .setxattr = ll_setxattr,
3843 .getxattr = ll_getxattr,
3844 .listxattr = ll_listxattr,
3845 .removexattr = ll_removexattr,
3846 .fiemap = ll_fiemap,
3847 #ifdef HAVE_IOP_GET_ACL
3848 .get_acl = ll_get_acl,
3852 /* dynamic ioctl number support routins */
/* Registry of dynamically-registered ioctl handlers: a list of
 * llioc_data entries protected by a rw_semaphore (readers iterate in
 * ll_iocontrol_call, writers register/unregister). */
3853 static struct llioc_ctl_data {
3854 struct rw_semaphore ioc_sem;
3855 struct list_head ioc_head;
3857 __RWSEM_INITIALIZER(llioc.ioc_sem),
3858 LIST_HEAD_INIT(llioc.ioc_head)
/* One registered handler: callback plus the array of ioctl command
 * numbers it serves (iocd_cmd is a trailing variable-length array;
 * iocd_size records the total allocation for freeing). */
3863 struct list_head iocd_list;
3864 unsigned int iocd_size;
3865 llioc_callback_t iocd_cb;
3866 unsigned int iocd_count;
3867 unsigned int iocd_cmd[0];
/*
 * Register a dynamic ioctl handler: callback @cb serving the @count
 * command numbers in @cmd.  Returns an opaque cookie (the allocated
 * llioc_data, per the %p use in ll_iocontrol_unregister) used to
 * unregister, or NULL on bad arguments / allocation failure (the NULL
 * return lines are not visible in this listing).
 */
3870 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3873 struct llioc_data *in_data = NULL;
3876 if (cb == NULL || cmd == NULL ||
3877 count > LLIOC_MAX_CMD || count < 0)
3880 size = sizeof(*in_data) + count * sizeof(unsigned int);
3881 OBD_ALLOC(in_data, size);
3882 if (in_data == NULL)
3885 memset(in_data, 0, sizeof(*in_data));
3886 in_data->iocd_size = size;
3887 in_data->iocd_cb = cb;
3888 in_data->iocd_count = count;
3889 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3891 down_write(&llioc.ioc_sem);
3892 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3893 up_write(&llioc.ioc_sem);
/*
 * Remove the handler identified by cookie @magic (as returned from
 * ll_iocontrol_register) from the registry and free it; warns if the
 * cookie is not found.
 */
3898 void ll_iocontrol_unregister(void *magic)
3900 struct llioc_data *tmp;
3905 down_write(&llioc.ioc_sem);
3906 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3908 unsigned int size = tmp->iocd_size;
3910 list_del(&tmp->iocd_list);
3911 up_write(&llioc.ioc_sem);
3913 OBD_FREE(tmp, size);
3917 up_write(&llioc.ioc_sem);
3919 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3922 EXPORT_SYMBOL(ll_iocontrol_register);
3923 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch an unrecognized ioctl to the registered dynamic handlers:
 * walk the registry under the read lock, invoke the first callback whose
 * command table contains @cmd, and stop once a callback returns
 * LLIOC_STOP.  The callback's rc is passed back through *@rcp.
 */
3925 static enum llioc_iter
3926 ll_iocontrol_call(struct inode *inode, struct file *file,
3927 unsigned int cmd, unsigned long arg, int *rcp)
3929 enum llioc_iter ret = LLIOC_CONT;
3930 struct llioc_data *data;
3931 int rc = -EINVAL, i;
3933 down_read(&llioc.ioc_sem);
3934 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3935 for (i = 0; i < data->iocd_count; i++) {
3936 if (cmd != data->iocd_cmd[i])
3939 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3943 if (ret == LLIOC_STOP)
3946 up_read(&llioc.ioc_sem);
/*
 * Push a layout configuration into the cl_object stack (cl_conf_set).
 * For OBJECT_CONF_SET the layout lock is only allowed to match
 * (ldlm_lock_allow_match) AFTER the layout has been applied, so no
 * thread can observe a stale layout through an early match; the cached
 * layout generation in ll_inode_info is then refreshed from the object.
 */
3953 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3955 struct ll_inode_info *lli = ll_i2info(inode);
3956 struct cl_object *obj = lli->lli_clob;
3965 env = cl_env_get(&refcheck);
3967 RETURN(PTR_ERR(env));
3969 rc = cl_conf_set(env, lli->lli_clob, conf);
3973 if (conf->coc_opc == OBJECT_CONF_SET) {
3974 struct ldlm_lock *lock = conf->coc_lock;
3975 struct cl_layout cl = {
3979 LASSERT(lock != NULL);
3980 LASSERT(ldlm_has_layout(lock));
3982 /* it can only be allowed to match after layout is
3983 * applied to inode otherwise false layout would be
3984 * seen. Applying layout shoud happen before dropping
3985 * the intent lock. */
3986 ldlm_lock_allow_match(lock);
3988 rc = cl_object_layout_get(env, obj, &cl);
3993 DFID": layout version change: %u -> %u\n",
3994 PFID(&lli->lli_fid), ll_layout_version_get(lli),
3996 ll_layout_version_set(lli, cl.cl_layout_gen);
4000 cl_env_put(env, &refcheck);
4005 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
/*
 * Ensure @lock carries the file layout in its LVB.  If the layout lock
 * was granted via completion AST the LVB buffer was too small, so fetch
 * the LOV EA explicitly with md_getxattr(XATTR_NAME_LOV) and attach a
 * copy to the lock under the resource lock.  If another thread attached
 * an LVB concurrently (lvb_data no longer NULL), our copy is freed.
 */
4006 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4009 struct ll_sb_info *sbi = ll_i2sbi(inode);
4010 struct ptlrpc_request *req;
4011 struct mdt_body *body;
4018 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4019 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4020 lock->l_lvb_data, lock->l_lvb_len);
/* layout already attached to the lock: nothing to fetch */
4022 if (lock->l_lvb_data != NULL)
4025 /* if layout lock was granted right away, the layout is returned
4026 * within DLM_LVB of dlm reply; otherwise if the lock was ever
4027 * blocked and then granted via completion ast, we have to fetch
4028 * layout here. Please note that we can't use the LVB buffer in
4029 * completion AST because it doesn't have a large enough buffer */
4030 rc = ll_get_default_mdsize(sbi, &lmmsize);
4032 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4033 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
4038 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4040 GOTO(out, rc = -EPROTO);
4042 lmmsize = body->mbo_eadatasize;
4043 if (lmmsize == 0) /* empty layout */
4046 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4048 GOTO(out, rc = -EFAULT);
4050 OBD_ALLOC_LARGE(lvbdata, lmmsize);
4051 if (lvbdata == NULL)
4052 GOTO(out, rc = -ENOMEM);
4054 memcpy(lvbdata, lmm, lmmsize);
4055 lock_res_and_lock(lock);
4056 if (unlikely(lock->l_lvb_data == NULL)) {
4057 lock->l_lvb_type = LVB_T_LAYOUT;
4058 lock->l_lvb_data = lvbdata;
4059 lock->l_lvb_len = lmmsize;
4062 unlock_res_and_lock(lock);
/* lost the race: another thread installed an LVB first */
4065 OBD_FREE_LARGE(lvbdata, lmmsize);
4070 ptlrpc_req_finished(req);
/*
 * NOTE(review): partial extract — the comment opener for the header
 * below, the ENTRY/RETURN lines, the lvb_ready fast path and several
 * braces are elided in this view; comments describe only the visible
 * statements.
 *
 * Takes a layout lock handle (already referenced by the caller), makes
 * sure the lock carries layout data (fetching it from the MDT if
 * necessary), applies that layout to the inode's cl_object, then drops
 * the lock reference. If the apply returns -EBUSY the layout is still
 * in use by outstanding IO, so a second OBJECT_CONF_WAIT pass blocks
 * until the old layout is drained.
 */
4075 * Apply the layout to the inode. Layout lock is held and will be released
4078 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4079 struct inode *inode)
4081 struct ll_inode_info *lli = ll_i2info(inode);
4082 struct ll_sb_info *sbi = ll_i2sbi(inode);
4083 struct ldlm_lock *lock;
4084 struct cl_object_conf conf;
4087 bool wait_layout = false;
4090 LASSERT(lustre_handle_is_used(lockh));
4092 lock = ldlm_handle2lock(lockh);
4093 LASSERT(lock != NULL);
4094 LASSERT(ldlm_has_layout(lock));
4096 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4097 PFID(&lli->lli_fid), inode);
4099 /* in case this is a caching lock and reinstate with new inode */
4100 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
/* sample LVB_READY under the resource lock */
4102 lock_res_and_lock(lock);
4103 lvb_ready = ldlm_is_lvb_ready(lock);
4104 unlock_res_and_lock(lock);
4105 /* checking lvb_ready is racy but this is okay. The worst case is
4106 * that multi processes may configure the file on the same time. */
/* make sure the lock carries the LOV EA before applying it */
4111 rc = ll_layout_fetch(inode, lock);
4115 /* for layout lock, lmm is stored in lock's lvb.
4116 * lvb_data is immutable if the lock is held so it's safe to access it
4119 * set layout to file. Unlikely this will fail as old layout was
4120 * surely eliminated */
4121 memset(&conf, 0, sizeof conf);
4122 conf.coc_opc = OBJECT_CONF_SET;
4123 conf.coc_inode = inode;
4124 conf.coc_lock = lock;
4125 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
4126 conf.u.coc_layout.lb_len = lock->l_lvb_len;
4127 rc = ll_layout_conf(inode, &conf);
4129 /* refresh layout failed, need to wait */
4130 wait_layout = rc == -EBUSY;
/* done with the lock: drop the ldlm_handle2lock() reference and the
 * caller's mode reference */
4134 LDLM_LOCK_PUT(lock);
4135 ldlm_lock_decref(lockh, mode);
4137 /* wait for IO to complete if it's still being used. */
4139 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
4140 ll_get_fsname(inode->i_sb, NULL, 0),
4141 PFID(&lli->lli_fid), inode);
/* second pass: OBJECT_CONF_WAIT blocks until the old layout users
 * are gone so the new layout can be applied on retry */
4143 memset(&conf, 0, sizeof conf);
4144 conf.coc_opc = OBJECT_CONF_WAIT;
4145 conf.coc_inode = inode;
4146 rc = ll_layout_conf(inode, &conf);
4150 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4151 ll_get_fsname(inode->i_sb, NULL, 0),
4152 PFID(&lli->lli_fid), rc);
/*
 * Refresh the layout of @inode while holding lli_layout_mutex (the
 * caller — ll_layout_refresh() — takes the mutex).
 *
 * Fast path: match an already-cached layout DLM lock locally with
 * ll_take_md_lock() and apply it. Slow path: enqueue an IT_LAYOUT
 * intent to the MDT, then apply whatever lock came back.
 *
 * NOTE(review): partial extract — the rc declaration, ENTRY/RETURN
 * lines, the retry/error branches and closing braces are elided in
 * this view; comments describe only the visible statements.
 */
4157 static int ll_layout_refresh_locked(struct inode *inode)
4159 struct ll_inode_info *lli = ll_i2info(inode);
4160 struct ll_sb_info *sbi = ll_i2sbi(inode);
4161 struct md_op_data *op_data;
4162 struct lookup_intent it;
4163 struct lustre_handle lockh;
4164 enum ldlm_mode mode;
4165 struct ptlrpc_request *req;
4170 /* mostly layout lock is caching on the local side, so try to match
4171 * it before grabbing layout lock mutex. */
4172 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4173 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
4174 if (mode != 0) { /* hit cached lock */
4175 rc = ll_layout_lock_set(&lockh, mode, inode);
/* slow path: build op_data for an intent enqueue to the MDT */
4182 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4183 0, 0, LUSTRE_OPC_ANY, NULL);
4184 if (IS_ERR(op_data))
4185 RETURN(PTR_ERR(op_data));
4187 /* have to enqueue one */
4188 memset(&it, 0, sizeof(it));
4189 it.it_op = IT_LAYOUT;
4191 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4192 ll_get_fsname(inode->i_sb, NULL, 0),
4193 PFID(&lli->lli_fid), inode);
4195 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
4196 &ll_md_blocking_ast, 0);
/* the intent's request is not needed once the lock state has been
 * recorded in 'it'; release it before processing the result */
4197 if (it.it_request != NULL)
4198 ptlrpc_req_finished(it.it_request);
4199 it.it_request = NULL;
4201 ll_finish_md_op_data(op_data);
/* detach the lock from the intent so ll_intent_drop_lock() below does
 * not release the reference we are about to consume */
4203 mode = it.it_lock_mode;
4204 it.it_lock_mode = 0;
4205 ll_intent_drop_lock(&it);
4208 /* set lock data in case this is a new lock */
4209 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4210 lockh.cookie = it.it_lock_handle;
/* apply the freshly-enqueued layout lock to the inode */
4211 rc = ll_layout_lock_set(&lockh, mode, inode);
/*
 * NOTE(review): partial extract — the comment opener for the header
 * below, the RETURN lines and closing brace are elided in this view.
 *
 * Returns the current layout generation in *gen. Serialised against
 * concurrent refreshers by lli_layout_mutex.
 */
4220 * This function checks if there exists a LAYOUT lock on the client side,
4221 * or enqueues it if it doesn't have one in cache.
4223 * This function will not hold layout lock so it may be revoked any time after
4224 * this function returns. Any operations depend on layout should be redone
4227 * This function should be called before lov_io_init() to get an uptodate
4228 * layout version, the caller should save the version number and after IO
4229 * is finished, this function should be called again to verify that layout
4230 * is not changed during IO time.
4232 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4234 struct ll_inode_info *lli = ll_i2info(inode);
4235 struct ll_sb_info *sbi = ll_i2sbi(inode);
4239 *gen = ll_layout_version_get(lli);
/* fast exit: layout locking disabled on this mount, or the inode
 * already has a valid (non-NONE) layout generation */
4240 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
4244 LASSERT(fid_is_sane(ll_inode2fid(inode)));
4245 LASSERT(S_ISREG(inode->i_mode));
4247 /* take layout lock mutex to enqueue layout lock exclusively. */
4248 mutex_lock(&lli->lli_layout_mutex);
4250 rc = ll_layout_refresh_locked(inode);
/* report the post-refresh generation to the caller */
4254 *gen = ll_layout_version_get(lli);
4256 mutex_unlock(&lli->lli_layout_mutex);
4262 * This function send a restore request to the MDT
4264 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4266 struct hsm_user_request *hur;
4270 len = sizeof(struct hsm_user_request) +
4271 sizeof(struct hsm_user_item);
4272 OBD_ALLOC(hur, len);
4276 hur->hur_request.hr_action = HUA_RESTORE;
4277 hur->hur_request.hr_archive_id = 0;
4278 hur->hur_request.hr_flags = 0;
4279 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4280 sizeof(hur->hur_user_item[0].hui_fid));
4281 hur->hur_user_item[0].hui_extent.offset = offset;
4282 hur->hur_user_item[0].hui_extent.length = length;
4283 hur->hur_request.hr_itemcount = 1;
4284 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,