4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2016, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
48 #include <lustre/ll_fiemap.h>
50 #include <lustre_ioctl.h>
51 #include <lustre_swab.h>
53 #include "cl_object.h"
54 #include "llite_internal.h"
55 #include "vvp_internal.h"
58 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
60 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
63 static enum llioc_iter
64 ll_iocontrol_call(struct inode *inode, struct file *file,
65 unsigned int cmd, unsigned long arg, int *rcp);
/*
 * Allocate a per-open-file ll_file_data from ll_file_data_slab.
 * GFP_NOFS avoids recursing into the filesystem during memory reclaim.
 * NOTE(review): the allocation-failure check and return statement are
 * elided in this excerpt — confirm against the full source.
 */
67 static struct ll_file_data *ll_file_data_get(void)
69 struct ll_file_data *fd;
71 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
/* fresh descriptor starts with a clean write-error state */
75 fd->fd_write_failed = false;
/* Return a ll_file_data to its slab cache. */
80 static void ll_file_data_put(struct ll_file_data *fd)
83 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
87 * Packs all the attributes into @op_data for the CLOSE rpc.
89 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
90 struct obd_client_handle *och)
/* initialize a generic op_data for this inode first */
94 ll_prep_md_op_data(op_data, inode, NULL, NULL,
95 0, 0, LUSTRE_OPC_ANY, NULL);
/* copy the client's current view of the inode attributes so the MDT
 * records the final state at close time */
97 op_data->op_attr.ia_mode = inode->i_mode;
98 op_data->op_attr.ia_atime = inode->i_atime;
99 op_data->op_attr.ia_mtime = inode->i_mtime;
100 op_data->op_attr.ia_ctime = inode->i_ctime;
101 op_data->op_attr.ia_size = i_size_read(inode);
/* flag every attribute copied above as valid in the RPC */
102 op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
103 ATTR_MTIME | ATTR_MTIME_SET |
104 ATTR_CTIME | ATTR_CTIME_SET;
105 op_data->op_attr_blocks = inode->i_blocks;
106 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
/* tell the MDT which open handle this close is for */
107 op_data->op_handle = och->och_fh;
109 if (och->och_flags & FMODE_WRITE &&
110 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
111 /* For HSM: if inode data has been modified, pack it so that
112 * MDT can set data dirty flag in the archive. */
113 op_data->op_bias |= MDS_DATA_MODIFIED;
119 * Perform a close, possibly with a bias.
120 * The meaning of "data" depends on the value of "bias".
122 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
123 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
126 static int ll_close_inode_openhandle(struct inode *inode,
127 struct obd_client_handle *och,
128 enum mds_op_bias bias, void *data)
130 struct obd_export *md_exp = ll_i2mdexp(inode);
131 const struct ll_inode_info *lli = ll_i2info(inode);
132 struct md_op_data *op_data;
133 struct ptlrpc_request *req = NULL;
/* the MDC export may already be gone (e.g. during umount) */
137 if (class_exp2obd(md_exp) == NULL) {
138 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
139 ll_get_fsname(inode->i_sb, NULL, 0),
140 PFID(&lli->lli_fid));
144 OBD_ALLOC_PTR(op_data);
145 /* We leak openhandle and request here on error, but not much to be
146 * done in OOM case since app won't retry close on error either. */
148 GOTO(out, rc = -ENOMEM);
150 ll_prepare_close(inode, op_data, och);
/* pack bias-specific payload; \a data meaning depends on \a bias */
152 case MDS_CLOSE_LAYOUT_SWAP:
153 LASSERT(data != NULL);
154 op_data->op_bias |= MDS_CLOSE_LAYOUT_SWAP;
155 op_data->op_data_version = 0;
156 op_data->op_lease_handle = och->och_lease_handle;
/* data is the second inode whose layout is swapped with ours */
157 op_data->op_fid2 = *ll_inode2fid(data);
160 case MDS_HSM_RELEASE:
161 LASSERT(data != NULL);
162 op_data->op_bias |= MDS_HSM_RELEASE;
/* data is the data version the release was computed against */
163 op_data->op_data_version = *(__u64 *)data;
164 op_data->op_lease_handle = och->och_lease_handle;
165 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
169 LASSERT(data == NULL);
173 rc = md_close(md_exp, op_data, och->och_mod, &req);
174 if (rc != 0 && rc != -EINTR)
175 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
176 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
/* for biased closes, check whether the MDT actually executed the
 * intent (release/swap) or only performed a plain close */
179 op_data->op_bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)) {
180 struct mdt_body *body;
182 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
183 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
187 ll_finish_md_op_data(op_data);
/* drop replay data and poison the handle so reuse is detectable */
191 md_clear_open_replay_data(md_exp, och);
192 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
195 ptlrpc_req_finished(req); /* This is close request */
/*
 * Close the cached MDS open handle that matches @fmode (write, exec or
 * read), unless other local file descriptors still use it.
 */
199 int ll_md_real_close(struct inode *inode, fmode_t fmode)
201 struct ll_inode_info *lli = ll_i2info(inode);
202 struct obd_client_handle **och_p;
203 struct obd_client_handle *och;
/* select the open-handle slot and its use count by access mode */
208 if (fmode & FMODE_WRITE) {
209 och_p = &lli->lli_mds_write_och;
210 och_usecount = &lli->lli_open_fd_write_count;
211 } else if (fmode & FMODE_EXEC) {
212 och_p = &lli->lli_mds_exec_och;
213 och_usecount = &lli->lli_open_fd_exec_count;
215 LASSERT(fmode & FMODE_READ);
216 och_p = &lli->lli_mds_read_och;
217 och_usecount = &lli->lli_open_fd_read_count;
220 mutex_lock(&lli->lli_och_mutex);
221 if (*och_usecount > 0) {
222 /* There are still users of this handle, so skip
224 mutex_unlock(&lli->lli_och_mutex);
230 mutex_unlock(&lli->lli_och_mutex);
233 /* There might be a race and this handle may already
235 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/*
 * Per-file-descriptor close: release group lock, lease and private open
 * handle if present, drop this fd's use count, and only talk to the MDS
 * (ll_md_real_close()) when no cached OPEN DLM lock covers the file.
 */
241 static int ll_md_close(struct inode *inode, struct file *file)
243 union ldlm_policy_data policy = {
244 .l_inodebits = { MDS_INODELOCK_OPEN },
246 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
247 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
248 struct ll_inode_info *lli = ll_i2info(inode);
249 struct lustre_handle lockh;
250 enum ldlm_mode lockmode;
254 /* clear group lock, if present */
255 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
256 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
258 if (fd->fd_lease_och != NULL) {
261 /* Usually the lease is not released when the
262 * application crashed, we need to release here. */
263 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
264 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
265 PFID(&lli->lli_fid), rc, lease_broken);
267 fd->fd_lease_och = NULL;
/* fd_och holds an open handle owned by this fd (taken for a lease) */
270 if (fd->fd_och != NULL) {
271 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
276 /* Let's see if we have good enough OPEN lock on the file and if
277 we can skip talking to MDS */
278 mutex_lock(&lli->lli_och_mutex);
279 if (fd->fd_omode & FMODE_WRITE) {
281 LASSERT(lli->lli_open_fd_write_count);
282 lli->lli_open_fd_write_count--;
283 } else if (fd->fd_omode & FMODE_EXEC) {
285 LASSERT(lli->lli_open_fd_exec_count);
286 lli->lli_open_fd_exec_count--;
289 LASSERT(lli->lli_open_fd_read_count);
290 lli->lli_open_fd_read_count--;
292 mutex_unlock(&lli->lli_och_mutex);
/* LDLM_FL_TEST_LOCK: only probe for a granted OPEN ibits lock; if
 * none matches, do the real close RPC now */
294 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
295 LDLM_IBITS, &policy, lockmode, &lockh))
296 rc = ll_md_real_close(inode, fd->fd_omode);
299 LUSTRE_FPRIVATE(file) = NULL;
300 ll_file_data_put(fd);
305 /* While this returns an error code, fput() the caller does not, so we need
306 * to make every effort to clean up all of our state here. Also, applications
307 * rarely check close errors and even if an error is returned they will not
308 * re-try the close call.
310 int ll_file_release(struct inode *inode, struct file *file)
312 struct ll_file_data *fd;
313 struct ll_sb_info *sbi = ll_i2sbi(inode);
314 struct ll_inode_info *lli = ll_i2info(inode);
318 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
319 PFID(ll_inode2fid(inode)), inode);
/* don't count releases of the root dentry in the stats */
321 if (inode->i_sb->s_root != file_dentry(file))
322 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
323 fd = LUSTRE_FPRIVATE(file);
326 /* The last ref on @file, maybe not the owner pid of statahead,
327 * because parent and child process can share the same file handle. */
328 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
329 ll_deauthorize_statahead(inode, fd);
/* root of the mount: nothing was opened on the MDS, just free fd */
331 if (inode->i_sb->s_root == file_dentry(file)) {
332 LUSTRE_FPRIVATE(file) = NULL;
333 ll_file_data_put(fd);
/* for regular files, fold async write errors into lli_async_rc */
337 if (!S_ISDIR(inode->i_mode)) {
338 if (lli->lli_clob != NULL)
339 lov_read_and_clear_async_rc(lli->lli_clob);
340 lli->lli_async_rc = 0;
343 rc = ll_md_close(inode, file);
345 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
346 libcfs_debug_dumplog();
/*
 * Send the intent-based open RPC to the MDS and instantiate the inode
 * from the reply.  @lmm/@lmmsize optionally carry striping data for the
 * request.  Returns 0 or a negative errno.
 */
351 static int ll_intent_file_open(struct file *file, void *lmm, int lmmsize,
352 struct lookup_intent *itp)
354 struct dentry *de = file_dentry(file);
355 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
356 struct dentry *parent = de->d_parent;
357 const char *name = NULL;
359 struct md_op_data *op_data;
360 struct ptlrpc_request *req = NULL;
364 LASSERT(parent != NULL);
365 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
367 /* if server supports open-by-fid, or file name is invalid, don't pack
368 * name in open request */
369 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
370 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
371 name = de->d_name.name;
372 len = de->d_name.len;
375 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
376 name, len, 0, LUSTRE_OPC_ANY, NULL);
378 RETURN(PTR_ERR(op_data));
379 op_data->op_data = lmm;
380 op_data->op_data_size = lmmsize;
382 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
383 &ll_md_blocking_ast, 0);
384 ll_finish_md_op_data(op_data);
386 /* reason for keeping our own exit path - don't flood the log
387 * with messages with -ESTALE errors.
389 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
390 it_open_error(DISP_OPEN_OPEN, itp))
392 ll_release_openhandle(de, itp);
396 if (it_disposition(itp, DISP_LOOKUP_NEG))
397 GOTO(out, rc = -ENOENT);
399 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
400 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
401 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* update/create the inode from the reply, then attach lock data */
405 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
406 if (!rc && itp->it_lock_mode)
407 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
410 ptlrpc_req_finished(req);
411 ll_intent_drop_lock(itp);
413 /* We did open by fid, but by the time we got to the server,
414 * the object disappeared. If this is a create, we cannot really
415 * tell the userspace that the file it was trying to create
416 * does not exist. Instead let's return -ESTALE, and the VFS will
417 * retry the create with LOOKUP_REVAL that we are going to catch
418 * in ll_revalidate_dentry() and use lookup then.
420 if (rc == -ENOENT && itp->it_op & IT_CREAT)
/*
 * Fill an obd_client_handle from the MDT reply carried by @it and
 * register it for open replay.
 */
426 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
427 struct obd_client_handle *och)
429 struct mdt_body *body;
431 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
/* copy the server-assigned open handle and fid */
432 och->och_fh = body->mbo_handle;
433 och->och_fid = body->mbo_fid1;
434 och->och_lease_handle.cookie = it->it_lock_handle;
435 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
436 och->och_flags = it->it_flags;
438 return md_set_open_replay_data(md_exp, och, it);
/*
 * Finish the client-side part of an open: optionally fill @och from the
 * intent reply, then install @fd as the file's private data and record
 * the open mode for later close.
 */
441 static int ll_local_open(struct file *file, struct lookup_intent *it,
442 struct ll_file_data *fd, struct obd_client_handle *och)
444 struct inode *inode = file_inode(file);
447 LASSERT(!LUSTRE_FPRIVATE(file));
454 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
459 LUSTRE_FPRIVATE(file) = fd;
460 ll_readahead_init(inode, &fd->fd_ras);
/* remember only the access-mode bits for ll_md_close() */
461 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
463 /* ll_cl_context initialize */
464 rwlock_init(&fd->fd_lock);
465 INIT_LIST_HEAD(&fd->fd_lccs);
470 /* Open a file, and (for the very first open) create objects on the OSTs at
471 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
472 * creation or open until ll_lov_setstripe() ioctl is called.
474 * If we already have the stripe MD locally then we don't request it in
475 * md_open(), by passing a lmm_size = 0.
477 * It is up to the application to ensure no other processes open this file
478 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
479 * used. We might be able to avoid races of that sort by getting lli_open_sem
480 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
481 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
483 int ll_file_open(struct inode *inode, struct file *file)
485 struct ll_inode_info *lli = ll_i2info(inode);
486 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
487 .it_flags = file->f_flags };
488 struct obd_client_handle **och_p = NULL;
489 __u64 *och_usecount = NULL;
490 struct ll_file_data *fd;
494 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
495 PFID(ll_inode2fid(inode)), inode, file->f_flags);
/* an intent may have been attached by the lookup path */
497 it = file->private_data; /* XXX: compat macro */
498 file->private_data = NULL; /* prevent ll_local_open assertion */
500 fd = ll_file_data_get();
502 GOTO(out_openerr, rc = -ENOMEM);
505 if (S_ISDIR(inode->i_mode))
506 ll_authorize_statahead(inode, fd);
/* root of the mount needs no MDS open; just attach fd and return */
508 if (inode->i_sb->s_root == file_dentry(file)) {
509 LUSTRE_FPRIVATE(file) = fd;
/* no usable intent: build our own IT_OPEN intent from f_flags */
513 if (!it || !it->it_disposition) {
514 /* Convert f_flags into access mode. We cannot use file->f_mode,
515 * because everything but O_ACCMODE mask was stripped from
517 if ((oit.it_flags + 1) & O_ACCMODE)
519 if (file->f_flags & O_TRUNC)
520 oit.it_flags |= FMODE_WRITE;
522 /* kernel only calls f_op->open in dentry_open. filp_open calls
523 * dentry_open after call to open_namei that checks permissions.
524 * Only nfsd_open calls dentry_open directly without checking
525 * permissions and because of that this code below is safe. */
526 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
527 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
529 /* We do not want O_EXCL here, presumably we opened the file
530 * already? XXX - NFS implications? */
531 oit.it_flags &= ~O_EXCL;
533 /* bug20584, if "it_flags" contains O_CREAT, the file will be
534 * created if necessary, then "IT_CREAT" should be set to keep
535 * consistent with it */
536 if (oit.it_flags & O_CREAT)
537 oit.it_op |= IT_CREAT;
543 /* Let's see if we have file open on MDS already. */
544 if (it->it_flags & FMODE_WRITE) {
545 och_p = &lli->lli_mds_write_och;
546 och_usecount = &lli->lli_open_fd_write_count;
547 } else if (it->it_flags & FMODE_EXEC) {
548 och_p = &lli->lli_mds_exec_och;
549 och_usecount = &lli->lli_open_fd_exec_count;
551 och_p = &lli->lli_mds_read_och;
552 och_usecount = &lli->lli_open_fd_read_count;
555 mutex_lock(&lli->lli_och_mutex);
556 if (*och_p) { /* Open handle is present */
557 if (it_disposition(it, DISP_OPEN_OPEN)) {
558 /* Well, there's extra open request that we do not need,
559 let's close it somehow. This will decref request. */
560 rc = it_open_error(DISP_OPEN_OPEN, it);
562 mutex_unlock(&lli->lli_och_mutex);
563 GOTO(out_openerr, rc);
566 ll_release_openhandle(file_dentry(file), it);
/* reuse the cached handle: och == NULL keeps its refcount logic in
 * ll_local_open() */
570 rc = ll_local_open(file, it, fd, NULL);
573 mutex_unlock(&lli->lli_och_mutex);
574 GOTO(out_openerr, rc);
577 LASSERT(*och_usecount == 0);
578 if (!it->it_disposition) {
579 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
580 /* We cannot just request lock handle now, new ELC code
581 means that one of other OPEN locks for this file
582 could be cancelled, and since blocking ast handler
583 would attempt to grab och_mutex as well, that would
584 result in a deadlock */
585 mutex_unlock(&lli->lli_och_mutex);
587 * Normally called under two situations:
589 * 2. A race/condition on MDS resulting in no open
590 * handle to be returned from LOOKUP|OPEN request,
591 * for example if the target entry was a symlink.
593 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
594 * marked by a bit set in ll_iget_for_nfs. Clear the
595 * bit so that it's not confusing later callers.
597 * NB; when ldd is NULL, it must have come via normal
598 * lookup path only, since ll_iget_for_nfs always calls
601 if (ldd && ldd->lld_nfs_dentry) {
602 ldd->lld_nfs_dentry = 0;
603 it->it_flags |= MDS_OPEN_LOCK;
607 * Always specify MDS_OPEN_BY_FID because we don't want
608 * to get file with different fid.
610 it->it_flags |= MDS_OPEN_BY_FID;
611 rc = ll_intent_file_open(file, NULL, 0, it);
613 GOTO(out_openerr, rc);
617 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
619 GOTO(out_och_free, rc = -ENOMEM);
623 /* md_intent_lock() didn't get a request ref if there was an
624 * open error, so don't do cleanup on the request here
626 /* XXX (green): Should not we bail out on any error here, not
627 * just open error? */
628 rc = it_open_error(DISP_OPEN_OPEN, it);
630 GOTO(out_och_free, rc);
632 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
633 "inode %p: disposition %x, status %d\n", inode,
634 it_disposition(it, ~0), it->it_status);
636 rc = ll_local_open(file, it, fd, *och_p);
638 GOTO(out_och_free, rc);
640 mutex_unlock(&lli->lli_och_mutex);
643 /* Must do this outside lli_och_mutex lock to prevent deadlock where
644 different kind of OPEN lock for this same inode gets cancelled
645 by ldlm_cancel_lru */
646 if (!S_ISREG(inode->i_mode))
647 GOTO(out_och_free, rc);
649 cl_lov_delay_create_clear(&file->f_flags);
650 GOTO(out_och_free, rc);
/* error path: undo handle allocation and statahead authorization */
654 if (och_p && *och_p) {
655 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
656 *och_p = NULL; /* OBD_FREE writes some magic there */
659 mutex_unlock(&lli->lli_och_mutex);
662 if (lli->lli_opendir_key == fd)
663 ll_deauthorize_statahead(inode, fd);
665 ll_file_data_put(fd);
667 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* drop the extra request reference held for DISP_ENQ_OPEN_REF */
670 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
671 ptlrpc_req_finished(it->it_request);
672 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for lease locks: on conflict, cancel the lease lock
 * asynchronously.  Unlike ll_md_blocking_ast() it does not manage any
 * open handle.
 */
678 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
679 struct ldlm_lock_desc *desc, void *data, int flag)
682 struct lustre_handle lockh;
686 case LDLM_CB_BLOCKING:
687 ldlm_lock2handle(lock, &lockh);
688 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
690 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
694 case LDLM_CB_CANCELING:
702 * When setting a lease on a file, we take ownership of the lli_mds_*_och
703 * and save it as fd->fd_och so as to force client to reopen the file even
704 * if it has an open lock in cache already.
706 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
707 struct lustre_handle *old_handle)
709 struct ll_inode_info *lli = ll_i2info(inode);
710 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
711 struct obd_client_handle **och_p;
716 /* Get the openhandle of the file */
717 mutex_lock(&lli->lli_och_mutex);
/* only one lease per file descriptor is allowed */
718 if (fd->fd_lease_och != NULL)
719 GOTO(out_unlock, rc = -EBUSY);
721 if (fd->fd_och == NULL) {
722 if (file->f_mode & FMODE_WRITE) {
723 LASSERT(lli->lli_mds_write_och != NULL);
724 och_p = &lli->lli_mds_write_och;
725 och_usecount = &lli->lli_open_fd_write_count;
727 LASSERT(lli->lli_mds_read_och != NULL);
728 och_p = &lli->lli_mds_read_och;
729 och_usecount = &lli->lli_open_fd_read_count;
/* the shared handle is still used by other fds: can't take it */
732 if (*och_usecount > 1)
733 GOTO(out_unlock, rc = -EBUSY);
/* report the existing handle so the MDT knows the same owner */
740 *old_handle = fd->fd_och->och_fh;
744 mutex_unlock(&lli->lli_och_mutex);
749 * Release ownership on lli_mds_*_och when putting back a file lease.
751 static int ll_lease_och_release(struct inode *inode, struct file *file)
753 struct ll_inode_info *lli = ll_i2info(inode);
754 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
755 struct obd_client_handle **och_p;
756 struct obd_client_handle *old_och = NULL;
761 mutex_lock(&lli->lli_och_mutex);
/* choose the per-inode handle slot matching this file's mode */
762 if (file->f_mode & FMODE_WRITE) {
763 och_p = &lli->lli_mds_write_och;
764 och_usecount = &lli->lli_open_fd_write_count;
766 och_p = &lli->lli_mds_read_och;
767 och_usecount = &lli->lli_open_fd_read_count;
770 /* The file may have been open by another process (broken lease) so
771 * *och_p is not NULL. In this case we should simply increase usecount
774 if (*och_p != NULL) {
775 old_och = fd->fd_och;
782 mutex_unlock(&lli->lli_och_mutex);
/* close the superseded handle outside the mutex */
785 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
791 * Acquire a lease and open the file.
793 static struct obd_client_handle *
794 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
797 struct lookup_intent it = { .it_op = IT_OPEN };
798 struct ll_sb_info *sbi = ll_i2sbi(inode);
799 struct md_op_data *op_data;
800 struct ptlrpc_request *req = NULL;
801 struct lustre_handle old_handle = { 0 };
802 struct obd_client_handle *och = NULL;
/* leases exist only for pure read or pure write mode */
807 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
808 RETURN(ERR_PTR(-EINVAL));
/* the requested lease mode must be allowed by the file's open mode */
811 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
812 RETURN(ERR_PTR(-EPERM));
814 rc = ll_lease_och_acquire(inode, file, &old_handle);
821 RETURN(ERR_PTR(-ENOMEM));
823 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
824 LUSTRE_OPC_ANY, NULL);
826 GOTO(out, rc = PTR_ERR(op_data));
828 /* To tell the MDT this openhandle is from the same owner */
829 op_data->op_handle = old_handle;
831 it.it_flags = fmode | open_flags;
832 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
833 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
834 &ll_md_blocking_lease_ast,
835 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
836 * it can be cancelled which may mislead applications that the lease is
838 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
839 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
840 * doesn't deal with openhandle, so normal openhandle will be leaked. */
841 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
842 ll_finish_md_op_data(op_data);
843 ptlrpc_req_finished(req);
845 GOTO(out_release_it, rc);
847 if (it_disposition(&it, DISP_LOOKUP_NEG))
848 GOTO(out_release_it, rc = -ENOENT);
850 rc = it_open_error(DISP_OPEN_OPEN, &it);
852 GOTO(out_release_it, rc);
854 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
855 ll_och_fill(sbi->ll_md_exp, &it, och);
857 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
858 GOTO(out_close, rc = -EOPNOTSUPP);
860 /* already get lease, handle lease lock */
861 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
862 if (it.it_lock_mode == 0 ||
863 it.it_lock_bits != MDS_INODELOCK_OPEN) {
864 /* open lock must return for lease */
865 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
866 PFID(ll_inode2fid(inode)), it.it_lock_mode,
868 GOTO(out_close, rc = -EPROTO);
871 ll_intent_release(&it);
/* error cleanup: drop the open lock and close the handle we got */
875 /* Cancel open lock */
876 if (it.it_lock_mode != 0) {
877 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
880 och->och_lease_handle.cookie = 0ULL;
882 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
884 CERROR("%s: error closing file "DFID": %d\n",
885 ll_get_fsname(inode->i_sb, NULL, 0),
886 PFID(&ll_i2info(inode)->lli_fid), rc2);
887 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
889 ll_intent_release(&it);
897 * Check whether a layout swap can be done between two inodes.
899 * \param[in] inode1 First inode to check
900 * \param[in] inode2 Second inode to check
902 * \retval 0 on success, layout swap can be performed between both inodes
903 * \retval negative error code if requirements are not met
905 static int ll_check_swap_layouts_validity(struct inode *inode1,
906 struct inode *inode2)
/* both must be regular files */
908 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
/* caller needs write permission on both inodes */
911 if (inode_permission(inode1, MAY_WRITE) ||
912 inode_permission(inode2, MAY_WRITE))
/* both inodes must live on the same filesystem */
915 if (inode1->i_sb != inode2->i_sb)
/*
 * Close @och with a MDS_CLOSE_LAYOUT_SWAP bias so that the MDT swaps
 * the layouts of @inode and @inode2 atomically with the close.
 */
921 static int ll_swap_layouts_close(struct obd_client_handle *och,
922 struct inode *inode, struct inode *inode2)
924 const struct lu_fid *fid1 = ll_inode2fid(inode);
925 const struct lu_fid *fid2;
929 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
930 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
932 rc = ll_check_swap_layouts_validity(inode, inode2);
934 GOTO(out_free_och, rc);
936 /* We now know that inode2 is a lustre inode */
937 fid2 = ll_inode2fid(inode2);
/* swapping a file with itself makes no sense */
939 rc = lu_fid_cmp(fid1, fid2);
941 GOTO(out_free_och, rc = -EINVAL);
943 /* Close the file and swap layouts between inode & inode2.
944 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
945 * because we still need it to pack l_remote_handle to MDT. */
946 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
949 och = NULL; /* freed in ll_close_inode_openhandle() */
959 * Release lease and close the file.
960 * It will check if the lease has ever broken.
962 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
965 struct ldlm_lock *lock;
966 bool cancelled = true;
/* look at the lease lock to learn whether it was already cancelled
 * (i.e. the lease was broken by a conflicting access) */
970 lock = ldlm_handle2lock(&och->och_lease_handle);
972 lock_res_and_lock(lock);
973 cancelled = ldlm_is_cancel(lock);
974 unlock_res_and_lock(lock);
978 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
979 PFID(&ll_i2info(inode)->lli_fid), cancelled);
982 ldlm_cli_cancel(&och->och_lease_handle, 0);
984 if (lease_broken != NULL)
985 *lease_broken = cancelled;
987 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/*
 * Merge MDS-provided timestamps with OST-provided attributes (size,
 * blocks, times) into the VFS inode, under the inode size lock.
 */
991 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
993 struct ll_inode_info *lli = ll_i2info(inode);
994 struct cl_object *obj = lli->lli_clob;
995 struct cl_attr *attr = vvp_env_thread_attr(env);
1003 ll_inode_size_lock(inode);
1005 /* Merge timestamps the most recently obtained from MDS with
1006 * timestamps obtained from OSTs.
1008 * Do not overwrite atime of inode because it may be refreshed
1009 * by file_accessed() function. If the read was served by cache
1010 * data, there is no RPC to be sent so that atime may not be
1011 * transferred to OSTs at all. MDT only updates atime at close time
1012 * if it's at least 'mdd.*.atime_diff' older.
1013 * All in all, the atime in Lustre does not strictly comply with
1014 * POSIX. Solving this problem needs to send an RPC to MDT for each
1015 * read, this will hurt performance. */
1016 if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
1017 LTIME_S(inode->i_atime) = lli->lli_atime;
1018 lli->lli_update_atime = 0;
1020 LTIME_S(inode->i_mtime) = lli->lli_mtime;
1021 LTIME_S(inode->i_ctime) = lli->lli_ctime;
/* snapshot current inode times before merging with OST values */
1023 atime = LTIME_S(inode->i_atime);
1024 mtime = LTIME_S(inode->i_mtime);
1025 ctime = LTIME_S(inode->i_ctime);
1027 cl_object_attr_lock(obj);
1028 rc = cl_object_attr_get(env, obj, attr);
1029 cl_object_attr_unlock(obj);
1032 GOTO(out_size_unlock, rc);
/* keep the newest of each timestamp */
1034 if (atime < attr->cat_atime)
1035 atime = attr->cat_atime;
1037 if (ctime < attr->cat_ctime)
1038 ctime = attr->cat_ctime;
1040 if (mtime < attr->cat_mtime)
1041 mtime = attr->cat_mtime;
1043 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1044 PFID(&lli->lli_fid), attr->cat_size);
1046 i_size_write(inode, attr->cat_size);
1047 inode->i_blocks = attr->cat_blocks;
1049 LTIME_S(inode->i_atime) = atime;
1050 LTIME_S(inode->i_mtime) = mtime;
1051 LTIME_S(inode->i_ctime) = ctime;
1054 ll_inode_size_unlock(inode);
/*
 * Decide whether reads through @file should skip atime updates,
 * mirroring the checks in the kernel's file_accessed()/touch_atime().
 */
1059 static bool file_is_noatime(const struct file *file)
1061 const struct vfsmount *mnt = file->f_path.mnt;
1062 const struct inode *inode = file_inode((struct file *)file);
1064 /* Adapted from file_accessed() and touch_atime().*/
1065 if (file->f_flags & O_NOATIME)
1068 if (inode->i_flags & S_NOATIME)
1071 if (IS_NOATIME(inode))
1074 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1077 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1080 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/*
 * Initialize a cl_io from the file's open flags: nonblocking/append/sync
 * behavior, lock requirements and atime policy.
 */
1086 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
1088 struct inode *inode = file_inode((struct file *)file);
1090 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1092 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
/* O_SYNC/O_DIRECT writes must be flushed synchronously */
1093 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1094 file->f_flags & O_DIRECT ||
1097 io->ci_obj = ll_i2info(inode)->lli_clob;
1098 io->ci_lockreq = CILR_MAYBE;
1099 if (ll_file_nolock(file)) {
1100 io->ci_lockreq = CILR_NEVER;
1101 io->ci_no_srvlock = 1;
1102 } else if (file->f_flags & O_APPEND) {
/* append requires a mandatory lock to serialize EOF writes */
1103 io->ci_lockreq = CILR_MANDATORY;
1106 io->ci_noatime = file_is_noatime(file);
/*
 * Common driver for read/write/splice I/O: set up a cl_io, take the
 * per-file range lock where needed, run the CLIO loop (restarting on
 * layout change), and account the result in stats.
 */
1110 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1111 struct file *file, enum cl_io_type iot,
1112 loff_t *ppos, size_t count)
1114 struct vvp_io *vio = vvp_env_io(env);
1115 struct inode *inode = file_inode(file);
1116 struct ll_inode_info *lli = ll_i2info(inode);
1117 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1121 struct range_lock range;
1125 CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: %llu, count: %zu\n",
1126 file_dentry(file)->d_name.name, iot, *ppos, count);
1129 io = vvp_env_thread_io(env);
1130 ll_io_init(io, file, iot == CIT_WRITE);
1132 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1133 bool range_locked = false;
/* append may write anywhere up to EOF, so lock the whole range */
1135 if (file->f_flags & O_APPEND)
1136 range_lock_init(&range, 0, LUSTRE_EOF);
1138 range_lock_init(&range, *ppos, *ppos + count - 1);
1140 vio->vui_fd = LUSTRE_FPRIVATE(file);
1141 vio->vui_io_subtype = args->via_io_subtype;
1143 switch (vio->vui_io_subtype) {
1145 vio->vui_iter = args->u.normal.via_iter;
1146 vio->vui_iocb = args->u.normal.via_iocb;
1147 /* Direct IO reads must also take range lock,
1148 * or multiple reads will try to work on the same pages
1149 * See LU-6227 for details. */
1150 if (((iot == CIT_WRITE) ||
1151 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1152 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1153 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1155 rc = range_lock(&lli->lli_write_tree, &range);
1159 range_locked = true;
1163 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1164 vio->u.splice.vui_flags = args->u.splice.via_flags;
1167 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
/* publish the io in the cl context, run it, then remove */
1171 ll_cl_add(file, env, io, LCC_RW);
1172 rc = cl_io_loop(env, io);
1173 ll_cl_remove(file, env);
1176 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1178 range_unlock(&lli->lli_write_tree, &range);
1181 /* cl_io_rw_init() handled IO */
/* account partial progress so a restart resumes where we stopped */
1185 if (io->ci_nob > 0) {
1186 result += io->ci_nob;
1187 count -= io->ci_nob;
1188 *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1190 /* prepare IO restart */
1191 if (count > 0 && args->via_io_subtype == IO_NORMAL)
1192 args->u.normal.via_iter = vio->vui_iter;
1196 cl_io_fini(env, io);
/* restart the whole io when the layout changed underneath us */
1198 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1200 "%s: restart %s from %lld, count:%zu, result: %zd\n",
1201 file_dentry(file)->d_name.name,
1202 iot == CIT_READ ? "read" : "write",
1203 *ppos, count, result);
1207 if (iot == CIT_READ) {
1209 ll_stats_ops_tally(ll_i2sbi(inode),
1210 LPROC_LL_READ_BYTES, result);
1211 } else if (iot == CIT_WRITE) {
1213 ll_stats_ops_tally(ll_i2sbi(inode),
1214 LPROC_LL_WRITE_BYTES, result);
1215 fd->fd_write_failed = false;
1216 } else if (result == 0 && rc == 0) {
1219 fd->fd_write_failed = true;
1221 fd->fd_write_failed = false;
1222 } else if (rc != -ERESTARTSYS) {
1223 fd->fd_write_failed = true;
1227 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1229 return result > 0 ? result : rc;
1233 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1234 * especially for small I/O.
1236 * To serve a read request, CLIO has to create and initialize a cl_io and
1237 * then request DLM lock. This has turned out to have siginificant overhead
1238 * and affects the performance of small I/O dramatically.
1240 * It's not necessary to create a cl_io for each I/O. Under the help of read
1241 * ahead, most of the pages being read are already in memory cache and we can
1242 * read those pages directly because if the pages exist, the corresponding DLM
1243 * lock must exist so that page content must be valid.
1245 * In fast read implementation, the llite speculatively finds and reads pages
1246 * in memory cache. There are three scenarios for fast read:
1247 * - If the page exists and is uptodate, kernel VM will provide the data and
1248 * CLIO won't be intervened;
1249 * - If the page was brought into memory by read ahead, it will be exported
1250 * and read ahead parameters will be updated;
1251 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1252 * it will go back and invoke normal read, i.e., a cl_io will be created
1253 * and DLM lock will be requested.
1255 * POSIX compliance: posix standard states that read is intended to be atomic.
1256 * Lustre read implementation is in line with Linux kernel read implementation
1257 * and neither of them complies with POSIX standard in this matter. Fast read
1258 * doesn't make the situation worse on single node but it may interleave write
1259 * results from multiple nodes due to short read handling in ll_file_aio_read().
1261 * \param env - lu_env
1262 * \param iocb - kiocb from kernel
1263 * \param iter - user space buffers where the data will be copied
1265 * \retval - number of bytes have been read, or error code if error occurred.
/*
 * Fast read: speculatively serve the read straight from the page cache
 * via generic_file_read_iter(), skipping cl_io/DLM setup entirely.
 * Bails out when the fast_read feature is disabled for this superblock
 * or the file was opened O_DIRECT, so the caller falls back to the
 * normal cl_io path.  NOTE(review): extraction elides some lines here;
 * verify control flow against the complete source.
 */
1268 ll_do_fast_read(const struct lu_env *env, struct kiocb *iocb,
1269 struct iov_iter *iter)
1273 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1276 /* NB: we can't do direct IO for fast read because it will need a lock
1277 * to make IO engine happy. */
1278 if (iocb->ki_filp->f_flags & O_DIRECT)
/* Register/unregister the cl context around the generic page-cache read. */
1281 ll_cl_add(iocb->ki_filp, env, NULL, LCC_RW);
1282 result = generic_file_read_iter(iocb, iter);
1283 ll_cl_remove(iocb->ki_filp, env);
1285 /* If the first page is not in cache, generic_file_aio_read() will be
1286 * returned with -ENODATA.
1287 * See corresponding code in ll_readpage(). */
1288 if (result == -ENODATA)
/* Account bytes actually read in the per-superblock stats. */
1292 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1293 LPROC_LL_READ_BYTES, result);
1299 * Read from a file (through the page cache).
/*
 * read_iter entry point: try the lockless fast-read path first; if it
 * errored or consumed the whole iov we are done, otherwise fall back to
 * the normal cl_io path through ll_file_io_generic() with CIT_READ.
 */
1301 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1304 struct vvp_io_args *args;
1309 env = cl_env_get(&refcheck);
1311 return PTR_ERR(env);
1313 result = ll_do_fast_read(env, iocb, to);
1314 if (result < 0 || iov_iter_count(to) == 0)
1317 args = ll_env_args(env, IO_NORMAL);
1318 args->u.normal.via_iter = to;
1319 args->u.normal.via_iocb = iocb;
/* Normal path reads the remainder of the iov after a short fast read. */
1321 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1322 &iocb->ki_pos, iov_iter_count(to));
1325 else if (result == 0)
1329 cl_env_put(env, &refcheck);
1334 * Write to a file (through the page cache).
/*
 * write_iter entry point: package the iov_iter/kiocb into vvp_io_args
 * and run the generic cl_io write path (CIT_WRITE).  There is no fast
 * path for writes.
 */
1336 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1338 struct vvp_io_args *args;
1343 env = cl_env_get(&refcheck);
1345 return PTR_ERR(env);
1347 args = ll_env_args(env, IO_NORMAL);
1348 args->u.normal.via_iter = from;
1349 args->u.normal.via_iocb = iocb;
1351 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1352 &iocb->ki_pos, iov_iter_count(from));
1353 cl_env_put(env, &refcheck);
1357 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1359 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array and compute the total byte count, mirroring
 * the kernel's __generic_file_aio_write_nolock checks (negative length
 * or cumulative wrap => -EINVAL; inaccessible segment truncates count).
 */
1361 static int ll_file_get_iov_count(const struct iovec *iov,
1362 unsigned long *nr_segs, size_t *count)
1367 for (seg = 0; seg < *nr_segs; seg++) {
1368 const struct iovec *iv = &iov[seg];
1371 * If any segment has a negative length, or the cumulative
1372 * length ever wraps negative then return -EINVAL.
1375 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1377 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1382 cnt -= iv->iov_len; /* This segment is no good */
/*
 * Compat aio_read for kernels without read_iter: validate the iovec,
 * wrap it in an iov_iter (API varies by kernel, hence the ifdefs) and
 * delegate to ll_file_read_iter().
 */
1389 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1390 unsigned long nr_segs, loff_t pos)
1397 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1401 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1402 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1403 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1404 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1405 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1407 result = ll_file_read_iter(iocb, &to);
/*
 * Compat synchronous read(2) for kernels without read_iter: build a
 * single-segment iovec plus a sync kiocb at *ppos, run the aio path,
 * then write the advanced position back to *ppos.
 */
1412 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1416 struct iovec iov = { .iov_base = buf, .iov_len = count };
1417 struct kiocb *kiocb;
1422 env = cl_env_get(&refcheck);
1424 RETURN(PTR_ERR(env));
1426 kiocb = &ll_env_info(env)->lti_kiocb;
1427 init_sync_kiocb(kiocb, file);
1428 kiocb->ki_pos = *ppos;
/* Request-length field was renamed across kernel versions. */
1429 #ifdef HAVE_KIOCB_KI_LEFT
1430 kiocb->ki_left = count;
1431 #elif defined(HAVE_KI_NBYTES)
1432 kiocb->ki_nbytes = count;
1435 result = ll_file_aio_read(kiocb, &iov, 1, kiocb->ki_pos);
1436 *ppos = kiocb->ki_pos;
1438 cl_env_put(env, &refcheck);
1443 * Write to a file (through the page cache).
/*
 * Compat aio_write counterpart of ll_file_aio_read(): validate the
 * iovec, build a WRITE-direction iov_iter, delegate to
 * ll_file_write_iter().
 */
1446 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1447 unsigned long nr_segs, loff_t pos)
1449 struct iov_iter from;
1454 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1458 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1459 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1460 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1461 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1462 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1464 result = ll_file_write_iter(iocb, &from);
/*
 * Compat synchronous write(2): single-segment iovec + sync kiocb at
 * *ppos, run the aio write path, propagate the new position to *ppos.
 */
1469 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1470 size_t count, loff_t *ppos)
1473 struct iovec iov = { .iov_base = (void __user *)buf,
1475 struct kiocb *kiocb;
1480 env = cl_env_get(&refcheck);
1482 RETURN(PTR_ERR(env));
1484 kiocb = &ll_env_info(env)->lti_kiocb;
1485 init_sync_kiocb(kiocb, file);
1486 kiocb->ki_pos = *ppos;
/* Request-length field was renamed across kernel versions. */
1487 #ifdef HAVE_KIOCB_KI_LEFT
1488 kiocb->ki_left = count;
1489 #elif defined(HAVE_KI_NBYTES)
1490 kiocb->ki_nbytes = count;
1493 result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos);
1494 *ppos = kiocb->ki_pos;
1496 cl_env_put(env, &refcheck);
1499 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1502 * Send file content (through pagecache) somewhere with helper
/*
 * splice_read entry point: run a CIT_READ cl_io with IO_SPLICE args so
 * pages are pushed into the destination pipe.
 */
1504 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1505 struct pipe_inode_info *pipe, size_t count,
1509 struct vvp_io_args *args;
1514 env = cl_env_get(&refcheck);
1516 RETURN(PTR_ERR(env));
1518 args = ll_env_args(env, IO_SPLICE);
1519 args->u.splice.via_pipe = pipe;
1520 args->u.splice.via_flags = flags;
1522 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1523 cl_env_put(env, &refcheck);
/*
 * Set striping on a file by re-opening it with an intent that carries
 * the lov_user_md EA, under the inode size lock.  The transient open
 * handle is released immediately; the delay-create flag is cleared so
 * later opens see the new layout.
 */
1527 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1528 __u64 flags, struct lov_user_md *lum,
1531 struct lookup_intent oit = {
1533 .it_flags = flags | MDS_OPEN_BY_FID,
1538 ll_inode_size_lock(inode);
1539 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1541 GOTO(out_unlock, rc);
1543 ll_release_openhandle(file_dentry(file), &oit);
1546 ll_inode_size_unlock(inode);
1547 ll_intent_release(&oit);
1548 cl_lov_delay_create_clear(&file->f_flags);
/*
 * Fetch the LOV EA (striping info) for @filename from the MDS via
 * md_getattr_name(), validate the magic, and byte-swap it to host
 * endianness on big-endian clients before it is handed to userspace.
 * On success *lmmp points into @request's reply buffer (caller keeps
 * the request alive and eventually releases it).
 */
1553 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1554 struct lov_mds_md **lmmp, int *lmm_size,
1555 struct ptlrpc_request **request)
1557 struct ll_sb_info *sbi = ll_i2sbi(inode);
1558 struct mdt_body *body;
1559 struct lov_mds_md *lmm = NULL;
1560 struct ptlrpc_request *req = NULL;
1561 struct md_op_data *op_data;
1564 rc = ll_get_default_mdsize(sbi, &lmmsize);
1568 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1569 strlen(filename), lmmsize,
1570 LUSTRE_OPC_ANY, NULL);
1571 if (IS_ERR(op_data))
1572 RETURN(PTR_ERR(op_data));
1574 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1575 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1576 ll_finish_md_op_data(op_data);
1578 CDEBUG(D_INFO, "md_getattr_name failed "
1579 "on %s: rc %d\n", filename, rc);
1583 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1584 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1586 lmmsize = body->mbo_eadatasize;
/* No EA present => no striping info to return. */
1588 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1590 GOTO(out, rc = -ENODATA);
1593 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1594 LASSERT(lmm != NULL);
/* Only plain v1/v3 and composite layouts are understood here. */
1596 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
1597 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
1598 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
1599 GOTO(out, rc = -EPROTO);
1602 * This is coming from the MDS, so is probably in
1603 * little endian. We convert it to host endian before
1604 * passing it to userspace.
/* The branch below only runs on big-endian hosts. */
1606 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1609 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
1610 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1611 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1612 if (le32_to_cpu(lmm->lmm_pattern) &
1613 LOV_PATTERN_F_RELEASED)
1617 /* if function called for directory - we should
1618 * avoid swabbing non-existent lsm objects */
1619 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1620 lustre_swab_lov_user_md_v1(
1621 (struct lov_user_md_v1 *)lmm);
1622 if (S_ISREG(body->mbo_mode))
1623 lustre_swab_lov_user_md_objects(
1624 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1626 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1627 lustre_swab_lov_user_md_v3(
1628 (struct lov_user_md_v3 *)lmm);
1629 if (S_ISREG(body->mbo_mode))
1630 lustre_swab_lov_user_md_objects(
1631 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1633 } else if (lmm->lmm_magic ==
1634 cpu_to_le32(LOV_MAGIC_COMP_V1)) {
1635 lustre_swab_lov_comp_md_v1(
1636 (struct lov_comp_md_v1 *)lmm);
1642 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: admin-only (CFS_CAP_SYS_ADMIN) path that
 * copies a lov_user_md (+ one ost_data entry) from userspace and
 * applies it via ll_lov_setstripe_ea_info() with MDS_OPEN_HAS_OBJS.
 */
1647 static int ll_lov_setea(struct inode *inode, struct file *file,
1650 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1651 struct lov_user_md *lump;
1652 int lum_size = sizeof(struct lov_user_md) +
1653 sizeof(struct lov_user_ost_data);
1657 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1660 OBD_ALLOC_LARGE(lump, lum_size);
1664 if (copy_from_user(lump, (struct lov_user_md __user *)arg, lum_size))
1665 GOTO(out_lump, rc = -EFAULT);
1667 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1670 OBD_FREE_LARGE(lump, lum_size);
/*
 * LL_IOC_LOV_GETSTRIPE handler: copy the cl_object's striping info to
 * the userspace lov_user_md buffer via cl_object_getstripe().
 */
1674 static int ll_file_getstripe(struct inode *inode,
1675 struct lov_user_md __user *lum)
1682 env = cl_env_get(&refcheck);
1684 RETURN(PTR_ERR(env));
1686 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum);
1687 cl_env_put(env, &refcheck);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy and validate the userspace
 * lov_user_md (ll_copy_user_md allocates klum), then set the layout
 * with FMODE_WRITE intent flags.
 */
1691 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1694 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1695 struct lov_user_md *klum;
1697 __u64 flags = FMODE_WRITE;
1700 rc = ll_copy_user_md(lum, &klum);
1705 rc = ll_lov_setstripe_ea_info(inode, file, flags, klum, lum_size);
1707 OBD_FREE(klum, lum_size);
/*
 * LL_IOC_GROUP_LOCK: take a client-side group lock with gid @arg on
 * the file.  Rejects gid 0 and nolock files; detects a lock already
 * held on this fd, and re-checks under lli_lock after the (blocking)
 * cl_get_grouplock() in case another thread raced us to it.
 */
1712 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1714 struct ll_inode_info *lli = ll_i2info(inode);
1715 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1716 struct ll_grouplock grouplock;
1721 CWARN("group id for group lock must not be 0\n");
1725 if (ll_file_nolock(file))
1726 RETURN(-EOPNOTSUPP);
1728 spin_lock(&lli->lli_lock);
1729 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1730 CWARN("group lock already existed with gid %lu\n",
1731 fd->fd_grouplock.lg_gid);
1732 spin_unlock(&lli->lli_lock);
1735 LASSERT(fd->fd_grouplock.lg_lock == NULL);
1736 spin_unlock(&lli->lli_lock);
/* Acquire outside the spinlock; may block unless O_NONBLOCK. */
1738 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
1739 arg, (file->f_flags & O_NONBLOCK), &grouplock);
1743 spin_lock(&lli->lli_lock);
1744 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1745 spin_unlock(&lli->lli_lock);
1746 CERROR("another thread just won the race\n");
1747 cl_put_grouplock(&grouplock);
1751 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1752 fd->fd_grouplock = grouplock;
1753 spin_unlock(&lli->lli_lock);
1755 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK: release the group lock held on this fd,
 * verifying under lli_lock that a lock is held and that its gid
 * matches @arg before clearing fd state and dropping the lock.
 */
1759 static int ll_put_grouplock(struct inode *inode, struct file *file,
1762 struct ll_inode_info *lli = ll_i2info(inode);
1763 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1764 struct ll_grouplock grouplock;
1767 spin_lock(&lli->lli_lock);
1768 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1769 spin_unlock(&lli->lli_lock);
1770 CWARN("no group lock held\n");
1774 LASSERT(fd->fd_grouplock.lg_lock != NULL);
1776 if (fd->fd_grouplock.lg_gid != arg) {
1777 CWARN("group lock %lu doesn't match current id %lu\n",
1778 arg, fd->fd_grouplock.lg_gid);
1779 spin_unlock(&lli->lli_lock);
/* Copy out and clear fd state before the (possibly blocking) put. */
1783 grouplock = fd->fd_grouplock;
1784 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1785 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1786 spin_unlock(&lli->lli_lock);
1788 cl_put_grouplock(&grouplock);
1789 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1794 * Close inode open handle
1796 * \param dentry [in] dentry which contains the inode
1797 * \param it [in,out] intent which contains open info and result
1800 * \retval <0 failure
1802 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1804 struct inode *inode = dentry->d_inode;
1805 struct obd_client_handle *och;
1811 /* Root ? Do nothing. */
1812 if (dentry->d_inode->i_sb->s_root == dentry)
1815 /* No open handle to close? Move away */
1816 if (!it_disposition(it, DISP_OPEN_OPEN))
1819 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
/* Build a transient och from the intent's open reply and close it. */
1821 OBD_ALLOC(och, sizeof(*och));
1823 GOTO(out, rc = -ENOMEM);
1825 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1827 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
1829 /* this one is in place of ll_file_open */
1830 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1831 ptlrpc_req_finished(it->it_request);
1832 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1838 * Get size for inode for which FIEMAP mapping is requested.
1839 * Make the FIEMAP get_info call and returns the result.
1840 * \param fiemap kernel buffer to hold extens
1841 * \param num_bytes kernel buffer size
1843 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
1849 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
1852 /* Checks for fiemap flags */
1853 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* Report back which flags are unsupported, per fiemap convention. */
1854 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1858 /* Check for FIEMAP_FLAG_SYNC */
1859 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1860 rc = filemap_fdatawrite(inode->i_mapping);
1865 env = cl_env_get(&refcheck);
1867 RETURN(PTR_ERR(env));
/* A zero cached size may just mean we never glimpsed; refresh it. */
1869 if (i_size_read(inode) == 0) {
1870 rc = ll_glimpse_size(inode);
1875 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1876 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
1877 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
1879 /* If filesize is 0, then there would be no objects for mapping */
1880 if (fmkey.lfik_oa.o_size == 0) {
1881 fiemap->fm_mapped_extents = 0;
1885 fmkey.lfik_fiemap = *fiemap;
1887 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
1888 &fmkey, fiemap, &num_bytes);
1890 cl_env_put(env, &refcheck);
/*
 * OBD_IOC_FID2PATH: resolve a FID to a path via the MDC.  Requires
 * CFS_CAP_DAC_READ_SEARCH unless the mount allows user fid2path.
 * Copies the getinfo_fid2path header in, appends the root FID (for
 * fileset-aware servers), issues the ioctl, and copies the result out.
 */
1894 int ll_fid2path(struct inode *inode, void __user *arg)
1896 struct obd_export *exp = ll_i2mdexp(inode);
1897 const struct getinfo_fid2path __user *gfin = arg;
1899 struct getinfo_fid2path *gfout;
1905 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1906 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1909 /* Only need to get the buflen */
1910 if (get_user(pathlen, &gfin->gf_pathlen))
1913 if (pathlen > PATH_MAX)
1916 outsize = sizeof(*gfout) + pathlen;
1917 OBD_ALLOC(gfout, outsize);
1921 if (copy_from_user(gfout, arg, sizeof(*gfout)))
1922 GOTO(gf_free, rc = -EFAULT);
1923 /* append root FID after gfout to let MDT know the root FID so that it
1924 * can lookup the correct path, this is mainly for fileset.
1925 * old server without fileset mount support will ignore this. */
1926 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
1928 /* Call mdc_iocontrol */
1929 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1933 if (copy_to_user(arg, gfout, outsize))
1937 OBD_FREE(gfout, outsize);
1942 * Read the data_version for inode.
1944 * This value is computed using stripe object version on OST.
1945 * Version is computed using server side locking.
1947 * @param flags if do sync on the OST side;
1949 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
1950 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
1952 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
1954 struct cl_object *obj = ll_i2info(inode)->lli_clob;
1962 /* If no file object initialized, we consider its version is 0. */
1968 env = cl_env_get(&refcheck);
1970 RETURN(PTR_ERR(env));
/* Run a CIT_DATA_VERSION cl_io to query the OST-side version. */
1972 io = vvp_env_thread_io(env);
1974 io->u.ci_data_version.dv_data_version = 0;
1975 io->u.ci_data_version.dv_flags = flags;
1978 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
1979 result = cl_io_loop(env, io);
1981 result = io->ci_result;
1983 *data_version = io->u.ci_data_version.dv_data_version;
1985 cl_io_fini(env, io);
/* Layout may have changed mid-IO; the caller-visible loop retries. */
1987 if (unlikely(io->ci_need_restart))
1990 cl_env_put(env, &refcheck);
1996 * Trigger a HSM release request for the provided inode.
1998 int ll_hsm_release(struct inode *inode)
2001 struct obd_client_handle *och = NULL;
2002 __u64 data_version = 0;
2007 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2008 ll_get_fsname(inode->i_sb, NULL, 0),
2009 PFID(&ll_i2info(inode)->lli_fid));
/* Take a write lease so nobody modifies the file during release. */
2011 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2013 GOTO(out, rc = PTR_ERR(och));
2015 /* Grab latest data_version and [am]time values */
2016 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2020 env = cl_env_get(&refcheck);
2022 GOTO(out, rc = PTR_ERR(env));
2024 ll_merge_attr(env, inode);
2025 cl_env_put(env, &refcheck);
2027 /* Release the file.
2028 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2029 * we still need it to pack l_remote_handle to MDT. */
2030 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2036 if (och != NULL && !IS_ERR(och)) /* close the file */
2037 ll_lease_close(och, inode, NULL);
/* Scratch state for ll_swap_layouts(); holds the two inodes plus
 * per-file data-version checks (fields partially elided in this view). */
2042 struct ll_swap_stack {
2045 struct inode *inode1;
2046 struct inode *inode2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS core: swap the layouts of two files on the
 * MDT.  Orders the pair by FID to serialize concurrent swaps, takes
 * group locks when a gid is supplied (to flush dirty cache), verifies
 * requested data versions, then sends the swap through obd_iocontrol
 * with the mdc_swap_layouts args carried in md_op_data.
 */
2051 static int ll_swap_layouts(struct file *file1, struct file *file2,
2052 struct lustre_swap_layouts *lsl)
2054 struct mdc_swap_layouts msl;
2055 struct md_op_data *op_data;
2058 struct ll_swap_stack *llss = NULL;
2061 OBD_ALLOC_PTR(llss);
2065 llss->inode1 = file_inode(file1);
2066 llss->inode2 = file_inode(file2);
2068 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2072 /* we use 2 bool because it is easier to swap than 2 bits */
2073 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2074 llss->check_dv1 = true;
2076 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2077 llss->check_dv2 = true;
2079 /* we cannot use lsl->sl_dvX directly because we may swap them */
2080 llss->dv1 = lsl->sl_dv1;
2081 llss->dv2 = lsl->sl_dv2;
2083 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2084 if (rc == 0) /* same file, done! */
2087 if (rc < 0) { /* sequentialize it */
2088 swap(llss->inode1, llss->inode2);
2090 swap(llss->dv1, llss->dv2);
2091 swap(llss->check_dv1, llss->check_dv2);
2095 if (gid != 0) { /* application asks to flush dirty cache */
2096 rc = ll_get_grouplock(llss->inode1, file1, gid);
2100 rc = ll_get_grouplock(llss->inode2, file2, gid);
2102 ll_put_grouplock(llss->inode1, file1, gid);
2107 /* ultimate check, before swapping the layouts we check if
2108 * dataversion has changed (if requested) */
2109 if (llss->check_dv1) {
2110 rc = ll_data_version(llss->inode1, &dv, 0);
2113 if (dv != llss->dv1)
2114 GOTO(putgl, rc = -EAGAIN);
2117 if (llss->check_dv2) {
2118 rc = ll_data_version(llss->inode2, &dv, 0);
2121 if (dv != llss->dv2)
2122 GOTO(putgl, rc = -EAGAIN);
2125 /* struct md_op_data is used to send the swap args to the mdt
2126 * only flags is missing, so we use struct mdc_swap_layouts
2127 * through the md_op_data->op_data */
2128 /* flags from user space have to be converted before they are send to
2129 * server, no flag is sent today, they are only used on the client */
2132 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2133 0, LUSTRE_OPC_ANY, &msl);
2134 if (IS_ERR(op_data))
2135 GOTO(free, rc = PTR_ERR(op_data));
2137 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2138 sizeof(*op_data), op_data, NULL);
2139 ll_finish_md_op_data(op_data);
/* Drop group locks in reverse acquisition order. */
2146 ll_put_grouplock(llss->inode2, file2, gid);
2147 ll_put_grouplock(llss->inode1, file1, gid);
/*
 * LL_IOC_HSM_STATE_SET core: validate the set/clear masks and archive
 * id (non-root callers are limited to HSM_USER_MASK bits), then send
 * the request to the MDT via obd_iocontrol.
 */
2157 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2159 struct md_op_data *op_data;
2163 /* Detect out-of range masks */
2164 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2167 /* Non-root users are forbidden to set or clear flags which are
2168 * NOT defined in HSM_USER_MASK. */
2169 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2170 !cfs_capable(CFS_CAP_SYS_ADMIN))
2173 /* Detect out-of range archive id */
2174 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2175 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2178 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2179 LUSTRE_OPC_ANY, hss);
2180 if (IS_ERR(op_data))
2181 RETURN(PTR_ERR(op_data));
2183 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2184 sizeof(*op_data), op_data, NULL);
2186 ll_finish_md_op_data(op_data);
/*
 * LL_IOC_HSM_IMPORT core: register an already-archived file.  Marks it
 * ARCHIVED|EXISTS|RELEASED in the given archive, then restores the
 * user-supplied mode/owner/size/times with a forced setattr (the file
 * body lives only in the archive at this point).
 */
2191 static int ll_hsm_import(struct inode *inode, struct file *file,
2192 struct hsm_user_import *hui)
2194 struct hsm_state_set *hss = NULL;
2195 struct iattr *attr = NULL;
2199 if (!S_ISREG(inode->i_mode))
2205 GOTO(out, rc = -ENOMEM);
2207 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2208 hss->hss_archive_id = hui->hui_archive_id;
2209 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2210 rc = ll_hsm_state_set(inode, hss);
2214 OBD_ALLOC_PTR(attr);
2216 GOTO(out, rc = -ENOMEM);
/* Rebuild attributes from the import request; force S_IFREG. */
2218 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2219 attr->ia_mode |= S_IFREG;
2220 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2221 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2222 attr->ia_size = hui->hui_size;
2223 attr->ia_mtime.tv_sec = hui->hui_mtime;
2224 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2225 attr->ia_atime.tv_sec = hui->hui_atime;
2226 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2228 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2229 ATTR_UID | ATTR_GID |
2230 ATTR_MTIME | ATTR_MTIME_SET |
2231 ATTR_ATIME | ATTR_ATIME_SET;
2235 rc = ll_setattr_raw(file_dentry(file), attr, true);
2239 inode_unlock(inode);
/* Map an fmode_t to the LL_LEASE_{RD,WR}LCK bits reported to userspace. */
2251 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2253 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2254 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * LL_IOC_FUTIMES_3 core: set atime/mtime/ctime (including ctime, which
 * plain utimes cannot touch) from the ll_futimes_3 payload.  Requires
 * CAP_SYS_ADMIN and a regular file; applied via ll_setattr_raw() under
 * the inode lock.
 */
2257 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2259 struct inode *inode = file_inode(file);
2261 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2262 ATTR_MTIME | ATTR_MTIME_SET |
2263 ATTR_CTIME | ATTR_CTIME_SET,
2265 .tv_sec = lfu->lfu_atime_sec,
2266 .tv_nsec = lfu->lfu_atime_nsec,
2269 .tv_sec = lfu->lfu_mtime_sec,
2270 .tv_nsec = lfu->lfu_mtime_nsec,
2273 .tv_sec = lfu->lfu_ctime_sec,
2274 .tv_nsec = lfu->lfu_ctime_nsec,
2280 if (!capable(CAP_SYS_ADMIN))
2283 if (!S_ISREG(inode->i_mode))
2287 rc = ll_setattr_raw(file_dentry(file), &ia, false);
2288 inode_unlock(inode);
2294 * Give file access advices
2296 * The ladvise interface is similar to Linux fadvise() system call, except it
2297 * forwards the advices directly from Lustre client to server. The server side
2298 * codes will apply appropriate read-ahead and caching techniques for the
2299 * corresponding files.
2301 * A typical workload for ladvise is e.g. a bunch of different clients are
2302 * doing small random reads of a file, so prefetching pages into OSS cache
2303 * with big linear reads before the random IO is a net benefit. Fetching
2304 * all that data into each client cache with fadvise() may not be, due to
2305 * much more data being sent to the client.
2307 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2308 struct llapi_lu_ladvise *ladvise)
2312 struct cl_ladvise_io *lio;
2317 env = cl_env_get(&refcheck);
2319 RETURN(PTR_ERR(env));
2321 io = vvp_env_thread_io(env);
2322 io->ci_obj = ll_i2info(inode)->lli_clob;
2324 /* initialize parameters for ladvise */
2325 lio = &io->u.ci_ladvise;
2326 lio->li_start = ladvise->lla_start;
2327 lio->li_end = ladvise->lla_end;
2328 lio->li_fid = ll_inode2fid(inode);
2329 lio->li_advice = ladvise->lla_advice;
2330 lio->li_flags = flags;
/* Deliver one advice to the server via a CIT_LADVISE cl_io. */
2332 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2333 rc = cl_io_loop(env, io);
2337 cl_io_fini(env, io);
2338 cl_env_put(env, &refcheck);
/*
 * Regular-file unlocked_ioctl dispatcher.  Handles llite-private flag
 * manipulation, LOV stripe get/set/swap, group locks, FID/path lookup,
 * data-version queries, the HSM family, leases, futimes and ladvise;
 * anything unrecognized is first offered to registered iocontrol
 * callbacks, then forwarded to the data export.  tty-type ioctls are
 * rejected early.  NOTE(review): extraction elides many lines (RETURNs,
 * braces); verify case fallthroughs against the complete source.
 */
2343 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2345 struct inode *inode = file_inode(file);
2346 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2350 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2351 PFID(ll_inode2fid(inode)), inode, cmd);
2352 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2354 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2355 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2359 case LL_IOC_GETFLAGS:
2360 /* Get the current value of the file flags */
2361 return put_user(fd->fd_flags, (int __user *)arg);
2362 case LL_IOC_SETFLAGS:
2363 case LL_IOC_CLRFLAGS:
2364 /* Set or clear specific file flags */
2365 /* XXX This probably needs checks to ensure the flags are
2366 * not abused, and to handle any flag side effects.
2368 if (get_user(flags, (int __user *) arg))
2371 if (cmd == LL_IOC_SETFLAGS) {
2372 if ((flags & LL_FILE_IGNORE_LOCK) &&
2373 !(file->f_flags & O_DIRECT)) {
2374 CERROR("%s: unable to disable locking on "
2375 "non-O_DIRECT file\n", current->comm);
2379 fd->fd_flags |= flags;
2381 fd->fd_flags &= ~flags;
2384 case LL_IOC_LOV_SETSTRIPE:
2385 RETURN(ll_lov_setstripe(inode, file, arg));
2386 case LL_IOC_LOV_SETEA:
2387 RETURN(ll_lov_setea(inode, file, arg));
2388 case LL_IOC_LOV_SWAP_LAYOUTS: {
2390 struct lustre_swap_layouts lsl;
2392 if (copy_from_user(&lsl, (char __user *)arg,
2393 sizeof(struct lustre_swap_layouts)))
/* Both files must be writable to swap layouts. */
2396 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
2399 file2 = fget(lsl.sl_fd);
2403 /* O_WRONLY or O_RDWR */
2404 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
2405 GOTO(out, rc = -EPERM);
2407 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
2408 struct inode *inode2;
2409 struct ll_inode_info *lli;
2410 struct obd_client_handle *och = NULL;
2412 if (lsl.sl_flags != SWAP_LAYOUTS_CLOSE)
2413 GOTO(out, rc = -EINVAL);
/* Swap-on-close requires a lease held on this fd; consume it. */
2415 lli = ll_i2info(inode);
2416 mutex_lock(&lli->lli_och_mutex);
2417 if (fd->fd_lease_och != NULL) {
2418 och = fd->fd_lease_och;
2419 fd->fd_lease_och = NULL;
2421 mutex_unlock(&lli->lli_och_mutex);
2423 GOTO(out, rc = -ENOLCK);
2424 inode2 = file_inode(file2);
2425 rc = ll_swap_layouts_close(och, inode, inode2);
2427 rc = ll_swap_layouts(file, file2, &lsl);
2433 case LL_IOC_LOV_GETSTRIPE:
2434 RETURN(ll_file_getstripe(inode,
2435 (struct lov_user_md __user *)arg));
2436 case FSFILT_IOC_GETFLAGS:
2437 case FSFILT_IOC_SETFLAGS:
2438 RETURN(ll_iocontrol(inode, file, cmd, arg));
2439 case FSFILT_IOC_GETVERSION_OLD:
2440 case FSFILT_IOC_GETVERSION:
2441 RETURN(put_user(inode->i_generation, (int __user *)arg));
2442 case LL_IOC_GROUP_LOCK:
2443 RETURN(ll_get_grouplock(inode, file, arg));
2444 case LL_IOC_GROUP_UNLOCK:
2445 RETURN(ll_put_grouplock(inode, file, arg));
2446 case IOC_OBD_STATFS:
2447 RETURN(ll_obd_statfs(inode, (void __user *)arg));
2449 /* We need to special case any other ioctls we want to handle,
2450 * to send them to the MDS/OST as appropriate and to properly
2451 * network encode the arg field.
2452 case FSFILT_IOC_SETVERSION_OLD:
2453 case FSFILT_IOC_SETVERSION:
2455 case LL_IOC_FLUSHCTX:
2456 RETURN(ll_flush_ctx(inode));
2457 case LL_IOC_PATH2FID: {
2458 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2459 sizeof(struct lu_fid)))
2464 case LL_IOC_GETPARENT:
2465 RETURN(ll_getparent(file, (struct getparent __user *)arg));
2467 case OBD_IOC_FID2PATH:
2468 RETURN(ll_fid2path(inode, (void __user *)arg));
2469 case LL_IOC_DATA_VERSION: {
2470 struct ioc_data_version idv;
2473 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
2476 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2477 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2480 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2486 case LL_IOC_GET_MDTIDX: {
2489 mdtidx = ll_get_mdt_idx(inode);
2493 if (put_user((int)mdtidx, (int __user *)arg))
2498 case OBD_IOC_GETDTNAME:
2499 case OBD_IOC_GETMDNAME:
2500 RETURN(ll_get_obd_name(inode, cmd, arg));
2501 case LL_IOC_HSM_STATE_GET: {
2502 struct md_op_data *op_data;
2503 struct hsm_user_state *hus;
2510 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2511 LUSTRE_OPC_ANY, hus);
2512 if (IS_ERR(op_data)) {
2514 RETURN(PTR_ERR(op_data));
2517 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2520 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2523 ll_finish_md_op_data(op_data);
2527 case LL_IOC_HSM_STATE_SET: {
2528 struct hsm_state_set *hss;
2535 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2540 rc = ll_hsm_state_set(inode, hss);
2545 case LL_IOC_HSM_ACTION: {
2546 struct md_op_data *op_data;
2547 struct hsm_current_action *hca;
2554 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2555 LUSTRE_OPC_ANY, hca);
2556 if (IS_ERR(op_data)) {
2558 RETURN(PTR_ERR(op_data));
2561 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2564 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2567 ll_finish_md_op_data(op_data);
2571 case LL_IOC_SET_LEASE: {
2572 struct ll_inode_info *lli = ll_i2info(inode);
2573 struct obd_client_handle *och = NULL;
/* Requested lease type must match the file's open mode. */
2578 case LL_LEASE_WRLCK:
2579 if (!(file->f_mode & FMODE_WRITE))
2581 fmode = FMODE_WRITE;
2583 case LL_LEASE_RDLCK:
2584 if (!(file->f_mode & FMODE_READ))
2588 case LL_LEASE_UNLCK:
2589 mutex_lock(&lli->lli_och_mutex);
2590 if (fd->fd_lease_och != NULL) {
2591 och = fd->fd_lease_och;
2592 fd->fd_lease_och = NULL;
2594 mutex_unlock(&lli->lli_och_mutex);
2599 fmode = och->och_flags;
2600 rc = ll_lease_close(och, inode, &lease_broken);
2604 rc = ll_lease_och_release(inode, file);
2611 RETURN(ll_lease_type_from_fmode(fmode));
2616 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2618 /* apply for lease */
2619 och = ll_lease_open(inode, file, fmode, 0);
2621 RETURN(PTR_ERR(och));
/* Publish the new lease handle unless one raced in before us. */
2624 mutex_lock(&lli->lli_och_mutex);
2625 if (fd->fd_lease_och == NULL) {
2626 fd->fd_lease_och = och;
2629 mutex_unlock(&lli->lli_och_mutex);
2631 /* impossible now that only excl is supported for now */
2632 ll_lease_close(och, inode, &lease_broken);
2637 case LL_IOC_GET_LEASE: {
2638 struct ll_inode_info *lli = ll_i2info(inode);
2639 struct ldlm_lock *lock = NULL;
2642 mutex_lock(&lli->lli_och_mutex);
2643 if (fd->fd_lease_och != NULL) {
2644 struct obd_client_handle *och = fd->fd_lease_och;
/* Report the lease only if its DLM lock is still uncancelled. */
2646 lock = ldlm_handle2lock(&och->och_lease_handle);
2648 lock_res_and_lock(lock);
2649 if (!ldlm_is_cancel(lock))
2650 fmode = och->och_flags;
2652 unlock_res_and_lock(lock);
2653 LDLM_LOCK_PUT(lock);
2656 mutex_unlock(&lli->lli_och_mutex);
2658 RETURN(ll_lease_type_from_fmode(fmode));
2660 case LL_IOC_HSM_IMPORT: {
2661 struct hsm_user_import *hui;
2667 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
2672 rc = ll_hsm_import(inode, file, hui);
2677 case LL_IOC_FUTIMES_3: {
2678 struct ll_futimes_3 lfu;
2680 if (copy_from_user(&lfu,
2681 (const struct ll_futimes_3 __user *)arg,
2685 RETURN(ll_file_futimes_3(file, &lfu));
2687 case LL_IOC_LADVISE: {
2688 struct llapi_ladvise_hdr *ladvise_hdr;
2691 int alloc_size = sizeof(*ladvise_hdr);
/* First read just the header to learn how many advices follow. */
2694 OBD_ALLOC_PTR(ladvise_hdr);
2695 if (ladvise_hdr == NULL)
2698 if (copy_from_user(ladvise_hdr,
2699 (const struct llapi_ladvise_hdr __user *)arg,
2701 GOTO(out_ladvise, rc = -EFAULT);
2703 if (ladvise_hdr->lah_magic != LADVISE_MAGIC ||
2704 ladvise_hdr->lah_count < 1)
2705 GOTO(out_ladvise, rc = -EINVAL);
2707 num_advise = ladvise_hdr->lah_count;
2708 if (num_advise >= LAH_COUNT_MAX)
2709 GOTO(out_ladvise, rc = -EFBIG);
/* Re-allocate sized for the full advice array and copy it all in. */
2711 OBD_FREE_PTR(ladvise_hdr);
2712 alloc_size = offsetof(typeof(*ladvise_hdr),
2713 lah_advise[num_advise]);
2714 OBD_ALLOC(ladvise_hdr, alloc_size);
2715 if (ladvise_hdr == NULL)
2719 * TODO: submit multiple advices to one server in a single RPC
2721 if (copy_from_user(ladvise_hdr,
2722 (const struct llapi_ladvise_hdr __user *)arg,
2724 GOTO(out_ladvise, rc = -EFAULT);
2726 for (i = 0; i < num_advise; i++) {
2727 rc = ll_ladvise(inode, file, ladvise_hdr->lah_flags,
2728 &ladvise_hdr->lah_advise[i]);
2734 OBD_FREE(ladvise_hdr, alloc_size);
/* Unknown cmd: offer to registered handlers, then the data export. */
2741 ll_iocontrol_call(inode, file, cmd, arg, &err))
2744 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2745 (void __user *)arg));
2750 #ifndef HAVE_FILE_LLSEEK_SIZE
/* Compat fallback for kernels lacking generic_file_llseek_size():
 * validate @offset against the file's mode/limits and commit it to f_pos. */
2751 static inline loff_t
2752 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
/* negative offsets only allowed when the fd permits unsigned offsets */
2754 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2756 if (offset > maxsize)
2759 if (offset != file->f_pos) {
2760 file->f_pos = offset;
/* drop f_version so any position-dependent cached state is revalidated */
2761 file->f_version = 0;
/* Compat copy of the kernel's generic_file_llseek_size(): compute the new
 * file position for SEEK_SET/CUR/END/HOLE/DATA against @maxsize, using
 * @eof as the authoritative file size. */
2767 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2768 loff_t maxsize, loff_t eof)
2770 struct inode *inode = file_inode(file);
2778 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2779 * position-querying operation. Avoid rewriting the "same"
2780 * f_pos value back to the file because a concurrent read(),
2781 * write() or lseek() might have altered it
2786 * f_lock protects against read/modify/write race with other
2787 * SEEK_CURs. Note that parallel writes and reads behave
2791 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2792 inode_unlock(inode);
2796 * In the generic case the entire file is data, so as long as
2797 * offset isn't at the end of the file then the offset is data.
2804 * There is a virtual hole at the end of the file, so as long as
2805 * offset isn't i_size or larger, return i_size.
/* final clamp + store of the computed position */
2813 return llseek_execute(file, offset, maxsize);
/* llseek() entry point.  For SEEK_END/SEEK_HOLE/SEEK_DATA the cluster-wide
 * file size must be fetched from the OSTs first (glimpse) before the
 * generic seek logic can run. */
2817 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2819 struct inode *inode = file_inode(file);
2820 loff_t retval, eof = 0;
/* provisional target, logged below for VFS tracing */
2823 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2824 (origin == SEEK_CUR) ? file->f_pos : 0);
2825 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2826 PFID(ll_inode2fid(inode)), inode, retval, retval,
2828 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2830 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
/* refresh i_size from the OSTs so eof below is accurate */
2831 retval = ll_glimpse_size(inode);
2834 eof = i_size_read(inode);
2837 retval = ll_generic_file_llseek_size(file, offset, origin,
2838 ll_file_maxbytes(inode), eof);
/* flush() handler (called on close(2) of each fd).  Collects async
 * writeback errors recorded against the inode/object and reports -EIO
 * once; a failure already reported via fd_write_failed is not repeated. */
2842 static int ll_flush(struct file *file, fl_owner_t id)
2844 struct inode *inode = file_inode(file);
2845 struct ll_inode_info *lli = ll_i2info(inode);
2846 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2849 LASSERT(!S_ISDIR(inode->i_mode));
2851 /* catch async errors that were recorded back when async writeback
2852 * failed for pages in this mapping. */
2853 rc = lli->lli_async_rc;
/* consume the saved error so it is reported at most once */
2854 lli->lli_async_rc = 0;
2855 if (lli->lli_clob != NULL) {
2856 err = lov_read_and_clear_async_rc(lli->lli_clob);
2861 /* The application has been told write failure already.
2862 * Do not report failure again. */
2863 if (fd->fd_write_failed)
2865 return rc ? -EIO : 0;
2869 * Called to make sure a portion of file has been written out.
2870 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2872 * Return how many pages have been written.
2874 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2875 enum cl_fsync_mode mode, int ignore_layout)
2879 struct cl_fsync_io *fio;
/* reject any mode outside the defined cl_fsync_mode set */
2884 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2885 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2888 env = cl_env_get(&refcheck);
2890 RETURN(PTR_ERR(env));
/* build a CIT_FSYNC io against the file's cl_object */
2892 io = vvp_env_thread_io(env);
2893 io->ci_obj = ll_i2info(inode)->lli_clob;
2894 io->ci_ignore_layout = ignore_layout;
2896 /* initialize parameters for sync */
2897 fio = &io->u.ci_fsync;
2898 fio->fi_start = start;
2900 fio->fi_fid = ll_inode2fid(inode);
2901 fio->fi_mode = mode;
2902 fio->fi_nr_written = 0;
2904 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2905 result = cl_io_loop(env, io);
2907 result = io->ci_result;
/* on success report the page count accumulated by the sync io */
2909 result = fio->fi_nr_written;
2910 cl_io_fini(env, io);
2911 cl_env_put(env, &refcheck);
2917 * When dentry is provided (the 'else' case), file_dentry() may be
2918 * null and dentry must be used directly rather than pulled from
2919 * file_dentry() as is done otherwise.
/* fsync() handler; the three prototypes below match the kernel API
 * variants (4-arg range fsync, 2-arg, and legacy dentry-based). */
2922 #ifdef HAVE_FILE_FSYNC_4ARGS
2923 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2925 struct dentry *dentry = file_dentry(file);
2926 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2927 int ll_fsync(struct file *file, int datasync)
2929 struct dentry *dentry = file_dentry(file);
2931 loff_t end = LLONG_MAX;
2933 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2936 loff_t end = LLONG_MAX;
2938 struct inode *inode = dentry->d_inode;
2939 struct ll_inode_info *lli = ll_i2info(inode);
2940 struct ptlrpc_request *req;
2944 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2945 PFID(ll_inode2fid(inode)), inode);
2946 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2948 #ifdef HAVE_FILE_FSYNC_4ARGS
/* flush dirty pages in [start, end] before syncing with the servers */
2949 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2952 /* fsync's caller has already called _fdata{sync,write}, we want
2953 * that IO to finish before calling the osc and mdc sync methods */
2954 rc = filemap_fdatawait(inode->i_mapping);
2957 /* catch async errors that were recorded back when async writeback
2958 * failed for pages in this mapping. */
2959 if (!S_ISDIR(inode->i_mode)) {
2960 err = lli->lli_async_rc;
2961 lli->lli_async_rc = 0;
2964 if (lli->lli_clob != NULL) {
2965 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* sync metadata on the MDT */
2971 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
2975 ptlrpc_req_finished(req);
2977 if (S_ISREG(inode->i_mode)) {
2978 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
/* sync file data on the OSTs; remember the outcome in fd_write_failed
 * so ll_flush() does not report the same failure twice */
2980 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2981 if (rc == 0 && err < 0)
2984 fd->fd_write_failed = true;
2986 fd->fd_write_failed = false;
2989 #ifdef HAVE_FILE_FSYNC_4ARGS
2990 inode_unlock(inode);
/* flock()/fcntl() lock handler: translate the kernel file_lock into an
 * LDLM_FLOCK enqueue on the MDT, then mirror the result into the local
 * lock lists so the VFS stays consistent with the server. */
2996 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2998 struct inode *inode = file_inode(file);
2999 struct ll_sb_info *sbi = ll_i2sbi(inode);
3000 struct ldlm_enqueue_info einfo = {
3001 .ei_type = LDLM_FLOCK,
3002 .ei_cb_cp = ldlm_flock_completion_ast,
3003 .ei_cbdata = file_lock,
3005 struct md_op_data *op_data;
3006 struct lustre_handle lockh = { 0 };
3007 union ldlm_policy_data flock = { { 0 } };
/* saved because einfo.ei_mode is written into fl_type below */
3008 int fl_type = file_lock->fl_type;
3014 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3015 PFID(ll_inode2fid(inode)), file_lock);
3017 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3019 if (file_lock->fl_flags & FL_FLOCK) {
3020 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3021 /* flocks are whole-file locks */
3022 flock.l_flock.end = OFFSET_MAX;
3023 /* For flocks owner is determined by the local file desctiptor*/
3024 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3025 } else if (file_lock->fl_flags & FL_POSIX) {
3026 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3027 flock.l_flock.start = file_lock->fl_start;
3028 flock.l_flock.end = file_lock->fl_end;
3032 flock.l_flock.pid = file_lock->fl_pid;
3034 /* Somewhat ugly workaround for svc lockd.
3035 * lockd installs custom fl_lmops->lm_compare_owner that checks
3036 * for the fl_owner to be the same (which it always is on local node
3037 * I guess between lockd processes) and then compares pid.
3038 * As such we assign pid to the owner field to make it all work,
3039 * conflict with normal locks is unlikely since pid space and
3040 * pointer space for current->files are not intersecting */
3041 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3042 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* map the fcntl lock type to an LDLM lock mode */
3046 einfo.ei_mode = LCK_PR;
3049 /* An unlock request may or may not have any relation to
3050 * existing locks so we may not be able to pass a lock handle
3051 * via a normal ldlm_lock_cancel() request. The request may even
3052 * unlock a byte range in the middle of an existing lock. In
3053 * order to process an unlock request we need all of the same
3054 * information that is given with a normal read or write record
3055 * lock request. To avoid creating another ldlm unlock (cancel)
3056 * message we'll treat a LCK_NL flock request as an unlock. */
3057 einfo.ei_mode = LCK_NL;
3060 einfo.ei_mode = LCK_PW;
3063 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* non-blocking request: fail instead of waiting for conflicts */
3078 flags = LDLM_FL_BLOCK_NOWAIT;
/* F_GETLK-style query: test only, never actually take the lock */
3084 flags = LDLM_FL_TEST_LOCK;
3087 CERROR("unknown fcntl lock command: %d\n", cmd);
3091 /* Save the old mode so that if the mode in the lock changes we
3092 * can decrement the appropriate reader or writer refcount. */
3093 file_lock->fl_type = einfo.ei_mode;
3095 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3096 LUSTRE_OPC_ANY, NULL);
3097 if (IS_ERR(op_data))
3098 RETURN(PTR_ERR(op_data));
3100 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
3101 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
3102 flock.l_flock.pid, flags, einfo.ei_mode,
3103 flock.l_flock.start, flock.l_flock.end);
/* enqueue the flock on the MDT */
3105 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
3108 /* Restore the file lock type if not TEST lock. */
3109 if (!(flags & LDLM_FL_TEST_LOCK))
3110 file_lock->fl_type = fl_type;
/* record the server-granted lock in the kernel's local lock lists */
3112 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
3113 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
3114 !(flags & LDLM_FL_TEST_LOCK))
3115 rc2 = locks_lock_file_wait(file, file_lock);
3117 if ((file_lock->fl_flags & FL_FLOCK) &&
3118 (rc == 0 || file_lock->fl_type == F_UNLCK))
3119 rc2 = flock_lock_file_wait(file, file_lock);
3120 if ((file_lock->fl_flags & FL_POSIX) &&
3121 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3122 !(flags & LDLM_FL_TEST_LOCK))
3123 rc2 = posix_lock_file_wait(file, file_lock);
3124 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
/* local bookkeeping failed: release the server lock to stay in sync */
3126 if (rc2 && file_lock->fl_type != F_UNLCK) {
3127 einfo.ei_mode = LCK_NL;
3128 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
3133 ll_finish_md_op_data(op_data);
/* Look up the FID of @name under @parent via an MDS getattr-by-name RPC.
 * On success *fid is filled in; when @inode is non-NULL the child inode
 * is also instantiated from the reply. */
3138 int ll_get_fid_by_name(struct inode *parent, const char *name,
3139 int namelen, struct lu_fid *fid,
3140 struct inode **inode)
3142 struct md_op_data *op_data = NULL;
3143 struct mdt_body *body;
3144 struct ptlrpc_request *req;
3148 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3149 LUSTRE_OPC_ANY, NULL);
3150 if (IS_ERR(op_data))
3151 RETURN(PTR_ERR(op_data));
/* only the FID and file type are needed from the MDS */
3153 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
3154 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3155 ll_finish_md_op_data(op_data);
3159 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3161 GOTO(out_req, rc = -EFAULT);
3163 *fid = body->mbo_fid1;
3166 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
3168 ptlrpc_req_finished(req);
/* Migrate directory entry @name under @parent to MDT @mdtidx
 * (lfs migrate -m).  For regular files a write lease is taken on the
 * child and its data version is pinned so the rename-based migration
 * on the MDS can detect concurrent modification. */
3172 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3173 const char *name, int namelen)
3175 struct dentry *dchild = NULL;
3176 struct inode *child_inode = NULL;
3177 struct md_op_data *op_data;
3178 struct ptlrpc_request *request = NULL;
3179 struct obd_client_handle *och = NULL;
3181 struct mdt_body *body;
3183 __u64 data_version = 0;
3186 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3187 name, PFID(ll_inode2fid(parent)), mdtidx);
3189 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3190 0, LUSTRE_OPC_ANY, NULL);
3191 if (IS_ERR(op_data))
3192 RETURN(PTR_ERR(op_data));
3194 /* Get child FID first */
/* try the dcache first to avoid an RPC */
3195 qstr.hash = full_name_hash(name, namelen);
3198 dchild = d_lookup(file_dentry(file), &qstr);
3199 if (dchild != NULL) {
3200 if (dchild->d_inode != NULL)
3201 child_inode = igrab(dchild->d_inode);
3205 if (child_inode == NULL) {
/* not cached: resolve the FID via an MDS lookup */
3206 rc = ll_get_fid_by_name(parent, name, namelen,
3207 &op_data->op_fid3, &child_inode);
3212 if (child_inode == NULL)
3213 GOTO(out_free, rc = -EINVAL);
3216 * lfs migrate command needs to be blocked on the client
3217 * by checking the migrate FID against the FID of the
/* refuse to migrate the filesystem root */
3220 if (child_inode == parent->i_sb->s_root->d_inode)
3221 GOTO(out_iput, rc = -EINVAL);
3223 inode_lock(child_inode);
3224 op_data->op_fid3 = *ll_inode2fid(child_inode);
3225 if (!fid_is_sane(&op_data->op_fid3)) {
3226 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
3227 ll_get_fsname(parent->i_sb, NULL, 0), name,
3228 PFID(&op_data->op_fid3));
3229 GOTO(out_unlock, rc = -EINVAL);
3232 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3234 GOTO(out_unlock, rc);
/* already on the target MDT: nothing to do */
3237 CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name,
3238 PFID(&op_data->op_fid3), mdtidx);
3239 GOTO(out_unlock, rc = 0);
3242 if (S_ISREG(child_inode->i_mode)) {
/* take a write lease so concurrent writers invalidate the migration */
3243 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
3247 GOTO(out_unlock, rc);
3250 rc = ll_data_version(child_inode, &data_version,
3253 GOTO(out_close, rc);
3255 op_data->op_handle = och->och_fh;
3256 op_data->op_data = och->och_mod;
3257 op_data->op_data_version = data_version;
3258 op_data->op_lease_handle = och->och_lease_handle;
3259 op_data->op_bias |= MDS_RENAME_MIGRATE;
/* migration is implemented as a rename onto the target MDT */
3262 op_data->op_mds = mdtidx;
3263 op_data->op_cli_flags = CLI_MIGRATE;
3264 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3265 namelen, name, namelen, &request);
3267 LASSERT(request != NULL);
3268 ll_update_times(request, parent);
3270 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
3271 LASSERT(body != NULL);
3273 /* If the server does release layout lock, then we cleanup
3274 * the client och here, otherwise release it in out_close: */
3276 body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
3277 obd_mod_put(och->och_mod);
3278 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
3280 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
3286 if (request != NULL) {
3287 ptlrpc_req_finished(request);
3291 /* Try again if the file layout has changed. */
3292 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
3296 if (och != NULL) /* close the file */
3297 ll_lease_close(och, child_inode, NULL);
3299 clear_nlink(child_inode);
3301 inode_unlock(child_inode);
3305 ll_finish_md_op_data(op_data);
/* Lock handler for -o noflock mounts: rejects all flock/posix lock
 * requests (see ll_file_operations_noflock below). */
3310 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3318 * test if some locks matching bits and l_req_mode are acquired
3319 * - bits can be in different locks
3320 * - if found clear the common lock bits in *bits
3321 * - the bits not found, are kept in *bits
3323 * \param bits [IN] searched lock bits [IN]
3324 * \param l_req_mode [IN] searched lock mode
3325 * \retval boolean, true iff all bits are found
3327 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
3329 struct lustre_handle lockh;
3330 union ldlm_policy_data policy;
/* LCK_MINMODE means "any mode": match against all four read/write modes */
3331 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
3332 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
3341 fid = &ll_i2info(inode)->lli_fid;
3342 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3343 ldlm_lockname[mode]);
/* TEST_LOCK: probe only, never take a reference on a matched lock */
3345 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* probe each requested inodebit individually until all are accounted for */
3346 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3347 policy.l_inodebits.bits = *bits & (1 << i);
3348 if (policy.l_inodebits.bits == 0)
3351 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3352 &policy, mode, &lockh)) {
3353 struct ldlm_lock *lock;
3355 lock = ldlm_handle2lock(&lockh);
3358 ~(lock->l_policy_data.l_inodebits.bits);
3359 LDLM_LOCK_PUT(lock);
/* clear the bits this lock covers; remaining bits stay in *bits */
3361 *bits &= ~policy.l_inodebits.bits;
/* Match (and take a reference on) a granted MD lock on @inode covering
 * @bits with mode @mode; the handle is returned through @lockh.
 * Returns the matched mode, or 0 when no lock matches. */
3368 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
3369 struct lustre_handle *lockh, __u64 flags,
3370 enum ldlm_mode mode)
3372 union ldlm_policy_data policy = { .l_inodebits = { bits } };
3377 fid = &ll_i2info(inode)->lli_fid;
3378 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3380 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3381 fid, LDLM_IBITS, &policy, mode, lockh);
/* Post-process the result of a revalidate RPC: translate an -ENOENT on
 * an unlinked inode into success, and log other failures. */
3386 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3388 /* Already unlinked. Just update nlink and return success */
3389 if (rc == -ENOENT) {
3391 /* If it is striped directory, and there is bad stripe
3392 * Let's revalidate the dentry again, instead of returning
3394 if (S_ISDIR(inode->i_mode) &&
3395 ll_i2info(inode)->lli_lsm_md != NULL)
3398 /* This path cannot be hit for regular files unless in
3399 * case of obscure races, so no need to to validate
3401 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3403 } else if (rc != 0) {
/* EACCES/EIDRM are expected (permission/identity churn): log quietly */
3404 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3405 "%s: revalidate FID "DFID" error: rc = %d\n",
3406 ll_get_fsname(inode->i_sb, NULL, 0),
3407 PFID(ll_inode2fid(inode)), rc);
/* Revalidate @dentry's inode attributes covered by the MD lock bits in
 * @ibits: either through an intent lock (ATTRFID-capable servers) or,
 * when no matching MD lock is cached, a plain getattr RPC. */
3413 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3415 struct inode *inode = dentry->d_inode;
3416 struct ptlrpc_request *req = NULL;
3417 struct obd_export *exp;
3421 LASSERT(inode != NULL);
3423 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3424 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3426 exp = ll_i2mdexp(inode);
3428 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3429 * But under CMD case, it caused some lock issues, should be fixed
3430 * with new CMD ibits lock. See bug 12718 */
3431 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3432 struct lookup_intent oit = { .it_op = IT_GETATTR };
3433 struct md_op_data *op_data;
3435 if (ibits == MDS_INODELOCK_LOOKUP)
3436 oit.it_op = IT_LOOKUP;
3438 /* Call getattr by fid, so do not provide name at all. */
3439 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3440 dentry->d_inode, NULL, 0, 0,
3441 LUSTRE_OPC_ANY, NULL);
3442 if (IS_ERR(op_data))
3443 RETURN(PTR_ERR(op_data));
3445 rc = md_intent_lock(exp, op_data, &oit, &req,
3446 &ll_md_blocking_ast, 0);
3447 ll_finish_md_op_data(op_data);
3449 rc = ll_inode_revalidate_fini(inode, rc);
3453 rc = ll_revalidate_it_finish(req, &oit, dentry);
3455 ll_intent_release(&oit);
3459 /* Unlinked? Unhash dentry, so it is not picked up later by
3460 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3461 here to preserve get_cwd functionality on 2.6.
3463 if (!dentry->d_inode->i_nlink) {
3464 ll_lock_dcache(inode);
3465 d_lustre_invalidate(dentry, 0);
3466 ll_unlock_dcache(inode);
3469 ll_lookup_finish_locks(&oit, dentry);
/* no cached MD lock for the requested bits: attrs may be stale, refetch */
3470 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3471 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3472 u64 valid = OBD_MD_FLGETATTR;
3473 struct md_op_data *op_data;
/* regular files also need striping (EA) data in the reply */
3476 if (S_ISREG(inode->i_mode)) {
3477 rc = ll_get_default_mdsize(sbi, &ealen);
3480 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3483 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3484 0, ealen, LUSTRE_OPC_ANY,
3486 if (IS_ERR(op_data))
3487 RETURN(PTR_ERR(op_data));
3489 op_data->op_valid = valid;
3490 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3491 ll_finish_md_op_data(op_data);
3493 rc = ll_inode_revalidate_fini(inode, rc);
3497 rc = ll_prep_inode(&inode, req, NULL, NULL);
3500 ptlrpc_req_finished(req);
/* For a striped directory, merge the per-stripe attributes from all MDTs
 * into this inode (nlink, blocks, size and the cached a/m/ctime). */
3504 static int ll_merge_md_attr(struct inode *inode)
3506 struct cl_attr attr = { 0 };
3509 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3510 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3511 &attr, ll_md_blocking_ast);
3515 set_nlink(inode, attr.cat_nlink);
3516 inode->i_blocks = attr.cat_blocks;
3517 i_size_write(inode, attr.cat_size);
/* timestamps are kept in lli_* and copied to the inode by the caller */
3519 ll_i2info(inode)->lli_atime = attr.cat_atime;
3520 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
3521 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/* Full revalidate: refresh MD attributes via __ll_inode_revalidate(),
 * then for regular files glimpse the size from the OSTs (and for striped
 * directories merge attributes across MDTs). */
3527 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3529 struct inode *inode = dentry->d_inode;
3533 rc = __ll_inode_revalidate(dentry, ibits);
3537 /* if object isn't regular file, don't validate size */
3538 if (!S_ISREG(inode->i_mode)) {
3539 if (S_ISDIR(inode->i_mode) &&
3540 ll_i2info(inode)->lli_lsm_md != NULL) {
3541 rc = ll_merge_md_attr(inode);
/* publish the MD-supplied timestamps into the VFS inode */
3546 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
3547 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
3548 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
3550 /* In case of restore, the MDT has the right size and has
3551 * already send it back without granting the layout lock,
3552 * inode is up-to-date so glimpse is useless.
3553 * Also to glimpse we need the layout, in case of a running
3554 * restore the MDT holds the layout lock so the glimpse will
3555 * block up to the end of restore (getattr will block)
3557 if (!ll_file_test_flag(ll_i2info(inode), LLIF_FILE_RESTORING))
3558 rc = ll_glimpse_size(inode);
/* Squash a dev_t so 32-bit compat stat syscalls accept it. */
3563 static inline dev_t ll_compat_encode_dev(dev_t dev)
3565 /* The compat_sys_*stat*() syscalls will fail unless the
3566 * device majors and minors are both less than 256. Note that
3567 * the value returned here will be passed through
3568 * old_encode_dev() in cp_compat_stat(). And so we are not
3569 * trying to return a valid compat (u16) device number, just
3570 * one that will pass the old_valid_dev() check. */
3572 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
/* getattr() handler: revalidate UPDATE|LOOKUP bits, then fill *stat from
 * the (now fresh) inode.  32-bit clients get FID-derived inos and
 * compat-squashed device numbers. */
3575 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3577 struct inode *inode = de->d_inode;
3578 struct ll_sb_info *sbi = ll_i2sbi(inode);
3579 struct ll_inode_info *lli = ll_i2info(inode);
3582 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3583 MDS_INODELOCK_LOOKUP);
3584 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
/* fault-injection hook for testing delayed getattr */
3589 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
3591 if (ll_need_32bit_api(sbi)) {
/* build a 32-bit-safe inode number from the FID */
3592 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3593 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
3594 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
3596 stat->ino = inode->i_ino;
3597 stat->dev = inode->i_sb->s_dev;
3598 stat->rdev = inode->i_rdev;
3601 stat->mode = inode->i_mode;
3602 stat->uid = inode->i_uid;
3603 stat->gid = inode->i_gid;
3604 stat->atime = inode->i_atime;
3605 stat->mtime = inode->i_mtime;
3606 stat->ctime = inode->i_ctime;
3607 stat->blksize = 1 << inode->i_blkbits;
3609 stat->nlink = inode->i_nlink;
3610 stat->size = i_size_read(inode);
3611 stat->blocks = inode->i_blocks;
/* fiemap() handler: marshal the kernel's fiemap_extent_info into a
 * struct fiemap, run ll_do_fiemap() against the OSTs, and copy the
 * mapped extents back to the user buffer. */
3616 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3617 __u64 start, __u64 len)
3621 struct fiemap *fiemap;
3622 unsigned int extent_count = fieinfo->fi_extents_max;
3624 num_bytes = sizeof(*fiemap) + (extent_count *
3625 sizeof(struct fiemap_extent));
3626 OBD_ALLOC_LARGE(fiemap, num_bytes);
3631 fiemap->fm_flags = fieinfo->fi_flags;
3632 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3633 fiemap->fm_start = start;
3634 fiemap->fm_length = len;
/* seed with the first user extent (carries continuation state) */
3635 if (extent_count > 0 &&
3636 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3637 sizeof(struct fiemap_extent)) != 0)
3638 GOTO(out, rc = -EFAULT);
3640 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3642 fieinfo->fi_flags = fiemap->fm_flags;
3643 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3644 if (extent_count > 0 &&
3645 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3646 fiemap->fm_mapped_extents *
3647 sizeof(struct fiemap_extent)) != 0)
3648 GOTO(out, rc = -EFAULT);
3650 OBD_FREE_LARGE(fiemap, num_bytes);
/* get_acl() handler: return a referenced copy of the POSIX ACL cached in
 * the inode info; lli_lock guards the cached pointer. */
3654 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3656 struct ll_inode_info *lli = ll_i2info(inode);
3657 struct posix_acl *acl = NULL;
3660 spin_lock(&lli->lli_lock);
3661 /* VFS' acl_permission_check->check_acl will release the refcount */
3662 acl = posix_acl_dup(lli->lli_posix_acl);
3663 spin_unlock(&lli->lli_lock);
/* ACL check callback for kernels whose generic_permission() takes a
 * check_acl hook (pre-2ARGS API).  Evaluates the cached POSIX ACL
 * against @mask; RCU-walk lookups bail out early. */
3668 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
3670 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3671 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3673 ll_check_acl(struct inode *inode, int mask)
3676 # ifdef CONFIG_FS_POSIX_ACL
3677 struct posix_acl *acl;
3681 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
/* cannot take locks / sleep under RCU-walk */
3682 if (flags & IPERM_FLAG_RCU)
3685 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3690 rc = posix_acl_permission(inode, acl, mask);
3691 posix_acl_release(acl);
3694 # else /* !CONFIG_FS_POSIX_ACL */
3696 # endif /* CONFIG_FS_POSIX_ACL */
3698 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/* permission() handler.  Revalidates the root inode on first touch,
 * applies root-squash by temporarily overriding the task credentials,
 * then defers to the generic permission check (+ACLs). */
3700 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3701 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3703 # ifdef HAVE_INODE_PERMISION_2ARGS
3704 int ll_inode_permission(struct inode *inode, int mask)
3706 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3711 struct ll_sb_info *sbi;
3712 struct root_squash_info *squash;
3713 struct cred *cred = NULL;
3714 const struct cred *old_cred = NULL;
3716 bool squash_id = false;
/* RCU-walk cannot block on the revalidate RPC below */
3719 #ifdef MAY_NOT_BLOCK
3720 if (mask & MAY_NOT_BLOCK)
3722 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3723 if (flags & IPERM_FLAG_RCU)
3727 /* as root inode are NOT getting validated in lookup operation,
3728 * need to do it before permission check. */
3730 if (inode == inode->i_sb->s_root->d_inode) {
3731 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3732 MDS_INODELOCK_LOOKUP);
3737 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3738 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3740 /* squash fsuid/fsgid if needed */
3741 sbi = ll_i2sbi(inode);
3742 squash = &sbi->ll_squash;
3743 if (unlikely(squash->rsi_uid != 0 &&
3744 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
3745 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
3749 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
3750 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
3751 squash->rsi_uid, squash->rsi_gid);
3753 /* update current process's credentials
3754 * and FS capability */
3755 cred = prepare_creds();
3759 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
3760 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* drop all filesystem capabilities so root truly loses privilege */
3761 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
3762 if ((1 << cap) & CFS_CAP_FS_MASK)
3763 cap_lower(cred->cap_effective, cap);
3765 old_cred = override_creds(cred);
3768 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
3769 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3770 /* restore current process's credentials and FS capability */
3772 revert_creds(old_cred);
3779 /* -o localflock - only provides locally consistent flock locks */
/* Default file_operations (no .flock/.lock: flock is client-local). */
3780 struct file_operations ll_file_operations = {
3781 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
3782 # ifdef HAVE_SYNC_READ_WRITE
3783 .read = new_sync_read,
3784 .write = new_sync_write,
3786 .read_iter = ll_file_read_iter,
3787 .write_iter = ll_file_write_iter,
3788 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3789 .read = ll_file_read,
3790 .aio_read = ll_file_aio_read,
3791 .write = ll_file_write,
3792 .aio_write = ll_file_aio_write,
3793 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3794 .unlocked_ioctl = ll_file_ioctl,
3795 .open = ll_file_open,
3796 .release = ll_file_release,
3797 .mmap = ll_file_mmap,
3798 .llseek = ll_file_seek,
3799 .splice_read = ll_file_splice_read,
/* file_operations for -o flock mounts: same as the default set, plus
 * cluster-coherent .flock/.lock handlers via ll_file_flock(). */
3804 struct file_operations ll_file_operations_flock = {
3805 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
3806 # ifdef HAVE_SYNC_READ_WRITE
3807 .read = new_sync_read,
3808 .write = new_sync_write,
3809 # endif /* HAVE_SYNC_READ_WRITE */
3810 .read_iter = ll_file_read_iter,
3811 .write_iter = ll_file_write_iter,
3812 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3813 .read = ll_file_read,
3814 .aio_read = ll_file_aio_read,
3815 .write = ll_file_write,
3816 .aio_write = ll_file_aio_write,
3817 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3818 .unlocked_ioctl = ll_file_ioctl,
3819 .open = ll_file_open,
3820 .release = ll_file_release,
3821 .mmap = ll_file_mmap,
3822 .llseek = ll_file_seek,
3823 .splice_read = ll_file_splice_read,
3826 .flock = ll_file_flock,
3827 .lock = ll_file_flock
3831 struct file_operations ll_file_operations_noflock = {
3832 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
3833 # ifdef HAVE_SYNC_READ_WRITE
3834 .read = new_sync_read,
3835 .write = new_sync_write,
3836 # endif /* HAVE_SYNC_READ_WRITE */
3837 .read_iter = ll_file_read_iter,
3838 .write_iter = ll_file_write_iter,
3839 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3840 .read = ll_file_read,
3841 .aio_read = ll_file_aio_read,
3842 .write = ll_file_write,
3843 .aio_write = ll_file_aio_write,
3844 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3845 .unlocked_ioctl = ll_file_ioctl,
3846 .open = ll_file_open,
3847 .release = ll_file_release,
3848 .mmap = ll_file_mmap,
3849 .llseek = ll_file_seek,
3850 .splice_read = ll_file_splice_read,
3853 .flock = ll_file_noflock,
3854 .lock = ll_file_noflock
/* inode_operations for regular files. */
3857 struct inode_operations ll_file_inode_operations = {
3858 .setattr = ll_setattr,
3859 .getattr = ll_getattr,
3860 .permission = ll_inode_permission,
3861 .setxattr = ll_setxattr,
3862 .getxattr = ll_getxattr,
3863 .listxattr = ll_listxattr,
3864 .removexattr = ll_removexattr,
3865 .fiemap = ll_fiemap,
3866 #ifdef HAVE_IOP_GET_ACL
3867 .get_acl = ll_get_acl,
3871 /* dynamic ioctl number support routins */
/* Global registry of dynamically-registered ioctl handlers; ioc_sem
 * protects the ioc_head list. */
3872 static struct llioc_ctl_data {
3873 struct rw_semaphore ioc_sem;
3874 struct list_head ioc_head;
3876 __RWSEM_INITIALIZER(llioc.ioc_sem),
3877 LIST_HEAD_INIT(llioc.ioc_head)
/* One registration: callback plus the ioctl command numbers it serves. */
3882 struct list_head iocd_list;
3883 unsigned int iocd_size;
3884 llioc_callback_t iocd_cb;
3885 unsigned int iocd_count;
/* trailing array of iocd_count command numbers */
3886 unsigned int iocd_cmd[0];
/* Register callback @cb for @count dynamic ioctl commands listed in @cmd.
 * Returns an opaque cookie for ll_iocontrol_unregister(), or NULL on
 * bad arguments / allocation failure. */
3889 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3892 struct llioc_data *in_data = NULL;
3895 if (cb == NULL || cmd == NULL ||
3896 count > LLIOC_MAX_CMD || count < 0)
3899 size = sizeof(*in_data) + count * sizeof(unsigned int);
3900 OBD_ALLOC(in_data, size);
3901 if (in_data == NULL)
3904 memset(in_data, 0, sizeof(*in_data));
3905 in_data->iocd_size = size;
3906 in_data->iocd_cb = cb;
3907 in_data->iocd_count = count;
3908 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
/* publish under the write lock */
3910 down_write(&llioc.ioc_sem);
3911 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3912 up_write(&llioc.ioc_sem);
/* Remove and free the registration identified by @magic (the cookie
 * returned by ll_iocontrol_register); warns if it is not found. */
3917 void ll_iocontrol_unregister(void *magic)
3919 struct llioc_data *tmp;
3924 down_write(&llioc.ioc_sem);
3925 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3927 unsigned int size = tmp->iocd_size;
3929 list_del(&tmp->iocd_list);
/* drop the lock before freeing; the entry is already unlinked */
3930 up_write(&llioc.ioc_sem);
3932 OBD_FREE(tmp, size);
3936 up_write(&llioc.ioc_sem);
3938 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3941 EXPORT_SYMBOL(ll_iocontrol_register);
3942 EXPORT_SYMBOL(ll_iocontrol_unregister);
/* Dispatch @cmd through the registered dynamic ioctl handlers; the first
 * handler returning LLIOC_STOP terminates the scan and its result is
 * passed back through *rcp. */
3944 static enum llioc_iter
3945 ll_iocontrol_call(struct inode *inode, struct file *file,
3946 unsigned int cmd, unsigned long arg, int *rcp)
3948 enum llioc_iter ret = LLIOC_CONT;
3949 struct llioc_data *data;
3950 int rc = -EINVAL, i;
3952 down_read(&llioc.ioc_sem);
3953 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3954 for (i = 0; i < data->iocd_count; i++) {
3955 if (cmd != data->iocd_cmd[i])
3958 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3962 if (ret == LLIOC_STOP)
3965 up_read(&llioc.ioc_sem);
/* Apply a layout configuration to the inode's cl_object.  For
 * OBJECT_CONF_SET the layout lock is only allowed to match after the
 * layout is in place, and the cached layout generation is refreshed. */
3972 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3974 struct ll_inode_info *lli = ll_i2info(inode);
3975 struct cl_object *obj = lli->lli_clob;
3984 env = cl_env_get(&refcheck);
3986 RETURN(PTR_ERR(env));
3988 rc = cl_conf_set(env, lli->lli_clob, conf);
3992 if (conf->coc_opc == OBJECT_CONF_SET) {
3993 struct ldlm_lock *lock = conf->coc_lock;
3994 struct cl_layout cl = {
3998 LASSERT(lock != NULL);
3999 LASSERT(ldlm_has_layout(lock));
4001 /* it can only be allowed to match after layout is
4002 * applied to inode otherwise false layout would be
4003 * seen. Applying layout shoud happen before dropping
4004 * the intent lock. */
4005 ldlm_lock_allow_match(lock);
/* read back the generation just applied and cache it in lli */
4007 rc = cl_object_layout_get(env, obj, &cl);
4012 DFID": layout version change: %u -> %u\n",
4013 PFID(&lli->lli_fid), ll_layout_version_get(lli),
4015 ll_layout_version_set(lli, cl.cl_layout_gen);
4019 cl_env_put(env, &refcheck);
4024 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
/*
 * If @lock already carries LVB data the function does nothing.
 * Otherwise the LOV xattr is read from the MDT via md_getxattr(),
 * copied into a freshly allocated buffer, and installed as the lock's
 * LVB under the resource lock.  Returns 0 or a negative errno.
 */
4025 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4028 struct ll_sb_info *sbi = ll_i2sbi(inode);
4029 struct ptlrpc_request *req;
4030 struct mdt_body *body;
4037 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4038 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4039 lock->l_lvb_data, lock->l_lvb_len);
/* layout already attached to the lock: nothing to fetch */
4041 if (lock->l_lvb_data != NULL)
4044 /* if layout lock was granted right away, the layout is returned
4045 * within DLM_LVB of dlm reply; otherwise if the lock was ever
4046 * blocked and then granted via completion ast, we have to fetch
4047 * layout here. Please note that we can't use the LVB buffer in
4048 * completion AST because it doesn't have a large enough buffer */
4049 rc = ll_get_default_mdsize(sbi, &lmmsize);
4051 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4052 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
4057 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4059 GOTO(out, rc = -EPROTO);
4061 lmmsize = body->mbo_eadatasize;
4062 if (lmmsize == 0) /* empty layout */
4065 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4067 GOTO(out, rc = -EFAULT);
/* private copy: the reply buffer disappears with the request below */
4069 OBD_ALLOC_LARGE(lvbdata, lmmsize);
4070 if (lvbdata == NULL)
4071 GOTO(out, rc = -ENOMEM);
4073 memcpy(lvbdata, lmm, lmmsize);
4074 lock_res_and_lock(lock);
/* install our buffer only if nobody raced us to set the LVB */
4075 if (unlikely(lock->l_lvb_data == NULL)) {
4076 lock->l_lvb_type = LVB_T_LAYOUT;
4077 lock->l_lvb_data = lvbdata;
4078 lock->l_lvb_len = lmmsize;
4081 unlock_res_and_lock(lock);
/* presumably the race-lost path frees our unused copy here — the
 * guarding condition is elided in this excerpt; confirm against the
 * complete source */
4084 OBD_FREE_LARGE(lvbdata, lmmsize);
4089 ptlrpc_req_finished(req);
4094 * Apply the layout to the inode. Layout lock is held and will be released
 *
 * The lock reference taken via ldlm_handle2lock() is dropped and the
 * lock itself decref'd (mode @mode) before this function returns.  On
 * OBJECT_CONF_SET failure with -EBUSY — old layout still covering
 * in-flight IO — the function waits for the IO via OBJECT_CONF_WAIT.
4097 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4098 struct inode *inode)
4100 struct ll_inode_info *lli = ll_i2info(inode);
4101 struct ll_sb_info *sbi = ll_i2sbi(inode);
4102 struct ldlm_lock *lock;
4103 struct cl_object_conf conf;
4106 bool wait_layout = false;
4109 LASSERT(lustre_handle_is_used(lockh));
4111 lock = ldlm_handle2lock(lockh);
4112 LASSERT(lock != NULL);
4113 LASSERT(ldlm_has_layout(lock));
4115 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4116 PFID(&lli->lli_fid), inode);
4118 /* in case this is a caching lock and reinstate with new inode */
4119 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
4121 lock_res_and_lock(lock);
4122 lvb_ready = ldlm_is_lvb_ready(lock);
4123 unlock_res_and_lock(lock);
4124 /* checking lvb_ready is racy but this is okay. The worst case is
4125 * that multi processes may configure the file on the same time. */
/* LVB not populated yet (lock granted via completion AST): fetch the
 * layout from the MDT before it can be applied */
4130 rc = ll_layout_fetch(inode, lock);
4134 /* for layout lock, lmm is stored in lock's lvb.
4135 * lvb_data is immutable if the lock is held so it's safe to access it
4138 * set layout to file. Unlikely this will fail as old layout was
4139 * surely eliminated */
4140 memset(&conf, 0, sizeof conf);
4141 conf.coc_opc = OBJECT_CONF_SET;
4142 conf.coc_inode = inode;
4143 conf.coc_lock = lock;
4144 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
4145 conf.u.coc_layout.lb_len = lock->l_lvb_len;
4146 rc = ll_layout_conf(inode, &conf);
4148 /* refresh layout failed, need to wait */
4149 wait_layout = rc == -EBUSY;
/* release the lock reference and the DLM lock itself; the layout (or
 * the decision to wait) has already been captured above */
4153 LDLM_LOCK_PUT(lock);
4154 ldlm_lock_decref(lockh, mode);
4156 /* wait for IO to complete if it's still being used. */
4158 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
4159 ll_get_fsname(inode->i_sb, NULL, 0),
4160 PFID(&lli->lli_fid), inode);
4162 memset(&conf, 0, sizeof conf);
4163 conf.coc_opc = OBJECT_CONF_WAIT;
4164 conf.coc_inode = inode;
4165 rc = ll_layout_conf(inode, &conf);
4169 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4170 ll_get_fsname(inode->i_sb, NULL, 0),
4171 PFID(&lli->lli_fid), rc);
/*
 * ll_layout_refresh_locked - obtain/refresh the layout lock for @inode.
 *
 * Caller holds lli->lli_layout_mutex (see ll_layout_refresh()).  First
 * tries to match an already-cached layout DLM lock; on a miss, enqueues
 * a new IT_LAYOUT intent lock at the MDT and applies the resulting
 * layout via ll_layout_lock_set().  The enqueue path is presumably
 * re-entered on certain errors (the loop/goto target is elided in this
 * excerpt — confirm against the complete source).  Returns 0 or a
 * negative errno.
 */
4176 static int ll_layout_refresh_locked(struct inode *inode)
4178 struct ll_inode_info *lli = ll_i2info(inode);
4179 struct ll_sb_info *sbi = ll_i2sbi(inode);
4180 struct md_op_data *op_data;
4181 struct lookup_intent it;
4182 struct lustre_handle lockh;
4183 enum ldlm_mode mode;
4184 struct ptlrpc_request *req;
4189 /* mostly layout lock is caching on the local side, so try to match
4190 * it before grabbing layout lock mutex. */
4191 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4192 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
4193 if (mode != 0) { /* hit cached lock */
4194 rc = ll_layout_lock_set(&lockh, mode, inode);
4201 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4202 0, 0, LUSTRE_OPC_ANY, NULL);
4203 if (IS_ERR(op_data))
4204 RETURN(PTR_ERR(op_data));
4206 /* have to enqueue one */
4207 memset(&it, 0, sizeof(it));
4208 it.it_op = IT_LAYOUT;
4210 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4211 ll_get_fsname(inode->i_sb, NULL, 0),
4212 PFID(&lli->lli_fid), inode);
4214 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
4215 &ll_md_blocking_ast, 0);
/* the enqueue reply itself is not needed; only the lock matters */
4216 if (it.it_request != NULL)
4217 ptlrpc_req_finished(it.it_request);
4218 it.it_request = NULL;
4220 ll_finish_md_op_data(op_data);
/* take the lock mode/handle out of the intent so dropping the intent
 * below does not release the lock */
4222 mode = it.it_lock_mode;
4223 it.it_lock_mode = 0;
4224 ll_intent_drop_lock(&it);
4227 /* set lock data in case this is a new lock */
4228 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4229 lockh.cookie = it.it_lock_handle;
4230 rc = ll_layout_lock_set(&lockh, mode, inode);
4239 * This function checks if there exists a LAYOUT lock on the client side,
4240 * or enqueues it if it doesn't have one in cache.
4242 * This function will not hold layout lock so it may be revoked any time after
4243 * this function returns. Any operations depend on layout should be redone
4246 * This function should be called before lov_io_init() to get an uptodate
4247 * layout version, the caller should save the version number and after IO
4248 * is finished, this function should be called again to verify that layout
4249 * is not changed during IO time.
4251 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4253 struct ll_inode_info *lli = ll_i2info(inode);
4254 struct ll_sb_info *sbi = ll_i2sbi(inode);
4258 *gen = ll_layout_version_get(lli);
/* layout locks disabled for this mount, or the inode already has an
 * instantiated layout generation: nothing to refresh */
4259 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
4263 LASSERT(fid_is_sane(ll_inode2fid(inode)));
4264 LASSERT(S_ISREG(inode->i_mode));
4266 /* take layout lock mutex to enqueue layout lock exclusively. */
4267 mutex_lock(&lli->lli_layout_mutex);
4269 rc = ll_layout_refresh_locked(inode);
/* report the generation that is current after the refresh */
4273 *gen = ll_layout_version_get(lli);
4275 mutex_unlock(&lli->lli_layout_mutex);
4281 * This function send a restore request to the MDT
4283 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4285 struct hsm_user_request *hur;
4289 len = sizeof(struct hsm_user_request) +
4290 sizeof(struct hsm_user_item);
4291 OBD_ALLOC(hur, len);
4295 hur->hur_request.hr_action = HUA_RESTORE;
4296 hur->hur_request.hr_archive_id = 0;
4297 hur->hur_request.hr_flags = 0;
4298 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4299 sizeof(hur->hur_user_item[0].hui_fid));
4300 hur->hur_user_item[0].hui_extent.offset = offset;
4301 hur->hur_user_item[0].hui_extent.length = length;
4302 hur->hur_request.hr_itemcount = 1;
4303 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,