4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2016, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
48 #include <lustre/ll_fiemap.h>
50 #include <lustre_ioctl.h>
51 #include <lustre_swab.h>
53 #include "cl_object.h"
54 #include "llite_internal.h"
55 #include "vvp_internal.h"
/* Forward declarations of helpers defined later in this file. */
ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);

static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,

static enum llioc_iter
ll_iocontrol_call(struct inode *inode, struct file *file,
		  unsigned int cmd, unsigned long arg, int *rcp);
/**
 * Allocate a per-open ll_file_data from its dedicated slab cache.
 *
 * GFP_NOFS prevents memory reclaim from re-entering the filesystem
 * while we are in an open path.  Returns the new descriptor, or
 * presumably NULL on allocation failure (NULL check not visible here
 * — TODO confirm against full source).
 */
static struct ll_file_data *ll_file_data_get(void)
	struct ll_file_data *fd;

	OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);

	/* Fresh open: no write failure recorded yet. */
	fd->fd_write_failed = false;
/** Return a ll_file_data descriptor to its slab cache. */
static void ll_file_data_put(struct ll_file_data *fd)
	OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
 * Packs all the attributes into @op_data for the CLOSE rpc.
 */
static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
			     struct obd_client_handle *och)
	ll_prep_md_op_data(op_data, inode, NULL, NULL,
			   0, 0, LUSTRE_OPC_ANY, NULL);

	/* Snapshot the client-side inode attributes so the MDT sees the
	 * final state of this open at close time. */
	op_data->op_attr.ia_mode = inode->i_mode;
	op_data->op_attr.ia_atime = inode->i_atime;
	op_data->op_attr.ia_mtime = inode->i_mtime;
	op_data->op_attr.ia_ctime = inode->i_ctime;
	op_data->op_attr.ia_size = i_size_read(inode);
	op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
				     ATTR_MTIME | ATTR_MTIME_SET |
				     ATTR_CTIME | ATTR_CTIME_SET;
	op_data->op_attr_blocks = inode->i_blocks;
	op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
	/* Identify which MDS open handle is being closed. */
	op_data->op_handle = och->och_fh;

	if (och->och_flags & FMODE_WRITE &&
	    ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
		/* For HSM: if inode data has been modified, pack it so that
		 * MDT can set data dirty flag in the archive. */
		op_data->op_bias |= MDS_DATA_MODIFIED;
 * Perform a close, possibly with a bias.
 * The meaning of "data" depends on the value of "bias".
 *
 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
 */
static int ll_close_inode_openhandle(struct inode *inode,
				     struct obd_client_handle *och,
				     enum mds_op_bias bias, void *data)
	struct obd_export *md_exp = ll_i2mdexp(inode);
	const struct ll_inode_info *lli = ll_i2info(inode);
	struct md_op_data *op_data;
	struct ptlrpc_request *req = NULL;

	/* Without a live MDC connection there is nobody to send CLOSE to. */
	if (class_exp2obd(md_exp) == NULL) {
		CERROR("%s: invalid MDC connection handle closing "DFID"\n",
		       ll_get_fsname(inode->i_sb, NULL, 0),
		       PFID(&lli->lli_fid));

	OBD_ALLOC_PTR(op_data);
	/* We leak openhandle and request here on error, but not much to be
	 * done in OOM case since app won't retry close on error either. */
		GOTO(out, rc = -ENOMEM);

	ll_prepare_close(inode, op_data, och);
	/* Bias-specific packing (switch on \a bias — head not visible in
	 * this excerpt). */
	case MDS_CLOSE_LAYOUT_SWAP:
		LASSERT(data != NULL);
		op_data->op_bias |= MDS_CLOSE_LAYOUT_SWAP;
		op_data->op_data_version = 0;
		op_data->op_lease_handle = och->och_lease_handle;
		/* \a data is the second inode whose layout is swapped in. */
		op_data->op_fid2 = *ll_inode2fid(data);
	case MDS_HSM_RELEASE:
		LASSERT(data != NULL);
		op_data->op_bias |= MDS_HSM_RELEASE;
		/* \a data carries the data version to be verified by MDT. */
		op_data->op_data_version = *(__u64 *)data;
		op_data->op_lease_handle = och->och_lease_handle;
		op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
		LASSERT(data == NULL);

	rc = md_close(md_exp, op_data, och->och_mod, &req);
	if (rc != 0 && rc != -EINTR)
		CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
		       md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);

	/* For biased closes, check whether the server actually executed
	 * the close intent. */
	    op_data->op_bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)) {
		struct mdt_body *body;

		body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
		if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))

	ll_finish_md_op_data(op_data);

	/* Handle is dead from this point on; poison it. */
	md_clear_open_replay_data(md_exp, och);
	och->och_fh.cookie = DEAD_HANDLE_MAGIC;

	ptlrpc_req_finished(req); /* This is close request */
/**
 * Drop the last reference on the per-mode MDS open handle of @inode and,
 * if nobody else uses it, send the CLOSE to the MDT.
 *
 * \param fmode  open mode (FMODE_WRITE/FMODE_EXEC/FMODE_READ) selecting
 *               which of the three cached handles to close.
 */
int ll_md_real_close(struct inode *inode, fmode_t fmode)
	struct ll_inode_info *lli = ll_i2info(inode);
	struct obd_client_handle **och_p;
	struct obd_client_handle *och;

	/* Pick the handle/usecount pair matching the open mode. */
	if (fmode & FMODE_WRITE) {
		och_p = &lli->lli_mds_write_och;
		och_usecount = &lli->lli_open_fd_write_count;
	} else if (fmode & FMODE_EXEC) {
		och_p = &lli->lli_mds_exec_och;
		och_usecount = &lli->lli_open_fd_exec_count;
		LASSERT(fmode & FMODE_READ);
		och_p = &lli->lli_mds_read_och;
		och_usecount = &lli->lli_open_fd_read_count;

	mutex_lock(&lli->lli_och_mutex);
	if (*och_usecount > 0) {
		/* There are still users of this handle, so skip
		mutex_unlock(&lli->lli_och_mutex);

	mutex_unlock(&lli->lli_och_mutex);

	/* There might be a race and this handle may already
	rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/**
 * Per-file-descriptor close: release group lock and lease if held,
 * drop this fd's reference on the per-mode MDS open handle, and free
 * the ll_file_data.  Talking to the MDS is skipped when a matching
 * OPEN DLM lock is still cached locally.
 */
static int ll_md_close(struct inode *inode, struct file *file)
	union ldlm_policy_data policy = {
		.l_inodebits	= { MDS_INODELOCK_OPEN },
	__u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct ll_inode_info *lli = ll_i2info(inode);
	struct lustre_handle lockh;
	enum ldlm_mode lockmode;

	/* clear group lock, if present */
	if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
		ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);

	if (fd->fd_lease_och != NULL) {

		/* Usually the lease is not released when the
		 * application crashed, we need to release here. */
		rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
		CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
		       PFID(&lli->lli_fid), rc, lease_broken);

		fd->fd_lease_och = NULL;

	/* Close the open handle this fd took ownership of (lease path). */
	if (fd->fd_och != NULL) {
		rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);

	/* Let's see if we have good enough OPEN lock on the file and if
	   we can skip talking to MDS */
	mutex_lock(&lli->lli_och_mutex);
	if (fd->fd_omode & FMODE_WRITE) {
		LASSERT(lli->lli_open_fd_write_count);
		lli->lli_open_fd_write_count--;
	} else if (fd->fd_omode & FMODE_EXEC) {
		LASSERT(lli->lli_open_fd_exec_count);
		lli->lli_open_fd_exec_count--;
		LASSERT(lli->lli_open_fd_read_count);
		lli->lli_open_fd_read_count--;
	mutex_unlock(&lli->lli_och_mutex);

	/* No cached OPEN lock: must do the real close via the MDS. */
	if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
			   LDLM_IBITS, &policy, lockmode, &lockh))
		rc = ll_md_real_close(inode, fd->fd_omode);

	LUSTRE_FPRIVATE(file) = NULL;
	ll_file_data_put(fd);
/* While this returns an error code, fput() the caller does not, so we need
 * to make every effort to clean up all of our state here.  Also, applications
 * rarely check close errors and even if an error is returned they will not
 * re-try the close call.
 */
int ll_file_release(struct inode *inode, struct file *file)
	struct ll_file_data *fd;
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ll_inode_info *lli = ll_i2info(inode);

	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
	       PFID(ll_inode2fid(inode)), inode);

	/* Don't count releases of the filesystem root in the stats. */
	if (inode->i_sb->s_root != file_dentry(file))
		ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
	fd = LUSTRE_FPRIVATE(file);

	/* The last ref on @file, maybe not the the owner pid of statahead,
	 * because parent and child process can share the same file handle. */
	if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
		ll_deauthorize_statahead(inode, fd);

	/* Root gets a trimmed-down release: no MDS close needed. */
	if (inode->i_sb->s_root == file_dentry(file)) {
		LUSTRE_FPRIVATE(file) = NULL;
		ll_file_data_put(fd);

	if (!S_ISDIR(inode->i_mode)) {
		/* Pick up any async write errors so close can report them. */
		if (lli->lli_clob != NULL)
			lov_read_and_clear_async_rc(lli->lli_clob);
		lli->lli_async_rc = 0;

	rc = ll_md_close(inode, file);

	if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
		libcfs_debug_dumplog();
/**
 * Send an intent-OPEN enqueue to the MDS for @file and fill in the
 * resulting lock/open state in @itp.
 *
 * \param lmm/lmmsize  optional striping metadata packed into the request.
 */
static int ll_intent_file_open(struct file *file, void *lmm, int lmmsize,
			       struct lookup_intent *itp)
	struct dentry *de = file_dentry(file);
	struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
	struct dentry *parent = de->d_parent;
	const char *name = NULL;
	struct md_op_data *op_data;
	struct ptlrpc_request *req = NULL;

	LASSERT(parent != NULL);
	LASSERT(itp->it_flags & MDS_OPEN_BY_FID);

	/* if server supports open-by-fid, or file name is invalid, don't pack
	 * name in open request */
	if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
	    lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
		name = de->d_name.name;
		len = de->d_name.len;

	op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
				     name, len, 0, LUSTRE_OPC_ANY, NULL);
		RETURN(PTR_ERR(op_data));
	op_data->op_data = lmm;
	op_data->op_data_size = lmmsize;

	rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
			    &ll_md_blocking_ast, 0);
	ll_finish_md_op_data(op_data);

	/* reason for keep own exit path - don`t flood log
	 * with messages with -ESTALE errors.
	 */
	/* The open succeeded on MDS but we hit an error locally: drop the
	 * server-side open handle so it isn't leaked. */
	if (!it_disposition(itp, DISP_OPEN_OPEN) ||
	    it_open_error(DISP_OPEN_OPEN, itp))
		ll_release_openhandle(de, itp);

	if (it_disposition(itp, DISP_LOOKUP_NEG))
		GOTO(out, rc = -ENOENT);

	if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
		rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
		CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);

	/* Update the local inode from the server reply and attach lock data. */
	rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
	if (!rc && itp->it_lock_mode)
		ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);

	ptlrpc_req_finished(req);
	ll_intent_drop_lock(itp);

	/* We did open by fid, but by the time we got to the server,
	 * the object disappeared. If this is a create, we cannot really
	 * tell the userspace that the file it was trying to create
	 * does not exist. Instead let's return -ESTALE, and the VFS will
	 * retry the create with LOOKUP_REVAL that we are going to catch
	 * in ll_revalidate_dentry() and use lookup then.
	 */
	if (rc == -ENOENT && itp->it_op & IT_CREAT)
/**
 * Populate an obd_client_handle from the MDT reply carried in @it and
 * register it for open replay.  Returns md_set_open_replay_data() result.
 */
static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
		       struct obd_client_handle *och)
	struct mdt_body *body;

	body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
	/* Copy the server-assigned open handle, fid and lease lock handle. */
	och->och_fh = body->mbo_handle;
	och->och_fid = body->mbo_fid1;
	och->och_lease_handle.cookie = it->it_lock_handle;
	och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
	och->och_flags = it->it_flags;

	return md_set_open_replay_data(md_exp, och, it);
/**
 * Finish the client-local part of an open: optionally fill @och from the
 * intent reply, then attach @fd to @file and initialize its read-ahead
 * and cl_io context state.
 */
static int ll_local_open(struct file *file, struct lookup_intent *it,
			 struct ll_file_data *fd, struct obd_client_handle *och)
	struct inode *inode = file_inode(file);

	LASSERT(!LUSTRE_FPRIVATE(file));

	rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);

	LUSTRE_FPRIVATE(file) = fd;
	ll_readahead_init(inode, &fd->fd_ras);
	/* Remember only the access-mode bits of the open flags. */
	fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);

	/* ll_cl_context initialize */
	rwlock_init(&fd->fd_lock);
	INIT_LIST_HEAD(&fd->fd_lccs);
/* Open a file, and (for the very first open) create objects on the OSTs at
 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
 * creation or open until ll_lov_setstripe() ioctl is called.
 *
 * If we already have the stripe MD locally then we don't request it in
 * md_open(), by passing a lmm_size = 0.
 *
 * It is up to the application to ensure no other processes open this file
 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 * used.  We might be able to avoid races of that sort by getting lli_open_sem
 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 */
int ll_file_open(struct inode *inode, struct file *file)
	struct ll_inode_info *lli = ll_i2info(inode);
	struct lookup_intent *it, oit = { .it_op = IT_OPEN,
					  .it_flags = file->f_flags };
	struct obd_client_handle **och_p = NULL;
	__u64 *och_usecount = NULL;
	struct ll_file_data *fd;

	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
	       PFID(ll_inode2fid(inode)), inode, file->f_flags);

	/* An intent may have been stashed here by the lookup path. */
	it = file->private_data; /* XXX: compat macro */
	file->private_data = NULL; /* prevent ll_local_open assertion */

	fd = ll_file_data_get();
		GOTO(out_openerr, rc = -ENOMEM);

	if (S_ISDIR(inode->i_mode))
		ll_authorize_statahead(inode, fd);

	/* Opening the fs root needs no MDS open handle. */
	if (inode->i_sb->s_root == file_dentry(file)) {
		LUSTRE_FPRIVATE(file) = fd;

	if (!it || !it->it_disposition) {
		/* Convert f_flags into access mode. We cannot use file->f_mode,
		 * because everything but O_ACCMODE mask was stripped from
		if ((oit.it_flags + 1) & O_ACCMODE)
		if (file->f_flags & O_TRUNC)
			oit.it_flags |= FMODE_WRITE;

		/* kernel only call f_op->open in dentry_open.  filp_open calls
		 * dentry_open after call to open_namei that checks permissions.
		 * Only nfsd_open call dentry_open directly without checking
		 * permissions and because of that this code below is safe. */
		if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
			oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;

		/* We do not want O_EXCL here, presumably we opened the file
		 * already? XXX - NFS implications? */
		oit.it_flags &= ~O_EXCL;

		/* bug20584, if "it_flags" contains O_CREAT, the file will be
		 * created if necessary, then "IT_CREAT" should be set to keep
		 * consistent with it */
		if (oit.it_flags & O_CREAT)
			oit.it_op |= IT_CREAT;

	/* Let's see if we have file open on MDS already. */
	if (it->it_flags & FMODE_WRITE) {
		och_p = &lli->lli_mds_write_och;
		och_usecount = &lli->lli_open_fd_write_count;
	} else if (it->it_flags & FMODE_EXEC) {
		och_p = &lli->lli_mds_exec_och;
		och_usecount = &lli->lli_open_fd_exec_count;
		och_p = &lli->lli_mds_read_och;
		och_usecount = &lli->lli_open_fd_read_count;

	mutex_lock(&lli->lli_och_mutex);
	if (*och_p) { /* Open handle is present */
		if (it_disposition(it, DISP_OPEN_OPEN)) {
			/* Well, there's extra open request that we do not need,
			   let's close it somehow. This will decref request. */
			rc = it_open_error(DISP_OPEN_OPEN, it);
				mutex_unlock(&lli->lli_och_mutex);
				GOTO(out_openerr, rc);

			ll_release_openhandle(file_dentry(file), it);

		/* Reuse the existing handle; no new och needed. */
		rc = ll_local_open(file, it, fd, NULL);
			mutex_unlock(&lli->lli_och_mutex);
			GOTO(out_openerr, rc);

		LASSERT(*och_usecount == 0);
		if (!it->it_disposition) {
			struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
			/* We cannot just request lock handle now, new ELC code
			   means that one of other OPEN locks for this file
			   could be cancelled, and since blocking ast handler
			   would attempt to grab och_mutex as well, that would
			   result in a deadlock */
			mutex_unlock(&lli->lli_och_mutex);
			 * Normally called under two situations:
			 * 2. A race/condition on MDS resulting in no open
			 *    handle to be returned from LOOKUP|OPEN request,
			 *    for example if the target entry was a symlink.
			 *
			 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
			 * marked by a bit set in ll_iget_for_nfs. Clear the
			 * bit so that it's not confusing later callers.
			 *
			 * NB; when ldd is NULL, it must have come via normal
			 * lookup path only, since ll_iget_for_nfs always calls
			if (ldd && ldd->lld_nfs_dentry) {
				ldd->lld_nfs_dentry = 0;
				it->it_flags |= MDS_OPEN_LOCK;

			 * Always specify MDS_OPEN_BY_FID because we don't want
			 * to get file with different fid.
			it->it_flags |= MDS_OPEN_BY_FID;
			rc = ll_intent_file_open(file, NULL, 0, it);
				GOTO(out_openerr, rc);

		OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
			GOTO(out_och_free, rc = -ENOMEM);

		/* md_intent_lock() didn't get a request ref if there was an
		 * open error, so don't do cleanup on the request here
		/* XXX (green): Should not we bail out on any error here, not
		 * just open error? */
		rc = it_open_error(DISP_OPEN_OPEN, it);
			GOTO(out_och_free, rc);

		LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
			 "inode %p: disposition %x, status %d\n", inode,
			 it_disposition(it, ~0), it->it_status);

		rc = ll_local_open(file, it, fd, *och_p);
			GOTO(out_och_free, rc);

	mutex_unlock(&lli->lli_och_mutex);

	/* Must do this outside lli_och_mutex lock to prevent deadlock where
	   different kind of OPEN lock for this same inode gets cancelled
	   by ldlm_cancel_lru */
	if (!S_ISREG(inode->i_mode))
		GOTO(out_och_free, rc);

	cl_lov_delay_create_clear(&file->f_flags);
	GOTO(out_och_free, rc);

	/* Error paths: free the och, drop statahead authorization, free fd. */
	if (och_p && *och_p) {
		OBD_FREE(*och_p, sizeof (struct obd_client_handle));
		*och_p = NULL; /* OBD_FREE writes some magic there */

	mutex_unlock(&lli->lli_och_mutex);

	if (lli->lli_opendir_key == fd)
		ll_deauthorize_statahead(inode, fd);

	ll_file_data_put(fd);

	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);

	/* Drop the request reference taken by the intent enqueue. */
	if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
		ptlrpc_req_finished(it->it_request);
		it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/**
 * Blocking AST for lease locks: on conflict, asynchronously cancel the
 * lease lock.  Unlike ll_md_blocking_ast, this does not touch the open
 * handle (see comments at the ll_lease_open() call site).
 */
static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
				    struct ldlm_lock_desc *desc, void *data, int flag)
	struct lustre_handle lockh;

	case LDLM_CB_BLOCKING:
		ldlm_lock2handle(lock, &lockh);
		rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
			CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);

	case LDLM_CB_CANCELING:
 * When setting a lease on a file, we take ownership of the lli_mds_*_och
 * and save it as fd->fd_och so as to force client to reopen the file even
 * if it has an open lock in cache already.
 */
static int ll_lease_och_acquire(struct inode *inode, struct file *file,
				struct lustre_handle *old_handle)
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct obd_client_handle **och_p;

	/* Get the openhandle of the file */
	mutex_lock(&lli->lli_och_mutex);
	/* Only one lease per file descriptor. */
	if (fd->fd_lease_och != NULL)
		GOTO(out_unlock, rc = -EBUSY);

	if (fd->fd_och == NULL) {
		if (file->f_mode & FMODE_WRITE) {
			LASSERT(lli->lli_mds_write_och != NULL);
			och_p = &lli->lli_mds_write_och;
			och_usecount = &lli->lli_open_fd_write_count;
			LASSERT(lli->lli_mds_read_och != NULL);
			och_p = &lli->lli_mds_read_och;
			och_usecount = &lli->lli_open_fd_read_count;

		/* Can't take ownership while other opens share the handle. */
		if (*och_usecount > 1)
			GOTO(out_unlock, rc = -EBUSY);

	/* Hand the existing open handle back so the MDT can match owners. */
	*old_handle = fd->fd_och->och_fh;

	mutex_unlock(&lli->lli_och_mutex);
 * Release ownership on lli_mds_*_och when putting back a file lease.
 */
static int ll_lease_och_release(struct inode *inode, struct file *file)
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct obd_client_handle **och_p;
	struct obd_client_handle *old_och = NULL;

	mutex_lock(&lli->lli_och_mutex);
	/* Select the per-mode handle slot matching this open. */
	if (file->f_mode & FMODE_WRITE) {
		och_p = &lli->lli_mds_write_och;
		och_usecount = &lli->lli_open_fd_write_count;
		och_p = &lli->lli_mds_read_och;
		och_usecount = &lli->lli_open_fd_read_count;

	/* The file may have been open by another process (broken lease) so
	 * *och_p is not NULL. In this case we should simply increase usecount
	if (*och_p != NULL) {
		old_och = fd->fd_och;

	mutex_unlock(&lli->lli_och_mutex);

	/* Close the superseded handle outside the mutex. */
	rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
 * Acquire a lease and open the file.
 */
static struct obd_client_handle *
ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
	struct lookup_intent it = { .it_op = IT_OPEN };
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct md_op_data *op_data;
	struct ptlrpc_request *req = NULL;
	struct lustre_handle old_handle = { 0 };
	struct obd_client_handle *och = NULL;

	/* Leases are only defined for plain read or plain write mode. */
	if (fmode != FMODE_WRITE && fmode != FMODE_READ)
		RETURN(ERR_PTR(-EINVAL));

	/* The requested lease mode must be covered by how the file is open. */
	if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
		RETURN(ERR_PTR(-EPERM));

	rc = ll_lease_och_acquire(inode, file, &old_handle);

		RETURN(ERR_PTR(-ENOMEM));

	op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
				     LUSTRE_OPC_ANY, NULL);
		GOTO(out, rc = PTR_ERR(op_data));

	/* To tell the MDT this openhandle is from the same owner */
	op_data->op_handle = old_handle;

	it.it_flags = fmode | open_flags;
	it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
	rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
			    &ll_md_blocking_lease_ast,
	/* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
	 * it can be cancelled which may mislead applications that the lease is
	 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
	 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
	 * doesn't deal with openhandle, so normal openhandle will be leaked. */
			    LDLM_FL_NO_LRU | LDLM_FL_EXCL);
	ll_finish_md_op_data(op_data);
	ptlrpc_req_finished(req);
		GOTO(out_release_it, rc);

	if (it_disposition(&it, DISP_LOOKUP_NEG))
		GOTO(out_release_it, rc = -ENOENT);

	rc = it_open_error(DISP_OPEN_OPEN, &it);
		GOTO(out_release_it, rc);

	LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
	ll_och_fill(sbi->ll_md_exp, &it, och);

	/* Older servers don't understand leases: bail out. */
	if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
		GOTO(out_close, rc = -EOPNOTSUPP);

	/* already get lease, handle lease lock */
	ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
	if (it.it_lock_mode == 0 ||
	    it.it_lock_bits != MDS_INODELOCK_OPEN) {
		/* open lock must return for lease */
		CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
		       PFID(ll_inode2fid(inode)), it.it_lock_mode,
		GOTO(out_close, rc = -EPROTO);

	ll_intent_release(&it);

	/* Error cleanup: cancel any open lock and close the open handle. */
	/* Cancel open lock */
	if (it.it_lock_mode != 0) {
		ldlm_lock_decref_and_cancel(&och->och_lease_handle,
		och->och_lease_handle.cookie = 0ULL;

	rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
		CERROR("%s: error closing file "DFID": %d\n",
		       ll_get_fsname(inode->i_sb, NULL, 0),
		       PFID(&ll_i2info(inode)->lli_fid), rc2);
	och = NULL; /* och has been freed in ll_close_inode_openhandle() */

	ll_intent_release(&it);
 * Check whether a layout swap can be done between two inodes.
 *
 * \param[in] inode1  First inode to check
 * \param[in] inode2  Second inode to check
 *
 * \retval 0 on success, layout swap can be performed between both inodes
 * \retval negative error code if requirements are not met
 */
static int ll_check_swap_layouts_validity(struct inode *inode1,
					  struct inode *inode2)
	/* Only regular files carry a layout. */
	if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))

	/* Caller must be allowed to write both files. */
	if (inode_permission(inode1, MAY_WRITE) ||
	    inode_permission(inode2, MAY_WRITE))

	/* Both inodes must live on the same filesystem. */
	if (inode1->i_sb != inode2->i_sb)
/**
 * Biased close that atomically swaps layouts between @inode and @inode2
 * as part of closing the lease open handle @och.
 */
static int ll_swap_layouts_close(struct obd_client_handle *och,
				 struct inode *inode, struct inode *inode2)
	const struct lu_fid *fid1 = ll_inode2fid(inode);
	const struct lu_fid *fid2;

	CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
	       ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));

	rc = ll_check_swap_layouts_validity(inode, inode2);
		GOTO(out_free_och, rc);

	/* We now know that inode2 is a lustre inode */
	fid2 = ll_inode2fid(inode2);

	/* Swapping a file with itself makes no sense. */
	rc = lu_fid_cmp(fid1, fid2);
		GOTO(out_free_och, rc = -EINVAL);

	/* Close the file and swap layouts between inode & inode2.
	 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
	 * because we still need it to pack l_remote_handle to MDT. */
	rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
	och = NULL; /* freed in ll_close_inode_openhandle() */
 * Release lease and close the file.
 * It will check if the lease has ever broken.
 */
static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
	struct ldlm_lock *lock;
	bool cancelled = true;

	/* Inspect the lease lock to learn whether it was already cancelled
	 * (i.e. the lease was broken by a conflicting access). */
	lock = ldlm_handle2lock(&och->och_lease_handle);
		lock_res_and_lock(lock);
		cancelled = ldlm_is_cancel(lock);
		unlock_res_and_lock(lock);

	CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
	       PFID(&ll_i2info(inode)->lli_fid), cancelled);

		ldlm_cli_cancel(&och->och_lease_handle, 0);

	if (lease_broken != NULL)
		*lease_broken = cancelled;

	rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/**
 * Merge inode attributes cached from the MDS with the sizes/timestamps
 * obtained from the OSTs (via the cl_object attr), under the inode size
 * lock, and write the result back into the VFS inode.
 */
int ll_merge_attr(const struct lu_env *env, struct inode *inode)
	struct ll_inode_info *lli = ll_i2info(inode);
	struct cl_object *obj = lli->lli_clob;
	struct cl_attr *attr = vvp_env_thread_attr(env);

	ll_inode_size_lock(inode);

	/* Merge timestamps the most recently obtained from MDS with
	 * timestamps obtained from OSTs.
	 *
	 * Do not overwrite atime of inode because it may be refreshed
	 * by file_accessed() function. If the read was served by cache
	 * data, there is no RPC to be sent so that atime may not be
	 * transferred to OSTs at all. MDT only updates atime at close time
	 * if it's at least 'mdd.*.atime_diff' older.
	 * All in all, the atime in Lustre does not strictly comply with
	 * POSIX. Solving this problem needs to send an RPC to MDT for each
	 * read, this will hurt performance. */
	if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
		LTIME_S(inode->i_atime) = lli->lli_atime;
		lli->lli_update_atime = 0;

	LTIME_S(inode->i_mtime) = lli->lli_mtime;
	LTIME_S(inode->i_ctime) = lli->lli_ctime;

	atime = LTIME_S(inode->i_atime);
	mtime = LTIME_S(inode->i_mtime);
	ctime = LTIME_S(inode->i_ctime);

	/* Pull size/blocks/timestamps from the OST-side cl_object. */
	cl_object_attr_lock(obj);
	rc = cl_object_attr_get(env, obj, attr);
	cl_object_attr_unlock(obj);

		GOTO(out_size_unlock, rc);

	/* Keep the newest of the MDS- and OST-supplied timestamps. */
	if (atime < attr->cat_atime)
		atime = attr->cat_atime;

	if (ctime < attr->cat_ctime)
		ctime = attr->cat_ctime;

	if (mtime < attr->cat_mtime)
		mtime = attr->cat_mtime;

	CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
	       PFID(&lli->lli_fid), attr->cat_size);

	i_size_write(inode, attr->cat_size);
	inode->i_blocks = attr->cat_blocks;

	LTIME_S(inode->i_atime) = atime;
	LTIME_S(inode->i_mtime) = mtime;
	LTIME_S(inode->i_ctime) = ctime;

	ll_inode_size_unlock(inode);
/**
 * Decide whether atime updates should be suppressed for @file,
 * mirroring the checks done by the kernel's file_accessed()/touch_atime().
 */
static bool file_is_noatime(const struct file *file)
	const struct vfsmount *mnt = file->f_path.mnt;
	const struct inode *inode = file_inode((struct file *)file);

	/* Adapted from file_accessed() and touch_atime().*/
	if (file->f_flags & O_NOATIME)

	if (inode->i_flags & S_NOATIME)

	if (IS_NOATIME(inode))

	if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))

	if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))

	if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/**
 * Initialize a cl_io for a read (@write == 0) or write (@write != 0)
 * on @file: nonblock/append/sync flags, lock policy and atime handling.
 */
static void ll_io_init(struct cl_io *io, const struct file *file, int write)
	struct inode *inode = file_inode((struct file *)file);

	io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
		io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
		/* O_SYNC/O_DIRECT writes must reach stable storage. */
		io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
				      file->f_flags & O_DIRECT ||

	io->ci_obj = ll_i2info(inode)->lli_clob;
	io->ci_lockreq = CILR_MAYBE;
	if (ll_file_nolock(file)) {
		/* nolock mount option: no DLM extent locking at all. */
		io->ci_lockreq = CILR_NEVER;
		io->ci_no_srvlock = 1;
	} else if (file->f_flags & O_APPEND) {
		/* Appends need a mandatory lock to serialize EOF writers. */
		io->ci_lockreq = CILR_MANDATORY;

	io->ci_noatime = file_is_noatime(file);
/*
 * Common read/write engine for normal and splice I/O: sets up the cl_io,
 * takes the per-inode range lock where required, runs the cl_io loop
 * (restarting when layout changes demand it) and updates stats and the
 * fd's write-failure state.
 */
ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
		   struct file *file, enum cl_io_type iot,
		   loff_t *ppos, size_t count)
	struct vvp_io *vio = vvp_env_io(env);
	struct inode *inode = file_inode(file);
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct range_lock range;

	CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: %llu, count: %zu\n",
	       file_dentry(file)->d_name.name, iot, *ppos, count);

	io = vvp_env_thread_io(env);
	ll_io_init(io, file, iot == CIT_WRITE);

	if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
		bool range_locked = false;

		/* O_APPEND writes don't know their final offset yet, so
		 * lock to EOF; otherwise lock just the affected range. */
		if (file->f_flags & O_APPEND)
			range_lock_init(&range, 0, LUSTRE_EOF);
			range_lock_init(&range, *ppos, *ppos + count - 1);

		vio->vui_fd  = LUSTRE_FPRIVATE(file);
		vio->vui_io_subtype = args->via_io_subtype;

		switch (vio->vui_io_subtype) {
			vio->vui_iter = args->u.normal.via_iter;
			vio->vui_iocb = args->u.normal.via_iocb;
			/* Direct IO reads must also take range lock,
			 * or multiple reads will try to work on the same pages
			 * See LU-6227 for details. */
			if (((iot == CIT_WRITE) ||
			    (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
			    !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
				CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
				rc = range_lock(&lli->lli_write_tree, &range);

				range_locked = true;
			vio->u.splice.vui_pipe = args->u.splice.via_pipe;
			vio->u.splice.vui_flags = args->u.splice.via_flags;
			CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);

		ll_cl_add(file, env, io, LCC_RW);
		rc = cl_io_loop(env, io);
		ll_cl_remove(file, env);

			CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
			range_unlock(&lli->lli_write_tree, &range);

		/* cl_io_rw_init() handled IO */

	if (io->ci_nob > 0) {
		result += io->ci_nob;
		count -= io->ci_nob;
		*ppos = io->u.ci_wr.wr.crw_pos; /* for splice */

		/* prepare IO restart */
		if (count > 0 && args->via_io_subtype == IO_NORMAL)
			args->u.normal.via_iter = vio->vui_iter;

	cl_io_fini(env, io);

	/* A layout change may force a restart of the remaining I/O. */
	if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
		       "%s: restart %s from %lld, count:%zu, result: %zd\n",
		       file_dentry(file)->d_name.name,
		       iot == CIT_READ ? "read" : "write",
		       *ppos, count, result);

	if (iot == CIT_READ) {
			ll_stats_ops_tally(ll_i2sbi(inode),
					   LPROC_LL_READ_BYTES, result);
	} else if (iot == CIT_WRITE) {
			ll_stats_ops_tally(ll_i2sbi(inode),
					   LPROC_LL_WRITE_BYTES, result);
			fd->fd_write_failed = false;
		} else if (result == 0 && rc == 0) {
				fd->fd_write_failed = true;
				fd->fd_write_failed = false;
		} else if (rc != -ERESTARTSYS) {
			/* Remember the failure so fsync/close can report it. */
			fd->fd_write_failed = true;

	CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);

	RETURN(result > 0 ? result : rc);
1232 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1233 * especially for small I/O.
1235 * To serve a read request, CLIO has to create and initialize a cl_io and
1236 * then request DLM lock. This has turned out to have siginificant overhead
1237 * and affects the performance of small I/O dramatically.
1239 * It's not necessary to create a cl_io for each I/O. Under the help of read
1240 * ahead, most of the pages being read are already in memory cache and we can
1241 * read those pages directly because if the pages exist, the corresponding DLM
1242 * lock must exist so that page content must be valid.
1244 * In fast read implementation, the llite speculatively finds and reads pages
1245 * in memory cache. There are three scenarios for fast read:
1246 * - If the page exists and is uptodate, kernel VM will provide the data and
1247 * CLIO won't be involved;
1248 * - If the page was brought into memory by read ahead, it will be exported
1249 * and read ahead parameters will be updated;
1250 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1251 * it will go back and invoke normal read, i.e., a cl_io will be created
1252 * and DLM lock will be requested.
1254 * POSIX compliance: posix standard states that read is intended to be atomic.
1255 * Lustre read implementation is in line with Linux kernel read implementation
1256 * and neither of them complies with POSIX standard in this matter. Fast read
1257 * doesn't make the situation worse on single node but it may interleave write
1258 * results from multiple nodes due to short read handling in ll_file_aio_read().
1260 * \param env - lu_env
1261 * \param iocb - kiocb from kernel
1262 * \param iter - user space buffers where the data will be copied
1264 * \retval - number of bytes have been read, or error code if error occurred.
/*
 * Fast read: satisfy the read straight from the page cache via
 * generic_file_read_iter(), skipping cl_io creation (see the design
 * comment above).  NOTE(review): the text here is elided — declarations,
 * braces and RETURN statements are missing between the visible lines.
 */
1267 ll_do_fast_read(const struct lu_env *env, struct kiocb *iocb,
1268 struct iov_iter *iter)
/* fast read must be enabled for this superblock */
1272 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1275 /* NB: we can't do direct IO for fast read because it will need a lock
1276 * to make IO engine happy. */
1277 if (iocb->ki_filp->f_flags & O_DIRECT)
/* bracket the generic read with ll_cl_add/remove (NULL cl_io marks it as
 * a fast read for ll_readpage()) */
1280 ll_cl_add(iocb->ki_filp, env, NULL, LCC_RW);
1281 result = generic_file_read_iter(iocb, iter);
1282 ll_cl_remove(iocb->ki_filp, env);
1284 /* If the first page is not in cache, generic_file_aio_read() will
1285 * return -ENODATA.
1286 * See corresponding code in ll_readpage(). */
1287 if (result == -ENODATA)
/* account bytes read in the per-superblock stats */
1291 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1292 LPROC_LL_READ_BYTES, result);
1298 * Read from a file (through the page cache).
/*
 * ->read_iter() entry point: try the fast (page-cache-only) path first,
 * then fall back to the generic CLIO read for whatever remains in @to.
 * NOTE(review): source elided between visible lines; result/rc2
 * combination logic is not fully visible here.
 */
1300 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1303 struct vvp_io_args *args;
1308 env = cl_env_get(&refcheck);
1310 return PTR_ERR(env);
/* speculative page-cache read; may consume part of @to */
1312 result = ll_do_fast_read(env, iocb, to);
1313 if (result < 0 || iov_iter_count(to) == 0)
/* slow path: full CLIO read for the remaining bytes */
1316 args = ll_env_args(env, IO_NORMAL);
1317 args->u.normal.via_iter = to;
1318 args->u.normal.via_iocb = iocb;
1320 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1321 &iocb->ki_pos, iov_iter_count(to));
1324 else if (result == 0)
1328 cl_env_put(env, &refcheck);
1333 * Write to a file (through the page cache).
/*
 * ->write_iter() entry point: route the whole write through the generic
 * CLIO path (no fast path for writes).  NOTE(review): source elided
 * between visible lines.
 */
1335 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1337 struct vvp_io_args *args;
1342 env = cl_env_get(&refcheck);
1344 return PTR_ERR(env);
1346 args = ll_env_args(env, IO_NORMAL);
1347 args->u.normal.via_iter = from;
1348 args->u.normal.via_iocb = iocb;
1350 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1351 &iocb->ki_pos, iov_iter_count(from));
1352 cl_env_put(env, &refcheck);
1356 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1358 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array and compute the total byte count, mirroring the
 * kernel's __generic_file_aio_write_nolock() checks (see comment above).
 * On a partially inaccessible segment the vector is truncated at the bad
 * segment.  NOTE(review): source elided between visible lines.
 */
1360 static int ll_file_get_iov_count(const struct iovec *iov,
1361 unsigned long *nr_segs, size_t *count)
1366 for (seg = 0; seg < *nr_segs; seg++) {
1367 const struct iovec *iv = &iov[seg];
1370 * If any segment has a negative length, or the cumulative
1371 * length ever wraps negative then return -EINVAL.
1374 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
/* readable segment: accepted; otherwise fall through to truncation */
1376 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1381 cnt -= iv->iov_len; /* This segment is no good */
/*
 * Legacy aio_read entry (pre read_iter kernels): wrap the iovec array in
 * an iov_iter and delegate to ll_file_read_iter().  NOTE(review): source
 * elided between visible lines.
 */
1388 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1389 unsigned long nr_segs, loff_t pos)
1396 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
/* iov_iter_init() signature differs across kernel versions */
1400 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1401 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1402 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1403 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1404 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1406 result = ll_file_read_iter(iocb, &to);
/*
 * Legacy ->read() entry: build a synchronous kiocb and a single-segment
 * iovec, delegate to ll_file_aio_read(), then propagate the updated file
 * position back to *ppos.  NOTE(review): source elided between lines.
 */
1411 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1415 struct iovec iov = { .iov_base = buf, .iov_len = count };
1416 struct kiocb *kiocb;
1421 env = cl_env_get(&refcheck);
1423 RETURN(PTR_ERR(env));
1425 kiocb = &ll_env_info(env)->lti_kiocb;
1426 init_sync_kiocb(kiocb, file);
1427 kiocb->ki_pos = *ppos;
/* the field holding the residual count is kernel-version dependent */
1428 #ifdef HAVE_KIOCB_KI_LEFT
1429 kiocb->ki_left = count;
1430 #elif defined(HAVE_KI_NBYTES)
1431 kiocb->ki_nbytes = count;
1434 result = ll_file_aio_read(kiocb, &iov, 1, kiocb->ki_pos);
1435 *ppos = kiocb->ki_pos;
1437 cl_env_put(env, &refcheck);
1442 * Write to a file (through the page cache).
/*
 * Legacy aio_write entry (pre write_iter kernels): wrap the iovec array
 * in an iov_iter and delegate to ll_file_write_iter().  NOTE(review):
 * source elided between visible lines.
 */
1445 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1446 unsigned long nr_segs, loff_t pos)
1448 struct iov_iter from;
1453 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
/* iov_iter_init() signature differs across kernel versions */
1457 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1458 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1459 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1460 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1461 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1463 result = ll_file_write_iter(iocb, &from);
/*
 * Legacy ->write() entry: mirror image of ll_file_read() — synchronous
 * kiocb plus single-segment iovec, delegated to ll_file_aio_write().
 * NOTE(review): source elided between visible lines.
 */
1468 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1469 size_t count, loff_t *ppos)
1472 struct iovec iov = { .iov_base = (void __user *)buf,
1474 struct kiocb *kiocb;
1479 env = cl_env_get(&refcheck);
1481 RETURN(PTR_ERR(env));
1483 kiocb = &ll_env_info(env)->lti_kiocb;
1484 init_sync_kiocb(kiocb, file);
1485 kiocb->ki_pos = *ppos;
/* the field holding the residual count is kernel-version dependent */
1486 #ifdef HAVE_KIOCB_KI_LEFT
1487 kiocb->ki_left = count;
1488 #elif defined(HAVE_KI_NBYTES)
1489 kiocb->ki_nbytes = count;
1492 result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos);
1493 *ppos = kiocb->ki_pos;
1495 cl_env_put(env, &refcheck);
1498 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1501 * Send file content (through pagecache) somewhere with helper
/*
 * ->splice_read() entry: run a CIT_READ through the generic CLIO path
 * with IO_SPLICE args so the data lands in @pipe.  NOTE(review): source
 * elided between visible lines.
 */
1503 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1504 struct pipe_inode_info *pipe, size_t count,
1508 struct vvp_io_args *args;
1513 env = cl_env_get(&refcheck);
1515 RETURN(PTR_ERR(env));
1517 args = ll_env_args(env, IO_SPLICE);
1518 args->u.splice.via_pipe = pipe;
1519 args->u.splice.via_flags = flags;
1521 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1522 cl_env_put(env, &refcheck);
/*
 * Apply striping (LOV EA) to a file by re-opening it by FID with the
 * layout attached to the open intent, under the inode size lock; the
 * transient open handle is released immediately.  NOTE(review): source
 * elided between visible lines (intent setup and error path partially
 * hidden).
 */
1526 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1527 __u64 flags, struct lov_user_md *lum,
1530 struct lookup_intent oit = {
1532 .it_flags = flags | MDS_OPEN_BY_FID,
1537 ll_inode_size_lock(inode);
1538 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1540 GOTO(out_unlock, rc);
/* the open was only needed to carry the layout — close it right away */
1542 ll_release_openhandle(file_dentry(file), &oit);
1545 ll_inode_size_unlock(inode);
1546 ll_intent_release(&oit);
/* clear O_LOV_DELAY_CREATE now that the layout is set */
1547 cl_lov_delay_create_clear(&file->f_flags);
/*
 * Fetch the LOV EA (striping information) for @filename from the MDS via
 * md_getattr_name(), validate the magic, and byte-swap to host endianness
 * on big-endian hosts before handing it back through *lmmp/*lmm_size.
 * The caller presumably releases *request — TODO confirm against callers.
 * NOTE(review): source elided between visible lines.
 */
1552 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1553 struct lov_mds_md **lmmp, int *lmm_size,
1554 struct ptlrpc_request **request)
1556 struct ll_sb_info *sbi = ll_i2sbi(inode);
1557 struct mdt_body *body;
1558 struct lov_mds_md *lmm = NULL;
1559 struct ptlrpc_request *req = NULL;
1560 struct md_op_data *op_data;
1563 rc = ll_get_default_mdsize(sbi, &lmmsize);
1567 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1568 strlen(filename), lmmsize,
1569 LUSTRE_OPC_ANY, NULL);
1570 if (IS_ERR(op_data))
1571 RETURN(PTR_ERR(op_data));
1573 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1574 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1575 ll_finish_md_op_data(op_data);
1577 CDEBUG(D_INFO, "md_getattr_name failed "
1578 "on %s: rc %d\n", filename, rc);
1582 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1583 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1585 lmmsize = body->mbo_eadatasize;
/* no striping EA present on this object */
1587 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1589 GOTO(out, rc = -ENODATA);
1592 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1593 LASSERT(lmm != NULL);
/* only plain v1/v3 and composite layouts are understood here */
1595 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
1596 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
1597 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
1598 GOTO(out, rc = -EPROTO);
1601 * This is coming from the MDS, so is probably in
1602 * little endian. We convert it to host endian before
1603 * passing it to userspace.
1605 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1608 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
1609 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1610 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1611 if (le32_to_cpu(lmm->lmm_pattern) &
1612 LOV_PATTERN_F_RELEASED)
1616 /* if function called for directory - we should
1617 * avoid swabbing non-existent lsm objects */
1618 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1619 lustre_swab_lov_user_md_v1(
1620 (struct lov_user_md_v1 *)lmm);
1621 if (S_ISREG(body->mbo_mode))
1622 lustre_swab_lov_user_md_objects(
1623 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1625 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1626 lustre_swab_lov_user_md_v3(
1627 (struct lov_user_md_v3 *)lmm);
1628 if (S_ISREG(body->mbo_mode))
1629 lustre_swab_lov_user_md_objects(
1630 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1632 } else if (lmm->lmm_magic ==
1633 cpu_to_le32(LOV_MAGIC_COMP_V1)) {
1634 lustre_swab_lov_comp_md_v1(
1635 (struct lov_comp_md_v1 *)lmm);
1641 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA: admin-only path that copies a lov_user_md (plus one
 * ost_data entry) from userspace and applies it via
 * ll_lov_setstripe_ea_info().  NOTE(review): source elided between lines.
 */
1646 static int ll_lov_setea(struct inode *inode, struct file *file,
1649 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1650 struct lov_user_md *lump;
1651 int lum_size = sizeof(struct lov_user_md) +
1652 sizeof(struct lov_user_ost_data);
/* setting explicit objects requires admin capability */
1656 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1659 OBD_ALLOC_LARGE(lump, lum_size);
1663 if (copy_from_user(lump, (struct lov_user_md __user *)arg, lum_size))
1664 GOTO(out_lump, rc = -EFAULT);
1666 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1669 OBD_FREE_LARGE(lump, lum_size);
/*
 * LL_IOC_LOV_GETSTRIPE helper: copy the file's layout to the userspace
 * buffer @lum via cl_object_getstripe().  NOTE(review): source elided
 * between visible lines.
 */
1673 static int ll_file_getstripe(struct inode *inode,
1674 struct lov_user_md __user *lum)
1681 env = cl_env_get(&refcheck);
1683 RETURN(PTR_ERR(env));
1685 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum);
1686 cl_env_put(env, &refcheck);
/*
 * LL_IOC_LOV_SETSTRIPE: copy the user's layout request into a kernel
 * buffer (ll_copy_user_md allocates klum) and apply it.  NOTE(review):
 * source elided between visible lines (lum_size assignment not shown).
 */
1690 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1693 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1694 struct lov_user_md *klum;
1696 __u64 flags = FMODE_WRITE;
1699 rc = ll_copy_user_md(lum, &klum);
1704 rc = ll_lov_setstripe_ea_info(inode, file, flags, klum, lum_size);
1706 OBD_FREE(klum, lum_size);
/*
 * LL_IOC_GROUP_LOCK: take a group lock (gid == @arg) on the file.  The
 * fd's flags/grouplock fields are updated under lli_lock; a racing
 * second locker is detected after the (sleepable) cl_get_grouplock()
 * call.  NOTE(review): source elided between visible lines.
 */
1711 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1713 struct ll_inode_info *lli = ll_i2info(inode);
1714 struct cl_object *obj = lli->lli_clob;
1715 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1716 struct ll_grouplock grouplock;
1721 CWARN("group id for group lock must not be 0\n");
1725 if (ll_file_nolock(file))
1726 RETURN(-EOPNOTSUPP);
/* only one group lock per fd */
1728 spin_lock(&lli->lli_lock);
1729 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1730 CWARN("group lock already existed with gid %lu\n",
1731 fd->fd_grouplock.lg_gid);
1732 spin_unlock(&lli->lli_lock);
1735 LASSERT(fd->fd_grouplock.lg_lock == NULL);
1736 spin_unlock(&lli->lli_lock);
1739 * XXX: group lock needs to protect all OST objects while PFL
1740 * can add new OST objects during the IO, so we'd instantiate
1741 * all OST objects before getting its group lock.
1746 struct cl_layout cl = {
1747 .cl_is_composite = false,
1750 env = cl_env_get(&refcheck);
1752 RETURN(PTR_ERR(env));
1754 rc = cl_object_layout_get(env, obj, &cl);
/* composite (PFL) layout: instantiate the whole file range first */
1755 if (!rc && cl.cl_is_composite)
1756 rc = ll_layout_write_intent(inode, 0, OBD_OBJECT_EOF);
1758 cl_env_put(env, &refcheck);
1763 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
1764 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* re-check under the lock: someone may have won the race meanwhile */
1768 spin_lock(&lli->lli_lock);
1769 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1770 spin_unlock(&lli->lli_lock);
1771 CERROR("another thread just won the race\n");
1772 cl_put_grouplock(&grouplock);
1776 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1777 fd->fd_grouplock = grouplock;
1778 spin_unlock(&lli->lli_lock);
1780 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK: drop the group lock held on this fd, after
 * verifying that one is held and its gid matches @arg.  The fd state is
 * cleared under lli_lock before the (sleepable) cl_put_grouplock().
 * NOTE(review): source elided between visible lines.
 */
1784 static int ll_put_grouplock(struct inode *inode, struct file *file,
1787 struct ll_inode_info *lli = ll_i2info(inode);
1788 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1789 struct ll_grouplock grouplock;
1792 spin_lock(&lli->lli_lock);
1793 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1794 spin_unlock(&lli->lli_lock);
1795 CWARN("no group lock held\n");
1799 LASSERT(fd->fd_grouplock.lg_lock != NULL);
1801 if (fd->fd_grouplock.lg_gid != arg) {
1802 CWARN("group lock %lu doesn't match current id %lu\n",
1803 arg, fd->fd_grouplock.lg_gid);
1804 spin_unlock(&lli->lli_lock);
/* detach the grouplock from the fd before releasing it */
1808 grouplock = fd->fd_grouplock;
1809 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1810 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1811 spin_unlock(&lli->lli_lock);
1813 cl_put_grouplock(&grouplock);
1814 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1819 * Close inode open handle
1821 * \param dentry [in] dentry which contains the inode
1822 * \param it [in,out] intent which contains open info and result
1825 * \retval <0 failure
/*
 * Close the MDS open handle carried by @it (see doc comment above): skip
 * the root dentry and intents without DISP_OPEN_OPEN, otherwise fill an
 * obd_client_handle from the intent and close it.  NOTE(review): source
 * elided between visible lines.
 */
1827 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1829 struct inode *inode = dentry->d_inode;
1830 struct obd_client_handle *och;
1836 /* Root ? Do nothing. */
1837 if (dentry->d_inode->i_sb->s_root == dentry)
1840 /* No open handle to close? Move away */
1841 if (!it_disposition(it, DISP_OPEN_OPEN))
1844 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1846 OBD_ALLOC(och, sizeof(*och));
1848 GOTO(out, rc = -ENOMEM);
1850 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1852 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
1854 /* this one is in place of ll_file_open */
1855 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1856 ptlrpc_req_finished(it->it_request);
1857 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1863 * Get size for inode for which FIEMAP mapping is requested.
1864 * Make the FIEMAP get_info call and returns the result.
1865 * \param fiemap kernel buffer to hold extens
1866 * \param num_bytes kernel buffer size
/*
 * Execute a FIEMAP request (see doc comment above): validate flags,
 * honour FIEMAP_FLAG_SYNC by flushing dirty pages, glimpse the size if
 * unknown, and forward the request to the striping layer via
 * cl_object_fiemap().  NOTE(review): source elided between lines.
 */
1868 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
1874 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
1877 /* Checks for fiemap flags */
1878 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* report back which flags we do not support */
1879 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1883 /* Check for FIEMAP_FLAG_SYNC */
1884 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1885 rc = filemap_fdatawrite(inode->i_mapping);
1890 env = cl_env_get(&refcheck);
1892 RETURN(PTR_ERR(env));
/* size unknown locally: ask the OSTs (glimpse) before mapping */
1894 if (i_size_read(inode) == 0) {
1895 rc = ll_glimpse_size(inode);
1900 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1901 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
1902 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
1904 /* If filesize is 0, then there would be no objects for mapping */
1905 if (fmkey.lfik_oa.o_size == 0) {
1906 fiemap->fm_mapped_extents = 0;
1910 fmkey.lfik_fiemap = *fiemap;
1912 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
1913 &fmkey, fiemap, &num_bytes);
1915 cl_env_put(env, &refcheck);
/*
 * OBD_IOC_FID2PATH: resolve a FID to a path via the MDC.  Permission is
 * CAP_DAC_READ_SEARCH or the user_fid2path mount flag; the user-supplied
 * pathlen is bounded by PATH_MAX before sizing the reply buffer.
 * NOTE(review): source elided between visible lines.
 */
1919 int ll_fid2path(struct inode *inode, void __user *arg)
1921 struct obd_export *exp = ll_i2mdexp(inode);
1922 const struct getinfo_fid2path __user *gfin = arg;
1924 struct getinfo_fid2path *gfout;
1930 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1931 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1934 /* Only need to get the buflen */
1935 if (get_user(pathlen, &gfin->gf_pathlen))
/* bound the allocation driven by userspace */
1938 if (pathlen > PATH_MAX)
1941 outsize = sizeof(*gfout) + pathlen;
1942 OBD_ALLOC(gfout, outsize);
1946 if (copy_from_user(gfout, arg, sizeof(*gfout)))
1947 GOTO(gf_free, rc = -EFAULT);
1948 /* append root FID after gfout to let MDT know the root FID so that it
1949 * can lookup the correct path, this is mainly for fileset.
1950 * old server without fileset mount support will ignore this. */
1951 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
1953 /* Call mdc_iocontrol */
1954 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1958 if (copy_to_user(arg, gfout, outsize))
1962 OBD_FREE(gfout, outsize);
1967 * Read the data_version for inode.
1969 * This value is computed using stripe object version on OST.
1970 * Version is computed using server side locking.
1972 * @param flags if do sync on the OST side;
1974 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
1975 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/*
 * Fetch the file's data_version (see doc comment above) by running a
 * CIT_DATA_VERSION cl_io; @flags selects the OST-side flush behaviour.
 * NOTE(review): source elided between visible lines (restart loop and
 * return path partially hidden).
 */
1977 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
1979 struct cl_object *obj = ll_i2info(inode)->lli_clob;
1987 /* If no file object initialized, we consider its version is 0. */
1993 env = cl_env_get(&refcheck);
1995 RETURN(PTR_ERR(env));
1997 io = vvp_env_thread_io(env);
1999 io->u.ci_data_version.dv_data_version = 0;
2000 io->u.ci_data_version.dv_flags = flags;
2003 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2004 result = cl_io_loop(env, io);
2006 result = io->ci_result;
2008 *data_version = io->u.ci_data_version.dv_data_version;
2010 cl_io_fini(env, io);
/* layout changed mid-IO: the elided code presumably retries — confirm */
2012 if (unlikely(io->ci_need_restart))
2015 cl_env_put(env, &refcheck);
2021 * Trigger a HSM release request for the provided inode.
/*
 * HSM release: take a write lease, flush and grab the latest
 * data_version, merge attributes, then close the file with
 * MDS_HSM_RELEASE so the MDT can drop the OST objects.  NOTE(review):
 * source elided between visible lines.
 */
2023 int ll_hsm_release(struct inode *inode)
2026 struct obd_client_handle *och = NULL;
2027 __u64 data_version = 0;
2032 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2033 ll_get_fsname(inode->i_sb, NULL, 0),
2034 PFID(&ll_i2info(inode)->lli_fid));
/* an exclusive write lease guarantees no concurrent modification */
2036 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2038 GOTO(out, rc = PTR_ERR(och));
2040 /* Grab latest data_version and [am]time values */
2041 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2045 env = cl_env_get(&refcheck);
2047 GOTO(out, rc = PTR_ERR(env));
2049 ll_merge_attr(env, inode);
2050 cl_env_put(env, &refcheck);
2052 /* Release the file.
2053 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2054 * we still need it to pack l_remote_handle to MDT. */
2055 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2061 if (och != NULL && !IS_ERR(och)) /* close the file */
2062 ll_lease_close(och, inode, NULL);
/* Per-call state for ll_swap_layouts(): the two inodes plus their
 * data versions and check flags (dv/check fields elided from view),
 * kept together so the pair can be swapped into canonical FID order. */
2067 struct ll_swap_stack {
2070 struct inode *inode1;
2071 struct inode *inode2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS worker: swap the layouts of two files.  The
 * pair is ordered by FID to avoid lock inversion, optionally protected
 * by a group lock (gid != 0) to flush dirty cache, and data versions are
 * re-checked just before the MDT swap when requested.  NOTE(review):
 * source elided between visible lines.
 */
2076 static int ll_swap_layouts(struct file *file1, struct file *file2,
2077 struct lustre_swap_layouts *lsl)
2079 struct mdc_swap_layouts msl;
2080 struct md_op_data *op_data;
2083 struct ll_swap_stack *llss = NULL;
2086 OBD_ALLOC_PTR(llss);
2090 llss->inode1 = file_inode(file1);
2091 llss->inode2 = file_inode(file2);
2093 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2097 /* we use 2 bool because it is easier to swap than 2 bits */
2098 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2099 llss->check_dv1 = true;
2101 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2102 llss->check_dv2 = true;
2104 /* we cannot use lsl->sl_dvX directly because we may swap them */
2105 llss->dv1 = lsl->sl_dv1;
2106 llss->dv2 = lsl->sl_dv2;
2108 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2109 if (rc == 0) /* same file, done! */
/* canonical FID order prevents deadlock between concurrent swaps */
2112 if (rc < 0) { /* sequentialize it */
2113 swap(llss->inode1, llss->inode2);
2115 swap(llss->dv1, llss->dv2);
2116 swap(llss->check_dv1, llss->check_dv2);
2120 if (gid != 0) { /* application asks to flush dirty cache */
2121 rc = ll_get_grouplock(llss->inode1, file1, gid);
2125 rc = ll_get_grouplock(llss->inode2, file2, gid);
2127 ll_put_grouplock(llss->inode1, file1, gid);
2132 /* ultimate check, before swapping the layouts we check if
2133 * dataversion has changed (if requested) */
2134 if (llss->check_dv1) {
2135 rc = ll_data_version(llss->inode1, &dv, 0);
2138 if (dv != llss->dv1)
2139 GOTO(putgl, rc = -EAGAIN);
2142 if (llss->check_dv2) {
2143 rc = ll_data_version(llss->inode2, &dv, 0);
2146 if (dv != llss->dv2)
2147 GOTO(putgl, rc = -EAGAIN);
2150 /* struct md_op_data is used to send the swap args to the mdt
2151 * only flags is missing, so we use struct mdc_swap_layouts
2152 * through the md_op_data->op_data */
2153 /* flags from user space have to be converted before they are sent to
2154 * the server; no flag is sent today, they are only used on the client */
2157 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2158 0, LUSTRE_OPC_ANY, &msl);
2159 if (IS_ERR(op_data))
2160 GOTO(free, rc = PTR_ERR(op_data));
2162 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2163 sizeof(*op_data), op_data, NULL);
2164 ll_finish_md_op_data(op_data);
2171 ll_put_grouplock(llss->inode2, file2, gid);
2172 ll_put_grouplock(llss->inode1, file1, gid);
/*
 * LL_IOC_HSM_STATE_SET worker: validate the requested HSM flag masks and
 * archive id, then forward the state change to the MDT.  Non-root users
 * may only touch HSM_USER_MASK flags.  NOTE(review): source elided
 * between visible lines.
 */
2182 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2184 struct md_op_data *op_data;
2188 /* Detect out-of range masks */
2189 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2192 /* Non-root users are forbidden to set or clear flags which are
2193 * NOT defined in HSM_USER_MASK. */
2194 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2195 !cfs_capable(CFS_CAP_SYS_ADMIN))
2198 /* Detect out-of range archive id */
2199 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2200 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2203 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2204 LUSTRE_OPC_ANY, hss);
2205 if (IS_ERR(op_data))
2206 RETURN(PTR_ERR(op_data));
2208 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2209 sizeof(*op_data), op_data, NULL);
2211 ll_finish_md_op_data(op_data);
/*
 * LL_IOC_HSM_IMPORT worker: mark a regular file as archived+released on
 * the MDT, then restore the user-supplied mode/owner/size/timestamps via
 * ll_setattr_raw() so the file appears as it was at archive time.
 * NOTE(review): source elided between visible lines.
 */
2216 static int ll_hsm_import(struct inode *inode, struct file *file,
2217 struct hsm_user_import *hui)
2219 struct hsm_state_set *hss = NULL;
2220 struct iattr *attr = NULL;
2224 if (!S_ISREG(inode->i_mode))
2230 GOTO(out, rc = -ENOMEM);
/* step 1: flag the file archived/exists/released in HSM state */
2232 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2233 hss->hss_archive_id = hui->hui_archive_id;
2234 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2235 rc = ll_hsm_state_set(inode, hss);
2239 OBD_ALLOC_PTR(attr);
2241 GOTO(out, rc = -ENOMEM);
/* step 2: restore the recorded attributes; force S_IFREG */
2243 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2244 attr->ia_mode |= S_IFREG;
2245 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2246 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2247 attr->ia_size = hui->hui_size;
2248 attr->ia_mtime.tv_sec = hui->hui_mtime;
2249 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2250 attr->ia_atime.tv_sec = hui->hui_atime;
2251 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2253 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2254 ATTR_UID | ATTR_GID |
2255 ATTR_MTIME | ATTR_MTIME_SET |
2256 ATTR_ATIME | ATTR_ATIME_SET;
2260 rc = ll_setattr_raw(file_dentry(file), attr, true);
2264 inode_unlock(inode);
/* Map an open fmode to the LL_LEASE_{RD,WR}LCK bit mask reported to
 * userspace by the lease ioctls. */
2276 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2278 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2279 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * LL_IOC_FUTIMES_3: admin-only utimes variant that also sets ctime
 * (regular files only), implemented as a single ll_setattr_raw() under
 * the inode lock.  NOTE(review): source elided between visible lines.
 */
2282 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2284 struct inode *inode = file_inode(file);
2286 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2287 ATTR_MTIME | ATTR_MTIME_SET |
2288 ATTR_CTIME | ATTR_CTIME_SET,
2290 .tv_sec = lfu->lfu_atime_sec,
2291 .tv_nsec = lfu->lfu_atime_nsec,
2294 .tv_sec = lfu->lfu_mtime_sec,
2295 .tv_nsec = lfu->lfu_mtime_nsec,
2298 .tv_sec = lfu->lfu_ctime_sec,
2299 .tv_nsec = lfu->lfu_ctime_nsec,
/* setting ctime explicitly requires admin capability */
2305 if (!capable(CAP_SYS_ADMIN))
2308 if (!S_ISREG(inode->i_mode))
2312 rc = ll_setattr_raw(file_dentry(file), &ia, false);
2313 inode_unlock(inode);
2319 * Give file access advices
2321 * The ladvise interface is similar to Linux fadvise() system call, except it
2322 * forwards the advices directly from Lustre client to server. The server side
2323 * codes will apply appropriate read-ahead and caching techniques for the
2324 * corresponding files.
2326 * A typical workload for ladvise is e.g. a bunch of different clients are
2327 * doing small random reads of a file, so prefetching pages into OSS cache
2328 * with big linear reads before the random IO is a net benefit. Fetching
2329 * all that data into each client cache with fadvise() may not be, due to
2330 * much more data being sent to the client.
/*
 * Forward one ladvise hint to the server (see doc comment above) by
 * running a CIT_LADVISE cl_io over [lla_start, lla_end].  NOTE(review):
 * source elided between visible lines.
 */
2332 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2333 struct llapi_lu_ladvise *ladvise)
2337 struct cl_ladvise_io *lio;
2342 env = cl_env_get(&refcheck);
2344 RETURN(PTR_ERR(env));
2346 io = vvp_env_thread_io(env);
2347 io->ci_obj = ll_i2info(inode)->lli_clob;
2349 /* initialize parameters for ladvise */
2350 lio = &io->u.ci_ladvise;
2351 lio->li_start = ladvise->lla_start;
2352 lio->li_end = ladvise->lla_end;
2353 lio->li_fid = ll_inode2fid(inode);
2354 lio->li_advice = ladvise->lla_advice;
2355 lio->li_flags = flags;
2357 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2358 rc = cl_io_loop(env, io);
2362 cl_io_fini(env, io);
2363 cl_env_put(env, &refcheck);
/*
 * Handle the FSGETXATTR/FSSETXATTR project-id ioctls: GET copies the
 * user's fsxattr, fills in lli_projid and copies it back; SET (root
 * only) pushes the new projid to the MDT via md_setattr().
 * NOTE(review): source elided between visible lines.
 */
2367 int ll_ioctl_projid(struct inode *inode, unsigned int cmd,
2371 struct fsxattr *fsxattr;
2372 int alloc_size = sizeof(*fsxattr);
2375 case LL_IOC_FSGETXATTR: {
2376 OBD_ALLOC_PTR(fsxattr);
2377 if (fsxattr == NULL)
2380 if (copy_from_user(fsxattr,
2381 (const struct fsxattr __user *)arg,
2383 GOTO(out_fsxattr, rc = -EFAULT);
2385 fsxattr->fsx_projid = ll_i2info(inode)->lli_projid;
2386 if (copy_to_user((struct fsxattr __user *)arg,
2387 fsxattr, alloc_size))
2388 GOTO(out_fsxattr, rc = -EFAULT);
2390 OBD_FREE(fsxattr, alloc_size);
2394 case LL_IOC_FSSETXATTR: {
2395 struct md_op_data *op_data;
2396 struct ptlrpc_request *req = NULL;
2398 /* only root could change project ID */
2399 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2402 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2403 LUSTRE_OPC_ANY, NULL);
2404 if (IS_ERR(op_data))
2405 RETURN(PTR_ERR(op_data));
2407 OBD_ALLOC_PTR(fsxattr);
2408 if (fsxattr == NULL)
2409 GOTO(out_fsxattr1, rc = -ENOMEM);
2411 if (copy_from_user(fsxattr,
2412 (const struct fsxattr __user *)arg,
2414 GOTO(out_fsxattr1, rc = -EFAULT);
2416 op_data->op_projid = fsxattr->fsx_projid;
2417 op_data->op_attr.ia_valid |= MDS_ATTR_PROJID;
2418 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
2420 ptlrpc_req_finished(req);
2423 ll_finish_md_op_data(op_data);
2424 OBD_FREE(fsxattr, alloc_size);
2435 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2437 struct inode *inode = file_inode(file);
2438 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2442 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2443 PFID(ll_inode2fid(inode)), inode, cmd);
2444 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2446 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2447 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2451 case LL_IOC_GETFLAGS:
2452 /* Get the current value of the file flags */
2453 return put_user(fd->fd_flags, (int __user *)arg);
2454 case LL_IOC_SETFLAGS:
2455 case LL_IOC_CLRFLAGS:
2456 /* Set or clear specific file flags */
2457 /* XXX This probably needs checks to ensure the flags are
2458 * not abused, and to handle any flag side effects.
2460 if (get_user(flags, (int __user *) arg))
2463 if (cmd == LL_IOC_SETFLAGS) {
2464 if ((flags & LL_FILE_IGNORE_LOCK) &&
2465 !(file->f_flags & O_DIRECT)) {
2466 CERROR("%s: unable to disable locking on "
2467 "non-O_DIRECT file\n", current->comm);
2471 fd->fd_flags |= flags;
2473 fd->fd_flags &= ~flags;
2476 case LL_IOC_LOV_SETSTRIPE:
2477 RETURN(ll_lov_setstripe(inode, file, arg));
2478 case LL_IOC_LOV_SETEA:
2479 RETURN(ll_lov_setea(inode, file, arg));
2480 case LL_IOC_LOV_SWAP_LAYOUTS: {
2482 struct lustre_swap_layouts lsl;
2484 if (copy_from_user(&lsl, (char __user *)arg,
2485 sizeof(struct lustre_swap_layouts)))
2488 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
2491 file2 = fget(lsl.sl_fd);
2495 /* O_WRONLY or O_RDWR */
2496 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
2497 GOTO(out, rc = -EPERM);
2499 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
2500 struct inode *inode2;
2501 struct ll_inode_info *lli;
2502 struct obd_client_handle *och = NULL;
2504 if (lsl.sl_flags != SWAP_LAYOUTS_CLOSE)
2505 GOTO(out, rc = -EINVAL);
2507 lli = ll_i2info(inode);
2508 mutex_lock(&lli->lli_och_mutex);
2509 if (fd->fd_lease_och != NULL) {
2510 och = fd->fd_lease_och;
2511 fd->fd_lease_och = NULL;
2513 mutex_unlock(&lli->lli_och_mutex);
2515 GOTO(out, rc = -ENOLCK);
2516 inode2 = file_inode(file2);
2517 rc = ll_swap_layouts_close(och, inode, inode2);
2519 rc = ll_swap_layouts(file, file2, &lsl);
2525 case LL_IOC_LOV_GETSTRIPE:
2526 RETURN(ll_file_getstripe(inode,
2527 (struct lov_user_md __user *)arg));
2528 case FSFILT_IOC_GETFLAGS:
2529 case FSFILT_IOC_SETFLAGS:
2530 RETURN(ll_iocontrol(inode, file, cmd, arg));
2531 case FSFILT_IOC_GETVERSION_OLD:
2532 case FSFILT_IOC_GETVERSION:
2533 RETURN(put_user(inode->i_generation, (int __user *)arg));
2534 case LL_IOC_GROUP_LOCK:
2535 RETURN(ll_get_grouplock(inode, file, arg));
2536 case LL_IOC_GROUP_UNLOCK:
2537 RETURN(ll_put_grouplock(inode, file, arg));
2538 case IOC_OBD_STATFS:
2539 RETURN(ll_obd_statfs(inode, (void __user *)arg));
2541 /* We need to special case any other ioctls we want to handle,
2542 * to send them to the MDS/OST as appropriate and to properly
2543 * network encode the arg field.
2544 case FSFILT_IOC_SETVERSION_OLD:
2545 case FSFILT_IOC_SETVERSION:
2547 case LL_IOC_FLUSHCTX:
2548 RETURN(ll_flush_ctx(inode));
2549 case LL_IOC_PATH2FID: {
2550 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2551 sizeof(struct lu_fid)))
2556 case LL_IOC_GETPARENT:
2557 RETURN(ll_getparent(file, (struct getparent __user *)arg));
2559 case OBD_IOC_FID2PATH:
2560 RETURN(ll_fid2path(inode, (void __user *)arg));
2561 case LL_IOC_DATA_VERSION: {
2562 struct ioc_data_version idv;
2565 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
2568 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2569 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2572 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2578 case LL_IOC_GET_MDTIDX: {
2581 mdtidx = ll_get_mdt_idx(inode);
2585 if (put_user((int)mdtidx, (int __user *)arg))
2590 case OBD_IOC_GETDTNAME:
2591 case OBD_IOC_GETMDNAME:
2592 RETURN(ll_get_obd_name(inode, cmd, arg));
2593 case LL_IOC_HSM_STATE_GET: {
2594 struct md_op_data *op_data;
2595 struct hsm_user_state *hus;
2602 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2603 LUSTRE_OPC_ANY, hus);
2604 if (IS_ERR(op_data)) {
2606 RETURN(PTR_ERR(op_data));
2609 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2612 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2615 ll_finish_md_op_data(op_data);
2619 case LL_IOC_HSM_STATE_SET: {
2620 struct hsm_state_set *hss;
2627 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2632 rc = ll_hsm_state_set(inode, hss);
2637 case LL_IOC_HSM_ACTION: {
2638 struct md_op_data *op_data;
2639 struct hsm_current_action *hca;
2646 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2647 LUSTRE_OPC_ANY, hca);
2648 if (IS_ERR(op_data)) {
2650 RETURN(PTR_ERR(op_data));
2653 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2656 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2659 ll_finish_md_op_data(op_data);
2663 case LL_IOC_SET_LEASE: {
2664 struct ll_inode_info *lli = ll_i2info(inode);
2665 struct obd_client_handle *och = NULL;
2670 case LL_LEASE_WRLCK:
2671 if (!(file->f_mode & FMODE_WRITE))
2673 fmode = FMODE_WRITE;
2675 case LL_LEASE_RDLCK:
2676 if (!(file->f_mode & FMODE_READ))
2680 case LL_LEASE_UNLCK:
2681 mutex_lock(&lli->lli_och_mutex);
2682 if (fd->fd_lease_och != NULL) {
2683 och = fd->fd_lease_och;
2684 fd->fd_lease_och = NULL;
2686 mutex_unlock(&lli->lli_och_mutex);
2691 fmode = och->och_flags;
2692 rc = ll_lease_close(och, inode, &lease_broken);
2696 rc = ll_lease_och_release(inode, file);
2703 RETURN(ll_lease_type_from_fmode(fmode));
2708 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2710 /* apply for lease */
2711 och = ll_lease_open(inode, file, fmode, 0);
2713 RETURN(PTR_ERR(och));
2716 mutex_lock(&lli->lli_och_mutex);
2717 if (fd->fd_lease_och == NULL) {
2718 fd->fd_lease_och = och;
2721 mutex_unlock(&lli->lli_och_mutex);
2723 /* impossible now that only excl is supported for now */
2724 ll_lease_close(och, inode, &lease_broken);
2729 case LL_IOC_GET_LEASE: {
2730 struct ll_inode_info *lli = ll_i2info(inode);
2731 struct ldlm_lock *lock = NULL;
2734 mutex_lock(&lli->lli_och_mutex);
2735 if (fd->fd_lease_och != NULL) {
2736 struct obd_client_handle *och = fd->fd_lease_och;
2738 lock = ldlm_handle2lock(&och->och_lease_handle);
2740 lock_res_and_lock(lock);
2741 if (!ldlm_is_cancel(lock))
2742 fmode = och->och_flags;
2744 unlock_res_and_lock(lock);
2745 LDLM_LOCK_PUT(lock);
2748 mutex_unlock(&lli->lli_och_mutex);
2750 RETURN(ll_lease_type_from_fmode(fmode));
2752 case LL_IOC_HSM_IMPORT: {
2753 struct hsm_user_import *hui;
2759 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
2764 rc = ll_hsm_import(inode, file, hui);
2769 case LL_IOC_FUTIMES_3: {
2770 struct ll_futimes_3 lfu;
2772 if (copy_from_user(&lfu,
2773 (const struct ll_futimes_3 __user *)arg,
2777 RETURN(ll_file_futimes_3(file, &lfu));
2779 case LL_IOC_LADVISE: {
2780 struct llapi_ladvise_hdr *ladvise_hdr;
2783 int alloc_size = sizeof(*ladvise_hdr);
2786 OBD_ALLOC_PTR(ladvise_hdr);
2787 if (ladvise_hdr == NULL)
2790 if (copy_from_user(ladvise_hdr,
2791 (const struct llapi_ladvise_hdr __user *)arg,
2793 GOTO(out_ladvise, rc = -EFAULT);
2795 if (ladvise_hdr->lah_magic != LADVISE_MAGIC ||
2796 ladvise_hdr->lah_count < 1)
2797 GOTO(out_ladvise, rc = -EINVAL);
2799 num_advise = ladvise_hdr->lah_count;
2800 if (num_advise >= LAH_COUNT_MAX)
2801 GOTO(out_ladvise, rc = -EFBIG);
2803 OBD_FREE_PTR(ladvise_hdr);
2804 alloc_size = offsetof(typeof(*ladvise_hdr),
2805 lah_advise[num_advise]);
2806 OBD_ALLOC(ladvise_hdr, alloc_size);
2807 if (ladvise_hdr == NULL)
2811 * TODO: submit multiple advices to one server in a single RPC
2813 if (copy_from_user(ladvise_hdr,
2814 (const struct llapi_ladvise_hdr __user *)arg,
2816 GOTO(out_ladvise, rc = -EFAULT);
2818 for (i = 0; i < num_advise; i++) {
2819 rc = ll_ladvise(inode, file, ladvise_hdr->lah_flags,
2820 &ladvise_hdr->lah_advise[i]);
2826 OBD_FREE(ladvise_hdr, alloc_size);
2829 case LL_IOC_FSGETXATTR:
2830 case LL_IOC_FSSETXATTR:
2831 RETURN(ll_ioctl_projid(inode, cmd, arg));
2836 ll_iocontrol_call(inode, file, cmd, arg, &err))
2839 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2840 (void __user *)arg));
2845 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Compat helper for kernels without generic_file_llseek_size():
 * validate @offset against the file mode and @maxsize, then commit the
 * new position to file->f_pos (clearing f_version on change).
 */
2846 static inline loff_t
2847 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2849 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2851 if (offset > maxsize)
/* only rewrite f_pos when the position actually changes */
2854 if (offset != file->f_pos) {
2855 file->f_pos = offset;
2856 file->f_version = 0;
/*
 * Local fallback implementation of generic_file_llseek_size() for older
 * kernels.  Special-cases lseek(fd, 0, SEEK_CUR) position queries and
 * implements the SEEK_DATA/SEEK_HOLE conventions relative to @eof.
 */
2862 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2863 loff_t maxsize, loff_t eof)
2865 struct inode *inode = file_inode(file);
2873 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2874 * position-querying operation. Avoid rewriting the "same"
2875 * f_pos value back to the file because a concurrent read(),
2876 * write() or lseek() might have altered it
2881 * f_lock protects against read/modify/write race with other
2882 * SEEK_CURs. Note that parallel writes and reads behave
2886 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2887 inode_unlock(inode);
2891 * In the generic case the entire file is data, so as long as
2892 * offset isn't at the end of the file then the offset is data.
2899 * There is a virtual hole at the end of the file, so as long as
2900 * offset isn't i_size or larger, return i_size.
2908 return llseek_execute(file, offset, maxsize);
/*
 * llseek file operation.  For SEEK_END/SEEK_HOLE/SEEK_DATA the file
 * size must be current, so a glimpse RPC (ll_glimpse_size) is issued
 * before delegating to the generic llseek-size logic.
 */
2912 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2914 struct inode *inode = file_inode(file);
2915 loff_t retval, eof = 0;
2918 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2919 (origin == SEEK_CUR) ? file->f_pos : 0);
2920 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2921 PFID(ll_inode2fid(inode)), inode, retval, retval,
2923 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
/* these origins depend on an up-to-date i_size; refresh it first */
2925 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2926 retval = ll_glimpse_size(inode);
2929 eof = i_size_read(inode);
2932 retval = ll_generic_file_llseek_size(file, offset, origin,
2933 ll_file_maxbytes(inode), eof);
/*
 * flush file operation (called on close(2) of each file descriptor).
 * Reports any asynchronous writeback error recorded on the inode as
 * -EIO, clearing the stored error so it is reported only once; an
 * error already delivered via fd->fd_write_failed is not re-reported.
 */
2937 static int ll_flush(struct file *file, fl_owner_t id)
2939 struct inode *inode = file_inode(file);
2940 struct ll_inode_info *lli = ll_i2info(inode);
2941 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2944 LASSERT(!S_ISDIR(inode->i_mode));
2946 /* catch async errors that were recorded back when async writeback
2947 * failed for pages in this mapping. */
2948 rc = lli->lli_async_rc;
2949 lli->lli_async_rc = 0;
2950 if (lli->lli_clob != NULL) {
2951 err = lov_read_and_clear_async_rc(lli->lli_clob);
2956 /* The application has been told write failure already.
2957 * Do not report failure again. */
2958 if (fd->fd_write_failed)
2960 return rc ? -EIO : 0;
2964 * Called to make sure a portion of file has been written out.
2965 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2967 * Return how many pages have been written.
2969 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2970 enum cl_fsync_mode mode, int ignore_layout)
2974 struct cl_fsync_io *fio;
/* reject any mode outside the known cl_fsync_mode set */
2979 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2980 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2983 env = cl_env_get(&refcheck);
2985 RETURN(PTR_ERR(env));
2987 io = vvp_env_thread_io(env);
2988 io->ci_obj = ll_i2info(inode)->lli_clob;
2989 io->ci_ignore_layout = ignore_layout;
2991 /* initialize parameters for sync */
2992 fio = &io->u.ci_fsync;
2993 fio->fi_start = start;
2995 fio->fi_fid = ll_inode2fid(inode);
2996 fio->fi_mode = mode;
2997 fio->fi_nr_written = 0;
/* run the CIT_FSYNC io through the cl_io state machine */
2999 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3000 result = cl_io_loop(env, io);
3002 result = io->ci_result;
/* on success the result is the number of pages written */
3004 result = fio->fi_nr_written;
3005 cl_io_fini(env, io);
3006 cl_env_put(env, &refcheck);
3012 * When dentry is provided (the 'else' case), file_dentry() may be
3013 * null and dentry must be used directly rather than pulled from
3014 * file_dentry() as is done otherwise.
/*
 * fsync/fdatasync file operation.  The prototype differs by kernel
 * version (range-based 4-arg, 2-arg, or legacy dentry-based form).
 * Waits for page writeback, surfaces stored async write errors, syncs
 * metadata via md_fsync() and, for regular files, syncs OST data via
 * cl_sync_file_range(CL_FSYNC_ALL), tracking failure in fd_write_failed.
 */
3017 #ifdef HAVE_FILE_FSYNC_4ARGS
3018 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3020 struct dentry *dentry = file_dentry(file);
3021 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3022 int ll_fsync(struct file *file, int datasync)
3024 struct dentry *dentry = file_dentry(file);
3026 loff_t end = LLONG_MAX;
3028 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3031 loff_t end = LLONG_MAX;
3033 struct inode *inode = dentry->d_inode;
3034 struct ll_inode_info *lli = ll_i2info(inode);
3035 struct ptlrpc_request *req;
3039 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3040 PFID(ll_inode2fid(inode)), inode);
3041 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3043 #ifdef HAVE_FILE_FSYNC_4ARGS
3044 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3047 /* fsync's caller has already called _fdata{sync,write}, we want
3048 * that IO to finish before calling the osc and mdc sync methods */
3049 rc = filemap_fdatawait(inode->i_mapping);
3052 /* catch async errors that were recorded back when async writeback
3053 * failed for pages in this mapping. */
3054 if (!S_ISDIR(inode->i_mode)) {
3055 err = lli->lli_async_rc;
3056 lli->lli_async_rc = 0;
3059 if (lli->lli_clob != NULL) {
3060 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* sync metadata with the MDT */
3066 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3070 ptlrpc_req_finished(req);
/* regular files: also force out OST data and record write failure */
3072 if (S_ISREG(inode->i_mode)) {
3073 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3075 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3076 if (rc == 0 && err < 0)
3079 fd->fd_write_failed = true;
3081 fd->fd_write_failed = false;
3084 #ifdef HAVE_FILE_FSYNC_4ARGS
3085 inode_unlock(inode);
/*
 * flock/fcntl lock file operation.  Translates a kernel file_lock
 * (FL_FLOCK or FL_POSIX) into an LDLM_FLOCK enqueue on the MDT, then
 * mirrors the result into the local VFS lock state so the kernel's
 * lock bookkeeping matches the cluster-wide state.
 */
3091 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3093 struct inode *inode = file_inode(file);
3094 struct ll_sb_info *sbi = ll_i2sbi(inode);
3095 struct ldlm_enqueue_info einfo = {
3096 .ei_type = LDLM_FLOCK,
3097 .ei_cb_cp = ldlm_flock_completion_ast,
3098 .ei_cbdata = file_lock,
3100 struct md_op_data *op_data;
3101 struct lustre_handle lockh = { 0 };
3102 union ldlm_policy_data flock = { { 0 } };
3103 int fl_type = file_lock->fl_type;
3109 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3110 PFID(ll_inode2fid(inode)), file_lock);
3112 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3114 if (file_lock->fl_flags & FL_FLOCK) {
3115 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3116 /* flocks are whole-file locks */
3117 flock.l_flock.end = OFFSET_MAX;
3118 /* For flocks owner is determined by the local file desctiptor*/
3119 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3120 } else if (file_lock->fl_flags & FL_POSIX) {
3121 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3122 flock.l_flock.start = file_lock->fl_start;
3123 flock.l_flock.end = file_lock->fl_end;
3127 flock.l_flock.pid = file_lock->fl_pid;
3129 /* Somewhat ugly workaround for svc lockd.
3130 * lockd installs custom fl_lmops->lm_compare_owner that checks
3131 * for the fl_owner to be the same (which it always is on local node
3132 * I guess between lockd processes) and then compares pid.
3133 * As such we assign pid to the owner field to make it all work,
3134 * conflict with normal locks is unlikely since pid space and
3135 * pointer space for current->files are not intersecting */
3136 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3137 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* map the fcntl lock type onto an LDLM lock mode */
3141 einfo.ei_mode = LCK_PR;
3144 /* An unlock request may or may not have any relation to
3145 * existing locks so we may not be able to pass a lock handle
3146 * via a normal ldlm_lock_cancel() request. The request may even
3147 * unlock a byte range in the middle of an existing lock. In
3148 * order to process an unlock request we need all of the same
3149 * information that is given with a normal read or write record
3150 * lock request. To avoid creating another ldlm unlock (cancel)
3151 * message we'll treat a LCK_NL flock request as an unlock. */
3152 einfo.ei_mode = LCK_NL;
3155 einfo.ei_mode = LCK_PW;
3158 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* map the fcntl command onto enqueue flags (nowait / test-only) */
3173 flags = LDLM_FL_BLOCK_NOWAIT;
3179 flags = LDLM_FL_TEST_LOCK;
3182 CERROR("unknown fcntl lock command: %d\n", cmd);
3186 /* Save the old mode so that if the mode in the lock changes we
3187 * can decrement the appropriate reader or writer refcount. */
3188 file_lock->fl_type = einfo.ei_mode;
3190 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3191 LUSTRE_OPC_ANY, NULL);
3192 if (IS_ERR(op_data))
3193 RETURN(PTR_ERR(op_data));
3195 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
3196 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
3197 flock.l_flock.pid, flags, einfo.ei_mode,
3198 flock.l_flock.start, flock.l_flock.end);
3200 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
3203 /* Restore the file lock type if not TEST lock. */
3204 if (!(flags & LDLM_FL_TEST_LOCK))
3205 file_lock->fl_type = fl_type;
/* mirror the granted/released lock into the local VFS lock tables;
 * the API differs by kernel version */
3207 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
3208 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
3209 !(flags & LDLM_FL_TEST_LOCK))
3210 rc2 = locks_lock_file_wait(file, file_lock);
3212 if ((file_lock->fl_flags & FL_FLOCK) &&
3213 (rc == 0 || file_lock->fl_type == F_UNLCK))
3214 rc2 = flock_lock_file_wait(file, file_lock);
3215 if ((file_lock->fl_flags & FL_POSIX) &&
3216 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3217 !(flags & LDLM_FL_TEST_LOCK))
3218 rc2 = posix_lock_file_wait(file, file_lock);
3219 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
/* local bookkeeping failed: undo the server-side lock with LCK_NL */
3221 if (rc2 && file_lock->fl_type != F_UNLCK) {
3222 einfo.ei_mode = LCK_NL;
3223 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
3228 ll_finish_md_op_data(op_data);
/*
 * Look up the FID of @name under @parent via an MDT getattr-by-name
 * RPC.  On success *fid is filled in and, when @inode is non-NULL, the
 * child inode is instantiated through ll_prep_inode().
 */
3233 int ll_get_fid_by_name(struct inode *parent, const char *name,
3234 int namelen, struct lu_fid *fid,
3235 struct inode **inode)
3237 struct md_op_data *op_data = NULL;
3238 struct mdt_body *body;
3239 struct ptlrpc_request *req;
3243 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3244 LUSTRE_OPC_ANY, NULL);
3245 if (IS_ERR(op_data))
3246 RETURN(PTR_ERR(op_data));
/* only the FID and file type are needed from the reply */
3248 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
3249 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3250 ll_finish_md_op_data(op_data);
3254 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3256 GOTO(out_req, rc = -EFAULT);
3258 *fid = body->mbo_fid1;
3261 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
3263 ptlrpc_req_finished(req);
/*
 * Migrate directory entry @name under @parent to MDT @mdtidx (the
 * "lfs migrate" path for metadata).  Resolves the child inode (dcache
 * first, then by-name RPC), takes a write lease on regular files so
 * the data version can be pinned, and issues an MDS_RENAME with the
 * CLI_MIGRATE / MDS_RENAME_MIGRATE bias to move the entry.
 */
3267 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3268 const char *name, int namelen)
3270 struct dentry *dchild = NULL;
3271 struct inode *child_inode = NULL;
3272 struct md_op_data *op_data;
3273 struct ptlrpc_request *request = NULL;
3274 struct obd_client_handle *och = NULL;
3276 struct mdt_body *body;
3278 __u64 data_version = 0;
3281 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3282 name, PFID(ll_inode2fid(parent)), mdtidx);
3284 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3285 0, LUSTRE_OPC_ANY, NULL);
3286 if (IS_ERR(op_data))
3287 RETURN(PTR_ERR(op_data));
3289 /* Get child FID first */
3290 qstr.hash = full_name_hash(name, namelen);
3293 dchild = d_lookup(file_dentry(file), &qstr);
3294 if (dchild != NULL) {
3295 if (dchild->d_inode != NULL)
3296 child_inode = igrab(dchild->d_inode);
/* not in dcache: resolve via a by-name getattr RPC */
3300 if (child_inode == NULL) {
3301 rc = ll_get_fid_by_name(parent, name, namelen,
3302 &op_data->op_fid3, &child_inode);
3307 if (child_inode == NULL)
3308 GOTO(out_free, rc = -EINVAL);
3311 * lfs migrate command needs to be blocked on the client
3312 * by checking the migrate FID against the FID of the
3315 if (child_inode == parent->i_sb->s_root->d_inode)
3316 GOTO(out_iput, rc = -EINVAL);
3318 inode_lock(child_inode);
3319 op_data->op_fid3 = *ll_inode2fid(child_inode);
3320 if (!fid_is_sane(&op_data->op_fid3)) {
3321 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
3322 ll_get_fsname(parent->i_sb, NULL, 0), name,
3323 PFID(&op_data->op_fid3));
3324 GOTO(out_unlock, rc = -EINVAL);
/* nothing to do if the child already lives on the target MDT */
3327 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3329 GOTO(out_unlock, rc);
3332 CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name,
3333 PFID(&op_data->op_fid3), mdtidx);
3334 GOTO(out_unlock, rc = 0);
/* pin the file data version under a write lease before migrating */
3337 if (S_ISREG(child_inode->i_mode)) {
3338 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
3342 GOTO(out_unlock, rc);
3345 rc = ll_data_version(child_inode, &data_version,
3348 GOTO(out_close, rc);
3350 op_data->op_handle = och->och_fh;
3351 op_data->op_data = och->och_mod;
3352 op_data->op_data_version = data_version;
3353 op_data->op_lease_handle = och->och_lease_handle;
3354 op_data->op_bias |= MDS_RENAME_MIGRATE;
3357 op_data->op_mds = mdtidx;
3358 op_data->op_cli_flags = CLI_MIGRATE;
/* migration is expressed as a rename of the entry onto itself */
3359 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3360 namelen, name, namelen, &request);
3362 LASSERT(request != NULL);
3363 ll_update_times(request, parent);
3365 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
3366 LASSERT(body != NULL);
3368 /* If the server does release layout lock, then we cleanup
3369 * the client och here, otherwise release it in out_close: */
3371 body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
3372 obd_mod_put(och->och_mod);
3373 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
3375 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
3381 if (request != NULL) {
3382 ptlrpc_req_finished(request);
3386 /* Try again if the file layout has changed. */
3387 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
3391 if (och != NULL) /* close the file */
3392 ll_lease_close(och, child_inode, NULL);
3394 clear_nlink(child_inode);
3396 inode_unlock(child_inode);
3400 ll_finish_md_op_data(op_data);
/* lock/flock handler installed by ll_file_operations_noflock for
 * "-o noflock" mounts; rejects file locking requests */
3405 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3413 * test if some locks matching bits and l_req_mode are acquired
3414 * - bits can be in different locks
3415 * - if found clear the common lock bits in *bits
3416 * - the bits not found, are kept in *bits
3418 * \param bits [IN] searched lock bits [IN]
3419 * \param l_req_mode [IN] searched lock mode
3420 * \retval boolean, true iff all bits are found
3422 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
3424 struct lustre_handle lockh;
3425 union ldlm_policy_data policy;
/* LCK_MINMODE means "any mode": match against all four modes at once */
3426 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
3427 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
3436 fid = &ll_i2info(inode)->lli_fid;
3437 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3438 ldlm_lockname[mode]);
/* TEST_LOCK: only probe, do not take references on matched locks */
3440 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* probe each requested inodebit individually */
3441 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3442 policy.l_inodebits.bits = *bits & (1 << i);
3443 if (policy.l_inodebits.bits == 0)
3446 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3447 &policy, mode, &lockh)) {
3448 struct ldlm_lock *lock;
3450 lock = ldlm_handle2lock(&lockh);
3453 ~(lock->l_policy_data.l_inodebits.bits);
3454 LDLM_LOCK_PUT(lock);
3456 *bits &= ~policy.l_inodebits.bits;
/*
 * Match an already-granted MD ibits lock on @inode covering @bits with
 * mode @mode; on success *lockh holds a reference to the matched lock.
 * Returns the matched ldlm mode (md_lock_match() result).
 */
3463 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
3464 struct lustre_handle *lockh, __u64 flags,
3465 enum ldlm_mode mode)
3467 union ldlm_policy_data policy = { .l_inodebits = { bits } };
3472 fid = &ll_i2info(inode)->lli_fid;
3473 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3475 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3476 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the result of an inode revalidation RPC.
 * -ENOENT (already unlinked) is tolerated for most inodes; other
 * errors are logged (throttled to D_INFO for -EACCES/-EIDRM).
 */
3481 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3483 /* Already unlinked. Just update nlink and return success */
3484 if (rc == -ENOENT) {
3486 /* If it is striped directory, and there is bad stripe
3487 * Let's revalidate the dentry again, instead of returning
3489 if (S_ISDIR(inode->i_mode) &&
3490 ll_i2info(inode)->lli_lsm_md != NULL)
3493 /* This path cannot be hit for regular files unless in
3494 * case of obscure races, so no need to to validate
3496 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3498 } else if (rc != 0) {
3499 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3500 "%s: revalidate FID "DFID" error: rc = %d\n",
3501 ll_get_fsname(inode->i_sb, NULL, 0),
3502 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate a dentry's inode attributes against the MDT.
 * If the server supports OBD_CONNECT_ATTRFID, a getattr-by-FID intent
 * lock (IT_GETATTR/IT_LOOKUP) is used; otherwise, when no suitable MD
 * lock is already cached locally, a plain md_getattr() RPC is issued.
 */
3508 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3510 struct inode *inode = dentry->d_inode;
3511 struct ptlrpc_request *req = NULL;
3512 struct obd_export *exp;
3516 LASSERT(inode != NULL);
3518 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3519 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3521 exp = ll_i2mdexp(inode);
3523 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3524 * But under CMD case, it caused some lock issues, should be fixed
3525 * with new CMD ibits lock. See bug 12718 */
3526 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3527 struct lookup_intent oit = { .it_op = IT_GETATTR };
3528 struct md_op_data *op_data;
3530 if (ibits == MDS_INODELOCK_LOOKUP)
3531 oit.it_op = IT_LOOKUP;
3533 /* Call getattr by fid, so do not provide name at all. */
3534 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3535 dentry->d_inode, NULL, 0, 0,
3536 LUSTRE_OPC_ANY, NULL);
3537 if (IS_ERR(op_data))
3538 RETURN(PTR_ERR(op_data));
3540 rc = md_intent_lock(exp, op_data, &oit, &req,
3541 &ll_md_blocking_ast, 0);
3542 ll_finish_md_op_data(op_data);
3544 rc = ll_inode_revalidate_fini(inode, rc);
3548 rc = ll_revalidate_it_finish(req, &oit, dentry);
3550 ll_intent_release(&oit);
3554 /* Unlinked? Unhash dentry, so it is not picked up later by
3555 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3556 here to preserve get_cwd functionality on 2.6.
3558 if (!dentry->d_inode->i_nlink) {
3559 ll_lock_dcache(inode);
3560 d_lustre_invalidate(dentry, 0);
3561 ll_unlock_dcache(inode);
3564 ll_lookup_finish_locks(&oit, dentry);
/* no ATTRFID support: only RPC when no cached MD lock covers ibits */
3565 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3566 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3567 u64 valid = OBD_MD_FLGETATTR;
3568 struct md_op_data *op_data;
/* regular files: also fetch striping (EA) data with the getattr */
3571 if (S_ISREG(inode->i_mode)) {
3572 rc = ll_get_default_mdsize(sbi, &ealen);
3575 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3578 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3579 0, ealen, LUSTRE_OPC_ANY,
3581 if (IS_ERR(op_data))
3582 RETURN(PTR_ERR(op_data));
3584 op_data->op_valid = valid;
3585 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3586 ll_finish_md_op_data(op_data);
3588 rc = ll_inode_revalidate_fini(inode, rc);
3592 rc = ll_prep_inode(&inode, req, NULL, NULL);
3595 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge attributes (nlink, blocks, size and
 * a/m/ctime) from all stripes via md_merge_attr() into the master
 * inode and its ll_inode_info timestamp cache.
 */
3599 static int ll_merge_md_attr(struct inode *inode)
3601 struct cl_attr attr = { 0 };
3604 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3605 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3606 &attr, ll_md_blocking_ast);
3610 set_nlink(inode, attr.cat_nlink);
3611 inode->i_blocks = attr.cat_blocks;
3612 i_size_write(inode, attr.cat_size);
3614 ll_i2info(inode)->lli_atime = attr.cat_atime;
3615 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
3616 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/*
 * Full revalidation: refresh MD attributes via __ll_inode_revalidate(),
 * merge stripe attributes for striped directories, copy cached
 * timestamps into the inode and, for regular files, glimpse the size
 * from the OSTs (skipped while an HSM restore is running).
 */
3622 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3624 struct inode *inode = dentry->d_inode;
3628 rc = __ll_inode_revalidate(dentry, ibits);
3632 /* if object isn't regular file, don't validate size */
3633 if (!S_ISREG(inode->i_mode)) {
3634 if (S_ISDIR(inode->i_mode) &&
3635 ll_i2info(inode)->lli_lsm_md != NULL) {
3636 rc = ll_merge_md_attr(inode);
3641 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
3642 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
3643 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
3645 /* In case of restore, the MDT has the right size and has
3646 * already send it back without granting the layout lock,
3647 * inode is up-to-date so glimpse is useless.
3648 * Also to glimpse we need the layout, in case of a running
3649 * restore the MDT holds the layout lock so the glimpse will
3650 * block up to the end of restore (getattr will block)
3652 if (!ll_file_test_flag(ll_i2info(inode), LLIF_FILE_RESTORING))
3653 rc = ll_glimpse_size(inode);
/* Encode a dev_t so 32-bit compat stat syscalls accept it (see below) */
3658 static inline dev_t ll_compat_encode_dev(dev_t dev)
3660 /* The compat_sys_*stat*() syscalls will fail unless the
3661 * device majors and minors are both less than 256. Note that
3662 * the value returned here will be passed through
3663 * old_encode_dev() in cp_compat_stat(). And so we are not
3664 * trying to return a valid compat (u16) device number, just
3665 * one that will pass the old_valid_dev() check. */
3667 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
/*
 * getattr inode operation: revalidate UPDATE|LOOKUP MD state, then
 * fill *stat from the (now current) inode.  With a 32-bit API mount
 * the inode number is built from the FID and device numbers are
 * compat-encoded.
 */
3670 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3672 struct inode *inode = de->d_inode;
3673 struct ll_sb_info *sbi = ll_i2sbi(inode);
3674 struct ll_inode_info *lli = ll_i2info(inode);
3677 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3678 MDS_INODELOCK_LOOKUP);
3679 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3684 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
3686 if (ll_need_32bit_api(sbi)) {
3687 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3688 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
3689 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
3691 stat->ino = inode->i_ino;
3692 stat->dev = inode->i_sb->s_dev;
3693 stat->rdev = inode->i_rdev;
3696 stat->mode = inode->i_mode;
3697 stat->uid = inode->i_uid;
3698 stat->gid = inode->i_gid;
3699 stat->atime = inode->i_atime;
3700 stat->mtime = inode->i_mtime;
3701 stat->ctime = inode->i_ctime;
3702 stat->blksize = 1 << inode->i_blkbits;
3704 stat->nlink = inode->i_nlink;
3705 stat->size = i_size_read(inode);
3706 stat->blocks = inode->i_blocks;
/*
 * fiemap inode operation: marshal the kernel fiemap_extent_info into a
 * contiguous struct fiemap buffer, run ll_do_fiemap(), and copy the
 * mapped extents back to userspace.
 */
3711 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3712 __u64 start, __u64 len)
3716 struct fiemap *fiemap;
3717 unsigned int extent_count = fieinfo->fi_extents_max;
3719 num_bytes = sizeof(*fiemap) + (extent_count *
3720 sizeof(struct fiemap_extent));
3721 OBD_ALLOC_LARGE(fiemap, num_bytes);
3726 fiemap->fm_flags = fieinfo->fi_flags;
3727 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3728 fiemap->fm_start = start;
3729 fiemap->fm_length = len;
/* only the first extent is copied in — presumably used as a
 * continuation hint by ll_do_fiemap(); TODO confirm */
3730 if (extent_count > 0 &&
3731 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3732 sizeof(struct fiemap_extent)) != 0)
3733 GOTO(out, rc = -EFAULT);
3735 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3737 fieinfo->fi_flags = fiemap->fm_flags;
3738 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3739 if (extent_count > 0 &&
3740 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3741 fiemap->fm_mapped_extents *
3742 sizeof(struct fiemap_extent)) != 0)
3743 GOTO(out, rc = -EFAULT);
3745 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * get_acl inode operation: return a referenced copy of the cached
 * POSIX ACL under lli_lock; the VFS releases the reference.
 */
3749 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3751 struct ll_inode_info *lli = ll_i2info(inode);
3752 struct posix_acl *acl = NULL;
3755 spin_lock(&lli->lli_lock);
3756 /* VFS' acl_permission_check->check_acl will release the refcount */
3757 acl = posix_acl_dup(lli->lli_posix_acl);
3758 spin_unlock(&lli->lli_lock);
3763 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
/*
 * ACL permission callback for older generic_permission() variants.
 * Checks @mask against the cached POSIX access ACL; compiled out to a
 * trivial stub when CONFIG_FS_POSIX_ACL is off.
 */
3765 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3766 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3768 ll_check_acl(struct inode *inode, int mask)
3771 # ifdef CONFIG_FS_POSIX_ACL
3772 struct posix_acl *acl;
/* cannot consult the ACL in RCU-walk mode */
3776 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3777 if (flags & IPERM_FLAG_RCU)
3780 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3785 rc = posix_acl_permission(inode, acl, mask);
3786 posix_acl_release(acl);
3789 # else /* !CONFIG_FS_POSIX_ACL */
3791 # endif /* CONFIG_FS_POSIX_ACL */
3793 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
3795 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
/*
 * permission inode operation (prototype varies by kernel version).
 * Revalidates the root inode on first access, applies root-squash by
 * overriding the task credentials (fsuid/fsgid and FS capabilities),
 * then defers to generic permission checking with ll_check_acl.
 */
3796 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3798 # ifdef HAVE_INODE_PERMISION_2ARGS
3799 int ll_inode_permission(struct inode *inode, int mask)
3801 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3806 struct ll_sb_info *sbi;
3807 struct root_squash_info *squash;
3808 struct cred *cred = NULL;
3809 const struct cred *old_cred = NULL;
3811 bool squash_id = false;
/* cannot block in RCU-walk mode; ask the VFS to retry in ref-walk */
3814 #ifdef MAY_NOT_BLOCK
3815 if (mask & MAY_NOT_BLOCK)
3817 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3818 if (flags & IPERM_FLAG_RCU)
3822 /* as root inode are NOT getting validated in lookup operation,
3823 * need to do it before permission check. */
3825 if (inode == inode->i_sb->s_root->d_inode) {
3826 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3827 MDS_INODELOCK_LOOKUP);
3832 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3833 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3835 /* squash fsuid/fsgid if needed */
3836 sbi = ll_i2sbi(inode);
3837 squash = &sbi->ll_squash;
3838 if (unlikely(squash->rsi_uid != 0 &&
3839 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
3840 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
3844 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
3845 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
3846 squash->rsi_uid, squash->rsi_gid);
3848 /* update current process's credentials
3849 * and FS capability */
3850 cred = prepare_creds();
3854 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
3855 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* drop all filesystem-related capabilities from the squashed creds */
3856 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
3857 if ((1 << cap) & CFS_CAP_FS_MASK)
3858 cap_lower(cred->cap_effective, cap);
3860 old_cred = override_creds(cred);
3863 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
3864 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3865 /* restore current process's credentials and FS capability */
3867 revert_creds(old_cred);
3874 /* -o localflock - only provides locally consistent flock locks */
/* default file operations: no .flock/.lock methods, so the kernel's
 * local (single-node) flock handling applies */
3875 struct file_operations ll_file_operations = {
3876 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
3877 # ifdef HAVE_SYNC_READ_WRITE
3878 .read = new_sync_read,
3879 .write = new_sync_write,
3881 .read_iter = ll_file_read_iter,
3882 .write_iter = ll_file_write_iter,
3883 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3884 .read = ll_file_read,
3885 .aio_read = ll_file_aio_read,
3886 .write = ll_file_write,
3887 .aio_write = ll_file_aio_write,
3888 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3889 .unlocked_ioctl = ll_file_ioctl,
3890 .open = ll_file_open,
3891 .release = ll_file_release,
3892 .mmap = ll_file_mmap,
3893 .llseek = ll_file_seek,
3894 .splice_read = ll_file_splice_read,
/* file operations for "-o flock" mounts: cluster-coherent locking via
 * ll_file_flock for both flock(2) and fcntl/POSIX locks */
3899 struct file_operations ll_file_operations_flock = {
3900 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
3901 # ifdef HAVE_SYNC_READ_WRITE
3902 .read = new_sync_read,
3903 .write = new_sync_write,
3904 # endif /* HAVE_SYNC_READ_WRITE */
3905 .read_iter = ll_file_read_iter,
3906 .write_iter = ll_file_write_iter,
3907 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3908 .read = ll_file_read,
3909 .aio_read = ll_file_aio_read,
3910 .write = ll_file_write,
3911 .aio_write = ll_file_aio_write,
3912 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3913 .unlocked_ioctl = ll_file_ioctl,
3914 .open = ll_file_open,
3915 .release = ll_file_release,
3916 .mmap = ll_file_mmap,
3917 .llseek = ll_file_seek,
3918 .splice_read = ll_file_splice_read,
3921 .flock = ll_file_flock,
3922 .lock = ll_file_flock
3925 /* These are for -o noflock - to return ENOSYS on flock calls */
3926 struct file_operations ll_file_operations_noflock = {
3927 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
3928 # ifdef HAVE_SYNC_READ_WRITE
3929 .read = new_sync_read,
3930 .write = new_sync_write,
3931 # endif /* HAVE_SYNC_READ_WRITE */
3932 .read_iter = ll_file_read_iter,
3933 .write_iter = ll_file_write_iter,
3934 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3935 .read = ll_file_read,
3936 .aio_read = ll_file_aio_read,
3937 .write = ll_file_write,
3938 .aio_write = ll_file_aio_write,
3939 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3940 .unlocked_ioctl = ll_file_ioctl,
3941 .open = ll_file_open,
3942 .release = ll_file_release,
3943 .mmap = ll_file_mmap,
3944 .llseek = ll_file_seek,
3945 .splice_read = ll_file_splice_read,
/* locking entry points route to the rejecting handler */
3948 .flock = ll_file_noflock,
3949 .lock = ll_file_noflock
/* inode operations for regular files */
3952 struct inode_operations ll_file_inode_operations = {
3953 .setattr = ll_setattr,
3954 .getattr = ll_getattr,
3955 .permission = ll_inode_permission,
3956 .setxattr = ll_setxattr,
3957 .getxattr = ll_getxattr,
3958 .listxattr = ll_listxattr,
3959 .removexattr = ll_removexattr,
3960 .fiemap = ll_fiemap,
3961 #ifdef HAVE_IOP_GET_ACL
3962 .get_acl = ll_get_acl,
3966 /* dynamic ioctl number support routins */
/* registry of dynamically registered ioctl handlers: a list of
 * llioc_data records protected by an rw_semaphore */
3967 static struct llioc_ctl_data {
3968 struct rw_semaphore ioc_sem;
3969 struct list_head ioc_head;
3971 __RWSEM_INITIALIZER(llioc.ioc_sem),
3972 LIST_HEAD_INIT(llioc.ioc_head)
/* one registration: callback plus its supported ioctl command numbers
 * (iocd_cmd is an old-style [0] flexible array; iocd_size is the full
 * allocation size, kept for freeing) */
3977 struct list_head iocd_list;
3978 unsigned int iocd_size;
3979 llioc_callback_t iocd_cb;
3980 unsigned int iocd_count;
3981 unsigned int iocd_cmd[0];
/*
 * Register a dynamic ioctl handler @cb for @count command numbers in
 * @cmd.  Returns an opaque cookie (the allocated record) to pass to
 * ll_iocontrol_unregister(), or NULL on bad arguments / allocation
 * failure.
 */
3984 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3987 struct llioc_data *in_data = NULL;
3990 if (cb == NULL || cmd == NULL ||
3991 count > LLIOC_MAX_CMD || count < 0)
3994 size = sizeof(*in_data) + count * sizeof(unsigned int);
3995 OBD_ALLOC(in_data, size);
3996 if (in_data == NULL)
3999 memset(in_data, 0, sizeof(*in_data));
4000 in_data->iocd_size = size;
4001 in_data->iocd_cb = cb;
4002 in_data->iocd_count = count;
4003 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
/* publish the new record under the registry write lock */
4005 down_write(&llioc.ioc_sem);
4006 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
4007 up_write(&llioc.ioc_sem);
/*
 * Unregister a handler previously returned by ll_iocontrol_register().
 *
 * \param magic	cookie returned at registration time
 *
 * Walks the registry under the write semaphore; on a match (the
 * comparison line is not visible in this chunk — presumably
 * tmp == magic) the entry is unlinked, the semaphore dropped, and the
 * memory freed. Falls through to a warning when no entry matches.
 */
4012 void ll_iocontrol_unregister(void *magic)
4014 struct llioc_data *tmp;
4019 down_write(&llioc.ioc_sem);
4020 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* Capture the size before the entry is freed below. */
4022 unsigned int size = tmp->iocd_size;
4024 list_del(&tmp->iocd_list);
/* Drop the lock before freeing; the entry is already unlinked. */
4025 up_write(&llioc.ioc_sem);
4027 OBD_FREE(tmp, size);
4031 up_write(&llioc.ioc_sem);
4033 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
/* Exported so external modules can plug in dynamic ioctl handlers. */
4036 EXPORT_SYMBOL(ll_iocontrol_register);
4037 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch an ioctl to the dynamically registered handlers.
 *
 * Scans every registered llioc_data under the read semaphore; when an
 * entry lists \a cmd, its callback is invoked with the handler's result
 * stored through \a rcp (rc defaults to -EINVAL when nothing handles
 * the command). Iteration stops once a callback returns LLIOC_STOP;
 * otherwise the scan continues and LLIOC_CONT is the default verdict.
 */
4039 static enum llioc_iter
4040 ll_iocontrol_call(struct inode *inode, struct file *file,
4041 unsigned int cmd, unsigned long arg, int *rcp)
4043 enum llioc_iter ret = LLIOC_CONT;
4044 struct llioc_data *data;
4045 int rc = -EINVAL, i;
4047 down_read(&llioc.ioc_sem);
4048 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
4049 for (i = 0; i < data->iocd_count; i++) {
4050 if (cmd != data->iocd_cmd[i])
/* Found a handler that claims this command number. */
4053 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
/* LLIOC_STOP means the handler consumed the ioctl; stop scanning. */
4057 if (ret == LLIOC_STOP)
4060 up_read(&llioc.ioc_sem);
/*
 * Push a layout configuration down to the cl_object stack.
 *
 * \param inode	inode whose cl_object is (re)configured
 * \param conf	operation descriptor (OBJECT_CONF_SET / OBJECT_CONF_WAIT)
 *
 * For OBJECT_CONF_SET the layout lock carried in conf->coc_lock is
 * only allowed to match *after* cl_conf_set() has applied the layout,
 * so concurrent matchers never observe a stale layout; the inode's
 * cached layout generation is then refreshed from the object.
 */
4067 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4069 struct ll_inode_info *lli = ll_i2info(inode);
4070 struct cl_object *obj = lli->lli_clob;
4079 env = cl_env_get(&refcheck);
4081 RETURN(PTR_ERR(env));
4083 rc = cl_conf_set(env, lli->lli_clob, conf);
4087 if (conf->coc_opc == OBJECT_CONF_SET) {
4088 struct ldlm_lock *lock = conf->coc_lock;
4089 struct cl_layout cl = {
4093 LASSERT(lock != NULL);
4094 LASSERT(ldlm_has_layout(lock));
4096 /* it can only be allowed to match after layout is
4097 * applied to inode otherwise false layout would be
4098 * seen. Applying layout should happen before dropping
4099 * the intent lock. */
4100 ldlm_lock_allow_match(lock);
/* Re-read the generation the object now carries and cache it. */
4102 rc = cl_object_layout_get(env, obj, &cl);
4107 DFID": layout version change: %u -> %u\n",
4108 PFID(&lli->lli_fid), ll_layout_version_get(lli),
4110 ll_layout_version_set(lli, cl.cl_layout_gen);
4114 cl_env_put(env, &refcheck);
4119 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
/*
 * Populate \a lock's LVB with the file's LOV layout when the lock was
 * granted via a blocked/completion-AST path (in which case the reply
 * LVB buffer was too small to carry the layout). No-op when the lock
 * already has l_lvb_data. On a race where another thread filled the
 * LVB first, our copy is freed and theirs is kept.
 */
4120 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4123 struct ll_sb_info *sbi = ll_i2sbi(inode);
4124 struct ptlrpc_request *req;
4125 struct mdt_body *body;
4132 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4133 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4134 lock->l_lvb_data, lock->l_lvb_len);
/* Layout already attached to the lock: nothing to fetch. */
4136 if (lock->l_lvb_data != NULL)
4139 /* if layout lock was granted right away, the layout is returned
4140 * within DLM_LVB of dlm reply; otherwise if the lock was ever
4141 * blocked and then granted via completion ast, we have to fetch
4142 * layout here. Please note that we can't use the LVB buffer in
4143 * completion AST because it doesn't have a large enough buffer */
4144 rc = ll_get_default_mdsize(sbi, &lmmsize);
/* Fetch the LOV EA (layout) from the MDT via getxattr. */
4146 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4147 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
4152 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4154 GOTO(out, rc = -EPROTO);
4156 lmmsize = body->mbo_eadatasize;
4157 if (lmmsize == 0) /* empty layout */
4160 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4162 GOTO(out, rc = -EFAULT);
/* Copy out of the RPC reply: the reply buffer is freed below. */
4164 OBD_ALLOC_LARGE(lvbdata, lmmsize);
4165 if (lvbdata == NULL)
4166 GOTO(out, rc = -ENOMEM);
4168 memcpy(lvbdata, lmm, lmmsize);
4169 lock_res_and_lock(lock);
/* Attach only if we are first; a racing thread may have won. */
4170 if (unlikely(lock->l_lvb_data == NULL)) {
4171 lock->l_lvb_type = LVB_T_LAYOUT;
4172 lock->l_lvb_data = lvbdata;
4173 lock->l_lvb_len = lmmsize;
4176 unlock_res_and_lock(lock);
/* Lost the race (or error path): free our copy. */
4179 OBD_FREE_LARGE(lvbdata, lmmsize);
4184 ptlrpc_req_finished(req);
4189 * Apply the layout to the inode. Layout lock is held and will be released
/*
 * \param lockh	handle of the granted layout lock (released before return)
 * \param mode	mode the lock was taken in
 * \param inode	inode to (re)configure
 *
 * Fetches the layout into the lock's LVB if needed, applies it to the
 * cl_object via OBJECT_CONF_SET, then drops the lock. If the apply
 * fails with -EBUSY (layout still in use by in-flight IO), it issues
 * OBJECT_CONF_WAIT after the lock is released to wait for IO to drain.
 */
4192 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4193 struct inode *inode)
4195 struct ll_inode_info *lli = ll_i2info(inode);
4196 struct ll_sb_info *sbi = ll_i2sbi(inode);
4197 struct ldlm_lock *lock;
4198 struct cl_object_conf conf;
4201 bool wait_layout = false;
4204 LASSERT(lustre_handle_is_used(lockh));
4206 lock = ldlm_handle2lock(lockh);
4207 LASSERT(lock != NULL);
4208 LASSERT(ldlm_has_layout(lock));
4210 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4211 PFID(&lli->lli_fid), inode);
4213 /* in case this is a caching lock and reinstate with new inode */
4214 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
4216 lock_res_and_lock(lock);
4217 lvb_ready = ldlm_is_lvb_ready(lock);
4218 unlock_res_and_lock(lock);
4220 /* checking lvb_ready is racy but this is okay. The worst case is
4221 * that multi processes may configure the file on the same time. */
4225 rc = ll_layout_fetch(inode, lock);
4229 /* for layout lock, lmm is stored in lock's lvb.
4230 * lvb_data is immutable if the lock is held so it's safe to access it
4233 * set layout to file. Unlikely this will fail as old layout was
4234 * surely eliminated */
4235 memset(&conf, 0, sizeof conf);
4236 conf.coc_opc = OBJECT_CONF_SET;
4237 conf.coc_inode = inode;
4238 conf.coc_lock = lock;
4239 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
4240 conf.u.coc_layout.lb_len = lock->l_lvb_len;
4241 rc = ll_layout_conf(inode, &conf);
4243 /* refresh layout failed, need to wait */
4244 wait_layout = rc == -EBUSY;
/* Drop our references before any potentially long wait below. */
4247 LDLM_LOCK_PUT(lock);
4248 ldlm_lock_decref(lockh, mode);
4250 /* wait for IO to complete if it's still being used. */
4252 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
4253 ll_get_fsname(inode->i_sb, NULL, 0),
4254 PFID(&lli->lli_fid), inode);
4256 memset(&conf, 0, sizeof conf);
4257 conf.coc_opc = OBJECT_CONF_WAIT;
4258 conf.coc_inode = inode;
4259 rc = ll_layout_conf(inode, &conf);
4263 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4264 ll_get_fsname(inode->i_sb, NULL, 0),
4265 PFID(&lli->lli_fid), rc);
4271 * Issue layout intent RPC to MDS.
4272 * \param inode [in] file inode
4273 * \param intent [in] layout intent
4275 * \retval 0 on success
4276 * \retval < 0 error code
/*
 * Builds an IT_LAYOUT lookup intent carrying \a intent as opaque
 * op_data and enqueues it through md_intent_lock(). WRITE/TRUNC
 * intents request FMODE_WRITE so the MDT can instantiate components.
 * On success the returned lock data is attached to the inode; the
 * intent lock reference is dropped before return.
 */
4278 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
4280 struct ll_inode_info *lli = ll_i2info(inode);
4281 struct ll_sb_info *sbi = ll_i2sbi(inode);
4282 struct md_op_data *op_data;
4283 struct lookup_intent it;
4284 struct ptlrpc_request *req;
4288 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4289 0, 0, LUSTRE_OPC_ANY, NULL);
4290 if (IS_ERR(op_data))
4291 RETURN(PTR_ERR(op_data));
/* The layout intent rides along as opaque op_data. */
4293 op_data->op_data = intent;
4294 op_data->op_data_size = sizeof(*intent);
4296 memset(&it, 0, sizeof(it));
4297 it.it_op = IT_LAYOUT;
4298 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
4299 intent->li_opc == LAYOUT_INTENT_TRUNC)
4300 it.it_flags = FMODE_WRITE;
4302 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4303 ll_get_fsname(inode->i_sb, NULL, 0),
4304 PFID(&lli->lli_fid), inode);
4306 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
4307 &ll_md_blocking_ast, 0);
/* The reply is not needed beyond the intent bookkeeping; release it. */
4308 if (it.it_request != NULL)
4309 ptlrpc_req_finished(it.it_request);
4310 it.it_request = NULL;
4312 ll_finish_md_op_data(op_data);
4314 /* set lock data in case this is a new lock */
4316 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4318 ll_intent_drop_lock(&it);
4324 * This function checks if there exists a LAYOUT lock on the client side,
4325 * or enqueues it if it doesn't have one in cache.
4327 * This function will not hold layout lock so it may be revoked any time after
4328 * this function returns. Any operations depend on layout should be redone
4331 * This function should be called before lov_io_init() to get an uptodate
4332 * layout version, the caller should save the version number and after IO
4333 * is finished, this function should be called again to verify that layout
4334 * is not changed during IO time.
/*
 * \param inode	regular file whose layout generation is wanted
 * \param gen	[out] current layout generation
 */
4336 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4338 struct ll_inode_info *lli = ll_i2info(inode);
4339 struct ll_sb_info *sbi = ll_i2sbi(inode);
4340 struct lustre_handle lockh;
4341 struct layout_intent intent = {
4342 .li_opc = LAYOUT_INTENT_ACCESS,
4344 enum ldlm_mode mode;
/* Fast path: layout locks disabled, or a generation already cached. */
4348 *gen = ll_layout_version_get(lli);
4349 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
4353 LASSERT(fid_is_sane(ll_inode2fid(inode)));
4354 LASSERT(S_ISREG(inode->i_mode));
4356 /* take layout lock mutex to enqueue layout lock exclusively. */
4357 mutex_lock(&lli->lli_layout_mutex);
4360 /* mostly layout lock is caching on the local side, so try to
4361 * match it before grabbing layout lock mutex. */
4362 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4363 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
4364 if (mode != 0) { /* hit cached lock */
4365 rc = ll_layout_lock_set(&lockh, mode, inode);
/* Cache miss: enqueue an ACCESS layout intent to the MDT. */
4371 rc = ll_layout_intent(inode, &intent);
4377 *gen = ll_layout_version_get(lli);
4378 mutex_unlock(&lli->lli_layout_mutex);
4384 * Issue layout intent RPC indicating where in a file an IO is about to write.
4386 * \param[in] inode file inode.
4387 * \param[in] start start offset of file in bytes where an IO is about to
4389 * \param[in] end exclusive end offset in bytes of the write range.
4391 * \retval 0 on success
4392 * \retval < 0 error code
/*
 * Thin wrapper over ll_layout_intent() with LAYOUT_INTENT_WRITE, so
 * the MDT can instantiate layout components covering [start, end).
 * NOTE(review): the initializer lines carrying start/end into the
 * intent are not visible in this chunk.
 */
4394 int ll_layout_write_intent(struct inode *inode, __u64 start, __u64 end)
4396 struct layout_intent intent = {
4397 .li_opc = LAYOUT_INTENT_WRITE,
4404 rc = ll_layout_intent(inode, &intent);
4410 * This function send a restore request to the MDT
4412 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4414 struct hsm_user_request *hur;
4418 len = sizeof(struct hsm_user_request) +
4419 sizeof(struct hsm_user_item);
4420 OBD_ALLOC(hur, len);
4424 hur->hur_request.hr_action = HUA_RESTORE;
4425 hur->hur_request.hr_archive_id = 0;
4426 hur->hur_request.hr_flags = 0;
4427 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4428 sizeof(hur->hur_user_item[0].hui_fid));
4429 hur->hur_user_item[0].hui_extent.offset = offset;
4430 hur->hur_user_item[0].hui_extent.length = length;
4431 hur->hur_request.hr_itemcount = 1;
4432 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,