4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2016, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
48 #include <lustre/ll_fiemap.h>
50 #include <uapi/linux/lustre_ioctl.h>
51 #include <lustre_swab.h>
53 #include "cl_object.h"
54 #include "llite_internal.h"
55 #include "vvp_internal.h"
58 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
60 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
63 static enum llioc_iter
64 ll_iocontrol_call(struct inode *inode, struct file *file,
65 unsigned int cmd, unsigned long arg, int *rcp);
/* Allocate a per-open-file ll_file_data descriptor from its slab cache.
 * GFP_NOFS avoids recursing back into the filesystem while reclaiming
 * memory.  NOTE(review): the allocation-failure check and return are in
 * elided lines of this view — confirm against the full source. */
67 static struct ll_file_data *ll_file_data_get(void)
69 struct ll_file_data *fd;
71 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
/* start every open with a clean write-error state */
75 fd->fd_write_failed = false;
/* Return a ll_file_data descriptor to the slab cache (counterpart of
 * ll_file_data_get()). */
80 static void ll_file_data_put(struct ll_file_data *fd)
83 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/* Snapshot the inode's current metadata (mode, a/m/ctime, size, blocks,
 * flags) plus the MDS open handle into @op_data for the CLOSE RPC. */
87 * Packs all the attributes into @op_data for the CLOSE rpc.
89 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
90 struct obd_client_handle *och)
94 ll_prep_md_op_data(op_data, inode, NULL, NULL,
95 0, 0, LUSTRE_OPC_ANY, NULL);
/* copy the client-side inode attributes into the close request */
97 op_data->op_attr.ia_mode = inode->i_mode;
98 op_data->op_attr.ia_atime = inode->i_atime;
99 op_data->op_attr.ia_mtime = inode->i_mtime;
100 op_data->op_attr.ia_ctime = inode->i_ctime;
101 op_data->op_attr.ia_size = i_size_read(inode);
/* mark every copied attribute as valid so the MDT applies them */
102 op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
103 ATTR_MTIME | ATTR_MTIME_SET |
104 ATTR_CTIME | ATTR_CTIME_SET;
105 op_data->op_attr_blocks = inode->i_blocks;
106 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
107 op_data->op_handle = och->och_fh;
/* LLIF_DATA_MODIFIED is a test-and-clear: only the first close after a
 * write reports the dirty state to the MDT */
109 if (och->och_flags & FMODE_WRITE &&
110 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
111 /* For HSM: if inode data has been modified, pack it so that
112 * MDT can set data dirty flag in the archive. */
113 op_data->op_bias |= MDS_DATA_MODIFIED;
/* Send the MDS_CLOSE RPC for @och, optionally biased for HSM release or
 * layout swap; @data's meaning depends on @bias (see below).  Frees the
 * open handle state on the way out. */
119 * Perform a close, possibly with a bias.
120 * The meaning of "data" depends on the value of "bias".
122 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
123 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
126 static int ll_close_inode_openhandle(struct inode *inode,
127 struct obd_client_handle *och,
128 enum mds_op_bias bias, void *data)
130 struct obd_export *md_exp = ll_i2mdexp(inode);
131 const struct ll_inode_info *lli = ll_i2info(inode);
132 struct md_op_data *op_data;
133 struct ptlrpc_request *req = NULL;
/* no usable MDC connection: nothing we can send the close to */
137 if (class_exp2obd(md_exp) == NULL) {
138 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
139 ll_get_fsname(inode->i_sb, NULL, 0),
140 PFID(&lli->lli_fid));
144 OBD_ALLOC_PTR(op_data);
145 /* We leak openhandle and request here on error, but not much to be
146 * done in OOM case since app won't retry close on error either. */
148 GOTO(out, rc = -ENOMEM);
150 ll_prepare_close(inode, op_data, och);
/* bias-specific packing; presumably a switch (bias) — the switch head
 * is in an elided line of this view */
152 case MDS_CLOSE_LAYOUT_SWAP:
153 LASSERT(data != NULL);
154 op_data->op_bias |= MDS_CLOSE_LAYOUT_SWAP;
155 op_data->op_data_version = 0;
156 op_data->op_lease_handle = och->och_lease_handle;
/* @data is the second inode whose layout we swap with */
157 op_data->op_fid2 = *ll_inode2fid(data);
160 case MDS_HSM_RELEASE:
161 LASSERT(data != NULL);
162 op_data->op_bias |= MDS_HSM_RELEASE;
/* @data is the data version the MDT must validate before release */
163 op_data->op_data_version = *(__u64 *)data;
164 op_data->op_lease_handle = och->och_lease_handle;
165 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
169 LASSERT(data == NULL);
173 rc = md_close(md_exp, op_data, och->och_mod, &req);
/* -EINTR is expected when the close is interrupted; don't log it */
174 if (rc != 0 && rc != -EINTR)
175 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
176 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
/* for biased closes, check whether the MDT actually executed the
 * intent (release/swap) or just did a plain close */
179 op_data->op_bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)) {
180 struct mdt_body *body;
182 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
183 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
187 ll_finish_md_op_data(op_data);
/* handle is dead from now on; poison the cookie to catch reuse */
191 md_clear_open_replay_data(md_exp, och);
192 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
195 ptlrpc_req_finished(req); /* This is close request */
/* Drop one reference on the per-mode (read/write/exec) MDS open handle
 * cached in the inode, and send the real close RPC once the last user
 * of that handle is gone. */
199 int ll_md_real_close(struct inode *inode, fmode_t fmode)
201 struct ll_inode_info *lli = ll_i2info(inode);
202 struct obd_client_handle **och_p;
203 struct obd_client_handle *och;
/* pick the handle/use-count pair matching the open mode; modes are
 * checked write, exec, then read (read is the required fallback) */
208 if (fmode & FMODE_WRITE) {
209 och_p = &lli->lli_mds_write_och;
210 och_usecount = &lli->lli_open_fd_write_count;
211 } else if (fmode & FMODE_EXEC) {
212 och_p = &lli->lli_mds_exec_och;
213 och_usecount = &lli->lli_open_fd_exec_count;
215 LASSERT(fmode & FMODE_READ);
216 och_p = &lli->lli_mds_read_och;
217 och_usecount = &lli->lli_open_fd_read_count;
/* lli_och_mutex serializes all open-handle bookkeeping on this inode */
220 mutex_lock(&lli->lli_och_mutex);
221 if (*och_usecount > 0) {
222 /* There are still users of this handle, so skip
224 mutex_unlock(&lli->lli_och_mutex);
230 mutex_unlock(&lli->lli_och_mutex);
233 /* There might be a race and this handle may already
235 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/* Per-file-descriptor close: release group lock, lease and fd-private
 * open handle if present, decrement the inode's per-mode open counts,
 * and only talk to the MDS if we don't hold a matching OPEN lock. */
241 static int ll_md_close(struct inode *inode, struct file *file)
243 union ldlm_policy_data policy = {
244 .l_inodebits = { MDS_INODELOCK_OPEN },
/* TEST_LOCK: we only probe for a granted OPEN lock, we don't take one */
246 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
247 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
248 struct ll_inode_info *lli = ll_i2info(inode);
249 struct lustre_handle lockh;
250 enum ldlm_mode lockmode;
254 /* clear group lock, if present */
255 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
256 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
258 if (fd->fd_lease_och != NULL) {
261 /* Usually the lease is not released when the
262 * application crashed, we need to release here. */
263 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
264 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
265 PFID(&lli->lli_fid), rc, lease_broken);
267 fd->fd_lease_och = NULL;
/* fd_och holds an open handle privately owned by this fd (taken over
 * by a lease, see ll_lease_och_acquire); close it directly */
270 if (fd->fd_och != NULL) {
271 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
276 /* Let's see if we have good enough OPEN lock on the file and if
277 we can skip talking to MDS */
278 mutex_lock(&lli->lli_och_mutex);
/* drop this fd's contribution to the per-mode open count */
279 if (fd->fd_omode & FMODE_WRITE) {
281 LASSERT(lli->lli_open_fd_write_count);
282 lli->lli_open_fd_write_count--;
283 } else if (fd->fd_omode & FMODE_EXEC) {
285 LASSERT(lli->lli_open_fd_exec_count);
286 lli->lli_open_fd_exec_count--;
289 LASSERT(lli->lli_open_fd_read_count);
290 lli->lli_open_fd_read_count--;
292 mutex_unlock(&lli->lli_och_mutex);
/* no cached OPEN ibits lock -> must do the real close RPC now */
294 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
295 LDLM_IBITS, &policy, lockmode, &lockh))
296 rc = ll_md_real_close(inode, fd->fd_omode);
299 LUSTRE_FPRIVATE(file) = NULL;
300 ll_file_data_put(fd);
/* VFS ->release() hook: tear down all Lustre per-open state for @file.
 * Must succeed at cleanup even when returning an error (see below). */
305 /* While this returns an error code, fput() the caller does not, so we need
306 * to make every effort to clean up all of our state here. Also, applications
307 * rarely check close errors and even if an error is returned they will not
308 * re-try the close call.
310 int ll_file_release(struct inode *inode, struct file *file)
312 struct ll_file_data *fd;
313 struct ll_sb_info *sbi = ll_i2sbi(inode);
314 struct ll_inode_info *lli = ll_i2info(inode);
318 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
319 PFID(ll_inode2fid(inode)), inode);
/* don't count releases of the filesystem root in the stats */
321 if (inode->i_sb->s_root != file_dentry(file))
322 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
323 fd = LUSTRE_FPRIVATE(file);
326 /* The last ref on @file, maybe not the the owner pid of statahead,
327 * because parent and child process can share the same file handle. */
328 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
329 ll_deauthorize_statahead(inode, fd);
/* the root dentry has no MDS open handle: only free local state */
331 if (inode->i_sb->s_root == file_dentry(file)) {
332 LUSTRE_FPRIVATE(file) = NULL;
333 ll_file_data_put(fd);
/* for regular files, fold any async write errors into lli_async_rc
 * so the close result reflects earlier background failures */
337 if (!S_ISDIR(inode->i_mode)) {
338 if (lli->lli_clob != NULL)
339 lov_read_and_clear_async_rc(lli->lli_clob);
340 lli->lli_async_rc = 0;
343 rc = ll_md_close(inode, file);
/* fault-injection hook used by tests to dump debug logs on close */
345 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
346 libcfs_debug_dumplog();
/* Send an open-by-FID intent to the MDS for @de, passing striping data
 * @lmm/@lmmsize; on success set up the inode and its lock data from the
 * reply.  The name is packed only when the server lacks open-by-fid. */
351 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
352 struct lookup_intent *itp)
354 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
355 struct dentry *parent = de->d_parent;
356 const char *name = NULL;
358 struct md_op_data *op_data;
359 struct ptlrpc_request *req = NULL;
363 LASSERT(parent != NULL);
364 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
366 /* if server supports open-by-fid, or file name is invalid, don't pack
367 * name in open request */
368 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
369 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
370 name = de->d_name.name;
371 len = de->d_name.len;
374 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
375 name, len, 0, LUSTRE_OPC_ANY, NULL);
377 RETURN(PTR_ERR(op_data));
/* pass the striping layout (may be NULL/0) through to the MDS */
378 op_data->op_data = lmm;
379 op_data->op_data_size = lmmsize;
381 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
382 &ll_md_blocking_ast, 0);
383 ll_finish_md_op_data(op_data);
385 /* reason for keep own exit path - don`t flood log
386 * with messages with -ESTALE errors.
/* an open that went through but is unusable must release the MDS
 * open handle it created before we bail out */
388 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
389 it_open_error(DISP_OPEN_OPEN, itp))
391 ll_release_openhandle(de, itp);
395 if (it_disposition(itp, DISP_LOOKUP_NEG))
396 GOTO(out, rc = -ENOENT);
398 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
399 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
400 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* success: instantiate/refresh the inode from the reply and attach
 * the granted lock to it */
404 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
405 if (!rc && itp->it_lock_mode)
406 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
409 ptlrpc_req_finished(req);
410 ll_intent_drop_lock(itp);
412 /* We did open by fid, but by the time we got to the server,
413 * the object disappeared. If this is a create, we cannot really
414 * tell the userspace that the file it was trying to create
415 * does not exist. Instead let's return -ESTALE, and the VFS will
416 * retry the create with LOOKUP_REVAL that we are going to catch
417 * in ll_revalidate_dentry() and use lookup then.
419 if (rc == -ENOENT && itp->it_op & IT_CREAT)
/* Populate @och from the MDT reply carried in intent @it (file handle,
 * FID, lease lock handle, open flags) and register it for open replay
 * so the open survives MDS recovery. */
425 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
426 struct obd_client_handle *och)
428 struct mdt_body *body;
430 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
431 och->och_fh = body->mbo_handle;
432 och->och_fid = body->mbo_fid1;
433 och->och_lease_handle.cookie = it->it_lock_handle;
434 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
435 och->och_flags = it->it_flags;
437 return md_set_open_replay_data(md_exp, och, it);
/* Finish the client-local part of an open: optionally fill @och from
 * the intent reply, then attach @fd as the file's private data and
 * initialize its readahead and cl_io context state. */
440 static int ll_local_open(struct file *file, struct lookup_intent *it,
441 struct ll_file_data *fd, struct obd_client_handle *och)
443 struct inode *inode = file_inode(file);
/* file must not already carry Lustre private data */
446 LASSERT(!LUSTRE_FPRIVATE(file));
453 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
458 LUSTRE_FPRIVATE(file) = fd;
459 ll_readahead_init(inode, &fd->fd_ras);
/* remember the access mode this fd was opened with; used at close */
460 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
462 /* ll_cl_context initialize */
463 rwlock_init(&fd->fd_lock);
464 INIT_LIST_HEAD(&fd->fd_lccs);
/* VFS ->open() hook: reuse a cached per-mode MDS open handle when one
 * exists, otherwise send an open intent to the MDS and cache the new
 * handle on the inode. */
469 /* Open a file, and (for the very first open) create objects on the OSTs at
470 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
471 * creation or open until ll_lov_setstripe() ioctl is called.
473 * If we already have the stripe MD locally then we don't request it in
474 * md_open(), by passing a lmm_size = 0.
476 * It is up to the application to ensure no other processes open this file
477 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
478 * used. We might be able to avoid races of that sort by getting lli_open_sem
479 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
480 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
482 int ll_file_open(struct inode *inode, struct file *file)
484 struct ll_inode_info *lli = ll_i2info(inode);
485 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
486 .it_flags = file->f_flags };
487 struct obd_client_handle **och_p = NULL;
488 __u64 *och_usecount = NULL;
489 struct ll_file_data *fd;
493 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
494 PFID(ll_inode2fid(inode)), inode, file->f_flags);
/* an intent prepared by the lookup path may be stashed in
 * file->private_data; take it out before we install our own fd */
496 it = file->private_data; /* XXX: compat macro */
497 file->private_data = NULL; /* prevent ll_local_open assertion */
499 fd = ll_file_data_get();
501 GOTO(out_openerr, rc = -ENOMEM);
504 if (S_ISDIR(inode->i_mode))
505 ll_authorize_statahead(inode, fd);
/* the filesystem root needs no MDS open; install fd and we're done */
507 if (inode->i_sb->s_root == file_dentry(file)) {
508 LUSTRE_FPRIVATE(file) = fd;
/* no usable intent from lookup: build our own open intent (oit) */
512 if (!it || !it->it_disposition) {
513 /* Convert f_flags into access mode. We cannot use file->f_mode,
514 * because everything but O_ACCMODE mask was stripped from
/* (O_ACCMODE trick: +1 turns the 0..2 accmode into FMODE bits) */
516 if ((oit.it_flags + 1) & O_ACCMODE)
518 if (file->f_flags & O_TRUNC)
519 oit.it_flags |= FMODE_WRITE;
521 /* kernel only call f_op->open in dentry_open. filp_open calls
522 * dentry_open after call to open_namei that checks permissions.
523 * Only nfsd_open call dentry_open directly without checking
524 * permissions and because of that this code below is safe. */
525 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
526 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
528 /* We do not want O_EXCL here, presumably we opened the file
529 * already? XXX - NFS implications? */
530 oit.it_flags &= ~O_EXCL;
532 /* bug20584, if "it_flags" contains O_CREAT, the file will be
533 * created if necessary, then "IT_CREAT" should be set to keep
534 * consistent with it */
535 if (oit.it_flags & O_CREAT)
536 oit.it_op |= IT_CREAT;
542 /* Let's see if we have file open on MDS already. */
543 if (it->it_flags & FMODE_WRITE) {
544 och_p = &lli->lli_mds_write_och;
545 och_usecount = &lli->lli_open_fd_write_count;
546 } else if (it->it_flags & FMODE_EXEC) {
547 och_p = &lli->lli_mds_exec_och;
548 och_usecount = &lli->lli_open_fd_exec_count;
550 och_p = &lli->lli_mds_read_och;
551 och_usecount = &lli->lli_open_fd_read_count;
554 mutex_lock(&lli->lli_och_mutex);
555 if (*och_p) { /* Open handle is present */
556 if (it_disposition(it, DISP_OPEN_OPEN)) {
557 /* Well, there's extra open request that we do not need,
558 let's close it somehow. This will decref request. */
559 rc = it_open_error(DISP_OPEN_OPEN, it);
561 mutex_unlock(&lli->lli_och_mutex);
562 GOTO(out_openerr, rc);
565 ll_release_openhandle(file_dentry(file), it);
/* reuse the cached handle: local-only open, no extra MDS RPC */
569 rc = ll_local_open(file, it, fd, NULL);
572 mutex_unlock(&lli->lli_och_mutex);
573 GOTO(out_openerr, rc);
576 LASSERT(*och_usecount == 0);
577 if (!it->it_disposition) {
578 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
579 /* We cannot just request lock handle now, new ELC code
580 means that one of other OPEN locks for this file
581 could be cancelled, and since blocking ast handler
582 would attempt to grab och_mutex as well, that would
583 result in a deadlock */
584 mutex_unlock(&lli->lli_och_mutex);
586 * Normally called under two situations:
588 * 2. A race/condition on MDS resulting in no open
589 * handle to be returned from LOOKUP|OPEN request,
590 * for example if the target entry was a symlink.
592 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
593 * marked by a bit set in ll_iget_for_nfs. Clear the
594 * bit so that it's not confusing later callers.
596 * NB; when ldd is NULL, it must have come via normal
597 * lookup path only, since ll_iget_for_nfs always calls
600 if (ldd && ldd->lld_nfs_dentry) {
601 ldd->lld_nfs_dentry = 0;
602 it->it_flags |= MDS_OPEN_LOCK;
606 * Always specify MDS_OPEN_BY_FID because we don't want
607 * to get file with different fid.
609 it->it_flags |= MDS_OPEN_BY_FID;
610 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
613 GOTO(out_openerr, rc);
/* first open in this mode: allocate the cached handle slot */
617 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
619 GOTO(out_och_free, rc = -ENOMEM);
623 /* md_intent_lock() didn't get a request ref if there was an
624 * open error, so don't do cleanup on the request here
626 /* XXX (green): Should not we bail out on any error here, not
627 * just open error? */
628 rc = it_open_error(DISP_OPEN_OPEN, it);
630 GOTO(out_och_free, rc);
632 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
633 "inode %p: disposition %x, status %d\n", inode,
634 it_disposition(it, ~0), it->it_status);
636 rc = ll_local_open(file, it, fd, *och_p);
638 GOTO(out_och_free, rc);
640 mutex_unlock(&lli->lli_och_mutex);
643 /* Must do this outside lli_och_mutex lock to prevent deadlock where
644 different kind of OPEN lock for this same inode gets cancelled
645 by ldlm_cancel_lru */
646 if (!S_ISREG(inode->i_mode))
647 GOTO(out_och_free, rc);
649 cl_lov_delay_create_clear(&file->f_flags);
650 GOTO(out_och_free, rc);
/* error unwind: free the handle slot, drop statahead authorization
 * and fd, and release any intent open reference we still hold */
654 if (och_p && *och_p) {
655 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
656 *och_p = NULL; /* OBD_FREE writes some magic there */
659 mutex_unlock(&lli->lli_och_mutex);
662 if (lli->lli_opendir_key == fd)
663 ll_deauthorize_statahead(inode, fd);
665 ll_file_data_put(fd);
667 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
670 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
671 ptlrpc_req_finished(it->it_request);
672 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* Blocking AST for lease locks: on conflict, cancel the lease lock
 * asynchronously (no openhandle cleanup here — see ll_lease_open()). */
678 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
679 struct ldlm_lock_desc *desc, void *data, int flag)
682 struct lustre_handle lockh;
686 case LDLM_CB_BLOCKING:
687 ldlm_lock2handle(lock, &lockh);
688 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
690 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
/* cancel-time callback: nothing extra to do for leases here */
694 case LDLM_CB_CANCELING:
/* Transfer ownership of the inode's cached per-mode open handle to the
 * fd (fd->fd_och) so a lease can be taken; returns the existing open
 * handle in @old_handle for the MDT to match against. */
702 * When setting a lease on a file, we take ownership of the lli_mds_*_och
703 * and save it as fd->fd_och so as to force client to reopen the file even
704 * if it has an open lock in cache already.
706 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
707 struct lustre_handle *old_handle)
709 struct ll_inode_info *lli = ll_i2info(inode);
710 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
711 struct obd_client_handle **och_p;
716 /* Get the openhandle of the file */
717 mutex_lock(&lli->lli_och_mutex);
/* only one lease per fd */
718 if (fd->fd_lease_och != NULL)
719 GOTO(out_unlock, rc = -EBUSY);
721 if (fd->fd_och == NULL) {
722 if (file->f_mode & FMODE_WRITE) {
723 LASSERT(lli->lli_mds_write_och != NULL);
724 och_p = &lli->lli_mds_write_och;
725 och_usecount = &lli->lli_open_fd_write_count;
727 LASSERT(lli->lli_mds_read_och != NULL);
728 och_p = &lli->lli_mds_read_och;
729 och_usecount = &lli->lli_open_fd_read_count;
/* can't take exclusive ownership while other fds share the handle */
732 if (*och_usecount > 1)
733 GOTO(out_unlock, rc = -EBUSY);
740 *old_handle = fd->fd_och->och_fh;
744 mutex_unlock(&lli->lli_och_mutex);
/* Give the fd-owned open handle back to the inode's per-mode cache when
 * a lease is put back; if another open repopulated the cache meanwhile,
 * close our now-redundant handle instead. */
749 * Release ownership on lli_mds_*_och when putting back a file lease.
751 static int ll_lease_och_release(struct inode *inode, struct file *file)
753 struct ll_inode_info *lli = ll_i2info(inode);
754 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
755 struct obd_client_handle **och_p;
756 struct obd_client_handle *old_och = NULL;
761 mutex_lock(&lli->lli_och_mutex);
762 if (file->f_mode & FMODE_WRITE) {
763 och_p = &lli->lli_mds_write_och;
764 och_usecount = &lli->lli_open_fd_write_count;
766 och_p = &lli->lli_mds_read_och;
767 och_usecount = &lli->lli_open_fd_read_count;
770 /* The file may have been open by another process (broken lease) so
771 * *och_p is not NULL. In this case we should simply increase usecount
774 if (*och_p != NULL) {
775 old_och = fd->fd_och;
782 mutex_unlock(&lli->lli_och_mutex);
/* close the redundant handle outside the mutex (it sends an RPC) */
785 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
/* Open @file with a lease (MDS_OPEN_LEASE) in mode @fmode, returning a
 * new obd_client_handle that owns both the open and the lease lock.
 * On any failure after the open succeeded the handle is closed again. */
791 * Acquire a lease and open the file.
793 static struct obd_client_handle *
794 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
797 struct lookup_intent it = { .it_op = IT_OPEN };
798 struct ll_sb_info *sbi = ll_i2sbi(inode);
799 struct md_op_data *op_data;
800 struct ptlrpc_request *req = NULL;
801 struct lustre_handle old_handle = { 0 };
802 struct obd_client_handle *och = NULL;
/* leases are single-mode: exactly read or exactly write */
807 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
808 RETURN(ERR_PTR(-EINVAL));
/* requested mode must be covered by the fd's own open mode */
811 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
812 RETURN(ERR_PTR(-EPERM));
814 rc = ll_lease_och_acquire(inode, file, &old_handle);
821 RETURN(ERR_PTR(-ENOMEM));
823 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
824 LUSTRE_OPC_ANY, NULL);
826 GOTO(out, rc = PTR_ERR(op_data));
828 /* To tell the MDT this openhandle is from the same owner */
829 op_data->op_handle = old_handle;
831 it.it_flags = fmode | open_flags;
832 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
833 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
834 &ll_md_blocking_lease_ast,
835 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
836 * it can be cancelled which may mislead applications that the lease is
838 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
839 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
840 * doesn't deal with openhandle, so normal openhandle will be leaked. */
841 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
842 ll_finish_md_op_data(op_data);
843 ptlrpc_req_finished(req);
845 GOTO(out_release_it, rc);
847 if (it_disposition(&it, DISP_LOOKUP_NEG))
848 GOTO(out_release_it, rc = -ENOENT);
850 rc = it_open_error(DISP_OPEN_OPEN, &it);
852 GOTO(out_release_it, rc);
854 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
855 ll_och_fill(sbi->ll_md_exp, &it, och);
857 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
858 GOTO(out_close, rc = -EOPNOTSUPP);
860 /* already get lease, handle lease lock */
861 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
862 if (it.it_lock_mode == 0 ||
863 it.it_lock_bits != MDS_INODELOCK_OPEN) {
864 /* open lock must return for lease */
865 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
866 PFID(ll_inode2fid(inode)), it.it_lock_mode,
868 GOTO(out_close, rc = -EPROTO);
871 ll_intent_release(&it);
875 /* Cancel open lock */
/* error unwind: drop the lease lock (if granted) and close the open
 * handle that ll_och_fill() already registered */
876 if (it.it_lock_mode != 0) {
877 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
880 och->och_lease_handle.cookie = 0ULL;
882 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
884 CERROR("%s: error closing file "DFID": %d\n",
885 ll_get_fsname(inode->i_sb, NULL, 0),
886 PFID(&ll_i2info(inode)->lli_fid), rc2);
887 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
889 ll_intent_release(&it);
/* Both inodes must be regular files, writable by the caller, and on the
 * same filesystem for a layout swap to be permitted. */
897 * Check whether a layout swap can be done between two inodes.
899 * \param[in] inode1 First inode to check
900 * \param[in] inode2 Second inode to check
902 * \retval 0 on success, layout swap can be performed between both inodes
903 * \retval negative error code if requirements are not met
905 static int ll_check_swap_layouts_validity(struct inode *inode1,
906 struct inode *inode2)
908 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
911 if (inode_permission(inode1, MAY_WRITE) ||
912 inode_permission(inode2, MAY_WRITE))
915 if (inode1->i_sb != inode2->i_sb)
/* Close @och with the MDS_CLOSE_LAYOUT_SWAP bias so the MDT atomically
 * swaps the layouts of @inode and @inode2 as part of the close. */
921 static int ll_swap_layouts_close(struct obd_client_handle *och,
922 struct inode *inode, struct inode *inode2)
924 const struct lu_fid *fid1 = ll_inode2fid(inode);
925 const struct lu_fid *fid2;
929 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
930 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
932 rc = ll_check_swap_layouts_validity(inode, inode2);
934 GOTO(out_free_och, rc);
936 /* We now know that inode2 is a lustre inode */
937 fid2 = ll_inode2fid(inode2);
/* swapping a file with itself is rejected */
939 rc = lu_fid_cmp(fid1, fid2);
941 GOTO(out_free_och, rc = -EINVAL);
943 /* Close the file and swap layouts between inode & inode2.
944 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
945 * because we still need it to pack l_remote_handle to MDT. */
946 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
949 och = NULL; /* freed in ll_close_inode_openhandle() */
/* Cancel the lease lock (unless it was already broken/cancelled), report
 * whether it had been broken via @lease_broken, then close the handle. */
959 * Release lease and close the file.
960 * It will check if the lease has ever broken.
962 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
965 struct ldlm_lock *lock;
966 bool cancelled = true;
970 lock = ldlm_handle2lock(&och->och_lease_handle);
/* inspect the lock's cancel state under its resource lock */
972 lock_res_and_lock(lock);
973 cancelled = ldlm_is_cancel(lock);
974 unlock_res_and_lock(lock);
978 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
979 PFID(&ll_i2info(inode)->lli_fid), cancelled);
/* still granted: cancel it ourselves before closing */
982 ldlm_cli_cancel(&och->och_lease_handle, 0);
984 if (lease_broken != NULL)
985 *lease_broken = cancelled;
987 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/* Merge MDS-provided timestamps cached in lli with OST-side attributes
 * (size, blocks, times) obtained through the cl_object layer, keeping
 * the most recent timestamp of each kind; runs under the inode size
 * lock so size/blocks update atomically with the times. */
991 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
993 struct ll_inode_info *lli = ll_i2info(inode);
994 struct cl_object *obj = lli->lli_clob;
995 struct cl_attr *attr = vvp_env_thread_attr(env);
1003 ll_inode_size_lock(inode);
1005 /* Merge timestamps the most recently obtained from MDS with
1006 * timestamps obtained from OSTs.
1008 * Do not overwrite atime of inode because it may be refreshed
1009 * by file_accessed() function. If the read was served by cache
1010 * data, there is no RPC to be sent so that atime may not be
1011 * transferred to OSTs at all. MDT only updates atime at close time
1012 * if it's at least 'mdd.*.atime_diff' older.
1013 * All in all, the atime in Lustre does not strictly comply with
1014 * POSIX. Solving this problem needs to send an RPC to MDT for each
1015 * read, this will hurt performance. */
1016 if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
1017 LTIME_S(inode->i_atime) = lli->lli_atime;
1018 lli->lli_update_atime = 0;
1020 LTIME_S(inode->i_mtime) = lli->lli_mtime;
1021 LTIME_S(inode->i_ctime) = lli->lli_ctime;
/* snapshot current inode times; OST values may push them forward */
1023 atime = LTIME_S(inode->i_atime);
1024 mtime = LTIME_S(inode->i_mtime);
1025 ctime = LTIME_S(inode->i_ctime);
1027 cl_object_attr_lock(obj);
1028 rc = cl_object_attr_get(env, obj, attr);
1029 cl_object_attr_unlock(obj);
1032 GOTO(out_size_unlock, rc);
/* keep the newer of MDS-cached vs OST-reported time of each kind */
1034 if (atime < attr->cat_atime)
1035 atime = attr->cat_atime;
1037 if (ctime < attr->cat_ctime)
1038 ctime = attr->cat_ctime;
1040 if (mtime < attr->cat_mtime)
1041 mtime = attr->cat_mtime;
1043 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1044 PFID(&lli->lli_fid), attr->cat_size);
/* size and blocks always come from the OSTs */
1046 i_size_write(inode, attr->cat_size);
1047 inode->i_blocks = attr->cat_blocks;
1049 LTIME_S(inode->i_atime) = atime;
1050 LTIME_S(inode->i_mtime) = mtime;
1051 LTIME_S(inode->i_ctime) = ctime;
1054 ll_inode_size_unlock(inode);
/* Decide whether atime updates should be suppressed for @file, checking
 * the same flag sources as the kernel's file_accessed()/touch_atime():
 * O_NOATIME on the fd, S_NOATIME on the inode, and the mount/sb flags. */
1059 static bool file_is_noatime(const struct file *file)
1061 const struct vfsmount *mnt = file->f_path.mnt;
1062 const struct inode *inode = file_inode((struct file *)file);
1064 /* Adapted from file_accessed() and touch_atime().*/
1065 if (file->f_flags & O_NOATIME)
1068 if (inode->i_flags & S_NOATIME)
1071 if (IS_NOATIME(inode))
1074 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1077 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1080 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/* Initialize a cl_io for a read or write on @file: set up the embedded
 * kiocb, nonblock/append/sync flags, the cl_object target, the locking
 * policy, and whether parallel IO (ptask) may be used. */
1086 static int ll_file_io_ptask(struct cfs_ptask *ptask);
1088 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1090 struct inode *inode = file_inode(file);
1092 memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter));
1093 init_sync_kiocb(&io->u.ci_rw.rw_iocb, file);
1094 io->u.ci_rw.rw_file = file;
1095 io->u.ci_rw.rw_ptask = ll_file_io_ptask;
1096 io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK);
/* write-only flags: append mode and synchronous/direct semantics */
1097 if (iot == CIT_WRITE) {
1098 io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND);
1099 io->u.ci_rw.rw_sync = !!(file->f_flags & O_SYNC ||
1100 file->f_flags & O_DIRECT ||
1103 io->ci_obj = ll_i2info(inode)->lli_clob;
/* locking policy: never for nolock files, mandatory for O_APPEND,
 * otherwise let the lower layers decide */
1104 io->ci_lockreq = CILR_MAYBE;
1105 if (ll_file_nolock(file)) {
1106 io->ci_lockreq = CILR_NEVER;
1107 io->ci_no_srvlock = 1;
1108 } else if (file->f_flags & O_APPEND) {
1109 io->ci_lockreq = CILR_MANDATORY;
1111 io->ci_noatime = file_is_noatime(file);
/* parallel IO is opt-in per superblock and never used for appends */
1112 if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO)
1113 io->ci_pio = !io->u.ci_rw.rw_append;
/* Worker body for one parallel-IO (ptask) slice: run a nested cl_io for
 * the slice described by @ptask's cl_io_pt, accumulate the bytes moved
 * into cip_result, and advance the caller's iterator/kiocb position. */
1118 static int ll_file_io_ptask(struct cfs_ptask *ptask)
1120 struct cl_io_pt *pt = ptask->pt_cbdata;
1121 struct file *file = pt->cip_file;
1124 loff_t pos = pt->cip_pos;
1129 env = cl_env_get(&refcheck);
1131 RETURN(PTR_ERR(env));
1133 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1134 file_dentry(file)->d_name.name,
1135 pt->cip_iot == CIT_READ ? "read" : "write",
1136 pos, pos + pt->cip_count);
1139 io = vvp_env_thread_io(env);
1140 ll_io_init(io, file, pt->cip_iot);
/* copy the caller's iterator/iocb state into this slice's io */
1141 io->u.ci_rw.rw_iter = pt->cip_iter;
1142 io->u.ci_rw.rw_iocb = pt->cip_iocb;
1143 io->ci_pio = 0; /* It's already in parallel task */
1145 rc = cl_io_rw_init(env, io, pt->cip_iot, pos,
1146 pt->cip_count - pt->cip_result);
1148 struct vvp_io *vio = vvp_env_io(env);
1150 vio->vui_io_subtype = IO_NORMAL;
1151 vio->vui_fd = LUSTRE_FPRIVATE(file);
/* register the io in the cl context so page ops can find it */
1153 ll_cl_add(file, env, io, LCC_RW);
1154 rc = cl_io_loop(env, io);
1155 ll_cl_remove(file, env);
1157 /* cl_io_rw_init() handled IO */
/* fault-injection point for exercising ptask error handling */
1161 if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) {
/* account for partial progress and advance the shared iterator */
1167 if (io->ci_nob > 0) {
1168 pt->cip_result += io->ci_nob;
1169 iov_iter_advance(&pt->cip_iter, io->ci_nob);
1171 pt->cip_iocb.ki_pos = pos;
1172 #ifdef HAVE_KIOCB_KI_LEFT
1173 pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result;
1174 #elif defined(HAVE_KI_NBYTES)
1175 pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result;
1179 cl_io_fini(env, io);
/* layout change etc. may require restarting the remaining range */
1181 if ((rc == 0 || rc == -ENODATA) &&
1182 pt->cip_result < pt->cip_count &&
1183 io->ci_need_restart) {
1185 "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1186 file_dentry(file)->d_name.name,
1187 pt->cip_iot == CIT_READ ? "read" : "write",
1188 pos, pos + pt->cip_count - pt->cip_result,
1189 pt->cip_result, rc);
1193 CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
1194 file_dentry(file)->d_name.name,
1195 pt->cip_iot == CIT_READ ? "read" : "write",
1196 pt->cip_result, rc);
1198 cl_env_put(env, &refcheck);
/* partial success wins over the error code, like read/write(2) */
1199 RETURN(pt->cip_result > 0 ? 0 : rc);
/*
 * ll_file_io_generic(): common engine behind llite read and write paths.
 * Builds a cl_io of the requested type (CIT_READ/CIT_WRITE), takes the
 * per-file range lock where required, runs cl_io_loop(), and restarts
 * the IO with the remaining range when io->ci_need_restart is set
 * (e.g. after a layout change).
 * NOTE(review): this listing is elided; comments describe visible lines only.
 */
1203 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1204 struct file *file, enum cl_io_type iot,
1205 loff_t *ppos, size_t count)
1207 struct range_lock range;
1208 struct vvp_io *vio = vvp_env_io(env);
1209 struct inode *inode = file_inode(file);
1210 struct ll_inode_info *lli = ll_i2info(inode);
1211 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1219 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1220 file_dentry(file)->d_name.name,
1221 iot == CIT_READ ? "read" : "write", pos, pos + count);
1224 io = vvp_env_thread_io(env);
1225 ll_io_init(io, file, iot);
1226 if (args->via_io_subtype == IO_NORMAL) {
/* normal IO: carry the caller's iov_iter/kiocb into the cl_io */
1227 io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
1228 io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;
1233 if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
1234 bool range_locked = false;
/* O_APPEND writes must serialize against the whole file range */
1236 if (file->f_flags & O_APPEND)
1237 range_lock_init(&range, 0, LUSTRE_EOF);
1239 range_lock_init(&range, pos, pos + count - 1);
1241 vio->vui_fd = LUSTRE_FPRIVATE(file);
1242 vio->vui_io_subtype = args->via_io_subtype;
1244 switch (vio->vui_io_subtype) {
1246 /* Direct IO reads must also take range lock,
1247 * or multiple reads will try to work on the same pages
1248 * See LU-6227 for details. */
1249 if (((iot == CIT_WRITE) ||
1250 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1251 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1252 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1254 rc = range_lock(&lli->lli_write_tree, &range);
1258 range_locked = true;
1262 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1263 vio->u.splice.vui_flags = args->u.splice.via_flags;
1266 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1270 ll_cl_add(file, env, io, LCC_RW);
/* NOTE(review): the matching inode_lock() is in an elided line;
 * lli_inode_locked records that this thread holds it so it can be
 * dropped after cl_io_loop() below. */
1271 if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) &&
1272 !lli->lli_inode_locked) {
1274 lli->lli_inode_locked = 1;
1276 rc = cl_io_loop(env, io);
1277 if (lli->lli_inode_locked) {
1278 lli->lli_inode_locked = 0;
1279 inode_unlock(inode);
1281 ll_cl_remove(file, env);
1284 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1286 range_unlock(&lli->lli_write_tree, &range);
1289 /* cl_io_rw_init() handled IO */
/* account bytes moved this pass and advance the caller's iterator
 * and position so a restart continues where this pass stopped */
1293 if (io->ci_nob > 0) {
1294 result += io->ci_nob;
1295 count -= io->ci_nob;
1297 if (args->via_io_subtype == IO_NORMAL) {
1298 iov_iter_advance(args->u.normal.via_iter, io->ci_nob);
1300 args->u.normal.via_iocb->ki_pos = pos;
1301 #ifdef HAVE_KIOCB_KI_LEFT
1302 args->u.normal.via_iocb->ki_left = count;
1303 #elif defined(HAVE_KI_NBYTES)
1304 args->u.normal.via_iocb->ki_nbytes = count;
1308 pos = io->u.ci_rw.rw_range.cir_pos;
1312 cl_io_fini(env, io);
/* retry with the remaining range on restartable (layout-change) IO */
1314 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1316 "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1317 file_dentry(file)->d_name.name,
1318 iot == CIT_READ ? "read" : "write",
1319 pos, pos + count, result, rc);
/* per-mount stats and write-failure tracking for fsync semantics */
1323 if (iot == CIT_READ) {
1325 ll_stats_ops_tally(ll_i2sbi(inode),
1326 LPROC_LL_READ_BYTES, result);
1327 } else if (iot == CIT_WRITE) {
1329 ll_stats_ops_tally(ll_i2sbi(inode),
1330 LPROC_LL_WRITE_BYTES, result);
1331 fd->fd_write_failed = false;
1332 } else if (result == 0 && rc == 0) {
1335 fd->fd_write_failed = true;
1337 fd->fd_write_failed = false;
1338 } else if (rc != -ERESTARTSYS) {
1339 fd->fd_write_failed = true;
1343 CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n",
1344 file_dentry(file)->d_name.name,
1345 iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc);
/* return bytes transferred if any, otherwise the error code */
1349 RETURN(result > 0 ? result : rc);
1353 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1354 * especially for small I/O.
1356 * To serve a read request, CLIO has to create and initialize a cl_io and
1357 * then request DLM lock. This has turned out to have significant overhead
1358 * and affects the performance of small I/O dramatically.
1360 * It's not necessary to create a cl_io for each I/O. Under the help of read
1361 * ahead, most of the pages being read are already in memory cache and we can
1362 * read those pages directly because if the pages exist, the corresponding DLM
1363 * lock must exist so that page content must be valid.
1365 * In fast read implementation, the llite speculatively finds and reads pages
1366 * in memory cache. There are three scenarios for fast read:
1367 * - If the page exists and is uptodate, kernel VM will provide the data and
1368 * CLIO won't be intervened;
1369 * - If the page was brought into memory by read ahead, it will be exported
1370 * and read ahead parameters will be updated;
1371 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1372 * it will go back and invoke normal read, i.e., a cl_io will be created
1373 * and DLM lock will be requested.
1375 * POSIX compliance: posix standard states that read is intended to be atomic.
1376 * Lustre read implementation is in line with Linux kernel read implementation
1377 * and neither of them complies with POSIX standard in this matter. Fast read
1378 * doesn't make the situation worse on single node but it may interleave write
1379 * results from multiple nodes due to short read handling in ll_file_aio_read().
1381 * \param env - lu_env
1382 * \param iocb - kiocb from kernel
1383 * \param iter - user space buffers where the data will be copied
1385 * \retval - number of bytes have been read, or error code if error occurred.
/*
 * ll_do_fast_read(): serve a read straight from the page cache, bypassing
 * cl_io/DLM setup. Returns -ENODATA when the fast path cannot be used so
 * the caller falls back to the normal read path.
 */
1388 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
/* fast read must be enabled for this superblock */
1392 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1395 /* NB: we can't do direct IO for fast read because it will need a lock
1396 * to make IO engine happy. */
1397 if (iocb->ki_filp->f_flags & O_DIRECT)
1400 result = generic_file_read_iter(iocb, iter);
1402 /* If the first page is not in cache, generic_file_aio_read() will
1403 * return -ENODATA.
1404 * See corresponding code in ll_readpage(). */
1405 if (result == -ENODATA)
1409 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1410 LPROC_LL_READ_BYTES, result);
1416 * Read from a file (through the page cache).
/*
 * Try the fast (page-cache-only) read first; only if data remains and no
 * error occurred does it set up a full cl_io via ll_file_io_generic().
 */
1418 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1421 struct vvp_io_args *args;
1426 result = ll_do_fast_read(iocb, to);
/* done: error, or the fast path consumed the whole iterator */
1427 if (result < 0 || iov_iter_count(to) == 0)
1430 env = cl_env_get(&refcheck);
1432 return PTR_ERR(env);
1434 args = ll_env_args(env, IO_NORMAL);
1435 args->u.normal.via_iter = to;
1436 args->u.normal.via_iocb = iocb;
/* read the remainder through the generic cl_io path */
1438 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1439 &iocb->ki_pos, iov_iter_count(to));
1442 else if (result == 0)
1445 cl_env_put(env, &refcheck);
1451 * Write to a file (through the page cache).
/*
 * ->write_iter entry point: packages the iov_iter/kiocb into vvp_io_args
 * and hands them to ll_file_io_generic() as a CIT_WRITE.
 */
1453 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1455 struct vvp_io_args *args;
1460 env = cl_env_get(&refcheck);
1462 return PTR_ERR(env);
1464 args = ll_env_args(env, IO_NORMAL);
1465 args->u.normal.via_iter = from;
1466 args->u.normal.via_iocb = iocb;
1468 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1469 &iocb->ki_pos, iov_iter_count(from));
1470 cl_env_put(env, &refcheck);
1474 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1476 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array: rejects negative / wrapping cumulative lengths
 * and truncates at the first inaccessible segment, returning the usable
 * segment count and total byte count through *nr_segs / *count.
 */
1478 static int ll_file_get_iov_count(const struct iovec *iov,
1479 unsigned long *nr_segs, size_t *count)
1484 for (seg = 0; seg < *nr_segs; seg++) {
1485 const struct iovec *iv = &iov[seg];
1488 * If any segment has a negative length, or the cumulative
1489 * length ever wraps negative then return -EINVAL.
1492 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1494 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1499 cnt -= iv->iov_len; /* This segment is no good */
/*
 * Compat aio read entry for kernels without ->read_iter: validates the
 * iovec array, wraps it in an iov_iter, and calls ll_file_read_iter().
 */
1506 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1507 unsigned long nr_segs, loff_t pos)
1514 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
/* iov_iter_init() signature differs across kernel versions */
1518 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1519 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1520 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1521 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1522 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1524 result = ll_file_read_iter(iocb, &to);
/*
 * Compat synchronous read entry for kernels without ->read_iter:
 * wraps the user buffer in an iovec + sync kiocb and delegates to
 * ll_file_aio_read(), then propagates the updated file position.
 */
1529 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1532 struct iovec iov = { .iov_base = buf, .iov_len = count };
1537 init_sync_kiocb(&kiocb, file);
1538 kiocb.ki_pos = *ppos;
1539 #ifdef HAVE_KIOCB_KI_LEFT
1540 kiocb.ki_left = count;
1541 #elif defined(HAVE_KI_NBYTES)
/* fix: the struct kiocb member is ki_nbytes, not i_nbytes (see the
 * matching assignment in ll_file_write()); the old spelling failed to
 * compile when HAVE_KI_NBYTES was defined. */
1542 kiocb.ki_nbytes = count;
1545 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1546 *ppos = kiocb.ki_pos;
1552 * Write to a file (through the page cache).
/*
 * Compat aio write entry for kernels without ->write_iter: validates the
 * iovec array, builds an iov_iter, and calls ll_file_write_iter().
 */
1555 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1556 unsigned long nr_segs, loff_t pos)
1558 struct iov_iter from;
1563 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
/* iov_iter_init() signature differs across kernel versions */
1567 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1568 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1569 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1570 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1571 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1573 result = ll_file_write_iter(iocb, &from);
/*
 * Compat synchronous write entry for kernels without ->write_iter:
 * uses the per-env kiocb (lti_kiocb) rather than a stack kiocb, then
 * delegates to ll_file_aio_write() and propagates the file position.
 */
1578 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1579 size_t count, loff_t *ppos)
1582 struct iovec iov = { .iov_base = (void __user *)buf,
1584 struct kiocb *kiocb;
1589 env = cl_env_get(&refcheck);
1591 RETURN(PTR_ERR(env));
1593 kiocb = &ll_env_info(env)->lti_kiocb;
1594 init_sync_kiocb(kiocb, file);
1595 kiocb->ki_pos = *ppos;
1596 #ifdef HAVE_KIOCB_KI_LEFT
1597 kiocb->ki_left = count;
1598 #elif defined(HAVE_KI_NBYTES)
1599 kiocb->ki_nbytes = count;
1602 result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos);
1603 *ppos = kiocb->ki_pos;
1605 cl_env_put(env, &refcheck);
1611 * Send file content (through pagecache) somewhere with helper
/*
 * splice_read entry point: packages the pipe/flags into IO_SPLICE
 * vvp_io_args and runs a CIT_READ through ll_file_io_generic().
 */
1613 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1614 struct pipe_inode_info *pipe, size_t count,
1618 struct vvp_io_args *args;
1623 env = cl_env_get(&refcheck);
1625 RETURN(PTR_ERR(env));
1627 args = ll_env_args(env, IO_SPLICE);
1628 args->u.splice.via_pipe = pipe;
1629 args->u.splice.via_flags = flags;
1631 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1632 cl_env_put(env, &refcheck);
/*
 * Apply striping (LOV EA) to a file by re-opening it by FID with the
 * given lov_user_md, then closing the open handle. Runs under the
 * inode size lock.
 */
1636 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1637 __u64 flags, struct lov_user_md *lum, int lum_size)
1639 struct lookup_intent oit = {
1641 .it_flags = flags | MDS_OPEN_BY_FID,
1646 ll_inode_size_lock(inode);
1647 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1649 GOTO(out_unlock, rc);
/* the open handle is only needed to carry the EA; close it again */
1651 ll_release_openhandle(dentry, &oit);
1654 ll_inode_size_unlock(inode);
1655 ll_intent_release(&oit);
/*
 * Fetch the LOV EA (striping info) for @filename from the MDS.
 * On success returns the (still wire-format-validated, byte-swapped on
 * big-endian hosts) lov_mds_md via *lmmp plus its size, and the request
 * via *request so the caller can release it.
 */
1660 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1661 struct lov_mds_md **lmmp, int *lmm_size,
1662 struct ptlrpc_request **request)
1664 struct ll_sb_info *sbi = ll_i2sbi(inode);
1665 struct mdt_body *body;
1666 struct lov_mds_md *lmm = NULL;
1667 struct ptlrpc_request *req = NULL;
1668 struct md_op_data *op_data;
1671 rc = ll_get_default_mdsize(sbi, &lmmsize);
1675 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1676 strlen(filename), lmmsize,
1677 LUSTRE_OPC_ANY, NULL);
1678 if (IS_ERR(op_data))
1679 RETURN(PTR_ERR(op_data));
1681 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1682 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1683 ll_finish_md_op_data(op_data);
1685 CDEBUG(D_INFO, "md_getattr_name failed "
1686 "on %s: rc %d\n", filename, rc);
1690 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1691 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1693 lmmsize = body->mbo_eadatasize;
/* no striping EA present on this file/directory */
1695 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1697 GOTO(out, rc = -ENODATA);
1700 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1701 LASSERT(lmm != NULL);
/* only plain v1/v3 and composite (PFL) layouts are understood here */
1703 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
1704 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
1705 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
1706 GOTO(out, rc = -EPROTO);
1709 * This is coming from the MDS, so is probably in
1710 * little endian. We convert it to host endian before
1711 * passing it to userspace.
/* no-op on little-endian hosts; swab only when host != wire order */
1713 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1716 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
1717 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1718 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1719 if (le32_to_cpu(lmm->lmm_pattern) &
1720 LOV_PATTERN_F_RELEASED)
1724 /* if called for a directory - avoid swabbing
1725 * non-existent lsm objects */
1726 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1727 lustre_swab_lov_user_md_v1(
1728 (struct lov_user_md_v1 *)lmm);
1729 if (S_ISREG(body->mbo_mode))
1730 lustre_swab_lov_user_md_objects(
1731 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1733 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1734 lustre_swab_lov_user_md_v3(
1735 (struct lov_user_md_v3 *)lmm);
1736 if (S_ISREG(body->mbo_mode))
1737 lustre_swab_lov_user_md_objects(
1738 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1740 } else if (lmm->lmm_magic ==
1741 cpu_to_le32(LOV_MAGIC_COMP_V1)) {
1742 lustre_swab_lov_comp_md_v1(
1743 (struct lov_comp_md_v1 *)lmm);
1749 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: copies a lov_user_md (with one OST object
 * entry) from userspace and applies it via ll_lov_setstripe_ea_info().
 * Root-only (CFS_CAP_SYS_ADMIN).
 */
1754 static int ll_lov_setea(struct inode *inode, struct file *file,
1757 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1758 struct lov_user_md *lump;
1759 int lum_size = sizeof(struct lov_user_md) +
1760 sizeof(struct lov_user_ost_data);
1764 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1767 OBD_ALLOC_LARGE(lump, lum_size);
1771 if (copy_from_user(lump, arg, lum_size))
1772 GOTO(out_lump, rc = -EFAULT);
1774 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
/* clear the delayed-create flag regardless of the outcome */
1776 cl_lov_delay_create_clear(&file->f_flags);
1779 OBD_FREE_LARGE(lump, lum_size);
/*
 * Copy the file's striping information to the userspace buffer @lum
 * (of @size bytes) via cl_object_getstripe().
 */
1783 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
1790 env = cl_env_get(&refcheck);
1792 RETURN(PTR_ERR(env));
1794 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
1795 cl_env_put(env, &refcheck);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copies the user's lov_user_md, applies
 * it, refreshes the layout generation, and echoes the resulting stripe
 * info back to userspace.
 */
1799 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1802 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1803 struct lov_user_md *klum;
1805 __u64 flags = FMODE_WRITE;
1808 rc = ll_copy_user_md(lum, &klum);
1813 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
/* NOTE(review): zeroing lmm_stripe_count here presumably signals the
 * follow-up getstripe below to report actual values - confirm */
1818 rc = put_user(0, &lum->lmm_stripe_count);
1822 rc = ll_layout_refresh(inode, &gen);
1826 rc = ll_file_getstripe(inode, arg, lum_size);
1828 cl_lov_delay_create_clear(&file->f_flags);
1831 OBD_FREE(klum, lum_size);
/*
 * LL_IOC_GROUP_LOCK handler: take a group lock with group id @arg on
 * behalf of this open file. Only one group lock per file descriptor;
 * the lock is recorded in fd->fd_grouplock under lli->lli_lock.
 */
1836 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1838 struct ll_inode_info *lli = ll_i2info(inode);
1839 struct cl_object *obj = lli->lli_clob;
1840 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1841 struct ll_grouplock grouplock;
/* gid 0 is reserved / invalid for group locks */
1846 CWARN("group id for group lock must not be 0\n");
1850 if (ll_file_nolock(file))
1851 RETURN(-EOPNOTSUPP);
1853 spin_lock(&lli->lli_lock);
1854 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1855 CWARN("group lock already existed with gid %lu\n",
1856 fd->fd_grouplock.lg_gid);
1857 spin_unlock(&lli->lli_lock);
1860 LASSERT(fd->fd_grouplock.lg_lock == NULL);
1861 spin_unlock(&lli->lli_lock);
1864 * XXX: group lock needs to protect all OST objects while PFL
1865 * can add new OST objects during the IO, so we'd instantiate
1866 * all OST objects before getting its group lock.
1871 struct cl_layout cl = {
1872 .cl_is_composite = false,
1875 env = cl_env_get(&refcheck);
1877 RETURN(PTR_ERR(env));
1879 rc = cl_object_layout_get(env, obj, &cl);
/* composite (PFL) layout: force instantiation of all components */
1880 if (!rc && cl.cl_is_composite)
1881 rc = ll_layout_write_intent(inode, 0, OBD_OBJECT_EOF);
1883 cl_env_put(env, &refcheck);
1888 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
1889 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* recheck under the spinlock: another thread may have raced us */
1893 spin_lock(&lli->lli_lock);
1894 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1895 spin_unlock(&lli->lli_lock);
1896 CERROR("another thread just won the race\n");
1897 cl_put_grouplock(&grouplock);
1901 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1902 fd->fd_grouplock = grouplock;
1903 spin_unlock(&lli->lli_lock);
1905 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK handler: release the group lock with id @arg held
 * on this file descriptor. Fails if no lock is held or the id differs.
 */
1909 static int ll_put_grouplock(struct inode *inode, struct file *file,
1912 struct ll_inode_info *lli = ll_i2info(inode);
1913 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1914 struct ll_grouplock grouplock;
1917 spin_lock(&lli->lli_lock);
1918 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1919 spin_unlock(&lli->lli_lock);
1920 CWARN("no group lock held\n");
1924 LASSERT(fd->fd_grouplock.lg_lock != NULL);
1926 if (fd->fd_grouplock.lg_gid != arg) {
1927 CWARN("group lock %lu doesn't match current id %lu\n",
1928 arg, fd->fd_grouplock.lg_gid);
1929 spin_unlock(&lli->lli_lock);
/* detach under the spinlock, release the DLM lock outside of it */
1933 grouplock = fd->fd_grouplock;
1934 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1935 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1936 spin_unlock(&lli->lli_lock);
1938 cl_put_grouplock(&grouplock);
1939 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1944 * Close inode open handle
1946 * \param dentry [in] dentry which contains the inode
1947 * \param it [in,out] intent which contains open info and result
1950 * \retval <0 failure
/*
 * Close the MDS open handle carried in @it (if any): fills an
 * obd_client_handle from the intent and sends the close, then drops
 * the open request reference held by the intent.
 */
1952 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1954 struct inode *inode = dentry->d_inode;
1955 struct obd_client_handle *och;
1961 /* Root ? Do nothing. */
1962 if (dentry->d_inode->i_sb->s_root == dentry)
1965 /* No open handle to close? Move away */
1966 if (!it_disposition(it, DISP_OPEN_OPEN))
1969 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1971 OBD_ALLOC(och, sizeof(*och));
1973 GOTO(out, rc = -ENOMEM);
1975 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1977 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
1979 /* this one is in place of ll_file_open */
1980 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1981 ptlrpc_req_finished(it->it_request);
1982 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1988 * Get size for inode for which FIEMAP mapping is requested.
1989 * Make the FIEMAP get_info call and returns the result.
1990 * \param fiemap kernel buffer to hold extents
1991 * \param num_bytes kernel buffer size
/*
 * Execute a FIEMAP request: validates the flags, flushes dirty data if
 * FIEMAP_FLAG_SYNC is set, then asks the OSTs for the extent mapping
 * via cl_object_fiemap().
 */
1993 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
1999 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2002 /* Checks for fiemap flags */
2003 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* report back which flags we do support, then fail */
2004 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2008 /* Check for FIEMAP_FLAG_SYNC */
2009 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2010 rc = filemap_fdatawrite(inode->i_mapping);
2015 env = cl_env_get(&refcheck);
2017 RETURN(PTR_ERR(env));
/* a zero cached size may just be stale; glimpse the real size */
2019 if (i_size_read(inode) == 0) {
2020 rc = ll_glimpse_size(inode);
2025 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2026 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2027 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2029 /* If filesize is 0, then there would be no objects for mapping */
2030 if (fmkey.lfik_oa.o_size == 0) {
2031 fiemap->fm_mapped_extents = 0;
2035 fmkey.lfik_fiemap = *fiemap;
2037 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2038 &fmkey, fiemap, &num_bytes);
2040 cl_env_put(env, &refcheck);
/*
 * OBD_IOC_FID2PATH handler: resolve a FID to a path on the MDT.
 * Copies the user's getinfo_fid2path header, appends the root FID for
 * fileset-aware servers, and round-trips through obd_iocontrol().
 */
2044 int ll_fid2path(struct inode *inode, void __user *arg)
2046 struct obd_export *exp = ll_i2mdexp(inode);
2047 const struct getinfo_fid2path __user *gfin = arg;
2049 struct getinfo_fid2path *gfout;
/* permission: DAC_READ_SEARCH capability or the user_fid2path mount flag */
2055 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2056 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2059 /* Only need to get the buflen */
2060 if (get_user(pathlen, &gfin->gf_pathlen))
2063 if (pathlen > PATH_MAX)
2066 outsize = sizeof(*gfout) + pathlen;
2067 OBD_ALLOC(gfout, outsize);
2071 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2072 GOTO(gf_free, rc = -EFAULT);
2073 /* append root FID after gfout to let MDT know the root FID so that it
2074 * can lookup the correct path, this is mainly for fileset.
2075 * old server without fileset mount support will ignore this. */
2076 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2078 /* Call mdc_iocontrol */
2079 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2083 if (copy_to_user(arg, gfout, outsize))
2087 OBD_FREE(gfout, outsize);
2092 * Read the data_version for inode.
2094 * This value is computed using stripe object version on OST.
2095 * Version is computed using server side locking.
2097 * @param flags if do sync on the OST side;
2099 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2100 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/*
 * Fetch the data version of @inode through a CIT_DATA_VERSION cl_io.
 * @flags selects the OST-side flush behavior (LL_DV_RD_FLUSH /
 * LL_DV_WR_FLUSH, see the comment above).
 */
2102 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2104 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2112 /* If no file object initialized, we consider its version is 0. */
2118 env = cl_env_get(&refcheck);
2120 RETURN(PTR_ERR(env));
2122 io = vvp_env_thread_io(env);
2124 io->u.ci_data_version.dv_data_version = 0;
2125 io->u.ci_data_version.dv_flags = flags;
2128 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2129 result = cl_io_loop(env, io);
2131 result = io->ci_result;
2133 *data_version = io->u.ci_data_version.dv_data_version;
2135 cl_io_fini(env, io);
/* NOTE(review): restart handling appears in an elided line here */
2137 if (unlikely(io->ci_need_restart))
2140 cl_env_put(env, &refcheck);
2146 * Trigger a HSM release request for the provided inode.
/*
 * Trigger an HSM release for @inode: take a write lease opened with
 * MDS_OPEN_RELEASE, flush and record the data version, merge attributes,
 * then close the handle with MDS_HSM_RELEASE so the MDT can release the
 * file's OST objects.
 */
2148 int ll_hsm_release(struct inode *inode)
2151 struct obd_client_handle *och = NULL;
2152 __u64 data_version = 0;
2157 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2158 ll_get_fsname(inode->i_sb, NULL, 0),
2159 PFID(&ll_i2info(inode)->lli_fid));
2161 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2163 GOTO(out, rc = PTR_ERR(och));
2165 /* Grab latest data_version and [am]time values */
2166 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2170 env = cl_env_get(&refcheck);
2172 GOTO(out, rc = PTR_ERR(env));
2174 ll_merge_attr(env, inode);
2175 cl_env_put(env, &refcheck);
2177 /* Release the file.
2178 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2179 * we still need it to pack l_remote_handle to MDT. */
2180 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
/* error path: if the lease was obtained, close it explicitly */
2186 if (och != NULL && !IS_ERR(och)) /* close the file */
2187 ll_lease_close(och, inode, NULL);
/* Working state for ll_swap_layouts(); holds the (possibly reordered)
 * inode pair plus data-version bookkeeping. NOTE(review): some fields
 * are in elided lines of this listing. */
2192 struct ll_swap_stack {
2195 struct inode *inode1;
2196 struct inode *inode2;
/*
 * Swap the layouts of two files (LL_IOC_LOV_SWAP_LAYOUTS). Orders the
 * pair deterministically by FID, optionally flushes dirty cache via a
 * temporary group lock, verifies requested data versions, then asks the
 * MDT to perform the swap.
 */
2201 static int ll_swap_layouts(struct file *file1, struct file *file2,
2202 struct lustre_swap_layouts *lsl)
2204 struct mdc_swap_layouts msl;
2205 struct md_op_data *op_data;
2208 struct ll_swap_stack *llss = NULL;
2211 OBD_ALLOC_PTR(llss);
2215 llss->inode1 = file_inode(file1);
2216 llss->inode2 = file_inode(file2);
2218 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2222 /* we use 2 bool because it is easier to swap than 2 bits */
2223 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2224 llss->check_dv1 = true;
2226 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2227 llss->check_dv2 = true;
2229 /* we cannot use lsl->sl_dvX directly because we may swap them */
2230 llss->dv1 = lsl->sl_dv1;
2231 llss->dv2 = lsl->sl_dv2;
2233 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2234 if (rc == 0) /* same file, done! */
/* canonical FID order avoids lock-ordering deadlocks between nodes */
2237 if (rc < 0) { /* sequentialize it */
2238 swap(llss->inode1, llss->inode2);
2240 swap(llss->dv1, llss->dv2);
2241 swap(llss->check_dv1, llss->check_dv2);
2245 if (gid != 0) { /* application asks to flush dirty cache */
2246 rc = ll_get_grouplock(llss->inode1, file1, gid);
2250 rc = ll_get_grouplock(llss->inode2, file2, gid);
2252 ll_put_grouplock(llss->inode1, file1, gid);
2257 /* ultimate check, before swapping the layouts we check if
2258 * dataversion has changed (if requested) */
2259 if (llss->check_dv1) {
2260 rc = ll_data_version(llss->inode1, &dv, 0);
2263 if (dv != llss->dv1)
2264 GOTO(putgl, rc = -EAGAIN);
2267 if (llss->check_dv2) {
2268 rc = ll_data_version(llss->inode2, &dv, 0);
2271 if (dv != llss->dv2)
2272 GOTO(putgl, rc = -EAGAIN);
2275 /* struct md_op_data is used to send the swap args to the mdt
2276 * only flags is missing, so we use struct mdc_swap_layouts
2277 * through the md_op_data->op_data */
2278 /* flags from user space have to be converted before they are send to
2279 * server, no flag is sent today, they are only used on the client */
2282 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2283 0, LUSTRE_OPC_ANY, &msl);
2284 if (IS_ERR(op_data))
2285 GOTO(free, rc = PTR_ERR(op_data));
2287 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2288 sizeof(*op_data), op_data, NULL);
2289 ll_finish_md_op_data(op_data);
/* drop the temporary group locks in reverse acquisition order */
2296 ll_put_grouplock(llss->inode2, file2, gid);
2297 ll_put_grouplock(llss->inode1, file1, gid);
/*
 * Set/clear HSM state flags on @inode via the MDT. Validates the masks
 * and archive id before sending LL_IOC_HSM_STATE_SET through
 * obd_iocontrol().
 */
2307 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2309 struct md_op_data *op_data;
2313 /* Detect out-of range masks */
2314 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2317 /* Non-root users are forbidden to set or clear flags which are
2318 * NOT defined in HSM_USER_MASK. */
2319 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2320 !cfs_capable(CFS_CAP_SYS_ADMIN))
2323 /* Detect out-of range archive id */
2324 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2325 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2328 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2329 LUSTRE_OPC_ANY, hss);
2330 if (IS_ERR(op_data))
2331 RETURN(PTR_ERR(op_data));
2333 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2334 sizeof(*op_data), op_data, NULL);
2336 ll_finish_md_op_data(op_data);
/*
 * HSM import: mark a regular file as archived+released, then restore its
 * metadata (mode, owner, size, times) from the hsm_user_import record
 * via ll_setattr_raw().
 */
2341 static int ll_hsm_import(struct inode *inode, struct file *file,
2342 struct hsm_user_import *hui)
2344 struct hsm_state_set *hss = NULL;
2345 struct iattr *attr = NULL;
2349 if (!S_ISREG(inode->i_mode))
2355 GOTO(out, rc = -ENOMEM);
/* set HSM state: exists + archived + released */
2357 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2358 hss->hss_archive_id = hui->hui_archive_id;
2359 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2360 rc = ll_hsm_state_set(inode, hss);
2364 OBD_ALLOC_PTR(attr);
2366 GOTO(out, rc = -ENOMEM);
/* rebuild the inode attributes recorded at archive time */
2368 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2369 attr->ia_mode |= S_IFREG;
2370 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2371 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2372 attr->ia_size = hui->hui_size;
2373 attr->ia_mtime.tv_sec = hui->hui_mtime;
2374 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2375 attr->ia_atime.tv_sec = hui->hui_atime;
2376 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2378 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2379 ATTR_UID | ATTR_GID |
2380 ATTR_MTIME | ATTR_MTIME_SET |
2381 ATTR_ATIME | ATTR_ATIME_SET;
2385 rc = ll_setattr_raw(file_dentry(file), attr, true);
2389 inode_unlock(inode);
/* Map an open fmode to the corresponding lease type bits
 * (FMODE_READ -> LL_LEASE_RDLCK, FMODE_WRITE -> LL_LEASE_WRLCK). */
2401 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2403 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2404 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * LL_IOC_FUTIMES_3 handler: set atime/mtime/ctime (including ctime,
 * unlike utimes(2)) on a regular file. Root-only.
 */
2407 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2409 struct inode *inode = file_inode(file);
2411 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2412 ATTR_MTIME | ATTR_MTIME_SET |
2413 ATTR_CTIME | ATTR_CTIME_SET,
2415 .tv_sec = lfu->lfu_atime_sec,
2416 .tv_nsec = lfu->lfu_atime_nsec,
2419 .tv_sec = lfu->lfu_mtime_sec,
2420 .tv_nsec = lfu->lfu_mtime_nsec,
2423 .tv_sec = lfu->lfu_ctime_sec,
2424 .tv_nsec = lfu->lfu_ctime_nsec,
2430 if (!capable(CAP_SYS_ADMIN))
2433 if (!S_ISREG(inode->i_mode))
2437 rc = ll_setattr_raw(file_dentry(file), &ia, false);
2438 inode_unlock(inode);
2444 * Give file access advices
2446 * The ladvise interface is similar to Linux fadvise() system call, except it
2447 * forwards the advices directly from Lustre client to server. The server side
2448 * codes will apply appropriate read-ahead and caching techniques for the
2449 * corresponding files.
2451 * A typical workload for ladvise is e.g. a bunch of different clients are
2452 * doing small random reads of a file, so prefetching pages into OSS cache
2453 * with big linear reads before the random IO is a net benefit. Fetching
2454 * all that data into each client cache with fadvise() may not be, due to
2455 * much more data being sent to the client.
/*
 * Forward one llapi_lu_ladvise advice to the server by running a
 * CIT_LADVISE cl_io against the file's cl_object.
 */
2457 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2458 struct llapi_lu_ladvise *ladvise)
2462 struct cl_ladvise_io *lio;
2467 env = cl_env_get(&refcheck);
2469 RETURN(PTR_ERR(env));
2471 io = vvp_env_thread_io(env);
2472 io->ci_obj = ll_i2info(inode)->lli_clob;
2474 /* initialize parameters for ladvise */
2475 lio = &io->u.ci_ladvise;
2476 lio->li_start = ladvise->lla_start;
2477 lio->li_end = ladvise->lla_end;
2478 lio->li_fid = ll_inode2fid(inode);
2479 lio->li_advice = ladvise->lla_advice;
2480 lio->li_flags = flags;
2482 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2483 rc = cl_io_loop(env, io);
2487 cl_io_fini(env, io);
2488 cl_env_put(env, &refcheck);
/*
 * FS_IOC_FSGETXATTR-style handler: copy the user's fsxattr, fill in the
 * project id from the inode, and copy it back.
 */
2492 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
2495 struct fsxattr fsxattr;
2497 if (copy_from_user(&fsxattr,
2498 (const struct fsxattr __user *)arg,
2502 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
2503 if (copy_to_user((struct fsxattr __user *)arg,
2504 &fsxattr, sizeof(fsxattr)))
/*
 * FS_IOC_FSSETXATTR-style handler: set the file's project id on the MDT
 * via md_setattr(). Root-only.
 */
2510 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
2514 struct md_op_data *op_data;
2515 struct ptlrpc_request *req = NULL;
2517 struct fsxattr fsxattr;
2519 /* only root could change project ID */
2520 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2523 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2524 LUSTRE_OPC_ANY, NULL);
2525 if (IS_ERR(op_data))
2526 RETURN(PTR_ERR(op_data));
2528 if (copy_from_user(&fsxattr,
2529 (const struct fsxattr __user *)arg,
2531 GOTO(out_fsxattr1, rc = -EFAULT);
2533 op_data->op_projid = fsxattr.fsx_projid;
2534 op_data->op_attr.ia_valid |= MDS_ATTR_PROJID;
2535 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
2537 ptlrpc_req_finished(req);
2540 ll_finish_md_op_data(op_data);
2547 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2549 struct inode *inode = file_inode(file);
2550 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2554 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2555 PFID(ll_inode2fid(inode)), inode, cmd);
2556 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2558 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2559 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2563 case LL_IOC_GETFLAGS:
2564 /* Get the current value of the file flags */
2565 return put_user(fd->fd_flags, (int __user *)arg);
2566 case LL_IOC_SETFLAGS:
2567 case LL_IOC_CLRFLAGS:
2568 /* Set or clear specific file flags */
2569 /* XXX This probably needs checks to ensure the flags are
2570 * not abused, and to handle any flag side effects.
2572 if (get_user(flags, (int __user *) arg))
2575 if (cmd == LL_IOC_SETFLAGS) {
2576 if ((flags & LL_FILE_IGNORE_LOCK) &&
2577 !(file->f_flags & O_DIRECT)) {
2578 CERROR("%s: unable to disable locking on "
2579 "non-O_DIRECT file\n", current->comm);
2583 fd->fd_flags |= flags;
2585 fd->fd_flags &= ~flags;
2588 case LL_IOC_LOV_SETSTRIPE:
2589 case LL_IOC_LOV_SETSTRIPE_NEW:
2590 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
2591 case LL_IOC_LOV_SETEA:
2592 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
2593 case LL_IOC_LOV_SWAP_LAYOUTS: {
2595 struct lustre_swap_layouts lsl;
2597 if (copy_from_user(&lsl, (char __user *)arg,
2598 sizeof(struct lustre_swap_layouts)))
2601 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
2604 file2 = fget(lsl.sl_fd);
2608 /* O_WRONLY or O_RDWR */
2609 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
2610 GOTO(out, rc = -EPERM);
2612 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
2613 struct inode *inode2;
2614 struct ll_inode_info *lli;
2615 struct obd_client_handle *och = NULL;
2617 if (lsl.sl_flags != SWAP_LAYOUTS_CLOSE)
2618 GOTO(out, rc = -EINVAL);
2620 lli = ll_i2info(inode);
2621 mutex_lock(&lli->lli_och_mutex);
2622 if (fd->fd_lease_och != NULL) {
2623 och = fd->fd_lease_och;
2624 fd->fd_lease_och = NULL;
2626 mutex_unlock(&lli->lli_och_mutex);
2628 GOTO(out, rc = -ENOLCK);
2629 inode2 = file_inode(file2);
2630 rc = ll_swap_layouts_close(och, inode, inode2);
2632 rc = ll_swap_layouts(file, file2, &lsl);
2638 case LL_IOC_LOV_GETSTRIPE:
2639 case LL_IOC_LOV_GETSTRIPE_NEW:
2640 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
2641 case FSFILT_IOC_GETFLAGS:
2642 case FSFILT_IOC_SETFLAGS:
2643 RETURN(ll_iocontrol(inode, file, cmd, arg));
2644 case FSFILT_IOC_GETVERSION_OLD:
2645 case FSFILT_IOC_GETVERSION:
2646 RETURN(put_user(inode->i_generation, (int __user *)arg));
2647 case LL_IOC_GROUP_LOCK:
2648 RETURN(ll_get_grouplock(inode, file, arg));
2649 case LL_IOC_GROUP_UNLOCK:
2650 RETURN(ll_put_grouplock(inode, file, arg));
2651 case IOC_OBD_STATFS:
2652 RETURN(ll_obd_statfs(inode, (void __user *)arg));
2654 /* We need to special case any other ioctls we want to handle,
2655 * to send them to the MDS/OST as appropriate and to properly
2656 * network encode the arg field.
2657 case FSFILT_IOC_SETVERSION_OLD:
2658 case FSFILT_IOC_SETVERSION:
2660 case LL_IOC_FLUSHCTX:
2661 RETURN(ll_flush_ctx(inode));
2662 case LL_IOC_PATH2FID: {
2663 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2664 sizeof(struct lu_fid)))
2669 case LL_IOC_GETPARENT:
2670 RETURN(ll_getparent(file, (struct getparent __user *)arg));
2672 case OBD_IOC_FID2PATH:
2673 RETURN(ll_fid2path(inode, (void __user *)arg));
2674 case LL_IOC_DATA_VERSION: {
2675 struct ioc_data_version idv;
2678 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
2681 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2682 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2685 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2691 case LL_IOC_GET_MDTIDX: {
2694 mdtidx = ll_get_mdt_idx(inode);
2698 if (put_user((int)mdtidx, (int __user *)arg))
2703 case OBD_IOC_GETDTNAME:
2704 case OBD_IOC_GETMDNAME:
2705 RETURN(ll_get_obd_name(inode, cmd, arg));
2706 case LL_IOC_HSM_STATE_GET: {
2707 struct md_op_data *op_data;
2708 struct hsm_user_state *hus;
2715 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2716 LUSTRE_OPC_ANY, hus);
2717 if (IS_ERR(op_data)) {
2719 RETURN(PTR_ERR(op_data));
2722 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2725 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2728 ll_finish_md_op_data(op_data);
2732 case LL_IOC_HSM_STATE_SET: {
2733 struct hsm_state_set *hss;
2740 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2745 rc = ll_hsm_state_set(inode, hss);
2750 case LL_IOC_HSM_ACTION: {
2751 struct md_op_data *op_data;
2752 struct hsm_current_action *hca;
2759 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2760 LUSTRE_OPC_ANY, hca);
2761 if (IS_ERR(op_data)) {
2763 RETURN(PTR_ERR(op_data));
2766 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2769 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2772 ll_finish_md_op_data(op_data);
2776 case LL_IOC_SET_LEASE: {
2777 struct ll_inode_info *lli = ll_i2info(inode);
2778 struct obd_client_handle *och = NULL;
2783 case LL_LEASE_WRLCK:
2784 if (!(file->f_mode & FMODE_WRITE))
2786 fmode = FMODE_WRITE;
2788 case LL_LEASE_RDLCK:
2789 if (!(file->f_mode & FMODE_READ))
2793 case LL_LEASE_UNLCK:
2794 mutex_lock(&lli->lli_och_mutex);
2795 if (fd->fd_lease_och != NULL) {
2796 och = fd->fd_lease_och;
2797 fd->fd_lease_och = NULL;
2799 mutex_unlock(&lli->lli_och_mutex);
2804 fmode = och->och_flags;
2805 rc = ll_lease_close(och, inode, &lease_broken);
2809 rc = ll_lease_och_release(inode, file);
2816 RETURN(ll_lease_type_from_fmode(fmode));
2821 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2823 /* apply for lease */
2824 och = ll_lease_open(inode, file, fmode, 0);
2826 RETURN(PTR_ERR(och));
2829 mutex_lock(&lli->lli_och_mutex);
2830 if (fd->fd_lease_och == NULL) {
2831 fd->fd_lease_och = och;
2834 mutex_unlock(&lli->lli_och_mutex);
2836 /* impossible now that only excl is supported for now */
2837 ll_lease_close(och, inode, &lease_broken);
2842 case LL_IOC_GET_LEASE: {
2843 struct ll_inode_info *lli = ll_i2info(inode);
2844 struct ldlm_lock *lock = NULL;
2847 mutex_lock(&lli->lli_och_mutex);
2848 if (fd->fd_lease_och != NULL) {
2849 struct obd_client_handle *och = fd->fd_lease_och;
2851 lock = ldlm_handle2lock(&och->och_lease_handle);
2853 lock_res_and_lock(lock);
2854 if (!ldlm_is_cancel(lock))
2855 fmode = och->och_flags;
2857 unlock_res_and_lock(lock);
2858 LDLM_LOCK_PUT(lock);
2861 mutex_unlock(&lli->lli_och_mutex);
2863 RETURN(ll_lease_type_from_fmode(fmode));
2865 case LL_IOC_HSM_IMPORT: {
2866 struct hsm_user_import *hui;
2872 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
2877 rc = ll_hsm_import(inode, file, hui);
2882 case LL_IOC_FUTIMES_3: {
2883 struct ll_futimes_3 lfu;
2885 if (copy_from_user(&lfu,
2886 (const struct ll_futimes_3 __user *)arg,
2890 RETURN(ll_file_futimes_3(file, &lfu));
2892 case LL_IOC_LADVISE: {
2893 struct llapi_ladvise_hdr *ladvise_hdr;
2896 int alloc_size = sizeof(*ladvise_hdr);
2899 OBD_ALLOC_PTR(ladvise_hdr);
2900 if (ladvise_hdr == NULL)
2903 if (copy_from_user(ladvise_hdr,
2904 (const struct llapi_ladvise_hdr __user *)arg,
2906 GOTO(out_ladvise, rc = -EFAULT);
2908 if (ladvise_hdr->lah_magic != LADVISE_MAGIC ||
2909 ladvise_hdr->lah_count < 1)
2910 GOTO(out_ladvise, rc = -EINVAL);
2912 num_advise = ladvise_hdr->lah_count;
2913 if (num_advise >= LAH_COUNT_MAX)
2914 GOTO(out_ladvise, rc = -EFBIG);
2916 OBD_FREE_PTR(ladvise_hdr);
2917 alloc_size = offsetof(typeof(*ladvise_hdr),
2918 lah_advise[num_advise]);
2919 OBD_ALLOC(ladvise_hdr, alloc_size);
2920 if (ladvise_hdr == NULL)
2924 * TODO: submit multiple advices to one server in a single RPC
2926 if (copy_from_user(ladvise_hdr,
2927 (const struct llapi_ladvise_hdr __user *)arg,
2929 GOTO(out_ladvise, rc = -EFAULT);
2931 for (i = 0; i < num_advise; i++) {
2932 rc = ll_ladvise(inode, file, ladvise_hdr->lah_flags,
2933 &ladvise_hdr->lah_advise[i]);
2939 OBD_FREE(ladvise_hdr, alloc_size);
2942 case LL_IOC_FSGETXATTR:
2943 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
2944 case LL_IOC_FSSETXATTR:
2945 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
2950 ll_iocontrol_call(inode, file, cmd, arg, &err))
2953 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2954 (void __user *)arg));
2959 #ifndef HAVE_FILE_LLSEEK_SIZE
/* NOTE(review): extract is elided (embedded line numbers jump); some
 * statements (returns, braces) are missing from this view.
 *
 * llseek_execute(): validate @offset and, if it changed, commit it to
 * file->f_pos, resetting f_version.  Negative offsets are rejected unless
 * FMODE_UNSIGNED_OFFSET is set; offsets beyond @maxsize are rejected.
 * Presumably returns the resulting offset or -EINVAL -- the return
 * statements are not visible here; confirm against the full source. */
2960 static inline loff_t
2961 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2963 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2965 if (offset > maxsize)
2968 if (offset != file->f_pos) {
2969 file->f_pos = offset;
2970 file->f_version = 0;
/* Local fallback of the kernel's generic_file_llseek_size() for kernels
 * without it (compiled under #ifndef HAVE_FILE_LLSEEK_SIZE above).
 * Handles the seek origins, including (per the visible comments)
 * SEEK_DATA/SEEK_HOLE semantics relative to @eof, delegating the final
 * offset update to llseek_execute().
 * NOTE(review): elided extract -- the switch labels and several
 * statements between the visible lines are missing from this view. */
2976 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2977 loff_t maxsize, loff_t eof)
2979 struct inode *inode = file_inode(file);
2987 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2988 * position-querying operation. Avoid rewriting the "same"
2989 * f_pos value back to the file because a concurrent read(),
2990 * write() or lseek() might have altered it
2995 * f_lock protects against read/modify/write race with other
2996 * SEEK_CURs. Note that parallel writes and reads behave
/* SEEK_CUR path: apply the relative offset under inode lock. */
3000 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3001 inode_unlock(inode);
3005 * In the generic case the entire file is data, so as long as
3006 * offset isn't at the end of the file then the offset is data.
3013 * There is a virtual hole at the end of the file, so as long as
3014 * offset isn't i_size or larger, return i_size.
3022 return llseek_execute(file, offset, maxsize);
/* ll_file_seek(): llseek entry point for Lustre files.  For SEEK_END,
 * SEEK_HOLE and SEEK_DATA the file size must be accurate cluster-wide,
 * so the size is glimpsed from the OSTs via ll_glimpse_size() before
 * delegating to ll_generic_file_llseek_size().
 * NOTE(review): elided extract -- error handling between the visible
 * lines (e.g. the glimpse failure path) is missing from this view. */
3026 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3028 struct inode *inode = file_inode(file);
3029 loff_t retval, eof = 0;
/* Target offset computed only for the debug trace below. */
3032 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3033 (origin == SEEK_CUR) ? file->f_pos : 0);
3034 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3035 PFID(ll_inode2fid(inode)), inode, retval, retval,
3037 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3039 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3040 retval = ll_glimpse_size(inode);
3043 eof = i_size_read(inode);
3046 retval = ll_generic_file_llseek_size(file, offset, origin,
3047 ll_file_maxbytes(inode), eof);
/* ll_flush(): .flush handler (invoked at close(2)).  Reports -EIO to the
 * application if any asynchronous writeback error was recorded for this
 * inode, except when the error was already reported through this fd
 * (fd_write_failed).  Regular files only (LASSERT below).
 * NOTE(review): elided extract -- rc/err declarations and some of the
 * error-merging lines are missing from this view. */
3051 static int ll_flush(struct file *file, fl_owner_t id)
3053 struct inode *inode = file_inode(file);
3054 struct ll_inode_info *lli = ll_i2info(inode);
3055 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3058 LASSERT(!S_ISDIR(inode->i_mode));
3060 /* catch async errors that were recorded back when async writeback
3061 * failed for pages in this mapping. */
3062 rc = lli->lli_async_rc;
3063 lli->lli_async_rc = 0;
3064 if (lli->lli_clob != NULL) {
3065 err = lov_read_and_clear_async_rc(lli->lli_clob);
3070 /* The application has been told write failure already.
3071 * Do not report failure again. */
3072 if (fd->fd_write_failed)
/* Any accumulated async error is reported to userspace as -EIO. */
3074 return rc ? -EIO : 0;
/* NOTE(review): elided extract -- env/io/result declarations, fio->fi_end
 * assignment and the RETURN are missing from this view. */
3078 * Called to make sure a portion of file has been written out.
3079 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
3081 * Return how many pages have been written.
3083 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3084 enum cl_fsync_mode mode, int ignore_layout)
3088 struct cl_fsync_io *fio;
/* Reject unknown sync modes up front. */
3093 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3094 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3097 env = cl_env_get(&refcheck);
3099 RETURN(PTR_ERR(env));
3101 io = vvp_env_thread_io(env);
3102 io->ci_obj = ll_i2info(inode)->lli_clob;
3103 io->ci_ignore_layout = ignore_layout;
3105 /* initialize parameters for sync */
3106 fio = &io->u.ci_fsync;
3107 fio->fi_start = start;
3109 fio->fi_fid = ll_inode2fid(inode);
3110 fio->fi_mode = mode;
3111 fio->fi_nr_written = 0;
/* Run the CIT_FSYNC io; on success the result is the page count. */
3113 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3114 result = cl_io_loop(env, io);
3116 result = io->ci_result;
3118 result = fio->fi_nr_written;
3119 cl_io_fini(env, io);
3120 cl_env_put(env, &refcheck);
/* ll_fsync(): .fsync handler.  Three prototypes are provided for
 * different kernel fsync signatures (4-arg range / 2-arg / legacy
 * 3-arg with dentry).  Flushes dirty pages, merges recorded async
 * writeback errors, syncs metadata via md_fsync() and, for regular
 * files, data via cl_sync_file_range(CL_FSYNC_ALL), tracking write
 * failure state in the fd.
 * NOTE(review): elided extract -- rc/err declarations, the inode
 * locking around the 4-arg path, and several error-merge lines are
 * missing from this view. */
3126 * When dentry is provided (the 'else' case), file_dentry() may be
3127 * null and dentry must be used directly rather than pulled from
3128 * file_dentry() as is done otherwise.
3131 #ifdef HAVE_FILE_FSYNC_4ARGS
3132 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3134 struct dentry *dentry = file_dentry(file);
3136 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3137 int ll_fsync(struct file *file, int datasync)
3139 struct dentry *dentry = file_dentry(file);
3141 loff_t end = LLONG_MAX;
3143 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3146 loff_t end = LLONG_MAX;
3148 struct inode *inode = dentry->d_inode;
3149 struct ll_inode_info *lli = ll_i2info(inode);
3150 struct ptlrpc_request *req;
3154 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3155 PFID(ll_inode2fid(inode)), inode);
3156 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3158 #ifdef HAVE_FILE_FSYNC_4ARGS
3159 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3160 lock_inode = !lli->lli_inode_locked;
3164 /* fsync's caller has already called _fdata{sync,write}, we want
3165 * that IO to finish before calling the osc and mdc sync methods */
3166 rc = filemap_fdatawait(inode->i_mapping);
3169 /* catch async errors that were recorded back when async writeback
3170 * failed for pages in this mapping. */
3171 if (!S_ISDIR(inode->i_mode)) {
3172 err = lli->lli_async_rc;
3173 lli->lli_async_rc = 0;
3176 if (lli->lli_clob != NULL) {
3177 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* Sync metadata on the MDS. */
3183 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3187 ptlrpc_req_finished(req);
3189 if (S_ISREG(inode->i_mode)) {
3190 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
/* Sync data on the OSTs and record/clear the fd failure state. */
3192 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3193 if (rc == 0 && err < 0)
3196 fd->fd_write_failed = true;
3198 fd->fd_write_failed = false;
3201 #ifdef HAVE_FILE_FSYNC_4ARGS
3203 inode_unlock(inode);
/* ll_file_flock(): .lock/.flock handler -- maps VFS flock/POSIX lock
 * requests to LDLM_FLOCK enqueues on the MDS, then mirrors the result
 * into the local kernel lock tables so the VFS bookkeeping stays
 * consistent.  On local bookkeeping failure the remote lock is released
 * again with an LCK_NL (unlock) enqueue.
 * NOTE(review): elided extract -- the switch statements on fl_type/cmd,
 * the einfo.ei_cb_bl initializer, rc/rc2/flags declarations and the
 * final RETURN are missing from this view. */
3209 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3211 struct inode *inode = file_inode(file);
3212 struct ll_sb_info *sbi = ll_i2sbi(inode);
3213 struct ldlm_enqueue_info einfo = {
3214 .ei_type = LDLM_FLOCK,
3215 .ei_cb_cp = ldlm_flock_completion_ast,
3216 .ei_cbdata = file_lock,
3218 struct md_op_data *op_data;
3219 struct lustre_handle lockh = { 0 };
3220 union ldlm_policy_data flock = { { 0 } };
3221 int fl_type = file_lock->fl_type;
3227 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3228 PFID(ll_inode2fid(inode)), file_lock);
3230 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3232 if (file_lock->fl_flags & FL_FLOCK) {
3233 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3234 /* flocks are whole-file locks */
3235 flock.l_flock.end = OFFSET_MAX;
3236 /* For flocks owner is determined by the local file desctiptor*/
3237 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3238 } else if (file_lock->fl_flags & FL_POSIX) {
3239 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3240 flock.l_flock.start = file_lock->fl_start;
3241 flock.l_flock.end = file_lock->fl_end;
3245 flock.l_flock.pid = file_lock->fl_pid;
3247 /* Somewhat ugly workaround for svc lockd.
3248 * lockd installs custom fl_lmops->lm_compare_owner that checks
3249 * for the fl_owner to be the same (which it always is on local node
3250 * I guess between lockd processes) and then compares pid.
3251 * As such we assign pid to the owner field to make it all work,
3252 * conflict with normal locks is unlikely since pid space and
3253 * pointer space for current->files are not intersecting */
3254 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3255 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* Map fcntl lock type to an LDLM mode (F_RDLCK/F_UNLCK/F_WRLCK). */
3259 einfo.ei_mode = LCK_PR;
3262 /* An unlock request may or may not have any relation to
3263 * existing locks so we may not be able to pass a lock handle
3264 * via a normal ldlm_lock_cancel() request. The request may even
3265 * unlock a byte range in the middle of an existing lock. In
3266 * order to process an unlock request we need all of the same
3267 * information that is given with a normal read or write record
3268 * lock request. To avoid creating another ldlm unlock (cancel)
3269 * message we'll treat a LCK_NL flock request as an unlock. */
3270 einfo.ei_mode = LCK_NL;
3273 einfo.ei_mode = LCK_PW;
3276 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* Map the fcntl command to enqueue flags (SETLK -> nowait,
 * GETLK -> test-only). */
3291 flags = LDLM_FL_BLOCK_NOWAIT;
3297 flags = LDLM_FL_TEST_LOCK;
3300 CERROR("unknown fcntl lock command: %d\n", cmd);
3304 /* Save the old mode so that if the mode in the lock changes we
3305 * can decrement the appropriate reader or writer refcount. */
3306 file_lock->fl_type = einfo.ei_mode;
3308 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3309 LUSTRE_OPC_ANY, NULL);
3310 if (IS_ERR(op_data))
3311 RETURN(PTR_ERR(op_data));
3313 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
3314 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
3315 flock.l_flock.pid, flags, einfo.ei_mode,
3316 flock.l_flock.start, flock.l_flock.end);
3318 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
3321 /* Restore the file lock type if not TEST lock. */
3322 if (!(flags & LDLM_FL_TEST_LOCK))
3323 file_lock->fl_type = fl_type;
3325 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
3326 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
3327 !(flags & LDLM_FL_TEST_LOCK))
3328 rc2 = locks_lock_file_wait(file, file_lock);
3330 if ((file_lock->fl_flags & FL_FLOCK) &&
3331 (rc == 0 || file_lock->fl_type == F_UNLCK))
3332 rc2 = flock_lock_file_wait(file, file_lock);
3333 if ((file_lock->fl_flags & FL_POSIX) &&
3334 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3335 !(flags & LDLM_FL_TEST_LOCK))
3336 rc2 = posix_lock_file_wait(file, file_lock);
3337 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
/* Local bookkeeping failed: drop the server-side lock again. */
3339 if (rc2 && file_lock->fl_type != F_UNLCK) {
3340 einfo.ei_mode = LCK_NL;
3341 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
3346 ll_finish_md_op_data(op_data);
/* ll_get_fid_by_name(): look up the FID of @name under @parent via a
 * getattr-by-name RPC to the MDS; stores the FID in *@fid and, judging
 * by the ll_prep_inode() call, optionally instantiates *@inode when a
 * non-NULL inode pointer is passed (confirm against full source --
 * the guarding condition is elided here).
 * NOTE(review): elided extract -- rc declaration, error checks after
 * md_getattr_name() and the final RETURN are missing from this view. */
3351 int ll_get_fid_by_name(struct inode *parent, const char *name,
3352 int namelen, struct lu_fid *fid,
3353 struct inode **inode)
3355 struct md_op_data *op_data = NULL;
3356 struct mdt_body *body;
3357 struct ptlrpc_request *req;
3361 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3362 LUSTRE_OPC_ANY, NULL);
3363 if (IS_ERR(op_data))
3364 RETURN(PTR_ERR(op_data));
3366 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
3367 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3368 ll_finish_md_op_data(op_data);
3372 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3374 GOTO(out_req, rc = -EFAULT);
3376 *fid = body->mbo_fid1;
3379 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
3381 ptlrpc_req_finished(req);
/* ll_migrate(): migrate directory entry @name under @parent to MDT
 * @mdtidx via a CLI_MIGRATE rename RPC.  Resolves the child inode (via
 * dcache lookup or ll_get_fid_by_name()), refuses to migrate the fs
 * root, skips the RPC if the object is already on the target MDT, and
 * for regular files takes a write lease + data version so the server
 * can detect concurrent modification.  Cleans up lease/inode/op_data on
 * the out_close/out_iput/out_unlock/out_free paths.
 * NOTE(review): elided extract -- qstr setup, rc declaration, several
 * error checks and the retry goto target are missing from this view. */
3385 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3386 const char *name, int namelen)
3388 struct dentry *dchild = NULL;
3389 struct inode *child_inode = NULL;
3390 struct md_op_data *op_data;
3391 struct ptlrpc_request *request = NULL;
3392 struct obd_client_handle *och = NULL;
3394 struct mdt_body *body;
3396 __u64 data_version = 0;
3399 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3400 name, PFID(ll_inode2fid(parent)), mdtidx);
3402 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3403 0, LUSTRE_OPC_ANY, NULL);
3404 if (IS_ERR(op_data))
3405 RETURN(PTR_ERR(op_data));
3407 /* Get child FID first */
3408 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
3411 dchild = d_lookup(file_dentry(file), &qstr);
3412 if (dchild != NULL) {
3413 if (dchild->d_inode != NULL)
3414 child_inode = igrab(dchild->d_inode);
/* Not in the dcache: resolve the FID over the wire. */
3418 if (child_inode == NULL) {
3419 rc = ll_get_fid_by_name(parent, name, namelen,
3420 &op_data->op_fid3, &child_inode);
3425 if (child_inode == NULL)
3426 GOTO(out_free, rc = -EINVAL);
3429 * lfs migrate command needs to be blocked on the client
3430 * by checking the migrate FID against the FID of the
3433 if (child_inode == parent->i_sb->s_root->d_inode)
3434 GOTO(out_iput, rc = -EINVAL);
3436 inode_lock(child_inode);
3437 op_data->op_fid3 = *ll_inode2fid(child_inode);
3438 if (!fid_is_sane(&op_data->op_fid3)) {
3439 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
3440 ll_get_fsname(parent->i_sb, NULL, 0), name,
3441 PFID(&op_data->op_fid3));
3442 GOTO(out_unlock, rc = -EINVAL);
3445 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3447 GOTO(out_unlock, rc);
/* Already on the target MDT: nothing to do. */
3450 CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name,
3451 PFID(&op_data->op_fid3), mdtidx);
3452 GOTO(out_unlock, rc = 0);
3455 if (S_ISREG(child_inode->i_mode)) {
/* Take a write lease and data version so the server can
 * verify nobody modifies the file during migration. */
3456 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
3460 GOTO(out_unlock, rc);
3463 rc = ll_data_version(child_inode, &data_version,
3466 GOTO(out_close, rc);
3468 op_data->op_handle = och->och_fh;
3469 op_data->op_data = och->och_mod;
3470 op_data->op_data_version = data_version;
3471 op_data->op_lease_handle = och->och_lease_handle;
3472 op_data->op_bias |= MDS_RENAME_MIGRATE;
3475 op_data->op_mds = mdtidx;
3476 op_data->op_cli_flags = CLI_MIGRATE;
3477 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3478 namelen, name, namelen, &request);
3480 LASSERT(request != NULL);
3481 ll_update_times(request, parent);
3483 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
3484 LASSERT(body != NULL);
3486 /* If the server does release layout lock, then we cleanup
3487 * the client och here, otherwise release it in out_close: */
3489 body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
3490 obd_mod_put(och->och_mod);
3491 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
3493 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
3499 if (request != NULL) {
3500 ptlrpc_req_finished(request);
3504 /* Try again if the file layout has changed. */
3505 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
3509 if (och != NULL) /* close the file */
3510 ll_lease_close(och, child_inode, NULL);
3512 clear_nlink(child_inode);
3514 inode_unlock(child_inode);
3518 ll_finish_md_op_data(op_data);
/* ll_file_noflock(): lock handler used by the "-o noflock" mount mode
 * (see ll_file_operations_noflock below).  Body is elided in this
 * extract; presumably rejects all lock requests (e.g. -ENOSYS) --
 * confirm against the full source. */
3523 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
/* NOTE(review): elided extract -- fid/flags/i declarations, the loop's
 * continue, lock-res locking around the bit clearing, and the RETURN
 * are missing from this view. */
3531 * test if some locks matching bits and l_req_mode are acquired
3532 * - bits can be in different locks
3533 * - if found clear the common lock bits in *bits
3534 * - the bits not found, are kept in *bits
3536 * \param bits [IN] searched lock bits [IN]
3537 * \param l_req_mode [IN] searched lock mode
3538 * \retval boolean, true iff all bits are found
3540 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
3542 struct lustre_handle lockh;
3543 union ldlm_policy_data policy;
/* LCK_MINMODE means "any mode": match all four lock modes. */
3544 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
3545 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
3554 fid = &ll_i2info(inode)->lli_fid;
3555 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3556 ldlm_lockname[mode]);
3558 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* Probe each inodebit individually; stop once every bit is matched. */
3559 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3560 policy.l_inodebits.bits = *bits & (1 << i);
3561 if (policy.l_inodebits.bits == 0)
3564 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3565 &policy, mode, &lockh)) {
3566 struct ldlm_lock *lock;
3568 lock = ldlm_handle2lock(&lockh);
3571 ~(lock->l_policy_data.l_inodebits.bits);
3572 LDLM_LOCK_PUT(lock);
3574 *bits &= ~policy.l_inodebits.bits;
/* ll_take_md_lock(): try to match (and take a reference on) an existing
 * granted IBITS metadata lock on the inode's FID covering @bits with
 * mode @mode; the matched handle is returned via @lockh.  Returns the
 * md_lock_match() result (matched mode, or 0 on no match).
 * NOTE(review): elided extract -- fid/rc declarations and the RETURN
 * are missing from this view. */
3581 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
3582 struct lustre_handle *lockh, __u64 flags,
3583 enum ldlm_mode mode)
3585 union ldlm_policy_data policy = { .l_inodebits = { bits } };
3590 fid = &ll_i2info(inode)->lli_fid;
3591 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3593 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3594 fid, LDLM_IBITS, &policy, mode, lockh);
/* ll_inode_revalidate_fini(): post-process the rc of a revalidation RPC.
 * -ENOENT on an unlinked object is normally tolerated (nlink updated,
 * success returned), except for striped directories with bad stripes
 * and regular files where the error is propagated so the dentry gets
 * re-validated.  Other errors are logged (rate-limited for the common
 * -EACCES/-EIDRM cases) and passed through.
 * NOTE(review): elided extract -- the clear_nlink/return statements in
 * the -ENOENT branch are missing from this view. */
3599 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3601 /* Already unlinked. Just update nlink and return success */
3602 if (rc == -ENOENT) {
3604 /* If it is striped directory, and there is bad stripe
3605 * Let's revalidate the dentry again, instead of returning
3607 if (S_ISDIR(inode->i_mode) &&
3608 ll_i2info(inode)->lli_lsm_md != NULL)
3611 /* This path cannot be hit for regular files unless in
3612 * case of obscure races, so no need to to validate
3614 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3616 } else if (rc != 0) {
3617 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3618 "%s: revalidate FID "DFID" error: rc = %d\n",
3619 ll_get_fsname(inode->i_sb, NULL, 0),
3620 PFID(ll_inode2fid(inode)), rc);
/* __ll_inode_revalidate(): refresh the inode's metadata from the MDS.
 * Two strategies: with OBD_CONNECT_ATTRFID, an intent lock
 * (IT_GETATTR/IT_LOOKUP) by FID that also revalidates the dentry and
 * unhashes it when the object was unlinked; otherwise, if no covering
 * MD lock is already held locally, a plain md_getattr() (adding EA size
 * for regular files) followed by ll_prep_inode().
 * NOTE(review): elided extract -- rc declaration, several error-check
 * branches and the final RETURN/req cleanup flow are partly missing
 * from this view. */
3626 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3628 struct inode *inode = dentry->d_inode;
3629 struct ptlrpc_request *req = NULL;
3630 struct obd_export *exp;
3634 LASSERT(inode != NULL);
3636 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3637 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3639 exp = ll_i2mdexp(inode);
3641 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3642 * But under CMD case, it caused some lock issues, should be fixed
3643 * with new CMD ibits lock. See bug 12718 */
3644 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3645 struct lookup_intent oit = { .it_op = IT_GETATTR };
3646 struct md_op_data *op_data;
3648 if (ibits == MDS_INODELOCK_LOOKUP)
3649 oit.it_op = IT_LOOKUP;
3651 /* Call getattr by fid, so do not provide name at all. */
3652 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3653 dentry->d_inode, NULL, 0, 0,
3654 LUSTRE_OPC_ANY, NULL);
3655 if (IS_ERR(op_data))
3656 RETURN(PTR_ERR(op_data));
3658 rc = md_intent_lock(exp, op_data, &oit, &req,
3659 &ll_md_blocking_ast, 0);
3660 ll_finish_md_op_data(op_data);
3662 rc = ll_inode_revalidate_fini(inode, rc);
3666 rc = ll_revalidate_it_finish(req, &oit, dentry);
3668 ll_intent_release(&oit);
3672 /* Unlinked? Unhash dentry, so it is not picked up later by
3673 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3674 here to preserve get_cwd functionality on 2.6.
3676 if (!dentry->d_inode->i_nlink) {
3677 ll_lock_dcache(inode);
3678 d_lustre_invalidate(dentry, 0);
3679 ll_unlock_dcache(inode);
3682 ll_lookup_finish_locks(&oit, dentry);
3683 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3684 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3685 u64 valid = OBD_MD_FLGETATTR;
3686 struct md_op_data *op_data;
/* Regular files also need their striping EA fetched. */
3689 if (S_ISREG(inode->i_mode)) {
3690 rc = ll_get_default_mdsize(sbi, &ealen);
3693 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3696 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3697 0, ealen, LUSTRE_OPC_ANY,
3699 if (IS_ERR(op_data))
3700 RETURN(PTR_ERR(op_data));
3702 op_data->op_valid = valid;
3703 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3704 ll_finish_md_op_data(op_data);
3706 rc = ll_inode_revalidate_fini(inode, rc);
3710 rc = ll_prep_inode(&inode, req, NULL, NULL);
3713 ptlrpc_req_finished(req);
/* ll_merge_md_attr(): for a striped directory, merge the attributes of
 * all stripes (via md_merge_attr()) and apply the combined nlink,
 * blocks, size and a/m/ctime to the master inode / ll_inode_info.
 * NOTE(review): elided extract -- the error check after md_merge_attr()
 * and the RETURN are missing from this view. */
3717 static int ll_merge_md_attr(struct inode *inode)
3719 struct cl_attr attr = { 0 };
3722 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3723 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3724 &attr, ll_md_blocking_ast);
3728 set_nlink(inode, attr.cat_nlink);
3729 inode->i_blocks = attr.cat_blocks;
3730 i_size_write(inode, attr.cat_size);
3732 ll_i2info(inode)->lli_atime = attr.cat_atime;
3733 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
3734 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/* ll_inode_revalidate(): revalidate metadata via __ll_inode_revalidate()
 * then refresh the VFS-visible attributes: merge striped-dir attributes
 * for directories, copy cached a/m/ctime, and for regular files glimpse
 * the size from the OSTs unless a HSM restore is in progress (the MDT
 * already supplied the correct size and holds the layout lock, so a
 * glimpse would block until the restore finishes).
 * NOTE(review): elided extract -- rc check after __ll_inode_revalidate
 * and the RETURN are missing from this view. */
3740 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3742 struct inode *inode = dentry->d_inode;
3746 rc = __ll_inode_revalidate(dentry, ibits);
3750 /* if object isn't regular file, don't validate size */
3751 if (!S_ISREG(inode->i_mode)) {
3752 if (S_ISDIR(inode->i_mode) &&
3753 ll_i2info(inode)->lli_lsm_md != NULL) {
3754 rc = ll_merge_md_attr(inode);
3759 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
3760 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
3761 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
3763 /* In case of restore, the MDT has the right size and has
3764 * already send it back without granting the layout lock,
3765 * inode is up-to-date so glimpse is useless.
3766 * Also to glimpse we need the layout, in case of a running
3767 * restore the MDT holds the layout lock so the glimpse will
3768 * block up to the end of restore (getattr will block)
3770 if (!ll_file_test_flag(ll_i2info(inode), LLIF_FILE_RESTORING))
3771 rc = ll_glimpse_size(inode);
/* Squash a dev_t so both major and minor fit in 8 bits each, for the
 * 32-bit-compat stat path in ll_getattr(). */
3776 static inline dev_t ll_compat_encode_dev(dev_t dev)
3778 /* The compat_sys_*stat*() syscalls will fail unless the
3779 * device majors and minors are both less than 256. Note that
3780 * the value returned here will be passed through
3781 * old_encode_dev() in cp_compat_stat(). And so we are not
3782 * trying to return a valid compat (u16) device number, just
3783 * one that will pass the old_valid_dev() check. */
3785 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
/* ll_getattr(): .getattr handler.  Revalidates UPDATE|LOOKUP metadata
 * first, then fills *stat from the (now fresh) inode.  For 32-bit-API
 * clients the inode number is rebuilt from the FID and device numbers
 * are squashed via ll_compat_encode_dev().
 * NOTE(review): elided extract -- the res error check and the final
 * return are missing from this view. */
3788 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3790 struct inode *inode = de->d_inode;
3791 struct ll_sb_info *sbi = ll_i2sbi(inode);
3792 struct ll_inode_info *lli = ll_i2info(inode);
3795 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3796 MDS_INODELOCK_LOOKUP);
3797 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3802 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
3804 if (ll_need_32bit_api(sbi)) {
3805 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3806 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
3807 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
3809 stat->ino = inode->i_ino;
3810 stat->dev = inode->i_sb->s_dev;
3811 stat->rdev = inode->i_rdev;
3814 stat->mode = inode->i_mode;
3815 stat->uid = inode->i_uid;
3816 stat->gid = inode->i_gid;
3817 stat->atime = inode->i_atime;
3818 stat->mtime = inode->i_mtime;
3819 stat->ctime = inode->i_ctime;
/* Per-fs tunable block size if set, else the inode's block size. */
3820 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
3822 stat->nlink = inode->i_nlink;
3823 stat->size = i_size_read(inode);
3824 stat->blocks = inode->i_blocks;
/* ll_fiemap(): .fiemap handler.  Builds a struct fiemap sized for the
 * caller's extent capacity, copies in the first user extent (used to
 * continue an interrupted mapping), runs ll_do_fiemap(), then copies
 * flags, mapped-extent count and the extent array back to userspace.
 * NOTE(review): elided extract -- rc/num_bytes declarations, the
 * allocation-failure check and the final return are missing from this
 * view; possible integer overflow in the extent_count multiplication is
 * presumably bounded by the VFS -- confirm against the full source. */
3829 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3830 __u64 start, __u64 len)
3834 struct fiemap *fiemap;
3835 unsigned int extent_count = fieinfo->fi_extents_max;
3837 num_bytes = sizeof(*fiemap) + (extent_count *
3838 sizeof(struct fiemap_extent));
3839 OBD_ALLOC_LARGE(fiemap, num_bytes);
3844 fiemap->fm_flags = fieinfo->fi_flags;
3845 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3846 fiemap->fm_start = start;
3847 fiemap->fm_length = len;
/* Copy the first extent in: callers may resume a prior mapping. */
3848 if (extent_count > 0 &&
3849 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3850 sizeof(struct fiemap_extent)) != 0)
3851 GOTO(out, rc = -EFAULT);
3853 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3855 fieinfo->fi_flags = fiemap->fm_flags;
3856 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3857 if (extent_count > 0 &&
3858 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3859 fiemap->fm_mapped_extents *
3860 sizeof(struct fiemap_extent)) != 0)
3861 GOTO(out, rc = -EFAULT);
3863 OBD_FREE_LARGE(fiemap, num_bytes);
/* ll_get_acl(): return a referenced copy of the cached POSIX ACL for
 * @inode (duplicated under lli_lock; the VFS releases the reference).
 * NOTE(review): elided extract -- the final return is missing from this
 * view. */
3867 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3869 struct ll_inode_info *lli = ll_i2info(inode);
3870 struct posix_acl *acl = NULL;
3873 spin_lock(&lli->lli_lock);
3874 /* VFS' acl_permission_check->check_acl will release the refcount */
3875 acl = posix_acl_dup(lli->lli_posix_acl);
3876 spin_unlock(&lli->lli_lock);
3881 #ifdef HAVE_IOP_SET_ACL
3882 #ifdef CONFIG_FS_POSIX_ACL
/* ll_set_acl(): .set_acl handler.  Maps the ACL type to its xattr name
 * (updating i_mode for ACL_TYPE_ACCESS, rejecting default ACLs on
 * non-directories), serializes the ACL with posix_acl_to_xattr() and
 * stores it via __vfs_setxattr(); the local ACL cache is updated on
 * success or forgotten on the removal path.
 * NOTE(review): elided extract -- rc/value/size declarations, the
 * acl==NULL (removal) branch and kfree/RETURN cleanup are missing from
 * this view. */
3883 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
3885 const char *name = NULL;
3892 case ACL_TYPE_ACCESS:
3894 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
3898 name = XATTR_NAME_POSIX_ACL_ACCESS;
3900 case ACL_TYPE_DEFAULT:
3901 if (!S_ISDIR(inode->i_mode))
3902 GOTO(out, rc = acl ? -EACCES : 0);
3903 name = XATTR_NAME_POSIX_ACL_DEFAULT;
3906 GOTO(out, rc = -EINVAL);
3910 size = posix_acl_xattr_size(acl->a_count);
3911 value = kmalloc(size, GFP_NOFS);
3913 GOTO(out, rc = -ENOMEM);
3915 rc = posix_acl_to_xattr(&init_user_ns, acl, value, size);
3920 /* dentry is only used for *.lov attributes so it's safe to be NULL */
3921 rc = __vfs_setxattr(NULL, inode, name, value, size, XATTR_CREATE);
3926 set_cached_acl(inode, type, acl);
3928 forget_cached_acl(inode, type);
3931 #endif /* CONFIG_FS_POSIX_ACL */
3932 #endif /* HAVE_IOP_SET_ACL */
3934 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
/* ll_check_acl(): ACL callback for generic_permission() on kernels
 * whose generic_permission() takes a check_acl function pointer.
 * Fetches the access ACL and evaluates it with posix_acl_permission();
 * under RCU walk (IPERM_FLAG_RCU) it bails out (presumably -ECHILD --
 * the return is elided).  Without CONFIG_FS_POSIX_ACL it is a no-op
 * (elided branch at the bottom).
 * NOTE(review): elided extract -- rc declaration, the acl NULL/IS_ERR
 * checks and the returns are missing from this view. */
3936 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3937 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3939 ll_check_acl(struct inode *inode, int mask)
3942 # ifdef CONFIG_FS_POSIX_ACL
3943 struct posix_acl *acl;
3947 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3948 if (flags & IPERM_FLAG_RCU)
3951 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3956 rc = posix_acl_permission(inode, acl, mask);
3957 posix_acl_release(acl);
3960 # else /* !CONFIG_FS_POSIX_ACL */
3962 # endif /* CONFIG_FS_POSIX_ACL */
3964 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/* ll_inode_permission(): .permission handler (three prototypes for the
 * different kernel permission signatures).  Revalidates the root inode
 * on first access, then applies root squashing: if squashing is
 * configured and the caller is root, credentials are temporarily
 * overridden to the squash uid/gid with filesystem capabilities lowered
 * before delegating to ll_generic_permission(); the original creds are
 * restored afterwards.  RCU-walk lookups bail out early (return elided,
 * presumably -ECHILD -- confirm).
 * NOTE(review): elided extract -- rc/cap declarations, the
 * prepare_creds() failure check and cred cleanup/RETURN are missing
 * from this view. */
3966 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3967 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3969 # ifdef HAVE_INODE_PERMISION_2ARGS
3970 int ll_inode_permission(struct inode *inode, int mask)
3972 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3977 struct ll_sb_info *sbi;
3978 struct root_squash_info *squash;
3979 struct cred *cred = NULL;
3980 const struct cred *old_cred = NULL;
3982 bool squash_id = false;
3985 #ifdef MAY_NOT_BLOCK
3986 if (mask & MAY_NOT_BLOCK)
3988 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3989 if (flags & IPERM_FLAG_RCU)
3993 /* as root inode are NOT getting validated in lookup operation,
3994 * need to do it before permission check. */
3996 if (inode == inode->i_sb->s_root->d_inode) {
3997 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3998 MDS_INODELOCK_LOOKUP);
4003 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4004 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4006 /* squash fsuid/fsgid if needed */
4007 sbi = ll_i2sbi(inode);
4008 squash = &sbi->ll_squash;
4009 if (unlikely(squash->rsi_uid != 0 &&
4010 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4011 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4015 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4016 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4017 squash->rsi_uid, squash->rsi_gid);
4019 /* update current process's credentials
4020 * and FS capability */
4021 cred = prepare_creds();
4025 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4026 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* Drop all filesystem-related capabilities for the squashed creds. */
4027 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4028 if ((1 << cap) & CFS_CAP_FS_MASK)
4029 cap_lower(cred->cap_effective, cap);
4031 old_cred = override_creds(cred);
4034 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4035 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4036 /* restore current process's credentials and FS capability */
4038 revert_creds(old_cred);
4045 /* -o localflock - only provides locally consistent flock locks */
/*
 * Default file_operations used when flock consistency is local-only
 * (mount option -o localflock).  Note it defines neither .flock nor
 * .lock, so the kernel falls back to local posix/flock handling.
 * Read/write entry points depend on whether the kernel uses the
 * iter-based I/O interface.  (Closing "};" lies in a gap of this
 * extract.)
 */
4046 struct file_operations ll_file_operations = {
4047 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4048 # ifdef HAVE_SYNC_READ_WRITE
4049 .read = new_sync_read,
4050 .write = new_sync_write,
4052 .read_iter = ll_file_read_iter,
4053 .write_iter = ll_file_write_iter,
4054 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4055 .read = ll_file_read,
4056 .aio_read = ll_file_aio_read,
4057 .write = ll_file_write,
4058 .aio_write = ll_file_aio_write,
4059 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4060 .unlocked_ioctl = ll_file_ioctl,
4061 .open = ll_file_open,
4062 .release = ll_file_release,
4063 .mmap = ll_file_mmap,
4064 .llseek = ll_file_seek,
4065 .splice_read = ll_file_splice_read,
/*
 * file_operations used with -o flock: identical to ll_file_operations
 * except .flock/.lock route through ll_file_flock for cluster-wide
 * coherent file locking.
 */
4070 struct file_operations ll_file_operations_flock = {
4071 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4072 # ifdef HAVE_SYNC_READ_WRITE
4073 .read = new_sync_read,
4074 .write = new_sync_write,
4075 # endif /* HAVE_SYNC_READ_WRITE */
4076 .read_iter = ll_file_read_iter,
4077 .write_iter = ll_file_write_iter,
4078 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4079 .read = ll_file_read,
4080 .aio_read = ll_file_aio_read,
4081 .write = ll_file_write,
4082 .aio_write = ll_file_aio_write,
4083 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4084 .unlocked_ioctl = ll_file_ioctl,
4085 .open = ll_file_open,
4086 .release = ll_file_release,
4087 .mmap = ll_file_mmap,
4088 .llseek = ll_file_seek,
4089 .splice_read = ll_file_splice_read,
/* distributed lock entry points — both flock(2) and fcntl(2) locks */
4092 .flock = ll_file_flock,
4093 .lock = ll_file_flock
4096 /* These are for -o noflock - to return ENOSYS on flock calls */
/*
 * file_operations used with -o noflock: same table as above but
 * .flock/.lock are wired to ll_file_noflock so locking requests are
 * rejected instead of silently being local-only.
 */
4097 struct file_operations ll_file_operations_noflock = {
4098 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4099 # ifdef HAVE_SYNC_READ_WRITE
4100 .read = new_sync_read,
4101 .write = new_sync_write,
4102 # endif /* HAVE_SYNC_READ_WRITE */
4103 .read_iter = ll_file_read_iter,
4104 .write_iter = ll_file_write_iter,
4105 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4106 .read = ll_file_read,
4107 .aio_read = ll_file_aio_read,
4108 .write = ll_file_write,
4109 .aio_write = ll_file_aio_write,
4110 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4111 .unlocked_ioctl = ll_file_ioctl,
4112 .open = ll_file_open,
4113 .release = ll_file_release,
4114 .mmap = ll_file_mmap,
4115 .llseek = ll_file_seek,
4116 .splice_read = ll_file_splice_read,
/* reject locking requests (ll_file_noflock presumably returns an
 * error — implementation not visible in this extract) */
4119 .flock = ll_file_noflock,
4120 .lock = ll_file_noflock
/*
 * inode_operations for regular Lustre files.  xattr and ACL entries are
 * conditional on kernel support macros.  (Some #endif lines fall in
 * gaps of this extract.)
 */
4123 struct inode_operations ll_file_inode_operations = {
4124 .setattr = ll_setattr,
4125 .getattr = ll_getattr,
4126 .permission = ll_inode_permission,
4127 #ifdef HAVE_IOP_XATTR
4128 .setxattr = ll_setxattr,
4129 .getxattr = ll_getxattr,
4130 .removexattr = ll_removexattr,
4132 .listxattr = ll_listxattr,
4133 .fiemap = ll_fiemap,
4134 #ifdef HAVE_IOP_GET_ACL
4135 .get_acl = ll_get_acl,
4137 #ifdef HAVE_IOP_SET_ACL
4138 .set_acl = ll_set_acl,
4142 /* dynamic ioctl number support routins */
/*
 * Registry for dynamically registered ioctl handlers: a list of
 * llioc_data entries protected by an rw_semaphore.  The static
 * initializer line ("} llioc = {") falls in a gap of this extract.
 */
4143 static struct llioc_ctl_data {
4144 struct rw_semaphore ioc_sem;
4145 struct list_head ioc_head;
4147 __RWSEM_INITIALIZER(llioc.ioc_sem),
4148 LIST_HEAD_INIT(llioc.ioc_head)
/*
 * One registered handler: a callback plus the array of ioctl command
 * numbers it services.  iocd_cmd is a flexible trailing array sized by
 * iocd_count; iocd_size records the full allocation size for freeing.
 */
4153 struct list_head iocd_list;
4154 unsigned int iocd_size;
4155 llioc_callback_t iocd_cb;
4156 unsigned int iocd_count;
4157 unsigned int iocd_cmd[0];
/*
 * ll_iocontrol_register() - register a dynamic ioctl handler.
 * @cb:    callback invoked for matching ioctl commands
 * @count: number of command ids in @cmd (bounded by LLIOC_MAX_CMD)
 * @cmd:   array of ioctl command numbers the callback services
 *
 * Allocates an llioc_data record sized for @count commands and links it
 * onto the global llioc list under the write semaphore.  Returns the
 * record pointer as an opaque "magic" cookie for later unregistration
 * (return statements are in gaps of this extract).
 */
4160 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
4163 struct llioc_data *in_data = NULL;
/* validate arguments before sizing the allocation */
4166 if (cb == NULL || cmd == NULL ||
4167 count > LLIOC_MAX_CMD || count < 0)
4170 size = sizeof(*in_data) + count * sizeof(unsigned int);
4171 OBD_ALLOC(in_data, size);
4172 if (in_data == NULL)
4175 memset(in_data, 0, sizeof(*in_data));
4176 in_data->iocd_size = size;
4177 in_data->iocd_cb = cb;
4178 in_data->iocd_count = count;
4179 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
4181 down_write(&llioc.ioc_sem);
4182 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
4183 up_write(&llioc.ioc_sem);
/*
 * ll_iocontrol_unregister() - remove a handler previously registered
 * via ll_iocontrol_register().  @magic is the cookie returned by the
 * register call.  Walks the list under the write semaphore; on a match
 * (the comparison line falls in a gap of this extract) the entry is
 * unlinked, the semaphore dropped, and the record freed.  If no entry
 * matches, a warning is logged.
 */
4188 void ll_iocontrol_unregister(void *magic)
4190 struct llioc_data *tmp;
4195 down_write(&llioc.ioc_sem);
4196 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* capture size before unlinking so the free uses the right length */
4198 unsigned int size = tmp->iocd_size;
4200 list_del(&tmp->iocd_list);
4201 up_write(&llioc.ioc_sem);
4203 OBD_FREE(tmp, size);
4207 up_write(&llioc.ioc_sem);
4209 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
/* export the dynamic-ioctl registry API to other kernel modules */
4212 EXPORT_SYMBOL(ll_iocontrol_register);
4213 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * ll_iocontrol_call() - dispatch an ioctl to registered dynamic
 * handlers.  Scans each registered llioc_data's command list under the
 * read semaphore; on a command match, invokes the callback, which may
 * return LLIOC_STOP to end the scan.  *rcp receives the handler's
 * result (initialized to -EINVAL for "no handler claimed it").
 */
4215 static enum llioc_iter
4216 ll_iocontrol_call(struct inode *inode, struct file *file,
4217 unsigned int cmd, unsigned long arg, int *rcp)
4219 enum llioc_iter ret = LLIOC_CONT;
4220 struct llioc_data *data;
4221 int rc = -EINVAL, i;
4223 down_read(&llioc.ioc_sem);
4224 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
4225 for (i = 0; i < data->iocd_count; i++) {
4226 if (cmd != data->iocd_cmd[i])
4229 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
/* handler consumed the ioctl — stop iterating */
4233 if (ret == LLIOC_STOP)
4236 up_read(&llioc.ioc_sem);
/*
 * ll_layout_conf() - push a layout configuration down to the cl_object
 * stack for @inode.  For OBJECT_CONF_SET, the layout came with a layout
 * DLM lock: after a successful cl_conf_set() the lock is allowed to
 * match (it must not be matchable before the layout is applied, or a
 * stale layout could be observed), and the inode's cached layout
 * generation is refreshed from the object.
 */
4243 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4245 struct ll_inode_info *lli = ll_i2info(inode);
4246 struct cl_object *obj = lli->lli_clob;
4255 env = cl_env_get(&refcheck);
4257 RETURN(PTR_ERR(env));
4259 rc = cl_conf_set(env, lli->lli_clob, conf);
4263 if (conf->coc_opc == OBJECT_CONF_SET) {
4264 struct ldlm_lock *lock = conf->coc_lock;
4265 struct cl_layout cl = {
4269 LASSERT(lock != NULL);
4270 LASSERT(ldlm_has_layout(lock));
4272 /* it can only be allowed to match after layout is
4273 * applied to inode otherwise false layout would be
4274 * seen. Applying layout shoud happen before dropping
4275 * the intent lock. */
4276 ldlm_lock_allow_match(lock);
4278 rc = cl_object_layout_get(env, obj, &cl);
4283 DFID": layout version change: %u -> %u\n",
4284 PFID(&lli->lli_fid), ll_layout_version_get(lli),
4286 ll_layout_version_set(lli, cl.cl_layout_gen);
4290 cl_env_put(env, &refcheck);
4295 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
/*
 * ll_layout_fetch() - populate @lock->l_lvb_data with the file's LOV
 * layout.  If the lock already carries LVB data, nothing to do.
 * Otherwise issue an MD getxattr for XATTR_NAME_LOV, copy the reply
 * into a freshly allocated buffer, and attach it to the lock under the
 * resource lock.  If another thread raced and attached LVB data first,
 * our buffer is freed instead.  (Several GOTO/return lines fall in
 * gaps of this extract.)
 */
4296 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4299 struct ll_sb_info *sbi = ll_i2sbi(inode);
4300 struct ptlrpc_request *req;
4301 struct mdt_body *body;
4308 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4309 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4310 lock->l_lvb_data, lock->l_lvb_len);
/* layout already attached to the lock — nothing to fetch */
4312 if (lock->l_lvb_data != NULL)
4315 /* if layout lock was granted right away, the layout is returned
4316 * within DLM_LVB of dlm reply; otherwise if the lock was ever
4317 * blocked and then granted via completion ast, we have to fetch
4318 * layout here. Please note that we can't use the LVB buffer in
4319 * completion AST because it doesn't have a large enough buffer */
4320 rc = ll_get_default_mdsize(sbi, &lmmsize);
4322 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4323 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
4328 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4330 GOTO(out, rc = -EPROTO);
4332 lmmsize = body->mbo_eadatasize;
4333 if (lmmsize == 0) /* empty layout */
4336 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4338 GOTO(out, rc = -EFAULT);
4340 OBD_ALLOC_LARGE(lvbdata, lmmsize);
4341 if (lvbdata == NULL)
4342 GOTO(out, rc = -ENOMEM);
4344 memcpy(lvbdata, lmm, lmmsize);
/* attach the buffer to the lock atomically w.r.t. other fetchers */
4345 lock_res_and_lock(lock);
4346 if (unlikely(lock->l_lvb_data == NULL)) {
4347 lock->l_lvb_type = LVB_T_LAYOUT;
4348 lock->l_lvb_data = lvbdata;
4349 lock->l_lvb_len = lmmsize;
4352 unlock_res_and_lock(lock);
/* lost the race: another thread installed LVB data; free ours */
4355 OBD_FREE_LARGE(lvbdata, lmmsize);
4360 ptlrpc_req_finished(req);
/*
 * (original header, partially visible below)
 */
4365 * Apply the layout to the inode. Layout lock is held and will be released
/*
 * ll_layout_lock_set() - apply the layout carried by the granted layout
 * lock @lockh (mode @mode) to @inode.  Fetches the layout into the
 * lock's LVB if needed, configures the cl_object with it, then drops
 * the lock reference.  If the configuration returned -EBUSY (layout
 * still in use by IO), waits for IO completion via OBJECT_CONF_WAIT.
 * (The retry/return control flow around the visible lines falls in
 * gaps of this extract.)
 */
4368 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4369 struct inode *inode)
4371 struct ll_inode_info *lli = ll_i2info(inode);
4372 struct ll_sb_info *sbi = ll_i2sbi(inode);
4373 struct ldlm_lock *lock;
4374 struct cl_object_conf conf;
4377 bool wait_layout = false;
4380 LASSERT(lustre_handle_is_used(lockh));
4382 lock = ldlm_handle2lock(lockh);
4383 LASSERT(lock != NULL);
4384 LASSERT(ldlm_has_layout(lock));
4386 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4387 PFID(&lli->lli_fid), inode);
4389 /* in case this is a caching lock and reinstate with new inode */
4390 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
4392 lock_res_and_lock(lock);
4393 lvb_ready = ldlm_is_lvb_ready(lock);
4394 unlock_res_and_lock(lock);
4396 /* checking lvb_ready is racy but this is okay. The worst case is
4397 * that multi processes may configure the file on the same time. */
4401 rc = ll_layout_fetch(inode, lock);
4405 /* for layout lock, lmm is stored in lock's lvb.
4406 * lvb_data is immutable if the lock is held so it's safe to access it
4409 * set layout to file. Unlikely this will fail as old layout was
4410 * surely eliminated */
4411 memset(&conf, 0, sizeof conf);
4412 conf.coc_opc = OBJECT_CONF_SET;
4413 conf.coc_inode = inode;
4414 conf.coc_lock = lock;
4415 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
4416 conf.u.coc_layout.lb_len = lock->l_lvb_len;
4417 rc = ll_layout_conf(inode, &conf);
4419 /* refresh layout failed, need to wait */
4420 wait_layout = rc == -EBUSY;
4423 LDLM_LOCK_PUT(lock);
4424 ldlm_lock_decref(lockh, mode);
4426 /* wait for IO to complete if it's still being used. */
4428 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
4429 ll_get_fsname(inode->i_sb, NULL, 0),
4430 PFID(&lli->lli_fid), inode);
/* OBJECT_CONF_WAIT blocks until outstanding IO drains */
4432 memset(&conf, 0, sizeof conf);
4433 conf.coc_opc = OBJECT_CONF_WAIT;
4434 conf.coc_inode = inode;
4435 rc = ll_layout_conf(inode, &conf);
4439 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4440 ll_get_fsname(inode->i_sb, NULL, 0),
4441 PFID(&lli->lli_fid), rc);
4447 * Issue layout intent RPC to MDS.
4448 * \param inode [in] file inode
4449 * \param intent [in] layout intent
4451 * \retval 0 on success
4452 * \retval < 0 error code
/*
 * Builds an IT_LAYOUT lookup intent (FMODE_WRITE for write/truncate
 * intents), sends it via md_intent_lock(), records the resulting DLM
 * lock against the inode, and drops the intent's lock reference.
 */
4454 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
4456 struct ll_inode_info *lli = ll_i2info(inode);
4457 struct ll_sb_info *sbi = ll_i2sbi(inode);
4458 struct md_op_data *op_data;
4459 struct lookup_intent it;
4460 struct ptlrpc_request *req;
4464 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4465 0, 0, LUSTRE_OPC_ANY, NULL);
4466 if (IS_ERR(op_data))
4467 RETURN(PTR_ERR(op_data));
/* carry the layout intent payload in the MD op data */
4469 op_data->op_data = intent;
4470 op_data->op_data_size = sizeof(*intent);
4472 memset(&it, 0, sizeof(it));
4473 it.it_op = IT_LAYOUT;
4474 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
4475 intent->li_opc == LAYOUT_INTENT_TRUNC)
4476 it.it_flags = FMODE_WRITE;
4478 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4479 ll_get_fsname(inode->i_sb, NULL, 0),
4480 PFID(&lli->lli_fid), inode);
4482 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
4483 &ll_md_blocking_ast, 0);
4484 if (it.it_request != NULL)
4485 ptlrpc_req_finished(it.it_request);
4486 it.it_request = NULL;
4488 ll_finish_md_op_data(op_data);
4490 /* set lock data in case this is a new lock */
4492 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4494 ll_intent_drop_lock(&it);
4500 * This function checks if there exists a LAYOUT lock on the client side,
4501 * or enqueues it if it doesn't have one in cache.
4503 * This function will not hold layout lock so it may be revoked any time after
4504 * this function returns. Any operations depend on layout should be redone
4507 * This function should be called before lov_io_init() to get an uptodate
4508 * layout version, the caller should save the version number and after IO
4509 * is finished, this function should be called again to verify that layout
4510 * is not changed during IO time.
/*
 * Fast path: return the cached generation when layout locks are
 * disabled or a layout is already configured.  Otherwise, under the
 * per-inode layout mutex, try to match a cached layout DLM lock; on a
 * hit apply it via ll_layout_lock_set(), on a miss enqueue a
 * LAYOUT_INTENT_ACCESS intent.  *gen is refreshed before unlocking.
 * (The loop/return lines between these steps fall in gaps of this
 * extract.)
 */
4512 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4514 struct ll_inode_info *lli = ll_i2info(inode);
4515 struct ll_sb_info *sbi = ll_i2sbi(inode);
4516 struct lustre_handle lockh;
4517 struct layout_intent intent = {
4518 .li_opc = LAYOUT_INTENT_ACCESS,
4520 enum ldlm_mode mode;
4524 *gen = ll_layout_version_get(lli);
4525 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
4529 LASSERT(fid_is_sane(ll_inode2fid(inode)));
4530 LASSERT(S_ISREG(inode->i_mode));
4532 /* take layout lock mutex to enqueue layout lock exclusively. */
4533 mutex_lock(&lli->lli_layout_mutex);
4536 /* mostly layout lock is caching on the local side, so try to
4537 * match it before grabbing layout lock mutex. */
4538 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4539 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
4540 if (mode != 0) { /* hit cached lock */
4541 rc = ll_layout_lock_set(&lockh, mode, inode);
/* no cached lock: ask the MDS for the layout via intent RPC */
4547 rc = ll_layout_intent(inode, &intent);
4553 *gen = ll_layout_version_get(lli);
4554 mutex_unlock(&lli->lli_layout_mutex);
4560 * Issue layout intent RPC indicating where in a file an IO is about to write.
4562 * \param[in] inode file inode.
4563 * \param[in] start start offset of fille in bytes where an IO is about to
4565 * \param[in] end exclusive end offset in bytes of the write range.
4567 * \retval 0 on success
4568 * \retval < 0 error code
/*
 * Thin wrapper around ll_layout_intent() with a LAYOUT_INTENT_WRITE
 * opcode (extent-field initializers fall in a gap of this extract).
 */
4570 int ll_layout_write_intent(struct inode *inode, __u64 start, __u64 end)
4572 struct layout_intent intent = {
4573 .li_opc = LAYOUT_INTENT_WRITE,
4580 rc = ll_layout_intent(inode, &intent);
4586 * This function send a restore request to the MDT
4588 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4590 struct hsm_user_request *hur;
4594 len = sizeof(struct hsm_user_request) +
4595 sizeof(struct hsm_user_item);
4596 OBD_ALLOC(hur, len);
4600 hur->hur_request.hr_action = HUA_RESTORE;
4601 hur->hur_request.hr_archive_id = 0;
4602 hur->hur_request.hr_flags = 0;
4603 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4604 sizeof(hur->hur_user_item[0].hui_fid));
4605 hur->hur_user_item[0].hui_extent.offset = offset;
4606 hur->hur_user_item[0].hui_extent.length = length;
4607 hur->hur_request.hr_itemcount = 1;
4608 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,