4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2016, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
48 #include <lustre/ll_fiemap.h>
50 #include <uapi/linux/lustre_ioctl.h>
51 #include <lustre_swab.h>
53 #include "cl_object.h"
54 #include "llite_internal.h"
55 #include "vvp_internal.h"
58 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
60 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
63 static enum llioc_iter
64 ll_iocontrol_call(struct inode *inode, struct file *file,
65 unsigned int cmd, unsigned long arg, int *rcp);
/*
 * Allocate per-open-file private state from ll_file_data_slab.
 * GFP_NOFS is used so reclaim cannot recurse back into the filesystem.
 * NOTE(review): the NULL check and return statement fall in lines not
 * visible in this excerpt -- confirm against the full source.
 */
67 static struct ll_file_data *ll_file_data_get(void)
69 struct ll_file_data *fd;
71 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
/* Fresh descriptor: no write failure recorded yet. */
75 fd->fd_write_failed = false;
/* Return a ll_file_data previously obtained from ll_file_data_get()
 * back to its slab cache. */
80 static void ll_file_data_put(struct ll_file_data *fd)
83 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
87 * Packs all the attributes into @op_data for the CLOSE rpc.
/*
 * Fill @op_data with the inode attributes (mode, times, size, blocks,
 * flags) and the open handle @och, in preparation for the CLOSE RPC
 * sent to the MDT.
 */
89 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
90 struct obd_client_handle *och)
94 ll_prep_md_op_data(op_data, inode, NULL, NULL,
95 0, 0, LUSTRE_OPC_ANY, NULL);
/* Snapshot current inode attributes into the close request. */
97 op_data->op_attr.ia_mode = inode->i_mode;
98 op_data->op_attr.ia_atime = inode->i_atime;
99 op_data->op_attr.ia_mtime = inode->i_mtime;
100 op_data->op_attr.ia_ctime = inode->i_ctime;
101 op_data->op_attr.ia_size = i_size_read(inode);
/* Mark which attributes above are valid for the server to apply. */
102 op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
103 ATTR_MTIME | ATTR_MTIME_SET |
104 ATTR_CTIME | ATTR_CTIME_SET;
105 op_data->op_attr_blocks = inode->i_blocks;
106 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
107 op_data->op_handle = och->och_fh;
/* LLIF_DATA_MODIFIED is a test-and-clear: packing it into the bias
 * consumes the flag. */
109 if (och->och_flags & FMODE_WRITE &&
110 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
111 /* For HSM: if inode data has been modified, pack it so that
112 * MDT can set data dirty flag in the archive. */
113 op_data->op_bias |= MDS_DATA_MODIFIED;
119 * Perform a close, possibly with a bias.
120 * The meaning of "data" depends on the value of "bias".
122 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
123 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
/*
 * Send the CLOSE RPC for open handle @och on @inode, applying @bias.
 * @data meaning depends on @bias: data version pointer for
 * MDS_HSM_RELEASE, target inode for MDS_CLOSE_LAYOUT_SWAP, and must be
 * NULL otherwise (see LASSERTs below).
 * NOTE(review): this excerpt is line-sampled; the switch(bias) header
 * and several break/cleanup lines are not visible here.
 */
126 static int ll_close_inode_openhandle(struct inode *inode,
127 struct obd_client_handle *och,
128 enum mds_op_bias bias, void *data)
130 struct obd_export *md_exp = ll_i2mdexp(inode);
131 const struct ll_inode_info *lli = ll_i2info(inode);
132 struct md_op_data *op_data;
133 struct ptlrpc_request *req = NULL;
/* No usable MDC connection: nothing we can send the close to. */
137 if (class_exp2obd(md_exp) == NULL) {
138 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
139 ll_get_fsname(inode->i_sb, NULL, 0),
140 PFID(&lli->lli_fid));
144 OBD_ALLOC_PTR(op_data);
145 /* We leak openhandle and request here on error, but not much to be
146 * done in OOM case since app won't retry close on error either. */
148 GOTO(out, rc = -ENOMEM);
150 ll_prepare_close(inode, op_data, och);
/* Layout swap close: carry the lease handle plus the FID of the
 * second inode whose layout is exchanged with ours. */
152 case MDS_CLOSE_LAYOUT_SWAP:
153 LASSERT(data != NULL);
154 op_data->op_bias |= MDS_CLOSE_LAYOUT_SWAP;
155 op_data->op_data_version = 0;
156 op_data->op_lease_handle = och->och_lease_handle;
157 op_data->op_fid2 = *ll_inode2fid(data);
/* HSM release close: carry the caller-supplied data version so the
 * MDT can verify the file was unchanged since archiving. */
160 case MDS_HSM_RELEASE:
161 LASSERT(data != NULL);
162 op_data->op_bias |= MDS_HSM_RELEASE;
163 op_data->op_data_version = *(__u64 *)data;
164 op_data->op_lease_handle = och->och_lease_handle;
165 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
169 LASSERT(data == NULL);
173 rc = md_close(md_exp, op_data, och->och_mod, &req);
/* -EINTR is an expected interruption, not worth a console error. */
174 if (rc != 0 && rc != -EINTR)
175 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
176 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
/* For biased closes, check whether the server actually executed the
 * close intent (release/swap) by inspecting the reply body. */
179 op_data->op_bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)) {
180 struct mdt_body *body;
182 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
183 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
187 ll_finish_md_op_data(op_data);
/* Handle is dead from here on; poison the cookie so reuse is caught. */
191 md_clear_open_replay_data(md_exp, och);
192 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
195 ptlrpc_req_finished(req); /* This is close request */
/*
 * Drop one user of the per-inode MDS open handle that matches @fmode
 * (write/exec/read) and, when this was the last user, actually close
 * the handle on the MDT via ll_close_inode_openhandle().
 */
199 int ll_md_real_close(struct inode *inode, fmode_t fmode)
201 struct ll_inode_info *lli = ll_i2info(inode);
202 struct obd_client_handle **och_p;
203 struct obd_client_handle *och;
/* Select the handle/usecount pair matching the open mode. */
208 if (fmode & FMODE_WRITE) {
209 och_p = &lli->lli_mds_write_och;
210 och_usecount = &lli->lli_open_fd_write_count;
211 } else if (fmode & FMODE_EXEC) {
212 och_p = &lli->lli_mds_exec_och;
213 och_usecount = &lli->lli_open_fd_exec_count;
215 LASSERT(fmode & FMODE_READ);
216 och_p = &lli->lli_mds_read_och;
217 och_usecount = &lli->lli_open_fd_read_count;
220 mutex_lock(&lli->lli_och_mutex);
221 if (*och_usecount > 0) {
222 /* There are still users of this handle, so skip
224 mutex_unlock(&lli->lli_och_mutex);
230 mutex_unlock(&lli->lli_och_mutex);
233 /* There might be a race and this handle may already
235 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/*
 * Per-file close path: release group lock, leftover lease and fd_och
 * handles, decrement the matching lli open counter, and close the MDS
 * open handle unless a cached OPEN ibits lock lets us skip the RPC.
 * Frees the file's ll_file_data at the end.
 */
241 static int ll_md_close(struct inode *inode, struct file *file)
243 union ldlm_policy_data policy = {
244 .l_inodebits = { MDS_INODELOCK_OPEN },
/* TEST_LOCK: only probe for a granted OPEN lock, don't take a ref. */
246 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
247 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
248 struct ll_inode_info *lli = ll_i2info(inode);
249 struct lustre_handle lockh;
250 enum ldlm_mode lockmode;
254 /* clear group lock, if present */
255 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
256 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
258 if (fd->fd_lease_och != NULL) {
261 /* Usually the lease is not released when the
262 * application crashed, we need to release here. */
263 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
264 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
265 PFID(&lli->lli_fid), rc, lease_broken);
267 fd->fd_lease_och = NULL;
/* fd_och holds an open handle taken over for lease purposes; close
 * it directly since it is private to this fd. */
270 if (fd->fd_och != NULL) {
271 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
276 /* Let's see if we have good enough OPEN lock on the file and if
277 we can skip talking to MDS */
278 mutex_lock(&lli->lli_och_mutex);
279 if (fd->fd_omode & FMODE_WRITE) {
281 LASSERT(lli->lli_open_fd_write_count);
282 lli->lli_open_fd_write_count--;
283 } else if (fd->fd_omode & FMODE_EXEC) {
285 LASSERT(lli->lli_open_fd_exec_count);
286 lli->lli_open_fd_exec_count--;
289 LASSERT(lli->lli_open_fd_read_count);
290 lli->lli_open_fd_read_count--;
292 mutex_unlock(&lli->lli_och_mutex);
/* No cached OPEN lock -> must do the real close RPC now. */
294 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
295 LDLM_IBITS, &policy, lockmode, &lockh))
296 rc = ll_md_real_close(inode, fd->fd_omode);
299 LUSTRE_FPRIVATE(file) = NULL;
300 ll_file_data_put(fd);
305 /* While this returns an error code, fput() the caller does not, so we need
306 * to make every effort to clean up all of our state here. Also, applications
307 * rarely check close errors and even if an error is returned they will not
308 * re-try the close call.
/*
 * ->release() VFS entry point. Deauthorizes statahead when this fd was
 * its opener, clears cached async write errors for regular files, and
 * funnels into ll_md_close(). The root dentry gets a short-circuit
 * path that just frees the fd.
 */
310 int ll_file_release(struct inode *inode, struct file *file)
312 struct ll_file_data *fd;
313 struct ll_sb_info *sbi = ll_i2sbi(inode);
314 struct ll_inode_info *lli = ll_i2info(inode);
318 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
319 PFID(ll_inode2fid(inode)), inode);
/* Don't count releases of the filesystem root in the stats. */
321 if (inode->i_sb->s_root != file_dentry(file))
322 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
323 fd = LUSTRE_FPRIVATE(file);
326 /* The last ref on @file, maybe not the the owner pid of statahead,
327 * because parent and child process can share the same file handle. */
328 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
329 ll_deauthorize_statahead(inode, fd);
/* Root dentry: no MDS close needed, just drop the fd. */
331 if (inode->i_sb->s_root == file_dentry(file)) {
332 LUSTRE_FPRIVATE(file) = NULL;
333 ll_file_data_put(fd);
/* Pick up any async write error recorded on the cl_object so it can
 * be reported from this close. */
337 if (!S_ISDIR(inode->i_mode)) {
338 if (lli->lli_clob != NULL)
339 lov_read_and_clear_async_rc(lli->lli_clob);
340 lli->lli_async_rc = 0;
343 rc = ll_md_close(inode, file);
/* Fault-injection hook: optionally dump the debug log on close. */
345 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
346 libcfs_debug_dumplog();
/*
 * Issue an intent-OPEN enqueue to the MDS for dentry @de, packing the
 * name only when the server lacks OBD_CONNECT_OPEN_BY_FID support.
 * On success fills the inode from the reply and sets lock data.
 * NOTE(review): line-sampled excerpt; error labels and RETURNs between
 * the visible lines are not shown here.
 */
351 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
352 struct lookup_intent *itp)
354 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
355 struct dentry *parent = de->d_parent;
356 const char *name = NULL;
358 struct md_op_data *op_data;
359 struct ptlrpc_request *req = NULL;
363 LASSERT(parent != NULL);
364 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
366 /* if server supports open-by-fid, or file name is invalid, don't pack
367 * name in open request */
368 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
369 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
370 name = de->d_name.name;
371 len = de->d_name.len;
374 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
375 name, len, 0, LUSTRE_OPC_ANY, NULL);
377 RETURN(PTR_ERR(op_data));
/* Pass any caller-supplied layout (lmm) along with the open. */
378 op_data->op_data = lmm;
379 op_data->op_data_size = lmmsize;
381 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
382 &ll_md_blocking_ast, 0);
383 ll_finish_md_op_data(op_data);
385 /* reason for keep own exit path - don`t flood log
386 * with messages with -ESTALE errors.
/* If the server didn't actually execute the open (or it failed),
 * release any openhandle we may still be holding. */
388 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
389 it_open_error(DISP_OPEN_OPEN, itp))
391 ll_release_openhandle(de, itp);
395 if (it_disposition(itp, DISP_LOOKUP_NEG))
396 GOTO(out, rc = -ENOENT);
398 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
399 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
400 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* Update/instantiate the inode from the MDS reply, then attach the
 * acquired lock to it. */
404 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
405 if (!rc && itp->it_lock_mode)
406 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
409 ptlrpc_req_finished(req);
410 ll_intent_drop_lock(itp);
412 /* We did open by fid, but by the time we got to the server,
413 * the object disappeared. If this is a create, we cannot really
414 * tell the userspace that the file it was trying to create
415 * does not exist. Instead let's return -ESTALE, and the VFS will
416 * retry the create with LOOKUP_REVAL that we are going to catch
417 * in ll_revalidate_dentry() and use lookup then.
419 if (rc == -ENOENT && itp->it_op & IT_CREAT)
/*
 * Initialise an obd_client_handle from the MDT reply body carried in
 * @it (open handle, FID, lease cookie, flags) and register it for
 * open replay with the MDC.
 */
425 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
426 struct obd_client_handle *och)
428 struct mdt_body *body;
430 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
431 och->och_fh = body->mbo_handle;
432 och->och_fid = body->mbo_fid1;
433 och->och_lease_handle.cookie = it->it_lock_handle;
434 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
435 och->och_flags = it->it_flags;
/* Ensure the open can be replayed if the MDS connection is lost. */
437 return md_set_open_replay_data(md_exp, och, it);
/*
 * Complete the client-side part of an open: optionally fill @och from
 * the intent, install @fd as the file's private data, and initialise
 * readahead and cl_io context state.
 * NOTE(review): the och==NULL branch structure is partially elided in
 * this excerpt.
 */
440 static int ll_local_open(struct file *file, struct lookup_intent *it,
441 struct ll_file_data *fd, struct obd_client_handle *och)
443 struct inode *inode = file_inode(file);
/* Caller must not have installed private data yet. */
446 LASSERT(!LUSTRE_FPRIVATE(file));
453 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
458 LUSTRE_FPRIVATE(file) = fd;
459 ll_readahead_init(inode, &fd->fd_ras);
/* Remember only the access-mode bits of the open for later close. */
460 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
462 /* ll_cl_context initialize */
463 rwlock_init(&fd->fd_lock);
464 INIT_LIST_HEAD(&fd->fd_lccs);
469 /* Open a file, and (for the very first open) create objects on the OSTs at
470 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
471 * creation or open until ll_lov_setstripe() ioctl is called.
473 * If we already have the stripe MD locally then we don't request it in
474 * md_open(), by passing a lmm_size = 0.
476 * It is up to the application to ensure no other processes open this file
477 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
478 * used. We might be able to avoid races of that sort by getting lli_open_sem
479 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
480 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/*
 * ->open() VFS entry point. Reuses an existing per-inode MDS open
 * handle when one exists for this access mode; otherwise performs an
 * intent open (always MDS_OPEN_BY_FID) and stores the new handle.
 * NOTE(review): this excerpt is line-sampled; several labels, braces
 * and restart paths fall between the visible lines.
 */
482 int ll_file_open(struct inode *inode, struct file *file)
484 struct ll_inode_info *lli = ll_i2info(inode);
485 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
486 .it_flags = file->f_flags };
487 struct obd_client_handle **och_p = NULL;
488 __u64 *och_usecount = NULL;
489 struct ll_file_data *fd;
493 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
494 PFID(ll_inode2fid(inode)), inode, file->f_flags);
496 it = file->private_data; /* XXX: compat macro */
497 file->private_data = NULL; /* prevent ll_local_open assertion */
499 fd = ll_file_data_get();
501 GOTO(out_openerr, rc = -ENOMEM);
504 if (S_ISDIR(inode->i_mode))
505 ll_authorize_statahead(inode, fd);
/* Root dentry short-circuit: no MDS open needed. */
507 if (inode->i_sb->s_root == file_dentry(file)) {
508 LUSTRE_FPRIVATE(file) = fd;
/* No pre-packaged intent from lookup: build one from f_flags. */
512 if (!it || !it->it_disposition) {
513 /* Convert f_flags into access mode. We cannot use file->f_mode,
514 * because everything but O_ACCMODE mask was stripped from
516 if ((oit.it_flags + 1) & O_ACCMODE)
518 if (file->f_flags & O_TRUNC)
519 oit.it_flags |= FMODE_WRITE;
521 /* kernel only call f_op->open in dentry_open. filp_open calls
522 * dentry_open after call to open_namei that checks permissions.
523 * Only nfsd_open call dentry_open directly without checking
524 * permissions and because of that this code below is safe. */
525 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
526 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
528 /* We do not want O_EXCL here, presumably we opened the file
529 * already? XXX - NFS implications? */
530 oit.it_flags &= ~O_EXCL;
532 /* bug20584, if "it_flags" contains O_CREAT, the file will be
533 * created if necessary, then "IT_CREAT" should be set to keep
534 * consistent with it */
535 if (oit.it_flags & O_CREAT)
536 oit.it_op |= IT_CREAT;
542 /* Let's see if we have file open on MDS already. */
543 if (it->it_flags & FMODE_WRITE) {
544 och_p = &lli->lli_mds_write_och;
545 och_usecount = &lli->lli_open_fd_write_count;
546 } else if (it->it_flags & FMODE_EXEC) {
547 och_p = &lli->lli_mds_exec_och;
548 och_usecount = &lli->lli_open_fd_exec_count;
550 och_p = &lli->lli_mds_read_och;
551 och_usecount = &lli->lli_open_fd_read_count;
554 mutex_lock(&lli->lli_och_mutex);
555 if (*och_p) { /* Open handle is present */
556 if (it_disposition(it, DISP_OPEN_OPEN)) {
557 /* Well, there's extra open request that we do not need,
558 let's close it somehow. This will decref request. */
559 rc = it_open_error(DISP_OPEN_OPEN, it);
561 mutex_unlock(&lli->lli_och_mutex);
562 GOTO(out_openerr, rc);
565 ll_release_openhandle(file_dentry(file), it);
/* Reuse the existing handle: local open only, no new RPC. */
569 rc = ll_local_open(file, it, fd, NULL);
572 mutex_unlock(&lli->lli_och_mutex);
573 GOTO(out_openerr, rc);
576 LASSERT(*och_usecount == 0);
577 if (!it->it_disposition) {
578 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
579 /* We cannot just request lock handle now, new ELC code
580 means that one of other OPEN locks for this file
581 could be cancelled, and since blocking ast handler
582 would attempt to grab och_mutex as well, that would
583 result in a deadlock */
584 mutex_unlock(&lli->lli_och_mutex);
586 * Normally called under two situations:
588 * 2. A race/condition on MDS resulting in no open
589 * handle to be returned from LOOKUP|OPEN request,
590 * for example if the target entry was a symlink.
592 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
593 * marked by a bit set in ll_iget_for_nfs. Clear the
594 * bit so that it's not confusing later callers.
596 * NB; when ldd is NULL, it must have come via normal
597 * lookup path only, since ll_iget_for_nfs always calls
600 if (ldd && ldd->lld_nfs_dentry) {
601 ldd->lld_nfs_dentry = 0;
602 it->it_flags |= MDS_OPEN_LOCK;
606 * Always specify MDS_OPEN_BY_FID because we don't want
607 * to get file with different fid.
609 it->it_flags |= MDS_OPEN_BY_FID;
610 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
613 GOTO(out_openerr, rc);
617 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
619 GOTO(out_och_free, rc = -ENOMEM);
623 /* md_intent_lock() didn't get a request ref if there was an
624 * open error, so don't do cleanup on the request here
626 /* XXX (green): Should not we bail out on any error here, not
627 * just open error? */
628 rc = it_open_error(DISP_OPEN_OPEN, it);
630 GOTO(out_och_free, rc);
632 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
633 "inode %p: disposition %x, status %d\n", inode,
634 it_disposition(it, ~0), it->it_status);
636 rc = ll_local_open(file, it, fd, *och_p);
638 GOTO(out_och_free, rc);
640 mutex_unlock(&lli->lli_och_mutex);
643 /* Must do this outside lli_och_mutex lock to prevent deadlock where
644 different kind of OPEN lock for this same inode gets cancelled
645 by ldlm_cancel_lru */
646 if (!S_ISREG(inode->i_mode))
647 GOTO(out_och_free, rc);
649 cl_lov_delay_create_clear(&file->f_flags);
650 GOTO(out_och_free, rc);
/* Error/cleanup paths below: free handle, drop fd, finish request. */
654 if (och_p && *och_p) {
655 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
656 *och_p = NULL; /* OBD_FREE writes some magic there */
659 mutex_unlock(&lli->lli_och_mutex);
662 if (lli->lli_opendir_key == fd)
663 ll_deauthorize_statahead(inode, fd);
665 ll_file_data_put(fd);
667 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
670 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
671 ptlrpc_req_finished(it->it_request);
672 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for lease locks: on LDLM_CB_BLOCKING, cancel the lease
 * lock asynchronously. Unlike ll_md_blocking_ast(), this callback does
 * not touch the open handle (see comment at the ll_lease_open() call
 * site about LDLM_FL_EXCL).
 */
678 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
679 struct ldlm_lock_desc *desc, void *data, int flag)
682 struct lustre_handle lockh;
686 case LDLM_CB_BLOCKING:
687 ldlm_lock2handle(lock, &lockh);
688 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
690 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
/* LDLM_CB_CANCELING: nothing extra to do for a lease lock here. */
694 case LDLM_CB_CANCELING:
702 * When setting a lease on a file, we take ownership of the lli_mds_*_och
703 * and save it as fd->fd_och so as to force client to reopen the file even
704 * if it has an open lock in cache already.
/*
 * Take ownership of lli_mds_write_och/lli_mds_read_och as fd->fd_och
 * for a lease, so a later open is forced to go to the MDT rather than
 * being satisfied from the cached handle. Returns the current open
 * handle in @old_handle. Fails with -EBUSY when a lease already
 * exists or the handle is shared with other users.
 */
706 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
707 struct lustre_handle *old_handle)
709 struct ll_inode_info *lli = ll_i2info(inode);
710 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
711 struct obd_client_handle **och_p;
716 /* Get the openhandle of the file */
717 mutex_lock(&lli->lli_och_mutex);
718 if (fd->fd_lease_och != NULL)
719 GOTO(out_unlock, rc = -EBUSY);
721 if (fd->fd_och == NULL) {
722 if (file->f_mode & FMODE_WRITE) {
723 LASSERT(lli->lli_mds_write_och != NULL);
724 och_p = &lli->lli_mds_write_och;
725 och_usecount = &lli->lli_open_fd_write_count;
727 LASSERT(lli->lli_mds_read_och != NULL);
728 och_p = &lli->lli_mds_read_och;
729 och_usecount = &lli->lli_open_fd_read_count;
/* Can't take exclusive ownership if other fds share this handle. */
732 if (*och_usecount > 1)
733 GOTO(out_unlock, rc = -EBUSY);
740 *old_handle = fd->fd_och->och_fh;
744 mutex_unlock(&lli->lli_och_mutex);
749 * Release ownership on lli_mds_*_och when putting back a file lease.
/*
 * Give the fd_och handle (taken in ll_lease_och_acquire()) back to the
 * per-inode slot when the lease is put back. If another process has
 * repopulated the slot in the meantime (broken lease), the old handle
 * is closed instead.
 */
751 static int ll_lease_och_release(struct inode *inode, struct file *file)
753 struct ll_inode_info *lli = ll_i2info(inode);
754 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
755 struct obd_client_handle **och_p;
756 struct obd_client_handle *old_och = NULL;
761 mutex_lock(&lli->lli_och_mutex);
762 if (file->f_mode & FMODE_WRITE) {
763 och_p = &lli->lli_mds_write_och;
764 och_usecount = &lli->lli_open_fd_write_count;
766 och_p = &lli->lli_mds_read_och;
767 och_usecount = &lli->lli_open_fd_read_count;
770 /* The file may have been open by another process (broken lease) so
771 * *och_p is not NULL. In this case we should simply increase usecount
774 if (*och_p != NULL) {
775 old_och = fd->fd_och;
782 mutex_unlock(&lli->lli_och_mutex);
/* Close the displaced handle outside the mutex. */
785 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
791 * Acquire a lease and open the file.
/*
 * Acquire an open lease on @inode for @fmode (FMODE_READ or
 * FMODE_WRITE only) and reopen the file with MDS_OPEN_LEASE.
 * Returns the new open handle or an ERR_PTR; on failure after the
 * open, the handle is closed and the lease lock cancelled.
 */
793 static struct obd_client_handle *
794 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
797 struct lookup_intent it = { .it_op = IT_OPEN };
798 struct ll_sb_info *sbi = ll_i2sbi(inode);
799 struct md_op_data *op_data;
800 struct ptlrpc_request *req = NULL;
801 struct lustre_handle old_handle = { 0 };
802 struct obd_client_handle *och = NULL;
/* Lease must be exactly read or exactly write, never both/neither. */
807 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
808 RETURN(ERR_PTR(-EINVAL));
/* Requested mode must be covered by the file's open mode, and exec
 * opens cannot take leases. */
811 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
812 RETURN(ERR_PTR(-EPERM));
814 rc = ll_lease_och_acquire(inode, file, &old_handle);
821 RETURN(ERR_PTR(-ENOMEM));
823 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
824 LUSTRE_OPC_ANY, NULL);
826 GOTO(out, rc = PTR_ERR(op_data));
828 /* To tell the MDT this openhandle is from the same owner */
829 op_data->op_handle = old_handle;
831 it.it_flags = fmode | open_flags;
832 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
833 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
834 &ll_md_blocking_lease_ast,
835 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
836 * it can be cancelled which may mislead applications that the lease is
838 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
839 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
840 * doesn't deal with openhandle, so normal openhandle will be leaked. */
841 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
842 ll_finish_md_op_data(op_data);
843 ptlrpc_req_finished(req);
845 GOTO(out_release_it, rc);
847 if (it_disposition(&it, DISP_LOOKUP_NEG))
848 GOTO(out_release_it, rc = -ENOENT);
850 rc = it_open_error(DISP_OPEN_OPEN, &it);
852 GOTO(out_release_it, rc);
854 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
855 ll_och_fill(sbi->ll_md_exp, &it, och);
/* Servers without lease support don't return DISP_OPEN_LEASE. */
857 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
858 GOTO(out_close, rc = -EOPNOTSUPP);
860 /* already get lease, handle lease lock */
861 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
862 if (it.it_lock_mode == 0 ||
863 it.it_lock_bits != MDS_INODELOCK_OPEN) {
864 /* open lock must return for lease */
865 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
866 PFID(ll_inode2fid(inode)), it.it_lock_mode,
868 GOTO(out_close, rc = -EPROTO);
871 ll_intent_release(&it);
875 /* Cancel open lock */
876 if (it.it_lock_mode != 0) {
877 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
880 och->och_lease_handle.cookie = 0ULL;
882 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
884 CERROR("%s: error closing file "DFID": %d\n",
885 ll_get_fsname(inode->i_sb, NULL, 0),
886 PFID(&ll_i2info(inode)->lli_fid), rc2);
887 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
889 ll_intent_release(&it);
897 * Check whether a layout swap can be done between two inodes.
899 * \param[in] inode1 First inode to check
900 * \param[in] inode2 Second inode to check
902 * \retval 0 on success, layout swap can be performed between both inodes
903 * \retval negative error code if requirements are not met
/*
 * Validate that a layout swap is permitted between @inode1 and
 * @inode2: both regular files, both writable by the caller, and on
 * the same superblock.
 */
905 static int ll_check_swap_layouts_validity(struct inode *inode1,
906 struct inode *inode2)
908 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
/* Caller needs write permission on both files to exchange layouts. */
911 if (inode_permission(inode1, MAY_WRITE) ||
912 inode_permission(inode2, MAY_WRITE))
915 if (inode1->i_sb != inode2->i_sb)
/*
 * Close @inode's open handle @och with MDS_CLOSE_LAYOUT_SWAP bias,
 * swapping layouts with @inode2 atomically with the close. Rejects
 * swapping an inode with itself (FIDs equal -> -EINVAL).
 */
921 static int ll_swap_layouts_close(struct obd_client_handle *och,
922 struct inode *inode, struct inode *inode2)
924 const struct lu_fid *fid1 = ll_inode2fid(inode);
925 const struct lu_fid *fid2;
929 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
930 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
932 rc = ll_check_swap_layouts_validity(inode, inode2);
934 GOTO(out_free_och, rc);
936 /* We now know that inode2 is a lustre inode */
937 fid2 = ll_inode2fid(inode2);
/* Same FID means same file: a self-swap makes no sense. */
939 rc = lu_fid_cmp(fid1, fid2);
941 GOTO(out_free_och, rc = -EINVAL);
943 /* Close the file and swap layouts between inode & inode2.
944 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
945 * because we still need it to pack l_remote_handle to MDT. */
946 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
949 och = NULL; /* freed in ll_close_inode_openhandle() */
959 * Release lease and close the file.
960 * It will check if the lease has ever broken.
/*
 * Release the lease held by @och on @inode and close the open handle.
 * Reports via @lease_broken (may be NULL) whether the lease lock had
 * already been cancelled (i.e. the lease was broken before release).
 */
962 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
965 struct ldlm_lock *lock;
/* Default true: if the lock can no longer be found, treat the lease
 * as broken. */
966 bool cancelled = true;
970 lock = ldlm_handle2lock(&och->och_lease_handle);
972 lock_res_and_lock(lock);
973 cancelled = ldlm_is_cancel(lock);
974 unlock_res_and_lock(lock);
978 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
979 PFID(&ll_i2info(inode)->lli_fid), cancelled);
982 ldlm_cli_cancel(&och->och_lease_handle, 0);
984 if (lease_broken != NULL)
985 *lease_broken = cancelled;
987 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/*
 * Merge the MDS-provided inode attributes with the attributes obtained
 * from the OSTs (via cl_object_attr_get), keeping the most recent
 * timestamps, and update i_size/i_blocks. Runs under the inode size
 * lock.
 */
991 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
993 struct ll_inode_info *lli = ll_i2info(inode);
994 struct cl_object *obj = lli->lli_clob;
995 struct cl_attr *attr = vvp_env_thread_attr(env);
1003 ll_inode_size_lock(inode);
1005 /* Merge timestamps the most recently obtained from MDS with
1006 * timestamps obtained from OSTs.
1008 * Do not overwrite atime of inode because it may be refreshed
1009 * by file_accessed() function. If the read was served by cache
1010 * data, there is no RPC to be sent so that atime may not be
1011 * transferred to OSTs at all. MDT only updates atime at close time
1012 * if it's at least 'mdd.*.atime_diff' older.
1013 * All in all, the atime in Lustre does not strictly comply with
1014 * POSIX. Solving this problem needs to send an RPC to MDT for each
1015 * read, this will hurt performance. */
1016 if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
1017 LTIME_S(inode->i_atime) = lli->lli_atime;
1018 lli->lli_update_atime = 0;
1020 LTIME_S(inode->i_mtime) = lli->lli_mtime;
1021 LTIME_S(inode->i_ctime) = lli->lli_ctime;
/* Work on local copies while merging with OST-side values. */
1023 atime = LTIME_S(inode->i_atime);
1024 mtime = LTIME_S(inode->i_mtime);
1025 ctime = LTIME_S(inode->i_ctime);
1027 cl_object_attr_lock(obj);
1028 rc = cl_object_attr_get(env, obj, attr);
1029 cl_object_attr_unlock(obj);
1032 GOTO(out_size_unlock, rc);
/* Keep the newer of the MDS and OST timestamps. */
1034 if (atime < attr->cat_atime)
1035 atime = attr->cat_atime;
1037 if (ctime < attr->cat_ctime)
1038 ctime = attr->cat_ctime;
1040 if (mtime < attr->cat_mtime)
1041 mtime = attr->cat_mtime;
1043 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1044 PFID(&lli->lli_fid), attr->cat_size);
1046 i_size_write(inode, attr->cat_size);
1047 inode->i_blocks = attr->cat_blocks;
1049 LTIME_S(inode->i_atime) = atime;
1050 LTIME_S(inode->i_mtime) = mtime;
1051 LTIME_S(inode->i_ctime) = ctime;
1054 ll_inode_size_unlock(inode);
/*
 * Return true when atime updates should be skipped for @file, checking
 * the same conditions as the kernel's file_accessed()/touch_atime():
 * O_NOATIME, S_NOATIME, per-mount noatime/read-only, and the
 * nodiratime variants for directories.
 */
1059 static bool file_is_noatime(const struct file *file)
1061 const struct vfsmount *mnt = file->f_path.mnt;
1062 const struct inode *inode = file_inode((struct file *)file);
1064 /* Adapted from file_accessed() and touch_atime().*/
1065 if (file->f_flags & O_NOATIME)
1068 if (inode->i_flags & S_NOATIME)
1071 if (IS_NOATIME(inode))
1074 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1077 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1080 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1086 static int ll_file_io_ptask(struct cfs_ptask *ptask);
/*
 * Initialise a cl_io for a read or write on @file: set up the kiocb,
 * translate open flags (O_NONBLOCK/O_APPEND/O_SYNC/O_DIRECT) into io
 * attributes, choose the lock requirement mode, and enable parallel
 * IO (ci_pio) when the superblock has LL_SBI_PIO and this is not an
 * append write.
 */
1088 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1090 struct inode *inode = file_inode(file);
1092 memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter));
1093 init_sync_kiocb(&io->u.ci_rw.rw_iocb, file);
1094 io->u.ci_rw.rw_file = file;
1095 io->u.ci_rw.rw_ptask = ll_file_io_ptask;
1096 io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK);
1097 if (iot == CIT_WRITE) {
1098 io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND);
1099 io->u.ci_rw.rw_sync = !!(file->f_flags & O_SYNC ||
1100 file->f_flags & O_DIRECT ||
1103 io->ci_obj = ll_i2info(inode)->lli_clob;
1104 io->ci_lockreq = CILR_MAYBE;
/* nolock mount option: never take dlm locks, tell server too. */
1105 if (ll_file_nolock(file)) {
1106 io->ci_lockreq = CILR_NEVER;
1107 io->ci_no_srvlock = 1;
1108 } else if (file->f_flags & O_APPEND) {
1109 io->ci_lockreq = CILR_MANDATORY;
1111 io->ci_noatime = file_is_noatime(file);
1112 if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO)
1113 io->ci_pio = !io->u.ci_rw.rw_append;
/*
 * Parallel-task worker for a split read/write: runs one sub-range of
 * the original IO described by the cl_io_pt in @ptask. Sets up its own
 * cl_io (with ci_pio cleared, since we are already inside a parallel
 * task), loops it, and accumulates the result into pt->cip_result.
 * Returns 0 when any progress was made, else the error code.
 */
1118 static int ll_file_io_ptask(struct cfs_ptask *ptask)
1120 struct cl_io_pt *pt = ptask->pt_cbdata;
1121 struct file *file = pt->cip_file;
1124 loff_t pos = pt->cip_pos;
1129 env = cl_env_get(&refcheck);
1131 RETURN(PTR_ERR(env));
1133 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1134 file_dentry(file)->d_name.name,
1135 pt->cip_iot == CIT_READ ? "read" : "write",
1136 pos, pos + pt->cip_count);
1139 io = vvp_env_thread_io(env);
1140 ll_io_init(io, file, pt->cip_iot);
1141 io->u.ci_rw.rw_iter = pt->cip_iter;
1142 io->u.ci_rw.rw_iocb = pt->cip_iocb;
1143 io->ci_pio = 0; /* It's already in parallel task */
/* Resume from where previous restarts left off within the range. */
1145 rc = cl_io_rw_init(env, io, pt->cip_iot, pos,
1146 pt->cip_count - pt->cip_result);
1148 struct vvp_io *vio = vvp_env_io(env);
1150 vio->vui_io_subtype = IO_NORMAL;
1151 vio->vui_fd = LUSTRE_FPRIVATE(file);
1153 ll_cl_add(file, env, io, LCC_RW);
1154 rc = cl_io_loop(env, io);
1155 ll_cl_remove(file, env);
1157 /* cl_io_rw_init() handled IO */
/* Fault-injection point for testing parallel IO failure handling. */
1161 if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) {
/* Account partial progress and advance the iterator/iocb state so a
 * restart continues from the right offset. */
1167 if (io->ci_nob > 0) {
1168 pt->cip_result += io->ci_nob;
1169 iov_iter_advance(&pt->cip_iter, io->ci_nob);
1171 pt->cip_iocb.ki_pos = pos;
1172 #ifdef HAVE_KIOCB_KI_LEFT
1173 pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result;
1174 #elif defined(HAVE_KI_NBYTES)
1175 pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result;
1179 cl_io_fini(env, io);
/* Restart the loop when the layout changed mid-IO and there is still
 * uncompleted work in this range. */
1181 if ((rc == 0 || rc == -ENODATA) &&
1182 pt->cip_result < pt->cip_count &&
1183 io->ci_need_restart) {
1185 "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1186 file_dentry(file)->d_name.name,
1187 pt->cip_iot == CIT_READ ? "read" : "write",
1188 pos, pos + pt->cip_count - pt->cip_result,
1189 pt->cip_result, rc);
1193 CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
1194 file_dentry(file)->d_name.name,
1195 pt->cip_iot == CIT_READ ? "read" : "write",
1196 pt->cip_result, rc);
1198 cl_env_put(env, &refcheck);
1199 RETURN(pt->cip_result > 0 ? 0 : rc);
/*
 * Common entry point for the llite read/write paths (read_iter, write_iter,
 * splice read).  Initializes a cl_io for [pos, pos + count), takes the
 * per-inode range lock for writes and for O_DIRECT reads, runs the cl_io
 * loop (restarting on layout change), and tallies read/write byte stats.
 *
 * NOTE(review): this chunk appears gap-sampled -- several declarations,
 * braces and labels are not visible here; comments describe only the
 * visible code.
 */
1203 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1204 struct file *file, enum cl_io_type iot,
1205 loff_t *ppos, size_t count)
1207 struct range_lock range;
1208 struct vvp_io *vio = vvp_env_io(env);
1209 struct inode *inode = file_inode(file);
1210 struct ll_inode_info *lli = ll_i2info(inode);
1211 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1219 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1220 file_dentry(file)->d_name.name,
1221 iot == CIT_READ ? "read" : "write", pos, pos + count);
1224 io = vvp_env_thread_io(env);
1225 ll_io_init(io, file, iot);
/* For normal (non-splice) IO, snapshot the caller's iov_iter and kiocb
 * into the cl_io so lower layers can advance a private copy. */
1226 if (args->via_io_subtype == IO_NORMAL) {
1227 io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
1228 io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;
1233 if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
1234 bool range_locked = false;
/* O_APPEND writes may land anywhere, so lock the whole file range. */
1236 if (file->f_flags & O_APPEND)
1237 range_lock_init(&range, 0, LUSTRE_EOF);
1239 range_lock_init(&range, pos, pos + count - 1);
1241 vio->vui_fd = LUSTRE_FPRIVATE(file);
1242 vio->vui_io_subtype = args->via_io_subtype;
1244 switch (vio->vui_io_subtype) {
1246 /* Direct IO reads must also take range lock,
1247 * or multiple reads will try to work on the same pages
1248 * See LU-6227 for details. */
1249 if (((iot == CIT_WRITE) ||
1250 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1251 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1252 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1254 rc = range_lock(&lli->lli_write_tree, &range);
1258 range_locked = true;
1262 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1263 vio->u.splice.vui_flags = args->u.splice.via_flags;
1266 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1270 ll_cl_add(file, env, io, LCC_RW);
/* Parallel-IO writes on a non-NOSEC inode take the inode lock here;
 * lli_inode_locked flags that we own it so it is dropped below. */
1271 if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) &&
1272 !lli->lli_inode_locked) {
1274 lli->lli_inode_locked = 1;
1276 rc = cl_io_loop(env, io);
1277 if (lli->lli_inode_locked) {
1278 lli->lli_inode_locked = 0;
1279 inode_unlock(inode);
1281 ll_cl_remove(file, env);
1284 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1286 range_unlock(&lli->lli_write_tree, &range);
1289 /* cl_io_rw_init() handled IO */
/* Account partial progress and advance the caller-visible iterator and
 * kiocb position so a restart continues where this pass stopped. */
1293 if (io->ci_nob > 0) {
1294 result += io->ci_nob;
1295 count -= io->ci_nob;
1297 if (args->via_io_subtype == IO_NORMAL) {
1298 iov_iter_advance(args->u.normal.via_iter, io->ci_nob);
1300 args->u.normal.via_iocb->ki_pos = pos;
1301 #ifdef HAVE_KIOCB_KI_LEFT
1302 args->u.normal.via_iocb->ki_left = count;
1303 #elif defined(HAVE_KI_NBYTES)
1304 args->u.normal.via_iocb->ki_nbytes = count;
1308 pos = io->u.ci_rw.rw_range.cir_pos;
1312 cl_io_fini(env, io);
/* Restart the IO (presumably looping back above -- the loop construct is
 * not visible in this view) when the layout changed mid-IO. */
1314 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1316 "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1317 file_dentry(file)->d_name.name,
1318 iot == CIT_READ ? "read" : "write",
1319 pos, pos + count, result, rc);
1323 if (iot == CIT_READ) {
1325 ll_stats_ops_tally(ll_i2sbi(inode),
1326 LPROC_LL_READ_BYTES, result);
1327 } else if (iot == CIT_WRITE) {
1329 ll_stats_ops_tally(ll_i2sbi(inode),
1330 LPROC_LL_WRITE_BYTES, result);
1331 fd->fd_write_failed = false;
1332 } else if (result == 0 && rc == 0) {
1335 fd->fd_write_failed = true;
1337 fd->fd_write_failed = false;
1338 } else if (rc != -ERESTARTSYS) {
/* Any other error marks the fd so fsync/close can report it. */
1339 fd->fd_write_failed = true;
1343 CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n",
1344 file_dentry(file)->d_name.name,
1345 iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc);
1349 RETURN(result > 0 ? result : rc);
1353 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1354 * especially for small I/O.
1356 * To serve a read request, CLIO has to create and initialize a cl_io and
1357 * then request DLM lock. This has turned out to have significant overhead
1358 * and affects the performance of small I/O dramatically.
1360 * It's not necessary to create a cl_io for each I/O. Under the help of read
1361 * ahead, most of the pages being read are already in memory cache and we can
1362 * read those pages directly because if the pages exist, the corresponding DLM
1363 * lock must exist so that page content must be valid.
1365 * In fast read implementation, the llite speculatively finds and reads pages
1366 * in memory cache. There are three scenarios for fast read:
1367 * - If the page exists and is uptodate, kernel VM will provide the data and
1368 * CLIO won't be intervened;
1369 * - If the page was brought into memory by read ahead, it will be exported
1370 * and read ahead parameters will be updated;
1371 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1372 * it will go back and invoke normal read, i.e., a cl_io will be created
1373 * and DLM lock will be requested.
1375 * POSIX compliance: posix standard states that read is intended to be atomic.
1376 * Lustre read implementation is in line with Linux kernel read implementation
1377 * and neither of them complies with POSIX standard in this matter. Fast read
1378 * doesn't make the situation worse on single node but it may interleave write
1379 * results from multiple nodes due to short read handling in ll_file_aio_read().
1381 * \param env - lu_env
1382 * \param iocb - kiocb from kernel
1383 * \param iter - user space buffers where the data will be copied
1385 * \retval - number of bytes have been read, or error code if error occurred.
/*
 * Attempt the "fast read" path: serve the read straight from the page
 * cache via generic_file_read_iter() without building a cl_io, falling
 * back (via -ENODATA from ll_readpage()) when a page is not cached.
 * Only used when the mount has fast_read enabled and the fd is not
 * O_DIRECT.
 */
1388 ll_do_fast_read(const struct lu_env *env, struct kiocb *iocb,
1389 struct iov_iter *iter)
1393 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1396 /* NB: we can't do direct IO for fast read because it will need a lock
1397 * to make IO engine happy. */
1398 if (iocb->ki_filp->f_flags & O_DIRECT)
1401 ll_cl_add(iocb->ki_filp, env, NULL, LCC_RW);
1402 result = generic_file_read_iter(iocb, iter);
1403 ll_cl_remove(iocb->ki_filp, env);
1405 /* If the first page is not in cache, generic_file_aio_read() returns
1406 * -ENODATA.
1407 * See corresponding code in ll_readpage(). */
1408 if (result == -ENODATA)
1412 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1413 LPROC_LL_READ_BYTES, result);
1419 * Read from a file (through the page cache).
/*
 * Read from a file (through the page cache).  Tries the fast-read path
 * first; if that returns short (or is unavailable) the remainder goes
 * through the normal cl_io path via ll_file_io_generic().
 */
1421 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1424 struct vvp_io_args *args;
1429 env = cl_env_get(&refcheck);
1431 return PTR_ERR(env);
/* Fast read consumed part (or all) of the iterator; stop early on error
 * or when nothing is left to read. */
1433 result = ll_do_fast_read(env, iocb, to);
1434 if (result < 0 || iov_iter_count(to) == 0)
1437 args = ll_env_args(env, IO_NORMAL);
1438 args->u.normal.via_iter = to;
1439 args->u.normal.via_iocb = iocb;
1441 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1442 &iocb->ki_pos, iov_iter_count(to));
1445 else if (result == 0)
1449 cl_env_put(env, &refcheck);
1454 * Write to a file (through the page cache).
/*
 * Write to a file (through the page cache): thin wrapper that packages
 * the kiocb/iov_iter into vvp_io_args and hands off to
 * ll_file_io_generic() with CIT_WRITE.
 */
1456 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1458 struct vvp_io_args *args;
1463 env = cl_env_get(&refcheck);
1465 return PTR_ERR(env);
1467 args = ll_env_args(env, IO_NORMAL);
1468 args->u.normal.via_iter = from;
1469 args->u.normal.via_iocb = iocb;
1471 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1472 &iocb->ki_pos, iov_iter_count(from));
1473 cl_env_put(env, &refcheck);
1477 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1479 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array and compute the total byte count, rejecting
 * negative lengths and cumulative overflow.
 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock);
 * *nr_segs may be shortened to the accessible prefix (trimming logic is
 * partially outside this view).
 */
1481 static int ll_file_get_iov_count(const struct iovec *iov,
1482 unsigned long *nr_segs, size_t *count)
1487 for (seg = 0; seg < *nr_segs; seg++) {
1488 const struct iovec *iv = &iov[seg];
1491 * If any segment has a negative length, or the cumulative
1492 * length ever wraps negative then return -EINVAL.
1495 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1497 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1502 cnt -= iv->iov_len; /* This segment is no good */
/*
 * Pre-iov_iter kernel compat: validate the iovec, build an iov_iter and
 * forward to ll_file_read_iter().  Only built when the kernel lacks
 * read_iter/write_iter file operations.
 */
1509 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1510 unsigned long nr_segs, loff_t pos)
1517 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1521 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1522 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1523 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1524 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1525 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1527 result = ll_file_read_iter(iocb, &to);
/*
 * Synchronous read(2) compat path: wrap the user buffer in a one-segment
 * iovec plus a sync kiocb and dispatch through ll_file_aio_read(),
 * propagating the updated file position back to *ppos.
 */
1532 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1536 struct iovec iov = { .iov_base = buf, .iov_len = count };
1537 struct kiocb *kiocb;
1542 env = cl_env_get(&refcheck);
1544 RETURN(PTR_ERR(env));
1546 kiocb = &ll_env_info(env)->lti_kiocb;
1547 init_sync_kiocb(kiocb, file);
1548 kiocb->ki_pos = *ppos;
1549 #ifdef HAVE_KIOCB_KI_LEFT
1550 kiocb->ki_left = count;
1551 #elif defined(HAVE_KI_NBYTES)
1552 kiocb->ki_nbytes = count;
1555 result = ll_file_aio_read(kiocb, &iov, 1, kiocb->ki_pos);
1556 *ppos = kiocb->ki_pos;
1558 cl_env_put(env, &refcheck);
1563 * Write to a file (through the page cache).
/*
 * Write to a file (through the page cache) -- pre-iov_iter compat:
 * validate the iovec, build an iov_iter and forward to
 * ll_file_write_iter().
 */
1566 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1567 unsigned long nr_segs, loff_t pos)
1569 struct iov_iter from;
1574 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1578 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1579 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1580 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1581 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1582 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1584 result = ll_file_write_iter(iocb, &from);
/*
 * Synchronous write(2) compat path: mirrors ll_file_read() -- wrap the
 * user buffer in a one-segment iovec and a sync kiocb, dispatch through
 * ll_file_aio_write(), and propagate the new position to *ppos.
 */
1589 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1590 size_t count, loff_t *ppos)
1593 struct iovec iov = { .iov_base = (void __user *)buf,
1595 struct kiocb *kiocb;
1600 env = cl_env_get(&refcheck);
1602 RETURN(PTR_ERR(env));
1604 kiocb = &ll_env_info(env)->lti_kiocb;
1605 init_sync_kiocb(kiocb, file);
1606 kiocb->ki_pos = *ppos;
1607 #ifdef HAVE_KIOCB_KI_LEFT
1608 kiocb->ki_left = count;
1609 #elif defined(HAVE_KI_NBYTES)
1610 kiocb->ki_nbytes = count;
1613 result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos);
1614 *ppos = kiocb->ki_pos;
1616 cl_env_put(env, &refcheck);
1619 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1622 * Send file content (through pagecache) somewhere with helper
/*
 * splice_read file operation: route a pipe-destination read through
 * ll_file_io_generic() using the IO_SPLICE argument subtype.
 */
1624 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1625 struct pipe_inode_info *pipe, size_t count,
1629 struct vvp_io_args *args;
1634 env = cl_env_get(&refcheck);
1636 RETURN(PTR_ERR(env));
1638 args = ll_env_args(env, IO_SPLICE);
1639 args->u.splice.via_pipe = pipe;
1640 args->u.splice.via_flags = flags;
1642 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1643 cl_env_put(env, &refcheck);
/*
 * Set striping (LOV EA) on an inode by re-opening it by FID with the
 * given lov_user_md attached to the open intent, then releasing the
 * transient open handle.  Serialized against size changes via the inode
 * size lock.
 */
1647 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1648 __u64 flags, struct lov_user_md *lum, int lum_size)
1650 struct lookup_intent oit = {
1652 .it_flags = flags | MDS_OPEN_BY_FID,
1657 ll_inode_size_lock(inode);
1658 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1660 GOTO(out_unlock, rc);
/* The open was only needed to carry the EA; close the handle now. */
1662 ll_release_openhandle(dentry, &oit);
1665 ll_inode_size_unlock(inode);
1666 ll_intent_release(&oit);
/*
 * Fetch the LOV EA (striping info) for @filename from the MDS via
 * md_getattr_name(), validate the magic, and byte-swap the layout to
 * host endianness before it is handed to userspace.  On success *lmmp
 * points into the reply buffer and *request owns the RPC (caller must
 * finish it).
 */
1671 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1672 struct lov_mds_md **lmmp, int *lmm_size,
1673 struct ptlrpc_request **request)
1675 struct ll_sb_info *sbi = ll_i2sbi(inode);
1676 struct mdt_body *body;
1677 struct lov_mds_md *lmm = NULL;
1678 struct ptlrpc_request *req = NULL;
1679 struct md_op_data *op_data;
1682 rc = ll_get_default_mdsize(sbi, &lmmsize);
1686 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1687 strlen(filename), lmmsize,
1688 LUSTRE_OPC_ANY, NULL);
1689 if (IS_ERR(op_data))
1690 RETURN(PTR_ERR(op_data));
1692 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1693 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1694 ll_finish_md_op_data(op_data);
1696 CDEBUG(D_INFO, "md_getattr_name failed "
1697 "on %s: rc %d\n", filename, rc);
1701 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1702 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1704 lmmsize = body->mbo_eadatasize;
/* No EA present (or zero-sized): report -ENODATA. */
1706 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1708 GOTO(out, rc = -ENODATA);
1711 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1712 LASSERT(lmm != NULL);
1714 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
1715 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
1716 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
1717 GOTO(out, rc = -EPROTO);
1720 * This is coming from the MDS, so is probably in
1721 * little endian. We convert it to host endian before
1722 * passing it to userspace.
1724 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1727 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
1728 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1729 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1730 if (le32_to_cpu(lmm->lmm_pattern) &
1731 LOV_PATTERN_F_RELEASED)
1735 /* if function called for directory - we should
1736 * avoid swabbing non-existent lsm objects */
1737 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1738 lustre_swab_lov_user_md_v1(
1739 (struct lov_user_md_v1 *)lmm);
1740 if (S_ISREG(body->mbo_mode))
1741 lustre_swab_lov_user_md_objects(
1742 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1744 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1745 lustre_swab_lov_user_md_v3(
1746 (struct lov_user_md_v3 *)lmm);
1747 if (S_ISREG(body->mbo_mode))
1748 lustre_swab_lov_user_md_objects(
1749 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1751 } else if (lmm->lmm_magic ==
1752 cpu_to_le32(LOV_MAGIC_COMP_V1)) {
1753 lustre_swab_lov_comp_md_v1(
1754 (struct lov_comp_md_v1 *)lmm);
1760 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: root-only path that copies a raw
 * lov_user_md (with one OST object entry) from userspace and applies it
 * via ll_lov_setstripe_ea_info() with MDS_OPEN_HAS_OBJS.
 */
1765 static int ll_lov_setea(struct inode *inode, struct file *file,
1768 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1769 struct lov_user_md *lump;
1770 int lum_size = sizeof(struct lov_user_md) +
1771 sizeof(struct lov_user_ost_data);
1775 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1778 OBD_ALLOC_LARGE(lump, lum_size);
1782 if (copy_from_user(lump, (struct lov_user_md __user *)arg, lum_size))
1783 GOTO(out_lump, rc = -EFAULT);
1785 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
/* Clear the delay-create flag regardless of outcome. */
1787 cl_lov_delay_create_clear(&file->f_flags);
1790 OBD_FREE_LARGE(lump, lum_size);
/*
 * LL_IOC_LOV_GETSTRIPE helper: ask the cl_object layer to fill the
 * user-supplied lov_user_md with the file's current striping.
 */
1794 static int ll_file_getstripe(struct inode *inode,
1795 struct lov_user_md __user *lum)
1802 env = cl_env_get(&refcheck);
1804 RETURN(PTR_ERR(env));
1806 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum);
1807 cl_env_put(env, &refcheck);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy and validate the user's layout
 * (ll_copy_user_md allocates klum), apply it through
 * ll_lov_setstripe_ea_info(), then free the kernel copy.
 */
1811 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1814 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1815 struct lov_user_md *klum;
1817 __u64 flags = FMODE_WRITE;
1820 rc = ll_copy_user_md(lum, &klum);
1825 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
1827 cl_lov_delay_create_clear(&file->f_flags);
1828 OBD_FREE(klum, lum_size);
/*
 * LL_IOC_GROUP_LOCK handler: take a group (GID-keyed) DLM lock on the
 * whole file.  For composite (PFL) layouts all OST objects must be
 * instantiated first, since a group lock cannot cover objects created
 * after it is taken.  fd_flags/fd_grouplock are updated under lli_lock;
 * the taken-lock check is repeated after cl_get_grouplock() to handle a
 * racing thread.
 */
1833 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1835 struct ll_inode_info *lli = ll_i2info(inode);
1836 struct cl_object *obj = lli->lli_clob;
1837 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1838 struct ll_grouplock grouplock;
1843 CWARN("group id for group lock must not be 0\n");
1847 if (ll_file_nolock(file))
1848 RETURN(-EOPNOTSUPP);
1850 spin_lock(&lli->lli_lock);
1851 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1852 CWARN("group lock already existed with gid %lu\n",
1853 fd->fd_grouplock.lg_gid);
1854 spin_unlock(&lli->lli_lock);
1857 LASSERT(fd->fd_grouplock.lg_lock == NULL);
1858 spin_unlock(&lli->lli_lock);
1861 * XXX: group lock needs to protect all OST objects while PFL
1862 * can add new OST objects during the IO, so we'd instantiate
1863 * all OST objects before getting its group lock.
1868 struct cl_layout cl = {
1869 .cl_is_composite = false,
1872 env = cl_env_get(&refcheck);
1874 RETURN(PTR_ERR(env));
1876 rc = cl_object_layout_get(env, obj, &cl);
/* Composite layout: force instantiation of the full file range. */
1877 if (!rc && cl.cl_is_composite)
1878 rc = ll_layout_write_intent(inode, 0, OBD_OBJECT_EOF);
1880 cl_env_put(env, &refcheck);
1885 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
1886 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* Re-check under lli_lock: another thread may have won the race while
 * we were enqueueing the lock. */
1890 spin_lock(&lli->lli_lock);
1891 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1892 spin_unlock(&lli->lli_lock);
1893 CERROR("another thread just won the race\n");
1894 cl_put_grouplock(&grouplock);
1898 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1899 fd->fd_grouplock = grouplock;
1900 spin_unlock(&lli->lli_lock);
1902 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK handler: verify a group lock with matching GID is
 * held on this fd, detach it from fd state under lli_lock, then release
 * the DLM lock outside the spinlock.
 */
1906 static int ll_put_grouplock(struct inode *inode, struct file *file,
1909 struct ll_inode_info *lli = ll_i2info(inode);
1910 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1911 struct ll_grouplock grouplock;
1914 spin_lock(&lli->lli_lock);
1915 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1916 spin_unlock(&lli->lli_lock);
1917 CWARN("no group lock held\n");
1921 LASSERT(fd->fd_grouplock.lg_lock != NULL);
1923 if (fd->fd_grouplock.lg_gid != arg) {
1924 CWARN("group lock %lu doesn't match current id %lu\n",
1925 arg, fd->fd_grouplock.lg_gid);
1926 spin_unlock(&lli->lli_lock);
/* Take a local copy so cl_put_grouplock() runs without lli_lock. */
1930 grouplock = fd->fd_grouplock;
1931 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1932 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1933 spin_unlock(&lli->lli_lock);
1935 cl_put_grouplock(&grouplock);
1936 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1941 * Close inode open handle
1943 * \param dentry [in] dentry which contains the inode
1944 * \param it [in,out] intent which contains open info and result
1947 * \retval <0 failure
/*
 * Close the MDS open handle carried by an open intent (no-op for the
 * filesystem root or when the intent holds no open).  Also drops the
 * enqueue reference that stood in for ll_file_open().
 */
1949 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1951 struct inode *inode = dentry->d_inode;
1952 struct obd_client_handle *och;
1958 /* Root ? Do nothing. */
1959 if (dentry->d_inode->i_sb->s_root == dentry)
1962 /* No open handle to close? Move away */
1963 if (!it_disposition(it, DISP_OPEN_OPEN))
1966 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1968 OBD_ALLOC(och, sizeof(*och));
1970 GOTO(out, rc = -ENOMEM);
1972 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1974 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
1976 /* this one is in place of ll_file_open */
1977 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1978 ptlrpc_req_finished(it->it_request);
1979 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1985 * Get size for inode for which FIEMAP mapping is requested.
1986 * Make the FIEMAP get_info call and returns the result.
1987 * \param fiemap kernel buffer to hold extens
1988 * \param num_bytes kernel buffer size
/*
 * Get size for inode for which FIEMAP mapping is requested, build the
 * fiemap key (oa + parent fid) and make the FIEMAP get_info call via
 * cl_object_fiemap().  Unsupported flags are masked back into
 * fiemap->fm_flags so userspace can see what was rejected.
 */
1990 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
1996 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
1999 /* Checks for fiemap flags */
2000 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2001 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2005 /* Check for FIEMAP_FLAG_SYNC */
2006 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2007 rc = filemap_fdatawrite(inode->i_mapping);
2012 env = cl_env_get(&refcheck);
2014 RETURN(PTR_ERR(env));
/* Size may be stale (0) until glimpsed from the OSTs. */
2016 if (i_size_read(inode) == 0) {
2017 rc = ll_glimpse_size(inode);
2022 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2023 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2024 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2026 /* If filesize is 0, then there would be no objects for mapping */
2027 if (fmkey.lfik_oa.o_size == 0) {
2028 fiemap->fm_mapped_extents = 0;
2032 fmkey.lfik_fiemap = *fiemap;
2034 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2035 &fmkey, fiemap, &num_bytes);
2037 cl_env_put(env, &refcheck);
/*
 * OBD_IOC_FID2PATH handler: resolve a FID to a path via the MDC.
 * Allocates a reply buffer sized by the user-requested pathlen, appends
 * the root FID (for fileset-aware servers), and copies the result back.
 * Restricted unless the caller has DAC_READ_SEARCH or the mount allows
 * user fid2path.
 */
2041 int ll_fid2path(struct inode *inode, void __user *arg)
2043 struct obd_export *exp = ll_i2mdexp(inode);
2044 const struct getinfo_fid2path __user *gfin = arg;
2046 struct getinfo_fid2path *gfout;
2052 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2053 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2056 /* Only need to get the buflen */
2057 if (get_user(pathlen, &gfin->gf_pathlen))
2060 if (pathlen > PATH_MAX)
2063 outsize = sizeof(*gfout) + pathlen;
2064 OBD_ALLOC(gfout, outsize);
2068 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2069 GOTO(gf_free, rc = -EFAULT);
2070 /* append root FID after gfout to let MDT know the root FID so that it
2071 * can lookup the correct path, this is mainly for fileset.
2072 * old server without fileset mount support will ignore this. */
2073 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2075 /* Call mdc_iocontrol */
2076 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2080 if (copy_to_user(arg, gfout, outsize))
2084 OBD_FREE(gfout, outsize);
2089 * Read the data_version for inode.
2091 * This value is computed using stripe object version on OST.
2092 * Version is computed using server side locking.
2094 * @param flags if do sync on the OST side;
2096 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2097 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/*
 * Read the data_version for an inode by running a CIT_DATA_VERSION
 * cl_io.  @flags selects OST-side flushing (LL_DV_RD_FLUSH /
 * LL_DV_WR_FLUSH).  If the file has no object (never written), its
 * version is reported as 0.  The io may need to restart on layout
 * change (handling presumably loops outside this view).
 */
2099 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2101 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2109 /* If no file object initialized, we consider its version is 0. */
2115 env = cl_env_get(&refcheck);
2117 RETURN(PTR_ERR(env));
2119 io = vvp_env_thread_io(env);
2121 io->u.ci_data_version.dv_data_version = 0;
2122 io->u.ci_data_version.dv_flags = flags;
2125 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2126 result = cl_io_loop(env, io);
2128 result = io->ci_result;
2130 *data_version = io->u.ci_data_version.dv_data_version;
2132 cl_io_fini(env, io);
2134 if (unlikely(io->ci_need_restart))
2137 cl_env_put(env, &refcheck);
2143 * Trigger a HSM release request for the provided inode.
/*
 * Trigger a HSM release for @inode: take a write lease, flush and grab
 * the latest data_version and [am]time, then close the inode open
 * handle with MDS_HSM_RELEASE so the MDT can drop the file's data.
 */
2145 int ll_hsm_release(struct inode *inode)
2148 struct obd_client_handle *och = NULL;
2149 __u64 data_version = 0;
2154 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2155 ll_get_fsname(inode->i_sb, NULL, 0),
2156 PFID(&ll_i2info(inode)->lli_fid));
2158 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2160 GOTO(out, rc = PTR_ERR(och));
2162 /* Grab latest data_version and [am]time values */
2163 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2167 env = cl_env_get(&refcheck);
2169 GOTO(out, rc = PTR_ERR(env));
2171 ll_merge_attr(env, inode);
2172 cl_env_put(env, &refcheck);
2174 /* Release the file.
2175 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2176 * we still need it to pack l_remote_handle to MDT. */
2177 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2183 if (och != NULL && !IS_ERR(och)) /* close the file */
2184 ll_lease_close(och, inode, NULL);
/* Per-call state for ll_swap_layouts().  NOTE(review): the definition is
 * truncated in this view -- members such as dv1/dv2/check_dv1/check_dv2
 * are referenced by ll_swap_layouts() but not visible here. */
2189 struct ll_swap_stack {
2192 struct inode *inode1;
2193 struct inode *inode2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS worker: atomically swap the layouts of two
 * files on the MDT.  Orders the pair by FID to avoid deadlocks, takes a
 * group lock on both when the caller asks for a cache flush (gid != 0),
 * optionally verifies data versions have not changed, then sends the
 * swap via obd_iocontrol with mdc_swap_layouts piggybacked on op_data.
 */
2198 static int ll_swap_layouts(struct file *file1, struct file *file2,
2199 struct lustre_swap_layouts *lsl)
2201 struct mdc_swap_layouts msl;
2202 struct md_op_data *op_data;
2205 struct ll_swap_stack *llss = NULL;
2208 OBD_ALLOC_PTR(llss);
2212 llss->inode1 = file_inode(file1);
2213 llss->inode2 = file_inode(file2);
2215 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2219 /* we use 2 bool because it is easier to swap than 2 bits */
2220 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2221 llss->check_dv1 = true;
2223 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2224 llss->check_dv2 = true;
2226 /* we cannot use lsl->sl_dvX directly because we may swap them */
2227 llss->dv1 = lsl->sl_dv1;
2228 llss->dv2 = lsl->sl_dv2;
2230 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2231 if (rc == 0) /* same file, done! */
/* Canonical FID ordering: always lock/operate on the smaller FID first. */
2234 if (rc < 0) { /* sequentialize it */
2235 swap(llss->inode1, llss->inode2);
2237 swap(llss->dv1, llss->dv2);
2238 swap(llss->check_dv1, llss->check_dv2);
2242 if (gid != 0) { /* application asks to flush dirty cache */
2243 rc = ll_get_grouplock(llss->inode1, file1, gid);
2247 rc = ll_get_grouplock(llss->inode2, file2, gid);
2249 ll_put_grouplock(llss->inode1, file1, gid);
2254 /* ultimate check, before swapping the layouts we check if
2255 * dataversion has changed (if requested) */
2256 if (llss->check_dv1) {
2257 rc = ll_data_version(llss->inode1, &dv, 0);
2260 if (dv != llss->dv1)
2261 GOTO(putgl, rc = -EAGAIN);
2264 if (llss->check_dv2) {
2265 rc = ll_data_version(llss->inode2, &dv, 0);
2268 if (dv != llss->dv2)
2269 GOTO(putgl, rc = -EAGAIN);
2272 /* struct md_op_data is used to send the swap args to the mdt
2273 * only flags is missing, so we use struct mdc_swap_layouts
2274 * through the md_op_data->op_data */
2275 /* flags from user space have to be converted before they are send to
2276 * server, no flag is sent today, they are only used on the client */
2279 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2280 0, LUSTRE_OPC_ANY, &msl);
2281 if (IS_ERR(op_data))
2282 GOTO(free, rc = PTR_ERR(op_data));
2284 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2285 sizeof(*op_data), op_data, NULL);
2286 ll_finish_md_op_data(op_data);
/* Release group locks in reverse acquisition order. */
2293 ll_put_grouplock(llss->inode2, file2, gid);
2294 ll_put_grouplock(llss->inode1, file1, gid);
/*
 * Set/clear HSM state flags on @inode via the MDT.  Rejects flags
 * outside HSM_FLAGS_MASK, restricts non-HSM_USER_MASK bits to root,
 * and validates the archive id range before sending the request.
 */
2304 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2306 struct md_op_data *op_data;
2310 /* Detect out-of range masks */
2311 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2314 /* Non-root users are forbidden to set or clear flags which are
2315 * NOT defined in HSM_USER_MASK. */
2316 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2317 !cfs_capable(CFS_CAP_SYS_ADMIN))
2320 /* Detect out-of range archive id */
2321 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2322 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2325 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2326 LUSTRE_OPC_ANY, hss);
2327 if (IS_ERR(op_data))
2328 RETURN(PTR_ERR(op_data));
2330 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2331 sizeof(*op_data), op_data, NULL);
2333 ll_finish_md_op_data(op_data);
/*
 * HSM import: mark a regular file ARCHIVED|EXISTS|RELEASED in the given
 * archive, then force-set its mode, ownership, size and timestamps from
 * the user-supplied hsm_user_import so the client-side inode matches
 * the archived copy.
 */
2338 static int ll_hsm_import(struct inode *inode, struct file *file,
2339 struct hsm_user_import *hui)
2341 struct hsm_state_set *hss = NULL;
2342 struct iattr *attr = NULL;
2346 if (!S_ISREG(inode->i_mode))
2352 GOTO(out, rc = -ENOMEM);
2354 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2355 hss->hss_archive_id = hui->hui_archive_id;
2356 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2357 rc = ll_hsm_state_set(inode, hss);
2361 OBD_ALLOC_PTR(attr);
2363 GOTO(out, rc = -ENOMEM);
/* Force regular-file mode bits; only rwx permissions are honored. */
2365 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2366 attr->ia_mode |= S_IFREG;
2367 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2368 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2369 attr->ia_size = hui->hui_size;
2370 attr->ia_mtime.tv_sec = hui->hui_mtime;
2371 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2372 attr->ia_atime.tv_sec = hui->hui_atime;
2373 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2375 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2376 ATTR_UID | ATTR_GID |
2377 ATTR_MTIME | ATTR_MTIME_SET |
2378 ATTR_ATIME | ATTR_ATIME_SET;
2382 rc = ll_setattr_raw(file_dentry(file), attr, true);
2386 inode_unlock(inode);
/* Map an fmode_t (FMODE_READ/FMODE_WRITE bits) to the corresponding
 * LL_LEASE_RDLCK/LL_LEASE_WRLCK lease-type bits. */
2398 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2400 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2401 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * LL_IOC_FUTIMES_3 handler: set atime, mtime AND ctime on a regular
 * file from the given ll_futimes_3 (unlike utimes(2), ctime can be set
 * explicitly here).  Root-only.
 */
2404 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2406 struct inode *inode = file_inode(file);
2408 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2409 ATTR_MTIME | ATTR_MTIME_SET |
2410 ATTR_CTIME | ATTR_CTIME_SET,
2412 .tv_sec = lfu->lfu_atime_sec,
2413 .tv_nsec = lfu->lfu_atime_nsec,
2416 .tv_sec = lfu->lfu_mtime_sec,
2417 .tv_nsec = lfu->lfu_mtime_nsec,
2420 .tv_sec = lfu->lfu_ctime_sec,
2421 .tv_nsec = lfu->lfu_ctime_nsec,
2427 if (!capable(CAP_SYS_ADMIN))
2430 if (!S_ISREG(inode->i_mode))
2434 rc = ll_setattr_raw(file_dentry(file), &ia, false);
2435 inode_unlock(inode);
2441 * Give file access advices
2443 * The ladvise interface is similar to Linux fadvise() system call, except it
2444 * forwards the advices directly from Lustre client to server. The server side
2445 * codes will apply appropriate read-ahead and caching techniques for the
2446 * corresponding files.
2448 * A typical workload for ladvise is e.g. a bunch of different clients are
2449 * doing small random reads of a file, so prefetching pages into OSS cache
2450 * with big linear reads before the random IO is a net benefit. Fetching
2451 * all that data into each client cache with fadvise() may not be, due to
2452 * much more data being sent to the client.
/*
 * Forward a single llapi_lu_ladvise advice to the server by running a
 * CIT_LADVISE cl_io over [lla_start, lla_end] (see the block comment
 * above for the rationale vs. fadvise()).
 */
2454 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2455 struct llapi_lu_ladvise *ladvise)
2459 struct cl_ladvise_io *lio;
2464 env = cl_env_get(&refcheck);
2466 RETURN(PTR_ERR(env));
2468 io = vvp_env_thread_io(env);
2469 io->ci_obj = ll_i2info(inode)->lli_clob;
2471 /* initialize parameters for ladvise */
2472 lio = &io->u.ci_ladvise;
2473 lio->li_start = ladvise->lla_start;
2474 lio->li_end = ladvise->lla_end;
2475 lio->li_fid = ll_inode2fid(inode);
2476 lio->li_advice = ladvise->lla_advice;
2477 lio->li_flags = flags;
2479 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2480 rc = cl_io_loop(env, io);
2484 cl_io_fini(env, io);
2485 cl_env_put(env, &refcheck);
/*
 * FS_IOC_FSGETXATTR handler: round-trip the user's fsxattr, filling in
 * the inode's project ID (only fsx_projid is populated here).
 */
2489 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
2492 struct fsxattr fsxattr;
2494 if (copy_from_user(&fsxattr,
2495 (const struct fsxattr __user *)arg,
2499 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
2500 if (copy_to_user((struct fsxattr __user *)arg,
2501 &fsxattr, sizeof(fsxattr)))
/*
 * FS_IOC_FSSETXATTR handler: change the file's project ID via a
 * md_setattr to the MDT.  Root-only, since project IDs affect quota
 * accounting.
 */
2507 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
2511 struct md_op_data *op_data;
2512 struct ptlrpc_request *req = NULL;
2514 struct fsxattr fsxattr;
2516 /* only root could change project ID */
2517 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2520 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2521 LUSTRE_OPC_ANY, NULL);
2522 if (IS_ERR(op_data))
2523 RETURN(PTR_ERR(op_data));
2525 if (copy_from_user(&fsxattr,
2526 (const struct fsxattr __user *)arg,
2528 GOTO(out_fsxattr1, rc = -EFAULT);
2530 op_data->op_projid = fsxattr.fsx_projid;
2531 op_data->op_attr.ia_valid |= MDS_ATTR_PROJID;
2532 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
2534 ptlrpc_req_finished(req);
2537 ll_finish_md_op_data(op_data);
2544 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2546 struct inode *inode = file_inode(file);
2547 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2551 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2552 PFID(ll_inode2fid(inode)), inode, cmd);
2553 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2555 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2556 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2560 case LL_IOC_GETFLAGS:
2561 /* Get the current value of the file flags */
2562 return put_user(fd->fd_flags, (int __user *)arg);
2563 case LL_IOC_SETFLAGS:
2564 case LL_IOC_CLRFLAGS:
2565 /* Set or clear specific file flags */
2566 /* XXX This probably needs checks to ensure the flags are
2567 * not abused, and to handle any flag side effects.
2569 if (get_user(flags, (int __user *) arg))
2572 if (cmd == LL_IOC_SETFLAGS) {
2573 if ((flags & LL_FILE_IGNORE_LOCK) &&
2574 !(file->f_flags & O_DIRECT)) {
2575 CERROR("%s: unable to disable locking on "
2576 "non-O_DIRECT file\n", current->comm);
2580 fd->fd_flags |= flags;
2582 fd->fd_flags &= ~flags;
2585 case LL_IOC_LOV_SETSTRIPE:
2586 RETURN(ll_lov_setstripe(inode, file, arg));
2587 case LL_IOC_LOV_SETEA:
2588 RETURN(ll_lov_setea(inode, file, arg));
2589 case LL_IOC_LOV_SWAP_LAYOUTS: {
2591 struct lustre_swap_layouts lsl;
2593 if (copy_from_user(&lsl, (char __user *)arg,
2594 sizeof(struct lustre_swap_layouts)))
2597 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
2600 file2 = fget(lsl.sl_fd);
2604 /* O_WRONLY or O_RDWR */
2605 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
2606 GOTO(out, rc = -EPERM);
2608 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
2609 struct inode *inode2;
2610 struct ll_inode_info *lli;
2611 struct obd_client_handle *och = NULL;
2613 if (lsl.sl_flags != SWAP_LAYOUTS_CLOSE)
2614 GOTO(out, rc = -EINVAL);
2616 lli = ll_i2info(inode);
2617 mutex_lock(&lli->lli_och_mutex);
2618 if (fd->fd_lease_och != NULL) {
2619 och = fd->fd_lease_och;
2620 fd->fd_lease_och = NULL;
2622 mutex_unlock(&lli->lli_och_mutex);
2624 GOTO(out, rc = -ENOLCK);
2625 inode2 = file_inode(file2);
2626 rc = ll_swap_layouts_close(och, inode, inode2);
2628 rc = ll_swap_layouts(file, file2, &lsl);
2634 case LL_IOC_LOV_GETSTRIPE:
2635 RETURN(ll_file_getstripe(inode,
2636 (struct lov_user_md __user *)arg));
2637 case FSFILT_IOC_GETFLAGS:
2638 case FSFILT_IOC_SETFLAGS:
2639 RETURN(ll_iocontrol(inode, file, cmd, arg));
2640 case FSFILT_IOC_GETVERSION_OLD:
2641 case FSFILT_IOC_GETVERSION:
2642 RETURN(put_user(inode->i_generation, (int __user *)arg));
2643 case LL_IOC_GROUP_LOCK:
2644 RETURN(ll_get_grouplock(inode, file, arg));
2645 case LL_IOC_GROUP_UNLOCK:
2646 RETURN(ll_put_grouplock(inode, file, arg));
2647 case IOC_OBD_STATFS:
2648 RETURN(ll_obd_statfs(inode, (void __user *)arg));
2650 /* We need to special case any other ioctls we want to handle,
2651 * to send them to the MDS/OST as appropriate and to properly
2652 * network encode the arg field.
2653 case FSFILT_IOC_SETVERSION_OLD:
2654 case FSFILT_IOC_SETVERSION:
2656 case LL_IOC_FLUSHCTX:
2657 RETURN(ll_flush_ctx(inode));
2658 case LL_IOC_PATH2FID: {
2659 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2660 sizeof(struct lu_fid)))
2665 case LL_IOC_GETPARENT:
2666 RETURN(ll_getparent(file, (struct getparent __user *)arg));
2668 case OBD_IOC_FID2PATH:
2669 RETURN(ll_fid2path(inode, (void __user *)arg));
2670 case LL_IOC_DATA_VERSION: {
2671 struct ioc_data_version idv;
2674 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
2677 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2678 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2681 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2687 case LL_IOC_GET_MDTIDX: {
2690 mdtidx = ll_get_mdt_idx(inode);
2694 if (put_user((int)mdtidx, (int __user *)arg))
2699 case OBD_IOC_GETDTNAME:
2700 case OBD_IOC_GETMDNAME:
2701 RETURN(ll_get_obd_name(inode, cmd, arg));
2702 case LL_IOC_HSM_STATE_GET: {
2703 struct md_op_data *op_data;
2704 struct hsm_user_state *hus;
2711 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2712 LUSTRE_OPC_ANY, hus);
2713 if (IS_ERR(op_data)) {
2715 RETURN(PTR_ERR(op_data));
2718 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2721 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2724 ll_finish_md_op_data(op_data);
2728 case LL_IOC_HSM_STATE_SET: {
2729 struct hsm_state_set *hss;
2736 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2741 rc = ll_hsm_state_set(inode, hss);
2746 case LL_IOC_HSM_ACTION: {
2747 struct md_op_data *op_data;
2748 struct hsm_current_action *hca;
2755 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2756 LUSTRE_OPC_ANY, hca);
2757 if (IS_ERR(op_data)) {
2759 RETURN(PTR_ERR(op_data));
2762 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2765 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2768 ll_finish_md_op_data(op_data);
2772 case LL_IOC_SET_LEASE: {
2773 struct ll_inode_info *lli = ll_i2info(inode);
2774 struct obd_client_handle *och = NULL;
2779 case LL_LEASE_WRLCK:
2780 if (!(file->f_mode & FMODE_WRITE))
2782 fmode = FMODE_WRITE;
2784 case LL_LEASE_RDLCK:
2785 if (!(file->f_mode & FMODE_READ))
2789 case LL_LEASE_UNLCK:
2790 mutex_lock(&lli->lli_och_mutex);
2791 if (fd->fd_lease_och != NULL) {
2792 och = fd->fd_lease_och;
2793 fd->fd_lease_och = NULL;
2795 mutex_unlock(&lli->lli_och_mutex);
2800 fmode = och->och_flags;
2801 rc = ll_lease_close(och, inode, &lease_broken);
2805 rc = ll_lease_och_release(inode, file);
2812 RETURN(ll_lease_type_from_fmode(fmode));
2817 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2819 /* apply for lease */
2820 och = ll_lease_open(inode, file, fmode, 0);
2822 RETURN(PTR_ERR(och));
2825 mutex_lock(&lli->lli_och_mutex);
2826 if (fd->fd_lease_och == NULL) {
2827 fd->fd_lease_och = och;
2830 mutex_unlock(&lli->lli_och_mutex);
2832 /* impossible now that only excl is supported for now */
2833 ll_lease_close(och, inode, &lease_broken);
2838 case LL_IOC_GET_LEASE: {
2839 struct ll_inode_info *lli = ll_i2info(inode);
2840 struct ldlm_lock *lock = NULL;
2843 mutex_lock(&lli->lli_och_mutex);
2844 if (fd->fd_lease_och != NULL) {
2845 struct obd_client_handle *och = fd->fd_lease_och;
2847 lock = ldlm_handle2lock(&och->och_lease_handle);
2849 lock_res_and_lock(lock);
2850 if (!ldlm_is_cancel(lock))
2851 fmode = och->och_flags;
2853 unlock_res_and_lock(lock);
2854 LDLM_LOCK_PUT(lock);
2857 mutex_unlock(&lli->lli_och_mutex);
2859 RETURN(ll_lease_type_from_fmode(fmode));
2861 case LL_IOC_HSM_IMPORT: {
2862 struct hsm_user_import *hui;
2868 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
2873 rc = ll_hsm_import(inode, file, hui);
2878 case LL_IOC_FUTIMES_3: {
2879 struct ll_futimes_3 lfu;
2881 if (copy_from_user(&lfu,
2882 (const struct ll_futimes_3 __user *)arg,
2886 RETURN(ll_file_futimes_3(file, &lfu));
2888 case LL_IOC_LADVISE: {
2889 struct llapi_ladvise_hdr *ladvise_hdr;
2892 int alloc_size = sizeof(*ladvise_hdr);
2895 OBD_ALLOC_PTR(ladvise_hdr);
2896 if (ladvise_hdr == NULL)
2899 if (copy_from_user(ladvise_hdr,
2900 (const struct llapi_ladvise_hdr __user *)arg,
2902 GOTO(out_ladvise, rc = -EFAULT);
2904 if (ladvise_hdr->lah_magic != LADVISE_MAGIC ||
2905 ladvise_hdr->lah_count < 1)
2906 GOTO(out_ladvise, rc = -EINVAL);
2908 num_advise = ladvise_hdr->lah_count;
2909 if (num_advise >= LAH_COUNT_MAX)
2910 GOTO(out_ladvise, rc = -EFBIG);
2912 OBD_FREE_PTR(ladvise_hdr);
2913 alloc_size = offsetof(typeof(*ladvise_hdr),
2914 lah_advise[num_advise]);
2915 OBD_ALLOC(ladvise_hdr, alloc_size);
2916 if (ladvise_hdr == NULL)
2920 * TODO: submit multiple advices to one server in a single RPC
2922 if (copy_from_user(ladvise_hdr,
2923 (const struct llapi_ladvise_hdr __user *)arg,
2925 GOTO(out_ladvise, rc = -EFAULT);
2927 for (i = 0; i < num_advise; i++) {
2928 rc = ll_ladvise(inode, file, ladvise_hdr->lah_flags,
2929 &ladvise_hdr->lah_advise[i]);
2935 OBD_FREE(ladvise_hdr, alloc_size);
2938 case LL_IOC_FSGETXATTR:
2939 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
2940 case LL_IOC_FSSETXATTR:
2941 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
2946 ll_iocontrol_call(inode, file, cmd, arg, &err))
2949 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2950 (void __user *)arg));
2955 #ifndef HAVE_FILE_LLSEEK_SIZE
/* Compat helper (kernels without generic_file_llseek_size): validate a
 * candidate offset against the file mode and @maxsize, then install it.
 * NOTE(review): this listing is elided -- the error-return statements and
 * closing braces between the visible lines are not shown. */
2956 static inline loff_t
2957 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
/* Negative offsets are only permitted with FMODE_UNSIGNED_OFFSET. */
2959 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2961 if (offset > maxsize)
/* Only dirty f_pos/f_version when the position actually changes. */
2964 if (offset != file->f_pos) {
2965 file->f_pos = offset;
2966 file->f_version = 0;
/* Compat fallback implementation of generic_file_llseek_size(): handle
 * SEEK_SET/CUR/END (and presumably SEEK_DATA/SEEK_HOLE -- the switch
 * labels are elided from this listing) against @maxsize and @eof. */
2972 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2973 loff_t maxsize, loff_t eof)
2975 struct inode *inode = file_inode(file);
2983 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2984 * position-querying operation. Avoid rewriting the "same"
2985 * f_pos value back to the file because a concurrent read(),
2986 * write() or lseek() might have altered it
2991 * f_lock protects against read/modify/write race with other
2992 * SEEK_CURs. Note that parallel writes and reads behave
2996 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2997 inode_unlock(inode);
3001 * In the generic case the entire file is data, so as long as
3002 * offset isn't at the end of the file then the offset is data.
3009 * There is a virtual hole at the end of the file, so as long as
3010 * offset isn't i_size or larger, return i_size.
3018 return llseek_execute(file, offset, maxsize);
/* llseek method for Lustre regular files.  For size-dependent origins
 * (SEEK_END/SEEK_HOLE/SEEK_DATA) glimpse the current file size from the
 * OSTs first so i_size is up to date before the generic seek logic runs. */
3022 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3024 struct inode *inode = file_inode(file);
3025 loff_t retval, eof = 0;
/* retval here is only a provisional target used for the trace message. */
3028 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3029 (origin == SEEK_CUR) ? file->f_pos : 0);
3030 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3031 PFID(ll_inode2fid(inode)), inode, retval, retval,
3033 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3035 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
/* Fetch authoritative size before relying on i_size_read() below. */
3036 retval = ll_glimpse_size(inode);
3039 eof = i_size_read(inode);
3042 retval = ll_generic_file_llseek_size(file, offset, origin,
3043 ll_file_maxbytes(inode), eof);
/* flush method: report (and clear) any async writeback error recorded for
 * this inode, but only once per file descriptor -- fd_write_failed
 * suppresses repeat reports to an application already told of failure. */
3047 static int ll_flush(struct file *file, fl_owner_t id)
3049 struct inode *inode = file_inode(file);
3050 struct ll_inode_info *lli = ll_i2info(inode);
3051 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
/* flush is a regular-file operation; directories never reach here. */
3054 LASSERT(!S_ISDIR(inode->i_mode));
3056 /* catch async errors that were recorded back when async writeback
3057 * failed for pages in this mapping. */
3058 rc = lli->lli_async_rc;
3059 lli->lli_async_rc = 0;
3060 if (lli->lli_clob != NULL) {
3061 err = lov_read_and_clear_async_rc(lli->lli_clob);
3066 /* The application has been told write failure already.
3067 * Do not report failure again. */
3068 if (fd->fd_write_failed)
/* Any recorded error is collapsed to -EIO for the caller. */
3070 return rc ? -EIO : 0;
3074 * Called to make sure a portion of file has been written out.
3075 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
3077 * Return how many pages have been written.
3079 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3080 enum cl_fsync_mode mode, int ignore_layout)
3084 struct cl_fsync_io *fio;
/* Reject unknown sync modes up front. */
3089 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3090 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3093 env = cl_env_get(&refcheck);
3095 RETURN(PTR_ERR(env));
/* Build a CIT_FSYNC cl_io against this inode's cl_object. */
3097 io = vvp_env_thread_io(env);
3098 io->ci_obj = ll_i2info(inode)->lli_clob;
3099 io->ci_ignore_layout = ignore_layout;
3101 /* initialize parameters for sync */
3102 fio = &io->u.ci_fsync;
3103 fio->fi_start = start;
3105 fio->fi_fid = ll_inode2fid(inode);
3106 fio->fi_mode = mode;
3107 fio->fi_nr_written = 0;
3109 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3110 result = cl_io_loop(env, io);
3112 result = io->ci_result;
/* On success, return the page count accumulated by the fsync io. */
3114 result = fio->fi_nr_written;
3115 cl_io_fini(env, io);
3116 cl_env_put(env, &refcheck);
/* fsync method, with three compat signatures selected by configure checks.
 * Flushes dirty pages, collects recorded async writeback errors, syncs
 * the MDT via md_fsync(), then syncs OST data with CL_FSYNC_ALL for
 * regular files, tracking per-fd write failure state. */
3122 * When dentry is provided (the 'else' case), file_dentry() may be
3123 * null and dentry must be used directly rather than pulled from
3124 * file_dentry() as is done otherwise.
3127 #ifdef HAVE_FILE_FSYNC_4ARGS
3128 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3130 struct dentry *dentry = file_dentry(file);
3132 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3133 int ll_fsync(struct file *file, int datasync)
3135 struct dentry *dentry = file_dentry(file);
3137 loff_t end = LLONG_MAX;
3139 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3142 loff_t end = LLONG_MAX;
3144 struct inode *inode = dentry->d_inode;
3145 struct ll_inode_info *lli = ll_i2info(inode);
3146 struct ptlrpc_request *req;
3150 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3151 PFID(ll_inode2fid(inode)), inode);
3152 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3154 #ifdef HAVE_FILE_FSYNC_4ARGS
/* 4-arg variant must write back the range itself; older kernels did the
 * fdatawrite before calling fsync, so there only fdatawait is needed. */
3155 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3156 lock_inode = !lli->lli_inode_locked;
3160 /* fsync's caller has already called _fdata{sync,write}, we want
3161 * that IO to finish before calling the osc and mdc sync methods */
3162 rc = filemap_fdatawait(inode->i_mapping);
3165 /* catch async errors that were recorded back when async writeback
3166 * failed for pages in this mapping. */
3167 if (!S_ISDIR(inode->i_mode)) {
3168 err = lli->lli_async_rc;
3169 lli->lli_async_rc = 0;
3172 if (lli->lli_clob != NULL) {
3173 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* Sync metadata on the MDT for this FID. */
3179 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3183 ptlrpc_req_finished(req);
3185 if (S_ISREG(inode->i_mode)) {
3186 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3188 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3189 if (rc == 0 && err < 0)
/* Remember failure/success so ll_flush() reports each error once. */
3192 fd->fd_write_failed = true;
3194 fd->fd_write_failed = false;
3197 #ifdef HAVE_FILE_FSYNC_4ARGS
3199 inode_unlock(inode);
/* flock/fcntl lock method: translate a kernel file_lock (FL_FLOCK or
 * FL_POSIX) into an LDLM_FLOCK enqueue against the MDT, then mirror the
 * server's decision into the local lock tables. */
3205 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3207 struct inode *inode = file_inode(file);
3208 struct ll_sb_info *sbi = ll_i2sbi(inode);
3209 struct ldlm_enqueue_info einfo = {
3210 .ei_type = LDLM_FLOCK,
3211 .ei_cb_cp = ldlm_flock_completion_ast,
3212 .ei_cbdata = file_lock,
3214 struct md_op_data *op_data;
3215 struct lustre_handle lockh = { 0 };
3216 union ldlm_policy_data flock = { { 0 } };
/* Saved so the original type can be restored after enqueue (see below). */
3217 int fl_type = file_lock->fl_type;
3223 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3224 PFID(ll_inode2fid(inode)), file_lock);
3226 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3228 if (file_lock->fl_flags & FL_FLOCK) {
3229 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3230 /* flocks are whole-file locks */
3231 flock.l_flock.end = OFFSET_MAX;
3232 /* For flocks owner is determined by the local file descriptor*/
3233 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3234 } else if (file_lock->fl_flags & FL_POSIX) {
3235 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3236 flock.l_flock.start = file_lock->fl_start;
3237 flock.l_flock.end = file_lock->fl_end;
3241 flock.l_flock.pid = file_lock->fl_pid;
3243 /* Somewhat ugly workaround for svc lockd.
3244 * lockd installs custom fl_lmops->lm_compare_owner that checks
3245 * for the fl_owner to be the same (which it always is on local node
3246 * I guess between lockd processes) and then compares pid.
3247 * As such we assign pid to the owner field to make it all work,
3248 * conflict with normal locks is unlikely since pid space and
3249 * pointer space for current->files are not intersecting */
3250 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3251 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* Map fcntl lock types onto LDLM modes: RDLCK->PR, WRLCK->PW, UNLCK->NL. */
3255 einfo.ei_mode = LCK_PR;
3258 /* An unlock request may or may not have any relation to
3259 * existing locks so we may not be able to pass a lock handle
3260 * via a normal ldlm_lock_cancel() request. The request may even
3261 * unlock a byte range in the middle of an existing lock. In
3262 * order to process an unlock request we need all of the same
3263 * information that is given with a normal read or write record
3264 * lock request. To avoid creating another ldlm unlock (cancel)
3265 * message we'll treat a LCK_NL flock request as an unlock. */
3266 einfo.ei_mode = LCK_NL;
3269 einfo.ei_mode = LCK_PW;
3272 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* Non-blocking set requests map to LDLM_FL_BLOCK_NOWAIT; GETLK variants
 * become test-only enqueues. */
3287 flags = LDLM_FL_BLOCK_NOWAIT;
3293 flags = LDLM_FL_TEST_LOCK;
3296 CERROR("unknown fcntl lock command: %d\n", cmd);
3300 /* Save the old mode so that if the mode in the lock changes we
3301 * can decrement the appropriate reader or writer refcount. */
3302 file_lock->fl_type = einfo.ei_mode;
3304 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3305 LUSTRE_OPC_ANY, NULL);
3306 if (IS_ERR(op_data))
3307 RETURN(PTR_ERR(op_data));
3309 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
3310 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
3311 flock.l_flock.pid, flags, einfo.ei_mode,
3312 flock.l_flock.start, flock.l_flock.end);
3314 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
3317 /* Restore the file lock type if not TEST lock. */
3318 if (!(flags & LDLM_FL_TEST_LOCK))
3319 file_lock->fl_type = fl_type;
3321 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
/* Reflect the granted (or unlocked) state in the kernel's lock tables;
 * older kernels need separate flock/posix wait helpers. */
3322 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
3323 !(flags & LDLM_FL_TEST_LOCK))
3324 rc2 = locks_lock_file_wait(file, file_lock);
3326 if ((file_lock->fl_flags & FL_FLOCK) &&
3327 (rc == 0 || file_lock->fl_type == F_UNLCK))
3328 rc2 = flock_lock_file_wait(file, file_lock);
3329 if ((file_lock->fl_flags & FL_POSIX) &&
3330 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3331 !(flags & LDLM_FL_TEST_LOCK))
3332 rc2 = posix_lock_file_wait(file, file_lock);
3333 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
/* If the local grant failed, undo the server-side lock with an NL
 * (unlock) enqueue so client and MDT stay consistent. */
3335 if (rc2 && file_lock->fl_type != F_UNLCK) {
3336 einfo.ei_mode = LCK_NL;
3337 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
3342 ll_finish_md_op_data(op_data);
/* Look up @name (length @namelen) under @parent on the MDT and return its
 * FID in *@fid; when @inode is non-NULL, also instantiate the inode from
 * the getattr reply.  Caller owns the returned inode reference --
 * NOTE(review): ownership inferred from ll_prep_inode usage; confirm. */
3347 int ll_get_fid_by_name(struct inode *parent, const char *name,
3348 int namelen, struct lu_fid *fid,
3349 struct inode **inode)
3351 struct md_op_data *op_data = NULL;
3352 struct mdt_body *body;
3353 struct ptlrpc_request *req;
3357 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3358 LUSTRE_OPC_ANY, NULL);
3359 if (IS_ERR(op_data))
3360 RETURN(PTR_ERR(op_data));
/* Only need the FID and file type from the server. */
3362 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
3363 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3364 ll_finish_md_op_data(op_data);
3368 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3370 GOTO(out_req, rc = -EFAULT);
3372 *fid = body->mbo_fid1;
3375 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
3377 ptlrpc_req_finished(req);
/* Migrate directory entry @name under @parent to MDT @mdtidx via a
 * MDS_RENAME_MIGRATE rename.  For regular files a write lease is taken
 * first so the data version can be pinned across the migration. */
3381 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3382 const char *name, int namelen)
3384 struct dentry *dchild = NULL;
3385 struct inode *child_inode = NULL;
3386 struct md_op_data *op_data;
3387 struct ptlrpc_request *request = NULL;
3388 struct obd_client_handle *och = NULL;
3390 struct mdt_body *body;
3392 __u64 data_version = 0;
3395 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3396 name, PFID(ll_inode2fid(parent)), mdtidx);
3398 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3399 0, LUSTRE_OPC_ANY, NULL);
3400 if (IS_ERR(op_data))
3401 RETURN(PTR_ERR(op_data));
3403 /* Get child FID first */
/* Prefer the dcache; fall back to an MDT lookup by name. */
3404 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
3407 dchild = d_lookup(file_dentry(file), &qstr);
3408 if (dchild != NULL) {
3409 if (dchild->d_inode != NULL)
3410 child_inode = igrab(dchild->d_inode);
3414 if (child_inode == NULL) {
3415 rc = ll_get_fid_by_name(parent, name, namelen,
3416 &op_data->op_fid3, &child_inode);
3421 if (child_inode == NULL)
3422 GOTO(out_free, rc = -EINVAL);
3425 * lfs migrate command needs to be blocked on the client
3426 * by checking the migrate FID against the FID of the
3429 if (child_inode == parent->i_sb->s_root->d_inode)
3430 GOTO(out_iput, rc = -EINVAL);
/* Hold the child's inode lock for the whole migration. */
3432 inode_lock(child_inode);
3433 op_data->op_fid3 = *ll_inode2fid(child_inode);
3434 if (!fid_is_sane(&op_data->op_fid3)) {
3435 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
3436 ll_get_fsname(parent->i_sb, NULL, 0), name,
3437 PFID(&op_data->op_fid3));
3438 GOTO(out_unlock, rc = -EINVAL);
/* Short-circuit when the child already lives on the target MDT. */
3441 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3443 GOTO(out_unlock, rc);
3446 CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name,
3447 PFID(&op_data->op_fid3), mdtidx);
3448 GOTO(out_unlock, rc = 0);
3451 if (S_ISREG(child_inode->i_mode)) {
/* Take a write lease so concurrent writers invalidate the migration,
 * and record the data version to detect changes. */
3452 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
3456 GOTO(out_unlock, rc);
3459 rc = ll_data_version(child_inode, &data_version,
3462 GOTO(out_close, rc);
3464 op_data->op_handle = och->och_fh;
3465 op_data->op_data = och->och_mod;
3466 op_data->op_data_version = data_version;
3467 op_data->op_lease_handle = och->och_lease_handle;
3468 op_data->op_bias |= MDS_RENAME_MIGRATE;
3471 op_data->op_mds = mdtidx;
3472 op_data->op_cli_flags = CLI_MIGRATE;
/* Migration is expressed as a same-name rename with CLI_MIGRATE set. */
3473 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3474 namelen, name, namelen, &request);
3476 LASSERT(request != NULL);
3477 ll_update_times(request, parent);
3479 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
3480 LASSERT(body != NULL);
3482 /* If the server does release layout lock, then we cleanup
3483 * the client och here, otherwise release it in out_close: */
3485 body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
3486 obd_mod_put(och->och_mod);
3487 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
3489 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
3495 if (request != NULL) {
3496 ptlrpc_req_finished(request);
3500 /* Try again if the file layout has changed. */
3501 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
3505 if (och != NULL) /* close the file */
3506 ll_lease_close(och, child_inode, NULL);
/* NOTE(review): clear_nlink() here appears conditional on a successful
 * migrate (guard line elided from this listing) -- confirm. */
3508 clear_nlink(child_inode);
3510 inode_unlock(child_inode);
3514 ll_finish_md_op_data(op_data);
/* Lock method for -o noflock mounts; per the ll_file_operations_noflock
 * comment below in this file, it exists to return ENOSYS for flock calls
 * (body elided from this listing). */
3519 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3527 * test if some locks matching bits and l_req_mode are acquired
3528 * - bits can be in different locks
3529 * - if found clear the common lock bits in *bits
3530 * - the bits not found, are kept in *bits
3532 * \param bits [IN] searched lock bits [IN]
3533 * \param l_req_mode [IN] searched lock mode
3534 * \retval boolean, true iff all bits are found
3536 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
3538 struct lustre_handle lockh;
3539 union ldlm_policy_data policy;
/* LCK_MINMODE means "any mode": search all of CR|CW|PR|PW. */
3540 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
3541 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
3550 fid = &ll_i2info(inode)->lli_fid;
3551 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3552 ldlm_lockname[mode]);
/* TEST_LOCK: match without taking a reference on the matched lock. */
3554 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
3555 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
/* Probe one inodebit at a time. */
3556 policy.l_inodebits.bits = *bits & (1 << i);
3557 if (policy.l_inodebits.bits == 0)
3560 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3561 &policy, mode, &lockh)) {
3562 struct ldlm_lock *lock;
3564 lock = ldlm_handle2lock(&lockh);
/* Clear every bit covered by the matched lock, not just the probe. */
3567 ~(lock->l_policy_data.l_inodebits.bits);
3568 LDLM_LOCK_PUT(lock);
3570 *bits &= ~policy.l_inodebits.bits;
/* Match (and take a reference on) a granted MD lock covering @bits in
 * @mode; the handle is returned via @lockh.  Returns the matched mode,
 * or 0 when nothing matched (md_lock_match semantics). */
3577 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
3578 struct lustre_handle *lockh, __u64 flags,
3579 enum ldlm_mode mode)
3581 union ldlm_policy_data policy = { .l_inodebits = { bits } };
3586 fid = &ll_i2info(inode)->lli_fid;
3587 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3589 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3590 fid, LDLM_IBITS, &policy, mode, lockh);
/* Post-process the result of a revalidate RPC: tolerate -ENOENT for
 * already-unlinked objects (except striped dirs, which are revalidated
 * again), and log every other failure. */
3595 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3597 /* Already unlinked. Just update nlink and return success */
3598 if (rc == -ENOENT) {
3600 /* If it is striped directory, and there is bad stripe
3601 * Let's revalidate the dentry again, instead of returning
3603 if (S_ISDIR(inode->i_mode) &&
3604 ll_i2info(inode)->lli_lsm_md != NULL)
3607 /* This path cannot be hit for regular files unless in
3608 * case of obscure races, so no need to validate
3610 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3612 } else if (rc != 0) {
/* -EACCES/-EIDRM are expected under permission/identity churn; keep
 * them at D_INFO so real errors stand out at D_ERROR. */
3613 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3614 "%s: revalidate FID "DFID" error: rc = %d\n",
3615 ll_get_fsname(inode->i_sb, NULL, 0),
3616 PFID(ll_inode2fid(inode)), rc);
/* Revalidate @dentry's MD attributes covered by ibits.  With
 * OBD_CONNECT_ATTRFID the server supports getattr-by-FID via an intent
 * lock; otherwise fall back to a plain md_getattr() unless a matching MD
 * lock already guarantees freshness. */
3622 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3624 struct inode *inode = dentry->d_inode;
3625 struct ptlrpc_request *req = NULL;
3626 struct obd_export *exp;
3630 LASSERT(inode != NULL);
3632 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3633 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3635 exp = ll_i2mdexp(inode);
3637 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3638 * But under CMD case, it caused some lock issues, should be fixed
3639 * with new CMD ibits lock. See bug 12718 */
3640 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3641 struct lookup_intent oit = { .it_op = IT_GETATTR };
3642 struct md_op_data *op_data;
/* LOOKUP-only revalidation uses the cheaper IT_LOOKUP intent. */
3644 if (ibits == MDS_INODELOCK_LOOKUP)
3645 oit.it_op = IT_LOOKUP;
3647 /* Call getattr by fid, so do not provide name at all. */
3648 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3649 dentry->d_inode, NULL, 0, 0,
3650 LUSTRE_OPC_ANY, NULL);
3651 if (IS_ERR(op_data))
3652 RETURN(PTR_ERR(op_data));
3654 rc = md_intent_lock(exp, op_data, &oit, &req,
3655 &ll_md_blocking_ast, 0);
3656 ll_finish_md_op_data(op_data);
3658 rc = ll_inode_revalidate_fini(inode, rc);
3662 rc = ll_revalidate_it_finish(req, &oit, dentry);
3664 ll_intent_release(&oit);
3668 /* Unlinked? Unhash dentry, so it is not picked up later by
3669 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3670 here to preserve get_cwd functionality on 2.6.
3672 if (!dentry->d_inode->i_nlink) {
3673 ll_lock_dcache(inode);
3674 d_lustre_invalidate(dentry, 0);
3675 ll_unlock_dcache(inode);
3678 ll_lookup_finish_locks(&oit, dentry);
/* No ATTRFID support: only issue a getattr RPC when no MD lock already
 * covers the requested ibits. */
3679 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3680 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3681 u64 valid = OBD_MD_FLGETATTR;
3682 struct md_op_data *op_data;
3685 if (S_ISREG(inode->i_mode)) {
/* Regular files also need striping EA in the reply. */
3686 rc = ll_get_default_mdsize(sbi, &ealen);
3689 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3692 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3693 0, ealen, LUSTRE_OPC_ANY,
3695 if (IS_ERR(op_data))
3696 RETURN(PTR_ERR(op_data));
3698 op_data->op_valid = valid;
3699 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3700 ll_finish_md_op_data(op_data);
3702 rc = ll_inode_revalidate_fini(inode, rc);
3706 rc = ll_prep_inode(&inode, req, NULL, NULL);
3709 ptlrpc_req_finished(req);
/* For a striped directory, merge the per-stripe MD attributes from all
 * MDTs into this inode (nlink, blocks, size, and cached a/m/ctime). */
3713 static int ll_merge_md_attr(struct inode *inode)
3715 struct cl_attr attr = { 0 };
3718 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3719 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3720 &attr, ll_md_blocking_ast);
3724 set_nlink(inode, attr.cat_nlink);
3725 inode->i_blocks = attr.cat_blocks;
3726 i_size_write(inode, attr.cat_size);
/* Timestamps are cached in lli_* and copied to the inode by the caller
 * (ll_inode_revalidate). */
3728 ll_i2info(inode)->lli_atime = attr.cat_atime;
3729 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
3730 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/* Full revalidation: refresh MD attributes, then (for regular files)
 * glimpse the size from the OSTs; striped directories instead merge
 * attributes from all stripes. */
3736 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3738 struct inode *inode = dentry->d_inode;
3742 rc = __ll_inode_revalidate(dentry, ibits);
3746 /* if object isn't regular file, don't validate size */
3747 if (!S_ISREG(inode->i_mode)) {
3748 if (S_ISDIR(inode->i_mode) &&
3749 ll_i2info(inode)->lli_lsm_md != NULL) {
3750 rc = ll_merge_md_attr(inode);
/* Non-regular objects take their timestamps straight from the
 * MD-cached values. */
3755 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
3756 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
3757 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
3759 /* In case of restore, the MDT has the right size and has
3760 * already send it back without granting the layout lock,
3761 * inode is up-to-date so glimpse is useless.
3762 * Also to glimpse we need the layout, in case of a running
3763 * restore the MDT holds the layout lock so the glimpse will
3764 * block up to the end of restore (getattr will block)
3766 if (!ll_file_test_flag(ll_i2info(inode), LLIF_FILE_RESTORING))
3767 rc = ll_glimpse_size(inode);
/* Squash a dev_t so 32-bit compat stat syscalls accept it. */
3772 static inline dev_t ll_compat_encode_dev(dev_t dev)
3774 /* The compat_sys_*stat*() syscalls will fail unless the
3775 * device majors and minors are both less than 256. Note that
3776 * the value returned here will be passed through
3777 * old_encode_dev() in cp_compat_stat(). And so we are not
3778 * trying to return a valid compat (u16) device number, just
3779 * one that will pass the old_valid_dev() check. */
3781 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
/* getattr method: revalidate UPDATE|LOOKUP MD state, then fill *stat
 * from the (now fresh) inode.  The inode number/device are encoded for
 * 32-bit API clients when the mount requires it. */
3784 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3786 struct inode *inode = de->d_inode;
3787 struct ll_sb_info *sbi = ll_i2sbi(inode);
3788 struct ll_inode_info *lli = ll_i2info(inode);
3791 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3792 MDS_INODELOCK_LOOKUP);
3793 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
/* Fault-injection hook used by sanity tests to delay getattr. */
3798 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
3800 if (ll_need_32bit_api(sbi)) {
3801 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3802 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
3803 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
3805 stat->ino = inode->i_ino;
3806 stat->dev = inode->i_sb->s_dev;
3807 stat->rdev = inode->i_rdev;
3810 stat->mode = inode->i_mode;
3811 stat->uid = inode->i_uid;
3812 stat->gid = inode->i_gid;
3813 stat->atime = inode->i_atime;
3814 stat->mtime = inode->i_mtime;
3815 stat->ctime = inode->i_ctime;
/* Prefer the tunable per-sb stat blocksize when configured. */
3816 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
3818 stat->nlink = inode->i_nlink;
3819 stat->size = i_size_read(inode);
3820 stat->blocks = inode->i_blocks;
/* fiemap method: marshal fieinfo into a struct fiemap big enough for the
 * requested extent count, run the mapping, and copy results back.
 * Note the buffer holds user-controlled extent_count extents; the size
 * computation and copies are bounded by that count. */
3825 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3826 __u64 start, __u64 len)
3830 struct fiemap *fiemap;
3831 unsigned int extent_count = fieinfo->fi_extents_max;
3833 num_bytes = sizeof(*fiemap) + (extent_count *
3834 sizeof(struct fiemap_extent));
3835 OBD_ALLOC_LARGE(fiemap, num_bytes);
3840 fiemap->fm_flags = fieinfo->fi_flags;
3841 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3842 fiemap->fm_start = start;
3843 fiemap->fm_length = len;
/* Only the first extent is copied in: it may carry the continuation
 * cookie (fe_device/offset) for a restarted FIEMAP. */
3844 if (extent_count > 0 &&
3845 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3846 sizeof(struct fiemap_extent)) != 0)
3847 GOTO(out, rc = -EFAULT)
3849 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3851 fieinfo->fi_flags = fiemap->fm_flags;
3852 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3853 if (extent_count > 0 &&
3854 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3855 fiemap->fm_mapped_extents *
3856 sizeof(struct fiemap_extent)) != 0)
3857 GOTO(out, rc = -EFAULT);
3859 OBD_FREE_LARGE(fiemap, num_bytes);
/* get_acl method: hand out a referenced copy of the cached POSIX ACL
 * under lli_lock; the VFS releases the reference after its check. */
3863 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3865 struct ll_inode_info *lli = ll_i2info(inode);
3866 struct posix_acl *acl = NULL;
3869 spin_lock(&lli->lli_lock);
3870 /* VFS' acl_permission_check->check_acl will release the refcount */
3871 acl = posix_acl_dup(lli->lli_posix_acl);
3872 spin_unlock(&lli->lli_lock);
3877 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
/* Compat ACL check for kernels whose generic_permission() takes a
 * check_acl callback; signature varies with the 4-arg variant. */
3879 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3880 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3882 ll_check_acl(struct inode *inode, int mask)
3885 # ifdef CONFIG_FS_POSIX_ACL
3886 struct posix_acl *acl;
3890 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
/* Cannot take lli_lock spinlock path under RCU-walk. */
3891 if (flags & IPERM_FLAG_RCU)
3894 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3899 rc = posix_acl_permission(inode, acl, mask);
3900 posix_acl_release(acl);
3903 # else /* !CONFIG_FS_POSIX_ACL */
3905 # endif /* CONFIG_FS_POSIX_ACL */
3907 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
3909 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
/* permission method (three compat signatures).  Revalidates the root
 * inode if needed, optionally squashes root's fsuid/fsgid per the
 * root-squash configuration, then defers to generic permission logic. */
3910 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3912 # ifdef HAVE_INODE_PERMISION_2ARGS
3913 int ll_inode_permission(struct inode *inode, int mask)
3915 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3920 struct ll_sb_info *sbi;
3921 struct root_squash_info *squash;
3922 struct cred *cred = NULL;
3923 const struct cred *old_cred = NULL;
3925 bool squash_id = false;
/* RCU-walk cannot block on the revalidate RPC below. */
3928 #ifdef MAY_NOT_BLOCK
3929 if (mask & MAY_NOT_BLOCK)
3931 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3932 if (flags & IPERM_FLAG_RCU)
3936 /* as root inode are NOT getting validated in lookup operation,
3937 * need to do it before permission check. */
3939 if (inode == inode->i_sb->s_root->d_inode) {
3940 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3941 MDS_INODELOCK_LOOKUP);
3946 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3947 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3949 /* squash fsuid/fsgid if needed */
3950 sbi = ll_i2sbi(inode);
3951 squash = &sbi->ll_squash;
3952 if (unlikely(squash->rsi_uid != 0 &&
3953 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
3954 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
3958 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
3959 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
3960 squash->rsi_uid, squash->rsi_gid);
3962 /* update current process's credentials
3963 * and FS capability */
3964 cred = prepare_creds();
3968 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
3969 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* Drop all filesystem-related capabilities for the squashed creds. */
3970 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
3971 if ((1 << cap) & CFS_CAP_FS_MASK)
3972 cap_lower(cred->cap_effective, cap);
3974 old_cred = override_creds(cred);
3977 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
3978 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3979 /* restore current process's credentials and FS capability */
3981 revert_creds(old_cred);
3988 /* -o localflock - only provides locally consistent flock locks */
/* Default file_operations table: no .flock/.lock methods, so the kernel
 * falls back to purely local (per-node) lock semantics. */
3989 struct file_operations ll_file_operations = {
3990 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
3991 # ifdef HAVE_SYNC_READ_WRITE
3992 .read = new_sync_read,
3993 .write = new_sync_write,
3995 .read_iter = ll_file_read_iter,
3996 .write_iter = ll_file_write_iter,
3997 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
3998 .read = ll_file_read,
3999 .aio_read = ll_file_aio_read,
4000 .write = ll_file_write,
4001 .aio_write = ll_file_aio_write,
4002 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4003 .unlocked_ioctl = ll_file_ioctl,
4004 .open = ll_file_open,
4005 .release = ll_file_release,
4006 .mmap = ll_file_mmap,
4007 .llseek = ll_file_seek,
4008 .splice_read = ll_file_splice_read,
/* -o flock: same table as ll_file_operations but with .flock/.lock wired
 * to ll_file_flock for cluster-coherent locking via the MDT. */
4013 struct file_operations ll_file_operations_flock = {
4014 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4015 # ifdef HAVE_SYNC_READ_WRITE
4016 .read = new_sync_read,
4017 .write = new_sync_write,
4018 # endif /* HAVE_SYNC_READ_WRITE */
4019 .read_iter = ll_file_read_iter,
4020 .write_iter = ll_file_write_iter,
4021 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4022 .read = ll_file_read,
4023 .aio_read = ll_file_aio_read,
4024 .write = ll_file_write,
4025 .aio_write = ll_file_aio_write,
4026 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4027 .unlocked_ioctl = ll_file_ioctl,
4028 .open = ll_file_open,
4029 .release = ll_file_release,
4030 .mmap = ll_file_mmap,
4031 .llseek = ll_file_seek,
4032 .splice_read = ll_file_splice_read,
4035 .flock = ll_file_flock,
4036 .lock = ll_file_flock
/*
 * file_operations table for -o noflock mounts: .flock/.lock point at
 * ll_file_noflock, which rejects lock requests (per the comment below,
 * returning ENOSYS) instead of providing any lock semantics.
 * NOTE(review): the closing `};` is missing from this elided view.
 */
4039 /* These are for -o noflock - to return ENOSYS on flock calls */
4040 struct file_operations ll_file_operations_noflock = {
4041 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4042 # ifdef HAVE_SYNC_READ_WRITE
4043 .read = new_sync_read,
4044 .write = new_sync_write,
4045 # endif /* HAVE_SYNC_READ_WRITE */
4046 .read_iter = ll_file_read_iter,
4047 .write_iter = ll_file_write_iter,
4048 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4049 .read = ll_file_read,
4050 .aio_read = ll_file_aio_read,
4051 .write = ll_file_write,
4052 .aio_write = ll_file_aio_write,
4053 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4054 .unlocked_ioctl = ll_file_ioctl,
4055 .open = ll_file_open,
4056 .release = ll_file_release,
4057 .mmap = ll_file_mmap,
4058 .llseek = ll_file_seek,
4059 .splice_read = ll_file_splice_read,
4062 .flock = ll_file_noflock,
4063 .lock = ll_file_noflock
/*
 * inode_operations for regular Lustre files: attribute get/set, permission
 * checks, the xattr family, fiemap extent mapping, and (when the kernel
 * supports the .get_acl hook) POSIX ACL retrieval.
 * NOTE(review): the matching `#endif` and closing `};` are missing from this
 * elided view.
 */
4066 struct inode_operations ll_file_inode_operations = {
4067 .setattr = ll_setattr,
4068 .getattr = ll_getattr,
4069 .permission = ll_inode_permission,
4070 .setxattr = ll_setxattr,
4071 .getxattr = ll_getxattr,
4072 .listxattr = ll_listxattr,
4073 .removexattr = ll_removexattr,
4074 .fiemap = ll_fiemap,
4075 #ifdef HAVE_IOP_GET_ACL
4076 .get_acl = ll_get_acl,
/*
 * Registry of dynamically registered ioctl handlers (see
 * ll_iocontrol_register/unregister below).  ioc_sem protects the ioc_head
 * list of llioc_data entries.
 * NOTE(review): elided view — the `} llioc = {` line and the closing `};`
 * of the initializer are missing here.
 */
4080 /* dynamic ioctl number support routines */
4081 static struct llioc_ctl_data {
4082 struct rw_semaphore ioc_sem;
4083 struct list_head ioc_head;
4085 __RWSEM_INITIALIZER(llioc.ioc_sem),
4086 LIST_HEAD_INIT(llioc.ioc_head)
/*
 * One registered ioctl handler block: a linkage into llioc.ioc_head, the
 * total allocation size (used when freeing), the callback, and a trailing
 * array of iocd_count supported ioctl command numbers.
 * NOTE(review): the `struct llioc_data {` opener and closing `};` fall in
 * elided lines of this view.  iocd_cmd[0] is the pre-C99 flexible-array
 * idiom; modern code would use `iocd_cmd[]`.
 */
4091 struct list_head iocd_list;
4092 unsigned int iocd_size;
4093 llioc_callback_t iocd_cb;
4094 unsigned int iocd_count;
4095 unsigned int iocd_cmd[0];
/*
 * Register a dynamic ioctl handler: callback `cb` will be consulted (via
 * ll_iocontrol_call) for each of the `count` command numbers in `cmd`.
 * Returns an opaque cookie (presumably the llioc_data pointer — the RETURN
 * lines are elided from this view) used later by ll_iocontrol_unregister,
 * or NULL on bad arguments / allocation failure.
 */
4098 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
4101 struct llioc_data *in_data = NULL;
/* Validate arguments; the early-return body of this check is elided. */
4104 if (cb == NULL || cmd == NULL ||
4105 count > LLIOC_MAX_CMD || count < 0)
/* Allocation covers the header plus the trailing iocd_cmd[] array. */
4108 size = sizeof(*in_data) + count * sizeof(unsigned int);
4109 OBD_ALLOC(in_data, size);
4110 if (in_data == NULL)
/* NOTE(review): OBD_ALLOC normally returns zeroed memory, so this memset
 * is likely redundant — confirm before removing. */
4113 memset(in_data, 0, sizeof(*in_data));
4114 in_data->iocd_size = size;
4115 in_data->iocd_cb = cb;
4116 in_data->iocd_count = count;
4117 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
/* Publish under the write lock so readers in ll_iocontrol_call never see
 * a partially initialized entry. */
4119 down_write(&llioc.ioc_sem);
4120 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
4121 up_write(&llioc.ioc_sem);
/*
 * Remove a handler previously returned by ll_iocontrol_register.  Walks the
 * registry under the write lock; on a match it unlinks the entry, drops the
 * lock, and frees the block.  Falls through to a warning when `magic` is not
 * found.  The match test against `magic` itself sits in an elided line of
 * this view.
 */
4126 void ll_iocontrol_unregister(void *magic)
4128 struct llioc_data *tmp;
4133 down_write(&llioc.ioc_sem);
4134 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* Size is cached before list_del so OBD_FREE gets the original length. */
4136 unsigned int size = tmp->iocd_size;
4138 list_del(&tmp->iocd_list);
/* Lock released before freeing; the entry is already unreachable. */
4139 up_write(&llioc.ioc_sem);
4141 OBD_FREE(tmp, size);
4145 up_write(&llioc.ioc_sem);
4147 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
4150 EXPORT_SYMBOL(ll_iocontrol_register);
4151 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch an ioctl to the dynamically registered handlers.  Scans every
 * registered block under the read lock; when a block lists `cmd`, its
 * callback runs and may claim the ioctl by returning LLIOC_STOP, which ends
 * the scan.  *rcp carries the handler's result back to the caller (it starts
 * at -EINVAL; the elided lines presumably copy `rc` into *rcp on LLIOC_STOP
 * — confirm against the full file).
 */
4153 static enum llioc_iter
4154 ll_iocontrol_call(struct inode *inode, struct file *file,
4155 unsigned int cmd, unsigned long arg, int *rcp)
4157 enum llioc_iter ret = LLIOC_CONT;
4158 struct llioc_data *data;
4159 int rc = -EINVAL, i;
4161 down_read(&llioc.ioc_sem);
4162 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
4163 for (i = 0; i < data->iocd_count; i++) {
4164 if (cmd != data->iocd_cmd[i])
4167 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
4171 if (ret == LLIOC_STOP)
4174 up_read(&llioc.ioc_sem);
/*
 * Push a layout configuration into the cl_object stack for `inode` via
 * cl_conf_set().  For OBJECT_CONF_SET the layout comes with a granted LAYOUT
 * DLM lock: the lock is only allowed to match *after* the layout is applied
 * (see comment below), then the inode's cached layout generation is
 * refreshed from cl_object_layout_get().  Returns 0 or a negative errno;
 * several error-path lines are elided from this view.
 */
4181 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4183 struct ll_inode_info *lli = ll_i2info(inode);
4184 struct cl_object *obj = lli->lli_clob;
4193 env = cl_env_get(&refcheck);
4195 RETURN(PTR_ERR(env));
4197 rc = cl_conf_set(env, lli->lli_clob, conf);
4201 if (conf->coc_opc == OBJECT_CONF_SET) {
4202 struct ldlm_lock *lock = conf->coc_lock;
4203 struct cl_layout cl = {
4207 LASSERT(lock != NULL);
4208 LASSERT(ldlm_has_layout(lock));
4210 /* it can only be allowed to match after layout is
4211 * applied to inode otherwise false layout would be
4212 * seen. Applying layout should happen before dropping
4213 * the intent lock. */
4214 ldlm_lock_allow_match(lock);
4216 rc = cl_object_layout_get(env, obj, &cl);
4221 DFID": layout version change: %u -> %u\n",
4222 PFID(&lli->lli_fid), ll_layout_version_get(lli),
4224 ll_layout_version_set(lli, cl.cl_layout_gen);
4228 cl_env_put(env, &refcheck);
/*
 * Populate a LAYOUT lock's LVB with the file's LOV EA when the lock was
 * granted via completion AST (in which case the DLM reply's LVB buffer was
 * too small to carry the layout — see comment below).  Fetches the
 * trusted.lov xattr from the MDT, copies it into a freshly allocated buffer,
 * and attaches it to the lock under the resource lock; if another thread
 * raced and already attached one, our copy is freed instead.
 */
4233 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
4234 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4237 struct ll_sb_info *sbi = ll_i2sbi(inode);
4238 struct ptlrpc_request *req;
4239 struct mdt_body *body;
4246 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4247 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4248 lock->l_lvb_data, lock->l_lvb_len);
/* Layout already cached on the lock; nothing to fetch. */
4250 if (lock->l_lvb_data != NULL)
4253 /* if layout lock was granted right away, the layout is returned
4254 * within DLM_LVB of dlm reply; otherwise if the lock was ever
4255 * blocked and then granted via completion ast, we have to fetch
4256 * layout here. Please note that we can't use the LVB buffer in
4257 * completion AST because it doesn't have a large enough buffer */
4258 rc = ll_get_default_mdsize(sbi, &lmmsize);
4260 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4261 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
4266 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4268 GOTO(out, rc = -EPROTO);
4270 lmmsize = body->mbo_eadatasize;
4271 if (lmmsize == 0) /* empty layout */
4274 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4276 GOTO(out, rc = -EFAULT);
4278 OBD_ALLOC_LARGE(lvbdata, lmmsize);
4279 if (lvbdata == NULL)
4280 GOTO(out, rc = -ENOMEM);
4282 memcpy(lvbdata, lmm, lmmsize);
/* Attach the buffer to the lock atomically; lose the race => free ours
 * (the else-branch lines are elided from this view). */
4283 lock_res_and_lock(lock);
4284 if (unlikely(lock->l_lvb_data == NULL)) {
4285 lock->l_lvb_type = LVB_T_LAYOUT;
4286 lock->l_lvb_data = lvbdata;
4287 lock->l_lvb_len = lmmsize;
4290 unlock_res_and_lock(lock);
4293 OBD_FREE_LARGE(lvbdata, lmmsize);
4298 ptlrpc_req_finished(req);
/*
 * (continuation of the function's doc comment — the opening lines are
 * elided from this view)
 */
4303 * Apply the layout to the inode. Layout lock is held and will be released
4306 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4307 struct inode *inode)
4309 struct ll_inode_info *lli = ll_i2info(inode);
4310 struct ll_sb_info *sbi = ll_i2sbi(inode);
4311 struct ldlm_lock *lock;
4312 struct cl_object_conf conf;
4315 bool wait_layout = false;
4318 LASSERT(lustre_handle_is_used(lockh));
4320 lock = ldlm_handle2lock(lockh);
4321 LASSERT(lock != NULL);
4322 LASSERT(ldlm_has_layout(lock));
4324 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4325 PFID(&lli->lli_fid), inode);
4327 /* in case this is a caching lock and reinstate with new inode */
4328 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
4330 lock_res_and_lock(lock);
4331 lvb_ready = ldlm_is_lvb_ready(lock);
4332 unlock_res_and_lock(lock);
4334 /* checking lvb_ready is racy but this is okay. The worst case is
4335 * that multi processes may configure the file on the same time. */
/* Fetch the layout into the lock's LVB if it is not ready yet. */
4339 rc = ll_layout_fetch(inode, lock);
4343 /* for layout lock, lmm is stored in lock's lvb.
4344 * lvb_data is immutable if the lock is held so it's safe to access it
4347 * set layout to file. Unlikely this will fail as old layout was
4348 * surely eliminated */
4349 memset(&conf, 0, sizeof conf);
4350 conf.coc_opc = OBJECT_CONF_SET;
4351 conf.coc_inode = inode;
4352 conf.coc_lock = lock;
4353 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
4354 conf.u.coc_layout.lb_len = lock->l_lvb_len;
4355 rc = ll_layout_conf(inode, &conf);
4357 /* refresh layout failed, need to wait */
4358 wait_layout = rc == -EBUSY;
4361 LDLM_LOCK_PUT(lock);
4362 ldlm_lock_decref(lockh, mode);
4364 /* wait for IO to complete if it's still being used. */
4366 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
4367 ll_get_fsname(inode->i_sb, NULL, 0),
4368 PFID(&lli->lli_fid), inode);
/* OBJECT_CONF_WAIT blocks until outstanding IO under the old layout
 * drains, then the caller retries the configuration. */
4370 memset(&conf, 0, sizeof conf);
4371 conf.coc_opc = OBJECT_CONF_WAIT;
4372 conf.coc_inode = inode;
4373 rc = ll_layout_conf(inode, &conf);
4377 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4378 ll_get_fsname(inode->i_sb, NULL, 0),
4379 PFID(&lli->lli_fid), rc);
/*
 * (continuation of the function's doc comment — the `/**` opener is elided
 * from this view)
 */
4385 * Issue layout intent RPC to MDS.
4386 * \param inode [in] file inode
4387 * \param intent [in] layout intent
4389 * \retval 0 on success
4390 * \retval < 0 error code
4392 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
4394 struct ll_inode_info *lli = ll_i2info(inode);
4395 struct ll_sb_info *sbi = ll_i2sbi(inode);
4396 struct md_op_data *op_data;
4397 struct lookup_intent it;
4398 struct ptlrpc_request *req;
4402 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4403 0, 0, LUSTRE_OPC_ANY, NULL);
4404 if (IS_ERR(op_data))
4405 RETURN(PTR_ERR(op_data));
/* The layout_intent structure rides in op_data and is marshalled into
 * the IT_LAYOUT intent request. */
4407 op_data->op_data = intent;
4408 op_data->op_data_size = sizeof(*intent);
4410 memset(&it, 0, sizeof(it));
4411 it.it_op = IT_LAYOUT;
/* Write/truncate intents need a write-mode layout lock on the MDT. */
4412 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
4413 intent->li_opc == LAYOUT_INTENT_TRUNC)
4414 it.it_flags = FMODE_WRITE;
4416 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4417 ll_get_fsname(inode->i_sb, NULL, 0),
4418 PFID(&lli->lli_fid), inode);
4420 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
4421 &ll_md_blocking_ast, 0);
4422 if (it.it_request != NULL)
4423 ptlrpc_req_finished(it.it_request);
4424 it.it_request = NULL;
4426 ll_finish_md_op_data(op_data);
4428 /* set lock data in case this is a new lock */
4430 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4432 ll_intent_drop_lock(&it);
/*
 * (continuation of the function's doc comment — the `/**` opener is elided
 * from this view)
 */
4438 * This function checks if there exists a LAYOUT lock on the client side,
4439 * or enqueues it if it doesn't have one in cache.
4441 * This function will not hold layout lock so it may be revoked any time after
4442 * this function returns. Any operations depend on layout should be redone
4445 * This function should be called before lov_io_init() to get an uptodate
4446 * layout version, the caller should save the version number and after IO
4447 * is finished, this function should be called again to verify that layout
4448 * is not changed during IO time.
4450 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4452 struct ll_inode_info *lli = ll_i2info(inode);
4453 struct ll_sb_info *sbi = ll_i2sbi(inode);
4454 struct lustre_handle lockh;
4455 struct layout_intent intent = {
4456 .li_opc = LAYOUT_INTENT_ACCESS,
4458 enum ldlm_mode mode;
/* Fast path: layout lock disabled, or a valid generation is cached. */
4462 *gen = ll_layout_version_get(lli);
4463 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
4467 LASSERT(fid_is_sane(ll_inode2fid(inode)));
4468 LASSERT(S_ISREG(inode->i_mode));
4470 /* take layout lock mutex to enqueue layout lock exclusively. */
4471 mutex_lock(&lli->lli_layout_mutex);
4474 /* mostly layout lock is caching on the local side, so try to
4475 * match it before grabbing layout lock mutex. */
4476 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4477 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
4478 if (mode != 0) { /* hit cached lock */
4479 rc = ll_layout_lock_set(&lockh, mode, inode);
/* Cache miss: enqueue a fresh ACCESS layout intent to the MDT. */
4485 rc = ll_layout_intent(inode, &intent);
4491 *gen = ll_layout_version_get(lli);
4492 mutex_unlock(&lli->lli_layout_mutex);
/*
 * (continuation of the function's doc comment — the `/**` opener is elided
 * from this view)
 */
4498 * Issue layout intent RPC indicating where in a file an IO is about to write.
4500 * \param[in] inode file inode.
4501 * \param[in] start start offset of file in bytes where an IO is about to
4503 * \param[in] end exclusive end offset in bytes of the write range.
4505 * \retval 0 on success
4506 * \retval < 0 error code
4508 int ll_layout_write_intent(struct inode *inode, __u64 start, __u64 end)
4510 struct layout_intent intent = {
4511 .li_opc = LAYOUT_INTENT_WRITE,
/* The extent-field initializers and the RETURN are elided here. */
4518 rc = ll_layout_intent(inode, &intent);
4524 * This function sends a restore request to the MDT
4526 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4528 struct hsm_user_request *hur;
4532 len = sizeof(struct hsm_user_request) +
4533 sizeof(struct hsm_user_item);
4534 OBD_ALLOC(hur, len);
4538 hur->hur_request.hr_action = HUA_RESTORE;
4539 hur->hur_request.hr_archive_id = 0;
4540 hur->hur_request.hr_flags = 0;
4541 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4542 sizeof(hur->hur_user_item[0].hui_fid));
4543 hur->hur_user_item[0].hui_extent.offset = offset;
4544 hur->hur_user_item[0].hui_extent.length = length;
4545 hur->hur_request.hr_itemcount = 1;
4546 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,