4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2016, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
48 #include <lustre/ll_fiemap.h>
50 #include <uapi/linux/lustre_ioctl.h>
51 #include <lustre_swab.h>
53 #include "cl_object.h"
54 #include "llite_internal.h"
55 #include "vvp_internal.h"
58 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
60 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
/*
 * Allocate a per-open ll_file_data from the dedicated slab cache.
 * GFP_NOFS prevents the allocator from recursing back into the filesystem.
 * NOTE(review): the allocation-failure check and the return statement are
 * not visible in this excerpt — confirm against the full source.
 */
63 static struct ll_file_data *ll_file_data_get(void)
65 struct ll_file_data *fd;
67 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
/* A freshly allocated descriptor has not seen any failed writes yet. */
71 fd->fd_write_failed = false;
/* Return a ll_file_data to the slab cache it was allocated from. */
76 static void ll_file_data_put(struct ll_file_data *fd)
79 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
83 * Packs all the attributes into @op_data for the CLOSE rpc.
85 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
86 struct obd_client_handle *och)
90 ll_prep_md_op_data(op_data, inode, NULL, NULL,
91 0, 0, LUSTRE_OPC_ANY, NULL);
/* Snapshot the current in-core inode attributes for the MDT. */
93 op_data->op_attr.ia_mode = inode->i_mode;
94 op_data->op_attr.ia_atime = inode->i_atime;
95 op_data->op_attr.ia_mtime = inode->i_mtime;
96 op_data->op_attr.ia_ctime = inode->i_ctime;
97 op_data->op_attr.ia_size = i_size_read(inode);
/* Mark each copied attribute valid so the server applies all of them. */
98 op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
99 ATTR_MTIME | ATTR_MTIME_SET |
100 ATTR_CTIME | ATTR_CTIME_SET;
101 op_data->op_attr_blocks = inode->i_blocks;
102 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
/* Identify which server-side open handle this CLOSE applies to. */
103 op_data->op_handle = och->och_fh;
105 if (och->och_flags & FMODE_WRITE &&
106 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
107 /* For HSM: if inode data has been modified, pack it so that
108 * MDT can set data dirty flag in the archive. */
109 op_data->op_bias |= MDS_DATA_MODIFIED;
115 * Perform a close, possibly with a bias.
116 * The meaning of "data" depends on the value of "bias".
118 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
119 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
122 static int ll_close_inode_openhandle(struct inode *inode,
123 struct obd_client_handle *och,
124 enum mds_op_bias bias, void *data)
126 struct obd_export *md_exp = ll_i2mdexp(inode);
127 const struct ll_inode_info *lli = ll_i2info(inode);
128 struct md_op_data *op_data;
129 struct ptlrpc_request *req = NULL;
/* Bail out if the MDC export has no backing OBD (connection gone). */
133 if (class_exp2obd(md_exp) == NULL) {
134 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
135 ll_get_fsname(inode->i_sb, NULL, 0),
136 PFID(&lli->lli_fid));
140 OBD_ALLOC_PTR(op_data);
141 /* We leak openhandle and request here on error, but not much to be
142 * done in OOM case since app won't retry close on error either. */
144 GOTO(out, rc = -ENOMEM);
146 ll_prepare_close(inode, op_data, och);
/* Bias-specific packing; @data's type depends on the bias (see above). */
148 case MDS_CLOSE_LAYOUT_SWAP:
149 LASSERT(data != NULL);
150 op_data->op_bias |= MDS_CLOSE_LAYOUT_SWAP;
151 op_data->op_data_version = 0;
152 op_data->op_lease_handle = och->och_lease_handle;
/* @data is the second inode whose layout is swapped with ours. */
153 op_data->op_fid2 = *ll_inode2fid(data);
156 case MDS_HSM_RELEASE:
157 LASSERT(data != NULL);
158 op_data->op_bias |= MDS_HSM_RELEASE;
/* @data is a pointer to the data version to release against. */
159 op_data->op_data_version = *(__u64 *)data;
160 op_data->op_lease_handle = och->och_lease_handle;
161 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
165 LASSERT(data == NULL);
169 rc = md_close(md_exp, op_data, och->och_mod, &req);
/* -EINTR is expected when the caller was interrupted; don't log it. */
170 if (rc != 0 && rc != -EINTR)
171 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
172 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
/* For biased closes, check whether the server executed the intent. */
175 op_data->op_bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)) {
176 struct mdt_body *body;
178 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
179 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
183 ll_finish_md_op_data(op_data);
187 md_clear_open_replay_data(md_exp, och);
/* Poison the handle so later use of this och is detectable. */
188 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
191 ptlrpc_req_finished(req); /* This is close request */
/*
 * Really close the MDS open handle of the given mode (read/write/exec)
 * if no other local users of that handle remain.
 */
195 int ll_md_real_close(struct inode *inode, fmode_t fmode)
197 struct ll_inode_info *lli = ll_i2info(inode);
198 struct obd_client_handle **och_p;
199 struct obd_client_handle *och;
/* Pick the per-mode handle slot and its usecount. */
204 if (fmode & FMODE_WRITE) {
205 och_p = &lli->lli_mds_write_och;
206 och_usecount = &lli->lli_open_fd_write_count;
207 } else if (fmode & FMODE_EXEC) {
208 och_p = &lli->lli_mds_exec_och;
209 och_usecount = &lli->lli_open_fd_exec_count;
211 LASSERT(fmode & FMODE_READ);
212 och_p = &lli->lli_mds_read_och;
213 och_usecount = &lli->lli_open_fd_read_count;
216 mutex_lock(&lli->lli_och_mutex);
217 if (*och_usecount > 0) {
218 /* There are still users of this handle, so skip
220 mutex_unlock(&lli->lli_och_mutex);
226 mutex_unlock(&lli->lli_och_mutex);
229 /* There might be a race and this handle may already
231 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/*
 * Per-file-descriptor close: drop group lock/lease/och owned by this fd,
 * decrement the per-mode open count, and talk to the MDS only when no
 * matching OPEN lock is cached locally.
 */
237 static int ll_md_close(struct inode *inode, struct file *file)
/* Match only the MDS_INODELOCK_OPEN bit when probing the lock cache. */
239 union ldlm_policy_data policy = {
240 .l_inodebits = { MDS_INODELOCK_OPEN },
/* LDLM_FL_TEST_LOCK: probe for a granted lock without taking a ref. */
242 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
243 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
244 struct ll_inode_info *lli = ll_i2info(inode);
245 struct lustre_handle lockh;
246 enum ldlm_mode lockmode;
250 /* clear group lock, if present */
251 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
252 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
254 if (fd->fd_lease_och != NULL) {
257 /* Usually the lease is not released when the
258 * application crashed, we need to release here. */
259 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
260 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
261 PFID(&lli->lli_fid), rc, lease_broken);
263 fd->fd_lease_och = NULL;
/* fd_och is an openhandle this fd took ownership of (lease path). */
266 if (fd->fd_och != NULL) {
267 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
272 /* Let's see if we have good enough OPEN lock on the file and if
273 we can skip talking to MDS */
274 mutex_lock(&lli->lli_och_mutex);
275 if (fd->fd_omode & FMODE_WRITE) {
277 LASSERT(lli->lli_open_fd_write_count);
278 lli->lli_open_fd_write_count--;
279 } else if (fd->fd_omode & FMODE_EXEC) {
281 LASSERT(lli->lli_open_fd_exec_count);
282 lli->lli_open_fd_exec_count--;
285 LASSERT(lli->lli_open_fd_read_count);
286 lli->lli_open_fd_read_count--;
288 mutex_unlock(&lli->lli_och_mutex);
/* No cached OPEN lock of the right mode -> must close on the MDS. */
290 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
291 LDLM_IBITS, &policy, lockmode, &lockh))
292 rc = ll_md_real_close(inode, fd->fd_omode);
295 LUSTRE_FPRIVATE(file) = NULL;
296 ll_file_data_put(fd);
301 /* While this returns an error code, fput() the caller does not, so we need
302 * to make every effort to clean up all of our state here. Also, applications
303 * rarely check close errors and even if an error is returned they will not
304 * re-try the close call.
306 int ll_file_release(struct inode *inode, struct file *file)
308 struct ll_file_data *fd;
309 struct ll_sb_info *sbi = ll_i2sbi(inode);
310 struct ll_inode_info *lli = ll_i2info(inode);
314 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
315 PFID(ll_inode2fid(inode)), inode);
/* Don't count releases of the filesystem root in the stats. */
317 if (inode->i_sb->s_root != file_dentry(file))
318 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
319 fd = LUSTRE_FPRIVATE(file);
322 /* The last ref on @file, maybe not the the owner pid of statahead,
323 * because parent and child process can share the same file handle. */
324 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
325 ll_deauthorize_statahead(inode, fd);
/* Root inode short-circuits: just free the private data. */
327 if (inode->i_sb->s_root == file_dentry(file)) {
328 LUSTRE_FPRIVATE(file) = NULL;
329 ll_file_data_put(fd);
/* Regular files: surface any async write errors recorded on the object. */
333 if (!S_ISDIR(inode->i_mode)) {
334 if (lli->lli_clob != NULL)
335 lov_read_and_clear_async_rc(lli->lli_clob);
336 lli->lli_async_rc = 0;
339 rc = ll_md_close(inode, file);
/* Fault-injection hook used by tests to dump the debug log. */
341 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
342 libcfs_debug_dumplog();
/*
 * Send an intent-OPEN to the MDS for @de, filling @itp with the result.
 * Called with MDS_OPEN_BY_FID set; the name is packed only when the server
 * lacks OBD_CONNECT_OPEN_BY_FID support.
 */
347 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
348 struct lookup_intent *itp)
350 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
351 struct dentry *parent = de->d_parent;
352 const char *name = NULL;
354 struct md_op_data *op_data;
355 struct ptlrpc_request *req = NULL;
359 LASSERT(parent != NULL);
360 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
362 /* if server supports open-by-fid, or file name is invalid, don't pack
363 * name in open request */
364 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
365 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
366 name = de->d_name.name;
367 len = de->d_name.len;
370 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
371 name, len, 0, LUSTRE_OPC_ANY, NULL);
373 RETURN(PTR_ERR(op_data));
/* Pass the (optional) striping metadata along with the open. */
374 op_data->op_data = lmm;
375 op_data->op_data_size = lmmsize;
377 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
378 &ll_md_blocking_ast, 0);
379 ll_finish_md_op_data(op_data);
381 /* reason for keep own exit path - don`t flood log
382 * with messages with -ESTALE errors.
/* If the open did not actually succeed, release any handle we got. */
384 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
385 it_open_error(DISP_OPEN_OPEN, itp))
387 ll_release_openhandle(de, itp);
391 if (it_disposition(itp, DISP_LOOKUP_NEG))
392 GOTO(out, rc = -ENOENT);
394 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
395 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
396 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* Update the in-core inode from the server reply; then stash lock data. */
400 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
401 if (!rc && itp->it_lock_mode)
402 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
405 ptlrpc_req_finished(req);
406 ll_intent_drop_lock(itp);
408 /* We did open by fid, but by the time we got to the server,
409 * the object disappeared. If this is a create, we cannot really
410 * tell the userspace that the file it was trying to create
411 * does not exist. Instead let's return -ESTALE, and the VFS will
412 * retry the create with LOOKUP_REVAL that we are going to catch
413 * in ll_revalidate_dentry() and use lookup then.
415 if (rc == -ENOENT && itp->it_op & IT_CREAT)
/*
 * Populate an obd_client_handle from the MDT reply carried by @it and
 * register it for open replay. Returns md_set_open_replay_data()'s result.
 */
421 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
422 struct obd_client_handle *och)
424 struct mdt_body *body;
426 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
427 och->och_fh = body->mbo_handle;
428 och->och_fid = body->mbo_fid1;
/* The lease lock handle doubles as the lock handle from this intent. */
429 och->och_lease_handle.cookie = it->it_lock_handle;
430 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
431 och->och_flags = it->it_flags;
433 return md_set_open_replay_data(md_exp, och, it);
/*
 * Finish the client-local part of an open: optionally fill @och from the
 * intent reply, then attach @fd to the file and initialize its readahead
 * and cl_io bookkeeping.
 */
436 static int ll_local_open(struct file *file, struct lookup_intent *it,
437 struct ll_file_data *fd, struct obd_client_handle *och)
439 struct inode *inode = file_inode(file);
/* The file must not already carry private data (see ll_file_open). */
442 LASSERT(!LUSTRE_FPRIVATE(file));
449 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
454 LUSTRE_FPRIVATE(file) = fd;
455 ll_readahead_init(inode, &fd->fd_ras);
/* Remember the effective open mode for the later close path. */
456 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
458 /* ll_cl_context initialize */
459 rwlock_init(&fd->fd_lock);
460 INIT_LIST_HEAD(&fd->fd_lccs);
465 /* Open a file, and (for the very first open) create objects on the OSTs at
466 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
467 * creation or open until ll_lov_setstripe() ioctl is called.
469 * If we already have the stripe MD locally then we don't request it in
470 * md_open(), by passing a lmm_size = 0.
472 * It is up to the application to ensure no other processes open this file
473 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
474 * used. We might be able to avoid races of that sort by getting lli_open_sem
475 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
476 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
478 int ll_file_open(struct inode *inode, struct file *file)
480 struct ll_inode_info *lli = ll_i2info(inode);
481 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
482 .it_flags = file->f_flags };
483 struct obd_client_handle **och_p = NULL;
484 __u64 *och_usecount = NULL;
485 struct ll_file_data *fd;
489 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
490 PFID(ll_inode2fid(inode)), inode, file->f_flags);
/* An intent may have been stashed in private_data by the lookup path. */
492 it = file->private_data; /* XXX: compat macro */
493 file->private_data = NULL; /* prevent ll_local_open assertion */
495 fd = ll_file_data_get();
497 GOTO(out_openerr, rc = -ENOMEM);
500 if (S_ISDIR(inode->i_mode))
501 ll_authorize_statahead(inode, fd);
/* Root inode: no MDS open needed, just attach the fd. */
503 if (inode->i_sb->s_root == file_dentry(file)) {
504 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent from lookup -> build our own open intent (oit). */
508 if (!it || !it->it_disposition) {
509 /* Convert f_flags into access mode. We cannot use file->f_mode,
510 * because everything but O_ACCMODE mask was stripped from
/* (O_ACCMODE trick: flags+1 turns the 0..2 access mode into FMODE bits.) */
512 if ((oit.it_flags + 1) & O_ACCMODE)
514 if (file->f_flags & O_TRUNC)
515 oit.it_flags |= FMODE_WRITE;
517 /* kernel only call f_op->open in dentry_open. filp_open calls
518 * dentry_open after call to open_namei that checks permissions.
519 * Only nfsd_open call dentry_open directly without checking
520 * permissions and because of that this code below is safe. */
521 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
522 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
524 /* We do not want O_EXCL here, presumably we opened the file
525 * already? XXX - NFS implications? */
526 oit.it_flags &= ~O_EXCL;
528 /* bug20584, if "it_flags" contains O_CREAT, the file will be
529 * created if necessary, then "IT_CREAT" should be set to keep
530 * consistent with it */
531 if (oit.it_flags & O_CREAT)
532 oit.it_op |= IT_CREAT;
538 /* Let's see if we have file open on MDS already. */
539 if (it->it_flags & FMODE_WRITE) {
540 och_p = &lli->lli_mds_write_och;
541 och_usecount = &lli->lli_open_fd_write_count;
542 } else if (it->it_flags & FMODE_EXEC) {
543 och_p = &lli->lli_mds_exec_och;
544 och_usecount = &lli->lli_open_fd_exec_count;
546 och_p = &lli->lli_mds_read_och;
547 och_usecount = &lli->lli_open_fd_read_count;
550 mutex_lock(&lli->lli_och_mutex);
551 if (*och_p) { /* Open handle is present */
552 if (it_disposition(it, DISP_OPEN_OPEN)) {
553 /* Well, there's extra open request that we do not need,
554 let's close it somehow. This will decref request. */
555 rc = it_open_error(DISP_OPEN_OPEN, it);
557 mutex_unlock(&lli->lli_och_mutex);
558 GOTO(out_openerr, rc);
561 ll_release_openhandle(file_dentry(file), it);
/* Reuse the existing MDS handle; no och passed to ll_local_open. */
565 rc = ll_local_open(file, it, fd, NULL);
568 mutex_unlock(&lli->lli_och_mutex);
569 GOTO(out_openerr, rc);
572 LASSERT(*och_usecount == 0);
573 if (!it->it_disposition) {
574 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
575 /* We cannot just request lock handle now, new ELC code
576 means that one of other OPEN locks for this file
577 could be cancelled, and since blocking ast handler
578 would attempt to grab och_mutex as well, that would
579 result in a deadlock */
580 mutex_unlock(&lli->lli_och_mutex);
582 * Normally called under two situations:
584 * 2. A race/condition on MDS resulting in no open
585 * handle to be returned from LOOKUP|OPEN request,
586 * for example if the target entry was a symlink.
588 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
589 * marked by a bit set in ll_iget_for_nfs. Clear the
590 * bit so that it's not confusing later callers.
592 * NB; when ldd is NULL, it must have come via normal
593 * lookup path only, since ll_iget_for_nfs always calls
596 if (ldd && ldd->lld_nfs_dentry) {
597 ldd->lld_nfs_dentry = 0;
598 it->it_flags |= MDS_OPEN_LOCK;
602 * Always specify MDS_OPEN_BY_FID because we don't want
603 * to get file with different fid.
605 it->it_flags |= MDS_OPEN_BY_FID;
606 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
609 GOTO(out_openerr, rc);
613 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
615 GOTO(out_och_free, rc = -ENOMEM);
619 /* md_intent_lock() didn't get a request ref if there was an
620 * open error, so don't do cleanup on the request here
622 /* XXX (green): Should not we bail out on any error here, not
623 * just open error? */
624 rc = it_open_error(DISP_OPEN_OPEN, it);
626 GOTO(out_och_free, rc);
628 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
629 "inode %p: disposition %x, status %d\n", inode,
630 it_disposition(it, ~0), it->it_status);
632 rc = ll_local_open(file, it, fd, *och_p);
634 GOTO(out_och_free, rc);
636 mutex_unlock(&lli->lli_och_mutex);
639 /* Must do this outside lli_och_mutex lock to prevent deadlock where
640 different kind of OPEN lock for this same inode gets cancelled
641 by ldlm_cancel_lru */
642 if (!S_ISREG(inode->i_mode))
643 GOTO(out_och_free, rc);
645 cl_lov_delay_create_clear(&file->f_flags);
646 GOTO(out_och_free, rc);
/* Error/cleanup paths below free the och slot and fd as appropriate. */
650 if (och_p && *och_p) {
651 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
652 *och_p = NULL; /* OBD_FREE writes some magic there */
655 mutex_unlock(&lli->lli_och_mutex);
658 if (lli->lli_opendir_key == fd)
659 ll_deauthorize_statahead(inode, fd);
661 ll_file_data_put(fd);
663 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* Drop the extra request ref held for DISP_ENQ_OPEN_REF intents. */
666 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
667 ptlrpc_req_finished(it->it_request);
668 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for lease locks: on a blocking callback, cancel the lease
 * lock asynchronously; the CANCELING branch content is not fully visible
 * in this excerpt.
 */
674 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
675 struct ldlm_lock_desc *desc, void *data, int flag)
678 struct lustre_handle lockh;
682 case LDLM_CB_BLOCKING:
683 ldlm_lock2handle(lock, &lockh);
/* LCF_ASYNC: don't block the AST thread waiting for the cancel. */
684 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
686 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
690 case LDLM_CB_CANCELING:
698 * When setting a lease on a file, we take ownership of the lli_mds_*_och
699 * and save it as fd->fd_och so as to force client to reopen the file even
700 * if it has an open lock in cache already.
702 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
703 struct lustre_handle *old_handle)
705 struct ll_inode_info *lli = ll_i2info(inode);
706 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
707 struct obd_client_handle **och_p;
712 /* Get the openhandle of the file */
713 mutex_lock(&lli->lli_och_mutex);
/* Only one lease per file descriptor at a time. */
714 if (fd->fd_lease_och != NULL)
715 GOTO(out_unlock, rc = -EBUSY);
717 if (fd->fd_och == NULL) {
718 if (file->f_mode & FMODE_WRITE) {
719 LASSERT(lli->lli_mds_write_och != NULL);
720 och_p = &lli->lli_mds_write_och;
721 och_usecount = &lli->lli_open_fd_write_count;
723 LASSERT(lli->lli_mds_read_och != NULL);
724 och_p = &lli->lli_mds_read_och;
725 och_usecount = &lli->lli_open_fd_read_count;
/* Can't take ownership while other fds still share this handle. */
728 if (*och_usecount > 1)
729 GOTO(out_unlock, rc = -EBUSY);
/* Report the old server handle so the MDT can match the same owner. */
736 *old_handle = fd->fd_och->och_fh;
740 mutex_unlock(&lli->lli_och_mutex);
745 * Release ownership on lli_mds_*_och when putting back a file lease.
747 static int ll_lease_och_release(struct inode *inode, struct file *file)
749 struct ll_inode_info *lli = ll_i2info(inode);
750 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
751 struct obd_client_handle **och_p;
752 struct obd_client_handle *old_och = NULL;
757 mutex_lock(&lli->lli_och_mutex);
/* Choose the slot that matches this file's access mode. */
758 if (file->f_mode & FMODE_WRITE) {
759 och_p = &lli->lli_mds_write_och;
760 och_usecount = &lli->lli_open_fd_write_count;
762 och_p = &lli->lli_mds_read_och;
763 och_usecount = &lli->lli_open_fd_read_count;
766 /* The file may have been open by another process (broken lease) so
767 * *och_p is not NULL. In this case we should simply increase usecount
770 if (*och_p != NULL) {
/* Slot already occupied: our och is redundant and must be closed. */
771 old_och = fd->fd_och;
778 mutex_unlock(&lli->lli_och_mutex);
/* Close the superfluous handle outside the mutex. */
781 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
787 * Acquire a lease and open the file.
789 static struct obd_client_handle *
790 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
793 struct lookup_intent it = { .it_op = IT_OPEN };
794 struct ll_sb_info *sbi = ll_i2sbi(inode);
795 struct md_op_data *op_data;
796 struct ptlrpc_request *req = NULL;
797 struct lustre_handle old_handle = { 0 };
798 struct obd_client_handle *och = NULL;
/* A lease must be exactly read OR write, nothing else. */
803 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
804 RETURN(ERR_PTR(-EINVAL));
/* The caller's open mode must include the requested lease mode. */
807 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
808 RETURN(ERR_PTR(-EPERM));
810 rc = ll_lease_och_acquire(inode, file, &old_handle);
817 RETURN(ERR_PTR(-ENOMEM));
819 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
820 LUSTRE_OPC_ANY, NULL);
822 GOTO(out, rc = PTR_ERR(op_data));
824 /* To tell the MDT this openhandle is from the same owner */
825 op_data->op_handle = old_handle;
827 it.it_flags = fmode | open_flags;
828 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
829 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
830 &ll_md_blocking_lease_ast,
831 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
832 * it can be cancelled which may mislead applications that the lease is
834 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
835 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
836 * doesn't deal with openhandle, so normal openhandle will be leaked. */
837 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
838 ll_finish_md_op_data(op_data);
839 ptlrpc_req_finished(req);
841 GOTO(out_release_it, rc);
843 if (it_disposition(&it, DISP_LOOKUP_NEG))
844 GOTO(out_release_it, rc = -ENOENT);
846 rc = it_open_error(DISP_OPEN_OPEN, &it);
848 GOTO(out_release_it, rc);
850 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
851 ll_och_fill(sbi->ll_md_exp, &it, och);
/* Servers predating lease support never set DISP_OPEN_LEASE. */
853 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
854 GOTO(out_close, rc = -EOPNOTSUPP);
856 /* already get lease, handle lease lock */
857 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
858 if (it.it_lock_mode == 0 ||
859 it.it_lock_bits != MDS_INODELOCK_OPEN) {
860 /* open lock must return for lease */
861 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
862 PFID(ll_inode2fid(inode)), it.it_lock_mode,
864 GOTO(out_close, rc = -EPROTO);
867 ll_intent_release(&it);
/* Error path: undo the lease lock and the server openhandle. */
871 /* Cancel open lock */
872 if (it.it_lock_mode != 0) {
873 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
876 och->och_lease_handle.cookie = 0ULL;
878 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
880 CERROR("%s: error closing file "DFID": %d\n",
881 ll_get_fsname(inode->i_sb, NULL, 0),
882 PFID(&ll_i2info(inode)->lli_fid), rc2);
883 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
885 ll_intent_release(&it);
893 * Check whether a layout swap can be done between two inodes.
895 * \param[in] inode1 First inode to check
896 * \param[in] inode2 Second inode to check
898 * \retval 0 on success, layout swap can be performed between both inodes
899 * \retval negative error code if requirements are not met
901 static int ll_check_swap_layouts_validity(struct inode *inode1,
902 struct inode *inode2)
/* Only regular files carry swappable layouts. */
904 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
/* Caller must have write permission on both files. */
907 if (inode_permission(inode1, MAY_WRITE) ||
908 inode_permission(inode2, MAY_WRITE))
/* Both inodes must live on the same filesystem. */
911 if (inode1->i_sb != inode2->i_sb)
/*
 * Close @inode's open handle with the MDS_CLOSE_LAYOUT_SWAP bias so the
 * MDT atomically swaps its layout with @inode2's.
 */
917 static int ll_swap_layouts_close(struct obd_client_handle *och,
918 struct inode *inode, struct inode *inode2)
920 const struct lu_fid *fid1 = ll_inode2fid(inode);
921 const struct lu_fid *fid2;
925 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
926 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
928 rc = ll_check_swap_layouts_validity(inode, inode2);
930 GOTO(out_free_och, rc);
932 /* We now know that inode2 is a lustre inode */
933 fid2 = ll_inode2fid(inode2);
/* Swapping a layout with itself makes no sense. */
935 rc = lu_fid_cmp(fid1, fid2);
937 GOTO(out_free_och, rc = -EINVAL);
939 /* Close the file and swap layouts between inode & inode2.
940 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
941 * because we still need it to pack l_remote_handle to MDT. */
942 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
945 och = NULL; /* freed in ll_close_inode_openhandle() */
955 * Release lease and close the file.
956 * It will check if the lease has ever broken.
958 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
961 struct ldlm_lock *lock;
962 bool cancelled = true;
966 lock = ldlm_handle2lock(&och->och_lease_handle);
/* Inspect the lock state to learn whether the lease was broken. */
968 lock_res_and_lock(lock);
969 cancelled = ldlm_is_cancel(lock);
970 unlock_res_and_lock(lock);
974 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
975 PFID(&ll_i2info(inode)->lli_fid), cancelled);
/* Lease still intact: cancel it ourselves before closing. */
978 ldlm_cli_cancel(&och->och_lease_handle, 0);
980 if (lease_broken != NULL)
981 *lease_broken = cancelled;
983 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/*
 * Merge MDS-provided timestamps with OST-side attributes (size, blocks,
 * a/m/ctime) into the in-core inode, under the inode size lock.
 */
987 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
989 struct ll_inode_info *lli = ll_i2info(inode);
990 struct cl_object *obj = lli->lli_clob;
991 struct cl_attr *attr = vvp_env_thread_attr(env);
999 ll_inode_size_lock(inode);
1001 /* Merge timestamps the most recently obtained from MDS with
1002 * timestamps obtained from OSTs.
1004 * Do not overwrite atime of inode because it may be refreshed
1005 * by file_accessed() function. If the read was served by cache
1006 * data, there is no RPC to be sent so that atime may not be
1007 * transferred to OSTs at all. MDT only updates atime at close time
1008 * if it's at least 'mdd.*.atime_diff' older.
1009 * All in all, the atime in Lustre does not strictly comply with
1010 * POSIX. Solving this problem needs to send an RPC to MDT for each
1011 * read, this will hurt performance. */
1012 if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
1013 LTIME_S(inode->i_atime) = lli->lli_atime;
1014 lli->lli_update_atime = 0;
1016 LTIME_S(inode->i_mtime) = lli->lli_mtime;
1017 LTIME_S(inode->i_ctime) = lli->lli_ctime;
/* Capture the MDS view before folding in OST attributes. */
1019 atime = LTIME_S(inode->i_atime);
1020 mtime = LTIME_S(inode->i_mtime);
1021 ctime = LTIME_S(inode->i_ctime);
1023 cl_object_attr_lock(obj);
1024 rc = cl_object_attr_get(env, obj, attr);
1025 cl_object_attr_unlock(obj);
1028 GOTO(out_size_unlock, rc);
/* Take the newest of MDS vs OST for each timestamp. */
1030 if (atime < attr->cat_atime)
1031 atime = attr->cat_atime;
1033 if (ctime < attr->cat_ctime)
1034 ctime = attr->cat_ctime;
1036 if (mtime < attr->cat_mtime)
1037 mtime = attr->cat_mtime;
1039 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1040 PFID(&lli->lli_fid), attr->cat_size);
/* Size and block count come from the OSTs. */
1042 i_size_write(inode, attr->cat_size);
1043 inode->i_blocks = attr->cat_blocks;
1045 LTIME_S(inode->i_atime) = atime;
1046 LTIME_S(inode->i_mtime) = mtime;
1047 LTIME_S(inode->i_ctime) = ctime;
1050 ll_inode_size_unlock(inode);
/*
 * Decide whether atime updates should be suppressed for this file,
 * mirroring the kernel's own file_accessed()/touch_atime() checks.
 */
1055 static bool file_is_noatime(const struct file *file)
1057 const struct vfsmount *mnt = file->f_path.mnt;
1058 const struct inode *inode = file_inode((struct file *)file);
1060 /* Adapted from file_accessed() and touch_atime().*/
1061 if (file->f_flags & O_NOATIME)
1064 if (inode->i_flags & S_NOATIME)
1067 if (IS_NOATIME(inode))
1070 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
/* nodiratime only applies to directories. */
1073 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1076 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1082 static int ll_file_io_ptask(struct cfs_ptask *ptask);
/*
 * Initialize a cl_io for a read (CIT_READ) or write (CIT_WRITE) on @file:
 * set up the kiocb/iter, locking policy, atime suppression and the
 * parallel-IO task hook.
 */
1084 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1086 struct inode *inode = file_inode(file);
1088 memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter));
1089 init_sync_kiocb(&io->u.ci_rw.rw_iocb, file);
1090 io->u.ci_rw.rw_file = file;
1091 io->u.ci_rw.rw_ptask = ll_file_io_ptask;
1092 io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK);
1093 if (iot == CIT_WRITE) {
1094 io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND);
1095 io->u.ci_rw.rw_sync = !!(file->f_flags & O_SYNC ||
1096 file->f_flags & O_DIRECT ||
1099 io->ci_obj = ll_i2info(inode)->lli_clob;
/* Default locking policy; refined below based on file flags. */
1100 io->ci_lockreq = CILR_MAYBE;
1101 if (ll_file_nolock(file)) {
1102 io->ci_lockreq = CILR_NEVER;
1103 io->ci_no_srvlock = 1;
1104 } else if (file->f_flags & O_APPEND) {
/* Appends need a mandatory lock to serialize the EOF position. */
1105 io->ci_lockreq = CILR_MANDATORY;
1107 io->ci_noatime = file_is_noatime(file);
/* Parallel IO is enabled per-superblock, but never for appends. */
1108 if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO)
1109 io->ci_pio = !io->u.ci_rw.rw_append;
/*
 * Worker for parallel file IO: runs one chunk of a split read/write
 * described by the cl_io_pt in @ptask, accumulating progress in
 * pt->cip_result. Returns 0 when any progress was made, else the rc.
 */
1114 static int ll_file_io_ptask(struct cfs_ptask *ptask)
1116 struct cl_io_pt *pt = ptask->pt_cbdata;
1117 struct file *file = pt->cip_file;
1120 loff_t pos = pt->cip_pos;
1125 env = cl_env_get(&refcheck);
1127 RETURN(PTR_ERR(env));
1129 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1130 file_dentry(file)->d_name.name,
1131 pt->cip_iot == CIT_READ ? "read" : "write",
1132 pos, pos + pt->cip_count);
1135 io = vvp_env_thread_io(env);
1136 ll_io_init(io, file, pt->cip_iot);
/* Resume from the saved iterator/iocb state of the parent IO. */
1137 io->u.ci_rw.rw_iter = pt->cip_iter;
1138 io->u.ci_rw.rw_iocb = pt->cip_iocb;
1139 io->ci_pio = 0; /* It's already in parallel task */
1141 rc = cl_io_rw_init(env, io, pt->cip_iot, pos,
1142 pt->cip_count - pt->cip_result);
1144 struct vvp_io *vio = vvp_env_io(env);
1146 vio->vui_io_subtype = IO_NORMAL;
1147 vio->vui_fd = LUSTRE_FPRIVATE(file);
1149 ll_cl_add(file, env, io, LCC_RW);
1150 rc = cl_io_loop(env, io);
1151 ll_cl_remove(file, env);
1153 /* cl_io_rw_init() handled IO */
/* Fault-injection point used by tests to simulate ptask IO failure. */
1157 if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) {
/* Record progress and advance the iterator/position bookkeeping. */
1163 if (io->ci_nob > 0) {
1164 pt->cip_result += io->ci_nob;
1165 iov_iter_advance(&pt->cip_iter, io->ci_nob);
1167 pt->cip_iocb.ki_pos = pos;
1168 #ifdef HAVE_KIOCB_KI_LEFT
1169 pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result;
1170 #elif defined(HAVE_KI_NBYTES)
1171 pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result;
1175 cl_io_fini(env, io);
/* Layout changed mid-IO: restart the remaining range. */
1177 if ((rc == 0 || rc == -ENODATA) &&
1178 pt->cip_result < pt->cip_count &&
1179 io->ci_need_restart) {
1181 "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1182 file_dentry(file)->d_name.name,
1183 pt->cip_iot == CIT_READ ? "read" : "write",
1184 pos, pos + pt->cip_count - pt->cip_result,
1185 pt->cip_result, rc);
1189 CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
1190 file_dentry(file)->d_name.name,
1191 pt->cip_iot == CIT_READ ? "read" : "write",
1192 pt->cip_result, rc);
1194 cl_env_put(env, &refcheck);
/* Partial success wins: report 0 if any bytes were transferred. */
1195 RETURN(pt->cip_result > 0 ? 0 : rc);
/*
 * Common read/write/splice engine for llite: builds a cl_io for the
 * requested I/O type, takes the range lock where required, runs
 * cl_io_loop(), and restarts the remaining range when the layout changed
 * mid-flight (io->ci_need_restart).
 * NOTE(review): this extraction is missing interleaving lines (braces,
 * declarations, labels); comments annotate the visible logic only.
 */
1199 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1200 struct file *file, enum cl_io_type iot,
1201 loff_t *ppos, size_t count)
1203 struct range_lock range;
1204 struct vvp_io *vio = vvp_env_io(env);
1205 struct inode *inode = file_inode(file);
1206 struct ll_inode_info *lli = ll_i2info(inode);
1207 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1215 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1216 file_dentry(file)->d_name.name,
1217 iot == CIT_READ ? "read" : "write", pos, pos + count);
1220 io = vvp_env_thread_io(env);
1221 ll_io_init(io, file, iot);
/* normal (non-splice) I/O carries the caller's iov_iter/kiocb in the io */
1222 if (args->via_io_subtype == IO_NORMAL) {
1223 io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
1224 io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;
1229 if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
1230 bool range_locked = false;
/* O_APPEND writes lock out to EOF since the final offset is unknown */
1232 if (file->f_flags & O_APPEND)
1233 range_lock_init(&range, 0, LUSTRE_EOF);
1235 range_lock_init(&range, pos, pos + count - 1);
1237 vio->vui_fd = LUSTRE_FPRIVATE(file);
1238 vio->vui_io_subtype = args->via_io_subtype;
1240 switch (vio->vui_io_subtype) {
1242 /* Direct IO reads must also take range lock,
1243 * or multiple reads will try to work on the same pages
1244 * See LU-6227 for details. */
1245 if (((iot == CIT_WRITE) ||
1246 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1247 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1248 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1250 rc = range_lock(&lli->lli_write_tree, &range);
1254 range_locked = true;
1258 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1259 vio->u.splice.vui_flags = args->u.splice.via_flags;
1262 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1266 ll_cl_add(file, env, io, LCC_RW);
/* hold the inode lock across a pio write on SUID/SGID-relevant inodes,
 * unless another level already took it (lli_inode_locked) */
1267 if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) &&
1268 !lli->lli_inode_locked) {
1270 lli->lli_inode_locked = 1;
1272 rc = cl_io_loop(env, io);
1273 if (lli->lli_inode_locked) {
1274 lli->lli_inode_locked = 0;
1275 inode_unlock(inode);
1277 ll_cl_remove(file, env);
1280 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1282 range_unlock(&lli->lli_write_tree, &range);
1285 /* cl_io_rw_init() handled IO */
/* accumulate partial progress so a restart continues where we left off */
1289 if (io->ci_nob > 0) {
1290 result += io->ci_nob;
1291 count -= io->ci_nob;
1293 if (args->via_io_subtype == IO_NORMAL) {
1294 iov_iter_advance(args->u.normal.via_iter, io->ci_nob);
1296 args->u.normal.via_iocb->ki_pos = pos;
1297 #ifdef HAVE_KIOCB_KI_LEFT
1298 args->u.normal.via_iocb->ki_left = count;
1299 #elif defined(HAVE_KI_NBYTES)
1300 args->u.normal.via_iocb->ki_nbytes = count;
1304 pos = io->u.ci_rw.rw_range.cir_pos;
1308 cl_io_fini(env, io);
/* layout changed during the I/O: restart on the remaining range */
1310 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1312 "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1313 file_dentry(file)->d_name.name,
1314 iot == CIT_READ ? "read" : "write",
1315 pos, pos + count, result, rc);
1319 if (iot == CIT_READ) {
1321 ll_stats_ops_tally(ll_i2sbi(inode),
1322 LPROC_LL_READ_BYTES, result);
1323 } else if (iot == CIT_WRITE) {
1325 ll_stats_ops_tally(ll_i2sbi(inode),
1326 LPROC_LL_WRITE_BYTES, result);
1327 fd->fd_write_failed = false;
1328 } else if (result == 0 && rc == 0) {
/* track write failure state for later fsync error reporting */
1331 fd->fd_write_failed = true;
1333 fd->fd_write_failed = false;
1334 } else if (rc != -ERESTARTSYS) {
1335 fd->fd_write_failed = true;
1339 CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n",
1340 file_dentry(file)->d_name.name,
1341 iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc);
1345 RETURN(result > 0 ? result : rc);
1349 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1350 * especially for small I/O.
1352 * To serve a read request, CLIO has to create and initialize a cl_io and
1353 * then request DLM lock. This has turned out to have significant overhead
1354 * and affects the performance of small I/O dramatically.
1356 * It's not necessary to create a cl_io for each I/O. Under the help of read
1357 * ahead, most of the pages being read are already in memory cache and we can
1358 * read those pages directly because if the pages exist, the corresponding DLM
1359 * lock must exist so that page content must be valid.
1361 * In fast read implementation, the llite speculatively finds and reads pages
1362 * in memory cache. There are three scenarios for fast read:
1363 * - If the page exists and is uptodate, kernel VM will provide the data and
1364 * CLIO won't be intervened;
1365 * - If the page was brought into memory by read ahead, it will be exported
1366 * and read ahead parameters will be updated;
1367 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1368 * it will go back and invoke normal read, i.e., a cl_io will be created
1369 * and DLM lock will be requested.
1371 * POSIX compliance: posix standard states that read is intended to be atomic.
1372 * Lustre read implementation is in line with Linux kernel read implementation
1373 * and neither of them complies with POSIX standard in this matter. Fast read
1374 * doesn't make the situation worse on single node but it may interleave write
1375 * results from multiple nodes due to short read handling in ll_file_aio_read().
1377 * \param env - lu_env
1378 * \param iocb - kiocb from kernel
1379 * \param iter - user space buffers where the data will be copied
1381 * \retval - number of bytes have been read, or error code if error occurred.
/*
 * Speculative "fast read" straight from the page cache, bypassing cl_io.
 * Returns bytes read; -ENODATA means the page was not cached and the
 * caller must fall back to the normal cl_io read path.
 */
1384 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1388 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1391 /* NB: we can't do direct IO for fast read because it will need a lock
1392 * to make IO engine happy. */
1393 if (iocb->ki_filp->f_flags & O_DIRECT)
1396 result = generic_file_read_iter(iocb, iter);
1398 /* If the first page is not in cache, generic_file_aio_read() will
1399 * return -ENODATA.
1400 * See corresponding code in ll_readpage(). */
1401 if (result == -ENODATA)
1405 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1406 LPROC_LL_READ_BYTES, result);
1412 * Read from a file (through the page cache).
/*
 * Read from a file (through the page cache).
 * Tries the lockless fast-read path first; any bytes still unread go
 * through ll_file_io_generic() with a full cl_io.
 */
1414 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1417 struct vvp_io_args *args;
1422 result = ll_do_fast_read(iocb, to);
/* fast-read error, or request fully satisfied: done */
1423 if (result < 0 || iov_iter_count(to) == 0)
1426 env = cl_env_get(&refcheck);
1428 return PTR_ERR(env);
1430 args = ll_env_args(env, IO_NORMAL);
1431 args->u.normal.via_iter = to;
1432 args->u.normal.via_iocb = iocb;
1434 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1435 &iocb->ki_pos, iov_iter_count(to));
1438 else if (result == 0)
1441 cl_env_put(env, &refcheck);
1447 * Write to a file (through the page cache).
/*
 * Write to a file (through the page cache) via the generic cl_io engine.
 */
1449 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1451 struct vvp_io_args *args;
1456 env = cl_env_get(&refcheck);
1458 return PTR_ERR(env);
1460 args = ll_env_args(env, IO_NORMAL);
1461 args->u.normal.via_iter = from;
1462 args->u.normal.via_iocb = iocb;
1464 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1465 &iocb->ki_pos, iov_iter_count(from));
1466 cl_env_put(env, &refcheck);
1470 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1472 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array and compute the total byte count, truncating
 * at the first inaccessible segment (copied from the kernel's
 * __generic_file_aio_write_nolock, per the comment above).
 */
1474 static int ll_file_get_iov_count(const struct iovec *iov,
1475 unsigned long *nr_segs, size_t *count)
1480 for (seg = 0; seg < *nr_segs; seg++) {
1481 const struct iovec *iv = &iov[seg];
1484 * If any segment has a negative length, or the cumulative
1485 * length ever wraps negative then return -EINVAL.
1488 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
/* NOTE(review): access_ok() success polarity differs across kernel
 * versions; the surrounding elided lines presumably handle it — confirm. */
1490 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1495 cnt -= iv->iov_len; /* This segment is no good */
/*
 * aio_read entry for kernels without ->read_iter: wrap the iovec array
 * in an iov_iter and forward to ll_file_read_iter().
 */
1502 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1503 unsigned long nr_segs, loff_t pos)
1510 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1514 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1515 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1516 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1517 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1518 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1520 result = ll_file_read_iter(iocb, &to);
1525 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1528 struct iovec iov = { .iov_base = buf, .iov_len = count };
1533 init_sync_kiocb(&kiocb, file);
1534 kiocb.ki_pos = *ppos;
1535 #ifdef HAVE_KIOCB_KI_LEFT
1536 kiocb.ki_left = count;
1537 #elif defined(HAVE_KI_NBYTES)
1538 kiocb.i_nbytes = count;
1541 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1542 *ppos = kiocb.ki_pos;
1548 * Write to a file (through the page cache).
/*
 * aio_write entry for kernels without ->write_iter: wrap the iovec array
 * in an iov_iter and forward to ll_file_write_iter().
 */
1551 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1552 unsigned long nr_segs, loff_t pos)
1554 struct iov_iter from;
1559 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1563 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1564 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1565 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1566 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1567 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1569 result = ll_file_write_iter(iocb, &from);
/*
 * Synchronous write entry for kernels without ->write_iter.  Unlike
 * ll_file_read() this borrows the per-env kiocb (lti_kiocb) instead of
 * using one on the stack.
 */
1574 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1575 size_t count, loff_t *ppos)
1578 struct iovec iov = { .iov_base = (void __user *)buf,
1580 struct kiocb *kiocb;
1585 env = cl_env_get(&refcheck);
1587 RETURN(PTR_ERR(env));
1589 kiocb = &ll_env_info(env)->lti_kiocb;
1590 init_sync_kiocb(kiocb, file);
1591 kiocb->ki_pos = *ppos;
1592 #ifdef HAVE_KIOCB_KI_LEFT
1593 kiocb->ki_left = count;
1594 #elif defined(HAVE_KI_NBYTES)
1595 kiocb->ki_nbytes = count;
1598 result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos);
/* propagate the (possibly advanced) file position back to the caller */
1599 *ppos = kiocb->ki_pos;
1601 cl_env_put(env, &refcheck);
1604 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1607 * Send file content (through pagecache) somewhere with helper
/*
 * splice_read: feed page-cache file content into a pipe through the
 * IO_SPLICE subtype of ll_file_io_generic().
 */
1609 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1610 struct pipe_inode_info *pipe, size_t count,
1614 struct vvp_io_args *args;
1619 env = cl_env_get(&refcheck);
1621 RETURN(PTR_ERR(env));
1623 args = ll_env_args(env, IO_SPLICE);
1624 args->u.splice.via_pipe = pipe;
1625 args->u.splice.via_flags = flags;
1627 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1628 cl_env_put(env, &refcheck);
/*
 * Apply a stripe EA to an inode by re-opening it with an open intent
 * carrying the lov_user_md, under the inode size lock.
 */
1632 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1633 __u64 flags, struct lov_user_md *lum, int lum_size)
1635 struct lookup_intent oit = {
1637 .it_flags = flags | MDS_OPEN_BY_FID,
1642 ll_inode_size_lock(inode);
1643 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1645 GOTO(out_unlock, rc);
/* the open handle only carried the setstripe request; close it now */
1647 ll_release_openhandle(dentry, &oit);
1650 ll_inode_size_unlock(inode);
1651 ll_intent_release(&oit);
/*
 * Fetch the LOV EA (striping info) for @filename from the MDS.  On
 * success *lmmp points into the ptlrpc reply buffer (the caller must
 * keep *request alive while using it) and the EA is byte-swapped to
 * host endian where needed.
 */
1656 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1657 struct lov_mds_md **lmmp, int *lmm_size,
1658 struct ptlrpc_request **request)
1660 struct ll_sb_info *sbi = ll_i2sbi(inode);
1661 struct mdt_body *body;
1662 struct lov_mds_md *lmm = NULL;
1663 struct ptlrpc_request *req = NULL;
1664 struct md_op_data *op_data;
1667 rc = ll_get_default_mdsize(sbi, &lmmsize);
1671 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1672 strlen(filename), lmmsize,
1673 LUSTRE_OPC_ANY, NULL);
1674 if (IS_ERR(op_data))
1675 RETURN(PTR_ERR(op_data));
1677 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1678 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1679 ll_finish_md_op_data(op_data);
1681 CDEBUG(D_INFO, "md_getattr_name failed "
1682 "on %s: rc %d\n", filename, rc);
1686 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1687 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1689 lmmsize = body->mbo_eadatasize;
1691 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1693 GOTO(out, rc = -ENODATA);
1696 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1697 LASSERT(lmm != NULL);
/* only plain v1/v3 and composite (PFL) layouts are acceptable here */
1699 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
1700 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
1701 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
1702 GOTO(out, rc = -EPROTO);
1705 * This is coming from the MDS, so is probably in
1706 * little endian. We convert it to host endian before
1707 * passing it to userspace.
1709 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1712 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
1713 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1714 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1715 if (le32_to_cpu(lmm->lmm_pattern) &
1716 LOV_PATTERN_F_RELEASED)
1720 /* if function called for directory - we should
1721 * avoid swab not existent lsm objects */
1722 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1723 lustre_swab_lov_user_md_v1(
1724 (struct lov_user_md_v1 *)lmm);
1725 if (S_ISREG(body->mbo_mode))
1726 lustre_swab_lov_user_md_objects(
1727 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1729 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1730 lustre_swab_lov_user_md_v3(
1731 (struct lov_user_md_v3 *)lmm);
1732 if (S_ISREG(body->mbo_mode))
1733 lustre_swab_lov_user_md_objects(
1734 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1736 } else if (lmm->lmm_magic ==
1737 cpu_to_le32(LOV_MAGIC_COMP_V1)) {
1738 lustre_swab_lov_comp_md_v1(
1739 (struct lov_comp_md_v1 *)lmm);
1745 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: copy a lov_user_md (plus one OST entry) from
 * userspace and apply it via ll_lov_setstripe_ea_info(); requires
 * CAP_SYS_ADMIN.
 */
1750 static int ll_lov_setea(struct inode *inode, struct file *file,
1753 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1754 struct lov_user_md *lump;
1755 int lum_size = sizeof(struct lov_user_md) +
1756 sizeof(struct lov_user_ost_data);
1760 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1763 OBD_ALLOC_LARGE(lump, lum_size);
1767 if (copy_from_user(lump, arg, lum_size))
1768 GOTO(out_lump, rc = -EFAULT);
1770 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
/* clear O_LOV_DELAY_CREATE regardless of setstripe outcome */
1772 cl_lov_delay_create_clear(&file->f_flags);
1775 OBD_FREE_LARGE(lump, lum_size);
/* Copy the inode's layout (stripe info) into the userspace buffer @lum. */
1779 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
1786 env = cl_env_get(&refcheck);
1788 RETURN(PTR_ERR(env));
1790 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
1791 cl_env_put(env, &refcheck);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy the user layout, apply it via an
 * intent open, refresh the layout generation, then echo the resulting
 * stripe info back to userspace.
 */
1795 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1798 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1799 struct lov_user_md *klum;
1801 __u64 flags = FMODE_WRITE;
1804 rc = ll_copy_user_md(lum, &klum);
1809 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
/* stripe_count 0 tells userspace the layout is not instantiated yet */
1814 rc = put_user(0, &lum->lmm_stripe_count);
1818 rc = ll_layout_refresh(inode, &gen);
1822 rc = ll_file_getstripe(inode, arg, lum_size);
1824 cl_lov_delay_create_clear(&file->f_flags);
1827 OBD_FREE(klum, lum_size);
/*
 * LL_IOC_GROUP_LOCK: take a group lock (gid = @arg) on the file.  Only
 * one group lock per fd; racing takers are detected under lli_lock.  For
 * composite (PFL) files all OST objects are instantiated first so the
 * group lock covers them.
 */
1832 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1834 struct ll_inode_info *lli = ll_i2info(inode);
1835 struct cl_object *obj = lli->lli_clob;
1836 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1837 struct ll_grouplock grouplock;
1842 CWARN("group id for group lock must not be 0\n");
1846 if (ll_file_nolock(file))
1847 RETURN(-EOPNOTSUPP);
1849 spin_lock(&lli->lli_lock);
1850 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1851 CWARN("group lock already existed with gid %lu\n",
1852 fd->fd_grouplock.lg_gid);
1853 spin_unlock(&lli->lli_lock);
1856 LASSERT(fd->fd_grouplock.lg_lock == NULL);
1857 spin_unlock(&lli->lli_lock);
1860 * XXX: group lock needs to protect all OST objects while PFL
1861 * can add new OST objects during the IO, so we'd instantiate
1862 * all OST objects before getting its group lock.
1867 struct cl_layout cl = {
1868 .cl_is_composite = false,
1871 env = cl_env_get(&refcheck);
1873 RETURN(PTR_ERR(env));
1875 rc = cl_object_layout_get(env, obj, &cl);
1876 if (!rc && cl.cl_is_composite)
1877 rc = ll_layout_write_intent(inode, 0, OBD_OBJECT_EOF);
1879 cl_env_put(env, &refcheck);
1884 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
1885 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* re-check under lli_lock: another thread may have won the race
 * while we were enqueueing the lock */
1889 spin_lock(&lli->lli_lock);
1890 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1891 spin_unlock(&lli->lli_lock);
1892 CERROR("another thread just won the race\n");
1893 cl_put_grouplock(&grouplock);
1897 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1898 fd->fd_grouplock = grouplock;
1899 spin_unlock(&lli->lli_lock);
1901 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK: release the group lock previously taken on this
 * fd; the gid in @arg must match the one it was taken with.
 */
1905 static int ll_put_grouplock(struct inode *inode, struct file *file,
1908 struct ll_inode_info *lli = ll_i2info(inode);
1909 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1910 struct ll_grouplock grouplock;
1913 spin_lock(&lli->lli_lock);
1914 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1915 spin_unlock(&lli->lli_lock);
1916 CWARN("no group lock held\n");
1920 LASSERT(fd->fd_grouplock.lg_lock != NULL);
1922 if (fd->fd_grouplock.lg_gid != arg) {
1923 CWARN("group lock %lu doesn't match current id %lu\n",
1924 arg, fd->fd_grouplock.lg_gid);
1925 spin_unlock(&lli->lli_lock);
/* detach the lock from the fd under lli_lock, release it after drop */
1929 grouplock = fd->fd_grouplock;
1930 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1931 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1932 spin_unlock(&lli->lli_lock);
1934 cl_put_grouplock(&grouplock);
1935 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1940 * Close inode open handle
1942 * \param dentry [in] dentry which contains the inode
1943 * \param it [in,out] intent which contains open info and result
1946 * \retval <0 failure
1948 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1950 struct inode *inode = dentry->d_inode;
1951 struct obd_client_handle *och;
1957 /* Root ? Do nothing. */
1958 if (dentry->d_inode->i_sb->s_root == dentry)
1961 /* No open handle to close? Move away */
1962 if (!it_disposition(it, DISP_OPEN_OPEN))
1965 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1967 OBD_ALLOC(och, sizeof(*och));
1969 GOTO(out, rc = -ENOMEM);
/* fill the client handle from the intent's open reply, then close it */
1971 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1973 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
1975 /* this one is in place of ll_file_open */
1976 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1977 ptlrpc_req_finished(it->it_request);
1978 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1984 * Get size for inode for which FIEMAP mapping is requested.
1985 * Make the FIEMAP get_info call and returns the result.
1986 * \param fiemap kernel buffer to hold extens
1987 * \param num_bytes kernel buffer size
/*
 * Build a fiemap info key from the inode, glimpse the size if needed,
 * and delegate the extent mapping to cl_object_fiemap().
 */
1989 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
1995 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
1998 /* Checks for fiemap flags */
1999 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* strip (and thus report back) the flags we do not support */
2000 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2004 /* Check for FIEMAP_FLAG_SYNC */
2005 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2006 rc = filemap_fdatawrite(inode->i_mapping);
2011 env = cl_env_get(&refcheck);
2013 RETURN(PTR_ERR(env));
2015 if (i_size_read(inode) == 0) {
2016 rc = ll_glimpse_size(inode);
2021 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2022 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2023 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2025 /* If filesize is 0, then there would be no objects for mapping */
2026 if (fmkey.lfik_oa.o_size == 0) {
2027 fiemap->fm_mapped_extents = 0;
2031 fmkey.lfik_fiemap = *fiemap;
2033 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2034 &fmkey, fiemap, &num_bytes);
2036 cl_env_put(env, &refcheck);
/*
 * OBD_IOC_FID2PATH: resolve this inode's FID to a path via the MDC.
 * Requires CAP_DAC_READ_SEARCH or the user_fid2path mount flag.
 */
2040 int ll_fid2path(struct inode *inode, void __user *arg)
2042 struct obd_export *exp = ll_i2mdexp(inode);
2043 const struct getinfo_fid2path __user *gfin = arg;
2045 struct getinfo_fid2path *gfout;
2051 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2052 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2055 /* Only need to get the buflen */
2056 if (get_user(pathlen, &gfin->gf_pathlen))
/* bound the user-supplied buffer length before allocating */
2059 if (pathlen > PATH_MAX)
2062 outsize = sizeof(*gfout) + pathlen;
2063 OBD_ALLOC(gfout, outsize);
2067 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2068 GOTO(gf_free, rc = -EFAULT);
2069 /* append root FID after gfout to let MDT know the root FID so that it
2070 * can lookup the correct path, this is mainly for fileset.
2071 * old server without fileset mount support will ignore this. */
2072 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2074 /* Call mdc_iocontrol */
2075 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2079 if (copy_to_user(arg, gfout, outsize))
2083 OBD_FREE(gfout, outsize);
2088 * Read the data_version for inode.
2090 * This value is computed using stripe object version on OST.
2091 * Version is computed using server side locking.
2093 * @param flags if do sync on the OST side;
2095 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2096 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
2098 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2100 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2108 /* If no file object initialized, we consider its version is 0. */
2114 env = cl_env_get(&refcheck);
2116 RETURN(PTR_ERR(env));
/* run a CIT_DATA_VERSION io to collect the version from the OSTs */
2118 io = vvp_env_thread_io(env);
2120 io->u.ci_data_version.dv_data_version = 0;
2121 io->u.ci_data_version.dv_flags = flags;
2124 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2125 result = cl_io_loop(env, io);
2127 result = io->ci_result;
2129 *data_version = io->u.ci_data_version.dv_data_version;
2131 cl_io_fini(env, io);
/* layout switched under us: redo the whole CIT_DATA_VERSION io */
2133 if (unlikely(io->ci_need_restart))
2136 cl_env_put(env, &refcheck);
2142 * Trigger a HSM release request for the provided inode.
2144 int ll_hsm_release(struct inode *inode)
2147 struct obd_client_handle *och = NULL;
2148 __u64 data_version = 0;
2153 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2154 ll_get_fsname(inode->i_sb, NULL, 0),
2155 PFID(&ll_i2info(inode)->lli_fid));
/* take a write lease so no concurrent opener races the release */
2157 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2159 GOTO(out, rc = PTR_ERR(och));
2161 /* Grab latest data_version and [am]time values */
2162 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2166 env = cl_env_get(&refcheck);
2168 GOTO(out, rc = PTR_ERR(env));
2170 ll_merge_attr(env, inode);
2171 cl_env_put(env, &refcheck);
2173 /* Release the file.
2174 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2175 * we still need it to pack l_remote_handle to MDT. */
2176 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2182 if (och != NULL && !IS_ERR(och)) /* close the file */
2183 ll_lease_close(och, inode, NULL);
/* Working state for ll_swap_layouts(); the inode (and, per the elided
 * fields, data-version/check) pairs may be swapped to order the two
 * files deterministically by FID. */
2188 struct ll_swap_stack {
2191 struct inode *inode1;
2192 struct inode *inode2;
/*
 * Atomically swap the layouts of file1 and file2 on the MDT, optionally
 * verifying data versions and flushing dirty cache via a group lock.
 */
2197 static int ll_swap_layouts(struct file *file1, struct file *file2,
2198 struct lustre_swap_layouts *lsl)
2200 struct mdc_swap_layouts msl;
2201 struct md_op_data *op_data;
2204 struct ll_swap_stack *llss = NULL;
2207 OBD_ALLOC_PTR(llss);
2211 llss->inode1 = file_inode(file1);
2212 llss->inode2 = file_inode(file2);
2214 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2218 /* we use 2 bool because it is easier to swap than 2 bits */
2219 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2220 llss->check_dv1 = true;
2222 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2223 llss->check_dv2 = true;
2225 /* we cannot use lsl->sl_dvX directly because we may swap them */
2226 llss->dv1 = lsl->sl_dv1;
2227 llss->dv2 = lsl->sl_dv2;
/* order the pair by FID so two concurrent swaps cannot deadlock */
2229 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2230 if (rc == 0) /* same file, done! */
2233 if (rc < 0) { /* sequentialize it */
2234 swap(llss->inode1, llss->inode2);
2236 swap(llss->dv1, llss->dv2);
2237 swap(llss->check_dv1, llss->check_dv2);
2241 if (gid != 0) { /* application asks to flush dirty cache */
2242 rc = ll_get_grouplock(llss->inode1, file1, gid);
2246 rc = ll_get_grouplock(llss->inode2, file2, gid);
2248 ll_put_grouplock(llss->inode1, file1, gid);
2253 /* ultimate check, before swapping the layouts we check if
2254 * dataversion has changed (if requested) */
2255 if (llss->check_dv1) {
2256 rc = ll_data_version(llss->inode1, &dv, 0);
2259 if (dv != llss->dv1)
2260 GOTO(putgl, rc = -EAGAIN);
2263 if (llss->check_dv2) {
2264 rc = ll_data_version(llss->inode2, &dv, 0);
2267 if (dv != llss->dv2)
2268 GOTO(putgl, rc = -EAGAIN);
2271 /* struct md_op_data is used to send the swap args to the mdt
2272 * only flags is missing, so we use struct mdc_swap_layouts
2273 * through the md_op_data->op_data */
2274 /* flags from user space have to be converted before they are sent to
2275 * the server; no flag is sent today, they are only used on the client */
2278 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2279 0, LUSTRE_OPC_ANY, &msl);
2280 if (IS_ERR(op_data))
2281 GOTO(free, rc = PTR_ERR(op_data));
2283 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2284 sizeof(*op_data), op_data, NULL);
2285 ll_finish_md_op_data(op_data);
2292 ll_put_grouplock(llss->inode2, file2, gid);
2293 ll_put_grouplock(llss->inode1, file1, gid);
/*
 * Validate and apply HSM state flag changes on @inode through the MDC.
 */
2303 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2305 struct md_op_data *op_data;
2309 /* Detect out-of range masks */
2310 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2313 /* Non-root users are forbidden to set or clear flags which are
2314 * NOT defined in HSM_USER_MASK. */
2315 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2316 !cfs_capable(CFS_CAP_SYS_ADMIN))
2319 /* Detect out-of range archive id */
2320 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2321 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2324 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2325 LUSTRE_OPC_ANY, hss);
2326 if (IS_ERR(op_data))
2327 RETURN(PTR_ERR(op_data));
2329 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2330 sizeof(*op_data), op_data, NULL);
2332 ll_finish_md_op_data(op_data);
/*
 * HSM import: mark a regular file ARCHIVED|EXISTS|RELEASED, then force
 * its attributes (mode/uid/gid/size/times) to the values recorded in the
 * HSM archive so the file appears exactly as it did when archived.
 */
2337 static int ll_hsm_import(struct inode *inode, struct file *file,
2338 struct hsm_user_import *hui)
2340 struct hsm_state_set *hss = NULL;
2341 struct iattr *attr = NULL;
2345 if (!S_ISREG(inode->i_mode))
2351 GOTO(out, rc = -ENOMEM);
2353 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2354 hss->hss_archive_id = hui->hui_archive_id;
2355 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2356 rc = ll_hsm_state_set(inode, hss);
2360 OBD_ALLOC_PTR(attr);
2362 GOTO(out, rc = -ENOMEM);
2364 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2365 attr->ia_mode |= S_IFREG;
2366 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2367 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2368 attr->ia_size = hui->hui_size;
2369 attr->ia_mtime.tv_sec = hui->hui_mtime;
2370 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2371 attr->ia_atime.tv_sec = hui->hui_atime;
2372 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
/* ATTR_FORCE: apply the archived attributes despite permission checks */
2374 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2375 ATTR_UID | ATTR_GID |
2376 ATTR_MTIME | ATTR_MTIME_SET |
2377 ATTR_ATIME | ATTR_ATIME_SET;
2381 rc = ll_setattr_raw(file_dentry(file), attr, true);
2385 inode_unlock(inode);
2397 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2399 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2400 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * LL_IOC_FUTIMES_3: set atime/mtime/ctime (all three, including ctime)
 * on a regular file; CAP_SYS_ADMIN only, since ctime is normally
 * kernel-managed.
 */
2403 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2405 struct inode *inode = file_inode(file);
2407 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2408 ATTR_MTIME | ATTR_MTIME_SET |
2409 ATTR_CTIME | ATTR_CTIME_SET,
2411 .tv_sec = lfu->lfu_atime_sec,
2412 .tv_nsec = lfu->lfu_atime_nsec,
2415 .tv_sec = lfu->lfu_mtime_sec,
2416 .tv_nsec = lfu->lfu_mtime_nsec,
2419 .tv_sec = lfu->lfu_ctime_sec,
2420 .tv_nsec = lfu->lfu_ctime_nsec,
2426 if (!capable(CAP_SYS_ADMIN))
2429 if (!S_ISREG(inode->i_mode))
2433 rc = ll_setattr_raw(file_dentry(file), &ia, false);
2434 inode_unlock(inode);
2440 * Give file access advices
2442 * The ladvise interface is similar to Linux fadvise() system call, except it
2443 * forwards the advices directly from Lustre client to server. The server side
2444 * codes will apply appropriate read-ahead and caching techniques for the
2445 * corresponding files.
2447 * A typical workload for ladvise is e.g. a bunch of different clients are
2448 * doing small random reads of a file, so prefetching pages into OSS cache
2449 * with big linear reads before the random IO is a net benefit. Fetching
2450 * all that data into each client cache with fadvise() may not be, due to
2451 * much more data being sent to the client.
/*
 * Forward a single llapi_lu_ladvise advice to the server through a
 * CIT_LADVISE cl_io.
 */
2453 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2454 struct llapi_lu_ladvise *ladvise)
2458 struct cl_ladvise_io *lio;
2463 env = cl_env_get(&refcheck);
2465 RETURN(PTR_ERR(env));
2467 io = vvp_env_thread_io(env);
2468 io->ci_obj = ll_i2info(inode)->lli_clob;
2470 /* initialize parameters for ladvise */
2471 lio = &io->u.ci_ladvise;
2472 lio->li_start = ladvise->lla_start;
2473 lio->li_end = ladvise->lla_end;
2474 lio->li_fid = ll_inode2fid(inode);
2475 lio->li_advice = ladvise->lla_advice;
2476 lio->li_flags = flags;
2478 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2479 rc = cl_io_loop(env, io);
2483 cl_io_fini(env, io);
2484 cl_env_put(env, &refcheck);
/*
 * FS_IOC_FSGETXATTR: return the fsxattr with the Lustre project ID
 * filled in from the inode.
 */
2488 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
2491 struct fsxattr fsxattr;
2493 if (copy_from_user(&fsxattr,
2494 (const struct fsxattr __user *)arg,
2498 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
2499 if (copy_to_user((struct fsxattr __user *)arg,
2500 &fsxattr, sizeof(fsxattr)))
/*
 * FS_IOC_FSSETXATTR: change the project ID via an MDS setattr;
 * restricted to CAP_SYS_ADMIN.
 */
2506 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
2510 struct md_op_data *op_data;
2511 struct ptlrpc_request *req = NULL;
2513 struct fsxattr fsxattr;
2515 /* only root could change project ID */
2516 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2519 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2520 LUSTRE_OPC_ANY, NULL);
2521 if (IS_ERR(op_data))
2522 RETURN(PTR_ERR(op_data));
2524 if (copy_from_user(&fsxattr,
2525 (const struct fsxattr __user *)arg,
2527 GOTO(out_fsxattr1, rc = -EFAULT);
2529 op_data->op_projid = fsxattr.fsx_projid;
2530 op_data->op_attr.ia_valid |= MDS_ATTR_PROJID;
2531 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
2533 ptlrpc_req_finished(req);
2536 ll_finish_md_op_data(op_data);
2543 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2545 struct inode *inode = file_inode(file);
2546 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2550 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2551 PFID(ll_inode2fid(inode)), inode, cmd);
2552 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2554 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2555 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2559 case LL_IOC_GETFLAGS:
2560 /* Get the current value of the file flags */
2561 return put_user(fd->fd_flags, (int __user *)arg);
2562 case LL_IOC_SETFLAGS:
2563 case LL_IOC_CLRFLAGS:
2564 /* Set or clear specific file flags */
2565 /* XXX This probably needs checks to ensure the flags are
2566 * not abused, and to handle any flag side effects.
2568 if (get_user(flags, (int __user *) arg))
2571 if (cmd == LL_IOC_SETFLAGS) {
2572 if ((flags & LL_FILE_IGNORE_LOCK) &&
2573 !(file->f_flags & O_DIRECT)) {
2574 CERROR("%s: unable to disable locking on "
2575 "non-O_DIRECT file\n", current->comm);
2579 fd->fd_flags |= flags;
2581 fd->fd_flags &= ~flags;
2584 case LL_IOC_LOV_SETSTRIPE:
2585 case LL_IOC_LOV_SETSTRIPE_NEW:
2586 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
2587 case LL_IOC_LOV_SETEA:
2588 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
2589 case LL_IOC_LOV_SWAP_LAYOUTS: {
2591 struct lustre_swap_layouts lsl;
2593 if (copy_from_user(&lsl, (char __user *)arg,
2594 sizeof(struct lustre_swap_layouts)))
2597 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
2600 file2 = fget(lsl.sl_fd);
2604 /* O_WRONLY or O_RDWR */
2605 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
2606 GOTO(out, rc = -EPERM);
2608 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
2609 struct inode *inode2;
2610 struct ll_inode_info *lli;
2611 struct obd_client_handle *och = NULL;
2613 if (lsl.sl_flags != SWAP_LAYOUTS_CLOSE)
2614 GOTO(out, rc = -EINVAL);
2616 lli = ll_i2info(inode);
2617 mutex_lock(&lli->lli_och_mutex);
2618 if (fd->fd_lease_och != NULL) {
2619 och = fd->fd_lease_och;
2620 fd->fd_lease_och = NULL;
2622 mutex_unlock(&lli->lli_och_mutex);
2624 GOTO(out, rc = -ENOLCK);
2625 inode2 = file_inode(file2);
2626 rc = ll_swap_layouts_close(och, inode, inode2);
2628 rc = ll_swap_layouts(file, file2, &lsl);
2634 case LL_IOC_LOV_GETSTRIPE:
2635 case LL_IOC_LOV_GETSTRIPE_NEW:
2636 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
2637 case FSFILT_IOC_GETFLAGS:
2638 case FSFILT_IOC_SETFLAGS:
2639 RETURN(ll_iocontrol(inode, file, cmd, arg));
2640 case FSFILT_IOC_GETVERSION_OLD:
2641 case FSFILT_IOC_GETVERSION:
2642 RETURN(put_user(inode->i_generation, (int __user *)arg));
2643 case LL_IOC_GROUP_LOCK:
2644 RETURN(ll_get_grouplock(inode, file, arg));
2645 case LL_IOC_GROUP_UNLOCK:
2646 RETURN(ll_put_grouplock(inode, file, arg));
2647 case IOC_OBD_STATFS:
2648 RETURN(ll_obd_statfs(inode, (void __user *)arg));
2650 /* We need to special case any other ioctls we want to handle,
2651 * to send them to the MDS/OST as appropriate and to properly
2652 * network encode the arg field.
2653 case FSFILT_IOC_SETVERSION_OLD:
2654 case FSFILT_IOC_SETVERSION:
2656 case LL_IOC_FLUSHCTX:
2657 RETURN(ll_flush_ctx(inode));
2658 case LL_IOC_PATH2FID: {
2659 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2660 sizeof(struct lu_fid)))
2665 case LL_IOC_GETPARENT:
2666 RETURN(ll_getparent(file, (struct getparent __user *)arg));
2668 case OBD_IOC_FID2PATH:
2669 RETURN(ll_fid2path(inode, (void __user *)arg));
2670 case LL_IOC_DATA_VERSION: {
2671 struct ioc_data_version idv;
2674 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
2677 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2678 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2681 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2687 case LL_IOC_GET_MDTIDX: {
2690 mdtidx = ll_get_mdt_idx(inode);
2694 if (put_user((int)mdtidx, (int __user *)arg))
2699 case OBD_IOC_GETDTNAME:
2700 case OBD_IOC_GETMDNAME:
2701 RETURN(ll_get_obd_name(inode, cmd, arg));
2702 case LL_IOC_HSM_STATE_GET: {
2703 struct md_op_data *op_data;
2704 struct hsm_user_state *hus;
2711 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2712 LUSTRE_OPC_ANY, hus);
2713 if (IS_ERR(op_data)) {
2715 RETURN(PTR_ERR(op_data));
2718 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2721 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2724 ll_finish_md_op_data(op_data);
2728 case LL_IOC_HSM_STATE_SET: {
2729 struct hsm_state_set *hss;
2736 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2741 rc = ll_hsm_state_set(inode, hss);
2746 case LL_IOC_HSM_ACTION: {
2747 struct md_op_data *op_data;
2748 struct hsm_current_action *hca;
2755 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2756 LUSTRE_OPC_ANY, hca);
2757 if (IS_ERR(op_data)) {
2759 RETURN(PTR_ERR(op_data));
2762 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2765 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2768 ll_finish_md_op_data(op_data);
2772 case LL_IOC_SET_LEASE: {
2773 struct ll_inode_info *lli = ll_i2info(inode);
2774 struct obd_client_handle *och = NULL;
2779 case LL_LEASE_WRLCK:
2780 if (!(file->f_mode & FMODE_WRITE))
2782 fmode = FMODE_WRITE;
2784 case LL_LEASE_RDLCK:
2785 if (!(file->f_mode & FMODE_READ))
2789 case LL_LEASE_UNLCK:
2790 mutex_lock(&lli->lli_och_mutex);
2791 if (fd->fd_lease_och != NULL) {
2792 och = fd->fd_lease_och;
2793 fd->fd_lease_och = NULL;
2795 mutex_unlock(&lli->lli_och_mutex);
2800 fmode = och->och_flags;
2801 rc = ll_lease_close(och, inode, &lease_broken);
2805 rc = ll_lease_och_release(inode, file);
2812 RETURN(ll_lease_type_from_fmode(fmode));
2817 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2819 /* apply for lease */
2820 och = ll_lease_open(inode, file, fmode, 0);
2822 RETURN(PTR_ERR(och));
2825 mutex_lock(&lli->lli_och_mutex);
2826 if (fd->fd_lease_och == NULL) {
2827 fd->fd_lease_och = och;
2830 mutex_unlock(&lli->lli_och_mutex);
2832 /* impossible now that only excl is supported for now */
2833 ll_lease_close(och, inode, &lease_broken);
2838 case LL_IOC_GET_LEASE: {
2839 struct ll_inode_info *lli = ll_i2info(inode);
2840 struct ldlm_lock *lock = NULL;
2843 mutex_lock(&lli->lli_och_mutex);
2844 if (fd->fd_lease_och != NULL) {
2845 struct obd_client_handle *och = fd->fd_lease_och;
2847 lock = ldlm_handle2lock(&och->och_lease_handle);
2849 lock_res_and_lock(lock);
2850 if (!ldlm_is_cancel(lock))
2851 fmode = och->och_flags;
2853 unlock_res_and_lock(lock);
2854 LDLM_LOCK_PUT(lock);
2857 mutex_unlock(&lli->lli_och_mutex);
2859 RETURN(ll_lease_type_from_fmode(fmode));
2861 case LL_IOC_HSM_IMPORT: {
2862 struct hsm_user_import *hui;
2868 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
2873 rc = ll_hsm_import(inode, file, hui);
2878 case LL_IOC_FUTIMES_3: {
2879 struct ll_futimes_3 lfu;
2881 if (copy_from_user(&lfu,
2882 (const struct ll_futimes_3 __user *)arg,
2886 RETURN(ll_file_futimes_3(file, &lfu));
2888 case LL_IOC_LADVISE: {
2889 struct llapi_ladvise_hdr *ladvise_hdr;
2892 int alloc_size = sizeof(*ladvise_hdr);
2895 OBD_ALLOC_PTR(ladvise_hdr);
2896 if (ladvise_hdr == NULL)
2899 if (copy_from_user(ladvise_hdr,
2900 (const struct llapi_ladvise_hdr __user *)arg,
2902 GOTO(out_ladvise, rc = -EFAULT);
2904 if (ladvise_hdr->lah_magic != LADVISE_MAGIC ||
2905 ladvise_hdr->lah_count < 1)
2906 GOTO(out_ladvise, rc = -EINVAL);
2908 num_advise = ladvise_hdr->lah_count;
2909 if (num_advise >= LAH_COUNT_MAX)
2910 GOTO(out_ladvise, rc = -EFBIG);
2912 OBD_FREE_PTR(ladvise_hdr);
2913 alloc_size = offsetof(typeof(*ladvise_hdr),
2914 lah_advise[num_advise]);
2915 OBD_ALLOC(ladvise_hdr, alloc_size);
2916 if (ladvise_hdr == NULL)
2920 * TODO: submit multiple advices to one server in a single RPC
2922 if (copy_from_user(ladvise_hdr,
2923 (const struct llapi_ladvise_hdr __user *)arg,
2925 GOTO(out_ladvise, rc = -EFAULT);
2927 for (i = 0; i < num_advise; i++) {
2928 rc = ll_ladvise(inode, file, ladvise_hdr->lah_flags,
2929 &ladvise_hdr->lah_advise[i]);
2935 OBD_FREE(ladvise_hdr, alloc_size);
2938 case LL_IOC_FSGETXATTR:
2939 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
2940 case LL_IOC_FSSETXATTR:
2941 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
2943 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2944 (void __user *)arg));
2948 #ifndef HAVE_FILE_LLSEEK_SIZE
/* Compat helper (kernels without generic_file_llseek_size): validate a
 * candidate offset and commit it to file->f_pos.  Rejects negative offsets
 * unless FMODE_UNSIGNED_OFFSET is set, rejects offsets beyond @maxsize,
 * and resets f_version when the position actually changes.
 * NOTE(review): the early-return lines of this function are elided in this
 * extraction — confirm against the full source. */
2949 static inline loff_t
2950 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2952 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2954 if (offset > maxsize)
2957 if (offset != file->f_pos) {
2958 file->f_pos = offset;
/* Invalidate cached readdir/seek state tied to the old position. */
2959 file->f_version = 0;
/* Compat implementation of generic_file_llseek_size(): seek within @file
 * honouring @maxsize as the upper bound and @eof as the logical end of
 * file (used by SEEK_END/SEEK_HOLE/SEEK_DATA handling below).
 * NOTE(review): the switch-on-origin scaffolding is elided in this
 * extraction; only the SEEK_CUR fast path and the final commit survive. */
2965 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2966 loff_t maxsize, loff_t eof)
2968 struct inode *inode = file_inode(file);
2976 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2977 * position-querying operation. Avoid rewriting the "same"
2978 * f_pos value back to the file because a concurrent read(),
2979 * write() or lseek() might have altered it
2984 * f_lock protects against read/modify/write race with other
2985 * SEEK_CURs. Note that parallel writes and reads behave
/* SEEK_CUR: compute relative to the current f_pos under the inode lock. */
2989 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2990 inode_unlock(inode);
2994 * In the generic case the entire file is data, so as long as
2995 * offset isn't at the end of the file then the offset is data.
3002 * There is a virtual hole at the end of the file, so as long as
3003 * offset isn't i_size or larger, return i_size.
/* Default path: validate and commit the absolute offset. */
3011 return llseek_execute(file, offset, maxsize);
/* llseek() entry point for Lustre regular files.  For SEEK_END, SEEK_HOLE
 * and SEEK_DATA the cluster-wide size must be fetched first via
 * ll_glimpse_size() before i_size_read() is meaningful. */
3015 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3017 struct inode *inode = file_inode(file);
3018 loff_t retval, eof = 0;
/* Precompute the target only for the trace message below; the real seek
 * is delegated to ll_generic_file_llseek_size(). */
3021 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3022 (origin == SEEK_CUR) ? file->f_pos : 0);
3023 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3024 PFID(ll_inode2fid(inode)), inode, retval, retval,
3026 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3028 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
/* Glimpse refreshes i_size from the OSTs so eof is trustworthy. */
3029 retval = ll_glimpse_size(inode);
3032 eof = i_size_read(inode);
3035 retval = ll_generic_file_llseek_size(file, offset, origin,
3036 ll_file_maxbytes(inode), eof);
/* flush() entry point (called on every close of a file descriptor).
 * Reports any asynchronous writeback error recorded on the inode or the
 * cl_object, collapsing it to -EIO, but suppresses the report if this fd
 * already returned a write failure to the application. */
3040 static int ll_flush(struct file *file, fl_owner_t id)
3042 struct inode *inode = file_inode(file);
3043 struct ll_inode_info *lli = ll_i2info(inode);
3044 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
/* flush is only wired up for regular files; directories use other paths. */
3047 LASSERT(!S_ISDIR(inode->i_mode));
3049 /* catch async errors that were recorded back when async writeback
3050 * failed for pages in this mapping. */
3051 rc = lli->lli_async_rc;
3052 lli->lli_async_rc = 0;
3053 if (lli->lli_clob != NULL) {
3054 err = lov_read_and_clear_async_rc(lli->lli_clob);
3059 /* The application has been told write failure already.
3060 * Do not report failure again. */
3061 if (fd->fd_write_failed)
/* Any recorded error is normalized to -EIO for the VFS. */
3063 return rc ? -EIO : 0;
3067 * Called to make sure a portion of file has been written out.
3068 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
3070 * Return how many pages have been written.
/* Drives a CIT_FSYNC cl_io over [start, end] of @inode.  On success the
 * return value is fio->fi_nr_written (pages written); on failure the
 * io's ci_result is propagated.  @ignore_layout skips layout validity
 * checks for callers that hold/bypass the layout lock. */
3072 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3073 enum cl_fsync_mode mode, int ignore_layout)
3077 struct cl_fsync_io *fio;
/* Reject unknown fsync modes up front. */
3082 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3083 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3086 env = cl_env_get(&refcheck);
3088 RETURN(PTR_ERR(env));
3090 io = vvp_env_thread_io(env);
3091 io->ci_obj = ll_i2info(inode)->lli_clob;
3092 io->ci_ignore_layout = ignore_layout;
3094 /* initialize parameters for sync */
3095 fio = &io->u.ci_fsync;
3096 fio->fi_start = start;
3098 fio->fi_fid = ll_inode2fid(inode);
3099 fio->fi_mode = mode;
3100 fio->fi_nr_written = 0;
3102 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3103 result = cl_io_loop(env, io);
3105 result = io->ci_result;
3107 result = fio->fi_nr_written;
/* Always tear down the io and release the environment. */
3108 cl_io_fini(env, io);
3109 cl_env_put(env, &refcheck);
3115 * When dentry is provided (the 'else' case), file_dentry() may be
3116 * null and dentry must be used directly rather than pulled from
3117 * file_dentry() as is done otherwise.
/* fsync() entry point; three prototypes are selected by kernel-version
 * compat macros (4-arg range fsync, 2-arg, or legacy dentry form).
 * Flushes dirty pages, reports recorded async writeback errors, syncs
 * metadata via md_fsync(), and for regular files syncs data with
 * cl_sync_file_range(CL_FSYNC_ALL), updating fd_write_failed. */
3120 #ifdef HAVE_FILE_FSYNC_4ARGS
3121 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3123 struct dentry *dentry = file_dentry(file);
3125 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3126 int ll_fsync(struct file *file, int datasync)
3128 struct dentry *dentry = file_dentry(file);
3130 loff_t end = LLONG_MAX;
3132 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3135 loff_t end = LLONG_MAX;
3137 struct inode *inode = dentry->d_inode;
3138 struct ll_inode_info *lli = ll_i2info(inode);
3139 struct ptlrpc_request *req;
3143 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3144 PFID(ll_inode2fid(inode)), inode);
3145 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3147 #ifdef HAVE_FILE_FSYNC_4ARGS
/* Range variant: write and wait ourselves, then take the inode lock
 * unless the caller already holds it (lli_inode_locked). */
3148 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3149 lock_inode = !lli->lli_inode_locked;
3153 /* fsync's caller has already called _fdata{sync,write}, we want
3154 * that IO to finish before calling the osc and mdc sync methods */
3155 rc = filemap_fdatawait(inode->i_mapping);
3158 /* catch async errors that were recorded back when async writeback
3159 * failed for pages in this mapping. */
3160 if (!S_ISDIR(inode->i_mode)) {
3161 err = lli->lli_async_rc;
3162 lli->lli_async_rc = 0;
3165 if (lli->lli_clob != NULL) {
3166 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* Sync metadata on the MDS. */
3172 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3176 ptlrpc_req_finished(req);
3178 if (S_ISREG(inode->i_mode)) {
3179 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
/* Sync file data on the OSTs; remember failure per-fd so ll_flush()
 * does not report the same error twice. */
3181 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3182 if (rc == 0 && err < 0)
3185 fd->fd_write_failed = true;
3187 fd->fd_write_failed = false;
3190 #ifdef HAVE_FILE_FSYNC_4ARGS
3192 inode_unlock(inode);
/* flock()/fcntl() lock entry point.  Translates a kernel file_lock into
 * an LDLM_FLOCK enqueue on the MDS, then mirrors the server's decision
 * into the local lock tables (locks_lock_file_wait() or the older
 * flock/posix split).  On local-apply failure the server lock is undone
 * with an LCK_NL (unlock) enqueue. */
3198 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3200 struct inode *inode = file_inode(file);
3201 struct ll_sb_info *sbi = ll_i2sbi(inode);
3202 struct ldlm_enqueue_info einfo = {
3203 .ei_type = LDLM_FLOCK,
3204 .ei_cb_cp = ldlm_flock_completion_ast,
3205 .ei_cbdata = file_lock,
3207 struct md_op_data *op_data;
3208 struct lustre_handle lockh = { 0 };
3209 union ldlm_policy_data flock = { { 0 } };
3210 int fl_type = file_lock->fl_type;
3216 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3217 PFID(ll_inode2fid(inode)), file_lock);
3219 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3221 if (file_lock->fl_flags & FL_FLOCK) {
3222 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3223 /* flocks are whole-file locks */
3224 flock.l_flock.end = OFFSET_MAX;
3225 /* For flocks owner is determined by the local file desctiptor*/
3226 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3227 } else if (file_lock->fl_flags & FL_POSIX) {
3228 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3229 flock.l_flock.start = file_lock->fl_start;
3230 flock.l_flock.end = file_lock->fl_end;
3234 flock.l_flock.pid = file_lock->fl_pid;
3236 /* Somewhat ugly workaround for svc lockd.
3237 * lockd installs custom fl_lmops->lm_compare_owner that checks
3238 * for the fl_owner to be the same (which it always is on local node
3239 * I guess between lockd processes) and then compares pid.
3240 * As such we assign pid to the owner field to make it all work,
3241 * conflict with normal locks is unlikely since pid space and
3242 * pointer space for current->files are not intersecting */
3243 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3244 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* Map fcntl lock types onto LDLM modes: F_RDLCK->PR, F_WRLCK->PW,
 * F_UNLCK->NL (see comment below).
 * NOTE(review): the switch scaffolding is elided in this extraction. */
3248 einfo.ei_mode = LCK_PR;
3251 /* An unlock request may or may not have any relation to
3252 * existing locks so we may not be able to pass a lock handle
3253 * via a normal ldlm_lock_cancel() request. The request may even
3254 * unlock a byte range in the middle of an existing lock. In
3255 * order to process an unlock request we need all of the same
3256 * information that is given with a normal read or write record
3257 * lock request. To avoid creating another ldlm unlock (cancel)
3258 * message we'll treat a LCK_NL flock request as an unlock. */
3259 einfo.ei_mode = LCK_NL;
3262 einfo.ei_mode = LCK_PW;
3265 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* Non-blocking set requests map to LDLM_FL_BLOCK_NOWAIT; GETLK-style
 * queries map to LDLM_FL_TEST_LOCK. */
3280 flags = LDLM_FL_BLOCK_NOWAIT;
3286 flags = LDLM_FL_TEST_LOCK;
3289 CERROR("unknown fcntl lock command: %d\n", cmd);
3293 /* Save the old mode so that if the mode in the lock changes we
3294 * can decrement the appropriate reader or writer refcount. */
3295 file_lock->fl_type = einfo.ei_mode;
3297 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3298 LUSTRE_OPC_ANY, NULL);
3299 if (IS_ERR(op_data))
3300 RETURN(PTR_ERR(op_data));
3302 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
3303 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
3304 flock.l_flock.pid, flags, einfo.ei_mode,
3305 flock.l_flock.start, flock.l_flock.end);
/* Enqueue the flock on the MDS. */
3307 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
3310 /* Restore the file lock type if not TEST lock. */
3311 if (!(flags & LDLM_FL_TEST_LOCK))
3312 file_lock->fl_type = fl_type;
3314 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
/* Newer kernels: one helper handles both flock and posix locks. */
3315 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
3316 !(flags & LDLM_FL_TEST_LOCK))
3317 rc2 = locks_lock_file_wait(file, file_lock);
3319 if ((file_lock->fl_flags & FL_FLOCK) &&
3320 (rc == 0 || file_lock->fl_type == F_UNLCK))
3321 rc2 = flock_lock_file_wait(file, file_lock);
3322 if ((file_lock->fl_flags & FL_POSIX) &&
3323 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3324 !(flags & LDLM_FL_TEST_LOCK))
3325 rc2 = posix_lock_file_wait(file, file_lock);
3326 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
3328 if (rc2 && file_lock->fl_type != F_UNLCK) {
/* Local apply failed: release the server-side lock we just took. */
3329 einfo.ei_mode = LCK_NL;
3330 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
3335 ll_finish_md_op_data(op_data);
/* Look up the FID of @name under @parent via an MDS getattr-by-name RPC.
 * On success stores the FID in *@fid; if @inode is non-NULL, also
 * instantiates the inode from the reply via ll_prep_inode(). */
3340 int ll_get_fid_by_name(struct inode *parent, const char *name,
3341 int namelen, struct lu_fid *fid,
3342 struct inode **inode)
3344 struct md_op_data *op_data = NULL;
3345 struct mdt_body *body;
3346 struct ptlrpc_request *req;
3350 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3351 LUSTRE_OPC_ANY, NULL);
3352 if (IS_ERR(op_data))
3353 RETURN(PTR_ERR(op_data));
/* Only FID and type are needed from the reply. */
3355 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
3356 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3357 ll_finish_md_op_data(op_data);
3361 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3363 GOTO(out_req, rc = -EFAULT);
3365 *fid = body->mbo_fid1;
3368 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
3370 ptlrpc_req_finished(req);
/* Migrate the entry @name under @parent to MDT @mdtidx via a
 * MDS_RENAME_MIGRATE rename onto itself.  For regular files a write
 * lease is taken first and the data version is pinned so the server can
 * detect concurrent modification; retries on -EAGAIN if the layout
 * changed mid-migration. */
3374 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3375 const char *name, int namelen)
3377 struct dentry *dchild = NULL;
3378 struct inode *child_inode = NULL;
3379 struct md_op_data *op_data;
3380 struct ptlrpc_request *request = NULL;
3381 struct obd_client_handle *och = NULL;
3383 struct mdt_body *body;
3385 __u64 data_version = 0;
3388 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3389 name, PFID(ll_inode2fid(parent)), mdtidx);
3391 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3392 0, LUSTRE_OPC_ANY, NULL);
3393 if (IS_ERR(op_data))
3394 RETURN(PTR_ERR(op_data));
3396 /* Get child FID first */
3397 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
/* Prefer a cached dentry; fall back to an MDS lookup by name. */
3400 dchild = d_lookup(file_dentry(file), &qstr);
3401 if (dchild != NULL) {
3402 if (dchild->d_inode != NULL)
3403 child_inode = igrab(dchild->d_inode);
3407 if (child_inode == NULL) {
3408 rc = ll_get_fid_by_name(parent, name, namelen,
3409 &op_data->op_fid3, &child_inode);
3414 if (child_inode == NULL)
3415 GOTO(out_free, rc = -EINVAL);
3418 * lfs migrate command needs to be blocked on the client
3419 * by checking the migrate FID against the FID of the
/* Never migrate the filesystem root. */
3422 if (child_inode == parent->i_sb->s_root->d_inode)
3423 GOTO(out_iput, rc = -EINVAL);
3425 inode_lock(child_inode);
3426 op_data->op_fid3 = *ll_inode2fid(child_inode);
3427 if (!fid_is_sane(&op_data->op_fid3)) {
3428 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
3429 ll_get_fsname(parent->i_sb, NULL, 0), name,
3430 PFID(&op_data->op_fid3));
3431 GOTO(out_unlock, rc = -EINVAL);
/* Short-circuit if the file already lives on the target MDT. */
3434 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3436 GOTO(out_unlock, rc);
3439 CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name,
3440 PFID(&op_data->op_fid3), mdtidx);
3441 GOTO(out_unlock, rc = 0);
3444 if (S_ISREG(child_inode->i_mode)) {
/* Write lease guards the data against concurrent writers during
 * the migration. */
3445 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
3449 GOTO(out_unlock, rc);
3452 rc = ll_data_version(child_inode, &data_version,
3455 GOTO(out_close, rc);
3457 op_data->op_handle = och->och_fh;
3458 op_data->op_data = och->och_mod;
3459 op_data->op_data_version = data_version;
3460 op_data->op_lease_handle = och->och_lease_handle;
3461 op_data->op_bias |= MDS_RENAME_MIGRATE;
3464 op_data->op_mds = mdtidx;
3465 op_data->op_cli_flags = CLI_MIGRATE;
/* Migration is expressed as a rename of the entry onto itself. */
3466 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3467 namelen, name, namelen, &request);
3469 LASSERT(request != NULL);
3470 ll_update_times(request, parent);
3472 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
3473 LASSERT(body != NULL);
3475 /* If the server does release layout lock, then we cleanup
3476 * the client och here, otherwise release it in out_close: */
3478 body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
3479 obd_mod_put(och->och_mod);
3480 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
3482 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
3488 if (request != NULL) {
3489 ptlrpc_req_finished(request);
3493 /* Try again if the file layout has changed. */
3494 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
3498 if (och != NULL) /* close the file */
3499 ll_lease_close(och, child_inode, NULL);
3501 clear_nlink(child_inode);
3503 inode_unlock(child_inode);
3507 ll_finish_md_op_data(op_data);
3512 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3520 * test if some locks matching bits and l_req_mode are acquired
3521 * - bits can be in different locks
3522 * - if found clear the common lock bits in *bits
3523 * - the bits not found, are kept in *bits
3525 * \param bits [IN] searched lock bits [IN]
3526 * \param l_req_mode [IN] searched lock mode
3527 * \retval boolean, true iff all bits are found
3529 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
3531 struct lustre_handle lockh;
3532 union ldlm_policy_data policy;
/* LCK_MINMODE means "any mode": match CR|CW|PR|PW. */
3533 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
3534 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
3543 fid = &ll_i2info(inode)->lli_fid;
3544 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3545 ldlm_lockname[mode]);
/* TEST_LOCK: probe without taking references on matched locks. */
3547 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
3548 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3549 policy.l_inodebits.bits = *bits & (1 << i);
3550 if (policy.l_inodebits.bits == 0)
3553 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3554 &policy, mode, &lockh)) {
3555 struct ldlm_lock *lock;
3557 lock = ldlm_handle2lock(&lockh);
/* Clear every bit the matched lock covers, not just the probed one. */
3560 ~(lock->l_policy_data.l_inodebits.bits);
3561 LDLM_LOCK_PUT(lock);
3563 *bits &= ~policy.l_inodebits.bits;
/* Match (and take a reference on) an MDS IBITS lock covering @bits on
 * @inode's resource.  Returns the matched mode (0 if none); the handle
 * is returned in *@lockh for the caller to release. */
3570 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
3571 struct lustre_handle *lockh, __u64 flags,
3572 enum ldlm_mode mode)
3574 union ldlm_policy_data policy = { .l_inodebits = { bits } };
3579 fid = &ll_i2info(inode)->lli_fid;
3580 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3582 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3583 fid, LDLM_IBITS, &policy, mode, lockh);
/* Post-process a revalidation RPC result.  -ENOENT on a plain file or
 * directory means "already unlinked" and is normalized; striped
 * directories with a bad stripe are left for re-revalidation.  Other
 * errors are logged (quietly for -EACCES/-EIDRM) and passed through. */
3588 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3590 /* Already unlinked. Just update nlink and return success */
3591 if (rc == -ENOENT) {
3593 /* If it is striped directory, and there is bad stripe
3594 * Let's revalidate the dentry again, instead of returning
3596 if (S_ISDIR(inode->i_mode) &&
3597 ll_i2info(inode)->lli_lsm_md != NULL)
3600 /* This path cannot be hit for regular files unless in
3601 * case of obscure races, so no need to to validate
3603 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3605 } else if (rc != 0) {
3606 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3607 "%s: revalidate FID "DFID" error: rc = %d\n",
3608 ll_get_fsname(inode->i_sb, NULL, 0),
3609 PFID(ll_inode2fid(inode)), rc);
/* Revalidate @dentry's inode attributes against the MDS for the lock
 * bits in @ibits.  With OBD_CONNECT_ATTRFID the getattr is done by FID
 * via an intent lock; otherwise a plain md_getattr() is issued only when
 * no matching IBITS lock is already cached locally. */
3615 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3617 struct inode *inode = dentry->d_inode;
3618 struct ptlrpc_request *req = NULL;
3619 struct obd_export *exp;
3623 LASSERT(inode != NULL);
3625 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3626 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3628 exp = ll_i2mdexp(inode);
3630 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3631 * But under CMD case, it caused some lock issues, should be fixed
3632 * with new CMD ibits lock. See bug 12718 */
3633 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3634 struct lookup_intent oit = { .it_op = IT_GETATTR };
3635 struct md_op_data *op_data;
3637 if (ibits == MDS_INODELOCK_LOOKUP)
3638 oit.it_op = IT_LOOKUP;
3640 /* Call getattr by fid, so do not provide name at all. */
3641 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3642 dentry->d_inode, NULL, 0, 0,
3643 LUSTRE_OPC_ANY, NULL);
3644 if (IS_ERR(op_data))
3645 RETURN(PTR_ERR(op_data));
3647 rc = md_intent_lock(exp, op_data, &oit, &req,
3648 &ll_md_blocking_ast, 0);
3649 ll_finish_md_op_data(op_data);
3651 rc = ll_inode_revalidate_fini(inode, rc);
3655 rc = ll_revalidate_it_finish(req, &oit, dentry);
3657 ll_intent_release(&oit);
3661 /* Unlinked? Unhash dentry, so it is not picked up later by
3662 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3663 here to preserve get_cwd functionality on 2.6.
3665 if (!dentry->d_inode->i_nlink) {
3666 ll_lock_dcache(inode);
3667 d_lustre_invalidate(dentry, 0);
3668 ll_unlock_dcache(inode);
3671 ll_lookup_finish_locks(&oit, dentry);
3672 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
/* No cached lock covers the requested bits: fetch attributes. */
3673 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3674 u64 valid = OBD_MD_FLGETATTR;
3675 struct md_op_data *op_data;
3678 if (S_ISREG(inode->i_mode)) {
/* Regular files also need the EA (striping) in the reply. */
3679 rc = ll_get_default_mdsize(sbi, &ealen);
3682 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3685 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3686 0, ealen, LUSTRE_OPC_ANY,
3688 if (IS_ERR(op_data))
3689 RETURN(PTR_ERR(op_data));
3691 op_data->op_valid = valid;
3692 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3693 ll_finish_md_op_data(op_data);
3695 rc = ll_inode_revalidate_fini(inode, rc);
3699 rc = ll_prep_inode(&inode, req, NULL, NULL);
3702 ptlrpc_req_finished(req);
/* For a striped directory, merge the per-stripe attributes (nlink,
 * blocks, size, a/m/ctime) from all MDTs into @inode via md_merge_attr(). */
3706 static int ll_merge_md_attr(struct inode *inode)
3708 struct cl_attr attr = { 0 };
/* Caller must only invoke this on striped directories. */
3711 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3712 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3713 &attr, ll_md_blocking_ast);
3717 set_nlink(inode, attr.cat_nlink);
3718 inode->i_blocks = attr.cat_blocks;
3719 i_size_write(inode, attr.cat_size);
3721 ll_i2info(inode)->lli_atime = attr.cat_atime;
3722 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
3723 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/* Full revalidation: refresh MDS attributes via __ll_inode_revalidate(),
 * then for striped directories merge per-stripe attrs, copy cached
 * timestamps into the inode, and for regular files glimpse the size from
 * the OSTs (skipped during HSM restore, when the MDT size is current). */
3729 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3731 struct inode *inode = dentry->d_inode;
3735 rc = __ll_inode_revalidate(dentry, ibits);
3739 /* if object isn't regular file, don't validate size */
3740 if (!S_ISREG(inode->i_mode)) {
3741 if (S_ISDIR(inode->i_mode) &&
3742 ll_i2info(inode)->lli_lsm_md != NULL) {
3743 rc = ll_merge_md_attr(inode);
3748 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
3749 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
3750 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
3752 /* In case of restore, the MDT has the right size and has
3753 * already send it back without granting the layout lock,
3754 * inode is up-to-date so glimpse is useless.
3755 * Also to glimpse we need the layout, in case of a running
3756 * restore the MDT holds the layout lock so the glimpse will
3757 * block up to the end of restore (getattr will block)
3759 if (!ll_file_test_flag(ll_i2info(inode), LLIF_FILE_RESTORING))
3760 rc = ll_glimpse_size(inode);
/* Squash a dev_t so its major and minor each fit in 8 bits, purely to
 * satisfy old_valid_dev() in the 32-bit compat stat path. */
3765 static inline dev_t ll_compat_encode_dev(dev_t dev)
3767 /* The compat_sys_*stat*() syscalls will fail unless the
3768 * device majors and minors are both less than 256. Note that
3769 * the value returned here will be passed through
3770 * old_encode_dev() in cp_compat_stat(). And so we are not
3771 * trying to return a valid compat (u16) device number, just
3772 * one that will pass the old_valid_dev() check. */
3774 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
/* getattr() entry point (two prototypes via compat macro).  Revalidates
 * UPDATE|LOOKUP bits first, then fills *stat from the inode; for 32-bit
 * userspace the inode number and devices are compat-encoded. */
3777 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
3778 int ll_getattr(const struct path *path, struct kstat *stat,
3779 u32 request_mask, unsigned int flags)
3782 struct dentry *de = path->dentry;
3784 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3787 struct inode *inode = de->d_inode;
3788 struct ll_sb_info *sbi = ll_i2sbi(inode);
3789 struct ll_inode_info *lli = ll_i2info(inode);
3792 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3793 MDS_INODELOCK_LOOKUP);
3794 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
/* Fault-injection point for getattr delay testing. */
3799 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
3801 if (ll_need_32bit_api(sbi)) {
3802 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3803 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
3804 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
3806 stat->ino = inode->i_ino;
3807 stat->dev = inode->i_sb->s_dev;
3808 stat->rdev = inode->i_rdev;
3811 stat->mode = inode->i_mode;
3812 stat->uid = inode->i_uid;
3813 stat->gid = inode->i_gid;
3814 stat->atime = inode->i_atime;
3815 stat->mtime = inode->i_mtime;
3816 stat->ctime = inode->i_ctime;
/* Prefer the tunable stat blksize when configured. */
3817 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
3819 stat->nlink = inode->i_nlink;
3820 stat->size = i_size_read(inode);
3821 stat->blocks = inode->i_blocks;
/* fiemap() entry point: marshal the kernel fiemap_extent_info into a
 * contiguous struct fiemap buffer (header + extent array), run
 * ll_do_fiemap(), and copy the mapped extents back to userspace. */
3826 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3827 __u64 start, __u64 len)
3831 struct fiemap *fiemap;
3832 unsigned int extent_count = fieinfo->fi_extents_max;
3834 num_bytes = sizeof(*fiemap) + (extent_count *
3835 sizeof(struct fiemap_extent));
3836 OBD_ALLOC_LARGE(fiemap, num_bytes);
3841 fiemap->fm_flags = fieinfo->fi_flags;
3842 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3843 fiemap->fm_start = start;
3844 fiemap->fm_length = len;
/* Only the first user extent is read in: it may carry the continuation
 * cookie (fe_logical) for a resumed mapping. */
3845 if (extent_count > 0 &&
3846 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3847 sizeof(struct fiemap_extent)) != 0)
3848 GOTO(out, rc = -EFAULT);
3850 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3852 fieinfo->fi_flags = fiemap->fm_flags;
3853 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3854 if (extent_count > 0 &&
3855 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3856 fiemap->fm_mapped_extents *
3857 sizeof(struct fiemap_extent)) != 0)
3858 GOTO(out, rc = -EFAULT);
3860 OBD_FREE_LARGE(fiemap, num_bytes);
/* get_acl() entry point: return a referenced copy of the cached POSIX
 * ACL under lli_lock; the VFS releases the reference after the check. */
3864 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3866 struct ll_inode_info *lli = ll_i2info(inode);
3867 struct posix_acl *acl = NULL;
3870 spin_lock(&lli->lli_lock);
3871 /* VFS' acl_permission_check->check_acl will release the refcount */
3872 acl = posix_acl_dup(lli->lli_posix_acl);
3873 spin_unlock(&lli->lli_lock);
3878 #ifdef HAVE_IOP_SET_ACL
3879 #ifdef CONFIG_FS_POSIX_ACL
/* set_acl() entry point: serialize @acl to its xattr representation and
 * store it via __vfs_setxattr(), then update the kernel's ACL cache.
 * ACL_TYPE_ACCESS may also rewrite i_mode (posix_acl_update_mode);
 * ACL_TYPE_DEFAULT is only valid on directories. */
3880 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
3882 const char *name = NULL;
3889 case ACL_TYPE_ACCESS:
3891 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
3895 name = XATTR_NAME_POSIX_ACL_ACCESS;
3897 case ACL_TYPE_DEFAULT:
3898 if (!S_ISDIR(inode->i_mode))
3899 GOTO(out, rc = acl ? -EACCES : 0);
3900 name = XATTR_NAME_POSIX_ACL_DEFAULT;
3903 GOTO(out, rc = -EINVAL);
3907 size = posix_acl_xattr_size(acl->a_count);
3908 value = kmalloc(size, GFP_NOFS);
3910 GOTO(out, rc = -ENOMEM);
3912 rc = posix_acl_to_xattr(&init_user_ns, acl, value, size);
3917 /* dentry is only used for *.lov attributes so it's safe to be NULL */
3918 rc = __vfs_setxattr(NULL, inode, name, value, size, XATTR_CREATE);
/* Keep the kernel ACL cache coherent with the store result. */
3923 set_cached_acl(inode, type, acl);
3925 forget_cached_acl(inode, type);
3928 #endif /* CONFIG_FS_POSIX_ACL */
3929 #endif /* HAVE_IOP_SET_ACL */
3931 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
/* Compat ACL check callback for older generic_permission() signatures:
 * fetch the access ACL and evaluate @mask against it.  Under RCU walk
 * (IPERM_FLAG_RCU) the check is deferred (cannot block here). */
3933 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3934 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3936 ll_check_acl(struct inode *inode, int mask)
3939 # ifdef CONFIG_FS_POSIX_ACL
3940 struct posix_acl *acl;
3944 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3945 if (flags & IPERM_FLAG_RCU)
3948 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3953 rc = posix_acl_permission(inode, acl, mask);
3954 posix_acl_release(acl);
3957 # else /* !CONFIG_FS_POSIX_ACL */
3959 # endif /* CONFIG_FS_POSIX_ACL */
3961 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/* permission() entry point (prototype chosen by compat macros).
 * Revalidates the root inode on first access, applies root squash by
 * overriding credentials (fsuid/fsgid and FS capabilities) when the
 * caller is root and squashing is configured, then delegates to
 * generic_permission() with the ACL callback. */
3963 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3964 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3966 # ifdef HAVE_INODE_PERMISION_2ARGS
3967 int ll_inode_permission(struct inode *inode, int mask)
3969 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3974 struct ll_sb_info *sbi;
3975 struct root_squash_info *squash;
3976 struct cred *cred = NULL;
3977 const struct cred *old_cred = NULL;
3979 bool squash_id = false;
/* RCU-walk cannot block on RPCs: bail out to ref-walk. */
3982 #ifdef MAY_NOT_BLOCK
3983 if (mask & MAY_NOT_BLOCK)
3985 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3986 if (flags & IPERM_FLAG_RCU)
3990 /* as root inode are NOT getting validated in lookup operation,
3991 * need to do it before permission check. */
3993 if (inode == inode->i_sb->s_root->d_inode) {
3994 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3995 MDS_INODELOCK_LOOKUP);
4000 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4001 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4003 /* squash fsuid/fsgid if needed */
4004 sbi = ll_i2sbi(inode);
4005 squash = &sbi->ll_squash;
4006 if (unlikely(squash->rsi_uid != 0 &&
4007 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4008 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4012 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4013 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4014 squash->rsi_uid, squash->rsi_gid);
4016 /* update current process's credentials
4017 * and FS capability */
4018 cred = prepare_creds();
4022 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4023 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* Drop all filesystem-related capabilities while squashed. */
4024 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4025 if ((1 << cap) & CFS_CAP_FS_MASK)
4026 cap_lower(cred->cap_effective, cap);
4028 old_cred = override_creds(cred);
4031 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4032 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4033 /* restore current process's credentials and FS capability */
4035 revert_creds(old_cred);
4042 /* -o localflock - only provides locally consistent flock locks */
/* Default file_operations: no .flock/.lock methods, so the kernel falls
 * back to node-local POSIX/flock locking (no cluster-wide coherency).
 * The read/write entries vary with kernel API support:
 *  - read_iter/write_iter kernels use ll_file_{read,write}_iter
 *    (plus new_sync_read/write shims when needed);
 *  - older kernels use ll_file_{read,write} + aio variants. */
4043 struct file_operations ll_file_operations = {
4044 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4045 # ifdef HAVE_SYNC_READ_WRITE
4046 	.read		= new_sync_read,
4047 	.write		= new_sync_write,
4049 	.read_iter	= ll_file_read_iter,
4050 	.write_iter	= ll_file_write_iter,
4051 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4052 	.read		= ll_file_read,
4053 	.aio_read	= ll_file_aio_read,
4054 	.write		= ll_file_write,
4055 	.aio_write	= ll_file_aio_write,
4056 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4057 	.unlocked_ioctl	= ll_file_ioctl,
4058 	.open		= ll_file_open,
4059 	.release	= ll_file_release,
4060 	.mmap		= ll_file_mmap,
4061 	.llseek		= ll_file_seek,
4062 	.splice_read	= ll_file_splice_read,
/* -o flock: same entries as ll_file_operations plus .flock/.lock wired
 * to ll_file_flock, providing cluster-coherent flock/POSIX locks backed
 * by LDLM. */
4067 struct file_operations ll_file_operations_flock = {
4068 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4069 # ifdef HAVE_SYNC_READ_WRITE
4070 	.read		= new_sync_read,
4071 	.write		= new_sync_write,
4072 # endif /* HAVE_SYNC_READ_WRITE */
4073 	.read_iter	= ll_file_read_iter,
4074 	.write_iter	= ll_file_write_iter,
4075 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4076 	.read		= ll_file_read,
4077 	.aio_read	= ll_file_aio_read,
4078 	.write		= ll_file_write,
4079 	.aio_write	= ll_file_aio_write,
4080 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4081 	.unlocked_ioctl	= ll_file_ioctl,
4082 	.open		= ll_file_open,
4083 	.release	= ll_file_release,
4084 	.mmap		= ll_file_mmap,
4085 	.llseek		= ll_file_seek,
4086 	.splice_read	= ll_file_splice_read,
4089 	.flock		= ll_file_flock,
4090 	.lock		= ll_file_flock
4093 /* These are for -o noflock - to return ENOSYS on flock calls */
/* Identical to the flock variant except .flock/.lock point at
 * ll_file_noflock so every file-lock request fails explicitly. */
4094 struct file_operations ll_file_operations_noflock = {
4095 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4096 # ifdef HAVE_SYNC_READ_WRITE
4097 	.read		= new_sync_read,
4098 	.write		= new_sync_write,
4099 # endif /* HAVE_SYNC_READ_WRITE */
4100 	.read_iter	= ll_file_read_iter,
4101 	.write_iter	= ll_file_write_iter,
4102 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4103 	.read		= ll_file_read,
4104 	.aio_read	= ll_file_aio_read,
4105 	.write		= ll_file_write,
4106 	.aio_write	= ll_file_aio_write,
4107 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4108 	.unlocked_ioctl	= ll_file_ioctl,
4109 	.open		= ll_file_open,
4110 	.release	= ll_file_release,
4111 	.mmap		= ll_file_mmap,
4112 	.llseek		= ll_file_seek,
4113 	.splice_read	= ll_file_splice_read,
4116 	.flock		= ll_file_noflock,
4117 	.lock		= ll_file_noflock
/* inode_operations for regular Lustre files; xattr and ACL entries are
 * compiled in only on kernels exposing the matching iop hooks. */
4120 struct inode_operations ll_file_inode_operations = {
4121 	.setattr	= ll_setattr,
4122 	.getattr	= ll_getattr,
4123 	.permission	= ll_inode_permission,
4124 #ifdef HAVE_IOP_XATTR
4125 	.setxattr	= ll_setxattr,
4126 	.getxattr	= ll_getxattr,
4127 	.removexattr	= ll_removexattr,
4129 	.listxattr	= ll_listxattr,
4130 	.fiemap		= ll_fiemap,
4131 #ifdef HAVE_IOP_GET_ACL
4132 	.get_acl	= ll_get_acl,
4134 #ifdef HAVE_IOP_SET_ACL
4135 	.set_acl	= ll_set_acl,
/*
 * Push a layout configuration down to the inode's cl_object.
 *
 * \param inode	inode whose layout is being configured
 * \param conf	configuration op: OBJECT_CONF_SET applies a new layout,
 *		OBJECT_CONF_WAIT waits for in-flight users (see callers)
 *
 * For OBJECT_CONF_SET the layout lock is only allowed to be matched
 * *after* the layout has been applied, so concurrent matchers never
 * observe a stale layout; the inode's layout generation is then
 * refreshed from the object.
 *
 * \retval 0 on success, negative errno otherwise.
 */
4139 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4141 	struct ll_inode_info *lli = ll_i2info(inode);
4142 	struct cl_object *obj = lli->lli_clob;
4151 	env = cl_env_get(&refcheck);
4153 		RETURN(PTR_ERR(env));
4155 	rc = cl_conf_set(env, lli->lli_clob, conf);
4159 	if (conf->coc_opc == OBJECT_CONF_SET) {
4160 		struct ldlm_lock *lock = conf->coc_lock;
4161 		struct cl_layout cl = {
4165 		LASSERT(lock != NULL);
4166 		LASSERT(ldlm_has_layout(lock));
4168 		/* it can only be allowed to match after layout is
4169 		 * applied to inode otherwise false layout would be
4170 		 * seen. Applying layout shoud happen before dropping
4171 		 * the intent lock. */
4172 		ldlm_lock_allow_match(lock);
4174 		rc = cl_object_layout_get(env, obj, &cl);
4179 			       DFID": layout version change: %u -> %u\n",
4180 			       PFID(&lli->lli_fid), ll_layout_version_get(lli),
4182 			ll_layout_version_set(lli, cl.cl_layout_gen);
4186 	cl_env_put(env, &refcheck);
4191 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
/*
 * \param inode	inode the layout belongs to
 * \param lock	granted layout lock whose LVB should hold the layout
 *
 * If the lock already carries LVB data, nothing to do.  Otherwise the
 * LOV EA is fetched via md_getxattr() and a copy is installed as the
 * lock's LVB under the resource lock; on the race where another thread
 * installed it first, our copy is freed instead.
 *
 * \retval 0 on success, negative errno otherwise.
 */
4192 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4195 	struct ll_sb_info *sbi = ll_i2sbi(inode);
4196 	struct ptlrpc_request *req;
4197 	struct mdt_body *body;
4204 	CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4205 	       PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4206 	       lock->l_lvb_data, lock->l_lvb_len);
4208 	if (lock->l_lvb_data != NULL)
4211 	/* if layout lock was granted right away, the layout is returned
4212 	 * within DLM_LVB of dlm reply; otherwise if the lock was ever
4213 	 * blocked and then granted via completion ast, we have to fetch
4214 	 * layout here. Please note that we can't use the LVB buffer in
4215 	 * completion AST because it doesn't have a large enough buffer */
4216 	rc = ll_get_default_mdsize(sbi, &lmmsize);
4218 	rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4219 			 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
4224 	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4226 		GOTO(out, rc = -EPROTO);
4228 	lmmsize = body->mbo_eadatasize;
4229 	if (lmmsize == 0) /* empty layout */
4232 	lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4234 		GOTO(out, rc = -EFAULT);
4236 	OBD_ALLOC_LARGE(lvbdata, lmmsize);
4237 	if (lvbdata == NULL)
4238 		GOTO(out, rc = -ENOMEM);
4240 	memcpy(lvbdata, lmm, lmmsize);
	/* install the copy as the lock's LVB unless someone beat us to it;
	 * on success ownership of lvbdata passes to the lock */
4241 	lock_res_and_lock(lock);
4242 	if (unlikely(lock->l_lvb_data == NULL)) {
4243 		lock->l_lvb_type = LVB_T_LAYOUT;
4244 		lock->l_lvb_data = lvbdata;
4245 		lock->l_lvb_len = lmmsize;
4248 	unlock_res_and_lock(lock);
	/* lost the race: free our duplicate buffer */
4251 		OBD_FREE_LARGE(lvbdata, lmmsize);
4256 	ptlrpc_req_finished(req);
 * Apply the layout to the inode. Layout lock is held and will be released
/*
 * \param lockh	handle of the held layout lock (released before return)
 * \param mode	lock mode held on \a lockh
 * \param inode	inode to apply the layout to
 *
 * Fetches the layout into the lock's LVB if needed, then configures the
 * cl_object with it via ll_layout_conf().  If the configuration returns
 * -EBUSY (layout still in use by IO), waits for the IO to drain with an
 * OBJECT_CONF_WAIT pass.
 *
 * \retval 0 on success, negative errno otherwise.
 */
4264 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4265 			      struct inode *inode)
4267 	struct ll_inode_info *lli = ll_i2info(inode);
4268 	struct ll_sb_info *sbi = ll_i2sbi(inode);
4269 	struct ldlm_lock *lock;
4270 	struct cl_object_conf conf;
4273 	bool wait_layout = false;
4276 	LASSERT(lustre_handle_is_used(lockh));
4278 	lock = ldlm_handle2lock(lockh);
4279 	LASSERT(lock != NULL);
4280 	LASSERT(ldlm_has_layout(lock));
4282 	LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4283 		   PFID(&lli->lli_fid), inode);
4285 	/* in case this is a caching lock and reinstate with new inode */
4286 	md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
4288 	lock_res_and_lock(lock);
4289 	lvb_ready = ldlm_is_lvb_ready(lock);
4290 	unlock_res_and_lock(lock);
4292 	/* checking lvb_ready is racy but this is okay. The worst case is
4293 	 * that multi processes may configure the file on the same time. */
4297 	rc = ll_layout_fetch(inode, lock);
4301 	/* for layout lock, lmm is stored in lock's lvb.
4302 	 * lvb_data is immutable if the lock is held so it's safe to access it
4305 	 * set layout to file. Unlikely this will fail as old layout was
4306 	 * surely eliminated */
4307 	memset(&conf, 0, sizeof conf);
4308 	conf.coc_opc = OBJECT_CONF_SET;
4309 	conf.coc_inode = inode;
4310 	conf.coc_lock = lock;
4311 	conf.u.coc_layout.lb_buf = lock->l_lvb_data;
4312 	conf.u.coc_layout.lb_len = lock->l_lvb_len;
4313 	rc = ll_layout_conf(inode, &conf);
4315 	/* refresh layout failed, need to wait */
4316 	wait_layout = rc == -EBUSY;
	/* release our references; the layout (if set) lives on in the
	 * inode's cl_object */
4319 	LDLM_LOCK_PUT(lock);
4320 	ldlm_lock_decref(lockh, mode);
4322 	/* wait for IO to complete if it's still being used. */
4324 		CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
4325 		       ll_get_fsname(inode->i_sb, NULL, 0),
4326 		       PFID(&lli->lli_fid), inode);
4328 		memset(&conf, 0, sizeof conf);
4329 		conf.coc_opc = OBJECT_CONF_WAIT;
4330 		conf.coc_inode = inode;
4331 		rc = ll_layout_conf(inode, &conf);
4335 		CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4336 		       ll_get_fsname(inode->i_sb, NULL, 0),
4337 		       PFID(&lli->lli_fid), rc);
 * Issue layout intent RPC to MDS.
 * \param inode	[in]	file inode
 * \param intent [in]	layout intent
 * \retval 0 on success
 * \retval < 0 error code
/* Sends an IT_LAYOUT intent lock request to the MDT; write/truncate
 * intents request the lock with FMODE_WRITE.  On success the returned
 * lock data is attached to the inode and the intent lock reference is
 * dropped before returning. */
4350 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
4352 	struct ll_inode_info *lli = ll_i2info(inode);
4353 	struct ll_sb_info *sbi = ll_i2sbi(inode);
4354 	struct md_op_data *op_data;
4355 	struct lookup_intent it;
4356 	struct ptlrpc_request *req;
4360 	op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4361 				     0, 0, LUSTRE_OPC_ANY, NULL);
4362 	if (IS_ERR(op_data))
4363 		RETURN(PTR_ERR(op_data));
	/* ship the layout intent as opaque op_data payload */
4365 	op_data->op_data = intent;
4366 	op_data->op_data_size = sizeof(*intent);
4368 	memset(&it, 0, sizeof(it));
4369 	it.it_op = IT_LAYOUT;
4370 	if (intent->li_opc == LAYOUT_INTENT_WRITE ||
4371 	    intent->li_opc == LAYOUT_INTENT_TRUNC)
4372 		it.it_flags = FMODE_WRITE;
4374 	LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4375 			  ll_get_fsname(inode->i_sb, NULL, 0),
4376 			  PFID(&lli->lli_fid), inode);
4378 	rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
4379 			    &ll_md_blocking_ast, 0);
4380 	if (it.it_request != NULL)
4381 		ptlrpc_req_finished(it.it_request);
4382 	it.it_request = NULL;
4384 	ll_finish_md_op_data(op_data);
4386 	/* set lock data in case this is a new lock */
4388 	ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4390 	ll_intent_drop_lock(&it);
 * This function checks if there exists a LAYOUT lock on the client side,
 * or enqueues it if it doesn't have one in cache.
 * This function will not hold layout lock so it may be revoked any time after
 * this function returns. Any operations depend on layout should be redone
 * This function should be called before lov_io_init() to get an uptodate
 * layout version, the caller should save the version number and after IO
 * is finished, this function should be called again to verify that layout
 * is not changed during IO time.
/*
 * \param inode	[in]	regular file inode
 * \param gen	[out]	current layout generation of the inode
 *
 * Fast path: if layout locking is disabled or the inode already has a
 * valid generation, just report it.  Otherwise, under lli_layout_mutex,
 * try to match a cached MDS_INODELOCK_LAYOUT lock and apply its layout;
 * on a cache miss, enqueue one via a LAYOUT_INTENT_ACCESS intent.
 *
 * \retval 0 on success (with *gen updated), negative errno otherwise.
 */
4408 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4410 	struct ll_inode_info *lli = ll_i2info(inode);
4411 	struct ll_sb_info *sbi = ll_i2sbi(inode);
4412 	struct lustre_handle lockh;
4413 	struct layout_intent intent = {
4414 		.li_opc = LAYOUT_INTENT_ACCESS,
4416 	enum ldlm_mode mode;
4420 	*gen = ll_layout_version_get(lli);
4421 	if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
4425 	LASSERT(fid_is_sane(ll_inode2fid(inode)));
4426 	LASSERT(S_ISREG(inode->i_mode));
4428 	/* take layout lock mutex to enqueue layout lock exclusively. */
4429 	mutex_lock(&lli->lli_layout_mutex);
4432 	/* mostly layout lock is caching on the local side, so try to
4433 	 * match it before grabbing layout lock mutex. */
4434 	mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4435 			       LCK_CR | LCK_CW | LCK_PR | LCK_PW);
4436 	if (mode != 0) { /* hit cached lock */
4437 		rc = ll_layout_lock_set(&lockh, mode, inode);
	/* cache miss: enqueue a new layout lock from the MDT */
4443 	rc = ll_layout_intent(inode, &intent);
4449 	*gen = ll_layout_version_get(lli);
4450 	mutex_unlock(&lli->lli_layout_mutex);
 * Issue layout intent RPC indicating where in a file an IO is about to write.
 * \param[in] inode	file inode.
 * \param[in] start	start offset of fille in bytes where an IO is about to
 * \param[in] end	exclusive end offset in bytes of the write range.
 * \retval 0 on success
 * \retval < 0 error code
/* Thin wrapper around ll_layout_intent() with a LAYOUT_INTENT_WRITE
 * intent covering [start, end); used to instantiate layout components
 * before writing. */
4466 int ll_layout_write_intent(struct inode *inode, __u64 start, __u64 end)
4468 	struct layout_intent intent = {
4469 		.li_opc = LAYOUT_INTENT_WRITE,
4476 	rc = ll_layout_intent(inode, &intent);
4482 * This function send a restore request to the MDT
4484 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4486 struct hsm_user_request *hur;
4490 len = sizeof(struct hsm_user_request) +
4491 sizeof(struct hsm_user_item);
4492 OBD_ALLOC(hur, len);
4496 hur->hur_request.hr_action = HUA_RESTORE;
4497 hur->hur_request.hr_archive_id = 0;
4498 hur->hur_request.hr_flags = 0;
4499 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4500 sizeof(hur->hur_user_item[0].hui_fid));
4501 hur->hur_user_item[0].hui_extent.offset = offset;
4502 hur->hur_user_item[0].hui_extent.length = length;
4503 hur->hur_request.hr_itemcount = 1;
4504 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,