4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2014, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <linux/pagemap.h>
46 #include <linux/file.h>
47 #include <linux/sched.h>
48 #include <linux/user_namespace.h>
49 #ifdef HAVE_UIDGID_HEADER
50 # include <linux/uidgid.h>
52 #include <lustre/ll_fiemap.h>
53 #include <lustre_ioctl.h>
55 #include "cl_object.h"
57 #include "llite_internal.h"
58 #include "vvp_internal.h"
/* Forward declarations for helpers defined later in this file. */
61 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
63 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
66 static enum llioc_iter
67 ll_iocontrol_call(struct inode *inode, struct file *file,
68 unsigned int cmd, unsigned long arg, int *rcp);
/*
 * Allocate a per-open ll_file_data from its slab cache.  GFP_NOFS is
 * used so the allocation cannot recurse back into the filesystem under
 * memory pressure.  NOTE(review): the NULL check and return appear to
 * be in source lines elided from this view.
 */
70 static struct ll_file_data *ll_file_data_get(void)
72 struct ll_file_data *fd;
74 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
78 fd->fd_write_failed = false;
/* Release a ll_file_data back to its slab cache. */
83 static void ll_file_data_put(struct ll_file_data *fd)
86 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Pack the inode's current attributes (mode, a/m/ctime, size, blocks,
 * flags), its FID and the given open handle @fh into @op_data for an
 * RPC to the MDS.
 */
89 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
90 struct lustre_handle *fh)
92 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
93 op_data->op_attr.ia_mode = inode->i_mode;
94 op_data->op_attr.ia_atime = inode->i_atime;
95 op_data->op_attr.ia_mtime = inode->i_mtime;
96 op_data->op_attr.ia_ctime = inode->i_ctime;
97 op_data->op_attr.ia_size = i_size_read(inode);
98 op_data->op_attr_blocks = inode->i_blocks;
99 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
101 op_data->op_handle = *fh;
102 op_data->op_capa1 = ll_mdscapa_get(inode);
/* Propagate a pending data-modified flag to the server as an op bias. */
104 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
105 op_data->op_bias |= MDS_DATA_MODIFIED;
109 * Packs all the attributes into @op_data for the CLOSE rpc.
111 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
112 struct obd_client_handle *och)
/* Always send the timestamps; mode is sent so the MDS can verify it. */
116 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
117 ATTR_MTIME | ATTR_MTIME_SET |
118 ATTR_CTIME | ATTR_CTIME_SET;
/* Size/blocks are only meaningful when the handle was open for write. */
120 if (!(och->och_flags & FMODE_WRITE))
123 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
126 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
127 ll_prep_md_op_data(op_data, inode, NULL, NULL,
128 0, 0, LUSTRE_OPC_ANY, NULL);
133 * Perform a close, possibly with a bias.
134 * The meaning of "data" depends on the value of "bias".
136 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
137 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
140 static int ll_close_inode_openhandle(struct obd_export *md_exp,
141 struct obd_client_handle *och,
143 enum mds_op_bias bias,
146 struct obd_export *exp = ll_i2mdexp(inode);
147 struct md_op_data *op_data;
148 struct ptlrpc_request *req = NULL;
149 struct obd_device *obd = class_exp2obd(exp);
155 * XXX: in case of LMV, is this correct to access
158 CERROR("Invalid MDC connection handle "LPX64"\n",
159 ll_i2mdexp(inode)->exp_handle.h_cookie);
163 OBD_ALLOC_PTR(op_data);
165 /* XXX We leak openhandle and request here. */
166 GOTO(out, rc = -ENOMEM);
168 ll_prepare_close(inode, op_data, och);
/* Per-bias extra close payload (switch on @bias; cases below). */
170 case MDS_CLOSE_LAYOUT_SWAP:
171 LASSERT(data != NULL);
172 op_data->op_bias |= MDS_CLOSE_LAYOUT_SWAP;
173 op_data->op_data_version = 0;
174 op_data->op_lease_handle = och->och_lease_handle;
/* @data is the peer inode whose layout we swap with. */
175 op_data->op_fid2 = *ll_inode2fid(data);
178 case MDS_HSM_RELEASE:
179 LASSERT(data != NULL);
180 op_data->op_bias |= MDS_HSM_RELEASE;
/* @data is the data version the copytool archived. */
181 op_data->op_data_version = *(__u64 *)data;
182 op_data->op_lease_handle = och->och_lease_handle;
183 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
187 LASSERT(data == NULL);
191 rc = md_close(md_exp, op_data, och->och_mod, &req);
193 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
194 ll_i2mdexp(inode)->exp_obd->obd_name,
195 PFID(ll_inode2fid(inode)), rc);
198 /* DATA_MODIFIED flag was successfully sent on close, cancel data
199 * modification flag. */
200 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
201 struct ll_inode_info *lli = ll_i2info(inode);
203 spin_lock(&lli->lli_lock);
204 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
205 spin_unlock(&lli->lli_lock);
/* For biased closes, check whether the server executed the intent. */
209 op_data->op_bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)) {
210 struct mdt_body *body;
212 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
213 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
217 ll_finish_md_op_data(op_data);
/* Handle is dead from here on; poison it to catch reuse. */
221 md_clear_open_replay_data(md_exp, och);
222 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
225 if (req) /* This is close request */
226 ptlrpc_req_finished(req);
/*
 * Close the MDS open handle for the given open mode (read/write/exec)
 * once the last local user of that handle is gone.  The per-mode
 * handle pointer and use count live in ll_inode_info.
 */
230 int ll_md_real_close(struct inode *inode, fmode_t fmode)
232 struct ll_inode_info *lli = ll_i2info(inode);
233 struct obd_client_handle **och_p;
234 struct obd_client_handle *och;
/* Pick the handle slot and refcount matching the open mode. */
239 if (fmode & FMODE_WRITE) {
240 och_p = &lli->lli_mds_write_och;
241 och_usecount = &lli->lli_open_fd_write_count;
242 } else if (fmode & FMODE_EXEC) {
243 och_p = &lli->lli_mds_exec_och;
244 och_usecount = &lli->lli_open_fd_exec_count;
246 LASSERT(fmode & FMODE_READ);
247 och_p = &lli->lli_mds_read_och;
248 och_usecount = &lli->lli_open_fd_read_count;
251 mutex_lock(&lli->lli_och_mutex);
252 if (*och_usecount > 0) {
253 /* There are still users of this handle, so skip
255 mutex_unlock(&lli->lli_och_mutex);
261 mutex_unlock(&lli->lli_och_mutex);
264 /* There might be a race and this handle may already
266 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
267 och, inode, 0, NULL);
/*
 * Per-file-descriptor close: drop group lock and lease if held, release
 * the per-fd open handle, decrement the per-mode open counts, and close
 * the MDS handle unless a cached OPEN lock lets us skip talking to the
 * MDS.  Frees the ll_file_data at the end.
 */
273 static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
276 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
277 struct ll_inode_info *lli = ll_i2info(inode);
281 /* clear group lock, if present */
282 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
283 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
285 if (fd->fd_lease_och != NULL) {
288 /* Usually the lease is not released when the
289 * application crashed, we need to release here. */
290 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
291 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
292 PFID(&lli->lli_fid), rc, lease_broken);
294 fd->fd_lease_och = NULL;
297 if (fd->fd_och != NULL) {
298 rc = ll_close_inode_openhandle(md_exp, fd->fd_och, inode, 0,
304 /* Let's see if we have good enough OPEN lock on the file and if
305 we can skip talking to MDS */
306 if (file->f_dentry->d_inode) { /* Can this ever be false? */
308 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
309 struct lustre_handle lockh;
310 struct inode *inode = file->f_dentry->d_inode;
311 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
313 mutex_lock(&lli->lli_och_mutex);
/* Drop the per-mode open count matching this descriptor's mode. */
314 if (fd->fd_omode & FMODE_WRITE) {
316 LASSERT(lli->lli_open_fd_write_count);
317 lli->lli_open_fd_write_count--;
318 } else if (fd->fd_omode & FMODE_EXEC) {
320 LASSERT(lli->lli_open_fd_exec_count);
321 lli->lli_open_fd_exec_count--;
324 LASSERT(lli->lli_open_fd_read_count);
325 lli->lli_open_fd_read_count--;
327 mutex_unlock(&lli->lli_och_mutex);
/* No matching cached OPEN lock: must do the real close at the MDS. */
329 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
330 LDLM_IBITS, &policy, lockmode,
332 rc = ll_md_real_close(file->f_dentry->d_inode,
336 CERROR("released file has negative dentry: file = %p, "
337 "dentry = %p, name = %s\n",
338 file, file->f_dentry, file->f_dentry->d_name.name);
342 LUSTRE_FPRIVATE(file) = NULL;
343 ll_file_data_put(fd);
344 ll_capa_close(inode);
349 /* While this returns an error code, fput() the caller does not, so we need
350 * to make every effort to clean up all of our state here. Also, applications
351 * rarely check close errors and even if an error is returned they will not
352 * re-try the close call.
354 int ll_file_release(struct inode *inode, struct file *file)
356 struct ll_file_data *fd;
357 struct ll_sb_info *sbi = ll_i2sbi(inode);
358 struct ll_inode_info *lli = ll_i2info(inode);
362 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
363 PFID(ll_inode2fid(inode)), inode);
365 #ifdef CONFIG_FS_POSIX_ACL
/* Remote-client ACL bookkeeping only applies to the root inode. */
366 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
367 inode == inode->i_sb->s_root->d_inode) {
368 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
371 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
372 fd->fd_flags &= ~LL_FILE_RMTACL;
373 rct_del(&sbi->ll_rct, current_pid());
374 et_search_free(&sbi->ll_et, current_pid());
/* Don't count releases of the root dentry in the stats. */
379 if (inode->i_sb->s_root != file->f_dentry)
380 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
381 fd = LUSTRE_FPRIVATE(file);
384 /* The last ref on @file, maybe not the the owner pid of statahead,
385 * because parent and child process can share the same file handle. */
386 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
387 ll_deauthorize_statahead(inode, fd);
/* Root has no MDS open handle; just free the fd and return. */
389 if (inode->i_sb->s_root == file->f_dentry) {
390 LUSTRE_FPRIVATE(file) = NULL;
391 ll_file_data_put(fd);
/* Flush async write errors into lli_async_rc before the close. */
395 if (!S_ISDIR(inode->i_mode)) {
396 if (lli->lli_clob != NULL)
397 lov_read_and_clear_async_rc(lli->lli_clob);
398 lli->lli_async_rc = 0;
401 rc = ll_md_close(sbi->ll_md_exp, inode, file);
403 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
404 libcfs_debug_dumplog();
/*
 * Send an IT_OPEN intent to the MDS for @file, optionally packing the
 * striping data @lmm/@lmmsize.  On success the resulting MDS state is
 * folded into the inode via ll_prep_inode()/ll_set_lock_data().
 */
409 static int ll_intent_file_open(struct file *file, void *lmm, int lmmsize,
410 struct lookup_intent *itp)
412 struct dentry *de = file->f_dentry;
413 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
414 struct dentry *parent = de->d_parent;
415 const char *name = NULL;
417 struct md_op_data *op_data;
418 struct ptlrpc_request *req = NULL;
422 LASSERT(parent != NULL);
423 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
425 /* if server supports open-by-fid, or file name is invalid, don't pack
426 * name in open request */
427 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
428 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
429 name = de->d_name.name;
430 len = de->d_name.len;
433 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
434 name, len, 0, LUSTRE_OPC_ANY, NULL);
436 RETURN(PTR_ERR(op_data));
437 op_data->op_data = lmm;
438 op_data->op_data_size = lmmsize;
440 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
441 &ll_md_blocking_ast, 0);
442 ll_finish_md_op_data(op_data);
444 /* reason for keep own exit path - don`t flood log
445 * with messages with -ESTALE errors.
447 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
448 it_open_error(DISP_OPEN_OPEN, itp))
/* Server granted an open we can't use; release the handle. */
450 ll_release_openhandle(de, itp);
454 if (it_disposition(itp, DISP_LOOKUP_NEG))
455 GOTO(out, rc = -ENOENT);
457 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
458 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
459 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
463 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
464 if (!rc && itp->d.lustre.it_lock_mode)
465 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
468 ptlrpc_req_finished(req);
469 ll_intent_drop_lock(itp);
/*
 * Fill an obd_client_handle from the MDS reply carried by the intent:
 * open handle, FID, lease lock cookie and open flags, then register the
 * handle for open replay.
 */
474 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
475 struct obd_client_handle *och)
477 struct ptlrpc_request *req = it->d.lustre.it_data;
478 struct mdt_body *body;
480 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
481 och->och_fh = body->mbo_handle;
482 och->och_fid = body->mbo_fid1;
483 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
484 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
485 och->och_flags = it->it_flags;
487 return md_set_open_replay_data(md_exp, och, it);
/*
 * Finish a local open: optionally fill @och from the intent reply, then
 * attach @fd to the file, initialize readahead state, record the open
 * mode and set up the ll_cl_context bookkeeping.
 */
490 static int ll_local_open(struct file *file, struct lookup_intent *it,
491 struct ll_file_data *fd, struct obd_client_handle *och)
493 struct inode *inode = file->f_dentry->d_inode;
496 LASSERT(!LUSTRE_FPRIVATE(file));
503 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
508 LUSTRE_FPRIVATE(file) = fd;
509 ll_readahead_init(inode, &fd->fd_ras);
510 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
512 /* ll_cl_context initialize */
513 rwlock_init(&fd->fd_lock);
514 INIT_LIST_HEAD(&fd->fd_lccs);
519 /* Open a file, and (for the very first open) create objects on the OSTs at
520 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
521 * creation or open until ll_lov_setstripe() ioctl is called.
523 * If we already have the stripe MD locally then we don't request it in
524 * md_open(), by passing a lmm_size = 0.
526 * It is up to the application to ensure no other processes open this file
527 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
528 * used. We might be able to avoid races of that sort by getting lli_open_sem
529 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
530 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
532 int ll_file_open(struct inode *inode, struct file *file)
534 struct ll_inode_info *lli = ll_i2info(inode);
535 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
536 .it_flags = file->f_flags };
537 struct obd_client_handle **och_p = NULL;
538 __u64 *och_usecount = NULL;
539 struct ll_file_data *fd;
543 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
544 PFID(ll_inode2fid(inode)), inode, file->f_flags);
546 it = file->private_data; /* XXX: compat macro */
547 file->private_data = NULL; /* prevent ll_local_open assertion */
549 fd = ll_file_data_get();
551 GOTO(out_openerr, rc = -ENOMEM);
554 if (S_ISDIR(inode->i_mode))
555 ll_authorize_statahead(inode, fd);
/* Root inode is opened without an MDS open handle. */
557 if (inode->i_sb->s_root == file->f_dentry) {
558 LUSTRE_FPRIVATE(file) = fd;
/* No intent from lookup: build our own IT_OPEN intent (NFS path). */
562 if (!it || !it->d.lustre.it_disposition) {
563 /* Convert f_flags into access mode. We cannot use file->f_mode,
564 * because everything but O_ACCMODE mask was stripped from
566 if ((oit.it_flags + 1) & O_ACCMODE)
568 if (file->f_flags & O_TRUNC)
569 oit.it_flags |= FMODE_WRITE;
571 /* kernel only call f_op->open in dentry_open. filp_open calls
572 * dentry_open after call to open_namei that checks permissions.
573 * Only nfsd_open call dentry_open directly without checking
574 * permissions and because of that this code below is safe. */
575 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
576 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
578 /* We do not want O_EXCL here, presumably we opened the file
579 * already? XXX - NFS implications? */
580 oit.it_flags &= ~O_EXCL;
582 /* bug20584, if "it_flags" contains O_CREAT, the file will be
583 * created if necessary, then "IT_CREAT" should be set to keep
584 * consistent with it */
585 if (oit.it_flags & O_CREAT)
586 oit.it_op |= IT_CREAT;
592 /* Let's see if we have file open on MDS already. */
593 if (it->it_flags & FMODE_WRITE) {
594 och_p = &lli->lli_mds_write_och;
595 och_usecount = &lli->lli_open_fd_write_count;
596 } else if (it->it_flags & FMODE_EXEC) {
597 och_p = &lli->lli_mds_exec_och;
598 och_usecount = &lli->lli_open_fd_exec_count;
600 och_p = &lli->lli_mds_read_och;
601 och_usecount = &lli->lli_open_fd_read_count;
604 mutex_lock(&lli->lli_och_mutex);
605 if (*och_p) { /* Open handle is present */
606 if (it_disposition(it, DISP_OPEN_OPEN)) {
607 /* Well, there's extra open request that we do not need,
608 let's close it somehow. This will decref request. */
609 rc = it_open_error(DISP_OPEN_OPEN, it);
611 mutex_unlock(&lli->lli_och_mutex);
612 GOTO(out_openerr, rc);
615 ll_release_openhandle(file->f_dentry, it);
/* Reuse the existing handle for this descriptor. */
619 rc = ll_local_open(file, it, fd, NULL);
622 mutex_unlock(&lli->lli_och_mutex);
623 GOTO(out_openerr, rc);
626 LASSERT(*och_usecount == 0);
627 if (!it->d.lustre.it_disposition) {
628 /* We cannot just request lock handle now, new ELC code
629 means that one of other OPEN locks for this file
630 could be cancelled, and since blocking ast handler
631 would attempt to grab och_mutex as well, that would
632 result in a deadlock */
633 mutex_unlock(&lli->lli_och_mutex);
635 * Normally called under two situations:
637 * 2. A race/condition on MDS resulting in no open
638 * handle to be returned from LOOKUP|OPEN request,
639 * for example if the target entry was a symlink.
641 * Always fetch MDS_OPEN_LOCK if this is not setstripe.
643 * Always specify MDS_OPEN_BY_FID because we don't want
644 * to get file with different fid.
646 it->it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID;
647 rc = ll_intent_file_open(file, NULL, 0, it);
649 GOTO(out_openerr, rc);
653 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
655 GOTO(out_och_free, rc = -ENOMEM);
659 /* md_intent_lock() didn't get a request ref if there was an
660 * open error, so don't do cleanup on the request here
662 /* XXX (green): Should not we bail out on any error here, not
663 * just open error? */
664 rc = it_open_error(DISP_OPEN_OPEN, it);
666 GOTO(out_och_free, rc);
668 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
669 "inode %p: disposition %x, status %d\n", inode,
670 it_disposition(it, ~0), it->d.lustre.it_status);
672 rc = ll_local_open(file, it, fd, *och_p);
674 GOTO(out_och_free, rc);
676 mutex_unlock(&lli->lli_och_mutex);
679 /* Must do this outside lli_och_mutex lock to prevent deadlock where
680 different kind of OPEN lock for this same inode gets cancelled
681 by ldlm_cancel_lru */
682 if (!S_ISREG(inode->i_mode))
683 GOTO(out_och_free, rc);
687 cl_lov_delay_create_clear(&file->f_flags);
688 GOTO(out_och_free, rc);
/* Error/cleanup paths below (labels elided between source lines). */
692 if (och_p && *och_p) {
693 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
694 *och_p = NULL; /* OBD_FREE writes some magic there */
697 mutex_unlock(&lli->lli_och_mutex);
700 if (lli->lli_opendir_key == fd)
701 ll_deauthorize_statahead(inode, fd);
703 ll_file_data_put(fd);
705 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
708 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
709 ptlrpc_req_finished(it->d.lustre.it_data);
710 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for a lease lock: on a blocking callback, cancel the
 * lease lock asynchronously; the CANCELING case is handled below
 * (body elided between source lines).
 */
716 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
717 struct ldlm_lock_desc *desc, void *data, int flag)
720 struct lustre_handle lockh;
724 case LDLM_CB_BLOCKING:
725 ldlm_lock2handle(lock, &lockh);
726 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
728 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
732 case LDLM_CB_CANCELING:
740 * Acquire a lease and open the file.
742 static struct obd_client_handle *
743 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
746 struct lookup_intent it = { .it_op = IT_OPEN };
747 struct ll_sb_info *sbi = ll_i2sbi(inode);
748 struct md_op_data *op_data;
749 struct ptlrpc_request *req = NULL;
750 struct lustre_handle old_handle = { 0 };
751 struct obd_client_handle *och = NULL;
/* Only plain read or plain write leases are supported. */
756 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
757 RETURN(ERR_PTR(-EINVAL));
760 struct ll_inode_info *lli = ll_i2info(inode);
761 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
762 struct obd_client_handle **och_p;
765 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
766 RETURN(ERR_PTR(-EPERM));
768 /* Get the openhandle of the file */
770 mutex_lock(&lli->lli_och_mutex);
/* A lease is already held on this descriptor. */
771 if (fd->fd_lease_och != NULL) {
772 mutex_unlock(&lli->lli_och_mutex);
776 if (fd->fd_och == NULL) {
777 if (file->f_mode & FMODE_WRITE) {
778 LASSERT(lli->lli_mds_write_och != NULL);
779 och_p = &lli->lli_mds_write_och;
780 och_usecount = &lli->lli_open_fd_write_count;
782 LASSERT(lli->lli_mds_read_och != NULL);
783 och_p = &lli->lli_mds_read_och;
784 och_usecount = &lli->lli_open_fd_read_count;
786 if (*och_usecount == 1) {
793 mutex_unlock(&lli->lli_och_mutex);
794 if (rc < 0) /* more than 1 opener */
797 LASSERT(fd->fd_och != NULL);
798 old_handle = fd->fd_och->och_fh;
803 RETURN(ERR_PTR(-ENOMEM));
805 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
806 LUSTRE_OPC_ANY, NULL);
808 GOTO(out, rc = PTR_ERR(op_data));
810 /* To tell the MDT this openhandle is from the same owner */
811 op_data->op_handle = old_handle;
813 it.it_flags = fmode | open_flags;
814 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
815 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
816 &ll_md_blocking_lease_ast,
817 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
818 * it can be cancelled which may mislead applications that the lease is
820 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
821 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
822 * doesn't deal with openhandle, so normal openhandle will be leaked. */
823 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
824 ll_finish_md_op_data(op_data);
825 ptlrpc_req_finished(req);
827 GOTO(out_release_it, rc);
829 if (it_disposition(&it, DISP_LOOKUP_NEG))
830 GOTO(out_release_it, rc = -ENOENT);
832 rc = it_open_error(DISP_OPEN_OPEN, &it);
834 GOTO(out_release_it, rc);
836 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
837 ll_och_fill(sbi->ll_md_exp, &it, och);
839 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
840 GOTO(out_close, rc = -EOPNOTSUPP);
842 /* already get lease, handle lease lock */
843 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
844 if (it.d.lustre.it_lock_mode == 0 ||
845 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
846 /* open lock must return for lease */
847 CERROR(DFID "lease granted but no open lock, %d/"LPU64".\n",
848 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
849 it.d.lustre.it_lock_bits);
850 GOTO(out_close, rc = -EPROTO);
853 ll_intent_release(&it);
/* Error path: cancel any granted open lock and close the handle. */
857 /* Cancel open lock */
858 if (it.d.lustre.it_lock_mode != 0) {
859 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
860 it.d.lustre.it_lock_mode);
861 it.d.lustre.it_lock_mode = 0;
862 och->och_lease_handle.cookie = 0ULL;
864 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, och, inode, 0, NULL);
866 CERROR("%s: error closing file "DFID": %d\n",
867 ll_get_fsname(inode->i_sb, NULL, 0),
868 PFID(&ll_i2info(inode)->lli_fid), rc2);
869 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
871 ll_intent_release(&it);
879 * Check whether a layout swap can be done between two inodes.
881 * \param[in] inode1	First inode to check
882 * \param[in] inode2	Second inode to check
884 * \retval 0 on success, layout swap can be performed between both inodes
885 * \retval negative error code if requirements are not met
887 static int ll_check_swap_layouts_validity(struct inode *inode1,
888 struct inode *inode2)
/* Both must be regular files, writable by the caller, on the same sb. */
890 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
893 if (inode_permission(inode1, MAY_WRITE) ||
894 inode_permission(inode2, MAY_WRITE))
897 if (inode1->i_sb != inode2->i_sb)
/*
 * Close @och with the MDS_CLOSE_LAYOUT_SWAP bias so the layouts of
 * @inode and @inode2 are swapped atomically with the close.  Validity
 * of the pair is checked first; identical FIDs are rejected.
 */
903 static int ll_swap_layouts_close(struct obd_client_handle *och,
904 struct inode *inode, struct inode *inode2)
906 const struct lu_fid *fid1 = ll_inode2fid(inode);
907 const struct lu_fid *fid2;
911 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
912 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
914 rc = ll_check_swap_layouts_validity(inode, inode2);
916 GOTO(out_free_och, rc);
918 /* We now know that inode2 is a lustre inode */
919 fid2 = ll_inode2fid(inode2);
921 rc = lu_fid_cmp(fid1, fid2);
923 GOTO(out_free_och, rc = -EINVAL);
925 /* Close the file and swap layouts between inode & inode2.
926 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
927 * because we still need it to pack l_remote_handle to MDT. */
928 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, och, inode,
929 MDS_CLOSE_LAYOUT_SWAP, inode2);
931 och = NULL; /* freed in ll_close_inode_openhandle() */
941 * Release lease and close the file.
942 * It will check if the lease has ever broken.
944 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
947 struct ldlm_lock *lock;
948 bool cancelled = true;
/* Inspect the lease lock to see whether it was already cancelled. */
952 lock = ldlm_handle2lock(&och->och_lease_handle);
954 lock_res_and_lock(lock);
955 cancelled = ldlm_is_cancel(lock);
956 unlock_res_and_lock(lock);
960 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
961 PFID(&ll_i2info(inode)->lli_fid), cancelled);
964 ldlm_cli_cancel(&och->och_lease_handle, 0);
965 if (lease_broken != NULL)
966 *lease_broken = cancelled;
968 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, och, inode,
/*
 * Merge attributes cached from the MDS (in ll_inode_info) with the
 * object attributes obtained from the OSTs via cl_object_attr_get():
 * keep the newest of each timestamp, and take size/blocks from the
 * OST view.  Runs under the inode size lock.
 */
974 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
976 struct ll_inode_info *lli = ll_i2info(inode);
977 struct cl_object *obj = lli->lli_clob;
978 struct cl_attr *attr = vvp_env_thread_attr(env);
986 ll_inode_size_lock(inode);
988 /* merge timestamps the most recently obtained from mds with
989 timestamps obtained from osts */
990 LTIME_S(inode->i_atime) = lli->lli_atime;
991 LTIME_S(inode->i_mtime) = lli->lli_mtime;
992 LTIME_S(inode->i_ctime) = lli->lli_ctime;
994 atime = LTIME_S(inode->i_atime);
995 mtime = LTIME_S(inode->i_mtime);
996 ctime = LTIME_S(inode->i_ctime);
998 cl_object_attr_lock(obj);
999 rc = cl_object_attr_get(env, obj, attr);
1000 cl_object_attr_unlock(obj);
1003 GOTO(out_size_unlock, rc);
/* Keep the most recent of the MDS- and OST-provided timestamps. */
1005 if (atime < attr->cat_atime)
1006 atime = attr->cat_atime;
1008 if (ctime < attr->cat_ctime)
1009 ctime = attr->cat_ctime;
1011 if (mtime < attr->cat_mtime)
1012 mtime = attr->cat_mtime;
1014 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
1015 PFID(&lli->lli_fid), attr->cat_size);
1017 i_size_write(inode, attr->cat_size);
1018 inode->i_blocks = attr->cat_blocks;
1020 LTIME_S(inode->i_atime) = atime;
1021 LTIME_S(inode->i_mtime) = mtime;
1022 LTIME_S(inode->i_ctime) = ctime;
1025 ll_inode_size_unlock(inode);
/*
 * Return true if atime updates should be skipped for this file, based
 * on file, inode, mount and superblock noatime flags.  (Return
 * statements are on source lines elided from this view.)
 */
1030 static bool file_is_noatime(const struct file *file)
1032 const struct vfsmount *mnt = file->f_path.mnt;
1033 const struct inode *inode = file->f_path.dentry->d_inode;
1035 /* Adapted from file_accessed() and touch_atime().*/
1036 if (file->f_flags & O_NOATIME)
1039 if (inode->i_flags & S_NOATIME)
1042 if (IS_NOATIME(inode))
1045 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1048 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1051 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/*
 * Initialize a cl_io for a read or write on @file: translate the file's
 * open flags (O_NONBLOCK/O_APPEND/O_SYNC/O_DIRECT) into io parameters
 * and pick the lock requirement mode.
 */
1057 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
1059 struct inode *inode = file->f_dentry->d_inode;
1061 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1063 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1064 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1065 file->f_flags & O_DIRECT ||
1068 io->ci_obj = ll_i2info(inode)->lli_clob;
1069 io->ci_lockreq = CILR_MAYBE;
1070 if (ll_file_nolock(file)) {
1071 io->ci_lockreq = CILR_NEVER;
1072 io->ci_no_srvlock = 1;
1073 } else if (file->f_flags & O_APPEND) {
/* Appends always need the DLM lock to serialize EOF updates. */
1074 io->ci_lockreq = CILR_MANDATORY;
1077 io->ci_noatime = file_is_noatime(file);
/*
 * Common read/write engine: set up a cl_io, take the inode range lock
 * for normal writes and O_DIRECT reads (see LU-6227), run cl_io_loop(),
 * restart the I/O if the layout changed mid-flight, and account the
 * transferred bytes in the per-sb stats.
 */
1081 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1082 struct file *file, enum cl_io_type iot,
1083 loff_t *ppos, size_t count)
1085 struct vvp_io *vio = vvp_env_io(env);
1086 struct inode *inode = file->f_dentry->d_inode;
1087 struct ll_inode_info *lli = ll_i2info(inode);
1088 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1092 struct range_lock range;
1096 CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: "LPU64", count: %zu\n",
1097 file->f_dentry->d_name.name, iot, *ppos, count);
1100 io = vvp_env_thread_io(env);
1101 ll_io_init(io, file, iot == CIT_WRITE);
1103 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1104 bool range_locked = false;
/* O_APPEND writes to EOF: lock to the end of the file. */
1106 if (file->f_flags & O_APPEND)
1107 range_lock_init(&range, 0, LUSTRE_EOF);
1109 range_lock_init(&range, *ppos, *ppos + count - 1);
1111 vio->vui_fd = LUSTRE_FPRIVATE(file);
1112 vio->vui_io_subtype = args->via_io_subtype;
1114 switch (vio->vui_io_subtype) {
1116 vio->vui_iov = args->u.normal.via_iov;
1117 vio->vui_nrsegs = args->u.normal.via_nrsegs;
1118 vio->vui_tot_nrsegs = vio->vui_nrsegs;
1119 vio->vui_iocb = args->u.normal.via_iocb;
1120 /* Direct IO reads must also take range lock,
1121 * or multiple reads will try to work on the same pages
1122 * See LU-6227 for details. */
1123 if (((iot == CIT_WRITE) ||
1124 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1125 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1126 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1128 rc = range_lock(&lli->lli_write_tree, &range);
1132 range_locked = true;
1136 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1137 vio->u.splice.vui_flags = args->u.splice.via_flags;
1140 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1144 ll_cl_add(file, env, io);
1145 rc = cl_io_loop(env, io);
1146 ll_cl_remove(file, env);
1149 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1151 range_unlock(&lli->lli_write_tree, &range);
1154 /* cl_io_rw_init() handled IO */
/* Account transferred bytes and advance the file position. */
1158 if (io->ci_nob > 0) {
1159 result += io->ci_nob;
1160 count -= io->ci_nob;
1161 *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1163 /* prepare IO restart */
1164 if (count > 0 && args->via_io_subtype == IO_NORMAL) {
1165 args->u.normal.via_iov = vio->vui_iov;
1166 args->u.normal.via_nrsegs = vio->vui_tot_nrsegs;
1171 cl_io_fini(env, io);
/* Restart from the new position when the layout changed under us. */
1173 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1175 "%s: restart %s from %lld, count:%zu, result: %zd\n",
1176 file->f_dentry->d_name.name,
1177 iot == CIT_READ ? "read" : "write",
1178 *ppos, count, result);
1182 if (iot == CIT_READ) {
1184 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1185 LPROC_LL_READ_BYTES, result);
1186 } else if (iot == CIT_WRITE) {
1188 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1189 LPROC_LL_WRITE_BYTES, result);
1190 fd->fd_write_failed = false;
1191 } else if (rc != -ERESTARTSYS) {
1192 fd->fd_write_failed = true;
1196 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1198 return result > 0 ? result : rc;
1202 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array and compute the total byte count.
 * NOTE(review): several lines of the loop body are elided between the
 * source line numbers here; the access_ok() handling cannot be fully
 * read from this view — confirm against the kernel original.
 */
1204 static int ll_file_get_iov_count(const struct iovec *iov,
1205 unsigned long *nr_segs, size_t *count)
1210 for (seg = 0; seg < *nr_segs; seg++) {
1211 const struct iovec *iv = &iov[seg];
1214 * If any segment has a negative length, or the cumulative
1215 * length ever wraps negative then return -EINVAL.
1218 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1220 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1225 cnt -= iv->iov_len;	/* This segment is no good */
/*
 * .aio_read handler: validate the iovec, copy it to a local array (the
 * per-env one for a single segment, an allocated one otherwise — the
 * I/O engine may modify it on restart), and run the generic read path.
 */
1232 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1233 unsigned long nr_segs, loff_t pos)
1236 struct vvp_io_args *args;
1237 struct iovec *local_iov;
1243 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1247 env = cl_env_get(&refcheck);
1249 RETURN(PTR_ERR(env));
1252 local_iov = &ll_env_info(env)->lti_local_iov;
1255 OBD_ALLOC(local_iov, sizeof(*iov) * nr_segs);
1256 if (local_iov == NULL) {
1257 cl_env_put(env, &refcheck);
1261 memcpy(local_iov, iov, sizeof(*iov) * nr_segs);
1264 args = ll_env_args(env, IO_NORMAL);
1265 args->u.normal.via_iov = local_iov;
1266 args->u.normal.via_nrsegs = nr_segs;
1267 args->u.normal.via_iocb = iocb;
1269 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1270 &iocb->ki_pos, count);
1272 cl_env_put(env, &refcheck);
1275 OBD_FREE(local_iov, sizeof(*iov) * nr_segs);
/*
 * .read handler: wrap the buffer in a single-segment iovec and a
 * synchronous kiocb, then delegate to ll_file_aio_read().
 */
1280 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1284 struct iovec iov = { .iov_base = buf, .iov_len = count };
1285 struct kiocb *kiocb;
1290 env = cl_env_get(&refcheck);
1292 RETURN(PTR_ERR(env));
1294 kiocb = &ll_env_info(env)->lti_kiocb;
1295 init_sync_kiocb(kiocb, file);
1296 kiocb->ki_pos = *ppos;
/* Field name for the remaining-bytes count differs across kernels. */
1297 #ifdef HAVE_KIOCB_KI_LEFT
1298 kiocb->ki_left = count;
1300 kiocb->ki_nbytes = count;
1303 result = ll_file_aio_read(kiocb, &iov, 1, kiocb->ki_pos);
1304 *ppos = kiocb->ki_pos;
1306 cl_env_put(env, &refcheck);
1311 * Write to a file (through the page cache).
/*
 * Async write entry point (.aio_write).  Mirrors ll_file_aio_read(): validate
 * the iovec, copy it into env-private or heap storage, then run the request
 * through ll_file_io_generic() as CIT_WRITE.
 */
1314 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1315 unsigned long nr_segs, loff_t pos)
1318 struct vvp_io_args *args;
1319 struct iovec *local_iov;
1325 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1329 env = cl_env_get(&refcheck);
1331 RETURN(PTR_ERR(env));
/* fast path: one segment fits in the env-private iovec slot */
1334 local_iov = &ll_env_info(env)->lti_local_iov;
1337 OBD_ALLOC(local_iov, sizeof(*iov) * nr_segs);
1338 if (local_iov == NULL) {
1339 cl_env_put(env, &refcheck);
1343 memcpy(local_iov, iov, sizeof(*iov) * nr_segs);
1346 args = ll_env_args(env, IO_NORMAL);
1347 args->u.normal.via_iov = local_iov;
1348 args->u.normal.via_nrsegs = nr_segs;
1349 args->u.normal.via_iocb = iocb;
1351 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1352 &iocb->ki_pos, count);
1353 cl_env_put(env, &refcheck);
/* free the heap copy made for the multi-segment case */
1356 OBD_FREE(local_iov, sizeof(*iov) * nr_segs);
/*
 * Synchronous write entry point (.write).  Wraps the user buffer in a single
 * iovec, builds a sync kiocb and delegates to ll_file_aio_write(); *ppos is
 * refreshed from the kiocb on return.
 */
1361 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1362 size_t count, loff_t *ppos)
1365 struct iovec iov = { .iov_base = (void __user *)buf,
1367 struct kiocb *kiocb;
1372 env = cl_env_get(&refcheck);
1374 RETURN(PTR_ERR(env));
1376 kiocb = &ll_env_info(env)->lti_kiocb;
1377 init_sync_kiocb(kiocb, file);
1378 kiocb->ki_pos = *ppos;
1379 #ifdef HAVE_KIOCB_KI_LEFT
1380 kiocb->ki_left = count;
1382 kiocb->ki_nbytes = count;
1385 result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos);
1386 *ppos = kiocb->ki_pos;
1388 cl_env_put(env, &refcheck);
1393 * Send file content (through pagecache) somewhere with helper
/*
 * splice_read entry point: read file data (through the page cache) into a
 * pipe.  Uses the IO_SPLICE flavour of vvp_io_args and runs as CIT_READ
 * through the common ll_file_io_generic() path.
 */
1395 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1396 struct pipe_inode_info *pipe, size_t count,
1400 struct vvp_io_args *args;
1405 env = cl_env_get(&refcheck);
1407 RETURN(PTR_ERR(env));
1409 args = ll_env_args(env, IO_SPLICE);
1410 args->u.splice.via_pipe = pipe;
1411 args->u.splice.via_flags = flags;
1413 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1414 cl_env_put(env, &refcheck);
/*
 * Apply a striping layout (LOV EA) to a file by re-opening it by FID with
 * the layout attached to the open intent.  The inode size lock serializes
 * against concurrent size changes; the transient open handle obtained for
 * the intent is released immediately via ll_release_openhandle().
 * Cleanup clears the "delay create" flag regardless of outcome.
 */
1418 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1419 __u64 flags, struct lov_user_md *lum,
1422 struct lookup_intent oit = {
1424 .it_flags = flags | MDS_OPEN_BY_FID,
1429 ll_inode_size_lock(inode);
1430 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1432 GOTO(out_unlock, rc);
1434 ll_release_openhandle(file->f_dentry, &oit);
1437 ll_inode_size_unlock(inode);
1438 ll_intent_release(&oit);
1439 cl_lov_delay_create_clear(&file->f_flags);
/*
 * Fetch the LOV EA (striping information) of @filename from the MDS via
 * md_getattr_name().  On success *lmmp points into the reply buffer (the
 * caller must keep *request alive and eventually release it) and *lmm_size
 * is set.  The EA arrives in little-endian wire format; on big-endian hosts
 * it is swabbed to host order before being handed back, skipping the
 * per-object array for directories / released files where no objects exist.
 */
1444 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1445 struct lov_mds_md **lmmp, int *lmm_size,
1446 struct ptlrpc_request **request)
1448 struct ll_sb_info *sbi = ll_i2sbi(inode);
1449 struct mdt_body *body;
1450 struct lov_mds_md *lmm = NULL;
1451 struct ptlrpc_request *req = NULL;
1452 struct md_op_data *op_data;
1455 rc = ll_get_default_mdsize(sbi, &lmmsize);
1459 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1460 strlen(filename), lmmsize,
1461 LUSTRE_OPC_ANY, NULL);
1462 if (IS_ERR(op_data))
1463 RETURN(PTR_ERR(op_data));
1465 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1466 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1467 ll_finish_md_op_data(op_data);
1469 CDEBUG(D_INFO, "md_getattr_name failed "
1470 "on %s: rc %d\n", filename, rc);
1474 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1475 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1477 lmmsize = body->mbo_eadatasize;
/* no EA present: neither file striping nor directory default */
1479 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1481 GOTO(out, rc = -ENODATA);
1484 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1485 LASSERT(lmm != NULL);
1487 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1488 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1489 GOTO(out, rc = -EPROTO);
1493 * This is coming from the MDS, so is probably in
1494 * little endian. We convert it to host endian before
1495 * passing it to userspace.
/* only swab on big-endian hosts, where host order != wire order */
1497 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1500 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1501 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1504 /* if function called for directory - we should
1505 * avoid swab not existent lsm objects */
1506 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1507 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1508 if (S_ISREG(body->mbo_mode))
1509 lustre_swab_lov_user_md_objects(
1510 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1512 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1513 lustre_swab_lov_user_md_v3(
1514 (struct lov_user_md_v3 *)lmm);
1515 if (S_ISREG(body->mbo_mode))
1516 lustre_swab_lov_user_md_objects(
1517 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1524 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: set a raw striping EA that already references
 * existing objects (MDS_OPEN_HAS_OBJS).  Root-only (CFS_CAP_SYS_ADMIN).
 * Copies the user's lov_user_md (with one ost_data entry) into a kernel
 * buffer and applies it via ll_lov_setstripe_ea_info().
 */
1529 static int ll_lov_setea(struct inode *inode, struct file *file,
1532 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1533 struct lov_user_md *lump;
1534 int lum_size = sizeof(struct lov_user_md) +
1535 sizeof(struct lov_user_ost_data);
1539 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1542 OBD_ALLOC_LARGE(lump, lum_size);
1546 if (copy_from_user(lump, (struct lov_user_md __user *)arg, lum_size)) {
1547 OBD_FREE_LARGE(lump, lum_size);
1551 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1553 OBD_FREE_LARGE(lump, lum_size);
/*
 * Copy the file's striping information to the user buffer @lum by asking
 * the cl_object layer (cl_object_getstripe) inside a cl_env.
 */
1557 static int ll_file_getstripe(struct inode *inode,
1558 struct lov_user_md __user *lum)
1565 env = cl_env_get(&refcheck);
1567 RETURN(PTR_ERR(env));
1569 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum);
1570 cl_env_put(env, &refcheck);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy the user's layout request into a
 * kernel lov_user_md, apply it, then refresh the layout generation and
 * echo the resulting striping back to the caller's buffer.
 * NOTE(review): the put_user(0, ...) zeroes the user's stripe_count on an
 * (elided) error path — confirm the branch condition in the full source.
 */
1574 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1577 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1578 struct lov_user_md *klum;
1580 __u64 flags = FMODE_WRITE;
1583 rc = ll_copy_user_md(lum, &klum);
1588 rc = ll_lov_setstripe_ea_info(inode, file, flags, klum, lum_size);
1592 put_user(0, &lum->lmm_stripe_count);
1594 ll_layout_refresh(inode, &gen);
1595 rc = ll_file_getstripe(inode, (struct lov_user_md __user *)arg);
1598 OBD_FREE(klum, lum_size);
/*
 * LL_IOC_GROUP_LOCK: take a group lock with group id @arg on the file.
 * Rejects gid 0 and no-lock mounts; only one group lock per file
 * descriptor is allowed (fd_flags/fd_grouplock guarded by lli_lock).
 * cl_get_grouplock() is called outside the spinlock, so a second check
 * afterwards handles the race where another thread won in between.
 */
1603 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1605 struct ll_inode_info *lli = ll_i2info(inode);
1606 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1607 struct ll_grouplock grouplock;
1612 CWARN("group id for group lock must not be 0\n");
1616 if (ll_file_nolock(file))
1617 RETURN(-EOPNOTSUPP);
1619 spin_lock(&lli->lli_lock);
1620 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1621 CWARN("group lock already existed with gid %lu\n",
1622 fd->fd_grouplock.lg_gid);
1623 spin_unlock(&lli->lli_lock);
1626 LASSERT(fd->fd_grouplock.lg_lock == NULL);
1627 spin_unlock(&lli->lli_lock);
/* enqueue may block (unless O_NONBLOCK) - must not hold lli_lock */
1629 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
1630 arg, (file->f_flags & O_NONBLOCK), &grouplock);
1634 spin_lock(&lli->lli_lock);
1635 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1636 spin_unlock(&lli->lli_lock);
1637 CERROR("another thread just won the race\n");
1638 cl_put_grouplock(&grouplock);
1642 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1643 fd->fd_grouplock = grouplock;
1644 spin_unlock(&lli->lli_lock);
1646 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK: release the group lock with group id @arg held on
 * this file descriptor.  Fails if no group lock is held or if the held
 * lock's gid does not match.  The fd state is cleared under lli_lock and
 * the lock itself is dropped afterwards, outside the spinlock.
 */
1650 static int ll_put_grouplock(struct inode *inode, struct file *file,
1653 struct ll_inode_info *lli = ll_i2info(inode);
1654 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1655 struct ll_grouplock grouplock;
1658 spin_lock(&lli->lli_lock);
1659 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1660 spin_unlock(&lli->lli_lock);
1661 CWARN("no group lock held\n");
1665 LASSERT(fd->fd_grouplock.lg_lock != NULL);
1667 if (fd->fd_grouplock.lg_gid != arg) {
1668 CWARN("group lock %lu doesn't match current id %lu\n",
1669 arg, fd->fd_grouplock.lg_gid);
1670 spin_unlock(&lli->lli_lock);
/* detach the lock from the fd before dropping the spinlock */
1674 grouplock = fd->fd_grouplock;
1675 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1676 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1677 spin_unlock(&lli->lli_lock);
1679 cl_put_grouplock(&grouplock);
1680 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1685 * Close inode open handle
1687 * \param dentry [in] dentry which contains the inode
1688 * \param it [in,out] intent which contains open info and result
1691 * \retval <0 failure
/*
 * Close the MDS open handle carried in @it (see doc comment above).
 * No-op for the filesystem root or when the intent holds no open
 * disposition.  A temporary obd_client_handle is filled from the intent
 * and closed via ll_close_inode_openhandle(); the extra open reference
 * kept for ll_file_open (DISP_ENQ_OPEN_REF) is dropped here as well.
 */
1693 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1695 struct inode *inode = dentry->d_inode;
1696 struct obd_client_handle *och;
1702 /* Root ? Do nothing. */
1703 if (dentry->d_inode->i_sb->s_root == dentry)
1706 /* No open handle to close? Move away */
1707 if (!it_disposition(it, DISP_OPEN_OPEN))
1710 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1712 OBD_ALLOC(och, sizeof(*och));
1714 GOTO(out, rc = -ENOMEM);
1716 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1718 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1719 och, inode, 0, NULL);
1721 /* this one is in place of ll_file_open */
1722 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1723 ptlrpc_req_finished(it->d.lustre.it_data);
1724 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1730 * Get size for inode for which FIEMAP mapping is requested.
1731 * Make the FIEMAP get_info call and returns the result.
1732 * \param fiemap kernel buffer to hold extens
1733 * \param num_bytes kernel buffer size
/*
 * Perform the FIEMAP extent-mapping request for @inode (see doc comment
 * above).  Unsupported flags are reported back to the caller via
 * fiemap->fm_flags; FIEMAP_FLAG_SYNC forces dirty pages out first.
 * A glimpse refreshes a zero i_size; a file of size 0 trivially maps to
 * no extents.  Otherwise the request is forwarded to cl_object_fiemap()
 * keyed by the ll_fiemap_info_key built here.
 */
1735 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
1741 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
1744 /* Checks for fiemap flags */
1745 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1746 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1750 /* Check for FIEMAP_FLAG_SYNC */
1751 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1752 rc = filemap_fdatawrite(inode->i_mapping);
1757 env = cl_env_get(&refcheck);
1759 RETURN(PTR_ERR(env));
1761 if (i_size_read(inode) == 0) {
1762 rc = ll_glimpse_size(inode);
1767 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1768 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
1769 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
1771 /* If filesize is 0, then there would be no objects for mapping */
1772 if (fmkey.lfik_oa.o_size == 0) {
1773 fiemap->fm_mapped_extents = 0;
1777 fmkey.lfik_fiemap = *fiemap;
1779 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
1780 &fmkey, fiemap, &num_bytes);
1782 cl_env_put(env, &refcheck);
/*
 * OBD_IOC_FID2PATH: resolve a FID to a path via the MDC.  Permitted for
 * CAP_DAC_READ_SEARCH holders or when the mount allows user fid2path.
 * Reads gf_pathlen first to size the output buffer (capped at PATH_MAX),
 * copies the request in, forwards it with obd_iocontrol(), and copies the
 * whole result (header + path) back to user space.
 */
1786 int ll_fid2path(struct inode *inode, void __user *arg)
1788 struct obd_export *exp = ll_i2mdexp(inode);
1789 const struct getinfo_fid2path __user *gfin = arg;
1791 struct getinfo_fid2path *gfout;
1797 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1798 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1801 /* Only need to get the buflen */
1802 if (get_user(pathlen, &gfin->gf_pathlen))
1805 if (pathlen > PATH_MAX)
1808 outsize = sizeof(*gfout) + pathlen;
1809 OBD_ALLOC(gfout, outsize);
1813 if (copy_from_user(gfout, arg, sizeof(*gfout)))
1814 GOTO(gf_free, rc = -EFAULT);
1816 /* Call mdc_iocontrol */
1817 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1821 if (copy_to_user(arg, gfout, outsize))
1825 OBD_FREE(gfout, outsize);
1830 * Read the data_version for inode.
1832 * This value is computed using stripe object version on OST.
1833 * Version is computed using server side locking.
1835 * @param flags if do sync on the OST side;
1837 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
1838 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/*
 * Compute the file's data_version (see doc comment above) by running a
 * CIT_DATA_VERSION cl_io over the file's cl_object.  @flags selects the
 * OST-side flushing mode (LL_DV_RD_FLUSH / LL_DV_WR_FLUSH).  The io is
 * retried when the layout changed mid-flight (ci_need_restart).
 */
1840 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
1842 struct cl_object *obj = ll_i2info(inode)->lli_clob;
1850 /* If no file object initialized, we consider its version is 0. */
1856 env = cl_env_get(&refcheck);
1858 RETURN(PTR_ERR(env));
1860 io = vvp_env_thread_io(env);
1862 io->u.ci_data_version.dv_data_version = 0;
1863 io->u.ci_data_version.dv_flags = flags;
1866 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
1867 result = cl_io_loop(env, io);
1869 result = io->ci_result;
1871 *data_version = io->u.ci_data_version.dv_data_version;
1873 cl_io_fini(env, io);
/* layout changed during the io: redo the whole request */
1875 if (unlikely(io->ci_need_restart))
1878 cl_env_put(env, &refcheck);
1884 * Trigger a HSM release request for the provided inode.
/*
 * Trigger an HSM release for @inode: take a write lease opened with
 * MDS_OPEN_RELEASE, grab the latest data_version (flushing dirty pages
 * with LL_DV_WR_FLUSH), merge attributes, then close the handle with
 * MDS_HSM_RELEASE so the MDT can free the OST objects if the version
 * still matches.  The lease handle itself is released during the close
 * (packed by mdc_hsm_release_pack(), see comment below).
 */
1886 int ll_hsm_release(struct inode *inode)
1888 struct cl_env_nest nest;
1890 struct obd_client_handle *och = NULL;
1891 __u64 data_version = 0;
1895 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
1896 ll_get_fsname(inode->i_sb, NULL, 0),
1897 PFID(&ll_i2info(inode)->lli_fid));
1899 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
1901 GOTO(out, rc = PTR_ERR(och));
1903 /* Grab latest data_version and [am]time values */
1904 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
1908 env = cl_env_nested_get(&nest);
1910 GOTO(out, rc = PTR_ERR(env));
1912 ll_merge_attr(env, inode);
1913 cl_env_nested_put(&nest, env);
1915 /* Release the file.
1916 * NB: lease lock handle is released in mdc_hsm_release_pack() because
1917 * we still need it to pack l_remote_handle to MDT. */
1918 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, och, inode,
1919 MDS_HSM_RELEASE, &data_version);
/* on the error paths the lease is still ours and must be closed */
1924 if (och != NULL && !IS_ERR(och)) /* close the file */
1925 ll_lease_close(och, inode, NULL);
/*
 * Scratch state for ll_swap_layouts(): the two inodes being swapped plus
 * (elided in this excerpt) their data versions and check flags — see the
 * dv1/dv2/check_dv1/check_dv2 uses in ll_swap_layouts().
 */
1930 struct ll_swap_stack {
1933 struct inode *inode1;
1934 struct inode *inode2;
/*
 * Swap the layouts of the files behind @file1 and @file2 (driven by the
 * LL_IOC_LOV_SWAP_LAYOUTS ioctl).  The two files are ordered by FID so
 * concurrent swaps cannot deadlock; an optional group lock (lsl->sl_gid,
 * elided assignment in this excerpt) flushes dirty cache on both files,
 * and optional data_version checks abort with -EAGAIN if either file
 * changed since the caller sampled its version.  The actual swap is an
 * obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS) on the MDC with the two inodes
 * packed in an md_op_data and flags in a mdc_swap_layouts.
 */
1939 static int ll_swap_layouts(struct file *file1, struct file *file2,
1940 struct lustre_swap_layouts *lsl)
1942 struct mdc_swap_layouts msl;
1943 struct md_op_data *op_data;
1946 struct ll_swap_stack *llss = NULL;
1949 OBD_ALLOC_PTR(llss);
1953 llss->inode1 = file1->f_dentry->d_inode;
1954 llss->inode2 = file2->f_dentry->d_inode;
1956 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
1960 /* we use 2 bool because it is easier to swap than 2 bits */
1961 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1962 llss->check_dv1 = true;
1964 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1965 llss->check_dv2 = true;
1967 /* we cannot use lsl->sl_dvX directly because we may swap them */
1968 llss->dv1 = lsl->sl_dv1;
1969 llss->dv2 = lsl->sl_dv2;
1971 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
1972 if (rc == 0) /* same file, done! */
1975 if (rc < 0) { /* sequentialize it */
1976 swap(llss->inode1, llss->inode2);
1978 swap(llss->dv1, llss->dv2);
1979 swap(llss->check_dv1, llss->check_dv2);
1983 if (gid != 0) { /* application asks to flush dirty cache */
1984 rc = ll_get_grouplock(llss->inode1, file1, gid);
1988 rc = ll_get_grouplock(llss->inode2, file2, gid);
1990 ll_put_grouplock(llss->inode1, file1, gid);
1995 /* ultimate check, before swaping the layouts we check if
1996 * dataversion has changed (if requested) */
1997 if (llss->check_dv1) {
1998 rc = ll_data_version(llss->inode1, &dv, 0);
2001 if (dv != llss->dv1)
2002 GOTO(putgl, rc = -EAGAIN);
2005 if (llss->check_dv2) {
2006 rc = ll_data_version(llss->inode2, &dv, 0);
2009 if (dv != llss->dv2)
2010 GOTO(putgl, rc = -EAGAIN);
2013 /* struct md_op_data is used to send the swap args to the mdt
2014 * only flags is missing, so we use struct mdc_swap_layouts
2015 * through the md_op_data->op_data */
2016 /* flags from user space have to be converted before they are send to
2017 * server, no flag is sent today, they are only used on the client */
2020 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2021 0, LUSTRE_OPC_ANY, &msl);
2022 if (IS_ERR(op_data))
2023 GOTO(free, rc = PTR_ERR(op_data));
2025 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2026 sizeof(*op_data), op_data, NULL);
2027 ll_finish_md_op_data(op_data);
/* drop the group locks in reverse acquisition order */
2034 ll_put_grouplock(llss->inode2, file2, gid);
2035 ll_put_grouplock(llss->inode1, file1, gid);
/*
 * Set/clear HSM state flags on @inode.  Validates that the masks stay
 * within HSM_FLAGS_MASK, that non-root callers only touch HSM_USER_MASK
 * bits, and that an explicit archive id is within LL_HSM_MAX_ARCHIVE,
 * then forwards the request to the MDC via obd_iocontrol().
 */
2045 static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2047 struct md_op_data *op_data;
2051 /* Detect out-of range masks */
2052 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2055 /* Non-root users are forbidden to set or clear flags which are
2056 * NOT defined in HSM_USER_MASK. */
2057 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2058 !cfs_capable(CFS_CAP_SYS_ADMIN))
2061 /* Detect out-of range archive id */
2062 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2063 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2066 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2067 LUSTRE_OPC_ANY, hss);
2068 if (IS_ERR(op_data))
2069 RETURN(PTR_ERR(op_data));
2071 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2072 sizeof(*op_data), op_data, NULL);
2074 ll_finish_md_op_data(op_data);
/*
 * HSM import: mark a regular file as ARCHIVED|EXISTS|RELEASED in the given
 * archive, then restore the user-supplied attributes (mode/uid/gid/size/
 * [am]times) with ll_setattr_raw() under i_mutex so the imported stub
 * reflects the archived copy's metadata.
 */
2079 static int ll_hsm_import(struct inode *inode, struct file *file,
2080 struct hsm_user_import *hui)
2082 struct hsm_state_set *hss = NULL;
2083 struct iattr *attr = NULL;
2087 if (!S_ISREG(inode->i_mode))
2093 GOTO(out, rc = -ENOMEM);
2095 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2096 hss->hss_archive_id = hui->hui_archive_id;
2097 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2098 rc = ll_hsm_state_set(inode, hss);
2102 OBD_ALLOC_PTR(attr);
2104 GOTO(out, rc = -ENOMEM);
/* force a regular-file mode; only rwx bits come from userspace */
2106 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2107 attr->ia_mode |= S_IFREG;
2108 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2109 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2110 attr->ia_size = hui->hui_size;
2111 attr->ia_mtime.tv_sec = hui->hui_mtime;
2112 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2113 attr->ia_atime.tv_sec = hui->hui_atime;
2114 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2116 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2117 ATTR_UID | ATTR_GID |
2118 ATTR_MTIME | ATTR_MTIME_SET |
2119 ATTR_ATIME | ATTR_ATIME_SET;
2121 mutex_lock(&inode->i_mutex);
2123 rc = ll_setattr_raw(file->f_dentry, attr, true);
2127 mutex_unlock(&inode->i_mutex);
2139 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2141 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2142 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * Main ioctl dispatcher for regular files (.unlocked_ioctl).  Handles the
 * Lustre-private LL_IOC_* commands (flags, striping, group locks, layout
 * swap, HSM, leases, fid/path translation, data version, statfs) and
 * falls through to the dynamic ll_iocontrol_call() handlers and finally
 * a generic obd_iocontrol() on the data export for everything else.
 */
2146 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2148 struct inode *inode = file->f_dentry->d_inode;
2149 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2153 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2154 PFID(ll_inode2fid(inode)), inode, cmd);
2155 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2157 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2158 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2162 case LL_IOC_GETFLAGS:
2163 /* Get the current value of the file flags */
2164 return put_user(fd->fd_flags, (int __user *)arg);
2165 case LL_IOC_SETFLAGS:
2166 case LL_IOC_CLRFLAGS:
2167 /* Set or clear specific file flags */
2168 /* XXX This probably needs checks to ensure the flags are
2169 * not abused, and to handle any flag side effects.
2171 if (get_user(flags, (int __user *) arg))
2174 if (cmd == LL_IOC_SETFLAGS) {
2175 if ((flags & LL_FILE_IGNORE_LOCK) &&
2176 !(file->f_flags & O_DIRECT)) {
2177 CERROR("%s: unable to disable locking on "
2178 "non-O_DIRECT file\n", current->comm);
2182 fd->fd_flags |= flags;
2184 fd->fd_flags &= ~flags;
2187 case LL_IOC_LOV_SETSTRIPE:
2188 RETURN(ll_lov_setstripe(inode, file, arg));
2189 case LL_IOC_LOV_SETEA:
2190 RETURN(ll_lov_setea(inode, file, arg));
2191 case LL_IOC_LOV_SWAP_LAYOUTS: {
2193 struct lustre_swap_layouts lsl;
2195 if (copy_from_user(&lsl, (char __user *)arg,
2196 sizeof(struct lustre_swap_layouts)))
/* both files must be open for write to swap their layouts */
2199 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
2202 file2 = fget(lsl.sl_fd);
2206 /* O_WRONLY or O_RDWR */
2207 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
2208 GOTO(out, rc = -EPERM);
2210 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
2211 struct inode *inode2;
2212 struct ll_inode_info *lli;
2213 struct obd_client_handle *och = NULL;
2215 if (lsl.sl_flags != SWAP_LAYOUTS_CLOSE)
2216 GOTO(out, rc = -EINVAL);
2218 lli = ll_i2info(inode);
2219 mutex_lock(&lli->lli_och_mutex);
2220 if (fd->fd_lease_och != NULL) {
2221 och = fd->fd_lease_och;
2222 fd->fd_lease_och = NULL;
2224 mutex_unlock(&lli->lli_och_mutex);
2226 GOTO(out, rc = -ENOLCK);
2227 inode2 = file2->f_dentry->d_inode;
2228 rc = ll_swap_layouts_close(och, inode, inode2);
2230 rc = ll_swap_layouts(file, file2, &lsl);
2236 case LL_IOC_LOV_GETSTRIPE:
2237 RETURN(ll_file_getstripe(inode,
2238 (struct lov_user_md __user *)arg));
2239 case FSFILT_IOC_GETFLAGS:
2240 case FSFILT_IOC_SETFLAGS:
2241 RETURN(ll_iocontrol(inode, file, cmd, arg));
2242 case FSFILT_IOC_GETVERSION_OLD:
2243 case FSFILT_IOC_GETVERSION:
2244 RETURN(put_user(inode->i_generation, (int __user *)arg));
2245 case LL_IOC_GROUP_LOCK:
2246 RETURN(ll_get_grouplock(inode, file, arg));
2247 case LL_IOC_GROUP_UNLOCK:
2248 RETURN(ll_put_grouplock(inode, file, arg));
2249 case IOC_OBD_STATFS:
2250 RETURN(ll_obd_statfs(inode, (void __user *)arg));
2252 /* We need to special case any other ioctls we want to handle,
2253 * to send them to the MDS/OST as appropriate and to properly
2254 * network encode the arg field.
2255 case FSFILT_IOC_SETVERSION_OLD:
2256 case FSFILT_IOC_SETVERSION:
2258 case LL_IOC_FLUSHCTX:
2259 RETURN(ll_flush_ctx(inode));
2260 case LL_IOC_PATH2FID: {
2261 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2262 sizeof(struct lu_fid)))
2267 case LL_IOC_GETPARENT:
2268 RETURN(ll_getparent(file, (struct getparent __user *)arg));
2270 case OBD_IOC_FID2PATH:
2271 RETURN(ll_fid2path(inode, (void __user *)arg));
2272 case LL_IOC_DATA_VERSION: {
2273 struct ioc_data_version idv;
2276 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
2279 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2280 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2283 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2289 case LL_IOC_GET_MDTIDX: {
2292 mdtidx = ll_get_mdt_idx(inode);
2296 if (put_user((int)mdtidx, (int __user *)arg))
2301 case OBD_IOC_GETDTNAME:
2302 case OBD_IOC_GETMDNAME:
2303 RETURN(ll_get_obd_name(inode, cmd, arg));
2304 case LL_IOC_HSM_STATE_GET: {
2305 struct md_op_data *op_data;
2306 struct hsm_user_state *hus;
2313 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2314 LUSTRE_OPC_ANY, hus);
2315 if (IS_ERR(op_data)) {
2317 RETURN(PTR_ERR(op_data));
2320 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2323 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2326 ll_finish_md_op_data(op_data);
2330 case LL_IOC_HSM_STATE_SET: {
2331 struct hsm_state_set *hss;
2338 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2343 rc = ll_hsm_state_set(inode, hss);
2348 case LL_IOC_HSM_ACTION: {
2349 struct md_op_data *op_data;
2350 struct hsm_current_action *hca;
2357 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2358 LUSTRE_OPC_ANY, hca);
2359 if (IS_ERR(op_data)) {
2361 RETURN(PTR_ERR(op_data));
2364 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2367 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2370 ll_finish_md_op_data(op_data);
2374 case LL_IOC_SET_LEASE: {
2375 struct ll_inode_info *lli = ll_i2info(inode);
2376 struct obd_client_handle *och = NULL;
/* requested lease mode must be compatible with the open mode */
2381 case LL_LEASE_WRLCK:
2382 if (!(file->f_mode & FMODE_WRITE))
2384 fmode = FMODE_WRITE;
2386 case LL_LEASE_RDLCK:
2387 if (!(file->f_mode & FMODE_READ))
2391 case LL_LEASE_UNLCK:
2392 mutex_lock(&lli->lli_och_mutex);
2393 if (fd->fd_lease_och != NULL) {
2394 och = fd->fd_lease_och;
2395 fd->fd_lease_och = NULL;
2397 mutex_unlock(&lli->lli_och_mutex);
2402 fmode = och->och_flags;
2403 rc = ll_lease_close(och, inode, &lease_broken);
2410 RETURN(ll_lease_type_from_fmode(fmode));
2415 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2417 /* apply for lease */
2418 och = ll_lease_open(inode, file, fmode, 0);
2420 RETURN(PTR_ERR(och));
2423 mutex_lock(&lli->lli_och_mutex);
2424 if (fd->fd_lease_och == NULL) {
2425 fd->fd_lease_och = och;
2428 mutex_unlock(&lli->lli_och_mutex);
2430 /* impossible now that only excl is supported for now */
2431 ll_lease_close(och, inode, &lease_broken);
2436 case LL_IOC_GET_LEASE: {
2437 struct ll_inode_info *lli = ll_i2info(inode);
2438 struct ldlm_lock *lock = NULL;
2441 mutex_lock(&lli->lli_och_mutex);
2442 if (fd->fd_lease_och != NULL) {
2443 struct obd_client_handle *och = fd->fd_lease_och;
2445 lock = ldlm_handle2lock(&och->och_lease_handle);
2447 lock_res_and_lock(lock);
/* a cancelled lease no longer counts */
2448 if (!ldlm_is_cancel(lock))
2449 fmode = och->och_flags;
2451 unlock_res_and_lock(lock);
2452 LDLM_LOCK_PUT(lock);
2455 mutex_unlock(&lli->lli_och_mutex);
2457 RETURN(ll_lease_type_from_fmode(fmode));
2459 case LL_IOC_HSM_IMPORT: {
2460 struct hsm_user_import *hui;
2466 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
2471 rc = ll_hsm_import(inode, file, hui);
/* unknown command: try dynamically registered handlers first */
2481 ll_iocontrol_call(inode, file, cmd, arg, &err))
2484 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2485 (void __user *)arg));
#ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Compat copy of the kernel's lseek finalization (only built when the
 * kernel lacks generic_file_llseek_size): bounds-check @offset against 0
 * (unless FMODE_UNSIGNED_OFFSET) and @maxsize, then commit it to f_pos,
 * resetting f_version on any position change.
 */
2491 static inline loff_t
2492 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2494 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2496 if (offset > maxsize)
2499 if (offset != file->f_pos) {
2500 file->f_pos = offset;
2501 file->f_version = 0;
/*
 * Compat implementation of generic_file_llseek_size() for older kernels:
 * resolve SEEK_CUR/SEEK_END/SEEK_HOLE/SEEK_DATA against @eof and @maxsize
 * and commit the result through llseek_execute().  SEEK_CUR is serialized
 * with i_mutex here (the upstream version uses f_lock, per the comment).
 * NOTE(review): the switch/case labels for the origins are elided in this
 * excerpt — confirm branch structure against the full source.
 */
2507 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2508 loff_t maxsize, loff_t eof)
2510 struct inode *inode = file->f_dentry->d_inode;
2518 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2519 * position-querying operation. Avoid rewriting the "same"
2520 * f_pos value back to the file because a concurrent read(),
2521 * write() or lseek() might have altered it
2526 * f_lock protects against read/modify/write race with other
2527 * SEEK_CURs. Note that parallel writes and reads behave
2530 mutex_lock(&inode->i_mutex);
2531 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2532 mutex_unlock(&inode->i_mutex);
2536 * In the generic case the entire file is data, so as long as
2537 * offset isn't at the end of the file then the offset is data.
2544 * There is a virtual hole at the end of the file, so as long as
2545 * offset isn't i_size or larger, return i_size.
2553 return llseek_execute(file, offset, maxsize);
/*
 * .llseek entry point.  For SEEK_END/SEEK_HOLE/SEEK_DATA a glimpse is
 * needed first so i_size is current, then the generic llseek-size helper
 * does the actual positioning bounded by ll_file_maxbytes().
 */
2557 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2559 struct inode *inode = file->f_dentry->d_inode;
2560 loff_t retval, eof = 0;
2563 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2564 (origin == SEEK_CUR) ? file->f_pos : 0);
2565 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2566 PFID(ll_inode2fid(inode)), inode, retval, retval,
2568 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2570 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2571 retval = ll_glimpse_size(inode);
2574 eof = i_size_read(inode);
2577 retval = ll_generic_file_llseek_size(file, offset, origin,
2578 ll_file_maxbytes(inode), eof);
/*
 * .flush entry point (called on every close of an fd).  Collect async
 * writeback errors recorded on the inode and its cl_object and report
 * -EIO once; errors already reported through fd_write_failed are not
 * reported a second time.
 */
2582 static int ll_flush(struct file *file, fl_owner_t id)
2584 struct inode *inode = file->f_dentry->d_inode;
2585 struct ll_inode_info *lli = ll_i2info(inode);
2586 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2589 LASSERT(!S_ISDIR(inode->i_mode));
2591 /* catch async errors that were recorded back when async writeback
2592 * failed for pages in this mapping. */
2593 rc = lli->lli_async_rc;
2594 lli->lli_async_rc = 0;
2595 if (lli->lli_clob != NULL) {
2596 err = lov_read_and_clear_async_rc(lli->lli_clob);
2601 /* The application has been told write failure already.
2602 * Do not report failure again. */
2603 if (fd->fd_write_failed)
2605 return rc ? -EIO : 0;
2609 * Called to make sure a portion of file has been written out.
2610 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2612 * Return how many pages have been written.
/*
 * Write out [start, end] of @inode via a CIT_FSYNC cl_io (see doc comment
 * above).  @mode must be one of CL_FSYNC_{NONE,LOCAL,DISCARD,ALL}; modes
 * other than CL_FSYNC_LOCAL reach the OSTs.  Returns the number of pages
 * written on success (fi_nr_written), or a negative error.
 */
2614 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2615 enum cl_fsync_mode mode, int ignore_layout)
2617 struct cl_env_nest nest;
2620 struct obd_capa *capa = NULL;
2621 struct cl_fsync_io *fio;
2625 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2626 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2629 env = cl_env_nested_get(&nest);
2631 RETURN(PTR_ERR(env));
2633 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2635 io = vvp_env_thread_io(env);
2636 io->ci_obj = ll_i2info(inode)->lli_clob;
2637 io->ci_ignore_layout = ignore_layout;
2639 /* initialize parameters for sync */
2640 fio = &io->u.ci_fsync;
2641 fio->fi_capa = capa;
2642 fio->fi_start = start;
2644 fio->fi_fid = ll_inode2fid(inode);
2645 fio->fi_mode = mode;
2646 fio->fi_nr_written = 0;
2648 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2649 result = cl_io_loop(env, io);
2651 result = io->ci_result;
/* success: report how many pages went out */
2653 result = fio->fi_nr_written;
2654 cl_io_fini(env, io);
2655 cl_env_nested_put(&nest, env);
2663 * When dentry is provided (the 'else' case), *file->f_dentry may be
2664 * null and dentry must be used directly rather than pulled from
2665 * *file->f_dentry as is done otherwise.
#ifdef HAVE_FILE_FSYNC_4ARGS
/*
 * .fsync entry point; the three #ifdef variants absorb the prototype
 * changes across kernel versions (see the comment above about f_dentry
 * possibly being NULL in the 3-arg form).  Sequence: wait for in-flight
 * page writeback, collect recorded async errors, md_fsync() the MDT-side
 * metadata, then for regular files push data to the OSTs with
 * cl_sync_file_range(CL_FSYNC_ALL) and track fd_write_failed.
 */
2669 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2671 struct dentry *dentry = file->f_dentry;
2672 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2673 int ll_fsync(struct file *file, int datasync)
2675 struct dentry *dentry = file->f_dentry;
2677 loff_t end = LLONG_MAX;
2679 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2682 loff_t end = LLONG_MAX;
2684 struct inode *inode = dentry->d_inode;
2685 struct ll_inode_info *lli = ll_i2info(inode);
2686 struct ptlrpc_request *req;
2687 struct obd_capa *oc;
2691 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2692 PFID(ll_inode2fid(inode)), inode);
2693 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2695 #ifdef HAVE_FILE_FSYNC_4ARGS
2696 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2697 mutex_lock(&inode->i_mutex);
2699 /* fsync's caller has already called _fdata{sync,write}, we want
2700 * that IO to finish before calling the osc and mdc sync methods */
2701 rc = filemap_fdatawait(inode->i_mapping);
2704 /* catch async errors that were recorded back when async writeback
2705 * failed for pages in this mapping. */
2706 if (!S_ISDIR(inode->i_mode)) {
2707 err = lli->lli_async_rc;
2708 lli->lli_async_rc = 0;
2711 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* sync the metadata on the MDT */
2716 oc = ll_mdscapa_get(inode);
2717 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2723 ptlrpc_req_finished(req);
2725 if (S_ISREG(inode->i_mode)) {
2726 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2728 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2729 if (rc == 0 && err < 0)
2732 fd->fd_write_failed = true;
2734 fd->fd_write_failed = false;
2737 #ifdef HAVE_FILE_FSYNC_4ARGS
2738 mutex_unlock(&inode->i_mutex);
/*
 * Apply, release or test an advisory lock (BSD flock or POSIX fcntl lock)
 * on a Lustre file by enqueueing an LDLM_FLOCK lock on the MDS, then
 * mirroring the result into the local VFS lock state.
 * NOTE(review): parts of this function are not visible in this chunk
 * (several declarations and switch labels are elided).
 */
2744 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2746 struct inode *inode = file->f_dentry->d_inode;
2747 struct ll_sb_info *sbi = ll_i2sbi(inode);
2748 struct ldlm_enqueue_info einfo = {
2749 .ei_type = LDLM_FLOCK,
2750 .ei_cb_cp = ldlm_flock_completion_ast,
2751 .ei_cbdata = file_lock,
2753 struct md_op_data *op_data;
2754 struct lustre_handle lockh = {0};
2755 ldlm_policy_data_t flock = {{0}};
2756 int fl_type = file_lock->fl_type;
2762 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
2763 PFID(ll_inode2fid(inode)), file_lock);
2765 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2767 if (file_lock->fl_flags & FL_FLOCK) {
2768 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2769 /* flocks are whole-file locks */
2770 flock.l_flock.end = OFFSET_MAX;
2771 /* For flocks owner is determined by the local file descriptor */
2772 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2773 } else if (file_lock->fl_flags & FL_POSIX) {
2774 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2775 flock.l_flock.start = file_lock->fl_start;
2776 flock.l_flock.end = file_lock->fl_end;
2780 flock.l_flock.pid = file_lock->fl_pid;
2782 /* Somewhat ugly workaround for svc lockd.
2783 * lockd installs custom fl_lmops->lm_compare_owner that checks
2784 * for the fl_owner to be the same (which it always is on local node
2785 * I guess between lockd processes) and then compares pid.
2786 * As such we assign pid to the owner field to make it all work,
2787 * conflict with normal locks is unlikely since pid space and
2788 * pointer space for current->files are not intersecting */
2789 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2790 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* Map the fcntl lock type to an LDLM lock mode (read/unlock/write). */
2794 einfo.ei_mode = LCK_PR;
2797 /* An unlock request may or may not have any relation to
2798 * existing locks so we may not be able to pass a lock handle
2799 * via a normal ldlm_lock_cancel() request. The request may even
2800 * unlock a byte range in the middle of an existing lock. In
2801 * order to process an unlock request we need all of the same
2802 * information that is given with a normal read or write record
2803 * lock request. To avoid creating another ldlm unlock (cancel)
2804 * message we'll treat a LCK_NL flock request as an unlock. */
2805 einfo.ei_mode = LCK_NL;
2808 einfo.ei_mode = LCK_PW;
2811 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* Map the fcntl command to LDLM enqueue flags (non-blocking / test). */
2826 flags = LDLM_FL_BLOCK_NOWAIT;
2832 flags = LDLM_FL_TEST_LOCK;
2835 CERROR("unknown fcntl lock command: %d\n", cmd);
2839 /* Save the old mode so that if the mode in the lock changes we
2840 * can decrement the appropriate reader or writer refcount. */
2841 file_lock->fl_type = einfo.ei_mode;
2843 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2844 LUSTRE_OPC_ANY, NULL);
2845 if (IS_ERR(op_data))
2846 RETURN(PTR_ERR(op_data));
2848 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags="LPX64", mode=%u, "
2849 "start="LPU64", end="LPU64"\n", PFID(ll_inode2fid(inode)),
2850 flock.l_flock.pid, flags, einfo.ei_mode,
2851 flock.l_flock.start, flock.l_flock.end);
/* Enqueue the flock on the MDS; this may block for F_SETLKW. */
2853 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data, &lockh,
2856 /* Restore the file lock type if not TEST lock. */
2857 if (!(flags & LDLM_FL_TEST_LOCK))
2858 file_lock->fl_type = fl_type;
/* Mirror a successful server-side lock (or unlock) into the local
 * VFS lock lists so the kernel's bookkeeping stays consistent. */
2860 if ((file_lock->fl_flags & FL_FLOCK) &&
2861 (rc == 0 || file_lock->fl_type == F_UNLCK))
2862 rc2 = flock_lock_file_wait(file, file_lock);
2863 if ((file_lock->fl_flags & FL_POSIX) &&
2864 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2865 !(flags & LDLM_FL_TEST_LOCK))
2866 rc2 = posix_lock_file_wait(file, file_lock);
2868 if (rc2 && file_lock->fl_type != F_UNLCK) {
/* Local bookkeeping failed: release the server lock we just took
 * by enqueueing it again in LCK_NL (unlock) mode. */
2869 einfo.ei_mode = LCK_NL;
2870 md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data,
2875 ll_finish_md_op_data(op_data);
/*
 * Look up the FID of entry \a name (length \a namelen) under directory
 * \a parent via an MDS getattr-by-name RPC and store it in \a fid.
 * Returns 0 on success or a negative errno.
 */
2880 int ll_get_fid_by_name(struct inode *parent, const char *name,
2881 int namelen, struct lu_fid *fid)
2883 struct md_op_data *op_data = NULL;
2884 struct mdt_body *body;
2885 struct ptlrpc_request *req;
2889 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
2890 LUSTRE_OPC_ANY, NULL);
2891 if (IS_ERR(op_data))
2892 RETURN(PTR_ERR(op_data));
/* Only the FID is needed from the reply. */
2894 op_data->op_valid = OBD_MD_FLID;
2895 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
2896 ll_finish_md_op_data(op_data);
2900 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
2902 GOTO(out_req, rc = -EFAULT);
2904 *fid = body->mbo_fid1;
2906 ptlrpc_req_finished(req);
/*
 * Migrate directory entry \a name under \a parent to MDT \a mdtidx by
 * issuing a rename-to-self with CLI_MIGRATE set. The child's dcache
 * aliases are invalidated first so stale dentries are not reused.
 * Returns 0 on success (including the already-on-target case) or a
 * negative errno.
 */
2910 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
2911 const char *name, int namelen)
2913 struct dentry *dchild = NULL;
2914 struct inode *child_inode = NULL;
2915 struct md_op_data *op_data;
2916 struct ptlrpc_request *request = NULL;
2921 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
2922 name, PFID(ll_inode2fid(parent)), mdtidx);
2924 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
2925 0, LUSTRE_OPC_ANY, NULL);
2926 if (IS_ERR(op_data))
2927 RETURN(PTR_ERR(op_data));
2929 /* Get child FID first: prefer the cached dentry, fall back to an
2930 * MDS lookup below if the child is not in the dcache. */
2930 qstr.hash = full_name_hash(name, namelen);
2933 dchild = d_lookup(file->f_dentry, &qstr);
2934 if (dchild != NULL) {
2935 if (dchild->d_inode != NULL) {
2936 child_inode = igrab(dchild->d_inode);
2937 if (child_inode != NULL) {
/* Hold i_mutex across the migration so concurrent users of the
 * child see a consistent state; dropped in the exit path below. */
2938 mutex_lock(&child_inode->i_mutex);
2939 op_data->op_fid3 = *ll_inode2fid(child_inode);
2940 ll_invalidate_aliases(child_inode);
2945 rc = ll_get_fid_by_name(parent, name, namelen,
2951 if (!fid_is_sane(&op_data->op_fid3)) {
2952 CERROR("%s: migrate %s , but fid "DFID" is insane\n",
2953 ll_get_fsname(parent->i_sb, NULL, 0), name,
2954 PFID(&op_data->op_fid3));
2955 GOTO(out_free, rc = -EINVAL);
/* Nothing to do if the child already lives on the target MDT. */
2958 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
2963 CDEBUG(D_INFO, "%s:"DFID" is already on MDT%d.\n", name,
2964 PFID(&op_data->op_fid3), mdtidx);
2965 GOTO(out_free, rc = 0);
2968 op_data->op_mds = mdtidx;
2969 op_data->op_cli_flags = CLI_MIGRATE;
/* Rename the entry onto itself; CLI_MIGRATE turns this into a
 * cross-MDT migration on the server side. */
2970 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
2971 namelen, name, namelen, &request);
2973 ll_update_times(request, parent);
2975 ptlrpc_req_finished(request);
2980 if (child_inode != NULL) {
/* Drop the (now stale) local inode; a fresh one will be fetched
 * for the migrated object. */
2981 clear_nlink(child_inode);
2982 mutex_unlock(&child_inode->i_mutex);
2986 ll_finish_md_op_data(op_data);
/* Stub lock method used by ll_file_operations_noflock (-o noflock);
 * see the operations table below. Body not visible in this chunk. */
2991 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2999 * test if some locks matching bits and l_req_mode are acquired
3000 * - bits can be in different locks
3001 * - if found clear the common lock bits in *bits
3002 * - the bits not found, are kept in *bits
3004 * \param bits [IN] searched lock bits
3005 * \param l_req_mode [IN] searched lock mode
3006 * \retval boolean, true iff all bits are found
3008 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
3010 struct lustre_handle lockh;
3011 ldlm_policy_data_t policy;
/* LCK_MINMODE means "any mode": match against all of CR/CW/PR/PW. */
3012 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
3013 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
3022 fid = &ll_i2info(inode)->lli_fid;
3023 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3024 ldlm_lockname[mode]);
/* TEST_LOCK: only probe for a matching lock, never take a reference. */
3026 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* Probe each requested inodebit separately; clear from *bits every
 * bit that is covered by some granted lock. */
3027 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3028 policy.l_inodebits.bits = *bits & (1 << i);
3029 if (policy.l_inodebits.bits == 0)
3032 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3033 &policy, mode, &lockh)) {
3034 struct ldlm_lock *lock;
3036 lock = ldlm_handle2lock(&lockh);
3039 ~(lock->l_policy_data.l_inodebits.bits);
3040 LDLM_LOCK_PUT(lock);
3042 *bits &= ~policy.l_inodebits.bits;
/*
 * Try to match (and take a reference on) a granted MD lock covering
 * \a bits on \a inode. On success the matched handle is stored in
 * \a lockh and the granted mode is returned; 0 means no match.
 */
3049 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
3050 struct lustre_handle *lockh, __u64 flags,
3053 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
3058 fid = &ll_i2info(inode)->lli_fid;
3059 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3061 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3062 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the result of an inode revalidation RPC:
 * -ENOENT on a non-regular/non-directory inode is tolerated (object
 * already unlinked); other errors are logged and propagated.
 */
3067 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3069 /* Already unlinked. Just update nlink and return success */
3070 if (rc == -ENOENT) {
3072 /* This path cannot be hit for regular files unless in
3073 * case of obscure races, so no need to validate
3075 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3077 } else if (rc != 0) {
/* -EACCES/-EIDRM are expected (permission/capa) and logged quietly. */
3078 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3079 "%s: revalidate FID "DFID" error: rc = %d\n",
3080 ll_get_fsname(inode->i_sb, NULL, 0),
3081 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate the MD attributes of \a dentry's inode covered by the
 * inodebits in \a ibits. If the server supports getattr-by-FID
 * (OBD_CONNECT_ATTRFID) an intent lock is used; otherwise a plain
 * getattr RPC is sent, but only when no matching MD lock is cached.
 */
3087 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3089 struct inode *inode = dentry->d_inode;
3090 struct ptlrpc_request *req = NULL;
3091 struct obd_export *exp;
3095 LASSERT(inode != NULL);
3097 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3098 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3100 exp = ll_i2mdexp(inode);
3102 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3103 * But under CMD case, it caused some lock issues, should be fixed
3104 * with new CMD ibits lock. See bug 12718 */
3105 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3106 struct lookup_intent oit = { .it_op = IT_GETATTR };
3107 struct md_op_data *op_data;
/* LOOKUP-only revalidation can use the cheaper IT_LOOKUP intent. */
3109 if (ibits == MDS_INODELOCK_LOOKUP)
3110 oit.it_op = IT_LOOKUP;
3112 /* Call getattr by fid, so do not provide name at all. */
3113 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3114 dentry->d_inode, NULL, 0, 0,
3115 LUSTRE_OPC_ANY, NULL);
3116 if (IS_ERR(op_data))
3117 RETURN(PTR_ERR(op_data));
3119 rc = md_intent_lock(exp, op_data, &oit, &req,
3120 &ll_md_blocking_ast, 0);
3121 ll_finish_md_op_data(op_data);
3123 rc = ll_inode_revalidate_fini(inode, rc);
3127 rc = ll_revalidate_it_finish(req, &oit, dentry);
3129 ll_intent_release(&oit);
3133 /* Unlinked? Unhash dentry, so it is not picked up later by
3134 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3135 here to preserve get_cwd functionality on 2.6.
3137 if (!dentry->d_inode->i_nlink)
3138 d_lustre_invalidate(dentry, 0);
3140 ll_lookup_finish_locks(&oit, dentry);
3141 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3142 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3143 u64 valid = OBD_MD_FLGETATTR;
3144 struct md_op_data *op_data;
/* Regular files also need the striping EA sized into the reply. */
3147 if (S_ISREG(inode->i_mode)) {
3148 rc = ll_get_default_mdsize(sbi, &ealen);
3151 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3154 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3155 0, ealen, LUSTRE_OPC_ANY,
3157 if (IS_ERR(op_data))
3158 RETURN(PTR_ERR(op_data));
3160 op_data->op_valid = valid;
3161 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
3162 * capa for this inode. Because we only keep capas of dirs
3164 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3165 ll_finish_md_op_data(op_data);
3167 rc = ll_inode_revalidate_fini(inode, rc);
3171 rc = ll_prep_inode(&inode, req, NULL, NULL);
3174 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge the attributes of all stripes (via
 * md_merge_attr) and apply the result (nlink, blocks, size, times)
 * to the local inode / ll_inode_info.
 */
3178 static int ll_merge_md_attr(struct inode *inode)
3180 struct cl_attr attr = { 0 };
3183 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3184 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3185 &attr, ll_md_blocking_ast);
3189 set_nlink(inode, attr.cat_nlink);
3190 inode->i_blocks = attr.cat_blocks;
3191 i_size_write(inode, attr.cat_size);
/* Times are cached in lli_* and copied to the inode by the caller. */
3193 ll_i2info(inode)->lli_atime = attr.cat_atime;
3194 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
3195 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/*
 * Full revalidation: refresh MD attributes, then — for regular files —
 * glimpse the size from the OSTs; for striped directories merge the
 * per-stripe attributes instead.
 */
3201 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3203 struct inode *inode = dentry->d_inode;
3207 rc = __ll_inode_revalidate(dentry, ibits);
3211 /* if object isn't regular file, don't validate size */
3212 if (!S_ISREG(inode->i_mode)) {
3213 if (S_ISDIR(inode->i_mode) &&
3214 ll_i2info(inode)->lli_lsm_md != NULL) {
3215 rc = ll_merge_md_attr(inode);
/* Copy the cached MD times into the VFS inode. */
3220 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
3221 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
3222 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
3224 /* In case of restore, the MDT has the right size and has
3225 * already send it back without granting the layout lock,
3226 * inode is up-to-date so glimpse is useless.
3227 * Also to glimpse we need the layout, in case of a running
3228 * restore the MDT holds the layout lock so the glimpse will
3229 * block up to the end of restore (getattr will block)
3231 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3232 rc = ll_glimpse_size(inode);
/*
 * VFS ->getattr(): revalidate UPDATE|LOOKUP bits, then fill *stat
 * from the (now fresh) inode fields.
 */
3237 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3239 struct inode *inode = de->d_inode;
3240 struct ll_sb_info *sbi = ll_i2sbi(inode);
3241 struct ll_inode_info *lli = ll_i2info(inode);
3244 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3245 MDS_INODELOCK_LOOKUP);
3246 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3251 stat->dev = inode->i_sb->s_dev;
/* 32-bit userspace needs an inode number that fits in 32 bits. */
3252 if (ll_need_32bit_api(sbi))
3253 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3255 stat->ino = inode->i_ino;
3256 stat->mode = inode->i_mode;
3257 stat->uid = inode->i_uid;
3258 stat->gid = inode->i_gid;
3259 stat->rdev = inode->i_rdev;
3260 stat->atime = inode->i_atime;
3261 stat->mtime = inode->i_mtime;
3262 stat->ctime = inode->i_ctime;
3263 stat->blksize = 1 << inode->i_blkbits;
3265 stat->nlink = inode->i_nlink;
3266 stat->size = i_size_read(inode);
3267 stat->blocks = inode->i_blocks;
/*
 * VFS ->fiemap(): marshal the kernel fiemap_extent_info into an
 * on-wire struct fiemap, run ll_do_fiemap(), and copy the mapped
 * extents back to the user buffer.
 */
3272 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3273 __u64 start, __u64 len)
3277 struct fiemap *fiemap;
3278 unsigned int extent_count = fieinfo->fi_extents_max;
3280 num_bytes = sizeof(*fiemap) + (extent_count *
3281 sizeof(struct fiemap_extent));
3282 OBD_ALLOC_LARGE(fiemap, num_bytes);
3287 fiemap->fm_flags = fieinfo->fi_flags;
3288 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3289 fiemap->fm_start = start;
3290 fiemap->fm_length = len;
/* Seed only the first extent from userspace; presumably used to
 * continue a previous mapping — confirm against ll_do_fiemap(). */
3291 if (extent_count > 0 &&
3292 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3293 sizeof(struct fiemap_extent)) != 0)
3294 GOTO(out, rc = -EFAULT);
3296 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3298 fieinfo->fi_flags = fiemap->fm_flags;
3299 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3300 if (extent_count > 0 &&
3301 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3302 fiemap->fm_mapped_extents *
3303 sizeof(struct fiemap_extent)) != 0)
3304 GOTO(out, rc = -EFAULT);
3306 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * Return a referenced copy of the cached POSIX ACL of \a inode.
 * \a type is unused here; the cached lli_posix_acl is returned for
 * any type.
 */
3310 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3312 struct ll_inode_info *lli = ll_i2info(inode);
3313 struct posix_acl *acl = NULL;
3316 spin_lock(&lli->lli_lock);
3317 /* VFS' acl_permission_check->check_acl will release the refcount */
3318 acl = posix_acl_dup(lli->lli_posix_acl);
3319 spin_unlock(&lli->lli_lock);
3324 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
/* Compat ACL check callback for older generic_permission() variants:
 * fetch the cached ACL and run posix_acl_permission() against it. */
3326 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3327 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3329 ll_check_acl(struct inode *inode, int mask)
3332 # ifdef CONFIG_FS_POSIX_ACL
3333 struct posix_acl *acl;
3337 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
/* Cannot sleep under RCU walk; bail out (return value elided here). */
3338 if (flags & IPERM_FLAG_RCU)
3341 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3346 rc = posix_acl_permission(inode, acl, mask);
3347 posix_acl_release(acl);
3350 # else /* !CONFIG_FS_POSIX_ACL */
3352 # endif /* CONFIG_FS_POSIX_ACL */
3354 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
3356 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
/*
 * VFS ->permission(): revalidate the root inode on first access,
 * apply root-squash by temporarily overriding the task credentials,
 * then delegate to remote-perm check or generic_permission().
 */
3357 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3359 # ifdef HAVE_INODE_PERMISION_2ARGS
3360 int ll_inode_permission(struct inode *inode, int mask)
3362 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3367 struct ll_sb_info *sbi;
3368 struct root_squash_info *squash;
3369 struct cred *cred = NULL;
3370 const struct cred *old_cred = NULL;
3372 bool squash_id = false;
/* Permission checks may block (RPCs); refuse RCU-walk mode. */
3375 #ifdef MAY_NOT_BLOCK
3376 if (mask & MAY_NOT_BLOCK)
3378 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3379 if (flags & IPERM_FLAG_RCU)
3383 /* as root inode are NOT getting validated in lookup operation,
3384 * need to do it before permission check. */
3386 if (inode == inode->i_sb->s_root->d_inode) {
3387 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3388 MDS_INODELOCK_LOOKUP);
3393 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3394 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3396 /* squash fsuid/fsgid if needed */
3397 sbi = ll_i2sbi(inode);
3398 squash = &sbi->ll_squash;
3399 if (unlikely(squash->rsi_uid != 0 &&
3400 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
3401 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
3405 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
3406 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
3407 squash->rsi_uid, squash->rsi_gid);
3409 /* update current process's credentials
3410 * and FS capability */
3411 cred = prepare_creds();
3415 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
3416 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* Drop all filesystem-related capabilities from the squashed creds. */
3417 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
3418 if ((1 << cap) & CFS_CAP_FS_MASK)
3419 cap_lower(cred->cap_effective, cap);
3421 old_cred = override_creds(cred);
3424 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
3426 if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
3427 rc = lustre_check_remote_perm(inode, mask);
3429 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3431 /* restore current process's credentials and FS capability */
3433 revert_creds(old_cred);
3440 /* -o localflock - only provides locally consistent flock locks;
 * no .flock/.lock methods, so the kernel handles flocks node-locally */
3441 struct file_operations ll_file_operations = {
3442 .read = ll_file_read,
3443 .aio_read = ll_file_aio_read,
3444 .write = ll_file_write,
3445 .aio_write = ll_file_aio_write,
3446 .unlocked_ioctl = ll_file_ioctl,
3447 .open = ll_file_open,
3448 .release = ll_file_release,
3449 .mmap = ll_file_mmap,
3450 .llseek = ll_file_seek,
3451 .splice_read = ll_file_splice_read,
/* Default (-o flock): cluster-coherent flock/POSIX locks through
 * ll_file_flock(). */
3456 struct file_operations ll_file_operations_flock = {
3457 .read = ll_file_read,
3458 .aio_read = ll_file_aio_read,
3459 .write = ll_file_write,
3460 .aio_write = ll_file_aio_write,
3461 .unlocked_ioctl = ll_file_ioctl,
3462 .open = ll_file_open,
3463 .release = ll_file_release,
3464 .mmap = ll_file_mmap,
3465 .llseek = ll_file_seek,
3466 .splice_read = ll_file_splice_read,
3469 .flock = ll_file_flock,
3470 .lock = ll_file_flock
3473 /* These are for -o noflock - to return ENOSYS on flock calls */
3474 struct file_operations ll_file_operations_noflock = {
3475 .read = ll_file_read,
3476 .aio_read = ll_file_aio_read,
3477 .write = ll_file_write,
3478 .aio_write = ll_file_aio_write,
3479 .unlocked_ioctl = ll_file_ioctl,
3480 .open = ll_file_open,
3481 .release = ll_file_release,
3482 .mmap = ll_file_mmap,
3483 .llseek = ll_file_seek,
3484 .splice_read = ll_file_splice_read,
3487 .flock = ll_file_noflock,
3488 .lock = ll_file_noflock
/* Inode operations for regular Lustre files. */
3491 struct inode_operations ll_file_inode_operations = {
3492 .setattr = ll_setattr,
3493 .getattr = ll_getattr,
3494 .permission = ll_inode_permission,
3495 .setxattr = ll_setxattr,
3496 .getxattr = ll_getxattr,
3497 .listxattr = ll_listxattr,
3498 .removexattr = ll_removexattr,
3499 .fiemap = ll_fiemap,
3500 #ifdef HAVE_IOP_GET_ACL
3501 .get_acl = ll_get_acl,
3505 /* dynamic ioctl number support routines */
/* Registry of dynamically registered ioctl handlers, protected by an
 * rwsem: readers dispatch ioctls, writers register/unregister. */
3506 static struct llioc_ctl_data {
3507 struct rw_semaphore ioc_sem;
3508 struct list_head ioc_head;
3510 __RWSEM_INITIALIZER(llioc.ioc_sem),
3511 LIST_HEAD_INIT(llioc.ioc_head)
/* One registered handler: callback plus the ioctl commands it claims
 * (iocd_cmd is a flexible trailing array of iocd_count entries). */
3516 struct list_head iocd_list;
3517 unsigned int iocd_size;
3518 llioc_callback_t iocd_cb;
3519 unsigned int iocd_count;
3520 unsigned int iocd_cmd[0];
/*
 * Register callback \a cb for \a count dynamic ioctl commands in \a cmd.
 * Returns an opaque cookie (the allocated record) for later
 * ll_iocontrol_unregister(), or NULL on bad args / allocation failure.
 */
3523 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3526 struct llioc_data *in_data = NULL;
3529 if (cb == NULL || cmd == NULL ||
3530 count > LLIOC_MAX_CMD || count < 0)
3533 size = sizeof(*in_data) + count * sizeof(unsigned int);
3534 OBD_ALLOC(in_data, size);
3535 if (in_data == NULL)
3538 memset(in_data, 0, sizeof(*in_data));
3539 in_data->iocd_size = size;
3540 in_data->iocd_cb = cb;
3541 in_data->iocd_count = count;
3542 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3544 down_write(&llioc.ioc_sem);
3545 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3546 up_write(&llioc.ioc_sem);
/*
 * Unregister and free the handler identified by the cookie \a magic
 * previously returned by ll_iocontrol_register(). Warns if the cookie
 * is not found.
 */
3551 void ll_iocontrol_unregister(void *magic)
3553 struct llioc_data *tmp;
3558 down_write(&llioc.ioc_sem);
3559 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3561 unsigned int size = tmp->iocd_size;
3563 list_del(&tmp->iocd_list);
3564 up_write(&llioc.ioc_sem);
3566 OBD_FREE(tmp, size);
3570 up_write(&llioc.ioc_sem);
3572 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3575 EXPORT_SYMBOL(ll_iocontrol_register);
3576 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch dynamic ioctl \a cmd to registered handlers in registration
 * order; stop at the first handler that returns LLIOC_STOP. The
 * handler's result code is returned through \a rcp.
 */
3578 static enum llioc_iter
3579 ll_iocontrol_call(struct inode *inode, struct file *file,
3580 unsigned int cmd, unsigned long arg, int *rcp)
3582 enum llioc_iter ret = LLIOC_CONT;
3583 struct llioc_data *data;
3584 int rc = -EINVAL, i;
3586 down_read(&llioc.ioc_sem);
3587 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3588 for (i = 0; i < data->iocd_count; i++) {
3589 if (cmd != data->iocd_cmd[i])
3592 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3596 if (ret == LLIOC_STOP)
3599 up_read(&llioc.ioc_sem);
/*
 * Push a layout configuration \a conf into the cl_object of \a inode.
 * For OBJECT_CONF_SET, also allow the layout lock to be matched and
 * record the new layout generation in the inode info.
 */
3606 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3608 struct ll_inode_info *lli = ll_i2info(inode);
3609 struct cl_object *obj = lli->lli_clob;
3610 struct cl_env_nest nest;
3618 env = cl_env_nested_get(&nest);
3620 RETURN(PTR_ERR(env));
3622 rc = cl_conf_set(env, lli->lli_clob, conf);
3626 if (conf->coc_opc == OBJECT_CONF_SET) {
3627 struct ldlm_lock *lock = conf->coc_lock;
3628 struct cl_layout cl = {
3632 LASSERT(lock != NULL);
3633 LASSERT(ldlm_has_layout(lock));
3635 /* it can only be allowed to match after layout is
3636 * applied to inode otherwise false layout would be
3637 * seen. Applying layout should happen before dropping
3638 * the intent lock. */
3639 ldlm_lock_allow_match(lock);
3641 rc = cl_object_layout_get(env, obj, &cl);
3646 DFID": layout version change: %u -> %u\n",
3647 PFID(&lli->lli_fid), ll_layout_version_get(lli),
3649 ll_layout_version_set(lli, cl.cl_layout_gen);
3653 cl_env_nested_put(&nest, env);
3658 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
3659 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3662 struct ll_sb_info *sbi = ll_i2sbi(inode);
3663 struct obd_capa *oc;
3664 struct ptlrpc_request *req;
3665 struct mdt_body *body;
3672 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3673 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
3674 lock->l_lvb_data, lock->l_lvb_len);
/* Layout already present in the lock's LVB: nothing to fetch. */
3676 if ((lock->l_lvb_data != NULL) && ldlm_is_lvb_ready(lock))
3679 /* if layout lock was granted right away, the layout is returned
3680 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3681 * blocked and then granted via completion ast, we have to fetch
3682 * layout here. Please note that we can't use the LVB buffer in
3683 * completion AST because it doesn't have a large enough buffer */
3684 oc = ll_mdscapa_get(inode);
3685 rc = ll_get_default_mdsize(sbi, &lmmsize);
3687 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3688 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3694 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3696 GOTO(out, rc = -EPROTO);
3698 lmmsize = body->mbo_eadatasize;
3699 if (lmmsize == 0) /* empty layout */
3702 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3704 GOTO(out, rc = -EFAULT);
/* Copy the fetched LOV EA into a fresh buffer and install it as the
 * lock's LVB, replacing any stale buffer under the resource lock. */
3706 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3707 if (lvbdata == NULL)
3708 GOTO(out, rc = -ENOMEM);
3710 memcpy(lvbdata, lmm, lmmsize);
3711 lock_res_and_lock(lock);
3712 if (lock->l_lvb_data != NULL)
3713 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3715 lock->l_lvb_data = lvbdata;
3716 lock->l_lvb_len = lmmsize;
3717 unlock_res_and_lock(lock);
3722 ptlrpc_req_finished(req);
3727 * Apply the layout to the inode. Layout lock is held and will be released
3730 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3731 struct inode *inode)
3733 struct ll_inode_info *lli = ll_i2info(inode);
3734 struct ll_sb_info *sbi = ll_i2sbi(inode);
3735 struct ldlm_lock *lock;
3736 struct lustre_md md = { NULL };
3737 struct cl_object_conf conf;
3740 bool wait_layout = false;
3743 LASSERT(lustre_handle_is_used(lockh));
3745 lock = ldlm_handle2lock(lockh);
3746 LASSERT(lock != NULL);
3747 LASSERT(ldlm_has_layout(lock));
3749 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
3750 PFID(&lli->lli_fid), inode);
3752 /* in case this is a caching lock and reinstate with new inode */
3753 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3755 lock_res_and_lock(lock);
3756 lvb_ready = ldlm_is_lvb_ready(lock);
3757 unlock_res_and_lock(lock);
3758 /* checking lvb_ready is racy but this is okay. The worst case is
3759 * that multi processes may configure the file on the same time. */
/* Fetch the layout into the lock's LVB if it is not there yet. */
3764 rc = ll_layout_fetch(inode, lock);
3768 /* for layout lock, lmm is returned in lock's lvb.
3769 * lvb_data is immutable if the lock is held so it's safe to access it
3770 * without res lock. See the description in ldlm_lock_decref_internal()
3771 * for the condition to free lvb_data of layout lock */
3772 if (lock->l_lvb_data != NULL) {
3773 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3774 lock->l_lvb_data, lock->l_lvb_len);
3776 CERROR("%s: file "DFID" unpackmd error: %d\n",
3777 ll_get_fsname(inode->i_sb, NULL, 0),
3778 PFID(&lli->lli_fid), rc);
3782 LASSERTF(md.lsm != NULL, "lvb_data = %p, lvb_len = %u\n",
3783 lock->l_lvb_data, lock->l_lvb_len);
3788 /* set layout to file. Unlikely this will fail as old layout was
3789 * surely eliminated */
3790 memset(&conf, 0, sizeof conf);
3791 conf.coc_opc = OBJECT_CONF_SET;
3792 conf.coc_inode = inode;
3793 conf.coc_lock = lock;
3794 conf.u.coc_md = &md;
3795 rc = ll_layout_conf(inode, &conf);
3798 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3800 /* refresh layout failed, need to wait */
3801 wait_layout = rc == -EBUSY;
3805 LDLM_LOCK_PUT(lock);
3806 ldlm_lock_decref(lockh, mode);
3808 /* wait for IO to complete if it's still being used. */
3810 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
3811 ll_get_fsname(inode->i_sb, NULL, 0),
3812 PFID(&lli->lli_fid), inode);
/* OBJECT_CONF_WAIT blocks until in-flight IO under the old layout
 * finishes so the new layout can be applied safely. */
3814 memset(&conf, 0, sizeof conf);
3815 conf.coc_opc = OBJECT_CONF_WAIT;
3816 conf.coc_inode = inode;
3817 rc = ll_layout_conf(inode, &conf);
3821 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
3822 ll_get_fsname(inode->i_sb, NULL, 0),
3823 PFID(&lli->lli_fid), rc);
/*
 * Refresh the layout of \a inode with lli_layout_mutex held by the
 * caller: first try to match a cached layout lock, otherwise enqueue
 * a new IT_LAYOUT intent lock on the MDS and apply the returned
 * layout via ll_layout_lock_set().
 */
3828 static int ll_layout_refresh_locked(struct inode *inode)
3830 struct ll_inode_info *lli = ll_i2info(inode);
3831 struct ll_sb_info *sbi = ll_i2sbi(inode);
3832 struct md_op_data *op_data;
3833 struct lookup_intent it;
3834 struct lustre_handle lockh;
3836 struct ldlm_enqueue_info einfo = {
3837 .ei_type = LDLM_IBITS,
3839 .ei_cb_bl = &ll_md_blocking_ast,
3840 .ei_cb_cp = &ldlm_completion_ast,
3846 /* mostly layout lock is caching on the local side, so try to match
3847 * it before grabbing layout lock mutex. */
3848 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3849 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3850 if (mode != 0) { /* hit cached lock */
3851 rc = ll_layout_lock_set(&lockh, mode, inode);
3858 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3859 0, 0, LUSTRE_OPC_ANY, NULL);
3860 if (IS_ERR(op_data))
3861 RETURN(PTR_ERR(op_data));
3863 /* have to enqueue one */
3864 memset(&it, 0, sizeof(it));
3865 it.it_op = IT_LAYOUT;
3866 lockh.cookie = 0ULL;
3868 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
3869 ll_get_fsname(inode->i_sb, NULL, 0),
3870 PFID(&lli->lli_fid), inode);
3872 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, &it, op_data, &lockh, 0);
/* The intent reply request is no longer needed once the lock handle
 * and mode have been extracted. */
3873 if (it.d.lustre.it_data != NULL)
3874 ptlrpc_req_finished(it.d.lustre.it_data);
3875 it.d.lustre.it_data = NULL;
3877 ll_finish_md_op_data(op_data);
3879 mode = it.d.lustre.it_lock_mode;
3880 it.d.lustre.it_lock_mode = 0;
3881 ll_intent_drop_lock(&it);
3884 /* set lock data in case this is a new lock */
3885 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3886 rc = ll_layout_lock_set(&lockh, mode, inode);
3895 * This function checks if there exists a LAYOUT lock on the client side,
3896 * or enqueues it if it doesn't have one in cache.
3898 * This function will not hold layout lock so it may be revoked any time after
3899 * this function returns. Any operations depend on layout should be redone
3902 * This function should be called before lov_io_init() to get an uptodate
3903 * layout version, the caller should save the version number and after IO
3904 * is finished, this function should be called again to verify that layout
3905 * is not changed during IO time.
3907 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3909 struct ll_inode_info *lli = ll_i2info(inode);
3910 struct ll_sb_info *sbi = ll_i2sbi(inode);
/* Fast path: layout locks disabled, or a valid generation is cached. */
3914 *gen = ll_layout_version_get(lli);
3915 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
3919 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3920 LASSERT(S_ISREG(inode->i_mode));
3922 /* take layout lock mutex to enqueue layout lock exclusively. */
3923 mutex_lock(&lli->lli_layout_mutex);
3925 rc = ll_layout_refresh_locked(inode);
3929 *gen = ll_layout_version_get(lli);
3931 mutex_unlock(&lli->lli_layout_mutex);
3937 * This function send a restore request to the MDT
3939 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
3941 struct hsm_user_request *hur;
3945 len = sizeof(struct hsm_user_request) +
3946 sizeof(struct hsm_user_item);
3947 OBD_ALLOC(hur, len);
3951 hur->hur_request.hr_action = HUA_RESTORE;
3952 hur->hur_request.hr_archive_id = 0;
3953 hur->hur_request.hr_flags = 0;
3954 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
3955 sizeof(hur->hur_user_item[0].hui_fid));
3956 hur->hur_user_item[0].hui_extent.offset = offset;
3957 hur->hur_user_item[0].hui_extent.length = length;
3958 hur->hur_request.hr_itemcount = 1;
3959 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,