4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2014, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <linux/pagemap.h>
46 #include <linux/file.h>
47 #include <linux/sched.h>
48 #include <linux/user_namespace.h>
49 #ifdef HAVE_UIDGID_HEADER
50 # include <linux/uidgid.h>
52 #include <lustre/ll_fiemap.h>
53 #include <lustre_ioctl.h>
55 #include "cl_object.h"
57 #include "llite_internal.h"
58 #include "vvp_internal.h"
/* Forward declarations for helpers defined later in this file. */
ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);

static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,

static enum llioc_iter
ll_iocontrol_call(struct inode *inode, struct file *file,
		  unsigned int cmd, unsigned long arg, int *rcp);
/*
 * Allocate a per-open-file ll_file_data from its slab cache.
 * GFP_NOFS avoids recursing back into the filesystem during reclaim.
 * NOTE(review): this excerpt elides the NULL check and return path —
 * confirm against the full source.
 */
static struct ll_file_data *ll_file_data_get(void)
	struct ll_file_data *fd;
	OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
	/* Fresh descriptor has seen no failed writes yet. */
	fd->fd_write_failed = false;
/* Release a ll_file_data previously obtained from ll_file_data_get(). */
static void ll_file_data_put(struct ll_file_data *fd)
	OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Pack the inode's current attributes and the given open file handle
 * into @op_data for an MDS request (used when preparing a close).
 */
void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
			  struct lustre_handle *fh)
	op_data->op_fid1 = ll_i2info(inode)->lli_fid;
	op_data->op_attr.ia_mode = inode->i_mode;
	op_data->op_attr.ia_atime = inode->i_atime;
	op_data->op_attr.ia_mtime = inode->i_mtime;
	op_data->op_attr.ia_ctime = inode->i_ctime;
	op_data->op_attr.ia_size = i_size_read(inode);
	op_data->op_attr_blocks = inode->i_blocks;
	op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
	op_data->op_handle = *fh;
	op_data->op_capa1 = ll_mdscapa_get(inode);
	/* Tell the MDS cached data was modified so the change is committed. */
	if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
		op_data->op_bias |= MDS_DATA_MODIFIED;
/*
 * Packs all the attributes into @op_data for the CLOSE rpc.
 */
static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
			     struct obd_client_handle *och)
	/* Timestamps and mode are always sent back on close. */
	op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
				    ATTR_MTIME | ATTR_MTIME_SET |
				    ATTR_CTIME | ATTR_CTIME_SET;
	/* NOTE(review): the branch body between this test and the |= below
	 * is elided in this excerpt — size/blocks validity presumably
	 * depends on holding a write handle; confirm in full source. */
	if (!(och->och_flags & FMODE_WRITE))
	op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
	ll_pack_inode2opdata(inode, op_data, &och->och_fh);
	ll_prep_md_op_data(op_data, inode, NULL, NULL,
			   0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Perform a close, possibly with a bias.
 * The meaning of "data" depends on the value of "bias".
 *
 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
 * swap layouts with (closing comment elided in this excerpt).
 */
static int ll_close_inode_openhandle(struct obd_export *md_exp,
				     struct obd_client_handle *och,
				     enum mds_op_bias bias,
	struct obd_export *exp = ll_i2mdexp(inode);
	struct md_op_data *op_data;
	struct ptlrpc_request *req = NULL;
	struct obd_device *obd = class_exp2obd(exp);

	/*
	 * XXX: in case of LMV, is this correct to access
	 * the MDC export directly? (original question preserved)
	 */
		CERROR("Invalid MDC connection handle "LPX64"\n",
		       ll_i2mdexp(inode)->exp_handle.h_cookie);

	OBD_ALLOC_PTR(op_data);
		/* XXX We leak openhandle and request here. */
		GOTO(out, rc = -ENOMEM);

	ll_prepare_close(inode, op_data, och);
	/* Bias-specific packing (enclosing switch elided in excerpt). */
	case MDS_CLOSE_LAYOUT_SWAP:
		LASSERT(data != NULL);
		op_data->op_bias |= MDS_CLOSE_LAYOUT_SWAP;
		op_data->op_data_version = 0;
		op_data->op_lease_handle = och->och_lease_handle;
		/* For layout swap, fid2 identifies the peer inode. */
		op_data->op_fid2 = *ll_inode2fid(data);
	case MDS_HSM_RELEASE:
		LASSERT(data != NULL);
		op_data->op_bias |= MDS_HSM_RELEASE;
		/* For HSM release, @data carries the expected data version. */
		op_data->op_data_version = *(__u64 *)data;
		op_data->op_lease_handle = och->och_lease_handle;
		op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
		/* Default case: a plain close carries no auxiliary data. */
		LASSERT(data == NULL);

	rc = md_close(md_exp, op_data, och->och_mod, &req);
		CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
		       ll_i2mdexp(inode)->exp_obd->obd_name,
		       PFID(ll_inode2fid(inode)), rc);

	/* DATA_MODIFIED flag was successfully sent on close, cancel data
	 * modification flag. */
	if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
		struct ll_inode_info *lli = ll_i2info(inode);
		spin_lock(&lli->lli_lock);
		lli->lli_flags &= ~LLIF_DATA_MODIFIED;
		spin_unlock(&lli->lli_lock);

	/* For intent closes, check whether the server executed the intent. */
	    op_data->op_bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)) {
		struct mdt_body *body;
		body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
		if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))

	ll_finish_md_op_data(op_data);

	md_clear_open_replay_data(md_exp, och);
	/* Poison the handle so stale users are detectable. */
	och->och_fh.cookie = DEAD_HANDLE_MAGIC;

	if (req) /* This is close request */
		ptlrpc_req_finished(req);
/*
 * Drop one reference on the MDS open handle matching @fmode and, if it
 * was the last user, send the actual close to the MDS.
 * NOTE(review): several interior lines are elided in this excerpt.
 */
int ll_md_real_close(struct inode *inode, fmode_t fmode)
	struct ll_inode_info *lli = ll_i2info(inode);
	struct obd_client_handle **och_p;
	struct obd_client_handle *och;

	/* Select the handle/usecount pair for the open mode. */
	if (fmode & FMODE_WRITE) {
		och_p = &lli->lli_mds_write_och;
		och_usecount = &lli->lli_open_fd_write_count;
	} else if (fmode & FMODE_EXEC) {
		och_p = &lli->lli_mds_exec_och;
		och_usecount = &lli->lli_open_fd_exec_count;
		/* Fallback branch (else body start elided): must be a read open. */
		LASSERT(fmode & FMODE_READ);
		och_p = &lli->lli_mds_read_och;
		och_usecount = &lli->lli_open_fd_read_count;

	mutex_lock(&lli->lli_och_mutex);
	if (*och_usecount > 0) {
		/* There are still users of this handle, so skip the real
		 * close (rest of original comment elided). */
		mutex_unlock(&lli->lli_och_mutex);
	mutex_unlock(&lli->lli_och_mutex);

	/* There might be a race and this handle may already be closed
	 * (rest of original comment elided). */
	rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
				       och, inode, 0, NULL);
/*
 * Per-file-descriptor close: release group lock and lease if held, drop
 * the mode's open-handle refcount, and talk to the MDS only when we do
 * not hold a good-enough OPEN DLM lock locally.
 */
static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
	ldlm_policy_data_t policy = {
		.l_inodebits = { MDS_INODELOCK_OPEN },
	__u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct ll_inode_info *lli = ll_i2info(inode);
	struct lustre_handle lockh;

	/* clear group lock, if present */
	if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
		ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);

	if (fd->fd_lease_och != NULL) {
		/* Usually the lease is not released when the
		 * application crashed, we need to release here. */
		rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
		CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
		       PFID(&lli->lli_fid), rc, lease_broken);
		fd->fd_lease_och = NULL;

	if (fd->fd_och != NULL) {
		rc = ll_close_inode_openhandle(md_exp, fd->fd_och, inode, 0,

	/* Let's see if we have good enough OPEN lock on the file and if
	   we can skip talking to MDS */
	mutex_lock(&lli->lli_och_mutex);
	if (fd->fd_omode & FMODE_WRITE) {
		LASSERT(lli->lli_open_fd_write_count);
		lli->lli_open_fd_write_count--;
	} else if (fd->fd_omode & FMODE_EXEC) {
		LASSERT(lli->lli_open_fd_exec_count);
		lli->lli_open_fd_exec_count--;
		/* else branch (start elided): plain read open. */
		LASSERT(lli->lli_open_fd_read_count);
		lli->lli_open_fd_read_count--;
	mutex_unlock(&lli->lli_och_mutex);

	/* Only do the real close when no matching OPEN lock is cached. */
	if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
			   LDLM_IBITS, &policy, lockmode, &lockh))
		rc = ll_md_real_close(inode, fd->fd_omode);

	LUSTRE_FPRIVATE(file) = NULL;
	ll_file_data_put(fd);
	ll_capa_close(inode);
/* While this returns an error code, fput() the caller does not, so we need
 * to make every effort to clean up all of our state here. Also, applications
 * rarely check close errors and even if an error is returned they will not
 * re-try the close call.
 */
int ll_file_release(struct inode *inode, struct file *file)
	struct ll_file_data *fd;
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ll_inode_info *lli = ll_i2info(inode);

	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
	       PFID(ll_inode2fid(inode)), inode);

#ifdef CONFIG_FS_POSIX_ACL
	/* Remote-client ACL bookkeeping applies only to the root inode. */
	if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
	    inode == inode->i_sb->s_root->d_inode) {
		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
		if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
			fd->fd_flags &= ~LL_FILE_RMTACL;
			rct_del(&sbi->ll_rct, current_pid());
			et_search_free(&sbi->ll_et, current_pid());

	if (inode->i_sb->s_root != file->f_path.dentry)
		ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
	fd = LUSTRE_FPRIVATE(file);

	/* The last ref on @file, maybe not the owner pid of statahead,
	 * because parent and child process can share the same file handle. */
	if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
		ll_deauthorize_statahead(inode, fd);

	/* Root dentry: nothing was opened on the MDS, just free fd. */
	if (inode->i_sb->s_root == file->f_path.dentry) {
		LUSTRE_FPRIVATE(file) = NULL;
		ll_file_data_put(fd);

	if (!S_ISDIR(inode->i_mode)) {
		/* Surface any async write errors recorded on the clio object. */
		if (lli->lli_clob != NULL)
			lov_read_and_clear_async_rc(lli->lli_clob);
		lli->lli_async_rc = 0;

	rc = ll_md_close(sbi->ll_md_exp, inode, file);

	if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
		libcfs_debug_dumplog();
/*
 * Send an IT_OPEN intent to the MDS for @file, optionally packing layout
 * data (@lmm/@lmmsize) and the entry name when open-by-fid is unavailable.
 */
static int ll_intent_file_open(struct file *file, void *lmm, int lmmsize,
			       struct lookup_intent *itp)
	struct dentry *de = file->f_path.dentry;
	struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
	struct dentry *parent = de->d_parent;
	const char *name = NULL;
	struct md_op_data *op_data;
	struct ptlrpc_request *req = NULL;

	LASSERT(parent != NULL);
	LASSERT(itp->it_flags & MDS_OPEN_BY_FID);

	/* if server supports open-by-fid, or file name is invalid, don't pack
	 * name in open request */
	if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
	    lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
		name = de->d_name.name;
		len = de->d_name.len;

	op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
				     name, len, 0, LUSTRE_OPC_ANY, NULL);
		RETURN(PTR_ERR(op_data));
	op_data->op_data = lmm;
	op_data->op_data_size = lmmsize;

	rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
			    &ll_md_blocking_ast, 0);
	ll_finish_md_op_data(op_data);

	/* reason for keep own exit path - don`t flood log
	 * with messages with -ESTALE errors.
	 */
	if (!it_disposition(itp, DISP_OPEN_OPEN) ||
	     it_open_error(DISP_OPEN_OPEN, itp))
		ll_release_openhandle(de, itp);

	if (it_disposition(itp, DISP_LOOKUP_NEG))
		GOTO(out, rc = -ENOENT);

	if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
		rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
		CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);

	rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
	/* Attach the returned DLM lock to the inode on success. */
	if (!rc && itp->d.lustre.it_lock_mode)
		ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);

	ptlrpc_req_finished(req);
	ll_intent_drop_lock(itp);
/*
 * Populate an obd_client_handle from the MDT reply carried by the intent,
 * then register it for open replay.
 */
static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
		       struct obd_client_handle *och)
	struct ptlrpc_request *req = it->d.lustre.it_data;
	struct mdt_body *body;

	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
	och->och_fh = body->mbo_handle;
	och->och_fid = body->mbo_fid1;
	/* Lease handle doubles as the DLM lock cookie from the intent. */
	och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
	och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
	och->och_flags = it->it_flags;

	return md_set_open_replay_data(md_exp, och, it);
/*
 * Finish a local open: fill @och from the intent (when given) and attach
 * the per-open ll_file_data to @file.
 */
static int ll_local_open(struct file *file, struct lookup_intent *it,
			 struct ll_file_data *fd, struct obd_client_handle *och)
	struct inode *inode = file->f_path.dentry->d_inode;

	LASSERT(!LUSTRE_FPRIVATE(file));

	rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);

	LUSTRE_FPRIVATE(file) = fd;
	ll_readahead_init(inode, &fd->fd_ras);
	/* Remember the effective open mode for the eventual close. */
	fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);

	/* ll_cl_context initialize */
	rwlock_init(&fd->fd_lock);
	INIT_LIST_HEAD(&fd->fd_lccs);
/* Open a file, and (for the very first open) create objects on the OSTs at
 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
 * creation or open until ll_lov_setstripe() ioctl is called.
 *
 * If we already have the stripe MD locally then we don't request it in
 * md_open(), by passing a lmm_size = 0.
 *
 * It is up to the application to ensure no other processes open this file
 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 * used. We might be able to avoid races of that sort by getting lli_open_sem
 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 */
int ll_file_open(struct inode *inode, struct file *file)
	struct ll_inode_info *lli = ll_i2info(inode);
	struct lookup_intent *it, oit = { .it_op = IT_OPEN,
					  .it_flags = file->f_flags };
	struct obd_client_handle **och_p = NULL;
	__u64 *och_usecount = NULL;
	struct ll_file_data *fd;

	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
	       PFID(ll_inode2fid(inode)), inode, file->f_flags);

	it = file->private_data; /* XXX: compat macro */
	file->private_data = NULL; /* prevent ll_local_open assertion */

	fd = ll_file_data_get();
		GOTO(out_openerr, rc = -ENOMEM);

	if (S_ISDIR(inode->i_mode))
		ll_authorize_statahead(inode, fd);

	/* Root dentry needs no MDS open handle. */
	if (inode->i_sb->s_root == file->f_path.dentry) {
		LUSTRE_FPRIVATE(file) = fd;

	if (!it || !it->d.lustre.it_disposition) {
		/* Convert f_flags into access mode. We cannot use file->f_mode,
		 * because everything but O_ACCMODE mask was stripped from
		 * it (rest of original comment elided). */
		if ((oit.it_flags + 1) & O_ACCMODE)
		if (file->f_flags & O_TRUNC)
			oit.it_flags |= FMODE_WRITE;

		/* kernel only call f_op->open in dentry_open. filp_open calls
		 * dentry_open after call to open_namei that checks permissions.
		 * Only nfsd_open call dentry_open directly without checking
		 * permissions and because of that this code below is safe. */
		if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
			oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;

		/* We do not want O_EXCL here, presumably we opened the file
		 * already? XXX - NFS implications? */
		oit.it_flags &= ~O_EXCL;

		/* bug20584, if "it_flags" contains O_CREAT, the file will be
		 * created if necessary, then "IT_CREAT" should be set to keep
		 * consistent with it */
		if (oit.it_flags & O_CREAT)
			oit.it_op |= IT_CREAT;

	/* Let's see if we have file open on MDS already. */
	if (it->it_flags & FMODE_WRITE) {
		och_p = &lli->lli_mds_write_och;
		och_usecount = &lli->lli_open_fd_write_count;
	} else if (it->it_flags & FMODE_EXEC) {
		och_p = &lli->lli_mds_exec_och;
		och_usecount = &lli->lli_open_fd_exec_count;
		/* else branch (start elided): read open. */
		och_p = &lli->lli_mds_read_och;
		och_usecount = &lli->lli_open_fd_read_count;

	mutex_lock(&lli->lli_och_mutex);
	if (*och_p) { /* Open handle is present */
		if (it_disposition(it, DISP_OPEN_OPEN)) {
			/* Well, there's extra open request that we do not need,
			   let's close it somehow. This will decref request. */
			rc = it_open_error(DISP_OPEN_OPEN, it);
				mutex_unlock(&lli->lli_och_mutex);
				GOTO(out_openerr, rc);

			ll_release_openhandle(file->f_path.dentry, it);

		rc = ll_local_open(file, it, fd, NULL);
			mutex_unlock(&lli->lli_och_mutex);
			GOTO(out_openerr, rc);

		/* No cached handle: we must create one. */
		LASSERT(*och_usecount == 0);
		if (!it->d.lustre.it_disposition) {
			/* We cannot just request lock handle now, new ELC code
			   means that one of other OPEN locks for this file
			   could be cancelled, and since blocking ast handler
			   would attempt to grab och_mutex as well, that would
			   result in a deadlock */
			mutex_unlock(&lli->lli_och_mutex);
			/*
			 * Normally called under two situations:
			 * 1. (elided in this excerpt)
			 * 2. A race/condition on MDS resulting in no open
			 *    handle to be returned from LOOKUP|OPEN request,
			 *    for example if the target entry was a symlink.
			 *
			 * Always fetch MDS_OPEN_LOCK if this is not setstripe.
			 *
			 * Always specify MDS_OPEN_BY_FID because we don't want
			 * to get file with different fid.
			 */
			it->it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID;
			rc = ll_intent_file_open(file, NULL, 0, it);
				GOTO(out_openerr, rc);

		OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
			GOTO(out_och_free, rc = -ENOMEM);

		/* md_intent_lock() didn't get a request ref if there was an
		 * open error, so don't do cleanup on the request here
		 * (rest of original comment elided) */
		/* XXX (green): Should not we bail out on any error here, not
		 * just open error? */
		rc = it_open_error(DISP_OPEN_OPEN, it);
			GOTO(out_och_free, rc);

		LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
			 "inode %p: disposition %x, status %d\n", inode,
			 it_disposition(it, ~0), it->d.lustre.it_status);

		rc = ll_local_open(file, it, fd, *och_p);
			GOTO(out_och_free, rc);

	mutex_unlock(&lli->lli_och_mutex);

	/* Must do this outside lli_och_mutex lock to prevent deadlock where
	   different kind of OPEN lock for this same inode gets cancelled
	   by ldlm_cancel_lru */
	if (!S_ISREG(inode->i_mode))
		GOTO(out_och_free, rc);

	cl_lov_delay_create_clear(&file->f_flags);
	GOTO(out_och_free, rc);

	/* Error path: free the handle we allocated (label elided). */
	if (och_p && *och_p) {
		OBD_FREE(*och_p, sizeof (struct obd_client_handle));
		*och_p = NULL; /* OBD_FREE writes some magic there */

	mutex_unlock(&lli->lli_och_mutex);

	if (lli->lli_opendir_key == fd)
		ll_deauthorize_statahead(inode, fd);
	ll_file_data_put(fd);

	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);

	/* Drop the request reference taken for the open intent. */
	if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
		ptlrpc_req_finished(it->d.lustre.it_data);
		it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for lease locks: on a blocking callback, cancel the lease
 * lock asynchronously; the canceling branch is elided in this excerpt.
 */
static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
				    struct ldlm_lock_desc *desc, void *data, int flag)
	struct lustre_handle lockh;

	case LDLM_CB_BLOCKING:
		ldlm_lock2handle(lock, &lockh);
		rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
			CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
	case LDLM_CB_CANCELING:
/*
 * Acquire a lease and open the file.
 */
static struct obd_client_handle *
ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
	struct lookup_intent it = { .it_op = IT_OPEN };
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct md_op_data *op_data;
	struct ptlrpc_request *req = NULL;
	struct lustre_handle old_handle = { 0 };
	struct obd_client_handle *och = NULL;

	/* Leases are only defined for pure read or pure write opens. */
	if (fmode != FMODE_WRITE && fmode != FMODE_READ)
		RETURN(ERR_PTR(-EINVAL));

	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct obd_client_handle **och_p;

	/* Requested mode must be a subset of the file's open mode. */
	if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
		RETURN(ERR_PTR(-EPERM));

	/* Get the openhandle of the file */
	mutex_lock(&lli->lli_och_mutex);
	/* Only one lease per file descriptor. */
	if (fd->fd_lease_och != NULL) {
		mutex_unlock(&lli->lli_och_mutex);

	if (fd->fd_och == NULL) {
		if (file->f_mode & FMODE_WRITE) {
			LASSERT(lli->lli_mds_write_och != NULL);
			och_p = &lli->lli_mds_write_och;
			och_usecount = &lli->lli_open_fd_write_count;
			/* else branch (start elided): read handle. */
			LASSERT(lli->lli_mds_read_och != NULL);
			och_p = &lli->lli_mds_read_och;
			och_usecount = &lli->lli_open_fd_read_count;
		if (*och_usecount == 1) {

	mutex_unlock(&lli->lli_och_mutex);
	if (rc < 0) /* more than 1 opener */

	LASSERT(fd->fd_och != NULL);
	old_handle = fd->fd_och->och_fh;

		RETURN(ERR_PTR(-ENOMEM));

	op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
				     LUSTRE_OPC_ANY, NULL);
		GOTO(out, rc = PTR_ERR(op_data));

	/* To tell the MDT this openhandle is from the same owner */
	op_data->op_handle = old_handle;

	it.it_flags = fmode | open_flags;
	it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
	rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
			    &ll_md_blocking_lease_ast,
	/* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
	 * it can be cancelled which may mislead applications that the lease is
	 * still valid (rest of original comment elided).
	 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
	 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
	 * doesn't deal with openhandle, so normal openhandle will be leaked. */
			    LDLM_FL_NO_LRU | LDLM_FL_EXCL);
	ll_finish_md_op_data(op_data);
	ptlrpc_req_finished(req);
		GOTO(out_release_it, rc);

	if (it_disposition(&it, DISP_LOOKUP_NEG))
		GOTO(out_release_it, rc = -ENOENT);

	rc = it_open_error(DISP_OPEN_OPEN, &it);
		GOTO(out_release_it, rc);

	LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
	ll_och_fill(sbi->ll_md_exp, &it, och);

	if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
		GOTO(out_close, rc = -EOPNOTSUPP);

	/* already get lease, handle lease lock */
	ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
	if (it.d.lustre.it_lock_mode == 0 ||
	    it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
		/* open lock must return for lease */
		CERROR(DFID "lease granted but no open lock, %d/"LPU64".\n",
		       PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
		       it.d.lustre.it_lock_bits);
		GOTO(out_close, rc = -EPROTO);

	ll_intent_release(&it);

	/* Error path (out_close label elided): Cancel open lock */
	if (it.d.lustre.it_lock_mode != 0) {
		ldlm_lock_decref_and_cancel(&och->och_lease_handle,
					    it.d.lustre.it_lock_mode);
		it.d.lustre.it_lock_mode = 0;
		och->och_lease_handle.cookie = 0ULL;
	rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, och, inode, 0, NULL);
		CERROR("%s: error closing file "DFID": %d\n",
		       ll_get_fsname(inode->i_sb, NULL, 0),
		       PFID(&ll_i2info(inode)->lli_fid), rc2);
	och = NULL; /* och has been freed in ll_close_inode_openhandle() */

	ll_intent_release(&it);
/*
 * Check whether a layout swap can be done between two inodes.
 *
 * \param[in] inode1  First inode to check
 * \param[in] inode2  Second inode to check
 *
 * \retval 0 on success, layout swap can be performed between both inodes
 * \retval negative error code if requirements are not met
 */
static int ll_check_swap_layouts_validity(struct inode *inode1,
					  struct inode *inode2)
	/* Both must be regular files. */
	if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))

	/* Caller must be allowed to write both. */
	if (inode_permission(inode1, MAY_WRITE) ||
	    inode_permission(inode2, MAY_WRITE))

	/* Both must live on the same Lustre filesystem. */
	if (inode1->i_sb != inode2->i_sb)
/*
 * Close @och with a MDS_CLOSE_LAYOUT_SWAP bias so the MDT atomically
 * swaps the layouts of @inode and @inode2 as part of the close.
 */
static int ll_swap_layouts_close(struct obd_client_handle *och,
				 struct inode *inode, struct inode *inode2)
	const struct lu_fid *fid1 = ll_inode2fid(inode);
	const struct lu_fid *fid2;

	CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
	       ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));

	rc = ll_check_swap_layouts_validity(inode, inode2);
		GOTO(out_free_och, rc);

	/* We now know that inode2 is a lustre inode */
	fid2 = ll_inode2fid(inode2);

	/* Swapping an inode's layout with itself makes no sense. */
	rc = lu_fid_cmp(fid1, fid2);
		GOTO(out_free_och, rc = -EINVAL);

	/* Close the file and swap layouts between inode & inode2.
	 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
	 * because we still need it to pack l_remote_handle to MDT. */
	rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, och, inode,
				       MDS_CLOSE_LAYOUT_SWAP, inode2);

	och = NULL; /* freed in ll_close_inode_openhandle() */
/*
 * Release lease and close the file.
 * It will check if the lease has ever broken.
 */
static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
	struct ldlm_lock *lock;
	bool cancelled = true;

	lock = ldlm_handle2lock(&och->och_lease_handle);
	/* Determine whether the lease lock was already cancelled (broken). */
	lock_res_and_lock(lock);
	cancelled = ldlm_is_cancel(lock);
	unlock_res_and_lock(lock);

	CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
	       PFID(&ll_i2info(inode)->lli_fid), cancelled);

	ldlm_cli_cancel(&och->och_lease_handle, 0);
	/* Report lease-broken status to the caller if requested. */
	if (lease_broken != NULL)
		*lease_broken = cancelled;

	rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, och, inode,
/*
 * Merge MDS-provided timestamps with OST-side attributes (size, blocks,
 * newer timestamps) into the VFS inode, under the inode size lock.
 */
int ll_merge_attr(const struct lu_env *env, struct inode *inode)
	struct ll_inode_info *lli = ll_i2info(inode);
	struct cl_object *obj = lli->lli_clob;
	struct cl_attr *attr = vvp_env_thread_attr(env);

	ll_inode_size_lock(inode);

	/* merge timestamps the most recently obtained from mds with
	   timestamps obtained from osts */
	LTIME_S(inode->i_atime) = lli->lli_atime;
	LTIME_S(inode->i_mtime) = lli->lli_mtime;
	LTIME_S(inode->i_ctime) = lli->lli_ctime;

	atime = LTIME_S(inode->i_atime);
	mtime = LTIME_S(inode->i_mtime);
	ctime = LTIME_S(inode->i_ctime);

	cl_object_attr_lock(obj);
	rc = cl_object_attr_get(env, obj, attr);
	cl_object_attr_unlock(obj);

		GOTO(out_size_unlock, rc);

	/* OST timestamps win only when newer than the MDS values. */
	if (atime < attr->cat_atime)
		atime = attr->cat_atime;

	if (ctime < attr->cat_ctime)
		ctime = attr->cat_ctime;

	if (mtime < attr->cat_mtime)
		mtime = attr->cat_mtime;

	CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
	       PFID(&lli->lli_fid), attr->cat_size);

	i_size_write(inode, attr->cat_size);
	inode->i_blocks = attr->cat_blocks;

	LTIME_S(inode->i_atime) = atime;
	LTIME_S(inode->i_mtime) = mtime;
	LTIME_S(inode->i_ctime) = ctime;

	ll_inode_size_unlock(inode);
/*
 * Return true if atime updates should be suppressed for @file,
 * mirroring the kernel's file_accessed()/touch_atime() checks.
 */
static bool file_is_noatime(const struct file *file)
	const struct vfsmount *mnt = file->f_path.mnt;
	const struct inode *inode = file->f_path.dentry->d_inode;

	/* Adapted from file_accessed() and touch_atime().*/
	if (file->f_flags & O_NOATIME)

	if (inode->i_flags & S_NOATIME)

	if (IS_NOATIME(inode))

	if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))

	if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))

	if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/*
 * Initialize a cl_io from the file's open flags: non-blocking/append/sync
 * behaviour and the DLM locking policy for the IO.
 */
static void ll_io_init(struct cl_io *io, const struct file *file, int write)
	struct inode *inode = file->f_path.dentry->d_inode;

	io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
	io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
	io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
			      file->f_flags & O_DIRECT ||

	io->ci_obj = ll_i2info(inode)->lli_clob;
	io->ci_lockreq = CILR_MAYBE;
	if (ll_file_nolock(file)) {
		/* nolock mount option: never take DLM locks, do server-side IO. */
		io->ci_lockreq = CILR_NEVER;
		io->ci_no_srvlock = 1;
	} else if (file->f_flags & O_APPEND) {
		/* Appends need a mandatory lock to serialize EOF writes. */
		io->ci_lockreq = CILR_MANDATORY;

	io->ci_noatime = file_is_noatime(file);
/*
 * Common read/write engine: set up a cl_io, take the per-file range lock
 * when needed (writes, and O_DIRECT reads — see LU-6227), run the IO loop,
 * restart on short IO, and account the bytes transferred.
 */
ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
		   struct file *file, enum cl_io_type iot,
		   loff_t *ppos, size_t count)
	struct vvp_io *vio = vvp_env_io(env);
	struct inode *inode = file->f_path.dentry->d_inode;
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct range_lock range;

	CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: "LPU64", count: %zu\n",
	       file->f_path.dentry->d_name.name, iot, *ppos, count);

	io = vvp_env_thread_io(env);
	ll_io_init(io, file, iot == CIT_WRITE);

	if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
		bool range_locked = false;

		/* O_APPEND writes can land anywhere up to EOF, so lock it all. */
		if (file->f_flags & O_APPEND)
			range_lock_init(&range, 0, LUSTRE_EOF);
			range_lock_init(&range, *ppos, *ppos + count - 1);

		vio->vui_fd = LUSTRE_FPRIVATE(file);
		vio->vui_io_subtype = args->via_io_subtype;

		switch (vio->vui_io_subtype) {
			vio->vui_iov = args->u.normal.via_iov;
			vio->vui_nrsegs = args->u.normal.via_nrsegs;
			vio->vui_tot_nrsegs = vio->vui_nrsegs;
			vio->vui_iocb = args->u.normal.via_iocb;
			/* Direct IO reads must also take range lock,
			 * or multiple reads will try to work on the same pages
			 * See LU-6227 for details. */
			if (((iot == CIT_WRITE) ||
			     (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
			    !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
				CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
				rc = range_lock(&lli->lli_write_tree, &range);
				range_locked = true;
			/* Splice subtype: carry the pipe through. */
			vio->u.splice.vui_pipe = args->u.splice.via_pipe;
			vio->u.splice.vui_flags = args->u.splice.via_flags;
			CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);

		ll_cl_add(file, env, io);
		rc = cl_io_loop(env, io);
		ll_cl_remove(file, env);

			CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
			range_unlock(&lli->lli_write_tree, &range);

		/* cl_io_rw_init() handled IO */

	if (io->ci_nob > 0) {
		result += io->ci_nob;
		count -= io->ci_nob;
		*ppos = io->u.ci_wr.wr.crw_pos; /* for splice */

		/* prepare IO restart */
		if (count > 0 && args->via_io_subtype == IO_NORMAL) {
			args->u.normal.via_iov = vio->vui_iov;
			args->u.normal.via_nrsegs = vio->vui_tot_nrsegs;

	cl_io_fini(env, io);

	/* Restart the IO when the layout changed underneath us. */
	if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
		       "%s: restart %s from %lld, count:%zu, result: %zd\n",
		       file->f_path.dentry->d_name.name,
		       iot == CIT_READ ? "read" : "write",
		       *ppos, count, result);

	if (iot == CIT_READ) {
			ll_stats_ops_tally(ll_i2sbi(inode),
					   LPROC_LL_READ_BYTES, result);
	} else if (iot == CIT_WRITE) {
			ll_stats_ops_tally(ll_i2sbi(inode),
					   LPROC_LL_WRITE_BYTES, result);
			fd->fd_write_failed = false;
		} else if (rc != -ERESTARTSYS) {
			/* Remember the failure so fsync can report it later. */
			fd->fd_write_failed = true;

	CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);

	return result > 0 ? result : rc;
/*
 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
 * Validate an iovec array and compute the total byte count; segments past
 * an inaccessible one are truncated (see original kernel helper).
 */
static int ll_file_get_iov_count(const struct iovec *iov,
				 unsigned long *nr_segs, size_t *count)
	for (seg = 0; seg < *nr_segs; seg++) {
		const struct iovec *iv = &iov[seg];

		/*
		 * If any segment has a negative length, or the cumulative
		 * length ever wraps negative then return -EINVAL.
		 */
		if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
		if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
		cnt -= iv->iov_len;	/* This segment is no good */
/*
 * Async-read entry point: validate the iovec, copy it to per-env (or
 * heap-allocated, for large segment counts) storage, and run the generic
 * IO engine with CIT_READ.
 */
static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
				unsigned long nr_segs, loff_t pos)
	struct vvp_io_args *args;
	struct iovec *local_iov;

	result = ll_file_get_iov_count(iov, &nr_segs, &count);

	env = cl_env_get(&refcheck);
		RETURN(PTR_ERR(env));

	/* Small iovecs reuse the per-env scratch slot... */
	local_iov = &ll_env_info(env)->lti_local_iov;
	/* ...larger ones get a temporary allocation. */
	OBD_ALLOC(local_iov, sizeof(*iov) * nr_segs);
	if (local_iov == NULL) {
		cl_env_put(env, &refcheck);

	/* Private copy: the engine may modify the iovec on restart. */
	memcpy(local_iov, iov, sizeof(*iov) * nr_segs);

	args = ll_env_args(env, IO_NORMAL);
	args->u.normal.via_iov = local_iov;
	args->u.normal.via_nrsegs = nr_segs;
	args->u.normal.via_iocb = iocb;

	result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
				    &iocb->ki_pos, count);

	cl_env_put(env, &refcheck);

	OBD_FREE(local_iov, sizeof(*iov) * nr_segs);
/*
 * Synchronous read(2) entry point: wrap the user buffer in a single-segment
 * iovec plus a sync kiocb and delegate to ll_file_aio_read().
 */
static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
	struct iovec iov = { .iov_base = buf, .iov_len = count };
	struct kiocb *kiocb;

	env = cl_env_get(&refcheck);
		RETURN(PTR_ERR(env));

	kiocb = &ll_env_info(env)->lti_kiocb;
	init_sync_kiocb(kiocb, file);
	kiocb->ki_pos = *ppos;
	/* Field name differs across kernel versions. */
#ifdef HAVE_KIOCB_KI_LEFT
	kiocb->ki_left = count;
	kiocb->ki_nbytes = count;

	result = ll_file_aio_read(kiocb, &iov, 1, kiocb->ki_pos);
	*ppos = kiocb->ki_pos;

	cl_env_put(env, &refcheck);
1302 * Write to a file (through the page cache).
/*
 * Async write entry point (aio_write); mirror of ll_file_aio_read():
 * validate the iovec, copy it into per-env storage, and run the generic
 * IO path with CIT_WRITE.
 */
1305 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1306 unsigned long nr_segs, loff_t pos)
1309 struct vvp_io_args *args;
1310 struct iovec *local_iov;
1316 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1320 env = cl_env_get(&refcheck);
1322 RETURN(PTR_ERR(env));
/* single-segment fast path: reuse the env-embedded iovec */
1325 local_iov = &ll_env_info(env)->lti_local_iov;
1328 OBD_ALLOC(local_iov, sizeof(*iov) * nr_segs);
1329 if (local_iov == NULL) {
1330 cl_env_put(env, &refcheck);
1334 memcpy(local_iov, iov, sizeof(*iov) * nr_segs);
1337 args = ll_env_args(env, IO_NORMAL);
1338 args->u.normal.via_iov = local_iov;
1339 args->u.normal.via_nrsegs = nr_segs;
1340 args->u.normal.via_iocb = iocb;
1342 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1343 &iocb->ki_pos, count);
1344 cl_env_put(env, &refcheck);
1347 OBD_FREE(local_iov, sizeof(*iov) * nr_segs);
/*
 * Synchronous write(2) entry point: wraps the user buffer in a
 * one-segment iovec plus a sync kiocb, forwards to ll_file_aio_write(),
 * and writes the updated position back to *ppos.
 */
1352 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1353 size_t count, loff_t *ppos)
1356 struct iovec iov = { .iov_base = (void __user *)buf,
1358 struct kiocb *kiocb;
1363 env = cl_env_get(&refcheck);
1365 RETURN(PTR_ERR(env));
1367 kiocb = &ll_env_info(env)->lti_kiocb;
1368 init_sync_kiocb(kiocb, file);
1369 kiocb->ki_pos = *ppos;
/* kernel-version compat: the "remaining bytes" field was renamed */
1370 #ifdef HAVE_KIOCB_KI_LEFT
1371 kiocb->ki_left = count;
1373 kiocb->ki_nbytes = count;
1376 result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos);
1377 *ppos = kiocb->ki_pos;
1379 cl_env_put(env, &refcheck);
1384 * Send file content (through pagecache) somewhere with helper
/*
 * splice_read entry point: fills the IO_SPLICE argument variant with the
 * destination pipe and flags, then runs the generic read path.
 */
1386 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1387 struct pipe_inode_info *pipe, size_t count,
1391 struct vvp_io_args *args;
1396 env = cl_env_get(&refcheck);
1398 RETURN(PTR_ERR(env));
1400 args = ll_env_args(env, IO_SPLICE);
1401 args->u.splice.via_pipe = pipe;
1402 args->u.splice.via_flags = flags;
1404 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1405 cl_env_put(env, &refcheck);
/*
 * Set the striping EA on a file by (re-)opening it by FID with the
 * supplied lov_user_md, under the inode size lock.  The open handle
 * obtained for the intent is released immediately afterwards; the
 * delay-create flag is cleared on the way out.
 */
1409 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1410 __u64 flags, struct lov_user_md *lum,
1413 struct lookup_intent oit = {
1415 .it_flags = flags | MDS_OPEN_BY_FID,
1420 ll_inode_size_lock(inode);
1421 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1423 GOTO(out_unlock, rc);
/* only needed the intent open to push the EA; drop the handle */
1425 ll_release_openhandle(file->f_path.dentry, &oit);
1428 ll_inode_size_unlock(inode);
1429 ll_intent_release(&oit);
1430 cl_lov_delay_create_clear(&file->f_flags);
/*
 * Fetch the LOV EA (striping information) of @filename from the MDS.
 *
 * Issues md_getattr_name() asking for OBD_MD_FLEASIZE|OBD_MD_FLDIREA,
 * validates the returned lov_mds_md magic (V1/V3 only), and byte-swaps
 * it to host endianness on big-endian clients before handing it back.
 * On success *lmmp/*lmm_size describe the EA inside *request, so the
 * caller owns (and must release) the ptlrpc request.
 */
1435 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1436 struct lov_mds_md **lmmp, int *lmm_size,
1437 struct ptlrpc_request **request)
1439 struct ll_sb_info *sbi = ll_i2sbi(inode);
1440 struct mdt_body *body;
1441 struct lov_mds_md *lmm = NULL;
1442 struct ptlrpc_request *req = NULL;
1443 struct md_op_data *op_data;
1446 rc = ll_get_default_mdsize(sbi, &lmmsize);
1450 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1451 strlen(filename), lmmsize,
1452 LUSTRE_OPC_ANY, NULL);
1453 if (IS_ERR(op_data))
1454 RETURN(PTR_ERR(op_data));
1456 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1457 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1458 ll_finish_md_op_data(op_data);
1460 CDEBUG(D_INFO, "md_getattr_name failed "
1461 "on %s: rc %d\n", filename, rc);
1465 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1466 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1468 lmmsize = body->mbo_eadatasize;
/* no striping EA present (e.g. never striped) -> -ENODATA */
1470 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1472 GOTO(out, rc = -ENODATA);
1475 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1476 LASSERT(lmm != NULL);
1478 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1479 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1480 GOTO(out, rc = -EPROTO);
1484 * This is coming from the MDS, so is probably in
1485 * little endian. We convert it to host endian before
1486 * passing it to userspace.
/* true only on big-endian hosts: wire format is little endian */
1488 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1491 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1492 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1495 /* if function called for directory - we should
1496 * avoid swab not existent lsm objects */
1497 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1498 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1499 if (S_ISREG(body->mbo_mode))
1500 lustre_swab_lov_user_md_objects(
1501 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1503 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1504 lustre_swab_lov_user_md_v3(
1505 (struct lov_user_md_v3 *)lmm);
1506 if (S_ISREG(body->mbo_mode))
1507 lustre_swab_lov_user_md_objects(
1508 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1515 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: root-only path that copies a raw
 * lov_user_md (with one ost_data entry) from userspace and applies it
 * via ll_lov_setstripe_ea_info() with MDS_OPEN_HAS_OBJS set, i.e. the
 * caller supplies pre-existing objects.
 */
1520 static int ll_lov_setea(struct inode *inode, struct file *file,
1523 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1524 struct lov_user_md *lump;
1525 int lum_size = sizeof(struct lov_user_md) +
1526 sizeof(struct lov_user_ost_data);
/* specifying objects directly is an administrative operation */
1530 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1533 OBD_ALLOC_LARGE(lump, lum_size);
1537 if (copy_from_user(lump, (struct lov_user_md __user *)arg, lum_size)) {
1538 OBD_FREE_LARGE(lump, lum_size);
1542 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1544 OBD_FREE_LARGE(lump, lum_size);
/*
 * Copy the file's striping information to the userspace @lum buffer by
 * delegating to cl_object_getstripe() on the inode's cl_object.
 */
1548 static int ll_file_getstripe(struct inode *inode,
1549 struct lov_user_md __user *lum)
1556 env = cl_env_get(&refcheck);
1558 RETURN(PTR_ERR(env));
1560 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum);
1561 cl_env_put(env, &refcheck);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy the user's lov_user_md into a
 * kernel buffer, apply it, then refresh the layout generation and read
 * the effective striping back into the caller's buffer.
 */
1565 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1568 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1569 struct lov_user_md *klum;
1571 __u64 flags = FMODE_WRITE;
1574 rc = ll_copy_user_md(lum, &klum);
1579 rc = ll_lov_setstripe_ea_info(inode, file, flags, klum, lum_size);
/* zero the user's stripe_count so getstripe below fills real values
 * (elided error handling around this line is not visible here) */
1583 put_user(0, &lum->lmm_stripe_count);
1585 ll_layout_refresh(inode, &gen);
1586 rc = ll_file_getstripe(inode, (struct lov_user_md __user *)arg);
1589 OBD_FREE(klum, lum_size);
/*
 * LL_IOC_GROUP_LOCK: take a group lock with group id @arg on behalf of
 * this file descriptor.
 *
 * Rejects gid 0 and no-lock mounts, refuses a second group lock on the
 * same fd, and re-checks under lli_lock after the (potentially blocking)
 * cl_get_grouplock() call in case another thread raced us to it.
 */
1594 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1596 struct ll_inode_info *lli = ll_i2info(inode);
1597 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1598 struct ll_grouplock grouplock;
1603 CWARN("group id for group lock must not be 0\n");
1607 if (ll_file_nolock(file))
1608 RETURN(-EOPNOTSUPP);
1610 spin_lock(&lli->lli_lock);
1611 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1612 CWARN("group lock already existed with gid %lu\n",
1613 fd->fd_grouplock.lg_gid);
1614 spin_unlock(&lli->lli_lock);
1617 LASSERT(fd->fd_grouplock.lg_lock == NULL);
1618 spin_unlock(&lli->lli_lock);
/* may block unless O_NONBLOCK; must not hold lli_lock across it */
1620 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
1621 arg, (file->f_flags & O_NONBLOCK), &grouplock);
1625 spin_lock(&lli->lli_lock);
1626 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1627 spin_unlock(&lli->lli_lock);
1628 CERROR("another thread just won the race\n");
1629 cl_put_grouplock(&grouplock);
1633 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1634 fd->fd_grouplock = grouplock;
1635 spin_unlock(&lli->lli_lock);
1637 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK: release the group lock held by this fd, verifying
 * under lli_lock that a lock is held and that its gid matches @arg.  The
 * fd state is cleared before dropping the spinlock; the actual release
 * (cl_put_grouplock) happens outside the lock.
 */
1641 static int ll_put_grouplock(struct inode *inode, struct file *file,
1644 struct ll_inode_info *lli = ll_i2info(inode);
1645 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1646 struct ll_grouplock grouplock;
1649 spin_lock(&lli->lli_lock);
1650 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1651 spin_unlock(&lli->lli_lock);
1652 CWARN("no group lock held\n");
1656 LASSERT(fd->fd_grouplock.lg_lock != NULL);
1658 if (fd->fd_grouplock.lg_gid != arg) {
1659 CWARN("group lock %lu doesn't match current id %lu\n",
1660 arg, fd->fd_grouplock.lg_gid);
1661 spin_unlock(&lli->lli_lock);
/* detach from the fd first, release after dropping the spinlock */
1665 grouplock = fd->fd_grouplock;
1666 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1667 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1668 spin_unlock(&lli->lli_lock);
1670 cl_put_grouplock(&grouplock);
1671 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1676 * Close inode open handle
1678 * \param dentry [in] dentry which contains the inode
1679 * \param it [in,out] intent which contains open info and result
1682 * \retval <0 failure
/*
 * Releases the MDS open handle that an intent open left behind: no-op
 * for the filesystem root or when the intent carries no open
 * disposition; otherwise fills an obd_client_handle from the intent and
 * closes it, then drops the DISP_ENQ_OPEN_REF request reference.
 */
1684 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1686 struct inode *inode = dentry->d_inode;
1687 struct obd_client_handle *och;
1693 /* Root ? Do nothing. */
1694 if (dentry->d_inode->i_sb->s_root == dentry)
1697 /* No open handle to close? Move away */
1698 if (!it_disposition(it, DISP_OPEN_OPEN))
1701 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1703 OBD_ALLOC(och, sizeof(*och));
1705 GOTO(out, rc = -ENOMEM);
1707 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1709 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1710 och, inode, 0, NULL);
1712 /* this one is in place of ll_file_open */
1713 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1714 ptlrpc_req_finished(it->d.lustre.it_data);
1715 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1721 * Get size for inode for which FIEMAP mapping is requested.
1722 * Make the FIEMAP get_info call and returns the result.
1723 * \param fiemap kernel buffer to hold extens
1724 * \param num_bytes kernel buffer size
/*
 * Rejects unsupported fiemap flags (reporting the supported set back to
 * the caller), honours FIEMAP_FLAG_SYNC by flushing dirty pages,
 * glimpses the size if the cached i_size is 0, and short-circuits with
 * zero extents for an empty file before asking cl_object_fiemap() to do
 * the real mapping.
 */
1726 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
1732 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
1735 /* Checks for fiemap flags */
1736 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* tell userspace which flags we do support, then bail */
1737 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1741 /* Check for FIEMAP_FLAG_SYNC */
1742 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1743 rc = filemap_fdatawrite(inode->i_mapping);
1748 env = cl_env_get(&refcheck);
1750 RETURN(PTR_ERR(env));
1752 if (i_size_read(inode) == 0) {
1753 rc = ll_glimpse_size(inode);
1758 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1759 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
1760 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
1762 /* If filesize is 0, then there would be no objects for mapping */
1763 if (fmkey.lfik_oa.o_size == 0) {
1764 fiemap->fm_mapped_extents = 0;
1768 fmkey.lfik_fiemap = *fiemap;
1770 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
1771 &fmkey, fiemap, &num_bytes);
1773 cl_env_put(env, &refcheck);
/*
 * OBD_IOC_FID2PATH: resolve a FID to a pathname via the MDC.
 *
 * Permission: CAP_DAC_READ_SEARCH or the user_fid2path mount flag.  The
 * user-supplied gf_pathlen is read first to size the output buffer
 * (capped at PATH_MAX), the full getinfo_fid2path header is copied in,
 * the MDC iocontrol fills the path, and the whole structure is copied
 * back out.
 */
1777 int ll_fid2path(struct inode *inode, void __user *arg)
1779 struct obd_export *exp = ll_i2mdexp(inode);
1780 const struct getinfo_fid2path __user *gfin = arg;
1782 struct getinfo_fid2path *gfout;
1788 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1789 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1792 /* Only need to get the buflen */
1793 if (get_user(pathlen, &gfin->gf_pathlen))
1796 if (pathlen > PATH_MAX)
1799 outsize = sizeof(*gfout) + pathlen;
1800 OBD_ALLOC(gfout, outsize);
1804 if (copy_from_user(gfout, arg, sizeof(*gfout)))
1805 GOTO(gf_free, rc = -EFAULT);
1807 /* Call mdc_iocontrol */
1808 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1812 if (copy_to_user(arg, gfout, outsize))
1816 OBD_FREE(gfout, outsize);
1821 * Read the data_version for inode.
1823 * This value is computed using stripe object version on OST.
1824 * Version is computed using server side locking.
1826 * @param flags if do sync on the OST side;
1828 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
1829 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/*
 * Runs a CIT_DATA_VERSION cl_io against the file's cl_object and copies
 * the resulting version into *data_version.  The loop is retried (lines
 * elided here) while io->ci_need_restart is set, e.g. after a layout
 * change.
 */
1831 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
1833 struct cl_object *obj = ll_i2info(inode)->lli_clob;
1841 /* If no file object initialized, we consider its version is 0. */
1847 env = cl_env_get(&refcheck);
1849 RETURN(PTR_ERR(env));
1851 io = vvp_env_thread_io(env);
1853 io->u.ci_data_version.dv_data_version = 0;
1854 io->u.ci_data_version.dv_flags = flags;
1857 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
1858 result = cl_io_loop(env, io);
1860 result = io->ci_result;
1862 *data_version = io->u.ci_data_version.dv_data_version;
1864 cl_io_fini(env, io);
1866 if (unlikely(io->ci_need_restart))
1869 cl_env_put(env, &refcheck);
1875 * Trigger a HSM release request for the provided inode.
/*
 * Takes a write lease, grabs the latest data_version (flushing and
 * dropping cached pages via LL_DV_WR_FLUSH), merges attributes into the
 * inode, then closes the open handle with MDS_HSM_RELEASE so the MDT can
 * punch the objects.  The lease lock handle itself is released by
 * mdc_hsm_release_pack(); the error path closes the lease explicitly.
 */
1877 int ll_hsm_release(struct inode *inode)
1879 struct cl_env_nest nest;
1881 struct obd_client_handle *och = NULL;
1882 __u64 data_version = 0;
1886 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
1887 ll_get_fsname(inode->i_sb, NULL, 0),
1888 PFID(&ll_i2info(inode)->lli_fid));
1890 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
1892 GOTO(out, rc = PTR_ERR(och));
1894 /* Grab latest data_version and [am]time values */
1895 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
1899 env = cl_env_nested_get(&nest);
1901 GOTO(out, rc = PTR_ERR(env));
1903 ll_merge_attr(env, inode);
1904 cl_env_nested_put(&nest, env);
1906 /* Release the file.
1907 * NB: lease lock handle is released in mdc_hsm_release_pack() because
1908 * we still need it to pack l_remote_handle to MDT. */
1909 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, och, inode,
1910 MDS_HSM_RELEASE, &data_version);
1915 if (och != NULL && !IS_ERR(och)) /* close the file */
1916 ll_lease_close(och, inode, NULL);
/*
 * Scratch state for ll_swap_layouts(): the two inodes involved plus
 * their data versions and check flags (fields partly elided in this
 * view).  Kept heap-allocated so the whole set can be swap()'d to
 * canonicalize lock ordering.
 */
1921 struct ll_swap_stack {
1924 struct inode *inode1;
1925 struct inode *inode2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS worker: atomically exchange the layouts of
 * two files through the MDT.
 *
 * Orders the pair by FID to get a stable locking order, optionally takes
 * a group lock on both files (gid != 0) to flush dirty cache, verifies
 * that the data versions have not changed when the caller asked for that
 * check (-EAGAIN otherwise), then issues LL_IOC_LOV_SWAP_LAYOUTS to the
 * MDC with a mdc_swap_layouts piggybacked in op_data.
 */
1930 static int ll_swap_layouts(struct file *file1, struct file *file2,
1931 struct lustre_swap_layouts *lsl)
1933 struct mdc_swap_layouts msl;
1934 struct md_op_data *op_data;
1937 struct ll_swap_stack *llss = NULL;
1940 OBD_ALLOC_PTR(llss);
1944 llss->inode1 = file1->f_path.dentry->d_inode;
1945 llss->inode2 = file2->f_path.dentry->d_inode;
1947 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
1951 /* we use 2 bool because it is easier to swap than 2 bits */
1952 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1953 llss->check_dv1 = true;
1955 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1956 llss->check_dv2 = true;
1958 /* we cannot use lsl->sl_dvX directly because we may swap them */
1959 llss->dv1 = lsl->sl_dv1;
1960 llss->dv2 = lsl->sl_dv2;
1962 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
1963 if (rc == 0) /* same file, done! */
/* canonical ordering by FID avoids lock-order deadlock */
1966 if (rc < 0) { /* sequentialize it */
1967 swap(llss->inode1, llss->inode2);
1969 swap(llss->dv1, llss->dv2);
1970 swap(llss->check_dv1, llss->check_dv2);
1974 if (gid != 0) { /* application asks to flush dirty cache */
1975 rc = ll_get_grouplock(llss->inode1, file1, gid);
1979 rc = ll_get_grouplock(llss->inode2, file2, gid);
1981 ll_put_grouplock(llss->inode1, file1, gid);
1986 /* ultimate check, before swaping the layouts we check if
1987 * dataversion has changed (if requested) */
1988 if (llss->check_dv1) {
1989 rc = ll_data_version(llss->inode1, &dv, 0);
1992 if (dv != llss->dv1)
1993 GOTO(putgl, rc = -EAGAIN);
1996 if (llss->check_dv2) {
1997 rc = ll_data_version(llss->inode2, &dv, 0);
2000 if (dv != llss->dv2)
2001 GOTO(putgl, rc = -EAGAIN);
2004 /* struct md_op_data is used to send the swap args to the mdt
2005 * only flags is missing, so we use struct mdc_swap_layouts
2006 * through the md_op_data->op_data */
2007 /* flags from user space have to be converted before they are send to
2008 * server, no flag is sent today, they are only used on the client */
2011 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2012 0, LUSTRE_OPC_ANY, &msl);
2013 if (IS_ERR(op_data))
2014 GOTO(free, rc = PTR_ERR(op_data));
2016 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2017 sizeof(*op_data), op_data, NULL);
2018 ll_finish_md_op_data(op_data);
/* release group locks in reverse acquisition order */
2025 ll_put_grouplock(llss->inode2, file2, gid);
2026 ll_put_grouplock(llss->inode1, file1, gid);
/*
 * Apply an HSM state change (set/clear flag masks, archive id) on the
 * MDT after validating the request: masks must stay inside
 * HSM_FLAGS_MASK, flags outside HSM_USER_MASK require CAP_SYS_ADMIN,
 * and the archive id must not exceed LL_HSM_MAX_ARCHIVE.
 */
2036 static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2038 struct md_op_data *op_data;
2042 /* Detect out-of range masks */
2043 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2046 /* Non-root users are forbidden to set or clear flags which are
2047 * NOT defined in HSM_USER_MASK. */
2048 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2049 !cfs_capable(CFS_CAP_SYS_ADMIN))
2052 /* Detect out-of range archive id */
2053 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2054 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2057 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2058 LUSTRE_OPC_ANY, hss);
2059 if (IS_ERR(op_data))
2060 RETURN(PTR_ERR(op_data));
2062 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2063 sizeof(*op_data), op_data, NULL);
2065 ll_finish_md_op_data(op_data);
/*
 * HSM import: mark a (regular) file as archived+exists+released, then
 * force its metadata — mode, uid/gid, size, [am]times — to the values
 * recorded in the hsm_user_import descriptor via ll_setattr_raw() under
 * i_mutex.  Used to recreate a file whose data lives only in the
 * archive.
 */
2070 static int ll_hsm_import(struct inode *inode, struct file *file,
2071 struct hsm_user_import *hui)
2073 struct hsm_state_set *hss = NULL;
2074 struct iattr *attr = NULL;
2078 if (!S_ISREG(inode->i_mode))
2084 GOTO(out, rc = -ENOMEM);
/* set HSM state first so the file is "released" before attrs change */
2086 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2087 hss->hss_archive_id = hui->hui_archive_id;
2088 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2089 rc = ll_hsm_state_set(inode, hss);
2093 OBD_ALLOC_PTR(attr);
2095 GOTO(out, rc = -ENOMEM);
2097 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2098 attr->ia_mode |= S_IFREG;
2099 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2100 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2101 attr->ia_size = hui->hui_size;
2102 attr->ia_mtime.tv_sec = hui->hui_mtime;
2103 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2104 attr->ia_atime.tv_sec = hui->hui_atime;
2105 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
/* ATTR_FORCE: apply even though the caller isn't the file owner */
2107 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2108 ATTR_UID | ATTR_GID |
2109 ATTR_MTIME | ATTR_MTIME_SET |
2110 ATTR_ATIME | ATTR_ATIME_SET;
2112 mutex_lock(&inode->i_mutex);
2114 rc = ll_setattr_raw(file->f_path.dentry, attr, true);
2118 mutex_unlock(&inode->i_mutex);
/* Translate an open fmode (FMODE_READ/FMODE_WRITE bits) into the
 * corresponding LL_LEASE_RDLCK/LL_LEASE_WRLCK lease-type bits returned
 * to userspace by the lease ioctls. */
2130 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2132 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2133 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * Main ioctl dispatcher for regular files.
 *
 * Handles llite-private ioctls (fd flags, striping get/set, layout
 * swap, group locks, fid/path translation, data version, HSM state and
 * import, leases) and falls back first to dynamically registered
 * handlers (ll_iocontrol_call) and then to the data export's
 * obd_iocontrol for anything unrecognized.  tty ioctls are rejected
 * early with -ENOTTY (elided return not visible here).
 */
2137 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2139 struct inode *inode = file->f_path.dentry->d_inode;
2140 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2144 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2145 PFID(ll_inode2fid(inode)), inode, cmd);
2146 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2148 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2149 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2153 case LL_IOC_GETFLAGS:
2154 /* Get the current value of the file flags */
2155 return put_user(fd->fd_flags, (int __user *)arg);
2156 case LL_IOC_SETFLAGS:
2157 case LL_IOC_CLRFLAGS:
2158 /* Set or clear specific file flags */
2159 /* XXX This probably needs checks to ensure the flags are
2160 * not abused, and to handle any flag side effects.
2162 if (get_user(flags, (int __user *) arg))
2165 if (cmd == LL_IOC_SETFLAGS) {
/* LL_FILE_IGNORE_LOCK is only safe without the page cache */
2166 if ((flags & LL_FILE_IGNORE_LOCK) &&
2167 !(file->f_flags & O_DIRECT)) {
2168 CERROR("%s: unable to disable locking on "
2169 "non-O_DIRECT file\n", current->comm);
2173 fd->fd_flags |= flags;
2175 fd->fd_flags &= ~flags;
2178 case LL_IOC_LOV_SETSTRIPE:
2179 RETURN(ll_lov_setstripe(inode, file, arg));
2180 case LL_IOC_LOV_SETEA:
2181 RETURN(ll_lov_setea(inode, file, arg));
2182 case LL_IOC_LOV_SWAP_LAYOUTS: {
2184 struct lustre_swap_layouts lsl;
2186 if (copy_from_user(&lsl, (char __user *)arg,
2187 sizeof(struct lustre_swap_layouts)))
/* both files must be writable to have their layouts swapped */
2190 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
2193 file2 = fget(lsl.sl_fd);
2197 /* O_WRONLY or O_RDWR */
2198 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
2199 GOTO(out, rc = -EPERM);
2201 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
2202 struct inode *inode2;
2203 struct ll_inode_info *lli;
2204 struct obd_client_handle *och = NULL;
2206 if (lsl.sl_flags != SWAP_LAYOUTS_CLOSE)
2207 GOTO(out, rc = -EINVAL);
2209 lli = ll_i2info(inode);
/* consume this fd's lease handle for the swap+close path */
2210 mutex_lock(&lli->lli_och_mutex);
2211 if (fd->fd_lease_och != NULL) {
2212 och = fd->fd_lease_och;
2213 fd->fd_lease_och = NULL;
2215 mutex_unlock(&lli->lli_och_mutex);
2217 GOTO(out, rc = -ENOLCK);
2218 inode2 = file2->f_path.dentry->d_inode;
2219 rc = ll_swap_layouts_close(och, inode, inode2);
2221 rc = ll_swap_layouts(file, file2, &lsl);
2227 case LL_IOC_LOV_GETSTRIPE:
2228 RETURN(ll_file_getstripe(inode,
2229 (struct lov_user_md __user *)arg));
2230 case FSFILT_IOC_GETFLAGS:
2231 case FSFILT_IOC_SETFLAGS:
2232 RETURN(ll_iocontrol(inode, file, cmd, arg));
2233 case FSFILT_IOC_GETVERSION_OLD:
2234 case FSFILT_IOC_GETVERSION:
2235 RETURN(put_user(inode->i_generation, (int __user *)arg));
2236 case LL_IOC_GROUP_LOCK:
2237 RETURN(ll_get_grouplock(inode, file, arg));
2238 case LL_IOC_GROUP_UNLOCK:
2239 RETURN(ll_put_grouplock(inode, file, arg));
2240 case IOC_OBD_STATFS:
2241 RETURN(ll_obd_statfs(inode, (void __user *)arg));
2243 /* We need to special case any other ioctls we want to handle,
2244 * to send them to the MDS/OST as appropriate and to properly
2245 * network encode the arg field.
2246 case FSFILT_IOC_SETVERSION_OLD:
2247 case FSFILT_IOC_SETVERSION:
2249 case LL_IOC_FLUSHCTX:
2250 RETURN(ll_flush_ctx(inode));
2251 case LL_IOC_PATH2FID: {
2252 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2253 sizeof(struct lu_fid)))
2258 case LL_IOC_GETPARENT:
2259 RETURN(ll_getparent(file, (struct getparent __user *)arg));
2261 case OBD_IOC_FID2PATH:
2262 RETURN(ll_fid2path(inode, (void __user *)arg));
2263 case LL_IOC_DATA_VERSION: {
2264 struct ioc_data_version idv;
2267 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
/* only the flush flags are meaningful from userspace */
2270 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2271 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2274 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2280 case LL_IOC_GET_MDTIDX: {
2283 mdtidx = ll_get_mdt_idx(inode);
2287 if (put_user((int)mdtidx, (int __user *)arg))
2292 case OBD_IOC_GETDTNAME:
2293 case OBD_IOC_GETMDNAME:
2294 RETURN(ll_get_obd_name(inode, cmd, arg));
2295 case LL_IOC_HSM_STATE_GET: {
2296 struct md_op_data *op_data;
2297 struct hsm_user_state *hus;
2304 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2305 LUSTRE_OPC_ANY, hus);
2306 if (IS_ERR(op_data)) {
2308 RETURN(PTR_ERR(op_data));
2311 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2314 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2317 ll_finish_md_op_data(op_data);
2321 case LL_IOC_HSM_STATE_SET: {
2322 struct hsm_state_set *hss;
2329 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2334 rc = ll_hsm_state_set(inode, hss);
2339 case LL_IOC_HSM_ACTION: {
2340 struct md_op_data *op_data;
2341 struct hsm_current_action *hca;
2348 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2349 LUSTRE_OPC_ANY, hca);
2350 if (IS_ERR(op_data)) {
2352 RETURN(PTR_ERR(op_data));
2355 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2358 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2361 ll_finish_md_op_data(op_data);
2365 case LL_IOC_SET_LEASE: {
2366 struct ll_inode_info *lli = ll_i2info(inode);
2367 struct obd_client_handle *och = NULL;
/* requested lease mode must not exceed the fd's open mode */
2372 case LL_LEASE_WRLCK:
2373 if (!(file->f_mode & FMODE_WRITE))
2375 fmode = FMODE_WRITE;
2377 case LL_LEASE_RDLCK:
2378 if (!(file->f_mode & FMODE_READ))
2382 case LL_LEASE_UNLCK:
2383 mutex_lock(&lli->lli_och_mutex);
2384 if (fd->fd_lease_och != NULL) {
2385 och = fd->fd_lease_och;
2386 fd->fd_lease_och = NULL;
2388 mutex_unlock(&lli->lli_och_mutex);
2393 fmode = och->och_flags;
2394 rc = ll_lease_close(och, inode, &lease_broken);
/* report the type of the lease that was just released */
2401 RETURN(ll_lease_type_from_fmode(fmode));
2406 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2408 /* apply for lease */
2409 och = ll_lease_open(inode, file, fmode, 0);
2411 RETURN(PTR_ERR(och));
2414 mutex_lock(&lli->lli_och_mutex);
2415 if (fd->fd_lease_och == NULL) {
2416 fd->fd_lease_och = och;
2419 mutex_unlock(&lli->lli_och_mutex);
2421 /* impossible now that only excl is supported for now */
2422 ll_lease_close(och, inode, &lease_broken);
2427 case LL_IOC_GET_LEASE: {
2428 struct ll_inode_info *lli = ll_i2info(inode);
2429 struct ldlm_lock *lock = NULL;
2432 mutex_lock(&lli->lli_och_mutex);
2433 if (fd->fd_lease_och != NULL) {
2434 struct obd_client_handle *och = fd->fd_lease_och;
2436 lock = ldlm_handle2lock(&och->och_lease_handle);
2438 lock_res_and_lock(lock);
/* a cancelled lease lock no longer counts as held */
2439 if (!ldlm_is_cancel(lock))
2440 fmode = och->och_flags;
2442 unlock_res_and_lock(lock);
2443 LDLM_LOCK_PUT(lock);
2446 mutex_unlock(&lli->lli_och_mutex);
2448 RETURN(ll_lease_type_from_fmode(fmode));
2450 case LL_IOC_HSM_IMPORT: {
2451 struct hsm_user_import *hui;
2457 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
2462 rc = ll_hsm_import(inode, file, hui);
/* unknown cmd: try dynamically registered handlers, then the OSC */
2472 ll_iocontrol_call(inode, file, cmd, arg, &err))
2475 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2476 (void __user *)arg));
2481 #ifndef HAVE_FILE_LLSEEK_SIZE
/* Local copy of the kernel's llseek finalizer (compiled only when the
 * kernel does not export generic_file_llseek_size): validate the target
 * offset against sign and maxsize, then commit it to f_pos, resetting
 * f_version on a position change. */
2482 static inline loff_t
2483 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2485 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2487 if (offset > maxsize)
2490 if (offset != file->f_pos) {
2491 file->f_pos = offset;
2492 file->f_version = 0;
/*
 * Local copy of the kernel's generic_file_llseek_size() for older
 * kernels: compute the new position for SEEK_SET/CUR/END (and, in lines
 * elided here, SEEK_DATA/SEEK_HOLE against @eof) bounded by @maxsize.
 * SEEK_CUR with offset 0 is handled lock-free as a pure position query.
 */
2498 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2499 loff_t maxsize, loff_t eof)
2501 struct inode *inode = file->f_path.dentry->d_inode;
2509 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2510 * position-querying operation. Avoid rewriting the "same"
2511 * f_pos value back to the file because a concurrent read(),
2512 * write() or lseek() might have altered it
2517 * f_lock protects against read/modify/write race with other
2518 * SEEK_CURs. Note that parallel writes and reads behave
2521 mutex_lock(&inode->i_mutex);
2522 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2523 mutex_unlock(&inode->i_mutex);
2527 * In the generic case the entire file is data, so as long as
2528 * offset isn't at the end of the file then the offset is data.
2535 * There is a virtual hole at the end of the file, so as long as
2536 * offset isn't i_size or larger, return i_size.
2544 return llseek_execute(file, offset, maxsize);
/*
 * llseek entry point: for SEEK_END/SEEK_HOLE/SEEK_DATA a glimpse RPC
 * refreshes the cluster-wide file size first, then the generic
 * size-aware llseek helper computes the final position bounded by the
 * filesystem's maximum byte offset.
 */
2548 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2550 struct inode *inode = file->f_path.dentry->d_inode;
2551 loff_t retval, eof = 0;
/* candidate position, logged below before validation */
2554 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2555 (origin == SEEK_CUR) ? file->f_pos : 0);
2556 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2557 PFID(ll_inode2fid(inode)), inode, retval, retval,
2559 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2561 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
/* local i_size may be stale; ask the OSTs for the real size */
2562 retval = ll_glimpse_size(inode);
2565 eof = i_size_read(inode);
2568 retval = ll_generic_file_llseek_size(file, offset, origin,
2569 ll_file_maxbytes(inode), eof);
/*
 * flush() method (called on close(2) of each fd): collect async
 * writeback errors recorded on the inode and its cl_object, and return
 * -EIO unless the failure was already reported to this fd via a failed
 * write (fd_write_failed).
 */
2573 static int ll_flush(struct file *file, fl_owner_t id)
2575 struct inode *inode = file->f_path.dentry->d_inode;
2576 struct ll_inode_info *lli = ll_i2info(inode);
2577 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2580 LASSERT(!S_ISDIR(inode->i_mode));
2582 /* catch async errors that were recorded back when async writeback
2583 * failed for pages in this mapping. */
2584 rc = lli->lli_async_rc;
/* reading clears the sticky error so it is reported once */
2585 lli->lli_async_rc = 0;
2586 if (lli->lli_clob != NULL) {
2587 err = lov_read_and_clear_async_rc(lli->lli_clob);
2592 /* The application has been told write failure already.
2593 * Do not report failure again. */
2594 if (fd->fd_write_failed)
2596 return rc ? -EIO : 0;
2600 * Called to make sure a portion of file has been written out.
2601 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2603 * Return how many pages have been written.
/*
 * Builds and runs a CIT_FSYNC cl_io over [start, end] with the given
 * fsync mode; on success the fsync io's fi_nr_written page count is
 * returned (result adjustment lines partly elided in this view).
 */
2605 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2606 enum cl_fsync_mode mode, int ignore_layout)
2608 struct cl_env_nest nest;
2611 struct obd_capa *capa = NULL;
2612 struct cl_fsync_io *fio;
2616 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2617 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2620 env = cl_env_nested_get(&nest);
2622 RETURN(PTR_ERR(env));
2624 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2626 io = vvp_env_thread_io(env);
2627 io->ci_obj = ll_i2info(inode)->lli_clob;
2628 io->ci_ignore_layout = ignore_layout;
2630 /* initialize parameters for sync */
2631 fio = &io->u.ci_fsync;
2632 fio->fi_capa = capa;
2633 fio->fi_start = start;
2635 fio->fi_fid = ll_inode2fid(inode);
2636 fio->fi_mode = mode;
2637 fio->fi_nr_written = 0;
2639 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2640 result = cl_io_loop(env, io);
2642 result = io->ci_result;
2644 result = fio->fi_nr_written;
2645 cl_io_fini(env, io);
2646 cl_env_nested_put(&nest, env);
2654 * When dentry is provided (the 'else' case), *file->f_path.dentry may be
2655 * null and dentry must be used directly rather than pulled from
2656 * *file->f_path.dentry as is done otherwise.
/*
 * fsync() method, with three kernel-API variants selected at build time
 * (4-arg range fsync, 2-arg, and the old dentry form).  Waits for
 * in-flight page IO, harvests recorded async writeback errors, syncs
 * metadata via md_fsync() and, for regular files, data via
 * cl_sync_file_range(CL_FSYNC_ALL); fd_write_failed tracks whether the
 * application has been told about a write failure.
 */
2659 #ifdef HAVE_FILE_FSYNC_4ARGS
2660 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2662 struct dentry *dentry = file->f_path.dentry;
2663 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2664 int ll_fsync(struct file *file, int datasync)
2666 struct dentry *dentry = file->f_path.dentry;
2668 loff_t end = LLONG_MAX;
2670 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2673 loff_t end = LLONG_MAX;
2675 struct inode *inode = dentry->d_inode;
2676 struct ll_inode_info *lli = ll_i2info(inode);
2677 struct ptlrpc_request *req;
2678 struct obd_capa *oc;
2682 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2683 PFID(ll_inode2fid(inode)), inode);
2684 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2686 #ifdef HAVE_FILE_FSYNC_4ARGS
2687 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2688 mutex_lock(&inode->i_mutex);
2690 /* fsync's caller has already called _fdata{sync,write}, we want
2691 * that IO to finish before calling the osc and mdc sync methods */
2692 rc = filemap_fdatawait(inode->i_mapping);
2695 /* catch async errors that were recorded back when async writeback
2696 * failed for pages in this mapping. */
2697 if (!S_ISDIR(inode->i_mode)) {
2698 err = lli->lli_async_rc;
2699 lli->lli_async_rc = 0;
2702 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* metadata sync to the MDT */
2707 oc = ll_mdscapa_get(inode);
2708 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2714 ptlrpc_req_finished(req);
2716 if (S_ISREG(inode->i_mode)) {
2717 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2719 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2720 if (rc == 0 && err < 0)
2723 fd->fd_write_failed = true;
2725 fd->fd_write_failed = false;
2728 #ifdef HAVE_FILE_FSYNC_4ARGS
2729 mutex_unlock(&inode->i_mutex);
/*
 * Service flock()/fcntl() advisory lock requests: translate the VFS
 * lock description into LDLM_FLOCK policy data, enqueue it on the MDS,
 * then mirror the result into the kernel's local lock tables.
 * NOTE(review): several original lines (switch labels, returns) are
 * elided in this excerpt; comments below describe only visible code.
 */
2735 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2737 struct inode *inode = file->f_path.dentry->d_inode;
2738 struct ll_sb_info *sbi = ll_i2sbi(inode);
2739 struct ldlm_enqueue_info einfo = {
2740 .ei_type = LDLM_FLOCK,
2741 .ei_cb_cp = ldlm_flock_completion_ast,
2742 .ei_cbdata = file_lock,
2744 struct md_op_data *op_data;
2745 struct lustre_handle lockh = {0};
2746 ldlm_policy_data_t flock = {{0}};
2747 int fl_type = file_lock->fl_type;
2753 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
2754 PFID(ll_inode2fid(inode)), file_lock);
2756 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
/* Build LDLM flock policy from the VFS lock: flock() vs POSIX locks
 * differ in byte range and in what identifies the lock owner. */
2758 if (file_lock->fl_flags & FL_FLOCK) {
2759 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2760 /* flocks are whole-file locks */
2761 flock.l_flock.end = OFFSET_MAX;
2762 /* For flocks owner is determined by the local file desctiptor*/
2763 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2764 } else if (file_lock->fl_flags & FL_POSIX) {
2765 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2766 flock.l_flock.start = file_lock->fl_start;
2767 flock.l_flock.end = file_lock->fl_end;
2771 flock.l_flock.pid = file_lock->fl_pid;
2773 /* Somewhat ugly workaround for svc lockd.
2774 * lockd installs custom fl_lmops->lm_compare_owner that checks
2775 * for the fl_owner to be the same (which it always is on local node
2776 * I guess between lockd processes) and then compares pid.
2777 * As such we assign pid to the owner field to make it all work,
2778 * conflict with normal locks is unlikely since pid space and
2779 * pointer space for current->files are not intersecting */
2780 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2781 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* Map the fcntl lock type to an LDLM mode: PR for read locks, NL to
 * express an unlock, PW for write locks (the switch labels between
 * these assignments are not visible in this excerpt). */
2785 einfo.ei_mode = LCK_PR;
2788 /* An unlock request may or may not have any relation to
2789 * existing locks so we may not be able to pass a lock handle
2790 * via a normal ldlm_lock_cancel() request. The request may even
2791 * unlock a byte range in the middle of an existing lock. In
2792 * order to process an unlock request we need all of the same
2793 * information that is given with a normal read or write record
2794 * lock request. To avoid creating another ldlm unlock (cancel)
2795 * message we'll treat a LCK_NL flock request as an unlock. */
2796 einfo.ei_mode = LCK_NL;
2799 einfo.ei_mode = LCK_PW;
2802 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* Per-command enqueue flags: non-blocking set requests must not wait
 * on a conflict; GETLK-style requests only test for a conflict. */
2817 flags = LDLM_FL_BLOCK_NOWAIT;
2823 flags = LDLM_FL_TEST_LOCK;
2826 CERROR("unknown fcntl lock command: %d\n", cmd);
2830 /* Save the old mode so that if the mode in the lock changes we
2831 * can decrement the appropriate reader or writer refcount. */
2832 file_lock->fl_type = einfo.ei_mode;
2834 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2835 LUSTRE_OPC_ANY, NULL);
2836 if (IS_ERR(op_data))
2837 RETURN(PTR_ERR(op_data));
2839 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags="LPX64", mode=%u, "
2840 "start="LPU64", end="LPU64"\n", PFID(ll_inode2fid(inode)),
2841 flock.l_flock.pid, flags, einfo.ei_mode,
2842 flock.l_flock.start, flock.l_flock.end);
/* Enqueue the flock request with the MDS. */
2844 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data, &lockh,
2847 /* Restore the file lock type if not TEST lock. */
2848 if (!(flags & LDLM_FL_TEST_LOCK))
2849 file_lock->fl_type = fl_type;
/* Mirror a granted lock (or an unlock) into the kernel's local
 * flock/POSIX lock tables so VFS bookkeeping stays coherent. */
2851 if ((file_lock->fl_flags & FL_FLOCK) &&
2852 (rc == 0 || file_lock->fl_type == F_UNLCK))
2853 rc2 = flock_lock_file_wait(file, file_lock);
2854 if ((file_lock->fl_flags & FL_POSIX) &&
2855 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2856 !(flags & LDLM_FL_TEST_LOCK))
2857 rc2 = posix_lock_file_wait(file, file_lock);
/* Local bookkeeping failed while a server lock was granted: release
 * it again by re-enqueuing the same range as LCK_NL (== unlock). */
2859 if (rc2 && file_lock->fl_type != F_UNLCK) {
2860 einfo.ei_mode = LCK_NL;
2861 md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data,
2866 ll_finish_md_op_data(op_data);
/*
 * Resolve @name (length @namelen) under directory @parent to its FID by
 * issuing an MDS getattr-by-name RPC and copying mbo_fid1 out of the
 * reply body into *@fid.  Returns 0 or a negative errno.
 */
2871 int ll_get_fid_by_name(struct inode *parent, const char *name,
2872 int namelen, struct lu_fid *fid)
2874 struct md_op_data *op_data = NULL;
2875 struct mdt_body *body;
2876 struct ptlrpc_request *req;
2880 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
2881 LUSTRE_OPC_ANY, NULL);
2882 if (IS_ERR(op_data))
2883 RETURN(PTR_ERR(op_data));
/* Only the FID is needed from the getattr reply. */
2885 op_data->op_valid = OBD_MD_FLID;
2886 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
2887 ll_finish_md_op_data(op_data);
2891 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
2893 GOTO(out_req, rc = -EFAULT);
2895 *fid = body->mbo_fid1;
2897 ptlrpc_req_finished(req);
/*
 * Migrate the metadata object of @name (a child of @parent) to the MDT
 * with index @mdtidx.  The child FID is taken from the dcache when the
 * dentry/inode is cached, otherwise looked up by name on the MDS; the
 * migration itself is an MDS rename RPC flagged with CLI_MIGRATE.
 * NOTE(review): some lines (declarations of qstr/rc, closing braces,
 * labels) are elided in this excerpt.
 */
2901 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
2902 const char *name, int namelen)
2904 struct dentry *dchild = NULL;
2905 struct inode *child_inode = NULL;
2906 struct md_op_data *op_data;
2907 struct ptlrpc_request *request = NULL;
2912 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
2913 name, PFID(ll_inode2fid(parent)), mdtidx);
2915 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
2916 0, LUSTRE_OPC_ANY, NULL);
2917 if (IS_ERR(op_data))
2918 RETURN(PTR_ERR(op_data));
2920 /* Get child FID first */
2921 qstr.hash = full_name_hash(name, namelen);
/* Probe the dcache: if the child inode is cached, take its FID
 * directly and drop its aliases so stale state is not reused. */
2924 dchild = d_lookup(file->f_path.dentry, &qstr);
2925 if (dchild != NULL) {
2926 if (dchild->d_inode != NULL) {
2927 child_inode = igrab(dchild->d_inode);
2928 if (child_inode != NULL) {
2929 mutex_lock(&child_inode->i_mutex);
2930 op_data->op_fid3 = *ll_inode2fid(child_inode);
2931 ll_invalidate_aliases(child_inode);
/* Not cached: ask the MDS for the FID by name. */
2936 rc = ll_get_fid_by_name(parent, name, namelen,
2942 if (!fid_is_sane(&op_data->op_fid3)) {
2943 CERROR("%s: migrate %s , but fid "DFID" is insane\n",
2944 ll_get_fsname(parent->i_sb, NULL, 0), name,
2945 PFID(&op_data->op_fid3));
2946 GOTO(out_free, rc = -EINVAL);
/* Find where the child currently lives; nothing to do if it is
 * already on the target MDT. */
2949 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
2954 CDEBUG(D_INFO, "%s:"DFID" is already on MDT%d.\n", name,
2955 PFID(&op_data->op_fid3), mdtidx);
2956 GOTO(out_free, rc = 0);
/* Perform the migration: a same-name rename with CLI_MIGRATE and the
 * destination MDT index set in op_mds. */
2959 op_data->op_mds = mdtidx;
2960 op_data->op_cli_flags = CLI_MIGRATE;
2961 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
2962 namelen, name, namelen, &request);
2964 ll_update_times(request, parent);
2966 ptlrpc_req_finished(request);
/* Drop the locally-held child inode: clear its link count (aliases
 * were invalidated above) and release the mutex taken earlier. */
2971 if (child_inode != NULL) {
2972 clear_nlink(child_inode);
2973 mutex_unlock(&child_inode->i_mutex);
2977 ll_finish_md_op_data(op_data);
/*
 * .flock/.lock handler installed for "-o noflock" mounts (see
 * ll_file_operations_noflock below); body not visible in this excerpt.
 */
2982 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2990 * test if some locks matching bits and l_req_mode are acquired
2991 * - bits can be in different locks
2992 * - if found clear the common lock bits in *bits
2993 * - the bits not found, are kept in *bits
2995 * \param bits [IN] searched lock bits [IN]
2996 * \param l_req_mode [IN] searched lock mode
2997 * \retval boolean, true iff all bits are found
2999 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
3001 struct lustre_handle lockh;
3002 ldlm_policy_data_t policy;
/* LCK_MINMODE means "any mode": match against all four modes. */
3003 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
3004 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
3013 fid = &ll_i2info(inode)->lli_fid;
3014 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3015 ldlm_lockname[mode]);
/* Probe each requested inodebit individually; LDLM_FL_TEST_LOCK makes
 * md_lock_match a test-only lookup. */
3017 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
3018 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3019 policy.l_inodebits.bits = *bits & (1 << i);
3020 if (policy.l_inodebits.bits == 0)
3023 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3024 &policy, mode, &lockh)) {
3025 struct ldlm_lock *lock;
3027 lock = ldlm_handle2lock(&lockh);
3030 ~(lock->l_policy_data.l_inodebits.bits);
3031 LDLM_LOCK_PUT(lock);
3033 *bits &= ~policy.l_inodebits.bits;
/*
 * Try to match a granted MDT inodebits lock on @inode covering @bits
 * with one of the modes in @mode.  The matched handle is returned via
 * @lockh; the return value is the matched mode (0 when none matched).
 */
3040 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
3041 struct lustre_handle *lockh, __u64 flags,
3044 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
3049 fid = &ll_i2info(inode)->lli_fid;
3050 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3052 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3053 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the result of a revalidate RPC.  -ENOENT against a
 * non-regular, non-directory inode is treated as a benign unlink race;
 * other errors are logged (EACCES/EIDRM quietly, the rest loudly).
 */
3058 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3060 /* Already unlinked. Just update nlink and return success */
3061 if (rc == -ENOENT) {
3063 /* This path cannot be hit for regular files unless in
3064 * case of obscure races, so no need to to validate
3066 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3068 } else if (rc != 0) {
3069 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3070 "%s: revalidate FID "DFID" error: rc = %d\n",
3071 ll_get_fsname(inode->i_sb, NULL, 0),
3072 PFID(ll_inode2fid(inode)), rc);
/*
 * Refresh inode metadata from the MDS for the lock bits in @ibits.
 * Two paths: when the server supports OBD_CONNECT_ATTRFID, an intent
 * getattr/lookup by FID is used; otherwise, if no matching ibits lock
 * is cached locally, a plain md_getattr RPC fetches the attributes.
 */
3078 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3080 struct inode *inode = dentry->d_inode;
3081 struct ptlrpc_request *req = NULL;
3082 struct obd_export *exp;
3086 LASSERT(inode != NULL);
3088 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3089 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3091 exp = ll_i2mdexp(inode);
3093 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3094 * But under CMD case, it caused some lock issues, should be fixed
3095 * with new CMD ibits lock. See bug 12718 */
3096 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3097 struct lookup_intent oit = { .it_op = IT_GETATTR };
3098 struct md_op_data *op_data;
/* A pure LOOKUP-bit revalidate can use the cheaper IT_LOOKUP intent. */
3100 if (ibits == MDS_INODELOCK_LOOKUP)
3101 oit.it_op = IT_LOOKUP;
3103 /* Call getattr by fid, so do not provide name at all. */
3104 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3105 dentry->d_inode, NULL, 0, 0,
3106 LUSTRE_OPC_ANY, NULL);
3107 if (IS_ERR(op_data))
3108 RETURN(PTR_ERR(op_data));
3110 rc = md_intent_lock(exp, op_data, &oit, &req,
3111 &ll_md_blocking_ast, 0);
3112 ll_finish_md_op_data(op_data);
3114 rc = ll_inode_revalidate_fini(inode, rc);
3118 rc = ll_revalidate_it_finish(req, &oit, dentry);
3120 ll_intent_release(&oit);
3124 /* Unlinked? Unhash dentry, so it is not picked up later by
3125 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3126 here to preserve get_cwd functionality on 2.6.
3128 if (!dentry->d_inode->i_nlink)
3129 d_lustre_invalidate(dentry, 0);
3131 ll_lookup_finish_locks(&oit, dentry);
3132 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3133 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3134 u64 valid = OBD_MD_FLGETATTR;
3135 struct md_op_data *op_data;
/* For regular files also ask for the layout EA, sized to the
 * default MDS reply buffer. */
3138 if (S_ISREG(inode->i_mode)) {
3139 rc = ll_get_default_mdsize(sbi, &ealen);
3142 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3145 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3146 0, ealen, LUSTRE_OPC_ANY,
3148 if (IS_ERR(op_data))
3149 RETURN(PTR_ERR(op_data));
3151 op_data->op_valid = valid;
3152 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
3153 * capa for this inode. Because we only keep capas of dirs
3155 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3156 ll_finish_md_op_data(op_data);
3158 rc = ll_inode_revalidate_fini(inode, rc);
3162 rc = ll_prep_inode(&inode, req, NULL, NULL);
3165 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge the per-stripe metadata attributes
 * (nlink, blocks, size, a/m/ctime) into the master inode and the
 * cached times in ll_inode_info.
 */
3169 static int ll_merge_md_attr(struct inode *inode)
3171 struct cl_attr attr = { 0 };
3174 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3175 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3176 &attr, ll_md_blocking_ast);
3180 set_nlink(inode, attr.cat_nlink);
3181 inode->i_blocks = attr.cat_blocks;
3182 i_size_write(inode, attr.cat_size);
3184 ll_i2info(inode)->lli_atime = attr.cat_atime;
3185 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
3186 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/*
 * Revalidate inode attributes, then bring size/times up to date:
 * striped directories merge per-stripe attributes; regular files
 * glimpse their size from the OSTs unless an HSM restore is running
 * (in which case the MDT-provided size is already authoritative).
 */
3192 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3194 struct inode *inode = dentry->d_inode;
3198 rc = __ll_inode_revalidate(dentry, ibits);
3202 /* if object isn't regular file, don't validate size */
3203 if (!S_ISREG(inode->i_mode)) {
3204 if (S_ISDIR(inode->i_mode) &&
3205 ll_i2info(inode)->lli_lsm_md != NULL) {
3206 rc = ll_merge_md_attr(inode);
/* Propagate the cached MDS times into the VFS inode. */
3211 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
3212 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
3213 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
3215 /* In case of restore, the MDT has the right size and has
3216 * already send it back without granting the layout lock,
3217 * inode is up-to-date so glimpse is useless.
3218 * Also to glimpse we need the layout, in case of a running
3219 * restore the MDT holds the layout lock so the glimpse will
3220 * block up to the end of restore (getattr will block)
3222 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3223 rc = ll_glimpse_size(inode);
/*
 * VFS ->getattr: revalidate UPDATE|LOOKUP bits from the MDS, then fill
 * *stat from the (now current) inode fields.  With 32-bit API clients
 * the inode number is rebuilt from the FID.
 */
3228 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3230 struct inode *inode = de->d_inode;
3231 struct ll_sb_info *sbi = ll_i2sbi(inode);
3232 struct ll_inode_info *lli = ll_i2info(inode);
3235 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3236 MDS_INODELOCK_LOOKUP);
3237 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3242 stat->dev = inode->i_sb->s_dev;
3243 if (ll_need_32bit_api(sbi))
3244 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3246 stat->ino = inode->i_ino;
3247 stat->mode = inode->i_mode;
3248 stat->uid = inode->i_uid;
3249 stat->gid = inode->i_gid;
3250 stat->rdev = inode->i_rdev;
3251 stat->atime = inode->i_atime;
3252 stat->mtime = inode->i_mtime;
3253 stat->ctime = inode->i_ctime;
3254 stat->blksize = 1 << inode->i_blkbits;
3256 stat->nlink = inode->i_nlink;
3257 stat->size = i_size_read(inode);
3258 stat->blocks = inode->i_blocks;
/*
 * VFS ->fiemap: marshal the kernel fiemap_extent_info into a Lustre
 * struct fiemap (including the caller's extent buffer), run
 * ll_do_fiemap(), and copy the mapped extents back to user space.
 */
3263 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3264 __u64 start, __u64 len)
3268 struct fiemap *fiemap;
3269 unsigned int extent_count = fieinfo->fi_extents_max;
/* One allocation for header + extent array. */
3271 num_bytes = sizeof(*fiemap) + (extent_count *
3272 sizeof(struct fiemap_extent));
3273 OBD_ALLOC_LARGE(fiemap, num_bytes);
3278 fiemap->fm_flags = fieinfo->fi_flags;
3279 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3280 fiemap->fm_start = start;
3281 fiemap->fm_length = len;
/* Seed with the first user extent (used for continuation requests). */
3282 if (extent_count > 0 &&
3283 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3284 sizeof(struct fiemap_extent)) != 0)
3285 GOTO(out, rc = -EFAULT);
3287 rc = ll_do_fiemap(inode, fiemap, num_bytes);
/* Report flags/extent count back and copy the mapped extents out. */
3289 fieinfo->fi_flags = fiemap->fm_flags;
3290 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3291 if (extent_count > 0 &&
3292 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3293 fiemap->fm_mapped_extents *
3294 sizeof(struct fiemap_extent)) != 0)
3295 GOTO(out, rc = -EFAULT);
3297 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * Return a referenced copy of the cached POSIX ACL for @inode, taken
 * under lli_lock.  The VFS releases the reference after checking.
 */
3301 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3303 struct ll_inode_info *lli = ll_i2info(inode);
3304 struct posix_acl *acl = NULL;
3307 spin_lock(&lli->lli_lock);
3308 /* VFS' acl_permission_check->check_acl will release the refcount */
3309 acl = posix_acl_dup(lli->lli_posix_acl);
3310 spin_unlock(&lli->lli_lock);
/*
 * Compatibility ACL checker for kernels whose generic_permission()
 * takes a check_acl callback.  Fetches the cached ACL and evaluates it
 * with posix_acl_permission(); under RCU walk (IPERM_FLAG_RCU) it bails
 * out (return lines elided in this excerpt).
 */
3315 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
3317 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3318 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3320 ll_check_acl(struct inode *inode, int mask)
3323 # ifdef CONFIG_FS_POSIX_ACL
3324 struct posix_acl *acl;
3328 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3329 if (flags & IPERM_FLAG_RCU)
3332 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3337 rc = posix_acl_permission(inode, acl, mask);
3338 posix_acl_release(acl);
3341 # else /* !CONFIG_FS_POSIX_ACL */
3343 # endif /* CONFIG_FS_POSIX_ACL */
3345 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * VFS ->permission with Lustre extras: revalidate the root inode on
 * first touch, apply root squash by temporarily overriding the task
 * credentials (fsuid/fsgid squashed, FS capabilities dropped), and run
 * either the remote-permission check or generic_permission()+ACLs.
 * The three prototype variants match different kernel APIs.
 */
3347 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3348 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3350 # ifdef HAVE_INODE_PERMISION_2ARGS
3351 int ll_inode_permission(struct inode *inode, int mask)
3353 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3358 struct ll_sb_info *sbi;
3359 struct root_squash_info *squash;
3360 struct cred *cred = NULL;
3361 const struct cred *old_cred = NULL;
3363 bool squash_id = false;
/* RCU-walk mode cannot block (credential override, RPCs); bail out. */
3366 #ifdef MAY_NOT_BLOCK
3367 if (mask & MAY_NOT_BLOCK)
3369 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3370 if (flags & IPERM_FLAG_RCU)
3374 /* as root inode are NOT getting validated in lookup operation,
3375 * need to do it before permission check. */
3377 if (inode == inode->i_sb->s_root->d_inode) {
3378 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3379 MDS_INODELOCK_LOOKUP);
3384 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3385 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3387 /* squash fsuid/fsgid if needed */
3388 sbi = ll_i2sbi(inode);
3389 squash = &sbi->ll_squash;
3390 if (unlikely(squash->rsi_uid != 0 &&
3391 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
3392 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
3396 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
3397 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
3398 squash->rsi_uid, squash->rsi_gid);
3400 /* update current process's credentials
3401 * and FS capability */
3402 cred = prepare_creds();
3406 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
3407 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* Drop every filesystem-related capability from the squashed creds. */
3408 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
3409 if ((1 << cap) & CFS_CAP_FS_MASK)
3410 cap_lower(cred->cap_effective, cap);
3412 old_cred = override_creds(cred);
3415 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
3417 if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
3418 rc = lustre_check_remote_perm(inode, mask);
3420 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3422 /* restore current process's credentials and FS capability */
3424 revert_creds(old_cred);
/* -o localflock - only provides locally consistent flock locks */
/* Default file method table: no .flock/.lock entries, so flock falls
 * back to the kernel's local-only implementation. */
3432 struct file_operations ll_file_operations = {
3433 .read = ll_file_read,
3434 .aio_read = ll_file_aio_read,
3435 .write = ll_file_write,
3436 .aio_write = ll_file_aio_write,
3437 .unlocked_ioctl = ll_file_ioctl,
3438 .open = ll_file_open,
3439 .release = ll_file_release,
3440 .mmap = ll_file_mmap,
3441 .llseek = ll_file_seek,
3442 .splice_read = ll_file_splice_read,
/* File method table with cluster-coherent locking: .flock and .lock
 * both route through ll_file_flock() (LDLM-backed). */
3447 struct file_operations ll_file_operations_flock = {
3448 .read = ll_file_read,
3449 .aio_read = ll_file_aio_read,
3450 .write = ll_file_write,
3451 .aio_write = ll_file_aio_write,
3452 .unlocked_ioctl = ll_file_ioctl,
3453 .open = ll_file_open,
3454 .release = ll_file_release,
3455 .mmap = ll_file_mmap,
3456 .llseek = ll_file_seek,
3457 .splice_read = ll_file_splice_read,
3460 .flock = ll_file_flock,
3461 .lock = ll_file_flock
3464 /* These are for -o noflock - to return ENOSYS on flock calls */
3465 struct file_operations ll_file_operations_noflock = {
3466 .read = ll_file_read,
3467 .aio_read = ll_file_aio_read,
3468 .write = ll_file_write,
3469 .aio_write = ll_file_aio_write,
3470 .unlocked_ioctl = ll_file_ioctl,
3471 .open = ll_file_open,
3472 .release = ll_file_release,
3473 .mmap = ll_file_mmap,
3474 .llseek = ll_file_seek,
3475 .splice_read = ll_file_splice_read,
3478 .flock = ll_file_noflock,
3479 .lock = ll_file_noflock
/* Inode method table for regular files. */
3482 struct inode_operations ll_file_inode_operations = {
3483 .setattr = ll_setattr,
3484 .getattr = ll_getattr,
3485 .permission = ll_inode_permission,
3486 .setxattr = ll_setxattr,
3487 .getxattr = ll_getxattr,
3488 .listxattr = ll_listxattr,
3489 .removexattr = ll_removexattr,
3490 .fiemap = ll_fiemap,
3491 #ifdef HAVE_IOP_GET_ACL
3492 .get_acl = ll_get_acl,
3496 /* dynamic ioctl number support routins */
/* Global registry of dynamically registered ioctl handlers, protected
 * by an rw_semaphore (readers dispatch, writers register/unregister). */
3497 static struct llioc_ctl_data {
3498 struct rw_semaphore ioc_sem;
3499 struct list_head ioc_head;
3501 __RWSEM_INITIALIZER(llioc.ioc_sem),
3502 LIST_HEAD_INIT(llioc.ioc_head)
/* One registered handler: callback plus the ioctl cmds it serves
 * (flexible-style trailing cmd array). */
3507 struct list_head iocd_list;
3508 unsigned int iocd_size;
3509 llioc_callback_t iocd_cb;
3510 unsigned int iocd_count;
3511 unsigned int iocd_cmd[0];
/*
 * Register @cb to handle the @count ioctl commands in @cmd.  Returns an
 * opaque cookie (the allocated record) for ll_iocontrol_unregister(),
 * or NULL on bad arguments / allocation failure (returns elided here).
 */
3514 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3517 struct llioc_data *in_data = NULL;
3520 if (cb == NULL || cmd == NULL ||
3521 count > LLIOC_MAX_CMD || count < 0)
3524 size = sizeof(*in_data) + count * sizeof(unsigned int);
3525 OBD_ALLOC(in_data, size);
3526 if (in_data == NULL)
3529 memset(in_data, 0, sizeof(*in_data));
3530 in_data->iocd_size = size;
3531 in_data->iocd_cb = cb;
3532 in_data->iocd_count = count;
3533 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
/* Publish under the write lock. */
3535 down_write(&llioc.ioc_sem);
3536 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3537 up_write(&llioc.ioc_sem);
/*
 * Remove and free the handler previously returned by
 * ll_iocontrol_register(); warns if @magic is not found in the list.
 */
3542 void ll_iocontrol_unregister(void *magic)
3544 struct llioc_data *tmp;
3549 down_write(&llioc.ioc_sem);
3550 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3552 unsigned int size = tmp->iocd_size;
/* Found: unlink, drop the lock, then free. */
3554 list_del(&tmp->iocd_list);
3555 up_write(&llioc.ioc_sem);
3557 OBD_FREE(tmp, size);
3561 up_write(&llioc.ioc_sem);
3563 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3566 EXPORT_SYMBOL(ll_iocontrol_register);
3567 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch ioctl @cmd through the registered dynamic handlers under the
 * read lock.  The first callback returning LLIOC_STOP ends iteration;
 * its result code is left in *rcp (default -EINVAL when unhandled).
 */
3569 static enum llioc_iter
3570 ll_iocontrol_call(struct inode *inode, struct file *file,
3571 unsigned int cmd, unsigned long arg, int *rcp)
3573 enum llioc_iter ret = LLIOC_CONT;
3574 struct llioc_data *data;
3575 int rc = -EINVAL, i;
3577 down_read(&llioc.ioc_sem);
3578 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3579 for (i = 0; i < data->iocd_count; i++) {
3580 if (cmd != data->iocd_cmd[i])
3583 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3587 if (ret == LLIOC_STOP)
3590 up_read(&llioc.ioc_sem);
/*
 * Push a layout configuration into the cl_object.  For OBJECT_CONF_SET
 * the new layout must be applied before the layout lock is made
 * matchable, otherwise other threads could see a stale layout; the
 * cached layout generation is then refreshed from the object.
 */
3597 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3599 struct ll_inode_info *lli = ll_i2info(inode);
3600 struct cl_object *obj = lli->lli_clob;
3601 struct cl_env_nest nest;
3609 env = cl_env_nested_get(&nest);
3611 RETURN(PTR_ERR(env));
3613 rc = cl_conf_set(env, lli->lli_clob, conf);
3617 if (conf->coc_opc == OBJECT_CONF_SET) {
3618 struct ldlm_lock *lock = conf->coc_lock;
3619 struct cl_layout cl = {
3623 LASSERT(lock != NULL);
3624 LASSERT(ldlm_has_layout(lock));
3626 /* it can only be allowed to match after layout is
3627 * applied to inode otherwise false layout would be
3628 * seen. Applying layout shoud happen before dropping
3629 * the intent lock. */
3630 ldlm_lock_allow_match(lock);
3632 rc = cl_object_layout_get(env, obj, &cl);
3637 DFID": layout version change: %u -> %u\n",
3638 PFID(&lli->lli_fid), ll_layout_version_get(lli),
3640 ll_layout_version_set(lli, cl.cl_layout_gen);
3644 cl_env_nested_put(&nest, env);
3649 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
/*
 * Used when the layout lock was granted via completion AST: the LVB
 * buffer was too small to carry the layout, so re-fetch the LOV EA with
 * a getxattr RPC and install it as the lock's l_lvb_data.
 */
3650 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3653 struct ll_sb_info *sbi = ll_i2sbi(inode);
3654 struct obd_capa *oc;
3655 struct ptlrpc_request *req;
3656 struct mdt_body *body;
3663 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3664 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
3665 lock->l_lvb_data, lock->l_lvb_len);
/* Layout already present and valid: nothing to fetch. */
3667 if ((lock->l_lvb_data != NULL) && ldlm_is_lvb_ready(lock))
3670 /* if layout lock was granted right away, the layout is returned
3671 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3672 * blocked and then granted via completion ast, we have to fetch
3673 * layout here. Please note that we can't use the LVB buffer in
3674 * completion AST because it doesn't have a large enough buffer */
3675 oc = ll_mdscapa_get(inode);
3676 rc = ll_get_default_mdsize(sbi, &lmmsize);
3678 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3679 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3685 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3687 GOTO(out, rc = -EPROTO);
3689 lmmsize = body->mbo_eadatasize;
3690 if (lmmsize == 0) /* empty layout */
3693 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3695 GOTO(out, rc = -EFAULT);
/* Copy the EA into a fresh buffer and swap it in as the lock's LVB
 * under the resource lock, freeing any previous buffer. */
3697 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3698 if (lvbdata == NULL)
3699 GOTO(out, rc = -ENOMEM);
3701 memcpy(lvbdata, lmm, lmmsize);
3702 lock_res_and_lock(lock);
3703 if (lock->l_lvb_data != NULL)
3704 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3706 lock->l_lvb_data = lvbdata;
3707 lock->l_lvb_len = lmmsize;
3708 unlock_res_and_lock(lock);
3713 ptlrpc_req_finished(req);
3718 * Apply the layout to the inode. Layout lock is held and will be released
/*
 * Steps: re-bind the (possibly cached) lock to this inode, fetch the
 * layout into the lock LVB if needed, unpack it, configure the
 * cl_object via ll_layout_conf(), then drop the lock reference; if the
 * object is busy (-EBUSY) wait for in-flight IO via OBJECT_CONF_WAIT.
 */
3721 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3722 struct inode *inode)
3724 struct ll_inode_info *lli = ll_i2info(inode);
3725 struct ll_sb_info *sbi = ll_i2sbi(inode);
3726 struct ldlm_lock *lock;
3727 struct lustre_md md = { NULL };
3728 struct cl_object_conf conf;
3731 bool wait_layout = false;
3734 LASSERT(lustre_handle_is_used(lockh));
3736 lock = ldlm_handle2lock(lockh);
3737 LASSERT(lock != NULL);
3738 LASSERT(ldlm_has_layout(lock));
3740 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
3741 PFID(&lli->lli_fid), inode);
3743 /* in case this is a caching lock and reinstate with new inode */
3744 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3746 lock_res_and_lock(lock);
3747 lvb_ready = ldlm_is_lvb_ready(lock);
3748 unlock_res_and_lock(lock);
3749 /* checking lvb_ready is racy but this is okay. The worst case is
3750 * that multi processes may configure the file on the same time. */
3755 rc = ll_layout_fetch(inode, lock);
3759 /* for layout lock, lmm is returned in lock's lvb.
3760 * lvb_data is immutable if the lock is held so it's safe to access it
3761 * without res lock. See the description in ldlm_lock_decref_internal()
3762 * for the condition to free lvb_data of layout lock */
3763 if (lock->l_lvb_data != NULL) {
3764 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3765 lock->l_lvb_data, lock->l_lvb_len);
3767 CERROR("%s: file "DFID" unpackmd error: %d\n",
3768 ll_get_fsname(inode->i_sb, NULL, 0),
3769 PFID(&lli->lli_fid), rc);
3773 LASSERTF(md.lsm != NULL, "lvb_data = %p, lvb_len = %u\n",
3774 lock->l_lvb_data, lock->l_lvb_len);
3779 /* set layout to file. Unlikely this will fail as old layout was
3780 * surely eliminated */
3781 memset(&conf, 0, sizeof conf);
3782 conf.coc_opc = OBJECT_CONF_SET;
3783 conf.coc_inode = inode;
3784 conf.coc_lock = lock;
3785 conf.u.coc_md = &md;
3786 rc = ll_layout_conf(inode, &conf);
3789 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3791 /* refresh layout failed, need to wait */
3792 wait_layout = rc == -EBUSY;
3796 LDLM_LOCK_PUT(lock);
3797 ldlm_lock_decref(lockh, mode);
3799 /* wait for IO to complete if it's still being used. */
3801 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
3802 ll_get_fsname(inode->i_sb, NULL, 0),
3803 PFID(&lli->lli_fid), inode);
3805 memset(&conf, 0, sizeof conf);
3806 conf.coc_opc = OBJECT_CONF_WAIT;
3807 conf.coc_inode = inode;
3808 rc = ll_layout_conf(inode, &conf);
3812 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
3813 ll_get_fsname(inode->i_sb, NULL, 0),
3814 PFID(&lli->lli_fid), rc);
/*
 * Refresh the file layout while holding lli_layout_mutex: first try to
 * match a cached layout lock; if none, enqueue an IT_LAYOUT intent on
 * the MDS and apply the returned lock via ll_layout_lock_set().
 */
3819 static int ll_layout_refresh_locked(struct inode *inode)
3821 struct ll_inode_info *lli = ll_i2info(inode);
3822 struct ll_sb_info *sbi = ll_i2sbi(inode);
3823 struct md_op_data *op_data;
3824 struct lookup_intent it;
3825 struct lustre_handle lockh;
3827 struct ldlm_enqueue_info einfo = {
3828 .ei_type = LDLM_IBITS,
3830 .ei_cb_bl = &ll_md_blocking_ast,
3831 .ei_cb_cp = &ldlm_completion_ast,
3837 /* mostly layout lock is caching on the local side, so try to match
3838 * it before grabbing layout lock mutex. */
3839 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3840 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3841 if (mode != 0) { /* hit cached lock */
3842 rc = ll_layout_lock_set(&lockh, mode, inode);
3849 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3850 0, 0, LUSTRE_OPC_ANY, NULL);
3851 if (IS_ERR(op_data))
3852 RETURN(PTR_ERR(op_data));
3854 /* have to enqueue one */
3855 memset(&it, 0, sizeof(it));
3856 it.it_op = IT_LAYOUT;
3857 lockh.cookie = 0ULL;
3859 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
3860 ll_get_fsname(inode->i_sb, NULL, 0),
3861 PFID(&lli->lli_fid), inode);
3863 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, &it, op_data, &lockh, 0);
/* The reply request is not needed once the lock handle is saved. */
3864 if (it.d.lustre.it_data != NULL)
3865 ptlrpc_req_finished(it.d.lustre.it_data);
3866 it.d.lustre.it_data = NULL;
3868 ll_finish_md_op_data(op_data);
/* Take over the intent's lock mode, then drop the intent's own
 * reference before applying the layout. */
3870 mode = it.d.lustre.it_lock_mode;
3871 it.d.lustre.it_lock_mode = 0;
3872 ll_intent_drop_lock(&it);
3875 /* set lock data in case this is a new lock */
3876 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3877 rc = ll_layout_lock_set(&lockh, mode, inode);
3886 * This function checks if there exists a LAYOUT lock on the client side,
3887 * or enqueues it if it doesn't have one in cache.
3889 * This function will not hold layout lock so it may be revoked any time after
3890 * this function returns. Any operations depend on layout should be redone
3893 * This function should be called before lov_io_init() to get an uptodate
3894 * layout version, the caller should save the version number and after IO
3895 * is finished, this function should be called again to verify that layout
3896 * is not changed during IO time.
3898 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3900 struct ll_inode_info *lli = ll_i2info(inode);
3901 struct ll_sb_info *sbi = ll_i2sbi(inode);
/* Fast path: layout generation already known (or layout locks are
 * disabled for this mount) - return the cached generation. */
3905 *gen = ll_layout_version_get(lli);
3906 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
3910 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3911 LASSERT(S_ISREG(inode->i_mode));
3913 /* take layout lock mutex to enqueue layout lock exclusively. */
3914 mutex_lock(&lli->lli_layout_mutex);
3916 rc = ll_layout_refresh_locked(inode);
3920 *gen = ll_layout_version_get(lli);
3922 mutex_unlock(&lli->lli_layout_mutex);
3928 * This function send a restore request to the MDT
3930 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
3932 struct hsm_user_request *hur;
3936 len = sizeof(struct hsm_user_request) +
3937 sizeof(struct hsm_user_item);
3938 OBD_ALLOC(hur, len);
3942 hur->hur_request.hr_action = HUA_RESTORE;
3943 hur->hur_request.hr_archive_id = 0;
3944 hur->hur_request.hr_flags = 0;
3945 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
3946 sizeof(hur->hur_user_item[0].hui_fid));
3947 hur->hur_user_item[0].hui_extent.offset = offset;
3948 hur->hur_user_item[0].hui_extent.length = length;
3949 hur->hur_request.hr_itemcount = 1;
3950 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,