4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2014, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <linux/pagemap.h>
46 #include <linux/file.h>
47 #include <linux/sched.h>
48 #include <linux/user_namespace.h>
49 #ifdef HAVE_UIDGID_HEADER
50 # include <linux/uidgid.h>
52 #include <lustre/ll_fiemap.h>
53 #include <lustre_ioctl.h>
55 #include "cl_object.h"
57 #include "llite_internal.h"
58 #include "vvp_internal.h"
/*
 * Forward declarations for helpers defined later in this file.
 * NOTE(review): the ll_put_grouplock() return type and part of the
 * ll_lease_close() prototype fall outside this extract.
 */
61 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
63 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
66 static enum llioc_iter
67 ll_iocontrol_call(struct inode *inode, struct file *file,
68 unsigned int cmd, unsigned long arg, int *rcp);
/*
 * Allocate a per-open ll_file_data from its dedicated slab cache.
 * GFP_NOFS is used to avoid re-entering the filesystem under memory
 * pressure. NOTE(review): the NULL-check and RETURN path of this
 * function are not visible in this extract.
 */
70 static struct ll_file_data *ll_file_data_get(void)
72 struct ll_file_data *fd;
74 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
78 fd->fd_write_failed = false;
/* Release an ll_file_data back to its slab cache (counterpart of
 * ll_file_data_get()). */
83 static void ll_file_data_put(struct ll_file_data *fd)
86 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Copy the inode's current attributes (mode, times, size, blocks, flags),
 * the open file handle @fh, and the MDS capability into @op_data so they
 * can be sent to the MDT. Also propagates the DATA_MODIFIED hint so the
 * server knows client-side data changed since open.
 */
89 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
90 struct lustre_handle *fh)
92 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
93 op_data->op_attr.ia_mode = inode->i_mode;
94 op_data->op_attr.ia_atime = inode->i_atime;
95 op_data->op_attr.ia_mtime = inode->i_mtime;
96 op_data->op_attr.ia_ctime = inode->i_ctime;
97 op_data->op_attr.ia_size = i_size_read(inode);
98 op_data->op_attr_blocks = inode->i_blocks;
99 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
101 op_data->op_handle = *fh;
102 op_data->op_capa1 = ll_mdscapa_get(inode);
/* Tell the MDT that file data was modified under this open handle. */
104 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
105 op_data->op_bias |= MDS_DATA_MODIFIED;
/*
 * Packs all the attributes into @op_data for the CLOSE rpc.
 * Size/blocks are only marked valid when the handle was opened for write
 * (the condition at line 120 appears to guard the ATTR_SIZE|ATTR_BLOCKS
 * addition — the intervening lines are not visible in this extract).
 */
109 * Packs all the attributes into @op_data for the CLOSE rpc.
111 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
112 struct obd_client_handle *och)
116 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
117 ATTR_MTIME | ATTR_MTIME_SET |
118 ATTR_CTIME | ATTR_CTIME_SET;
120 if (!(och->och_flags & FMODE_WRITE))
123 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
126 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
127 ll_prep_md_op_data(op_data, inode, NULL, NULL,
128 0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Close an MDS open handle, optionally with a bias (HSM release or
 * layout swap). On success with a DATA_MODIFIED bias the local flag is
 * cleared; for biased closes the server reply is checked for
 * OBD_MD_CLOSE_INTENT_EXECED. NOTE(review): parts of the error paths
 * and the switch scaffolding are not visible in this extract.
 */
133 * Perform a close, possibly with a bias.
134 * The meaning of "data" depends on the value of "bias".
136 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
137 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
140 static int ll_close_inode_openhandle(struct obd_export *md_exp,
141 struct obd_client_handle *och,
143 enum mds_op_bias bias,
146 struct obd_export *exp = ll_i2mdexp(inode);
147 struct md_op_data *op_data;
148 struct ptlrpc_request *req = NULL;
149 struct obd_device *obd = class_exp2obd(exp);
155 * XXX: in case of LMV, is this correct to access
158 CERROR("Invalid MDC connection handle "LPX64"\n",
159 ll_i2mdexp(inode)->exp_handle.h_cookie);
163 OBD_ALLOC_PTR(op_data);
165 /* XXX We leak openhandle and request here. */
166 GOTO(out, rc = -ENOMEM);
168 ll_prepare_close(inode, op_data, och);
/* Layout swap: fid2 identifies the peer inode; no data version needed. */
170 case MDS_CLOSE_LAYOUT_SWAP:
171 LASSERT(data != NULL);
172 op_data->op_bias |= MDS_CLOSE_LAYOUT_SWAP;
173 op_data->op_data_version = 0;
174 op_data->op_lease_handle = och->och_lease_handle;
175 op_data->op_fid2 = *ll_inode2fid(data);
/* HSM release: data carries the data version to be verified by the MDT. */
178 case MDS_HSM_RELEASE:
179 LASSERT(data != NULL);
180 op_data->op_bias |= MDS_HSM_RELEASE;
181 op_data->op_data_version = *(__u64 *)data;
182 op_data->op_lease_handle = och->och_lease_handle;
183 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
187 LASSERT(data == NULL);
191 rc = md_close(md_exp, op_data, och->och_mod, &req);
193 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
194 ll_i2mdexp(inode)->exp_obd->obd_name,
195 PFID(ll_inode2fid(inode)), rc);
198 /* DATA_MODIFIED flag was successfully sent on close, cancel data
199 * modification flag. */
200 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
201 struct ll_inode_info *lli = ll_i2info(inode);
203 spin_lock(&lli->lli_lock);
204 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
205 spin_unlock(&lli->lli_lock);
/* For biased closes, verify the server actually executed the intent. */
209 op_data->op_bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)) {
210 struct mdt_body *body;
212 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
213 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
217 ll_finish_md_op_data(op_data);
221 md_clear_open_replay_data(md_exp, och);
/* Poison the cookie so stale uses of this handle are detectable. */
222 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
225 if (req) /* This is close request */
226 ptlrpc_req_finished(req);
/*
 * Drop the cached MDS open handle for the given open mode (write, exec,
 * or read), but only when no other local users of that handle remain.
 * The handle pointer and its use count are selected per-mode under
 * lli_och_mutex.
 */
230 int ll_md_real_close(struct inode *inode, fmode_t fmode)
232 struct ll_inode_info *lli = ll_i2info(inode);
233 struct obd_client_handle **och_p;
234 struct obd_client_handle *och;
239 if (fmode & FMODE_WRITE) {
240 och_p = &lli->lli_mds_write_och;
241 och_usecount = &lli->lli_open_fd_write_count;
242 } else if (fmode & FMODE_EXEC) {
243 och_p = &lli->lli_mds_exec_och;
244 och_usecount = &lli->lli_open_fd_exec_count;
246 LASSERT(fmode & FMODE_READ);
247 och_p = &lli->lli_mds_read_och;
248 och_usecount = &lli->lli_open_fd_read_count;
251 mutex_lock(&lli->lli_och_mutex);
252 if (*och_usecount > 0) {
253 /* There are still users of this handle, so skip
255 mutex_unlock(&lli->lli_och_mutex);
261 mutex_unlock(&lli->lli_och_mutex);
264 /* There might be a race and this handle may already
266 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
267 och, inode, 0, NULL);
/*
 * Per-file-descriptor close: releases group lock and lease if held,
 * closes any private open handle, decrements the per-mode open count,
 * and — if no cached OPEN DLM lock matches — performs the real MDS
 * close. Finally detaches and frees the ll_file_data.
 */
273 static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
276 ldlm_policy_data_t policy = {
277 .l_inodebits = { MDS_INODELOCK_OPEN },
279 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
280 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
281 struct ll_inode_info *lli = ll_i2info(inode);
282 struct lustre_handle lockh;
287 /* clear group lock, if present */
288 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
289 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
291 if (fd->fd_lease_och != NULL) {
294 /* Usually the lease is not released when the
295 * application crashed, we need to release here. */
296 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
297 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
298 PFID(&lli->lli_fid), rc, lease_broken);
300 fd->fd_lease_och = NULL;
303 if (fd->fd_och != NULL) {
304 rc = ll_close_inode_openhandle(md_exp, fd->fd_och, inode, 0,
310 /* Let's see if we have good enough OPEN lock on the file and if
311 we can skip talking to MDS */
312 mutex_lock(&lli->lli_och_mutex);
313 if (fd->fd_omode & FMODE_WRITE) {
315 LASSERT(lli->lli_open_fd_write_count);
316 lli->lli_open_fd_write_count--;
317 } else if (fd->fd_omode & FMODE_EXEC) {
319 LASSERT(lli->lli_open_fd_exec_count);
320 lli->lli_open_fd_exec_count--;
323 LASSERT(lli->lli_open_fd_read_count);
324 lli->lli_open_fd_read_count--;
326 mutex_unlock(&lli->lli_och_mutex);
/* Only talk to the MDS if no cached OPEN ibits lock covers us. */
328 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
329 LDLM_IBITS, &policy, lockmode, &lockh))
330 rc = ll_md_real_close(inode, fd->fd_omode);
333 LUSTRE_FPRIVATE(file) = NULL;
334 ll_file_data_put(fd);
335 ll_capa_close(inode);
/* While this returns an error code, fput() the caller does not, so we need
 * to make every effort to clean up all of our state here. Also, applications
 * rarely check close errors and even if an error is returned they will not
 * re-try the close call.
 *
 * VFS ->release() hook: tears down remote-ACL state for the root inode,
 * deauthorizes statahead, clears async write errors on regular files,
 * and delegates the real close to ll_md_close(). The root dentry is
 * special-cased: only the ll_file_data is freed, no MDS close is sent.
 */
340 /* While this returns an error code, fput() the caller does not, so we need
341 * to make every effort to clean up all of our state here. Also, applications
342 * rarely check close errors and even if an error is returned they will not
343 * re-try the close call.
345 int ll_file_release(struct inode *inode, struct file *file)
347 struct ll_file_data *fd;
348 struct ll_sb_info *sbi = ll_i2sbi(inode);
349 struct ll_inode_info *lli = ll_i2info(inode);
353 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
354 PFID(ll_inode2fid(inode)), inode);
356 #ifdef CONFIG_FS_POSIX_ACL
357 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
358 inode == inode->i_sb->s_root->d_inode) {
359 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
362 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
363 fd->fd_flags &= ~LL_FILE_RMTACL;
364 rct_del(&sbi->ll_rct, current_pid());
365 et_search_free(&sbi->ll_et, current_pid());
370 if (inode->i_sb->s_root != file->f_path.dentry)
371 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
372 fd = LUSTRE_FPRIVATE(file);
375 /* The last ref on @file, maybe not the the owner pid of statahead,
376 * because parent and child process can share the same file handle. */
377 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
378 ll_deauthorize_statahead(inode, fd);
/* Root dentry: no MDS close is needed, just drop the fd data. */
380 if (inode->i_sb->s_root == file->f_path.dentry) {
381 LUSTRE_FPRIVATE(file) = NULL;
382 ll_file_data_put(fd);
386 if (!S_ISDIR(inode->i_mode)) {
387 if (lli->lli_clob != NULL)
388 lov_read_and_clear_async_rc(lli->lli_clob);
389 lli->lli_async_rc = 0;
392 rc = ll_md_close(sbi->ll_md_exp, inode, file);
394 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
395 libcfs_debug_dumplog();
/*
 * Send an IT_OPEN intent to the MDS for @file. The dentry name is only
 * packed when the server lacks OBD_CONNECT_OPEN_BY_FID support (open is
 * otherwise done purely by FID). On success the reply is used to
 * instantiate/refresh the inode and attach the returned DLM lock.
 */
400 static int ll_intent_file_open(struct file *file, void *lmm, int lmmsize,
401 struct lookup_intent *itp)
403 struct dentry *de = file->f_path.dentry;
404 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
405 struct dentry *parent = de->d_parent;
406 const char *name = NULL;
408 struct md_op_data *op_data;
409 struct ptlrpc_request *req = NULL;
413 LASSERT(parent != NULL);
414 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
416 /* if server supports open-by-fid, or file name is invalid, don't pack
417 * name in open request */
418 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
419 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
420 name = de->d_name.name;
421 len = de->d_name.len;
424 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
425 name, len, 0, LUSTRE_OPC_ANY, NULL);
427 RETURN(PTR_ERR(op_data));
428 op_data->op_data = lmm;
429 op_data->op_data_size = lmmsize;
431 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
432 &ll_md_blocking_ast, 0);
433 ll_finish_md_op_data(op_data);
435 /* reason for keep own exit path - don`t flood log
436 * with messages with -ESTALE errors.
438 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
439 it_open_error(DISP_OPEN_OPEN, itp))
441 ll_release_openhandle(de, itp);
445 if (it_disposition(itp, DISP_LOOKUP_NEG))
446 GOTO(out, rc = -ENOENT);
448 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
449 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
450 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
454 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
/* If the intent returned a lock, record it against the inode. */
455 if (!rc && itp->d.lustre.it_lock_mode)
456 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
459 ptlrpc_req_finished(req);
460 ll_intent_drop_lock(itp);
/*
 * Populate an obd_client_handle from the MDT reply carried by the
 * intent: file handle, fid, lease-lock cookie and open flags. Then
 * register the handle for open replay in case of MDS recovery.
 */
465 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
466 struct obd_client_handle *och)
468 struct ptlrpc_request *req = it->d.lustre.it_data;
469 struct mdt_body *body;
471 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
472 och->och_fh = body->mbo_handle;
473 och->och_fid = body->mbo_fid1;
474 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
475 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
476 och->och_flags = it->it_flags;
478 return md_set_open_replay_data(md_exp, och, it);
/*
 * Finish the client-side part of an open: optionally fill @och from the
 * intent reply, then attach the ll_file_data to the struct file and
 * initialize readahead state, open mode and cl_io context fields.
 */
481 static int ll_local_open(struct file *file, struct lookup_intent *it,
482 struct ll_file_data *fd, struct obd_client_handle *och)
484 struct inode *inode = file->f_path.dentry->d_inode;
487 LASSERT(!LUSTRE_FPRIVATE(file));
494 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
499 LUSTRE_FPRIVATE(file) = fd;
500 ll_readahead_init(inode, &fd->fd_ras);
501 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
503 /* ll_cl_context initialize */
504 rwlock_init(&fd->fd_lock);
505 INIT_LIST_HEAD(&fd->fd_lccs);
/* Open a file, and (for the very first open) create objects on the OSTs at
 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
 * creation or open until ll_lov_setstripe() ioctl is called.
 *
 * VFS ->open() hook. An intent may have been attached to the file by the
 * lookup path; if absent (e.g. NFS export path), a fresh IT_OPEN intent
 * is built from f_flags. Cached per-mode open handles on the inode are
 * reused when possible; otherwise an intent open is sent to the MDS.
 * NOTE(review): several error/cleanup branches are not visible in this
 * extract.
 */
510 /* Open a file, and (for the very first open) create objects on the OSTs at
511 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
512 * creation or open until ll_lov_setstripe() ioctl is called.
514 * If we already have the stripe MD locally then we don't request it in
515 * md_open(), by passing a lmm_size = 0.
517 * It is up to the application to ensure no other processes open this file
518 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
519 * used. We might be able to avoid races of that sort by getting lli_open_sem
520 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
521 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
523 int ll_file_open(struct inode *inode, struct file *file)
525 struct ll_inode_info *lli = ll_i2info(inode);
526 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
527 .it_flags = file->f_flags };
528 struct obd_client_handle **och_p = NULL;
529 __u64 *och_usecount = NULL;
530 struct ll_file_data *fd;
534 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
535 PFID(ll_inode2fid(inode)), inode, file->f_flags);
537 it = file->private_data; /* XXX: compat macro */
538 file->private_data = NULL; /* prevent ll_local_open assertion */
540 fd = ll_file_data_get();
542 GOTO(out_openerr, rc = -ENOMEM);
545 if (S_ISDIR(inode->i_mode))
546 ll_authorize_statahead(inode, fd);
/* Root dentry: attach fd and return early, no MDS open needed. */
548 if (inode->i_sb->s_root == file->f_path.dentry) {
549 LUSTRE_FPRIVATE(file) = fd;
553 if (!it || !it->d.lustre.it_disposition) {
554 /* Convert f_flags into access mode. We cannot use file->f_mode,
555 * because everything but O_ACCMODE mask was stripped from
557 if ((oit.it_flags + 1) & O_ACCMODE)
559 if (file->f_flags & O_TRUNC)
560 oit.it_flags |= FMODE_WRITE;
562 /* kernel only call f_op->open in dentry_open. filp_open calls
563 * dentry_open after call to open_namei that checks permissions.
564 * Only nfsd_open call dentry_open directly without checking
565 * permissions and because of that this code below is safe. */
566 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
567 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
569 /* We do not want O_EXCL here, presumably we opened the file
570 * already? XXX - NFS implications? */
571 oit.it_flags &= ~O_EXCL;
573 /* bug20584, if "it_flags" contains O_CREAT, the file will be
574 * created if necessary, then "IT_CREAT" should be set to keep
575 * consistent with it */
576 if (oit.it_flags & O_CREAT)
577 oit.it_op |= IT_CREAT;
583 /* Let's see if we have file open on MDS already. */
584 if (it->it_flags & FMODE_WRITE) {
585 och_p = &lli->lli_mds_write_och;
586 och_usecount = &lli->lli_open_fd_write_count;
587 } else if (it->it_flags & FMODE_EXEC) {
588 och_p = &lli->lli_mds_exec_och;
589 och_usecount = &lli->lli_open_fd_exec_count;
591 och_p = &lli->lli_mds_read_och;
592 och_usecount = &lli->lli_open_fd_read_count;
595 mutex_lock(&lli->lli_och_mutex);
596 if (*och_p) { /* Open handle is present */
597 if (it_disposition(it, DISP_OPEN_OPEN)) {
598 /* Well, there's extra open request that we do not need,
599 let's close it somehow. This will decref request. */
600 rc = it_open_error(DISP_OPEN_OPEN, it);
602 mutex_unlock(&lli->lli_och_mutex);
603 GOTO(out_openerr, rc);
606 ll_release_openhandle(file->f_path.dentry, it);
610 rc = ll_local_open(file, it, fd, NULL);
613 mutex_unlock(&lli->lli_och_mutex);
614 GOTO(out_openerr, rc);
617 LASSERT(*och_usecount == 0);
618 if (!it->d.lustre.it_disposition) {
619 /* We cannot just request lock handle now, new ELC code
620 means that one of other OPEN locks for this file
621 could be cancelled, and since blocking ast handler
622 would attempt to grab och_mutex as well, that would
623 result in a deadlock */
624 mutex_unlock(&lli->lli_och_mutex);
626 * Normally called under two situations:
628 * 2. A race/condition on MDS resulting in no open
629 * handle to be returned from LOOKUP|OPEN request,
630 * for example if the target entry was a symlink.
632 * Always fetch MDS_OPEN_LOCK if this is not setstripe.
634 * Always specify MDS_OPEN_BY_FID because we don't want
635 * to get file with different fid.
637 it->it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID;
638 rc = ll_intent_file_open(file, NULL, 0, it);
640 GOTO(out_openerr, rc);
644 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
646 GOTO(out_och_free, rc = -ENOMEM);
650 /* md_intent_lock() didn't get a request ref if there was an
651 * open error, so don't do cleanup on the request here
653 /* XXX (green): Should not we bail out on any error here, not
654 * just open error? */
655 rc = it_open_error(DISP_OPEN_OPEN, it);
657 GOTO(out_och_free, rc);
659 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
660 "inode %p: disposition %x, status %d\n", inode,
661 it_disposition(it, ~0), it->d.lustre.it_status);
663 rc = ll_local_open(file, it, fd, *och_p);
665 GOTO(out_och_free, rc);
667 mutex_unlock(&lli->lli_och_mutex);
670 /* Must do this outside lli_och_mutex lock to prevent deadlock where
671 different kind of OPEN lock for this same inode gets cancelled
672 by ldlm_cancel_lru */
673 if (!S_ISREG(inode->i_mode))
674 GOTO(out_och_free, rc);
678 cl_lov_delay_create_clear(&file->f_flags);
679 GOTO(out_och_free, rc);
683 if (och_p && *och_p) {
684 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
685 *och_p = NULL; /* OBD_FREE writes some magic there */
688 mutex_unlock(&lli->lli_och_mutex);
691 if (lli->lli_opendir_key == fd)
692 ll_deauthorize_statahead(inode, fd);
694 ll_file_data_put(fd);
696 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* Drop the intent's request ref taken during the enqueue. */
699 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
700 ptlrpc_req_finished(it->d.lustre.it_data);
701 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for lease locks: on LDLM_CB_BLOCKING the lease lock is
 * cancelled asynchronously (the lease is considered broken); the
 * LDLM_CB_CANCELING arm's body is not visible in this extract.
 */
707 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
708 struct ldlm_lock_desc *desc, void *data, int flag)
711 struct lustre_handle lockh;
715 case LDLM_CB_BLOCKING:
716 ldlm_lock2handle(lock, &lockh);
717 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
719 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
723 case LDLM_CB_CANCELING:
/*
 * Acquire a lease and open the file.
 *
 * When @file is given, the existing open handle on the inode is reused
 * (only valid when this process is the sole opener); the old handle
 * cookie is passed via op_data->op_handle so the MDT can verify same-
 * owner. The lease lock is enqueued with LDLM_FL_NO_LRU|LDLM_FL_EXCL
 * and validated to carry the OPEN inodebit. Errors unwind through
 * out_close / out_release_it. NOTE(review): some branches (och
 * allocation, error labels) fall in gaps of this extract.
 */
731 * Acquire a lease and open the file.
733 static struct obd_client_handle *
734 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
737 struct lookup_intent it = { .it_op = IT_OPEN };
738 struct ll_sb_info *sbi = ll_i2sbi(inode);
739 struct md_op_data *op_data;
740 struct ptlrpc_request *req = NULL;
741 struct lustre_handle old_handle = { 0 };
742 struct obd_client_handle *och = NULL;
747 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
748 RETURN(ERR_PTR(-EINVAL));
751 struct ll_inode_info *lli = ll_i2info(inode);
752 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
753 struct obd_client_handle **och_p;
756 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
757 RETURN(ERR_PTR(-EPERM));
759 /* Get the openhandle of the file */
761 mutex_lock(&lli->lli_och_mutex);
762 if (fd->fd_lease_och != NULL) {
763 mutex_unlock(&lli->lli_och_mutex);
767 if (fd->fd_och == NULL) {
768 if (file->f_mode & FMODE_WRITE) {
769 LASSERT(lli->lli_mds_write_och != NULL);
770 och_p = &lli->lli_mds_write_och;
771 och_usecount = &lli->lli_open_fd_write_count;
773 LASSERT(lli->lli_mds_read_och != NULL);
774 och_p = &lli->lli_mds_read_och;
775 och_usecount = &lli->lli_open_fd_read_count;
777 if (*och_usecount == 1) {
784 mutex_unlock(&lli->lli_och_mutex);
785 if (rc < 0) /* more than 1 opener */
788 LASSERT(fd->fd_och != NULL);
789 old_handle = fd->fd_och->och_fh;
794 RETURN(ERR_PTR(-ENOMEM));
796 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
797 LUSTRE_OPC_ANY, NULL);
799 GOTO(out, rc = PTR_ERR(op_data));
801 /* To tell the MDT this openhandle is from the same owner */
802 op_data->op_handle = old_handle;
804 it.it_flags = fmode | open_flags;
805 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
806 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
807 &ll_md_blocking_lease_ast,
808 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
809 * it can be cancelled which may mislead applications that the lease is
811 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
812 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
813 * doesn't deal with openhandle, so normal openhandle will be leaked. */
814 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
815 ll_finish_md_op_data(op_data);
816 ptlrpc_req_finished(req);
818 GOTO(out_release_it, rc);
820 if (it_disposition(&it, DISP_LOOKUP_NEG))
821 GOTO(out_release_it, rc = -ENOENT);
823 rc = it_open_error(DISP_OPEN_OPEN, &it);
825 GOTO(out_release_it, rc);
827 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
828 ll_och_fill(sbi->ll_md_exp, &it, och);
830 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
831 GOTO(out_close, rc = -EOPNOTSUPP);
833 /* already get lease, handle lease lock */
834 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
835 if (it.d.lustre.it_lock_mode == 0 ||
836 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
837 /* open lock must return for lease */
838 CERROR(DFID "lease granted but no open lock, %d/"LPU64".\n",
839 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
840 it.d.lustre.it_lock_bits);
841 GOTO(out_close, rc = -EPROTO);
844 ll_intent_release(&it);
848 /* Cancel open lock */
849 if (it.d.lustre.it_lock_mode != 0) {
850 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
851 it.d.lustre.it_lock_mode);
852 it.d.lustre.it_lock_mode = 0;
853 och->och_lease_handle.cookie = 0ULL;
855 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, och, inode, 0, NULL);
857 CERROR("%s: error closing file "DFID": %d\n",
858 ll_get_fsname(inode->i_sb, NULL, 0),
859 PFID(&ll_i2info(inode)->lli_fid), rc2);
860 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
862 ll_intent_release(&it);
/*
 * Check whether a layout swap can be done between two inodes:
 * both must be regular files, writable by the caller, and on the same
 * superblock. NOTE(review): the error return values for each failed
 * check are not visible in this extract.
 */
870 * Check whether a layout swap can be done between two inodes.
872 * \param[in] inode1 First inode to check
873 * \param[in] inode2 Second inode to check
875 * \retval 0 on success, layout swap can be performed between both inodes
876 * \retval negative error code if requirements are not met
878 static int ll_check_swap_layouts_validity(struct inode *inode1,
879 struct inode *inode2)
881 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
884 if (inode_permission(inode1, MAY_WRITE) ||
885 inode_permission(inode2, MAY_WRITE))
888 if (inode1->i_sb != inode2->i_sb)
/*
 * Close @och with the MDS_CLOSE_LAYOUT_SWAP bias so that the MDT
 * atomically swaps the layouts of @inode and @inode2 at close time.
 * Both inodes are validated first; identical FIDs are rejected.
 */
894 static int ll_swap_layouts_close(struct obd_client_handle *och,
895 struct inode *inode, struct inode *inode2)
897 const struct lu_fid *fid1 = ll_inode2fid(inode);
898 const struct lu_fid *fid2;
902 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
903 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
905 rc = ll_check_swap_layouts_validity(inode, inode2);
907 GOTO(out_free_och, rc);
909 /* We now know that inode2 is a lustre inode */
910 fid2 = ll_inode2fid(inode2);
/* Swapping a file's layout with itself makes no sense. */
912 rc = lu_fid_cmp(fid1, fid2);
914 GOTO(out_free_och, rc = -EINVAL);
916 /* Close the file and swap layouts between inode & inode2.
917 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
918 * because we still need it to pack l_remote_handle to MDT. */
919 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, och, inode,
920 MDS_CLOSE_LAYOUT_SWAP, inode2);
922 och = NULL; /* freed in ll_close_inode_openhandle() */
/*
 * Release lease and close the file.
 * Checks whether the lease lock was already cancelled (broken); if not,
 * cancels it explicitly, then closes the open handle. The broken status
 * is optionally reported through @lease_broken.
 */
932 * Release lease and close the file.
933 * It will check if the lease has ever broken.
935 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
938 struct ldlm_lock *lock;
939 bool cancelled = true;
943 lock = ldlm_handle2lock(&och->och_lease_handle);
945 lock_res_and_lock(lock);
946 cancelled = ldlm_is_cancel(lock);
947 unlock_res_and_lock(lock);
951 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
952 PFID(&ll_i2info(inode)->lli_fid), cancelled);
955 ldlm_cli_cancel(&och->och_lease_handle, 0);
956 if (lease_broken != NULL)
957 *lease_broken = cancelled;
959 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, och, inode,
/*
 * Merge inode attributes: take the MDS-provided timestamps cached in
 * ll_inode_info, then fold in the (possibly newer) timestamps, size and
 * blocks reported by the OSTs via cl_object_attr_get(). All updates are
 * done under the inode size lock.
 */
965 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
967 struct ll_inode_info *lli = ll_i2info(inode);
968 struct cl_object *obj = lli->lli_clob;
969 struct cl_attr *attr = vvp_env_thread_attr(env);
977 ll_inode_size_lock(inode);
979 /* merge timestamps the most recently obtained from mds with
980 timestamps obtained from osts */
981 LTIME_S(inode->i_atime) = lli->lli_atime;
982 LTIME_S(inode->i_mtime) = lli->lli_mtime;
983 LTIME_S(inode->i_ctime) = lli->lli_ctime;
985 atime = LTIME_S(inode->i_atime);
986 mtime = LTIME_S(inode->i_mtime);
987 ctime = LTIME_S(inode->i_ctime);
989 cl_object_attr_lock(obj);
990 rc = cl_object_attr_get(env, obj, attr);
991 cl_object_attr_unlock(obj);
994 GOTO(out_size_unlock, rc);
/* Keep the newest of MDS- and OST-reported timestamps. */
996 if (atime < attr->cat_atime)
997 atime = attr->cat_atime;
999 if (ctime < attr->cat_ctime)
1000 ctime = attr->cat_ctime;
1002 if (mtime < attr->cat_mtime)
1003 mtime = attr->cat_mtime;
1005 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
1006 PFID(&lli->lli_fid), attr->cat_size);
1008 i_size_write(inode, attr->cat_size);
1009 inode->i_blocks = attr->cat_blocks;
1011 LTIME_S(inode->i_atime) = atime;
1012 LTIME_S(inode->i_mtime) = mtime;
1013 LTIME_S(inode->i_ctime) = ctime;
1016 ll_inode_size_unlock(inode);
/*
 * Decide whether atime updates should be skipped for @file, mirroring
 * the kernel's file_accessed()/touch_atime() checks (O_NOATIME, inode
 * and mount noatime flags, readonly mount, nodiratime on directories).
 * NOTE(review): the per-check return statements are not visible in this
 * extract.
 */
1021 static bool file_is_noatime(const struct file *file)
1023 const struct vfsmount *mnt = file->f_path.mnt;
1024 const struct inode *inode = file->f_path.dentry->d_inode;
1026 /* Adapted from file_accessed() and touch_atime().*/
1027 if (file->f_flags & O_NOATIME)
1030 if (inode->i_flags & S_NOATIME)
1033 if (IS_NOATIME(inode))
1036 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1039 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1042 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/*
 * Initialize a cl_io for a read or write on @file: nonblock/append/sync
 * flags, the cl_object, lock-request mode (never for nolock files,
 * mandatory for O_APPEND, otherwise "maybe"), and noatime handling.
 */
1048 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
1050 struct inode *inode = file->f_path.dentry->d_inode;
1052 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1054 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1055 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1056 file->f_flags & O_DIRECT ||
1059 io->ci_obj = ll_i2info(inode)->lli_clob;
1060 io->ci_lockreq = CILR_MAYBE;
1061 if (ll_file_nolock(file)) {
1062 io->ci_lockreq = CILR_NEVER;
1063 io->ci_no_srvlock = 1;
1064 } else if (file->f_flags & O_APPEND) {
1065 io->ci_lockreq = CILR_MANDATORY;
1068 io->ci_noatime = file_is_noatime(file);
/*
 * Common read/write engine: sets up the cl_io, takes the per-inode range
 * lock for writes (and O_DIRECT reads, see LU-6227), runs cl_io_loop(),
 * restarts the IO when the layout changed underneath it, and tallies
 * read/write byte statistics. Returns bytes transferred or a negative
 * errno. NOTE(review): the restart loop's backward jump and parts of the
 * splice/default switch arms fall in gaps of this extract.
 */
1072 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1073 struct file *file, enum cl_io_type iot,
1074 loff_t *ppos, size_t count)
1076 struct vvp_io *vio = vvp_env_io(env);
1077 struct inode *inode = file->f_path.dentry->d_inode;
1078 struct ll_inode_info *lli = ll_i2info(inode);
1079 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1083 struct range_lock range;
1087 CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: "LPU64", count: %zu\n",
1088 file->f_path.dentry->d_name.name, iot, *ppos, count);
1091 io = vvp_env_thread_io(env);
1092 ll_io_init(io, file, iot == CIT_WRITE);
1094 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1095 bool range_locked = false;
/* O_APPEND writes must serialize against the whole file. */
1097 if (file->f_flags & O_APPEND)
1098 range_lock_init(&range, 0, LUSTRE_EOF);
1100 range_lock_init(&range, *ppos, *ppos + count - 1);
1102 vio->vui_fd = LUSTRE_FPRIVATE(file);
1103 vio->vui_io_subtype = args->via_io_subtype;
1105 switch (vio->vui_io_subtype) {
1107 vio->vui_iov = args->u.normal.via_iov;
1108 vio->vui_nrsegs = args->u.normal.via_nrsegs;
1109 vio->vui_tot_nrsegs = vio->vui_nrsegs;
1110 vio->vui_iocb = args->u.normal.via_iocb;
1111 /* Direct IO reads must also take range lock,
1112 * or multiple reads will try to work on the same pages
1113 * See LU-6227 for details. */
1114 if (((iot == CIT_WRITE) ||
1115 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1116 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1117 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1119 rc = range_lock(&lli->lli_write_tree, &range);
1123 range_locked = true;
1127 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1128 vio->u.splice.vui_flags = args->u.splice.via_flags;
1131 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1135 ll_cl_add(file, env, io);
1136 rc = cl_io_loop(env, io);
1137 ll_cl_remove(file, env);
1140 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1142 range_unlock(&lli->lli_write_tree, &range);
1145 /* cl_io_rw_init() handled IO */
1149 if (io->ci_nob > 0) {
1150 result += io->ci_nob;
1151 count -= io->ci_nob;
1152 *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1154 /* prepare IO restart */
1155 if (count > 0 && args->via_io_subtype == IO_NORMAL) {
1156 args->u.normal.via_iov = vio->vui_iov;
1157 args->u.normal.via_nrsegs = vio->vui_tot_nrsegs;
1162 cl_io_fini(env, io);
/* Layout change (ci_need_restart) requires redoing the remaining IO. */
1164 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1166 "%s: restart %s from %lld, count:%zu, result: %zd\n",
1167 file->f_path.dentry->d_name.name,
1168 iot == CIT_READ ? "read" : "write",
1169 *ppos, count, result);
1173 if (iot == CIT_READ) {
1175 ll_stats_ops_tally(ll_i2sbi(inode),
1176 LPROC_LL_READ_BYTES, result);
1177 } else if (iot == CIT_WRITE) {
1179 ll_stats_ops_tally(ll_i2sbi(inode),
1180 LPROC_LL_WRITE_BYTES, result);
1181 fd->fd_write_failed = false;
1182 } else if (rc != -ERESTARTSYS) {
1183 fd->fd_write_failed = true;
1187 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1189 return result > 0 ? result : rc;
/*
 * Validate an iovec array and compute the total byte count, as in the
 * kernel's __generic_file_aio_write_nolock: rejects negative lengths or
 * wrap-around, and verifies each segment is readable via access_ok().
 * NOTE(review): the continue/early-exit lines between the checks are
 * not visible in this extract.
 */
1193 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1195 static int ll_file_get_iov_count(const struct iovec *iov,
1196 unsigned long *nr_segs, size_t *count)
1201 for (seg = 0; seg < *nr_segs; seg++) {
1202 const struct iovec *iv = &iov[seg];
1205 * If any segment has a negative length, or the cumulative
1206 * length ever wraps negative then return -EINVAL.
1209 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1211 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1216 cnt -= iv->iov_len; /* This segment is no good */
/*
 * Async read entry point: validates the iovec, copies it to a local
 * buffer (env-embedded for single-segment, heap-allocated otherwise —
 * the single-segment branch is partly outside this extract), then runs
 * the common IO engine with CIT_READ.
 */
1223 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1224 unsigned long nr_segs, loff_t pos)
1227 struct vvp_io_args *args;
1228 struct iovec *local_iov;
1234 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1238 env = cl_env_get(&refcheck);
1240 RETURN(PTR_ERR(env));
1243 local_iov = &ll_env_info(env)->lti_local_iov;
1246 OBD_ALLOC(local_iov, sizeof(*iov) * nr_segs);
1247 if (local_iov == NULL) {
1248 cl_env_put(env, &refcheck);
1252 memcpy(local_iov, iov, sizeof(*iov) * nr_segs);
1255 args = ll_env_args(env, IO_NORMAL);
1256 args->u.normal.via_iov = local_iov;
1257 args->u.normal.via_nrsegs = nr_segs;
1258 args->u.normal.via_iocb = iocb;
1260 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1261 &iocb->ki_pos, count);
1263 cl_env_put(env, &refcheck);
1266 OBD_FREE(local_iov, sizeof(*iov) * nr_segs);
/*
 * Synchronous read: wraps the user buffer in a one-segment iovec and a
 * sync kiocb, then delegates to ll_file_aio_read(). The ki_left/
 * ki_nbytes field name depends on the kernel version (HAVE_KIOCB_KI_LEFT).
 */
1271 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1275 struct iovec iov = { .iov_base = buf, .iov_len = count };
1276 struct kiocb *kiocb;
1281 env = cl_env_get(&refcheck);
1283 RETURN(PTR_ERR(env));
1285 kiocb = &ll_env_info(env)->lti_kiocb;
1286 init_sync_kiocb(kiocb, file);
1287 kiocb->ki_pos = *ppos;
1288 #ifdef HAVE_KIOCB_KI_LEFT
1289 kiocb->ki_left = count;
1291 kiocb->ki_nbytes = count;
1294 result = ll_file_aio_read(kiocb, &iov, 1, kiocb->ki_pos);
1295 *ppos = kiocb->ki_pos;
1297 cl_env_put(env, &refcheck);
/*
 * Async/vectored write entry point; mirrors ll_file_aio_read() but
 * dispatches ll_file_io_generic() with CIT_WRITE.
 * NOTE(review): fragmentary listing — interior lines (including the
 * env-local vs. OBD_ALLOC iovec selection) are elided.
 */
1302 * Write to a file (through the page cache).
1305 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1306 unsigned long nr_segs, loff_t pos)
1309 struct vvp_io_args *args;
1310 struct iovec *local_iov;
1316 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1320 env = cl_env_get(&refcheck);
1322 RETURN(PTR_ERR(env));
/* Fast path: single iovec slot inside the cl environment. */
1325 local_iov = &ll_env_info(env)->lti_local_iov;
/* Slow path: heap copy for multi-segment writes. */
1328 OBD_ALLOC(local_iov, sizeof(*iov) * nr_segs);
1329 if (local_iov == NULL) {
1330 cl_env_put(env, &refcheck);
1334 memcpy(local_iov, iov, sizeof(*iov) * nr_segs);
1337 args = ll_env_args(env, IO_NORMAL);
1338 args->u.normal.via_iov = local_iov;
1339 args->u.normal.via_nrsegs = nr_segs;
1340 args->u.normal.via_iocb = iocb;
1342 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1343 &iocb->ki_pos, count);
1344 cl_env_put(env, &refcheck);
/* Free only taken when the heap copy was used (guard elided in listing). */
1347 OBD_FREE(local_iov, sizeof(*iov) * nr_segs);
/*
 * Synchronous write(2) entry point: builds a one-element iovec and a
 * synchronous kiocb, forwards to ll_file_aio_write(), and writes the
 * resulting file position back to *ppos.
 * NOTE(review): fragmentary listing; error-path lines are elided.
 */
1352 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1353 size_t count, loff_t *ppos)
1356 struct iovec iov = { .iov_base = (void __user *)buf,
1358 struct kiocb *kiocb;
1363 env = cl_env_get(&refcheck);
1365 RETURN(PTR_ERR(env));
1367 kiocb = &ll_env_info(env)->lti_kiocb;
1368 init_sync_kiocb(kiocb, file);
1369 kiocb->ki_pos = *ppos;
/* Kernel-version compatibility: field holding the byte count differs. */
1370 #ifdef HAVE_KIOCB_KI_LEFT
1371 kiocb->ki_left = count;
1373 kiocb->ki_nbytes = count;
1376 result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos);
1377 *ppos = kiocb->ki_pos;
1379 cl_env_put(env, &refcheck);
/*
 * splice_read entry point: routes pagecache reads into a pipe by filling
 * the IO_SPLICE argument slot and dispatching CIT_READ through
 * ll_file_io_generic().
 */
1384 * Send file content (through pagecache) somewhere with helper
1386 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1387 struct pipe_inode_info *pipe, size_t count,
1391 struct vvp_io_args *args;
1396 env = cl_env_get(&refcheck);
1398 RETURN(PTR_ERR(env));
1400 args = ll_env_args(env, IO_SPLICE);
1401 args->u.splice.via_pipe = pipe;
1402 args->u.splice.via_flags = flags;
1404 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1405 cl_env_put(env, &refcheck);
/*
 * Apply striping (LOV EA) to a file by re-opening it with an intent that
 * carries the lov_user_md, under the inode size lock. The transient open
 * handle is released immediately afterwards.
 * NOTE(review): fragmentary listing — intervening lines between the
 * intent open and cleanup are elided.
 */
1409 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1410 __u64 flags, struct lov_user_md *lum,
1413 struct lookup_intent oit = {
1415 .it_flags = flags | MDS_OPEN_BY_FID,
1420 ll_inode_size_lock(inode);
1421 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1423 GOTO(out_unlock, rc);
/* Drop the MDS open handle created solely to set the stripe EA. */
1425 ll_release_openhandle(file->f_path.dentry, &oit);
1428 ll_inode_size_unlock(inode);
1429 ll_intent_release(&oit);
1430 cl_lov_delay_create_clear(&file->f_flags);
/*
 * Fetch the LOV EA (striping metadata) for @filename from the MDS.
 * On success *lmmp points into the reply buffer of *request (caller must
 * keep/free the request), *lmm_size holds its length. The EA arrives in
 * little-endian wire order and is swabbed to host order for userspace
 * when the host is big-endian.
 * NOTE(review): fragmentary listing; several guard/label lines elided.
 */
1435 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1436 struct lov_mds_md **lmmp, int *lmm_size,
1437 struct ptlrpc_request **request)
1439 struct ll_sb_info *sbi = ll_i2sbi(inode);
1440 struct mdt_body *body;
1441 struct lov_mds_md *lmm = NULL;
1442 struct ptlrpc_request *req = NULL;
1443 struct md_op_data *op_data;
1446 rc = ll_get_default_mdsize(sbi, &lmmsize);
1450 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1451 strlen(filename), lmmsize,
1452 LUSTRE_OPC_ANY, NULL);
1453 if (IS_ERR(op_data))
1454 RETURN(PTR_ERR(op_data));
1456 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1457 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1458 ll_finish_md_op_data(op_data);
1460 CDEBUG(D_INFO, "md_getattr_name failed "
1461 "on %s: rc %d\n", filename, rc);
1465 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1466 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1468 lmmsize = body->mbo_eadatasize;
/* No EA present (or zero-sized) => nothing to return. */
1470 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1472 GOTO(out, rc = -ENODATA);
1475 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1476 LASSERT(lmm != NULL);
1478 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1479 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1480 GOTO(out, rc = -EPROTO);
1484 * This is coming from the MDS, so is probably in
1485 * little endian. We convert it to host endian before
1486 * passing it to userspace.
/* True only on big-endian hosts: wire order differs from host order. */
1488 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1491 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1492 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1495 /* if function called for directory - we should
1496 * avoid swab not existent lsm objects */
1497 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1498 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1499 if (S_ISREG(body->mbo_mode))
1500 lustre_swab_lov_user_md_objects(
1501 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1503 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1504 lustre_swab_lov_user_md_v3(
1505 (struct lov_user_md_v3 *)lmm);
1506 if (S_ISREG(body->mbo_mode))
1507 lustre_swab_lov_user_md_objects(
1508 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1515 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: copies a lov_user_md (with one trailing OST
 * object entry) from userspace and applies it via
 * ll_lov_setstripe_ea_info(). Requires CAP_SYS_ADMIN.
 */
1520 static int ll_lov_setea(struct inode *inode, struct file *file,
1523 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1524 struct lov_user_md *lump;
1525 int lum_size = sizeof(struct lov_user_md) +
1526 sizeof(struct lov_user_ost_data);
1530 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1533 OBD_ALLOC_LARGE(lump, lum_size);
1537 if (copy_from_user(lump, (struct lov_user_md __user *)arg, lum_size)) {
1538 OBD_FREE_LARGE(lump, lum_size);
1542 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1544 OBD_FREE_LARGE(lump, lum_size);
/*
 * LL_IOC_LOV_GETSTRIPE handler: asks the cl object layer to fill the
 * user-supplied lov_user_md from the inode's layout.
 */
1548 static int ll_file_getstripe(struct inode *inode,
1549 struct lov_user_md __user *lum)
1556 env = cl_env_get(&refcheck);
1558 RETURN(PTR_ERR(env));
1560 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum);
1561 cl_env_put(env, &refcheck);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copies the user lov_user_md to a kernel
 * buffer, applies it, clears the user's stripe count field, refreshes
 * the layout generation, and reads the resulting stripe info back.
 * NOTE(review): fragmentary listing; success/error guards elided.
 */
1565 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1568 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1569 struct lov_user_md *klum;
1571 __u64 flags = FMODE_WRITE;
1574 rc = ll_copy_user_md(lum, &klum);
1579 rc = ll_lov_setstripe_ea_info(inode, file, flags, klum, lum_size);
1583 put_user(0, &lum->lmm_stripe_count);
1585 ll_layout_refresh(inode, &gen);
1586 rc = ll_file_getstripe(inode, (struct lov_user_md __user *)arg);
1589 OBD_FREE(klum, lum_size);
/*
 * LL_IOC_GROUP_LOCK handler: acquire a group lock for @arg (the gid) and
 * record it in the per-fd state. Rejects gid 0 and files opened with
 * nolock; guards against a second grouplock on the same fd both before
 * and after the (blocking) cl_get_grouplock() call, since another thread
 * may race in between.
 */
1594 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1596 struct ll_inode_info *lli = ll_i2info(inode);
1597 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1598 struct ll_grouplock grouplock;
1603 CWARN("group id for group lock must not be 0\n");
1607 if (ll_file_nolock(file))
1608 RETURN(-EOPNOTSUPP);
1610 spin_lock(&lli->lli_lock);
1611 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1612 CWARN("group lock already existed with gid %lu\n",
1613 fd->fd_grouplock.lg_gid);
1614 spin_unlock(&lli->lli_lock);
1617 LASSERT(fd->fd_grouplock.lg_lock == NULL);
1618 spin_unlock(&lli->lli_lock);
/* May block unless O_NONBLOCK; lli_lock is dropped across this call. */
1620 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
1621 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* Re-check under the lock: another thread may have won the race. */
1625 spin_lock(&lli->lli_lock);
1626 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1627 spin_unlock(&lli->lli_lock);
1628 CERROR("another thread just won the race\n");
1629 cl_put_grouplock(&grouplock);
1633 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1634 fd->fd_grouplock = grouplock;
1635 spin_unlock(&lli->lli_lock);
1637 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK handler: verify the fd holds a group lock whose
 * gid matches @arg, detach it from the fd state under lli_lock, then
 * release the underlying cl grouplock outside the spinlock.
 */
1641 static int ll_put_grouplock(struct inode *inode, struct file *file,
1644 struct ll_inode_info *lli = ll_i2info(inode);
1645 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1646 struct ll_grouplock grouplock;
1649 spin_lock(&lli->lli_lock);
1650 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1651 spin_unlock(&lli->lli_lock);
1652 CWARN("no group lock held\n");
1656 LASSERT(fd->fd_grouplock.lg_lock != NULL);
1658 if (fd->fd_grouplock.lg_gid != arg) {
1659 CWARN("group lock %lu doesn't match current id %lu\n",
1660 arg, fd->fd_grouplock.lg_gid);
1661 spin_unlock(&lli->lli_lock);
/* Take a local copy so the cl release can happen without lli_lock. */
1665 grouplock = fd->fd_grouplock;
1666 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1667 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1668 spin_unlock(&lli->lli_lock);
1670 cl_put_grouplock(&grouplock);
1671 CDEBUG(D_INFO, "group lock %lu released\n", arg);
/*
 * Close the MDS open handle carried by a lookup intent (no-op for the
 * filesystem root or when the intent holds no DISP_OPEN_OPEN
 * disposition). Also drops the enqueue-open request reference if the
 * intent still holds one.
 */
1676 * Close inode open handle
1678 * \param dentry [in] dentry which contains the inode
1679 * \param it [in,out] intent which contains open info and result
1682 * \retval <0 failure
1684 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1686 struct inode *inode = dentry->d_inode;
1687 struct obd_client_handle *och;
1693 /* Root ? Do nothing. */
1694 if (dentry->d_inode->i_sb->s_root == dentry)
1697 /* No open handle to close? Move away */
1698 if (!it_disposition(it, DISP_OPEN_OPEN))
1701 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1703 OBD_ALLOC(och, sizeof(*och));
1705 GOTO(out, rc = -ENOMEM);
1707 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1709 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1710 och, inode, 0, NULL);
1712 /* this one is in place of ll_file_open */
1713 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1714 ptlrpc_req_finished(it->d.lustre.it_data);
1715 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Core FIEMAP implementation: validates/handles fiemap flags (including
 * FIEMAP_FLAG_SYNC via filemap_fdatawrite), glimpses the file size if
 * the cached size is zero, then delegates extent mapping to
 * cl_object_fiemap() keyed by an ll_fiemap_info_key.
 * NOTE(review): fragmentary listing; some guard/label lines elided.
 */
1721 * Get size for inode for which FIEMAP mapping is requested.
1722 * Make the FIEMAP get_info call and returns the result.
1723 * \param fiemap kernel buffer to hold extens
1724 * \param num_bytes kernel buffer size
1726 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
1732 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
1735 /* Checks for fiemap flags */
1736 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* Report back which flags we support, per FIEMAP convention. */
1737 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1741 /* Check for FIEMAP_FLAG_SYNC */
1742 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1743 rc = filemap_fdatawrite(inode->i_mapping);
1748 env = cl_env_get(&refcheck);
1750 RETURN(PTR_ERR(env));
1752 if (i_size_read(inode) == 0) {
1753 rc = ll_glimpse_size(inode);
1758 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1759 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
1760 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
1762 /* If filesize is 0, then there would be no objects for mapping */
1763 if (fmkey.lfik_oa.o_size == 0) {
1764 fiemap->fm_mapped_extents = 0;
1768 fmkey.lfik_fiemap = *fiemap;
1770 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
1771 &fmkey, fiemap, &num_bytes);
1773 cl_env_put(env, &refcheck);
/*
 * OBD_IOC_FID2PATH handler: resolves a FID to a path via the MDC.
 * Requires CAP_DAC_READ_SEARCH unless LL_SBI_USER_FID2PATH is set on
 * the mount. The output buffer is sized from the user-supplied
 * gf_pathlen (capped at PATH_MAX).
 */
1777 int ll_fid2path(struct inode *inode, void __user *arg)
1779 struct obd_export *exp = ll_i2mdexp(inode);
1780 const struct getinfo_fid2path __user *gfin = arg;
1782 struct getinfo_fid2path *gfout;
1788 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1789 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1792 /* Only need to get the buflen */
1793 if (get_user(pathlen, &gfin->gf_pathlen))
1796 if (pathlen > PATH_MAX)
1799 outsize = sizeof(*gfout) + pathlen;
1800 OBD_ALLOC(gfout, outsize);
1804 if (copy_from_user(gfout, arg, sizeof(*gfout)))
1805 GOTO(gf_free, rc = -EFAULT);
1807 /* Call mdc_iocontrol */
1808 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1812 if (copy_to_user(arg, gfout, outsize))
1816 OBD_FREE(gfout, outsize);
/*
 * Fetch the file's data version by running a CIT_DATA_VERSION cl_io
 * loop. The io is retried from scratch if the layout changed mid-way
 * (ci_need_restart). Files with no cl object report version 0.
 */
1821 * Read the data_version for inode.
1823 * This value is computed using stripe object version on OST.
1824 * Version is computed using server side locking.
1826 * @param flags if do sync on the OST side;
1828 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
1829 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
1831 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
1833 struct cl_object *obj = ll_i2info(inode)->lli_clob;
1841 /* If no file object initialized, we consider its version is 0. */
1847 env = cl_env_get(&refcheck);
1849 RETURN(PTR_ERR(env));
1851 io = vvp_env_thread_io(env);
1853 io->u.ci_data_version.dv_data_version = 0;
1854 io->u.ci_data_version.dv_flags = flags;
1857 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
1858 result = cl_io_loop(env, io);
1860 result = io->ci_result;
1862 *data_version = io->u.ci_data_version.dv_data_version;
1864 cl_io_fini(env, io);
/* Layout changed during the io: restart the whole operation. */
1866 if (unlikely(io->ci_need_restart))
1869 cl_env_put(env, &refcheck);
/*
 * HSM release: take a write lease on the file, grab the latest data
 * version (flushing/dropping cached pages with LL_DV_WR_FLUSH), merge
 * attributes, then close the open handle with MDS_HSM_RELEASE so the
 * MDT can free the OST objects. The lease lock handle is intentionally
 * kept until close (see comment below).
 */
1875 * Trigger a HSM release request for the provided inode.
1877 int ll_hsm_release(struct inode *inode)
1879 struct cl_env_nest nest;
1881 struct obd_client_handle *och = NULL;
1882 __u64 data_version = 0;
1886 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
1887 ll_get_fsname(inode->i_sb, NULL, 0),
1888 PFID(&ll_i2info(inode)->lli_fid));
1890 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
1892 GOTO(out, rc = PTR_ERR(och));
1894 /* Grab latest data_version and [am]time values */
1895 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
1899 env = cl_env_nested_get(&nest);
1901 GOTO(out, rc = PTR_ERR(env));
1903 ll_merge_attr(env, inode);
1904 cl_env_nested_put(&nest, env);
1906 /* Release the file.
1907 * NB: lease lock handle is released in mdc_hsm_release_pack() because
1908 * we still need it to pack l_remote_handle to MDT. */
1909 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, och, inode,
1910 MDS_HSM_RELEASE, &data_version);
/* On the error path close the lease-open file handle ourselves. */
1915 if (och != NULL && !IS_ERR(och)) /* close the file */
1916 ll_lease_close(och, inode, NULL);
/*
 * Scratch state for ll_swap_layouts(): the two inodes plus (per the
 * elided fields) the data versions and check flags that may themselves
 * be swapped to enforce a canonical lock ordering.
 */
1921 struct ll_swap_stack {
1924 struct inode *inode1;
1925 struct inode *inode2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS core: swap the layouts of two files.
 * Steps visible here: validate the pair, normalize ordering by FID
 * (swapping the per-file dv/check state along with the inodes), take
 * group locks on both files when a gid is supplied (flushes dirty
 * cache), verify the data versions have not changed if requested, and
 * finally send LL_IOC_LOV_SWAP_LAYOUTS to the MDT via md_op_data.
 * NOTE(review): fragmentary listing; several labels and guards elided.
 */
1930 static int ll_swap_layouts(struct file *file1, struct file *file2,
1931 struct lustre_swap_layouts *lsl)
1933 struct mdc_swap_layouts msl;
1934 struct md_op_data *op_data;
1937 struct ll_swap_stack *llss = NULL;
1940 OBD_ALLOC_PTR(llss);
1944 llss->inode1 = file1->f_path.dentry->d_inode;
1945 llss->inode2 = file2->f_path.dentry->d_inode;
1947 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
1951 /* we use 2 bool because it is easier to swap than 2 bits */
1952 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1953 llss->check_dv1 = true;
1955 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1956 llss->check_dv2 = true;
1958 /* we cannot use lsl->sl_dvX directly because we may swap them */
1959 llss->dv1 = lsl->sl_dv1;
1960 llss->dv2 = lsl->sl_dv2;
1962 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
1963 if (rc == 0) /* same file, done! */
/* Canonical FID order prevents lock-ordering deadlocks between peers. */
1966 if (rc < 0) { /* sequentialize it */
1967 swap(llss->inode1, llss->inode2);
1969 swap(llss->dv1, llss->dv2);
1970 swap(llss->check_dv1, llss->check_dv2);
1974 if (gid != 0) { /* application asks to flush dirty cache */
1975 rc = ll_get_grouplock(llss->inode1, file1, gid);
1979 rc = ll_get_grouplock(llss->inode2, file2, gid);
1981 ll_put_grouplock(llss->inode1, file1, gid);
1986 /* ultimate check, before swaping the layouts we check if
1987 * dataversion has changed (if requested) */
1988 if (llss->check_dv1) {
1989 rc = ll_data_version(llss->inode1, &dv, 0);
1992 if (dv != llss->dv1)
1993 GOTO(putgl, rc = -EAGAIN);
1996 if (llss->check_dv2) {
1997 rc = ll_data_version(llss->inode2, &dv, 0);
2000 if (dv != llss->dv2)
2001 GOTO(putgl, rc = -EAGAIN);
2004 /* struct md_op_data is used to send the swap args to the mdt
2005 * only flags is missing, so we use struct mdc_swap_layouts
2006 * through the md_op_data->op_data */
2007 /* flags from user space have to be converted before they are send to
2008 * server, no flag is sent today, they are only used on the client */
2011 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2012 0, LUSTRE_OPC_ANY, &msl);
2013 if (IS_ERR(op_data))
2014 GOTO(free, rc = PTR_ERR(op_data));
2016 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2017 sizeof(*op_data), op_data, NULL);
2018 ll_finish_md_op_data(op_data);
/* putgl label (elided): drop the group locks in reverse order. */
2025 ll_put_grouplock(llss->inode2, file2, gid);
2026 ll_put_grouplock(llss->inode1, file1, gid);
/*
 * Set/clear HSM state flags on a file via the MDT.
 * Validates the masks against HSM_FLAGS_MASK, restricts non-root users
 * to HSM_USER_MASK flags, and bounds the archive id before sending
 * LL_IOC_HSM_STATE_SET through obd_iocontrol().
 */
2036 static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2038 struct md_op_data *op_data;
2042 /* Detect out-of range masks */
2043 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2046 /* Non-root users are forbidden to set or clear flags which are
2047 * NOT defined in HSM_USER_MASK. */
2048 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2049 !cfs_capable(CFS_CAP_SYS_ADMIN))
2052 /* Detect out-of range archive id */
2053 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2054 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2057 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2058 LUSTRE_OPC_ANY, hss);
2059 if (IS_ERR(op_data))
2060 RETURN(PTR_ERR(op_data));
2062 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2063 sizeof(*op_data), op_data, NULL);
2065 ll_finish_md_op_data(op_data);
/*
 * HSM import: mark a regular file as archived/exists/released, then
 * force-set its mode, ownership, size and [am]times from the
 * hsm_user_import descriptor via ll_setattr_raw() under i_mutex — the
 * file's data lives in the archive, only metadata is set here.
 */
2070 static int ll_hsm_import(struct inode *inode, struct file *file,
2071 struct hsm_user_import *hui)
2073 struct hsm_state_set *hss = NULL;
2074 struct iattr *attr = NULL;
2078 if (!S_ISREG(inode->i_mode))
2084 GOTO(out, rc = -ENOMEM);
2086 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2087 hss->hss_archive_id = hui->hui_archive_id;
2088 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2089 rc = ll_hsm_state_set(inode, hss);
2093 OBD_ALLOC_PTR(attr);
2095 GOTO(out, rc = -ENOMEM);
/* Force S_IFREG; only permission bits are taken from userspace. */
2097 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2098 attr->ia_mode |= S_IFREG;
2099 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2100 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2101 attr->ia_size = hui->hui_size;
2102 attr->ia_mtime.tv_sec = hui->hui_mtime;
2103 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2104 attr->ia_atime.tv_sec = hui->hui_atime;
2105 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2107 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2108 ATTR_UID | ATTR_GID |
2109 ATTR_MTIME | ATTR_MTIME_SET |
2110 ATTR_ATIME | ATTR_ATIME_SET;
2112 mutex_lock(&inode->i_mutex);
2114 rc = ll_setattr_raw(file->f_path.dentry, attr, true);
2118 mutex_unlock(&inode->i_mutex);
/* Map an fmode_t (FMODE_READ/FMODE_WRITE bits) to the LL_LEASE_* type
 * bits reported to userspace by the lease ioctls. */
2130 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2132 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2133 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * Main ioctl dispatcher for regular files. Handles the LL_IOC_*,
 * FSFILT_IOC_* and OBD_IOC_* commands visible below, falling back at
 * the end to the registered ll_iocontrol_call() handlers and finally to
 * obd_iocontrol() on the data export. Several cases copy argument
 * structures from/to userspace and forward to the helpers defined
 * earlier in this file (setstripe, grouplock, HSM, lease, ...).
 * NOTE(review): fragmentary listing — many case bodies have elided
 * lines (allocations, GOTO labels, break statements).
 */
2137 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2139 struct inode *inode = file->f_path.dentry->d_inode;
2140 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2144 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2145 PFID(ll_inode2fid(inode)), inode, cmd);
2146 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2148 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2149 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2153 case LL_IOC_GETFLAGS:
2154 /* Get the current value of the file flags */
2155 return put_user(fd->fd_flags, (int __user *)arg);
2156 case LL_IOC_SETFLAGS:
2157 case LL_IOC_CLRFLAGS:
2158 /* Set or clear specific file flags */
2159 /* XXX This probably needs checks to ensure the flags are
2160 * not abused, and to handle any flag side effects.
2162 if (get_user(flags, (int __user *) arg))
2165 if (cmd == LL_IOC_SETFLAGS) {
/* Disabling locking is only meaningful for O_DIRECT files. */
2166 if ((flags & LL_FILE_IGNORE_LOCK) &&
2167 !(file->f_flags & O_DIRECT)) {
2168 CERROR("%s: unable to disable locking on "
2169 "non-O_DIRECT file\n", current->comm);
2173 fd->fd_flags |= flags;
2175 fd->fd_flags &= ~flags;
2178 case LL_IOC_LOV_SETSTRIPE:
2179 RETURN(ll_lov_setstripe(inode, file, arg));
2180 case LL_IOC_LOV_SETEA:
2181 RETURN(ll_lov_setea(inode, file, arg));
2182 case LL_IOC_LOV_SWAP_LAYOUTS: {
2184 struct lustre_swap_layouts lsl;
2186 if (copy_from_user(&lsl, (char __user *)arg,
2187 sizeof(struct lustre_swap_layouts)))
/* Both files must be writable for a layout swap. */
2190 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
2193 file2 = fget(lsl.sl_fd);
2197 /* O_WRONLY or O_RDWR */
2198 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
2199 GOTO(out, rc = -EPERM);
2201 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
2202 struct inode *inode2;
2203 struct ll_inode_info *lli;
2204 struct obd_client_handle *och = NULL;
2206 if (lsl.sl_flags != SWAP_LAYOUTS_CLOSE)
2207 GOTO(out, rc = -EINVAL);
/* SWAP_LAYOUTS_CLOSE consumes the fd's lease open handle. */
2209 lli = ll_i2info(inode);
2210 mutex_lock(&lli->lli_och_mutex);
2211 if (fd->fd_lease_och != NULL) {
2212 och = fd->fd_lease_och;
2213 fd->fd_lease_och = NULL;
2215 mutex_unlock(&lli->lli_och_mutex);
2217 GOTO(out, rc = -ENOLCK);
2218 inode2 = file2->f_path.dentry->d_inode;
2219 rc = ll_swap_layouts_close(och, inode, inode2);
2221 rc = ll_swap_layouts(file, file2, &lsl);
2227 case LL_IOC_LOV_GETSTRIPE:
2228 RETURN(ll_file_getstripe(inode,
2229 (struct lov_user_md __user *)arg));
2230 case FSFILT_IOC_GETFLAGS:
2231 case FSFILT_IOC_SETFLAGS:
2232 RETURN(ll_iocontrol(inode, file, cmd, arg));
2233 case FSFILT_IOC_GETVERSION_OLD:
2234 case FSFILT_IOC_GETVERSION:
2235 RETURN(put_user(inode->i_generation, (int __user *)arg));
2236 case LL_IOC_GROUP_LOCK:
2237 RETURN(ll_get_grouplock(inode, file, arg));
2238 case LL_IOC_GROUP_UNLOCK:
2239 RETURN(ll_put_grouplock(inode, file, arg));
2240 case IOC_OBD_STATFS:
2241 RETURN(ll_obd_statfs(inode, (void __user *)arg));
2243 /* We need to special case any other ioctls we want to handle,
2244 * to send them to the MDS/OST as appropriate and to properly
2245 * network encode the arg field.
2246 case FSFILT_IOC_SETVERSION_OLD:
2247 case FSFILT_IOC_SETVERSION:
2249 case LL_IOC_FLUSHCTX:
2250 RETURN(ll_flush_ctx(inode));
2251 case LL_IOC_PATH2FID: {
2252 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2253 sizeof(struct lu_fid)))
2258 case LL_IOC_GETPARENT:
2259 RETURN(ll_getparent(file, (struct getparent __user *)arg));
2261 case OBD_IOC_FID2PATH:
2262 RETURN(ll_fid2path(inode, (void __user *)arg));
2263 case LL_IOC_DATA_VERSION: {
2264 struct ioc_data_version idv;
2267 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
/* Mask off anything but the two supported flush flags. */
2270 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2271 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2274 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2280 case LL_IOC_GET_MDTIDX: {
2283 mdtidx = ll_get_mdt_idx(inode);
2287 if (put_user((int)mdtidx, (int __user *)arg))
2292 case OBD_IOC_GETDTNAME:
2293 case OBD_IOC_GETMDNAME:
2294 RETURN(ll_get_obd_name(inode, cmd, arg));
2295 case LL_IOC_HSM_STATE_GET: {
2296 struct md_op_data *op_data;
2297 struct hsm_user_state *hus;
2304 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2305 LUSTRE_OPC_ANY, hus);
2306 if (IS_ERR(op_data)) {
2308 RETURN(PTR_ERR(op_data));
2311 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2314 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2317 ll_finish_md_op_data(op_data);
2321 case LL_IOC_HSM_STATE_SET: {
2322 struct hsm_state_set *hss;
2329 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2334 rc = ll_hsm_state_set(inode, hss);
2339 case LL_IOC_HSM_ACTION: {
2340 struct md_op_data *op_data;
2341 struct hsm_current_action *hca;
2348 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2349 LUSTRE_OPC_ANY, hca);
2350 if (IS_ERR(op_data)) {
2352 RETURN(PTR_ERR(op_data));
2355 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2358 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2361 ll_finish_md_op_data(op_data);
2365 case LL_IOC_SET_LEASE: {
2366 struct ll_inode_info *lli = ll_i2info(inode);
2367 struct obd_client_handle *och = NULL;
/* The requested lease mode must be compatible with f_mode. */
2372 case LL_LEASE_WRLCK:
2373 if (!(file->f_mode & FMODE_WRITE))
2375 fmode = FMODE_WRITE;
2377 case LL_LEASE_RDLCK:
2378 if (!(file->f_mode & FMODE_READ))
2382 case LL_LEASE_UNLCK:
2383 mutex_lock(&lli->lli_och_mutex);
2384 if (fd->fd_lease_och != NULL) {
2385 och = fd->fd_lease_och;
2386 fd->fd_lease_och = NULL;
2388 mutex_unlock(&lli->lli_och_mutex);
2393 fmode = och->och_flags;
2394 rc = ll_lease_close(och, inode, &lease_broken);
/* Report which lease type was dropped. */
2401 RETURN(ll_lease_type_from_fmode(fmode));
2406 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2408 /* apply for lease */
2409 och = ll_lease_open(inode, file, fmode, 0);
2411 RETURN(PTR_ERR(och));
2414 mutex_lock(&lli->lli_och_mutex);
2415 if (fd->fd_lease_och == NULL) {
2416 fd->fd_lease_och = och;
2419 mutex_unlock(&lli->lli_och_mutex);
2421 /* impossible now that only excl is supported for now */
2422 ll_lease_close(och, inode, &lease_broken);
2427 case LL_IOC_GET_LEASE: {
2428 struct ll_inode_info *lli = ll_i2info(inode);
2429 struct ldlm_lock *lock = NULL;
2432 mutex_lock(&lli->lli_och_mutex);
2433 if (fd->fd_lease_och != NULL) {
2434 struct obd_client_handle *och = fd->fd_lease_och;
2436 lock = ldlm_handle2lock(&och->och_lease_handle);
2438 lock_res_and_lock(lock);
/* A cancelled lease lock reports no lease held. */
2439 if (!ldlm_is_cancel(lock))
2440 fmode = och->och_flags;
2442 unlock_res_and_lock(lock);
2443 LDLM_LOCK_PUT(lock);
2446 mutex_unlock(&lli->lli_och_mutex);
2448 RETURN(ll_lease_type_from_fmode(fmode));
2450 case LL_IOC_HSM_IMPORT: {
2451 struct hsm_user_import *hui;
2457 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
2462 rc = ll_hsm_import(inode, file, hui);
/* default: try the dynamically registered handlers, then the OBD. */
2472 ll_iocontrol_call(inode, file, cmd, arg, &err))
2475 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2476 (void __user *)arg));
/*
 * Local fallback (only when the kernel lacks generic_file_llseek_size):
 * validate the offset against the file's max size and sign rules, then
 * commit it to f_pos, resetting f_version on change.
 */
2481 #ifndef HAVE_FILE_LLSEEK_SIZE
2482 static inline loff_t
2483 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2485 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2487 if (offset > maxsize)
2490 if (offset != file->f_pos) {
2491 file->f_pos = offset;
2492 file->f_version = 0;
/*
 * Local copy of the kernel's generic_file_llseek_size() for kernels
 * without it: handles SEEK_CUR position queries without rewriting
 * f_pos, SEEK_DATA/SEEK_HOLE against a virtual in-memory layout, and
 * commits via llseek_execute(). Uses i_mutex to serialize SEEK_CUR
 * read-modify-write against concurrent seeks.
 * NOTE(review): fragmentary listing; the switch structure is elided.
 */
2498 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2499 loff_t maxsize, loff_t eof)
2501 struct inode *inode = file->f_path.dentry->d_inode;
2509 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2510 * position-querying operation. Avoid rewriting the "same"
2511 * f_pos value back to the file because a concurrent read(),
2512 * write() or lseek() might have altered it
2517 * f_lock protects against read/modify/write race with other
2518 * SEEK_CURs. Note that parallel writes and reads behave
2521 mutex_lock(&inode->i_mutex);
2522 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2523 mutex_unlock(&inode->i_mutex);
2527 * In the generic case the entire file is data, so as long as
2528 * offset isn't at the end of the file then the offset is data.
2535 * There is a virtual hole at the end of the file, so as long as
2536 * offset isn't i_size or larger, return i_size.
2544 return llseek_execute(file, offset, maxsize);
/*
 * llseek entry point: glimpses the cluster-wide file size first when
 * the origin needs it (SEEK_END/SEEK_HOLE/SEEK_DATA), then delegates to
 * the (possibly local) generic_file_llseek_size with Lustre's maxbytes.
 */
2548 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2550 struct inode *inode = file->f_path.dentry->d_inode;
2551 loff_t retval, eof = 0;
/* Computed only for the debug trace below. */
2554 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2555 (origin == SEEK_CUR) ? file->f_pos : 0);
2556 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2557 PFID(ll_inode2fid(inode)), inode, retval, retval,
2559 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2561 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2562 retval = ll_glimpse_size(inode);
2565 eof = i_size_read(inode);
2568 retval = ll_generic_file_llseek_size(file, offset, origin,
2569 ll_file_maxbytes(inode), eof);
/*
 * flush (close-time) handler: surface any recorded async writeback
 * errors for this inode exactly once — errors already reported to the
 * application via fd_write_failed are not reported again. Returns -EIO
 * when an unreported error exists, else 0.
 */
2573 static int ll_flush(struct file *file, fl_owner_t id)
2575 struct inode *inode = file->f_path.dentry->d_inode;
2576 struct ll_inode_info *lli = ll_i2info(inode);
2577 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2580 LASSERT(!S_ISDIR(inode->i_mode));
2582 /* catch async errors that were recorded back when async writeback
2583 * failed for pages in this mapping. */
2584 rc = lli->lli_async_rc;
2585 lli->lli_async_rc = 0;
2586 if (lli->lli_clob != NULL) {
2587 err = lov_read_and_clear_async_rc(lli->lli_clob);
2592 /* The application has been told write failure already.
2593 * Do not report failure again. */
2594 if (fd->fd_write_failed)
2596 return rc ? -EIO : 0;
/*
 * Run a CIT_FSYNC cl_io over [start, end] of the inode with the given
 * sync mode (CL_FSYNC_NONE/LOCAL/DISCARD/ALL). On success returns the
 * number of pages written (fi_nr_written), otherwise a negative errno.
 */
2600 * Called to make sure a portion of file has been written out.
2601 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2603 * Return how many pages have been written.
2605 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2606 enum cl_fsync_mode mode, int ignore_layout)
2608 struct cl_env_nest nest;
2611 struct cl_fsync_io *fio;
2615 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2616 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2619 env = cl_env_nested_get(&nest);
2621 RETURN(PTR_ERR(env));
2623 io = vvp_env_thread_io(env);
2624 io->ci_obj = ll_i2info(inode)->lli_clob;
2625 io->ci_ignore_layout = ignore_layout;
2627 /* initialize parameters for sync */
2628 fio = &io->u.ci_fsync;
2629 fio->fi_start = start;
2631 fio->fi_fid = ll_inode2fid(inode);
2632 fio->fi_mode = mode;
2633 fio->fi_nr_written = 0;
2635 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2636 result = cl_io_loop(env, io);
2638 result = io->ci_result;
2640 result = fio->fi_nr_written;
2641 cl_io_fini(env, io);
2642 cl_env_nested_put(&nest, env);
/*
 * fsync entry point with three kernel-API variants selected by #ifdef
 * (4-arg range fsync, 2-arg, and the legacy dentry form). Waits for
 * in-flight page I/O, surfaces recorded async writeback errors, fsyncs
 * the MDS inode via md_fsync(), and for regular files pushes data with
 * cl_sync_file_range(CL_FSYNC_ALL), updating fd_write_failed to match.
 * NOTE(review): fragmentary listing; several guard lines are elided.
 */
2648 * When dentry is provided (the 'else' case), *file->f_path.dentry may be
2649 * null and dentry must be used directly rather than pulled from
2650 * *file->f_path.dentry as is done otherwise.
2653 #ifdef HAVE_FILE_FSYNC_4ARGS
2654 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2656 struct dentry *dentry = file->f_path.dentry;
2657 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2658 int ll_fsync(struct file *file, int datasync)
2660 struct dentry *dentry = file->f_path.dentry;
2662 loff_t end = LLONG_MAX;
2664 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2667 loff_t end = LLONG_MAX;
2669 struct inode *inode = dentry->d_inode;
2670 struct ll_inode_info *lli = ll_i2info(inode);
2671 struct ptlrpc_request *req;
2672 struct obd_capa *oc;
2676 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2677 PFID(ll_inode2fid(inode)), inode);
2678 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2680 #ifdef HAVE_FILE_FSYNC_4ARGS
2681 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2682 mutex_lock(&inode->i_mutex);
2684 /* fsync's caller has already called _fdata{sync,write}, we want
2685 * that IO to finish before calling the osc and mdc sync methods */
2686 rc = filemap_fdatawait(inode->i_mapping);
2689 /* catch async errors that were recorded back when async writeback
2690 * failed for pages in this mapping. */
2691 if (!S_ISDIR(inode->i_mode)) {
2692 err = lli->lli_async_rc;
2693 lli->lli_async_rc = 0;
2696 err = lov_read_and_clear_async_rc(lli->lli_clob);
2701 oc = ll_mdscapa_get(inode);
2702 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2708 ptlrpc_req_finished(req);
2710 if (S_ISREG(inode->i_mode)) {
2711 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2713 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2714 if (rc == 0 && err < 0)
2717 fd->fd_write_failed = true;
2719 fd->fd_write_failed = false;
2722 #ifdef HAVE_FILE_FSYNC_4ARGS
2723 mutex_unlock(&inode->i_mutex);
/*
 * Apply an advisory lock (POSIX fcntl record lock or BSD flock) on a Lustre
 * file: translate the VFS file_lock into an LDLM_FLOCK enqueue on the MDS,
 * then mirror the server result into the local VFS lock tables so the
 * kernel's bookkeeping matches the cluster-wide state.
 *
 * NOTE(review): this numbered listing omits intermediate lines (switch
 * labels, braces, RETURN paths); do not treat it as the complete function.
 */
2729 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2731 struct inode *inode = file->f_path.dentry->d_inode;
2732 struct ll_sb_info *sbi = ll_i2sbi(inode);
2733 struct ldlm_enqueue_info einfo = {
2734 .ei_type = LDLM_FLOCK,
2735 .ei_cb_cp = ldlm_flock_completion_ast,
2736 .ei_cbdata = file_lock,
2738 struct md_op_data *op_data;
2739 struct lustre_handle lockh = {0};
2740 ldlm_policy_data_t flock = {{0}};
/* Saved so the original type can be restored after the enqueue (see below). */
2741 int fl_type = file_lock->fl_type;
2747 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
2748 PFID(ll_inode2fid(inode)), file_lock);
2750 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2752 if (file_lock->fl_flags & FL_FLOCK) {
2753 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2754 /* flocks are whole-file locks */
2755 flock.l_flock.end = OFFSET_MAX;
2756 /* For flocks the owner is determined by the local file descriptor */
2757 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2758 } else if (file_lock->fl_flags & FL_POSIX) {
2759 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2760 flock.l_flock.start = file_lock->fl_start;
2761 flock.l_flock.end = file_lock->fl_end;
2765 flock.l_flock.pid = file_lock->fl_pid;
2767 /* Somewhat ugly workaround for svc lockd.
2768 * lockd installs custom fl_lmops->lm_compare_owner that checks
2769 * for the fl_owner to be the same (which it always is on local node
2770 * I guess between lockd processes) and then compares pid.
2771 * As such we assign pid to the owner field to make it all work,
2772 * conflict with normal locks is unlikely since pid space and
2773 * pointer space for current->files are not intersecting */
2774 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2775 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* Presumably the F_RDLCK case: read locks map to LCK_PR (case labels
 * omitted from this listing — confirm against full source). */
2779 einfo.ei_mode = LCK_PR;
2782 /* An unlock request may or may not have any relation to
2783 * existing locks so we may not be able to pass a lock handle
2784 * via a normal ldlm_lock_cancel() request. The request may even
2785 * unlock a byte range in the middle of an existing lock. In
2786 * order to process an unlock request we need all of the same
2787 * information that is given with a normal read or write record
2788 * lock request. To avoid creating another ldlm unlock (cancel)
2789 * message we'll treat a LCK_NL flock request as an unlock. */
2790 einfo.ei_mode = LCK_NL;
/* Presumably the F_WRLCK case: write locks map to LCK_PW. */
2793 einfo.ei_mode = LCK_PW;
2796 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* Non-blocking set (presumably F_SETLK): fail rather than wait. */
2811 flags = LDLM_FL_BLOCK_NOWAIT;
/* Test-only request (presumably F_GETLK): probe, don't acquire. */
2817 flags = LDLM_FL_TEST_LOCK;
2820 CERROR("unknown fcntl lock command: %d\n", cmd);
2824 /* Save the old mode so that if the mode in the lock changes we
2825 * can decrement the appropriate reader or writer refcount. */
2826 file_lock->fl_type = einfo.ei_mode;
2828 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2829 LUSTRE_OPC_ANY, NULL);
2830 if (IS_ERR(op_data))
2831 RETURN(PTR_ERR(op_data));
2833 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags="LPX64", mode=%u, "
2834 "start="LPU64", end="LPU64"\n", PFID(ll_inode2fid(inode)),
2835 flock.l_flock.pid, flags, einfo.ei_mode,
2836 flock.l_flock.start, flock.l_flock.end);
2838 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data, &lockh,
2841 /* Restore the file lock type if not TEST lock. */
2842 if (!(flags & LDLM_FL_TEST_LOCK))
2843 file_lock->fl_type = fl_type;
/* Mirror the (successful) server lock into the local VFS lock lists. */
2845 if ((file_lock->fl_flags & FL_FLOCK) &&
2846 (rc == 0 || file_lock->fl_type == F_UNLCK))
2847 rc2 = flock_lock_file_wait(file, file_lock);
2848 if ((file_lock->fl_flags & FL_POSIX) &&
2849 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2850 !(flags & LDLM_FL_TEST_LOCK))
2851 rc2 = posix_lock_file_wait(file, file_lock);
2853 if (rc2 && file_lock->fl_type != F_UNLCK) {
/* Local bookkeeping failed: undo the server-side lock by re-enqueueing
 * as LCK_NL (treated as unlock, see comment above). */
2854 einfo.ei_mode = LCK_NL;
2855 md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data,
2860 ll_finish_md_op_data(op_data);
/*
 * Look up the FID of entry @name (length @namelen) under directory @parent
 * by issuing a getattr-by-name RPC to the MDS and copying mbo_fid1 out of
 * the reply body into *fid.
 *
 * NOTE(review): listing is gappy — the rc check after md_getattr_name and
 * the out_req/RETURN paths are omitted here.
 */
2865 int ll_get_fid_by_name(struct inode *parent, const char *name,
2866 int namelen, struct lu_fid *fid)
2868 struct md_op_data *op_data = NULL;
2869 struct mdt_body *body;
2870 struct ptlrpc_request *req;
2874 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
2875 LUSTRE_OPC_ANY, NULL);
2876 if (IS_ERR(op_data))
2877 RETURN(PTR_ERR(op_data));
/* Only the FID is needed from the reply. */
2879 op_data->op_valid = OBD_MD_FLID;
2880 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
2881 ll_finish_md_op_data(op_data);
2885 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
2887 GOTO(out_req, rc = -EFAULT);
2889 *fid = body->mbo_fid1;
2891 ptlrpc_req_finished(req);
/*
 * Migrate directory entry @name under @parent to MDT @mdtidx. Resolves the
 * child FID (from the dcache if possible, otherwise via ll_get_fid_by_name),
 * checks it is not already on the target MDT, then performs a same-name
 * md_rename with CLI_MIGRATE set to move the object.
 *
 * NOTE(review): listing is gappy — several error checks, labels and the
 * final RETURN are omitted here.
 */
2895 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
2896 const char *name, int namelen)
2898 struct dentry *dchild = NULL;
2899 struct inode *child_inode = NULL;
2900 struct md_op_data *op_data;
2901 struct ptlrpc_request *request = NULL;
2906 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
2907 name, PFID(ll_inode2fid(parent)), mdtidx);
2909 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
2910 0, LUSTRE_OPC_ANY, NULL);
2911 if (IS_ERR(op_data))
2912 RETURN(PTR_ERR(op_data));
2914 /* Get child FID first */
2915 qstr.hash = full_name_hash(name, namelen);
/* Fast path: resolve the child through the dcache to avoid an RPC. */
2918 dchild = d_lookup(file->f_path.dentry, &qstr);
2919 if (dchild != NULL) {
2920 if (dchild->d_inode != NULL) {
2921 child_inode = igrab(dchild->d_inode);
2922 if (child_inode != NULL) {
/* Hold i_mutex across the migration; dropped in the cleanup path below. */
2923 mutex_lock(&child_inode->i_mutex);
2924 op_data->op_fid3 = *ll_inode2fid(child_inode);
2925 ll_invalidate_aliases(child_inode);
/* Slow path (dcache miss): ask the MDS for the FID. */
2930 rc = ll_get_fid_by_name(parent, name, namelen,
2936 if (!fid_is_sane(&op_data->op_fid3)) {
2937 CERROR("%s: migrate %s , but fid "DFID" is insane\n",
2938 ll_get_fsname(parent->i_sb, NULL, 0), name,
2939 PFID(&op_data->op_fid3));
2940 GOTO(out_free, rc = -EINVAL);
/* Nothing to do if the object already lives on the target MDT. */
2943 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
2948 CDEBUG(D_INFO, "%s:"DFID" is already on MDT%d.\n", name,
2949 PFID(&op_data->op_fid3), mdtidx);
2950 GOTO(out_free, rc = 0);
2953 op_data->op_mds = mdtidx;
2954 op_data->op_cli_flags = CLI_MIGRATE;
/* Migration is implemented as a rename onto the same name with
 * CLI_MIGRATE; the server relocates the object to op_mds. */
2955 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
2956 namelen, name, namelen, &request);
2958 ll_update_times(request, parent);
2960 ptlrpc_req_finished(request);
2965 if (child_inode != NULL) {
/* The old inode is stale after migration; force it out of use. */
2966 clear_nlink(child_inode);
2967 mutex_unlock(&child_inode->i_mutex);
2971 ll_finish_md_op_data(op_data);
/*
 * Lock method installed by ll_file_operations_noflock (-o noflock mount
 * option). Per the table comment in this file it returns ENOSYS for all
 * flock calls; the body is omitted from this numbered listing.
 */
2976 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2984 * test if some locks matching bits and l_req_mode are acquired
2985 * - bits can be in different locks
2986 * - if found clear the common lock bits in *bits
2987 * - the bits not found, are kept in *bits
2989 * \param bits [IN] searched lock bits
2990 * \param l_req_mode [IN] searched lock mode
2991 * \retval boolean, true iff all bits are found
/*
 * Test whether MD (inodebits) locks covering *bits are already cached for
 * @inode in mode @l_req_mode (LCK_MINMODE means "any of CR/CW/PR/PW").
 * Bits that are found are cleared from *bits; see the doxygen comment
 * above for the full contract.
 */
2993 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
2995 struct lustre_handle lockh;
2996 ldlm_policy_data_t policy;
2997 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2998 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
3007 fid = &ll_i2info(inode)->lli_fid;
3008 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3009 ldlm_lockname[mode]);
/* TEST_LOCK: match only, do not take a reference on the found lock. */
3011 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* Probe one inodebit at a time; stop early once all bits are found. */
3012 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3013 policy.l_inodebits.bits = *bits & (1 << i);
3014 if (policy.l_inodebits.bits == 0)
3017 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3018 &policy, mode, &lockh)) {
3019 struct ldlm_lock *lock;
3021 lock = ldlm_handle2lock(&lockh);
/* A matched lock may carry more bits than probed; clear them all. */
3024 ~(lock->l_policy_data.l_inodebits.bits);
3025 LDLM_LOCK_PUT(lock);
3027 *bits &= ~policy.l_inodebits.bits;
/*
 * Try to match (and take a reference on) a cached MD inodebits lock on
 * @inode covering @bits in one of @mode. On success the handle is stored
 * in *lockh and the granted mode is returned; the caller must drop the
 * reference with ldlm_lock_decref().
 */
3034 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
3035 struct lustre_handle *lockh, __u64 flags,
3038 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
3043 fid = &ll_i2info(inode)->lli_fid;
3044 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3046 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3047 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the result of an inode revalidation RPC: -ENOENT on an
 * already-unlinked inode is not an error (nlink is updated instead); other
 * failures are logged, with EACCES/EIDRM demoted to D_INFO since they are
 * expected under permission/identity revocation.
 */
3052 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3054 /* Already unlinked. Just update nlink and return success */
3055 if (rc == -ENOENT) {
3057 /* This path cannot be hit for regular files unless in
3058 * case of obscure races, so no need to to validate
3060 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3062 } else if (rc != 0) {
3063 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3064 "%s: revalidate FID "DFID" error: rc = %d\n",
3065 ll_get_fsname(inode->i_sb, NULL, 0),
3066 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate the dentry's inode attributes against the MDS for the lock
 * bits in @ibits. Two strategies, chosen by server capability:
 *  - OBD_CONNECT_ATTRFID: intent getattr/lookup by FID (no name), which
 *    also refreshes dcache validity;
 *  - otherwise: plain md_getattr, but only if no matching MD lock is
 *    already cached locally.
 *
 * NOTE(review): listing is gappy — several rc checks and GOTO/RETURN
 * paths are omitted here.
 */
3072 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3074 struct inode *inode = dentry->d_inode;
3075 struct ptlrpc_request *req = NULL;
3076 struct obd_export *exp;
3080 LASSERT(inode != NULL);
3082 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3083 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3085 exp = ll_i2mdexp(inode);
3087 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3088 * But under CMD case, it caused some lock issues, should be fixed
3089 * with new CMD ibits lock. See bug 12718 */
3090 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3091 struct lookup_intent oit = { .it_op = IT_GETATTR };
3092 struct md_op_data *op_data;
/* LOOKUP-only revalidation can use the cheaper IT_LOOKUP intent. */
3094 if (ibits == MDS_INODELOCK_LOOKUP)
3095 oit.it_op = IT_LOOKUP;
3097 /* Call getattr by fid, so do not provide name at all. */
3098 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3099 dentry->d_inode, NULL, 0, 0,
3100 LUSTRE_OPC_ANY, NULL);
3101 if (IS_ERR(op_data))
3102 RETURN(PTR_ERR(op_data));
3104 rc = md_intent_lock(exp, op_data, &oit, &req,
3105 &ll_md_blocking_ast, 0);
3106 ll_finish_md_op_data(op_data);
3108 rc = ll_inode_revalidate_fini(inode, rc);
3112 rc = ll_revalidate_it_finish(req, &oit, dentry);
3114 ll_intent_release(&oit);
3118 /* Unlinked? Unhash dentry, so it is not picked up later by
3119 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3120 here to preserve get_cwd functionality on 2.6.
3122 if (!dentry->d_inode->i_nlink)
3123 d_lustre_invalidate(dentry, 0);
3125 ll_lookup_finish_locks(&oit, dentry);
/* No ATTRFID support: fetch attrs only if no MD lock already covers
 * the requested bits (the cached attributes would then be valid). */
3126 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3127 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3128 u64 valid = OBD_MD_FLGETATTR;
3129 struct md_op_data *op_data;
/* Regular files also need striping EA sized into the reply buffer. */
3132 if (S_ISREG(inode->i_mode)) {
3133 rc = ll_get_default_mdsize(sbi, &ealen);
3136 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3139 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3140 0, ealen, LUSTRE_OPC_ANY,
3142 if (IS_ERR(op_data))
3143 RETURN(PTR_ERR(op_data));
3145 op_data->op_valid = valid;
3146 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
3147 * capa for this inode. Because we only keep capas of dirs
3149 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3150 ll_finish_md_op_data(op_data);
3152 rc = ll_inode_revalidate_fini(inode, rc);
3156 rc = ll_prep_inode(&inode, req, NULL, NULL);
3159 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge the per-stripe attributes from all MDTs
 * (via md_merge_attr on the lsm_md) and apply the combined nlink, blocks,
 * size and times to the local inode.
 */
3163 static int ll_merge_md_attr(struct inode *inode)
3165 struct cl_attr attr = { 0 };
3168 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3169 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3170 &attr, ll_md_blocking_ast);
3174 set_nlink(inode, attr.cat_nlink);
3175 inode->i_blocks = attr.cat_blocks;
3176 i_size_write(inode, attr.cat_size);
/* Times are kept in lli_* and copied to the VFS inode by the caller. */
3178 ll_i2info(inode)->lli_atime = attr.cat_atime;
3179 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
3180 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/*
 * Full revalidation wrapper: refresh MD attributes via
 * __ll_inode_revalidate(), then for regular files glimpse the size from
 * the OSTs; for striped directories merge per-stripe attributes instead.
 */
3186 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3188 struct inode *inode = dentry->d_inode;
3192 rc = __ll_inode_revalidate(dentry, ibits);
3196 /* if object isn't regular file, don't validate size */
3197 if (!S_ISREG(inode->i_mode)) {
3198 if (S_ISDIR(inode->i_mode) &&
3199 ll_i2info(inode)->lli_lsm_md != NULL) {
3200 rc = ll_merge_md_attr(inode);
/* Non-regular objects: MD times are authoritative; copy them out. */
3205 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
3206 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
3207 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
3209 /* In case of restore, the MDT has the right size and has
3210 * already send it back without granting the layout lock,
3211 * inode is up-to-date so glimpse is useless.
3212 * Also to glimpse we need the layout, in case of a running
3213 * restore the MDT holds the layout lock so the glimpse will
3214 * block up to the end of restore (getattr will block)
3216 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3217 rc = ll_glimpse_size(inode);
/*
 * VFS ->getattr: revalidate UPDATE|LOOKUP bits, then fill *stat from the
 * (now fresh) inode. Inode numbers are squashed to 32 bits when the
 * caller needs a 32-bit API (ll_need_32bit_api).
 */
3222 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3224 struct inode *inode = de->d_inode;
3225 struct ll_sb_info *sbi = ll_i2sbi(inode);
3226 struct ll_inode_info *lli = ll_i2info(inode);
3229 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3230 MDS_INODELOCK_LOOKUP);
3231 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3236 stat->dev = inode->i_sb->s_dev;
3237 if (ll_need_32bit_api(sbi))
3238 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3240 stat->ino = inode->i_ino;
3241 stat->mode = inode->i_mode;
3242 stat->uid = inode->i_uid;
3243 stat->gid = inode->i_gid;
3244 stat->rdev = inode->i_rdev;
3245 stat->atime = inode->i_atime;
3246 stat->mtime = inode->i_mtime;
3247 stat->ctime = inode->i_ctime;
3248 stat->blksize = 1 << inode->i_blkbits;
3250 stat->nlink = inode->i_nlink;
3251 stat->size = i_size_read(inode);
3252 stat->blocks = inode->i_blocks;
/*
 * VFS ->fiemap: marshal the kernel's fiemap_extent_info into a flat
 * struct fiemap (header + extent array), run ll_do_fiemap(), and copy the
 * mapped extents back to the user buffer.
 */
3257 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3258 __u64 start, __u64 len)
3262 struct fiemap *fiemap;
3263 unsigned int extent_count = fieinfo->fi_extents_max;
3265 num_bytes = sizeof(*fiemap) + (extent_count *
3266 sizeof(struct fiemap_extent));
3267 OBD_ALLOC_LARGE(fiemap, num_bytes);
3272 fiemap->fm_flags = fieinfo->fi_flags;
3273 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3274 fiemap->fm_start = start;
3275 fiemap->fm_length = len;
/* Only the first extent is copied in: it may carry the continuation
 * cookie (fe_lun etc.) for a resumed FIEMAP call. */
3276 if (extent_count > 0 &&
3277 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3278 sizeof(struct fiemap_extent)) != 0)
3279 GOTO(out, rc = -EFAULT);
3281 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3283 fieinfo->fi_flags = fiemap->fm_flags;
3284 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3285 if (extent_count > 0 &&
3286 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3287 fiemap->fm_mapped_extents *
3288 sizeof(struct fiemap_extent)) != 0)
3289 GOTO(out, rc = -EFAULT);
3291 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * Return a referenced copy of the cached POSIX ACL for @inode. The lli
 * spinlock protects lli_posix_acl; the caller (VFS permission check)
 * releases the reference taken by posix_acl_dup().
 */
3295 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3297 struct ll_inode_info *lli = ll_i2info(inode);
3298 struct posix_acl *acl = NULL;
3301 spin_lock(&lli->lli_lock);
3302 /* VFS' acl_permission_check->check_acl will release the refcount */
3303 acl = posix_acl_dup(lli->lli_posix_acl);
3304 spin_unlock(&lli->lli_lock);
/*
 * ACL check callback passed to generic_permission() on kernels without
 * the 2-arg variant. Signature varies with kernel version (3- vs 4-arg
 * generic_permission); without CONFIG_FS_POSIX_ACL it is a stub.
 *
 * NOTE(review): listing is gappy — return statements and some #else
 * branches are omitted here.
 */
#ifndef HAVE_GENERIC_PERMISSION_2ARGS
# ifdef HAVE_GENERIC_PERMISSION_4ARGS
ll_check_acl(struct inode *inode, int mask, unsigned int flags)
ll_check_acl(struct inode *inode, int mask)
# ifdef CONFIG_FS_POSIX_ACL
struct posix_acl *acl;
# ifdef HAVE_GENERIC_PERMISSION_4ARGS
/* RCU walk cannot block on ACL retrieval; bail out (presumably with
 * -ECHILD — the return line is omitted from this listing). */
if (flags & IPERM_FLAG_RCU)
acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
rc = posix_acl_permission(inode, acl, mask);
posix_acl_release(acl);
# else /* !CONFIG_FS_POSIX_ACL */
# endif /* CONFIG_FS_POSIX_ACL */
#endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * VFS ->permission for Lustre inodes. Revalidates the root inode on first
 * touch, applies root-squash by temporarily overriding the task creds
 * (fsuid/fsgid and FS capabilities), then delegates to remote-permission
 * checking or generic_permission(). Signature varies by kernel version.
 *
 * NOTE(review): listing is gappy — several rc checks, RETURN paths and
 * closing braces are omitted here.
 */
#ifdef HAVE_GENERIC_PERMISSION_4ARGS
int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
# ifdef HAVE_INODE_PERMISION_2ARGS
int ll_inode_permission(struct inode *inode, int mask)
int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
struct ll_sb_info *sbi;
struct root_squash_info *squash;
struct cred *cred = NULL;
const struct cred *old_cred = NULL;
bool squash_id = false;
/* RCU-walk mode must not block (revalidation below can issue RPCs). */
#ifdef MAY_NOT_BLOCK
if (mask & MAY_NOT_BLOCK)
#elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
if (flags & IPERM_FLAG_RCU)
/* as root inode are NOT getting validated in lookup operation,
* need to do it before permission check. */
if (inode == inode->i_sb->s_root->d_inode) {
rc = __ll_inode_revalidate(inode->i_sb->s_root,
MDS_INODELOCK_LOOKUP);
CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
/* squash fsuid/fsgid if needed */
sbi = ll_i2sbi(inode);
squash = &sbi->ll_squash;
if (unlikely(squash->rsi_uid != 0 &&
uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
!(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
__kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
squash->rsi_uid, squash->rsi_gid);
/* update current process's credentials
* and FS capability */
cred = prepare_creds();
cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* Drop all filesystem-related capabilities so the squashed
 * identity cannot bypass permission checks. */
for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
if ((1 << cap) & CFS_CAP_FS_MASK)
cap_lower(cred->cap_effective, cap);
old_cred = override_creds(cred);
ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
rc = lustre_check_remote_perm(inode, mask);
rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
/* restore current process's credentials and FS capability */
revert_creds(old_cred);
3425 /* -o localflock - only provides locally consistent flock locks */
3426 struct file_operations ll_file_operations = {
3427 .read = ll_file_read,
3428 .aio_read = ll_file_aio_read,
3429 .write = ll_file_write,
3430 .aio_write = ll_file_aio_write,
3431 .unlocked_ioctl = ll_file_ioctl,
3432 .open = ll_file_open,
3433 .release = ll_file_release,
3434 .mmap = ll_file_mmap,
3435 .llseek = ll_file_seek,
3436 .splice_read = ll_file_splice_read,
/* -o flock: cluster-coherent locking; .flock/.lock go through the MDS
 * via ll_file_flock(). */
3441 struct file_operations ll_file_operations_flock = {
3442 .read = ll_file_read,
3443 .aio_read = ll_file_aio_read,
3444 .write = ll_file_write,
3445 .aio_write = ll_file_aio_write,
3446 .unlocked_ioctl = ll_file_ioctl,
3447 .open = ll_file_open,
3448 .release = ll_file_release,
3449 .mmap = ll_file_mmap,
3450 .llseek = ll_file_seek,
3451 .splice_read = ll_file_splice_read,
3454 .flock = ll_file_flock,
3455 .lock = ll_file_flock
3458 /* These are for -o noflock - to return ENOSYS on flock calls */
3459 struct file_operations ll_file_operations_noflock = {
3460 .read = ll_file_read,
3461 .aio_read = ll_file_aio_read,
3462 .write = ll_file_write,
3463 .aio_write = ll_file_aio_write,
3464 .unlocked_ioctl = ll_file_ioctl,
3465 .open = ll_file_open,
3466 .release = ll_file_release,
3467 .mmap = ll_file_mmap,
3468 .llseek = ll_file_seek,
3469 .splice_read = ll_file_splice_read,
3472 .flock = ll_file_noflock,
3473 .lock = ll_file_noflock
/* Inode operations for regular Lustre files. */
3476 struct inode_operations ll_file_inode_operations = {
3477 .setattr = ll_setattr,
3478 .getattr = ll_getattr,
3479 .permission = ll_inode_permission,
3480 .setxattr = ll_setxattr,
3481 .getxattr = ll_getxattr,
3482 .listxattr = ll_listxattr,
3483 .removexattr = ll_removexattr,
3484 .fiemap = ll_fiemap,
#ifdef HAVE_IOP_GET_ACL
3486 .get_acl = ll_get_acl,
3490 /* dynamic ioctl number support routines */
/* Global registry of dynamically registered ioctl handlers, protected by
 * a reader/writer semaphore. */
3491 static struct llioc_ctl_data {
3492 struct rw_semaphore ioc_sem;
3493 struct list_head ioc_head;
3495 __RWSEM_INITIALIZER(llioc.ioc_sem),
3496 LIST_HEAD_INIT(llioc.ioc_head)
/* One registered handler: callback plus the ioctl commands it serves
 * (flexible trailing array iocd_cmd, iocd_count entries). */
3501 struct list_head iocd_list;
3502 unsigned int iocd_size;
3503 llioc_callback_t iocd_cb;
3504 unsigned int iocd_count;
3505 unsigned int iocd_cmd[0];
/*
 * Register a dynamic ioctl handler: callback @cb serving @count commands
 * from @cmd. Returns an opaque cookie (the allocation itself) used later
 * by ll_iocontrol_unregister(), or NULL on bad arguments / allocation
 * failure (failure RETURNs omitted from this listing).
 */
3508 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3511 struct llioc_data *in_data = NULL;
3514 if (cb == NULL || cmd == NULL ||
3515 count > LLIOC_MAX_CMD || count < 0)
3518 size = sizeof(*in_data) + count * sizeof(unsigned int);
3519 OBD_ALLOC(in_data, size);
3520 if (in_data == NULL)
3523 memset(in_data, 0, sizeof(*in_data));
3524 in_data->iocd_size = size;
3525 in_data->iocd_cb = cb;
3526 in_data->iocd_count = count;
3527 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3529 down_write(&llioc.ioc_sem);
3530 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3531 up_write(&llioc.ioc_sem);
/*
 * Remove and free the handler registered under cookie @magic. Warns if
 * the cookie is not found in the registry.
 */
3536 void ll_iocontrol_unregister(void *magic)
3538 struct llioc_data *tmp;
3543 down_write(&llioc.ioc_sem);
3544 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* Size saved before freeing, as OBD_FREE needs it. */
3546 unsigned int size = tmp->iocd_size;
3548 list_del(&tmp->iocd_list);
3549 up_write(&llioc.ioc_sem);
3551 OBD_FREE(tmp, size);
3555 up_write(&llioc.ioc_sem);
3557 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3560 EXPORT_SYMBOL(ll_iocontrol_register);
3561 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch ioctl @cmd to the registered dynamic handlers in registration
 * order. Stops when a callback returns LLIOC_STOP; its result code is
 * passed back through *rcp (default -EINVAL when no handler claims cmd).
 */
3563 static enum llioc_iter
3564 ll_iocontrol_call(struct inode *inode, struct file *file,
3565 unsigned int cmd, unsigned long arg, int *rcp)
3567 enum llioc_iter ret = LLIOC_CONT;
3568 struct llioc_data *data;
3569 int rc = -EINVAL, i;
3571 down_read(&llioc.ioc_sem);
3572 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3573 for (i = 0; i < data->iocd_count; i++) {
3574 if (cmd != data->iocd_cmd[i])
3577 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3581 if (ret == LLIOC_STOP)
3584 up_read(&llioc.ioc_sem);
/*
 * Push a layout configuration change down to the cl_object stack
 * (cl_conf_set). For OBJECT_CONF_SET the layout lock is allowed to match
 * only after the layout is applied, and the cached layout generation in
 * lli is refreshed from the object.
 *
 * NOTE(review): listing is gappy — some rc checks and RETURN paths are
 * omitted here.
 */
3591 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3593 struct ll_inode_info *lli = ll_i2info(inode);
3594 struct cl_object *obj = lli->lli_clob;
3595 struct cl_env_nest nest;
3603 env = cl_env_nested_get(&nest);
3605 RETURN(PTR_ERR(env));
3607 rc = cl_conf_set(env, lli->lli_clob, conf);
3611 if (conf->coc_opc == OBJECT_CONF_SET) {
3612 struct ldlm_lock *lock = conf->coc_lock;
3613 struct cl_layout cl = {
3617 LASSERT(lock != NULL);
3618 LASSERT(ldlm_has_layout(lock));
3620 /* it can only be allowed to match after layout is
3621 * applied to inode otherwise false layout would be
3622 * seen. Applying layout shoud happen before dropping
3623 * the intent lock. */
3624 ldlm_lock_allow_match(lock);
3626 rc = cl_object_layout_get(env, obj, &cl);
3631 DFID": layout version change: %u -> %u\n",
3632 PFID(&lli->lli_fid), ll_layout_version_get(lli),
3634 ll_layout_version_set(lli, cl.cl_layout_gen);
3638 cl_env_nested_put(&nest, env);
3643 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
/*
 * When a layout lock was granted via completion AST its LVB buffer may be
 * too small to hold the layout; fetch the LOV xattr from the MDT and
 * install it as the lock's LVB (LVB_T_LAYOUT). A no-op if l_lvb_data is
 * already populated.
 */
3644 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3647 struct ll_sb_info *sbi = ll_i2sbi(inode);
3648 struct obd_capa *oc;
3649 struct ptlrpc_request *req;
3650 struct mdt_body *body;
3657 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3658 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
3659 lock->l_lvb_data, lock->l_lvb_len);
3661 if (lock->l_lvb_data != NULL)
3664 /* if layout lock was granted right away, the layout is returned
3665 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3666 * blocked and then granted via completion ast, we have to fetch
3667 * layout here. Please note that we can't use the LVB buffer in
3668 * completion AST because it doesn't have a large enough buffer */
3669 oc = ll_mdscapa_get(inode);
3670 rc = ll_get_default_mdsize(sbi, &lmmsize);
3672 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3673 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3679 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3681 GOTO(out, rc = -EPROTO);
3683 lmmsize = body->mbo_eadatasize;
3684 if (lmmsize == 0) /* empty layout */
3687 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3689 GOTO(out, rc = -EFAULT);
3691 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3692 if (lvbdata == NULL)
3693 GOTO(out, rc = -ENOMEM);
3695 memcpy(lvbdata, lmm, lmmsize);
3696 lock_res_and_lock(lock);
/* Re-check under the res lock: another thread may have installed an
 * LVB concurrently; if so, free our copy below. */
3697 if (unlikely(lock->l_lvb_data == NULL)) {
3698 lock->l_lvb_type = LVB_T_LAYOUT;
3699 lock->l_lvb_data = lvbdata;
3700 lock->l_lvb_len = lmmsize;
3703 unlock_res_and_lock(lock);
3705 if (lvbdata != NULL)
3706 OBD_FREE_LARGE(lvbdata, lmmsize);
3711 ptlrpc_req_finished(req);
3716 * Apply the layout to the inode. Layout lock is held and will be released
/*
 * Given a held layout lock (handle @lockh, mode @mode), fetch the layout
 * if needed, unpack it into an lsm and configure the cl_object with it
 * (OBJECT_CONF_SET). If the configuration is busy (-EBUSY), wait for
 * in-flight IO via OBJECT_CONF_WAIT after dropping the lock.
 *
 * NOTE(review): listing is gappy — several rc checks, GOTO labels and
 * RETURN paths are omitted here.
 */
3719 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3720 struct inode *inode)
3722 struct ll_inode_info *lli = ll_i2info(inode);
3723 struct ll_sb_info *sbi = ll_i2sbi(inode);
3724 struct ldlm_lock *lock;
3725 struct lustre_md md = { NULL };
3726 struct cl_object_conf conf;
3729 bool wait_layout = false;
3732 LASSERT(lustre_handle_is_used(lockh));
3734 lock = ldlm_handle2lock(lockh);
3735 LASSERT(lock != NULL);
3736 LASSERT(ldlm_has_layout(lock));
3738 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
3739 PFID(&lli->lli_fid), inode);
3741 /* in case this is a caching lock and reinstate with new inode */
3742 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3744 lock_res_and_lock(lock);
3745 lvb_ready = ldlm_is_lvb_ready(lock);
3746 unlock_res_and_lock(lock);
3747 /* checking lvb_ready is racy but this is okay. The worst case is
3748 * that multi processes may configure the file on the same time. */
3753 rc = ll_layout_fetch(inode, lock);
3757 /* for layout lock, lmm is stored in lock's lvb.
3758 * lvb_data is immutable if the lock is held so it's safe to access it
3759 * without res lock. */
3760 if (lock->l_lvb_data != NULL) {
3761 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3762 lock->l_lvb_data, lock->l_lvb_len);
3764 CERROR("%s: file "DFID" unpackmd error: %d\n",
3765 ll_get_fsname(inode->i_sb, NULL, 0),
3766 PFID(&lli->lli_fid), rc);
3770 LASSERTF(md.lsm != NULL, "lvb_data = %p, lvb_len = %u\n",
3771 lock->l_lvb_data, lock->l_lvb_len);
3776 /* set layout to file. Unlikely this will fail as old layout was
3777 * surely eliminated */
3778 memset(&conf, 0, sizeof conf);
3779 conf.coc_opc = OBJECT_CONF_SET;
3780 conf.coc_inode = inode;
3781 conf.coc_lock = lock;
3782 conf.u.coc_md = &md;
3783 rc = ll_layout_conf(inode, &conf);
3786 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3788 /* refresh layout failed, need to wait */
3789 wait_layout = rc == -EBUSY;
3793 LDLM_LOCK_PUT(lock);
3794 ldlm_lock_decref(lockh, mode);
3796 /* wait for IO to complete if it's still being used. */
3798 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
3799 ll_get_fsname(inode->i_sb, NULL, 0),
3800 PFID(&lli->lli_fid), inode);
3802 memset(&conf, 0, sizeof conf);
3803 conf.coc_opc = OBJECT_CONF_WAIT;
3804 conf.coc_inode = inode;
3805 rc = ll_layout_conf(inode, &conf);
3809 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
3810 ll_get_fsname(inode->i_sb, NULL, 0),
3811 PFID(&lli->lli_fid), rc);
/*
 * Core of ll_layout_refresh(), called with lli_layout_mutex held: try to
 * match a cached layout lock first; otherwise enqueue a new IT_LAYOUT
 * intent on the MDS and apply the resulting layout via
 * ll_layout_lock_set().
 *
 * NOTE(review): listing is gappy — retry/RETURN paths are omitted here.
 */
3816 static int ll_layout_refresh_locked(struct inode *inode)
3818 struct ll_inode_info *lli = ll_i2info(inode);
3819 struct ll_sb_info *sbi = ll_i2sbi(inode);
3820 struct md_op_data *op_data;
3821 struct lookup_intent it;
3822 struct lustre_handle lockh;
3824 struct ldlm_enqueue_info einfo = {
3825 .ei_type = LDLM_IBITS,
3827 .ei_cb_bl = &ll_md_blocking_ast,
3828 .ei_cb_cp = &ldlm_completion_ast,
3834 /* mostly layout lock is caching on the local side, so try to match
3835 * it before grabbing layout lock mutex. */
3836 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3837 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3838 if (mode != 0) { /* hit cached lock */
3839 rc = ll_layout_lock_set(&lockh, mode, inode);
3846 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3847 0, 0, LUSTRE_OPC_ANY, NULL);
3848 if (IS_ERR(op_data))
3849 RETURN(PTR_ERR(op_data));
3851 /* have to enqueue one */
3852 memset(&it, 0, sizeof(it));
3853 it.it_op = IT_LAYOUT;
3854 lockh.cookie = 0ULL;
3856 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
3857 ll_get_fsname(inode->i_sb, NULL, 0),
3858 PFID(&lli->lli_fid), inode);
3860 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, &it, op_data, &lockh, 0);
/* The intent reply request is not needed; only the lock matters. */
3861 if (it.d.lustre.it_data != NULL)
3862 ptlrpc_req_finished(it.d.lustre.it_data);
3863 it.d.lustre.it_data = NULL;
3865 ll_finish_md_op_data(op_data);
/* Take over the lock mode from the intent before releasing it, so
 * ll_layout_lock_set() owns the reference. */
3867 mode = it.d.lustre.it_lock_mode;
3868 it.d.lustre.it_lock_mode = 0;
3869 ll_intent_drop_lock(&it);
3872 /* set lock data in case this is a new lock */
3873 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3874 rc = ll_layout_lock_set(&lockh, mode, inode);
3883 * This function checks if there exists a LAYOUT lock on the client side,
3884 * or enqueues it if it doesn't have one in cache.
3886 * This function will not hold layout lock so it may be revoked any time after
3887 * this function returns. Any operations depend on layout should be redone
3890 * This function should be called before lov_io_init() to get an uptodate
3891 * layout version, the caller should save the version number and after IO
3892 * is finished, this function should be called again to verify that layout
3893 * is not changed during IO time.
/*
 * Ensure an up-to-date layout for a regular file and return its layout
 * generation in *gen (see the doxygen comment above for the usage
 * contract). Fast-path returns the cached generation when layout locks
 * are disabled or the generation is already valid; otherwise enqueues a
 * layout lock under lli_layout_mutex.
 */
3895 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3897 struct ll_inode_info *lli = ll_i2info(inode);
3898 struct ll_sb_info *sbi = ll_i2sbi(inode);
3902 *gen = ll_layout_version_get(lli);
3903 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
3907 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3908 LASSERT(S_ISREG(inode->i_mode));
3910 /* take layout lock mutex to enqueue layout lock exclusively. */
3911 mutex_lock(&lli->lli_layout_mutex);
3913 rc = ll_layout_refresh_locked(inode);
3917 *gen = ll_layout_version_get(lli);
3919 mutex_unlock(&lli->lli_layout_mutex);
3925 * This function send a restore request to the MDT
3927 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
3929 struct hsm_user_request *hur;
3933 len = sizeof(struct hsm_user_request) +
3934 sizeof(struct hsm_user_item);
3935 OBD_ALLOC(hur, len);
3939 hur->hur_request.hr_action = HUA_RESTORE;
3940 hur->hur_request.hr_archive_id = 0;
3941 hur->hur_request.hr_flags = 0;
3942 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
3943 sizeof(hur->hur_user_item[0].hui_fid));
3944 hur->hur_user_item[0].hui_extent.offset = offset;
3945 hur->hur_user_item[0].hui_extent.length = length;
3946 hur->hur_request.hr_itemcount = 1;
3947 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,