4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2014, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <linux/pagemap.h>
46 #include <linux/file.h>
47 #include <linux/sched.h>
48 #include <lustre/ll_fiemap.h>
49 #include <lustre_ioctl.h>
51 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
57 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
59 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
62 static enum llioc_iter
63 ll_iocontrol_call(struct inode *inode, struct file *file,
64 unsigned int cmd, unsigned long arg, int *rcp);
66 static struct ll_file_data *ll_file_data_get(void)
68 struct ll_file_data *fd;
70 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
74 fd->fd_write_failed = false;
79 static void ll_file_data_put(struct ll_file_data *fd)
82 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
85 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
86 struct lustre_handle *fh)
88 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
89 op_data->op_attr.ia_mode = inode->i_mode;
90 op_data->op_attr.ia_atime = inode->i_atime;
91 op_data->op_attr.ia_mtime = inode->i_mtime;
92 op_data->op_attr.ia_ctime = inode->i_ctime;
93 op_data->op_attr.ia_size = i_size_read(inode);
94 op_data->op_attr_blocks = inode->i_blocks;
95 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
97 op_data->op_handle = *fh;
98 op_data->op_capa1 = ll_mdscapa_get(inode);
100 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
101 op_data->op_bias |= MDS_DATA_MODIFIED;
105 * Packs all the attributes into @op_data for the CLOSE rpc.
107 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
108 struct obd_client_handle *och)
112 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
113 ATTR_MTIME | ATTR_MTIME_SET |
114 ATTR_CTIME | ATTR_CTIME_SET;
116 if (!(och->och_flags & FMODE_WRITE))
119 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
122 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
123 ll_prep_md_op_data(op_data, inode, NULL, NULL,
124 0, 0, LUSTRE_OPC_ANY, NULL);
128 static int ll_close_inode_openhandle(struct obd_export *md_exp,
130 struct obd_client_handle *och,
131 const __u64 *data_version)
133 struct obd_export *exp = ll_i2mdexp(inode);
134 struct md_op_data *op_data;
135 struct ptlrpc_request *req = NULL;
136 struct obd_device *obd = class_exp2obd(exp);
142 * XXX: in case of LMV, is this correct to access
145 CERROR("Invalid MDC connection handle "LPX64"\n",
146 ll_i2mdexp(inode)->exp_handle.h_cookie);
150 OBD_ALLOC_PTR(op_data);
152 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
154 ll_prepare_close(inode, op_data, och);
155 if (data_version != NULL) {
156 /* Pass in data_version implies release. */
157 op_data->op_bias |= MDS_HSM_RELEASE;
158 op_data->op_data_version = *data_version;
159 op_data->op_lease_handle = och->och_lease_handle;
160 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
163 rc = md_close(md_exp, op_data, och->och_mod, &req);
165 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
166 ll_i2mdexp(inode)->exp_obd->obd_name,
167 PFID(ll_inode2fid(inode)), rc);
170 /* DATA_MODIFIED flag was successfully sent on close, cancel data
171 * modification flag. */
172 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
173 struct ll_inode_info *lli = ll_i2info(inode);
175 spin_lock(&lli->lli_lock);
176 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
177 spin_unlock(&lli->lli_lock);
180 if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
181 struct mdt_body *body;
182 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
183 if (!(body->mbo_valid & OBD_MD_FLRELEASED))
187 ll_finish_md_op_data(op_data);
191 md_clear_open_replay_data(md_exp, och);
192 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
195 if (req) /* This is close request */
196 ptlrpc_req_finished(req);
200 int ll_md_real_close(struct inode *inode, fmode_t fmode)
202 struct ll_inode_info *lli = ll_i2info(inode);
203 struct obd_client_handle **och_p;
204 struct obd_client_handle *och;
209 if (fmode & FMODE_WRITE) {
210 och_p = &lli->lli_mds_write_och;
211 och_usecount = &lli->lli_open_fd_write_count;
212 } else if (fmode & FMODE_EXEC) {
213 och_p = &lli->lli_mds_exec_och;
214 och_usecount = &lli->lli_open_fd_exec_count;
216 LASSERT(fmode & FMODE_READ);
217 och_p = &lli->lli_mds_read_och;
218 och_usecount = &lli->lli_open_fd_read_count;
221 mutex_lock(&lli->lli_och_mutex);
222 if (*och_usecount > 0) {
223 /* There are still users of this handle, so skip
225 mutex_unlock(&lli->lli_och_mutex);
231 mutex_unlock(&lli->lli_och_mutex);
234 /* There might be a race and this handle may already
236 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
243 static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
246 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
247 struct ll_inode_info *lli = ll_i2info(inode);
251 /* clear group lock, if present */
252 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
253 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
255 if (fd->fd_lease_och != NULL) {
258 /* Usually the lease is not released when the
259 * application crashed, we need to release here. */
260 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
261 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
262 PFID(&lli->lli_fid), rc, lease_broken);
264 fd->fd_lease_och = NULL;
267 if (fd->fd_och != NULL) {
268 rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
273 /* Let's see if we have good enough OPEN lock on the file and if
274 we can skip talking to MDS */
275 if (file->f_dentry->d_inode) { /* Can this ever be false? */
277 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
278 struct lustre_handle lockh;
279 struct inode *inode = file->f_dentry->d_inode;
280 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
282 mutex_lock(&lli->lli_och_mutex);
283 if (fd->fd_omode & FMODE_WRITE) {
285 LASSERT(lli->lli_open_fd_write_count);
286 lli->lli_open_fd_write_count--;
287 } else if (fd->fd_omode & FMODE_EXEC) {
289 LASSERT(lli->lli_open_fd_exec_count);
290 lli->lli_open_fd_exec_count--;
293 LASSERT(lli->lli_open_fd_read_count);
294 lli->lli_open_fd_read_count--;
296 mutex_unlock(&lli->lli_och_mutex);
298 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
299 LDLM_IBITS, &policy, lockmode,
301 rc = ll_md_real_close(file->f_dentry->d_inode,
305 CERROR("released file has negative dentry: file = %p, "
306 "dentry = %p, name = %s\n",
307 file, file->f_dentry, file->f_dentry->d_name.name);
311 LUSTRE_FPRIVATE(file) = NULL;
312 ll_file_data_put(fd);
313 ll_capa_close(inode);
318 /* While this returns an error code, fput() the caller does not, so we need
319 * to make every effort to clean up all of our state here. Also, applications
320 * rarely check close errors and even if an error is returned they will not
321 * re-try the close call.
323 int ll_file_release(struct inode *inode, struct file *file)
325 struct ll_file_data *fd;
326 struct ll_sb_info *sbi = ll_i2sbi(inode);
327 struct ll_inode_info *lli = ll_i2info(inode);
331 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
332 PFID(ll_inode2fid(inode)), inode);
334 #ifdef CONFIG_FS_POSIX_ACL
335 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
336 inode == inode->i_sb->s_root->d_inode) {
337 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
340 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
341 fd->fd_flags &= ~LL_FILE_RMTACL;
342 rct_del(&sbi->ll_rct, current_pid());
343 et_search_free(&sbi->ll_et, current_pid());
348 if (inode->i_sb->s_root != file->f_dentry)
349 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
350 fd = LUSTRE_FPRIVATE(file);
353 /* The last ref on @file, maybe not the the owner pid of statahead,
354 * because parent and child process can share the same file handle. */
355 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
356 ll_deauthorize_statahead(inode, fd);
358 if (inode->i_sb->s_root == file->f_dentry) {
359 LUSTRE_FPRIVATE(file) = NULL;
360 ll_file_data_put(fd);
364 if (!S_ISDIR(inode->i_mode)) {
365 if (lli->lli_clob != NULL)
366 lov_read_and_clear_async_rc(lli->lli_clob);
367 lli->lli_async_rc = 0;
370 rc = ll_md_close(sbi->ll_md_exp, inode, file);
372 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
373 libcfs_debug_dumplog();
378 static int ll_intent_file_open(struct file *file, void *lmm, int lmmsize,
379 struct lookup_intent *itp)
381 struct dentry *de = file->f_dentry;
382 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
383 struct dentry *parent = de->d_parent;
384 const char *name = NULL;
386 struct md_op_data *op_data;
387 struct ptlrpc_request *req = NULL;
391 LASSERT(parent != NULL);
392 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
394 /* if server supports open-by-fid, or file name is invalid, don't pack
395 * name in open request */
396 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
397 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
398 name = de->d_name.name;
399 len = de->d_name.len;
402 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
403 name, len, 0, LUSTRE_OPC_ANY, NULL);
405 RETURN(PTR_ERR(op_data));
406 op_data->op_data = lmm;
407 op_data->op_data_size = lmmsize;
409 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
410 &ll_md_blocking_ast, 0);
411 ll_finish_md_op_data(op_data);
413 /* reason for keep own exit path - don`t flood log
414 * with messages with -ESTALE errors.
416 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
417 it_open_error(DISP_OPEN_OPEN, itp))
419 ll_release_openhandle(de, itp);
423 if (it_disposition(itp, DISP_LOOKUP_NEG))
424 GOTO(out, rc = -ENOENT);
426 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
427 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
428 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
432 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
433 if (!rc && itp->d.lustre.it_lock_mode)
434 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
437 ptlrpc_req_finished(req);
438 ll_intent_drop_lock(itp);
443 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
444 struct obd_client_handle *och)
446 struct ptlrpc_request *req = it->d.lustre.it_data;
447 struct mdt_body *body;
449 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
450 och->och_fh = body->mbo_handle;
451 och->och_fid = body->mbo_fid1;
452 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
453 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
454 och->och_flags = it->it_flags;
456 return md_set_open_replay_data(md_exp, och, it);
459 static int ll_local_open(struct file *file, struct lookup_intent *it,
460 struct ll_file_data *fd, struct obd_client_handle *och)
462 struct inode *inode = file->f_dentry->d_inode;
465 LASSERT(!LUSTRE_FPRIVATE(file));
472 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
477 LUSTRE_FPRIVATE(file) = fd;
478 ll_readahead_init(inode, &fd->fd_ras);
479 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
481 /* ll_cl_context initialize */
482 rwlock_init(&fd->fd_lock);
483 INIT_LIST_HEAD(&fd->fd_lccs);
488 /* Open a file, and (for the very first open) create objects on the OSTs at
489 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
490 * creation or open until ll_lov_setstripe() ioctl is called.
492 * If we already have the stripe MD locally then we don't request it in
493 * md_open(), by passing a lmm_size = 0.
495 * It is up to the application to ensure no other processes open this file
496 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
497 * used. We might be able to avoid races of that sort by getting lli_open_sem
498 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
499 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
501 int ll_file_open(struct inode *inode, struct file *file)
503 struct ll_inode_info *lli = ll_i2info(inode);
504 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
505 .it_flags = file->f_flags };
506 struct obd_client_handle **och_p = NULL;
507 __u64 *och_usecount = NULL;
508 struct ll_file_data *fd;
512 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
513 PFID(ll_inode2fid(inode)), inode, file->f_flags);
515 it = file->private_data; /* XXX: compat macro */
516 file->private_data = NULL; /* prevent ll_local_open assertion */
518 fd = ll_file_data_get();
520 GOTO(out_openerr, rc = -ENOMEM);
523 if (S_ISDIR(inode->i_mode))
524 ll_authorize_statahead(inode, fd);
526 if (inode->i_sb->s_root == file->f_dentry) {
527 LUSTRE_FPRIVATE(file) = fd;
531 if (!it || !it->d.lustre.it_disposition) {
532 /* Convert f_flags into access mode. We cannot use file->f_mode,
533 * because everything but O_ACCMODE mask was stripped from
535 if ((oit.it_flags + 1) & O_ACCMODE)
537 if (file->f_flags & O_TRUNC)
538 oit.it_flags |= FMODE_WRITE;
540 /* kernel only call f_op->open in dentry_open. filp_open calls
541 * dentry_open after call to open_namei that checks permissions.
542 * Only nfsd_open call dentry_open directly without checking
543 * permissions and because of that this code below is safe. */
544 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
545 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
547 /* We do not want O_EXCL here, presumably we opened the file
548 * already? XXX - NFS implications? */
549 oit.it_flags &= ~O_EXCL;
551 /* bug20584, if "it_flags" contains O_CREAT, the file will be
552 * created if necessary, then "IT_CREAT" should be set to keep
553 * consistent with it */
554 if (oit.it_flags & O_CREAT)
555 oit.it_op |= IT_CREAT;
561 /* Let's see if we have file open on MDS already. */
562 if (it->it_flags & FMODE_WRITE) {
563 och_p = &lli->lli_mds_write_och;
564 och_usecount = &lli->lli_open_fd_write_count;
565 } else if (it->it_flags & FMODE_EXEC) {
566 och_p = &lli->lli_mds_exec_och;
567 och_usecount = &lli->lli_open_fd_exec_count;
569 och_p = &lli->lli_mds_read_och;
570 och_usecount = &lli->lli_open_fd_read_count;
573 mutex_lock(&lli->lli_och_mutex);
574 if (*och_p) { /* Open handle is present */
575 if (it_disposition(it, DISP_OPEN_OPEN)) {
576 /* Well, there's extra open request that we do not need,
577 let's close it somehow. This will decref request. */
578 rc = it_open_error(DISP_OPEN_OPEN, it);
580 mutex_unlock(&lli->lli_och_mutex);
581 GOTO(out_openerr, rc);
584 ll_release_openhandle(file->f_dentry, it);
588 rc = ll_local_open(file, it, fd, NULL);
591 mutex_unlock(&lli->lli_och_mutex);
592 GOTO(out_openerr, rc);
595 LASSERT(*och_usecount == 0);
596 if (!it->d.lustre.it_disposition) {
597 /* We cannot just request lock handle now, new ELC code
598 means that one of other OPEN locks for this file
599 could be cancelled, and since blocking ast handler
600 would attempt to grab och_mutex as well, that would
601 result in a deadlock */
602 mutex_unlock(&lli->lli_och_mutex);
604 * Normally called under two situations:
606 * 2. A race/condition on MDS resulting in no open
607 * handle to be returned from LOOKUP|OPEN request,
608 * for example if the target entry was a symlink.
610 * Always fetch MDS_OPEN_LOCK if this is not setstripe.
612 * Always specify MDS_OPEN_BY_FID because we don't want
613 * to get file with different fid.
615 it->it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID;
616 rc = ll_intent_file_open(file, NULL, 0, it);
618 GOTO(out_openerr, rc);
622 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
624 GOTO(out_och_free, rc = -ENOMEM);
628 /* md_intent_lock() didn't get a request ref if there was an
629 * open error, so don't do cleanup on the request here
631 /* XXX (green): Should not we bail out on any error here, not
632 * just open error? */
633 rc = it_open_error(DISP_OPEN_OPEN, it);
635 GOTO(out_och_free, rc);
637 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
638 "inode %p: disposition %x, status %d\n", inode,
639 it_disposition(it, ~0), it->d.lustre.it_status);
641 rc = ll_local_open(file, it, fd, *och_p);
643 GOTO(out_och_free, rc);
645 mutex_unlock(&lli->lli_och_mutex);
648 /* Must do this outside lli_och_mutex lock to prevent deadlock where
649 different kind of OPEN lock for this same inode gets cancelled
650 by ldlm_cancel_lru */
651 if (!S_ISREG(inode->i_mode))
652 GOTO(out_och_free, rc);
656 cl_lov_delay_create_clear(&file->f_flags);
657 GOTO(out_och_free, rc);
661 if (och_p && *och_p) {
662 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
663 *och_p = NULL; /* OBD_FREE writes some magic there */
666 mutex_unlock(&lli->lli_och_mutex);
669 if (lli->lli_opendir_key == fd)
670 ll_deauthorize_statahead(inode, fd);
672 ll_file_data_put(fd);
674 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
677 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
678 ptlrpc_req_finished(it->d.lustre.it_data);
679 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
685 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
686 struct ldlm_lock_desc *desc, void *data, int flag)
689 struct lustre_handle lockh;
693 case LDLM_CB_BLOCKING:
694 ldlm_lock2handle(lock, &lockh);
695 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
697 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
701 case LDLM_CB_CANCELING:
709 * Acquire a lease and open the file.
711 static struct obd_client_handle *
712 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
715 struct lookup_intent it = { .it_op = IT_OPEN };
716 struct ll_sb_info *sbi = ll_i2sbi(inode);
717 struct md_op_data *op_data;
718 struct ptlrpc_request *req = NULL;
719 struct lustre_handle old_handle = { 0 };
720 struct obd_client_handle *och = NULL;
725 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
726 RETURN(ERR_PTR(-EINVAL));
729 struct ll_inode_info *lli = ll_i2info(inode);
730 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
731 struct obd_client_handle **och_p;
734 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
735 RETURN(ERR_PTR(-EPERM));
737 /* Get the openhandle of the file */
739 mutex_lock(&lli->lli_och_mutex);
740 if (fd->fd_lease_och != NULL) {
741 mutex_unlock(&lli->lli_och_mutex);
745 if (fd->fd_och == NULL) {
746 if (file->f_mode & FMODE_WRITE) {
747 LASSERT(lli->lli_mds_write_och != NULL);
748 och_p = &lli->lli_mds_write_och;
749 och_usecount = &lli->lli_open_fd_write_count;
751 LASSERT(lli->lli_mds_read_och != NULL);
752 och_p = &lli->lli_mds_read_och;
753 och_usecount = &lli->lli_open_fd_read_count;
755 if (*och_usecount == 1) {
762 mutex_unlock(&lli->lli_och_mutex);
763 if (rc < 0) /* more than 1 opener */
766 LASSERT(fd->fd_och != NULL);
767 old_handle = fd->fd_och->och_fh;
772 RETURN(ERR_PTR(-ENOMEM));
774 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
775 LUSTRE_OPC_ANY, NULL);
777 GOTO(out, rc = PTR_ERR(op_data));
779 /* To tell the MDT this openhandle is from the same owner */
780 op_data->op_handle = old_handle;
782 it.it_flags = fmode | open_flags;
783 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
784 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
785 &ll_md_blocking_lease_ast,
786 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
787 * it can be cancelled which may mislead applications that the lease is
789 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
790 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
791 * doesn't deal with openhandle, so normal openhandle will be leaked. */
792 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
793 ll_finish_md_op_data(op_data);
794 ptlrpc_req_finished(req);
796 GOTO(out_release_it, rc);
798 if (it_disposition(&it, DISP_LOOKUP_NEG))
799 GOTO(out_release_it, rc = -ENOENT);
801 rc = it_open_error(DISP_OPEN_OPEN, &it);
803 GOTO(out_release_it, rc);
805 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
806 ll_och_fill(sbi->ll_md_exp, &it, och);
808 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
809 GOTO(out_close, rc = -EOPNOTSUPP);
811 /* already get lease, handle lease lock */
812 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
813 if (it.d.lustre.it_lock_mode == 0 ||
814 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
815 /* open lock must return for lease */
816 CERROR(DFID "lease granted but no open lock, %d/"LPU64".\n",
817 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
818 it.d.lustre.it_lock_bits);
819 GOTO(out_close, rc = -EPROTO);
822 ll_intent_release(&it);
826 /* Cancel open lock */
827 if (it.d.lustre.it_lock_mode != 0) {
828 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
829 it.d.lustre.it_lock_mode);
830 it.d.lustre.it_lock_mode = 0;
831 och->och_lease_handle.cookie = 0ULL;
833 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
835 CERROR("%s: error closing file "DFID": %d\n",
836 ll_get_fsname(inode->i_sb, NULL, 0),
837 PFID(&ll_i2info(inode)->lli_fid), rc2);
838 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
840 ll_intent_release(&it);
848 * Release lease and close the file.
849 * It will check if the lease has ever broken.
851 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
854 struct ldlm_lock *lock;
855 bool cancelled = true;
859 lock = ldlm_handle2lock(&och->och_lease_handle);
861 lock_res_and_lock(lock);
862 cancelled = ldlm_is_cancel(lock);
863 unlock_res_and_lock(lock);
867 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
868 PFID(&ll_i2info(inode)->lli_fid), cancelled);
871 ldlm_cli_cancel(&och->och_lease_handle, 0);
872 if (lease_broken != NULL)
873 *lease_broken = cancelled;
875 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
880 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
882 struct ll_inode_info *lli = ll_i2info(inode);
883 struct cl_object *obj = lli->lli_clob;
884 struct cl_attr *attr = vvp_env_thread_attr(env);
892 ll_inode_size_lock(inode);
894 /* merge timestamps the most recently obtained from mds with
895 timestamps obtained from osts */
896 LTIME_S(inode->i_atime) = lli->lli_atime;
897 LTIME_S(inode->i_mtime) = lli->lli_mtime;
898 LTIME_S(inode->i_ctime) = lli->lli_ctime;
900 atime = LTIME_S(inode->i_atime);
901 mtime = LTIME_S(inode->i_mtime);
902 ctime = LTIME_S(inode->i_ctime);
904 cl_object_attr_lock(obj);
905 rc = cl_object_attr_get(env, obj, attr);
906 cl_object_attr_unlock(obj);
909 GOTO(out_size_unlock, rc);
911 if (atime < attr->cat_atime)
912 atime = attr->cat_atime;
914 if (ctime < attr->cat_ctime)
915 ctime = attr->cat_ctime;
917 if (mtime < attr->cat_mtime)
918 mtime = attr->cat_mtime;
920 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
921 PFID(&lli->lli_fid), attr->cat_size);
923 i_size_write(inode, attr->cat_size);
924 inode->i_blocks = attr->cat_blocks;
926 LTIME_S(inode->i_atime) = atime;
927 LTIME_S(inode->i_mtime) = mtime;
928 LTIME_S(inode->i_ctime) = ctime;
931 ll_inode_size_unlock(inode);
936 static bool file_is_noatime(const struct file *file)
938 const struct vfsmount *mnt = file->f_path.mnt;
939 const struct inode *inode = file->f_path.dentry->d_inode;
941 /* Adapted from file_accessed() and touch_atime().*/
942 if (file->f_flags & O_NOATIME)
945 if (inode->i_flags & S_NOATIME)
948 if (IS_NOATIME(inode))
951 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
954 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
957 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
963 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
965 struct inode *inode = file->f_dentry->d_inode;
967 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
969 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
970 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
971 file->f_flags & O_DIRECT ||
974 io->ci_obj = ll_i2info(inode)->lli_clob;
975 io->ci_lockreq = CILR_MAYBE;
976 if (ll_file_nolock(file)) {
977 io->ci_lockreq = CILR_NEVER;
978 io->ci_no_srvlock = 1;
979 } else if (file->f_flags & O_APPEND) {
980 io->ci_lockreq = CILR_MANDATORY;
983 io->ci_noatime = file_is_noatime(file);
987 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
988 struct file *file, enum cl_io_type iot,
989 loff_t *ppos, size_t count)
991 struct inode *inode = file->f_dentry->d_inode;
992 struct ll_inode_info *lli = ll_i2info(inode);
994 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
997 struct range_lock range;
1000 CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: "LPU64", count: %zu\n",
1001 file->f_dentry->d_name.name, iot, *ppos, count);
1004 io = vvp_env_thread_io(env);
1005 ll_io_init(io, file, iot == CIT_WRITE);
1007 /* The maximum Lustre file size is variable, based on the
1008 * OST maximum object size and number of stripes. This
1009 * needs another check in addition to the VFS checks earlier. */
1010 end = (io->u.ci_wr.wr_append ? i_size_read(inode) : *ppos) + count;
1011 if (end > ll_file_maxbytes(inode)) {
1013 CDEBUG(D_INODE, "%s: file "DFID" offset %llu > maxbytes "LPU64
1014 ": rc = %zd\n", ll_get_fsname(inode->i_sb, NULL, 0),
1015 PFID(&lli->lli_fid), end, ll_file_maxbytes(inode),
1020 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1021 struct vvp_io *vio = vvp_env_io(env);
1022 bool range_locked = false;
1024 if (file->f_flags & O_APPEND)
1025 range_lock_init(&range, 0, LUSTRE_EOF);
1027 range_lock_init(&range, *ppos, *ppos + count - 1);
1029 vio->vui_fd = LUSTRE_FPRIVATE(file);
1030 vio->vui_io_subtype = args->via_io_subtype;
1032 switch (vio->vui_io_subtype) {
1034 vio->vui_iov = args->u.normal.via_iov;
1035 vio->vui_nrsegs = args->u.normal.via_nrsegs;
1036 vio->vui_tot_nrsegs = vio->vui_nrsegs;
1037 vio->vui_iocb = args->u.normal.via_iocb;
1038 /* Direct IO reads must also take range lock,
1039 * or multiple reads will try to work on the same pages
1040 * See LU-6227 for details. */
1041 if (((iot == CIT_WRITE) ||
1042 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1043 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1044 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1046 result = range_lock(&lli->lli_write_tree,
1051 range_locked = true;
1053 down_read(&lli->lli_trunc_sem);
1056 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1057 vio->u.splice.vui_flags = args->u.splice.via_flags;
1060 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1064 ll_cl_add(file, env, io);
1065 result = cl_io_loop(env, io);
1066 ll_cl_remove(file, env);
1068 if (args->via_io_subtype == IO_NORMAL)
1069 up_read(&lli->lli_trunc_sem);
1071 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1073 range_unlock(&lli->lli_write_tree, &range);
1076 /* cl_io_rw_init() handled IO */
1077 result = io->ci_result;
1080 if (io->ci_nob > 0) {
1081 result = io->ci_nob;
1082 *ppos = io->u.ci_wr.wr.crw_pos;
1086 cl_io_fini(env, io);
1087 /* If any bit been read/written (result != 0), we just return
1088 * short read/write instead of restart io. */
1089 if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1090 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zu\n",
1091 iot == CIT_READ ? "read" : "write",
1092 file->f_dentry->d_name.name, *ppos, count);
1093 LASSERTF(io->ci_nob == 0, "%zd\n", io->ci_nob);
1097 if (iot == CIT_READ) {
1099 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1100 LPROC_LL_READ_BYTES, result);
1101 } else if (iot == CIT_WRITE) {
1103 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1104 LPROC_LL_WRITE_BYTES, result);
1105 fd->fd_write_failed = false;
1106 } else if (result != -ERESTARTSYS) {
1107 fd->fd_write_failed = true;
1110 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1117 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1119 static int ll_file_get_iov_count(const struct iovec *iov,
1120 unsigned long *nr_segs, size_t *count)
1125 for (seg = 0; seg < *nr_segs; seg++) {
1126 const struct iovec *iv = &iov[seg];
1129 * If any segment has a negative length, or the cumulative
1130 * length ever wraps negative then return -EINVAL.
1133 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1135 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1140 cnt -= iv->iov_len; /* This segment is no good */
1147 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1148 unsigned long nr_segs, loff_t pos)
1151 struct vvp_io_args *args;
1157 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1161 env = cl_env_get(&refcheck);
1163 RETURN(PTR_ERR(env));
1165 args = ll_env_args(env, IO_NORMAL);
1166 args->u.normal.via_iov = (struct iovec *)iov;
1167 args->u.normal.via_nrsegs = nr_segs;
1168 args->u.normal.via_iocb = iocb;
1170 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1171 &iocb->ki_pos, count);
1172 cl_env_put(env, &refcheck);
1176 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1180 struct iovec *local_iov;
1181 struct kiocb *kiocb;
1186 env = cl_env_get(&refcheck);
1188 RETURN(PTR_ERR(env));
1190 local_iov = &ll_env_info(env)->lti_local_iov;
1191 kiocb = &ll_env_info(env)->lti_kiocb;
1192 local_iov->iov_base = (void __user *)buf;
1193 local_iov->iov_len = count;
1194 init_sync_kiocb(kiocb, file);
1195 kiocb->ki_pos = *ppos;
1196 #ifdef HAVE_KIOCB_KI_LEFT
1197 kiocb->ki_left = count;
1199 kiocb->ki_nbytes = count;
1202 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1203 *ppos = kiocb->ki_pos;
1205 cl_env_put(env, &refcheck);
1210 * Write to a file (through the page cache).
1213 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1214 unsigned long nr_segs, loff_t pos)
1217 struct vvp_io_args *args;
1223 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1227 env = cl_env_get(&refcheck);
1229 RETURN(PTR_ERR(env));
1231 args = ll_env_args(env, IO_NORMAL);
1232 args->u.normal.via_iov = (struct iovec *)iov;
1233 args->u.normal.via_nrsegs = nr_segs;
1234 args->u.normal.via_iocb = iocb;
1236 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1237 &iocb->ki_pos, count);
1238 cl_env_put(env, &refcheck);
1242 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1243 size_t count, loff_t *ppos)
1246 struct iovec *local_iov;
1247 struct kiocb *kiocb;
1252 env = cl_env_get(&refcheck);
1254 RETURN(PTR_ERR(env));
1256 local_iov = &ll_env_info(env)->lti_local_iov;
1257 kiocb = &ll_env_info(env)->lti_kiocb;
1258 local_iov->iov_base = (void __user *)buf;
1259 local_iov->iov_len = count;
1260 init_sync_kiocb(kiocb, file);
1261 kiocb->ki_pos = *ppos;
1262 #ifdef HAVE_KIOCB_KI_LEFT
1263 kiocb->ki_left = count;
1265 kiocb->ki_nbytes = count;
1268 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1269 *ppos = kiocb->ki_pos;
1271 cl_env_put(env, &refcheck);
1276 * Send file content (through pagecache) somewhere with helper
1278 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1279 struct pipe_inode_info *pipe, size_t count,
1283 struct vvp_io_args *args;
1288 env = cl_env_get(&refcheck);
1290 RETURN(PTR_ERR(env));
1292 args = ll_env_args(env, IO_SPLICE);
1293 args->u.splice.via_pipe = pipe;
1294 args->u.splice.via_flags = flags;
1296 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1297 cl_env_put(env, &refcheck);
1301 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1302 __u64 flags, struct lov_user_md *lum,
1305 struct lookup_intent oit = {
1307 .it_flags = flags | MDS_OPEN_BY_FID,
1312 ll_inode_size_lock(inode);
1313 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1315 GOTO(out_unlock, rc);
1317 ll_release_openhandle(file->f_dentry, &oit);
1320 ll_inode_size_unlock(inode);
1321 ll_intent_release(&oit);
1322 cl_lov_delay_create_clear(&file->f_flags);
1327 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1328 struct lov_mds_md **lmmp, int *lmm_size,
1329 struct ptlrpc_request **request)
1331 struct ll_sb_info *sbi = ll_i2sbi(inode);
1332 struct mdt_body *body;
1333 struct lov_mds_md *lmm = NULL;
1334 struct ptlrpc_request *req = NULL;
1335 struct md_op_data *op_data;
1338 rc = ll_get_default_mdsize(sbi, &lmmsize);
1342 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1343 strlen(filename), lmmsize,
1344 LUSTRE_OPC_ANY, NULL);
1345 if (IS_ERR(op_data))
1346 RETURN(PTR_ERR(op_data));
1348 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1349 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1350 ll_finish_md_op_data(op_data);
1352 CDEBUG(D_INFO, "md_getattr_name failed "
1353 "on %s: rc %d\n", filename, rc);
1357 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1358 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1360 lmmsize = body->mbo_eadatasize;
1362 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1364 GOTO(out, rc = -ENODATA);
1367 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1368 LASSERT(lmm != NULL);
1370 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1371 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1372 GOTO(out, rc = -EPROTO);
1376 * This is coming from the MDS, so is probably in
1377 * little endian. We convert it to host endian before
1378 * passing it to userspace.
1380 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1383 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1384 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1387 /* if function called for directory - we should
1388 * avoid swab not existent lsm objects */
1389 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1390 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1391 if (S_ISREG(body->mbo_mode))
1392 lustre_swab_lov_user_md_objects(
1393 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1395 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1396 lustre_swab_lov_user_md_v3(
1397 (struct lov_user_md_v3 *)lmm);
1398 if (S_ISREG(body->mbo_mode))
1399 lustre_swab_lov_user_md_objects(
1400 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1407 *lmm_size = lmmsize;
1412 static int ll_lov_setea(struct inode *inode, struct file *file,
1415 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1416 struct lov_user_md *lump;
1417 int lum_size = sizeof(struct lov_user_md) +
1418 sizeof(struct lov_user_ost_data);
1422 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1425 OBD_ALLOC_LARGE(lump, lum_size);
1429 if (copy_from_user(lump, (struct lov_user_md __user *)arg, lum_size)) {
1430 OBD_FREE_LARGE(lump, lum_size);
1434 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1436 OBD_FREE_LARGE(lump, lum_size);
1440 static int ll_file_getstripe(struct inode *inode,
1441 struct lov_user_md __user *lum)
1448 env = cl_env_get(&refcheck);
1450 RETURN(PTR_ERR(env));
1452 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum);
1453 cl_env_put(env, &refcheck);
1457 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1460 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1461 struct lov_user_md *klum;
1463 __u64 flags = FMODE_WRITE;
1466 rc = ll_copy_user_md(lum, &klum);
1471 rc = ll_lov_setstripe_ea_info(inode, file, flags, klum, lum_size);
1475 put_user(0, &lum->lmm_stripe_count);
1477 ll_layout_refresh(inode, &gen);
1478 rc = ll_file_getstripe(inode, (struct lov_user_md __user *)arg);
1481 OBD_FREE(klum, lum_size);
1486 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1488 struct ll_inode_info *lli = ll_i2info(inode);
1489 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1490 struct ll_grouplock grouplock;
1495 CWARN("group id for group lock must not be 0\n");
1499 if (ll_file_nolock(file))
1500 RETURN(-EOPNOTSUPP);
1502 spin_lock(&lli->lli_lock);
1503 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1504 CWARN("group lock already existed with gid %lu\n",
1505 fd->fd_grouplock.lg_gid);
1506 spin_unlock(&lli->lli_lock);
1509 LASSERT(fd->fd_grouplock.lg_lock == NULL);
1510 spin_unlock(&lli->lli_lock);
1512 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
1513 arg, (file->f_flags & O_NONBLOCK), &grouplock);
1517 spin_lock(&lli->lli_lock);
1518 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1519 spin_unlock(&lli->lli_lock);
1520 CERROR("another thread just won the race\n");
1521 cl_put_grouplock(&grouplock);
1525 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1526 fd->fd_grouplock = grouplock;
1527 spin_unlock(&lli->lli_lock);
1529 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
1533 static int ll_put_grouplock(struct inode *inode, struct file *file,
1536 struct ll_inode_info *lli = ll_i2info(inode);
1537 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1538 struct ll_grouplock grouplock;
1541 spin_lock(&lli->lli_lock);
1542 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1543 spin_unlock(&lli->lli_lock);
1544 CWARN("no group lock held\n");
1548 LASSERT(fd->fd_grouplock.lg_lock != NULL);
1550 if (fd->fd_grouplock.lg_gid != arg) {
1551 CWARN("group lock %lu doesn't match current id %lu\n",
1552 arg, fd->fd_grouplock.lg_gid);
1553 spin_unlock(&lli->lli_lock);
1557 grouplock = fd->fd_grouplock;
1558 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1559 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1560 spin_unlock(&lli->lli_lock);
1562 cl_put_grouplock(&grouplock);
1563 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1568 * Close inode open handle
1570 * \param dentry [in] dentry which contains the inode
1571 * \param it [in,out] intent which contains open info and result
1574 * \retval <0 failure
1576 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1578 struct inode *inode = dentry->d_inode;
1579 struct obd_client_handle *och;
1585 /* Root ? Do nothing. */
1586 if (dentry->d_inode->i_sb->s_root == dentry)
1589 /* No open handle to close? Move away */
1590 if (!it_disposition(it, DISP_OPEN_OPEN))
1593 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1595 OBD_ALLOC(och, sizeof(*och));
1597 GOTO(out, rc = -ENOMEM);
1599 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1601 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1604 /* this one is in place of ll_file_open */
1605 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1606 ptlrpc_req_finished(it->d.lustre.it_data);
1607 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1613 * Get size for inode for which FIEMAP mapping is requested.
1614 * Make the FIEMAP get_info call and returns the result.
1615 * \param fiemap kernel buffer to hold extens
1616 * \param num_bytes kernel buffer size
1618 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
1624 struct ll_fiemap_info_key fmkey = { .name = KEY_FIEMAP, };
1627 /* Checks for fiemap flags */
1628 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1629 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1633 /* Check for FIEMAP_FLAG_SYNC */
1634 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1635 rc = filemap_fdatawrite(inode->i_mapping);
1640 env = cl_env_get(&refcheck);
1642 RETURN(PTR_ERR(env));
1644 if (i_size_read(inode) == 0) {
1645 rc = ll_glimpse_size(inode);
1650 fmkey.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1651 obdo_from_inode(&fmkey.oa, inode, OBD_MD_FLSIZE);
1652 obdo_set_parent_fid(&fmkey.oa, &ll_i2info(inode)->lli_fid);
1654 /* If filesize is 0, then there would be no objects for mapping */
1655 if (fmkey.oa.o_size == 0) {
1656 fiemap->fm_mapped_extents = 0;
1660 fmkey.fiemap = *fiemap;
1662 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
1663 &fmkey, fiemap, &num_bytes);
1665 cl_env_put(env, &refcheck);
1669 int ll_fid2path(struct inode *inode, void __user *arg)
1671 struct obd_export *exp = ll_i2mdexp(inode);
1672 const struct getinfo_fid2path __user *gfin = arg;
1674 struct getinfo_fid2path *gfout;
1680 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1681 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1684 /* Only need to get the buflen */
1685 if (get_user(pathlen, &gfin->gf_pathlen))
1688 if (pathlen > PATH_MAX)
1691 outsize = sizeof(*gfout) + pathlen;
1692 OBD_ALLOC(gfout, outsize);
1696 if (copy_from_user(gfout, arg, sizeof(*gfout)))
1697 GOTO(gf_free, rc = -EFAULT);
1699 /* Call mdc_iocontrol */
1700 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1704 if (copy_to_user(arg, gfout, outsize))
1708 OBD_FREE(gfout, outsize);
1712 static int ll_ioctl_fiemap(struct inode *inode, struct fiemap __user *arg)
1714 struct fiemap *fiemap;
1720 /* Get the extent count so we can calculate the size of
1721 * required fiemap buffer */
1722 if (get_user(extent_count, &arg->fm_extent_count))
1726 (SIZE_MAX - sizeof(*fiemap)) / sizeof(struct ll_fiemap_extent))
1728 num_bytes = sizeof(*fiemap) + (extent_count *
1729 sizeof(struct ll_fiemap_extent));
1731 OBD_ALLOC_LARGE(fiemap, num_bytes);
1735 /* get the fiemap value */
1736 if (copy_from_user(fiemap, arg, sizeof(*fiemap)))
1737 GOTO(error, rc = -EFAULT);
1739 /* If fm_extent_count is non-zero, read the first extent since
1740 * it is used to calculate end_offset and device from previous
1742 if (extent_count != 0) {
1743 if (copy_from_user(&fiemap->fm_extents[0],
1744 (char __user *)arg + sizeof(*fiemap),
1745 sizeof(struct ll_fiemap_extent)))
1746 GOTO(error, rc = -EFAULT);
1749 rc = ll_do_fiemap(inode, fiemap, num_bytes);
1753 ret_bytes = sizeof(struct fiemap);
1755 if (extent_count != 0)
1756 ret_bytes += (fiemap->fm_mapped_extents *
1757 sizeof(struct ll_fiemap_extent));
1759 if (copy_to_user((void __user *)arg, fiemap, ret_bytes))
1763 OBD_FREE_LARGE(fiemap, num_bytes);
1768 * Read the data_version for inode.
1770 * This value is computed using stripe object version on OST.
1771 * Version is computed using server side locking.
1773 * @param flags if do sync on the OST side;
1775 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
1776 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
1778 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
1785 /* If no file object initialized, we consider its version is 0. */
1786 if (ll_i2info(inode)->lli_clob == NULL) {
1791 env = cl_env_get(&refcheck);
1793 RETURN(PTR_ERR(env));
1795 rc = cl_object_data_version(env, ll_i2info(inode)->lli_clob,
1796 data_version, flags);
1797 cl_env_put(env, &refcheck);
1802 * Trigger a HSM release request for the provided inode.
1804 int ll_hsm_release(struct inode *inode)
1806 struct cl_env_nest nest;
1808 struct obd_client_handle *och = NULL;
1809 __u64 data_version = 0;
1813 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
1814 ll_get_fsname(inode->i_sb, NULL, 0),
1815 PFID(&ll_i2info(inode)->lli_fid));
1817 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
1819 GOTO(out, rc = PTR_ERR(och));
1821 /* Grab latest data_version and [am]time values */
1822 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
1826 env = cl_env_nested_get(&nest);
1828 GOTO(out, rc = PTR_ERR(env));
1830 ll_merge_attr(env, inode);
1831 cl_env_nested_put(&nest, env);
1833 /* Release the file.
1834 * NB: lease lock handle is released in mdc_hsm_release_pack() because
1835 * we still need it to pack l_remote_handle to MDT. */
1836 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
1842 if (och != NULL && !IS_ERR(och)) /* close the file */
1843 ll_lease_close(och, inode, NULL);
1848 struct ll_swap_stack {
1849 struct iattr ia1, ia2;
1851 struct inode *inode1, *inode2;
1852 bool check_dv1, check_dv2;
1855 static int ll_swap_layouts(struct file *file1, struct file *file2,
1856 struct lustre_swap_layouts *lsl)
1858 struct mdc_swap_layouts msl;
1859 struct md_op_data *op_data;
1862 struct ll_swap_stack *llss = NULL;
1865 OBD_ALLOC_PTR(llss);
1869 llss->inode1 = file1->f_dentry->d_inode;
1870 llss->inode2 = file2->f_dentry->d_inode;
1872 if (!S_ISREG(llss->inode2->i_mode))
1873 GOTO(free, rc = -EINVAL);
1875 if (inode_permission(llss->inode1, MAY_WRITE) ||
1876 inode_permission(llss->inode2, MAY_WRITE))
1877 GOTO(free, rc = -EPERM);
1879 if (llss->inode2->i_sb != llss->inode1->i_sb)
1880 GOTO(free, rc = -EXDEV);
1882 /* we use 2 bool because it is easier to swap than 2 bits */
1883 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1884 llss->check_dv1 = true;
1886 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1887 llss->check_dv2 = true;
1889 /* we cannot use lsl->sl_dvX directly because we may swap them */
1890 llss->dv1 = lsl->sl_dv1;
1891 llss->dv2 = lsl->sl_dv2;
1893 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
1894 if (rc == 0) /* same file, done! */
1897 if (rc < 0) { /* sequentialize it */
1898 swap(llss->inode1, llss->inode2);
1900 swap(llss->dv1, llss->dv2);
1901 swap(llss->check_dv1, llss->check_dv2);
1905 if (gid != 0) { /* application asks to flush dirty cache */
1906 rc = ll_get_grouplock(llss->inode1, file1, gid);
1910 rc = ll_get_grouplock(llss->inode2, file2, gid);
1912 ll_put_grouplock(llss->inode1, file1, gid);
1917 /* to be able to restore mtime and atime after swap
1918 * we need to first save them */
1920 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
1921 llss->ia1.ia_mtime = llss->inode1->i_mtime;
1922 llss->ia1.ia_atime = llss->inode1->i_atime;
1923 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
1924 llss->ia2.ia_mtime = llss->inode2->i_mtime;
1925 llss->ia2.ia_atime = llss->inode2->i_atime;
1926 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
1929 /* ultimate check, before swaping the layouts we check if
1930 * dataversion has changed (if requested) */
1931 if (llss->check_dv1) {
1932 rc = ll_data_version(llss->inode1, &dv, 0);
1935 if (dv != llss->dv1)
1936 GOTO(putgl, rc = -EAGAIN);
1939 if (llss->check_dv2) {
1940 rc = ll_data_version(llss->inode2, &dv, 0);
1943 if (dv != llss->dv2)
1944 GOTO(putgl, rc = -EAGAIN);
1947 /* struct md_op_data is used to send the swap args to the mdt
1948 * only flags is missing, so we use struct mdc_swap_layouts
1949 * through the md_op_data->op_data */
1950 /* flags from user space have to be converted before they are send to
1951 * server, no flag is sent today, they are only used on the client */
1954 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
1955 0, LUSTRE_OPC_ANY, &msl);
1956 if (IS_ERR(op_data))
1957 GOTO(free, rc = PTR_ERR(op_data));
1959 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
1960 sizeof(*op_data), op_data, NULL);
1961 ll_finish_md_op_data(op_data);
1965 ll_put_grouplock(llss->inode2, file2, gid);
1966 ll_put_grouplock(llss->inode1, file1, gid);
1969 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
1973 /* clear useless flags */
1974 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
1975 llss->ia1.ia_valid &= ~ATTR_MTIME;
1976 llss->ia2.ia_valid &= ~ATTR_MTIME;
1979 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
1980 llss->ia1.ia_valid &= ~ATTR_ATIME;
1981 llss->ia2.ia_valid &= ~ATTR_ATIME;
1984 /* update time if requested */
1986 if (llss->ia2.ia_valid != 0) {
1987 mutex_lock(&llss->inode1->i_mutex);
1988 rc = ll_setattr(file1->f_dentry, &llss->ia2);
1989 mutex_unlock(&llss->inode1->i_mutex);
1992 if (llss->ia1.ia_valid != 0) {
1995 mutex_lock(&llss->inode2->i_mutex);
1996 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
1997 mutex_unlock(&llss->inode2->i_mutex);
2009 static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2011 struct md_op_data *op_data;
2015 /* Detect out-of range masks */
2016 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2019 /* Non-root users are forbidden to set or clear flags which are
2020 * NOT defined in HSM_USER_MASK. */
2021 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2022 !cfs_capable(CFS_CAP_SYS_ADMIN))
2025 /* Detect out-of range archive id */
2026 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2027 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2030 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2031 LUSTRE_OPC_ANY, hss);
2032 if (IS_ERR(op_data))
2033 RETURN(PTR_ERR(op_data));
2035 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2036 sizeof(*op_data), op_data, NULL);
2038 ll_finish_md_op_data(op_data);
2043 static int ll_hsm_import(struct inode *inode, struct file *file,
2044 struct hsm_user_import *hui)
2046 struct hsm_state_set *hss = NULL;
2047 struct iattr *attr = NULL;
2051 if (!S_ISREG(inode->i_mode))
2057 GOTO(out, rc = -ENOMEM);
2059 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2060 hss->hss_archive_id = hui->hui_archive_id;
2061 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2062 rc = ll_hsm_state_set(inode, hss);
2066 OBD_ALLOC_PTR(attr);
2068 GOTO(out, rc = -ENOMEM);
2070 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2071 attr->ia_mode |= S_IFREG;
2072 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2073 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2074 attr->ia_size = hui->hui_size;
2075 attr->ia_mtime.tv_sec = hui->hui_mtime;
2076 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2077 attr->ia_atime.tv_sec = hui->hui_atime;
2078 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2080 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2081 ATTR_UID | ATTR_GID |
2082 ATTR_MTIME | ATTR_MTIME_SET |
2083 ATTR_ATIME | ATTR_ATIME_SET;
2085 mutex_lock(&inode->i_mutex);
2087 rc = ll_setattr_raw(file->f_dentry, attr, true);
2091 mutex_unlock(&inode->i_mutex);
2103 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2105 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2106 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
2110 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2112 struct inode *inode = file->f_dentry->d_inode;
2113 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2117 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2118 PFID(ll_inode2fid(inode)), inode, cmd);
2119 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2121 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2122 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2126 case LL_IOC_GETFLAGS:
2127 /* Get the current value of the file flags */
2128 return put_user(fd->fd_flags, (int __user *)arg);
2129 case LL_IOC_SETFLAGS:
2130 case LL_IOC_CLRFLAGS:
2131 /* Set or clear specific file flags */
2132 /* XXX This probably needs checks to ensure the flags are
2133 * not abused, and to handle any flag side effects.
2135 if (get_user(flags, (int __user *) arg))
2138 if (cmd == LL_IOC_SETFLAGS) {
2139 if ((flags & LL_FILE_IGNORE_LOCK) &&
2140 !(file->f_flags & O_DIRECT)) {
2141 CERROR("%s: unable to disable locking on "
2142 "non-O_DIRECT file\n", current->comm);
2146 fd->fd_flags |= flags;
2148 fd->fd_flags &= ~flags;
2151 case LL_IOC_LOV_SETSTRIPE:
2152 RETURN(ll_lov_setstripe(inode, file, arg));
2153 case LL_IOC_LOV_SETEA:
2154 RETURN(ll_lov_setea(inode, file, arg));
2155 case LL_IOC_LOV_SWAP_LAYOUTS: {
2157 struct lustre_swap_layouts lsl;
2159 if (copy_from_user(&lsl, (char __user *)arg,
2160 sizeof(struct lustre_swap_layouts)))
2163 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2166 file2 = fget(lsl.sl_fd);
2171 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2172 rc = ll_swap_layouts(file, file2, &lsl);
2176 case LL_IOC_LOV_GETSTRIPE:
2177 RETURN(ll_file_getstripe(inode,
2178 (struct lov_user_md __user *)arg));
2179 case FSFILT_IOC_FIEMAP:
2180 RETURN(ll_ioctl_fiemap(inode, (struct fiemap __user *)arg));
2181 case FSFILT_IOC_GETFLAGS:
2182 case FSFILT_IOC_SETFLAGS:
2183 RETURN(ll_iocontrol(inode, file, cmd, arg));
2184 case FSFILT_IOC_GETVERSION_OLD:
2185 case FSFILT_IOC_GETVERSION:
2186 RETURN(put_user(inode->i_generation, (int __user *)arg));
2187 case LL_IOC_GROUP_LOCK:
2188 RETURN(ll_get_grouplock(inode, file, arg));
2189 case LL_IOC_GROUP_UNLOCK:
2190 RETURN(ll_put_grouplock(inode, file, arg));
2191 case IOC_OBD_STATFS:
2192 RETURN(ll_obd_statfs(inode, (void __user *)arg));
2194 /* We need to special case any other ioctls we want to handle,
2195 * to send them to the MDS/OST as appropriate and to properly
2196 * network encode the arg field.
2197 case FSFILT_IOC_SETVERSION_OLD:
2198 case FSFILT_IOC_SETVERSION:
2200 case LL_IOC_FLUSHCTX:
2201 RETURN(ll_flush_ctx(inode));
2202 case LL_IOC_PATH2FID: {
2203 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2204 sizeof(struct lu_fid)))
2209 case LL_IOC_GETPARENT:
2210 RETURN(ll_getparent(file, (struct getparent __user *)arg));
2212 case OBD_IOC_FID2PATH:
2213 RETURN(ll_fid2path(inode, (void __user *)arg));
2214 case LL_IOC_DATA_VERSION: {
2215 struct ioc_data_version idv;
2218 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
2221 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2222 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2225 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2231 case LL_IOC_GET_MDTIDX: {
2234 mdtidx = ll_get_mdt_idx(inode);
2238 if (put_user((int)mdtidx, (int __user *)arg))
2243 case OBD_IOC_GETDTNAME:
2244 case OBD_IOC_GETMDNAME:
2245 RETURN(ll_get_obd_name(inode, cmd, arg));
2246 case LL_IOC_HSM_STATE_GET: {
2247 struct md_op_data *op_data;
2248 struct hsm_user_state *hus;
2255 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2256 LUSTRE_OPC_ANY, hus);
2257 if (IS_ERR(op_data)) {
2259 RETURN(PTR_ERR(op_data));
2262 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2265 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2268 ll_finish_md_op_data(op_data);
2272 case LL_IOC_HSM_STATE_SET: {
2273 struct hsm_state_set *hss;
2280 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2285 rc = ll_hsm_state_set(inode, hss);
2290 case LL_IOC_HSM_ACTION: {
2291 struct md_op_data *op_data;
2292 struct hsm_current_action *hca;
2299 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2300 LUSTRE_OPC_ANY, hca);
2301 if (IS_ERR(op_data)) {
2303 RETURN(PTR_ERR(op_data));
2306 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2309 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2312 ll_finish_md_op_data(op_data);
2316 case LL_IOC_SET_LEASE: {
2317 struct ll_inode_info *lli = ll_i2info(inode);
2318 struct obd_client_handle *och = NULL;
2323 case LL_LEASE_WRLCK:
2324 if (!(file->f_mode & FMODE_WRITE))
2326 fmode = FMODE_WRITE;
2328 case LL_LEASE_RDLCK:
2329 if (!(file->f_mode & FMODE_READ))
2333 case LL_LEASE_UNLCK:
2334 mutex_lock(&lli->lli_och_mutex);
2335 if (fd->fd_lease_och != NULL) {
2336 och = fd->fd_lease_och;
2337 fd->fd_lease_och = NULL;
2339 mutex_unlock(&lli->lli_och_mutex);
2344 fmode = och->och_flags;
2345 rc = ll_lease_close(och, inode, &lease_broken);
2352 RETURN(ll_lease_type_from_fmode(fmode));
2357 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2359 /* apply for lease */
2360 och = ll_lease_open(inode, file, fmode, 0);
2362 RETURN(PTR_ERR(och));
2365 mutex_lock(&lli->lli_och_mutex);
2366 if (fd->fd_lease_och == NULL) {
2367 fd->fd_lease_och = och;
2370 mutex_unlock(&lli->lli_och_mutex);
2372 /* impossible now that only excl is supported for now */
2373 ll_lease_close(och, inode, &lease_broken);
2378 case LL_IOC_GET_LEASE: {
2379 struct ll_inode_info *lli = ll_i2info(inode);
2380 struct ldlm_lock *lock = NULL;
2383 mutex_lock(&lli->lli_och_mutex);
2384 if (fd->fd_lease_och != NULL) {
2385 struct obd_client_handle *och = fd->fd_lease_och;
2387 lock = ldlm_handle2lock(&och->och_lease_handle);
2389 lock_res_and_lock(lock);
2390 if (!ldlm_is_cancel(lock))
2391 fmode = och->och_flags;
2393 unlock_res_and_lock(lock);
2394 LDLM_LOCK_PUT(lock);
2397 mutex_unlock(&lli->lli_och_mutex);
2399 RETURN(ll_lease_type_from_fmode(fmode));
2401 case LL_IOC_HSM_IMPORT: {
2402 struct hsm_user_import *hui;
2408 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
2413 rc = ll_hsm_import(inode, file, hui);
2423 ll_iocontrol_call(inode, file, cmd, arg, &err))
2426 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2427 (void __user *)arg));
2432 #ifndef HAVE_FILE_LLSEEK_SIZE
2433 static inline loff_t
2434 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2436 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2438 if (offset > maxsize)
2441 if (offset != file->f_pos) {
2442 file->f_pos = offset;
2443 file->f_version = 0;
2449 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2450 loff_t maxsize, loff_t eof)
2452 struct inode *inode = file->f_dentry->d_inode;
2460 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2461 * position-querying operation. Avoid rewriting the "same"
2462 * f_pos value back to the file because a concurrent read(),
2463 * write() or lseek() might have altered it
2468 * f_lock protects against read/modify/write race with other
2469 * SEEK_CURs. Note that parallel writes and reads behave
2472 mutex_lock(&inode->i_mutex);
2473 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2474 mutex_unlock(&inode->i_mutex);
2478 * In the generic case the entire file is data, so as long as
2479 * offset isn't at the end of the file then the offset is data.
2486 * There is a virtual hole at the end of the file, so as long as
2487 * offset isn't i_size or larger, return i_size.
2495 return llseek_execute(file, offset, maxsize);
2499 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2501 struct inode *inode = file->f_dentry->d_inode;
2502 loff_t retval, eof = 0;
2505 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2506 (origin == SEEK_CUR) ? file->f_pos : 0);
2507 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2508 PFID(ll_inode2fid(inode)), inode, retval, retval,
2510 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2512 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2513 retval = ll_glimpse_size(inode);
2516 eof = i_size_read(inode);
2519 retval = ll_generic_file_llseek_size(file, offset, origin,
2520 ll_file_maxbytes(inode), eof);
2524 static int ll_flush(struct file *file, fl_owner_t id)
2526 struct inode *inode = file->f_dentry->d_inode;
2527 struct ll_inode_info *lli = ll_i2info(inode);
2528 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2531 LASSERT(!S_ISDIR(inode->i_mode));
2533 /* catch async errors that were recorded back when async writeback
2534 * failed for pages in this mapping. */
2535 rc = lli->lli_async_rc;
2536 lli->lli_async_rc = 0;
2537 if (lli->lli_clob != NULL) {
2538 err = lov_read_and_clear_async_rc(lli->lli_clob);
2543 /* The application has been told write failure already.
2544 * Do not report failure again. */
2545 if (fd->fd_write_failed)
2547 return rc ? -EIO : 0;
2551 * Called to make sure a portion of file has been written out.
2552 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2554 * Return how many pages have been written.
2556 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2557 enum cl_fsync_mode mode, int ignore_layout)
2559 struct cl_env_nest nest;
2562 struct obd_capa *capa = NULL;
2563 struct cl_fsync_io *fio;
2567 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2568 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2571 env = cl_env_nested_get(&nest);
2573 RETURN(PTR_ERR(env));
2575 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2577 io = vvp_env_thread_io(env);
2578 io->ci_obj = ll_i2info(inode)->lli_clob;
2579 io->ci_ignore_layout = ignore_layout;
2581 /* initialize parameters for sync */
2582 fio = &io->u.ci_fsync;
2583 fio->fi_capa = capa;
2584 fio->fi_start = start;
2586 fio->fi_fid = ll_inode2fid(inode);
2587 fio->fi_mode = mode;
2588 fio->fi_nr_written = 0;
2590 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2591 result = cl_io_loop(env, io);
2593 result = io->ci_result;
2595 result = fio->fi_nr_written;
2596 cl_io_fini(env, io);
2597 cl_env_nested_put(&nest, env);
2605 * When dentry is provided (the 'else' case), *file->f_dentry may be
2606 * null and dentry must be used directly rather than pulled from
2607 * *file->f_dentry as is done otherwise.
2610 #ifdef HAVE_FILE_FSYNC_4ARGS
2611 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2613 struct dentry *dentry = file->f_dentry;
2614 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2615 int ll_fsync(struct file *file, int datasync)
2617 struct dentry *dentry = file->f_dentry;
2619 loff_t end = LLONG_MAX;
2621 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2624 loff_t end = LLONG_MAX;
2626 struct inode *inode = dentry->d_inode;
2627 struct ll_inode_info *lli = ll_i2info(inode);
2628 struct ptlrpc_request *req;
2629 struct obd_capa *oc;
2633 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2634 PFID(ll_inode2fid(inode)), inode);
2635 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2637 #ifdef HAVE_FILE_FSYNC_4ARGS
2638 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2639 mutex_lock(&inode->i_mutex);
2641 /* fsync's caller has already called _fdata{sync,write}, we want
2642 * that IO to finish before calling the osc and mdc sync methods */
2643 rc = filemap_fdatawait(inode->i_mapping);
2646 /* catch async errors that were recorded back when async writeback
2647 * failed for pages in this mapping. */
2648 if (!S_ISDIR(inode->i_mode)) {
2649 err = lli->lli_async_rc;
2650 lli->lli_async_rc = 0;
2653 err = lov_read_and_clear_async_rc(lli->lli_clob);
2658 oc = ll_mdscapa_get(inode);
2659 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2665 ptlrpc_req_finished(req);
2667 if (S_ISREG(inode->i_mode)) {
2668 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2670 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2671 if (rc == 0 && err < 0)
2674 fd->fd_write_failed = true;
2676 fd->fd_write_failed = false;
2679 #ifdef HAVE_FILE_FSYNC_4ARGS
2680 mutex_unlock(&inode->i_mutex);
2686 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2688 struct inode *inode = file->f_dentry->d_inode;
2689 struct ll_sb_info *sbi = ll_i2sbi(inode);
2690 struct ldlm_enqueue_info einfo = {
2691 .ei_type = LDLM_FLOCK,
2692 .ei_cb_cp = ldlm_flock_completion_ast,
2693 .ei_cbdata = file_lock,
2695 struct md_op_data *op_data;
2696 struct lustre_handle lockh = {0};
2697 ldlm_policy_data_t flock = {{0}};
2698 int fl_type = file_lock->fl_type;
2704 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
2705 PFID(ll_inode2fid(inode)), file_lock);
2707 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2709 if (file_lock->fl_flags & FL_FLOCK) {
2710 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2711 /* flocks are whole-file locks */
2712 flock.l_flock.end = OFFSET_MAX;
2713 /* For flocks owner is determined by the local file desctiptor*/
2714 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2715 } else if (file_lock->fl_flags & FL_POSIX) {
2716 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2717 flock.l_flock.start = file_lock->fl_start;
2718 flock.l_flock.end = file_lock->fl_end;
2722 flock.l_flock.pid = file_lock->fl_pid;
2724 /* Somewhat ugly workaround for svc lockd.
2725 * lockd installs custom fl_lmops->lm_compare_owner that checks
2726 * for the fl_owner to be the same (which it always is on local node
2727 * I guess between lockd processes) and then compares pid.
2728 * As such we assign pid to the owner field to make it all work,
2729 * conflict with normal locks is unlikely since pid space and
2730 * pointer space for current->files are not intersecting */
2731 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2732 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
2736 einfo.ei_mode = LCK_PR;
2739 /* An unlock request may or may not have any relation to
2740 * existing locks so we may not be able to pass a lock handle
2741 * via a normal ldlm_lock_cancel() request. The request may even
2742 * unlock a byte range in the middle of an existing lock. In
2743 * order to process an unlock request we need all of the same
2744 * information that is given with a normal read or write record
2745 * lock request. To avoid creating another ldlm unlock (cancel)
2746 * message we'll treat a LCK_NL flock request as an unlock. */
2747 einfo.ei_mode = LCK_NL;
2750 einfo.ei_mode = LCK_PW;
2753 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
2768 flags = LDLM_FL_BLOCK_NOWAIT;
2774 flags = LDLM_FL_TEST_LOCK;
2777 CERROR("unknown fcntl lock command: %d\n", cmd);
2781 /* Save the old mode so that if the mode in the lock changes we
2782 * can decrement the appropriate reader or writer refcount. */
2783 file_lock->fl_type = einfo.ei_mode;
2785 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2786 LUSTRE_OPC_ANY, NULL);
2787 if (IS_ERR(op_data))
2788 RETURN(PTR_ERR(op_data));
2790 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags="LPX64", mode=%u, "
2791 "start="LPU64", end="LPU64"\n", PFID(ll_inode2fid(inode)),
2792 flock.l_flock.pid, flags, einfo.ei_mode,
2793 flock.l_flock.start, flock.l_flock.end);
2795 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data, &lockh,
2798 /* Restore the file lock type if not TEST lock. */
2799 if (!(flags & LDLM_FL_TEST_LOCK))
2800 file_lock->fl_type = fl_type;
2802 if ((file_lock->fl_flags & FL_FLOCK) &&
2803 (rc == 0 || file_lock->fl_type == F_UNLCK))
2804 rc2 = flock_lock_file_wait(file, file_lock);
2805 if ((file_lock->fl_flags & FL_POSIX) &&
2806 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2807 !(flags & LDLM_FL_TEST_LOCK))
2808 rc2 = posix_lock_file_wait(file, file_lock);
2810 if (rc2 && file_lock->fl_type != F_UNLCK) {
2811 einfo.ei_mode = LCK_NL;
2812 md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data,
2817 ll_finish_md_op_data(op_data);
2822 int ll_get_fid_by_name(struct inode *parent, const char *name,
2823 int namelen, struct lu_fid *fid)
2825 struct md_op_data *op_data = NULL;
2826 struct mdt_body *body;
2827 struct ptlrpc_request *req;
2831 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
2832 LUSTRE_OPC_ANY, NULL);
2833 if (IS_ERR(op_data))
2834 RETURN(PTR_ERR(op_data));
2836 op_data->op_valid = OBD_MD_FLID;
2837 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
2838 ll_finish_md_op_data(op_data);
2842 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
2844 GOTO(out_req, rc = -EFAULT);
2846 *fid = body->mbo_fid1;
2848 ptlrpc_req_finished(req);
2852 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
2853 const char *name, int namelen)
2855 struct dentry *dchild = NULL;
2856 struct inode *child_inode = NULL;
2857 struct md_op_data *op_data;
2858 struct ptlrpc_request *request = NULL;
2863 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
2864 name, PFID(ll_inode2fid(parent)), mdtidx);
2866 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
2867 0, LUSTRE_OPC_ANY, NULL);
2868 if (IS_ERR(op_data))
2869 RETURN(PTR_ERR(op_data));
2871 /* Get child FID first */
2872 qstr.hash = full_name_hash(name, namelen);
2875 dchild = d_lookup(file->f_dentry, &qstr);
2876 if (dchild != NULL) {
2877 if (dchild->d_inode != NULL) {
2878 child_inode = igrab(dchild->d_inode);
2879 if (child_inode != NULL) {
2880 mutex_lock(&child_inode->i_mutex);
2881 op_data->op_fid3 = *ll_inode2fid(child_inode);
2882 ll_invalidate_aliases(child_inode);
2887 rc = ll_get_fid_by_name(parent, name, namelen,
2893 if (!fid_is_sane(&op_data->op_fid3)) {
2894 CERROR("%s: migrate %s , but fid "DFID" is insane\n",
2895 ll_get_fsname(parent->i_sb, NULL, 0), name,
2896 PFID(&op_data->op_fid3));
2897 GOTO(out_free, rc = -EINVAL);
2900 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
2905 CDEBUG(D_INFO, "%s:"DFID" is already on MDT%d.\n", name,
2906 PFID(&op_data->op_fid3), mdtidx);
2907 GOTO(out_free, rc = 0);
2910 op_data->op_mds = mdtidx;
2911 op_data->op_cli_flags = CLI_MIGRATE;
2912 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
2913 namelen, name, namelen, &request);
2915 ll_update_times(request, parent);
2917 ptlrpc_req_finished(request);
2922 if (child_inode != NULL) {
2923 clear_nlink(child_inode);
2924 mutex_unlock(&child_inode->i_mutex);
2928 ll_finish_md_op_data(op_data);
2933 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2941 * test if some locks matching bits and l_req_mode are acquired
2942 * - bits can be in different locks
2943 * - if found clear the common lock bits in *bits
2944 * - the bits not found, are kept in *bits
2946 * \param bits [IN] searched lock bits [IN]
2947 * \param l_req_mode [IN] searched lock mode
2948 * \retval boolean, true iff all bits are found
2950 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
2952 struct lustre_handle lockh;
2953 ldlm_policy_data_t policy;
2954 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2955 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2964 fid = &ll_i2info(inode)->lli_fid;
2965 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2966 ldlm_lockname[mode]);
2968 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2969 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2970 policy.l_inodebits.bits = *bits & (1 << i);
2971 if (policy.l_inodebits.bits == 0)
2974 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2975 &policy, mode, &lockh)) {
2976 struct ldlm_lock *lock;
2978 lock = ldlm_handle2lock(&lockh);
2981 ~(lock->l_policy_data.l_inodebits.bits);
2982 LDLM_LOCK_PUT(lock);
2984 *bits &= ~policy.l_inodebits.bits;
2991 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2992 struct lustre_handle *lockh, __u64 flags,
2995 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
3000 fid = &ll_i2info(inode)->lli_fid;
3001 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3003 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3004 fid, LDLM_IBITS, &policy, mode, lockh);
3009 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3011 /* Already unlinked. Just update nlink and return success */
3012 if (rc == -ENOENT) {
3014 /* This path cannot be hit for regular files unless in
3015 * case of obscure races, so no need to to validate
3017 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3019 } else if (rc != 0) {
3020 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3021 "%s: revalidate FID "DFID" error: rc = %d\n",
3022 ll_get_fsname(inode->i_sb, NULL, 0),
3023 PFID(ll_inode2fid(inode)), rc);
3029 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3031 struct inode *inode = dentry->d_inode;
3032 struct ptlrpc_request *req = NULL;
3033 struct obd_export *exp;
3037 LASSERT(inode != NULL);
3039 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3040 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3042 exp = ll_i2mdexp(inode);
3044 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3045 * But under CMD case, it caused some lock issues, should be fixed
3046 * with new CMD ibits lock. See bug 12718 */
3047 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3048 struct lookup_intent oit = { .it_op = IT_GETATTR };
3049 struct md_op_data *op_data;
3051 if (ibits == MDS_INODELOCK_LOOKUP)
3052 oit.it_op = IT_LOOKUP;
3054 /* Call getattr by fid, so do not provide name at all. */
3055 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3056 dentry->d_inode, NULL, 0, 0,
3057 LUSTRE_OPC_ANY, NULL);
3058 if (IS_ERR(op_data))
3059 RETURN(PTR_ERR(op_data));
3061 rc = md_intent_lock(exp, op_data, &oit, &req,
3062 &ll_md_blocking_ast, 0);
3063 ll_finish_md_op_data(op_data);
3065 rc = ll_inode_revalidate_fini(inode, rc);
3069 rc = ll_revalidate_it_finish(req, &oit, dentry);
3071 ll_intent_release(&oit);
3075 /* Unlinked? Unhash dentry, so it is not picked up later by
3076 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3077 here to preserve get_cwd functionality on 2.6.
3079 if (!dentry->d_inode->i_nlink)
3080 d_lustre_invalidate(dentry, 0);
3082 ll_lookup_finish_locks(&oit, dentry);
3083 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3084 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3085 u64 valid = OBD_MD_FLGETATTR;
3086 struct md_op_data *op_data;
3089 if (S_ISREG(inode->i_mode)) {
3090 rc = ll_get_default_mdsize(sbi, &ealen);
3093 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3096 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3097 0, ealen, LUSTRE_OPC_ANY,
3099 if (IS_ERR(op_data))
3100 RETURN(PTR_ERR(op_data));
3102 op_data->op_valid = valid;
3103 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
3104 * capa for this inode. Because we only keep capas of dirs
3106 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3107 ll_finish_md_op_data(op_data);
3109 rc = ll_inode_revalidate_fini(inode, rc);
3113 rc = ll_prep_inode(&inode, req, NULL, NULL);
3116 ptlrpc_req_finished(req);
3120 static int ll_merge_md_attr(struct inode *inode)
3122 struct cl_attr attr = { 0 };
3125 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3126 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3127 &attr, ll_md_blocking_ast);
3131 set_nlink(inode, attr.cat_nlink);
3132 inode->i_blocks = attr.cat_blocks;
3133 i_size_write(inode, attr.cat_size);
3135 ll_i2info(inode)->lli_atime = attr.cat_atime;
3136 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
3137 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
3143 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3145 struct inode *inode = dentry->d_inode;
3149 rc = __ll_inode_revalidate(dentry, ibits);
3153 /* if object isn't regular file, don't validate size */
3154 if (!S_ISREG(inode->i_mode)) {
3155 if (S_ISDIR(inode->i_mode) &&
3156 ll_i2info(inode)->lli_lsm_md != NULL) {
3157 rc = ll_merge_md_attr(inode);
3162 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
3163 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
3164 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
3166 /* In case of restore, the MDT has the right size and has
3167 * already send it back without granting the layout lock,
3168 * inode is up-to-date so glimpse is useless.
3169 * Also to glimpse we need the layout, in case of a running
3170 * restore the MDT holds the layout lock so the glimpse will
3171 * block up to the end of restore (getattr will block)
3173 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3174 rc = ll_glimpse_size(inode);
3179 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3181 struct inode *inode = de->d_inode;
3182 struct ll_sb_info *sbi = ll_i2sbi(inode);
3183 struct ll_inode_info *lli = ll_i2info(inode);
3186 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3187 MDS_INODELOCK_LOOKUP);
3188 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3193 stat->dev = inode->i_sb->s_dev;
3194 if (ll_need_32bit_api(sbi))
3195 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3197 stat->ino = inode->i_ino;
3198 stat->mode = inode->i_mode;
3199 stat->uid = inode->i_uid;
3200 stat->gid = inode->i_gid;
3201 stat->rdev = inode->i_rdev;
3202 stat->atime = inode->i_atime;
3203 stat->mtime = inode->i_mtime;
3204 stat->ctime = inode->i_ctime;
3205 stat->blksize = 1 << inode->i_blkbits;
3207 stat->nlink = inode->i_nlink;
3208 stat->size = i_size_read(inode);
3209 stat->blocks = inode->i_blocks;
3214 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3215 __u64 start, __u64 len)
3219 struct ll_user_fiemap *fiemap;
3220 unsigned int extent_count = fieinfo->fi_extents_max;
3222 num_bytes = sizeof(*fiemap) + (extent_count *
3223 sizeof(struct ll_fiemap_extent));
3224 OBD_ALLOC_LARGE(fiemap, num_bytes);
3229 fiemap->fm_flags = fieinfo->fi_flags;
3230 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3231 fiemap->fm_start = start;
3232 fiemap->fm_length = len;
3233 if (extent_count > 0)
3234 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3235 sizeof(struct ll_fiemap_extent));
3237 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3239 fieinfo->fi_flags = fiemap->fm_flags;
3240 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3241 if (extent_count > 0)
3242 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3243 fiemap->fm_mapped_extents *
3244 sizeof(struct ll_fiemap_extent));
3246 OBD_FREE_LARGE(fiemap, num_bytes);
3250 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3252 struct ll_inode_info *lli = ll_i2info(inode);
3253 struct posix_acl *acl = NULL;
3256 spin_lock(&lli->lli_lock);
3257 /* VFS' acl_permission_check->check_acl will release the refcount */
3258 acl = posix_acl_dup(lli->lli_posix_acl);
3259 spin_unlock(&lli->lli_lock);
3264 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
3266 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3267 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3269 ll_check_acl(struct inode *inode, int mask)
3272 # ifdef CONFIG_FS_POSIX_ACL
3273 struct posix_acl *acl;
3277 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3278 if (flags & IPERM_FLAG_RCU)
3281 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3286 rc = posix_acl_permission(inode, acl, mask);
3287 posix_acl_release(acl);
3290 # else /* !CONFIG_FS_POSIX_ACL */
3292 # endif /* CONFIG_FS_POSIX_ACL */
3294 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
3296 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3297 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3299 # ifdef HAVE_INODE_PERMISION_2ARGS
3300 int ll_inode_permission(struct inode *inode, int mask)
3302 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3307 struct ll_sb_info *sbi;
3308 struct root_squash_info *squash;
3309 struct cred *cred = NULL;
3310 const struct cred *old_cred = NULL;
3312 bool squash_id = false;
3315 #ifdef MAY_NOT_BLOCK
3316 if (mask & MAY_NOT_BLOCK)
3318 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3319 if (flags & IPERM_FLAG_RCU)
3323 /* as root inode are NOT getting validated in lookup operation,
3324 * need to do it before permission check. */
3326 if (inode == inode->i_sb->s_root->d_inode) {
3327 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3328 MDS_INODELOCK_LOOKUP);
3333 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3334 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3336 /* squash fsuid/fsgid if needed */
3337 sbi = ll_i2sbi(inode);
3338 squash = &sbi->ll_squash;
3339 if (unlikely(squash->rsi_uid != 0 &&
3340 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
3341 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
3345 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
3346 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
3347 squash->rsi_uid, squash->rsi_gid);
3349 /* update current process's credentials
3350 * and FS capability */
3351 cred = prepare_creds();
3355 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
3356 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
3357 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
3358 if ((1 << cap) & CFS_CAP_FS_MASK)
3359 cap_lower(cred->cap_effective, cap);
3361 old_cred = override_creds(cred);
3364 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
3366 if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
3367 rc = lustre_check_remote_perm(inode, mask);
3369 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3371 /* restore current process's credentials and FS capability */
3373 revert_creds(old_cred);
3380 /* -o localflock - only provides locally consistent flock locks */
3381 struct file_operations ll_file_operations = {
3382 .read = ll_file_read,
3383 .aio_read = ll_file_aio_read,
3384 .write = ll_file_write,
3385 .aio_write = ll_file_aio_write,
3386 .unlocked_ioctl = ll_file_ioctl,
3387 .open = ll_file_open,
3388 .release = ll_file_release,
3389 .mmap = ll_file_mmap,
3390 .llseek = ll_file_seek,
3391 .splice_read = ll_file_splice_read,
3396 struct file_operations ll_file_operations_flock = {
3397 .read = ll_file_read,
3398 .aio_read = ll_file_aio_read,
3399 .write = ll_file_write,
3400 .aio_write = ll_file_aio_write,
3401 .unlocked_ioctl = ll_file_ioctl,
3402 .open = ll_file_open,
3403 .release = ll_file_release,
3404 .mmap = ll_file_mmap,
3405 .llseek = ll_file_seek,
3406 .splice_read = ll_file_splice_read,
3409 .flock = ll_file_flock,
3410 .lock = ll_file_flock
3413 /* These are for -o noflock - to return ENOSYS on flock calls */
3414 struct file_operations ll_file_operations_noflock = {
3415 .read = ll_file_read,
3416 .aio_read = ll_file_aio_read,
3417 .write = ll_file_write,
3418 .aio_write = ll_file_aio_write,
3419 .unlocked_ioctl = ll_file_ioctl,
3420 .open = ll_file_open,
3421 .release = ll_file_release,
3422 .mmap = ll_file_mmap,
3423 .llseek = ll_file_seek,
3424 .splice_read = ll_file_splice_read,
3427 .flock = ll_file_noflock,
3428 .lock = ll_file_noflock
3431 struct inode_operations ll_file_inode_operations = {
3432 .setattr = ll_setattr,
3433 .getattr = ll_getattr,
3434 .permission = ll_inode_permission,
3435 .setxattr = ll_setxattr,
3436 .getxattr = ll_getxattr,
3437 .listxattr = ll_listxattr,
3438 .removexattr = ll_removexattr,
3439 .fiemap = ll_fiemap,
3440 #ifdef HAVE_IOP_GET_ACL
3441 .get_acl = ll_get_acl,
3445 /* dynamic ioctl number support routins */
3446 static struct llioc_ctl_data {
3447 struct rw_semaphore ioc_sem;
3448 struct list_head ioc_head;
3450 __RWSEM_INITIALIZER(llioc.ioc_sem),
3451 LIST_HEAD_INIT(llioc.ioc_head)
3456 struct list_head iocd_list;
3457 unsigned int iocd_size;
3458 llioc_callback_t iocd_cb;
3459 unsigned int iocd_count;
3460 unsigned int iocd_cmd[0];
3463 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3466 struct llioc_data *in_data = NULL;
3469 if (cb == NULL || cmd == NULL ||
3470 count > LLIOC_MAX_CMD || count < 0)
3473 size = sizeof(*in_data) + count * sizeof(unsigned int);
3474 OBD_ALLOC(in_data, size);
3475 if (in_data == NULL)
3478 memset(in_data, 0, sizeof(*in_data));
3479 in_data->iocd_size = size;
3480 in_data->iocd_cb = cb;
3481 in_data->iocd_count = count;
3482 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3484 down_write(&llioc.ioc_sem);
3485 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3486 up_write(&llioc.ioc_sem);
3491 void ll_iocontrol_unregister(void *magic)
3493 struct llioc_data *tmp;
3498 down_write(&llioc.ioc_sem);
3499 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3501 unsigned int size = tmp->iocd_size;
3503 list_del(&tmp->iocd_list);
3504 up_write(&llioc.ioc_sem);
3506 OBD_FREE(tmp, size);
3510 up_write(&llioc.ioc_sem);
3512 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3515 EXPORT_SYMBOL(ll_iocontrol_register);
3516 EXPORT_SYMBOL(ll_iocontrol_unregister);
3518 static enum llioc_iter
3519 ll_iocontrol_call(struct inode *inode, struct file *file,
3520 unsigned int cmd, unsigned long arg, int *rcp)
3522 enum llioc_iter ret = LLIOC_CONT;
3523 struct llioc_data *data;
3524 int rc = -EINVAL, i;
3526 down_read(&llioc.ioc_sem);
3527 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3528 for (i = 0; i < data->iocd_count; i++) {
3529 if (cmd != data->iocd_cmd[i])
3532 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3536 if (ret == LLIOC_STOP)
3539 up_read(&llioc.ioc_sem);
3546 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3548 struct ll_inode_info *lli = ll_i2info(inode);
3549 struct cl_object *obj = lli->lli_clob;
3550 struct cl_env_nest nest;
3558 env = cl_env_nested_get(&nest);
3560 RETURN(PTR_ERR(env));
3562 rc = cl_conf_set(env, lli->lli_clob, conf);
3566 if (conf->coc_opc == OBJECT_CONF_SET) {
3567 struct ldlm_lock *lock = conf->coc_lock;
3568 struct cl_layout cl = {
3572 LASSERT(lock != NULL);
3573 LASSERT(ldlm_has_layout(lock));
3575 /* it can only be allowed to match after layout is
3576 * applied to inode otherwise false layout would be
3577 * seen. Applying layout shoud happen before dropping
3578 * the intent lock. */
3579 ldlm_lock_allow_match(lock);
3581 rc = cl_object_layout_get(env, obj, &cl);
3586 DFID": layout version change: %u -> %u\n",
3587 PFID(&lli->lli_fid), ll_layout_version_get(lli),
3589 ll_layout_version_set(lli, cl.cl_layout_gen);
3593 cl_env_nested_put(&nest, env);
3598 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
3599 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3602 struct ll_sb_info *sbi = ll_i2sbi(inode);
3603 struct obd_capa *oc;
3604 struct ptlrpc_request *req;
3605 struct mdt_body *body;
3612 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3613 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
3614 lock->l_lvb_data, lock->l_lvb_len);
3616 if ((lock->l_lvb_data != NULL) && ldlm_is_lvb_ready(lock))
3619 /* if layout lock was granted right away, the layout is returned
3620 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3621 * blocked and then granted via completion ast, we have to fetch
3622 * layout here. Please note that we can't use the LVB buffer in
3623 * completion AST because it doesn't have a large enough buffer */
3624 oc = ll_mdscapa_get(inode);
3625 rc = ll_get_default_mdsize(sbi, &lmmsize);
3627 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3628 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3634 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3636 GOTO(out, rc = -EPROTO);
3638 lmmsize = body->mbo_eadatasize;
3639 if (lmmsize == 0) /* empty layout */
3642 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3644 GOTO(out, rc = -EFAULT);
3646 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3647 if (lvbdata == NULL)
3648 GOTO(out, rc = -ENOMEM);
3650 memcpy(lvbdata, lmm, lmmsize);
3651 lock_res_and_lock(lock);
3652 if (lock->l_lvb_data != NULL)
3653 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3655 lock->l_lvb_data = lvbdata;
3656 lock->l_lvb_len = lmmsize;
3657 unlock_res_and_lock(lock);
3662 ptlrpc_req_finished(req);
3667 * Apply the layout to the inode. Layout lock is held and will be released
3670 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3671 struct inode *inode)
3673 struct ll_inode_info *lli = ll_i2info(inode);
3674 struct ll_sb_info *sbi = ll_i2sbi(inode);
3675 struct ldlm_lock *lock;
3676 struct lustre_md md = { NULL };
3677 struct cl_object_conf conf;
3680 bool wait_layout = false;
3683 LASSERT(lustre_handle_is_used(lockh));
3685 lock = ldlm_handle2lock(lockh);
3686 LASSERT(lock != NULL);
3687 LASSERT(ldlm_has_layout(lock));
3689 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
3690 PFID(&lli->lli_fid), inode);
3692 /* in case this is a caching lock and reinstate with new inode */
3693 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3695 lock_res_and_lock(lock);
3696 lvb_ready = ldlm_is_lvb_ready(lock);
3697 unlock_res_and_lock(lock);
3698 /* checking lvb_ready is racy but this is okay. The worst case is
3699 * that multi processes may configure the file on the same time. */
3704 rc = ll_layout_fetch(inode, lock);
3708 /* for layout lock, lmm is returned in lock's lvb.
3709 * lvb_data is immutable if the lock is held so it's safe to access it
3710 * without res lock. See the description in ldlm_lock_decref_internal()
3711 * for the condition to free lvb_data of layout lock */
3712 if (lock->l_lvb_data != NULL) {
3713 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3714 lock->l_lvb_data, lock->l_lvb_len);
3716 CERROR("%s: file "DFID" unpackmd error: %d\n",
3717 ll_get_fsname(inode->i_sb, NULL, 0),
3718 PFID(&lli->lli_fid), rc);
3722 LASSERTF(md.lsm != NULL, "lvb_data = %p, lvb_len = %u\n",
3723 lock->l_lvb_data, lock->l_lvb_len);
3728 /* set layout to file. Unlikely this will fail as old layout was
3729 * surely eliminated */
3730 memset(&conf, 0, sizeof conf);
3731 conf.coc_opc = OBJECT_CONF_SET;
3732 conf.coc_inode = inode;
3733 conf.coc_lock = lock;
3734 conf.u.coc_md = &md;
3735 rc = ll_layout_conf(inode, &conf);
3738 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3740 /* refresh layout failed, need to wait */
3741 wait_layout = rc == -EBUSY;
3745 LDLM_LOCK_PUT(lock);
3746 ldlm_lock_decref(lockh, mode);
3748 /* wait for IO to complete if it's still being used. */
3750 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
3751 ll_get_fsname(inode->i_sb, NULL, 0),
3752 PFID(&lli->lli_fid), inode);
3754 memset(&conf, 0, sizeof conf);
3755 conf.coc_opc = OBJECT_CONF_WAIT;
3756 conf.coc_inode = inode;
3757 rc = ll_layout_conf(inode, &conf);
3761 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
3762 ll_get_fsname(inode->i_sb, NULL, 0),
3763 PFID(&lli->lli_fid), rc);
3768 static int ll_layout_refresh_locked(struct inode *inode)
3770 struct ll_inode_info *lli = ll_i2info(inode);
3771 struct ll_sb_info *sbi = ll_i2sbi(inode);
3772 struct md_op_data *op_data;
3773 struct lookup_intent it;
3774 struct lustre_handle lockh;
3776 struct ldlm_enqueue_info einfo = {
3777 .ei_type = LDLM_IBITS,
3779 .ei_cb_bl = &ll_md_blocking_ast,
3780 .ei_cb_cp = &ldlm_completion_ast,
3786 /* mostly layout lock is caching on the local side, so try to match
3787 * it before grabbing layout lock mutex. */
3788 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3789 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3790 if (mode != 0) { /* hit cached lock */
3791 rc = ll_layout_lock_set(&lockh, mode, inode);
3798 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3799 0, 0, LUSTRE_OPC_ANY, NULL);
3800 if (IS_ERR(op_data))
3801 RETURN(PTR_ERR(op_data));
3803 /* have to enqueue one */
3804 memset(&it, 0, sizeof(it));
3805 it.it_op = IT_LAYOUT;
3806 lockh.cookie = 0ULL;
3808 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
3809 ll_get_fsname(inode->i_sb, NULL, 0),
3810 PFID(&lli->lli_fid), inode);
3812 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, &it, op_data, &lockh, 0);
3813 if (it.d.lustre.it_data != NULL)
3814 ptlrpc_req_finished(it.d.lustre.it_data);
3815 it.d.lustre.it_data = NULL;
3817 ll_finish_md_op_data(op_data);
3819 mode = it.d.lustre.it_lock_mode;
3820 it.d.lustre.it_lock_mode = 0;
3821 ll_intent_drop_lock(&it);
3824 /* set lock data in case this is a new lock */
3825 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3826 rc = ll_layout_lock_set(&lockh, mode, inode);
3835 * This function checks if there exists a LAYOUT lock on the client side,
3836 * or enqueues it if it doesn't have one in cache.
3838 * This function will not hold layout lock so it may be revoked any time after
3839 * this function returns. Any operations depend on layout should be redone
3842 * This function should be called before lov_io_init() to get an uptodate
3843 * layout version, the caller should save the version number and after IO
3844 * is finished, this function should be called again to verify that layout
3845 * is not changed during IO time.
3847 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3849 struct ll_inode_info *lli = ll_i2info(inode);
3850 struct ll_sb_info *sbi = ll_i2sbi(inode);
3854 *gen = ll_layout_version_get(lli);
3855 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
3859 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3860 LASSERT(S_ISREG(inode->i_mode));
3862 /* take layout lock mutex to enqueue layout lock exclusively. */
3863 mutex_lock(&lli->lli_layout_mutex);
3865 rc = ll_layout_refresh_locked(inode);
3869 *gen = ll_layout_version_get(lli);
3871 mutex_unlock(&lli->lli_layout_mutex);
3877 * This function send a restore request to the MDT
3879 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
3881 struct hsm_user_request *hur;
3885 len = sizeof(struct hsm_user_request) +
3886 sizeof(struct hsm_user_item);
3887 OBD_ALLOC(hur, len);
3891 hur->hur_request.hr_action = HUA_RESTORE;
3892 hur->hur_request.hr_archive_id = 0;
3893 hur->hur_request.hr_flags = 0;
3894 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
3895 sizeof(hur->hur_user_item[0].hui_fid));
3896 hur->hur_user_item[0].hui_extent.offset = offset;
3897 hur->hur_user_item[0].hui_extent.length = length;
3898 hur->hur_request.hr_itemcount = 1;
3899 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,