4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2014, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <linux/pagemap.h>
46 #include <linux/file.h>
47 #include <linux/sched.h>
48 #include <lustre/ll_fiemap.h>
49 #include <lustre_ioctl.h>
51 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
/* Forward declarations for helpers defined later in this file. */
57 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
/* Release a lease openhandle; reports via *lease_broken whether the
 * lease lock had already been cancelled by the server. */
59 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
/* Dispatch a dynamic (registered) ioctl handler chain for this inode. */
62 static enum llioc_iter
63 ll_iocontrol_call(struct inode *inode, struct file *file,
64 unsigned int cmd, unsigned long arg, int *rcp);
/*
 * Allocate a per-open ll_file_data structure from its slab cache.
 * GFP_NOFS is used so the allocation cannot recurse back into the
 * filesystem under memory pressure.  fd_write_failed starts clear;
 * it is set later if an async write fails so close() can report it.
 */
66 static struct ll_file_data *ll_file_data_get(void)
68 struct ll_file_data *fd;
70 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
74 fd->fd_write_failed = false;
/* Return an ll_file_data to the slab cache (counterpart of
 * ll_file_data_get()). */
79 static void ll_file_data_put(struct ll_file_data *fd)
82 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Copy the inode's current attributes (mode, times, size, blocks,
 * flags) plus the given MDS open handle @fh and capability into
 * @op_data, in preparation for an RPC (e.g. close) that reports
 * client-side state back to the MDT.
 */
85 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
86 struct lustre_handle *fh)
88 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
89 op_data->op_attr.ia_mode = inode->i_mode;
90 op_data->op_attr.ia_atime = inode->i_atime;
91 op_data->op_attr.ia_mtime = inode->i_mtime;
92 op_data->op_attr.ia_ctime = inode->i_ctime;
93 op_data->op_attr.ia_size = i_size_read(inode);
94 op_data->op_attr_blocks = inode->i_blocks;
/* Translate kernel inode flags into the on-wire ext-style flag bits. */
95 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
97 op_data->op_handle = *fh;
98 op_data->op_capa1 = ll_mdscapa_get(inode);
/* If local data was modified while open, ask the MDT to update
 * its view on close via the MDS_DATA_MODIFIED bias. */
100 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
101 op_data->op_bias |= MDS_DATA_MODIFIED;
105 * Packs all the attributes into @op_data for the CLOSE rpc.
107 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
108 struct obd_client_handle *och)
/* Always send mode and timestamps on close... */
112 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
113 ATTR_MTIME | ATTR_MTIME_SET |
114 ATTR_CTIME | ATTR_CTIME_SET;
/* ...but only send size/blocks for handles opened for write, since
 * a read-only opener cannot have changed them. */
116 if (!(och->och_flags & FMODE_WRITE))
119 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* Fill in the inode attributes and the open handle being closed. */
122 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
123 ll_prep_md_op_data(op_data, inode, NULL, NULL,
124 0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Send a CLOSE RPC to the MDS for the open handle @och.
 *
 * If @data_version is non-NULL this close is part of an HSM release:
 * the data version and lease handle are passed so the MDT can verify
 * the file was not modified since the copy was archived.
 *
 * Frees @op_data and clears the open replay data; the request (if
 * any) is finished before returning.
 */
128 static int ll_close_inode_openhandle(struct obd_export *md_exp,
130 struct obd_client_handle *och,
131 const __u64 *data_version)
133 struct obd_export *exp = ll_i2mdexp(inode);
134 struct md_op_data *op_data;
135 struct ptlrpc_request *req = NULL;
136 struct obd_device *obd = class_exp2obd(exp);
142 * XXX: in case of LMV, is this correct to access
145 CERROR("Invalid MDC connection handle "LPX64"\n",
146 ll_i2mdexp(inode)->exp_handle.h_cookie);
150 OBD_ALLOC_PTR(op_data);
/* Known deficiency: on allocation failure the openhandle and any
 * pending request are leaked. */
152 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
154 ll_prepare_close(inode, op_data, och);
155 if (data_version != NULL) {
156 /* Pass in data_version implies release. */
157 op_data->op_bias |= MDS_HSM_RELEASE;
158 op_data->op_data_version = *data_version;
159 op_data->op_lease_handle = och->och_lease_handle;
160 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
163 rc = md_close(md_exp, op_data, och->och_mod, &req);
165 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
166 ll_i2mdexp(inode)->exp_obd->obd_name,
167 PFID(ll_inode2fid(inode)), rc);
170 /* DATA_MODIFIED flag was successfully sent on close, cancel data
171 * modification flag. */
172 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
173 struct ll_inode_info *lli = ll_i2info(inode);
175 spin_lock(&lli->lli_lock);
176 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
177 spin_unlock(&lli->lli_lock);
/* For HSM release, check the server actually released the file. */
180 if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
181 struct mdt_body *body;
182 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
183 if (!(body->mbo_valid & OBD_MD_FLRELEASED))
187 ll_finish_md_op_data(op_data);
/* The handle is dead now; poison it so reuse is detectable. */
191 md_clear_open_replay_data(md_exp, och);
192 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
195 if (req) /* This is close request */
196 ptlrpc_req_finished(req);
/*
 * Really close the MDS open handle of the given mode (read, write or
 * exec) for @inode, but only when no other local opener still uses
 * it.  The per-mode handle pointer and use count are selected from
 * the inode's ll_inode_info under lli_och_mutex.
 */
200 int ll_md_real_close(struct inode *inode, fmode_t fmode)
202 struct ll_inode_info *lli = ll_i2info(inode);
203 struct obd_client_handle **och_p;
204 struct obd_client_handle *och;
/* Pick the handle slot matching the open mode. */
209 if (fmode & FMODE_WRITE) {
210 och_p = &lli->lli_mds_write_och;
211 och_usecount = &lli->lli_open_fd_write_count;
212 } else if (fmode & FMODE_EXEC) {
213 och_p = &lli->lli_mds_exec_och;
214 och_usecount = &lli->lli_open_fd_exec_count;
216 LASSERT(fmode & FMODE_READ);
217 och_p = &lli->lli_mds_read_och;
218 och_usecount = &lli->lli_open_fd_read_count;
221 mutex_lock(&lli->lli_och_mutex);
222 if (*och_usecount > 0) {
223 /* There are still users of this handle, so skip
225 mutex_unlock(&lli->lli_och_mutex);
231 mutex_unlock(&lli->lli_och_mutex);
234 /* There might be a race and this handle may already
236 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-file-descriptor close path: drop group lock and lease if held,
 * close any private open handle, decrement the per-mode open count,
 * and only talk to the MDS (ll_md_real_close) when we do not hold a
 * matching OPEN DLM lock that lets us skip the RPC.  Finally detach
 * and free the ll_file_data.
 */
243 static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
246 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
247 struct ll_inode_info *lli = ll_i2info(inode);
251 /* clear group lock, if present */
252 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
253 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
255 if (fd->fd_lease_och != NULL) {
258 /* Usually the lease is not released when the
259 * application crashed, we need to release here. */
260 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
261 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
262 PFID(&lli->lli_fid), rc, lease_broken);
264 fd->fd_lease_och = NULL;
/* Close the openhandle that the lease machinery stashed on the fd. */
267 if (fd->fd_och != NULL) {
268 rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
273 /* Let's see if we have good enough OPEN lock on the file and if
274 we can skip talking to MDS */
275 if (file->f_dentry->d_inode) { /* Can this ever be false? */
277 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
278 struct lustre_handle lockh;
279 struct inode *inode = file->f_dentry->d_inode;
280 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
/* Drop this descriptor's reference from the per-mode open count. */
282 mutex_lock(&lli->lli_och_mutex);
283 if (fd->fd_omode & FMODE_WRITE) {
285 LASSERT(lli->lli_open_fd_write_count);
286 lli->lli_open_fd_write_count--;
287 } else if (fd->fd_omode & FMODE_EXEC) {
289 LASSERT(lli->lli_open_fd_exec_count);
290 lli->lli_open_fd_exec_count--;
293 LASSERT(lli->lli_open_fd_read_count);
294 lli->lli_open_fd_read_count--;
296 mutex_unlock(&lli->lli_och_mutex);
/* No matching OPEN lock cached locally: must close on the MDS. */
298 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
299 LDLM_IBITS, &policy, lockmode,
301 rc = ll_md_real_close(file->f_dentry->d_inode,
305 CERROR("released file has negative dentry: file = %p, "
306 "dentry = %p, name = %s\n",
307 file, file->f_dentry, file->f_dentry->d_name.name);
311 LUSTRE_FPRIVATE(file) = NULL;
312 ll_file_data_put(fd);
313 ll_capa_close(inode);
318 /* While this returns an error code, fput() the caller does not, so we need
319 * to make every effort to clean up all of our state here. Also, applications
320 * rarely check close errors and even if an error is returned they will not
321 * re-try the close call.
323 int ll_file_release(struct inode *inode, struct file *file)
325 struct ll_file_data *fd;
326 struct ll_sb_info *sbi = ll_i2sbi(inode);
327 struct ll_inode_info *lli = ll_i2info(inode);
331 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
332 PFID(ll_inode2fid(inode)), inode);
/* Remote-client ACL bookkeeping applies only to the root inode. */
334 #ifdef CONFIG_FS_POSIX_ACL
335 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
336 inode == inode->i_sb->s_root->d_inode) {
337 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
340 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
341 fd->fd_flags &= ~LL_FILE_RMTACL;
342 rct_del(&sbi->ll_rct, current_pid());
343 et_search_free(&sbi->ll_et, current_pid());
/* Don't count releases of the root dentry in the stats. */
348 if (inode->i_sb->s_root != file->f_dentry)
349 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
350 fd = LUSTRE_FPRIVATE(file);
353 /* The last ref on @file, maybe not the owner pid of statahead,
354 * because parent and child process can share the same file handle. */
355 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
356 ll_deauthorize_statahead(inode, fd);
/* Root never has an MDS open handle to close; just free the fd. */
358 if (inode->i_sb->s_root == file->f_dentry) {
359 LUSTRE_FPRIVATE(file) = NULL;
360 ll_file_data_put(fd);
/* For regular files, pick up any async write errors accumulated on
 * the cl_object so close() can report them. */
364 if (!S_ISDIR(inode->i_mode)) {
365 if (lli->lli_clob != NULL)
366 lov_read_and_clear_async_rc(lli->lli_clob);
367 lli->lli_async_rc = 0;
370 rc = ll_md_close(sbi->ll_md_exp, inode, file);
/* Fault-injection hook: optionally dump the debug log on close. */
372 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
373 libcfs_debug_dumplog();
/*
 * Enqueue an OPEN intent lock on the MDS for @file, optionally
 * carrying striping information @lmm/@lmmsize (used by setstripe).
 * On success the intent @itp holds the open disposition and, if
 * granted, the lock data is attached to the inode.
 */
378 static int ll_intent_file_open(struct file *file, void *lmm, int lmmsize,
379 struct lookup_intent *itp)
381 struct dentry *de = file->f_dentry;
382 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
383 struct dentry *parent = de->d_parent;
384 const char *name = NULL;
386 struct md_op_data *op_data;
387 struct ptlrpc_request *req = NULL;
391 LASSERT(parent != NULL);
392 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
394 /* if server supports open-by-fid, or file name is invalid, don't pack
395 * name in open request */
396 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
397 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
398 name = de->d_name.name;
399 len = de->d_name.len;
402 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
403 name, len, 0, LUSTRE_OPC_ANY, NULL);
405 RETURN(PTR_ERR(op_data));
406 op_data->op_data = lmm;
407 op_data->op_data_size = lmmsize;
409 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
410 &ll_md_blocking_ast, 0);
411 ll_finish_md_op_data(op_data);
413 /* reason for keeping our own exit path - don't flood the log
414 * with messages with -ESTALE errors.
/* If the server granted an open we did not end up using, release
 * the openhandle so it is not leaked. */
416 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
417 it_open_error(DISP_OPEN_OPEN, itp))
419 ll_release_openhandle(de, itp);
423 if (it_disposition(itp, DISP_LOOKUP_NEG))
424 GOTO(out, rc = -ENOENT);
426 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
427 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
428 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* Update/instantiate the inode from the reply, then attach the
 * granted lock (if any) to it. */
432 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
433 if (!rc && itp->d.lustre.it_lock_mode)
434 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
437 ptlrpc_req_finished(req);
438 ll_intent_drop_lock(itp);
/*
 * Fill an obd_client_handle from the MDS open reply carried in the
 * intent @it (file handle, fid, lease lock handle, open flags), then
 * register the open for replay in case of MDS recovery.
 */
443 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
444 struct obd_client_handle *och)
446 struct ptlrpc_request *req = it->d.lustre.it_data;
447 struct mdt_body *body;
449 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
450 och->och_fh = body->mbo_handle;
451 och->och_fid = body->mbo_fid1;
452 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
453 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
454 och->och_flags = it->it_flags;
456 return md_set_open_replay_data(md_exp, och, it);
/*
 * Complete the client-local part of an open: optionally fill the MDS
 * handle @och from the intent, then attach @fd to the file, seed the
 * readahead state, record the effective open mode and initialize the
 * cl-context bookkeeping.
 */
459 static int ll_local_open(struct file *file, struct lookup_intent *it,
460 struct ll_file_data *fd, struct obd_client_handle *och)
462 struct inode *inode = file->f_dentry->d_inode;
/* The file must not already carry private Lustre data. */
465 LASSERT(!LUSTRE_FPRIVATE(file));
472 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
477 LUSTRE_FPRIVATE(file) = fd;
478 ll_readahead_init(inode, &fd->fd_ras);
479 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
481 /* ll_cl_context initialize */
482 rwlock_init(&fd->fd_lock);
483 INIT_LIST_HEAD(&fd->fd_lccs);
488 /* Open a file, and (for the very first open) create objects on the OSTs at
489 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
490 * creation or open until ll_lov_setstripe() ioctl is called.
492 * If we already have the stripe MD locally then we don't request it in
493 * md_open(), by passing a lmm_size = 0.
495 * It is up to the application to ensure no other processes open this file
496 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
497 * used. We might be able to avoid races of that sort by getting lli_open_sem
498 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
499 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
501 int ll_file_open(struct inode *inode, struct file *file)
503 struct ll_inode_info *lli = ll_i2info(inode);
504 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
505 .it_flags = file->f_flags };
506 struct obd_client_handle **och_p = NULL;
507 __u64 *och_usecount = NULL;
508 struct ll_file_data *fd;
512 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
513 PFID(ll_inode2fid(inode)), inode, file->f_flags);
/* An intent may have been stashed by the lookup path; consume it. */
515 it = file->private_data; /* XXX: compat macro */
516 file->private_data = NULL; /* prevent ll_local_open assertion */
518 fd = ll_file_data_get();
520 GOTO(out_openerr, rc = -ENOMEM);
523 if (S_ISDIR(inode->i_mode))
524 ll_authorize_statahead(inode, fd);
/* The root dentry needs no MDS open; finish the local open only. */
526 if (inode->i_sb->s_root == file->f_dentry) {
527 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent from lookup: build our own OPEN intent below. */
531 if (!it || !it->d.lustre.it_disposition) {
532 /* Convert f_flags into access mode. We cannot use file->f_mode,
533 * because everything but O_ACCMODE mask was stripped from
535 if ((oit.it_flags + 1) & O_ACCMODE)
537 if (file->f_flags & O_TRUNC)
538 oit.it_flags |= FMODE_WRITE;
540 /* kernel only call f_op->open in dentry_open. filp_open calls
541 * dentry_open after call to open_namei that checks permissions.
542 * Only nfsd_open call dentry_open directly without checking
543 * permissions and because of that this code below is safe. */
544 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
545 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
547 /* We do not want O_EXCL here, presumably we opened the file
548 * already? XXX - NFS implications? */
549 oit.it_flags &= ~O_EXCL;
551 /* bug20584, if "it_flags" contains O_CREAT, the file will be
552 * created if necessary, then "IT_CREAT" should be set to keep
553 * consistent with it */
554 if (oit.it_flags & O_CREAT)
555 oit.it_op |= IT_CREAT;
561 /* Let's see if we have file open on MDS already. */
562 if (it->it_flags & FMODE_WRITE) {
563 och_p = &lli->lli_mds_write_och;
564 och_usecount = &lli->lli_open_fd_write_count;
565 } else if (it->it_flags & FMODE_EXEC) {
566 och_p = &lli->lli_mds_exec_och;
567 och_usecount = &lli->lli_open_fd_exec_count;
569 och_p = &lli->lli_mds_read_och;
570 och_usecount = &lli->lli_open_fd_read_count;
573 mutex_lock(&lli->lli_och_mutex);
574 if (*och_p) { /* Open handle is present */
575 if (it_disposition(it, DISP_OPEN_OPEN)) {
576 /* Well, there's extra open request that we do not need,
577 let's close it somehow. This will decref request. */
578 rc = it_open_error(DISP_OPEN_OPEN, it);
580 mutex_unlock(&lli->lli_och_mutex);
581 GOTO(out_openerr, rc);
584 ll_release_openhandle(file->f_dentry, it);
/* Reuse the existing MDS handle for this local open. */
588 rc = ll_local_open(file, it, fd, NULL);
591 mutex_unlock(&lli->lli_och_mutex);
592 GOTO(out_openerr, rc);
595 LASSERT(*och_usecount == 0);
596 if (!it->d.lustre.it_disposition) {
597 /* We cannot just request lock handle now, new ELC code
598 means that one of other OPEN locks for this file
599 could be cancelled, and since blocking ast handler
600 would attempt to grab och_mutex as well, that would
601 result in a deadlock */
602 mutex_unlock(&lli->lli_och_mutex);
604 * Normally called under two situations:
606 * 2. A race/condition on MDS resulting in no open
607 * handle to be returned from LOOKUP|OPEN request,
608 * for example if the target entry was a symlink.
610 * Always fetch MDS_OPEN_LOCK if this is not setstripe.
612 * Always specify MDS_OPEN_BY_FID because we don't want
613 * to get file with different fid.
615 it->it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID;
616 rc = ll_intent_file_open(file, NULL, 0, it);
618 GOTO(out_openerr, rc);
622 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
624 GOTO(out_och_free, rc = -ENOMEM);
628 /* md_intent_lock() didn't get a request ref if there was an
629 * open error, so don't do cleanup on the request here
631 /* XXX (green): Should not we bail out on any error here, not
632 * just open error? */
633 rc = it_open_error(DISP_OPEN_OPEN, it);
635 GOTO(out_och_free, rc);
637 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
638 "inode %p: disposition %x, status %d\n", inode,
639 it_disposition(it, ~0), it->d.lustre.it_status);
641 rc = ll_local_open(file, it, fd, *och_p);
643 GOTO(out_och_free, rc);
645 mutex_unlock(&lli->lli_och_mutex);
648 /* Must do this outside lli_och_mutex lock to prevent deadlock where
649 different kind of OPEN lock for this same inode gets cancelled
650 by ldlm_cancel_lru */
651 if (!S_ISREG(inode->i_mode))
652 GOTO(out_och_free, rc);
/* Delay OST object creation when striping is absent and the open is
 * either O_LOV_DELAY_CREATE or read-only. */
656 if (!lli->lli_has_smd &&
657 (cl_is_lov_delay_create(file->f_flags) ||
658 (file->f_mode & FMODE_WRITE) == 0)) {
659 CDEBUG(D_INODE, "object creation was delayed\n");
660 GOTO(out_och_free, rc);
662 cl_lov_delay_create_clear(&file->f_flags);
663 GOTO(out_och_free, rc);
/* Error/exit path: free a half-initialized open handle. */
667 if (och_p && *och_p) {
668 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
669 *och_p = NULL; /* OBD_FREE writes some magic there */
672 mutex_unlock(&lli->lli_och_mutex);
675 if (lli->lli_opendir_key == fd)
676 ll_deauthorize_statahead(inode, fd);
678 ll_file_data_put(fd);
680 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* Drop the extra request reference held by the intent, if any. */
683 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
684 ptlrpc_req_finished(it->d.lustre.it_data);
685 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for a lease lock: when the server asks us to release
 * the lease, cancel the lock asynchronously.  Unlike the regular
 * ll_md_blocking_ast, this handler does not manage an openhandle.
 */
691 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
692 struct ldlm_lock_desc *desc, void *data, int flag)
695 struct lustre_handle lockh;
699 case LDLM_CB_BLOCKING:
700 ldlm_lock2handle(lock, &lockh);
701 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
703 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
707 case LDLM_CB_CANCELING:
715 * Acquire a lease and open the file.
717 static struct obd_client_handle *
718 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
721 struct lookup_intent it = { .it_op = IT_OPEN };
722 struct ll_sb_info *sbi = ll_i2sbi(inode);
723 struct md_op_data *op_data;
724 struct ptlrpc_request *req = NULL;
725 struct lustre_handle old_handle = { 0 };
726 struct obd_client_handle *och = NULL;
/* Leases are only granted for plain read or plain write opens. */
731 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
732 RETURN(ERR_PTR(-EINVAL));
735 struct ll_inode_info *lli = ll_i2info(inode);
736 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
737 struct obd_client_handle **och_p;
/* The requested lease mode must match how the file was opened,
 * and exec opens never take leases. */
740 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
741 RETURN(ERR_PTR(-EPERM));
743 /* Get the openhandle of the file */
745 mutex_lock(&lli->lli_och_mutex);
/* Only one lease per file descriptor. */
746 if (fd->fd_lease_och != NULL) {
747 mutex_unlock(&lli->lli_och_mutex);
751 if (fd->fd_och == NULL) {
752 if (file->f_mode & FMODE_WRITE) {
753 LASSERT(lli->lli_mds_write_och != NULL);
754 och_p = &lli->lli_mds_write_och;
755 och_usecount = &lli->lli_open_fd_write_count;
757 LASSERT(lli->lli_mds_read_och != NULL);
758 och_p = &lli->lli_mds_read_och;
759 och_usecount = &lli->lli_open_fd_read_count;
/* Can only take over the shared handle when we are its sole user. */
761 if (*och_usecount == 1) {
768 mutex_unlock(&lli->lli_och_mutex);
769 if (rc < 0) /* more than 1 opener */
772 LASSERT(fd->fd_och != NULL);
773 old_handle = fd->fd_och->och_fh;
778 RETURN(ERR_PTR(-ENOMEM));
780 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
781 LUSTRE_OPC_ANY, NULL);
783 GOTO(out, rc = PTR_ERR(op_data));
785 /* To tell the MDT this openhandle is from the same owner */
786 op_data->op_handle = old_handle;
788 it.it_flags = fmode | open_flags;
789 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
790 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
791 &ll_md_blocking_lease_ast,
792 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
793 * it can be cancelled which may mislead applications that the lease is
795 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
796 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
797 * doesn't deal with openhandle, so normal openhandle will be leaked. */
798 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
799 ll_finish_md_op_data(op_data);
800 ptlrpc_req_finished(req);
802 GOTO(out_release_it, rc);
804 if (it_disposition(&it, DISP_LOOKUP_NEG))
805 GOTO(out_release_it, rc = -ENOENT);
807 rc = it_open_error(DISP_OPEN_OPEN, &it);
809 GOTO(out_release_it, rc);
811 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
812 ll_och_fill(sbi->ll_md_exp, &it, och);
/* A server that does not understand leases just did a normal open. */
814 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
815 GOTO(out_close, rc = -EOPNOTSUPP);
817 /* already get lease, handle lease lock */
818 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
819 if (it.d.lustre.it_lock_mode == 0 ||
820 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
821 /* open lock must return for lease */
822 CERROR(DFID "lease granted but no open lock, %d/"LPU64".\n",
823 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
824 it.d.lustre.it_lock_bits);
825 GOTO(out_close, rc = -EPROTO);
828 ll_intent_release(&it);
832 /* Cancel open lock */
833 if (it.d.lustre.it_lock_mode != 0) {
834 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
835 it.d.lustre.it_lock_mode);
836 it.d.lustre.it_lock_mode = 0;
837 och->och_lease_handle.cookie = 0ULL;
839 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
841 CERROR("%s: error closing file "DFID": %d\n",
842 ll_get_fsname(inode->i_sb, NULL, 0),
843 PFID(&ll_i2info(inode)->lli_fid), rc2);
844 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
846 ll_intent_release(&it);
854 * Release lease and close the file.
855 * It will check if the lease has ever broken.
857 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
860 struct ldlm_lock *lock;
861 bool cancelled = true;
/* Look up the lease lock to see whether the server already broke it. */
865 lock = ldlm_handle2lock(&och->och_lease_handle);
867 lock_res_and_lock(lock);
868 cancelled = ldlm_is_cancel(lock);
869 unlock_res_and_lock(lock);
873 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
874 PFID(&ll_i2info(inode)->lli_fid), cancelled);
/* If still held, cancel it ourselves before closing the handle. */
877 ldlm_cli_cancel(&och->och_lease_handle, 0);
878 if (lease_broken != NULL)
879 *lease_broken = cancelled;
881 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
/*
 * Merge MDS-provided attributes cached in ll_inode_info with the
 * OST-side attributes (size, blocks, timestamps) obtained via the
 * cl_object, taking the most recent timestamps.  Runs under the
 * inode size lock to keep size and times consistent.
 */
886 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
888 struct ll_inode_info *lli = ll_i2info(inode);
889 struct cl_object *obj = lli->lli_clob;
890 struct cl_attr *attr = vvp_env_thread_attr(env);
898 ll_inode_size_lock(inode);
900 /* merge timestamps the most recently obtained from mds with
901 timestamps obtained from osts */
902 LTIME_S(inode->i_atime) = lli->lli_atime;
903 LTIME_S(inode->i_mtime) = lli->lli_mtime;
904 LTIME_S(inode->i_ctime) = lli->lli_ctime;
906 atime = LTIME_S(inode->i_atime);
907 mtime = LTIME_S(inode->i_mtime);
908 ctime = LTIME_S(inode->i_ctime);
910 cl_object_attr_lock(obj);
911 rc = cl_object_attr_get(env, obj, attr);
912 cl_object_attr_unlock(obj);
915 GOTO(out_size_unlock, rc);
/* Keep the newer of the MDS and OST timestamps. */
917 if (atime < attr->cat_atime)
918 atime = attr->cat_atime;
920 if (ctime < attr->cat_ctime)
921 ctime = attr->cat_ctime;
923 if (mtime < attr->cat_mtime)
924 mtime = attr->cat_mtime;
926 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
927 PFID(&lli->lli_fid), attr->cat_size);
/* Size and blocks always come from the OSTs. */
929 i_size_write(inode, attr->cat_size);
930 inode->i_blocks = attr->cat_blocks;
932 LTIME_S(inode->i_atime) = atime;
933 LTIME_S(inode->i_mtime) = mtime;
934 LTIME_S(inode->i_ctime) = ctime;
937 ll_inode_size_unlock(inode);
/*
 * Decide whether atime updates should be suppressed for this file,
 * checking the open flags, inode flags, mount options and superblock
 * flags, in the same order the kernel's own atime logic does.
 */
942 static bool file_is_noatime(const struct file *file)
944 const struct vfsmount *mnt = file->f_path.mnt;
945 const struct inode *inode = file->f_path.dentry->d_inode;
947 /* Adapted from file_accessed() and touch_atime().*/
948 if (file->f_flags & O_NOATIME)
951 if (inode->i_flags & S_NOATIME)
954 if (IS_NOATIME(inode))
957 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
960 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
963 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/*
 * Initialize a cl_io for a read (@write == 0) or write (@write != 0)
 * on @file: propagate O_NONBLOCK/O_APPEND/O_SYNC/O_DIRECT semantics,
 * select the lock requirement policy and the noatime behaviour.
 */
969 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
971 struct inode *inode = file->f_dentry->d_inode;
973 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
975 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
976 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
977 file->f_flags & O_DIRECT ||
980 io->ci_obj = ll_i2info(inode)->lli_clob;
981 io->ci_lockreq = CILR_MAYBE;
/* nolock mounts never take DLM locks; O_APPEND writes must. */
982 if (ll_file_nolock(file)) {
983 io->ci_lockreq = CILR_NEVER;
984 io->ci_no_srvlock = 1;
985 } else if (file->f_flags & O_APPEND) {
986 io->ci_lockreq = CILR_MANDATORY;
989 io->ci_noatime = file_is_noatime(file);
/*
 * Common engine for all read/write entry points (normal and splice).
 * Sets up the cl_io, enforces the maximum file size on writes, takes
 * the per-file range lock for non-group-locked normal writes, runs
 * the io loop, restarts short generic io when nothing was transferred,
 * and tallies read/write statistics.  Returns bytes transferred or a
 * negative errno; *ppos is advanced on success.
 */
993 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
994 struct file *file, enum cl_io_type iot,
995 loff_t *ppos, size_t count)
997 struct inode *inode = file->f_dentry->d_inode;
998 struct ll_inode_info *lli = ll_i2info(inode);
1000 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1003 struct range_lock range;
1006 CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: "LPU64", count: %zu\n",
1007 file->f_dentry->d_name.name, iot, *ppos, count);
1010 io = vvp_env_thread_io(env);
1011 ll_io_init(io, file, iot == CIT_WRITE);
1013 /* The maximum Lustre file size is variable, based on the
1014 * OST maximum object size and number of stripes. This
1015 * needs another check in addition to the VFS checks earlier. */
1016 end = (io->u.ci_wr.wr_append ? i_size_read(inode) : *ppos) + count;
1017 if (end > ll_file_maxbytes(inode)) {
1019 CDEBUG(D_INODE, "%s: file "DFID" offset %llu > maxbytes "LPU64
1020 ": rc = %zd\n", ll_get_fsname(inode->i_sb, NULL, 0),
1021 PFID(&lli->lli_fid), end, ll_file_maxbytes(inode),
1026 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1027 struct vvp_io *vio = vvp_env_io(env);
1028 bool range_locked = false;
/* O_APPEND must serialize against all writers up to EOF. */
1030 if (file->f_flags & O_APPEND)
1031 range_lock_init(&range, 0, LUSTRE_EOF);
1033 range_lock_init(&range, *ppos, *ppos + count - 1);
1035 vio->vui_fd = LUSTRE_FPRIVATE(file);
1036 vio->vui_io_subtype = args->via_io_subtype;
1038 switch (vio->vui_io_subtype) {
1040 vio->vui_iov = args->u.normal.via_iov;
1041 vio->vui_nrsegs = args->u.normal.via_nrsegs;
1042 vio->vui_tot_nrsegs = vio->vui_nrsegs;
1043 vio->vui_iocb = args->u.normal.via_iocb;
/* Group-locked writes skip the range lock; the group lock
 * already excludes conflicting IO. */
1044 if ((iot == CIT_WRITE) &&
1045 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1046 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1048 result = range_lock(&lli->lli_write_tree,
1053 range_locked = true;
/* Block concurrent truncate for the duration of normal IO. */
1055 down_read(&lli->lli_trunc_sem);
1058 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1059 vio->u.splice.vui_flags = args->u.splice.via_flags;
1062 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1066 ll_cl_add(file, env, io);
1067 result = cl_io_loop(env, io);
1068 ll_cl_remove(file, env);
1070 if (args->via_io_subtype == IO_NORMAL)
1071 up_read(&lli->lli_trunc_sem);
1073 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1075 range_unlock(&lli->lli_write_tree, &range);
1078 /* cl_io_rw_init() handled IO */
1079 result = io->ci_result;
1082 if (io->ci_nob > 0) {
1083 result = io->ci_nob;
1084 *ppos = io->u.ci_wr.wr.crw_pos;
1088 cl_io_fini(env, io);
1089 /* If any bit been read/written (result != 0), we just return
1090 * short read/write instead of restart io. */
1091 if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1092 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zu\n",
1093 iot == CIT_READ ? "read" : "write",
1094 file->f_dentry->d_name.name, *ppos, count);
1095 LASSERTF(io->ci_nob == 0, "%zd\n", io->ci_nob);
1099 if (iot == CIT_READ) {
1101 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1102 LPROC_LL_READ_BYTES, result);
1103 } else if (iot == CIT_WRITE) {
1105 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1106 LPROC_LL_WRITE_BYTES, result);
1107 fd->fd_write_failed = false;
/* Remember write failures (other than signal restarts) so a later
 * close can report them. */
1108 } else if (result != -ERESTARTSYS) {
1109 fd->fd_write_failed = true;
1112 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1119 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/* Validate an iovec array: reject negative lengths or cumulative
 * overflow, trim *nr_segs at the first inaccessible segment, and
 * return the total byte count in *count. */
1121 static int ll_file_get_iov_count(const struct iovec *iov,
1122 unsigned long *nr_segs, size_t *count)
1127 for (seg = 0; seg < *nr_segs; seg++) {
1128 const struct iovec *iv = &iov[seg];
1131 * If any segment has a negative length, or the cumulative
1132 * length ever wraps negative then return -EINVAL.
1135 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1137 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1142 cnt -= iv->iov_len; /* This segment is no good */
/*
 * aio/vectored read entry point: validate the iovec, then run the
 * common IO engine (ll_file_io_generic) with IO_NORMAL args in a
 * per-thread cl environment.
 */
1149 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1150 unsigned long nr_segs, loff_t pos)
1153 struct vvp_io_args *args;
1159 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1163 env = cl_env_get(&refcheck);
1165 RETURN(PTR_ERR(env));
1167 args = ll_env_args(env, IO_NORMAL);
1168 args->u.normal.via_iov = (struct iovec *)iov;
1169 args->u.normal.via_nrsegs = nr_segs;
1170 args->u.normal.via_iocb = iocb;
1172 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1173 &iocb->ki_pos, count);
1174 cl_env_put(env, &refcheck);
/*
 * Synchronous read(2) entry point: wrap the user buffer in a
 * single-segment iovec and a sync kiocb kept in the per-thread env,
 * then delegate to ll_file_aio_read.
 */
1178 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1182 struct iovec *local_iov;
1183 struct kiocb *kiocb;
1188 env = cl_env_get(&refcheck);
1190 RETURN(PTR_ERR(env));
1192 local_iov = &ll_env_info(env)->lti_local_iov;
1193 kiocb = &ll_env_info(env)->lti_kiocb;
1194 local_iov->iov_base = (void __user *)buf;
1195 local_iov->iov_len = count;
1196 init_sync_kiocb(kiocb, file);
1197 kiocb->ki_pos = *ppos;
/* Field name differs across kernel versions. */
1198 #ifdef HAVE_KIOCB_KI_LEFT
1199 kiocb->ki_left = count;
1201 kiocb->ki_nbytes = count;
1204 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1205 *ppos = kiocb->ki_pos;
1207 cl_env_put(env, &refcheck);
1212 * Write to a file (through the page cache).
/* aio/vectored write entry point; mirrors ll_file_aio_read but runs
 * the common IO engine with CIT_WRITE. */
1215 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1216 unsigned long nr_segs, loff_t pos)
1219 struct vvp_io_args *args;
1225 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1229 env = cl_env_get(&refcheck);
1231 RETURN(PTR_ERR(env));
1233 args = ll_env_args(env, IO_NORMAL);
1234 args->u.normal.via_iov = (struct iovec *)iov;
1235 args->u.normal.via_nrsegs = nr_segs;
1236 args->u.normal.via_iocb = iocb;
1238 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1239 &iocb->ki_pos, count);
1240 cl_env_put(env, &refcheck);
/*
 * Synchronous write(2) entry point: wrap the user buffer in a
 * single-segment iovec and a sync kiocb kept in the per-thread env,
 * then delegate to ll_file_aio_write.
 */
1244 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1245 size_t count, loff_t *ppos)
1248 struct iovec *local_iov;
1249 struct kiocb *kiocb;
1254 env = cl_env_get(&refcheck);
1256 RETURN(PTR_ERR(env));
1258 local_iov = &ll_env_info(env)->lti_local_iov;
1259 kiocb = &ll_env_info(env)->lti_kiocb;
1260 local_iov->iov_base = (void __user *)buf;
1261 local_iov->iov_len = count;
1262 init_sync_kiocb(kiocb, file);
1263 kiocb->ki_pos = *ppos;
/* Field name differs across kernel versions. */
1264 #ifdef HAVE_KIOCB_KI_LEFT
1265 kiocb->ki_left = count;
1267 kiocb->ki_nbytes = count;
1270 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1271 *ppos = kiocb->ki_pos;
1273 cl_env_put(env, &refcheck);
1278 * Send file content (through pagecache) somewhere with helper
/* splice_read entry point: drive the common IO engine with IO_SPLICE
 * args carrying the destination pipe and splice flags. */
1280 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1281 struct pipe_inode_info *pipe, size_t count,
1285 struct vvp_io_args *args;
1290 env = cl_env_get(&refcheck);
1292 RETURN(PTR_ERR(env));
1294 args = ll_env_args(env, IO_SPLICE);
1295 args->u.splice.via_pipe = pipe;
1296 args->u.splice.via_flags = flags;
1298 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1299 cl_env_put(env, &refcheck);
/*
 * Set striping information (LOV EA) on a file by re-opening it with an
 * intent that carries the user-supplied lov_user_md.
 *
 * The open is done by FID (MDS_OPEN_BY_FID) under the inode size lock; the
 * resulting open handle is released immediately afterwards since only the
 * EA-setting side effect is wanted.  The delay-create flag is cleared on
 * the way out regardless of the outcome.
 */
1303 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1304 			     __u64 flags, struct lov_user_md *lum,
1307 	struct lookup_intent oit = {
1309 		.it_flags = flags | MDS_OPEN_BY_FID,
1314 	ll_inode_size_lock(inode);
1315 	rc = ll_intent_file_open(file, lum, lum_size, &oit);
1317 		GOTO(out_unlock, rc);
	/* only the EA set matters; drop the open handle acquired above */
1319 	ll_release_openhandle(file->f_dentry, &oit);
1322 	ll_inode_size_unlock(inode);
1323 	ll_intent_release(&oit);
1324 	cl_lov_delay_create_clear(&file->f_flags);
/*
 * Fetch the LOV metadata (striping EA) of @filename from the MDS.
 *
 * On success *lmmp points into the reply buffer of *request (caller must
 * keep the request alive while using the lmm and eventually release it),
 * and *lmm_size is the EA size.  The EA arrives little-endian from the MDS
 * and is swabbed to host endianness for userspace on big-endian hosts.
 */
1329 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1330 			     struct lov_mds_md **lmmp, int *lmm_size,
1331 			     struct ptlrpc_request **request)
1333 	struct ll_sb_info *sbi = ll_i2sbi(inode);
1334 	struct mdt_body  *body;
1335 	struct lov_mds_md *lmm = NULL;
1336 	struct ptlrpc_request *req = NULL;
1337 	struct md_op_data *op_data;
	/* size the getattr reply buffer from the MDT default EA size */
1340 	rc = ll_get_default_mdsize(sbi, &lmmsize);
1344 	op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1345 				     strlen(filename), lmmsize,
1346 				     LUSTRE_OPC_ANY, NULL);
1347 	if (IS_ERR(op_data))
1348 		RETURN(PTR_ERR(op_data));
1350 	op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1351 	rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1352 	ll_finish_md_op_data(op_data);
1354 		CDEBUG(D_INFO, "md_getattr_name failed "
1355 		       "on %s: rc %d\n", filename, rc);
1359 	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1360 	LASSERT(body != NULL); /* checked by mdc_getattr_name */
1362 	lmmsize = body->mbo_eadatasize;
	/* no striping EA present -> nothing to return */
1364 	if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1366 		GOTO(out, rc = -ENODATA);
1369 	lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1370 	LASSERT(lmm != NULL);
	/* only plain V1/V3 LOV EAs are understood here */
1372 	if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1373 	    (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1374 		GOTO(out, rc = -EPROTO);
1378 	 * This is coming from the MDS, so is probably in
1379 	 * little endian.  We convert it to host endian before
1380 	 * passing it to userspace.
	/* true only on big-endian hosts: wire (LE) differs from host order */
1382 	if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1385 		stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1386 		if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1389 		/* if function called for directory - we should
1390 		 * avoid swab not existent lsm objects */
1391 		if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1392 			lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1393 			if (S_ISREG(body->mbo_mode))
1394 				lustre_swab_lov_user_md_objects(
1395 				    ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1397 		} else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1398 			lustre_swab_lov_user_md_v3(
1399 				(struct lov_user_md_v3 *)lmm);
1400 			if (S_ISREG(body->mbo_mode))
1401 				lustre_swab_lov_user_md_objects(
1402 				 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1409 	*lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: set a raw striping EA (with explicit objects,
 * MDS_OPEN_HAS_OBJS) from userspace.  Restricted to CAP_SYS_ADMIN.
 * Copies the lov_user_md (plus one ost_data entry) in, applies it via
 * ll_lov_setstripe_ea_info(), and frees the kernel copy on all paths.
 */
1414 static int ll_lov_setea(struct inode *inode, struct file *file,
1417 	__u64			 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1418 	struct lov_user_md	*lump;
1419 	int			 lum_size = sizeof(struct lov_user_md) +
1420 					    sizeof(struct lov_user_ost_data);
1424 	if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1427 	OBD_ALLOC_LARGE(lump, lum_size);
1431 	if (copy_from_user(lump, (struct lov_user_md __user *)arg, lum_size)) {
1432 		OBD_FREE_LARGE(lump, lum_size);
1436 	rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1438 	OBD_FREE_LARGE(lump, lum_size);
/*
 * Copy the file's striping information to the userspace buffer @lum by
 * asking the cl object layer (cl_object_getstripe) inside a cl env.
 */
1442 static int ll_file_getstripe(struct inode *inode,
1443 			     struct lov_user_md __user *lum)
1450 	env = cl_env_get(&refcheck);
1452 		RETURN(PTR_ERR(env));
1454 	rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum);
1455 	cl_env_put(env, &refcheck);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy the user's lov_user_md into a kernel
 * buffer, apply it as the file's striping EA, then refresh the layout
 * generation and write the resulting stripe info back to userspace.
 */
1459 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1462 	struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1463 	struct lov_user_md	  *klum;
1465 	__u64			   flags = FMODE_WRITE;
1468 	rc = ll_copy_user_md(lum, &klum);
1473 	rc = ll_lov_setstripe_ea_info(inode, file, flags, klum, lum_size);
	/* zero the user-visible stripe count first; getstripe below
	 * fills in the real layout afterwards */
1477 		put_user(0, &lum->lmm_stripe_count);
1479 		ll_layout_refresh(inode, &gen);
1480 		rc = ll_file_getstripe(inode, (struct lov_user_md __user *)arg);
1483 	OBD_FREE(klum, lum_size);
/*
 * LL_IOC_GROUP_LOCK handler: take a cluster-wide group lock with gid @arg
 * on behalf of this file descriptor.
 *
 * fd->fd_flags/fd_grouplock are protected by lli->lli_lock; the lock is
 * dropped around the (potentially blocking) cl_get_grouplock() call, so a
 * second check afterwards handles the race where another thread won.
 */
1488 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1490 	struct ll_inode_info   *lli = ll_i2info(inode);
1491 	struct ll_file_data	*fd = LUSTRE_FPRIVATE(file);
1492 	struct ll_grouplock	grouplock;
1497 		CWARN("group lock for group id 0 is not permitted\n" == NULL ?
		/* gid 0 is reserved: reject it */
		      "" : "group id for group lock must not be 0\n");
1501 	if (ll_file_nolock(file))
1502 		RETURN(-EOPNOTSUPP);
1504 	spin_lock(&lli->lli_lock);
1505 	if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1506 		CWARN("group lock already existed with gid %lu\n",
1507 		      fd->fd_grouplock.lg_gid);
1508 		spin_unlock(&lli->lli_lock);
1511 	LASSERT(fd->fd_grouplock.lg_lock == NULL);
1512 	spin_unlock(&lli->lli_lock);
	/* may block if O_NONBLOCK is not set on the file */
1514 	rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
1515 			      arg, (file->f_flags & O_NONBLOCK), &grouplock);
1519 	spin_lock(&lli->lli_lock);
	/* re-check under the lock: another thread may have raced us here */
1520 	if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1521 		spin_unlock(&lli->lli_lock);
1522 		CERROR("another thread just won the race\n");
1523 		cl_put_grouplock(&grouplock);
1527 	fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1528 	fd->fd_grouplock = grouplock;
1529 	spin_unlock(&lli->lli_lock);
1531 	CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK handler: drop the group lock with gid @arg that was
 * taken on this file descriptor by ll_get_grouplock().
 *
 * The fd state is cleared under lli->lli_lock; the actual lock release
 * (cl_put_grouplock) happens after dropping the spinlock, on a local copy.
 */
1535 static int ll_put_grouplock(struct inode *inode, struct file *file,
1538 	struct ll_inode_info   *lli = ll_i2info(inode);
1539 	struct ll_file_data	*fd = LUSTRE_FPRIVATE(file);
1540 	struct ll_grouplock	grouplock;
1543 	spin_lock(&lli->lli_lock);
1544 	if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1545 		spin_unlock(&lli->lli_lock);
1546 		CWARN("no group lock held\n");
1550 	LASSERT(fd->fd_grouplock.lg_lock != NULL);
	/* the gid passed in must match the one the lock was taken with */
1552 	if (fd->fd_grouplock.lg_gid != arg) {
1553 		CWARN("group lock %lu doesn't match current id %lu\n",
1554 		      arg, fd->fd_grouplock.lg_gid);
1555 		spin_unlock(&lli->lli_lock);
1559 	grouplock = fd->fd_grouplock;
1560 	memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1561 	fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1562 	spin_unlock(&lli->lli_lock);
	/* release outside the spinlock using the saved copy */
1564 	cl_put_grouplock(&grouplock);
1565 	CDEBUG(D_INFO, "group lock %lu released\n", arg);
1570  * Close inode open handle
1572  * \param dentry [in]     dentry which contains the inode
1573  * \param it     [in,out] intent which contains open info and result
1576  * \retval <0    failure
/*
 * Close the MDS open handle carried inside a lookup intent: allocate a
 * client handle, fill it from the intent, and close it.  Also drops the
 * open-request reference (DISP_ENQ_OPEN_REF) held in place of ll_file_open.
 * No-op for the filesystem root or if the intent holds no open.
 */
1578 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1580 	struct inode *inode = dentry->d_inode;
1581 	struct obd_client_handle *och;
1587 	/* Root ? Do nothing. */
1588 	if (dentry->d_inode->i_sb->s_root == dentry)
1591 	/* No open handle to close? Move away */
1592 	if (!it_disposition(it, DISP_OPEN_OPEN))
1595 	LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1597 	OBD_ALLOC(och, sizeof(*och));
1599 		GOTO(out, rc = -ENOMEM);
1601 	ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1603 	rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1606 	/* this one is in place of ll_file_open */
1607 	if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1608 		ptlrpc_req_finished(it->d.lustre.it_data);
1609 		it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1615  * Get size for inode for which FIEMAP mapping is requested.
1616  * Make the FIEMAP get_info call and returns the result.
1617  * \param fiemap	kernel buffer to hold extens
1618  * \param num_bytes	kernel buffer size
/*
 * Core FIEMAP implementation: validates the requested flags, optionally
 * syncs dirty pages (FIEMAP_FLAG_SYNC), glimpses the file size if needed,
 * and forwards the request to cl_object_fiemap() with a fiemap key built
 * from the inode's obdo and parent FID.
 */
1620 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
1626 	struct ll_fiemap_info_key fmkey = { .name = KEY_FIEMAP, };
1629 	/* Checks for fiemap flags */
1630 	if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
		/* report back which flags we do NOT support */
1631 		fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1635 	/* Check for FIEMAP_FLAG_SYNC */
1636 	if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1637 		rc = filemap_fdatawrite(inode->i_mapping);
1642 	env = cl_env_get(&refcheck);
1644 		RETURN(PTR_ERR(env));
	/* a zero cached size may simply be stale; glimpse from the OSTs */
1646 	if (i_size_read(inode) == 0) {
1647 		rc = ll_glimpse_size(inode);
1652 	fmkey.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1653 	obdo_from_inode(&fmkey.oa, inode, OBD_MD_FLSIZE);
1654 	obdo_set_parent_fid(&fmkey.oa, &ll_i2info(inode)->lli_fid);
1656 	/* If filesize is 0, then there would be no objects for mapping */
1657 	if (fmkey.oa.o_size == 0) {
1658 		fiemap->fm_mapped_extents = 0;
1662 	fmkey.fiemap = *fiemap;
1664 	rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
1665 			      &fmkey, fiemap, &num_bytes);
1667 	cl_env_put(env, &refcheck);
/*
 * OBD_IOC_FID2PATH handler: resolve a FID to a path via the MDC.
 *
 * Reads the caller's requested path buffer length, allocates a matching
 * getinfo_fid2path reply buffer, copies the request in, calls down through
 * obd_iocontrol(), and copies the full result back to userspace.  Access
 * requires CAP_DAC_READ_SEARCH unless the mount allows user fid2path.
 */
1671 int ll_fid2path(struct inode *inode, void __user *arg)
1673 	struct obd_export	*exp = ll_i2mdexp(inode);
1674 	const struct getinfo_fid2path __user *gfin = arg;
1676 	struct getinfo_fid2path *gfout;
1682 	if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1683 	    !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1686 	/* Only need to get the buflen */
1687 	if (get_user(pathlen, &gfin->gf_pathlen))
	/* bound the user-controlled allocation size */
1690 	if (pathlen > PATH_MAX)
1693 	outsize = sizeof(*gfout) + pathlen;
1694 	OBD_ALLOC(gfout, outsize);
1698 	if (copy_from_user(gfout, arg, sizeof(*gfout)))
1699 		GOTO(gf_free, rc = -EFAULT);
1701 	/* Call mdc_iocontrol */
1702 	rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1706 	if (copy_to_user(arg, gfout, outsize))
1710 	OBD_FREE(gfout, outsize);
/*
 * FSFILT_IOC_FIEMAP handler: userspace marshaling around ll_do_fiemap().
 *
 * Sizes a kernel fiemap buffer from the caller's fm_extent_count (with an
 * explicit overflow guard), copies the header (and first extent, which
 * carries continuation state) in, runs the mapping, and copies the header
 * plus mapped extents back out.
 */
1714 static int ll_ioctl_fiemap(struct inode *inode, struct fiemap __user *arg)
1716 	struct fiemap	*fiemap;
1722 	/* Get the extent count so we can calculate the size of
1723 	 * required fiemap buffer */
1724 	if (get_user(extent_count, &arg->fm_extent_count))
	/* reject extent counts whose buffer size would overflow size_t */
1728 	    (SIZE_MAX - sizeof(*fiemap)) / sizeof(struct ll_fiemap_extent))
1730 	num_bytes = sizeof(*fiemap) + (extent_count *
1731 				       sizeof(struct ll_fiemap_extent));
1733 	OBD_ALLOC_LARGE(fiemap, num_bytes);
1737 	/* get the fiemap value */
1738 	if (copy_from_user(fiemap, arg, sizeof(*fiemap)))
1739 		GOTO(error, rc = -EFAULT);
1741 	/* If fm_extent_count is non-zero, read the first extent since
1742 	 * it is used to calculate end_offset and device from previous
1744 	if (extent_count != 0) {
1745 		if (copy_from_user(&fiemap->fm_extents[0],
1746 				   (char __user *)arg + sizeof(*fiemap),
1747 				   sizeof(struct ll_fiemap_extent)))
1748 			GOTO(error, rc = -EFAULT);
1751 	rc = ll_do_fiemap(inode, fiemap, num_bytes);
1755 	ret_bytes = sizeof(struct fiemap);
1757 	if (extent_count != 0)
1758 		ret_bytes += (fiemap->fm_mapped_extents *
1759 			      sizeof(struct ll_fiemap_extent));
1761 	if (copy_to_user((void __user *)arg, fiemap, ret_bytes))
1765 	OBD_FREE_LARGE(fiemap, num_bytes);
1770  * Read the data_version for inode.
1772  * This value is computed using stripe object version on OST.
1773  * Version is computed using server side locking.
1775  * @param flags if do sync on the OST side;
1777  *		LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
1778  *		LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/*
 * Thin wrapper: delegates to cl_object_data_version() inside a cl env.
 * An inode with no cl object yet is treated as having version 0.
 */
1780 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
1787 	/* If no file object initialized, we consider its version is 0. */
1788 	if (ll_i2info(inode)->lli_clob == NULL) {
1793 	env = cl_env_get(&refcheck);
1795 		RETURN(PTR_ERR(env));
1797 	rc = cl_object_data_version(env, ll_i2info(inode)->lli_clob,
1798 				    data_version, flags);
1799 	cl_env_put(env, &refcheck);
1804  * Trigger a HSM release request for the provided inode.
/*
 * HSM release: take a write lease with MDS_OPEN_RELEASE, flush and fetch
 * the latest data_version (LL_DV_WR_FLUSH drops cached pages), merge the
 * attributes, then close the handle with the release request.  If the
 * close path fails, the lease itself is still torn down in the out path.
 */
1806 int ll_hsm_release(struct inode *inode)
1808 	struct cl_env_nest nest;
1810 	struct obd_client_handle *och = NULL;
1811 	__u64 data_version = 0;
1815 	CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
1816 	       ll_get_fsname(inode->i_sb, NULL, 0),
1817 	       PFID(&ll_i2info(inode)->lli_fid));
1819 	och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
1821 		GOTO(out, rc = PTR_ERR(och));
1823 	/* Grab latest data_version and [am]time values */
1824 	rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
1828 	env = cl_env_nested_get(&nest);
1830 		GOTO(out, rc = PTR_ERR(env));
1832 	ll_merge_attr(env, inode);
1833 	cl_env_nested_put(&nest, env);
1835 	/* Release the file.
1836 	 * NB: lease lock handle is released in mdc_hsm_release_pack() because
1837 	 * we still need it to pack l_remote_handle to MDT. */
1838 	rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
	/* error path: close the lease we opened above */
1844 	if (och != NULL && !IS_ERR(och)) /* close the file */
1845 		ll_lease_close(och, inode, NULL);
/*
 * Scratch state for ll_swap_layouts(): saved [am]times for both inodes,
 * the two inodes in FID order, and which data versions must be verified.
 */
1850 struct ll_swap_stack {
1851 	struct iattr		 ia1, ia2;
1853 	struct inode		*inode1, *inode2;
1854 	bool			 check_dv1, check_dv2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS implementation: atomically swap the layouts of
 * two regular files on the same filesystem.
 *
 * Both files must be writable by the caller.  Inodes are ordered by FID
 * to sequentialize concurrent swaps.  If a group-lock gid was supplied the
 * caller's dirty cache is flushed under group locks; if data-version
 * checks were requested the swap fails with -EAGAIN when either file
 * changed.  Optionally restores mtime/atime afterwards (KEEP_* flags).
 */
1857 static int ll_swap_layouts(struct file *file1, struct file *file2,
1858 			   struct lustre_swap_layouts *lsl)
1860 	struct mdc_swap_layouts	 msl;
1861 	struct md_op_data	*op_data;
1864 	struct ll_swap_stack	*llss = NULL;
1867 	OBD_ALLOC_PTR(llss);
1871 	llss->inode1 = file1->f_dentry->d_inode;
1872 	llss->inode2 = file2->f_dentry->d_inode;
1874 	if (!S_ISREG(llss->inode2->i_mode))
1875 		GOTO(free, rc = -EINVAL);
1877 	if (inode_permission(llss->inode1, MAY_WRITE) ||
1878 	    inode_permission(llss->inode2, MAY_WRITE))
1879 		GOTO(free, rc = -EPERM);
1881 	if (llss->inode2->i_sb != llss->inode1->i_sb)
1882 		GOTO(free, rc = -EXDEV);
1884 	/* we use 2 bool because it is easier to swap than 2 bits */
1885 	if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1886 		llss->check_dv1 = true;
1888 	if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1889 		llss->check_dv2 = true;
1891 	/* we cannot use lsl->sl_dvX directly because we may swap them */
1892 	llss->dv1 = lsl->sl_dv1;
1893 	llss->dv2 = lsl->sl_dv2;
	/* order the two inodes by FID so concurrent swaps serialize */
1895 	rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
1896 	if (rc == 0) /* same file, done! */
1899 	if (rc < 0) { /* sequentialize it */
1900 		swap(llss->inode1, llss->inode2);
1902 		swap(llss->dv1, llss->dv2);
1903 		swap(llss->check_dv1, llss->check_dv2);
1907 	if (gid != 0) { /* application asks to flush dirty cache */
1908 		rc = ll_get_grouplock(llss->inode1, file1, gid);
1912 		rc = ll_get_grouplock(llss->inode2, file2, gid);
			/* second grouplock failed: undo the first */
1914 			ll_put_grouplock(llss->inode1, file1, gid);
1919 	/* to be able to restore mtime and atime after swap
1920 	 * we need to first save them */
1922 	    (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
1923 		llss->ia1.ia_mtime = llss->inode1->i_mtime;
1924 		llss->ia1.ia_atime = llss->inode1->i_atime;
1925 		llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
1926 		llss->ia2.ia_mtime = llss->inode2->i_mtime;
1927 		llss->ia2.ia_atime = llss->inode2->i_atime;
1928 		llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
1931 	/* ultimate check, before swaping the layouts we check if
1932 	 * dataversion has changed (if requested) */
1933 	if (llss->check_dv1) {
1934 		rc = ll_data_version(llss->inode1, &dv, 0);
1937 		if (dv != llss->dv1)
1938 			GOTO(putgl, rc = -EAGAIN);
1941 	if (llss->check_dv2) {
1942 		rc = ll_data_version(llss->inode2, &dv, 0);
1945 		if (dv != llss->dv2)
1946 			GOTO(putgl, rc = -EAGAIN);
1949 	/* struct md_op_data is used to send the swap args to the mdt
1950 	 * only flags is missing, so we use struct mdc_swap_layouts
1951 	 * through the md_op_data->op_data */
1952 	/* flags from user space have to be converted before they are send to
1953 	 * server, no flag is sent today, they are only used on the client */
1956 	op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
1957 				     0, LUSTRE_OPC_ANY, &msl);
1958 	if (IS_ERR(op_data))
1959 		GOTO(free, rc = PTR_ERR(op_data));
1961 	rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
1962 			   sizeof(*op_data), op_data, NULL);
1963 	ll_finish_md_op_data(op_data);
	/* release group locks in reverse acquisition order */
1967 		ll_put_grouplock(llss->inode2, file2, gid);
1968 		ll_put_grouplock(llss->inode1, file1, gid);
1971 	/* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
1975 	/* clear useless flags */
1976 	if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
1977 		llss->ia1.ia_valid &= ~ATTR_MTIME;
1978 		llss->ia2.ia_valid &= ~ATTR_MTIME;
1981 	if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
1982 		llss->ia1.ia_valid &= ~ATTR_ATIME;
1983 		llss->ia2.ia_valid &= ~ATTR_ATIME;
1986 	/* update time if requested */
	/* ia2/ia1 are applied crosswise because inodes were swapped above */
1988 	if (llss->ia2.ia_valid != 0) {
1989 		mutex_lock(&llss->inode1->i_mutex);
1990 		rc = ll_setattr(file1->f_dentry, &llss->ia2);
1991 		mutex_unlock(&llss->inode1->i_mutex);
1994 	if (llss->ia1.ia_valid != 0) {
1997 		mutex_lock(&llss->inode2->i_mutex);
1998 		rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
1999 		mutex_unlock(&llss->inode2->i_mutex);
/*
 * LL_IOC_HSM_STATE_SET implementation: validate the requested HSM flag
 * mask and archive id, then forward the hsm_state_set to the MDT via
 * obd_iocontrol().  Non-root callers may only touch HSM_USER_MASK flags.
 */
2011 static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2013 	struct md_op_data	*op_data;
2017 	/* Detect out-of range masks */
2018 	if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2021 	/* Non-root users are forbidden to set or clear flags which are
2022 	 * NOT defined in HSM_USER_MASK. */
2023 	if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2024 	    !cfs_capable(CFS_CAP_SYS_ADMIN))
2027 	/* Detect out-of range archive id */
2028 	if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2029 	    (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2032 	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2033 				     LUSTRE_OPC_ANY, hss);
2034 	if (IS_ERR(op_data))
2035 		RETURN(PTR_ERR(op_data));
2037 	rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2038 			   sizeof(*op_data), op_data, NULL);
2040 	ll_finish_md_op_data(op_data);
/*
 * LL_IOC_HSM_IMPORT implementation: mark an (already created) regular file
 * as archived+exists+released in the requested archive, then force its
 * attributes (mode/uid/gid/size/times) to the values recorded in the
 * hsm_user_import descriptor via ll_setattr_raw().
 */
2045 static int ll_hsm_import(struct inode *inode, struct file *file,
2046 			 struct hsm_user_import *hui)
2048 	struct hsm_state_set	*hss = NULL;
2049 	struct iattr		*attr = NULL;
2053 	if (!S_ISREG(inode->i_mode))
2059 		GOTO(out, rc = -ENOMEM);
	/* set HSM state: file exists in the given archive and is released */
2061 	hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2062 	hss->hss_archive_id = hui->hui_archive_id;
2063 	hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2064 	rc = ll_hsm_state_set(inode, hss);
2068 	OBD_ALLOC_PTR(attr);
2070 		GOTO(out, rc = -ENOMEM);
2072 	attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2073 	attr->ia_mode |= S_IFREG;
2074 	attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2075 	attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2076 	attr->ia_size = hui->hui_size;
2077 	attr->ia_mtime.tv_sec = hui->hui_mtime;
2078 	attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2079 	attr->ia_atime.tv_sec = hui->hui_atime;
2080 	attr->ia_atime.tv_nsec = hui->hui_atime_ns;
	/* ATTR_FORCE: apply even though the caller may not own the file */
2082 	attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2083 			 ATTR_UID | ATTR_GID |
2084 			 ATTR_MTIME | ATTR_MTIME_SET |
2085 			 ATTR_ATIME | ATTR_ATIME_SET;
2087 	mutex_lock(&inode->i_mutex);
2089 	rc = ll_setattr_raw(file->f_dentry, attr, true);
2093 	mutex_unlock(&inode->i_mutex);
/*
 * Map an open fmode to the LL_LEASE_{RD,WR}LCK bitmask reported to
 * userspace by the lease ioctls.
 */
2105 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2107 	return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2108 	       ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * Main ioctl dispatcher for regular files.
 *
 * Handles llite-private ioctls (flags, striping, layout swap, group locks,
 * FID/path translation, data version, HSM state/action/import, leases,
 * fiemap) and falls through to the dynamically registered ioctl handlers
 * (ll_iocontrol_call) and finally to the OSC/MDC via obd_iocontrol().
 * tty ioctls are rejected early with the conventional return for them.
 */
2112 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2114 	struct inode		*inode = file->f_dentry->d_inode;
2115 	struct ll_file_data	*fd = LUSTRE_FPRIVATE(file);
2119 	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2120 	       PFID(ll_inode2fid(inode)), inode, cmd);
2121 	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2123 	/* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2124 	if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2128 	case LL_IOC_GETFLAGS:
2129 		/* Get the current value of the file flags */
2130 		return put_user(fd->fd_flags, (int __user *)arg);
2131 	case LL_IOC_SETFLAGS:
2132 	case LL_IOC_CLRFLAGS:
2133 		/* Set or clear specific file flags */
2134 		/* XXX This probably needs checks to ensure the flags are
2135 		 * not abused, and to handle any flag side effects.
2137 		if (get_user(flags, (int __user *) arg))
2140 		if (cmd == LL_IOC_SETFLAGS) {
			/* locking can only be disabled on O_DIRECT files */
2141 			if ((flags & LL_FILE_IGNORE_LOCK) &&
2142 			    !(file->f_flags & O_DIRECT)) {
2143 				CERROR("%s: unable to disable locking on "
2144 				       "non-O_DIRECT file\n", current->comm);
2148 			fd->fd_flags |= flags;
2150 			fd->fd_flags &= ~flags;
2153 	case LL_IOC_LOV_SETSTRIPE:
2154 		RETURN(ll_lov_setstripe(inode, file, arg));
2155 	case LL_IOC_LOV_SETEA:
2156 		RETURN(ll_lov_setea(inode, file, arg));
2157 	case LL_IOC_LOV_SWAP_LAYOUTS: {
2159 		struct lustre_swap_layouts lsl;
2161 		if (copy_from_user(&lsl, (char __user *)arg,
2162 				   sizeof(struct lustre_swap_layouts)))
		/* both file descriptors must be writable for a swap */
2165 		if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2168 		file2 = fget(lsl.sl_fd);
2173 		if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2174 			rc = ll_swap_layouts(file, file2, &lsl);
2178 	case LL_IOC_LOV_GETSTRIPE:
2179 		RETURN(ll_file_getstripe(inode,
2180 					 (struct lov_user_md __user *)arg));
2181 	case FSFILT_IOC_FIEMAP:
2182 		RETURN(ll_ioctl_fiemap(inode, (struct fiemap __user *)arg));
2183 	case FSFILT_IOC_GETFLAGS:
2184 	case FSFILT_IOC_SETFLAGS:
2185 		RETURN(ll_iocontrol(inode, file, cmd, arg));
2186 	case FSFILT_IOC_GETVERSION_OLD:
2187 	case FSFILT_IOC_GETVERSION:
2188 		RETURN(put_user(inode->i_generation, (int __user *)arg));
2189 	case LL_IOC_GROUP_LOCK:
2190 		RETURN(ll_get_grouplock(inode, file, arg));
2191 	case LL_IOC_GROUP_UNLOCK:
2192 		RETURN(ll_put_grouplock(inode, file, arg));
2193 	case IOC_OBD_STATFS:
2194 		RETURN(ll_obd_statfs(inode, (void __user *)arg));
2196 	/* We need to special case any other ioctls we want to handle,
2197 	 * to send them to the MDS/OST as appropriate and to properly
2198 	 * network encode the arg field.
2199 	case FSFILT_IOC_SETVERSION_OLD:
2200 	case FSFILT_IOC_SETVERSION:
2202 	case LL_IOC_FLUSHCTX:
2203 		RETURN(ll_flush_ctx(inode));
2204 	case LL_IOC_PATH2FID: {
2205 		if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2206 				 sizeof(struct lu_fid)))
2211 	case LL_IOC_GETPARENT:
2212 		RETURN(ll_getparent(file, (struct getparent __user *)arg));
2214 	case OBD_IOC_FID2PATH:
2215 		RETURN(ll_fid2path(inode, (void __user *)arg));
2216 	case LL_IOC_DATA_VERSION: {
2217 		struct ioc_data_version	idv;
2220 		if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
		/* only the flush flags are meaningful from userspace */
2223 		idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2224 		rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2227 		    copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2233 	case LL_IOC_GET_MDTIDX: {
2236 		mdtidx = ll_get_mdt_idx(inode);
2240 		if (put_user((int)mdtidx, (int __user *)arg))
2245 	case OBD_IOC_GETDTNAME:
2246 	case OBD_IOC_GETMDNAME:
2247 		RETURN(ll_get_obd_name(inode, cmd, arg));
2248 	case LL_IOC_HSM_STATE_GET: {
2249 		struct md_op_data	*op_data;
2250 		struct hsm_user_state	*hus;
2257 		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2258 					     LUSTRE_OPC_ANY, hus);
2259 		if (IS_ERR(op_data)) {
2261 			RETURN(PTR_ERR(op_data));
2264 		rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2267 		if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2270 		ll_finish_md_op_data(op_data);
2274 	case LL_IOC_HSM_STATE_SET: {
2275 		struct hsm_state_set	*hss;
2282 		if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2287 		rc = ll_hsm_state_set(inode, hss);
2292 	case LL_IOC_HSM_ACTION: {
2293 		struct md_op_data		*op_data;
2294 		struct hsm_current_action	*hca;
2301 		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2302 					     LUSTRE_OPC_ANY, hca);
2303 		if (IS_ERR(op_data)) {
2305 			RETURN(PTR_ERR(op_data));
2308 		rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2311 		if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2314 		ll_finish_md_op_data(op_data);
2318 	case LL_IOC_SET_LEASE: {
2319 		struct ll_inode_info *lli = ll_i2info(inode);
2320 		struct obd_client_handle *och = NULL;
		/* requested lease mode must be compatible with the open mode */
2325 		case LL_LEASE_WRLCK:
2326 			if (!(file->f_mode & FMODE_WRITE))
2328 			fmode = FMODE_WRITE;
2330 		case LL_LEASE_RDLCK:
2331 			if (!(file->f_mode & FMODE_READ))
2335 		case LL_LEASE_UNLCK:
2336 			mutex_lock(&lli->lli_och_mutex);
2337 			if (fd->fd_lease_och != NULL) {
2338 				och = fd->fd_lease_och;
2339 				fd->fd_lease_och = NULL;
2341 			mutex_unlock(&lli->lli_och_mutex);
2346 			fmode = och->och_flags;
2347 			rc = ll_lease_close(och, inode, &lease_broken);
			/* report the mode of the lease that was just closed */
2354 			RETURN(ll_lease_type_from_fmode(fmode));
2359 		CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2361 		/* apply for lease */
2362 		och = ll_lease_open(inode, file, fmode, 0);
2364 			RETURN(PTR_ERR(och));
2367 		mutex_lock(&lli->lli_och_mutex);
2368 		if (fd->fd_lease_och == NULL) {
2369 			fd->fd_lease_och = och;
2372 		mutex_unlock(&lli->lli_och_mutex);
2374 			/* impossible now that only excl is supported for now */
2375 			ll_lease_close(och, inode, &lease_broken);
2380 	case LL_IOC_GET_LEASE: {
2381 		struct ll_inode_info	*lli = ll_i2info(inode);
2382 		struct ldlm_lock	*lock = NULL;
2385 		mutex_lock(&lli->lli_och_mutex);
2386 		if (fd->fd_lease_och != NULL) {
2387 			struct obd_client_handle *och = fd->fd_lease_och;
2389 			lock = ldlm_handle2lock(&och->och_lease_handle);
2391 				lock_res_and_lock(lock);
				/* a cancelled lease no longer counts */
2392 				if (!ldlm_is_cancel(lock))
2393 					fmode = och->och_flags;
2395 				unlock_res_and_lock(lock);
2396 				LDLM_LOCK_PUT(lock);
2399 		mutex_unlock(&lli->lli_och_mutex);
2401 		RETURN(ll_lease_type_from_fmode(fmode));
2403 	case LL_IOC_HSM_IMPORT: {
2404 		struct hsm_user_import *hui;
2410 		if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
2415 		rc = ll_hsm_import(inode, file, hui);
		/* unknown cmd: try dynamically registered handlers first */
2425 		     ll_iocontrol_call(inode, file, cmd, arg, &err))
2428 		RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2429 				     (void __user *)arg));
2434 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Compat copy of the kernel's lseek finalizer (for kernels without
 * generic_file_llseek_size): validate the offset against the sign and
 * size limits, then commit it to f_pos and reset f_version.
 */
2435 static inline loff_t
2436 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2438 	if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2440 	if (offset > maxsize)
2443 	if (offset != file->f_pos) {
2444 		file->f_pos = offset;
		/* f_version caches directory position state; invalidate it */
2445 		file->f_version = 0;
/*
 * Compat implementation of generic_file_llseek_size() for older kernels:
 * handles SEEK_CUR relative to f_pos (with the zero-offset fast path),
 * SEEK_END/SEEK_DATA/SEEK_HOLE against the supplied @eof, and clamps the
 * result via llseek_execute().
 */
2451 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2452 			 loff_t maxsize, loff_t eof)
2454 	struct inode *inode = file->f_dentry->d_inode;
2462 		 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2463 		 * position-querying operation.  Avoid rewriting the "same"
2464 		 * f_pos value back to the file because a concurrent read(),
2465 		 * write() or lseek() might have altered it
2470 		 * f_lock protects against read/modify/write race with other
2471 		 * SEEK_CURs. Note that parallel writes and reads behave
		/* serialize the read-modify-write of f_pos */
2474 		mutex_lock(&inode->i_mutex);
2475 		offset = llseek_execute(file, file->f_pos + offset, maxsize);
2476 		mutex_unlock(&inode->i_mutex);
2480 		 * In the generic case the entire file is data, so as long as
2481 		 * offset isn't at the end of the file then the offset is data.
2488 		 * There is a virtual hole at the end of the file, so as long as
2489 		 * offset isn't i_size or larger, return i_size.
2497 	return llseek_execute(file, offset, maxsize);
/*
 * llseek entry point.  For SEEK_END/SEEK_HOLE/SEEK_DATA the size must be
 * current cluster-wide, so glimpse it from the OSTs first; then defer to
 * the (possibly compat) generic_file_llseek_size with the Lustre max file
 * size as the limit.
 */
2501 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2503 	struct inode *inode = file->f_dentry->d_inode;
2504 	loff_t retval, eof = 0;
	/* computed only for the trace message below */
2507 	retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2508 			   (origin == SEEK_CUR) ? file->f_pos : 0);
2509 	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2510 	       PFID(ll_inode2fid(inode)), inode, retval, retval,
2512 	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2514 	if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2515 		retval = ll_glimpse_size(inode);
2518 		eof = i_size_read(inode);
2521 	retval = ll_generic_file_llseek_size(file, offset, origin,
2522 					  ll_file_maxbytes(inode), eof);
/*
 * flush() file operation (called on close(2)).
 *
 * Does not write anything itself; it only harvests async write errors
 * recorded on the inode and the cl object so close() can report them.
 * Errors already reported through fd_write_failed are not reported twice.
 */
2526 static int ll_flush(struct file *file, fl_owner_t id)
2528 	struct inode *inode = file->f_dentry->d_inode;
2529 	struct ll_inode_info *lli = ll_i2info(inode);
2530 	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2533 	LASSERT(!S_ISDIR(inode->i_mode));
2535 	/* catch async errors that were recorded back when async writeback
2536 	 * failed for pages in this mapping. */
2537 	rc = lli->lli_async_rc;
2538 	lli->lli_async_rc = 0;
2539 	if (lli->lli_clob != NULL) {
2540 		err = lov_read_and_clear_async_rc(lli->lli_clob);
2545 	/* The application has been told write failure already.
2546 	 * Do not report failure again. */
2547 	if (fd->fd_write_failed)
	/* collapse any recorded error to the conventional -EIO */
2549 	return rc ? -EIO : 0;
2553  * Called to make sure a portion of file has been written out.
2554  * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2556  * Return how many pages have been written.
/*
 * Run a CIT_FSYNC cl_io over [start, end] of the inode in the given fsync
 * mode.  On success the number of pages written (fi_nr_written) is
 * returned; on failure the io result.  @ignore_layout lets the sync
 * proceed across a layout change.
 */
2558 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2559 		       enum cl_fsync_mode mode, int ignore_layout)
2561 	struct cl_env_nest nest;
2564 	struct obd_capa *capa = NULL;
2565 	struct cl_fsync_io *fio;
	/* only the four defined fsync modes are accepted */
2569 	if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2570 	    mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2573 	env = cl_env_nested_get(&nest);
2575 		RETURN(PTR_ERR(env));
2577 	capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2579 	io = vvp_env_thread_io(env);
2580 	io->ci_obj = ll_i2info(inode)->lli_clob;
2581 	io->ci_ignore_layout = ignore_layout;
2583 	/* initialize parameters for sync */
2584 	fio = &io->u.ci_fsync;
2585 	fio->fi_capa = capa;
2586 	fio->fi_start = start;
2588 	fio->fi_fid = ll_inode2fid(inode);
2589 	fio->fi_mode = mode;
2590 	fio->fi_nr_written = 0;
2592 	if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2593 		result = cl_io_loop(env, io);
2595 		result = io->ci_result;
2597 		result = fio->fi_nr_written;
2598 	cl_io_fini(env, io);
2599 	cl_env_nested_put(&nest, env);
2607  * When dentry is provided (the 'else' case), *file->f_dentry may be
2608  * null and dentry must be used directly rather than pulled from
2609  * *file->f_dentry as is done otherwise.
/*
 * fsync() file operation (three kernel-version signatures).
 *
 * Waits for in-flight page I/O, harvests recorded async write errors,
 * sends an MDS fsync (md_fsync) for the metadata, and for regular files
 * runs a CL_FSYNC_ALL range sync to the OSTs.  fd_write_failed is updated
 * to reflect whether a write error remains outstanding.
 */
2612 #ifdef HAVE_FILE_FSYNC_4ARGS
2613 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2615 	struct dentry *dentry = file->f_dentry;
2616 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2617 int ll_fsync(struct file *file, int datasync)
2619 	struct dentry *dentry = file->f_dentry;
2621 	loff_t end = LLONG_MAX;
2623 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2626 	loff_t end = LLONG_MAX;
2628 	struct inode *inode = dentry->d_inode;
2629 	struct ll_inode_info *lli = ll_i2info(inode);
2630 	struct ptlrpc_request *req;
2631 	struct obd_capa *oc;
2635 	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2636 	       PFID(ll_inode2fid(inode)), inode);
2637 	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2639 #ifdef HAVE_FILE_FSYNC_4ARGS
2640 	rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2641 	mutex_lock(&inode->i_mutex);
2643 	/* fsync's caller has already called _fdata{sync,write}, we want
2644 	 * that IO to finish before calling the osc and mdc sync methods */
2645 	rc = filemap_fdatawait(inode->i_mapping);
2648 	/* catch async errors that were recorded back when async writeback
2649 	 * failed for pages in this mapping. */
2650 	if (!S_ISDIR(inode->i_mode)) {
2651 		err = lli->lli_async_rc;
2652 		lli->lli_async_rc = 0;
2655 			err = lov_read_and_clear_async_rc(lli->lli_clob);
2660 	oc = ll_mdscapa_get(inode);
	/* flush metadata on the MDS */
2661 	err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2667 		ptlrpc_req_finished(req);
2669 	if (S_ISREG(inode->i_mode)) {
2670 		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2672 		err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2673 		if (rc == 0 && err < 0)
		/* remember whether a write failure is still outstanding */
2676 			fd->fd_write_failed = true;
2678 			fd->fd_write_failed = false;
2681 #ifdef HAVE_FILE_FSYNC_4ARGS
2682 	mutex_unlock(&inode->i_mutex);
/*
 * Apply or test an advisory lock (BSD flock or POSIX fcntl) on @file by
 * translating the VFS struct file_lock into an LDLM flock enqueue sent
 * to the MDT.  On success the lock is also recorded locally via
 * flock_lock_file_wait()/posix_lock_file_wait() so the kernel's lock
 * bookkeeping stays in sync with the server.
 */
2688 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2690 struct inode *inode = file->f_dentry->d_inode;
2691 struct ll_sb_info *sbi = ll_i2sbi(inode);
2692 struct ldlm_enqueue_info einfo = {
2693 .ei_type = LDLM_FLOCK,
2694 .ei_cb_cp = ldlm_flock_completion_ast,
2695 .ei_cbdata = file_lock,
2697 struct md_op_data *op_data;
2698 struct lustre_handle lockh = {0};
2699 ldlm_policy_data_t flock = {{0}};
2700 int fl_type = file_lock->fl_type;
2706 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
2707 PFID(ll_inode2fid(inode)), file_lock);
2709 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2711 if (file_lock->fl_flags & FL_FLOCK) {
2712 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2713 /* flocks are whole-file locks */
2714 flock.l_flock.end = OFFSET_MAX;
2715 /* For flocks owner is determined by the local file descriptor */
2716 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2717 } else if (file_lock->fl_flags & FL_POSIX) {
2718 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2719 flock.l_flock.start = file_lock->fl_start;
2720 flock.l_flock.end = file_lock->fl_end;
2724 flock.l_flock.pid = file_lock->fl_pid;
2726 /* Somewhat ugly workaround for svc lockd.
2727 * lockd installs custom fl_lmops->lm_compare_owner that checks
2728 * for the fl_owner to be the same (which it always is on local node
2729 * I guess between lockd processes) and then compares pid.
2730 * As such we assign pid to the owner field to make it all work,
2731 * conflict with normal locks is unlikely since pid space and
2732 * pointer space for current->files are not intersecting */
2733 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2734 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* Map the fcntl lock type onto an LDLM mode: read lock -> PR,
 * write lock -> PW, unlock -> NL (see the comment below). */
2738 einfo.ei_mode = LCK_PR;
2741 /* An unlock request may or may not have any relation to
2742 * existing locks so we may not be able to pass a lock handle
2743 * via a normal ldlm_lock_cancel() request. The request may even
2744 * unlock a byte range in the middle of an existing lock. In
2745 * order to process an unlock request we need all of the same
2746 * information that is given with a normal read or write record
2747 * lock request. To avoid creating another ldlm unlock (cancel)
2748 * message we'll treat a LCK_NL flock request as an unlock. */
2749 einfo.ei_mode = LCK_NL;
2752 einfo.ei_mode = LCK_PW;
2755 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* Map the fcntl command onto enqueue flags: non-blocking SETLK
 * -> BLOCK_NOWAIT, GETLK -> TEST_LOCK. */
2770 flags = LDLM_FL_BLOCK_NOWAIT;
2776 flags = LDLM_FL_TEST_LOCK;
2779 CERROR("unknown fcntl lock command: %d\n", cmd);
2783 /* Save the old mode so that if the mode in the lock changes we
2784 * can decrement the appropriate reader or writer refcount. */
2785 file_lock->fl_type = einfo.ei_mode;
2787 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2788 LUSTRE_OPC_ANY, NULL);
2789 if (IS_ERR(op_data))
2790 RETURN(PTR_ERR(op_data));
2792 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags="LPX64", mode=%u, "
2793 "start="LPU64", end="LPU64"\n", PFID(ll_inode2fid(inode)),
2794 flock.l_flock.pid, flags, einfo.ei_mode,
2795 flock.l_flock.start, flock.l_flock.end);
2797 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data, &lockh,
2800 /* Restore the file lock type if not TEST lock. */
2801 if (!(flags & LDLM_FL_TEST_LOCK))
2802 file_lock->fl_type = fl_type;
/* Mirror a successful server-side lock (or any unlock) into the
 * kernel's local lock tables; TEST locks make no local change. */
2804 if ((file_lock->fl_flags & FL_FLOCK) &&
2805 (rc == 0 || file_lock->fl_type == F_UNLCK))
2806 rc2 = flock_lock_file_wait(file, file_lock);
2807 if ((file_lock->fl_flags & FL_POSIX) &&
2808 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2809 !(flags & LDLM_FL_TEST_LOCK))
2810 rc2 = posix_lock_file_wait(file, file_lock);
2812 if (rc2 && file_lock->fl_type != F_UNLCK) {
/* Local bookkeeping failed: drop the server lock we just took
 * by re-enqueueing it with LCK_NL (unlock), see comment above. */
2813 einfo.ei_mode = LCK_NL;
2814 md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data,
2819 ll_finish_md_op_data(op_data);
/*
 * Look up the FID of entry @name (length @namelen) under directory
 * @parent with a getattr-by-name RPC to the MDT.
 *
 * \param[in]  parent   parent directory inode
 * \param[in]  name     entry name, not necessarily NUL-terminated
 * \param[in]  namelen  length of @name
 * \param[out] fid      FID of the entry on success
 *
 * \retval 0 on success, negative errno on failure (-EFAULT if the
 *         reply lacks an MDT body)
 */
2824 int ll_get_fid_by_name(struct inode *parent, const char *name,
2825 int namelen, struct lu_fid *fid)
2827 struct md_op_data *op_data = NULL;
2828 struct mdt_body *body;
2829 struct ptlrpc_request *req;
2833 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
2834 LUSTRE_OPC_ANY, NULL);
2835 if (IS_ERR(op_data))
2836 RETURN(PTR_ERR(op_data));
/* Only the FID is needed from the getattr reply. */
2838 op_data->op_valid = OBD_MD_FLID;
2839 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
2840 ll_finish_md_op_data(op_data);
2844 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
2846 GOTO(out_req, rc = -EFAULT);
2848 *fid = body->mbo_fid1;
2850 ptlrpc_req_finished(req);
/*
 * Migrate directory entry @name under @parent to MDT @mdtidx.  The
 * migration is carried out as a rename-to-self with CLI_MIGRATE set.
 * The child's FID is resolved from the dcache when possible, otherwise
 * via ll_get_fid_by_name(); if the child is already on the target MDT
 * the call is a successful no-op.
 *
 * \retval 0 on success (including "already there"), negative errno
 *         otherwise
 */
2854 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
2855 const char *name, int namelen)
2857 struct dentry *dchild = NULL;
2858 struct inode *child_inode = NULL;
2859 struct md_op_data *op_data;
2860 struct ptlrpc_request *request = NULL;
2865 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
2866 name, PFID(ll_inode2fid(parent)), mdtidx);
2868 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
2869 0, LUSTRE_OPC_ANY, NULL);
2870 if (IS_ERR(op_data))
2871 RETURN(PTR_ERR(op_data));
2873 /* Get child FID first */
2874 qstr.hash = full_name_hash(name, namelen);
2877 dchild = d_lookup(file->f_dentry, &qstr);
2878 if (dchild != NULL) {
2879 if (dchild->d_inode != NULL) {
2880 child_inode = igrab(dchild->d_inode);
2881 if (child_inode != NULL) {
/* Hold i_mutex across the migration and drop cached
 * aliases so stale dentries don't survive the move. */
2882 mutex_lock(&child_inode->i_mutex);
2883 op_data->op_fid3 = *ll_inode2fid(child_inode);
2884 ll_invalidate_aliases(child_inode);
/* Not in the dcache: ask the MDT for the child's FID. */
2889 rc = ll_get_fid_by_name(parent, name, namelen,
2895 if (!fid_is_sane(&op_data->op_fid3)) {
2896 CERROR("%s: migrate %s , but fid "DFID" is insane\n",
2897 ll_get_fsname(parent->i_sb, NULL, 0), name,
2898 PFID(&op_data->op_fid3));
2899 GOTO(out_free, rc = -EINVAL);
2902 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
/* Already on the requested MDT: nothing to do. */
2907 CDEBUG(D_INFO, "%s:"DFID" is already on MDT%d.\n", name,
2908 PFID(&op_data->op_fid3), mdtidx);
2909 GOTO(out_free, rc = 0);
/* Migration is implemented as a same-name rename with the
 * CLI_MIGRATE flag and the target MDT index. */
2912 op_data->op_mds = mdtidx;
2913 op_data->op_cli_flags = CLI_MIGRATE;
2914 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
2915 namelen, name, namelen, &request);
2917 ll_update_times(request, parent);
2919 ptlrpc_req_finished(request);
2924 if (child_inode != NULL) {
/* The local inode is obsolete after migration; force it to be
 * re-fetched by clearing its link count. */
2925 clear_nlink(child_inode);
2926 mutex_unlock(&child_inode->i_mutex);
2930 ll_finish_md_op_data(op_data);
/* Lock-op stub used by "-o noflock" mounts (see
 * ll_file_operations_noflock below): flock/fcntl lock requests are
 * rejected instead of being sent to the MDT. */
2935 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2943 * test if some locks matching bits and l_req_mode are acquired
2944 * - bits can be in different locks
2945 * - if found clear the common lock bits in *bits
2946 * - the bits not found, are kept in *bits
2948 * \param bits [IN/OUT] searched lock bits; found bits are cleared
2949 * \param l_req_mode [IN] searched lock mode
2950 * \retval boolean, true iff all bits are found
2952 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
2954 struct lustre_handle lockh;
2955 ldlm_policy_data_t policy;
/* LCK_MINMODE means "any mode": match against all four. */
2956 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2957 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2966 fid = &ll_i2info(inode)->lli_fid;
2967 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2968 ldlm_lockname[mode]);
/* TEST_LOCK: only probe, do not take a reference on the match. */
2970 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* Probe each inodebit individually; stop once all requested bits
 * have been accounted for.
 * NOTE(review): "1 << i" is int-width even though *bits is __u64 --
 * fine only while MDS_INODELOCK_MAXSHIFT < 31; confirm. */
2971 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2972 policy.l_inodebits.bits = *bits & (1 << i);
2973 if (policy.l_inodebits.bits == 0)
2976 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2977 &policy, mode, &lockh)) {
2978 struct ldlm_lock *lock;
2980 lock = ldlm_handle2lock(&lockh);
/* Clear every bit the matched lock actually covers, not
 * just the one we probed with. */
2983 ~(lock->l_policy_data.l_inodebits.bits);
2984 LDLM_LOCK_PUT(lock);
2986 *bits &= ~policy.l_inodebits.bits;
/*
 * Try to match (and reference) a granted MD lock on @inode covering
 * inodebits @bits in one of the modes in @mode.  On success the lock
 * handle is returned in @lockh and the matched mode is the return
 * value; 0 means no match.
 */
2993 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2994 struct lustre_handle *lockh, __u64 flags,
2997 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
3002 fid = &ll_i2info(inode)->lli_fid;
3003 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3005 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3006 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the result of a revalidation RPC: -ENOENT on a
 * non-regular, non-directory inode is treated as "already unlinked"
 * and downgraded to success; other errors are logged (quietly for
 * -EACCES/-EIDRM) and propagated.
 */
3011 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3013 /* Already unlinked. Just update nlink and return success */
3014 if (rc == -ENOENT) {
3016 /* This path cannot be hit for regular files unless in
3017 * case of obscure races, so no need to validate
3019 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3021 } else if (rc != 0) {
3022 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3023 "%s: revalidate FID "DFID" error: rc = %d\n",
3024 ll_get_fsname(inode->i_sb, NULL, 0),
3025 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate @dentry's inode attributes against the MDT for the lock
 * bits in @ibits.  Two paths:
 *  - OBD_CONNECT_ATTRFID servers: intent getattr/lookup by FID, which
 *    also refreshes the dentry (invalidating it if unlinked);
 *  - otherwise: a plain md_getattr, but only when no matching MD lock
 *    is already cached locally (ll_have_md_lock).
 */
3031 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3033 struct inode *inode = dentry->d_inode;
3034 struct ptlrpc_request *req = NULL;
3035 struct obd_export *exp;
3039 LASSERT(inode != NULL);
3041 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3042 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3044 exp = ll_i2mdexp(inode);
3046 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3047 * But under CMD case, it caused some lock issues, should be fixed
3048 * with new CMD ibits lock. See bug 12718 */
3049 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3050 struct lookup_intent oit = { .it_op = IT_GETATTR };
3051 struct md_op_data *op_data;
/* Lookup-only revalidation needs only the LOOKUP intent. */
3053 if (ibits == MDS_INODELOCK_LOOKUP)
3054 oit.it_op = IT_LOOKUP;
3056 /* Call getattr by fid, so do not provide name at all. */
3057 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3058 dentry->d_inode, NULL, 0, 0,
3059 LUSTRE_OPC_ANY, NULL);
3060 if (IS_ERR(op_data))
3061 RETURN(PTR_ERR(op_data));
3063 rc = md_intent_lock(exp, op_data, &oit, &req,
3064 &ll_md_blocking_ast, 0);
3065 ll_finish_md_op_data(op_data);
3067 rc = ll_inode_revalidate_fini(inode, rc);
3071 rc = ll_revalidate_it_finish(req, &oit, dentry);
3073 ll_intent_release(&oit);
3077 /* Unlinked? Unhash dentry, so it is not picked up later by
3078 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3079 here to preserve get_cwd functionality on 2.6.
3081 if (!dentry->d_inode->i_nlink)
3082 d_lustre_invalidate(dentry, 0);
3084 ll_lookup_finish_locks(&oit, dentry);
3085 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3086 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3087 u64 valid = OBD_MD_FLGETATTR;
3088 struct md_op_data *op_data;
/* Regular files also need the striping EA in the reply. */
3091 if (S_ISREG(inode->i_mode)) {
3092 rc = ll_get_default_mdsize(sbi, &ealen);
3095 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3098 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3099 0, ealen, LUSTRE_OPC_ANY,
3101 if (IS_ERR(op_data))
3102 RETURN(PTR_ERR(op_data));
3104 op_data->op_valid = valid;
3105 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
3106 * capa for this inode. Because we only keep capas of dirs
3108 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3109 ll_finish_md_op_data(op_data);
3111 rc = ll_inode_revalidate_fini(inode, rc);
3115 rc = ll_prep_inode(&inode, req, NULL, NULL);
3118 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge the attributes of all stripes (via
 * md_merge_attr) and apply the result -- nlink, blocks, size and
 * a/m/ctime -- to the local inode / ll_inode_info.
 */
3122 static int ll_merge_md_attr(struct inode *inode)
3124 struct cl_attr attr = { 0 };
3127 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3128 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3129 &attr, ll_md_blocking_ast);
3133 set_nlink(inode, attr.cat_nlink);
3134 inode->i_blocks = attr.cat_blocks;
3135 i_size_write(inode, attr.cat_size);
/* Times are cached in lli and copied to the inode by the caller. */
3137 ll_i2info(inode)->lli_atime = attr.cat_atime;
3138 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
3139 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/*
 * Full revalidation: refresh MD attributes via __ll_inode_revalidate,
 * then bring the size up to date -- merged stripe attrs for striped
 * directories, a glimpse RPC for regular files (skipped during HSM
 * restore, when the MDT already supplied the correct size).
 */
3145 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3147 struct inode *inode = dentry->d_inode;
3151 rc = __ll_inode_revalidate(dentry, ibits);
3155 /* if object isn't regular file, don't validate size */
3156 if (!S_ISREG(inode->i_mode)) {
3157 if (S_ISDIR(inode->i_mode) &&
3158 ll_i2info(inode)->lli_lsm_md != NULL) {
3159 rc = ll_merge_md_attr(inode);
/* Propagate the cached lli times into the VFS inode. */
3164 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
3165 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
3166 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
3168 /* In case of restore, the MDT has the right size and has
3169 * already send it back without granting the layout lock,
3170 * inode is up-to-date so glimpse is useless.
3171 * Also to glimpse we need the layout, in case of a running
3172 * restore the MDT holds the layout lock so the glimpse will
3173 * block up to the end of restore (getattr will block)
3175 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3176 rc = ll_glimpse_size(inode);
/*
 * VFS ->getattr: revalidate UPDATE|LOOKUP state from the MDT, then
 * fill @stat from the (now current) inode.  The inode number is
 * squashed into 32 bits for 32-bit API callers.
 */
3181 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3183 struct inode *inode = de->d_inode;
3184 struct ll_sb_info *sbi = ll_i2sbi(inode);
3185 struct ll_inode_info *lli = ll_i2info(inode);
3188 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3189 MDS_INODELOCK_LOOKUP);
3190 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3195 stat->dev = inode->i_sb->s_dev;
3196 if (ll_need_32bit_api(sbi))
3197 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3199 stat->ino = inode->i_ino;
3200 stat->mode = inode->i_mode;
3201 stat->uid = inode->i_uid;
3202 stat->gid = inode->i_gid;
3203 stat->rdev = inode->i_rdev;
3204 stat->atime = inode->i_atime;
3205 stat->mtime = inode->i_mtime;
3206 stat->ctime = inode->i_ctime;
3207 stat->blksize = 1 << inode->i_blkbits;
3209 stat->nlink = inode->i_nlink;
3210 stat->size = i_size_read(inode);
3211 stat->blocks = inode->i_blocks;
/*
 * VFS ->fiemap: bridge the kernel's fiemap_extent_info to Lustre's
 * ll_user_fiemap.  A temporary buffer sized for fi_extents_max extents
 * is allocated, filled by ll_do_fiemap(), and the results are copied
 * back into the caller's extent array.
 */
3216 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3217 __u64 start, __u64 len)
3221 struct ll_user_fiemap *fiemap;
3222 unsigned int extent_count = fieinfo->fi_extents_max;
3224 num_bytes = sizeof(*fiemap) + (extent_count *
3225 sizeof(struct ll_fiemap_extent));
3226 OBD_ALLOC_LARGE(fiemap, num_bytes);
3231 fiemap->fm_flags = fieinfo->fi_flags;
3232 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3233 fiemap->fm_start = start;
3234 fiemap->fm_length = len;
/* Only the first user extent is copied in -- presumably it seeds
 * continuation of a previous fiemap call; confirm against
 * ll_do_fiemap(). */
3235 if (extent_count > 0)
3236 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3237 sizeof(struct ll_fiemap_extent));
3239 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3241 fieinfo->fi_flags = fiemap->fm_flags;
3242 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3243 if (extent_count > 0)
3244 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3245 fiemap->fm_mapped_extents *
3246 sizeof(struct ll_fiemap_extent));
3248 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * Return a referenced copy of the cached POSIX ACL for @inode.
 * Only the cached lli_posix_acl is consulted; @type is not used on
 * the visible path.
 */
3252 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3254 struct ll_inode_info *lli = ll_i2info(inode);
3255 struct posix_acl *acl = NULL;
3258 spin_lock(&lli->lli_lock);
3259 /* VFS' acl_permission_check->check_acl will release the refcount */
3260 acl = posix_acl_dup(lli->lli_posix_acl);
3261 spin_unlock(&lli->lli_lock);
/*
 * ACL check callback used when generic_permission() does not do ACL
 * checking itself (pre-2ARGS kernels).  Kernel-version variants are
 * selected by the HAVE_GENERIC_PERMISSION_* / CONFIG_FS_POSIX_ACL
 * conditionals.
 */
3266 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
3268 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3269 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3271 ll_check_acl(struct inode *inode, int mask)
3274 # ifdef CONFIG_FS_POSIX_ACL
3275 struct posix_acl *acl;
3279 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
/* Cannot take lli_lock under RCU walk -- bail out to ref-walk. */
3280 if (flags & IPERM_FLAG_RCU)
3283 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3288 rc = posix_acl_permission(inode, acl, mask);
3289 posix_acl_release(acl);
3292 # else /* !CONFIG_FS_POSIX_ACL */
3294 # endif /* CONFIG_FS_POSIX_ACL */
3296 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * VFS ->permission: check access to @inode for @mask.  Extra duties
 * beyond generic_permission():
 *  - revalidate the root inode (it is never revalidated by lookup);
 *  - apply root squash by temporarily overriding the task credentials
 *    with the squashed fsuid/fsgid and lowered FS capabilities;
 *  - for remote clients, delegate to lustre_check_remote_perm().
 * Signature variants follow the kernel version (4ARGS/2ARGS/nameidata).
 */
3298 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3299 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3301 # ifdef HAVE_INODE_PERMISION_2ARGS
3302 int ll_inode_permission(struct inode *inode, int mask)
3304 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3309 struct ll_sb_info *sbi;
3310 struct root_squash_info *squash;
3311 struct cred *cred = NULL;
3312 const struct cred *old_cred = NULL;
3314 bool squash_id = false;
/* May not block during RCU path walk: punt to ref-walk mode. */
3317 #ifdef MAY_NOT_BLOCK
3318 if (mask & MAY_NOT_BLOCK)
3320 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3321 if (flags & IPERM_FLAG_RCU)
3325 /* as root inode are NOT getting validated in lookup operation,
3326 * need to do it before permission check. */
3328 if (inode == inode->i_sb->s_root->d_inode) {
3329 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3330 MDS_INODELOCK_LOOKUP);
3335 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3336 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3338 /* squash fsuid/fsgid if needed */
3339 sbi = ll_i2sbi(inode);
3340 squash = &sbi->ll_squash;
3341 if (unlikely(squash->rsi_uid != 0 &&
3342 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
3343 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
3347 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
3348 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
3349 squash->rsi_uid, squash->rsi_gid);
3351 /* update current process's credentials
3352 * and FS capability */
3353 cred = prepare_creds();
3357 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
3358 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* Drop every filesystem-related capability bit. */
3359 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
3360 if ((1 << cap) & CFS_CAP_FS_MASK)
3361 cap_lower(cred->cap_effective, cap);
3363 old_cred = override_creds(cred);
3366 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
3368 if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
3369 rc = lustre_check_remote_perm(inode, mask);
3371 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3373 /* restore current process's credentials and FS capability */
3375 revert_creds(old_cred);
/* Default file operations (also used with -o localflock, which only
 * provides locally consistent flock locks): no .flock/.lock methods,
 * so the kernel handles advisory locks locally. */
3383 struct file_operations ll_file_operations = {
3384 .read = ll_file_read,
3385 .aio_read = ll_file_aio_read,
3386 .write = ll_file_write,
3387 .aio_write = ll_file_aio_write,
3388 .unlocked_ioctl = ll_file_ioctl,
3389 .open = ll_file_open,
3390 .release = ll_file_release,
3391 .mmap = ll_file_mmap,
3392 .llseek = ll_file_seek,
3393 .splice_read = ll_file_splice_read,
/* File operations with cluster-coherent advisory locking: .flock and
 * .lock go through ll_file_flock(), i.e. the MDT. */
3398 struct file_operations ll_file_operations_flock = {
3399 .read = ll_file_read,
3400 .aio_read = ll_file_aio_read,
3401 .write = ll_file_write,
3402 .aio_write = ll_file_aio_write,
3403 .unlocked_ioctl = ll_file_ioctl,
3404 .open = ll_file_open,
3405 .release = ll_file_release,
3406 .mmap = ll_file_mmap,
3407 .llseek = ll_file_seek,
3408 .splice_read = ll_file_splice_read,
3411 .flock = ll_file_flock,
3412 .lock = ll_file_flock
3415 /* These are for -o noflock - to return ENOSYS on flock calls */
3416 struct file_operations ll_file_operations_noflock = {
3417 .read = ll_file_read,
3418 .aio_read = ll_file_aio_read,
3419 .write = ll_file_write,
3420 .aio_write = ll_file_aio_write,
3421 .unlocked_ioctl = ll_file_ioctl,
3422 .open = ll_file_open,
3423 .release = ll_file_release,
3424 .mmap = ll_file_mmap,
3425 .llseek = ll_file_seek,
3426 .splice_read = ll_file_splice_read,
3429 .flock = ll_file_noflock,
3430 .lock = ll_file_noflock
/* Inode operations for regular files. */
3433 struct inode_operations ll_file_inode_operations = {
3434 .setattr = ll_setattr,
3435 .getattr = ll_getattr,
3436 .permission = ll_inode_permission,
3437 .setxattr = ll_setxattr,
3438 .getxattr = ll_getxattr,
3439 .listxattr = ll_listxattr,
3440 .removexattr = ll_removexattr,
3441 .fiemap = ll_fiemap,
3442 #ifdef HAVE_IOP_GET_ACL
3443 .get_acl = ll_get_acl,
3447 /* dynamic ioctl number support routines */
/* Registry of dynamically registered ioctl handlers: a list of
 * llioc_data entries protected by a reader/writer semaphore. */
3448 static struct llioc_ctl_data {
3449 struct rw_semaphore ioc_sem;
3450 struct list_head ioc_head;
3452 __RWSEM_INITIALIZER(llioc.ioc_sem),
3453 LIST_HEAD_INIT(llioc.ioc_head)
/* One registered handler: callback plus the ioctl numbers it serves
 * (iocd_cmd is a flexible-style trailing array of iocd_count cmds;
 * iocd_size is the full allocation size, used on free). */
3458 struct list_head iocd_list;
3459 unsigned int iocd_size;
3460 llioc_callback_t iocd_cb;
3461 unsigned int iocd_count;
3462 unsigned int iocd_cmd[0];
/*
 * Register callback @cb for the @count ioctl numbers in @cmd.
 * Returns an opaque handle (the allocation itself) to pass to
 * ll_iocontrol_unregister(), or NULL on bad arguments / ENOMEM.
 * NOTE(review): the memset is redundant if OBD_ALLOC zeroes, and the
 * "count < 0" test fires only after "count > LLIOC_MAX_CMD" -- both
 * harmless but worth confirming.
 */
3465 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3468 struct llioc_data *in_data = NULL;
3471 if (cb == NULL || cmd == NULL ||
3472 count > LLIOC_MAX_CMD || count < 0)
3475 size = sizeof(*in_data) + count * sizeof(unsigned int);
3476 OBD_ALLOC(in_data, size);
3477 if (in_data == NULL)
3480 memset(in_data, 0, sizeof(*in_data));
3481 in_data->iocd_size = size;
3482 in_data->iocd_cb = cb;
3483 in_data->iocd_count = count;
3484 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3486 down_write(&llioc.ioc_sem);
3487 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3488 up_write(&llioc.ioc_sem);
/*
 * Unregister and free the handler previously returned by
 * ll_iocontrol_register().  An unknown @magic only produces a warning.
 */
3493 void ll_iocontrol_unregister(void *magic)
3495 struct llioc_data *tmp;
3500 down_write(&llioc.ioc_sem);
3501 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* Size is saved before the entry is freed below. */
3503 unsigned int size = tmp->iocd_size;
3505 list_del(&tmp->iocd_list);
3506 up_write(&llioc.ioc_sem);
3508 OBD_FREE(tmp, size);
3512 up_write(&llioc.ioc_sem);
3514 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3517 EXPORT_SYMBOL(ll_iocontrol_register);
3518 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch ioctl @cmd to the registered dynamic handlers: walk the
 * registry under the read lock and invoke each handler that lists
 * @cmd until one returns LLIOC_STOP.  The handler's status is passed
 * back through *rcp; the iteration verdict is the return value.
 */
3520 static enum llioc_iter
3521 ll_iocontrol_call(struct inode *inode, struct file *file,
3522 unsigned int cmd, unsigned long arg, int *rcp)
3524 enum llioc_iter ret = LLIOC_CONT;
3525 struct llioc_data *data;
3526 int rc = -EINVAL, i;
3528 down_read(&llioc.ioc_sem);
3529 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3530 for (i = 0; i < data->iocd_count; i++) {
3531 if (cmd != data->iocd_cmd[i])
3534 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3538 if (ret == LLIOC_STOP)
3541 up_read(&llioc.ioc_sem);
/*
 * Push a layout configuration to the cl_object of @inode via
 * cl_conf_set().  For OBJECT_CONF_SET the layout lock is allowed to
 * match only after the layout is applied, and the cached layout
 * generation in lli is updated from the new LSM.
 */
3548 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3550 struct ll_inode_info *lli = ll_i2info(inode);
3551 struct cl_env_nest nest;
3556 if (lli->lli_clob == NULL)
3559 env = cl_env_nested_get(&nest);
3561 RETURN(PTR_ERR(env));
3563 result = cl_conf_set(env, lli->lli_clob, conf);
3564 cl_env_nested_put(&nest, env);
3566 if (conf->coc_opc == OBJECT_CONF_SET) {
3567 struct ldlm_lock *lock = conf->coc_lock;
3569 LASSERT(lock != NULL);
3570 LASSERT(ldlm_has_layout(lock));
3572 struct lustre_md *md = conf->u.coc_md;
3573 __u32 gen = LL_LAYOUT_GEN_EMPTY;
3575 /* it can only be allowed to match after layout is
3576 * applied to inode otherwise false layout would be
3577 * seen. Applying layout should happen before dropping
3578 * the intent lock. */
3579 ldlm_lock_allow_match(lock);
3581 lli->lli_has_smd = lsm_has_objects(md->lsm);
3582 if (md->lsm != NULL)
3583 gen = md->lsm->lsm_layout_gen;
3586 DFID ": layout version change: %u -> %u\n",
3587 PFID(&lli->lli_fid), ll_layout_version_get(lli),
3589 ll_layout_version_set(lli, gen);
3595 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
3596 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3599 struct ll_sb_info *sbi = ll_i2sbi(inode);
3600 struct obd_capa *oc;
3601 struct ptlrpc_request *req;
3602 struct mdt_body *body;
3609 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3610 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
3611 lock->l_lvb_data, lock->l_lvb_len);
/* Nothing to do when the lock already carries a ready LVB. */
3613 if ((lock->l_lvb_data != NULL) && ldlm_is_lvb_ready(lock))
3616 /* if layout lock was granted right away, the layout is returned
3617 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3618 * blocked and then granted via completion ast, we have to fetch
3619 * layout here. Please note that we can't use the LVB buffer in
3620 * completion AST because it doesn't have a large enough buffer */
3621 oc = ll_mdscapa_get(inode);
3622 rc = ll_get_default_mdsize(sbi, &lmmsize);
3624 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3625 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3631 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3633 GOTO(out, rc = -EPROTO);
3635 lmmsize = body->mbo_eadatasize;
3636 if (lmmsize == 0) /* empty layout */
3639 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3641 GOTO(out, rc = -EFAULT);
/* Copy the LOV EA into a fresh buffer and install it as the
 * lock's LVB, replacing any stale data under the resource lock. */
3643 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3644 if (lvbdata == NULL)
3645 GOTO(out, rc = -ENOMEM);
3647 memcpy(lvbdata, lmm, lmmsize);
3648 lock_res_and_lock(lock);
3649 if (lock->l_lvb_data != NULL)
3650 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3652 lock->l_lvb_data = lvbdata;
3653 lock->l_lvb_len = lmmsize;
3654 unlock_res_and_lock(lock);
3659 ptlrpc_req_finished(req);
3664 * Apply the layout to the inode. Layout lock is held and will be released
3667 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3668 struct inode *inode, __u32 *gen, bool reconf)
3670 struct ll_inode_info *lli = ll_i2info(inode);
3671 struct ll_sb_info *sbi = ll_i2sbi(inode);
3672 struct ldlm_lock *lock;
3673 struct lustre_md md = { NULL };
3674 struct cl_object_conf conf;
3677 bool wait_layout = false;
3680 LASSERT(lustre_handle_is_used(lockh));
3682 lock = ldlm_handle2lock(lockh);
3683 LASSERT(lock != NULL);
3684 LASSERT(ldlm_has_layout(lock));
3686 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured: %d",
3687 PFID(&lli->lli_fid), inode, reconf);
3689 /* in case this is a caching lock and reinstate with new inode */
3690 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3692 lock_res_and_lock(lock);
3693 lvb_ready = ldlm_is_lvb_ready(lock);
3694 unlock_res_and_lock(lock);
3695 /* checking lvb_ready is racy but this is okay. The worst case is
3696 * that multi processes may configure the file on the same time. */
3698 if (lvb_ready || !reconf) {
3701 /* layout_gen must be valid if layout lock is not
3702 * cancelled and stripe has already set */
3703 *gen = ll_layout_version_get(lli);
/* Ensure the lock carries the layout LVB before unpacking. */
3709 rc = ll_layout_fetch(inode, lock);
3713 /* for layout lock, lmm is returned in lock's lvb.
3714 * lvb_data is immutable if the lock is held so it's safe to access it
3715 * without res lock. See the description in ldlm_lock_decref_internal()
3716 * for the condition to free lvb_data of layout lock */
3717 if (lock->l_lvb_data != NULL) {
3718 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3719 lock->l_lvb_data, lock->l_lvb_len);
3721 *gen = LL_LAYOUT_GEN_EMPTY;
3723 *gen = md.lsm->lsm_layout_gen;
3726 CERROR("%s: file "DFID" unpackmd error: %d\n",
3727 ll_get_fsname(inode->i_sb, NULL, 0),
3728 PFID(&lli->lli_fid), rc);
3734 /* set layout to file. Unlikely this will fail as old layout was
3735 * surely eliminated */
3736 memset(&conf, 0, sizeof conf);
3737 conf.coc_opc = OBJECT_CONF_SET;
3738 conf.coc_inode = inode;
3739 conf.coc_lock = lock;
3740 conf.u.coc_md = &md;
3741 rc = ll_layout_conf(inode, &conf);
3744 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3746 /* refresh layout failed, need to wait */
3747 wait_layout = rc == -EBUSY;
/* Release the lock reference taken via ldlm_handle2lock() and the
 * enqueue/match reference in @mode. */
3751 LDLM_LOCK_PUT(lock);
3752 ldlm_lock_decref(lockh, mode);
3754 /* wait for IO to complete if it's still being used. */
3756 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
3757 ll_get_fsname(inode->i_sb, NULL, 0),
3758 PFID(&lli->lli_fid), inode);
3760 memset(&conf, 0, sizeof conf);
3761 conf.coc_opc = OBJECT_CONF_WAIT;
3762 conf.coc_inode = inode;
3763 rc = ll_layout_conf(inode, &conf);
3767 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
3768 ll_get_fsname(inode->i_sb, NULL, 0),
3769 PFID(&lli->lli_fid), rc);
3775 * This function checks if there exists a LAYOUT lock on the client side,
3776 * or enqueues it if it doesn't have one in cache.
3778 * This function will not hold layout lock so it may be revoked any time after
3779 * this function returns. Any operations depend on layout should be redone
3782 * This function should be called before lov_io_init() to get an uptodate
3783 * layout version, the caller should save the version number and after IO
3784 * is finished, this function should be called again to verify that layout
3785 * is not changed during IO time.
3787 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3789 struct ll_inode_info *lli = ll_i2info(inode);
3790 struct ll_sb_info *sbi = ll_i2sbi(inode);
3791 struct md_op_data *op_data;
3792 struct lookup_intent it;
3793 struct lustre_handle lockh;
3795 struct ldlm_enqueue_info einfo = {
3796 .ei_type = LDLM_IBITS,
3798 .ei_cb_bl = &ll_md_blocking_ast,
3799 .ei_cb_cp = &ldlm_completion_ast,
/* Fast path: layout lock disabled, or a generation is already known. */
3804 *gen = ll_layout_version_get(lli);
3805 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE)
3809 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3810 LASSERT(S_ISREG(inode->i_mode));
3812 /* take layout lock mutex to enqueue layout lock exclusively. */
3813 mutex_lock(&lli->lli_layout_mutex);
3816 /* mostly layout lock is caching on the local side, so try to match
3817 * it before grabbing layout lock mutex. */
3818 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3819 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3820 if (mode != 0) { /* hit cached lock */
3821 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3825 mutex_unlock(&lli->lli_layout_mutex);
3829 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3830 0, 0, LUSTRE_OPC_ANY, NULL);
3831 if (IS_ERR(op_data)) {
3832 mutex_unlock(&lli->lli_layout_mutex);
3833 RETURN(PTR_ERR(op_data));
3836 /* have to enqueue one */
3837 memset(&it, 0, sizeof(it));
3838 it.it_op = IT_LAYOUT;
3839 lockh.cookie = 0ULL;
3841 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
3842 ll_get_fsname(inode->i_sb, NULL, 0),
3843 PFID(&lli->lli_fid), inode);
3845 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, &it, op_data, &lockh, 0);
/* The intent request is not needed once the lock is obtained. */
3846 if (it.d.lustre.it_data != NULL)
3847 ptlrpc_req_finished(it.d.lustre.it_data);
3848 it.d.lustre.it_data = NULL;
3850 ll_finish_md_op_data(op_data);
/* Take over the lock reference from the intent before dropping it. */
3852 mode = it.d.lustre.it_lock_mode;
3853 it.d.lustre.it_lock_mode = 0;
3854 ll_intent_drop_lock(&it);
3857 /* set lock data in case this is a new lock */
3858 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3859 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3863 mutex_unlock(&lli->lli_layout_mutex);
3869 * This function send a restore request to the MDT
3871 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
3873 struct hsm_user_request *hur;
3877 len = sizeof(struct hsm_user_request) +
3878 sizeof(struct hsm_user_item);
3879 OBD_ALLOC(hur, len);
3883 hur->hur_request.hr_action = HUA_RESTORE;
3884 hur->hur_request.hr_archive_id = 0;
3885 hur->hur_request.hr_flags = 0;
3886 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
3887 sizeof(hur->hur_user_item[0].hui_fid));
3888 hur->hur_user_item[0].hui_extent.offset = offset;
3889 hur->hur_user_item[0].hui_extent.length = length;
3890 hur->hur_request.hr_itemcount = 1;
3891 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,