4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2013, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include "llite_internal.h"
49 #include <lustre/ll_fiemap.h>
51 #include "cl_object.h"
/*
 * Allocate per-open-file private data (struct ll_file_data) from the
 * ll_file_data_slab cache, using __GFP_IO to avoid fs-reentrant reclaim.
 * NOTE(review): this extraction has dropped lines from the body (the
 * allocation-failure check and the return statement are not visible here).
 */
53 struct ll_file_data *ll_file_data_get(void)
55 struct ll_file_data *fd;
57 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, __GFP_IO);
61 fd->fd_write_failed = false;
/* Release a struct ll_file_data back to the slab cache (pairs with
 * ll_file_data_get()). */
66 static void ll_file_data_put(struct ll_file_data *fd)
69 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Copy the inode's current attributes (mode, times, size, blocks, flags),
 * the IO epoch, the open file handle @fh and the MDS capability into
 * @op_data, ready for an MDS request (e.g. close/setattr).  If the inode
 * has LLIF_DATA_MODIFIED set, also tag the request with MDS_DATA_MODIFIED
 * so the server knows the data changed under this epoch.
 */
72 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
73 struct lustre_handle *fh)
75 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
76 op_data->op_attr.ia_mode = inode->i_mode;
77 op_data->op_attr.ia_atime = inode->i_atime;
78 op_data->op_attr.ia_mtime = inode->i_mtime;
79 op_data->op_attr.ia_ctime = inode->i_ctime;
80 op_data->op_attr.ia_size = i_size_read(inode);
81 op_data->op_attr_blocks = inode->i_blocks;
/* ia_attr_flags lives in the Lustre-extended iattr; convert the Linux
 * inode flags to their on-wire (ext) representation. */
82 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
83 ll_inode_to_ext_flags(inode->i_flags);
84 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
86 op_data->op_handle = *fh;
87 op_data->op_capa1 = ll_mdscapa_get(inode);
89 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
90 op_data->op_bias |= MDS_DATA_MODIFIED;
94 * Closes the IO epoch and packs all the attributes into @op_data for
/*
 * Prepare @op_data for closing @och on @inode: mark which attributes are
 * valid, close the IO epoch, and pack the inode attributes plus the open
 * handle.  Size/blocks are only sent when SOM (size-on-MDS) is disabled
 * or the inode is not a regular file.
 * NOTE(review): lines are elided in this extraction (e.g. the early-out
 * for non-write handles around the FMODE_WRITE check).
 */
97 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
98 struct obd_client_handle *och)
102 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
103 ATTR_MTIME | ATTR_MTIME_SET |
104 ATTR_CTIME | ATTR_CTIME_SET;
106 if (!(och->och_flags & FMODE_WRITE))
109 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
110 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
112 ll_ioepoch_close(inode, op_data, &och, 0);
115 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
116 ll_prep_md_op_data(op_data, inode, NULL, NULL,
117 0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Send an MDS_CLOSE for open handle @och on @inode via @md_exp.
 * If @data_version is non-NULL this close is an HSM release: the data
 * version and lease handle are packed so the MDT can verify the file was
 * not modified before releasing it.  Handles the SOM epoch-close path
 * (ll_som_update), clears LLIF_DATA_MODIFIED on success, destroys OST
 * objects for an unlinked-on-close file, and frees the replay data.
 * NOTE(review): many lines are elided in this extraction (obd validity
 * check, GOTO labels, 'out:' cleanup, local declarations, RETURN) -- do
 * not treat this body as complete.
 */
121 static int ll_close_inode_openhandle(struct obd_export *md_exp,
123 struct obd_client_handle *och,
124 const __u64 *data_version)
126 struct obd_export *exp = ll_i2mdexp(inode);
127 struct md_op_data *op_data;
128 struct ptlrpc_request *req = NULL;
129 struct obd_device *obd = class_exp2obd(exp);
136 * XXX: in case of LMV, is this correct to access
139 CERROR("Invalid MDC connection handle "LPX64"\n",
140 ll_i2mdexp(inode)->exp_handle.h_cookie);
144 OBD_ALLOC_PTR(op_data);
146 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
148 ll_prepare_close(inode, op_data, och);
149 if (data_version != NULL) {
150 /* Pass in data_version implies release. */
151 op_data->op_bias |= MDS_HSM_RELEASE;
152 op_data->op_data_version = *data_version;
153 op_data->op_lease_handle = och->och_lease_handle;
/* For release the MDT needs authoritative size/blocks. */
154 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
156 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
157 rc = md_close(md_exp, op_data, och->och_mod, &req);
159 /* This close must have the epoch closed. */
160 LASSERT(epoch_close);
161 /* MDS has instructed us to obtain Size-on-MDS attribute from
162 * OSTs and send setattr to back to MDS. */
163 rc = ll_som_update(inode, op_data);
165 CERROR("inode %lu mdc Size-on-MDS update failed: "
166 "rc = %d\n", inode->i_ino, rc);
170 CERROR("inode %lu mdc close failed: rc = %d\n",
174 /* DATA_MODIFIED flag was successfully sent on close, cancel data
175 * modification flag. */
176 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
177 struct ll_inode_info *lli = ll_i2info(inode);
179 spin_lock(&lli->lli_lock);
180 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
181 spin_unlock(&lli->lli_lock);
185 rc = ll_objects_destroy(req, inode);
187 CERROR("inode %lu ll_objects destroy: rc = %d\n",
191 if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
192 struct mdt_body *body;
193 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
/* Server refused the release if OBD_MD_FLRELEASED is unset. */
194 if (!(body->valid & OBD_MD_FLRELEASED))
198 ll_finish_md_op_data(op_data);
/* With SOM, a write handle whose epoch is still open must queue a
 * DONE_WRITING so the MDT can finalize size-on-MDS later. */
202 if (exp_connect_som(exp) && !epoch_close &&
203 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
204 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
206 md_clear_open_replay_data(md_exp, och);
207 /* Free @och if it is not waiting for DONE_WRITING. */
208 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
211 if (req) /* This is close request */
212 ptlrpc_req_finished(req);
/*
 * Really close the MDS open handle of @inode for the open mode in @flags
 * (write/exec/read).  Under lli_och_mutex: if other users still reference
 * the handle, bail out; otherwise detach it and close it via
 * ll_close_inode_openhandle() outside the mutex.
 * NOTE(review): elided lines in this extraction include the och detach
 * (*och_p = NULL), local declarations and RETURN.
 */
216 int ll_md_real_close(struct inode *inode, int flags)
218 struct ll_inode_info *lli = ll_i2info(inode);
219 struct obd_client_handle **och_p;
220 struct obd_client_handle *och;
/* Select the per-mode handle slot and its use count. */
225 if (flags & FMODE_WRITE) {
226 och_p = &lli->lli_mds_write_och;
227 och_usecount = &lli->lli_open_fd_write_count;
228 } else if (flags & FMODE_EXEC) {
229 och_p = &lli->lli_mds_exec_och;
230 och_usecount = &lli->lli_open_fd_exec_count;
232 LASSERT(flags & FMODE_READ);
233 och_p = &lli->lli_mds_read_och;
234 och_usecount = &lli->lli_open_fd_read_count;
237 mutex_lock(&lli->lli_och_mutex);
238 if (*och_usecount) { /* There are still users of this handle, so
240 mutex_unlock(&lli->lli_och_mutex);
245 mutex_unlock(&lli->lli_och_mutex);
247 if (och) { /* There might be a race and somebody have freed this och
249 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-struct-file close path: drop group lock, clean up a leftover lease
 * (application crashed while holding one), close an och kept by a lease,
 * decrement the per-mode open count, and -- unless an OPEN DLM lock lets
 * us skip the RPC -- call ll_md_real_close().  Finally detach and free
 * the ll_file_data and close the MDS capability.
 * NOTE(review): lines are elided in this extraction (lockmode setup,
 * md_lock_match arguments, RETURN); treat control flow as partial.
 */
256 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
259 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
260 struct ll_inode_info *lli = ll_i2info(inode);
264 /* clear group lock, if present */
265 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
266 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
268 if (fd->fd_lease_och != NULL) {
271 /* Usually the lease is not released when the
272 * application crashed, we need to release here. */
273 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
274 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
275 PFID(&lli->lli_fid), rc, lease_broken);
277 fd->fd_lease_och = NULL;
280 if (fd->fd_och != NULL) {
281 rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
286 /* Let's see if we have good enough OPEN lock on the file and if
287 we can skip talking to MDS */
288 if (file->f_dentry->d_inode) { /* Can this ever be false? */
290 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
291 struct lustre_handle lockh;
292 struct inode *inode = file->f_dentry->d_inode;
293 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
295 mutex_lock(&lli->lli_och_mutex);
296 if (fd->fd_omode & FMODE_WRITE) {
298 LASSERT(lli->lli_open_fd_write_count);
299 lli->lli_open_fd_write_count--;
300 } else if (fd->fd_omode & FMODE_EXEC) {
302 LASSERT(lli->lli_open_fd_exec_count);
303 lli->lli_open_fd_exec_count--;
306 LASSERT(lli->lli_open_fd_read_count);
307 lli->lli_open_fd_read_count--;
309 mutex_unlock(&lli->lli_och_mutex);
/* No cached OPEN lock of the right mode -> must talk to the MDS. */
311 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
312 LDLM_IBITS, &policy, lockmode,
314 rc = ll_md_real_close(file->f_dentry->d_inode,
318 CERROR("Releasing a file %p with negative dentry %p. Name %s",
319 file, file->f_dentry, file->f_dentry->d_name.name);
323 LUSTRE_FPRIVATE(file) = NULL;
324 ll_file_data_put(fd);
325 ll_capa_close(inode);
330 /* While this returns an error code, fput() the caller does not, so we need
331 * to make every effort to clean up all of our state here. Also, applications
332 * rarely check close errors and even if an error is returned they will not
333 * re-try the close call.
/*
 * VFS ->release() for Lustre files: remote-ACL bookkeeping for the root
 * inode, stats tally, statahead-thread shutdown for directories, async
 * write rc collection for regular files, then ll_md_close().  The root
 * dentry short-circuits (no MDS close needed).
 * NOTE(review): elided lines in this extraction include the fd == NULL
 * handling, GOTO 'out' label and RETURN.
 */
335 int ll_file_release(struct inode *inode, struct file *file)
337 struct ll_file_data *fd;
338 struct ll_sb_info *sbi = ll_i2sbi(inode);
339 struct ll_inode_info *lli = ll_i2info(inode);
343 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
344 inode->i_generation, inode);
346 #ifdef CONFIG_FS_POSIX_ACL
/* Remote-client ACL state is tracked per-process on the root inode. */
347 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
348 inode == inode->i_sb->s_root->d_inode) {
349 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
352 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
353 fd->fd_flags &= ~LL_FILE_RMTACL;
354 rct_del(&sbi->ll_rct, cfs_curproc_pid());
355 et_search_free(&sbi->ll_et, cfs_curproc_pid());
360 if (inode->i_sb->s_root != file->f_dentry)
361 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
362 fd = LUSTRE_FPRIVATE(file);
365 /* The last ref on @file, maybe not the the owner pid of statahead.
366 * Different processes can open the same dir, "ll_opendir_key" means:
367 * it is me that should stop the statahead thread. */
368 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
369 lli->lli_opendir_pid != 0)
370 ll_stop_statahead(inode, lli->lli_opendir_key);
372 if (inode->i_sb->s_root == file->f_dentry) {
373 LUSTRE_FPRIVATE(file) = NULL;
374 ll_file_data_put(fd);
/* Pick up any async write error recorded by the OSC layer so close
 * can report it. */
378 if (!S_ISDIR(inode->i_mode)) {
379 lov_read_and_clear_async_rc(lli->lli_clob);
380 lli->lli_async_rc = 0;
383 rc = ll_md_close(sbi->ll_md_exp, inode, file);
385 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
386 libcfs_debug_dumplog();
/*
 * Enqueue an IT_OPEN intent to the MDS for @file.  When not merely
 * setting stripe info (@lmm/@lmmsize == 0) an OPEN lock is requested; a
 * write open is sent as LUSTRE_OPC_CREATE.  On success the reply updates
 * the inode (ll_prep_inode) and any granted lock is attached via
 * ll_set_lock_data().  The -ESTALE case uses a separate exit path to
 * avoid flooding the log.
 * NOTE(review): elided lines include the parent-null check, 'out:' label
 * and RETURN; error paths are partially visible only.
 */
391 static int ll_intent_file_open(struct file *file, void *lmm,
392 int lmmsize, struct lookup_intent *itp)
394 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
395 struct dentry *parent = file->f_dentry->d_parent;
396 struct md_op_data *op_data;
397 struct ptlrpc_request *req;
398 __u32 opc = LUSTRE_OPC_ANY;
405 /* Usually we come here only for NFSD, and we want open lock.
406 But we can also get here with pre 2.6.15 patchless kernels, and in
407 that case that lock is also ok */
408 /* We can also get here if there was cached open handle in revalidate_it
409 * but it disappeared while we were getting from there to ll_file_open.
410 * But this means this file was closed and immediatelly opened which
411 * makes a good candidate for using OPEN lock */
412 /* If lmmsize & lmm are not 0, we are just setting stripe info
413 * parameters. No need for the open lock */
414 if (lmm == NULL && lmmsize == 0) {
415 itp->it_flags |= MDS_OPEN_LOCK;
416 if (itp->it_flags & FMODE_WRITE)
417 opc = LUSTRE_OPC_CREATE;
420 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
421 file->f_dentry->d_inode, NULL, 0,
425 RETURN(PTR_ERR(op_data));
427 itp->it_flags |= MDS_OPEN_BY_FID;
428 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
429 0 /*unused */, &req, ll_md_blocking_ast, 0);
430 ll_finish_md_op_data(op_data);
432 /* reason for keep own exit path - don`t flood log
433 * with messages with -ESTALE errors.
/* An extra server openhandle with no usable open must be released. */
435 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
436 it_open_error(DISP_OPEN_OPEN, itp))
438 ll_release_openhandle(file->f_dentry, itp);
442 if (it_disposition(itp, DISP_LOOKUP_NEG))
443 GOTO(out, rc = -ENOENT);
445 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
446 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
447 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
451 rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
452 if (!rc && itp->d.lustre.it_lock_mode)
453 ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
457 ptlrpc_req_finished(itp->d.lustre.it_data);
458 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
459 ll_intent_drop_lock(itp);
465 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
466 * not believe attributes if a few ioepoch holders exist. Attributes for
467 * previous ioepoch if new one is opened are also skipped by MDS.
/* Record a newly granted IO epoch on the inode (only when it actually
 * changed); the debug line logs epoch transitions per FID. */
469 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
471 if (ioepoch && lli->lli_ioepoch != ioepoch) {
472 lli->lli_ioepoch = ioepoch;
473 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
474 ioepoch, PFID(&lli->lli_fid));
/*
 * Populate an obd_client_handle from the MDT reply body carried by the
 * open intent @it (file handle, FID, lease/lock handle, open flags) and
 * register it for open replay.  Returns md_set_open_replay_data()'s rc.
 */
478 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
479 struct obd_client_handle *och)
481 struct ptlrpc_request *req = it->d.lustre.it_data;
482 struct mdt_body *body;
484 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
485 och->och_fh = body->handle;
486 och->och_fid = body->fid1;
/* The lock handle doubles as the lease identifier for lease opens. */
487 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
488 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
489 och->och_flags = it->it_flags;
491 return md_set_open_replay_data(md_exp, och, req);
/*
 * Finish a client-side open: when a new @och is supplied, fill it from
 * the intent reply and open the reply's IO epoch; then install @fd as
 * the file's private data, init readahead state and remember the open
 * mode bits actually granted.
 * NOTE(review): the 'if (och)' guard, error handling after ll_och_fill
 * and the RETURN are elided in this extraction.
 */
494 int ll_local_open(struct file *file, struct lookup_intent *it,
495 struct ll_file_data *fd, struct obd_client_handle *och)
497 struct inode *inode = file->f_dentry->d_inode;
498 struct ll_inode_info *lli = ll_i2info(inode);
501 LASSERT(!LUSTRE_FPRIVATE(file));
506 struct ptlrpc_request *req = it->d.lustre.it_data;
507 struct mdt_body *body;
510 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
514 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
515 ll_ioepoch_open(lli, body->ioepoch);
518 LUSTRE_FPRIVATE(file) = fd;
519 ll_readahead_init(inode, &fd->fd_ras);
520 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
525 /* Open a file, and (for the very first open) create objects on the OSTs at
526 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
527 * creation or open until ll_lov_setstripe() ioctl is called.
529 * If we already have the stripe MD locally then we don't request it in
530 * md_open(), by passing a lmm_size = 0.
532 * It is up to the application to ensure no other processes open this file
533 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
534 * used. We might be able to avoid races of that sort by getting lli_open_sem
535 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
536 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/*
 * VFS ->open() for Lustre: reuses a cached per-mode MDS open handle
 * when one exists, otherwise enqueues an open intent (outside
 * lli_och_mutex to avoid deadlock with the blocking AST) and installs
 * the new handle.  Also arms directory statahead and handles the
 * delayed-object-creation (O_LOV_DELAY_CREATE) case.
 * NOTE(review): this extraction drops many lines (restart/'again' label,
 * och_usecount accounting, out_och_free/out_openerr label bodies,
 * RETURN); the visible control flow is incomplete.
 */
538 int ll_file_open(struct inode *inode, struct file *file)
540 struct ll_inode_info *lli = ll_i2info(inode);
541 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
542 .it_flags = file->f_flags };
543 struct obd_client_handle **och_p = NULL;
544 __u64 *och_usecount = NULL;
545 struct ll_file_data *fd;
546 int rc = 0, opendir_set = 0;
549 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
550 inode->i_generation, inode, file->f_flags);
552 it = file->private_data; /* XXX: compat macro */
553 file->private_data = NULL; /* prevent ll_local_open assertion */
555 fd = ll_file_data_get();
557 GOTO(out_openerr, rc = -ENOMEM);
/* First opener of a directory becomes the statahead owner. */
560 if (S_ISDIR(inode->i_mode)) {
561 spin_lock(&lli->lli_sa_lock);
562 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
563 lli->lli_opendir_pid == 0) {
564 lli->lli_opendir_key = fd;
565 lli->lli_opendir_pid = cfs_curproc_pid();
568 spin_unlock(&lli->lli_sa_lock);
571 if (inode->i_sb->s_root == file->f_dentry) {
572 LUSTRE_FPRIVATE(file) = fd;
576 if (!it || !it->d.lustre.it_disposition) {
577 /* Convert f_flags into access mode. We cannot use file->f_mode,
578 * because everything but O_ACCMODE mask was stripped from
580 if ((oit.it_flags + 1) & O_ACCMODE)
582 if (file->f_flags & O_TRUNC)
583 oit.it_flags |= FMODE_WRITE;
585 /* kernel only call f_op->open in dentry_open. filp_open calls
586 * dentry_open after call to open_namei that checks permissions.
587 * Only nfsd_open call dentry_open directly without checking
588 * permissions and because of that this code below is safe. */
589 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
590 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
592 /* We do not want O_EXCL here, presumably we opened the file
593 * already? XXX - NFS implications? */
594 oit.it_flags &= ~O_EXCL;
596 /* bug20584, if "it_flags" contains O_CREAT, the file will be
597 * created if necessary, then "IT_CREAT" should be set to keep
598 * consistent with it */
599 if (oit.it_flags & O_CREAT)
600 oit.it_op |= IT_CREAT;
606 /* Let's see if we have file open on MDS already. */
607 if (it->it_flags & FMODE_WRITE) {
608 och_p = &lli->lli_mds_write_och;
609 och_usecount = &lli->lli_open_fd_write_count;
610 } else if (it->it_flags & FMODE_EXEC) {
611 och_p = &lli->lli_mds_exec_och;
612 och_usecount = &lli->lli_open_fd_exec_count;
614 och_p = &lli->lli_mds_read_och;
615 och_usecount = &lli->lli_open_fd_read_count;
618 mutex_lock(&lli->lli_och_mutex);
619 if (*och_p) { /* Open handle is present */
620 if (it_disposition(it, DISP_OPEN_OPEN)) {
621 /* Well, there's extra open request that we do not need,
622 let's close it somehow. This will decref request. */
623 rc = it_open_error(DISP_OPEN_OPEN, it);
625 mutex_unlock(&lli->lli_och_mutex);
626 GOTO(out_openerr, rc);
629 ll_release_openhandle(file->f_dentry, it);
633 rc = ll_local_open(file, it, fd, NULL);
636 mutex_unlock(&lli->lli_och_mutex);
637 GOTO(out_openerr, rc);
640 LASSERT(*och_usecount == 0);
641 if (!it->d.lustre.it_disposition) {
642 /* We cannot just request lock handle now, new ELC code
643 means that one of other OPEN locks for this file
644 could be cancelled, and since blocking ast handler
645 would attempt to grab och_mutex as well, that would
646 result in a deadlock */
647 mutex_unlock(&lli->lli_och_mutex);
648 it->it_create_mode |= M_CHECK_STALE;
649 rc = ll_intent_file_open(file, NULL, 0, it);
650 it->it_create_mode &= ~M_CHECK_STALE;
652 GOTO(out_openerr, rc);
656 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
658 GOTO(out_och_free, rc = -ENOMEM);
662 /* md_intent_lock() didn't get a request ref if there was an
663 * open error, so don't do cleanup on the request here
665 /* XXX (green): Should not we bail out on any error here, not
666 * just open error? */
667 rc = it_open_error(DISP_OPEN_OPEN, it);
669 GOTO(out_och_free, rc);
671 LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
673 rc = ll_local_open(file, it, fd, *och_p);
675 GOTO(out_och_free, rc);
677 mutex_unlock(&lli->lli_och_mutex);
680 /* Must do this outside lli_och_mutex lock to prevent deadlock where
681 different kind of OPEN lock for this same inode gets cancelled
682 by ldlm_cancel_lru */
683 if (!S_ISREG(inode->i_mode))
684 GOTO(out_och_free, rc);
688 if (!lli->lli_has_smd) {
689 if (file->f_flags & O_LOV_DELAY_CREATE ||
690 !(file->f_mode & FMODE_WRITE)) {
691 CDEBUG(D_INODE, "object creation was delayed\n");
692 GOTO(out_och_free, rc);
695 file->f_flags &= ~O_LOV_DELAY_CREATE;
696 GOTO(out_och_free, rc);
/* Error-path cleanup: free the just-allocated handle slot.  OBD_FREE
 * poisons the memory, so the pointer must be NULLed afterwards. */
700 if (och_p && *och_p) {
701 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
702 *och_p = NULL; /* OBD_FREE writes some magic there */
705 mutex_unlock(&lli->lli_och_mutex);
708 if (opendir_set != 0)
709 ll_stop_statahead(inode, lli->lli_opendir_key);
711 ll_file_data_put(fd);
713 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
716 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
717 ptlrpc_req_finished(it->d.lustre.it_data);
718 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for lease locks: on LDLM_CB_BLOCKING, asynchronously
 * cancel the lease lock (the lease is considered broken); the CANCELING
 * case body is not visible in this extraction.
 */
724 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
725 struct ldlm_lock_desc *desc, void *data, int flag)
728 struct lustre_handle lockh;
732 case LDLM_CB_BLOCKING:
733 ldlm_lock2handle(lock, &lockh);
734 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
736 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
740 case LDLM_CB_CANCELING:
748 * Acquire a lease and open the file.
/*
 * Open @inode with a lease of mode @fmode (FMODE_READ or FMODE_WRITE
 * only).  When @file is given, reuse its existing openhandle (passed as
 * op_data->op_handle so the MDT can match the owner) -- allowed only if
 * this process is the sole opener.  The lease lock is enqueued with
 * LDLM_FL_NO_LRU | LDLM_FL_EXCL so it is never aged out or matched by a
 * normal open.  Returns the och on success; on failure the openhandle
 * and any granted lock are rolled back (out_close / out_release_it).
 * NOTE(review): several lines are elided (och allocation, usecount
 * check body, RETURN paths); treat as partial.
 */
750 struct obd_client_handle *ll_lease_open(struct inode *inode, struct file *file,
751 fmode_t fmode, __u64 open_flags)
753 struct lookup_intent it = { .it_op = IT_OPEN };
754 struct ll_sb_info *sbi = ll_i2sbi(inode);
755 struct md_op_data *op_data;
756 struct ptlrpc_request *req;
757 struct lustre_handle old_handle = { 0 };
758 struct obd_client_handle *och = NULL;
763 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
764 RETURN(ERR_PTR(-EINVAL));
767 struct ll_inode_info *lli = ll_i2info(inode);
768 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
769 struct obd_client_handle **och_p;
/* Lease mode must be covered by the file's open mode; exec opens
 * cannot take a lease. */
772 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
773 RETURN(ERR_PTR(-EPERM));
775 /* Get the openhandle of the file */
777 mutex_lock(&lli->lli_och_mutex);
778 if (fd->fd_lease_och != NULL) {
779 mutex_unlock(&lli->lli_och_mutex);
783 if (fd->fd_och == NULL) {
784 if (file->f_mode & FMODE_WRITE) {
785 LASSERT(lli->lli_mds_write_och != NULL);
786 och_p = &lli->lli_mds_write_och;
787 och_usecount = &lli->lli_open_fd_write_count;
789 LASSERT(lli->lli_mds_read_och != NULL);
790 och_p = &lli->lli_mds_read_och;
791 och_usecount = &lli->lli_open_fd_read_count;
793 if (*och_usecount == 1) {
800 mutex_unlock(&lli->lli_och_mutex);
801 if (rc < 0) /* more than 1 opener */
804 LASSERT(fd->fd_och != NULL);
805 old_handle = fd->fd_och->och_fh;
810 RETURN(ERR_PTR(-ENOMEM));
812 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
813 LUSTRE_OPC_ANY, NULL);
815 GOTO(out, rc = PTR_ERR(op_data));
817 /* To tell the MDT this openhandle is from the same owner */
818 op_data->op_handle = old_handle;
820 it.it_flags = fmode | open_flags;
821 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
822 rc = md_intent_lock(sbi->ll_md_exp, op_data, NULL, 0, &it, 0, &req,
823 ll_md_blocking_lease_ast,
824 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
825 * it can be cancelled which may mislead applications that the lease is
827 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
828 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
829 * doesn't deal with openhandle, so normal openhandle will be leaked. */
830 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
831 ll_finish_md_op_data(op_data);
833 ptlrpc_req_finished(req);
834 it_clear_disposition(&it, DISP_ENQ_COMPLETE);
837 GOTO(out_release_it, rc);
839 if (it_disposition(&it, DISP_LOOKUP_NEG))
840 GOTO(out_release_it, rc = -ENOENT);
842 rc = it_open_error(DISP_OPEN_OPEN, &it);
844 GOTO(out_release_it, rc);
846 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
847 ll_och_fill(sbi->ll_md_exp, &it, och);
849 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
850 GOTO(out_close, rc = -EOPNOTSUPP);
852 /* already get lease, handle lease lock */
853 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
854 if (it.d.lustre.it_lock_mode == 0 ||
855 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
856 /* open lock must return for lease */
857 CERROR(DFID "lease granted but no open lock, %d/%Lu.\n",
858 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
859 it.d.lustre.it_lock_bits);
860 GOTO(out_close, rc = -EPROTO);
863 ll_intent_release(&it);
/* Error path: undo the server-side open, then the lease lock. */
867 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
869 CERROR("Close openhandle returned %d\n", rc2);
871 /* cancel open lock */
872 if (it.d.lustre.it_lock_mode != 0) {
873 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
874 it.d.lustre.it_lock_mode);
875 it.d.lustre.it_lock_mode = 0;
878 ll_intent_release(&it);
883 EXPORT_SYMBOL(ll_lease_open);
886 * Release lease and close the file.
887 * It will check if the lease has ever broken.
/*
 * Drop the lease held by @och on @inode.  Checks the lease lock's cancel
 * flag to tell the caller (via @lease_broken) whether the lease was
 * already broken, cancels the lock if still granted, then closes the
 * openhandle on the MDS.
 * NOTE(review): the LDLM_LOCK_PUT / 'if (lock != NULL)' lines and the
 * RETURN are elided in this extraction.
 */
889 int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
892 struct ldlm_lock *lock;
893 bool cancelled = true;
897 lock = ldlm_handle2lock(&och->och_lease_handle);
899 lock_res_and_lock(lock);
900 cancelled = ldlm_is_cancel(lock);
901 unlock_res_and_lock(lock);
905 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
906 PFID(&ll_i2info(inode)->lli_fid), cancelled);
909 ldlm_cli_cancel(&och->och_lease_handle, 0);
910 if (lease_broken != NULL)
911 *lease_broken = cancelled;
913 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
917 EXPORT_SYMBOL(ll_lease_close);
919 /* Fills the obdo with the attributes for the lsm */
/*
 * Asynchronously fetch OST attributes for @lsm into @obdo via a ptlrpc
 * set, waiting for completion.  @sync requests the getattr under a
 * server-side lock (OBD_FL_SRVLOCK).  On success o_valid is masked down
 * to the fields the caller may trust.
 * NOTE(review): the set==NULL error return, obd_getattr_async rc check
 * and final RETURN are elided in this extraction.
 */
920 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
921 struct obd_capa *capa, struct obdo *obdo,
922 __u64 ioepoch, int sync)
924 struct ptlrpc_request_set *set;
925 struct obd_info oinfo = { { { 0 } } };
930 LASSERT(lsm != NULL);
934 oinfo.oi_oa->o_oi = lsm->lsm_oi;
935 oinfo.oi_oa->o_mode = S_IFREG;
936 oinfo.oi_oa->o_ioepoch = ioepoch;
937 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
938 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
939 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
940 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
941 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
942 OBD_MD_FLDATAVERSION;
943 oinfo.oi_capa = capa;
945 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
946 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
949 set = ptlrpc_prep_set();
951 CERROR("can't allocate ptlrpc set\n");
954 rc = obd_getattr_async(exp, &oinfo, set);
956 rc = ptlrpc_set_wait(set);
957 ptlrpc_set_destroy(set);
/* Only expose attributes that merged OST replies can be trusted for. */
960 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
961 OBD_MD_FLATIME | OBD_MD_FLMTIME |
962 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
963 OBD_MD_FLDATAVERSION);
968 * Performs the getattr on the inode and updates its fields.
969 * If @sync != 0, perform the getattr under the server-side lock.
/*
 * Fetch OST attributes for @inode's layout (via ll_lsm_getattr) and
 * refresh the inode fields from the returned obdo.  Capa and lsm
 * references are taken here and dropped before returning.
 * NOTE(review): the capa_put, rc check and RETURN lines are elided in
 * this extraction.
 */
971 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
972 __u64 ioepoch, int sync)
974 struct obd_capa *capa = ll_mdscapa_get(inode);
975 struct lov_stripe_md *lsm;
979 lsm = ccc_inode_lsm_get(inode);
980 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
981 capa, obdo, ioepoch, sync);
984 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
986 obdo_refresh_inode(inode, obdo, obdo->o_valid);
987 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
988 " blksize %lu\n", POSTID(oi), i_size_read(inode),
989 (unsigned long long)inode->i_blocks,
990 (unsigned long)ll_inode_blksize(inode));
992 ccc_inode_lsm_put(inode, lsm);
/*
 * Merge MDS-supplied timestamps (cached in lli_lvb) with the attributes
 * obtained from the OSTs via the cl_object layer, keeping the newest of
 * each timestamp, and update the inode's size/blocks under the inode
 * size lock.
 * NOTE(review): the rc==0 guard around the merge and the RETURN are
 * elided in this extraction.
 */
996 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
998 struct ll_inode_info *lli = ll_i2info(inode);
999 struct cl_object *obj = lli->lli_clob;
1000 struct cl_attr *attr = ccc_env_thread_attr(env);
1006 ll_inode_size_lock(inode);
1007 /* merge timestamps the most recently obtained from mds with
1008 timestamps obtained from osts */
1009 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
1010 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
1011 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
1012 inode_init_lvb(inode, &lvb);
1014 cl_object_attr_lock(obj);
1015 rc = cl_object_attr_get(env, obj, attr);
1016 cl_object_attr_unlock(obj);
/* Keep the most recent of MDS vs OST timestamps. */
1019 if (lvb.lvb_atime < attr->cat_atime)
1020 lvb.lvb_atime = attr->cat_atime;
1021 if (lvb.lvb_ctime < attr->cat_ctime)
1022 lvb.lvb_ctime = attr->cat_ctime;
1023 if (lvb.lvb_mtime < attr->cat_mtime)
1024 lvb.lvb_mtime = attr->cat_mtime;
1026 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
1027 PFID(&lli->lli_fid), attr->cat_size);
1028 cl_isize_write_nolock(inode, attr->cat_size);
1030 inode->i_blocks = attr->cat_blocks;
1032 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1033 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1034 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1036 ll_inode_size_unlock(inode);
/*
 * Glimpse helper for ioctls: fetch OST attributes for @lsm into a stack
 * obdo and copy size/blocks/times into the caller's stat structure.
 * NOTE(review): the rc check and RETURN are elided in this extraction.
 */
1041 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1044 struct obdo obdo = { 0 };
1047 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1049 st->st_size = obdo.o_size;
1050 st->st_blocks = obdo.o_blocks;
1051 st->st_mtime = obdo.o_mtime;
1052 st->st_atime = obdo.o_atime;
1053 st->st_ctime = obdo.o_ctime;
/*
 * Initialize a cl_io from the file's open flags: nonblocking/append/
 * sync-write bits, the target cl_object, and the DLM locking policy
 * (never for nolock files, mandatory for O_APPEND, otherwise "maybe").
 */
1058 void ll_io_init(struct cl_io *io, const struct file *file, int write)
1060 struct inode *inode = file->f_dentry->d_inode;
1062 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1064 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1065 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1066 file->f_flags & O_DIRECT ||
1069 io->ci_obj = ll_i2info(inode)->lli_clob;
1070 io->ci_lockreq = CILR_MAYBE;
1071 if (ll_file_nolock(file)) {
1072 io->ci_lockreq = CILR_NEVER;
1073 io->ci_no_srvlock = 1;
1074 } else if (file->f_flags & O_APPEND) {
1075 io->ci_lockreq = CILR_MANDATORY;
/*
 * Common driver for read/write/sendfile/splice IO: set up the cl_io,
 * copy the per-subtype arguments into the vvp/ccc IO contexts, take
 * lli_write_mutex for non-group-locked writes (lli_trunc_sem for reads),
 * run cl_io_loop(), then update *ppos, tally stats and track
 * fd_write_failed.  A restart (io->ci_need_restart with nothing
 * transferred) loops back to retry the IO.
 * NOTE(review): this extraction drops lines (return-type line, 'restart:'
 * label, switch case labels, 'out:' label, final RETURN); the visible
 * body is incomplete and statement order matters here.
 */
1080 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1081 struct file *file, enum cl_io_type iot,
1082 loff_t *ppos, size_t count)
1084 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
1085 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1091 io = ccc_env_thread_io(env);
1092 ll_io_init(io, file, iot == CIT_WRITE);
1094 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1095 struct vvp_io *vio = vvp_env_io(env);
1096 struct ccc_io *cio = ccc_env_io(env);
1097 int write_mutex_locked = 0;
1099 cio->cui_fd = LUSTRE_FPRIVATE(file);
1100 vio->cui_io_subtype = args->via_io_subtype;
1102 switch (vio->cui_io_subtype) {
1104 cio->cui_iov = args->u.normal.via_iov;
1105 cio->cui_nrsegs = args->u.normal.via_nrsegs;
1106 cio->cui_tot_nrsegs = cio->cui_nrsegs;
1107 cio->cui_iocb = args->u.normal.via_iocb;
/* Serialize writes unless a group lock already provides exclusion. */
1108 if ((iot == CIT_WRITE) &&
1109 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1110 if (mutex_lock_interruptible(&lli->
1112 GOTO(out, result = -ERESTARTSYS);
1113 write_mutex_locked = 1;
1114 } else if (iot == CIT_READ) {
1115 down_read(&lli->lli_trunc_sem);
1119 vio->u.sendfile.cui_actor = args->u.sendfile.via_actor;
1120 vio->u.sendfile.cui_target = args->u.sendfile.via_target;
1123 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1124 vio->u.splice.cui_flags = args->u.splice.via_flags;
1127 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
1130 result = cl_io_loop(env, io);
1131 if (write_mutex_locked)
1132 mutex_unlock(&lli->lli_write_mutex);
1133 else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
1134 up_read(&lli->lli_trunc_sem);
1136 /* cl_io_rw_init() handled IO */
1137 result = io->ci_result;
1140 if (io->ci_nob > 0) {
1141 result = io->ci_nob;
1142 *ppos = io->u.ci_wr.wr.crw_pos;
1146 cl_io_fini(env, io);
1147 /* If any bit been read/written (result != 0), we just return
1148 * short read/write instead of restart io. */
1149 if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1150 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
1151 iot == CIT_READ ? "read" : "write",
1152 file->f_dentry->d_name.name, *ppos, count);
1153 LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
1157 if (iot == CIT_READ) {
1159 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1160 LPROC_LL_READ_BYTES, result);
1161 } else if (iot == CIT_WRITE) {
1163 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1164 LPROC_LL_WRITE_BYTES, result);
1165 fd->fd_write_failed = false;
1166 } else if (result != -ERESTARTSYS) {
1167 fd->fd_write_failed = true;
1176 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array and compute its total byte count, truncating
 * *nr_segs at the first inaccessible segment (mirrors the generic kernel
 * helper).  Rejects negative lengths and cumulative overflow.
 * NOTE(review): cnt's declaration/init, the EINVAL returns and the final
 * *count assignment are elided in this extraction.
 */
1178 static int ll_file_get_iov_count(const struct iovec *iov,
1179 unsigned long *nr_segs, size_t *count)
1184 for (seg = 0; seg < *nr_segs; seg++) {
1185 const struct iovec *iv = &iov[seg];
1188 * If any segment has a negative length, or the cumulative
1189 * length ever wraps negative then return -EINVAL.
1192 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1194 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1199 cnt -= iv->iov_len; /* This segment is no good */
/*
 * ->aio_read(): validate the iovec, grab a cl_env, fill the normal-IO
 * vvp args and run the common IO path for CIT_READ, advancing
 * iocb->ki_pos.
 * NOTE(review): result checks and RETURN are elided in this extraction.
 */
1206 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1207 unsigned long nr_segs, loff_t pos)
1210 struct vvp_io_args *args;
1216 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1220 env = cl_env_get(&refcheck);
1222 RETURN(PTR_ERR(env));
1224 args = vvp_env_args(env, IO_NORMAL);
1225 args->u.normal.via_iov = (struct iovec *)iov;
1226 args->u.normal.via_nrsegs = nr_segs;
1227 args->u.normal.via_iocb = iocb;
1229 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1230 &iocb->ki_pos, count);
1231 cl_env_put(env, &refcheck);
/*
 * Synchronous read(2) path: builds a single-segment iovec and a sync
 * kiocb in per-thread (vti_*) storage, delegates to ll_file_aio_read(),
 * then copies the advanced position back to *ppos.
 */
1235 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1239 struct iovec *local_iov;
1240 struct kiocb *kiocb;
1245 env = cl_env_get(&refcheck);
1247 RETURN(PTR_ERR(env));
/* scratch iovec/kiocb live in the cl_env thread info, not the stack */
1249 local_iov = &vvp_env_info(env)->vti_local_iov;
1250 kiocb = &vvp_env_info(env)->vti_kiocb;
1251 local_iov->iov_base = (void __user *)buf;
1252 local_iov->iov_len = count;
1253 init_sync_kiocb(kiocb, file);
1254 kiocb->ki_pos = *ppos;
1255 kiocb->ki_left = count;
1257 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
/* propagate the file position updated by the aio path */
1258 *ppos = kiocb->ki_pos;
1260 cl_env_put(env, &refcheck);
1265 * Write to a file (through the page cache).
/*
 * AIO write entry point; mirror of ll_file_aio_read() with CIT_WRITE:
 * validate the iovec, fill vvp_io_args and run the generic IO engine.
 */
1268 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1269 unsigned long nr_segs, loff_t pos)
1272 struct vvp_io_args *args;
1278 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1282 env = cl_env_get(&refcheck);
1284 RETURN(PTR_ERR(env));
1286 args = vvp_env_args(env, IO_NORMAL);
/* cast drops const: vvp_io_args stores a mutable iovec pointer */
1287 args->u.normal.via_iov = (struct iovec *)iov;
1288 args->u.normal.via_nrsegs = nr_segs;
1289 args->u.normal.via_iocb = iocb;
1291 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1292 &iocb->ki_pos, count);
1293 cl_env_put(env, &refcheck);
/*
 * Synchronous write(2) path; mirror of ll_file_read(): wraps the user
 * buffer in a one-segment iovec plus sync kiocb and delegates to
 * ll_file_aio_write(), then writes the new position back to *ppos.
 */
1297 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1301 struct iovec *local_iov;
1302 struct kiocb *kiocb;
1307 env = cl_env_get(&refcheck);
1309 RETURN(PTR_ERR(env));
/* scratch iovec/kiocb live in the cl_env thread info, not the stack */
1311 local_iov = &vvp_env_info(env)->vti_local_iov;
1312 kiocb = &vvp_env_info(env)->vti_kiocb;
1313 local_iov->iov_base = (void __user *)buf;
1314 local_iov->iov_len = count;
1315 init_sync_kiocb(kiocb, file);
1316 kiocb->ki_pos = *ppos;
1317 kiocb->ki_left = count;
1319 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
/* propagate the file position updated by the aio path */
1320 *ppos = kiocb->ki_pos;
1322 cl_env_put(env, &refcheck);
1327 * Send file content (through pagecache) somewhere with helper
/*
 * splice_read handler: same generic IO engine as read/write, but with
 * IO_SPLICE args carrying the destination pipe and splice flags.
 */
1329 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1330 struct pipe_inode_info *pipe, size_t count,
1334 struct vvp_io_args *args;
1339 env = cl_env_get(&refcheck);
1341 RETURN(PTR_ERR(env));
1343 args = vvp_env_args(env, IO_SPLICE);
1344 args->u.splice.via_pipe = pipe;
1345 args->u.splice.via_flags = flags;
/* splice is read-side only: always CIT_READ */
1347 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1348 cl_env_put(env, &refcheck);
/*
 * Re-create a lost OST object for this inode on the given OST index.
 * Clones the inode's stripe metadata (lsm) into a scratch copy, fills
 * an obdo describing the object to recreate (OBD_FL_RECREATE_OBJS) and
 * issues obd_create() under the inode size lock.
 */
1352 static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
1355 struct obd_export *exp = ll_i2dtexp(inode);
1356 struct obd_trans_info oti = { 0 };
1357 struct obdo *oa = NULL;
1360 struct lov_stripe_md *lsm = NULL, *lsm2;
/* take a reference on the layout; nothing to do if no objects exist */
1367 lsm = ccc_inode_lsm_get(inode);
1368 if (!lsm_has_objects(lsm))
1369 GOTO(out, rc = -ENOENT);
/* lsm is variable-sized: header plus one lov_oinfo per stripe */
1371 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1372 (lsm->lsm_stripe_count));
1374 OBD_ALLOC_LARGE(lsm2, lsm_size);
1376 GOTO(out, rc = -ENOMEM);
/* o_nlink carries the target OST index for the recreate request */
1379 oa->o_nlink = ost_idx;
1380 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1381 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1382 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1383 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1384 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1385 memcpy(lsm2, lsm, lsm_size);
/* serialize against size changes while the object is re-created */
1386 ll_inode_size_lock(inode);
1387 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1388 ll_inode_size_unlock(inode);
1390 OBD_FREE_LARGE(lsm2, lsm_size);
1393 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_RECREATE_OBJ ioctl handler: admin-only.  Copies the user's
 * ll_recreate_obj request, builds an MDT0-sequence ost_id from the
 * requested object id and forwards to ll_lov_recreate().
 */
1398 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1400 struct ll_recreate_obj ucreat;
/* object recreation is destructive; require CAP_SYS_ADMIN */
1404 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1407 if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1411 ostid_set_seq_mdt0(&oi);
1412 ostid_set_id(&oi, ucreat.lrc_id);
1413 RETURN(ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx));
/*
 * LL_IOC_RECREATE_FID ioctl handler: admin-only.  Converts a user FID
 * to an ost_id, derives the OST index from the FID sequence and
 * forwards to ll_lov_recreate().
 */
1416 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1423 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1426 if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1429 fid_to_ostid(&fid, &oi);
/* OST index is encoded in bits 16..31 of the FID sequence */
1430 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1431 RETURN(ll_lov_recreate(inode, &oi, ost_idx));
/*
 * Apply a user-supplied striping EA (lov_user_md) to a file by replaying
 * an IT_OPEN intent carrying the layout.  Fails if a stripe already
 * exists (layout is write-once); on success the open handle obtained by
 * the intent is released immediately.
 */
1434 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1435 int flags, struct lov_user_md *lum, int lum_size)
1437 struct lov_stripe_md *lsm = NULL;
1438 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
/* an existing layout cannot be replaced: bail out early */
1442 lsm = ccc_inode_lsm_get(inode);
1444 ccc_inode_lsm_put(inode, lsm);
1445 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1450 ll_inode_size_lock(inode);
1451 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1454 rc = oit.d.lustre.it_status;
1456 GOTO(out_req_free, rc);
/* the intent opened a handle on the MDS; close it right away */
1458 ll_release_openhandle(file->f_dentry, &oit);
1461 ll_inode_size_unlock(inode);
1462 ll_intent_release(&oit);
1463 ccc_inode_lsm_put(inode, lsm);
/* error path: drop the request pinned in the intent */
1466 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA (striping metadata) for @filename under @inode via
 * md_getattr_name(), validate its magic, and byte-swap it to host
 * endianness for userspace when needed.  On success *lmmp/*lmm_size
 * point into *request, which the caller must eventually release.
 */
1470 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1471 struct lov_mds_md **lmmp, int *lmm_size,
1472 struct ptlrpc_request **request)
1474 struct ll_sb_info *sbi = ll_i2sbi(inode);
1475 struct mdt_body *body;
1476 struct lov_mds_md *lmm = NULL;
1477 struct ptlrpc_request *req = NULL;
1478 struct md_op_data *op_data;
/* size the reply buffer for the largest possible MDS EA */
1481 rc = ll_get_max_mdsize(sbi, &lmmsize);
1485 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1486 strlen(filename), lmmsize,
1487 LUSTRE_OPC_ANY, NULL);
1488 if (IS_ERR(op_data))
1489 RETURN(PTR_ERR(op_data));
1491 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1492 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1493 ll_finish_md_op_data(op_data);
1495 CDEBUG(D_INFO, "md_getattr_name failed "
1496 "on %s: rc %d\n", filename, rc);
1500 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1501 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1503 lmmsize = body->eadatasize;
/* no striping EA present (or empty): report -ENODATA */
1505 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1507 GOTO(out, rc = -ENODATA);
1510 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1511 LASSERT(lmm != NULL);
/* only V1 and V3 layouts are understood here */
1513 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1514 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1515 GOTO(out, rc = -EPROTO);
1519 * This is coming from the MDS, so is probably in
1520 * little endian. We convert it to host endian before
1521 * passing it to userspace.
/* swab only needed on big-endian hosts (LOV_MAGIC differs from LE form) */
1523 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1526 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1527 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1530 /* if function called for directory - we should
1531 * avoid swab not existent lsm objects */
1532 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1533 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
/* per-object array only exists for regular files */
1534 if (S_ISREG(body->mode))
1535 lustre_swab_lov_user_md_objects(
1536 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1538 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1539 lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1540 if (S_ISREG(body->mode))
1541 lustre_swab_lov_user_md_objects(
1542 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1549 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA ioctl handler: admin-only.  Copies a raw striping EA
 * (with one ost_data entry) from userspace and applies it through
 * ll_lov_setstripe_ea_info() with MDS_OPEN_HAS_OBJS.
 */
1554 static int ll_lov_setea(struct inode *inode, struct file *file,
1557 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1558 struct lov_user_md *lump;
1559 int lum_size = sizeof(struct lov_user_md) +
1560 sizeof(struct lov_user_ost_data);
/* setting a pre-built EA references real objects: admin only */
1564 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1567 OBD_ALLOC_LARGE(lump, lum_size);
1571 if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1572 OBD_FREE_LARGE(lump, lum_size);
1576 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1578 OBD_FREE_LARGE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE ioctl handler.  Copies the user lov_user_md
 * (retrying as V3 if the magic says so), applies the layout, then on
 * success refreshes the layout generation and echoes the resulting
 * stripe info back to userspace via LL_IOC_LOV_GETSTRIPE.
 */
1582 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1585 struct lov_user_md_v3 lumv3;
/* lumv1 aliases the head of lumv3: V1 is a prefix of V3 */
1586 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1587 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1588 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1590 int flags = FMODE_WRITE;
1593 /* first try with v1 which is smaller than v3 */
1594 lum_size = sizeof(struct lov_user_md_v1);
1595 if (copy_from_user(lumv1, lumv1p, lum_size))
1598 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1599 lum_size = sizeof(struct lov_user_md_v3);
/* re-copy the full V3 structure now that we know its size */
1600 if (copy_from_user(&lumv3, lumv3p, lum_size))
1604 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1606 struct lov_stripe_md *lsm;
/* caller polls lmm_stripe_count; clear it before filling in results */
1609 put_user(0, &lumv1p->lmm_stripe_count);
1611 ll_layout_refresh(inode, &gen);
1612 lsm = ccc_inode_lsm_get(inode);
1613 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1614 0, lsm, (void *)arg);
1615 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_LOV_GETSTRIPE ioctl handler: hand the inode's layout to the
 * LOV layer, which formats it into the user buffer at @arg.
 */
1620 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1622 struct lov_stripe_md *lsm;
1626 lsm = ccc_inode_lsm_get(inode);
1628 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1630 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_GROUP_LOCK: take a Lustre group (GID) lock on the file.
 * Only one group lock per file descriptor; fd_flags/fd_grouplock are
 * protected by lli_lock, and the cl_get_grouplock() call is made outside
 * the spinlock (it may block), so a post-acquire recheck handles races.
 */
1634 int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1636 struct ll_inode_info *lli = ll_i2info(inode);
1637 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1638 struct ccc_grouplock grouplock;
/* nolock mounts cannot support group locks */
1642 if (ll_file_nolock(file))
1643 RETURN(-EOPNOTSUPP);
1645 spin_lock(&lli->lli_lock);
1646 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1647 CWARN("group lock already existed with gid %lu\n",
1648 fd->fd_grouplock.cg_gid);
1649 spin_unlock(&lli->lli_lock);
1652 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1653 spin_unlock(&lli->lli_lock);
/* may block; O_NONBLOCK makes the enqueue non-blocking */
1655 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1656 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* re-check under the lock: another thread may have won meanwhile */
1660 spin_lock(&lli->lli_lock);
1661 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1662 spin_unlock(&lli->lli_lock);
1663 CERROR("another thread just won the race\n");
1664 cl_put_grouplock(&grouplock);
1668 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1669 fd->fd_grouplock = grouplock;
1670 spin_unlock(&lli->lli_lock);
1672 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK: release the group lock held on this file
 * descriptor, verifying the caller's GID matches the one held.  The
 * fd state is cleared under lli_lock; the actual lock release happens
 * outside the spinlock on a local copy.
 */
1676 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1678 struct ll_inode_info *lli = ll_i2info(inode);
1679 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1680 struct ccc_grouplock grouplock;
1683 spin_lock(&lli->lli_lock);
1684 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1685 spin_unlock(&lli->lli_lock);
1686 CWARN("no group lock held\n");
1689 LASSERT(fd->fd_grouplock.cg_lock != NULL);
/* GID mismatch: refuse to drop someone else's group lock */
1691 if (fd->fd_grouplock.cg_gid != arg) {
1692 CWARN("group lock %lu doesn't match current id %lu\n",
1693 arg, fd->fd_grouplock.cg_gid);
1694 spin_unlock(&lli->lli_lock);
/* detach state under the spinlock, release outside it */
1698 grouplock = fd->fd_grouplock;
1699 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1700 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1701 spin_unlock(&lli->lli_lock);
1703 cl_put_grouplock(&grouplock);
1704 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1709 * Close inode open handle
1711 * \param dentry [in] dentry which contains the inode
1712 * \param it [in,out] intent which contains open info and result
1715 * \retval <0 failure
/*
 * Close the MDS open handle embedded in an open intent (used when an
 * intent opened the file but no struct file will keep the handle, e.g.
 * after setstripe).  Root dentry and intents without DISP_OPEN_OPEN are
 * no-ops; the pinned open request is dropped at the end.
 */
1717 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1719 struct inode *inode = dentry->d_inode;
1720 struct obd_client_handle *och;
1726 /* Root ? Do nothing. */
1727 if (dentry->d_inode->i_sb->s_root == dentry)
1730 /* No open handle to close? Move away */
1731 if (!it_disposition(it, DISP_OPEN_OPEN))
1734 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1736 OBD_ALLOC(och, sizeof(*och));
1738 GOTO(out, rc = -ENOMEM);
/* populate the client handle from the intent's open reply */
1740 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1742 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1745 /* this one is in place of ll_file_open */
1746 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1747 ptlrpc_req_finished(it->d.lustre.it_data);
1748 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1754 * Get size for inode for which FIEMAP mapping is requested.
1755 * Make the FIEMAP get_info call and returns the result.
/*
 * Execute a FIEMAP request for @inode: validate the flags, optionally
 * flush dirty pages (FIEMAP_FLAG_SYNC), then ask the data export for
 * the extent mapping via obd_get_info(KEY_FIEMAP).  @num_bytes is the
 * total size of the user fiemap buffer including extent slots.
 */
1757 int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1760 struct obd_export *exp = ll_i2dtexp(inode);
1761 struct lov_stripe_md *lsm = NULL;
1762 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1763 int vallen = num_bytes;
1767 /* Checks for fiemap flags */
/* report the unsupported flag bits back to the caller */
1768 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1769 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1773 /* Check for FIEMAP_FLAG_SYNC */
1774 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1775 rc = filemap_fdatawrite(inode->i_mapping);
1780 lsm = ccc_inode_lsm_get(inode);
1784 /* If the stripe_count > 1 and the application does not understand
1785 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1787 if (lsm->lsm_stripe_count > 1 &&
1788 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1789 GOTO(out, rc = -EOPNOTSUPP);
1791 fm_key.oa.o_oi = lsm->lsm_oi;
1792 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1794 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1795 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1796 /* If filesize is 0, then there would be no objects for mapping */
1797 if (fm_key.oa.o_size == 0) {
1798 fiemap->fm_mapped_extents = 0;
1802 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1804 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1807 CERROR("obd_get_info failed: rc = %d\n", rc);
1810 ccc_inode_lsm_put(inode, lsm);
/*
 * OBD_IOC_FID2PATH ioctl: resolve a FID to a path via the MDC.
 * Reads the user's getinfo_fid2path header to size the output buffer,
 * performs the iocontrol, and copies the result back.  Restricted to
 * CAP_DAC_READ_SEARCH unless the mount allows user fid2path.
 */
1814 int ll_fid2path(struct inode *inode, void *arg)
1816 struct obd_export *exp = ll_i2mdexp(inode);
1817 struct getinfo_fid2path *gfout, *gfin;
1821 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1822 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1825 /* Need to get the buflen */
1826 OBD_ALLOC_PTR(gfin);
1829 if (copy_from_user(gfin, arg, sizeof(*gfin))) {
/* output buffer is header plus user-requested path length */
1834 outsize = sizeof(*gfout) + gfin->gf_pathlen;
1835 OBD_ALLOC(gfout, outsize);
1836 if (gfout == NULL) {
1840 memcpy(gfout, gfin, sizeof(*gfout));
1843 /* Call mdc_iocontrol */
1844 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1848 if (copy_to_user(arg, gfout, outsize))
1852 OBD_FREE(gfout, outsize);
/*
 * FSFILT_IOC_FIEMAP ioctl wrapper: size and allocate a kernel fiemap
 * buffer from the user's fm_extent_count, copy the request (and, when
 * continuing a mapping, the first extent) in, run ll_do_fiemap(), and
 * copy the header plus mapped extents back to userspace.
 */
1856 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1858 struct ll_user_fiemap *fiemap_s;
1859 size_t num_bytes, ret_bytes;
1860 unsigned int extent_count;
1863 /* Get the extent count so we can calculate the size of
1864 * required fiemap buffer */
1865 if (get_user(extent_count,
1866 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
/* NOTE(review): extent_count * sizeof(extent) is unchecked here and
 * could overflow for huge user-supplied counts — verify upstream fix */
1868 num_bytes = sizeof(*fiemap_s) + (extent_count *
1869 sizeof(struct ll_fiemap_extent));
1871 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1872 if (fiemap_s == NULL)
1875 /* get the fiemap value */
1876 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1878 GOTO(error, rc = -EFAULT);
1880 /* If fm_extent_count is non-zero, read the first extent since
1881 * it is used to calculate end_offset and device from previous
1884 if (copy_from_user(&fiemap_s->fm_extents[0],
1885 (char __user *)arg + sizeof(*fiemap_s),
1886 sizeof(struct ll_fiemap_extent)))
1887 GOTO(error, rc = -EFAULT);
1890 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
/* copy back header, plus extents only if the caller asked for them */
1894 ret_bytes = sizeof(struct ll_user_fiemap);
1896 if (extent_count != 0)
1897 ret_bytes += (fiemap_s->fm_mapped_extents *
1898 sizeof(struct ll_fiemap_extent));
1900 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1904 OBD_FREE_LARGE(fiemap_s, num_bytes);
1909 * Read the data_version for inode.
1911 * This value is computed using stripe object version on OST.
1912 * Version is computed using server side locking.
1914 * @param extent_lock Take extent lock. Not needed if a process is already
1915 * holding the OST object group locks.
/*
 * Compute the data version of @inode from its stripe objects via
 * ll_lsm_getattr().  A file with no objects reports version 0 (by
 * leaving *data_version untouched at the caller's initial value —
 * NOTE(review): relies on callers pre-zeroing it; confirm).
 * @extent_lock requests server-side extent locking during the getattr.
 */
1917 int ll_data_version(struct inode *inode, __u64 *data_version,
1920 struct lov_stripe_md *lsm = NULL;
1921 struct ll_sb_info *sbi = ll_i2sbi(inode);
1922 struct obdo *obdo = NULL;
1926 /* If no stripe, we consider version is 0. */
1927 lsm = ccc_inode_lsm_get(inode);
1928 if (!lsm_has_objects(lsm)) {
1930 CDEBUG(D_INODE, "No object for inode\n");
1934 OBD_ALLOC_PTR(obdo);
1936 GOTO(out, rc = -ENOMEM);
1938 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
/* only trust o_data_version if the OST flagged it valid */
1940 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1943 *data_version = obdo->o_data_version;
1949 ccc_inode_lsm_put(inode, lsm);
1954 * Trigger a HSM release request for the provided inode.
/*
 * Trigger an HSM release for @inode: take a write lease with
 * MDS_OPEN_RELEASE, snapshot the data version, merge the latest
 * size/time attributes, then close the handle — the close carries the
 * release to the MDT (lease handle is packed by mdc_hsm_release_pack).
 */
1956 int ll_hsm_release(struct inode *inode)
1958 struct cl_env_nest nest;
1960 struct obd_client_handle *och = NULL;
1961 __u64 data_version = 0;
1965 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
1966 ll_get_fsname(inode->i_sb, NULL, 0),
1967 PFID(&ll_i2info(inode)->lli_fid));
1969 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
1971 GOTO(out, rc = PTR_ERR(och));
1973 /* Grab latest data_version and [am]time values */
1974 rc = ll_data_version(inode, &data_version, 1);
1978 env = cl_env_nested_get(&nest);
1980 GOTO(out, rc = PTR_ERR(env));
/* pull current size/times from the OSTs into the inode */
1982 ll_merge_lvb(env, inode);
1983 cl_env_nested_put(&nest, env);
1985 /* Release the file.
1986 * NB: lease lock handle is released in mdc_hsm_release_pack() because
1987 * we still need it to pack l_remote_handle to MDT. */
1988 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
/* on the error path close the lease handle ourselves */
1994 if (och != NULL && !IS_ERR(och)) /* close the file */
1995 ll_lease_close(och, inode, NULL);
/*
 * Scratch state for ll_swap_layouts(): the two inodes, saved
 * mtime/atime attrs for optional restoration, and per-file
 * data-version check flags/values (swappable as a unit).
 */
2000 struct ll_swap_stack {
2001 struct iattr ia1, ia2;
2003 struct inode *inode1, *inode2;
2004 bool check_dv1, check_dv2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS implementation: atomically swap the layouts
 * of two regular files on the same filesystem.  Steps: permission and
 * same-sb checks; order the pair by FID to avoid lock inversion;
 * optionally take group locks (gid != 0) to flush dirty cache; verify
 * data versions if requested; send the swap to the MDT via
 * obd_iocontrol; finally restore mtime/atime if the caller asked.
 */
2007 static int ll_swap_layouts(struct file *file1, struct file *file2,
2008 struct lustre_swap_layouts *lsl)
2010 struct mdc_swap_layouts msl;
2011 struct md_op_data *op_data;
2014 struct ll_swap_stack *llss = NULL;
2017 OBD_ALLOC_PTR(llss);
2021 llss->inode1 = file1->f_dentry->d_inode;
2022 llss->inode2 = file2->f_dentry->d_inode;
/* layouts only exist on regular files */
2024 if (!S_ISREG(llss->inode2->i_mode))
2025 GOTO(free, rc = -EINVAL);
2027 if (inode_permission(llss->inode1, MAY_WRITE) ||
2028 inode_permission(llss->inode2, MAY_WRITE))
2029 GOTO(free, rc = -EPERM);
2031 if (llss->inode2->i_sb != llss->inode1->i_sb)
2032 GOTO(free, rc = -EXDEV);
2034 /* we use 2 bool because it is easier to swap than 2 bits */
2035 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2036 llss->check_dv1 = true;
2038 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2039 llss->check_dv2 = true;
2041 /* we cannot use lsl->sl_dvX directly because we may swap them */
2042 llss->dv1 = lsl->sl_dv1;
2043 llss->dv2 = lsl->sl_dv2;
2045 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2046 if (rc == 0) /* same file, done! */
/* canonical FID order prevents ABBA deadlock on the group locks */
2049 if (rc < 0) { /* sequentialize it */
2050 swap(llss->inode1, llss->inode2);
2052 swap(llss->dv1, llss->dv2);
2053 swap(llss->check_dv1, llss->check_dv2);
2057 if (gid != 0) { /* application asks to flush dirty cache */
2058 rc = ll_get_grouplock(llss->inode1, file1, gid);
2062 rc = ll_get_grouplock(llss->inode2, file2, gid);
2064 ll_put_grouplock(llss->inode1, file1, gid);
2069 /* to be able to restore mtime and atime after swap
2070 * we need to first save them */
2072 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2073 llss->ia1.ia_mtime = llss->inode1->i_mtime;
2074 llss->ia1.ia_atime = llss->inode1->i_atime;
2075 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2076 llss->ia2.ia_mtime = llss->inode2->i_mtime;
2077 llss->ia2.ia_atime = llss->inode2->i_atime;
2078 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2081 /* ultimate check, before swaping the layouts we check if
2082 * dataversion has changed (if requested) */
2083 if (llss->check_dv1) {
2084 rc = ll_data_version(llss->inode1, &dv, 0);
2087 if (dv != llss->dv1)
2088 GOTO(putgl, rc = -EAGAIN);
2091 if (llss->check_dv2) {
2092 rc = ll_data_version(llss->inode2, &dv, 0);
2095 if (dv != llss->dv2)
2096 GOTO(putgl, rc = -EAGAIN);
2099 /* struct md_op_data is used to send the swap args to the mdt
2100 * only flags is missing, so we use struct mdc_swap_layouts
2101 * through the md_op_data->op_data */
2102 /* flags from user space have to be converted before they are send to
2103 * server, no flag is sent today, they are only used on the client */
2106 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2107 0, LUSTRE_OPC_ANY, &msl);
2108 if (IS_ERR(op_data))
2109 GOTO(free, rc = PTR_ERR(op_data));
2111 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2112 sizeof(*op_data), op_data, NULL);
2113 ll_finish_md_op_data(op_data);
/* release group locks in reverse acquisition order */
2117 ll_put_grouplock(llss->inode2, file2, gid);
2118 ll_put_grouplock(llss->inode1, file1, gid);
2121 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2125 /* clear useless flags */
2126 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2127 llss->ia1.ia_valid &= ~ATTR_MTIME;
2128 llss->ia2.ia_valid &= ~ATTR_MTIME;
2131 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2132 llss->ia1.ia_valid &= ~ATTR_ATIME;
2133 llss->ia2.ia_valid &= ~ATTR_ATIME;
2136 /* update time if requested */
/* note the cross-application: ia2 (saved from inode2) goes to file1
 * and ia1 to file2, since the files' contents were swapped */
2138 if (llss->ia2.ia_valid != 0) {
2139 mutex_lock(&llss->inode1->i_mutex);
2140 rc = ll_setattr(file1->f_dentry, &llss->ia2);
2141 mutex_unlock(&llss->inode1->i_mutex);
2144 if (llss->ia1.ia_valid != 0) {
2147 mutex_lock(&llss->inode2->i_mutex);
2148 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2149 mutex_unlock(&llss->inode2->i_mutex);
/*
 * Main ioctl dispatcher for Lustre regular files.  Handles file-flag
 * manipulation, striping get/set, layout swap, fiemap, group locks,
 * FID/path translation, data version, HSM state/action, and lease
 * get/set; anything unrecognized falls through to the generic
 * ll_iocontrol_call()/obd_iocontrol() path at the bottom.
 */
2161 long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2163 struct inode *inode = file->f_dentry->d_inode;
2164 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2168 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2169 inode->i_generation, inode, cmd);
2170 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2172 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2173 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2177 case LL_IOC_GETFLAGS:
2178 /* Get the current value of the file flags */
2179 return put_user(fd->fd_flags, (int *)arg);
2180 case LL_IOC_SETFLAGS:
2181 case LL_IOC_CLRFLAGS:
2182 /* Set or clear specific file flags */
2183 /* XXX This probably needs checks to ensure the flags are
2184 * not abused, and to handle any flag side effects.
2186 if (get_user(flags, (int *) arg))
2189 if (cmd == LL_IOC_SETFLAGS) {
/* locking may only be disabled for O_DIRECT IO */
2190 if ((flags & LL_FILE_IGNORE_LOCK) &&
2191 !(file->f_flags & O_DIRECT)) {
2192 CERROR("%s: unable to disable locking on "
2193 "non-O_DIRECT file\n", current->comm);
2197 fd->fd_flags |= flags;
2199 fd->fd_flags &= ~flags;
2202 case LL_IOC_LOV_SETSTRIPE:
2203 RETURN(ll_lov_setstripe(inode, file, arg));
2204 case LL_IOC_LOV_SETEA:
2205 RETURN(ll_lov_setea(inode, file, arg));
2206 case LL_IOC_LOV_SWAP_LAYOUTS: {
2208 struct lustre_swap_layouts lsl;
2210 if (copy_from_user(&lsl, (char *)arg,
2211 sizeof(struct lustre_swap_layouts)))
/* both files must be writable for a layout swap */
2214 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2217 file2 = fget(lsl.sl_fd);
2222 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2223 rc = ll_swap_layouts(file, file2, &lsl);
2227 case LL_IOC_LOV_GETSTRIPE:
2228 RETURN(ll_lov_getstripe(inode, arg));
2229 case LL_IOC_RECREATE_OBJ:
2230 RETURN(ll_lov_recreate_obj(inode, arg));
2231 case LL_IOC_RECREATE_FID:
2232 RETURN(ll_lov_recreate_fid(inode, arg));
2233 case FSFILT_IOC_FIEMAP:
2234 RETURN(ll_ioctl_fiemap(inode, arg));
2235 case FSFILT_IOC_GETFLAGS:
2236 case FSFILT_IOC_SETFLAGS:
2237 RETURN(ll_iocontrol(inode, file, cmd, arg));
2238 case FSFILT_IOC_GETVERSION_OLD:
2239 case FSFILT_IOC_GETVERSION:
2240 RETURN(put_user(inode->i_generation, (int *)arg));
2241 case LL_IOC_GROUP_LOCK:
2242 RETURN(ll_get_grouplock(inode, file, arg));
2243 case LL_IOC_GROUP_UNLOCK:
2244 RETURN(ll_put_grouplock(inode, file, arg));
2245 case IOC_OBD_STATFS:
2246 RETURN(ll_obd_statfs(inode, (void *)arg));
2248 /* We need to special case any other ioctls we want to handle,
2249 * to send them to the MDS/OST as appropriate and to properly
2250 * network encode the arg field.
2251 case FSFILT_IOC_SETVERSION_OLD:
2252 case FSFILT_IOC_SETVERSION:
2254 case LL_IOC_FLUSHCTX:
2255 RETURN(ll_flush_ctx(inode));
2256 case LL_IOC_PATH2FID: {
2257 if (copy_to_user((void *)arg, ll_inode2fid(inode),
2258 sizeof(struct lu_fid)))
2263 case OBD_IOC_FID2PATH:
2264 RETURN(ll_fid2path(inode, (void *)arg));
2265 case LL_IOC_DATA_VERSION: {
2266 struct ioc_data_version idv;
2269 if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
/* LL_DV_NOFLUSH skips the extent-lock flush on the OSTs */
2272 rc = ll_data_version(inode, &idv.idv_version,
2273 !(idv.idv_flags & LL_DV_NOFLUSH));
2275 if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2281 case LL_IOC_GET_MDTIDX: {
2284 mdtidx = ll_get_mdt_idx(inode);
2288 if (put_user((int)mdtidx, (int*)arg))
2293 case OBD_IOC_GETDTNAME:
2294 case OBD_IOC_GETMDNAME:
2295 RETURN(ll_get_obd_name(inode, cmd, arg));
2296 case LL_IOC_HSM_STATE_GET: {
2297 struct md_op_data *op_data;
2298 struct hsm_user_state *hus;
2305 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2306 LUSTRE_OPC_ANY, hus);
2307 if (IS_ERR(op_data)) {
2309 RETURN(PTR_ERR(op_data));
2312 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2315 if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2318 ll_finish_md_op_data(op_data);
2322 case LL_IOC_HSM_STATE_SET: {
2323 struct md_op_data *op_data;
2324 struct hsm_state_set *hss;
2330 if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2335 /* Non-root users are forbidden to set or clear flags which are
2336 * NOT defined in HSM_USER_MASK. */
2337 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK)
2338 && !cfs_capable(CFS_CAP_SYS_ADMIN)) {
2343 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2344 LUSTRE_OPC_ANY, hss);
2345 if (IS_ERR(op_data)) {
2347 RETURN(PTR_ERR(op_data));
2350 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2353 ll_finish_md_op_data(op_data);
2358 case LL_IOC_HSM_ACTION: {
2359 struct md_op_data *op_data;
2360 struct hsm_current_action *hca;
2367 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2368 LUSTRE_OPC_ANY, hca);
2369 if (IS_ERR(op_data)) {
2371 RETURN(PTR_ERR(op_data));
2374 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2377 if (copy_to_user((char *)arg, hca, sizeof(*hca)))
2380 ll_finish_md_op_data(op_data);
2384 case LL_IOC_SET_LEASE: {
2385 struct ll_inode_info *lli = ll_i2info(inode);
2386 struct obd_client_handle *och = NULL;
/* lease mode must match the file's open mode */
2392 if (!(file->f_mode & FMODE_WRITE))
2397 if (!(file->f_mode & FMODE_READ))
2402 mutex_lock(&lli->lli_och_mutex);
2403 if (fd->fd_lease_och != NULL) {
2404 och = fd->fd_lease_och;
2405 fd->fd_lease_och = NULL;
2407 mutex_unlock(&lli->lli_och_mutex);
2410 mode = och->och_flags &(FMODE_READ|FMODE_WRITE);
2411 rc = ll_lease_close(och, inode, &lease_broken);
2412 if (rc == 0 && lease_broken)
2418 /* return the type of lease or error */
2419 RETURN(rc < 0 ? rc : (int)mode);
2424 CDEBUG(D_INODE, "Set lease with mode %d\n", mode);
2426 /* apply for lease */
2427 och = ll_lease_open(inode, file, mode, 0);
2429 RETURN(PTR_ERR(och));
2432 mutex_lock(&lli->lli_och_mutex);
2433 if (fd->fd_lease_och == NULL) {
2434 fd->fd_lease_och = och;
2437 mutex_unlock(&lli->lli_och_mutex);
2439 /* impossible now that only excl is supported for now */
2440 ll_lease_close(och, inode, &lease_broken);
2445 case LL_IOC_GET_LEASE: {
2446 struct ll_inode_info *lli = ll_i2info(inode);
2447 struct ldlm_lock *lock = NULL;
2450 mutex_lock(&lli->lli_och_mutex);
2451 if (fd->fd_lease_och != NULL) {
2452 struct obd_client_handle *och = fd->fd_lease_och;
/* a cancelled lease lock means the lease is already gone */
2454 lock = ldlm_handle2lock(&och->och_lease_handle);
2456 lock_res_and_lock(lock);
2457 if (!ldlm_is_cancel(lock))
2458 rc = och->och_flags &
2459 (FMODE_READ | FMODE_WRITE);
2460 unlock_res_and_lock(lock);
2461 ldlm_lock_put(lock);
2464 mutex_unlock(&lli->lli_och_mutex);
/* unknown command: try registered handlers, then the data export */
2472 ll_iocontrol_call(inode, file, cmd, arg, &err))
2475 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2481 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Validate @offset against sign/maxsize limits and commit it to
 * file->f_pos (resetting f_version) if it actually changed.
 * Local fallback for kernels without generic_file_llseek_size().
 */
2482 static inline loff_t
2483 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2485 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2487 if (offset > maxsize)
2490 if (offset != file->f_pos) {
2491 file->f_pos = offset;
/* f_version is a readdir cookie; invalidate it on seek */
2492 file->f_version = 0;
/*
 * Backport of the kernel's generic_file_llseek_size(): llseek with an
 * explicit maxsize and eof, handling SEEK_CUR without rewriting an
 * unchanged f_pos and treating SEEK_DATA/SEEK_HOLE against a file
 * that is all data with a virtual hole at eof.
 */
2498 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2499 loff_t maxsize, loff_t eof)
2501 struct inode *inode = file->f_dentry->d_inode;
2509 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2510 * position-querying operation. Avoid rewriting the "same"
2511 * f_pos value back to the file because a concurrent read(),
2512 * write() or lseek() might have altered it
2517 * f_lock protects against read/modify/write race with other
2518 * SEEK_CURs. Note that parallel writes and reads behave
/* i_mutex serializes the read-modify-write of f_pos for SEEK_CUR */
2521 mutex_lock(&inode->i_mutex);
2522 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2523 mutex_unlock(&inode->i_mutex);
2527 * In the generic case the entire file is data, so as long as
2528 * offset isn't at the end of the file then the offset is data.
2535 * There is a virtual hole at the end of the file, so as long as
2536 * offset isn't i_size or larger, return i_size.
2544 return llseek_execute(file, offset, maxsize);
/*
 * llseek handler: for SEEK_END/SEEK_HOLE/SEEK_DATA first glimpse the
 * file size from the OSTs so i_size is current, then delegate to the
 * generic size-aware llseek with Lustre's maxbytes limit.
 */
2548 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2550 struct inode *inode = file->f_dentry->d_inode;
2551 loff_t retval, eof = 0;
/* retval here is only the target offset for the trace message */
2554 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2555 (origin == SEEK_CUR) ? file->f_pos : 0);
2556 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
2557 inode->i_ino, inode->i_generation, inode, retval, retval,
2559 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2561 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
/* refresh i_size from the OSTs before using it as eof */
2562 retval = ll_glimpse_size(inode);
2565 eof = i_size_read(inode);
2568 retval = ll_generic_file_llseek_size(file, offset, origin,
2569 ll_file_maxbytes(inode), eof);
/*
 * flush handler (called on close(2)): harvest async writeback errors
 * recorded on the inode and its cl_object, and report -EIO once —
 * unless the application was already told about the write failure
 * via fd_write_failed.
 */
2573 int ll_flush(struct file *file, fl_owner_t id)
2575 struct inode *inode = file->f_dentry->d_inode;
2576 struct ll_inode_info *lli = ll_i2info(inode);
2577 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2580 LASSERT(!S_ISDIR(inode->i_mode));
2582 /* catch async errors that were recorded back when async writeback
2583 * failed for pages in this mapping. */
/* consume-and-clear: each error is reported at most once */
2584 rc = lli->lli_async_rc;
2585 lli->lli_async_rc = 0;
2586 err = lov_read_and_clear_async_rc(lli->lli_clob);
2590 /* The application has been told write failure already.
2591 * Do not report failure again. */
2592 if (fd->fd_write_failed)
2594 return rc ? -EIO : 0;
2598 * Called to make sure a portion of file has been written out.
2599 * if @local_only is not true, it will send OST_SYNC RPCs to ost.
2601 * Return how many pages have been written.
/*
 * Sync the byte range [start, end] of @inode through the cl_io FSYNC
 * machinery.  @mode selects local flush, OST sync, discard, or all;
 * @ignore_layout lets the IO proceed across a layout change.
 * Returns the number of pages written on success, negative errno
 * otherwise.
 */
2603 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2604 enum cl_fsync_mode mode, int ignore_layout)
2606 struct cl_env_nest nest;
2609 struct obd_capa *capa = NULL;
2610 struct cl_fsync_io *fio;
/* reject unknown fsync modes up front */
2614 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2615 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2618 env = cl_env_nested_get(&nest);
2620 RETURN(PTR_ERR(env));
/* OSS write capability accompanies the sync request */
2622 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2624 io = ccc_env_thread_io(env);
2625 io->ci_obj = cl_i2info(inode)->lli_clob;
2626 io->ci_ignore_layout = ignore_layout;
2628 /* initialize parameters for sync */
2629 fio = &io->u.ci_fsync;
2630 fio->fi_capa = capa;
2631 fio->fi_start = start;
2633 fio->fi_fid = ll_inode2fid(inode);
2634 fio->fi_mode = mode;
2635 fio->fi_nr_written = 0;
2637 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2638 result = cl_io_loop(env, io);
2640 result = io->ci_result;
/* success: report pages written rather than 0 */
2642 result = fio->fi_nr_written;
2643 cl_io_fini(env, io);
2644 cl_env_nested_put(&nest, env);
2652 * When dentry is provided (the 'else' case), *file->f_dentry may be
2653 * null and dentry must be used directly rather than pulled from
2654 * *file->f_dentry as is done otherwise.
2657 #ifdef HAVE_FILE_FSYNC_4ARGS
2658 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2660 struct dentry *dentry = file->f_dentry;
2661 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2662 int ll_fsync(struct file *file, int datasync)
2664 struct dentry *dentry = file->f_dentry;
2666 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2669 struct inode *inode = dentry->d_inode;
2670 struct ll_inode_info *lli = ll_i2info(inode);
2671 struct ptlrpc_request *req;
2672 struct obd_capa *oc;
2676 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2677 inode->i_generation, inode);
2678 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2680 #ifdef HAVE_FILE_FSYNC_4ARGS
2681 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2682 mutex_lock(&inode->i_mutex);
2684 /* fsync's caller has already called _fdata{sync,write}, we want
2685 * that IO to finish before calling the osc and mdc sync methods */
2686 rc = filemap_fdatawait(inode->i_mapping);
2689 /* catch async errors that were recorded back when async writeback
2690 * failed for pages in this mapping. */
2691 if (!S_ISDIR(inode->i_mode)) {
2692 err = lli->lli_async_rc;
2693 lli->lli_async_rc = 0;
2696 err = lov_read_and_clear_async_rc(lli->lli_clob);
2701 oc = ll_mdscapa_get(inode);
2702 err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2708 ptlrpc_req_finished(req);
2710 if (datasync && S_ISREG(inode->i_mode)) {
2711 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2713 err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
2715 if (rc == 0 && err < 0)
2718 fd->fd_write_failed = true;
2720 fd->fd_write_failed = false;
2723 #ifdef HAVE_FILE_FSYNC_4ARGS
2724 mutex_unlock(&inode->i_mutex);
2729 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2731 struct inode *inode = file->f_dentry->d_inode;
2732 struct ll_sb_info *sbi = ll_i2sbi(inode);
2733 struct ldlm_enqueue_info einfo = {
2734 .ei_type = LDLM_FLOCK,
2735 .ei_cb_cp = ldlm_flock_completion_ast,
2736 .ei_cbdata = file_lock,
2738 struct md_op_data *op_data;
2739 struct lustre_handle lockh = {0};
2740 ldlm_policy_data_t flock = {{0}};
2746 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2747 inode->i_ino, file_lock);
2749 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2751 if (file_lock->fl_flags & FL_FLOCK) {
2752 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2753 /* flocks are whole-file locks */
2754 flock.l_flock.end = OFFSET_MAX;
2755 /* For flocks owner is determined by the local file desctiptor*/
2756 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2757 } else if (file_lock->fl_flags & FL_POSIX) {
2758 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2759 flock.l_flock.start = file_lock->fl_start;
2760 flock.l_flock.end = file_lock->fl_end;
2764 flock.l_flock.pid = file_lock->fl_pid;
2766 /* Somewhat ugly workaround for svc lockd.
2767 * lockd installs custom fl_lmops->lm_compare_owner that checks
2768 * for the fl_owner to be the same (which it always is on local node
2769 * I guess between lockd processes) and then compares pid.
2770 * As such we assign pid to the owner field to make it all work,
2771 * conflict with normal locks is unlikely since pid space and
2772 * pointer space for current->files are not intersecting */
2773 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2774 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
2776 switch (file_lock->fl_type) {
2778 einfo.ei_mode = LCK_PR;
2781 /* An unlock request may or may not have any relation to
2782 * existing locks so we may not be able to pass a lock handle
2783 * via a normal ldlm_lock_cancel() request. The request may even
2784 * unlock a byte range in the middle of an existing lock. In
2785 * order to process an unlock request we need all of the same
2786 * information that is given with a normal read or write record
2787 * lock request. To avoid creating another ldlm unlock (cancel)
2788 * message we'll treat a LCK_NL flock request as an unlock. */
2789 einfo.ei_mode = LCK_NL;
2792 einfo.ei_mode = LCK_PW;
2795 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2796 file_lock->fl_type);
2811 flags = LDLM_FL_BLOCK_NOWAIT;
2817 flags = LDLM_FL_TEST_LOCK;
2818 /* Save the old mode so that if the mode in the lock changes we
2819 * can decrement the appropriate reader or writer refcount. */
2820 file_lock->fl_type = einfo.ei_mode;
2823 CERROR("unknown fcntl lock command: %d\n", cmd);
2827 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2828 LUSTRE_OPC_ANY, NULL);
2829 if (IS_ERR(op_data))
2830 RETURN(PTR_ERR(op_data));
2832 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2833 "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2834 flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2836 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2837 op_data, &lockh, &flock, 0, NULL /* req */, flags);
2839 if ((file_lock->fl_flags & FL_FLOCK) &&
2840 (rc == 0 || file_lock->fl_type == F_UNLCK))
2841 rc2 = flock_lock_file_wait(file, file_lock);
2842 if ((file_lock->fl_flags & FL_POSIX) &&
2843 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2844 !(flags & LDLM_FL_TEST_LOCK))
2845 rc2 = posix_lock_file_wait(file, file_lock);
2847 if (rc2 && file_lock->fl_type != F_UNLCK) {
2848 einfo.ei_mode = LCK_NL;
2849 md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2850 op_data, &lockh, &flock, 0, NULL /* req */, flags);
2854 ll_finish_md_op_data(op_data);
2859 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2867 * test if some locks matching bits and l_req_mode are acquired
2868 * - bits can be in different locks
2869 * - if found clear the common lock bits in *bits
2870 * - the bits not found, are kept in *bits
2872 * \param bits [IN] searched lock bits
2873 * \param l_req_mode [IN] searched lock mode
2874 * \retval boolean, true iff all bits are found
2876 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
2878 struct lustre_handle lockh;
2879 ldlm_policy_data_t policy;
2880 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2881 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2890 fid = &ll_i2info(inode)->lli_fid;
2891 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2892 ldlm_lockname[mode]);
2894 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2895 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2896 policy.l_inodebits.bits = *bits & (1 << i);
2897 if (policy.l_inodebits.bits == 0)
2900 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2901 &policy, mode, &lockh)) {
2902 struct ldlm_lock *lock;
2904 lock = ldlm_handle2lock(&lockh);
2907 ~(lock->l_policy_data.l_inodebits.bits);
2908 LDLM_LOCK_PUT(lock);
2910 *bits &= ~policy.l_inodebits.bits;
2917 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2918 struct lustre_handle *lockh, __u64 flags,
2921 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2926 fid = &ll_i2info(inode)->lli_fid;
2927 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2929 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
2930 fid, LDLM_IBITS, &policy, mode, lockh);
2935 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2937 /* Already unlinked. Just update nlink and return success */
2938 if (rc == -ENOENT) {
2940 /* This path cannot be hit for regular files unless in
2941 * case of obscure races, so no need to to validate
2943 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2945 } else if (rc != 0) {
2946 CERROR("%s: revalidate FID "DFID" error: rc = %d\n",
2947 ll_get_fsname(inode->i_sb, NULL, 0),
2948 PFID(ll_inode2fid(inode)), rc);
2954 int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2957 struct inode *inode = dentry->d_inode;
2958 struct ptlrpc_request *req = NULL;
2959 struct obd_export *exp;
2963 LASSERT(inode != NULL);
2965 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2966 inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2968 exp = ll_i2mdexp(inode);
2970 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
2971 * But under CMD case, it caused some lock issues, should be fixed
2972 * with new CMD ibits lock. See bug 12718 */
2973 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
2974 struct lookup_intent oit = { .it_op = IT_GETATTR };
2975 struct md_op_data *op_data;
2977 if (ibits == MDS_INODELOCK_LOOKUP)
2978 oit.it_op = IT_LOOKUP;
2980 /* Call getattr by fid, so do not provide name at all. */
2981 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2982 dentry->d_inode, NULL, 0, 0,
2983 LUSTRE_OPC_ANY, NULL);
2984 if (IS_ERR(op_data))
2985 RETURN(PTR_ERR(op_data));
2987 oit.it_create_mode |= M_CHECK_STALE;
2988 rc = md_intent_lock(exp, op_data, NULL, 0,
2989 /* we are not interested in name
2992 ll_md_blocking_ast, 0);
2993 ll_finish_md_op_data(op_data);
2994 oit.it_create_mode &= ~M_CHECK_STALE;
2996 rc = ll_inode_revalidate_fini(inode, rc);
3000 rc = ll_revalidate_it_finish(req, &oit, dentry);
3002 ll_intent_release(&oit);
3006 /* Unlinked? Unhash dentry, so it is not picked up later by
3007 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3008 here to preserve get_cwd functionality on 2.6.
3010 if (!dentry->d_inode->i_nlink)
3011 d_lustre_invalidate(dentry, 0);
3013 ll_lookup_finish_locks(&oit, dentry);
3014 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3015 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3016 obd_valid valid = OBD_MD_FLGETATTR;
3017 struct md_op_data *op_data;
3020 if (S_ISREG(inode->i_mode)) {
3021 rc = ll_get_max_mdsize(sbi, &ealen);
3024 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3027 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3028 0, ealen, LUSTRE_OPC_ANY,
3030 if (IS_ERR(op_data))
3031 RETURN(PTR_ERR(op_data));
3033 op_data->op_valid = valid;
3034 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
3035 * capa for this inode. Because we only keep capas of dirs
3037 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3038 ll_finish_md_op_data(op_data);
3040 rc = ll_inode_revalidate_fini(inode, rc);
3044 rc = ll_prep_inode(&inode, req, NULL, NULL);
3047 ptlrpc_req_finished(req);
3051 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
3054 struct inode *inode = dentry->d_inode;
3058 rc = __ll_inode_revalidate_it(dentry, it, ibits);
3062 /* if object isn't regular file, don't validate size */
3063 if (!S_ISREG(inode->i_mode)) {
3064 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
3065 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
3066 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
3068 /* In case of restore, the MDT has the right size and has
3069 * already send it back without granting the layout lock,
3070 * inode is up-to-date so glimpse is useless.
3071 * Also to glimpse we need the layout, in case of a running
3072 * restore the MDT holds the layout lock so the glimpse will
3073 * block up to the end of restore (getattr will block)
3075 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3076 rc = ll_glimpse_size(inode);
3081 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
3082 struct lookup_intent *it, struct kstat *stat)
3084 struct inode *inode = de->d_inode;
3085 struct ll_sb_info *sbi = ll_i2sbi(inode);
3086 struct ll_inode_info *lli = ll_i2info(inode);
3089 res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE |
3090 MDS_INODELOCK_LOOKUP);
3091 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3096 stat->dev = inode->i_sb->s_dev;
3097 if (ll_need_32bit_api(sbi))
3098 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3100 stat->ino = inode->i_ino;
3101 stat->mode = inode->i_mode;
3102 stat->nlink = inode->i_nlink;
3103 stat->uid = inode->i_uid;
3104 stat->gid = inode->i_gid;
3105 stat->rdev = inode->i_rdev;
3106 stat->atime = inode->i_atime;
3107 stat->mtime = inode->i_mtime;
3108 stat->ctime = inode->i_ctime;
3109 stat->blksize = 1 << inode->i_blkbits;
3111 stat->size = i_size_read(inode);
3112 stat->blocks = inode->i_blocks;
3116 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3118 struct lookup_intent it = { .it_op = IT_GETATTR };
3120 return ll_getattr_it(mnt, de, &it, stat);
3123 #ifdef HAVE_LINUX_FIEMAP_H
3124 int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3125 __u64 start, __u64 len)
3129 struct ll_user_fiemap *fiemap;
3130 unsigned int extent_count = fieinfo->fi_extents_max;
3132 num_bytes = sizeof(*fiemap) + (extent_count *
3133 sizeof(struct ll_fiemap_extent));
3134 OBD_ALLOC_LARGE(fiemap, num_bytes);
3139 fiemap->fm_flags = fieinfo->fi_flags;
3140 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3141 fiemap->fm_start = start;
3142 fiemap->fm_length = len;
3143 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3144 sizeof(struct ll_fiemap_extent));
3146 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3148 fieinfo->fi_flags = fiemap->fm_flags;
3149 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3150 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3151 fiemap->fm_mapped_extents * sizeof(struct ll_fiemap_extent));
3153 OBD_FREE_LARGE(fiemap, num_bytes);
3158 struct posix_acl * ll_get_acl(struct inode *inode, int type)
3160 struct ll_inode_info *lli = ll_i2info(inode);
3161 struct posix_acl *acl = NULL;
3164 spin_lock(&lli->lli_lock);
3165 /* VFS' acl_permission_check->check_acl will release the refcount */
3166 acl = posix_acl_dup(lli->lli_posix_acl);
3167 spin_unlock(&lli->lli_lock);
3172 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
3174 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3175 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3177 ll_check_acl(struct inode *inode, int mask)
3180 # ifdef CONFIG_FS_POSIX_ACL
3181 struct posix_acl *acl;
3185 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3186 if (flags & IPERM_FLAG_RCU)
3189 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3194 rc = posix_acl_permission(inode, acl, mask);
3195 posix_acl_release(acl);
3198 # else /* !CONFIG_FS_POSIX_ACL */
3200 # endif /* CONFIG_FS_POSIX_ACL */
3202 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
3204 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3205 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3207 # ifdef HAVE_INODE_PERMISION_2ARGS
3208 int ll_inode_permission(struct inode *inode, int mask)
3210 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3217 #ifdef MAY_NOT_BLOCK
3218 if (mask & MAY_NOT_BLOCK)
3220 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3221 if (flags & IPERM_FLAG_RCU)
3225 /* as root inode are NOT getting validated in lookup operation,
3226 * need to do it before permission check. */
3228 if (inode == inode->i_sb->s_root->d_inode) {
3229 struct lookup_intent it = { .it_op = IT_LOOKUP };
3231 rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
3232 MDS_INODELOCK_LOOKUP);
3237 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
3238 inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
3240 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
3241 return lustre_check_remote_perm(inode, mask);
3243 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
3244 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3249 /* -o localflock - only provides locally consistent flock locks */
3250 struct file_operations ll_file_operations = {
3251 .read = ll_file_read,
3252 .aio_read = ll_file_aio_read,
3253 .write = ll_file_write,
3254 .aio_write = ll_file_aio_write,
3255 .unlocked_ioctl = ll_file_ioctl,
3256 .open = ll_file_open,
3257 .release = ll_file_release,
3258 .mmap = ll_file_mmap,
3259 .llseek = ll_file_seek,
3260 .splice_read = ll_file_splice_read,
3265 struct file_operations ll_file_operations_flock = {
3266 .read = ll_file_read,
3267 .aio_read = ll_file_aio_read,
3268 .write = ll_file_write,
3269 .aio_write = ll_file_aio_write,
3270 .unlocked_ioctl = ll_file_ioctl,
3271 .open = ll_file_open,
3272 .release = ll_file_release,
3273 .mmap = ll_file_mmap,
3274 .llseek = ll_file_seek,
3275 .splice_read = ll_file_splice_read,
3278 .flock = ll_file_flock,
3279 .lock = ll_file_flock
3282 /* These are for -o noflock - to return ENOSYS on flock calls */
3283 struct file_operations ll_file_operations_noflock = {
3284 .read = ll_file_read,
3285 .aio_read = ll_file_aio_read,
3286 .write = ll_file_write,
3287 .aio_write = ll_file_aio_write,
3288 .unlocked_ioctl = ll_file_ioctl,
3289 .open = ll_file_open,
3290 .release = ll_file_release,
3291 .mmap = ll_file_mmap,
3292 .llseek = ll_file_seek,
3293 .splice_read = ll_file_splice_read,
3296 .flock = ll_file_noflock,
3297 .lock = ll_file_noflock
3300 struct inode_operations ll_file_inode_operations = {
3301 .setattr = ll_setattr,
3302 .getattr = ll_getattr,
3303 .permission = ll_inode_permission,
3304 .setxattr = ll_setxattr,
3305 .getxattr = ll_getxattr,
3306 .listxattr = ll_listxattr,
3307 .removexattr = ll_removexattr,
3308 #ifdef HAVE_LINUX_FIEMAP_H
3309 .fiemap = ll_fiemap,
3311 #ifdef HAVE_IOP_GET_ACL
3312 .get_acl = ll_get_acl,
3316 /* dynamic ioctl number support routins */
3317 static struct llioc_ctl_data {
3318 struct rw_semaphore ioc_sem;
3319 cfs_list_t ioc_head;
3321 __RWSEM_INITIALIZER(llioc.ioc_sem),
3322 CFS_LIST_HEAD_INIT(llioc.ioc_head)
3327 cfs_list_t iocd_list;
3328 unsigned int iocd_size;
3329 llioc_callback_t iocd_cb;
3330 unsigned int iocd_count;
3331 unsigned int iocd_cmd[0];
3334 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3337 struct llioc_data *in_data = NULL;
3340 if (cb == NULL || cmd == NULL ||
3341 count > LLIOC_MAX_CMD || count < 0)
3344 size = sizeof(*in_data) + count * sizeof(unsigned int);
3345 OBD_ALLOC(in_data, size);
3346 if (in_data == NULL)
3349 memset(in_data, 0, sizeof(*in_data));
3350 in_data->iocd_size = size;
3351 in_data->iocd_cb = cb;
3352 in_data->iocd_count = count;
3353 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3355 down_write(&llioc.ioc_sem);
3356 cfs_list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3357 up_write(&llioc.ioc_sem);
3362 void ll_iocontrol_unregister(void *magic)
3364 struct llioc_data *tmp;
3369 down_write(&llioc.ioc_sem);
3370 cfs_list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3372 unsigned int size = tmp->iocd_size;
3374 cfs_list_del(&tmp->iocd_list);
3375 up_write(&llioc.ioc_sem);
3377 OBD_FREE(tmp, size);
3381 up_write(&llioc.ioc_sem);
3383 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3386 EXPORT_SYMBOL(ll_iocontrol_register);
3387 EXPORT_SYMBOL(ll_iocontrol_unregister);
3389 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
3390 unsigned int cmd, unsigned long arg, int *rcp)
3392 enum llioc_iter ret = LLIOC_CONT;
3393 struct llioc_data *data;
3394 int rc = -EINVAL, i;
3396 down_read(&llioc.ioc_sem);
3397 cfs_list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3398 for (i = 0; i < data->iocd_count; i++) {
3399 if (cmd != data->iocd_cmd[i])
3402 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3406 if (ret == LLIOC_STOP)
3409 up_read(&llioc.ioc_sem);
3416 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3418 struct ll_inode_info *lli = ll_i2info(inode);
3419 struct cl_env_nest nest;
3424 if (lli->lli_clob == NULL)
3427 env = cl_env_nested_get(&nest);
3429 RETURN(PTR_ERR(env));
3431 result = cl_conf_set(env, lli->lli_clob, conf);
3432 cl_env_nested_put(&nest, env);
3434 if (conf->coc_opc == OBJECT_CONF_SET) {
3435 struct ldlm_lock *lock = conf->coc_lock;
3437 LASSERT(lock != NULL);
3438 LASSERT(ldlm_has_layout(lock));
3440 /* it can only be allowed to match after layout is
3441 * applied to inode otherwise false layout would be
3442 * seen. Applying layout shoud happen before dropping
3443 * the intent lock. */
3444 ldlm_lock_allow_match(lock);
3450 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
3451 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3454 struct ll_sb_info *sbi = ll_i2sbi(inode);
3455 struct obd_capa *oc;
3456 struct ptlrpc_request *req;
3457 struct mdt_body *body;
3464 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3465 PFID(ll_inode2fid(inode)), !!(lock->l_flags & LDLM_FL_LVB_READY),
3466 lock->l_lvb_data, lock->l_lvb_len);
3468 if ((lock->l_lvb_data != NULL) && (lock->l_flags & LDLM_FL_LVB_READY))
3471 /* if layout lock was granted right away, the layout is returned
3472 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3473 * blocked and then granted via completion ast, we have to fetch
3474 * layout here. Please note that we can't use the LVB buffer in
3475 * completion AST because it doesn't have a large enough buffer */
3476 oc = ll_mdscapa_get(inode);
3477 rc = ll_get_max_mdsize(sbi, &lmmsize);
3479 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3480 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3486 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3487 if (body == NULL || body->eadatasize > lmmsize)
3488 GOTO(out, rc = -EPROTO);
3490 lmmsize = body->eadatasize;
3491 if (lmmsize == 0) /* empty layout */
3494 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3496 GOTO(out, rc = -EFAULT);
3498 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3499 if (lvbdata == NULL)
3500 GOTO(out, rc = -ENOMEM);
3502 memcpy(lvbdata, lmm, lmmsize);
3503 lock_res_and_lock(lock);
3504 if (lock->l_lvb_data != NULL)
3505 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3507 lock->l_lvb_data = lvbdata;
3508 lock->l_lvb_len = lmmsize;
3509 unlock_res_and_lock(lock);
3514 ptlrpc_req_finished(req);
3519 * Apply the layout to the inode. Layout lock is held and will be released
3522 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3523 struct inode *inode, __u32 *gen, bool reconf)
3525 struct ll_inode_info *lli = ll_i2info(inode);
3526 struct ll_sb_info *sbi = ll_i2sbi(inode);
3527 struct ldlm_lock *lock;
3528 struct lustre_md md = { NULL };
3529 struct cl_object_conf conf;
3532 bool wait_layout = false;
3535 LASSERT(lustre_handle_is_used(lockh));
3537 lock = ldlm_handle2lock(lockh);
3538 LASSERT(lock != NULL);
3539 LASSERT(ldlm_has_layout(lock));
3541 LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
3542 inode, PFID(&lli->lli_fid), reconf);
3544 /* in case this is a caching lock and reinstate with new inode */
3545 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3547 lock_res_and_lock(lock);
3548 lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
3549 unlock_res_and_lock(lock);
3550 /* checking lvb_ready is racy but this is okay. The worst case is
3551 * that multi processes may configure the file on the same time. */
3553 if (lvb_ready || !reconf) {
3556 /* layout_gen must be valid if layout lock is not
3557 * cancelled and stripe has already set */
3558 *gen = lli->lli_layout_gen;
3564 rc = ll_layout_fetch(inode, lock);
3568 /* for layout lock, lmm is returned in lock's lvb.
3569 * lvb_data is immutable if the lock is held so it's safe to access it
3570 * without res lock. See the description in ldlm_lock_decref_internal()
3571 * for the condition to free lvb_data of layout lock */
3572 if (lock->l_lvb_data != NULL) {
3573 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3574 lock->l_lvb_data, lock->l_lvb_len);
3576 *gen = LL_LAYOUT_GEN_EMPTY;
3578 *gen = md.lsm->lsm_layout_gen;
3581 CERROR("%s: file "DFID" unpackmd error: %d\n",
3582 ll_get_fsname(inode->i_sb, NULL, 0),
3583 PFID(&lli->lli_fid), rc);
3589 /* set layout to file. Unlikely this will fail as old layout was
3590 * surely eliminated */
3591 memset(&conf, 0, sizeof conf);
3592 conf.coc_opc = OBJECT_CONF_SET;
3593 conf.coc_inode = inode;
3594 conf.coc_lock = lock;
3595 conf.u.coc_md = &md;
3596 rc = ll_layout_conf(inode, &conf);
3599 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3601 /* refresh layout failed, need to wait */
3602 wait_layout = rc == -EBUSY;
3606 LDLM_LOCK_PUT(lock);
3607 ldlm_lock_decref(lockh, mode);
3609 /* wait for IO to complete if it's still being used. */
3611 CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
3612 ll_get_fsname(inode->i_sb, NULL, 0),
3613 inode, PFID(&lli->lli_fid));
3615 memset(&conf, 0, sizeof conf);
3616 conf.coc_opc = OBJECT_CONF_WAIT;
3617 conf.coc_inode = inode;
3618 rc = ll_layout_conf(inode, &conf);
3622 CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
3623 PFID(&lli->lli_fid), rc);
3629 * This function checks if there exists a LAYOUT lock on the client side,
3630 * or enqueues it if it doesn't have one in cache.
3632 * This function will not hold layout lock so it may be revoked any time after
3633 * this function returns. Any operations that depend on the layout should be
redone afterwards.
3636 * This function should be called before lov_io_init() to get an uptodate
3637 * layout version, the caller should save the version number and after IO
3638 * is finished, this function should be called again to verify that layout
3639 * is not changed during IO time.
3641 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3643 struct ll_inode_info *lli = ll_i2info(inode);
3644 struct ll_sb_info *sbi = ll_i2sbi(inode);
3645 struct md_op_data *op_data;
3646 struct lookup_intent it;
3647 struct lustre_handle lockh;
3649 struct ldlm_enqueue_info einfo = {
3650 .ei_type = LDLM_IBITS,
3652 .ei_cb_bl = ll_md_blocking_ast,
3653 .ei_cb_cp = ldlm_completion_ast,
3658 *gen = lli->lli_layout_gen;
3659 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK))
3663 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3664 LASSERT(S_ISREG(inode->i_mode));
3666 /* mostly layout lock is caching on the local side, so try to match
3667 * it before grabbing layout lock mutex. */
3668 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3669 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3670 if (mode != 0) { /* hit cached lock */
3671 rc = ll_layout_lock_set(&lockh, mode, inode, gen, false);
3675 /* better hold lli_layout_mutex to try again otherwise
3676 * it will have starvation problem. */
3679 /* take layout lock mutex to enqueue layout lock exclusively. */
3680 mutex_lock(&lli->lli_layout_mutex);
3683 /* try again. Maybe somebody else has done this. */
3684 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3685 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3686 if (mode != 0) { /* hit cached lock */
3687 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3691 mutex_unlock(&lli->lli_layout_mutex);
3695 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3696 0, 0, LUSTRE_OPC_ANY, NULL);
3697 if (IS_ERR(op_data)) {
3698 mutex_unlock(&lli->lli_layout_mutex);
3699 RETURN(PTR_ERR(op_data));
3702 /* have to enqueue one */
3703 memset(&it, 0, sizeof(it));
3704 it.it_op = IT_LAYOUT;
3705 lockh.cookie = 0ULL;
3707 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
3708 ll_get_fsname(inode->i_sb, NULL, 0), inode,
3709 PFID(&lli->lli_fid));
3711 rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
3713 if (it.d.lustre.it_data != NULL)
3714 ptlrpc_req_finished(it.d.lustre.it_data);
3715 it.d.lustre.it_data = NULL;
3717 ll_finish_md_op_data(op_data);
3719 mode = it.d.lustre.it_lock_mode;
3720 it.d.lustre.it_lock_mode = 0;
3721 ll_intent_drop_lock(&it);
3724 /* set lock data in case this is a new lock */
3725 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3726 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3730 mutex_unlock(&lli->lli_layout_mutex);
3736 * This function send a restore request to the MDT
3738 int ll_layout_restore(struct inode *inode)
3740 struct hsm_user_request *hur;
3744 len = sizeof(struct hsm_user_request) +
3745 sizeof(struct hsm_user_item);
3746 OBD_ALLOC(hur, len);
3750 hur->hur_request.hr_action = HUA_RESTORE;
3751 hur->hur_request.hr_archive_id = 0;
3752 hur->hur_request.hr_flags = 0;
3753 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
3754 sizeof(hur->hur_user_item[0].hui_fid));
3755 hur->hur_user_item[0].hui_extent.length = -1;
3756 hur->hur_request.hr_itemcount = 1;
3757 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,