4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2013, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include "llite_internal.h"
49 #include <lustre/ll_fiemap.h>
51 #include "cl_object.h"
/*
 * Allocate a per-open-file ll_file_data from its slab cache and
 * initialize it.  NOTE(review): the NULL check on the allocation and the
 * return statement are not visible in this extraction — confirm against
 * the full source.
 */
53 struct ll_file_data *ll_file_data_get(void)
55 struct ll_file_data *fd;
57 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, CFS_ALLOC_IO);
/* Start each open with a clean write-failure flag. */
61 fd->fd_write_failed = false;
/* Return a ll_file_data (allocated by ll_file_data_get()) to the slab. */
66 static void ll_file_data_put(struct ll_file_data *fd)
69 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Copy the inode's current attributes (mode, times, size, blocks, flags),
 * the IO epoch, the open file handle @fh and an MDS capability reference
 * into @op_data for an MDS request.
 */
72 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
73 struct lustre_handle *fh)
75 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
76 op_data->op_attr.ia_mode = inode->i_mode;
77 op_data->op_attr.ia_atime = inode->i_atime;
78 op_data->op_attr.ia_mtime = inode->i_mtime;
79 op_data->op_attr.ia_ctime = inode->i_ctime;
80 op_data->op_attr.ia_size = i_size_read(inode);
81 op_data->op_attr_blocks = inode->i_blocks;
/* ia_attr_flags lives in the Lustre-extended iattr; translate the
 * kernel inode flags to their on-wire (ext) representation. */
82 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
83 ll_inode_to_ext_flags(inode->i_flags);
84 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
86 op_data->op_handle = *fh;
/* Takes a capability reference; presumably released with the op_data —
 * TODO confirm the matching put in the full source. */
87 op_data->op_capa1 = ll_mdscapa_get(inode);
/* Tell the MDS that client-side data changed since the last close. */
89 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
90 op_data->op_bias |= MDS_DATA_MODIFIED;
94 * Closes the IO epoch and packs all the attributes into @op_data for
/*
 * the close RPC.  Always requests mode/time attributes; size/blocks are
 * added only when SOM is not supported by the MDS or the file is not a
 * regular file (otherwise the MDS obtains size from the OSTs itself).
 */
97 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
98 struct obd_client_handle *och)
102 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
103 ATTR_MTIME | ATTR_MTIME_SET |
104 ATTR_CTIME | ATTR_CTIME_SET;
/* Size/blocks and epoch handling only matter for write opens. */
106 if (!(och->och_flags & FMODE_WRITE))
109 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
110 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
112 ll_ioepoch_close(inode, op_data, &och, 0);
115 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
116 ll_prep_md_op_data(op_data, inode, NULL, NULL,
117 0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Send a close RPC to the MDS for the open handle @och, then perform the
 * post-close work: Size-on-MDS update if the MDS asked for it, clearing
 * the DATA_MODIFIED flag on success, destroying orphan OST objects, and
 * scheduling DONE_WRITING when the epoch could not be closed here.
 * Returns 0 or a negative errno from the close/update path.
 */
121 static int ll_close_inode_openhandle(struct obd_export *md_exp,
123 struct obd_client_handle *och)
125 struct obd_export *exp = ll_i2mdexp(inode);
126 struct md_op_data *op_data;
127 struct ptlrpc_request *req = NULL;
128 struct obd_device *obd = class_exp2obd(exp);
135 * XXX: in case of LMV, is this correct to access
138 CERROR("Invalid MDC connection handle "LPX64"\n",
139 ll_i2mdexp(inode)->exp_handle.h_cookie);
143 OBD_ALLOC_PTR(op_data);
145 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
147 ll_prepare_close(inode, op_data, och);
/* Remember whether this close ends the IO epoch before sending. */
148 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
149 rc = md_close(md_exp, op_data, och->och_mod, &req);
151 /* This close must have the epoch closed. */
152 LASSERT(epoch_close);
153 /* MDS has instructed us to obtain Size-on-MDS attribute from
154 * OSTs and send setattr to back to MDS. */
155 rc = ll_som_update(inode, op_data);
157 CERROR("inode %lu mdc Size-on-MDS update failed: "
158 "rc = %d\n", inode->i_ino, rc);
162 CERROR("inode %lu mdc close failed: rc = %d\n",
166 /* DATA_MODIFIED flag was successfully sent on close, cancel data
167 * modification flag. */
168 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
169 struct ll_inode_info *lli = ll_i2info(inode);
171 spin_lock(&lli->lli_lock);
172 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
173 spin_unlock(&lli->lli_lock);
176 ll_finish_md_op_data(op_data);
/* Destroy OST objects for an unlinked-while-open file, if any. */
179 rc = ll_objects_destroy(req, inode);
181 CERROR("inode %lu ll_objects destroy: rc = %d\n",
/* If SOM is in use and the epoch stayed open on a regular file opened
 * for write, queue a DONE_WRITING to finish it later. */
188 if (exp_connect_som(exp) && !epoch_close &&
189 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
190 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
192 md_clear_open_replay_data(md_exp, och);
193 /* Free @och if it is not waiting for DONE_WRITING. */
194 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
197 if (req) /* This is close request */
198 ptlrpc_req_finished(req);
/*
 * Close the MDS open handle matching the open mode in @flags
 * (write / exec / read) once its use count has dropped to zero.
 * The handle pointer and use count are selected per mode, then the
 * actual close is done outside lli_och_mutex via
 * ll_close_inode_openhandle().
 */
202 int ll_md_real_close(struct inode *inode, int flags)
204 struct ll_inode_info *lli = ll_i2info(inode);
205 struct obd_client_handle **och_p;
206 struct obd_client_handle *och;
/* Pick the per-mode handle slot and its reference counter. */
211 if (flags & FMODE_WRITE) {
212 och_p = &lli->lli_mds_write_och;
213 och_usecount = &lli->lli_open_fd_write_count;
214 } else if (flags & FMODE_EXEC) {
215 och_p = &lli->lli_mds_exec_och;
216 och_usecount = &lli->lli_open_fd_exec_count;
218 LASSERT(flags & FMODE_READ);
219 och_p = &lli->lli_mds_read_och;
220 och_usecount = &lli->lli_open_fd_read_count;
223 mutex_lock(&lli->lli_och_mutex);
224 if (*och_usecount) { /* There are still users of this handle, so
226 mutex_unlock(&lli->lli_och_mutex);
231 mutex_unlock(&lli->lli_och_mutex);
233 if (och) { /* There might be a race and somebody have freed this och
235 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-struct-file close: drop any group lock, decrement the per-mode
 * open count, and close the MDS handle via ll_md_real_close() unless we
 * still hold a cached OPEN DLM lock that lets us skip talking to the
 * MDS.  Frees the file's private ll_file_data and its capability.
 */
242 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
245 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
246 struct ll_inode_info *lli = ll_i2info(inode);
250 /* clear group lock, if present */
251 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
252 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
254 /* Let's see if we have good enough OPEN lock on the file and if
255 we can skip talking to MDS */
256 if (file->f_dentry->d_inode) { /* Can this ever be false? */
/* TEST_LOCK: only probe for a granted lock, do not take a reference. */
258 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
259 struct lustre_handle lockh;
260 struct inode *inode = file->f_dentry->d_inode;
261 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
263 mutex_lock(&lli->lli_och_mutex);
264 if (fd->fd_omode & FMODE_WRITE) {
266 LASSERT(lli->lli_open_fd_write_count);
267 lli->lli_open_fd_write_count--;
268 } else if (fd->fd_omode & FMODE_EXEC) {
270 LASSERT(lli->lli_open_fd_exec_count);
271 lli->lli_open_fd_exec_count--;
274 LASSERT(lli->lli_open_fd_read_count);
275 lli->lli_open_fd_read_count--;
277 mutex_unlock(&lli->lli_och_mutex);
/* No cached OPEN ibits lock -> must really close on the MDS. */
279 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
280 LDLM_IBITS, &policy, lockmode,
282 rc = ll_md_real_close(file->f_dentry->d_inode,
286 CERROR("Releasing a file %p with negative dentry %p. Name %s",
287 file, file->f_dentry, file->f_dentry->d_name.name);
290 LUSTRE_FPRIVATE(file) = NULL;
291 ll_file_data_put(fd);
292 ll_capa_close(inode);
297 /* While this returns an error code, fput() the caller does not, so we need
298 * to make every effort to clean up all of our state here. Also, applications
299 * rarely check close errors and even if an error is returned they will not
300 * re-try the close call.
302 int ll_file_release(struct inode *inode, struct file *file)
304 struct ll_file_data *fd;
305 struct ll_sb_info *sbi = ll_i2sbi(inode);
306 struct ll_inode_info *lli = ll_i2info(inode);
310 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
311 inode->i_generation, inode);
313 #ifdef CONFIG_FS_POSIX_ACL
/* Remote-client ACL state is keyed on the root inode; tear down the
 * remote ACL control table entries for this process on root release. */
314 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
315 inode == inode->i_sb->s_root->d_inode) {
316 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
319 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
320 fd->fd_flags &= ~LL_FILE_RMTACL;
321 rct_del(&sbi->ll_rct, cfs_curproc_pid());
322 et_search_free(&sbi->ll_et, cfs_curproc_pid());
327 if (inode->i_sb->s_root != file->f_dentry)
328 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
329 fd = LUSTRE_FPRIVATE(file);
332 /* The last ref on @file, maybe not the owner pid of statahead.
333 * Different processes can open the same dir, "ll_opendir_key" means:
334 * it is me that should stop the statahead thread. */
335 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
336 lli->lli_opendir_pid != 0)
337 ll_stop_statahead(inode, lli->lli_opendir_key);
/* The root directory has no MDS open handle; just free fd and return. */
339 if (inode->i_sb->s_root == file->f_dentry) {
340 LUSTRE_FPRIVATE(file) = NULL;
341 ll_file_data_put(fd);
/* Collect any async write error recorded by the cl_object layer so it
 * can be reported from this close. */
345 if (!S_ISDIR(inode->i_mode)) {
346 lov_read_and_clear_async_rc(lli->lli_clob);
347 lli->lli_async_rc = 0;
350 rc = ll_md_close(sbi->ll_md_exp, inode, file);
352 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
353 libcfs_debug_dumplog();
/*
 * Perform an intent-based open RPC to the MDS (used by NFS export and
 * by ll_lov_setstripe).  Requests an OPEN DLM lock unless stripe
 * parameters (@lmm/@lmmsize) are being set, fills the inode from the
 * reply, and attaches the lock data on success.
 */
358 static int ll_intent_file_open(struct file *file, void *lmm,
359 int lmmsize, struct lookup_intent *itp)
361 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
362 struct dentry *parent = file->f_dentry->d_parent;
363 struct md_op_data *op_data;
364 struct ptlrpc_request *req;
365 __u32 opc = LUSTRE_OPC_ANY;
372 /* Usually we come here only for NFSD, and we want open lock.
373 But we can also get here with pre 2.6.15 patchless kernels, and in
374 that case that lock is also ok */
375 /* We can also get here if there was cached open handle in revalidate_it
376 * but it disappeared while we were getting from there to ll_file_open.
377 * But this means this file was closed and immediately opened which
378 * makes a good candidate for using OPEN lock */
379 /* If lmmsize & lmm are not 0, we are just setting stripe info
380 * parameters. No need for the open lock */
381 if (lmm == NULL && lmmsize == 0) {
382 itp->it_flags |= MDS_OPEN_LOCK;
383 if (itp->it_flags & FMODE_WRITE)
384 opc = LUSTRE_OPC_CREATE;
387 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
388 file->f_dentry->d_inode, NULL, 0,
392 RETURN(PTR_ERR(op_data));
/* The inode already exists, so open strictly by FID. */
394 itp->it_flags |= MDS_OPEN_BY_FID;
395 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
396 0 /*unused */, &req, ll_md_blocking_ast, 0);
397 ll_finish_md_op_data(op_data);
399 /* reason for keep own exit path - don't flood log
400 * with messages with -ESTALE errors.
402 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
403 it_open_error(DISP_OPEN_OPEN, itp))
405 ll_release_openhandle(file->f_dentry, itp);
409 if (it_disposition(itp, DISP_LOOKUP_NEG))
410 GOTO(out, rc = -ENOENT);
412 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
413 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
414 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* Update the inode from the open reply and remember the DLM lock. */
418 rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
419 if (!rc && itp->d.lustre.it_lock_mode)
420 ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
424 ptlrpc_req_finished(itp->d.lustre.it_data);
425 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
426 ll_intent_drop_lock(itp);
432 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
433 * not believe attributes if a few ioepoch holders exist. Attributes for
434 * previous ioepoch if new one is opened are also skipped by MDS.
436 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
/* Only record a non-zero epoch that differs from the current one. */
438 if (ioepoch && lli->lli_ioepoch != ioepoch) {
439 lli->lli_ioepoch = ioepoch;
440 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
441 ioepoch, PFID(&lli->lli_fid));
/*
 * Populate an obd_client_handle from the MDS open reply carried in the
 * intent @it (file handle, FID, flags) and register it for open replay.
 * Returns the result of md_set_open_replay_data().
 */
445 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
446 struct obd_client_handle *och)
448 struct ptlrpc_request *req = it->d.lustre.it_data;
449 struct mdt_body *body;
451 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
452 och->och_fh = body->handle;
453 och->och_fid = body->fid1;
454 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
455 och->och_flags = it->it_flags;
457 return md_set_open_replay_data(md_exp, och, req);
/*
 * Finish a client-side open: when @och is supplied, fill it from the
 * intent reply and record the received IO epoch, then attach @fd as the
 * file's private data and initialize readahead state.
 */
460 int ll_local_open(struct file *file, struct lookup_intent *it,
461 struct ll_file_data *fd, struct obd_client_handle *och)
463 struct inode *inode = file->f_dentry->d_inode;
464 struct ll_inode_info *lli = ll_i2info(inode);
467 LASSERT(!LUSTRE_FPRIVATE(file));
472 struct ptlrpc_request *req = it->d.lustre.it_data;
473 struct mdt_body *body;
476 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
480 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
481 ll_ioepoch_open(lli, body->ioepoch);
484 LUSTRE_FPRIVATE(file) = fd;
485 ll_readahead_init(inode, &fd->fd_ras);
/* Remember the open mode for the matching close. */
486 fd->fd_omode = it->it_flags;
491 /* Open a file, and (for the very first open) create objects on the OSTs at
492 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
493 * creation or open until ll_lov_setstripe() ioctl is called.
495 * If we already have the stripe MD locally then we don't request it in
496 * md_open(), by passing a lmm_size = 0.
498 * It is up to the application to ensure no other processes open this file
499 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
500 * used. We might be able to avoid races of that sort by getting lli_open_sem
501 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
502 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
504 int ll_file_open(struct inode *inode, struct file *file)
506 struct ll_inode_info *lli = ll_i2info(inode);
507 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
508 .it_flags = file->f_flags };
509 struct obd_client_handle **och_p = NULL;
510 __u64 *och_usecount = NULL;
511 struct ll_file_data *fd;
512 int rc = 0, opendir_set = 0;
515 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
516 inode->i_generation, inode, file->f_flags);
/* An intent may have been stashed in private_data by the lookup path. */
518 it = file->private_data; /* XXX: compat macro */
519 file->private_data = NULL; /* prevent ll_local_open assertion */
521 fd = ll_file_data_get();
523 GOTO(out_openerr, rc = -ENOMEM);
/* First opener of a directory becomes the statahead owner. */
526 if (S_ISDIR(inode->i_mode)) {
527 spin_lock(&lli->lli_sa_lock);
528 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
529 lli->lli_opendir_pid == 0) {
530 lli->lli_opendir_key = fd;
531 lli->lli_opendir_pid = cfs_curproc_pid();
534 spin_unlock(&lli->lli_sa_lock);
/* Root needs no MDS open handle. */
537 if (inode->i_sb->s_root == file->f_dentry) {
538 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent: build our own open intent from f_flags. */
542 if (!it || !it->d.lustre.it_disposition) {
543 /* Convert f_flags into access mode. We cannot use file->f_mode,
544 * because everything but O_ACCMODE mask was stripped from
546 if ((oit.it_flags + 1) & O_ACCMODE)
548 if (file->f_flags & O_TRUNC)
549 oit.it_flags |= FMODE_WRITE;
551 /* kernel only call f_op->open in dentry_open. filp_open calls
552 * dentry_open after call to open_namei that checks permissions.
553 * Only nfsd_open call dentry_open directly without checking
554 * permissions and because of that this code below is safe. */
555 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
556 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
558 /* We do not want O_EXCL here, presumably we opened the file
559 * already? XXX - NFS implications? */
560 oit.it_flags &= ~O_EXCL;
562 /* bug20584, if "it_flags" contains O_CREAT, the file will be
563 * created if necessary, then "IT_CREAT" should be set to keep
564 * consistent with it */
565 if (oit.it_flags & O_CREAT)
566 oit.it_op |= IT_CREAT;
572 /* Let's see if we have file open on MDS already. */
573 if (it->it_flags & FMODE_WRITE) {
574 och_p = &lli->lli_mds_write_och;
575 och_usecount = &lli->lli_open_fd_write_count;
576 } else if (it->it_flags & FMODE_EXEC) {
577 och_p = &lli->lli_mds_exec_och;
578 och_usecount = &lli->lli_open_fd_exec_count;
580 och_p = &lli->lli_mds_read_och;
581 och_usecount = &lli->lli_open_fd_read_count;
584 mutex_lock(&lli->lli_och_mutex);
585 if (*och_p) { /* Open handle is present */
586 if (it_disposition(it, DISP_OPEN_OPEN)) {
587 /* Well, there's extra open request that we do not need,
588 let's close it somehow. This will decref request. */
589 rc = it_open_error(DISP_OPEN_OPEN, it);
591 mutex_unlock(&lli->lli_och_mutex);
592 GOTO(out_openerr, rc);
595 ll_release_openhandle(file->f_dentry, it);
/* Reuse the existing MDS handle for this local open. */
599 rc = ll_local_open(file, it, fd, NULL);
602 mutex_unlock(&lli->lli_och_mutex);
603 GOTO(out_openerr, rc);
606 LASSERT(*och_usecount == 0);
607 if (!it->d.lustre.it_disposition) {
608 /* We cannot just request lock handle now, new ELC code
609 means that one of other OPEN locks for this file
610 could be cancelled, and since blocking ast handler
611 would attempt to grab och_mutex as well, that would
612 result in a deadlock */
613 mutex_unlock(&lli->lli_och_mutex);
614 it->it_create_mode |= M_CHECK_STALE;
615 rc = ll_intent_file_open(file, NULL, 0, it);
616 it->it_create_mode &= ~M_CHECK_STALE;
618 GOTO(out_openerr, rc);
622 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
624 GOTO(out_och_free, rc = -ENOMEM);
628 /* md_intent_lock() didn't get a request ref if there was an
629 * open error, so don't do cleanup on the request here
631 /* XXX (green): Should not we bail out on any error here, not
632 * just open error? */
633 rc = it_open_error(DISP_OPEN_OPEN, it);
635 GOTO(out_och_free, rc);
637 LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
639 rc = ll_local_open(file, it, fd, *och_p);
641 GOTO(out_och_free, rc);
643 mutex_unlock(&lli->lli_och_mutex);
646 /* Must do this outside lli_och_mutex lock to prevent deadlock where
647 different kind of OPEN lock for this same inode gets cancelled
648 by ldlm_cancel_lru */
649 if (!S_ISREG(inode->i_mode))
650 GOTO(out_och_free, rc);
/* No stripe metadata yet: either defer object creation (delay-create
 * or read-only open) or fall through to create objects now. */
654 if (!lli->lli_has_smd) {
655 if (file->f_flags & O_LOV_DELAY_CREATE ||
656 !(file->f_mode & FMODE_WRITE)) {
657 CDEBUG(D_INODE, "object creation was delayed\n");
658 GOTO(out_och_free, rc);
661 file->f_flags &= ~O_LOV_DELAY_CREATE;
662 GOTO(out_och_free, rc);
/* Error path: free the handle slot we allocated (if any). */
666 if (och_p && *och_p) {
667 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
668 *och_p = NULL; /* OBD_FREE writes some magic there */
671 mutex_unlock(&lli->lli_och_mutex);
674 if (opendir_set != 0)
675 ll_stop_statahead(inode, lli->lli_opendir_key);
677 ll_file_data_put(fd);
679 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* Drop the extra request reference taken for the open intent. */
682 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
683 ptlrpc_req_finished(it->d.lustre.it_data);
684 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
690 /* Fills the obdo with the attributes for the lsm */
/*
 * Issue an async OST getattr for @lsm and wait for the result.
 * @ioepoch is the epoch to report; @sync non-zero requests the getattr
 * under a server-side lock (OBD_FL_SRVLOCK).  On success only the
 * attribute bits actually obtained from the OSTs remain set in o_valid.
 */
691 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
692 struct obd_capa *capa, struct obdo *obdo,
693 __u64 ioepoch, int sync)
695 struct ptlrpc_request_set *set;
696 struct obd_info oinfo = { { { 0 } } };
701 LASSERT(lsm != NULL);
705 oinfo.oi_oa->o_oi = lsm->lsm_oi;
706 oinfo.oi_oa->o_mode = S_IFREG;
707 oinfo.oi_oa->o_ioepoch = ioepoch;
708 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
709 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
710 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
711 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
712 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
713 OBD_MD_FLDATAVERSION;
714 oinfo.oi_capa = capa;
716 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
717 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
720 set = ptlrpc_prep_set();
722 CERROR("can't allocate ptlrpc set\n");
725 rc = obd_getattr_async(exp, &oinfo, set);
727 rc = ptlrpc_set_wait(set);
728 ptlrpc_set_destroy(set);
/* Keep only the bits the OSTs can authoritatively answer for. */
731 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
732 OBD_MD_FLATIME | OBD_MD_FLMTIME |
733 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
734 OBD_MD_FLDATAVERSION);
739 * Performs the getattr on the inode and updates its fields.
740 * If @sync != 0, perform the getattr under the server-side lock.
742 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
743 __u64 ioepoch, int sync)
745 struct obd_capa *capa = ll_mdscapa_get(inode);
746 struct lov_stripe_md *lsm;
/* Take a reference on the stripe metadata for the duration of the RPC. */
750 lsm = ccc_inode_lsm_get(inode);
751 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
752 capa, obdo, ioepoch, sync);
755 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
/* Merge the freshly-obtained OST attributes into the inode. */
757 obdo_refresh_inode(inode, obdo, obdo->o_valid);
758 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
759 " blksize %lu\n", POSTID(oi), i_size_read(inode),
760 (unsigned long long)inode->i_blocks,
761 (unsigned long)ll_inode_blksize(inode));
763 ccc_inode_lsm_put(inode, lsm);
/*
 * Merge the MDS-provided timestamps cached in lli_lvb with the
 * attributes currently held by the cl_object (OST side), taking the
 * newest of each timestamp, and update the inode's size and blocks
 * under the inode size lock.
 */
767 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
769 struct ll_inode_info *lli = ll_i2info(inode);
770 struct cl_object *obj = lli->lli_clob;
771 struct cl_attr *attr = ccc_env_thread_attr(env);
777 ll_inode_size_lock(inode);
778 /* merge timestamps the most recently obtained from mds with
779 timestamps obtained from osts */
780 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
781 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
782 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
783 inode_init_lvb(inode, &lvb);
785 cl_object_attr_lock(obj);
786 rc = cl_object_attr_get(env, obj, attr);
787 cl_object_attr_unlock(obj);
/* Prefer the newer timestamp from the OST attributes, per field. */
790 if (lvb.lvb_atime < attr->cat_atime)
791 lvb.lvb_atime = attr->cat_atime;
792 if (lvb.lvb_ctime < attr->cat_ctime)
793 lvb.lvb_ctime = attr->cat_ctime;
794 if (lvb.lvb_mtime < attr->cat_mtime)
795 lvb.lvb_mtime = attr->cat_mtime;
797 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
798 PFID(&lli->lli_fid), attr->cat_size);
799 cl_isize_write_nolock(inode, attr->cat_size);
801 inode->i_blocks = attr->cat_blocks;
803 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
804 LTIME_S(inode->i_atime) = lvb.lvb_atime;
805 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
807 ll_inode_size_unlock(inode);
/*
 * Glimpse helper for ioctl paths: do an OST getattr on @lsm (no capa,
 * no server lock) and copy size/blocks/times into the stat buffer.
 */
812 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
815 struct obdo obdo = { 0 };
818 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
820 st->st_size = obdo.o_size;
821 st->st_blocks = obdo.o_blocks;
822 st->st_mtime = obdo.o_mtime;
823 st->st_atime = obdo.o_atime;
824 st->st_ctime = obdo.o_ctime;
/*
 * Initialize a cl_io for read or write on @file: propagate O_NONBLOCK,
 * O_APPEND and sync flags, attach the file's cl_object, and choose the
 * DLM locking policy (never for nolock files, mandatory for append,
 * otherwise "maybe").
 */
829 void ll_io_init(struct cl_io *io, const struct file *file, int write)
831 struct inode *inode = file->f_dentry->d_inode;
833 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
835 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
836 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
837 file->f_flags & O_DIRECT ||
840 io->ci_obj = ll_i2info(inode)->lli_clob;
841 io->ci_lockreq = CILR_MAYBE;
842 if (ll_file_nolock(file)) {
843 io->ci_lockreq = CILR_NEVER;
844 io->ci_no_srvlock = 1;
845 } else if (file->f_flags & O_APPEND) {
846 io->ci_lockreq = CILR_MANDATORY;
/*
 * Common driver for all read/write entry points (normal, sendfile,
 * splice).  Sets up the cl_io from @args, takes lli_write_mutex for
 * non-group-locked normal writes and lli_trunc_sem for normal reads,
 * runs the IO loop, updates *ppos and the per-mount read/write stats,
 * and tracks write failure on the file descriptor.
 */
851 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
852 struct file *file, enum cl_io_type iot,
853 loff_t *ppos, size_t count)
855 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
856 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
862 io = ccc_env_thread_io(env);
863 ll_io_init(io, file, iot == CIT_WRITE);
865 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
866 struct vvp_io *vio = vvp_env_io(env);
867 struct ccc_io *cio = ccc_env_io(env);
868 int write_mutex_locked = 0;
870 cio->cui_fd = LUSTRE_FPRIVATE(file);
871 vio->cui_io_subtype = args->via_io_subtype;
/* Per-subtype setup: copy the iovec/actor/pipe details into the IO. */
873 switch (vio->cui_io_subtype) {
875 cio->cui_iov = args->u.normal.via_iov;
876 cio->cui_nrsegs = args->u.normal.via_nrsegs;
877 cio->cui_tot_nrsegs = cio->cui_nrsegs;
878 #ifndef HAVE_FILE_WRITEV
879 cio->cui_iocb = args->u.normal.via_iocb;
/* Serialize plain writes; group-locked writers bypass the mutex. */
881 if ((iot == CIT_WRITE) &&
882 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
883 if (mutex_lock_interruptible(&lli->
885 GOTO(out, result = -ERESTARTSYS);
886 write_mutex_locked = 1;
887 } else if (iot == CIT_READ) {
888 down_read(&lli->lli_trunc_sem);
892 vio->u.sendfile.cui_actor = args->u.sendfile.via_actor;
893 vio->u.sendfile.cui_target = args->u.sendfile.via_target;
896 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
897 vio->u.splice.cui_flags = args->u.splice.via_flags;
900 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
903 result = cl_io_loop(env, io);
904 if (write_mutex_locked)
905 mutex_unlock(&lli->lli_write_mutex);
906 else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
907 up_read(&lli->lli_trunc_sem);
909 /* cl_io_rw_init() handled IO */
910 result = io->ci_result;
913 if (io->ci_nob > 0) {
915 *ppos = io->u.ci_wr.wr.crw_pos;
920 /* If any bit been read/written (result != 0), we just return
921 * short read/write instead of restart io. */
922 if (result == 0 && io->ci_need_restart) {
923 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
924 iot == CIT_READ ? "read" : "write",
925 file->f_dentry->d_name.name, *ppos, count);
926 LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
930 if (iot == CIT_READ) {
932 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
933 LPROC_LL_READ_BYTES, result);
934 } else if (iot == CIT_WRITE) {
936 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
937 LPROC_LL_WRITE_BYTES, result);
938 fd->fd_write_failed = false;
939 } else if (result != -ERESTARTSYS) {
940 fd->fd_write_failed = true;
949 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array: sum the segment lengths into *count, reject
 * negative/overflowing lengths, and truncate *nr_segs at the first
 * segment whose memory fails access_ok().
 */
951 static int ll_file_get_iov_count(const struct iovec *iov,
952 unsigned long *nr_segs, size_t *count)
957 for (seg = 0; seg < *nr_segs; seg++) {
958 const struct iovec *iv = &iov[seg];
961 * If any segment has a negative length, or the cumulative
962 * length ever wraps negative then return -EINVAL.
965 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
967 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
972 cnt -= iv->iov_len; /* This segment is no good */
979 #ifdef HAVE_FILE_READV
/* Vectored read entry point (pre-aio kernels): validate the iovec,
 * then run a normal CIT_READ through ll_file_io_generic(). */
980 static ssize_t ll_file_readv(struct file *file, const struct iovec *iov,
981 unsigned long nr_segs, loff_t *ppos)
984 struct vvp_io_args *args;
990 result = ll_file_get_iov_count(iov, &nr_segs, &count);
994 env = cl_env_get(&refcheck);
996 RETURN(PTR_ERR(env));
998 args = vvp_env_args(env, IO_NORMAL);
999 args->u.normal.via_iov = (struct iovec *)iov;
1000 args->u.normal.via_nrsegs = nr_segs;
1002 result = ll_file_io_generic(env, args, file, CIT_READ, ppos, count);
1003 cl_env_put(env, &refcheck);
/* Single-buffer read (readv-based kernels): wrap the user buffer in a
 * one-element iovec and delegate to ll_file_readv(). */
1007 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1011 struct iovec *local_iov;
1016 env = cl_env_get(&refcheck);
1018 RETURN(PTR_ERR(env));
1020 local_iov = &vvp_env_info(env)->vti_local_iov;
1021 local_iov->iov_base = (void __user *)buf;
1022 local_iov->iov_len = count;
1023 result = ll_file_readv(file, local_iov, 1, ppos);
1024 cl_env_put(env, &refcheck);
/* AIO read entry point: validate the iovec and run a normal CIT_READ,
 * advancing iocb->ki_pos through ll_file_io_generic(). */
1029 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1030 unsigned long nr_segs, loff_t pos)
1033 struct vvp_io_args *args;
1039 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1043 env = cl_env_get(&refcheck);
1045 RETURN(PTR_ERR(env));
1047 args = vvp_env_args(env, IO_NORMAL);
1048 args->u.normal.via_iov = (struct iovec *)iov;
1049 args->u.normal.via_nrsegs = nr_segs;
1050 args->u.normal.via_iocb = iocb;
1052 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1053 &iocb->ki_pos, count);
1054 cl_env_put(env, &refcheck);
/* Single-buffer read (aio-based kernels): build a synchronous kiocb and
 * a one-element iovec, then delegate to ll_file_aio_read(). */
1058 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1062 struct iovec *local_iov;
1063 struct kiocb *kiocb;
1068 env = cl_env_get(&refcheck);
1070 RETURN(PTR_ERR(env));
1072 local_iov = &vvp_env_info(env)->vti_local_iov;
1073 kiocb = &vvp_env_info(env)->vti_kiocb;
1074 local_iov->iov_base = (void __user *)buf;
1075 local_iov->iov_len = count;
1076 init_sync_kiocb(kiocb, file);
1077 kiocb->ki_pos = *ppos;
1078 kiocb->ki_left = count;
1080 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
/* Propagate the position advanced by the aio path back to the caller. */
1081 *ppos = kiocb->ki_pos;
1083 cl_env_put(env, &refcheck);
1089 * Write to a file (through the page cache).
1091 #ifdef HAVE_FILE_WRITEV
/* Vectored write entry point (pre-aio kernels): validate the iovec,
 * then run a normal CIT_WRITE through ll_file_io_generic(). */
1092 static ssize_t ll_file_writev(struct file *file, const struct iovec *iov,
1093 unsigned long nr_segs, loff_t *ppos)
1096 struct vvp_io_args *args;
1102 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1106 env = cl_env_get(&refcheck);
1108 RETURN(PTR_ERR(env));
1110 args = vvp_env_args(env, IO_NORMAL);
1111 args->u.normal.via_iov = (struct iovec *)iov;
1112 args->u.normal.via_nrsegs = nr_segs;
1114 result = ll_file_io_generic(env, args, file, CIT_WRITE, ppos, count);
1115 cl_env_put(env, &refcheck);
/* Single-buffer write (writev-based kernels): wrap the user buffer in a
 * one-element iovec and delegate to ll_file_writev(). */
1119 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1123 struct iovec *local_iov;
1128 env = cl_env_get(&refcheck);
1130 RETURN(PTR_ERR(env));
1132 local_iov = &vvp_env_info(env)->vti_local_iov;
1133 local_iov->iov_base = (void __user *)buf;
1134 local_iov->iov_len = count;
1136 result = ll_file_writev(file, local_iov, 1, ppos);
1137 cl_env_put(env, &refcheck);
1141 #else /* AIO stuff */
/* AIO write entry point: validate the iovec and run a normal CIT_WRITE,
 * advancing iocb->ki_pos through ll_file_io_generic(). */
1142 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1143 unsigned long nr_segs, loff_t pos)
1146 struct vvp_io_args *args;
1152 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1156 env = cl_env_get(&refcheck);
1158 RETURN(PTR_ERR(env));
1160 args = vvp_env_args(env, IO_NORMAL);
1161 args->u.normal.via_iov = (struct iovec *)iov;
1162 args->u.normal.via_nrsegs = nr_segs;
1163 args->u.normal.via_iocb = iocb;
1165 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1166 &iocb->ki_pos, count);
1167 cl_env_put(env, &refcheck);
/* Single-buffer write (aio-based kernels): build a synchronous kiocb and
 * a one-element iovec, then delegate to ll_file_aio_write(). */
1171 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1175 struct iovec *local_iov;
1176 struct kiocb *kiocb;
1181 env = cl_env_get(&refcheck);
1183 RETURN(PTR_ERR(env));
1185 local_iov = &vvp_env_info(env)->vti_local_iov;
1186 kiocb = &vvp_env_info(env)->vti_kiocb;
1187 local_iov->iov_base = (void __user *)buf;
1188 local_iov->iov_len = count;
1189 init_sync_kiocb(kiocb, file);
1190 kiocb->ki_pos = *ppos;
1191 kiocb->ki_left = count;
1193 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
/* Propagate the position advanced by the aio path back to the caller. */
1194 *ppos = kiocb->ki_pos;
1196 cl_env_put(env, &refcheck);
1202 #ifdef HAVE_KERNEL_SENDFILE
1204 * Send file content (through pagecache) somewhere with helper
/* sendfile() support: run a CIT_READ with the IO_SENDFILE subtype so
 * ll_file_io_generic() feeds pages to @actor/@target. */
1206 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1207 read_actor_t actor, void *target)
1210 struct vvp_io_args *args;
1215 env = cl_env_get(&refcheck);
1217 RETURN(PTR_ERR(env));
1219 args = vvp_env_args(env, IO_SENDFILE);
1220 args->u.sendfile.via_target = target;
1221 args->u.sendfile.via_actor = actor;
1223 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1224 cl_env_put(env, &refcheck);
1229 #ifdef HAVE_KERNEL_SPLICE_READ
1231 * Send file content (through pagecache) somewhere with helper
/* splice_read() support: run a CIT_READ with the IO_SPLICE subtype so
 * ll_file_io_generic() moves pages into @pipe. */
1233 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1234 struct pipe_inode_info *pipe, size_t count,
1238 struct vvp_io_args *args;
1243 env = cl_env_get(&refcheck);
1245 RETURN(PTR_ERR(env));
1247 args = vvp_env_args(env, IO_SPLICE);
1248 args->u.splice.via_pipe = pipe;
1249 args->u.splice.via_flags = flags;
1251 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1252 cl_env_put(env, &refcheck);
/*
 * Re-create a lost OST object for @inode: clone the stripe metadata,
 * build an obdo carrying OBD_FL_RECREATE_OBJS and the target OST index
 * (in o_nlink), and call obd_create() under the inode size lock.
 */
1257 static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
1260 struct obd_export *exp = ll_i2dtexp(inode);
1261 struct obd_trans_info oti = { 0 };
1262 struct obdo *oa = NULL;
1265 struct lov_stripe_md *lsm = NULL, *lsm2;
1272 lsm = ccc_inode_lsm_get(inode);
1274 GOTO(out, rc = -ENOENT);
1276 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1277 (lsm->lsm_stripe_count));
1279 OBD_ALLOC_LARGE(lsm2, lsm_size);
1281 GOTO(out, rc = -ENOMEM);
/* o_nlink is reused here to carry the target OST index — TODO confirm
 * this convention against the obd_create() recreate path. */
1284 oa->o_nlink = ost_idx;
1285 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1286 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1287 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1288 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1289 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1290 memcpy(lsm2, lsm, lsm_size);
1291 ll_inode_size_lock(inode);
1292 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1293 ll_inode_size_unlock(inode);
1295 OBD_FREE_LARGE(lsm2, lsm_size);
1298 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_RECREATE_OBJ handler: root-only ioctl that recreates an OST
 * object identified by a legacy (MDT0-sequence) object id copied in
 * from userspace.  Thin wrapper around ll_lov_recreate().
 */
1303 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1305 struct ll_recreate_obj ucreat;
/* privileged operation: recreating objects can destroy data */
1309 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1312 if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1316 ostid_set_seq_mdt0(&oi);
1317 ostid_set_id(&oi, ucreat.lrc_id);
1318 RETURN(ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx));
/*
 * LL_IOC_RECREATE_FID handler: root-only ioctl that recreates an OST
 * object named by a FID copied in from userspace.  The OST index is
 * recovered from bits 16..31 of the FID sequence number.
 */
1321 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1328 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1331 if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1334 fid_to_ostid(&fid, &oi);
/* OST index is encoded in the FID sequence; see fid_seq() layout */
1335 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1336 RETURN(ll_lov_recreate(inode, &oi, ost_idx));
/*
 * Set the striping EA on a file by re-opening it with an IT_OPEN intent
 * that carries the user-supplied lov_user_md.  Fails (further down, in
 * elided code) if a layout already exists.  The open handle obtained as
 * a side effect is released immediately via ll_release_openhandle().
 * Returns 0 or -errno.
 */
1339 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1340 int flags, struct lov_user_md *lum, int lum_size)
1342 struct lov_stripe_md *lsm = NULL;
1343 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
/* a pre-existing layout means the stripe EA cannot be (re)set */
1347 lsm = ccc_inode_lsm_get(inode);
1349 ccc_inode_lsm_put(inode, lsm);
1350 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
/* intent-open under the inode size lock installs the new layout */
1355 ll_inode_size_lock(inode);
1356 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1359 rc = oit.d.lustre.it_status;
1361 GOTO(out_req_free, rc);
/* we only wanted the EA set; drop the MDS open handle right away */
1363 ll_release_openhandle(file->f_dentry, &oit);
1366 ll_inode_size_unlock(inode);
1367 ll_intent_release(&oit);
1368 ccc_inode_lsm_put(inode, lsm);
/* error path: free the enqueue request held in the intent */
1371 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA (striping information) of 'filename' (a child of
 * 'inode') from the MDS via md_getattr_name().  On success *lmmp points
 * into the reply buffer (caller keeps *request alive and must finish
 * it), *lmm_size is the EA size.  If the client is big-endian the EA,
 * which arrives little-endian from the MDS, is byte-swapped in place.
 * Returns 0 or -errno (-ENODATA when no EA exists).
 */
1375 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1376 struct lov_mds_md **lmmp, int *lmm_size,
1377 struct ptlrpc_request **request)
1379 struct ll_sb_info *sbi = ll_i2sbi(inode);
1380 struct mdt_body *body;
1381 struct lov_mds_md *lmm = NULL;
1382 struct ptlrpc_request *req = NULL;
1383 struct md_op_data *op_data;
/* size the getattr reply buffer for the largest possible MDS EA */
1386 rc = ll_get_max_mdsize(sbi, &lmmsize);
1390 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1391 strlen(filename), lmmsize,
1392 LUSTRE_OPC_ANY, NULL);
1393 if (IS_ERR(op_data))
1394 RETURN(PTR_ERR(op_data));
1396 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1397 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1398 ll_finish_md_op_data(op_data);
1400 CDEBUG(D_INFO, "md_getattr_name failed "
1401 "on %s: rc %d\n", filename, rc);
1405 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1406 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1408 lmmsize = body->eadatasize;
/* no EA bits in the reply (or zero size, elided) -> no striping */
1410 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1412 GOTO(out, rc = -ENODATA);
1415 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1416 LASSERT(lmm != NULL);
1418 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1419 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1420 GOTO(out, rc = -EPROTO);
1424 * This is coming from the MDS, so is probably in
1425 * little endian. We convert it to host endian before
1426 * passing it to userspace.
/* true only on big-endian hosts: LOV_MAGIC != its LE encoding */
1428 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1429 /* if function called for directory - we should
1430 * avoid swab not existent lsm objects */
1431 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1432 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1433 if (S_ISREG(body->mode))
1434 lustre_swab_lov_user_md_objects(
1435 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1436 ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
1437 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1438 lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1439 if (S_ISREG(body->mode))
1440 lustre_swab_lov_user_md_objects(
1441 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1442 ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
1448 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: root-only.  Copies a lov_user_md (with one
 * trailing lov_user_ost_data) from userspace and applies it via
 * ll_lov_setstripe_ea_info() with MDS_OPEN_HAS_OBJS, i.e. the objects
 * are supplied by the caller rather than allocated by the MDS.
 */
1453 static int ll_lov_setea(struct inode *inode, struct file *file,
1456 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1457 struct lov_user_md *lump;
1458 int lum_size = sizeof(struct lov_user_md) +
1459 sizeof(struct lov_user_ost_data);
1463 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1466 OBD_ALLOC_LARGE(lump, lum_size);
1470 if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1471 OBD_FREE_LARGE(lump, lum_size);
1475 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1477 OBD_FREE_LARGE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE handler.  Copies the user layout request in two
 * steps (v1 first, then the larger v3 if the magic says so), sets the
 * stripe EA, and on success refreshes the layout generation and echoes
 * the resulting striping back to userspace via LL_IOC_LOV_GETSTRIPE.
 */
1481 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1484 struct lov_user_md_v3 lumv3;
1485 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1486 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1487 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1489 int flags = FMODE_WRITE;
1492 /* first try with v1 which is smaller than v3 */
1493 lum_size = sizeof(struct lov_user_md_v1);
1494 if (copy_from_user(lumv1, lumv1p, lum_size))
/* magic says v3: re-copy the full v3 struct over the v1 prefix */
1497 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1498 lum_size = sizeof(struct lov_user_md_v3);
1499 if (copy_from_user(&lumv3, lumv3p, lum_size))
1503 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1505 struct lov_stripe_md *lsm;
/* NOTE(review): put_user() return value appears unchecked here */
1508 put_user(0, &lumv1p->lmm_stripe_count);
1510 ll_layout_refresh(inode, &gen);
1511 lsm = ccc_inode_lsm_get(inode);
1512 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1513 0, lsm, (void *)arg);
1514 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_LOV_GETSTRIPE handler: take a reference on the inode's layout
 * and let the LOV obd_iocontrol() serialize it to the user buffer.
 */
1519 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1521 struct lov_stripe_md *lsm;
1525 lsm = ccc_inode_lsm_get(inode);
1527 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1529 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_GROUP_LOCK: acquire a cl-layer group lock with group id 'arg'
 * and record it in the per-open-file ll_file_data.  Only one group lock
 * per file descriptor is allowed; a second request, or losing the race
 * against a concurrent acquirer, releases the fresh lock again.
 * fd_flags/fd_grouplock are protected by lli->lli_lock.
 */
1533 int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1535 struct ll_inode_info *lli = ll_i2info(inode);
1536 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1537 struct ccc_grouplock grouplock;
/* group locks are meaningless when locking is disabled for this file */
1541 if (ll_file_nolock(file))
1542 RETURN(-EOPNOTSUPP);
1544 spin_lock(&lli->lli_lock);
1545 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1546 CWARN("group lock already existed with gid %lu\n",
1547 fd->fd_grouplock.cg_gid);
1548 spin_unlock(&lli->lli_lock);
1551 LASSERT(fd->fd_grouplock.cg_lock == NULL);
/* lli_lock dropped here: cl_get_grouplock() may block (O_NONBLOCK aside) */
1552 spin_unlock(&lli->lli_lock);
1554 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1555 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* re-check under the lock: another thread may have won meanwhile */
1559 spin_lock(&lli->lli_lock);
1560 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1561 spin_unlock(&lli->lli_lock);
1562 CERROR("another thread just won the race\n");
1563 cl_put_grouplock(&grouplock);
1567 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1568 fd->fd_grouplock = grouplock;
1569 spin_unlock(&lli->lli_lock);
1571 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK: release the group lock recorded on this file
 * descriptor.  Fails (in elided branches) if no group lock is held or
 * if the supplied gid does not match the one taken.  The fd state is
 * cleared under lli_lock before the cl-layer lock is dropped.
 */
1575 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1577 struct ll_inode_info *lli = ll_i2info(inode);
1578 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1579 struct ccc_grouplock grouplock;
1582 spin_lock(&lli->lli_lock);
1583 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1584 spin_unlock(&lli->lli_lock);
1585 CWARN("no group lock held\n");
1588 LASSERT(fd->fd_grouplock.cg_lock != NULL);
1590 if (fd->fd_grouplock.cg_gid != arg) {
1591 CWARN("group lock %lu doesn't match current id %lu\n",
1592 arg, fd->fd_grouplock.cg_gid);
1593 spin_unlock(&lli->lli_lock);
/* detach the lock from fd state first, then release outside lli_lock */
1597 grouplock = fd->fd_grouplock;
1598 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1599 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1600 spin_unlock(&lli->lli_lock);
1602 cl_put_grouplock(&grouplock);
1603 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1608 * Close inode open handle
1610 * \param dentry [in] dentry which contains the inode
1611 * \param it [in,out] intent which contains open info and result
1614 * \retval <0 failure
/*
 * Close the MDS open handle produced by an open intent that the caller
 * does not intend to keep (e.g. after setstripe).  No-op for the fs
 * root or when the intent carries no DISP_OPEN_OPEN disposition.  Also
 * drops the enqueue request reference if DISP_ENQ_OPEN_REF is set.
 * Returns 0 or -errno.
 */
1616 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1618 struct inode *inode = dentry->d_inode;
1619 struct obd_client_handle *och;
1625 /* Root ? Do nothing. */
1626 if (dentry->d_inode->i_sb->s_root == dentry)
1629 /* No open handle to close? Move away */
1630 if (!it_disposition(it, DISP_OPEN_OPEN))
1633 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1635 OBD_ALLOC(och, sizeof(*och));
1637 GOTO(out, rc = -ENOMEM);
/* fill the client handle from the intent, then close it on the MDS */
1639 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1641 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1644 /* this one is in place of ll_file_open */
1645 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1646 ptlrpc_req_finished(it->d.lustre.it_data);
1647 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1653 * Get size for inode for which FIEMAP mapping is requested.
1654 * Make the FIEMAP get_info call and returns the result.
/*
 * Execute a FIEMAP request: validate the flags, optionally flush dirty
 * pages (FIEMAP_FLAG_SYNC), then forward the request to the data stack
 * through obd_get_info(KEY_FIEMAP).  'fiemap' is used for both input
 * and output; 'num_bytes' is its total size including the extent array.
 * Returns 0 or -errno (-EOPNOTSUPP for striped files when the caller
 * cannot handle FIEMAP_FLAG_DEVICE_ORDER).
 */
1656 int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1659 struct obd_export *exp = ll_i2dtexp(inode);
1660 struct lov_stripe_md *lsm = NULL;
1661 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1662 int vallen = num_bytes;
1666 /* Checks for fiemap flags */
1667 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* report back which flags we do support (elided return follows) */
1668 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1672 /* Check for FIEMAP_FLAG_SYNC */
1673 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1674 rc = filemap_fdatawrite(inode->i_mapping);
1679 lsm = ccc_inode_lsm_get(inode);
1683 /* If the stripe_count > 1 and the application does not understand
1684 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1686 if (lsm->lsm_stripe_count > 1 &&
1687 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1688 GOTO(out, rc = -EOPNOTSUPP);
1690 fm_key.oa.o_oi = lsm->lsm_oi;
1691 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1693 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1694 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1695 /* If filesize is 0, then there would be no objects for mapping */
1696 if (fm_key.oa.o_size == 0) {
1697 fiemap->fm_mapped_extents = 0;
1701 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1703 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1706 CERROR("obd_get_info failed: rc = %d\n", rc);
1709 ccc_inode_lsm_put(inode, lsm);
/*
 * OBD_IOC_FID2PATH: resolve a FID to a path via the MDC.  The user
 * passes a getinfo_fid2path header ('gfin'); an output buffer sized by
 * gf_pathlen is allocated, the MDC fills it, and the whole buffer is
 * copied back.  Restricted to CAP_DAC_READ_SEARCH unless the mount
 * allows user fid2path.
 * NOTE(review): gfin->gf_pathlen comes straight from userspace and is
 * used unvalidated in the allocation size; the bounds check (if any)
 * is in elided lines — confirm against the full source.
 */
1713 int ll_fid2path(struct inode *inode, void *arg)
1715 struct obd_export *exp = ll_i2mdexp(inode);
1716 struct getinfo_fid2path *gfout, *gfin;
1720 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1721 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1724 /* Need to get the buflen */
1725 OBD_ALLOC_PTR(gfin);
1728 if (copy_from_user(gfin, arg, sizeof(*gfin))) {
1733 outsize = sizeof(*gfout) + gfin->gf_pathlen;
1734 OBD_ALLOC(gfout, outsize);
1735 if (gfout == NULL) {
1739 memcpy(gfout, gfin, sizeof(*gfout));
1742 /* Call mdc_iocontrol */
1743 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1747 if (copy_to_user(arg, gfout, outsize))
1751 OBD_FREE(gfout, outsize);
/*
 * FSFILT_IOC_FIEMAP ioctl front end: size a kernel fiemap buffer from
 * the user's fm_extent_count, copy the request (and, when extents are
 * requested, the first extent which seeds the continuation offset),
 * run ll_do_fiemap(), and copy header plus mapped extents back.
 */
1755 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1757 struct ll_user_fiemap *fiemap_s;
1758 size_t num_bytes, ret_bytes;
1759 unsigned int extent_count;
1762 /* Get the extent count so we can calculate the size of
1763 * required fiemap buffer */
1764 if (get_user(extent_count,
1765 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
/* NOTE(review): extent_count is user-controlled; overflow check for
 * this multiplication is not visible in this extraction — confirm. */
1767 num_bytes = sizeof(*fiemap_s) + (extent_count *
1768 sizeof(struct ll_fiemap_extent));
1770 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1771 if (fiemap_s == NULL)
1774 /* get the fiemap value */
1775 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1777 GOTO(error, rc = -EFAULT);
1779 /* If fm_extent_count is non-zero, read the first extent since
1780 * it is used to calculate end_offset and device from previous
1783 if (copy_from_user(&fiemap_s->fm_extents[0],
1784 (char __user *)arg + sizeof(*fiemap_s),
1785 sizeof(struct ll_fiemap_extent)))
1786 GOTO(error, rc = -EFAULT);
1789 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
/* copy back the header plus however many extents were mapped */
1793 ret_bytes = sizeof(struct ll_user_fiemap);
1795 if (extent_count != 0)
1796 ret_bytes += (fiemap_s->fm_mapped_extents *
1797 sizeof(struct ll_fiemap_extent));
1799 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1803 OBD_FREE_LARGE(fiemap_s, num_bytes);
1808 * Read the data_version for inode.
1810 * This value is computed using stripe object version on OST.
1811 * Version is computed using server side locking.
1813 * @param extent_lock Take extent lock. Not needed if a process is already
1814 * holding the OST object group locks.
/*
 * Return the server-computed data version of the inode's OST objects
 * in *data_version.  A file without a layout reports version 0 (elided
 * branch).  'extent_lock' requests extent locking during the getattr;
 * skip it when the caller already holds the object group locks.
 * Returns 0 or -errno.
 */
1816 int ll_data_version(struct inode *inode, __u64 *data_version,
1819 struct lov_stripe_md *lsm = NULL;
1820 struct ll_sb_info *sbi = ll_i2sbi(inode);
1821 struct obdo *obdo = NULL;
1825 /* If no stripe, we consider version is 0. */
1826 lsm = ccc_inode_lsm_get(inode);
1829 CDEBUG(D_INODE, "No object for inode\n");
1833 OBD_ALLOC_PTR(obdo);
/* allocation-failure path: drop the layout ref before returning */
1835 ccc_inode_lsm_put(inode, lsm);
1839 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
/* server must explicitly report a data version; otherwise error (elided) */
1841 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1844 *data_version = obdo->o_data_version;
1848 ccc_inode_lsm_put(inode, lsm);
/*
 * Scratch state for ll_swap_layouts(): saved iattrs (to restore
 * mtime/atime after the swap), both inodes, and the per-file
 * data-version check flags/values, kept together so the pairs can be
 * swapped as a unit when the inodes are reordered.
 */
1853 struct ll_swap_stack {
1854 struct iattr ia1, ia2;
1856 struct inode *inode1, *inode2;
1857 bool check_dv1, check_dv2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS implementation: atomically exchange the
 * layouts of two regular files on the same filesystem.  Steps:
 *   1. permission / same-sb / regular-file checks;
 *   2. order the pair by FID to serialize concurrent swaps;
 *   3. optionally take group locks (gid != 0) to flush dirty cache;
 *   4. optionally verify data versions have not changed (-EAGAIN);
 *   5. send the swap to the MDT via obd_iocontrol on md_op_data;
 *   6. restore mtime/atime if SWAP_LAYOUTS_KEEP_* was requested.
 * Returns 0 or -errno.
 */
1860 static int ll_swap_layouts(struct file *file1, struct file *file2,
1861 struct lustre_swap_layouts *lsl)
1863 struct mdc_swap_layouts msl;
1864 struct md_op_data *op_data;
1867 struct ll_swap_stack *llss = NULL;
1870 OBD_ALLOC_PTR(llss);
1874 llss->inode1 = file1->f_dentry->d_inode;
1875 llss->inode2 = file2->f_dentry->d_inode;
1877 if (!S_ISREG(llss->inode2->i_mode))
1878 GOTO(free, rc = -EINVAL);
1880 if (ll_permission(llss->inode1, MAY_WRITE, NULL) ||
1881 ll_permission(llss->inode2, MAY_WRITE, NULL))
1882 GOTO(free, rc = -EPERM);
1884 if (llss->inode2->i_sb != llss->inode1->i_sb)
1885 GOTO(free, rc = -EXDEV);
1887 /* we use 2 bool because it is easier to swap than 2 bits */
1888 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1889 llss->check_dv1 = true;
1891 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1892 llss->check_dv2 = true;
1894 /* we cannot use lsl->sl_dvX directly because we may swap them */
1895 llss->dv1 = lsl->sl_dv1;
1896 llss->dv2 = lsl->sl_dv2;
1898 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
1899 if (rc == 0) /* same file, done! */
/* canonical ordering by FID avoids lock-order inversions */
1902 if (rc < 0) { /* sequentialize it */
1903 swap(llss->inode1, llss->inode2);
1905 swap(llss->dv1, llss->dv2);
1906 swap(llss->check_dv1, llss->check_dv2);
1910 if (gid != 0) { /* application asks to flush dirty cache */
1911 rc = ll_get_grouplock(llss->inode1, file1, gid);
1915 rc = ll_get_grouplock(llss->inode2, file2, gid);
1917 ll_put_grouplock(llss->inode1, file1, gid);
1922 /* to be able to restore mtime and atime after swap
1923 * we need to first save them */
1925 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
1926 llss->ia1.ia_mtime = llss->inode1->i_mtime;
1927 llss->ia1.ia_atime = llss->inode1->i_atime;
1928 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
1929 llss->ia2.ia_mtime = llss->inode2->i_mtime;
1930 llss->ia2.ia_atime = llss->inode2->i_atime;
1931 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
1934 /* ultimate check, before swaping the layouts we check if
1935 * dataversion has changed (if requested) */
1936 if (llss->check_dv1) {
1937 rc = ll_data_version(llss->inode1, &dv, 0);
1940 if (dv != llss->dv1)
1941 GOTO(putgl, rc = -EAGAIN);
1944 if (llss->check_dv2) {
1945 rc = ll_data_version(llss->inode2, &dv, 0);
1948 if (dv != llss->dv2)
1949 GOTO(putgl, rc = -EAGAIN);
1952 /* struct md_op_data is used to send the swap args to the mdt
1953 * only flags is missing, so we use struct mdc_swap_layouts
1954 * through the md_op_data->op_data */
1955 /* flags from user space have to be converted before they are send to
1956 * server, no flag is sent today, they are only used on the client */
1959 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
1960 0, LUSTRE_OPC_ANY, &msl);
1961 if (IS_ERR(op_data))
1962 GOTO(free, rc = PTR_ERR(op_data));
1964 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
1965 sizeof(*op_data), op_data, NULL);
1966 ll_finish_md_op_data(op_data);
1970 ll_put_grouplock(llss->inode2, file2, gid);
1971 ll_put_grouplock(llss->inode1, file1, gid);
1974 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
1978 /* clear useless flags */
1979 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
1980 llss->ia1.ia_valid &= ~ATTR_MTIME;
1981 llss->ia2.ia_valid &= ~ATTR_MTIME;
1984 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
1985 llss->ia1.ia_valid &= ~ATTR_ATIME;
1986 llss->ia2.ia_valid &= ~ATTR_ATIME;
1989 /* update time if requested */
/* note ia2 is applied to inode1 and vice versa: times follow the data */
1991 if (llss->ia2.ia_valid != 0) {
1992 mutex_lock(&llss->inode1->i_mutex);
1993 rc = ll_setattr(file1->f_dentry, &llss->ia2);
1994 mutex_unlock(&llss->inode1->i_mutex);
1997 if (llss->ia1.ia_valid != 0) {
2000 mutex_lock(&llss->inode2->i_mutex);
2001 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2002 mutex_unlock(&llss->inode2->i_mutex);
/*
 * unlocked_ioctl entry point for regular files.  Dispatches the Lustre
 * ioctl namespace: per-fd flags, striping get/set, layout swap, object
 * recreation, fiemap, group locks, fid/path translation, data version,
 * HSM state/action queries, and finally the registered ll_iocontrol
 * handlers / raw obd_iocontrol fallthrough.  Returns 0 or -errno
 * (-ENOTTY for tty ioctls, elided).
 */
2014 long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2016 struct inode *inode = file->f_dentry->d_inode;
2017 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2021 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2022 inode->i_generation, inode, cmd);
2023 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2025 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2026 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2030 case LL_IOC_GETFLAGS:
2031 /* Get the current value of the file flags */
2032 return put_user(fd->fd_flags, (int *)arg);
2033 case LL_IOC_SETFLAGS:
2034 case LL_IOC_CLRFLAGS:
2035 /* Set or clear specific file flags */
2036 /* XXX This probably needs checks to ensure the flags are
2037 * not abused, and to handle any flag side effects.
2039 if (get_user(flags, (int *) arg))
2042 if (cmd == LL_IOC_SETFLAGS) {
/* ignoring locks is only safe when the page cache is bypassed */
2043 if ((flags & LL_FILE_IGNORE_LOCK) &&
2044 !(file->f_flags & O_DIRECT)) {
2045 CERROR("%s: unable to disable locking on "
2046 "non-O_DIRECT file\n", current->comm);
2050 fd->fd_flags |= flags;
2052 fd->fd_flags &= ~flags;
2055 case LL_IOC_LOV_SETSTRIPE:
2056 RETURN(ll_lov_setstripe(inode, file, arg));
2057 case LL_IOC_LOV_SETEA:
2058 RETURN(ll_lov_setea(inode, file, arg));
2059 case LL_IOC_LOV_SWAP_LAYOUTS: {
2061 struct lustre_swap_layouts lsl;
2063 if (cfs_copy_from_user(&lsl, (char *)arg,
2064 sizeof(struct lustre_swap_layouts)))
/* both fds must be writable for a layout swap */
2067 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2070 file2 = fget(lsl.sl_fd);
2075 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2076 rc = ll_swap_layouts(file, file2, &lsl);
2080 case LL_IOC_LOV_GETSTRIPE:
2081 RETURN(ll_lov_getstripe(inode, arg));
2082 case LL_IOC_RECREATE_OBJ:
2083 RETURN(ll_lov_recreate_obj(inode, arg));
2084 case LL_IOC_RECREATE_FID:
2085 RETURN(ll_lov_recreate_fid(inode, arg));
2086 case FSFILT_IOC_FIEMAP:
2087 RETURN(ll_ioctl_fiemap(inode, arg));
2088 case FSFILT_IOC_GETFLAGS:
2089 case FSFILT_IOC_SETFLAGS:
2090 RETURN(ll_iocontrol(inode, file, cmd, arg));
2091 case FSFILT_IOC_GETVERSION_OLD:
2092 case FSFILT_IOC_GETVERSION:
2093 RETURN(put_user(inode->i_generation, (int *)arg));
2094 case LL_IOC_GROUP_LOCK:
2095 RETURN(ll_get_grouplock(inode, file, arg));
2096 case LL_IOC_GROUP_UNLOCK:
2097 RETURN(ll_put_grouplock(inode, file, arg));
2098 case IOC_OBD_STATFS:
2099 RETURN(ll_obd_statfs(inode, (void *)arg));
2101 /* We need to special case any other ioctls we want to handle,
2102 * to send them to the MDS/OST as appropriate and to properly
2103 * network encode the arg field.
2104 case FSFILT_IOC_SETVERSION_OLD:
2105 case FSFILT_IOC_SETVERSION:
2107 case LL_IOC_FLUSHCTX:
2108 RETURN(ll_flush_ctx(inode));
2109 case LL_IOC_PATH2FID: {
2110 if (copy_to_user((void *)arg, ll_inode2fid(inode),
2111 sizeof(struct lu_fid)))
2116 case OBD_IOC_FID2PATH:
2117 RETURN(ll_fid2path(inode, (void *)arg));
2118 case LL_IOC_DATA_VERSION: {
2119 struct ioc_data_version idv;
2122 if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
/* LL_DV_NOFLUSH lets the caller skip the extent-lock flush */
2125 rc = ll_data_version(inode, &idv.idv_version,
2126 !(idv.idv_flags & LL_DV_NOFLUSH));
2128 if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2134 case LL_IOC_GET_MDTIDX: {
2137 mdtidx = ll_get_mdt_idx(inode);
2141 if (put_user((int)mdtidx, (int*)arg))
2146 case OBD_IOC_GETDTNAME:
2147 case OBD_IOC_GETMDNAME:
2148 RETURN(ll_get_obd_name(inode, cmd, arg));
2149 case LL_IOC_HSM_STATE_GET: {
2150 struct md_op_data *op_data;
2151 struct hsm_user_state *hus;
2158 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2159 LUSTRE_OPC_ANY, hus);
2160 if (IS_ERR(op_data)) {
2162 RETURN(PTR_ERR(op_data));
2165 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2168 if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2171 ll_finish_md_op_data(op_data);
2175 case LL_IOC_HSM_STATE_SET: {
2176 struct md_op_data *op_data;
2177 struct hsm_state_set *hss;
2183 if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2188 /* Non-root users are forbidden to set or clear flags which are
2189 * NOT defined in HSM_USER_MASK. */
2190 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK)
2191 && !cfs_capable(CFS_CAP_SYS_ADMIN)) {
2196 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2197 LUSTRE_OPC_ANY, hss);
2198 if (IS_ERR(op_data)) {
2200 RETURN(PTR_ERR(op_data));
2203 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2206 ll_finish_md_op_data(op_data);
2211 case LL_IOC_HSM_ACTION: {
2212 struct md_op_data *op_data;
2213 struct hsm_current_action *hca;
2220 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2221 LUSTRE_OPC_ANY, hca);
2222 if (IS_ERR(op_data)) {
2224 RETURN(PTR_ERR(op_data));
2227 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2230 if (cfs_copy_to_user((char *)arg, hca, sizeof(*hca)))
2233 ll_finish_md_op_data(op_data);
/* default: registered handlers first, then raw obd_iocontrol on the
 * data export as a last resort */
2241 ll_iocontrol_call(inode, file, cmd, arg, &err))
2244 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2250 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Local fallback for kernels without generic_file_llseek_size():
 * validate 'offset' against sign/FMODE_UNSIGNED_OFFSET rules and
 * 'maxsize', then commit it to file->f_pos (resetting f_version so
 * readdir-style users notice the seek).  Returns the new offset or
 * -errno (error returns elided here).
 */
2251 static inline loff_t
2252 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2254 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2256 if (offset > maxsize)
2259 if (offset != file->f_pos) {
2260 file->f_pos = offset;
2261 file->f_version = 0;
/*
 * Local copy of the upstream generic_file_llseek_size() helper (built
 * only when the kernel does not export it): compute the new position
 * for SEEK_SET/CUR/END/DATA/HOLE against 'maxsize' and the supplied
 * 'eof', taking i_mutex for the SEEK_CUR update.  Returns the new
 * offset or -errno via llseek_execute().
 */
2267 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2268 loff_t maxsize, loff_t eof)
2270 struct inode *inode = file->f_dentry->d_inode;
2278 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2279 * position-querying operation. Avoid rewriting the "same"
2280 * f_pos value back to the file because a concurrent read(),
2281 * write() or lseek() might have altered it
2286 * f_lock protects against read/modify/write race with other
2287 * SEEK_CURs. Note that parallel writes and reads behave
2290 mutex_lock(&inode->i_mutex);
2291 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2292 mutex_unlock(&inode->i_mutex);
2296 * In the generic case the entire file is data, so as long as
2297 * offset isn't at the end of the file then the offset is data.
2304 * There is a virtual hole at the end of the file, so as long as
2305 * offset isn't i_size or larger, return i_size.
2313 return llseek_execute(file, offset, maxsize);
/*
 * llseek() file operation.  For SEEK_END/SEEK_HOLE/SEEK_DATA the true
 * size must be known, so glimpse the OSTs first and pass the fresh
 * i_size as 'eof' to the generic size-aware llseek helper.  Returns
 * the new position or -errno.
 */
2317 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2319 struct inode *inode = file->f_dentry->d_inode;
2320 loff_t retval, eof = 0;
/* retval here is only a preview of the target offset for the CDEBUG */
2323 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2324 (origin == SEEK_CUR) ? file->f_pos : 0);
2325 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
2326 inode->i_ino, inode->i_generation, inode, retval, retval,
2328 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2330 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2331 retval = ll_glimpse_size(inode);
2334 eof = i_size_read(inode);
2337 retval = ll_generic_file_llseek_size(file, offset, origin,
2338 ll_file_maxbytes(inode), eof);
/*
 * flush() file operation (called on every close of a descriptor).
 * Collect async writeback errors recorded on the inode and its cl
 * object and report -EIO once — unless this descriptor has already
 * been told about a write failure (fd_write_failed).
 */
2342 int ll_flush(struct file *file, fl_owner_t id)
2344 struct inode *inode = file->f_dentry->d_inode;
2345 struct ll_inode_info *lli = ll_i2info(inode);
2346 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2349 LASSERT(!S_ISDIR(inode->i_mode));
2351 /* catch async errors that were recorded back when async writeback
2352 * failed for pages in this mapping. */
2353 rc = lli->lli_async_rc;
2354 lli->lli_async_rc = 0;
2355 err = lov_read_and_clear_async_rc(lli->lli_clob);
2359 /* The application has been told write failure already.
2360 * Do not report failure again. */
2361 if (fd->fd_write_failed)
2363 return rc ? -EIO : 0;
2367 * Called to make sure a portion of file has been written out.
2368 * if @local_only is not true, it will send OST_SYNC RPCs to ost.
2370 * Return how many pages have been written.
/*
 * Sync [start, end] of the inode through a CIT_FSYNC cl_io.  'mode'
 * selects none/local/discard/full-OST sync; 'ignore_layout' lets the
 * IO proceed across layout changes.  On success returns the number of
 * pages written (fi_nr_written), otherwise -errno.
 */
2372 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2373 enum cl_fsync_mode mode, int ignore_layout)
2375 struct cl_env_nest nest;
2378 struct obd_capa *capa = NULL;
2379 struct cl_fsync_io *fio;
2383 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2384 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
/* nested env: this may be called from within another cl_io */
2387 env = cl_env_nested_get(&nest);
2389 RETURN(PTR_ERR(env));
2391 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2393 io = ccc_env_thread_io(env);
2394 io->ci_obj = cl_i2info(inode)->lli_clob;
2395 io->ci_ignore_layout = ignore_layout;
2397 /* initialize parameters for sync */
2398 fio = &io->u.ci_fsync;
2399 fio->fi_capa = capa;
2400 fio->fi_start = start;
2402 fio->fi_fid = ll_inode2fid(inode);
2403 fio->fi_mode = mode;
2404 fio->fi_nr_written = 0;
2406 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2407 result = cl_io_loop(env, io);
2409 result = io->ci_result;
2411 result = fio->fi_nr_written;
2412 cl_io_fini(env, io);
2413 cl_env_nested_put(&nest, env);
2421 * When dentry is provided (the 'else' case), *file->f_dentry may be
2422 * null and dentry must be used directly rather than pulled from
2423 * *file->f_dentry as is done otherwise.
/*
 * fsync() file operation with three kernel-ABI variants selected at
 * build time (4-arg range fsync, 2-arg, legacy dentry form).  Waits
 * for in-flight page IO, harvests recorded async write errors, syncs
 * metadata via md_sync(), and for datasync on regular files pushes
 * data with cl_sync_file_range(CL_FSYNC_ALL, elided), updating
 * fd_write_failed accordingly.  Returns 0 or -errno.
 */
2426 #ifdef HAVE_FILE_FSYNC_4ARGS
2427 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2429 struct dentry *dentry = file->f_dentry;
2430 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2431 int ll_fsync(struct file *file, int datasync)
2433 struct dentry *dentry = file->f_dentry;
2435 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2438 struct inode *inode = dentry->d_inode;
2439 struct ll_inode_info *lli = ll_i2info(inode);
2440 struct ptlrpc_request *req;
2441 struct obd_capa *oc;
2445 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2446 inode->i_generation, inode);
2447 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2449 #ifdef HAVE_FILE_FSYNC_4ARGS
/* range variant: we must write and wait ourselves, under i_mutex */
2450 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2451 mutex_lock(&inode->i_mutex);
2453 /* fsync's caller has already called _fdata{sync,write}, we want
2454 * that IO to finish before calling the osc and mdc sync methods */
2455 rc = filemap_fdatawait(inode->i_mapping);
2458 /* catch async errors that were recorded back when async writeback
2459 * failed for pages in this mapping. */
2460 if (!S_ISDIR(inode->i_mode)) {
2461 err = lli->lli_async_rc;
2462 lli->lli_async_rc = 0;
2465 err = lov_read_and_clear_async_rc(lli->lli_clob);
2470 oc = ll_mdscapa_get(inode);
2471 err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2477 ptlrpc_req_finished(req);
2479 if (datasync && S_ISREG(inode->i_mode)) {
2480 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2482 err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
2484 if (rc == 0 && err < 0)
/* remember the outcome so ll_flush() won't double-report */
2487 fd->fd_write_failed = true;
2489 fd->fd_write_failed = false;
2492 #ifdef HAVE_FILE_FSYNC_4ARGS
2493 mutex_unlock(&inode->i_mutex);
/*
 * flock()/fcntl() lock operation: translate the VFS file_lock into an
 * LDLM flock enqueue on the MDS.  BSD flocks become whole-file locks
 * keyed by the struct file pointer; POSIX locks use fl_owner and the
 * byte range.  Lock type maps to LCK_PR/PW, F_UNLCK to LCK_NL (treated
 * as cancel server-side).  On success the local lock tables are
 * updated; if that local step fails the server lock is rolled back
 * with a compensating LCK_NL enqueue.  Returns 0 or -errno.
 */
2498 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2500 struct inode *inode = file->f_dentry->d_inode;
2501 struct ll_sb_info *sbi = ll_i2sbi(inode);
2502 struct ldlm_enqueue_info einfo = { .ei_type = LDLM_FLOCK,
2503 .ei_cb_cp =ldlm_flock_completion_ast,
2504 .ei_cbdata = file_lock };
2505 struct md_op_data *op_data;
2506 struct lustre_handle lockh = {0};
2507 ldlm_policy_data_t flock = {{0}};
2513 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2514 inode->i_ino, file_lock);
2516 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2518 if (file_lock->fl_flags & FL_FLOCK) {
2519 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2520 /* flocks are whole-file locks */
2521 flock.l_flock.end = OFFSET_MAX;
2522 /* For flocks owner is determined by the local file desctiptor*/
2523 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2524 } else if (file_lock->fl_flags & FL_POSIX) {
2525 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2526 flock.l_flock.start = file_lock->fl_start;
2527 flock.l_flock.end = file_lock->fl_end;
2531 flock.l_flock.pid = file_lock->fl_pid;
2533 /* Somewhat ugly workaround for svc lockd.
2534 * lockd installs custom fl_lmops->lm_compare_owner that checks
2535 * for the fl_owner to be the same (which it always is on local node
2536 * I guess between lockd processes) and then compares pid.
2537 * As such we assign pid to the owner field to make it all work,
2538 * conflict with normal locks is unlikely since pid space and
2539 * pointer space for current->files are not intersecting */
2540 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2541 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
2543 switch (file_lock->fl_type) {
2545 einfo.ei_mode = LCK_PR;
2548 /* An unlock request may or may not have any relation to
2549 * existing locks so we may not be able to pass a lock handle
2550 * via a normal ldlm_lock_cancel() request. The request may even
2551 * unlock a byte range in the middle of an existing lock. In
2552 * order to process an unlock request we need all of the same
2553 * information that is given with a normal read or write record
2554 * lock request. To avoid creating another ldlm unlock (cancel)
2555 * message we'll treat a LCK_NL flock request as an unlock. */
2556 einfo.ei_mode = LCK_NL;
2559 einfo.ei_mode = LCK_PW;
2562 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2563 file_lock->fl_type);
/* non-blocking set request (elided F_SETLK branch) */
2578 flags = LDLM_FL_BLOCK_NOWAIT;
/* test-only request (elided F_GETLK branch) */
2584 flags = LDLM_FL_TEST_LOCK;
2585 /* Save the old mode so that if the mode in the lock changes we
2586 * can decrement the appropriate reader or writer refcount. */
2587 file_lock->fl_type = einfo.ei_mode;
2590 CERROR("unknown fcntl lock command: %d\n", cmd);
2594 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2595 LUSTRE_OPC_ANY, NULL);
2596 if (IS_ERR(op_data))
2597 RETURN(PTR_ERR(op_data));
2599 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2600 "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2601 flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2603 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2604 op_data, &lockh, &flock, 0, NULL /* req */, flags);
/* mirror the server decision in the kernel's local lock tables */
2606 if ((file_lock->fl_flags & FL_FLOCK) &&
2607 (rc == 0 || file_lock->fl_type == F_UNLCK))
2608 rc2 = flock_lock_file_wait(file, file_lock);
2609 if ((file_lock->fl_flags & FL_POSIX) &&
2610 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2611 !(flags & LDLM_FL_TEST_LOCK))
2612 rc2 = posix_lock_file_wait(file, file_lock);
2614 if (rc2 && file_lock->fl_type != F_UNLCK) {
/* local bookkeeping failed: undo the server-side lock */
2615 einfo.ei_mode = LCK_NL;
2616 md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2617 op_data, &lockh, &flock, 0, NULL /* req */, flags);
2621 ll_finish_md_op_data(op_data);
/*
 * lock operation installed when file locking is disabled for the
 * mount; body not visible in this extraction.
 */
2626 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2634 * test if some locks matching bits and l_req_mode are acquired
2635 * - bits can be in different locks
2636 * - if found clear the common lock bits in *bits
2637 * - the bits not found, are kept in *bits
2639 * \param bits [IN] searched lock bits [IN]
2640 * \param l_req_mode [IN] searched lock mode
2641 * \retval boolean, true iff all bits are found
/*
 * Test whether MDS ibits locks covering *bits are already cached, in
 * any combination of locks.  Matching uses LDLM_FL_TEST_LOCK so no
 * references are taken; matched bits are cleared from *bits (a matched
 * lock may satisfy several requested bits at once), unmatched bits
 * remain.  LCK_MINMODE means "any of CR|CW|PR|PW".  Returns true iff
 * all requested bits were found (elided return).
 */
2643 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
2645 struct lustre_handle lockh;
2646 ldlm_policy_data_t policy;
2647 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2648 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2657 fid = &ll_i2info(inode)->lli_fid;
2658 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2659 ldlm_lockname[mode]);
2661 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2662 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2663 policy.l_inodebits.bits = *bits & (1 << i);
2664 if (policy.l_inodebits.bits == 0)
2667 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2668 &policy, mode, &lockh)) {
2669 struct ldlm_lock *lock;
2671 lock = ldlm_handle2lock(&lockh);
/* clear every bit this lock grants, not just the probed one */
2674 ~(lock->l_policy_data.l_inodebits.bits);
2675 LDLM_LOCK_PUT(lock);
2677 *bits &= ~policy.l_inodebits.bits;
/* Try to match (and take a reference on) a cached MDS inodebits lock covering
 * `bits` in any of CR/CW/PR/PW; the matched handle is returned via `lockh`.
 * NOTE(review): the return statement is elided here; presumably the matched
 * mode (0 when no lock matched) -- confirm against the full source. */
2684 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2685 struct lustre_handle *lockh, __u64 flags)
2687 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2692 fid = &ll_i2info(inode)->lli_fid;
2693 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2695 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
2696 fid, LDLM_IBITS, &policy,
2697 LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
/* Post-process the rc of an MDS revalidation RPC: -ENOENT on a non-regular,
 * non-directory inode is tolerated (object already unlinked); other non-zero
 * codes are logged as errors. */
2701 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2703 /* Already unlinked. Just update nlink and return success */
2704 if (rc == -ENOENT) {
2706 /* This path cannot be hit for regular files unless in
2707 * case of obscure races, so no need to validate
2709 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2711 } else if (rc != 0) {
2712 CERROR("%s: revalidate FID "DFID" error: rc = %d\n",
2713 ll_get_fsname(inode->i_sb, NULL, 0),
2714 PFID(ll_inode2fid(inode)), rc);
/* Revalidate a dentry's inode attributes against the MDS for the inodebits
 * in `ibits`.  Two paths: (1) server supports OBD_CONNECT_ATTRFID -- issue an
 * intent getattr/lookup by FID; (2) otherwise, if no cached lock covers the
 * bits, issue a plain md_getattr RPC and refresh the inode from the reply. */
2720 int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2723 struct inode *inode = dentry->d_inode;
2724 struct ptlrpc_request *req = NULL;
2725 struct obd_export *exp;
2729 LASSERT(inode != NULL);
2731 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2732 inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2734 exp = ll_i2mdexp(inode);
2736 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
2737 * But under CMD case, it caused some lock issues, should be fixed
2738 * with new CMD ibits lock. See bug 12718 */
2739 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
2740 struct lookup_intent oit = { .it_op = IT_GETATTR };
2741 struct md_op_data *op_data;
/* Only LOOKUP bit requested -> weaker IT_LOOKUP intent suffices. */
2743 if (ibits == MDS_INODELOCK_LOOKUP)
2744 oit.it_op = IT_LOOKUP;
2746 /* Call getattr by fid, so do not provide name at all. */
2747 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2748 dentry->d_inode, NULL, 0, 0,
2749 LUSTRE_OPC_ANY, NULL);
2750 if (IS_ERR(op_data))
2751 RETURN(PTR_ERR(op_data));
/* M_CHECK_STALE asks the MDS path to verify the object still exists. */
2753 oit.it_create_mode |= M_CHECK_STALE;
2754 rc = md_intent_lock(exp, op_data, NULL, 0,
2755 /* we are not interested in name
2758 ll_md_blocking_ast, 0);
2759 ll_finish_md_op_data(op_data);
2760 oit.it_create_mode &= ~M_CHECK_STALE;
2762 rc = ll_inode_revalidate_fini(inode, rc);
2766 rc = ll_revalidate_it_finish(req, &oit, dentry);
2768 ll_intent_release(&oit);
2772 /* Unlinked? Unhash dentry, so it is not picked up later by
2773 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2774 here to preserve get_cwd functionality on 2.6.
2776 if (!dentry->d_inode->i_nlink)
2777 d_lustre_invalidate(dentry, 0);
2779 ll_lookup_finish_locks(&oit, dentry);
/* No ATTRFID: only RPC if the needed bits are not already covered by a
 * cached MDS lock of any mode. */
2780 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
2781 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2782 obd_valid valid = OBD_MD_FLGETATTR;
2783 struct md_op_data *op_data;
/* Regular files: also fetch the striping EA, sized to the MDS max. */
2786 if (S_ISREG(inode->i_mode)) {
2787 rc = ll_get_max_mdsize(sbi, &ealen);
2790 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2793 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
2794 0, ealen, LUSTRE_OPC_ANY,
2796 if (IS_ERR(op_data))
2797 RETURN(PTR_ERR(op_data));
2799 op_data->op_valid = valid;
2800 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2801 * capa for this inode. Because we only keep capas of dirs
2803 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
2804 ll_finish_md_op_data(op_data);
2806 rc = ll_inode_revalidate_fini(inode, rc);
/* Refresh the in-core inode from the getattr reply. */
2810 rc = ll_prep_inode(&inode, req, NULL, NULL);
2813 ptlrpc_req_finished(req);
/* Revalidate attributes, then refresh timestamps: non-regular objects take
 * the MDS-provided LVB times directly; regular files glimpse the OSTs for
 * an up-to-date size. */
2817 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2820 struct inode *inode = dentry->d_inode;
2824 rc = __ll_inode_revalidate_it(dentry, it, ibits);
2828 /* if object isn't regular file, don't validate size */
2829 if (!S_ISREG(inode->i_mode)) {
2830 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
2831 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
2832 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
/* Regular file path (else branch elided): ask OSTs for current size. */
2834 rc = ll_glimpse_size(inode);
/* getattr with an explicit lookup intent: revalidate UPDATE|LOOKUP bits,
 * bump the LPROC_LL_GETATTR counter, then populate *stat from the (now
 * refreshed) in-core inode. */
2839 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2840 struct lookup_intent *it, struct kstat *stat)
2842 struct inode *inode = de->d_inode;
2843 struct ll_sb_info *sbi = ll_i2sbi(inode);
2844 struct ll_inode_info *lli = ll_i2info(inode);
2847 res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE |
2848 MDS_INODELOCK_LOOKUP);
2849 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
2854 stat->dev = inode->i_sb->s_dev;
/* 32-bit userland on a 64-bit FID namespace needs a compressed ino. */
2855 if (ll_need_32bit_api(sbi))
2856 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
2858 stat->ino = inode->i_ino;
2859 stat->mode = inode->i_mode;
2860 stat->nlink = inode->i_nlink;
2861 stat->uid = inode->i_uid;
2862 stat->gid = inode->i_gid;
2863 stat->rdev = inode->i_rdev;
2864 stat->atime = inode->i_atime;
2865 stat->mtime = inode->i_mtime;
2866 stat->ctime = inode->i_ctime;
2867 stat->blksize = 1 << inode->i_blkbits;
2869 stat->size = i_size_read(inode);
2870 stat->blocks = inode->i_blocks;
/* VFS ->getattr entry point: delegate to ll_getattr_it() with a fresh
 * IT_GETATTR intent. */
2874 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2876 struct lookup_intent it = { .it_op = IT_GETATTR };
2878 return ll_getattr_it(mnt, de, &it, stat);
2881 #ifdef HAVE_LINUX_FIEMAP_H
/* VFS ->fiemap: marshal the kernel's fiemap_extent_info into a Lustre
 * ll_user_fiemap, run ll_do_fiemap(), and copy flags/extents back out. */
2882 int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2883 __u64 start, __u64 len)
2887 struct ll_user_fiemap *fiemap;
2888 unsigned int extent_count = fieinfo->fi_extents_max;
/* Header plus room for the caller's maximum number of extents. */
2890 num_bytes = sizeof(*fiemap) + (extent_count *
2891 sizeof(struct ll_fiemap_extent));
2892 OBD_ALLOC_LARGE(fiemap, num_bytes);
2897 fiemap->fm_flags = fieinfo->fi_flags;
2898 fiemap->fm_extent_count = fieinfo->fi_extents_max;
2899 fiemap->fm_start = start;
2900 fiemap->fm_length = len;
/* NOTE(review): only ONE extent is copied in here (sizeof a single
 * ll_fiemap_extent), while the full mapped set is copied back below --
 * presumably seeding the first extent for continuation; confirm. */
2901 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
2902 sizeof(struct ll_fiemap_extent));
2904 rc = ll_do_fiemap(inode, fiemap, num_bytes);
2906 fieinfo->fi_flags = fiemap->fm_flags;
2907 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
2908 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
2909 fiemap->fm_mapped_extents * sizeof(struct ll_fiemap_extent));
2911 OBD_FREE_LARGE(fiemap, num_bytes);
/* VFS ->get_acl: return a referenced copy of the cached POSIX ACL, taken
 * under lli_lock so the cached pointer cannot change mid-dup. */
2916 struct posix_acl * ll_get_acl(struct inode *inode, int type)
2918 struct ll_inode_info *lli = ll_i2info(inode);
2919 struct posix_acl *acl = NULL;
2922 spin_lock(&lli->lli_lock);
2923 /* VFS' acl_permission_check->check_acl will release the refcount */
2924 acl = posix_acl_dup(lli->lli_posix_acl);
2925 spin_unlock(&lli->lli_lock);
2930 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
/* ACL check callback handed to generic_permission(); signature varies with
 * the kernel's generic_permission() arity (4-arg vs older forms). */
2932 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
2933 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
2935 ll_check_acl(struct inode *inode, int mask)
2938 # ifdef CONFIG_FS_POSIX_ACL
2939 struct posix_acl *acl;
2943 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
/* RCU walk cannot block on the ACL spinlock/dup; bail out (elided). */
2944 if (flags & IPERM_FLAG_RCU)
2947 acl = ll_get_acl(inode, ACL_TYPE_ACCESS)
2952 rc = posix_acl_permission(inode, acl, mask);
2953 posix_acl_release(acl);
2956 # else /* !CONFIG_FS_POSIX_ACL */
2958 # endif /* CONFIG_FS_POSIX_ACL */
2960 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
2962 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
/* VFS ->permission (signature varies by kernel): revalidate the root inode
 * if needed, honor remote-client permission mode, else fall through to
 * generic_permission() with ll_check_acl. */
2963 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
2965 # ifdef HAVE_INODE_PERMISION_2ARGS
2966 int ll_inode_permission(struct inode *inode, int mask)
2968 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
/* RCU-walk lookups must not block; defer to ref-walk (elided return). */
2975 #ifdef MAY_NOT_BLOCK
2976 if (mask & MAY_NOT_BLOCK)
2978 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
2979 if (flags & IPERM_FLAG_RCU)
2983 /* as root inode are NOT getting validated in lookup operation,
2984 * need to do it before permission check. */
2986 if (inode == inode->i_sb->s_root->d_inode) {
2987 struct lookup_intent it = { .it_op = IT_LOOKUP };
2989 rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
2990 MDS_INODELOCK_LOOKUP);
2995 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
2996 inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
/* Remote client: authorization is decided by the MDS, not local bits. */
2998 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2999 return lustre_check_remote_perm(inode, mask);
3001 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
3002 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
/* Select the vectored-I/O entry names for the file_operations tables below:
 * old kernels expose ->readv/->writev, newer ones ->aio_read/->aio_write. */
3007 #ifdef HAVE_FILE_READV
3008 #define READ_METHOD readv
3009 #define READ_FUNCTION ll_file_readv
3010 #define WRITE_METHOD writev
3011 #define WRITE_FUNCTION ll_file_writev
3013 #define READ_METHOD aio_read
3014 #define READ_FUNCTION ll_file_aio_read
3015 #define WRITE_METHOD aio_write
3016 #define WRITE_FUNCTION ll_file_aio_write
3019 /* -o localflock - only provides locally consistent flock locks */
/* Default file_operations table: no ->flock/->lock entries, so the kernel's
 * built-in (node-local) locking applies.  .READ_METHOD/.WRITE_METHOD expand
 * to the kernel-version-appropriate members via the macros above. */
3020 struct file_operations ll_file_operations = {
3021 .read = ll_file_read,
3022 .READ_METHOD = READ_FUNCTION,
3023 .write = ll_file_write,
3024 .WRITE_METHOD = WRITE_FUNCTION,
3025 .unlocked_ioctl = ll_file_ioctl,
3026 .open = ll_file_open,
3027 .release = ll_file_release,
3028 .mmap = ll_file_mmap,
3029 .llseek = ll_file_seek,
3030 #ifdef HAVE_KERNEL_SENDFILE
3031 .sendfile = ll_file_sendfile,
3033 #ifdef HAVE_KERNEL_SPLICE_READ
3034 .splice_read = ll_file_splice_read,
/* -o flock variant: same as ll_file_operations plus cluster-coherent
 * ->flock and ->lock handlers backed by DLM flock locks. */
3040 struct file_operations ll_file_operations_flock = {
3041 .read = ll_file_read,
3042 .READ_METHOD = READ_FUNCTION,
3043 .write = ll_file_write,
3044 .WRITE_METHOD = WRITE_FUNCTION,
3045 .unlocked_ioctl = ll_file_ioctl,
3046 .open = ll_file_open,
3047 .release = ll_file_release,
3048 .mmap = ll_file_mmap,
3049 .llseek = ll_file_seek,
3050 #ifdef HAVE_KERNEL_SENDFILE
3051 .sendfile = ll_file_sendfile,
3053 #ifdef HAVE_KERNEL_SPLICE_READ
3054 .splice_read = ll_file_splice_read,
3058 .flock = ll_file_flock,
3059 .lock = ll_file_flock
3062 /* These are for -o noflock - to return ENOSYS on flock calls */
/* -o noflock variant: ->flock/->lock wired to ll_file_noflock so user-space
 * locking requests fail explicitly instead of being silently local. */
3063 struct file_operations ll_file_operations_noflock = {
3064 .read = ll_file_read,
3065 .READ_METHOD = READ_FUNCTION,
3066 .write = ll_file_write,
3067 .WRITE_METHOD = WRITE_FUNCTION,
3068 .unlocked_ioctl = ll_file_ioctl,
3069 .open = ll_file_open,
3070 .release = ll_file_release,
3071 .mmap = ll_file_mmap,
3072 .llseek = ll_file_seek,
3073 #ifdef HAVE_KERNEL_SENDFILE
3074 .sendfile = ll_file_sendfile,
3076 #ifdef HAVE_KERNEL_SPLICE_READ
3077 .splice_read = ll_file_splice_read,
3081 .flock = ll_file_noflock,
3082 .lock = ll_file_noflock
/* inode_operations for regular files: attribute, permission, xattr and
 * (when available) fiemap/get_acl handlers. */
3085 struct inode_operations ll_file_inode_operations = {
3086 .setattr = ll_setattr,
3087 .getattr = ll_getattr,
3088 .permission = ll_inode_permission,
3089 .setxattr = ll_setxattr,
3090 .getxattr = ll_getxattr,
3091 .listxattr = ll_listxattr,
3092 .removexattr = ll_removexattr,
3093 #ifdef HAVE_LINUX_FIEMAP_H
3094 .fiemap = ll_fiemap,
3096 #ifdef HAVE_IOP_GET_ACL
3097 .get_acl = ll_get_acl,
3101 /* dynamic ioctl number support routines */
/* Registry of dynamically registered ioctl handlers: a list of llioc_data
 * entries protected by an rw_semaphore (readers dispatch, writers
 * register/unregister). */
3102 static struct llioc_ctl_data {
3103 struct rw_semaphore ioc_sem;
3104 cfs_list_t ioc_head;
3106 __RWSEM_INITIALIZER(llioc.ioc_sem),
3107 CFS_LIST_HEAD_INIT(llioc.ioc_head)
/* One registered handler: callback plus the array of ioctl cmd numbers it
 * serves (flexible trailing array; iocd_size is the full allocation size). */
3112 cfs_list_t iocd_list;
3113 unsigned int iocd_size;
3114 llioc_callback_t iocd_cb;
3115 unsigned int iocd_count;
3116 unsigned int iocd_cmd[0];
/* Register a dynamic ioctl handler: validate args, allocate an llioc_data
 * sized for `count` cmd numbers, copy them in, and append to the registry
 * under the write semaphore.  The returned pointer doubles as the
 * unregistration magic (return elided in this extract). */
3119 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3122 struct llioc_data *in_data = NULL;
3125 if (cb == NULL || cmd == NULL ||
3126 count > LLIOC_MAX_CMD || count < 0)
3129 size = sizeof(*in_data) + count * sizeof(unsigned int);
3130 OBD_ALLOC(in_data, size);
3131 if (in_data == NULL)
/* NOTE(review): memset of sizeof(*in_data) only clears the header, not the
 * trailing cmd array -- harmless since it is fully overwritten below. */
3134 memset(in_data, 0, sizeof(*in_data));
3135 in_data->iocd_size = size;
3136 in_data->iocd_cb = cb;
3137 in_data->iocd_count = count;
3138 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3140 down_write(&llioc.ioc_sem);
3141 cfs_list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3142 up_write(&llioc.ioc_sem);
/* Unregister a dynamic ioctl handler by the magic cookie returned from
 * ll_iocontrol_register(): find it under the write semaphore, unlink, and
 * free; warn if the cookie matches no registered entry. */
3147 void ll_iocontrol_unregister(void *magic)
3149 struct llioc_data *tmp;
3154 down_write(&llioc.ioc_sem);
3155 cfs_list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* (match test against `magic` elided) Save size before freeing the entry. */
3157 unsigned int size = tmp->iocd_size;
3159 cfs_list_del(&tmp->iocd_list);
3160 up_write(&llioc.ioc_sem);
3162 OBD_FREE(tmp, size);
3166 up_write(&llioc.ioc_sem);
3168 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3171 EXPORT_SYMBOL(ll_iocontrol_register);
3172 EXPORT_SYMBOL(ll_iocontrol_unregister);
/* Dispatch an ioctl to the dynamic handlers: under the read semaphore, scan
 * each registered entry's cmd table and invoke its callback on a match;
 * stop iterating when a callback returns LLIOC_STOP.  The callback's rc is
 * propagated through *rcp (final assignment elided in this extract). */
3174 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
3175 unsigned int cmd, unsigned long arg, int *rcp)
3177 enum llioc_iter ret = LLIOC_CONT;
3178 struct llioc_data *data;
3179 int rc = -EINVAL, i;
3181 down_read(&llioc.ioc_sem);
3182 cfs_list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3183 for (i = 0; i < data->iocd_count; i++) {
3184 if (cmd != data->iocd_cmd[i])
3187 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3191 if (ret == LLIOC_STOP)
3194 up_read(&llioc.ioc_sem);
/* Push a layout configuration into the cl_object layer for this inode.
 * For OBJECT_CONF_SET the layout lock is only allowed to match AFTER the
 * layout has been applied, so other threads cannot observe a stale layout. */
3201 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3203 struct ll_inode_info *lli = ll_i2info(inode);
3204 struct cl_env_nest nest;
/* No cl_object attached yet -> nothing to configure (early return elided). */
3209 if (lli->lli_clob == NULL)
3212 env = cl_env_nested_get(&nest);
3214 RETURN(PTR_ERR(env));
3216 result = cl_conf_set(env, lli->lli_clob, conf);
3217 cl_env_nested_put(&nest, env);
3219 if (conf->coc_opc == OBJECT_CONF_SET) {
3220 struct ldlm_lock *lock = conf->coc_lock;
3222 LASSERT(lock != NULL);
3223 LASSERT(ldlm_has_layout(lock));
3225 /* it can only be allowed to match after layout is
3226 * applied to inode otherwise false layout would be
3227 * seen. Applying layout should happen before dropping
3228 * the intent lock. */
3229 ldlm_lock_allow_match(lock);
3235 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
3236 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3239 struct ll_sb_info *sbi = ll_i2sbi(inode);
3240 struct obd_capa *oc;
3241 struct ptlrpc_request *req;
3242 struct mdt_body *body;
/* LVB already populated (lock granted immediately) -> nothing to fetch. */
3249 if (lock->l_lvb_data != NULL)
3252 /* if layout lock was granted right away, the layout is returned
3253 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3254 * blocked and then granted via completion ast, we have to fetch
3255 * layout here. Please note that we can't use the LVB buffer in
3256 * completion AST because it doesn't have a large enough buffer */
3257 oc = ll_mdscapa_get(inode);
3258 rc = ll_get_max_mdsize(sbi, &lmmsize);
/* Pull the LOV EA (striping layout) directly from the MDT by FID. */
3260 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3261 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3267 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3268 if (body == NULL || body->eadatasize > lmmsize)
3269 GOTO(out, rc = -EPROTO);
3271 lmmsize = body->eadatasize;
3272 if (lmmsize == 0) /* empty layout */
3275 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3277 GOTO(out, rc = -EFAULT);
/* Copy the EA into a private buffer and install it as the lock's LVB,
 * unless another thread raced us and set l_lvb_data first. */
3279 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3280 if (lvbdata == NULL)
3281 GOTO(out, rc = -ENOMEM);
3283 memcpy(lvbdata, lmm, lmmsize);
3284 lock_res_and_lock(lock);
3285 if (lock->l_lvb_data == NULL) {
3286 lock->l_lvb_data = lvbdata;
3287 lock->l_lvb_len = lmmsize;
3290 unlock_res_and_lock(lock);
/* Lost the race: free our buffer, the winner's LVB stands. */
3292 if (lvbdata != NULL)
3293 OBD_FREE_LARGE(lvbdata, lmmsize);
3297 ptlrpc_req_finished(req);
3302 * Apply the layout to the inode. Layout lock is held and will be released
3305 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3306 struct inode *inode, __u32 *gen, bool reconf)
3308 struct ll_inode_info *lli = ll_i2info(inode);
3309 struct ll_sb_info *sbi = ll_i2sbi(inode);
3310 struct ldlm_lock *lock;
3311 struct lustre_md md = { NULL };
3312 struct cl_object_conf conf;
3315 bool wait_layout = false;
3318 LASSERT(lustre_handle_is_used(lockh));
3320 lock = ldlm_handle2lock(lockh);
3321 LASSERT(lock != NULL);
3322 LASSERT(ldlm_has_layout(lock));
3324 LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
3325 inode, PFID(&lli->lli_fid), reconf);
3327 /* in case this is a caching lock and reinstate with new inode */
3328 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3330 lock_res_and_lock(lock);
3331 lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
3332 unlock_res_and_lock(lock);
3333 /* checking lvb_ready is racy but this is okay. The worst case is
3334 * that multi processes may configure the file on the same time. */
3335 if (lvb_ready || !reconf) {
3338 /* layout_gen must be valid if layout lock is not
3339 * cancelled and stripe has already set */
3340 *gen = lli->lli_layout_gen;
/* LVB not ready and reconf requested: fetch the layout from the MDT
 * (populates lock->l_lvb_data if it was granted via completion AST). */
3346 rc = ll_layout_fetch(inode, lock);
3350 /* for layout lock, lmm is returned in lock's lvb.
3351 * lvb_data is immutable if the lock is held so it's safe to access it
3352 * without res lock. See the description in ldlm_lock_decref_internal()
3353 * for the condition to free lvb_data of layout lock */
3354 if (lock->l_lvb_data != NULL) {
/* Unpack the on-wire LOV EA into an lsm; its layout_gen becomes *gen. */
3355 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3356 lock->l_lvb_data, lock->l_lvb_len);
3358 *gen = LL_LAYOUT_GEN_EMPTY;
3360 *gen = md.lsm->lsm_layout_gen;
3363 CERROR("%s: file "DFID" unpackmd error: %d\n",
3364 ll_get_fsname(inode->i_sb, NULL, 0),
3365 PFID(&lli->lli_fid), rc);
3371 /* set layout to file. Unlikely this will fail as old layout was
3372 * surely eliminated */
3373 memset(&conf, 0, sizeof conf);
3374 conf.coc_opc = OBJECT_CONF_SET;
3375 conf.coc_inode = inode;
3376 conf.coc_lock = lock;
3377 conf.u.coc_md = &md;
3378 rc = ll_layout_conf(inode, &conf);
3381 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3383 /* refresh layout failed, need to wait */
3384 wait_layout = rc == -EBUSY;
/* Drop our reference and the lock itself before (possibly) waiting. */
3388 LDLM_LOCK_PUT(lock);
3389 ldlm_lock_decref(lockh, mode);
3391 /* wait for IO to complete if it's still being used. */
3393 CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
3394 ll_get_fsname(inode->i_sb, NULL, 0),
3395 inode, PFID(&lli->lli_fid));
/* OBJECT_CONF_WAIT blocks until in-flight IO under the old layout drains. */
3397 memset(&conf, 0, sizeof conf);
3398 conf.coc_opc = OBJECT_CONF_WAIT;
3399 conf.coc_inode = inode;
3400 rc = ll_layout_conf(inode, &conf);
3404 CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
3405 PFID(&lli->lli_fid), rc);
3411 * This function checks if there exists a LAYOUT lock on the client side,
3412 * or enqueues it if it doesn't have one in cache.
3414 * This function will not hold layout lock so it may be revoked any time after
3415 * this function returns. Any operations depend on layout should be redone
3418 * This function should be called before lov_io_init() to get an uptodate
3419 * layout version, the caller should save the version number and after IO
3420 * is finished, this function should be called again to verify that layout
3421 * is not changed during IO time.
3423 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3425 struct ll_inode_info *lli = ll_i2info(inode);
3426 struct ll_sb_info *sbi = ll_i2sbi(inode);
3427 struct md_op_data *op_data;
3428 struct lookup_intent it;
3429 struct lustre_handle lockh;
3431 struct ldlm_enqueue_info einfo = { .ei_type = LDLM_IBITS,
3433 .ei_cb_bl = ll_md_blocking_ast,
3434 .ei_cb_cp = ldlm_completion_ast,
3435 .ei_cbdata = NULL };
3439 *gen = lli->lli_layout_gen;
3440 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK))
3444 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3445 LASSERT(S_ISREG(inode->i_mode));
3447 /* mostly layout lock is caching on the local side, so try to match
3448 * it before grabbing layout lock mutex. */
3449 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
3450 if (mode != 0) { /* hit cached lock */
3451 rc = ll_layout_lock_set(&lockh, mode, inode, gen, false);
3455 /* better hold lli_layout_mutex to try again otherwise
3456 * it will have starvation problem. */
3459 /* take layout lock mutex to enqueue layout lock exclusively. */
3460 mutex_lock(&lli->lli_layout_mutex);
3463 /* try again. Maybe somebody else has done this. */
3464 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
3465 if (mode != 0) { /* hit cached lock */
3466 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3470 mutex_unlock(&lli->lli_layout_mutex);
3474 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3475 0, 0, LUSTRE_OPC_ANY, NULL);
3476 if (IS_ERR(op_data)) {
3477 mutex_unlock(&lli->lli_layout_mutex);
3478 RETURN(PTR_ERR(op_data));
3481 /* have to enqueue one */
3482 memset(&it, 0, sizeof(it));
3483 it.it_op = IT_LAYOUT;
3484 lockh.cookie = 0ULL;
3486 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
3487 ll_get_fsname(inode->i_sb, NULL, 0), inode,
3488 PFID(&lli->lli_fid));
3490 rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
3492 if (it.d.lustre.it_data != NULL)
3493 ptlrpc_req_finished(it.d.lustre.it_data);
3494 it.d.lustre.it_data = NULL;
3496 ll_finish_md_op_data(op_data);
3498 mode = it.d.lustre.it_lock_mode;
3499 it.d.lustre.it_lock_mode = 0;
3500 ll_intent_drop_lock(&it);
3503 /* set lock data in case this is a new lock */
3504 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3505 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3509 mutex_unlock(&lli->lli_layout_mutex);