4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2013, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include "llite_internal.h"
49 #include <lustre/ll_fiemap.h>
51 #include "cl_object.h"
/* Allocate a per-open-file ll_file_data from the ll_file_data_slab cache
 * (__GFP_IO allocation) and clear its write-failure flag.
 * NOTE(review): the excerpt elides lines here — the post-allocation NULL
 * check and the return statement are not visible; confirm against the
 * full file before editing. */
53 struct ll_file_data *ll_file_data_get(void)
55 struct ll_file_data *fd;
57 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, __GFP_IO);
61 fd->fd_write_failed = false;
/* Return an ll_file_data to its slab cache.  NOTE(review): an elided
 * line likely guards against fd == NULL — not visible in this excerpt. */
66 static void ll_file_data_put(struct ll_file_data *fd)
69 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/* Pack the inode's current state into @op_data for an MDS request:
 * FID, mode, a/m/ctime, size, block count, external flags, IO epoch,
 * the open handle @fh and the MDS capability.  If the inode is marked
 * LLIF_DATA_MODIFIED, also set the MDS_DATA_MODIFIED bias so the MDS
 * knows data changed under this open. */
72 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
73 struct lustre_handle *fh)
75 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
76 op_data->op_attr.ia_mode = inode->i_mode;
77 op_data->op_attr.ia_atime = inode->i_atime;
78 op_data->op_attr.ia_mtime = inode->i_mtime;
79 op_data->op_attr.ia_ctime = inode->i_ctime;
80 op_data->op_attr.ia_size = i_size_read(inode);
81 op_data->op_attr_blocks = inode->i_blocks;
/* ia_attr_flags lives in the ll_iattr wrapper, hence the cast. */
82 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
83 ll_inode_to_ext_flags(inode->i_flags);
84 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
86 op_data->op_handle = *fh;
87 op_data->op_capa1 = ll_mdscapa_get(inode);
89 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
90 op_data->op_bias |= MDS_DATA_MODIFIED;
/* Prepare an MDS close: mark which attributes are valid, include
 * size/blocks only when Size-on-MDS does not apply (no SOM support or
 * not a regular file), close the IO epoch, then pack the inode state.
 * NOTE(review): several lines are elided here (e.g. the body of the
 * !FMODE_WRITE branch); verify control flow against the full file. */
94 * Closes the IO epoch and packs all the attributes into @op_data for
97 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
98 struct obd_client_handle *och)
102 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
103 ATTR_MTIME | ATTR_MTIME_SET |
104 ATTR_CTIME | ATTR_CTIME_SET;
106 if (!(och->och_flags & FMODE_WRITE))
109 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
110 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
112 ll_ioepoch_close(inode, op_data, &och, 0);
115 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
116 ll_prep_md_op_data(op_data, inode, NULL, NULL,
117 0, 0, LUSTRE_OPC_ANY, NULL);
/* Send the MDS close RPC for an open handle: prepare the close op_data,
 * call md_close(), perform the Size-on-MDS update when the MDS asks for
 * it, clear the data-modified flag on success, destroy OST objects named
 * in the close reply, and tear down the replay data for @och.
 * NOTE(review): many lines are elided in this excerpt (goto labels,
 * error-branch conditions, final RETURN); treat the visible flow as a
 * sketch and confirm against the full file before changing anything. */
121 static int ll_close_inode_openhandle(struct obd_export *md_exp,
123 struct obd_client_handle *och)
125 struct obd_export *exp = ll_i2mdexp(inode);
126 struct md_op_data *op_data;
127 struct ptlrpc_request *req = NULL;
128 struct obd_device *obd = class_exp2obd(exp);
135 * XXX: in case of LMV, is this correct to access
138 CERROR("Invalid MDC connection handle "LPX64"\n",
139 ll_i2mdexp(inode)->exp_handle.h_cookie);
143 OBD_ALLOC_PTR(op_data);
145 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
147 ll_prepare_close(inode, op_data, och);
148 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
149 rc = md_close(md_exp, op_data, och->och_mod, &req);
151 /* This close must have the epoch closed. */
152 LASSERT(epoch_close);
153 /* MDS has instructed us to obtain Size-on-MDS attribute from
154 * OSTs and send setattr to back to MDS. */
155 rc = ll_som_update(inode, op_data);
157 CERROR("inode %lu mdc Size-on-MDS update failed: "
158 "rc = %d\n", inode->i_ino, rc);
162 CERROR("inode %lu mdc close failed: rc = %d\n",
166 /* DATA_MODIFIED flag was successfully sent on close, cancel data
167 * modification flag. */
168 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
169 struct ll_inode_info *lli = ll_i2info(inode);
171 spin_lock(&lli->lli_lock);
172 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
173 spin_unlock(&lli->lli_lock);
176 ll_finish_md_op_data(op_data);
/* Destroy OST objects the MDS told us about in the close reply. */
179 rc = ll_objects_destroy(req, inode);
181 CERROR("inode %lu ll_objects destroy: rc = %d\n",
/* SOM + open epoch not closed on a written regular file: defer the
 * DONE_WRITING work to the dedicated queue. */
188 if (exp_connect_som(exp) && !epoch_close &&
189 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
190 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
192 md_clear_open_replay_data(md_exp, och);
193 /* Free @och if it is not waiting for DONE_WRITING. */
194 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
197 if (req) /* This is close request */
198 ptlrpc_req_finished(req);
/* Really close the MDS open handle of the kind selected by @flags
 * (write / exec / read).  If other users of the handle remain (usecount
 * non-zero) just drop out; otherwise detach the handle under
 * lli_och_mutex and close it via ll_close_inode_openhandle().
 * NOTE(review): the lines that decrement/test the usecount and detach
 * *och_p are elided in this excerpt. */
202 int ll_md_real_close(struct inode *inode, int flags)
204 struct ll_inode_info *lli = ll_i2info(inode);
205 struct obd_client_handle **och_p;
206 struct obd_client_handle *och;
/* Select which of the three per-inode open handles @flags refers to. */
211 if (flags & FMODE_WRITE) {
212 och_p = &lli->lli_mds_write_och;
213 och_usecount = &lli->lli_open_fd_write_count;
214 } else if (flags & FMODE_EXEC) {
215 och_p = &lli->lli_mds_exec_och;
216 och_usecount = &lli->lli_open_fd_exec_count;
218 LASSERT(flags & FMODE_READ);
219 och_p = &lli->lli_mds_read_och;
220 och_usecount = &lli->lli_open_fd_read_count;
223 mutex_lock(&lli->lli_och_mutex);
224 if (*och_usecount) { /* There are still users of this handle, so
226 mutex_unlock(&lli->lli_och_mutex);
231 mutex_unlock(&lli->lli_och_mutex);
233 if (och) { /* There might be a race and somebody have freed this och
235 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/* Per-struct-file close: drop any group lock, decrement the matching
 * open-handle usecount, and — unless a cached OPEN DLM lock lets us skip
 * the RPC — do the real MDS close.  Finally detach and free the
 * ll_file_data and close the capability.
 * NOTE(review): elided lines include the lockmode selection and the
 * md_lock_match() success path; confirm against the full file. */
242 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
245 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
246 struct ll_inode_info *lli = ll_i2info(inode);
250 /* clear group lock, if present */
251 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
252 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
254 /* Let's see if we have good enough OPEN lock on the file and if
255 we can skip talking to MDS */
256 if (file->f_dentry->d_inode) { /* Can this ever be false? */
258 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
259 struct lustre_handle lockh;
260 struct inode *inode = file->f_dentry->d_inode;
261 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
263 mutex_lock(&lli->lli_och_mutex);
264 if (fd->fd_omode & FMODE_WRITE) {
266 LASSERT(lli->lli_open_fd_write_count);
267 lli->lli_open_fd_write_count--;
268 } else if (fd->fd_omode & FMODE_EXEC) {
270 LASSERT(lli->lli_open_fd_exec_count);
271 lli->lli_open_fd_exec_count--;
274 LASSERT(lli->lli_open_fd_read_count);
275 lli->lli_open_fd_read_count--;
277 mutex_unlock(&lli->lli_och_mutex);
/* No matching cached OPEN ibits lock -> must do the real close RPC. */
279 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
280 LDLM_IBITS, &policy, lockmode,
282 rc = ll_md_real_close(file->f_dentry->d_inode,
286 CERROR("Releasing a file %p with negative dentry %p. Name %s",
287 file, file->f_dentry, file->f_dentry->d_name.name);
290 LUSTRE_FPRIVATE(file) = NULL;
291 ll_file_data_put(fd);
292 ll_capa_close(inode);
297 /* While this returns an error code, fput() the caller does not, so we need
298 * to make every effort to clean up all of our state here. Also, applications
299 * rarely check close errors and even if an error is returned they will not
300 * re-try the close call.
/* VFS ->release() for Lustre files: clean up remote-ACL state on the
 * root inode, stop a statahead thread this fd started, short-circuit
 * for the root dentry, propagate async write errors for regular files,
 * then do the MDS close via ll_md_close(). */
302 int ll_file_release(struct inode *inode, struct file *file)
304 struct ll_file_data *fd;
305 struct ll_sb_info *sbi = ll_i2sbi(inode);
306 struct ll_inode_info *lli = ll_i2info(inode);
310 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
311 inode->i_generation, inode);
313 #ifdef CONFIG_FS_POSIX_ACL
/* Remote-client ACL bookkeeping applies only on the root inode. */
314 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
315 inode == inode->i_sb->s_root->d_inode) {
316 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
319 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
320 fd->fd_flags &= ~LL_FILE_RMTACL;
321 rct_del(&sbi->ll_rct, cfs_curproc_pid());
322 et_search_free(&sbi->ll_et, cfs_curproc_pid());
327 if (inode->i_sb->s_root != file->f_dentry)
328 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
329 fd = LUSTRE_FPRIVATE(file);
332 /* The last ref on @file, maybe not the the owner pid of statahead.
333 * Different processes can open the same dir, "ll_opendir_key" means:
334 * it is me that should stop the statahead thread. */
335 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
336 lli->lli_opendir_pid != 0)
337 ll_stop_statahead(inode, lli->lli_opendir_key);
/* Root dentry: no MDS close is needed, just drop the fd. */
339 if (inode->i_sb->s_root == file->f_dentry) {
340 LUSTRE_FPRIVATE(file) = NULL;
341 ll_file_data_put(fd);
/* Surface any async write error recorded on the cl_object. */
345 if (!S_ISDIR(inode->i_mode)) {
346 lov_read_and_clear_async_rc(lli->lli_clob);
347 lli->lli_async_rc = 0;
350 rc = ll_md_close(sbi->ll_md_exp, inode, file);
352 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
353 libcfs_debug_dumplog();
/* Issue an IT_OPEN intent lock to the MDS for @file (used by NFS-style
 * opens and by setstripe).  Requests an OPEN lock unless stripe info is
 * being set (@lmm/@lmmsize non-zero), opens by FID, and on success
 * instantiates the inode from the reply and caches the lock.
 * NOTE(review): several error-path lines (ESTALE handling, out label)
 * are elided in this excerpt. */
358 static int ll_intent_file_open(struct file *file, void *lmm,
359 int lmmsize, struct lookup_intent *itp)
361 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
362 struct dentry *parent = file->f_dentry->d_parent;
363 const char *name = file->f_dentry->d_name.name;
364 const int len = file->f_dentry->d_name.len;
365 struct md_op_data *op_data;
366 struct ptlrpc_request *req;
367 __u32 opc = LUSTRE_OPC_ANY;
374 /* Usually we come here only for NFSD, and we want open lock.
375 But we can also get here with pre 2.6.15 patchless kernels, and in
376 that case that lock is also ok */
377 /* We can also get here if there was cached open handle in revalidate_it
378 * but it disappeared while we were getting from there to ll_file_open.
379 * But this means this file was closed and immediatelly opened which
380 * makes a good candidate for using OPEN lock */
381 /* If lmmsize & lmm are not 0, we are just setting stripe info
382 * parameters. No need for the open lock */
383 if (lmm == NULL && lmmsize == 0) {
384 itp->it_flags |= MDS_OPEN_LOCK;
385 if (itp->it_flags & FMODE_WRITE)
386 opc = LUSTRE_OPC_CREATE;
389 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
390 file->f_dentry->d_inode, name, len,
393 RETURN(PTR_ERR(op_data));
395 itp->it_flags |= MDS_OPEN_BY_FID;
396 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
397 0 /*unused */, &req, ll_md_blocking_ast, 0);
398 ll_finish_md_op_data(op_data);
400 /* reason for keep own exit path - don`t flood log
401 * with messages with -ESTALE errors.
403 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
404 it_open_error(DISP_OPEN_OPEN, itp))
406 ll_release_openhandle(file->f_dentry, itp);
410 if (it_disposition(itp, DISP_LOOKUP_NEG))
411 GOTO(out, rc = -ENOENT);
413 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
414 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
415 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* Instantiate/update the inode from the intent reply, then cache
 * the granted lock handle on the inode. */
419 rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
420 if (!rc && itp->d.lustre.it_lock_mode)
421 ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
425 ptlrpc_req_finished(itp->d.lustre.it_data);
426 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
427 ll_intent_drop_lock(itp);
433 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
434 * not believe attributes if a few ioepoch holders exist. Attributes for
435 * previous ioepoch if new one is opened are also skipped by MDS.
437 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
/* Only record a non-zero epoch that differs from the current one. */
439 if (ioepoch && lli->lli_ioepoch != ioepoch) {
440 lli->lli_ioepoch = ioepoch;
441 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
442 ioepoch, PFID(&lli->lli_fid));
/* Fill @och from the server's open reply: copy the MDS file handle,
 * stamp the magic/fid/flags, record the IO epoch on the inode, and
 * register the request for open replay.  Returns the result of
 * md_set_open_replay_data(). */
446 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
447 struct lookup_intent *it, struct obd_client_handle *och)
449 struct ptlrpc_request *req = it->d.lustre.it_data;
450 struct mdt_body *body;
454 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
455 LASSERT(body != NULL); /* reply already checked out */
457 memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
458 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
459 och->och_fid = lli->lli_fid;
460 och->och_flags = it->it_flags;
461 ll_ioepoch_open(lli, body->ioepoch);
463 return md_set_open_replay_data(md_exp, och, req);
/* Finish the client-side part of an open: if @och is supplied, fill it
 * from the intent reply; then attach @fd as the file's private data,
 * init readahead state and record the open mode.
 * NOTE(review): the condition guarding the och-fill branch and the
 * error check after ll_och_fill() are elided in this excerpt. */
466 int ll_local_open(struct file *file, struct lookup_intent *it,
467 struct ll_file_data *fd, struct obd_client_handle *och)
469 struct inode *inode = file->f_dentry->d_inode;
470 struct ll_inode_info *lli = ll_i2info(inode);
473 LASSERT(!LUSTRE_FPRIVATE(file));
478 struct ptlrpc_request *req = it->d.lustre.it_data;
479 struct mdt_body *body;
482 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
486 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
487 if ((it->it_flags & FMODE_WRITE) &&
488 (body->valid & OBD_MD_FLSIZE))
489 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
490 lli->lli_ioepoch, PFID(&lli->lli_fid));
493 LUSTRE_FPRIVATE(file) = fd;
494 ll_readahead_init(inode, &fd->fd_ras);
495 fd->fd_omode = it->it_flags;
499 /* Open a file, and (for the very first open) create objects on the OSTs at
500 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
501 * creation or open until ll_lov_setstripe() ioctl is called.
503 * If we already have the stripe MD locally then we don't request it in
504 * md_open(), by passing a lmm_size = 0.
506 * It is up to the application to ensure no other processes open this file
507 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
508 * used. We might be able to avoid races of that sort by getting lli_open_sem
509 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
510 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/* VFS ->open() entry point.  Reuses a cached MDS open handle when one
 * exists for the requested mode; otherwise issues an intent open via
 * ll_intent_file_open() (done outside lli_och_mutex to avoid deadlock
 * with the blocking AST).  Also arms statahead for directory opens.
 * NOTE(review): this excerpt elides many lines (retry/restart path,
 * och handle publication, out/out_och_free/out_openerr label bodies);
 * confirm control flow against the full file before modifying. */
512 int ll_file_open(struct inode *inode, struct file *file)
514 struct ll_inode_info *lli = ll_i2info(inode);
515 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
516 .it_flags = file->f_flags };
517 struct obd_client_handle **och_p = NULL;
518 __u64 *och_usecount = NULL;
519 struct ll_file_data *fd;
520 int rc = 0, opendir_set = 0;
523 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
524 inode->i_generation, inode, file->f_flags);
526 it = file->private_data; /* XXX: compat macro */
527 file->private_data = NULL; /* prevent ll_local_open assertion */
529 fd = ll_file_data_get();
531 GOTO(out_openerr, rc = -ENOMEM);
/* First opener of a directory claims the statahead key. */
534 if (S_ISDIR(inode->i_mode)) {
535 spin_lock(&lli->lli_sa_lock);
536 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
537 lli->lli_opendir_pid == 0) {
538 lli->lli_opendir_key = fd;
539 lli->lli_opendir_pid = cfs_curproc_pid();
542 spin_unlock(&lli->lli_sa_lock);
/* Root dentry needs no MDS open. */
545 if (inode->i_sb->s_root == file->f_dentry) {
546 LUSTRE_FPRIVATE(file) = fd;
/* No intent from lookup: build our own IT_OPEN intent from f_flags. */
550 if (!it || !it->d.lustre.it_disposition) {
551 /* Convert f_flags into access mode. We cannot use file->f_mode,
552 * because everything but O_ACCMODE mask was stripped from
554 if ((oit.it_flags + 1) & O_ACCMODE)
556 if (file->f_flags & O_TRUNC)
557 oit.it_flags |= FMODE_WRITE;
559 /* kernel only call f_op->open in dentry_open. filp_open calls
560 * dentry_open after call to open_namei that checks permissions.
561 * Only nfsd_open call dentry_open directly without checking
562 * permissions and because of that this code below is safe. */
563 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
564 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
566 /* We do not want O_EXCL here, presumably we opened the file
567 * already? XXX - NFS implications? */
568 oit.it_flags &= ~O_EXCL;
570 /* bug20584, if "it_flags" contains O_CREAT, the file will be
571 * created if necessary, then "IT_CREAT" should be set to keep
572 * consistent with it */
573 if (oit.it_flags & O_CREAT)
574 oit.it_op |= IT_CREAT;
580 /* Let's see if we have file open on MDS already. */
581 if (it->it_flags & FMODE_WRITE) {
582 och_p = &lli->lli_mds_write_och;
583 och_usecount = &lli->lli_open_fd_write_count;
584 } else if (it->it_flags & FMODE_EXEC) {
585 och_p = &lli->lli_mds_exec_och;
586 och_usecount = &lli->lli_open_fd_exec_count;
588 och_p = &lli->lli_mds_read_och;
589 och_usecount = &lli->lli_open_fd_read_count;
592 mutex_lock(&lli->lli_och_mutex);
593 if (*och_p) { /* Open handle is present */
594 if (it_disposition(it, DISP_OPEN_OPEN)) {
595 /* Well, there's extra open request that we do not need,
596 let's close it somehow. This will decref request. */
597 rc = it_open_error(DISP_OPEN_OPEN, it);
599 mutex_unlock(&lli->lli_och_mutex);
600 GOTO(out_openerr, rc);
603 ll_release_openhandle(file->f_dentry, it);
/* Reuse the cached handle for this local open. */
607 rc = ll_local_open(file, it, fd, NULL);
610 mutex_unlock(&lli->lli_och_mutex);
611 GOTO(out_openerr, rc);
614 LASSERT(*och_usecount == 0);
615 if (!it->d.lustre.it_disposition) {
616 /* We cannot just request lock handle now, new ELC code
617 means that one of other OPEN locks for this file
618 could be cancelled, and since blocking ast handler
619 would attempt to grab och_mutex as well, that would
620 result in a deadlock */
621 mutex_unlock(&lli->lli_och_mutex);
622 it->it_create_mode |= M_CHECK_STALE;
623 rc = ll_intent_file_open(file, NULL, 0, it);
624 it->it_create_mode &= ~M_CHECK_STALE;
626 GOTO(out_openerr, rc);
630 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
632 GOTO(out_och_free, rc = -ENOMEM);
636 /* md_intent_lock() didn't get a request ref if there was an
637 * open error, so don't do cleanup on the request here
639 /* XXX (green): Should not we bail out on any error here, not
640 * just open error? */
641 rc = it_open_error(DISP_OPEN_OPEN, it);
643 GOTO(out_och_free, rc);
645 LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
647 rc = ll_local_open(file, it, fd, *och_p);
649 GOTO(out_och_free, rc);
651 mutex_unlock(&lli->lli_och_mutex);
654 /* Must do this outside lli_och_mutex lock to prevent deadlock where
655 different kind of OPEN lock for this same inode gets cancelled
656 by ldlm_cancel_lru */
657 if (!S_ISREG(inode->i_mode))
658 GOTO(out_och_free, rc);
/* No stripe metadata yet: creation may have been delayed by
 * O_LOV_DELAY_CREATE or a read-only open. */
662 if (!lli->lli_has_smd) {
663 if (file->f_flags & O_LOV_DELAY_CREATE ||
664 !(file->f_mode & FMODE_WRITE)) {
665 CDEBUG(D_INODE, "object creation was delayed\n");
666 GOTO(out_och_free, rc);
669 file->f_flags &= ~O_LOV_DELAY_CREATE;
670 GOTO(out_och_free, rc);
/* Error cleanup: free a handle we allocated but never published. */
674 if (och_p && *och_p) {
675 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
676 *och_p = NULL; /* OBD_FREE writes some magic there */
679 mutex_unlock(&lli->lli_och_mutex);
682 if (opendir_set != 0)
683 ll_stop_statahead(inode, lli->lli_opendir_key);
685 ll_file_data_put(fd);
687 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
690 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
691 ptlrpc_req_finished(it->d.lustre.it_data);
692 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
698 /* Fills the obdo with the attributes for the lsm */
/* Perform an async OST getattr for @lsm and wait for completion; on
 * @sync, request a server-side lock (OBD_FL_SRVLOCK).  On success the
 * valid mask is narrowed to the attribute bits callers may trust.
 * NOTE(review): the conditions guarding the sync branch and the
 * post-wait success test are elided in this excerpt. */
699 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
700 struct obd_capa *capa, struct obdo *obdo,
701 __u64 ioepoch, int sync)
703 struct ptlrpc_request_set *set;
704 struct obd_info oinfo = { { { 0 } } };
709 LASSERT(lsm != NULL);
713 oinfo.oi_oa->o_oi = lsm->lsm_oi;
714 oinfo.oi_oa->o_mode = S_IFREG;
715 oinfo.oi_oa->o_ioepoch = ioepoch;
716 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
717 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
718 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
719 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
720 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
721 OBD_MD_FLDATAVERSION;
722 oinfo.oi_capa = capa;
724 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
725 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
728 set = ptlrpc_prep_set();
730 CERROR("can't allocate ptlrpc set\n");
733 rc = obd_getattr_async(exp, &oinfo, set);
735 rc = ptlrpc_set_wait(set);
736 ptlrpc_set_destroy(set);
/* Only these bits are meaningful to callers after the getattr. */
739 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
740 OBD_MD_FLATIME | OBD_MD_FLMTIME |
741 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
742 OBD_MD_FLDATAVERSION);
747 * Performs the getattr on the inode and updates its fields.
748 * If @sync != 0, perform the getattr under the server-side lock.
750 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
751 __u64 ioepoch, int sync)
753 struct obd_capa *capa = ll_mdscapa_get(inode);
754 struct lov_stripe_md *lsm;
758 lsm = ccc_inode_lsm_get(inode);
759 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
760 capa, obdo, ioepoch, sync);
/* On success, refresh the VFS inode from the returned obdo.
 * NOTE(review): the success test and capa_put are elided here. */
763 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
765 obdo_refresh_inode(inode, obdo, obdo->o_valid);
766 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
767 " blksize %lu\n", POSTID(oi), i_size_read(inode),
768 (unsigned long long)inode->i_blocks,
769 (unsigned long)ll_inode_blksize(inode));
771 ccc_inode_lsm_put(inode, lsm);
/* Merge inode attributes: under the inode size lock, seed the inode
 * times from the MDS-provided lvb, fetch cl_object attributes from the
 * OSTs, take the newest of each timestamp, and write back size/blocks
 * and the merged times.  NOTE(review): the success test after
 * cl_object_attr_get() is elided in this excerpt. */
775 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
777 struct ll_inode_info *lli = ll_i2info(inode);
778 struct cl_object *obj = lli->lli_clob;
779 struct cl_attr *attr = ccc_env_thread_attr(env);
785 ll_inode_size_lock(inode);
786 /* merge timestamps the most recently obtained from mds with
787 timestamps obtained from osts */
788 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
789 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
790 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
791 inode_init_lvb(inode, &lvb);
793 cl_object_attr_lock(obj);
794 rc = cl_object_attr_get(env, obj, attr);
795 cl_object_attr_unlock(obj);
/* Keep the most recent timestamp from either source. */
798 if (lvb.lvb_atime < attr->cat_atime)
799 lvb.lvb_atime = attr->cat_atime;
800 if (lvb.lvb_ctime < attr->cat_ctime)
801 lvb.lvb_ctime = attr->cat_ctime;
802 if (lvb.lvb_mtime < attr->cat_mtime)
803 lvb.lvb_mtime = attr->cat_mtime;
805 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
806 PFID(&lli->lli_fid), attr->cat_size);
807 cl_isize_write_nolock(inode, attr->cat_size);
809 inode->i_blocks = attr->cat_blocks;
811 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
812 LTIME_S(inode->i_atime) = lvb.lvb_atime;
813 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
815 ll_inode_size_unlock(inode);
/* Glimpse ioctl helper: getattr the stripes of @lsm without a capa or
 * server lock, then copy size/blocks/times into the caller's stat
 * structure.  NOTE(review): the rc check before the st_* assignments
 * is elided in this excerpt. */
820 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
823 struct obdo obdo = { 0 };
826 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
828 st->st_size = obdo.o_size;
829 st->st_blocks = obdo.o_blocks;
830 st->st_mtime = obdo.o_mtime;
831 st->st_atime = obdo.o_atime;
832 st->st_ctime = obdo.o_ctime;
/* Initialize a cl_io for file IO: carry over nonblock/append/sync
 * behavior from f_flags, attach the cl_object, and pick the lock
 * requirement — never for nolock files (also sets no_srvlock),
 * mandatory for O_APPEND, otherwise "maybe". */
837 void ll_io_init(struct cl_io *io, const struct file *file, int write)
839 struct inode *inode = file->f_dentry->d_inode;
841 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
843 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
844 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
845 file->f_flags & O_DIRECT ||
848 io->ci_obj = ll_i2info(inode)->lli_clob;
849 io->ci_lockreq = CILR_MAYBE;
850 if (ll_file_nolock(file)) {
851 io->ci_lockreq = CILR_NEVER;
852 io->ci_no_srvlock = 1;
853 } else if (file->f_flags & O_APPEND) {
854 io->ci_lockreq = CILR_MANDATORY;
/* Generic read/write engine shared by all IO entry points (normal,
 * sendfile, splice).  Sets up the cl_io, dispatches on the IO subtype
 * to copy subtype-specific arguments, serializes writes against
 * truncate via lli_write_mutex / lli_trunc_sem, runs cl_io_loop(), and
 * accounts stats and fd_write_failed.
 * NOTE(review): this excerpt elides the switch case labels, the cl_io
 * fini/restart loop and the out label; confirm against the full file. */
859 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
860 struct file *file, enum cl_io_type iot,
861 loff_t *ppos, size_t count)
863 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
864 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
870 io = ccc_env_thread_io(env);
871 ll_io_init(io, file, iot == CIT_WRITE);
873 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
874 struct vvp_io *vio = vvp_env_io(env);
875 struct ccc_io *cio = ccc_env_io(env);
876 int write_mutex_locked = 0;
878 cio->cui_fd = LUSTRE_FPRIVATE(file);
879 vio->cui_io_subtype = args->via_io_subtype;
881 switch (vio->cui_io_subtype) {
883 cio->cui_iov = args->u.normal.via_iov;
884 cio->cui_nrsegs = args->u.normal.via_nrsegs;
885 cio->cui_tot_nrsegs = cio->cui_nrsegs;
886 #ifndef HAVE_FILE_WRITEV
887 cio->cui_iocb = args->u.normal.via_iocb;
/* Non-group-locked writes serialize on lli_write_mutex; reads
 * only take the truncate semaphore shared. */
889 if ((iot == CIT_WRITE) &&
890 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
891 if (mutex_lock_interruptible(&lli->
893 GOTO(out, result = -ERESTARTSYS);
894 write_mutex_locked = 1;
895 } else if (iot == CIT_READ) {
896 down_read(&lli->lli_trunc_sem);
900 vio->u.sendfile.cui_actor = args->u.sendfile.via_actor;
901 vio->u.sendfile.cui_target = args->u.sendfile.via_target;
904 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
905 vio->u.splice.cui_flags = args->u.splice.via_flags;
908 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
911 result = cl_io_loop(env, io);
912 if (write_mutex_locked)
913 mutex_unlock(&lli->lli_write_mutex);
914 else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
915 up_read(&lli->lli_trunc_sem);
917 /* cl_io_rw_init() handled IO */
918 result = io->ci_result;
921 if (io->ci_nob > 0) {
923 *ppos = io->u.ci_wr.wr.crw_pos;
928 /* If any bit been read/written (result != 0), we just return
929 * short read/write instead of restart io. */
930 if (result == 0 && io->ci_need_restart) {
931 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
932 iot == CIT_READ ? "read" : "write",
933 file->f_dentry->d_name.name, *ppos, count);
934 LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
938 if (iot == CIT_READ) {
940 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
941 LPROC_LL_READ_BYTES, result);
942 } else if (iot == CIT_WRITE) {
944 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
945 LPROC_LL_WRITE_BYTES, result);
946 fd->fd_write_failed = false;
947 } else if (result != -ERESTARTSYS) {
948 fd->fd_write_failed = true;
957 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/* Validate an iovec array and compute the total byte count: rejects
 * negative/wrapping lengths and truncates @nr_segs at the first segment
 * that fails access_ok().  NOTE(review): the accumulation of *count and
 * the EFAULT/return lines are elided in this excerpt. */
959 static int ll_file_get_iov_count(const struct iovec *iov,
960 unsigned long *nr_segs, size_t *count)
965 for (seg = 0; seg < *nr_segs; seg++) {
966 const struct iovec *iv = &iov[seg];
969 * If any segment has a negative length, or the cumulative
970 * length ever wraps negative then return -EINVAL.
973 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
975 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
980 cnt -= iv->iov_len; /* This segment is no good */
987 #ifdef HAVE_FILE_READV
/* Vectored read entry point (pre-AIO kernels): validate the iovec,
 * grab a cl environment, fill IO_NORMAL args and run the generic IO
 * path for CIT_READ. */
988 static ssize_t ll_file_readv(struct file *file, const struct iovec *iov,
989 unsigned long nr_segs, loff_t *ppos)
992 struct vvp_io_args *args;
998 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1002 env = cl_env_get(&refcheck);
1004 RETURN(PTR_ERR(env));
1006 args = vvp_env_args(env, IO_NORMAL);
1007 args->u.normal.via_iov = (struct iovec *)iov;
1008 args->u.normal.via_nrsegs = nr_segs;
1010 result = ll_file_io_generic(env, args, file, CIT_READ, ppos, count);
1011 cl_env_put(env, &refcheck);
/* Plain read(2) entry (readv-based kernels): wrap the user buffer in a
 * single-element iovec held in per-env scratch space and delegate to
 * ll_file_readv(). */
1015 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1019 struct iovec *local_iov;
1024 env = cl_env_get(&refcheck);
1026 RETURN(PTR_ERR(env));
1028 local_iov = &vvp_env_info(env)->vti_local_iov;
1029 local_iov->iov_base = (void __user *)buf;
1030 local_iov->iov_len = count;
1031 result = ll_file_readv(file, local_iov, 1, ppos);
1032 cl_env_put(env, &refcheck);
/* AIO read entry point: validate the iovec, fill IO_NORMAL args
 * (including the kiocb) and run the generic IO path for CIT_READ,
 * advancing iocb->ki_pos. */
1037 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1038 unsigned long nr_segs, loff_t pos)
1041 struct vvp_io_args *args;
1047 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1051 env = cl_env_get(&refcheck);
1053 RETURN(PTR_ERR(env));
1055 args = vvp_env_args(env, IO_NORMAL);
1056 args->u.normal.via_iov = (struct iovec *)iov;
1057 args->u.normal.via_nrsegs = nr_segs;
1058 args->u.normal.via_iocb = iocb;
1060 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1061 &iocb->ki_pos, count);
1062 cl_env_put(env, &refcheck);
/* Plain read(2) entry (AIO-based kernels): build a synchronous kiocb
 * plus a single-element iovec in per-env scratch space, delegate to
 * ll_file_aio_read(), then propagate the updated position to *ppos. */
1066 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1070 struct iovec *local_iov;
1071 struct kiocb *kiocb;
1076 env = cl_env_get(&refcheck);
1078 RETURN(PTR_ERR(env));
1080 local_iov = &vvp_env_info(env)->vti_local_iov;
1081 kiocb = &vvp_env_info(env)->vti_kiocb;
1082 local_iov->iov_base = (void __user *)buf;
1083 local_iov->iov_len = count;
1084 init_sync_kiocb(kiocb, file);
1085 kiocb->ki_pos = *ppos;
1086 kiocb->ki_left = count;
1088 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1089 *ppos = kiocb->ki_pos;
1091 cl_env_put(env, &refcheck);
1097 * Write to a file (through the page cache).
1099 #ifdef HAVE_FILE_WRITEV
/* Vectored write entry point (pre-AIO kernels): validate the iovec,
 * fill IO_NORMAL args and run the generic IO path for CIT_WRITE. */
1100 static ssize_t ll_file_writev(struct file *file, const struct iovec *iov,
1101 unsigned long nr_segs, loff_t *ppos)
1104 struct vvp_io_args *args;
1110 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1114 env = cl_env_get(&refcheck);
1116 RETURN(PTR_ERR(env));
1118 args = vvp_env_args(env, IO_NORMAL);
1119 args->u.normal.via_iov = (struct iovec *)iov;
1120 args->u.normal.via_nrsegs = nr_segs;
1122 result = ll_file_io_generic(env, args, file, CIT_WRITE, ppos, count);
1123 cl_env_put(env, &refcheck);
/* Plain write(2) entry (writev-based kernels): wrap the user buffer in
 * a single-element iovec from per-env scratch space and delegate to
 * ll_file_writev(). */
1127 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1131 struct iovec *local_iov;
1136 env = cl_env_get(&refcheck);
1138 RETURN(PTR_ERR(env));
1140 local_iov = &vvp_env_info(env)->vti_local_iov;
1141 local_iov->iov_base = (void __user *)buf;
1142 local_iov->iov_len = count;
1144 result = ll_file_writev(file, local_iov, 1, ppos);
1145 cl_env_put(env, &refcheck);
1149 #else /* AIO stuff */
/* AIO write entry point: validate the iovec, fill IO_NORMAL args
 * (including the kiocb) and run the generic IO path for CIT_WRITE,
 * advancing iocb->ki_pos. */
1150 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1151 unsigned long nr_segs, loff_t pos)
1154 struct vvp_io_args *args;
1160 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1164 env = cl_env_get(&refcheck);
1166 RETURN(PTR_ERR(env));
1168 args = vvp_env_args(env, IO_NORMAL);
1169 args->u.normal.via_iov = (struct iovec *)iov;
1170 args->u.normal.via_nrsegs = nr_segs;
1171 args->u.normal.via_iocb = iocb;
1173 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1174 &iocb->ki_pos, count);
1175 cl_env_put(env, &refcheck);
/* Plain write(2) entry (AIO-based kernels): build a synchronous kiocb
 * plus a single-element iovec in per-env scratch space, delegate to
 * ll_file_aio_write(), then propagate the updated position to *ppos. */
1179 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1183 struct iovec *local_iov;
1184 struct kiocb *kiocb;
1189 env = cl_env_get(&refcheck);
1191 RETURN(PTR_ERR(env));
1193 local_iov = &vvp_env_info(env)->vti_local_iov;
1194 kiocb = &vvp_env_info(env)->vti_kiocb;
1195 local_iov->iov_base = (void __user *)buf;
1196 local_iov->iov_len = count;
1197 init_sync_kiocb(kiocb, file);
1198 kiocb->ki_pos = *ppos;
1199 kiocb->ki_left = count;
1201 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1202 *ppos = kiocb->ki_pos;
1204 cl_env_put(env, &refcheck);
1210 #ifdef HAVE_KERNEL_SENDFILE
1212 * Send file content (through pagecache) somewhere with helper
/* sendfile entry point: fill IO_SENDFILE args (actor + target) and run
 * the generic IO path as a CIT_READ. */
1214 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1215 read_actor_t actor, void *target)
1218 struct vvp_io_args *args;
1223 env = cl_env_get(&refcheck);
1225 RETURN(PTR_ERR(env));
1227 args = vvp_env_args(env, IO_SENDFILE);
1228 args->u.sendfile.via_target = target;
1229 args->u.sendfile.via_actor = actor;
1231 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1232 cl_env_put(env, &refcheck);
1237 #ifdef HAVE_KERNEL_SPLICE_READ
1239 * Send file content (through pagecache) somewhere with helper
/* splice_read entry point: fill IO_SPLICE args (pipe + flags) and run
 * the generic IO path as a CIT_READ. */
1241 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1242 struct pipe_inode_info *pipe, size_t count,
1246 struct vvp_io_args *args;
1251 env = cl_env_get(&refcheck);
1253 RETURN(PTR_ERR(env));
1255 args = vvp_env_args(env, IO_SPLICE);
1256 args->u.splice.via_pipe = pipe;
1257 args->u.splice.via_flags = flags;
1259 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1260 cl_env_put(env, &refcheck);
/* Recreate OST objects for @inode's stripes: clone the stripe MD, mark
 * the obdo with OBD_FL_RECREATE_OBJS and the target OST index (carried
 * in o_nlink), and call obd_create() under the inode size lock.
 * NOTE(review): obdo allocation and the success check after
 * ccc_inode_lsm_get() are elided in this excerpt. */
1265 static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
1268 struct obd_export *exp = ll_i2dtexp(inode);
1269 struct obd_trans_info oti = { 0 };
1270 struct obdo *oa = NULL;
1273 struct lov_stripe_md *lsm = NULL, *lsm2;
1280 lsm = ccc_inode_lsm_get(inode);
1281 if (!lsm_has_objects(lsm))
1282 GOTO(out, rc = -ENOENT);
1284 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1285 (lsm->lsm_stripe_count));
1287 OBD_ALLOC_LARGE(lsm2, lsm_size);
1289 GOTO(out, rc = -ENOMEM);
/* o_nlink is repurposed here to carry the target OST index. */
1292 oa->o_nlink = ost_idx;
1293 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1294 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1295 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1296 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1297 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1298 memcpy(lsm2, lsm, lsm_size);
1299 ll_inode_size_lock(inode);
1300 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1301 ll_inode_size_unlock(inode);
1303 OBD_FREE_LARGE(lsm2, lsm_size);
1306 ccc_inode_lsm_put(inode, lsm);
/*
 * ll_lov_recreate_obj(): LL_IOC_RECREATE_OBJ handler.  Root-only
 * (CFS_CAP_SYS_ADMIN); copies a struct ll_recreate_obj from userspace,
 * builds an MDT0-sequence ost_id from lrc_id and recreates on lrc_ost_idx.
 * NOTE(review): partial extract — error returns between checks are missing.
 */
1311 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1313 struct ll_recreate_obj ucreat;
1317 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1320 if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1324 ostid_set_seq_mdt0(&oi);
1325 ostid_set_id(&oi, ucreat.lrc_id);
1326 RETURN(ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx));
/*
 * ll_lov_recreate_fid(): LL_IOC_RECREATE_FID handler.  Root-only; takes a
 * lu_fid from userspace, converts it to an ost_id and derives the OST index
 * from bits 16..31 of the FID sequence.
 */
1329 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1336 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1339 if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1342 fid_to_ostid(&fid, &oi);
1343 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1344 RETURN(ll_lov_recreate(inode, &oi, ost_idx));
/*
 * ll_lov_setstripe_ea_info(): set striping (LOV EA) on @inode by replaying
 * an IT_OPEN intent carrying the user-supplied lov_user_md.  Fails early if
 * a layout already exists (striping can only be set once).
 * NOTE(review): partial extract — the early-return rc and several error
 * branches are missing from view.
 */
1347 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1348 int flags, struct lov_user_md *lum, int lum_size)
1350 struct lov_stripe_md *lsm = NULL;
1351 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
/* a non-NULL lsm means stripe metadata already set -> refuse */
1355 lsm = ccc_inode_lsm_get(inode);
1357 ccc_inode_lsm_put(inode, lsm);
1358 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1363 ll_inode_size_lock(inode);
1364 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1367 rc = oit.d.lustre.it_status;
1369 GOTO(out_req_free, rc);
/* drop the MDS open handle created as a side effect of the intent */
1371 ll_release_openhandle(file->f_dentry, &oit);
1374 ll_inode_size_unlock(inode);
1375 ll_intent_release(&oit);
1376 ccc_inode_lsm_put(inode, lsm);
1379 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * ll_lov_getstripe_ea_info(): fetch the raw LOV EA (lov_mds_md) for child
 * @filename of directory @inode via md_getattr_name().  On success *lmmp,
 * *lmm_size and *request point into the reply; caller releases *request.
 * The MDS sends the EA in little-endian; on big-endian hosts it is swabbed
 * to host order before being handed to userspace.
 * NOTE(review): partial extract — some cleanup/return lines missing.
 */
1383 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1384 struct lov_mds_md **lmmp, int *lmm_size,
1385 struct ptlrpc_request **request)
1387 struct ll_sb_info *sbi = ll_i2sbi(inode);
1388 struct mdt_body *body;
1389 struct lov_mds_md *lmm = NULL;
1390 struct ptlrpc_request *req = NULL;
1391 struct md_op_data *op_data;
/* size the getattr buffer for the largest possible MDS EA */
1394 rc = ll_get_max_mdsize(sbi, &lmmsize);
1398 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1399 strlen(filename), lmmsize,
1400 LUSTRE_OPC_ANY, NULL);
1401 if (IS_ERR(op_data))
1402 RETURN(PTR_ERR(op_data));
1404 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1405 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1406 ll_finish_md_op_data(op_data);
1408 CDEBUG(D_INFO, "md_getattr_name failed "
1409 "on %s: rc %d\n", filename, rc);
1413 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1414 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1416 lmmsize = body->eadatasize;
/* no EA in the reply -> file/dir has no striping set */
1418 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1420 GOTO(out, rc = -ENODATA);
1423 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1424 LASSERT(lmm != NULL);
1426 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1427 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1428 GOTO(out, rc = -EPROTO);
1432 * This is coming from the MDS, so is probably in
1433 * little endian. We convert it to host endian before
1434 * passing it to userspace.
/* LOV_MAGIC != le32(LOV_MAGIC) is only true on big-endian hosts */
1436 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1439 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1440 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1443 /* if function called for directory - we should
1444 * avoid swab not existent lsm objects */
1445 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1446 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1447 if (S_ISREG(body->mode))
1448 lustre_swab_lov_user_md_objects(
1449 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1451 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1452 lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1453 if (S_ISREG(body->mode))
1454 lustre_swab_lov_user_md_objects(
1455 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1462 *lmm_size = lmmsize;
/*
 * ll_lov_setea(): LL_IOC_LOV_SETEA handler.  Root-only; copies a
 * lov_user_md (with one lov_user_ost_data) from userspace and applies it
 * with MDS_OPEN_HAS_OBJS, i.e. the objects already exist.
 */
1467 static int ll_lov_setea(struct inode *inode, struct file *file,
1470 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1471 struct lov_user_md *lump;
1472 int lum_size = sizeof(struct lov_user_md) +
1473 sizeof(struct lov_user_ost_data);
1477 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1480 OBD_ALLOC_LARGE(lump, lum_size);
1484 if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1485 OBD_FREE_LARGE(lump, lum_size);
1489 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1491 OBD_FREE_LARGE(lump, lum_size);
/*
 * ll_lov_setstripe(): LL_IOC_LOV_SETSTRIPE handler.  Copies a v1 lum first
 * (it is a prefix of v3), re-copies as v3 if the magic says so, applies the
 * striping, then echoes the resulting layout back to userspace via
 * LL_IOC_LOV_GETSTRIPE through obd_iocontrol().
 * NOTE(review): partial extract — the rc==0 branch opening and returns are
 * missing from view.
 */
1495 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1498 struct lov_user_md_v3 lumv3;
1499 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1500 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1501 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1503 int flags = FMODE_WRITE;
1506 /* first try with v1 which is smaller than v3 */
1507 lum_size = sizeof(struct lov_user_md_v1);
1508 if (copy_from_user(lumv1, lumv1p, lum_size))
1511 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1512 lum_size = sizeof(struct lov_user_md_v3);
1513 if (copy_from_user(&lumv3, lumv3p, lum_size))
1517 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1519 struct lov_stripe_md *lsm;
/* zero the user's stripe_count so GETSTRIPE below fills it fresh */
1522 put_user(0, &lumv1p->lmm_stripe_count);
1524 ll_layout_refresh(inode, &gen);
1525 lsm = ccc_inode_lsm_get(inode);
1526 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1527 0, lsm, (void *)arg);
1528 ccc_inode_lsm_put(inode, lsm);
/*
 * ll_lov_getstripe(): LL_IOC_LOV_GETSTRIPE handler — copy the inode's
 * layout out to userspace via the LOV obd_iocontrol().
 * NOTE(review): partial extract — the lsm NULL-check line is missing.
 */
1533 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1535 struct lov_stripe_md *lsm;
1539 lsm = ccc_inode_lsm_get(inode);
1541 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1543 ccc_inode_lsm_put(inode, lsm);
/*
 * ll_get_grouplock(): LL_IOC_GROUP_LOCK handler.  Takes a cl-layer group
 * lock with gid @arg and records it in the per-open file data.  The flag
 * check and the store are done under lli->lli_lock, but the lock itself is
 * acquired with the spinlock dropped (cl_get_grouplock may block), so the
 * flag is re-checked afterwards to resolve the acquire race.
 */
1547 int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1549 struct ll_inode_info *lli = ll_i2info(inode);
1550 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1551 struct ccc_grouplock grouplock;
/* "nolock" mounts cannot support group locks */
1555 if (ll_file_nolock(file))
1556 RETURN(-EOPNOTSUPP);
1558 spin_lock(&lli->lli_lock);
1559 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1560 CWARN("group lock already existed with gid %lu\n",
1561 fd->fd_grouplock.cg_gid);
1562 spin_unlock(&lli->lli_lock);
1565 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1566 spin_unlock(&lli->lli_lock);
/* may block unless O_NONBLOCK was set on the file */
1568 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1569 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* re-check under the lock: another thread may have won meanwhile */
1573 spin_lock(&lli->lli_lock);
1574 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1575 spin_unlock(&lli->lli_lock);
1576 CERROR("another thread just won the race\n");
1577 cl_put_grouplock(&grouplock);
1581 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1582 fd->fd_grouplock = grouplock;
1583 spin_unlock(&lli->lli_lock);
1585 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * ll_put_grouplock(): LL_IOC_GROUP_UNLOCK handler.  Validates that this
 * file descriptor holds a group lock with gid @arg, detaches it from the
 * file data under lli->lli_lock, and releases it with the spinlock dropped
 * (cl_put_grouplock may block).
 */
1589 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1591 struct ll_inode_info *lli = ll_i2info(inode);
1592 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1593 struct ccc_grouplock grouplock;
1596 spin_lock(&lli->lli_lock);
1597 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1598 spin_unlock(&lli->lli_lock);
1599 CWARN("no group lock held\n");
1602 LASSERT(fd->fd_grouplock.cg_lock != NULL);
/* gid must match the one the lock was taken with */
1604 if (fd->fd_grouplock.cg_gid != arg) {
1605 CWARN("group lock %lu doesn't match current id %lu\n",
1606 arg, fd->fd_grouplock.cg_gid);
1607 spin_unlock(&lli->lli_lock);
/* take a local copy so the release happens outside the spinlock */
1611 grouplock = fd->fd_grouplock;
1612 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1613 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1614 spin_unlock(&lli->lli_lock);
1616 cl_put_grouplock(&grouplock);
1617 CDEBUG(D_INFO, "group lock %lu released\n", arg);
/*
 * NOTE(review): partial extract — allocation checks, och free and RETURN
 * lines are missing from view; code left byte-identical.
 */
1622 * Close inode open handle
1624 * \param dentry [in] dentry which contains the inode
1625 * \param it [in,out] intent which contains open info and result
1628 * \retval <0 failure
1630 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1632 struct inode *inode = dentry->d_inode;
1633 struct obd_client_handle *och;
1639 /* Root ? Do nothing. */
1640 if (dentry->d_inode->i_sb->s_root == dentry)
1643 /* No open handle to close? Move away */
1644 if (!it_disposition(it, DISP_OPEN_OPEN))
1647 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1649 OBD_ALLOC(och, sizeof(*och));
1651 GOTO(out, rc = -ENOMEM);
/* fill the client handle from the intent, then close it on the MDS */
1653 ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
1654 ll_i2info(inode), it, och);
1656 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1659 /* this one is in place of ll_file_open */
1660 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1661 ptlrpc_req_finished(it->d.lustre.it_data);
1662 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * NOTE(review): partial extract — lsm NULL-check, the get_info call's key
 * argument list and some returns are missing from view.
 */
1668 * Get size for inode for which FIEMAP mapping is requested.
1669 * Make the FIEMAP get_info call and returns the result.
1671 int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1674 struct obd_export *exp = ll_i2dtexp(inode);
1675 struct lov_stripe_md *lsm = NULL;
1676 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1677 int vallen = num_bytes;
1681 /* Checks for fiemap flags */
1682 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* report the supported flag set back to the caller (FIEMAP ABI) */
1683 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1687 /* Check for FIEMAP_FLAG_SYNC */
1688 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1689 rc = filemap_fdatawrite(inode->i_mapping);
1694 lsm = ccc_inode_lsm_get(inode);
1698 /* If the stripe_count > 1 and the application does not understand
1699 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1701 if (lsm->lsm_stripe_count > 1 &&
1702 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1703 GOTO(out, rc = -EOPNOTSUPP);
1705 fm_key.oa.o_oi = lsm->lsm_oi;
1706 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1708 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1709 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1710 /* If filesize is 0, then there would be no objects for mapping */
1711 if (fm_key.oa.o_size == 0) {
1712 fiemap->fm_mapped_extents = 0;
1716 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
/* the LOV answers the mapping request via KEY_FIEMAP get_info */
1718 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1721 CERROR("obd_get_info failed: rc = %d\n", rc);
1724 ccc_inode_lsm_put(inode, lsm);
/*
 * ll_fid2path(): OBD_IOC_FID2PATH handler.  Copies a getinfo_fid2path
 * header from userspace to learn gf_pathlen, allocates an output buffer of
 * that size, asks the MDC to resolve the FID to a path, and copies the
 * result back.  Restricted to CAP_DAC_READ_SEARCH unless the mount allows
 * user fid2path.
 * NOTE(review): partial extract — gfin free and error-path lines missing.
 */
1728 int ll_fid2path(struct inode *inode, void *arg)
1730 struct obd_export *exp = ll_i2mdexp(inode);
1731 struct getinfo_fid2path *gfout, *gfin;
1735 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1736 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1739 /* Need to get the buflen */
1740 OBD_ALLOC_PTR(gfin);
1743 if (copy_from_user(gfin, arg, sizeof(*gfin))) {
1748 outsize = sizeof(*gfout) + gfin->gf_pathlen;
1749 OBD_ALLOC(gfout, outsize);
1750 if (gfout == NULL) {
1754 memcpy(gfout, gfin, sizeof(*gfout));
1757 /* Call mdc_iocontrol */
1758 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1762 if (copy_to_user(arg, gfout, outsize))
1766 OBD_FREE(gfout, outsize);
/*
 * ll_ioctl_fiemap(): FSFILT_IOC_FIEMAP handler.  Reads fm_extent_count to
 * size a kernel fiemap buffer, copies the user request (and the first
 * extent, which seeds continuation mapping), runs ll_do_fiemap() and
 * copies header plus mapped extents back.
 * NOTE(review): partial extract — the extent_count!=0 guard around the
 * first-extent copy and several returns are missing from view.  Also note
 * num_bytes = header + count*extent is computed from an untrusted count;
 * presumably a range check exists on a missing line — confirm upstream.
 */
1770 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1772 struct ll_user_fiemap *fiemap_s;
1773 size_t num_bytes, ret_bytes;
1774 unsigned int extent_count;
1777 /* Get the extent count so we can calculate the size of
1778 * required fiemap buffer */
1779 if (get_user(extent_count,
1780 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1782 num_bytes = sizeof(*fiemap_s) + (extent_count *
1783 sizeof(struct ll_fiemap_extent));
1785 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1786 if (fiemap_s == NULL)
1789 /* get the fiemap value */
1790 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1792 GOTO(error, rc = -EFAULT);
1794 /* If fm_extent_count is non-zero, read the first extent since
1795 * it is used to calculate end_offset and device from previous
1798 if (copy_from_user(&fiemap_s->fm_extents[0],
1799 (char __user *)arg + sizeof(*fiemap_s),
1800 sizeof(struct ll_fiemap_extent)))
1801 GOTO(error, rc = -EFAULT);
1804 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1808 ret_bytes = sizeof(struct ll_user_fiemap);
1810 if (extent_count != 0)
1811 ret_bytes += (fiemap_s->fm_mapped_extents *
1812 sizeof(struct ll_fiemap_extent));
1814 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1818 OBD_FREE_LARGE(fiemap_s, num_bytes);
/*
 * NOTE(review): partial extract — obdo free, some returns and the
 * version-missing error branch body are not visible.
 */
1823 * Read the data_version for inode.
1825 * This value is computed using stripe object version on OST.
1826 * Version is computed using server side locking.
1828 * @param extent_lock Take extent lock. Not needed if a process is already
1829 * holding the OST object group locks.
1831 int ll_data_version(struct inode *inode, __u64 *data_version,
1834 struct lov_stripe_md *lsm = NULL;
1835 struct ll_sb_info *sbi = ll_i2sbi(inode);
1836 struct obdo *obdo = NULL;
1840 /* If no stripe, we consider version is 0. */
1841 lsm = ccc_inode_lsm_get(inode);
1842 if (!lsm_has_objects(lsm)) {
1844 CDEBUG(D_INODE, "No object for inode\n");
1848 OBD_ALLOC_PTR(obdo);
1850 GOTO(out, rc = -ENOMEM);
/* getattr against the OSTs fills o_data_version when supported */
1852 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
1854 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1857 *data_version = obdo->o_data_version;
1863 ccc_inode_lsm_put(inode, lsm);
/*
 * ll_swap_stack: scratch state for ll_swap_layouts() — saved atimes/mtimes
 * (ia1/ia2), the two inodes, and per-side data-version check flags.
 * Heap-allocated to keep it off the kernel stack.
 */
1867 struct ll_swap_stack {
1868 struct iattr ia1, ia2;
1870 struct inode *inode1, *inode2;
1871 bool check_dv1, check_dv2;
/*
 * ll_swap_layouts(): LL_IOC_LOV_SWAP_LAYOUTS backend — atomically exchange
 * the layouts of file1 and file2 on the MDT.  Steps: validate (regular
 * files, writable, same fs), order the pair by FID to avoid MDT deadlock,
 * optionally take group locks to flush dirty cache, verify data versions
 * if requested, send the swap via obd_iocontrol on the MDC, then restore
 * mtime/atime when the KEEP flags ask for it.
 * NOTE(review): partial extract — gid/dv declarations, llss free and
 * several returns are missing from view; code left byte-identical.
 */
1874 static int ll_swap_layouts(struct file *file1, struct file *file2,
1875 struct lustre_swap_layouts *lsl)
1877 struct mdc_swap_layouts msl;
1878 struct md_op_data *op_data;
1881 struct ll_swap_stack *llss = NULL;
1884 OBD_ALLOC_PTR(llss);
1888 llss->inode1 = file1->f_dentry->d_inode;
1889 llss->inode2 = file2->f_dentry->d_inode;
1891 if (!S_ISREG(llss->inode2->i_mode))
1892 GOTO(free, rc = -EINVAL);
1894 if (ll_permission(llss->inode1, MAY_WRITE, NULL) ||
1895 ll_permission(llss->inode2, MAY_WRITE, NULL))
1896 GOTO(free, rc = -EPERM);
1898 if (llss->inode2->i_sb != llss->inode1->i_sb)
1899 GOTO(free, rc = -EXDEV);
1901 /* we use 2 bool because it is easier to swap than 2 bits */
1902 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1903 llss->check_dv1 = true;
1905 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1906 llss->check_dv2 = true;
1908 /* we cannot use lsl->sl_dvX directly because we may swap them */
1909 llss->dv1 = lsl->sl_dv1;
1910 llss->dv2 = lsl->sl_dv2;
1912 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
1913 if (rc == 0) /* same file, done! */
/* canonical FID order prevents lock-ordering deadlock on the MDT */
1916 if (rc < 0) { /* sequentialize it */
1917 swap(llss->inode1, llss->inode2);
1919 swap(llss->dv1, llss->dv2);
1920 swap(llss->check_dv1, llss->check_dv2);
1924 if (gid != 0) { /* application asks to flush dirty cache */
1925 rc = ll_get_grouplock(llss->inode1, file1, gid);
1929 rc = ll_get_grouplock(llss->inode2, file2, gid);
1931 ll_put_grouplock(llss->inode1, file1, gid);
1936 /* to be able to restore mtime and atime after swap
1937 * we need to first save them */
1939 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
1940 llss->ia1.ia_mtime = llss->inode1->i_mtime;
1941 llss->ia1.ia_atime = llss->inode1->i_atime;
1942 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
1943 llss->ia2.ia_mtime = llss->inode2->i_mtime;
1944 llss->ia2.ia_atime = llss->inode2->i_atime;
1945 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
1948 /* ultimate check, before swaping the layouts we check if
1949 * dataversion has changed (if requested) */
1950 if (llss->check_dv1) {
1951 rc = ll_data_version(llss->inode1, &dv, 0);
1954 if (dv != llss->dv1)
1955 GOTO(putgl, rc = -EAGAIN);
1958 if (llss->check_dv2) {
1959 rc = ll_data_version(llss->inode2, &dv, 0);
1962 if (dv != llss->dv2)
1963 GOTO(putgl, rc = -EAGAIN);
1966 /* struct md_op_data is used to send the swap args to the mdt
1967 * only flags is missing, so we use struct mdc_swap_layouts
1968 * through the md_op_data->op_data */
1969 /* flags from user space have to be converted before they are send to
1970 * server, no flag is sent today, they are only used on the client */
1973 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
1974 0, LUSTRE_OPC_ANY, &msl);
1975 if (IS_ERR(op_data))
1976 GOTO(free, rc = PTR_ERR(op_data));
1978 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
1979 sizeof(*op_data), op_data, NULL);
1980 ll_finish_md_op_data(op_data);
1984 ll_put_grouplock(llss->inode2, file2, gid);
1985 ll_put_grouplock(llss->inode1, file1, gid);
1988 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
1992 /* clear useless flags */
1993 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
1994 llss->ia1.ia_valid &= ~ATTR_MTIME;
1995 llss->ia2.ia_valid &= ~ATTR_MTIME;
1998 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
1999 llss->ia1.ia_valid &= ~ATTR_ATIME;
2000 llss->ia2.ia_valid &= ~ATTR_ATIME;
2003 /* update time if requested */
/* note: ia1/ia2 were swapped along with the inodes, hence the cross use */
2005 if (llss->ia2.ia_valid != 0) {
2006 mutex_lock(&llss->inode1->i_mutex);
2007 rc = ll_setattr(file1->f_dentry, &llss->ia2);
2008 mutex_unlock(&llss->inode1->i_mutex);
2011 if (llss->ia1.ia_valid != 0) {
2014 mutex_lock(&llss->inode2->i_mutex);
2015 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2016 mutex_unlock(&llss->inode2->i_mutex);
/*
 * ll_file_ioctl(): ->unlocked_ioctl for regular Lustre files.  Dispatches
 * Lustre-private and fs ioctls: per-fd flags, striping get/set, layout
 * swap, object recreation, fiemap, group locks, statfs, fid/path mapping,
 * data version, MDT index, HSM state/action, falling through to the
 * generic ll_iocontrol_call()/obd_iocontrol() path for anything else.
 * NOTE(review): partial extract — many returns, frees, allocations (hus/
 * hss/hca) and case labels (e.g. for TCGETS and the F_* lock types) are
 * missing from view; code left byte-identical.
 */
2028 long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2030 struct inode *inode = file->f_dentry->d_inode;
2031 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2035 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2036 inode->i_generation, inode, cmd);
2037 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2039 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2040 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2044 case LL_IOC_GETFLAGS:
2045 /* Get the current value of the file flags */
2046 return put_user(fd->fd_flags, (int *)arg);
2047 case LL_IOC_SETFLAGS:
2048 case LL_IOC_CLRFLAGS:
2049 /* Set or clear specific file flags */
2050 /* XXX This probably needs checks to ensure the flags are
2051 * not abused, and to handle any flag side effects.
2053 if (get_user(flags, (int *) arg))
2056 if (cmd == LL_IOC_SETFLAGS) {
2057 if ((flags & LL_FILE_IGNORE_LOCK) &&
2058 !(file->f_flags & O_DIRECT)) {
2059 CERROR("%s: unable to disable locking on "
2060 "non-O_DIRECT file\n", current->comm);
2064 fd->fd_flags |= flags;
2066 fd->fd_flags &= ~flags;
2069 case LL_IOC_LOV_SETSTRIPE:
2070 RETURN(ll_lov_setstripe(inode, file, arg));
2071 case LL_IOC_LOV_SETEA:
2072 RETURN(ll_lov_setea(inode, file, arg));
2073 case LL_IOC_LOV_SWAP_LAYOUTS: {
2075 struct lustre_swap_layouts lsl;
2077 if (copy_from_user(&lsl, (char *)arg,
2078 sizeof(struct lustre_swap_layouts)))
/* both files must be open for write before we will swap layouts */
2081 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2084 file2 = fget(lsl.sl_fd);
2089 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2090 rc = ll_swap_layouts(file, file2, &lsl);
2094 case LL_IOC_LOV_GETSTRIPE:
2095 RETURN(ll_lov_getstripe(inode, arg));
2096 case LL_IOC_RECREATE_OBJ:
2097 RETURN(ll_lov_recreate_obj(inode, arg));
2098 case LL_IOC_RECREATE_FID:
2099 RETURN(ll_lov_recreate_fid(inode, arg));
2100 case FSFILT_IOC_FIEMAP:
2101 RETURN(ll_ioctl_fiemap(inode, arg));
2102 case FSFILT_IOC_GETFLAGS:
2103 case FSFILT_IOC_SETFLAGS:
2104 RETURN(ll_iocontrol(inode, file, cmd, arg));
2105 case FSFILT_IOC_GETVERSION_OLD:
2106 case FSFILT_IOC_GETVERSION:
2107 RETURN(put_user(inode->i_generation, (int *)arg));
2108 case LL_IOC_GROUP_LOCK:
2109 RETURN(ll_get_grouplock(inode, file, arg));
2110 case LL_IOC_GROUP_UNLOCK:
2111 RETURN(ll_put_grouplock(inode, file, arg));
2112 case IOC_OBD_STATFS:
2113 RETURN(ll_obd_statfs(inode, (void *)arg));
2115 /* We need to special case any other ioctls we want to handle,
2116 * to send them to the MDS/OST as appropriate and to properly
2117 * network encode the arg field.
2118 case FSFILT_IOC_SETVERSION_OLD:
2119 case FSFILT_IOC_SETVERSION:
2121 case LL_IOC_FLUSHCTX:
2122 RETURN(ll_flush_ctx(inode));
2123 case LL_IOC_PATH2FID: {
2124 if (copy_to_user((void *)arg, ll_inode2fid(inode),
2125 sizeof(struct lu_fid)))
2130 case OBD_IOC_FID2PATH:
2131 RETURN(ll_fid2path(inode, (void *)arg));
2132 case LL_IOC_DATA_VERSION: {
2133 struct ioc_data_version idv;
2136 if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
2139 rc = ll_data_version(inode, &idv.idv_version,
2140 !(idv.idv_flags & LL_DV_NOFLUSH));
2142 if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2148 case LL_IOC_GET_MDTIDX: {
2151 mdtidx = ll_get_mdt_idx(inode);
2155 if (put_user((int)mdtidx, (int*)arg))
2160 case OBD_IOC_GETDTNAME:
2161 case OBD_IOC_GETMDNAME:
2162 RETURN(ll_get_obd_name(inode, cmd, arg));
2163 case LL_IOC_HSM_STATE_GET: {
2164 struct md_op_data *op_data;
2165 struct hsm_user_state *hus;
2172 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2173 LUSTRE_OPC_ANY, hus);
2174 if (IS_ERR(op_data)) {
2176 RETURN(PTR_ERR(op_data));
2179 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2182 if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2185 ll_finish_md_op_data(op_data);
2189 case LL_IOC_HSM_STATE_SET: {
2190 struct md_op_data *op_data;
2191 struct hsm_state_set *hss;
2197 if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2202 /* Non-root users are forbidden to set or clear flags which are
2203 * NOT defined in HSM_USER_MASK. */
2204 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK)
2205 && !cfs_capable(CFS_CAP_SYS_ADMIN)) {
2210 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2211 LUSTRE_OPC_ANY, hss);
2212 if (IS_ERR(op_data)) {
2214 RETURN(PTR_ERR(op_data));
2217 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2220 ll_finish_md_op_data(op_data);
2225 case LL_IOC_HSM_ACTION: {
2226 struct md_op_data *op_data;
2227 struct hsm_current_action *hca;
2234 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2235 LUSTRE_OPC_ANY, hca);
2236 if (IS_ERR(op_data)) {
2238 RETURN(PTR_ERR(op_data));
2241 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2244 if (copy_to_user((char *)arg, hca, sizeof(*hca)))
2247 ll_finish_md_op_data(op_data);
/* default: offer the command to registered handlers, then the OSC/LOV */
2255 ll_iocontrol_call(inode, file, cmd, arg, &err))
2258 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2264 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * llseek_execute(): local copy of the kernel's llseek helper for kernels
 * without generic_file_llseek_size (HAVE_FILE_LLSEEK_SIZE unset).  Bounds-
 * checks @offset, then commits it to file->f_pos and resets f_version.
 * NOTE(review): partial extract — the -EINVAL return lines are missing.
 */
2265 static inline loff_t
2266 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2268 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2270 if (offset > maxsize)
2273 if (offset != file->f_pos) {
2274 file->f_pos = offset;
2275 file->f_version = 0;
/*
 * generic_file_llseek_size(): backport of the upstream helper for older
 * kernels; handles SEEK_SET/CUR/END plus SEEK_HOLE/SEEK_DATA against a
 * caller-supplied @eof instead of i_size.
 * NOTE(review): partial extract — the switch statement, SEEK_HOLE/DATA
 * branch bodies and several returns are not visible; code left identical.
 */
2281 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2282 loff_t maxsize, loff_t eof)
2284 struct inode *inode = file->f_dentry->d_inode;
2292 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2293 * position-querying operation. Avoid rewriting the "same"
2294 * f_pos value back to the file because a concurrent read(),
2295 * write() or lseek() might have altered it
2300 * f_lock protects against read/modify/write race with other
2301 * SEEK_CURs. Note that parallel writes and reads behave
2304 mutex_lock(&inode->i_mutex);
2305 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2306 mutex_unlock(&inode->i_mutex);
2310 * In the generic case the entire file is data, so as long as
2311 * offset isn't at the end of the file then the offset is data.
2318 * There is a virtual hole at the end of the file, so as long as
2319 * offset isn't i_size or larger, return i_size.
2327 return llseek_execute(file, offset, maxsize);
/*
 * ll_file_seek(): ->llseek for Lustre files.  For SEEK_END/HOLE/DATA the
 * cluster-wide size must be known, so it glimpses the size from the OSTs
 * first, then delegates to the llseek_size helper with the Lustre maximum
 * file size as the bound.
 * NOTE(review): partial extract — error handling after ll_glimpse_size()
 * and the final RETURN are missing from view.
 */
2331 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2333 struct inode *inode = file->f_dentry->d_inode;
2334 loff_t retval, eof = 0;
2337 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2338 (origin == SEEK_CUR) ? file->f_pos : 0);
2339 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
2340 inode->i_ino, inode->i_generation, inode, retval, retval,
2342 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2344 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2345 retval = ll_glimpse_size(inode);
2348 eof = i_size_read(inode);
2351 retval = ll_generic_file_llseek_size(file, offset, origin,
2352 ll_file_maxbytes(inode), eof);
/*
 * ll_flush(): ->flush, called on every close of a file descriptor.
 * Collects async writeback errors recorded on the inode and the cl object;
 * returns -EIO if any error is pending and the application has not already
 * been told about the write failure (fd_write_failed).
 */
2356 int ll_flush(struct file *file, fl_owner_t id)
2358 struct inode *inode = file->f_dentry->d_inode;
2359 struct ll_inode_info *lli = ll_i2info(inode);
2360 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2363 LASSERT(!S_ISDIR(inode->i_mode));
2365 /* catch async errors that were recorded back when async writeback
2366 * failed for pages in this mapping. */
2367 rc = lli->lli_async_rc;
2368 lli->lli_async_rc = 0;
2369 err = lov_read_and_clear_async_rc(lli->lli_clob);
2373 /* The application has been told write failure already.
2374 * Do not report failure again. */
2375 if (fd->fd_write_failed)
2377 return rc ? -EIO : 0;
/*
 * NOTE(review): partial extract — io/env declarations, fi_end assignment
 * and capa release are missing from view; code left byte-identical.
 */
2381 * Called to make sure a portion of file has been written out.
2382 * if @local_only is not true, it will send OST_SYNC RPCs to ost.
2384 * Return how many pages have been written.
2386 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2387 enum cl_fsync_mode mode, int ignore_layout)
2389 struct cl_env_nest nest;
2392 struct obd_capa *capa = NULL;
2393 struct cl_fsync_io *fio;
/* reject unknown fsync modes up front */
2397 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2398 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2401 env = cl_env_nested_get(&nest);
2403 RETURN(PTR_ERR(env));
2405 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2407 io = ccc_env_thread_io(env);
2408 io->ci_obj = cl_i2info(inode)->lli_clob;
2409 io->ci_ignore_layout = ignore_layout;
2411 /* initialize parameters for sync */
2412 fio = &io->u.ci_fsync;
2413 fio->fi_capa = capa;
2414 fio->fi_start = start;
2416 fio->fi_fid = ll_inode2fid(inode);
2417 fio->fi_mode = mode;
2418 fio->fi_nr_written = 0;
2420 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2421 result = cl_io_loop(env, io);
2423 result = io->ci_result;
/* on success the result is the count of pages written */
2425 result = fio->fi_nr_written;
2426 cl_io_fini(env, io);
2427 cl_env_nested_put(&nest, env);
/*
 * ll_fsync(): ->fsync with three kernel-ABI variants (4-arg ranged,
 * 2-arg, and legacy 3-arg with dentry).  Flushes the page cache, collects
 * recorded async write errors, issues an MDS sync (md_sync), and for
 * datasync on regular files runs a ranged cl_sync_file_range(), updating
 * fd_write_failed accordingly.
 * NOTE(review): partial extract — rc/err merge lines, capa_put, the
 * CL_FSYNC mode argument and the final RETURN are missing from view.
 */
2435 * When dentry is provided (the 'else' case), *file->f_dentry may be
2436 * null and dentry must be used directly rather than pulled from
2437 * *file->f_dentry as is done otherwise.
2440 #ifdef HAVE_FILE_FSYNC_4ARGS
2441 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2443 struct dentry *dentry = file->f_dentry;
2444 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2445 int ll_fsync(struct file *file, int datasync)
2447 struct dentry *dentry = file->f_dentry;
2449 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2452 struct inode *inode = dentry->d_inode;
2453 struct ll_inode_info *lli = ll_i2info(inode);
2454 struct ptlrpc_request *req;
2455 struct obd_capa *oc;
2459 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2460 inode->i_generation, inode);
2461 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2463 #ifdef HAVE_FILE_FSYNC_4ARGS
2464 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2465 mutex_lock(&inode->i_mutex);
2467 /* fsync's caller has already called _fdata{sync,write}, we want
2468 * that IO to finish before calling the osc and mdc sync methods */
2469 rc = filemap_fdatawait(inode->i_mapping);
2472 /* catch async errors that were recorded back when async writeback
2473 * failed for pages in this mapping. */
2474 if (!S_ISDIR(inode->i_mode)) {
2475 err = lli->lli_async_rc;
2476 lli->lli_async_rc = 0;
2479 err = lov_read_and_clear_async_rc(lli->lli_clob);
2484 oc = ll_mdscapa_get(inode);
2485 err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2491 ptlrpc_req_finished(req);
2493 if (datasync && S_ISREG(inode->i_mode)) {
2494 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2496 err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
2498 if (rc == 0 && err < 0)
/* remember the outcome so ll_flush() doesn't double-report */
2501 fd->fd_write_failed = true;
2503 fd->fd_write_failed = false;
2506 #ifdef HAVE_FILE_FSYNC_4ARGS
2507 mutex_unlock(&inode->i_mutex);
/*
 * ll_file_flock(): ->lock/->flock handler — translates POSIX (fcntl) and
 * BSD (flock) lock requests into LDLM_FLOCK enqueues on the MDS.  Lock
 * type maps to LDLM mode (read->PR, write->PW, unlock->NL); F_GETLK adds
 * LDLM_FL_TEST_LOCK, non-blocking commands add LDLM_FL_BLOCK_NOWAIT.
 * After a successful enqueue the lock is mirrored into the local kernel
 * lock tables (flock_lock_file_wait / posix_lock_file_wait); if that
 * mirroring fails the remote lock is rolled back with an NL enqueue.
 * NOTE(review): partial extract — several case labels (F_RDLCK/F_UNLCK/
 * F_WRLCK, the cmd switch labels), flock.l_flock.start init for FL_FLOCK,
 * rc2 decl and returns are missing from view; code left byte-identical.
 */
2512 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2514 struct inode *inode = file->f_dentry->d_inode;
2515 struct ll_sb_info *sbi = ll_i2sbi(inode);
2516 struct ldlm_enqueue_info einfo = {
2517 .ei_type = LDLM_FLOCK,
2518 .ei_cb_cp = ldlm_flock_completion_ast,
2519 .ei_cbdata = file_lock,
2521 struct md_op_data *op_data;
2522 struct lustre_handle lockh = {0};
2523 ldlm_policy_data_t flock = {{0}};
2529 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2530 inode->i_ino, file_lock);
2532 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2534 if (file_lock->fl_flags & FL_FLOCK) {
2535 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2536 /* flocks are whole-file locks */
2537 flock.l_flock.end = OFFSET_MAX;
2538 /* For flocks owner is determined by the local file desctiptor*/
2539 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2540 } else if (file_lock->fl_flags & FL_POSIX) {
2541 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2542 flock.l_flock.start = file_lock->fl_start;
2543 flock.l_flock.end = file_lock->fl_end;
2547 flock.l_flock.pid = file_lock->fl_pid;
2549 /* Somewhat ugly workaround for svc lockd.
2550 * lockd installs custom fl_lmops->lm_compare_owner that checks
2551 * for the fl_owner to be the same (which it always is on local node
2552 * I guess between lockd processes) and then compares pid.
2553 * As such we assign pid to the owner field to make it all work,
2554 * conflict with normal locks is unlikely since pid space and
2555 * pointer space for current->files are not intersecting */
2556 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2557 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
2559 switch (file_lock->fl_type) {
2561 einfo.ei_mode = LCK_PR;
2564 /* An unlock request may or may not have any relation to
2565 * existing locks so we may not be able to pass a lock handle
2566 * via a normal ldlm_lock_cancel() request. The request may even
2567 * unlock a byte range in the middle of an existing lock. In
2568 * order to process an unlock request we need all of the same
2569 * information that is given with a normal read or write record
2570 * lock request. To avoid creating another ldlm unlock (cancel)
2571 * message we'll treat a LCK_NL flock request as an unlock. */
2572 einfo.ei_mode = LCK_NL;
2575 einfo.ei_mode = LCK_PW;
2578 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2579 file_lock->fl_type);
2594 flags = LDLM_FL_BLOCK_NOWAIT;
2600 flags = LDLM_FL_TEST_LOCK;
2601 /* Save the old mode so that if the mode in the lock changes we
2602 * can decrement the appropriate reader or writer refcount. */
2603 file_lock->fl_type = einfo.ei_mode;
2606 CERROR("unknown fcntl lock command: %d\n", cmd);
2610 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2611 LUSTRE_OPC_ANY, NULL);
2612 if (IS_ERR(op_data))
2613 RETURN(PTR_ERR(op_data));
2615 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2616 "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2617 flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2619 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2620 op_data, &lockh, &flock, 0, NULL /* req */, flags);
/* mirror the granted lock into the local kernel lock tables */
2622 if ((file_lock->fl_flags & FL_FLOCK) &&
2623 (rc == 0 || file_lock->fl_type == F_UNLCK))
2624 rc2 = flock_lock_file_wait(file, file_lock);
2625 if ((file_lock->fl_flags & FL_POSIX) &&
2626 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2627 !(flags & LDLM_FL_TEST_LOCK))
2628 rc2 = posix_lock_file_wait(file, file_lock);
2630 if (rc2 && file_lock->fl_type != F_UNLCK) {
/* local bookkeeping failed: drop the server lock via NL enqueue */
2631 einfo.ei_mode = LCK_NL;
2632 md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2633 op_data, &lockh, &flock, 0, NULL /* req */, flags);
2637 ll_finish_md_op_data(op_data);
/*
 * ll_file_noflock(): ->lock stub for "nolock"-mounted files.
 * NOTE(review): body is entirely missing from this extract — presumably it
 * returns -ENOSYS/-EINVAL; confirm against the full source.
 */
2642 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
/*
 * NOTE(review): partial extract — declarations (fid, flags, i), lock-state
 * checks inside the match branch and the RETURN are missing from view.
 */
2650 * test if some locks matching bits and l_req_mode are acquired
2651 * - bits can be in different locks
2652 * - if found clear the common lock bits in *bits
2653 * - the bits not found, are kept in *bits
2655 * \param bits [IN] searched lock bits [IN]
2656 * \param l_req_mode [IN] searched lock mode
2657 * \retval boolean, true iff all bits are found
2659 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
2661 struct lustre_handle lockh;
2662 ldlm_policy_data_t policy;
2663 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2664 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2673 fid = &ll_i2info(inode)->lli_fid;
2674 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2675 ldlm_lockname[mode]);
/* TEST_LOCK: match without taking a reference on the found lock */
2677 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2678 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2679 policy.l_inodebits.bits = *bits & (1 << i);
2680 if (policy.l_inodebits.bits == 0)
2683 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2684 &policy, mode, &lockh)) {
2685 struct ldlm_lock *lock;
2687 lock = ldlm_handle2lock(&lockh);
/* clear every bit covered by the matched lock, not just bit i */
2690 ~(lock->l_policy_data.l_inodebits.bits);
2691 LDLM_LOCK_PUT(lock);
2693 *bits &= ~policy.l_inodebits.bits;
/* Try to match (and keep a reference on) an MDC ibits lock covering
 * @bits on @inode. On success the handle is stored in @lockh and the
 * matched mode is returned; 0 means no matching lock was found.
 * Caller is responsible for decref'ing with the returned mode. */
2700 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2701 struct lustre_handle *lockh, __u64 flags)
2703 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2708 fid = &ll_i2info(inode)->lli_fid;
2709 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
/* Any of CR/CW/PR/PW is acceptable; unlike ll_have_md_lock() this does
 * NOT pass LDLM_FL_TEST_LOCK, so the match is actually held. */
2711 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
2712 fid, LDLM_IBITS, &policy,
2713 LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
/* Normalize the result of a revalidate RPC: -ENOENT on a non-regular,
 * non-directory inode means "already unlinked" and is treated as
 * success (elided branch presumably updates nlink — confirm); any other
 * non-zero rc is logged as a revalidate failure. */
2717 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2719 /* Already unlinked. Just update nlink and return success */
2720 if (rc == -ENOENT) {
2722 /* This path cannot be hit for regular files unless in
2723 * case of obscure races, so no need to to validate
2725 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2727 } else if (rc != 0) {
2728 CERROR("%s: revalidate FID "DFID" error: rc = %d\n",
2729 ll_get_fsname(inode->i_sb, NULL, 0),
2730 PFID(ll_inode2fid(inode)), rc);
/* Revalidate an inode's metadata against the MDS. Two paths:
 *  - with OBD_CONNECT_ATTRFID: intent getattr/lookup by FID;
 *  - otherwise, if no suitable cached ibits lock covers @ibits: a plain
 *    md_getattr RPC followed by ll_prep_inode().
 * NOTE(review): interior lines are elided in this view, so error-path
 * and GOTO details cannot be fully confirmed from here. */
2736 int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2739 struct inode *inode = dentry->d_inode;
2740 struct ptlrpc_request *req = NULL;
2741 struct obd_export *exp;
2745 LASSERT(inode != NULL);
2747 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2748 inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2750 exp = ll_i2mdexp(inode);
2752 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
2753 * But under CMD case, it caused some lock issues, should be fixed
2754 * with new CMD ibits lock. See bug 12718 */
2755 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
2756 struct lookup_intent oit = { .it_op = IT_GETATTR };
2757 struct md_op_data *op_data;
/* LOOKUP-only revalidation can use the cheaper IT_LOOKUP intent. */
2759 if (ibits == MDS_INODELOCK_LOOKUP)
2760 oit.it_op = IT_LOOKUP;
2762 /* Call getattr by fid, so do not provide name at all. */
2763 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2764 dentry->d_inode, NULL, 0, 0,
2765 LUSTRE_OPC_ANY, NULL);
2766 if (IS_ERR(op_data))
2767 RETURN(PTR_ERR(op_data));
/* M_CHECK_STALE asks the intent path to detect stale inodes; it is
 * cleared again right after the RPC below. */
2769 oit.it_create_mode |= M_CHECK_STALE;
2770 rc = md_intent_lock(exp, op_data, NULL, 0,
2771 /* we are not interested in name
2774 ll_md_blocking_ast, 0);
2775 ll_finish_md_op_data(op_data);
2776 oit.it_create_mode &= ~M_CHECK_STALE;
2778 rc = ll_inode_revalidate_fini(inode, rc);
2782 rc = ll_revalidate_it_finish(req, &oit, dentry);
2784 ll_intent_release(&oit);
2788 /* Unlinked? Unhash dentry, so it is not picked up later by
2789 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2790 here to preserve get_cwd functionality on 2.6.
2792 if (!dentry->d_inode->i_nlink)
2793 d_lustre_invalidate(dentry, 0);
2795 ll_lookup_finish_locks(&oit, dentry);
2796 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
2797 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2798 obd_valid valid = OBD_MD_FLGETATTR;
2799 struct md_op_data *op_data;
/* Regular files also need the striping EA returned with the getattr. */
2802 if (S_ISREG(inode->i_mode)) {
2803 rc = ll_get_max_mdsize(sbi, &ealen);
2806 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2809 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
2810 0, ealen, LUSTRE_OPC_ANY,
2812 if (IS_ERR(op_data))
2813 RETURN(PTR_ERR(op_data));
2815 op_data->op_valid = valid;
2816 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2817 * capa for this inode. Because we only keep capas of dirs
2819 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
2820 ll_finish_md_op_data(op_data);
2822 rc = ll_inode_revalidate_fini(inode, rc);
/* Refresh the in-core inode from the getattr reply. */
2826 rc = ll_prep_inode(&inode, req, NULL, NULL);
2829 ptlrpc_req_finished(req);
/* Revalidate metadata, then refresh cached attributes: non-regular
 * inodes copy a/m/ctime straight from the MDS-provided lvb; regular
 * files glimpse the current size from the OSTs instead. */
2833 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2836 struct inode *inode = dentry->d_inode;
2840 rc = __ll_inode_revalidate_it(dentry, it, ibits);
2844 /* if object isn't regular file, don't validate size */
2845 if (!S_ISREG(inode->i_mode)) {
2846 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
2847 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
2848 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
2850 rc = ll_glimpse_size(inode);
/* ->getattr backend: revalidate the UPDATE|LOOKUP ibits against the
 * MDS, bump the proc counter, then fill *stat from the now-fresh inode
 * fields. */
2855 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2856 struct lookup_intent *it, struct kstat *stat)
2858 struct inode *inode = de->d_inode;
2859 struct ll_sb_info *sbi = ll_i2sbi(inode);
2860 struct ll_inode_info *lli = ll_i2info(inode);
2863 res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE |
2864 MDS_INODELOCK_LOOKUP);
2865 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
2870 stat->dev = inode->i_sb->s_dev;
/* 32-bit userland needs an ino squashed from the 128-bit FID. */
2871 if (ll_need_32bit_api(sbi))
2872 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
2874 stat->ino = inode->i_ino;
2875 stat->mode = inode->i_mode;
2876 stat->nlink = inode->i_nlink;
2877 stat->uid = inode->i_uid;
2878 stat->gid = inode->i_gid;
2879 stat->rdev = inode->i_rdev;
2880 stat->atime = inode->i_atime;
2881 stat->mtime = inode->i_mtime;
2882 stat->ctime = inode->i_ctime;
2883 stat->blksize = 1 << inode->i_blkbits;
2885 stat->size = i_size_read(inode);
2886 stat->blocks = inode->i_blocks;
/* VFS ->getattr entry point: wraps ll_getattr_it() with a fresh
 * IT_GETATTR intent. */
2890 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2892 struct lookup_intent it = { .it_op = IT_GETATTR };
2894 return ll_getattr_it(mnt, de, &it, stat);
2897 #ifdef HAVE_LINUX_FIEMAP_H
/* VFS ->fiemap handler: marshal the kernel's fiemap_extent_info into a
 * Lustre ll_user_fiemap request sized for fi_extents_max extents, run
 * ll_do_fiemap(), then copy flags and mapped extents back out. */
2898 int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2899 __u64 start, __u64 len)
2903 struct ll_user_fiemap *fiemap;
2904 unsigned int extent_count = fieinfo->fi_extents_max;
2906 num_bytes = sizeof(*fiemap) + (extent_count *
2907 sizeof(struct ll_fiemap_extent));
2908 OBD_ALLOC_LARGE(fiemap, num_bytes);
2913 fiemap->fm_flags = fieinfo->fi_flags;
2914 fiemap->fm_extent_count = fieinfo->fi_extents_max;
2915 fiemap->fm_start = start;
2916 fiemap->fm_length = len;
/* NOTE(review): only ONE extent is copied in here — presumably to seed
 * fm_extents[0] for a continuation request; the guard that skips this
 * when extent_count == 0 appears to be elided in this view — confirm. */
2917 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
2918 sizeof(struct ll_fiemap_extent))
2920 rc = ll_do_fiemap(inode, fiemap, num_bytes);
2922 fieinfo->fi_flags = fiemap->fm_flags;
2923 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
2924 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
2925 fiemap->fm_mapped_extents * sizeof(struct ll_fiemap_extent));
2927 OBD_FREE_LARGE(fiemap, num_bytes);
/* VFS ->get_acl: return a referenced copy of the cached POSIX ACL held
 * in ll_inode_info, under lli_lock. May return NULL if none cached. */
2932 struct posix_acl * ll_get_acl(struct inode *inode, int type)
2934 struct ll_inode_info *lli = ll_i2info(inode);
2935 struct posix_acl *acl = NULL;
2938 spin_lock(&lli->lli_lock);
2939 /* VFS' acl_permission_check->check_acl will release the refcount */
2940 acl = posix_acl_dup(lli->lli_posix_acl);
2941 spin_unlock(&lli->lli_lock);
2946 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
/* ACL callback passed to ll_generic_permission() below. Signature
 * varies with the kernel's generic_permission() prototype. Checks the
 * cached ACL via posix_acl_permission(); compiled out to a stub when
 * CONFIG_FS_POSIX_ACL is off. */
2948 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
2949 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
2951 ll_check_acl(struct inode *inode, int mask)
2954 # ifdef CONFIG_FS_POSIX_ACL
2955 struct posix_acl *acl;
2959 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
/* Cannot block (sleeping lookups) in RCU walk mode — bail out
 * (elided branch presumably returns -ECHILD — confirm). */
2960 if (flags & IPERM_FLAG_RCU)
2963 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
2968 rc = posix_acl_permission(inode, acl, mask);
2969 posix_acl_release(acl);
2972 # else /* !CONFIG_FS_POSIX_ACL */
2974 # endif /* CONFIG_FS_POSIX_ACL */
2976 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
2978 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
/* VFS ->permission: revalidate the root inode if needed, delegate to a
 * remote-permission check for RMT_CLIENT mounts, otherwise run the
 * generic permission check with ll_check_acl. Signature varies with
 * the kernel's inode_operations->permission prototype. */
2979 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
2981 # ifdef HAVE_INODE_PERMISION_2ARGS
2982 int ll_inode_permission(struct inode *inode, int mask)
2984 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
/* RCU path walk: we may block below (RPCs), so refuse and let the VFS
 * retry in ref-walk mode (elided return presumably -ECHILD — confirm). */
2991 #ifdef MAY_NOT_BLOCK
2992 if (mask & MAY_NOT_BLOCK)
2994 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
2995 if (flags & IPERM_FLAG_RCU)
2999 /* as root inode are NOT getting validated in lookup operation,
3000 * need to do it before permission check. */
3002 if (inode == inode->i_sb->s_root->d_inode) {
3003 struct lookup_intent it = { .it_op = IT_LOOKUP };
3005 rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
3006 MDS_INODELOCK_LOOKUP);
3011 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
3012 inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
/* Remote-client mounts defer permission decisions to the server. */
3014 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
3015 return lustre_check_remote_perm(inode, mask);
3017 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
3018 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
/* Map the vectored-I/O file_operations member names and implementations
 * to whatever this kernel provides: old readv/writev vs. newer
 * aio_read/aio_write. Used by the three operations tables below. */
3023 #ifdef HAVE_FILE_READV
3024 #define READ_METHOD readv
3025 #define READ_FUNCTION ll_file_readv
3026 #define WRITE_METHOD writev
3027 #define WRITE_FUNCTION ll_file_writev
3029 #define READ_METHOD aio_read
3030 #define READ_FUNCTION ll_file_aio_read
3031 #define WRITE_METHOD aio_write
3032 #define WRITE_FUNCTION ll_file_aio_write
3035 /* -o localflock - only provides locally consistent flock locks */
/* Default file_operations table: no .flock/.lock members, so the
 * kernel's local (per-node) flock semantics apply. */
3036 struct file_operations ll_file_operations = {
3037 .read = ll_file_read,
3038 .READ_METHOD = READ_FUNCTION,
3039 .write = ll_file_write,
3040 .WRITE_METHOD = WRITE_FUNCTION,
3041 .unlocked_ioctl = ll_file_ioctl,
3042 .open = ll_file_open,
3043 .release = ll_file_release,
3044 .mmap = ll_file_mmap,
3045 .llseek = ll_file_seek,
3046 #ifdef HAVE_KERNEL_SENDFILE
3047 .sendfile = ll_file_sendfile,
3049 #ifdef HAVE_KERNEL_SPLICE_READ
3050 .splice_read = ll_file_splice_read,
/* file_operations for -o flock mounts: identical to the default table
 * except .flock/.lock route through ll_file_flock for cluster-wide
 * coherent locking via the MDS. */
3056 struct file_operations ll_file_operations_flock = {
3057 .read = ll_file_read,
3058 .READ_METHOD = READ_FUNCTION,
3059 .write = ll_file_write,
3060 .WRITE_METHOD = WRITE_FUNCTION,
3061 .unlocked_ioctl = ll_file_ioctl,
3062 .open = ll_file_open,
3063 .release = ll_file_release,
3064 .mmap = ll_file_mmap,
3065 .llseek = ll_file_seek,
3066 #ifdef HAVE_KERNEL_SENDFILE
3067 .sendfile = ll_file_sendfile,
3069 #ifdef HAVE_KERNEL_SPLICE_READ
3070 .splice_read = ll_file_splice_read,
3074 .flock = ll_file_flock,
3075 .lock = ll_file_flock
3078 /* These are for -o noflock - to return ENOSYS on flock calls */
/* Same table again, but .flock/.lock are wired to ll_file_noflock so
 * applications get an explicit error instead of silent local locking. */
3079 struct file_operations ll_file_operations_noflock = {
3080 .read = ll_file_read,
3081 .READ_METHOD = READ_FUNCTION,
3082 .write = ll_file_write,
3083 .WRITE_METHOD = WRITE_FUNCTION,
3084 .unlocked_ioctl = ll_file_ioctl,
3085 .open = ll_file_open,
3086 .release = ll_file_release,
3087 .mmap = ll_file_mmap,
3088 .llseek = ll_file_seek,
3089 #ifdef HAVE_KERNEL_SENDFILE
3090 .sendfile = ll_file_sendfile,
3092 #ifdef HAVE_KERNEL_SPLICE_READ
3093 .splice_read = ll_file_splice_read,
3097 .flock = ll_file_noflock,
3098 .lock = ll_file_noflock
/* inode_operations for regular files: attribute get/set, xattrs,
 * permission checking, and (when available) fiemap and get_acl. */
3101 struct inode_operations ll_file_inode_operations = {
3102 .setattr = ll_setattr,
3103 .getattr = ll_getattr,
3104 .permission = ll_inode_permission,
3105 .setxattr = ll_setxattr,
3106 .getxattr = ll_getxattr,
3107 .listxattr = ll_listxattr,
3108 .removexattr = ll_removexattr,
3109 #ifdef HAVE_LINUX_FIEMAP_H
3110 .fiemap = ll_fiemap,
3112 #ifdef HAVE_IOP_GET_ACL
3113 .get_acl = ll_get_acl,
3117 /* dynamic ioctl number support routins */
/* Registry of dynamically registered ioctl handlers: a list of
 * llioc_data entries (below) protected by a rw_semaphore. Readers are
 * the ioctl dispatch path; writers are register/unregister. */
3118 static struct llioc_ctl_data {
3119 struct rw_semaphore ioc_sem;
3120 cfs_list_t ioc_head;
3122 __RWSEM_INITIALIZER(llioc.ioc_sem),
3123 CFS_LIST_HEAD_INIT(llioc.ioc_head)
/* One registration: callback plus a flexible trailing array of the
 * iocd_count ioctl command numbers it handles; iocd_size is the full
 * allocation size (used when freeing). */
3128 cfs_list_t iocd_list;
3129 unsigned int iocd_size;
3130 llioc_callback_t iocd_cb;
3131 unsigned int iocd_count;
3132 unsigned int iocd_cmd[0];
/* Register a dynamic ioctl handler @cb for the @count command numbers
 * in @cmd. Returns an opaque cookie (the allocation itself) to pass to
 * ll_iocontrol_unregister(), or (presumably) NULL on bad args/OOM —
 * the elided return lines cannot be confirmed from this view. */
3135 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3138 struct llioc_data *in_data = NULL;
3141 if (cb == NULL || cmd == NULL ||
3142 count > LLIOC_MAX_CMD || count < 0)
/* sizeof(*in_data) already covers the header; cmd array is the
 * flexible trailing member. */
3145 size = sizeof(*in_data) + count * sizeof(unsigned int);
3146 OBD_ALLOC(in_data, size);
3147 if (in_data == NULL)
3150 memset(in_data, 0, sizeof(*in_data));
3151 in_data->iocd_size = size;
3152 in_data->iocd_cb = cb;
3153 in_data->iocd_count = count;
3154 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
/* Publish under the write lock so concurrent dispatchers never see a
 * half-initialized entry. */
3156 down_write(&llioc.ioc_sem);
3157 cfs_list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3158 up_write(&llioc.ioc_sem);
/* Unregister a handler previously returned by ll_iocontrol_register():
 * find the matching entry under the write lock, unlink it, drop the
 * lock, and free it. Warns if @magic is not found. */
3163 void ll_iocontrol_unregister(void *magic)
3165 struct llioc_data *tmp;
3170 down_write(&llioc.ioc_sem);
3171 cfs_list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* Save the size before freeing; iocd_size records the original
 * allocation length. */
3173 unsigned int size = tmp->iocd_size;
3175 cfs_list_del(&tmp->iocd_list);
3176 up_write(&llioc.ioc_sem);
3178 OBD_FREE(tmp, size);
3182 up_write(&llioc.ioc_sem);
3184 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3187 EXPORT_SYMBOL(ll_iocontrol_register);
3188 EXPORT_SYMBOL(ll_iocontrol_unregister);
/* Dispatch an ioctl to the dynamically registered handlers: scan every
 * registration under the read lock, invoke the callback of the first
 * entry whose command table contains @cmd, and stop once a callback
 * answers LLIOC_STOP. The handler's result code is returned via @rcp
 * (rc defaults to -EINVAL when nothing handled the command). */
3190 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
3191 unsigned int cmd, unsigned long arg, int *rcp)
3193 enum llioc_iter ret = LLIOC_CONT;
3194 struct llioc_data *data;
3195 int rc = -EINVAL, i;
3197 down_read(&llioc.ioc_sem);
3198 cfs_list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3199 for (i = 0; i < data->iocd_count; i++) {
3200 if (cmd != data->iocd_cmd[i])
3203 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3207 if (ret == LLIOC_STOP)
3210 up_read(&llioc.ioc_sem);
/* Push a layout configuration into the cl_object for @inode via
 * cl_conf_set(), inside a nested cl environment. For OBJECT_CONF_SET
 * the associated layout lock is only allowed to match AFTER the layout
 * has been applied, so no reader can observe a stale layout. */
3217 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3219 struct ll_inode_info *lli = ll_i2info(inode);
3220 struct cl_env_nest nest;
/* Nothing to configure if the cl object was never created. */
3225 if (lli->lli_clob == NULL)
3228 env = cl_env_nested_get(&nest);
3230 RETURN(PTR_ERR(env));
3232 result = cl_conf_set(env, lli->lli_clob, conf);
3233 cl_env_nested_put(&nest, env);
3235 if (conf->coc_opc == OBJECT_CONF_SET) {
3236 struct ldlm_lock *lock = conf->coc_lock;
3238 LASSERT(lock != NULL);
3239 LASSERT(ldlm_has_layout(lock));
3241 /* it can only be allowed to match after layout is
3242 * applied to inode otherwise false layout would be
3243 * seen. Applying layout shoud happen before dropping
3244 * the intent lock. */
3245 ldlm_lock_allow_match(lock);
3251 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
/* If the layout lock's LVB is already populated and ready, this is a
 * no-op. Otherwise fetch the LOV EA via md_getxattr and install it as
 * the lock's l_lvb_data under the resource lock. */
3252 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3255 struct ll_sb_info *sbi = ll_i2sbi(inode);
3256 struct obd_capa *oc;
3257 struct ptlrpc_request *req;
3258 struct mdt_body *body;
3265 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3266 PFID(ll_inode2fid(inode)), !!(lock->l_flags & LDLM_FL_LVB_READY),
3267 lock->l_lvb_data, lock->l_lvb_len);
/* Fast path: layout already delivered with the lock grant. */
3269 if ((lock->l_lvb_data != NULL) && (lock->l_flags & LDLM_FL_LVB_READY))
3272 /* if layout lock was granted right away, the layout is returned
3273 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3274 * blocked and then granted via completion ast, we have to fetch
3275 * layout here. Please note that we can't use the LVB buffer in
3276 * completion AST because it doesn't have a large enough buffer */
3277 oc = ll_mdscapa_get(inode);
3278 rc = ll_get_max_mdsize(sbi, &lmmsize);
3280 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3281 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
/* Server must not claim an EA larger than the advertised max. */
3287 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3288 if (body == NULL || body->eadatasize > lmmsize)
3289 GOTO(out, rc = -EPROTO);
3291 lmmsize = body->eadatasize;
3292 if (lmmsize == 0) /* empty layout */
3295 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3297 GOTO(out, rc = -EFAULT);
/* Copy the EA out of the RPC reply into a buffer that outlives it,
 * then swap it in as the lock's LVB under the resource lock. */
3299 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3300 if (lvbdata == NULL)
3301 GOTO(out, rc = -ENOMEM);
3303 memcpy(lvbdata, lmm, lmmsize);
3304 lock_res_and_lock(lock);
3305 if (lock->l_lvb_data != NULL)
3306 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3308 lock->l_lvb_data = lvbdata;
3309 lock->l_lvb_len = lmmsize;
3310 unlock_res_and_lock(lock);
3315 ptlrpc_req_finished(req);
3320 * Apply the layout to the inode. Layout lock is held and will be released
/* On entry @lockh/@mode reference a granted layout lock; this function
 * fetches the layout into the lock's LVB if needed, unpacks it, applies
 * it to the cl_object via ll_layout_conf(), stores the resulting layout
 * generation in *gen, and decrefs the lock before returning. If the
 * object is still busy it waits (OBJECT_CONF_WAIT) for IO to drain. */
3323 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3324 struct inode *inode, __u32 *gen, bool reconf)
3326 struct ll_inode_info *lli = ll_i2info(inode);
3327 struct ll_sb_info *sbi = ll_i2sbi(inode);
3328 struct ldlm_lock *lock;
3329 struct lustre_md md = { NULL };
3330 struct cl_object_conf conf;
3333 bool wait_layout = false;
3336 LASSERT(lustre_handle_is_used(lockh));
3338 lock = ldlm_handle2lock(lockh);
3339 LASSERT(lock != NULL);
3340 LASSERT(ldlm_has_layout(lock));
3342 LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
3343 inode, PFID(&lli->lli_fid), reconf);
3345 /* in case this is a caching lock and reinstate with new inode */
3346 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3348 lock_res_and_lock(lock);
3349 lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
3350 unlock_res_and_lock(lock);
3351 /* checking lvb_ready is racy but this is okay. The worst case is
3352 * that multi processes may configure the file on the same time. */
3353 if (lvb_ready || !reconf) {
3356 /* layout_gen must be valid if layout lock is not
3357 * cancelled and stripe has already set */
3358 *gen = lli->lli_layout_gen;
3364 rc = ll_layout_fetch(inode, lock);
3368 /* for layout lock, lmm is returned in lock's lvb.
3369 * lvb_data is immutable if the lock is held so it's safe to access it
3370 * without res lock. See the description in ldlm_lock_decref_internal()
3371 * for the condition to free lvb_data of layout lock */
3372 if (lock->l_lvb_data != NULL) {
3373 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3374 lock->l_lvb_data, lock->l_lvb_len);
/* No stripe data means an empty layout; otherwise the generation
 * comes from the unpacked lsm. */
3376 *gen = LL_LAYOUT_GEN_EMPTY;
3378 *gen = md.lsm->lsm_layout_gen;
3381 CERROR("%s: file "DFID" unpackmd error: %d\n",
3382 ll_get_fsname(inode->i_sb, NULL, 0),
3383 PFID(&lli->lli_fid), rc);
3389 /* set layout to file. Unlikely this will fail as old layout was
3390 * surely eliminated */
3391 memset(&conf, 0, sizeof conf);
3392 conf.coc_opc = OBJECT_CONF_SET;
3393 conf.coc_inode = inode;
3394 conf.coc_lock = lock;
3395 conf.u.coc_md = &md;
3396 rc = ll_layout_conf(inode, &conf);
3399 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3401 /* refresh layout failed, need to wait */
3402 wait_layout = rc == -EBUSY;
/* Done with the lock itself; drop both the handle2lock ref and the
 * caller's enqueue reference. */
3406 LDLM_LOCK_PUT(lock);
3407 ldlm_lock_decref(lockh, mode);
3409 /* wait for IO to complete if it's still being used. */
3411 CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
3412 ll_get_fsname(inode->i_sb, NULL, 0),
3413 inode, PFID(&lli->lli_fid));
3415 memset(&conf, 0, sizeof conf);
3416 conf.coc_opc = OBJECT_CONF_WAIT;
3417 conf.coc_inode = inode;
3418 rc = ll_layout_conf(inode, &conf);
3422 CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
3423 PFID(&lli->lli_fid), rc);
3429 * This function checks if there exists a LAYOUT lock on the client side,
3430 * or enqueues it if it doesn't have one in cache.
3432 * This function will not hold layout lock so it may be revoked any time after
3433 * this function returns. Any operations depend on layout should be redone
3436 * This function should be called before lov_io_init() to get an uptodate
3437 * layout version, the caller should save the version number and after IO
3438 * is finished, this function should be called again to verify that layout
3439 * is not changed during IO time.
3441 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3443 struct ll_inode_info *lli = ll_i2info(inode);
3444 struct ll_sb_info *sbi = ll_i2sbi(inode);
3445 struct md_op_data *op_data;
3446 struct lookup_intent it;
3447 struct lustre_handle lockh;
3449 struct ldlm_enqueue_info einfo = {
3450 .ei_type = LDLM_IBITS,
3452 .ei_cb_bl = ll_md_blocking_ast,
3453 .ei_cb_cp = ldlm_completion_ast,
3458 *gen = lli->lli_layout_gen;
3459 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK))
3463 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3464 LASSERT(S_ISREG(inode->i_mode));
3466 /* mostly layout lock is caching on the local side, so try to match
3467 * it before grabbing layout lock mutex. */
3468 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
3469 if (mode != 0) { /* hit cached lock */
3470 rc = ll_layout_lock_set(&lockh, mode, inode, gen, false);
3474 /* better hold lli_layout_mutex to try again otherwise
3475 * it will have starvation problem. */
3478 /* take layout lock mutex to enqueue layout lock exclusively. */
3479 mutex_lock(&lli->lli_layout_mutex);
3482 /* try again. Maybe somebody else has done this. */
3483 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
3484 if (mode != 0) { /* hit cached lock */
3485 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3489 mutex_unlock(&lli->lli_layout_mutex);
3493 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3494 0, 0, LUSTRE_OPC_ANY, NULL);
3495 if (IS_ERR(op_data)) {
3496 mutex_unlock(&lli->lli_layout_mutex);
3497 RETURN(PTR_ERR(op_data));
3500 /* have to enqueue one */
3501 memset(&it, 0, sizeof(it));
3502 it.it_op = IT_LAYOUT;
3503 lockh.cookie = 0ULL;
3505 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
3506 ll_get_fsname(inode->i_sb, NULL, 0), inode,
3507 PFID(&lli->lli_fid));
3509 rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
3511 if (it.d.lustre.it_data != NULL)
3512 ptlrpc_req_finished(it.d.lustre.it_data);
3513 it.d.lustre.it_data = NULL;
3515 ll_finish_md_op_data(op_data);
3517 mode = it.d.lustre.it_lock_mode;
3518 it.d.lustre.it_lock_mode = 0;
3519 ll_intent_drop_lock(&it);
3522 /* set lock data in case this is a new lock */
3523 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3524 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3528 mutex_unlock(&lli->lli_layout_mutex);