4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2013, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include "llite_internal.h"
49 #include <lustre/ll_fiemap.h>
51 #include "cl_object.h"
/*
 * Allocate a per-open-file ll_file_data object from its dedicated slab
 * cache, usable in IO context (CFS_ALLOC_IO), and reset its write-failed
 * flag.  NOTE(review): this excerpt is lossy (see gaps in the embedded
 * line numbering); the NULL-check/return path is not visible here.
 */
53 struct ll_file_data *ll_file_data_get(void)
55 struct ll_file_data *fd;
57 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, CFS_ALLOC_IO);
/* start each open with a clean write-failure state */
61 fd->fd_write_failed = false;
/* Return a ll_file_data previously obtained from ll_file_data_get()
 * to the ll_file_data_slab cache. */
66 static void ll_file_data_put(struct ll_file_data *fd)
69 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Pack an inode's current attributes (mode, a/m/ctime, size, blocks,
 * flags), its IO epoch, the given open handle @fh and the MDS capability
 * into @op_data for an MDS RPC.  (Lossy excerpt: some original lines are
 * missing, as the embedded numbering gaps show.)
 */
72 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
73 struct lustre_handle *fh)
75 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
76 op_data->op_attr.ia_mode = inode->i_mode;
77 op_data->op_attr.ia_atime = inode->i_atime;
78 op_data->op_attr.ia_mtime = inode->i_mtime;
79 op_data->op_attr.ia_ctime = inode->i_ctime;
80 op_data->op_attr.ia_size = i_size_read(inode);
81 op_data->op_attr_blocks = inode->i_blocks;
/* translate in-kernel inode flags into the on-the-wire ext encoding */
82 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
83 ll_inode_to_ext_flags(inode->i_flags);
84 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
86 op_data->op_handle = *fh;
87 op_data->op_capa1 = ll_mdscapa_get(inode);
/* propagate the data-modified hint so the MDS refreshes SOM attributes */
89 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
90 op_data->op_bias |= MDS_DATA_MODIFIED;
94 * Closes the IO epoch and packs all the attributes into @op_data for
/*
 * the close RPC to the MDS.  Size/blocks are only declared valid when
 * the server lacks Size-on-MDS support or the file is not regular.
 * (Lossy excerpt -- some original lines are missing from view.)
 */
97 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
98 struct obd_client_handle *och)
102 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
103 ATTR_MTIME | ATTR_MTIME_SET |
104 ATTR_CTIME | ATTR_CTIME_SET;
/* read-only handles skip the write-side attribute handling below --
 * NOTE(review): the branch body is among the missing lines */
106 if (!(och->och_flags & FMODE_WRITE))
109 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
110 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* close the IO epoch (may set MF_EPOCH_CLOSE in op_data->op_flags) */
112 ll_ioepoch_close(inode, op_data, &och, 0);
115 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
116 ll_prep_md_op_data(op_data, inode, NULL, NULL,
117 0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Send the MDS close RPC for an open handle @och on @inode, perform the
 * Size-on-MDS update when the server requests it, clear the
 * LLIF_DATA_MODIFIED flag on success, destroy orphan OST objects named
 * in the close reply, and release replay data for @och.
 * (Lossy excerpt: gaps in the embedded numbering mark missing lines.)
 */
121 static int ll_close_inode_openhandle(struct obd_export *md_exp,
123 struct obd_client_handle *och)
125 struct obd_export *exp = ll_i2mdexp(inode);
126 struct md_op_data *op_data;
127 struct ptlrpc_request *req = NULL;
128 struct obd_device *obd = class_exp2obd(exp);
135 * XXX: in case of LMV, is this correct to access
138 CERROR("Invalid MDC connection handle "LPX64"\n",
139 ll_i2mdexp(inode)->exp_handle.h_cookie);
143 OBD_ALLOC_PTR(op_data);
145 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
147 ll_prepare_close(inode, op_data, och);
/* remember whether this close also closed the IO epoch */
148 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
149 rc = md_close(md_exp, op_data, och->och_mod, &req);
151 /* This close must have the epoch closed. */
152 LASSERT(epoch_close);
153 /* MDS has instructed us to obtain Size-on-MDS attribute from
154 * OSTs and send setattr to back to MDS. */
155 rc = ll_som_update(inode, op_data);
157 CERROR("inode %lu mdc Size-on-MDS update failed: "
158 "rc = %d\n", inode->i_ino, rc);
162 CERROR("inode %lu mdc close failed: rc = %d\n",
166 /* DATA_MODIFIED flag was successfully sent on close, cancel data
167 * modification flag. */
168 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
169 struct ll_inode_info *lli = ll_i2info(inode);
171 spin_lock(&lli->lli_lock);
172 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
173 spin_unlock(&lli->lli_lock);
176 ll_finish_md_op_data(op_data);
/* destroy OST objects the close reply told us are now orphans */
179 rc = ll_objects_destroy(req, inode);
181 CERROR("inode %lu ll_objects destroy: rc = %d\n",
/* with SOM, a write handle whose epoch stayed open must queue a
 * DONE_WRITING pass to finish the epoch later */
188 if (exp_connect_som(exp) && !epoch_close &&
189 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
190 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
192 md_clear_open_replay_data(md_exp, och);
193 /* Free @och if it is not waiting for DONE_WRITING. */
194 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
197 if (req) /* This is close request */
198 ptlrpc_req_finished(req);
/*
 * Drop the MDS open handle of the kind selected by @flags
 * (write/exec/read) once its last user is gone.  Under lli_och_mutex the
 * matching handle pointer and use count are picked; if users remain the
 * handle is kept.  (Lossy excerpt: the och detach lines are missing.)
 */
202 int ll_md_real_close(struct inode *inode, int flags)
204 struct ll_inode_info *lli = ll_i2info(inode);
205 struct obd_client_handle **och_p;
206 struct obd_client_handle *och;
/* select which of the three per-inode MDS handles @flags refers to */
211 if (flags & FMODE_WRITE) {
212 och_p = &lli->lli_mds_write_och;
213 och_usecount = &lli->lli_open_fd_write_count;
214 } else if (flags & FMODE_EXEC) {
215 och_p = &lli->lli_mds_exec_och;
216 och_usecount = &lli->lli_open_fd_exec_count;
218 LASSERT(flags & FMODE_READ);
219 och_p = &lli->lli_mds_read_och;
220 och_usecount = &lli->lli_open_fd_read_count;
223 mutex_lock(&lli->lli_och_mutex);
224 if (*och_usecount) { /* There are still users of this handle, so
226 mutex_unlock(&lli->lli_och_mutex);
231 mutex_unlock(&lli->lli_och_mutex);
233 if (och) { /* There might be a race and somebody have freed this och
235 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-struct-file close: drop any group lock, decrement the open count
 * for this file's open mode, and only talk to the MDS (via
 * ll_md_real_close) when no cached OPEN DLM lock can stand in for the
 * close.  Finally detach and free the ll_file_data and close the MDS
 * capability.  (Lossy excerpt; numbering gaps mark missing lines.)
 */
242 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
245 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
246 struct ll_inode_info *lli = ll_i2info(inode);
250 /* clear group lock, if present */
251 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
252 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
254 /* Let's see if we have good enough OPEN lock on the file and if
255 we can skip talking to MDS */
256 if (file->f_dentry->d_inode) { /* Can this ever be false? */
258 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
259 struct lustre_handle lockh;
260 struct inode *inode = file->f_dentry->d_inode;
261 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
263 mutex_lock(&lli->lli_och_mutex);
/* balance the open-count taken by ll_file_open for this mode */
264 if (fd->fd_omode & FMODE_WRITE) {
266 LASSERT(lli->lli_open_fd_write_count);
267 lli->lli_open_fd_write_count--;
268 } else if (fd->fd_omode & FMODE_EXEC) {
270 LASSERT(lli->lli_open_fd_exec_count);
271 lli->lli_open_fd_exec_count--;
274 LASSERT(lli->lli_open_fd_read_count);
275 lli->lli_open_fd_read_count--;
277 mutex_unlock(&lli->lli_och_mutex);
/* no matching cached OPEN lock -> must really close on the MDS */
279 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
280 LDLM_IBITS, &policy, lockmode,
282 rc = ll_md_real_close(file->f_dentry->d_inode,
286 CERROR("Releasing a file %p with negative dentry %p. Name %s",
287 file, file->f_dentry, file->f_dentry->d_name.name);
290 LUSTRE_FPRIVATE(file) = NULL;
291 ll_file_data_put(fd);
292 ll_capa_close(inode);
297 /* While this returns an error code, fput() the caller does not, so we need
298 * to make every effort to clean up all of our state here. Also, applications
299 * rarely check close errors and even if an error is returned they will not
300 * re-try the close call.
/*
 * VFS ->release() hook: tear down remote-ACL state for the root inode,
 * stop a statahead thread this fd started, flush async write errors on
 * regular files, and hand off to ll_md_close().  (Lossy excerpt.)
 */
302 int ll_file_release(struct inode *inode, struct file *file)
304 struct ll_file_data *fd;
305 struct ll_sb_info *sbi = ll_i2sbi(inode);
306 struct ll_inode_info *lli = ll_i2info(inode);
310 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
311 inode->i_generation, inode);
313 #ifdef CONFIG_FS_POSIX_ACL
/* remote-client ACL bookkeeping is only kept on the filesystem root */
314 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
315 inode == inode->i_sb->s_root->d_inode) {
316 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
319 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
320 fd->fd_flags &= ~LL_FILE_RMTACL;
321 rct_del(&sbi->ll_rct, cfs_curproc_pid());
322 et_search_free(&sbi->ll_et, cfs_curproc_pid());
327 if (inode->i_sb->s_root != file->f_dentry)
328 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
329 fd = LUSTRE_FPRIVATE(file);
332 /* The last ref on @file, maybe not the the owner pid of statahead.
333 * Different processes can open the same dir, "ll_opendir_key" means:
334 * it is me that should stop the statahead thread. */
335 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
336 lli->lli_opendir_pid != 0)
337 ll_stop_statahead(inode, lli->lli_opendir_key);
/* the root directory never opened an MDS handle -- just free fd */
339 if (inode->i_sb->s_root == file->f_dentry) {
340 LUSTRE_FPRIVATE(file) = NULL;
341 ll_file_data_put(fd);
345 if (!S_ISDIR(inode->i_mode)) {
/* surface (and then clear) any deferred async write error */
346 lov_read_and_clear_async_rc(lli->lli_clob);
347 lli->lli_async_rc = 0;
350 rc = ll_md_close(sbi->ll_md_exp, inode, file);
352 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
353 libcfs_debug_dumplog();
/*
 * Issue an IT_OPEN intent lock request to the MDS for @file, optionally
 * carrying striping data (@lmm/@lmmsize, used by setstripe).  When no
 * striping is being set, an OPEN lock is requested.  On success the
 * inode is refreshed from the reply and lock data recorded.
 * (Lossy excerpt: error/exit paths are partially missing.)
 */
358 static int ll_intent_file_open(struct file *file, void *lmm,
359 int lmmsize, struct lookup_intent *itp)
361 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
362 struct dentry *parent = file->f_dentry->d_parent;
363 const char *name = file->f_dentry->d_name.name;
364 const int len = file->f_dentry->d_name.len;
365 struct md_op_data *op_data;
366 struct ptlrpc_request *req;
367 __u32 opc = LUSTRE_OPC_ANY;
374 /* Usually we come here only for NFSD, and we want open lock.
375 But we can also get here with pre 2.6.15 patchless kernels, and in
376 that case that lock is also ok */
377 /* We can also get here if there was cached open handle in revalidate_it
378 * but it disappeared while we were getting from there to ll_file_open.
379 * But this means this file was closed and immediatelly opened which
380 * makes a good candidate for using OPEN lock */
381 /* If lmmsize & lmm are not 0, we are just setting stripe info
382 * parameters. No need for the open lock */
383 if (lmm == NULL && lmmsize == 0) {
384 itp->it_flags |= MDS_OPEN_LOCK;
385 if (itp->it_flags & FMODE_WRITE)
386 opc = LUSTRE_OPC_CREATE;
389 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
390 file->f_dentry->d_inode, name, len,
393 RETURN(PTR_ERR(op_data));
/* the inode is already known -- open it by FID, not by name lookup */
395 itp->it_flags |= MDS_OPEN_BY_FID;
396 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
397 0 /*unused */, &req, ll_md_blocking_ast, 0);
398 ll_finish_md_op_data(op_data);
400 /* reason for keep own exit path - don`t flood log
401 * with messages with -ESTALE errors.
403 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
404 it_open_error(DISP_OPEN_OPEN, itp))
406 ll_release_openhandle(file->f_dentry, itp);
410 if (it_disposition(itp, DISP_LOOKUP_NEG))
411 GOTO(out, rc = -ENOENT);
413 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
414 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
415 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* update the inode from the intent reply, then record lock data */
419 rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
420 if (!rc && itp->d.lustre.it_lock_mode)
421 ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
425 ptlrpc_req_finished(itp->d.lustre.it_data);
426 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
427 ll_intent_drop_lock(itp);
433 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
434 * not believe attributes if a few ioepoch holders exist. Attributes for
435 * previous ioepoch if new one is opened are also skipped by MDS.
437 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
/* only record a real (non-zero) epoch, and only when it changed */
439 if (ioepoch && lli->lli_ioepoch != ioepoch) {
440 lli->lli_ioepoch = ioepoch;
441 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
442 ioepoch, PFID(&lli->lli_fid));
/*
 * Populate an obd_client_handle @och from an open intent reply: copy the
 * server file handle, FID, open flags and IO epoch, then register the
 * open request for replay.  Returns md_set_open_replay_data()'s result.
 */
446 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
447 struct lookup_intent *it, struct obd_client_handle *och)
449 struct ptlrpc_request *req = it->d.lustre.it_data;
450 struct mdt_body *body;
454 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
455 LASSERT(body != NULL); /* reply already checked out */
457 memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
458 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
459 och->och_fid = lli->lli_fid;
460 och->och_flags = it->it_flags;
461 ll_ioepoch_open(lli, body->ioepoch);
/* keep the open request so it can be replayed after MDS recovery */
463 return md_set_open_replay_data(md_exp, och, req);
/*
 * Finish the client-local part of an open: fill @och from the intent
 * when one is supplied, attach @fd as the file's private data, and
 * initialize readahead state plus the open mode.  (Lossy excerpt:
 * several lines, including error handling, are missing from view.)
 */
466 int ll_local_open(struct file *file, struct lookup_intent *it,
467 struct ll_file_data *fd, struct obd_client_handle *och)
469 struct inode *inode = file->f_dentry->d_inode;
470 struct ll_inode_info *lli = ll_i2info(inode);
473 LASSERT(!LUSTRE_FPRIVATE(file));
478 struct ptlrpc_request *req = it->d.lustre.it_data;
479 struct mdt_body *body;
482 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
486 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
487 if ((it->it_flags & FMODE_WRITE) &&
488 (body->valid & OBD_MD_FLSIZE))
489 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
490 lli->lli_ioepoch, PFID(&lli->lli_fid));
493 LUSTRE_FPRIVATE(file) = fd;
494 ll_readahead_init(inode, &fd->fd_ras);
/* remember the effective open mode for the matching close */
495 fd->fd_omode = it->it_flags;
499 /* Open a file, and (for the very first open) create objects on the OSTs at
500 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
501 * creation or open until ll_lov_setstripe() ioctl is called.
503 * If we already have the stripe MD locally then we don't request it in
504 * md_open(), by passing a lmm_size = 0.
506 * It is up to the application to ensure no other processes open this file
507 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
508 * used. We might be able to avoid races of that sort by getting lli_open_sem
509 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
510 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/*
 * VFS ->open() hook.  Either reuses an intent prepared during lookup
 * (file->private_data) or builds a fresh IT_OPEN intent from f_flags,
 * shares a per-inode MDS open handle per mode (read/write/exec) under
 * lli_och_mutex, and completes the open locally via ll_local_open().
 * (Lossy excerpt: numbering gaps mark missing lines throughout.)
 */
512 int ll_file_open(struct inode *inode, struct file *file)
514 struct ll_inode_info *lli = ll_i2info(inode);
515 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
516 .it_flags = file->f_flags };
517 struct obd_client_handle **och_p = NULL;
518 __u64 *och_usecount = NULL;
519 struct ll_file_data *fd;
520 int rc = 0, opendir_set = 0;
523 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
524 inode->i_generation, inode, file->f_flags);
526 it = file->private_data; /* XXX: compat macro */
527 file->private_data = NULL; /* prevent ll_local_open assertion */
529 fd = ll_file_data_get();
531 GOTO(out_openerr, rc = -ENOMEM);
/* first opener of a directory claims statahead ownership via fd */
534 if (S_ISDIR(inode->i_mode)) {
535 spin_lock(&lli->lli_sa_lock);
536 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
537 lli->lli_opendir_pid == 0) {
538 lli->lli_opendir_key = fd;
539 lli->lli_opendir_pid = cfs_curproc_pid();
542 spin_unlock(&lli->lli_sa_lock);
/* the filesystem root needs no MDS open handle */
545 if (inode->i_sb->s_root == file->f_dentry) {
546 LUSTRE_FPRIVATE(file) = fd;
550 if (!it || !it->d.lustre.it_disposition) {
551 /* Convert f_flags into access mode. We cannot use file->f_mode,
552 * because everything but O_ACCMODE mask was stripped from
554 if ((oit.it_flags + 1) & O_ACCMODE)
556 if (file->f_flags & O_TRUNC)
557 oit.it_flags |= FMODE_WRITE;
559 /* kernel only call f_op->open in dentry_open. filp_open calls
560 * dentry_open after call to open_namei that checks permissions.
561 * Only nfsd_open call dentry_open directly without checking
562 * permissions and because of that this code below is safe. */
563 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
564 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
566 /* We do not want O_EXCL here, presumably we opened the file
567 * already? XXX - NFS implications? */
568 oit.it_flags &= ~O_EXCL;
570 /* bug20584, if "it_flags" contains O_CREAT, the file will be
571 * created if necessary, then "IT_CREAT" should be set to keep
572 * consistent with it */
573 if (oit.it_flags & O_CREAT)
574 oit.it_op |= IT_CREAT;
580 /* Let's see if we have file open on MDS already. */
581 if (it->it_flags & FMODE_WRITE) {
582 och_p = &lli->lli_mds_write_och;
583 och_usecount = &lli->lli_open_fd_write_count;
584 } else if (it->it_flags & FMODE_EXEC) {
585 och_p = &lli->lli_mds_exec_och;
586 och_usecount = &lli->lli_open_fd_exec_count;
588 och_p = &lli->lli_mds_read_och;
589 och_usecount = &lli->lli_open_fd_read_count;
592 mutex_lock(&lli->lli_och_mutex);
593 if (*och_p) { /* Open handle is present */
594 if (it_disposition(it, DISP_OPEN_OPEN)) {
595 /* Well, there's extra open request that we do not need,
596 let's close it somehow. This will decref request. */
597 rc = it_open_error(DISP_OPEN_OPEN, it);
599 mutex_unlock(&lli->lli_och_mutex);
600 GOTO(out_openerr, rc);
603 ll_release_openhandle(file->f_dentry, it);
/* share the existing handle; no och needed for this fd */
607 rc = ll_local_open(file, it, fd, NULL);
610 mutex_unlock(&lli->lli_och_mutex);
611 GOTO(out_openerr, rc);
614 LASSERT(*och_usecount == 0);
615 if (!it->d.lustre.it_disposition) {
616 /* We cannot just request lock handle now, new ELC code
617 means that one of other OPEN locks for this file
618 could be cancelled, and since blocking ast handler
619 would attempt to grab och_mutex as well, that would
620 result in a deadlock */
621 mutex_unlock(&lli->lli_och_mutex);
622 it->it_create_mode |= M_CHECK_STALE;
623 rc = ll_intent_file_open(file, NULL, 0, it);
624 it->it_create_mode &= ~M_CHECK_STALE;
626 GOTO(out_openerr, rc);
630 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
632 GOTO(out_och_free, rc = -ENOMEM);
636 /* md_intent_lock() didn't get a request ref if there was an
637 * open error, so don't do cleanup on the request here
639 /* XXX (green): Should not we bail out on any error here, not
640 * just open error? */
641 rc = it_open_error(DISP_OPEN_OPEN, it);
643 GOTO(out_och_free, rc);
645 LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
647 rc = ll_local_open(file, it, fd, *och_p);
649 GOTO(out_och_free, rc);
651 mutex_unlock(&lli->lli_och_mutex);
654 /* Must do this outside lli_och_mutex lock to prevent deadlock where
655 different kind of OPEN lock for this same inode gets cancelled
656 by ldlm_cancel_lru */
657 if (!S_ISREG(inode->i_mode))
658 GOTO(out_och_free, rc);
/* no striping yet: delay OST object creation for O_LOV_DELAY_CREATE
 * or read-only opens */
662 if (!lli->lli_has_smd) {
663 if (file->f_flags & O_LOV_DELAY_CREATE ||
664 !(file->f_mode & FMODE_WRITE)) {
665 CDEBUG(D_INODE, "object creation was delayed\n");
666 GOTO(out_och_free, rc);
669 file->f_flags &= ~O_LOV_DELAY_CREATE;
670 GOTO(out_och_free, rc);
/* error cleanup: free a handle we allocated but failed to install */
674 if (och_p && *och_p) {
675 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
676 *och_p = NULL; /* OBD_FREE writes some magic there */
679 mutex_unlock(&lli->lli_och_mutex);
682 if (opendir_set != 0)
683 ll_stop_statahead(inode, lli->lli_opendir_key);
685 ll_file_data_put(fd);
687 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
690 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
691 ptlrpc_req_finished(it->d.lustre.it_data);
692 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
698 /* Fills the obdo with the attributes for the lsm */
/*
 * Do an async OST getattr for @lsm into @obdo under @ioepoch; when
 * @sync is set the server-side lock flag (OBD_FL_SRVLOCK) is requested.
 * On return o_valid is masked down to the fields the caller may trust.
 * (Lossy excerpt: oa setup/error lines are partially missing.)
 */
699 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
700 struct obd_capa *capa, struct obdo *obdo,
701 __u64 ioepoch, int sync)
703 struct ptlrpc_request_set *set;
704 struct obd_info oinfo = { { { 0 } } };
709 LASSERT(lsm != NULL);
713 oinfo.oi_oa->o_oi = lsm->lsm_oi;
714 oinfo.oi_oa->o_mode = S_IFREG;
715 oinfo.oi_oa->o_ioepoch = ioepoch;
716 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
717 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
718 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
719 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
720 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
721 OBD_MD_FLDATAVERSION;
722 oinfo.oi_capa = capa;
/* sync mode: ask the OST to take the lock server-side */
724 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
725 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
728 set = ptlrpc_prep_set();
730 CERROR("can't allocate ptlrpc set\n");
733 rc = obd_getattr_async(exp, &oinfo, set);
735 rc = ptlrpc_set_wait(set);
736 ptlrpc_set_destroy(set);
/* restrict o_valid to the attributes actually merged from the OSTs */
739 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
740 OBD_MD_FLATIME | OBD_MD_FLMTIME |
741 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
742 OBD_MD_FLDATAVERSION);
747 * Performs the getattr on the inode and updates its fields.
748 * If @sync != 0, perform the getattr under the server-side lock.
750 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
751 __u64 ioepoch, int sync)
753 struct obd_capa *capa = ll_mdscapa_get(inode);
754 struct lov_stripe_md *lsm;
758 lsm = ccc_inode_lsm_get(inode);
759 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
760 capa, obdo, ioepoch, sync);
763 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
/* fold the freshly fetched OST attributes into the VFS inode */
765 obdo_refresh_inode(inode, obdo, obdo->o_valid);
766 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
767 " blksize %lu\n", POSTID(oi), i_size_read(inode),
768 (unsigned long long)inode->i_blocks,
769 (unsigned long)ll_inode_blksize(inode));
771 ccc_inode_lsm_put(inode, lsm);
/*
 * Merge attributes cached from the MDS (lli_lvb) with attributes held
 * by the cl_object layer (from the OSTs), taking the most recent of
 * each timestamp, and update the inode's size/blocks under the inode
 * size lock.  (Lossy excerpt; some lines are missing from view.)
 */
775 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
777 struct ll_inode_info *lli = ll_i2info(inode);
778 struct cl_object *obj = lli->lli_clob;
779 struct cl_attr *attr = ccc_env_thread_attr(env);
785 ll_inode_size_lock(inode);
786 /* merge timestamps the most recently obtained from mds with
787 timestamps obtained from osts */
788 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
789 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
790 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
791 inode_init_lvb(inode, &lvb);
793 cl_object_attr_lock(obj);
794 rc = cl_object_attr_get(env, obj, attr);
795 cl_object_attr_unlock(obj);
/* prefer the newer timestamp from the OST-side attributes */
798 if (lvb.lvb_atime < attr->cat_atime)
799 lvb.lvb_atime = attr->cat_atime;
800 if (lvb.lvb_ctime < attr->cat_ctime)
801 lvb.lvb_ctime = attr->cat_ctime;
802 if (lvb.lvb_mtime < attr->cat_mtime)
803 lvb.lvb_mtime = attr->cat_mtime;
805 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
806 PFID(&lli->lli_fid), attr->cat_size);
807 cl_isize_write_nolock(inode, attr->cat_size);
809 inode->i_blocks = attr->cat_blocks;
811 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
812 LTIME_S(inode->i_atime) = lvb.lvb_atime;
813 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
815 ll_inode_size_unlock(inode);
/*
 * Glimpse helper for ioctls: fetch current OST attributes for @lsm and
 * copy size/blocks/times into the caller-supplied stat structure @st.
 * (Lossy excerpt: the rc check between getattr and the copies is not
 * visible here.)
 */
820 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
823 struct obdo obdo = { 0 };
826 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
828 st->st_size = obdo.o_size;
829 st->st_blocks = obdo.o_blocks;
830 st->st_mtime = obdo.o_mtime;
831 st->st_atime = obdo.o_atime;
832 st->st_ctime = obdo.o_ctime;
/*
 * Initialize a cl_io for a read or write on @file: set nonblock/append/
 * sync flags from f_flags, attach the cl_object, and choose the DLM
 * locking policy (never for nolock files, mandatory for O_APPEND,
 * otherwise "maybe").
 */
837 void ll_io_init(struct cl_io *io, const struct file *file, int write)
839 struct inode *inode = file->f_dentry->d_inode;
841 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
843 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
844 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
845 file->f_flags & O_DIRECT ||
848 io->ci_obj = ll_i2info(inode)->lli_clob;
849 io->ci_lockreq = CILR_MAYBE;
850 if (ll_file_nolock(file)) {
/* lockless IO: skip client DLM locks and ask for server-side locking */
851 io->ci_lockreq = CILR_NEVER;
852 io->ci_no_srvlock = 1;
853 } else if (file->f_flags & O_APPEND) {
/* appends must serialize against concurrent writers */
854 io->ci_lockreq = CILR_MANDATORY;
/*
 * Common driver for all read/write entry points (readv/aio/sendfile/
 * splice).  Sets up a cl_io of type @iot for @count bytes at *ppos,
 * dispatches on the IO subtype to fill in iovecs/actors/pipes, runs
 * the cl_io loop under the appropriate write-mutex/trunc-sem, updates
 * *ppos and per-mount stats, and tracks write failures in the fd.
 * (Lossy excerpt: case labels and several lines are missing from view.)
 */
859 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
860 struct file *file, enum cl_io_type iot,
861 loff_t *ppos, size_t count)
863 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
864 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
870 io = ccc_env_thread_io(env);
871 ll_io_init(io, file, iot == CIT_WRITE);
873 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
874 struct vvp_io *vio = vvp_env_io(env);
875 struct ccc_io *cio = ccc_env_io(env);
876 int write_mutex_locked = 0;
878 cio->cui_fd = LUSTRE_FPRIVATE(file);
879 vio->cui_io_subtype = args->via_io_subtype;
881 switch (vio->cui_io_subtype) {
883 cio->cui_iov = args->u.normal.via_iov;
884 cio->cui_nrsegs = args->u.normal.via_nrsegs;
885 cio->cui_tot_nrsegs = cio->cui_nrsegs;
886 #ifndef HAVE_FILE_WRITEV
887 cio->cui_iocb = args->u.normal.via_iocb;
/* non-group-locked writes serialize on lli_write_mutex; reads only
 * take the truncate semaphore shared */
889 if ((iot == CIT_WRITE) &&
890 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
891 if (mutex_lock_interruptible(&lli->
893 GOTO(out, result = -ERESTARTSYS);
894 write_mutex_locked = 1;
895 } else if (iot == CIT_READ) {
896 down_read(&lli->lli_trunc_sem);
900 vio->u.sendfile.cui_actor = args->u.sendfile.via_actor;
901 vio->u.sendfile.cui_target = args->u.sendfile.via_target;
904 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
905 vio->u.splice.cui_flags = args->u.splice.via_flags;
908 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
911 result = cl_io_loop(env, io);
912 if (write_mutex_locked)
913 mutex_unlock(&lli->lli_write_mutex);
914 else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
915 up_read(&lli->lli_trunc_sem);
917 /* cl_io_rw_init() handled IO */
918 result = io->ci_result;
921 if (io->ci_nob > 0) {
923 *ppos = io->u.ci_wr.wr.crw_pos;
928 /* If any bit been read/written (result != 0), we just return
929 * short read/write instead of restart io. */
930 if (result == 0 && io->ci_need_restart) {
931 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
932 iot == CIT_READ ? "read" : "write",
933 file->f_dentry->d_name.name, *ppos, count);
934 LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
938 if (iot == CIT_READ) {
940 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
941 LPROC_LL_READ_BYTES, result);
942 } else if (iot == CIT_WRITE) {
944 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
945 LPROC_LL_WRITE_BYTES, result);
946 fd->fd_write_failed = false;
947 } else if (result != -ERESTARTSYS) {
/* remember the failure so close() can report it to the application */
948 fd->fd_write_failed = true;
957 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array and compute the total byte count.  Rejects
 * negative segment lengths / wrapped totals and truncates *nr_segs at
 * the first inaccessible segment.  (Lossy excerpt: the count accumulation
 * and return lines are missing from view.)
 */
959 static int ll_file_get_iov_count(const struct iovec *iov,
960 unsigned long *nr_segs, size_t *count)
965 for (seg = 0; seg < *nr_segs; seg++) {
966 const struct iovec *iv = &iov[seg];
969 * If any segment has a negative length, or the cumulative
970 * length ever wraps negative then return -EINVAL.
973 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
975 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
980 cnt -= iv->iov_len; /* This segment is no good */
987 #ifdef HAVE_FILE_READV
/* Vectored read entry point (pre-aio kernels): validate the iovec,
 * then run a CIT_READ through ll_file_io_generic(). */
988 static ssize_t ll_file_readv(struct file *file, const struct iovec *iov,
989 unsigned long nr_segs, loff_t *ppos)
992 struct vvp_io_args *args;
998 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1002 env = cl_env_get(&refcheck);
1004 RETURN(PTR_ERR(env));
1006 args = vvp_env_args(env, IO_NORMAL);
1007 args->u.normal.via_iov = (struct iovec *)iov;
1008 args->u.normal.via_nrsegs = nr_segs;
1010 result = ll_file_io_generic(env, args, file, CIT_READ, ppos, count);
1011 cl_env_put(env, &refcheck);
/* Plain read(2) entry point (readv flavor): wrap the user buffer in a
 * single-segment iovec and delegate to ll_file_readv(). */
1015 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1019 struct iovec *local_iov;
1024 env = cl_env_get(&refcheck);
1026 RETURN(PTR_ERR(env));
/* per-thread scratch iovec from the cl_env, no allocation needed */
1028 local_iov = &vvp_env_info(env)->vti_local_iov;
1029 local_iov->iov_base = (void __user *)buf;
1030 local_iov->iov_len = count;
1031 result = ll_file_readv(file, local_iov, 1, ppos);
1032 cl_env_put(env, &refcheck);
/* aio_read entry point: validate the iovec, attach the kiocb, and run a
 * CIT_READ through ll_file_io_generic() at iocb->ki_pos. */
1037 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1038 unsigned long nr_segs, loff_t pos)
1041 struct vvp_io_args *args;
1047 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1051 env = cl_env_get(&refcheck);
1053 RETURN(PTR_ERR(env));
1055 args = vvp_env_args(env, IO_NORMAL);
1056 args->u.normal.via_iov = (struct iovec *)iov;
1057 args->u.normal.via_nrsegs = nr_segs;
1058 args->u.normal.via_iocb = iocb;
1060 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1061 &iocb->ki_pos, count);
1062 cl_env_put(env, &refcheck);
/* Plain read(2) entry point (aio flavor): build a synchronous kiocb and
 * single-segment iovec, call ll_file_aio_read(), then propagate the
 * updated position back to *ppos. */
1066 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1070 struct iovec *local_iov;
1071 struct kiocb *kiocb;
1076 env = cl_env_get(&refcheck);
1078 RETURN(PTR_ERR(env));
1080 local_iov = &vvp_env_info(env)->vti_local_iov;
1081 kiocb = &vvp_env_info(env)->vti_kiocb;
1082 local_iov->iov_base = (void __user *)buf;
1083 local_iov->iov_len = count;
1084 init_sync_kiocb(kiocb, file);
1085 kiocb->ki_pos = *ppos;
1086 kiocb->ki_left = count;
1088 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1089 *ppos = kiocb->ki_pos;
1091 cl_env_put(env, &refcheck);
1097 * Write to a file (through the page cache).
1099 #ifdef HAVE_FILE_WRITEV
/* Vectored write entry point (pre-aio kernels): validate the iovec,
 * then run a CIT_WRITE through ll_file_io_generic(). */
1100 static ssize_t ll_file_writev(struct file *file, const struct iovec *iov,
1101 unsigned long nr_segs, loff_t *ppos)
1104 struct vvp_io_args *args;
1110 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1114 env = cl_env_get(&refcheck);
1116 RETURN(PTR_ERR(env));
1118 args = vvp_env_args(env, IO_NORMAL);
1119 args->u.normal.via_iov = (struct iovec *)iov;
1120 args->u.normal.via_nrsegs = nr_segs;
1122 result = ll_file_io_generic(env, args, file, CIT_WRITE, ppos, count);
1123 cl_env_put(env, &refcheck);
/* Plain write(2) entry point (writev flavor): wrap the user buffer in a
 * single-segment iovec and delegate to ll_file_writev(). */
1127 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1131 struct iovec *local_iov;
1136 env = cl_env_get(&refcheck);
1138 RETURN(PTR_ERR(env));
1140 local_iov = &vvp_env_info(env)->vti_local_iov;
1141 local_iov->iov_base = (void __user *)buf;
1142 local_iov->iov_len = count;
1144 result = ll_file_writev(file, local_iov, 1, ppos);
1145 cl_env_put(env, &refcheck);
1149 #else /* AIO stuff */
/* aio_write entry point: validate the iovec, attach the kiocb, and run a
 * CIT_WRITE through ll_file_io_generic() at iocb->ki_pos. */
1150 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1151 unsigned long nr_segs, loff_t pos)
1154 struct vvp_io_args *args;
1160 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1164 env = cl_env_get(&refcheck);
1166 RETURN(PTR_ERR(env));
1168 args = vvp_env_args(env, IO_NORMAL);
1169 args->u.normal.via_iov = (struct iovec *)iov;
1170 args->u.normal.via_nrsegs = nr_segs;
1171 args->u.normal.via_iocb = iocb;
1173 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1174 &iocb->ki_pos, count);
1175 cl_env_put(env, &refcheck);
/* Plain write(2) entry point (aio flavor): build a synchronous kiocb and
 * single-segment iovec, call ll_file_aio_write(), then propagate the
 * updated position back to *ppos. */
1179 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1183 struct iovec *local_iov;
1184 struct kiocb *kiocb;
1189 env = cl_env_get(&refcheck);
1191 RETURN(PTR_ERR(env));
1193 local_iov = &vvp_env_info(env)->vti_local_iov;
1194 kiocb = &vvp_env_info(env)->vti_kiocb;
1195 local_iov->iov_base = (void __user *)buf;
1196 local_iov->iov_len = count;
1197 init_sync_kiocb(kiocb, file);
1198 kiocb->ki_pos = *ppos;
1199 kiocb->ki_left = count;
1201 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1202 *ppos = kiocb->ki_pos;
1204 cl_env_put(env, &refcheck);
1210 #ifdef HAVE_KERNEL_SENDFILE
1212 * Send file content (through pagecache) somewhere with helper
/* sendfile entry point: package the actor/target into IO_SENDFILE args
 * and run a CIT_READ through ll_file_io_generic(). */
1214 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1215 read_actor_t actor, void *target)
1218 struct vvp_io_args *args;
1223 env = cl_env_get(&refcheck);
1225 RETURN(PTR_ERR(env));
1227 args = vvp_env_args(env, IO_SENDFILE);
1228 args->u.sendfile.via_target = target;
1229 args->u.sendfile.via_actor = actor;
1231 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1232 cl_env_put(env, &refcheck);
1237 #ifdef HAVE_KERNEL_SPLICE_READ
1239 * Send file content (through pagecache) somewhere with helper
/* splice_read entry point: package the pipe/flags into IO_SPLICE args
 * and run a CIT_READ through ll_file_io_generic(). */
1241 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1242 struct pipe_inode_info *pipe, size_t count,
1246 struct vvp_io_args *args;
1251 env = cl_env_get(&refcheck);
1253 RETURN(PTR_ERR(env));
1255 args = vvp_env_args(env, IO_SPLICE);
1256 args->u.splice.via_pipe = pipe;
1257 args->u.splice.via_flags = flags;
1259 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1260 cl_env_put(env, &refcheck);
/*
 * Recreate a lost OST object for @inode: clone the inode's stripe MD,
 * fill an obdo marked OBD_FL_RECREATE_OBJS (o_nlink carries the target
 * OST index), and call obd_create() under the inode size lock.
 * (Lossy excerpt: oa allocation and parts of cleanup are missing.)
 */
1265 static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
1268 struct obd_export *exp = ll_i2dtexp(inode);
1269 struct obd_trans_info oti = { 0 };
1270 struct obdo *oa = NULL;
1273 struct lov_stripe_md *lsm = NULL, *lsm2;
1280 lsm = ccc_inode_lsm_get(inode);
1282 GOTO(out, rc = -ENOENT);
1284 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1285 (lsm->lsm_stripe_count));
1287 OBD_ALLOC_LARGE(lsm2, lsm_size);
1289 GOTO(out, rc = -ENOMEM);
/* o_nlink is overloaded to carry the OST index for recreation */
1292 oa->o_nlink = ost_idx;
1293 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1294 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1295 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1296 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1297 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1298 memcpy(lsm2, lsm, lsm_size);
1299 ll_inode_size_lock(inode);
1300 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1301 ll_inode_size_unlock(inode);
1303 OBD_FREE_LARGE(lsm2, lsm_size);
1306 ccc_inode_lsm_put(inode, lsm);
/*
 * ll_lov_recreate_obj(): LL_IOC_RECREATE_OBJ handler.  Root-only
 * (CFS_CAP_SYS_ADMIN); copies a struct ll_recreate_obj from userspace,
 * builds an MDT0-sequence ost_id from lrc_id and delegates to
 * ll_lov_recreate() for stripe lrc_ost_idx.
 */
1311 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1313 struct ll_recreate_obj ucreat;
1317 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1320 if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1324 ostid_set_seq_mdt0(&oi);
1325 ostid_set_id(&oi, ucreat.lrc_id);
1326 RETURN(ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx));
/*
 * ll_lov_recreate_fid(): LL_IOC_RECREATE_FID handler.  Root-only;
 * copies a lu_fid from userspace, converts it to an ost_id and derives
 * the OST index from bits 16..31 of the fid sequence, then delegates to
 * ll_lov_recreate().
 */
1329 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1336 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1339 if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1342 fid_to_ostid(&fid, &oi);
1343 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1344 RETURN(ll_lov_recreate(inode, &oi, ost_idx));
/*
 * ll_lov_setstripe_ea_info(): set the striping EA for @inode by
 * replaying an IT_OPEN intent carrying @lum.  Fails early if a stripe
 * md already exists (layout can only be set once).  The open handle
 * obtained as a side effect is released immediately via
 * ll_release_openhandle().  NOTE(review): elided listing — some error
 * branches/labels between the visible lines are not shown.
 */
1347 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1348 int flags, struct lov_user_md *lum, int lum_size)
1350 struct lov_stripe_md *lsm = NULL;
1351 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1355 lsm = ccc_inode_lsm_get(inode);
/* Layout already set: refuse rather than overwrite. */
1357 ccc_inode_lsm_put(inode, lsm);
1358 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1363 ll_inode_size_lock(inode);
1364 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1367 rc = oit.d.lustre.it_status;
1369 GOTO(out_req_free, rc);
/* Drop the MDS open handle created by the intent open above. */
1371 ll_release_openhandle(file->f_dentry, &oit);
1374 ll_inode_size_unlock(inode);
1375 ll_intent_release(&oit);
1376 ccc_inode_lsm_put(inode, lsm);
1379 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * ll_lov_getstripe_ea_info(): fetch the LOV EA of @filename (a child of
 * directory @inode) from the MDS via md_getattr_name().  On success
 * *lmmp/*lmm_size point into the reply buffer, so *request is handed
 * back to the caller, who must ptlrpc_req_finished() it.  The EA is
 * byte-swapped to host endianness on big-endian hosts.
 */
1383 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1384 struct lov_mds_md **lmmp, int *lmm_size,
1385 struct ptlrpc_request **request)
1387 struct ll_sb_info *sbi = ll_i2sbi(inode);
1388 struct mdt_body *body;
1389 struct lov_mds_md *lmm = NULL;
1390 struct ptlrpc_request *req = NULL;
1391 struct md_op_data *op_data;
/* Size the getattr reply buffer for the largest possible MD EA. */
1394 rc = ll_get_max_mdsize(sbi, &lmmsize);
1398 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1399 strlen(filename), lmmsize,
1400 LUSTRE_OPC_ANY, NULL);
1401 if (IS_ERR(op_data))
1402 RETURN(PTR_ERR(op_data));
1404 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1405 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1406 ll_finish_md_op_data(op_data);
1408 CDEBUG(D_INFO, "md_getattr_name failed "
1409 "on %s: rc %d\n", filename, rc);
1413 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1414 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1416 lmmsize = body->eadatasize;
/* No striping EA in the reply: nothing to return. */
1418 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1420 GOTO(out, rc = -ENODATA);
1423 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1424 LASSERT(lmm != NULL);
/* Only plain V1/V3 LOV magics are handled here. */
1426 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1427 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1428 GOTO(out, rc = -EPROTO);
1432 * This is coming from the MDS, so is probably in
1433 * little endian. We convert it to host endian before
1434 * passing it to userspace.
/* Compile-time no-op on little-endian hosts. */
1436 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1437 /* if function called for directory - we should
1438 * avoid swab not existent lsm objects */
1439 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1440 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1441 if (S_ISREG(body->mode))
1442 lustre_swab_lov_user_md_objects(
1443 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1444 ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
1445 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1446 lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1447 if (S_ISREG(body->mode))
1448 lustre_swab_lov_user_md_objects(
1449 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1450 ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
1456 *lmm_size = lmmsize;
/*
 * ll_lov_setea(): LL_IOC_LOV_SETEA handler.  Root-only; copies a
 * single-stripe lov_user_md (header + one ost_data entry) from
 * userspace and applies it through ll_lov_setstripe_ea_info() with
 * MDS_OPEN_HAS_OBJS (objects already exist).
 */
1461 static int ll_lov_setea(struct inode *inode, struct file *file,
1464 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1465 struct lov_user_md *lump;
1466 int lum_size = sizeof(struct lov_user_md) +
1467 sizeof(struct lov_user_ost_data);
1471 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1474 OBD_ALLOC_LARGE(lump, lum_size);
1478 if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1479 OBD_FREE_LARGE(lump, lum_size);
1483 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1485 OBD_FREE_LARGE(lump, lum_size);
/*
 * ll_lov_setstripe(): LL_IOC_LOV_SETSTRIPE handler.  Reads the user's
 * lov_user_md — first as the smaller V1 layout, re-reading as V3 when
 * the magic says so — and sets the stripe EA.  On success it refreshes
 * the layout and echoes the resulting striping back to userspace via
 * the GETSTRIPE iocontrol path.
 */
1489 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1492 struct lov_user_md_v3 lumv3;
1493 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1494 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1495 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1497 int flags = FMODE_WRITE;
1500 /* first try with v1 which is smaller than v3 */
1501 lum_size = sizeof(struct lov_user_md_v1);
1502 if (copy_from_user(lumv1, lumv1p, lum_size))
1505 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1506 lum_size = sizeof(struct lov_user_md_v3);
1507 if (copy_from_user(&lumv3, lumv3p, lum_size))
1511 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1513 struct lov_stripe_md *lsm;
/* Pre-clear the user's stripe_count before echoing the real layout. */
1516 put_user(0, &lumv1p->lmm_stripe_count);
1518 ll_layout_refresh(inode, &gen);
1519 lsm = ccc_inode_lsm_get(inode);
1520 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1521 0, lsm, (void *)arg);
1522 ccc_inode_lsm_put(inode, lsm);
/*
 * ll_lov_getstripe(): LL_IOC_LOV_GETSTRIPE handler.  Takes a reference
 * on the inode's stripe md and lets the LOV iocontrol copy the layout
 * out to the userspace buffer at @arg.
 */
1527 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1529 struct lov_stripe_md *lsm;
1533 lsm = ccc_inode_lsm_get(inode);
1535 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1537 ccc_inode_lsm_put(inode, lsm);
/*
 * ll_get_grouplock(): LL_IOC_GROUP_LOCK handler — take a cluster-wide
 * group (GID) lock on the file for this fd.  The enqueue happens
 * outside lli_lock, so after acquiring the lock we re-check
 * LL_FILE_GROUP_LOCKED and back out if another thread raced us.
 * Only one group lock per fd is allowed.
 */
1541 int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1543 struct ll_inode_info *lli = ll_i2info(inode);
1544 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1545 struct ccc_grouplock grouplock;
1549 if (ll_file_nolock(file))
1550 RETURN(-EOPNOTSUPP);
1552 spin_lock(&lli->lli_lock);
/* This fd already holds a group lock: refuse a second one. */
1553 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1554 CWARN("group lock already existed with gid %lu\n",
1555 fd->fd_grouplock.cg_gid);
1556 spin_unlock(&lli->lli_lock);
1559 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1560 spin_unlock(&lli->lli_lock);
/* Enqueue may block (unless O_NONBLOCK); done without lli_lock held. */
1562 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1563 arg, (file->f_flags & O_NONBLOCK), &grouplock);
1567 spin_lock(&lli->lli_lock);
/* Re-check under the lock: a racing thread may have won meanwhile. */
1568 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1569 spin_unlock(&lli->lli_lock);
1570 CERROR("another thread just won the race\n");
1571 cl_put_grouplock(&grouplock);
1575 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1576 fd->fd_grouplock = grouplock;
1577 spin_unlock(&lli->lli_lock);
1579 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * ll_put_grouplock(): LL_IOC_GROUP_UNLOCK handler — release the group
 * lock held by this fd, but only if the caller-supplied GID (@arg)
 * matches the one the lock was taken with.  The state is detached
 * under lli_lock; the actual DLM release happens after dropping it.
 */
1583 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1585 struct ll_inode_info *lli = ll_i2info(inode);
1586 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1587 struct ccc_grouplock grouplock;
1590 spin_lock(&lli->lli_lock);
1591 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1592 spin_unlock(&lli->lli_lock);
1593 CWARN("no group lock held\n");
1596 LASSERT(fd->fd_grouplock.cg_lock != NULL);
/* GID mismatch: the caller does not own this group lock. */
1598 if (fd->fd_grouplock.cg_gid != arg) {
1599 CWARN("group lock %lu doesn't match current id %lu\n",
1600 arg, fd->fd_grouplock.cg_gid);
1601 spin_unlock(&lli->lli_lock);
/* Detach the lock from the fd before releasing it lock-free. */
1605 grouplock = fd->fd_grouplock;
1606 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1607 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1608 spin_unlock(&lli->lli_lock);
1610 cl_put_grouplock(&grouplock);
1611 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1616 * Close inode open handle
1618 * \param dentry [in] dentry which contains the inode
1619 * \param it [in,out] intent which contains open info and result
1622 * \retval <0 failure
1624 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1626 struct inode *inode = dentry->d_inode;
1627 struct obd_client_handle *och;
1633 /* Root ? Do nothing. */
1634 if (dentry->d_inode->i_sb->s_root == dentry)
1637 /* No open handle to close? Move away */
1638 if (!it_disposition(it, DISP_OPEN_OPEN))
1641 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
/* Build a temporary client handle from the intent, then close it. */
1643 OBD_ALLOC(och, sizeof(*och));
1645 GOTO(out, rc = -ENOMEM);
1647 ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
1648 ll_i2info(inode), it, och);
1650 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1653 /* this one is in place of ll_file_open */
1654 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1655 ptlrpc_req_finished(it->d.lustre.it_data);
1656 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1662 * Get size for inode for which FIEMAP mapping is requested.
1663 * Make the FIEMAP get_info call and returns the result.
1665 int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1668 struct obd_export *exp = ll_i2dtexp(inode);
1669 struct lov_stripe_md *lsm = NULL;
1670 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1671 int vallen = num_bytes;
1675 /* Checks for fiemap flags */
/* Unsupported flag bits are reported back to the caller. */
1676 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1677 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1681 /* Check for FIEMAP_FLAG_SYNC */
1682 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1683 rc = filemap_fdatawrite(inode->i_mapping);
1688 lsm = ccc_inode_lsm_get(inode);
1692 /* If the stripe_count > 1 and the application does not understand
1693 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1695 if (lsm->lsm_stripe_count > 1 &&
1696 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1697 GOTO(out, rc = -EOPNOTSUPP);
1699 fm_key.oa.o_oi = lsm->lsm_oi;
1700 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1702 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1703 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1704 /* If filesize is 0, then there would be no objects for mapping */
1705 if (fm_key.oa.o_size == 0) {
1706 fiemap->fm_mapped_extents = 0;
1710 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
/* The OSC/LOV layer fills the fiemap reply buffer in-place. */
1712 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1715 CERROR("obd_get_info failed: rc = %d\n", rc);
1718 ccc_inode_lsm_put(inode, lsm);
/*
 * ll_fid2path(): OBD_IOC_FID2PATH handler — resolve a FID to a path via
 * the MDC.  Permitted for CAP_DAC_READ_SEARCH or when the mount allows
 * user fid2path.  Allocates an output buffer sized by the caller's
 * gf_pathlen and copies the resolved path back out.
 */
1722 int ll_fid2path(struct inode *inode, void *arg)
1724 struct obd_export *exp = ll_i2mdexp(inode);
1725 struct getinfo_fid2path *gfout, *gfin;
1729 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1730 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1733 /* Need to get the buflen */
1734 OBD_ALLOC_PTR(gfin);
1737 if (copy_from_user(gfin, arg, sizeof(*gfin))) {
/* Output holds the fixed header plus the user-requested path buffer. */
1742 outsize = sizeof(*gfout) + gfin->gf_pathlen;
1743 OBD_ALLOC(gfout, outsize);
1744 if (gfout == NULL) {
1748 memcpy(gfout, gfin, sizeof(*gfout));
1751 /* Call mdc_iocontrol */
1752 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1756 if (copy_to_user(arg, gfout, outsize))
1760 OBD_FREE(gfout, outsize);
/*
 * ll_ioctl_fiemap(): FSFILT_IOC_FIEMAP handler.  Sizes a kernel fiemap
 * buffer from the user's fm_extent_count, copies the request (and, when
 * extents were requested, the first extent, used as continuation state)
 * in, runs ll_do_fiemap(), and copies header + mapped extents back.
 */
1764 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1766 struct ll_user_fiemap *fiemap_s;
1767 size_t num_bytes, ret_bytes;
1768 unsigned int extent_count;
1771 /* Get the extent count so we can calculate the size of
1772 * required fiemap buffer */
1773 if (get_user(extent_count,
1774 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1776 num_bytes = sizeof(*fiemap_s) + (extent_count *
1777 sizeof(struct ll_fiemap_extent));
1779 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1780 if (fiemap_s == NULL)
1783 /* get the fiemap value */
1784 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1786 GOTO(error, rc = -EFAULT);
1788 /* If fm_extent_count is non-zero, read the first extent since
1789 * it is used to calculate end_offset and device from previous
1792 if (copy_from_user(&fiemap_s->fm_extents[0],
1793 (char __user *)arg + sizeof(*fiemap_s),
1794 sizeof(struct ll_fiemap_extent)))
1795 GOTO(error, rc = -EFAULT);
1798 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
/* Copy back the header plus however many extents were mapped. */
1802 ret_bytes = sizeof(struct ll_user_fiemap);
1804 if (extent_count != 0)
1805 ret_bytes += (fiemap_s->fm_mapped_extents *
1806 sizeof(struct ll_fiemap_extent));
1808 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1812 OBD_FREE_LARGE(fiemap_s, num_bytes);
1817 * Read the data_version for inode.
1819 * This value is computed using stripe object version on OST.
1820 * Version is computed using server side locking.
1822 * @param extent_lock Take extent lock. Not needed if a process is already
1823 * holding the OST object group locks.
1825 int ll_data_version(struct inode *inode, __u64 *data_version,
1828 struct lov_stripe_md *lsm = NULL;
1829 struct ll_sb_info *sbi = ll_i2sbi(inode);
1830 struct obdo *obdo = NULL;
1834 /* If no stripe, we consider version is 0. */
1835 lsm = ccc_inode_lsm_get(inode);
1838 CDEBUG(D_INODE, "No object for inode\n");
1842 OBD_ALLOC_PTR(obdo);
1844 ccc_inode_lsm_put(inode, lsm);
/* OST-side getattr fills obdo; only trust o_data_version if valid. */
1848 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
1850 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1853 *data_version = obdo->o_data_version;
1857 ccc_inode_lsm_put(inode, lsm);
/*
 * Scratch state for ll_swap_layouts(): both inodes, saved times for
 * mtime/atime restoration, and the data-version check bookkeeping.
 */
1862 struct ll_swap_stack {
1863 struct iattr ia1, ia2;
1865 struct inode *inode1, *inode2;
1866 bool check_dv1, check_dv2;
/*
 * ll_swap_layouts(): LL_IOC_LOV_SWAP_LAYOUTS backend — atomically swap
 * the layouts of two regular files on the same filesystem via an MDT
 * iocontrol.  Ordering: permission/sb checks, canonical FID ordering,
 * optional group locks to flush dirty cache, optional data-version
 * verification, the MDT swap, then optional mtime/atime restoration.
 * NOTE(review): elided listing — several labels, error checks and the
 * final RETURN are not visible here.
 */
1869 static int ll_swap_layouts(struct file *file1, struct file *file2,
1870 struct lustre_swap_layouts *lsl)
1872 struct mdc_swap_layouts msl;
1873 struct md_op_data *op_data;
1876 struct ll_swap_stack *llss = NULL;
1879 OBD_ALLOC_PTR(llss);
1883 llss->inode1 = file1->f_dentry->d_inode;
1884 llss->inode2 = file2->f_dentry->d_inode;
1886 if (!S_ISREG(llss->inode2->i_mode))
1887 GOTO(free, rc = -EINVAL);
1889 if (ll_permission(llss->inode1, MAY_WRITE, NULL) ||
1890 ll_permission(llss->inode2, MAY_WRITE, NULL))
1891 GOTO(free, rc = -EPERM);
1893 if (llss->inode2->i_sb != llss->inode1->i_sb)
1894 GOTO(free, rc = -EXDEV);
1896 /* we use 2 bool because it is easier to swap than 2 bits */
1897 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1898 llss->check_dv1 = true;
1900 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1901 llss->check_dv2 = true;
1903 /* we cannot use lsl->sl_dvX directly because we may swap them */
1904 llss->dv1 = lsl->sl_dv1;
1905 llss->dv2 = lsl->sl_dv2;
/* Canonicalize by FID order so lock ordering is deadlock-free. */
1907 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
1908 if (rc == 0) /* same file, done! */
1911 if (rc < 0) { /* sequentialize it */
1912 swap(llss->inode1, llss->inode2);
1914 swap(llss->dv1, llss->dv2);
1915 swap(llss->check_dv1, llss->check_dv2);
1919 if (gid != 0) { /* application asks to flush dirty cache */
1920 rc = ll_get_grouplock(llss->inode1, file1, gid);
1924 rc = ll_get_grouplock(llss->inode2, file2, gid);
1926 ll_put_grouplock(llss->inode1, file1, gid);
1931 /* to be able to restore mtime and atime after swap
1932 * we need to first save them */
1934 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
1935 llss->ia1.ia_mtime = llss->inode1->i_mtime;
1936 llss->ia1.ia_atime = llss->inode1->i_atime;
1937 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
1938 llss->ia2.ia_mtime = llss->inode2->i_mtime;
1939 llss->ia2.ia_atime = llss->inode2->i_atime;
1940 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
1943 /* ultimate check, before swaping the layouts we check if
1944 * dataversion has changed (if requested) */
1945 if (llss->check_dv1) {
1946 rc = ll_data_version(llss->inode1, &dv, 0);
1949 if (dv != llss->dv1)
1950 GOTO(putgl, rc = -EAGAIN);
1953 if (llss->check_dv2) {
1954 rc = ll_data_version(llss->inode2, &dv, 0);
1957 if (dv != llss->dv2)
1958 GOTO(putgl, rc = -EAGAIN);
1961 /* struct md_op_data is used to send the swap args to the mdt
1962 * only flags is missing, so we use struct mdc_swap_layouts
1963 * through the md_op_data->op_data */
1964 /* flags from user space have to be converted before they are send to
1965 * server, no flag is sent today, they are only used on the client */
1968 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
1969 0, LUSTRE_OPC_ANY, &msl);
1970 if (IS_ERR(op_data))
1971 GOTO(free, rc = PTR_ERR(op_data));
1973 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
1974 sizeof(*op_data), op_data, NULL);
1975 ll_finish_md_op_data(op_data);
/* Drop group locks in reverse acquisition order. */
1979 ll_put_grouplock(llss->inode2, file2, gid);
1980 ll_put_grouplock(llss->inode1, file1, gid);
1983 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
1987 /* clear useless flags */
1988 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
1989 llss->ia1.ia_valid &= ~ATTR_MTIME;
1990 llss->ia2.ia_valid &= ~ATTR_MTIME;
1993 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
1994 llss->ia1.ia_valid &= ~ATTR_ATIME;
1995 llss->ia2.ia_valid &= ~ATTR_ATIME;
1998 /* update time if requested */
/* Note ia1/ia2 were swapped along with the inodes: ia2 goes to file1. */
2000 if (llss->ia2.ia_valid != 0) {
2001 mutex_lock(&llss->inode1->i_mutex);
2002 rc = ll_setattr(file1->f_dentry, &llss->ia2);
2003 mutex_unlock(&llss->inode1->i_mutex);
2006 if (llss->ia1.ia_valid != 0) {
2009 mutex_lock(&llss->inode2->i_mutex);
2010 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2011 mutex_unlock(&llss->inode2->i_mutex);
/*
 * ll_file_ioctl(): unlocked_ioctl entry point for regular files.
 * Dispatches Lustre-specific ioctls (striping, group locks, fiemap,
 * HSM, fid2path, data version, ...) and forwards anything unrecognized
 * to the registered iocontrol handlers / the data export.
 * NOTE(review): elided listing — many RETURN/break statements, some
 * error checks, and closing braces between cases are not visible here.
 */
2023 long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2025 struct inode *inode = file->f_dentry->d_inode;
2026 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2030 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2031 inode->i_generation, inode, cmd);
2032 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2034 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2035 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2039 case LL_IOC_GETFLAGS:
2040 /* Get the current value of the file flags */
2041 return put_user(fd->fd_flags, (int *)arg);
2042 case LL_IOC_SETFLAGS:
2043 case LL_IOC_CLRFLAGS:
2044 /* Set or clear specific file flags */
2045 /* XXX This probably needs checks to ensure the flags are
2046 * not abused, and to handle any flag side effects.
2048 if (get_user(flags, (int *) arg))
2051 if (cmd == LL_IOC_SETFLAGS) {
/* IGNORE_LOCK is only safe for O_DIRECT I/O. */
2052 if ((flags & LL_FILE_IGNORE_LOCK) &&
2053 !(file->f_flags & O_DIRECT)) {
2054 CERROR("%s: unable to disable locking on "
2055 "non-O_DIRECT file\n", current->comm);
2059 fd->fd_flags |= flags;
2061 fd->fd_flags &= ~flags;
2064 case LL_IOC_LOV_SETSTRIPE:
2065 RETURN(ll_lov_setstripe(inode, file, arg));
2066 case LL_IOC_LOV_SETEA:
2067 RETURN(ll_lov_setea(inode, file, arg));
2068 case LL_IOC_LOV_SWAP_LAYOUTS: {
2070 struct lustre_swap_layouts lsl;
2072 if (cfs_copy_from_user(&lsl, (char *)arg,
2073 sizeof(struct lustre_swap_layouts)))
/* Both fds must be open for write to swap layouts. */
2076 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2079 file2 = fget(lsl.sl_fd);
2084 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2085 rc = ll_swap_layouts(file, file2, &lsl);
2089 case LL_IOC_LOV_GETSTRIPE:
2090 RETURN(ll_lov_getstripe(inode, arg));
2091 case LL_IOC_RECREATE_OBJ:
2092 RETURN(ll_lov_recreate_obj(inode, arg));
2093 case LL_IOC_RECREATE_FID:
2094 RETURN(ll_lov_recreate_fid(inode, arg));
2095 case FSFILT_IOC_FIEMAP:
2096 RETURN(ll_ioctl_fiemap(inode, arg));
2097 case FSFILT_IOC_GETFLAGS:
2098 case FSFILT_IOC_SETFLAGS:
2099 RETURN(ll_iocontrol(inode, file, cmd, arg));
2100 case FSFILT_IOC_GETVERSION_OLD:
2101 case FSFILT_IOC_GETVERSION:
2102 RETURN(put_user(inode->i_generation, (int *)arg));
2103 case LL_IOC_GROUP_LOCK:
2104 RETURN(ll_get_grouplock(inode, file, arg));
2105 case LL_IOC_GROUP_UNLOCK:
2106 RETURN(ll_put_grouplock(inode, file, arg));
2107 case IOC_OBD_STATFS:
2108 RETURN(ll_obd_statfs(inode, (void *)arg));
2110 /* We need to special case any other ioctls we want to handle,
2111 * to send them to the MDS/OST as appropriate and to properly
2112 * network encode the arg field.
2113 case FSFILT_IOC_SETVERSION_OLD:
2114 case FSFILT_IOC_SETVERSION:
2116 case LL_IOC_FLUSHCTX:
2117 RETURN(ll_flush_ctx(inode));
2118 case LL_IOC_PATH2FID: {
2119 if (copy_to_user((void *)arg, ll_inode2fid(inode),
2120 sizeof(struct lu_fid)))
2125 case OBD_IOC_FID2PATH:
2126 RETURN(ll_fid2path(inode, (void *)arg));
2127 case LL_IOC_DATA_VERSION: {
2128 struct ioc_data_version idv;
2131 if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
2134 rc = ll_data_version(inode, &idv.idv_version,
2135 !(idv.idv_flags & LL_DV_NOFLUSH));
2137 if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2143 case LL_IOC_GET_MDTIDX: {
2146 mdtidx = ll_get_mdt_idx(inode);
2150 if (put_user((int)mdtidx, (int*)arg))
2155 case OBD_IOC_GETDTNAME:
2156 case OBD_IOC_GETMDNAME:
2157 RETURN(ll_get_obd_name(inode, cmd, arg));
2158 case LL_IOC_HSM_STATE_GET: {
2159 struct md_op_data *op_data;
2160 struct hsm_user_state *hus;
2167 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2168 LUSTRE_OPC_ANY, hus);
2169 if (IS_ERR(op_data)) {
2171 RETURN(PTR_ERR(op_data));
2174 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2177 if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2180 ll_finish_md_op_data(op_data);
2184 case LL_IOC_HSM_STATE_SET: {
2185 struct md_op_data *op_data;
2186 struct hsm_state_set *hss;
2192 if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2197 /* Non-root users are forbidden to set or clear flags which are
2198 * NOT defined in HSM_USER_MASK. */
2199 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK)
2200 && !cfs_capable(CFS_CAP_SYS_ADMIN)) {
2205 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2206 LUSTRE_OPC_ANY, hss);
2207 if (IS_ERR(op_data)) {
2209 RETURN(PTR_ERR(op_data));
2212 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2215 ll_finish_md_op_data(op_data);
2220 case LL_IOC_HSM_ACTION: {
2221 struct md_op_data *op_data;
2222 struct hsm_current_action *hca;
2229 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2230 LUSTRE_OPC_ANY, hca);
2231 if (IS_ERR(op_data)) {
2233 RETURN(PTR_ERR(op_data));
2236 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2239 if (cfs_copy_to_user((char *)arg, hca, sizeof(*hca)))
2242 ll_finish_md_op_data(op_data);
/* Unknown command: try registered handlers, then the data export. */
2250 ll_iocontrol_call(inode, file, cmd, arg, &err))
2253 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2259 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * llseek_execute(): local fallback for kernels without
 * generic_file_llseek_size().  Validates @offset against 0/@maxsize and
 * commits it to file->f_pos, resetting f_version on change.
 */
2260 static inline loff_t
2261 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2263 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2265 if (offset > maxsize)
2268 if (offset != file->f_pos) {
2269 file->f_pos = offset;
2270 file->f_version = 0;
/*
 * generic_file_llseek_size(): local copy of the upstream helper (only
 * built when the kernel lacks it).  Handles SEEK_CUR under i_mutex and
 * the SEEK_DATA/SEEK_HOLE virtual-hole semantics against @eof, then
 * commits via llseek_execute().
 */
2276 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2277 loff_t maxsize, loff_t eof)
2279 struct inode *inode = file->f_dentry->d_inode;
2287 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2288 * position-querying operation. Avoid rewriting the "same"
2289 * f_pos value back to the file because a concurrent read(),
2290 * write() or lseek() might have altered it
2295 * f_lock protects against read/modify/write race with other
2296 * SEEK_CURs. Note that parallel writes and reads behave
2299 mutex_lock(&inode->i_mutex);
2300 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2301 mutex_unlock(&inode->i_mutex);
2305 * In the generic case the entire file is data, so as long as
2306 * offset isn't at the end of the file then the offset is data.
2313 * There is a virtual hole at the end of the file, so as long as
2314 * offset isn't i_size or larger, return i_size.
2322 return llseek_execute(file, offset, maxsize);
/*
 * ll_file_seek(): ->llseek for Lustre files.  SEEK_END/SEEK_HOLE/
 * SEEK_DATA need an up-to-date size, so the size is glimpsed from the
 * OSTs first; then the generic llseek-with-size helper does the rest,
 * bounded by ll_file_maxbytes().
 */
2326 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2328 struct inode *inode = file->f_dentry->d_inode;
2329 loff_t retval, eof = 0;
/* retval here is only the would-be target, computed for the trace. */
2332 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2333 (origin == SEEK_CUR) ? file->f_pos : 0);
2334 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
2335 inode->i_ino, inode->i_generation, inode, retval, retval,
2337 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2339 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2340 retval = ll_glimpse_size(inode);
2343 eof = i_size_read(inode);
2346 retval = ll_generic_file_llseek_size(file, offset, origin,
2347 ll_file_maxbytes(inode), eof);
/*
 * ll_flush(): ->flush, called on close(2).  Reports (and clears) any
 * async writeback error recorded for this inode, but only once — if
 * this fd already saw the failure (fd_write_failed), it is not
 * reported again.  Returns -EIO on a pending error, else 0.
 */
2351 int ll_flush(struct file *file, fl_owner_t id)
2353 struct inode *inode = file->f_dentry->d_inode;
2354 struct ll_inode_info *lli = ll_i2info(inode);
2355 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2358 LASSERT(!S_ISDIR(inode->i_mode));
2360 /* catch async errors that were recorded back when async writeback
2361 * failed for pages in this mapping. */
2362 rc = lli->lli_async_rc;
2363 lli->lli_async_rc = 0;
2364 err = lov_read_and_clear_async_rc(lli->lli_clob);
2368 /* The application has been told write failure already.
2369 * Do not report failure again. */
2370 if (fd->fd_write_failed)
2372 return rc ? -EIO : 0;
2376 * Called to make sure a portion of file has been written out.
2377 * if @local_only is not true, it will send OST_SYNC RPCs to ost.
2379 * Return how many pages have been written.
2381 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2382 enum cl_fsync_mode mode, int ignore_layout)
2384 struct cl_env_nest nest;
2387 struct obd_capa *capa = NULL;
2388 struct cl_fsync_io *fio;
2392 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2393 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2396 env = cl_env_nested_get(&nest);
2398 RETURN(PTR_ERR(env));
2400 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
/* Build a CIT_FSYNC io over [start, end] and run the io loop. */
2402 io = ccc_env_thread_io(env);
2403 io->ci_obj = cl_i2info(inode)->lli_clob;
2404 io->ci_ignore_layout = ignore_layout;
2406 /* initialize parameters for sync */
2407 fio = &io->u.ci_fsync;
2408 fio->fi_capa = capa;
2409 fio->fi_start = start;
2411 fio->fi_fid = ll_inode2fid(inode);
2412 fio->fi_mode = mode;
2413 fio->fi_nr_written = 0;
2415 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2416 result = cl_io_loop(env, io);
2418 result = io->ci_result;
/* On success the page count written is the return value. */
2420 result = fio->fi_nr_written;
2421 cl_io_fini(env, io);
2422 cl_env_nested_put(&nest, env);
2430 * When dentry is provided (the 'else' case), *file->f_dentry may be
2431 * null and dentry must be used directly rather than pulled from
2432 * *file->f_dentry as is done otherwise.
/*
 * ll_fsync(): ->fsync, with three signature variants selected by the
 * kernel feature macros.  Flushes dirty pages, reports/clears recorded
 * async writeback errors, syncs MD state via md_sync(), and for
 * datasync on regular files runs an OST-side range sync, updating
 * fd_write_failed accordingly.  NOTE(review): elided listing — the
 * final rc combination and RETURN are not visible here.
 */
2435 #ifdef HAVE_FILE_FSYNC_4ARGS
2436 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2438 struct dentry *dentry = file->f_dentry;
2439 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2440 int ll_fsync(struct file *file, int datasync)
2442 struct dentry *dentry = file->f_dentry;
2444 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2447 struct inode *inode = dentry->d_inode;
2448 struct ll_inode_info *lli = ll_i2info(inode);
2449 struct ptlrpc_request *req;
2450 struct obd_capa *oc;
2454 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2455 inode->i_generation, inode);
2456 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2458 #ifdef HAVE_FILE_FSYNC_4ARGS
2459 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2460 mutex_lock(&inode->i_mutex);
2462 /* fsync's caller has already called _fdata{sync,write}, we want
2463 * that IO to finish before calling the osc and mdc sync methods */
2464 rc = filemap_fdatawait(inode->i_mapping);
2467 /* catch async errors that were recorded back when async writeback
2468 * failed for pages in this mapping. */
2469 if (!S_ISDIR(inode->i_mode)) {
2470 err = lli->lli_async_rc;
2471 lli->lli_async_rc = 0;
2474 err = lov_read_and_clear_async_rc(lli->lli_clob);
2479 oc = ll_mdscapa_get(inode);
2480 err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2486 ptlrpc_req_finished(req);
2488 if (datasync && S_ISREG(inode->i_mode)) {
2489 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
/* OST-side sync over the whole object; track per-fd write failure. */
2491 err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
2493 if (rc == 0 && err < 0)
2496 fd->fd_write_failed = true;
2498 fd->fd_write_failed = false;
2501 #ifdef HAVE_FILE_FSYNC_4ARGS
2502 mutex_unlock(&inode->i_mutex);
/*
 * ll_file_flock(): ->flock/->lock handler.  Translates a VFS file_lock
 * (FL_FLOCK or FL_POSIX) plus fcntl command into an LDLM_FLOCK enqueue
 * on the MDS, then mirrors the result into the local VFS lock state;
 * if the local step fails, the remote lock is rolled back with an
 * LCK_NL (unlock) enqueue.  NOTE(review): elided listing — several
 * case/break lines and the final RETURN are not visible here.
 */
2507 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2509 struct inode *inode = file->f_dentry->d_inode;
2510 struct ll_sb_info *sbi = ll_i2sbi(inode);
2511 struct ldlm_enqueue_info einfo = {
2512 .ei_type = LDLM_FLOCK,
2513 .ei_cb_cp = ldlm_flock_completion_ast,
2514 .ei_cbdata = file_lock,
2516 struct md_op_data *op_data;
2517 struct lustre_handle lockh = {0};
2518 ldlm_policy_data_t flock = {{0}};
2524 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2525 inode->i_ino, file_lock);
2527 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2529 if (file_lock->fl_flags & FL_FLOCK) {
2530 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2531 /* flocks are whole-file locks */
2532 flock.l_flock.end = OFFSET_MAX;
2533 /* For flocks owner is determined by the local file desctiptor*/
2534 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2535 } else if (file_lock->fl_flags & FL_POSIX) {
2536 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2537 flock.l_flock.start = file_lock->fl_start;
2538 flock.l_flock.end = file_lock->fl_end;
2542 flock.l_flock.pid = file_lock->fl_pid;
2544 /* Somewhat ugly workaround for svc lockd.
2545 * lockd installs custom fl_lmops->lm_compare_owner that checks
2546 * for the fl_owner to be the same (which it always is on local node
2547 * I guess between lockd processes) and then compares pid.
2548 * As such we assign pid to the owner field to make it all work,
2549 * conflict with normal locks is unlikely since pid space and
2550 * pointer space for current->files are not intersecting */
2551 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2552 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* Map the fcntl lock type to a DLM mode: read=PR, write=PW, unlock=NL. */
2554 switch (file_lock->fl_type) {
2556 einfo.ei_mode = LCK_PR;
2559 /* An unlock request may or may not have any relation to
2560 * existing locks so we may not be able to pass a lock handle
2561 * via a normal ldlm_lock_cancel() request. The request may even
2562 * unlock a byte range in the middle of an existing lock. In
2563 * order to process an unlock request we need all of the same
2564 * information that is given with a normal read or write record
2565 * lock request. To avoid creating another ldlm unlock (cancel)
2566 * message we'll treat a LCK_NL flock request as an unlock. */
2567 einfo.ei_mode = LCK_NL;
2570 einfo.ei_mode = LCK_PW;
2573 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2574 file_lock->fl_type);
/* Map the fcntl command to enqueue flags (non-blocking / test). */
2589 flags = LDLM_FL_BLOCK_NOWAIT;
2595 flags = LDLM_FL_TEST_LOCK;
2596 /* Save the old mode so that if the mode in the lock changes we
2597 * can decrement the appropriate reader or writer refcount. */
2598 file_lock->fl_type = einfo.ei_mode;
2601 CERROR("unknown fcntl lock command: %d\n", cmd);
2605 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2606 LUSTRE_OPC_ANY, NULL);
2607 if (IS_ERR(op_data))
2608 RETURN(PTR_ERR(op_data));
2610 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2611 "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2612 flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2614 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2615 op_data, &lockh, &flock, 0, NULL /* req */, flags);
/* Mirror the granted lock into local VFS bookkeeping. */
2617 if ((file_lock->fl_flags & FL_FLOCK) &&
2618 (rc == 0 || file_lock->fl_type == F_UNLCK))
2619 rc2 = flock_lock_file_wait(file, file_lock);
2620 if ((file_lock->fl_flags & FL_POSIX) &&
2621 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2622 !(flags & LDLM_FL_TEST_LOCK))
2623 rc2 = posix_lock_file_wait(file, file_lock);
/* Local step failed: undo the remote lock with an NL enqueue. */
2625 if (rc2 && file_lock->fl_type != F_UNLCK) {
2626 einfo.ei_mode = LCK_NL;
2627 md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2628 op_data, &lockh, &flock, 0, NULL /* req */, flags);
2632 ll_finish_md_op_data(op_data);
/*
 * ll_file_noflock(): ->lock stub for no-lock mounts.
 * NOTE(review): body elided from this listing — presumably rejects the
 * request; confirm against the full source.
 */
2637 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2645 * test if some locks matching bits and l_req_mode are acquired
2646 * - bits can be in different locks
2647 * - if found clear the common lock bits in *bits
2648 * - the bits not found, are kept in *bits
2650 * \param bits [IN] searched lock bits [IN]
2651 * \param l_req_mode [IN] searched lock mode
2652 * \retval boolean, true iff all bits are found
2654 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
2656 struct lustre_handle lockh;
2657 ldlm_policy_data_t policy;
/* LCK_MINMODE means "any mode": match against all four modes. */
2658 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2659 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2668 fid = &ll_i2info(inode)->lli_fid;
2669 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2670 ldlm_lockname[mode]);
2672 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* Probe each requested inodebit individually. */
2673 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2674 policy.l_inodebits.bits = *bits & (1 << i);
2675 if (policy.l_inodebits.bits == 0)
2678 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2679 &policy, mode, &lockh)) {
2680 struct ldlm_lock *lock;
2682 lock = ldlm_handle2lock(&lockh);
/* Clear every bit covered by the matched lock, not just bit i. */
2685 ~(lock->l_policy_data.l_inodebits.bits);
2686 LDLM_LOCK_PUT(lock);
2688 *bits &= ~policy.l_inodebits.bits;
/*
 * ll_take_md_lock(): match (and, unlike ll_have_md_lock, actually take a
 * reference on) a granted inodebits lock covering the given bits; the
 * matched handle is returned through *lockh, the matched mode via rc.
 * NOTE(review): declarations of fid/rc and the return statement are elided
 * in this listing.
 */
2695 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2696 struct lustre_handle *lockh, __u64 flags)
2698 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2703 fid = &ll_i2info(inode)->lli_fid;
2704 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
/* accept any mode; caller decides what to do with the matched mode */
2706 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
2707 fid, LDLM_IBITS, &policy,
2708 LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
/*
 * ll_inode_revalidate_fini(): translate a revalidation RPC result; -ENOENT
 * on a non-regular/non-directory inode is tolerated (object already
 * unlinked), any other non-zero rc is logged as an error.
 * NOTE(review): the i_nlink update and return statements are elided in this
 * listing.
 */
2712 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2714 /* Already unlinked. Just update nlink and return success */
2715 if (rc == -ENOENT) {
2717 /* This path cannot be hit for regular files unless in
2718 * case of obscure races, so no need to validate
2720 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2722 } else if (rc != 0) {
2723 CERROR("%s: revalidate FID "DFID" error: rc = %d\n",
2724 ll_get_fsname(inode->i_sb, NULL, 0),
2725 PFID(ll_inode2fid(inode)), rc);
/*
 * __ll_inode_revalidate_it(): refresh inode metadata from the MDS.  Two
 * paths: with OBD_CONNECT_ATTRFID, an intent getattr/lookup by FID; without
 * it, a plain md_getattr() but only when no matching MD lock is cached.
 * NOTE(review): many lines (braces, GOTO/RETURN paths, ealen declaration,
 * out: label) are elided in this listing.
 */
2731 int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2734 struct inode *inode = dentry->d_inode;
2735 struct ptlrpc_request *req = NULL;
2736 struct obd_export *exp;
2740 LASSERT(inode != NULL);
2742 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2743 inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2745 exp = ll_i2mdexp(inode);
2747 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
2748 * But under CMD case, it caused some lock issues, should be fixed
2749 * with new CMD ibits lock. See bug 12718 */
2750 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
2751 struct lookup_intent oit = { .it_op = IT_GETATTR };
2752 struct md_op_data *op_data;
/* a LOOKUP-only revalidation does not need full attributes */
2754 if (ibits == MDS_INODELOCK_LOOKUP)
2755 oit.it_op = IT_LOOKUP;
2757 /* Call getattr by fid, so do not provide name at all. */
2758 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2759 dentry->d_inode, NULL, 0, 0,
2760 LUSTRE_OPC_ANY, NULL);
2761 if (IS_ERR(op_data))
2762 RETURN(PTR_ERR(op_data));
2764 oit.it_create_mode |= M_CHECK_STALE;
2765 rc = md_intent_lock(exp, op_data, NULL, 0,
2766 /* we are not interested in name
2769 ll_md_blocking_ast, 0);
2770 ll_finish_md_op_data(op_data);
2771 oit.it_create_mode &= ~M_CHECK_STALE;
2773 rc = ll_inode_revalidate_fini(inode, rc);
2777 rc = ll_revalidate_it_finish(req, &oit, dentry);
2779 ll_intent_release(&oit);
2783 /* Unlinked? Unhash dentry, so it is not picked up later by
2784 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2785 here to preserve get_cwd functionality on 2.6.
2787 if (!dentry->d_inode->i_nlink)
2788 d_lustre_invalidate(dentry, 0);
2790 ll_lookup_finish_locks(&oit, dentry);
/* no ATTRFID: only issue the getattr RPC when no usable lock is cached;
 * ll_have_md_lock() clears the bits it found from ibits */
2791 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
2792 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2793 obd_valid valid = OBD_MD_FLGETATTR;
2794 struct md_op_data *op_data;
/* regular files also need striping EA, sized to the max MD EA size */
2797 if (S_ISREG(inode->i_mode)) {
2798 rc = ll_get_max_mdsize(sbi, &ealen);
2801 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2804 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
2805 0, ealen, LUSTRE_OPC_ANY,
2807 if (IS_ERR(op_data))
2808 RETURN(PTR_ERR(op_data));
2810 op_data->op_valid = valid;
2811 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2812 * capa for this inode. Because we only keep capas of dirs
2814 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
2815 ll_finish_md_op_data(op_data);
2817 rc = ll_inode_revalidate_fini(inode, rc);
2821 rc = ll_prep_inode(&inode, req, NULL, NULL);
2824 ptlrpc_req_finished(req);
/*
 * ll_inode_revalidate_it(): revalidate metadata, then refresh size info —
 * non-regular objects take the cached lvb timestamps, regular files get a
 * glimpse of the size from the OSTs.
 * NOTE(review): the early-error return, braces and final RETURN are elided
 * in this listing.
 */
2828 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2831 struct inode *inode = dentry->d_inode;
2835 rc = __ll_inode_revalidate_it(dentry, it, ibits);
2839 /* if object isn't regular file, don't validate size */
2840 if (!S_ISREG(inode->i_mode)) {
2841 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
2842 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
2843 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
2845 rc = ll_glimpse_size(inode);
/*
 * ll_getattr_it(): ->getattr with an explicit lookup intent.  Revalidates
 * UPDATE|LOOKUP ibits, bumps the LPROC_LL_GETATTR counter, then copies the
 * now-fresh inode attributes into *stat.
 * NOTE(review): the error-return after revalidation and the final return
 * are elided in this listing.
 */
2850 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2851 struct lookup_intent *it, struct kstat *stat)
2853 struct inode *inode = de->d_inode;
2854 struct ll_sb_info *sbi = ll_i2sbi(inode);
2855 struct ll_inode_info *lli = ll_i2info(inode);
2858 res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE |
2859 MDS_INODELOCK_LOOKUP);
2860 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
2865 stat->dev = inode->i_sb->s_dev;
/* 32-bit userland needs an ino that fits in 32 bits, built from the FID */
2866 if (ll_need_32bit_api(sbi))
2867 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
2869 stat->ino = inode->i_ino;
2870 stat->mode = inode->i_mode;
2871 stat->nlink = inode->i_nlink;
2872 stat->uid = inode->i_uid;
2873 stat->gid = inode->i_gid;
2874 stat->rdev = inode->i_rdev;
2875 stat->atime = inode->i_atime;
2876 stat->mtime = inode->i_mtime;
2877 stat->ctime = inode->i_ctime;
2878 stat->blksize = 1 << inode->i_blkbits;
2880 stat->size = i_size_read(inode);
2881 stat->blocks = inode->i_blocks;
2885 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2887 struct lookup_intent it = { .it_op = IT_GETATTR };
2889 return ll_getattr_it(mnt, de, &it, stat);
2892 #ifdef HAVE_LINUX_FIEMAP_H
/*
 * ll_fiemap(): VFS fiemap handler.  Marshals the kernel fiemap_extent_info
 * into a Lustre ll_user_fiemap, runs ll_do_fiemap(), and copies flags and
 * mapped extents back to userspace-visible structures.
 * NOTE(review): the declarations of rc/num_bytes, allocation-failure check
 * and error-return paths are elided in this listing.
 */
2893 int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2894 __u64 start, __u64 len)
2898 struct ll_user_fiemap *fiemap;
2899 unsigned int extent_count = fieinfo->fi_extents_max;
2901 num_bytes = sizeof(*fiemap) + (extent_count *
2902 sizeof(struct ll_fiemap_extent));
2903 OBD_ALLOC_LARGE(fiemap, num_bytes);
2908 fiemap->fm_flags = fieinfo->fi_flags;
2909 fiemap->fm_extent_count = fieinfo->fi_extents_max;
2910 fiemap->fm_start = start;
2911 fiemap->fm_length = len;
/* NOTE(review): this copies one extent from fi_extents_start even when
 * extent_count == 0, in which case fi_extents_start may be NULL — verify
 * against the callers/upstream fix before relying on this path. */
2912 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
2913 sizeof(struct ll_fiemap_extent));
2915 rc = ll_do_fiemap(inode, fiemap, num_bytes);
2917 fieinfo->fi_flags = fiemap->fm_flags;
2918 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
2919 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
2920 fiemap->fm_mapped_extents * sizeof(struct ll_fiemap_extent));
2922 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * ll_get_acl(): return a referenced copy of the cached POSIX ACL.  Taking
 * lli_lock makes the dup atomic with respect to ACL updates.
 * NOTE(review): braces and the RETURN(acl) are elided in this listing.
 */
2927 struct posix_acl * ll_get_acl(struct inode *inode, int type)
2929 struct ll_inode_info *lli = ll_i2info(inode);
2930 struct posix_acl *acl = NULL;
2933 spin_lock(&lli->lli_lock);
2934 /* VFS' acl_permission_check->check_acl will release the refcount */
2935 acl = posix_acl_dup(lli->lli_posix_acl);
2936 spin_unlock(&lli->lli_lock);
2941 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
/*
 * ll_check_acl(): ACL callback handed to generic_permission().  The two
 * prototypes track kernel API changes (3-arg form takes an extra flags
 * word).  Compiled out entirely when CONFIG_FS_POSIX_ACL is off.
 * NOTE(review): return type, braces, EXEC_OK/-EAGAIN returns and the
 * #else bodies are elided in this listing.
 */
2943 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
2944 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
2946 ll_check_acl(struct inode *inode, int mask)
2949 # ifdef CONFIG_FS_POSIX_ACL
2950 struct posix_acl *acl;
/* RCU walk cannot block on ll_get_acl's spinlock path — bail out */
2954 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
2955 if (flags & IPERM_FLAG_RCU)
2958 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
2963 rc = posix_acl_permission(inode, acl, mask);
2964 posix_acl_release(acl);
2967 # else /* !CONFIG_FS_POSIX_ACL */
2969 # endif /* CONFIG_FS_POSIX_ACL */
2971 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
2973 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
/*
 * ll_inode_permission(): VFS ->permission.  Three prototypes cover kernel
 * API generations.  Revalidates the root inode first (it is not validated
 * in lookup), defers remote-client checks to lustre_check_remote_perm(),
 * otherwise runs the generic permission check with ll_check_acl.
 * NOTE(review): braces, -ECHILD returns for the RCU path, error return
 * after root revalidation and the final RETURN are elided in this listing.
 */
2974 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
2976 # ifdef HAVE_INODE_PERMISION_2ARGS
2977 int ll_inode_permission(struct inode *inode, int mask)
2979 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
/* RCU-walk mode cannot issue RPCs; caller will retry in ref-walk mode */
2986 #ifdef MAY_NOT_BLOCK
2987 if (mask & MAY_NOT_BLOCK)
2989 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
2990 if (flags & IPERM_FLAG_RCU)
2994 /* as root inode are NOT getting validated in lookup operation,
2995 * need to do it before permission check. */
2997 if (inode == inode->i_sb->s_root->d_inode) {
2998 struct lookup_intent it = { .it_op = IT_LOOKUP };
3000 rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
3001 MDS_INODELOCK_LOOKUP);
3006 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
3007 inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
/* remote-ACL clients check permission against the MDT, not locally */
3009 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
3010 return lustre_check_remote_perm(inode, mask);
3012 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
3013 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
/*
 * Select vectored read/write entry-point names for the file_operations
 * tables below: old kernels expose .readv/.writev, newer ones
 * .aio_read/.aio_write.
 * NOTE(review): the #else/#endif lines of this conditional appear elided
 * in this listing.
 */
3018 #ifdef HAVE_FILE_READV
3019 #define READ_METHOD readv
3020 #define READ_FUNCTION ll_file_readv
3021 #define WRITE_METHOD writev
3022 #define WRITE_FUNCTION ll_file_writev
3024 #define READ_METHOD aio_read
3025 #define READ_FUNCTION ll_file_aio_read
3026 #define WRITE_METHOD aio_write
3027 #define WRITE_FUNCTION ll_file_aio_write
3030 /* -o localflock - only provides locally consistent flock locks */
/*
 * Default file_operations table: no .flock/.lock handlers, so flock falls
 * back to the kernel's locally-consistent implementation.
 * NOTE(review): the #endif lines and closing "};" are elided in this
 * listing.
 */
3031 struct file_operations ll_file_operations = {
3032 .read = ll_file_read,
3033 .READ_METHOD = READ_FUNCTION,
3034 .write = ll_file_write,
3035 .WRITE_METHOD = WRITE_FUNCTION,
3036 .unlocked_ioctl = ll_file_ioctl,
3037 .open = ll_file_open,
3038 .release = ll_file_release,
3039 .mmap = ll_file_mmap,
3040 .llseek = ll_file_seek,
3041 #ifdef HAVE_KERNEL_SENDFILE
3042 .sendfile = ll_file_sendfile,
3044 #ifdef HAVE_KERNEL_SPLICE_READ
3045 .splice_read = ll_file_splice_read,
/*
 * file_operations for "-o flock": identical to the default table plus
 * .flock/.lock wired to ll_file_flock for cluster-coherent locking.
 * NOTE(review): #endif lines and the closing "};" are elided in this
 * listing.
 */
3051 struct file_operations ll_file_operations_flock = {
3052 .read = ll_file_read,
3053 .READ_METHOD = READ_FUNCTION,
3054 .write = ll_file_write,
3055 .WRITE_METHOD = WRITE_FUNCTION,
3056 .unlocked_ioctl = ll_file_ioctl,
3057 .open = ll_file_open,
3058 .release = ll_file_release,
3059 .mmap = ll_file_mmap,
3060 .llseek = ll_file_seek,
3061 #ifdef HAVE_KERNEL_SENDFILE
3062 .sendfile = ll_file_sendfile,
3064 #ifdef HAVE_KERNEL_SPLICE_READ
3065 .splice_read = ll_file_splice_read,
3069 .flock = ll_file_flock,
3070 .lock = ll_file_flock
3073 /* These are for -o noflock - to return ENOSYS on flock calls */
/*
 * file_operations for "-o noflock": .flock/.lock point at ll_file_noflock
 * so userspace locking requests fail explicitly.
 * NOTE(review): #endif lines and the closing "};" are elided in this
 * listing.
 */
3074 struct file_operations ll_file_operations_noflock = {
3075 .read = ll_file_read,
3076 .READ_METHOD = READ_FUNCTION,
3077 .write = ll_file_write,
3078 .WRITE_METHOD = WRITE_FUNCTION,
3079 .unlocked_ioctl = ll_file_ioctl,
3080 .open = ll_file_open,
3081 .release = ll_file_release,
3082 .mmap = ll_file_mmap,
3083 .llseek = ll_file_seek,
3084 #ifdef HAVE_KERNEL_SENDFILE
3085 .sendfile = ll_file_sendfile,
3087 #ifdef HAVE_KERNEL_SPLICE_READ
3088 .splice_read = ll_file_splice_read,
3092 .flock = ll_file_noflock,
3093 .lock = ll_file_noflock
/*
 * inode_operations for regular files: attribute get/set, permission check,
 * xattr suite, and (when available) fiemap and get_acl.
 * NOTE(review): #endif lines and the closing "};" are elided in this
 * listing.
 */
3096 struct inode_operations ll_file_inode_operations = {
3097 .setattr = ll_setattr,
3098 .getattr = ll_getattr,
3099 .permission = ll_inode_permission,
3100 .setxattr = ll_setxattr,
3101 .getxattr = ll_getxattr,
3102 .listxattr = ll_listxattr,
3103 .removexattr = ll_removexattr,
3104 #ifdef HAVE_LINUX_FIEMAP_H
3105 .fiemap = ll_fiemap,
3107 #ifdef HAVE_IOP_GET_ACL
3108 .get_acl = ll_get_acl,
3112 /* dynamic ioctl number support routines */
/*
 * llioc: global registry of dynamically registered ioctl handlers, guarded
 * by the rw_semaphore ioc_sem.
 * NOTE(review): the "} llioc = {" line joining the struct to its
 * initializer, the "};" terminator and the llioc_data struct header are
 * elided in this listing.
 */
3113 static struct llioc_ctl_data {
3114 struct rw_semaphore ioc_sem;
3115 cfs_list_t ioc_head;
3117 __RWSEM_INITIALIZER(llioc.ioc_sem),
3118 CFS_LIST_HEAD_INIT(llioc.ioc_head)
/* one registered handler: callback plus the ioctl numbers it accepts */
3123 cfs_list_t iocd_list;
3124 unsigned int iocd_size;
3125 llioc_callback_t iocd_cb;
3126 unsigned int iocd_count;
/* old-style zero-length trailing array holding iocd_count ioctl numbers */
3127 unsigned int iocd_cmd[0];
/*
 * ll_iocontrol_register(): register a dynamic ioctl callback for `count`
 * ioctl numbers; returns an opaque magic (the allocated record) used later
 * by ll_iocontrol_unregister().
 * NOTE(review): NULL-return paths and the final RETURN(in_data) are elided
 * in this listing.  The memset after OBD_ALLOC is presumably redundant if
 * OBD_ALLOC zeroes — confirm before changing.
 */
3130 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3133 struct llioc_data *in_data = NULL;
3136 if (cb == NULL || cmd == NULL ||
3137 count > LLIOC_MAX_CMD || count < 0)
3140 size = sizeof(*in_data) + count * sizeof(unsigned int);
3141 OBD_ALLOC(in_data, size);
3142 if (in_data == NULL)
3145 memset(in_data, 0, sizeof(*in_data));
3146 in_data->iocd_size = size;
3147 in_data->iocd_cb = cb;
3148 in_data->iocd_count = count;
3149 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
/* publish under the write lock */
3151 down_write(&llioc.ioc_sem);
3152 cfs_list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3153 up_write(&llioc.ioc_sem);
/*
 * ll_iocontrol_unregister(): find the record matching `magic` in the llioc
 * list, unlink and free it.  Warns if no matching registration exists.
 * NOTE(review): the magic==NULL early return, the "tmp == magic" match
 * test and the returns are elided in this listing.
 */
3158 void ll_iocontrol_unregister(void *magic)
3160 struct llioc_data *tmp;
3165 down_write(&llioc.ioc_sem);
3166 cfs_list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* size must be captured before the entry is freed */
3168 unsigned int size = tmp->iocd_size;
3170 cfs_list_del(&tmp->iocd_list);
3171 up_write(&llioc.ioc_sem);
3173 OBD_FREE(tmp, size);
3177 up_write(&llioc.ioc_sem);
3179 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3182 EXPORT_SYMBOL(ll_iocontrol_register);
3183 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * ll_iocontrol_call(): dispatch `cmd` to registered dynamic ioctl handlers
 * in registration order; stops at the first handler returning LLIOC_STOP
 * and reports its rc through *rcp.
 * NOTE(review): the loop braces, the *rcp assignment and the final return
 * are elided in this listing.
 */
3185 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
3186 unsigned int cmd, unsigned long arg, int *rcp)
3188 enum llioc_iter ret = LLIOC_CONT;
3189 struct llioc_data *data;
3190 int rc = -EINVAL, i;
3192 down_read(&llioc.ioc_sem);
3193 cfs_list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3194 for (i = 0; i < data->iocd_count; i++) {
3195 if (cmd != data->iocd_cmd[i])
3198 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3202 if (ret == LLIOC_STOP)
3205 up_read(&llioc.ioc_sem);
/*
 * ll_layout_conf(): push a layout configuration to the cl_object layer.
 * For OBJECT_CONF_SET, the layout lock is allowed to be matched only after
 * the layout has been applied to the inode.
 * NOTE(review): the RETURN statements and IS_ERR(env) check line are
 * elided in this listing.
 */
3212 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3214 struct ll_inode_info *lli = ll_i2info(inode);
3215 struct cl_env_nest nest;
3220 if (lli->lli_clob == NULL)
3223 env = cl_env_nested_get(&nest);
3225 RETURN(PTR_ERR(env));
3227 result = cl_conf_set(env, lli->lli_clob, conf);
3228 cl_env_nested_put(&nest, env);
3230 if (conf->coc_opc == OBJECT_CONF_SET) {
3231 struct ldlm_lock *lock = conf->coc_lock;
3233 LASSERT(lock != NULL);
3234 LASSERT(ldlm_has_layout(lock));
3236 /* it can only be allowed to match after layout is
3237 * applied to inode otherwise false layout would be
3238 * seen. Applying layout should happen before dropping
3239 * the intent lock. */
3240 ldlm_lock_allow_match(lock);
3246 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
/*
 * ll_layout_fetch(): when the layout lock was granted via completion AST
 * (so no LVB layout came back with the reply), fetch the LOV EA with a
 * getxattr RPC and install it as the lock's l_lvb_data.
 * NOTE(review): declarations (lvbdata/lmm/lmmsize/rc), several GOTO/error
 * lines, the capa_put, the out: label and final RETURN are elided in this
 * listing.
 */
3247 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3250 struct ll_sb_info *sbi = ll_i2sbi(inode);
3251 struct obd_capa *oc;
3252 struct ptlrpc_request *req;
3253 struct mdt_body *body;
/* already have the layout LVB — nothing to fetch */
3260 if (lock->l_lvb_data != NULL)
3263 /* if layout lock was granted right away, the layout is returned
3264 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3265 * blocked and then granted via completion ast, we have to fetch
3266 * layout here. Please note that we can't use the LVB buffer in
3267 * completion AST because it doesn't have a large enough buffer */
3268 oc = ll_mdscapa_get(inode);
3269 rc = ll_get_max_mdsize(sbi, &lmmsize);
3271 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3272 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3278 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3279 if (body == NULL || body->eadatasize > lmmsize)
3280 GOTO(out, rc = -EPROTO);
3282 lmmsize = body->eadatasize;
3283 if (lmmsize == 0) /* empty layout */
3286 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3288 GOTO(out, rc = -EFAULT);
3290 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3291 if (lvbdata == NULL)
3292 GOTO(out, rc = -ENOMEM);
3294 memcpy(lvbdata, lmm, lmmsize);
/* install under the res lock; a racer may have installed LVB first */
3295 lock_res_and_lock(lock);
3296 if (lock->l_lvb_data == NULL) {
3297 lock->l_lvb_data = lvbdata;
3298 lock->l_lvb_len = lmmsize;
3301 unlock_res_and_lock(lock);
/* lost the race: free our copy */
3303 if (lvbdata != NULL)
3304 OBD_FREE_LARGE(lvbdata, lmmsize);
3308 ptlrpc_req_finished(req);
3313 * Apply the layout to the inode. Layout lock is held and will be released
/*
 * ll_layout_lock_set(): take the layout out of the lock's LVB, unpack it,
 * configure the cl_object with it and return the layout generation in
 * *gen.  Drops the lock reference before returning; if the configuration
 * hit -EBUSY, waits for outstanding IO (OBJECT_CONF_WAIT) and retries.
 * NOTE(review): declarations (lvb_ready/rc), braces, GOTO/out labels and
 * the retry jump are elided in this listing — the exact control flow
 * around "out" cannot be confirmed from this text alone.
 */
3316 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3317 struct inode *inode, __u32 *gen, bool reconf)
3319 struct ll_inode_info *lli = ll_i2info(inode);
3320 struct ll_sb_info *sbi = ll_i2sbi(inode);
3321 struct ldlm_lock *lock;
3322 struct lustre_md md = { NULL };
3323 struct cl_object_conf conf;
3326 bool wait_layout = false;
3329 LASSERT(lustre_handle_is_used(lockh));
3331 lock = ldlm_handle2lock(lockh);
3332 LASSERT(lock != NULL);
3333 LASSERT(ldlm_has_layout(lock));
3335 LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
3336 inode, PFID(&lli->lli_fid), reconf);
3338 /* in case this is a caching lock and reinstate with new inode */
3339 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3341 lock_res_and_lock(lock);
3342 lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
3343 unlock_res_and_lock(lock);
3344 /* checking lvb_ready is racy but this is okay. The worst case is
3345 * that multi processes may configure the file on the same time. */
3346 if (lvb_ready || !reconf) {
3349 /* layout_gen must be valid if layout lock is not
3350 * cancelled and stripe has already set */
3351 *gen = lli->lli_layout_gen;
/* LVB missing: fetch the layout EA from the MDT first */
3357 rc = ll_layout_fetch(inode, lock);
3361 /* for layout lock, lmm is returned in lock's lvb.
3362 * lvb_data is immutable if the lock is held so it's safe to access it
3363 * without res lock. See the description in ldlm_lock_decref_internal()
3364 * for the condition to free lvb_data of layout lock */
3365 if (lock->l_lvb_data != NULL) {
3366 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3367 lock->l_lvb_data, lock->l_lvb_len);
3369 *gen = LL_LAYOUT_GEN_EMPTY;
3371 *gen = md.lsm->lsm_layout_gen;
3374 CERROR("%s: file "DFID" unpackmd error: %d\n",
3375 ll_get_fsname(inode->i_sb, NULL, 0),
3376 PFID(&lli->lli_fid), rc);
3382 /* set layout to file. Unlikely this will fail as old layout was
3383 * surely eliminated */
3384 memset(&conf, 0, sizeof conf);
3385 conf.coc_opc = OBJECT_CONF_SET;
3386 conf.coc_inode = inode;
3387 conf.coc_lock = lock;
3388 conf.u.coc_md = &md;
3389 rc = ll_layout_conf(inode, &conf);
3392 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3394 /* refresh layout failed, need to wait */
3395 wait_layout = rc == -EBUSY;
3399 LDLM_LOCK_PUT(lock);
3400 ldlm_lock_decref(lockh, mode);
3402 /* wait for IO to complete if it's still being used. */
3404 CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
3405 ll_get_fsname(inode->i_sb, NULL, 0),
3406 inode, PFID(&lli->lli_fid));
3408 memset(&conf, 0, sizeof conf);
3409 conf.coc_opc = OBJECT_CONF_WAIT;
3410 conf.coc_inode = inode;
3411 rc = ll_layout_conf(inode, &conf);
3415 CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
3416 PFID(&lli->lli_fid), rc);
3422 * This function checks if there exists a LAYOUT lock on the client side,
3423 * or enqueues it if it doesn't have one in cache.
3425 * This function will not hold the layout lock, so it may be revoked any time
3426 * after this function returns. Any operations that depend on the layout
3429 * This function should be called before lov_io_init() to get an up-to-date
3430 * layout version; the caller should save the version number, and after IO
3431 * is finished, this function should be called again to verify that the
3432 * layout has not changed during the IO.
3434 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3436 struct ll_inode_info *lli = ll_i2info(inode);
3437 struct ll_sb_info *sbi = ll_i2sbi(inode);
3438 struct md_op_data *op_data;
3439 struct lookup_intent it;
3440 struct lustre_handle lockh;
3442 struct ldlm_enqueue_info einfo = {
3443 .ei_type = LDLM_IBITS,
3445 .ei_cb_bl = ll_md_blocking_ast,
3446 .ei_cb_cp = ldlm_completion_ast,
3451 *gen = lli->lli_layout_gen;
3452 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK))
3456 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3457 LASSERT(S_ISREG(inode->i_mode));
3459 /* mostly layout lock is caching on the local side, so try to match
3460 * it before grabbing layout lock mutex. */
3461 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
3462 if (mode != 0) { /* hit cached lock */
3463 rc = ll_layout_lock_set(&lockh, mode, inode, gen, false);
3467 /* better hold lli_layout_mutex to try again otherwise
3468 * it will have starvation problem. */
3471 /* take layout lock mutex to enqueue layout lock exclusively. */
3472 mutex_lock(&lli->lli_layout_mutex);
3475 /* try again. Maybe somebody else has done this. */
3476 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
3477 if (mode != 0) { /* hit cached lock */
3478 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3482 mutex_unlock(&lli->lli_layout_mutex);
3486 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3487 0, 0, LUSTRE_OPC_ANY, NULL);
3488 if (IS_ERR(op_data)) {
3489 mutex_unlock(&lli->lli_layout_mutex);
3490 RETURN(PTR_ERR(op_data));
3493 /* have to enqueue one */
3494 memset(&it, 0, sizeof(it));
3495 it.it_op = IT_LAYOUT;
3496 lockh.cookie = 0ULL;
3498 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
3499 ll_get_fsname(inode->i_sb, NULL, 0), inode,
3500 PFID(&lli->lli_fid));
3502 rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
3504 if (it.d.lustre.it_data != NULL)
3505 ptlrpc_req_finished(it.d.lustre.it_data);
3506 it.d.lustre.it_data = NULL;
3508 ll_finish_md_op_data(op_data);
3510 mode = it.d.lustre.it_lock_mode;
3511 it.d.lustre.it_lock_mode = 0;
3512 ll_intent_drop_lock(&it);
3515 /* set lock data in case this is a new lock */
3516 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3517 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3521 mutex_unlock(&lli->lli_layout_mutex);