4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2013, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include "llite_internal.h"
49 #include <lustre/ll_fiemap.h>
51 #include "cl_object.h"
/*
 * Allocate a per-open ll_file_data from ll_file_data_slab and clear its
 * write-failure flag.
 * NOTE(review): fd is dereferenced with no visible NULL check after the
 * slab allocation -- confirm the check exists in the elided lines.
 */
53 struct ll_file_data *ll_file_data_get(void)
55 struct ll_file_data *fd;
57 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, CFS_ALLOC_IO);
58 fd->fd_write_failed = false;
/* Release a ll_file_data back to its slab cache (counterpart of
 * ll_file_data_get()). */
62 static void ll_file_data_put(struct ll_file_data *fd)
65 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Copy the inode's current attributes (mode, times, size, blocks, flags),
 * its FID, I/O epoch, MDS capability and the open handle @fh into
 * @op_data, for sending to the MDS.  Sets MDS_DATA_MODIFIED in op_bias
 * when the inode carries the LLIF_DATA_MODIFIED flag.
 */
68 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
69 struct lustre_handle *fh)
71 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
72 op_data->op_attr.ia_mode = inode->i_mode;
73 op_data->op_attr.ia_atime = inode->i_atime;
74 op_data->op_attr.ia_mtime = inode->i_mtime;
75 op_data->op_attr.ia_ctime = inode->i_ctime;
76 op_data->op_attr.ia_size = i_size_read(inode);
77 op_data->op_attr_blocks = inode->i_blocks;
/* ia_attr_flags lives in the Lustre-extended iattr, hence the cast. */
78 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
79 ll_inode_to_ext_flags(inode->i_flags);
80 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
82 op_data->op_handle = *fh;
83 op_data->op_capa1 = ll_mdscapa_get(inode);
85 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
86 op_data->op_bias |= MDS_DATA_MODIFIED;
90 * Closes the IO epoch and packs all the attributes into @op_data for
/*
 * Prepare @op_data for an MDS close: mark which attributes are valid,
 * include size/blocks only when Size-on-MDS is unsupported or the file
 * is not regular, close the I/O epoch, and pack the inode attributes
 * together with the open handle @och->och_fh.
 */
93 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
94 struct obd_client_handle *och)
98 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
99 ATTR_MTIME | ATTR_MTIME_SET |
100 ATTR_CTIME | ATTR_CTIME_SET;
102 if (!(och->och_flags & FMODE_WRITE))
105 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
106 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
108 ll_ioepoch_close(inode, op_data, &och, 0);
111 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
112 ll_prep_md_op_data(op_data, inode, NULL, NULL,
113 0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Send the MDS close RPC for open handle @och: prepare the close op_data,
 * call md_close(), run the Size-on-MDS update when the MDS requests it,
 * clear LLIF_DATA_MODIFIED on success, destroy OST objects referenced by
 * the close reply, and finally clear the open replay data and poison the
 * handle cookie.
 */
117 static int ll_close_inode_openhandle(struct obd_export *md_exp,
119 struct obd_client_handle *och)
121 struct obd_export *exp = ll_i2mdexp(inode);
122 struct md_op_data *op_data;
123 struct ptlrpc_request *req = NULL;
124 struct obd_device *obd = class_exp2obd(exp);
131 * XXX: in case of LMV, is this correct to access
134 CERROR("Invalid MDC connection handle "LPX64"\n",
135 ll_i2mdexp(inode)->exp_handle.h_cookie);
139 OBD_ALLOC_PTR(op_data);
141 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
143 ll_prepare_close(inode, op_data, och);
144 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
145 rc = md_close(md_exp, op_data, och->och_mod, &req);
147 /* This close must have the epoch closed. */
148 LASSERT(epoch_close);
149 /* MDS has instructed us to obtain Size-on-MDS attribute from
150 * OSTs and send setattr back to MDS. */
151 rc = ll_som_update(inode, op_data);
153 CERROR("inode %lu mdc Size-on-MDS update failed: "
154 "rc = %d\n", inode->i_ino, rc);
158 CERROR("inode %lu mdc close failed: rc = %d\n",
162 /* DATA_MODIFIED flag was successfully sent on close, cancel data
163 * modification flag. */
164 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
165 struct ll_inode_info *lli = ll_i2info(inode);
167 spin_lock(&lli->lli_lock);
168 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
169 spin_unlock(&lli->lli_lock);
172 ll_finish_md_op_data(op_data);
175 rc = ll_objects_destroy(req, inode);
177 CERROR("inode %lu ll_objects destroy: rc = %d\n",
/* If SOM is enabled and the epoch is still open for a written regular
 * file, queue the deferred DONE_WRITING work instead of finishing now. */
184 if (exp_connect_som(exp) && !epoch_close &&
185 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
186 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
188 md_clear_open_replay_data(md_exp, och);
189 /* Free @och if it is not waiting for DONE_WRITING. */
190 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
193 if (req) /* This is close request */
194 ptlrpc_req_finished(req);
/*
 * Really close the per-mode MDS open handle (write/exec/read, chosen by
 * @flags) if no other user of that handle remains.  Detaches the handle
 * under lli_och_mutex, then closes it outside the mutex via
 * ll_close_inode_openhandle().
 */
198 int ll_md_real_close(struct inode *inode, int flags)
200 struct ll_inode_info *lli = ll_i2info(inode);
201 struct obd_client_handle **och_p;
202 struct obd_client_handle *och;
207 if (flags & FMODE_WRITE) {
208 och_p = &lli->lli_mds_write_och;
209 och_usecount = &lli->lli_open_fd_write_count;
210 } else if (flags & FMODE_EXEC) {
211 och_p = &lli->lli_mds_exec_och;
212 och_usecount = &lli->lli_open_fd_exec_count;
214 LASSERT(flags & FMODE_READ);
215 och_p = &lli->lli_mds_read_och;
216 och_usecount = &lli->lli_open_fd_read_count;
219 mutex_lock(&lli->lli_och_mutex);
220 if (*och_usecount) { /* There are still users of this handle, so
222 mutex_unlock(&lli->lli_och_mutex);
227 mutex_unlock(&lli->lli_och_mutex);
229 if (och) { /* There might be a race and somebody have freed this och
231 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-file-descriptor close: drop any group lock, decrement the
 * per-open-mode use count, and skip the MDS close RPC entirely when a
 * matching OPEN DLM lock is still cached (md_lock_match with
 * LDLM_FL_TEST_LOCK).  Always frees the ll_file_data and closes the
 * capability.
 */
238 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
241 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
242 struct ll_inode_info *lli = ll_i2info(inode);
246 /* clear group lock, if present */
247 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
248 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
250 /* Let's see if we have good enough OPEN lock on the file and if
251 we can skip talking to MDS */
252 if (file->f_dentry->d_inode) { /* Can this ever be false? */
254 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
255 struct lustre_handle lockh;
256 struct inode *inode = file->f_dentry->d_inode;
257 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
259 mutex_lock(&lli->lli_och_mutex);
260 if (fd->fd_omode & FMODE_WRITE) {
262 LASSERT(lli->lli_open_fd_write_count);
263 lli->lli_open_fd_write_count--;
264 } else if (fd->fd_omode & FMODE_EXEC) {
266 LASSERT(lli->lli_open_fd_exec_count);
267 lli->lli_open_fd_exec_count--;
270 LASSERT(lli->lli_open_fd_read_count);
271 lli->lli_open_fd_read_count--;
273 mutex_unlock(&lli->lli_och_mutex);
/* No cached OPEN lock -> must do the real close to the MDS. */
275 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
276 LDLM_IBITS, &policy, lockmode,
278 rc = ll_md_real_close(file->f_dentry->d_inode,
282 CERROR("Releasing a file %p with negative dentry %p. Name %s",
283 file, file->f_dentry, file->f_dentry->d_name.name);
286 LUSTRE_FPRIVATE(file) = NULL;
287 ll_file_data_put(fd);
288 ll_capa_close(inode);
293 /* While this returns an error code, fput() the caller does not, so we need
294 * to make every effort to clean up all of our state here. Also, applications
295 * rarely check close errors and even if an error is returned they will not
296 * re-try the close call.
/*
 * VFS ->release() handler: tear down remote-ACL state for the root inode,
 * stop the statahead thread if this fd started it, flush async write
 * errors for regular files, and perform the MDS close via ll_md_close().
 * The root dentry short-circuits (its fd is freed directly, no MDS close
 * visible here).
 */
298 int ll_file_release(struct inode *inode, struct file *file)
300 struct ll_file_data *fd;
301 struct ll_sb_info *sbi = ll_i2sbi(inode);
302 struct ll_inode_info *lli = ll_i2info(inode);
306 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
307 inode->i_generation, inode);
309 #ifdef CONFIG_FS_POSIX_ACL
/* Remote-client ACL cleanup applies only to the filesystem root. */
310 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
311 inode == inode->i_sb->s_root->d_inode) {
312 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
315 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
316 fd->fd_flags &= ~LL_FILE_RMTACL;
317 rct_del(&sbi->ll_rct, cfs_curproc_pid());
318 et_search_free(&sbi->ll_et, cfs_curproc_pid());
323 if (inode->i_sb->s_root != file->f_dentry)
324 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
325 fd = LUSTRE_FPRIVATE(file);
328 /* The last ref on @file, maybe not the owner pid of statahead.
329 * Different processes can open the same dir, "ll_opendir_key" means:
330 * it is me that should stop the statahead thread. */
331 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
332 lli->lli_opendir_pid != 0)
333 ll_stop_statahead(inode, lli->lli_opendir_key);
335 if (inode->i_sb->s_root == file->f_dentry) {
336 LUSTRE_FPRIVATE(file) = NULL;
337 ll_file_data_put(fd);
/* Propagate any asynchronous write error recorded on the cl_object. */
341 if (!S_ISDIR(inode->i_mode)) {
342 lov_read_and_clear_async_rc(lli->lli_clob);
343 lli->lli_async_rc = 0;
346 rc = ll_md_close(sbi->ll_md_exp, inode, file);
348 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
349 libcfs_debug_dumplog();
/*
 * Perform an intent-based open against the MDS (md_intent_lock) for
 * @file, optionally carrying striping info (@lmm/@lmmsize) when called
 * from setstripe.  Requests an OPEN lock only for a plain open, updates
 * the inode from the reply, and attaches lock data on success.
 */
354 static int ll_intent_file_open(struct file *file, void *lmm,
355 int lmmsize, struct lookup_intent *itp)
357 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
358 struct dentry *parent = file->f_dentry->d_parent;
359 const char *name = file->f_dentry->d_name.name;
360 const int len = file->f_dentry->d_name.len;
361 struct md_op_data *op_data;
362 struct ptlrpc_request *req;
363 __u32 opc = LUSTRE_OPC_ANY;
370 /* Usually we come here only for NFSD, and we want open lock.
371 But we can also get here with pre 2.6.15 patchless kernels, and in
372 that case that lock is also ok */
373 /* We can also get here if there was cached open handle in revalidate_it
374 * but it disappeared while we were getting from there to ll_file_open.
375 * But this means this file was closed and immediately opened which
376 * makes a good candidate for using OPEN lock */
377 /* If lmmsize & lmm are not 0, we are just setting stripe info
378 * parameters. No need for the open lock */
379 if (lmm == NULL && lmmsize == 0) {
380 itp->it_flags |= MDS_OPEN_LOCK;
381 if (itp->it_flags & FMODE_WRITE)
382 opc = LUSTRE_OPC_CREATE;
385 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
386 file->f_dentry->d_inode, name, len,
389 RETURN(PTR_ERR(op_data));
391 itp->it_flags |= MDS_OPEN_BY_FID;
392 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
393 0 /*unused */, &req, ll_md_blocking_ast, 0);
394 ll_finish_md_op_data(op_data);
396 /* reason to keep our own exit path - don't flood the log
397 * with messages with -ESTALE errors.
399 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
400 it_open_error(DISP_OPEN_OPEN, itp))
402 ll_release_openhandle(file->f_dentry, itp);
406 if (it_disposition(itp, DISP_LOOKUP_NEG))
407 GOTO(out, rc = -ENOENT);
409 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
410 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
411 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
415 rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
416 if (!rc && itp->d.lustre.it_lock_mode)
417 ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
421 ptlrpc_req_finished(itp->d.lustre.it_data);
422 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
423 ll_intent_drop_lock(itp);
429 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
430 * not believe attributes if a few ioepoch holders exist. Attributes for
431 * previous ioepoch if new one is opened are also skipped by MDS.
/* Record the new I/O epoch on the inode; no-op if @ioepoch is zero or
 * unchanged. */
433 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
435 if (ioepoch && lli->lli_ioepoch != ioepoch) {
436 lli->lli_ioepoch = ioepoch;
437 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
438 ioepoch, PFID(&lli->lli_fid));
/*
 * Initialize @och from the MDS open reply carried by intent @it: copy the
 * server's open handle, magic, FID and flags, adopt the new I/O epoch,
 * and register the open for RPC replay.  Returns md_set_open_replay_data()
 * result.
 */
442 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
443 struct lookup_intent *it, struct obd_client_handle *och)
445 struct ptlrpc_request *req = it->d.lustre.it_data;
446 struct mdt_body *body;
450 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
451 LASSERT(body != NULL); /* reply already checked out */
453 memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
454 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
455 och->och_fid = lli->lli_fid;
456 och->och_flags = it->it_flags;
457 ll_ioepoch_open(lli, body->ioepoch);
459 return md_set_open_replay_data(md_exp, och, req);
/*
 * Finish the client-side part of an open: fill @och from the intent reply
 * (when a new handle is being established), then attach @fd to the file,
 * initialize readahead state and record the open mode.
 */
462 int ll_local_open(struct file *file, struct lookup_intent *it,
463 struct ll_file_data *fd, struct obd_client_handle *och)
465 struct inode *inode = file->f_dentry->d_inode;
466 struct ll_inode_info *lli = ll_i2info(inode);
469 LASSERT(!LUSTRE_FPRIVATE(file));
474 struct ptlrpc_request *req = it->d.lustre.it_data;
475 struct mdt_body *body;
478 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
482 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
483 if ((it->it_flags & FMODE_WRITE) &&
484 (body->valid & OBD_MD_FLSIZE))
485 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
486 lli->lli_ioepoch, PFID(&lli->lli_fid));
489 LUSTRE_FPRIVATE(file) = fd;
490 ll_readahead_init(inode, &fd->fd_ras);
491 fd->fd_omode = it->it_flags;
495 /* Open a file, and (for the very first open) create objects on the OSTs at
496 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
497 * creation or open until ll_lov_setstripe() ioctl is called.
499 * If we already have the stripe MD locally then we don't request it in
500 * md_open(), by passing a lmm_size = 0.
502 * It is up to the application to ensure no other processes open this file
503 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
504 * used. We might be able to avoid races of that sort by getting lli_open_sem
505 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
506 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
508 int ll_file_open(struct inode *inode, struct file *file)
510 struct ll_inode_info *lli = ll_i2info(inode);
511 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
512 .it_flags = file->f_flags };
513 struct obd_client_handle **och_p = NULL;
514 __u64 *och_usecount = NULL;
515 struct ll_file_data *fd;
516 int rc = 0, opendir_set = 0;
519 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
520 inode->i_generation, inode, file->f_flags);
522 it = file->private_data; /* XXX: compat macro */
523 file->private_data = NULL; /* prevent ll_local_open assertion */
525 fd = ll_file_data_get();
527 GOTO(out_och_free, rc = -ENOMEM);
/* First open of a directory by this process may claim the statahead
 * ownership key (see ll_file_release for the matching teardown). */
530 if (S_ISDIR(inode->i_mode)) {
531 spin_lock(&lli->lli_sa_lock);
532 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
533 lli->lli_opendir_pid == 0) {
534 lli->lli_opendir_key = fd;
535 lli->lli_opendir_pid = cfs_curproc_pid();
538 spin_unlock(&lli->lli_sa_lock);
/* Root dentry: no MDS open needed, just attach the fd. */
541 if (inode->i_sb->s_root == file->f_dentry) {
542 LUSTRE_FPRIVATE(file) = fd;
/* No intent from lookup (e.g. NFSD path): build our own IT_OPEN
 * intent from f_flags. */
546 if (!it || !it->d.lustre.it_disposition) {
547 /* Convert f_flags into access mode. We cannot use file->f_mode,
548 * because everything but O_ACCMODE mask was stripped from
550 if ((oit.it_flags + 1) & O_ACCMODE)
552 if (file->f_flags & O_TRUNC)
553 oit.it_flags |= FMODE_WRITE;
555 /* kernel only call f_op->open in dentry_open. filp_open calls
556 * dentry_open after call to open_namei that checks permissions.
557 * Only nfsd_open call dentry_open directly without checking
558 * permissions and because of that this code below is safe. */
559 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
560 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
562 /* We do not want O_EXCL here, presumably we opened the file
563 * already? XXX - NFS implications? */
564 oit.it_flags &= ~O_EXCL;
566 /* bug20584, if "it_flags" contains O_CREAT, the file will be
567 * created if necessary, then "IT_CREAT" should be set to keep
568 * consistent with it */
569 if (oit.it_flags & O_CREAT)
570 oit.it_op |= IT_CREAT;
576 /* Let's see if we have file open on MDS already. */
577 if (it->it_flags & FMODE_WRITE) {
578 och_p = &lli->lli_mds_write_och;
579 och_usecount = &lli->lli_open_fd_write_count;
580 } else if (it->it_flags & FMODE_EXEC) {
581 och_p = &lli->lli_mds_exec_och;
582 och_usecount = &lli->lli_open_fd_exec_count;
584 och_p = &lli->lli_mds_read_och;
585 och_usecount = &lli->lli_open_fd_read_count;
588 mutex_lock(&lli->lli_och_mutex);
589 if (*och_p) { /* Open handle is present */
590 if (it_disposition(it, DISP_OPEN_OPEN)) {
591 /* Well, there's extra open request that we do not need,
592 let's close it somehow. This will decref request. */
593 rc = it_open_error(DISP_OPEN_OPEN, it);
595 mutex_unlock(&lli->lli_och_mutex);
596 GOTO(out_openerr, rc);
599 ll_release_openhandle(file->f_dentry, it);
/* Reuse the existing MDS handle; NULL och means "already filled". */
603 rc = ll_local_open(file, it, fd, NULL);
606 mutex_unlock(&lli->lli_och_mutex);
607 GOTO(out_openerr, rc);
610 LASSERT(*och_usecount == 0);
611 if (!it->d.lustre.it_disposition) {
612 /* We cannot just request lock handle now, new ELC code
613 means that one of other OPEN locks for this file
614 could be cancelled, and since blocking ast handler
615 would attempt to grab och_mutex as well, that would
616 result in a deadlock */
617 mutex_unlock(&lli->lli_och_mutex);
618 it->it_create_mode |= M_CHECK_STALE;
619 rc = ll_intent_file_open(file, NULL, 0, it);
620 it->it_create_mode &= ~M_CHECK_STALE;
622 GOTO(out_openerr, rc);
626 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
628 GOTO(out_och_free, rc = -ENOMEM);
632 /* md_intent_lock() didn't get a request ref if there was an
633 * open error, so don't do cleanup on the request here
635 /* XXX (green): Should not we bail out on any error here, not
636 * just open error? */
637 rc = it_open_error(DISP_OPEN_OPEN, it);
639 GOTO(out_och_free, rc);
641 LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
643 rc = ll_local_open(file, it, fd, *och_p);
645 GOTO(out_och_free, rc);
647 mutex_unlock(&lli->lli_och_mutex);
650 /* Must do this outside lli_och_mutex lock to prevent deadlock where
651 different kind of OPEN lock for this same inode gets cancelled
652 by ldlm_cancel_lru */
653 if (!S_ISREG(inode->i_mode))
654 GOTO(out_och_free, rc);
/* No striping yet: honour O_LOV_DELAY_CREATE / read-only opens by
 * deferring OST object creation. */
658 if (!lli->lli_has_smd) {
659 if (file->f_flags & O_LOV_DELAY_CREATE ||
660 !(file->f_mode & FMODE_WRITE)) {
661 CDEBUG(D_INODE, "object creation was delayed\n");
662 GOTO(out_och_free, rc);
665 file->f_flags &= ~O_LOV_DELAY_CREATE;
666 GOTO(out_och_free, rc);
/* Error/cleanup paths below: free the handle, undo statahead claim,
 * release fd, drop the extra open-intent request reference. */
670 if (och_p && *och_p) {
671 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
672 *och_p = NULL; /* OBD_FREE writes some magic there */
675 mutex_unlock(&lli->lli_och_mutex);
678 if (opendir_set != 0)
679 ll_stop_statahead(inode, lli->lli_opendir_key);
681 ll_file_data_put(fd);
683 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
686 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
687 ptlrpc_req_finished(it->d.lustre.it_data);
688 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
694 /* Fills the obdo with the attributes for the lsm */
/*
 * Issue an async OST getattr for @lsm into @obdo and wait for completion.
 * @sync requests the getattr under a server-side lock (OBD_FL_SRVLOCK);
 * on success only the OST-authoritative fields are kept valid in o_valid.
 */
695 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
696 struct obd_capa *capa, struct obdo *obdo,
697 __u64 ioepoch, int sync)
699 struct ptlrpc_request_set *set;
700 struct obd_info oinfo = { { { 0 } } };
705 LASSERT(lsm != NULL);
709 oinfo.oi_oa->o_oi = lsm->lsm_oi;
710 oinfo.oi_oa->o_mode = S_IFREG;
711 oinfo.oi_oa->o_ioepoch = ioepoch;
712 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
713 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
714 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
715 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
716 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
717 OBD_MD_FLDATAVERSION;
718 oinfo.oi_capa = capa;
720 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
721 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
724 set = ptlrpc_prep_set();
726 CERROR("can't allocate ptlrpc set\n");
729 rc = obd_getattr_async(exp, &oinfo, set);
731 rc = ptlrpc_set_wait(set);
732 ptlrpc_set_destroy(set);
/* Keep only attributes the OSTs are authoritative for. */
735 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
736 OBD_MD_FLATIME | OBD_MD_FLMTIME |
737 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
738 OBD_MD_FLDATAVERSION);
743 * Performs the getattr on the inode and updates its fields.
744 * If @sync != 0, perform the getattr under the server-side lock.
/* Fetch OST attributes for @inode's stripes via ll_lsm_getattr() and
 * refresh the VFS inode from the resulting obdo. */
746 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
747 __u64 ioepoch, int sync)
749 struct obd_capa *capa = ll_mdscapa_get(inode);
750 struct lov_stripe_md *lsm;
754 lsm = ccc_inode_lsm_get(inode);
755 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
756 capa, obdo, ioepoch, sync);
759 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
761 obdo_refresh_inode(inode, obdo, obdo->o_valid);
762 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
763 " blksize %lu\n", POSTID(oi), i_size_read(inode),
764 (unsigned long long)inode->i_blocks,
765 (unsigned long)ll_inode_blksize(inode));
767 ccc_inode_lsm_put(inode, lsm);
/*
 * Merge MDS-provided timestamps (cached in lli_lvb) with the attributes
 * held by the cl_object (OST side): take the newer of each timestamp,
 * adopt the OST size and blocks, all under the inode size lock.
 */
771 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
773 struct ll_inode_info *lli = ll_i2info(inode);
774 struct cl_object *obj = lli->lli_clob;
775 struct cl_attr *attr = ccc_env_thread_attr(env);
781 ll_inode_size_lock(inode);
782 /* merge the timestamps most recently obtained from the MDS with
783 the timestamps obtained from the OSTs */
784 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
785 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
786 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
787 inode_init_lvb(inode, &lvb);
789 cl_object_attr_lock(obj);
790 rc = cl_object_attr_get(env, obj, attr);
791 cl_object_attr_unlock(obj);
/* Keep whichever timestamp is more recent, per field. */
794 if (lvb.lvb_atime < attr->cat_atime)
795 lvb.lvb_atime = attr->cat_atime;
796 if (lvb.lvb_ctime < attr->cat_ctime)
797 lvb.lvb_ctime = attr->cat_ctime;
798 if (lvb.lvb_mtime < attr->cat_mtime)
799 lvb.lvb_mtime = attr->cat_mtime;
801 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
802 PFID(&lli->lli_fid), attr->cat_size);
803 cl_isize_write_nolock(inode, attr->cat_size);
805 inode->i_blocks = attr->cat_blocks;
807 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
808 LTIME_S(inode->i_atime) = lvb.lvb_atime;
809 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
811 ll_inode_size_unlock(inode);
/* Glimpse helper for ioctl: fetch current OST attributes for @lsm and
 * copy size/blocks/timestamps into the caller-supplied stat buffer. */
816 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
819 struct obdo obdo = { 0 };
822 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
824 st->st_size = obdo.o_size;
825 st->st_blocks = obdo.o_blocks;
826 st->st_mtime = obdo.o_mtime;
827 st->st_atime = obdo.o_atime;
828 st->st_ctime = obdo.o_ctime;
/*
 * Initialize a cl_io for this file: propagate O_NONBLOCK/O_APPEND/sync
 * flags, attach the cl_object, and pick the DLM locking mode (never for
 * nolock mounts, mandatory for append, otherwise "maybe").
 */
833 void ll_io_init(struct cl_io *io, const struct file *file, int write)
835 struct inode *inode = file->f_dentry->d_inode;
837 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
839 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
840 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
841 file->f_flags & O_DIRECT ||
844 io->ci_obj = ll_i2info(inode)->lli_clob;
845 io->ci_lockreq = CILR_MAYBE;
846 if (ll_file_nolock(file)) {
847 io->ci_lockreq = CILR_NEVER;
848 io->ci_no_srvlock = 1;
849 } else if (file->f_flags & O_APPEND) {
850 io->ci_lockreq = CILR_MANDATORY;
/*
 * Common read/write engine for all entry points (readv/writev, aio,
 * sendfile, splice).  Sets up the cl_io from @args, takes lli_write_mutex
 * for non-group-locked writes and lli_trunc_sem for reads, runs
 * cl_io_loop(), updates *ppos and the per-mount I/O statistics, and
 * tracks write failure on @fd.
 */
855 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
856 struct file *file, enum cl_io_type iot,
857 loff_t *ppos, size_t count)
859 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
860 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
866 io = ccc_env_thread_io(env);
867 ll_io_init(io, file, iot == CIT_WRITE);
869 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
870 struct vvp_io *vio = vvp_env_io(env);
871 struct ccc_io *cio = ccc_env_io(env);
872 int write_mutex_locked = 0;
874 cio->cui_fd = LUSTRE_FPRIVATE(file);
875 vio->cui_io_subtype = args->via_io_subtype;
/* Dispatch per I/O subtype: normal iovec, sendfile, splice. */
877 switch (vio->cui_io_subtype) {
879 cio->cui_iov = args->u.normal.via_iov;
880 cio->cui_nrsegs = args->u.normal.via_nrsegs;
881 cio->cui_tot_nrsegs = cio->cui_nrsegs;
882 #ifndef HAVE_FILE_WRITEV
883 cio->cui_iocb = args->u.normal.via_iocb;
885 if ((iot == CIT_WRITE) &&
886 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
887 if (mutex_lock_interruptible(&lli->
889 GOTO(out, result = -ERESTARTSYS);
890 write_mutex_locked = 1;
891 } else if (iot == CIT_READ) {
892 down_read(&lli->lli_trunc_sem);
896 vio->u.sendfile.cui_actor = args->u.sendfile.via_actor;
897 vio->u.sendfile.cui_target = args->u.sendfile.via_target;
900 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
901 vio->u.splice.cui_flags = args->u.splice.via_flags;
904 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
907 result = cl_io_loop(env, io);
908 if (write_mutex_locked)
909 mutex_unlock(&lli->lli_write_mutex);
910 else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
911 up_read(&lli->lli_trunc_sem);
913 /* cl_io_rw_init() handled IO */
914 result = io->ci_result;
917 if (io->ci_nob > 0) {
919 *ppos = io->u.ci_wr.wr.crw_pos;
924 /* If any bytes have been read/written (result != 0), we just return
925 * a short read/write instead of restarting the io. */
926 if (result == 0 && io->ci_need_restart) {
927 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
928 iot == CIT_READ ? "read" : "write",
929 file->f_dentry->d_name.name, *ppos, count);
930 LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
934 if (iot == CIT_READ) {
936 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
937 LPROC_LL_READ_BYTES, result);
938 } else if (iot == CIT_WRITE) {
940 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
941 LPROC_LL_WRITE_BYTES, result);
942 fd->fd_write_failed = false;
943 } else if (result != -ERESTARTSYS) {
944 fd->fd_write_failed = true;
953 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array and compute the total byte count, truncating
 * *nr_segs at the first invalid/inaccessible segment (kernel semantics:
 * a partial transfer is allowed up to the first bad segment).
 */
955 static int ll_file_get_iov_count(const struct iovec *iov,
956 unsigned long *nr_segs, size_t *count)
961 for (seg = 0; seg < *nr_segs; seg++) {
962 const struct iovec *iv = &iov[seg];
965 * If any segment has a negative length, or the cumulative
966 * length ever wraps negative then return -EINVAL.
969 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
971 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
976 cnt -= iv->iov_len; /* This segment is no good */
983 #ifdef HAVE_FILE_READV
/* Vectored read entry point (pre-aio kernels): validate the iovec, then
 * run a normal CIT_READ through ll_file_io_generic(). */
984 static ssize_t ll_file_readv(struct file *file, const struct iovec *iov,
985 unsigned long nr_segs, loff_t *ppos)
988 struct vvp_io_args *args;
994 result = ll_file_get_iov_count(iov, &nr_segs, &count);
998 env = cl_env_get(&refcheck);
1000 RETURN(PTR_ERR(env));
1002 args = vvp_env_args(env, IO_NORMAL);
1003 args->u.normal.via_iov = (struct iovec *)iov;
1004 args->u.normal.via_nrsegs = nr_segs;
1006 result = ll_file_io_generic(env, args, file, CIT_READ, ppos, count);
1007 cl_env_put(env, &refcheck);
/* Single-buffer read: wrap (buf, count) in a one-element iovec and
 * delegate to ll_file_readv(). */
1011 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1015 struct iovec *local_iov;
1020 env = cl_env_get(&refcheck);
1022 RETURN(PTR_ERR(env));
1024 local_iov = &vvp_env_info(env)->vti_local_iov;
1025 local_iov->iov_base = (void __user *)buf;
1026 local_iov->iov_len = count;
1027 result = ll_file_readv(file, local_iov, 1, ppos);
1028 cl_env_put(env, &refcheck);
/* AIO read entry point: validate the iovec, attach the kiocb, and run a
 * normal CIT_READ at iocb->ki_pos through ll_file_io_generic(). */
1033 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1034 unsigned long nr_segs, loff_t pos)
1037 struct vvp_io_args *args;
1043 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1047 env = cl_env_get(&refcheck);
1049 RETURN(PTR_ERR(env));
1051 args = vvp_env_args(env, IO_NORMAL);
1052 args->u.normal.via_iov = (struct iovec *)iov;
1053 args->u.normal.via_nrsegs = nr_segs;
1054 args->u.normal.via_iocb = iocb;
1056 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1057 &iocb->ki_pos, count);
1058 cl_env_put(env, &refcheck);
/* Single-buffer read (aio kernels): build a synchronous kiocb plus a
 * one-element iovec and delegate to ll_file_aio_read(), then copy the
 * final position back to *ppos. */
1062 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1066 struct iovec *local_iov;
1067 struct kiocb *kiocb;
1072 env = cl_env_get(&refcheck);
1074 RETURN(PTR_ERR(env));
1076 local_iov = &vvp_env_info(env)->vti_local_iov;
1077 kiocb = &vvp_env_info(env)->vti_kiocb;
1078 local_iov->iov_base = (void __user *)buf;
1079 local_iov->iov_len = count;
1080 init_sync_kiocb(kiocb, file);
1081 kiocb->ki_pos = *ppos;
1082 kiocb->ki_left = count;
1084 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1085 *ppos = kiocb->ki_pos;
1087 cl_env_put(env, &refcheck);
1093 * Write to a file (through the page cache).
1095 #ifdef HAVE_FILE_WRITEV
/* Vectored write entry point (pre-aio kernels): validate the iovec, then
 * run a normal CIT_WRITE through ll_file_io_generic(). */
1096 static ssize_t ll_file_writev(struct file *file, const struct iovec *iov,
1097 unsigned long nr_segs, loff_t *ppos)
1100 struct vvp_io_args *args;
1106 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1110 env = cl_env_get(&refcheck);
1112 RETURN(PTR_ERR(env));
1114 args = vvp_env_args(env, IO_NORMAL);
1115 args->u.normal.via_iov = (struct iovec *)iov;
1116 args->u.normal.via_nrsegs = nr_segs;
1118 result = ll_file_io_generic(env, args, file, CIT_WRITE, ppos, count);
1119 cl_env_put(env, &refcheck);
/* Single-buffer write: wrap (buf, count) in a one-element iovec and
 * delegate to ll_file_writev(). */
1123 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1127 struct iovec *local_iov;
1132 env = cl_env_get(&refcheck);
1134 RETURN(PTR_ERR(env));
1136 local_iov = &vvp_env_info(env)->vti_local_iov;
1137 local_iov->iov_base = (void __user *)buf;
1138 local_iov->iov_len = count;
1140 result = ll_file_writev(file, local_iov, 1, ppos);
1141 cl_env_put(env, &refcheck);
1145 #else /* AIO stuff */
/* AIO write entry point: validate the iovec, attach the kiocb, and run a
 * normal CIT_WRITE at iocb->ki_pos through ll_file_io_generic(). */
1146 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1147 unsigned long nr_segs, loff_t pos)
1150 struct vvp_io_args *args;
1156 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1160 env = cl_env_get(&refcheck);
1162 RETURN(PTR_ERR(env));
1164 args = vvp_env_args(env, IO_NORMAL);
1165 args->u.normal.via_iov = (struct iovec *)iov;
1166 args->u.normal.via_nrsegs = nr_segs;
1167 args->u.normal.via_iocb = iocb;
1169 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1170 &iocb->ki_pos, count);
1171 cl_env_put(env, &refcheck);
/* Single-buffer write (aio kernels): build a synchronous kiocb plus a
 * one-element iovec and delegate to ll_file_aio_write(), then copy the
 * final position back to *ppos. */
1175 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1179 struct iovec *local_iov;
1180 struct kiocb *kiocb;
1185 env = cl_env_get(&refcheck);
1187 RETURN(PTR_ERR(env));
1189 local_iov = &vvp_env_info(env)->vti_local_iov;
1190 kiocb = &vvp_env_info(env)->vti_kiocb;
1191 local_iov->iov_base = (void __user *)buf;
1192 local_iov->iov_len = count;
1193 init_sync_kiocb(kiocb, file);
1194 kiocb->ki_pos = *ppos;
1195 kiocb->ki_left = count;
1197 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1198 *ppos = kiocb->ki_pos;
1200 cl_env_put(env, &refcheck);
1206 #ifdef HAVE_KERNEL_SENDFILE
1208 * Send file content (through pagecache) somewhere with helper
/* sendfile entry point: run a CIT_READ with an IO_SENDFILE subtype,
 * handing the read_actor and target to the cl_io machinery. */
1210 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1211 read_actor_t actor, void *target)
1214 struct vvp_io_args *args;
1219 env = cl_env_get(&refcheck);
1221 RETURN(PTR_ERR(env));
1223 args = vvp_env_args(env, IO_SENDFILE);
1224 args->u.sendfile.via_target = target;
1225 args->u.sendfile.via_actor = actor;
1227 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1228 cl_env_put(env, &refcheck);
1233 #ifdef HAVE_KERNEL_SPLICE_READ
1235 * Send file content (through pagecache) somewhere with helper
/* splice_read entry point: run a CIT_READ with an IO_SPLICE subtype,
 * handing the pipe and splice flags to the cl_io machinery. */
1237 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1238 struct pipe_inode_info *pipe, size_t count,
1242 struct vvp_io_args *args;
1247 env = cl_env_get(&refcheck);
1249 RETURN(PTR_ERR(env));
1251 args = vvp_env_args(env, IO_SPLICE);
1252 args->u.splice.via_pipe = pipe;
1253 args->u.splice.via_flags = flags;
1255 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1256 cl_env_put(env, &refcheck);
/*
 * Re-create lost OST objects for @inode: clone the current stripe MD,
 * build an obdo describing the object to recreate (OBD_FL_RECREATE_OBJS,
 * target OST index carried in o_nlink) and call obd_create() under the
 * inode size lock.
 */
1261 static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
1264 struct obd_export *exp = ll_i2dtexp(inode);
1265 struct obd_trans_info oti = { 0 };
1266 struct obdo *oa = NULL;
1269 struct lov_stripe_md *lsm = NULL, *lsm2;
1276 lsm = ccc_inode_lsm_get(inode);
1278 GOTO(out, rc = -ENOENT);
1280 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1281 (lsm->lsm_stripe_count));
1283 OBD_ALLOC_LARGE(lsm2, lsm_size);
1285 GOTO(out, rc = -ENOMEM);
/* o_nlink is overloaded here to carry the target OST index. */
1288 oa->o_nlink = ost_idx;
1289 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1290 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1291 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1292 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1293 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1294 memcpy(lsm2, lsm, lsm_size);
1295 ll_inode_size_lock(inode);
1296 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1297 ll_inode_size_unlock(inode);
1299 OBD_FREE_LARGE(lsm2, lsm_size);
1302 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_RECREATE_OBJ handler: copy a ll_recreate_obj request from
 * userspace, build an MDT0 ostid from lrc_id and re-create the object
 * on lrc_ost_idx.  Requires CAP_SYS_ADMIN.
 */
1307 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1309         struct ll_recreate_obj ucreat;
1313         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1316         if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1320         ostid_set_seq_mdt0(&oi);
1321         ostid_set_id(&oi, ucreat.lrc_id);
1322         RETURN(ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx));
/*
 * LL_IOC_RECREATE_FID handler: copy a FID from userspace, convert it to
 * an ostid and re-create the backing object.  The OST index is encoded
 * in bits 16..31 of the FID sequence.  Requires CAP_SYS_ADMIN.
 */
1325 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1332         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1335         if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1338         fid_to_ostid(&fid, &oi);
1339         ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1340         RETURN(ll_lov_recreate(inode, &oi, ost_idx));
/*
 * Apply a user-supplied striping EA (@lum, @lum_size) to @inode via an
 * IT_OPEN intent.  Fails early (visible in the CDEBUG branch) if a
 * stripe already exists; otherwise opens with the intent under the
 * inode size lock and releases the resulting open handle.
 * NOTE(review): error paths between the visible lines are elided in
 * this extract — confirm against the full source before relying on
 * the exact cleanup ordering.
 */
1343 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1344                              int flags, struct lov_user_md *lum, int lum_size)
1346         struct lov_stripe_md *lsm = NULL;
1347         struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1351         lsm = ccc_inode_lsm_get(inode);
1353                 ccc_inode_lsm_put(inode, lsm);
1354                 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1359         ll_inode_size_lock(inode);
1360         rc = ll_intent_file_open(file, lum, lum_size, &oit);
1363         rc = oit.d.lustre.it_status;
1365                 GOTO(out_req_free, rc);
1367         ll_release_openhandle(file->f_dentry, &oit);
1370         ll_inode_size_unlock(inode);
1371         ll_intent_release(&oit);
1372         ccc_inode_lsm_put(inode, lsm);
1375         ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA (striping metadata) of @filename (child of @inode)
 * from the MDS via md_getattr_name().  On success *lmmp/*lmm_size point
 * into the reply buffer and *request holds the request the caller must
 * eventually finish.  The EA arrives little-endian from the MDS and is
 * swabbed to host order on big-endian machines; only V1/V3 magics are
 * accepted (-EPROTO otherwise, -ENODATA when no EA is present).
 */
1379 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1380                              struct lov_mds_md **lmmp, int *lmm_size,
1381                              struct ptlrpc_request **request)
1383         struct ll_sb_info *sbi = ll_i2sbi(inode);
1384         struct mdt_body *body;
1385         struct lov_mds_md *lmm = NULL;
1386         struct ptlrpc_request *req = NULL;
1387         struct md_op_data *op_data;
1390         rc = ll_get_max_mdsize(sbi, &lmmsize);
1394         op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1395                                      strlen(filename), lmmsize,
1396                                      LUSTRE_OPC_ANY, NULL);
1397         if (IS_ERR(op_data))
1398                 RETURN(PTR_ERR(op_data));
1400         op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1401         rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1402         ll_finish_md_op_data(op_data);
1404                 CDEBUG(D_INFO, "md_getattr_name failed "
1405                        "on %s: rc %d\n", filename, rc);
1409         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1410         LASSERT(body != NULL); /* checked by mdc_getattr_name */
1412         lmmsize = body->eadatasize;
1414         if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1416                 GOTO(out, rc = -ENODATA);
1419         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1420         LASSERT(lmm != NULL);
1422         if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1423             (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1424                 GOTO(out, rc = -EPROTO);
1428          * This is coming from the MDS, so is probably in
1429          * little endian. We convert it to host endian before
1430          * passing it to userspace.
1432         if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1433                 /* if function called for directory - we should
1434                  * avoid swab not existent lsm objects */
1435                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1436                         lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1437                         if (S_ISREG(body->mode))
1438                                 lustre_swab_lov_user_md_objects(
1439                                  ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1440                                  ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
1441                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1442                         lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1443                         if (S_ISREG(body->mode))
1444                                 lustre_swab_lov_user_md_objects(
1445                                  ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1446                                  ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
1452         *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: copy a lov_user_md (with one trailing
 * lov_user_ost_data) from userspace and apply it via
 * ll_lov_setstripe_ea_info() with MDS_OPEN_HAS_OBJS|FMODE_WRITE.
 * Requires CAP_SYS_ADMIN; frees the temporary buffer on all paths.
 */
1457 static int ll_lov_setea(struct inode *inode, struct file *file,
1460         int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1461         struct lov_user_md *lump;
1462         int lum_size = sizeof(struct lov_user_md) +
1463                        sizeof(struct lov_user_ost_data);
1467         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1470         OBD_ALLOC_LARGE(lump, lum_size);
1474         if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1475                 OBD_FREE_LARGE(lump, lum_size);
1479         rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1481         OBD_FREE_LARGE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE handler.  Copies a V1-sized lov_user_md first
 * (smaller), upgrades the copy to V3 size if the magic says so, then
 * applies it.  On success it zeroes the user's lmm_stripe_count,
 * refreshes the layout generation and echoes the resulting stripe
 * info back through obd_iocontrol(LL_IOC_LOV_GETSTRIPE).
 */
1485 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1488         struct lov_user_md_v3 lumv3;
1489         struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1490         struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1491         struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1493         int flags = FMODE_WRITE;
1496         /* first try with v1 which is smaller than v3 */
1497         lum_size = sizeof(struct lov_user_md_v1);
1498         if (copy_from_user(lumv1, lumv1p, lum_size))
1501         if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1502                 lum_size = sizeof(struct lov_user_md_v3);
1503                 if (copy_from_user(&lumv3, lumv3p, lum_size))
1507         rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1509                 struct lov_stripe_md *lsm;
1512                 put_user(0, &lumv1p->lmm_stripe_count);
1514                 ll_layout_refresh(inode, &gen);
1515                 lsm = ccc_inode_lsm_get(inode);
1516                 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1517                                    0, lsm, (void *)arg);
1518                 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_LOV_GETSTRIPE handler: grab the inode's stripe md reference
 * and let the LOV layer serialize it into the user buffer at @arg.
 */
1523 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1525         struct lov_stripe_md *lsm;
1529         lsm = ccc_inode_lsm_get(inode);
1531                 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1533         ccc_inode_lsm_put(inode, lsm);
/*
 * Take a Lustre group (GID) lock on the whole file for this fd.
 * Refuses if locking is disabled (ll_file_nolock) or a group lock is
 * already held on the fd.  cl_get_grouplock() is called outside
 * lli_lock (it may block), so a second check under the lock detects a
 * racing winner and drops the extra reference.
 */
1537 int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1539         struct ll_inode_info *lli = ll_i2info(inode);
1540         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1541         struct ccc_grouplock grouplock;
1545         if (ll_file_nolock(file))
1546                 RETURN(-EOPNOTSUPP);
1548         spin_lock(&lli->lli_lock);
1549         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1550                 CWARN("group lock already existed with gid %lu\n",
1551                       fd->fd_grouplock.cg_gid);
1552                 spin_unlock(&lli->lli_lock);
1555         LASSERT(fd->fd_grouplock.cg_lock == NULL);
1556         spin_unlock(&lli->lli_lock);
1558         rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1559                               arg, (file->f_flags & O_NONBLOCK), &grouplock);
1563         spin_lock(&lli->lli_lock);
1564         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1565                 spin_unlock(&lli->lli_lock);
1566                 CERROR("another thread just won the race\n");
1567                 cl_put_grouplock(&grouplock);
1571         fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1572         fd->fd_grouplock = grouplock;
1573         spin_unlock(&lli->lli_lock);
1575         CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * Release the group lock held on this fd.  Validates under lli_lock
 * that a group lock exists and that its GID matches @arg, detaches the
 * lock state from the fd, then drops the CLIO reference outside the
 * spinlock (cl_put_grouplock may block).
 */
1579 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1581         struct ll_inode_info *lli = ll_i2info(inode);
1582         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1583         struct ccc_grouplock grouplock;
1586         spin_lock(&lli->lli_lock);
1587         if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1588                 spin_unlock(&lli->lli_lock);
1589                 CWARN("no group lock held\n");
1592         LASSERT(fd->fd_grouplock.cg_lock != NULL);
1594         if (fd->fd_grouplock.cg_gid != arg) {
1595                 CWARN("group lock %lu doesn't match current id %lu\n",
1596                       arg, fd->fd_grouplock.cg_gid);
1597                 spin_unlock(&lli->lli_lock);
1601         grouplock = fd->fd_grouplock;
1602         memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1603         fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1604         spin_unlock(&lli->lli_lock);
1606         cl_put_grouplock(&grouplock);
1607         CDEBUG(D_INFO, "group lock %lu released\n", arg);
/**
 * Close the MDS open handle carried by a lookup intent.
 *
 * No-ops for the filesystem root and for intents without
 * DISP_OPEN_OPEN.  Otherwise allocates an obd_client_handle, fills it
 * from the intent and closes it through the MD export; a
 * DISP_ENQ_OPEN_REF disposition additionally releases the pinned
 * open request.
 *
 * \param dentry [in]     dentry which contains the inode
 * \param it     [in,out] intent which contains open info and result
 * \retval <0 failure (e.g. -ENOMEM); 0 on success / nothing to do
 */
1620 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1622         struct inode *inode = dentry->d_inode;
1623         struct obd_client_handle *och;
1629         /* Root ? Do nothing. */
1630         if (dentry->d_inode->i_sb->s_root == dentry)
1633         /* No open handle to close? Move away */
1634         if (!it_disposition(it, DISP_OPEN_OPEN))
1637         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1639         OBD_ALLOC(och, sizeof(*och));
1641                 GOTO(out, rc = -ENOMEM);
1643         ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
1644                     ll_i2info(inode), it, och);
1646         rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1649         /* this one is in place of ll_file_open */
1650         if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1651                 ptlrpc_req_finished(it->d.lustre.it_data);
1652                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/**
 * Get size for inode for which FIEMAP mapping is requested.
 * Make the FIEMAP get_info call and returns the result.
 *
 * Validates fm_flags against LUSTRE_FIEMAP_FLAGS_COMPAT (writing the
 * unsupported bits back for the caller), honours FIEMAP_FLAG_SYNC by
 * flushing dirty pages first, rejects DEVICE_ORDER-less requests on
 * striped (>1 stripe) files, short-circuits zero-size files, and
 * otherwise forwards the request to the OSC layer via
 * obd_get_info(KEY_FIEMAP).
 */
1661 int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1664         struct obd_export *exp = ll_i2dtexp(inode);
1665         struct lov_stripe_md *lsm = NULL;
1666         struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1667         int vallen = num_bytes;
1671         /* Checks for fiemap flags */
1672         if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1673                 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1677         /* Check for FIEMAP_FLAG_SYNC */
1678         if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1679                 rc = filemap_fdatawrite(inode->i_mapping);
1684         lsm = ccc_inode_lsm_get(inode);
1688         /* If the stripe_count > 1 and the application does not understand
1689          * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1691         if (lsm->lsm_stripe_count > 1 &&
1692             !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1693                 GOTO(out, rc = -EOPNOTSUPP);
1695         fm_key.oa.o_oi = lsm->lsm_oi;
1696         fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1698         obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1699         obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1700         /* If filesize is 0, then there would be no objects for mapping */
1701         if (fm_key.oa.o_size == 0) {
1702                 fiemap->fm_mapped_extents = 0;
1706         memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1708         rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1711                 CERROR("obd_get_info failed: rc = %d\n", rc);
1714         ccc_inode_lsm_put(inode, lsm);
/*
 * OBD_IOC_FID2PATH handler: translate a FID to a path via the MDC.
 * Permitted for CAP_DAC_READ_SEARCH holders or when the mount enables
 * LL_SBI_USER_FID2PATH.  Copies the fixed-size request in first to
 * learn gf_pathlen, allocates an output buffer of that size, runs the
 * iocontrol and copies the result back to userspace.
 */
1718 int ll_fid2path(struct inode *inode, void *arg)
1720         struct obd_export *exp = ll_i2mdexp(inode);
1721         struct getinfo_fid2path *gfout, *gfin;
1725         if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1726             !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1729         /* Need to get the buflen */
1730         OBD_ALLOC_PTR(gfin);
1733         if (copy_from_user(gfin, arg, sizeof(*gfin))) {
1738         outsize = sizeof(*gfout) + gfin->gf_pathlen;
1739         OBD_ALLOC(gfout, outsize);
1740         if (gfout == NULL) {
1744         memcpy(gfout, gfin, sizeof(*gfout));
1747         /* Call mdc_iocontrol */
1748         rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1752         if (copy_to_user(arg, gfout, outsize))
1756         OBD_FREE(gfout, outsize);
/*
 * FSFILT_IOC_FIEMAP handler.  Reads fm_extent_count from the user
 * request to size a kernel fiemap buffer, copies the header (and, when
 * extents are requested, the first extent — used for continuation
 * offset/device), runs ll_do_fiemap() and copies the header plus the
 * mapped extents back to userspace.
 * NOTE(review): the extent_count * sizeof(extent) sizing is done on a
 * user-supplied count; overflow checking is not visible in this
 * extract — confirm in the full source.
 */
1760 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1762         struct ll_user_fiemap *fiemap_s;
1763         size_t num_bytes, ret_bytes;
1764         unsigned int extent_count;
1767         /* Get the extent count so we can calculate the size of
1768          * required fiemap buffer */
1769         if (get_user(extent_count,
1770                      &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1772         num_bytes = sizeof(*fiemap_s) + (extent_count *
1773                     sizeof(struct ll_fiemap_extent));
1775         OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1776         if (fiemap_s == NULL)
1779         /* get the fiemap value */
1780         if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1782                 GOTO(error, rc = -EFAULT);
1784         /* If fm_extent_count is non-zero, read the first extent since
1785          * it is used to calculate end_offset and device from previous
1788         if (copy_from_user(&fiemap_s->fm_extents[0],
1789             (char __user *)arg + sizeof(*fiemap_s),
1790             sizeof(struct ll_fiemap_extent)))
1791                 GOTO(error, rc = -EFAULT);
1794         rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1798         ret_bytes = sizeof(struct ll_user_fiemap);
1800         if (extent_count != 0)
1801                 ret_bytes += (fiemap_s->fm_mapped_extents *
1802                                  sizeof(struct ll_fiemap_extent));
1804         if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1808         OBD_FREE_LARGE(fiemap_s, num_bytes);
/**
 * Read the data_version for inode.
 *
 * This value is computed using stripe object version on OST.
 * Version is computed using server side locking.
 *
 * An inode with no stripe md is treated as version 0 (see the
 * "No object for inode" branch).  The version is read through
 * ll_lsm_getattr() and only trusted when OBD_MD_FLDATAVERSION is set
 * in the returned obdo.
 *
 * @param extent_lock  Take extent lock. Not needed if a process is already
 *                     holding the OST object group locks.
 */
1821 int ll_data_version(struct inode *inode, __u64 *data_version,
1824         struct lov_stripe_md *lsm = NULL;
1825         struct ll_sb_info *sbi = ll_i2sbi(inode);
1826         struct obdo *obdo = NULL;
1830         /* If no stripe, we consider version is 0. */
1831         lsm = ccc_inode_lsm_get(inode);
1834                 CDEBUG(D_INODE, "No object for inode\n");
1838         OBD_ALLOC_PTR(obdo);
1840                 ccc_inode_lsm_put(inode, lsm);
1844         rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
1846                 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1849                         *data_version = obdo->o_data_version;
1853         ccc_inode_lsm_put(inode, lsm);
/*
 * Scratch state for ll_swap_layouts(): the two inodes being swapped,
 * saved mtime/atime (ia1/ia2) to restore after the swap, and the
 * per-inode data-version check requests.  Heap-allocated because the
 * swap path is deep and the struct is large-ish for the stack.
 */
1858 struct ll_swap_stack {
1859         struct iattr ia1, ia2;
1861         struct inode *inode1, *inode2;
1862         bool check_dv1, check_dv2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS implementation: atomically exchange the
 * layouts of file1 and file2 on the MDT.
 *
 * Steps visible here:
 *  - sanity checks: both regular files, writable, same superblock;
 *  - order the two inodes by FID (swap locally) so lock ordering is
 *    deterministic; identical FIDs mean "same file" and succeed early;
 *  - with a non-zero gid, take group locks on both files to flush
 *    dirty cache (second failure unwinds the first);
 *  - optionally snapshot mtime/atime for later restore and verify the
 *    user-supplied data versions (-EAGAIN if they changed);
 *  - send the swap through obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS) with
 *    a mdc_swap_layouts piggybacked on md_op_data->op_data;
 *  - drop group locks, then restore times as requested (note ia2 is
 *    applied to file1 and ia1 to file2 — attributes follow the data).
 */
1865 static int ll_swap_layouts(struct file *file1, struct file *file2,
1866                            struct lustre_swap_layouts *lsl)
1868         struct mdc_swap_layouts msl;
1869         struct md_op_data *op_data;
1872         struct ll_swap_stack *llss = NULL;
1875         OBD_ALLOC_PTR(llss);
1879         llss->inode1 = file1->f_dentry->d_inode;
1880         llss->inode2 = file2->f_dentry->d_inode;
1882         if (!S_ISREG(llss->inode2->i_mode))
1883                 GOTO(free, rc = -EINVAL);
1885         if (ll_permission(llss->inode1, MAY_WRITE, NULL) ||
1886             ll_permission(llss->inode2, MAY_WRITE, NULL))
1887                 GOTO(free, rc = -EPERM);
1889         if (llss->inode2->i_sb != llss->inode1->i_sb)
1890                 GOTO(free, rc = -EXDEV);
1892         /* we use 2 bool because it is easier to swap than 2 bits */
1893         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1894                 llss->check_dv1 = true;
1896         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1897                 llss->check_dv2 = true;
1899         /* we cannot use lsl->sl_dvX directly because we may swap them */
1900         llss->dv1 = lsl->sl_dv1;
1901         llss->dv2 = lsl->sl_dv2;
1903         rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
1904         if (rc == 0) /* same file, done! */
1907         if (rc < 0) { /* sequentialize it */
1908                 swap(llss->inode1, llss->inode2);
1910                 swap(llss->dv1, llss->dv2);
1911                 swap(llss->check_dv1, llss->check_dv2);
1915         if (gid != 0) { /* application asks to flush dirty cache */
1916                 rc = ll_get_grouplock(llss->inode1, file1, gid);
1920                 rc = ll_get_grouplock(llss->inode2, file2, gid);
1922                         ll_put_grouplock(llss->inode1, file1, gid);
1927         /* to be able to restore mtime and atime after swap
1928          * we need to first save them */
1930             (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
1931                 llss->ia1.ia_mtime = llss->inode1->i_mtime;
1932                 llss->ia1.ia_atime = llss->inode1->i_atime;
1933                 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
1934                 llss->ia2.ia_mtime = llss->inode2->i_mtime;
1935                 llss->ia2.ia_atime = llss->inode2->i_atime;
1936                 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
1939         /* ultimate check, before swaping the layouts we check if
1940          * dataversion has changed (if requested) */
1941         if (llss->check_dv1) {
1942                 rc = ll_data_version(llss->inode1, &dv, 0);
1945                 if (dv != llss->dv1)
1946                         GOTO(putgl, rc = -EAGAIN);
1949         if (llss->check_dv2) {
1950                 rc = ll_data_version(llss->inode2, &dv, 0);
1953                 if (dv != llss->dv2)
1954                         GOTO(putgl, rc = -EAGAIN);
1957         /* struct md_op_data is used to send the swap args to the mdt
1958          * only flags is missing, so we use struct mdc_swap_layouts
1959          * through the md_op_data->op_data */
1960         /* flags from user space have to be converted before they are send to
1961          * server, no flag is sent today, they are only used on the client */
1964         op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
1965                                      0, LUSTRE_OPC_ANY, &msl);
1966         if (op_data != NULL) {
1967                 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS,
1968                                    ll_i2mdexp(llss->inode1),
1969                                    sizeof(*op_data), op_data, NULL);
1970                 ll_finish_md_op_data(op_data);
1975                 ll_put_grouplock(llss->inode2, file2, gid);
1976                 ll_put_grouplock(llss->inode1, file1, gid);
1979         /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
1983         /* clear useless flags */
1984         if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
1985                 llss->ia1.ia_valid &= ~ATTR_MTIME;
1986                 llss->ia2.ia_valid &= ~ATTR_MTIME;
1989         if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
1990                 llss->ia1.ia_valid &= ~ATTR_ATIME;
1991                 llss->ia2.ia_valid &= ~ATTR_ATIME;
1994         /* update time if requested */
1996         if (llss->ia2.ia_valid != 0) {
1997                 mutex_lock(&llss->inode1->i_mutex);
1998                 rc = ll_setattr(file1->f_dentry, &llss->ia2);
1999                 mutex_unlock(&llss->inode1->i_mutex);
2002         if (llss->ia1.ia_valid != 0) {
2005                 mutex_lock(&llss->inode2->i_mutex);
2006                 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2007                 mutex_unlock(&llss->inode2->i_mutex);
/*
 * Main ioctl dispatcher for regular Lustre files.
 *
 * Tty ioctls ('T'/'t' type) are rejected up front.  Per-fd flag
 * get/set/clear, striping (setstripe/setea/getstripe), layout swap,
 * object recreation, fiemap, group locks, statfs, fid2path,
 * data-version, MDT index and the HSM state/action ioctls are handled
 * here; anything unknown falls through to the registered
 * ll_iocontrol_call() handlers and finally to the data export.
 * NOTE(review): several case bodies in this extract are missing
 * lines (allocations, RETURNs); comments below describe only what is
 * visible.
 */
2019 long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2021         struct inode *inode = file->f_dentry->d_inode;
2022         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2026         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2027                inode->i_generation, inode, cmd);
2028         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2030         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2031         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2035         case LL_IOC_GETFLAGS:
2036                 /* Get the current value of the file flags */
2037                 return put_user(fd->fd_flags, (int *)arg);
2038         case LL_IOC_SETFLAGS:
2039         case LL_IOC_CLRFLAGS:
2040                 /* Set or clear specific file flags */
2041                 /* XXX This probably needs checks to ensure the flags are
2042                  *     not abused, and to handle any flag side effects.
2044                 if (get_user(flags, (int *) arg))
2047                 if (cmd == LL_IOC_SETFLAGS) {
2048                         if ((flags & LL_FILE_IGNORE_LOCK) &&
2049                             !(file->f_flags & O_DIRECT)) {
2050                                 CERROR("%s: unable to disable locking on "
2051                                        "non-O_DIRECT file\n", current->comm);
2055                         fd->fd_flags |= flags;
2057                         fd->fd_flags &= ~flags;
2060         case LL_IOC_LOV_SETSTRIPE:
2061                 RETURN(ll_lov_setstripe(inode, file, arg));
2062         case LL_IOC_LOV_SETEA:
2063                 RETURN(ll_lov_setea(inode, file, arg));
2064         case LL_IOC_LOV_SWAP_LAYOUTS: {
2066                 struct lustre_swap_layouts lsl;
2068                 if (cfs_copy_from_user(&lsl, (char *)arg,
2069                                        sizeof(struct lustre_swap_layouts)))
2072                 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2075                 file2 = fget(lsl.sl_fd);
2080                 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2081                         rc = ll_swap_layouts(file, file2, &lsl);
2085         case LL_IOC_LOV_GETSTRIPE:
2086                 RETURN(ll_lov_getstripe(inode, arg));
2087         case LL_IOC_RECREATE_OBJ:
2088                 RETURN(ll_lov_recreate_obj(inode, arg));
2089         case LL_IOC_RECREATE_FID:
2090                 RETURN(ll_lov_recreate_fid(inode, arg));
2091         case FSFILT_IOC_FIEMAP:
2092                 RETURN(ll_ioctl_fiemap(inode, arg));
2093         case FSFILT_IOC_GETFLAGS:
2094         case FSFILT_IOC_SETFLAGS:
2095                 RETURN(ll_iocontrol(inode, file, cmd, arg));
2096         case FSFILT_IOC_GETVERSION_OLD:
2097         case FSFILT_IOC_GETVERSION:
2098                 RETURN(put_user(inode->i_generation, (int *)arg));
2099         case LL_IOC_GROUP_LOCK:
2100                 RETURN(ll_get_grouplock(inode, file, arg));
2101         case LL_IOC_GROUP_UNLOCK:
2102                 RETURN(ll_put_grouplock(inode, file, arg));
2103         case IOC_OBD_STATFS:
2104                 RETURN(ll_obd_statfs(inode, (void *)arg));
2106         /* We need to special case any other ioctls we want to handle,
2107          * to send them to the MDS/OST as appropriate and to properly
2108          * network encode the arg field.
2109         case FSFILT_IOC_SETVERSION_OLD:
2110         case FSFILT_IOC_SETVERSION:
2112         case LL_IOC_FLUSHCTX:
2113                 RETURN(ll_flush_ctx(inode));
2114         case LL_IOC_PATH2FID: {
2115                 if (copy_to_user((void *)arg, ll_inode2fid(inode),
2116                                  sizeof(struct lu_fid)))
2121         case OBD_IOC_FID2PATH:
2122                 RETURN(ll_fid2path(inode, (void *)arg));
2123         case LL_IOC_DATA_VERSION: {
2124                 struct ioc_data_version idv;
2127                 if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
2130                 rc = ll_data_version(inode, &idv.idv_version,
2131                                 !(idv.idv_flags & LL_DV_NOFLUSH));
2133                 if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2139         case LL_IOC_GET_MDTIDX: {
2142                 mdtidx = ll_get_mdt_idx(inode);
2146                 if (put_user((int)mdtidx, (int*)arg))
2151         case OBD_IOC_GETDTNAME:
2152         case OBD_IOC_GETMDNAME:
2153                 RETURN(ll_get_obd_name(inode, cmd, arg));
2154         case LL_IOC_HSM_STATE_GET: {
2155                 struct md_op_data *op_data;
2156                 struct hsm_user_state *hus;
2163                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2164                                              LUSTRE_OPC_ANY, hus);
2165                 if (op_data == NULL) {
2170                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2173                 if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2176                 ll_finish_md_op_data(op_data);
2180         case LL_IOC_HSM_STATE_SET: {
2181                 struct md_op_data *op_data;
2182                 struct hsm_state_set *hss;
2188                 if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2193                 /* Non-root users are forbidden to set or clear flags which are
2194                  * NOT defined in HSM_USER_MASK. */
2195                 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK)
2196                     && !cfs_capable(CFS_CAP_SYS_ADMIN)) {
2201                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2202                                              LUSTRE_OPC_ANY, hss);
2203                 if (op_data == NULL) {
2208                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2211                 ll_finish_md_op_data(op_data);
2216         case LL_IOC_HSM_ACTION: {
2217                 struct md_op_data *op_data;
2218                 struct hsm_current_action *hca;
2225                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2226                                              LUSTRE_OPC_ANY, hca);
2227                 if (op_data == NULL) {
2232                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2235                 if (cfs_copy_to_user((char *)arg, hca, sizeof(*hca)))
2238                 ll_finish_md_op_data(op_data);
2246                     ll_iocontrol_call(inode, file, cmd, arg, &err))
2249                 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2255 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Local copy of the kernel's llseek_execute() for kernels lacking
 * generic_file_llseek_size(): validate the new offset (negative only
 * with FMODE_UNSIGNED_OFFSET, never past maxsize) and commit it to
 * f_pos, resetting f_version on movement.
 */
2256 static inline loff_t
2257 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2259         if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2261         if (offset > maxsize)
2264         if (offset != file->f_pos) {
2265                 file->f_pos = offset;
2266                 file->f_version = 0;
/*
 * Backported generic_file_llseek_size(): seek within @file honouring a
 * filesystem-specific @maxsize and @eof instead of i_size.  SEEK_CUR
 * with offset 0 is answered without writing f_pos back (avoids racing
 * concurrent read/write/lseek); other SEEK_CUR updates run under
 * i_mutex.  SEEK_DATA/SEEK_HOLE treat the whole file as data with a
 * virtual hole at EOF.
 */
2272 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2273                 loff_t maxsize, loff_t eof)
2275         struct inode *inode = file->f_dentry->d_inode;
2283                  * Here we special-case the lseek(fd, 0, SEEK_CUR)
2284                  * position-querying operation.  Avoid rewriting the "same"
2285                  * f_pos value back to the file because a concurrent read(),
2286                  * write() or lseek() might have altered it
2291                  * f_lock protects against read/modify/write race with other
2292                  * SEEK_CURs. Note that parallel writes and reads behave
2295                 mutex_lock(&inode->i_mutex);
2296                 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2297                 mutex_unlock(&inode->i_mutex);
2301                  * In the generic case the entire file is data, so as long as
2302                  * offset isn't at the end of the file then the offset is data.
2309                  * There is a virtual hole at the end of the file, so as long as
2310                  * offset isn't i_size or larger, return i_size.
2318         return llseek_execute(file, offset, maxsize);
/*
 * llseek entry point.  For SEEK_END/SEEK_HOLE/SEEK_DATA the file size
 * must be current, so glimpse it from the OSTs first; then delegate to
 * ll_generic_file_llseek_size() bounded by ll_file_maxbytes().
 */
2322 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2324         struct inode *inode = file->f_dentry->d_inode;
2325         loff_t retval, eof = 0;
2328         retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2329                            (origin == SEEK_CUR) ? file->f_pos : 0);
2330         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
2331                inode->i_ino, inode->i_generation, inode, retval, retval,
2333         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2335         if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2336                 retval = ll_glimpse_size(inode);
2339                 eof = i_size_read(inode);
2342         retval = ll_generic_file_llseek_size(file, offset, origin,
2343                                           ll_file_maxbytes(inode), eof);
2347 int ll_flush(struct file *file, fl_owner_t id)
2349 struct inode *inode = file->f_dentry->d_inode;
2350 struct ll_inode_info *lli = ll_i2info(inode);
2351 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2354 LASSERT(!S_ISDIR(inode->i_mode));
2356 /* catch async errors that were recorded back when async writeback
2357 * failed for pages in this mapping. */
2358 rc = lli->lli_async_rc;
2359 lli->lli_async_rc = 0;
2360 err = lov_read_and_clear_async_rc(lli->lli_clob);
2364 /* The application has been told write failure already.
2365 * Do not report failure again. */
2366 if (fd->fd_write_failed)
2368 return rc ? -EIO : 0;
/**
 * Called to make sure a portion of file has been written out.
 * if @local_only is not true, it will send OST_SYNC RPCs to ost.
 *
 * Builds a CIT_FSYNC CLIO request covering [start, end] with the given
 * fsync mode (validated against the four CL_FSYNC_* modes) and runs the
 * cl_io loop in a nested environment; an OSS write capa is attached
 * for capability-enabled setups.
 *
 * Return how many pages have been written (fi_nr_written) on success,
 * or the io/loop error.
 */
2377 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2378                        enum cl_fsync_mode mode, int ignore_layout)
2380         struct cl_env_nest nest;
2383         struct obd_capa *capa = NULL;
2384         struct cl_fsync_io *fio;
2388         if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2389             mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2392         env = cl_env_nested_get(&nest);
2394                 RETURN(PTR_ERR(env));
2396         capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2398         io = ccc_env_thread_io(env);
2399         io->ci_obj = cl_i2info(inode)->lli_clob;
2400         io->ci_ignore_layout = ignore_layout;
2402         /* initialize parameters for sync */
2403         fio = &io->u.ci_fsync;
2404         fio->fi_capa = capa;
2405         fio->fi_start = start;
2407         fio->fi_fid = ll_inode2fid(inode);
2408         fio->fi_mode = mode;
2409         fio->fi_nr_written = 0;
2411         if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2412                 result = cl_io_loop(env, io);
2414                 result = io->ci_result;
2416                 result = fio->fi_nr_written;
2417         cl_io_fini(env, io);
2418         cl_env_nested_put(&nest, env);
/*
 * fsync(2)/fdatasync(2) handler; the three #ifdef'd prototypes match
 * the kernel API generations (4-arg range fsync, 2-arg, and the old
 * 3-arg form taking an explicit dentry).
 *
 * When dentry is provided (the 'else' case), *file->f_dentry may be
 * null and dentry must be used directly rather than pulled from
 * *file->f_dentry as is done otherwise.
 *
 * Flushes/awaits page cache first, folds in recorded async writeback
 * errors, syncs metadata through md_sync(), and for datasync on
 * regular files pushes data via cl_sync_file_range(), updating the
 * fd's fd_write_failed state so ll_flush() won't double-report.
 */
2431 #ifdef HAVE_FILE_FSYNC_4ARGS
2432 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2434         struct dentry *dentry = file->f_dentry;
2435 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2436 int ll_fsync(struct file *file, int datasync)
2438         struct dentry *dentry = file->f_dentry;
2440 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2443         struct inode *inode = dentry->d_inode;
2444         struct ll_inode_info *lli = ll_i2info(inode);
2445         struct ptlrpc_request *req;
2446         struct obd_capa *oc;
2450         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2451                inode->i_generation, inode);
2452         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2454 #ifdef HAVE_FILE_FSYNC_4ARGS
2455         rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2456         mutex_lock(&inode->i_mutex);
2458         /* fsync's caller has already called _fdata{sync,write}, we want
2459          * that IO to finish before calling the osc and mdc sync methods */
2460         rc = filemap_fdatawait(inode->i_mapping);
2463         /* catch async errors that were recorded back when async writeback
2464          * failed for pages in this mapping. */
2465         if (!S_ISDIR(inode->i_mode)) {
2466                 err = lli->lli_async_rc;
2467                 lli->lli_async_rc = 0;
2470                 err = lov_read_and_clear_async_rc(lli->lli_clob);
2475         oc = ll_mdscapa_get(inode);
2476         err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2482                 ptlrpc_req_finished(req);
2484         if (datasync && S_ISREG(inode->i_mode)) {
2485                 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2487                 err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
2489                 if (rc == 0 && err < 0)
2492                         fd->fd_write_failed = true;
2494                         fd->fd_write_failed = false;
2497 #ifdef HAVE_FILE_FSYNC_4ARGS
2498         mutex_unlock(&inode->i_mutex);
/*
 * flock(2)/fcntl(2) locking handler.  Translates the kernel
 * file_lock into an LDLM_FLOCK enqueue against the MDS:
 *  - FL_FLOCK locks cover the whole file and are owned by the struct
 *    file; FL_POSIX locks carry the caller's byte range and owner;
 *  - F_RDLCK -> LCK_PR, F_WRLCK -> LCK_PW; F_UNLCK is expressed as an
 *    LCK_NL enqueue (see the long comment) rather than a cancel;
 *  - non-blocking commands map to LDLM_FL_BLOCK_NOWAIT, and F_GETLK*
 *    to LDLM_FL_TEST_LOCK;
 *  - lockd's custom lm_compare_owner forces pid-as-owner (workaround
 *    described inline).
 * After a successful enqueue the local lock tables are updated with
 * flock_lock_file_wait()/posix_lock_file_wait(); if that fails, the
 * server lock is rolled back with an LCK_NL enqueue.
 */
2503 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2505         struct inode *inode = file->f_dentry->d_inode;
2506         struct ll_sb_info *sbi = ll_i2sbi(inode);
2507         struct ldlm_enqueue_info einfo = { .ei_type = LDLM_FLOCK,
2508                                            .ei_cb_cp =ldlm_flock_completion_ast,
2509                                            .ei_cbdata = file_lock };
2510         struct md_op_data *op_data;
2511         struct lustre_handle lockh = {0};
2512         ldlm_policy_data_t flock = {{0}};
2518         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2519                inode->i_ino, file_lock);
2521         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2523         if (file_lock->fl_flags & FL_FLOCK) {
2524                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2525                 /* flocks are whole-file locks */
2526                 flock.l_flock.end = OFFSET_MAX;
2527                 /* For flocks owner is determined by the local file desctiptor*/
2528                 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2529         } else if (file_lock->fl_flags & FL_POSIX) {
2530                 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2531                 flock.l_flock.start = file_lock->fl_start;
2532                 flock.l_flock.end = file_lock->fl_end;
2536         flock.l_flock.pid = file_lock->fl_pid;
2538         /* Somewhat ugly workaround for svc lockd.
2539          * lockd installs custom fl_lmops->lm_compare_owner that checks
2540          * for the fl_owner to be the same (which it always is on local node
2541          * I guess between lockd processes) and then compares pid.
2542          * As such we assign pid to the owner field to make it all work,
2543          * conflict with normal locks is unlikely since pid space and
2544          * pointer space for current->files are not intersecting */
2545         if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2546                 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
2548         switch (file_lock->fl_type) {
2550                 einfo.ei_mode = LCK_PR;
2553                 /* An unlock request may or may not have any relation to
2554                  * existing locks so we may not be able to pass a lock handle
2555                  * via a normal ldlm_lock_cancel() request. The request may even
2556                  * unlock a byte range in the middle of an existing lock. In
2557                  * order to process an unlock request we need all of the same
2558                  * information that is given with a normal read or write record
2559                  * lock request. To avoid creating another ldlm unlock (cancel)
2560                  * message we'll treat a LCK_NL flock request as an unlock. */
2561                 einfo.ei_mode = LCK_NL;
2564                 einfo.ei_mode = LCK_PW;
2567                 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2568                        file_lock->fl_type);
2583                 flags = LDLM_FL_BLOCK_NOWAIT;
2589                 flags = LDLM_FL_TEST_LOCK;
2590                 /* Save the old mode so that if the mode in the lock changes we
2591                  * can decrement the appropriate reader or writer refcount. */
2592                 file_lock->fl_type = einfo.ei_mode;
2595                 CERROR("unknown fcntl lock command: %d\n", cmd);
2599         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2600                                      LUSTRE_OPC_ANY, NULL);
2601         if (IS_ERR(op_data))
2602                 RETURN(PTR_ERR(op_data));
2604         CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2605                "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2606                flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2608         rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2609                         op_data, &lockh, &flock, 0, NULL /* req */, flags);
2611         if ((file_lock->fl_flags & FL_FLOCK) &&
2612             (rc == 0 || file_lock->fl_type == F_UNLCK))
2613                 rc2  = flock_lock_file_wait(file, file_lock);
2614         if ((file_lock->fl_flags & FL_POSIX) &&
2615             (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2616             !(flags & LDLM_FL_TEST_LOCK))
2617                 rc2  = posix_lock_file_wait(file, file_lock);
2619         if (rc2 && file_lock->fl_type != F_UNLCK) {
2620                 einfo.ei_mode = LCK_NL;
2621                 md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2622                         op_data, &lockh, &flock, 0, NULL /* req */, flags);
2626         ll_finish_md_op_data(op_data);
2631 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
/**
 * test if some locks matching bits and l_req_mode are acquired
 * - bits can be in different locks
 * - if found clear the common lock bits in *bits
 * - the bits not found, are kept in *bits
 *
 * Iterates each inodebit (0..MDS_INODELOCK_MAXSHIFT) and matches it
 * with LDLM_FL_TEST_LOCK (no reference taken); a LCK_MINMODE request
 * matches any of CR/CW/PR/PW.
 *
 * \param bits       [IN] searched lock bits [IN]
 * \param l_req_mode [IN] searched lock mode
 * \retval boolean, true iff all bits are found
 */
2648 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
2650         struct lustre_handle lockh;
2651         ldlm_policy_data_t policy;
2652         ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2653                            (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2662         fid = &ll_i2info(inode)->lli_fid;
2663         CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2664                ldlm_lockname[mode]);
2666         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2667         for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2668                 policy.l_inodebits.bits = *bits & (1 << i);
2669                 if (policy.l_inodebits.bits == 0)
2672                 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2673                                   &policy, mode, &lockh)) {
2674                         struct ldlm_lock *lock;
2676                         lock = ldlm_handle2lock(&lockh);
2679                                         ~(lock->l_policy_data.l_inodebits.bits);
2680                                 LDLM_LOCK_PUT(lock);
2682                                 *bits &= ~policy.l_inodebits.bits;
/* Try to match (and reference) a granted MDS inodebits lock covering @bits.
 * On success the handle is returned in @lockh; the return value is presumably
 * the matched mode, 0 if no lock matched — tail of the function is elided. */
2689 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2690 struct lustre_handle *lockh, __u64 flags)
2692 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2697 fid = &ll_i2info(inode)->lli_fid;
2698 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
/* Any of the four regular modes is acceptable. */
2700 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
2701 fid, LDLM_IBITS, &policy,
2702 LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
/* Post-process the rc of a revalidate RPC: -ENOENT on a non-regular,
 * non-directory inode is tolerated (object already unlinked), other
 * errors are logged. */
2706 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2708 /* Already unlinked. Just update nlink and return success */
2709 if (rc == -ENOENT) {
2711 /* This path cannot be hit for regular files unless in
2712  * case of obscure races, so no need to validate
2714 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2716 } else if (rc != 0) {
2717 CERROR("%s: revalidate FID "DFID" error: rc = %d\n",
2718 ll_get_fsname(inode->i_sb, NULL, 0),
2719 PFID(ll_inode2fid(inode)), rc);
/* Revalidate the MDS attributes of @dentry's inode, covered by the ibits in
 * @ibits.  Two strategies: an intent getattr-by-FID when the server supports
 * OBD_CONNECT_ATTRFID, otherwise a plain md_getattr if no matching MDS lock
 * is cached locally. */
2725 int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2728 struct inode *inode = dentry->d_inode;
2729 struct ptlrpc_request *req = NULL;
2730 struct obd_export *exp;
2734 LASSERT(inode != NULL);
2736 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2737 inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2739 exp = ll_i2mdexp(inode);
2741 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
2742  * But under CMD case, it caused some lock issues, should be fixed
2743  * with new CMD ibits lock. See bug 12718 */
2744 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
2745 struct lookup_intent oit = { .it_op = IT_GETATTR };
2746 struct md_op_data *op_data;
/* Only the LOOKUP bit requested: a lighter IT_LOOKUP intent suffices. */
2748 if (ibits == MDS_INODELOCK_LOOKUP)
2749 oit.it_op = IT_LOOKUP;
2751 /* Call getattr by fid, so do not provide name at all. */
2752 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2753 dentry->d_inode, NULL, 0, 0,
2754 LUSTRE_OPC_ANY, NULL);
2755 if (IS_ERR(op_data))
2756 RETURN(PTR_ERR(op_data));
/* M_CHECK_STALE flags this intent as a staleness check. */
2758 oit.it_create_mode |= M_CHECK_STALE;
2759 rc = md_intent_lock(exp, op_data, NULL, 0,
2760 /* we are not interested in name
2763 ll_md_blocking_ast, 0);
2764 ll_finish_md_op_data(op_data);
2765 oit.it_create_mode &= ~M_CHECK_STALE;
2767 rc = ll_inode_revalidate_fini(inode, rc);
2771 rc = ll_revalidate_it_finish(req, &oit, dentry);
2773 ll_intent_release(&oit);
2777 /* Unlinked? Unhash dentry, so it is not picked up later by
2778    do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2779    here to preserve get_cwd functionality on 2.6.
2781 if (!dentry->d_inode->i_nlink)
2782 d_lustre_invalidate(dentry, 0);
2784 ll_lookup_finish_locks(&oit, dentry);
/* No ATTRFID support: only issue a getattr RPC when no cached MDS lock
 * already covers the requested ibits. */
2785 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
2786 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2787 obd_valid valid = OBD_MD_FLGETATTR;
2788 struct md_op_data *op_data;
/* Regular files also need striping EA; size the reply buffer for it. */
2791 if (S_ISREG(inode->i_mode)) {
2792 rc = ll_get_max_mdsize(sbi, &ealen);
2795 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2798 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
2799 0, ealen, LUSTRE_OPC_ANY,
2801 if (IS_ERR(op_data))
2802 RETURN(PTR_ERR(op_data));
2804 op_data->op_valid = valid;
2805 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2806  * capa for this inode. Because we only keep capas of dirs
2808 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
2809 ll_finish_md_op_data(op_data);
2811 rc = ll_inode_revalidate_fini(inode, rc);
/* Refresh the in-core inode from the getattr reply. */
2815 rc = ll_prep_inode(&inode, req, NULL, NULL);
2818 ptlrpc_req_finished(req);
/* Revalidate MDS attributes, then refresh size/times: non-regular objects
 * take their times straight from the cached lvb; regular files glimpse the
 * OSTs for an up-to-date size. */
2822 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2825 struct inode *inode = dentry->d_inode;
2829 rc = __ll_inode_revalidate_it(dentry, it, ibits);
2833 /* if object isn't regular file, don't validate size */
2834 if (!S_ISREG(inode->i_mode)) {
2835 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
2836 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
2837 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
2839 rc = ll_glimpse_size(inode);
/* Fill *stat for ->getattr(): revalidate UPDATE|LOOKUP ibits first, then
 * copy attributes out of the (now fresh) in-core inode. */
2844 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2845 struct lookup_intent *it, struct kstat *stat)
2847 struct inode *inode = de->d_inode;
2848 struct ll_sb_info *sbi = ll_i2sbi(inode);
2849 struct ll_inode_info *lli = ll_i2info(inode);
2852 res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE |
2853 MDS_INODELOCK_LOOKUP);
2854 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
2859 stat->dev = inode->i_sb->s_dev;
/* 32-bit userland needs an ino that fits in 32 bits; build it from the FID. */
2860 if (ll_need_32bit_api(sbi))
2861 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
2863 stat->ino = inode->i_ino;
2864 stat->mode = inode->i_mode;
2865 stat->nlink = inode->i_nlink;
2866 stat->uid = inode->i_uid;
2867 stat->gid = inode->i_gid;
2868 stat->rdev = inode->i_rdev;
2869 stat->atime = inode->i_atime;
2870 stat->mtime = inode->i_mtime;
2871 stat->ctime = inode->i_ctime;
2872 stat->blksize = 1 << inode->i_blkbits;
2874 stat->size = i_size_read(inode);
2875 stat->blocks = inode->i_blocks;
/* VFS ->getattr entry point: delegate to ll_getattr_it with a fresh
 * IT_GETATTR intent. */
2879 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2881 struct lookup_intent it = { .it_op = IT_GETATTR };
2883 return ll_getattr_it(mnt, de, &it, stat);
2886 #ifdef HAVE_LINUX_FIEMAP_H
/* VFS ->fiemap handler: marshal the kernel's fiemap_extent_info into a
 * Lustre ll_user_fiemap, run the mapping, and copy the results back. */
2887 int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2888 __u64 start, __u64 len)
2892 struct ll_user_fiemap *fiemap;
2893 unsigned int extent_count = fieinfo->fi_extents_max;
2895 num_bytes = sizeof(*fiemap) + (extent_count *
2896 sizeof(struct ll_fiemap_extent));
2897 OBD_ALLOC_LARGE(fiemap, num_bytes);
2902 fiemap->fm_flags = fieinfo->fi_flags;
2903 fiemap->fm_extent_count = fieinfo->fi_extents_max;
2904 fiemap->fm_start = start;
2905 fiemap->fm_length = len;
/* NOTE(review): only a single extent is copied in here, although
 * extent_count extents are allocated above — looks like it seeds the first
 * extent for continuation; confirm against ll_do_fiemap's contract. */
2906 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
2907 sizeof(struct ll_fiemap_extent));
2909 rc = ll_do_fiemap(inode, fiemap, num_bytes);
2911 fieinfo->fi_flags = fiemap->fm_flags;
2912 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
/* Copy back only the extents that were actually mapped. */
2913 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
2914 fiemap->fm_mapped_extents * sizeof(struct ll_fiemap_extent));
2916 OBD_FREE_LARGE(fiemap, num_bytes);
/* Return a referenced copy of the cached POSIX ACL of @type for @inode.
 * lli_lock protects lli_posix_acl against concurrent update. */
2921 struct posix_acl * ll_get_acl(struct inode *inode, int type)
2923 struct ll_inode_info *lli = ll_i2info(inode);
2924 struct posix_acl *acl = NULL;
2927 spin_lock(&lli->lli_lock);
2928 /* VFS' acl_permission_check->check_acl will release the refcount */
2929 acl = posix_acl_dup(lli->lli_posix_acl);
2930 spin_unlock(&lli->lli_lock);
2935 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
/* ACL permission check callback for generic_permission(); the kernel-version
 * ifdefs select the matching prototype. */
2937 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
2938 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
2940 ll_check_acl(struct inode *inode, int mask)
2943 # ifdef CONFIG_FS_POSIX_ACL
2944 struct posix_acl *acl;
/* In RCU walk we may not block; bail out (return path elided). */
2948 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
2949 if (flags & IPERM_FLAG_RCU)
2952 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
2957 rc = posix_acl_permission(inode, acl, mask);
2958 posix_acl_release(acl);
2961 # else /* !CONFIG_FS_POSIX_ACL */
2963 # endif /* CONFIG_FS_POSIX_ACL */
2965 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
2967 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
/* VFS ->permission entry point; ifdefs pick the prototype matching the
 * running kernel. */
2968 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
2970 # ifdef HAVE_INODE_PERMISION_2ARGS
2971 int ll_inode_permission(struct inode *inode, int mask)
2973 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
/* RCU path walk: cannot sleep, so decline (return path elided). */
2980 #ifdef MAY_NOT_BLOCK
2981 if (mask & MAY_NOT_BLOCK)
2983 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
2984 if (flags & IPERM_FLAG_RCU)
2988 /* as root inode are NOT getting validated in lookup operation,
2989  * need to do it before permission check. */
2991 if (inode == inode->i_sb->s_root->d_inode) {
2992 struct lookup_intent it = { .it_op = IT_LOOKUP };
2994 rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
2995 MDS_INODELOCK_LOOKUP);
3000 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
3001 inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
/* Remote-client mounts defer the decision to the remote perm check. */
3003 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
3004 return lustre_check_remote_perm(inode, mask);
3006 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
3007 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
/* Map the vectored read/write file_operations slots to whichever interface
 * the kernel provides: legacy readv/writev or the aio_read/aio_write pair. */
3012 #ifdef HAVE_FILE_READV
3013 #define READ_METHOD readv
3014 #define READ_FUNCTION ll_file_readv
3015 #define WRITE_METHOD writev
3016 #define WRITE_FUNCTION ll_file_writev
3018 #define READ_METHOD aio_read
3019 #define READ_FUNCTION ll_file_aio_read
3020 #define WRITE_METHOD aio_write
3021 #define WRITE_FUNCTION ll_file_aio_write
3024 /* -o localflock - only provides locally consistent flock locks */
3025 struct file_operations ll_file_operations = {
3026         .read           = ll_file_read,
3027         .READ_METHOD    = READ_FUNCTION,
3028         .write          = ll_file_write,
3029         .WRITE_METHOD   = WRITE_FUNCTION,
3030         .unlocked_ioctl = ll_file_ioctl,
3031         .open           = ll_file_open,
3032         .release        = ll_file_release,
3033         .mmap           = ll_file_mmap,
3034         .llseek         = ll_file_seek,
3035 #ifdef HAVE_KERNEL_SENDFILE
3036         .sendfile       = ll_file_sendfile,
3038 #ifdef HAVE_KERNEL_SPLICE_READ
3039         .splice_read    = ll_file_splice_read,
/* Default fops variant: cluster-coherent flock/posix locks via ll_file_flock. */
3045 struct file_operations ll_file_operations_flock = {
3046         .read           = ll_file_read,
3047         .READ_METHOD    = READ_FUNCTION,
3048         .write          = ll_file_write,
3049         .WRITE_METHOD   = WRITE_FUNCTION,
3050         .unlocked_ioctl = ll_file_ioctl,
3051         .open           = ll_file_open,
3052         .release        = ll_file_release,
3053         .mmap           = ll_file_mmap,
3054         .llseek         = ll_file_seek,
3055 #ifdef HAVE_KERNEL_SENDFILE
3056         .sendfile       = ll_file_sendfile,
3058 #ifdef HAVE_KERNEL_SPLICE_READ
3059         .splice_read    = ll_file_splice_read,
3063         .flock          = ll_file_flock,
3064         .lock           = ll_file_flock
3067 /* These are for -o noflock - to return ENOSYS on flock calls */
3068 struct file_operations ll_file_operations_noflock = {
3069         .read           = ll_file_read,
3070         .READ_METHOD    = READ_FUNCTION,
3071         .write          = ll_file_write,
3072         .WRITE_METHOD   = WRITE_FUNCTION,
3073         .unlocked_ioctl = ll_file_ioctl,
3074         .open           = ll_file_open,
3075         .release        = ll_file_release,
3076         .mmap           = ll_file_mmap,
3077         .llseek         = ll_file_seek,
3078 #ifdef HAVE_KERNEL_SENDFILE
3079         .sendfile       = ll_file_sendfile,
3081 #ifdef HAVE_KERNEL_SPLICE_READ
3082         .splice_read    = ll_file_splice_read,
3086         .flock          = ll_file_noflock,
3087         .lock           = ll_file_noflock
/* inode_operations shared by all regular-file inodes of the llite client. */
3090 struct inode_operations ll_file_inode_operations = {
3091         .setattr        = ll_setattr,
3092         .getattr        = ll_getattr,
3093         .permission     = ll_inode_permission,
3094         .setxattr       = ll_setxattr,
3095         .getxattr       = ll_getxattr,
3096         .listxattr      = ll_listxattr,
3097         .removexattr    = ll_removexattr,
3098 #ifdef HAVE_LINUX_FIEMAP_H
3099         .fiemap         = ll_fiemap,
3101 #ifdef HAVE_IOP_GET_ACL
3102         .get_acl        = ll_get_acl,
3106 /* dynamic ioctl number support routines */
/* Registry of dynamically registered ioctl handlers: a list of llioc_data
 * entries protected by a read/write semaphore. */
3107 static struct llioc_ctl_data {
3108 struct rw_semaphore ioc_sem;
3109 cfs_list_t ioc_head;
3111 __RWSEM_INITIALIZER(llioc.ioc_sem),
3112 CFS_LIST_HEAD_INIT(llioc.ioc_head)
/* One registered handler: its callback plus the ioctl commands it accepts
 * (iocd_cmd is a trailing variable-length array, count in iocd_count). */
3117 cfs_list_t iocd_list;
3118 unsigned int iocd_size;
3119 llioc_callback_t iocd_cb;
3120 unsigned int iocd_count;
3121 unsigned int iocd_cmd[0];
/* Register a dynamic ioctl callback for @count commands in @cmd.
 * Returns an opaque handle (presumably in_data; tail elided) to pass to
 * ll_iocontrol_unregister, or NULL on bad arguments / allocation failure. */
3124 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3127 struct llioc_data *in_data = NULL;
3130 if (cb == NULL || cmd == NULL ||
3131 count > LLIOC_MAX_CMD || count < 0)
/* Allocation includes the trailing iocd_cmd[] array. */
3134 size = sizeof(*in_data) + count * sizeof(unsigned int);
3135 OBD_ALLOC(in_data, size);
3136 if (in_data == NULL)
3139 memset(in_data, 0, sizeof(*in_data));
3140 in_data->iocd_size = size;
3141 in_data->iocd_cb = cb;
3142 in_data->iocd_count = count;
3143 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3145 down_write(&llioc.ioc_sem);
3146 cfs_list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3147 up_write(&llioc.ioc_sem);
/* Unregister a handler previously returned by ll_iocontrol_register.
 * Searches the registry for @magic (the handle), unlinks and frees it;
 * warns if the handle is not found. */
3152 void ll_iocontrol_unregister(void *magic)
3154 struct llioc_data *tmp;
3159 down_write(&llioc.ioc_sem);
3160 cfs_list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* Save the size before dropping the entry — needed for OBD_FREE. */
3162 unsigned int size = tmp->iocd_size;
3164 cfs_list_del(&tmp->iocd_list);
3165 up_write(&llioc.ioc_sem);
3167 OBD_FREE(tmp, size);
3171 up_write(&llioc.ioc_sem);
3173 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3176 EXPORT_SYMBOL(ll_iocontrol_register);
3177 EXPORT_SYMBOL(ll_iocontrol_unregister);
/* Dispatch ioctl @cmd to the registered dynamic handlers, stopping at the
 * first handler that returns LLIOC_STOP; its rc is passed back via *rcp. */
3179 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
3180 unsigned int cmd, unsigned long arg, int *rcp)
3182 enum llioc_iter ret = LLIOC_CONT;
3183 struct llioc_data *data;
3184 int rc = -EINVAL, i;
3186 down_read(&llioc.ioc_sem);
3187 cfs_list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3188 for (i = 0; i < data->iocd_count; i++) {
/* Skip handlers that did not register this command. */
3189 if (cmd != data->iocd_cmd[i])
3192 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3196 if (ret == LLIOC_STOP)
3199 up_read(&llioc.ioc_sem);
/* Apply a layout configuration to the inode's cl_object.  For CONF_SET,
 * additionally allow the layout lock to be matched once the layout is in
 * place, so no one can observe a stale layout through a matched lock. */
3206 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3208 struct ll_inode_info *lli = ll_i2info(inode);
3209 struct cl_env_nest nest;
/* No cl_object yet (e.g. inode without stripe info): nothing to configure. */
3214 if (lli->lli_clob == NULL)
3217 env = cl_env_nested_get(&nest);
3219 RETURN(PTR_ERR(env));
3221 result = cl_conf_set(env, lli->lli_clob, conf);
3222 cl_env_nested_put(&nest, env);
3224 if (conf->coc_opc == OBJECT_CONF_SET) {
3225 struct ldlm_lock *lock = conf->coc_lock;
3227 LASSERT(lock != NULL);
3228 LASSERT(ldlm_has_layout(lock));
3230 /* it can only be allowed to match after layout is
3231  * applied to inode otherwise false layout would be
3232  * seen. Applying layout should happen before dropping
3233  * the intent lock. */
3234 ldlm_lock_allow_match(lock);
3240 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
3241 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3244 struct ll_sb_info *sbi = ll_i2sbi(inode);
3245 struct obd_capa *oc;
3246 struct ptlrpc_request *req;
3247 struct mdt_body *body;
/* LVB already populated (lock granted right away): nothing to fetch. */
3254 if (lock->l_lvb_data != NULL)
3257 /* if layout lock was granted right away, the layout is returned
3258  * within DLM_LVB of dlm reply; otherwise if the lock was ever
3259  * blocked and then granted via completion ast, we have to fetch
3260  * layout here. Please note that we can't use the LVB buffer in
3261  * completion AST because it doesn't have a large enough buffer */
3262 oc = ll_mdscapa_get(inode);
3263 rc = ll_get_max_mdsize(sbi, &lmmsize);
3265 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3266 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3272 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
/* Sanity: reply EA must fit the buffer we sized above. */
3273 if (body == NULL || body->eadatasize > lmmsize)
3274 GOTO(out, rc = -EPROTO);
3276 lmmsize = body->eadatasize;
3277 if (lmmsize == 0) /* empty layout */
3280 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3282 GOTO(out, rc = -EFAULT);
/* Copy LOV EA into a buffer we can hand to the lock as its LVB. */
3284 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3285 if (lvbdata == NULL)
3286 GOTO(out, rc = -ENOMEM);
3288 memcpy(lvbdata, lmm, lmmsize);
3289 lock_res_and_lock(lock);
/* Install only if we raced no one; otherwise free our copy below. */
3290 if (lock->l_lvb_data == NULL) {
3291 lock->l_lvb_data = lvbdata;
3292 lock->l_lvb_len = lmmsize;
3295 unlock_res_and_lock(lock);
3297 if (lvbdata != NULL)
3298 OBD_FREE_LARGE(lvbdata, lmmsize);
3302 ptlrpc_req_finished(req);
3307  * Apply the layout to the inode. Layout lock is held and will be released
/* @gen receives the resulting layout generation; @reconf selects whether an
 * already-configured object must be reconfigured with the fetched layout. */
3310 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3311 struct inode *inode, __u32 *gen, bool reconf)
3313 struct ll_inode_info *lli = ll_i2info(inode);
3314 struct ll_sb_info *sbi = ll_i2sbi(inode);
3315 struct ldlm_lock *lock;
3316 struct lustre_md md = { NULL };
3317 struct cl_object_conf conf;
3320 bool wait_layout = false;
3323 LASSERT(lustre_handle_is_used(lockh));
3325 lock = ldlm_handle2lock(lockh);
3326 LASSERT(lock != NULL);
3327 LASSERT(ldlm_has_layout(lock));
3329 LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
3330 inode, PFID(&lli->lli_fid), reconf);
3332 /* in case this is a caching lock and reinstate with new inode */
3333 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3335 lock_res_and_lock(lock);
3336 lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
3337 unlock_res_and_lock(lock);
3338 /* checking lvb_ready is racy but this is okay. The worst case is
3339  * that multi processes may configure the file on the same time. */
3340 if (lvb_ready || !reconf) {
3343 /* layout_gen must be valid if layout lock is not
3344  * cancelled and stripe has already set */
3345 *gen = lli->lli_layout_gen;
/* LVB not ready: pull the layout EA from the MDT into the lock's LVB. */
3351 rc = ll_layout_fetch(inode, lock);
3355 /* for layout lock, lmm is returned in lock's lvb.
3356  * lvb_data is immutable if the lock is held so it's safe to access it
3357  * without res lock. See the description in ldlm_lock_decref_internal()
3358  * for the condition to free lvb_data of layout lock */
3359 if (lock->l_lvb_data != NULL) {
3360 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3361 lock->l_lvb_data, lock->l_lvb_len);
/* No stripe md unpacked: object has an empty layout. */
3363 *gen = LL_LAYOUT_GEN_EMPTY;
3365 *gen = md.lsm->lsm_layout_gen;
3368 CERROR("%s: file "DFID" unpackmd error: %d\n",
3369 ll_get_fsname(inode->i_sb, NULL, 0),
3370 PFID(&lli->lli_fid), rc);
3376 /* set layout to file. Unlikely this will fail as old layout was
3377  * surely eliminated */
3378 memset(&conf, 0, sizeof conf);
3379 conf.coc_opc = OBJECT_CONF_SET;
3380 conf.coc_inode = inode;
3381 conf.coc_lock = lock;
3382 conf.u.coc_md = &md;
3383 rc = ll_layout_conf(inode, &conf);
3386 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3388 /* refresh layout failed, need to wait */
3389 wait_layout = rc == -EBUSY;
/* Drop our references before possibly waiting for in-flight IO. */
3393 LDLM_LOCK_PUT(lock);
3394 ldlm_lock_decref(lockh, mode);
3396 /* wait for IO to complete if it's still being used. */
3398 CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
3399 ll_get_fsname(inode->i_sb, NULL, 0),
3400 inode, PFID(&lli->lli_fid));
3402 memset(&conf, 0, sizeof conf);
3403 conf.coc_opc = OBJECT_CONF_WAIT;
3404 conf.coc_inode = inode;
3405 rc = ll_layout_conf(inode, &conf);
3409 CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
3410 PFID(&lli->lli_fid), rc);
3416  * This function checks if there exists a LAYOUT lock on the client side,
3417  * or enqueues it if it doesn't have one in cache.
3419  * This function will not hold layout lock so it may be revoked any time after
3420  * this function returns. Any operations depend on layout should be redone
3423  * This function should be called before lov_io_init() to get an uptodate
3424  * layout version, the caller should save the version number and after IO
3425  * is finished, this function should be called again to verify that layout
3426  * is not changed during IO time.
3428 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3430 struct ll_inode_info *lli = ll_i2info(inode);
3431 struct ll_sb_info *sbi = ll_i2sbi(inode);
3432 struct md_op_data *op_data;
3433 struct lookup_intent it;
3434 struct lustre_handle lockh;
3436 struct ldlm_enqueue_info einfo = { .ei_type = LDLM_IBITS,
3438 .ei_cb_bl = ll_md_blocking_ast,
3439 .ei_cb_cp = ldlm_completion_ast,
3440 .ei_cbdata = NULL };
/* Default to the cached generation; good enough when layout locks are off. */
3444 *gen = lli->lli_layout_gen;
3445 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK))
3449 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3450 LASSERT(S_ISREG(inode->i_mode));
3452 /* mostly layout lock is caching on the local side, so try to match
3453  * it before grabbing layout lock mutex. */
3454 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
3455 if (mode != 0) { /* hit cached lock */
/* reconf=false: fast path, no reconfiguration wanted here. */
3456 rc = ll_layout_lock_set(&lockh, mode, inode, gen, false);
3460 /* better hold lli_layout_mutex to try again otherwise
3461  * it will have starvation problem. */
3464 /* take layout lock mutex to enqueue layout lock exclusively. */
3465 mutex_lock(&lli->lli_layout_mutex);
3468 /* try again. Maybe somebody else has done this. */
3469 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
3470 if (mode != 0) { /* hit cached lock */
3471 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3475 mutex_unlock(&lli->lli_layout_mutex);
3479 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3480 0, 0, LUSTRE_OPC_ANY, NULL);
3481 if (IS_ERR(op_data)) {
3482 mutex_unlock(&lli->lli_layout_mutex);
3483 RETURN(PTR_ERR(op_data));
3486 /* have to enqueue one */
3487 memset(&it, 0, sizeof(it));
3488 it.it_op = IT_LAYOUT;
3489 lockh.cookie = 0ULL;
3491 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
3492 ll_get_fsname(inode->i_sb, NULL, 0), inode,
3493 PFID(&lli->lli_fid));
3495 rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
/* The intent request is not needed once the lock is obtained. */
3497 if (it.d.lustre.it_data != NULL)
3498 ptlrpc_req_finished(it.d.lustre.it_data);
3499 it.d.lustre.it_data = NULL;
3501 ll_finish_md_op_data(op_data);
/* Transfer lock ownership out of the intent before dropping it. */
3503 mode = it.d.lustre.it_lock_mode;
3504 it.d.lustre.it_lock_mode = 0;
3505 ll_intent_drop_lock(&it);
3508 /* set lock data in case this is a new lock */
3509 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3510 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3514 mutex_unlock(&lli->lli_layout_mutex);