4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2012, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include "llite_internal.h"
49 #include <lustre/ll_fiemap.h>
51 #include "cl_object.h"
/*
 * Allocate one per-open-file ll_file_data from the ll_file_data_slab
 * cache (CFS_ALLOC_IO allocation context) and clear its write-failure
 * flag.  NOTE(review): the NULL check after allocation and the return
 * statement are not visible in this extract — confirm against the
 * full source.
 */
53 struct ll_file_data *ll_file_data_get(void)
55 struct ll_file_data *fd;
57 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, CFS_ALLOC_IO);
/* fd_write_failed tracks whether any write through this fd failed. */
58 fd->fd_write_failed = false;
/*
 * Release an ll_file_data back to its slab cache.  Counterpart of
 * ll_file_data_get().
 */
62 static void ll_file_data_put(struct ll_file_data *fd)
65 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Pack the current inode attributes plus an open file handle into
 * @op_data, ready to be sent to the MDS (used on close).  Copies the
 * fid, mode, a/m/ctime, size, block count, inode flags and the current
 * IO epoch, takes an MDS capability reference, and requests the
 * MDS_DATA_MODIFIED bias when the inode carries LLIF_DATA_MODIFIED.
 */
68 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
69 struct lustre_handle *fh)
71 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
72 op_data->op_attr.ia_mode = inode->i_mode;
73 op_data->op_attr.ia_atime = inode->i_atime;
74 op_data->op_attr.ia_mtime = inode->i_mtime;
75 op_data->op_attr.ia_ctime = inode->i_ctime;
76 op_data->op_attr.ia_size = i_size_read(inode);
77 op_data->op_attr_blocks = inode->i_blocks;
/* ia_attr_flags lives in the Lustre-extended iattr wrapper; convert
 * the kernel inode flags to their on-wire (ext) representation. */
78 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
79 ll_inode_to_ext_flags(inode->i_flags);
80 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
/* Open handle being closed, plus an MDS capability reference;
 * the capa reference is consumed by the request machinery. */
82 op_data->op_handle = *fh;
83 op_data->op_capa1 = ll_mdscapa_get(inode);
/* Tell the MDS that file data was modified so it can update SOM /
 * changelog state on close. */
85 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
86 op_data->op_bias |= MDS_DATA_MODIFIED;
90 * Closes the IO epoch and packs all the attributes into @op_data for
/*
 * Prepare @op_data for an MDS close of handle @och on @inode: mark
 * which attributes are valid, optionally add size/blocks (skipped for
 * regular files when Size-on-MDS is supported by the server), close
 * the IO epoch, and pack the inode attributes and file handle.
 */
93 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
94 struct obd_client_handle *och)
98 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
99 ATTR_MTIME_SET | ATTR_CTIME_SET;
/* Size/blocks handling below only matters for write opens;
 * NOTE(review): the intervening control flow is not fully visible in
 * this extract. */
101 if (!(och->och_flags & FMODE_WRITE))
/* Without server-side SOM support (or for non-regular files) the
 * client must send size and blocks itself. */
104 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
105 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* Note: &och (pointer-to-pointer) — ll_ioepoch_close() may clear the
 * caller's handle pointer. */
107 ll_ioepoch_close(inode, op_data, &och, 0);
110 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
111 ll_prep_md_op_data(op_data, inode, NULL, NULL,
112 0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Send the MDS close RPC for open handle @och on @inode and tear down
 * the client-side open state.  On a Size-on-MDS close the OST
 * attributes are gathered and sent back via ll_som_update().  On
 * success the MDS_DATA_MODIFIED hint is cleared from lli_flags.
 * Frees @op_data, destroys orphan OST objects referenced by the close
 * reply, clears the open-replay data, poisons the handle cookie and
 * drops the close request.
 */
116 static int ll_close_inode_openhandle(struct obd_export *md_exp,
118 struct obd_client_handle *och)
120 struct obd_export *exp = ll_i2mdexp(inode);
121 struct md_op_data *op_data;
122 struct ptlrpc_request *req = NULL;
123 struct obd_device *obd = class_exp2obd(exp);
/* Sanity check on the MDC connection before talking to the server. */
130 * XXX: in case of LMV, is this correct to access
133 CERROR("Invalid MDC connection handle "LPX64"\n",
134 ll_i2mdexp(inode)->exp_handle.h_cookie);
138 OBD_ALLOC_PTR(op_data);
/* Known deficiency, kept from the original code: on allocation
 * failure the open handle and request are leaked. */
140 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
142 ll_prepare_close(inode, op_data, och);
143 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
144 rc = md_close(md_exp, op_data, och->och_mod, &req);
/* NOTE(review): the branch condition selecting the SOM-update path
 * (rc == -EAGAIN upstream) is not visible in this extract. */
146 /* This close must have the epoch closed. */
147 LASSERT(epoch_close);
148 /* MDS has instructed us to obtain Size-on-MDS attribute from
149 * OSTs and send setattr to back to MDS. */
150 rc = ll_som_update(inode, op_data);
152 CERROR("inode %lu mdc Size-on-MDS update failed: "
153 "rc = %d\n", inode->i_ino, rc);
157 CERROR("inode %lu mdc close failed: rc = %d\n",
161 /* DATA_MODIFIED flag was successfully sent on close, cancel data
162 * modification flag. */
163 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
164 struct ll_inode_info *lli = ll_i2info(inode);
/* lli_flags is protected by lli_lock. */
166 spin_lock(&lli->lli_lock);
167 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
168 spin_unlock(&lli->lli_lock);
171 ll_finish_md_op_data(op_data);
/* Destroy OST objects the MDS told us are orphaned by this close. */
174 rc = ll_objects_destroy(req, inode);
176 CERROR("inode %lu ll_objects destroy: rc = %d\n",
/* If the epoch could not be closed yet for a SOM-enabled regular file
 * opened for write, queue a deferred DONE_WRITING. */
183 if (exp_connect_som(exp) && !epoch_close &&
184 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
185 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
187 md_clear_open_replay_data(md_exp, och);
188 /* Free @och if it is not waiting for DONE_WRITING. */
189 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
192 if (req) /* This is close request */
193 ptlrpc_req_finished(req);
/*
 * Really close the MDS open handle of the given mode (@flags is one of
 * FMODE_WRITE / FMODE_EXEC / FMODE_READ) for @inode.  Selects the
 * per-mode handle slot and its use count, and only closes the handle
 * when no users remain; the check and the handle hand-off are done
 * under lli_och_mutex.
 */
197 int ll_md_real_close(struct inode *inode, int flags)
199 struct ll_inode_info *lli = ll_i2info(inode);
200 struct obd_client_handle **och_p;
201 struct obd_client_handle *och;
/* Pick the open-handle slot matching the open mode. */
206 if (flags & FMODE_WRITE) {
207 och_p = &lli->lli_mds_write_och;
208 och_usecount = &lli->lli_open_fd_write_count;
209 } else if (flags & FMODE_EXEC) {
210 och_p = &lli->lli_mds_exec_och;
211 och_usecount = &lli->lli_open_fd_exec_count;
213 LASSERT(flags & FMODE_READ);
214 och_p = &lli->lli_mds_read_och;
215 och_usecount = &lli->lli_open_fd_read_count;
218 mutex_lock(&lli->lli_och_mutex);
219 if (*och_usecount) { /* There are still users of this handle, so
221 mutex_unlock(&lli->lli_och_mutex);
/* NOTE(review): the lines detaching *och_p into och under the mutex
 * are not visible in this extract. */
226 mutex_unlock(&lli->lli_och_mutex);
228 if (och) { /* There might be a race and somebody have freed this och
230 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-file-descriptor close: drop any group lock, decrement the
 * per-mode open count, and — if no cached OPEN DLM lock covers this
 * open mode — call ll_md_real_close() to close the MDS handle.
 * Finally detach and free the ll_file_data and close the capability.
 */
237 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
240 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
241 struct ll_inode_info *lli = ll_i2info(inode);
245 /* clear group lock, if present */
246 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
247 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
249 /* Let's see if we have good enough OPEN lock on the file and if
250 we can skip talking to MDS */
251 if (file->f_dentry->d_inode) { /* Can this ever be false? */
/* Match only already-granted locks, and do not take a reference. */
253 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
254 struct lustre_handle lockh;
255 struct inode *inode = file->f_dentry->d_inode;
256 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
/* Drop this fd's contribution to the per-mode open count, under
 * lli_och_mutex. */
258 mutex_lock(&lli->lli_och_mutex);
259 if (fd->fd_omode & FMODE_WRITE) {
261 LASSERT(lli->lli_open_fd_write_count);
262 lli->lli_open_fd_write_count--;
263 } else if (fd->fd_omode & FMODE_EXEC) {
265 LASSERT(lli->lli_open_fd_exec_count);
266 lli->lli_open_fd_exec_count--;
269 LASSERT(lli->lli_open_fd_read_count);
270 lli->lli_open_fd_read_count--;
272 mutex_unlock(&lli->lli_och_mutex);
/* No cached OPEN ibits lock for this mode: must do the real close. */
274 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
275 LDLM_IBITS, &policy, lockmode,
277 rc = ll_md_real_close(file->f_dentry->d_inode,
281 CERROR("Releasing a file %p with negative dentry %p. Name %s",
282 file, file->f_dentry, file->f_dentry->d_name.name);
285 LUSTRE_FPRIVATE(file) = NULL;
286 ll_file_data_put(fd);
287 ll_capa_close(inode);
292 /* While this returns an error code, fput() the caller does not, so we need
293 * to make every effort to clean up all of our state here. Also, applications
294 * rarely check close errors and even if an error is returned they will not
295 * re-try the close call.
/*
 * .release handler for regular files and directories.  Cleans up
 * remote-ACL state for the root inode, stops a statahead thread this
 * fd started, short-circuits the root dentry (no MDS close needed),
 * harvests async write errors for regular files, then performs the
 * MDS close via ll_md_close().
 */
297 int ll_file_release(struct inode *inode, struct file *file)
299 struct ll_file_data *fd;
300 struct ll_sb_info *sbi = ll_i2sbi(inode);
301 struct ll_inode_info *lli = ll_i2info(inode);
305 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
306 inode->i_generation, inode);
308 #ifdef CONFIG_FS_POSIX_ACL
/* Remote-client ACL bookkeeping is only attached to the root inode. */
309 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
310 inode == inode->i_sb->s_root->d_inode) {
311 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
314 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
315 fd->fd_flags &= ~LL_FILE_RMTACL;
316 rct_del(&sbi->ll_rct, cfs_curproc_pid());
317 et_search_free(&sbi->ll_et, cfs_curproc_pid());
/* Don't count releases of the root dentry in the stats. */
322 if (inode->i_sb->s_root != file->f_dentry)
323 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
324 fd = LUSTRE_FPRIVATE(file);
327 /* The last ref on @file, maybe not the the owner pid of statahead.
328 * Different processes can open the same dir, "ll_opendir_key" means:
329 * it is me that should stop the statahead thread. */
330 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
331 lli->lli_opendir_pid != 0)
332 ll_stop_statahead(inode, lli->lli_opendir_key);
/* The root dentry never has an MDS open handle to close here. */
334 if (inode->i_sb->s_root == file->f_dentry) {
335 LUSTRE_FPRIVATE(file) = NULL;
336 ll_file_data_put(fd);
/* Pick up any asynchronous write error so close() can report it. */
340 if (!S_ISDIR(inode->i_mode)) {
341 lov_read_and_clear_async_rc(lli->lli_clob);
342 lli->lli_async_rc = 0;
345 rc = ll_md_close(sbi->ll_md_exp, inode, file);
/* Fault-injection hook used by tests to dump the debug log. */
347 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
348 libcfs_debug_dumplog();
/*
 * Enqueue an open intent to the MDS for @file, optionally carrying
 * striping info (@lmm/@lmmsize, used by setstripe).  When no striping
 * is being set, an OPEN DLM lock is requested (MDS_OPEN_LOCK) so later
 * closes can be handled locally.  On success the intent's reply is
 * used to refresh the inode and install the lock data.
 */
353 static int ll_intent_file_open(struct file *file, void *lmm,
354 int lmmsize, struct lookup_intent *itp)
356 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
357 struct dentry *parent = file->f_dentry->d_parent;
358 const char *name = file->f_dentry->d_name.name;
359 const int len = file->f_dentry->d_name.len;
360 struct md_op_data *op_data;
361 struct ptlrpc_request *req;
362 __u32 opc = LUSTRE_OPC_ANY;
369 /* Usually we come here only for NFSD, and we want open lock.
370 But we can also get here with pre 2.6.15 patchless kernels, and in
371 that case that lock is also ok */
372 /* We can also get here if there was cached open handle in revalidate_it
373 * but it disappeared while we were getting from there to ll_file_open.
374 * But this means this file was closed and immediatelly opened which
375 * makes a good candidate for using OPEN lock */
376 /* If lmmsize & lmm are not 0, we are just setting stripe info
377 * parameters. No need for the open lock */
378 if (lmm == NULL && lmmsize == 0) {
379 itp->it_flags |= MDS_OPEN_LOCK;
380 if (itp->it_flags & FMODE_WRITE)
381 opc = LUSTRE_OPC_CREATE;
384 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
385 file->f_dentry->d_inode, name, len,
388 RETURN(PTR_ERR(op_data));
/* The inode already exists, so open it by FID rather than by name. */
390 itp->it_flags |= MDS_OPEN_BY_FID;
391 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
392 0 /*unused */, &req, ll_md_blocking_ast, 0);
393 ll_finish_md_op_data(op_data);
395 /* reason for keep own exit path - don`t flood log
396 * with messages with -ESTALE errors.
/* NOTE(review): this is the -ESTALE handling path; the condition
 * selecting it is not visible in this extract. */
398 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
399 it_open_error(DISP_OPEN_OPEN, itp))
401 ll_release_openhandle(file->f_dentry, itp);
405 if (it_disposition(itp, DISP_LOOKUP_NEG))
406 GOTO(out, rc = -ENOENT);
408 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
409 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
410 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* Refresh the inode from the intent reply and, if a lock was granted,
 * record the lock data on the inode. */
414 rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
415 if (!rc && itp->d.lustre.it_lock_mode)
416 ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
/* Cleanup path: drop the enqueue reply and any lock held by the
 * intent. */
420 ptlrpc_req_finished(itp->d.lustre.it_data);
421 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
422 ll_intent_drop_lock(itp);
428 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
429 * not believe attributes if a few ioepoch holders exist. Attributes for
430 * previous ioepoch if new one is opened are also skipped by MDS.
/* Record a new IO epoch on the inode; a zero or unchanged epoch is
 * ignored. */
432 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
434 if (ioepoch && lli->lli_ioepoch != ioepoch) {
435 lli->lli_ioepoch = ioepoch;
436 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
437 ioepoch, PFID(&lli->lli_fid));
/*
 * Initialize an obd_client_handle from the MDT reply carried by the
 * open intent @it: copy the server file handle, set the magic, fid and
 * open flags, record the new IO epoch, and register the open for
 * replay with the MD layer.
 */
441 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
442 struct lookup_intent *it, struct obd_client_handle *och)
444 struct ptlrpc_request *req = it->d.lustre.it_data;
445 struct mdt_body *body;
449 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
450 LASSERT(body != NULL); /* reply already checked out */
452 memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
453 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
454 och->och_fid = lli->lli_fid;
455 och->och_flags = it->it_flags;
456 ll_ioepoch_open(lli, body->ioepoch);
/* Register so the open can be replayed after MDS recovery. */
458 return md_set_open_replay_data(md_exp, och, req);
/*
 * Finish an open locally: when a new MDS handle was obtained (@och
 * non-NULL, per upstream — NOTE(review): the guarding condition is not
 * visible in this extract), fill it from the intent reply; then attach
 * @fd as the file's private data, initialize readahead state and
 * remember the open mode.
 */
461 int ll_local_open(struct file *file, struct lookup_intent *it,
462 struct ll_file_data *fd, struct obd_client_handle *och)
464 struct inode *inode = file->f_dentry->d_inode;
465 struct ll_inode_info *lli = ll_i2info(inode);
468 LASSERT(!LUSTRE_FPRIVATE(file));
473 struct ptlrpc_request *req = it->d.lustre.it_data;
474 struct mdt_body *body;
477 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
481 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
/* Debug-only trace when the MDS sent a size with a write open. */
482 if ((it->it_flags & FMODE_WRITE) &&
483 (body->valid & OBD_MD_FLSIZE))
484 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
485 lli->lli_ioepoch, PFID(&lli->lli_fid));
488 LUSTRE_FPRIVATE(file) = fd;
489 ll_readahead_init(inode, &fd->fd_ras);
/* fd_omode is consulted at close time to pick the och slot. */
490 fd->fd_omode = it->it_flags;
494 /* Open a file, and (for the very first open) create objects on the OSTs at
495 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
496 * creation or open until ll_lov_setstripe() ioctl is called.
498 * If we already have the stripe MD locally then we don't request it in
499 * md_open(), by passing a lmm_size = 0.
501 * It is up to the application to ensure no other processes open this file
502 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
503 * used. We might be able to avoid races of that sort by getting lli_open_sem
504 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
505 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
507 int ll_file_open(struct inode *inode, struct file *file)
509 struct ll_inode_info *lli = ll_i2info(inode);
510 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
511 .it_flags = file->f_flags };
512 struct obd_client_handle **och_p = NULL;
513 __u64 *och_usecount = NULL;
514 struct ll_file_data *fd;
515 int rc = 0, opendir_set = 0;
518 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
519 inode->i_generation, inode, file->f_flags);
/* An intent prepared by lookup/revalidate may be stashed in
 * file->private_data; take it and clear the field so ll_local_open's
 * LASSERT on LUSTRE_FPRIVATE holds. */
521 it = file->private_data; /* XXX: compat macro */
522 file->private_data = NULL; /* prevent ll_local_open assertion */
524 fd = ll_file_data_get();
526 GOTO(out_och_free, rc = -ENOMEM);
/* For directories, try to claim statahead ownership for this fd. */
529 if (S_ISDIR(inode->i_mode)) {
530 spin_lock(&lli->lli_sa_lock);
531 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
532 lli->lli_opendir_pid == 0) {
533 lli->lli_opendir_key = fd;
534 lli->lli_opendir_pid = cfs_curproc_pid();
537 spin_unlock(&lli->lli_sa_lock);
/* Root dentry opens need no MDS handle; just attach the fd. */
540 if (inode->i_sb->s_root == file->f_dentry) {
541 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent from lookup — build our own open intent. */
545 if (!it || !it->d.lustre.it_disposition) {
546 /* Convert f_flags into access mode. We cannot use file->f_mode,
547 * because everything but O_ACCMODE mask was stripped from
/* O_ACCMODE trick: O_RDONLY/O_WRONLY/O_RDWR + 1 gives
 * FMODE_READ/FMODE_WRITE bits. */
549 if ((oit.it_flags + 1) & O_ACCMODE)
551 if (file->f_flags & O_TRUNC)
552 oit.it_flags |= FMODE_WRITE;
554 /* kernel only call f_op->open in dentry_open. filp_open calls
555 * dentry_open after call to open_namei that checks permissions.
556 * Only nfsd_open call dentry_open directly without checking
557 * permissions and because of that this code below is safe. */
558 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
559 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
561 /* We do not want O_EXCL here, presumably we opened the file
562 * already? XXX - NFS implications? */
563 oit.it_flags &= ~O_EXCL;
565 /* bug20584, if "it_flags" contains O_CREAT, the file will be
566 * created if necessary, then "IT_CREAT" should be set to keep
567 * consistent with it */
568 if (oit.it_flags & O_CREAT)
569 oit.it_op |= IT_CREAT;
575 /* Let's see if we have file open on MDS already. */
576 if (it->it_flags & FMODE_WRITE) {
577 och_p = &lli->lli_mds_write_och;
578 och_usecount = &lli->lli_open_fd_write_count;
579 } else if (it->it_flags & FMODE_EXEC) {
580 och_p = &lli->lli_mds_exec_och;
581 och_usecount = &lli->lli_open_fd_exec_count;
583 och_p = &lli->lli_mds_read_och;
584 och_usecount = &lli->lli_open_fd_read_count;
587 mutex_lock(&lli->lli_och_mutex);
588 if (*och_p) { /* Open handle is present */
589 if (it_disposition(it, DISP_OPEN_OPEN)) {
590 /* Well, there's extra open request that we do not need,
591 let's close it somehow. This will decref request. */
592 rc = it_open_error(DISP_OPEN_OPEN, it);
594 mutex_unlock(&lli->lli_och_mutex);
595 GOTO(out_openerr, rc);
/* Release the redundant handle the MDS just granted. */
598 ll_release_openhandle(file->f_dentry, it);
/* Reuse the existing handle: local open only, no new och. */
602 rc = ll_local_open(file, it, fd, NULL);
605 mutex_unlock(&lli->lli_och_mutex);
606 GOTO(out_openerr, rc);
609 LASSERT(*och_usecount == 0);
610 if (!it->d.lustre.it_disposition) {
611 /* We cannot just request lock handle now, new ELC code
612 means that one of other OPEN locks for this file
613 could be cancelled, and since blocking ast handler
614 would attempt to grab och_mutex as well, that would
615 result in a deadlock */
616 mutex_unlock(&lli->lli_och_mutex);
617 it->it_create_mode |= M_CHECK_STALE;
618 rc = ll_intent_file_open(file, NULL, 0, it);
619 it->it_create_mode &= ~M_CHECK_STALE;
621 GOTO(out_openerr, rc);
625 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
627 GOTO(out_och_free, rc = -ENOMEM);
631 /* md_intent_lock() didn't get a request ref if there was an
632 * open error, so don't do cleanup on the request here
634 /* XXX (green): Should not we bail out on any error here, not
635 * just open error? */
636 rc = it_open_error(DISP_OPEN_OPEN, it);
638 GOTO(out_och_free, rc);
640 LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
642 rc = ll_local_open(file, it, fd, *och_p);
644 GOTO(out_och_free, rc);
646 mutex_unlock(&lli->lli_och_mutex);
649 /* Must do this outside lli_och_mutex lock to prevent deadlock where
650 different kind of OPEN lock for this same inode gets cancelled
651 by ldlm_cancel_lru */
652 if (!S_ISREG(inode->i_mode))
653 GOTO(out_och_free, rc);
/* Delayed-object-creation handling: no striping yet. */
657 if (!lli->lli_has_smd) {
658 if (file->f_flags & O_LOV_DELAY_CREATE ||
659 !(file->f_mode & FMODE_WRITE)) {
660 CDEBUG(D_INODE, "object creation was delayed\n");
661 GOTO(out_och_free, rc);
664 file->f_flags &= ~O_LOV_DELAY_CREATE;
665 GOTO(out_och_free, rc);
/* Error/cleanup path: free an allocated-but-unused och. */
669 if (och_p && *och_p) {
670 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
671 *och_p = NULL; /* OBD_FREE writes some magic there */
674 mutex_unlock(&lli->lli_och_mutex);
/* Undo statahead ownership claimed above if the open failed. */
677 if (opendir_set != 0)
678 ll_stop_statahead(inode, lli->lli_opendir_key);
680 ll_file_data_put(fd);
682 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* Drop the extra request reference held by the open intent. */
685 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
686 ptlrpc_req_finished(it->d.lustre.it_data);
687 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
693 /* Fills the obdo with the attributes for the lsm */
/*
 * Issue an async OST getattr for the objects described by @lsm and
 * wait for completion, filling @obdo.  @ioepoch is passed through to
 * the OSTs; @sync requests a server-side lock (OBD_FL_SRVLOCK) for the
 * getattr.  On success o_valid is masked down to the fields this
 * caller trusts from the OSTs.
 */
694 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
695 struct obd_capa *capa, struct obdo *obdo,
696 __u64 ioepoch, int sync)
698 struct ptlrpc_request_set *set;
699 struct obd_info oinfo = { { { 0 } } };
704 LASSERT(lsm != NULL);
708 oinfo.oi_oa->o_oi = lsm->lsm_oi;
709 oinfo.oi_oa->o_mode = S_IFREG;
710 oinfo.oi_oa->o_ioepoch = ioepoch;
711 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
712 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
713 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
714 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
715 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
716 OBD_MD_FLDATAVERSION;
717 oinfo.oi_capa = capa;
/* @sync path: ask the OST to take the lock server-side. */
719 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
720 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
723 set = ptlrpc_prep_set();
725 CERROR("can't allocate ptlrpc set\n");
728 rc = obd_getattr_async(exp, &oinfo, set);
730 rc = ptlrpc_set_wait(set);
731 ptlrpc_set_destroy(set);
/* Keep only the attributes that are authoritative from the OSTs. */
734 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
735 OBD_MD_FLATIME | OBD_MD_FLMTIME |
736 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
737 OBD_MD_FLDATAVERSION);
742 * Performs the getattr on the inode and updates its fields.
743 * If @sync != 0, perform the getattr under the server-side lock.
/*
 * Wrapper around ll_lsm_getattr() for an inode: fetches OST attributes
 * for the inode's stripe MD and refreshes the VFS inode fields from
 * the returned obdo.  Takes and releases the MDS capability and the
 * lsm reference.
 */
745 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
746 __u64 ioepoch, int sync)
748 struct obd_capa *capa = ll_mdscapa_get(inode);
749 struct lov_stripe_md *lsm;
753 lsm = ccc_inode_lsm_get(inode);
754 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
755 capa, obdo, ioepoch, sync);
/* On success: push the freshly fetched attributes into the inode. */
758 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
760 obdo_refresh_inode(inode, obdo, obdo->o_valid);
761 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
762 " blksize %lu\n", POSTID(oi), i_size_read(inode),
763 (unsigned long long)inode->i_blocks,
764 (unsigned long)ll_inode_blksize(inode));
766 ccc_inode_lsm_put(inode, lsm);
/*
 * Merge the inode's MDS-provided attributes (cached in lli_lvb) with
 * the cl_object attributes obtained from the OSTs, keeping the most
 * recent timestamps, and update i_size/i_blocks under the inode size
 * lock.
 */
770 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
772 struct ll_inode_info *lli = ll_i2info(inode);
773 struct cl_object *obj = lli->lli_clob;
774 struct cl_attr *attr = ccc_env_thread_attr(env);
780 ll_inode_size_lock(inode);
781 /* merge timestamps the most recently obtained from mds with
782 timestamps obtained from osts */
783 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
784 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
785 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
786 inode_init_lvb(inode, &lvb);
/* Fetch the aggregated OST attributes under the attr lock. */
788 cl_object_attr_lock(obj);
789 rc = cl_object_attr_get(env, obj, attr);
790 cl_object_attr_unlock(obj);
/* Keep whichever timestamp is newer, MDS or OST. */
793 if (lvb.lvb_atime < attr->cat_atime)
794 lvb.lvb_atime = attr->cat_atime;
795 if (lvb.lvb_ctime < attr->cat_ctime)
796 lvb.lvb_ctime = attr->cat_ctime;
797 if (lvb.lvb_mtime < attr->cat_mtime)
798 lvb.lvb_mtime = attr->cat_mtime;
800 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
801 PFID(&lli->lli_fid), attr->cat_size);
/* Size lock already held above, hence the _nolock variant. */
802 cl_isize_write_nolock(inode, attr->cat_size);
804 inode->i_blocks = attr->cat_blocks;
806 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
807 LTIME_S(inode->i_atime) = lvb.lvb_atime;
808 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
810 ll_inode_size_unlock(inode);
/*
 * Glimpse helper for ioctls: fetch OST attributes for @lsm and copy
 * size, blocks and timestamps into the caller's stat structure.
 */
815 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
818 struct obdo obdo = { 0 };
821 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
823 st->st_size = obdo.o_size;
824 st->st_blocks = obdo.o_blocks;
825 st->st_mtime = obdo.o_mtime;
826 st->st_atime = obdo.o_atime;
827 st->st_ctime = obdo.o_ctime;
/*
 * Initialize a cl_io from the file's open flags: propagate
 * non-blocking, append and sync-write semantics, point the io at the
 * inode's cl_object, and choose the DLM locking policy — no locks for
 * nolock mounts/group locks, mandatory locking for O_APPEND, otherwise
 * "maybe".
 */
832 void ll_io_init(struct cl_io *io, const struct file *file, int write)
834 struct inode *inode = file->f_dentry->d_inode;
836 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
/* Write-only settings; NOTE(review): the `if (write)` guard upstream
 * is not visible in this extract. */
838 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
839 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
840 file->f_flags & O_DIRECT ||
843 io->ci_obj = ll_i2info(inode)->lli_clob;
844 io->ci_lockreq = CILR_MAYBE;
845 if (ll_file_nolock(file)) {
846 io->ci_lockreq = CILR_NEVER;
847 io->ci_no_srvlock = 1;
848 } else if (file->f_flags & O_APPEND) {
849 io->ci_lockreq = CILR_MANDATORY;
/*
 * Common engine for all read/write entry points (normal iov, sendfile,
 * splice).  Sets up a cl_io for @iot (CIT_READ/CIT_WRITE) at *@ppos
 * for @count bytes, copies the subtype-specific arguments from @args,
 * serializes writes against truncate via lli_write_mutex /
 * lli_trunc_sem, runs the io loop, advances *@ppos, and accounts
 * read/write bytes in the per-sb stats.  A short transfer is returned
 * as-is rather than restarted once any bytes moved.
 */
854 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
855 struct file *file, enum cl_io_type iot,
856 loff_t *ppos, size_t count)
858 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
859 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
865 io = ccc_env_thread_io(env);
866 ll_io_init(io, file, iot == CIT_WRITE);
868 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
869 struct vvp_io *vio = vvp_env_io(env);
870 struct ccc_io *cio = ccc_env_io(env);
871 int write_mutex_locked = 0;
873 cio->cui_fd = LUSTRE_FPRIVATE(file);
874 vio->cui_io_subtype = args->via_io_subtype;
/* Dispatch on IO flavor: normal iov, sendfile or splice. */
876 switch (vio->cui_io_subtype) {
878 cio->cui_iov = args->u.normal.via_iov;
879 cio->cui_nrsegs = args->u.normal.via_nrsegs;
880 cio->cui_tot_nrsegs = cio->cui_nrsegs;
881 #ifndef HAVE_FILE_WRITEV
882 cio->cui_iocb = args->u.normal.via_iocb;
/* Writes (unless group-locked) are serialized against each other via
 * lli_write_mutex; reads only need the truncate read-semaphore. */
884 if ((iot == CIT_WRITE) &&
885 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
886 if (mutex_lock_interruptible(&lli->
888 GOTO(out, result = -ERESTARTSYS);
889 write_mutex_locked = 1;
890 } else if (iot == CIT_READ) {
891 down_read(&lli->lli_trunc_sem);
895 vio->u.sendfile.cui_actor = args->u.sendfile.via_actor;
896 vio->u.sendfile.cui_target = args->u.sendfile.via_target;
899 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
900 vio->u.splice.cui_flags = args->u.splice.via_flags;
903 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
906 result = cl_io_loop(env, io);
907 if (write_mutex_locked)
908 mutex_unlock(&lli->lli_write_mutex);
909 else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
910 up_read(&lli->lli_trunc_sem);
912 /* cl_io_rw_init() handled IO */
913 result = io->ci_result;
/* Bytes were transferred: advance the file position. */
916 if (io->ci_nob > 0) {
918 *ppos = io->u.ci_wr.wr.crw_pos;
923 /* If any bit been read/written (result != 0), we just return
924 * short read/write instead of restart io. */
925 if (result == 0 && io->ci_need_restart) {
926 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
927 iot == CIT_READ ? "read" : "write",
928 file->f_dentry->d_name.name, *ppos, count);
929 LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
/* Statistics and fd_write_failed bookkeeping. */
933 if (iot == CIT_READ) {
935 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
936 LPROC_LL_READ_BYTES, result);
937 } else if (iot == CIT_WRITE) {
939 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
940 LPROC_LL_WRITE_BYTES, result);
941 fd->fd_write_failed = false;
/* -ERESTARTSYS is not a real write failure, everything else is. */
942 } else if (result != -ERESTARTSYS) {
943 fd->fd_write_failed = true;
952 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array and compute the total byte count into
 * *@count, possibly shrinking *@nr_segs at the first inaccessible
 * segment (mirrors the kernel's generic iov validation).
 */
954 static int ll_file_get_iov_count(const struct iovec *iov,
955 unsigned long *nr_segs, size_t *count)
960 for (seg = 0; seg < *nr_segs; seg++) {
961 const struct iovec *iv = &iov[seg];
964 * If any segment has a negative length, or the cumulative
965 * length ever wraps negative then return -EINVAL.
968 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
/* Accessible segment: keep it and move on (upstream `continue`s
 * here; the line is not visible in this extract). */
970 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
975 cnt -= iv->iov_len; /* This segment is no good */
982 #ifdef HAVE_FILE_READV
/*
 * .readv handler (pre-aio kernels): validate the iovec, set up the
 * per-thread cl env and IO_NORMAL args, and run the generic read
 * path.
 */
983 static ssize_t ll_file_readv(struct file *file, const struct iovec *iov,
984 unsigned long nr_segs, loff_t *ppos)
987 struct vvp_io_args *args;
993 result = ll_file_get_iov_count(iov, &nr_segs, &count);
997 env = cl_env_get(&refcheck);
999 RETURN(PTR_ERR(env));
1001 args = vvp_env_args(env, IO_NORMAL);
1002 args->u.normal.via_iov = (struct iovec *)iov;
1003 args->u.normal.via_nrsegs = nr_segs;
1005 result = ll_file_io_generic(env, args, file, CIT_READ, ppos, count);
1006 cl_env_put(env, &refcheck);
/*
 * .read handler (readv-based variant): wrap the user buffer in the
 * per-thread single-element iovec and delegate to ll_file_readv().
 */
1010 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1014 struct iovec *local_iov;
1019 env = cl_env_get(&refcheck);
1021 RETURN(PTR_ERR(env));
1023 local_iov = &vvp_env_info(env)->vti_local_iov;
1024 local_iov->iov_base = (void __user *)buf;
1025 local_iov->iov_len = count;
1026 result = ll_file_readv(file, local_iov, 1, ppos);
1027 cl_env_put(env, &refcheck);
/*
 * .aio_read handler: validate the iovec, set up IO_NORMAL args
 * (including the kiocb) and run the generic read path against
 * iocb->ki_pos.
 */
1032 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1033 unsigned long nr_segs, loff_t pos)
1036 struct vvp_io_args *args;
1042 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1046 env = cl_env_get(&refcheck);
1048 RETURN(PTR_ERR(env));
1050 args = vvp_env_args(env, IO_NORMAL);
1051 args->u.normal.via_iov = (struct iovec *)iov;
1052 args->u.normal.via_nrsegs = nr_segs;
1053 args->u.normal.via_iocb = iocb;
1055 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1056 &iocb->ki_pos, count);
1057 cl_env_put(env, &refcheck);
/*
 * .read handler (aio-based variant): build a synchronous kiocb and a
 * one-element iovec from the user buffer and delegate to
 * ll_file_aio_read(), copying the updated position back to *ppos.
 */
1061 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1065 struct iovec *local_iov;
1066 struct kiocb *kiocb;
1071 env = cl_env_get(&refcheck);
1073 RETURN(PTR_ERR(env));
1075 local_iov = &vvp_env_info(env)->vti_local_iov;
1076 kiocb = &vvp_env_info(env)->vti_kiocb;
1077 local_iov->iov_base = (void __user *)buf;
1078 local_iov->iov_len = count;
1079 init_sync_kiocb(kiocb, file);
1080 kiocb->ki_pos = *ppos;
1081 kiocb->ki_left = count;
1083 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1084 *ppos = kiocb->ki_pos;
1086 cl_env_put(env, &refcheck);
1092 * Write to a file (through the page cache).
1094 #ifdef HAVE_FILE_WRITEV
/*
 * .writev handler (pre-aio kernels): mirror of ll_file_readv() for
 * CIT_WRITE.
 */
1095 static ssize_t ll_file_writev(struct file *file, const struct iovec *iov,
1096 unsigned long nr_segs, loff_t *ppos)
1099 struct vvp_io_args *args;
1105 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1109 env = cl_env_get(&refcheck);
1111 RETURN(PTR_ERR(env));
1113 args = vvp_env_args(env, IO_NORMAL);
1114 args->u.normal.via_iov = (struct iovec *)iov;
1115 args->u.normal.via_nrsegs = nr_segs;
1117 result = ll_file_io_generic(env, args, file, CIT_WRITE, ppos, count);
1118 cl_env_put(env, &refcheck);
/*
 * .write handler (writev-based variant): wrap the user buffer in a
 * one-element iovec and delegate to ll_file_writev().
 */
1122 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1126 struct iovec *local_iov;
1131 env = cl_env_get(&refcheck);
1133 RETURN(PTR_ERR(env));
1135 local_iov = &vvp_env_info(env)->vti_local_iov;
1136 local_iov->iov_base = (void __user *)buf;
1137 local_iov->iov_len = count;
1139 result = ll_file_writev(file, local_iov, 1, ppos);
1140 cl_env_put(env, &refcheck);
1144 #else /* AIO stuff */
/*
 * .aio_write handler: mirror of ll_file_aio_read() for CIT_WRITE,
 * driving the generic IO path against iocb->ki_pos.
 */
1145 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1146 unsigned long nr_segs, loff_t pos)
1149 struct vvp_io_args *args;
1155 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1159 env = cl_env_get(&refcheck);
1161 RETURN(PTR_ERR(env));
1163 args = vvp_env_args(env, IO_NORMAL);
1164 args->u.normal.via_iov = (struct iovec *)iov;
1165 args->u.normal.via_nrsegs = nr_segs;
1166 args->u.normal.via_iocb = iocb;
1168 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1169 &iocb->ki_pos, count);
1170 cl_env_put(env, &refcheck);
/*
 * .write handler (aio-based variant): build a synchronous kiocb and a
 * one-element iovec from the user buffer and delegate to
 * ll_file_aio_write(), copying the updated position back to *ppos.
 */
1174 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1178 struct iovec *local_iov;
1179 struct kiocb *kiocb;
1184 env = cl_env_get(&refcheck);
1186 RETURN(PTR_ERR(env));
1188 local_iov = &vvp_env_info(env)->vti_local_iov;
1189 kiocb = &vvp_env_info(env)->vti_kiocb;
1190 local_iov->iov_base = (void __user *)buf;
1191 local_iov->iov_len = count;
1192 init_sync_kiocb(kiocb, file);
1193 kiocb->ki_pos = *ppos;
1194 kiocb->ki_left = count;
1196 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1197 *ppos = kiocb->ki_pos;
1199 cl_env_put(env, &refcheck);
1205 #ifdef HAVE_KERNEL_SENDFILE
1207 * Send file content (through pagecache) somewhere with helper
/*
 * .sendfile handler: run the generic read path with IO_SENDFILE args
 * (read_actor + target) instead of an iovec.
 */
1209 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1210 read_actor_t actor, void *target)
1213 struct vvp_io_args *args;
1218 env = cl_env_get(&refcheck);
1220 RETURN(PTR_ERR(env));
1222 args = vvp_env_args(env, IO_SENDFILE);
1223 args->u.sendfile.via_target = target;
1224 args->u.sendfile.via_actor = actor;
1226 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1227 cl_env_put(env, &refcheck);
1232 #ifdef HAVE_KERNEL_SPLICE_READ
1234 * Send file content (through pagecache) somewhere with helper
/*
 * .splice_read handler: run the generic read path with IO_SPLICE args
 * (pipe + flags) instead of an iovec.
 */
1236 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1237 struct pipe_inode_info *pipe, size_t count,
1241 struct vvp_io_args *args;
1246 env = cl_env_get(&refcheck);
1248 RETURN(PTR_ERR(env));
1250 args = vvp_env_args(env, IO_SPLICE);
1251 args->u.splice.via_pipe = pipe;
1252 args->u.splice.via_flags = flags;
1254 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1255 cl_env_put(env, &refcheck);
/*
 * Recreate a lost OST object for @inode on OST index @ost_idx with
 * object id @id: clone the inode's stripe MD, mark the obdo with
 * OBD_FL_RECREATE_OBJS, and call obd_create() under the inode size
 * lock.  Used by the LL_IOC_RECREATE_* ioctls.
 */
1260 static int ll_lov_recreate(struct inode *inode, obd_id id, obd_seq seq,
1263 struct obd_export *exp = ll_i2dtexp(inode);
1264 struct obd_trans_info oti = { 0 };
1265 struct obdo *oa = NULL;
1268 struct lov_stripe_md *lsm = NULL, *lsm2;
1275 lsm = ccc_inode_lsm_get(inode);
1277 GOTO(out, rc = -ENOENT);
1279 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1280 (lsm->lsm_stripe_count));
1282 OBD_ALLOC_LARGE(lsm2, lsm_size);
1284 GOTO(out, rc = -ENOMEM);
/* o_nlink is reused here to carry the target OST index to the LOV. */
1288 oa->o_nlink = ost_idx;
1289 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1290 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1291 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1292 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1293 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
/* Work on a copy of the stripe MD so the live lsm is untouched. */
1294 memcpy(lsm2, lsm, lsm_size);
1295 ll_inode_size_lock(inode);
1296 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1297 ll_inode_size_unlock(inode);
1299 OBD_FREE_LARGE(lsm2, lsm_size);
1302 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_RECREATE_OBJ handler: copy a struct ll_recreate_obj from
 * userspace and recreate that object id on the given OST index.
 * Requires CAP_SYS_ADMIN.
 */
1307 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1309         struct ll_recreate_obj ucreat;
1312         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1315         if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
/* seq is passed as 0; only object id + OST index come from userspace */
1319         RETURN(ll_lov_recreate(inode, ucreat.lrc_id, 0,
1320                                ucreat.lrc_ost_idx));
/*
 * LL_IOC_RECREATE_FID handler: decode an object id and OST index packed
 * into a lu_fid from userspace, then recreate that object.
 * Requires CAP_SYS_ADMIN.
 */
1323 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1330         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1333         if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
/* low 16 bits of f_seq are the id's high word; next 16 bits = OST idx */
1336         id = fid_oid(&fid) | ((fid_seq(&fid) & 0xffff) << 32);
1337         ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1338         RETURN(ll_lov_recreate(inode, id, 0, ost_idx));
/*
 * Set the striping EA for a file by replaying an IT_OPEN intent that
 * carries the lov_user_md.  Fails if a layout already exists.
 * NOTE(review): this excerpt elides several lines (error paths,
 * labels between 'out_req_free' and the final RETURN).
 */
1341 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1342                              int flags, struct lov_user_md *lum, int lum_size)
1344         struct lov_stripe_md *lsm = NULL;
1345         struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1349         lsm = ccc_inode_lsm_get(inode);
/* a layout is already attached: setstripe is only valid on new files */
1351                 ccc_inode_lsm_put(inode, lsm);
1352                 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1357         ll_inode_size_lock(inode);
1358         rc = ll_intent_file_open(file, lum, lum_size, &oit);
1361         rc = oit.d.lustre.it_status;
1363                 GOTO(out_req_free, rc);
/* drop the MDS open handle we just acquired via the intent */
1365         ll_release_openhandle(file->f_dentry, &oit);
1368         ll_inode_size_unlock(inode);
1369         ll_intent_release(&oit);
1370         ccc_inode_lsm_put(inode, lsm);
1373         ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA (striping) for 'filename' under 'inode' from the MDS.
 * On success *lmmp points into *request's reply buffer (caller must keep
 * the request until done with the EA) and *lmm_size is its length.
 * The EA is byte-swapped to host endianness on big-endian hosts.
 */
1377 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1378                              struct lov_mds_md **lmmp, int *lmm_size,
1379                              struct ptlrpc_request **request)
1381         struct ll_sb_info *sbi = ll_i2sbi(inode);
1382         struct mdt_body *body;
1383         struct lov_mds_md *lmm = NULL;
1384         struct ptlrpc_request *req = NULL;
1385         struct md_op_data *op_data;
1388         rc = ll_get_max_mdsize(sbi, &lmmsize);
1392         op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1393                                      strlen(filename), lmmsize,
1394                                      LUSTRE_OPC_ANY, NULL);
1395         if (IS_ERR(op_data))
1396                 RETURN(PTR_ERR(op_data));
1398         op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1399         rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1400         ll_finish_md_op_data(op_data);
1402                 CDEBUG(D_INFO, "md_getattr_name failed "
1403                        "on %s: rc %d\n", filename, rc);
1407         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1408         LASSERT(body != NULL); /* checked by mdc_getattr_name */
1410         lmmsize = body->eadatasize;
/* no EA bit set (or, per elided condition, zero size) -> no striping */
1412         if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1414                 GOTO(out, rc = -ENODATA);
1417         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1418         LASSERT(lmm != NULL);
1420         if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1421             (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1422                 GOTO(out, rc = -EPROTO);
1426          * This is coming from the MDS, so is probably in
1427          * little endian. We convert it to host endian before
1428          * passing it to userspace.
/* true only on big-endian hosts: wire format is little endian */
1430         if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1431                 /* if function called for directory - we should
1432                  * avoid swab not existent lsm objects */
1433                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1434                         lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1435                         if (S_ISREG(body->mode))
1436                                 lustre_swab_lov_user_md_objects(
1437                                  ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1438                                  ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
1439                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1440                         lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1441                         if (S_ISREG(body->mode))
1442                                 lustre_swab_lov_user_md_objects(
1443                                  ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1444                                  ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
1450         *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: copy a lov_user_md (+ one ost_data entry)
 * from userspace and set it via ll_lov_setstripe_ea_info() with
 * MDS_OPEN_HAS_OBJS, i.e. objects are supplied, not allocated.
 * Requires CAP_SYS_ADMIN.
 */
1455 static int ll_lov_setea(struct inode *inode, struct file *file,
1458         int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1459         struct lov_user_md *lump;
1460         int lum_size = sizeof(struct lov_user_md) +
1461                        sizeof(struct lov_user_ost_data);
1465         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1468         OBD_ALLOC_LARGE(lump, lum_size);
1472         if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1473                 OBD_FREE_LARGE(lump, lum_size);
1477         rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1479         OBD_FREE_LARGE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy a v1 (or, if magic says so, v3)
 * lov_user_md from userspace, apply it, then refresh the layout and
 * echo the resulting striping back through LL_IOC_LOV_GETSTRIPE.
 * NOTE(review): some error-path lines are elided in this excerpt.
 */
1483 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1486         struct lov_user_md_v3 lumv3;
1487         struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1488         struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1489         struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1491         int flags = FMODE_WRITE;
1494         /* first try with v1 which is smaller than v3 */
1495         lum_size = sizeof(struct lov_user_md_v1);
1496         if (copy_from_user(lumv1, lumv1p, lum_size))
/* magic says v3: re-copy the full, larger structure */
1499         if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1500                 lum_size = sizeof(struct lov_user_md_v3);
1501                 if (copy_from_user(&lumv3, lumv3p, lum_size))
1505         rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1507                 struct lov_stripe_md *lsm;
/* return value of put_user() is intentionally ignored here */
1510                 put_user(0, &lumv1p->lmm_stripe_count);
1512                 ll_layout_refresh(inode, &gen);
1513                 lsm = ccc_inode_lsm_get(inode);
1514                 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1515                                    0, lsm, (void *)arg);
1516                 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_LOV_GETSTRIPE handler: copy the inode's striping information
 * out to userspace via the LOV obd_iocontrol.
 */
1521 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1523         struct lov_stripe_md *lsm;
1527         lsm = ccc_inode_lsm_get(inode);
1529                 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1531         ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_GROUP_LOCK: take a group (gid == arg) extent lock on the file.
 * Only one group lock per file descriptor; lli_lock guards the check
 * and the publication of fd_grouplock, and is dropped around the
 * (blocking) cl_get_grouplock() call, hence the re-check afterwards.
 */
1535 int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1537         struct ll_inode_info *lli = ll_i2info(inode);
1538         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1539         struct ccc_grouplock grouplock;
1543         if (ll_file_nolock(file))
1544                 RETURN(-EOPNOTSUPP);
1546         spin_lock(&lli->lli_lock);
1547         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1548                 CWARN("group lock already existed with gid %lu\n",
1549                       fd->fd_grouplock.cg_gid);
1550                 spin_unlock(&lli->lli_lock);
1553         LASSERT(fd->fd_grouplock.cg_lock == NULL);
1554         spin_unlock(&lli->lli_lock);
1556         rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1557                               arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* lli_lock was dropped across the enqueue: another thread may have won */
1561         spin_lock(&lli->lli_lock);
1562         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1563                 spin_unlock(&lli->lli_lock);
1564                 CERROR("another thread just won the race\n");
1565                 cl_put_grouplock(&grouplock);
1569         fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1570         fd->fd_grouplock = grouplock;
1571         spin_unlock(&lli->lli_lock);
1573         CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK: release the group lock with gid == arg held on
 * this file descriptor.  Detaches fd_grouplock under lli_lock, then
 * drops the CLIO grouplock outside the spinlock.
 */
1577 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1579         struct ll_inode_info *lli = ll_i2info(inode);
1580         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1581         struct ccc_grouplock grouplock;
1584         spin_lock(&lli->lli_lock);
1585         if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1586                 spin_unlock(&lli->lli_lock);
1587                 CWARN("no group lock held\n");
1590         LASSERT(fd->fd_grouplock.cg_lock != NULL);
/* caller must pass the same gid it locked with */
1592         if (fd->fd_grouplock.cg_gid != arg) {
1593                 CWARN("group lock %lu doesn't match current id %lu\n",
1594                       arg, fd->fd_grouplock.cg_gid);
1595                 spin_unlock(&lli->lli_lock);
1599         grouplock = fd->fd_grouplock;
1600         memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1601         fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1602         spin_unlock(&lli->lli_lock);
1604         cl_put_grouplock(&grouplock);
1605         CDEBUG(D_INFO, "group lock %lu released\n", arg);
1610  * Close inode open handle
1612  * \param dentry [in] dentry which contains the inode
1613  * \param it [in,out] intent which contains open info and result
/* \retval 0 on success (per convention; success line elided here) */
1616  * \retval <0 failure
1618 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1620         struct inode *inode = dentry->d_inode;
1621         struct obd_client_handle *och;
1627         /* Root ? Do nothing. */
1628         if (dentry->d_inode->i_sb->s_root == dentry)
1631         /* No open handle to close? Move away */
1632         if (!it_disposition(it, DISP_OPEN_OPEN))
1635         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1637         OBD_ALLOC(och, sizeof(*och));
1639                 GOTO(out, rc = -ENOMEM);
1641         ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
1642                     ll_i2info(inode), it, och);
1644         rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1647         /* this one is in place of ll_file_open */
1648         if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1649                 ptlrpc_req_finished(it->d.lustre.it_data);
1650                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1656  * Get size for inode for which FIEMAP mapping is requested.
1657  * Make the FIEMAP get_info call and returns the result.
1659 int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1662         struct obd_export *exp = ll_i2dtexp(inode);
1663         struct lov_stripe_md *lsm = NULL;
1664         struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1665         int vallen = num_bytes;
1669         /* Checks for fiemap flags */
1670         if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* report the unsupported bits back to the caller via fm_flags */
1671                 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1675         /* Check for FIEMAP_FLAG_SYNC */
1676         if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1677                 rc = filemap_fdatawrite(inode->i_mapping);
1682         lsm = ccc_inode_lsm_get(inode);
1686         /* If the stripe_count > 1 and the application does not understand
1687          * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1689         if (lsm->lsm_stripe_count > 1 &&
1690             !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1691                 GOTO(out, rc = -EOPNOTSUPP);
1693         fm_key.oa.o_oi = lsm->lsm_oi;
1694         fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1696         obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1697         obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1698         /* If filesize is 0, then there would be no objects for mapping */
1699         if (fm_key.oa.o_size == 0) {
1700                 fiemap->fm_mapped_extents = 0;
/* caller's fiemap (incl. requested range) is part of the get_info key */
1704         memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1706         rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1709                 CERROR("obd_get_info failed: rc = %d\n", rc);
1712         ccc_inode_lsm_put(inode, lsm);
/*
 * OBD_IOC_FID2PATH handler: resolve a FID to a path via the MDC.
 * Copies a getinfo_fid2path header in, allocates an output buffer of
 * gf_pathlen extra bytes, and copies the filled structure back out.
 * Permitted for CAP_DAC_READ_SEARCH or with LL_SBI_USER_FID2PATH set.
 */
1716 int ll_fid2path(struct inode *inode, void *arg)
1718         struct obd_export *exp = ll_i2mdexp(inode);
1719         struct getinfo_fid2path *gfout, *gfin;
1723         if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1724             !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1727         /* Need to get the buflen */
1728         OBD_ALLOC_PTR(gfin);
1731         if (copy_from_user(gfin, arg, sizeof(*gfin))) {
/* gf_pathlen is user-controlled; presumably bounded by the MDC side
 * — NOTE(review): no visible local cap in this excerpt, confirm */
1736         outsize = sizeof(*gfout) + gfin->gf_pathlen;
1737         OBD_ALLOC(gfout, outsize);
1738         if (gfout == NULL) {
1742         memcpy(gfout, gfin, sizeof(*gfout));
1745         /* Call mdc_iocontrol */
1746         rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1750         if (copy_to_user(arg, gfout, outsize))
1754         OBD_FREE(gfout, outsize);
/*
 * FSFILT_IOC_FIEMAP handler: size a kernel fiemap buffer from the
 * user's fm_extent_count, copy the request in, run ll_do_fiemap(),
 * and copy header plus mapped extents back to userspace.
 * NOTE(review): the extent_count overflow check, if any, is in lines
 * elided from this excerpt.
 */
1758 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1760         struct ll_user_fiemap *fiemap_s;
1761         size_t num_bytes, ret_bytes;
1762         unsigned int extent_count;
1765         /* Get the extent count so we can calculate the size of
1766          * required fiemap buffer */
1767         if (get_user(extent_count,
1768                      &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1770         num_bytes = sizeof(*fiemap_s) + (extent_count *
1771                     sizeof(struct ll_fiemap_extent));
1773         OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1774         if (fiemap_s == NULL)
1777         /* get the fiemap value */
1778         if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1780                 GOTO(error, rc = -EFAULT);
1782         /* If fm_extent_count is non-zero, read the first extent since
1783          * it is used to calculate end_offset and device from previous
1786         if (copy_from_user(&fiemap_s->fm_extents[0],
1787                            (char __user *)arg + sizeof(*fiemap_s),
1788                            sizeof(struct ll_fiemap_extent)))
1789                 GOTO(error, rc = -EFAULT);
1792         rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1796         ret_bytes = sizeof(struct ll_user_fiemap);
/* only copy back as many extents as were actually mapped */
1798         if (extent_count != 0)
1799                 ret_bytes += (fiemap_s->fm_mapped_extents *
1800                               sizeof(struct ll_fiemap_extent));
1802         if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1806         OBD_FREE_LARGE(fiemap_s, num_bytes);
1811  * Read the data_version for inode.
1813  * This value is computed using stripe object version on OST.
1814  * Version is computed using server side locking.
1816  * @param extent_lock  Take extent lock. Not needed if a process is already
1817  *                     holding the OST object group locks.
1819 int ll_data_version(struct inode *inode, __u64 *data_version,
1822         struct lov_stripe_md *lsm = NULL;
1823         struct ll_sb_info *sbi = ll_i2sbi(inode);
1824         struct obdo *obdo = NULL;
1828         /* If no stripe, we consider version is 0. */
1829         lsm = ccc_inode_lsm_get(inode);
1832                 CDEBUG(D_INODE, "No object for inode\n");
1836         OBD_ALLOC_PTR(obdo);
/* allocation-failure path: drop the lsm ref before returning */
1838                 ccc_inode_lsm_put(inode, lsm);
1842         rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
/* server did not return a data version -> treat as error (elided rc) */
1844                 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1847                         *data_version = obdo->o_data_version;
1851         ccc_inode_lsm_put(inode, lsm);
/*
 * Scratch state for ll_swap_layouts(): saved iattrs to restore
 * mtime/atime, both inodes, and the data-version check flags.
 * NOTE(review): dv1/dv2 fields are elided in this excerpt.
 */
1856 struct ll_swap_stack {
1857         struct iattr ia1, ia2;
1859         struct inode *inode1, *inode2;
1860         bool check_dv1, check_dv2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS backend: atomically swap the layouts of two
 * files on the MDT.  Orders the pair by FID to avoid deadlock, takes
 * optional group locks to flush dirty cache, optionally verifies the
 * data versions did not change, and can preserve mtime/atime.
 * NOTE(review): several lines (gid/dv declarations, some error checks,
 * the 'free' label) are elided from this excerpt.
 */
1863 static int ll_swap_layouts(struct file *file1, struct file *file2,
1864                            struct lustre_swap_layouts *lsl)
1866         struct mdc_swap_layouts msl;
1867         struct md_op_data *op_data;
1870         struct ll_swap_stack *llss = NULL;
1873         OBD_ALLOC_PTR(llss);
1877         llss->inode1 = file1->f_dentry->d_inode;
1878         llss->inode2 = file2->f_dentry->d_inode;
/* layouts only exist on regular files (inode1 check elided here) */
1880         if (!S_ISREG(llss->inode2->i_mode))
1881                 GOTO(free, rc = -EINVAL);
1883         if (ll_permission(llss->inode1, MAY_WRITE, NULL) ||
1884             ll_permission(llss->inode2, MAY_WRITE, NULL))
1885                 GOTO(free, rc = -EPERM);
1887         if (llss->inode2->i_sb != llss->inode1->i_sb)
1888                 GOTO(free, rc = -EXDEV);
1890         /* we use 2 bool because it is easier to swap than 2 bits */
1891         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1892                 llss->check_dv1 = true;
1894         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1895                 llss->check_dv2 = true;
1897         /* we cannot use lsl->sl_dvX directly because we may swap them */
1898         llss->dv1 = lsl->sl_dv1;
1899         llss->dv2 = lsl->sl_dv2;
1901         rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
1902         if (rc == 0) /* same file, done! */
/* canonical ordering by FID so concurrent swaps cannot deadlock */
1905         if (rc < 0) { /* sequentialize it */
1906                 swap(llss->inode1, llss->inode2);
1908                 swap(llss->dv1, llss->dv2);
1909                 swap(llss->check_dv1, llss->check_dv2);
1913         if (gid != 0) { /* application asks to flush dirty cache */
1914                 rc = ll_get_grouplock(llss->inode1, file1, gid);
1918                 rc = ll_get_grouplock(llss->inode2, file2, gid);
1920                         ll_put_grouplock(llss->inode1, file1, gid);
1925         /* to be able to restore mtime and atime after swap
1926          * we need to first save them */
1928             (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
1929                 llss->ia1.ia_mtime = llss->inode1->i_mtime;
1930                 llss->ia1.ia_atime = llss->inode1->i_atime;
1931                 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
1932                 llss->ia2.ia_mtime = llss->inode2->i_mtime;
1933                 llss->ia2.ia_atime = llss->inode2->i_atime;
1934                 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
1937         /* ultimate check, before swaping the layouts we check if
1938          * dataversion has changed (if requested) */
1939         if (llss->check_dv1) {
1940                 rc = ll_data_version(llss->inode1, &dv, 0);
1943                 if (dv != llss->dv1)
1944                         GOTO(putgl, rc = -EAGAIN);
1947         if (llss->check_dv2) {
1948                 rc = ll_data_version(llss->inode2, &dv, 0);
1951                 if (dv != llss->dv2)
1952                         GOTO(putgl, rc = -EAGAIN);
1955         /* struct md_op_data is used to send the swap args to the mdt
1956          * only flags is missing, so we use struct mdc_swap_layouts
1957          * through the md_op_data->op_data */
1958         /* flags from user space have to be converted before they are send to
1959          * server, no flag is sent today, they are only used on the client */
1962         op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
1963                                      0, LUSTRE_OPC_ANY, &msl);
1964         if (op_data != NULL) {
1965                 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS,
1966                                    ll_i2mdexp(llss->inode1),
1967                                    sizeof(*op_data), op_data, NULL);
1968                 ll_finish_md_op_data(op_data);
/* putgl: release group locks in reverse acquisition order */
1973                 ll_put_grouplock(llss->inode2, file2, gid);
1974                 ll_put_grouplock(llss->inode1, file1, gid);
1977         /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
1981         /* clear useless flags */
1982         if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
1983                 llss->ia1.ia_valid &= ~ATTR_MTIME;
1984                 llss->ia2.ia_valid &= ~ATTR_MTIME;
1987         if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
1988                 llss->ia1.ia_valid &= ~ATTR_ATIME;
1989                 llss->ia2.ia_valid &= ~ATTR_ATIME;
1992         /* update time if requested */
/* note the cross-application: inode1's saved times go to file2 and
 * vice versa, since the layouts (and data) were exchanged */
1994         if (llss->ia2.ia_valid != 0)
1995                 rc = ll_setattr(file1->f_dentry, &llss->ia2);
1997         if (llss->ia1.ia_valid != 0)
1998                 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
/*
 * Main ioctl dispatcher for regular files: handles llite-specific
 * commands (flags, striping, layout swap, fiemap, group locks, HSM,
 * data version, fid2path, ...) and falls through to the registered
 * ll_iocontrol handlers / the OSC for anything unrecognized.
 * NOTE(review): many lines (declarations, RETURNs, some error paths)
 * are elided from this excerpt.
 */
2007 long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2009         struct inode *inode = file->f_dentry->d_inode;
2010         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2014         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2015                inode->i_generation, inode, cmd);
2016         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2018         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2019         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2023         case LL_IOC_GETFLAGS:
2024                 /* Get the current value of the file flags */
2025                 return put_user(fd->fd_flags, (int *)arg);
2026         case LL_IOC_SETFLAGS:
2027         case LL_IOC_CLRFLAGS:
2028                 /* Set or clear specific file flags */
2029                 /* XXX This probably needs checks to ensure the flags are
2030                  * not abused, and to handle any flag side effects.
2032                 if (get_user(flags, (int *) arg))
2035                 if (cmd == LL_IOC_SETFLAGS) {
2036                         if ((flags & LL_FILE_IGNORE_LOCK) &&
2037                             !(file->f_flags & O_DIRECT)) {
2038                                 CERROR("%s: unable to disable locking on "
2039                                        "non-O_DIRECT file\n", current->comm);
2043                         fd->fd_flags |= flags;
2045                         fd->fd_flags &= ~flags;
2048         case LL_IOC_LOV_SETSTRIPE:
2049                 RETURN(ll_lov_setstripe(inode, file, arg));
2050         case LL_IOC_LOV_SETEA:
2051                 RETURN(ll_lov_setea(inode, file, arg));
2052         case LL_IOC_LOV_SWAP_LAYOUTS: {
2054                 struct lustre_swap_layouts lsl;
2056                 if (cfs_copy_from_user(&lsl, (char *)arg,
2057                                        sizeof(struct lustre_swap_layouts)))
/* both files must be writable for a swap */
2060                 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2063                 file2 = fget(lsl.sl_fd);
2068                 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2069                         rc = ll_swap_layouts(file, file2, &lsl);
2073         case LL_IOC_LOV_GETSTRIPE:
2074                 RETURN(ll_lov_getstripe(inode, arg));
2075         case LL_IOC_RECREATE_OBJ:
2076                 RETURN(ll_lov_recreate_obj(inode, arg));
2077         case LL_IOC_RECREATE_FID:
2078                 RETURN(ll_lov_recreate_fid(inode, arg));
2079         case FSFILT_IOC_FIEMAP:
2080                 RETURN(ll_ioctl_fiemap(inode, arg));
2081         case FSFILT_IOC_GETFLAGS:
2082         case FSFILT_IOC_SETFLAGS:
2083                 RETURN(ll_iocontrol(inode, file, cmd, arg));
2084         case FSFILT_IOC_GETVERSION_OLD:
2085         case FSFILT_IOC_GETVERSION:
2086                 RETURN(put_user(inode->i_generation, (int *)arg));
2087         case LL_IOC_GROUP_LOCK:
2088                 RETURN(ll_get_grouplock(inode, file, arg));
2089         case LL_IOC_GROUP_UNLOCK:
2090                 RETURN(ll_put_grouplock(inode, file, arg));
2091         case IOC_OBD_STATFS:
2092                 RETURN(ll_obd_statfs(inode, (void *)arg));
2094         /* We need to special case any other ioctls we want to handle,
2095          * to send them to the MDS/OST as appropriate and to properly
2096          * network encode the arg field.
2097         case FSFILT_IOC_SETVERSION_OLD:
2098         case FSFILT_IOC_SETVERSION:
2100         case LL_IOC_FLUSHCTX:
2101                 RETURN(ll_flush_ctx(inode));
2102         case LL_IOC_PATH2FID: {
2103                 if (copy_to_user((void *)arg, ll_inode2fid(inode),
2104                                  sizeof(struct lu_fid)))
2109         case OBD_IOC_FID2PATH:
2110                 RETURN(ll_fid2path(inode, (void *)arg));
2111         case LL_IOC_DATA_VERSION: {
2112                 struct ioc_data_version idv;
2115                 if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
/* LL_DV_NOFLUSH means the caller already holds OST group locks */
2118                 rc = ll_data_version(inode, &idv.idv_version,
2119                                      !(idv.idv_flags & LL_DV_NOFLUSH));
2121                 if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2127         case LL_IOC_GET_MDTIDX: {
2130                 mdtidx = ll_get_mdt_idx(inode);
2134                 if (put_user((int)mdtidx, (int*)arg))
2139         case OBD_IOC_GETDTNAME:
2140         case OBD_IOC_GETMDNAME:
2141                 RETURN(ll_get_obd_name(inode, cmd, arg));
2142         case LL_IOC_HSM_STATE_GET: {
2143                 struct md_op_data *op_data;
2144                 struct hsm_user_state *hus;
/* NOTE(review): hus allocation elided in this excerpt */
2151                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2152                                              LUSTRE_OPC_ANY, hus);
2153                 if (op_data == NULL) {
2158                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2161                 if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2164                 ll_finish_md_op_data(op_data);
2168         case LL_IOC_HSM_STATE_SET: {
2169                 struct md_op_data *op_data;
2170                 struct hsm_state_set *hss;
2176                 if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2181                 /* Non-root users are forbidden to set or clear flags which are
2182                  * NOT defined in HSM_USER_MASK. */
2183                 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK)
2184                     && !cfs_capable(CFS_CAP_SYS_ADMIN)) {
2189                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2190                                              LUSTRE_OPC_ANY, hss);
2191                 if (op_data == NULL) {
2196                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2199                 ll_finish_md_op_data(op_data);
2204         case LL_IOC_HSM_ACTION: {
2205                 struct md_op_data *op_data;
2206                 struct hsm_current_action *hca;
2213                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2214                                              LUSTRE_OPC_ANY, hca);
2215                 if (op_data == NULL) {
2220                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2223                 if (cfs_copy_to_user((char *)arg, hca, sizeof(*hca)))
2226                 ll_finish_md_op_data(op_data);
/* default: try registered ll_iocontrol handlers, else punt to OSC */
2234                     ll_iocontrol_call(inode, file, cmd, arg, &err))
2237                 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2243 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Validate 'offset' against [0, maxsize] (negative only allowed with
 * FMODE_UNSIGNED_OFFSET) and commit it to file->f_pos, resetting
 * f_version on a seek.  Backport of the upstream llseek helper for
 * kernels lacking generic_file_llseek_size().
 */
2244 static inline loff_t
2245 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2247         if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2249         if (offset > maxsize)
2252         if (offset != file->f_pos) {
2253                 file->f_pos = offset;
2254                 file->f_version = 0;
/*
 * Backported generic_file_llseek_size(): llseek supporting SEEK_SET/
 * CUR/END plus SEEK_HOLE/SEEK_DATA against a caller-supplied 'eof',
 * bounded by 'maxsize'.  NOTE(review): the switch(origin) scaffolding
 * is elided in this excerpt; only selected case bodies are visible.
 */
2260 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2261                          loff_t maxsize, loff_t eof)
2263         struct inode *inode = file->f_dentry->d_inode;
2271                  * Here we special-case the lseek(fd, 0, SEEK_CUR)
2272                  * position-querying operation. Avoid rewriting the "same"
2273                  * f_pos value back to the file because a concurrent read(),
2274                  * write() or lseek() might have altered it
2279                  * f_lock protects against read/modify/write race with other
2280                  * SEEK_CURs. Note that parallel writes and reads behave
/* SEEK_CUR: serialize f_pos update against concurrent seekers */
2283                 mutex_lock(&inode->i_mutex);
2284                 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2285                 mutex_unlock(&inode->i_mutex);
2289                  * In the generic case the entire file is data, so as long as
2290                  * offset isn't at the end of the file then the offset is data.
2297                  * There is a virtual hole at the end of the file, so as long as
2298                  * offset isn't i_size or larger, return i_size.
2306         return llseek_execute(file, offset, maxsize);
/*
 * llseek() entry point.  For SEEK_END/SEEK_HOLE/SEEK_DATA, glimpse the
 * file size from the OSTs first so 'eof' is current, then delegate to
 * the (possibly backported) generic_file_llseek_size().
 */
2310 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2312         struct inode *inode = file->f_dentry->d_inode;
2313         loff_t retval, eof = 0;
/* retval here is only the provisional target offset, for the trace */
2316         retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2317                            (origin == SEEK_CUR) ? file->f_pos : 0);
2318         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
2319                inode->i_ino, inode->i_generation, inode, retval, retval,
2321         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2323         if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2324                 retval = ll_glimpse_size(inode);
2327                 eof = i_size_read(inode);
2330         retval = ll_generic_file_llseek_size(file, offset, origin,
2331                                           ll_file_maxbytes(inode), eof);
/*
 * flush() file operation (called on close()): report any async write
 * errors recorded against this mapping, unless the application was
 * already told about the failure via this fd (fd_write_failed).
 */
2335 int ll_flush(struct file *file, fl_owner_t id)
2337         struct inode *inode = file->f_dentry->d_inode;
2338         struct ll_inode_info *lli = ll_i2info(inode);
2339         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2342         LASSERT(!S_ISDIR(inode->i_mode));
2344         /* catch async errors that were recorded back when async writeback
2345          * failed for pages in this mapping. */
2346         rc = lli->lli_async_rc;
/* consume the recorded error so it is reported only once */
2347         lli->lli_async_rc = 0;
2348         err = lov_read_and_clear_async_rc(lli->lli_clob);
2352         /* The application has been told write failure already.
2353          * Do not report failure again. */
2354         if (fd->fd_write_failed)
2356         return rc ? -EIO : 0;
2360  * Called to make sure a portion of file has been written out.
2361  * if @local_only is not true, it will send OST_SYNC RPCs to ost.
2363  * Return how many pages have been written.
2365 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2366                        enum cl_fsync_mode mode)
2368         struct cl_env_nest nest;
2371         struct obd_capa *capa = NULL;
2372         struct cl_fsync_io *fio;
/* reject any mode outside the known CL_FSYNC_* set */
2376         if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2377             mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2380         env = cl_env_nested_get(&nest);
2382                 RETURN(PTR_ERR(env));
2384         capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2386         io = ccc_env_thread_io(env);
2387         io->ci_obj = cl_i2info(inode)->lli_clob;
/* fsync must proceed even while a layout change is in flight */
2388         io->ci_ignore_layout = 1;
2390         /* initialize parameters for sync */
2391         fio = &io->u.ci_fsync;
2392         fio->fi_capa = capa;
2393         fio->fi_start = start;
2395         fio->fi_fid = ll_inode2fid(inode);
2396         fio->fi_mode = mode;
2397         fio->fi_nr_written = 0;
2399         if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2400                 result = cl_io_loop(env, io);
2402                 result = io->ci_result;
/* on success, return the number of pages written (see header above) */
2404                 result = fio->fi_nr_written;
2405         cl_io_fini(env, io);
2406         cl_env_nested_put(&nest, env);
2414  * When dentry is provided (the 'else' case), *file->f_dentry may be
2415  * null and dentry must be used directly rather than pulled from
2416  * *file->f_dentry as is done otherwise.
/*
 * fsync() file operation; three prototypes depending on kernel version
 * (4-arg with range, 2-arg, or 3-arg with explicit dentry).  Waits for
 * page writeback, reports stored async errors, syncs metadata via
 * md_sync(), then data via cl_sync_file_range(CL_FSYNC_ALL for S_ISREG
 * — NOTE(review): the mode argument line is elided here, confirm).
 */
2419 #ifdef HAVE_FILE_FSYNC_4ARGS
2420 int ll_fsync(struct file *file, loff_t start, loff_t end, int data)
2422         struct dentry *dentry = file->f_dentry;
2423 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2424 int ll_fsync(struct file *file, int data)
2426         struct dentry *dentry = file->f_dentry;
2428 int ll_fsync(struct file *file, struct dentry *dentry, int data)
2431         struct inode *inode = dentry->d_inode;
2432         struct ll_inode_info *lli = ll_i2info(inode);
2433         struct ptlrpc_request *req;
2434         struct obd_capa *oc;
2438         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2439                inode->i_generation, inode);
2440         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2442 #ifdef HAVE_FILE_FSYNC_4ARGS
2443         rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2444         mutex_lock(&inode->i_mutex);
2446         /* fsync's caller has already called _fdata{sync,write}, we want
2447          * that IO to finish before calling the osc and mdc sync methods */
2448         rc = filemap_fdatawait(inode->i_mapping);
2451         /* catch async errors that were recorded back when async writeback
2452          * failed for pages in this mapping. */
2453         if (!S_ISDIR(inode->i_mode)) {
2454                 err = lli->lli_async_rc;
2455                 lli->lli_async_rc = 0;
2458                 err = lov_read_and_clear_async_rc(lli->lli_clob);
2463         oc = ll_mdscapa_get(inode);
2464         err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2470                 ptlrpc_req_finished(req);
2473                 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2475                 err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
2477                 if (rc == 0 && err < 0)
/* remember a data-sync failure so ll_flush() won't re-report it */
2480                         fd->fd_write_failed = true;
2482                         fd->fd_write_failed = false;
2485 #ifdef HAVE_FILE_FSYNC_4ARGS
2486         mutex_unlock(&inode->i_mutex);
/*
 * flock()/fcntl() lock operation: translate the kernel file_lock into
 * an LDLM_FLOCK enqueue on the MDS, then mirror the result into the
 * local lock tables (flock_lock_file_wait / posix_lock_file_wait).
 * NOTE(review): several case labels and declarations are elided in
 * this excerpt; only selected case bodies are visible.
 */
2491 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2493         struct inode *inode = file->f_dentry->d_inode;
2494         struct ll_sb_info *sbi = ll_i2sbi(inode);
2495         struct ldlm_enqueue_info einfo = { .ei_type = LDLM_FLOCK,
2496                                            .ei_cb_cp =ldlm_flock_completion_ast,
2497                                            .ei_cbdata = file_lock };
2498         struct md_op_data *op_data;
2499         struct lustre_handle lockh = {0};
2500         ldlm_policy_data_t flock = {{0}};
2506         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2507                inode->i_ino, file_lock);
2509         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2511         if (file_lock->fl_flags & FL_FLOCK) {
2512                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2513                 /* flocks are whole-file locks */
2514                 flock.l_flock.end = OFFSET_MAX;
2515                 /* For flocks owner is determined by the local file desctiptor*/
2516                 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2517         } else if (file_lock->fl_flags & FL_POSIX) {
2518                 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2519                 flock.l_flock.start = file_lock->fl_start;
2520                 flock.l_flock.end = file_lock->fl_end;
2524         flock.l_flock.pid = file_lock->fl_pid;
2526         /* Somewhat ugly workaround for svc lockd.
2527          * lockd installs custom fl_lmops->lm_compare_owner that checks
2528          * for the fl_owner to be the same (which it always is on local node
2529          * I guess between lockd processes) and then compares pid.
2530          * As such we assign pid to the owner field to make it all work,
2531          * conflict with normal locks is unlikely since pid space and
2532          * pointer space for current->files are not intersecting */
2533         if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2534                 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* map the fcntl lock type to an LDLM mode: RDLCK->PR, WRLCK->PW,
 * UNLCK->NL (see the comment below) */
2536         switch (file_lock->fl_type) {
2538                 einfo.ei_mode = LCK_PR;
2541                 /* An unlock request may or may not have any relation to
2542                  * existing locks so we may not be able to pass a lock handle
2543                  * via a normal ldlm_lock_cancel() request. The request may even
2544                  * unlock a byte range in the middle of an existing lock. In
2545                  * order to process an unlock request we need all of the same
2546                  * information that is given with a normal read or write record
2547                  * lock request. To avoid creating another ldlm unlock (cancel)
2548                  * message we'll treat a LCK_NL flock request as an unlock. */
2549                 einfo.ei_mode = LCK_NL;
2552                 einfo.ei_mode = LCK_PW;
2555                 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2556                        file_lock->fl_type);
/* non-blocking set request: do not wait on conflicts */
2571                 flags = LDLM_FL_BLOCK_NOWAIT;
/* F_GETLK family: test only, grant nothing */
2577                 flags = LDLM_FL_TEST_LOCK;
2578                 /* Save the old mode so that if the mode in the lock changes we
2579                  * can decrement the appropriate reader or writer refcount. */
2580                 file_lock->fl_type = einfo.ei_mode;
2583                 CERROR("unknown fcntl lock command: %d\n", cmd);
2587         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2588                                      LUSTRE_OPC_ANY, NULL);
2589         if (IS_ERR(op_data))
2590                 RETURN(PTR_ERR(op_data));
2592         CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2593                "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2594                flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2596         rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2597                         op_data, &lockh, &flock, 0, NULL /* req */, flags);
/* mirror the server result into the kernel's local lock bookkeeping */
2599         if ((file_lock->fl_flags & FL_FLOCK) &&
2600             (rc == 0 || file_lock->fl_type == F_UNLCK))
2601                 rc2  = flock_lock_file_wait(file, file_lock);
2602         if ((file_lock->fl_flags & FL_POSIX) &&
2603             (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2604             !(flags & LDLM_FL_TEST_LOCK))
2605                 rc2  = posix_lock_file_wait(file, file_lock);
/* local bookkeeping failed: undo the server lock by enqueueing NL */
2607         if (rc2 && file_lock->fl_type != F_UNLCK) {
2608                 einfo.ei_mode = LCK_NL;
2609                 md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2610                         op_data, &lockh, &flock, 0, NULL /* req */, flags);
2614         ll_finish_md_op_data(op_data);
/* lock operation for mounts with file locking disabled (body elided
 * in this excerpt — presumably returns -ENOSYS, confirm) */
2619 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2627  * test if some locks matching bits and l_req_mode are acquired
2628  * - bits can be in different locks
2629  * - if found clear the common lock bits in *bits
2630  * - the bits not found, are kept in *bits
2632  * \param bits [IN] searched lock bits [IN]
2633  * \param l_req_mode [IN] searched lock mode
2634  * \retval boolean, true iff all bits are found
2636 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
2638         struct lustre_handle lockh;
2639         ldlm_policy_data_t policy;
/* LCK_MINMODE means "any mode": match against all four */
2640         ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2641                            (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2650         fid = &ll_i2info(inode)->lli_fid;
2651         CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2652                ldlm_lockname[mode]);
/* TEST_LOCK: probe only, do not take a reference on matched locks */
2654         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2655         for (i = 0; i < MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2656                 policy.l_inodebits.bits = *bits & (1 << i);
2657                 if (policy.l_inodebits.bits == 0)
2660                 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2661                                   &policy, mode, &lockh)) {
2662                         struct ldlm_lock *lock;
2664                         lock = ldlm_handle2lock(&lockh);
/* clear every bit covered by the matched lock, not just bit i */
2667                                         ~(lock->l_policy_data.l_inodebits.bits);
2668                                 LDLM_LOCK_PUT(lock);
2670                                 *bits &= ~policy.l_inodebits.bits;
/* Try to match (and take a reference on) a cached MDS inodebits lock
 * covering 'bits' in any of CR/CW/PR/PW modes. On success the handle is
 * returned via *lockh; the return value is presumably the matched mode,
 * 0 if none — TODO confirm (md_lock_match result lines not visible). */
2677 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2678 struct lustre_handle *lockh, __u64 flags)
2680 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2685 fid = &ll_i2info(inode)->lli_fid;
2686 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2688 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
2689 fid, LDLM_IBITS, &policy,
2690 LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
/* Post-process the rc of a revalidate RPC: -ENOENT on a non-regular,
 * non-directory inode is tolerated (object already unlinked); any other
 * non-zero rc is logged as a revalidation failure. */
2694 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2696 /* Already unlinked. Just update nlink and return success */
2697 if (rc == -ENOENT) {
2699 /* This path cannot be hit for regular files unless in
2700 * case of obscure races, so no need to validate
2702 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2704 } else if (rc != 0) {
2705 CERROR("%s: revalidate FID "DFID" error: rc = %d\n",
2706 ll_get_fsname(inode->i_sb, NULL, 0),
2707 PFID(ll_inode2fid(inode)), rc);
/* Revalidate a dentry's MDS attributes. Two strategies, chosen by the
 * export's connect flags:
 *  - OBD_CONNECT_ATTRFID: issue an intent (GETATTR, or LOOKUP when only
 *    MDS_INODELOCK_LOOKUP is requested) by FID, with no name;
 *  - otherwise: only if no cached ibits lock covers 'ibits', do a plain
 *    md_getattr and refresh the inode from the reply.
 * NOTE(review): several lines (ENTRY/RETURN, error gotos) are missing
 * from this excerpt. */
2713 int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2716 struct inode *inode = dentry->d_inode;
2717 struct ptlrpc_request *req = NULL;
2718 struct obd_export *exp;
2722 LASSERT(inode != NULL);
2724 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2725 inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2727 exp = ll_i2mdexp(inode);
2729 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
2730 * But under CMD case, it caused some lock issues, should be fixed
2731 * with new CMD ibits lock. See bug 12718 */
2732 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
2733 struct lookup_intent oit = { .it_op = IT_GETATTR };
2734 struct md_op_data *op_data;
/* Only the LOOKUP bit requested: a cheaper IT_LOOKUP intent suffices. */
2736 if (ibits == MDS_INODELOCK_LOOKUP)
2737 oit.it_op = IT_LOOKUP;
2739 /* Call getattr by fid, so do not provide name at all. */
2740 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2741 dentry->d_inode, NULL, 0, 0,
2742 LUSTRE_OPC_ANY, NULL);
2743 if (IS_ERR(op_data))
2744 RETURN(PTR_ERR(op_data));
/* M_CHECK_STALE asks the intent path to verify the cached inode. */
2746 oit.it_create_mode |= M_CHECK_STALE;
2747 rc = md_intent_lock(exp, op_data, NULL, 0,
2748 /* we are not interested in name
2751 ll_md_blocking_ast, 0);
2752 ll_finish_md_op_data(op_data);
2753 oit.it_create_mode &= ~M_CHECK_STALE;
2755 rc = ll_inode_revalidate_fini(inode, rc);
2759 rc = ll_revalidate_it_finish(req, &oit, dentry);
2761 ll_intent_release(&oit);
2765 /* Unlinked? Unhash dentry, so it is not picked up later by
2766 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2767 here to preserve get_cwd functionality on 2.6.
2769 if (!dentry->d_inode->i_nlink)
2770 d_lustre_invalidate(dentry);
2772 ll_lookup_finish_locks(&oit, dentry);
/* ll_have_md_lock() clears already-covered bits; if anything remains
 * we must fetch fresh attributes from the MDS. */
2773 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
2774 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2775 obd_valid valid = OBD_MD_FLGETATTR;
2776 struct md_op_data *op_data;
/* Regular files: also request striping EA, sized to the max EA size. */
2779 if (S_ISREG(inode->i_mode)) {
2780 rc = ll_get_max_mdsize(sbi, &ealen);
2783 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2786 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
2787 0, ealen, LUSTRE_OPC_ANY,
2789 if (IS_ERR(op_data))
2790 RETURN(PTR_ERR(op_data));
2792 op_data->op_valid = valid;
2793 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2794 * capa for this inode. Because we only keep capas of dirs
2796 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
2797 ll_finish_md_op_data(op_data);
2799 rc = ll_inode_revalidate_fini(inode, rc);
/* Refresh the in-core inode from the getattr reply. */
2803 rc = ll_prep_inode(&inode, req, NULL, NULL);
2806 ptlrpc_req_finished(req);
/* Revalidate metadata, then update size/timestamps: non-regular objects
 * take attrs straight from the cached lvb; regular files glimpse the OSTs
 * for an up-to-date size. */
2810 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2813 struct inode *inode = dentry->d_inode;
2817 rc = __ll_inode_revalidate_it(dentry, it, ibits);
2821 /* if object isn't regular file, don't validate size */
2822 if (!S_ISREG(inode->i_mode)) {
2823 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
2824 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
2825 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
2827 rc = ll_glimpse_size(inode);
/* getattr worker: revalidate UPDATE|LOOKUP ibits, bump the GETATTR stats
 * counter, then fill *stat from the (now fresh) in-core inode. In 32-bit
 * API mode the ino is rebuilt from the FID instead of i_ino. */
2832 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2833 struct lookup_intent *it, struct kstat *stat)
2835 struct inode *inode = de->d_inode;
2836 struct ll_sb_info *sbi = ll_i2sbi(inode);
2837 struct ll_inode_info *lli = ll_i2info(inode);
2840 res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE |
2841 MDS_INODELOCK_LOOKUP);
2842 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
2847 stat->dev = inode->i_sb->s_dev;
2848 if (ll_need_32bit_api(sbi))
2849 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
2851 stat->ino = inode->i_ino;
2852 stat->mode = inode->i_mode;
2853 stat->nlink = inode->i_nlink;
2854 stat->uid = inode->i_uid;
2855 stat->gid = inode->i_gid;
2856 stat->rdev = inode->i_rdev;
2857 stat->atime = inode->i_atime;
2858 stat->mtime = inode->i_mtime;
2859 stat->ctime = inode->i_ctime;
2860 stat->blksize = 1 << inode->i_blkbits;
2862 stat->size = i_size_read(inode);
2863 stat->blocks = inode->i_blocks;
/* VFS ->getattr entry point: thin wrapper creating an IT_GETATTR intent
 * and delegating to ll_getattr_it(). */
2867 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2869 struct lookup_intent it = { .it_op = IT_GETATTR };
2871 return ll_getattr_it(mnt, de, &it, stat);
2874 #ifdef HAVE_LINUX_FIEMAP_H
/* VFS ->fiemap handler: marshal the kernel fiemap_extent_info into a
 * Lustre ll_user_fiemap (sized for fi_extents_max extents), run
 * ll_do_fiemap(), then copy flags/extents back to the caller.
 * NOTE(review): the inbound memcpy covers only ONE ll_fiemap_extent
 * (fm_extents[0]); presumably only the first extent seeds the request —
 * TODO confirm against ll_do_fiemap's contract. */
2875 int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2876 __u64 start, __u64 len)
2880 struct ll_user_fiemap *fiemap;
2881 unsigned int extent_count = fieinfo->fi_extents_max;
2883 num_bytes = sizeof(*fiemap) + (extent_count *
2884 sizeof(struct ll_fiemap_extent));
2885 OBD_ALLOC_LARGE(fiemap, num_bytes);
2890 fiemap->fm_flags = fieinfo->fi_flags;
2891 fiemap->fm_extent_count = fieinfo->fi_extents_max;
2892 fiemap->fm_start = start;
2893 fiemap->fm_length = len;
2894 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
2895 sizeof(struct ll_fiemap_extent));
2897 rc = ll_do_fiemap(inode, fiemap, num_bytes);
/* Copy results (flags, mapped count, extent array) back out. */
2899 fieinfo->fi_flags = fiemap->fm_flags;
2900 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
2901 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
2902 fiemap->fm_mapped_extents * sizeof(struct ll_fiemap_extent));
2904 OBD_FREE_LARGE(fiemap, num_bytes);
/* Return a referenced copy of the cached POSIX ACL under lli_lock.
 * The caller (VFS permission code) releases the reference. */
2909 struct posix_acl * ll_get_acl(struct inode *inode, int type)
2911 struct ll_inode_info *lli = ll_i2info(inode);
2912 struct posix_acl *acl = NULL;
2915 spin_lock(&lli->lli_lock);
2916 /* VFS' acl_permission_check->check_acl will release the refcount */
2917 acl = posix_acl_dup(lli->lli_posix_acl);
2918 spin_unlock(&lli->lli_lock);
2923 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
/* ACL check callback for older kernels lacking 2-arg generic_permission().
 * Signature varies with the kernel's generic_permission() arity; with
 * CONFIG_FS_POSIX_ACL it evaluates the cached ACL, otherwise presumably
 * returns a pass-through rc (fallback lines not visible here). */
2925 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
2926 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
2928 ll_check_acl(struct inode *inode, int mask)
2931 # ifdef CONFIG_FS_POSIX_ACL
2932 struct posix_acl *acl;
2936 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
/* RCU walk cannot block on ACL retrieval; bail out (handled by caller). */
2937 if (flags & IPERM_FLAG_RCU)
2940 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
2945 rc = posix_acl_permission(inode, acl, mask);
2946 posix_acl_release(acl);
2949 # else /* !CONFIG_FS_POSIX_ACL */
2951 # endif /* CONFIG_FS_POSIX_ACL */
2953 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
2955 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
/* VFS ->permission entry (signature per kernel version). Revalidates the
 * root inode (not validated at lookup), delegates remote-client mounts to
 * lustre_check_remote_perm(), else runs generic permission + ACL check. */
2956 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
2958 # ifdef HAVE_INODE_PERMISION_2ARGS
2959 int ll_inode_permission(struct inode *inode, int mask)
2961 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
/* RCU-walk mode must not block; bail out early (return value line not
 * visible — presumably -ECHILD, TODO confirm). */
2968 #ifdef MAY_NOT_BLOCK
2969 if (mask & MAY_NOT_BLOCK)
2971 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
2972 if (flags & IPERM_FLAG_RCU)
2976 /* as root inode are NOT getting validated in lookup operation,
2977 * need to do it before permission check. */
2979 if (inode == inode->i_sb->s_root->d_inode) {
2980 struct lookup_intent it = { .it_op = IT_LOOKUP };
2982 rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
2983 MDS_INODELOCK_LOOKUP);
2988 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
2989 inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
2991 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2992 return lustre_check_remote_perm(inode, mask);
2994 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2995 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
/* Map the vectored read/write file_operations slot names to the kernel's
 * API: readv/writev on old kernels, aio_read/aio_write otherwise. Used by
 * the three file_operations tables below. */
3000 #ifdef HAVE_FILE_READV
3001 #define READ_METHOD readv
3002 #define READ_FUNCTION ll_file_readv
3003 #define WRITE_METHOD writev
3004 #define WRITE_FUNCTION ll_file_writev
3006 #define READ_METHOD aio_read
3007 #define READ_FUNCTION ll_file_aio_read
3008 #define WRITE_METHOD aio_write
3009 #define WRITE_FUNCTION ll_file_aio_write
3012 /* -o localflock - only provides locally consistent flock locks */
/* Default file_operations table: no .flock/.lock, so file locking is
 * purely local (not cluster-coherent). */
3013 struct file_operations ll_file_operations = {
3014 .read = ll_file_read,
3015 .READ_METHOD = READ_FUNCTION,
3016 .write = ll_file_write,
3017 .WRITE_METHOD = WRITE_FUNCTION,
3018 .unlocked_ioctl = ll_file_ioctl,
3019 .open = ll_file_open,
3020 .release = ll_file_release,
3021 .mmap = ll_file_mmap,
3022 .llseek = ll_file_seek,
3023 #ifdef HAVE_KERNEL_SENDFILE
3024 .sendfile = ll_file_sendfile,
3026 #ifdef HAVE_KERNEL_SPLICE_READ
3027 .splice_read = ll_file_splice_read,
/* file_operations for -o flock mounts: identical to the default table but
 * wires .flock and .lock to ll_file_flock for cluster-coherent locking. */
3033 struct file_operations ll_file_operations_flock = {
3034 .read = ll_file_read,
3035 .READ_METHOD = READ_FUNCTION,
3036 .write = ll_file_write,
3037 .WRITE_METHOD = WRITE_FUNCTION,
3038 .unlocked_ioctl = ll_file_ioctl,
3039 .open = ll_file_open,
3040 .release = ll_file_release,
3041 .mmap = ll_file_mmap,
3042 .llseek = ll_file_seek,
3043 #ifdef HAVE_KERNEL_SENDFILE
3044 .sendfile = ll_file_sendfile,
3046 #ifdef HAVE_KERNEL_SPLICE_READ
3047 .splice_read = ll_file_splice_read,
3051 .flock = ll_file_flock,
3052 .lock = ll_file_flock
3055 /* These are for -o noflock - to return ENOSYS on flock calls */
3056 struct file_operations ll_file_operations_noflock = {
3057 .read = ll_file_read,
3058 .READ_METHOD = READ_FUNCTION,
3059 .write = ll_file_write,
3060 .WRITE_METHOD = WRITE_FUNCTION,
3061 .unlocked_ioctl = ll_file_ioctl,
3062 .open = ll_file_open,
3063 .release = ll_file_release,
3064 .mmap = ll_file_mmap,
3065 .llseek = ll_file_seek,
3066 #ifdef HAVE_KERNEL_SENDFILE
3067 .sendfile = ll_file_sendfile,
3069 #ifdef HAVE_KERNEL_SPLICE_READ
3070 .splice_read = ll_file_splice_read,
/* Both lock entry points route to the ENOSYS stub. */
3074 .flock = ll_file_noflock,
3075 .lock = ll_file_noflock
/* inode_operations for regular files: attribute, xattr, permission,
 * fiemap and (when supported) ACL entry points. */
3078 struct inode_operations ll_file_inode_operations = {
3079 .setattr = ll_setattr,
3080 .getattr = ll_getattr,
3081 .permission = ll_inode_permission,
3082 .setxattr = ll_setxattr,
3083 .getxattr = ll_getxattr,
3084 .listxattr = ll_listxattr,
3085 .removexattr = ll_removexattr,
3086 #ifdef HAVE_LINUX_FIEMAP_H
3087 .fiemap = ll_fiemap,
3089 #ifdef HAVE_IOP_GET_ACL
3090 .get_acl = ll_get_acl,
3094 /* dynamic ioctl number support routins */
/* Global registry of dynamically registered ioctl handlers: an rwsem
 * protecting a list of llioc_data entries (one per registration). */
3095 static struct llioc_ctl_data {
3096 struct rw_semaphore ioc_sem;
3097 cfs_list_t ioc_head;
3099 __RWSEM_INITIALIZER(llioc.ioc_sem),
3100 CFS_LIST_HEAD_INIT(llioc.ioc_head)
/* Per-registration record (struct header line missing in this excerpt):
 * list linkage, total allocation size, callback, and a flexible-style
 * trailing array of iocd_count ioctl command numbers. */
3105 cfs_list_t iocd_list;
3106 unsigned int iocd_size;
3107 llioc_callback_t iocd_cb;
3108 unsigned int iocd_count;
3109 unsigned int iocd_cmd[0];
/* Register 'count' ioctl command numbers with callback 'cb'. Returns an
 * opaque cookie (presumably the llioc_data pointer — return line not
 * visible) used later by ll_iocontrol_unregister(). NULL-ish failure on
 * bad args or allocation failure. */
3112 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3115 struct llioc_data *in_data = NULL;
3118 if (cb == NULL || cmd == NULL ||
3119 count > LLIOC_MAX_CMD || count < 0)
3122 size = sizeof(*in_data) + count * sizeof(unsigned int);
3123 OBD_ALLOC(in_data, size);
3124 if (in_data == NULL)
/* NOTE(review): memset covers only the fixed header; the trailing cmd
 * array is fully overwritten by the memcpy below, so that is fine. */
3127 memset(in_data, 0, sizeof(*in_data));
3128 in_data->iocd_size = size;
3129 in_data->iocd_cb = cb;
3130 in_data->iocd_count = count;
3131 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
/* Publish under the write lock. */
3133 down_write(&llioc.ioc_sem);
3134 cfs_list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3135 up_write(&llioc.ioc_sem);
/* Remove and free the registration identified by 'magic' (the cookie from
 * ll_iocontrol_register). Warns if no matching entry is found. */
3140 void ll_iocontrol_unregister(void *magic)
3142 struct llioc_data *tmp;
3147 down_write(&llioc.ioc_sem);
3148 cfs_list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* Size is cached before the entry is unlinked and freed. */
3150 unsigned int size = tmp->iocd_size;
3152 cfs_list_del(&tmp->iocd_list);
3153 up_write(&llioc.ioc_sem);
3155 OBD_FREE(tmp, size);
3159 up_write(&llioc.ioc_sem);
3161 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3164 EXPORT_SYMBOL(ll_iocontrol_register);
3165 EXPORT_SYMBOL(ll_iocontrol_unregister);
/* Dispatch an ioctl to registered dynamic handlers: scan every
 * registration under the read lock, invoke the callback on the first
 * matching command number, and stop iterating if it returns LLIOC_STOP.
 * The handler's rc is passed back through *rcp. */
3167 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
3168 unsigned int cmd, unsigned long arg, int *rcp)
3170 enum llioc_iter ret = LLIOC_CONT;
3171 struct llioc_data *data;
3172 int rc = -EINVAL, i;
3174 down_read(&llioc.ioc_sem);
3175 cfs_list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3176 for (i = 0; i < data->iocd_count; i++) {
3177 if (cmd != data->iocd_cmd[i])
3180 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3184 if (ret == LLIOC_STOP)
3187 up_read(&llioc.ioc_sem);
/* Push a layout configuration into the cl_object via cl_conf_set() inside
 * a nested cl environment. For OBJECT_CONF_SET, only after the layout is
 * applied may the layout lock be allowed to match (otherwise a stale
 * layout could be observed). No-op when the inode has no cl_object. */
3194 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3196 struct ll_inode_info *lli = ll_i2info(inode);
3197 struct cl_env_nest nest;
3202 if (lli->lli_clob == NULL)
3205 env = cl_env_nested_get(&nest);
3207 RETURN(PTR_ERR(env));
3209 result = cl_conf_set(env, lli->lli_clob, conf);
3210 cl_env_nested_put(&nest, env);
3212 if (conf->coc_opc == OBJECT_CONF_SET) {
3213 struct ldlm_lock *lock = conf->coc_lock;
3215 LASSERT(lock != NULL);
3216 LASSERT(ldlm_has_layout(lock));
3218 /* it can only be allowed to match after layout is
3219 * applied to inode otherwise false layout would be
3220 * seen. Applying layout should happen before dropping
3221 * the intent lock. */
3222 ldlm_lock_allow_match(lock);
3229 * Apply the layout to the inode. Layout lock is held and will be released
/* (continuation of the function's doc comment; remaining lines of the
 * comment are missing from this excerpt). Unpacks the layout md from the
 * lock's lvb, configures the cl_object, returns the layout generation via
 * *gen, and waits for in-flight IO when reconfiguring. */
3232 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3233 struct inode *inode, __u32 *gen, bool reconf)
3235 struct ll_inode_info *lli = ll_i2info(inode);
3236 struct ll_sb_info *sbi = ll_i2sbi(inode);
3237 struct ldlm_lock *lock;
3238 struct lustre_md md = { NULL };
3239 struct cl_object_conf conf;
3244 LASSERT(lustre_handle_is_used(lockh));
3246 lock = ldlm_handle2lock(lockh);
3247 LASSERT(lock != NULL);
3248 LASSERT(ldlm_has_layout(lock));
3250 LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
3251 inode, PFID(&lli->lli_fid), reconf);
3253 lock_res_and_lock(lock);
3254 lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
3255 unlock_res_and_lock(lock);
3256 /* checking lvb_ready is racy but this is okay. The worst case is
3257 * that multi processes may configure the file on the same time. */
3258 if (lvb_ready || !reconf) {
3259 LDLM_LOCK_PUT(lock);
3263 /* layout_gen must be valid if layout lock is not
3264 * cancelled and stripe has already set */
3265 *gen = lli->lli_layout_gen;
3268 ldlm_lock_decref(lockh, mode);
3272 /* for layout lock, lmm is returned in lock's lvb.
3273 * lvb_data is immutable if the lock is held so it's safe to access it
3274 * without res lock. See the description in ldlm_lock_decref_internal()
3275 * for the condition to free lvb_data of layout lock */
3276 if (lock->l_lvb_data != NULL) {
3277 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3278 lock->l_lvb_data, lock->l_lvb_len)
3280 *gen = LL_LAYOUT_GEN_EMPTY;
3282 *gen = md.lsm->lsm_layout_gen;
3285 CERROR("%s: file "DFID" unpackmd error: %d\n",
3286 ll_get_fsname(inode->i_sb, NULL, 0),
3287 PFID(&lli->lli_fid), rc);
3291 LDLM_LOCK_PUT(lock);
3292 ldlm_lock_decref(lockh, mode);
3296 /* set layout to file. Unlikely this will fail as old layout was
3297 * surely eliminated */
3298 memset(&conf, 0, sizeof conf);
3299 conf.coc_opc = OBJECT_CONF_SET;
3300 conf.coc_inode = inode;
3301 conf.coc_lock = lock;
3302 conf.u.coc_md = &md;
3303 rc = ll_layout_conf(inode, &conf);
3304 LDLM_LOCK_PUT(lock);
3306 ldlm_lock_decref(lockh, mode);
/* Free the unpacked stripe md; the cl_object keeps its own copy. */
3309 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3311 /* wait for IO to complete if it's still being used. */
3313 CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
3314 ll_get_fsname(inode->i_sb, NULL, 0),
3315 inode, PFID(&lli->lli_fid));
3317 memset(&conf, 0, sizeof conf);
3318 conf.coc_opc = OBJECT_CONF_WAIT;
3319 conf.coc_inode = inode;
3320 rc = ll_layout_conf(inode, &conf);
3324 CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
3325 PFID(&lli->lli_fid), rc);
3332 * This function checks if there exists a LAYOUT lock on the client side,
3333 * or enqueues it if it doesn't have one in cache.
3335 * This function will not hold layout lock so it may be revoked any time after
3336 this function returns. Any operations that depend on the layout should be redone
3339 * This function should be called before lov_io_init() to get an uptodate
3340 * layout version, the caller should save the version number and after IO
3341 * is finished, this function should be called again to verify that layout
3342 * is not changed during IO time.
3344 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3346 struct ll_inode_info *lli = ll_i2info(inode);
3347 struct ll_sb_info *sbi = ll_i2sbi(inode);
3348 struct md_op_data *op_data;
3349 struct lookup_intent it;
3350 struct lustre_handle lockh;
3352 struct ldlm_enqueue_info einfo = { .ei_type = LDLM_IBITS,
3354 .ei_cb_bl = ll_md_blocking_ast,
3355 .ei_cb_cp = ldlm_completion_ast,
3356 .ei_cbdata = NULL };
3360 *gen = LL_LAYOUT_GEN_NONE;
3361 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK))
3365 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3366 LASSERT(S_ISREG(inode->i_mode));
3368 /* mostly layout lock is caching on the local side, so try to match
3369 * it before grabbing layout lock mutex. */
3370 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
3371 if (mode != 0) { /* hit cached lock */
3372 rc = ll_layout_lock_set(&lockh, mode, inode, gen, false);
3376 /* better hold lli_layout_mutex to try again otherwise
3377 * it will have starvation problem. */
3380 /* take layout lock mutex to enqueue layout lock exclusively. */
3381 mutex_lock(&lli->lli_layout_mutex);
3384 /* try again. Maybe somebody else has done this. */
3385 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
3386 if (mode != 0) { /* hit cached lock */
3387 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3391 mutex_unlock(&lli->lli_layout_mutex);
3395 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3396 0, 0, LUSTRE_OPC_ANY, NULL);
3397 if (IS_ERR(op_data)) {
3398 mutex_unlock(&lli->lli_layout_mutex);
3399 RETURN(PTR_ERR(op_data));
3402 /* have to enqueue one */
3403 memset(&it, 0, sizeof(it));
3404 it.it_op = IT_LAYOUT;
3405 lockh.cookie = 0ULL;
3407 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
3408 ll_get_fsname(inode->i_sb, NULL, 0), inode,
3409 PFID(&lli->lli_fid));
3411 rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
3413 if (it.d.lustre.it_data != NULL)
3414 ptlrpc_req_finished(it.d.lustre.it_data);
3415 it.d.lustre.it_data = NULL;
3417 ll_finish_md_op_data(op_data);
3419 md_set_lock_data(sbi->ll_md_exp, &it.d.lustre.it_lock_handle, inode, NULL);
3421 mode = it.d.lustre.it_lock_mode;
3422 it.d.lustre.it_lock_mode = 0;
3423 ll_intent_drop_lock(&it);
3426 /* set lock data in case this is a new lock */
3427 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3428 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3432 mutex_unlock(&lli->lli_layout_mutex);