4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2013, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include "llite_internal.h"
49 #include <lustre/ll_fiemap.h>
51 #include "cl_object.h"
/* Allocate a per-open-file ll_file_data from its dedicated slab cache and
 * initialise the write-failure flag.
 * NOTE(review): source is garbled/sampled -- the NULL check on @fd and the
 * return statement are not visible here; confirm against the full file. */
53 struct ll_file_data *ll_file_data_get(void)
55 struct ll_file_data *fd;
57 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, CFS_ALLOC_IO);
58 fd->fd_write_failed = false;
/* Return an ll_file_data previously obtained from ll_file_data_get()
 * back to the slab cache. */
62 static void ll_file_data_put(struct ll_file_data *fd)
65 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/* Pack the inode's cached attributes (mode, a/m/ctime, size, blocks, ext
 * flags), its FID, current IO epoch, the open handle @fh and an MDS
 * capability into @op_data for an MDS request. */
68 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
69 struct lustre_handle *fh)
71 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
72 op_data->op_attr.ia_mode = inode->i_mode;
73 op_data->op_attr.ia_atime = inode->i_atime;
74 op_data->op_attr.ia_mtime = inode->i_mtime;
75 op_data->op_attr.ia_ctime = inode->i_ctime;
76 op_data->op_attr.ia_size = i_size_read(inode);
77 op_data->op_attr_blocks = inode->i_blocks;
78 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
79 ll_inode_to_ext_flags(inode->i_flags);
80 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
82 op_data->op_handle = *fh;
83 op_data->op_capa1 = ll_mdscapa_get(inode);
/* Propagate a pending data-modified hint to the MDS via the op bias. */
85 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
86 op_data->op_bias |= MDS_DATA_MODIFIED;
90 * Closes the IO epoch and packs all the attributes into @op_data for
/* Prepare @op_data for an MDS close RPC on handle @och: select which
 * attributes are valid, close the IO epoch and pack the inode state. */
93 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
94 struct obd_client_handle *och)
98 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
99 ATTR_MTIME_SET | ATTR_CTIME_SET;
/* Size/blocks are only meaningful when closing a write handle. */
101 if (!(och->och_flags & FMODE_WRITE))
/* Without Size-on-MDS support (or for non-regular files) the client
 * must send size and block counts itself. */
104 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
105 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
107 ll_ioepoch_close(inode, op_data, &och, 0);
110 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
111 ll_prep_md_op_data(op_data, inode, NULL, NULL,
112 0, 0, LUSTRE_OPC_ANY, NULL);
/* Send an MDS_CLOSE RPC for open handle @och, update Size-on-MDS when the
 * server asks for it, destroy orphaned OST objects referenced by the close
 * reply, and clear the open-replay data.
 * NOTE(review): garbled/sampled source -- several declarations, error
 * branches and the final RETURN are not visible here. */
116 static int ll_close_inode_openhandle(struct obd_export *md_exp,
118 struct obd_client_handle *och)
120 struct obd_export *exp = ll_i2mdexp(inode);
121 struct md_op_data *op_data;
122 struct ptlrpc_request *req = NULL;
123 struct obd_device *obd = class_exp2obd(exp);
130 * XXX: in case of LMV, is this correct to access
133 CERROR("Invalid MDC connection handle "LPX64"\n",
134 ll_i2mdexp(inode)->exp_handle.h_cookie);
138 OBD_ALLOC_PTR(op_data);
140 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
142 ll_prepare_close(inode, op_data, och);
143 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
144 rc = md_close(md_exp, op_data, och->och_mod, &req);
146 /* This close must have the epoch closed. */
147 LASSERT(epoch_close);
148 /* MDS has instructed us to obtain Size-on-MDS attribute from
149 * OSTs and send setattr to back to MDS. */
150 rc = ll_som_update(inode, op_data);
152 CERROR("inode %lu mdc Size-on-MDS update failed: "
153 "rc = %d\n", inode->i_ino, rc);
157 CERROR("inode %lu mdc close failed: rc = %d\n",
161 /* DATA_MODIFIED flag was successfully sent on close, cancel data
162 * modification flag. */
163 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
164 struct ll_inode_info *lli = ll_i2info(inode);
166 spin_lock(&lli->lli_lock);
167 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
168 spin_unlock(&lli->lli_lock);
171 ll_finish_md_op_data(op_data);
174 rc = ll_objects_destroy(req, inode);
176 CERROR("inode %lu ll_objects destroy: rc = %d\n",
/* SOM is enabled but the epoch was not closed: DONE_WRITING must still
 * run, so queue it and keep @och alive until then. */
183 if (exp_connect_som(exp) && !epoch_close &&
184 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
185 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
187 md_clear_open_replay_data(md_exp, och);
188 /* Free @och if it is not waiting for DONE_WRITING. */
189 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
192 if (req) /* This is close request */
193 ptlrpc_req_finished(req);
/* Close the per-inode MDS open handle of the kind selected by @flags
 * (write / exec / read), but only when no other local opens still use it.
 * NOTE(review): garbled/sampled source -- the handle-detach under the
 * mutex and the final RETURN are not visible here. */
197 int ll_md_real_close(struct inode *inode, int flags)
199 struct ll_inode_info *lli = ll_i2info(inode);
200 struct obd_client_handle **och_p;
201 struct obd_client_handle *och;
/* Pick the och slot and use-count matching the open mode. */
206 if (flags & FMODE_WRITE) {
207 och_p = &lli->lli_mds_write_och;
208 och_usecount = &lli->lli_open_fd_write_count;
209 } else if (flags & FMODE_EXEC) {
210 och_p = &lli->lli_mds_exec_och;
211 och_usecount = &lli->lli_open_fd_exec_count;
213 LASSERT(flags & FMODE_READ);
214 och_p = &lli->lli_mds_read_och;
215 och_usecount = &lli->lli_open_fd_read_count;
218 mutex_lock(&lli->lli_och_mutex);
219 if (*och_usecount) { /* There are still users of this handle, so
221 mutex_unlock(&lli->lli_och_mutex);
226 mutex_unlock(&lli->lli_och_mutex);
228 if (och) { /* There might be a race and somebody have freed this och
230 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/* Per-file-descriptor close: drop any group lock, decrement the open
 * use-count for this fd's mode, and close the MDS open handle unless a
 * cached OPEN DLM lock lets us skip the RPC. Releases the fd private
 * data and the MDS capability at the end. */
237 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
240 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
241 struct ll_inode_info *lli = ll_i2info(inode);
245 /* clear group lock, if present */
246 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
247 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
249 /* Let's see if we have good enough OPEN lock on the file and if
250 we can skip talking to MDS */
251 if (file->f_dentry->d_inode) { /* Can this ever be false? */
253 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
254 struct lustre_handle lockh;
255 struct inode *inode = file->f_dentry->d_inode;
256 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
/* Drop this fd's reference on the matching open handle. */
258 mutex_lock(&lli->lli_och_mutex);
259 if (fd->fd_omode & FMODE_WRITE) {
261 LASSERT(lli->lli_open_fd_write_count);
262 lli->lli_open_fd_write_count--;
263 } else if (fd->fd_omode & FMODE_EXEC) {
265 LASSERT(lli->lli_open_fd_exec_count);
266 lli->lli_open_fd_exec_count--;
269 LASSERT(lli->lli_open_fd_read_count);
270 lli->lli_open_fd_read_count--;
272 mutex_unlock(&lli->lli_och_mutex);
/* No cached OPEN ibits lock -> must do the real close RPC. */
274 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
275 LDLM_IBITS, &policy, lockmode,
277 rc = ll_md_real_close(file->f_dentry->d_inode,
281 CERROR("Releasing a file %p with negative dentry %p. Name %s",
282 file, file->f_dentry, file->f_dentry->d_name.name);
285 LUSTRE_FPRIVATE(file) = NULL;
286 ll_file_data_put(fd);
287 ll_capa_close(inode);
292 /* While this returns an error code, fput() the caller does not, so we need
293 * to make every effort to clean up all of our state here. Also, applications
294 * rarely check close errors and even if an error is returned they will not
295 * re-try the close call.
/* VFS ->release() for Lustre files: tear down remote-ACL state on the
 * root inode, stop a statahead thread we own, harvest async write errors,
 * and delegate the real close to ll_md_close(). */
297 int ll_file_release(struct inode *inode, struct file *file)
299 struct ll_file_data *fd;
300 struct ll_sb_info *sbi = ll_i2sbi(inode);
301 struct ll_inode_info *lli = ll_i2info(inode);
305 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
306 inode->i_generation, inode);
308 #ifdef CONFIG_FS_POSIX_ACL
/* Remote-client ACL bookkeeping only applies on the filesystem root. */
309 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
310 inode == inode->i_sb->s_root->d_inode) {
311 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
314 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
315 fd->fd_flags &= ~LL_FILE_RMTACL;
316 rct_del(&sbi->ll_rct, cfs_curproc_pid());
317 et_search_free(&sbi->ll_et, cfs_curproc_pid());
322 if (inode->i_sb->s_root != file->f_dentry)
323 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
324 fd = LUSTRE_FPRIVATE(file);
327 /* The last ref on @file, maybe not the the owner pid of statahead.
328 * Different processes can open the same dir, "ll_opendir_key" means:
329 * it is me that should stop the statahead thread. */
330 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
331 lli->lli_opendir_pid != 0)
332 ll_stop_statahead(inode, lli->lli_opendir_key);
/* Root has no MDS open handle; just drop the fd private data. */
334 if (inode->i_sb->s_root == file->f_dentry) {
335 LUSTRE_FPRIVATE(file) = NULL;
336 ll_file_data_put(fd);
/* Surface any asynchronous write error recorded since the last close. */
340 if (!S_ISDIR(inode->i_mode)) {
341 lov_read_and_clear_async_rc(lli->lli_clob);
342 lli->lli_async_rc = 0;
345 rc = ll_md_close(sbi->ll_md_exp, inode, file);
347 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
348 libcfs_debug_dumplog();
/* Perform an intent-based open against the MDS for @file: build op_data
 * from the dentry, enqueue the IT_OPEN intent via md_intent_lock(), and on
 * success instantiate/refresh the inode from the reply. @lmm/@lmmsize
 * carry striping info when this is really a setstripe open. */
353 static int ll_intent_file_open(struct file *file, void *lmm,
354 int lmmsize, struct lookup_intent *itp)
356 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
357 struct dentry *parent = file->f_dentry->d_parent;
358 const char *name = file->f_dentry->d_name.name;
359 const int len = file->f_dentry->d_name.len;
360 struct md_op_data *op_data;
361 struct ptlrpc_request *req;
362 __u32 opc = LUSTRE_OPC_ANY;
369 /* Usually we come here only for NFSD, and we want open lock.
370 But we can also get here with pre 2.6.15 patchless kernels, and in
371 that case that lock is also ok */
372 /* We can also get here if there was cached open handle in revalidate_it
373 * but it disappeared while we were getting from there to ll_file_open.
374 * But this means this file was closed and immediatelly opened which
375 * makes a good candidate for using OPEN lock */
376 /* If lmmsize & lmm are not 0, we are just setting stripe info
377 * parameters. No need for the open lock */
378 if (lmm == NULL && lmmsize == 0) {
379 itp->it_flags |= MDS_OPEN_LOCK;
380 if (itp->it_flags & FMODE_WRITE)
381 opc = LUSTRE_OPC_CREATE;
384 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
385 file->f_dentry->d_inode, name, len,
388 RETURN(PTR_ERR(op_data));
390 itp->it_flags |= MDS_OPEN_BY_FID;
391 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
392 0 /*unused */, &req, ll_md_blocking_ast, 0);
393 ll_finish_md_op_data(op_data);
395 /* reason for keep own exit path - don`t flood log
396 * with messages with -ESTALE errors.
398 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
399 it_open_error(DISP_OPEN_OPEN, itp))
401 ll_release_openhandle(file->f_dentry, itp);
405 if (it_disposition(itp, DISP_LOOKUP_NEG))
406 GOTO(out, rc = -ENOENT);
408 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
409 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
410 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* Instantiate/refresh the inode from the MDS reply and attach the
 * returned DLM lock to it. */
414 rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
415 if (!rc && itp->d.lustre.it_lock_mode)
416 ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
420 ptlrpc_req_finished(itp->d.lustre.it_data);
421 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
422 ll_intent_drop_lock(itp);
428 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
429 * not believe attributes if a few ioepoch holders exist. Attributes for
430 * previous ioepoch if new one is opened are also skipped by MDS.
432 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
/* Only record a non-zero epoch that differs from the current one. */
434 if (ioepoch && lli->lli_ioepoch != ioepoch) {
435 lli->lli_ioepoch = ioepoch;
436 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
437 ioepoch, PFID(&lli->lli_fid));
/* Fill client open handle @och from the MDS open reply carried by intent
 * @it (file handle, FID, open flags, IO epoch) and register the request
 * for open replay. Returns the md_set_open_replay_data() result. */
441 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
442 struct lookup_intent *it, struct obd_client_handle *och)
444 struct ptlrpc_request *req = it->d.lustre.it_data;
445 struct mdt_body *body;
449 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
450 LASSERT(body != NULL); /* reply already checked out */
452 memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
453 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
454 och->och_fid = lli->lli_fid;
455 och->och_flags = it->it_flags;
456 ll_ioepoch_open(lli, body->ioepoch);
458 return md_set_open_replay_data(md_exp, och, req);
/* Finish a local open: optionally fill @och from the intent reply, then
 * attach @fd as the file's private data, initialise readahead state and
 * record the open mode. */
461 int ll_local_open(struct file *file, struct lookup_intent *it,
462 struct ll_file_data *fd, struct obd_client_handle *och)
464 struct inode *inode = file->f_dentry->d_inode;
465 struct ll_inode_info *lli = ll_i2info(inode);
468 LASSERT(!LUSTRE_FPRIVATE(file));
473 struct ptlrpc_request *req = it->d.lustre.it_data;
474 struct mdt_body *body;
477 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
481 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
482 if ((it->it_flags & FMODE_WRITE) &&
483 (body->valid & OBD_MD_FLSIZE))
484 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
485 lli->lli_ioepoch, PFID(&lli->lli_fid));
488 LUSTRE_FPRIVATE(file) = fd;
489 ll_readahead_init(inode, &fd->fd_ras);
490 fd->fd_omode = it->it_flags;
494 /* Open a file, and (for the very first open) create objects on the OSTs at
495 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
496 * creation or open until ll_lov_setstripe() ioctl is called.
498 * If we already have the stripe MD locally then we don't request it in
499 * md_open(), by passing a lmm_size = 0.
501 * It is up to the application to ensure no other processes open this file
502 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
503 * used. We might be able to avoid races of that sort by getting lli_open_sem
504 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
505 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/* VFS ->open() for Lustre files. Reuses a cached MDS open handle when
 * one exists for this open mode, otherwise performs an intent open.
 * NOTE(review): garbled/sampled source -- many branches, gotos and the
 * final RETURN are not visible here. */
507 int ll_file_open(struct inode *inode, struct file *file)
509 struct ll_inode_info *lli = ll_i2info(inode);
510 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
511 .it_flags = file->f_flags };
512 struct obd_client_handle **och_p = NULL;
513 __u64 *och_usecount = NULL;
514 struct ll_file_data *fd;
515 int rc = 0, opendir_set = 0;
518 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
519 inode->i_generation, inode, file->f_flags);
521 it = file->private_data; /* XXX: compat macro */
522 file->private_data = NULL; /* prevent ll_local_open assertion */
524 fd = ll_file_data_get();
526 GOTO(out_och_free, rc = -ENOMEM);
/* First opener of a directory claims statahead ownership. */
529 if (S_ISDIR(inode->i_mode)) {
530 spin_lock(&lli->lli_sa_lock);
531 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
532 lli->lli_opendir_pid == 0) {
533 lli->lli_opendir_key = fd;
534 lli->lli_opendir_pid = cfs_curproc_pid();
537 spin_unlock(&lli->lli_sa_lock);
/* Root inode needs no MDS open handle. */
540 if (inode->i_sb->s_root == file->f_dentry) {
541 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent from lookup: build our own IT_OPEN intent. */
545 if (!it || !it->d.lustre.it_disposition) {
546 /* Convert f_flags into access mode. We cannot use file->f_mode,
547 * because everything but O_ACCMODE mask was stripped from
549 if ((oit.it_flags + 1) & O_ACCMODE)
551 if (file->f_flags & O_TRUNC)
552 oit.it_flags |= FMODE_WRITE;
554 /* kernel only call f_op->open in dentry_open. filp_open calls
555 * dentry_open after call to open_namei that checks permissions.
556 * Only nfsd_open call dentry_open directly without checking
557 * permissions and because of that this code below is safe. */
558 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
559 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
561 /* We do not want O_EXCL here, presumably we opened the file
562 * already? XXX - NFS implications? */
563 oit.it_flags &= ~O_EXCL;
565 /* bug20584, if "it_flags" contains O_CREAT, the file will be
566 * created if necessary, then "IT_CREAT" should be set to keep
567 * consistent with it */
568 if (oit.it_flags & O_CREAT)
569 oit.it_op |= IT_CREAT;
575 /* Let's see if we have file open on MDS already. */
576 if (it->it_flags & FMODE_WRITE) {
577 och_p = &lli->lli_mds_write_och;
578 och_usecount = &lli->lli_open_fd_write_count;
579 } else if (it->it_flags & FMODE_EXEC) {
580 och_p = &lli->lli_mds_exec_och;
581 och_usecount = &lli->lli_open_fd_exec_count;
583 och_p = &lli->lli_mds_read_och;
584 och_usecount = &lli->lli_open_fd_read_count;
587 mutex_lock(&lli->lli_och_mutex);
588 if (*och_p) { /* Open handle is present */
589 if (it_disposition(it, DISP_OPEN_OPEN)) {
590 /* Well, there's extra open request that we do not need,
591 let's close it somehow. This will decref request. */
592 rc = it_open_error(DISP_OPEN_OPEN, it);
594 mutex_unlock(&lli->lli_och_mutex);
595 GOTO(out_openerr, rc);
598 ll_release_openhandle(file->f_dentry, it);
602 rc = ll_local_open(file, it, fd, NULL);
605 mutex_unlock(&lli->lli_och_mutex);
606 GOTO(out_openerr, rc);
609 LASSERT(*och_usecount == 0);
610 if (!it->d.lustre.it_disposition) {
611 /* We cannot just request lock handle now, new ELC code
612 means that one of other OPEN locks for this file
613 could be cancelled, and since blocking ast handler
614 would attempt to grab och_mutex as well, that would
615 result in a deadlock */
616 mutex_unlock(&lli->lli_och_mutex);
617 it->it_create_mode |= M_CHECK_STALE;
618 rc = ll_intent_file_open(file, NULL, 0, it);
619 it->it_create_mode &= ~M_CHECK_STALE;
621 GOTO(out_openerr, rc);
625 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
627 GOTO(out_och_free, rc = -ENOMEM);
631 /* md_intent_lock() didn't get a request ref if there was an
632 * open error, so don't do cleanup on the request here
634 /* XXX (green): Should not we bail out on any error here, not
635 * just open error? */
636 rc = it_open_error(DISP_OPEN_OPEN, it);
638 GOTO(out_och_free, rc);
640 LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
642 rc = ll_local_open(file, it, fd, *och_p);
644 GOTO(out_och_free, rc);
646 mutex_unlock(&lli->lli_och_mutex);
649 /* Must do this outside lli_och_mutex lock to prevent deadlock where
650 different kind of OPEN lock for this same inode gets cancelled
651 by ldlm_cancel_lru */
652 if (!S_ISREG(inode->i_mode))
653 GOTO(out_och_free, rc);
/* Object creation may be deferred (O_LOV_DELAY_CREATE / read-only). */
657 if (!lli->lli_has_smd) {
658 if (file->f_flags & O_LOV_DELAY_CREATE ||
659 !(file->f_mode & FMODE_WRITE)) {
660 CDEBUG(D_INODE, "object creation was delayed\n");
661 GOTO(out_och_free, rc);
664 file->f_flags &= ~O_LOV_DELAY_CREATE;
665 GOTO(out_och_free, rc);
/* Error/cleanup paths below. */
669 if (och_p && *och_p) {
670 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
671 *och_p = NULL; /* OBD_FREE writes some magic there */
674 mutex_unlock(&lli->lli_och_mutex);
677 if (opendir_set != 0)
678 ll_stop_statahead(inode, lli->lli_opendir_key);
680 ll_file_data_put(fd);
682 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
685 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
686 ptlrpc_req_finished(it->d.lustre.it_data);
687 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
693 /* Fills the obdo with the attributes for the lsm */
/* Issue an async OST getattr for stripe metadata @lsm via a ptlrpc set,
 * wait for completion, and mask the returned valid bits down to the
 * attributes callers may trust. @sync requests a server-side lock. */
694 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
695 struct obd_capa *capa, struct obdo *obdo,
696 __u64 ioepoch, int sync)
698 struct ptlrpc_request_set *set;
699 struct obd_info oinfo = { { { 0 } } };
704 LASSERT(lsm != NULL);
708 oinfo.oi_oa->o_oi = lsm->lsm_oi;
709 oinfo.oi_oa->o_mode = S_IFREG;
710 oinfo.oi_oa->o_ioepoch = ioepoch;
711 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
712 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
713 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
714 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
715 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
716 OBD_MD_FLDATAVERSION;
717 oinfo.oi_capa = capa;
/* @sync path: ask the OST to take the lock server-side. */
719 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
720 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
723 set = ptlrpc_prep_set();
725 CERROR("can't allocate ptlrpc set\n");
728 rc = obd_getattr_async(exp, &oinfo, set);
730 rc = ptlrpc_set_wait(set);
731 ptlrpc_set_destroy(set);
/* Trim o_valid to the bits that are meaningful to our callers. */
734 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
735 OBD_MD_FLATIME | OBD_MD_FLMTIME |
736 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
737 OBD_MD_FLDATAVERSION);
742 * Performs the getattr on the inode and updates its fields.
743 * If @sync != 0, perform the getattr under the server-side lock.
745 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
746 __u64 ioepoch, int sync)
748 struct obd_capa *capa = ll_mdscapa_get(inode);
749 struct lov_stripe_md *lsm;
753 lsm = ccc_inode_lsm_get(inode);
754 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
755 capa, obdo, ioepoch, sync);
/* On success, refresh the VFS inode from the returned obdo. */
758 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
760 obdo_refresh_inode(inode, obdo, obdo->o_valid);
761 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
762 " blksize %lu\n", POSTID(oi), i_size_read(inode),
763 (unsigned long long)inode->i_blocks,
764 (unsigned long)ll_inode_blksize(inode));
766 ccc_inode_lsm_put(inode, lsm);
/* Merge MDS-provided timestamps with OST attributes under the inode size
 * lock: take the newest of each timestamp and install the OST-derived
 * size and block count into the inode. */
770 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
772 struct ll_inode_info *lli = ll_i2info(inode);
773 struct cl_object *obj = lli->lli_clob;
774 struct cl_attr *attr = ccc_env_thread_attr(env);
780 ll_inode_size_lock(inode);
781 /* merge timestamps the most recently obtained from mds with
782 timestamps obtained from osts */
783 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
784 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
785 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
786 inode_init_lvb(inode, &lvb);
788 cl_object_attr_lock(obj);
789 rc = cl_object_attr_get(env, obj, attr);
790 cl_object_attr_unlock(obj);
/* Keep whichever timestamp source is newer. */
793 if (lvb.lvb_atime < attr->cat_atime)
794 lvb.lvb_atime = attr->cat_atime;
795 if (lvb.lvb_ctime < attr->cat_ctime)
796 lvb.lvb_ctime = attr->cat_ctime;
797 if (lvb.lvb_mtime < attr->cat_mtime)
798 lvb.lvb_mtime = attr->cat_mtime;
800 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
801 PFID(&lli->lli_fid), attr->cat_size);
802 cl_isize_write_nolock(inode, attr->cat_size);
804 inode->i_blocks = attr->cat_blocks;
806 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
807 LTIME_S(inode->i_atime) = lvb.lvb_atime;
808 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
810 ll_inode_size_unlock(inode);
/* Glimpse ioctl helper: fetch OST attributes for @lsm and copy the size,
 * blocks and timestamps into the caller's stat structure @st. */
815 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
818 struct obdo obdo = { 0 };
821 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
823 st->st_size = obdo.o_size;
824 st->st_blocks = obdo.o_blocks;
825 st->st_mtime = obdo.o_mtime;
826 st->st_atime = obdo.o_atime;
827 st->st_ctime = obdo.o_ctime;
/* Initialise a cl_io from the file's open flags: nonblocking/append/sync
 * behaviour and the DLM locking policy (never for nolock files, mandatory
 * for O_APPEND, otherwise "maybe"). */
832 void ll_io_init(struct cl_io *io, const struct file *file, int write)
834 struct inode *inode = file->f_dentry->d_inode;
836 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
838 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
839 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
840 file->f_flags & O_DIRECT ||
843 io->ci_obj = ll_i2info(inode)->lli_clob;
844 io->ci_lockreq = CILR_MAYBE;
845 if (ll_file_nolock(file)) {
846 io->ci_lockreq = CILR_NEVER;
847 io->ci_no_srvlock = 1;
848 } else if (file->f_flags & O_APPEND) {
849 io->ci_lockreq = CILR_MANDATORY;
/* Common engine for all read/write entry points (normal, sendfile,
 * splice): set up a cl_io for @iot, take the write mutex / truncation
 * semaphore as needed, run the cl_io loop, advance *@ppos and account
 * per-fd write failures and I/O statistics.
 * NOTE(review): garbled/sampled source -- several case labels, gotos and
 * the return are not visible here. */
854 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
855 struct file *file, enum cl_io_type iot,
856 loff_t *ppos, size_t count)
858 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
859 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
865 io = ccc_env_thread_io(env);
866 ll_io_init(io, file, iot == CIT_WRITE);
868 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
869 struct vvp_io *vio = vvp_env_io(env);
870 struct ccc_io *cio = ccc_env_io(env);
871 int write_mutex_locked = 0;
873 cio->cui_fd = LUSTRE_FPRIVATE(file);
874 vio->cui_io_subtype = args->via_io_subtype;
876 switch (vio->cui_io_subtype) {
878 cio->cui_iov = args->u.normal.via_iov;
879 cio->cui_nrsegs = args->u.normal.via_nrsegs;
880 cio->cui_tot_nrsegs = cio->cui_nrsegs;
881 #ifndef HAVE_FILE_WRITEV
882 cio->cui_iocb = args->u.normal.via_iocb;
/* Non-group-locked writes serialize on lli_write_mutex; reads only
 * block against concurrent truncation. */
884 if ((iot == CIT_WRITE) &&
885 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
886 if (mutex_lock_interruptible(&lli->
888 GOTO(out, result = -ERESTARTSYS);
889 write_mutex_locked = 1;
890 } else if (iot == CIT_READ) {
891 down_read(&lli->lli_trunc_sem);
895 vio->u.sendfile.cui_actor = args->u.sendfile.via_actor;
896 vio->u.sendfile.cui_target = args->u.sendfile.via_target;
899 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
900 vio->u.splice.cui_flags = args->u.splice.via_flags;
903 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
906 result = cl_io_loop(env, io);
907 if (write_mutex_locked)
908 mutex_unlock(&lli->lli_write_mutex);
909 else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
910 up_read(&lli->lli_trunc_sem);
912 /* cl_io_rw_init() handled IO */
913 result = io->ci_result;
916 if (io->ci_nob > 0) {
918 *ppos = io->u.ci_wr.wr.crw_pos;
923 /* If any bit been read/written (result != 0), we just return
924 * short read/write instead of restart io. */
925 if (result == 0 && io->ci_need_restart) {
926 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
927 iot == CIT_READ ? "read" : "write",
928 file->f_dentry->d_name.name, *ppos, count);
929 LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
933 if (iot == CIT_READ) {
935 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
936 LPROC_LL_READ_BYTES, result);
937 } else if (iot == CIT_WRITE) {
939 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
940 LPROC_LL_WRITE_BYTES, result);
941 fd->fd_write_failed = false;
942 } else if (result != -ERESTARTSYS) {
943 fd->fd_write_failed = true;
952 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/* Validate an iovec array and compute the total byte count into *@count;
 * on an access_ok() failure *@nr_segs is clamped to the good prefix. */
954 static int ll_file_get_iov_count(const struct iovec *iov,
955 unsigned long *nr_segs, size_t *count)
960 for (seg = 0; seg < *nr_segs; seg++) {
961 const struct iovec *iv = &iov[seg];
964 * If any segment has a negative length, or the cumulative
965 * length ever wraps negative then return -EINVAL.
968 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
970 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
975 cnt -= iv->iov_len; /* This segment is no good */
982 #ifdef HAVE_FILE_READV
/* Vectored read entry point (pre-aio kernels): validate the iovec, grab a
 * cl environment and run the generic IO path for CIT_READ. */
983 static ssize_t ll_file_readv(struct file *file, const struct iovec *iov,
984 unsigned long nr_segs, loff_t *ppos)
987 struct vvp_io_args *args;
993 result = ll_file_get_iov_count(iov, &nr_segs, &count);
997 env = cl_env_get(&refcheck);
999 RETURN(PTR_ERR(env));
1001 args = vvp_env_args(env, IO_NORMAL);
1002 args->u.normal.via_iov = (struct iovec *)iov;
1003 args->u.normal.via_nrsegs = nr_segs;
1005 result = ll_file_io_generic(env, args, file, CIT_READ, ppos, count);
1006 cl_env_put(env, &refcheck);
/* Single-buffer read: wrap (buf, count) in a one-element iovec. */
1010 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1014 struct iovec *local_iov;
1019 env = cl_env_get(&refcheck);
1021 RETURN(PTR_ERR(env));
1023 local_iov = &vvp_env_info(env)->vti_local_iov;
1024 local_iov->iov_base = (void __user *)buf;
1025 local_iov->iov_len = count;
1026 result = ll_file_readv(file, local_iov, 1, ppos);
1027 cl_env_put(env, &refcheck);
/* AIO read entry point (modern kernels): same path, position taken from
 * the kiocb. */
1032 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1033 unsigned long nr_segs, loff_t pos)
1036 struct vvp_io_args *args;
1042 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1046 env = cl_env_get(&refcheck);
1048 RETURN(PTR_ERR(env));
1050 args = vvp_env_args(env, IO_NORMAL);
1051 args->u.normal.via_iov = (struct iovec *)iov;
1052 args->u.normal.via_nrsegs = nr_segs;
1053 args->u.normal.via_iocb = iocb;
1055 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1056 &iocb->ki_pos, count);
1057 cl_env_put(env, &refcheck);
/* Single-buffer read built on the aio path via a synchronous kiocb. */
1061 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1065 struct iovec *local_iov;
1066 struct kiocb *kiocb;
1071 env = cl_env_get(&refcheck);
1073 RETURN(PTR_ERR(env));
1075 local_iov = &vvp_env_info(env)->vti_local_iov;
1076 kiocb = &vvp_env_info(env)->vti_kiocb;
1077 local_iov->iov_base = (void __user *)buf;
1078 local_iov->iov_len = count;
1079 init_sync_kiocb(kiocb, file);
1080 kiocb->ki_pos = *ppos;
1081 kiocb->ki_left = count;
1083 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1084 *ppos = kiocb->ki_pos;
1086 cl_env_put(env, &refcheck);
1092 * Write to a file (through the page cache).
1094 #ifdef HAVE_FILE_WRITEV
/* Vectored write entry point (pre-aio kernels): mirror of ll_file_readv
 * running the generic IO path for CIT_WRITE. */
1095 static ssize_t ll_file_writev(struct file *file, const struct iovec *iov,
1096 unsigned long nr_segs, loff_t *ppos)
1099 struct vvp_io_args *args;
1105 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1109 env = cl_env_get(&refcheck);
1111 RETURN(PTR_ERR(env));
1113 args = vvp_env_args(env, IO_NORMAL);
1114 args->u.normal.via_iov = (struct iovec *)iov;
1115 args->u.normal.via_nrsegs = nr_segs;
1117 result = ll_file_io_generic(env, args, file, CIT_WRITE, ppos, count);
1118 cl_env_put(env, &refcheck);
/* Single-buffer write: wrap (buf, count) in a one-element iovec. */
1122 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1126 struct iovec *local_iov;
1131 env = cl_env_get(&refcheck);
1133 RETURN(PTR_ERR(env));
1135 local_iov = &vvp_env_info(env)->vti_local_iov;
1136 local_iov->iov_base = (void __user *)buf;
1137 local_iov->iov_len = count;
1139 result = ll_file_writev(file, local_iov, 1, ppos);
1140 cl_env_put(env, &refcheck);
1144 #else /* AIO stuff */
/* AIO write entry point (modern kernels). */
1145 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1146 unsigned long nr_segs, loff_t pos)
1149 struct vvp_io_args *args;
1155 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1159 env = cl_env_get(&refcheck);
1161 RETURN(PTR_ERR(env));
1163 args = vvp_env_args(env, IO_NORMAL);
1164 args->u.normal.via_iov = (struct iovec *)iov;
1165 args->u.normal.via_nrsegs = nr_segs;
1166 args->u.normal.via_iocb = iocb;
1168 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1169 &iocb->ki_pos, count);
1170 cl_env_put(env, &refcheck);
/* Single-buffer write built on the aio path via a synchronous kiocb. */
1174 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1178 struct iovec *local_iov;
1179 struct kiocb *kiocb;
1184 env = cl_env_get(&refcheck);
1186 RETURN(PTR_ERR(env));
1188 local_iov = &vvp_env_info(env)->vti_local_iov;
1189 kiocb = &vvp_env_info(env)->vti_kiocb;
1190 local_iov->iov_base = (void __user *)buf;
1191 local_iov->iov_len = count;
1192 init_sync_kiocb(kiocb, file);
1193 kiocb->ki_pos = *ppos;
1194 kiocb->ki_left = count;
1196 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1197 *ppos = kiocb->ki_pos;
1199 cl_env_put(env, &refcheck);
1205 #ifdef HAVE_KERNEL_SENDFILE
1207 * Send file content (through pagecache) somewhere with helper
/* sendfile() entry point: run the generic IO path with IO_SENDFILE args
 * carrying the actor callback and its target. */
1209 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1210 read_actor_t actor, void *target)
1213 struct vvp_io_args *args;
1218 env = cl_env_get(&refcheck);
1220 RETURN(PTR_ERR(env));
1222 args = vvp_env_args(env, IO_SENDFILE);
1223 args->u.sendfile.via_target = target;
1224 args->u.sendfile.via_actor = actor;
1226 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1227 cl_env_put(env, &refcheck);
1232 #ifdef HAVE_KERNEL_SPLICE_READ
1234 * Send file content (through pagecache) somewhere with helper
/* splice_read() entry point: same path with IO_SPLICE args carrying the
 * destination pipe and splice flags. */
1236 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1237 struct pipe_inode_info *pipe, size_t count,
1241 struct vvp_io_args *args;
1246 env = cl_env_get(&refcheck);
1248 RETURN(PTR_ERR(env));
1250 args = vvp_env_args(env, IO_SPLICE);
1251 args->u.splice.via_pipe = pipe;
1252 args->u.splice.via_flags = flags;
1254 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1255 cl_env_put(env, &refcheck);
/* Re-create this file's OST object @oi on OST index @ost_idx: clone the
 * stripe metadata, mark the obdo with OBD_FL_RECREATE_OBJS and call
 * obd_create() under the inode size lock.
 * NOTE(review): garbled/sampled source -- the obdo allocation and some
 * cleanup lines are not visible here. */
1260 static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
1263 struct obd_export *exp = ll_i2dtexp(inode);
1264 struct obd_trans_info oti = { 0 };
1265 struct obdo *oa = NULL;
1268 struct lov_stripe_md *lsm = NULL, *lsm2;
1275 lsm = ccc_inode_lsm_get(inode);
1277 GOTO(out, rc = -ENOENT);
1279 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1280 (lsm->lsm_stripe_count));
1282 OBD_ALLOC_LARGE(lsm2, lsm_size);
1284 GOTO(out, rc = -ENOMEM);
/* o_nlink is overloaded to carry the target OST index for recreate. */
1287 oa->o_nlink = ost_idx;
1288 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1289 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1290 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1291 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1292 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1293 memcpy(lsm2, lsm, lsm_size);
1294 ll_inode_size_lock(inode);
1295 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1296 ll_inode_size_unlock(inode);
1298 OBD_FREE_LARGE(lsm2, lsm_size);
1301 ccc_inode_lsm_put(inode, lsm);
/*
 * ll_lov_recreate_obj() - LL_IOC_RECREATE_OBJ handler: copy the
 * ll_recreate_obj request from user space and recreate the object with
 * the given id on the given OST index.  Root (CAP_SYS_ADMIN) only.
 * NOTE(review): the -EPERM/-EFAULT returns for the two checks are on
 * lines elided from this listing.
 */
1306 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1308 struct ll_recreate_obj ucreat;
1312 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1315 if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
/* Build an ostid in the MDT0 sequence from the user-supplied object id. */
1319 ostid_set_seq_mdt0(&oi);
1320 ostid_set_id(&oi, ucreat.lrc_id);
1321 RETURN(ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx));
/*
 * ll_lov_recreate_fid() - LL_IOC_RECREATE_FID handler: recreate the OST
 * object named by a user-supplied FID.  Root (CAP_SYS_ADMIN) only.
 */
1324 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1331 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1334 if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1337 fid_to_ostid(&fid, &oi);
/* OST index is encoded in bits 16..31 of the FID sequence here. */
1338 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1339 RETURN(ll_lov_recreate(inode, &oi, ost_idx));
/*
 * ll_lov_setstripe_ea_info() - set the striping EA on @inode by doing an
 * intent open (IT_OPEN with @flags) carrying the user lov_user_md, then
 * immediately releasing the open handle.  Fails if a layout already
 * exists (striping can only be set once on a file).
 * NOTE(review): error paths and the final RETURN are partially elided.
 */
1342 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1343 int flags, struct lov_user_md *lum, int lum_size)
1345 struct lov_stripe_md *lsm = NULL;
1346 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
/* A layout already attached means the stripe EA was set before: refuse. */
1350 lsm = ccc_inode_lsm_get(inode);
1352 ccc_inode_lsm_put(inode, lsm);
1353 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
/* Size lock serializes against size/layout updates during the open. */
1358 ll_inode_size_lock(inode);
1359 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1362 rc = oit.d.lustre.it_status;
1364 GOTO(out_req_free, rc);
/* The open handle created by the intent is not kept; close it at once. */
1366 ll_release_openhandle(file->f_dentry, &oit);
1369 ll_inode_size_unlock(inode);
1370 ll_intent_release(&oit);
1371 ccc_inode_lsm_put(inode, lsm);
/* out_req_free: drop the enqueue request before returning the error. */
1374 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * ll_lov_getstripe_ea_info() - fetch the LOV EA of @filename (child of
 * @inode) from the MDS via md_getattr_name() and return it byte-swapped
 * to host endianness.
 *
 * On success *lmmp points INTO the reply buffer of *request: the caller
 * owns *request and must ptlrpc_req_finished() it after consuming *lmmp.
 */
1378 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1379 struct lov_mds_md **lmmp, int *lmm_size,
1380 struct ptlrpc_request **request)
1382 struct ll_sb_info *sbi = ll_i2sbi(inode);
1383 struct mdt_body *body;
1384 struct lov_mds_md *lmm = NULL;
1385 struct ptlrpc_request *req = NULL;
1386 struct md_op_data *op_data;
/* Ask the MDC how large an EA buffer the getattr reply may need. */
1389 rc = ll_get_max_mdsize(sbi, &lmmsize);
1393 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1394 strlen(filename), lmmsize,
1395 LUSTRE_OPC_ANY, NULL);
1396 if (IS_ERR(op_data))
1397 RETURN(PTR_ERR(op_data));
1399 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1400 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1401 ll_finish_md_op_data(op_data);
1403 CDEBUG(D_INFO, "md_getattr_name failed "
1404 "on %s: rc %d\n", filename, rc);
1408 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1409 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1411 lmmsize = body->eadatasize;
/* No EA bits valid or zero-sized EA (check partially elided) -> no data. */
1413 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1415 GOTO(out, rc = -ENODATA);
1418 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1419 LASSERT(lmm != NULL);
/* Only plain V1/V3 LOV EAs are understood here. */
1421 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1422 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1423 GOTO(out, rc = -EPROTO);
1427 * This is coming from the MDS, so is probably in
1428 * little endian. We convert it to host endian before
1429 * passing it to userspace.
/* True only on big-endian hosts: then the wire data needs swabbing. */
1431 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1432 /* if function called for directory - we should
1433 * avoid swab not existent lsm objects */
1434 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1435 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1436 if (S_ISREG(body->mode))
1437 lustre_swab_lov_user_md_objects(
1438 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1439 ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
1440 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1441 lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1442 if (S_ISREG(body->mode))
1443 lustre_swab_lov_user_md_objects(
1444 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1445 ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
/* out: hand the EA (still inside req's reply buffer) back to the caller. */
1451 *lmm_size = lmmsize;
/*
 * ll_lov_setea() - LL_IOC_LOV_SETEA handler: copy a raw lov_user_md from
 * user space and apply it via ll_lov_setstripe_ea_info() with
 * MDS_OPEN_HAS_OBJS (the EA names pre-existing objects).  Root only.
 */
1456 static int ll_lov_setea(struct inode *inode, struct file *file,
1459 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1460 struct lov_user_md *lump;
/* NOTE(review): buffer sized for exactly one lov_user_ost_data entry —
 * presumably matched to what this ioctl accepts; confirm in full source. */
1461 int lum_size = sizeof(struct lov_user_md) +
1462 sizeof(struct lov_user_ost_data);
1466 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1469 OBD_ALLOC_LARGE(lump, lum_size);
1473 if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1474 OBD_FREE_LARGE(lump, lum_size);
1478 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1480 OBD_FREE_LARGE(lump, lum_size);
/*
 * ll_lov_setstripe() - LL_IOC_LOV_SETSTRIPE handler: read a V1 (or, if
 * the magic says so, V3) lov_user_md from user space, set the layout,
 * then echo the resulting stripe info back to the user buffer.
 */
1484 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1487 struct lov_user_md_v3 lumv3;
/* lumv1 aliases the head of lumv3: V1 is a prefix of V3 on the wire. */
1488 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1489 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1490 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1492 int flags = FMODE_WRITE;
1495 /* first try with v1 which is smaller than v3 */
1496 lum_size = sizeof(struct lov_user_md_v1);
1497 if (copy_from_user(lumv1, lumv1p, lum_size))
/* Magic says V3: re-copy the full V3 structure. */
1500 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1501 lum_size = sizeof(struct lov_user_md_v3);
1502 if (copy_from_user(&lumv3, lumv3p, lum_size))
1506 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1508 struct lov_stripe_md *lsm;
/* NOTE(review): put_user() return value is not checked on this line. */
1511 put_user(0, &lumv1p->lmm_stripe_count);
/* Refresh the client layout, then report the real striping back. */
1513 ll_layout_refresh(inode, &gen);
1514 lsm = ccc_inode_lsm_get(inode);
1515 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1516 0, lsm, (void *)arg);
1517 ccc_inode_lsm_put(inode, lsm);
/*
 * ll_lov_getstripe() - LL_IOC_LOV_GETSTRIPE handler: copy the inode's
 * striping information out to the user buffer via obd_iocontrol().
 * NOTE(review): the no-layout (-ENODATA?) branch is elided here.
 */
1522 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1524 struct lov_stripe_md *lsm;
1528 lsm = ccc_inode_lsm_get(inode);
1530 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1532 ccc_inode_lsm_put(inode, lsm);
/*
 * ll_get_grouplock() - LL_IOC_GROUP_LOCK handler: take a whole-file group
 * lock with group id @arg and stash it in the per-open-file ll_file_data.
 * Only one group lock per file descriptor; racing threads are resolved
 * under lli_lock after the (blocking) cl_get_grouplock() call.
 */
1536 int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1538 struct ll_inode_info *lli = ll_i2info(inode);
1539 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1540 struct ccc_grouplock grouplock;
/* Group locks make no sense when locking is disabled for this file. */
1544 if (ll_file_nolock(file))
1545 RETURN(-EOPNOTSUPP);
1548 spin_lock(&lli->lli_lock);
1548 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1549 CWARN("group lock already existed with gid %lu\n",
1550 fd->fd_grouplock.cg_gid);
1551 spin_unlock(&lli->lli_lock);
1554 LASSERT(fd->fd_grouplock.cg_lock == NULL);
/* Drop the spinlock: cl_get_grouplock() may block (O_NONBLOCK honored). */
1555 spin_unlock(&lli->lli_lock);
1557 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1558 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* Re-check under the lock: another thread may have won while we slept. */
1562 spin_lock(&lli->lli_lock);
1563 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1564 spin_unlock(&lli->lli_lock);
1565 CERROR("another thread just won the race\n");
1566 cl_put_grouplock(&grouplock);
1570 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1571 fd->fd_grouplock = grouplock;
1572 spin_unlock(&lli->lli_lock);
1574 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * ll_put_grouplock() - LL_IOC_GROUP_UNLOCK handler: release the group
 * lock held on this file descriptor, verifying the caller-supplied group
 * id matches.  State is detached under lli_lock; the actual DLM release
 * (cl_put_grouplock) happens after the spinlock is dropped.
 */
1578 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1580 struct ll_inode_info *lli = ll_i2info(inode);
1581 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1582 struct ccc_grouplock grouplock;
1585 spin_lock(&lli->lli_lock);
1586 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1587 spin_unlock(&lli->lli_lock);
1588 CWARN("no group lock held\n");
1591 LASSERT(fd->fd_grouplock.cg_lock != NULL);
/* Wrong gid: refuse (error return is on an elided line). */
1593 if (fd->fd_grouplock.cg_gid != arg) {
1594 CWARN("group lock %lu doesn't match current id %lu\n",
1595 arg, fd->fd_grouplock.cg_gid);
1596 spin_unlock(&lli->lli_lock);
/* Take a local copy, clear the fd state, then release outside the lock. */
1600 grouplock = fd->fd_grouplock;
1601 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1602 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1603 spin_unlock(&lli->lli_lock);
1605 cl_put_grouplock(&grouplock);
1606 CDEBUG(D_INFO, "group lock %lu released\n", arg);
/*
 * Close inode open handle
 *
 * If the intent carries a DISP_OPEN_OPEN disposition, build a temporary
 * obd_client_handle from it and close it on the MDS; also drop the extra
 * request reference kept by DISP_ENQ_OPEN_REF.
 */
1611 * Close inode open handle
1613 * \param dentry [in] dentry which contains the inode
1614 * \param it [in,out] intent which contains open info and result
1617 * \retval <0 failure
1619 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1621 struct inode *inode = dentry->d_inode;
1622 struct obd_client_handle *och;
1628 /* Root ? Do nothing. */
1629 if (dentry->d_inode->i_sb->s_root == dentry)
1632 /* No open handle to close? Move away */
1633 if (!it_disposition(it, DISP_OPEN_OPEN))
1636 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1638 OBD_ALLOC(och, sizeof(*och));
1640 GOTO(out, rc = -ENOMEM);
/* Fill the client handle from the intent, then close it on the MDS. */
1642 ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
1643 ll_i2info(inode), it, och);
1645 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1648 /* this one is in place of ll_file_open */
1649 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1650 ptlrpc_req_finished(it->d.lustre.it_data);
1651 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * ll_do_fiemap() - perform the FIEMAP extent-mapping call against the
 * data stack (obd_get_info KEY_FIEMAP).  Validates the flag set, flushes
 * dirty pages when FIEMAP_FLAG_SYNC is set, and short-circuits for files
 * with size 0 (no objects to map).
 */
1657 * Get size for inode for which FIEMAP mapping is requested.
1658 * Make the FIEMAP get_info call and returns the result.
1660 int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1663 struct obd_export *exp = ll_i2dtexp(inode);
1664 struct lov_stripe_md *lsm = NULL;
1665 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1666 int vallen = num_bytes;
1670 /* Checks for fiemap flags */
/* Unsupported flags: strip them and report back (error path elided). */
1671 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1672 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1676 /* Check for FIEMAP_FLAG_SYNC */
1677 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1678 rc = filemap_fdatawrite(inode->i_mapping);
1683 lsm = ccc_inode_lsm_get(inode);
1687 /* If the stripe_count > 1 and the application does not understand
1688 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1690 if (lsm->lsm_stripe_count > 1 &&
1691 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1692 GOTO(out, rc = -EOPNOTSUPP);
/* Identify the object set and current size for the OST-side mapping. */
1694 fm_key.oa.o_oi = lsm->lsm_oi;
1695 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1697 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1698 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1699 /* If filesize is 0, then there would be no objects for mapping */
1700 if (fm_key.oa.o_size == 0) {
1701 fiemap->fm_mapped_extents = 0;
1705 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1707 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1710 CERROR("obd_get_info failed: rc = %d\n", rc);
1713 ccc_inode_lsm_put(inode, lsm);
/*
 * ll_fid2path() - OBD_IOC_FID2PATH handler: translate a FID to a path by
 * forwarding a getinfo_fid2path request to the MDC.  The input struct is
 * copied in first to learn gf_pathlen, then an output buffer of
 * sizeof(*gfout) + gf_pathlen is allocated for the reply.
 * Requires CAP_DAC_READ_SEARCH unless the mount allows user fid2path.
 */
1717 int ll_fid2path(struct inode *inode, void *arg)
1719 struct obd_export *exp = ll_i2mdexp(inode);
1720 struct getinfo_fid2path *gfout, *gfin;
1724 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1725 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1728 /* Need to get the buflen */
1729 OBD_ALLOC_PTR(gfin);
1732 if (copy_from_user(gfin, arg, sizeof(*gfin))) {
/* NOTE(review): gf_pathlen is user-controlled; presumably validated on
 * an elided line or bounded by OBD_ALLOC failure — confirm in source. */
1737 outsize = sizeof(*gfout) + gfin->gf_pathlen;
1738 OBD_ALLOC(gfout, outsize);
1739 if (gfout == NULL) {
1743 memcpy(gfout, gfin, sizeof(*gfout));
1746 /* Call mdc_iocontrol */
1747 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1751 if (copy_to_user(arg, gfout, outsize))
1755 OBD_FREE(gfout, outsize);
/*
 * ll_ioctl_fiemap() - FSFILT_IOC_FIEMAP handler: size the kernel fiemap
 * buffer from the user's fm_extent_count, copy the request (and, when
 * extents are expected, the first extent, which seeds the continuation
 * offset/device), run ll_do_fiemap(), and copy the result back.
 */
1759 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1761 struct ll_user_fiemap *fiemap_s;
1762 size_t num_bytes, ret_bytes;
1763 unsigned int extent_count;
1766 /* Get the extent count so we can calculate the size of
1767 * required fiemap buffer */
1768 if (get_user(extent_count,
1769 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
/* NOTE(review): extent_count is user-controlled; no overflow check is
 * visible on this multiplication — confirm bounds in the full source. */
1771 num_bytes = sizeof(*fiemap_s) + (extent_count *
1772 sizeof(struct ll_fiemap_extent));
1774 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1775 if (fiemap_s == NULL)
1778 /* get the fiemap value */
1779 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1781 GOTO(error, rc = -EFAULT);
1783 /* If fm_extent_count is non-zero, read the first extent since
1784 * it is used to calculate end_offset and device from previous
1787 if (copy_from_user(&fiemap_s->fm_extents[0],
1788 (char __user *)arg + sizeof(*fiemap_s),
1789 sizeof(struct ll_fiemap_extent)))
1790 GOTO(error, rc = -EFAULT);
1793 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
/* Copy back the header plus however many extents were actually mapped. */
1797 ret_bytes = sizeof(struct ll_user_fiemap);
1799 if (extent_count != 0)
1800 ret_bytes += (fiemap_s->fm_mapped_extents *
1801 sizeof(struct ll_fiemap_extent));
1803 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1807 OBD_FREE_LARGE(fiemap_s, num_bytes);
/*
 * ll_data_version() - fetch the OST-side data version of @inode via
 * ll_lsm_getattr().  A file with no layout reports version 0 (success).
 * @extent_lock asks the server to take extent locks for a stable value;
 * pass 0 when the caller already holds OST group locks.
 */
1812 * Read the data_version for inode.
1814 * This value is computed using stripe object version on OST.
1815 * Version is computed using server side locking.
1817 * @param extent_lock Take extent lock. Not needed if a process is already
1818 * holding the OST object group locks.
1820 int ll_data_version(struct inode *inode, __u64 *data_version,
1823 struct lov_stripe_md *lsm = NULL;
1824 struct ll_sb_info *sbi = ll_i2sbi(inode);
1825 struct obdo *obdo = NULL;
1829 /* If no stripe, we consider version is 0. */
1830 lsm = ccc_inode_lsm_get(inode);
1833 CDEBUG(D_INODE, "No object for inode\n");
1837 OBD_ALLOC_PTR(obdo);
/* Allocation failed: drop the lsm ref (ENOMEM return is elided). */
1839 ccc_inode_lsm_put(inode, lsm);
1843 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
/* Server did not return a data version: treat as protocol error. */
1845 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1848 *data_version = obdo->o_data_version;
1852 ccc_inode_lsm_put(inode, lsm);
/*
 * Working state for ll_swap_layouts(): saved timestamps (ia1/ia2), the
 * two inodes involved, and per-side data-version check requests.  Kept
 * in one heap-allocated struct so the fields can be swap()ed in pairs
 * when the inodes are reordered by FID.
 */
1857 struct ll_swap_stack {
1858 struct iattr ia1, ia2;
1860 struct inode *inode1, *inode2;
1861 bool check_dv1, check_dv2;
/*
 * ll_swap_layouts() - LL_IOC_LOV_SWAP_LAYOUTS back-end: atomically swap
 * the layouts of two regular files on the same filesystem via an MDS
 * request.  Steps: permission and same-sb checks; order the pair by FID
 * (to take locks in a canonical order); optionally take group locks to
 * flush dirty cache; optionally verify the data versions are unchanged;
 * send the swap through obd_iocontrol(); restore mtime/atime if asked.
 */
1864 static int ll_swap_layouts(struct file *file1, struct file *file2,
1865 struct lustre_swap_layouts *lsl)
1867 struct mdc_swap_layouts msl;
1868 struct md_op_data *op_data;
1871 struct ll_swap_stack *llss = NULL;
1874 OBD_ALLOC_PTR(llss);
1878 llss->inode1 = file1->f_dentry->d_inode;
1879 llss->inode2 = file2->f_dentry->d_inode;
/* Only regular files can swap layouts (inode1's check is elided). */
1881 if (!S_ISREG(llss->inode2->i_mode))
1882 GOTO(free, rc = -EINVAL);
1884 if (ll_permission(llss->inode1, MAY_WRITE, NULL) ||
1885 ll_permission(llss->inode2, MAY_WRITE, NULL))
1886 GOTO(free, rc = -EPERM);
1888 if (llss->inode2->i_sb != llss->inode1->i_sb)
1889 GOTO(free, rc = -EXDEV);
1891 /* we use 2 bool because it is easier to swap than 2 bits */
1892 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1893 llss->check_dv1 = true;
1895 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1896 llss->check_dv2 = true;
1898 /* we cannot use lsl->sl_dvX directly because we may swap them */
1899 llss->dv1 = lsl->sl_dv1;
1900 llss->dv2 = lsl->sl_dv2;
1902 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
1903 if (rc == 0) /* same file, done! */
/* Canonicalize: always operate with the smaller FID as inode1. */
1906 if (rc < 0) { /* sequentialize it */
1907 swap(llss->inode1, llss->inode2);
1909 swap(llss->dv1, llss->dv2);
1910 swap(llss->check_dv1, llss->check_dv2);
1914 if (gid != 0) { /* application asks to flush dirty cache */
1915 rc = ll_get_grouplock(llss->inode1, file1, gid);
1919 rc = ll_get_grouplock(llss->inode2, file2, gid);
/* Second group lock failed: undo the first before bailing out. */
1921 ll_put_grouplock(llss->inode1, file1, gid);
1926 /* to be able to restore mtime and atime after swap
1927 * we need to first save them */
1929 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
1930 llss->ia1.ia_mtime = llss->inode1->i_mtime;
1931 llss->ia1.ia_atime = llss->inode1->i_atime;
1932 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
1933 llss->ia2.ia_mtime = llss->inode2->i_mtime;
1934 llss->ia2.ia_atime = llss->inode2->i_atime;
1935 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
1938 /* ultimate check, before swaping the layouts we check if
1939 * dataversion has changed (if requested) */
1940 if (llss->check_dv1) {
1941 rc = ll_data_version(llss->inode1, &dv, 0);
1944 if (dv != llss->dv1)
1945 GOTO(putgl, rc = -EAGAIN);
1948 if (llss->check_dv2) {
1949 rc = ll_data_version(llss->inode2, &dv, 0);
1952 if (dv != llss->dv2)
1953 GOTO(putgl, rc = -EAGAIN);
1956 /* struct md_op_data is used to send the swap args to the mdt
1957 * only flags is missing, so we use struct mdc_swap_layouts
1958 * through the md_op_data->op_data */
1959 /* flags from user space have to be converted before they are send to
1960 * server, no flag is sent today, they are only used on the client */
1963 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
1964 0, LUSTRE_OPC_ANY, &msl);
1965 if (op_data != NULL) {
1966 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS,
1967 ll_i2mdexp(llss->inode1),
1968 sizeof(*op_data), op_data, NULL);
1969 ll_finish_md_op_data(op_data);
/* putgl: release the group locks in reverse acquisition order. */
1974 ll_put_grouplock(llss->inode2, file2, gid);
1975 ll_put_grouplock(llss->inode1, file1, gid);
1978 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
1982 /* clear useless flags */
1983 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
1984 llss->ia1.ia_valid &= ~ATTR_MTIME;
1985 llss->ia2.ia_valid &= ~ATTR_MTIME;
1988 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
1989 llss->ia1.ia_valid &= ~ATTR_ATIME;
1990 llss->ia2.ia_valid &= ~ATTR_ATIME;
1993 /* update time if requested */
/* Note the cross-application: inode1 gets ia2 (its layout came from
 * inode2) and inode2 gets ia1, restoring pre-swap-visible timestamps. */
1995 if (llss->ia2.ia_valid != 0) {
1996 mutex_lock(&llss->inode1->i_mutex);
1997 rc = ll_setattr(file1->f_dentry, &llss->ia2);
1998 mutex_unlock(&llss->inode1->i_mutex);
2001 if (llss->ia1.ia_valid != 0) {
2004 mutex_lock(&llss->inode2->i_mutex);
2005 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2006 mutex_unlock(&llss->inode2->i_mutex);
/*
 * ll_file_ioctl() - unlocked_ioctl entry point for regular files.  Large
 * dispatch switch over the LL_IOC_*/FSFILT_IOC_*/OBD_IOC_* command space;
 * unknown commands fall through to the registered ll_iocontrol_call()
 * handlers and finally to the data-device obd_iocontrol().
 * NOTE(review): many case bodies and RETURN lines are elided in this
 * listing; comments below describe only what is visible.
 */
2018 long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2020 struct inode *inode = file->f_dentry->d_inode;
2021 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2025 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2026 inode->i_generation, inode, cmd);
2027 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2029 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2030 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2034 case LL_IOC_GETFLAGS:
2035 /* Get the current value of the file flags */
2036 return put_user(fd->fd_flags, (int *)arg);
2037 case LL_IOC_SETFLAGS:
2038 case LL_IOC_CLRFLAGS:
2039 /* Set or clear specific file flags */
2040 /* XXX This probably needs checks to ensure the flags are
2041 * not abused, and to handle any flag side effects.
2043 if (get_user(flags, (int *) arg))
2046 if (cmd == LL_IOC_SETFLAGS) {
/* IGNORE_LOCK is only allowed on O_DIRECT files. */
2047 if ((flags & LL_FILE_IGNORE_LOCK) &&
2048 !(file->f_flags & O_DIRECT)) {
2049 CERROR("%s: unable to disable locking on "
2050 "non-O_DIRECT file\n", current->comm);
2054 fd->fd_flags |= flags;
2056 fd->fd_flags &= ~flags;
2059 case LL_IOC_LOV_SETSTRIPE:
2060 RETURN(ll_lov_setstripe(inode, file, arg));
2061 case LL_IOC_LOV_SETEA:
2062 RETURN(ll_lov_setea(inode, file, arg));
2063 case LL_IOC_LOV_SWAP_LAYOUTS: {
2065 struct lustre_swap_layouts lsl;
2067 if (cfs_copy_from_user(&lsl, (char *)arg,
2068 sizeof(struct lustre_swap_layouts)))
/* Both files must be writable for a swap to proceed. */
2071 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2074 file2 = fget(lsl.sl_fd);
2079 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2080 rc = ll_swap_layouts(file, file2, &lsl);
2084 case LL_IOC_LOV_GETSTRIPE:
2085 RETURN(ll_lov_getstripe(inode, arg));
2086 case LL_IOC_RECREATE_OBJ:
2087 RETURN(ll_lov_recreate_obj(inode, arg));
2088 case LL_IOC_RECREATE_FID:
2089 RETURN(ll_lov_recreate_fid(inode, arg));
2090 case FSFILT_IOC_FIEMAP:
2091 RETURN(ll_ioctl_fiemap(inode, arg));
2092 case FSFILT_IOC_GETFLAGS:
2093 case FSFILT_IOC_SETFLAGS:
2094 RETURN(ll_iocontrol(inode, file, cmd, arg));
2095 case FSFILT_IOC_GETVERSION_OLD:
2096 case FSFILT_IOC_GETVERSION:
2097 RETURN(put_user(inode->i_generation, (int *)arg));
2098 case LL_IOC_GROUP_LOCK:
2099 RETURN(ll_get_grouplock(inode, file, arg));
2100 case LL_IOC_GROUP_UNLOCK:
2101 RETURN(ll_put_grouplock(inode, file, arg));
2102 case IOC_OBD_STATFS:
2103 RETURN(ll_obd_statfs(inode, (void *)arg));
2105 /* We need to special case any other ioctls we want to handle,
2106 * to send them to the MDS/OST as appropriate and to properly
2107 * network encode the arg field.
2108 case FSFILT_IOC_SETVERSION_OLD:
2109 case FSFILT_IOC_SETVERSION:
2111 case LL_IOC_FLUSHCTX:
2112 RETURN(ll_flush_ctx(inode));
2113 case LL_IOC_PATH2FID: {
2114 if (copy_to_user((void *)arg, ll_inode2fid(inode),
2115 sizeof(struct lu_fid)))
2120 case OBD_IOC_FID2PATH:
2121 RETURN(ll_fid2path(inode, (void *)arg));
2122 case LL_IOC_DATA_VERSION: {
2123 struct ioc_data_version idv;
2126 if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
2129 rc = ll_data_version(inode, &idv.idv_version,
2130 !(idv.idv_flags & LL_DV_NOFLUSH));
2132 if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2138 case LL_IOC_GET_MDTIDX: {
2141 mdtidx = ll_get_mdt_idx(inode);
2145 if (put_user((int)mdtidx, (int*)arg))
2150 case OBD_IOC_GETDTNAME:
2151 case OBD_IOC_GETMDNAME:
2152 RETURN(ll_get_obd_name(inode, cmd, arg));
2153 case LL_IOC_HSM_STATE_GET: {
2154 struct md_op_data *op_data;
2155 struct hsm_user_state *hus;
/* hus allocation is elided; op_data carries it to the MDC ioctl. */
2162 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2163 LUSTRE_OPC_ANY, hus);
2164 if (op_data == NULL) {
2169 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2172 if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2175 ll_finish_md_op_data(op_data);
2179 case LL_IOC_HSM_STATE_SET: {
2180 struct md_op_data *op_data;
2181 struct hsm_state_set *hss;
2187 if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2192 /* Non-root users are forbidden to set or clear flags which are
2193 * NOT defined in HSM_USER_MASK. */
2194 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK)
2195 && !cfs_capable(CFS_CAP_SYS_ADMIN)) {
2200 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2201 LUSTRE_OPC_ANY, hss);
2202 if (op_data == NULL) {
2207 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2210 ll_finish_md_op_data(op_data);
2215 case LL_IOC_HSM_ACTION: {
2216 struct md_op_data *op_data;
2217 struct hsm_current_action *hca;
2224 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2225 LUSTRE_OPC_ANY, hca);
2226 if (op_data == NULL) {
2231 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2234 if (cfs_copy_to_user((char *)arg, hca, sizeof(*hca)))
2237 ll_finish_md_op_data(op_data);
/* default: try registered ioctl handlers, then the data device. */
2245 ll_iocontrol_call(inode, file, cmd, arg, &err))
2248 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2254 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * llseek_execute() - local fallback for kernels lacking the generic
 * helper: validate @offset against [0, maxsize] (negative allowed only
 * with FMODE_UNSIGNED_OFFSET) and commit it to file->f_pos, resetting
 * f_version so readdir-style users notice the seek.
 */
2255 static inline loff_t
2256 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2258 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2260 if (offset > maxsize)
2263 if (offset != file->f_pos) {
2264 file->f_pos = offset;
2265 file->f_version = 0;
/*
 * generic_file_llseek_size() - local copy of the upstream helper for old
 * kernels: llseek with caller-supplied @maxsize and @eof so Lustre can
 * pass its own file-size limit and glimpsed EOF.  Special-cases
 * SEEK_CUR(0) as a lockless position query; other SEEK_CUR updates are
 * serialized (here under i_mutex) against concurrent seeks.
 */
2271 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2272 loff_t maxsize, loff_t eof)
2274 struct inode *inode = file->f_dentry->d_inode;
2282 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2283 * position-querying operation. Avoid rewriting the "same"
2284 * f_pos value back to the file because a concurrent read(),
2285 * write() or lseek() might have altered it
2290 * f_lock protects against read/modify/write race with other
2291 * SEEK_CURs. Note that parallel writes and reads behave
2294 mutex_lock(&inode->i_mutex);
2295 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2296 mutex_unlock(&inode->i_mutex);
2300 * In the generic case the entire file is data, so as long as
2301 * offset isn't at the end of the file then the offset is data.
2308 * There is a virtual hole at the end of the file, so as long as
2309 * offset isn't i_size or larger, return i_size.
2317 return llseek_execute(file, offset, maxsize);
/*
 * ll_file_seek() - llseek file operation.  For SEEK_END/SEEK_HOLE/
 * SEEK_DATA the cluster-wide size must be current, so glimpse the size
 * from the OSTs first; then defer to the llseek_size helper with
 * Lustre's max file size as the bound.
 */
2321 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2323 struct inode *inode = file->f_dentry->d_inode;
2324 loff_t retval, eof = 0;
/* retval here is only the target offset, computed for the trace line. */
2327 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2328 (origin == SEEK_CUR) ? file->f_pos : 0);
2329 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
2330 inode->i_ino, inode->i_generation, inode, retval, retval,
2332 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2334 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2335 retval = ll_glimpse_size(inode);
2338 eof = i_size_read(inode);
2341 retval = ll_generic_file_llseek_size(file, offset, origin,
2342 ll_file_maxbytes(inode), eof);
/*
 * ll_flush() - flush file operation (called on close()).  Does not write
 * data; it only collects async writeback errors recorded on the inode
 * and the cl object, and reports -EIO once per descriptor (suppressed if
 * this fd already saw a write failure).
 */
2346 int ll_flush(struct file *file, fl_owner_t id)
2348 struct inode *inode = file->f_dentry->d_inode;
2349 struct ll_inode_info *lli = ll_i2info(inode);
2350 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2353 LASSERT(!S_ISDIR(inode->i_mode));
2355 /* catch async errors that were recorded back when async writeback
2356 * failed for pages in this mapping. */
/* Consume-and-clear: the recorded error is reported only once. */
2357 rc = lli->lli_async_rc;
2358 lli->lli_async_rc = 0;
2359 err = lov_read_and_clear_async_rc(lli->lli_clob);
2363 /* The application has been told write failure already.
2364 * Do not report failure again. */
2365 if (fd->fd_write_failed)
2367 return rc ? -EIO : 0;
/*
 * cl_sync_file_range() - run a CIT_FSYNC cl_io over [start, end] of
 * @inode with the given fsync @mode (NONE/LOCAL/DISCARD/ALL).  Returns
 * the number of pages written on success, negative errno on failure.
 * @ignore_layout lets the sync proceed across layout changes.
 */
2371 * Called to make sure a portion of file has been written out.
2372 * if @local_only is not true, it will send OST_SYNC RPCs to ost.
2374 * Return how many pages have been written.
2376 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2377 enum cl_fsync_mode mode, int ignore_layout)
2379 struct cl_env_nest nest;
2382 struct obd_capa *capa = NULL;
2383 struct cl_fsync_io *fio;
/* Reject modes outside the known enum before touching any state. */
2387 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2388 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2391 env = cl_env_nested_get(&nest);
2393 RETURN(PTR_ERR(env));
2395 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2397 io = ccc_env_thread_io(env);
2398 io->ci_obj = cl_i2info(inode)->lli_clob;
2399 io->ci_ignore_layout = ignore_layout;
2401 /* initialize parameters for sync */
2402 fio = &io->u.ci_fsync;
2403 fio->fi_capa = capa;
2404 fio->fi_start = start;
2406 fio->fi_fid = ll_inode2fid(inode);
2407 fio->fi_mode = mode;
2408 fio->fi_nr_written = 0;
2410 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2411 result = cl_io_loop(env, io);
2413 result = io->ci_result;
/* On success report the page count accumulated by the fsync io. */
2415 result = fio->fi_nr_written;
2416 cl_io_fini(env, io);
2417 cl_env_nested_put(&nest, env);
/*
 * ll_fsync() - fsync file operation, with three signature variants for
 * different kernel APIs (4-arg ranged, 2-arg, 3-arg with dentry).
 * Sequence: wait for in-flight page I/O, collect recorded async errors,
 * md_sync() the metadata, then cl_sync_file_range() the data for regular
 * files; fd_write_failed is updated so ll_flush() won't double-report.
 */
2425 * When dentry is provided (the 'else' case), *file->f_dentry may be
2426 * null and dentry must be used directly rather than pulled from
2427 * *file->f_dentry as is done otherwise.
2430 #ifdef HAVE_FILE_FSYNC_4ARGS
2431 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2433 struct dentry *dentry = file->f_dentry;
2434 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2435 int ll_fsync(struct file *file, int datasync)
2437 struct dentry *dentry = file->f_dentry;
2439 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2442 struct inode *inode = dentry->d_inode;
2443 struct ll_inode_info *lli = ll_i2info(inode);
2444 struct ptlrpc_request *req;
2445 struct obd_capa *oc;
2449 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2450 inode->i_generation, inode);
2451 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2453 #ifdef HAVE_FILE_FSYNC_4ARGS
/* Ranged API: write-and-wait ourselves, under i_mutex like upstream. */
2454 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2455 mutex_lock(&inode->i_mutex);
2457 /* fsync's caller has already called _fdata{sync,write}, we want
2458 * that IO to finish before calling the osc and mdc sync methods */
2459 rc = filemap_fdatawait(inode->i_mapping);
2462 /* catch async errors that were recorded back when async writeback
2463 * failed for pages in this mapping. */
2464 if (!S_ISDIR(inode->i_mode)) {
2465 err = lli->lli_async_rc;
2466 lli->lli_async_rc = 0;
2469 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* Sync the MDS-side metadata for this inode. */
2474 oc = ll_mdscapa_get(inode);
2475 err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2481 ptlrpc_req_finished(req);
2483 if (datasync && S_ISREG(inode->i_mode)) {
2484 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2486 err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
2488 if (rc == 0 && err < 0)
/* Remember the outcome so ll_flush() reports each failure once. */
2491 fd->fd_write_failed = true;
2493 fd->fd_write_failed = false;
2496 #ifdef HAVE_FILE_FSYNC_4ARGS
2497 mutex_unlock(&inode->i_mutex);
/*
 * ll_file_flock() - flock/posix-lock file operation.  Translates the VFS
 * file_lock into an LDLM_FLOCK enqueue against the MDS (LCK_PR/PW for
 * read/write, LCK_NL encodes unlock), then mirrors the result into the
 * local kernel lock tables; on local failure the remote lock is undone
 * with a compensating LCK_NL enqueue.
 */
2502 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2504 struct inode *inode = file->f_dentry->d_inode;
2505 struct ll_sb_info *sbi = ll_i2sbi(inode);
2506 struct ldlm_enqueue_info einfo = { .ei_type = LDLM_FLOCK,
2507 .ei_cb_cp =ldlm_flock_completion_ast,
2508 .ei_cbdata = file_lock };
2509 struct md_op_data *op_data;
2510 struct lustre_handle lockh = {0};
2511 ldlm_policy_data_t flock = {{0}};
2517 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2518 inode->i_ino, file_lock);
2520 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2522 if (file_lock->fl_flags & FL_FLOCK) {
2523 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2524 /* flocks are whole-file locks */
2525 flock.l_flock.end = OFFSET_MAX;
2526 /* For flocks owner is determined by the local file desctiptor*/
2527 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2528 } else if (file_lock->fl_flags & FL_POSIX) {
2529 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2530 flock.l_flock.start = file_lock->fl_start;
2531 flock.l_flock.end = file_lock->fl_end;
2535 flock.l_flock.pid = file_lock->fl_pid;
2537 /* Somewhat ugly workaround for svc lockd.
2538 * lockd installs custom fl_lmops->lm_compare_owner that checks
2539 * for the fl_owner to be the same (which it always is on local node
2540 * I guess between lockd processes) and then compares pid.
2541 * As such we assign pid to the owner field to make it all work,
2542 * conflict with normal locks is unlikely since pid space and
2543 * pointer space for current->files are not intersecting */
2544 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2545 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* Map POSIX lock types to LDLM modes (F_RDLCK/F_UNLCK/F_WRLCK cases;
 * the case labels themselves are on elided lines). */
2547 switch (file_lock->fl_type) {
2549 einfo.ei_mode = LCK_PR;
2552 /* An unlock request may or may not have any relation to
2553 * existing locks so we may not be able to pass a lock handle
2554 * via a normal ldlm_lock_cancel() request. The request may even
2555 * unlock a byte range in the middle of an existing lock. In
2556 * order to process an unlock request we need all of the same
2557 * information that is given with a normal read or write record
2558 * lock request. To avoid creating another ldlm unlock (cancel)
2559 * message we'll treat a LCK_NL flock request as an unlock. */
2560 einfo.ei_mode = LCK_NL;
2563 einfo.ei_mode = LCK_PW;
2566 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2567 file_lock->fl_type);
/* Map the command: F_SETLK* -> possibly non-blocking enqueue,
 * F_GETLK -> test-only (case labels on elided lines). */
2582 flags = LDLM_FL_BLOCK_NOWAIT;
2588 flags = LDLM_FL_TEST_LOCK;
2589 /* Save the old mode so that if the mode in the lock changes we
2590 * can decrement the appropriate reader or writer refcount. */
2591 file_lock->fl_type = einfo.ei_mode;
2594 CERROR("unknown fcntl lock command: %d\n", cmd);
2598 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2599 LUSTRE_OPC_ANY, NULL);
2600 if (IS_ERR(op_data))
2601 RETURN(PTR_ERR(op_data));
2603 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2604 "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2605 flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2607 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2608 op_data, &lockh, &flock, 0, NULL /* req */, flags);
/* Mirror the server result into the local lock tables. */
2610 if ((file_lock->fl_flags & FL_FLOCK) &&
2611 (rc == 0 || file_lock->fl_type == F_UNLCK))
2612 rc2 = flock_lock_file_wait(file, file_lock);
2613 if ((file_lock->fl_flags & FL_POSIX) &&
2614 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2615 !(flags & LDLM_FL_TEST_LOCK))
2616 rc2 = posix_lock_file_wait(file, file_lock);
/* Local bookkeeping failed: undo the remote lock with an NL enqueue. */
2618 if (rc2 && file_lock->fl_type != F_UNLCK) {
2619 einfo.ei_mode = LCK_NL;
2620 md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2621 op_data, &lockh, &flock, 0, NULL /* req */, flags);
2625 ll_finish_md_op_data(op_data);
/* ll_file_noflock() - flock hook for no-lock mounts (body elided in this
 * listing). */
2630 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
/*
 * ll_have_md_lock() - probe (LDLM_FL_TEST_LOCK, no references taken) for
 * granted MDS inodebits locks covering *bits in the given mode.  Bits
 * found are cleared from *bits; bits not covered remain set.  Returns
 * true iff all requested bits were found.
 */
2638 * test if some locks matching bits and l_req_mode are acquired
2639 * - bits can be in different locks
2640 * - if found clear the common lock bits in *bits
2641 * - the bits not found, are kept in *bits
2643 * \param bits [IN] searched lock bits [IN]
2644 * \param l_req_mode [IN] searched lock mode
2645 * \retval boolean, true iff all bits are found
2647 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
2649 struct lustre_handle lockh;
2650 ldlm_policy_data_t policy;
/* LCK_MINMODE means "any of the read/write modes". */
2651 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2652 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2661 fid = &ll_i2info(inode)->lli_fid;
2662 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2663 ldlm_lockname[mode]);
2665 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* Try each requested bit individually; different bits may live in
 * different locks. */
2666 for (i = 0; i < MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2667 policy.l_inodebits.bits = *bits & (1 << i);
2668 if (policy.l_inodebits.bits == 0)
2671 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2672 &policy, mode, &lockh)) {
2673 struct ldlm_lock *lock;
2675 lock = ldlm_handle2lock(&lockh);
/* Clear every bit the matched lock covers, not just the probed one. */
2678 ~(lock->l_policy_data.l_inodebits.bits);
2679 LDLM_LOCK_PUT(lock);
2681 *bits &= ~policy.l_inodebits.bits;
/* Match (and reference) a granted MD ibits lock covering 'bits' in any
 * regular mode. Unlike ll_have_md_lock() this takes a reference: the
 * matched handle is returned in *lockh for the caller to decref later. */
2688 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2689 struct lustre_handle *lockh, __u64 flags)
2691 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2696 fid = &ll_i2info(inode)->lli_fid;
2697 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2699 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
2700 fid, LDLM_IBITS, &policy,
2701 LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
/* Post-process a revalidate RPC result: -ENOENT on a non-regular,
 * non-directory inode is treated as "already unlinked" (success path,
 * detail lines not visible here); other errors are logged and returned. */
2705 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2707 /* Already unlinked. Just update nlink and return success */
2708 if (rc == -ENOENT) {
2710 /* This path cannot be hit for regular files unless in
2711 * case of obscure races, so no need to validate
2713 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2715 } else if (rc != 0) {
2716 CERROR("%s: revalidate FID "DFID" error: rc = %d\n",
2717 ll_get_fsname(inode->i_sb, NULL, 0),
2718 PFID(ll_inode2fid(inode)), rc);
/* Revalidate a dentry's MD attributes against the MDS.
 * Two paths: if the server supports OBD_CONNECT_ATTRFID, an intent
 * getattr-by-FID is issued; otherwise, if no matching ibits lock is
 * cached, a plain md_getattr RPC refreshes the inode. */
2724 int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2727 struct inode *inode = dentry->d_inode;
2728 struct ptlrpc_request *req = NULL;
2729 struct obd_export *exp;
2733 LASSERT(inode != NULL);
2735 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2736 inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2738 exp = ll_i2mdexp(inode);
2740 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
2741 * But under CMD case, it caused some lock issues, should be fixed
2742 * with new CMD ibits lock. See bug 12718 */
2743 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
2744 struct lookup_intent oit = { .it_op = IT_GETATTR };
2745 struct md_op_data *op_data;
/* LOOKUP-only revalidation does not need full getattr semantics. */
2747 if (ibits == MDS_INODELOCK_LOOKUP)
2748 oit.it_op = IT_LOOKUP;
2750 /* Call getattr by fid, so do not provide name at all. */
2751 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2752 dentry->d_inode, NULL, 0, 0,
2753 LUSTRE_OPC_ANY, NULL);
2754 if (IS_ERR(op_data))
2755 RETURN(PTR_ERR(op_data));
/* M_CHECK_STALE asks the intent path to detect stale inodes. */
2757 oit.it_create_mode |= M_CHECK_STALE;
2758 rc = md_intent_lock(exp, op_data, NULL, 0,
2759 /* we are not interested in name
2762 ll_md_blocking_ast, 0);
2763 ll_finish_md_op_data(op_data);
2764 oit.it_create_mode &= ~M_CHECK_STALE;
2766 rc = ll_inode_revalidate_fini(inode, rc);
2770 rc = ll_revalidate_it_finish(req, &oit, dentry);
2772 ll_intent_release(&oit);
2776 /* Unlinked? Unhash dentry, so it is not picked up later by
2777 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2778 here to preserve get_cwd functionality on 2.6.
2780 if (!dentry->d_inode->i_nlink)
2781 d_lustre_invalidate(dentry);
2783 ll_lookup_finish_locks(&oit, dentry);
/* Fallback: no ATTRFID support — only RPC if the needed ibits are
 * not already covered by a cached lock. */
2784 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
2785 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2786 obd_valid valid = OBD_MD_FLGETATTR;
2787 struct md_op_data *op_data;
/* Regular files also need striping EA, sized to the MDS maximum. */
2790 if (S_ISREG(inode->i_mode)) {
2791 rc = ll_get_max_mdsize(sbi, &ealen);
2794 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2797 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
2798 0, ealen, LUSTRE_OPC_ANY,
2800 if (IS_ERR(op_data))
2801 RETURN(PTR_ERR(op_data));
2803 op_data->op_valid = valid;
2804 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2805 * capa for this inode. Because we only keep capas of dirs
2807 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
2808 ll_finish_md_op_data(op_data);
2810 rc = ll_inode_revalidate_fini(inode, rc);
/* Apply the returned MD body/EA to the in-core inode. */
2814 rc = ll_prep_inode(&inode, req, NULL, NULL);
2817 ptlrpc_req_finished(req);
/* Revalidate MD attributes, then refresh size: non-regular inodes take
 * their times straight from the cached LVB, regular files glimpse the
 * current size from the OSTs. */
2821 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2824 struct inode *inode = dentry->d_inode;
2828 rc = __ll_inode_revalidate_it(dentry, it, ibits);
2832 /* if object isn't regular file, don't validate size */
2833 if (!S_ISREG(inode->i_mode)) {
2834 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
2835 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
2836 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
2838 rc = ll_glimpse_size(inode);
/* getattr with an explicit lookup intent: revalidate UPDATE|LOOKUP ibits,
 * bump the getattr stats counter, then fill *stat from the (now fresh)
 * in-core inode. */
2843 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2844 struct lookup_intent *it, struct kstat *stat)
2846 struct inode *inode = de->d_inode;
2847 struct ll_sb_info *sbi = ll_i2sbi(inode);
2848 struct ll_inode_info *lli = ll_i2info(inode);
2851 res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE |
2852 MDS_INODELOCK_LOOKUP);
2853 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
2858 stat->dev = inode->i_sb->s_dev;
/* 32-bit userspace needs an ino that fits in 32 bits; build it from
 * the FID instead of using the 64-bit-derived i_ino. */
2859 if (ll_need_32bit_api(sbi))
2860 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1)
2862 stat->ino = inode->i_ino;
2863 stat->mode = inode->i_mode;
2864 stat->nlink = inode->i_nlink;
2865 stat->uid = inode->i_uid;
2866 stat->gid = inode->i_gid;
2867 stat->rdev = inode->i_rdev;
2868 stat->atime = inode->i_atime;
2869 stat->mtime = inode->i_mtime;
2870 stat->ctime = inode->i_ctime;
2871 stat->blksize = 1 << inode->i_blkbits;
2873 stat->size = i_size_read(inode);
2874 stat->blocks = inode->i_blocks;
/* VFS ->getattr entry point: delegates to ll_getattr_it() with a plain
 * IT_GETATTR intent. */
2878 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2880 struct lookup_intent it = { .it_op = IT_GETATTR };
2882 return ll_getattr_it(mnt, de, &it, stat);
2885 #ifdef HAVE_LINUX_FIEMAP_H
/* VFS ->fiemap handler: marshal the kernel fiemap_extent_info into a
 * Lustre ll_user_fiemap, run ll_do_fiemap(), and copy the mapped
 * extents back to the caller's buffer. */
2886 int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2887 __u64 start, __u64 len)
2891 struct ll_user_fiemap *fiemap;
2892 unsigned int extent_count = fieinfo->fi_extents_max;
2894 num_bytes = sizeof(*fiemap) + (extent_count *
2895 sizeof(struct ll_fiemap_extent));
2896 OBD_ALLOC_LARGE(fiemap, num_bytes);
2901 fiemap->fm_flags = fieinfo->fi_flags;
2902 fiemap->fm_extent_count = fieinfo->fi_extents_max;
2903 fiemap->fm_start = start;
2904 fiemap->fm_length = len;
/* NOTE(review): only ONE extent is copied in from userspace here —
 * presumably to seed continuation state for ll_do_fiemap(); confirm
 * against ll_do_fiemap() before changing. */
2905 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
2906 sizeof(struct ll_fiemap_extent));
2908 rc = ll_do_fiemap(inode, fiemap, num_bytes);
2910 fieinfo->fi_flags = fiemap->fm_flags;
2911 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
2913 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
2913 fiemap->fm_mapped_extents * sizeof(struct ll_fiemap_extent));
2915 OBD_FREE_LARGE(fiemap, num_bytes);
/* Return a referenced copy of the cached POSIX ACL of the given type.
 * The copy is taken under lli_lock; the VFS releases the reference. */
2920 struct posix_acl * ll_get_acl(struct inode *inode, int type)
2922 struct ll_inode_info *lli = ll_i2info(inode);
2923 struct posix_acl *acl = NULL;
2926 spin_lock(&lli->lli_lock);
2927 /* VFS' acl_permission_check->check_acl will release the refcount */
2928 acl = posix_acl_dup(lli->lli_posix_acl);
2929 spin_unlock(&lli->lli_lock);
/* ACL-based permission check callback for generic_permission(); the
 * signature varies with the kernel's generic_permission() prototype.
 * Under RCU walk (IPERM_FLAG_RCU) it must bail out (detail line not
 * visible here); without CONFIG_FS_POSIX_ACL it is a stub. */
2934 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
2936 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
2937 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
2939 ll_check_acl(struct inode *inode, int mask)
2942 # ifdef CONFIG_FS_POSIX_ACL
2943 struct posix_acl *acl;
2947 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
2948 if (flags & IPERM_FLAG_RCU)
2951 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
2956 rc = posix_acl_permission(inode, acl, mask);
2957 posix_acl_release(acl);
2960 # else /* !CONFIG_FS_POSIX_ACL */
2962 # endif /* CONFIG_FS_POSIX_ACL */
2964 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/* VFS ->permission entry point (signature varies by kernel version).
 * Revalidates the root inode before checking, defers remote-client
 * permission to the MDS, and otherwise uses generic_permission() with
 * the ll_check_acl callback. */
2966 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
2967 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
2969 # ifdef HAVE_INODE_PERMISION_2ARGS
2970 int ll_inode_permission(struct inode *inode, int mask)
2972 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
/* RCU path walk cannot block on RPCs; bail out (return value line not
 * visible here). */
2979 #ifdef MAY_NOT_BLOCK
2980 if (mask & MAY_NOT_BLOCK)
2982 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
2983 if (flags & IPERM_FLAG_RCU)
2987 /* as root inode are NOT getting validated in lookup operation,
2988 * need to do it before permission check. */
2990 if (inode == inode->i_sb->s_root->d_inode) {
2991 struct lookup_intent it = { .it_op = IT_LOOKUP };
2993 rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
2994 MDS_INODELOCK_LOOKUP);
2999 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
3000 inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
/* Remote clients (RMT_CLIENT) must ask the MDS for permission. */
3002 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
3003 return lustre_check_remote_perm(inode, mask);
3005 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
3006 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
/* Map the vectored-I/O file_operations member names to the right
 * implementation for this kernel: readv/writev on old kernels,
 * aio_read/aio_write on newer ones. */
3011 #ifdef HAVE_FILE_READV
3012 #define READ_METHOD readv
3013 #define READ_FUNCTION ll_file_readv
3014 #define WRITE_METHOD writev
3015 #define WRITE_FUNCTION ll_file_writev
3017 #define READ_METHOD aio_read
3018 #define READ_FUNCTION ll_file_aio_read
3019 #define WRITE_METHOD aio_write
3020 #define WRITE_FUNCTION ll_file_aio_write
3023 /* -o localflock - only provides locally consistent flock locks */
/* Default file_operations table: no .flock/.lock members, so flock()
 * falls back to local (per-node) semantics. */
3024 struct file_operations ll_file_operations = {
3025 .read = ll_file_read,
3026 .READ_METHOD = READ_FUNCTION,
3027 .write = ll_file_write,
3028 .WRITE_METHOD = WRITE_FUNCTION,
3029 .unlocked_ioctl = ll_file_ioctl,
3030 .open = ll_file_open,
3031 .release = ll_file_release,
3032 .mmap = ll_file_mmap,
3033 .llseek = ll_file_seek,
3034 #ifdef HAVE_KERNEL_SENDFILE
3035 .sendfile = ll_file_sendfile,
3037 #ifdef HAVE_KERNEL_SPLICE_READ
3038 .splice_read = ll_file_splice_read,
/* file_operations for -o flock mounts: identical to the default table
 * but routes both flock() and POSIX lock() through ll_file_flock for
 * cluster-coherent locking. */
3044 struct file_operations ll_file_operations_flock = {
3045 .read = ll_file_read,
3046 .READ_METHOD = READ_FUNCTION,
3047 .write = ll_file_write,
3048 .WRITE_METHOD = WRITE_FUNCTION,
3049 .unlocked_ioctl = ll_file_ioctl,
3050 .open = ll_file_open,
3051 .release = ll_file_release,
3052 .mmap = ll_file_mmap,
3053 .llseek = ll_file_seek,
3054 #ifdef HAVE_KERNEL_SENDFILE
3055 .sendfile = ll_file_sendfile,
3057 #ifdef HAVE_KERNEL_SPLICE_READ
3058 .splice_read = ll_file_splice_read,
3062 .flock = ll_file_flock,
3063 .lock = ll_file_flock
3066 /* These are for -o noflock - to return ENOSYS on flock calls */
3067 struct file_operations ll_file_operations_noflock = {
3068 .read = ll_file_read,
3069 .READ_METHOD = READ_FUNCTION,
3070 .write = ll_file_write,
3071 .WRITE_METHOD = WRITE_FUNCTION,
3072 .unlocked_ioctl = ll_file_ioctl,
3073 .open = ll_file_open,
3074 .release = ll_file_release,
3075 .mmap = ll_file_mmap,
3076 .llseek = ll_file_seek,
3077 #ifdef HAVE_KERNEL_SENDFILE
3078 .sendfile = ll_file_sendfile,
3080 #ifdef HAVE_KERNEL_SPLICE_READ
3081 .splice_read = ll_file_splice_read,
/* Both lock entry points route to the stub that rejects locking. */
3085 .flock = ll_file_noflock,
3086 .lock = ll_file_noflock
/* inode_operations for regular files: attr, permission, xattr, and —
 * where the kernel supports them — fiemap and get_acl. */
3089 struct inode_operations ll_file_inode_operations = {
3090 .setattr = ll_setattr,
3091 .getattr = ll_getattr,
3092 .permission = ll_inode_permission,
3093 .setxattr = ll_setxattr,
3094 .getxattr = ll_getxattr,
3095 .listxattr = ll_listxattr,
3096 .removexattr = ll_removexattr,
3097 #ifdef HAVE_LINUX_FIEMAP_H
3098 .fiemap = ll_fiemap,
3100 #ifdef HAVE_IOP_GET_ACL
3101 .get_acl = ll_get_acl,
3105 /* dynamic ioctl number support routines */
/* Global registry of dynamically registered ioctl handlers, protected
 * by a read-write semaphore. */
3106 static struct llioc_ctl_data {
3107 struct rw_semaphore ioc_sem;
3108 cfs_list_t ioc_head;
3110 __RWSEM_INITIALIZER(llioc.ioc_sem),
3111 CFS_LIST_HEAD_INIT(llioc.ioc_head)
/* One registration: a callback plus the flexible array of ioctl cmd
 * numbers it handles (iocd_size is the full allocation size). */
3116 cfs_list_t iocd_list;
3117 unsigned int iocd_size;
3118 llioc_callback_t iocd_cb;
3119 unsigned int iocd_count;
3120 unsigned int iocd_cmd[0];
/* Register a dynamic ioctl handler for 'count' command numbers.
 * Returns an opaque cookie (the allocated record — return line not
 * visible here) to pass to ll_iocontrol_unregister(). */
3123 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3126 struct llioc_data *in_data = NULL;
/* Validate arguments and bound the command array. */
3129 if (cb == NULL || cmd == NULL ||
3130 count > LLIOC_MAX_CMD || count < 0)
3133 size = sizeof(*in_data) + count * sizeof(unsigned int);
3134 OBD_ALLOC(in_data, size);
3135 if (in_data == NULL)
3138 memset(in_data, 0, sizeof(*in_data));
3139 in_data->iocd_size = size;
3140 in_data->iocd_cb = cb;
3141 in_data->iocd_count = count;
3142 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
/* Publish under the write lock. */
3144 down_write(&llioc.ioc_sem);
3145 cfs_list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3146 up_write(&llioc.ioc_sem);
/* Unregister a handler previously returned by ll_iocontrol_register():
 * find the matching record under the write lock, unlink and free it.
 * Warns if the cookie is not found. */
3151 void ll_iocontrol_unregister(void *magic)
3153 struct llioc_data *tmp;
3158 down_write(&llioc.ioc_sem);
3159 cfs_list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* Save the size before freeing — OBD_FREE needs it. */
3161 unsigned int size = tmp->iocd_size;
3163 cfs_list_del(&tmp->iocd_list);
3164 up_write(&llioc.ioc_sem);
3166 OBD_FREE(tmp, size);
3170 up_write(&llioc.ioc_sem);
3172 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3175 EXPORT_SYMBOL(ll_iocontrol_register);
3176 EXPORT_SYMBOL(ll_iocontrol_unregister);
/* Dispatch an unrecognized ioctl to the registered dynamic handlers.
 * Scans every registration under the read lock; the first callback
 * returning LLIOC_STOP ends the search, and its rc is stored in *rcp. */
3178 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
3179 unsigned int cmd, unsigned long arg, int *rcp)
3181 enum llioc_iter ret = LLIOC_CONT;
3182 struct llioc_data *data;
3183 int rc = -EINVAL, i;
3185 down_read(&llioc.ioc_sem);
3186 cfs_list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3187 for (i = 0; i < data->iocd_count; i++) {
3188 if (cmd != data->iocd_cmd[i])
3191 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3195 if (ret == LLIOC_STOP)
3198 up_read(&llioc.ioc_sem);
/* Push a layout configuration into the cl_object stack for this inode.
 * For OBJECT_CONF_SET, the layout must be applied before other threads
 * may match the layout lock, so matching is enabled only afterwards. */
3205 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3207 struct ll_inode_info *lli = ll_i2info(inode);
3208 struct cl_env_nest nest;
/* No cl_object yet — nothing to configure (return not visible here). */
3213 if (lli->lli_clob == NULL)
3216 env = cl_env_nested_get(&nest);
3218 RETURN(PTR_ERR(env));
3220 result = cl_conf_set(env, lli->lli_clob, conf);
3221 cl_env_nested_put(&nest, env);
3223 if (conf->coc_opc == OBJECT_CONF_SET) {
3224 struct ldlm_lock *lock = conf->coc_lock;
3226 LASSERT(lock != NULL);
3227 LASSERT(ldlm_has_layout(lock));
3229 /* it can only be allowed to match after layout is
3230 * applied to inode otherwise false layout would be
3231 * seen. Applying layout shoud happen before dropping
3232 * the intent lock. */
3233 ldlm_lock_allow_match(lock);
3239 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
3240 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3243 struct ll_sb_info *sbi = ll_i2sbi(inode);
3244 struct obd_capa *oc;
3245 struct ptlrpc_request *req;
3246 struct mdt_body *body;
/* Layout already attached to the lock — nothing to fetch. */
3253 if (lock->l_lvb_data != NULL)
3256 /* if layout lock was granted right away, the layout is returned
3257 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3258 * blocked and then granted via completion ast, we have to fetch
3259 * layout here. Please note that we can't use the LVB buffer in
3260 * completion AST because it doesn't have a large enough buffer */
3261 oc = ll_mdscapa_get(inode);
3262 rc = ll_get_max_mdsize(sbi, &lmmsize);
3264 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3265 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3271 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
/* Server must not return more EA data than it advertised. */
3272 if (body == NULL || body->eadatasize > lmmsize)
3273 GOTO(out, rc = -EPROTO);
3275 lmmsize = body->eadatasize;
3276 if (lmmsize == 0) /* empty layout */
3279 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3281 GOTO(out, rc = -EFAULT);
3283 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3284 if (lvbdata == NULL)
3285 GOTO(out, rc = -ENOMEM);
3287 memcpy(lvbdata, lmm, lmmsize);
/* Attach the buffer to the lock only if nobody raced us to it;
 * otherwise free our copy below. */
3288 lock_res_and_lock(lock);
3289 if (lock->l_lvb_data == NULL) {
3290 lock->l_lvb_data = lvbdata;
3291 lock->l_lvb_len = lmmsize;
3294 unlock_res_and_lock(lock);
3296 if (lvbdata != NULL)
3297 OBD_FREE_LARGE(lvbdata, lmmsize);
3301 ptlrpc_req_finished(req);
3306 * Apply the layout to the inode. Layout lock is held and will be released
/* Apply the layout carried by a granted layout lock to the inode and
 * report the resulting layout generation in *gen. The lock reference
 * is dropped before returning; if the object is still busy the caller
 * waits for in-flight IO via OBJECT_CONF_WAIT. */
3309 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3310 struct inode *inode, __u32 *gen, bool reconf)
3312 struct ll_inode_info *lli = ll_i2info(inode);
3313 struct ll_sb_info *sbi = ll_i2sbi(inode);
3314 struct ldlm_lock *lock;
3315 struct lustre_md md = { NULL };
3316 struct cl_object_conf conf;
3319 bool wait_layout = false;
3322 LASSERT(lustre_handle_is_used(lockh));
3324 lock = ldlm_handle2lock(lockh);
3325 LASSERT(lock != NULL);
3326 LASSERT(ldlm_has_layout(lock));
3328 LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
3329 inode, PFID(&lli->lli_fid), reconf);
3331 lock_res_and_lock(lock);
3332 lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
3333 unlock_res_and_lock(lock);
3334 /* checking lvb_ready is racy but this is okay. The worst case is
3335 * that multi processes may configure the file on the same time. */
3336 if (lvb_ready || !reconf) {
3339 /* layout_gen must be valid if layout lock is not
3340 * cancelled and stripe has already set */
3341 *gen = lli->lli_layout_gen;
/* Make sure the lock carries the layout (fetch from MDT if it was
 * granted via completion AST without LVB data). */
3347 rc = ll_layout_fetch(inode, lock);
3351 /* for layout lock, lmm is returned in lock's lvb.
3352 * lvb_data is immutable if the lock is held so it's safe to access it
3353 * without res lock. See the description in ldlm_lock_decref_internal()
3354 * for the condition to free lvb_data of layout lock */
3355 if (lock->l_lvb_data != NULL) {
3356 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3357 lock->l_lvb_data, lock->l_lvb_len);
3359 *gen = LL_LAYOUT_GEN_EMPTY;
3361 *gen = md.lsm->lsm_layout_gen;
3364 CERROR("%s: file "DFID" unpackmd error: %d\n",
3365 ll_get_fsname(inode->i_sb, NULL, 0),
3366 PFID(&lli->lli_fid), rc);
3372 /* set layout to file. Unlikely this will fail as old layout was
3373 * surely eliminated */
3374 memset(&conf, 0, sizeof conf);
3375 conf.coc_opc = OBJECT_CONF_SET;
3376 conf.coc_inode = inode;
3377 conf.coc_lock = lock;
3378 conf.u.coc_md = &md;
3379 rc = ll_layout_conf(inode, &conf);
3382 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3384 /* refresh layout failed, need to wait */
3385 wait_layout = rc == -EBUSY;
3389 LDLM_LOCK_PUT(lock);
3390 ldlm_lock_decref(lockh, mode);
3392 /* wait for IO to complete if it's still being used. */
3394 CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
3395 ll_get_fsname(inode->i_sb, NULL, 0),
3396 inode, PFID(&lli->lli_fid));
3398 memset(&conf, 0, sizeof conf);
3399 conf.coc_opc = OBJECT_CONF_WAIT;
3400 conf.coc_inode = inode;
3401 rc = ll_layout_conf(inode, &conf);
3405 CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
3406 PFID(&lli->lli_fid), rc);
3412 * This function checks if there exists a LAYOUT lock on the client side,
3413 * or enqueues it if it doesn't have one in cache.
3415 * This function will not hold layout lock so it may be revoked any time after
3416 * this function returns. Any operations depend on layout should be redone
3419 * This function should be called before lov_io_init() to get an uptodate
3420 * layout version, the caller should save the version number and after IO
3421 * is finished, this function should be called again to verify that layout
3422 * is not changed during IO time.
3424 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3426 struct ll_inode_info *lli = ll_i2info(inode);
3427 struct ll_sb_info *sbi = ll_i2sbi(inode);
3428 struct md_op_data *op_data;
3429 struct lookup_intent it;
3430 struct lustre_handle lockh;
3432 struct ldlm_enqueue_info einfo = { .ei_type = LDLM_IBITS,
3434 .ei_cb_bl = ll_md_blocking_ast,
3435 .ei_cb_cp = ldlm_completion_ast,
3436 .ei_cbdata = NULL };
3440 *gen = lli->lli_layout_gen;
3441 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK))
3445 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3446 LASSERT(S_ISREG(inode->i_mode));
3448 /* mostly layout lock is caching on the local side, so try to match
3449 * it before grabbing layout lock mutex. */
3450 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
3451 if (mode != 0) { /* hit cached lock */
3452 rc = ll_layout_lock_set(&lockh, mode, inode, gen, false);
3456 /* better hold lli_layout_mutex to try again otherwise
3457 * it will have starvation problem. */
3460 /* take layout lock mutex to enqueue layout lock exclusively. */
3461 mutex_lock(&lli->lli_layout_mutex);
3464 /* try again. Maybe somebody else has done this. */
3465 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
3466 if (mode != 0) { /* hit cached lock */
3467 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3471 mutex_unlock(&lli->lli_layout_mutex);
3475 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3476 0, 0, LUSTRE_OPC_ANY, NULL);
3477 if (IS_ERR(op_data)) {
3478 mutex_unlock(&lli->lli_layout_mutex);
3479 RETURN(PTR_ERR(op_data));
3482 /* have to enqueue one */
3483 memset(&it, 0, sizeof(it));
3484 it.it_op = IT_LAYOUT;
3485 lockh.cookie = 0ULL;
3487 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
3488 ll_get_fsname(inode->i_sb, NULL, 0), inode,
3489 PFID(&lli->lli_fid));
3491 rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
3493 if (it.d.lustre.it_data != NULL)
3494 ptlrpc_req_finished(it.d.lustre.it_data);
3495 it.d.lustre.it_data = NULL;
3497 ll_finish_md_op_data(op_data);
3499 md_set_lock_data(sbi->ll_md_exp, &it.d.lustre.it_lock_handle, inode, NULL);
3501 mode = it.d.lustre.it_lock_mode;
3502 it.d.lustre.it_lock_mode = 0;
3503 ll_intent_drop_lock(&it);
3506 /* set lock data in case this is a new lock */
3507 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3508 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3512 mutex_unlock(&lli->lli_layout_mutex);