4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2012, Whamcloud, Inc.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include "llite_internal.h"
49 #include <lustre/ll_fiemap.h>
51 #include "cl_object.h"
/*
 * Allocate a per-open ll_file_data from the dedicated slab cache
 * (ll_file_data_slab), using IO-safe allocation flags (CFS_ALLOC_IO).
 * NOTE(review): this listing has elided lines (embedded line numbers are
 * non-contiguous); the NULL-check/RETURN are presumably in the omitted part.
 */
53 struct ll_file_data *ll_file_data_get(void)
55 struct ll_file_data *fd;
57 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, CFS_ALLOC_IO);
/* Release a ll_file_data previously obtained from ll_file_data_get()
 * back to its slab cache. */
61 static void ll_file_data_put(struct ll_file_data *fd)
64 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Pack the inode's current attributes into @op_data for an MDS request:
 * fid, mode, a/m/ctime, size, blocks, flags, current IO epoch, the open
 * handle @fh and an MDS capability reference.
 * NOTE(review): the capa reference taken via ll_mdscapa_get() is presumably
 * released by the caller/op_data teardown — confirm against full source.
 */
67 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
68 struct lustre_handle *fh)
70 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
71 op_data->op_attr.ia_mode = inode->i_mode;
72 op_data->op_attr.ia_atime = inode->i_atime;
73 op_data->op_attr.ia_mtime = inode->i_mtime;
74 op_data->op_attr.ia_ctime = inode->i_ctime;
75 op_data->op_attr.ia_size = i_size_read(inode);
76 op_data->op_attr_blocks = inode->i_blocks;
/* ia_attr_flags lives in the llite-extended iattr; translate inode flags
 * to on-the-wire (ext) flag format. */
77 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
78 ll_inode_to_ext_flags(inode->i_flags);
79 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
81 op_data->op_handle = *fh;
82 op_data->op_capa1 = ll_mdscapa_get(inode);
/*
 * Close the IO epoch and pack all the attributes into @op_data for the
 * CLOSE rpc to the MDS.  mode/times are always valid; size/blocks are only
 * declared valid when Size-on-MDS is not in use (no SOM support on the MDS
 * export, or not a regular file), since otherwise the MDS obtains them via
 * the SOM protocol.
 */
89 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
90 struct obd_client_handle *och)
94 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
95 ATTR_MTIME_SET | ATTR_CTIME_SET;
/* Read-only opens never carry size/blocks to the MDS (body elided here). */
97 if (!(och->och_flags & FMODE_WRITE))
100 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
101 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* May set MF_EPOCH_CLOSE in op_data->op_flags; checked by the caller. */
103 ll_ioepoch_close(inode, op_data, &och, 0);
106 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
107 ll_prep_md_op_data(op_data, inode, NULL, NULL,
108 0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Send a CLOSE rpc for open handle @och on @inode through @md_exp.
 * Handles the Size-on-MDS update when the MDS requests it, destroys OST
 * objects referenced by the close reply, clears replay data and poisons
 * the handle cookie.  NOTE(review): excerpt has elided lines; error paths
 * between the visible statements are not fully shown.
 */
112 static int ll_close_inode_openhandle(struct obd_export *md_exp,
114 struct obd_client_handle *och)
116 struct obd_export *exp = ll_i2mdexp(inode);
117 struct md_op_data *op_data;
118 struct ptlrpc_request *req = NULL;
119 struct obd_device *obd = class_exp2obd(exp);
126 * XXX: in case of LMV, is this correct to access
129 CERROR("Invalid MDC connection handle "LPX64"\n",
130 ll_i2mdexp(inode)->exp_handle.h_cookie);
134 OBD_ALLOC_PTR(op_data);
136 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
138 ll_prepare_close(inode, op_data, och);
/* Remember whether this close ends the IO epoch before op_data is reused. */
139 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
140 rc = md_close(md_exp, op_data, och->och_mod, &req);
142 /* This close must have the epoch closed. */
143 LASSERT(epoch_close);
144 /* MDS has instructed us to obtain Size-on-MDS attribute from
145 * OSTs and send setattr back to MDS. */
146 rc = ll_som_update(inode, op_data);
148 CERROR("inode %lu mdc Size-on-MDS update failed: "
149 "rc = %d\n", inode->i_ino, rc);
153 CERROR("inode %lu mdc close failed: rc = %d\n",
156 ll_finish_md_op_data(op_data);
/* Destroy OST objects the close reply tells us are no longer referenced. */
159 rc = ll_objects_destroy(req, inode);
161 CERROR("inode %lu ll_objects destroy: rc = %d\n",
/* Epoch still open on a SOM-enabled regular file opened for write:
 * defer the DONE_WRITING work. */
168 if (exp_connect_som(exp) && !epoch_close &&
169 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
170 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
172 md_clear_open_replay_data(md_exp, och);
/* Free @och if it is not waiting for DONE_WRITING. */
174 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
177 if (req) /* This is close request */
178 ptlrpc_req_finished(req);
/*
 * Really close the MDS open handle for @inode matching @flags
 * (FMODE_WRITE / FMODE_EXEC / FMODE_READ).  If other file descriptors
 * still use the handle (usecount != 0) the close is skipped; otherwise the
 * handle pointer is detached under lli_och_mutex and closed outside it.
 */
182 int ll_md_real_close(struct inode *inode, int flags)
184 struct ll_inode_info *lli = ll_i2info(inode);
185 struct obd_client_handle **och_p;
186 struct obd_client_handle *och;
/* Select the per-mode handle slot and its usage counter. */
191 if (flags & FMODE_WRITE) {
192 och_p = &lli->lli_mds_write_och;
193 och_usecount = &lli->lli_open_fd_write_count;
194 } else if (flags & FMODE_EXEC) {
195 och_p = &lli->lli_mds_exec_och;
196 och_usecount = &lli->lli_open_fd_exec_count;
198 LASSERT(flags & FMODE_READ);
199 och_p = &lli->lli_mds_read_och;
200 och_usecount = &lli->lli_open_fd_read_count;
203 cfs_mutex_lock(&lli->lli_och_mutex);
204 if (*och_usecount) { /* There are still users of this handle, so
206 cfs_mutex_unlock(&lli->lli_och_mutex);
211 cfs_mutex_unlock(&lli->lli_och_mutex);
213 if (och) { /* There might be a race and somebody have freed this och
215 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-file-descriptor close: drop any group lock, decrement the per-mode
 * open counter under lli_och_mutex, and only talk to the MDS
 * (ll_md_real_close) when no cached OPEN dlm lock covers this open mode.
 * Finally detach and free the ll_file_data and close the capability.
 */
222 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
225 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
226 struct ll_inode_info *lli = ll_i2info(inode);
/* clear group lock, if present */
231 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
232 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
/* Let's see if we have a good enough OPEN lock on the file and if
 * we can skip talking to the MDS. */
236 if (file->f_dentry->d_inode) { /* Can this ever be false? */
/* LDLM_FL_TEST_LOCK: md_lock_match only probes, it takes no reference. */
238 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
239 struct lustre_handle lockh;
240 struct inode *inode = file->f_dentry->d_inode;
241 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
243 cfs_mutex_lock(&lli->lli_och_mutex);
244 if (fd->fd_omode & FMODE_WRITE) {
246 LASSERT(lli->lli_open_fd_write_count);
247 lli->lli_open_fd_write_count--;
248 } else if (fd->fd_omode & FMODE_EXEC) {
250 LASSERT(lli->lli_open_fd_exec_count);
251 lli->lli_open_fd_exec_count--;
254 LASSERT(lli->lli_open_fd_read_count);
255 lli->lli_open_fd_read_count--;
257 cfs_mutex_unlock(&lli->lli_och_mutex);
/* No cached OPEN lock for this mode: the MDS must be told now. */
259 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
260 LDLM_IBITS, &policy, lockmode,
262 rc = ll_md_real_close(file->f_dentry->d_inode,
266 CERROR("Releasing a file %p with negative dentry %p. Name %s",
267 file, file->f_dentry, file->f_dentry->d_name.name);
270 LUSTRE_FPRIVATE(file) = NULL;
271 ll_file_data_put(fd);
272 ll_capa_close(inode);
277 int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm);
/* While this returns an error code, fput() the caller does not, so we need
 * to make every effort to clean up all of our state here. Also, applications
 * rarely check close errors and even if an error is returned they will not
 * re-try the close call.
 *
 * VFS ->release() entry point: remote-ACL cleanup on the root inode,
 * statahead shutdown for directories, async-rc collection for regular
 * files, then the real per-fd close via ll_md_close(). */
284 int ll_file_release(struct inode *inode, struct file *file)
286 struct ll_file_data *fd;
287 struct ll_sb_info *sbi = ll_i2sbi(inode);
288 struct ll_inode_info *lli = ll_i2info(inode);
289 struct lov_stripe_md *lsm = lli->lli_smd;
293 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
294 inode->i_generation, inode);
296 #ifdef CONFIG_FS_POSIX_ACL
/* Remote-client ACL state is keyed by pid; tear it down on root release. */
297 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
298 inode == inode->i_sb->s_root->d_inode) {
299 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
302 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
303 fd->fd_flags &= ~LL_FILE_RMTACL;
304 rct_del(&sbi->ll_rct, cfs_curproc_pid());
305 et_search_free(&sbi->ll_et, cfs_curproc_pid());
/* Do not count releases of the root dentry in per-mount stats. */
310 if (inode->i_sb->s_root != file->f_dentry)
311 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
312 fd = LUSTRE_FPRIVATE(file);
/* The last ref on @file, maybe not the owner pid of statahead.
 * Different processes can open the same dir, "ll_opendir_key" means:
 * it is me that should stop the statahead thread. */
318 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
319 lli->lli_opendir_pid != 0)
320 ll_stop_statahead(inode, lli->lli_opendir_key);
/* Root dentry has no MDS open handle: just drop the fd and return. */
322 if (inode->i_sb->s_root == file->f_dentry) {
323 LUSTRE_FPRIVATE(file) = NULL;
324 ll_file_data_put(fd);
/* Collect any deferred async write error so close() can report it. */
328 if (!S_ISDIR(inode->i_mode)) {
330 lov_test_and_clear_async_rc(lsm);
331 lli->lli_async_rc = 0;
334 rc = ll_md_close(sbi->ll_md_exp, inode, file);
/* Fault-injection hook: optionally dump the debug log on close. */
336 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
337 libcfs_debug_dumplog();
/*
 * Perform an intent-based open against the MDS for @file, optionally
 * carrying striping info (@lmm/@lmmsize) when called from setstripe.
 * On success the reply inode is installed and lock data recorded; on
 * -ESTALE-style failures a quieter exit path releases the open handle.
 */
342 static int ll_intent_file_open(struct file *file, void *lmm,
343 int lmmsize, struct lookup_intent *itp)
345 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
346 struct dentry *parent = file->f_dentry->d_parent;
347 const char *name = file->f_dentry->d_name.name;
348 const int len = file->f_dentry->d_name.len;
349 struct md_op_data *op_data;
350 struct ptlrpc_request *req;
351 __u32 opc = LUSTRE_OPC_ANY;
/* Usually we come here only for NFSD, and we want an open lock.
 * But we can also get here with pre-2.6.15 patchless kernels, and in
 * that case that lock is also ok. */
/* We can also get here if there was a cached open handle in revalidate_it
 * but it disappeared while we were getting from there to ll_file_open.
 * But this means this file was closed and immediately opened, which
 * makes it a good candidate for using the OPEN lock. */
/* If lmmsize & lmm are not 0, we are just setting stripe info
 * parameters. No need for the open lock. */
367 if (lmm == NULL && lmmsize == 0) {
368 itp->it_flags |= MDS_OPEN_LOCK;
369 if (itp->it_flags & FMODE_WRITE)
370 opc = LUSTRE_OPC_CREATE;
373 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
374 file->f_dentry->d_inode, name, len,
377 RETURN(PTR_ERR(op_data));
379 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
380 0 /*unused */, &req, ll_md_blocking_ast, 0);
381 ll_finish_md_op_data(op_data);
/* Reason for keeping our own exit path: don't flood the log
 * with -ESTALE error messages. */
386 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
387 it_open_error(DISP_OPEN_OPEN, itp))
389 ll_release_openhandle(file->f_dentry, itp);
393 if (it_disposition(itp, DISP_LOOKUP_NEG))
394 GOTO(out, rc = -ENOENT);
396 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
397 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
398 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
402 rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL);
403 if (!rc && itp->d.lustre.it_lock_mode)
404 ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
/* Cleanup path (label elided in excerpt): drop request ref and lock. */
408 ptlrpc_req_finished(itp->d.lustre.it_data);
409 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
410 ll_intent_drop_lock(itp);
/*
 * Assign an obtained @ioepoch to the client's inode. No lock is needed:
 * the MDS does not believe attributes if several ioepoch holders exist,
 * and attributes for a previous ioepoch (when a new one is opened) are
 * also skipped by the MDS.
 */
420 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
/* Only record a non-zero epoch that actually differs from the current one. */
422 if (ioepoch && lli->lli_ioepoch != ioepoch) {
423 lli->lli_ioepoch = ioepoch;
424 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
425 ioepoch, PFID(&lli->lli_fid));
/*
 * Fill @och from the MDS open reply carried in @it: copy the open file
 * handle from the reply body, record fid/flags, open the returned IO
 * epoch, and register the request for open replay.
 * Returns the result of md_set_open_replay_data().
 */
429 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
430 struct lookup_intent *it, struct obd_client_handle *och)
432 struct ptlrpc_request *req = it->d.lustre.it_data;
433 struct mdt_body *body;
437 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
438 LASSERT(body != NULL); /* reply already checked out */
440 memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
441 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
442 och->och_fid = lli->lli_fid;
443 och->och_flags = it->it_flags;
444 ll_ioepoch_open(lli, body->ioepoch);
446 return md_set_open_replay_data(md_exp, och, req);
/*
 * Complete the client-local part of an open: optionally fill @och from the
 * intent reply, attach @fd as the file's private data, initialize
 * readahead state, and remember the open mode in fd_omode.
 */
449 int ll_local_open(struct file *file, struct lookup_intent *it,
450 struct ll_file_data *fd, struct obd_client_handle *och)
452 struct inode *inode = file->f_dentry->d_inode;
453 struct ll_inode_info *lli = ll_i2info(inode);
/* ll_file_open cleared private_data earlier; it must be unset here. */
456 LASSERT(!LUSTRE_FPRIVATE(file));
461 struct ptlrpc_request *req = it->d.lustre.it_data;
462 struct mdt_body *body;
465 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
469 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
/* Debug aid: note when a write open already carries a server size. */
470 if ((it->it_flags & FMODE_WRITE) &&
471 (body->valid & OBD_MD_FLSIZE))
472 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
473 lli->lli_ioepoch, PFID(&lli->lli_fid));
476 LUSTRE_FPRIVATE(file) = fd;
477 ll_readahead_init(inode, &fd->fd_ras);
478 fd->fd_omode = it->it_flags;
/* Open a file, and (for the very first open) create objects on the OSTs at
 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
 * creation or open until the ll_lov_setstripe() ioctl is called.
 *
 * If we already have the stripe MD locally then we don't request it in
 * md_open(), by passing lmm_size = 0.
 *
 * It is up to the application to ensure no other processes open this file
 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 * used. We might be able to avoid races of that sort by getting lli_open_sem
 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 *
 * Flow (NOTE(review): excerpt has elided lines between steps):
 *  1. take over the intent stashed in file->private_data, get an fd;
 *  2. for directories, claim the statahead "opendir key";
 *  3. if no intent/disposition, synthesize an IT_OPEN intent from f_flags;
 *  4. reuse an existing per-mode MDS open handle when present, otherwise
 *     enqueue a fresh intent open (outside lli_och_mutex to avoid deadlock
 *     with the blocking AST) and allocate a new handle;
 *  5. finish with ll_local_open() and delayed-create handling. */
495 int ll_file_open(struct inode *inode, struct file *file)
497 struct ll_inode_info *lli = ll_i2info(inode);
498 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
499 .it_flags = file->f_flags };
500 struct lov_stripe_md *lsm;
501 struct obd_client_handle **och_p = NULL;
502 __u64 *och_usecount = NULL;
503 struct ll_file_data *fd;
504 int rc = 0, opendir_set = 0;
507 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
508 inode->i_generation, inode, file->f_flags);
510 it = file->private_data; /* XXX: compat macro */
511 file->private_data = NULL; /* prevent ll_local_open assertion */
513 fd = ll_file_data_get();
515 GOTO(out_och_free, rc = -ENOMEM);
/* First opener of a directory becomes the statahead owner. */
518 if (S_ISDIR(inode->i_mode)) {
519 cfs_spin_lock(&lli->lli_sa_lock);
520 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
521 lli->lli_opendir_pid == 0) {
522 lli->lli_opendir_key = fd;
523 lli->lli_opendir_pid = cfs_curproc_pid();
526 cfs_spin_unlock(&lli->lli_sa_lock);
/* The root dentry needs no MDS open handle. */
529 if (inode->i_sb->s_root == file->f_dentry) {
530 LUSTRE_FPRIVATE(file) = fd;
534 if (!it || !it->d.lustre.it_disposition) {
/* Convert f_flags into access mode. We cannot use file->f_mode,
 * because everything but the O_ACCMODE mask was stripped from it. */
538 if ((oit.it_flags + 1) & O_ACCMODE)
540 if (file->f_flags & O_TRUNC)
541 oit.it_flags |= FMODE_WRITE;
/* The kernel only calls f_op->open in dentry_open. filp_open calls
 * dentry_open after a call to open_namei that checks permissions.
 * Only nfsd_open calls dentry_open directly without checking
 * permissions, and because of that the code below is safe. */
547 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
548 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
/* We do not want O_EXCL here, presumably we opened the file
 * already? XXX - NFS implications? */
552 oit.it_flags &= ~O_EXCL;
/* bug20584: if "it_flags" contains O_CREAT, the file will be
 * created if necessary, then "IT_CREAT" should be set to keep
 * consistent with it. */
557 if (oit.it_flags & O_CREAT)
558 oit.it_op |= IT_CREAT;
/* Let's see if we have the file open on the MDS already. */
565 if (it->it_flags & FMODE_WRITE) {
566 och_p = &lli->lli_mds_write_och;
567 och_usecount = &lli->lli_open_fd_write_count;
568 } else if (it->it_flags & FMODE_EXEC) {
569 och_p = &lli->lli_mds_exec_och;
570 och_usecount = &lli->lli_open_fd_exec_count;
572 och_p = &lli->lli_mds_read_och;
573 och_usecount = &lli->lli_open_fd_read_count;
576 cfs_mutex_lock(&lli->lli_och_mutex);
577 if (*och_p) { /* Open handle is present */
578 if (it_disposition(it, DISP_OPEN_OPEN)) {
/* Well, there's an extra open request that we do not need;
 * let's close it somehow. This will decref the request. */
581 rc = it_open_error(DISP_OPEN_OPEN, it);
583 cfs_mutex_unlock(&lli->lli_och_mutex);
584 GOTO(out_openerr, rc);
587 ll_release_openhandle(file->f_dentry, it);
591 rc = ll_local_open(file, it, fd, NULL);
594 cfs_mutex_unlock(&lli->lli_och_mutex);
595 GOTO(out_openerr, rc);
598 LASSERT(*och_usecount == 0);
599 if (!it->d.lustre.it_disposition) {
/* We cannot just request a lock handle now, the new ELC code
 * means that one of the other OPEN locks for this file
 * could be cancelled, and since the blocking ast handler
 * would attempt to grab och_mutex as well, that would
 * result in a deadlock. */
605 cfs_mutex_unlock(&lli->lli_och_mutex);
606 it->it_create_mode |= M_CHECK_STALE;
607 rc = ll_intent_file_open(file, NULL, 0, it);
608 it->it_create_mode &= ~M_CHECK_STALE;
610 GOTO(out_openerr, rc);
614 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
616 GOTO(out_och_free, rc = -ENOMEM);
/* md_intent_lock() didn't get a request ref if there was an
 * open error, so don't do cleanup on the request here. */
/* XXX (green): Should not we bail out on any error here, not
 * just open error? */
625 rc = it_open_error(DISP_OPEN_OPEN, it);
627 GOTO(out_och_free, rc);
629 LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
631 rc = ll_local_open(file, it, fd, *och_p);
633 GOTO(out_och_free, rc);
635 cfs_mutex_unlock(&lli->lli_och_mutex);
/* Must do this outside lli_och_mutex lock to prevent deadlock where
 * a different kind of OPEN lock for this same inode gets cancelled
 * by ldlm_cancel_lru. */
641 if (!S_ISREG(inode->i_mode))
642 GOTO(out_och_free, rc);
/* Delayed object creation: defer until setstripe or first write open. */
648 if (file->f_flags & O_LOV_DELAY_CREATE ||
649 !(file->f_mode & FMODE_WRITE)) {
650 CDEBUG(D_INODE, "object creation was delayed\n");
651 GOTO(out_och_free, rc);
654 file->f_flags &= ~O_LOV_DELAY_CREATE;
655 GOTO(out_och_free, rc);
/* Cleanup paths (labels elided in excerpt). */
658 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
659 ptlrpc_req_finished(it->d.lustre.it_data);
660 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
664 if (och_p && *och_p) {
665 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
666 *och_p = NULL; /* OBD_FREE writes some magic there */
669 cfs_mutex_unlock(&lli->lli_och_mutex);
672 if (opendir_set != 0)
673 ll_stop_statahead(inode, lli->lli_opendir_key);
675 ll_file_data_put(fd);
677 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* Fills the obdo with the attributes for the @lsm: issue an async getattr
 * over @exp for every stripe, wait for the set, and mask the returned
 * o_valid down to the fields the caller may trust.  @sync requests a
 * server-side lock (OBD_FL_SRVLOCK) for a consistent read. */
684 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
685 struct obd_capa *capa, struct obdo *obdo,
686 __u64 ioepoch, int sync)
688 struct ptlrpc_request_set *set;
689 struct obd_info oinfo = { { { 0 } } };
694 LASSERT(lsm != NULL);
698 oinfo.oi_oa->o_id = lsm->lsm_object_id;
699 oinfo.oi_oa->o_seq = lsm->lsm_object_seq;
700 oinfo.oi_oa->o_mode = S_IFREG;
701 oinfo.oi_oa->o_ioepoch = ioepoch;
/* Request every attribute class we know how to merge back into the inode. */
702 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
703 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
704 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
705 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
706 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
707 OBD_MD_FLDATAVERSION;
708 oinfo.oi_capa = capa;
/* @sync path (condition elided): take the server-side lock. */
710 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
711 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
714 set = ptlrpc_prep_set();
716 CERROR("can't allocate ptlrpc set\n");
719 rc = obd_getattr_async(exp, &oinfo, set);
721 rc = ptlrpc_set_wait(set);
722 ptlrpc_set_destroy(set);
/* Only these merged fields are meaningful to the caller. */
725 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
726 OBD_MD_FLATIME | OBD_MD_FLMTIME |
727 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
728 OBD_MD_FLDATAVERSION);
/*
 * Performs the getattr on the inode and updates its fields from the OSTs.
 * If @sync != 0, performs the getattr under the server-side lock.
 */
736 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
737 __u64 ioepoch, int sync)
739 struct ll_inode_info *lli = ll_i2info(inode);
740 struct obd_capa *capa = ll_mdscapa_get(inode);
744 rc = ll_lsm_getattr(lli->lli_smd, ll_i2dtexp(inode),
745 capa, obdo, ioepoch, sync);
/* Merge valid obdo fields back into the VFS inode. */
748 obdo_refresh_inode(inode, obdo, obdo->o_valid);
750 "objid "LPX64" size %llu, blocks %llu, blksize %lu\n",
751 lli->lli_smd->lsm_object_id, i_size_read(inode),
752 (unsigned long long)inode->i_blocks,
753 (unsigned long)ll_inode_blksize(inode));
/*
 * Merge the OST-side lock value block (size/blocks/timestamps) with the
 * MDS-side timestamps cached in lli_lvb, then publish the result into the
 * inode under the ll inode size lock.
 */
758 int ll_merge_lvb(struct inode *inode)
760 struct ll_inode_info *lli = ll_i2info(inode);
761 struct ll_sb_info *sbi = ll_i2sbi(inode);
767 ll_inode_size_lock(inode, 1);
768 inode_init_lvb(inode, &lvb);
/* Merge the timestamps most recently obtained from the MDS with
 * the timestamps obtained from the OSTs. */
772 lvb.lvb_atime = lli->lli_lvb.lvb_atime;
773 lvb.lvb_mtime = lli->lli_lvb.lvb_mtime;
774 lvb.lvb_ctime = lli->lli_lvb.lvb_ctime;
775 rc = obd_merge_lvb(sbi->ll_dt_exp, lli->lli_smd, &lvb, 0);
776 cl_isize_write_nolock(inode, lvb.lvb_size);
778 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
779 PFID(&lli->lli_fid), lvb.lvb_size);
780 inode->i_blocks = lvb.lvb_blocks;
782 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
783 LTIME_S(inode->i_atime) = lvb.lvb_atime;
784 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
785 ll_inode_size_unlock(inode, 1);
/*
 * Glimpse helper for ioctl paths: fetch current OST attributes for @lsm
 * (no capa, no epoch, no server lock) and copy size/blocks/timestamps into
 * the caller-supplied stat structure.
 */
790 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
793 struct obdo obdo = { 0 };
796 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
798 st->st_size = obdo.o_size;
799 st->st_blocks = obdo.o_blocks;
800 st->st_mtime = obdo.o_mtime;
801 st->st_atime = obdo.o_atime;
802 st->st_ctime = obdo.o_ctime;
/*
 * Initialize a cl_io for a read or write on @file: propagate O_NONBLOCK,
 * O_APPEND and O_SYNC semantics, attach the cl_object, and pick the dlm
 * locking mode (never for nolock mounts, mandatory for append, otherwise
 * "maybe").
 */
807 void ll_io_init(struct cl_io *io, const struct file *file, int write)
809 struct inode *inode = file->f_dentry->d_inode;
811 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
813 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
814 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC || IS_SYNC(inode);
816 io->ci_obj = ll_i2info(inode)->lli_clob;
817 io->ci_lockreq = CILR_MAYBE;
818 if (ll_file_nolock(file)) {
819 io->ci_lockreq = CILR_NEVER;
820 io->ci_no_srvlock = 1;
821 } else if (file->f_flags & O_APPEND) {
/* Appends must see a globally consistent EOF: lock is mandatory. */
822 io->ci_lockreq = CILR_MANDATORY;
/*
 * Common engine for all read/write entry points (read/readv/aio/sendfile/
 * splice): build a cl_io, dispatch per-subtype argument copying, serialize
 * writes against truncate via lli_write_mutex / lli_trunc_sem, run the
 * cl_io loop, then update *ppos, per-mount stats and lli_write_rc.
 * NOTE(review): excerpt has elided lines (case labels, ENTRY/RETURN).
 */
826 static ssize_t ll_file_io_generic(const struct lu_env *env,
827 struct vvp_io_args *args, struct file *file,
828 enum cl_io_type iot, loff_t *ppos, size_t count)
830 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
835 io = ccc_env_thread_io(env);
836 ll_io_init(io, file, iot == CIT_WRITE);
838 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
839 struct vvp_io *vio = vvp_env_io(env);
840 struct ccc_io *cio = ccc_env_io(env);
841 int write_mutex_locked = 0;
843 cio->cui_fd = LUSTRE_FPRIVATE(file);
844 vio->cui_io_subtype = args->via_io_subtype;
846 switch (vio->cui_io_subtype) {
/* IO_NORMAL: plain iovec-based read/write (case label elided). */
848 cio->cui_iov = args->u.normal.via_iov;
849 cio->cui_nrsegs = args->u.normal.via_nrsegs;
850 cio->cui_tot_nrsegs = cio->cui_nrsegs;
851 #ifndef HAVE_FILE_WRITEV
852 cio->cui_iocb = args->u.normal.via_iocb;
/* Writes (except group-locked ones) serialize on lli_write_mutex;
 * reads take the truncate semaphore shared. */
854 if ((iot == CIT_WRITE) &&
855 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
856 if (cfs_mutex_lock_interruptible(&lli->
858 GOTO(out, result = -ERESTARTSYS);
859 write_mutex_locked = 1;
860 } else if (iot == CIT_READ) {
861 cfs_down_read(&lli->lli_trunc_sem);
/* IO_SENDFILE subtype (case label elided). */
865 vio->u.sendfile.cui_actor = args->u.sendfile.via_actor;
866 vio->u.sendfile.cui_target = args->u.sendfile.via_target;
/* IO_SPLICE subtype (case label elided). */
869 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
870 vio->u.splice.cui_flags = args->u.splice.via_flags;
873 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
876 result = cl_io_loop(env, io);
877 if (write_mutex_locked)
878 cfs_mutex_unlock(&lli->lli_write_mutex);
879 else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
880 cfs_up_read(&lli->lli_trunc_sem);
/* cl_io_rw_init() handled IO */
883 result = io->ci_result;
886 if (io->ci_nob > 0) {
888 *ppos = io->u.ci_wr.wr.crw_pos;
894 if (iot == CIT_READ) {
896 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
897 LPROC_LL_READ_BYTES, result);
898 } else if (iot == CIT_WRITE) {
900 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
901 LPROC_LL_WRITE_BYTES, result);
/* Record write outcome for close() to report (success clears it). */
902 lli->lli_write_rc = 0;
904 lli->lli_write_rc = result;
/*
 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock):
 * validate an iovec array, accumulate the total byte count into *count,
 * and truncate *nr_segs at the first inaccessible segment.
 */
915 static int ll_file_get_iov_count(const struct iovec *iov,
916 unsigned long *nr_segs, size_t *count)
921 for (seg = 0; seg < *nr_segs; seg++) {
922 const struct iovec *iv = &iov[seg];
/* If any segment has a negative length, or the cumulative
 * length ever wraps negative, then return -EINVAL. */
929 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
/* accessible segment: continue (kernel idiom; continue line elided). */
931 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
936 cnt -= iv->iov_len; /* This segment is no good */
943 #ifdef HAVE_FILE_READV
/*
 * Vectored read entry point (pre-aio kernels): validate the iovec,
 * set up IO_NORMAL args in the cl environment and delegate to
 * ll_file_io_generic() with CIT_READ.
 */
944 static ssize_t ll_file_readv(struct file *file, const struct iovec *iov,
945 unsigned long nr_segs, loff_t *ppos)
948 struct vvp_io_args *args;
954 result = ll_file_get_iov_count(iov, &nr_segs, &count);
958 env = cl_env_get(&refcheck);
960 RETURN(PTR_ERR(env));
962 args = vvp_env_args(env, IO_NORMAL);
963 args->u.normal.via_iov = (struct iovec *)iov;
964 args->u.normal.via_nrsegs = nr_segs;
966 result = ll_file_io_generic(env, args, file, CIT_READ, ppos, count);
967 cl_env_put(env, &refcheck);
/*
 * Single-buffer read (readv-kernel variant): wrap @buf/@count in the
 * per-env scratch iovec and forward to ll_file_readv().
 */
971 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
975 struct iovec *local_iov;
980 env = cl_env_get(&refcheck);
982 RETURN(PTR_ERR(env));
984 local_iov = &vvp_env_info(env)->vti_local_iov;
985 local_iov->iov_base = (void __user *)buf;
986 local_iov->iov_len = count;
987 result = ll_file_readv(file, local_iov, 1, ppos);
988 cl_env_put(env, &refcheck);
/*
 * AIO read entry point: validate the iovec, stash iov/iocb into IO_NORMAL
 * args and run the generic IO engine with CIT_READ, advancing
 * iocb->ki_pos.
 */
993 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
994 unsigned long nr_segs, loff_t pos)
997 struct vvp_io_args *args;
1003 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1007 env = cl_env_get(&refcheck);
1009 RETURN(PTR_ERR(env));
1011 args = vvp_env_args(env, IO_NORMAL);
1012 args->u.normal.via_iov = (struct iovec *)iov;
1013 args->u.normal.via_nrsegs = nr_segs;
1014 args->u.normal.via_iocb = iocb;
1016 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1017 &iocb->ki_pos, count);
1018 cl_env_put(env, &refcheck);
/*
 * Single-buffer read (aio-kernel variant): build a synchronous kiocb plus
 * scratch iovec in the cl environment and forward to ll_file_aio_read(),
 * then propagate the updated position back to *ppos.
 */
1022 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1026 struct iovec *local_iov;
1027 struct kiocb *kiocb;
1032 env = cl_env_get(&refcheck);
1034 RETURN(PTR_ERR(env));
1036 local_iov = &vvp_env_info(env)->vti_local_iov;
1037 kiocb = &vvp_env_info(env)->vti_kiocb;
1038 local_iov->iov_base = (void __user *)buf;
1039 local_iov->iov_len = count;
1040 init_sync_kiocb(kiocb, file);
1041 kiocb->ki_pos = *ppos;
1042 kiocb->ki_left = count;
1044 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1045 *ppos = kiocb->ki_pos;
1047 cl_env_put(env, &refcheck);
1053 * Write to a file (through the page cache).
1055 #ifdef HAVE_FILE_WRITEV
/*
 * Vectored write entry point (pre-aio kernels): validate the iovec,
 * set up IO_NORMAL args and delegate to ll_file_io_generic() with
 * CIT_WRITE.
 */
1056 static ssize_t ll_file_writev(struct file *file, const struct iovec *iov,
1057 unsigned long nr_segs, loff_t *ppos)
1060 struct vvp_io_args *args;
1066 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1070 env = cl_env_get(&refcheck);
1072 RETURN(PTR_ERR(env));
1074 args = vvp_env_args(env, IO_NORMAL);
1075 args->u.normal.via_iov = (struct iovec *)iov;
1076 args->u.normal.via_nrsegs = nr_segs;
1078 result = ll_file_io_generic(env, args, file, CIT_WRITE, ppos, count);
1079 cl_env_put(env, &refcheck);
/*
 * Single-buffer write (writev-kernel variant): wrap @buf/@count in the
 * per-env scratch iovec and forward to ll_file_writev().
 */
1083 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1087 struct iovec *local_iov;
1092 env = cl_env_get(&refcheck);
1094 RETURN(PTR_ERR(env));
1096 local_iov = &vvp_env_info(env)->vti_local_iov;
1097 local_iov->iov_base = (void __user *)buf;
1098 local_iov->iov_len = count;
1100 result = ll_file_writev(file, local_iov, 1, ppos);
1101 cl_env_put(env, &refcheck);
1105 #else /* AIO stuff */
/*
 * AIO write entry point: validate the iovec, stash iov/iocb into IO_NORMAL
 * args and run the generic IO engine with CIT_WRITE, advancing
 * iocb->ki_pos.
 */
1106 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1107 unsigned long nr_segs, loff_t pos)
1110 struct vvp_io_args *args;
1116 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1120 env = cl_env_get(&refcheck);
1122 RETURN(PTR_ERR(env));
1124 args = vvp_env_args(env, IO_NORMAL);
1125 args->u.normal.via_iov = (struct iovec *)iov;
1126 args->u.normal.via_nrsegs = nr_segs;
1127 args->u.normal.via_iocb = iocb;
1129 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1130 &iocb->ki_pos, count);
1131 cl_env_put(env, &refcheck);
/*
 * Single-buffer write (aio-kernel variant): build a synchronous kiocb plus
 * scratch iovec in the cl environment and forward to ll_file_aio_write(),
 * then propagate the updated position back to *ppos.
 */
1135 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1139 struct iovec *local_iov;
1140 struct kiocb *kiocb;
1145 env = cl_env_get(&refcheck);
1147 RETURN(PTR_ERR(env));
1149 local_iov = &vvp_env_info(env)->vti_local_iov;
1150 kiocb = &vvp_env_info(env)->vti_kiocb;
1151 local_iov->iov_base = (void __user *)buf;
1152 local_iov->iov_len = count;
1153 init_sync_kiocb(kiocb, file);
1154 kiocb->ki_pos = *ppos;
1155 kiocb->ki_left = count;
1157 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1158 *ppos = kiocb->ki_pos;
1160 cl_env_put(env, &refcheck);
1166 #ifdef HAVE_KERNEL_SENDFILE
/*
 * Send file content (through the pagecache) somewhere with a helper:
 * sendfile() support.  Packs the actor/target into IO_SENDFILE args and
 * runs the generic IO engine with CIT_READ.
 */
1170 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1171 read_actor_t actor, void *target)
1174 struct vvp_io_args *args;
1179 env = cl_env_get(&refcheck);
1181 RETURN(PTR_ERR(env));
1183 args = vvp_env_args(env, IO_SENDFILE);
1184 args->u.sendfile.via_target = target;
1185 args->u.sendfile.via_actor = actor;
1187 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1188 cl_env_put(env, &refcheck);
1193 #ifdef HAVE_KERNEL_SPLICE_READ
/*
 * Send file content (through the pagecache) into a pipe: splice_read()
 * support.  Packs the pipe/flags into IO_SPLICE args and runs the generic
 * IO engine with CIT_READ.
 */
1197 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1198 struct pipe_inode_info *pipe, size_t count,
1202 struct vvp_io_args *args;
1207 env = cl_env_get(&refcheck);
1209 RETURN(PTR_ERR(env));
1211 args = vvp_env_args(env, IO_SPLICE);
1212 args->u.splice.via_pipe = pipe;
1213 args->u.splice.via_flags = flags;
1215 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1216 cl_env_put(env, &refcheck);
/*
 * Recreate a lost/damaged OST object for @inode: clone the stripe MD, mark
 * the obdo with OBD_FL_RECREATE_OBJS (OST index is smuggled in o_nlink)
 * and call obd_create() under the inode size lock.
 * NOTE(review): excerpt has elided lines (obdo allocation, RETURN paths).
 */
1221 static int ll_lov_recreate(struct inode *inode, obd_id id, obd_seq seq,
1224 struct obd_export *exp = ll_i2dtexp(inode);
1225 struct obd_trans_info oti = { 0 };
1226 struct obdo *oa = NULL;
1229 struct lov_stripe_md *lsm, *lsm2;
1236 ll_inode_size_lock(inode, 0);
1237 lsm = ll_i2info(inode)->lli_smd;
1239 GOTO(out, rc = -ENOENT);
1240 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1241 (lsm->lsm_stripe_count));
1243 OBD_ALLOC_LARGE(lsm2, lsm_size);
1245 GOTO(out, rc = -ENOMEM);
/* o_nlink carries the target OST index for the recreate operation. */
1249 oa->o_nlink = ost_idx;
1250 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1251 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1252 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1253 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1254 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1255 memcpy(lsm2, lsm, lsm_size);
1256 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1258 OBD_FREE_LARGE(lsm2, lsm_size);
1261 ll_inode_size_unlock(inode, 0);
/*
 * LL_IOC_RECREATE_OBJ ioctl handler: admin-only; copies a
 * ll_recreate_obj request from userspace and recreates the object by
 * object id / OST index (seq fixed to 0).
 */
1266 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1268 struct ll_recreate_obj ucreat;
1271 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1274 if (cfs_copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1275 sizeof(struct ll_recreate_obj)))
1278 RETURN(ll_lov_recreate(inode, ucreat.lrc_id, 0,
1279 ucreat.lrc_ost_idx));
/*
 * LL_IOC_RECREATE_FID ioctl handler: admin-only; copies a lu_fid from
 * userspace and unpacks it into (object id, OST index) — the low 16 bits
 * of fid_seq select the OST index, the next bits extend the object id —
 * then recreates the object.
 */
1282 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1289 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1292 if (cfs_copy_from_user(&fid, (struct lu_fid *)arg,
1293 sizeof(struct lu_fid)))
1296 id = fid_oid(&fid) | ((fid_seq(&fid) & 0xffff) << 32);
1297 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1298 RETURN(ll_lov_recreate(inode, id, 0, ost_idx));
/*
 * Set striping information (@lum/@lum_size) on @inode by performing an
 * intent open that carries the layout EA.  Fails early with the existing
 * layout if a stripe MD is already present; otherwise releases the open
 * handle and intent on the way out.
 */
1301 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1302 int flags, struct lov_user_md *lum, int lum_size)
1304 struct lov_stripe_md *lsm;
1305 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1309 ll_inode_size_lock(inode, 0);
1310 lsm = ll_i2info(inode)->lli_smd;
/* A layout already exists: cannot set stripe twice. */
1312 ll_inode_size_unlock(inode, 0);
1313 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1318 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1321 rc = oit.d.lustre.it_status;
1323 GOTO(out_req_free, rc);
1325 ll_release_openhandle(file->f_dentry, &oit);
/* Cleanup paths (labels elided in excerpt). */
1328 ll_inode_size_unlock(inode, 0);
1329 ll_intent_release(&oit);
1332 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/* Fetch the striping EA of @filename (a child of @inode) from the MDS.
 * On success *lmmp points into the reply buffer of *request (caller must
 * keep the request alive while using it and finish it afterwards) and
 * *lmm_size is set.  The EA is byte-swapped to host endianness on
 * big-endian clients since the MDS stores it little-endian. */
1336 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1337                              struct lov_mds_md **lmmp, int *lmm_size,
1338                              struct ptlrpc_request **request)
1340         struct ll_sb_info *sbi = ll_i2sbi(inode);
1341         struct mdt_body *body;
1342         struct lov_mds_md *lmm = NULL;
1343         struct ptlrpc_request *req = NULL;
1344         struct md_op_data *op_data;
/* size the getattr reply buffer for the largest possible EA */
1347         rc = ll_get_max_mdsize(sbi, &lmmsize);
1351         op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1352                                      strlen(filename), lmmsize,
1353                                      LUSTRE_OPC_ANY, NULL);
1354         if (IS_ERR(op_data))
1355                 RETURN(PTR_ERR(op_data));
1357         op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1358         rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1359         ll_finish_md_op_data(op_data);
1361                 CDEBUG(D_INFO, "md_getattr_name failed "
1362                        "on %s: rc %d\n", filename, rc);
1366         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1367         LASSERT(body != NULL); /* checked by mdc_getattr_name */
1369         lmmsize = body->eadatasize;
/* no EA returned (or zero-length) means the file has no layout */
1371         if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1373                 GOTO(out, rc = -ENODATA);
1376         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1377         LASSERT(lmm != NULL);
/* only plain v1/v3 layouts are understood here */
1379         if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1380             (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1381                 GOTO(out, rc = -EPROTO);
1385          * This is coming from the MDS, so is probably in
1386          * little endian.  We convert it to host endian before
1387          * passing it to userspace.
/* this test is false on little-endian hosts, so no swabbing happens there */
1389         if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1390                 /* if function called for directory - we should
1391                  * avoid swab not existent lsm objects */
1392                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1393                         lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
/* directories carry a default layout with no per-object array */
1394                         if (S_ISREG(body->mode))
1395                                 lustre_swab_lov_user_md_objects(
1396                                  ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1397                                  ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
1398                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1399                         lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1400                         if (S_ISREG(body->mode))
1401                                 lustre_swab_lov_user_md_objects(
1402                                  ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1403                                  ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
1409         *lmm_size = lmmsize;
/* LL_IOC_LOV_SETEA handler: set a raw striping EA (including explicit
 * object ids, hence MDS_OPEN_HAS_OBJS).  Root (CAP_SYS_ADMIN) only.
 * NOTE(review): lum_size assumes exactly one lov_user_ost_data entry;
 * confirm against the ioctl's userspace contract. */
1414 static int ll_lov_setea(struct inode *inode, struct file *file,
1417         int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1418         struct lov_user_md  *lump;
1419         int lum_size = sizeof(struct lov_user_md) +
1420                        sizeof(struct lov_user_ost_data);
1424         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1427         OBD_ALLOC_LARGE(lump, lum_size);
1431         if (cfs_copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1432                 OBD_FREE_LARGE(lump, lum_size);
1436         rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1438         OBD_FREE_LARGE(lump, lum_size);
/* LL_IOC_LOV_SETSTRIPE handler: copy the user's lov_user_md (v1 first,
 * re-read as v3 if the magic says so), apply it, then report the actual
 * layout back to userspace via LL_IOC_LOV_GETSTRIPE. */
1442 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1445         struct lov_user_md_v3 lumv3;
/* v1 is a prefix of v3, so one stack buffer serves both formats */
1446         struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1447         struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1448         struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1451         int flags = FMODE_WRITE;
1454         /* first try with v1 which is smaller than v3 */
1455         lum_size = sizeof(struct lov_user_md_v1);
1456         if (cfs_copy_from_user(lumv1, lumv1p, lum_size))
1459         if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1460                 lum_size = sizeof(struct lov_user_md_v3);
1461                 if (cfs_copy_from_user(&lumv3, lumv3p, lum_size))
1465         rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
/* zero the user's stripe count first so a short GETSTRIPE reply is safe */
1467                 put_user(0, &lumv1p->lmm_stripe_count);
1468                 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1469                                    0, ll_i2info(inode)->lli_smd,
/* LL_IOC_LOV_GETSTRIPE handler: hand the inode's stripe md to the LOV
 * layer, which packs it into the user buffer at @arg. */
1475 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1477         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1482         rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
/* LL_IOC_GROUP_LOCK handler: take a cluster-wide group lock with group
 * id @arg on behalf of this file descriptor.  Only one group lock may be
 * held per fd; the lock is recorded in fd->fd_grouplock under
 * lli->lli_lock.  Not supported on nolock mounts. */
1487 int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1489         struct ll_inode_info *lli = ll_i2info(inode);
1490         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1491         struct ccc_grouplock grouplock;
1495         if (ll_file_nolock(file))
1496                 RETURN(-EOPNOTSUPP);
1498         cfs_spin_lock(&lli->lli_lock);
1499         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1500                 CWARN("group lock already existed with gid %lu\n",
1501                       fd->fd_grouplock.cg_gid);
1502                 cfs_spin_unlock(&lli->lli_lock);
1505         LASSERT(fd->fd_grouplock.cg_lock == NULL);
/* drop the spinlock: cl_get_grouplock may block (O_NONBLOCK honored) */
1506         cfs_spin_unlock(&lli->lli_lock);
1508         rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1509                               arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* re-check under the lock: another thread may have raced us here */
1513         cfs_spin_lock(&lli->lli_lock);
1514         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1515                 cfs_spin_unlock(&lli->lli_lock);
1516                 CERROR("another thread just won the race\n");
1517                 cl_put_grouplock(&grouplock);
1521         fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1522         fd->fd_grouplock = grouplock;
1523         cfs_spin_unlock(&lli->lli_lock);
1525         CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/* LL_IOC_GROUP_UNLOCK handler: release the group lock with id @arg held
 * by this file descriptor.  Fails if no group lock is held or the id
 * does not match the one recorded at lock time. */
1529 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1531         struct ll_inode_info *lli = ll_i2info(inode);
1532         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1533         struct ccc_grouplock grouplock;
1536         cfs_spin_lock(&lli->lli_lock);
1537         if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1538                 cfs_spin_unlock(&lli->lli_lock);
1539                 CWARN("no group lock held\n");
1542         LASSERT(fd->fd_grouplock.cg_lock != NULL);
1544         if (fd->fd_grouplock.cg_gid != arg) {
1545                 CWARN("group lock %lu doesn't match current id %lu\n",
1546                       arg, fd->fd_grouplock.cg_gid);
1547                 cfs_spin_unlock(&lli->lli_lock);
/* detach the lock from the fd under the spinlock, release it outside */
1551         grouplock = fd->fd_grouplock;
1552         memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1553         fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1554         cfs_spin_unlock(&lli->lli_lock);
1556         cl_put_grouplock(&grouplock);
1557         CDEBUG(D_INFO, "group lock %lu released\n", arg);
1562  * Close inode open handle
1564  * \param dentry [in]     dentry which contains the inode
1565  * \param it     [in,out] intent which contains open info and result
1568  * \retval <0    failure
/* Close the MDS open handle carried by @it when the intent actually
 * opened the file.  Used when an open was issued only for its side
 * effects (e.g. layout creation) and no file descriptor keeps it. */
1570 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1572         struct inode *inode = dentry->d_inode;
1573         struct obd_client_handle *och;
1579         /* Root ? Do nothing. */
1580         if (dentry->d_inode->i_sb->s_root == dentry)
1583         /* No open handle to close? Move away */
1584         if (!it_disposition(it, DISP_OPEN_OPEN))
1587         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1589         OBD_ALLOC(och, sizeof(*och));
1591                 GOTO(out, rc = -ENOMEM);
/* build the client handle from the intent, then close it on the MDS */
1593         ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
1594                     ll_i2info(inode), it, och);
1596         rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1599         /* this one is in place of ll_file_open */
1600         if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1601                 ptlrpc_req_finished(it->d.lustre.it_data);
1602                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1608  * Get size for inode for which FIEMAP mapping is requested.
1609  * Make the FIEMAP get_info call and returns the result.
/* Core FIEMAP implementation shared by the ioctl and the VFS ->fiemap
 * entry point: validates flags, optionally syncs dirty pages, then asks
 * the data export (KEY_FIEMAP) to fill @fiemap in place.  @num_bytes is
 * the total size of the fiemap buffer including its extent array. */
1611 int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1614         struct obd_export *exp = ll_i2dtexp(inode);
1615         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1616         struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1617         int vallen = num_bytes;
1621         /* Checks for fiemap flags */
1622         if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* report back which flags we do support before failing */
1623                 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1627         /* Check for FIEMAP_FLAG_SYNC */
1628         if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1629                 rc = filemap_fdatawrite(inode->i_mapping);
1634         /* If the stripe_count > 1 and the application does not understand
1635          * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1637         if (lsm->lsm_stripe_count > 1 &&
1638             !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1641         fm_key.oa.o_id = lsm->lsm_object_id;
1642         fm_key.oa.o_seq = lsm->lsm_object_seq;
1643         fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1645         obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1646         obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1647         /* If filesize is 0, then there would be no objects for mapping */
1648         if (fm_key.oa.o_size == 0) {
1649                 fiemap->fm_mapped_extents = 0;
1653         memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1655         rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1658                 CERROR("obd_get_info failed: rc = %d\n", rc);
/* OBD_IOC_FID2PATH handler: read a getinfo_fid2path header from @arg to
 * learn the caller's path buffer length, allocate a reply of that size,
 * let the MDC resolve the fid to a path, and copy the result back.
 * NOTE(review): error paths between the visible lines are elided. */
1663 int ll_fid2path(struct obd_export *exp, void *arg)
1665         struct getinfo_fid2path *gfout, *gfin;
1669         /* Need to get the buflen */
1670         OBD_ALLOC_PTR(gfin);
1673         if (cfs_copy_from_user(gfin, arg, sizeof(*gfin))) {
/* output buffer = fixed header + user-specified path length */
1678         outsize = sizeof(*gfout) + gfin->gf_pathlen;
1679         OBD_ALLOC(gfout, outsize);
1680         if (gfout == NULL) {
1684         memcpy(gfout, gfin, sizeof(*gfout));
1687         /* Call mdc_iocontrol */
1688         rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1691         if (cfs_copy_to_user(arg, gfout, outsize))
1695         OBD_FREE(gfout, outsize);
/* FSFILT_IOC_FIEMAP ioctl handler: size a kernel fiemap buffer from the
 * user's fm_extent_count, copy the request (and, when extents were
 * requested, the first extent used as the continuation cookie), run
 * ll_do_fiemap(), and copy header plus mapped extents back to @arg. */
1699 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1701         struct ll_user_fiemap *fiemap_s;
1702         size_t num_bytes, ret_bytes;
1703         unsigned int extent_count;
1706         /* Get the extent count so we can calculate the size of
1707          * required fiemap buffer */
1708         if (get_user(extent_count,
1709             &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
/* NOTE(review): extent_count * sizeof(extent) is unchecked here; an
 * overflow guard may exist on elided lines - confirm upstream */
1711         num_bytes = sizeof(*fiemap_s) + (extent_count *
1712                     sizeof(struct ll_fiemap_extent));
1714         OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1715         if (fiemap_s == NULL)
1718         /* get the fiemap value */
1719         if (copy_from_user(fiemap_s,(struct ll_user_fiemap __user *)arg,
1721                 GOTO(error, rc = -EFAULT);
1723         /* If fm_extent_count is non-zero, read the first extent since
1724          * it is used to calculate end_offset and device from previous
1727         if (copy_from_user(&fiemap_s->fm_extents[0],
1728             (char __user *)arg + sizeof(*fiemap_s),
1729             sizeof(struct ll_fiemap_extent)))
1730                 GOTO(error, rc = -EFAULT);
1733         rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1737         ret_bytes = sizeof(struct ll_user_fiemap);
/* only copy back as many extents as were actually mapped */
1739         if (extent_count != 0)
1740                 ret_bytes += (fiemap_s->fm_mapped_extents *
1741                                  sizeof(struct ll_fiemap_extent));
1743         if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1747         OBD_FREE_LARGE(fiemap_s, num_bytes);
1752  * Read the data_version for inode.
1754  * This value is computed using stripe object version on OST.
1755  * Version is computed using server side locking.
1757  * @param extent_lock  Take extent lock. Not needed if a process is already
1758  *                     holding the OST object group locks.
/* Returns the file's data version in *data_version via a getattr on the
 * stripe objects; unstriped files report version 0. */
1760 static int ll_data_version(struct inode *inode, __u64 *data_version,
1763         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1764         struct ll_sb_info *sbi = ll_i2sbi(inode);
1765         struct obdo *obdo = NULL;
1769         /* If no stripe, we consider version is 0. */
1772                 CDEBUG(D_INODE, "No object for inode\n");
1776         OBD_ALLOC_PTR(obdo);
1780         rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
/* the OST must explicitly report a data version for it to be valid */
1782                 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1785                         *data_version = obdo->o_data_version;
/* unlocked_ioctl entry point for regular files: dispatches the Lustre
 * ioctl space (flags, striping, fiemap, group locks, fid/path queries,
 * data version, ...) and falls through to dynamically registered
 * handlers / obd_iocontrol for anything unrecognized. */
1793 long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1795         struct inode *inode = file->f_dentry->d_inode;
1796         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1801         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
1802                inode->i_generation, inode, cmd);
1803         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
1805         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
1806         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
1810         case LL_IOC_GETFLAGS:
1811                 /* Get the current value of the file flags */
1812                 return put_user(fd->fd_flags, (int *)arg);
1813         case LL_IOC_SETFLAGS:
1814         case LL_IOC_CLRFLAGS:
1815                 /* Set or clear specific file flags */
1816                 /* XXX This probably needs checks to ensure the flags are
1817                  * not abused, and to handle any flag side effects.
1819                 if (get_user(flags, (int *) arg))
1822                 if (cmd == LL_IOC_SETFLAGS) {
/* LL_FILE_IGNORE_LOCK is only safe when the page cache is bypassed */
1823                         if ((flags & LL_FILE_IGNORE_LOCK) &&
1824                             !(file->f_flags & O_DIRECT)) {
1825                                 CERROR("%s: unable to disable locking on "
1826                                        "non-O_DIRECT file\n", current->comm);
1830                         fd->fd_flags |= flags;
1832                         fd->fd_flags &= ~flags;
1835         case LL_IOC_LOV_SETSTRIPE:
1836                 RETURN(ll_lov_setstripe(inode, file, arg));
1837         case LL_IOC_LOV_SETEA:
1838                 RETURN(ll_lov_setea(inode, file, arg));
1839         case LL_IOC_LOV_GETSTRIPE:
1840                 RETURN(ll_lov_getstripe(inode, arg));
1841         case LL_IOC_RECREATE_OBJ:
1842                 RETURN(ll_lov_recreate_obj(inode, arg));
1843         case LL_IOC_RECREATE_FID:
1844                 RETURN(ll_lov_recreate_fid(inode, arg));
1845         case FSFILT_IOC_FIEMAP:
1846                 RETURN(ll_ioctl_fiemap(inode, arg));
1847         case FSFILT_IOC_GETFLAGS:
1848         case FSFILT_IOC_SETFLAGS:
1849                 RETURN(ll_iocontrol(inode, file, cmd, arg));
1850         case FSFILT_IOC_GETVERSION_OLD:
1851         case FSFILT_IOC_GETVERSION:
1852                 RETURN(put_user(inode->i_generation, (int *)arg));
1853         case LL_IOC_GROUP_LOCK:
1854                 RETURN(ll_get_grouplock(inode, file, arg));
1855         case LL_IOC_GROUP_UNLOCK:
1856                 RETURN(ll_put_grouplock(inode, file, arg));
1857         case IOC_OBD_STATFS:
1858                 RETURN(ll_obd_statfs(inode, (void *)arg));
1860         /* We need to special case any other ioctls we want to handle,
1861          * to send them to the MDS/OST as appropriate and to properly
1862          * network encode the arg field.
1863         case FSFILT_IOC_SETVERSION_OLD:
1864         case FSFILT_IOC_SETVERSION:
1866         case LL_IOC_FLUSHCTX:
1867                 RETURN(ll_flush_ctx(inode));
1868         case LL_IOC_PATH2FID: {
1869                 if (cfs_copy_to_user((void *)arg, ll_inode2fid(inode),
1870                                      sizeof(struct lu_fid)))
1875         case OBD_IOC_FID2PATH:
1876                 RETURN(ll_fid2path(ll_i2mdexp(inode), (void *)arg));
1877         case LL_IOC_DATA_VERSION: {
1878                 struct ioc_data_version idv;
1881                 if (cfs_copy_from_user(&idv, (char *)arg, sizeof(idv)))
/* LL_DV_NOFLUSH lets a group-lock holder skip the extent lock */
1884                 rc = ll_data_version(inode, &idv.idv_version,
1885                                 !(idv.idv_flags & LL_DV_NOFLUSH));
1888                     cfs_copy_to_user((char *) arg, &idv, sizeof(idv)))
1894         case LL_IOC_GET_MDTIDX: {
1897                 mdtidx = ll_get_mdt_idx(inode);
1901                 if (put_user((int)mdtidx, (int*)arg))
1906         case OBD_IOC_GETDTNAME:
1907         case OBD_IOC_GETMDNAME:
1908                 RETURN(ll_get_obd_name(inode, cmd, arg));
/* unknown cmd: try dynamically registered handlers, then the OSC */
1913                 ll_iocontrol_call(inode, file, cmd, arg, &err))
1916                 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
/* llseek entry point: SEEK_SET/SEEK_CUR/SEEK_END (origin 0/1/2).  For
 * SEEK_END the file size is refreshed with a glimpse lock first so the
 * cluster-wide size is used, and the result is bounded by the
 * filesystem's maximum byte offset. */
1922 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
1924         struct inode *inode = file->f_dentry->d_inode;
/* compute the target up-front purely for the trace message below */
1927         retval = offset + ((origin == 2) ? i_size_read(inode) :
1928                            (origin == 1) ? file->f_pos : 0);
1929         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%s)\n",
1930                inode->i_ino, inode->i_generation, inode, retval, retval,
1931                origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR" : "SEEK_SET");
1932         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
1934         if (origin == 2) { /* SEEK_END */
/* glimpse fetches the authoritative size from the OSTs */
1937                 rc = ll_glimpse_size(inode);
1941                 offset += i_size_read(inode);
1942         } else if (origin == 1) { /* SEEK_CUR */
1943                 offset += file->f_pos;
1947         if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
1948                 if (offset != file->f_pos) {
1949                         file->f_pos = offset;
/* ->flush entry point (called at every close()): report any write error
 * already recorded for the inode, plus async writeback errors stashed in
 * lli_async_rc and in the per-stripe state.  Does not issue new I/O.
 * Signature varies with the kernel's flush-owner-id API. */
1957 #ifdef HAVE_FLUSH_OWNER_ID
1958 int ll_flush(struct file *file, fl_owner_t id)
1960 int ll_flush(struct file *file)
1963         struct inode *inode = file->f_dentry->d_inode;
1964         struct ll_inode_info *lli = ll_i2info(inode);
1965         struct lov_stripe_md *lsm = lli->lli_smd;
1968         LASSERT(!S_ISDIR(inode->i_mode));
1970         /* the application should know write failure already. */
1971         if (lli->lli_write_rc)
1974         /* catch async errors that were recorded back when async writeback
1975          * failed for pages in this mapping. */
/* read-and-clear: each error is reported to exactly one closer */
1976         rc = lli->lli_async_rc;
1977         lli->lli_async_rc = 0;
1979                 err = lov_test_and_clear_async_rc(lsm);
/* any recorded failure is collapsed to -EIO for the caller */
1984         return rc ? -EIO : 0;
1988  * Called to make sure a portion of file has been written out.
1989  * if @local_only is not true, it will send OST_SYNC RPCs to ost.
1991  * Return how many pages have been written.
/* Run a CIT_FSYNC cl_io over [start, end] with the given fsync mode
 * (NONE/LOCAL/DISCARD/ALL).  On success the result is the number of
 * pages written, taken from fio->fi_nr_written. */
1993 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
1994                        enum cl_fsync_mode mode)
1996         struct cl_env_nest nest;
1999         struct obd_capa *capa = NULL;
2000         struct cl_fsync_io *fio;
/* reject modes this path does not implement */
2004         if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2005             mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2008         env = cl_env_nested_get(&nest);
2010                 RETURN(PTR_ERR(env));
2012         capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2014         io = ccc_env_thread_io(env);
2015         io->ci_obj = cl_i2info(inode)->lli_clob;
2017         /* initialize parameters for sync */
2018         fio = &io->u.ci_fsync;
2019         fio->fi_capa = capa;
2020         fio->fi_start = start;
2022         fio->fi_fid = ll_inode2fid(inode);
2023         fio->fi_mode = mode;
2024         fio->fi_nr_written = 0;
2026         if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2027                 result = cl_io_loop(env, io);
2029                 result = io->ci_result;
2031                 result = fio->fi_nr_written;
2032         cl_io_fini(env, io);
2033         cl_env_nested_put(&nest, env);
/* ->fsync entry point (signature varies by kernel version): wait for
 * in-flight page I/O, surface recorded async writeback errors, sync
 * metadata via md_sync() to the MDS, and for regular files push dirty
 * data to the OSTs with cl_sync_file_range(CL_FSYNC_ALL). */
2040 #ifdef HAVE_FILE_FSYNC_4ARGS
2041 int ll_fsync(struct file *file, loff_t start, loff_t end, int data)
2042 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2043 int ll_fsync(struct file *file, int data)
2045 int ll_fsync(struct file *file, struct dentry *dentry, int data)
2048         struct inode *inode = file->f_dentry->d_inode;
2049         struct ll_inode_info *lli = ll_i2info(inode);
2050         struct lov_stripe_md *lsm = lli->lli_smd;
2051         struct ptlrpc_request *req;
2052         struct obd_capa *oc;
2055         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2056                inode->i_generation, inode);
2057         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2059         /* fsync's caller has already called _fdata{sync,write}, we want
2060          * that IO to finish before calling the osc and mdc sync methods */
2061         rc = filemap_fdatawait(inode->i_mapping);
2063         /* catch async errors that were recorded back when async writeback
2064          * failed for pages in this mapping. */
2065         if (!S_ISDIR(inode->i_mode)) {
/* read-and-clear semantics as in ll_flush() */
2066                 err = lli->lli_async_rc;
2067                 lli->lli_async_rc = 0;
2071                         err = lov_test_and_clear_async_rc(lsm);
2077         oc = ll_mdscapa_get(inode);
2078         err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2084                 ptlrpc_req_finished(req);
2087                 err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
2089                 if (rc == 0 && err < 0)
/* remember the outcome so close()/flush() can report it too */
2091                 lli->lli_write_rc = rc < 0 ? rc : 0;
/* flock/fcntl lock entry point: translate a VFS file_lock (FL_FLOCK or
 * FL_POSIX) into an LDLM_FLOCK enqueue on the MDS, then mirror the
 * result into the local VFS lock lists so lock state stays coherent.
 * NOTE(review): the F_GETLK/F_SETLK/F_SETLKW cmd switch is partially
 * elided in this view. */
2097 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2099         struct inode *inode = file->f_dentry->d_inode;
2100         struct ll_sb_info *sbi = ll_i2sbi(inode);
2101         struct ldlm_enqueue_info einfo = { .ei_type = LDLM_FLOCK,
2102                                            .ei_cb_cp =ldlm_flock_completion_ast,
2103                                            .ei_cbdata = file_lock };
2104         struct md_op_data *op_data;
2105         struct lustre_handle lockh = {0};
2106         ldlm_policy_data_t flock = {{0}};
2111         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2112                inode->i_ino, file_lock);
2114         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2116         if (file_lock->fl_flags & FL_FLOCK) {
2117                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2118                 /* flocks are whole-file locks */
2119                 flock.l_flock.end = OFFSET_MAX;
2120                 /* For flocks owner is determined by the local file desctiptor*/
2121                 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2122         } else if (file_lock->fl_flags & FL_POSIX) {
2123                 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2124                 flock.l_flock.start = file_lock->fl_start;
2125                 flock.l_flock.end = file_lock->fl_end;
2129         flock.l_flock.pid = file_lock->fl_pid;
2131         /* Somewhat ugly workaround for svc lockd.
2132          * lockd installs custom fl_lmops->fl_compare_owner that checks
2133          * for the fl_owner to be the same (which it always is on local node
2134          * I guess between lockd processes) and then compares pid.
2135          * As such we assign pid to the owner field to make it all work,
2136          * conflict with normal locks is unlikely since pid space and
2137          * pointer space for current->files are not intersecting */
2138         if (file_lock->fl_lmops && file_lock->fl_lmops->fl_compare_owner)
2139                 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* map the VFS lock type to an LDLM mode: rdlock->PR, wrlock->PW,
 * unlock->NL (see the long comment below) */
2141         switch (file_lock->fl_type) {
2143                 einfo.ei_mode = LCK_PR;
2146                 /* An unlock request may or may not have any relation to
2147                  * existing locks so we may not be able to pass a lock handle
2148                  * via a normal ldlm_lock_cancel() request. The request may even
2149                  * unlock a byte range in the middle of an existing lock. In
2150                  * order to process an unlock request we need all of the same
2151                  * information that is given with a normal read or write record
2152                  * lock request. To avoid creating another ldlm unlock (cancel)
2153                  * message we'll treat a LCK_NL flock request as an unlock. */
2154                 einfo.ei_mode = LCK_NL;
2157                 einfo.ei_mode = LCK_PW;
2160                 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2161                       file_lock->fl_type);
/* non-blocking set: tell the server not to wait on conflicts */
2176                 flags = LDLM_FL_BLOCK_NOWAIT;
2182                 flags = LDLM_FL_TEST_LOCK;
2183                 /* Save the old mode so that if the mode in the lock changes we
2184                  * can decrement the appropriate reader or writer refcount. */
2185                 file_lock->fl_type = einfo.ei_mode;
2188                 CERROR("unknown fcntl lock command: %d\n", cmd);
2192         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2193                                      LUSTRE_OPC_ANY, NULL);
2194         if (IS_ERR(op_data))
2195                 RETURN(PTR_ERR(op_data));
2197         CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2198                "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2199                flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2201         rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2202                         op_data, &lockh, &flock, 0, NULL /* req */, flags);
2204         ll_finish_md_op_data(op_data);
/* keep the kernel's local lock bookkeeping in sync with the cluster */
2206         if ((file_lock->fl_flags & FL_FLOCK) &&
2207             (rc == 0 || file_lock->fl_type == F_UNLCK))
2208                 flock_lock_file_wait(file, file_lock);
2209         if ((file_lock->fl_flags & FL_POSIX) &&
2210             (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2211             !(flags & LDLM_FL_TEST_LOCK))
2212                 posix_lock_file_wait(file, file_lock);
/* Lock entry point for -o noflock mounts; body (elided here) rejects
 * all flock/fcntl lock requests - see ll_file_operations_noflock. */
2217 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2225  * test if some locks matching bits and l_req_mode are acquired
2226  * - bits can be in different locks
2227  * - if found clear the common lock bits in *bits
2228  * - the bits not found, are kept in *bits
2230  * \param bits [IN] searched lock bits [IN]
2231  * \param l_req_mode [IN] searched lock mode
2232  * \retval boolean, true iff all bits are found
2234 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
2236         struct lustre_handle lockh;
2237         ldlm_policy_data_t policy;
/* LCK_MINMODE means "any mode": match against all four regular modes */
2238         ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2239                                 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2248         fid = &ll_i2info(inode)->lli_fid;
2249         CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2250                ldlm_lockname[mode]);
/* TEST_LOCK: probe only, do not take a reference on matched locks */
2252         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2253         for (i = 0; i < MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2254                 policy.l_inodebits.bits = *bits & (1 << i);
2255                 if (policy.l_inodebits.bits == 0)
2258                 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2259                                   &policy, mode, &lockh)) {
2260                         struct ldlm_lock *lock;
2262                         lock = ldlm_handle2lock(&lockh);
/* clear every bit the matched lock covers, not just the probed one */
2265                                         ~(lock->l_policy_data.l_inodebits.bits);
2266                                 LDLM_LOCK_PUT(lock);
2268                                 *bits &= ~policy.l_inodebits.bits;
/* Match (and take a reference on) an MDS inodebits lock covering @bits
 * in any of CR/CW/PR/PW mode; returns the matched mode with the handle
 * in *lockh.  Unlike ll_have_md_lock() this does NOT use TEST_LOCK, so
 * the caller owns a reference it must later release. */
2275 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2276                             struct lustre_handle *lockh)
2278         ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2284         fid = &ll_i2info(inode)->lli_fid;
2285         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2287         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING;
2288         rc = md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2289                            LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
/* Post-process a revalidation result: -ENOENT for a non-regular,
 * non-directory inode is treated as "already unlinked" and mapped to
 * success; other errors are logged. */
2293 static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
2294         if (rc == -ENOENT) { /* Already unlinked. Just update nlink
2295                               * and return success */
2297                 /* This path cannot be hit for regular files unless in
2298                  * case of obscure races, so no need to to validate
2300                 if (!S_ISREG(inode->i_mode) &&
2301                     !S_ISDIR(inode->i_mode))
2306                 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
/* Revalidate the MDS attributes of @dentry's inode for the lock bits in
 * @ibits.  Two strategies: with OBD_CONNECT_ATTRFID, replay a getattr
 * intent by fid (also revalidating the dentry); otherwise, if no
 * matching ibits lock is cached, issue a plain md_getattr and merge the
 * reply into the inode via ll_prep_inode(). */
2314 int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2317         struct inode *inode = dentry->d_inode;
2318         struct ptlrpc_request *req = NULL;
2319         struct obd_export *exp;
2324                 CERROR("REPORT THIS LINE TO PETER\n");
2328         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2329                inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2331         exp = ll_i2mdexp(inode);
2333         /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
2334          *      But under CMD case, it caused some lock issues, should be fixed
2335          *      with new CMD ibits lock. See bug 12718 */
2336         if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
2337                 struct lookup_intent oit = { .it_op = IT_GETATTR };
2338                 struct md_op_data *op_data;
/* only the lookup bit requested: a cheaper IT_LOOKUP intent suffices */
2340                 if (ibits == MDS_INODELOCK_LOOKUP)
2341                         oit.it_op = IT_LOOKUP;
2343                 /* Call getattr by fid, so do not provide name at all. */
2344                 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2345                                              dentry->d_inode, NULL, 0, 0,
2346                                              LUSTRE_OPC_ANY, NULL);
2347                 if (IS_ERR(op_data))
2348                         RETURN(PTR_ERR(op_data));
/* M_CHECK_STALE asks the MDS to verify the inode is still current */
2350                 oit.it_create_mode |= M_CHECK_STALE;
2351                 rc = md_intent_lock(exp, op_data, NULL, 0,
2352                                     /* we are not interested in name
2355                                     ll_md_blocking_ast, 0);
2356                 ll_finish_md_op_data(op_data);
2357                 oit.it_create_mode &= ~M_CHECK_STALE;
2359                         rc = ll_inode_revalidate_fini(inode, rc);
2363                 rc = ll_revalidate_it_finish(req, &oit, dentry);
2365                         ll_intent_release(&oit);
2369                 /* Unlinked? Unhash dentry, so it is not picked up later by
2370                    do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2371                    here to preserve get_cwd functionality on 2.6.
2373                 if (!dentry->d_inode->i_nlink)
2374                         d_lustre_invalidate(dentry);
2376                 ll_lookup_finish_locks(&oit, dentry);
2377         } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
2378                 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2379                 obd_valid valid = OBD_MD_FLGETATTR;
2380                 struct md_op_data *op_data;
/* regular files: also fetch the layout EA so size info stays usable */
2383                 if (S_ISREG(inode->i_mode)) {
2384                         rc = ll_get_max_mdsize(sbi, &ealen);
2387                         valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2390                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
2391                                              0, ealen, LUSTRE_OPC_ANY,
2393                 if (IS_ERR(op_data))
2394                         RETURN(PTR_ERR(op_data));
2396                 op_data->op_valid = valid;
2397                 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2398                  * capa for this inode. Because we only keep capas of dirs
2400                 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
2401                 ll_finish_md_op_data(op_data);
2403                         rc = ll_inode_revalidate_fini(inode, rc);
2407                 rc = ll_prep_inode(&inode, req, NULL);
2410         ptlrpc_req_finished(req);
/* Revalidate attributes and then the size: copies lvb timestamps onto
 * the inode when no objects have been allocated yet, otherwise glimpses
 * the OSTs for the current size. */
2414 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2417         struct inode *inode = dentry->d_inode;
2421         rc = __ll_inode_revalidate_it(dentry, it, ibits);
2423         /* if object not yet allocated, don't validate size */
2424         if (rc == 0 && ll_i2info(dentry->d_inode)->lli_smd == NULL) {
/* no stripes: the MDS-provided lvb times are authoritative */
2425                 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
2426                 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
2427                 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
2431         /* ll_glimpse_size will prefer locally cached writes if they extend
2435         rc = ll_glimpse_size(inode);
/* getattr worker: revalidate UPDATE|LOOKUP ibits for @de, then fill the
 * kstat from the (now fresh) inode fields.  32-bit-API mounts get an
 * ino synthesized from the fid instead of the native i_ino. */
2440 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2441                   struct lookup_intent *it, struct kstat *stat)
2443         struct inode *inode = de->d_inode;
2444         struct ll_sb_info *sbi = ll_i2sbi(inode);
2445         struct ll_inode_info *lli = ll_i2info(inode);
2448         res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE |
2449                                              MDS_INODELOCK_LOOKUP);
2450         ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
2455         stat->dev = inode->i_sb->s_dev;
2456         if (ll_need_32bit_api(sbi))
2457                 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
2459                 stat->ino = inode->i_ino;
2460         stat->mode = inode->i_mode;
2461         stat->nlink = inode->i_nlink;
2462         stat->uid = inode->i_uid;
2463         stat->gid = inode->i_gid;
2464         stat->rdev = kdev_t_to_nr(inode->i_rdev);
2465         stat->atime = inode->i_atime;
2466         stat->mtime = inode->i_mtime;
2467         stat->ctime = inode->i_ctime;
2468 #ifdef HAVE_INODE_BLKSIZE
2469         stat->blksize = inode->i_blksize;
2471         stat->blksize = 1 << inode->i_blkbits;
2474         stat->size = i_size_read(inode);
2475         stat->blocks = inode->i_blocks;
/* ->getattr entry point: wrap ll_getattr_it() with a fresh IT_GETATTR
 * intent. */
2479 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2481         struct lookup_intent it = { .it_op = IT_GETATTR };
2483         return ll_getattr_it(mnt, de, &it, stat);
/* VFS ->fiemap entry point: marshal the kernel's fiemap_extent_info
 * into a ll_user_fiemap buffer, delegate to ll_do_fiemap(), and copy
 * flags plus mapped extents back into the caller's extent array. */
2486 #ifdef HAVE_LINUX_FIEMAP_H
2487 int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2488               __u64 start, __u64 len)
2492         struct ll_user_fiemap *fiemap;
2493         unsigned int extent_count = fieinfo->fi_extents_max;
2495         num_bytes = sizeof(*fiemap) + (extent_count *
2496                                        sizeof(struct ll_fiemap_extent));
2497         OBD_ALLOC_LARGE(fiemap, num_bytes);
2502         fiemap->fm_flags = fieinfo->fi_flags;
2503         fiemap->fm_extent_count = fieinfo->fi_extents_max;
2504         fiemap->fm_start = start;
2505         fiemap->fm_length = len;
/* first extent seeds continuation state, mirroring ll_ioctl_fiemap().
 * NOTE(review): fi_extents_start is a user pointer in mainline kernels;
 * confirm this memcpy is correct for the targeted kernel range */
2506         memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
2507                sizeof(struct ll_fiemap_extent));
2509         rc = ll_do_fiemap(inode, fiemap, num_bytes);
2511         fieinfo->fi_flags = fiemap->fm_flags;
2512         fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
2513         memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
2514                fiemap->fm_mapped_extents * sizeof(struct ll_fiemap_extent));
2516         OBD_FREE_LARGE(fiemap, num_bytes);
/* ACL check callback handed to generic_permission(): evaluates the
 * cached POSIX ACL (duplicated under lli_lock) against @mask.  Compiled
 * out to a no-op body when CONFIG_FS_POSIX_ACL is unset; RCU-walk
 * lookups bail out early on 4-arg-permission kernels. */
2523 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
2524 lustre_check_acl(struct inode *inode, int mask, unsigned int flags)
2526 lustre_check_acl(struct inode *inode, int mask)
2529 #ifdef CONFIG_FS_POSIX_ACL
2530         struct ll_inode_info *lli = ll_i2info(inode);
2531         struct posix_acl *acl;
2535 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
/* cannot block or take locks in RCU-walk mode */
2536         if (flags & IPERM_FLAG_RCU)
2539         cfs_spin_lock(&lli->lli_lock);
2540         acl = posix_acl_dup(lli->lli_posix_acl);
2541         cfs_spin_unlock(&lli->lli_lock);
2546         rc = posix_acl_permission(inode, acl, mask);
2547         posix_acl_release(acl);
/* ->permission entry point (signature varies by kernel): revalidate the
 * root inode on first touch, delegate to the remote-permission path on
 * RMT_CLIENT mounts, otherwise run the generic check with
 * lustre_check_acl as the ACL callback. */
2555 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
2556 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
2558 # ifdef HAVE_INODE_PERMISION_2ARGS
2559 int ll_inode_permission(struct inode *inode, int mask)
2561 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2568 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
/* RCU-walk cannot issue RPCs; force ref-walk retry */
2569         if (flags & IPERM_FLAG_RCU)
2573        /* as root inode are NOT getting validated in lookup operation,
2574         * need to do it before permission check. */
2576         if (inode == inode->i_sb->s_root->d_inode) {
2577                 struct lookup_intent it = { .it_op = IT_LOOKUP };
2579                 rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
2580                                               MDS_INODELOCK_LOOKUP);
2585         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
2586                inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
2588         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2589                 return lustre_check_remote_perm(inode, mask);
2591         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2592         rc = ll_generic_permission(inode, mask, flags, lustre_check_acl);
/* Pick the vectored-I/O method names for the file_operations tables
 * below: older kernels use ->readv/->writev, newer ones ->aio_read/
 * ->aio_write.  The macros let one initializer serve both APIs. */
2597 #ifdef HAVE_FILE_READV
2598 #define READ_METHOD readv
2599 #define READ_FUNCTION ll_file_readv
2600 #define WRITE_METHOD writev
2601 #define WRITE_FUNCTION ll_file_writev
2603 #define READ_METHOD aio_read
2604 #define READ_FUNCTION ll_file_aio_read
2605 #define WRITE_METHOD aio_write
2606 #define WRITE_FUNCTION ll_file_aio_write
2609 /* -o localflock - only provides locally consistent flock locks */
/* Default file ops table (localflock): no .flock/.lock entries, so the
 * VFS falls back to node-local POSIX/flock semantics. */
2610 struct file_operations ll_file_operations = {
2611         .read           = ll_file_read,
2612         .READ_METHOD    = READ_FUNCTION,
2613         .write          = ll_file_write,
2614         .WRITE_METHOD   = WRITE_FUNCTION,
2615         .unlocked_ioctl = ll_file_ioctl,
2616         .open           = ll_file_open,
2617         .release        = ll_file_release,
2618         .mmap           = ll_file_mmap,
2619         .llseek         = ll_file_seek,
2620 #ifdef HAVE_KERNEL_SENDFILE
2621         .sendfile       = ll_file_sendfile,
2623 #ifdef HAVE_KERNEL_SPLICE_READ
2624         .splice_read    = ll_file_splice_read,
2630 struct file_operations ll_file_operations_flock = {
2631 .read = ll_file_read,
2632 .READ_METHOD = READ_FUNCTION,
2633 .write = ll_file_write,
2634 .WRITE_METHOD = WRITE_FUNCTION,
2635 .unlocked_ioctl = ll_file_ioctl,
2636 .open = ll_file_open,
2637 .release = ll_file_release,
2638 .mmap = ll_file_mmap,
2639 .llseek = ll_file_seek,
2640 #ifdef HAVE_KERNEL_SENDFILE
2641 .sendfile = ll_file_sendfile,
2643 #ifdef HAVE_KERNEL_SPLICE_READ
2644 .splice_read = ll_file_splice_read,
2648 .flock = ll_file_flock,
2649 .lock = ll_file_flock
2652 /* These are for -o noflock - to return ENOSYS on flock calls */
2653 struct file_operations ll_file_operations_noflock = {
2654 .read = ll_file_read,
2655 .READ_METHOD = READ_FUNCTION,
2656 .write = ll_file_write,
2657 .WRITE_METHOD = WRITE_FUNCTION,
2658 .unlocked_ioctl = ll_file_ioctl,
2659 .open = ll_file_open,
2660 .release = ll_file_release,
2661 .mmap = ll_file_mmap,
2662 .llseek = ll_file_seek,
2663 #ifdef HAVE_KERNEL_SENDFILE
2664 .sendfile = ll_file_sendfile,
2666 #ifdef HAVE_KERNEL_SPLICE_READ
2667 .splice_read = ll_file_splice_read,
2671 .flock = ll_file_noflock,
2672 .lock = ll_file_noflock
2675 struct inode_operations ll_file_inode_operations = {
2676 .setattr = ll_setattr,
2677 .truncate = ll_truncate,
2678 .getattr = ll_getattr,
2679 .permission = ll_inode_permission,
2680 .setxattr = ll_setxattr,
2681 .getxattr = ll_getxattr,
2682 .listxattr = ll_listxattr,
2683 .removexattr = ll_removexattr,
2684 #ifdef HAVE_LINUX_FIEMAP_H
2685 .fiemap = ll_fiemap,
2689 /* dynamic ioctl number support routins */
2690 static struct llioc_ctl_data {
2691 cfs_rw_semaphore_t ioc_sem;
2692 cfs_list_t ioc_head;
2694 __RWSEM_INITIALIZER(llioc.ioc_sem),
2695 CFS_LIST_HEAD_INIT(llioc.ioc_head)
2700 cfs_list_t iocd_list;
2701 unsigned int iocd_size;
2702 llioc_callback_t iocd_cb;
2703 unsigned int iocd_count;
2704 unsigned int iocd_cmd[0];
2707 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
2710 struct llioc_data *in_data = NULL;
2713 if (cb == NULL || cmd == NULL ||
2714 count > LLIOC_MAX_CMD || count < 0)
2717 size = sizeof(*in_data) + count * sizeof(unsigned int);
2718 OBD_ALLOC(in_data, size);
2719 if (in_data == NULL)
2722 memset(in_data, 0, sizeof(*in_data));
2723 in_data->iocd_size = size;
2724 in_data->iocd_cb = cb;
2725 in_data->iocd_count = count;
2726 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
2728 cfs_down_write(&llioc.ioc_sem);
2729 cfs_list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
2730 cfs_up_write(&llioc.ioc_sem);
2735 void ll_iocontrol_unregister(void *magic)
2737 struct llioc_data *tmp;
2742 cfs_down_write(&llioc.ioc_sem);
2743 cfs_list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
2745 unsigned int size = tmp->iocd_size;
2747 cfs_list_del(&tmp->iocd_list);
2748 cfs_up_write(&llioc.ioc_sem);
2750 OBD_FREE(tmp, size);
2754 cfs_up_write(&llioc.ioc_sem);
2756 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
2759 EXPORT_SYMBOL(ll_iocontrol_register);
2760 EXPORT_SYMBOL(ll_iocontrol_unregister);
2762 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
2763 unsigned int cmd, unsigned long arg, int *rcp)
2765 enum llioc_iter ret = LLIOC_CONT;
2766 struct llioc_data *data;
2767 int rc = -EINVAL, i;
2769 cfs_down_read(&llioc.ioc_sem);
2770 cfs_list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
2771 for (i = 0; i < data->iocd_count; i++) {
2772 if (cmd != data->iocd_cmd[i])
2775 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
2779 if (ret == LLIOC_STOP)
2782 cfs_up_read(&llioc.ioc_sem);