1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
6 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 only,
10 * as published by the Free Software Foundation.
12 * This program is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License version 2 for more details (a copy is included
16 * in the LICENSE file that accompanied this code).
18 * You should have received a copy of the GNU General Public License
19 * version 2 along with this program; If not, see
20 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
22 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23 * CA 95054 USA or visit www.sun.com if you need additional information or
29 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
30 * Use is subject to license terms.
33 * Copyright (c) 2011 Whamcloud, Inc.
36 * This file is part of Lustre, http://www.lustre.org/
37 * Lustre is a trademark of Sun Microsystems, Inc.
41 * Author: Peter Braam <braam@clusterfs.com>
42 * Author: Phil Schwan <phil@clusterfs.com>
43 * Author: Andreas Dilger <adilger@clusterfs.com>
46 #define DEBUG_SUBSYSTEM S_LLITE
47 #include <lustre_dlm.h>
48 #include <lustre_lite.h>
49 #include <linux/pagemap.h>
50 #include <linux/file.h>
51 #include "llite_internal.h"
52 #include <lustre/ll_fiemap.h>
54 #include "cl_object.h"
/*
 * Allocate a per-open-file ll_file_data from its slab cache.
 * NOTE(review): this chunk is elided — the NULL check / RETURN is not
 * visible here; callers (e.g. ll_file_open) appear to handle a NULL result.
 */
56 struct ll_file_data *ll_file_data_get(void)
58 struct ll_file_data *fd;
60 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, CFS_ALLOC_IO);
/* Release a ll_file_data back to its slab cache (counterpart of
 * ll_file_data_get()). */
64 static void ll_file_data_put(struct ll_file_data *fd)
67 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Pack the inode's current attributes into @op_data for an MDS request:
 * fid, mode, a/m/ctime, size, block count, ext-style flags, the current
 * IO epoch, the open file handle @fh, and an MDS capability reference.
 * NOTE(review): ll_mdscapa_get() presumably takes a capa reference that
 * the request path releases — confirm against ll_finish_md_op_data().
 */
70 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
71 struct lustre_handle *fh)
73 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
74 op_data->op_attr.ia_mode = inode->i_mode;
75 op_data->op_attr.ia_atime = inode->i_atime;
76 op_data->op_attr.ia_mtime = inode->i_mtime;
77 op_data->op_attr.ia_ctime = inode->i_ctime;
78 op_data->op_attr.ia_size = i_size_read(inode);
79 op_data->op_attr_blocks = inode->i_blocks;
/* i_flags are kernel-internal; convert to the on-wire ext flag format. */
80 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
81 ll_inode_to_ext_flags(inode->i_flags);
82 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
84 op_data->op_handle = *fh;
85 op_data->op_capa1 = ll_mdscapa_get(inode);
/*
 * Closes the IO epoch and packs all the attributes into @op_data for
 * the CLOSE rpc. Size/blocks are only sent when SOM (size-on-MDS) is
 * supported by the server and the file is a regular file opened for
 * write; otherwise the MDS obtains them from the OSTs itself.
 */
92 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
93 struct obd_client_handle *och)
97 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
98 ATTR_MTIME_SET | ATTR_CTIME_SET;
100 if (!(och->och_flags & FMODE_WRITE))
/* No SOM support (or not a regular file): let the close carry
 * size/blocks too, since the MDS will not fetch them via SOM. */
103 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
104 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
106 ll_ioepoch_close(inode, op_data, &och, 0);
109 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
110 ll_prep_md_op_data(op_data, inode, NULL, NULL,
111 0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Send a CLOSE rpc for the open handle @och to the MDS and clean up the
 * local state: close the IO epoch, perform a Size-on-MDS update when the
 * server requests it, destroy OST objects if the close unlinked the last
 * reference, clear open replay data, and poison the handle cookie.
 * NOTE(review): several branches (error paths, RETURNs) are elided from
 * this view — confirm against the full source before relying on flow.
 */
115 static int ll_close_inode_openhandle(struct obd_export *md_exp,
117 struct obd_client_handle *och)
119 struct obd_export *exp = ll_i2mdexp(inode);
120 struct md_op_data *op_data;
121 struct ptlrpc_request *req = NULL;
122 struct obd_device *obd = class_exp2obd(exp);
129 * XXX: in case of LMV, is this correct to access
132 CERROR("Invalid MDC connection handle "LPX64"\n",
133 ll_i2mdexp(inode)->exp_handle.h_cookie);
137 OBD_ALLOC_PTR(op_data);
139 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
141 ll_prepare_close(inode, op_data, och);
142 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
143 rc = md_close(md_exp, op_data, och->och_mod, &req);
/* rc == -EAGAIN (presumably) means the MDS wants a SOM update first. */
145 /* This close must have the epoch closed. */
146 LASSERT(epoch_close);
147 /* MDS has instructed us to obtain Size-on-MDS attribute from
148 * OSTs and send setattr to back to MDS. */
149 rc = ll_som_update(inode, op_data);
151 CERROR("inode %lu mdc Size-on-MDS update failed: "
152 "rc = %d\n", inode->i_ino, rc);
156 CERROR("inode %lu mdc close failed: rc = %d\n",
159 ll_finish_md_op_data(op_data);
/* A successful close may carry unlink info: destroy OST objects. */
162 rc = ll_objects_destroy(req, inode);
164 CERROR("inode %lu ll_objects destroy: rc = %d\n",
/* Epoch still open on a SOM-capable server: DONE_WRITING must be
 * sent later, so queue it rather than freeing state now. */
171 if (exp_connect_som(exp) && !epoch_close &&
172 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
173 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
175 md_clear_open_replay_data(md_exp, och);
176 /* Free @och if it is not waiting for DONE_WRITING. */
177 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
180 if (req) /* This is close request */
181 ptlrpc_req_finished(req);
/*
 * Really close the MDS open handle of the given mode (read/write/exec)
 * if it is no longer in use. Selects the per-mode handle pointer and
 * use count under lli_och_sem; if other users remain, returns without
 * closing. Otherwise the handle is detached (elided here) and closed
 * via ll_close_inode_openhandle().
 */
185 int ll_md_real_close(struct inode *inode, int flags)
187 struct ll_inode_info *lli = ll_i2info(inode);
188 struct obd_client_handle **och_p;
189 struct obd_client_handle *och;
/* Pick the handle slot matching the open mode. */
194 if (flags & FMODE_WRITE) {
195 och_p = &lli->lli_mds_write_och;
196 och_usecount = &lli->lli_open_fd_write_count;
197 } else if (flags & FMODE_EXEC) {
198 och_p = &lli->lli_mds_exec_och;
199 och_usecount = &lli->lli_open_fd_exec_count;
201 LASSERT(flags & FMODE_READ);
202 och_p = &lli->lli_mds_read_och;
203 och_usecount = &lli->lli_open_fd_read_count;
206 cfs_down(&lli->lli_och_sem);
207 if (*och_usecount) { /* There are still users of this handle, so
209 cfs_up(&lli->lli_och_sem);
214 cfs_up(&lli->lli_och_sem);
216 if (och) { /* There might be a race and somebody have freed this och
218 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-struct-file close: drop a group lock if held, decrement the
 * per-mode open count under lli_och_sem, and only talk to the MDS
 * (ll_md_real_close) when no matching cached OPEN DLM lock lets us
 * skip the rpc. Finally frees the ll_file_data and closes the capa.
 */
225 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
228 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
229 struct ll_inode_info *lli = ll_i2info(inode);
233 /* clear group lock, if present */
234 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
235 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
237 /* Let's see if we have good enough OPEN lock on the file and if
238 we can skip talking to MDS */
239 if (file->f_dentry->d_inode) { /* Can this ever be false? */
241 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
242 struct lustre_handle lockh;
243 struct inode *inode = file->f_dentry->d_inode;
244 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
246 cfs_down(&lli->lli_och_sem);
/* Drop this descriptor's contribution to the per-mode count. */
247 if (fd->fd_omode & FMODE_WRITE) {
249 LASSERT(lli->lli_open_fd_write_count);
250 lli->lli_open_fd_write_count--;
251 } else if (fd->fd_omode & FMODE_EXEC) {
253 LASSERT(lli->lli_open_fd_exec_count);
254 lli->lli_open_fd_exec_count--;
257 LASSERT(lli->lli_open_fd_read_count);
258 lli->lli_open_fd_read_count--;
260 cfs_up(&lli->lli_och_sem);
/* No cached OPEN ibits lock of the right mode: must close on MDS. */
262 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
263 LDLM_IBITS, &policy, lockmode,
265 rc = ll_md_real_close(file->f_dentry->d_inode,
269 CERROR("Releasing a file %p with negative dentry %p. Name %s",
270 file, file->f_dentry, file->f_dentry->d_name.name);
273 LUSTRE_FPRIVATE(file) = NULL;
274 ll_file_data_put(fd);
275 ll_capa_close(inode);
/* Forward declaration: collects and clears any deferred async write
 * error recorded on the stripe md. */
280 int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm);
282 /* While this returns an error code, fput() the caller does not, so we need
283 * to make every effort to clean up all of our state here. Also, applications
284 * rarely check close errors and even if an error is returned they will not
285 * re-try the close call.
287 int ll_file_release(struct inode *inode, struct file *file)
289 struct ll_file_data *fd;
290 struct ll_sb_info *sbi = ll_i2sbi(inode);
291 struct ll_inode_info *lli = ll_i2info(inode);
292 struct lov_stripe_md *lsm = lli->lli_smd;
296 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
297 inode->i_generation, inode);
299 #ifdef CONFIG_FS_POSIX_ACL
/* Remote-client ACL bookkeeping is tied to the fs root; tear down the
 * rct/et entries for this process when the root is released. */
300 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
301 inode == inode->i_sb->s_root->d_inode) {
302 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
305 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
306 fd->fd_flags &= ~LL_FILE_RMTACL;
307 rct_del(&sbi->ll_rct, cfs_curproc_pid());
308 et_search_free(&sbi->ll_et, cfs_curproc_pid());
313 if (inode->i_sb->s_root != file->f_dentry)
314 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
315 fd = LUSTRE_FPRIVATE(file);
318 /* The last ref on @file, maybe not the the owner pid of statahead.
319 * Different processes can open the same dir, "ll_opendir_key" means:
320 * it is me that should stop the statahead thread. */
321 if (lli->lli_opendir_key == fd && lli->lli_opendir_pid != 0)
322 ll_stop_statahead(inode, lli->lli_opendir_key);
/* The root never has an MDS open handle; just drop the fd. */
324 if (inode->i_sb->s_root == file->f_dentry) {
325 LUSTRE_FPRIVATE(file) = NULL;
326 ll_file_data_put(fd);
/* Surface any deferred async write error at close time (elided
 * context — presumably feeds the return code; confirm upstream). */
331 lov_test_and_clear_async_rc(lsm);
332 lli->lli_async_rc = 0;
334 rc = ll_md_close(sbi->ll_md_exp, inode, file);
336 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
337 libcfs_debug_dumplog();
/*
 * Perform an intent-based OPEN on the MDS for @file. With no striping
 * data supplied (lmm == NULL, lmmsize == 0) this requests an OPEN lock;
 * with striping data it is part of a setstripe operation. On success
 * updates the local inode from the reply and binds the DLM lock to it;
 * on failure drops the open handle/lock via the "out" path.
 */
342 static int ll_intent_file_open(struct file *file, void *lmm,
343 int lmmsize, struct lookup_intent *itp)
345 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
346 struct dentry *parent = file->f_dentry->d_parent;
347 const char *name = file->f_dentry->d_name.name;
348 const int len = file->f_dentry->d_name.len;
349 struct md_op_data *op_data;
350 struct ptlrpc_request *req;
351 __u32 opc = LUSTRE_OPC_ANY;
358 /* Usually we come here only for NFSD, and we want open lock.
359 But we can also get here with pre 2.6.15 patchless kernels, and in
360 that case that lock is also ok */
361 /* We can also get here if there was cached open handle in revalidate_it
362 * but it disappeared while we were getting from there to ll_file_open.
363 * But this means this file was closed and immediatelly opened which
364 * makes a good candidate for using OPEN lock */
365 /* If lmmsize & lmm are not 0, we are just setting stripe info
366 * parameters. No need for the open lock */
367 if (lmm == NULL && lmmsize == 0) {
368 itp->it_flags |= MDS_OPEN_LOCK;
369 if (itp->it_flags & FMODE_WRITE)
370 opc = LUSTRE_OPC_CREATE;
373 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
374 file->f_dentry->d_inode, name, len,
377 RETURN(PTR_ERR(op_data));
379 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
380 0 /*unused */, &req, ll_md_blocking_ast, 0);
381 ll_finish_md_op_data(op_data);
383 /* reason for keep own exit path - don`t flood log
384 * with messages with -ESTALE errors.
/* Intent produced an open handle we cannot use (no OPEN disposition
 * or an open error): release it before bailing out. */
386 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
387 it_open_error(DISP_OPEN_OPEN, itp))
389 ll_release_openhandle(file->f_dentry, itp);
393 if (it_disposition(itp, DISP_LOOKUP_NEG))
394 GOTO(out, rc = -ENOENT);
396 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
397 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
398 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* Refresh the inode from the reply, then attach the granted lock. */
402 rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL);
403 if (!rc && itp->d.lustre.it_lock_mode)
404 md_set_lock_data(sbi->ll_md_exp,
405 &itp->d.lustre.it_lock_handle,
406 file->f_dentry->d_inode, NULL);
409 ptlrpc_req_finished(itp->d.lustre.it_data);
410 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
411 ll_intent_drop_lock(itp);
/*
 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
 * not believe attributes if a few ioepoch holders exist. Attributes for
 * previous ioepoch if new one is opened are also skipped by MDS.
 */
421 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
/* Only record a real epoch, and only when it actually changes. */
423 if (ioepoch && lli->lli_ioepoch != ioepoch) {
424 lli->lli_ioepoch = ioepoch;
425 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
426 ioepoch, PFID(&lli->lli_fid));
/*
 * Populate an obd_client_handle from the MDT_BODY in the intent's reply:
 * file handle, magic, fid, open flags, IO epoch. Registers the handle
 * for open replay so the open survives MDS recovery.
 */
430 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
431 struct lookup_intent *it, struct obd_client_handle *och)
433 struct ptlrpc_request *req = it->d.lustre.it_data;
434 struct mdt_body *body;
438 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
439 LASSERT(body != NULL); /* reply already checked out */
441 memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
442 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
443 och->och_fid = lli->lli_fid;
444 och->och_flags = it->it_flags;
445 ll_ioepoch_open(lli, body->ioepoch);
447 return md_set_open_replay_data(md_exp, och, req);
/*
 * Finish the client-local part of an open: optionally fill @och from the
 * intent reply (when this open created the MDS handle), then install @fd
 * as the file's private data, initialize readahead state, and record the
 * open mode. Asserts the file has no private data yet.
 */
450 int ll_local_open(struct file *file, struct lookup_intent *it,
451 struct ll_file_data *fd, struct obd_client_handle *och)
453 struct inode *inode = file->f_dentry->d_inode;
454 struct ll_inode_info *lli = ll_i2info(inode);
457 LASSERT(!LUSTRE_FPRIVATE(file));
/* och != NULL presumably — guarding condition elided in this view. */
462 struct ptlrpc_request *req = it->d.lustre.it_data;
463 struct mdt_body *body;
466 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
470 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
471 if ((it->it_flags & FMODE_WRITE) &&
472 (body->valid & OBD_MD_FLSIZE))
473 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
474 lli->lli_ioepoch, PFID(&lli->lli_fid));
477 LUSTRE_FPRIVATE(file) = fd;
478 ll_readahead_init(inode, &fd->fd_ras);
479 fd->fd_omode = it->it_flags;
483 /* Open a file, and (for the very first open) create objects on the OSTs at
484 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
485 * creation or open until ll_lov_setstripe() ioctl is called. We grab
486 * lli_open_sem to ensure no other process will create objects, send the
487 * stripe MD to the MDS, or try to destroy the objects if that fails.
489 * If we already have the stripe MD locally then we don't request it in
490 * md_open(), by passing a lmm_size = 0.
492 * It is up to the application to ensure no other processes open this file
493 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
494 * used. We might be able to avoid races of that sort by getting lli_open_sem
495 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
496 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
498 int ll_file_open(struct inode *inode, struct file *file)
500 struct ll_inode_info *lli = ll_i2info(inode);
501 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
502 .it_flags = file->f_flags };
503 struct lov_stripe_md *lsm;
504 struct obd_client_handle **och_p = NULL;
505 __u64 *och_usecount = NULL;
506 struct ll_file_data *fd;
507 int rc = 0, opendir_set = 0;
510 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
511 inode->i_generation, inode, file->f_flags);
513 it = file->private_data; /* XXX: compat macro */
514 file->private_data = NULL; /* prevent ll_local_open assertion */
516 fd = ll_file_data_get();
518 GOTO(out_och_free, rc = -ENOMEM);
/* Directory open: claim ownership of the statahead thread for this
 * process if nobody owns it yet (lli_sa_lock serializes the claim). */
521 if (S_ISDIR(inode->i_mode)) {
522 cfs_spin_lock(&lli->lli_sa_lock);
523 if (lli->lli_opendir_key == NULL && lli->lli_opendir_pid == 0) {
524 LASSERT(lli->lli_sai == NULL);
525 lli->lli_opendir_key = fd;
526 lli->lli_opendir_pid = cfs_curproc_pid();
529 cfs_spin_unlock(&lli->lli_sa_lock);
/* Root has no MDS open handle; attach the fd and we are done. */
532 if (inode->i_sb->s_root == file->f_dentry) {
533 LUSTRE_FPRIVATE(file) = fd;
/* No intent from lookup (e.g. NFSD path): build our own IT_OPEN. */
537 if (!it || !it->d.lustre.it_disposition) {
538 /* Convert f_flags into access mode. We cannot use file->f_mode,
539 * because everything but O_ACCMODE mask was stripped from
541 if ((oit.it_flags + 1) & O_ACCMODE)
543 if (file->f_flags & O_TRUNC)
544 oit.it_flags |= FMODE_WRITE;
546 /* kernel only call f_op->open in dentry_open. filp_open calls
547 * dentry_open after call to open_namei that checks permissions.
548 * Only nfsd_open call dentry_open directly without checking
549 * permissions and because of that this code below is safe. */
550 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
551 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
553 /* We do not want O_EXCL here, presumably we opened the file
554 * already? XXX - NFS implications? */
555 oit.it_flags &= ~O_EXCL;
557 /* bug20584, if "it_flags" contains O_CREAT, the file will be
558 * created if necessary, then "IT_CREAT" should be set to keep
559 * consistent with it */
560 if (oit.it_flags & O_CREAT)
561 oit.it_op |= IT_CREAT;
567 /* Let's see if we have file open on MDS already. */
568 if (it->it_flags & FMODE_WRITE) {
569 och_p = &lli->lli_mds_write_och;
570 och_usecount = &lli->lli_open_fd_write_count;
571 } else if (it->it_flags & FMODE_EXEC) {
572 och_p = &lli->lli_mds_exec_och;
573 och_usecount = &lli->lli_open_fd_exec_count;
575 och_p = &lli->lli_mds_read_och;
576 och_usecount = &lli->lli_open_fd_read_count;
579 cfs_down(&lli->lli_och_sem);
580 if (*och_p) { /* Open handle is present */
581 if (it_disposition(it, DISP_OPEN_OPEN)) {
582 /* Well, there's extra open request that we do not need,
583 let's close it somehow. This will decref request. */
584 rc = it_open_error(DISP_OPEN_OPEN, it);
586 cfs_up(&lli->lli_och_sem);
587 GOTO(out_openerr, rc);
590 ll_release_openhandle(file->f_dentry, it);
591 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats,
/* Reuse the cached handle for this fd. */
596 rc = ll_local_open(file, it, fd, NULL);
599 cfs_up(&lli->lli_och_sem);
600 GOTO(out_openerr, rc);
603 LASSERT(*och_usecount == 0);
604 if (!it->d.lustre.it_disposition) {
605 /* We cannot just request lock handle now, new ELC code
606 means that one of other OPEN locks for this file
607 could be cancelled, and since blocking ast handler
608 would attempt to grab och_sem as well, that would
609 result in a deadlock */
610 cfs_up(&lli->lli_och_sem);
611 it->it_create_mode |= M_CHECK_STALE;
612 rc = ll_intent_file_open(file, NULL, 0, it);
613 it->it_create_mode &= ~M_CHECK_STALE;
615 GOTO(out_openerr, rc);
619 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
621 GOTO(out_och_free, rc = -ENOMEM);
625 /* md_intent_lock() didn't get a request ref if there was an
626 * open error, so don't do cleanup on the request here
628 /* XXX (green): Should not we bail out on any error here, not
629 * just open error? */
630 rc = it_open_error(DISP_OPEN_OPEN, it);
632 GOTO(out_och_free, rc);
634 LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
636 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
637 rc = ll_local_open(file, it, fd, *och_p);
639 GOTO(out_och_free, rc);
641 cfs_up(&lli->lli_och_sem);
644 /* Must do this outside lli_och_sem lock to prevent deadlock where
645 different kind of OPEN lock for this same inode gets cancelled
646 by ldlm_cancel_lru */
647 if (!S_ISREG(inode->i_mode))
648 GOTO(out_och_free, rc);
/* O_LOV_DELAY_CREATE or read-only open: defer OST object creation. */
654 if (file->f_flags & O_LOV_DELAY_CREATE ||
655 !(file->f_mode & FMODE_WRITE)) {
656 CDEBUG(D_INODE, "object creation was delayed\n");
657 GOTO(out_och_free, rc);
660 file->f_flags &= ~O_LOV_DELAY_CREATE;
661 GOTO(out_och_free, rc);
/* Cleanup paths: release the intent's request ref, free an unused
 * och slot, stop statahead if we claimed it, and drop the fd. */
664 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
665 ptlrpc_req_finished(it->d.lustre.it_data);
666 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
670 if (och_p && *och_p) {
671 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
672 *och_p = NULL; /* OBD_FREE writes some magic there */
675 cfs_up(&lli->lli_och_sem);
678 if (opendir_set != 0)
679 ll_stop_statahead(inode, lli->lli_opendir_key);
681 ll_file_data_put(fd);
687 /* Fills the obdo with the attributes for the lsm */
688 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
689 struct obd_capa *capa, struct obdo *obdo,
690 __u64 ioepoch, int sync)
692 struct ptlrpc_request_set *set;
693 struct obd_info oinfo = { { { 0 } } };
698 LASSERT(lsm != NULL);
/* Identify the object and declare which attributes we want back. */
702 oinfo.oi_oa->o_id = lsm->lsm_object_id;
703 oinfo.oi_oa->o_seq = lsm->lsm_object_seq;
704 oinfo.oi_oa->o_mode = S_IFREG;
705 oinfo.oi_oa->o_ioepoch = ioepoch;
706 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
707 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
708 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
709 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
710 OBD_MD_FLGROUP | OBD_MD_FLEPOCH;
711 oinfo.oi_capa = capa;
/* @sync != 0 presumably gates this: ask the OST to take its own lock. */
713 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
714 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
717 set = ptlrpc_prep_set();
719 CERROR("can't allocate ptlrpc set\n");
/* Fire the getattr to all stripes and wait for completion. */
722 rc = obd_getattr_async(exp, &oinfo, set);
724 rc = ptlrpc_set_wait(set);
725 ptlrpc_set_destroy(set);
/* Keep only the attribute bits the caller can trust from the OSTs. */
728 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
729 OBD_MD_FLATIME | OBD_MD_FLMTIME |
730 OBD_MD_FLCTIME | OBD_MD_FLSIZE);
/*
 * Performs the getattr on the inode and updates its fields.
 * If @sync != 0, perform the getattr under the server-side lock.
 */
738 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
739 __u64 ioepoch, int sync)
741 struct ll_inode_info *lli = ll_i2info(inode);
742 struct obd_capa *capa = ll_mdscapa_get(inode);
746 rc = ll_lsm_getattr(lli->lli_smd, ll_i2dtexp(inode),
747 capa, obdo, ioepoch, sync);
/* On success, fold the OST-supplied attributes into the inode. */
750 obdo_refresh_inode(inode, obdo, obdo->o_valid);
752 "objid "LPX64" size %Lu, blocks %llu, blksize %lu\n",
753 lli->lli_smd->lsm_object_id, i_size_read(inode),
754 (unsigned long long)inode->i_blocks,
755 (unsigned long)ll_inode_blksize(inode));
/*
 * Merge the most recent attributes known to the client into the inode:
 * start from the MDS-provided timestamps cached in lli_lvb, let the LOV
 * layer merge in per-stripe OST values, then write size/blocks/times
 * back to the inode under the inode size lock.
 */
760 int ll_merge_lvb(struct inode *inode)
762 struct ll_inode_info *lli = ll_i2info(inode);
763 struct ll_sb_info *sbi = ll_i2sbi(inode);
769 ll_inode_size_lock(inode, 1);
770 inode_init_lvb(inode, &lvb);
772 /* merge timestamps the most resently obtained from mds with
773 timestamps obtained from osts */
774 lvb.lvb_atime = lli->lli_lvb.lvb_atime;
775 lvb.lvb_mtime = lli->lli_lvb.lvb_mtime;
776 lvb.lvb_ctime = lli->lli_lvb.lvb_ctime;
777 rc = obd_merge_lvb(sbi->ll_dt_exp, lli->lli_smd, &lvb, 0);
778 cl_isize_write_nolock(inode, lvb.lvb_size);
780 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
781 PFID(&lli->lli_fid), lvb.lvb_size);
782 inode->i_blocks = lvb.lvb_blocks;
784 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
785 LTIME_S(inode->i_atime) = lvb.lvb_atime;
786 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
787 ll_inode_size_unlock(inode, 1);
/*
 * ioctl helper: fetch current size/blocks/times for @lsm from the OSTs
 * (no server-side lock, epoch 0) and copy them into the caller's stat
 * structure @st.
 */
792 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
795 struct obdo obdo = { 0 };
798 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
800 st->st_size = obdo.o_size;
801 st->st_blocks = obdo.o_blocks;
802 st->st_mtime = obdo.o_mtime;
803 st->st_atime = obdo.o_atime;
804 st->st_ctime = obdo.o_ctime;
/*
 * Initialize a cl_io from the struct file's flags for a read (@write == 0)
 * or write (@write != 0): non-blocking and append modes, the target clio
 * object, and the lock requirement — never lock for nolock files (with
 * server-side locking instead), mandatory lock for O_APPEND writes,
 * otherwise let the lower layers decide.
 */
809 void ll_io_init(struct cl_io *io, const struct file *file, int write)
811 struct inode *inode = file->f_dentry->d_inode;
813 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
815 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
816 io->ci_obj = ll_i2info(inode)->lli_clob;
817 io->ci_lockreq = CILR_MAYBE;
818 if (ll_file_nolock(file)) {
819 io->ci_lockreq = CILR_NEVER;
820 io->ci_no_srvlock = 1;
821 } else if (file->f_flags & O_APPEND) {
822 io->ci_lockreq = CILR_MANDATORY;
/*
 * Common driver for all file IO entry points (read/write/aio/sendfile/
 * splice). Sets up a cl_io of type @iot over [*ppos, *ppos + count),
 * copies the per-subtype arguments from @args into the vvp/ccc IO
 * structures, runs the clio loop, then updates *ppos, the byte-count
 * stats, and the sticky per-inode write error (lli_write_rc).
 *
 * Locking: non-grouplock writes take lli_write_sem (interruptible);
 * normal reads take lli_trunc_sem shared, to serialize against truncate.
 */
826 static ssize_t ll_file_io_generic(const struct lu_env *env,
827 struct vvp_io_args *args, struct file *file,
828 enum cl_io_type iot, loff_t *ppos, size_t count)
830 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
835 io = ccc_env_thread_io(env);
836 ll_io_init(io, file, iot == CIT_WRITE);
838 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
839 struct vvp_io *vio = vvp_env_io(env);
840 struct ccc_io *cio = ccc_env_io(env);
841 int write_sem_locked = 0;
843 cio->cui_fd = LUSTRE_FPRIVATE(file);
844 vio->cui_io_subtype = args->via_io_subtype;
/* Dispatch on IO subtype: normal iovec IO, sendfile, or splice. */
846 switch (vio->cui_io_subtype) {
848 cio->cui_iov = args->u.normal.via_iov;
849 cio->cui_nrsegs = args->u.normal.via_nrsegs;
850 cio->cui_tot_nrsegs = cio->cui_nrsegs;
851 #ifndef HAVE_FILE_WRITEV
852 cio->cui_iocb = args->u.normal.via_iocb;
854 if ((iot == CIT_WRITE) &&
855 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
856 if(cfs_down_interruptible(&lli->lli_write_sem))
857 GOTO(out, result = -ERESTARTSYS);
858 write_sem_locked = 1;
859 } else if (iot == CIT_READ) {
860 cfs_down_read(&lli->lli_trunc_sem);
864 vio->u.sendfile.cui_actor = args->u.sendfile.via_actor;
865 vio->u.sendfile.cui_target = args->u.sendfile.via_target;
868 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
869 vio->u.splice.cui_flags = args->u.splice.via_flags;
872 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
875 result = cl_io_loop(env, io);
876 if (write_sem_locked)
877 cfs_up(&lli->lli_write_sem);
878 else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
879 cfs_up_read(&lli->lli_trunc_sem);
881 /* cl_io_rw_init() handled IO */
882 result = io->ci_result;
/* Advance the file position by whatever the IO actually moved. */
885 if (io->ci_nob > 0) {
887 *ppos = io->u.ci_wr.wr.crw_pos;
893 if (iot == CIT_READ) {
895 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
896 LPROC_LL_READ_BYTES, result);
897 } else if (iot == CIT_WRITE) {
899 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
900 LPROC_LL_WRITE_BYTES, result);
/* Successful write clears the sticky error; failure records it. */
901 lli->lli_write_rc = 0;
903 lli->lli_write_rc = result;
/*
 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
 * Validate an iovec array and compute the total byte count: rejects
 * negative segment lengths and cumulative overflow; on an inaccessible
 * user segment, truncates *nr_segs/count to the segments before it.
 */
914 static int ll_file_get_iov_count(const struct iovec *iov,
915 unsigned long *nr_segs, size_t *count)
920 for (seg = 0; seg < *nr_segs; seg++) {
921 const struct iovec *iv = &iov[seg];
924 * If any segment has a negative length, or the cumulative
925 * length ever wraps negative then return -EINVAL.
928 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
930 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
935 cnt -= iv->iov_len; /* This segment is no good */
942 #ifdef HAVE_FILE_READV
/*
 * readv entry point (pre-aio kernels): validate the iovec, stash it in
 * the per-thread vvp args, and run a CIT_READ through
 * ll_file_io_generic().
 */
943 static ssize_t ll_file_readv(struct file *file, const struct iovec *iov,
944 unsigned long nr_segs, loff_t *ppos)
947 struct vvp_io_args *args;
953 result = ll_file_get_iov_count(iov, &nr_segs, &count);
957 env = cl_env_get(&refcheck);
959 RETURN(PTR_ERR(env));
961 args = vvp_env_args(env, IO_NORMAL);
962 args->u.normal.via_iov = (struct iovec *)iov;
963 args->u.normal.via_nrsegs = nr_segs;
965 result = ll_file_io_generic(env, args, file, CIT_READ, ppos, count);
966 cl_env_put(env, &refcheck);
/*
 * read entry point (readv-based kernels): wrap the user buffer in a
 * single-segment iovec and delegate to ll_file_readv().
 */
970 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
974 struct iovec *local_iov;
979 env = cl_env_get(&refcheck);
981 RETURN(PTR_ERR(env));
983 local_iov = &vvp_env_info(env)->vti_local_iov;
984 local_iov->iov_base = (void __user *)buf;
985 local_iov->iov_len = count;
986 result = ll_file_readv(file, local_iov, 1, ppos);
987 cl_env_put(env, &refcheck);
/*
 * aio_read entry point: validate the iovec, record it plus the kiocb in
 * the per-thread vvp args, and run a CIT_READ at iocb->ki_pos through
 * ll_file_io_generic().
 */
992 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
993 unsigned long nr_segs, loff_t pos)
996 struct vvp_io_args *args;
1002 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1006 env = cl_env_get(&refcheck);
1008 RETURN(PTR_ERR(env));
1010 args = vvp_env_args(env, IO_NORMAL);
1011 args->u.normal.via_iov = (struct iovec *)iov;
1012 args->u.normal.via_nrsegs = nr_segs;
1013 args->u.normal.via_iocb = iocb;
1015 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1016 &iocb->ki_pos, count);
1017 cl_env_put(env, &refcheck);
/*
 * read entry point (aio-based kernels): build a synchronous kiocb and a
 * single-segment iovec around the user buffer, call ll_file_aio_read(),
 * then propagate the updated position back to *ppos.
 */
1021 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1025 struct iovec *local_iov;
1026 struct kiocb *kiocb;
1031 env = cl_env_get(&refcheck);
1033 RETURN(PTR_ERR(env));
1035 local_iov = &vvp_env_info(env)->vti_local_iov;
1036 kiocb = &vvp_env_info(env)->vti_kiocb;
1037 local_iov->iov_base = (void __user *)buf;
1038 local_iov->iov_len = count;
1039 init_sync_kiocb(kiocb, file);
1040 kiocb->ki_pos = *ppos;
1041 kiocb->ki_left = count;
1043 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1044 *ppos = kiocb->ki_pos;
1046 cl_env_put(env, &refcheck);
/*
 * Write to a file (through the page cache).
 */
1054 #ifdef HAVE_FILE_WRITEV
/*
 * writev entry point (pre-aio kernels): validate the iovec and run a
 * CIT_WRITE through ll_file_io_generic().
 */
1055 static ssize_t ll_file_writev(struct file *file, const struct iovec *iov,
1056 unsigned long nr_segs, loff_t *ppos)
1059 struct vvp_io_args *args;
1065 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1069 env = cl_env_get(&refcheck);
1071 RETURN(PTR_ERR(env));
1073 args = vvp_env_args(env, IO_NORMAL);
1074 args->u.normal.via_iov = (struct iovec *)iov;
1075 args->u.normal.via_nrsegs = nr_segs;
1077 result = ll_file_io_generic(env, args, file, CIT_WRITE, ppos, count);
1078 cl_env_put(env, &refcheck);
/*
 * write entry point (writev-based kernels): wrap the user buffer in a
 * single-segment iovec and delegate to ll_file_writev().
 */
1082 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1086 struct iovec *local_iov;
1091 env = cl_env_get(&refcheck);
1093 RETURN(PTR_ERR(env));
1095 local_iov = &vvp_env_info(env)->vti_local_iov;
1096 local_iov->iov_base = (void __user *)buf;
1097 local_iov->iov_len = count;
1099 result = ll_file_writev(file, local_iov, 1, ppos);
1100 cl_env_put(env, &refcheck);
1104 #else /* AIO stuff */
/*
 * aio_write entry point: validate the iovec, record it plus the kiocb
 * in the per-thread vvp args, and run a CIT_WRITE at iocb->ki_pos
 * through ll_file_io_generic().
 */
1105 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1106 unsigned long nr_segs, loff_t pos)
1109 struct vvp_io_args *args;
1115 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1119 env = cl_env_get(&refcheck);
1121 RETURN(PTR_ERR(env));
1123 args = vvp_env_args(env, IO_NORMAL);
1124 args->u.normal.via_iov = (struct iovec *)iov;
1125 args->u.normal.via_nrsegs = nr_segs;
1126 args->u.normal.via_iocb = iocb;
1128 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1129 &iocb->ki_pos, count);
1130 cl_env_put(env, &refcheck);
/*
 * write entry point (aio-based kernels): build a synchronous kiocb and a
 * single-segment iovec around the user buffer, call ll_file_aio_write(),
 * then propagate the updated position back to *ppos.
 */
1134 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1138 struct iovec *local_iov;
1139 struct kiocb *kiocb;
1144 env = cl_env_get(&refcheck);
1146 RETURN(PTR_ERR(env));
1148 local_iov = &vvp_env_info(env)->vti_local_iov;
1149 kiocb = &vvp_env_info(env)->vti_kiocb;
1150 local_iov->iov_base = (void __user *)buf;
1151 local_iov->iov_len = count;
1152 init_sync_kiocb(kiocb, file);
1153 kiocb->ki_pos = *ppos;
1154 kiocb->ki_left = count;
1156 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1157 *ppos = kiocb->ki_pos;
1159 cl_env_put(env, &refcheck);
1165 #ifdef HAVE_KERNEL_SENDFILE
/*
 * Send file content (through pagecache) somewhere with helper:
 * sendfile entry point — record the actor callback and target in the
 * per-thread vvp args and run a CIT_READ of subtype IO_SENDFILE.
 */
1169 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1170 read_actor_t actor, void *target)
1173 struct vvp_io_args *args;
1178 env = cl_env_get(&refcheck);
1180 RETURN(PTR_ERR(env));
1182 args = vvp_env_args(env, IO_SENDFILE);
1183 args->u.sendfile.via_target = target;
1184 args->u.sendfile.via_actor = actor;
1186 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1187 cl_env_put(env, &refcheck);
1192 #ifdef HAVE_KERNEL_SPLICE_READ
/*
 * Send file content (through pagecache) somewhere with helper:
 * splice_read entry point — record the destination pipe and flags in
 * the per-thread vvp args and run a CIT_READ of subtype IO_SPLICE.
 */
1196 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1197 struct pipe_inode_info *pipe, size_t count,
1201 struct vvp_io_args *args;
1206 env = cl_env_get(&refcheck);
1208 RETURN(PTR_ERR(env));
1210 args = vvp_env_args(env, IO_SPLICE);
1211 args->u.splice.via_pipe = pipe;
1212 args->u.splice.via_flags = flags;
1214 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1215 cl_env_put(env, &refcheck);
/*
 * Recreate lost OST objects for this inode's stripe md: clone the
 * current lsm, mark the obdo with OBD_FL_RECREATE_OBJS and the target
 * OST index (carried in o_nlink), and call obd_create() under the inode
 * size lock so no IO changes the layout concurrently.
 * NOTE(review): @id/@seq appear intended to seed the obdo's object id —
 * the assignment lines are elided from this view; confirm in full source.
 */
1220 static int ll_lov_recreate(struct inode *inode, obd_id id, obd_seq seq,
1223 struct obd_export *exp = ll_i2dtexp(inode);
1224 struct obd_trans_info oti = { 0 };
1225 struct obdo *oa = NULL;
1228 struct lov_stripe_md *lsm, *lsm2;
1235 ll_inode_size_lock(inode, 0);
1236 lsm = ll_i2info(inode)->lli_smd;
1238 GOTO(out, rc = -ENOENT);
1239 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1240 (lsm->lsm_stripe_count));
1242 OBD_ALLOC_LARGE(lsm2, lsm_size);
1244 GOTO(out, rc = -ENOMEM);
/* o_nlink is overloaded here to carry the target OST index. */
1248 oa->o_nlink = ost_idx;
1249 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1250 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1251 obdo_from_inode(oa, inode, &ll_i2info(inode)->lli_fid, OBD_MD_FLTYPE |
1252 OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1253 memcpy(lsm2, lsm, lsm_size);
1254 rc = obd_create(exp, oa, &lsm2, &oti);
1256 OBD_FREE_LARGE(lsm2, lsm_size);
1259 ll_inode_size_unlock(inode, 0);
/*
 * ioctl handler (LL_IOC_RECREATE_OBJ): copy a ll_recreate_obj request
 * from userspace and recreate the named object on the given OST index.
 * Requires CAP_SYS_ADMIN.
 */
1264 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1266 struct ll_recreate_obj ucreat;
1269 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1272 if (cfs_copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1273 sizeof(struct ll_recreate_obj)))
1276 RETURN(ll_lov_recreate(inode, ucreat.lrc_id, 0,
1277 ucreat.lrc_ost_idx));
/*
 * ioctl handler (LL_IOC_RECREATE_FID): copy a lu_fid from userspace and
 * recreate the corresponding OST object. The legacy object id / OST
 * index are unpacked from the fid's oid and sequence fields.
 * Requires CAP_SYS_ADMIN.
 */
1280 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1287 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1290 if (cfs_copy_from_user(&fid, (struct lu_fid *)arg,
1291 sizeof(struct lu_fid)))
/* Unpack legacy id/ost_idx from the IDIF-style fid encoding. */
1294 id = fid_oid(&fid) | ((fid_seq(&fid) & 0xffff) << 32);
1295 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1296 RETURN(ll_lov_recreate(inode, id, 0, ost_idx));
/*
 * Apply user-supplied striping (@lum, @lum_size) to @inode by doing an
 * intent open that carries the layout. Fails early if a stripe md
 * already exists (layout can only be set once, before first write).
 * On success releases the transient open handle; all exits release the
 * intent and its request.
 */
1299 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1300 int flags, struct lov_user_md *lum, int lum_size)
1302 struct lov_stripe_md *lsm;
1303 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1307 ll_inode_size_lock(inode, 0);
1308 lsm = ll_i2info(inode)->lli_smd;
/* Striping already set: nothing to do (returns busy/exists upstream —
 * exact rc elided from this view). */
1310 ll_inode_size_unlock(inode, 0);
1311 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1316 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1319 rc = oit.d.lustre.it_status;
1321 GOTO(out_req_free, rc);
1323 ll_release_openhandle(file->f_dentry, &oit);
1326 ll_inode_size_unlock(inode, 0);
1327 ll_intent_release(&oit);
1330 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA for @filename (a child of @inode) from the MDS.
 * On success *lmmp points into the reply buffer (owned by *request, which
 * the caller must eventually ptlrpc_req_finished()) and *lmm_size is set.
 * The EA is byte-swapped to host endianness on big-endian clients.
 */
1334 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1335 struct lov_mds_md **lmmp, int *lmm_size,
1336 struct ptlrpc_request **request)
1338 struct ll_sb_info *sbi = ll_i2sbi(inode);
1339 struct mdt_body *body;
1340 struct lov_mds_md *lmm = NULL;
1341 struct ptlrpc_request *req = NULL;
1342 struct md_op_data *op_data;
/* size the getattr reply buffer for the largest possible EA */
1345 rc = ll_get_max_mdsize(sbi, &lmmsize);
1349 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1350 strlen(filename), lmmsize,
1351 LUSTRE_OPC_ANY, NULL);
1352 if (op_data == NULL)
1355 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1356 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1357 ll_finish_md_op_data(op_data);
1359 CDEBUG(D_INFO, "md_getattr_name failed "
1360 "on %s: rc %d\n", filename, rc);
1364 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1365 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1367 lmmsize = body->eadatasize;
/* no striping EA present (or empty) on this object */
1369 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1371 GOTO(out, rc = -ENODATA);
1374 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1375 LASSERT(lmm != NULL);
1377 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1378 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1379 GOTO(out, rc = -EPROTO);
1383 * This is coming from the MDS, so is probably in
1384 * little endian. We convert it to host endian before
1385 * passing it to userspace.
/* no-op on little-endian hosts: swab only when LE != host order */
1387 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1388 /* if function called for directory - we should
1389 * avoid swab not existent lsm objects */
1390 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1391 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1392 if (S_ISREG(body->mode))
1393 lustre_swab_lov_user_md_objects(
1394 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1395 ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
1396 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1397 lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1398 if (S_ISREG(body->mode))
1399 lustre_swab_lov_user_md_objects(
1400 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1401 ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
1407 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: install a raw striping EA (including explicit
 * object references, hence MDS_OPEN_HAS_OBJS).  Root-only.  The user buffer
 * is a lov_user_md followed by one lov_user_ost_data entry.
 */
1412 static int ll_lov_setea(struct inode *inode, struct file *file,
1415 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1416 struct lov_user_md *lump;
1417 int lum_size = sizeof(struct lov_user_md) +
1418 sizeof(struct lov_user_ost_data);
1422 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1425 OBD_ALLOC_LARGE(lump, lum_size);
1429 if (cfs_copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1430 OBD_FREE_LARGE(lump, lum_size);
1434 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1436 OBD_FREE_LARGE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy in a v1 or v3 lov_user_md from
 * userspace and set the file's striping.  On success the kernel-chosen
 * stripe count is reported back by re-reading the stripe via
 * LL_IOC_LOV_GETSTRIPE into the user's buffer.
 */
1440 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1443 struct lov_user_md_v3 lumv3;
1444 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1445 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1446 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1449 int flags = FMODE_WRITE;
1452 /* first try with v1 which is smaller than v3 */
1453 lum_size = sizeof(struct lov_user_md_v1);
1454 if (cfs_copy_from_user(lumv1, lumv1p, lum_size))
/* v3 magic means the caller passed the larger structure: re-copy it all */
1457 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1458 lum_size = sizeof(struct lov_user_md_v3);
1459 if (cfs_copy_from_user(&lumv3, lumv3p, lum_size))
1463 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
/* zero first so a short/failed getstripe doesn't report stale data */
1465 put_user(0, &lumv1p->lmm_stripe_count);
1466 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1467 0, ll_i2info(inode)->lli_smd,
/*
 * LL_IOC_LOV_GETSTRIPE handler: copy the file's striping information out
 * to userspace via the LOV obd_iocontrol path.
 * NOTE(review): the lsm NULL check and RETURN lines are elided here.
 */
1473 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1475 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1480 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
/*
 * LL_IOC_GROUP_LOCK handler: take a cluster-wide group lock with gid @arg
 * on behalf of this file descriptor.  Only one group lock per fd; the
 * lli_lock spinlock is dropped across the (blocking) cl_get_grouplock()
 * call, so the flag is re-checked afterwards to resolve races.
 */
1485 int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1487 struct ll_inode_info *lli = ll_i2info(inode);
1488 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1489 struct ccc_grouplock grouplock;
/* group locks are meaningless on -o nolock mounts */
1493 if (ll_file_nolock(file))
1494 RETURN(-EOPNOTSUPP);
1496 cfs_spin_lock(&lli->lli_lock);
1497 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1498 CWARN("group lock already existed with gid %lu\n",
1499 fd->fd_grouplock.cg_gid);
1500 cfs_spin_unlock(&lli->lli_lock);
1503 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1504 cfs_spin_unlock(&lli->lli_lock);
/* may block unless O_NONBLOCK was set on the file */
1506 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1507 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* another thread may have taken the lock while we slept */
1511 cfs_spin_lock(&lli->lli_lock);
1512 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1513 cfs_spin_unlock(&lli->lli_lock);
1514 CERROR("another thread just won the race\n");
1515 cl_put_grouplock(&grouplock);
1519 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1520 fd->fd_grouplock = grouplock;
1521 cfs_spin_unlock(&lli->lli_lock);
1523 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK handler: release the group lock with gid @arg held
 * by this file descriptor.  Fails (with a warning) if no group lock is
 * held or the gid does not match; the lock is actually dropped outside
 * the lli_lock spinlock.
 */
1527 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1529 struct ll_inode_info *lli = ll_i2info(inode);
1530 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1531 struct ccc_grouplock grouplock;
1534 cfs_spin_lock(&lli->lli_lock);
1535 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1536 cfs_spin_unlock(&lli->lli_lock);
1537 CWARN("no group lock held\n");
1540 LASSERT(fd->fd_grouplock.cg_lock != NULL);
1542 if (fd->fd_grouplock.cg_gid != arg) {
1543 CWARN("group lock %lu doesn't match current id %lu\n",
1544 arg, fd->fd_grouplock.cg_gid);
1545 cfs_spin_unlock(&lli->lli_lock);
/* detach the lock from the fd under the spinlock, release it after */
1549 grouplock = fd->fd_grouplock;
1550 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1551 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1552 cfs_spin_unlock(&lli->lli_lock);
1554 cl_put_grouplock(&grouplock);
1555 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1560 * Close inode open handle
1562 * \param dentry [in] dentry which contains the inode
1563 * \param it [in,out] intent which contains open info and result
1566 * \retval <0 failure
/*
 * Close the MDS open handle carried by @it for @dentry's inode.
 * No-op for the filesystem root or when the intent holds no open
 * disposition.  Also drops the intent's enqueue reference if present.
 * NOTE(review): och cleanup and the final RETURN are elided from this
 * excerpt.
 */
1568 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1570 struct inode *inode = dentry->d_inode;
1571 struct obd_client_handle *och;
1577 /* Root ? Do nothing. */
1578 if (dentry->d_inode->i_sb->s_root == dentry)
1581 /* No open handle to close? Move away */
1582 if (!it_disposition(it, DISP_OPEN_OPEN))
1585 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1587 OBD_ALLOC(och, sizeof(*och));
1589 GOTO(out, rc = -ENOMEM);
/* populate the client handle from the intent's open reply */
1591 ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
1592 ll_i2info(inode), it, och);
1594 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1597 /* this one is in place of ll_file_open */
1598 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1599 ptlrpc_req_finished(it->d.lustre.it_data);
1600 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1606 * Get size for inode for which FIEMAP mapping is requested.
1607 * Make the FIEMAP get_info call and return the result.
/*
 * Core FIEMAP implementation: validate the request flags, optionally sync
 * dirty pages, stamp the object id/seq into the fiemap key and fetch the
 * extent mapping from the OSTs via obd_get_info(KEY_FIEMAP).
 * @fiemap is both input (request) and output (mapped extents);
 * @num_bytes bounds the reply size.
 */
1609 int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1612 struct obd_export *exp = ll_i2dtexp(inode);
1613 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1614 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1615 int vallen = num_bytes;
1619 /* Checks for fiemap flags */
1620 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* report the unsupported flags back to the caller before failing */
1621 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1625 /* Check for FIEMAP_FLAG_SYNC */
1626 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1627 rc = filemap_fdatawrite(inode->i_mapping);
1632 /* If the stripe_count > 1 and the application does not understand
1633 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1635 if (lsm->lsm_stripe_count > 1 &&
1636 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1639 fm_key.oa.o_id = lsm->lsm_object_id;
1640 fm_key.oa.o_seq = lsm->lsm_object_seq;
1641 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1643 obdo_from_inode(&fm_key.oa, inode, &ll_i2info(inode)->lli_fid,
1645 /* If filesize is 0, then there would be no objects for mapping */
1646 if (fm_key.oa.o_size == 0) {
1647 fiemap->fm_mapped_extents = 0;
1651 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1653 rc = obd_get_info(exp, sizeof(fm_key), &fm_key, &vallen, fiemap, lsm);
1655 CERROR("obd_get_info failed: rc = %d\n", rc);
/*
 * OBD_IOC_FID2PATH handler: translate a FID into a pathname.  Reads the
 * caller's getinfo_fid2path header to learn gf_pathlen, allocates an
 * output buffer big enough for the path, forwards the request to the MDC
 * and copies the result back to userspace.
 * NOTE(review): the free of gfin and several error RETURNs are elided
 * from this excerpt.
 */
1660 int ll_fid2path(struct obd_export *exp, void *arg)
1662 struct getinfo_fid2path *gfout, *gfin;
1666 /* Need to get the buflen */
1667 OBD_ALLOC_PTR(gfin);
1670 if (cfs_copy_from_user(gfin, arg, sizeof(*gfin))) {
1675 outsize = sizeof(*gfout) + gfin->gf_pathlen;
1676 OBD_ALLOC(gfout, outsize);
1677 if (gfout == NULL) {
/* copy the request header (fid, recno, pathlen) into the big buffer */
1681 memcpy(gfout, gfin, sizeof(*gfout));
1684 /* Call mdc_iocontrol */
1685 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1688 if (cfs_copy_to_user(arg, gfout, outsize))
1692 OBD_FREE(gfout, outsize);
1696 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1698 struct ll_user_fiemap *fiemap_s;
1699 size_t num_bytes, ret_bytes;
1700 unsigned int extent_count;
1703 /* Get the extent count so we can calculate the size of
1704 * required fiemap buffer */
1705 if (get_user(extent_count,
1706 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1708 num_bytes = sizeof(*fiemap_s) + (extent_count *
1709 sizeof(struct ll_fiemap_extent));
1711 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1712 if (fiemap_s == NULL)
1715 /* get the fiemap value */
1716 if (copy_from_user(fiemap_s,(struct ll_user_fiemap __user *)arg,
1718 GOTO(error, rc = -EFAULT);
1720 /* If fm_extent_count is non-zero, read the first extent since
1721 * it is used to calculate end_offset and device from previous
1724 if (copy_from_user(&fiemap_s->fm_extents[0],
1725 (char __user *)arg + sizeof(*fiemap_s),
1726 sizeof(struct ll_fiemap_extent)))
1727 GOTO(error, rc = -EFAULT);
1730 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1734 ret_bytes = sizeof(struct ll_user_fiemap);
1736 if (extent_count != 0)
1737 ret_bytes += (fiemap_s->fm_mapped_extents *
1738 sizeof(struct ll_fiemap_extent));
1740 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1744 OBD_FREE_LARGE(fiemap_s, num_bytes);
/*
 * Main file ioctl dispatcher for Lustre regular files.  Compiled with
 * either the unlocked_ioctl (struct file only) or legacy ioctl (inode +
 * file) signature depending on the kernel.  Dispatches LL_IOC_*,
 * FSFILT_IOC_* and OBD_IOC_* commands; anything unrecognised falls
 * through to the dynamic ioctl table and finally to obd_iocontrol().
 * NOTE(review): this excerpt is heavily elided — many RETURN lines and
 * several case bodies are not visible.
 */
1748 #ifdef HAVE_UNLOCKED_IOCTL
1749 long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1751 struct inode *inode = file->f_dentry->d_inode;
1753 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
1757 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1761 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
1762 inode->i_generation, inode, cmd);
1763 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
1765 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
1766 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
1770 case LL_IOC_GETFLAGS:
1771 /* Get the current value of the file flags */
1772 return put_user(fd->fd_flags, (int *)arg);
1773 case LL_IOC_SETFLAGS:
1774 case LL_IOC_CLRFLAGS:
1775 /* Set or clear specific file flags */
1776 /* XXX This probably needs checks to ensure the flags are
1777 * not abused, and to handle any flag side effects.
1779 if (get_user(flags, (int *) arg))
/* LL_FILE_IGNORE_LOCK is only safe with O_DIRECT I/O */
1782 if (cmd == LL_IOC_SETFLAGS) {
1783 if ((flags & LL_FILE_IGNORE_LOCK) &&
1784 !(file->f_flags & O_DIRECT)) {
1785 CERROR("%s: unable to disable locking on "
1786 "non-O_DIRECT file\n", current->comm);
1790 fd->fd_flags |= flags;
1792 fd->fd_flags &= ~flags;
1795 case LL_IOC_LOV_SETSTRIPE:
1796 RETURN(ll_lov_setstripe(inode, file, arg));
1797 case LL_IOC_LOV_SETEA:
1798 RETURN(ll_lov_setea(inode, file, arg));
1799 case LL_IOC_LOV_GETSTRIPE:
1800 RETURN(ll_lov_getstripe(inode, arg));
1801 case LL_IOC_RECREATE_OBJ:
1802 RETURN(ll_lov_recreate_obj(inode, arg));
1803 case LL_IOC_RECREATE_FID:
1804 RETURN(ll_lov_recreate_fid(inode, arg));
1805 case FSFILT_IOC_FIEMAP:
1806 RETURN(ll_ioctl_fiemap(inode, arg));
1807 case FSFILT_IOC_GETFLAGS:
1808 case FSFILT_IOC_SETFLAGS:
1809 RETURN(ll_iocontrol(inode, file, cmd, arg));
1810 case FSFILT_IOC_GETVERSION_OLD:
1811 case FSFILT_IOC_GETVERSION:
1812 RETURN(put_user(inode->i_generation, (int *)arg));
1813 case LL_IOC_GROUP_LOCK:
1814 RETURN(ll_get_grouplock(inode, file, arg));
1815 case LL_IOC_GROUP_UNLOCK:
1816 RETURN(ll_put_grouplock(inode, file, arg));
1817 case IOC_OBD_STATFS:
1818 RETURN(ll_obd_statfs(inode, (void *)arg));
1820 /* We need to special case any other ioctls we want to handle,
1821 * to send them to the MDS/OST as appropriate and to properly
1822 * network encode the arg field.
1823 case FSFILT_IOC_SETVERSION_OLD:
1824 case FSFILT_IOC_SETVERSION:
1826 case LL_IOC_FLUSHCTX:
1827 RETURN(ll_flush_ctx(inode));
1828 case LL_IOC_PATH2FID: {
1829 if (cfs_copy_to_user((void *)arg, ll_inode2fid(inode),
1830 sizeof(struct lu_fid)))
1835 case OBD_IOC_FID2PATH:
1836 RETURN(ll_fid2path(ll_i2mdexp(inode), (void *)arg));
1838 case LL_IOC_GET_MDTIDX: {
1841 mdtidx = ll_get_mdt_idx(inode);
1845 if (put_user((int)mdtidx, (int*)arg))
/* default: try dynamically-registered handlers, then the obd layer */
1855 ll_iocontrol_call(inode, file, cmd, arg, &err))
1858 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
/*
 * llseek for Lustre files.  For SEEK_END the current file size must be
 * fetched from the OSTs (glimpse) before the offset can be resolved; the
 * result is range-checked against the filesystem's maximum file size.
 * NOTE(review): 'nonblock' is computed from O_NONBLOCK but is not passed
 * to cl_glimpse_size() in the visible code — looks like dead code or an
 * elided call variant; confirm against the full source.
 */
1864 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
1866 struct inode *inode = file->f_dentry->d_inode;
/* retval computed here only for the trace message below */
1869 retval = offset + ((origin == 2) ? i_size_read(inode) :
1870 (origin == 1) ? file->f_pos : 0);
1871 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%Lu=%#Lx(%s)\n",
1872 inode->i_ino, inode->i_generation, inode, retval, retval,
1873 origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR" : "SEEK_SET");
1874 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
1876 if (origin == 2) { /* SEEK_END */
1877 int nonblock = 0, rc;
1879 if (file->f_flags & O_NONBLOCK)
1880 nonblock = LDLM_FL_BLOCK_NOWAIT;
/* refresh i_size from the OSTs before using it */
1882 rc = cl_glimpse_size(inode);
1886 offset += i_size_read(inode);
1887 } else if (origin == 1) { /* SEEK_CUR */
1888 offset += file->f_pos;
1892 if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
1893 if (offset != file->f_pos) {
1894 file->f_pos = offset;
/*
 * .flush handler (called on close()).  Reports any write error recorded
 * for this inode: either a previously-seen synchronous write failure
 * (lli_write_rc) or async writeback errors latched in lli_async_rc and
 * the per-stripe async rc.  Both async sources are read-and-cleared.
 * Returns -EIO if any error was pending, 0 otherwise.
 */
1902 #ifdef HAVE_FLUSH_OWNER_ID
1903 int ll_flush(struct file *file, fl_owner_t id)
1905 int ll_flush(struct file *file)
1908 struct inode *inode = file->f_dentry->d_inode;
1909 struct ll_inode_info *lli = ll_i2info(inode);
1910 struct lov_stripe_md *lsm = lli->lli_smd;
1913 /* the application should know write failure already. */
1914 if (lli->lli_write_rc)
1917 /* catch async errors that were recorded back when async writeback
1918 * failed for pages in this mapping. */
1919 rc = lli->lli_async_rc;
1920 lli->lli_async_rc = 0;
1922 err = lov_test_and_clear_async_rc(lsm);
1927 return rc ? -EIO : 0;
/*
 * fsync handler: wait for in-flight page I/O, collect latched async
 * writeback errors, sync metadata through the MDC (md_sync) and then data
 * through the OSTs (obd_sync_rqset), recording any data-sync failure in
 * lli_write_rc so a later ll_flush()/close() can report it.
 * FIX(review): the original recorded the error as "err < 0 ? : 0" — the
 * GNU elvis operator stores the boolean 1, discarding the actual error
 * code.  Store 'err' itself; truth value (and thus ll_flush behaviour)
 * is unchanged.
 */
1930 int ll_fsync(struct file *file, struct dentry *dentry, int data)
1932 struct inode *inode = dentry->d_inode;
1933 struct ll_inode_info *lli = ll_i2info(inode);
1934 struct lov_stripe_md *lsm = lli->lli_smd;
1935 struct ptlrpc_request *req;
1936 struct obd_capa *oc;
1939 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
1940 inode->i_generation, inode);
1941 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
1943 /* fsync's caller has already called _fdata{sync,write}, we want
1944 * that IO to finish before calling the osc and mdc sync methods */
1945 rc = filemap_fdatawait(inode->i_mapping);
1947 /* catch async errors that were recorded back when async writeback
1948 * failed for pages in this mapping. */
1949 err = lli->lli_async_rc;
1950 lli->lli_async_rc = 0;
1954 err = lov_test_and_clear_async_rc(lsm);
/* sync metadata via the MDS, under a capability if enabled */
1959 oc = ll_mdscapa_get(inode);
1960 err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
1966 ptlrpc_req_finished(req);
1969 struct obd_info *oinfo;
1971 OBD_ALLOC_PTR(oinfo);
1973 RETURN(rc ? rc : -ENOMEM);
1974 OBDO_ALLOC(oinfo->oi_oa);
1975 if (!oinfo->oi_oa) {
1976 OBD_FREE_PTR(oinfo);
1977 RETURN(rc ? rc : -ENOMEM);
1979 oinfo->oi_oa->o_id = lsm->lsm_object_id;
1980 oinfo->oi_oa->o_seq = lsm->lsm_object_seq;
1981 oinfo->oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1982 obdo_from_inode(oinfo->oi_oa, inode, &ll_i2info(inode)->lli_fid,
1983 OBD_MD_FLTYPE | OBD_MD_FLATIME |
1984 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
1987 oinfo->oi_capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
1988 err = obd_sync_rqset(ll_i2sbi(inode)->ll_dt_exp, oinfo, 0,
1990 capa_put(oinfo->oi_capa);
1993 OBDO_FREE(oinfo->oi_oa);
1994 OBD_FREE_PTR(oinfo);
/* keep the real error code, not just a boolean, for later reporting */
1995 lli->lli_write_rc = err < 0 ? err : 0;
/*
 * flock/fcntl lock handler.  Translates the kernel's file_lock into an
 * LDLM flock enqueue against the MDS (type LDLM_FLOCK), mapping lock
 * types to LDLM modes (F_RDLCK->PR, F_WRLCK->PW, F_UNLCK->NL) and the
 * fcntl command to enqueue flags.  On success the lock is mirrored into
 * the local kernel lock tables so the VFS bookkeeping stays consistent.
 * NOTE(review): several case labels, default RETURNs and the final
 * RETURN(rc) are elided from this excerpt.
 */
2001 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2003 struct inode *inode = file->f_dentry->d_inode;
2004 struct ll_sb_info *sbi = ll_i2sbi(inode);
2005 struct ldlm_enqueue_info einfo = { .ei_type = LDLM_FLOCK,
2006 .ei_cb_cp =ldlm_flock_completion_ast,
2007 .ei_cbdata = file_lock };
2008 struct md_op_data *op_data;
2009 struct lustre_handle lockh = {0};
2010 ldlm_policy_data_t flock = {{0}};
2015 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2016 inode->i_ino, file_lock);
2018 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2020 if (file_lock->fl_flags & FL_FLOCK) {
2021 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2022 /* flocks are whole-file locks */
2023 flock.l_flock.end = OFFSET_MAX;
2024 /* For flocks owner is determined by the local file descriptor*/
2025 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2026 } else if (file_lock->fl_flags & FL_POSIX) {
2027 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2028 flock.l_flock.start = file_lock->fl_start;
2029 flock.l_flock.end = file_lock->fl_end;
2033 flock.l_flock.pid = file_lock->fl_pid;
2035 /* Somewhat ugly workaround for svc lockd.
2036 * lockd installs custom fl_lmops->fl_compare_owner that checks
2037 * for the fl_owner to be the same (which it always is on local node
2038 * I guess between lockd processes) and then compares pid.
2039 * As such we assign pid to the owner field to make it all work,
2040 * conflict with normal locks is unlikely since pid space and
2041 * pointer space for current->files are not intersecting */
2042 if (file_lock->fl_lmops && file_lock->fl_lmops->fl_compare_owner)
2043 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* map fcntl lock type -> LDLM lock mode */
2045 switch (file_lock->fl_type) {
2047 einfo.ei_mode = LCK_PR;
2050 /* An unlock request may or may not have any relation to
2051 * existing locks so we may not be able to pass a lock handle
2052 * via a normal ldlm_lock_cancel() request. The request may even
2053 * unlock a byte range in the middle of an existing lock. In
2054 * order to process an unlock request we need all of the same
2055 * information that is given with a normal read or write record
2056 * lock request. To avoid creating another ldlm unlock (cancel)
2057 * message we'll treat a LCK_NL flock request as an unlock. */
2058 einfo.ei_mode = LCK_NL;
2061 einfo.ei_mode = LCK_PW;
2064 CERROR("unknown fcntl lock type: %d\n", file_lock->fl_type);
/* map fcntl command -> enqueue flags (non-blocking vs. test-only) */
2079 flags = LDLM_FL_BLOCK_NOWAIT;
2085 flags = LDLM_FL_TEST_LOCK;
2086 /* Save the old mode so that if the mode in the lock changes we
2087 * can decrement the appropriate reader or writer refcount. */
2088 file_lock->fl_type = einfo.ei_mode;
2091 CERROR("unknown fcntl lock command: %d\n", cmd);
2095 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2096 LUSTRE_OPC_ANY, NULL);
2097 if (IS_ERR(op_data))
2098 RETURN(PTR_ERR(op_data));
2100 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2101 "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2102 flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2104 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2105 op_data, &lockh, &flock, 0, NULL /* req */, flags);
2107 ll_finish_md_op_data(op_data);
/* mirror successful (or unlock) results into the local lock tables */
2109 if ((file_lock->fl_flags & FL_FLOCK) &&
2110 (rc == 0 || file_lock->fl_type == F_UNLCK))
2111 ll_flock_lock_file_wait(file, file_lock, (cmd == F_SETLKW));
2112 #ifdef HAVE_F_OP_FLOCK
2113 if ((file_lock->fl_flags & FL_POSIX) &&
2114 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2115 !(flags & LDLM_FL_TEST_LOCK))
2116 posix_lock_file_wait(file, file_lock);
2122 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2130 * test if some locks matching bits and l_req_mode are acquired
2131 * - bits can be in different locks
2132 * - if found clear the common lock bits in *bits
2133 * - the bits not found, are kept in *bits
2135 * \param bits [IN] searched lock bits
2136 * \param l_req_mode [IN] searched lock mode
2137 * \retval boolean, true iff all bits are found
/*
 * Test whether MD (inodebits) locks covering *bits are already cached,
 * one bit at a time, using LDLM_FL_TEST_LOCK matches so no references are
 * taken.  Bits found are cleared from *bits; unfound bits remain set.
 * NOTE(review): several lines (flags/fid declarations, lock validity
 * check, final RETURN) are elided from this excerpt.
 */
2139 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
2141 struct lustre_handle lockh;
2142 ldlm_policy_data_t policy;
/* LCK_MINMODE means "any mode" */
2143 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2144 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2153 fid = &ll_i2info(inode)->lli_fid;
2154 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2155 ldlm_lockname[mode]);
2157 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2158 for (i = 0; i < MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2159 policy.l_inodebits.bits = *bits & (1 << i);
2160 if (policy.l_inodebits.bits == 0)
2163 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2164 &policy, mode, &lockh)) {
2165 struct ldlm_lock *lock;
2167 lock = ldlm_handle2lock(&lockh);
/* clear every bit the matched lock actually grants */
2170 ~(lock->l_policy_data.l_inodebits.bits);
2171 LDLM_LOCK_PUT(lock);
2173 *bits &= ~policy.l_inodebits.bits;
/*
 * Match (and take a reference on) a cached MD inodebits lock covering
 * @bits, in any of CR/CW/PR/PW.  On success the matched lock handle is
 * stored in *lockh and the granted mode is returned; unlike
 * ll_have_md_lock() this does NOT use LDLM_FL_TEST_LOCK, so the caller
 * owns a reference.
 */
2180 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2181 struct lustre_handle *lockh)
2183 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2189 fid = &ll_i2info(inode)->lli_fid;
2190 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2192 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING;
2193 rc = md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2194 LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
/*
 * Post-process a revalidation RPC result.  -ENOENT on a non-regular,
 * non-directory inode means the object was simply unlinked, which is
 * treated as success (nlink update only); other errors are logged.
 */
2198 static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
2199 if (rc == -ENOENT) { /* Already unlinked. Just update nlink
2200 * and return success */
2202 /* This path cannot be hit for regular files unless in
2203 * case of obscure races, so no need to to validate
2205 if (!S_ISREG(inode->i_mode) &&
2206 !S_ISDIR(inode->i_mode))
2211 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
/*
 * Revalidate a dentry/inode's metadata against the MDS under the lock
 * bits in @ibits.  Two strategies: with OBD_CONNECT_ATTRFID, replay an
 * intent getattr/lookup by FID (which also refreshes dentry validity and
 * unhashes unlinked entries); otherwise, only if no matching MD lock is
 * cached, issue a plain md_getattr and refresh the inode from the reply.
 * NOTE(review): excerpt is elided — early inode NULL check, several
 * GOTO/RETURN lines and the 'out' label are not visible.
 */
2219 int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2222 struct inode *inode = dentry->d_inode;
2223 struct ptlrpc_request *req = NULL;
2224 struct ll_sb_info *sbi;
2225 struct obd_export *exp;
2230 CERROR("REPORT THIS LINE TO PETER\n");
2233 sbi = ll_i2sbi(inode);
2235 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2236 inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2238 exp = ll_i2mdexp(inode);
2240 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
2241 * But under CMD case, it caused some lock issues, should be fixed
2242 * with new CMD ibits lock. See bug 12718 */
2243 if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
2244 struct lookup_intent oit = { .it_op = IT_GETATTR };
2245 struct md_op_data *op_data;
2247 if (ibits == MDS_INODELOCK_LOOKUP)
2248 oit.it_op = IT_LOOKUP;
2250 /* Call getattr by fid, so do not provide name at all. */
2251 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2252 dentry->d_inode, NULL, 0, 0,
2253 LUSTRE_OPC_ANY, NULL);
2254 if (IS_ERR(op_data))
2255 RETURN(PTR_ERR(op_data));
2257 oit.it_create_mode |= M_CHECK_STALE;
2258 rc = md_intent_lock(exp, op_data, NULL, 0,
2259 /* we are not interested in name
2262 ll_md_blocking_ast, 0);
2263 ll_finish_md_op_data(op_data);
2264 oit.it_create_mode &= ~M_CHECK_STALE;
2266 rc = ll_inode_revalidate_fini(inode, rc);
2270 rc = ll_revalidate_it_finish(req, &oit, dentry);
2272 ll_intent_release(&oit);
2276 /* Unlinked? Unhash dentry, so it is not picked up later by
2277 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2278 here to preserve get_cwd functionality on 2.6.
2280 if (!dentry->d_inode->i_nlink) {
2281 cfs_spin_lock(&ll_lookup_lock);
2282 spin_lock(&dcache_lock);
2283 ll_drop_dentry(dentry);
2284 spin_unlock(&dcache_lock);
2285 cfs_spin_unlock(&ll_lookup_lock);
2288 ll_lookup_finish_locks(&oit, dentry);
2289 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
2290 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2291 obd_valid valid = OBD_MD_FLGETATTR;
2292 struct md_op_data *op_data;
/* for regular files also fetch the striping EA */
2295 if (S_ISREG(inode->i_mode)) {
2296 rc = ll_get_max_mdsize(sbi, &ealen);
2299 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2302 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
2303 0, ealen, LUSTRE_OPC_ANY,
2305 if (op_data == NULL)
2308 op_data->op_valid = valid;
2309 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2310 * capa for this inode. Because we only keep capas of dirs
2312 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
2313 ll_finish_md_op_data(op_data);
2315 rc = ll_inode_revalidate_fini(inode, rc);
2319 rc = ll_prep_inode(&inode, req, NULL);
2322 ptlrpc_req_finished(req);
/*
 * Revalidate metadata and then the file size: after the MD revalidation,
 * either copy timestamps from the cached lvb (no OST objects allocated
 * yet) or glimpse the size from the OSTs.
 */
2326 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2329 struct inode *inode = dentry->d_inode;
2333 rc = __ll_inode_revalidate_it(dentry, it, ibits);
2335 /* if object not yet allocated, don't validate size */
2336 if (rc == 0 && ll_i2info(dentry->d_inode)->lli_smd == NULL) {
2337 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
2338 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
2339 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
2343 /* cl_glimpse_size will prefer locally cached writes if they extend
2347 rc = cl_glimpse_size(inode);
/*
 * getattr with an explicit intent: revalidate UPDATE|LOOKUP ibits, then
 * fill the kstat from the (now fresh) inode fields.  32-bit-API mounts
 * get a squashed inode number built from the FID.
 */
2352 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2353 struct lookup_intent *it, struct kstat *stat)
2355 struct inode *inode = de->d_inode;
2356 struct ll_sb_info *sbi = ll_i2sbi(inode);
2357 struct ll_inode_info *lli = ll_i2info(inode);
2360 res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE |
2361 MDS_INODELOCK_LOOKUP);
2362 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
2367 stat->dev = inode->i_sb->s_dev;
2368 if (ll_need_32bit_api(sbi))
2369 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
2371 stat->ino = inode->i_ino;
2372 stat->mode = inode->i_mode;
2373 stat->nlink = inode->i_nlink;
2374 stat->uid = inode->i_uid;
2375 stat->gid = inode->i_gid;
2376 stat->rdev = kdev_t_to_nr(inode->i_rdev);
2377 stat->atime = inode->i_atime;
2378 stat->mtime = inode->i_mtime;
2379 stat->ctime = inode->i_ctime;
2380 #ifdef HAVE_INODE_BLKSIZE
2381 stat->blksize = inode->i_blksize;
2383 stat->blksize = 1 << inode->i_blkbits;
2386 stat->size = i_size_read(inode);
2387 stat->blocks = inode->i_blocks;
/* VFS .getattr entry point: wrap ll_getattr_it() with an IT_GETATTR intent. */
2391 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2393 struct lookup_intent it = { .it_op = IT_GETATTR };
2395 return ll_getattr_it(mnt, de, &it, stat);
/*
 * VFS ->fiemap entry point (kernels with linux/fiemap.h): build a
 * ll_user_fiemap from the kernel's fiemap_extent_info, run the common
 * ll_do_fiemap() and copy the mapped extents back.
 * NOTE(review): only the first extent is copied IN from
 * fi_extents_start (continuation data); elided lines may guard this on
 * extent_count != 0 — confirm against the full source.
 */
2398 #ifdef HAVE_LINUX_FIEMAP_H
2399 int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2400 __u64 start, __u64 len)
2404 struct ll_user_fiemap *fiemap;
2405 unsigned int extent_count = fieinfo->fi_extents_max;
2407 num_bytes = sizeof(*fiemap) + (extent_count *
2408 sizeof(struct ll_fiemap_extent));
2409 OBD_ALLOC_LARGE(fiemap, num_bytes);
2414 fiemap->fm_flags = fieinfo->fi_flags;
2415 fiemap->fm_extent_count = fieinfo->fi_extents_max;
2416 fiemap->fm_start = start;
2417 fiemap->fm_length = len;
2418 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
2419 sizeof(struct ll_fiemap_extent));
2421 rc = ll_do_fiemap(inode, fiemap, num_bytes);
2423 fieinfo->fi_flags = fiemap->fm_flags;
2424 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
2425 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
2426 fiemap->fm_mapped_extents * sizeof(struct ll_fiemap_extent));
2428 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * POSIX ACL permission check against the ACL cached on the inode
 * (lli_posix_acl), duplicated under lli_lock.  Without
 * CONFIG_FS_POSIX_ACL the (elided) fallback presumably returns -EAGAIN
 * so generic_permission falls through to mode bits — confirm.
 */
2435 int lustre_check_acl(struct inode *inode, int mask)
2437 #ifdef CONFIG_FS_POSIX_ACL
2438 struct ll_inode_info *lli = ll_i2info(inode);
2439 struct posix_acl *acl;
2443 cfs_spin_lock(&lli->lli_lock);
2444 acl = posix_acl_dup(lli->lli_posix_acl);
2445 cfs_spin_unlock(&lli->lli_lock);
2450 rc = posix_acl_permission(inode, acl, mask);
2451 posix_acl_release(acl);
/*
 * Permission check for kernels >= 2.6.10: revalidate the root inode if
 * needed (it is never revalidated by lookup), delegate to the remote
 * permission check on RMT_CLIENT mounts, otherwise use
 * generic_permission() with lustre_check_acl as the ACL callback.
 */
2459 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10))
2460 #ifndef HAVE_INODE_PERMISION_2ARGS
2461 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2463 int ll_inode_permission(struct inode *inode, int mask)
2469 /* as root inode are NOT getting validated in lookup operation,
2470 * need to do it before permission check. */
2472 if (inode == inode->i_sb->s_root->d_inode) {
2473 struct lookup_intent it = { .it_op = IT_LOOKUP };
2475 rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
2476 MDS_INODELOCK_LOOKUP);
2481 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
2482 inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
2484 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2485 return lustre_check_remote_perm(inode, mask);
2487 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2488 rc = generic_permission(inode, mask, lustre_check_acl);
/*
 * Open-coded permission check for older kernels without the generic ACL
 * callback: replicates generic_permission() — owner/group/other mode
 * bits, ACL fallback, then DAC_OVERRIDE / DAC_READ_SEARCH capability
 * checks.  Remote-client mounts delegate to the server-side check.
 * NOTE(review): labels and several branch lines are elided here.
 */
2493 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2495 int mode = inode->i_mode;
2498 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2499 inode->i_ino, inode->i_generation, inode, mask);
2501 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2502 return lustre_check_remote_perm(inode, mask);
2504 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
/* writes denied on read-only or immutable inodes */
2506 if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
2507 (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
2509 if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
2511 if (cfs_curproc_fsuid() == inode->i_uid) {
2514 if (((mode >> 3) & mask & S_IRWXO) != mask)
2516 rc = lustre_check_acl(inode, mask);
2520 goto check_capabilities;
2524 if (cfs_curproc_is_in_groups(inode->i_gid))
2527 if ((mode & mask & S_IRWXO) == mask)
/* capability overrides: DAC_OVERRIDE for everything but exec of
 * non-executables; DAC_READ_SEARCH for reads and dir searches */
2531 if (!(mask & MAY_EXEC) ||
2532 (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
2533 if (cfs_capable(CFS_CAP_DAC_OVERRIDE))
2536 if (cfs_capable(CFS_CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) ||
2537 (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))))
2544 #ifdef HAVE_FILE_READV
2545 #define READ_METHOD readv
2546 #define READ_FUNCTION ll_file_readv
2547 #define WRITE_METHOD writev
2548 #define WRITE_FUNCTION ll_file_writev
2550 #define READ_METHOD aio_read
2551 #define READ_FUNCTION ll_file_aio_read
2552 #define WRITE_METHOD aio_write
2553 #define WRITE_FUNCTION ll_file_aio_write
2556 /* -o localflock - only provides locally consistent flock locks */
/* Default file_operations: no .flock/.lock, so the kernel's local
 * (node-only) flock handling applies. */
2557 struct file_operations ll_file_operations = {
2558 .read = ll_file_read,
2559 .READ_METHOD = READ_FUNCTION,
2560 .write = ll_file_write,
2561 .WRITE_METHOD = WRITE_FUNCTION,
2562 #ifdef HAVE_UNLOCKED_IOCTL
2563 .unlocked_ioctl = ll_file_ioctl,
2565 .ioctl = ll_file_ioctl,
2567 .open = ll_file_open,
2568 .release = ll_file_release,
2569 .mmap = ll_file_mmap,
2570 .llseek = ll_file_seek,
2571 #ifdef HAVE_KERNEL_SENDFILE
2572 .sendfile = ll_file_sendfile,
2574 #ifdef HAVE_KERNEL_SPLICE_READ
2575 .splice_read = ll_file_splice_read,
/* file_operations used with -o flock: cluster-coherent flock/fcntl
 * locking routed through ll_file_flock(). */
2581 struct file_operations ll_file_operations_flock = {
2582 .read = ll_file_read,
2583 .READ_METHOD = READ_FUNCTION,
2584 .write = ll_file_write,
2585 .WRITE_METHOD = WRITE_FUNCTION,
2586 #ifdef HAVE_UNLOCKED_IOCTL
2587 .unlocked_ioctl = ll_file_ioctl,
2589 .ioctl = ll_file_ioctl,
2591 .open = ll_file_open,
2592 .release = ll_file_release,
2593 .mmap = ll_file_mmap,
2594 .llseek = ll_file_seek,
2595 #ifdef HAVE_KERNEL_SENDFILE
2596 .sendfile = ll_file_sendfile,
2598 #ifdef HAVE_KERNEL_SPLICE_READ
2599 .splice_read = ll_file_splice_read,
2603 #ifdef HAVE_F_OP_FLOCK
2604 .flock = ll_file_flock,
2606 .lock = ll_file_flock
2609 /* These are for -o noflock - to return ENOSYS on flock calls */
2610 struct file_operations ll_file_operations_noflock = {
2611 .read = ll_file_read,
2612 .READ_METHOD = READ_FUNCTION,
2613 .write = ll_file_write,
2614 .WRITE_METHOD = WRITE_FUNCTION,
2615 #ifdef HAVE_UNLOCKED_IOCTL
2616 .unlocked_ioctl = ll_file_ioctl,
2618 .ioctl = ll_file_ioctl,
2620 .open = ll_file_open,
2621 .release = ll_file_release,
2622 .mmap = ll_file_mmap,
2623 .llseek = ll_file_seek,
2624 #ifdef HAVE_KERNEL_SENDFILE
2625 .sendfile = ll_file_sendfile,
2627 #ifdef HAVE_KERNEL_SPLICE_READ
2628 .splice_read = ll_file_splice_read,
2632 #ifdef HAVE_F_OP_FLOCK
/* ll_file_noflock rejects all lock requests (see its definition) */
2633 .flock = ll_file_noflock,
2635 .lock = ll_file_noflock
/*
 * inode_operations for Lustre regular files: attribute set/get,
 * truncate, permission checks and the xattr family, plus fiemap
 * extent mapping when the kernel provides linux/fiemap.h.
 * NOTE(review): lossy listing — the #endif and closing "};" were
 * dropped by the extraction.
 */
2638 struct inode_operations ll_file_inode_operations = {
2639 .setattr = ll_setattr,
2640 .truncate = ll_truncate,
2641 .getattr = ll_getattr,
2642 .permission = ll_inode_permission,
2643 .setxattr = ll_setxattr,
2644 .getxattr = ll_getxattr,
2645 .listxattr = ll_listxattr,
2646 .removexattr = ll_removexattr,
2647 #ifdef HAVE_LINUX_FIEMAP_H
2648 .fiemap = ll_fiemap,
2652 /* dynamic ioctl number support routins */
/*
 * "llioc" singleton: the registry of dynamically registered ioctl
 * handlers — a list of llioc_data entries guarded by a reader/writer
 * semaphore.  NOTE(review): lossy listing — the lines closing this
 * struct and naming the static instance ("} llioc = {" …) and the
 * "struct llioc_data {" header were dropped by the extraction.
 */
2653 static struct llioc_ctl_data {
2654 cfs_rw_semaphore_t ioc_sem;
2655 cfs_list_t ioc_head;
2657 __RWSEM_INITIALIZER(llioc.ioc_sem),
2658 CFS_LIST_HEAD_INIT(llioc.ioc_head)
/* One registered handler: linkage, its callback, and the command ids
 * it serves in a trailing variable-length array. */
2663 cfs_list_t iocd_list;
2664 unsigned int iocd_size; /* total allocation size of this entry */
2665 llioc_callback_t iocd_cb; /* callback invoked for matching commands */
2666 unsigned int iocd_count; /* number of entries in iocd_cmd[] */
2667 unsigned int iocd_cmd[0]; /* trailing array of ioctl command numbers */
/*
 * ll_iocontrol_register() - register a dynamic ioctl handler.
 * @cb:    callback to invoke for each matching ioctl command
 * @count: number of command ids in @cmd (rejected if negative or
 *         greater than LLIOC_MAX_CMD)
 * @cmd:   array of ioctl command numbers the handler serves
 *
 * Allocates an llioc_data holding a copy of @cmd and appends it to the
 * global llioc list under the write semaphore.  Presumably returns the
 * allocation as an opaque cookie for ll_iocontrol_unregister(), or
 * NULL on bad arguments / allocation failure — the return lines are
 * missing from this lossy listing; TODO confirm against the full file.
 */
2670 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
2673 struct llioc_data *in_data = NULL;
2676 if (cb == NULL || cmd == NULL ||
2677 count > LLIOC_MAX_CMD || count < 0)
/* header plus the trailing iocd_cmd[] array */
2680 size = sizeof(*in_data) + count * sizeof(unsigned int);
2681 OBD_ALLOC(in_data, size);
2682 if (in_data == NULL)
/* NOTE(review): explicit memset of the header; redundant if OBD_ALLOC
 * already zeroes — verify against the OBD_ALLOC definition */
2685 memset(in_data, 0, sizeof(*in_data));
2686 in_data->iocd_size = size;
2687 in_data->iocd_cb = cb;
2688 in_data->iocd_count = count;
2689 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
/* publish the new entry under the write lock */
2691 cfs_down_write(&llioc.ioc_sem);
2692 cfs_list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
2693 cfs_up_write(&llioc.ioc_sem);
/*
 * ll_iocontrol_unregister() - remove a handler previously registered
 * with ll_iocontrol_register().  @magic is the cookie returned at
 * registration time; the matching list entry is unlinked under the
 * write semaphore, the lock dropped, and the entry freed.
 * NOTE(review): lossy listing — the lines comparing each entry against
 * @magic (and the early return after OBD_FREE) were dropped by the
 * extraction; the loop body shown here is the match case.
 */
2698 void ll_iocontrol_unregister(void *magic)
2700 struct llioc_data *tmp;
2705 cfs_down_write(&llioc.ioc_sem);
2706 cfs_list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* capture the size before the entry is deleted and freed */
2708 unsigned int size = tmp->iocd_size;
2710 cfs_list_del(&tmp->iocd_list);
2711 cfs_up_write(&llioc.ioc_sem);
2713 OBD_FREE(tmp, size);
2717 cfs_up_write(&llioc.ioc_sem);
/* reached only when no entry matched @magic */
2719 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
2722 EXPORT_SYMBOL(ll_iocontrol_register);
2723 EXPORT_SYMBOL(ll_iocontrol_unregister);
2725 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
2726 unsigned int cmd, unsigned long arg, int *rcp)
2728 enum llioc_iter ret = LLIOC_CONT;
2729 struct llioc_data *data;
2730 int rc = -EINVAL, i;
2732 cfs_down_read(&llioc.ioc_sem);
2733 cfs_list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
2734 for (i = 0; i < data->iocd_count; i++) {
2735 if (cmd != data->iocd_cmd[i])
2738 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
2742 if (ret == LLIOC_STOP)
2745 cfs_up_read(&llioc.ioc_sem);