1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
6 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 only,
10 * as published by the Free Software Foundation.
12 * This program is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License version 2 for more details (a copy is included
16 * in the LICENSE file that accompanied this code).
18 * You should have received a copy of the GNU General Public License
19 * version 2 along with this program; If not, see
20 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
22 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23 * CA 95054 USA or visit www.sun.com if you need additional information or
29 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
30 * Use is subject to license terms.
33 * Copyright (c) 2011 Whamcloud, Inc.
36 * This file is part of Lustre, http://www.lustre.org/
37 * Lustre is a trademark of Sun Microsystems, Inc.
41 * Author: Peter Braam <braam@clusterfs.com>
42 * Author: Phil Schwan <phil@clusterfs.com>
43 * Author: Andreas Dilger <adilger@clusterfs.com>
46 #define DEBUG_SUBSYSTEM S_LLITE
47 #include <lustre_dlm.h>
48 #include <lustre_lite.h>
49 #include <lustre_mdc.h>
50 #include <linux/pagemap.h>
51 #include <linux/file.h>
52 #include "llite_internal.h"
53 #include <lustre/ll_fiemap.h>
55 #include "cl_object.h"
/*
 * Allocate a struct ll_file_data (per-open-file client state) from the
 * dedicated slab cache, using an IO-safe allocation flag.
 * NOTE(review): this extract is missing lines (braces / return path); the
 * leading numeric tokens are residue of the original file's line numbers.
 */
57 struct ll_file_data *ll_file_data_get(void)
59 struct ll_file_data *fd;
61 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, CFS_ALLOC_IO);
/* Release a struct ll_file_data back to its slab cache. */
65 static void ll_file_data_put(struct ll_file_data *fd)
68 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Pack the inode's current attributes into @op_data for an RPC to the MDS:
 * fid, mode, a/m/ctime, size, block count, the inode flags converted to
 * ext-style on-wire flags, and the current IO epoch.  Also records the open
 * file handle @fh and takes an MDS capability reference for the inode.
 */
71 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
72 struct lustre_handle *fh)
74 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
75 op_data->op_attr.ia_mode = inode->i_mode;
76 op_data->op_attr.ia_atime = inode->i_atime;
77 op_data->op_attr.ia_mtime = inode->i_mtime;
78 op_data->op_attr.ia_ctime = inode->i_ctime;
79 op_data->op_attr.ia_size = i_size_read(inode);
80 op_data->op_attr_blocks = inode->i_blocks;
/* ia_attr_flags lives in the ll_iattr wrapper around the generic iattr */
81 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
82 ll_inode_to_ext_flags(inode->i_flags);
83 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
85 op_data->op_handle = *fh;
86 op_data->op_capa1 = ll_mdscapa_get(inode);
90 * Closes the IO epoch and packs all the attributes into @op_data for
/*
 * Prepare @op_data for a close RPC on handle @och: mark mode/timestamps as
 * valid, and additionally size/blocks unless the handle was opened for write
 * on a regular file with Size-on-MDS (SOM) enabled (then the MDS will obtain
 * the size separately).  Closes the IO epoch and packs the inode attributes
 * and the open handle into @op_data.
 */
93 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
94 struct obd_client_handle *och)
98 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
99 ATTR_MTIME_SET | ATTR_CTIME_SET;
101 if (!(och->och_flags & FMODE_WRITE))
/* no SOM support on the MDS, or not a regular file: pack size here */
104 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
105 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
107 ll_ioepoch_close(inode, op_data, &och, 0);
110 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
111 ll_prep_md_op_data(op_data, inode, NULL, NULL,
112 0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Send a close RPC to the MDS for open handle @och on @inode.
 * On an epoch-closing close, the MDS may instruct the client to perform a
 * Size-on-MDS update (attributes gathered from the OSTs and sent back via
 * setattr).  Destroys OST objects referenced by the close reply, optionally
 * queues DONE_WRITING, clears open replay data and poisons the handle cookie.
 * NOTE(review): several lines (ENTRY/RETURN, error branches, braces) are
 * missing from this extract.
 */
116 static int ll_close_inode_openhandle(struct obd_export *md_exp,
118 struct obd_client_handle *och)
120 struct obd_export *exp = ll_i2mdexp(inode);
121 struct md_op_data *op_data;
122 struct ptlrpc_request *req = NULL;
123 struct obd_device *obd = class_exp2obd(exp);
130 * XXX: in case of LMV, is this correct to access
133 CERROR("Invalid MDC connection handle "LPX64"\n",
134 ll_i2mdexp(inode)->exp_handle.h_cookie);
138 OBD_ALLOC_PTR(op_data);
140 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
142 ll_prepare_close(inode, op_data, och);
143 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
144 rc = md_close(md_exp, op_data, och->och_mod, &req);
146 /* This close must have the epoch closed. */
147 LASSERT(epoch_close);
148 /* MDS has instructed us to obtain Size-on-MDS attribute from
149 * OSTs and send setattr to back to MDS. */
150 rc = ll_som_update(inode, op_data);
152 CERROR("inode %lu mdc Size-on-MDS update failed: "
153 "rc = %d\n", inode->i_ino, rc);
157 CERROR("inode %lu mdc close failed: rc = %d\n",
160 ll_finish_md_op_data(op_data);
/* destroy the OST objects listed in the close reply, if any */
163 rc = ll_objects_destroy(req, inode);
165 CERROR("inode %lu ll_objects destroy: rc = %d\n",
/* SOM write close without epoch closure: defer via DONE_WRITING */
172 if (exp_connect_som(exp) && !epoch_close &&
173 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
174 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
176 md_clear_open_replay_data(md_exp, och);
177 /* Free @och if it is not waiting for DONE_WRITING. */
178 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
181 if (req) /* This is close request */
182 ptlrpc_req_finished(req);
/*
 * Actually close the MDS open handle matching @flags (write/exec/read).
 * Picks the per-mode handle pointer and use count under lli_och_sem; if
 * other users of the handle remain, returns without closing.  Otherwise
 * takes ownership of the och (racing closers may have freed it already —
 * hence the NULL check) and sends the close to the MDS.
 */
186 int ll_md_real_close(struct inode *inode, int flags)
188 struct ll_inode_info *lli = ll_i2info(inode);
189 struct obd_client_handle **och_p;
190 struct obd_client_handle *och;
195 if (flags & FMODE_WRITE) {
196 och_p = &lli->lli_mds_write_och;
197 och_usecount = &lli->lli_open_fd_write_count;
198 } else if (flags & FMODE_EXEC) {
199 och_p = &lli->lli_mds_exec_och;
200 och_usecount = &lli->lli_open_fd_exec_count;
202 LASSERT(flags & FMODE_READ);
203 och_p = &lli->lli_mds_read_och;
204 och_usecount = &lli->lli_open_fd_read_count;
207 cfs_down(&lli->lli_och_sem);
208 if (*och_usecount) { /* There are still users of this handle, so
210 cfs_up(&lli->lli_och_sem);
215 cfs_up(&lli->lli_och_sem);
217 if (och) { /* There might be a race and somebody have freed this och
219 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-file-descriptor close: drop a group lock if this fd held one, decrement
 * the per-open-mode use count under lli_och_sem, and — unless a matching
 * cached OPEN DLM lock lets us skip the RPC — perform the real MDS close.
 * Finally detaches and frees the ll_file_data and closes the OSS capability.
 */
226 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
229 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
230 struct ll_inode_info *lli = ll_i2info(inode);
234 /* clear group lock, if present */
235 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
236 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
238 /* Let's see if we have good enough OPEN lock on the file and if
239 we can skip talking to MDS */
240 if (file->f_dentry->d_inode) { /* Can this ever be false? */
242 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
243 struct lustre_handle lockh;
244 struct inode *inode = file->f_dentry->d_inode;
245 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
247 cfs_down(&lli->lli_och_sem);
248 if (fd->fd_omode & FMODE_WRITE) {
250 LASSERT(lli->lli_open_fd_write_count);
251 lli->lli_open_fd_write_count--;
252 } else if (fd->fd_omode & FMODE_EXEC) {
254 LASSERT(lli->lli_open_fd_exec_count);
255 lli->lli_open_fd_exec_count--;
258 LASSERT(lli->lli_open_fd_read_count);
259 lli->lli_open_fd_read_count--;
261 cfs_up(&lli->lli_och_sem);
/* no cached OPEN ibits lock -> must send the close RPC now */
263 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
264 LDLM_IBITS, &policy, lockmode,
266 rc = ll_md_real_close(file->f_dentry->d_inode,
270 CERROR("Releasing a file %p with negative dentry %p. Name %s",
271 file, file->f_dentry, file->f_dentry->d_name.name);
274 LUSTRE_FPRIVATE(file) = NULL;
275 ll_file_data_put(fd);
276 ll_capa_close(inode);
281 int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm);
283 /* While this returns an error code, fput() the caller does not, so we need
284 * to make every effort to clean up all of our state here. Also, applications
285 * rarely check close errors and even if an error is returned they will not
286 * re-try the close call.
/*
 * VFS ->release() for Lustre files.  Cleans up remote-client ACL state for
 * the root inode, stops the statahead thread if this fd was the registered
 * "opendir key", special-cases the root dentry (no MDS close needed), clears
 * any recorded async write error on the stripe MD, and performs the MDS
 * close via ll_md_close().
 */
288 int ll_file_release(struct inode *inode, struct file *file)
290 struct ll_file_data *fd;
291 struct ll_sb_info *sbi = ll_i2sbi(inode);
292 struct ll_inode_info *lli = ll_i2info(inode);
293 struct lov_stripe_md *lsm = lli->lli_smd;
297 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
298 inode->i_generation, inode);
300 #ifdef CONFIG_FS_POSIX_ACL
/* remote-client ACL bookkeeping only applies to the filesystem root */
301 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
302 inode == inode->i_sb->s_root->d_inode) {
303 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
306 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
307 fd->fd_flags &= ~LL_FILE_RMTACL;
308 rct_del(&sbi->ll_rct, cfs_curproc_pid());
309 et_search_free(&sbi->ll_et, cfs_curproc_pid());
314 if (inode->i_sb->s_root != file->f_dentry)
315 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
316 fd = LUSTRE_FPRIVATE(file);
319 /* The last ref on @file, maybe not the the owner pid of statahead.
320 * Different processes can open the same dir, "ll_opendir_key" means:
321 * it is me that should stop the statahead thread. */
322 if (lli->lli_opendir_key == fd && lli->lli_opendir_pid != 0)
323 ll_stop_statahead(inode, lli->lli_opendir_key);
325 if (inode->i_sb->s_root == file->f_dentry) {
326 LUSTRE_FPRIVATE(file) = NULL;
327 ll_file_data_put(fd);
/* surface any deferred async write error, then reset it */
332 lov_test_and_clear_async_rc(lsm);
333 lli->lli_async_rc = 0;
335 rc = ll_md_close(sbi->ll_md_exp, inode, file);
337 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
338 libcfs_debug_dumplog();
/*
 * Perform an intent-based open RPC against the MDS for @file.
 * When @lmm/@lmmsize are unset (a genuine open, not a setstripe), requests
 * an OPEN DLM lock so later closes can skip the MDS, and uses OPC_CREATE
 * for write opens.  On success, instantiates the inode from the reply and
 * attaches the returned lock; on failure, releases any server open handle.
 * NOTE(review): error-handling branches are partially missing from this
 * extract — the GOTO targets are not all visible.
 */
343 static int ll_intent_file_open(struct file *file, void *lmm,
344 int lmmsize, struct lookup_intent *itp)
346 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
347 struct dentry *parent = file->f_dentry->d_parent;
348 const char *name = file->f_dentry->d_name.name;
349 const int len = file->f_dentry->d_name.len;
350 struct md_op_data *op_data;
351 struct ptlrpc_request *req;
352 __u32 opc = LUSTRE_OPC_ANY;
359 /* Usually we come here only for NFSD, and we want open lock.
360 But we can also get here with pre 2.6.15 patchless kernels, and in
361 that case that lock is also ok */
362 /* We can also get here if there was cached open handle in revalidate_it
363 * but it disappeared while we were getting from there to ll_file_open.
364 * But this means this file was closed and immediatelly opened which
365 * makes a good candidate for using OPEN lock */
366 /* If lmmsize & lmm are not 0, we are just setting stripe info
367 * parameters. No need for the open lock */
368 if (lmm == NULL && lmmsize == 0) {
369 itp->it_flags |= MDS_OPEN_LOCK;
370 if (itp->it_flags & FMODE_WRITE)
371 opc = LUSTRE_OPC_CREATE;
374 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
375 file->f_dentry->d_inode, name, len,
378 RETURN(PTR_ERR(op_data));
380 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
381 0 /*unused */, &req, ll_md_blocking_ast, 0);
382 ll_finish_md_op_data(op_data);
384 /* reason for keep own exit path - don`t flood log
385 * with messages with -ESTALE errors.
387 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
388 it_open_error(DISP_OPEN_OPEN, itp))
/* the server granted an open handle we will not use — release it */
390 ll_release_openhandle(file->f_dentry, itp);
394 if (it_disposition(itp, DISP_LOOKUP_NEG))
395 GOTO(out, rc = -ENOENT);
397 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
398 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
399 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
403 rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL);
404 if (!rc && itp->d.lustre.it_lock_mode)
405 md_set_lock_data(sbi->ll_md_exp,
406 &itp->d.lustre.it_lock_handle,
407 file->f_dentry->d_inode, NULL);
410 ptlrpc_req_finished(itp->d.lustre.it_data);
411 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
412 ll_intent_drop_lock(itp);
418 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
419 * not believe attributes if a few ioepoch holders exist. Attributes for
420 * previous ioepoch if new one is opened are also skipped by MDS.
/*
 * Record a newly obtained IO epoch on the inode.  A zero @ioepoch or an
 * unchanged value is ignored.
 */
422 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
424 if (ioepoch && lli->lli_ioepoch != ioepoch) {
425 lli->lli_ioepoch = ioepoch;
426 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
427 ioepoch, PFID(&lli->lli_fid));
/*
 * Fill an obd_client_handle from the MDS open reply carried by intent @it:
 * copy the server file handle, stamp the magic, record fid and open flags,
 * open the reply's IO epoch on the inode, and register the request for
 * open replay.  Returns the result of md_set_open_replay_data().
 */
431 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
432 struct lookup_intent *it, struct obd_client_handle *och)
434 struct ptlrpc_request *req = it->d.lustre.it_data;
435 struct mdt_body *body;
439 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
440 LASSERT(body != NULL); /* reply already checked out */
442 memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
443 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
444 och->och_fid = lli->lli_fid;
445 och->och_flags = it->it_flags;
446 ll_ioepoch_open(lli, body->ioepoch);
448 return md_set_open_replay_data(md_exp, och, req);
/*
 * Complete the client-side part of an open: optionally fill @och from the
 * intent reply, attach @fd as the file's private data, initialize readahead
 * state and remember the open mode.  Asserts the file has no private data
 * yet (ll_file_open cleared it).
 */
451 int ll_local_open(struct file *file, struct lookup_intent *it,
452 struct ll_file_data *fd, struct obd_client_handle *och)
454 struct inode *inode = file->f_dentry->d_inode;
455 struct ll_inode_info *lli = ll_i2info(inode);
458 LASSERT(!LUSTRE_FPRIVATE(file));
463 struct ptlrpc_request *req = it->d.lustre.it_data;
464 struct mdt_body *body;
467 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
471 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
472 if ((it->it_flags & FMODE_WRITE) &&
473 (body->valid & OBD_MD_FLSIZE))
474 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
475 lli->lli_ioepoch, PFID(&lli->lli_fid));
478 LUSTRE_FPRIVATE(file) = fd;
479 ll_readahead_init(inode, &fd->fd_ras);
480 fd->fd_omode = it->it_flags;
484 /* Open a file, and (for the very first open) create objects on the OSTs at
485 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
486 * creation or open until ll_lov_setstripe() ioctl is called. We grab
487 * lli_open_sem to ensure no other process will create objects, send the
488 * stripe MD to the MDS, or try to destroy the objects if that fails.
490 * If we already have the stripe MD locally then we don't request it in
491 * md_open(), by passing a lmm_size = 0.
493 * It is up to the application to ensure no other processes open this file
494 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
495 * used. We might be able to avoid races of that sort by getting lli_open_sem
496 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
497 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/*
 * VFS ->open() for Lustre files.
 * Flow: allocate per-fd state; register as statahead "opendir key" for
 * directories; special-case the root dentry; if no intent open was done by
 * lookup (NFSD / patchless kernels), build one from f_flags; then, under
 * lli_och_sem, either reuse an existing per-mode MDS open handle (releasing
 * any redundant server handle) or perform the intent open RPC and record a
 * new handle.  Object creation on the OSTs is skipped for O_LOV_DELAY_CREATE
 * and read-only opens.
 * NOTE(review): many lines (GOTO labels, braces, some error branches) are
 * missing from this extract.
 */
499 int ll_file_open(struct inode *inode, struct file *file)
501 struct ll_inode_info *lli = ll_i2info(inode);
502 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
503 .it_flags = file->f_flags };
504 struct lov_stripe_md *lsm;
505 struct obd_client_handle **och_p = NULL;
506 __u64 *och_usecount = NULL;
507 struct ll_file_data *fd;
508 int rc = 0, opendir_set = 0;
511 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
512 inode->i_generation, inode, file->f_flags);
514 it = file->private_data; /* XXX: compat macro */
515 file->private_data = NULL; /* prevent ll_local_open assertion */
517 fd = ll_file_data_get();
519 GOTO(out_och_free, rc = -ENOMEM);
/* first opener of a directory becomes the statahead key owner */
522 if (S_ISDIR(inode->i_mode)) {
523 cfs_spin_lock(&lli->lli_sa_lock);
524 if (lli->lli_opendir_key == NULL && lli->lli_opendir_pid == 0) {
525 LASSERT(lli->lli_sai == NULL);
526 lli->lli_opendir_key = fd;
527 lli->lli_opendir_pid = cfs_curproc_pid();
530 cfs_spin_unlock(&lli->lli_sa_lock);
/* root dentry: no MDS open handle needed, just attach fd */
533 if (inode->i_sb->s_root == file->f_dentry) {
534 LUSTRE_FPRIVATE(file) = fd;
538 if (!it || !it->d.lustre.it_disposition) {
539 /* Convert f_flags into access mode. We cannot use file->f_mode,
540 * because everything but O_ACCMODE mask was stripped from
542 if ((oit.it_flags + 1) & O_ACCMODE)
544 if (file->f_flags & O_TRUNC)
545 oit.it_flags |= FMODE_WRITE;
547 /* kernel only call f_op->open in dentry_open. filp_open calls
548 * dentry_open after call to open_namei that checks permissions.
549 * Only nfsd_open call dentry_open directly without checking
550 * permissions and because of that this code below is safe. */
551 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
552 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
554 /* We do not want O_EXCL here, presumably we opened the file
555 * already? XXX - NFS implications? */
556 oit.it_flags &= ~O_EXCL;
558 /* bug20584, if "it_flags" contains O_CREAT, the file will be
559 * created if necessary, then "IT_CREAT" should be set to keep
560 * consistent with it */
561 if (oit.it_flags & O_CREAT)
562 oit.it_op |= IT_CREAT;
568 /* Let's see if we have file open on MDS already. */
569 if (it->it_flags & FMODE_WRITE) {
570 och_p = &lli->lli_mds_write_och;
571 och_usecount = &lli->lli_open_fd_write_count;
572 } else if (it->it_flags & FMODE_EXEC) {
573 och_p = &lli->lli_mds_exec_och;
574 och_usecount = &lli->lli_open_fd_exec_count;
576 och_p = &lli->lli_mds_read_och;
577 och_usecount = &lli->lli_open_fd_read_count;
580 cfs_down(&lli->lli_och_sem);
581 if (*och_p) { /* Open handle is present */
582 if (it_disposition(it, DISP_OPEN_OPEN)) {
583 /* Well, there's extra open request that we do not need,
584 let's close it somehow. This will decref request. */
585 rc = it_open_error(DISP_OPEN_OPEN, it);
587 cfs_up(&lli->lli_och_sem);
588 GOTO(out_openerr, rc);
591 ll_release_openhandle(file->f_dentry, it);
592 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats,
597 rc = ll_local_open(file, it, fd, NULL);
600 cfs_up(&lli->lli_och_sem);
601 GOTO(out_openerr, rc);
604 LASSERT(*och_usecount == 0);
605 if (!it->d.lustre.it_disposition) {
606 /* We cannot just request lock handle now, new ELC code
607 means that one of other OPEN locks for this file
608 could be cancelled, and since blocking ast handler
609 would attempt to grab och_sem as well, that would
610 result in a deadlock */
611 cfs_up(&lli->lli_och_sem);
612 it->it_create_mode |= M_CHECK_STALE;
613 rc = ll_intent_file_open(file, NULL, 0, it);
614 it->it_create_mode &= ~M_CHECK_STALE;
616 GOTO(out_openerr, rc);
620 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
622 GOTO(out_och_free, rc = -ENOMEM);
626 /* md_intent_lock() didn't get a request ref if there was an
627 * open error, so don't do cleanup on the request here
629 /* XXX (green): Should not we bail out on any error here, not
630 * just open error? */
631 rc = it_open_error(DISP_OPEN_OPEN, it);
633 GOTO(out_och_free, rc);
635 LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
637 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
638 rc = ll_local_open(file, it, fd, *och_p);
640 GOTO(out_och_free, rc);
642 cfs_up(&lli->lli_och_sem);
645 /* Must do this outside lli_och_sem lock to prevent deadlock where
646 different kind of OPEN lock for this same inode gets cancelled
647 by ldlm_cancel_lru */
648 if (!S_ISREG(inode->i_mode))
649 GOTO(out_och_free, rc);
/* O_LOV_DELAY_CREATE or read-only open: defer OST object creation */
655 if (file->f_flags & O_LOV_DELAY_CREATE ||
656 !(file->f_mode & FMODE_WRITE)) {
657 CDEBUG(D_INODE, "object creation was delayed\n");
658 GOTO(out_och_free, rc);
661 file->f_flags &= ~O_LOV_DELAY_CREATE;
662 GOTO(out_och_free, rc);
665 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
666 ptlrpc_req_finished(it->d.lustre.it_data);
667 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
671 if (och_p && *och_p) {
672 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
673 *och_p = NULL; /* OBD_FREE writes some magic there */
676 cfs_up(&lli->lli_och_sem);
679 if (opendir_set != 0)
680 ll_stop_statahead(inode, lli->lli_opendir_key);
682 ll_file_data_put(fd);
688 /* Fills the obdo with the attributes for the lsm */
/*
 * Fetch the striped object's attributes from the OSTs into @obdo.
 * Builds an obd_info for @lsm (id/seq, S_IFREG, @ioepoch) requesting
 * size/blocks/times/group/epoch; when @sync is set, also requests a
 * server-side lock (OBD_FL_SRVLOCK).  Issues obd_getattr_async() through a
 * temporary ptlrpc set and waits for completion, then masks o_valid down to
 * the OST-authoritative fields.
 */
689 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
690 struct obd_capa *capa, struct obdo *obdo,
691 __u64 ioepoch, int sync)
693 struct ptlrpc_request_set *set;
694 struct obd_info oinfo = { { { 0 } } };
699 LASSERT(lsm != NULL);
703 oinfo.oi_oa->o_id = lsm->lsm_object_id;
704 oinfo.oi_oa->o_seq = lsm->lsm_object_seq;
705 oinfo.oi_oa->o_mode = S_IFREG;
706 oinfo.oi_oa->o_ioepoch = ioepoch;
707 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
708 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
709 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
710 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
711 OBD_MD_FLGROUP | OBD_MD_FLEPOCH;
712 oinfo.oi_capa = capa;
714 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
715 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
718 set = ptlrpc_prep_set();
720 CERROR("can't allocate ptlrpc set\n");
723 rc = obd_getattr_async(exp, &oinfo, set);
725 rc = ptlrpc_set_wait(set);
726 ptlrpc_set_destroy(set);
/* keep only fields the OSTs are authoritative for */
729 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
730 OBD_MD_FLATIME | OBD_MD_FLMTIME |
731 OBD_MD_FLCTIME | OBD_MD_FLSIZE);
736 * Performs the getattr on the inode and updates its fields.
737 * If @sync != 0, perform the getattr under the server-side lock.
/*
 * Getattr on the inode's stripe objects and refresh the VFS inode fields
 * from the returned obdo.  Takes an MDS capability for the call; when
 * @sync != 0 the getattr is performed under a server-side lock.
 */
739 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
740 __u64 ioepoch, int sync)
742 struct ll_inode_info *lli = ll_i2info(inode);
743 struct obd_capa *capa = ll_mdscapa_get(inode);
747 rc = ll_lsm_getattr(lli->lli_smd, ll_i2dtexp(inode),
748 capa, obdo, ioepoch, sync);
751 obdo_refresh_inode(inode, obdo, obdo->o_valid);
753 "objid "LPX64" size %Lu, blocks %llu, blksize %lu\n",
754 lli->lli_smd->lsm_object_id, i_size_read(inode),
755 (unsigned long long)inode->i_blocks,
756 (unsigned long)ll_inode_blksize(inode));
/*
 * Merge size/blocks/timestamps from the OSTs' lock value blocks with the
 * timestamps most recently obtained from the MDS, and apply the result to
 * the inode under the ll inode size lock.
 */
761 int ll_merge_lvb(struct inode *inode)
763 struct ll_inode_info *lli = ll_i2info(inode);
764 struct ll_sb_info *sbi = ll_i2sbi(inode);
770 ll_inode_size_lock(inode, 1);
771 inode_init_lvb(inode, &lvb);
773 /* merge timestamps the most resently obtained from mds with
774 timestamps obtained from osts */
775 lvb.lvb_atime = lli->lli_lvb.lvb_atime;
776 lvb.lvb_mtime = lli->lli_lvb.lvb_mtime;
777 lvb.lvb_ctime = lli->lli_lvb.lvb_ctime;
778 rc = obd_merge_lvb(sbi->ll_dt_exp, lli->lli_smd, &lvb, 0);
779 cl_isize_write_nolock(inode, lvb.lvb_size);
781 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
782 PFID(&lli->lli_fid), lvb.lvb_size);
783 inode->i_blocks = lvb.lvb_blocks;
785 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
786 LTIME_S(inode->i_atime) = lvb.lvb_atime;
787 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
788 ll_inode_size_unlock(inode, 1);
/*
 * Ioctl helper: glimpse the OST attributes for @lsm (no capability, epoch 0,
 * no server lock) and copy size/blocks/times into the caller's stat buffer.
 */
793 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
796 struct obdo obdo = { 0 };
799 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
801 st->st_size = obdo.o_size;
802 st->st_blocks = obdo.o_blocks;
803 st->st_mtime = obdo.o_mtime;
804 st->st_atime = obdo.o_atime;
805 st->st_ctime = obdo.o_ctime;
/*
 * Initialize a cl_io from the file's open flags: nonblocking and append
 * modes, the backing cl_object, and the lock requirement — never lock for
 * no-lock files (also sets no_srvlock), mandatory locking for O_APPEND,
 * otherwise CILR_MAYBE.
 */
810 void ll_io_init(struct cl_io *io, const struct file *file, int write)
812 struct inode *inode = file->f_dentry->d_inode;
814 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
816 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
817 io->ci_obj = ll_i2info(inode)->lli_clob;
818 io->ci_lockreq = CILR_MAYBE;
819 if (ll_file_nolock(file)) {
820 io->ci_lockreq = CILR_NEVER;
821 io->ci_no_srvlock = 1;
822 } else if (file->f_flags & O_APPEND) {
823 io->ci_lockreq = CILR_MANDATORY;
/*
 * Common driver for all read/write entry points (normal iovec, sendfile,
 * splice).  Initializes a cl_io for @iot at @ppos/@count, copies the
 * subtype-specific arguments from @args into the vvp/ccc IO contexts,
 * serializes non-grouplock writes on lli_write_sem (reads take
 * lli_trunc_sem shared), runs cl_io_loop(), advances *ppos and tallies
 * per-sb read/write statistics; a write result is also recorded in
 * lli_write_rc.
 * NOTE(review): switch-case labels, cleanup and RETURN lines are missing
 * from this extract.
 */
827 static ssize_t ll_file_io_generic(const struct lu_env *env,
828 struct vvp_io_args *args, struct file *file,
829 enum cl_io_type iot, loff_t *ppos, size_t count)
831 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
836 io = ccc_env_thread_io(env);
837 ll_io_init(io, file, iot == CIT_WRITE);
839 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
840 struct vvp_io *vio = vvp_env_io(env);
841 struct ccc_io *cio = ccc_env_io(env);
842 int write_sem_locked = 0;
844 cio->cui_fd = LUSTRE_FPRIVATE(file);
845 vio->cui_io_subtype = args->via_io_subtype;
847 switch (vio->cui_io_subtype) {
849 cio->cui_iov = args->u.normal.via_iov;
850 cio->cui_nrsegs = args->u.normal.via_nrsegs;
851 cio->cui_tot_nrsegs = cio->cui_nrsegs;
852 #ifndef HAVE_FILE_WRITEV
853 cio->cui_iocb = args->u.normal.via_iocb;
/* group-locked writes skip the per-inode write semaphore */
855 if ((iot == CIT_WRITE) &&
856 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
857 if(cfs_down_interruptible(&lli->lli_write_sem))
858 GOTO(out, result = -ERESTARTSYS);
859 write_sem_locked = 1;
860 } else if (iot == CIT_READ) {
861 cfs_down_read(&lli->lli_trunc_sem);
865 vio->u.sendfile.cui_actor = args->u.sendfile.via_actor;
866 vio->u.sendfile.cui_target = args->u.sendfile.via_target;
869 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
870 vio->u.splice.cui_flags = args->u.splice.via_flags;
873 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
876 result = cl_io_loop(env, io);
877 if (write_sem_locked)
878 cfs_up(&lli->lli_write_sem);
879 else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
880 cfs_up_read(&lli->lli_trunc_sem);
882 /* cl_io_rw_init() handled IO */
883 result = io->ci_result;
886 if (io->ci_nob > 0) {
888 *ppos = io->u.ci_wr.wr.crw_pos;
894 if (iot == CIT_READ) {
896 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
897 LPROC_LL_READ_BYTES, result);
898 } else if (iot == CIT_WRITE) {
900 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
901 LPROC_LL_WRITE_BYTES, result);
902 lli->lli_write_rc = 0;
904 lli->lli_write_rc = result;
913 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array and compute the total byte count (a copy of the
 * kernel's __generic_file_aio_write_nolock segment checks): rejects negative
 * or wrapping lengths with -EINVAL and truncates *nr_segs at the first
 * inaccessible segment.
 * NOTE(review): the accumulator declaration and return statements are
 * missing from this extract.
 */
915 static int ll_file_get_iov_count(const struct iovec *iov,
916 unsigned long *nr_segs, size_t *count)
921 for (seg = 0; seg < *nr_segs; seg++) {
922 const struct iovec *iv = &iov[seg];
925 * If any segment has a negative length, or the cumulative
926 * length ever wraps negative then return -EINVAL.
929 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
931 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
936 cnt -= iv->iov_len; /* This segment is no good */
943 #ifdef HAVE_FILE_READV
/*
 * Vectored read entry point (HAVE_FILE_READV kernels): validate the iovec,
 * grab a cl environment, fill IO_NORMAL args and run the generic IO path
 * as CIT_READ.
 */
944 static ssize_t ll_file_readv(struct file *file, const struct iovec *iov,
945 unsigned long nr_segs, loff_t *ppos)
948 struct vvp_io_args *args;
954 result = ll_file_get_iov_count(iov, &nr_segs, &count);
958 env = cl_env_get(&refcheck);
960 RETURN(PTR_ERR(env));
962 args = vvp_env_args(env, IO_NORMAL);
963 args->u.normal.via_iov = (struct iovec *)iov;
964 args->u.normal.via_nrsegs = nr_segs;
966 result = ll_file_io_generic(env, args, file, CIT_READ, ppos, count);
967 cl_env_put(env, &refcheck);
/*
 * Single-buffer read (HAVE_FILE_READV variant): wrap @buf/@count in the
 * per-env local iovec and delegate to ll_file_readv().
 */
971 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
975 struct iovec *local_iov;
980 env = cl_env_get(&refcheck);
982 RETURN(PTR_ERR(env));
984 local_iov = &vvp_env_info(env)->vti_local_iov;
985 local_iov->iov_base = (void __user *)buf;
986 local_iov->iov_len = count;
987 result = ll_file_readv(file, local_iov, 1, ppos);
988 cl_env_put(env, &refcheck);
/*
 * AIO read entry point: validate the iovec, fill IO_NORMAL args (including
 * the kiocb) and run the generic IO path as CIT_READ at iocb->ki_pos.
 */
993 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
994 unsigned long nr_segs, loff_t pos)
997 struct vvp_io_args *args;
1003 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1007 env = cl_env_get(&refcheck);
1009 RETURN(PTR_ERR(env));
1011 args = vvp_env_args(env, IO_NORMAL);
1012 args->u.normal.via_iov = (struct iovec *)iov;
1013 args->u.normal.via_nrsegs = nr_segs;
1014 args->u.normal.via_iocb = iocb;
1016 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1017 &iocb->ki_pos, count);
1018 cl_env_put(env, &refcheck);
/*
 * Single-buffer read (AIO variant): build a synchronous kiocb and local
 * iovec in the cl environment, call ll_file_aio_read(), then propagate the
 * updated position back to *ppos.
 */
1022 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1026 struct iovec *local_iov;
1027 struct kiocb *kiocb;
1032 env = cl_env_get(&refcheck);
1034 RETURN(PTR_ERR(env));
1036 local_iov = &vvp_env_info(env)->vti_local_iov;
1037 kiocb = &vvp_env_info(env)->vti_kiocb;
1038 local_iov->iov_base = (void __user *)buf;
1039 local_iov->iov_len = count;
1040 init_sync_kiocb(kiocb, file);
1041 kiocb->ki_pos = *ppos;
1042 kiocb->ki_left = count;
1044 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1045 *ppos = kiocb->ki_pos;
1047 cl_env_put(env, &refcheck);
1053 * Write to a file (through the page cache).
1055 #ifdef HAVE_FILE_WRITEV
/*
 * Vectored write entry point (HAVE_FILE_WRITEV kernels): mirror of
 * ll_file_readv() but runs the generic IO path as CIT_WRITE.
 */
1056 static ssize_t ll_file_writev(struct file *file, const struct iovec *iov,
1057 unsigned long nr_segs, loff_t *ppos)
1060 struct vvp_io_args *args;
1066 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1070 env = cl_env_get(&refcheck);
1072 RETURN(PTR_ERR(env));
1074 args = vvp_env_args(env, IO_NORMAL);
1075 args->u.normal.via_iov = (struct iovec *)iov;
1076 args->u.normal.via_nrsegs = nr_segs;
1078 result = ll_file_io_generic(env, args, file, CIT_WRITE, ppos, count);
1079 cl_env_put(env, &refcheck);
/*
 * Single-buffer write (HAVE_FILE_WRITEV variant): wrap @buf/@count in the
 * per-env local iovec and delegate to ll_file_writev().
 */
1083 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1087 struct iovec *local_iov;
1092 env = cl_env_get(&refcheck);
1094 RETURN(PTR_ERR(env));
1096 local_iov = &vvp_env_info(env)->vti_local_iov;
1097 local_iov->iov_base = (void __user *)buf;
1098 local_iov->iov_len = count;
1100 result = ll_file_writev(file, local_iov, 1, ppos);
1101 cl_env_put(env, &refcheck);
1105 #else /* AIO stuff */
/*
 * AIO write entry point: mirror of ll_file_aio_read() but runs the generic
 * IO path as CIT_WRITE at iocb->ki_pos.
 */
1106 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1107 unsigned long nr_segs, loff_t pos)
1110 struct vvp_io_args *args;
1116 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1120 env = cl_env_get(&refcheck);
1122 RETURN(PTR_ERR(env));
1124 args = vvp_env_args(env, IO_NORMAL);
1125 args->u.normal.via_iov = (struct iovec *)iov;
1126 args->u.normal.via_nrsegs = nr_segs;
1127 args->u.normal.via_iocb = iocb;
1129 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1130 &iocb->ki_pos, count);
1131 cl_env_put(env, &refcheck);
/*
 * Single-buffer write (AIO variant): build a synchronous kiocb and local
 * iovec, call ll_file_aio_write(), then propagate the updated position
 * back to *ppos.
 */
1135 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1139 struct iovec *local_iov;
1140 struct kiocb *kiocb;
1145 env = cl_env_get(&refcheck);
1147 RETURN(PTR_ERR(env));
1149 local_iov = &vvp_env_info(env)->vti_local_iov;
1150 kiocb = &vvp_env_info(env)->vti_kiocb;
1151 local_iov->iov_base = (void __user *)buf;
1152 local_iov->iov_len = count;
1153 init_sync_kiocb(kiocb, file);
1154 kiocb->ki_pos = *ppos;
1155 kiocb->ki_left = count;
1157 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1158 *ppos = kiocb->ki_pos;
1160 cl_env_put(env, &refcheck);
1166 #ifdef HAVE_KERNEL_SENDFILE
1168 * Send file content (through pagecache) somewhere with helper
/*
 * sendfile() support (HAVE_KERNEL_SENDFILE kernels): pack @actor/@target
 * into IO_SENDFILE args and run the generic IO path as CIT_READ.
 */
1170 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1171 read_actor_t actor, void *target)
1174 struct vvp_io_args *args;
1179 env = cl_env_get(&refcheck);
1181 RETURN(PTR_ERR(env));
1183 args = vvp_env_args(env, IO_SENDFILE);
1184 args->u.sendfile.via_target = target;
1185 args->u.sendfile.via_actor = actor;
1187 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1188 cl_env_put(env, &refcheck);
1193 #ifdef HAVE_KERNEL_SPLICE_READ
1195 * Send file content (through pagecache) somewhere with helper
/*
 * splice_read() support (HAVE_KERNEL_SPLICE_READ kernels): pack the pipe
 * and flags into IO_SPLICE args and run the generic IO path as CIT_READ.
 */
1197 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1198 struct pipe_inode_info *pipe, size_t count,
1202 struct vvp_io_args *args;
1207 env = cl_env_get(&refcheck);
1209 RETURN(PTR_ERR(env));
1211 args = vvp_env_args(env, IO_SPLICE);
1212 args->u.splice.via_pipe = pipe;
1213 args->u.splice.via_flags = flags;
1215 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1216 cl_env_put(env, &refcheck);
/*
 * Recreate lost OST objects for the file: under the inode size lock, clone
 * the stripe MD, build an obdo carrying the object @id/@seq, target OST
 * index (smuggled in o_nlink) and OBD_FL_RECREATE_OBJS, then ask the OSC
 * layer to re-create the objects via obd_create().
 * NOTE(review): obdo allocation and some error branches are missing from
 * this extract.
 */
1221 static int ll_lov_recreate(struct inode *inode, obd_id id, obd_seq seq,
1224 struct obd_export *exp = ll_i2dtexp(inode);
1225 struct obd_trans_info oti = { 0 };
1226 struct obdo *oa = NULL;
1229 struct lov_stripe_md *lsm, *lsm2;
1236 ll_inode_size_lock(inode, 0);
1237 lsm = ll_i2info(inode)->lli_smd;
1239 GOTO(out, rc = -ENOENT);
1240 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1241 (lsm->lsm_stripe_count));
1243 OBD_ALLOC_LARGE(lsm2, lsm_size);
1245 GOTO(out, rc = -ENOMEM);
1249 oa->o_nlink = ost_idx;
1250 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1251 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1252 obdo_from_inode(oa, inode, &ll_i2info(inode)->lli_fid, OBD_MD_FLTYPE |
1253 OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1254 memcpy(lsm2, lsm, lsm_size);
1255 rc = obd_create(exp, oa, &lsm2, &oti);
1257 OBD_FREE_LARGE(lsm2, lsm_size);
1260 ll_inode_size_unlock(inode, 0);
/*
 * LL_IOC_RECREATE_OBJ ioctl: admin-only (CAP_SYS_ADMIN).  Copies a
 * ll_recreate_obj request from userspace and recreates the object with
 * seq 0 on the given OST index.
 */
1265 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1267 struct ll_recreate_obj ucreat;
1270 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1273 if (cfs_copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1274 sizeof(struct ll_recreate_obj)))
1277 RETURN(ll_lov_recreate(inode, ucreat.lrc_id, 0,
1278 ucreat.lrc_ost_idx));
/*
 * LL_IOC_RECREATE_FID ioctl: admin-only.  Copies a lu_fid from userspace
 * and decodes object id and OST index from its oid/seq bit fields before
 * recreating the object.
 */
1281 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1288 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1291 if (cfs_copy_from_user(&fid, (struct lu_fid *)arg,
1292 sizeof(struct lu_fid)))
/* id = low 16 bits of seq in bits 32..47 | oid; ost_idx = seq bits 16..31 */
1295 id = fid_oid(&fid) | ((fid_seq(&fid) & 0xffff) << 32);
1296 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1297 RETURN(ll_lov_recreate(inode, id, 0, ost_idx));
/*
 * Set striping information (@lum of @lum_size bytes) on @inode via an
 * intent open carrying the LOV EA.  Fails if a stripe MD already exists
 * (striping can only be set once), releases the resulting open handle and
 * the intent/request on all paths.
 * NOTE(review): success-path lines between the open and the unlock are
 * missing from this extract.
 */
1300 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1301 int flags, struct lov_user_md *lum, int lum_size)
1303 struct lov_stripe_md *lsm;
1304 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1308 ll_inode_size_lock(inode, 0);
1309 lsm = ll_i2info(inode)->lli_smd;
1311 ll_inode_size_unlock(inode, 0);
1312 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1317 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1320 rc = oit.d.lustre.it_status;
1322 GOTO(out_req_free, rc);
1324 ll_release_openhandle(file->f_dentry, &oit);
1327 ll_inode_size_unlock(inode, 0);
1328 ll_intent_release(&oit);
1331 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/* Fetch the LOV EA (striping metadata) for @filename via an MDS
 * getattr-by-name RPC.  On success *lmmp points into the reply buffer of
 * *request (caller must keep the request pinned while using the lmm and
 * finish it afterwards) and *lmm_size is the EA size. */
1335 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1336                              struct lov_mds_md **lmmp, int *lmm_size,
1337                              struct ptlrpc_request **request)
1339         struct ll_sb_info *sbi = ll_i2sbi(inode);
1340         struct mdt_body *body;
1341         struct lov_mds_md *lmm = NULL;
1342         struct ptlrpc_request *req = NULL;
1343         struct md_op_data *op_data;
/* Ask for the largest EA the MDS may return so the reply buffer fits. */
1346         rc = ll_get_max_mdsize(sbi, &lmmsize);
1350         op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1351                                      strlen(filename), lmmsize,
1352                                      LUSTRE_OPC_ANY, NULL);
1353         if (op_data == NULL)
1356         op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1357         rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1358         ll_finish_md_op_data(op_data);
1360                 CDEBUG(D_INFO, "md_getattr_name failed "
1361                        "on %s: rc %d\n", filename, rc);
1365         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1366         LASSERT(body != NULL); /* checked by mdc_getattr_name */
1368         lmmsize = body->eadatasize;
/* No EA present (or zero-size) means the file simply has no striping. */
1370         if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1372                 GOTO(out, rc = -ENODATA);
1375         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1376         LASSERT(lmm != NULL);
1378         if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1379             (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1380                 GOTO(out, rc = -EPROTO);
1384          * This is coming from the MDS, so is probably in
1385          * little endian.  We convert it to host endian before
1386          * passing it to userspace.
/* On a little-endian host this condition is false and no swabbing runs. */
1388         if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1389                 /* if function called for directory - we should
1390                  * avoid swab not existent lsm objects */
1391                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1392                         lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1393                         if (S_ISREG(body->mode))
1394                                 lustre_swab_lov_user_md_objects(
1395                                  ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1396                                  ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
1397                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1398                         lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1399                         if (S_ISREG(body->mode))
1400                                 lustre_swab_lov_user_md_objects(
1401                                  ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1402                                  ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
1408         *lmm_size = lmmsize;
/* LL_IOC_LOV_SETEA handler: admin-only setstripe that includes explicit
 * object data (one lov_user_ost_data after the lov_user_md).  Copies the
 * buffer from userspace and forwards to ll_lov_setstripe_ea_info(). */
1413 static int ll_lov_setea(struct inode *inode, struct file *file,
1416         int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1417         struct lov_user_md *lump;
1418         int lum_size = sizeof(struct lov_user_md) +
1419                        sizeof(struct lov_user_ost_data);
1423         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1426         OBD_ALLOC_LARGE(lump, lum_size);
1430         if (cfs_copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1431                 OBD_FREE_LARGE(lump, lum_size);
1435         rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1437         OBD_FREE_LARGE(lump, lum_size);
/* LL_IOC_LOV_SETSTRIPE handler: copy the user's lov_user_md (v1 first,
 * re-copy as v3 if the magic says so), set the stripe EA, then echo the
 * server-chosen layout back to userspace via a GETSTRIPE iocontrol.
 * NOTE(review): the 0-return and error paths are elided in this listing. */
1441 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1444         struct lov_user_md_v3 lumv3;
1445         struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1446         struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1447         struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1450         int flags = FMODE_WRITE;
1453         /* first try with v1 which is smaller than v3 */
1454         lum_size = sizeof(struct lov_user_md_v1);
1455         if (cfs_copy_from_user(lumv1, lumv1p, lum_size))
1458         if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1459                 lum_size = sizeof(struct lov_user_md_v3);
1460                 if (cfs_copy_from_user(&lumv3, lumv3p, lum_size))
1464         rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
/* Zero the user's stripe_count first so a failed copy-back is detectable. */
1466                 put_user(0, &lumv1p->lmm_stripe_count);
1467                 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1468                                    0, ll_i2info(inode)->lli_smd,
/* LL_IOC_LOV_GETSTRIPE handler: forward the cached stripe metadata
 * (lli_smd) to the LOV layer, which copies it out to userspace.
 * NOTE(review): the lsm == NULL check is elided from this listing. */
1474 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1476         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1481         rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
/* LL_IOC_GROUP_LOCK handler: acquire a cluster-wide group lock with gid
 * @arg and record it in the per-open file data.  The lli_lock spinlock is
 * dropped around cl_get_grouplock() (it may block), so the flag is
 * re-checked afterwards to handle a racing acquirer. */
1486 int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1488         struct ll_inode_info *lli = ll_i2info(inode);
1489         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1490         struct ccc_grouplock grouplock;
1494         if (ll_file_nolock(file))
1495                 RETURN(-EOPNOTSUPP);
1497         cfs_spin_lock(&lli->lli_lock);
1498         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1499                 CWARN("group lock already existed with gid %lu\n",
1500                       fd->fd_grouplock.cg_gid);
1501                 cfs_spin_unlock(&lli->lli_lock);
1504         LASSERT(fd->fd_grouplock.cg_lock == NULL);
1505         cfs_spin_unlock(&lli->lli_lock);
/* May block if another (non-group) lock conflicts, unless O_NONBLOCK. */
1507         rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1508                               arg, (file->f_flags & O_NONBLOCK), &grouplock);
1512         cfs_spin_lock(&lli->lli_lock);
1513         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1514                 cfs_spin_unlock(&lli->lli_lock);
1515                 CERROR("another thread just won the race\n");
1516                 cl_put_grouplock(&grouplock);
1520         fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1521         fd->fd_grouplock = grouplock;
1522         cfs_spin_unlock(&lli->lli_lock);
1524         CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/* LL_IOC_GROUP_UNLOCK handler: release the group lock with gid @arg held
 * on this open file.  The fd state is cleared under lli_lock; the actual
 * lock release happens after the spinlock is dropped. */
1528 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1530         struct ll_inode_info *lli = ll_i2info(inode);
1531         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1532         struct ccc_grouplock grouplock;
1535         cfs_spin_lock(&lli->lli_lock);
1536         if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1537                 cfs_spin_unlock(&lli->lli_lock);
1538                 CWARN("no group lock held\n");
1541         LASSERT(fd->fd_grouplock.cg_lock != NULL);
/* Refuse to drop a lock the caller does not own (gid mismatch). */
1543         if (fd->fd_grouplock.cg_gid != arg) {
1544                 CWARN("group lock %lu doesn't match current id %lu\n",
1545                       arg, fd->fd_grouplock.cg_gid);
1546                 cfs_spin_unlock(&lli->lli_lock);
1550         grouplock = fd->fd_grouplock;
1551         memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1552         fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1553         cfs_spin_unlock(&lli->lli_lock);
1555         cl_put_grouplock(&grouplock);
1556         CDEBUG(D_INFO, "group lock %lu released\n", arg);
 * Close inode open handle
 *
 * \param dentry [in]     dentry which contains the inode
 * \param it     [in,out] intent which contains open info and result
 *
 * \retval 0     success
 * \retval <0    failure
/* Releases the MDS open handle obtained as a side effect of an intent
 * (e.g. during setstripe), so the open count on the MDS is balanced.
 * Root and intents without DISP_OPEN_OPEN are no-ops. */
1569 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1571         struct inode *inode = dentry->d_inode;
1572         struct obd_client_handle *och;
1578         /* Root ? Do nothing. */
1579         if (dentry->d_inode->i_sb->s_root == dentry)
1582         /* No open handle to close? Move away */
1583         if (!it_disposition(it, DISP_OPEN_OPEN))
1586         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1588         OBD_ALLOC(och, sizeof(*och));
1590                 GOTO(out, rc = -ENOMEM);
/* Fill the client handle from the intent, then send the close RPC. */
1592         ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
1593                     ll_i2info(inode), it, och);
1595         rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1598         /* this one is in place of ll_file_open */
1599         if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1600                 ptlrpc_req_finished(it->d.lustre.it_data);
1601                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
 * Get size for inode for which FIEMAP mapping is requested.
 * Make the FIEMAP get_info call and returns the result.
/* Core FIEMAP worker shared by the ioctl path and ->fiemap: validates the
 * flags, optionally syncs dirty pages, and asks the LOV/OSC layers for the
 * extent mapping via obd_get_info(KEY_FIEMAP). */
1610 int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1613         struct obd_export *exp = ll_i2dtexp(inode);
1614         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1615         struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1616         int vallen = num_bytes;
1620         /* Checks for fiemap flags */
1621         if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* Report back which flags we do NOT support (FIEMAP convention). */
1622                 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1626         /* Check for FIEMAP_FLAG_SYNC */
1627         if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1628                 rc = filemap_fdatawrite(inode->i_mapping);
1633         /* If the stripe_count > 1 and the application does not understand
1634          * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1636         if (lsm->lsm_stripe_count > 1 &&
1637             !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1640         fm_key.oa.o_id = lsm->lsm_object_id;
1641         fm_key.oa.o_seq = lsm->lsm_object_seq;
1642         fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1644         obdo_from_inode(&fm_key.oa, inode, &ll_i2info(inode)->lli_fid,
1646         /* If filesize is 0, then there would be no objects for mapping */
1647         if (fm_key.oa.o_size == 0) {
1648                 fiemap->fm_mapped_extents = 0;
1652         memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1654         rc = obd_get_info(exp, sizeof(fm_key), &fm_key, &vallen, fiemap, lsm);
1656                 CERROR("obd_get_info failed: rc = %d\n", rc);
/* OBD_IOC_FID2PATH handler: read the fixed-size request header from
 * userspace to learn gf_pathlen, allocate an output buffer large enough
 * for the path, run the MDC iocontrol, and copy the result back.
 * NOTE(review): the NULL checks after the allocations and the final
 * RETURN are elided in this listing. */
1661 int ll_fid2path(struct obd_export *exp, void *arg)
1663         struct getinfo_fid2path *gfout, *gfin;
1667         /* Need to get the buflen */
1668         OBD_ALLOC_PTR(gfin);
1671         if (cfs_copy_from_user(gfin, arg, sizeof(*gfin))) {
1676         outsize = sizeof(*gfout) + gfin->gf_pathlen;
1677         OBD_ALLOC(gfout, outsize);
1678         if (gfout == NULL) {
1682         memcpy(gfout, gfin, sizeof(*gfout));
1685         /* Call mdc_iocontrol */
1686         rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1689         if (cfs_copy_to_user(arg, gfout, outsize))
1693         OBD_FREE(gfout, outsize);
/* FSFILT_IOC_FIEMAP ioctl handler: size the kernel fiemap buffer from the
 * user's fm_extent_count, copy the request (and the first extent, used to
 * continue a previous mapping) in, run ll_do_fiemap(), and copy the header
 * plus mapped extents back out. */
1697 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1699         struct ll_user_fiemap *fiemap_s;
1700         size_t num_bytes, ret_bytes;
1701         unsigned int extent_count;
1704         /* Get the extent count so we can calculate the size of
1705          * required fiemap buffer */
1706         if (get_user(extent_count,
1707             &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
/* NOTE(review): extent_count is user-controlled; the multiplication below
 * could overflow on 32-bit — confirm an upper bound is enforced upstream. */
1709         num_bytes = sizeof(*fiemap_s) + (extent_count *
1710                     sizeof(struct ll_fiemap_extent));
1712         OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1713         if (fiemap_s == NULL)
1716         /* get the fiemap value */
1717         if (copy_from_user(fiemap_s,(struct ll_user_fiemap __user *)arg,
1719                 GOTO(error, rc = -EFAULT);
1721         /* If fm_extent_count is non-zero, read the first extent since
1722          * it is used to calculate end_offset and device from previous
1725                 if (copy_from_user(&fiemap_s->fm_extents[0],
1726                     (char __user *)arg + sizeof(*fiemap_s),
1727                     sizeof(struct ll_fiemap_extent)))
1728                         GOTO(error, rc = -EFAULT);
1731         rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1735         ret_bytes = sizeof(struct ll_user_fiemap);
1737         if (extent_count != 0)
1738                 ret_bytes += (fiemap_s->fm_mapped_extents *
1739                                  sizeof(struct ll_fiemap_extent));
1741         if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1745         OBD_FREE_LARGE(fiemap_s, num_bytes);
/* Main file ioctl dispatcher for regular files.  Two prototypes are kept:
 * the unlocked_ioctl form (inode derived from the file) and the legacy
 * ->ioctl form that receives the inode explicitly. */
1749 #ifdef HAVE_UNLOCKED_IOCTL
1750 long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1752         struct inode *inode = file->f_dentry->d_inode;
1754 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
1758         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1762         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
1763                inode->i_generation, inode, cmd);
1764         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
1766         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
1767         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
1771         case LL_IOC_GETFLAGS:
1772                 /* Get the current value of the file flags */
1773                 return put_user(fd->fd_flags, (int *)arg);
1774         case LL_IOC_SETFLAGS:
1775         case LL_IOC_CLRFLAGS:
1776                 /* Set or clear specific file flags */
1777                 /* XXX This probably needs checks to ensure the flags are
1778                  * not abused, and to handle any flag side effects.
1780                 if (get_user(flags, (int *) arg))
1783                 if (cmd == LL_IOC_SETFLAGS) {
/* Locking can only be disabled for O_DIRECT files: cached pages would
 * otherwise be left unprotected. */
1784                         if ((flags & LL_FILE_IGNORE_LOCK) &&
1785                             !(file->f_flags & O_DIRECT)) {
1786                                 CERROR("%s: unable to disable locking on "
1787                                        "non-O_DIRECT file\n", current->comm);
1791                         fd->fd_flags |= flags;
1793                         fd->fd_flags &= ~flags;
1796         case LL_IOC_LOV_SETSTRIPE:
1797                 RETURN(ll_lov_setstripe(inode, file, arg));
1798         case LL_IOC_LOV_SETEA:
1799                 RETURN(ll_lov_setea(inode, file, arg));
1800         case LL_IOC_LOV_GETSTRIPE:
1801                 RETURN(ll_lov_getstripe(inode, arg));
1802         case LL_IOC_RECREATE_OBJ:
1803                 RETURN(ll_lov_recreate_obj(inode, arg));
1804         case LL_IOC_RECREATE_FID:
1805                 RETURN(ll_lov_recreate_fid(inode, arg));
1806         case FSFILT_IOC_FIEMAP:
1807                 RETURN(ll_ioctl_fiemap(inode, arg));
1808         case FSFILT_IOC_GETFLAGS:
1809         case FSFILT_IOC_SETFLAGS:
1810                 RETURN(ll_iocontrol(inode, file, cmd, arg));
1811         case FSFILT_IOC_GETVERSION_OLD:
1812         case FSFILT_IOC_GETVERSION:
1813                 RETURN(put_user(inode->i_generation, (int *)arg));
1814         case LL_IOC_GROUP_LOCK:
1815                 RETURN(ll_get_grouplock(inode, file, arg));
1816         case LL_IOC_GROUP_UNLOCK:
1817                 RETURN(ll_put_grouplock(inode, file, arg));
1818         case IOC_OBD_STATFS:
1819                 RETURN(ll_obd_statfs(inode, (void *)arg));
1821         /* We need to special case any other ioctls we want to handle,
1822          * to send them to the MDS/OST as appropriate and to properly
1823          * network encode the arg field.
1824         case FSFILT_IOC_SETVERSION_OLD:
1825         case FSFILT_IOC_SETVERSION:
1827         case LL_IOC_FLUSHCTX:
1828                 RETURN(ll_flush_ctx(inode));
1829         case LL_IOC_PATH2FID: {
1830                 if (cfs_copy_to_user((void *)arg, ll_inode2fid(inode),
1831                                      sizeof(struct lu_fid)))
1836         case OBD_IOC_FID2PATH:
1837                 RETURN(ll_fid2path(ll_i2mdexp(inode), (void *)arg));
1839         case LL_IOC_GET_MDTIDX: {
1842                 mdtidx = ll_get_mdt_idx(inode);
1846                 if (put_user((int)mdtidx, (int*)arg))
/* default: give dynamically registered ioctl handlers a chance first,
 * then fall through to the OSC/LOV iocontrol. */
1856                     ll_iocontrol_call(inode, file, cmd, arg, &err))
1859                 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
/* llseek implementation.  For SEEK_END the true file size must be fetched
 * from the OSTs via a glimpse lock before i_size_read() is meaningful. */
1865 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
1867         struct inode *inode = file->f_dentry->d_inode;
1870         retval = offset + ((origin == 2) ? i_size_read(inode) :
1871                            (origin == 1) ? file->f_pos : 0);
1872         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%Lu=%#Lx(%s)\n",
1873                inode->i_ino, inode->i_generation, inode, retval, retval,
1874                origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR" : "SEEK_SET");
1875         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
1877         if (origin == 2) { /* SEEK_END */
1878                 int nonblock = 0, rc;
1880                 if (file->f_flags & O_NONBLOCK)
1881                         nonblock = LDLM_FL_BLOCK_NOWAIT;
/* Refresh i_size from the servers; NOTE(review): 'nonblock' is computed
 * but not visibly passed to cl_glimpse_size() in this elided listing. */
1883                 rc = cl_glimpse_size(inode);
1887                 offset += i_size_read(inode);
1888         } else if (origin == 1) { /* SEEK_CUR */
1889                 offset += file->f_pos;
1893         if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
1894                 if (offset != file->f_pos) {
1895                         file->f_pos = offset;
/* ->flush handler (called on every close of a file descriptor): report any
 * async write error recorded for this inode so close() returns -EIO rather
 * than silently dropping the failure.  Two prototypes for kernel compat. */
1903 #ifdef HAVE_FLUSH_OWNER_ID
1904 int ll_flush(struct file *file, fl_owner_t id)
1906 int ll_flush(struct file *file)
1909         struct inode *inode = file->f_dentry->d_inode;
1910         struct ll_inode_info *lli = ll_i2info(inode);
1911         struct lov_stripe_md *lsm = lli->lli_smd;
1914         /* the application should know write failure already. */
1915         if (lli->lli_write_rc)
1918         /* catch async errors that were recorded back when async writeback
1919          * failed for pages in this mapping. */
/* Read-and-clear so the error is reported exactly once. */
1920         rc = lli->lli_async_rc;
1921         lli->lli_async_rc = 0;
1923                 err = lov_test_and_clear_async_rc(lsm);
1928         return rc ? -EIO : 0;
/* fsync implementation: wait for outstanding page I/O, pick up recorded
 * async writeback errors, sync metadata via md_sync() to the MDS, then
 * (for striped files) sync data via obd_sync_rqset() to the OSTs. */
1931 int ll_fsync(struct file *file, struct dentry *dentry, int data)
1933         struct inode *inode = dentry->d_inode;
1934         struct ll_inode_info *lli = ll_i2info(inode);
1935         struct lov_stripe_md *lsm = lli->lli_smd;
1936         struct ptlrpc_request *req;
1937         struct obd_capa *oc;
1940         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
1941                inode->i_generation, inode);
1942         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
1944         /* fsync's caller has already called _fdata{sync,write}, we want
1945          * that IO to finish before calling the osc and mdc sync methods */
1946         rc = filemap_fdatawait(inode->i_mapping);
1948         /* catch async errors that were recorded back when async writeback
1949          * failed for pages in this mapping. */
1950                 err = lli->lli_async_rc;
1951                 lli->lli_async_rc = 0;
1955                 err = lov_test_and_clear_async_rc(lsm);
/* Metadata sync to the MDS, authorized by the inode's MDS capability. */
1960         oc = ll_mdscapa_get(inode);
1961         err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
1967                 ptlrpc_req_finished(req);
/* Data sync path (runs when the file has stripe metadata). */
1970                 struct obd_info *oinfo;
1972                 OBD_ALLOC_PTR(oinfo);
1974                         RETURN(rc ? rc : -ENOMEM);
1975                 OBDO_ALLOC(oinfo->oi_oa);
1976                 if (!oinfo->oi_oa) {
1977                         OBD_FREE_PTR(oinfo);
1978                         RETURN(rc ? rc : -ENOMEM);
1980                 oinfo->oi_oa->o_id = lsm->lsm_object_id;
1981                 oinfo->oi_oa->o_seq = lsm->lsm_object_seq;
1982                 oinfo->oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1983                 obdo_from_inode(oinfo->oi_oa, inode, &ll_i2info(inode)->lli_fid,
1984                                 OBD_MD_FLTYPE | OBD_MD_FLATIME |
1985                                 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
1988                 oinfo->oi_capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
1989                 err = obd_sync_rqset(ll_i2sbi(inode)->ll_dt_exp, oinfo, 0,
1991                 capa_put(oinfo->oi_capa);
1994                 OBDO_FREE(oinfo->oi_oa);
1995                 OBD_FREE_PTR(oinfo);
/* NOTE(review): 'err < 0 ? : 0' stores err itself when negative (GNU
 * elvis operator) — i.e. lli_write_rc = min(err, 0); confirm intent. */
1996                 lli->lli_write_rc = err < 0 ? : 0;
/* flock()/fcntl() lock handler: translate the kernel file_lock into an
 * LDLM_FLOCK enqueue against the MDS, then mirror the result into the
 * local lock tables so the VFS sees consistent state. */
2002 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2004         struct inode *inode = file->f_dentry->d_inode;
2005         struct ll_sb_info *sbi = ll_i2sbi(inode);
2006         struct ldlm_enqueue_info einfo = { .ei_type = LDLM_FLOCK,
2007                                            .ei_cb_cp =ldlm_flock_completion_ast,
2008                                            .ei_cbdata = file_lock };
2009         struct md_op_data *op_data;
2010         struct lustre_handle lockh = {0};
2011         ldlm_policy_data_t flock = {{0}};
2016         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2017                inode->i_ino, file_lock);
2019         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2021         if (file_lock->fl_flags & FL_FLOCK) {
2022                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2023                 /* flocks are whole-file locks */
2024                 flock.l_flock.end = OFFSET_MAX;
2025                 /* For flocks owner is determined by the local file desctiptor*/
2026                 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2027         } else if (file_lock->fl_flags & FL_POSIX) {
2028                 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2029                 flock.l_flock.start = file_lock->fl_start;
2030                 flock.l_flock.end = file_lock->fl_end;
2034         flock.l_flock.pid = file_lock->fl_pid;
2036         /* Somewhat ugly workaround for svc lockd.
2037          * lockd installs custom fl_lmops->fl_compare_owner that checks
2038          * for the fl_owner to be the same (which it always is on local node
2039          * I guess between lockd processes) and then compares pid.
2040          * As such we assign pid to the owner field to make it all work,
2041          * conflict with normal locks is unlikely since pid space and
2042          * pointer space for current->files are not intersecting */
2043         if (file_lock->fl_lmops && file_lock->fl_lmops->fl_compare_owner)
2044                 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* Map F_RDLCK/F_UNLCK/F_WRLCK to LDLM modes PR/NL/PW respectively. */
2046         switch (file_lock->fl_type) {
2048                 einfo.ei_mode = LCK_PR;
2051                 /* An unlock request may or may not have any relation to
2052                  * existing locks so we may not be able to pass a lock handle
2053                  * via a normal ldlm_lock_cancel() request. The request may even
2054                  * unlock a byte range in the middle of an existing lock. In
2055                  * order to process an unlock request we need all of the same
2056                  * information that is given with a normal read or write record
2057                  * lock request. To avoid creating another ldlm unlock (cancel)
2058                  * message we'll treat a LCK_NL flock request as an unlock. */
2059                 einfo.ei_mode = LCK_NL;
2062                 einfo.ei_mode = LCK_PW;
2065                 CERROR("unknown fcntl lock type: %d\n", file_lock->fl_type);
/* cmd handling (elided): F_SETLK → non-blocking, F_GETLK → test-only. */
2080                 flags = LDLM_FL_BLOCK_NOWAIT;
2086                 flags = LDLM_FL_TEST_LOCK;
2087                 /* Save the old mode so that if the mode in the lock changes we
2088                  * can decrement the appropriate reader or writer refcount. */
2089                 file_lock->fl_type = einfo.ei_mode;
2092                 CERROR("unknown fcntl lock command: %d\n", cmd);
2096         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2097                                      LUSTRE_OPC_ANY, NULL);
2098         if (IS_ERR(op_data))
2099                 RETURN(PTR_ERR(op_data));
2101         CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2102                "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2103                flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2105         rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2106                         op_data, &lockh, &flock, 0, NULL /* req */, flags);
2108         ll_finish_md_op_data(op_data);
/* Record the lock locally so the VFS bookkeeping matches the cluster. */
2110         if ((file_lock->fl_flags & FL_FLOCK) &&
2111             (rc == 0 || file_lock->fl_type == F_UNLCK))
2112                 ll_flock_lock_file_wait(file, file_lock, (cmd == F_SETLKW));
2113 #ifdef HAVE_F_OP_FLOCK
2114         if ((file_lock->fl_flags & FL_POSIX) &&
2115             (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2116             !(flags & LDLM_FL_TEST_LOCK))
2117                 posix_lock_file_wait(file, file_lock);
2123 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
 * test if some locks matching bits and l_req_mode are acquired
 * - bits can be in different locks
 * - if found clear the common lock bits in *bits
 * - the bits not found, are kept in *bits
 * \param bits [IN] searched lock bits [IN]
 * \param l_req_mode [IN] searched lock mode
 * \retval boolean, true iff all bits are found
2140 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
2142         struct lustre_handle lockh;
2143         ldlm_policy_data_t policy;
/* LCK_MINMODE means "any mode": match against all four standard modes. */
2144         ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2145                                 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2154         fid = &ll_i2info(inode)->lli_fid;
2155         CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2156                ldlm_lockname[mode]);
/* TEST_LOCK: match without taking a reference on the found lock. */
2158         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2159         for (i = 0; i < MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2160                 policy.l_inodebits.bits = *bits & (1 << i);
2161                 if (policy.l_inodebits.bits == 0)
2164                 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2165                                   &policy, mode, &lockh)) {
2166                         struct ldlm_lock *lock;
2168                         lock = ldlm_handle2lock(&lockh);
/* Clear every bit the matched lock actually covers, not just bit i. */
2171                                         ~(lock->l_policy_data.l_inodebits.bits);
2172                                 LDLM_LOCK_PUT(lock);
2174                                 *bits &= ~policy.l_inodebits.bits;
/* Try to match (and take a reference on) a granted MDS inodebits lock
 * covering @bits in any of CR/CW/PR/PW; returns the matched mode (0 if
 * none) and fills *lockh with the lock handle. */
2181 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2182                             struct lustre_handle *lockh)
2184         ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2190         fid = &ll_i2info(inode)->lli_fid;
2191         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2193         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING;
2194         rc = md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2195                            LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
/* Post-process the result of a revalidation RPC: treat -ENOENT on an
 * already-unlinked inode as success (the dentry will be dropped by the
 * caller); log any other failure. */
2199 static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
2200         if (rc == -ENOENT) { /* Already unlinked. Just update nlink
2201                               * and return success */
2203                 /* This path cannot be hit for regular files unless in
2204                  * case of obscure races, so no need to to validate
2206                 if (!S_ISREG(inode->i_mode) &&
2207                     !S_ISDIR(inode->i_mode))
2212                 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
/* Revalidate a dentry's MDS attributes.  With OBD_CONNECT_ATTRFID the
 * getattr is done by FID through an intent lock; otherwise a plain
 * md_getattr() is issued only when no covering ibits lock is cached. */
2220 int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2223         struct inode *inode = dentry->d_inode;
2224         struct ptlrpc_request *req = NULL;
2225         struct ll_sb_info *sbi;
2226         struct obd_export *exp;
/* Elided guard: presumably inode == NULL should never happen here. */
2231                 CERROR("REPORT THIS LINE TO PETER\n");
2234         sbi = ll_i2sbi(inode);
2236         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2237                inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2239         exp = ll_i2mdexp(inode);
2241         /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
2242          *      But under CMD case, it caused some lock issues, should be fixed
2243          *      with new CMD ibits lock. See bug 12718 */
2244         if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
2245                 struct lookup_intent oit = { .it_op = IT_GETATTR };
2246                 struct md_op_data *op_data;
2248                 if (ibits == MDS_INODELOCK_LOOKUP)
2249                         oit.it_op = IT_LOOKUP;
2251                 /* Call getattr by fid, so do not provide name at all. */
2252                 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2253                                              dentry->d_inode, NULL, 0, 0,
2254                                              LUSTRE_OPC_ANY, NULL);
2255                 if (IS_ERR(op_data))
2256                         RETURN(PTR_ERR(op_data));
/* M_CHECK_STALE makes the MDS verify the inode is still alive. */
2258                 oit.it_create_mode |= M_CHECK_STALE;
2259                 rc = md_intent_lock(exp, op_data, NULL, 0,
2260                                     /* we are not interested in name
2263                                     ll_md_blocking_ast, 0);
2264                 ll_finish_md_op_data(op_data);
2265                 oit.it_create_mode &= ~M_CHECK_STALE;
2267                         rc = ll_inode_revalidate_fini(inode, rc);
2271                 rc = ll_revalidate_it_finish(req, &oit, dentry);
2273                         ll_intent_release(&oit);
2277                 /* Unlinked? Unhash dentry, so it is not picked up later by
2278                    do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2279                    here to preserve get_cwd functionality on 2.6.
2281                 if (!dentry->d_inode->i_nlink) {
2282                         cfs_spin_lock(&ll_lookup_lock);
2283                         spin_lock(&dcache_lock);
2284                         ll_drop_dentry(dentry);
2285                         spin_unlock(&dcache_lock);
2286                         cfs_spin_unlock(&ll_lookup_lock);
2289                 ll_lookup_finish_locks(&oit, dentry);
/* Fallback path: only refetch attributes if no cached lock covers ibits. */
2290         } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
2291                 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2292                 obd_valid valid = OBD_MD_FLGETATTR;
2293                 struct md_op_data *op_data;
2296                 if (S_ISREG(inode->i_mode)) {
2297                         rc = ll_get_max_mdsize(sbi, &ealen);
2300                         valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2303                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
2304                                              0, ealen, LUSTRE_OPC_ANY,
2306                 if (op_data == NULL)
2309                 op_data->op_valid = valid;
2310                 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2311                  * capa for this inode. Because we only keep capas of dirs
2313                 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
2314                 ll_finish_md_op_data(op_data);
2316                         rc = ll_inode_revalidate_fini(inode, rc);
2320                 rc = ll_prep_inode(&inode, req, NULL);
2323         ptlrpc_req_finished(req);
/* Revalidate attributes, then refresh the size/time attributes: if no OST
 * objects exist yet, take the MDS-provided times from the lvb cache;
 * otherwise glimpse the OSTs for the authoritative size. */
2327 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2330         struct inode *inode = dentry->d_inode;
2334         rc = __ll_inode_revalidate_it(dentry, it, ibits);
2336         /* if object not yet allocated, don't validate size */
2337         if (rc == 0 && ll_i2info(dentry->d_inode)->lli_smd == NULL) {
2338                 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
2339                 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
2340                 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
2344         /* cl_glimpse_size will prefer locally cached writes if they extend
2348                 rc = cl_glimpse_size(inode);
/* getattr worker: revalidate UPDATE|LOOKUP ibits, then populate *stat
 * from the (now fresh) inode fields.  32-bit-API clients get a hashed
 * FID-based ino instead of the raw 64-bit one. */
2353 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2354                   struct lookup_intent *it, struct kstat *stat)
2356         struct inode *inode = de->d_inode;
2357         struct ll_sb_info *sbi = ll_i2sbi(inode);
2358         struct ll_inode_info *lli = ll_i2info(inode);
2361         res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE |
2362                                              MDS_INODELOCK_LOOKUP);
2363         ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
2368         stat->dev = inode->i_sb->s_dev;
2369         if (ll_need_32bit_api(sbi))
2370                 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
2372                 stat->ino = inode->i_ino;
2373         stat->mode = inode->i_mode;
2374         stat->nlink = inode->i_nlink;
2375         stat->uid = inode->i_uid;
2376         stat->gid = inode->i_gid;
2377         stat->rdev = kdev_t_to_nr(inode->i_rdev);
2378         stat->atime = inode->i_atime;
2379         stat->mtime = inode->i_mtime;
2380         stat->ctime = inode->i_ctime;
2381 #ifdef HAVE_INODE_BLKSIZE
2382         stat->blksize = inode->i_blksize;
2384         stat->blksize = 1 << inode->i_blkbits;
2387         stat->size = i_size_read(inode);
2388         stat->blocks = inode->i_blocks;
/* VFS ->getattr entry point: wraps ll_getattr_it() with a fresh
 * IT_GETATTR intent. */
2392 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2394         struct lookup_intent it = { .it_op = IT_GETATTR };
2396         return ll_getattr_it(mnt, de, &it, stat);
/* Kernel ->fiemap entry point: marshal fiemap_extent_info into the Lustre
 * ll_user_fiemap layout, run ll_do_fiemap(), and copy the results back.
 * Only compiled when the kernel provides linux/fiemap.h. */
2399 #ifdef HAVE_LINUX_FIEMAP_H
2400 int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2401               __u64 start, __u64 len)
2405         struct ll_user_fiemap *fiemap;
2406         unsigned int extent_count = fieinfo->fi_extents_max;
2408         num_bytes = sizeof(*fiemap) + (extent_count *
2409                                        sizeof(struct ll_fiemap_extent));
2410         OBD_ALLOC_LARGE(fiemap, num_bytes);
2415         fiemap->fm_flags = fieinfo->fi_flags;
2416         fiemap->fm_extent_count = fieinfo->fi_extents_max;
2417         fiemap->fm_start = start;
2418         fiemap->fm_length = len;
/* Copy only the first extent in: it seeds continuation of a prior call. */
2419         memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
2420                sizeof(struct ll_fiemap_extent));
2422         rc = ll_do_fiemap(inode, fiemap, num_bytes);
2424         fieinfo->fi_flags = fiemap->fm_flags;
2425         fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
2426         memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
2427                fiemap->fm_mapped_extents * sizeof(struct ll_fiemap_extent));
2429         OBD_FREE_LARGE(fiemap, num_bytes);
/* POSIX ACL permission check callback for generic_permission(): duplicate
 * the cached ACL under lli_lock and evaluate it.  Without
 * CONFIG_FS_POSIX_ACL the (elided) fallback presumably returns -EAGAIN. */
2436 int lustre_check_acl(struct inode *inode, int mask)
2438 #ifdef CONFIG_FS_POSIX_ACL
2439         struct ll_inode_info *lli = ll_i2info(inode);
2440         struct posix_acl *acl;
/* Duplicate under the spinlock so the ACL can't be swapped mid-check. */
2444         cfs_spin_lock(&lli->lli_lock);
2445         acl = posix_acl_dup(lli->lli_posix_acl);
2446         cfs_spin_unlock(&lli->lli_lock);
2451         rc = posix_acl_permission(inode, acl, mask);
2452         posix_acl_release(acl);
/* Permission check.  Two implementations are selected by kernel version:
 * on >= 2.6.10 generic_permission() does the mode/ACL work; on older
 * kernels the classic owner/group/other + capability check is open-coded.
 * Remote-client mounts always defer to lustre_check_remote_perm(). */
2460 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10))
2461 #ifndef HAVE_INODE_PERMISION_2ARGS
2462 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2464 int ll_inode_permission(struct inode *inode, int mask)
2470        /* as root inode are NOT getting validated in lookup operation,
2471         * need to do it before permission check. */
2473         if (inode == inode->i_sb->s_root->d_inode) {
2474                 struct lookup_intent it = { .it_op = IT_LOOKUP };
2476                 rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
2477                                               MDS_INODELOCK_LOOKUP);
2482         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
2483                inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
2485         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2486                 return lustre_check_remote_perm(inode, mask);
2488         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2489         rc = generic_permission(inode, mask, lustre_check_acl);
/* Pre-2.6.10 fallback: open-coded UNIX permission algorithm. */
2494 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2496         int mode = inode->i_mode;
2499         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2500                inode->i_ino, inode->i_generation, inode, mask);
2502         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2503                 return lustre_check_remote_perm(inode, mask);
2505         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2507         if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
2508             (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
2510         if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
2512         if (cfs_curproc_fsuid() == inode->i_uid) {
2515                 if (((mode >> 3) & mask & S_IRWXO) != mask)
2517                         rc = lustre_check_acl(inode, mask);
2521                                 goto check_capabilities;
2525                 if (cfs_curproc_is_in_groups(inode->i_gid))
2528         if ((mode & mask & S_IRWXO) == mask)
2532         if (!(mask & MAY_EXEC) ||
2533             (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
2534                 if (cfs_capable(CFS_CAP_DAC_OVERRIDE))
2537         if (cfs_capable(CFS_CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) ||
2538             (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))))
/* Kernel-compat selection of the vectored read/write file_operations
 * fields: old kernels use ->readv/->writev, newer ones ->aio_read/
 * ->aio_write.  The macros let the three operation tables below share one
 * definition. */
2545 #ifdef HAVE_FILE_READV
2546 #define READ_METHOD readv
2547 #define READ_FUNCTION ll_file_readv
2548 #define WRITE_METHOD writev
2549 #define WRITE_FUNCTION ll_file_writev
2551 #define READ_METHOD aio_read
2552 #define READ_FUNCTION ll_file_aio_read
2553 #define WRITE_METHOD aio_write
2554 #define WRITE_FUNCTION ll_file_aio_write
/* -o localflock - only provides locally consistent flock locks */
/* Default file_operations table: no ->flock/->lock entries, so flock()
 * falls back to the kernel's local-only implementation. */
2558 struct file_operations ll_file_operations = {
2559         .read           = ll_file_read,
2560         .READ_METHOD    = READ_FUNCTION,
2561         .write          = ll_file_write,
2562         .WRITE_METHOD   = WRITE_FUNCTION,
2563 #ifdef HAVE_UNLOCKED_IOCTL
2564         .unlocked_ioctl = ll_file_ioctl,
2566         .ioctl          = ll_file_ioctl,
2568         .open           = ll_file_open,
2569         .release        = ll_file_release,
2570         .mmap           = ll_file_mmap,
2571         .llseek         = ll_file_seek,
2572 #ifdef HAVE_KERNEL_SENDFILE
2573         .sendfile       = ll_file_sendfile,
2575 #ifdef HAVE_KERNEL_SPLICE_READ
2576         .splice_read    = ll_file_splice_read,
/* file_operations for -o flock mounts: identical to the default table but
 * routes flock()/fcntl() locking through ll_file_flock for cluster-wide
 * coherent locks. */
2582 struct file_operations ll_file_operations_flock = {
2583         .read           = ll_file_read,
2584         .READ_METHOD    = READ_FUNCTION,
2585         .write          = ll_file_write,
2586         .WRITE_METHOD   = WRITE_FUNCTION,
2587 #ifdef HAVE_UNLOCKED_IOCTL
2588         .unlocked_ioctl = ll_file_ioctl,
2590         .ioctl          = ll_file_ioctl,
2592         .open           = ll_file_open,
2593         .release        = ll_file_release,
2594         .mmap           = ll_file_mmap,
2595         .llseek         = ll_file_seek,
2596 #ifdef HAVE_KERNEL_SENDFILE
2597         .sendfile       = ll_file_sendfile,
2599 #ifdef HAVE_KERNEL_SPLICE_READ
2600         .splice_read    = ll_file_splice_read,
2604 #ifdef HAVE_F_OP_FLOCK
2605         .flock          = ll_file_flock,
2607         .lock           = ll_file_flock
/* These are for -o noflock - to return ENOSYS on flock calls */
2611 struct file_operations ll_file_operations_noflock = {
2612         .read           = ll_file_read,
2613         .READ_METHOD    = READ_FUNCTION,
2614         .write          = ll_file_write,
2615         .WRITE_METHOD   = WRITE_FUNCTION,
2616 #ifdef HAVE_UNLOCKED_IOCTL
2617         .unlocked_ioctl = ll_file_ioctl,
2619         .ioctl          = ll_file_ioctl,
2621         .open           = ll_file_open,
2622         .release        = ll_file_release,
2623         .mmap           = ll_file_mmap,
2624         .llseek         = ll_file_seek,
2625 #ifdef HAVE_KERNEL_SENDFILE
2626         .sendfile       = ll_file_sendfile,
2628 #ifdef HAVE_KERNEL_SPLICE_READ
2629         .splice_read    = ll_file_splice_read,
/* Lock entries point at the ENOSYS stub instead of ll_file_flock. */
2633 #ifdef HAVE_F_OP_FLOCK
2634         .flock          = ll_file_noflock,
2636         .lock           = ll_file_noflock
/*
 * inode_operations for regular llite files: attribute get/set, truncate,
 * permission checking, the xattr family, and FIEMAP extent mapping on
 * kernels that provide linux/fiemap.h.
 * NOTE(review): the closing #endif and "};" (orig. lines 2650-2651) were
 * dropped from this excerpt.
 */
2639 struct inode_operations ll_file_inode_operations = {
2640 .setattr = ll_setattr,
2641 .truncate = ll_truncate,
2642 .getattr = ll_getattr,
2643 .permission = ll_inode_permission,
2644 .setxattr = ll_setxattr,
2645 .getxattr = ll_getxattr,
2646 .listxattr = ll_listxattr,
2647 .removexattr = ll_removexattr,
2648 #ifdef HAVE_LINUX_FIEMAP_H
2649 .fiemap = ll_fiemap,
2653 /* dynamic ioctl number support routines */
/*
 * Registry of dynamically registered ioctl handlers: a list of
 * llioc_data records (ioc_head) protected by the ioc_sem rw-semaphore.
 * NOTE(review): this excerpt is missing the "} llioc = {" and closing
 * "};" lines of the static registry (orig. lines ~2657/2660) and the
 * "struct llioc_data {" header (orig. lines ~2661-2663).
 */
2654 static struct llioc_ctl_data {
2655 cfs_rw_semaphore_t ioc_sem;
2656 cfs_list_t ioc_head;
2658 __RWSEM_INITIALIZER(llioc.ioc_sem),
2659 CFS_LIST_HEAD_INIT(llioc.ioc_head)
/*
 * Per-registration record.  iocd_cmd is an old-style flexible array
 * ([0]) holding iocd_count command numbers; the record is allocated as
 * sizeof(*data) + count * sizeof(unsigned int) (see
 * ll_iocontrol_register), and iocd_size caches that total for OBD_FREE.
 */
2664 cfs_list_t iocd_list;
2665 unsigned int iocd_size;
2666 llioc_callback_t iocd_cb;
2667 unsigned int iocd_count;
2668 unsigned int iocd_cmd[0];
/*
 * Register a dynamic-ioctl callback handling `count` command numbers
 * copied from `cmd`.  The record is appended to the global llioc list
 * under the write lock; presumably the allocated record itself is
 * returned as the opaque "magic" handle for ll_iocontrol_unregister()
 * -- the return statement is in a line dropped from this excerpt.
 * NOTE(review): the early-return lines of the validation and
 * allocation-failure branches and the final return (orig. lines
 * 2679-2680, 2684-2685, 2695-2697) are missing here.
 */
2671 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
2674 struct llioc_data *in_data = NULL;
/* reject NULL callback/command list and out-of-range counts */
2677 if (cb == NULL || cmd == NULL ||
2678 count > LLIOC_MAX_CMD || count < 0)
/* one allocation: record header + trailing array of `count` cmd numbers */
2681 size = sizeof(*in_data) + count * sizeof(unsigned int);
2682 OBD_ALLOC(in_data, size);
2683 if (in_data == NULL)
2686 memset(in_data, 0, sizeof(*in_data));
2687 in_data->iocd_size = size;
2688 in_data->iocd_cb = cb;
2689 in_data->iocd_count = count;
2690 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
/* publish the new registration under the write lock */
2692 cfs_down_write(&llioc.ioc_sem);
2693 cfs_list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
2694 cfs_up_write(&llioc.ioc_sem);
/*
 * Remove and free the registration identified by the opaque handle
 * `magic` (returned by ll_iocontrol_register).  If no entry matches,
 * the lock is released and a warning is logged.
 * NOTE(review): the match test inside the loop (orig. line 2708) and
 * the early return after OBD_FREE (orig. lines ~2715-2717) were
 * dropped from this excerpt, along with a NULL-magic guard around
 * orig. lines 2702-2705.
 */
2699 void ll_iocontrol_unregister(void *magic)
2701 struct llioc_data *tmp;
2706 cfs_down_write(&llioc.ioc_sem);
2707 cfs_list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* cache the allocation size before the entry is unlinked and freed */
2709 unsigned int size = tmp->iocd_size;
2711 cfs_list_del(&tmp->iocd_list);
2712 cfs_up_write(&llioc.ioc_sem);
2714 OBD_FREE(tmp, size);
/* fell off the list without a match: release the lock and complain */
2718 cfs_up_write(&llioc.ioc_sem);
2720 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
2723 EXPORT_SYMBOL(ll_iocontrol_register);
2724 EXPORT_SYMBOL(ll_iocontrol_unregister);
2726 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
2727 unsigned int cmd, unsigned long arg, int *rcp)
2729 enum llioc_iter ret = LLIOC_CONT;
2730 struct llioc_data *data;
2731 int rc = -EINVAL, i;
2733 cfs_down_read(&llioc.ioc_sem);
2734 cfs_list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
2735 for (i = 0; i < data->iocd_count; i++) {
2736 if (cmd != data->iocd_cmd[i])
2739 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
2743 if (ret == LLIOC_STOP)
2746 cfs_up_read(&llioc.ioc_sem);