1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
6 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 only,
10 * as published by the Free Software Foundation.
12 * This program is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License version 2 for more details (a copy is included
16 * in the LICENSE file that accompanied this code).
18 * You should have received a copy of the GNU General Public License
19 * version 2 along with this program; If not, see
20 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
22 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23 * CA 95054 USA or visit www.sun.com if you need additional information or
29 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
30 * Use is subject to license terms.
32 * Copyright (c) 2011, 2012, Whamcloud, Inc.
35 * This file is part of Lustre, http://www.lustre.org/
36 * Lustre is a trademark of Sun Microsystems, Inc.
40 * Author: Peter Braam <braam@clusterfs.com>
41 * Author: Phil Schwan <phil@clusterfs.com>
42 * Author: Andreas Dilger <adilger@clusterfs.com>
45 #define DEBUG_SUBSYSTEM S_LLITE
46 #include <lustre_dlm.h>
47 #include <lustre_lite.h>
48 #include <linux/pagemap.h>
49 #include <linux/file.h>
50 #include "llite_internal.h"
51 #include <lustre/ll_fiemap.h>
53 #include "cl_object.h"
/* Allocate one struct ll_file_data from the ll_file_data_slab cache,
 * using an IO-safe GFP mask (CFS_ALLOC_IO).
 * NOTE(review): this listing is missing lines (braces/return) — edit only
 * against the full source. */
55 struct ll_file_data *ll_file_data_get(void)
57 struct ll_file_data *fd;
59 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, CFS_ALLOC_IO);
/* Return @fd to the ll_file_data_slab cache (counterpart of
 * ll_file_data_get()). */
63 static void ll_file_data_put(struct ll_file_data *fd)
66 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/* Pack the inode's current attributes (fid, mode, a/m/ctime, size, blocks,
 * flags), its IO epoch, an MDS capability reference and the open handle @fh
 * into @op_data for an MDS request (e.g. close / DONE_WRITING).
 * NOTE(review): excerpt is missing lines (braces/EXIT) — see full source. */
69 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
70 struct lustre_handle *fh)
72 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
73 op_data->op_attr.ia_mode = inode->i_mode;
74 op_data->op_attr.ia_atime = inode->i_atime;
75 op_data->op_attr.ia_mtime = inode->i_mtime;
76 op_data->op_attr.ia_ctime = inode->i_ctime;
77 op_data->op_attr.ia_size = i_size_read(inode);
78 op_data->op_attr_blocks = inode->i_blocks;
/* Translate VFS inode flags to on-wire ext-style flags. */
79 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
80 ll_inode_to_ext_flags(inode->i_flags);
81 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
83 op_data->op_handle = *fh;
/* Takes a capability reference; presumably released by the caller via
 * ll_finish_md_op_data() — TODO confirm against full source. */
84 op_data->op_capa1 = ll_mdscapa_get(inode);
88 * Closes the IO epoch and packs all the attributes into @op_data for
/* Prepare @op_data for an MDS close of open handle @och: select which
 * attributes are valid, close the IO epoch, and pack inode attributes. */
91 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
92 struct obd_client_handle *och)
96 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
97 ATTR_MTIME_SET | ATTR_CTIME_SET;
/* NOTE(review): the body of this condition is missing from this excerpt. */
99 if (!(och->och_flags & FMODE_WRITE))
/* Without Size-on-MDS support (or for non-regular files) the MDS takes
 * size/blocks from the client on close. */
102 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
103 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
105 ll_ioepoch_close(inode, op_data, &och, 0);
108 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
109 ll_prep_md_op_data(op_data, inode, NULL, NULL,
110 0, 0, LUSTRE_OPC_ANY, NULL);
/* Send an MDS close for open handle @och on @inode, perform the
 * Size-on-MDS update if the MDS requested it, destroy OST objects listed
 * in the close reply, and clear open replay data.
 * NOTE(review): many lines (braces, ENTRY/RETURN, some declarations and
 * gotos) are missing from this excerpt — edit only against full source. */
114 static int ll_close_inode_openhandle(struct obd_export *md_exp,
116 struct obd_client_handle *och)
118 struct obd_export *exp = ll_i2mdexp(inode);
119 struct md_op_data *op_data;
120 struct ptlrpc_request *req = NULL;
121 struct obd_device *obd = class_exp2obd(exp);
128 * XXX: in case of LMV, is this correct to access
131 CERROR("Invalid MDC connection handle "LPX64"\n",
132 ll_i2mdexp(inode)->exp_handle.h_cookie);
136 OBD_ALLOC_PTR(op_data);
138 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
140 ll_prepare_close(inode, op_data, och);
/* Remember whether this close also closed the IO epoch. */
141 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
142 rc = md_close(md_exp, op_data, och->och_mod, &req);
144 /* This close must have the epoch closed. */
145 LASSERT(epoch_close);
146 /* MDS has instructed us to obtain Size-on-MDS attribute from
147 * OSTs and send setattr to back to MDS. */
148 rc = ll_som_update(inode, op_data);
150 CERROR("inode %lu mdc Size-on-MDS update failed: "
151 "rc = %d\n", inode->i_ino, rc);
155 CERROR("inode %lu mdc close failed: rc = %d\n",
158 ll_finish_md_op_data(op_data);
/* Destroy OST objects the MDS said are no longer referenced. */
161 rc = ll_objects_destroy(req, inode);
163 CERROR("inode %lu ll_objects destroy: rc = %d\n",
/* A writer whose epoch is still open must go through DONE_WRITING. */
170 if (exp_connect_som(exp) && !epoch_close &&
171 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
172 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
174 md_clear_open_replay_data(md_exp, och);
175 /* Free @och if it is not waiting for DONE_WRITING. */
176 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
179 if (req) /* This is close request */
180 ptlrpc_req_finished(req);
/* Really close the per-mode (read/write/exec) MDS open handle on @inode,
 * but only when no other local users of that handle remain.
 * NOTE(review): braces/returns and the och detach lines are missing from
 * this excerpt — edit only against full source. */
184 int ll_md_real_close(struct inode *inode, int flags)
186 struct ll_inode_info *lli = ll_i2info(inode);
187 struct obd_client_handle **och_p;
188 struct obd_client_handle *och;
/* Pick the handle slot and use count matching the open mode. */
193 if (flags & FMODE_WRITE) {
194 och_p = &lli->lli_mds_write_och;
195 och_usecount = &lli->lli_open_fd_write_count;
196 } else if (flags & FMODE_EXEC) {
197 och_p = &lli->lli_mds_exec_och;
198 och_usecount = &lli->lli_open_fd_exec_count;
200 LASSERT(flags & FMODE_READ);
201 och_p = &lli->lli_mds_read_och;
202 och_usecount = &lli->lli_open_fd_read_count;
205 cfs_down(&lli->lli_och_sem);
206 if (*och_usecount) { /* There are still users of this handle, so
208 cfs_up(&lli->lli_och_sem);
213 cfs_up(&lli->lli_och_sem);
215 if (och) { /* There might be a race and somebody have freed this och
217 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/* Per-struct-file close: drop any group lock, decrement the per-mode open
 * count, and talk to the MDS (via ll_md_real_close()) only when no cached
 * OPEN lock lets us skip that. Frees the file's ll_file_data.
 * NOTE(review): excerpt is missing lines (braces, lock handle use, RETURN)
 * — edit only against full source. */
224 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
227 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
228 struct ll_inode_info *lli = ll_i2info(inode);
232 /* clear group lock, if present */
233 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
234 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
236 /* Let's see if we have good enough OPEN lock on the file and if
237 we can skip talking to MDS */
238 if (file->f_dentry->d_inode) { /* Can this ever be false? */
240 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
241 struct lustre_handle lockh;
242 struct inode *inode = file->f_dentry->d_inode;
243 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
245 cfs_down(&lli->lli_och_sem);
/* Drop this fd's reference on the matching per-mode open count. */
246 if (fd->fd_omode & FMODE_WRITE) {
248 LASSERT(lli->lli_open_fd_write_count);
249 lli->lli_open_fd_write_count--;
250 } else if (fd->fd_omode & FMODE_EXEC) {
252 LASSERT(lli->lli_open_fd_exec_count);
253 lli->lli_open_fd_exec_count--;
256 LASSERT(lli->lli_open_fd_read_count);
257 lli->lli_open_fd_read_count--;
259 cfs_up(&lli->lli_och_sem);
/* No matching cached OPEN lock -> must close the handle on the MDS. */
261 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
262 LDLM_IBITS, &policy, lockmode,
264 rc = ll_md_real_close(file->f_dentry->d_inode,
268 CERROR("Releasing a file %p with negative dentry %p. Name %s",
269 file, file->f_dentry, file->f_dentry->d_name.name);
272 LUSTRE_FPRIVATE(file) = NULL;
273 ll_file_data_put(fd);
274 ll_capa_close(inode);
279 int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm);
281 /* While this returns an error code, fput() the caller does not, so we need
282 * to make every effort to clean up all of our state here. Also, applications
283 * rarely check close errors and even if an error is returned they will not
284 * re-try the close call.
/* VFS ->release() for Lustre files: tear down remote-ACL state for the
 * root inode, stop statahead if this fd started it, clear pending async
 * write errors, and close the MDS handle via ll_md_close().
 * NOTE(review): excerpt is missing lines (braces/RETURN and some early
 * exits) — edit only against full source. */
286 int ll_file_release(struct inode *inode, struct file *file)
288 struct ll_file_data *fd;
289 struct ll_sb_info *sbi = ll_i2sbi(inode);
290 struct ll_inode_info *lli = ll_i2info(inode);
291 struct lov_stripe_md *lsm = lli->lli_smd;
295 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
296 inode->i_generation, inode);
298 #ifdef CONFIG_FS_POSIX_ACL
/* Remote-client ACL bookkeeping is only kept on the filesystem root. */
299 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
300 inode == inode->i_sb->s_root->d_inode) {
301 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
304 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
305 fd->fd_flags &= ~LL_FILE_RMTACL;
306 rct_del(&sbi->ll_rct, cfs_curproc_pid());
307 et_search_free(&sbi->ll_et, cfs_curproc_pid());
312 if (inode->i_sb->s_root != file->f_dentry)
313 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
314 fd = LUSTRE_FPRIVATE(file);
317 /* The last ref on @file, maybe not the the owner pid of statahead.
318 * Different processes can open the same dir, "ll_opendir_key" means:
319 * it is me that should stop the statahead thread. */
320 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
321 lli->lli_opendir_pid != 0)
322 ll_stop_statahead(inode, lli->lli_opendir_key);
/* The root dentry has no MDS open handle; just drop the fd. */
324 if (inode->i_sb->s_root == file->f_dentry) {
325 LUSTRE_FPRIVATE(file) = NULL;
326 ll_file_data_put(fd);
330 if (!S_ISDIR(inode->i_mode)) {
/* Pick up any asynchronous write error so close can report it. */
332 lov_test_and_clear_async_rc(lsm);
333 lli->lli_async_rc = 0;
336 rc = ll_md_close(sbi->ll_md_exp, inode, file);
338 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
339 libcfs_debug_dumplog();
/* Perform an intent-based open against the MDS for @file: build op_data
 * from the parent directory and name, enqueue the IT_OPEN intent, and on
 * success update the inode and record lock data. With lmm/lmmsize set,
 * this is used to push stripe info (no OPEN lock requested).
 * NOTE(review): excerpt is missing lines (braces, RETURNs, error labels)
 * — edit only against full source. */
344 static int ll_intent_file_open(struct file *file, void *lmm,
345 int lmmsize, struct lookup_intent *itp)
347 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
348 struct dentry *parent = file->f_dentry->d_parent;
349 const char *name = file->f_dentry->d_name.name;
350 const int len = file->f_dentry->d_name.len;
351 struct md_op_data *op_data;
352 struct ptlrpc_request *req;
353 __u32 opc = LUSTRE_OPC_ANY;
360 /* Usually we come here only for NFSD, and we want open lock.
361 But we can also get here with pre 2.6.15 patchless kernels, and in
362 that case that lock is also ok */
363 /* We can also get here if there was cached open handle in revalidate_it
364 * but it disappeared while we were getting from there to ll_file_open.
365 * But this means this file was closed and immediatelly opened which
366 * makes a good candidate for using OPEN lock */
367 /* If lmmsize & lmm are not 0, we are just setting stripe info
368 * parameters. No need for the open lock */
369 if (lmm == NULL && lmmsize == 0) {
370 itp->it_flags |= MDS_OPEN_LOCK;
371 if (itp->it_flags & FMODE_WRITE)
372 opc = LUSTRE_OPC_CREATE;
375 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
376 file->f_dentry->d_inode, name, len,
379 RETURN(PTR_ERR(op_data));
381 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
382 0 /*unused */, &req, ll_md_blocking_ast, 0);
383 ll_finish_md_op_data(op_data);
385 /* reason for keep own exit path - don`t flood log
386 * with messages with -ESTALE errors.
/* An open that got a server-side handle but then failed locally must
 * release that handle. */
388 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
389 it_open_error(DISP_OPEN_OPEN, itp))
391 ll_release_openhandle(file->f_dentry, itp);
395 if (it_disposition(itp, DISP_LOOKUP_NEG))
396 GOTO(out, rc = -ENOENT);
398 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
399 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
400 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* Refresh the inode from the intent reply and remember the lock. */
404 rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL);
405 if (!rc && itp->d.lustre.it_lock_mode)
406 ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
410 ptlrpc_req_finished(itp->d.lustre.it_data);
411 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
412 ll_intent_drop_lock(itp);
418 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
419 * not believe attributes if a few ioepoch holders exist. Attributes for
420 * previous ioepoch if new one is opened are also skipped by MDS.
/* Record a newly-obtained IO epoch @ioepoch on the inode (no-op when zero
 * or unchanged). See the comment block above for why no lock is needed. */
422 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
424 if (ioepoch && lli->lli_ioepoch != ioepoch) {
425 lli->lli_ioepoch = ioepoch;
426 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
427 ioepoch, PFID(&lli->lli_fid));
/* Fill @och from the MDS open reply held in intent @it: copy the server
 * file handle, fid and open flags, record the IO epoch, and register the
 * open for replay. Returns md_set_open_replay_data() result. */
431 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
432 struct lookup_intent *it, struct obd_client_handle *och)
434 struct ptlrpc_request *req = it->d.lustre.it_data;
435 struct mdt_body *body;
439 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
440 LASSERT(body != NULL); /* reply already checked out */
442 memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
443 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
444 och->och_fid = lli->lli_fid;
445 och->och_flags = it->it_flags;
446 ll_ioepoch_open(lli, body->ioepoch);
448 return md_set_open_replay_data(md_exp, och, req);
/* Finish a local open: optionally fill @och from the intent reply, attach
 * @fd as the file's private data, init readahead state and record the
 * open mode.
 * NOTE(review): excerpt is missing lines (braces/RETURN and the och NULL
 * check) — edit only against full source. */
451 int ll_local_open(struct file *file, struct lookup_intent *it,
452 struct ll_file_data *fd, struct obd_client_handle *och)
454 struct inode *inode = file->f_dentry->d_inode;
455 struct ll_inode_info *lli = ll_i2info(inode);
458 LASSERT(!LUSTRE_FPRIVATE(file));
463 struct ptlrpc_request *req = it->d.lustre.it_data;
464 struct mdt_body *body;
467 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
471 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
472 if ((it->it_flags & FMODE_WRITE) &&
473 (body->valid & OBD_MD_FLSIZE))
474 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
475 lli->lli_ioepoch, PFID(&lli->lli_fid));
478 LUSTRE_FPRIVATE(file) = fd;
479 ll_readahead_init(inode, &fd->fd_ras);
480 fd->fd_omode = it->it_flags;
484 /* Open a file, and (for the very first open) create objects on the OSTs at
485 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
486 * creation or open until ll_lov_setstripe() ioctl is called. We grab
487 * lli_open_sem to ensure no other process will create objects, send the
488 * stripe MD to the MDS, or try to destroy the objects if that fails.
490 * If we already have the stripe MD locally then we don't request it in
491 * md_open(), by passing a lmm_size = 0.
493 * It is up to the application to ensure no other processes open this file
494 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
495 * used. We might be able to avoid races of that sort by getting lli_open_sem
496 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
497 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/* VFS ->open() for Lustre files. Reuses an existing per-mode MDS open
 * handle when one is cached on the inode, otherwise performs an intent
 * open (ll_intent_file_open) and records the new handle. Also arms
 * statahead for directories and handles O_LOV_DELAY_CREATE.
 * See the long comment block above (lines 484-497 of the original) for
 * the object-creation/striping rationale.
 * NOTE(review): many lines are missing from this excerpt (braces, GOTO
 * labels, lsm handling near the end) — edit only against full source. */
499 int ll_file_open(struct inode *inode, struct file *file)
501 struct ll_inode_info *lli = ll_i2info(inode);
502 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
503 .it_flags = file->f_flags };
504 struct lov_stripe_md *lsm;
505 struct obd_client_handle **och_p = NULL;
506 __u64 *och_usecount = NULL;
507 struct ll_file_data *fd;
508 int rc = 0, opendir_set = 0;
511 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
512 inode->i_generation, inode, file->f_flags);
514 it = file->private_data; /* XXX: compat macro */
515 file->private_data = NULL; /* prevent ll_local_open assertion */
517 fd = ll_file_data_get();
519 GOTO(out_och_free, rc = -ENOMEM);
/* First opener of a directory becomes the statahead owner. */
522 if (S_ISDIR(inode->i_mode)) {
523 cfs_spin_lock(&lli->lli_sa_lock);
524 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
525 lli->lli_opendir_pid == 0) {
526 lli->lli_opendir_key = fd;
527 lli->lli_opendir_pid = cfs_curproc_pid();
530 cfs_spin_unlock(&lli->lli_sa_lock);
/* Root has no MDS open handle; attach fd and we are done. */
533 if (inode->i_sb->s_root == file->f_dentry) {
534 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent from lookup: build our own IT_OPEN in oit. */
538 if (!it || !it->d.lustre.it_disposition) {
539 /* Convert f_flags into access mode. We cannot use file->f_mode,
540 * because everything but O_ACCMODE mask was stripped from
542 if ((oit.it_flags + 1) & O_ACCMODE)
544 if (file->f_flags & O_TRUNC)
545 oit.it_flags |= FMODE_WRITE;
547 /* kernel only call f_op->open in dentry_open. filp_open calls
548 * dentry_open after call to open_namei that checks permissions.
549 * Only nfsd_open call dentry_open directly without checking
550 * permissions and because of that this code below is safe. */
551 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
552 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
554 /* We do not want O_EXCL here, presumably we opened the file
555 * already? XXX - NFS implications? */
556 oit.it_flags &= ~O_EXCL;
558 /* bug20584, if "it_flags" contains O_CREAT, the file will be
559 * created if necessary, then "IT_CREAT" should be set to keep
560 * consistent with it */
561 if (oit.it_flags & O_CREAT)
562 oit.it_op |= IT_CREAT;
568 /* Let's see if we have file open on MDS already. */
569 if (it->it_flags & FMODE_WRITE) {
570 och_p = &lli->lli_mds_write_och;
571 och_usecount = &lli->lli_open_fd_write_count;
572 } else if (it->it_flags & FMODE_EXEC) {
573 och_p = &lli->lli_mds_exec_och;
574 och_usecount = &lli->lli_open_fd_exec_count;
576 och_p = &lli->lli_mds_read_och;
577 och_usecount = &lli->lli_open_fd_read_count;
580 cfs_down(&lli->lli_och_sem);
581 if (*och_p) { /* Open handle is present */
582 if (it_disposition(it, DISP_OPEN_OPEN)) {
583 /* Well, there's extra open request that we do not need,
584 let's close it somehow. This will decref request. */
585 rc = it_open_error(DISP_OPEN_OPEN, it);
587 cfs_up(&lli->lli_och_sem);
588 GOTO(out_openerr, rc);
591 ll_release_openhandle(file->f_dentry, it);
592 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats,
/* Reuse the cached handle: no och needed for this fd. */
597 rc = ll_local_open(file, it, fd, NULL);
600 cfs_up(&lli->lli_och_sem);
601 GOTO(out_openerr, rc);
604 LASSERT(*och_usecount == 0);
605 if (!it->d.lustre.it_disposition) {
606 /* We cannot just request lock handle now, new ELC code
607 means that one of other OPEN locks for this file
608 could be cancelled, and since blocking ast handler
609 would attempt to grab och_sem as well, that would
610 result in a deadlock */
611 cfs_up(&lli->lli_och_sem);
612 it->it_create_mode |= M_CHECK_STALE;
613 rc = ll_intent_file_open(file, NULL, 0, it);
614 it->it_create_mode &= ~M_CHECK_STALE;
616 GOTO(out_openerr, rc);
620 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
622 GOTO(out_och_free, rc = -ENOMEM);
626 /* md_intent_lock() didn't get a request ref if there was an
627 * open error, so don't do cleanup on the request here
629 /* XXX (green): Should not we bail out on any error here, not
630 * just open error? */
631 rc = it_open_error(DISP_OPEN_OPEN, it);
633 GOTO(out_och_free, rc);
635 LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
637 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
638 rc = ll_local_open(file, it, fd, *och_p);
640 GOTO(out_och_free, rc);
642 cfs_up(&lli->lli_och_sem);
645 /* Must do this outside lli_och_sem lock to prevent deadlock where
646 different kind of OPEN lock for this same inode gets cancelled
647 by ldlm_cancel_lru */
648 if (!S_ISREG(inode->i_mode))
649 GOTO(out_och_free, rc);
/* Object creation may be deferred until first write or setstripe. */
655 if (file->f_flags & O_LOV_DELAY_CREATE ||
656 !(file->f_mode & FMODE_WRITE)) {
657 CDEBUG(D_INODE, "object creation was delayed\n");
658 GOTO(out_och_free, rc);
661 file->f_flags &= ~O_LOV_DELAY_CREATE;
662 GOTO(out_och_free, rc);
/* Cleanup path: drop the intent's request reference and, on error,
 * free an unused och slot and undo statahead ownership. */
665 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
666 ptlrpc_req_finished(it->d.lustre.it_data);
667 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
671 if (och_p && *och_p) {
672 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
673 *och_p = NULL; /* OBD_FREE writes some magic there */
676 cfs_up(&lli->lli_och_sem);
679 if (opendir_set != 0)
680 ll_stop_statahead(inode, lli->lli_opendir_key);
682 ll_file_data_put(fd);
688 /* Fills the obdo with the attributes for the lsm */
/* Issue an async OST getattr for @lsm and wait for it; on success the
 * returned obdo's o_valid is masked down to the fields the OSTs own
 * (size/blocks/times/...). @sync requests a server-side lock (SRVLOCK).
 * NOTE(review): excerpt is missing lines (braces, oa setup, RETURN) —
 * edit only against full source. */
689 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
690 struct obd_capa *capa, struct obdo *obdo,
691 __u64 ioepoch, int sync)
693 struct ptlrpc_request_set *set;
694 struct obd_info oinfo = { { { 0 } } };
699 LASSERT(lsm != NULL);
703 oinfo.oi_oa->o_id = lsm->lsm_object_id;
704 oinfo.oi_oa->o_seq = lsm->lsm_object_seq;
705 oinfo.oi_oa->o_mode = S_IFREG;
706 oinfo.oi_oa->o_ioepoch = ioepoch;
707 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
708 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
709 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
710 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
711 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
712 OBD_MD_FLDATAVERSION;
713 oinfo.oi_capa = capa;
/* sync mode: ask the OST to take the lock server-side. */
715 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
716 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
719 set = ptlrpc_prep_set();
721 CERROR("can't allocate ptlrpc set\n");
724 rc = obd_getattr_async(exp, &oinfo, set);
726 rc = ptlrpc_set_wait(set);
727 ptlrpc_set_destroy(set);
/* Keep only OST-authoritative attributes in the result. */
730 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
731 OBD_MD_FLATIME | OBD_MD_FLMTIME |
732 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
733 OBD_MD_FLDATAVERSION);
738 * Performs the getattr on the inode and updates its fields.
739 * If @sync != 0, perform the getattr under the server-side lock.
/* Wrapper over ll_lsm_getattr(): getattr on this inode's stripe object(s)
 * and refresh the VFS inode from the returned obdo. */
741 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
742 __u64 ioepoch, int sync)
744 struct ll_inode_info *lli = ll_i2info(inode);
/* Capability reference — presumably released after the call; missing from
 * this excerpt, confirm against full source. */
745 struct obd_capa *capa = ll_mdscapa_get(inode);
749 rc = ll_lsm_getattr(lli->lli_smd, ll_i2dtexp(inode),
750 capa, obdo, ioepoch, sync);
753 obdo_refresh_inode(inode, obdo, obdo->o_valid);
755 "objid "LPX64" size %llu, blocks %llu, blksize %lu\n",
756 lli->lli_smd->lsm_object_id, i_size_read(inode),
757 (unsigned long long)inode->i_blocks,
758 (unsigned long)ll_inode_blksize(inode));
/* Merge MDS-obtained timestamps (cached in lli_lvb) with OST lock values
 * via obd_merge_lvb(), then update the inode's size, blocks and times
 * under the inode size lock. */
763 int ll_merge_lvb(struct inode *inode)
765 struct ll_inode_info *lli = ll_i2info(inode);
766 struct ll_sb_info *sbi = ll_i2sbi(inode);
772 ll_inode_size_lock(inode, 1);
773 inode_init_lvb(inode, &lvb);
775 /* merge timestamps the most resently obtained from mds with
776 timestamps obtained from osts */
777 lvb.lvb_atime = lli->lli_lvb.lvb_atime;
778 lvb.lvb_mtime = lli->lli_lvb.lvb_mtime;
779 lvb.lvb_ctime = lli->lli_lvb.lvb_ctime;
780 rc = obd_merge_lvb(sbi->ll_dt_exp, lli->lli_smd, &lvb, 0);
781 cl_isize_write_nolock(inode, lvb.lvb_size);
783 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
784 PFID(&lli->lli_fid), lvb.lvb_size);
785 inode->i_blocks = lvb.lvb_blocks;
787 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
788 LTIME_S(inode->i_atime) = lvb.lvb_atime;
789 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
790 ll_inode_size_unlock(inode, 1);
/* Fetch current OST attributes for @lsm (no capa, no server lock) and
 * copy size/blocks/times into the user-visible stat @st. */
795 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
798 struct obdo obdo = { 0 };
801 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
803 st->st_size = obdo.o_size;
804 st->st_blocks = obdo.o_blocks;
805 st->st_mtime = obdo.o_mtime;
806 st->st_atime = obdo.o_atime;
807 st->st_ctime = obdo.o_ctime;
/* Initialize a cl_io for a read (@write == 0) or write (@write != 0) on
 * @file: set nonblock/append flags, target object, and lock policy
 * (never for nolock files, mandatory for O_APPEND, maybe otherwise). */
812 void ll_io_init(struct cl_io *io, const struct file *file, int write)
814 struct inode *inode = file->f_dentry->d_inode;
816 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
818 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
819 io->ci_obj = ll_i2info(inode)->lli_clob;
820 io->ci_lockreq = CILR_MAYBE;
821 if (ll_file_nolock(file)) {
822 io->ci_lockreq = CILR_NEVER;
823 io->ci_no_srvlock = 1;
824 } else if (file->f_flags & O_APPEND) {
825 io->ci_lockreq = CILR_MANDATORY;
/* Common engine for all read/write entry points (readv/aio/sendfile/
 * splice): set up the cl_io from @args, take lli_write_sem for normal
 * non-grouplocked writes (lli_trunc_sem for reads), run cl_io_loop(),
 * update *ppos and per-sb read/write statistics.
 * NOTE(review): excerpt is missing lines (braces, case labels, cleanup)
 * — edit only against full source. */
829 static ssize_t ll_file_io_generic(const struct lu_env *env,
830 struct vvp_io_args *args, struct file *file,
831 enum cl_io_type iot, loff_t *ppos, size_t count)
833 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
838 io = ccc_env_thread_io(env);
839 ll_io_init(io, file, iot == CIT_WRITE);
841 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
842 struct vvp_io *vio = vvp_env_io(env);
843 struct ccc_io *cio = ccc_env_io(env);
844 int write_sem_locked = 0;
846 cio->cui_fd = LUSTRE_FPRIVATE(file);
847 vio->cui_io_subtype = args->via_io_subtype;
/* Copy the per-subtype arguments into the IO context. */
849 switch (vio->cui_io_subtype) {
851 cio->cui_iov = args->u.normal.via_iov;
852 cio->cui_nrsegs = args->u.normal.via_nrsegs;
853 cio->cui_tot_nrsegs = cio->cui_nrsegs;
854 #ifndef HAVE_FILE_WRITEV
855 cio->cui_iocb = args->u.normal.via_iocb;
/* Serialize normal writes unless a group lock is held. */
857 if ((iot == CIT_WRITE) &&
858 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
859 if(cfs_down_interruptible(&lli->lli_write_sem))
860 GOTO(out, result = -ERESTARTSYS);
861 write_sem_locked = 1;
862 } else if (iot == CIT_READ) {
863 cfs_down_read(&lli->lli_trunc_sem);
867 vio->u.sendfile.cui_actor = args->u.sendfile.via_actor;
868 vio->u.sendfile.cui_target = args->u.sendfile.via_target;
871 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
872 vio->u.splice.cui_flags = args->u.splice.via_flags;
875 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
878 result = cl_io_loop(env, io);
879 if (write_sem_locked)
880 cfs_up(&lli->lli_write_sem);
881 else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
882 cfs_up_read(&lli->lli_trunc_sem);
884 /* cl_io_rw_init() handled IO */
885 result = io->ci_result;
/* Advance the file position by however much IO actually completed. */
888 if (io->ci_nob > 0) {
890 *ppos = io->u.ci_wr.wr.crw_pos;
896 if (iot == CIT_READ) {
898 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
899 LPROC_LL_READ_BYTES, result);
900 } else if (iot == CIT_WRITE) {
902 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
903 LPROC_LL_WRITE_BYTES, result);
904 lli->lli_write_rc = 0;
906 lli->lli_write_rc = result;
915 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/* Validate an iovec array and compute the total byte count, mirroring the
 * kernel's __generic_file_aio_write_nolock checks (see comment above):
 * reject negative/overflowing lengths and trim at the first inaccessible
 * segment. Outputs via *nr_segs and *count.
 * NOTE(review): excerpt is missing lines (declarations, accumulation,
 * returns) — edit only against full source. */
917 static int ll_file_get_iov_count(const struct iovec *iov,
918 unsigned long *nr_segs, size_t *count)
923 for (seg = 0; seg < *nr_segs; seg++) {
924 const struct iovec *iv = &iov[seg];
927 * If any segment has a negative length, or the cumulative
928 * length ever wraps negative then return -EINVAL.
931 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
933 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
938 cnt -= iv->iov_len; /* This segment is no good */
945 #ifdef HAVE_FILE_READV
/* ->readv() entry (pre-AIO kernels, HAVE_FILE_READV): validate the iovec,
 * grab a cl env and dispatch a CIT_READ through ll_file_io_generic(). */
946 static ssize_t ll_file_readv(struct file *file, const struct iovec *iov,
947 unsigned long nr_segs, loff_t *ppos)
950 struct vvp_io_args *args;
956 result = ll_file_get_iov_count(iov, &nr_segs, &count);
960 env = cl_env_get(&refcheck);
962 RETURN(PTR_ERR(env));
964 args = vvp_env_args(env, IO_NORMAL);
965 args->u.normal.via_iov = (struct iovec *)iov;
966 args->u.normal.via_nrsegs = nr_segs;
968 result = ll_file_io_generic(env, args, file, CIT_READ, ppos, count);
969 cl_env_put(env, &refcheck);
/* ->read() entry (readv-based kernels): wrap the user buffer in a single
 * per-env iovec and forward to ll_file_readv(). */
973 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
977 struct iovec *local_iov;
982 env = cl_env_get(&refcheck);
984 RETURN(PTR_ERR(env));
986 local_iov = &vvp_env_info(env)->vti_local_iov;
987 local_iov->iov_base = (void __user *)buf;
988 local_iov->iov_len = count;
989 result = ll_file_readv(file, local_iov, 1, ppos);
990 cl_env_put(env, &refcheck);
/* ->aio_read() entry: validate the iovec, fill vvp args (including the
 * kiocb) and run a CIT_READ via ll_file_io_generic() at iocb->ki_pos. */
995 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
996 unsigned long nr_segs, loff_t pos)
999 struct vvp_io_args *args;
1005 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1009 env = cl_env_get(&refcheck);
1011 RETURN(PTR_ERR(env));
1013 args = vvp_env_args(env, IO_NORMAL);
1014 args->u.normal.via_iov = (struct iovec *)iov;
1015 args->u.normal.via_nrsegs = nr_segs;
1016 args->u.normal.via_iocb = iocb;
1018 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1019 &iocb->ki_pos, count);
1020 cl_env_put(env, &refcheck);
/* ->read() entry (AIO kernels): build a synchronous kiocb plus a one-
 * segment iovec from the user buffer, call ll_file_aio_read(), and copy
 * the resulting position back to *ppos. */
1024 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1028 struct iovec *local_iov;
1029 struct kiocb *kiocb;
1034 env = cl_env_get(&refcheck);
1036 RETURN(PTR_ERR(env));
1038 local_iov = &vvp_env_info(env)->vti_local_iov;
1039 kiocb = &vvp_env_info(env)->vti_kiocb;
1040 local_iov->iov_base = (void __user *)buf;
1041 local_iov->iov_len = count;
1042 init_sync_kiocb(kiocb, file);
1043 kiocb->ki_pos = *ppos;
1044 kiocb->ki_left = count;
1046 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1047 *ppos = kiocb->ki_pos;
1049 cl_env_put(env, &refcheck);
1055 * Write to a file (through the page cache).
1057 #ifdef HAVE_FILE_WRITEV
/* ->writev() entry (HAVE_FILE_WRITEV kernels): validate the iovec, grab a
 * cl env and dispatch a CIT_WRITE through ll_file_io_generic(). */
1058 static ssize_t ll_file_writev(struct file *file, const struct iovec *iov,
1059 unsigned long nr_segs, loff_t *ppos)
1062 struct vvp_io_args *args;
1068 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1072 env = cl_env_get(&refcheck);
1074 RETURN(PTR_ERR(env));
1076 args = vvp_env_args(env, IO_NORMAL);
1077 args->u.normal.via_iov = (struct iovec *)iov;
1078 args->u.normal.via_nrsegs = nr_segs;
1080 result = ll_file_io_generic(env, args, file, CIT_WRITE, ppos, count);
1081 cl_env_put(env, &refcheck);
/* ->write() entry (writev-based kernels): wrap the user buffer in a
 * single per-env iovec and forward to ll_file_writev(). */
1085 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1089 struct iovec *local_iov;
1094 env = cl_env_get(&refcheck);
1096 RETURN(PTR_ERR(env));
1098 local_iov = &vvp_env_info(env)->vti_local_iov;
1099 local_iov->iov_base = (void __user *)buf;
1100 local_iov->iov_len = count;
1102 result = ll_file_writev(file, local_iov, 1, ppos);
1103 cl_env_put(env, &refcheck);
1107 #else /* AIO stuff */
/* ->aio_write() entry: validate the iovec, fill vvp args (including the
 * kiocb) and run a CIT_WRITE via ll_file_io_generic() at iocb->ki_pos. */
1108 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1109 unsigned long nr_segs, loff_t pos)
1112 struct vvp_io_args *args;
1118 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1122 env = cl_env_get(&refcheck);
1124 RETURN(PTR_ERR(env));
1126 args = vvp_env_args(env, IO_NORMAL);
1127 args->u.normal.via_iov = (struct iovec *)iov;
1128 args->u.normal.via_nrsegs = nr_segs;
1129 args->u.normal.via_iocb = iocb;
1131 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1132 &iocb->ki_pos, count);
1133 cl_env_put(env, &refcheck);
/* ->write() entry (AIO kernels): build a synchronous kiocb plus a one-
 * segment iovec from the user buffer, call ll_file_aio_write(), and copy
 * the resulting position back to *ppos. */
1137 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1141 struct iovec *local_iov;
1142 struct kiocb *kiocb;
1147 env = cl_env_get(&refcheck);
1149 RETURN(PTR_ERR(env));
1151 local_iov = &vvp_env_info(env)->vti_local_iov;
1152 kiocb = &vvp_env_info(env)->vti_kiocb;
1153 local_iov->iov_base = (void __user *)buf;
1154 local_iov->iov_len = count;
1155 init_sync_kiocb(kiocb, file);
1156 kiocb->ki_pos = *ppos;
1157 kiocb->ki_left = count;
1159 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1160 *ppos = kiocb->ki_pos;
1162 cl_env_put(env, &refcheck);
1168 #ifdef HAVE_KERNEL_SENDFILE
1170 * Send file content (through pagecache) somewhere with helper
/* ->sendfile() entry (HAVE_KERNEL_SENDFILE): run a CIT_READ with the
 * IO_SENDFILE subtype, passing the read actor and its target through the
 * vvp args. */
1172 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1173 read_actor_t actor, void *target)
1176 struct vvp_io_args *args;
1181 env = cl_env_get(&refcheck);
1183 RETURN(PTR_ERR(env));
1185 args = vvp_env_args(env, IO_SENDFILE);
1186 args->u.sendfile.via_target = target;
1187 args->u.sendfile.via_actor = actor;
1189 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1190 cl_env_put(env, &refcheck);
1195 #ifdef HAVE_KERNEL_SPLICE_READ
1197 * Send file content (through pagecache) somewhere with helper
/* ->splice_read() entry (HAVE_KERNEL_SPLICE_READ): run a CIT_READ with
 * the IO_SPLICE subtype, passing the target pipe and splice flags. */
1199 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1200 struct pipe_inode_info *pipe, size_t count,
1204 struct vvp_io_args *args;
1209 env = cl_env_get(&refcheck);
1211 RETURN(PTR_ERR(env));
1213 args = vvp_env_args(env, IO_SPLICE);
1214 args->u.splice.via_pipe = pipe;
1215 args->u.splice.via_flags = flags;
1217 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1218 cl_env_put(env, &refcheck);
/* Recreate the OST object(s) for @inode with object id @id on OST
 * @ost_idx: clone the inode's stripe MD, mark the obdo with
 * OBD_FL_RECREATE_OBJS and call obd_create() under the inode size lock.
 * NOTE(review): excerpt is missing lines (oa allocation, id/seq setup,
 * braces/RETURN) — edit only against full source. */
1223 static int ll_lov_recreate(struct inode *inode, obd_id id, obd_seq seq,
1226 struct obd_export *exp = ll_i2dtexp(inode);
1227 struct obd_trans_info oti = { 0 };
1228 struct obdo *oa = NULL;
1231 struct lov_stripe_md *lsm, *lsm2;
1238 ll_inode_size_lock(inode, 0);
1239 lsm = ll_i2info(inode)->lli_smd;
1241 GOTO(out, rc = -ENOENT);
1242 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1243 (lsm->lsm_stripe_count));
1245 OBD_ALLOC_LARGE(lsm2, lsm_size);
1247 GOTO(out, rc = -ENOMEM);
/* o_nlink carries the target OST index for the recreate RPC. */
1251 oa->o_nlink = ost_idx;
1252 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1253 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1254 obdo_from_inode(oa, inode, &ll_i2info(inode)->lli_fid, OBD_MD_FLTYPE |
1255 OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1256 memcpy(lsm2, lsm, lsm_size);
1257 rc = obd_create(exp, oa, &lsm2, &oti);
1259 OBD_FREE_LARGE(lsm2, lsm_size);
1262 ll_inode_size_unlock(inode, 0);
/* LL_IOC_RECREATE_OBJ ioctl handler: root-only; copy the ll_recreate_obj
 * request from userspace and recreate the object via ll_lov_recreate(). */
1267 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1269 struct ll_recreate_obj ucreat;
1272 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1275 if (cfs_copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1276 sizeof(struct ll_recreate_obj)))
1279 RETURN(ll_lov_recreate(inode, ucreat.lrc_id, 0,
1280 ucreat.lrc_ost_idx));
/* LL_IOC_RECREATE_FID ioctl handler: root-only; copy a lu_fid from
 * userspace, unpack object id and OST index from it, and recreate via
 * ll_lov_recreate(). */
1283 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1290 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1293 if (cfs_copy_from_user(&fid, (struct lu_fid *)arg,
1294 sizeof(struct lu_fid)))
/* id = oid | low 16 bits of seq << 32; ost_idx = bits 16..31 of seq. */
1297 id = fid_oid(&fid) | ((fid_seq(&fid) & 0xffff) << 32);
1298 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1299 RETURN(ll_lov_recreate(inode, id, 0, ost_idx));
/* Set striping info for @inode by sending @lum through an intent open
 * (ll_intent_file_open with lmm/lmm_size set). Fails early if a stripe
 * MD already exists.
 * NOTE(review): excerpt is missing lines (braces, lock release on the
 * success path, RETURN) — edit only against full source. */
1302 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1303 int flags, struct lov_user_md *lum, int lum_size)
1305 struct lov_stripe_md *lsm;
1306 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1310 ll_inode_size_lock(inode, 0);
1311 lsm = ll_i2info(inode)->lli_smd;
/* Striping is write-once: refuse if it already exists. */
1313 ll_inode_size_unlock(inode, 0);
1314 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1319 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1322 rc = oit.d.lustre.it_status;
1324 GOTO(out_req_free, rc);
1326 ll_release_openhandle(file->f_dentry, &oit);
1329 ll_inode_size_unlock(inode, 0);
1330 ll_intent_release(&oit);
1333 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/* Fetch the LOV EA for @filename from the MDS via md_getattr_name().
 * On success *lmmp points INTO @request's reply buffer, so the caller
 * must keep *request alive (and finish it) while using the EA; the EA
 * is byte-swapped from the MDS's little-endian form to host order for
 * V1/V3 magics before being handed back. */
1337 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1338 struct lov_mds_md **lmmp, int *lmm_size,
1339 struct ptlrpc_request **request)
1341 struct ll_sb_info *sbi = ll_i2sbi(inode);
1342 struct mdt_body *body;
1343 struct lov_mds_md *lmm = NULL;
1344 struct ptlrpc_request *req = NULL;
1345 struct md_op_data *op_data;
1348 rc = ll_get_max_mdsize(sbi, &lmmsize);
1352 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1353 strlen(filename), lmmsize,
1354 LUSTRE_OPC_ANY, NULL);
1355 if (IS_ERR(op_data))
1356 RETURN(PTR_ERR(op_data));
1358 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1359 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1360 ll_finish_md_op_data(op_data);
1362 CDEBUG(D_INFO, "md_getattr_name failed "
1363 "on %s: rc %d\n", filename, rc);
1367 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1368 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1370 lmmsize = body->eadatasize;
/* No EA bits valid (or zero size, elided) means the file is unstriped. */
1372 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1374 GOTO(out, rc = -ENODATA);
1377 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1378 LASSERT(lmm != NULL);
1380 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1381 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1382 GOTO(out, rc = -EPROTO);
1386 * This is coming from the MDS, so is probably in
1387 * little endian. We convert it to host endian before
1388 * passing it to userspace.
/* True only on big-endian hosts: then the wire (LE) form needs swabbing. */
1390 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1391 /* if function called for directory - we should
1392 * avoid swab not existent lsm objects */
1393 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1394 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1395 if (S_ISREG(body->mode))
1396 lustre_swab_lov_user_md_objects(
1397 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1398 ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
1399 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1400 lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1401 if (S_ISREG(body->mode))
1402 lustre_swab_lov_user_md_objects(
1403 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1404 ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
1410 *lmm_size = lmmsize;
/* LL_IOC_LOV_SETEA handler: copy a lov_user_md (with one ost_data slot)
 * from userspace and set it as the stripe EA with MDS_OPEN_HAS_OBJS.
 * Requires CAP_SYS_ADMIN; the temporary buffer is freed on all paths. */
1415 static int ll_lov_setea(struct inode *inode, struct file *file,
1418 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1419 struct lov_user_md *lump;
1420 int lum_size = sizeof(struct lov_user_md) +
1421 sizeof(struct lov_user_ost_data);
1425 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1428 OBD_ALLOC_LARGE(lump, lum_size);
1432 if (cfs_copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1433 OBD_FREE_LARGE(lump, lum_size);
1437 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1439 OBD_FREE_LARGE(lump, lum_size);
/* LL_IOC_LOV_SETSTRIPE handler: read a lov_user_md from userspace —
 * first as the smaller V1 layout, upgrading to the full V3 copy if the
 * magic says so — then set the stripe EA. On success the user's
 * lmm_stripe_count is zeroed and the resulting stripe info is echoed
 * back through the LL_IOC_LOV_GETSTRIPE obd_iocontrol path. */
1443 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1446 struct lov_user_md_v3 lumv3;
1447 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1448 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1449 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1452 int flags = FMODE_WRITE;
1455 /* first try with v1 which is smaller than v3 */
1456 lum_size = sizeof(struct lov_user_md_v1);
1457 if (cfs_copy_from_user(lumv1, lumv1p, lum_size))
1460 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1461 lum_size = sizeof(struct lov_user_md_v3);
1462 if (cfs_copy_from_user(&lumv3, lumv3p, lum_size))
1466 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1468 put_user(0, &lumv1p->lmm_stripe_count);
1469 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1470 0, ll_i2info(inode)->lli_smd,
/* LL_IOC_LOV_GETSTRIPE handler: forward the inode's stripe MD to the
 * LOV layer, which copies the striping description out to userspace
 * (the no-stripe error path is elided in this view). */
1476 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1478 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1483 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
/* LL_IOC_GROUP_LOCK: take a group lock with gid @arg on the file.
 * lli_lock guards the fd_flags/fd_grouplock pair; the lock is acquired
 * outside the spinlock and a second check afterwards handles the race
 * where another thread obtained the group lock in between. */
1488 int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1490 struct ll_inode_info *lli = ll_i2info(inode);
1491 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1492 struct ccc_grouplock grouplock;
1496 if (ll_file_nolock(file))
1497 RETURN(-EOPNOTSUPP);
1499 cfs_spin_lock(&lli->lli_lock);
1500 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1501 CWARN("group lock already existed with gid %lu\n",
1502 fd->fd_grouplock.cg_gid);
1503 cfs_spin_unlock(&lli->lli_lock);
1506 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1507 cfs_spin_unlock(&lli->lli_lock);
/* May block (unless O_NONBLOCK): cannot be called under the spinlock. */
1509 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1510 arg, (file->f_flags & O_NONBLOCK), &grouplock);
1514 cfs_spin_lock(&lli->lli_lock);
1515 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1516 cfs_spin_unlock(&lli->lli_lock);
1517 CERROR("another thread just won the race\n");
1518 cl_put_grouplock(&grouplock);
1522 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1523 fd->fd_grouplock = grouplock;
1524 cfs_spin_unlock(&lli->lli_lock);
1526 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/* LL_IOC_GROUP_UNLOCK: release the group lock with gid @arg. Verifies
 * under lli_lock that a group lock is held and its gid matches, then
 * detaches it from the file data before dropping it outside the lock. */
1530 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1532 struct ll_inode_info *lli = ll_i2info(inode);
1533 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1534 struct ccc_grouplock grouplock;
1537 cfs_spin_lock(&lli->lli_lock);
1538 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1539 cfs_spin_unlock(&lli->lli_lock);
1540 CWARN("no group lock held\n");
1543 LASSERT(fd->fd_grouplock.cg_lock != NULL);
1545 if (fd->fd_grouplock.cg_gid != arg) {
1546 CWARN("group lock %lu doesn't match current id %lu\n",
1547 arg, fd->fd_grouplock.cg_gid);
1548 cfs_spin_unlock(&lli->lli_lock);
/* Take a local copy and clear state while still serialized. */
1552 grouplock = fd->fd_grouplock;
1553 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1554 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1555 cfs_spin_unlock(&lli->lli_lock);
1557 cl_put_grouplock(&grouplock);
1558 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1563 * Close inode open handle
1565 * \param dentry [in] dentry which contains the inode
1566 * \param it [in,out] intent which contains open info and result
1569 * \retval <0 failure
/* Close the MDS open handle carried by @it (if any). No-op for the root
 * dentry or when the intent carries no DISP_OPEN_OPEN disposition;
 * otherwise fills an obd_client_handle from the intent and closes it,
 * then drops the enqueue-open request reference if one is held. */
1571 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1573 struct inode *inode = dentry->d_inode;
1574 struct obd_client_handle *och;
1580 /* Root ? Do nothing. */
1581 if (dentry->d_inode->i_sb->s_root == dentry)
1584 /* No open handle to close? Move away */
1585 if (!it_disposition(it, DISP_OPEN_OPEN))
1588 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1590 OBD_ALLOC(och, sizeof(*och));
1592 GOTO(out, rc = -ENOMEM);
1594 ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
1595 ll_i2info(inode), it, och);
1597 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1600 /* this one is in place of ll_file_open */
1601 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1602 ptlrpc_req_finished(it->d.lustre.it_data);
1603 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1609 * Get size for inode for which FIEMAP mapping is requested.
1610 * Make the FIEMAP get_info call and returns the result.
/* @fiemap is the request/reply buffer of @num_bytes total; flags are
 * validated against LUSTRE_FIEMAP_FLAGS_COMPAT, FIEMAP_FLAG_SYNC forces
 * a writeback, and the actual mapping is fetched from the OSTs via
 * obd_get_info(KEY_FIEMAP). Returns 0 with no extents for a 0-size file. */
1612 int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1615 struct obd_export *exp = ll_i2dtexp(inode);
1616 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1617 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1618 int vallen = num_bytes;
1622 /* Checks for fiemap flags */
1623 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* Report back which flags we do support (the -EBADR return is elided). */
1624 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1628 /* Check for FIEMAP_FLAG_SYNC */
1629 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1630 rc = filemap_fdatawrite(inode->i_mapping);
1635 /* If the stripe_count > 1 and the application does not understand
1636 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1638 if (lsm->lsm_stripe_count > 1 &&
1639 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1642 fm_key.oa.o_id = lsm->lsm_object_id;
1643 fm_key.oa.o_seq = lsm->lsm_object_seq;
1644 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1646 obdo_from_inode(&fm_key.oa, inode, &ll_i2info(inode)->lli_fid,
1648 /* If filesize is 0, then there would be no objects for mapping */
1649 if (fm_key.oa.o_size == 0) {
1650 fiemap->fm_mapped_extents = 0;
1654 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1656 rc = obd_get_info(exp, sizeof(fm_key), &fm_key, &vallen, fiemap, lsm);
1658 CERROR("obd_get_info failed: rc = %d\n", rc);
/* OBD_IOC_FID2PATH handler: copy the user's getinfo_fid2path header in
 * to learn gf_pathlen, allocate an output buffer sized for the path,
 * resolve via the MDC iocontrol and copy the result back to userspace.
 * Both buffers are freed on all paths (labels elided in this view). */
1663 int ll_fid2path(struct obd_export *exp, void *arg)
1665 struct getinfo_fid2path *gfout, *gfin;
1669 /* Need to get the buflen */
1670 OBD_ALLOC_PTR(gfin);
1673 if (cfs_copy_from_user(gfin, arg, sizeof(*gfin))) {
1678 outsize = sizeof(*gfout) + gfin->gf_pathlen;
1679 OBD_ALLOC(gfout, outsize);
1680 if (gfout == NULL) {
1684 memcpy(gfout, gfin, sizeof(*gfout));
1687 /* Call mdc_iocontrol */
1688 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1691 if (cfs_copy_to_user(arg, gfout, outsize))
1695 OBD_FREE(gfout, outsize);
/* FSFILT_IOC_FIEMAP handler: size a kernel fiemap buffer from the user's
 * fm_extent_count, copy the request (plus the first extent, used by the
 * Lustre protocol to resume mapping) in, run ll_do_fiemap() and copy the
 * header plus mapped extents back out. Buffer freed on all paths. */
1699 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1701 struct ll_user_fiemap *fiemap_s;
1702 size_t num_bytes, ret_bytes;
1703 unsigned int extent_count;
1706 /* Get the extent count so we can calculate the size of
1707 * required fiemap buffer */
1708 if (get_user(extent_count,
1709 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
/* NOTE(review): extent_count * sizeof(extent) is unchecked here — a huge
 * user-supplied count could overflow num_bytes; verify upstream fix. */
1711 num_bytes = sizeof(*fiemap_s) + (extent_count *
1712 sizeof(struct ll_fiemap_extent));
1714 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1715 if (fiemap_s == NULL)
1718 /* get the fiemap value */
1719 if (copy_from_user(fiemap_s,(struct ll_user_fiemap __user *)arg,
1721 GOTO(error, rc = -EFAULT);
1723 /* If fm_extent_count is non-zero, read the first extent since
1724 * it is used to calculate end_offset and device from previous
1727 if (copy_from_user(&fiemap_s->fm_extents[0],
1728 (char __user *)arg + sizeof(*fiemap_s),
1729 sizeof(struct ll_fiemap_extent)))
1730 GOTO(error, rc = -EFAULT);
1733 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1737 ret_bytes = sizeof(struct ll_user_fiemap);
1739 if (extent_count != 0)
1740 ret_bytes += (fiemap_s->fm_mapped_extents *
1741 sizeof(struct ll_fiemap_extent));
1743 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1747 OBD_FREE_LARGE(fiemap_s, num_bytes);
1752 * Read the data_version for inode.
1754 * This value is computed using stripe object version on OST.
1755 * Version is computed using server side locking.
1757 * @param extent_lock Take extent lock. Not needed if a process is already
1758 * holding the OST object group locks.
/* Returns the version via *data_version; an unstriped file reports
 * version 0. The obdo is heap-allocated and freed (free path elided). */
1760 static int ll_data_version(struct inode *inode, __u64 *data_version,
1763 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1764 struct ll_sb_info *sbi = ll_i2sbi(inode);
1765 struct obdo *obdo = NULL;
1769 /* If no stripe, we consider version is 0. */
1772 CDEBUG(D_INODE, "No object for inode\n");
1776 OBD_ALLOC_PTR(obdo);
1780 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
1782 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1785 *data_version = obdo->o_data_version;
/* Main file ioctl dispatcher for the Lustre client. Two prototypes are
 * compiled depending on whether the kernel has unlocked_ioctl (BKL-free)
 * or the legacy 4-argument ->ioctl. Per-command handling is delegated to
 * the ll_lov_*/ll_*_grouplock/ll_fid2path/... helpers above; unknown
 * commands fall through to dynamically registered handlers and finally
 * to obd_iocontrol() on the data export. */
1793 #ifdef HAVE_UNLOCKED_IOCTL
1794 long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1796 struct inode *inode = file->f_dentry->d_inode;
1798 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
1802 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1807 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
1808 inode->i_generation, inode, cmd);
1809 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
1811 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
1812 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
1816 case LL_IOC_GETFLAGS:
1817 /* Get the current value of the file flags */
1818 return put_user(fd->fd_flags, (int *)arg);
1819 case LL_IOC_SETFLAGS:
1820 case LL_IOC_CLRFLAGS:
1821 /* Set or clear specific file flags */
1822 /* XXX This probably needs checks to ensure the flags are
1823 * not abused, and to handle any flag side effects.
1825 if (get_user(flags, (int *) arg))
1828 if (cmd == LL_IOC_SETFLAGS) {
/* Lock-bypass is only sane for O_DIRECT I/O: reject otherwise. */
1829 if ((flags & LL_FILE_IGNORE_LOCK) &&
1830 !(file->f_flags & O_DIRECT)) {
1831 CERROR("%s: unable to disable locking on "
1832 "non-O_DIRECT file\n", current->comm);
1836 fd->fd_flags |= flags;
1838 fd->fd_flags &= ~flags;
1841 case LL_IOC_LOV_SETSTRIPE:
1842 RETURN(ll_lov_setstripe(inode, file, arg));
1843 case LL_IOC_LOV_SETEA:
1844 RETURN(ll_lov_setea(inode, file, arg));
1845 case LL_IOC_LOV_GETSTRIPE:
1846 RETURN(ll_lov_getstripe(inode, arg));
1847 case LL_IOC_RECREATE_OBJ:
1848 RETURN(ll_lov_recreate_obj(inode, arg));
1849 case LL_IOC_RECREATE_FID:
1850 RETURN(ll_lov_recreate_fid(inode, arg));
1851 case FSFILT_IOC_FIEMAP:
1852 RETURN(ll_ioctl_fiemap(inode, arg));
1853 case FSFILT_IOC_GETFLAGS:
1854 case FSFILT_IOC_SETFLAGS:
1855 RETURN(ll_iocontrol(inode, file, cmd, arg));
1856 case FSFILT_IOC_GETVERSION_OLD:
1857 case FSFILT_IOC_GETVERSION:
1858 RETURN(put_user(inode->i_generation, (int *)arg));
1859 case LL_IOC_GROUP_LOCK:
1860 RETURN(ll_get_grouplock(inode, file, arg));
1861 case LL_IOC_GROUP_UNLOCK:
1862 RETURN(ll_put_grouplock(inode, file, arg));
1863 case IOC_OBD_STATFS:
1864 RETURN(ll_obd_statfs(inode, (void *)arg));
1866 /* We need to special case any other ioctls we want to handle,
1867 * to send them to the MDS/OST as appropriate and to properly
1868 * network encode the arg field.
1869 case FSFILT_IOC_SETVERSION_OLD:
1870 case FSFILT_IOC_SETVERSION:
1872 case LL_IOC_FLUSHCTX:
1873 RETURN(ll_flush_ctx(inode));
1874 case LL_IOC_PATH2FID: {
1875 if (cfs_copy_to_user((void *)arg, ll_inode2fid(inode),
1876 sizeof(struct lu_fid)))
1881 case OBD_IOC_FID2PATH:
1882 RETURN(ll_fid2path(ll_i2mdexp(inode), (void *)arg));
1883 case LL_IOC_DATA_VERSION: {
1884 struct ioc_data_version idv;
1887 if (cfs_copy_from_user(&idv, (char *)arg, sizeof(idv)))
1890 rc = ll_data_version(inode, &idv.idv_version,
1891 !(idv.idv_flags & LL_DV_NOFLUSH));
1894 cfs_copy_to_user((char *) arg, &idv, sizeof(idv)))
1900 case LL_IOC_GET_MDTIDX: {
1903 mdtidx = ll_get_mdt_idx(inode);
1907 if (put_user((int)mdtidx, (int*)arg))
1912 case OBD_IOC_GETDTNAME:
1913 case OBD_IOC_GETMDNAME:
1914 RETURN(ll_get_obd_name(inode, cmd, arg));
1919 ll_iocontrol_call(inode, file, cmd, arg, &err))
1922 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
/* llseek for Lustre files. SEEK_END (origin == 2) first glimpses the
 * size from the OSTs so i_size is current; the final offset is range-
 * checked against the filesystem's maximum byte limit before f_pos is
 * updated (error paths and EXIT elided in this view). */
1928 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
1930 struct inode *inode = file->f_dentry->d_inode;
1933 retval = offset + ((origin == 2) ? i_size_read(inode) :
1934 (origin == 1) ? file->f_pos : 0);
1935 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%s)\n",
1936 inode->i_ino, inode->i_generation, inode, retval, retval,
1937 origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR" : "SEEK_SET");
1938 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
1940 if (origin == 2) { /* SEEK_END */
1943 rc = ll_glimpse_size(inode);
1947 offset += i_size_read(inode);
1948 } else if (origin == 1) { /* SEEK_CUR */
1949 offset += file->f_pos;
1953 if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
1954 if (offset != file->f_pos) {
1955 file->f_pos = offset;
/* ->flush handler (close(2) path): report any previously recorded write
 * or async-writeback error for this inode as -EIO. Two prototypes are
 * compiled depending on whether flush takes an fl_owner_t id. */
1963 #ifdef HAVE_FLUSH_OWNER_ID
1964 int ll_flush(struct file *file, fl_owner_t id)
1966 int ll_flush(struct file *file)
1969 struct inode *inode = file->f_dentry->d_inode;
1970 struct ll_inode_info *lli = ll_i2info(inode);
1971 struct lov_stripe_md *lsm = lli->lli_smd;
1974 LASSERT(!S_ISDIR(inode->i_mode));
1976 /* the application should know write failure already. */
1977 if (lli->lli_write_rc)
1980 /* catch async errors that were recorded back when async writeback
1981 * failed for pages in this mapping. */
1982 rc = lli->lli_async_rc;
/* Errors are reported once: clear the sticky async rc after reading it. */
1983 lli->lli_async_rc = 0;
1985 err = lov_test_and_clear_async_rc(lsm);
1990 return rc ? -EIO : 0;
/* fsync handler (2- or 3-argument prototype depending on kernel): wait
 * for in-flight page I/O, pick up sticky async writeback errors, sync
 * the metadata via md_sync() to the MDS, and (for striped files, guard
 * elided) sync data objects via obd_sync_rqset() to the OSTs. The last
 * data-sync result is also recorded in lli_write_rc for ll_flush(). */
1993 #ifndef HAVE_FILE_FSYNC_2ARGS
1994 int ll_fsync(struct file *file, struct dentry *dentry, int data)
1996 int ll_fsync(struct file *file, int data)
1999 struct inode *inode = file->f_dentry->d_inode;
2000 struct ll_inode_info *lli = ll_i2info(inode);
2001 struct lov_stripe_md *lsm = lli->lli_smd;
2002 struct ptlrpc_request *req;
2003 struct obd_capa *oc;
2006 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2007 inode->i_generation, inode);
2008 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2010 /* fsync's caller has already called _fdata{sync,write}, we want
2011 * that IO to finish before calling the osc and mdc sync methods */
2012 rc = filemap_fdatawait(inode->i_mapping);
2014 /* catch async errors that were recorded back when async writeback
2015 * failed for pages in this mapping. */
2016 if (!S_ISDIR(inode->i_mode)) {
2017 err = lli->lli_async_rc;
2018 lli->lli_async_rc = 0;
2022 err = lov_test_and_clear_async_rc(lsm);
2028 oc = ll_mdscapa_get(inode);
2029 err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2035 ptlrpc_req_finished(req);
2038 struct obd_info *oinfo;
2040 OBD_ALLOC_PTR(oinfo);
2042 RETURN(rc ? rc : -ENOMEM);
2043 OBDO_ALLOC(oinfo->oi_oa);
2044 if (!oinfo->oi_oa) {
2045 OBD_FREE_PTR(oinfo);
2046 RETURN(rc ? rc : -ENOMEM);
2048 oinfo->oi_oa->o_id = lsm->lsm_object_id;
2049 oinfo->oi_oa->o_seq = lsm->lsm_object_seq;
2050 oinfo->oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2051 obdo_from_inode(oinfo->oi_oa, inode, &ll_i2info(inode)->lli_fid,
2052 OBD_MD_FLTYPE | OBD_MD_FLATIME |
2053 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
/* OSS write capability covers the sync RPC to the data servers. */
2056 oinfo->oi_capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2057 err = obd_sync_rqset(ll_i2sbi(inode)->ll_dt_exp, oinfo, 0,
2059 capa_put(oinfo->oi_capa);
2062 OBDO_FREE(oinfo->oi_oa);
2063 OBD_FREE_PTR(oinfo);
2064 lli->lli_write_rc = rc < 0 ? rc : 0;
/* flock/fcntl byte-range lock handler: translate a kernel file_lock into
 * an LDLM_FLOCK enqueue to the MDS. BSD flocks are whole-file and owned
 * by the struct file; POSIX locks carry start/end and fl_owner. LCK_NL
 * stands in for unlock requests; F_GETLK-style commands enqueue with
 * LDLM_FL_TEST_LOCK. On success the local kernel lock state is updated
 * to match (flock_lock_file_wait / posix_lock_file_wait). */
2070 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2072 struct inode *inode = file->f_dentry->d_inode;
2073 struct ll_sb_info *sbi = ll_i2sbi(inode);
2074 struct ldlm_enqueue_info einfo = { .ei_type = LDLM_FLOCK,
2075 .ei_cb_cp =ldlm_flock_completion_ast,
2076 .ei_cbdata = file_lock };
2077 struct md_op_data *op_data;
2078 struct lustre_handle lockh = {0};
2079 ldlm_policy_data_t flock = {{0}};
2084 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2085 inode->i_ino, file_lock);
2087 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2089 if (file_lock->fl_flags & FL_FLOCK) {
2090 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2091 /* flocks are whole-file locks */
2092 flock.l_flock.end = OFFSET_MAX;
2093 /* For flocks owner is determined by the local file desctiptor*/
2094 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2095 } else if (file_lock->fl_flags & FL_POSIX) {
2096 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2097 flock.l_flock.start = file_lock->fl_start;
2098 flock.l_flock.end = file_lock->fl_end;
2102 flock.l_flock.pid = file_lock->fl_pid;
2104 /* Somewhat ugly workaround for svc lockd.
2105 * lockd installs custom fl_lmops->fl_compare_owner that checks
2106 * for the fl_owner to be the same (which it always is on local node
2107 * I guess between lockd processes) and then compares pid.
2108 * As such we assign pid to the owner field to make it all work,
2109 * conflict with normal locks is unlikely since pid space and
2110 * pointer space for current->files are not intersecting */
2111 if (file_lock->fl_lmops && file_lock->fl_lmops->fl_compare_owner)
2112 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
2114 switch (file_lock->fl_type) {
2116 einfo.ei_mode = LCK_PR;
2119 /* An unlock request may or may not have any relation to
2120 * existing locks so we may not be able to pass a lock handle
2121 * via a normal ldlm_lock_cancel() request. The request may even
2122 * unlock a byte range in the middle of an existing lock. In
2123 * order to process an unlock request we need all of the same
2124 * information that is given with a normal read or write record
2125 * lock request. To avoid creating another ldlm unlock (cancel)
2126 * message we'll treat a LCK_NL flock request as an unlock. */
2127 einfo.ei_mode = LCK_NL;
2130 einfo.ei_mode = LCK_PW;
2133 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2134 file_lock->fl_type);
2149 flags = LDLM_FL_BLOCK_NOWAIT;
2155 flags = LDLM_FL_TEST_LOCK;
2156 /* Save the old mode so that if the mode in the lock changes we
2157 * can decrement the appropriate reader or writer refcount. */
2158 file_lock->fl_type = einfo.ei_mode;
2161 CERROR("unknown fcntl lock command: %d\n", cmd);
2165 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2166 LUSTRE_OPC_ANY, NULL);
2167 if (IS_ERR(op_data))
2168 RETURN(PTR_ERR(op_data));
2170 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2171 "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2172 flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2174 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2175 op_data, &lockh, &flock, 0, NULL /* req */, flags);
2177 ll_finish_md_op_data(op_data);
2179 if ((file_lock->fl_flags & FL_FLOCK) &&
2180 (rc == 0 || file_lock->fl_type == F_UNLCK))
2181 ll_flock_lock_file_wait(file, file_lock, (cmd == F_SETLKW));
2182 #ifdef HAVE_F_OP_FLOCK
2183 if ((file_lock->fl_flags & FL_POSIX) &&
2184 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2185 !(flags & LDLM_FL_TEST_LOCK))
2186 posix_lock_file_wait(file, file_lock);
/* Stub lock handler for -o noflock mounts: rejects all flock/fcntl lock
 * requests (body elided in this view; presumably returns -ENOSYS per the
 * ll_file_operations_noflock comment below — confirm against full file). */
2192 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2200 * test if some locks matching bits and l_req_mode are acquired
2201 * - bits can be in different locks
2202 * - if found clear the common lock bits in *bits
2203 * - the bits not found, are kept in *bits
2205 * \param bits [IN] searched lock bits [IN]
2206 * \param l_req_mode [IN] searched lock mode
2207 * \retval boolean, true iff all bits are found
/* Probes each requested inodebit individually with a TEST_LOCK match;
 * a matched lock may cover additional bits, which are cleared too. */
2209 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
2211 struct lustre_handle lockh;
2212 ldlm_policy_data_t policy;
2213 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2214 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2223 fid = &ll_i2info(inode)->lli_fid;
2224 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2225 ldlm_lockname[mode]);
2227 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2228 for (i = 0; i < MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2229 policy.l_inodebits.bits = *bits & (1 << i);
2230 if (policy.l_inodebits.bits == 0)
2233 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2234 &policy, mode, &lockh)) {
2235 struct ldlm_lock *lock;
2237 lock = ldlm_handle2lock(&lockh);
2240 ~(lock->l_policy_data.l_inodebits.bits);
2241 LDLM_LOCK_PUT(lock);
2243 *bits &= ~policy.l_inodebits.bits;
/* Try to match (and take a reference on) an existing MDS ibits lock on
 * @inode covering @bits, in any of CR/CW/PR/PW modes. On a match the
 * handle is returned via @lockh; returns the matched mode (0 if none). */
2250 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2251 struct lustre_handle *lockh)
2253 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2259 fid = &ll_i2info(inode)->lli_fid;
2260 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2262 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING;
2263 rc = md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2264 LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
/* Post-process a revalidation RPC result: -ENOENT on an already-unlinked
 * inode is treated as success (nlink update elided); other errors are
 * logged. Regular files/dirs reaching the ENOENT path would indicate an
 * obscure race, hence the type check. */
2268 static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
2269 if (rc == -ENOENT) { /* Already unlinked. Just update nlink
2270 * and return success */
2272 /* This path cannot be hit for regular files unless in
2273 * case of obscure races, so no need to to validate
2275 if (!S_ISREG(inode->i_mode) &&
2276 !S_ISDIR(inode->i_mode))
2281 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
/* Revalidate @dentry's inode attributes against the MDS for the lock
 * bits in @ibits. With OBD_CONNECT_ATTRFID the getattr is driven by an
 * IT_GETATTR/IT_LOOKUP intent keyed by fid (no name); otherwise, if no
 * matching MD lock is cached locally, a plain md_getattr() refreshes the
 * inode via ll_prep_inode(). Unlinked dentries are unhashed so stale
 * entries are not revalidated later. */
2289 int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2292 struct inode *inode = dentry->d_inode;
2293 struct ptlrpc_request *req = NULL;
2294 struct obd_export *exp;
2299 CERROR("REPORT THIS LINE TO PETER\n");
2303 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2304 inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2306 exp = ll_i2mdexp(inode);
2308 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
2309 * But under CMD case, it caused some lock issues, should be fixed
2310 * with new CMD ibits lock. See bug 12718 */
2311 if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
2312 struct lookup_intent oit = { .it_op = IT_GETATTR };
2313 struct md_op_data *op_data;
2315 if (ibits == MDS_INODELOCK_LOOKUP)
2316 oit.it_op = IT_LOOKUP;
2318 /* Call getattr by fid, so do not provide name at all. */
2319 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2320 dentry->d_inode, NULL, 0, 0,
2321 LUSTRE_OPC_ANY, NULL);
2322 if (IS_ERR(op_data))
2323 RETURN(PTR_ERR(op_data));
2325 oit.it_create_mode |= M_CHECK_STALE;
2326 rc = md_intent_lock(exp, op_data, NULL, 0,
2327 /* we are not interested in name
2330 ll_md_blocking_ast, 0);
2331 ll_finish_md_op_data(op_data);
2332 oit.it_create_mode &= ~M_CHECK_STALE;
2334 rc = ll_inode_revalidate_fini(inode, rc);
2338 rc = ll_revalidate_it_finish(req, &oit, dentry);
2340 ll_intent_release(&oit);
2344 /* Unlinked? Unhash dentry, so it is not picked up later by
2345 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2346 here to preserve get_cwd functionality on 2.6.
2348 if (!dentry->d_inode->i_nlink) {
2349 cfs_spin_lock(&ll_lookup_lock);
2350 spin_lock(&dcache_lock);
2351 ll_drop_dentry(dentry);
2352 spin_unlock(&dcache_lock);
2353 cfs_spin_unlock(&ll_lookup_lock);
2356 ll_lookup_finish_locks(&oit, dentry);
2357 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
2358 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2359 obd_valid valid = OBD_MD_FLGETATTR;
2360 struct md_op_data *op_data;
/* Regular files also need the EA sized into the getattr reply. */
2363 if (S_ISREG(inode->i_mode)) {
2364 rc = ll_get_max_mdsize(sbi, &ealen);
2367 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2370 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
2371 0, ealen, LUSTRE_OPC_ANY,
2373 if (IS_ERR(op_data))
2374 RETURN(PTR_ERR(op_data));
2376 op_data->op_valid = valid;
2377 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2378 * capa for this inode. Because we only keep capas of dirs
2380 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
2381 ll_finish_md_op_data(op_data);
2383 rc = ll_inode_revalidate_fini(inode, rc);
2387 rc = ll_prep_inode(&inode, req, NULL);
2390 ptlrpc_req_finished(req);
/* Revalidate attrs via __ll_inode_revalidate_it(), then refresh the
 * size/times: if no stripe objects exist yet the cached LVB times are
 * used directly; otherwise ll_glimpse_size() queries the OSTs (which
 * prefers locally cached writes when they extend the file). */
2394 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2397 struct inode *inode = dentry->d_inode;
2401 rc = __ll_inode_revalidate_it(dentry, it, ibits);
2403 /* if object not yet allocated, don't validate size */
2404 if (rc == 0 && ll_i2info(dentry->d_inode)->lli_smd == NULL) {
2405 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
2406 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
2407 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
2411 /* ll_glimpse_size will prefer locally cached writes if they extend
2415 rc = ll_glimpse_size(inode);
/* getattr with an explicit intent: revalidate UPDATE|LOOKUP bits, then
 * fill struct kstat from the (now fresh) inode. 32-bit-API clients get
 * an ino squeezed from the fid via cl_fid_build_ino(). */
2420 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2421 struct lookup_intent *it, struct kstat *stat)
2423 struct inode *inode = de->d_inode;
2424 struct ll_sb_info *sbi = ll_i2sbi(inode);
2425 struct ll_inode_info *lli = ll_i2info(inode);
2428 res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE |
2429 MDS_INODELOCK_LOOKUP);
2430 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
2435 stat->dev = inode->i_sb->s_dev;
2436 if (ll_need_32bit_api(sbi))
2437 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
2439 stat->ino = inode->i_ino;
2440 stat->mode = inode->i_mode;
2441 stat->nlink = inode->i_nlink;
2442 stat->uid = inode->i_uid;
2443 stat->gid = inode->i_gid;
2444 stat->rdev = kdev_t_to_nr(inode->i_rdev);
2445 stat->atime = inode->i_atime;
2446 stat->mtime = inode->i_mtime;
2447 stat->ctime = inode->i_ctime;
2448 #ifdef HAVE_INODE_BLKSIZE
2449 stat->blksize = inode->i_blksize;
2451 stat->blksize = 1 << inode->i_blkbits;
2454 stat->size = i_size_read(inode);
2455 stat->blocks = inode->i_blocks;
/* Plain VFS ->getattr entry point: wraps ll_getattr_it() with a fresh
 * IT_GETATTR intent. */
2459 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2461 struct lookup_intent it = { .it_op = IT_GETATTR };
2463 return ll_getattr_it(mnt, de, &it, stat);
/* Kernel fiemap_extent_info-based ->fiemap entry (only when the kernel
 * provides linux/fiemap.h): marshal fieinfo into a ll_user_fiemap buffer
 * sized for fi_extents_max extents, run ll_do_fiemap(), and copy flags
 * and mapped extents back into the caller's extent array. */
2466 #ifdef HAVE_LINUX_FIEMAP_H
2467 int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2468 __u64 start, __u64 len)
2472 struct ll_user_fiemap *fiemap;
2473 unsigned int extent_count = fieinfo->fi_extents_max;
2475 num_bytes = sizeof(*fiemap) + (extent_count *
2476 sizeof(struct ll_fiemap_extent));
2477 OBD_ALLOC_LARGE(fiemap, num_bytes);
2482 fiemap->fm_flags = fieinfo->fi_flags;
2483 fiemap->fm_extent_count = fieinfo->fi_extents_max;
2484 fiemap->fm_start = start;
2485 fiemap->fm_length = len;
/* Seed the first extent slot: the protocol uses it for continuation. */
2486 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
2487 sizeof(struct ll_fiemap_extent));
2489 rc = ll_do_fiemap(inode, fiemap, num_bytes);
2491 fieinfo->fi_flags = fiemap->fm_flags;
2492 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
2493 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
2494 fiemap->fm_mapped_extents * sizeof(struct ll_fiemap_extent));
2496 OBD_FREE_LARGE(fiemap, num_bytes);
/* POSIX ACL check callback passed to generic_permission(): duplicates
 * the cached ACL under lli_lock and evaluates it against @mask. With
 * the 4-arg generic_permission API, RCU-walk lookups bail out early
 * (ACL access could sleep). Compiled out without CONFIG_FS_POSIX_ACL. */
2503 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
2504 lustre_check_acl(struct inode *inode, int mask, unsigned int flags)
2506 lustre_check_acl(struct inode *inode, int mask)
2509 #ifdef CONFIG_FS_POSIX_ACL
2510 struct ll_inode_info *lli = ll_i2info(inode);
2511 struct posix_acl *acl;
2515 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
2516 if (flags & IPERM_FLAG_RCU)
2519 cfs_spin_lock(&lli->lli_lock);
2520 acl = posix_acl_dup(lli->lli_posix_acl);
2521 cfs_spin_unlock(&lli->lli_lock);
2526 rc = posix_acl_permission(inode, acl, mask);
2527 posix_acl_release(acl);
/* VFS ->permission entry (three prototype variants for kernel compat):
 * revalidates the root inode first (root is never validated by lookup),
 * defers to the remote-permission path on RMT_CLIENT mounts, otherwise
 * runs generic permission checking with the Lustre ACL callback. */
2535 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
2536 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
2538 # ifdef HAVE_INODE_PERMISION_2ARGS
2539 int ll_inode_permission(struct inode *inode, int mask)
2541 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2548 /* as root inode are NOT getting validated in lookup operation,
2549 * need to do it before permission check. */
2551 if (inode == inode->i_sb->s_root->d_inode) {
2552 struct lookup_intent it = { .it_op = IT_LOOKUP };
2554 rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
2555 MDS_INODELOCK_LOOKUP);
2560 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
2561 inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
2563 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2564 return lustre_check_remote_perm(inode, mask);
2566 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2567 rc = ll_generic_permission(inode, mask, flags, lustre_check_acl);
/* Compat shims: older kernels expose vectored I/O as ->readv/->writev,
 * newer ones as ->aio_read/->aio_write. These macros let the three
 * file_operations tables below name the right member and function. */
2572 #ifdef HAVE_FILE_READV
2573 #define READ_METHOD readv
2574 #define READ_FUNCTION ll_file_readv
2575 #define WRITE_METHOD writev
2576 #define WRITE_FUNCTION ll_file_writev
2578 #define READ_METHOD aio_read
2579 #define READ_FUNCTION ll_file_aio_read
2580 #define WRITE_METHOD aio_write
2581 #define WRITE_FUNCTION ll_file_aio_write
2584 /* -o localflock - only provides locally consistent flock locks */
/* Default file_operations table: no ->flock/->lock members, so the
 * kernel's built-in (node-local) lock handling applies. */
2585 struct file_operations ll_file_operations = {
2586 .read = ll_file_read,
2587 .READ_METHOD = READ_FUNCTION,
2588 .write = ll_file_write,
2589 .WRITE_METHOD = WRITE_FUNCTION,
2590 #ifdef HAVE_UNLOCKED_IOCTL
2591 .unlocked_ioctl = ll_file_ioctl,
2593 .ioctl = ll_file_ioctl,
2595 .open = ll_file_open,
2596 .release = ll_file_release,
2597 .mmap = ll_file_mmap,
2598 .llseek = ll_file_seek,
2599 #ifdef HAVE_KERNEL_SENDFILE
2600 .sendfile = ll_file_sendfile,
2602 #ifdef HAVE_KERNEL_SPLICE_READ
2603 .splice_read = ll_file_splice_read,
/* file_operations for -o flock mounts: identical to the default table
 * but routes flock/fcntl locks through ll_file_flock() for cluster-wide
 * consistency (member name depends on HAVE_F_OP_FLOCK). */
2609 struct file_operations ll_file_operations_flock = {
2610 .read = ll_file_read,
2611 .READ_METHOD = READ_FUNCTION,
2612 .write = ll_file_write,
2613 .WRITE_METHOD = WRITE_FUNCTION,
2614 #ifdef HAVE_UNLOCKED_IOCTL
2615 .unlocked_ioctl = ll_file_ioctl,
2617 .ioctl = ll_file_ioctl,
2619 .open = ll_file_open,
2620 .release = ll_file_release,
2621 .mmap = ll_file_mmap,
2622 .llseek = ll_file_seek,
2623 #ifdef HAVE_KERNEL_SENDFILE
2624 .sendfile = ll_file_sendfile,
2626 #ifdef HAVE_KERNEL_SPLICE_READ
2627 .splice_read = ll_file_splice_read,
2631 #ifdef HAVE_F_OP_FLOCK
2632 .flock = ll_file_flock,
2634 .lock = ll_file_flock
2637 /* These are for -o noflock - to return ENOSYS on flock calls */
/* file_operations for -o noflock mounts: identical to the default table
 * except ->flock/->lock point at ll_file_noflock, which rejects locking
 * requests outright.  NOTE(review): the #else/#endif lines and the closing
 * '};' were elided from this excerpt. */
2638 struct file_operations ll_file_operations_noflock = {
2639 .read = ll_file_read,
2640 .READ_METHOD = READ_FUNCTION,
2641 .write = ll_file_write,
2642 .WRITE_METHOD = WRITE_FUNCTION,
2643 #ifdef HAVE_UNLOCKED_IOCTL
2644 .unlocked_ioctl = ll_file_ioctl,
2646 .ioctl = ll_file_ioctl,
2648 .open = ll_file_open,
2649 .release = ll_file_release,
2650 .mmap = ll_file_mmap,
2651 .llseek = ll_file_seek,
2652 #ifdef HAVE_KERNEL_SENDFILE
2653 .sendfile = ll_file_sendfile,
2655 #ifdef HAVE_KERNEL_SPLICE_READ
2656 .splice_read = ll_file_splice_read,
/* Stub locking entry point that refuses all lock requests. */
2660 #ifdef HAVE_F_OP_FLOCK
2661 .flock = ll_file_noflock,
2663 .lock = ll_file_noflock
/* inode_operations for regular Lustre files: attribute get/set, the
 * Lustre-aware ->permission hook, and the MDS-backed extended-attribute
 * handlers; ->fiemap is available only when the kernel provides
 * linux/fiemap.h.  NOTE(review): the #endif and the closing '};' were
 * elided from this excerpt. */
2666 struct inode_operations ll_file_inode_operations = {
2667 .setattr = ll_setattr,
2668 .truncate = ll_truncate,
2669 .getattr = ll_getattr,
2670 .permission = ll_inode_permission,
2671 .setxattr = ll_setxattr,
2672 .getxattr = ll_getxattr,
2673 .listxattr = ll_listxattr,
2674 .removexattr = ll_removexattr,
2675 #ifdef HAVE_LINUX_FIEMAP_H
2676 .fiemap = ll_fiemap,
2680 /* dynamic ioctl number support routines */
/* Global registry of dynamically registered ioctl handlers: a linked list
 * of llioc_data entries protected by a reader/writer semaphore (readers in
 * ll_iocontrol_call(), writers in register/unregister).
 * NOTE(review): the initializer lines at 2685/2686 belong to a static
 * 'llioc' definition whose intervening lines were elided from this
 * excerpt. */
2681 static struct llioc_ctl_data {
2682 cfs_rw_semaphore_t ioc_sem;
2683 cfs_list_t ioc_head;
2685 __RWSEM_INITIALIZER(llioc.ioc_sem),
2686 CFS_LIST_HEAD_INIT(llioc.ioc_head)
/* One record per registration: callback, total allocation size (kept for
 * OBD_FREE), and a trailing variable-length array of iocd_count ioctl
 * command numbers.  NOTE(review): the 'struct llioc_data {' header line
 * was elided from this excerpt. */
2691 cfs_list_t iocd_list;
2692 unsigned int iocd_size;
2693 llioc_callback_t iocd_cb;
2694 unsigned int iocd_count;
2695 unsigned int iocd_cmd[0];
/* Register a dynamic ioctl handler.
 * \param cb    callback invoked for matching ioctl commands
 * \param count number of entries in \a cmd (validated against
 *              0..LLIOC_MAX_CMD)
 * \param cmd   array of ioctl command numbers handled by \a cb
 * \retval opaque handle (the allocation itself) later passed to
 *         ll_iocontrol_unregister().
 * NOTE(review): the error-return statements and the final return were
 * elided from this excerpt. */
2698 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
2701 struct llioc_data *in_data = NULL;
/* Reject a missing callback/command array or an out-of-range count. */
2704 if (cb == NULL || cmd == NULL ||
2705 count > LLIOC_MAX_CMD || count < 0)
/* Allocation covers the fixed header plus the trailing array of 'count'
 * command numbers. */
2708 size = sizeof(*in_data) + count * sizeof(unsigned int);
2709 OBD_ALLOC(in_data, size);
2710 if (in_data == NULL)
/* Only the fixed-size header is cleared here; the trailing command array
 * is fully overwritten by the memcpy below, so this is sufficient. */
2713 memset(in_data, 0, sizeof(*in_data));
2714 in_data->iocd_size = size;
2715 in_data->iocd_cb = cb;
2716 in_data->iocd_count = count;
2717 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
/* Publish under the write lock so readers in ll_iocontrol_call() never
 * observe a partially initialised entry. */
2719 cfs_down_write(&llioc.ioc_sem);
2720 cfs_list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
2721 cfs_up_write(&llioc.ioc_sem);
/* Unregister a handler previously returned by ll_iocontrol_register().
 * \param magic the opaque handle returned at registration time.
 * If no matching entry is found, only a console warning is emitted.
 * NOTE(review): the guard/match statements (e.g. a NULL check on \a magic
 * and the comparison selecting the matching entry) were elided from this
 * excerpt. */
2726 void ll_iocontrol_unregister(void *magic)
2728 struct llioc_data *tmp;
2733 cfs_down_write(&llioc.ioc_sem);
2734 cfs_list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* Cache the size before unlinking: OBD_FREE needs the original length. */
2736 unsigned int size = tmp->iocd_size;
2738 cfs_list_del(&tmp->iocd_list);
/* Drop the lock before freeing; the entry is already off the list, so no
 * reader can reach it. */
2739 cfs_up_write(&llioc.ioc_sem);
2741 OBD_FREE(tmp, size);
/* Fall-through path: nothing matched 'magic'. */
2745 cfs_up_write(&llioc.ioc_sem);
2747 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
2750 EXPORT_SYMBOL(ll_iocontrol_register);
2751 EXPORT_SYMBOL(ll_iocontrol_unregister);
2753 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
2754 unsigned int cmd, unsigned long arg, int *rcp)
2756 enum llioc_iter ret = LLIOC_CONT;
2757 struct llioc_data *data;
2758 int rc = -EINVAL, i;
2760 cfs_down_read(&llioc.ioc_sem);
2761 cfs_list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
2762 for (i = 0; i < data->iocd_count; i++) {
2763 if (cmd != data->iocd_cmd[i])
2766 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
2770 if (ret == LLIOC_STOP)
2773 cfs_up_read(&llioc.ioc_sem);