1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
6 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 only,
10 * as published by the Free Software Foundation.
12 * This program is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License version 2 for more details (a copy is included
16 * in the LICENSE file that accompanied this code).
18 * You should have received a copy of the GNU General Public License
19 * version 2 along with this program; If not, see
20 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
22 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23 * CA 95054 USA or visit www.sun.com if you need additional information or
29 * Copyright 2008 Sun Microsystems, Inc. All rights reserved
30 * Use is subject to license terms.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <lustre_mdc.h>
47 #include <linux/pagemap.h>
48 #include <linux/file.h>
49 #include "llite_internal.h"
50 #include <lustre/ll_fiemap.h>
52 /* also used by llite/special.c:ll_special_open() */
/* Allocate a per-open-file ll_file_data from the dedicated slab cache.
 * NOTE(review): the allocation can fail; callers must check for NULL. */
53 struct ll_file_data *ll_file_data_get(void)
55 struct ll_file_data *fd;
57 OBD_SLAB_ALLOC_PTR(fd, ll_file_data_slab);
/* Return a ll_file_data obtained from ll_file_data_get() to the slab. */
61 static void ll_file_data_put(struct ll_file_data *fd)
64 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/* Pack the inode attributes required for an MDS close/DONE_WRITING RPC
 * into @op_data: fid, mode, a/m/ctime, size, blocks, flags, I/O epoch,
 * the open handle @fh, and an MDS capability reference. */
67 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
68 struct lustre_handle *fh)
70 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
71 op_data->op_attr.ia_mode = inode->i_mode;
72 op_data->op_attr.ia_atime = inode->i_atime;
73 op_data->op_attr.ia_mtime = inode->i_mtime;
74 op_data->op_attr.ia_ctime = inode->i_ctime;
75 op_data->op_attr.ia_size = i_size_read(inode);
76 op_data->op_attr_blocks = inode->i_blocks;
/* ia_attr_flags lives in the Lustre-extended iattr; cast to reach it */
77 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags = inode->i_flags;
78 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
79 memcpy(&op_data->op_handle, fh, sizeof(op_data->op_handle));
/* takes a capa reference; presumably released by the close path — verify */
80 op_data->op_capa1 = ll_mdscapa_get(inode);
/* Prepare @op_data for closing open handle @och on @inode.
 * Size/blocks are included in the setattr only when Size-on-MDS (SOM) is
 * not in effect for a regular file; otherwise the epoch-close machinery
 * is responsible for propagating size. */
83 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
84 struct obd_client_handle *och)
88 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
89 ATTR_MTIME_SET | ATTR_CTIME_SET;
91 if (!(och->och_flags & FMODE_WRITE))
94 if (!(ll_i2mdexp(inode)->exp_connect_flags & OBD_CONNECT_SOM) ||
95 !S_ISREG(inode->i_mode))
96 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* note: &och — ll_epoch_close() may manipulate the handle pointer itself */
98 ll_epoch_close(inode, op_data, &och, 0);
101 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
/* Send an MDS close for open handle @och and clean up its replay state.
 * On an epoch close under SOM the MDS may ask us to fetch size from the
 * OSTs and send it back via ll_sizeonmds_update(); otherwise a pending
 * DONE_WRITING is queued for regular files opened for write. */
105 static int ll_close_inode_openhandle(struct obd_export *md_exp,
107 struct obd_client_handle *och)
109 struct obd_export *exp = ll_i2mdexp(inode);
110 struct md_op_data *op_data;
111 struct ptlrpc_request *req = NULL;
112 struct obd_device *obd = class_exp2obd(exp);
119 * XXX: in case of LMV, is this correct to access
122 CERROR("Invalid MDC connection handle "LPX64"\n",
123 ll_i2mdexp(inode)->exp_handle.h_cookie);
128 * here we check if this is forced umount. If so this is called on
129 * canceling "open lock" and we do not call md_close() in this case, as
130 * it will not be successful, as import is already deactivated.
135 OBD_ALLOC_PTR(op_data);
137 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
139 ll_prepare_close(inode, op_data, och);
140 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
141 rc = md_close(md_exp, op_data, och->och_mod, &req);
146 /* This close must have the epoch closed. */
147 LASSERT(exp->exp_connect_flags & OBD_CONNECT_SOM);
148 LASSERT(epoch_close);
149 /* MDS has instructed us to obtain Size-on-MDS attribute from
150 * OSTs and send setattr to back to MDS. */
151 rc = ll_sizeonmds_update(inode, och->och_mod,
152 &och->och_fh, op_data->op_ioepoch);
154 CERROR("inode %lu mdc Size-on-MDS update failed: "
155 "rc = %d\n", inode->i_ino, rc);
159 CERROR("inode %lu mdc close failed: rc = %d\n",
162 ll_finish_md_op_data(op_data);
165 rc = ll_objects_destroy(req, inode);
167 CERROR("inode %lu ll_objects destroy: rc = %d\n",
/* SOM write close that did not close the epoch: defer via DONE_WRITING */
174 if ((exp->exp_connect_flags & OBD_CONNECT_SOM) && !epoch_close &&
175 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
176 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
179 ptlrpc_close_replay_seq(req);
180 md_clear_open_replay_data(md_exp, och);
181 /* Free @och if it is not waiting for DONE_WRITING. */
182 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
185 if (req) /* This is close request */
186 ptlrpc_req_finished(req);
/* Close the per-mode (write/exec/read) cached MDS open handle on @inode
 * if no other local users remain. @flags selects which handle via
 * FMODE_WRITE/FMODE_EXEC/FMODE_READ. Serialized by lli_och_sem. */
190 int ll_md_real_close(struct inode *inode, int flags)
192 struct ll_inode_info *lli = ll_i2info(inode);
193 struct obd_client_handle **och_p;
194 struct obd_client_handle *och;
199 if (flags & FMODE_WRITE) {
200 och_p = &lli->lli_mds_write_och;
201 och_usecount = &lli->lli_open_fd_write_count;
202 } else if (flags & FMODE_EXEC) {
203 och_p = &lli->lli_mds_exec_och;
204 och_usecount = &lli->lli_open_fd_exec_count;
206 LASSERT(flags & FMODE_READ);
207 och_p = &lli->lli_mds_read_och;
208 och_usecount = &lli->lli_open_fd_read_count;
211 down(&lli->lli_och_sem);
212 if (*och_usecount) { /* There are still users of this handle, so
214 up(&lli->lli_och_sem);
219 up(&lli->lli_och_sem);
221 if (och) { /* There might be a race and somebody have freed this och
223 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/* Per-file-descriptor close: drop any group lock, decrement the per-mode
 * open count under lli_och_sem, and only talk to the MDS (via
 * ll_md_real_close()) when no cached OPEN DLM lock still covers the
 * handle. Frees the ll_file_data attached to @file. */
230 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
233 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
234 struct ll_inode_info *lli = ll_i2info(inode);
238 /* clear group lock, if present */
239 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
240 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
241 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
242 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP,
246 /* Let's see if we have good enough OPEN lock on the file and if
247 we can skip talking to MDS */
248 if (file->f_dentry->d_inode) { /* Can this ever be false? */
250 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
251 struct lustre_handle lockh;
252 struct inode *inode = file->f_dentry->d_inode;
253 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
255 down(&lli->lli_och_sem);
256 if (fd->fd_omode & FMODE_WRITE) {
258 LASSERT(lli->lli_open_fd_write_count);
259 lli->lli_open_fd_write_count--;
260 } else if (fd->fd_omode & FMODE_EXEC) {
262 LASSERT(lli->lli_open_fd_exec_count);
263 lli->lli_open_fd_exec_count--;
266 LASSERT(lli->lli_open_fd_read_count);
267 lli->lli_open_fd_read_count--;
269 up(&lli->lli_och_sem);
/* LDLM_FL_TEST_LOCK: only probe for a matching OPEN lock, don't take a ref */
271 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
272 LDLM_IBITS, &policy, lockmode,
274 rc = ll_md_real_close(file->f_dentry->d_inode,
278 CERROR("Releasing a file %p with negative dentry %p. Name %s",
279 file, file->f_dentry, file->f_dentry->d_name.name);
282 LUSTRE_FPRIVATE(file) = NULL;
283 ll_file_data_put(fd);
284 ll_capa_close(inode);
289 int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm);
291 /* While this returns an error code, fput() the caller does not, so we need
292 * to make every effort to clean up all of our state here. Also, applications
293 * rarely check close errors and even if an error is returned they will not
294 * re-try the close call.
/* VFS ->release() for Lustre files. Tears down remote-ACL state for the
 * root inode, stops statahead if this fd owned it, clears any async
 * write errors recorded on the stripes, and finally closes the MDS
 * handle via ll_md_close(). */
296 int ll_file_release(struct inode *inode, struct file *file)
298 struct ll_file_data *fd;
299 struct ll_sb_info *sbi = ll_i2sbi(inode);
300 struct ll_inode_info *lli = ll_i2info(inode);
301 struct lov_stripe_md *lsm = lli->lli_smd;
305 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
306 inode->i_generation, inode);
308 #ifdef CONFIG_FS_POSIX_ACL
309 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
310 inode == inode->i_sb->s_root->d_inode) {
311 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
314 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
315 fd->fd_flags &= ~LL_FILE_RMTACL;
316 rct_del(&sbi->ll_rct, cfs_curproc_pid());
317 et_search_free(&sbi->ll_et, cfs_curproc_pid());
322 if (inode->i_sb->s_root != file->f_dentry)
323 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
324 fd = LUSTRE_FPRIVATE(file);
327 /* The last ref on @file, maybe not the the owner pid of statahead.
328 * Different processes can open the same dir, "ll_opendir_key" means:
329 * it is me that should stop the statahead thread. */
330 if (lli->lli_opendir_key == fd && lli->lli_opendir_pid != 0)
331 ll_stop_statahead(inode, fd);
/* root dentry never had an MDS open handle; just drop the fd */
333 if (inode->i_sb->s_root == file->f_dentry) {
334 LUSTRE_FPRIVATE(file) = NULL;
335 ll_file_data_put(fd);
340 lov_test_and_clear_async_rc(lsm);
341 lli->lli_async_rc = 0;
343 rc = ll_md_close(sbi->ll_md_exp, inode, file);
/* Perform an IT_OPEN intent RPC to the MDS for @file, optionally passing
 * striping info (@lmm/@lmmsize) when this open is really a setstripe.
 * On success stores the granted lock on the inode and refreshes the
 * inode from the reply. */
347 static int ll_intent_file_open(struct file *file, void *lmm,
348 int lmmsize, struct lookup_intent *itp)
350 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
351 struct dentry *parent = file->f_dentry->d_parent;
352 const char *name = file->f_dentry->d_name.name;
353 const int len = file->f_dentry->d_name.len;
354 struct md_op_data *op_data;
355 struct ptlrpc_request *req;
362 /* Usually we come here only for NFSD, and we want open lock.
363 But we can also get here with pre 2.6.15 patchless kernels, and in
364 that case that lock is also ok */
365 /* We can also get here if there was cached open handle in revalidate_it
366 * but it disappeared while we were getting from there to ll_file_open.
367 * But this means this file was closed and immediatelly opened which
368 * makes a good candidate for using OPEN lock */
369 /* If lmmsize & lmm are not 0, we are just setting stripe info
370 * parameters. No need for the open lock */
371 if (!lmm && !lmmsize)
372 itp->it_flags |= MDS_OPEN_LOCK;
374 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
375 file->f_dentry->d_inode, name, len,
376 O_RDWR, LUSTRE_OPC_ANY, NULL);
378 RETURN(PTR_ERR(op_data));
380 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
381 0 /*unused */, &req, ll_md_blocking_ast, 0);
382 ll_finish_md_op_data(op_data);
384 /* reason for keep own exit path - don`t flood log
385 * with messages with -ESTALE errors.
/* server granted the open but the intent carries an error: release it */
387 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
388 it_open_error(DISP_OPEN_OPEN, itp))
390 ll_release_openhandle(file->f_dentry, itp);
394 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
395 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
396 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
400 if (itp->d.lustre.it_lock_mode)
401 md_set_lock_data(sbi->ll_md_exp,
402 &itp->d.lustre.it_lock_handle,
403 file->f_dentry->d_inode);
405 rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL);
407 ptlrpc_req_finished(itp->d.lustre.it_data);
408 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
409 ll_intent_drop_lock(itp);
/* Populate @och from the MDT reply carried by intent @it: copy the open
 * file handle and fid, record the open flags and I/O epoch, then
 * register the request for open replay. Returns md_set_open_replay_data()
 * result. */
414 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
415 struct lookup_intent *it, struct obd_client_handle *och)
417 struct ptlrpc_request *req = it->d.lustre.it_data;
418 struct mdt_body *body;
422 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
423 LASSERT(body != NULL); /* reply already checked out */
425 memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
426 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
427 och->och_fid = lli->lli_fid;
428 och->och_flags = it->it_flags;
429 lli->lli_ioepoch = body->ioepoch;
431 return md_set_open_replay_data(md_exp, och, req);
/* Finish the client-local part of an open: fill @och from the intent (if
 * supplied), attach @fd as the file's private data, and initialize
 * readahead state. Asserts the file had no private data yet. */
434 int ll_local_open(struct file *file, struct lookup_intent *it,
435 struct ll_file_data *fd, struct obd_client_handle *och)
437 struct inode *inode = file->f_dentry->d_inode;
438 struct ll_inode_info *lli = ll_i2info(inode);
441 LASSERT(!LUSTRE_FPRIVATE(file));
446 struct ptlrpc_request *req = it->d.lustre.it_data;
447 struct mdt_body *body;
450 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
454 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
455 if ((it->it_flags & FMODE_WRITE) &&
456 (body->valid & OBD_MD_FLSIZE))
457 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
458 lli->lli_ioepoch, PFID(&lli->lli_fid));
461 LUSTRE_FPRIVATE(file) = fd;
462 ll_readahead_init(inode, &fd->fd_ras);
/* remember open mode for the per-mode usecount bookkeeping at close */
463 fd->fd_omode = it->it_flags;
467 /* Open a file, and (for the very first open) create objects on the OSTs at
468 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
469 * creation or open until ll_lov_setstripe() ioctl is called. We grab
470 * lli_open_sem to ensure no other process will create objects, send the
471 * stripe MD to the MDS, or try to destroy the objects if that fails.
473 * If we already have the stripe MD locally then we don't request it in
474 * md_open(), by passing a lmm_size = 0.
476 * It is up to the application to ensure no other processes open this file
477 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
478 * used. We might be able to avoid races of that sort by getting lli_open_sem
479 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
480 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/* VFS ->open() for Lustre files. Sets up statahead ownership for
 * directories, reuses a cached per-mode MDS open handle when one exists
 * (under lli_och_sem), or issues a fresh IT_OPEN intent via
 * ll_intent_file_open() and stores the resulting handle for reuse. */
482 int ll_file_open(struct inode *inode, struct file *file)
484 struct ll_inode_info *lli = ll_i2info(inode);
485 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
486 .it_flags = file->f_flags };
487 struct lov_stripe_md *lsm;
488 struct ptlrpc_request *req = NULL;
489 struct obd_client_handle **och_p;
491 struct ll_file_data *fd;
492 int rc = 0, opendir_set = 0;
495 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
496 inode->i_generation, inode, file->f_flags);
498 #ifdef HAVE_VFS_INTENT_PATCHES
501 it = file->private_data; /* XXX: compat macro */
502 file->private_data = NULL; /* prevent ll_local_open assertion */
505 fd = ll_file_data_get();
/* directory open: claim or reset statahead ownership for this fd */
509 if (S_ISDIR(inode->i_mode)) {
511 spin_lock(&lli->lli_lock);
512 if (lli->lli_opendir_key == NULL && lli->lli_opendir_pid == 0) {
513 LASSERT(lli->lli_sai == NULL);
514 lli->lli_opendir_key = fd;
515 lli->lli_opendir_pid = cfs_curproc_pid();
517 } else if (unlikely(lli->lli_opendir_pid == cfs_curproc_pid() &&
518 lli->lli_opendir_key != NULL)) {
519 /* Two cases for this:
520 * (1) The same process open such directory many times.
521 * (2) The old process opened the directory, and exited
522 * before its children processes. Then new process
523 * with the same pid opens such directory before the
524 * old process's children processes exit.
525 * reset stat ahead for such cases. */
526 spin_unlock(&lli->lli_lock);
527 CDEBUG(D_INFO, "Conflict statahead for %.*s "DFID
528 " reset it.\n", file->f_dentry->d_name.len,
529 file->f_dentry->d_name.name,
530 PFID(&lli->lli_fid));
531 ll_stop_statahead(inode, lli->lli_opendir_key);
534 spin_unlock(&lli->lli_lock);
/* root dentry: no MDS open handle needed, just attach the fd */
537 if (inode->i_sb->s_root == file->f_dentry) {
538 LUSTRE_FPRIVATE(file) = fd;
542 if (!it || !it->d.lustre.it_disposition) {
543 /* Convert f_flags into access mode. We cannot use file->f_mode,
544 * because everything but O_ACCMODE mask was stripped from
546 if ((oit.it_flags + 1) & O_ACCMODE)
548 if (file->f_flags & O_TRUNC)
549 oit.it_flags |= FMODE_WRITE;
551 /* kernel only call f_op->open in dentry_open. filp_open calls
552 * dentry_open after call to open_namei that checks permissions.
553 * Only nfsd_open call dentry_open directly without checking
554 * permissions and because of that this code below is safe. */
555 if (oit.it_flags & FMODE_WRITE)
556 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
558 /* We do not want O_EXCL here, presumably we opened the file
559 * already? XXX - NFS implications? */
560 oit.it_flags &= ~O_EXCL;
566 /* Let's see if we have file open on MDS already. */
567 if (it->it_flags & FMODE_WRITE) {
568 och_p = &lli->lli_mds_write_och;
569 och_usecount = &lli->lli_open_fd_write_count;
570 } else if (it->it_flags & FMODE_EXEC) {
571 och_p = &lli->lli_mds_exec_och;
572 och_usecount = &lli->lli_open_fd_exec_count;
574 och_p = &lli->lli_mds_read_och;
575 och_usecount = &lli->lli_open_fd_read_count;
578 down(&lli->lli_och_sem);
579 if (*och_p) { /* Open handle is present */
580 if (it_disposition(it, DISP_OPEN_OPEN)) {
581 /* Well, there's extra open request that we do not need,
582 let's close it somehow. This will decref request. */
583 rc = it_open_error(DISP_OPEN_OPEN, it);
585 up(&lli->lli_och_sem);
586 ll_file_data_put(fd);
587 GOTO(out_openerr, rc);
589 ll_release_openhandle(file->f_dentry, it);
590 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats,
595 rc = ll_local_open(file, it, fd, NULL);
598 up(&lli->lli_och_sem);
599 ll_file_data_put(fd);
600 GOTO(out_openerr, rc);
603 LASSERT(*och_usecount == 0);
604 if (!it->d.lustre.it_disposition) {
605 /* We cannot just request lock handle now, new ELC code
606 means that one of other OPEN locks for this file
607 could be cancelled, and since blocking ast handler
608 would attempt to grab och_sem as well, that would
609 result in a deadlock */
610 up(&lli->lli_och_sem);
611 it->it_flags |= O_CHECK_STALE;
612 rc = ll_intent_file_open(file, NULL, 0, it);
613 it->it_flags &= ~O_CHECK_STALE;
615 ll_file_data_put(fd);
616 GOTO(out_openerr, rc);
619 /* Got some error? Release the request */
620 if (it->d.lustre.it_status < 0) {
621 req = it->d.lustre.it_data;
622 ptlrpc_req_finished(req);
624 md_set_lock_data(ll_i2sbi(inode)->ll_md_exp,
625 &it->d.lustre.it_lock_handle,
626 file->f_dentry->d_inode);
629 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
631 ll_file_data_put(fd);
632 GOTO(out_och_free, rc = -ENOMEM);
635 req = it->d.lustre.it_data;
637 /* md_intent_lock() didn't get a request ref if there was an
638 * open error, so don't do cleanup on the request here
640 /* XXX (green): Should not we bail out on any error here, not
641 * just open error? */
642 rc = it_open_error(DISP_OPEN_OPEN, it);
644 ll_file_data_put(fd);
645 GOTO(out_och_free, rc);
648 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
649 rc = ll_local_open(file, it, fd, *och_p);
651 ll_file_data_put(fd);
652 GOTO(out_och_free, rc);
655 up(&lli->lli_och_sem);
657 /* Must do this outside lli_och_sem lock to prevent deadlock where
658 different kind of OPEN lock for this same inode gets cancelled
659 by ldlm_cancel_lru */
660 if (!S_ISREG(inode->i_mode))
/* OST objects may be created lazily; skip when delayed or read-only */
667 if (file->f_flags & O_LOV_DELAY_CREATE ||
668 !(file->f_mode & FMODE_WRITE)) {
669 CDEBUG(D_INODE, "object creation was delayed\n");
673 file->f_flags &= ~O_LOV_DELAY_CREATE;
676 ptlrpc_req_finished(req);
678 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
682 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
683 *och_p = NULL; /* OBD_FREE writes some magic there */
686 up(&lli->lli_och_sem);
688 if (opendir_set != 0)
689 ll_stop_statahead(inode, fd);
695 /* Fills the obdo with the attributes for the inode defined by lsm */
/* Fetch OST attributes (size, blocks, times) for @inode's stripes via an
 * async getattr against the data export, then refresh the inode with the
 * fields the OSTs actually returned. Requires the inode to have a stripe
 * MD (lli_smd != NULL). */
696 int ll_inode_getattr(struct inode *inode, struct obdo *obdo)
698 struct ptlrpc_request_set *set;
699 struct ll_inode_info *lli = ll_i2info(inode);
700 struct lov_stripe_md *lsm = lli->lli_smd;
702 struct obd_info oinfo = { { { 0 } } };
706 LASSERT(lsm != NULL);
710 oinfo.oi_oa->o_id = lsm->lsm_object_id;
711 oinfo.oi_oa->o_gr = lsm->lsm_object_gr;
712 oinfo.oi_oa->o_mode = S_IFREG;
713 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
714 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
715 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
716 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
718 oinfo.oi_capa = ll_mdscapa_get(inode);
720 set = ptlrpc_prep_set();
722 CERROR("can't allocate ptlrpc set\n");
725 rc = obd_getattr_async(ll_i2dtexp(inode), &oinfo, set);
727 rc = ptlrpc_set_wait(set);
728 ptlrpc_set_destroy(set);
730 capa_put(oinfo.oi_capa);
/* trust only the fields the OST reply is allowed to carry */
734 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
735 OBD_MD_FLATIME | OBD_MD_FLMTIME |
736 OBD_MD_FLCTIME | OBD_MD_FLSIZE);
738 obdo_refresh_inode(inode, oinfo.oi_oa, oinfo.oi_oa->o_valid);
739 CDEBUG(D_INODE, "objid "LPX64" size %Lu, blocks %llu, blksize %lu\n",
740 lli->lli_smd->lsm_object_id, i_size_read(inode),
741 (unsigned long long)inode->i_blocks,
742 (unsigned long)ll_inode_blksize(inode));
/* Map an extent DLM @lock back to the stripe index it covers within
 * @inode's LOV layout, via obd_get_info(KEY_LOCK_TO_STRIPE), and verify
 * the lock's resource really matches that stripe's object.
 * Returns the stripe index, or -ELDLM_NO_LOCK_DATA on mismatch. */
746 static int ll_lock_to_stripe_offset(struct inode *inode, struct ldlm_lock *lock)
748 struct ll_inode_info *lli = ll_i2info(inode);
749 struct lov_stripe_md *lsm = lli->lli_smd;
750 struct obd_export *exp = ll_i2dtexp(inode);
753 struct ldlm_lock *lock;
754 } key = { .name = KEY_LOCK_TO_STRIPE, .lock = lock };
755 __u32 stripe, vallen = sizeof(stripe);
756 struct lov_oinfo *loinfo;
/* single-stripe files trivially map to stripe 0 */
760 if (lsm->lsm_stripe_count == 1)
761 GOTO(check, stripe = 0);
763 /* get our offset in the lov */
764 rc = obd_get_info(exp, sizeof(key), &key, &vallen, &stripe, lsm);
766 CERROR("obd_get_info: rc = %d\n", rc);
769 LASSERT(stripe < lsm->lsm_stripe_count);
772 loinfo = lsm->lsm_oinfo[stripe];
773 if (!osc_res_name_eq(loinfo->loi_id, loinfo->loi_gr,
774 &lock->l_resource->lr_name)){
775 LDLM_ERROR(lock, "resource doesn't match object "LPU64"/"LPU64,
776 loinfo->loi_id, loinfo->loi_gr);
777 RETURN(-ELDLM_NO_LOCK_DATA);
783 /* Get extra page reference to ensure it is not going away */
/* Extent-lock pin callback: take an extra page-cache reference on the
 * page passed as @data so it cannot be freed while the lock code works
 * on it (released later in ll_page_removal_cb()). */
784 void ll_pin_extent_cb(void *data)
786 struct page *page = data;
788 page_cache_get(page);
793 /* Flush the page from page cache for an extent as its canceled.
794 * Page to remove is delivered as @data.
796 * No one can dirty the extent until we've finished our work and they cannot
797 * enqueue another lock. The DLM protects us from ll_file_read/write here,
798 * but other kernel actors could have pages locked.
800 * If @discard is set, there is no need to write the page if it is dirty.
802 * Called with the DLM lock held. */
/* Evict one page (@data) from the page cache because its covering extent
 * lock is being cancelled: tear down mmaps over it, write it back unless
 * @discard, record write errors on the mapping, then truncate it out and
 * drop the reference taken in ll_pin_extent_cb(). */
803 int ll_page_removal_cb(void *data, int discard)
806 struct page *page = data;
807 struct address_space *mapping;
811 /* We have page reference already from ll_pin_page */
814 /* Already truncated by somebody */
817 mapping = page->mapping;
819 ll_teardown_mmaps(mapping,
820 (__u64)page->index << PAGE_CACHE_SHIFT,
821 ((__u64)page->index<<PAGE_CACHE_SHIFT)|
823 LL_CDEBUG_PAGE(D_PAGE, page, "removing page\n");
825 if (!discard && clear_page_dirty_for_io(page)) {
826 LASSERT(page->mapping);
827 rc = ll_call_writepage(page->mapping->host, page);
828 /* either waiting for io to complete or reacquiring
829 * the lock that the failed writepage released */
831 wait_on_page_writeback(page);
833 CERROR("writepage inode %lu(%p) of page %p "
834 "failed: %d\n", mapping->host->i_ino,
835 mapping->host, page, rc);
/* propagate the writeback failure to a later fsync/close caller */
837 set_bit(AS_ENOSPC, &mapping->flags);
839 set_bit(AS_EIO, &mapping->flags);
841 set_bit(AS_EIO, &mapping->flags);
843 if (page->mapping != NULL) {
844 struct ll_async_page *llap = llap_cast_private(page);
845 /* checking again to account for writeback's lock_page() */
846 LL_CDEBUG_PAGE(D_PAGE, page, "truncating\n");
848 ll_ra_accounting(llap, page->mapping);
849 ll_truncate_complete_page(page);
853 LASSERT(!PageWriteback(page));
/* drop the pin taken in ll_pin_extent_cb() */
855 page_cache_release(page);
/* Blocking/cancel callback for a client extent lock: when the lock is
 * cancelled, shrink the known-minimum-size (KMS) of the affected stripe
 * under the stripe and resource locks, and kick DONE_WRITING handling. */
860 int ll_extent_lock_cancel_cb(struct ldlm_lock *lock, struct ldlm_lock_desc *new,
861 void *data, int flag)
864 struct ll_inode_info *lli;
865 struct lov_stripe_md *lsm;
/* small non-NULL values indicate a corrupted cbdata pointer */
871 if ((unsigned long)data > 0 && (unsigned long)data < 0x1000) {
872 LDLM_ERROR(lock, "cancelling lock with bad data %p", data);
876 inode = ll_inode_from_lock(lock);
879 lli = ll_i2info(inode);
882 if (lli->lli_smd == NULL)
886 stripe = ll_lock_to_stripe_offset(inode, lock);
890 lov_stripe_lock(lsm);
891 lock_res_and_lock(lock);
892 kms = ldlm_extent_shift_kms(lock,
893 lsm->lsm_oinfo[stripe]->loi_kms);
895 if (lsm->lsm_oinfo[stripe]->loi_kms != kms)
896 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
897 lsm->lsm_oinfo[stripe]->loi_kms, kms);
898 lsm->lsm_oinfo[stripe]->loi_kms = kms;
899 unlock_res_and_lock(lock);
900 lov_stripe_unlock(lsm);
901 ll_queue_done_writing(inode, 0);
/* Completion AST for client-side async extent lock enqueues: on grant,
 * update the stripe's RSS/KMS from the lock value block (LVB), wake any
 * waiters, and drop the enqueue's PR reference.
 * NOTE(review): lsm_oinfo is indexed with '.' here but '->' elsewhere in
 * this file — likely from a different tree vintage; confirm against the
 * lov_stripe_md definition in use. */
910 int ll_async_completion_ast(struct ldlm_lock *lock, int flags, void *data)
912 /* XXX ALLOCATE - 160 bytes */
913 struct inode *inode = ll_inode_from_lock(lock);
914 struct ll_inode_info *lli = ll_i2info(inode);
915 struct lustre_handle lockh = { 0 };
920 if (flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED |
921 LDLM_FL_BLOCK_CONV)) {
922 LBUG(); /* not expecting any blocked async locks yet */
923 LDLM_DEBUG(lock, "client-side async enqueue returned a blocked "
925 ldlm_lock_dump(D_OTHER, lock, 0);
926 ldlm_reprocess_all(lock->l_resource);
930 LDLM_DEBUG(lock, "client-side async enqueue: granted/glimpsed");
932 stripe = ll_lock_to_stripe_offset(inode, lock);
936 if (lock->l_lvb_len) {
937 struct lov_stripe_md *lsm = lli->lli_smd;
939 lvb = lock->l_lvb_data;
940 lsm->lsm_oinfo[stripe].loi_rss = lvb->lvb_size;
942 lock_res_and_lock(lock);
943 ll_inode_size_lock(inode, 1);
944 kms = MAX(lsm->lsm_oinfo[stripe].loi_kms, lvb->lvb_size);
945 kms = ldlm_extent_shift_kms(NULL, kms);
946 if (lsm->lsm_oinfo[stripe].loi_kms != kms)
947 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
948 lsm->lsm_oinfo[stripe].loi_kms, kms);
949 lsm->lsm_oinfo[stripe].loi_kms = kms;
950 ll_inode_size_unlock(inode, 1);
951 unlock_res_and_lock(lock);
956 wake_up(&lock->l_waitq);
958 ldlm_lock2handle(lock, &lockh);
959 ldlm_lock_decref(&lockh, LCK_PR);
/* Glimpse AST: a remote client wants our view of the file size. Find the
 * stripe this lock covers and reply with an LVB carrying that stripe's
 * KMS plus the inode times. -ELDLM_NO_LOCK_DATA cases are normal races
 * and get an empty reply rather than console noise. */
964 static int ll_glimpse_callback(struct ldlm_lock *lock, void *reqp)
966 struct ptlrpc_request *req = reqp;
967 struct inode *inode = ll_inode_from_lock(lock);
968 struct ll_inode_info *lli;
969 struct lov_stripe_md *lsm;
975 GOTO(out, rc = -ELDLM_NO_LOCK_DATA);
976 lli = ll_i2info(inode);
978 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
981 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
983 /* First, find out which stripe index this lock corresponds to. */
984 stripe = ll_lock_to_stripe_offset(inode, lock);
986 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
988 req_capsule_extend(&req->rq_pill, &RQF_LDLM_GL_CALLBACK);
989 req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER,
991 rc = req_capsule_server_pack(&req->rq_pill);
993 CERROR("lustre_pack_reply: %d\n", rc);
997 lvb = req_capsule_server_get(&req->rq_pill, &RMF_DLM_LVB);
998 lvb->lvb_size = lli->lli_smd->lsm_oinfo[stripe]->loi_kms;
999 lvb->lvb_mtime = LTIME_S(inode->i_mtime);
1000 lvb->lvb_atime = LTIME_S(inode->i_atime);
1001 lvb->lvb_ctime = LTIME_S(inode->i_ctime);
1003 LDLM_DEBUG(lock, "i_size: %llu -> stripe number %u -> kms "LPU64
1004 " atime "LPU64", mtime "LPU64", ctime "LPU64,
1005 i_size_read(inode), stripe, lvb->lvb_size, lvb->lvb_mtime,
1006 lvb->lvb_atime, lvb->lvb_ctime);
1011 /* These errors are normal races, so we don't want to fill the console
1012 * with messages by calling ptlrpc_error() */
1013 if (rc == -ELDLM_NO_LOCK_DATA)
1014 lustre_pack_reply(req, 1, NULL, NULL);
1016 req->rq_status = rc;
/* Merge the per-stripe lock value blocks into the inode under the size
 * lock: update i_size, i_blocks, and the a/m/ctime from the combined LVB
 * produced by obd_merge_lvb(). */
1020 static int ll_merge_lvb(struct inode *inode)
1022 struct ll_inode_info *lli = ll_i2info(inode);
1023 struct ll_sb_info *sbi = ll_i2sbi(inode);
1029 ll_inode_size_lock(inode, 1);
1030 inode_init_lvb(inode, &lvb);
1031 rc = obd_merge_lvb(sbi->ll_dt_exp, lli->lli_smd, &lvb, 0);
1032 i_size_write(inode, lvb.lvb_size);
1033 inode->i_blocks = lvb.lvb_blocks;
1035 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1036 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1037 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1038 ll_inode_size_unlock(inode, 1);
/* Try to refresh the inode size purely from locally cached PR extent
 * locks: match an existing [0, EOF] lock without a new enqueue, merge
 * the LVBs, and drop the match reference. */
1043 int ll_local_size(struct inode *inode)
1045 ldlm_policy_data_t policy = { .l_extent = { 0, OBD_OBJECT_EOF } };
1046 struct ll_inode_info *lli = ll_i2info(inode);
1047 struct ll_sb_info *sbi = ll_i2sbi(inode);
1048 struct lustre_handle lockh = { 0 };
1053 if (lli->lli_smd->lsm_stripe_count == 0)
1056 rc = obd_match(sbi->ll_dt_exp, lli->lli_smd, LDLM_EXTENT,
1057 &policy, LCK_PR, &flags, inode, &lockh);
1063 rc = ll_merge_lvb(inode);
1064 obd_cancel(sbi->ll_dt_exp, lli->lli_smd, LCK_PR, &lockh);
/* Glimpse the size/times of an arbitrary @lsm (not necessarily attached
 * to an inode) on behalf of an ioctl: enqueue an intent-only PR extent
 * lock over [*, EOF], then merge the stripe LVBs into the caller's stat
 * buffer under the stripe lock. */
1068 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1071 struct lustre_handle lockh = { 0 };
1072 struct ldlm_enqueue_info einfo = { 0 };
1073 struct obd_info oinfo = { { { 0 } } };
1079 einfo.ei_type = LDLM_EXTENT;
1080 einfo.ei_mode = LCK_PR;
1081 einfo.ei_cb_bl = osc_extent_blocking_cb;
1082 einfo.ei_cb_cp = ldlm_completion_ast;
1083 einfo.ei_cb_gl = ll_glimpse_callback;
1084 einfo.ei_cbdata = NULL;
1086 oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
1087 oinfo.oi_lockh = &lockh;
/* LDLM_FL_HAS_INTENT: glimpse only, no conflicting locks are revoked */
1089 oinfo.oi_flags = LDLM_FL_HAS_INTENT;
1091 rc = obd_enqueue_rqset(sbi->ll_dt_exp, &oinfo, &einfo);
1095 CERROR("obd_enqueue returned rc %d, "
1096 "returning -EIO\n", rc);
1097 RETURN(rc > 0 ? -EIO : rc);
1100 lov_stripe_lock(lsm);
1101 memset(&lvb, 0, sizeof(lvb));
1102 obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 0);
1103 st->st_size = lvb.lvb_size;
1104 st->st_blocks = lvb.lvb_blocks;
1105 st->st_mtime = lvb.lvb_mtime;
1106 st->st_atime = lvb.lvb_atime;
1107 st->st_ctime = lvb.lvb_ctime;
1108 lov_stripe_unlock(lsm);
1113 /* NB: obd_merge_lvb will prefer locally cached writes if they extend the
1114 * file (because it prefers KMS over RSS when larger) */
/* Refresh @inode's size via a glimpse enqueue: each client holding a
 * conflicting extent lock answers through ll_glimpse_callback(), and the
 * merged result is folded into the inode by ll_merge_lvb(). Skipped when
 * the MDS already holds the authoritative size (LLIF_MDS_SIZE_LOCK). */
1115 int ll_glimpse_size(struct inode *inode, int ast_flags)
1117 struct ll_inode_info *lli = ll_i2info(inode);
1118 struct ll_sb_info *sbi = ll_i2sbi(inode);
1119 struct lustre_handle lockh = { 0 };
1120 struct ldlm_enqueue_info einfo = { 0 };
1121 struct obd_info oinfo = { { { 0 } } };
1125 if (lli->lli_flags & LLIF_MDS_SIZE_LOCK)
1128 CDEBUG(D_DLMTRACE, "Glimpsing inode %lu\n", inode->i_ino);
1130 if (!lli->lli_smd) {
1131 CDEBUG(D_DLMTRACE, "No objects for inode %lu\n", inode->i_ino);
1135 /* NOTE: this looks like DLM lock request, but it may not be one. Due
1136 * to LDLM_FL_HAS_INTENT flag, this is glimpse request, that
1137 * won't revoke any conflicting DLM locks held. Instead,
1138 * ll_glimpse_callback() will be called on each client
1139 * holding a DLM lock against this file, and resulting size
1140 * will be returned for each stripe. DLM lock on [0, EOF] is
1141 * acquired only if there were no conflicting locks. */
1142 einfo.ei_type = LDLM_EXTENT;
1143 einfo.ei_mode = LCK_PR;
1144 einfo.ei_cb_bl = osc_extent_blocking_cb;
1145 einfo.ei_cb_cp = ldlm_completion_ast;
1146 einfo.ei_cb_gl = ll_glimpse_callback;
1147 einfo.ei_cbdata = inode;
1149 oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
1150 oinfo.oi_lockh = &lockh;
1151 oinfo.oi_md = lli->lli_smd;
1152 oinfo.oi_flags = ast_flags | LDLM_FL_HAS_INTENT;
1154 rc = obd_enqueue_rqset(sbi->ll_dt_exp, &oinfo, &einfo);
1158 CERROR("obd_enqueue returned rc %d, returning -EIO\n", rc);
1159 RETURN(rc > 0 ? -EIO : rc);
1162 rc = ll_merge_lvb(inode);
1164 CDEBUG(D_DLMTRACE, "glimpse: size: %llu, blocks: %llu\n",
1165 i_size_read(inode), (unsigned long long)inode->i_blocks);
/* Take a DLM extent lock of @mode over @policy's range on @inode's
 * stripes, then refresh the inode from the merged LVB under the size
 * lock. Skipped entirely when locking is disabled for this fd or mount
 * (LL_FILE_IGNORE_LOCK / LL_SBI_NOLCK). @policy is updated to the range
 * actually granted. */
1170 int ll_extent_lock(struct ll_file_data *fd, struct inode *inode,
1171 struct lov_stripe_md *lsm, int mode,
1172 ldlm_policy_data_t *policy, struct lustre_handle *lockh,
1175 struct ll_sb_info *sbi = ll_i2sbi(inode);
1177 struct ldlm_enqueue_info einfo = { 0 };
1178 struct obd_info oinfo = { { { 0 } } };
1182 LASSERT(!lustre_handle_is_used(lockh));
1183 LASSERT(lsm != NULL);
1185 /* XXX phil: can we do this? won't it screw the file size up? */
1186 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1187 (sbi->ll_flags & LL_SBI_NOLCK))
1190 CDEBUG(D_DLMTRACE, "Locking inode %lu, start "LPU64" end "LPU64"\n",
1191 inode->i_ino, policy->l_extent.start, policy->l_extent.end);
1193 einfo.ei_type = LDLM_EXTENT;
1194 einfo.ei_mode = mode;
1195 einfo.ei_cb_bl = osc_extent_blocking_cb;
1196 einfo.ei_cb_cp = ldlm_completion_ast;
1197 einfo.ei_cb_gl = ll_glimpse_callback;
1198 einfo.ei_cbdata = inode;
1200 oinfo.oi_policy = *policy;
1201 oinfo.oi_lockh = lockh;
1203 oinfo.oi_flags = ast_flags;
1205 rc = obd_enqueue(sbi->ll_dt_exp, &oinfo, &einfo, NULL);
1206 *policy = oinfo.oi_policy;
1210 ll_inode_size_lock(inode, 1);
1211 inode_init_lvb(inode, &lvb);
1212 obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 1);
/* only a whole-file lock may safely overwrite i_size from the kms */
1214 if (policy->l_extent.start == 0 &&
1215 policy->l_extent.end == OBD_OBJECT_EOF) {
1216 /* vmtruncate()->ll_truncate() first sets the i_size and then
1217 * the kms under both a DLM lock and the
1218 * ll_inode_size_lock(). If we don't get the
1219 * ll_inode_size_lock() here we can match the DLM lock and
1220 * reset i_size from the kms before the truncating path has
1221 * updated the kms. generic_file_write can then trust the
1222 * stale i_size when doing appending writes and effectively
1223 * cancel the result of the truncate. Getting the
1224 * ll_inode_size_lock() after the enqueue maintains the DLM
1225 * -> ll_inode_size_lock() acquiring order. */
1226 i_size_write(inode, lvb.lvb_size);
1227 CDEBUG(D_INODE, "inode=%lu, updating i_size %llu\n",
1228 inode->i_ino, i_size_read(inode));
1232 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1233 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1234 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1236 ll_inode_size_unlock(inode, 1);
/*
 * Release an extent lock previously taken by ll_extent_lock().
 * Mirrors the same IGNORE_LOCK / NOLCK bypass as the lock path so that
 * lock and unlock decisions stay symmetric for a given fd/superblock.
 */
1241 int ll_extent_unlock(struct ll_file_data *fd, struct inode *inode,
1242 struct lov_stripe_md *lsm, int mode,
1243 struct lustre_handle *lockh)
1245 struct ll_sb_info *sbi = ll_i2sbi(inode);
1249 /* XXX phil: can we do this? won't it screw the file size up? */
1250 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1251 (sbi->ll_flags & LL_SBI_NOLCK))
/* Cancel the lock on the data (OST) export. */
1254 rc = obd_cancel(sbi->ll_dt_exp, lsm, mode, lockh);
/*
 * Mark the inode as contended and record when contention was observed.
 * ll_is_file_contended() later uses lli_contention_time to age the flag
 * out. lli_lock serializes updates to lli_flags/lli_contention_time.
 */
1259 static void ll_set_file_contended(struct inode *inode)
1261 struct ll_inode_info *lli = ll_i2info(inode);
1262 cfs_time_t now = cfs_time_current();
1264 spin_lock(&lli->lli_lock);
1265 lli->lli_contention_time = now;
1266 lli->lli_flags |= LLIF_CONTENDED;
1267 spin_unlock(&lli->lli_lock);
/*
 * Clear the contended state on the inode (counterpart of
 * ll_set_file_contended()); protected by the same lli_lock spinlock.
 */
1270 void ll_clear_file_contended(struct inode *inode)
1272 struct ll_inode_info *lli = ll_i2info(inode);
1274 spin_lock(&lli->lli_lock);
1275 lli->lli_flags &= ~LLIF_CONTENDED;
1276 spin_unlock(&lli->lli_lock);
/*
 * Decide whether I/O on this file should go lockless because the file is
 * contended.  Requires server OBD_CONNECT_SRVLOCK support; LLIF_CONTENDED
 * expires after sbi->ll_contention_time seconds.
 * NOTE(review): the actual return statements fall in lines missing from
 * this listing — confirm the 0/1 polarity against the full file.
 */
1279 static int ll_is_file_contended(struct file *file)
1281 struct inode *inode = file->f_dentry->d_inode;
1282 struct ll_inode_info *lli = ll_i2info(inode);
1283 struct ll_sb_info *sbi = ll_i2sbi(inode);
1284 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
/* Lockless (server-side lock) I/O needs the SRVLOCK connect flag. */
1287 if (!(sbi->ll_lco.lco_flags & OBD_CONNECT_SRVLOCK)) {
1288 CDEBUG(D_INFO, "the server does not support SRVLOCK feature,"
1289 " osc connect flags = 0x"LPX64"\n",
1290 sbi->ll_lco.lco_flags);
1293 if (fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK))
1295 if (lli->lli_flags & LLIF_CONTENDED) {
1296 cfs_time_t cur_time = cfs_time_current();
1297 cfs_time_t retry_time;
/* Contention ages out after ll_contention_time seconds. */
1299 retry_time = cfs_time_add(
1300 lli->lli_contention_time,
1301 cfs_time_seconds(sbi->ll_contention_time));
1302 if (cfs_time_after(cur_time, retry_time)) {
1303 ll_clear_file_contended(inode);
/*
 * Try to take a tree (DLM extent) lock for the I/O range [start, end].
 * Appending writes always take the lock; otherwise a contended file is
 * left unlocked so the caller can fall back to lockless I/O.
 * Returns via RETURN(tree_locked): 1 locked, 0 not locked; -EUSERS from
 * ll_tree_lock() marks the file contended (error paths partly elided in
 * this listing).
 */
1311 static int ll_file_get_tree_lock(struct ll_lock_tree *tree, struct file *file,
1312 const char *buf, size_t count,
1313 loff_t start, loff_t end, int rw)
1316 int tree_locked = 0;
1318 struct inode * inode = file->f_dentry->d_inode;
1321 append = (rw == OBD_BRW_WRITE) && (file->f_flags & O_APPEND);
1323 if (append || !ll_is_file_contended(file)) {
1324 struct ll_lock_tree_node *node;
/* DENY_ON_CONTENTION lets the server refuse the lock when the extent is
 * being fought over; O_APPEND must always get the lock. */
1327 ast_flags = append ? 0 : LDLM_FL_DENY_ON_CONTENTION;
1328 if (file->f_flags & O_NONBLOCK)
1329 ast_flags |= LDLM_FL_BLOCK_NOWAIT;
1330 node = ll_node_from_inode(inode, start, end,
1331 (rw == OBD_BRW_WRITE) ? LCK_PW : LCK_PR);
1336 tree->lt_fd = LUSTRE_FPRIVATE(file);
1337 rc = ll_tree_lock(tree, node, buf, count, ast_flags);
1340 else if (rc == -EUSERS)
/* -EUSERS: server says the extent is contended; remember that. */
1341 ll_set_file_contended(inode);
1345 RETURN(tree_locked);
1351 * Checks if requested extent lock is compatible with a lock under a page.
1353 * Checks if the lock under \a page is compatible with a read or write lock
1354 * (specified by \a rw) for an extent [\a start , \a end].
1356 * \param page the page under which lock is considered
1357 * \param rw OBD_BRW_READ if requested for reading,
1358 * OBD_BRW_WRITE if requested for writing
1359 * \param start start of the requested extent
1360 * \param end end of the requested extent
1361 * \param cookie transparent parameter for passing locking context
1363 * \post result == 1, *cookie == context, appropriate lock is referenced or
1366 * \retval 1 owned lock is reused for the request
1367 * \retval 0 no lock reused for the request
1369 * \see ll_release_short_lock
1371 static int ll_reget_short_lock(struct page *page, int rw,
1372 obd_off start, obd_off end,
1375 struct ll_async_page *llap;
1376 struct obd_export *exp;
1377 struct inode *inode = page->mapping->host;
1381 exp = ll_i2dtexp(inode);
/* llap_cast_private() recovers the llite async-page state attached to
 * the page; its cookie identifies the lock held under that page. */
1385 llap = llap_cast_private(page);
1389 RETURN(obd_reget_short_lock(exp, ll_i2info(inode)->lli_smd,
1390 &llap->llap_cookie, rw, start, end,
1395 * Releases a reference to a lock taken in a "fast" way.
1397 * Releases a read or a write (specified by \a rw) lock
1398 * referenced by \a cookie.
1400 * \param inode inode to which data belong
1401 * \param end end of the locked extent
1402 * \param rw OBD_BRW_READ if requested for reading,
1403 * OBD_BRW_WRITE if requested for writing
1404 * \param cookie transparent parameter for passing locking context
1406 * \post appropriate lock is dereferenced
1408 * \see ll_reget_short_lock
1410 static void ll_release_short_lock(struct inode *inode, obd_off end,
1411 void *cookie, int rw)
1413 struct obd_export *exp;
1416 exp = ll_i2dtexp(inode);
1420 rc = obd_release_short_lock(exp, ll_i2info(inode)->lli_smd, end,
/* Unlock failure is only logged; there is nothing the caller can do. */
1423 CERROR("unlock failed (%d)\n", rc);
1427 * Checks if requested extent lock is compatible
1428 * with a lock under a page in page cache.
1430 * Checks if a lock under some \a page is compatible with a read or write lock
1431 * (specified by \a rw) for an extent [\a start , \a end].
1433 * \param file the file under which lock is considered
1434 * \param rw OBD_BRW_READ if requested for reading,
1435 * OBD_BRW_WRITE if requested for writing
1436 * \param ppos start of the requested extent
1437 * \param end end of the requested extent
1438 * \param cookie transparent parameter for passing locking context
1439 * \param buf userspace buffer for the data
1441 * \post result == 1, *cookie == context, appropriate lock is referenced
1444 * \retval 1 owned lock is reused for the request
1445 * \retval 0 no lock reused for the request
1447 * \see ll_file_put_fast_lock
1449 static inline int ll_file_get_fast_lock(struct file *file,
1450 obd_off ppos, obd_off end,
1451 char *buf, void **cookie, int rw)
/* Only safe when the user buffer does not overlap a mapping of this
 * region (would deadlock on the page lock otherwise). */
1458 if (!ll_region_mapped((unsigned long)buf, end - ppos)) {
1459 page = find_lock_page(file->f_dentry->d_inode->i_mapping,
1460 ppos >> CFS_PAGE_SHIFT);
1462 if (ll_reget_short_lock(page, rw, ppos, end, cookie))
/* find_lock_page() took a page reference; drop it here. */
1466 page_cache_release(page);
1474 * Releases a reference to a lock taken in a "fast" way.
1476 * Releases a read or a write (specified by \a rw) lock
1477 * referenced by \a cookie.
1479 * \param inode inode to which data belong
1480 * \param end end of the locked extent
1481 * \param rw OBD_BRW_READ if requested for reading,
1482 * OBD_BRW_WRITE if requested for writing
1483 * \param cookie transparent parameter for passing locking context
1485 * \post appropriate lock is dereferenced
1487 * \see ll_file_get_fast_lock
1489 static inline void ll_file_put_fast_lock(struct inode *inode, obd_off end,
1490 void *cookie, int rw)
/* Thin wrapper; the real work is in ll_release_short_lock(). */
1492 ll_release_short_lock(inode, end, cookie, rw);
/* How a read/write path obtained (or skipped) its extent lock; returned
 * by ll_file_get_lock() and consumed by ll_file_put_lock(). */
1495 enum ll_lock_style {
1496 LL_LOCK_STYLE_NOLOCK = 0,
1497 LL_LOCK_STYLE_FASTLOCK = 1,
1498 LL_LOCK_STYLE_TREELOCK = 2
1502 * Checks if requested extent lock is compatible with a lock
1503 * under a page cache page.
1505 * Checks if the lock under \a page is compatible with a read or write lock
1506 * (specified by \a rw) for an extent [\a start , \a end].
1508 * \param file file under which I/O is processed
1509 * \param rw OBD_BRW_READ if requested for reading,
1510 * OBD_BRW_WRITE if requested for writing
1511 * \param ppos start of the requested extent
1512 * \param end end of the requested extent
1513 * \param cookie transparent parameter for passing locking context
1514 * (only used with LL_LOCK_STYLE_FASTLOCK)
1515 * \param tree lock tree (only used with LL_LOCK_STYLE_TREELOCK)
1516 * \param buf userspace buffer for the data
1518 * \retval LL_LOCK_STYLE_FASTLOCK owned lock is reused through fast lock
1519 * \retval LL_LOCK_STYLE_TREELOCK got a lock through tree lock
1520 * \retval LL_LOCK_STYLE_NOLOCK got no lock
1522 * \see ll_file_put_lock
1524 static inline int ll_file_get_lock(struct file *file, obd_off ppos,
1525 obd_off end, char *buf, void **cookie,
1526 struct ll_lock_tree *tree, int rw)
/* Fast path first: reuse a lock already held under a cached page. */
1532 if (ll_file_get_fast_lock(file, ppos, end, buf, cookie, rw))
1533 RETURN(LL_LOCK_STYLE_FASTLOCK);
/* NOTE(review): "ppos - end" as the count argument looks inverted
 * (end - ppos would be the length) — confirm against the full file. */
1535 rc = ll_file_get_tree_lock(tree, file, buf, ppos - end, ppos, end, rw);
1536 /* rc: 1 for tree lock, 0 for no lock, <0 for error */
1539 RETURN(LL_LOCK_STYLE_TREELOCK);
1541 RETURN(LL_LOCK_STYLE_NOLOCK);
1544 /* an error happened if we reached this point, rc = -errno here */
1549 * Drops the lock taken by ll_file_get_lock.
1551 * Releases a read or a write (specified by \a rw) lock
1552 * referenced by \a tree or \a cookie.
1554 * \param inode inode to which data belong
1555 * \param end end of the locked extent
1556 * \param lockstyle facility through which the lock was taken
1557 * \param rw OBD_BRW_READ if requested for reading,
1558 * OBD_BRW_WRITE if requested for writing
1559 * \param cookie transparent parameter for passing locking context
1560 * (only used with LL_LOCK_STYLE_FASTLOCK)
1561 * \param tree lock tree (only used with LL_LOCK_STYLE_TREELOCK)
1563 * \post appropriate lock is dereferenced
1565 * \see ll_file_get_lock
1567 static inline void ll_file_put_lock(struct inode *inode, obd_off end,
1568 enum ll_lock_style lock_style,
1569 void *cookie, struct ll_lock_tree *tree,
/* Dispatch on how the lock was obtained; break statements between the
 * cases are in lines elided from this listing. */
1573 switch (lock_style) {
1574 case LL_LOCK_STYLE_TREELOCK:
1575 ll_tree_unlock(tree);
1577 case LL_LOCK_STYLE_FASTLOCK:
1578 ll_file_put_fast_lock(inode, end, cookie, rw);
1581 CERROR("invalid locking style (%d)\n", lock_style);
/*
 * read(2) entry point for Lustre regular files.
 *
 * Outline (several connective lines are elided in this listing):
 *  - zero-length reads return immediately;
 *  - files with no OST objects are served as zero-filled data up to
 *    i_size (clear_user path);
 *  - otherwise the request is split into chunks of at most
 *    sbi->ll_max_rw_chunk, each chunk locked via ll_file_get_lock()
 *    (fast lock / tree lock / lockless), i_size made >= kms under
 *    ll_inode_size_lock(), then handed to generic_file_read() or
 *    ll_file_lockless_io().
 */
1585 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1588 struct inode *inode = file->f_dentry->d_inode;
1589 struct ll_inode_info *lli = ll_i2info(inode);
1590 struct lov_stripe_md *lsm = lli->lli_smd;
1591 struct ll_sb_info *sbi = ll_i2sbi(inode);
1592 struct ll_lock_tree tree;
1594 struct ll_ra_read bead;
1597 ssize_t retval, chunk, sum = 0;
1603 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1604 inode->i_ino, inode->i_generation, inode, count, *ppos);
1605 /* "If nbyte is 0, read() will return 0 and have no other results."
1606 * -- Single Unix Spec */
1610 ll_stats_ops_tally(sbi, LPROC_LL_READ_BYTES, count);
1613 /* Read on file with no objects should return zero-filled
1614 * buffers up to file size (we can get non-zero sizes with
1615 * mknod + truncate, then opening file for read. This is a
1616 * common pattern in NFS case, it seems). Bug 6243 */
1618 /* Since there are no objects on OSTs, we have nothing to get
1619 * lock on and so we are forced to access inode->i_size
1622 /* Read beyond end of file */
1623 if (*ppos >= i_size_read(inode))
1626 if (count > i_size_read(inode) - *ppos)
1627 count = i_size_read(inode) - *ppos;
1628 /* Make sure to correctly adjust the file pos pointer for
1630 notzeroed = clear_user(buf, count);
/* Chunked path: bound each iteration's extent to the current stripe and
 * to the configured maximum chunk size. */
1638 if (sbi->ll_max_rw_chunk != 0) {
1639 /* first, let's know the end of the current stripe */
1641 obd_extent_calc(sbi->ll_dt_exp, lsm, OBD_CALC_STRIPE_END, &end);
1643 /* correct, the end is beyond the request */
1644 if (end > *ppos + count - 1)
1645 end = *ppos + count - 1;
1647 /* and chunk shouldn't be too large even if striping is wide */
1648 if (end - *ppos > sbi->ll_max_rw_chunk)
1649 end = *ppos + sbi->ll_max_rw_chunk - 1;
1651 end = *ppos + count - 1;
1654 lock_style = ll_file_get_lock(file, (obd_off)(*ppos), end,
1655 buf, &cookie, &tree, OBD_BRW_READ);
1657 GOTO(out, retval = lock_style);
1659 ll_inode_size_lock(inode, 1);
1661 * Consistency guarantees: following possibilities exist for the
1662 * relation between region being read and real file size at this
1665 * (A): the region is completely inside of the file;
1667 * (B-x): x bytes of region are inside of the file, the rest is
1670 * (C): the region is completely outside of the file.
1672 * This classification is stable under DLM lock acquired by
1673 * ll_tree_lock() above, because to change class, other client has to
1674 * take DLM lock conflicting with our lock. Also, any updates to
1675 * ->i_size by other threads on this client are serialized by
1676 * ll_inode_size_lock(). This guarantees that short reads are handled
1677 * correctly in the face of concurrent writes and truncates.
1679 inode_init_lvb(inode, &lvb);
1680 obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 1);
1682 if (*ppos + count - 1 > kms) {
1683 /* A glimpse is necessary to determine whether we return a
1684 * short read (B) or some zeroes at the end of the buffer (C) */
1685 ll_inode_size_unlock(inode, 1);
1686 retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
1688 if (lock_style != LL_LOCK_STYLE_NOLOCK)
1689 ll_file_put_lock(inode, end, lock_style,
1690 cookie, &tree, OBD_BRW_READ);
1694 /* region is within kms and, hence, within real file size (A).
1695 * We need to increase i_size to cover the read region so that
1696 * generic_file_read() will do its job, but that doesn't mean
1697 * the kms size is _correct_, it is only the _minimum_ size.
1698 * If someone does a stat they will get the correct size which
1699 * will always be >= the kms value here. b=11081 */
1700 if (i_size_read(inode) < kms)
1701 i_size_write(inode, kms);
1702 ll_inode_size_unlock(inode, 1);
1705 chunk = end - *ppos + 1;
1706 CDEBUG(D_INODE,"Read ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
1707 inode->i_ino, chunk, *ppos, i_size_read(inode));
1709 if (lock_style != LL_LOCK_STYLE_NOLOCK) {
1710 /* turn off the kernel's read-ahead */
1711 file->f_ra.ra_pages = 0;
1713 /* initialize read-ahead window once per syscall */
1716 bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
1717 bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
1718 ll_ra_read_in(file, &bead);
1722 file_accessed(file);
1723 retval = generic_file_read(file, buf, chunk, ppos);
1724 ll_file_put_lock(inode, end, lock_style, cookie, &tree,
/* Lockless fallback when no DLM lock was taken for this chunk. */
1727 retval = ll_file_lockless_io(file, buf, chunk, ppos, READ);
1730 ll_rw_stats_tally(sbi, current->pid, file, chunk, 0);
/* Full chunk consumed and bytes remain: loop for the next chunk
 * (loop construct itself is in elided lines). */
1736 if (retval == chunk && count > 0)
1742 ll_ra_read_ex(file, &bead);
1743 retval = (sum > 0) ? sum : retval;
1748 * Write to a file (through the page cache).
/*
 * write(2) entry point.  Serialized per-inode via lli_write_sem; O_APPEND
 * locks [pos, EOF] so *ppos can be set from the authoritative i_size,
 * other writes are chunked like ll_file_read().  Enforces maxbytes with
 * SIGXFSZ/-EFBIG.  Connective lines (loop, labels, RETURNs) are elided
 * from this listing.
 */
1750 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1753 struct inode *inode = file->f_dentry->d_inode;
1754 struct ll_sb_info *sbi = ll_i2sbi(inode);
1755 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1756 struct ll_lock_tree tree;
1757 loff_t maxbytes = ll_file_maxbytes(inode);
1758 loff_t lock_start, lock_end, end;
1759 ssize_t retval, chunk, sum = 0;
1763 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1764 inode->i_ino, inode->i_generation, inode, count, *ppos);
1766 SIGNAL_MASK_ASSERT(); /* XXX BUG 1511 */
1768 /* POSIX, but surprised the VFS doesn't check this already */
1772 /* If file was opened for LL_IOC_LOV_SETSTRIPE but the ioctl wasn't
1773 * called on the file, don't fail the below assertion (bug 2388). */
1774 if (file->f_flags & O_LOV_DELAY_CREATE &&
1775 ll_i2info(inode)->lli_smd == NULL)
1778 LASSERT(ll_i2info(inode)->lli_smd != NULL);
/* One writer at a time per inode on this client. */
1780 down(&ll_i2info(inode)->lli_write_sem);
1783 chunk = 0; /* just to fix gcc's warning */
1784 end = *ppos + count - 1;
1786 if (file->f_flags & O_APPEND) {
/* Append: must lock to EOF since the final offset is unknown yet. */
1788 lock_end = OBD_OBJECT_EOF;
1789 } else if (sbi->ll_max_rw_chunk != 0) {
1790 /* first, let's know the end of the current stripe */
1792 obd_extent_calc(sbi->ll_dt_exp, lsm, OBD_CALC_STRIPE_END,
1795 /* correct, the end is beyond the request */
1796 if (end > *ppos + count - 1)
1797 end = *ppos + count - 1;
1799 /* and chunk shouldn't be too large even if striping is wide */
1800 if (end - *ppos > sbi->ll_max_rw_chunk)
1801 end = *ppos + sbi->ll_max_rw_chunk - 1;
1806 lock_end = *ppos + count - 1;
1809 tree_locked = ll_file_get_tree_lock(&tree, file, buf, count,
1810 lock_start, lock_end, OBD_BRW_WRITE);
1811 if (tree_locked < 0)
1812 GOTO(out, retval = tree_locked);
1814 /* This is ok, g_f_w will overwrite this under i_sem if it races
1815 * with a local truncate, it just makes our maxbyte checking easier.
1816 * The i_size value gets updated in ll_extent_lock() as a consequence
1817 * of the [0,EOF] extent lock we requested above. */
1818 if (file->f_flags & O_APPEND) {
1819 *ppos = i_size_read(inode);
1820 end = *ppos + count - 1;
1823 if (*ppos >= maxbytes) {
/* POSIX: writing past the limit raises SIGXFSZ and fails with EFBIG. */
1824 send_sig(SIGXFSZ, current, 0);
1825 GOTO(out_unlock, retval = -EFBIG);
1827 if (end > maxbytes - 1)
1830 /* generic_file_write handles O_APPEND after getting i_mutex */
1831 chunk = end - *ppos + 1;
1832 CDEBUG(D_INFO, "Writing inode %lu, "LPSZ" bytes, offset %Lu\n",
1833 inode->i_ino, chunk, *ppos);
1835 retval = generic_file_write(file, buf, chunk, ppos);
/* Lockless fallback when no tree lock was obtained. */
1837 retval = ll_file_lockless_io(file, (char*)buf, chunk,
1839 ll_rw_stats_tally(ll_i2sbi(inode), current->pid, file, chunk, 1);
1843 ll_tree_unlock(&tree);
1850 if (retval == chunk && count > 0)
1854 up(&ll_i2info(inode)->lli_write_sem);
1856 retval = (sum > 0) ? sum : retval;
1857 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1858 retval > 0 ? retval : 0);
1863 * Send file content (through pagecache) somewhere with helper
/*
 * sendfile(2) path: takes a PR tree lock over the whole request, makes
 * i_size cover the region (same kms/glimpse logic as ll_file_read()),
 * primes the llite read-ahead window, then delegates to
 * generic_file_sendfile().  Some early-return lines are elided here.
 */
1865 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1866 read_actor_t actor, void *target)
1868 struct inode *inode = in_file->f_dentry->d_inode;
1869 struct ll_inode_info *lli = ll_i2info(inode);
1870 struct lov_stripe_md *lsm = lli->lli_smd;
1871 struct ll_lock_tree tree;
1872 struct ll_lock_tree_node *node;
1874 struct ll_ra_read bead;
1879 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1880 inode->i_ino, inode->i_generation, inode, count, *ppos);
1882 /* "If nbyte is 0, read() will return 0 and have no other results."
1883 * -- Single Unix Spec */
1887 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_READ_BYTES, count);
1888 /* turn off the kernel's read-ahead */
1889 in_file->f_ra.ra_pages = 0;
1891 /* File with no objects, nothing to lock */
1893 RETURN(generic_file_sendfile(in_file, ppos,count,actor,target));
1895 node = ll_node_from_inode(inode, *ppos, *ppos + count - 1, LCK_PR);
1897 RETURN(PTR_ERR(node));
1899 tree.lt_fd = LUSTRE_FPRIVATE(in_file);
1900 rc = ll_tree_lock(&tree, node, NULL, count,
1901 in_file->f_flags & O_NONBLOCK?LDLM_FL_BLOCK_NOWAIT:0);
1905 ll_clear_file_contended(inode);
1906 ll_inode_size_lock(inode, 1);
1908 * Consistency guarantees: following possibilities exist for the
1909 * relation between region being read and real file size at this
1912 * (A): the region is completely inside of the file;
1914 * (B-x): x bytes of region are inside of the file, the rest is
1917 * (C): the region is completely outside of the file.
1919 * This classification is stable under DLM lock acquired by
1920 * ll_tree_lock() above, because to change class, other client has to
1921 * take DLM lock conflicting with our lock. Also, any updates to
1922 * ->i_size by other threads on this client are serialized by
1923 * ll_inode_size_lock(). This guarantees that short reads are handled
1924 * correctly in the face of concurrent writes and truncates.
1926 inode_init_lvb(inode, &lvb);
1927 obd_merge_lvb(ll_i2sbi(inode)->ll_dt_exp, lsm, &lvb, 1);
1929 if (*ppos + count - 1 > kms) {
1930 /* A glimpse is necessary to determine whether we return a
1931 * short read (B) or some zeroes at the end of the buffer (C) */
1932 ll_inode_size_unlock(inode, 1);
1933 retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
1937 /* region is within kms and, hence, within real file size (A) */
1938 i_size_write(inode, kms);
1939 ll_inode_size_unlock(inode, 1);
1942 CDEBUG(D_INFO, "Send ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
1943 inode->i_ino, count, *ppos, i_size_read(inode));
/* Prime llite's own read-ahead window for the whole request. */
1945 bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
1946 bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
1947 ll_ra_read_in(in_file, &bead);
1949 file_accessed(in_file);
1950 retval = generic_file_sendfile(in_file, ppos, count, actor, target);
1951 ll_ra_read_ex(in_file, &bead);
1954 ll_tree_unlock(&tree);
/*
 * LL_IOC_RECREATE_OBJ ioctl helper: ask the LOV layer to recreate a lost
 * OST object (id/group/ost index copied in from userspace).  Root-only
 * (CFS_CAP_SYS_ADMIN).  Allocation of 'oa' and some error branches are in
 * lines elided from this listing.
 */
1958 static int ll_lov_recreate_obj(struct inode *inode, struct file *file,
1961 struct ll_inode_info *lli = ll_i2info(inode);
1962 struct obd_export *exp = ll_i2dtexp(inode);
1963 struct ll_recreate_obj ucreatp;
1964 struct obd_trans_info oti = { 0 };
1965 struct obdo *oa = NULL;
1968 struct lov_stripe_md *lsm, *lsm2;
1971 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1974 if (copy_from_user(&ucreatp, (struct ll_recreate_obj *)arg,
1975 sizeof(struct ll_recreate_obj)))
/* lli_size_sem pins lli_smd while we copy and use it. */
1982 down(&lli->lli_size_sem);
1985 GOTO(out, rc = -ENOENT);
1986 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1987 (lsm->lsm_stripe_count));
1989 OBD_ALLOC(lsm2, lsm_size);
1991 GOTO(out, rc = -ENOMEM);
/* OBD_FL_RECREATE_OBJS tells obd_create() to recreate this exact
 * object rather than allocate a fresh one. */
1993 oa->o_id = ucreatp.lrc_id;
1994 oa->o_gr = ucreatp.lrc_group;
1995 oa->o_nlink = ucreatp.lrc_ost_idx;
1996 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1997 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1998 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1999 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
2001 memcpy(lsm2, lsm, lsm_size);
2002 rc = obd_create(exp, oa, &lsm2, &oti);
2004 OBD_FREE(lsm2, lsm_size);
2007 up(&lli->lli_size_sem);
/*
 * Set the striping EA on a file by replaying an IT_OPEN intent carrying
 * the lov_user_md.  Fails if a stripe already exists (the -EEXIST-style
 * early return is in elided lines).  lli_size_sem guards the
 * stripe-exists check against concurrent setstripe.
 */
2012 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
2013 int flags, struct lov_user_md *lum, int lum_size)
2015 struct ll_inode_info *lli = ll_i2info(inode);
2016 struct lov_stripe_md *lsm;
2017 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
2021 down(&lli->lli_size_sem);
2024 up(&lli->lli_size_sem);
2025 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
2030 rc = ll_intent_file_open(file, lum, lum_size, &oit);
2033 if (it_disposition(&oit, DISP_LOOKUP_NEG))
2034 GOTO(out_req_free, rc = -ENOENT);
2035 rc = oit.d.lustre.it_status;
2037 GOTO(out_req_free, rc);
/* The intent opened an MDS handle we don't need; close it again. */
2039 ll_release_openhandle(file->f_dentry, &oit);
2042 up(&lli->lli_size_sem);
2043 ll_intent_release(&oit);
2046 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA for \a filename from the MDS and return it in host
 * endianness (*lmmp, *lmm_size); caller must finish *request.  For
 * LOV_MAGIC_JOIN files the on-disk EA is additionally expanded into a
 * lov_user_md_join with per-stripe extents.  Several GOTO targets and
 * RETURNs are in lines elided from this listing.
 */
2050 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
2051 struct lov_mds_md **lmmp, int *lmm_size,
2052 struct ptlrpc_request **request)
2054 struct ll_sb_info *sbi = ll_i2sbi(inode);
2055 struct mdt_body *body;
2056 struct lov_mds_md *lmm = NULL;
2057 struct ptlrpc_request *req = NULL;
2058 struct obd_capa *oc;
2061 rc = ll_get_max_mdsize(sbi, &lmmsize);
/* Ask the MDS for the named entry's EA (capability-protected). */
2065 oc = ll_mdscapa_get(inode);
2066 rc = md_getattr_name(sbi->ll_md_exp, ll_inode2fid(inode),
2067 oc, filename, strlen(filename) + 1,
2068 OBD_MD_FLEASIZE | OBD_MD_FLDIREA, lmmsize,
2069 ll_i2suppgid(inode), &req);
2072 CDEBUG(D_INFO, "md_getattr_name failed "
2073 "on %s: rc %d\n", filename, rc);
2077 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
2078 LASSERT(body != NULL); /* checked by mdc_getattr_name */
2080 lmmsize = body->eadatasize;
2082 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
2084 GOTO(out, rc = -ENODATA);
2087 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
2088 LASSERT(lmm != NULL);
/* Only V1, V3 and JOIN magics are understood here. */
2090 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
2091 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3)) &&
2092 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_JOIN))) {
2093 GOTO(out, rc = -EPROTO);
2097 * This is coming from the MDS, so is probably in
2098 * little endian. We convert it to host endian before
2099 * passing it to userspace.
2101 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
2102 /* if function called for directory - we should
2103 * avoid swab not existent lsm objects */
2104 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
2105 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
2106 if (S_ISREG(body->mode))
2107 lustre_swab_lov_user_md_objects(
2108 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
2109 ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
2110 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2111 lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
2112 if (S_ISREG(body->mode))
2113 lustre_swab_lov_user_md_objects(
2114 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
2115 ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
2116 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_JOIN)) {
2117 lustre_swab_lov_user_md_join((struct lov_user_md_join *)lmm);
/* JOIN files: unpack the md, then rebuild a user-visible
 * lov_user_md_join with per-stripe extent/object info. */
2121 if (lmm->lmm_magic == LOV_MAGIC_JOIN) {
2122 struct lov_stripe_md *lsm;
2123 struct lov_user_md_join *lmj;
2124 int lmj_size, i, aindex = 0;
2126 rc = obd_unpackmd(sbi->ll_dt_exp, &lsm, lmm, lmmsize);
2128 GOTO(out, rc = -ENOMEM);
2129 rc = obd_checkmd(sbi->ll_dt_exp, sbi->ll_md_exp, lsm);
2131 GOTO(out_free_memmd, rc);
2133 lmj_size = sizeof(struct lov_user_md_join) +
2134 lsm->lsm_stripe_count *
2135 sizeof(struct lov_user_ost_data_join);
2136 OBD_ALLOC(lmj, lmj_size);
2138 GOTO(out_free_memmd, rc = -ENOMEM);
2140 memcpy(lmj, lmm, sizeof(struct lov_user_md_join));
2141 for (i = 0; i < lsm->lsm_stripe_count; i++) {
2142 struct lov_extent *lex =
2143 &lsm->lsm_array->lai_ext_array[aindex];
/* Advance to the extent that covers stripe i (aindex++
 * presumably happens in an elided line). */
2145 if (lex->le_loi_idx + lex->le_stripe_count <= i)
2147 CDEBUG(D_INFO, "aindex %d i %d l_extent_start "
2148 LPU64" len %d\n", aindex, i,
2149 lex->le_start, (int)lex->le_len);
2150 lmj->lmm_objects[i].l_extent_start =
/* le_len == -1 marks an extent running to EOF. */
2153 if ((int)lex->le_len == -1)
2154 lmj->lmm_objects[i].l_extent_end = -1;
2156 lmj->lmm_objects[i].l_extent_end =
2157 lex->le_start + lex->le_len;
2158 lmj->lmm_objects[i].l_object_id =
2159 lsm->lsm_oinfo[i]->loi_id;
2160 lmj->lmm_objects[i].l_object_gr =
2161 lsm->lsm_oinfo[i]->loi_gr;
2162 lmj->lmm_objects[i].l_ost_gen =
2163 lsm->lsm_oinfo[i]->loi_ost_gen;
2164 lmj->lmm_objects[i].l_ost_idx =
2165 lsm->lsm_oinfo[i]->loi_ost_idx;
2167 lmm = (struct lov_mds_md *)lmj;
2170 obd_free_memmd(sbi->ll_dt_exp, &lsm);
2174 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA: root-only setstripe variant that accepts a full EA
 * (including object info) from userspace and applies it via
 * ll_lov_setstripe_ea_info() with MDS_OPEN_HAS_OBJS.
 */
2179 static int ll_lov_setea(struct inode *inode, struct file *file,
2182 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2183 struct lov_user_md *lump;
2184 int lum_size = sizeof(struct lov_user_md) +
2185 sizeof(struct lov_user_ost_data);
2189 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2192 OBD_ALLOC(lump, lum_size);
2196 if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
2197 OBD_FREE(lump, lum_size);
2201 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
2203 OBD_FREE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE: copy a v1 lov_user_md from userspace, upgrade to
 * the larger v3 copy if the magic says so, apply it, then (on the path
 * visible here) report the resulting stripe info back to userspace.
 */
2207 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2210 struct lov_user_md_v3 lumv3;
2211 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
2212 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
2213 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
2216 int flags = FMODE_WRITE;
2219 /* first try with v1 which is smaller than v3 */
2220 lum_size = sizeof(struct lov_user_md_v1);
2221 if (copy_from_user(lumv1, lumv1p, lum_size))
2224 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
/* V3 magic: re-copy the full (larger) structure. */
2225 lum_size = sizeof(struct lov_user_md_v3);
2226 if (copy_from_user(&lumv3, lumv3p, lum_size))
2230 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
/* Echo resulting striping back to the caller's buffer. */
2232 put_user(0, &lumv1p->lmm_stripe_count);
2233 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
2234 0, ll_i2info(inode)->lli_smd,
/*
 * LL_IOC_LOV_GETSTRIPE: hand the cached stripe md to the LOV layer to
 * format for userspace (the no-stripe early return is in elided lines).
 */
2240 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
2242 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
2247 return obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0, lsm,
/*
 * Take a [0, EOF] LCK_GROUP extent lock with group id \a arg and stash
 * the handle in the fd.  While held, the fd also ignores normal extent
 * locking (LL_FILE_IGNORE_LOCK).  Already-locked error path is elided.
 */
2251 static int ll_get_grouplock(struct inode *inode, struct file *file,
2254 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2255 ldlm_policy_data_t policy = { .l_extent = { .start = 0,
2256 .end = OBD_OBJECT_EOF}};
2257 struct lustre_handle lockh = { 0 };
2258 struct ll_inode_info *lli = ll_i2info(inode);
2259 struct lov_stripe_md *lsm = lli->lli_smd;
2263 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2267 policy.l_extent.gid = arg;
2268 if (file->f_flags & O_NONBLOCK)
2269 flags = LDLM_FL_BLOCK_NOWAIT;
2271 rc = ll_extent_lock(fd, inode, lsm, LCK_GROUP, &policy, &lockh, flags);
2275 fd->fd_flags |= LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK;
/* Remember the handle so ll_put_grouplock() can cancel it. */
2277 memcpy(&fd->fd_cwlockh, &lockh, sizeof(lockh));
/*
 * Drop the group lock taken by ll_get_grouplock(); validates that a
 * group lock is held and that \a arg matches the stored gid before
 * cancelling and clearing the saved handle.
 */
2282 static int ll_put_grouplock(struct inode *inode, struct file *file,
2285 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2286 struct ll_inode_info *lli = ll_i2info(inode);
2287 struct lov_stripe_md *lsm = lli->lli_smd;
2291 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2292 /* Ugh, it's already unlocked. */
2296 if (fd->fd_gid != arg) /* Ugh? Unlocking with different gid? */
2299 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
2301 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP, &fd->fd_cwlockh);
2306 memset(&fd->fd_cwlockh, 0, sizeof(fd->fd_cwlockh));
2311 #if LUSTRE_FIX >= 50
/*
 * Validate a file-join request: server must advertise LL_SBI_JOIN, both
 * inodes must be regular, distinct, and head's size must be a multiple
 * of JOIN_FILE_ALIGN (64K).  Error-return lines are elided here.
 */
2312 static int join_sanity_check(struct inode *head, struct inode *tail)
2315 if ((ll_i2sbi(head)->ll_flags & LL_SBI_JOIN) == 0) {
2316 CERROR("server do not support join \n");
2319 if (!S_ISREG(tail->i_mode) || !S_ISREG(head->i_mode)) {
2320 CERROR("tail ino %lu and ino head %lu must be regular\n",
2321 head->i_ino, tail->i_ino);
2324 if (head->i_ino == tail->i_ino) {
2325 CERROR("file %lu can not be joined to itself \n", head->i_ino);
2328 if (i_size_read(head) % JOIN_FILE_ALIGN) {
2329 CERROR("hsize %llu must be times of 64K\n", i_size_read(head));
/*
 * Issue the MDS-side join: an IT_OPEN intent with O_JOIN_FILE on the
 * head, carrying head's size and the tail's name in the op_data.  Any
 * lock granted with the intent is dropped immediately, then the open
 * handle is released.
 */
2335 static int join_file(struct inode *head_inode, struct file *head_filp,
2336 struct file *tail_filp)
2338 struct dentry *tail_dentry = tail_filp->f_dentry;
2339 struct lookup_intent oit = {.it_op = IT_OPEN,
2340 .it_flags = head_filp->f_flags|O_JOIN_FILE};
2341 struct ldlm_enqueue_info einfo = { LDLM_IBITS, LCK_CW,
2342 ll_md_blocking_ast, ldlm_completion_ast, NULL, NULL, NULL };
2344 struct lustre_handle lockh;
2345 struct md_op_data *op_data;
2350 tail_dentry = tail_filp->f_dentry;
/* 'data' carries head's current size to the MDS via op_data. */
2352 data = i_size_read(head_inode);
2353 op_data = ll_prep_md_op_data(NULL, head_inode,
2354 tail_dentry->d_parent->d_inode,
2355 tail_dentry->d_name.name,
2356 tail_dentry->d_name.len, 0,
2357 LUSTRE_OPC_ANY, &data);
2358 if (IS_ERR(op_data))
2359 RETURN(PTR_ERR(op_data));
2361 rc = md_enqueue(ll_i2mdexp(head_inode), &einfo, &oit,
2362 op_data, &lockh, NULL, 0, NULL, 0);
2364 ll_finish_md_op_data(op_data);
2368 rc = oit.d.lustre.it_status;
2370 if (rc < 0 || it_open_error(DISP_OPEN_OPEN, &oit)) {
2371 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, &oit);
2372 ptlrpc_req_finished((struct ptlrpc_request *)
2373 oit.d.lustre.it_data);
2377 if (oit.d.lustre.it_lock_mode) { /* If we got lock - release it right
2379 ldlm_lock_decref(&lockh, oit.d.lustre.it_lock_mode);
2380 oit.d.lustre.it_lock_mode = 0;
2382 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
2383 it_clear_disposition(&oit, DISP_ENQ_COMPLETE);
2384 ll_release_openhandle(head_filp->f_dentry, &oit);
2386 ll_intent_release(&oit);
/*
 * Join the file named \a filename_tail onto \a head.  Opens the tail,
 * takes [0, EOF] LCK_EX tree locks on both inodes in ascending-ino order
 * (deadlock avoidance), sanity-checks, performs the join, then unwinds
 * through cleanup_phase.  On success head's cached stripe md is
 * invalidated so it is refetched.  Cleanup_phase increments and some
 * GOTOs are in lines elided from this listing.
 */
2390 static int ll_file_join(struct inode *head, struct file *filp,
2391 char *filename_tail)
2393 struct inode *tail = NULL, *first = NULL, *second = NULL;
2394 struct dentry *tail_dentry;
2395 struct file *tail_filp, *first_filp, *second_filp;
2396 struct ll_lock_tree first_tree, second_tree;
2397 struct ll_lock_tree_node *first_node, *second_node;
2398 struct ll_inode_info *hlli = ll_i2info(head), *tlli;
2399 int rc = 0, cleanup_phase = 0;
2402 CDEBUG(D_VFSTRACE, "VFS Op:head=%lu/%u(%p) tail %s\n",
2403 head->i_ino, head->i_generation, head, filename_tail);
2405 tail_filp = filp_open(filename_tail, O_WRONLY, 0644);
2406 if (IS_ERR(tail_filp)) {
2407 CERROR("Can not open tail file %s", filename_tail);
2408 rc = PTR_ERR(tail_filp);
2411 tail = igrab(tail_filp->f_dentry->d_inode);
2413 tlli = ll_i2info(tail);
2414 tail_dentry = tail_filp->f_dentry;
2415 LASSERT(tail_dentry);
2418 /*reorder the inode for lock sequence*/
2419 first = head->i_ino > tail->i_ino ? head : tail;
2420 second = head->i_ino > tail->i_ino ? tail : head;
2421 first_filp = head->i_ino > tail->i_ino ? filp : tail_filp;
2422 second_filp = head->i_ino > tail->i_ino ? tail_filp : filp;
2424 CDEBUG(D_INFO, "reorder object from %lu:%lu to %lu:%lu \n",
2425 head->i_ino, tail->i_ino, first->i_ino, second->i_ino);
2426 first_node = ll_node_from_inode(first, 0, OBD_OBJECT_EOF, LCK_EX);
2427 if (IS_ERR(first_node)){
2428 rc = PTR_ERR(first_node);
2431 first_tree.lt_fd = first_filp->private_data;
2432 rc = ll_tree_lock(&first_tree, first_node, NULL, 0, 0);
2437 second_node = ll_node_from_inode(second, 0, OBD_OBJECT_EOF, LCK_EX);
2438 if (IS_ERR(second_node)){
2439 rc = PTR_ERR(second_node);
2442 second_tree.lt_fd = second_filp->private_data;
2443 rc = ll_tree_lock(&second_tree, second_node, NULL, 0, 0);
2448 rc = join_sanity_check(head, tail);
2452 rc = join_file(head, filp, tail_filp);
/* Unwind in reverse acquisition order; each phase falls through to the
 * previous one (case labels are in elided lines). */
2456 switch (cleanup_phase) {
2458 ll_tree_unlock(&second_tree);
2459 obd_cancel_unused(ll_i2dtexp(second),
2460 ll_i2info(second)->lli_smd, 0, NULL);
2462 ll_tree_unlock(&first_tree);
2463 obd_cancel_unused(ll_i2dtexp(first),
2464 ll_i2info(first)->lli_smd, 0, NULL);
2466 filp_close(tail_filp, 0);
2469 if (head && rc == 0) {
/* Join succeeded: drop the stale stripe md so it is refetched. */
2470 obd_free_memmd(ll_i2sbi(head)->ll_dt_exp,
2472 hlli->lli_smd = NULL;
2477 CERROR("invalid cleanup_phase %d\n", cleanup_phase);
2482 #endif /* LUSTRE_FIX >= 50 */
2485 * Close inode open handle
2487 * \param dentry [in] dentry which contains the inode
2488 * \param it [in,out] intent which contains open info and result
2491 * \retval <0 failure
/*
 * Close the MDS open handle carried by \a it for \a dentry's inode.
 * Used when an intent enqueue opened the file as a side effect but the
 * caller does not want to keep it open (see the lock-release path above).
 */
2493 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2495 struct inode *inode = dentry->d_inode;
2496 struct obd_client_handle *och;
2502 /* Root ? Do nothing. */
2503 if (dentry->d_inode->i_sb->s_root == dentry)
2506 /* No open handle to close? Move away */
2507 if (!it_disposition(it, DISP_OPEN_OPEN))
2510 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2512 OBD_ALLOC(och, sizeof(*och));
2514 GOTO(out, rc = -ENOMEM);
/* Fill the client handle from the intent reply, then close it on the MDS. */
2516 ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
2517 ll_i2info(inode), it, och);
2519 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/* Drop the request reference that ll_file_open would otherwise consume. */
2522 /* this one is in place of ll_file_open */
2523 if (it_disposition(it, DISP_ENQ_OPEN_REF))
2524 ptlrpc_req_finished(it->d.lustre.it_data);
2525 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2530 * Get size for inode for which FIEMAP mapping is requested.
2531 * Make the FIEMAP get_info call and returns the result.
/*
 * Serve a FIEMAP extent-mapping request for \a inode by forwarding the
 * \a fiemap buffer (of \a num_bytes) to the data export via obd_get_info.
 * Returns 0 on success or a negative errno.
 */
2533 int ll_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
2536 struct obd_export *exp = ll_i2dtexp(inode);
2537 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
2538 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
2539 int vallen = num_bytes;
2543 /* If the stripe_count > 1 and the application does not understand
2544 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
2546 if (lsm->lsm_stripe_count > 1 &&
2547 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
/* Identify the object and attach current inode attributes to the key. */
2550 fm_key.oa.o_id = lsm->lsm_object_id;
2551 fm_key.oa.o_gr = lsm->lsm_object_gr;
2552 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2554 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLFID | OBD_MD_FLGROUP |
2557 /* If filesize is 0, then there would be no objects for mapping */
2558 if (fm_key.oa.o_size == 0) {
2559 fiemap->fm_mapped_extents = 0;
/* The user's fiemap header rides inside the key; the reply is written
 * back into the same fiemap buffer. */
2563 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
2565 rc = obd_get_info(exp, sizeof(fm_key), &fm_key, &vallen, fiemap, lsm);
2567 CERROR("obd_get_info failed: rc = %d\n", rc);
/*
 * ioctl entry point for regular Lustre files (wired into the
 * file_operations tables below). Dispatches Lustre-specific commands
 * (striping, flags, group locks, FIEMAP, join) and falls through to
 * dynamically registered handlers / obd_iocontrol for anything else.
 */
2572 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
2575 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2579 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2580 inode->i_generation, inode, cmd);
2581 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2583 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2584 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2588 case LL_IOC_GETFLAGS:
2589 /* Get the current value of the file flags */
2590 return put_user(fd->fd_flags, (int *)arg);
2591 case LL_IOC_SETFLAGS:
2592 case LL_IOC_CLRFLAGS:
2593 /* Set or clear specific file flags */
2594 /* XXX This probably needs checks to ensure the flags are
2595 * not abused, and to handle any flag side effects.
2597 if (get_user(flags, (int *) arg))
/* LL_FILE_IGNORE_LOCK is only meaningful for O_DIRECT I/O. */
2600 if (cmd == LL_IOC_SETFLAGS) {
2601 if ((flags & LL_FILE_IGNORE_LOCK) &&
2602 !(file->f_flags & O_DIRECT)) {
2603 CERROR("%s: unable to disable locking on "
2604 "non-O_DIRECT file\n", current->comm);
2608 fd->fd_flags |= flags;
2610 fd->fd_flags &= ~flags;
2613 case LL_IOC_LOV_SETSTRIPE:
2614 RETURN(ll_lov_setstripe(inode, file, arg));
2615 case LL_IOC_LOV_SETEA:
2616 RETURN(ll_lov_setea(inode, file, arg));
2617 case LL_IOC_LOV_GETSTRIPE:
2618 RETURN(ll_lov_getstripe(inode, arg));
2619 case LL_IOC_RECREATE_OBJ:
2620 RETURN(ll_lov_recreate_obj(inode, file, arg));
2621 case EXT3_IOC_FIEMAP: {
2622 struct ll_user_fiemap *fiemap_s;
2623 size_t num_bytes, ret_bytes;
2624 unsigned int extent_count;
2627 /* Get the extent count so we can calculate the size of
2628 * required fiemap buffer */
2629 if (get_user(extent_count,
2630 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
/* NOTE(review): extent_count comes from userspace; the multiplication
 * below is not visibly overflow-checked in this excerpt — confirm a
 * bound (e.g. against FIEMAP_MAX_EXTENTS) exists on an elided line. */
2632 num_bytes = sizeof(*fiemap_s) + (extent_count *
2633 sizeof(struct ll_fiemap_extent));
2634 OBD_VMALLOC(fiemap_s, num_bytes);
2635 if (fiemap_s == NULL)
2638 if (copy_from_user(fiemap_s,(struct ll_user_fiemap __user *)arg,
2640 GOTO(error, rc = -EFAULT);
/* Reject flags we do not support, echoing the incompatible bits back. */
2642 if (fiemap_s->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2643 fiemap_s->fm_flags = fiemap_s->fm_flags &
2644 ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2645 if (copy_to_user((char *)arg, fiemap_s,
2647 GOTO(error, rc = -EFAULT);
2649 GOTO(error, rc = -EBADR);
2652 /* If fm_extent_count is non-zero, read the first extent since
2653 * it is used to calculate end_offset and device from previous
2656 if (copy_from_user(&fiemap_s->fm_extents[0],
2657 (char __user *)arg + sizeof(*fiemap_s),
2658 sizeof(struct ll_fiemap_extent)))
2659 GOTO(error, rc = -EFAULT);
/* FIEMAP_FLAG_SYNC asks for dirty data to be flushed before mapping. */
2662 if (fiemap_s->fm_flags & FIEMAP_FLAG_SYNC) {
2665 rc = filemap_fdatawrite(inode->i_mapping);
2670 rc = ll_fiemap(inode, fiemap_s, num_bytes);
/* Copy back only the header plus the extents actually mapped. */
2674 ret_bytes = sizeof(struct ll_user_fiemap);
2676 if (extent_count != 0)
2677 ret_bytes += (fiemap_s->fm_mapped_extents *
2678 sizeof(struct ll_fiemap_extent));
2680 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
2684 OBD_VFREE(fiemap_s, num_bytes);
2687 case EXT3_IOC_GETFLAGS:
2688 case EXT3_IOC_SETFLAGS:
2689 RETURN(ll_iocontrol(inode, file, cmd, arg));
2690 case EXT3_IOC_GETVERSION_OLD:
2691 case EXT3_IOC_GETVERSION:
2692 RETURN(put_user(inode->i_generation, (int *)arg));
2694 #if LUSTRE_FIX >= 50
2695 /* Allow file join in beta builds to allow debugging */
2699 ftail = getname((const char *)arg);
2701 RETURN(PTR_ERR(ftail));
2702 rc = ll_file_join(inode, file, ftail);
2706 CWARN("file join is not supported in this version of Lustre\n");
2710 case LL_IOC_GROUP_LOCK:
2711 RETURN(ll_get_grouplock(inode, file, arg));
2712 case LL_IOC_GROUP_UNLOCK:
2713 RETURN(ll_put_grouplock(inode, file, arg));
2714 case IOC_OBD_STATFS:
2715 RETURN(ll_obd_statfs(inode, (void *)arg));
2717 /* We need to special case any other ioctls we want to handle,
2718 * to send them to the MDS/OST as appropriate and to properly
2719 * network encode the arg field.
2720 case EXT3_IOC_SETVERSION_OLD:
2721 case EXT3_IOC_SETVERSION:
2723 case LL_IOC_FLUSHCTX:
2724 RETURN(ll_flush_ctx(inode));
/* Default: try dynamically registered handlers first (see
 * ll_iocontrol_register below), then punt to the data export. */
2729 ll_iocontrol_call(inode, file, cmd, arg, &err))
2732 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
/*
 * llseek for Lustre files. For SEEK_END the cluster-wide file size must
 * first be refreshed with a glimpse lock, since another client may have
 * extended the file.
 */
2738 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2740 struct inode *inode = file->f_dentry->d_inode;
2741 struct ll_inode_info *lli = ll_i2info(inode);
2742 struct lov_stripe_md *lsm = lli->lli_smd;
2745 retval = offset + ((origin == 2) ? i_size_read(inode) :
2746 (origin == 1) ? file->f_pos : 0);
2747 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%Lu=%#Lx(%s)\n",
2748 inode->i_ino, inode->i_generation, inode, retval, retval,
2749 origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR" : "SEEK_SET");
2750 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2752 if (origin == 2) { /* SEEK_END */
2753 int nonblock = 0, rc;
/* O_NONBLOCK maps to a non-blocking glimpse enqueue. */
2755 if (file->f_flags & O_NONBLOCK)
2756 nonblock = LDLM_FL_BLOCK_NOWAIT;
2759 rc = ll_glimpse_size(inode, nonblock);
/* Read i_size under the inode size lock after the glimpse. */
2764 ll_inode_size_lock(inode, 0);
2765 offset += i_size_read(inode);
2766 ll_inode_size_unlock(inode, 0);
2767 } else if (origin == 1) { /* SEEK_CUR */
2768 offset += file->f_pos;
/* Only accept offsets within [0, maxbytes]; update f_pos on change. */
2772 if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
2773 if (offset != file->f_pos) {
2774 file->f_pos = offset;
/*
 * fsync for Lustre files: wait for page writeback, pick up any recorded
 * async write errors, sync the metadata on the MDS, and (for files with
 * striping) sync the data objects on the OSTs.
 */
2782 int ll_fsync(struct file *file, struct dentry *dentry, int data)
2784 struct inode *inode = dentry->d_inode;
2785 struct ll_inode_info *lli = ll_i2info(inode);
2786 struct lov_stripe_md *lsm = lli->lli_smd;
2787 struct ptlrpc_request *req;
2788 struct obd_capa *oc;
2791 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2792 inode->i_generation, inode);
2793 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2795 /* fsync's caller has already called _fdata{sync,write}, we want
2796 * that IO to finish before calling the osc and mdc sync methods */
2797 rc = filemap_fdatawait(inode->i_mapping);
2799 /* catch async errors that were recorded back when async writeback
2800 * failed for pages in this mapping. */
2801 err = lli->lli_async_rc;
2802 lli->lli_async_rc = 0;
2806 err = lov_test_and_clear_async_rc(lsm);
/* Sync the inode's metadata on the MDS (capa-protected if enabled). */
2811 oc = ll_mdscapa_get(inode);
2812 err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2818 ptlrpc_req_finished(req);
2825 RETURN(rc ? rc : -ENOMEM);
/* Sync the full data extent [0, EOF] on the OSTs for this object. */
2827 oa->o_id = lsm->lsm_object_id;
2828 oa->o_gr = lsm->lsm_object_gr;
2829 oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2830 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
2831 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
2834 oc = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2835 err = obd_sync(ll_i2sbi(inode)->ll_dt_exp, oa, lsm,
2836 0, OBD_OBJECT_EOF, oc);
/*
 * Cluster-coherent fcntl()/flock() locking: translate the kernel
 * file_lock into an LDLM flock enqueue on the MDS, then mirror the
 * result into the local lock tables so the VFS bookkeeping stays
 * consistent. Used by ll_file_operations_flock below.
 */
2846 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2848 struct inode *inode = file->f_dentry->d_inode;
2849 struct ll_sb_info *sbi = ll_i2sbi(inode);
2850 struct ldlm_enqueue_info einfo = { .ei_type = LDLM_FLOCK,
2851 .ei_cb_cp =ldlm_flock_completion_ast,
2852 .ei_cbdata = file_lock };
2853 struct md_op_data *op_data;
2854 struct lustre_handle lockh = {0};
2855 ldlm_policy_data_t flock;
2860 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2861 inode->i_ino, file_lock);
2863 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
/* flock() requests arrive as whole-file locks; fill in the fields the
 * flock path does not set. */
2865 if (file_lock->fl_flags & FL_FLOCK) {
2866 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2867 /* set missing params for flock() calls */
2868 file_lock->fl_end = OFFSET_MAX;
2869 file_lock->fl_pid = current->tgid;
2871 flock.l_flock.pid = file_lock->fl_pid;
2872 flock.l_flock.start = file_lock->fl_start;
2873 flock.l_flock.end = file_lock->fl_end;
/* Map POSIX lock types onto LDLM modes: read -> PR, write -> PW,
 * unlock -> NL (see comment below). */
2875 switch (file_lock->fl_type) {
2877 einfo.ei_mode = LCK_PR;
2880 /* An unlock request may or may not have any relation to
2881 * existing locks so we may not be able to pass a lock handle
2882 * via a normal ldlm_lock_cancel() request. The request may even
2883 * unlock a byte range in the middle of an existing lock. In
2884 * order to process an unlock request we need all of the same
2885 * information that is given with a normal read or write record
2886 * lock request. To avoid creating another ldlm unlock (cancel)
2887 * message we'll treat a LCK_NL flock request as an unlock. */
2888 einfo.ei_mode = LCK_NL;
2891 einfo.ei_mode = LCK_PW;
2894 CERROR("unknown fcntl lock type: %d\n", file_lock->fl_type);
/* Non-blocking set -> BLOCK_NOWAIT; F_GETLK-style test -> TEST_LOCK. */
2909 flags = LDLM_FL_BLOCK_NOWAIT;
2915 flags = LDLM_FL_TEST_LOCK;
2916 /* Save the old mode so that if the mode in the lock changes we
2917 * can decrement the appropriate reader or writer refcount. */
2918 file_lock->fl_type = einfo.ei_mode;
2921 CERROR("unknown fcntl lock command: %d\n", cmd);
2925 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2926 LUSTRE_OPC_ANY, NULL);
2927 if (IS_ERR(op_data))
2928 RETURN(PTR_ERR(op_data));
2930 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2931 "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2932 flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2934 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2935 op_data, &lockh, &flock, 0, NULL /* req */, flags);
2937 ll_finish_md_op_data(op_data);
/* Mirror a successful set (or any unlock) into the local VFS state. */
2939 if ((file_lock->fl_flags & FL_FLOCK) &&
2940 (rc == 0 || file_lock->fl_type == F_UNLCK))
2941 ll_flock_lock_file_wait(file, file_lock, (cmd == F_SETLKW));
2942 #ifdef HAVE_F_OP_FLOCK
2943 if ((file_lock->fl_flags & FL_POSIX) &&
2944 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2945 !(flags & LDLM_FL_TEST_LOCK))
2946 posix_lock_file_wait(file, file_lock);
/*
 * Lock entry point for -o noflock mounts (see ll_file_operations_noflock
 * below, whose header comment says these return ENOSYS); the body is on
 * lines elided from this excerpt.
 */
2952 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
/*
 * Test (without taking a reference) whether this client already holds a
 * granted MDS inodebits lock covering \a bits on \a inode, in any of the
 * CR/CW/PR/PW modes. LDLM_FL_TEST_LOCK makes the match side-effect free.
 */
2959 int ll_have_md_lock(struct inode *inode, __u64 bits)
2961 struct lustre_handle lockh;
2962 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2970 fid = &ll_i2info(inode)->lli_fid;
2971 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2973 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2974 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2975 LCK_CR|LCK_CW|LCK_PR|LCK_PW, &lockh)) {
/*
 * Like ll_have_md_lock(), but actually takes a reference on a matching
 * granted inodebits lock: no TEST_LOCK flag, and the handle is returned
 * to the caller via \a lockh. Returns the matched mode (0 if none).
 */
2981 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2982 struct lustre_handle *lockh)
2984 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2990 fid = &ll_i2info(inode)->lli_fid;
2991 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2993 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING;
2994 rc = md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2995 LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
/*
 * Common tail for the revalidate paths below: translate a getattr result.
 * -ENOENT on a non-regular, non-directory inode means it was unlinked
 * remotely and is treated as success; other errors are logged.
 */
2999 static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
3000 if (rc == -ENOENT) { /* Already unlinked. Just update nlink
3001 * and return success */
3003 /* This path cannot be hit for regular files unless in
3004 * case of obscure races, so no need to validate
3006 if (!S_ISREG(inode->i_mode) &&
3007 !S_ISDIR(inode->i_mode))
3012 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
/*
 * Revalidate \a dentry's inode attributes against the MDS. Two paths:
 * with OBD_CONNECT_ATTRFID, do an intent getattr by FID (no name needed);
 * otherwise, if no UPDATE|LOOKUP inodebits lock is cached, issue a plain
 * md_getattr. Finishes by glimpsing the file size from the OSTs.
 */
3020 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
3022 struct inode *inode = dentry->d_inode;
3023 struct ptlrpc_request *req = NULL;
3024 struct ll_sb_info *sbi;
3025 struct obd_export *exp;
/* NOTE(review): this error fires when inode is NULL here — debugging aid. */
3030 CERROR("REPORT THIS LINE TO PETER\n");
3033 sbi = ll_i2sbi(inode);
3035 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
3036 inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
3038 exp = ll_i2mdexp(inode);
3040 if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
3041 struct lookup_intent oit = { .it_op = IT_GETATTR };
3042 struct md_op_data *op_data;
3044 /* Call getattr by fid, so do not provide name at all. */
3045 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
3046 dentry->d_inode, NULL, 0, 0,
3047 LUSTRE_OPC_ANY, NULL);
3048 if (IS_ERR(op_data))
3049 RETURN(PTR_ERR(op_data));
/* O_CHECK_STALE asks the MDS to verify the cached FID is still valid. */
3051 oit.it_flags |= O_CHECK_STALE;
3052 rc = md_intent_lock(exp, op_data, NULL, 0,
3053 /* we are not interested in name
3056 ll_md_blocking_ast, 0);
3057 ll_finish_md_op_data(op_data);
3058 oit.it_flags &= ~O_CHECK_STALE;
3060 rc = ll_inode_revalidate_fini(inode, rc);
3064 rc = ll_revalidate_it_finish(req, &oit, dentry);
3066 ll_intent_release(&oit);
3070 /* Unlinked? Unhash dentry, so it is not picked up later by
3071 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3072 here to preserve get_cwd functionality on 2.6.
3074 if (!dentry->d_inode->i_nlink) {
3075 spin_lock(&ll_lookup_lock);
3076 spin_lock(&dcache_lock);
3077 ll_drop_dentry(dentry);
3078 spin_unlock(&dcache_lock);
3079 spin_unlock(&ll_lookup_lock);
3082 ll_lookup_finish_locks(&oit, dentry);
3083 } else if (!ll_have_md_lock(dentry->d_inode, MDS_INODELOCK_UPDATE |
3084 MDS_INODELOCK_LOOKUP)) {
3085 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3086 obd_valid valid = OBD_MD_FLGETATTR;
3087 struct obd_capa *oc;
/* Regular files also need the striping EA, sized to the MDS max. */
3090 if (S_ISREG(inode->i_mode)) {
3091 rc = ll_get_max_mdsize(sbi, &ealen);
3094 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3096 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
3097 * capa for this inode. Because we only keep capas of dirs
3099 oc = ll_mdscapa_get(inode);
3100 rc = md_getattr(sbi->ll_md_exp, ll_inode2fid(inode), oc, valid,
3104 rc = ll_inode_revalidate_fini(inode, rc);
3108 rc = ll_prep_inode(&inode, req, NULL);
3113 /* if object not yet allocated, don't validate size */
3114 if (ll_i2info(inode)->lli_smd == NULL)
3117 /* ll_glimpse_size will prefer locally cached writes if they extend
3119 rc = ll_glimpse_size(inode, 0);
3122 ptlrpc_req_finished(req);
/*
 * getattr with an explicit lookup intent: revalidate the inode against
 * the MDS/OSTs, then populate \a stat from the (now fresh) inode fields.
 * i_size/i_blocks are read under the inode size lock.
 */
3126 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
3127 struct lookup_intent *it, struct kstat *stat)
3129 struct inode *inode = de->d_inode;
3132 res = ll_inode_revalidate_it(de, it);
3133 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETATTR, 1);
3138 stat->dev = inode->i_sb->s_dev;
3139 stat->ino = inode->i_ino;
3140 stat->mode = inode->i_mode;
3141 stat->nlink = inode->i_nlink;
3142 stat->uid = inode->i_uid;
3143 stat->gid = inode->i_gid;
3144 stat->rdev = kdev_t_to_nr(inode->i_rdev);
3145 stat->atime = inode->i_atime;
3146 stat->mtime = inode->i_mtime;
3147 stat->ctime = inode->i_ctime;
3148 #ifdef HAVE_INODE_BLKSIZE
3149 stat->blksize = inode->i_blksize;
3151 stat->blksize = 1 << inode->i_blkbits;
/* size/blocks must be sampled atomically w.r.t. concurrent size updates. */
3154 ll_inode_size_lock(inode, 0);
3155 stat->size = i_size_read(inode);
3156 stat->blocks = inode->i_blocks;
3157 ll_inode_size_unlock(inode, 0);
/* Plain getattr: wrap ll_getattr_it() with a fresh IT_GETATTR intent. */
3161 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3163 struct lookup_intent it = { .it_op = IT_GETATTR };
3165 return ll_getattr_it(mnt, de, &it, stat);
/*
 * ACL callback for generic_permission(): check \a mask against the POSIX
 * ACL cached on the inode. The ACL is duplicated under lli_lock so the
 * permission check itself runs without holding the spinlock. Compiles to
 * a no-op body when CONFIG_FS_POSIX_ACL is disabled.
 */
3169 int lustre_check_acl(struct inode *inode, int mask)
3171 #ifdef CONFIG_FS_POSIX_ACL
3172 struct ll_inode_info *lli = ll_i2info(inode);
3173 struct posix_acl *acl;
3177 spin_lock(&lli->lli_lock);
3178 acl = posix_acl_dup(lli->lli_posix_acl);
3179 spin_unlock(&lli->lli_lock);
3184 rc = posix_acl_permission(inode, acl, mask);
3185 posix_acl_release(acl);
3193 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10))
/*
 * permission() for kernels >= 2.6.10: remote-client mounts are checked on
 * the MDS via lustre_check_remote_perm(); otherwise defer to the kernel's
 * generic_permission() with lustre_check_acl as the ACL hook.
 */
3194 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3196 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
3197 inode->i_ino, inode->i_generation, inode, mask);
3198 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
3199 return lustre_check_remote_perm(inode, mask);
3201 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
3202 return generic_permission(inode, mask, lustre_check_acl);
/*
 * permission() fallback for older kernels without the ACL-aware
 * generic_permission(): open-coded owner/group/other mode checks plus
 * ACL and capability overrides, mirroring the classic VFS algorithm.
 */
3205 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3207 int mode = inode->i_mode;
3210 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
3211 inode->i_ino, inode->i_generation, inode, mask);
3213 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
3214 return lustre_check_remote_perm(inode, mask);
3216 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
/* Writes to regular files/dirs/symlinks on a read-only or immutable
 * inode are rejected outright. */
3218 if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
3219 (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
3221 if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
3223 if (current->fsuid == inode->i_uid) {
/* Group bits insufficient: consult the ACL before falling through. */
3226 if (((mode >> 3) & mask & S_IRWXO) != mask)
3228 rc = lustre_check_acl(inode, mask);
3232 goto check_capabilities;
3236 if (in_group_p(inode->i_gid))
3239 if ((mode & mask & S_IRWXO) == mask)
/* Capability overrides: DAC_OVERRIDE for everything but exec of
 * non-executables; DAC_READ_SEARCH for reads and directory searches. */
3243 if (!(mask & MAY_EXEC) ||
3244 (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
3245 if (cfs_capable(CFS_CAP_DAC_OVERRIDE))
3248 if (cfs_capable(CFS_CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) ||
3249 (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))))
3256 /* -o localflock - only provides locally consistent flock locks */
/* Default file_operations: no .flock/.lock entries, so the kernel's
 * local (per-node) lock handling applies — used for -o localflock. */
3257 struct file_operations ll_file_operations = {
3258 .read = ll_file_read,
3259 .write = ll_file_write,
3260 .ioctl = ll_file_ioctl,
3261 .open = ll_file_open,
3262 .release = ll_file_release,
3263 .mmap = ll_file_mmap,
3264 .llseek = ll_file_seek,
3265 .sendfile = ll_file_sendfile,
/* file_operations with cluster-coherent locking: flock/fcntl requests go
 * through ll_file_flock (LDLM enqueue on the MDS). */
3269 struct file_operations ll_file_operations_flock = {
3270 .read = ll_file_read,
3271 .write = ll_file_write,
3272 .ioctl = ll_file_ioctl,
3273 .open = ll_file_open,
3274 .release = ll_file_release,
3275 .mmap = ll_file_mmap,
3276 .llseek = ll_file_seek,
3277 .sendfile = ll_file_sendfile,
3279 #ifdef HAVE_F_OP_FLOCK
3280 .flock = ll_file_flock,
3282 .lock = ll_file_flock
3285 /* These are for -o noflock - to return ENOSYS on flock calls */
/* file_operations for -o noflock mounts: lock requests are routed to
 * ll_file_noflock (see the comment on line 3285 above: returns ENOSYS). */
3286 struct file_operations ll_file_operations_noflock = {
3287 .read = ll_file_read,
3288 .write = ll_file_write,
3289 .ioctl = ll_file_ioctl,
3290 .open = ll_file_open,
3291 .release = ll_file_release,
3292 .mmap = ll_file_mmap,
3293 .llseek = ll_file_seek,
3294 .sendfile = ll_file_sendfile,
3296 #ifdef HAVE_F_OP_FLOCK
3297 .flock = ll_file_noflock,
3299 .lock = ll_file_noflock
/* inode_operations for regular Lustre files; setattr entry depends on
 * whether the kernel carries the VFS intent patches. */
3302 struct inode_operations ll_file_inode_operations = {
3303 #ifdef HAVE_VFS_INTENT_PATCHES
3304 .setattr_raw = ll_setattr_raw,
3306 .setattr = ll_setattr,
3307 .truncate = ll_truncate,
3308 .getattr = ll_getattr,
3309 .permission = ll_inode_permission,
3310 .setxattr = ll_setxattr,
3311 .getxattr = ll_getxattr,
3312 .listxattr = ll_listxattr,
3313 .removexattr = ll_removexattr,
3316 /* dynamic ioctl number support routines */
/* Registry of dynamically registered ioctl handlers: a list of
 * llioc_data entries protected by a rw_semaphore (readers dispatch,
 * writers register/unregister). */
3317 static struct llioc_ctl_data {
3318 struct rw_semaphore ioc_sem;
3319 struct list_head ioc_head;
3321 __RWSEM_INITIALIZER(llioc.ioc_sem),
3322 CFS_LIST_HEAD_INIT(llioc.ioc_head)
/* One registered handler: callback plus the ioctl command numbers it
 * serves, stored in a trailing variable-length array. */
3327 struct list_head iocd_list;
3328 unsigned int iocd_size;
3329 llioc_callback_t iocd_cb;
3330 unsigned int iocd_count;
3331 unsigned int iocd_cmd[0];
/*
 * Register callback \a cb for the \a count ioctl commands in \a cmd.
 * Returns an opaque cookie (the allocation itself) for later
 * ll_iocontrol_unregister(), or NULL on bad arguments / allocation
 * failure. The entry is appended under the registry write lock.
 */
3334 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3337 struct llioc_data *in_data = NULL;
3340 if (cb == NULL || cmd == NULL ||
3341 count > LLIOC_MAX_CMD || count < 0)
/* Size includes the trailing iocd_cmd[] flexible array. */
3344 size = sizeof(*in_data) + count * sizeof(unsigned int);
3345 OBD_ALLOC(in_data, size);
3346 if (in_data == NULL)
3349 memset(in_data, 0, sizeof(*in_data));
3350 in_data->iocd_size = size;
3351 in_data->iocd_cb = cb;
3352 in_data->iocd_count = count;
3353 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3355 down_write(&llioc.ioc_sem);
3356 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3357 up_write(&llioc.ioc_sem);
/*
 * Remove and free the handler previously registered under cookie
 * \a magic. Warns (but does nothing else) if the cookie is unknown.
 */
3362 void ll_iocontrol_unregister(void *magic)
3364 struct llioc_data *tmp;
3369 down_write(&llioc.ioc_sem);
3370 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* Size is cached before the entry is freed outside the lock. */
3372 unsigned int size = tmp->iocd_size;
3374 list_del(&tmp->iocd_list);
3375 up_write(&llioc.ioc_sem);
3377 OBD_FREE(tmp, size);
3381 up_write(&llioc.ioc_sem);
3383 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3386 EXPORT_SYMBOL(ll_iocontrol_register);
3387 EXPORT_SYMBOL(ll_iocontrol_unregister);
3389 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
3390 unsigned int cmd, unsigned long arg, int *rcp)
3392 enum llioc_iter ret = LLIOC_CONT;
3393 struct llioc_data *data;
3394 int rc = -EINVAL, i;
3396 down_read(&llioc.ioc_sem);
3397 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3398 for (i = 0; i < data->iocd_count; i++) {
3399 if (cmd != data->iocd_cmd[i])
3402 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3406 if (ret == LLIOC_STOP)
3409 up_read(&llioc.ioc_sem);