1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
6 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 only,
10 * as published by the Free Software Foundation.
12 * This program is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License version 2 for more details (a copy is included
16 * in the LICENSE file that accompanied this code).
18 * You should have received a copy of the GNU General Public License
19 * version 2 along with this program; If not, see
20 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
22 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23 * CA 95054 USA or visit www.sun.com if you need additional information or
29 * Copyright 2008 Sun Microsystems, Inc. All rights reserved
30 * Use is subject to license terms.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <lustre_mdc.h>
47 #include <linux/pagemap.h>
48 #include <linux/file.h>
49 #include "llite_internal.h"
50 #include <lustre/ll_fiemap.h>
52 /* also used by llite/special.c:ll_special_open() */
/* Allocate a per-open-file ll_file_data from the dedicated slab cache.
 * Returns the new (possibly NULL on allocation failure) descriptor;
 * freed by ll_file_data_put(). */
53 struct ll_file_data *ll_file_data_get(void)
55 struct ll_file_data *fd;
57 OBD_SLAB_ALLOC_PTR(fd, ll_file_data_slab);
/* Release an ll_file_data previously obtained from ll_file_data_get()
 * back to its slab cache. */
61 static void ll_file_data_put(struct ll_file_data *fd)
64 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/* Pack the inode's current attributes, I/O epoch, the open file handle @fh
 * and an MDS capability into @op_data in preparation for an RPC to the MDS. */
67 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
68 struct lustre_handle *fh)
70 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
71 op_data->op_attr.ia_mode = inode->i_mode;
72 op_data->op_attr.ia_atime = inode->i_atime;
73 op_data->op_attr.ia_mtime = inode->i_mtime;
74 op_data->op_attr.ia_ctime = inode->i_ctime;
75 op_data->op_attr.ia_size = i_size_read(inode);
76 op_data->op_attr_blocks = inode->i_blocks;
/* ll_iattr extends struct iattr with ia_attr_flags; the cast reaches that
 * extra field without changing the wire layout of op_attr. */
77 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags = inode->i_flags;
78 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
79 memcpy(&op_data->op_handle, fh, sizeof(op_data->op_handle));
/* Takes a reference on the MDS capability; NOTE(review): presumably dropped
 * by the RPC completion path — confirm against md_close() callers. */
80 op_data->op_capa1 = ll_mdscapa_get(inode);
/* Fill @op_data for an MDS close RPC on @och.  Size/blocks are sent to the
 * MDS only when the file was not opened for write, or when Size-on-MDS is
 * not in effect (no OBD_CONNECT_SOM, or not a regular file) -- otherwise
 * the SOM machinery updates them separately. */
83 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
84 struct obd_client_handle *och)
88 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
89 ATTR_MTIME_SET | ATTR_CTIME_SET;
91 if (!(och->och_flags & FMODE_WRITE))
94 if (!(ll_i2mdexp(inode)->exp_connect_flags & OBD_CONNECT_SOM) ||
95 !S_ISREG(inode->i_mode))
96 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* May set MF_EPOCH_CLOSE in op_data->op_flags; the caller inspects it. */
98 ll_epoch_close(inode, op_data, &och, 0);
101 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
/* Send a close RPC to the MDS for open handle @och and clean up replay
 * state.  If the close ends an I/O epoch under Size-on-MDS, also fetch the
 * size from the OSTs and push it back to the MDS.  Returns 0 or a negative
 * errno from the close/SOM-update path. */
105 static int ll_close_inode_openhandle(struct obd_export *md_exp,
107 struct obd_client_handle *och)
109 struct obd_export *exp = ll_i2mdexp(inode);
110 struct md_op_data *op_data;
111 struct ptlrpc_request *req = NULL;
112 struct obd_device *obd = class_exp2obd(exp);
119 * XXX: in case of LMV, is this correct to access
122 CERROR("Invalid MDC connection handle "LPX64"\n",
123 ll_i2mdexp(inode)->exp_handle.h_cookie);
128 * here we check if this is forced umount. If so this is called on
129 * canceling "open lock" and we do not call md_close() in this case, as
130 * it will not be successful, as import is already deactivated.
135 OBD_ALLOC_PTR(op_data);
137 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
139 ll_prepare_close(inode, op_data, och);
/* Remember whether this close terminates the I/O epoch (set by
 * ll_epoch_close() inside ll_prepare_close()). */
140 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
141 rc = md_close(md_exp, op_data, och->och_mod, &req);
146 /* This close must have the epoch closed. */
147 LASSERT(exp->exp_connect_flags & OBD_CONNECT_SOM);
148 LASSERT(epoch_close);
149 /* MDS has instructed us to obtain Size-on-MDS attribute from
150 * OSTs and send setattr back to MDS. */
151 rc = ll_sizeonmds_update(inode, och->och_mod,
152 &och->och_fh, op_data->op_ioepoch);
154 CERROR("inode %lu mdc Size-on-MDS update failed: "
155 "rc = %d\n", inode->i_ino, rc);
159 CERROR("inode %lu mdc close failed: rc = %d\n",
162 ll_finish_md_op_data(op_data);
165 rc = ll_objects_destroy(req, inode);
167 CERROR("inode %lu ll_objects destroy: rc = %d\n",
/* SOM file still open for write and epoch not closed here: defer the
 * DONE_WRITING notification so another thread finishes the epoch. */
174 if ((exp->exp_connect_flags & OBD_CONNECT_SOM) && !epoch_close &&
175 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
176 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
179 ptlrpc_close_replay_seq(req);
180 md_clear_open_replay_data(md_exp, och);
181 /* Free @och if it is not waiting for DONE_WRITING. */
/* Poison the cookie so stale users of this handle are caught. */
182 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
185 if (req) /* This is close request */
186 ptlrpc_req_finished(req);
/* Drop the last local user of the MDS open handle matching @flags
 * (write/exec/read) and, if the use count reached zero, send the actual
 * close RPC to the MDS.  Returns 0 or a negative errno. */
190 int ll_md_real_close(struct inode *inode, int flags)
192 struct ll_inode_info *lli = ll_i2info(inode);
193 struct obd_client_handle **och_p;
194 struct obd_client_handle *och;
/* Select which of the three per-mode open handles this close refers to. */
199 if (flags & FMODE_WRITE) {
200 och_p = &lli->lli_mds_write_och;
201 och_usecount = &lli->lli_open_fd_write_count;
202 } else if (flags & FMODE_EXEC) {
203 och_p = &lli->lli_mds_exec_och;
204 och_usecount = &lli->lli_open_fd_exec_count;
206 LASSERT(flags & FMODE_READ);
207 och_p = &lli->lli_mds_read_och;
208 och_usecount = &lli->lli_open_fd_read_count;
/* lli_och_sem serializes open-handle changes against concurrent opens. */
211 down(&lli->lli_och_sem);
212 if (*och_usecount) { /* There are still users of this handle, so
214 up(&lli->lli_och_sem);
219 up(&lli->lli_och_sem);
221 if (och) { /* There might be a race and somebody have freed this och
223 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/* Per-fd close path: drop any group lock, decrement the per-mode open
 * count, and close the MDS open handle unless a matching OPEN DLM lock is
 * still cached (in which case the real close happens at lock cancel).
 * Frees the file's ll_file_data.  Returns 0 or a negative errno. */
230 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
233 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
234 struct ll_inode_info *lli = ll_i2info(inode);
238 /* clear group lock, if present */
239 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
240 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
241 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
242 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP,
246 /* Let's see if we have good enough OPEN lock on the file and if
247 we can skip talking to MDS */
248 if (file->f_dentry->d_inode) { /* Can this ever be false? */
/* LDLM_FL_TEST_LOCK: only probe for a granted OPEN ibits lock,
 * do not take a new reference on it. */
250 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
251 struct lustre_handle lockh;
252 struct inode *inode = file->f_dentry->d_inode;
253 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
255 down(&lli->lli_och_sem);
256 if (fd->fd_omode & FMODE_WRITE) {
258 LASSERT(lli->lli_open_fd_write_count);
259 lli->lli_open_fd_write_count--;
260 } else if (fd->fd_omode & FMODE_EXEC) {
262 LASSERT(lli->lli_open_fd_exec_count);
263 lli->lli_open_fd_exec_count--;
266 LASSERT(lli->lli_open_fd_read_count);
267 lli->lli_open_fd_read_count--;
269 up(&lli->lli_och_sem);
/* No cached OPEN lock for this mode: must do the real MDS close now. */
271 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
272 LDLM_IBITS, &policy, lockmode,
274 rc = ll_md_real_close(file->f_dentry->d_inode,
278 CERROR("Releasing a file %p with negative dentry %p. Name %s",
279 file, file->f_dentry, file->f_dentry->d_name.name);
282 LUSTRE_FPRIVATE(file) = NULL;
283 ll_file_data_put(fd);
284 ll_capa_close(inode);
289 int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm);
291 /* While this returns an error code, fput() the caller does not, so we need
292 * to make every effort to clean up all of our state here. Also, applications
293 * rarely check close errors and even if an error is returned they will not
294 * re-try the close call.
296 int ll_file_release(struct inode *inode, struct file *file)
298 struct ll_file_data *fd;
299 struct ll_sb_info *sbi = ll_i2sbi(inode);
300 struct ll_inode_info *lli = ll_i2info(inode);
301 struct lov_stripe_md *lsm = lli->lli_smd;
305 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
306 inode->i_generation, inode);
308 #ifdef CONFIG_FS_POSIX_ACL
/* Remote-client ACL bookkeeping only applies to the root inode. */
309 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
310 inode == inode->i_sb->s_root->d_inode) {
311 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
314 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
315 fd->fd_flags &= ~LL_FILE_RMTACL;
316 rct_del(&sbi->ll_rct, cfs_curproc_pid());
317 et_search_free(&sbi->ll_et, cfs_curproc_pid());
322 if (inode->i_sb->s_root != file->f_dentry)
323 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
324 fd = LUSTRE_FPRIVATE(file);
327 /* The last ref on @file, maybe not the owner pid of statahead.
328 * Different processes can open the same dir, "ll_opendir_key" means:
329 * it is me that should stop the statahead thread. */
330 if (lli->lli_opendir_key == fd && lli->lli_opendir_pid != 0)
331 ll_stop_statahead(inode, fd);
/* Root inode never had an MDS open handle from ll_file_open(); just
 * drop the fd and return. */
333 if (inode->i_sb->s_root == file->f_dentry) {
334 LUSTRE_FPRIVATE(file) = NULL;
335 ll_file_data_put(fd);
340 lov_test_and_clear_async_rc(lsm);
341 lli->lli_async_rc = 0;
343 rc = ll_md_close(sbi->ll_md_exp, inode, file);
/* Perform an intent-based open RPC to the MDS for @file (used when no
 * cached open handle/lock exists).  On success the returned intent carries
 * the open handle; on failure any server-side open is released.  @lmm /
 * @lmmsize, when non-zero, carry striping info being set rather than a
 * normal open.  Returns 0 or a negative errno. */
347 static int ll_intent_file_open(struct file *file, void *lmm,
348 int lmmsize, struct lookup_intent *itp)
350 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
351 struct dentry *parent = file->f_dentry->d_parent;
352 const char *name = file->f_dentry->d_name.name;
353 const int len = file->f_dentry->d_name.len;
354 struct md_op_data *op_data;
355 struct ptlrpc_request *req;
362 /* Usually we come here only for NFSD, and we want open lock.
363 But we can also get here with pre 2.6.15 patchless kernels, and in
364 that case that lock is also ok */
365 /* We can also get here if there was cached open handle in revalidate_it
366 * but it disappeared while we were getting from there to ll_file_open.
367 * But this means this file was closed and immediately opened which
368 * makes a good candidate for using OPEN lock */
369 /* If lmmsize & lmm are not 0, we are just setting stripe info
370 * parameters. No need for the open lock */
371 if (!lmm && !lmmsize)
372 itp->it_flags |= MDS_OPEN_LOCK;
374 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
375 file->f_dentry->d_inode, name, len,
376 O_RDWR, LUSTRE_OPC_ANY, NULL);
378 RETURN(PTR_ERR(op_data));
380 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
381 0 /*unused */, &req, ll_md_blocking_ast, 0);
382 ll_finish_md_op_data(op_data);
384 /* reason for keep own exit path - don't flood log
385 * with messages with -ESTALE errors.
/* Server granted the open but we are bailing out: release the
 * server-side open handle so it does not leak. */
387 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
388 it_open_error(DISP_OPEN_OPEN, itp))
390 ll_release_openhandle(file->f_dentry, itp);
394 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
395 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
396 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* Attach the inode to the granted DLM lock so blocking ASTs can
 * find it later. */
400 if (itp->d.lustre.it_lock_mode)
401 md_set_lock_data(sbi->ll_md_exp,
402 &itp->d.lustre.it_lock_handle,
403 file->f_dentry->d_inode);
405 rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL);
407 ptlrpc_req_finished(itp->d.lustre.it_data);
408 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
409 ll_intent_drop_lock(itp);
/* Initialize @och from the MDS open reply carried by @it: copy the server
 * file handle, fid, open flags and I/O epoch, then register the request
 * for open replay.  Returns md_set_open_replay_data()'s result. */
414 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
415 struct lookup_intent *it, struct obd_client_handle *och)
417 struct ptlrpc_request *req = it->d.lustre.it_data;
418 struct mdt_body *body;
422 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
423 LASSERT(body != NULL); /* reply already checked out */
425 memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
426 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
427 och->och_fid = lli->lli_fid;
428 och->och_flags = it->it_flags;
429 lli->lli_ioepoch = body->ioepoch;
431 return md_set_open_replay_data(md_exp, och, req);
/* Finish the client-local part of an open: fill @och from the intent (when
 * supplied), attach @fd as the file's private data and initialize its
 * readahead state and open mode.  Returns 0 or a negative errno. */
434 int ll_local_open(struct file *file, struct lookup_intent *it,
435 struct ll_file_data *fd, struct obd_client_handle *och)
437 struct inode *inode = file->f_dentry->d_inode;
438 struct ll_inode_info *lli = ll_i2info(inode);
441 LASSERT(!LUSTRE_FPRIVATE(file));
446 struct ptlrpc_request *req = it->d.lustre.it_data;
447 struct mdt_body *body;
450 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
454 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
455 if ((it->it_flags & FMODE_WRITE) &&
456 (body->valid & OBD_MD_FLSIZE))
457 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
458 lli->lli_ioepoch, PFID(&lli->lli_fid));
461 LUSTRE_FPRIVATE(file) = fd;
462 ll_readahead_init(inode, &fd->fd_ras);
/* Remember the open mode so close can decrement the right counter. */
463 fd->fd_omode = it->it_flags;
467 /* Open a file, and (for the very first open) create objects on the OSTs at
468 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
469 * creation or open until ll_lov_setstripe() ioctl is called. We grab
470 * lli_open_sem to ensure no other process will create objects, send the
471 * stripe MD to the MDS, or try to destroy the objects if that fails.
473 * If we already have the stripe MD locally then we don't request it in
474 * md_open(), by passing a lmm_size = 0.
476 * It is up to the application to ensure no other processes open this file
477 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
478 * used. We might be able to avoid races of that sort by getting lli_open_sem
479 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
480 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
482 int ll_file_open(struct inode *inode, struct file *file)
484 struct ll_inode_info *lli = ll_i2info(inode);
485 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
486 .it_flags = file->f_flags };
487 struct lov_stripe_md *lsm;
488 struct ptlrpc_request *req = NULL;
489 struct obd_client_handle **och_p;
491 struct ll_file_data *fd;
492 int rc = 0, opendir_set = 0;
495 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
496 inode->i_generation, inode, file->f_flags);
498 #ifdef HAVE_VFS_INTENT_PATCHES
501 it = file->private_data; /* XXX: compat macro */
502 file->private_data = NULL; /* prevent ll_local_open assertion */
505 fd = ll_file_data_get();
/* Directory opens: arbitrate which opener owns the statahead thread
 * via lli_opendir_key/lli_opendir_pid under lli_lock. */
509 if (S_ISDIR(inode->i_mode)) {
511 spin_lock(&lli->lli_lock);
512 if (lli->lli_opendir_key == NULL && lli->lli_opendir_pid == 0) {
513 LASSERT(lli->lli_sai == NULL);
514 lli->lli_opendir_key = fd;
515 lli->lli_opendir_pid = cfs_curproc_pid();
517 } else if (unlikely(lli->lli_opendir_pid == cfs_curproc_pid() &&
518 lli->lli_opendir_key != NULL)) {
519 /* Two cases for this:
520 * (1) The same process open such directory many times.
521 * (2) The old process opened the directory, and exited
522 * before its children processes. Then new process
523 * with the same pid opens such directory before the
524 * old process's children processes exit.
525 * reset stat ahead for such cases. */
526 spin_unlock(&lli->lli_lock);
527 CDEBUG(D_INFO, "Conflict statahead for %.*s "DFID
528 " reset it.\n", file->f_dentry->d_name.len,
529 file->f_dentry->d_name.name,
530 PFID(&lli->lli_fid));
531 ll_stop_statahead(inode, lli->lli_opendir_key);
534 spin_unlock(&lli->lli_lock);
/* Root inode is opened without an MDS open handle. */
537 if (inode->i_sb->s_root == file->f_dentry) {
538 LUSTRE_FPRIVATE(file) = fd;
/* No disposition from a prior intent: build our own IT_OPEN intent. */
542 if (!it || !it->d.lustre.it_disposition) {
543 /* Convert f_flags into access mode. We cannot use file->f_mode,
544 * because everything but O_ACCMODE mask was stripped from
/* (oit.it_flags + 1) & O_ACCMODE maps O_RDONLY/O_WRONLY/O_RDWR
 * onto FMODE_READ/FMODE_WRITE bits. */
546 if ((oit.it_flags + 1) & O_ACCMODE)
548 if (file->f_flags & O_TRUNC)
549 oit.it_flags |= FMODE_WRITE;
551 /* kernel only call f_op->open in dentry_open. filp_open calls
552 * dentry_open after call to open_namei that checks permissions.
553 * Only nfsd_open call dentry_open directly without checking
554 * permissions and because of that this code below is safe. */
555 if (oit.it_flags & FMODE_WRITE)
556 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
558 /* We do not want O_EXCL here, presumably we opened the file
559 * already? XXX - NFS implications? */
560 oit.it_flags &= ~O_EXCL;
566 /* Let's see if we have file open on MDS already. */
567 if (it->it_flags & FMODE_WRITE) {
568 och_p = &lli->lli_mds_write_och;
569 och_usecount = &lli->lli_open_fd_write_count;
570 } else if (it->it_flags & FMODE_EXEC) {
571 och_p = &lli->lli_mds_exec_och;
572 och_usecount = &lli->lli_open_fd_exec_count;
574 och_p = &lli->lli_mds_read_och;
575 och_usecount = &lli->lli_open_fd_read_count;
578 down(&lli->lli_och_sem);
579 if (*och_p) { /* Open handle is present */
580 if (it_disposition(it, DISP_OPEN_OPEN)) {
581 /* Well, there's extra open request that we do not need,
582 let's close it somehow. This will decref request. */
583 rc = it_open_error(DISP_OPEN_OPEN, it);
585 up(&lli->lli_och_sem);
586 ll_file_data_put(fd);
587 GOTO(out_openerr, rc);
589 ll_release_openhandle(file->f_dentry, it);
590 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats,
/* Reuse the cached handle: och == NULL tells ll_local_open not
 * to fill a new one. */
595 rc = ll_local_open(file, it, fd, NULL);
598 up(&lli->lli_och_sem);
599 ll_file_data_put(fd);
600 GOTO(out_openerr, rc);
603 LASSERT(*och_usecount == 0);
604 if (!it->d.lustre.it_disposition) {
605 /* We cannot just request lock handle now, new ELC code
606 means that one of other OPEN locks for this file
607 could be cancelled, and since blocking ast handler
608 would attempt to grab och_sem as well, that would
609 result in a deadlock */
610 up(&lli->lli_och_sem);
611 it->it_flags |= O_CHECK_STALE;
612 rc = ll_intent_file_open(file, NULL, 0, it);
613 it->it_flags &= ~O_CHECK_STALE;
615 ll_file_data_put(fd);
616 GOTO(out_openerr, rc);
619 /* Got some error? Release the request */
620 if (it->d.lustre.it_status < 0) {
621 req = it->d.lustre.it_data;
622 ptlrpc_req_finished(req);
624 md_set_lock_data(ll_i2sbi(inode)->ll_md_exp,
625 &it->d.lustre.it_lock_handle,
626 file->f_dentry->d_inode);
629 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
631 ll_file_data_put(fd);
632 GOTO(out_och_free, rc = -ENOMEM);
635 req = it->d.lustre.it_data;
637 /* md_intent_lock() didn't get a request ref if there was an
638 * open error, so don't do cleanup on the request here
640 /* XXX (green): Should not we bail out on any error here, not
641 * just open error? */
642 rc = it_open_error(DISP_OPEN_OPEN, it);
644 ll_file_data_put(fd);
645 GOTO(out_och_free, rc);
648 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
649 rc = ll_local_open(file, it, fd, *och_p);
651 ll_file_data_put(fd);
652 GOTO(out_och_free, rc);
655 up(&lli->lli_och_sem);
657 /* Must do this outside lli_och_sem lock to prevent deadlock where
658 different kind of OPEN lock for this same inode gets cancelled
659 by ldlm_cancel_lru */
660 if (!S_ISREG(inode->i_mode))
667 if (file->f_flags & O_LOV_DELAY_CREATE ||
668 !(file->f_mode & FMODE_WRITE)) {
669 CDEBUG(D_INODE, "object creation was delayed\n");
673 file->f_flags &= ~O_LOV_DELAY_CREATE;
676 ptlrpc_req_finished(req);
678 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
682 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
683 *och_p = NULL; /* OBD_FREE writes some magic there */
686 up(&lli->lli_och_sem);
688 if (opendir_set != 0)
689 ll_stop_statahead(inode, fd);
695 /* Fills the obdo with the attributes for the inode defined by lsm */
696 int ll_inode_getattr(struct inode *inode, struct obdo *obdo)
698 struct ptlrpc_request_set *set;
699 struct ll_inode_info *lli = ll_i2info(inode);
700 struct lov_stripe_md *lsm = lli->lli_smd;
702 struct obd_info oinfo = { { { 0 } } };
706 LASSERT(lsm != NULL);
/* Ask the OSTs (via the data export) for size/blocks/times of the
 * object(s) backing this inode. */
710 oinfo.oi_oa->o_id = lsm->lsm_object_id;
711 oinfo.oi_oa->o_gr = lsm->lsm_object_gr;
712 oinfo.oi_oa->o_mode = S_IFREG;
713 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
714 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
715 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
716 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
718 oinfo.oi_capa = ll_mdscapa_get(inode);
720 set = ptlrpc_prep_set();
722 CERROR("can't allocate ptlrpc set\n");
725 rc = obd_getattr_async(ll_i2dtexp(inode), &oinfo, set);
727 rc = ptlrpc_set_wait(set);
728 ptlrpc_set_destroy(set);
730 capa_put(oinfo.oi_capa);
/* Only refresh the attributes the OSTs are authoritative for. */
734 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
735 OBD_MD_FLATIME | OBD_MD_FLMTIME |
736 OBD_MD_FLCTIME | OBD_MD_FLSIZE);
738 obdo_refresh_inode(inode, oinfo.oi_oa, oinfo.oi_oa->o_valid);
739 CDEBUG(D_INODE, "objid "LPX64" size %Lu, blocks %llu, blksize %lu\n",
740 lli->lli_smd->lsm_object_id, i_size_read(inode),
741 (unsigned long long)inode->i_blocks,
742 (unsigned long)ll_inode_blksize(inode));
/* Map an extent DLM @lock back to the stripe index it covers within the
 * inode's LOV striping.  Verifies the lock's resource actually matches the
 * stripe object; returns the stripe index (>= 0) or -ELDLM_NO_LOCK_DATA. */
746 static int ll_lock_to_stripe_offset(struct inode *inode, struct ldlm_lock *lock)
748 struct ll_inode_info *lli = ll_i2info(inode);
749 struct lov_stripe_md *lsm = lli->lli_smd;
750 struct obd_export *exp = ll_i2dtexp(inode);
753 struct ldlm_lock *lock;
754 } key = { .name = KEY_LOCK_TO_STRIPE, .lock = lock };
755 __u32 stripe, vallen = sizeof(stripe);
756 struct lov_oinfo *loinfo;
/* Single-stripe file: the answer is trivially stripe 0. */
760 if (lsm->lsm_stripe_count == 1)
761 GOTO(check, stripe = 0);
763 /* get our offset in the lov */
764 rc = obd_get_info(exp, sizeof(key), &key, &vallen, &stripe, lsm);
766 CERROR("obd_get_info: rc = %d\n", rc);
769 LASSERT(stripe < lsm->lsm_stripe_count);
/* Sanity check: the lock resource must name this stripe's object. */
772 loinfo = lsm->lsm_oinfo[stripe];
773 if (!osc_res_name_eq(loinfo->loi_id, loinfo->loi_gr,
774 &lock->l_resource->lr_name)){
775 LDLM_ERROR(lock, "resource doesn't match object "LPU64"/"LPU64,
776 loinfo->loi_id, loinfo->loi_gr);
777 RETURN(-ELDLM_NO_LOCK_DATA);
783 /* Get extra page reference to ensure it is not going away */
784 void ll_pin_extent_cb(void *data)
786 struct page *page = data;
788 page_cache_get(page);
793 /* Flush the page from page cache for an extent as its canceled.
794 * Page to remove is delivered as @data.
796 * No one can dirty the extent until we've finished our work and they cannot
797 * enqueue another lock. The DLM protects us from ll_file_read/write here,
798 * but other kernel actors could have pages locked.
800 * If @discard is set, there is no need to write the page if it is dirty.
802 * Called with the DLM lock held. */
803 int ll_page_removal_cb(void *data, int discard)
806 struct page *page = data;
807 struct address_space *mapping;
811 /* We have page reference already from ll_pin_page */
814 /* Already truncated by somebody */
817 mapping = page->mapping;
/* Drop any mmap PTEs covering this page before tearing it down. */
819 ll_teardown_mmaps(mapping,
820 (__u64)page->index << PAGE_CACHE_SHIFT,
821 ((__u64)page->index<<PAGE_CACHE_SHIFT)|
823 LL_CDEBUG_PAGE(D_PAGE, page, "removing page\n");
/* Dirty page and not discarding: write it out synchronously before the
 * covering lock goes away. */
825 if (!discard && clear_page_dirty_for_io(page)) {
826 LASSERT(page->mapping);
827 rc = ll_call_writepage(page->mapping->host, page);
828 /* either waiting for io to complete or reacquiring
829 * the lock that the failed writepage released */
831 wait_on_page_writeback(page);
833 CERROR("writepage inode %lu(%p) of page %p "
834 "failed: %d\n", mapping->host->i_ino,
835 mapping->host, page, rc);
/* Record the failure on the mapping so a later fsync reports it. */
837 set_bit(AS_ENOSPC, &mapping->flags);
839 set_bit(AS_EIO, &mapping->flags);
841 set_bit(AS_EIO, &mapping->flags);
843 if (page->mapping != NULL) {
844 struct ll_async_page *llap = llap_cast_private(page);
845 /* checking again to account for writeback's lock_page() */
846 LL_CDEBUG_PAGE(D_PAGE, page, "truncating\n");
848 ll_ra_accounting(llap, page->mapping);
849 ll_truncate_complete_page(page);
853 LASSERT(!PageWriteback(page));
/* Drop the reference taken in ll_pin_extent_cb(). */
855 page_cache_release(page);
/* Blocking/cancel AST for file extent locks: when a lock is cancelled,
 * recompute the known-minimum-size (kms) for the affected stripe and kick
 * the done-writing path.  Returns 0 or a negative errno. */
860 int ll_extent_lock_cancel_cb(struct ldlm_lock *lock, struct ldlm_lock_desc *new,
861 void *data, int flag)
864 struct ll_inode_info *lli;
865 struct lov_stripe_md *lsm;
/* Small non-NULL pointers indicate corrupted/garbage lock data. */
871 if ((unsigned long)data > 0 && (unsigned long)data < 0x1000) {
872 LDLM_ERROR(lock, "cancelling lock with bad data %p", data);
876 inode = ll_inode_from_lock(lock);
879 lli = ll_i2info(inode);
882 if (lli->lli_smd == NULL)
886 stripe = ll_lock_to_stripe_offset(inode, lock);
/* lov_stripe_lock + lock_res_and_lock: kms update must be atomic with
 * respect to both the stripe MD and the lock's resource. */
890 lov_stripe_lock(lsm);
891 lock_res_and_lock(lock);
892 kms = ldlm_extent_shift_kms(lock,
893 lsm->lsm_oinfo[stripe]->loi_kms);
895 if (lsm->lsm_oinfo[stripe]->loi_kms != kms)
896 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
897 lsm->lsm_oinfo[stripe]->loi_kms, kms);
898 lsm->lsm_oinfo[stripe]->loi_kms = kms;
899 unlock_res_and_lock(lock);
900 lov_stripe_unlock(lsm);
901 ll_queue_done_writing(inode, 0);
/* Completion AST for client-side async extent lock enqueues: on grant (or
 * glimpse), fold the server-returned LVB size into the stripe's kms, wake
 * waiters and drop the enqueue reference. */
910 int ll_async_completion_ast(struct ldlm_lock *lock, int flags, void *data)
912 /* XXX ALLOCATE - 160 bytes */
913 struct inode *inode = ll_inode_from_lock(lock);
914 struct ll_inode_info *lli = ll_i2info(inode);
915 struct lustre_handle lockh = { 0 };
920 if (flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED |
921 LDLM_FL_BLOCK_CONV)) {
/* NOTE(review): LBUG() panics, so the debug/reprocess lines after it
 * are unreachable -- apparently kept for a future blocked-lock path. */
922 LBUG(); /* not expecting any blocked async locks yet */
923 LDLM_DEBUG(lock, "client-side async enqueue returned a blocked "
925 ldlm_lock_dump(D_OTHER, lock, 0);
926 ldlm_reprocess_all(lock->l_resource);
930 LDLM_DEBUG(lock, "client-side async enqueue: granted/glimpsed");
932 stripe = ll_lock_to_stripe_offset(inode, lock);
936 if (lock->l_lvb_len) {
937 struct lov_stripe_md *lsm = lli->lli_smd;
939 lvb = lock->l_lvb_data;
940 lsm->lsm_oinfo[stripe].loi_rss = lvb->lvb_size;
942 lock_res_and_lock(lock);
943 ll_inode_size_lock(inode, 1);
/* kms can only grow from what the server reports. */
944 kms = MAX(lsm->lsm_oinfo[stripe].loi_kms, lvb->lvb_size);
945 kms = ldlm_extent_shift_kms(NULL, kms);
946 if (lsm->lsm_oinfo[stripe].loi_kms != kms)
947 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
948 lsm->lsm_oinfo[stripe].loi_kms, kms);
949 lsm->lsm_oinfo[stripe].loi_kms = kms;
950 ll_inode_size_unlock(inode, 1);
951 unlock_res_and_lock(lock);
956 wake_up(&lock->l_waitq);
/* Drop the PR reference taken at enqueue time. */
958 ldlm_lock2handle(lock, &lockh);
959 ldlm_lock_decref(&lockh, LCK_PR);
/* Glimpse AST: a server asks this client (which holds a conflicting extent
 * lock) for its view of the file.  Packs kms/mtime/atime/ctime for the
 * relevant stripe into the reply's LVB.  -ELDLM_NO_LOCK_DATA races are
 * answered quietly without console noise. */
964 static int ll_glimpse_callback(struct ldlm_lock *lock, void *reqp)
966 struct ptlrpc_request *req = reqp;
967 struct inode *inode = ll_inode_from_lock(lock);
968 struct ll_inode_info *lli;
969 struct lov_stripe_md *lsm;
975 GOTO(out, rc = -ELDLM_NO_LOCK_DATA);
976 lli = ll_i2info(inode);
978 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
981 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
983 /* First, find out which stripe index this lock corresponds to. */
984 stripe = ll_lock_to_stripe_offset(inode, lock);
986 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
988 req_capsule_extend(&req->rq_pill, &RQF_LDLM_GL_CALLBACK);
989 req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER,
991 rc = req_capsule_server_pack(&req->rq_pill);
993 CERROR("lustre_pack_reply: %d\n", rc);
997 lvb = req_capsule_server_get(&req->rq_pill, &RMF_DLM_LVB);
998 lvb->lvb_size = lli->lli_smd->lsm_oinfo[stripe]->loi_kms;
999 lvb->lvb_mtime = LTIME_S(inode->i_mtime);
1000 lvb->lvb_atime = LTIME_S(inode->i_atime);
1001 lvb->lvb_ctime = LTIME_S(inode->i_ctime);
1003 LDLM_DEBUG(lock, "i_size: %llu -> stripe number %u -> kms "LPU64
1004 " atime "LPU64", mtime "LPU64", ctime "LPU64,
1005 i_size_read(inode), stripe, lvb->lvb_size, lvb->lvb_mtime,
1006 lvb->lvb_atime, lvb->lvb_ctime);
1011 /* These errors are normal races, so we don't want to fill the console
1012 * with messages by calling ptlrpc_error() */
1013 if (rc == -ELDLM_NO_LOCK_DATA)
1014 lustre_pack_reply(req, 1, NULL, NULL);
1016 req->rq_status = rc;
/* Merge the per-stripe lock value blocks into a single view and apply it
 * to the VFS inode (size, blocks, m/a/ctime) under the inode size lock. */
1020 static int ll_merge_lvb(struct inode *inode)
1022 struct ll_inode_info *lli = ll_i2info(inode);
1023 struct ll_sb_info *sbi = ll_i2sbi(inode);
1029 ll_inode_size_lock(inode, 1);
1030 inode_init_lvb(inode, &lvb);
1031 rc = obd_merge_lvb(sbi->ll_dt_exp, lli->lli_smd, &lvb, 0);
1032 i_size_write(inode, lvb.lvb_size);
1033 inode->i_blocks = lvb.lvb_blocks;
1035 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1036 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1037 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1038 ll_inode_size_unlock(inode, 1);
/* Try to determine the file size from locally cached [0, EOF] PR extent
 * locks only (obd_match, no new enqueue).  If a match is found, merge the
 * LVBs into the inode and drop the matched reference. */
1043 int ll_local_size(struct inode *inode)
1045 ldlm_policy_data_t policy = { .l_extent = { 0, OBD_OBJECT_EOF } };
1046 struct ll_inode_info *lli = ll_i2info(inode);
1047 struct ll_sb_info *sbi = ll_i2sbi(inode);
1048 struct lustre_handle lockh = { 0 };
/* Files with no stripes have no OST objects to consult. */
1053 if (lli->lli_smd->lsm_stripe_count == 0)
1056 rc = obd_match(sbi->ll_dt_exp, lli->lli_smd, LDLM_EXTENT,
1057 &policy, LCK_PR, &flags, inode, &lockh);
1063 rc = ll_merge_lvb(inode);
1064 obd_cancel(sbi->ll_dt_exp, lli->lli_smd, LCK_PR, &lockh);
/* Glimpse a file described by @lsm (no inode required -- used from ioctl
 * paths) and fill @st with the merged size/blocks/times.  The intent-only
 * enqueue (LDLM_FL_HAS_INTENT) does not cancel other clients' locks. */
1068 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1071 struct lustre_handle lockh = { 0 };
1072 struct ldlm_enqueue_info einfo = { 0 };
1073 struct obd_info oinfo = { { { 0 } } };
1079 einfo.ei_type = LDLM_EXTENT;
1080 einfo.ei_mode = LCK_PR;
1081 einfo.ei_cb_bl = osc_extent_blocking_cb;
1082 einfo.ei_cb_cp = ldlm_completion_ast;
1083 einfo.ei_cb_gl = ll_glimpse_callback;
/* No inode here, so the glimpse callback gets no cbdata. */
1084 einfo.ei_cbdata = NULL;
1086 oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
1087 oinfo.oi_lockh = &lockh;
1089 oinfo.oi_flags = LDLM_FL_HAS_INTENT;
1091 rc = obd_enqueue_rqset(sbi->ll_dt_exp, &oinfo, &einfo);
1095 CERROR("obd_enqueue returned rc %d, "
1096 "returning -EIO\n", rc);
1097 RETURN(rc > 0 ? -EIO : rc);
1100 lov_stripe_lock(lsm);
1101 memset(&lvb, 0, sizeof(lvb));
1102 obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 0);
1103 st->st_size = lvb.lvb_size;
1104 st->st_blocks = lvb.lvb_blocks;
1105 st->st_mtime = lvb.lvb_mtime;
1106 st->st_atime = lvb.lvb_atime;
1107 st->st_ctime = lvb.lvb_ctime;
1108 lov_stripe_unlock(lsm);
1113 /* NB: obd_merge_lvb will prefer locally cached writes if they extend the
1114 * file (because it prefers KMS over RSS when larger) */
1115 int ll_glimpse_size(struct inode *inode, int ast_flags)
1117 struct ll_inode_info *lli = ll_i2info(inode);
1118 struct ll_sb_info *sbi = ll_i2sbi(inode);
1119 struct lustre_handle lockh = { 0 };
1120 struct ldlm_enqueue_info einfo = { 0 };
1121 struct obd_info oinfo = { { { 0 } } };
/* With a Size-on-MDS lock the MDS-provided size is authoritative;
 * no OST glimpse is needed. */
1125 if (lli->lli_flags & LLIF_MDS_SIZE_LOCK)
1128 CDEBUG(D_DLMTRACE, "Glimpsing inode %lu\n", inode->i_ino);
1130 if (!lli->lli_smd) {
1131 CDEBUG(D_DLMTRACE, "No objects for inode %lu\n", inode->i_ino);
1135 /* NOTE: this looks like DLM lock request, but it may not be one. Due
1136 * to LDLM_FL_HAS_INTENT flag, this is glimpse request, that
1137 * won't revoke any conflicting DLM locks held. Instead,
1138 * ll_glimpse_callback() will be called on each client
1139 * holding a DLM lock against this file, and resulting size
1140 * will be returned for each stripe. DLM lock on [0, EOF] is
1141 * acquired only if there were no conflicting locks. */
1142 einfo.ei_type = LDLM_EXTENT;
1143 einfo.ei_mode = LCK_PR;
1144 einfo.ei_cb_bl = osc_extent_blocking_cb;
1145 einfo.ei_cb_cp = ldlm_completion_ast;
1146 einfo.ei_cb_gl = ll_glimpse_callback;
1147 einfo.ei_cbdata = inode;
1149 oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
1150 oinfo.oi_lockh = &lockh;
1151 oinfo.oi_md = lli->lli_smd;
1152 oinfo.oi_flags = ast_flags | LDLM_FL_HAS_INTENT;
1154 rc = obd_enqueue_rqset(sbi->ll_dt_exp, &oinfo, &einfo);
1158 CERROR("obd_enqueue returned rc %d, returning -EIO\n", rc);
1159 RETURN(rc > 0 ? -EIO : rc);
/* Apply the glimpsed per-stripe LVBs to the inode. */
1162 rc = ll_merge_lvb(inode);
1164 CDEBUG(D_DLMTRACE, "glimpse: size: %llu, blocks: %llu\n",
1165 i_size_read(inode), (unsigned long long)inode->i_blocks);
/* Acquire an extent DLM lock of @mode over *@policy for @inode, then
 * refresh the inode's size/times from the merged LVBs.  Skipped entirely
 * (lockless) for LL_FILE_IGNORE_LOCK fds or LL_SBI_NOLCK mounts.
 * On return *@policy holds the actually granted extent. */
1170 int ll_extent_lock(struct ll_file_data *fd, struct inode *inode,
1171 struct lov_stripe_md *lsm, int mode,
1172 ldlm_policy_data_t *policy, struct lustre_handle *lockh,
1175 struct ll_sb_info *sbi = ll_i2sbi(inode);
1177 struct ldlm_enqueue_info einfo = { 0 };
1178 struct obd_info oinfo = { { { 0 } } };
1182 LASSERT(!lustre_handle_is_used(lockh));
1183 LASSERT(lsm != NULL);
1185 /* don't drop the mmapped file to LRU */
1186 if (mapping_mapped(inode->i_mapping))
1187 ast_flags |= LDLM_FL_NO_LRU;
1189 /* XXX phil: can we do this? won't it screw the file size up? */
1190 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1191 (sbi->ll_flags & LL_SBI_NOLCK))
1194 CDEBUG(D_DLMTRACE, "Locking inode %lu, start "LPU64" end "LPU64"\n",
1195 inode->i_ino, policy->l_extent.start, policy->l_extent.end);
1197 einfo.ei_type = LDLM_EXTENT;
1198 einfo.ei_mode = mode;
1199 einfo.ei_cb_bl = osc_extent_blocking_cb;
1200 einfo.ei_cb_cp = ldlm_completion_ast;
1201 einfo.ei_cb_gl = ll_glimpse_callback;
1202 einfo.ei_cbdata = inode;
1204 oinfo.oi_policy = *policy;
1205 oinfo.oi_lockh = lockh;
1207 oinfo.oi_flags = ast_flags;
1209 rc = obd_enqueue(sbi->ll_dt_exp, &oinfo, &einfo, NULL);
/* Report back the extent that was actually granted. */
1210 *policy = oinfo.oi_policy;
1214 ll_inode_size_lock(inode, 1);
1215 inode_init_lvb(inode, &lvb);
1216 obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 1);
1218 if (policy->l_extent.start == 0 &&
1219 policy->l_extent.end == OBD_OBJECT_EOF) {
1220 /* vmtruncate()->ll_truncate() first sets the i_size and then
1221 * the kms under both a DLM lock and the
1222 * ll_inode_size_lock(). If we don't get the
1223 * ll_inode_size_lock() here we can match the DLM lock and
1224 * reset i_size from the kms before the truncating path has
1225 * updated the kms. generic_file_write can then trust the
1226 * stale i_size when doing appending writes and effectively
1227 * cancel the result of the truncate. Getting the
1228 * ll_inode_size_lock() after the enqueue maintains the DLM
1229 * -> ll_inode_size_lock() acquiring order. */
1230 i_size_write(inode, lvb.lvb_size);
1231 CDEBUG(D_INODE, "inode=%lu, updating i_size %llu\n",
1232 inode->i_ino, i_size_read(inode));
1236 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1237 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1238 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1240 ll_inode_size_unlock(inode, 1);
/*
 * Release an extent lock previously taken by ll_extent_lock() by cancelling
 * it through obd_cancel().  Mirrors the IGNORE_LOCK/NOLCK early-out of the
 * lock path.  NOTE(review): elided listing — braces and RETURN not visible.
 */
1245 int ll_extent_unlock(struct ll_file_data *fd, struct inode *inode,
1246 struct lov_stripe_md *lsm, int mode,
1247 struct lustre_handle *lockh)
1249 struct ll_sb_info *sbi = ll_i2sbi(inode);
1253 /* XXX phil: can we do this? won't it screw the file size up? */
1254 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1255 (sbi->ll_flags & LL_SBI_NOLCK))
1258 rc = obd_cancel(sbi->ll_dt_exp, lsm, mode, lockh);
/*
 * Mark the inode as contended: record the current time and raise
 * LLIF_CONTENDED under lli_lock so subsequent I/O can prefer lockless
 * (server-side-lock) I/O — see ll_is_file_contended().
 */
1263 static void ll_set_file_contended(struct inode *inode)
1265 struct ll_inode_info *lli = ll_i2info(inode);
1266 cfs_time_t now = cfs_time_current();
1268 spin_lock(&lli->lli_lock);
1269 lli->lli_contention_time = now;
1270 lli->lli_flags |= LLIF_CONTENDED;
1271 spin_unlock(&lli->lli_lock);
/*
 * Clear the contention flag set by ll_set_file_contended(), under the same
 * lli_lock spinlock.
 */
1274 void ll_clear_file_contended(struct inode *inode)
1276 struct ll_inode_info *lli = ll_i2info(inode);
1278 spin_lock(&lli->lli_lock);
1279 lli->lli_flags &= ~LLIF_CONTENDED;
1280 spin_unlock(&lli->lli_lock);
/*
 * Decide whether the file behind \a file is currently considered contended
 * (and thus a candidate for lockless/server-locked I/O).
 *
 * NOTE(review): elided listing — the actual RETURN statements for each
 * branch are not visible; from the visible lines: contention requires the
 * server to support OBD_CONNECT_SRVLOCK, is skipped for
 * LL_FILE_IGNORE_LOCK handles, and an existing LLIF_CONTENDED flag expires
 * after sbi->ll_contention_time seconds.
 */
1283 static int ll_is_file_contended(struct file *file)
1285 struct inode *inode = file->f_dentry->d_inode;
1286 struct ll_inode_info *lli = ll_i2info(inode);
1287 struct ll_sb_info *sbi = ll_i2sbi(inode);
1288 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1291 if (!(sbi->ll_lco.lco_flags & OBD_CONNECT_SRVLOCK)) {
1292 CDEBUG(D_INFO, "the server does not support SRVLOCK feature,"
1293 " osc connect flags = 0x"LPX64"\n",
1294 sbi->ll_lco.lco_flags);
1297 if (fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK))
1299 if (lli->lli_flags & LLIF_CONTENDED) {
1300 cfs_time_t cur_time = cfs_time_current();
1301 cfs_time_t retry_time;
/* contention expires ll_contention_time seconds after it was recorded */
1303 retry_time = cfs_time_add(
1304 lli->lli_contention_time,
1305 cfs_time_seconds(sbi->ll_contention_time));
1306 if (cfs_time_after(cur_time, retry_time)) {
1307 ll_clear_file_contended(inode);
/*
 * Acquire a lock-tree (DLM-backed) lock over [start, end] for a read or
 * write, unless the file is contended (then the caller falls back to
 * lockless I/O).  Returns the tree_locked state; an -EUSERS result from
 * ll_tree_lock() marks the file contended.
 *
 * NOTE(review): elided listing — declarations of ast_flags/append/rc and
 * some branch bodies are missing from this view.
 */
1315 static int ll_file_get_tree_lock(struct ll_lock_tree *tree, struct file *file,
1316 const char *buf, size_t count,
1317 loff_t start, loff_t end, int rw)
1320 int tree_locked = 0;
1322 struct inode * inode = file->f_dentry->d_inode;
/* O_APPEND writes always take the tree lock, even when contended */
1325 append = (rw == OBD_BRW_WRITE) && (file->f_flags & O_APPEND);
1327 if (append || !ll_is_file_contended(file)) {
1328 struct ll_lock_tree_node *node;
1331 ast_flags = append ? 0 : LDLM_FL_DENY_ON_CONTENTION;
1332 if (file->f_flags & O_NONBLOCK)
1333 ast_flags |= LDLM_FL_BLOCK_NOWAIT;
1334 node = ll_node_from_inode(inode, start, end,
1335 (rw == OBD_BRW_WRITE) ? LCK_PW : LCK_PR);
1340 tree->lt_fd = LUSTRE_FPRIVATE(file);
1341 rc = ll_tree_lock(tree, node, buf, count, ast_flags);
/* -EUSERS from the enqueue means the extent is contended on the server */
1344 else if (rc == -EUSERS)
1345 ll_set_file_contended(inode);
1349 RETURN(tree_locked);
1355 * Checks if requested extent lock is compatible with a lock under a page.
1357 * Checks if the lock under \a page is compatible with a read or write lock
1358 * (specified by \a rw) for an extent [\a start , \a end].
1360 * \param page the page under which lock is considered
1361 * \param rw OBD_BRW_READ if requested for reading,
1362 * OBD_BRW_WRITE if requested for writing
1363 * \param start start of the requested extent
1364 * \param end end of the requested extent
1365 * \param cookie transparent parameter for passing locking context
1367 * \post result == 1, *cookie == context, appropriate lock is referenced or
1370 * \retval 1 owned lock is reused for the request
1371 * \retval 0 no lock reused for the request
1373 * \see ll_release_short_lock
/* NOTE(review): elided listing — early-return checks on exp/llap and the
 * trailing argument of obd_reget_short_lock() are not visible here. */
1375 static int ll_reget_short_lock(struct page *page, int rw,
1376 obd_off start, obd_off end,
1379 struct ll_async_page *llap;
1380 struct obd_export *exp;
1381 struct inode *inode = page->mapping->host;
1385 exp = ll_i2dtexp(inode);
/* only pages with llite async-page private data can carry a short lock */
1389 llap = llap_cast_private(page);
1393 RETURN(obd_reget_short_lock(exp, ll_i2info(inode)->lli_smd,
1394 &llap->llap_cookie, rw, start, end,
1399 * Releases a reference to a lock taken in a "fast" way.
1401 * Releases a read or a write (specified by \a rw) lock
1402 * referenced by \a cookie.
1404 * \param inode inode to which data belong
1405 * \param end end of the locked extent
1406 * \param rw OBD_BRW_READ if requested for reading,
1407 * OBD_BRW_WRITE if requested for writing
1408 * \param cookie transparent parameter for passing locking context
1410 * \post appropriate lock is dereferenced
1412 * \see ll_reget_short_lock
/* NOTE(review): elided listing — rc declaration, exp NULL-check and the
 * trailing arguments of obd_release_short_lock() are not visible here. */
1414 static void ll_release_short_lock(struct inode *inode, obd_off end,
1415 void *cookie, int rw)
1417 struct obd_export *exp;
1420 exp = ll_i2dtexp(inode);
1424 rc = obd_release_short_lock(exp, ll_i2info(inode)->lli_smd, end,
/* release failure is only logged; there is nothing the caller can do */
1427 CERROR("unlock failed (%d)\n", rc);
1431 * Checks if requested extent lock is compatible
1432 * with a lock under a page in page cache.
1434 * Checks if a lock under some \a page is compatible with a read or write lock
1435 * (specified by \a rw) for an extent [\a start , \a end].
1437 * \param file the file under which lock is considered
1438 * \param rw OBD_BRW_READ if requested for reading,
1439 * OBD_BRW_WRITE if requested for writing
1440 * \param ppos start of the requested extent
1441 * \param end end of the requested extent
1442 * \param cookie transparent parameter for passing locking context
1443 * \param buf userspace buffer for the data
1445 * \post result == 1, *cookie == context, appropriate lock is referenced
1448 * \retval 1 owned lock is reused for the request
1449 * \retval 0 no lock reused for the request
1451 * \see ll_file_put_fast_lock
/* NOTE(review): elided listing — the rc variable, the success GOTO and the
 * final RETURN are not visible here. */
1453 static inline int ll_file_get_fast_lock(struct file *file,
1454 obd_off ppos, obd_off end,
1455 char *buf, void **cookie, int rw)
/* fast path only applies when the user buffer is not itself mmapped
 * (avoids lock recursion through a page fault on our own file) —
 * presumably; confirm against ll_region_mapped() semantics */
1462 if (!ll_region_mapped((unsigned long)buf, end - ppos)) {
1463 page = find_lock_page(file->f_dentry->d_inode->i_mapping,
1464 ppos >> CFS_PAGE_SHIFT);
1466 if (ll_reget_short_lock(page, rw, ppos, end, cookie))
1470 page_cache_release(page);
1478 * Releases a reference to a lock taken in a "fast" way.
1480 * Releases a read or a write (specified by \a rw) lock
1481 * referenced by \a cookie.
1483 * \param inode inode to which data belong
1484 * \param end end of the locked extent
1485 * \param rw OBD_BRW_READ if requested for reading,
1486 * OBD_BRW_WRITE if requested for writing
1487 * \param cookie transparent parameter for passing locking context
1489 * \post appropriate lock is dereferenced
1491 * \see ll_file_get_fast_lock
/* thin wrapper: fast-lock release is exactly a short-lock release */
1493 static inline void ll_file_put_fast_lock(struct inode *inode, obd_off end,
1494 void *cookie, int rw)
1496 ll_release_short_lock(inode, end, cookie, rw);
/* How a lock for an I/O request was obtained — see ll_file_get_lock() and
 * ll_file_put_lock(). */
1499 enum ll_lock_style {
1500 LL_LOCK_STYLE_NOLOCK = 0,
1501 LL_LOCK_STYLE_FASTLOCK = 1,
1502 LL_LOCK_STYLE_TREELOCK = 2
1506 * Checks if requested extent lock is compatible with a lock
1507 * under a page cache page.
1509 * Checks if the lock under \a page is compatible with a read or write lock
1510 * (specified by \a rw) for an extent [\a start , \a end].
1512 * \param file file under which I/O is processed
1513 * \param rw OBD_BRW_READ if requested for reading,
1514 * OBD_BRW_WRITE if requested for writing
1515 * \param ppos start of the requested extent
1516 * \param end end of the requested extent
1517 * \param cookie transparent parameter for passing locking context
1518 * (only used with LL_LOCK_STYLE_FASTLOCK)
1519 * \param tree lock tree (only used with LL_LOCK_STYLE_TREELOCK)
1520 * \param buf userspace buffer for the data
1522 * \retval LL_LOCK_STYLE_FASTLOCK owned lock is reused through fast lock
1523 * \retval LL_LOCK_STYLE_TREELOCK got a lock through tree lock
1524 * \retval LL_LOCK_STYLE_NOLOCK got no lock
1526 * \see ll_file_put_lock
/* NOTE(review): elided listing — the rc declaration and the branches that
 * select between the TREELOCK/NOLOCK/error returns are partly missing. */
1528 static inline int ll_file_get_lock(struct file *file, obd_off ppos,
1529 obd_off end, char *buf, void **cookie,
1530 struct ll_lock_tree *tree, int rw)
/* try the cheap path first: reuse a lock already held under a cached page */
1536 if (ll_file_get_fast_lock(file, ppos, end, buf, cookie, rw))
1537 RETURN(LL_LOCK_STYLE_FASTLOCK);
1539 rc = ll_file_get_tree_lock(tree, file, buf, ppos - end, ppos, end, rw);
1540 /* rc: 1 for tree lock, 0 for no lock, <0 for error */
1543 RETURN(LL_LOCK_STYLE_TREELOCK);
1545 RETURN(LL_LOCK_STYLE_NOLOCK);
1548 /* an error happened if we reached this point, rc = -errno here */
1553 * Drops the lock taken by ll_file_get_lock.
1555 * Releases a read or a write (specified by \a rw) lock
1556 * referenced by \a tree or \a cookie.
1558 * \param inode inode to which data belong
1559 * \param end end of the locked extent
1560 * \param lockstyle facility through which the lock was taken
1561 * \param rw OBD_BRW_READ if requested for reading,
1562 * OBD_BRW_WRITE if requested for writing
1563 * \param cookie transparent parameter for passing locking context
1564 * (only used with LL_LOCK_STYLE_FASTLOCK)
1565 * \param tree lock tree (only used with LL_LOCK_STYLE_TREELOCK)
1567 * \post appropriate lock is dereferenced
1569 * \see ll_file_get_lock
/* NOTE(review): elided listing — the final parameter, break statements and
 * closing braces of the switch are not visible here. */
1571 static inline void ll_file_put_lock(struct inode *inode, obd_off end,
1572 enum ll_lock_style lock_style,
1573 void *cookie, struct ll_lock_tree *tree,
1577 switch (lock_style) {
1578 case LL_LOCK_STYLE_TREELOCK:
1579 ll_tree_unlock(tree);
1581 case LL_LOCK_STYLE_FASTLOCK:
1582 ll_file_put_fast_lock(inode, end, cookie, rw);
/* default case: an unknown style is a programming error, only logged */
1585 CERROR("invalid locking style (%d)\n", lock_style);
/*
 * read(2) entry point for Lustre files: takes a DLM extent lock (fast-lock,
 * tree-lock, or none when contended), clamps the chunk to the current
 * stripe / ll_max_rw_chunk, fixes up i_size from the known-minimum-size
 * (kms), then delegates to generic_file_read() or lockless I/O, looping
 * over chunks until the request is satisfied.
 *
 * NOTE(review): heavily elided listing — variable declarations (end, kms,
 * lvb, cookie, lock_style), loop structure, and several branch bodies are
 * missing from this view; comments describe only the visible lines.
 */
1589 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1592 struct inode *inode = file->f_dentry->d_inode;
1593 struct ll_inode_info *lli = ll_i2info(inode);
1594 struct lov_stripe_md *lsm = lli->lli_smd;
1595 struct ll_sb_info *sbi = ll_i2sbi(inode);
1596 struct ll_lock_tree tree;
1598 struct ll_ra_read bead;
1601 ssize_t retval, chunk, sum = 0;
1607 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1608 inode->i_ino, inode->i_generation, inode, count, *ppos);
1609 /* "If nbyte is 0, read() will return 0 and have no other results."
1610 * -- Single Unix Spec */
1614 ll_stats_ops_tally(sbi, LPROC_LL_READ_BYTES, count);
1617 /* Read on file with no objects should return zero-filled
1618 * buffers up to file size (we can get non-zero sizes with
1619 * mknod + truncate, then opening file for read. This is a
1620 * common pattern in NFS case, it seems). Bug 6243 */
1622 /* Since there are no objects on OSTs, we have nothing to get
1623 * lock on and so we are forced to access inode->i_size
1626 /* Read beyond end of file */
1627 if (*ppos >= i_size_read(inode))
1630 if (count > i_size_read(inode) - *ppos)
1631 count = i_size_read(inode) - *ppos;
1632 /* Make sure to correctly adjust the file pos pointer for
1634 notzeroed = clear_user(buf, count);
/* bound each locked chunk by the stripe boundary and ll_max_rw_chunk so a
 * single lock does not span too much of a widely-striped file */
1642 if (sbi->ll_max_rw_chunk != 0) {
1643 /* first, let's know the end of the current stripe */
1645 obd_extent_calc(sbi->ll_dt_exp, lsm, OBD_CALC_STRIPE_END, &end);
1647 /* correct, the end is beyond the request */
1648 if (end > *ppos + count - 1)
1649 end = *ppos + count - 1;
1651 /* and chunk shouldn't be too large even if striping is wide */
1652 if (end - *ppos > sbi->ll_max_rw_chunk)
1653 end = *ppos + sbi->ll_max_rw_chunk - 1;
1655 end = *ppos + count - 1;
1658 lock_style = ll_file_get_lock(file, (obd_off)(*ppos), end,
1659 buf, &cookie, &tree, OBD_BRW_READ);
1661 GOTO(out, retval = lock_style);
1663 ll_inode_size_lock(inode, 1);
1665 * Consistency guarantees: following possibilities exist for the
1666 * relation between region being read and real file size at this
1669 * (A): the region is completely inside of the file;
1671 * (B-x): x bytes of region are inside of the file, the rest is
1674 * (C): the region is completely outside of the file.
1676 * This classification is stable under DLM lock acquired by
1677 * ll_tree_lock() above, because to change class, other client has to
1678 * take DLM lock conflicting with our lock. Also, any updates to
1679 * ->i_size by other threads on this client are serialized by
1680 * ll_inode_size_lock(). This guarantees that short reads are handled
1681 * correctly in the face of concurrent writes and truncates.
1683 inode_init_lvb(inode, &lvb);
1684 obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 1);
1686 if (*ppos + count - 1 > kms) {
1687 /* A glimpse is necessary to determine whether we return a
1688 * short read (B) or some zeroes at the end of the buffer (C) */
1689 ll_inode_size_unlock(inode, 1);
1690 retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
1692 if (lock_style != LL_LOCK_STYLE_NOLOCK)
1693 ll_file_put_lock(inode, end, lock_style,
1694 cookie, &tree, OBD_BRW_READ);
1698 /* region is within kms and, hence, within real file size (A).
1699 * We need to increase i_size to cover the read region so that
1700 * generic_file_read() will do its job, but that doesn't mean
1701 * the kms size is _correct_, it is only the _minimum_ size.
1702 * If someone does a stat they will get the correct size which
1703 * will always be >= the kms value here. b=11081 */
1704 if (i_size_read(inode) < kms)
1705 i_size_write(inode, kms);
1706 ll_inode_size_unlock(inode, 1);
1709 chunk = end - *ppos + 1;
1710 CDEBUG(D_INODE,"Read ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
1711 inode->i_ino, chunk, *ppos, i_size_read(inode));
1713 if (lock_style != LL_LOCK_STYLE_NOLOCK) {
1714 /* turn off the kernel's read-ahead */
1715 file->f_ra.ra_pages = 0;
1717 /* initialize read-ahead window once per syscall */
1720 bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
1721 bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
1722 ll_ra_read_in(file, &bead);
1726 file_accessed(file);
1727 retval = generic_file_read(file, buf, chunk, ppos);
1728 ll_file_put_lock(inode, end, lock_style, cookie, &tree,
/* contended path: bypass the page cache and do server-locked I/O */
1731 retval = ll_file_lockless_io(file, buf, chunk, ppos, READ);
1734 ll_rw_stats_tally(sbi, current->pid, file, chunk, 0);
/* a full chunk with bytes remaining means loop for the next chunk */
1740 if (retval == chunk && count > 0)
1746 ll_ra_read_ex(file, &bead);
1747 retval = (sum > 0) ? sum : retval;
1752 * Write to a file (through the page cache).
/* Serializes writers via lli_write_sem, takes a tree lock over the write
 * range ([pos, EOF] for O_APPEND), enforces maxbytes/SIGXFSZ, then loops
 * delegating chunks to generic_file_write() or lockless I/O.
 *
 * NOTE(review): heavily elided listing — declarations (tree_locked, end of
 * the else branch setting lock_start/lock_end), loop structure and RETURN
 * are missing from this view. */
1754 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1757 struct inode *inode = file->f_dentry->d_inode;
1758 struct ll_sb_info *sbi = ll_i2sbi(inode);
1759 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1760 struct ll_lock_tree tree;
1761 loff_t maxbytes = ll_file_maxbytes(inode);
1762 loff_t lock_start, lock_end, end;
1763 ssize_t retval, chunk, sum = 0;
1767 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1768 inode->i_ino, inode->i_generation, inode, count, *ppos);
1770 SIGNAL_MASK_ASSERT(); /* XXX BUG 1511 */
1772 /* POSIX, but surprised the VFS doesn't check this already */
1776 /* If file was opened for LL_IOC_LOV_SETSTRIPE but the ioctl wasn't
1777 * called on the file, don't fail the below assertion (bug 2388). */
1778 if (file->f_flags & O_LOV_DELAY_CREATE &&
1779 ll_i2info(inode)->lli_smd == NULL)
1782 LASSERT(ll_i2info(inode)->lli_smd != NULL);
/* one writer at a time on this client per inode */
1784 down(&ll_i2info(inode)->lli_write_sem);
1787 chunk = 0; /* just to fix gcc's warning */
1788 end = *ppos + count - 1;
1790 if (file->f_flags & O_APPEND) {
1792 lock_end = OBD_OBJECT_EOF;
1793 } else if (sbi->ll_max_rw_chunk != 0) {
1794 /* first, let's know the end of the current stripe */
1796 obd_extent_calc(sbi->ll_dt_exp, lsm, OBD_CALC_STRIPE_END,
1799 /* correct, the end is beyond the request */
1800 if (end > *ppos + count - 1)
1801 end = *ppos + count - 1;
1803 /* and chunk shouldn't be too large even if striping is wide */
1804 if (end - *ppos > sbi->ll_max_rw_chunk)
1805 end = *ppos + sbi->ll_max_rw_chunk - 1;
1810 lock_end = *ppos + count - 1;
1813 tree_locked = ll_file_get_tree_lock(&tree, file, buf, count,
1814 lock_start, lock_end, OBD_BRW_WRITE);
1815 if (tree_locked < 0)
1816 GOTO(out, retval = tree_locked);
1818 /* This is ok, g_f_w will overwrite this under i_sem if it races
1819 * with a local truncate, it just makes our maxbyte checking easier.
1820 * The i_size value gets updated in ll_extent_lock() as a consequence
1821 * of the [0,EOF] extent lock we requested above. */
1822 if (file->f_flags & O_APPEND) {
1823 *ppos = i_size_read(inode);
1824 end = *ppos + count - 1;
1827 if (*ppos >= maxbytes) {
1828 send_sig(SIGXFSZ, current, 0);
1829 GOTO(out_unlock, retval = -EFBIG);
1831 if (end > maxbytes - 1)
1834 /* generic_file_write handles O_APPEND after getting i_mutex */
1835 chunk = end - *ppos + 1;
1836 CDEBUG(D_INFO, "Writing inode %lu, "LPSZ" bytes, offset %Lu\n",
1837 inode->i_ino, chunk, *ppos);
1839 retval = generic_file_write(file, buf, chunk, ppos);
/* contended path: server-locked write bypassing the page cache */
1841 retval = ll_file_lockless_io(file, (char*)buf, chunk,
1843 ll_rw_stats_tally(ll_i2sbi(inode), current->pid, file, chunk, 1);
1847 ll_tree_unlock(&tree);
/* a full chunk with bytes remaining means loop for the next chunk */
1854 if (retval == chunk && count > 0)
1858 up(&ll_i2info(inode)->lli_write_sem);
1860 retval = (sum > 0) ? sum : retval;
1861 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1862 retval > 0 ? retval : 0);
1867 * Send file content (through pagecache) somewhere with helper
/* sendfile(2) path: takes a PR tree lock over the region, fixes up i_size
 * from the merged lvb/kms (same A/B/C classification as ll_file_read),
 * then delegates to generic_file_sendfile().
 *
 * NOTE(review): elided listing — declarations (lvb, kms, rc, retval), some
 * branch bodies and the final RETURN are missing from this view. */
1869 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1870 read_actor_t actor, void *target)
1872 struct inode *inode = in_file->f_dentry->d_inode;
1873 struct ll_inode_info *lli = ll_i2info(inode);
1874 struct lov_stripe_md *lsm = lli->lli_smd;
1875 struct ll_lock_tree tree;
1876 struct ll_lock_tree_node *node;
1878 struct ll_ra_read bead;
1883 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1884 inode->i_ino, inode->i_generation, inode, count, *ppos);
1886 /* "If nbyte is 0, read() will return 0 and have no other results."
1887 * -- Single Unix Spec */
1891 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_READ_BYTES, count);
1892 /* turn off the kernel's read-ahead */
1893 in_file->f_ra.ra_pages = 0;
1895 /* File with no objects, nothing to lock */
1897 RETURN(generic_file_sendfile(in_file, ppos,count,actor,target));
1899 node = ll_node_from_inode(inode, *ppos, *ppos + count - 1, LCK_PR);
1901 RETURN(PTR_ERR(node));
1903 tree.lt_fd = LUSTRE_FPRIVATE(in_file);
1904 rc = ll_tree_lock(&tree, node, NULL, count,
1905 in_file->f_flags & O_NONBLOCK?LDLM_FL_BLOCK_NOWAIT:0);
1909 ll_clear_file_contended(inode);
1910 ll_inode_size_lock(inode, 1);
1912 * Consistency guarantees: following possibilities exist for the
1913 * relation between region being read and real file size at this
1916 * (A): the region is completely inside of the file;
1918 * (B-x): x bytes of region are inside of the file, the rest is
1921 * (C): the region is completely outside of the file.
1923 * This classification is stable under DLM lock acquired by
1924 * ll_tree_lock() above, because to change class, other client has to
1925 * take DLM lock conflicting with our lock. Also, any updates to
1926 * ->i_size by other threads on this client are serialized by
1927 * ll_inode_size_lock(). This guarantees that short reads are handled
1928 * correctly in the face of concurrent writes and truncates.
1930 inode_init_lvb(inode, &lvb);
1931 obd_merge_lvb(ll_i2sbi(inode)->ll_dt_exp, lsm, &lvb, 1);
1933 if (*ppos + count - 1 > kms) {
1934 /* A glimpse is necessary to determine whether we return a
1935 * short read (B) or some zeroes at the end of the buffer (C) */
1936 ll_inode_size_unlock(inode, 1);
1937 retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
1941 /* region is within kms and, hence, within real file size (A) */
1942 i_size_write(inode, kms);
1943 ll_inode_size_unlock(inode, 1);
1946 CDEBUG(D_INFO, "Send ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
1947 inode->i_ino, count, *ppos, i_size_read(inode));
/* set up the llite read-ahead window for the whole request */
1949 bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
1950 bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
1951 ll_ra_read_in(in_file, &bead);
1953 file_accessed(in_file);
1954 retval = generic_file_sendfile(in_file, ppos, count, actor, target);
1955 ll_ra_read_ex(in_file, &bead);
1958 ll_tree_unlock(&tree);
/*
 * LL_IOC_RECREATE_OBJ ioctl handler: recreate a lost OST object for this
 * file.  Root-only (CFS_CAP_SYS_ADMIN); copies a ll_recreate_obj request
 * from userspace, clones the file's lsm and calls obd_create() with
 * OBD_FL_RECREATE_OBJS set.
 *
 * NOTE(review): elided listing — oa allocation, the lsm lookup guarded by
 * lli_size_sem, error paths and RETURN are missing from this view.
 */
1962 static int ll_lov_recreate_obj(struct inode *inode, struct file *file,
1965 struct ll_inode_info *lli = ll_i2info(inode);
1966 struct obd_export *exp = ll_i2dtexp(inode);
1967 struct ll_recreate_obj ucreatp;
1968 struct obd_trans_info oti = { 0 };
1969 struct obdo *oa = NULL;
1972 struct lov_stripe_md *lsm, *lsm2;
1975 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1978 rc = copy_from_user(&ucreatp, (struct ll_recreate_obj *)arg,
1979 sizeof(struct ll_recreate_obj));
/* lli_size_sem protects lli_smd while we clone it */
1987 down(&lli->lli_size_sem);
1990 GOTO(out, rc = -ENOENT);
1991 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1992 (lsm->lsm_stripe_count));
1994 OBD_ALLOC(lsm2, lsm_size);
1996 GOTO(out, rc = -ENOMEM);
/* target object identity comes from the userspace request; o_nlink is
 * reused to carry the OST index for recreation */
1998 oa->o_id = ucreatp.lrc_id;
1999 oa->o_gr = ucreatp.lrc_group;
2000 oa->o_nlink = ucreatp.lrc_ost_idx;
2001 oa->o_flags |= OBD_FL_RECREATE_OBJS;
2002 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
2003 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
2004 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
2006 memcpy(lsm2, lsm, lsm_size);
2007 rc = obd_create(exp, oa, &lsm2, &oti);
2009 OBD_FREE(lsm2, lsm_size);
2012 up(&lli->lli_size_sem);
/*
 * Apply striping information (\a lum) to a file by replaying an IT_OPEN
 * intent carrying the lov_user_md, then releasing the resulting open
 * handle.  Fails if the file already has a stripe (lli_smd set).
 *
 * NOTE(review): elided listing — the lli_smd check before the first up(),
 * intermediate error handling and RETURN are missing from this view.
 */
2017 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
2018 int flags, struct lov_user_md *lum, int lum_size)
2020 struct ll_inode_info *lli = ll_i2info(inode);
2021 struct lov_stripe_md *lsm;
2022 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
/* lli_size_sem guards the "stripe already exists" check */
2026 down(&lli->lli_size_sem);
2029 up(&lli->lli_size_sem);
2030 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
2035 rc = ll_intent_file_open(file, lum, lum_size, &oit);
2038 if (it_disposition(&oit, DISP_LOOKUP_NEG))
2039 GOTO(out_req_free, rc = -ENOENT);
2040 rc = oit.d.lustre.it_status;
2042 GOTO(out_req_free, rc);
/* the open was only a vehicle for the setstripe; close it immediately */
2044 ll_release_openhandle(file->f_dentry, &oit);
2047 up(&lli->lli_size_sem);
2048 ll_intent_release(&oit);
2051 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch a file's LOV EA (striping descriptor) from the MDS by name,
 * byte-swap it to host endianness for userspace, and expand JOIN-format
 * metadata into a lov_user_md_join the caller can consume.  On success
 * *lmmp/*lmm_size describe the (possibly converted) EA and *request holds
 * the reply buffer it points into — caller must finish the request.
 *
 * NOTE(review): elided listing — declarations (rc, lmmsize), several
 * early-out checks and the out:/RETURN tail are missing from this view.
 */
2055 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
2056 struct lov_mds_md **lmmp, int *lmm_size,
2057 struct ptlrpc_request **request)
2059 struct ll_sb_info *sbi = ll_i2sbi(inode);
2060 struct mdt_body *body;
2061 struct lov_mds_md *lmm = NULL;
2062 struct ptlrpc_request *req = NULL;
2063 struct obd_capa *oc;
2066 rc = ll_get_max_mdsize(sbi, &lmmsize);
2070 oc = ll_mdscapa_get(inode);
2071 rc = md_getattr_name(sbi->ll_md_exp, ll_inode2fid(inode),
2072 oc, filename, strlen(filename) + 1,
2073 OBD_MD_FLEASIZE | OBD_MD_FLDIREA, lmmsize,
2074 ll_i2suppgid(inode), &req);
2077 CDEBUG(D_INFO, "md_getattr_name failed "
2078 "on %s: rc %d\n", filename, rc);
2082 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
2083 LASSERT(body != NULL); /* checked by mdc_getattr_name */
2085 lmmsize = body->eadatasize;
2087 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
2089 GOTO(out, rc = -ENODATA);
2092 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
2093 LASSERT(lmm != NULL);
/* only V1, V3 and JOIN magics (wire/little-endian) are understood here */
2095 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
2096 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3)) &&
2097 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_JOIN))) {
2098 GOTO(out, rc = -EPROTO);
2102 * This is coming from the MDS, so is probably in
2103 * little endian. We convert it to host endian before
2104 * passing it to userspace.
2106 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
2107 /* if function called for directory - we should
2108 * avoid swab not existent lsm objects */
2109 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
2110 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
2111 if (S_ISREG(body->mode))
2112 lustre_swab_lov_user_md_objects(
2113 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
2114 ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
2115 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2116 lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
2117 if (S_ISREG(body->mode))
2118 lustre_swab_lov_user_md_objects(
2119 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
2120 ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
2121 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_JOIN)) {
2122 lustre_swab_lov_user_md_join((struct lov_user_md_join *)lmm);
/* JOIN files: unpack the wire EA into an lsm, then rebuild a flat
 * per-object lov_user_md_join for userspace */
2126 if (lmm->lmm_magic == LOV_MAGIC_JOIN) {
2127 struct lov_stripe_md *lsm;
2128 struct lov_user_md_join *lmj;
2129 int lmj_size, i, aindex = 0;
2131 rc = obd_unpackmd(sbi->ll_dt_exp, &lsm, lmm, lmmsize);
2133 GOTO(out, rc = -ENOMEM);
2134 rc = obd_checkmd(sbi->ll_dt_exp, sbi->ll_md_exp, lsm);
2136 GOTO(out_free_memmd, rc);
2138 lmj_size = sizeof(struct lov_user_md_join) +
2139 lsm->lsm_stripe_count *
2140 sizeof(struct lov_user_ost_data_join);
2141 OBD_ALLOC(lmj, lmj_size);
2143 GOTO(out_free_memmd, rc = -ENOMEM);
2145 memcpy(lmj, lmm, sizeof(struct lov_user_md_join));
2146 for (i = 0; i < lsm->lsm_stripe_count; i++) {
2147 struct lov_extent *lex =
2148 &lsm->lsm_array->lai_ext_array[aindex];
/* advance to the extent covering stripe i — presumably; aindex
 * increment is in an elided line, confirm against full source */
2150 if (lex->le_loi_idx + lex->le_stripe_count <= i)
2152 CDEBUG(D_INFO, "aindex %d i %d l_extent_start "
2153 LPU64" len %d\n", aindex, i,
2154 lex->le_start, (int)lex->le_len);
2155 lmj->lmm_objects[i].l_extent_start =
2158 if ((int)lex->le_len == -1)
2159 lmj->lmm_objects[i].l_extent_end = -1;
2161 lmj->lmm_objects[i].l_extent_end =
2162 lex->le_start + lex->le_len;
2163 lmj->lmm_objects[i].l_object_id =
2164 lsm->lsm_oinfo[i]->loi_id;
2165 lmj->lmm_objects[i].l_object_gr =
2166 lsm->lsm_oinfo[i]->loi_gr;
2167 lmj->lmm_objects[i].l_ost_gen =
2168 lsm->lsm_oinfo[i]->loi_ost_gen;
2169 lmj->lmm_objects[i].l_ost_idx =
2170 lsm->lsm_oinfo[i]->loi_ost_idx;
2172 lmm = (struct lov_mds_md *)lmj;
2175 obd_free_memmd(sbi->ll_dt_exp, &lsm);
2179 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA ioctl handler: root-only path that copies a full
 * lov_user_md (with one lov_user_ost_data) from userspace and applies it
 * via ll_lov_setstripe_ea_info() with MDS_OPEN_HAS_OBJS.
 * NOTE(review): elided listing — NULL-check after OBD_ALLOC and the RETURN
 * lines are missing from this view.
 */
2184 static int ll_lov_setea(struct inode *inode, struct file *file,
2187 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2188 struct lov_user_md *lump;
2189 int lum_size = sizeof(struct lov_user_md) +
2190 sizeof(struct lov_user_ost_data);
2194 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2197 OBD_ALLOC(lump, lum_size);
2201 rc = copy_from_user(lump, (struct lov_user_md *)arg, lum_size);
2203 OBD_FREE(lump, lum_size);
2207 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
2209 OBD_FREE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE ioctl handler: copy a lov_user_md (v1 first, re-copy
 * as v3 if the magic says so) from userspace, apply it, then echo the
 * resulting stripe back to userspace via LL_IOC_LOV_GETSTRIPE.
 * NOTE(review): elided listing — error checks after the copies and the
 * RETURN are missing from this view.
 */
2213 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2216 struct lov_user_md_v3 lumv3;
2217 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
2218 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
2219 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
2222 int flags = FMODE_WRITE;
2225 /* first try with v1 which is smaller than v3 */
2226 lum_size = sizeof(struct lov_user_md_v1);
2227 rc = copy_from_user(lumv1, lumv1p, lum_size);
2231 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
2232 lum_size = sizeof(struct lov_user_md_v3);
2233 rc = copy_from_user(&lumv3, lumv3p, lum_size);
2238 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
/* zero userspace stripe_count first so a short GETSTRIPE reply is safe */
2240 put_user(0, &lumv1p->lmm_stripe_count);
2241 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
2242 0, ll_i2info(inode)->lli_smd,
/*
 * LL_IOC_LOV_GETSTRIPE ioctl handler: forward the file's current lsm to the
 * LOV layer to be packed into the userspace buffer.
 * NOTE(review): elided listing — the lsm NULL-check and the final argument
 * of obd_iocontrol() are missing from this view.
 */
2248 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
2250 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
2255 return obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0, lsm,
/*
 * Take a GROUP-mode extent lock over the whole file ([0, EOF]) with group
 * id \a arg, remembering the lock handle in the file descriptor.  Sets
 * LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK so subsequent I/O on this fd
 * skips normal extent locking.
 * NOTE(review): elided listing — the flags/rc declarations and the
 * already-locked branch body are missing from this view.
 */
2259 static int ll_get_grouplock(struct inode *inode, struct file *file,
2262 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2263 ldlm_policy_data_t policy = { .l_extent = { .start = 0,
2264 .end = OBD_OBJECT_EOF}};
2265 struct lustre_handle lockh = { 0 };
2266 struct ll_inode_info *lli = ll_i2info(inode);
2267 struct lov_stripe_md *lsm = lli->lli_smd;
2271 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2275 policy.l_extent.gid = arg;
2276 if (file->f_flags & O_NONBLOCK)
2277 flags = LDLM_FL_BLOCK_NOWAIT;
2279 rc = ll_extent_lock(fd, inode, lsm, LCK_GROUP, &policy, &lockh, flags);
2283 fd->fd_flags |= LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK;
2285 memcpy(&fd->fd_cwlockh, &lockh, sizeof(lockh));
/*
 * Drop the group lock taken by ll_get_grouplock(): validates the fd holds
 * a group lock with the matching gid, clears the flags, cancels the lock
 * and zeroes the stored handle.
 * NOTE(review): elided listing — rc declaration, error-branch bodies and
 * RETURN are missing from this view.
 */
2290 static int ll_put_grouplock(struct inode *inode, struct file *file,
2293 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2294 struct ll_inode_info *lli = ll_i2info(inode);
2295 struct lov_stripe_md *lsm = lli->lli_smd;
2299 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2300 /* Ugh, it's already unlocked. */
2304 if (fd->fd_gid != arg) /* Ugh? Unlocking with different gid? */
2307 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
2309 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP, &fd->fd_cwlockh);
2314 memset(&fd->fd_cwlockh, 0, sizeof(fd->fd_cwlockh));
2319 #if LUSTRE_FIX >= 50
/*
 * Validate a file-join request: the server must advertise LL_SBI_JOIN,
 * both inodes must be regular files, distinct, and the head's size must be
 * a multiple of JOIN_FILE_ALIGN (64K).
 * NOTE(review): elided listing — the per-branch error RETURNs and the
 * success return are missing from this view.
 */
2320 static int join_sanity_check(struct inode *head, struct inode *tail)
2323 if ((ll_i2sbi(head)->ll_flags & LL_SBI_JOIN) == 0) {
2324 CERROR("server do not support join \n");
2327 if (!S_ISREG(tail->i_mode) || !S_ISREG(head->i_mode)) {
2328 CERROR("tail ino %lu and ino head %lu must be regular\n",
2329 head->i_ino, tail->i_ino);
2332 if (head->i_ino == tail->i_ino) {
2333 CERROR("file %lu can not be joined to itself \n", head->i_ino);
2336 if (i_size_read(head) % JOIN_FILE_ALIGN) {
2337 CERROR("hsize %llu must be times of 64K\n", i_size_read(head));
2343 static int join_file(struct inode *head_inode, struct file *head_filp,
2344 struct file *tail_filp)
2346 struct dentry *tail_dentry = tail_filp->f_dentry;
2347 struct lookup_intent oit = {.it_op = IT_OPEN,
2348 .it_flags = head_filp->f_flags|O_JOIN_FILE};
2349 struct ldlm_enqueue_info einfo = { LDLM_IBITS, LCK_CW,
2350 ll_md_blocking_ast, ldlm_completion_ast, NULL, NULL };
2352 struct lustre_handle lockh;
2353 struct md_op_data *op_data;
2358 tail_dentry = tail_filp->f_dentry;
2360 data = i_size_read(head_inode);
2361 op_data = ll_prep_md_op_data(NULL, head_inode,
2362 tail_dentry->d_parent->d_inode,
2363 tail_dentry->d_name.name,
2364 tail_dentry->d_name.len, 0,
2365 LUSTRE_OPC_ANY, &data);
2366 if (IS_ERR(op_data))
2367 RETURN(PTR_ERR(op_data));
2369 rc = md_enqueue(ll_i2mdexp(head_inode), &einfo, &oit,
2370 op_data, &lockh, NULL, 0, NULL, 0);
2372 ll_finish_md_op_data(op_data);
2376 rc = oit.d.lustre.it_status;
2378 if (rc < 0 || it_open_error(DISP_OPEN_OPEN, &oit)) {
2379 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, &oit);
2380 ptlrpc_req_finished((struct ptlrpc_request *)
2381 oit.d.lustre.it_data);
2385 if (oit.d.lustre.it_lock_mode) { /* If we got lock - release it right
2387 ldlm_lock_decref(&lockh, oit.d.lustre.it_lock_mode);
2388 oit.d.lustre.it_lock_mode = 0;
2390 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
2391 it_clear_disposition(&oit, DISP_ENQ_COMPLETE);
2392 ll_release_openhandle(head_filp->f_dentry, &oit);
2394 ll_intent_release(&oit);
/*
 * LL_IOC_JOIN top level: open the tail file by name, take EX whole-file
 * tree locks on both inodes in ino order (to avoid deadlock), sanity-check
 * the pair and perform the join; cleanup is staged via cleanup_phase with
 * fall-through unwinding.  On success the head's cached lsm is freed so it
 * is refetched with the joined layout.
 * NOTE(review): elided listing — several GOTO cleanup lines, phase
 * assignments, case labels and the RETURN are missing from this view.
 */
2398 static int ll_file_join(struct inode *head, struct file *filp,
2399 char *filename_tail)
2401 struct inode *tail = NULL, *first = NULL, *second = NULL;
2402 struct dentry *tail_dentry;
2403 struct file *tail_filp, *first_filp, *second_filp;
2404 struct ll_lock_tree first_tree, second_tree;
2405 struct ll_lock_tree_node *first_node, *second_node;
2406 struct ll_inode_info *hlli = ll_i2info(head), *tlli;
2407 int rc = 0, cleanup_phase = 0;
2410 CDEBUG(D_VFSTRACE, "VFS Op:head=%lu/%u(%p) tail %s\n",
2411 head->i_ino, head->i_generation, head, filename_tail);
2413 tail_filp = filp_open(filename_tail, O_WRONLY, 0644);
2414 if (IS_ERR(tail_filp)) {
2415 CERROR("Can not open tail file %s", filename_tail);
2416 rc = PTR_ERR(tail_filp);
2419 tail = igrab(tail_filp->f_dentry->d_inode);
2421 tlli = ll_i2info(tail);
2422 tail_dentry = tail_filp->f_dentry;
2423 LASSERT(tail_dentry);
2426 /*reorder the inode for lock sequence*/
2427 first = head->i_ino > tail->i_ino ? head : tail;
2428 second = head->i_ino > tail->i_ino ? tail : head;
2429 first_filp = head->i_ino > tail->i_ino ? filp : tail_filp;
2430 second_filp = head->i_ino > tail->i_ino ? tail_filp : filp;
2432 CDEBUG(D_INFO, "reorder object from %lu:%lu to %lu:%lu \n",
2433 head->i_ino, tail->i_ino, first->i_ino, second->i_ino);
2434 first_node = ll_node_from_inode(first, 0, OBD_OBJECT_EOF, LCK_EX);
2435 if (IS_ERR(first_node)){
2436 rc = PTR_ERR(first_node);
2439 first_tree.lt_fd = first_filp->private_data;
2440 rc = ll_tree_lock(&first_tree, first_node, NULL, 0, 0);
2445 second_node = ll_node_from_inode(second, 0, OBD_OBJECT_EOF, LCK_EX);
2446 if (IS_ERR(second_node)){
2447 rc = PTR_ERR(second_node);
2450 second_tree.lt_fd = second_filp->private_data;
2451 rc = ll_tree_lock(&second_tree, second_node, NULL, 0, 0);
2456 rc = join_sanity_check(head, tail);
2460 rc = join_file(head, filp, tail_filp);
/* staged unwind: each phase falls through to undo everything acquired
 * before it */
2464 switch (cleanup_phase) {
2466 ll_tree_unlock(&second_tree);
2467 obd_cancel_unused(ll_i2dtexp(second),
2468 ll_i2info(second)->lli_smd, 0, NULL);
2470 ll_tree_unlock(&first_tree);
2471 obd_cancel_unused(ll_i2dtexp(first),
2472 ll_i2info(first)->lli_smd, 0, NULL);
2474 filp_close(tail_filp, 0);
/* on success, drop the head's cached stripe md so the joined layout is
 * re-read from the MDS */
2477 if (head && rc == 0) {
2478 obd_free_memmd(ll_i2sbi(head)->ll_dt_exp,
2480 hlli->lli_smd = NULL;
2485 CERROR("invalid cleanup_phase %d\n", cleanup_phase);
2490 #endif /* LUSTRE_FIX >= 50 */
2493 * Close the MDS open handle associated with an inode.
2495 * \param dentry [in] dentry which contains the inode
2496 * \param it [in,out] intent which contains the open info and result
2499 * \retval <0 failure
/*
 * Close the MDS open handle carried in @it for @dentry's inode.
 * Used when an intent produced an open handle that will not be consumed
 * by a normal ll_file_open() path.  No-op for the filesystem root or when
 * the intent holds no DISP_OPEN_OPEN disposition.
 */
2501 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2503 struct inode *inode = dentry->d_inode;
2504 struct obd_client_handle *och;
2510 /* Root ? Do nothing. */
2511 if (dentry->d_inode->i_sb->s_root == dentry)
2514 /* No open handle to close? Move away */
2515 if (!it_disposition(it, DISP_OPEN_OPEN))
2518 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
/* Temporary client handle, filled from the intent and closed below. */
2520 OBD_ALLOC(och, sizeof(*och));
2522 GOTO(out, rc = -ENOMEM);
2524 ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
2525 ll_i2info(inode), it, och);
2527 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/* Drop the enqueue request reference that ll_file_open would normally
 * have consumed. */
2530 /* this one is in place of ll_file_open */
2531 if (it_disposition(it, DISP_ENQ_OPEN_REF))
2532 ptlrpc_req_finished(it->d.lustre.it_data);
2533 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2538 * Get the size of the inode for which the FIEMAP mapping is requested,
2539 * make the FIEMAP get_info call, and return the result.
/*
 * Fill in the FIEMAP extent mapping for @inode by asking the data (OST)
 * export via obd_get_info(KEY_FIEMAP).  @fiemap is both input (requested
 * range/flags/extent count) and output (mapped extents); num_bytes is the
 * total size of the user-supplied fiemap buffer.
 */
2541 int ll_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
2544 struct obd_export *exp = ll_i2dtexp(inode);
2545 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
2546 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
2547 int vallen = num_bytes;
2551 /* If the stripe_count > 1 and the application does not understand
2552 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
2554 if (lsm->lsm_stripe_count > 1 &&
2555 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
/* Identify the object the OSTs should map. */
2558 fm_key.oa.o_id = lsm->lsm_object_id;
2559 fm_key.oa.o_gr = lsm->lsm_object_gr;
2560 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2562 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLFID | OBD_MD_FLGROUP |
2565 /* If filesize is 0, then there would be no objects for mapping */
2566 if (fm_key.oa.o_size == 0) {
2567 fiemap->fm_mapped_extents = 0;
/* Pass the caller's request through to the OSC/LOV layer; the reply is
 * written back into @fiemap in place. */
2571 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
2573 rc = obd_get_info(exp, sizeof(fm_key), &fm_key, &vallen, fiemap, lsm);
2575 CERROR("obd_get_info failed: rc = %d\n", rc);
/*
 * ioctl entry point for regular Lustre files (wired into the
 * file_operations tables below).  Dispatches Lustre-private commands
 * (striping, group locks, flags, FIEMAP, join), falls back to
 * dynamically-registered handlers via ll_iocontrol_call(), and finally
 * to obd_iocontrol() on the data export.
 */
2580 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
2583 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2587 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2588 inode->i_generation, inode, cmd);
2589 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2591 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2592 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2596 case LL_IOC_GETFLAGS:
2597 /* Get the current value of the file flags */
2598 return put_user(fd->fd_flags, (int *)arg);
2599 case LL_IOC_SETFLAGS:
2600 case LL_IOC_CLRFLAGS:
2601 /* Set or clear specific file flags */
2602 /* XXX This probably needs checks to ensure the flags are
2603 * not abused, and to handle any flag side effects.
2605 if (get_user(flags, (int *) arg))
2608 if (cmd == LL_IOC_SETFLAGS) {
/* Disabling locking is only safe when the app bypasses the page
 * cache with O_DIRECT. */
2609 if ((flags & LL_FILE_IGNORE_LOCK) &&
2610 !(file->f_flags & O_DIRECT)) {
2611 CERROR("%s: unable to disable locking on "
2612 "non-O_DIRECT file\n", current->comm);
2616 fd->fd_flags |= flags;
2618 fd->fd_flags &= ~flags;
2621 case LL_IOC_LOV_SETSTRIPE:
2622 RETURN(ll_lov_setstripe(inode, file, arg));
2623 case LL_IOC_LOV_SETEA:
2624 RETURN(ll_lov_setea(inode, file, arg));
2625 case LL_IOC_LOV_GETSTRIPE:
2626 RETURN(ll_lov_getstripe(inode, arg));
2627 case LL_IOC_RECREATE_OBJ:
2628 RETURN(ll_lov_recreate_obj(inode, file, arg));
2629 case EXT3_IOC_FIEMAP: {
2630 struct ll_user_fiemap *fiemap_s;
2631 size_t num_bytes, ret_bytes;
2632 unsigned int extent_count;
2635 /* Get the extent count so we can calculate the size of
2636 * required fiemap buffer */
2637 if (get_user(extent_count,
2638 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
/* Buffer = fixed fiemap header + one slot per requested extent. */
2640 num_bytes = sizeof(*fiemap_s) + (extent_count *
2641 sizeof(struct ll_fiemap_extent));
2642 OBD_VMALLOC(fiemap_s, num_bytes);
2643 if (fiemap_s == NULL)
2646 if (copy_from_user(fiemap_s,(struct ll_user_fiemap __user *)arg,
2648 GOTO(error, rc = -EFAULT);
/* Reject flags we don't support; report the unsupported subset back
 * to userspace (standard FIEMAP protocol) before failing. */
2650 if (fiemap_s->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2651 fiemap_s->fm_flags = fiemap_s->fm_flags &
2652 ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2653 if (copy_to_user((char *)arg, fiemap_s,
2655 GOTO(error, rc = -EFAULT);
2657 GOTO(error, rc = -EBADR);
2660 /* If fm_extent_count is non-zero, read the first extent since
2661 * it is used to calculate end_offset and device from previous
2664 if (copy_from_user(&fiemap_s->fm_extents[0],
2665 (char __user *)arg + sizeof(*fiemap_s),
2666 sizeof(struct ll_fiemap_extent)))
2667 GOTO(error, rc = -EFAULT);
/* FIEMAP_FLAG_SYNC: flush dirty pages first so the mapping reflects
 * current data. */
2670 if (fiemap_s->fm_flags & FIEMAP_FLAG_SYNC) {
2673 rc = filemap_fdatawrite(inode->i_mapping);
2678 rc = ll_fiemap(inode, fiemap_s, num_bytes);
/* Copy back only what was actually filled in. */
2682 ret_bytes = sizeof(struct ll_user_fiemap);
2684 if (extent_count != 0)
2685 ret_bytes += (fiemap_s->fm_mapped_extents *
2686 sizeof(struct ll_fiemap_extent));
2688 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
2692 OBD_VFREE(fiemap_s, num_bytes);
2695 case EXT3_IOC_GETFLAGS:
2696 case EXT3_IOC_SETFLAGS:
2697 RETURN(ll_iocontrol(inode, file, cmd, arg));
2698 case EXT3_IOC_GETVERSION_OLD:
2699 case EXT3_IOC_GETVERSION:
2700 RETURN(put_user(inode->i_generation, (int *)arg));
2702 #if LUSTRE_FIX >= 50
2703 /* Allow file join in beta builds to allow debugging */
2707 ftail = getname((const char *)arg);
2709 RETURN(PTR_ERR(ftail));
2710 rc = ll_file_join(inode, file, ftail);
2714 CWARN("file join is not supported in this version of Lustre\n");
2718 case LL_IOC_GROUP_LOCK:
2719 RETURN(ll_get_grouplock(inode, file, arg));
2720 case LL_IOC_GROUP_UNLOCK:
2721 RETURN(ll_put_grouplock(inode, file, arg));
2722 case IOC_OBD_STATFS:
2723 RETURN(ll_obd_statfs(inode, (void *)arg));
2725 /* We need to special case any other ioctls we want to handle,
2726 * to send them to the MDS/OST as appropriate and to properly
2727 * network encode the arg field.
2728 case EXT3_IOC_SETVERSION_OLD:
2729 case EXT3_IOC_SETVERSION:
2731 case LL_IOC_FLUSHCTX:
2732 RETURN(ll_flush_ctx(inode));
/* Unknown command: try dynamically registered handlers, then hand it
 * to the underlying OBD device as a last resort. */
2737 ll_iocontrol_call(inode, file, cmd, arg, &err))
2740 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
/*
 * llseek for Lustre files.  For SEEK_END the current file size must first
 * be fetched from the OSTs with a glimpse lock, since another client may
 * have extended the file; O_NONBLOCK files request a non-blocking glimpse.
 */
2746 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2748 struct inode *inode = file->f_dentry->d_inode;
2749 struct ll_inode_info *lli = ll_i2info(inode);
2750 struct lov_stripe_md *lsm = lli->lli_smd;
/* retval here is computed only for the trace message below. */
2753 retval = offset + ((origin == 2) ? i_size_read(inode) :
2754 (origin == 1) ? file->f_pos : 0);
2755 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%Lu=%#Lx(%s)\n",
2756 inode->i_ino, inode->i_generation, inode, retval, retval,
2757 origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR" : "SEEK_SET");
2758 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2760 if (origin == 2) { /* SEEK_END */
2761 int nonblock = 0, rc;
2763 if (file->f_flags & O_NONBLOCK)
2764 nonblock = LDLM_FL_BLOCK_NOWAIT;
/* Refresh i_size from the OSTs before computing the end offset. */
2767 rc = ll_glimpse_size(inode, nonblock);
/* Read the (possibly updated) size under the inode size lock. */
2772 ll_inode_size_lock(inode, 0);
2773 offset += i_size_read(inode);
2774 ll_inode_size_unlock(inode, 0);
2775 } else if (origin == 1) { /* SEEK_CUR */
2776 offset += file->f_pos;
/* Accept the new position only if it is within the client's maximum
 * supported file size. */
2780 if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
2781 if (offset != file->f_pos) {
2782 file->f_pos = offset;
/*
 * fsync for Lustre files: wait for in-flight page writeback, surface any
 * recorded async write errors, sync metadata through the MDC (md_sync),
 * and, when the file has objects, sync data through the OSC (obd_sync).
 */
2790 int ll_fsync(struct file *file, struct dentry *dentry, int data)
2792 struct inode *inode = dentry->d_inode;
2793 struct ll_inode_info *lli = ll_i2info(inode);
2794 struct lov_stripe_md *lsm = lli->lli_smd;
2795 struct ptlrpc_request *req;
2796 struct obd_capa *oc;
2799 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2800 inode->i_generation, inode);
2801 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2803 /* fsync's caller has already called _fdata{sync,write}, we want
2804 * that IO to finish before calling the osc and mdc sync methods */
2805 rc = filemap_fdatawait(inode->i_mapping);
2807 /* catch async errors that were recorded back when async writeback
2808 * failed for pages in this mapping. */
2809 err = lli->lli_async_rc;
2810 lli->lli_async_rc = 0;
2814 err = lov_test_and_clear_async_rc(lsm);
/* Metadata sync via the MDS; capability (if any) authorizes the op. */
2819 oc = ll_mdscapa_get(inode);
2820 err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2826 ptlrpc_req_finished(req);
2833 RETURN(rc ? rc : -ENOMEM);
/* Data sync: describe the object set and flush [0, EOF] on the OSTs. */
2835 oa->o_id = lsm->lsm_object_id;
2836 oa->o_gr = lsm->lsm_object_gr;
2837 oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2838 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
2839 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
2842 oc = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2843 err = obd_sync(ll_i2sbi(inode)->ll_dt_exp, oa, lsm,
2844 0, OBD_OBJECT_EOF, oc);
/*
 * POSIX fcntl()/BSD flock() lock handler.  Translates the kernel
 * file_lock into an LDLM flock enqueue on the MDS so locks are
 * cluster-coherent, then mirrors the result into the local lock tables
 * (ll_flock_lock_file_wait / posix_lock_file_wait) so the VFS sees it.
 */
2854 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2856 struct inode *inode = file->f_dentry->d_inode;
2857 struct ll_sb_info *sbi = ll_i2sbi(inode);
2858 struct ldlm_enqueue_info einfo = { .ei_type = LDLM_FLOCK,
2859 .ei_cb_cp =ldlm_flock_completion_ast,
2860 .ei_cbdata = file_lock };
2861 struct md_op_data *op_data;
2862 struct lustre_handle lockh = {0};
2863 ldlm_policy_data_t flock;
2868 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2869 inode->i_ino, file_lock);
2871 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2873 if (file_lock->fl_flags & FL_FLOCK) {
2874 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2875 /* set missing params for flock() calls */
2876 file_lock->fl_end = OFFSET_MAX;
2877 file_lock->fl_pid = current->tgid;
/* Policy data carries owner pid and byte range to the lock server. */
2879 flock.l_flock.pid = file_lock->fl_pid;
2880 flock.l_flock.start = file_lock->fl_start;
2881 flock.l_flock.end = file_lock->fl_end;
/* Map the POSIX lock type onto an LDLM mode (read->PR, write->PW,
 * unlock->NL; case labels not all visible in this listing). */
2883 switch (file_lock->fl_type) {
2885 einfo.ei_mode = LCK_PR;
2888 /* An unlock request may or may not have any relation to
2889 * existing locks so we may not be able to pass a lock handle
2890 * via a normal ldlm_lock_cancel() request. The request may even
2891 * unlock a byte range in the middle of an existing lock. In
2892 * order to process an unlock request we need all of the same
2893 * information that is given with a normal read or write record
2894 * lock request. To avoid creating another ldlm unlock (cancel)
2895 * message we'll treat a LCK_NL flock request as an unlock. */
2896 einfo.ei_mode = LCK_NL;
2899 einfo.ei_mode = LCK_PW;
2902 CERROR("unknown fcntl lock type: %d\n", file_lock->fl_type);
/* Map the command: F_SETLK is non-blocking, F_GETLK is a test only. */
2917 flags = LDLM_FL_BLOCK_NOWAIT;
2923 flags = LDLM_FL_TEST_LOCK;
2924 /* Save the old mode so that if the mode in the lock changes we
2925 * can decrement the appropriate reader or writer refcount. */
2926 file_lock->fl_type = einfo.ei_mode;
2929 CERROR("unknown fcntl lock command: %d\n", cmd);
2933 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2934 LUSTRE_OPC_ANY, NULL);
2935 if (IS_ERR(op_data))
2936 RETURN(PTR_ERR(op_data));
2938 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2939 "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2940 flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
/* The actual cluster-wide lock request, processed by the MDS. */
2942 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2943 op_data, &lockh, &flock, 0, NULL /* req */, flags);
2945 ll_finish_md_op_data(op_data);
/* Record the successful lock (or unlock) in the local VFS state. */
2947 if ((file_lock->fl_flags & FL_FLOCK) &&
2948 (rc == 0 || file_lock->fl_type == F_UNLCK))
2949 ll_flock_lock_file_wait(file, file_lock, (cmd == F_SETLKW));
2950 #ifdef HAVE_F_OP_FLOCK
2951 if ((file_lock->fl_flags & FL_POSIX) &&
2952 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2953 !(flags & LDLM_FL_TEST_LOCK))
2954 posix_lock_file_wait(file, file_lock);
/*
 * Lock handler installed for -o noflock mounts (see
 * ll_file_operations_noflock below, which notes flock calls return
 * ENOSYS).  Body not visible in this listing.
 */
2960 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
/*
 * Test (without acquiring) whether this client already holds a granted MDS
 * inodebits lock covering @bits for @inode, in any of CR/CW/PR/PW modes.
 * LDLM_FL_TEST_LOCK makes the match side-effect free.
 */
2967 int ll_have_md_lock(struct inode *inode, __u64 bits)
2969 struct lustre_handle lockh;
2970 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2978 fid = &ll_i2info(inode)->lli_fid;
2979 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2981 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2982 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2983 LCK_CR|LCK_CW|LCK_PR|LCK_PW, &lockh)) {
/*
 * Like ll_have_md_lock() but actually takes a reference on the matched
 * inodebits lock (no LDLM_FL_TEST_LOCK), returning its mode and filling
 * @lockh; the caller is responsible for the matching decref.
 */
2989 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2990 struct lustre_handle *lockh)
2992 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2998 fid = &ll_i2info(inode)->lli_fid;
2999 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3001 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING;
3002 rc = md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
3003 LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
/*
 * Common tail for revalidation failures: -ENOENT means the inode was
 * unlinked by another client, which is tolerated (treated as success)
 * for non-regular, non-directory inodes; other errors are reported.
 */
3007 static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
3008 if (rc == -ENOENT) { /* Already unlinked. Just update nlink
3009 * and return success */
3011 /* This path cannot be hit for regular files unless in
3012 * case of obscure races, so no need to to validate
3014 if (!S_ISREG(inode->i_mode) &&
3015 !S_ISDIR(inode->i_mode))
3020 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
/*
 * Revalidate a dentry's inode attributes against the MDS.
 * Two strategies: if the server supports OBD_CONNECT_ATTRFID, do a
 * getattr-by-FID intent lock (md_intent_lock); otherwise, if we do not
 * already hold an UPDATE|LOOKUP inodebits lock, fetch attributes with a
 * plain md_getattr.  Finally glimpse the file size from the OSTs.
 */
3028 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
3030 struct inode *inode = dentry->d_inode;
3031 struct ptlrpc_request *req = NULL;
3032 struct ll_sb_info *sbi;
3033 struct obd_export *exp;
/* Guard for a NULL inode -- condition line not visible in this listing. */
3038 CERROR("REPORT THIS LINE TO PETER\n");
3041 sbi = ll_i2sbi(inode);
3043 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
3044 inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
3046 exp = ll_i2mdexp(inode);
3048 if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
3049 struct lookup_intent oit = { .it_op = IT_GETATTR };
3050 struct md_op_data *op_data;
3052 /* Call getattr by fid, so do not provide name at all. */
3053 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
3054 dentry->d_inode, NULL, 0, 0,
3055 LUSTRE_OPC_ANY, NULL);
3056 if (IS_ERR(op_data))
3057 RETURN(PTR_ERR(op_data));
/* O_CHECK_STALE asks the MDS to verify the FID is still live. */
3059 oit.it_flags |= O_CHECK_STALE;
3060 rc = md_intent_lock(exp, op_data, NULL, 0,
3061 /* we are not interested in name
3064 ll_md_blocking_ast, 0);
3065 ll_finish_md_op_data(op_data);
3066 oit.it_flags &= ~O_CHECK_STALE;
3068 rc = ll_inode_revalidate_fini(inode, rc);
3072 rc = ll_revalidate_it_finish(req, &oit, dentry);
3074 ll_intent_release(&oit);
3078 /* Unlinked? Unhash dentry, so it is not picked up later by
3079 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3080 here to preserve get_cwd functionality on 2.6.
3082 if (!dentry->d_inode->i_nlink) {
3083 spin_lock(&ll_lookup_lock);
3084 spin_lock(&dcache_lock);
3085 ll_drop_dentry(dentry);
3086 spin_unlock(&dcache_lock);
3087 spin_unlock(&ll_lookup_lock);
3090 ll_lookup_finish_locks(&oit, dentry);
/* No ATTRFID support: only go to the wire if our cached attributes
 * are no longer protected by an MDS inodebits lock. */
3091 } else if (!ll_have_md_lock(dentry->d_inode, MDS_INODELOCK_UPDATE |
3092 MDS_INODELOCK_LOOKUP)) {
3093 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3094 obd_valid valid = OBD_MD_FLGETATTR;
3095 struct obd_capa *oc;
/* Regular files also need the striping EA, sized to the MDS max. */
3098 if (S_ISREG(inode->i_mode)) {
3099 rc = ll_get_max_mdsize(sbi, &ealen);
3102 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3104 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
3105 * capa for this inode. Because we only keep capas of dirs
3107 oc = ll_mdscapa_get(inode);
3108 rc = md_getattr(sbi->ll_md_exp, ll_inode2fid(inode), oc, valid,
3112 rc = ll_inode_revalidate_fini(inode, rc);
3116 rc = ll_prep_inode(&inode, req, NULL);
3121 /* if object not yet allocated, don't validate size */
3122 if (ll_i2info(inode)->lli_smd == NULL)
3125 /* ll_glimpse_size will prefer locally cached writes if they extend
3127 rc = ll_glimpse_size(inode, 0);
3130 ptlrpc_req_finished(req);
/*
 * getattr with an explicit lookup intent: revalidate the inode via
 * ll_inode_revalidate_it(), then copy the (now fresh) inode attributes
 * into @stat.  Size/blocks are read under the inode size lock since they
 * can be updated concurrently by glimpse callbacks.
 */
3134 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
3135 struct lookup_intent *it, struct kstat *stat)
3137 struct inode *inode = de->d_inode;
3140 res = ll_inode_revalidate_it(de, it);
3141 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETATTR, 1);
3146 stat->dev = inode->i_sb->s_dev;
3147 stat->ino = inode->i_ino;
3148 stat->mode = inode->i_mode;
3149 stat->nlink = inode->i_nlink;
3150 stat->uid = inode->i_uid;
3151 stat->gid = inode->i_gid;
3152 stat->rdev = kdev_t_to_nr(inode->i_rdev);
3153 stat->atime = inode->i_atime;
3154 stat->mtime = inode->i_mtime;
3155 stat->ctime = inode->i_ctime;
3156 #ifdef HAVE_INODE_BLKSIZE
3157 stat->blksize = inode->i_blksize;
3159 stat->blksize = 1 << inode->i_blkbits;
/* size/blocks may be updated by concurrent glimpses; serialize. */
3162 ll_inode_size_lock(inode, 0);
3163 stat->size = i_size_read(inode);
3164 stat->blocks = inode->i_blocks;
3165 ll_inode_size_unlock(inode, 0);
/*
 * Plain VFS getattr entry point: wraps ll_getattr_it() with a fresh
 * IT_GETATTR intent.
 */
3169 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3171 struct lookup_intent it = { .it_op = IT_GETATTR };
3173 return ll_getattr_it(mnt, de, &it, stat);
/*
 * Check @mask permission against the inode's cached POSIX ACL.
 * The ACL is duplicated under lli_lock so the check itself runs without
 * holding the spinlock.  Compiled out when CONFIG_FS_POSIX_ACL is unset.
 */
3177 int lustre_check_acl(struct inode *inode, int mask)
3179 #ifdef CONFIG_FS_POSIX_ACL
3180 struct ll_inode_info *lli = ll_i2info(inode);
3181 struct posix_acl *acl;
3185 spin_lock(&lli->lli_lock);
3186 acl = posix_acl_dup(lli->lli_posix_acl);
3187 spin_unlock(&lli->lli_lock);
3192 rc = posix_acl_permission(inode, acl, mask);
3193 posix_acl_release(acl);
/*
 * Permission check, two variants selected by kernel version:
 * on >= 2.6.10 we delegate to generic_permission() with lustre_check_acl
 * as the ACL callback; on older kernels the owner/group/other and
 * capability checks are open-coded below.  Remote-client mounts always
 * defer to the server via lustre_check_remote_perm().
 */
3201 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10))
3202 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3204 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
3205 inode->i_ino, inode->i_generation, inode, mask);
3206 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
3207 return lustre_check_remote_perm(inode, mask);
3209 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
3210 return generic_permission(inode, mask, lustre_check_acl);
/* Pre-2.6.10 fallback: open-coded generic_permission equivalent. */
3213 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3215 int mode = inode->i_mode;
3218 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
3219 inode->i_ino, inode->i_generation, inode, mask);
3221 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
3222 return lustre_check_remote_perm(inode, mask);
3224 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
/* Writes are refused on read-only and immutable inodes. */
3226 if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
3227 (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
3229 if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
/* Owner check first; fall through to ACL, group, other, then caps. */
3231 if (current->fsuid == inode->i_uid) {
3234 if (((mode >> 3) & mask & S_IRWXO) != mask)
3236 rc = lustre_check_acl(inode, mask);
3240 goto check_capabilities;
3244 if (in_group_p(inode->i_gid))
3247 if ((mode & mask & S_IRWXO) == mask)
/* CAP_DAC_OVERRIDE never grants exec on non-exec regular files. */
3251 if (!(mask & MAY_EXEC) ||
3252 (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
3253 if (cfs_capable(CFS_CAP_DAC_OVERRIDE))
3256 if (cfs_capable(CFS_CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) ||
3257 (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))))
3264 /* -o localflock - only provides locally consistent flock locks */
/* Default file_operations table: no flock/lock methods, so the kernel's
 * local locking applies (locks are NOT cluster-coherent). */
3265 struct file_operations ll_file_operations = {
3266 .read = ll_file_read,
3267 .write = ll_file_write,
3268 .ioctl = ll_file_ioctl,
3269 .open = ll_file_open,
3270 .release = ll_file_release,
3271 .mmap = ll_file_mmap,
3272 .llseek = ll_file_seek,
3273 .sendfile = ll_file_sendfile,
/* file_operations for -o flock mounts: cluster-coherent locking via
 * ll_file_flock (wired to .flock when the kernel has f_op->flock,
 * otherwise to .lock). */
3277 struct file_operations ll_file_operations_flock = {
3278 .read = ll_file_read,
3279 .write = ll_file_write,
3280 .ioctl = ll_file_ioctl,
3281 .open = ll_file_open,
3282 .release = ll_file_release,
3283 .mmap = ll_file_mmap,
3284 .llseek = ll_file_seek,
3285 .sendfile = ll_file_sendfile,
3287 #ifdef HAVE_F_OP_FLOCK
3288 .flock = ll_file_flock,
3290 .lock = ll_file_flock
3293 /* These are for -o noflock - to return ENOSYS on flock calls */
3294 struct file_operations ll_file_operations_noflock = {
3295 .read = ll_file_read,
3296 .write = ll_file_write,
3297 .ioctl = ll_file_ioctl,
3298 .open = ll_file_open,
3299 .release = ll_file_release,
3300 .mmap = ll_file_mmap,
3301 .llseek = ll_file_seek,
3302 .sendfile = ll_file_sendfile,
3304 #ifdef HAVE_F_OP_FLOCK
3305 .flock = ll_file_noflock,
3307 .lock = ll_file_noflock
/* inode_operations for regular Lustre files; setattr_raw is used only on
 * kernels carrying the VFS intent patches. */
3310 struct inode_operations ll_file_inode_operations = {
3311 #ifdef HAVE_VFS_INTENT_PATCHES
3312 .setattr_raw = ll_setattr_raw,
3314 .setattr = ll_setattr,
3315 .truncate = ll_truncate,
3316 .getattr = ll_getattr,
3317 .permission = ll_inode_permission,
3318 .setxattr = ll_setxattr,
3319 .getxattr = ll_getxattr,
3320 .listxattr = ll_listxattr,
3321 .removexattr = ll_removexattr,
3324 /* dynamic ioctl number support routins */
/* Registry of dynamically-registered ioctl handlers: a list of
 * llioc_data entries guarded by a reader/writer semaphore. */
3325 static struct llioc_ctl_data {
3326 struct rw_semaphore ioc_sem;
3327 struct list_head ioc_head;
3329 __RWSEM_INITIALIZER(llioc.ioc_sem),
3330 CFS_LIST_HEAD_INIT(llioc.ioc_head)
/* One registration: callback plus the list of ioctl numbers it handles
 * (iocd_cmd is a trailing variable-length array; iocd_size is the full
 * allocation size, used when freeing). */
3335 struct list_head iocd_list;
3336 unsigned int iocd_size;
3337 llioc_callback_t iocd_cb;
3338 unsigned int iocd_count;
3339 unsigned int iocd_cmd[0];
/*
 * Register callback @cb for the @count ioctl numbers in @cmd.
 * Returns an opaque handle (the allocation itself) to pass to
 * ll_iocontrol_unregister(), or NULL on bad arguments / allocation
 * failure (return lines not visible in this listing).
 */
3342 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3345 struct llioc_data *in_data = NULL;
3348 if (cb == NULL || cmd == NULL ||
3349 count > LLIOC_MAX_CMD || count < 0)
/* Header plus the trailing variable-length command array. */
3352 size = sizeof(*in_data) + count * sizeof(unsigned int);
3353 OBD_ALLOC(in_data, size);
3354 if (in_data == NULL)
3357 memset(in_data, 0, sizeof(*in_data));
3358 in_data->iocd_size = size;
3359 in_data->iocd_cb = cb;
3360 in_data->iocd_count = count;
3361 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
/* Publish under the write lock. */
3363 down_write(&llioc.ioc_sem);
3364 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3365 up_write(&llioc.ioc_sem);
/*
 * Remove and free a registration previously returned by
 * ll_iocontrol_register().  @magic is matched against the list entries
 * (the comparison line is not visible in this listing); an unknown
 * handle only produces a warning.
 */
3370 void ll_iocontrol_unregister(void *magic)
3372 struct llioc_data *tmp;
3377 down_write(&llioc.ioc_sem);
3378 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3380 unsigned int size = tmp->iocd_size;
3382 list_del(&tmp->iocd_list);
/* Drop the lock before freeing; the entry is already unlinked. */
3383 up_write(&llioc.ioc_sem);
3385 OBD_FREE(tmp, size);
3389 up_write(&llioc.ioc_sem);
3391 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3394 EXPORT_SYMBOL(ll_iocontrol_register);
3395 EXPORT_SYMBOL(ll_iocontrol_unregister);
3397 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
3398 unsigned int cmd, unsigned long arg, int *rcp)
3400 enum llioc_iter ret = LLIOC_CONT;
3401 struct llioc_data *data;
3402 int rc = -EINVAL, i;
3404 down_read(&llioc.ioc_sem);
3405 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3406 for (i = 0; i < data->iocd_count; i++) {
3407 if (cmd != data->iocd_cmd[i])
3410 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3414 if (ret == LLIOC_STOP)
3417 up_read(&llioc.ioc_sem);