1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (c) 2002, 2003 Cluster File Systems, Inc.
5 * Author: Peter Braam <braam@clusterfs.com>
6 * Author: Phil Schwan <phil@clusterfs.com>
7 * Author: Andreas Dilger <adilger@clusterfs.com>
9 * This file is part of Lustre, http://www.lustre.org.
11 * Lustre is free software; you can redistribute it and/or
12 * modify it under the terms of version 2 of the GNU General Public
13 * License as published by the Free Software Foundation.
15 * Lustre is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with Lustre; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 #define DEBUG_SUBSYSTEM S_LLITE
26 #include <lustre_dlm.h>
27 #include <lustre_lite.h>
28 #include <linux/pagemap.h>
29 #include <linux/file.h>
30 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
31 #include <linux/lustre_compat25.h>
33 #include "llite_internal.h"
35 /* also used by llite/special.c:ll_special_open() */
/* Allocate a per-open-file ll_file_data from the dedicated slab cache.
 * Also used by llite/special.c:ll_special_open().
 * NOTE(review): lines are elided in this listing; presumably the function
 * returns fd (possibly after zero/field init) — confirm against full source. */
36 struct ll_file_data *ll_file_data_get(void)
38 struct ll_file_data *fd;
40 OBD_SLAB_ALLOC_PTR(fd, ll_file_data_slab);
/* Release a ll_file_data previously obtained from ll_file_data_get()
 * back to the slab cache. */
44 static void ll_file_data_put(struct ll_file_data *fd)
47 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/* Close one MDS open handle (@och) for @inode: fill an obdo with the inode
 * attributes, send mdc_close() to the MDS, destroy any OST objects recorded
 * in the close reply, and clear the open replay data.
 * NOTE(review): many lines are elided in this listing (obdo allocation,
 * error paths, RETURN) — control flow here is only partial. */
50 static int ll_close_inode_openhandle(struct inode *inode,
51 struct obd_client_handle *och)
53 struct ptlrpc_request *req = NULL;
54 struct obd_device *obd;
59 obd = class_exp2obd(ll_i2mdcexp(inode));
61 CERROR("Invalid MDC connection handle "LPX64"\n",
62 ll_i2mdcexp(inode)->exp_handle.h_cookie);
67 * here we check if this is forced umount. If so this is called on
68 * canceling "open lock" and we do not call mdc_close() in this case, as
69 * it will not be successful, as import is already deactivated.
76 RETURN(-ENOMEM); // XXX We leak openhandle and request here.
78 oa->o_id = inode->i_ino;
79 oa->o_valid = OBD_MD_FLID;
80 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLMODE |
81 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
82 OBD_MD_FLATIME | OBD_MD_FLMTIME |
/* Uncommitted dirty pages exist: tell the MDS so it can ask us to close
 * again once the size/cookies are stable. */
84 if (ll_is_inode_dirty(inode)) {
85 oa->o_flags = MDS_BFLAG_UNCOMMITTED_WRITES;
86 oa->o_valid |= OBD_MD_FLFLAGS;
89 rc = mdc_close(ll_i2mdcexp(inode), oa, och, &req);
91 /* We are the last writer, so the MDS has instructed us to get
92 * the file size and any write cookies, then close again. */
93 ll_queue_done_writing(inode);
96 CERROR("inode %lu mdc close failed: rc = %d\n",
103 rc = ll_objects_destroy(req, inode);
105 CERROR("inode %lu ll_objects destroy: rc = %d\n",
109 ptlrpc_req_finished(req); /* This is close request */
112 mdc_clear_open_replay_data(och);
/* Close the cached MDS open handle of the mode selected by @flags
 * (FMODE_WRITE / FMODE_EXEC / FMODE_READ) for @inode, but only when no
 * other local users of that handle remain (och_usecount).  The actual MDS
 * round-trip is done by ll_close_inode_openhandle().
 * NOTE(review): elided lines include the usecount check body, the och
 * detach under lli_och_sem, and the RETURN — confirm against full source. */
117 int ll_mdc_real_close(struct inode *inode, int flags)
119 struct ll_inode_info *lli = ll_i2info(inode);
121 struct obd_client_handle **och_p;
122 struct obd_client_handle *och;
/* Pick the handle slot and its usecount for the requested open mode. */
127 if (flags & FMODE_WRITE) {
128 och_p = &lli->lli_mds_write_och;
129 och_usecount = &lli->lli_open_fd_write_count;
130 } else if (flags & FMODE_EXEC) {
131 och_p = &lli->lli_mds_exec_och;
132 och_usecount = &lli->lli_open_fd_exec_count;
134 LASSERT(flags & FMODE_READ);
135 och_p = &lli->lli_mds_read_och;
136 och_usecount = &lli->lli_open_fd_read_count;
139 down(&lli->lli_och_sem);
140 if (*och_usecount) { /* There are still users of this handle, so
142 up(&lli->lli_och_sem);
147 up(&lli->lli_och_sem);
149 if (och) { /* There might be a race and somebody have freed this och
151 rc = ll_close_inode_openhandle(inode, och);
/* Poison the cookie so stale uses of this handle are detectable. */
152 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
153 OBD_FREE(och, sizeof *och);
/* Per-file-descriptor close path: drop a group lock if one is held,
 * decrement the per-mode MDS open counter under lli_och_sem, and close the
 * MDS handle (ll_mdc_real_close) only when no cached OPEN DLM lock matches
 * — a matching lock lets us skip talking to the MDS.  Always detaches and
 * frees the ll_file_data.
 * NOTE(review): lines are elided (lockmode selection, ldlm_lock_match result
 * handling, RETURN) — verify details against the full source. */
159 int ll_mdc_close(struct obd_export *mdc_exp, struct inode *inode,
162 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
163 struct ll_inode_info *lli = ll_i2info(inode);
167 /* clear group lock, if present */
168 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
169 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
170 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
171 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP,
175 /* Let's see if we have good enough OPEN lock on the file and if
176 we can skip talking to MDS */
177 if (file->f_dentry->d_inode) { /* Can this ever be false? */
179 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
180 struct lustre_handle lockh;
181 struct inode *inode = file->f_dentry->d_inode;
182 struct ldlm_res_id file_res_id = {.name={inode->i_ino,
183 inode->i_generation}};
184 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
186 down(&lli->lli_och_sem);
187 if (fd->fd_omode & FMODE_WRITE) {
189 LASSERT(lli->lli_open_fd_write_count);
190 lli->lli_open_fd_write_count--;
191 } else if (fd->fd_omode & FMODE_EXEC) {
193 LASSERT(lli->lli_open_fd_exec_count);
194 lli->lli_open_fd_exec_count--;
197 LASSERT(lli->lli_open_fd_read_count);
198 lli->lli_open_fd_read_count--;
200 up(&lli->lli_och_sem);
/* No cached OPEN ibits lock -> must do the real close on the MDS. */
202 if (!ldlm_lock_match(mdc_exp->exp_obd->obd_namespace, flags,
203 &file_res_id, LDLM_IBITS, &policy,lockmode,
205 rc = ll_mdc_real_close(file->f_dentry->d_inode,
209 CERROR("Releasing a file %p with negative dentry %p. Name %s",
210 file, file->f_dentry, file->f_dentry->d_name.name);
213 LUSTRE_FPRIVATE(file) = NULL;
214 ll_file_data_put(fd);
219 int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm);
221 /* While this returns an error code, fput() the caller does not, so we need
222 * to make every effort to clean up all of our state here. Also, applications
223 * rarely check close errors and even if an error is returned they will not
224 * re-try the close call.
/* VFS ->release() for regular files and directories: stop statahead for
 * directories, skip the root dentry entirely, fold any pending async write
 * error into the close, and hand the MDS close off to ll_mdc_close().
 * Per the comment above: the kernel's fput() ignores our return code, so
 * all state must be cleaned up here regardless of errors. */
226 int ll_file_release(struct inode *inode, struct file *file)
228 struct ll_file_data *fd;
229 struct ll_sb_info *sbi = ll_i2sbi(inode);
230 struct ll_inode_info *lli = ll_i2info(inode);
231 struct lov_stripe_md *lsm = lli->lli_smd;
235 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
236 inode->i_generation, inode);
238 if (S_ISDIR(inode->i_mode))
239 ll_stop_statahead(inode);
241 /* don't do anything for / */
242 if (inode->i_sb->s_root == file->f_dentry)
245 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
246 fd = LUSTRE_FPRIVATE(file);
/* Pick up any async write error recorded against the stripes so the
 * close can report it; then reset for future opens. */
250 lov_test_and_clear_async_rc(lsm);
251 lli->lli_async_rc = 0;
253 rc = ll_mdc_close(sbi->ll_mdc_exp, inode, file);
/* Re-do the OPEN intent against the MDS for an already-instantiated dentry
 * (NFSD path, patchless-kernel path, or a cached handle that disappeared
 * between revalidate and open).  Requests an OPEN lock unless @lmm/@lmmsize
 * indicate this is just a setstripe call.
 * NOTE(review): elided lines include ENTRY/variable decls, error unwinding
 * between the labelled exits, and RETURN — confirm against full source. */
257 static int ll_intent_file_open(struct file *file, void *lmm,
258 int lmmsize, struct lookup_intent *itp)
260 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
261 struct mdc_op_data data;
262 struct dentry *parent = file->f_dentry->d_parent;
263 const char *name = file->f_dentry->d_name.name;
264 const int len = file->f_dentry->d_name.len;
265 struct inode *inode = file->f_dentry->d_inode;
266 struct ptlrpc_request *req;
273 ll_prepare_mdc_op_data(&data, parent->d_inode, inode,
274 name, len, O_RDWR, NULL);
276 /* Usually we come here only for NFSD, and we want open lock.
277 But we can also get here with pre 2.6.15 patchless kernels, and in
278 that case that lock is also ok */
279 /* We can also get here if there was cached open handle in revalidate_it
280 * but it disappeared while we were getting from there to ll_file_open.
281 * But this means this file was closed and immediately opened which
282 * makes a good candidate for using OPEN lock */
283 /* If lmmsize & lmm are not 0, we are just setting stripe info
284 * parameters. No need for the open lock */
285 if (!lmm && !lmmsize)
286 itp->it_flags |= MDS_OPEN_LOCK;
288 rc = mdc_intent_lock(sbi->ll_mdc_exp, &data, lmm, lmmsize, itp,
289 0 /*unused */, &req, ll_mdc_blocking_ast, 0);
291 /* reason for keep own exit path - don`t flood log
292 * with messages with -ESTALE errors.
/* Open raced/failed on the MDS side: release any handle we were granted. */
294 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
295 it_open_error(DISP_OPEN_OPEN, itp))
297 ll_release_openhandle(file->f_dentry, itp);
301 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
302 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
303 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
307 if (itp->d.lustre.it_lock_mode)
308 mdc_set_lock_data(&itp->d.lustre.it_lock_handle,
311 rc = ll_prep_inode(sbi->ll_osc_exp, &file->f_dentry->d_inode,
312 req, DLM_REPLY_REC_OFF, NULL);
314 ptlrpc_req_finished(itp->d.lustre.it_data);
317 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
318 ll_intent_drop_lock(itp);
/* Populate an obd_client_handle from the MDS reply carried in @it:
 * copy the open file handle, stamp the magic, record the IO epoch on the
 * inode, and register the handle for open replay. */
324 static void ll_och_fill(struct ll_inode_info *lli, struct lookup_intent *it,
325 struct obd_client_handle *och)
327 struct ptlrpc_request *req = it->d.lustre.it_data;
328 struct mds_body *body;
332 body = lustre_msg_buf(req->rq_repmsg, DLM_REPLY_REC_OFF, sizeof(*body));
333 LASSERT(body != NULL); /* reply already checked out */
334 /* and swabbed in mdc_enqueue */
335 LASSERT(lustre_rep_swabbed(req, DLM_REPLY_REC_OFF));
337 memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
338 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
339 lli->lli_io_epoch = body->io_epoch;
341 mdc_set_open_replay_data(och, it->d.lustre.it_data);
/* Complete the client-local part of an open: optionally fill @och from the
 * intent, attach @fd as the file's private data, initialize readahead
 * state, and remember the open mode for the matching close.
 * NOTE(review): elided lines likely guard the ll_och_fill() call on
 * och != NULL and RETURN 0 — confirm against full source. */
344 int ll_local_open(struct file *file, struct lookup_intent *it,
345 struct ll_file_data *fd, struct obd_client_handle *och)
349 LASSERT(!LUSTRE_FPRIVATE(file));
354 ll_och_fill(ll_i2info(file->f_dentry->d_inode), it, och);
355 LUSTRE_FPRIVATE(file) = fd;
356 ll_readahead_init(file->f_dentry->d_inode, &fd->fd_ras);
357 fd->fd_omode = it->it_flags;
362 /* Open a file, and (for the very first open) create objects on the OSTs at
363 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
364 * creation or open until ll_lov_setstripe() ioctl is called. We grab
365 * lli_open_sem to ensure no other process will create objects, send the
366 * stripe MD to the MDS, or try to destroy the objects if that fails.
368 * If we already have the stripe MD locally then we don't request it in
369 * mdc_open(), by passing a lmm_size = 0.
371 * It is up to the application to ensure no other processes open this file
372 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
373 * used. We might be able to avoid races of that sort by getting lli_open_sem
374 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
375 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/* VFS ->open() entry point.  Either reuses an intent left by the lookup
 * path (HAVE_VFS_INTENT_PATCHES) or builds a fresh IT_OPEN intent from
 * f_flags, then, under lli_och_sem, either shares an already-cached MDS
 * open handle of the matching mode or performs a new intent open via
 * ll_intent_file_open() and caches the resulting handle.
 * NOTE(review): this listing elides substantial code (the non-patched
 * branch, GOTO targets out_och_free/out_openerr bodies, O_LOV_DELAY_CREATE
 * handling, RETURN) — treat the flow annotations below as partial. */
377 int ll_file_open(struct inode *inode, struct file *file)
379 struct ll_inode_info *lli = ll_i2info(inode);
380 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
381 .it_flags = file->f_flags };
382 struct lov_stripe_md *lsm;
383 struct ptlrpc_request *req = NULL;
384 struct obd_client_handle **och_p;
386 struct ll_file_data *fd;
390 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
391 inode->i_generation, inode, file->f_flags);
/* Remember who opened the directory first, for statahead ownership. */
393 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_pid == 0)
394 lli->lli_opendir_pid = current->pid;
396 /* don't do anything for / */
397 if (inode->i_sb->s_root == file->f_dentry)
400 #ifdef HAVE_VFS_INTENT_PATCHES
403 it = file->private_data; /* XXX: compat macro */
404 file->private_data = NULL; /* prevent ll_local_open assertion */
407 fd = ll_file_data_get();
409 lli->lli_opendir_pid = 0;
/* No usable intent from lookup: build our own access mode/flags. */
412 if (!it || !it->d.lustre.it_disposition) {
413 /* Convert f_flags into access mode. We cannot use file->f_mode,
414 * because everything but O_ACCMODE mask was stripped from it */
415 if ((oit.it_flags + 1) & O_ACCMODE)
417 if (file->f_flags & O_TRUNC)
418 oit.it_flags |= FMODE_WRITE;
420 /* kernel only call f_op->open in dentry_open. filp_open calls
421 * dentry_open after call to open_namei that checks permissions.
422 * Only nfsd_open call dentry_open directly without checking
423 * permissions and because of that this code below is safe. */
424 if (oit.it_flags & FMODE_WRITE)
425 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
427 /* We do not want O_EXCL here, presumably we opened the file
428 * already? XXX - NFS implications? */
429 oit.it_flags &= ~O_EXCL;
435 /* Let's see if we have file open on MDS already. */
436 if (it->it_flags & FMODE_WRITE) {
437 och_p = &lli->lli_mds_write_och;
438 och_usecount = &lli->lli_open_fd_write_count;
439 } else if (it->it_flags & FMODE_EXEC) {
440 och_p = &lli->lli_mds_exec_och;
441 och_usecount = &lli->lli_open_fd_exec_count;
443 och_p = &lli->lli_mds_read_och;
444 och_usecount = &lli->lli_open_fd_read_count;
447 LASSERTF(it->it_flags != 0, "it %p dist %d \n", it,
448 it->d.lustre.it_disposition);
450 down(&lli->lli_och_sem);
451 if (*och_p) { /* Open handle is present */
452 if (it_disposition(it, DISP_OPEN_OPEN)) {
453 /* Well, there's extra open request that we do not need,
454 let's close it somehow. This will decref request. */
455 rc = it_open_error(DISP_OPEN_OPEN, it);
457 ll_file_data_put(fd);
458 GOTO(out_och_free, rc);
460 ll_release_openhandle(file->f_dentry, it);
461 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats,
/* Reuse the cached handle: och == NULL tells ll_local_open not to fill. */
466 rc = ll_local_open(file, it, fd, NULL);
468 LASSERTF(rc == 0, "rc = %d\n", rc);
470 LASSERT(*och_usecount == 0);
471 if (!it->d.lustre.it_disposition) {
472 /* We cannot just request lock handle now, new ELC code
473 means that one of other OPEN locks for this file
474 could be cancelled, and since blocking ast handler
475 would attempt to grab och_sem as well, that would
476 result in a deadlock */
477 up(&lli->lli_och_sem);
478 rc = ll_intent_file_open(file, NULL, 0, it);
480 ll_file_data_put(fd);
481 GOTO(out_openerr, rc);
484 /* Got some error? Release the request */
485 if (it->d.lustre.it_status < 0) {
486 req = it->d.lustre.it_data;
487 ptlrpc_req_finished(req);
489 mdc_set_lock_data(&it->d.lustre.it_lock_handle,
490 file->f_dentry->d_inode);
494 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
496 ll_file_data_put(fd);
497 GOTO(out_och_free, rc = -ENOMEM);
500 req = it->d.lustre.it_data;
502 /* mdc_intent_lock() didn't get a request ref if there was an
503 * open error, so don't do cleanup on the request here
505 /* XXX (green): Should not we bail out on any error here, not
506 * just open error? */
507 rc = it_open_error(DISP_OPEN_OPEN, it);
509 ll_file_data_put(fd);
510 GOTO(out_och_free, rc);
513 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
514 rc = ll_local_open(file, it, fd, *och_p);
515 LASSERTF(rc == 0, "rc = %d\n", rc);
517 up(&lli->lli_och_sem);
519 /* Must do this outside lli_och_sem lock to prevent deadlock where
520 different kind of OPEN lock for this same inode gets cancelled
521 by ldlm_cancel_lru */
522 if (!S_ISREG(inode->i_mode))
527 if (file->f_flags & O_LOV_DELAY_CREATE ||
528 !(file->f_mode & FMODE_WRITE)) {
529 CDEBUG(D_INODE, "object creation was delayed\n");
533 file->f_flags &= ~O_LOV_DELAY_CREATE;
536 ptlrpc_req_finished(req);
538 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
540 ll_open_complete(inode);
/* out_och_free path: undo the allocated (but unused) handle slot. */
544 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
545 *och_p = NULL; /* OBD_FREE writes some magic there */
548 up(&lli->lli_och_sem);
550 lli->lli_opendir_pid = 0;
555 /* Fills the obdo with the attributes for the inode defined by lsm */
/* Fill @oa with the OST attributes (size, blocks, times, ...) for the file
 * described by @lsm, by issuing an async getattr over a ptlrpc set and
 * waiting for it.  On success only the OST-authoritative bits are kept in
 * o_valid.
 * NOTE(review): elided lines include oinfo setup (oi_md/oi_oa), the set
 * allocation failure branch, and RETURN — confirm against full source. */
556 int ll_lsm_getattr(struct obd_export *exp, struct lov_stripe_md *lsm,
559 struct ptlrpc_request_set *set;
560 struct obd_info oinfo = { { { 0 } } };
564 LASSERT(lsm != NULL);
566 memset(oa, 0, sizeof *oa);
569 oa->o_id = lsm->lsm_object_id;
570 oa->o_mode = S_IFREG;
571 oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLSIZE |
572 OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ | OBD_MD_FLMTIME |
575 set = ptlrpc_prep_set();
579 rc = obd_getattr_async(exp, &oinfo, set);
581 rc = ptlrpc_set_wait(set);
582 ptlrpc_set_destroy(set);
/* Keep only the attributes the OSTs are authoritative for. */
587 oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ | OBD_MD_FLMTIME |
588 OBD_MD_FLCTIME | OBD_MD_FLSIZE);
/* Strip setuid (and setgid-if-group-exec) bits from @inode on write by an
 * unprivileged process, mirroring the kernel's remove_suid() behaviour. */
592 static inline void ll_remove_suid(struct inode *inode)
596 /* set S_ISGID if S_IXGRP is set, and always set S_ISUID */
597 mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
599 /* was any of the uid bits set? */
600 mode &= inode->i_mode;
601 if (mode && !capable(CAP_FSETID)) {
602 inode->i_mode &= ~mode;
603 // XXX careful here - we cannot change the size
/* Map an extent DLM @lock back to the stripe index it covers within
 * @inode's lov_stripe_md, via obd_get_info("lock_to_stripe"), then sanity
 * check that the lock's resource really names that stripe's object.
 * Returns the stripe index, or a negative error.
 * NOTE(review): elided lines include the check: label and RETURN(stripe). */
607 static int ll_lock_to_stripe_offset(struct inode *inode, struct ldlm_lock *lock)
609 struct ll_inode_info *lli = ll_i2info(inode);
610 struct lov_stripe_md *lsm = lli->lli_smd;
611 struct obd_export *exp = ll_i2obdexp(inode);
614 struct ldlm_lock *lock;
615 struct lov_stripe_md *lsm;
616 } key = { .name = "lock_to_stripe", .lock = lock, .lsm = lsm };
617 __u32 stripe, vallen = sizeof(stripe);
/* Single-striped file: the answer is trivially stripe 0. */
621 if (lsm->lsm_stripe_count == 1)
622 GOTO(check, stripe = 0);
624 /* get our offset in the lov */
625 rc = obd_get_info(exp, sizeof(key), &key, &vallen, &stripe);
627 CERROR("obd_get_info: rc = %d\n", rc);
630 LASSERT(stripe < lsm->lsm_stripe_count);
/* Cross-check: the DLM resource name must match the stripe's object. */
633 if (lsm->lsm_oinfo[stripe]->loi_id != lock->l_resource->lr_name.name[0]||
634 lsm->lsm_oinfo[stripe]->loi_gr != lock->l_resource->lr_name.name[1]){
635 LDLM_ERROR(lock, "resource doesn't match object "LPU64"/"LPU64,
636 lsm->lsm_oinfo[stripe]->loi_id,
637 lsm->lsm_oinfo[stripe]->loi_gr);
638 RETURN(-ELDLM_NO_LOCK_DATA);
/* Get extra page reference to ensure it is not going away while the
 * extent-cancel machinery (ll_page_removal_cb) works on it. */
645 void ll_pin_extent_cb(void *data)
647 struct page *page = data;
649 page_cache_get(page);
653 /* Flush the page from page cache for an extent as its canceled.
654 * Page to remove is delivered as @data.
656 * No one can dirty the extent until we've finished our work and they cannot
657 * enqueue another lock. The DLM protects us from ll_file_read/write here,
658 * but other kernel actors could have pages locked.
660 * If @discard is set, there is no need to write the page if it is dirty.
662 * Called with the DLM lock held. */
/* Remove @data (a struct page, pinned by ll_pin_extent_cb) from the page
 * cache because its covering extent lock is being cancelled: tear down any
 * mmaps over it, write it back unless @discard, then truncate it out and
 * drop our extra reference.  Called with the DLM lock held.
 * NOTE(review): elided lines include lock_page()/ENTRY, the truncated-page
 * early exit, unlock, and RETURN — confirm against full source. */
663 int ll_page_removal_cb(void *data, int discard)
666 struct page *page = data;
667 struct address_space *mapping;
671 /* We have page reference already from ll_pin_page */
674 /* Already truncated by somebody */
678 mapping = page->mapping;
/* Invalidate user mappings over exactly this page's file range. */
680 ll_teardown_mmaps(mapping,
681 (__u64)page->index << PAGE_CACHE_SHIFT,
682 ((__u64)page->index<<PAGE_CACHE_SHIFT)|
684 LL_CDEBUG_PAGE(D_PAGE, page, "removing page\n");
686 if (!discard && clear_page_dirty_for_io(page)) {
687 LASSERT(page->mapping);
688 rc = ll_call_writepage(page->mapping->host, page);
689 /* either waiting for io to complete or reacquiring
690 * the lock that the failed writepage released */
692 wait_on_page_writeback(page);
694 CERROR("writepage inode %lu(%p) of page %p "
695 "failed: %d\n", mapping->host->i_ino,
696 mapping->host, page, rc);
697 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
/* Record the writeback error on the mapping, kernel-version dependent. */
699 set_bit(AS_ENOSPC, &mapping->flags);
701 set_bit(AS_EIO, &mapping->flags);
703 mapping->gfp_mask |= AS_EIO_MASK;
707 if (page->mapping != NULL) {
708 struct ll_async_page *llap = llap_cast_private(page);
709 // checking again to account for writeback's lock_page()
710 LL_CDEBUG_PAGE(D_PAGE, page, "truncating\n");
712 ll_ra_accounting(llap, page->mapping);
713 ll_truncate_complete_page(page);
717 LASSERT(!PageWriteback(page));
/* Drop the reference taken in ll_pin_extent_cb. */
719 page_cache_release(page);
/* DLM blocking/cancel callback for a file extent lock: on cancel, shrink
 * the known-minimum-size (kms) of the affected stripe to exclude the
 * extent being cancelled, then let any pending done-writing proceed.
 * NOTE(review): elided lines include the LDLM_CB_BLOCKING branch, iput,
 * and RETURN — confirm against full source. */
724 int ll_extent_lock_cancel_cb(struct ldlm_lock *lock, struct ldlm_lock_desc *new,
725 void *data, int flag)
728 struct ll_inode_info *lli;
729 struct lov_stripe_md *lsm;
/* Guard against a bogus small-integer cookie masquerading as a pointer. */
735 if ((unsigned long)data > 0 && (unsigned long)data < 0x1000) {
736 LDLM_ERROR(lock, "cancelling lock with bad data %p", data);
740 inode = ll_inode_from_lock(lock);
743 lli = ll_i2info(inode);
746 if (lli->lli_smd == NULL)
750 stripe = ll_lock_to_stripe_offset(inode, lock);
/* Recompute the stripe's kms with this lock's extent removed, under both
 * the lov stripe lock and the DLM resource lock. */
754 lov_stripe_lock(lsm);
755 lock_res_and_lock(lock);
756 kms = ldlm_extent_shift_kms(lock,
757 lsm->lsm_oinfo[stripe]->loi_kms);
759 if (lsm->lsm_oinfo[stripe]->loi_kms != kms)
760 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
761 lsm->lsm_oinfo[stripe]->loi_kms, kms);
762 lsm->lsm_oinfo[stripe]->loi_kms = kms;
763 unlock_res_and_lock(lock);
764 lov_stripe_unlock(lsm);
765 ll_try_done_writing(inode);
/* Completion AST for client-side async extent enqueues: blocked states are
 * not expected (LBUG), otherwise absorb the lock's LVB size into the
 * stripe's rss/kms, wake waiters, and drop the PR reference.
 * NOTE(review): this function accesses lsm_oinfo[stripe] with '.' while
 * the rest of the file uses '->' — likely version skew in the original
 * tree; cannot confirm from this listing.  Elided lines include iput and
 * RETURN. */
774 int ll_async_completion_ast(struct ldlm_lock *lock, int flags, void *data)
776 /* XXX ALLOCATE - 160 bytes */
777 struct inode *inode = ll_inode_from_lock(lock);
778 struct ll_inode_info *lli = ll_i2info(inode);
779 struct lustre_handle lockh = { 0 };
784 if (flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED |
785 LDLM_FL_BLOCK_CONV)) {
786 LBUG(); /* not expecting any blocked async locks yet */
787 LDLM_DEBUG(lock, "client-side async enqueue returned a blocked "
789 ldlm_lock_dump(D_OTHER, lock, 0);
790 ldlm_reprocess_all(lock->l_resource);
794 LDLM_DEBUG(lock, "client-side async enqueue: granted/glimpsed");
796 stripe = ll_lock_to_stripe_offset(inode, lock);
/* The server sent an LVB: fold its size into rss and recompute kms. */
800 if (lock->l_lvb_len) {
801 struct lov_stripe_md *lsm = lli->lli_smd;
803 lvb = lock->l_lvb_data;
804 lsm->lsm_oinfo[stripe].loi_rss = lvb->lvb_size;
806 lock_res_and_lock(lock);
807 ll_inode_size_lock(inode, 1);
808 kms = MAX(lsm->lsm_oinfo[stripe].loi_kms, lvb->lvb_size);
809 kms = ldlm_extent_shift_kms(NULL, kms);
810 if (lsm->lsm_oinfo[stripe].loi_kms != kms)
811 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
812 lsm->lsm_oinfo[stripe].loi_kms, kms);
813 lsm->lsm_oinfo[stripe].loi_kms = kms;
814 ll_inode_size_unlock(inode, 1);
815 unlock_res_and_lock(lock);
820 wake_up(&lock->l_waitq);
822 ldlm_lock2handle(lock, &lockh);
823 ldlm_lock_decref(&lockh, LCK_PR);
/* Glimpse AST: another client asked for this file's size.  Pack a reply
 * containing this client's view (stripe kms plus inode times) into @reqp.
 * -ELDLM_NO_LOCK_DATA races are normal and answered with an empty reply
 * rather than ptlrpc_error() to avoid console noise.
 * NOTE(review): elided lines include the iput/out labels and RETURN. */
828 static int ll_glimpse_callback(struct ldlm_lock *lock, void *reqp)
830 struct ptlrpc_request *req = reqp;
831 struct inode *inode = ll_inode_from_lock(lock);
832 struct ll_inode_info *lli;
833 struct lov_stripe_md *lsm;
836 int size[2] = { sizeof(struct ptlrpc_body), sizeof(*lvb) };
840 GOTO(out, rc = -ELDLM_NO_LOCK_DATA);
841 lli = ll_i2info(inode);
843 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
846 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
848 /* First, find out which stripe index this lock corresponds to. */
849 stripe = ll_lock_to_stripe_offset(inode, lock);
851 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
853 rc = lustre_pack_reply(req, 2, size, NULL);
/* Report the stripe's kms as the size; times come from the inode. */
857 lvb = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, sizeof(*lvb));
858 lvb->lvb_size = lli->lli_smd->lsm_oinfo[stripe]->loi_kms;
859 lvb->lvb_mtime = LTIME_S(inode->i_mtime);
860 lvb->lvb_atime = LTIME_S(inode->i_atime);
861 lvb->lvb_ctime = LTIME_S(inode->i_ctime);
863 LDLM_DEBUG(lock, "i_size: %llu -> stripe number %u -> kms "LPU64
864 " atime "LPU64", mtime "LPU64", ctime "LPU64,
865 i_size_read(inode), stripe, lvb->lvb_size, lvb->lvb_mtime,
866 lvb->lvb_atime, lvb->lvb_ctime);
871 /* These errors are normal races, so we don't want to fill the console
872 * with messages by calling ptlrpc_error() */
873 if (rc == -ELDLM_NO_LOCK_DATA)
874 lustre_pack_reply(req, 1, NULL, NULL);
/* Glimpse a file's size/blocks/times for an ioctl caller: issue an intent
 * (LDLM_FL_HAS_INTENT) PR extent enqueue over all stripes, merge the
 * returned LVBs under the stripe lock, and copy the result into @st.
 * NOTE(review): elided lines include the function's @st parameter
 * declaration, policy.l_extent.start setup, the ENQUEUED early return, and
 * RETURN — confirm against full source. */
880 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
883 struct lustre_handle lockh = { 0 };
884 struct ldlm_enqueue_info einfo = { 0 };
885 struct obd_info oinfo = { { { 0 } } };
891 einfo.ei_type = LDLM_EXTENT;
892 einfo.ei_mode = LCK_PR;
893 einfo.ei_cb_bl = osc_extent_blocking_cb;
894 einfo.ei_cb_cp = ldlm_completion_ast;
895 einfo.ei_cb_gl = ll_glimpse_callback;
896 einfo.ei_cbdata = NULL;
898 oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
899 oinfo.oi_lockh = &lockh;
901 oinfo.oi_flags = LDLM_FL_HAS_INTENT;
903 rc = obd_enqueue_rqset(sbi->ll_osc_exp, &oinfo, &einfo);
907 CERROR("obd_enqueue returned rc %d, "
908 "returning -EIO\n", rc);
909 RETURN(rc > 0 ? -EIO : rc);
/* Merge per-stripe LVBs into one coherent view under the stripe lock. */
912 lov_stripe_lock(lsm);
913 memset(&lvb, 0, sizeof(lvb));
914 obd_merge_lvb(sbi->ll_osc_exp, lsm, &lvb, 0);
915 st->st_size = lvb.lvb_size;
916 st->st_blocks = lvb.lvb_blocks;
917 st->st_mtime = lvb.lvb_mtime;
918 st->st_atime = lvb.lvb_atime;
919 st->st_ctime = lvb.lvb_ctime;
920 lov_stripe_unlock(lsm);
925 /* NB: obd_merge_lvb will prefer locally cached writes if they extend the
926 * file (because it prefers KMS over RSS when larger) */
/* Refresh @inode's size/blocks/times from the OSTs via a glimpse enqueue
 * (intent-only: ll_glimpse_callback runs on lock holders instead of
 * revoking their locks), then merge LVBs into the inode under
 * ll_inode_size_lock.  obd_merge_lvb prefers kms over rss when larger, so
 * locally cached writes that extend the file are reflected.
 * NOTE(review): elided lines include the no-objects early return, the
 * ENQUEUED/match early return, and RETURN — confirm against full source. */
927 int ll_glimpse_size(struct inode *inode, int ast_flags)
929 struct ll_inode_info *lli = ll_i2info(inode);
930 struct ll_sb_info *sbi = ll_i2sbi(inode);
931 struct lustre_handle lockh = { 0 };
932 struct ldlm_enqueue_info einfo = { 0 };
933 struct obd_info oinfo = { { { 0 } } };
938 CDEBUG(D_DLMTRACE, "Glimpsing inode %lu\n", inode->i_ino);
941 CDEBUG(D_DLMTRACE, "No objects for inode %lu\n", inode->i_ino);
945 /* NOTE: this looks like DLM lock request, but it may not be one. Due
946 * to LDLM_FL_HAS_INTENT flag, this is glimpse request, that
947 * won't revoke any conflicting DLM locks held. Instead,
948 * ll_glimpse_callback() will be called on each client
949 * holding a DLM lock against this file, and resulting size
950 * will be returned for each stripe. DLM lock on [0, EOF] is
951 * acquired only if there were no conflicting locks. */
952 einfo.ei_type = LDLM_EXTENT;
953 einfo.ei_mode = LCK_PR;
954 einfo.ei_cb_bl = osc_extent_blocking_cb;
955 einfo.ei_cb_cp = ldlm_completion_ast;
956 einfo.ei_cb_gl = ll_glimpse_callback;
957 einfo.ei_cbdata = inode;
959 oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
960 oinfo.oi_lockh = &lockh;
961 oinfo.oi_md = lli->lli_smd;
962 oinfo.oi_flags = ast_flags | LDLM_FL_HAS_INTENT;
964 rc = obd_enqueue_rqset(sbi->ll_osc_exp, &oinfo, &einfo);
968 CERROR("obd_enqueue returned rc %d, returning -EIO\n", rc);
969 RETURN(rc > 0 ? -EIO : rc);
/* Publish merged size/blocks/times into the inode atomically. */
972 ll_inode_size_lock(inode, 1);
973 inode_init_lvb(inode, &lvb);
974 rc = obd_merge_lvb(sbi->ll_osc_exp, lli->lli_smd, &lvb, 0);
975 i_size_write(inode, lvb.lvb_size);
976 inode->i_blocks = lvb.lvb_blocks;
977 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
978 LTIME_S(inode->i_atime) = lvb.lvb_atime;
979 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
980 ll_inode_size_unlock(inode, 1);
982 CDEBUG(D_DLMTRACE, "glimpse: size: %llu, blocks: %llu\n",
983 i_size_read(inode), (long long)inode->i_blocks);
/* Acquire an extent DLM lock of @mode on @policy's range for @inode's
 * stripes, then update the inode's cached attributes from the merged LVBs.
 * i_size is only updated for a full-file [0, EOF] lock — see the inline
 * comment about the vmtruncate()/kms ordering.  Lock acquisition is
 * skipped entirely for IGNORE_LOCK fds and NOLCK mounts.
 * NOTE(review): elided lines include the ast_flags parameter declaration,
 * the NOLCK early RETURN, the enqueue-failure path, and final RETURN. */
988 int ll_extent_lock(struct ll_file_data *fd, struct inode *inode,
989 struct lov_stripe_md *lsm, int mode,
990 ldlm_policy_data_t *policy, struct lustre_handle *lockh,
993 struct ll_sb_info *sbi = ll_i2sbi(inode);
995 struct ldlm_enqueue_info einfo = { 0 };
996 struct obd_info oinfo = { { { 0 } } };
1000 LASSERT(!lustre_handle_is_used(lockh));
1001 LASSERT(lsm != NULL);
1003 /* don't drop the mmapped file to LRU */
1004 if (mapping_mapped(inode->i_mapping))
1005 ast_flags |= LDLM_FL_NO_LRU;
1007 /* XXX phil: can we do this? won't it screw the file size up? */
1008 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1009 (sbi->ll_flags & LL_SBI_NOLCK))
1012 CDEBUG(D_DLMTRACE, "Locking inode %lu, start "LPU64" end "LPU64"\n",
1013 inode->i_ino, policy->l_extent.start, policy->l_extent.end);
1015 einfo.ei_type = LDLM_EXTENT;
1016 einfo.ei_mode = mode;
1017 einfo.ei_cb_bl = osc_extent_blocking_cb;
1018 einfo.ei_cb_cp = ldlm_completion_ast;
1019 einfo.ei_cb_gl = ll_glimpse_callback;
1020 einfo.ei_cbdata = inode;
1022 oinfo.oi_policy = *policy;
1023 oinfo.oi_lockh = lockh;
1025 oinfo.oi_flags = ast_flags;
1027 rc = obd_enqueue(sbi->ll_osc_exp, &oinfo, &einfo, NULL);
/* The enqueue may have shrunk/changed the granted extent. */
1028 *policy = oinfo.oi_policy;
1032 ll_inode_size_lock(inode, 1);
1033 inode_init_lvb(inode, &lvb);
1034 obd_merge_lvb(sbi->ll_osc_exp, lsm, &lvb, 1);
1036 if (policy->l_extent.start == 0 &&
1037 policy->l_extent.end == OBD_OBJECT_EOF) {
1038 /* vmtruncate()->ll_truncate() first sets the i_size and then
1039 * the kms under both a DLM lock and the
1040 * ll_inode_size_lock(). If we don't get the
1041 * ll_inode_size_lock() here we can match the DLM lock and
1042 * reset i_size from the kms before the truncating path has
1043 * updated the kms. generic_file_write can then trust the
1044 * stale i_size when doing appending writes and effectively
1045 * cancel the result of the truncate. Getting the
1046 * ll_inode_size_lock() after the enqueue maintains the DLM
1047 * -> ll_inode_size_lock() acquiring order. */
1048 i_size_write(inode, lvb.lvb_size);
1049 CDEBUG(D_INODE, "inode=%lu, updating i_size %llu\n",
1050 inode->i_ino, i_size_read(inode));
1054 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1055 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1056 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1058 ll_inode_size_unlock(inode, 1);
/* Release an extent DLM lock previously taken by ll_extent_lock().
 * Mirrors its no-op condition: IGNORE_LOCK fds and NOLCK mounts never
 * held a lock, so there is nothing to cancel. */
1063 int ll_extent_unlock(struct ll_file_data *fd, struct inode *inode,
1064 struct lov_stripe_md *lsm, int mode,
1065 struct lustre_handle *lockh)
1067 struct ll_sb_info *sbi = ll_i2sbi(inode);
1071 /* XXX phil: can we do this? won't it screw the file size up? */
1072 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1073 (sbi->ll_flags & LL_SBI_NOLCK))
1076 rc = obd_cancel(sbi->ll_osc_exp, lsm, mode, lockh);
/* Mark @inode as lock-contended and timestamp the event; readers use
 * lli_contention_time to decide when the contended state expires
 * (see ll_is_file_contended). */
1081 static void ll_set_file_contended(struct inode *inode)
1083 struct ll_inode_info *lli = ll_i2info(inode);
1085 lli->lli_contention_time = cfs_time_current();
1086 set_bit(LLI_F_CONTENDED, &lli->lli_flags);
/* Clear the lock-contention flag on @inode set by ll_set_file_contended. */
1089 void ll_clear_file_contended(struct inode *inode)
1091 struct ll_inode_info *lli = ll_i2info(inode);
1093 clear_bit(LLI_F_CONTENDED, &lli->lli_flags);
/* Decide whether @file's inode is currently lock-contended: requires the
 * server to support OBD_CONNECT_SRVLOCK, never applies to IGNORE_LOCK fds,
 * and the contended state self-expires after sbi->ll_contention_time
 * seconds.
 * NOTE(review): elided lines include the RETURN statements for each
 * branch — the exact 0/1 returns cannot be confirmed from this listing. */
1096 static int ll_is_file_contended(struct file *file)
1098 struct inode *inode = file->f_dentry->d_inode;
1099 struct ll_inode_info *lli = ll_i2info(inode);
1100 struct ll_sb_info *sbi = ll_i2sbi(inode);
1101 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1104 if (!(sbi->ll_lco.lco_flags & OBD_CONNECT_SRVLOCK)) {
1105 CDEBUG(D_INFO, "the server does not support SRVLOCK feature,"
1106 " osc connect flags = 0x"LPX64"\n",
1107 sbi->ll_lco.lco_flags);
1110 if (fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK))
1112 if (test_bit(LLI_F_CONTENDED, &lli->lli_flags)) {
1113 cfs_time_t cur_time = cfs_time_current();
1114 cfs_time_t retry_time;
1116 retry_time = cfs_time_add(
1117 lli->lli_contention_time,
1118 cfs_time_seconds(sbi->ll_contention_time));
/* Contention window elapsed: reset and treat as uncontended. */
1119 if (cfs_time_after(cur_time, retry_time)) {
1120 ll_clear_file_contended(inode);
/* Take the interval-tree extent lock covering [start, end] for a read or
 * write.  Appending writes always take the lock; otherwise a contended
 * file skips DLM locking (server-side locking instead).  -EUSERS from
 * ll_tree_lock marks the file contended for subsequent calls.
 * Returns 1 if the tree lock was taken, 0 if skipped, negative on error.
 * NOTE(review): elided lines include node error handling and the
 * rc handling between lines 1153-1157 — confirm against full source. */
1128 static int ll_file_get_tree_lock(struct ll_lock_tree *tree, struct file *file,
1129 const char *buf, size_t count,
1130 loff_t start, loff_t end, int rw)
1133 int tree_locked = 0;
1135 struct inode * inode = file->f_dentry->d_inode;
1137 append = (rw == WRITE) && (file->f_flags & O_APPEND);
1139 if (append || !ll_is_file_contended(file)) {
1140 struct ll_lock_tree_node *node;
/* Append must block for the lock; otherwise bail out on contention. */
1143 ast_flags = append ? 0 : LDLM_FL_DENY_ON_CONTENTION;
1144 if (file->f_flags & O_NONBLOCK)
1145 ast_flags |= LDLM_FL_BLOCK_NOWAIT;
1146 node = ll_node_from_inode(inode, start, end,
1147 (rw == WRITE) ? LCK_PW : LCK_PR);
1152 tree->lt_fd = LUSTRE_FPRIVATE(file);
1153 rc = ll_tree_lock(tree, node, buf, count, ast_flags);
1156 else if (rc == -EUSERS)
1157 ll_set_file_contended(inode);
1161 RETURN(tree_locked);
/* Read from a Lustre file through the page cache.
 * Visible flow: take a DLM extent lock over the request (or a chunk of it
 * bounded by sbi->ll_max_rw_chunk), merge the known-minimum size (kms) from
 * the LVB, then hand the locked chunk to generic_file_read() (or to
 * ll_file_lockless_io() on the lockless path) and loop, accumulating into
 * `sum`.
 * NOTE(review): this listing is non-contiguous (embedded line numbers skip),
 * so comments below describe only the statements actually visible. */
1166 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1169 struct inode *inode = file->f_dentry->d_inode;
1170 struct ll_inode_info *lli = ll_i2info(inode);
1171 struct lov_stripe_md *lsm = lli->lli_smd;
1172 struct ll_sb_info *sbi = ll_i2sbi(inode);
1173 struct ll_lock_tree tree;
1175 struct ll_ra_read bead;
1178 ssize_t retval, chunk, sum = 0;
1183 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1184 inode->i_ino, inode->i_generation, inode, count, *ppos);
1185 /* "If nbyte is 0, read() will return 0 and have no other results."
1186 * -- Single Unix Spec */
1190 ll_stats_ops_tally(sbi, LPROC_LL_READ_BYTES, count);
1193 /* Read on file with no objects should return zero-filled
1194 * buffers up to file size (we can get non-zero sizes with
1195 * mknod + truncate, then opening file for read. This is a
1196 * common pattern in NFS case, it seems). Bug 6243 */
1198 /* Since there are no objects on OSTs, we have nothing to get
1199 * lock on and so we are forced to access inode->i_size
1202 /* Read beyond end of file */
1203 if (*ppos >= i_size_read(inode))
/* Clamp a no-object read to the MDS-known file size. */
1206 if (count > i_size_read(inode) - *ppos)
1207 count = i_size_read(inode) - *ppos;
1208 /* Make sure to correctly adjust the file pos pointer for
/* No OST objects: satisfy the read with zeroes copied to userspace. */
1210 notzeroed = clear_user(buf, count);
1218 if (sbi->ll_max_rw_chunk != 0) {
1219 /* first, let's know the end of the current stripe */
1221 obd_extent_calc(sbi->ll_osc_exp, lsm, OBD_CALC_STRIPE_END,
1224 /* correct, the end is beyond the request */
1225 if (end > *ppos + count - 1)
1226 end = *ppos + count - 1;
1228 /* and chunk shouldn't be too large even if striping is wide */
1229 if (end - *ppos > sbi->ll_max_rw_chunk)
1230 end = *ppos + sbi->ll_max_rw_chunk - 1;
/* No chunking configured: lock the whole requested range at once. */
1232 end = *ppos + count - 1;
1235 tree_locked = ll_file_get_tree_lock(&tree, file, buf,
1236 count, *ppos, end, READ);
1237 if (tree_locked < 0)
1238 GOTO(out, retval = tree_locked);
1240 ll_inode_size_lock(inode, 1);
1242 * Consistency guarantees: following possibilities exist for the
1243 * relation between region being read and real file size at this
1246 * (A): the region is completely inside of the file;
1248 * (B-x): x bytes of region are inside of the file, the rest is
1251 * (C): the region is completely outside of the file.
1253 * This classification is stable under DLM lock acquired by
1254 * ll_tree_lock() above, because to change class, other client has to
1255 * take DLM lock conflicting with our lock. Also, any updates to
1256 * ->i_size by other threads on this client are serialized by
1257 * ll_inode_size_lock(). This guarantees that short reads are handled
1258 * correctly in the face of concurrent writes and truncates.
1260 inode_init_lvb(inode, &lvb);
1261 obd_merge_lvb(ll_i2sbi(inode)->ll_osc_exp, lsm, &lvb, 1);
/* kms (presumably lvb.lvb_size — confirm) is the known-minimum size. */
1263 if (*ppos + count - 1 > kms) {
1264 /* A glimpse is necessary to determine whether we return a
1265 * short read (B) or some zeroes at the end of the buffer (C) */
1266 ll_inode_size_unlock(inode, 1);
1267 retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
/* Glimpse failed: drop the tree lock before bailing out. */
1269 ll_tree_unlock(&tree);
1273 /* region is within kms and, hence, within real file size (A).
1274 * We need to increase i_size to cover the read region so that
1275 * generic_file_read() will do its job, but that doesn't mean
1276 * the kms size is _correct_, it is only the _minimum_ size.
1277 * If someone does a stat they will get the correct size which
1278 * will always be >= the kms value here. b=11081 */
1279 if (i_size_read(inode) < kms)
1280 i_size_write(inode, kms);
1281 ll_inode_size_unlock(inode, 1);
1284 chunk = end - *ppos + 1;
1285 CDEBUG(D_INODE,"Read ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
1286 inode->i_ino, chunk, *ppos, i_size_read(inode));
1288 /* turn off the kernel's read-ahead */
1290 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
1293 file->f_ra.ra_pages = 0;
1295 /* initialize read-ahead window once per syscall */
/* Lustre's own read-ahead window, in pages, covering [*ppos, *ppos+count). */
1298 bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
1299 bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
1300 ll_ra_read_in(file, &bead);
1304 file_accessed(file);
1305 retval = generic_file_read(file, buf, chunk, ppos);
1306 ll_tree_unlock(&tree);
/* Lockless-I/O alternative to the locked generic_file_read() path. */
1308 retval = ll_file_lockless_io(file, buf, chunk, ppos, READ);
1310 ll_rw_stats_tally(sbi, current->pid, file, count, 0);
/* Full chunk consumed and more requested: loop for the next chunk. */
1315 if (retval == chunk && count > 0)
1321 ll_ra_read_ex(file, &bead);
/* Report total bytes read if any chunk succeeded, else the last error. */
1322 retval = (sum > 0) ? sum : retval;
1327 * Write to a file (through the page cache).
/* Visible flow: serialize writers on lli_write_sem, take a DLM extent lock
 * ([*ppos, EOF] for O_APPEND, otherwise a chunk bounded by ll_max_rw_chunk),
 * enforce maxbytes/SIGXFSZ, then call generic_file_write() (or
 * ll_file_lockless_io()) per chunk, accumulating into `sum`.
 * NOTE(review): listing is non-contiguous; comments cover visible lines only. */
1329 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1332 struct inode *inode = file->f_dentry->d_inode;
1333 struct ll_sb_info *sbi = ll_i2sbi(inode);
1334 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1335 struct ll_lock_tree tree;
1336 loff_t maxbytes = ll_file_maxbytes(inode);
1337 loff_t lock_start, lock_end, end;
1338 ssize_t retval, chunk, sum = 0;
1342 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1343 inode->i_ino, inode->i_generation, inode, count, *ppos);
1345 SIGNAL_MASK_ASSERT(); /* XXX BUG 1511 */
1347 /* POSIX, but surprised the VFS doesn't check this already */
1351 /* If file was opened for LL_IOC_LOV_SETSTRIPE but the ioctl wasn't
1352 * called on the file, don't fail the below assertion (bug 2388). */
1353 if (file->f_flags & O_LOV_DELAY_CREATE &&
1354 ll_i2info(inode)->lli_smd == NULL)
1357 LASSERT(ll_i2info(inode)->lli_smd != NULL);
/* One writer at a time per inode on this client. */
1359 down(&ll_i2info(inode)->lli_write_sem);
1362 chunk = 0; /* just to fix gcc's warning */
1363 end = *ppos + count - 1;
1365 if (file->f_flags & O_APPEND) {
/* Append: the final offset is unknown until i_size is stabilized, so
 * lock all the way to EOF. */
1367 lock_end = OBD_OBJECT_EOF;
1368 } else if (sbi->ll_max_rw_chunk != 0) {
1369 /* first, let's know the end of the current stripe */
1371 obd_extent_calc(sbi->ll_osc_exp, lsm, OBD_CALC_STRIPE_END,
1374 /* correct, the end is beyond the request */
1375 if (end > *ppos + count - 1)
1376 end = *ppos + count - 1;
1378 /* and chunk shouldn't be too large even if striping is wide */
1379 if (end - *ppos > sbi->ll_max_rw_chunk)
1380 end = *ppos + sbi->ll_max_rw_chunk - 1;
1385 lock_end = *ppos + count - 1;
1388 tree_locked = ll_file_get_tree_lock(&tree, file, buf, count,
1389 lock_start, lock_end, WRITE);
1390 if (tree_locked < 0)
1391 GOTO(out, retval = tree_locked);
1393 /* This is ok, g_f_w will overwrite this under i_sem if it races
1394 * with a local truncate, it just makes our maxbyte checking easier.
1395 * The i_size value gets updated in ll_extent_lock() as a consequence
1396 * of the [0,EOF] extent lock we requested above. */
1397 if (file->f_flags & O_APPEND) {
1398 *ppos = i_size_read(inode);
1399 end = *ppos + count - 1;
/* POSIX: writing at/over the per-file limit raises SIGXFSZ and EFBIG. */
1402 if (*ppos >= maxbytes) {
1403 send_sig(SIGXFSZ, current, 0);
1404 GOTO(out_unlock, retval = -EFBIG);
1406 if (end > maxbytes - 1)
1409 /* generic_file_write handles O_APPEND after getting i_mutex */
1410 chunk = end - *ppos + 1;
1411 CDEBUG(D_INFO, "Writing inode %lu, "LPSZ" bytes, offset %Lu\n",
1412 inode->i_ino, chunk, *ppos);
1414 retval = generic_file_write(file, buf, chunk, ppos);
/* Lockless-I/O alternative path; cast drops const for the helper. */
1416 retval = ll_file_lockless_io(file, (char*)buf, chunk,
1418 ll_rw_stats_tally(ll_i2sbi(inode), current->pid, file, chunk, 1);
1422 ll_tree_unlock(&tree);
/* Full chunk written and more pending: loop for the next chunk. */
1429 if (retval == chunk && count > 0)
1433 up(&ll_i2info(inode)->lli_write_sem);
1435 retval = (sum > 0) ? sum : retval;
1436 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1437 retval > 0 ? retval : 0);
1442 * Send file content (through pagecache) somewhere with helper
/* sendfile(2) support (2.6 kernels only): like ll_file_read() but the locked
 * data is pushed through `actor` into `target` by generic_file_sendfile().
 * Takes a single PR extent lock over the whole request (no chunking).
 * NOTE(review): listing is non-contiguous; comments cover visible lines only. */
1444 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
1445 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1446 read_actor_t actor, void *target)
1448 struct inode *inode = in_file->f_dentry->d_inode;
1449 struct ll_inode_info *lli = ll_i2info(inode);
1450 struct lov_stripe_md *lsm = lli->lli_smd;
1451 struct ll_lock_tree tree;
1452 struct ll_lock_tree_node *node;
1454 struct ll_ra_read bead;
1459 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1460 inode->i_ino, inode->i_generation, inode, count, *ppos);
1462 /* "If nbyte is 0, read() will return 0 and have no other results."
1463 * -- Single Unix Spec */
1467 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_READ_BYTES, count);
1468 /* turn off the kernel's read-ahead */
1469 in_file->f_ra.ra_pages = 0;
1471 /* File with no objects, nothing to lock */
1473 RETURN(generic_file_sendfile(in_file, ppos, count, actor, target));
/* Single PR lock covering the full request. */
1475 node = ll_node_from_inode(inode, *ppos, *ppos + count - 1, LCK_PR);
1477 RETURN(PTR_ERR(node));
1479 tree.lt_fd = LUSTRE_FPRIVATE(in_file);
1480 rc = ll_tree_lock(&tree, node, NULL, count,
1481 in_file->f_flags & O_NONBLOCK?LDLM_FL_BLOCK_NOWAIT:0);
1485 ll_clear_file_contended(inode);
1486 ll_inode_size_lock(inode, 1);
1488 * Consistency guarantees: following possibilities exist for the
1489 * relation between region being read and real file size at this
1492 * (A): the region is completely inside of the file;
1494 * (B-x): x bytes of region are inside of the file, the rest is
1497 * (C): the region is completely outside of the file.
1499 * This classification is stable under DLM lock acquired by
1500 * ll_tree_lock() above, because to change class, other client has to
1501 * take DLM lock conflicting with our lock. Also, any updates to
1502 * ->i_size by other threads on this client are serialized by
1503 * ll_inode_size_lock(). This guarantees that short reads are handled
1504 * correctly in the face of concurrent writes and truncates.
1506 inode_init_lvb(inode, &lvb);
1507 obd_merge_lvb(ll_i2sbi(inode)->ll_osc_exp, lsm, &lvb, 1);
1509 if (*ppos + count - 1 > kms) {
1510 /* A glimpse is necessary to determine whether we return a
1511 * short read (B) or some zeroes at the end of the buffer (C) */
1512 ll_inode_size_unlock(inode, 1);
1513 retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
1517 /* region is within kms and, hence, within real file size (A) */
1518 i_size_write(inode, kms);
1519 ll_inode_size_unlock(inode, 1);
1522 CDEBUG(D_INFO, "Send ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
1523 inode->i_ino, count, *ppos, i_size_read(inode));
/* Lustre read-ahead window in pages for this request. */
1525 bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
1526 bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
1527 ll_ra_read_in(in_file, &bead);
1529 file_accessed(in_file);
1530 retval = generic_file_sendfile(in_file, ppos, count, actor, target);
1531 ll_ra_read_ex(in_file, &bead);
1534 ll_tree_unlock(&tree);
/* LL_IOC_RECREATE_OBJ: recreate a lost OST object for this inode.
 * Admin-only (CAP_SYS_ADMIN). Copies a struct ll_recreate_obj from userspace,
 * clones the inode's stripe metadata and asks the OSC to re-create the object
 * with OBD_FL_RECREATE_OBJS, serialized under lli_size_sem.
 * NOTE(review): listing is non-contiguous; comments cover visible lines only. */
1539 static int ll_lov_recreate_obj(struct inode *inode, struct file *file,
1542 struct ll_inode_info *lli = ll_i2info(inode);
1543 struct obd_export *exp = ll_i2obdexp(inode);
1544 struct ll_recreate_obj ucreatp;
1545 struct obd_trans_info oti = { 0 };
1546 struct obdo *oa = NULL;
1549 struct lov_stripe_md *lsm, *lsm2;
1552 if (!capable (CAP_SYS_ADMIN))
1555 rc = copy_from_user(&ucreatp, (struct ll_recreate_obj *)arg,
1556 sizeof(struct ll_recreate_obj));
1564 down(&lli->lli_size_sem);
1567 GOTO(out, rc = -ENOENT);
/* Size of the stripe md including its per-stripe lov_oinfo array. */
1568 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1569 (lsm->lsm_stripe_count));
1571 OBD_ALLOC(lsm2, lsm_size);
1573 GOTO(out, rc = -ENOMEM);
/* Target object id/OST index come from the user request;
 * o_nlink carrying the OST index looks like a field reuse — confirm. */
1575 oa->o_id = ucreatp.lrc_id;
1576 oa->o_nlink = ucreatp.lrc_ost_idx;
1577 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1578 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS;
1579 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1580 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1582 memcpy(lsm2, lsm, lsm_size);
1583 rc = obd_create(exp, oa, &lsm2, &oti);
1585 OBD_FREE(lsm2, lsm_size);
1588 up(&lli->lli_size_sem);
/* Apply user-supplied striping (lov_user_md) to an inode by re-opening it
 * with an IT_OPEN intent carrying the EA. Fails early if the inode already
 * has a stripe md (striping can only be set once, before first write).
 * NOTE(review): listing is non-contiguous; comments cover visible lines only. */
1593 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1594 int flags, struct lov_user_md *lum,
1597 struct ll_inode_info *lli = ll_i2info(inode);
1598 struct lov_stripe_md *lsm;
1599 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1603 down(&lli->lli_size_sem);
/* Already striped: release the semaphore and report via CDEBUG. */
1606 up(&lli->lli_size_sem);
1607 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1612 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1615 if (it_disposition(&oit, DISP_LOOKUP_NEG))
1616 GOTO(out_req_free, rc = -ENOENT);
1617 rc = oit.d.lustre.it_status;
1619 GOTO(out_req_free, rc);
/* Close the open handle created purely to carry the setstripe intent. */
1621 ll_release_openhandle(file->f_dentry, &oit);
1624 up(&lli->lli_size_sem);
1625 ll_intent_release(&oit);
/* Error path: drop the request pinned by the intent. */
1628 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/* Fetch a file's striping EA (lov_mds_md) from the MDS by name.
 * Returns the (byte-swapped-to-host) lmm and its size via lmmp/lmm_size, and
 * the pinning ptlrpc request via *request so the caller can release it.
 * LOV_MAGIC_JOIN EAs are expanded into a lov_user_md_join with explicit
 * per-stripe extents before being returned.
 * NOTE(review): listing is non-contiguous; comments cover visible lines only. */
1632 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1633 struct lov_mds_md **lmmp, int *lmm_size,
1634 struct ptlrpc_request **request)
1636 struct ll_sb_info *sbi = ll_i2sbi(inode);
1638 struct mds_body *body;
1639 struct lov_mds_md *lmm = NULL;
1640 struct ptlrpc_request *req = NULL;
1643 ll_inode2fid(&fid, inode);
1645 rc = ll_get_max_mdsize(sbi, &lmmsize);
1649 rc = mdc_getattr_name(sbi->ll_mdc_exp, &fid,
1650 filename, strlen(filename) + 1,
1651 OBD_MD_FLEASIZE | OBD_MD_FLDIREA,
1654 CDEBUG(D_INFO, "mdc_getattr_name failed "
1655 "on %s: rc %d\n", filename, rc);
1659 body = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF,
1661 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1662 /* swabbed by mdc_getattr_name */
1663 LASSERT(lustre_rep_swabbed(req, REPLY_REC_OFF));
1665 lmmsize = body->eadatasize;
/* No striping EA present on the MDS for this name. */
1667 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1669 GOTO(out, rc = -ENODATA);
1672 lmm = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF + 1,
1674 LASSERT(lmm != NULL);
1675 LASSERT(lustre_rep_swabbed(req, REPLY_REC_OFF + 1));
1678 * This is coming from the MDS, so is probably in
1679 * little endian. We convert it to host endian before
1680 * passing it to userspace.
/* Magic stored byte-swapped means the EA is in the other endianness. */
1682 if (lmm->lmm_magic == __swab32(LOV_MAGIC)) {
1683 lustre_swab_lov_user_md((struct lov_user_md *)lmm);
1684 lustre_swab_lov_user_md_objects((struct lov_user_md *)lmm);
1685 } else if (lmm->lmm_magic == __swab32(LOV_MAGIC_JOIN)) {
1686 lustre_swab_lov_user_md_join((struct lov_user_md_join *)lmm);
1689 if (lmm->lmm_magic == LOV_MAGIC_JOIN) {
1690 struct lov_stripe_md *lsm;
1691 struct lov_user_md_join *lmj;
1692 int lmj_size, i, aindex = 0;
/* Unpack to an in-memory lsm so the extent array can be walked. */
1694 rc = obd_unpackmd(sbi->ll_osc_exp, &lsm, lmm, lmmsize);
1696 GOTO(out, rc = -ENOMEM);
1697 rc = obd_checkmd(sbi->ll_osc_exp, sbi->ll_mdc_exp, lsm);
1699 GOTO(out_free_memmd, rc);
1701 lmj_size = sizeof(struct lov_user_md_join) +
1702 lsm->lsm_stripe_count *
1703 sizeof(struct lov_user_ost_data_join);
1704 OBD_ALLOC(lmj, lmj_size);
1706 GOTO(out_free_memmd, rc = -ENOMEM);
1708 memcpy(lmj, lmm, sizeof(struct lov_user_md_join));
1709 for (i = 0; i < lsm->lsm_stripe_count; i++) {
1710 struct lov_extent *lex =
1711 &lsm->lsm_array->lai_ext_array[aindex];
/* Advance to the extent that covers stripe i. */
1713 if (lex->le_loi_idx + lex->le_stripe_count <= i)
1715 CDEBUG(D_INFO, "aindex %d i %d l_extent_start "
1716 LPU64" len %d\n", aindex, i,
1717 lex->le_start, (int)lex->le_len);
1718 lmj->lmm_objects[i].l_extent_start =
/* le_len == -1 marks an open-ended (to-EOF) extent. */
1721 if ((int)lex->le_len == -1)
1722 lmj->lmm_objects[i].l_extent_end = -1;
1724 lmj->lmm_objects[i].l_extent_end =
1725 lex->le_start + lex->le_len;
1726 lmj->lmm_objects[i].l_object_id =
1727 lsm->lsm_oinfo[i]->loi_id;
1728 lmj->lmm_objects[i].l_object_gr =
1729 lsm->lsm_oinfo[i]->loi_gr;
1730 lmj->lmm_objects[i].l_ost_gen =
1731 lsm->lsm_oinfo[i]->loi_ost_gen;
1732 lmj->lmm_objects[i].l_ost_idx =
1733 lsm->lsm_oinfo[i]->loi_ost_idx;
/* Hand back the expanded join md in place of the raw EA. */
1735 lmm = (struct lov_mds_md *)lmj;
1738 obd_free_memmd(sbi->ll_osc_exp, &lsm);
1742 *lmm_size = lmmsize;
/* LL_IOC_LOV_SETEA: admin-only setstripe that accepts a full EA including
 * explicit object data (MDS_OPEN_HAS_OBJS). Copies the lov_user_md (plus one
 * lov_user_ost_data) from userspace and delegates to
 * ll_lov_setstripe_ea_info(). */
1746 static int ll_lov_setea(struct inode *inode, struct file *file,
1749 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1750 struct lov_user_md *lump;
1751 int lum_size = sizeof(struct lov_user_md) +
1752 sizeof(struct lov_user_ost_data);
1756 if (!capable (CAP_SYS_ADMIN))
1759 OBD_ALLOC(lump, lum_size);
1763 rc = copy_from_user(lump, (struct lov_user_md *)arg, lum_size);
/* copy_from_user failed: free and bail. */
1765 OBD_FREE(lump, lum_size);
1769 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1771 OBD_FREE(lump, lum_size);
/* LL_IOC_LOV_SETSTRIPE: copy the user's lov_user_md, apply it via
 * ll_lov_setstripe_ea_info(), then echo the resulting striping back to the
 * user buffer through the GETSTRIPE iocontrol path. */
1775 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1778 struct lov_user_md lum, *lump = (struct lov_user_md *)arg;
1780 int flags = FMODE_WRITE;
1783 /* Bug 1152: copy properly when this is no longer true */
1784 LASSERT(sizeof(lum) == sizeof(*lump));
1785 LASSERT(sizeof(lum.lmm_objects[0]) == sizeof(lump->lmm_objects[0]));
1786 rc = copy_from_user(&lum, lump, sizeof(lum));
1790 rc = ll_lov_setstripe_ea_info(inode, file, flags, &lum, sizeof(lum));
/* Zero the user's stripe_count first so a partial GETSTRIPE is detectable. */
1792 put_user(0, &lump->lmm_stripe_count);
1793 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2obdexp(inode),
1794 0, ll_i2info(inode)->lli_smd, lump);
/* LL_IOC_LOV_GETSTRIPE: return the inode's striping to userspace via the
 * OSC iocontrol; visible lines show only the lsm lookup and the call. */
1799 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1801 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1806 return obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2obdexp(inode), 0, lsm,
/* LL_IOC_GROUP_LOCK: take a whole-file LCK_GROUP extent lock with group id
 * `arg` and remember it in the file descriptor (fd_cwlockh). While held,
 * fd_flags gains LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK. */
1810 static int ll_get_grouplock(struct inode *inode, struct file *file,
1813 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1814 ldlm_policy_data_t policy = { .l_extent = { .start = 0,
1815 .end = OBD_OBJECT_EOF}};
1816 struct lustre_handle lockh = { 0 };
1817 struct ll_inode_info *lli = ll_i2info(inode);
1818 struct lov_stripe_md *lsm = lli->lli_smd;
/* Already group-locked on this fd: visible error path (body not shown). */
1822 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1826 policy.l_extent.gid = arg;
1827 if (file->f_flags & O_NONBLOCK)
1828 flags = LDLM_FL_BLOCK_NOWAIT;
1830 rc = ll_extent_lock(fd, inode, lsm, LCK_GROUP, &policy, &lockh, flags);
1834 fd->fd_flags |= LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK;
/* Stash the lock handle so ll_put_grouplock() can release it later. */
1836 memcpy(&fd->fd_cwlockh, &lockh, sizeof(lockh));
/* LL_IOC_GROUP_UNLOCK: release the group lock taken by ll_get_grouplock().
 * Validates that the fd actually holds a group lock and that the caller's
 * gid matches before unlocking and clearing the saved handle. */
1841 static int ll_put_grouplock(struct inode *inode, struct file *file,
1844 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1845 struct ll_inode_info *lli = ll_i2info(inode);
1846 struct lov_stripe_md *lsm = lli->lli_smd;
1850 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1851 /* Ugh, it's already unlocked. */
1855 if (fd->fd_gid != arg) /* Ugh? Unlocking with different gid? */
1858 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
1860 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP, &fd->fd_cwlockh);
/* Clear the stale handle so a later unlock can't reuse it. */
1865 memset(&fd->fd_cwlockh, 0, sizeof(fd->fd_cwlockh));
/* Validate a file-join request: server must advertise LL_SBI_JOIN, both
 * inodes must be regular files, distinct, and the head's size must be a
 * multiple of JOIN_FILE_ALIGN (64K per the message). */
1870 static int join_sanity_check(struct inode *head, struct inode *tail)
1873 if ((ll_i2sbi(head)->ll_flags & LL_SBI_JOIN) == 0) {
1874 CERROR("server do not support join \n");
1877 if (!S_ISREG(tail->i_mode) || !S_ISREG(head->i_mode)) {
1878 CERROR("tail ino %lu and ino head %lu must be regular\n",
1879 head->i_ino, tail->i_ino);
1882 if (head->i_ino == tail->i_ino) {
1883 CERROR("file %lu can not be joined to itself \n", head->i_ino);
1886 if (i_size_read(head) % JOIN_FILE_ALIGN) {
1887 CERROR("hsize %llu must be times of 64K\n", i_size_read(head));
/* Issue the MDS join operation: enqueue an IT_OPEN intent with O_JOIN_FILE
 * on the head file, passing the head's size and the tail's name/parent in
 * the mdc_op_data. Any granted lock is dropped immediately and the resulting
 * open handle is closed via ll_release_openhandle().
 * NOTE(review): listing is non-contiguous; comments cover visible lines only. */
1893 static int join_file(struct inode *head_inode, struct file *head_filp,
1894 struct file *tail_filp)
1896 struct dentry *tail_dentry = tail_filp->f_dentry;
1897 struct lookup_intent oit = {.it_op = IT_OPEN,
1898 .it_flags = head_filp->f_flags|O_JOIN_FILE};
1899 struct ldlm_enqueue_info einfo = { LDLM_IBITS, LCK_PW,
1900 ll_mdc_blocking_ast, ldlm_completion_ast, NULL, NULL };
1902 struct lustre_handle lockh;
1903 struct mdc_op_data *op_data;
1908 tail_dentry = tail_filp->f_dentry;
1910 OBD_ALLOC_PTR(op_data);
1911 if (op_data == NULL) {
/* `data` carries the head file's current size to the MDS. */
1915 data = i_size_read(head_inode);
1916 ll_prepare_mdc_op_data(op_data, head_inode,
1917 tail_dentry->d_parent->d_inode,
1918 tail_dentry->d_name.name,
1919 tail_dentry->d_name.len, 0, &data);
1920 rc = mdc_enqueue(ll_i2mdcexp(head_inode), &einfo, &oit,
1921 op_data, &lockh, NULL, 0, 0);
1926 rc = oit.d.lustre.it_status;
1928 if (rc < 0 || it_open_error(DISP_OPEN_OPEN, &oit)) {
1929 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, &oit);
1930 ptlrpc_req_finished((struct ptlrpc_request *)
1931 oit.d.lustre.it_data);
1935 if (oit.d.lustre.it_lock_mode) { /* If we got lock - release it right
/* ...away; the intent lock is not needed past the enqueue. */
1937 ldlm_lock_decref(&lockh, oit.d.lustre.it_lock_mode);
1938 oit.d.lustre.it_lock_mode = 0;
1940 ll_release_openhandle(head_filp->f_dentry, &oit);
1943 OBD_FREE_PTR(op_data);
1944 ll_intent_release(&oit);
/* LL_IOC_JOIN: append the file named filename_tail onto `head`.
 * Opens the tail, orders the two inodes by ino to take their whole-file
 * LCK_EX tree locks in a fixed order (deadlock avoidance), sanity-checks,
 * performs the join via join_file(), then unwinds through cleanup_phase.
 * On success the head's cached stripe md is freed so it is refetched.
 * NOTE(review): listing is non-contiguous; comments cover visible lines only. */
1948 static int ll_file_join(struct inode *head, struct file *filp,
1949 char *filename_tail)
1951 struct inode *tail = NULL, *first = NULL, *second = NULL;
1952 struct dentry *tail_dentry;
1953 struct file *tail_filp, *first_filp, *second_filp;
1954 struct ll_lock_tree first_tree, second_tree;
1955 struct ll_lock_tree_node *first_node, *second_node;
1956 struct ll_inode_info *hlli = ll_i2info(head), *tlli;
1957 int rc = 0, cleanup_phase = 0;
1960 CDEBUG(D_VFSTRACE, "VFS Op:head=%lu/%u(%p) tail %s\n",
1961 head->i_ino, head->i_generation, head, filename_tail);
1963 tail_filp = filp_open(filename_tail, O_WRONLY, 0644);
1964 if (IS_ERR(tail_filp)) {
1965 CERROR("Can not open tail file %s", filename_tail);
1966 rc = PTR_ERR(tail_filp);
1969 tail = igrab(tail_filp->f_dentry->d_inode);
1971 tlli = ll_i2info(tail);
1972 tail_dentry = tail_filp->f_dentry;
1973 LASSERT(tail_dentry);
1976 /*reorder the inode for lock sequence*/
1977 first = head->i_ino > tail->i_ino ? head : tail;
1978 second = head->i_ino > tail->i_ino ? tail : head;
1979 first_filp = head->i_ino > tail->i_ino ? filp : tail_filp;
1980 second_filp = head->i_ino > tail->i_ino ? tail_filp : filp;
1982 CDEBUG(D_INFO, "reorder object from %lu:%lu to %lu:%lu \n",
1983 head->i_ino, tail->i_ino, first->i_ino, second->i_ino);
/* Whole-file exclusive lock on the first inode in lock order. */
1984 first_node = ll_node_from_inode(first, 0, OBD_OBJECT_EOF, LCK_EX);
1985 if (IS_ERR(first_node)){
1986 rc = PTR_ERR(first_node);
1989 first_tree.lt_fd = first_filp->private_data;
1990 rc = ll_tree_lock(&first_tree, first_node, NULL, 0, 0);
1995 second_node = ll_node_from_inode(second, 0, OBD_OBJECT_EOF, LCK_EX);
1996 if (IS_ERR(second_node)){
1997 rc = PTR_ERR(second_node);
2000 second_tree.lt_fd = second_filp->private_data;
2001 rc = ll_tree_lock(&second_tree, second_node, NULL, 0, 0);
2006 rc = join_sanity_check(head, tail);
2010 rc = join_file(head, filp, tail_filp);
/* Fall-through cleanup: higher phases undo later acquisitions first. */
2014 switch (cleanup_phase) {
2016 ll_tree_unlock(&second_tree);
2017 obd_cancel_unused(ll_i2obdexp(second),
2018 ll_i2info(second)->lli_smd, 0, NULL);
2020 ll_tree_unlock(&first_tree);
2021 obd_cancel_unused(ll_i2obdexp(first),
2022 ll_i2info(first)->lli_smd, 0, NULL);
2024 filp_close(tail_filp, 0);
/* Successful join: drop the head's stale stripe md so it is re-read. */
2027 if (head && rc == 0) {
2028 obd_free_memmd(ll_i2sbi(head)->ll_osc_exp,
2030 hlli->lli_smd = NULL;
2035 CERROR("invalid cleanup_phase %d\n", cleanup_phase);
/* Close the MDS open handle obtained by an intent (used when an intent-open
 * was issued only to carry an operation, e.g. setstripe or join, and no real
 * file handle will keep it). No-op for the root dentry or when the intent
 * carries no DISP_OPEN_OPEN disposition. */
2041 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2043 struct inode *inode = dentry->d_inode;
2044 struct obd_client_handle *och;
2050 /* Root ? Do nothing. */
2051 if (dentry->d_inode->i_sb->s_root == dentry)
2054 /* No open handle to close? Move away */
2055 if (!it_disposition(it, DISP_OPEN_OPEN))
2058 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2060 OBD_ALLOC(och, sizeof(*och));
2062 GOTO(out, rc = -ENOMEM);
/* Build a client handle from the intent, then close it on the MDS. */
2064 ll_och_fill(ll_i2info(inode), it, och);
2066 rc = ll_close_inode_openhandle(inode, och);
2068 OBD_FREE(och, sizeof(*och));
2070 /* this one is in place of ll_file_open */
2071 ptlrpc_req_finished(it->d.lustre.it_data);
2072 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* ioctl dispatcher for regular Lustre files: per-fd flag get/set, LOV
 * striping ioctls, object recreate, ext3-compat flag/version ioctls, file
 * join, group locks, statfs and OBD name queries; everything else falls
 * through to ll_iocontrol_call()/obd_iocontrol().
 * NOTE(review): listing is non-contiguous; comments cover visible lines only. */
2076 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
2079 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2083 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2084 inode->i_generation, inode, cmd);
2085 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2087 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2088 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2092 case LL_IOC_GETFLAGS:
2093 /* Get the current value of the file flags */
2094 return put_user(fd->fd_flags, (int *)arg);
2095 case LL_IOC_SETFLAGS:
2096 case LL_IOC_CLRFLAGS:
2097 /* Set or clear specific file flags */
2098 /* XXX This probably needs checks to ensure the flags are
2099 * not abused, and to handle any flag side effects.
2101 if (get_user(flags, (int *) arg))
2104 if (cmd == LL_IOC_SETFLAGS) {
/* IGNORE_LOCK is only meaningful for O_DIRECT files. */
2105 if ((flags & LL_FILE_IGNORE_LOCK) &&
2106 !(file->f_flags & O_DIRECT)) {
2107 CERROR("%s: unable to disable locking on "
2108 "non-O_DIRECT file\n", current->comm);
2112 fd->fd_flags |= flags;
2114 fd->fd_flags &= ~flags;
2117 case LL_IOC_LOV_SETSTRIPE:
2118 RETURN(ll_lov_setstripe(inode, file, arg));
2119 case LL_IOC_LOV_SETEA:
2120 RETURN(ll_lov_setea(inode, file, arg));
2121 case LL_IOC_LOV_GETSTRIPE:
2122 RETURN(ll_lov_getstripe(inode, arg));
2123 case LL_IOC_RECREATE_OBJ:
2124 RETURN(ll_lov_recreate_obj(inode, file, arg));
2125 case EXT3_IOC_GETFLAGS:
2126 case EXT3_IOC_SETFLAGS:
2127 RETURN(ll_iocontrol(inode, file, cmd, arg));
2128 case EXT3_IOC_GETVERSION_OLD:
2129 case EXT3_IOC_GETVERSION:
2130 RETURN(put_user(inode->i_generation, (int *)arg));
/* File join: fetch tail pathname from userspace, then join. */
2135 ftail = getname((const char *)arg);
2137 RETURN(PTR_ERR(ftail));
2138 rc = ll_file_join(inode, file, ftail);
2142 case LL_IOC_GROUP_LOCK:
2143 RETURN(ll_get_grouplock(inode, file, arg));
2144 case LL_IOC_GROUP_UNLOCK:
2145 RETURN(ll_put_grouplock(inode, file, arg));
2146 case IOC_OBD_STATFS:
2147 RETURN(ll_obd_statfs(inode, (void *)arg));
2148 case OBD_IOC_GETNAME_OLD:
2149 case OBD_IOC_GETNAME: {
2150 struct obd_device *obd =
2151 class_exp2obd(ll_i2sbi(inode)->ll_osc_exp);
2154 if (copy_to_user((void *)arg, obd->obd_name,
2155 strlen(obd->obd_name) + 1))
2160 /* We need to special case any other ioctls we want to handle,
2161 * to send them to the MDS/OST as appropriate and to properly
2162 * network encode the arg field.
2163 case EXT3_IOC_SETVERSION_OLD:
2164 case EXT3_IOC_SETVERSION:
/* Registered external handlers get first refusal, then the OSC. */
2170 ll_iocontrol_call(inode, file, cmd, arg, &err))
2173 RETURN(obd_iocontrol(cmd, ll_i2obdexp(inode), 0, NULL,
/* llseek: like generic lseek but SEEK_END must first glimpse the real file
 * size from the OSTs (under the inode size lock) before offsetting from
 * i_size. The new position is range-checked against ll_file_maxbytes().
 * NOTE(review): listing is non-contiguous; comments cover visible lines only. */
2179 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2181 struct inode *inode = file->f_dentry->d_inode;
2182 struct ll_inode_info *lli = ll_i2info(inode);
2183 struct lov_stripe_md *lsm = lli->lli_smd;
/* Tentative target, used only for the trace message below. */
2186 retval = offset + ((origin == 2) ? i_size_read(inode) :
2187 (origin == 1) ? file->f_pos : 0);
2188 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%Lu=%#Lx(%s)\n",
2189 inode->i_ino, inode->i_generation, inode, retval, retval,
2190 origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR" : "SEEK_SET");
2191 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2193 if (origin == 2) { /* SEEK_END */
2194 int nonblock = 0, rc;
2196 if (file->f_flags & O_NONBLOCK)
2197 nonblock = LDLM_FL_BLOCK_NOWAIT;
/* Refresh i_size from the OSTs so SEEK_END is accurate. */
2200 rc = ll_glimpse_size(inode, nonblock);
2205 ll_inode_size_lock(inode, 0);
2206 offset += i_size_read(inode);
2207 ll_inode_size_unlock(inode, 0);
2208 } else if (origin == 1) { /* SEEK_CUR */
2209 offset += file->f_pos;
2213 if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
2214 if (offset != file->f_pos) {
2215 file->f_pos = offset;
2216 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
2218 file->f_version = ++event;
2220 file->f_version = 0;
/* fsync: wait for in-flight pagecache writeback, surface any stored async
 * write errors (inode-level and per-stripe), then sync metadata via
 * mdc_sync() and, when objects exist, data via obd_sync() on the OSC.
 * NOTE(review): listing is non-contiguous; comments cover visible lines only. */
2229 int ll_fsync(struct file *file, struct dentry *dentry, int data)
2231 struct inode *inode = dentry->d_inode;
2232 struct ll_inode_info *lli = ll_i2info(inode);
2233 struct lov_stripe_md *lsm = lli->lli_smd;
2235 struct ptlrpc_request *req;
2238 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2239 inode->i_generation, inode);
2240 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2242 /* fsync's caller has already called _fdata{sync,write}, we want
2243 * that IO to finish before calling the osc and mdc sync methods */
2244 rc = filemap_fdatawait(inode->i_mapping);
2246 /* catch async errors that were recorded back when async writeback
2247 * failed for pages in this mapping. */
2248 err = lli->lli_async_rc;
2249 lli->lli_async_rc = 0;
/* Also collect per-stripe async errors recorded in the lsm. */
2253 err = lov_test_and_clear_async_rc(lsm);
2258 ll_inode2fid(&fid, inode);
2259 err = mdc_sync(ll_i2sbi(inode)->ll_mdc_exp, &fid, &req);
2263 ptlrpc_req_finished(req);
/* obdo allocation failure falls back to -ENOMEM unless rc is already set. */
2270 RETURN(rc ? rc : -ENOMEM);
2272 oa->o_id = lsm->lsm_object_id;
2273 oa->o_valid = OBD_MD_FLID;
2274 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
2275 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
2277 err = obd_sync(ll_i2sbi(inode)->ll_osc_exp, oa, lsm,
/* POSIX fcntl/flock locking, implemented as LDLM_FLOCK locks on the MDS.
 * Maps fl_type to an LDLM mode (PR/PW/NL-as-unlock), fl_flags+cmd to enqueue
 * flags (nonblocking, test-only), then enqueues via ldlm_cli_enqueue() with
 * ldlm_flock_completion_ast and mirrors the result into the local lock lists.
 * NOTE(review): listing is non-contiguous; comments cover visible lines only. */
2287 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2289 struct inode *inode = file->f_dentry->d_inode;
2290 struct ll_sb_info *sbi = ll_i2sbi(inode);
2291 struct ldlm_res_id res_id =
2292 { .name = {inode->i_ino, inode->i_generation, LDLM_FLOCK} };
2293 struct ldlm_enqueue_info einfo = { LDLM_FLOCK, 0, NULL,
2294 ldlm_flock_completion_ast, NULL, file_lock };
2295 struct lustre_handle lockh = {0};
2296 ldlm_policy_data_t flock;
2301 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2302 inode->i_ino, file_lock);
2303 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2305 if (file_lock->fl_flags & FL_FLOCK) {
2306 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2307 /* set missing params for flock() calls */
2308 file_lock->fl_end = OFFSET_MAX;
2309 file_lock->fl_pid = current->tgid;
2311 flock.l_flock.pid = file_lock->fl_pid;
2312 flock.l_flock.start = file_lock->fl_start;
2313 flock.l_flock.end = file_lock->fl_end;
2315 switch (file_lock->fl_type) {
/* F_RDLCK (presumably the case label not shown) → shared PR mode. */
2317 einfo.ei_mode = LCK_PR;
2320 /* An unlock request may or may not have any relation to
2321 * existing locks so we may not be able to pass a lock handle
2322 * via a normal ldlm_lock_cancel() request. The request may even
2323 * unlock a byte range in the middle of an existing lock. In
2324 * order to process an unlock request we need all of the same
2325 * information that is given with a normal read or write record
2326 * lock request. To avoid creating another ldlm unlock (cancel)
2327 * message we'll treat a LCK_NL flock request as an unlock. */
2328 einfo.ei_mode = LCK_NL;
2331 einfo.ei_mode = LCK_PW;
2334 CERROR("unknown fcntl lock type: %d\n", file_lock->fl_type);
/* Non-blocking set request. */
2349 flags = LDLM_FL_BLOCK_NOWAIT;
/* F_GETLK-style query: test only, do not take the lock. */
2355 flags = LDLM_FL_TEST_LOCK;
2356 /* Save the old mode so that if the mode in the lock changes we
2357 * can decrement the appropriate reader or writer refcount. */
2358 file_lock->fl_type = einfo.ei_mode;
2361 CERROR("unknown fcntl lock command: %d\n", cmd);
2365 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2366 "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2367 flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2369 rc = ldlm_cli_enqueue(sbi->ll_mdc_exp, NULL, &einfo, res_id,
2370 &flock, &flags, NULL, 0, NULL, &lockh, 0);
/* On success, record the lock in the kernel's local flock/posix lists. */
2371 if ((file_lock->fl_flags & FL_FLOCK) && (rc == 0))
2372 ll_flock_lock_file_wait(file, file_lock, (cmd == F_SETLKW));
2373 #ifdef HAVE_F_OP_FLOCK
2374 if ((file_lock->fl_flags & FL_POSIX) && (rc == 0) &&
2375 !(flags & LDLM_FL_TEST_LOCK))
2376 posix_lock_file_wait(file, file_lock);
/* Lock callback for mounts with file locking disabled; body not shown in
 * this listing (presumably returns an error — confirm against full source). */
2382 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
/* Test (without taking) whether this client already holds an MDS inodebits
 * lock covering `bits` on `inode`, in any of CR/CW/PR/PW modes.
 * LDLM_FL_TEST_LOCK makes ldlm_lock_match() a pure query. */
2389 int ll_have_md_lock(struct inode *inode, __u64 bits)
2391 struct lustre_handle lockh;
2392 struct ldlm_res_id res_id = { .name = {0} };
2393 struct obd_device *obddev;
2394 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2401 obddev = ll_i2mdcexp(inode)->exp_obd;
/* MDS resource is named by ino + generation. */
2402 res_id.name[0] = inode->i_ino;
2403 res_id.name[1] = inode->i_generation;
2405 CDEBUG(D_INFO, "trying to match res "LPU64"\n", res_id.name[0]);
2407 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2408 if (ldlm_lock_match(obddev->obd_namespace, flags, &res_id, LDLM_IBITS,
2409 &policy, LCK_CR|LCK_CW|LCK_PR|LCK_PW, &lockh)) {
/* Common error post-processing for revalidation: -ENOENT on an already-
 * unlinked inode is downgraded (nlink updated, success returned for the
 * file types noted below); other errors are logged. */
2416 static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
2417 if (rc == -ENOENT) { /* Already unlinked. Just update nlink
2418 * and return success */
2420 /* This path cannot be hit for regular files unless in
2421 * case of obscure races, so no need to to validate
2423 if (!S_ISREG(inode->i_mode) &&
2424 !S_ISDIR(inode->i_mode))
2429 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
/* Revalidate a dentry/inode against the MDS. Two strategies:
 *  - server supports OBD_CONNECT_ATTRFID: getattr-by-fid via an IT_GETATTR
 *    mdc_intent_lock (no name), unhashing the dentry if it was unlinked;
 *  - otherwise: plain mdc_getattr (with EA sizing for regular files) only
 *    when no covering UPDATE|LOOKUP MDS lock is already held.
 * Finishes with a glimpse to validate size, unless no objects are allocated.
 * NOTE(review): listing is non-contiguous; comments cover visible lines only. */
2437 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
2439 struct inode *inode = dentry->d_inode;
2440 struct ptlrpc_request *req = NULL;
2441 struct obd_export *exp;
2446 CERROR("REPORT THIS LINE TO PETER\n");
2449 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2450 inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2451 #if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,5,0))
2452 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_REVALIDATE, 1);
2455 exp = ll_i2mdcexp(inode);
2457 if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
2458 struct lookup_intent oit = { .it_op = IT_GETATTR };
2459 struct mdc_op_data op_data;
2461 /* Call getattr by fid, so do not provide name at all. */
2462 ll_prepare_mdc_op_data(&op_data, dentry->d_parent->d_inode,
2463 dentry->d_inode, NULL, 0, 0, NULL);
2464 rc = mdc_intent_lock(exp, &op_data, NULL, 0,
2465 /* we are not interested in name
2468 ll_mdc_blocking_ast, 0);
2470 rc = ll_inode_revalidate_fini(inode, rc);
2474 rc = revalidate_it_finish(req, DLM_REPLY_REC_OFF, &oit, dentry);
2476 ll_intent_release(&oit);
2480 /* Unlinked? Unhash dentry, so it is not picked up later by
2481 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2482 here to preserve get_cwd functionality on 2.6.
2484 if (!dentry->d_inode->i_nlink) {
2485 spin_lock(&dcache_lock);
2486 ll_drop_dentry(dentry);
2487 spin_unlock(&dcache_lock);
2490 ll_lookup_finish_locks(&oit, dentry);
2491 } else if (!ll_have_md_lock(dentry->d_inode,
2492 MDS_INODELOCK_UPDATE|MDS_INODELOCK_LOOKUP)) {
2493 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2495 obd_valid valid = OBD_MD_FLGETATTR;
/* Regular files: also request striping EA so size can be validated. */
2498 if (S_ISREG(inode->i_mode)) {
2499 rc = ll_get_max_mdsize(sbi, &ealen);
2502 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2504 ll_inode2fid(&fid, inode);
2505 rc = mdc_getattr(sbi->ll_mdc_exp, &fid, valid, ealen, &req);
2507 rc = ll_inode_revalidate_fini(inode, rc);
2511 rc = ll_prep_inode(sbi->ll_osc_exp, &inode, req, REPLY_REC_OFF,
2517 /* if object not yet allocated, don't validate size */
2518 if (ll_i2info(inode)->lli_smd == NULL)
2521 /* ll_glimpse_size will prefer locally cached writes if they extend
2523 rc = ll_glimpse_size(inode, 0);
2526 ptlrpc_req_finished(req);
/* getattr with an explicit intent (2.6 kernels): revalidate via
 * ll_inode_revalidate_it(), then fill *stat from the (now fresh) inode,
 * taking the inode size lock around size/blocks for consistency. */
2530 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
2531 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2532 struct lookup_intent *it, struct kstat *stat)
2534 struct inode *inode = de->d_inode;
2537 res = ll_inode_revalidate_it(de, it);
2538 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETATTR, 1);
2543 stat->dev = inode->i_sb->s_dev;
2544 stat->ino = inode->i_ino;
2545 stat->mode = inode->i_mode;
2546 stat->nlink = inode->i_nlink;
2547 stat->uid = inode->i_uid;
2548 stat->gid = inode->i_gid;
2549 stat->rdev = kdev_t_to_nr(inode->i_rdev);
2550 stat->atime = inode->i_atime;
2551 stat->mtime = inode->i_mtime;
2552 stat->ctime = inode->i_ctime;
2553 #ifdef HAVE_INODE_BLKSIZE
2554 stat->blksize = inode->i_blksize;
/* Older/newer kernels derive blksize from i_blkbits instead. */
2556 stat->blksize = 1<<inode->i_blkbits;
/* size/blocks read together under the size lock. */
2559 ll_inode_size_lock(inode, 0);
2560 stat->size = i_size_read(inode);
2561 stat->blocks = inode->i_blocks;
2562 ll_inode_size_unlock(inode, 0);
/* VFS ->getattr() entry point: delegates to ll_getattr_it() with a
 * plain IT_GETATTR intent. */
int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
        struct lookup_intent it = { .it_op = IT_GETATTR };

        return ll_getattr_it(mnt, de, &it, stat);
/*
 * Evaluate the POSIX ACL cached on the Lustre inode info against @mask.
 * Effective only when CONFIG_FS_POSIX_ACL is enabled; the #else branch
 * (presumably returning an "ACLs unsupported" code) is not visible in
 * this extract.
 */
int lustre_check_acl(struct inode *inode, int mask)
#ifdef CONFIG_FS_POSIX_ACL
        struct ll_inode_info *lli = ll_i2info(inode);
        struct posix_acl *acl;

        /* Duplicate the ACL under lli_lock so it cannot be replaced or
         * freed while we evaluate it. */
        spin_lock(&lli->lli_lock);
        acl = posix_acl_dup(lli->lli_posix_acl);
        spin_unlock(&lli->lli_lock);

        /* NOTE(review): the NULL-acl check between dup and use is not
         * visible in this view — confirm acl is checked before this. */
        rc = posix_acl_permission(inode, acl, mask);
        posix_acl_release(acl);
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10))
/* ->permission() for kernels >= 2.6.10: let the kernel's
 * generic_permission() do the mode-bit checks, supplying
 * lustre_check_acl as the ACL callback. */
int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
               inode->i_ino, inode->i_generation, inode, mask);

        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
        return generic_permission(inode, mask, lustre_check_acl);
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
/*
 * ->permission() for kernels without an ACL-aware generic_permission():
 * an open-coded owner/group/other mode-bit check with ACL and
 * capability fallbacks.  The two prototypes below are version-selected
 * alternatives (the #else between them is not visible in this extract),
 * and several branch bodies/returns are likewise elided.
 */
int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
int ll_inode_permission(struct inode *inode, int mask)
        int mode = inode->i_mode;

        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
               inode->i_ino, inode->i_generation, inode, mask);
        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);

        /* Refuse writes on a read-only fs for regular files, dirs and
         * symlinks, and writes to immutable inodes. */
        if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
            (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
        if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
        /* Owner branch (body partially elided in this view). */
        if (current->fsuid == inode->i_uid) {
                /* If the group bits do not grant @mask, consult the
                 * POSIX ACL before giving up. */
                if (((mode >> 3) & mask & S_IRWXO) != mask)
                rc = lustre_check_acl(inode, mask);
                goto check_capabilities;
        if (in_group_p(inode->i_gid))
        /* "Other" bits satisfy the request outright. */
        if ((mode & mask & S_IRWXO) == mask)
        /* Capability overrides: DAC_OVERRIDE grants everything except
         * exec of a non-executable regular file; DAC_READ_SEARCH grants
         * reads and directory searches. */
        if (!(mask & MAY_EXEC) ||
            (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
                if (capable(CAP_DAC_OVERRIDE))
        if (capable(CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) ||
            (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))))
/* Default file operations, used with "-o localflock": no .flock/.lock
 * methods, so flock locks are only locally consistent (handled by the
 * kernel's in-memory lock code, not cluster-wide). */
struct file_operations ll_file_operations = {
        .read           = ll_file_read,
        .write          = ll_file_write,
        .ioctl          = ll_file_ioctl,
        .open           = ll_file_open,
        .release        = ll_file_release,
        .mmap           = ll_file_mmap,
        .llseek         = ll_file_seek,
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
        .sendfile       = ll_file_sendfile,
/* File operations used with "-o flock": .flock/.lock route lock
 * requests through ll_file_flock for cluster-coherent locking. */
struct file_operations ll_file_operations_flock = {
        .read           = ll_file_read,
        .write          = ll_file_write,
        .ioctl          = ll_file_ioctl,
        .open           = ll_file_open,
        .release        = ll_file_release,
        .mmap           = ll_file_mmap,
        .llseek         = ll_file_seek,
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
        .sendfile       = ll_file_sendfile,
#ifdef HAVE_F_OP_FLOCK
        .flock          = ll_file_flock,
        /* kernels without f_op->flock fall back to .lock
         * (the #else separating these entries is not in this view) */
        .lock           = ll_file_flock
/* These are for -o noflock - to return ENOSYS on flock calls */
struct file_operations ll_file_operations_noflock = {
        .read           = ll_file_read,
        .write          = ll_file_write,
        .ioctl          = ll_file_ioctl,
        .open           = ll_file_open,
        .release        = ll_file_release,
        .mmap           = ll_file_mmap,
        .llseek         = ll_file_seek,
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
        .sendfile       = ll_file_sendfile,
#ifdef HAVE_F_OP_FLOCK
        .flock          = ll_file_noflock,
        /* kernels without f_op->flock fall back to .lock
         * (the #else separating these entries is not in this view) */
        .lock           = ll_file_noflock
/* Inode operations for regular files: attribute get/set, truncate,
 * permission checks and extended attributes. */
struct inode_operations ll_file_inode_operations = {
#ifdef HAVE_VFS_INTENT_PATCHES
        /* intent-patched kernels take the raw setattr path */
        .setattr_raw    = ll_setattr_raw,
        .setattr        = ll_setattr,
        .truncate       = ll_truncate,
#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
        .getattr        = ll_getattr,
        /* 2.4 kernels use revalidate_it instead of getattr
         * (the #else separating these entries is not in this view) */
        .revalidate_it  = ll_inode_revalidate_it,
        .permission     = ll_inode_permission,
        .setxattr       = ll_setxattr,
        .getxattr       = ll_getxattr,
        .listxattr      = ll_listxattr,
        .removexattr    = ll_removexattr,
/* dynamic ioctl number support routines */

/* Registry of dynamically registered ioctl handlers: a list of
 * llioc_data entries protected by an rw-semaphore (read side for
 * dispatch, write side for register/unregister). */
static struct llioc_ctl_data {
        struct rw_semaphore     ioc_sem;
        struct list_head        ioc_head;
        __RWSEM_INITIALIZER(llioc.ioc_sem),
        CFS_LIST_HEAD_INIT(llioc.ioc_head)

        /* One registered handler: callback plus the ioctl command
         * numbers it services, stored in a trailing variable-length
         * array (iocd_cmd). */
        struct list_head        iocd_list;
        unsigned int            iocd_size;      /* total allocation size */
        llioc_callback_t        iocd_cb;        /* handler callback */
        unsigned int            iocd_count;     /* entries in iocd_cmd[] */
        unsigned int            iocd_cmd[0];    /* command numbers */
/*
 * Register a dynamic ioctl handler: @cb will be invoked for any of the
 * @count ioctl command numbers in @cmd.  Returns an opaque cookie to be
 * passed to ll_iocontrol_unregister(), or NULL on bad arguments or
 * allocation failure (the early-return lines are not visible in this
 * extract).
 */
void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
        struct llioc_data *in_data = NULL;

        /* Reject a missing callback/command list and out-of-range counts. */
        if (cb == NULL || cmd == NULL ||
            count > LLIOC_MAX_CMD || count < 0)

        /* Allocation includes the trailing iocd_cmd[] array. */
        size = sizeof(*in_data) + count * sizeof(unsigned int);
        OBD_ALLOC(in_data, size);
        if (in_data == NULL)

        memset(in_data, 0, sizeof(*in_data));
        in_data->iocd_size = size;
        in_data->iocd_cb = cb;
        in_data->iocd_count = count;
        memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);

        /* Publish the new entry under the write lock. */
        down_write(&llioc.ioc_sem);
        list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
        up_write(&llioc.ioc_sem);
/*
 * Remove and free a handler previously registered with
 * ll_iocontrol_register().  @magic is the cookie that function
 * returned; if no matching entry is found, only a warning is logged.
 * NOTE(review): the comparison of each entry against @magic and the
 * early return after freeing are not visible in this extract.
 */
void ll_iocontrol_unregister(void *magic)
        struct llioc_data *tmp;

        down_write(&llioc.ioc_sem);
        list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
                /* Remember the size before unlinking so OBD_FREE gets
                 * the full allocation length. */
                unsigned int size = tmp->iocd_size;

                list_del(&tmp->iocd_list);
                /* Drop the semaphore before freeing; the matched entry
                 * is already unlinked so this is safe. */
                up_write(&llioc.ioc_sem);

                OBD_FREE(tmp, size);

        up_write(&llioc.ioc_sem);

        CWARN("didn't find iocontrol register block with magic: %p\n", magic);
EXPORT_SYMBOL(ll_iocontrol_register);
EXPORT_SYMBOL(ll_iocontrol_unregister);
2803 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
2804 unsigned int cmd, unsigned long arg, int *rcp)
2806 enum llioc_iter ret = LLIOC_CONT;
2807 struct llioc_data *data;
2808 int rc = -EINVAL, i;
2810 down_read(&llioc.ioc_sem);
2811 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
2812 for (i = 0; i < data->iocd_count; i++) {
2813 if (cmd != data->iocd_cmd[i])
2816 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
2820 if (ret == LLIOC_STOP)
2823 up_read(&llioc.ioc_sem);