1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (c) 2002, 2003 Cluster File Systems, Inc.
5 * Author: Peter Braam <braam@clusterfs.com>
6 * Author: Phil Schwan <phil@clusterfs.com>
7 * Author: Andreas Dilger <adilger@clusterfs.com>
9 * This file is part of Lustre, http://www.lustre.org.
11 * Lustre is free software; you can redistribute it and/or
12 * modify it under the terms of version 2 of the GNU General Public
13 * License as published by the Free Software Foundation.
15 * Lustre is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with Lustre; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 #define DEBUG_SUBSYSTEM S_LLITE
26 #include <lustre_dlm.h>
27 #include <lustre_lite.h>
28 #include <linux/pagemap.h>
29 #include <linux/file.h>
30 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
31 #include <linux/lustre_compat25.h>
33 #include "llite_internal.h"
35 /* also used by llite/special.c:ll_special_open() */
/* Allocate a per-open-file ll_file_data from the dedicated slab cache.
 * NOTE(review): listing is gap-sampled (embedded line numbers skip), so
 * the NULL check and the RETURN of 'fd' are not visible in this view. */
36 struct ll_file_data *ll_file_data_get(void)
38 struct ll_file_data *fd;
40 OBD_SLAB_ALLOC_PTR(fd, ll_file_data_slab);
/* Release a ll_file_data back to its slab cache (counterpart of
 * ll_file_data_get). NOTE(review): a NULL guard, if any, is in the
 * elided lines of this gap-sampled listing. */
44 static void ll_file_data_put(struct ll_file_data *fd)
47 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/* Close an MDS open handle for @inode: build an obdo describing the inode,
 * send mdc_close(), destroy OST objects if the close reply says so, and
 * clear open replay data on @och. NOTE(review): error paths, the forced-
 * umount check and the obdo allocation are partially elided in this
 * gap-sampled listing — confirm against the full source. */
50 static int ll_close_inode_openhandle(struct inode *inode,
51 struct obd_client_handle *och)
53 struct ptlrpc_request *req = NULL;
54 struct obd_device *obd;
59 obd = class_exp2obd(ll_i2mdcexp(inode));
61 CERROR("Invalid MDC connection handle "LPX64"\n",
62 ll_i2mdcexp(inode)->exp_handle.h_cookie);
67 * here we check if this is forced umount. If so this is called on
68 * canceling "open lock" and we do not call mdc_close() in this case, as
69 * it will not be successful, as import is already deactivated.
76 RETURN(-ENOMEM); // XXX We leak openhandle and request here.
78 oa->o_id = inode->i_ino;
79 oa->o_valid = OBD_MD_FLID;
80 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLMODE |
81 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
82 OBD_MD_FLATIME | OBD_MD_FLMTIME |
/* Tell the MDS there may be uncommitted OST writes for this inode. */
84 if (ll_is_inode_dirty(inode)) {
85 oa->o_flags = MDS_BFLAG_UNCOMMITTED_WRITES;
86 oa->o_valid |= OBD_MD_FLFLAGS;
89 rc = mdc_close(ll_i2mdcexp(inode), oa, och, &req);
91 /* We are the last writer, so the MDS has instructed us to get
92 * the file size and any write cookies, then close again. */
93 ll_queue_done_writing(inode);
96 CERROR("inode %lu mdc close failed: rc = %d\n",
103 rc = ll_objects_destroy(req, inode);
105 CERROR("inode %lu ll_objects destroy: rc = %d\n",
109 ptlrpc_req_finished(req); /* This is close request */
112 mdc_clear_open_replay_data(och);
/* Drop the cached MDS open handle of the kind selected by @flags
 * (FMODE_WRITE / FMODE_EXEC / FMODE_READ) if no users remain, closing it
 * on the MDS via ll_close_inode_openhandle(). Serialized by lli_och_sem.
 * NOTE(review): gap-sampled listing — the swap of *och_p to a local and
 * some early-return paths are elided. */
117 int ll_mdc_real_close(struct inode *inode, int flags)
119 struct ll_inode_info *lli = ll_i2info(inode);
121 struct obd_client_handle **och_p;
122 struct obd_client_handle *och;
/* Pick the open-handle slot and use count matching the open mode. */
127 if (flags & FMODE_WRITE) {
128 och_p = &lli->lli_mds_write_och;
129 och_usecount = &lli->lli_open_fd_write_count;
130 } else if (flags & FMODE_EXEC) {
131 och_p = &lli->lli_mds_exec_och;
132 och_usecount = &lli->lli_open_fd_exec_count;
134 LASSERT(flags & FMODE_READ);
135 och_p = &lli->lli_mds_read_och;
136 och_usecount = &lli->lli_open_fd_read_count;
139 down(&lli->lli_och_sem);
140 if (*och_usecount) { /* There are still users of this handle, so
142 up(&lli->lli_och_sem);
147 up(&lli->lli_och_sem);
149 if (och) { /* There might be a race and somebody have freed this och
151 rc = ll_close_inode_openhandle(inode, och);
/* Poison the cookie so stale users of this handle are detectable. */
152 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
153 OBD_FREE(och, sizeof *och);
/* Per-file-descriptor close: drop any group lock, decrement the open-mode
 * use count, and if we no longer hold an OPEN DLM lock on the inode, do
 * the real MDS close. Always frees the ll_file_data.
 * NOTE(review): gap-sampled listing — lockmode selection and some branch
 * bodies are elided; verify against the full source. */
159 int ll_mdc_close(struct obd_export *mdc_exp, struct inode *inode,
162 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
163 struct ll_inode_info *lli = ll_i2info(inode);
167 /* clear group lock, if present */
168 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
169 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
170 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
171 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP,
175 /* Let's see if we have good enough OPEN lock on the file and if
176 we can skip talking to MDS */
177 if (file->f_dentry->d_inode) { /* Can this ever be false? */
179 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
180 struct lustre_handle lockh;
181 struct inode *inode = file->f_dentry->d_inode;
182 struct ldlm_res_id file_res_id = {.name={inode->i_ino,
183 inode->i_generation}};
184 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
/* Drop our reference on the open handle of the mode we opened with. */
186 down(&lli->lli_och_sem);
187 if (fd->fd_omode & FMODE_WRITE) {
189 LASSERT(lli->lli_open_fd_write_count);
190 lli->lli_open_fd_write_count--;
191 } else if (fd->fd_omode & FMODE_EXEC) {
193 LASSERT(lli->lli_open_fd_exec_count);
194 lli->lli_open_fd_exec_count--;
197 LASSERT(lli->lli_open_fd_read_count);
198 lli->lli_open_fd_read_count--;
200 up(&lli->lli_och_sem);
/* No matching OPEN ibits lock cached -> must close on the MDS now. */
202 if (!ldlm_lock_match(mdc_exp->exp_obd->obd_namespace, flags,
203 &file_res_id, LDLM_IBITS, &policy,lockmode,
205 rc = ll_mdc_real_close(file->f_dentry->d_inode,
209 CERROR("Releasing a file %p with negative dentry %p. Name %s",
210 file, file->f_dentry, file->f_dentry->d_name.name);
213 LUSTRE_FPRIVATE(file) = NULL;
214 ll_file_data_put(fd);
219 int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm);
221 /* While this returns an error code, fput() the caller does not, so we need
222 * to make every effort to clean up all of our state here. Also, applications
223 * rarely check close errors and even if an error is returned they will not
224 * re-try the close call.
/* VFS ->release() hook: stop statahead if this fd owned it, short-circuit
 * for the root dentry, propagate async write errors, then do the MDC
 * close. NOTE(review): gap-sampled listing — some early returns elided. */
226 int ll_file_release(struct inode *inode, struct file *file)
228 struct ll_file_data *fd;
229 struct ll_sb_info *sbi = ll_i2sbi(inode);
230 struct ll_inode_info *lli = ll_i2info(inode);
231 struct lov_stripe_md *lsm = lli->lli_smd;
235 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
236 inode->i_generation, inode);
/* Don't count release of the root dentry in /proc stats. */
239 if (inode->i_sb->s_root != file->f_dentry)
240 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
241 fd = LUSTRE_FPRIVATE(file);
245 * The last ref on @file, maybe not the the owner pid of statahead.
246 * Different processes can open the same dir, "ll_opendir_key" means:
247 * it is me that should stop the statahead thread.
249 if (lli->lli_opendir_key == fd)
250 ll_stop_statahead(inode, fd);
/* Root of the mount has no MDS open handle to close; just free fd. */
252 if (inode->i_sb->s_root == file->f_dentry) {
253 LUSTRE_FPRIVATE(file) = NULL;
254 ll_file_data_put(fd);
259 lov_test_and_clear_async_rc(lsm);
260 lli->lli_async_rc = 0;
262 rc = ll_mdc_close(sbi->ll_mdc_exp, inode, file);
/* Send an IT_OPEN intent to the MDS for @file (used e.g. by NFSD and when
 * a cached open handle vanished), optionally requesting an OPEN lock, and
 * update the inode from the reply. NOTE(review): gap-sampled listing —
 * the out/exit label structure and some error branches are elided. */
266 static int ll_intent_file_open(struct file *file, void *lmm,
267 int lmmsize, struct lookup_intent *itp)
269 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
270 struct mdc_op_data data;
271 struct dentry *parent = file->f_dentry->d_parent;
272 const char *name = file->f_dentry->d_name.name;
273 const int len = file->f_dentry->d_name.len;
274 struct inode *inode = file->f_dentry->d_inode;
275 struct ptlrpc_request *req;
282 ll_prepare_mdc_op_data(&data, parent->d_inode, inode,
283 name, len, O_RDWR, NULL);
285 /* Usually we come here only for NFSD, and we want open lock.
286 But we can also get here with pre 2.6.15 patchless kernels, and in
287 that case that lock is also ok */
288 /* We can also get here if there was cached open handle in revalidate_it
289 * but it disappeared while we were getting from there to ll_file_open.
290 * But this means this file was closed and immediatelly opened which
291 * makes a good candidate for using OPEN lock */
292 /* If lmmsize & lmm are not 0, we are just setting stripe info
293 * parameters. No need for the open lock */
294 if (!lmm && !lmmsize)
295 itp->it_flags |= MDS_OPEN_LOCK;
297 rc = mdc_intent_lock(sbi->ll_mdc_exp, &data, lmm, lmmsize, itp,
298 0 /*unused */, &req, ll_mdc_blocking_ast, 0);
300 /* reason for keep own exit path - don`t flood log
301 * with messages with -ESTALE errors.
/* Open was granted but errored server-side: release the handle we got. */
303 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
304 it_open_error(DISP_OPEN_OPEN, itp))
306 ll_release_openhandle(file->f_dentry, itp);
310 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
311 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
312 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
316 if (itp->d.lustre.it_lock_mode)
317 mdc_set_lock_data(&itp->d.lustre.it_lock_handle,
320 rc = ll_prep_inode(sbi->ll_osc_exp, &file->f_dentry->d_inode,
321 req, DLM_REPLY_REC_OFF, NULL);
323 ptlrpc_req_finished(itp->d.lustre.it_data);
326 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
327 ll_intent_drop_lock(itp);
/* Populate @och from the MDS open reply carried in @it: copy the file
 * handle, set the magic, record the I/O epoch on the inode info, and
 * register the open for replay. */
333 static void ll_och_fill(struct ll_inode_info *lli, struct lookup_intent *it,
334 struct obd_client_handle *och)
336 struct ptlrpc_request *req = it->d.lustre.it_data;
337 struct mds_body *body;
341 body = lustre_msg_buf(req->rq_repmsg, DLM_REPLY_REC_OFF, sizeof(*body));
342 LASSERT(body != NULL); /* reply already checked out */
343 /* and swabbed in mdc_enqueue */
344 LASSERT(lustre_rep_swabbed(req, DLM_REPLY_REC_OFF));
346 memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
347 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
348 lli->lli_io_epoch = body->io_epoch;
/* Needed so the open can be replayed if the MDS connection is lost. */
350 mdc_set_open_replay_data(och, it->d.lustre.it_data);
/* Attach @fd as the file's private data and initialize readahead state;
 * when @och is given, fill it from the intent's open reply first.
 * NOTE(review): gap-sampled listing — the NULL-och branch and RETURN are
 * elided from this view. */
353 int ll_local_open(struct file *file, struct lookup_intent *it,
354 struct ll_file_data *fd, struct obd_client_handle *och)
358 LASSERT(!LUSTRE_FPRIVATE(file));
363 ll_och_fill(ll_i2info(file->f_dentry->d_inode), it, och);
364 LUSTRE_FPRIVATE(file) = fd;
365 ll_readahead_init(file->f_dentry->d_inode, &fd->fd_ras);
366 fd->fd_omode = it->it_flags;
371 /* Open a file, and (for the very first open) create objects on the OSTs at
372 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
373 * creation or open until ll_lov_setstripe() ioctl is called. We grab
374 * lli_open_sem to ensure no other process will create objects, send the
375 * stripe MD to the MDS, or try to destroy the objects if that fails.
377 * If we already have the stripe MD locally then we don't request it in
378 * mdc_open(), by passing a lmm_size = 0.
380 * It is up to the application to ensure no other processes open this file
381 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
382 * used. We might be able to avoid races of that sort by getting lli_open_sem
383 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
384 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/* NOTE(review): gap-sampled listing — many branch bodies, GOTO targets
 * and the och_usecount declaration are elided; comments below describe
 * only what is visible. */
386 int ll_file_open(struct inode *inode, struct file *file)
388 struct ll_inode_info *lli = ll_i2info(inode);
389 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
390 .it_flags = file->f_flags };
391 struct lov_stripe_md *lsm;
392 struct ptlrpc_request *req = NULL;
393 struct obd_client_handle **och_p;
395 struct ll_file_data *fd;
396 int rc = 0, opendir_set = 0;
399 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
400 inode->i_generation, inode, file->f_flags);
402 #ifdef HAVE_VFS_INTENT_PATCHES
405 it = file->private_data; /* XXX: compat macro */
406 file->private_data = NULL; /* prevent ll_local_open assertion */
409 fd = ll_file_data_get();
/* Directory open: decide whether this fd becomes the statahead owner. */
413 if (S_ISDIR(inode->i_mode)) {
414 spin_lock(&lli->lli_lock);
416 * "lli->lli_opendir_pid != 0" means someone has set it.
417 * "lli->lli_sai != NULL" means the previous statahead has not
420 if (lli->lli_opendir_pid == 0 && lli->lli_sai == NULL) {
422 lli->lli_opendir_pid = cfs_curproc_pid();
423 lli->lli_opendir_key = fd;
424 } else if (unlikely(lli->lli_opendir_pid == cfs_curproc_pid())) {
425 /* Two cases for this:
426 * (1) The same process open such directory many times.
427 * (2) The old process opened the directory, and exited
428 * before its children processes. Then new process
429 * with the same pid opens such directory before the
430 * old process's children processes exit.
431 * Change the owner to the latest one.
434 lli->lli_opendir_key = fd;
436 spin_unlock(&lli->lli_lock);
/* Root dentry: nothing to open on the MDS, just stash the fd. */
439 if (inode->i_sb->s_root == file->f_dentry) {
440 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent from the VFS: build our own open intent in 'oit'. */
444 if (!it || !it->d.lustre.it_disposition) {
445 /* Convert f_flags into access mode. We cannot use file->f_mode,
446 * because everything but O_ACCMODE mask was stripped from it */
447 if ((oit.it_flags + 1) & O_ACCMODE)
449 if (file->f_flags & O_TRUNC)
450 oit.it_flags |= FMODE_WRITE;
452 /* kernel only call f_op->open in dentry_open. filp_open calls
453 * dentry_open after call to open_namei that checks permissions.
454 * Only nfsd_open call dentry_open directly without checking
455 * permissions and because of that this code below is safe. */
456 if (oit.it_flags & FMODE_WRITE)
457 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
459 /* We do not want O_EXCL here, presumably we opened the file
460 * already? XXX - NFS implications? */
461 oit.it_flags &= ~O_EXCL;
467 /* Let's see if we have file open on MDS already. */
468 if (it->it_flags & FMODE_WRITE) {
469 och_p = &lli->lli_mds_write_och;
470 och_usecount = &lli->lli_open_fd_write_count;
471 } else if (it->it_flags & FMODE_EXEC) {
472 och_p = &lli->lli_mds_exec_och;
473 och_usecount = &lli->lli_open_fd_exec_count;
475 och_p = &lli->lli_mds_read_och;
476 och_usecount = &lli->lli_open_fd_read_count;
479 LASSERTF(it->it_flags != 0, "it %p dist %d \n", it,
480 it->d.lustre.it_disposition);
482 down(&lli->lli_och_sem);
483 if (*och_p) { /* Open handle is present */
484 if (it_disposition(it, DISP_OPEN_OPEN)) {
485 /* Well, there's extra open request that we do not need,
486 let's close it somehow. This will decref request. */
487 rc = it_open_error(DISP_OPEN_OPEN, it);
489 up(&lli->lli_och_sem);
490 ll_file_data_put(fd);
491 GOTO(out_openerr, rc);
493 ll_release_openhandle(file->f_dentry, it);
494 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats,
/* Reuse the cached handle: local open only, no new MDS request. */
499 rc = ll_local_open(file, it, fd, NULL);
501 LASSERTF(rc == 0, "rc = %d\n", rc);
503 LASSERT(*och_usecount == 0);
504 if (!it->d.lustre.it_disposition) {
505 /* We cannot just request lock handle now, new ELC code
506 means that one of other OPEN locks for this file
507 could be cancelled, and since blocking ast handler
508 would attempt to grab och_sem as well, that would
509 result in a deadlock */
510 up(&lli->lli_och_sem);
511 rc = ll_intent_file_open(file, NULL, 0, it);
513 ll_file_data_put(fd);
514 GOTO(out_openerr, rc);
517 mdc_set_lock_data(&it->d.lustre.it_lock_handle,
518 file->f_dentry->d_inode);
522 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
524 ll_file_data_put(fd);
525 GOTO(out_och_free, rc = -ENOMEM);
528 req = it->d.lustre.it_data;
530 /* mdc_intent_lock() didn't get a request ref if there was an
531 * open error, so don't do cleanup on the request here
533 /* XXX (green): Should not we bail out on any error here, not
534 * just open error? */
535 rc = it_open_error(DISP_OPEN_OPEN, it);
537 ll_file_data_put(fd);
538 GOTO(out_och_free, rc);
541 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
542 rc = ll_local_open(file, it, fd, *och_p);
543 LASSERTF(rc == 0, "rc = %d\n", rc);
545 up(&lli->lli_och_sem);
547 /* Must do this outside lli_och_sem lock to prevent deadlock where
548 different kind of OPEN lock for this same inode gets cancelled
549 by ldlm_cancel_lru */
550 if (!S_ISREG(inode->i_mode))
555 if (file->f_flags & O_LOV_DELAY_CREATE ||
556 !(file->f_mode & FMODE_WRITE)) {
557 CDEBUG(D_INODE, "object creation was delayed\n");
561 file->f_flags &= ~O_LOV_DELAY_CREATE;
564 ptlrpc_req_finished(req);
566 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
568 ll_open_complete(inode);
/* Error path: free the handle we allocated; OBD_FREE poisons memory. */
572 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
573 *och_p = NULL; /* OBD_FREE writes some magic there */
576 up(&lli->lli_och_sem);
/* Undo statahead ownership claimed earlier on failure. */
579 lli->lli_opendir_key = NULL;
580 lli->lli_opendir_pid = 0;
581 } else if (unlikely(opendir_set == 2)) {
582 ll_stop_statahead(inode, fd);
588 /* Fills the obdo with the attributes for the inode defined by lsm */
/* Issue an async OST getattr for @lsm via a ptlrpc set and wait for it;
 * on return the obdo's o_valid is masked to the attributes the OSTs can
 * legitimately report. NOTE(review): oinfo setup and error checks are
 * partially elided in this gap-sampled listing. */
589 int ll_lsm_getattr(struct obd_export *exp, struct lov_stripe_md *lsm,
592 struct ptlrpc_request_set *set;
593 struct obd_info oinfo = { { { 0 } } };
597 LASSERT(lsm != NULL);
599 memset(oa, 0, sizeof *oa);
602 oa->o_id = lsm->lsm_object_id;
603 oa->o_mode = S_IFREG;
604 oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLSIZE |
605 OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ | OBD_MD_FLMTIME |
608 set = ptlrpc_prep_set();
612 rc = obd_getattr_async(exp, &oinfo, set);
614 rc = ptlrpc_set_wait(set);
615 ptlrpc_set_destroy(set);
/* Only keep attribute bits that are authoritative from the OSTs. */
620 oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ | OBD_MD_FLMTIME |
621 OBD_MD_FLCTIME | OBD_MD_FLSIZE);
/* Strip setuid (and setgid when group-exec) bits from @inode on write,
 * unless the caller holds CAP_FSETID — standard suid-clearing semantics. */
625 static inline void ll_remove_suid(struct inode *inode)
629 /* set S_IGID if S_IXGRP is set, and always set S_ISUID */
630 mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
632 /* was any of the uid bits set? */
633 mode &= inode->i_mode;
634 if (mode && !capable(CAP_FSETID)) {
635 inode->i_mode &= ~mode;
636 // XXX careful here - we cannot change the size
/* Map an extent DLM @lock back to the stripe index it covers within the
 * inode's LOV stripe metadata, via obd_get_info("lock_to_stripe"), and
 * sanity-check that the lock's resource matches that stripe's object. */
640 static int ll_lock_to_stripe_offset(struct inode *inode, struct ldlm_lock *lock)
642 struct ll_inode_info *lli = ll_i2info(inode);
643 struct lov_stripe_md *lsm = lli->lli_smd;
644 struct obd_export *exp = ll_i2obdexp(inode);
647 struct ldlm_lock *lock;
648 struct lov_stripe_md *lsm;
649 } key = { .name = "lock_to_stripe", .lock = lock, .lsm = lsm };
650 __u32 stripe, vallen = sizeof(stripe);
/* Single-striped file: the answer is trivially stripe 0. */
654 if (lsm->lsm_stripe_count == 1)
655 GOTO(check, stripe = 0);
657 /* get our offset in the lov */
658 rc = obd_get_info(exp, sizeof(key), &key, &vallen, &stripe);
660 CERROR("obd_get_info: rc = %d\n", rc);
663 LASSERT(stripe < lsm->lsm_stripe_count);
/* Cross-check the lock resource against the stripe's object id/group. */
666 if (lsm->lsm_oinfo[stripe]->loi_id != lock->l_resource->lr_name.name[0]||
667 lsm->lsm_oinfo[stripe]->loi_gr != lock->l_resource->lr_name.name[1]){
668 LDLM_ERROR(lock, "resource doesn't match object "LPU64"/"LPU64,
669 lsm->lsm_oinfo[stripe]->loi_id,
670 lsm->lsm_oinfo[stripe]->loi_gr);
671 RETURN(-ELDLM_NO_LOCK_DATA);
677 /* Get extra page reference to ensure it is not going away */
/* DLM pin callback: @data is a struct page; take a page-cache reference
 * that ll_page_removal_cb() will later drop. */
678 void ll_pin_extent_cb(void *data)
680 struct page *page = data;
682 page_cache_get(page);
686 /* Flush the page from page cache for an extent as its canceled.
687 * Page to remove is delivered as @data.
689 * No one can dirty the extent until we've finished our work and they cannot
690 * enqueue another lock. The DLM protects us from ll_file_read/write here,
691 * but other kernel actors could have pages locked.
693 * If @discard is set, there is no need to write the page if it is dirty.
695 * Called with the DLM lock held. */
/* NOTE(review): gap-sampled listing — the lock_page, truncated-page early
 * exit and unlock paths are elided; drops the reference taken by
 * ll_pin_extent_cb() at the end. */
696 int ll_page_removal_cb(void *data, int discard)
699 struct page *page = data;
700 struct address_space *mapping;
704 /* We have page reference already from ll_pin_page */
707 /* Already truncated by somebody */
711 mapping = page->mapping;
/* Unmap any user mappings covering this page before removing it. */
713 ll_teardown_mmaps(mapping,
714 (__u64)page->index << PAGE_CACHE_SHIFT,
715 ((__u64)page->index<<PAGE_CACHE_SHIFT)|
717 LL_CDEBUG_PAGE(D_PAGE, page, "removing page\n");
718 if (!discard && PageWriteback(page))
719 wait_on_page_writeback(page);
/* Dirty page under lock cancel: write it out unless discarding. */
721 if (!discard && clear_page_dirty_for_io(page)) {
722 rc = ll_call_writepage(page->mapping->host, page);
723 /* either waiting for io to complete or reacquiring
724 * the lock that the failed writepage released */
726 wait_on_page_writeback(page);
728 CERROR("writepage inode %lu(%p) of page %p "
729 "failed: %d\n", mapping->host->i_ino,
730 mapping->host, page, rc);
731 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
/* Record the write error on the mapping so fsync() can see it. */
733 set_bit(AS_ENOSPC, &mapping->flags);
735 set_bit(AS_EIO, &mapping->flags);
737 mapping->gfp_mask |= AS_EIO_MASK;
741 if (page->mapping != NULL) {
742 struct ll_async_page *llap = llap_cast_private(page);
743 // checking again to account for writeback's lock_page()
744 LL_CDEBUG_PAGE(D_PAGE, page, "truncating\n");
746 ll_ra_accounting(llap, page->mapping);
747 ll_truncate_complete_page(page);
751 LASSERT(!PageWriteback(page));
753 page_cache_release(page);
/* Blocking/cancel AST for extent locks: when a lock is cancelled, shrink
 * the known-minimum-size (kms) of the affected stripe accordingly and
 * kick done-writing processing. NOTE(review): gap-sampled listing — the
 * flag dispatch, iput and RETURN paths are elided. */
758 int ll_extent_lock_cancel_cb(struct ldlm_lock *lock, struct ldlm_lock_desc *new,
759 void *data, int flag)
762 struct ll_inode_info *lli;
763 struct lov_stripe_md *lsm;
/* Guard against a garbage/near-NULL cbdata pointer. */
769 if ((unsigned long)data > 0 && (unsigned long)data < 0x1000) {
770 LDLM_ERROR(lock, "cancelling lock with bad data %p", data);
774 inode = ll_inode_from_lock(lock);
777 lli = ll_i2info(inode);
780 if (lli->lli_smd == NULL)
784 stripe = ll_lock_to_stripe_offset(inode, lock);
/* Recompute the stripe's kms with this lock excluded. */
788 lov_stripe_lock(lsm);
789 lock_res_and_lock(lock);
790 kms = ldlm_extent_shift_kms(lock,
791 lsm->lsm_oinfo[stripe]->loi_kms);
793 if (lsm->lsm_oinfo[stripe]->loi_kms != kms)
794 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
795 lsm->lsm_oinfo[stripe]->loi_kms, kms);
796 lsm->lsm_oinfo[stripe]->loi_kms = kms;
797 unlock_res_and_lock(lock);
798 lov_stripe_unlock(lsm);
799 ll_try_done_writing(inode);
/* Completion AST for client-side async extent enqueues: on grant, fold
 * the server-returned LVB size into the stripe's rss/kms, wake waiters,
 * and drop the enqueue reference. NOTE(review): gap-sampled listing —
 * lvb declaration and some returns are elided; blocked flags LBUG()
 * because async blocked locks are not expected here. */
808 int ll_async_completion_ast(struct ldlm_lock *lock, int flags, void *data)
810 /* XXX ALLOCATE - 160 bytes */
811 struct inode *inode = ll_inode_from_lock(lock);
812 struct ll_inode_info *lli = ll_i2info(inode);
813 struct lustre_handle lockh = { 0 };
818 if (flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED |
819 LDLM_FL_BLOCK_CONV)) {
820 LBUG(); /* not expecting any blocked async locks yet */
821 LDLM_DEBUG(lock, "client-side async enqueue returned a blocked "
823 ldlm_lock_dump(D_OTHER, lock, 0);
824 ldlm_reprocess_all(lock->l_resource);
828 LDLM_DEBUG(lock, "client-side async enqueue: granted/glimpsed");
830 stripe = ll_lock_to_stripe_offset(inode, lock);
/* The server sent an LVB: update this stripe's size bookkeeping. */
834 if (lock->l_lvb_len) {
835 struct lov_stripe_md *lsm = lli->lli_smd;
837 lvb = lock->l_lvb_data;
838 lsm->lsm_oinfo[stripe].loi_rss = lvb->lvb_size;
840 lock_res_and_lock(lock);
841 ll_inode_size_lock(inode, 1);
842 kms = MAX(lsm->lsm_oinfo[stripe].loi_kms, lvb->lvb_size);
843 kms = ldlm_extent_shift_kms(NULL, kms);
844 if (lsm->lsm_oinfo[stripe].loi_kms != kms)
845 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
846 lsm->lsm_oinfo[stripe].loi_kms, kms);
847 lsm->lsm_oinfo[stripe].loi_kms = kms;
848 ll_inode_size_unlock(inode, 1);
849 unlock_res_and_lock(lock);
854 wake_up(&lock->l_waitq);
/* Drop the PR reference taken at enqueue time. */
856 ldlm_lock2handle(lock, &lockh);
857 ldlm_lock_decref(&lockh, LCK_PR);
/* Glimpse AST: another client is asking for our view of the file size.
 * Pack an LVB with this stripe's kms and the inode timestamps into the
 * reply. NOTE(review): gap-sampled listing — NULL checks, iput and the
 * final RETURN are elided; -ELDLM_NO_LOCK_DATA races get an empty reply
 * instead of a logged error. */
862 static int ll_glimpse_callback(struct ldlm_lock *lock, void *reqp)
864 struct ptlrpc_request *req = reqp;
865 struct inode *inode = ll_inode_from_lock(lock);
866 struct ll_inode_info *lli;
867 struct lov_stripe_md *lsm;
870 int size[2] = { sizeof(struct ptlrpc_body), sizeof(*lvb) };
874 GOTO(out, rc = -ELDLM_NO_LOCK_DATA);
875 lli = ll_i2info(inode);
877 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
880 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
882 /* First, find out which stripe index this lock corresponds to. */
883 stripe = ll_lock_to_stripe_offset(inode, lock);
885 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
887 rc = lustre_pack_reply(req, 2, size, NULL);
891 lvb = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, sizeof(*lvb));
892 lvb->lvb_size = lli->lli_smd->lsm_oinfo[stripe]->loi_kms;
893 lvb->lvb_mtime = LTIME_S(inode->i_mtime);
894 lvb->lvb_atime = LTIME_S(inode->i_atime);
895 lvb->lvb_ctime = LTIME_S(inode->i_ctime);
897 LDLM_DEBUG(lock, "i_size: %llu -> stripe number %u -> kms "LPU64
898 " atime "LPU64", mtime "LPU64", ctime "LPU64,
899 i_size_read(inode), stripe, lvb->lvb_size, lvb->lvb_mtime,
900 lvb->lvb_atime, lvb->lvb_ctime);
905 /* These errors are normal races, so we don't want to fill the console
906 * with messages by calling ptlrpc_error() */
907 if (rc == -ELDLM_NO_LOCK_DATA)
908 lustre_pack_reply(req, 1, NULL, NULL);
/* ioctl-path glimpse: enqueue a PR intent-only (LDLM_FL_HAS_INTENT)
 * extent lock over [0, EOF] for @lsm, then merge the returned LVBs into
 * the caller-supplied stat buffer. NOTE(review): gap-sampled listing —
 * oinfo.oi_md assignment and the lvb declaration are elided. */
914 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
917 struct lustre_handle lockh = { 0 };
918 struct ldlm_enqueue_info einfo = { 0 };
919 struct obd_info oinfo = { { { 0 } } };
925 einfo.ei_type = LDLM_EXTENT;
926 einfo.ei_mode = LCK_PR;
927 einfo.ei_cb_bl = osc_extent_blocking_cb;
928 einfo.ei_cb_cp = ldlm_completion_ast;
929 einfo.ei_cb_gl = ll_glimpse_callback;
930 einfo.ei_cbdata = NULL;
932 oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
933 oinfo.oi_lockh = &lockh;
935 oinfo.oi_flags = LDLM_FL_HAS_INTENT;
937 rc = obd_enqueue_rqset(sbi->ll_osc_exp, &oinfo, &einfo);
941 CERROR("obd_enqueue returned rc %d, "
942 "returning -EIO\n", rc);
943 RETURN(rc > 0 ? -EIO : rc);
/* Merge per-stripe LVBs under the stripe lock into the stat result. */
946 lov_stripe_lock(lsm);
947 memset(&lvb, 0, sizeof(lvb));
948 obd_merge_lvb(sbi->ll_osc_exp, lsm, &lvb, 0);
949 st->st_size = lvb.lvb_size;
950 st->st_blocks = lvb.lvb_blocks;
951 st->st_mtime = lvb.lvb_mtime;
952 st->st_atime = lvb.lvb_atime;
953 st->st_ctime = lvb.lvb_ctime;
954 lov_stripe_unlock(lsm);
959 /* NB: obd_merge_lvb will prefer locally cached writes if they extend the
960 * file (because it prefers KMS over RSS when larger) */
/* Refresh i_size/i_blocks/timestamps for @inode by glimpsing all stripes
 * (intent-only PR enqueue over [0, EOF]) and merging the resulting LVBs
 * under the inode size lock. NOTE(review): gap-sampled listing — the
 * no-objects early return and lvb declaration are elided. */
961 int ll_glimpse_size(struct inode *inode, int ast_flags)
963 struct ll_inode_info *lli = ll_i2info(inode);
964 struct ll_sb_info *sbi = ll_i2sbi(inode);
965 struct lustre_handle lockh = { 0 };
966 struct ldlm_enqueue_info einfo = { 0 };
967 struct obd_info oinfo = { { { 0 } } };
972 CDEBUG(D_DLMTRACE, "Glimpsing inode %lu\n", inode->i_ino);
975 CDEBUG(D_DLMTRACE, "No objects for inode %lu\n", inode->i_ino);
979 /* NOTE: this looks like DLM lock request, but it may not be one. Due
980 * to LDLM_FL_HAS_INTENT flag, this is glimpse request, that
981 * won't revoke any conflicting DLM locks held. Instead,
982 * ll_glimpse_callback() will be called on each client
983 * holding a DLM lock against this file, and resulting size
984 * will be returned for each stripe. DLM lock on [0, EOF] is
985 * acquired only if there were no conflicting locks. */
986 einfo.ei_type = LDLM_EXTENT;
987 einfo.ei_mode = LCK_PR;
988 einfo.ei_cb_bl = osc_extent_blocking_cb;
989 einfo.ei_cb_cp = ldlm_completion_ast;
990 einfo.ei_cb_gl = ll_glimpse_callback;
991 einfo.ei_cbdata = inode;
993 oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
994 oinfo.oi_lockh = &lockh;
995 oinfo.oi_md = lli->lli_smd;
996 oinfo.oi_flags = ast_flags | LDLM_FL_HAS_INTENT;
998 rc = obd_enqueue_rqset(sbi->ll_osc_exp, &oinfo, &einfo);
1002 CERROR("obd_enqueue returned rc %d, returning -EIO\n", rc);
1003 RETURN(rc > 0 ? -EIO : rc);
/* Publish the merged size/times to the inode under the size lock. */
1006 ll_inode_size_lock(inode, 1);
1007 inode_init_lvb(inode, &lvb);
1008 rc = obd_merge_lvb(sbi->ll_osc_exp, lli->lli_smd, &lvb, 0);
1009 i_size_write(inode, lvb.lvb_size);
1010 inode->i_blocks = lvb.lvb_blocks;
1011 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1012 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1013 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1014 ll_inode_size_unlock(inode, 1);
1016 CDEBUG(D_DLMTRACE, "glimpse: size: %llu, blocks: %llu\n",
1017 i_size_read(inode), (long long)inode->i_blocks);
/* Take an extent DLM lock of @mode over *policy for @inode/@lsm and, on
 * success, refresh the inode's size/times from the merged LVBs. Skipped
 * entirely when locking is disabled (LL_FILE_IGNORE_LOCK / LL_SBI_NOLCK).
 * NOTE(review): gap-sampled listing — lvb declaration, oinfo.oi_md and
 * the error return after obd_enqueue are elided. */
1022 int ll_extent_lock(struct ll_file_data *fd, struct inode *inode,
1023 struct lov_stripe_md *lsm, int mode,
1024 ldlm_policy_data_t *policy, struct lustre_handle *lockh,
1027 struct ll_sb_info *sbi = ll_i2sbi(inode);
1029 struct ldlm_enqueue_info einfo = { 0 };
1030 struct obd_info oinfo = { { { 0 } } };
1034 LASSERT(!lustre_handle_is_used(lockh));
1035 LASSERT(lsm != NULL);
1037 /* don't drop the mmapped file to LRU */
1038 if (mapping_mapped(inode->i_mapping))
1039 ast_flags |= LDLM_FL_NO_LRU;
1041 /* XXX phil: can we do this? won't it screw the file size up? */
1042 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1043 (sbi->ll_flags & LL_SBI_NOLCK))
1046 CDEBUG(D_DLMTRACE, "Locking inode %lu, start "LPU64" end "LPU64"\n",
1047 inode->i_ino, policy->l_extent.start, policy->l_extent.end);
1049 einfo.ei_type = LDLM_EXTENT;
1050 einfo.ei_mode = mode;
1051 einfo.ei_cb_bl = osc_extent_blocking_cb;
1052 einfo.ei_cb_cp = ldlm_completion_ast;
1053 einfo.ei_cb_gl = ll_glimpse_callback;
1054 einfo.ei_cbdata = inode;
1056 oinfo.oi_policy = *policy;
1057 oinfo.oi_lockh = lockh;
1059 oinfo.oi_flags = ast_flags;
1061 rc = obd_enqueue(sbi->ll_osc_exp, &oinfo, &einfo, NULL);
/* The enqueue may have expanded the requested extent; report it back. */
1062 *policy = oinfo.oi_policy;
1066 ll_inode_size_lock(inode, 1);
1067 inode_init_lvb(inode, &lvb);
1068 obd_merge_lvb(sbi->ll_osc_exp, lsm, &lvb, 1);
1070 if (policy->l_extent.start == 0 &&
1071 policy->l_extent.end == OBD_OBJECT_EOF) {
1072 /* vmtruncate()->ll_truncate() first sets the i_size and then
1073 * the kms under both a DLM lock and the
1074 * ll_inode_size_lock(). If we don't get the
1075 * ll_inode_size_lock() here we can match the DLM lock and
1076 * reset i_size from the kms before the truncating path has
1077 * updated the kms. generic_file_write can then trust the
1078 * stale i_size when doing appending writes and effectively
1079 * cancel the result of the truncate. Getting the
1080 * ll_inode_size_lock() after the enqueue maintains the DLM
1081 * -> ll_inode_size_lock() acquiring order. */
1082 i_size_write(inode, lvb.lvb_size);
1083 CDEBUG(D_INODE, "inode=%lu, updating i_size %llu\n",
1084 inode->i_ino, i_size_read(inode));
1088 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1089 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1090 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1092 ll_inode_size_unlock(inode, 1);
/* Release an extent DLM lock previously taken by ll_extent_lock();
 * mirrors its no-lock short-circuit for IGNORE_LOCK / NOLCK modes. */
1097 int ll_extent_unlock(struct ll_file_data *fd, struct inode *inode,
1098 struct lov_stripe_md *lsm, int mode,
1099 struct lustre_handle *lockh)
1101 struct ll_sb_info *sbi = ll_i2sbi(inode);
1105 /* XXX phil: can we do this? won't it screw the file size up? */
1106 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1107 (sbi->ll_flags & LL_SBI_NOLCK))
1110 rc = obd_cancel(sbi->ll_osc_exp, lsm, mode, lockh);
/* Mark @inode as lock-contended and timestamp the event; checked by
 * ll_is_file_contended() to switch I/O to lockless/server-side locking. */
1115 static void ll_set_file_contended(struct inode *inode)
1117 struct ll_inode_info *lli = ll_i2info(inode);
1119 lli->lli_contention_time = cfs_time_current();
1120 set_bit(LLI_F_CONTENDED, &lli->lli_flags);
/* Clear the contended flag set by ll_set_file_contended(). */
1123 void ll_clear_file_contended(struct inode *inode)
1125 struct ll_inode_info *lli = ll_i2info(inode);
1127 clear_bit(LLI_F_CONTENDED, &lli->lli_flags);
/* Decide whether @file is currently contended: requires server SRVLOCK
 * support, honors LL_FILE_IGNORE_LOCK, and auto-expires the contended
 * state after sbi->ll_contention_time seconds. NOTE(review): the actual
 * return statements are in lines elided from this gap-sampled listing. */
1130 static int ll_is_file_contended(struct file *file)
1132 struct inode *inode = file->f_dentry->d_inode;
1133 struct ll_inode_info *lli = ll_i2info(inode);
1134 struct ll_sb_info *sbi = ll_i2sbi(inode);
1135 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1138 if (!(sbi->ll_lco.lco_flags & OBD_CONNECT_SRVLOCK)) {
1139 CDEBUG(D_INFO, "the server does not support SRVLOCK feature,"
1140 " osc connect flags = 0x"LPX64"\n",
1141 sbi->ll_lco.lco_flags);
1144 if (fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK))
1146 if (test_bit(LLI_F_CONTENDED, &lli->lli_flags)) {
1147 cfs_time_t cur_time = cfs_time_current();
1148 cfs_time_t retry_time;
1150 retry_time = cfs_time_add(
1151 lli->lli_contention_time,
1152 cfs_time_seconds(sbi->ll_contention_time));
/* Contention window elapsed: go back to normal client-side locking. */
1153 if (cfs_time_after(cur_time, retry_time)) {
1154 ll_clear_file_contended(inode);
/* Take a lock-tree extent lock covering [start, end] for an iovec I/O,
 * unless the file is contended (then the caller falls back to lockless
 * I/O). Appends always lock; -EUSERS from the enqueue marks the file
 * contended. Returns 1 via tree_locked when the tree lock is held.
 * NOTE(review): gap-sampled listing — node error handling and some
 * branches are elided. */
1162 static int ll_file_get_tree_lock_iov(struct ll_lock_tree *tree,
1163 struct file *file, const struct iovec *iov,
1164 unsigned long nr_segs,
1165 loff_t start, loff_t end, int rw)
1168 int tree_locked = 0;
1170 struct inode * inode = file->f_dentry->d_inode;
1172 append = (rw == WRITE) && (file->f_flags & O_APPEND);
1174 if (append || !ll_is_file_contended(file)) {
1175 struct ll_lock_tree_node *node;
/* Appends must block for the lock; others may be denied on contention. */
1178 ast_flags = append ? 0 : LDLM_FL_DENY_ON_CONTENTION;
1179 if (file->f_flags & O_NONBLOCK)
1180 ast_flags |= LDLM_FL_BLOCK_NOWAIT;
1181 node = ll_node_from_inode(inode, start, end,
1182 (rw == OBD_BRW_WRITE) ? LCK_PW : LCK_PR);
1187 tree->lt_fd = LUSTRE_FPRIVATE(file);
1188 rc = ll_tree_lock_iov(tree, node, iov, nr_segs, ast_flags);
1191 else if (rc == -EUSERS)
1192 ll_set_file_contended(inode);
1196 RETURN(tree_locked);
1201 /* XXX: exact copy from kernel code (__generic_file_aio_write_nolock from rhel4)
/* Sum iovec segment lengths with overflow detection, truncating nr_segs
 * at the first inaccessible segment (kernel __generic_file_aio_write_nolock
 * idiom). NOTE(review): the EINVAL/EFAULT returns and final count return
 * are in lines elided from this gap-sampled listing. */
1203 static size_t ll_file_get_iov_count(const struct iovec *iov,
1204 unsigned long *nr_segs)
1209 for (seg = 0; seg < *nr_segs; seg++) {
1210 const struct iovec *iv = &iov[seg];
1213 * If any segment has a negative length, or the cumulative
1214 * length ever wraps negative then return -EINVAL.
1216 count += iv->iov_len;
1217 if (unlikely((ssize_t)(count|iv->iov_len) < 0))
1219 if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len))
1224 count -= iv->iov_len; /* This segment is no good */
/* Build in @iov_copy a clipped copy of *iov_out that covers exactly @size
 * bytes starting @*offset into the current segment, updating the caller's
 * cursor state. NOTE(review): gap-sampled listing — the loop increment,
 * offset reset and final bookkeeping lines are elided; semantics inferred
 * from visible lines only. */
1230 static int iov_copy_update(unsigned long *nr_segs, const struct iovec **iov_out,
1231 unsigned long *nrsegs_copy,
1232 struct iovec *iov_copy, size_t *offset,
1236 const struct iovec *iov = *iov_out;
1237 for (i = 0; i < *nr_segs;
1239 const struct iovec *iv = &iov[i];
1240 struct iovec *ivc = &iov_copy[i];
/* First segment: skip the already-consumed prefix. */
1243 ivc->iov_len -= *offset;
1244 ivc->iov_base += *offset;
/* Clip the last segment so the copy totals exactly 'size' bytes. */
1246 if (ivc->iov_len >= size) {
1247 ivc->iov_len = size;
1254 size -= ivc->iov_len;
1258 *nrsegs_copy = i + 1;
/* Try to reuse a cached per-page ("short") lock for [start, end] via the
 * OSC layer; avoids a full DLM enqueue for small I/O.  Return value
 * comes from obd_reget_short_lock() — elided tail, semantics not fully
 * visible here. */
1263 static int ll_reget_short_lock(struct page *page, int rw,
1264 obd_off start, obd_off end,
1267 struct ll_async_page *llap;
1268 struct obd_export *exp;
1269 struct inode *inode = page->mapping->host;
1273 exp = ll_i2obdexp(inode);
/* the page must already carry Lustre async-page private data */
1277 llap = llap_cast_private(page);
1281 RETURN(obd_reget_short_lock(exp, ll_i2info(inode)->lli_smd,
1282 &llap->llap_cookie, rw, start, end,
/* Drop a short lock previously obtained by ll_reget_short_lock().
 * Failure is logged but not propagated (void return). */
1286 static void ll_release_short_lock(struct inode *inode, obd_off end,
1287 void *cookie, int rw)
1289 struct obd_export *exp;
1292 exp = ll_i2obdexp(inode);
1296 rc = obd_release_short_lock(exp, ll_i2info(inode)->lli_smd, end,
1299 CERROR("unlock failed (%d)\n", rc);
/* Attempt the fast (short-lock) path for an I/O starting at ppos.
 * Refuses if any destination segment is mmapped — lockfree access to a
 * mapped region would race with page faults.  Looks up the page at ppos
 * and tries to reuse its cached lock.  NOTE(review): return statements
 * are elided; presumably nonzero on success — confirm in full source. */
1302 static inline int ll_file_get_fast_lock(struct file *file,
1303 obd_off ppos, obd_off end,
1304 const struct iovec *iov,
1305 unsigned long nr_segs,
1306 void **cookie, int rw)
1313 /* we would like this read request to be lockfree */
1314 for (seg = 0; seg < nr_segs; seg++) {
1315 const struct iovec *iv = &iov[seg];
1316 if (ll_region_mapped((unsigned long)iv->iov_base, iv->iov_len))
1320 page = find_lock_page(file->f_dentry->d_inode->i_mapping,
1321 ppos >> CFS_PAGE_SHIFT);
1323 if (ll_reget_short_lock(page, rw, ppos, end, cookie))
/* find_lock_page() took a reference; drop it */
1327 page_cache_release(page);
/* Release a fast lock taken by ll_file_get_fast_lock(). */
1334 static inline void ll_file_put_fast_lock(struct inode *inode, obd_off end,
1335 void *cookie, int rw)
1337 ll_release_short_lock(inode, end, cookie, rw);
/* Locking strategy chosen for a given I/O: none (lockless/contended),
 * fast per-page short lock, or full DLM tree lock. */
1340 enum ll_lock_style {
1341 LL_LOCK_STYLE_NOLOCK = 0,
1342 LL_LOCK_STYLE_FASTLOCK = 1,
1343 LL_LOCK_STYLE_TREELOCK = 2
/* Pick and take a lock for the I/O region: try the fast path first,
 * then the tree lock, else fall through to lockless.  Returns one of
 * enum ll_lock_style on success or a negative errno (see trailing
 * comment). */
1346 static inline int ll_file_get_lock(struct file *file, obd_off ppos,
1347 obd_off end, const struct iovec *iov,
1348 unsigned long nr_segs, void **cookie,
1349 struct ll_lock_tree *tree, int rw)
1355 if (ll_file_get_fast_lock(file, ppos, end, iov, nr_segs, cookie, rw))
1356 RETURN(LL_LOCK_STYLE_FASTLOCK);
1358 rc = ll_file_get_tree_lock_iov(tree, file, iov, nr_segs,
1360 /* rc: 1 for tree lock, 0 for no lock, <0 for error */
1363 RETURN(LL_LOCK_STYLE_TREELOCK);
1365 RETURN(LL_LOCK_STYLE_NOLOCK);
1368 /* an error happened if we reached this point, rc = -errno here */
/* Release whatever lock ll_file_get_lock() took, dispatching on the
 * recorded style.  NOTE(review): break statements between cases are
 * elided here — whether TREELOCK falls through cannot be confirmed from
 * this view. */
1372 static inline void ll_file_put_lock(struct inode *inode, obd_off end,
1373 enum ll_lock_style lock_style,
1374 void *cookie, struct ll_lock_tree *tree,
1378 switch (lock_style) {
1379 case LL_LOCK_STYLE_TREELOCK:
1380 ll_tree_unlock(tree);
1382 case LL_LOCK_STYLE_FASTLOCK:
1383 ll_file_put_fast_lock(inode, end, cookie, rw);
1386 CERROR("invalid locking style (%d)\n", lock_style);
/* Vectored read entry point (readv on older kernels, aio_read on newer
 * ones, selected by HAVE_FILE_READV).  Validates the iovec, handles the
 * no-object (zero-filled) case, then loops over max_rw_chunk-sized
 * chunks: take a lock, refresh kms/i_size under the size lock, issue
 * generic_file_readv/aio_read or lockless I/O, release the lock.
 * NOTE(review): the chunk loop structure and several branches are
 * elided in this extraction; comments describe visible lines only. */
1390 #ifdef HAVE_FILE_READV
1391 static ssize_t ll_file_readv(struct file *file, const struct iovec *iov,
1392 unsigned long nr_segs, loff_t *ppos)
1395 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1396 unsigned long nr_segs, loff_t pos)
1398 struct file *file = iocb->ki_filp;
1399 loff_t *ppos = &iocb->ki_pos;
1401 struct inode *inode = file->f_dentry->d_inode;
1402 struct ll_inode_info *lli = ll_i2info(inode);
1403 struct lov_stripe_md *lsm = lli->lli_smd;
1404 struct ll_sb_info *sbi = ll_i2sbi(inode);
1405 struct ll_lock_tree tree;
1407 struct ll_ra_read bead;
1410 ssize_t retval, chunk, sum = 0;
1412 struct iovec *iov_copy = NULL;
1413 unsigned long nrsegs_copy, nrsegs_orig = 0;
1414 size_t count, iov_offset = 0;
1419 count = ll_file_get_iov_count(iov, &nr_segs);
1420 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1421 inode->i_ino, inode->i_generation, inode, count, *ppos);
1422 /* "If nbyte is 0, read() will return 0 and have no other results."
1423 * -- Single Unix Spec */
1427 ll_stats_ops_tally(sbi, LPROC_LL_READ_BYTES, count);
1430 /* Read on file with no objects should return zero-filled
1431 * buffers up to file size (we can get non-zero sizes with
1432 * mknod + truncate, then opening file for read. This is a
1433 * common pattern in NFS case, it seems). Bug 6243 */
1435 /* Since there are no objects on OSTs, we have nothing to get
1436 * lock on and so we are forced to access inode->i_size
1439 /* Read beyond end of file */
1440 if (*ppos >= i_size_read(inode))
1443 if (count > i_size_read(inode) - *ppos)
1444 count = i_size_read(inode) - *ppos;
1445 /* Make sure to correctly adjust the file pos pointer for
/* zero-fill each user segment in turn until count is exhausted */
1447 for (nrsegs_copy = 0; nrsegs_copy < nr_segs; nrsegs_copy++) {
1448 const struct iovec *iv = &iov[nrsegs_copy];
1450 if (count < iv->iov_len)
1453 chunk = iv->iov_len;
/* clear_user() returns the number of bytes NOT zeroed (fault) */
1454 notzeroed = clear_user(iv->iov_base, chunk);
1455 sum += (chunk - notzeroed);
1456 count -= (chunk - notzeroed);
1457 if (notzeroed || !count)
/* ---- chunked path: bound each iteration to one stripe / max chunk ---- */
1467 if (sbi->ll_max_rw_chunk != 0) {
1468 /* first, let's know the end of the current stripe */
1470 obd_extent_calc(sbi->ll_osc_exp, lsm, OBD_CALC_STRIPE_END, &end);
1472 /* correct, the end is beyond the request */
1473 if (end > *ppos + count - 1)
1474 end = *ppos + count - 1;
1476 /* and chunk shouldn't be too large even if striping is wide */
1477 if (end - *ppos > sbi->ll_max_rw_chunk)
1478 end = *ppos + sbi->ll_max_rw_chunk - 1;
1480 chunk = end - *ppos + 1;
/* whole remaining request fits in one chunk: use caller's iov as-is */
1481 if ((count == chunk) && (iov_offset == 0)) {
1483 OBD_FREE(iov_copy, sizeof(*iov) * nrsegs_orig);
1485 iov_copy = (struct iovec *)iov;
1486 nrsegs_copy = nr_segs;
1489 nrsegs_orig = nr_segs;
1490 OBD_ALLOC(iov_copy, sizeof(*iov) * nr_segs);
1492 GOTO(out, retval = -ENOMEM);
1495 iov_copy_update(&nr_segs, &iov, &nrsegs_copy, iov_copy,
1496 &iov_offset, chunk);
1499 end = *ppos + count - 1;
1500 iov_copy = (struct iovec *)iov;
1501 nrsegs_copy = nr_segs;
1504 lock_style = ll_file_get_lock(file, (obd_off)(*ppos), end,
1505 iov_copy, nrsegs_copy, &cookie, &tree,
1508 GOTO(out, retval = lock_style);
1510 ll_inode_size_lock(inode, 1);
1512 * Consistency guarantees: following possibilities exist for the
1513 * relation between region being read and real file size at this
1516 * (A): the region is completely inside of the file;
1518 * (B-x): x bytes of region are inside of the file, the rest is
1521 * (C): the region is completely outside of the file.
1523 * This classification is stable under DLM lock acquired by
1524 * ll_tree_lock() above, because to change class, other client has to
1525 * take DLM lock conflicting with our lock. Also, any updates to
1526 * ->i_size by other threads on this client are serialized by
1527 * ll_inode_size_lock(). This guarantees that short reads are handled
1528 * correctly in the face of concurrent writes and truncates.
1530 inode_init_lvb(inode, &lvb);
1531 obd_merge_lvb(ll_i2sbi(inode)->ll_osc_exp, lsm, &lvb, 1);
1533 if (*ppos + count - 1 > kms) {
1534 /* A glimpse is necessary to determine whether we return a
1535 * short read (B) or some zeroes at the end of the buffer (C) */
1536 ll_inode_size_unlock(inode, 1);
1537 retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
1539 if (lock_style != LL_LOCK_STYLE_NOLOCK)
1540 ll_file_put_lock(inode, end, lock_style,
1541 cookie, &tree, OBD_BRW_READ);
1545 /* region is within kms and, hence, within real file size (A).
1546 * We need to increase i_size to cover the read region so that
1547 * generic_file_read() will do its job, but that doesn't mean
1548 * the kms size is _correct_, it is only the _minimum_ size.
1549 * If someone does a stat they will get the correct size which
1550 * will always be >= the kms value here. b=11081 */
1551 if (i_size_read(inode) < kms)
1552 i_size_write(inode, kms);
1553 ll_inode_size_unlock(inode, 1);
1556 chunk = end - *ppos + 1;
1557 CDEBUG(D_INODE,"Read ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
1558 inode->i_ino, chunk, *ppos, i_size_read(inode));
1560 /* turn off the kernel's read-ahead */
1561 if (lock_style != LL_LOCK_STYLE_NOLOCK) {
1562 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
1565 file->f_ra.ra_pages = 0;
1567 /* initialize read-ahead window once per syscall */
1570 bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
1571 bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
1572 ll_ra_read_in(file, &bead);
1576 file_accessed(file);
1577 #ifdef HAVE_FILE_READV
1578 retval = generic_file_readv(file, iov_copy, nrsegs_copy, ppos);
1580 retval = generic_file_aio_read(iocb, iov_copy, nrsegs_copy,
1583 ll_file_put_lock(inode, end, lock_style, cookie,
1584 &tree, OBD_BRW_READ);
1586 retval = ll_file_lockless_io(file, iov_copy, nrsegs_copy, ppos,
1589 ll_rw_stats_tally(sbi, current->pid, file, count, 0);
/* full chunk consumed and more to go: loop for the next chunk */
1593 if (retval == chunk && count > 0)
1599 ll_ra_read_ex(file, &bead);
1600 retval = (sum > 0) ? sum : retval;
1602 if (iov_copy && iov_copy != iov)
1603 OBD_FREE(iov_copy, sizeof(*iov) * nrsegs_orig);
/* Plain read(2) entry point: wrap the user buffer in a single-segment
 * iovec and forward to the vectored path (readv or a synchronous kiocb
 * through the aio path, depending on HAVE_FILE_READV). */
1608 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1611 struct iovec local_iov = { .iov_base = (void __user *)buf,
1613 #ifdef HAVE_FILE_READV
1614 return ll_file_readv(file, &local_iov, 1, ppos);
1619 init_sync_kiocb(&kiocb, file);
1620 kiocb.ki_pos = *ppos;
1621 kiocb.ki_left = count;
1623 ret = ll_file_aio_read(&kiocb, &local_iov, 1, kiocb.ki_pos);
/* propagate the position the aio path advanced */
1624 *ppos = kiocb.ki_pos;
/* NOTE(review): original-line numbering is non-contiguous below; many
 * statements (loop headers, braces, early returns) are elided. */
1630 * Write to a file (through the page cache).
1632 #ifdef HAVE_FILE_WRITEV
1633 static ssize_t ll_file_writev(struct file *file, const struct iovec *iov,
1634 unsigned long nr_segs, loff_t *ppos)
1636 #else /* AIO stuff */
1637 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1638 unsigned long nr_segs, loff_t pos)
1640 struct file *file = iocb->ki_filp;
1641 loff_t *ppos = &iocb->ki_pos;
1643 struct inode *inode = file->f_dentry->d_inode;
1644 struct ll_sb_info *sbi = ll_i2sbi(inode);
1645 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1646 struct ll_lock_tree tree;
1647 loff_t maxbytes = ll_file_maxbytes(inode);
1648 loff_t lock_start, lock_end, end;
1649 ssize_t retval, chunk, sum = 0;
1651 struct iovec *iov_copy = NULL;
1652 unsigned long nrsegs_copy, nrsegs_orig = 0;
1653 size_t count, iov_offset = 0;
1656 count = ll_file_get_iov_count(iov, &nr_segs);
1658 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1659 inode->i_ino, inode->i_generation, inode, count, *ppos);
1661 SIGNAL_MASK_ASSERT(); /* XXX BUG 1511 */
1663 /* POSIX, but surprised the VFS doesn't check this already */
1667 /* If file was opened for LL_IOC_LOV_SETSTRIPE but the ioctl wasn't
1668 * called on the file, don't fail the below assertion (bug 2388). */
1669 if (file->f_flags & O_LOV_DELAY_CREATE &&
1670 ll_i2info(inode)->lli_smd == NULL)
1673 LASSERT(ll_i2info(inode)->lli_smd != NULL);
/* serialize writers on this inode across the whole (chunked) request */
1675 down(&ll_i2info(inode)->lli_write_sem);
1678 chunk = 0; /* just to fix gcc's warning */
1679 end = *ppos + count - 1;
/* O_APPEND: lock to EOF since the final offset isn't known yet */
1681 if (file->f_flags & O_APPEND) {
1683 lock_end = OBD_OBJECT_EOF;
1684 iov_copy = (struct iovec *)iov;
1685 nrsegs_copy = nr_segs;
1686 } else if (sbi->ll_max_rw_chunk != 0) {
1687 /* first, let's know the end of the current stripe */
1689 obd_extent_calc(sbi->ll_osc_exp, lsm, OBD_CALC_STRIPE_END,
1692 /* correct, the end is beyond the request */
1693 if (end > *ppos + count - 1)
1694 end = *ppos + count - 1;
1696 /* and chunk shouldn't be too large even if striping is wide */
1697 if (end - *ppos > sbi->ll_max_rw_chunk)
1698 end = *ppos + sbi->ll_max_rw_chunk - 1;
1701 chunk = end - *ppos + 1;
1702 if ((count == chunk) && (iov_offset == 0)) {
1704 OBD_FREE(iov_copy, sizeof(*iov) * nrsegs_orig);
1706 iov_copy = (struct iovec *)iov;
1707 nrsegs_copy = nr_segs;
1710 nrsegs_orig = nr_segs;
1711 OBD_ALLOC(iov_copy, sizeof(*iov) * nr_segs);
1713 GOTO(out, retval = -ENOMEM);
1715 iov_copy_update(&nr_segs, &iov, &nrsegs_copy, iov_copy,
1716 &iov_offset, chunk);
1721 iov_copy = (struct iovec *)iov;
1722 nrsegs_copy = nr_segs;
1725 tree_locked = ll_file_get_tree_lock_iov(&tree, file, iov_copy,
1727 (obd_off)lock_start,
1730 if (tree_locked < 0)
1731 GOTO(out, retval = tree_locked);
1733 /* This is ok, g_f_w will overwrite this under i_sem if it races
1734 * with a local truncate, it just makes our maxbyte checking easier.
1735 * The i_size value gets updated in ll_extent_lock() as a consequence
1736 * of the [0,EOF] extent lock we requested above. */
1737 if (file->f_flags & O_APPEND) {
1738 *ppos = i_size_read(inode);
1739 end = *ppos + count - 1;
1742 if (*ppos >= maxbytes) {
1743 send_sig(SIGXFSZ, current, 0);
1744 GOTO(out_unlock, retval = -EFBIG);
1746 if (end > maxbytes - 1)
1749 /* generic_file_write handles O_APPEND after getting i_mutex */
1750 chunk = end - *ppos + 1;
1751 CDEBUG(D_INFO, "Writing inode %lu, "LPSZ" bytes, offset %Lu\n",
1752 inode->i_ino, chunk, *ppos);
1754 #ifdef HAVE_FILE_WRITEV
1755 retval = generic_file_writev(file, iov_copy, nrsegs_copy, ppos);
1757 retval = generic_file_aio_write(iocb, iov_copy, nrsegs_copy,
1761 retval = ll_file_lockless_io(file, iov_copy, nrsegs_copy,
1762 ppos, WRITE, chunk);
1763 ll_rw_stats_tally(ll_i2sbi(inode), current->pid, file, chunk, 1);
1767 ll_tree_unlock(&tree);
/* full chunk written and bytes remain: iterate for the next chunk */
1773 if (retval == chunk && count > 0)
1777 up(&ll_i2info(inode)->lli_write_sem);
1779 if (iov_copy && iov_copy != iov)
1780 OBD_FREE(iov_copy, sizeof(*iov) * nrsegs_orig);
1782 retval = (sum > 0) ? sum : retval;
1783 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1784 retval > 0 ? retval : 0);
/* Plain write(2) entry point: wrap the user buffer in a one-segment
 * iovec and forward to writev or the synchronous-kiocb aio path. */
1788 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1791 struct iovec local_iov = { .iov_base = (void __user *)buf,
1794 #ifdef HAVE_FILE_WRITEV
1795 return ll_file_writev(file, &local_iov, 1, ppos);
1800 init_sync_kiocb(&kiocb, file);
1801 kiocb.ki_pos = *ppos;
1802 kiocb.ki_left = count;
1804 ret = ll_file_aio_write(&kiocb, &local_iov, 1, kiocb.ki_pos);
/* propagate the position the aio path advanced */
1805 *ppos = kiocb.ki_pos;
/* NOTE(review): elided extraction — non-contiguous original lines. */
1812 * Send file content (through pagecache) somewhere with helper
1814 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
1815 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1816 read_actor_t actor, void *target)
1818 struct inode *inode = in_file->f_dentry->d_inode;
1819 struct ll_inode_info *lli = ll_i2info(inode);
1820 struct lov_stripe_md *lsm = lli->lli_smd;
1821 struct ll_lock_tree tree;
1822 struct ll_lock_tree_node *node;
1824 struct ll_ra_read bead;
1829 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1830 inode->i_ino, inode->i_generation, inode, count, *ppos);
1832 /* "If nbyte is 0, read() will return 0 and have no other results."
1833 * -- Single Unix Spec */
1837 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_READ_BYTES, count);
1838 /* turn off the kernel's read-ahead */
1839 in_file->f_ra.ra_pages = 0;
1841 /* File with no objects, nothing to lock */
1843 RETURN(generic_file_sendfile(in_file, ppos, count, actor, target));
/* take a PR tree lock over the whole region being sent */
1845 node = ll_node_from_inode(inode, *ppos, *ppos + count - 1, LCK_PR);
1847 RETURN(PTR_ERR(node));
1849 tree.lt_fd = LUSTRE_FPRIVATE(in_file);
1850 rc = ll_tree_lock(&tree, node, NULL, count,
1851 in_file->f_flags & O_NONBLOCK?LDLM_FL_BLOCK_NOWAIT:0);
1855 ll_clear_file_contended(inode);
1856 ll_inode_size_lock(inode, 1);
1858 * Consistency guarantees: following possibilities exist for the
1859 * relation between region being read and real file size at this
1862 * (A): the region is completely inside of the file;
1864 * (B-x): x bytes of region are inside of the file, the rest is
1867 * (C): the region is completely outside of the file.
1869 * This classification is stable under DLM lock acquired by
1870 * ll_tree_lock() above, because to change class, other client has to
1871 * take DLM lock conflicting with our lock. Also, any updates to
1872 * ->i_size by other threads on this client are serialized by
1873 * ll_inode_size_lock(). This guarantees that short reads are handled
1874 * correctly in the face of concurrent writes and truncates.
1876 inode_init_lvb(inode, &lvb);
1877 obd_merge_lvb(ll_i2sbi(inode)->ll_osc_exp, lsm, &lvb, 1);
1879 if (*ppos + count - 1 > kms) {
1880 /* A glimpse is necessary to determine whether we return a
1881 * short read (B) or some zeroes at the end of the buffer (C) */
1882 ll_inode_size_unlock(inode, 1);
1883 retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
1887 /* region is within kms and, hence, within real file size (A) */
1888 i_size_write(inode, kms);
1889 ll_inode_size_unlock(inode, 1);
1892 CDEBUG(D_INFO, "Send ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
1893 inode->i_ino, count, *ppos, i_size_read(inode));
/* set up the Lustre read-ahead window for this transfer */
1895 bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
1896 bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
1897 ll_ra_read_in(in_file, &bead);
1899 file_accessed(in_file);
1900 retval = generic_file_sendfile(in_file, ppos, count, actor, target);
1901 ll_ra_read_ex(in_file, &bead);
1904 ll_tree_unlock(&tree);
/* LL_IOC_RECREATE_OBJ: recreate a lost OST object for this file
 * (admin-only).  Copies the request from userspace, clones the stripe
 * metadata, flags the obdo with OBD_FL_RECREATE_OBJS and calls
 * obd_create().  NOTE(review): oa allocation and several checks are
 * elided in this view. */
1909 static int ll_lov_recreate_obj(struct inode *inode, struct file *file,
1912 struct ll_inode_info *lli = ll_i2info(inode);
1913 struct obd_export *exp = ll_i2obdexp(inode);
1914 struct ll_recreate_obj ucreatp;
1915 struct obd_trans_info oti = { 0 };
1916 struct obdo *oa = NULL;
1919 struct lov_stripe_md *lsm, *lsm2;
1922 if (!capable (CAP_SYS_ADMIN))
1925 rc = copy_from_user(&ucreatp, (struct ll_recreate_obj *)arg,
1926 sizeof(struct ll_recreate_obj));
/* hold lli_size_sem so the stripe md cannot change under us */
1934 down(&lli->lli_size_sem);
1937 GOTO(out, rc = -ENOENT);
1938 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1939 (lsm->lsm_stripe_count));
1941 OBD_ALLOC(lsm2, lsm_size);
1943 GOTO(out, rc = -ENOMEM);
1945 oa->o_id = ucreatp.lrc_id;
1946 oa->o_nlink = ucreatp.lrc_ost_idx;
1947 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1948 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS;
1949 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1950 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1952 memcpy(lsm2, lsm, lsm_size);
1953 rc = obd_create(exp, oa, &lsm2, &oti);
1955 OBD_FREE(lsm2, lsm_size);
1958 up(&lli->lli_size_sem);
/* Set striping on a file by replaying an IT_OPEN intent carrying the
 * lov_user_md.  Fails if striping already exists.  The open handle from
 * the intent is closed immediately — only the EA side effect matters. */
1963 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1964 int flags, struct lov_user_md *lum,
1967 struct ll_inode_info *lli = ll_i2info(inode);
1968 struct lov_stripe_md *lsm;
1969 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1973 down(&lli->lli_size_sem);
/* stripe md already present: cannot restripe an existing file */
1976 up(&lli->lli_size_sem);
1977 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1982 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1985 if (it_disposition(&oit, DISP_LOOKUP_NEG))
1986 GOTO(out_req_free, rc = -ENOENT);
1987 rc = oit.d.lustre.it_status;
1989 GOTO(out_req_free, rc);
1991 ll_release_openhandle(file->f_dentry, &oit);
1994 up(&lli->lli_size_sem);
1995 ll_intent_release(&oit);
1998 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/* Fetch the LOV EA for `filename` from the MDS, byte-swap it for the
 * local host, and — for LOV_MAGIC_JOIN files — expand it into a
 * lov_user_md_join with per-stripe extent info.  On success *lmmp /
 * *lmm_size point into (or replace) buffers tied to *request; the
 * caller owns finishing the request.  NOTE(review): elided lines hide
 * some error branches. */
2002 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
2003 struct lov_mds_md **lmmp, int *lmm_size,
2004 struct ptlrpc_request **request)
2006 struct ll_sb_info *sbi = ll_i2sbi(inode);
2008 struct mds_body *body;
2009 struct lov_mds_md *lmm = NULL;
2010 struct ptlrpc_request *req = NULL;
2013 ll_inode2fid(&fid, inode);
2015 rc = ll_get_max_mdsize(sbi, &lmmsize);
2019 rc = mdc_getattr_name(sbi->ll_mdc_exp, &fid,
2020 filename, strlen(filename) + 1,
2021 OBD_MD_FLEASIZE | OBD_MD_FLDIREA,
2024 CDEBUG(D_INFO, "mdc_getattr_name failed "
2025 "on %s: rc %d\n", filename, rc);
2029 body = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF,
2031 LASSERT(body != NULL); /* checked by mdc_getattr_name */
2032 /* swabbed by mdc_getattr_name */
2033 LASSERT(lustre_rep_swabbed(req, REPLY_REC_OFF));
2035 lmmsize = body->eadatasize;
2037 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
2039 GOTO(out, rc = -ENODATA);
2042 lmm = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF + 1,
2044 LASSERT(lmm != NULL);
2045 LASSERT(lustre_rep_swabbed(req, REPLY_REC_OFF + 1));
2047 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC)) &&
2048 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_JOIN))) {
2049 GOTO(out, rc = -EPROTO);
2052 * This is coming from the MDS, so is probably in
2053 * little endian. We convert it to host endian before
2054 * passing it to userspace.
/* only swab on big-endian hosts (LOV_MAGIC differs from its LE form) */
2056 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
2057 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC)) {
2058 lustre_swab_lov_user_md((struct lov_user_md *)lmm);
2059 lustre_swab_lov_user_md_objects((struct lov_user_md *)lmm);
2060 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_JOIN)) {
2061 lustre_swab_lov_user_md_join((struct lov_user_md_join *)lmm);
2065 if (lmm->lmm_magic == LOV_MAGIC_JOIN) {
2066 struct lov_stripe_md *lsm;
2067 struct lov_user_md_join *lmj;
2068 int lmj_size, i, aindex = 0;
2070 rc = obd_unpackmd(sbi->ll_osc_exp, &lsm, lmm, lmmsize);
2072 GOTO(out, rc = -ENOMEM);
2073 rc = obd_checkmd(sbi->ll_osc_exp, sbi->ll_mdc_exp, lsm);
2075 GOTO(out_free_memmd, rc);
2077 lmj_size = sizeof(struct lov_user_md_join) +
2078 lsm->lsm_stripe_count *
2079 sizeof(struct lov_user_ost_data_join);
2080 OBD_ALLOC(lmj, lmj_size);
2082 GOTO(out_free_memmd, rc = -ENOMEM);
2084 memcpy(lmj, lmm, sizeof(struct lov_user_md_join));
/* walk stripes, advancing aindex through the join extent array */
2085 for (i = 0; i < lsm->lsm_stripe_count; i++) {
2086 struct lov_extent *lex =
2087 &lsm->lsm_array->lai_ext_array[aindex];
2089 if (lex->le_loi_idx + lex->le_stripe_count <= i)
2091 CDEBUG(D_INFO, "aindex %d i %d l_extent_start "
2092 LPU64" len %d\n", aindex, i,
2093 lex->le_start, (int)lex->le_len);
2094 lmj->lmm_objects[i].l_extent_start =
2097 if ((int)lex->le_len == -1)
2098 lmj->lmm_objects[i].l_extent_end = -1;
2100 lmj->lmm_objects[i].l_extent_end =
2101 lex->le_start + lex->le_len;
2102 lmj->lmm_objects[i].l_object_id =
2103 lsm->lsm_oinfo[i]->loi_id;
2104 lmj->lmm_objects[i].l_object_gr =
2105 lsm->lsm_oinfo[i]->loi_gr;
2106 lmj->lmm_objects[i].l_ost_gen =
2107 lsm->lsm_oinfo[i]->loi_ost_gen;
2108 lmj->lmm_objects[i].l_ost_idx =
2109 lsm->lsm_oinfo[i]->loi_ost_idx;
2111 lmm = (struct lov_mds_md *)lmj;
2114 obd_free_memmd(sbi->ll_osc_exp, &lsm);
2118 *lmm_size = lmmsize;
/* LL_IOC_LOV_SETEA (admin-only): copy a full lov_user_md (with one
 * lov_user_ost_data) from userspace and apply it via
 * ll_lov_setstripe_ea_info() with MDS_OPEN_HAS_OBJS. */
2122 static int ll_lov_setea(struct inode *inode, struct file *file,
2125 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2126 struct lov_user_md *lump;
2127 int lum_size = sizeof(struct lov_user_md) +
2128 sizeof(struct lov_user_ost_data);
2132 if (!capable (CAP_SYS_ADMIN))
2135 OBD_ALLOC(lump, lum_size);
2139 rc = copy_from_user(lump, (struct lov_user_md *)arg, lum_size);
2141 OBD_FREE(lump, lum_size);
2145 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
2147 OBD_FREE(lump, lum_size);
/* LL_IOC_LOV_SETSTRIPE: copy the striping request from userspace, set
 * the stripe EA, then echo the resulting striping back to the caller's
 * buffer via a GETSTRIPE iocontrol. */
2151 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2154 struct lov_user_md lum, *lump = (struct lov_user_md *)arg;
2156 int flags = FMODE_WRITE;
2159 /* Bug 1152: copy properly when this is no longer true */
2160 LASSERT(sizeof(lum) == sizeof(*lump));
2161 LASSERT(sizeof(lum.lmm_objects[0]) == sizeof(lump->lmm_objects[0]));
2162 rc = copy_from_user(&lum, lump, sizeof(lum));
2166 rc = ll_lov_setstripe_ea_info(inode, file, flags, &lum, sizeof(lum));
/* clear the user's stripe_count so GETSTRIPE reports the real layout */
2168 put_user(0, &lump->lmm_stripe_count);
2169 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2obdexp(inode),
2170 0, ll_i2info(inode)->lli_smd, lump);
/* LL_IOC_LOV_GETSTRIPE: hand the cached stripe md to the OSC
 * iocontrol, which packs it into the user buffer. */
2175 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
2177 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
2182 return obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2obdexp(inode), 0, lsm,
/* LL_IOC_GROUP_LOCK: take a whole-file LCK_GROUP extent lock with the
 * given gid and remember its handle in the file descriptor.  While
 * held, per-I/O locking is skipped (LL_FILE_IGNORE_LOCK). */
2186 static int ll_get_grouplock(struct inode *inode, struct file *file,
2189 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2190 ldlm_policy_data_t policy = { .l_extent = { .start = 0,
2191 .end = OBD_OBJECT_EOF}};
2192 struct lustre_handle lockh = { 0 };
2193 struct ll_inode_info *lli = ll_i2info(inode);
2194 struct lov_stripe_md *lsm = lli->lli_smd;
/* already group-locked on this fd: elided branch handles it */
2198 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2202 policy.l_extent.gid = arg;
2203 if (file->f_flags & O_NONBLOCK)
2204 flags = LDLM_FL_BLOCK_NOWAIT;
2206 rc = ll_extent_lock(fd, inode, lsm, LCK_GROUP, &policy, &lockh, flags);
2210 fd->fd_flags |= LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK;
2212 memcpy(&fd->fd_cwlockh, &lockh, sizeof(lockh));
/* LL_IOC_GROUP_UNLOCK: drop the group lock recorded on this fd after
 * validating that it exists and the gid matches. */
2217 static int ll_put_grouplock(struct inode *inode, struct file *file,
2220 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2221 struct ll_inode_info *lli = ll_i2info(inode);
2222 struct lov_stripe_md *lsm = lli->lli_smd;
2226 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2227 /* Ugh, it's already unlocked. */
2231 if (fd->fd_gid != arg) /* Ugh? Unlocking with different gid? */
/* clear flags before unlocking so racing I/O re-takes normal locks */
2234 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
2236 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP, &fd->fd_cwlockh);
2241 memset(&fd->fd_cwlockh, 0, sizeof(fd->fd_cwlockh));
/* Validate a file-join request: server must support join, both inodes
 * must be distinct regular files, and the head's size must be a
 * multiple of JOIN_FILE_ALIGN (64K). */
2246 static int join_sanity_check(struct inode *head, struct inode *tail)
2249 if ((ll_i2sbi(head)->ll_flags & LL_SBI_JOIN) == 0) {
2250 CERROR("server do not support join \n");
2253 if (!S_ISREG(tail->i_mode) || !S_ISREG(head->i_mode)) {
2254 CERROR("tail ino %lu and ino head %lu must be regular\n",
2255 head->i_ino, tail->i_ino);
2258 if (head->i_ino == tail->i_ino) {
2259 CERROR("file %lu can not be joined to itself \n", head->i_ino);
2262 if (i_size_read(head) % JOIN_FILE_ALIGN) {
2263 CERROR("hsize %llu must be times of 64K\n", i_size_read(head));
/* Issue the MDS join operation: an O_JOIN_FILE open intent enqueue on
 * the head inode naming the tail, passing the head's size as intent
 * data.  Any granted lock is dropped immediately and the open handle
 * closed — only the server-side join matters. */
2269 static int join_file(struct inode *head_inode, struct file *head_filp,
2270 struct file *tail_filp)
2272 struct dentry *tail_dentry = tail_filp->f_dentry;
2273 struct lookup_intent oit = {.it_op = IT_OPEN,
2274 .it_flags = head_filp->f_flags|O_JOIN_FILE};
2275 struct ldlm_enqueue_info einfo = { LDLM_IBITS, LCK_PW,
2276 ll_mdc_blocking_ast, ldlm_completion_ast, NULL, NULL };
2278 struct lustre_handle lockh;
2279 struct mdc_op_data *op_data;
2284 tail_dentry = tail_filp->f_dentry;
2286 OBD_ALLOC_PTR(op_data);
2287 if (op_data == NULL) {
/* intent data carries the head file size (join offset) to the MDS */
2291 data = i_size_read(head_inode);
2292 ll_prepare_mdc_op_data(op_data, head_inode,
2293 tail_dentry->d_parent->d_inode,
2294 tail_dentry->d_name.name,
2295 tail_dentry->d_name.len, 0, &data);
2296 rc = mdc_enqueue(ll_i2mdcexp(head_inode), &einfo, &oit,
2297 op_data, &lockh, NULL, 0, 0);
2302 rc = oit.d.lustre.it_status;
2304 if (rc < 0 || it_open_error(DISP_OPEN_OPEN, &oit)) {
2305 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, &oit);
2306 ptlrpc_req_finished((struct ptlrpc_request *)
2307 oit.d.lustre.it_data);
2311 if (oit.d.lustre.it_lock_mode) { /* If we got lock - release it right
2313 ldlm_lock_decref(&lockh, oit.d.lustre.it_lock_mode);
2314 oit.d.lustre.it_lock_mode = 0;
2316 ll_release_openhandle(head_filp->f_dentry, &oit);
2319 OBD_FREE_PTR(op_data);
2320 ll_intent_release(&oit);
/* LL_IOC_JOIN: append the tail file to `head`.  Opens the tail, takes
 * EX tree locks on both files in ascending-inode order (deadlock
 * avoidance), sanity-checks, performs the join, then unwinds via the
 * cleanup_phase switch (presumably with fallthrough cases — elided
 * break/labels not visible here).  On success the head's cached stripe
 * md is invalidated so it is refetched. */
2324 static int ll_file_join(struct inode *head, struct file *filp,
2325 char *filename_tail)
2327 struct inode *tail = NULL, *first = NULL, *second = NULL;
2328 struct dentry *tail_dentry;
2329 struct file *tail_filp, *first_filp, *second_filp;
2330 struct ll_lock_tree first_tree, second_tree;
2331 struct ll_lock_tree_node *first_node, *second_node;
2332 struct ll_inode_info *hlli = ll_i2info(head), *tlli;
2333 int rc = 0, cleanup_phase = 0;
2336 CDEBUG(D_VFSTRACE, "VFS Op:head=%lu/%u(%p) tail %s\n",
2337 head->i_ino, head->i_generation, head, filename_tail);
2339 tail_filp = filp_open(filename_tail, O_WRONLY, 0644);
2340 if (IS_ERR(tail_filp)) {
2341 CERROR("Can not open tail file %s", filename_tail);
2342 rc = PTR_ERR(tail_filp);
2345 tail = igrab(tail_filp->f_dentry->d_inode);
2347 tlli = ll_i2info(tail);
2348 tail_dentry = tail_filp->f_dentry;
2349 LASSERT(tail_dentry);
2352 /*reorder the inode for lock sequence*/
2353 first = head->i_ino > tail->i_ino ? head : tail;
2354 second = head->i_ino > tail->i_ino ? tail : head;
2355 first_filp = head->i_ino > tail->i_ino ? filp : tail_filp;
2356 second_filp = head->i_ino > tail->i_ino ? tail_filp : filp;
2358 CDEBUG(D_INFO, "reorder object from %lu:%lu to %lu:%lu \n",
2359 head->i_ino, tail->i_ino, first->i_ino, second->i_ino);
2360 first_node = ll_node_from_inode(first, 0, OBD_OBJECT_EOF, LCK_EX);
2361 if (IS_ERR(first_node)){
2362 rc = PTR_ERR(first_node);
2365 first_tree.lt_fd = first_filp->private_data;
2366 rc = ll_tree_lock(&first_tree, first_node, NULL, 0, 0);
2371 second_node = ll_node_from_inode(second, 0, OBD_OBJECT_EOF, LCK_EX);
2372 if (IS_ERR(second_node)){
2373 rc = PTR_ERR(second_node);
2376 second_tree.lt_fd = second_filp->private_data;
2377 rc = ll_tree_lock(&second_tree, second_node, NULL, 0, 0);
2382 rc = join_sanity_check(head, tail);
2386 rc = join_file(head, filp, tail_filp);
2390 switch (cleanup_phase) {
2392 ll_tree_unlock(&second_tree);
2393 obd_cancel_unused(ll_i2obdexp(second),
2394 ll_i2info(second)->lli_smd, 0, NULL);
2396 ll_tree_unlock(&first_tree);
2397 obd_cancel_unused(ll_i2obdexp(first),
2398 ll_i2info(first)->lli_smd, 0, NULL);
2400 filp_close(tail_filp, 0);
/* joined successfully: drop the stale stripe md on the head */
2403 if (head && rc == 0) {
2404 obd_free_memmd(ll_i2sbi(head)->ll_osc_exp,
2406 hlli->lli_smd = NULL;
2411 CERROR("invalid cleanup_phase %d\n", cleanup_phase);
/* Close the MDS open handle held inside a lookup intent (used when an
 * intent-open produced a handle the caller does not want to keep,
 * e.g. setstripe or join).  No-op for the root dentry or if the intent
 * carries no open disposition. */
2417 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2419 struct inode *inode = dentry->d_inode;
2420 struct obd_client_handle *och;
2426 /* Root ? Do nothing. */
2427 if (dentry->d_inode->i_sb->s_root == dentry)
2430 /* No open handle to close? Move away */
2431 if (!it_disposition(it, DISP_OPEN_OPEN))
2434 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2436 OBD_ALLOC(och, sizeof(*och));
2438 GOTO(out, rc = -ENOMEM);
2440 ll_och_fill(ll_i2info(inode), it, och);
2442 rc = ll_close_inode_openhandle(inode, och);
2444 OBD_FREE(och, sizeof(*och));
2446 /* this one is in place of ll_file_open */
2447 ptlrpc_req_finished(it->d.lustre.it_data);
2448 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* File ioctl dispatcher: handles Lustre-specific commands (flags,
 * striping, join, group locks, statfs, obd name) locally and forwards
 * anything unrecognized to the OSC via obd_iocontrol().
 * NOTE(review): RETURN statements between several cases are elided;
 * fallthrough cannot be confirmed from this view. */
2452 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
2455 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2459 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2460 inode->i_generation, inode, cmd);
2461 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2463 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2464 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2468 case LL_IOC_GETFLAGS:
2469 /* Get the current value of the file flags */
2470 return put_user(fd->fd_flags, (int *)arg);
2471 case LL_IOC_SETFLAGS:
2472 case LL_IOC_CLRFLAGS:
2473 /* Set or clear specific file flags */
2474 /* XXX This probably needs checks to ensure the flags are
2475 * not abused, and to handle any flag side effects.
2477 if (get_user(flags, (int *) arg))
2480 if (cmd == LL_IOC_SETFLAGS) {
/* IGNORE_LOCK is only safe for O_DIRECT I/O */
2481 if ((flags & LL_FILE_IGNORE_LOCK) &&
2482 !(file->f_flags & O_DIRECT)) {
2483 CERROR("%s: unable to disable locking on "
2484 "non-O_DIRECT file\n", current->comm);
2488 fd->fd_flags |= flags;
2490 fd->fd_flags &= ~flags;
2493 case LL_IOC_LOV_SETSTRIPE:
2494 RETURN(ll_lov_setstripe(inode, file, arg));
2495 case LL_IOC_LOV_SETEA:
2496 RETURN(ll_lov_setea(inode, file, arg));
2497 case LL_IOC_LOV_GETSTRIPE:
2498 RETURN(ll_lov_getstripe(inode, arg));
2499 case LL_IOC_RECREATE_OBJ:
2500 RETURN(ll_lov_recreate_obj(inode, file, arg));
2501 case EXT3_IOC_GETFLAGS:
2502 case EXT3_IOC_SETFLAGS:
2503 RETURN(ll_iocontrol(inode, file, cmd, arg));
2504 case EXT3_IOC_GETVERSION_OLD:
2505 case EXT3_IOC_GETVERSION:
2506 RETURN(put_user(inode->i_generation, (int *)arg));
2511 ftail = getname((const char *)arg);
2513 RETURN(PTR_ERR(ftail));
2514 rc = ll_file_join(inode, file, ftail);
2518 case LL_IOC_GROUP_LOCK:
2519 RETURN(ll_get_grouplock(inode, file, arg));
2520 case LL_IOC_GROUP_UNLOCK:
2521 RETURN(ll_put_grouplock(inode, file, arg));
2522 case IOC_OBD_STATFS:
2523 RETURN(ll_obd_statfs(inode, (void *)arg));
2524 case OBD_IOC_GETNAME_OLD:
2525 case OBD_IOC_GETNAME: {
2526 struct obd_device *obd =
2527 class_exp2obd(ll_i2sbi(inode)->ll_osc_exp);
2530 if (copy_to_user((void *)arg, obd->obd_name,
2531 strlen(obd->obd_name) + 1))
2536 /* We need to special case any other ioctls we want to handle,
2537 * to send them to the MDS/OST as appropriate and to properly
2538 * network encode the arg field.
2539 case EXT3_IOC_SETVERSION_OLD:
2540 case EXT3_IOC_SETVERSION:
/* registered external handlers get first refusal on unknown cmds */
2546 ll_iocontrol_call(inode, file, cmd, arg, &err))
2549 RETURN(obd_iocontrol(cmd, ll_i2obdexp(inode), 0, NULL,
/*
 * llseek for Lustre files: compute the new position for
 * SEEK_SET/SEEK_CUR/SEEK_END (origin 0/1/2).
 * NOTE(review): this listing elides some lines (declarations of
 * retval/event, the -EINVAL/RETURN paths); only visible code is annotated.
 */
2555 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2557 struct inode *inode = file->f_dentry->d_inode;
2558 struct ll_inode_info *lli = ll_i2info(inode);
2559 struct lov_stripe_md *lsm = lli->lli_smd;
/* provisional target computed only for the VFS trace message below */
2562 retval = offset + ((origin == 2) ? i_size_read(inode) :
2563 (origin == 1) ? file->f_pos : 0);
2564 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%Lu=%#Lx(%s)\n",
2565 inode->i_ino, inode->i_generation, inode, retval, retval,
2566 origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR" : "SEEK_SET");
2567 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
/* SEEK_END must see an up-to-date size: glimpse it from the OSTs first.
 * O_NONBLOCK maps to LDLM_FL_BLOCK_NOWAIT so the glimpse enqueue does not
 * block on a conflicting lock. */
2569 if (origin == 2) { /* SEEK_END */
2570 int nonblock = 0, rc;
2572 if (file->f_flags & O_NONBLOCK)
2573 nonblock = LDLM_FL_BLOCK_NOWAIT;
2576 rc = ll_glimpse_size(inode, nonblock);
/* read i_size under the llite inode size lock for a consistent value */
2581 ll_inode_size_lock(inode, 0);
2582 offset += i_size_read(inode);
2583 ll_inode_size_unlock(inode, 0);
2584 } else if (origin == 1) { /* SEEK_CUR */
2585 offset += file->f_pos;
/* accept only offsets in [0, maxbytes]; when the position actually moves,
 * f_version is bumped on 2.4 (global `event`) or cleared on 2.6 */
2589 if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
2590 if (offset != file->f_pos) {
2591 file->f_pos = offset;
2592 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
2594 file->f_version = ++event;
2596 file->f_version = 0;
/*
 * fsync for Lustre files: wait for in-flight page I/O, surface any
 * recorded async writeback errors, then sync metadata via the MDC and
 * (for striped objects) data via obd_sync to the OSC.
 * NOTE(review): listing elides lines (ENTRY, fid/oa declarations and
 * allocation, error-path RETURNs); only visible code is annotated.
 */
2605 int ll_fsync(struct file *file, struct dentry *dentry, int data)
2607 struct inode *inode = dentry->d_inode;
2608 struct ll_inode_info *lli = ll_i2info(inode);
2609 struct lov_stripe_md *lsm = lli->lli_smd;
2611 struct ptlrpc_request *req;
2614 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2615 inode->i_generation, inode);
2616 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2618 /* fsync's caller has already called _fdata{sync,write}, we want
2619 * that IO to finish before calling the osc and mdc sync methods */
2620 rc = filemap_fdatawait(inode->i_mapping);
2622 /* catch async errors that were recorded back when async writeback
2623 * failed for pages in this mapping. */
/* consume-and-clear: the per-inode async rc is reported only once */
2624 err = lli->lli_async_rc;
2625 lli->lli_async_rc = 0;
/* per-stripe async errors are likewise collected and cleared */
2629 err = lov_test_and_clear_async_rc(lsm);
/* sync the metadata on the MDS for this fid */
2634 ll_inode2fid(&fid, inode);
2635 err = mdc_sync(ll_i2sbi(inode)->ll_mdc_exp, &fid, &req);
2639 ptlrpc_req_finished(req);
2646 RETURN(rc ? rc : -ENOMEM);
/* describe the object(s) to sync; timestamps travel with the obdo */
2648 oa->o_id = lsm->lsm_object_id;
2649 oa->o_valid = OBD_MD_FLID;
2650 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
2651 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
2653 err = obd_sync(ll_i2sbi(inode)->ll_osc_exp, oa, lsm,
/*
 * Cluster-wide file locking: translate a VFS flock()/fcntl() lock request
 * into an LDLM_FLOCK enqueue against the MDS, then mirror the result into
 * the local lock tables so the VFS stays consistent.
 * NOTE(review): listing elides lines (flags/rc declarations, several case
 * labels in both switches, RETURN); only visible code is annotated.
 */
2663 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2665 struct inode *inode = file->f_dentry->d_inode;
2666 struct ll_sb_info *sbi = ll_i2sbi(inode);
/* flock locks live in their own resource namespace keyed by ino/gen */
2667 struct ldlm_res_id res_id =
2668 { .name = {inode->i_ino, inode->i_generation, LDLM_FLOCK} };
2669 struct ldlm_enqueue_info einfo = { LDLM_FLOCK, 0, NULL,
2670 ldlm_flock_completion_ast, NULL, file_lock };
2671 struct lustre_handle lockh = {0};
2672 ldlm_policy_data_t flock;
2677 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2678 inode->i_ino, file_lock);
2679 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2681 if (file_lock->fl_flags & FL_FLOCK) {
2682 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2683 /* set missing params for flock() calls */
2684 file_lock->fl_end = OFFSET_MAX;
2685 file_lock->fl_pid = current->tgid;
/* encode owner pid and byte range into the ldlm policy data */
2687 flock.l_flock.pid = file_lock->fl_pid;
2688 flock.l_flock.start = file_lock->fl_start;
2689 flock.l_flock.end = file_lock->fl_end;
/* map fcntl lock types onto ldlm modes: RDLCK->PR, WRLCK->PW, UNLCK->NL */
2691 switch (file_lock->fl_type) {
2693 einfo.ei_mode = LCK_PR;
2696 /* An unlock request may or may not have any relation to
2697 * existing locks so we may not be able to pass a lock handle
2698 * via a normal ldlm_lock_cancel() request. The request may even
2699 * unlock a byte range in the middle of an existing lock. In
2700 * order to process an unlock request we need all of the same
2701 * information that is given with a normal read or write record
2702 * lock request. To avoid creating another ldlm unlock (cancel)
2703 * message we'll treat a LCK_NL flock request as an unlock. */
2704 einfo.ei_mode = LCK_NL;
2707 einfo.ei_mode = LCK_PW;
2710 CERROR("unknown fcntl lock type: %d\n", file_lock->fl_type);
/* non-blocking set request: fail rather than wait for a conflict */
2725 flags = LDLM_FL_BLOCK_NOWAIT;
/* F_GETLK and friends only probe for conflicts */
2731 flags = LDLM_FL_TEST_LOCK;
2732 /* Save the old mode so that if the mode in the lock changes we
2733 * can decrement the appropriate reader or writer refcount. */
2734 file_lock->fl_type = einfo.ei_mode;
2737 CERROR("unknown fcntl lock command: %d\n", cmd);
2741 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2742 "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2743 flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
/* enqueue the flock lock on the MDS via the MDC export */
2745 rc = ldlm_cli_enqueue(sbi->ll_mdc_exp, NULL, &einfo, res_id,
2746 &flock, &flags, NULL, 0, NULL, &lockh, 0);
/* on success, record the lock locally so the VFS bookkeeping matches */
2747 if ((file_lock->fl_flags & FL_FLOCK) && (rc == 0))
2748 ll_flock_lock_file_wait(file, file_lock, (cmd == F_SETLKW));
2749 #ifdef HAVE_F_OP_FLOCK
2750 if ((file_lock->fl_flags & FL_POSIX) && (rc == 0) &&
2751 !(flags & LDLM_FL_TEST_LOCK))
2752 posix_lock_file_wait(file, file_lock);
/* flock/lock entry used by -o noflock mounts; per the comment on
 * ll_file_operations_noflock below, its (elided) body returns ENOSYS. */
2758 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
/*
 * Check whether this node already holds a granted MDS inodebits lock
 * covering @bits for @inode.  Uses LDLM_FL_TEST_LOCK so the match does not
 * take a reference on the lock.  NOTE(review): flags declaration and the
 * return statements are elided from this listing.
 */
2765 int ll_have_md_lock(struct inode *inode, __u64 bits)
2767 struct lustre_handle lockh;
2768 struct ldlm_res_id res_id = { .name = {0} };
2769 struct obd_device *obddev;
2770 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2777 obddev = ll_i2mdcexp(inode)->exp_obd;
/* metadata resources are keyed by inode number + generation */
2778 res_id.name[0] = inode->i_ino;
2779 res_id.name[1] = inode->i_generation;
2781 CDEBUG(D_INFO, "trying to match res "LPU64"\n", res_id.name[0]);
/* accept locks with a pending cancel too; any of CR/CW/PR/PW qualifies */
2783 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2784 if (ldlm_lock_match(obddev->obd_namespace, flags, &res_id, LDLM_IBITS,
2785 &policy, LCK_CR|LCK_CW|LCK_PR|LCK_PW, &lockh)) {
/*
 * Common tail for revalidation: translate -ENOENT ("already unlinked")
 * into success after sanity-checking the inode type; log other failures.
 * NOTE(review): the actual nlink update and return statements are elided
 * from this listing.
 */
2792 static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
2793 if (rc == -ENOENT) { /* Already unlinked. Just update nlink
2794 * and return success */
2796 /* This path cannot be hit for regular files unless in
2797 * case of obscure races, so no need to to validate
2799 if (!S_ISREG(inode->i_mode) &&
2800 !S_ISDIR(inode->i_mode))
2805 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
/*
 * Revalidate a dentry/inode against the MDS.  Two strategies:
 *  - if the server supports OBD_CONNECT_ATTRFID, do an IT_GETATTR intent
 *    lock by fid (no name needed), refresh the dentry, and drop it if the
 *    inode was unlinked;
 *  - otherwise, only if no covering UPDATE|LOOKUP MDS lock is cached,
 *    issue a plain mdc_getattr and rebuild the inode from the reply.
 * Finishes with a size glimpse for striped objects.
 * NOTE(review): many lines (rc declarations, GOTO/RETURN paths, the
 * intent-lock argument list) are elided from this listing.
 */
2813 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
2815 struct inode *inode = dentry->d_inode;
2816 struct ptlrpc_request *req = NULL;
2817 struct obd_export *exp;
/* presumably a should-never-happen guard (NULL inode?) — elided context */
2822 CERROR("REPORT THIS LINE TO PETER\n");
2825 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2826 inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2827 #if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,5,0))
2828 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_REVALIDATE, 1);
2831 exp = ll_i2mdcexp(inode);
2833 if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
2834 struct lookup_intent oit = { .it_op = IT_GETATTR };
2835 struct mdc_op_data op_data;
2837 /* Call getattr by fid, so do not provide name at all. */
2838 ll_prepare_mdc_op_data(&op_data, dentry->d_parent->d_inode,
2839 dentry->d_inode, NULL, 0, 0, NULL);
2840 rc = mdc_intent_lock(exp, &op_data, NULL, 0,
2841 /* we are not interested in name
2844 ll_mdc_blocking_ast, 0);
2846 rc = ll_inode_revalidate_fini(inode, rc);
2850 rc = revalidate_it_finish(req, DLM_REPLY_REC_OFF, &oit, dentry);
2852 ll_intent_release(&oit);
2856 /* Unlinked? Unhash dentry, so it is not picked up later by
2857 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2858 here to preserve get_cwd functionality on 2.6.
2860 if (!dentry->d_inode->i_nlink) {
2861 spin_lock(&dcache_lock);
2862 ll_drop_dentry(dentry);
2863 spin_unlock(&dcache_lock);
2866 ll_lookup_finish_locks(&oit, dentry);
/* no fid-getattr support: fall back to mdc_getattr, but only when no
 * cached MDS lock already guarantees our attributes are current */
2867 } else if (!ll_have_md_lock(dentry->d_inode,
2868 MDS_INODELOCK_UPDATE|MDS_INODELOCK_LOOKUP)) {
2869 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2871 obd_valid valid = OBD_MD_FLGETATTR;
/* regular files also fetch striping EA, sized to the MDS maximum */
2874 if (S_ISREG(inode->i_mode)) {
2875 rc = ll_get_max_mdsize(sbi, &ealen);
2878 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2880 ll_inode2fid(&fid, inode);
2881 rc = mdc_getattr(sbi->ll_mdc_exp, &fid, valid, ealen, &req);
2883 rc = ll_inode_revalidate_fini(inode, rc);
2887 rc = ll_prep_inode(sbi->ll_osc_exp, &inode, req, REPLY_REC_OFF,
2893 /* if object not yet allocated, don't validate size */
2894 if (ll_i2info(inode)->lli_smd == NULL)
2897 /* ll_glimpse_size will prefer locally cached writes if they extend
2899 rc = ll_glimpse_size(inode, 0);
2902 ptlrpc_req_finished(req);
2906 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
/*
 * getattr with intent (2.6 kernels): revalidate the inode against the MDS
 * first, then copy the refreshed inode attributes into @stat.  Size and
 * block count are read under the llite inode size lock.
 * NOTE(review): res declaration and the early-return on revalidate
 * failure are elided from this listing.
 */
2907 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2908 struct lookup_intent *it, struct kstat *stat)
2910 struct inode *inode = de->d_inode;
2913 res = ll_inode_revalidate_it(de, it);
2914 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETATTR, 1);
2919 stat->dev = inode->i_sb->s_dev;
2920 stat->ino = inode->i_ino;
2921 stat->mode = inode->i_mode;
2922 stat->nlink = inode->i_nlink;
2923 stat->uid = inode->i_uid;
2924 stat->gid = inode->i_gid;
2925 stat->rdev = kdev_t_to_nr(inode->i_rdev);
2926 stat->atime = inode->i_atime;
2927 stat->mtime = inode->i_mtime;
2928 stat->ctime = inode->i_ctime;
2929 #ifdef HAVE_INODE_BLKSIZE
2930 stat->blksize = inode->i_blksize;
2932 stat->blksize = 1<<inode->i_blkbits;
/* i_size/i_blocks may be updated concurrently; read under the size lock */
2935 ll_inode_size_lock(inode, 0);
2936 stat->size = i_size_read(inode);
2937 stat->blocks = inode->i_blocks;
2938 ll_inode_size_unlock(inode, 0);
/* Plain getattr entry point: wraps ll_getattr_it() with a fresh
 * IT_GETATTR lookup intent. */
2942 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2944 struct lookup_intent it = { .it_op = IT_GETATTR };
2946 return ll_getattr_it(mnt, de, &it, stat);
/*
 * POSIX ACL permission check against the ACL cached on the llite inode.
 * Takes a reference on the cached acl under lli_lock, checks @mask with
 * posix_acl_permission(), and releases it.  NOTE(review): the NULL-acl
 * path and the non-CONFIG_FS_POSIX_ACL fallback return are elided.
 */
2951 int lustre_check_acl(struct inode *inode, int mask)
2953 #ifdef CONFIG_FS_POSIX_ACL
2954 struct ll_inode_info *lli = ll_i2info(inode);
2955 struct posix_acl *acl;
/* dup under the spinlock so the cached acl can be swapped out safely */
2959 spin_lock(&lli->lli_lock);
2960 acl = posix_acl_dup(lli->lli_posix_acl);
2961 spin_unlock(&lli->lli_lock);
2966 rc = posix_acl_permission(inode, acl, mask);
2967 posix_acl_release(acl);
2975 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10))
/* 2.6.10+: delegate the permission check to generic_permission() with
 * lustre_check_acl() as the ACL callback; tally the op for lprocfs. */
2976 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2978 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2979 inode->i_ino, inode->i_generation, inode, mask);
2981 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2982 return generic_permission(inode, mask, lustre_check_acl);
2985 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
/*
 * Pre-2.6.10 variant: open-coded permission logic in the style of the
 * kernel's generic_permission — owner bits, then ACL, then group bits,
 * then other bits, then capability overrides (DAC_OVERRIDE /
 * DAC_READ_SEARCH).  NOTE(review): this listing elides many lines
 * (the #else between the two prototypes, owner-bit check, labels,
 * return statements); only visible code is annotated.
 */
2986 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2988 int ll_inode_permission(struct inode *inode, int mask)
2991 int mode = inode->i_mode;
2994 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2995 inode->i_ino, inode->i_generation, inode, mask);
2996 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
/* writes denied on read-only fs for regular files, dirs and symlinks */
2998 if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
2999 (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
3001 if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
3003 if (current->fsuid == inode->i_uid) {
/* ACL present when group bits don't cover the request — ask the ACL */
3006 if (((mode >> 3) & mask & S_IRWXO) != mask)
3008 rc = lustre_check_acl(inode, mask);
3012 goto check_capabilities;
3016 if (in_group_p(inode->i_gid))
3019 if ((mode & mask & S_IRWXO) == mask)
/* capability overrides: DAC_OVERRIDE for everything except executing a
 * file with no exec bit; DAC_READ_SEARCH for reads and dir lookups */
3023 if (!(mask & MAY_EXEC) ||
3024 (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
3025 if (capable(CAP_DAC_OVERRIDE))
3028 if (capable(CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) ||
3029 (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))))
/* -o localflock - only provides locally consistent flock locks */
/* Default file_operations table: no .flock/.lock entries, so the VFS
 * falls back to node-local POSIX/flock locking. */
3037 struct file_operations ll_file_operations = {
3038 .read = ll_file_read,
3039 #ifdef HAVE_FILE_READV
3040 .readv = ll_file_readv,
3042 .aio_read = ll_file_aio_read,
3044 .write = ll_file_write,
3045 #ifdef HAVE_FILE_WRITEV
3046 .writev = ll_file_writev,
3048 .aio_write = ll_file_aio_write,
3050 .ioctl = ll_file_ioctl,
3051 .open = ll_file_open,
3052 .release = ll_file_release,
3053 .mmap = ll_file_mmap,
3054 .llseek = ll_file_seek,
3055 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
3056 .sendfile = ll_file_sendfile,
/* file_operations for -o flock mounts: same as the default table but
 * wires .flock/.lock to ll_file_flock for cluster-wide locking. */
3061 struct file_operations ll_file_operations_flock = {
3062 .read = ll_file_read,
3063 #ifdef HAVE_FILE_READV
3064 .readv = ll_file_readv,
3066 .aio_read = ll_file_aio_read,
3068 .write = ll_file_write,
3069 #ifdef HAVE_FILE_WRITEV
3070 .writev = ll_file_writev,
3072 .aio_write = ll_file_aio_write,
3074 .ioctl = ll_file_ioctl,
3075 .open = ll_file_open,
3076 .release = ll_file_release,
3077 .mmap = ll_file_mmap,
3078 .llseek = ll_file_seek,
3079 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
3080 .sendfile = ll_file_sendfile,
3083 #ifdef HAVE_F_OP_FLOCK
3084 .flock = ll_file_flock,
3086 .lock = ll_file_flock
/* These are for -o noflock - to return ENOSYS on flock calls */
3090 struct file_operations ll_file_operations_noflock = {
3091 .read = ll_file_read,
3092 #ifdef HAVE_FILE_READV
3093 .readv = ll_file_readv,
3095 .aio_read = ll_file_aio_read,
3097 .write = ll_file_write,
3098 #ifdef HAVE_FILE_WRITEV
3099 .writev = ll_file_writev,
3101 .aio_write = ll_file_aio_write,
3103 .ioctl = ll_file_ioctl,
3104 .open = ll_file_open,
3105 .release = ll_file_release,
3106 .mmap = ll_file_mmap,
3107 .llseek = ll_file_seek,
3108 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
3109 .sendfile = ll_file_sendfile,
3112 #ifdef HAVE_F_OP_FLOCK
/* ll_file_noflock rejects the request so applications see ENOSYS */
3113 .flock = ll_file_noflock,
3115 .lock = ll_file_noflock
/* inode_operations for regular Lustre files: setattr/truncate, getattr
 * (2.6) or revalidate_it (2.4 intent patches), permission and xattrs. */
3118 struct inode_operations ll_file_inode_operations = {
3119 #ifdef HAVE_VFS_INTENT_PATCHES
3120 .setattr_raw = ll_setattr_raw,
3122 .setattr = ll_setattr,
3123 .truncate = ll_truncate,
3124 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
3125 .getattr = ll_getattr,
3127 .revalidate_it = ll_inode_revalidate_it,
3129 .permission = ll_inode_permission,
3130 .setxattr = ll_setxattr,
3131 .getxattr = ll_getxattr,
3132 .listxattr = ll_listxattr,
3133 .removexattr = ll_removexattr,
/* dynamic ioctl number support routines */
/* Registry of externally-registered ioctl handlers: a list of
 * struct llioc_data entries protected by a read-write semaphore. */
3137 static struct llioc_ctl_data {
3138 struct rw_semaphore ioc_sem;
3139 struct list_head ioc_head;
3141 __RWSEM_INITIALIZER(llioc.ioc_sem),
3142 CFS_LIST_HEAD_INIT(llioc.ioc_head)
/* One registered ioctl handler: callback plus the list of ioctl command
 * numbers it services (flexible trailing array iocd_cmd). */
3147 struct list_head iocd_list;
3148 unsigned int iocd_size;
3149 llioc_callback_t iocd_cb;
3150 unsigned int iocd_count;
3151 unsigned int iocd_cmd[0];
/*
 * Register a dynamic ioctl handler: callback @cb for the @count command
 * numbers in @cmd.  Returns an opaque cookie (the allocation itself,
 * judging by the unregister path) for later ll_iocontrol_unregister().
 * NOTE(review): the RETURN statements and size declaration are elided
 * from this listing.
 */
3154 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3157 struct llioc_data *in_data = NULL;
3160 if (cb == NULL || cmd == NULL ||
3161 count > LLIOC_MAX_CMD || count < 0)
/* allocate header plus the trailing iocd_cmd[count] array in one shot */
3164 size = sizeof(*in_data) + count * sizeof(unsigned int);
3165 OBD_ALLOC(in_data, size);
3166 if (in_data == NULL)
3169 memset(in_data, 0, sizeof(*in_data));
3170 in_data->iocd_size = size;
3171 in_data->iocd_cb = cb;
3172 in_data->iocd_count = count;
3173 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
/* publish under the write lock */
3175 down_write(&llioc.ioc_sem);
3176 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3177 up_write(&llioc.ioc_sem);
/*
 * Unregister a handler previously returned by ll_iocontrol_register().
 * Walks the registry under the write lock, unlinks and frees the entry
 * matching @magic; warns if the cookie is unknown.  NOTE(review): the
 * magic-comparison line and the returns are elided from this listing.
 */
3182 void ll_iocontrol_unregister(void *magic)
3184 struct llioc_data *tmp;
3189 down_write(&llioc.ioc_sem);
3190 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* iocd_size remembered from registration covers the whole allocation */
3192 unsigned int size = tmp->iocd_size;
3194 list_del(&tmp->iocd_list);
3195 up_write(&llioc.ioc_sem);
3197 OBD_FREE(tmp, size);
3201 up_write(&llioc.ioc_sem);
3203 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3206 EXPORT_SYMBOL(ll_iocontrol_register);
3207 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch an ioctl to the registered dynamic handlers.  Under the read
 * lock, scan each handler's command list; on a match invoke its callback,
 * which may stop the iteration by returning LLIOC_STOP.  The handler's
 * result code is passed back through *rcp (rc defaults to -EINVAL when
 * nothing handled the command).  NOTE(review): the tail (storing rc into
 * *rcp and the RETURN) is elided from this listing.
 */
3209 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
3210 unsigned int cmd, unsigned long arg, int *rcp)
3212 enum llioc_iter ret = LLIOC_CONT;
3213 struct llioc_data *data;
3214 int rc = -EINVAL, i;
3216 down_read(&llioc.ioc_sem);
3217 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3218 for (i = 0; i < data->iocd_count; i++) {
3219 if (cmd != data->iocd_cmd[i])
3222 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3226 if (ret == LLIOC_STOP)
3229 up_read(&llioc.ioc_sem);