1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
6 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 only,
10 * as published by the Free Software Foundation.
12 * This program is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License version 2 for more details (a copy is included
16 * in the LICENSE file that accompanied this code).
18 * You should have received a copy of the GNU General Public License
19 * version 2 along with this program; If not, see
20 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
22 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23 * CA 95054 USA or visit www.sun.com if you need additional information or
29 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
30 * Use is subject to license terms.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include "llite_internal.h"
49 #include <lustre/ll_fiemap.h>
51 /* also used by llite/special.c:ll_special_open() */
/* Allocate a per-open-file ll_file_data descriptor from the slab cache.
 * NOTE(review): this view of the file is elided; the NULL-check and the
 * RETURN of fd are not visible here — confirm against the full source. */
52 struct ll_file_data *ll_file_data_get(void)
54 struct ll_file_data *fd;
56 OBD_SLAB_ALLOC_PTR(fd, ll_file_data_slab);
/* Release a descriptor previously obtained from ll_file_data_get() back
 * to the ll_file_data slab cache. */
60 static void ll_file_data_put(struct ll_file_data *fd)
63 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/* Close an MDS open handle for @inode: fill an obdo with the inode's
 * current attributes (size/blocks only for regular files), send mdc_close()
 * to the MDS, and clean up the close request and replay data.
 * NOTE(review): elided view — obdo allocation, error paths and the final
 * RETURN are not fully visible here. */
66 static int ll_close_inode_openhandle(struct inode *inode,
67 struct obd_client_handle *och)
69 struct ptlrpc_request *req = NULL;
70 struct obd_device *obd;
72 struct mdc_op_data data = { { 0 } };
77 obd = class_exp2obd(ll_i2mdcexp(inode));
79 CERROR("Invalid MDC connection handle "LPX64"\n",
80 ll_i2mdcexp(inode)->exp_handle.h_cookie);
85 * here we check if this is forced umount. If so this is called on
86 * canceling "open lock" and we do not call mdc_close() in this case, as
87 * it will not be successful, as import is already deactivated.
94 RETURN(-ENOMEM); // XXX We leak openhandle and request here.
96 oa->o_id = inode->i_ino;
97 oa->o_valid = OBD_MD_FLID;
/* Always report type/mode/times back to the MDS ... */
98 valid = OBD_MD_FLTYPE | OBD_MD_FLMODE | OBD_MD_FLATIME |
99 OBD_MD_FLMTIME | OBD_MD_FLCTIME;
/* ... but size/blocks are only meaningful for regular files. */
100 if (S_ISREG(inode->i_mode))
101 valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS;
102 obdo_from_inode(oa, inode, valid);
/* Tell the MDS there are writes not yet committed on the OSTs. */
103 if (ll_is_inode_dirty(inode)) {
104 oa->o_flags = MDS_BFLAG_UNCOMMITTED_WRITES;
105 oa->o_valid |= OBD_MD_FLFLAGS;
107 ll_inode2fid(&data.fid1, inode);
108 rc = mdc_close(ll_i2mdcexp(inode), &data, oa, och, &req);
110 /* We are the last writer, so the MDS has instructed us to get
111 * the file size and any write cookies, then close again. */
112 ll_queue_done_writing(inode);
115 CERROR("inode %lu mdc close failed: rc = %d\n",
122 rc = ll_objects_destroy(req, inode);
124 CERROR("inode %lu ll_objects destroy: rc = %d\n",
128 ptlrpc_req_finished(req); /* This is close request */
131 mdc_clear_open_replay_data(och);
/* Drop the shared MDS open handle for one open mode (write/exec/read).
 * The handle is only actually closed when its use count reaches zero;
 * @flags selects which of the three per-mode handles/counters to use.
 * Serialized by lli_och_sem.
 * NOTE(review): elided view — the och = *och_p / *och_p = NULL swap under
 * the semaphore is not visible here; confirm against the full source. */
136 int ll_mdc_real_close(struct inode *inode, int flags)
138 struct ll_inode_info *lli = ll_i2info(inode);
140 struct obd_client_handle **och_p;
141 struct obd_client_handle *och;
/* Select the per-mode handle and use counter. */
146 if (flags & FMODE_WRITE) {
147 och_p = &lli->lli_mds_write_och;
148 och_usecount = &lli->lli_open_fd_write_count;
149 } else if (flags & FMODE_EXEC) {
150 och_p = &lli->lli_mds_exec_och;
151 och_usecount = &lli->lli_open_fd_exec_count;
153 LASSERT(flags & FMODE_READ);
154 och_p = &lli->lli_mds_read_och;
155 och_usecount = &lli->lli_open_fd_read_count;
158 down(&lli->lli_och_sem);
159 if (*och_usecount) { /* There are still users of this handle, so
161 up(&lli->lli_och_sem);
166 up(&lli->lli_och_sem);
168 if (och) { /* There might be a race and somebody have freed this och
170 rc = ll_close_inode_openhandle(inode, och);
/* Poison the cookie so stale use of this handle is detectable. */
171 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
172 OBD_FREE(och, sizeof *och);
/* Per-file-descriptor close path: drop any group lock, decrement the
 * per-mode MDS open-handle use count, and — unless we still hold a
 * granted OPEN DLM lock that lets us skip talking to the MDS — do the
 * real MDS close via ll_mdc_real_close().  Finally frees the fd. */
178 int ll_mdc_close(struct obd_export *mdc_exp, struct inode *inode,
181 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
182 struct ll_inode_info *lli = ll_i2info(inode);
186 /* clear group lock, if present */
187 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
188 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
189 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
190 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP,
194 /* Let's see if we have good enough OPEN lock on the file and if
195 we can skip talking to MDS */
196 if (file->f_dentry->d_inode) { /* Can this ever be false? */
/* LDLM_FL_TEST_LOCK: only probe for a granted lock, don't take a ref. */
198 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
199 struct lustre_handle lockh;
200 struct inode *inode = file->f_dentry->d_inode;
201 struct ldlm_res_id file_res_id;
203 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
204 fid_build_reg_res_name(ll_inode_lu_fid(inode), &file_res_id);
/* Drop our reference on the per-mode MDS open handle. */
206 down(&lli->lli_och_sem);
207 if (fd->fd_omode & FMODE_WRITE) {
209 LASSERT(lli->lli_open_fd_write_count);
210 lli->lli_open_fd_write_count--;
211 } else if (fd->fd_omode & FMODE_EXEC) {
213 LASSERT(lli->lli_open_fd_exec_count);
214 lli->lli_open_fd_exec_count--;
217 LASSERT(lli->lli_open_fd_read_count);
218 lli->lli_open_fd_read_count--;
220 up(&lli->lli_och_sem);
/* No matching OPEN ibits lock cached: must close on the MDS now. */
222 if (!ldlm_lock_match(mdc_exp->exp_obd->obd_namespace, flags,
223 &file_res_id, LDLM_IBITS, &policy,lockmode,
225 rc = ll_mdc_real_close(file->f_dentry->d_inode,
229 CERROR("Releasing a file %p with negative dentry %p. Name %s",
230 file, file->f_dentry, file->f_dentry->d_name.name);
233 LUSTRE_FPRIVATE(file) = NULL;
234 ll_file_data_put(fd);
239 int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm);
241 /* While this returns an error code, fput() the caller does not, so we need
242 * to make every effort to clean up all of our state here. Also, applications
243 * rarely check close errors and even if an error is returned they will not
244 * re-try the close call.
/* VFS ->release() handler: last reference on @file is being dropped.
 * Stops statahead if this fd started it, short-circuits for the root
 * dentry, propagates any pending async write error for regular files,
 * and performs the MDS close via ll_mdc_close(). */
246 int ll_file_release(struct inode *inode, struct file *file)
248 struct ll_file_data *fd;
249 struct ll_sb_info *sbi = ll_i2sbi(inode);
250 struct ll_inode_info *lli = ll_i2info(inode);
251 struct lov_stripe_md *lsm = lli->lli_smd;
255 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
256 inode->i_generation, inode);
/* Don't count releases of the root dentry in the stats. */
258 if (inode->i_sb->s_root != file->f_dentry)
259 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
260 fd = LUSTRE_FPRIVATE(file);
263 /* The last ref on @file, maybe not the owner pid of statahead.
264 * Different processes can open the same dir, "ll_opendir_key" means:
265 * it is me that should stop the statahead thread. */
266 if (lli->lli_opendir_key == fd && lli->lli_opendir_pid != 0)
267 ll_stop_statahead(inode, lli->lli_opendir_key);
/* The root of the filesystem has no MDS open handle to close. */
269 if (inode->i_sb->s_root == file->f_dentry) {
270 LUSTRE_FPRIVATE(file) = NULL;
271 ll_file_data_put(fd);
/* Surface any deferred async write error exactly once. */
276 lov_test_and_clear_async_rc(lsm);
277 lli->lli_async_rc = 0;
279 rc = ll_mdc_close(sbi->ll_mdc_exp, inode, file);
281 if (OBD_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, obd_fail_val))
282 libcfs_debug_dumplog();
/* Issue an OPEN intent lock request to the MDS for @file.  Requests an
 * OPEN DLM lock unless @lmm/@lmmsize indicate we are only setting stripe
 * info.  On success, updates the inode from the reply and attaches the
 * lock handle to it.  -ESTALE is handled on a quiet exit path to avoid
 * log flooding. */
287 static int ll_intent_file_open(struct file *file, void *lmm,
288 int lmmsize, struct lookup_intent *itp)
290 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
291 struct mdc_op_data data = { { 0 } };
292 struct dentry *parent = file->f_dentry->d_parent;
293 const char *name = file->f_dentry->d_name.name;
294 const int len = file->f_dentry->d_name.len;
295 struct inode *inode = file->f_dentry->d_inode;
296 struct ptlrpc_request *req;
303 ll_prepare_mdc_op_data(&data, parent->d_inode, inode,
304 name, len, O_RDWR, NULL);
306 /* Usually we come here only for NFSD, and we want open lock.
307 But we can also get here with pre 2.6.15 patchless kernels, and in
308 that case that lock is also ok */
309 /* We can also get here if there was cached open handle in revalidate_it
310 * but it disappeared while we were getting from there to ll_file_open.
311 * But this means this file was closed and immediately opened which
312 * makes a good candidate for using OPEN lock */
313 /* If lmmsize & lmm are not 0, we are just setting stripe info
314 * parameters. No need for the open lock */
315 if (!lmm && !lmmsize)
316 itp->it_flags |= MDS_OPEN_LOCK;
318 rc = mdc_intent_lock(sbi->ll_mdc_exp, &data, lmm, lmmsize, itp,
319 0 /*unused */, &req, ll_mdc_blocking_ast, 0);
321 /* reason for keeping own exit path - don't flood log
322 * with messages with -ESTALE errors.
324 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
325 it_open_error(DISP_OPEN_OPEN, itp))
327 ll_release_openhandle(file->f_dentry, itp);
/* Negative lookup: the file does not exist on the MDS. */
331 if (it_disposition(itp, DISP_LOOKUP_NEG))
332 GOTO(out, rc = -ENOENT);
334 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
335 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
336 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* Refresh the inode from the MDS reply, then bind the granted lock
 * (if any) to the inode so later matches can find it. */
340 rc = ll_prep_inode(sbi->ll_osc_exp, &file->f_dentry->d_inode,
341 req, DLM_REPLY_REC_OFF, NULL);
342 if (itp->d.lustre.it_lock_mode)
343 mdc_set_lock_data(&itp->d.lustre.it_lock_handle,
347 ptlrpc_req_finished(itp->d.lustre.it_data);
348 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
349 ll_intent_drop_lock(itp);
/* Populate an obd_client_handle from the MDS open reply carried by @it:
 * copies the server file handle, stamps the magic, records the IO epoch
 * on the inode, and registers the open for replay on recovery. */
355 static void ll_och_fill(struct ll_inode_info *lli, struct lookup_intent *it,
356 struct obd_client_handle *och)
358 struct ptlrpc_request *req = it->d.lustre.it_data;
359 struct mds_body *body;
363 body = lustre_msg_buf(req->rq_repmsg, DLM_REPLY_REC_OFF, sizeof(*body));
364 LASSERT(body != NULL); /* reply already checked out */
365 /* and swabbed in mdc_enqueue */
366 LASSERT(lustre_rep_swabbed(req, DLM_REPLY_REC_OFF));
368 memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
369 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
370 lli->lli_io_epoch = body->io_epoch;
372 mdc_set_open_replay_data(och, it->d.lustre.it_data);
/* Finish the client-side part of an open: optionally fill @och from the
 * intent reply, attach @fd as the file's private data, initialize
 * readahead state, and remember the open mode for close time.
 * NOTE(review): elided view — the "if (och)" guard around ll_och_fill()
 * and the RETURN are not visible here. */
375 int ll_local_open(struct file *file, struct lookup_intent *it,
376 struct ll_file_data *fd, struct obd_client_handle *och)
380 LASSERT(!LUSTRE_FPRIVATE(file));
385 ll_och_fill(ll_i2info(file->f_dentry->d_inode), it, och);
386 LUSTRE_FPRIVATE(file) = fd;
387 ll_readahead_init(file->f_dentry->d_inode, &fd->fd_ras);
388 fd->fd_omode = it->it_flags;
393 /* Open a file, and (for the very first open) create objects on the OSTs at
394 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
395 * creation or open until ll_lov_setstripe() ioctl is called. We grab
396 * lli_open_sem to ensure no other process will create objects, send the
397 * stripe MD to the MDS, or try to destroy the objects if that fails.
399 * If we already have the stripe MD locally then we don't request it in
400 * mdc_open(), by passing a lmm_size = 0.
402 * It is up to the application to ensure no other processes open this file
403 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
404 * used. We might be able to avoid races of that sort by getting lli_open_sem
405 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
406 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/* VFS ->open() handler.  Allocates the per-fd ll_file_data, arms the
 * statahead key for directories, then either reuses an existing per-mode
 * MDS open handle or performs a new OPEN intent to the MDS (outside
 * lli_och_sem to avoid a deadlock with the blocking AST).  Object
 * creation on the OSTs is skipped for O_LOV_DELAY_CREATE (see the block
 * comment preceding this function in the full source). */
408 int ll_file_open(struct inode *inode, struct file *file)
410 struct ll_inode_info *lli = ll_i2info(inode);
411 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
412 .it_flags = file->f_flags };
413 struct lov_stripe_md *lsm;
414 struct obd_client_handle **och_p = NULL;
415 __u64 *och_usecount = NULL;
416 struct ll_file_data *fd;
417 int rc = 0, opendir_set = 0;
420 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
421 inode->i_generation, inode, file->f_flags);
423 #ifdef HAVE_VFS_INTENT_PATCHES
426 it = file->private_data; /* XXX: compat macro */
427 file->private_data = NULL; /* prevent ll_local_open assertion */
430 fd = ll_file_data_get();
432 GOTO(out_och_free, rc = -ENOMEM);
/* For directories, the first opener's fd becomes the statahead key:
 * only that opener may later stop the statahead thread. */
434 if (S_ISDIR(inode->i_mode)) {
435 spin_lock(&lli->lli_lock);
436 if (lli->lli_opendir_key == NULL && lli->lli_opendir_pid == 0) {
437 LASSERT(lli->lli_sai == NULL);
438 lli->lli_opendir_key = fd;
439 lli->lli_opendir_pid = cfs_curproc_pid();
442 spin_unlock(&lli->lli_lock);
/* The filesystem root needs no MDS open handle. */
445 if (inode->i_sb->s_root == file->f_dentry) {
446 LUSTRE_FPRIVATE(file) = fd;
450 if (!it || !it->d.lustre.it_disposition) {
451 /* Convert f_flags into access mode. We cannot use file->f_mode,
452 * because everything but O_ACCMODE mask was stripped from it */
453 if ((oit.it_flags + 1) & O_ACCMODE)
455 if (file->f_flags & O_TRUNC)
456 oit.it_flags |= FMODE_WRITE;
458 /* kernel only call f_op->open in dentry_open. filp_open calls
459 * dentry_open after call to open_namei that checks permissions.
460 * Only nfsd_open call dentry_open directly without checking
461 * permissions and because of that this code below is safe. */
462 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
463 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
465 /* We do not want O_EXCL here, presumably we opened the file
466 * already? XXX - NFS implications? */
467 oit.it_flags &= ~O_EXCL;
469 /* bug20584, if "it_flags" contains O_CREAT, the file will be
470 * created if necessary, then "IT_CREAT" should be set to keep
471 * consistent with it */
472 if (oit.it_flags & O_CREAT)
473 oit.it_op |= IT_CREAT;
/* Honor the mount-wide "direct I/O by default" option for regular,
 * non-exec opens. */
478 if (ll_i2sbi(inode)->ll_direct_io_default &&
479 !S_ISDIR(inode->i_mode) &&
480 !(it->it_flags & FMODE_EXEC))
481 file->f_flags |= O_DIRECT;
484 /* Let's see if we have file open on MDS already. */
485 if (it->it_flags & FMODE_WRITE) {
486 och_p = &lli->lli_mds_write_och;
487 och_usecount = &lli->lli_open_fd_write_count;
488 } else if (it->it_flags & FMODE_EXEC) {
489 och_p = &lli->lli_mds_exec_och;
490 och_usecount = &lli->lli_open_fd_exec_count;
492 och_p = &lli->lli_mds_read_och;
493 och_usecount = &lli->lli_open_fd_read_count;
496 LASSERTF(it->it_flags != 0, "it %p dist %d \n", it,
497 it->d.lustre.it_disposition);
499 down(&lli->lli_och_sem);
500 if (*och_p) { /* Open handle is present */
501 if (it_disposition(it, DISP_OPEN_OPEN)) {
502 /* Well, there's extra open request that we do not need,
503 let's close it somehow. This will decref request. */
504 rc = it_open_error(DISP_OPEN_OPEN, it);
506 up(&lli->lli_och_sem);
507 GOTO(out_openerr, rc);
509 ll_release_openhandle(file->f_dentry, it);
510 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats,
515 rc = ll_local_open(file, it, fd, NULL);
517 LASSERTF(rc == 0, "rc = %d\n", rc);
519 LASSERT(*och_usecount == 0);
520 if (!it->d.lustre.it_disposition) {
521 /* We cannot just request lock handle now, new ELC code
522 means that one of other OPEN locks for this file
523 could be cancelled, and since blocking ast handler
524 would attempt to grab och_sem as well, that would
525 result in a deadlock */
526 up(&lli->lli_och_sem);
527 it->it_create_mode |= M_CHECK_STALE;
528 rc = ll_intent_file_open(file, NULL, 0, it);
529 it->it_create_mode &= ~M_CHECK_STALE;
531 GOTO(out_openerr, rc);
536 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
538 GOTO(out_och_free, rc = -ENOMEM);
542 /* mdc_intent_lock() didn't get a request ref if there was an
543 * open error, so don't do cleanup on the request here
545 /* XXX (green): Should not we bail out on any error here, not
546 * just open error? */
547 rc = it_open_error(DISP_OPEN_OPEN, it);
549 GOTO(out_och_free, rc);
551 LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
553 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
554 rc = ll_local_open(file, it, fd, *och_p);
555 LASSERTF(rc == 0, "rc = %d\n", rc);
557 up(&lli->lli_och_sem);
560 /* Must do this outside lli_och_sem lock to prevent deadlock where
561 different kind of OPEN lock for this same inode gets cancelled
562 by ldlm_cancel_lru */
563 if (!S_ISREG(inode->i_mode))
564 GOTO(out_och_free, rc);
/* Delay OST object creation for O_LOV_DELAY_CREATE / read-only opens. */
568 if (file->f_flags & O_LOV_DELAY_CREATE ||
569 !(file->f_mode & FMODE_WRITE)) {
570 CDEBUG(D_INODE, "object creation was delayed\n");
571 GOTO(out_och_free, rc);
574 file->f_flags &= ~O_LOV_DELAY_CREATE;
575 GOTO(out_och_free, rc);
/* Cleanup / error exit paths below. */
578 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
579 ptlrpc_req_finished(it->d.lustre.it_data);
580 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
584 ll_open_complete(inode);
586 if (och_p && *och_p) {
587 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
588 *och_p = NULL; /* OBD_FREE writes some magic there */
591 up(&lli->lli_och_sem);
593 if (opendir_set != 0)
594 ll_stop_statahead(inode, lli->lli_opendir_key);
596 ll_file_data_put(fd);
602 /* Fills the obdo with the attributes for the inode defined by lsm.
 * Issues an async getattr over all stripes via a ptlrpc request set and
 * waits for completion; on return oa->o_valid is masked down to the
 * fields the OSTs actually provide. */
603 int ll_lsm_getattr(struct obd_export *exp, struct lov_stripe_md *lsm,
606 struct ptlrpc_request_set *set;
607 struct obd_info oinfo = { { { 0 } } };
611 LASSERT(lsm != NULL);
613 memset(oa, 0, sizeof *oa);
616 oa->o_id = lsm->lsm_object_id;
617 oa->o_gr = lsm->lsm_object_gr;
618 oa->o_mode = S_IFREG;
619 oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLSIZE |
620 OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ | OBD_MD_FLMTIME |
621 OBD_MD_FLCTIME | OBD_MD_FLGROUP;
623 set = ptlrpc_prep_set();
627 rc = obd_getattr_async(exp, &oinfo, set);
629 rc = ptlrpc_set_wait(set);
630 ptlrpc_set_destroy(set);
/* Keep only the attributes the OSTs are authoritative for. */
635 oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ | OBD_MD_FLMTIME |
636 OBD_MD_FLCTIME | OBD_MD_FLSIZE);
/* Map an extent DLM @lock back to the stripe index it covers within the
 * inode's LOV stripe set.  Single-stripe files short-circuit to 0; the
 * result is sanity-checked against the lock's resource name.
 * Returns the stripe index, or -ELDLM_NO_LOCK_DATA on mismatch. */
640 static int ll_lock_to_stripe_offset(struct inode *inode, struct ldlm_lock *lock)
642 struct ll_inode_info *lli = ll_i2info(inode);
643 struct lov_stripe_md *lsm = lli->lli_smd;
644 struct obd_export *exp = ll_i2obdexp(inode);
647 struct ldlm_lock *lock;
648 } key = { .name = KEY_LOCK_TO_STRIPE, .lock = lock };
649 __u32 stripe, vallen = sizeof(stripe);
650 struct lov_oinfo *loinfo;
654 if (lsm->lsm_stripe_count == 1)
655 GOTO(check, stripe = 0);
657 /* get our offset in the lov */
658 rc = obd_get_info(exp, sizeof(key), &key, &vallen, &stripe, lsm);
660 CERROR("obd_get_info: rc = %d\n", rc);
663 LASSERT(stripe < lsm->lsm_stripe_count);
/* Verify the lock's resource really names this stripe's object. */
666 loinfo = lsm->lsm_oinfo[stripe];
667 if (!osc_res_name_eq(loinfo->loi_id, loinfo->loi_gr,
668 &lock->l_resource->lr_name)) {
669 LDLM_ERROR(lock, "resource doesn't match object "LPU64"/"LPU64,
670 loinfo->loi_id, loinfo->loi_gr);
671 RETURN(-ELDLM_NO_LOCK_DATA);
677 /* Get extra page reference to ensure it is not going away.
 * Paired with the page_cache_release() in ll_page_removal_cb(). */
678 void ll_pin_extent_cb(void *data)
680 struct page *page = data;
682 page_cache_get(page);
686 /* Flush the page from page cache for an extent as its canceled.
687 * Page to remove is delivered as @data.
689 * No one can dirty the extent until we've finished our work and they cannot
690 * enqueue another lock. The DLM protects us from ll_file_read/write here,
691 * but other kernel actors could have pages locked.
693 * If @discard is set, there is no need to write the page if it is dirty.
695 * Called with the DLM lock held. */
696 int ll_page_removal_cb(void *data, int discard)
699 struct page *page = data;
700 struct address_space *mapping;
704 /* We have page reference already from ll_pin_page */
707 /* Already truncated by somebody */
711 mapping = page->mapping;
/* Drop any mmap mappings over this page before tearing it down. */
713 ll_teardown_mmaps(mapping,
714 (__u64)page->index << PAGE_CACHE_SHIFT,
715 ((__u64)page->index<<PAGE_CACHE_SHIFT)|
717 LL_CDEBUG_PAGE(D_PAGE, page, "removing page\n");
718 if (!discard && PageWriteback(page))
719 wait_on_page_writeback(page);
/* Dirty and not discarding: write it back synchronously first. */
721 if (!discard && clear_page_dirty_for_io(page)) {
722 rc = ll_call_writepage(page->mapping->host, page);
723 /* either waiting for io to complete or reacquiring
724 * the lock that the failed writepage released */
726 wait_on_page_writeback(page);
728 CERROR("writepage inode %lu(%p) of page %p "
729 "failed: %d\n", mapping->host->i_ino,
730 mapping->host, page, rc);
/* Record the failure on the mapping so fsync/close sees it. */
732 set_bit(AS_ENOSPC, &mapping->flags);
734 set_bit(AS_EIO, &mapping->flags);
737 if (page->mapping != NULL) {
738 struct ll_async_page *llap = llap_cast_private(page);
739 // checking again to account for writeback's lock_page()
740 LL_CDEBUG_PAGE(D_PAGE, page, "truncating\n");
742 ll_ra_accounting(llap, page->mapping);
743 ll_truncate_complete_page(page);
/* Release the reference taken in ll_pin_extent_cb(). */
747 LASSERT(!PageWriteback(page));
749 page_cache_release(page);
/* Blocking/cancel AST for client extent locks.  On cancellation, shrinks
 * the known-minimum-size (kms) of the affected stripe to what remaining
 * locks still guarantee, then kicks done-writing processing.
 * NOTE(review): elided view — flag dispatch and iput path not visible. */
754 int ll_extent_lock_cancel_cb(struct ldlm_lock *lock, struct ldlm_lock_desc *new,
755 void *data, int flag)
758 struct ll_inode_info *lli;
759 struct lov_stripe_md *lsm;
/* Guard against a bogus small-integer cookie passed as a pointer. */
765 if ((unsigned long)data > 0 && (unsigned long)data < 0x1000) {
766 LDLM_ERROR(lock, "cancelling lock with bad data %p", data);
770 inode = ll_inode_from_lock(lock);
773 lli = ll_i2info(inode);
776 if (lli->lli_smd == NULL)
780 stripe = ll_lock_to_stripe_offset(inode, lock);
/* Recompute this stripe's kms under both the LOV and resource locks. */
784 lov_stripe_lock(lsm);
785 lock_res_and_lock(lock);
786 kms = ldlm_extent_shift_kms(lock,
787 lsm->lsm_oinfo[stripe]->loi_kms);
789 if (lsm->lsm_oinfo[stripe]->loi_kms != kms)
790 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
791 lsm->lsm_oinfo[stripe]->loi_kms, kms);
792 lsm->lsm_oinfo[stripe]->loi_kms = kms;
793 unlock_res_and_lock(lock);
794 lov_stripe_unlock(lsm);
795 ll_try_done_writing(inode);
/* Completion AST for client-side async extent lock enqueues.  Blocked
 * states are not expected (LBUG).  On grant, merges the server-supplied
 * LVB size into the stripe's rss/kms, wakes waiters, and drops the PR
 * reference taken at enqueue time. */
804 int ll_async_completion_ast(struct ldlm_lock *lock, int flags, void *data)
806 /* XXX ALLOCATE - 160 bytes */
807 struct inode *inode = ll_inode_from_lock(lock);
808 struct ll_inode_info *lli = ll_i2info(inode);
809 struct lustre_handle lockh = { 0 };
814 if (flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED |
815 LDLM_FL_BLOCK_CONV)) {
816 LBUG(); /* not expecting any blocked async locks yet */
817 LDLM_DEBUG(lock, "client-side async enqueue returned a blocked "
819 ldlm_lock_dump(D_OTHER, lock, 0);
820 ldlm_reprocess_all(lock->l_resource);
824 LDLM_DEBUG(lock, "client-side async enqueue: granted/glimpsed");
826 stripe = ll_lock_to_stripe_offset(inode, lock);
/* Server sent an LVB: fold its size into rss and kms for the stripe. */
830 if (lock->l_lvb_len) {
831 struct lov_stripe_md *lsm = lli->lli_smd;
833 lvb = lock->l_lvb_data;
834 lsm->lsm_oinfo[stripe].loi_rss = lvb->lvb_size;
836 lock_res_and_lock(lock);
837 ll_inode_size_lock(inode, 1);
838 kms = MAX(lsm->lsm_oinfo[stripe].loi_kms, lvb->lvb_size);
839 kms = ldlm_extent_shift_kms(NULL, kms);
840 if (lsm->lsm_oinfo[stripe].loi_kms != kms)
841 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
842 lsm->lsm_oinfo[stripe].loi_kms, kms);
843 lsm->lsm_oinfo[stripe].loi_kms = kms;
844 ll_inode_size_unlock(inode, 1);
845 unlock_res_and_lock(lock);
/* Wake anyone sleeping on this lock, then drop our enqueue ref. */
850 wake_up(&lock->l_waitq);
852 ldlm_lock2handle(lock, &lockh);
853 ldlm_lock_decref(&lockh, LCK_PR);
/* Glimpse AST: a peer wants this client's view of the object size.
 * Packs an ost_lvb reply (per-stripe kms plus inode times) into @reqp.
 * -ELDLM_NO_LOCK_DATA races are answered with an empty reply to keep
 * the console quiet. */
858 static int ll_glimpse_callback(struct ldlm_lock *lock, void *reqp)
860 struct ptlrpc_request *req = reqp;
861 struct inode *inode = ll_inode_from_lock(lock);
862 struct ll_inode_info *lli;
863 struct lov_stripe_md *lsm;
866 int size[2] = { sizeof(struct ptlrpc_body), sizeof(*lvb) };
870 GOTO(out, rc = -ELDLM_NO_LOCK_DATA);
871 lli = ll_i2info(inode);
873 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
876 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
878 /* First, find out which stripe index this lock corresponds to. */
879 stripe = ll_lock_to_stripe_offset(inode, lock);
881 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
883 rc = lustre_pack_reply(req, 2, size, NULL);
/* Fill the LVB from this client's knowledge of the stripe. */
887 lvb = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, sizeof(*lvb));
888 lvb->lvb_size = lli->lli_smd->lsm_oinfo[stripe]->loi_kms;
889 lvb->lvb_mtime = LTIME_S(inode->i_mtime);
890 lvb->lvb_atime = LTIME_S(inode->i_atime);
891 lvb->lvb_ctime = LTIME_S(inode->i_ctime);
893 LDLM_DEBUG(lock, "i_size: %llu -> stripe number %u -> kms "LPU64
894 " atime "LPU64", mtime "LPU64", ctime "LPU64,
895 i_size_read(inode), stripe, lvb->lvb_size, lvb->lvb_atime,
896 lvb->lvb_mtime, lvb->lvb_ctime);
901 /* These errors are normal races, so we don't want to fill the console
902 * with messages by calling ptlrpc_error() */
903 if (rc == -ELDLM_NO_LOCK_DATA)
904 lustre_pack_reply(req, 1, NULL, NULL);
/* Ioctl-path glimpse: enqueue an intent-only PR extent lock over
 * [0, EOF] for @lsm, merge the resulting per-stripe LVBs, and fill the
 * caller's stat structure with size/blocks/times.  No DLM lock is
 * actually cached (LDLM_FL_HAS_INTENT). */
910 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
913 struct lustre_handle lockh = { 0 };
914 struct ldlm_enqueue_info einfo = { 0 };
915 struct obd_info oinfo = { { { 0 } } };
921 einfo.ei_type = LDLM_EXTENT;
922 einfo.ei_mode = LCK_PR;
923 einfo.ei_cb_bl = osc_extent_blocking_cb;
924 einfo.ei_cb_cp = ldlm_completion_ast;
925 einfo.ei_cb_gl = ll_glimpse_callback;
926 einfo.ei_cbdata = NULL;
928 oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
929 oinfo.oi_lockh = &lockh;
931 oinfo.oi_flags = LDLM_FL_HAS_INTENT;
933 rc = obd_enqueue_rqset(sbi->ll_osc_exp, &oinfo, &einfo);
/* Positive rc from enqueue is an internal state, map it to -EIO. */
937 CERROR("obd_enqueue returned rc %d, "
938 "returning -EIO\n", rc);
939 RETURN(rc > 0 ? -EIO : rc);
/* Merge stripe LVBs into a single view under the LOV lock. */
942 lov_stripe_lock(lsm);
943 memset(&lvb, 0, sizeof(lvb));
944 obd_merge_lvb(sbi->ll_osc_exp, lsm, &lvb, 0);
945 st->st_size = lvb.lvb_size;
946 st->st_blocks = lvb.lvb_blocks;
947 st->st_mtime = lvb.lvb_mtime;
948 st->st_atime = lvb.lvb_atime;
949 st->st_ctime = lvb.lvb_ctime;
950 lov_stripe_unlock(lsm);
955 /* NB: obd_merge_lvb will prefer locally cached writes if they extend the
956 * file (because it prefers KMS over RSS when larger) */
/* Refresh @inode's size/blocks/times by glimpsing all stripes.
 * MDS-provided timestamps in lli_lvb are merged with what the OSTs
 * report, and the result is written back into the inode under the
 * inode size lock. */
957 int ll_glimpse_size(struct inode *inode, int ast_flags)
959 struct ll_inode_info *lli = ll_i2info(inode);
960 struct ll_sb_info *sbi = ll_i2sbi(inode);
961 struct lustre_handle lockh = { 0 };
962 struct ldlm_enqueue_info einfo = { 0 };
963 struct obd_info oinfo = { { { 0 } } };
968 CDEBUG(D_DLMTRACE, "Glimpsing inode %lu\n", inode->i_ino);
971 CDEBUG(D_DLMTRACE, "No objects for inode %lu\n", inode->i_ino);
975 /* NOTE: this looks like DLM lock request, but it may not be one. Due
976 * to LDLM_FL_HAS_INTENT flag, this is glimpse request, that
977 * won't revoke any conflicting DLM locks held. Instead,
978 * ll_glimpse_callback() will be called on each client
979 * holding a DLM lock against this file, and resulting size
980 * will be returned for each stripe. DLM lock on [0, EOF] is
981 * acquired only if there were no conflicting locks. */
982 einfo.ei_type = LDLM_EXTENT;
983 einfo.ei_mode = LCK_PR;
984 einfo.ei_cb_bl = osc_extent_blocking_cb;
985 einfo.ei_cb_cp = ldlm_completion_ast;
986 einfo.ei_cb_gl = ll_glimpse_callback;
987 einfo.ei_cbdata = inode;
989 oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
990 oinfo.oi_lockh = &lockh;
991 oinfo.oi_md = lli->lli_smd;
992 oinfo.oi_flags = ast_flags | LDLM_FL_HAS_INTENT;
994 rc = obd_enqueue_rqset(sbi->ll_osc_exp, &oinfo, &einfo);
998 CERROR("obd_enqueue returned rc %d, returning -EIO\n", rc);
999 RETURN(rc > 0 ? -EIO : rc);
1002 ll_inode_size_lock(inode, 1);
1003 inode_init_lvb(inode, &lvb);
1004 /* merge timestamps the most recently obtained from mds with
1005 timestamps obtained from osts */
1006 lvb.lvb_atime = lli->lli_lvb.lvb_atime;
1007 lvb.lvb_mtime = lli->lli_lvb.lvb_mtime;
1008 lvb.lvb_ctime = lli->lli_lvb.lvb_ctime;
1009 rc = obd_merge_lvb(sbi->ll_osc_exp, lli->lli_smd, &lvb, 0);
1010 i_size_write(inode, lvb.lvb_size);
1011 inode->i_blocks = lvb.lvb_blocks;
1012 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1013 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1014 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1015 ll_inode_size_unlock(inode, 1);
1017 CDEBUG(D_DLMTRACE, "glimpse: size: %llu, blocks: %llu\n",
1018 i_size_read(inode), (long long)inode->i_blocks);
/* Take an extent DLM lock of @mode over @policy's range for @inode,
 * returning the handle in @lockh.  Skipped entirely for IGNORE_LOCK
 * fds or NOLCK mounts.  After a successful enqueue, the merged LVB is
 * used to refresh the inode's size (full-file locks only) and times,
 * under the inode size lock — see the ordering comment below. */
1023 int ll_extent_lock(struct ll_file_data *fd, struct inode *inode,
1024 struct lov_stripe_md *lsm, int mode,
1025 ldlm_policy_data_t *policy, struct lustre_handle *lockh,
1028 struct ll_sb_info *sbi = ll_i2sbi(inode);
1030 struct ldlm_enqueue_info einfo = { 0 };
1031 struct obd_info oinfo = { { { 0 } } };
1035 LASSERT(!lustre_handle_is_used(lockh));
1036 LASSERT(lsm != NULL);
1038 /* don't drop the mmapped file to LRU */
1039 if (mapping_mapped(inode->i_mapping))
1040 ast_flags |= LDLM_FL_NO_LRU;
1042 /* XXX phil: can we do this? won't it screw the file size up? */
1043 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1044 (sbi->ll_flags & LL_SBI_NOLCK))
1047 CDEBUG(D_DLMTRACE, "Locking inode %lu, start "LPU64" end "LPU64"\n",
1048 inode->i_ino, policy->l_extent.start, policy->l_extent.end);
1050 einfo.ei_type = LDLM_EXTENT;
1051 einfo.ei_mode = mode;
1052 einfo.ei_cb_bl = osc_extent_blocking_cb;
1053 einfo.ei_cb_cp = ldlm_completion_ast;
1054 einfo.ei_cb_gl = ll_glimpse_callback;
1055 einfo.ei_cbdata = inode;
1057 oinfo.oi_policy = *policy;
1058 oinfo.oi_lockh = lockh;
1060 oinfo.oi_flags = ast_flags;
1062 rc = obd_enqueue(sbi->ll_osc_exp, &oinfo, &einfo, NULL);
/* The enqueue may have widened the policy; report it back. */
1063 *policy = oinfo.oi_policy;
1067 ll_inode_size_lock(inode, 1);
1068 inode_init_lvb(inode, &lvb);
1069 obd_merge_lvb(sbi->ll_osc_exp, lsm, &lvb, 1);
1071 if (policy->l_extent.start == 0 &&
1072 policy->l_extent.end == OBD_OBJECT_EOF) {
1073 /* vmtruncate()->ll_truncate() first sets the i_size and then
1074 * the kms under both a DLM lock and the
1075 * ll_inode_size_lock(). If we don't get the
1076 * ll_inode_size_lock() here we can match the DLM lock and
1077 * reset i_size from the kms before the truncating path has
1078 * updated the kms. generic_file_write can then trust the
1079 * stale i_size when doing appending writes and effectively
1080 * cancel the result of the truncate. Getting the
1081 * ll_inode_size_lock() after the enqueue maintains the DLM
1082 * -> ll_inode_size_lock() acquiring order. */
1083 i_size_write(inode, lvb.lvb_size);
1084 CDEBUG(D_INODE, "inode=%lu, updating i_size %llu\n",
1085 inode->i_ino, i_size_read(inode));
1089 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1090 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1091 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1093 ll_inode_size_unlock(inode, 1);
/* Release an extent DLM lock previously taken by ll_extent_lock().
 * Mirrors its early-exit conditions for IGNORE_LOCK fds / NOLCK mounts
 * so lock and unlock stay paired. */
1098 int ll_extent_unlock(struct ll_file_data *fd, struct inode *inode,
1099 struct lov_stripe_md *lsm, int mode,
1100 struct lustre_handle *lockh)
1102 struct ll_sb_info *sbi = ll_i2sbi(inode);
1106 /* XXX phil: can we do this? won't it screw the file size up? */
1107 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1108 (sbi->ll_flags & LL_SBI_NOLCK))
1111 rc = obd_cancel(sbi->ll_osc_exp, lsm, mode, lockh, 0, 0);
/* Mark @inode as contended and timestamp the event; ll_is_file_contended()
 * clears the flag again after sbi->ll_contention_time seconds. */
1116 static void ll_set_file_contended(struct inode *inode)
1118 struct ll_inode_info *lli = ll_i2info(inode);
1120 lli->lli_contention_time = cfs_time_current();
1121 set_bit(LLI_F_CONTENDED, &lli->lli_flags);
/* Clear the contended-file flag set by ll_set_file_contended(). */
1124 void ll_clear_file_contended(struct inode *inode)
1126 struct ll_inode_info *lli = ll_i2info(inode);
1128 clear_bit(LLI_F_CONTENDED, &lli->lli_flags);
/* Decide whether I/O on @file should go lockless (server-side locking)
 * because the file is contended.  Requires OBD_CONNECT_SRVLOCK on the
 * OSC connection; per-fd flags and mount options can force lockless
 * direct or cached I/O; a contended mark expires after the configured
 * contention time. */
1131 static int ll_is_file_contended(struct file *file)
1133 struct inode *inode = file->f_dentry->d_inode;
1134 struct ll_inode_info *lli = ll_i2info(inode);
1135 struct ll_sb_info *sbi = ll_i2sbi(inode);
1136 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1139 if (!(sbi->ll_lco.lco_flags & OBD_CONNECT_SRVLOCK)) {
1140 CDEBUG(D_INFO, "the server does not support SRVLOCK feature,"
1141 " osc connect flags = 0x"LPX64"\n",
1142 sbi->ll_lco.lco_flags);
1146 if (fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK))
1149 /* The semantics here is a bit complicated due to compatibility.
1150 * The user may be aware of per-file LL_FILE_LOCKED_DIRECTIO,
1151 * but not of per-client lockless_direct_io, so the file bit takes
1152 * precedence if it is set. If the file bit is not set, we use
1153 * lockless I/O unless per-client lockless_direct_io is set to zero.
1155 CLASSERT(SBI_DEFAULT_LOCKLESS_DIRECT_IO == 1);
1156 if ((file->f_flags & O_DIRECT) &&
1157 !(fd && (fd->fd_flags & LL_FILE_LOCKED_DIRECTIO)) &&
1158 sbi->ll_lockless_direct_io)
1161 /* server-side locking for cached I/O with LL_FILE_LOCKLESS_IO */
1162 if (!(file->f_flags & O_DIRECT) &&
1163 fd && fd->fd_flags & LL_FILE_LOCKLESS_IO)
/* Contended mark expires after sbi->ll_contention_time seconds. */
1166 if (test_bit(LLI_F_CONTENDED, &lli->lli_flags)) {
1167 cfs_time_t cur_time = cfs_time_current();
1168 cfs_time_t retry_time;
1170 retry_time = cfs_time_add(
1171 lli->lli_contention_time,
1172 cfs_time_seconds(sbi->ll_contention_time));
1173 if (cfs_time_after(cur_time, retry_time)) {
1174 ll_clear_file_contended(inode);
/* Acquire the extent lock tree for an iovec-based read/write over
 * [@start, @end].  Appending writes always lock; otherwise contended
 * files fall through to lockless I/O.  -EUSERS from the tree lock marks
 * the file contended.  Returns 1 if the tree is locked, 0 if lockless,
 * negative errno on failure. */
1182 static int ll_file_get_tree_lock_iov(struct ll_lock_tree *tree,
1183 struct file *file, const struct iovec *iov,
1184 unsigned long nr_segs,
1185 obd_off start, obd_off end, int rw)
1188 int tree_locked = 0;
1190 struct inode * inode = file->f_dentry->d_inode;
1193 append = (rw == OBD_BRW_WRITE) && (file->f_flags & O_APPEND);
1195 if (append || !ll_is_file_contended(file)) {
1196 struct ll_lock_tree_node *node;
/* Appends must not be denied on contention; they need the lock. */
1199 ast_flags = append ? 0 : LDLM_FL_DENY_ON_CONTENTION;
1200 if (file->f_flags & O_NONBLOCK)
1201 ast_flags |= LDLM_FL_BLOCK_NOWAIT;
1202 node = ll_node_from_inode(inode, start, end,
1203 (rw == OBD_BRW_WRITE) ? LCK_PW : LCK_PR);
1208 tree->lt_fd = LUSTRE_FPRIVATE(file);
1209 rc = ll_tree_lock_iov(tree, node, iov, nr_segs, ast_flags);
1212 else if (rc == -EUSERS)
1213 ll_set_file_contended(inode);
1217 RETURN(tree_locked);
1222 /* XXX: exact copy from kernel code (__generic_file_aio_write_nolock from rhel4)
/* Validate the iovec array for a read, returning the total byte count
 * and trimming *nr_segs at the first unusable segment.  NOTE(review):
 * the `continue`/`return` lines after the checks were elided by
 * extraction. */
1224 static size_t ll_file_get_iov_count(const struct iovec *iov,
1225 unsigned long *nr_segs)
1230 for (seg = 0; seg < *nr_segs; seg++) {
1231 const struct iovec *iv = &iov[seg];
1234 * If any segment has a negative length, or the cumulative
1235 * length ever wraps negative then return -EINVAL.
1237 count += iv->iov_len;
1238 if (unlikely((ssize_t)(count|iv->iov_len) < 0))
1240 if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len))
1245 count -= iv->iov_len; /* This segment is no good */
/* Build into @iov_copy the prefix of *@iov_out that covers @size bytes
 * starting @*offset bytes into the current first segment, updating the
 * caller's segment bookkeeping so the next chunk resumes where this
 * one ends.  Used to split large I/Os into ll_max_rw_chunk pieces. */
1251 static int iov_copy_update(unsigned long *nr_segs, const struct iovec **iov_out,
1252 unsigned long *nrsegs_copy,
1253 struct iovec *iov_copy, size_t *offset,
1257 const struct iovec *iov = *iov_out;
1258 for (i = 0; i < *nr_segs;
1260 const struct iovec *iv = &iov[i];
1261 struct iovec *ivc = &iov_copy[i];
/* skip the part of the first segment already consumed by earlier chunks */
1264 ivc->iov_len -= *offset;
1265 ivc->iov_base += *offset;
/* this segment completes the chunk: clamp it to the remaining size */
1267 if (ivc->iov_len >= size) {
1268 ivc->iov_len = size;
1275 size -= ivc->iov_len;
1279 *nrsegs_copy = i + 1;
/* Try to take a "fast" (OBD_FAST_LOCK) extent lock for [start, end]
 * using the page's async-page cookie; returns the obd_get_lock()
 * result.  NOTE(review): the early-return checks for a NULL export /
 * llap were elided by extraction. */
1284 static int ll_get_short_lock(struct page *page, int rw, obd_off start,
1285 obd_off end, struct lustre_handle *lockh)
1287 struct ll_async_page *llap;
1288 struct obd_export *exp;
1289 struct inode *inode = page->mapping->host;
1293 exp = ll_i2obdexp(inode);
1297 llap = llap_cast_private(page);
1301 RETURN(obd_get_lock(exp, ll_i2info(inode)->lli_smd,
1302 &llap->llap_cookie, rw, start, end, lockh,
1306 static void ll_release_short_lock(struct inode *inode, obd_off end,
1307 struct lustre_handle *lockh, int rw)
1309 struct obd_export *exp;
1312 exp = ll_i2obdexp(inode);
1316 rc = obd_cancel(exp, ll_i2info(inode)->lli_smd,
1317 rw = OBD_BRW_READ ? LCK_PR : LCK_PW, lockh,
1318 OBD_FAST_LOCK, end);
1320 CERROR("unlock failed (%d)\n", rc);
/* Attempt a lock-tree-free "fast lock" for the request: only possible
 * when no iovec segment aliases a region mapped from this file (mmap
 * would deadlock) and the target page can be found and locked in the
 * page cache.  NOTE(review): the `rw` parameter declaration and the
 * return statements were elided by extraction. */
1323 static inline int ll_file_get_fast_lock(struct file *file,
1324 obd_off ppos, obd_off end,
1325 const struct iovec *iov,
1326 unsigned long nr_segs,
1327 struct lustre_handle *lockh,
1335 /* we would like this read request to be lockfree */
1336 for (seg = 0; seg < nr_segs; seg++) {
1337 const struct iovec *iv = &iov[seg];
1338 if (ll_region_mapped((unsigned long)iv->iov_base, iv->iov_len))
1342 page = find_lock_page(file->f_dentry->d_inode->i_mapping,
1343 ppos >> CFS_PAGE_SHIFT);
1345 if (ll_get_short_lock(page, rw, ppos, end, lockh))
/* drop the find_lock_page() reference whether or not we got the lock */
1349 page_cache_release(page);
/* Counterpart of ll_file_get_fast_lock(): release the fast lock. */
1356 static inline void ll_file_put_fast_lock(struct inode *inode, obd_off end,
1357 struct lustre_handle *lockh, int rw)
1359 ll_release_short_lock(inode, end, lockh, rw);
/* Choose and take the lock for an I/O request, preferring the cheapest
 * style: fast lock, then DLM tree lock, then no lock at all.  Returns
 * one of the LL_LOCK_STYLE_* values, or a negative errno. */
1362 static inline int ll_file_get_lock(struct file *file, obd_off ppos,
1363 obd_off end, const struct iovec *iov,
1364 unsigned long nr_segs,
1365 struct lustre_handle *lockh,
1366 struct ll_lock_tree *tree, int rw)
1372 if (ll_file_get_fast_lock(file, ppos, end, iov, nr_segs, lockh, rw))
1373 RETURN(LL_LOCK_STYLE_FASTLOCK);
1375 rc = ll_file_get_tree_lock_iov(tree, file, iov, nr_segs,
1377 /* rc: 1 for tree lock, 0 for no lock, <0 for error */
1380 RETURN(LL_LOCK_STYLE_TREELOCK);
1382 RETURN(LL_LOCK_STYLE_NOLOCK);
1385 /* an error happened if we reached this point, rc = -errno here */
/* Release whichever lock style ll_file_get_lock() returned.
 * NOTE(review): the `break;` statements between cases were elided by
 * extraction — confirm against the original before assuming
 * fallthrough. */
1389 static inline void ll_file_put_lock(struct inode *inode, obd_off end,
1390 enum ll_lock_style lock_style,
1391 struct lustre_handle *lockh,
1392 struct ll_lock_tree *tree, int rw)
1395 switch (lock_style) {
1396 case LL_LOCK_STYLE_TREELOCK:
1397 ll_tree_unlock(tree);
1399 case LL_LOCK_STYLE_FASTLOCK:
1400 ll_file_put_fast_lock(inode, end, lockh, rw);
1403 CERROR("invalid locking style (%d)\n", lock_style);
/* Read entry point: ll_file_readv() on kernels with f_op->readv,
 * ll_file_aio_read() on AIO kernels.  Loops over ll_max_rw_chunk-sized
 * chunks, taking a fast/tree/no lock per chunk via ll_file_get_lock().
 * NOTE(review): extraction elided many lines (braces, RETURNs, loop
 * back-edges, #else/#endif) and HTML-unescaping mangled `&ltd` to
 * `<d` in several lines below — restore `&ltd` when fixing the file. */
1407 #ifdef HAVE_FILE_READV
1408 static ssize_t ll_file_readv(struct file *file, const struct iovec *iov,
1409 unsigned long nr_segs, loff_t *ppos)
1412 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1413 unsigned long nr_segs, loff_t pos)
1415 struct file *file = iocb->ki_filp;
1416 loff_t *ppos = &iocb->ki_pos;
1418 struct inode *inode = file->f_dentry->d_inode;
1419 struct ll_inode_info *lli = ll_i2info(inode);
1420 struct lov_stripe_md *lsm = lli->lli_smd;
1421 struct ll_sb_info *sbi = ll_i2sbi(inode);
1422 struct ll_thread_data ltd = { 0 };
1424 struct ll_ra_read bead;
1427 ssize_t retval, chunk, sum = 0;
1428 struct iovec *iov_copy = NULL;
1429 unsigned long nrsegs_copy, nrsegs_orig = 0;
1430 size_t count, iov_offset = 0;
1434 count = ll_file_get_iov_count(iov, &nr_segs);
1435 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size=%lu,offset=%lld\n",
1436 inode->i_ino, inode->i_generation, inode, (unsigned long)count,
1438 /* "If nbyte is 0, read() will return 0 and have no other results."
1439 * -- Single Unix Spec */
1443 ll_stats_ops_tally(sbi, LPROC_LL_READ_BYTES, count);
1446 /* Read on file with no objects should return zero-filled
1447 * buffers up to file size (we can get non-zero sizes with
1448 * mknod + truncate, then opening file for read. This is a
1449 * common pattern in NFS case, it seems). Bug 6243 */
1451 /* Since there are no objects on OSTs, we have nothing to get
1452 * lock on and so we are forced to access inode->i_size
1455 /* Read beyond end of file */
1456 if (*ppos >= i_size_read(inode))
1459 if (count > i_size_read(inode) - *ppos)
1460 count = i_size_read(inode) - *ppos;
1461 /* Make sure to correctly adjust the file pos pointer for
/* zero-fill the user buffers segment by segment via clear_user() */
1463 for (nrsegs_copy = 0; nrsegs_copy < nr_segs; nrsegs_copy++) {
1464 const struct iovec *iv = &iov[nrsegs_copy];
1466 if (count < iv->iov_len)
1469 chunk = iv->iov_len;
1470 notzeroed = clear_user(iv->iov_base, chunk);
1471 sum += (chunk - notzeroed);
1472 count -= (chunk - notzeroed);
1473 if (notzeroed || !count)
/* NOTE(review): duplicate ltd init below is two arms of an elided #ifdef */
1482 ltd.ltd_magic = LTD_MAGIC;
1485 memset(<d, 0, sizeof(ltd));
1486 ltd.ltd_magic = LTD_MAGIC;
1487 if (sbi->ll_max_rw_chunk != 0 && !(file->f_flags & O_DIRECT)) {
1488 /* first, let's know the end of the current stripe */
1490 obd_extent_calc(sbi->ll_osc_exp, lsm, OBD_CALC_STRIPE_END,&end);
1492 /* correct, the end is beyond the request */
1493 if (end > *ppos + count - 1)
1494 end = *ppos + count - 1;
1496 /* and chunk shouldn't be too large even if striping is wide */
1497 if (end - *ppos > sbi->ll_max_rw_chunk)
1498 end = *ppos + sbi->ll_max_rw_chunk - 1;
1500 chunk = end - *ppos + 1;
1501 if ((count == chunk) && (iov_offset == 0)) {
1503 OBD_FREE(iov_copy, sizeof(*iov) * nrsegs_orig);
1505 iov_copy = (struct iovec *)iov;
1506 nrsegs_copy = nr_segs;
1509 nrsegs_orig = nr_segs;
1510 OBD_ALLOC(iov_copy, sizeof(*iov) * nr_segs);
1512 GOTO(out, retval = -ENOMEM);
1515 iov_copy_update(&nr_segs, &iov, &nrsegs_copy, iov_copy,
1516 &iov_offset, chunk);
1519 end = *ppos + count - 1;
1520 iov_copy = (struct iovec *)iov;
1521 nrsegs_copy = nr_segs;
1524 down_read(&lli->lli_truncate_rwsem); /* Bug 18233 */
1526 ltd.lock_style = ll_file_get_lock(file, (obd_off)(*ppos), end,
1527 iov_copy, nrsegs_copy,
1528 <d.u.lockh, <d.u.tree,
1530 if (ltd.lock_style < 0 || ltd.lock_style == LL_LOCK_STYLE_NOLOCK)
1531 up_read(&lli->lli_truncate_rwsem);
1532 if (ltd.lock_style < 0)
1533 GOTO(out, retval = ltd.lock_style);
1535 ll_inode_size_lock(inode, 1);
1537 * Consistency guarantees: following possibilities exist for the
1538 * relation between region being read and real file size at this
1541 * (A): the region is completely inside of the file;
1543 * (B-x): x bytes of region are inside of the file, the rest is
1546 * (C): the region is completely outside of the file.
1548 * This classification is stable under DLM lock acquired by
1549 * ll_tree_lock() above, because to change class, other client has to
1550 * take DLM lock conflicting with our lock. Also, any updates to
1551 * ->i_size by other threads on this client are serialized by
1552 * ll_inode_size_lock(). This guarantees that short reads are handled
1553 * correctly in the face of concurrent writes and truncates.
1555 inode_init_lvb(inode, &lvb);
1556 obd_merge_lvb(ll_i2sbi(inode)->ll_osc_exp, lsm, &lvb, 1);
1558 if (*ppos + count - 1 > kms) {
1559 /* A glimpse is necessary to determine whether we return a
1560 * short read (B) or some zeroes at the end of the buffer (C) */
1561 ll_inode_size_unlock(inode, 1);
1562 retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
1564 if (ltd.lock_style != LL_LOCK_STYLE_NOLOCK) {
1565 ll_file_put_lock(inode, end, ltd.lock_style,
1566 <d.u.lockh, <d.u.tree,
1568 up_read(&lli->lli_truncate_rwsem);
1572 /* If objective page index exceed the end-of-file page
1573 * index, return directly. Do not expect kernel will
1574 * check such case correctly. linux-2.6.18-128.1.1 miss
1575 * to do that. --bug 17336 */
1576 loff_t size = i_size_read(inode);
1577 unsigned long cur_index = *ppos >> CFS_PAGE_SHIFT;
1579 if ((size == 0 && cur_index != 0) ||
1580 (((size - 1) >> CFS_PAGE_SHIFT) < cur_index)) {
1581 if (ltd.lock_style != LL_LOCK_STYLE_NOLOCK) {
1583 ll_file_put_lock(inode, end,
1588 up_read(&lli->lli_truncate_rwsem);
1594 /* region is within kms and, hence, within real file size (A).
1595 * We need to increase i_size to cover the read region so that
1596 * generic_file_read() will do its job, but that doesn't mean
1597 * the kms size is _correct_, it is only the _minimum_ size.
1598 * If someone does a stat they will get the correct size which
1599 * will always be >= the kms value here. b=11081 */
1600 if (i_size_read(inode) < kms)
1601 i_size_write(inode, kms);
1602 ll_inode_size_unlock(inode, 1);
1605 chunk = end - *ppos + 1;
1606 CDEBUG(D_INODE,"Read ino %lu, %ld bytes, offset %lld, i_size %llu\n",
1607 inode->i_ino, (long)chunk, *ppos, i_size_read(inode));
1609 /* turn off the kernel's read-ahead */
1610 if (ltd.lock_style != LL_LOCK_STYLE_NOLOCK) {
1611 struct ost_lvb *xtimes;
1614 * 1. update inode's atime as long as concurrent stat
1615 * (via ll_glimpse_size) might bring out-of-date ones
1617 * 2. update lsm so that next stat (via
1618 * ll_glimpse_size) could get correct values in lsm */
1619 OBD_ALLOC_PTR(xtimes);
1620 if (NULL == xtimes) {
1621 ll_file_put_lock(inode, end, ltd.lock_style,
1622 <d.u.lockh, <d.u.tree,
1624 up_read(&lli->lli_truncate_rwsem);
1625 GOTO(out, retval = -ENOMEM);
1628 lov_stripe_lock(lsm);
1629 LTIME_S(inode->i_atime) = LTIME_S(CURRENT_TIME);
1630 xtimes->lvb_atime = LTIME_S(inode->i_atime);
1631 obd_update_lvb(sbi->ll_osc_exp, lsm, xtimes,
1633 lov_stripe_unlock(lsm);
1634 OBD_FREE_PTR(xtimes);
1636 file->f_ra.ra_pages = 0;
1637 /* initialize read-ahead window once per syscall */
1640 ll_ra_read_init(file, &bead, *ppos, count);
1644 file_accessed(file);
1645 #ifdef HAVE_FILE_READV
1646 retval = generic_file_readv(file, iov_copy, nrsegs_copy, ppos);
1648 retval = generic_file_aio_read(iocb, iov_copy, nrsegs_copy,
1651 ll_file_put_lock(inode, end, ltd.lock_style, <d.u.lockh,
1652 <d.u.tree, OBD_BRW_READ);
1653 up_read(&lli->lli_truncate_rwsem);
/* lockless path: no client lock was taken, do direct I/O */
1655 file_accessed(file);
1656 retval = ll_direct_IO(READ, file, iov_copy, *ppos, nr_segs, 0);
1658 lprocfs_counter_add(sbi->ll_stats,
1659 LPROC_LL_LOCKLESS_READ,
1664 ll_rw_stats_tally(sbi, current->pid, file, count, 0);
/* a full chunk was read and more remains: loop for the next chunk */
1668 if (retval == chunk && count > 0)
1675 ll_ra_read_ex(file, &bead);
1676 retval = (sum > 0) ? sum : retval;
1678 if (iov_copy && iov_copy != iov)
1679 OBD_FREE(iov_copy, sizeof(*iov) * nrsegs_orig);
/* Classic read(2) entry point: wraps the single user buffer in an
 * iovec and forwards to readv (old kernels) or a synchronous kiocb
 * into ll_file_aio_read() (AIO kernels). */
1684 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1687 struct iovec local_iov = { .iov_base = (void __user *)buf,
1689 #ifdef HAVE_FILE_READV
1690 return ll_file_readv(file, &local_iov, 1, ppos);
1695 init_sync_kiocb(&kiocb, file);
1696 kiocb.ki_pos = *ppos;
1697 kiocb.ki_left = count;
1699 ret = ll_file_aio_read(&kiocb, &local_iov, 1, kiocb.ki_pos);
1700 *ppos = kiocb.ki_pos;
1705 /* iov_shorten from linux kernel */
/* Truncate the iovec array so it describes at most `to` bytes,
 * clamping the segment that crosses the limit; returns the new
 * segment count (return lines elided by extraction). */
1706 static unsigned long ll_iov_shorten(struct iovec *iov,
1707 unsigned long nr_segs,
1710 unsigned long seg = 0;
1713 while (seg < nr_segs) {
1715 if (len + iov->iov_len >= to) {
1716 iov->iov_len = to - len;
1719 len += iov->iov_len;
1725 /* 2.6.22 and 2.6.27 export this as generic_segment_checks */
/* Validate iovec segments for read or write (direction selected by
 * @access_flags), trimming *nr_segs at the first bad segment and
 * reporting the usable byte count.  Mirrors ll_file_get_iov_count()
 * but parameterized on the access direction. */
1726 static int ll_generic_segment_checks(const struct iovec *iov,
1727 unsigned long *nr_segs,
1733 for (seg = 0; seg < *nr_segs; seg++) {
1734 const struct iovec *iv = &iov[seg];
1737 * If any segment has a negative length, or the cumulative
1738 * length ever wraps negative then return -EINVAL.
1741 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1743 if (access_ok(access_flags, iv->iov_base, iv->iov_len))
1748 cnt -= iv->iov_len; /* This segment is no good */
1756 * Write to a file (through the page cache).
/* Write entry point: ll_file_writev() on f_op->writev kernels,
 * ll_file_aio_write() on AIO kernels.  Serializes writers per inode
 * via lli_write_sem, chunks large writes by ll_max_rw_chunk, and takes
 * a DLM tree lock (or goes lockless) per chunk.  NOTE(review): many
 * lines were elided by extraction, and HTML-unescaping mangled `&ltd`
 * to `<d` in a few lines — restore `&ltd` when fixing the file. */
1758 #ifdef HAVE_FILE_WRITEV
1759 static ssize_t ll_file_writev(struct file *file, const struct iovec *iov,
1760 unsigned long nr_segs, loff_t *ppos)
1762 #else /* AIO stuff */
1763 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1764 unsigned long nr_segs, loff_t pos)
1766 struct file *file = iocb->ki_filp;
1767 loff_t *ppos = &iocb->ki_pos;
1769 struct inode *inode = file->f_dentry->d_inode;
1770 struct ll_sb_info *sbi = ll_i2sbi(inode);
1771 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1772 struct ll_thread_data ltd = { 0 };
1773 loff_t maxbytes = ll_file_maxbytes(inode);
1774 loff_t lock_start, lock_end, end;
1775 ssize_t retval, chunk, sum = 0;
1777 struct iovec *iov_copy = NULL;
1778 unsigned long nrsegs_copy, nrsegs_orig = 0;
1779 size_t count, iov_offset = 0;
1780 int got_write_sem = 0;
1781 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1784 count = ll_file_get_iov_count(iov, &nr_segs);
1786 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size=%lu,offset=%Ld\n",
1787 inode->i_ino, inode->i_generation, inode, (unsigned long)count,
1790 SIGNAL_MASK_ASSERT(); /* XXX BUG 1511 */
1792 /* POSIX, but surprised the VFS doesn't check this already */
1796 /* If file was opened for LL_IOC_LOV_SETSTRIPE but the ioctl wasn't
1797 * called on the file, don't fail the below assertion (bug 2388). */
1798 if (file->f_flags & O_LOV_DELAY_CREATE &&
1799 ll_i2info(inode)->lli_smd == NULL)
1802 LASSERT(ll_i2info(inode)->lli_smd != NULL);
1804 /* signal(7) specifies that write(2) and writev(2) should be restarted */
1805 if (!(fd->fd_flags & LL_FILE_IGNORE_LOCK)) {
1807 if (down_interruptible(&ll_i2info(inode)->lli_write_sem))
1808 RETURN(-ERESTARTSYS);
/* NOTE(review): duplicate ltd init below is two arms of an elided #ifdef */
1811 ltd.ltd_magic = LTD_MAGIC;
1814 memset(<d, 0, sizeof(ltd));
1815 ltd.ltd_magic = LTD_MAGIC;
1817 chunk = 0; /* just to fix gcc's warning */
1818 end = *ppos + count - 1;
/* O_APPEND must lock to EOF since the final offset is unknown here */
1820 if (file->f_flags & O_APPEND) {
1822 lock_end = OBD_OBJECT_EOF;
1823 iov_copy = (struct iovec *)iov;
1824 nrsegs_copy = nr_segs;
1825 } else if (sbi->ll_max_rw_chunk != 0 && !(file->f_flags & O_DIRECT)) {
1826 /* first, let's know the end of the current stripe */
1828 obd_extent_calc(sbi->ll_osc_exp, lsm, OBD_CALC_STRIPE_END,
1831 /* correct, the end is beyond the request */
1832 if (end > *ppos + count - 1)
1833 end = *ppos + count - 1;
1835 /* and chunk shouldn't be too large even if striping is wide */
1836 if (end - *ppos > sbi->ll_max_rw_chunk)
1837 end = *ppos + sbi->ll_max_rw_chunk - 1;
1840 chunk = end - *ppos + 1;
1841 if ((count == chunk) && (iov_offset == 0)) {
1843 OBD_FREE(iov_copy, sizeof(*iov) * nrsegs_orig);
1845 iov_copy = (struct iovec *)iov;
1846 nrsegs_copy = nr_segs;
1849 nrsegs_orig = nr_segs;
1850 OBD_ALLOC(iov_copy, sizeof(*iov) * nr_segs);
1852 GOTO(out, retval = -ENOMEM);
1854 iov_copy_update(&nr_segs, &iov, &nrsegs_copy, iov_copy,
1855 &iov_offset, chunk);
1860 iov_copy = (struct iovec *)iov;
1861 nrsegs_copy = nr_segs;
1864 tree_locked = ll_file_get_tree_lock_iov(<d.u.tree, file, iov_copy,
1866 (obd_off)lock_start,
1869 if (tree_locked < 0)
1870 GOTO(out, retval = tree_locked);
1872 /* This is ok, g_f_w will overwrite this under i_sem if it races
1873 * with a local truncate, it just makes our maxbyte checking easier.
1874 * The i_size value gets updated in ll_extent_lock() as a consequence
1875 * of the [0,EOF] extent lock we requested above. */
1876 if (file->f_flags & O_APPEND) {
1877 *ppos = i_size_read(inode);
1878 end = *ppos + count - 1;
1881 if (*ppos >= maxbytes) {
1882 send_sig(SIGXFSZ, current, 0);
1883 GOTO(out_unlock, retval = -EFBIG);
1885 if (end > maxbytes - 1)
1888 /* generic_file_write handles O_APPEND after getting i_mutex */
1889 chunk = end - *ppos + 1;
1890 CDEBUG(D_INFO, "Writing inode %lu, %ld bytes, offset %Lu\n",
1891 inode->i_ino, (long)chunk, *ppos);
1893 struct ost_lvb *xtimes;
1894 /* write under locks
1896 * 1. update inode's mtime and ctime as long as
1897 * concurrent stat (via ll_glimpse_size) might bring
1900 * 2. update lsm so that next stat (via
1901 * ll_glimpse_size) could get correct values in lsm */
1902 OBD_ALLOC_PTR(xtimes);
1904 GOTO(out_unlock, retval = -ENOMEM);
1906 lov_stripe_lock(lsm);
1907 LTIME_S(inode->i_mtime) = LTIME_S(CURRENT_TIME);
1908 LTIME_S(inode->i_ctime) = LTIME_S(CURRENT_TIME);
1909 xtimes->lvb_mtime = LTIME_S(inode->i_mtime);
1910 xtimes->lvb_ctime = LTIME_S(inode->i_ctime);
1911 obd_update_lvb(sbi->ll_osc_exp, lsm, xtimes,
1912 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1913 lov_stripe_unlock(lsm);
1914 OBD_FREE_PTR(xtimes);
1916 ltd.lock_style = LL_LOCK_STYLE_TREELOCK;
1918 #ifdef HAVE_FILE_WRITEV
1919 retval = generic_file_writev(file, iov_copy, nrsegs_copy, ppos);
1921 retval = generic_file_aio_write(iocb, iov_copy, nrsegs_copy,
/* lockless path: re-check limits ourselves, then do direct I/O */
1925 size_t ocount, ncount;
1927 retval = ll_generic_segment_checks(iov_copy, &nrsegs_copy,
1928 &ocount, VERIFY_READ);
1934 retval = generic_write_checks(file, ppos, &ncount, 0);
1938 if (unlikely(ocount != ncount)) {
1939 /* we are allowed to modify the original iov too */
1940 nrsegs_copy = ll_iov_shorten(iov_copy, nrsegs_copy,
1942 chunk = 0; /* no repetition after the short write */
1945 retval = ll_remove_suid(file, file->f_vfsmnt);
1949 ll_update_time(file);
1950 retval = ll_direct_IO(WRITE, file, iov_copy, *ppos, nr_segs, 0);
1952 lprocfs_counter_add(sbi->ll_stats,
1953 LPROC_LL_LOCKLESS_WRITE,
1958 ll_rw_stats_tally(ll_i2sbi(inode), current->pid, file, chunk, 1);
1962 ll_tree_unlock(<d.u.tree);
/* a full chunk was written and more remains: loop for the next chunk */
1968 if (retval == chunk && count > 0)
1973 up(&ll_i2info(inode)->lli_write_sem);
1976 if (iov_copy && iov_copy != iov)
1977 OBD_FREE(iov_copy, sizeof(*iov) * nrsegs_orig);
1979 retval = (sum > 0) ? sum : retval;
1980 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1981 retval > 0 ? retval : 0);
/* Classic write(2) entry point: wraps the single user buffer in an
 * iovec and forwards to writev (old kernels) or a synchronous kiocb
 * into ll_file_aio_write() (AIO kernels). */
1985 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1988 struct iovec local_iov = { .iov_base = (void __user *)buf,
1991 #ifdef HAVE_FILE_WRITEV
1992 return ll_file_writev(file, &local_iov, 1, ppos);
1997 init_sync_kiocb(&kiocb, file);
1998 kiocb.ki_pos = *ppos;
1999 kiocb.ki_left = count;
2001 ret = ll_file_aio_write(&kiocb, &local_iov, 1, kiocb.ki_pos);
2002 *ppos = kiocb.ki_pos;
2008 #ifdef HAVE_KERNEL_SENDFILE
2010 * Send file content (through pagecache) somewhere with helper
/* sendfile(2) support: takes a PR tree lock on the whole region, fixes
 * up i_size from the known-minimum size (kms), then hands off to
 * generic_file_sendfile().  NOTE(review): braces/returns elided by
 * extraction. */
2012 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,
2013 size_t count, read_actor_t actor, void *target)
2015 struct inode *inode = in_file->f_dentry->d_inode;
2016 struct ll_inode_info *lli = ll_i2info(inode);
2017 struct lov_stripe_md *lsm = lli->lli_smd;
2018 struct ll_lock_tree tree;
2019 struct ll_lock_tree_node *node;
2021 struct ll_ra_read bead;
2026 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size=%lu,offset=%Ld\n",
2027 inode->i_ino, inode->i_generation, inode, (unsigned long)count,
2030 /* "If nbyte is 0, read() will return 0 and have no other results."
2031 * -- Single Unix Spec */
2035 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_READ_BYTES, count);
2036 /* turn off the kernel's read-ahead */
2037 in_file->f_ra.ra_pages = 0;
2039 /* File with no objects, nothing to lock */
2041 rc = generic_file_sendfile(in_file, ppos, count, actor, target);
2045 node = ll_node_from_inode(inode, *ppos, *ppos + count - 1, LCK_PR);
2047 RETURN(PTR_ERR(node));
2049 tree.lt_fd = LUSTRE_FPRIVATE(in_file);
2050 rc = ll_tree_lock(&tree, node, NULL, count,
2051 in_file->f_flags & O_NONBLOCK?LDLM_FL_BLOCK_NOWAIT:0);
2055 ll_clear_file_contended(inode);
2056 ll_inode_size_lock(inode, 1);
2058 * Consistency guarantees: following possibilities exist for the
2059 * relation between region being read and real file size at this
2062 * (A): the region is completely inside of the file;
2064 * (B-x): x bytes of region are inside of the file, the rest is
2067 * (C): the region is completely outside of the file.
2069 * This classification is stable under DLM lock acquired by
2070 * ll_tree_lock() above, because to change class, other client has to
2071 * take DLM lock conflicting with our lock. Also, any updates to
2072 * ->i_size by other threads on this client are serialized by
2073 * ll_inode_size_lock(). This guarantees that short reads are handled
2074 * correctly in the face of concurrent writes and truncates.
2076 inode_init_lvb(inode, &lvb);
2077 obd_merge_lvb(ll_i2sbi(inode)->ll_osc_exp, lsm, &lvb, 1);
2079 if (*ppos + count - 1 > kms) {
2080 /* A glimpse is necessary to determine whether we return a
2081 * short read (B) or some zeroes at the end of the buffer (C) */
2082 ll_inode_size_unlock(inode, 1);
2083 rc = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
2087 /* region is within kms and, hence, within real file size (A) */
2088 i_size_write(inode, kms);
2089 ll_inode_size_unlock(inode, 1);
2092 CDEBUG(D_INFO, "Send ino %lu, %lu bytes, offset %lld, i_size %llu\n",
2093 inode->i_ino, (unsigned long)count, *ppos, i_size_read(inode));
2095 ll_ra_read_init(in_file, &bead, *ppos, count);
2097 file_accessed(in_file);
2098 rc = generic_file_sendfile(in_file, ppos, count, actor, target);
2099 ll_ra_read_ex(in_file, &bead);
2102 ll_tree_unlock(&tree);
2108 * http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=f0930fffa99e7fe0a0c4b6c7d9a244dc88288c27
2110 #ifdef HAVE_KERNEL_SPLICE_READ
/* splice_read support; structurally identical to ll_file_sendfile()
 * above but targeting a pipe via generic_file_splice_read(). */
2111 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
2112 struct pipe_inode_info *pipe, size_t count,
2115 struct inode *inode = in_file->f_dentry->d_inode;
2116 struct ll_inode_info *lli = ll_i2info(inode);
2117 struct lov_stripe_md *lsm = lli->lli_smd;
2118 struct ll_lock_tree tree;
2119 struct ll_lock_tree_node *node;
2121 struct ll_ra_read bead;
2126 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size=%lu,offset=%Ld\n",
2127 inode->i_ino, inode->i_generation, inode, (unsigned long)count,
2130 /* "If nbyte is 0, read() will return 0 and have no other results."
2131 * -- Single Unix Spec */
2135 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_READ_BYTES, count);
2136 /* turn off the kernel's read-ahead */
2137 in_file->f_ra.ra_pages = 0;
2139 /* File with no objects, nothing to lock */
2141 rc = generic_file_splice_read(in_file, ppos, pipe, count, flags);
2145 node = ll_node_from_inode(inode, *ppos, *ppos + count - 1, LCK_PR);
2147 RETURN(PTR_ERR(node));
2149 tree.lt_fd = LUSTRE_FPRIVATE(in_file);
2150 rc = ll_tree_lock(&tree, node, NULL, count,
2151 in_file->f_flags & O_NONBLOCK?LDLM_FL_BLOCK_NOWAIT:0);
2155 ll_clear_file_contended(inode);
2156 ll_inode_size_lock(inode, 1);
2158 * Consistency guarantees: following possibilities exist for the
2159 * relation between region being read and real file size at this
2162 * (A): the region is completely inside of the file;
2164 * (B-x): x bytes of region are inside of the file, the rest is
2167 * (C): the region is completely outside of the file.
2169 * This classification is stable under DLM lock acquired by
2170 * ll_tree_lock() above, because to change class, other client has to
2171 * take DLM lock conflicting with our lock. Also, any updates to
2172 * ->i_size by other threads on this client are serialized by
2173 * ll_inode_size_lock(). This guarantees that short reads are handled
2174 * correctly in the face of concurrent writes and truncates.
2176 inode_init_lvb(inode, &lvb);
2177 obd_merge_lvb(ll_i2sbi(inode)->ll_osc_exp, lsm, &lvb, 1);
2179 if (*ppos + count - 1 > kms) {
2180 /* A glimpse is necessary to determine whether we return a
2181 * short read (B) or some zeroes at the end of the buffer (C) */
2182 ll_inode_size_unlock(inode, 1);
2183 rc = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
2187 /* region is within kms and, hence, within real file size (A) */
2188 i_size_write(inode, kms);
2189 ll_inode_size_unlock(inode, 1);
2192 CDEBUG(D_INFO, "Send ino %lu, %lu bytes, offset %lld, i_size %llu\n",
2193 inode->i_ino, (unsigned long)count, *ppos, i_size_read(inode));
2195 ll_ra_read_init(in_file, &bead, *ppos, count);
2197 file_accessed(in_file);
2198 rc = generic_file_splice_read(in_file, ppos, pipe, count, flags);
2199 ll_ra_read_ex(in_file, &bead);
2202 ll_tree_unlock(&tree);
/* Ask the OSC to recreate an OST object for this inode (used after an
 * OST object is lost): clones the striping metadata, marks the obdo
 * with OBD_FL_RECREATE_OBJS and the target OST index, and calls
 * obd_create() under lli_size_sem.  NOTE(review): obdo allocation and
 * several error-path lines were elided by extraction. */
2207 static int ll_lov_recreate(struct inode *inode, obd_id id, obd_gr gr,
2210 struct ll_inode_info *lli = ll_i2info(inode);
2211 struct obd_export *exp = ll_i2obdexp(inode);
2212 struct obd_trans_info oti = { 0 };
2213 struct obdo *oa = NULL;
2216 struct lov_stripe_md *lsm, *lsm2;
2223 down(&lli->lli_size_sem);
2226 GOTO(out, rc = -ENOENT);
2227 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
2228 (lsm->lsm_stripe_count));
2230 OBD_ALLOC(lsm2, lsm_size);
2232 GOTO(out, rc = -ENOMEM);
/* o_nlink carries the target OST index for the recreate request */
2236 oa->o_nlink = ost_idx;
2237 oa->o_flags |= OBD_FL_RECREATE_OBJS;
2238 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS;
2239 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
2240 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
2242 memcpy(lsm2, lsm, lsm_size);
2243 rc = obd_create(exp, oa, &lsm2, &oti);
2245 OBD_FREE(lsm2, lsm_size);
2248 up(&lli->lli_size_sem);
/* LL_IOC_RECREATE_OBJ ioctl handler: copies the request from
 * userspace (root only) and forwards to ll_lov_recreate(). */
2253 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
2255 struct ll_recreate_obj ucreat;
2258 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2261 if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
2262 sizeof(struct ll_recreate_obj)))
2265 RETURN(ll_lov_recreate(inode, ucreat.lrc_id, 0,
2266 ucreat.lrc_ost_idx));
/* FID-based variant of the recreate ioctl (root only): unpacks object
 * id and OST index from the packed lu_fid fields. */
2269 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
2276 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2279 if (copy_from_user(&fid, (struct lu_fid *)arg,
2280 sizeof(struct lu_fid)))
/* id = oid | low 16 bits of seq << 32; ost_idx = bits 16-31 of seq */
2283 id = fid_oid(&fid) | ((fid_seq(&fid) & 0xffff) << 32);
2284 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
2285 RETURN(ll_lov_recreate(inode, id, 0, ost_idx));
/* Apply user-supplied striping (lov_user_md) to a file by replaying an
 * IT_OPEN intent carrying the EA; fails if a stripe already exists.
 * Holds lli_size_sem across the check-and-set.  NOTE(review): the lsm
 * existence check and success-path lines were elided by extraction. */
2288 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
2289 int flags, struct lov_user_md *lum,
2292 struct ll_inode_info *lli = ll_i2info(inode);
2293 struct lov_stripe_md *lsm;
2294 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
2298 down(&lli->lli_size_sem);
2301 up(&lli->lli_size_sem);
2302 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
2307 rc = ll_intent_file_open(file, lum, lum_size, &oit);
2310 rc = oit.d.lustre.it_status;
2312 GOTO(out_req_free, rc);
2314 ll_release_openhandle(file->f_dentry, &oit);
2317 up(&lli->lli_size_sem);
2318 ll_intent_release(&oit);
2321 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/* Fetch striping EA for @filename from the MDS via mdc_getattr_name(),
 * validate the LOV magic, byte-swap to host endianness when needed,
 * and expand LOV_MAGIC_JOIN metadata into a lov_user_md_join with
 * per-stripe extents.  On success *lmmp/*lmm_size/*request are set;
 * caller finishes the request.  NOTE(review): numerous lines (ENTRY,
 * returns, some braces) were elided by extraction. */
2325 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
2326 struct lov_mds_md **lmmp, int *lmm_size,
2327 struct ptlrpc_request **request)
2329 struct ll_sb_info *sbi = ll_i2sbi(inode);
2331 struct mds_body *body;
2332 struct lov_mds_md *lmm = NULL;
2333 struct ptlrpc_request *req = NULL;
2336 ll_inode2fid(&fid, inode);
2338 rc = ll_get_max_mdsize(sbi, &lmmsize);
2342 rc = mdc_getattr_name(sbi->ll_mdc_exp, &fid,
2343 filename, strlen(filename) + 1,
2344 OBD_MD_FLEASIZE | OBD_MD_FLDIREA,
2347 CDEBUG(D_INFO, "mdc_getattr_name failed "
2348 "on %s: rc %d\n", filename, rc);
2352 body = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF,
2354 LASSERT(body != NULL); /* checked by mdc_getattr_name */
2355 /* swabbed by mdc_getattr_name */
2356 LASSERT(lustre_rep_swabbed(req, REPLY_REC_OFF));
2358 lmmsize = body->eadatasize;
2360 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
2362 GOTO(out, rc = -ENODATA);
2365 lmm = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF + 1,
2367 LASSERT(lmm != NULL);
2368 LASSERT(lustre_rep_swabbed(req, REPLY_REC_OFF + 1));
2370 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
2371 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3)) &&
2372 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_JOIN))) {
2373 GOTO(out, rc = -EPROTO);
2376 * This is coming from the MDS, so is probably in
2377 * little endian. We convert it to host endian before
2378 * passing it to userspace.
/* swab only on big-endian hosts (LOV_MAGIC differs from its LE form) */
2380 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
2381 /* if function called for directory - we should
2382 * avoid swab not existent lsm objects */
2383 if ((lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) ||
2384 (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3))) {
2385 lustre_swab_lov_user_md((struct lov_user_md*)lmm);
2386 if (S_ISREG(body->mode))
2387 lustre_swab_lov_user_md_objects(
2388 (struct lov_user_md*)lmm);
2389 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_JOIN)) {
2390 lustre_swab_lov_user_md_join((struct lov_user_md_join *)lmm);
2394 if (lmm->lmm_magic == LOV_MAGIC_JOIN) {
2395 struct lov_stripe_md *lsm;
2396 struct lov_user_md_join *lmj;
2397 int lmj_size, i, aindex = 0;
2399 rc = obd_unpackmd(sbi->ll_osc_exp, &lsm, lmm, lmmsize);
2401 GOTO(out, rc = -ENOMEM);
2402 rc = obd_checkmd(sbi->ll_osc_exp, sbi->ll_mdc_exp, lsm);
2404 GOTO(out_free_memmd, rc);
2406 lmj_size = sizeof(struct lov_user_md_join) +
2407 lsm->lsm_stripe_count *
2408 sizeof(struct lov_user_ost_data_join);
2409 OBD_ALLOC(lmj, lmj_size);
2411 GOTO(out_free_memmd, rc = -ENOMEM);
2413 memcpy(lmj, lmm, sizeof(struct lov_user_md_join));
2414 for (i = 0; i < lsm->lsm_stripe_count; i++) {
2415 struct lov_extent *lex =
2416 &lsm->lsm_array->lai_ext_array[aindex];
/* advance to the join extent that covers stripe i */
2418 if (lex->le_loi_idx + lex->le_stripe_count <= i)
2420 CDEBUG(D_INFO, "aindex %d i %d l_extent_start "
2421 LPU64" len %d\n", aindex, i,
2422 lex->le_start, (int)lex->le_len);
2423 lmj->lmm_objects[i].l_extent_start =
2426 if ((int)lex->le_len == -1)
2427 lmj->lmm_objects[i].l_extent_end = -1;
2429 lmj->lmm_objects[i].l_extent_end =
2430 lex->le_start + lex->le_len;
2431 lmj->lmm_objects[i].l_object_id =
2432 lsm->lsm_oinfo[i]->loi_id;
2433 lmj->lmm_objects[i].l_object_gr =
2434 lsm->lsm_oinfo[i]->loi_gr;
2435 lmj->lmm_objects[i].l_ost_gen =
2436 lsm->lsm_oinfo[i]->loi_ost_gen;
2437 lmj->lmm_objects[i].l_ost_idx =
2438 lsm->lsm_oinfo[i]->loi_ost_idx;
2440 lmm = (struct lov_mds_md *)lmj;
2443 obd_free_memmd(sbi->ll_osc_exp, &lsm);
2447 *lmm_size = lmmsize;
/* LL_IOC_LOV_SETEA handler (root only): copies a lov_user_md plus one
 * ost_data entry from userspace and applies it via
 * ll_lov_setstripe_ea_info(); frees the temporary buffer on all
 * paths. */
2451 static int ll_lov_setea(struct inode *inode, struct file *file,
2454 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2455 struct lov_user_md *lump;
2456 int lum_size = sizeof(struct lov_user_md) +
2457 sizeof(struct lov_user_ost_data);
2461 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2464 OBD_ALLOC(lump, lum_size);
2468 rc = copy_from_user(lump, (struct lov_user_md *)arg, lum_size);
2470 OBD_FREE(lump, lum_size);
2474 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
2476 OBD_FREE(lump, lum_size);
/* LL_IOC_LOV_SETSTRIPE handler: reads a v1 lov_user_md first, upgrades
 * to v3 if its magic says so, applies the striping, then writes the
 * resulting stripe info back to the user's buffer. */
2480 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2483 struct lov_user_md_v3 lumv3;
2484 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
2485 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
2486 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
2489 int flags = FMODE_WRITE;
2492 /* first try with v1 which is smaller than v3 */
2493 lum_size = sizeof(struct lov_user_md_v1);
2494 rc = copy_from_user(lumv1, lumv1p, lum_size);
2498 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
2499 lum_size = sizeof(struct lov_user_md_v3);
2500 rc = copy_from_user(&lumv3, lumv3p, lum_size);
2505 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
/* echo the resulting striping back to the caller's buffer */
2507 put_user(0, &lumv1p->lmm_stripe_count);
2508 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2obdexp(inode),
2509 0, ll_i2info(inode)->lli_smd,
/*
 * LL_IOC_LOV_GETSTRIPE handler: hand the file's stripe metadata (lsm)
 * to the LOV layer, which copies the user-visible form out to *arg.
 * NOTE(review): the missing lines presumably handle lsm == NULL (file
 * with no objects) — confirm against the full source.
 */
2515 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
2517 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
2522 return obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2obdexp(inode), 0, lsm,
/*
 * LL_IOC_GROUP_LOCK handler: take a whole-file (0..OBD_OBJECT_EOF)
 * LCK_GROUP extent lock with group id 'arg' and remember its handle in
 * the per-open-file data.  Fails if this fd already holds a group lock.
 */
2526 static int ll_get_grouplock(struct inode *inode, struct file *file,
2529 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2530 ldlm_policy_data_t policy = { .l_extent = { .start = 0,
2531 .end = OBD_OBJECT_EOF}};
2532 struct lustre_handle lockh = { 0 };
2533 struct ll_inode_info *lli = ll_i2info(inode);
2534 struct lov_stripe_md *lsm = lli->lli_smd;
/* one group lock per fd: reject a second request on this descriptor */
2538 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2542 policy.l_extent.gid = arg;
/* honor O_NONBLOCK: don't wait for a conflicting group lock */
2543 if (file->f_flags & O_NONBLOCK)
2544 flags = LDLM_FL_BLOCK_NOWAIT;
2546 rc = ll_extent_lock(fd, inode, lsm, LCK_GROUP, &policy, &lockh, flags);
/* while group-locked, ordinary extent locking is bypassed on this fd */
2550 fd->fd_flags |= LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK;
2552 memcpy(&fd->fd_cwlockh, &lockh, sizeof(lockh));
/*
 * LL_IOC_GROUP_UNLOCK handler: drop the group lock previously taken by
 * ll_get_grouplock() on this fd.  The group id 'arg' must match the one
 * the lock was taken with.
 */
2557 static int ll_put_grouplock(struct inode *inode, struct file *file,
2560 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2561 struct ll_inode_info *lli = ll_i2info(inode);
2562 struct lov_stripe_md *lsm = lli->lli_smd;
2566 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2567 /* Ugh, it's already unlocked. */
2571 if (fd->fd_gid != arg) /* Ugh? Unlocking with different gid? */
/* clear flags before the unlock so normal locking resumes afterwards */
2574 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
2576 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP, &fd->fd_cwlockh);
2581 memset(&fd->fd_cwlockh, 0, sizeof(fd->fd_cwlockh));
2586 #if LUSTRE_FIX >= 50
/*
 * Validate a file-join request: server must advertise join support,
 * both inodes must be regular files, a file cannot be joined to itself,
 * and the head's size must be a multiple of JOIN_FILE_ALIGN (64K).
 * NOTE(review): the error codes returned on each failure are in lines
 * missing from this extract.
 */
2587 static int join_sanity_check(struct inode *head, struct inode *tail)
2590 if ((ll_i2sbi(head)->ll_flags & LL_SBI_JOIN) == 0) {
2591 CERROR("server do not support join \n");
2594 if (!S_ISREG(tail->i_mode) || !S_ISREG(head->i_mode)) {
2595 CERROR("tail ino %lu and ino head %lu must be regular\n",
2596 head->i_ino, tail->i_ino);
2599 if (head->i_ino == tail->i_ino) {
2600 CERROR("file %lu can not be joined to itself \n", head->i_ino);
2603 if (i_size_read(head) % JOIN_FILE_ALIGN) {
2604 CERROR("hsize %llu must be times of 64K\n", i_size_read(head));
/*
 * Perform the MDS side of a file join: send an IT_OPEN intent with
 * create mode M_JOIN_FILE for the tail under the head inode, carrying
 * the head's current size as intent data.  Any lock or open handle the
 * enqueue produced is released immediately — only the join side effect
 * on the MDS is wanted.
 */
2610 static int join_file(struct inode *head_inode, struct file *head_filp,
2611 struct file *tail_filp)
2613 struct dentry *tail_dentry = tail_filp->f_dentry;
2614 struct lookup_intent oit = {.it_op = IT_OPEN,
2615 .it_flags = head_filp->f_flags,
2616 .it_create_mode = M_JOIN_FILE};
2617 struct ldlm_enqueue_info einfo = { LDLM_IBITS, LCK_PW,
2618 ll_mdc_blocking_ast, ldlm_completion_ast, NULL, NULL };
2620 struct lustre_handle lockh;
2621 struct mdc_op_data *op_data;
2626 tail_dentry = tail_filp->f_dentry;
2628 OBD_ALLOC_PTR(op_data);
2629 if (op_data == NULL) {
/* head size rides along as intent data; join requires 64K alignment */
2633 data = i_size_read(head_inode);
2634 ll_prepare_mdc_op_data(op_data, head_inode,
2635 tail_dentry->d_parent->d_inode,
2636 tail_dentry->d_name.name,
2637 tail_dentry->d_name.len, 0, &data);
2638 rc = mdc_enqueue(ll_i2mdcexp(head_inode), &einfo, &oit,
2639 op_data, &lockh, NULL, 0, 0);
2644 rc = oit.d.lustre.it_status;
2646 if (rc < 0 || it_open_error(DISP_OPEN_OPEN, &oit)) {
2647 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, &oit);
2648 ptlrpc_req_finished((struct ptlrpc_request *)
2649 oit.d.lustre.it_data);
2653 if (oit.d.lustre.it_lock_mode) { /* If we got lock - release it right
2655 ldlm_lock_decref(&lockh, oit.d.lustre.it_lock_mode);
2656 oit.d.lustre.it_lock_mode = 0;
2658 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/* the open handle from the intent is not kept; close it right away */
2659 it_clear_disposition(&oit, DISP_ENQ_COMPLETE);
2660 ll_release_openhandle(head_filp->f_dentry, &oit);
2663 OBD_FREE_PTR(op_data);
2664 ll_intent_release(&oit);
/*
 * LL_IOC_JOIN entry point: append file 'filename_tail' to 'head'.
 * Opens the tail, takes whole-file LCK_EX tree locks on both inodes in
 * ascending-ino order (deadlock avoidance), sanity-checks the pair,
 * asks the MDS to join them, then unwinds via the cleanup_phase switch.
 * On success the head's cached stripe md is discarded so the joined
 * layout is refetched.
 */
2668 static int ll_file_join(struct inode *head, struct file *filp,
2669 char *filename_tail)
2671 struct inode *tail = NULL, *first = NULL, *second = NULL;
2672 struct dentry *tail_dentry;
2673 struct file *tail_filp, *first_filp, *second_filp;
2674 struct ll_lock_tree first_tree, second_tree;
2675 struct ll_lock_tree_node *first_node, *second_node;
2676 struct ll_inode_info *hlli = ll_i2info(head);
2677 int rc = 0, cleanup_phase = 0;
2680 CDEBUG(D_VFSTRACE, "VFS Op:head=%lu/%u(%p) tail %s\n",
2681 head->i_ino, head->i_generation, head, filename_tail);
2683 tail_filp = filp_open(filename_tail, O_WRONLY, 0644);
2684 if (IS_ERR(tail_filp)) {
2685 CERROR("Can not open tail file %s", filename_tail);
2686 rc = PTR_ERR(tail_filp);
/* hold our own reference on the tail inode across the join */
2689 tail = igrab(tail_filp->f_dentry->d_inode);
2691 tail_dentry = tail_filp->f_dentry;
2692 LASSERT(tail_dentry);
2695 /*reorder the inode for lock sequence*/
2696 first = head->i_ino > tail->i_ino ? head : tail;
2697 second = head->i_ino > tail->i_ino ? tail : head;
2698 first_filp = head->i_ino > tail->i_ino ? filp : tail_filp;
2699 second_filp = head->i_ino > tail->i_ino ? tail_filp : filp;
2701 CDEBUG(D_INFO, "reorder object from %lu:%lu to %lu:%lu \n",
2702 head->i_ino, tail->i_ino, first->i_ino, second->i_ino);
2703 first_node = ll_node_from_inode(first, 0, OBD_OBJECT_EOF, LCK_EX);
2704 if (IS_ERR(first_node)){
2705 rc = PTR_ERR(first_node);
2708 first_tree.lt_fd = first_filp->private_data;
2709 rc = ll_tree_lock(&first_tree, first_node, NULL, 0, 0);
2714 second_node = ll_node_from_inode(second, 0, OBD_OBJECT_EOF, LCK_EX);
2715 if (IS_ERR(second_node)){
2716 rc = PTR_ERR(second_node);
2719 second_tree.lt_fd = second_filp->private_data;
2720 rc = ll_tree_lock(&second_tree, second_node, NULL, 0, 0);
2725 rc = join_sanity_check(head, tail);
2729 rc = join_file(head, filp, tail_filp);
/* staged teardown: each phase falls through to undo earlier setup */
2733 switch (cleanup_phase) {
2735 ll_tree_unlock(&second_tree);
2736 obd_cancel_unused(ll_i2obdexp(second),
2737 ll_i2info(second)->lli_smd, 0, NULL);
2739 ll_tree_unlock(&first_tree);
2740 obd_cancel_unused(ll_i2obdexp(first),
2741 ll_i2info(first)->lli_smd, 0, NULL);
2743 filp_close(tail_filp, 0);
/* join succeeded: drop stale stripe md so the new layout is refetched */
2746 if (head && rc == 0) {
2747 obd_free_memmd(ll_i2sbi(head)->ll_osc_exp,
2749 hlli->lli_smd = NULL;
2754 CERROR("invalid cleanup_phase %d\n", cleanup_phase);
2759 #endif /* LUSTRE_FIX >= 50 */
2762 * Close inode open handle
2764 * \param dentry [in] dentry which contains the inode
2765 * \param it [in,out] intent which contains open info and result
2768 * \retval <0 failure
/*
 * Releases the MDS open handle that an intent-open produced but the
 * caller does not intend to keep (e.g. after a join, or a lookup that
 * opened as a side effect).  No-op for the root dentry or when the
 * intent carries no open disposition.
 */
2770 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2772 struct inode *inode = dentry->d_inode;
2773 struct obd_client_handle *och;
2779 /* Root ? Do nothing. */
2780 if (dentry->d_inode->i_sb->s_root == dentry)
2783 /* No open handle to close? Move away */
2784 if (!it_disposition(it, DISP_OPEN_OPEN))
2787 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2789 OBD_ALLOC(och, sizeof(*och));
2791 GOTO(out, rc = -ENOMEM);
/* fill a client handle from the intent, then close it on the MDS */
2793 ll_och_fill(ll_i2info(inode), it, och);
2795 rc = ll_close_inode_openhandle(inode, och);
2797 OBD_FREE(och, sizeof(*och));
2799 /* this one is in place of ll_file_open */
2800 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2801 ptlrpc_req_finished(it->d.lustre.it_data);
2802 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Core fiemap: validate the requested flags, optionally flush dirty
 * pages (FIEMAP_FLAG_SYNC), then ask the OSC/LOV layer (obd_get_info
 * with KEY_FIEMAP) to fill 'fiemap' with extent mappings.  'num_bytes'
 * is the total size of the fiemap buffer including extents.
 */
2807 int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
2810 struct obd_export *exp = ll_i2obdexp(inode);
2811 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
2812 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
2813 int vallen = num_bytes;
2817 /* Checks for fiemap flags */
2818 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* report back which flags we do NOT support, then bail out */
2819 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2823 /* Check for FIEMAP_FLAG_SYNC */
2824 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2825 rc = filemap_fdatawrite(inode->i_mapping);
2830 /* If the stripe_count > 1 and the application does not understand
2831 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
2833 if (lsm->lsm_stripe_count > 1 &&
2834 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
2837 fm_key.oa.o_id = lsm->lsm_object_id;
2838 fm_key.oa.o_valid = OBD_MD_FLID;
2840 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLFID | OBD_MD_FLSIZE);
2842 /* If filesize is 0, then there would be no objects for mapping */
2843 if (fm_key.oa.o_size == 0) {
2844 fiemap->fm_mapped_extents = 0;
2848 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
2850 rc = obd_get_info(exp, sizeof(fm_key), &fm_key, &vallen, fiemap, lsm);
2852 CERROR("obd_get_info failed: rc = %d\n", rc);
/*
 * FSFILT_IOC_FIEMAP handler: size a kernel fiemap buffer from the
 * user-supplied fm_extent_count, copy the request in, run
 * ll_do_fiemap(), and copy the header plus mapped extents back out.
 * NOTE(review): extent_count comes from userspace and feeds the
 * num_bytes multiplication; the missing lines may or may not bound it —
 * verify overflow handling in the full source.
 */
2857 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
2859 struct ll_user_fiemap *fiemap_s;
2860 size_t num_bytes, ret_bytes;
2861 unsigned int extent_count;
2864 /* Get the extent count so we can calculate the size of
2865 * required fiemap buffer */
2866 if (get_user(extent_count,
2867 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
2869 num_bytes = sizeof(*fiemap_s) + (extent_count *
2870 sizeof(struct ll_fiemap_extent));
/* vmalloc: the buffer can be large for big extent counts */
2872 OBD_VMALLOC(fiemap_s, num_bytes);
2873 if (fiemap_s == NULL)
2876 /* get the fiemap value */
2877 if (copy_from_user(fiemap_s,(struct ll_user_fiemap __user *)arg,
2879 GOTO(error, rc = -EFAULT);
2881 /* If fm_extent_count is non-zero, read the first extent since
2882 * it is used to calculate end_offset and device from previous
2885 if (copy_from_user(&fiemap_s->fm_extents[0],
2886 (char __user *)arg + sizeof(*fiemap_s),
2887 sizeof(struct ll_fiemap_extent)))
2888 GOTO(error, rc = -EFAULT);
2891 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
2895 ret_bytes = sizeof(struct ll_user_fiemap);
2897 if (extent_count != 0)
2898 ret_bytes += (fiemap_s->fm_mapped_extents *
2899 sizeof(struct ll_fiemap_extent));
2901 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
2905 OBD_VFREE(fiemap_s, num_bytes);
/*
 * Main ioctl dispatcher for regular files.  Compiled with either the
 * unlocked_ioctl signature (inode derived from the file) or the legacy
 * ioctl signature, selected by HAVE_UNLOCKED_IOCTL.  Unknown commands
 * are first offered to dynamically registered handlers
 * (ll_iocontrol_call) and finally punted to obd_iocontrol().
 */
2909 #ifdef HAVE_UNLOCKED_IOCTL
2910 long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2912 struct inode *inode = file->f_dentry->d_inode;
2914 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
2918 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2922 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2923 inode->i_generation, inode, cmd);
2924 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2926 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2927 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2931 case LL_IOC_GETFLAGS:
2932 /* Get the current value of the file flags */
2933 return put_user(fd->fd_flags, (int *)arg);
2934 case LL_IOC_SETFLAGS:
2935 case LL_IOC_CLRFLAGS:
2936 /* Set or clear specific file flags */
2937 /* XXX This probably needs checks to ensure the flags are
2938 * not abused, and to handle any flag side effects.
2940 if (get_user(flags, (int *) arg))
2943 if (cmd == LL_IOC_SETFLAGS) {
/* lockless i/o only makes sense for O_DIRECT files */
2944 if ((flags & LL_FILE_IGNORE_LOCK) &&
2945 !(file->f_flags & O_DIRECT)) {
2946 CERROR("%s: unable to disable locking on "
2947 "non-O_DIRECT file\n", current->comm);
2951 fd->fd_flags |= flags;
2953 fd->fd_flags &= ~flags;
2956 case LL_IOC_LOV_SETSTRIPE:
2957 RETURN(ll_lov_setstripe(inode, file, arg));
2958 case LL_IOC_LOV_SETEA:
2959 RETURN(ll_lov_setea(inode, file, arg));
2960 case LL_IOC_LOV_GETSTRIPE:
2961 RETURN(ll_lov_getstripe(inode, arg));
2962 case LL_IOC_RECREATE_OBJ:
2963 RETURN(ll_lov_recreate_obj(inode, arg));
2964 case LL_IOC_RECREATE_FID:
2965 RETURN(ll_lov_recreate_fid(inode, arg));
2966 case FSFILT_IOC_FIEMAP:
2967 RETURN(ll_ioctl_fiemap(inode, arg));
2968 case FSFILT_IOC_GETFLAGS:
2969 case FSFILT_IOC_SETFLAGS:
2970 RETURN(ll_iocontrol(inode, file, cmd, arg));
2971 case FSFILT_IOC_GETVERSION_OLD:
2972 case FSFILT_IOC_GETVERSION:
2973 RETURN(put_user(inode->i_generation, (int *)arg));
2975 #if LUSTRE_FIX >= 50
2976 /* Allow file join in beta builds to allow debuggging */
2980 ftail = getname((const char *)arg);
2982 RETURN(PTR_ERR(ftail));
2983 rc = ll_file_join(inode, file, ftail);
2987 CWARN("file join is not supported in this version of Lustre\n");
2991 case LL_IOC_GROUP_LOCK:
2992 RETURN(ll_get_grouplock(inode, file, arg));
2993 case LL_IOC_GROUP_UNLOCK:
2994 RETURN(ll_put_grouplock(inode, file, arg));
2995 case IOC_OBD_STATFS:
2996 RETURN(ll_obd_statfs(inode, (void *)arg));
2997 case OBD_IOC_GETNAME_OLD:
2998 case OBD_IOC_GETNAME: {
2999 struct obd_device *obd =
3000 class_exp2obd(ll_i2sbi(inode)->ll_osc_exp);
3003 if (copy_to_user((void *)arg, obd->obd_name,
3004 strlen(obd->obd_name) + 1))
3008 case LL_IOC_PATH2FID: {
3009 if (copy_to_user((void *)arg, ll_inode_lu_fid(inode),
3010 sizeof(struct lu_fid)))
3016 /* We need to special case any other ioctls we want to handle,
3017 * to send them to the MDS/OST as appropriate and to properly
3018 * network encode the arg field.
3019 case EXT3_IOC_SETVERSION_OLD:
3020 case EXT3_IOC_SETVERSION:
/* last resorts: dynamically registered handlers, then the OBD layer */
3026 ll_iocontrol_call(inode, file, cmd, arg, &err))
3029 RETURN(obd_iocontrol(cmd, ll_i2obdexp(inode), 0, NULL,
/*
 * llseek for Lustre files.  SEEK_END must glimpse the file size from
 * the OSTs first (other clients may have extended the file); the size
 * is then read under the inode size lock.  The new offset is accepted
 * only if it lies within [0, ll_file_maxbytes(inode)].
 */
3035 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3037 struct inode *inode = file->f_dentry->d_inode;
3038 struct ll_inode_info *lli = ll_i2info(inode);
3039 struct lov_stripe_md *lsm = lli->lli_smd;
3042 retval = offset + ((origin == 2) ? i_size_read(inode) :
3043 (origin == 1) ? file->f_pos : 0);
3044 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%Lu=%#Lx(%s)\n",
3045 inode->i_ino, inode->i_generation, inode, retval, retval,
3046 origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR" : "SEEK_SET");
3047 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3049 if (origin == 2) { /* SEEK_END */
3050 int nonblock = 0, rc;
3052 if (file->f_flags & O_NONBLOCK)
3053 nonblock = LDLM_FL_BLOCK_NOWAIT;
/* fetch an up-to-date size from the OSTs before using i_size */
3056 rc = ll_glimpse_size(inode, nonblock);
3061 ll_inode_size_lock(inode, 0);
3062 offset += i_size_read(inode);
3063 ll_inode_size_unlock(inode, 0);
3064 } else if (origin == 1) { /* SEEK_CUR */
3065 offset += file->f_pos;
3069 if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
3070 if (offset != file->f_pos) {
3071 file->f_pos = offset;
/* invalidate f_version so readdir-style users re-read state */
3072 file->f_version = 0;
/*
 * flush() (called on every close of a descriptor): does no new I/O,
 * only reports — and clears — any asynchronous writeback error that
 * was recorded against this inode or its stripe objects.  Returns
 * -EIO if such an error was pending, else 0.
 */
3080 #ifdef HAVE_FLUSH_OWNER_ID
3081 int ll_flush(struct file *file, fl_owner_t id)
3083 int ll_flush(struct file *file)
3086 struct inode *inode = file->f_dentry->d_inode;
3087 struct ll_inode_info *lli = ll_i2info(inode);
3088 struct lov_stripe_md *lsm = lli->lli_smd;
3091 /* catch async errors that were recorded back when async writeback
3092 * failed for pages in this mapping. */
3093 rc = lli->lli_async_rc;
/* read-and-clear: the error is reported to this caller only */
3094 lli->lli_async_rc = 0;
3096 err = lov_test_and_clear_async_rc(lsm);
3101 return rc ? -EIO : 0;
/*
 * fsync(): wait for in-flight page I/O, surface any recorded async
 * writeback errors, sync the metadata via mdc_sync(), and then sync
 * the data objects via obd_sync_rqset() on the OSC export.
 * NOTE(review): the condition guarding the data-sync branch (likely
 * "lsm != NULL and not a datasync-only call") is in lines missing from
 * this extract — confirm against the full source.
 */
3104 int ll_fsync(struct file *file, struct dentry *dentry, int data)
3106 struct inode *inode = dentry->d_inode;
3107 struct ll_inode_info *lli = ll_i2info(inode);
3108 struct lov_stripe_md *lsm = lli->lli_smd;
3110 struct ptlrpc_request *req;
3113 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
3114 inode->i_generation, inode);
3115 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3117 /* fsync's caller has already called _fdata{sync,write}, we want
3118 * that IO to finish before calling the osc and mdc sync methods */
3119 rc = filemap_fdatawait(inode->i_mapping);
3121 /* catch async errors that were recorded back when async writeback
3122 * failed for pages in this mapping. */
3123 err = lli->lli_async_rc;
3124 lli->lli_async_rc = 0;
3128 err = lov_test_and_clear_async_rc(lsm);
/* metadata sync to the MDS */
3133 ll_inode2fid(&fid, inode);
3134 err = mdc_sync(ll_i2sbi(inode)->ll_mdc_exp, &fid, &req);
3138 ptlrpc_req_finished(req);
/* data sync to the OSTs: build an obdo describing this inode */
3141 struct obd_info *oinfo;
3143 OBD_ALLOC_PTR(oinfo);
3145 RETURN(rc ? rc : -ENOMEM);
3146 OBDO_ALLOC(oinfo->oi_oa);
3147 if (!oinfo->oi_oa) {
3148 OBD_FREE_PTR(oinfo);
3149 RETURN(rc ? rc : -ENOMEM);
3151 oinfo->oi_oa->o_id = lsm->lsm_object_id;
3152 oinfo->oi_oa->o_gr = lsm->lsm_object_gr;
3153 oinfo->oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
3154 obdo_from_inode(oinfo->oi_oa, inode,
3155 OBD_MD_FLTYPE | OBD_MD_FLATIME |
3156 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
3158 err = obd_sync_rqset(ll_i2sbi(inode)->ll_osc_exp, oinfo,
3162 OBDO_FREE(oinfo->oi_oa);
3163 OBD_FREE_PTR(oinfo);
/*
 * POSIX fcntl() / BSD flock() lock handler: translate the kernel
 * file_lock into an LDLM_FLOCK enqueue against the MDS.  Lock type
 * maps to LCK_PR (read), LCK_PW (write) or LCK_NL (unlock); F_GETLK*
 * become test-only enqueues (LDLM_FL_TEST_LOCK).  After a successful
 * server enqueue, the lock is mirrored into the local kernel lock
 * tables (posix_lock_file_wait / flock_lock_file_wait).
 */
3169 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3171 struct inode *inode = file->f_dentry->d_inode;
3172 struct ll_sb_info *sbi = ll_i2sbi(inode);
3173 struct lu_fid *fid = ll_inode_lu_fid(inode);
3174 struct ldlm_res_id res_id =
3175 { .name = { fid_seq(fid),
3179 struct ldlm_enqueue_info einfo = { LDLM_FLOCK, 0, NULL,
3180 ldlm_flock_completion_ast, NULL, file_lock };
3181 struct lustre_handle lockh = {0};
3182 ldlm_policy_data_t flock;
3187 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
3188 inode->i_ino, file_lock);
3189 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3191 if (fid_is_igif(fid)) {
3192 /* If this is an IGIF inode, we need to keep the 1.6-style
3193 * flock mapping for compatibility. If it is a proper FID
3194 * then we know any other client accessing it must also be
3195 * accessing it as a FID and can use the CMD-style flock. */
3196 res_id.name[2] = LDLM_FLOCK;
3200 if (file_lock->fl_flags & FL_FLOCK) {
3201 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3202 /* set missing params for flock() calls */
3203 file_lock->fl_end = OFFSET_MAX;
3204 file_lock->fl_pid = current->tgid;
3206 flock.l_flock.pid = file_lock->fl_pid;
3207 flock.l_flock.start = file_lock->fl_start;
3208 flock.l_flock.end = file_lock->fl_end;
/* map fl_type onto an LDLM lock mode */
3210 switch (file_lock->fl_type) {
3212 einfo.ei_mode = LCK_PR;
3215 /* An unlock request may or may not have any relation to
3216 * existing locks so we may not be able to pass a lock handle
3217 * via a normal ldlm_lock_cancel() request. The request may even
3218 * unlock a byte range in the middle of an existing lock. In
3219 * order to process an unlock request we need all of the same
3220 * information that is given with a normal read or write record
3221 * lock request. To avoid creating another ldlm unlock (cancel)
3222 * message we'll treat a LCK_NL flock request as an unlock. */
3223 einfo.ei_mode = LCK_NL;
3226 einfo.ei_mode = LCK_PW;
3229 CERROR("unknown fcntl lock type: %d\n", file_lock->fl_type);
/* F_SETLK (non-blocking) variant */
3244 flags = LDLM_FL_BLOCK_NOWAIT;
/* F_GETLK: test only, never grant */
3250 flags = LDLM_FL_TEST_LOCK;
3251 /* Save the old mode so that if the mode in the lock changes we
3252 * can decrement the appropriate reader or writer refcount. */
3253 file_lock->fl_type = einfo.ei_mode;
3256 CERROR("unknown fcntl lock command: %d\n", cmd);
3260 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
3261 "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
3262 flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
3264 rc = ldlm_cli_enqueue(sbi->ll_mdc_exp, NULL, &einfo, res_id,
3265 &flock, &flags, NULL, 0, NULL, &lockh, 0);
/* mirror the server-side result into the kernel's lock bookkeeping */
3266 if ((file_lock->fl_flags & FL_FLOCK) &&
3267 (rc == 0 || file_lock->fl_type == F_UNLCK))
3268 rc2 = ll_flock_lock_file_wait(file, file_lock, (cmd == F_SETLKW));
3269 #ifdef HAVE_F_OP_FLOCK
3270 if ((file_lock->fl_flags & FL_POSIX) &&
3271 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3272 !(flags & LDLM_FL_TEST_LOCK))
3273 rc2 = posix_lock_file_wait(file, file_lock);
3276 RETURN(rc ? rc : rc2);
3279 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
/*
 * Test (without taking a reference) whether this client already holds
 * an MDS inodebits lock covering 'bits' on 'inode', in mode l_req_mode
 * (LCK_MINMODE means "any of CR/CW/PR/PW").  Uses a TEST_LOCK match so
 * no lock state is changed.  Returns non-zero if a matching lock exists.
 */
3286 int ll_have_md_lock(struct inode *inode, __u64 bits, ldlm_mode_t l_req_mode)
3288 struct lustre_handle lockh;
3289 struct ldlm_res_id res_id;
3290 struct obd_device *obddev;
3291 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
3292 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
3293 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
3300 obddev = ll_i2mdcexp(inode)->exp_obd;
3301 fid_build_reg_res_name(ll_inode_lu_fid(inode), &res_id);
3303 CDEBUG(D_INFO, "trying to match res "LPU64":"LPU64":"LPU64" mode %s\n",
3307 ldlm_lockname[mode]);
/* TEST_LOCK: look but don't touch — no references taken on a match */
3309 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
3310 if (ldlm_lock_match(obddev->obd_namespace, flags, &res_id, LDLM_IBITS,
3311 &policy, mode, &lockh)) {
/*
 * Common tail for revalidate getattr failures.  -ENOENT from the MDS
 * means the file was already unlinked: treat it as success after
 * adjusting nlink (regular files/dirs are not expected on this path
 * except in rare races).  Any other error is logged and propagated.
 */
3318 static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
3319 if (rc == -ENOENT) { /* Already unlinked. Just update nlink
3320 * and return success */
3322 /* This path cannot be hit for regular files unless in
3323 * case of obscure races, so no need to to validate
3325 if (!S_ISREG(inode->i_mode) &&
3326 !S_ISDIR(inode->i_mode))
3331 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
/*
 * Revalidate an inode's metadata against the MDS.  Two strategies:
 * servers with OBD_CONNECT_ATTRFID get an IT_GETATTR intent lock by
 * FID (no name), which also refreshes dcache state; otherwise, if we
 * don't already hold a covering MD lock for 'ibits', do a plain
 * mdc_getattr and rebuild the inode from the reply.
 */
3339 int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
3342 struct inode *inode = dentry->d_inode;
3343 struct ptlrpc_request *req = NULL;
3344 struct obd_export *exp;
3349 CERROR("REPORT THIS LINE TO PETER\n");
3352 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
3353 inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
3355 exp = ll_i2mdcexp(inode);
3357 if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
3358 struct lookup_intent oit = { .it_op = IT_GETATTR };
3359 struct mdc_op_data op_data = { { 0 } };
3361 /* Call getattr by fid, so do not provide name at all. */
3362 ll_prepare_mdc_op_data(&op_data, dentry->d_parent->d_inode,
3363 dentry->d_inode, NULL, 0, 0, NULL);
/* M_CHECK_STALE makes the MDS validate rather than trust the cache */
3364 oit.it_create_mode |= M_CHECK_STALE;
3365 rc = mdc_intent_lock(exp, &op_data, NULL, 0,
3366 /* we are not interested in name
3369 ll_mdc_blocking_ast, 0);
3370 oit.it_create_mode &= ~M_CHECK_STALE;
3372 rc = ll_inode_revalidate_fini(inode, rc);
3376 rc = revalidate_it_finish(req, DLM_REPLY_REC_OFF, &oit, dentry);
3378 ll_intent_release(&oit);
3382 /* Unlinked? Unhash dentry, so it is not picked up later by
3383 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3384 here to preserve get_cwd functionality on 2.6.
3386 if (!dentry->d_inode->i_nlink) {
3387 spin_lock(&ll_lookup_lock);
3388 spin_lock(&dcache_lock);
3389 ll_drop_dentry(dentry);
3390 spin_unlock(&dcache_lock);
3391 spin_unlock(&ll_lookup_lock);
3394 ll_lookup_finish_locks(&oit, dentry);
/* legacy path: only hit the wire if we lack a covering MD lock */
3395 } else if (!ll_have_md_lock(dentry->d_inode, ibits, LCK_MINMODE)) {
3396 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3398 obd_valid valid = OBD_MD_FLGETATTR;
3401 if (S_ISREG(inode->i_mode)) {
/* regular files may carry a striping EA; size the reply for it */
3402 rc = ll_get_max_mdsize(sbi, &ealen);
3405 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3407 ll_inode2fid(&fid, inode);
3408 rc = mdc_getattr(sbi->ll_mdc_exp, &fid, valid, ealen, &req);
3410 rc = ll_inode_revalidate_fini(inode, rc);
3414 rc = ll_prep_inode(sbi->ll_osc_exp, &inode, req, REPLY_REC_OFF,
3419 ptlrpc_req_finished(req);
/*
 * Revalidate metadata (LOOKUP|UPDATE bits) and then the size: if the
 * file has no data objects yet, take the [amc]times from the cached
 * lvb; otherwise glimpse the size from the OSTs.
 */
3423 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
3425 struct inode *inode = dentry->d_inode;
3429 rc = __ll_inode_revalidate_it(dentry, it, MDS_INODELOCK_UPDATE |
3430 MDS_INODELOCK_LOOKUP);
3432 /* if object not yet allocated, don't validate size */
3433 if (rc == 0 && ll_i2info(inode)->lli_smd == NULL) {
3434 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
3435 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
3436 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
3440 /* ll_glimpse_size will prefer locally cached writes if they extend
3444 rc = ll_glimpse_size(inode, 0);
/*
 * getattr with an explicit intent: revalidate the inode, then populate
 * the kstat from the (now fresh) inode fields.  32-bit-API callers get
 * an ino squashed from the FID; size/blocks are read under the inode
 * size lock for consistency with concurrent size updates.
 */
3449 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
3450 struct lookup_intent *it, struct kstat *stat)
3452 struct inode *inode = de->d_inode;
3453 struct ll_sb_info *sbi = ll_i2sbi(inode);
3454 struct ll_inode_info *lli = ll_i2info(inode);
3457 res = ll_inode_revalidate_it(de, it);
3458 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3463 stat->dev = inode->i_sb->s_dev;
3464 if (ll_need_32bit_api(sbi))
3465 stat->ino = ll_fid_build_ino((struct ll_fid *)&lli->lli_fid, 1);
3467 stat->ino = inode->i_ino;
3468 stat->mode = inode->i_mode;
3469 stat->nlink = inode->i_nlink;
3470 stat->uid = inode->i_uid;
3471 stat->gid = inode->i_gid;
3472 stat->rdev = kdev_t_to_nr(inode->i_rdev);
3473 stat->atime = inode->i_atime;
3474 stat->mtime = inode->i_mtime;
3475 stat->ctime = inode->i_ctime;
3476 #ifdef HAVE_INODE_BLKSIZE
3477 stat->blksize = inode->i_blksize;
3479 stat->blksize = 1<<inode->i_blkbits;
/* size and blocks must be read together under the size lock */
3482 ll_inode_size_lock(inode, 0);
3483 stat->size = i_size_read(inode);
3484 stat->blocks = inode->i_blocks;
3485 ll_inode_size_unlock(inode, 0);
/* VFS ->getattr entry point: delegate with a fresh IT_GETATTR intent. */
3489 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3491 struct lookup_intent it = { .it_op = IT_GETATTR };
3493 return ll_getattr_it(mnt, de, &it, stat);
3496 #ifdef HAVE_LINUX_FIEMAP_H
/*
 * VFS ->fiemap entry point: marshal the kernel fiemap_extent_info into
 * a ll_user_fiemap buffer, run ll_do_fiemap(), and copy flags/extents
 * back out.  Buffer is sized from fi_extents_max.
 * NOTE(review): fi_extents_start is copied with memcpy here, which
 * assumes it points at kernel-accessible memory on this kernel version
 * — confirm against the full source/kernel API.
 */
3497 int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3498 __u64 start, __u64 len)
3502 struct ll_user_fiemap *fiemap;
3503 unsigned int extent_count = fieinfo->fi_extents_max;
3505 num_bytes = sizeof(*fiemap) + (extent_count *
3506 sizeof(struct ll_fiemap_extent));
3507 OBD_VMALLOC(fiemap, num_bytes);
3512 fiemap->fm_flags = fieinfo->fi_flags;
3513 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3514 fiemap->fm_start = start;
3515 fiemap->fm_length = len;
3516 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3517 sizeof(struct ll_fiemap_extent));
3519 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3521 fieinfo->fi_flags = fiemap->fm_flags;
3522 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3523 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3524 fiemap->fm_mapped_extents * sizeof(struct ll_fiemap_extent));
3526 OBD_VFREE(fiemap, num_bytes);
/*
 * ACL permission callback for generic_permission(): check 'mask'
 * against the POSIX ACL cached on the inode (duplicated under
 * lli_lock, released after the check).  Compiled out when
 * CONFIG_FS_POSIX_ACL is disabled.
 */
3533 int lustre_check_acl(struct inode *inode, int mask)
3535 #ifdef CONFIG_FS_POSIX_ACL
3536 struct ll_inode_info *lli = ll_i2info(inode);
3537 struct posix_acl *acl;
3541 spin_lock(&lli->lli_lock);
/* take a private reference so the check runs outside the spinlock */
3542 acl = posix_acl_dup(lli->lli_posix_acl);
3543 spin_unlock(&lli->lli_lock);
3548 rc = posix_acl_permission(inode, acl, mask);
3549 posix_acl_release(acl);
/*
 * VFS ->permission, in two kernel-version variants:
 *  - >= 2.6.10: revalidate the root inode if needed (it is not
 *    validated during lookup), then defer to generic_permission()
 *    with lustre_check_acl as the ACL callback;
 *  - older kernels: an open-coded permission check (owner / group /
 *    other rwx bits, ACLs, immutability, and capability overrides)
 *    mirroring the classic kernel vfs_permission logic.
 */
3557 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10))
3558 #ifndef HAVE_INODE_PERMISION_2ARGS
3559 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3561 int ll_inode_permission(struct inode *inode, int mask)
3567 /* as root inode are NOT getting validated in lookup operation,
3568 * need to do it before permission check. */
3570 if (inode == inode->i_sb->s_root->d_inode) {
3571 struct lookup_intent it = { .it_op = IT_LOOKUP };
3573 rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
3574 MDS_INODELOCK_LOOKUP);
3579 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
3580 inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
3582 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
3583 rc = generic_permission(inode, mask, lustre_check_acl);
/* pre-2.6.10 fallback: hand-rolled classic permission check */
3588 #ifndef HAVE_INODE_PERMISION_2ARGS
3589 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3591 int ll_inode_permission(struct inode *inode, int mask)
3594 int mode = inode->i_mode;
3597 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
3598 inode->i_ino, inode->i_generation, inode, mask);
3599 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
3601 if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
3602 (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
3604 if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
3606 if (current->fsuid == inode->i_uid) {
3609 if (((mode >> 3) & mask & S_IRWXO) != mask)
3611 rc = lustre_check_acl(inode, mask);
3615 goto check_capabilities;
3619 if (in_group_p(inode->i_gid))
3622 if ((mode & mask & S_IRWXO) == mask)
/* capability overrides: DAC_OVERRIDE / DAC_READ_SEARCH */
3626 if (!(mask & MAY_EXEC) ||
3627 (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
3628 if (cfs_capable(CFS_CAP_DAC_OVERRIDE))
3631 if (cfs_capable(CFS_CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) ||
3632 (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))))
3639 /* -o localflock - only provides locally consistent flock locks */
/* Default file_operations (-o localflock implied by absence of flock
 * methods here — flock variants are in the tables below). */
3640 struct file_operations ll_file_operations = {
3641 .read = ll_file_read,
3642 #ifdef HAVE_FILE_READV
3643 .readv = ll_file_readv,
3645 .aio_read = ll_file_aio_read,
3647 .write = ll_file_write,
3648 #ifdef HAVE_FILE_WRITEV
3649 .writev = ll_file_writev,
3651 .aio_write = ll_file_aio_write,
3653 #ifdef HAVE_UNLOCKED_IOCTL
3654 .unlocked_ioctl = ll_file_ioctl,
3656 .ioctl = ll_file_ioctl,
3658 .open = ll_file_open,
3659 .release = ll_file_release,
3660 .mmap = ll_file_mmap,
3661 .llseek = ll_file_seek,
3662 #ifdef HAVE_KERNEL_SPLICE_READ
3663 .splice_read = ll_file_splice_read,
3665 #ifdef HAVE_KERNEL_SENDFILE
3666 .sendfile = ll_file_sendfile,
/* file_operations with cluster-coherent flock/lockctl support
 * (-o flock): identical to the default table plus .flock/.lock
 * wired to ll_file_flock. */
3672 struct file_operations ll_file_operations_flock = {
3673 .read = ll_file_read,
3674 #ifdef HAVE_FILE_READV
3675 .readv = ll_file_readv,
3677 .aio_read = ll_file_aio_read,
3679 .write = ll_file_write,
3680 #ifdef HAVE_FILE_WRITEV
3681 .writev = ll_file_writev,
3683 .aio_write = ll_file_aio_write,
3685 #ifdef HAVE_UNLOCKED_IOCTL
3686 .unlocked_ioctl = ll_file_ioctl,
3688 .ioctl = ll_file_ioctl,
3690 .open = ll_file_open,
3691 .release = ll_file_release,
3692 .mmap = ll_file_mmap,
3693 .llseek = ll_file_seek,
3694 #ifdef HAVE_KERNEL_SPLICE_READ
3695 .splice_read = ll_file_splice_read,
3697 #ifdef HAVE_KERNEL_SENDFILE
3698 .sendfile = ll_file_sendfile,
3702 #ifdef HAVE_F_OP_FLOCK
3703 .flock = ll_file_flock,
3705 .lock = ll_file_flock
3708 /* These are for -o noflock - to return ENOSYS on flock calls */
3709 struct file_operations ll_file_operations_noflock = {
3710 .read = ll_file_read,
3711 #ifdef HAVE_FILE_READV
3712 .readv = ll_file_readv,
3714 .aio_read = ll_file_aio_read,
3716 .write = ll_file_write,
3717 #ifdef HAVE_FILE_WRITEV
3718 .writev = ll_file_writev,
3720 .aio_write = ll_file_aio_write,
3722 #ifdef HAVE_UNLOCKED_IOCTL
3723 .unlocked_ioctl = ll_file_ioctl,
3725 .ioctl = ll_file_ioctl,
3727 .open = ll_file_open,
3728 .release = ll_file_release,
3729 .mmap = ll_file_mmap,
3730 .llseek = ll_file_seek,
3731 #ifdef HAVE_KERNEL_SPLICE_READ
3732 .splice_read = ll_file_splice_read,
3734 #ifdef HAVE_KERNEL_SENDFILE
3735 .sendfile = ll_file_sendfile,
/* ll_file_noflock rejects all lock requests (see above) */
3739 #ifdef HAVE_F_OP_FLOCK
3740 .flock = ll_file_noflock,
3742 .lock = ll_file_noflock
/* inode_operations for regular Lustre files: attribute get/set,
 * truncate, permission checks, xattrs, and (when available) fiemap. */
3745 struct inode_operations ll_file_inode_operations = {
3746 #ifdef HAVE_VFS_INTENT_PATCHES
3747 .setattr_raw = ll_setattr_raw,
3749 .setattr = ll_setattr,
3750 .truncate = ll_truncate,
3751 .getattr = ll_getattr,
3752 .permission = ll_inode_permission,
3753 .setxattr = ll_setxattr,
3754 .getxattr = ll_getxattr,
3755 .listxattr = ll_listxattr,
3756 .removexattr = ll_removexattr,
3757 #ifdef HAVE_LINUX_FIEMAP_H
3758 .fiemap = ll_fiemap,
3762 /* dynamic ioctl number support routins */
/* Registry of dynamically registered ioctl handlers: a list of
 * llioc_data entries guarded by an rw_semaphore (readers = dispatch,
 * writers = register/unregister). */
3763 static struct llioc_ctl_data {
3764 struct rw_semaphore ioc_sem;
3765 struct list_head ioc_head;
3767 __RWSEM_INITIALIZER(llioc.ioc_sem),
3768 CFS_LIST_HEAD_INIT(llioc.ioc_head)
/* One registration: callback + the list of ioctl cmd numbers it
 * handles (flexible trailing array, iocd_count entries). */
3773 struct list_head iocd_list;
3774 unsigned int iocd_size;
3775 llioc_callback_t iocd_cb;
3776 unsigned int iocd_count;
3777 unsigned int iocd_cmd[0];
/*
 * Register a dynamic ioctl handler: callback 'cb' for the 'count'
 * command numbers in 'cmd'.  Returns an opaque cookie (the allocation)
 * to pass to ll_iocontrol_unregister(), or NULL on bad args / OOM.
 */
3780 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3783 struct llioc_data *in_data = NULL;
3786 if (cb == NULL || cmd == NULL ||
3787 count > LLIOC_MAX_CMD || count < 0)
3790 size = sizeof(*in_data) + count * sizeof(unsigned int);
3791 OBD_ALLOC(in_data, size);
3792 if (in_data == NULL)
3795 memset(in_data, 0, sizeof(*in_data));
3796 in_data->iocd_size = size;
3797 in_data->iocd_cb = cb;
3798 in_data->iocd_count = count;
3799 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
/* writer lock: mutate the registry list */
3801 down_write(&llioc.ioc_sem);
3802 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3803 up_write(&llioc.ioc_sem);
/*
 * Unregister a dynamic ioctl handler by the cookie returned from
 * ll_iocontrol_register(); frees the registration.  Logs a warning if
 * the cookie is not found.
 */
3808 void ll_iocontrol_unregister(void *magic)
3810 struct llioc_data *tmp;
3815 down_write(&llioc.ioc_sem);
3816 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* remember the size before freeing: it lives inside tmp */
3818 unsigned int size = tmp->iocd_size;
3820 list_del(&tmp->iocd_list);
3821 up_write(&llioc.ioc_sem);
3823 OBD_FREE(tmp, size);
3827 up_write(&llioc.ioc_sem);
3829 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3832 EXPORT_SYMBOL(ll_iocontrol_register);
3833 EXPORT_SYMBOL(ll_iocontrol_unregister);
3835 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
3836 unsigned int cmd, unsigned long arg, int *rcp)
3838 enum llioc_iter ret = LLIOC_CONT;
3839 struct llioc_data *data;
3840 int rc = -EINVAL, i;
3842 down_read(&llioc.ioc_sem);
3843 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3844 for (i = 0; i < data->iocd_count; i++) {
3845 if (cmd != data->iocd_cmd[i])
3848 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3852 if (ret == LLIOC_STOP)
3855 up_read(&llioc.ioc_sem);