1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
6 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 only,
10 * as published by the Free Software Foundation.
12 * This program is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License version 2 for more details (a copy is included
16 * in the LICENSE file that accompanied this code).
18 * You should have received a copy of the GNU General Public License
19 * version 2 along with this program; If not, see
20 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
22 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23 * CA 95054 USA or visit www.sun.com if you need additional information or
29 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
30 * Use is subject to license terms.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include "llite_internal.h"
49 #include <lustre/ll_fiemap.h>
51 /* also used by llite/special.c:ll_special_open() */
/* Allocate one ll_file_data (per-open-file private state) from its slab
 * cache.  NOTE(review): body is only partially visible in this extract
 * (braces/return missing); presumably returns the allocation — confirm. */
52 struct ll_file_data *ll_file_data_get(void)
54 struct ll_file_data *fd;
56 OBD_SLAB_ALLOC_PTR(fd, ll_file_data_slab);
/* Return a ll_file_data to its slab cache (counterpart of
 * ll_file_data_get()). */
60 static void ll_file_data_put(struct ll_file_data *fd)
63 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/* Send an MDS close RPC for the given open handle.
 *
 * Fills an obdo with the inode's identity and timestamps (plus size/blocks
 * for regular files), flags uncommitted writes, and calls mdc_close().
 * NOTE(review): several lines (braces, error branches, oa allocation) are
 * missing from this extract; the control flow shown is incomplete. */
66 static int ll_close_inode_openhandle(struct inode *inode,
67 struct obd_client_handle *och)
69 struct ptlrpc_request *req = NULL;
70 struct obd_device *obd;
72 struct mdc_op_data data = { { 0 } };
77 obd = class_exp2obd(ll_i2mdcexp(inode));
79 CERROR("Invalid MDC connection handle "LPX64"\n",
80 ll_i2mdcexp(inode)->exp_handle.h_cookie);
85 * here we check if this is forced umount. If so this is called on
86 * canceling "open lock" and we do not call mdc_close() in this case, as
87 * it will not be successful, as import is already deactivated.
94 RETURN(-ENOMEM); // XXX We leak openhandle and request here.
/* Describe the inode to the MDS: id, type/mode, timestamps, and size for
 * regular files. */
96 oa->o_id = inode->i_ino;
97 oa->o_valid = OBD_MD_FLID;
98 valid = OBD_MD_FLTYPE | OBD_MD_FLMODE | OBD_MD_FLATIME |
99 OBD_MD_FLMTIME | OBD_MD_FLCTIME;
100 if (S_ISREG(inode->i_mode))
101 valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS;
102 obdo_from_inode(oa, inode, valid);
/* Tell the MDS there are writes not yet committed on the OSTs. */
103 if (ll_is_inode_dirty(inode)) {
104 oa->o_flags = MDS_BFLAG_UNCOMMITTED_WRITES;
105 oa->o_valid |= OBD_MD_FLFLAGS;
107 ll_inode2fid(&data.fid1, inode);
108 rc = mdc_close(ll_i2mdcexp(inode), &data, oa, och, &req);
110 /* We are the last writer, so the MDS has instructed us to get
111 * the file size and any write cookies, then close again. */
112 ll_queue_done_writing(inode);
115 CERROR("inode %lu mdc close failed: rc = %d\n",
122 rc = ll_objects_destroy(req, inode);
124 CERROR("inode %lu ll_objects destroy: rc = %d\n",
128 ptlrpc_req_finished(req); /* This is close request */
131 mdc_clear_open_replay_data(och);
/* Close the cached MDS open handle of the flavor selected by @flags
 * (FMODE_WRITE / FMODE_EXEC / FMODE_READ), if no other local users remain.
 *
 * Picks the matching handle pointer and use-count from the inode info,
 * checks the use count under lli_och_sem, and if the handle can be released
 * sends the close RPC and frees it, poisoning the cookie first. */
136 int ll_mdc_real_close(struct inode *inode, int flags)
138 struct ll_inode_info *lli = ll_i2info(inode);
140 struct obd_client_handle **och_p;
141 struct obd_client_handle *och;
/* Select which of the three cached MDS open handles @flags refers to. */
146 if (flags & FMODE_WRITE) {
147 och_p = &lli->lli_mds_write_och;
148 och_usecount = &lli->lli_open_fd_write_count;
149 } else if (flags & FMODE_EXEC) {
150 och_p = &lli->lli_mds_exec_och;
151 och_usecount = &lli->lli_open_fd_exec_count;
153 LASSERT(flags & FMODE_READ);
154 och_p = &lli->lli_mds_read_och;
155 och_usecount = &lli->lli_open_fd_read_count;
158 down(&lli->lli_och_sem);
159 if (*och_usecount) { /* There are still users of this handle, so
161 up(&lli->lli_och_sem);
166 up(&lli->lli_och_sem);
168 if (och) { /* There might be a race and somebody have freed this och
170 rc = ll_close_inode_openhandle(inode, och);
171 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
172 OBD_FREE(och, sizeof *och);
/* Per-file-descriptor close path toward the MDS.
 *
 * Drops any group lock held by this fd, decrements the open-fd count for
 * the fd's open mode, and — unless another granted OPEN DLM lock still
 * covers the file (ldlm_lock_match with LDLM_FL_TEST_LOCK) — performs the
 * real MDS close via ll_mdc_real_close().  Finally detaches and frees the
 * ll_file_data. */
178 int ll_mdc_close(struct obd_export *mdc_exp, struct inode *inode,
181 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
182 struct ll_inode_info *lli = ll_i2info(inode);
186 /* clear group lock, if present */
187 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
188 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
189 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
190 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP,
194 /* Let's see if we have good enough OPEN lock on the file and if
195 we can skip talking to MDS */
196 if (file->f_dentry->d_inode) { /* Can this ever be false? */
198 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
199 struct lustre_handle lockh;
200 struct inode *inode = file->f_dentry->d_inode;
201 struct ldlm_res_id file_res_id;
203 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
204 fid_build_reg_res_name(ll_inode_lu_fid(inode), &file_res_id);
/* Drop this fd's contribution to the per-mode open count. */
206 down(&lli->lli_och_sem);
207 if (fd->fd_omode & FMODE_WRITE) {
209 LASSERT(lli->lli_open_fd_write_count);
210 lli->lli_open_fd_write_count--;
211 } else if (fd->fd_omode & FMODE_EXEC) {
213 LASSERT(lli->lli_open_fd_exec_count);
214 lli->lli_open_fd_exec_count--;
217 LASSERT(lli->lli_open_fd_read_count);
218 lli->lli_open_fd_read_count--;
220 up(&lli->lli_och_sem);
/* No granted OPEN ibits lock left -> must really close at the MDS. */
222 if (!ldlm_lock_match(mdc_exp->exp_obd->obd_namespace, flags,
223 &file_res_id, LDLM_IBITS, &policy,lockmode,
225 rc = ll_mdc_real_close(file->f_dentry->d_inode,
229 CERROR("Releasing a file %p with negative dentry %p. Name %s",
230 file, file->f_dentry, file->f_dentry->d_name.name);
233 LUSTRE_FPRIVATE(file) = NULL;
234 ll_file_data_put(fd);
239 int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm);
241 /* While this returns an error code, fput() the caller does not, so we need
242 * to make every effort to clean up all of our state here. Also, applications
243 * rarely check close errors and even if an error is returned they will not
244 * re-try the close call.
/* VFS ->release() for Lustre files.
 *
 * Stops the statahead thread if this fd started it, short-circuits for the
 * root dentry, clears any pending async write error on the stripe MD, and
 * closes the file toward the MDS via ll_mdc_close(). */
246 int ll_file_release(struct inode *inode, struct file *file)
248 struct ll_file_data *fd;
249 struct ll_sb_info *sbi = ll_i2sbi(inode);
250 struct ll_inode_info *lli = ll_i2info(inode);
251 struct lov_stripe_md *lsm = lli->lli_smd;
255 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
256 inode->i_generation, inode);
258 if (inode->i_sb->s_root != file->f_dentry)
259 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
260 fd = LUSTRE_FPRIVATE(file);
263 /* The last ref on @file, maybe not the owner pid of statahead.
264 * Different processes can open the same dir, "ll_opendir_key" means:
265 * it is me that should stop the statahead thread. */
266 if (lli->lli_opendir_key == fd && lli->lli_opendir_pid != 0)
267 ll_stop_statahead(inode, lli->lli_opendir_key);
/* The root dentry never had a real MDS open; just drop the fd state. */
269 if (inode->i_sb->s_root == file->f_dentry) {
270 LUSTRE_FPRIVATE(file) = NULL;
271 ll_file_data_put(fd);
276 lov_test_and_clear_async_rc(lsm);
277 lli->lli_async_rc = 0;
279 rc = ll_mdc_close(sbi->ll_mdc_exp, inode, file);
281 if (OBD_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, obd_fail_val))
282 libcfs_debug_dumplog();
/* Perform an IT_OPEN intent enqueue toward the MDS for @file.
 *
 * @lmm/@lmmsize carry striping info when the caller is setting stripes;
 * when both are zero an OPEN DLM lock is also requested (MDS_OPEN_LOCK).
 * On success refreshes the inode from the reply and attaches the lock
 * handle to the inode's lock data. */
287 static int ll_intent_file_open(struct file *file, void *lmm,
288 int lmmsize, struct lookup_intent *itp)
290 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
291 struct mdc_op_data data = { { 0 } };
292 struct dentry *parent = file->f_dentry->d_parent;
293 const char *name = file->f_dentry->d_name.name;
294 const int len = file->f_dentry->d_name.len;
295 struct inode *inode = file->f_dentry->d_inode;
296 struct ptlrpc_request *req;
303 ll_prepare_mdc_op_data(&data, parent->d_inode, inode,
304 name, len, O_RDWR, NULL);
306 /* Usually we come here only for NFSD, and we want open lock.
307 But we can also get here with pre 2.6.15 patchless kernels, and in
308 that case that lock is also ok */
309 /* We can also get here if there was cached open handle in revalidate_it
310 * but it disappeared while we were getting from there to ll_file_open.
311 * But this means this file was closed and immediately opened which
312 * makes a good candidate for using OPEN lock */
313 /* If lmmsize & lmm are not 0, we are just setting stripe info
314 * parameters. No need for the open lock */
315 if (!lmm && !lmmsize)
316 itp->it_flags |= MDS_OPEN_LOCK;
318 rc = mdc_intent_lock(sbi->ll_mdc_exp, &data, lmm, lmmsize, itp,
319 0 /*unused */, &req, ll_mdc_blocking_ast, 0);
321 /* reason for keep own exit path - don't flood log
322 * with messages with -ESTALE errors.
324 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
325 it_open_error(DISP_OPEN_OPEN, itp))
327 ll_release_openhandle(file->f_dentry, itp);
331 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
332 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
333 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
337 rc = ll_prep_inode(sbi->ll_osc_exp, &file->f_dentry->d_inode,
338 req, DLM_REPLY_REC_OFF, NULL);
339 if (itp->d.lustre.it_lock_mode)
340 mdc_set_lock_data(&itp->d.lustre.it_lock_handle,
344 ptlrpc_req_finished(itp->d.lustre.it_data);
345 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
346 ll_intent_drop_lock(itp);
/* Populate an obd_client_handle from the MDS open reply carried in @it:
 * copies the file handle, sets the magic, records the I/O epoch on the
 * inode info, and registers the handle for open replay. */
352 static void ll_och_fill(struct ll_inode_info *lli, struct lookup_intent *it,
353 struct obd_client_handle *och)
355 struct ptlrpc_request *req = it->d.lustre.it_data;
356 struct mds_body *body;
360 body = lustre_msg_buf(req->rq_repmsg, DLM_REPLY_REC_OFF, sizeof(*body));
361 LASSERT(body != NULL); /* reply already checked out */
362 /* and swabbed in mdc_enqueue */
363 LASSERT(lustre_rep_swabbed(req, DLM_REPLY_REC_OFF));
365 memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
366 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
367 lli->lli_io_epoch = body->io_epoch;
369 mdc_set_open_replay_data(och, it->d.lustre.it_data);
/* Finish the client-local part of an open: fill @och from the intent reply
 * (when a handle is supplied), attach @fd as the file's private data,
 * initialise readahead state, and record the open mode on the fd. */
372 int ll_local_open(struct file *file, struct lookup_intent *it,
373 struct ll_file_data *fd, struct obd_client_handle *och)
377 LASSERT(!LUSTRE_FPRIVATE(file));
382 ll_och_fill(ll_i2info(file->f_dentry->d_inode), it, och);
383 LUSTRE_FPRIVATE(file) = fd;
384 ll_readahead_init(file->f_dentry->d_inode, &fd->fd_ras);
385 fd->fd_omode = it->it_flags;
390 /* Open a file, and (for the very first open) create objects on the OSTs at
391 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
392 * creation or open until ll_lov_setstripe() ioctl is called. We grab
393 * lli_open_sem to ensure no other process will create objects, send the
394 * stripe MD to the MDS, or try to destroy the objects if that fails.
396 * If we already have the stripe MD locally then we don't request it in
397 * mdc_open(), by passing a lmm_size = 0.
399 * It is up to the application to ensure no other processes open this file
400 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
401 * used. We might be able to avoid races of that sort by getting lli_open_sem
402 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
403 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/* VFS ->open() for Lustre files.
 *
 * Either reuses the intent attached by VFS intent patches or builds a fresh
 * IT_OPEN intent from f_flags; reuses a cached MDS open handle when one of
 * the right mode exists, otherwise enqueues a new open via
 * ll_intent_file_open().  Also arms statahead bookkeeping for directories
 * and handles O_LOV_DELAY_CREATE.
 *
 * NOTE(review): many control-flow lines (braces, else branches, GOTO
 * targets) are missing from this extract; the visible statements are
 * byte-preserved and only commented. */
405 int ll_file_open(struct inode *inode, struct file *file)
407 struct ll_inode_info *lli = ll_i2info(inode);
408 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
409 .it_flags = file->f_flags };
410 struct lov_stripe_md *lsm;
411 struct ptlrpc_request *req = NULL;
412 struct obd_client_handle **och_p;
414 struct ll_file_data *fd;
415 int rc = 0, opendir_set = 0;
418 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
419 inode->i_generation, inode, file->f_flags);
421 #ifdef HAVE_VFS_INTENT_PATCHES
424 it = file->private_data; /* XXX: compat macro */
425 file->private_data = NULL; /* prevent ll_local_open assertion */
428 fd = ll_file_data_get();
/* For directories, claim statahead ownership for this fd if nobody has. */
432 if (S_ISDIR(inode->i_mode)) {
433 spin_lock(&lli->lli_lock);
434 if (lli->lli_opendir_key == NULL && lli->lli_opendir_pid == 0) {
435 LASSERT(lli->lli_sai == NULL);
436 lli->lli_opendir_key = fd;
437 lli->lli_opendir_pid = cfs_curproc_pid();
440 spin_unlock(&lli->lli_lock);
/* Root dentry: no MDS open needed, just attach the fd data. */
443 if (inode->i_sb->s_root == file->f_dentry) {
444 LUSTRE_FPRIVATE(file) = fd;
448 if (!it || !it->d.lustre.it_disposition) {
449 /* Convert f_flags into access mode. We cannot use file->f_mode,
450 * because everything but O_ACCMODE mask was stripped from it */
451 if ((oit.it_flags + 1) & O_ACCMODE)
453 if (file->f_flags & O_TRUNC)
454 oit.it_flags |= FMODE_WRITE;
456 /* kernel only call f_op->open in dentry_open. filp_open calls
457 * dentry_open after call to open_namei that checks permissions.
458 * Only nfsd_open call dentry_open directly without checking
459 * permissions and because of that this code below is safe. */
460 if (oit.it_flags & FMODE_WRITE)
461 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
463 /* We do not want O_EXCL here, presumably we opened the file
464 * already? XXX - NFS implications? */
465 oit.it_flags &= ~O_EXCL;
467 /* bug20584, if "it_flags" contains O_CREAT, the file will be
468 * created if necessary, then "IT_CREAT" should be set to keep
469 * consistent with it */
470 if (oit.it_flags & O_CREAT)
471 oit.it_op |= IT_CREAT;
/* Mount-wide direct-IO default: force O_DIRECT on regular opens. */
476 if (ll_i2sbi(inode)->ll_direct_io_default &&
477 !S_ISDIR(inode->i_mode) &&
478 !(it->it_flags & FMODE_EXEC))
479 file->f_flags |= O_DIRECT;
482 /* Let's see if we have file open on MDS already. */
483 if (it->it_flags & FMODE_WRITE) {
484 och_p = &lli->lli_mds_write_och;
485 och_usecount = &lli->lli_open_fd_write_count;
486 } else if (it->it_flags & FMODE_EXEC) {
487 och_p = &lli->lli_mds_exec_och;
488 och_usecount = &lli->lli_open_fd_exec_count;
490 och_p = &lli->lli_mds_read_och;
491 och_usecount = &lli->lli_open_fd_read_count;
494 LASSERTF(it->it_flags != 0, "it %p dist %d \n", it,
495 it->d.lustre.it_disposition);
497 down(&lli->lli_och_sem);
498 if (*och_p) { /* Open handle is present */
499 if (it_disposition(it, DISP_OPEN_OPEN)) {
500 /* Well, there's extra open request that we do not need,
501 let's close it somehow. This will decref request. */
502 rc = it_open_error(DISP_OPEN_OPEN, it);
504 up(&lli->lli_och_sem);
505 ll_file_data_put(fd);
506 GOTO(out_openerr, rc);
508 ll_release_openhandle(file->f_dentry, it);
509 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats,
/* Reuse the cached handle: local open only, no new RPC. */
514 rc = ll_local_open(file, it, fd, NULL);
516 LASSERTF(rc == 0, "rc = %d\n", rc);
518 LASSERT(*och_usecount == 0);
519 if (!it->d.lustre.it_disposition) {
520 /* We cannot just request lock handle now, new ELC code
521 means that one of other OPEN locks for this file
522 could be cancelled, and since blocking ast handler
523 would attempt to grab och_sem as well, that would
524 result in a deadlock */
525 up(&lli->lli_och_sem);
526 it->it_create_mode |= M_CHECK_STALE;
527 rc = ll_intent_file_open(file, NULL, 0, it);
528 it->it_create_mode &= ~M_CHECK_STALE;
530 ll_file_data_put(fd);
531 GOTO(out_openerr, rc);
536 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
538 ll_file_data_put(fd);
539 GOTO(out_och_free, rc = -ENOMEM);
542 req = it->d.lustre.it_data;
544 /* mdc_intent_lock() didn't get a request ref if there was an
545 * open error, so don't do cleanup on the request here
547 /* XXX (green): Should not we bail out on any error here, not
548 * just open error? */
549 rc = it_open_error(DISP_OPEN_OPEN, it);
551 ll_file_data_put(fd);
552 GOTO(out_och_free, rc);
555 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
556 rc = ll_local_open(file, it, fd, *och_p);
557 LASSERTF(rc == 0, "rc = %d\n", rc);
559 up(&lli->lli_och_sem);
561 /* Must do this outside lli_och_sem lock to prevent deadlock where
562 different kind of OPEN lock for this same inode gets cancelled
563 by ldlm_cancel_lru */
564 if (!S_ISREG(inode->i_mode))
569 if (file->f_flags & O_LOV_DELAY_CREATE ||
570 !(file->f_mode & FMODE_WRITE)) {
571 CDEBUG(D_INODE, "object creation was delayed\n");
575 file->f_flags &= ~O_LOV_DELAY_CREATE;
578 ptlrpc_req_finished(req);
580 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
582 ll_open_complete(inode);
586 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
587 *och_p = NULL; /* OBD_FREE writes some magic there */
590 up(&lli->lli_och_sem);
592 if (opendir_set != 0)
593 ll_stop_statahead(inode, lli->lli_opendir_key);
599 /* Fills the obdo with the attributes for the inode defined by lsm */
/* Fetch OST attributes for the objects described by @lsm into an obdo.
 *
 * Seeds the obdo with the object id/group, issues an async getattr through
 * a ptlrpc request set, waits for completion, and masks o_valid down to the
 * fields the OSTs are authoritative for. */
600 int ll_lsm_getattr(struct obd_export *exp, struct lov_stripe_md *lsm,
603 struct ptlrpc_request_set *set;
604 struct obd_info oinfo = { { { 0 } } };
608 LASSERT(lsm != NULL);
610 memset(oa, 0, sizeof *oa);
613 oa->o_id = lsm->lsm_object_id;
614 oa->o_gr = lsm->lsm_object_gr;
615 oa->o_mode = S_IFREG;
616 oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLSIZE |
617 OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ | OBD_MD_FLMTIME |
618 OBD_MD_FLCTIME | OBD_MD_FLGROUP;
620 set = ptlrpc_prep_set();
624 rc = obd_getattr_async(exp, &oinfo, set);
626 rc = ptlrpc_set_wait(set);
627 ptlrpc_set_destroy(set);
/* Only trust the OST-provided fields after the round trip. */
632 oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ | OBD_MD_FLMTIME |
633 OBD_MD_FLCTIME | OBD_MD_FLSIZE);
/* Map an extent DLM @lock back to the stripe index it protects.
 *
 * Single-stripe files trivially map to stripe 0; otherwise asks the LOV
 * via obd_get_info(KEY_LOCK_TO_STRIPE) and then cross-checks that the
 * lock's resource name matches the stripe object id/group. */
637 static int ll_lock_to_stripe_offset(struct inode *inode, struct ldlm_lock *lock)
639 struct ll_inode_info *lli = ll_i2info(inode);
640 struct lov_stripe_md *lsm = lli->lli_smd;
641 struct obd_export *exp = ll_i2obdexp(inode);
644 struct ldlm_lock *lock;
645 } key = { .name = KEY_LOCK_TO_STRIPE, .lock = lock };
646 __u32 stripe, vallen = sizeof(stripe);
647 struct lov_oinfo *loinfo;
651 if (lsm->lsm_stripe_count == 1)
652 GOTO(check, stripe = 0);
654 /* get our offset in the lov */
655 rc = obd_get_info(exp, sizeof(key), &key, &vallen, &stripe, lsm);
657 CERROR("obd_get_info: rc = %d\n", rc);
660 LASSERT(stripe < lsm->lsm_stripe_count);
/* Sanity: the lock's resource must name this stripe's object. */
663 loinfo = lsm->lsm_oinfo[stripe];
664 if (!osc_res_name_eq(loinfo->loi_id, loinfo->loi_gr,
665 &lock->l_resource->lr_name)) {
666 LDLM_ERROR(lock, "resource doesn't match object "LPU64"/"LPU64,
667 loinfo->loi_id, loinfo->loi_gr);
668 RETURN(-ELDLM_NO_LOCK_DATA);
/* Get extra page reference to ensure it is not going away */
675 void ll_pin_extent_cb(void *data)
677 struct page *page = data;
679 page_cache_get(page);
683 /* Flush the page from page cache for an extent as its canceled.
684 * Page to remove is delivered as @data.
686 * No one can dirty the extent until we've finished our work and they cannot
687 * enqueue another lock. The DLM protects us from ll_file_read/write here,
688 * but other kernel actors could have pages locked.
690 * If @discard is set, there is no need to write the page if it is dirty.
692 * Called with the DLM lock held. */
/* Remove one page from the page cache on extent-lock cancellation.
 *
 * Tears down any mmap mappings over the page, optionally writes it back
 * (unless @discard), records write errors on the mapping, truncates the
 * page out of the cache, and drops the reference taken by
 * ll_pin_extent_cb().  Caller holds the DLM lock (see block comment
 * above this function). */
693 int ll_page_removal_cb(void *data, int discard)
696 struct page *page = data;
697 struct address_space *mapping;
701 /* We have page reference already from ll_pin_page */
704 /* Already truncated by somebody */
708 mapping = page->mapping;
/* Unmap any user mappings covering exactly this page's extent. */
710 ll_teardown_mmaps(mapping,
711 (__u64)page->index << PAGE_CACHE_SHIFT,
712 ((__u64)page->index<<PAGE_CACHE_SHIFT)|
714 LL_CDEBUG_PAGE(D_PAGE, page, "removing page\n");
715 if (!discard && PageWriteback(page))
716 wait_on_page_writeback(page);
718 if (!discard && clear_page_dirty_for_io(page)) {
719 rc = ll_call_writepage(page->mapping->host, page);
720 /* either waiting for io to complete or reacquiring
721 * the lock that the failed writepage released */
723 wait_on_page_writeback(page);
/* Propagate the writeback failure to the mapping's error bits. */
725 CERROR("writepage inode %lu(%p) of page %p "
726 "failed: %d\n", mapping->host->i_ino,
727 mapping->host, page, rc);
729 set_bit(AS_ENOSPC, &mapping->flags);
731 set_bit(AS_EIO, &mapping->flags);
734 if (page->mapping != NULL) {
735 struct ll_async_page *llap = llap_cast_private(page);
736 // checking again to account for writeback's lock_page()
737 LL_CDEBUG_PAGE(D_PAGE, page, "truncating\n");
739 ll_ra_accounting(llap, page->mapping);
740 ll_truncate_complete_page(page);
744 LASSERT(!PageWriteback(page));
746 page_cache_release(page);
/* Blocking/cancel AST for extent locks: shrink the known-minimum-size (kms)
 * of the stripe covered by the cancelled @lock.
 *
 * Resolves the lock to its inode and stripe, then recomputes the stripe's
 * kms under both the lov stripe lock and the lock's resource lock. */
751 int ll_extent_lock_cancel_cb(struct ldlm_lock *lock, struct ldlm_lock_desc *new,
752 void *data, int flag)
755 struct ll_inode_info *lli;
756 struct lov_stripe_md *lsm;
/* Guard against an obviously-bogus (small-integer) data pointer. */
762 if ((unsigned long)data > 0 && (unsigned long)data < 0x1000) {
763 LDLM_ERROR(lock, "cancelling lock with bad data %p", data);
767 inode = ll_inode_from_lock(lock);
770 lli = ll_i2info(inode);
773 if (lli->lli_smd == NULL)
777 stripe = ll_lock_to_stripe_offset(inode, lock);
781 lov_stripe_lock(lsm);
782 lock_res_and_lock(lock);
783 kms = ldlm_extent_shift_kms(lock,
784 lsm->lsm_oinfo[stripe]->loi_kms);
786 if (lsm->lsm_oinfo[stripe]->loi_kms != kms)
787 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
788 lsm->lsm_oinfo[stripe]->loi_kms, kms);
789 lsm->lsm_oinfo[stripe]->loi_kms = kms;
790 unlock_res_and_lock(lock);
791 lov_stripe_unlock(lsm);
792 ll_try_done_writing(inode);
/* Completion AST for client-side async extent enqueues.
 *
 * Blocked states are not expected (LBUG).  On grant/glimpse, if the lock
 * carries an LVB, update the stripe's rss and kms from it, then wake
 * waiters and drop the PR reference taken at enqueue time. */
801 int ll_async_completion_ast(struct ldlm_lock *lock, int flags, void *data)
803 /* XXX ALLOCATE - 160 bytes */
804 struct inode *inode = ll_inode_from_lock(lock);
805 struct ll_inode_info *lli = ll_i2info(inode);
806 struct lustre_handle lockh = { 0 };
811 if (flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED |
812 LDLM_FL_BLOCK_CONV)) {
813 LBUG(); /* not expecting any blocked async locks yet */
814 LDLM_DEBUG(lock, "client-side async enqueue returned a blocked "
816 ldlm_lock_dump(D_OTHER, lock, 0);
817 ldlm_reprocess_all(lock->l_resource);
821 LDLM_DEBUG(lock, "client-side async enqueue: granted/glimpsed");
823 stripe = ll_lock_to_stripe_offset(inode, lock);
827 if (lock->l_lvb_len) {
828 struct lov_stripe_md *lsm = lli->lli_smd;
/* NOTE(review): this function accesses lsm_oinfo[stripe] with "." while
 * sibling functions use "->" (pointer array) — looks like stale code that
 * would not compile against the current lsm layout; confirm upstream. */
830 lvb = lock->l_lvb_data;
831 lsm->lsm_oinfo[stripe].loi_rss = lvb->lvb_size;
833 lock_res_and_lock(lock);
834 ll_inode_size_lock(inode, 1);
835 kms = MAX(lsm->lsm_oinfo[stripe].loi_kms, lvb->lvb_size);
836 kms = ldlm_extent_shift_kms(NULL, kms);
837 if (lsm->lsm_oinfo[stripe].loi_kms != kms)
838 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
839 lsm->lsm_oinfo[stripe].loi_kms, kms);
840 lsm->lsm_oinfo[stripe].loi_kms = kms;
841 ll_inode_size_unlock(inode, 1);
842 unlock_res_and_lock(lock);
847 wake_up(&lock->l_waitq);
849 ldlm_lock2handle(lock, &lockh);
850 ldlm_lock_decref(&lockh, LCK_PR);
/* Glimpse AST: answer a server glimpse request with this client's view of
 * the file size (per-stripe kms) and timestamps, packed into an LVB reply.
 * -ELDLM_NO_LOCK_DATA cases are normal races and get an empty reply. */
855 static int ll_glimpse_callback(struct ldlm_lock *lock, void *reqp)
857 struct ptlrpc_request *req = reqp;
858 struct inode *inode = ll_inode_from_lock(lock);
859 struct ll_inode_info *lli;
860 struct lov_stripe_md *lsm;
863 int size[2] = { sizeof(struct ptlrpc_body), sizeof(*lvb) };
867 GOTO(out, rc = -ELDLM_NO_LOCK_DATA);
868 lli = ll_i2info(inode);
870 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
873 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
875 /* First, find out which stripe index this lock corresponds to. */
876 stripe = ll_lock_to_stripe_offset(inode, lock);
878 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
880 rc = lustre_pack_reply(req, 2, size, NULL);
884 lvb = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, sizeof(*lvb));
885 lvb->lvb_size = lli->lli_smd->lsm_oinfo[stripe]->loi_kms;
886 lvb->lvb_mtime = LTIME_S(inode->i_mtime);
887 lvb->lvb_atime = LTIME_S(inode->i_atime);
888 lvb->lvb_ctime = LTIME_S(inode->i_ctime);
890 LDLM_DEBUG(lock, "i_size: %llu -> stripe number %u -> kms "LPU64
891 " atime "LPU64", mtime "LPU64", ctime "LPU64,
892 i_size_read(inode), stripe, lvb->lvb_size, lvb->lvb_atime,
893 lvb->lvb_mtime, lvb->lvb_ctime);
898 /* These errors are normal races, so we don't want to fill the console
899 * with messages by calling ptlrpc_error() */
900 if (rc == -ELDLM_NO_LOCK_DATA)
901 lustre_pack_reply(req, 1, NULL, NULL);
/* Glimpse helper for ioctl paths: enqueue an intent-only (HAS_INTENT) PR
 * extent lock over [0, EOF] to collect per-stripe LVBs, then merge them
 * into the caller-supplied stat fields under the lov stripe lock. */
907 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
910 struct lustre_handle lockh = { 0 };
911 struct ldlm_enqueue_info einfo = { 0 };
912 struct obd_info oinfo = { { { 0 } } };
918 einfo.ei_type = LDLM_EXTENT;
919 einfo.ei_mode = LCK_PR;
920 einfo.ei_cb_bl = osc_extent_blocking_cb;
921 einfo.ei_cb_cp = ldlm_completion_ast;
922 einfo.ei_cb_gl = ll_glimpse_callback;
923 einfo.ei_cbdata = NULL;
925 oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
926 oinfo.oi_lockh = &lockh;
928 oinfo.oi_flags = LDLM_FL_HAS_INTENT;
930 rc = obd_enqueue_rqset(sbi->ll_osc_exp, &oinfo, &einfo);
934 CERROR("obd_enqueue returned rc %d, "
935 "returning -EIO\n", rc);
936 RETURN(rc > 0 ? -EIO : rc);
/* Merge the per-stripe LVBs into the stat result. */
939 lov_stripe_lock(lsm);
940 memset(&lvb, 0, sizeof(lvb));
941 obd_merge_lvb(sbi->ll_osc_exp, lsm, &lvb, 0);
942 st->st_size = lvb.lvb_size;
943 st->st_blocks = lvb.lvb_blocks;
944 st->st_mtime = lvb.lvb_mtime;
945 st->st_atime = lvb.lvb_atime;
946 st->st_ctime = lvb.lvb_ctime;
947 lov_stripe_unlock(lsm);
952 /* NB: obd_merge_lvb will prefer locally cached writes if they extend the
953 * file (because it prefers KMS over RSS when larger) */
/* Refresh the inode's size/blocks/timestamps from the OSTs via a glimpse.
 *
 * Enqueues an intent-only PR extent lock over [0, EOF]; each lock-holding
 * client answers via ll_glimpse_callback(), and the merged LVB (preferring
 * the MDS timestamps already cached in lli_lvb) is written back into the
 * inode under the size lock. */
954 int ll_glimpse_size(struct inode *inode, int ast_flags)
956 struct ll_inode_info *lli = ll_i2info(inode);
957 struct ll_sb_info *sbi = ll_i2sbi(inode);
958 struct lustre_handle lockh = { 0 };
959 struct ldlm_enqueue_info einfo = { 0 };
960 struct obd_info oinfo = { { { 0 } } };
965 CDEBUG(D_DLMTRACE, "Glimpsing inode %lu\n", inode->i_ino);
968 CDEBUG(D_DLMTRACE, "No objects for inode %lu\n", inode->i_ino);
972 /* NOTE: this looks like DLM lock request, but it may not be one. Due
973 * to LDLM_FL_HAS_INTENT flag, this is glimpse request, that
974 * won't revoke any conflicting DLM locks held. Instead,
975 * ll_glimpse_callback() will be called on each client
976 * holding a DLM lock against this file, and resulting size
977 * will be returned for each stripe. DLM lock on [0, EOF] is
978 * acquired only if there were no conflicting locks. */
979 einfo.ei_type = LDLM_EXTENT;
980 einfo.ei_mode = LCK_PR;
981 einfo.ei_cb_bl = osc_extent_blocking_cb;
982 einfo.ei_cb_cp = ldlm_completion_ast;
983 einfo.ei_cb_gl = ll_glimpse_callback;
984 einfo.ei_cbdata = inode;
986 oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
987 oinfo.oi_lockh = &lockh;
988 oinfo.oi_md = lli->lli_smd;
989 oinfo.oi_flags = ast_flags | LDLM_FL_HAS_INTENT;
991 rc = obd_enqueue_rqset(sbi->ll_osc_exp, &oinfo, &einfo);
995 CERROR("obd_enqueue returned rc %d, returning -EIO\n", rc);
996 RETURN(rc > 0 ? -EIO : rc);
999 ll_inode_size_lock(inode, 1);
1000 inode_init_lvb(inode, &lvb);
1001 /* merge timestamps the most recently obtained from mds with
1002 timestamps obtained from osts */
1003 lvb.lvb_atime = lli->lli_lvb.lvb_atime;
1004 lvb.lvb_mtime = lli->lli_lvb.lvb_mtime;
1005 lvb.lvb_ctime = lli->lli_lvb.lvb_ctime;
1006 rc = obd_merge_lvb(sbi->ll_osc_exp, lli->lli_smd, &lvb, 0);
1007 i_size_write(inode, lvb.lvb_size);
1008 inode->i_blocks = lvb.lvb_blocks;
1009 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1010 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1011 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1012 ll_inode_size_unlock(inode, 1);
1014 CDEBUG(D_DLMTRACE, "glimpse: size: %llu, blocks: %llu\n",
1015 i_size_read(inode), (long long)inode->i_blocks);
/* Take an extent DLM lock of @mode over @policy's range on @lsm.
 *
 * Skipped entirely for fds marked LL_FILE_IGNORE_LOCK or NOLCK mounts.
 * On success merges the returned LVB into the inode: i_size is updated
 * only for whole-file ([0, EOF]) locks — see the long comment below for
 * the truncate-race ordering argument. */
1020 int ll_extent_lock(struct ll_file_data *fd, struct inode *inode,
1021 struct lov_stripe_md *lsm, int mode,
1022 ldlm_policy_data_t *policy, struct lustre_handle *lockh,
1025 struct ll_sb_info *sbi = ll_i2sbi(inode);
1027 struct ldlm_enqueue_info einfo = { 0 };
1028 struct obd_info oinfo = { { { 0 } } };
1032 LASSERT(!lustre_handle_is_used(lockh));
1033 LASSERT(lsm != NULL);
1035 /* don't drop the mmapped file to LRU */
1036 if (mapping_mapped(inode->i_mapping))
1037 ast_flags |= LDLM_FL_NO_LRU;
1039 /* XXX phil: can we do this? won't it screw the file size up? */
1040 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1041 (sbi->ll_flags & LL_SBI_NOLCK))
1044 CDEBUG(D_DLMTRACE, "Locking inode %lu, start "LPU64" end "LPU64"\n",
1045 inode->i_ino, policy->l_extent.start, policy->l_extent.end);
1047 einfo.ei_type = LDLM_EXTENT;
1048 einfo.ei_mode = mode;
1049 einfo.ei_cb_bl = osc_extent_blocking_cb;
1050 einfo.ei_cb_cp = ldlm_completion_ast;
1051 einfo.ei_cb_gl = ll_glimpse_callback;
1052 einfo.ei_cbdata = inode;
1054 oinfo.oi_policy = *policy;
1055 oinfo.oi_lockh = lockh;
1057 oinfo.oi_flags = ast_flags;
1059 rc = obd_enqueue(sbi->ll_osc_exp, &oinfo, &einfo, NULL);
1060 *policy = oinfo.oi_policy;
1064 ll_inode_size_lock(inode, 1);
1065 inode_init_lvb(inode, &lvb);
1066 obd_merge_lvb(sbi->ll_osc_exp, lsm, &lvb, 1);
1068 if (policy->l_extent.start == 0 &&
1069 policy->l_extent.end == OBD_OBJECT_EOF) {
1070 /* vmtruncate()->ll_truncate() first sets the i_size and then
1071 * the kms under both a DLM lock and the
1072 * ll_inode_size_lock(). If we don't get the
1073 * ll_inode_size_lock() here we can match the DLM lock and
1074 * reset i_size from the kms before the truncating path has
1075 * updated the kms. generic_file_write can then trust the
1076 * stale i_size when doing appending writes and effectively
1077 * cancel the result of the truncate. Getting the
1078 * ll_inode_size_lock() after the enqueue maintains the DLM
1079 * -> ll_inode_size_lock() acquiring order. */
1080 i_size_write(inode, lvb.lvb_size);
1081 CDEBUG(D_INODE, "inode=%lu, updating i_size %llu\n",
1082 inode->i_ino, i_size_read(inode));
1086 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1087 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1088 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1090 ll_inode_size_unlock(inode, 1);
/* Release an extent DLM lock taken by ll_extent_lock().  Mirrors its
 * IGNORE_LOCK / NOLCK short-circuit so lock and unlock stay symmetric. */
1095 int ll_extent_unlock(struct ll_file_data *fd, struct inode *inode,
1096 struct lov_stripe_md *lsm, int mode,
1097 struct lustre_handle *lockh)
1099 struct ll_sb_info *sbi = ll_i2sbi(inode);
1103 /* XXX phil: can we do this? won't it screw the file size up? */
1104 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1105 (sbi->ll_flags & LL_SBI_NOLCK))
1108 rc = obd_cancel(sbi->ll_osc_exp, lsm, mode, lockh, 0, 0);
/* Mark the inode as contended and timestamp the event; used to switch
 * I/O to server-side locking for a while (see ll_is_file_contended()). */
1113 static void ll_set_file_contended(struct inode *inode)
1115 struct ll_inode_info *lli = ll_i2info(inode);
1117 lli->lli_contention_time = cfs_time_current();
1118 set_bit(LLI_F_CONTENDED, &lli->lli_flags);
/* Clear the contended flag set by ll_set_file_contended(). */
1121 void ll_clear_file_contended(struct inode *inode)
1123 struct ll_inode_info *lli = ll_i2info(inode);
1125 clear_bit(LLI_F_CONTENDED, &lli->lli_flags);
/* Decide whether I/O on @file should use server-side (lockless) locking.
 *
 * Requires OBD_CONNECT_SRVLOCK server support; honours per-fd flags
 * (IGNORE_LOCK, LOCKED_DIRECTIO, LOCKLESS_IO) and O_DIRECT, and ages out
 * the contended state after sbi->ll_contention_time seconds. */
1128 static int ll_is_file_contended(struct file *file)
1130 struct inode *inode = file->f_dentry->d_inode;
1131 struct ll_inode_info *lli = ll_i2info(inode);
1132 struct ll_sb_info *sbi = ll_i2sbi(inode);
1133 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1136 if (!(sbi->ll_lco.lco_flags & OBD_CONNECT_SRVLOCK)) {
1137 CDEBUG(D_INFO, "the server does not support SRVLOCK feature,"
1138 " osc connect flags = 0x"LPX64"\n",
1139 sbi->ll_lco.lco_flags);
1143 if (fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK))
1146 /* server-side locking for dio unless LL_FILE_LOCKED_DIRECTIO */
1147 if ((file->f_flags & O_DIRECT) &&
1148 !(fd && (fd->fd_flags & LL_FILE_LOCKED_DIRECTIO)))
1151 /* server-side locking for cached I/O with LL_FILE_LOCKLESS_IO */
1152 if (!(file->f_flags & O_DIRECT) &&
1153 fd && fd->fd_flags & LL_FILE_LOCKLESS_IO)
/* Contention state decays: clear it once the retry window has passed. */
1156 if (test_bit(LLI_F_CONTENDED, &lli->lli_flags)) {
1157 cfs_time_t cur_time = cfs_time_current();
1158 cfs_time_t retry_time;
1160 retry_time = cfs_time_add(
1161 lli->lli_contention_time,
1162 cfs_time_seconds(sbi->ll_contention_time));
1163 if (cfs_time_after(cur_time, retry_time)) {
1164 ll_clear_file_contended(inode);
/* Acquire the client-side lock tree covering [start, end] for an iovec I/O.
 *
 * Appends always take client locks; otherwise contended files skip locking
 * (DENY_ON_CONTENTION lets the server refuse, -EUSERS marks the file
 * contended).  Returns whether the tree lock was taken. */
1172 static int ll_file_get_tree_lock_iov(struct ll_lock_tree *tree,
1173 struct file *file, const struct iovec *iov,
1174 unsigned long nr_segs,
1175 obd_off start, obd_off end, int rw)
1178 int tree_locked = 0;
1180 struct inode * inode = file->f_dentry->d_inode;
1183 append = (rw == OBD_BRW_WRITE) && (file->f_flags & O_APPEND);
1185 if (append || !ll_is_file_contended(file)) {
1186 struct ll_lock_tree_node *node;
1189 ast_flags = append ? 0 : LDLM_FL_DENY_ON_CONTENTION;
1190 if (file->f_flags & O_NONBLOCK)
1191 ast_flags |= LDLM_FL_BLOCK_NOWAIT;
1192 node = ll_node_from_inode(inode, start, end,
1193 (rw == OBD_BRW_WRITE) ? LCK_PW : LCK_PR);
1198 tree->lt_fd = LUSTRE_FPRIVATE(file);
1199 rc = ll_tree_lock_iov(tree, node, iov, nr_segs, ast_flags);
1202 else if (rc == -EUSERS)
1203 ll_set_file_contended(inode);
1207 RETURN(tree_locked);
1212 /* XXX: exact copy from kernel code (__generic_file_aio_write_nolock from rhel4)
/* Sum the byte count of an iovec array, validating each segment (copied from
 * __generic_file_aio_write_nolock, per the comment above).  Stops at the
 * first inaccessible segment; presumably *nr_segs is trimmed to the good
 * prefix on that path — the update is in a non-visible line, verify. */
1214 static size_t ll_file_get_iov_count(const struct iovec *iov,
1215 unsigned long *nr_segs)
1220 for (seg = 0; seg < *nr_segs; seg++) {
1221 const struct iovec *iv = &iov[seg];
1224 * If any segment has a negative length, or the cumulative
1225 * length ever wraps negative then return -EINVAL.
/* The OR with iov_len catches both a huge single segment and a wrapped
 * running total in one signed test. */
1227 count += iv->iov_len;
1228 if (unlikely((ssize_t)(count|iv->iov_len) < 0))
1230 if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len))
1235 count -= iv->iov_len; /* This segment is no good */
/* Build iov_copy as a window of at most `size` bytes into *iov_out starting
 * at *offset, for chunked I/O.  Updates *nrsegs_copy to the number of
 * segments used; presumably *nr_segs/*iov_out/*offset are advanced for the
 * next chunk in non-visible lines — verify against full source. */
1241 static int iov_copy_update(unsigned long *nr_segs, const struct iovec **iov_out,
1242 unsigned long *nrsegs_copy,
1243 struct iovec *iov_copy, size_t *offset,
1247 const struct iovec *iov = *iov_out;
1248 for (i = 0; i < *nr_segs;
1250 const struct iovec *iv = &iov[i];
1251 struct iovec *ivc = &iov_copy[i];
/* The first copied segment is shortened from the front by the carried-over
 * intra-segment offset. */
1254 ivc->iov_len -= *offset;
1255 ivc->iov_base += *offset;
1257 if (ivc->iov_len >= size) {
1258 ivc->iov_len = size;
1265 size -= ivc->iov_len;
1269 *nrsegs_copy = i + 1;
/* Try to take a "fast" per-page extent lock covering [start, end] via the
 * page's async-page cookie (OBD_FAST_LOCK path).  Returns non-zero on
 * success (lock handle stored in *lockh) — released later by
 * ll_release_short_lock() with the matching rw mode. */
1274 static int ll_get_short_lock(struct page *page, int rw, obd_off start,
1275 obd_off end, struct lustre_handle *lockh)
1277 struct ll_async_page *llap;
1278 struct obd_export *exp;
1279 struct inode *inode = page->mapping->host;
1283 exp = ll_i2obdexp(inode);
1287 llap = llap_cast_private(page);
1291 RETURN(obd_get_lock(exp, ll_i2info(inode)->lli_smd,
1292 &llap->llap_cookie, rw, start, end, lockh,
1296 static void ll_release_short_lock(struct inode *inode, obd_off end,
1297 struct lustre_handle *lockh, int rw)
1299 struct obd_export *exp;
1302 exp = ll_i2obdexp(inode);
1306 rc = obd_cancel(exp, ll_i2info(inode)->lli_smd,
1307 rw = OBD_BRW_READ ? LCK_PR : LCK_PW, lockh,
1308 OBD_FAST_LOCK, end);
1310 CERROR("unlock failed (%d)\n", rc);
/* Attempt the lockless-read fast path: if none of the user buffers alias a
 * region mapped from this file, look up the cached page at ppos and take a
 * per-page short lock on it.  Returns non-zero with *lockh filled on
 * success; 0 means fall back to the tree-lock path. */
1313 static inline int ll_file_get_fast_lock(struct file *file,
1314 obd_off ppos, obd_off end,
1315 const struct iovec *iov,
1316 unsigned long nr_segs,
1317 struct lustre_handle *lockh,
1325 /* we would like this read request to be lockfree */
1326 for (seg = 0; seg < nr_segs; seg++) {
1327 const struct iovec *iv = &iov[seg];
/* A destination buffer that is itself mmap'ed from a Lustre file could
 * deadlock the lockless path, so bail out to tree locking. */
1328 if (ll_region_mapped((unsigned long)iv->iov_base, iv->iov_len))
1332 page = find_lock_page(file->f_dentry->d_inode->i_mapping,
1333 ppos >> CFS_PAGE_SHIFT);
1335 if (ll_get_short_lock(page, rw, ppos, end, lockh))
1339 page_cache_release(page);
/* Counterpart of ll_file_get_fast_lock(): drop the short lock taken on the
 * cached page.  rw must match the mode used at acquire time. */
1346 static inline void ll_file_put_fast_lock(struct inode *inode, obd_off end,
1347 struct lustre_handle *lockh, int rw)
1349 ll_release_short_lock(inode, end, lockh, rw);
/* Choose and take the lock for a file I/O: try the per-page fast lock
 * first, then the DLM tree lock, else run lockless.  Returns one of the
 * LL_LOCK_STYLE_* values, or (on the non-visible error path) a negative
 * errno from the tree-lock attempt. */
1352 static inline int ll_file_get_lock(struct file *file, obd_off ppos,
1353 obd_off end, const struct iovec *iov,
1354 unsigned long nr_segs,
1355 struct lustre_handle *lockh,
1356 struct ll_lock_tree *tree, int rw)
1362 if (ll_file_get_fast_lock(file, ppos, end, iov, nr_segs, lockh, rw))
1363 RETURN(LL_LOCK_STYLE_FASTLOCK);
1365 rc = ll_file_get_tree_lock_iov(tree, file, iov, nr_segs,
1367 /* rc: 1 for tree lock, 0 for no lock, <0 for error */
1370 RETURN(LL_LOCK_STYLE_TREELOCK);
1372 RETURN(LL_LOCK_STYLE_NOLOCK);
1375 /* an error happened if we reached this point, rc = -errno here */
/* Release whichever lock ll_file_get_lock() took, dispatching on the lock
 * style.  NOTE(review): the break statements between cases are in
 * non-visible lines; presumably each case breaks and the CERROR() is the
 * default arm — verify against full source. */
1379 static inline void ll_file_put_lock(struct inode *inode, obd_off end,
1380 enum ll_lock_style lock_style,
1381 struct lustre_handle *lockh,
1382 struct ll_lock_tree *tree, int rw)
1385 switch (lock_style) {
1386 case LL_LOCK_STYLE_TREELOCK:
1387 ll_tree_unlock(tree);
1389 case LL_LOCK_STYLE_FASTLOCK:
1390 ll_file_put_fast_lock(inode, end, lockh, rw);
1393 CERROR("invalid locking style (%d)\n", lock_style);
/* Read path: compiled as ll_file_readv() on kernels with f_op->readv, else
 * as ll_file_aio_read().  Validates the iovec, handles the no-object case
 * by zero-filling, then loops over chunks (bounded by stripe end and
 * ll_max_rw_chunk), taking a fast/tree/no lock per chunk, fixing up KMS vs
 * i_size under ll_inode_size_lock(), and dispatching to generic page-cache
 * read or lockless direct I/O.
 * NOTE(review): gap-sampled — the chunk-loop back-edge, several closing
 * braces and error returns are in non-visible lines. */
1397 #ifdef HAVE_FILE_READV
1398 static ssize_t ll_file_readv(struct file *file, const struct iovec *iov,
1399 unsigned long nr_segs, loff_t *ppos)
1402 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1403 unsigned long nr_segs, loff_t pos)
1405 struct file *file = iocb->ki_filp;
1406 loff_t *ppos = &iocb->ki_pos;
1408 struct inode *inode = file->f_dentry->d_inode;
1409 struct ll_inode_info *lli = ll_i2info(inode);
1410 struct lov_stripe_md *lsm = lli->lli_smd;
1411 struct ll_sb_info *sbi = ll_i2sbi(inode);
1412 struct ll_thread_data ltd = { 0 };
1414 struct ll_ra_read bead;
1417 ssize_t retval, chunk, sum = 0;
1418 struct iovec *iov_copy = NULL;
1419 unsigned long nrsegs_copy, nrsegs_orig = 0;
1420 size_t count, iov_offset = 0;
1424 count = ll_file_get_iov_count(iov, &nr_segs);
1425 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1426 inode->i_ino, inode->i_generation, inode, count, *ppos);
1427 /* "If nbyte is 0, read() will return 0 and have no other results."
1428 * -- Single Unix Spec */
1432 ll_stats_ops_tally(sbi, LPROC_LL_READ_BYTES, count);
1435 /* Read on file with no objects should return zero-filled
1436 * buffers up to file size (we can get non-zero sizes with
1437 * mknod + truncate, then opening file for read. This is a
1438 * common pattern in NFS case, it seems). Bug 6243 */
1440 /* Since there are no objects on OSTs, we have nothing to get
1441 * lock on and so we are forced to access inode->i_size
1444 /* Read beyond end of file */
1445 if (*ppos >= i_size_read(inode))
1448 if (count > i_size_read(inode) - *ppos)
1449 count = i_size_read(inode) - *ppos;
1450 /* Make sure to correctly adjust the file pos pointer for
/* Zero-fill the user buffers segment by segment; clear_user() returns the
 * number of bytes it could NOT clear (EFAULT tail). */
1452 for (nrsegs_copy = 0; nrsegs_copy < nr_segs; nrsegs_copy++) {
1453 const struct iovec *iv = &iov[nrsegs_copy];
1455 if (count < iv->iov_len)
1458 chunk = iv->iov_len;
1459 notzeroed = clear_user(iv->iov_base, chunk);
1460 sum += (chunk - notzeroed);
1461 count -= (chunk - notzeroed);
1462 if (notzeroed || !count)
1471 ltd.ltd_magic = LTD_MAGIC;
/* Per-chunk (re)initialization; the loop back-edge is not visible here. */
1474 memset(&ltd, 0, sizeof(ltd));
1475 ltd.ltd_magic = LTD_MAGIC;
1476 if (sbi->ll_max_rw_chunk != 0 && !(file->f_flags & O_DIRECT)) {
1477 /* first, let's know the end of the current stripe */
1479 obd_extent_calc(sbi->ll_osc_exp, lsm, OBD_CALC_STRIPE_END,&end);
1481 /* correct, the end is beyond the request */
1482 if (end > *ppos + count - 1)
1483 end = *ppos + count - 1;
1485 /* and chunk shouldn't be too large even if striping is wide */
1486 if (end - *ppos > sbi->ll_max_rw_chunk)
1487 end = *ppos + sbi->ll_max_rw_chunk - 1;
1489 chunk = end - *ppos + 1;
/* Last (or only) chunk covers the whole remaining request: reuse the
 * caller's iovec directly instead of keeping a private copy. */
1490 if ((count == chunk) && (iov_offset == 0)) {
1492 OBD_FREE(iov_copy, sizeof(*iov) * nrsegs_orig);
1494 iov_copy = (struct iovec *)iov;
1495 nrsegs_copy = nr_segs;
1498 nrsegs_orig = nr_segs;
1499 OBD_ALLOC(iov_copy, sizeof(*iov) * nr_segs);
1501 GOTO(out, retval = -ENOMEM);
1504 iov_copy_update(&nr_segs, &iov, &nrsegs_copy, iov_copy,
1505 &iov_offset, chunk);
1508 end = *ppos + count - 1;
1509 iov_copy = (struct iovec *)iov;
1510 nrsegs_copy = nr_segs;
1513 down_read(&lli->lli_truncate_rwsem); /* Bug 18233 */
1515 ltd.lock_style = ll_file_get_lock(file, (obd_off)(*ppos), end,
1516 iov_copy, nrsegs_copy,
1517 &ltd.u.lockh, &ltd.u.tree,
/* lli_truncate_rwsem is held only while a lock is held; drop it on the
 * no-lock and error paths before bailing out. */
1519 if (ltd.lock_style < 0 || ltd.lock_style == LL_LOCK_STYLE_NOLOCK)
1520 up_read(&lli->lli_truncate_rwsem);
1521 if (ltd.lock_style < 0)
1522 GOTO(out, retval = ltd.lock_style);
1524 ll_inode_size_lock(inode, 1);
1526 * Consistency guarantees: following possibilities exist for the
1527 * relation between region being read and real file size at this
1530 * (A): the region is completely inside of the file;
1532 * (B-x): x bytes of region are inside of the file, the rest is
1535 * (C): the region is completely outside of the file.
1537 * This classification is stable under DLM lock acquired by
1538 * ll_tree_lock() above, because to change class, other client has to
1539 * take DLM lock conflicting with our lock. Also, any updates to
1540 * ->i_size by other threads on this client are serialized by
1541 * ll_inode_size_lock(). This guarantees that short reads are handled
1542 * correctly in the face of concurrent writes and truncates.
1544 inode_init_lvb(inode, &lvb);
1545 obd_merge_lvb(ll_i2sbi(inode)->ll_osc_exp, lsm, &lvb, 1);
1547 if (*ppos + count - 1 > kms) {
1548 /* A glimpse is necessary to determine whether we return a
1549 * short read (B) or some zeroes at the end of the buffer (C) */
1550 ll_inode_size_unlock(inode, 1);
1551 retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
1553 if (ltd.lock_style != LL_LOCK_STYLE_NOLOCK) {
1554 ll_file_put_lock(inode, end, ltd.lock_style,
1555 &ltd.u.lockh, &ltd.u.tree,
1557 up_read(&lli->lli_truncate_rwsem);
1561 /* If objective page index exceed the end-of-file page
1562 * index, return directly. Do not expect kernel will
1563 * check such case correctly. linux-2.6.18-128.1.1 miss
1564 * to do that. --bug 17336 */
1565 loff_t size = i_size_read(inode);
1566 unsigned long cur_index = *ppos >> CFS_PAGE_SHIFT;
1568 if ((size == 0 && cur_index != 0) ||
1569 (((size - 1) >> CFS_PAGE_SHIFT) < cur_index)) {
1570 if (ltd.lock_style != LL_LOCK_STYLE_NOLOCK) {
1572 ll_file_put_lock(inode, end,
1577 up_read(&lli->lli_truncate_rwsem);
1583 /* region is within kms and, hence, within real file size (A).
1584 * We need to increase i_size to cover the read region so that
1585 * generic_file_read() will do its job, but that doesn't mean
1586 * the kms size is _correct_, it is only the _minimum_ size.
1587 * If someone does a stat they will get the correct size which
1588 * will always be >= the kms value here. b=11081 */
1589 if (i_size_read(inode) < kms)
1590 i_size_write(inode, kms);
1591 ll_inode_size_unlock(inode, 1);
1594 chunk = end - *ppos + 1;
1595 CDEBUG(D_INODE,"Read ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
1596 inode->i_ino, chunk, *ppos, i_size_read(inode));
1598 /* turn off the kernel's read-ahead */
1599 if (ltd.lock_style != LL_LOCK_STYLE_NOLOCK) {
1600 struct ost_lvb *xtimes;
1603 * 1. update inode's atime as long as concurrent stat
1604 * (via ll_glimpse_size) might bring out-of-date ones
1606 * 2. update lsm so that next stat (via
1607 * ll_glimpse_size) could get correct values in lsm */
1608 OBD_ALLOC_PTR(xtimes);
1609 if (NULL == xtimes) {
1610 ll_file_put_lock(inode, end, ltd.lock_style,
1611 &ltd.u.lockh, &ltd.u.tree,
1613 up_read(&lli->lli_truncate_rwsem);
1614 GOTO(out, retval = -ENOMEM);
1617 lov_stripe_lock(lsm);
1618 LTIME_S(inode->i_atime) = LTIME_S(CURRENT_TIME);
1619 xtimes->lvb_atime = LTIME_S(inode->i_atime);
1620 obd_update_lvb(sbi->ll_osc_exp, lsm, xtimes,
1622 lov_stripe_unlock(lsm);
1623 OBD_FREE_PTR(xtimes);
1625 file->f_ra.ra_pages = 0;
1626 /* initialize read-ahead window once per syscall */
1629 ll_ra_read_init(file, &bead, *ppos, count);
1633 file_accessed(file);
1634 #ifdef HAVE_FILE_READV
1635 retval = generic_file_readv(file, iov_copy, nrsegs_copy, ppos);
1637 retval = generic_file_aio_read(iocb, iov_copy, nrsegs_copy,
1640 ll_file_put_lock(inode, end, ltd.lock_style, &ltd.u.lockh,
1641 &ltd.u.tree, OBD_BRW_READ);
1642 up_read(&lli->lli_truncate_rwsem);
/* Lockless branch: bypass the page cache with direct I/O and account it. */
1644 file_accessed(file);
1645 retval = ll_direct_IO(READ, file, iov_copy, *ppos, nr_segs, 0);
1647 lprocfs_counter_add(sbi->ll_stats,
1648 LPROC_LL_LOCKLESS_READ,
1653 ll_rw_stats_tally(sbi, current->pid, file, count, 0);
/* A full chunk with bytes remaining means another loop iteration follows
 * (the back-edge is not visible in this chunk). */
1657 if (retval == chunk && count > 0)
1664 ll_ra_read_ex(file, &bead);
1665 retval = (sum > 0) ? sum : retval;
1667 if (iov_copy && iov_copy != iov)
1668 OBD_FREE(iov_copy, sizeof(*iov) * nrsegs_orig);
/* read(2) entry point: wrap the single user buffer in a one-element iovec
 * and forward to ll_file_readv() (old kernels) or to ll_file_aio_read()
 * through a synchronous kiocb, propagating the advanced position back to
 * *ppos. */
1673 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1676 struct iovec local_iov = { .iov_base = (void __user *)buf,
1678 #ifdef HAVE_FILE_READV
1679 return ll_file_readv(file, &local_iov, 1, ppos);
1684 init_sync_kiocb(&kiocb, file);
1685 kiocb.ki_pos = *ppos;
1686 kiocb.ki_left = count;
1688 ret = ll_file_aio_read(&kiocb, &local_iov, 1, kiocb.ki_pos);
1689 *ppos = kiocb.ki_pos;
1694 /* iov_shorten from linux kernel */
/* Truncate an iovec array in place so that it describes at most `to` bytes
 * (local copy of the kernel's iov_shorten()).  The segment containing the
 * cutoff has its iov_len clipped; the count of segments actually used is
 * returned.  If the array holds fewer than `to` bytes, all segments are
 * consumed unmodified. */
static unsigned long ll_iov_shorten(struct iovec *iov,
                                    unsigned long nr_segs,
                                    size_t to)
{
        size_t done = 0;
        unsigned long used = 0;

        for (; used < nr_segs; iov++) {
                used++;
                if (done + iov->iov_len >= to) {
                        /* this segment straddles the limit: clip it */
                        iov->iov_len = to - done;
                        break;
                }
                done += iov->iov_len;
        }
        return used;
}
1714 /* 2.6.22 and 2.6.27 export this as generic_segment_checks */
/* Local copy of generic_segment_checks (exported by 2.6.22/2.6.27 kernels,
 * per the comment above): validate an iovec for the given access direction,
 * computing the usable byte count and trimming *nr_segs at the first
 * inaccessible segment.  Presumably returns 0 with the count stored through
 * an out-parameter, -EINVAL/-EFAULT on bad segments — the stores/returns
 * are in non-visible lines. */
1715 static int ll_generic_segment_checks(const struct iovec *iov,
1716 unsigned long *nr_segs,
1722 for (seg = 0; seg < *nr_segs; seg++) {
1723 const struct iovec *iv = &iov[seg];
1726 * If any segment has a negative length, or the cumulative
1727 * length ever wraps negative then return -EINVAL.
1730 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1732 if (access_ok(access_flags, iv->iov_base, iv->iov_len))
1737 cnt -= iv->iov_len; /* This segment is no good */
1745 * Write to a file (through the page cache).
/* Write path: compiled as ll_file_writev() on kernels with f_op->writev,
 * else as ll_file_aio_write().  Serializes writers via lli_write_sem
 * (unless LL_FILE_IGNORE_LOCK), then loops over chunks: computes the lock
 * extent (EOF for O_APPEND, stripe/ll_max_rw_chunk-bounded otherwise),
 * takes a tree lock, enforces maxbytes/SIGXFSZ, updates [mc]time in the
 * lsm, and dispatches to the generic page-cache write or lockless direct
 * I/O.  NOTE(review): gap-sampled — the chunk-loop back-edge, lock_start
 * assignments and several braces are in non-visible lines. */
1747 #ifdef HAVE_FILE_WRITEV
1748 static ssize_t ll_file_writev(struct file *file, const struct iovec *iov,
1749 unsigned long nr_segs, loff_t *ppos)
1751 #else /* AIO stuff */
1752 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1753 unsigned long nr_segs, loff_t pos)
1755 struct file *file = iocb->ki_filp;
1756 loff_t *ppos = &iocb->ki_pos;
1758 struct inode *inode = file->f_dentry->d_inode;
1759 struct ll_sb_info *sbi = ll_i2sbi(inode);
1760 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1761 struct ll_thread_data ltd = { 0 };
1762 loff_t maxbytes = ll_file_maxbytes(inode);
1763 loff_t lock_start, lock_end, end;
1764 ssize_t retval, chunk, sum = 0;
1766 struct iovec *iov_copy = NULL;
1767 unsigned long nrsegs_copy, nrsegs_orig = 0;
1768 size_t count, iov_offset = 0;
1769 int got_write_sem = 0;
1770 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1773 count = ll_file_get_iov_count(iov, &nr_segs);
1775 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1776 inode->i_ino, inode->i_generation, inode, count, *ppos);
1778 SIGNAL_MASK_ASSERT(); /* XXX BUG 1511 */
1780 /* POSIX, but surprised the VFS doesn't check this already */
1784 /* If file was opened for LL_IOC_LOV_SETSTRIPE but the ioctl wasn't
1785 * called on the file, don't fail the below assertion (bug 2388). */
1786 if (file->f_flags & O_LOV_DELAY_CREATE &&
1787 ll_i2info(inode)->lli_smd == NULL)
1790 LASSERT(ll_i2info(inode)->lli_smd != NULL);
1792 /* signal(7) specifies that write(2) and writev(2) should be restarted */
1793 if (!(fd->fd_flags & LL_FILE_IGNORE_LOCK)) {
1795 if (down_interruptible(&ll_i2info(inode)->lli_write_sem))
1796 RETURN(-ERESTARTSYS);
1799 ltd.ltd_magic = LTD_MAGIC;
/* Per-chunk (re)initialization; the loop back-edge is not visible here. */
1802 memset(&ltd, 0, sizeof(ltd));
1803 ltd.ltd_magic = LTD_MAGIC;
1805 chunk = 0; /* just to fix gcc's warning */
1806 end = *ppos + count - 1;
/* O_APPEND must lock to EOF since the final offset is only known under
 * the lock. */
1808 if (file->f_flags & O_APPEND) {
1810 lock_end = OBD_OBJECT_EOF;
1811 iov_copy = (struct iovec *)iov;
1812 nrsegs_copy = nr_segs;
1813 } else if (sbi->ll_max_rw_chunk != 0 && !(file->f_flags & O_DIRECT)) {
1814 /* first, let's know the end of the current stripe */
1816 obd_extent_calc(sbi->ll_osc_exp, lsm, OBD_CALC_STRIPE_END,
1819 /* correct, the end is beyond the request */
1820 if (end > *ppos + count - 1)
1821 end = *ppos + count - 1;
1823 /* and chunk shouldn't be too large even if striping is wide */
1824 if (end - *ppos > sbi->ll_max_rw_chunk)
1825 end = *ppos + sbi->ll_max_rw_chunk - 1;
1828 chunk = end - *ppos + 1;
1829 if ((count == chunk) && (iov_offset == 0)) {
1831 OBD_FREE(iov_copy, sizeof(*iov) * nrsegs_orig);
1833 iov_copy = (struct iovec *)iov;
1834 nrsegs_copy = nr_segs;
1837 nrsegs_orig = nr_segs;
1838 OBD_ALLOC(iov_copy, sizeof(*iov) * nr_segs);
1840 GOTO(out, retval = -ENOMEM);
1842 iov_copy_update(&nr_segs, &iov, &nrsegs_copy, iov_copy,
1843 &iov_offset, chunk);
1848 iov_copy = (struct iovec *)iov;
1849 nrsegs_copy = nr_segs;
1852 tree_locked = ll_file_get_tree_lock_iov(&ltd.u.tree, file, iov_copy,
1854 (obd_off)lock_start,
1857 if (tree_locked < 0)
1858 GOTO(out, retval = tree_locked);
1860 /* This is ok, g_f_w will overwrite this under i_sem if it races
1861 * with a local truncate, it just makes our maxbyte checking easier.
1862 * The i_size value gets updated in ll_extent_lock() as a consequence
1863 * of the [0,EOF] extent lock we requested above. */
1864 if (file->f_flags & O_APPEND) {
1865 *ppos = i_size_read(inode);
1866 end = *ppos + count - 1;
1869 if (*ppos >= maxbytes) {
1870 send_sig(SIGXFSZ, current, 0);
1871 GOTO(out_unlock, retval = -EFBIG);
1873 if (end > maxbytes - 1)
1876 /* generic_file_write handles O_APPEND after getting i_mutex */
1877 chunk = end - *ppos + 1;
1878 CDEBUG(D_INFO, "Writing inode %lu, "LPSZ" bytes, offset %Lu\n",
1879 inode->i_ino, chunk, *ppos);
1881 struct ost_lvb *xtimes;
1882 /* write under locks
1884 * 1. update inode's mtime and ctime as long as
1885 * concurrent stat (via ll_glimpse_size) might bring
1888 * 2. update lsm so that next stat (via
1889 * ll_glimpse_size) could get correct values in lsm */
1890 OBD_ALLOC_PTR(xtimes);
1892 GOTO(out_unlock, retval = -ENOMEM);
1894 lov_stripe_lock(lsm);
1895 LTIME_S(inode->i_mtime) = LTIME_S(CURRENT_TIME);
1896 LTIME_S(inode->i_ctime) = LTIME_S(CURRENT_TIME);
1897 xtimes->lvb_mtime = LTIME_S(inode->i_mtime);
1898 xtimes->lvb_ctime = LTIME_S(inode->i_ctime);
1899 obd_update_lvb(sbi->ll_osc_exp, lsm, xtimes,
1900 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1901 lov_stripe_unlock(lsm);
1902 OBD_FREE_PTR(xtimes);
1904 ltd.lock_style = LL_LOCK_STYLE_TREELOCK;
1906 #ifdef HAVE_FILE_WRITEV
1907 retval = generic_file_writev(file, iov_copy, nrsegs_copy, ppos);
1909 retval = generic_file_aio_write(iocb, iov_copy, nrsegs_copy,
/* Lockless write branch: re-run the generic checks the page-cache path
 * would have performed, then go through direct I/O. */
1913 size_t ocount, ncount;
1915 retval = ll_generic_segment_checks(iov_copy, &nrsegs_copy,
1916 &ocount, VERIFY_READ);
1922 retval = generic_write_checks(file, ppos, &ncount, 0);
1926 if (unlikely(ocount != ncount)) {
1927 /* we are allowed to modify the original iov too */
1928 nrsegs_copy = ll_iov_shorten(iov_copy, nrsegs_copy,
1930 chunk = 0; /* no repetition after the short write */
1933 retval = ll_remove_suid(file, file->f_vfsmnt);
1937 ll_update_time(file);
1938 retval = ll_direct_IO(WRITE, file, iov_copy, *ppos, nr_segs, 0);
1940 lprocfs_counter_add(sbi->ll_stats,
1941 LPROC_LL_LOCKLESS_WRITE,
1946 ll_rw_stats_tally(ll_i2sbi(inode), current->pid, file, chunk, 1);
1950 ll_tree_unlock(&ltd.u.tree);
/* A full chunk with bytes remaining repeats the loop (back-edge not
 * visible in this chunk). */
1956 if (retval == chunk && count > 0)
1961 up(&ll_i2info(inode)->lli_write_sem);
1964 if (iov_copy && iov_copy != iov)
1965 OBD_FREE(iov_copy, sizeof(*iov) * nrsegs_orig);
1967 retval = (sum > 0) ? sum : retval;
1968 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1969 retval > 0 ? retval : 0);
/* write(2) entry point: wrap the single user buffer in a one-element iovec
 * and forward to ll_file_writev() (old kernels) or to ll_file_aio_write()
 * through a synchronous kiocb, propagating the advanced position back to
 * *ppos. */
1973 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1976 struct iovec local_iov = { .iov_base = (void __user *)buf,
1979 #ifdef HAVE_FILE_WRITEV
1980 return ll_file_writev(file, &local_iov, 1, ppos);
1985 init_sync_kiocb(&kiocb, file);
1986 kiocb.ki_pos = *ppos;
1987 kiocb.ki_left = count;
1989 ret = ll_file_aio_write(&kiocb, &local_iov, 1, kiocb.ki_pos);
1990 *ppos = kiocb.ki_pos;
1996 #ifdef HAVE_KERNEL_SENDFILE
1998 * Send file content (through pagecache) somewhere with helper
/* sendfile(2) support: take a PR tree lock over the region, fix up i_size
 * from KMS (glimpsing when the request extends past it), then hand off to
 * generic_file_sendfile() with Lustre read-ahead bracketing the call.
 * Files with no OST objects skip locking entirely. */
2000 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,
2001 size_t count, read_actor_t actor, void *target)
2003 struct inode *inode = in_file->f_dentry->d_inode;
2004 struct ll_inode_info *lli = ll_i2info(inode);
2005 struct lov_stripe_md *lsm = lli->lli_smd;
2006 struct ll_lock_tree tree;
2007 struct ll_lock_tree_node *node;
2009 struct ll_ra_read bead;
2014 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
2015 inode->i_ino, inode->i_generation, inode, count, *ppos);
2017 /* "If nbyte is 0, read() will return 0 and have no other results."
2018 * -- Single Unix Spec */
2022 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_READ_BYTES, count);
2023 /* turn off the kernel's read-ahead */
2024 in_file->f_ra.ra_pages = 0;
2026 /* File with no objects, nothing to lock */
2028 rc = generic_file_sendfile(in_file, ppos, count, actor, target);
2032 node = ll_node_from_inode(inode, *ppos, *ppos + count - 1, LCK_PR);
2034 RETURN(PTR_ERR(node));
2036 tree.lt_fd = LUSTRE_FPRIVATE(in_file);
2037 rc = ll_tree_lock(&tree, node, NULL, count,
2038 in_file->f_flags & O_NONBLOCK?LDLM_FL_BLOCK_NOWAIT:0);
2042 ll_clear_file_contended(inode);
2043 ll_inode_size_lock(inode, 1);
2045 * Consistency guarantees: following possibilities exist for the
2046 * relation between region being read and real file size at this
2049 * (A): the region is completely inside of the file;
2051 * (B-x): x bytes of region are inside of the file, the rest is
2054 * (C): the region is completely outside of the file.
2056 * This classification is stable under DLM lock acquired by
2057 * ll_tree_lock() above, because to change class, other client has to
2058 * take DLM lock conflicting with our lock. Also, any updates to
2059 * ->i_size by other threads on this client are serialized by
2060 * ll_inode_size_lock(). This guarantees that short reads are handled
2061 * correctly in the face of concurrent writes and truncates.
2063 inode_init_lvb(inode, &lvb);
2064 obd_merge_lvb(ll_i2sbi(inode)->ll_osc_exp, lsm, &lvb, 1);
2066 if (*ppos + count - 1 > kms) {
2067 /* A glimpse is necessary to determine whether we return a
2068 * short read (B) or some zeroes at the end of the buffer (C) */
2069 ll_inode_size_unlock(inode, 1);
2070 rc = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
2074 /* region is within kms and, hence, within real file size (A) */
2075 i_size_write(inode, kms);
2076 ll_inode_size_unlock(inode, 1);
2079 CDEBUG(D_INFO, "Send ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
2080 inode->i_ino, count, *ppos, i_size_read(inode));
2082 ll_ra_read_init(in_file, &bead, *ppos, count);
2084 file_accessed(in_file);
2085 rc = generic_file_sendfile(in_file, ppos, count, actor, target);
2086 ll_ra_read_ex(in_file, &bead);
2089 ll_tree_unlock(&tree);
2095 * http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=f0930fffa99e7fe0a0c4b6c7d9a244dc88288c27
2097 #ifdef HAVE_KERNEL_SPLICE_READ
/* splice_read support (kernels where sendfile was replaced by splice):
 * structurally identical to ll_file_sendfile() — PR tree lock over the
 * region, KMS/i_size fixup with a glimpse when needed, then
 * generic_file_splice_read() bracketed by Lustre read-ahead. */
2098 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
2099 struct pipe_inode_info *pipe, size_t count,
2102 struct inode *inode = in_file->f_dentry->d_inode;
2103 struct ll_inode_info *lli = ll_i2info(inode);
2104 struct lov_stripe_md *lsm = lli->lli_smd;
2105 struct ll_lock_tree tree;
2106 struct ll_lock_tree_node *node;
2108 struct ll_ra_read bead;
2113 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
2114 inode->i_ino, inode->i_generation, inode, count, *ppos);
2116 /* "If nbyte is 0, read() will return 0 and have no other results."
2117 * -- Single Unix Spec */
2121 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_READ_BYTES, count);
2122 /* turn off the kernel's read-ahead */
2123 in_file->f_ra.ra_pages = 0;
2125 /* File with no objects, nothing to lock */
2127 rc = generic_file_splice_read(in_file, ppos, pipe, count, flags);
2131 node = ll_node_from_inode(inode, *ppos, *ppos + count - 1, LCK_PR);
2133 RETURN(PTR_ERR(node));
2135 tree.lt_fd = LUSTRE_FPRIVATE(in_file);
2136 rc = ll_tree_lock(&tree, node, NULL, count,
2137 in_file->f_flags & O_NONBLOCK?LDLM_FL_BLOCK_NOWAIT:0);
2141 ll_clear_file_contended(inode);
2142 ll_inode_size_lock(inode, 1);
2144 * Consistency guarantees: following possibilities exist for the
2145 * relation between region being read and real file size at this
2148 * (A): the region is completely inside of the file;
2150 * (B-x): x bytes of region are inside of the file, the rest is
2153 * (C): the region is completely outside of the file.
2155 * This classification is stable under DLM lock acquired by
2156 * ll_tree_lock() above, because to change class, other client has to
2157 * take DLM lock conflicting with our lock. Also, any updates to
2158 * ->i_size by other threads on this client are serialized by
2159 * ll_inode_size_lock(). This guarantees that short reads are handled
2160 * correctly in the face of concurrent writes and truncates.
2162 inode_init_lvb(inode, &lvb);
2163 obd_merge_lvb(ll_i2sbi(inode)->ll_osc_exp, lsm, &lvb, 1);
2165 if (*ppos + count - 1 > kms) {
2166 /* A glimpse is necessary to determine whether we return a
2167 * short read (B) or some zeroes at the end of the buffer (C) */
2168 ll_inode_size_unlock(inode, 1);
2169 rc = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
2173 /* region is within kms and, hence, within real file size (A) */
2174 i_size_write(inode, kms);
2175 ll_inode_size_unlock(inode, 1);
2178 CDEBUG(D_INFO, "Send ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
2179 inode->i_ino, count, *ppos, i_size_read(inode));
2181 ll_ra_read_init(in_file, &bead, *ppos, count);
2183 file_accessed(in_file);
2184 rc = generic_file_splice_read(in_file, ppos, pipe, count, flags);
2185 ll_ra_read_ex(in_file, &bead);
2188 ll_tree_unlock(&tree);
/* LL_IOC_RECREATE_OBJ ioctl handler: recreate a lost OST object for this
 * file (admin only).  Copies the request from user space, clones the
 * striping metadata, and re-issues obd_create() with OBD_FL_RECREATE_OBJS
 * under lli_size_sem.  NOTE(review): the oa allocation and the checks
 * between the visible lines are gap-sampled. */
2193 static int ll_lov_recreate_obj(struct inode *inode, struct file *file,
2196 struct ll_inode_info *lli = ll_i2info(inode);
2197 struct obd_export *exp = ll_i2obdexp(inode);
2198 struct ll_recreate_obj ucreatp;
2199 struct obd_trans_info oti = { 0 };
2200 struct obdo *oa = NULL;
2203 struct lov_stripe_md *lsm, *lsm2;
/* Object recreation is a destructive admin operation. */
2206 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2209 rc = copy_from_user(&ucreatp, (struct ll_recreate_obj *)arg,
2210 sizeof(struct ll_recreate_obj));
2218 down(&lli->lli_size_sem);
2221 GOTO(out, rc = -ENOENT);
2222 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
2223 (lsm->lsm_stripe_count));
2225 OBD_ALLOC(lsm2, lsm_size);
2227 GOTO(out, rc = -ENOMEM);
/* o_nlink carries the target OST index for the recreate request. */
2229 oa->o_id = ucreatp.lrc_id;
2230 oa->o_nlink = ucreatp.lrc_ost_idx;
2231 oa->o_flags |= OBD_FL_RECREATE_OBJS;
2232 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS;
2233 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
2234 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
2236 memcpy(lsm2, lsm, lsm_size);
2237 rc = obd_create(exp, oa, &lsm2, &oti);
2239 OBD_FREE(lsm2, lsm_size);
2242 up(&lli->lli_size_sem);
/* Apply user-supplied striping (lov_user_md) to a file by re-opening it
 * with an IT_OPEN intent carrying the EA.  Fails with -EEXIST-style refusal
 * (visible as the CDEBUG below) if a stripe already exists; serialized
 * against size/stripe changes by lli_size_sem. */
2247 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
2248 int flags, struct lov_user_md *lum,
2251 struct ll_inode_info *lli = ll_i2info(inode);
2252 struct lov_stripe_md *lsm;
2253 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
2257 down(&lli->lli_size_sem);
2260 up(&lli->lli_size_sem);
2261 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
2266 rc = ll_intent_file_open(file, lum, lum_size, &oit);
2269 if (it_disposition(&oit, DISP_LOOKUP_NEG))
2270 GOTO(out_req_free, rc = -ENOENT);
2271 rc = oit.d.lustre.it_status;
2273 GOTO(out_req_free, rc);
/* Drop the MDS open handle acquired purely for the setstripe intent. */
2275 ll_release_openhandle(file->f_dentry, &oit);
2278 up(&lli->lli_size_sem);
2279 ll_intent_release(&oit);
2282 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/* Fetch a file's LOV EA from the MDS by name: getattr with
 * OBD_MD_FLEASIZE, validate the magic, byte-swap to host endianness when
 * needed, and expand JOIN-format metadata into a lov_user_md_join the
 * caller can hand to user space.  On success *lmmp/*lmm_size point into
 * (or replace) the reply buffer; *request must be released by the caller. */
2286 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
2287 struct lov_mds_md **lmmp, int *lmm_size,
2288 struct ptlrpc_request **request)
2290 struct ll_sb_info *sbi = ll_i2sbi(inode);
2292 struct mds_body *body;
2293 struct lov_mds_md *lmm = NULL;
2294 struct ptlrpc_request *req = NULL;
2297 ll_inode2fid(&fid, inode);
2299 rc = ll_get_max_mdsize(sbi, &lmmsize);
2303 rc = mdc_getattr_name(sbi->ll_mdc_exp, &fid,
2304 filename, strlen(filename) + 1,
2305 OBD_MD_FLEASIZE | OBD_MD_FLDIREA,
2308 CDEBUG(D_INFO, "mdc_getattr_name failed "
2309 "on %s: rc %d\n", filename, rc);
2313 body = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF,
2315 LASSERT(body != NULL); /* checked by mdc_getattr_name */
2316 /* swabbed by mdc_getattr_name */
2317 LASSERT(lustre_rep_swabbed(req, REPLY_REC_OFF));
2319 lmmsize = body->eadatasize;
2321 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
2323 GOTO(out, rc = -ENODATA);
2326 lmm = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF + 1,
2328 LASSERT(lmm != NULL);
2329 LASSERT(lustre_rep_swabbed(req, REPLY_REC_OFF + 1));
2331 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
2332 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3)) &&
2333 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_JOIN))) {
2334 GOTO(out, rc = -EPROTO);
2337 * This is coming from the MDS, so is probably in
2338 * little endian. We convert it to host endian before
2339 * passing it to userspace.
/* Only swab on big-endian hosts (LOV_MAGIC differs from its LE form). */
2341 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
2342 /* if function called for directory - we should
2343 * avoid swab not existent lsm objects */
2344 if ((lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) ||
2345 (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3))) {
2346 lustre_swab_lov_user_md((struct lov_user_md*)lmm);
2347 if (S_ISREG(body->mode))
2348 lustre_swab_lov_user_md_objects(
2349 (struct lov_user_md*)lmm);
2350 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_JOIN)) {
2351 lustre_swab_lov_user_md_join((struct lov_user_md_join *)lmm);
/* JOIN files: unpack the MD, then rebuild per-object extent info so user
 * space sees a flat lov_user_md_join. */
2355 if (lmm->lmm_magic == LOV_MAGIC_JOIN) {
2356 struct lov_stripe_md *lsm;
2357 struct lov_user_md_join *lmj;
2358 int lmj_size, i, aindex = 0;
2360 rc = obd_unpackmd(sbi->ll_osc_exp, &lsm, lmm, lmmsize);
2362 GOTO(out, rc = -ENOMEM);
2363 rc = obd_checkmd(sbi->ll_osc_exp, sbi->ll_mdc_exp, lsm);
2365 GOTO(out_free_memmd, rc);
2367 lmj_size = sizeof(struct lov_user_md_join) +
2368 lsm->lsm_stripe_count *
2369 sizeof(struct lov_user_ost_data_join);
2370 OBD_ALLOC(lmj, lmj_size);
2372 GOTO(out_free_memmd, rc = -ENOMEM);
2374 memcpy(lmj, lmm, sizeof(struct lov_user_md_join));
2375 for (i = 0; i < lsm->lsm_stripe_count; i++) {
2376 struct lov_extent *lex =
2377 &lsm->lsm_array->lai_ext_array[aindex];
2379 if (lex->le_loi_idx + lex->le_stripe_count <= i)
2381 CDEBUG(D_INFO, "aindex %d i %d l_extent_start "
2382 LPU64" len %d\n", aindex, i,
2383 lex->le_start, (int)lex->le_len);
2384 lmj->lmm_objects[i].l_extent_start =
/* le_len == -1 denotes an unbounded (EOF) extent. */
2387 if ((int)lex->le_len == -1)
2388 lmj->lmm_objects[i].l_extent_end = -1;
2390 lmj->lmm_objects[i].l_extent_end =
2391 lex->le_start + lex->le_len;
2392 lmj->lmm_objects[i].l_object_id =
2393 lsm->lsm_oinfo[i]->loi_id;
2394 lmj->lmm_objects[i].l_object_gr =
2395 lsm->lsm_oinfo[i]->loi_gr;
2396 lmj->lmm_objects[i].l_ost_gen =
2397 lsm->lsm_oinfo[i]->loi_ost_gen;
2398 lmj->lmm_objects[i].l_ost_idx =
2399 lsm->lsm_oinfo[i]->loi_ost_idx;
2401 lmm = (struct lov_mds_md *)lmj;
2404 obd_free_memmd(sbi->ll_osc_exp, &lsm);
2408 *lmm_size = lmmsize;
/* LL_IOC_LOV_SETEA ioctl: admin-only raw EA set.  Copies a lov_user_md
 * (with one lov_user_ost_data) from user space and applies it via
 * ll_lov_setstripe_ea_info() with MDS_OPEN_HAS_OBJS. */
2412 static int ll_lov_setea(struct inode *inode, struct file *file,
2415 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2416 struct lov_user_md *lump;
2417 int lum_size = sizeof(struct lov_user_md) +
2418 sizeof(struct lov_user_ost_data);
2422 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2425 OBD_ALLOC(lump, lum_size);
2429 rc = copy_from_user(lump, (struct lov_user_md *)arg, lum_size);
2431 OBD_FREE(lump, lum_size);
2435 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
2437 OBD_FREE(lump, lum_size);
/* LL_IOC_LOV_SETSTRIPE ioctl: read the user's striping request (probing
 * the smaller v1 layout first, then re-reading as v3 if the magic says
 * so), apply it, and on success echo the resulting stripe info back to the
 * user buffer via LL_IOC_LOV_GETSTRIPE. */
2441 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2444 struct lov_user_md_v3 lumv3;
2445 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
2446 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
2447 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
2450 int flags = FMODE_WRITE;
2453 /* first try with v1 which is smaller than v3 */
2454 lum_size = sizeof(struct lov_user_md_v1);
2455 rc = copy_from_user(lumv1, lumv1p, lum_size);
2459 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
2460 lum_size = sizeof(struct lov_user_md_v3);
2461 rc = copy_from_user(&lumv3, lumv3p, lum_size);
2466 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
/* Clear the user's stripe_count first so a short GETSTRIPE reply cannot
 * leave a stale count behind. */
2468 put_user(0, &lumv1p->lmm_stripe_count);
2469 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2obdexp(inode),
2470 0, ll_i2info(inode)->lli_smd,
/* LL_IOC_LOV_GETSTRIPE ioctl: copy the file's in-memory striping out to
 * the user buffer through the LOV's iocontrol handler.  The non-visible
 * lines presumably return -ENODATA when lsm is NULL — verify. */
2476 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
2478 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
2483 return obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2obdexp(inode), 0, lsm,
/* LL_IOC_GROUP_LOCK ioctl: take a whole-file LCK_GROUP extent lock with
 * the given group id.  While held, normal extent locking is bypassed
 * (LL_FILE_IGNORE_LOCK) and the handle is stashed in the file descriptor
 * for ll_put_grouplock(). */
2487 static int ll_get_grouplock(struct inode *inode, struct file *file,
2490 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2491 ldlm_policy_data_t policy = { .l_extent = { .start = 0,
2492 .end = OBD_OBJECT_EOF}};
2493 struct lustre_handle lockh = { 0 };
2494 struct ll_inode_info *lli = ll_i2info(inode);
2495 struct lov_stripe_md *lsm = lli->lli_smd;
/* Already group-locked on this fd: refuse a second grouplock. */
2499 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2503 policy.l_extent.gid = arg;
2504 if (file->f_flags & O_NONBLOCK)
2505 flags = LDLM_FL_BLOCK_NOWAIT;
2507 rc = ll_extent_lock(fd, inode, lsm, LCK_GROUP, &policy, &lockh, flags);
2511 fd->fd_flags |= LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK;
2513 memcpy(&fd->fd_cwlockh, &lockh, sizeof(lockh));
/* LL_IOC_GROUP_UNLOCK ioctl: drop the group lock taken by
 * ll_get_grouplock().  Rejects unlocks when no group lock is held or when
 * the caller's gid does not match the one stored on the fd. */
2518 static int ll_put_grouplock(struct inode *inode, struct file *file,
2521 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2522 struct ll_inode_info *lli = ll_i2info(inode);
2523 struct lov_stripe_md *lsm = lli->lli_smd;
2527 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2528 /* Ugh, it's already unlocked. */
2532 if (fd->fd_gid != arg) /* Ugh? Unlocking with different gid? */
/* Clear the flags before cancelling so new I/O goes back to normal
 * extent locking immediately. */
2535 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
2537 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP, &fd->fd_cwlockh);
2542 memset(&fd->fd_cwlockh, 0, sizeof(fd->fd_cwlockh));
2547 #if LUSTRE_FIX >= 50
/*
 * Validate a file-join request before attempting it: the server must
 * advertise join support, both inodes must be regular files, a file
 * cannot be joined to itself, and the head's size must be a multiple
 * of JOIN_FILE_ALIGN (64K).
 */
2548 static int join_sanity_check(struct inode *head, struct inode *tail)
2551 if ((ll_i2sbi(head)->ll_flags & LL_SBI_JOIN) == 0) {
2552 CERROR("server do not support join \n");
2555 if (!S_ISREG(tail->i_mode) || !S_ISREG(head->i_mode)) {
2556 CERROR("tail ino %lu and ino head %lu must be regular\n",
2557 head->i_ino, tail->i_ino);
2560 if (head->i_ino == tail->i_ino) {
2561 CERROR("file %lu can not be joined to itself \n", head->i_ino);
2564 if (i_size_read(head) % JOIN_FILE_ALIGN) {
2565 CERROR("hsize %llu must be times of 64K\n", i_size_read(head));
/*
 * Issue the MDS join request: enqueue an IT_OPEN intent with mode
 * M_JOIN_FILE against the head inode, passing the tail file's name and
 * the head's current size as the join point.  Any lock or open handle
 * obtained as a side effect of the enqueue is released before return.
 */
2571 static int join_file(struct inode *head_inode, struct file *head_filp,
2572 struct file *tail_filp)
2574 struct dentry *tail_dentry = tail_filp->f_dentry;
2575 struct lookup_intent oit = {.it_op = IT_OPEN,
2576 .it_flags = head_filp->f_flags,
2577 .it_create_mode = M_JOIN_FILE};
2578 struct ldlm_enqueue_info einfo = { LDLM_IBITS, LCK_PW,
2579 ll_mdc_blocking_ast, ldlm_completion_ast, NULL, NULL };
2581 struct lustre_handle lockh;
2582 struct mdc_op_data *op_data;
2587 tail_dentry = tail_filp->f_dentry;
2589 OBD_ALLOC_PTR(op_data);
2590 if (op_data == NULL) {
/* The join offset: tail data is appended at the head's current size. */
2594 data = i_size_read(head_inode);
2595 ll_prepare_mdc_op_data(op_data, head_inode,
2596 tail_dentry->d_parent->d_inode,
2597 tail_dentry->d_name.name,
2598 tail_dentry->d_name.len, 0, &data);
2599 rc = mdc_enqueue(ll_i2mdcexp(head_inode), &einfo, &oit,
2600 op_data, &lockh, NULL, 0, 0);
2605 rc = oit.d.lustre.it_status;
2607 if (rc < 0 || it_open_error(DISP_OPEN_OPEN, &oit)) {
2608 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, &oit);
2609 ptlrpc_req_finished((struct ptlrpc_request *)
2610 oit.d.lustre.it_data);
/* We only needed the intent's side effect; drop any granted lock. */
2614 if (oit.d.lustre.it_lock_mode) { /* If we got lock - release it right
2616 ldlm_lock_decref(&lockh, oit.d.lustre.it_lock_mode);
2617 oit.d.lustre.it_lock_mode = 0;
2619 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
2620 it_clear_disposition(&oit, DISP_ENQ_COMPLETE);
/* Close the open handle the MDS created for the join intent. */
2621 ll_release_openhandle(head_filp->f_dentry, &oit);
2624 OBD_FREE_PTR(op_data);
2625 ll_intent_release(&oit);
/*
 * LL_IOC_JOIN entry point: append file 'filename_tail' to 'head'.
 * Opens the tail, takes whole-file LCK_EX tree locks on both inodes in
 * ino order (to avoid lock-ordering deadlocks), sanity-checks the pair,
 * performs the MDS join, then unwinds via the cleanup_phase switch.
 * On success the head's cached stripe MD is freed so it is refetched.
 */
2629 static int ll_file_join(struct inode *head, struct file *filp,
2630 char *filename_tail)
2632 struct inode *tail = NULL, *first = NULL, *second = NULL;
2633 struct dentry *tail_dentry;
2634 struct file *tail_filp, *first_filp, *second_filp;
2635 struct ll_lock_tree first_tree, second_tree;
2636 struct ll_lock_tree_node *first_node, *second_node;
2637 struct ll_inode_info *hlli = ll_i2info(head);
2638 int rc = 0, cleanup_phase = 0;
2641 CDEBUG(D_VFSTRACE, "VFS Op:head=%lu/%u(%p) tail %s\n",
2642 head->i_ino, head->i_generation, head, filename_tail);
2644 tail_filp = filp_open(filename_tail, O_WRONLY, 0644);
2645 if (IS_ERR(tail_filp)) {
2646 CERROR("Can not open tail file %s", filename_tail);
2647 rc = PTR_ERR(tail_filp);
2650 tail = igrab(tail_filp->f_dentry->d_inode);
2652 tail_dentry = tail_filp->f_dentry;
2653 LASSERT(tail_dentry);
/* Always lock the higher-ino inode first for a stable lock order. */
2656 /*reorder the inode for lock sequence*/
2657 first = head->i_ino > tail->i_ino ? head : tail;
2658 second = head->i_ino > tail->i_ino ? tail : head;
2659 first_filp = head->i_ino > tail->i_ino ? filp : tail_filp;
2660 second_filp = head->i_ino > tail->i_ino ? tail_filp : filp;
2662 CDEBUG(D_INFO, "reorder object from %lu:%lu to %lu:%lu \n",
2663 head->i_ino, tail->i_ino, first->i_ino, second->i_ino);
2664 first_node = ll_node_from_inode(first, 0, OBD_OBJECT_EOF, LCK_EX);
2665 if (IS_ERR(first_node)){
2666 rc = PTR_ERR(first_node);
2669 first_tree.lt_fd = first_filp->private_data;
2670 rc = ll_tree_lock(&first_tree, first_node, NULL, 0, 0);
2675 second_node = ll_node_from_inode(second, 0, OBD_OBJECT_EOF, LCK_EX);
2676 if (IS_ERR(second_node)){
2677 rc = PTR_ERR(second_node);
2680 second_tree.lt_fd = second_filp->private_data;
2681 rc = ll_tree_lock(&second_tree, second_node, NULL, 0, 0);
2686 rc = join_sanity_check(head, tail);
2690 rc = join_file(head, filp, tail_filp);
/* Unwind in reverse acquisition order; phases fall through. */
2694 switch (cleanup_phase) {
2696 ll_tree_unlock(&second_tree);
2697 obd_cancel_unused(ll_i2obdexp(second),
2698 ll_i2info(second)->lli_smd, 0, NULL);
2700 ll_tree_unlock(&first_tree);
2701 obd_cancel_unused(ll_i2obdexp(first),
2702 ll_i2info(first)->lli_smd, 0, NULL);
2704 filp_close(tail_filp, 0);
/* Join succeeded: the head's layout changed; drop the cached MD. */
2707 if (head && rc == 0) {
2708 obd_free_memmd(ll_i2sbi(head)->ll_osc_exp,
2710 hlli->lli_smd = NULL;
2715 CERROR("invalid cleanup_phase %d\n", cleanup_phase);
2720 #endif /* LUSTRE_FIX >= 50 */
2723 * Close inode open handle
2725 * \param dentry [in] dentry which contains the inode
2726 * \param it [in,out] intent which contains open info and result
2729 * \retval <0 failure
/*
 * Close the MDS open handle carried by a lookup intent.  Used when an
 * intent opened the file but no struct file will be created (e.g. the
 * join path above).  No-op for the fs root or when the intent carries
 * no DISP_OPEN_OPEN disposition.
 */
2731 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2733 struct inode *inode = dentry->d_inode;
2734 struct obd_client_handle *och;
2740 /* Root ? Do nothing. */
2741 if (dentry->d_inode->i_sb->s_root == dentry)
2744 /* No open handle to close? Move away */
2745 if (!it_disposition(it, DISP_OPEN_OPEN))
2748 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2750 OBD_ALLOC(och, sizeof(*och));
2752 GOTO(out, rc = -ENOMEM);
/* Populate the client handle from the intent, then close it. */
2754 ll_och_fill(ll_i2info(inode), it, och);
2756 rc = ll_close_inode_openhandle(inode, och);
2758 OBD_FREE(och, sizeof(*och));
/* this one is in place of ll_file_open */
2761 if (it_disposition(it, DISP_ENQ_OPEN_REF))
2762 ptlrpc_req_finished(it->d.lustre.it_data);
2763 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Core fiemap implementation: validate the request flags, optionally
 * flush dirty pages (FIEMAP_FLAG_SYNC), build a ll_fiemap_info_key for
 * the file's object, and fetch the extent mapping from the OSC via
 * obd_get_info(KEY_FIEMAP).  'num_bytes' is the total size of the
 * caller-supplied fiemap buffer including its extent array.
 */
2767 int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
2770 struct obd_export *exp = ll_i2obdexp(inode);
2771 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
2772 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
2773 int vallen = num_bytes;
2777 /* Checks for fiemap flags */
2778 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* Leave only the unsupported bits set so the caller can see them. */
2779 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2783 /* Check for FIEMAP_FLAG_SYNC */
2784 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2785 rc = filemap_fdatawrite(inode->i_mapping);
2790 /* If the stripe_count > 1 and the application does not understand
2791 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
2793 if (lsm->lsm_stripe_count > 1 &&
2794 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
2797 fm_key.oa.o_id = lsm->lsm_object_id;
2798 fm_key.oa.o_valid = OBD_MD_FLID;
2800 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLFID | OBD_MD_FLSIZE);
2802 /* If filesize is 0, then there would be no objects for mapping */
2803 if (fm_key.oa.o_size == 0) {
2804 fiemap->fm_mapped_extents = 0;
2808 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
2810 rc = obd_get_info(exp, sizeof(fm_key), &fm_key, &vallen, fiemap, lsm);
2812 CERROR("obd_get_info failed: rc = %d\n", rc);
/*
 * FSFILT_IOC_FIEMAP ioctl handler: size a kernel buffer from the
 * user's fm_extent_count, copy the request (and, when extents were
 * requested, the first extent which seeds end_offset/device for
 * continued mappings) in from userspace, run ll_do_fiemap(), and copy
 * the header plus mapped extents back out.
 */
2817 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
2819 struct ll_user_fiemap *fiemap_s;
2820 size_t num_bytes, ret_bytes;
2821 unsigned int extent_count;
2824 /* Get the extent count so we can calculate the size of
2825 * required fiemap buffer */
2826 if (get_user(extent_count,
2827 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
/* NOTE(review): extent_count is user-controlled; the multiplication
 * below could overflow on a hostile count — verify upstream checks. */
2829 num_bytes = sizeof(*fiemap_s) + (extent_count *
2830 sizeof(struct ll_fiemap_extent));
/* vmalloc: the extent array can exceed a kmalloc-able size. */
2832 OBD_VMALLOC(fiemap_s, num_bytes);
2833 if (fiemap_s == NULL)
2836 /* get the fiemap value */
2837 if (copy_from_user(fiemap_s,(struct ll_user_fiemap __user *)arg,
2839 GOTO(error, rc = -EFAULT);
2841 /* If fm_extent_count is non-zero, read the first extent since
2842 * it is used to calculate end_offset and device from previous
2845 if (copy_from_user(&fiemap_s->fm_extents[0],
2846 (char __user *)arg + sizeof(*fiemap_s),
2847 sizeof(struct ll_fiemap_extent)))
2848 GOTO(error, rc = -EFAULT);
2851 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
2855 ret_bytes = sizeof(struct ll_user_fiemap);
2857 if (extent_count != 0)
2858 ret_bytes += (fiemap_s->fm_mapped_extents *
2859 sizeof(struct ll_fiemap_extent));
2861 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
2865 OBD_VFREE(fiemap_s, num_bytes);
/*
 * Main ioctl dispatcher for regular files.  Handles Lustre-specific
 * commands (flags, striping, fiemap, group locks, join, statfs, name
 * and FID queries) directly, consults dynamically registered handlers
 * via ll_iocontrol_call(), and finally falls through to the OSC's
 * obd_iocontrol for anything unrecognized.
 */
2869 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
2872 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2876 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2877 inode->i_generation, inode, cmd);
2878 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2880 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2881 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2885 case LL_IOC_GETFLAGS:
2886 /* Get the current value of the file flags */
2887 return put_user(fd->fd_flags, (int *)arg);
2888 case LL_IOC_SETFLAGS:
2889 case LL_IOC_CLRFLAGS:
2890 /* Set or clear specific file flags */
2891 /* XXX This probably needs checks to ensure the flags are
2892 * not abused, and to handle any flag side effects.
2894 if (get_user(flags, (int *) arg))
2897 if (cmd == LL_IOC_SETFLAGS) {
/* LL_FILE_IGNORE_LOCK is only safe when I/O bypasses the page
 * cache, hence the O_DIRECT requirement. */
2898 if ((flags & LL_FILE_IGNORE_LOCK) &&
2899 !(file->f_flags & O_DIRECT)) {
2900 CERROR("%s: unable to disable locking on "
2901 "non-O_DIRECT file\n", current->comm);
2905 fd->fd_flags |= flags;
2907 fd->fd_flags &= ~flags;
2910 case LL_IOC_LOV_SETSTRIPE:
2911 RETURN(ll_lov_setstripe(inode, file, arg));
2912 case LL_IOC_LOV_SETEA:
2913 RETURN(ll_lov_setea(inode, file, arg));
2914 case LL_IOC_LOV_GETSTRIPE:
2915 RETURN(ll_lov_getstripe(inode, arg));
2916 case LL_IOC_RECREATE_OBJ:
2917 RETURN(ll_lov_recreate_obj(inode, file, arg));
2918 case FSFILT_IOC_FIEMAP:
2919 RETURN(ll_ioctl_fiemap(inode, arg));
2920 case FSFILT_IOC_GETFLAGS:
2921 case FSFILT_IOC_SETFLAGS:
2922 RETURN(ll_iocontrol(inode, file, cmd, arg));
2923 case FSFILT_IOC_GETVERSION_OLD:
2924 case FSFILT_IOC_GETVERSION:
2925 RETURN(put_user(inode->i_generation, (int *)arg));
2927 #if LUSTRE_FIX >= 50
2928 /* Allow file join in beta builds to allow debugging */
2932 ftail = getname((const char *)arg);
2934 RETURN(PTR_ERR(ftail));
2935 rc = ll_file_join(inode, file, ftail);
2939 CWARN("file join is not supported in this version of Lustre\n");
2943 case LL_IOC_GROUP_LOCK:
2944 RETURN(ll_get_grouplock(inode, file, arg));
2945 case LL_IOC_GROUP_UNLOCK:
2946 RETURN(ll_put_grouplock(inode, file, arg));
2947 case IOC_OBD_STATFS:
2948 RETURN(ll_obd_statfs(inode, (void *)arg));
2949 case OBD_IOC_GETNAME_OLD:
2950 case OBD_IOC_GETNAME: {
2951 struct obd_device *obd =
2952 class_exp2obd(ll_i2sbi(inode)->ll_osc_exp);
2955 if (copy_to_user((void *)arg, obd->obd_name,
2956 strlen(obd->obd_name) + 1))
2960 case LL_IOC_PATH2FID: {
2961 if (copy_to_user((void *)arg, ll_inode_lu_fid(inode),
2962 sizeof(struct lu_fid)))
2968 /* We need to special case any other ioctls we want to handle,
2969 * to send them to the MDS/OST as appropriate and to properly
2970 * network encode the arg field.
2971 case EXT3_IOC_SETVERSION_OLD:
2972 case EXT3_IOC_SETVERSION:
/* Give dynamically registered ioctl handlers a chance first. */
2978 ll_iocontrol_call(inode, file, cmd, arg, &err))
2981 RETURN(obd_iocontrol(cmd, ll_i2obdexp(inode), 0, NULL,
/*
 * llseek implementation.  SEEK_END (origin == 2) requires the current
 * authoritative file size, so the size is refreshed with a glimpse
 * lock (non-blocking if the file was opened O_NONBLOCK) and read under
 * the inode size lock.  The new offset is bounded by the filesystem
 * maximum before being stored in f_pos.
 */
2987 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2989 struct inode *inode = file->f_dentry->d_inode;
2990 struct ll_inode_info *lli = ll_i2info(inode);
2991 struct lov_stripe_md *lsm = lli->lli_smd;
/* 'retval' here is computed for the debug message only. */
2994 retval = offset + ((origin == 2) ? i_size_read(inode) :
2995 (origin == 1) ? file->f_pos : 0);
2996 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%Lu=%#Lx(%s)\n",
2997 inode->i_ino, inode->i_generation, inode, retval, retval,
2998 origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR" : "SEEK_SET");
2999 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3001 if (origin == 2) { /* SEEK_END */
3002 int nonblock = 0, rc;
3004 if (file->f_flags & O_NONBLOCK)
3005 nonblock = LDLM_FL_BLOCK_NOWAIT;
/* Refresh i_size from the OSTs before using it. */
3008 rc = ll_glimpse_size(inode, nonblock);
3013 ll_inode_size_lock(inode, 0);
3014 offset += i_size_read(inode);
3015 ll_inode_size_unlock(inode, 0);
3016 } else if (origin == 1) { /* SEEK_CUR */
3017 offset += file->f_pos;
3021 if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
3022 if (offset != file->f_pos) {
3023 file->f_pos = offset;
3024 file->f_version = 0;
/*
 * flush() file operation (signature differs across kernel versions).
 * Does no I/O itself: it only reports asynchronous writeback errors
 * previously recorded on the inode and the stripe MD, clearing them
 * in the process, and maps any error to -EIO.
 */
3032 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,5)
3033 int ll_flush(struct file *file)
3035 int ll_flush(struct file *file, fl_owner_t id)
3038 struct inode *inode = file->f_dentry->d_inode;
3039 struct ll_inode_info *lli = ll_i2info(inode);
3040 struct lov_stripe_md *lsm = lli->lli_smd;
3043 /* catch async errors that were recorded back when async writeback
3044 * failed for pages in this mapping. */
3045 rc = lli->lli_async_rc;
/* Report-once semantics: clear the stored error after reading it. */
3046 lli->lli_async_rc = 0;
3048 err = lov_test_and_clear_async_rc(lsm);
3053 return rc ? -EIO : 0;
/*
 * fsync() file operation: wait for in-flight page writeback, collect
 * any recorded async writeback errors, then synchronously flush the
 * metadata (mdc_sync to the MDS) and, via obd_sync_rqset, the data
 * objects on the OSTs.
 */
3056 int ll_fsync(struct file *file, struct dentry *dentry, int data)
3058 struct inode *inode = dentry->d_inode;
3059 struct ll_inode_info *lli = ll_i2info(inode);
3060 struct lov_stripe_md *lsm = lli->lli_smd;
3062 struct ptlrpc_request *req;
3065 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
3066 inode->i_generation, inode);
3067 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3069 /* fsync's caller has already called _fdata{sync,write}, we want
3070 * that IO to finish before calling the osc and mdc sync methods */
3071 rc = filemap_fdatawait(inode->i_mapping);
3073 /* catch async errors that were recorded back when async writeback
3074 * failed for pages in this mapping. */
3075 err = lli->lli_async_rc;
3076 lli->lli_async_rc = 0;
3080 err = lov_test_and_clear_async_rc(lsm);
/* Sync metadata on the MDS first... */
3085 ll_inode2fid(&fid, inode);
3086 err = mdc_sync(ll_i2sbi(inode)->ll_mdc_exp, &fid, &req);
3090 ptlrpc_req_finished(req);
3093 struct obd_info *oinfo;
3095 OBD_ALLOC_PTR(oinfo);
3097 RETURN(rc ? rc : -ENOMEM);
3098 OBDO_ALLOC(oinfo->oi_oa);
3099 if (!oinfo->oi_oa) {
3100 OBD_FREE_PTR(oinfo);
3101 RETURN(rc ? rc : -ENOMEM);
/* ...then sync the data objects on the OSTs. */
3103 oinfo->oi_oa->o_id = lsm->lsm_object_id;
3104 oinfo->oi_oa->o_gr = lsm->lsm_object_gr;
3105 oinfo->oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
3106 obdo_from_inode(oinfo->oi_oa, inode,
3107 OBD_MD_FLTYPE | OBD_MD_FLATIME |
3108 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
3110 err = obd_sync_rqset(ll_i2sbi(inode)->ll_osc_exp, oinfo,
3114 OBDO_FREE(oinfo->oi_oa);
3115 OBD_FREE_PTR(oinfo);
/*
 * POSIX fcntl / BSD flock lock handler.  Translates the kernel
 * file_lock into an LDLM_FLOCK enqueue against the MDS: fl_type maps
 * to a lock mode (F_RDLCK->PR, F_WRLCK->PW, F_UNLCK->NL as an unlock
 * substitute) and the cmd selects blocking, non-blocking or test-only
 * (LDLM_FL_TEST_LOCK) behaviour.  On success the lock is also recorded
 * locally so the VFS lock lists stay consistent.
 */
3121 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3123 struct inode *inode = file->f_dentry->d_inode;
3124 struct ll_sb_info *sbi = ll_i2sbi(inode);
3125 struct lu_fid *fid = ll_inode_lu_fid(inode);
3126 struct ldlm_res_id res_id =
3127 { .name = { fid_seq(fid),
3131 struct ldlm_enqueue_info einfo = { LDLM_FLOCK, 0, NULL,
3132 ldlm_flock_completion_ast, NULL, file_lock };
3133 struct lustre_handle lockh = {0};
3134 ldlm_policy_data_t flock;
3139 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
3140 inode->i_ino, file_lock);
3141 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3143 if (fid_is_igif(fid)) {
3144 /* If this is an IGIF inode, we need to keep the 1.6-style
3145 * flock mapping for compatibility. If it is a proper FID
3146 * then we know any other client accessing it must also be
3147 * accessing it as a FID and can use the CMD-style flock. */
3148 res_id.name[2] = LDLM_FLOCK;
3152 if (file_lock->fl_flags & FL_FLOCK) {
3153 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3154 /* set missing params for flock() calls */
3155 file_lock->fl_end = OFFSET_MAX;
3156 file_lock->fl_pid = current->tgid;
3158 flock.l_flock.pid = file_lock->fl_pid;
3159 flock.l_flock.start = file_lock->fl_start;
3160 flock.l_flock.end = file_lock->fl_end;
/* Map the fcntl lock type onto an LDLM lock mode. */
3162 switch (file_lock->fl_type) {
3164 einfo.ei_mode = LCK_PR;
3167 /* An unlock request may or may not have any relation to
3168 * existing locks so we may not be able to pass a lock handle
3169 * via a normal ldlm_lock_cancel() request. The request may even
3170 * unlock a byte range in the middle of an existing lock. In
3171 * order to process an unlock request we need all of the same
3172 * information that is given with a normal read or write record
3173 * lock request. To avoid creating another ldlm unlock (cancel)
3174 * message we'll treat a LCK_NL flock request as an unlock. */
3175 einfo.ei_mode = LCK_NL;
3178 einfo.ei_mode = LCK_PW;
3181 CERROR("unknown fcntl lock type: %d\n", file_lock->fl_type);
/* Non-blocking set: fail instead of waiting for a conflict. */
3196 flags = LDLM_FL_BLOCK_NOWAIT;
/* F_GETLK-style query: test for conflicts without taking the lock. */
3202 flags = LDLM_FL_TEST_LOCK;
3203 /* Save the old mode so that if the mode in the lock changes we
3204 * can decrement the appropriate reader or writer refcount. */
3205 file_lock->fl_type = einfo.ei_mode;
3208 CERROR("unknown fcntl lock command: %d\n", cmd);
3212 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
3213 "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
3214 flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
3216 rc = ldlm_cli_enqueue(sbi->ll_mdc_exp, NULL, &einfo, res_id,
3217 &flock, &flags, NULL, 0, NULL, &lockh, 0);
/* Mirror the granted (or released) lock into the local VFS lists. */
3218 if ((file_lock->fl_flags & FL_FLOCK) &&
3219 (rc == 0 || file_lock->fl_type == F_UNLCK))
3220 ll_flock_lock_file_wait(file, file_lock, (cmd == F_SETLKW));
3221 #ifdef HAVE_F_OP_FLOCK
3222 if ((file_lock->fl_flags & FL_POSIX) &&
3223 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3224 !(flags & LDLM_FL_TEST_LOCK))
3225 posix_lock_file_wait(file, file_lock);
/* Lock op installed by the -o noflock mount option: rejects all
 * flock/fcntl lock requests (see ll_file_operations_noflock below). */
3231 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
/*
 * Check whether this client already holds a granted MDS inodebits lock
 * covering 'bits' on the inode.  Uses LDLM_FL_TEST_LOCK so no reference
 * is taken; LDLM_FL_CBPENDING locks (about to be cancelled) also match.
 */
3238 int ll_have_md_lock(struct inode *inode, __u64 bits)
3240 struct lustre_handle lockh;
3241 struct ldlm_res_id res_id;
3242 struct obd_device *obddev;
3243 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
3250 obddev = ll_i2mdcexp(inode)->exp_obd;
3251 fid_build_reg_res_name(ll_inode_lu_fid(inode), &res_id);
3253 CDEBUG(D_INFO, "trying to match res "LPU64":"LPU64":"LPU64"\n",
3258 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* Any mode counts: CR/CW/PR/PW all protect the requested bits. */
3259 if (ldlm_lock_match(obddev->obd_namespace, flags, &res_id, LDLM_IBITS,
3260 &policy, LCK_CR|LCK_CW|LCK_PR|LCK_PW, &lockh)) {
/*
 * Post-process a revalidation RPC result: -ENOENT means the file was
 * unlinked while we held it, which is tolerated (treated as success
 * after adjusting nlink); any other error is logged and propagated.
 */
3267 static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
3268 if (rc == -ENOENT) { /* Already unlinked. Just update nlink
3269 * and return success */
3271 /* This path cannot be hit for regular files unless in
3272 * case of obscure races, so no need to to validate
3274 if (!S_ISREG(inode->i_mode) &&
3275 !S_ISDIR(inode->i_mode))
3280 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
/*
 * Revalidate a dentry/inode against the MDS.  If the server supports
 * getattr-by-FID (OBD_CONNECT_ATTRFID) an IT_GETATTR intent lock is
 * taken, unhashing the dentry when the file turns out to be unlinked.
 * Otherwise, a plain mdc_getattr is issued unless a covering inodebits
 * lock (ibits) already guarantees our attributes are current.
 */
3288 int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
3291 struct inode *inode = dentry->d_inode;
3292 struct ptlrpc_request *req = NULL;
3293 struct obd_export *exp;
/* Guard against a NULL inode — should not happen in practice. */
3298 CERROR("REPORT THIS LINE TO PETER\n");
3301 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
3302 inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
3304 exp = ll_i2mdcexp(inode);
3306 if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
3307 struct lookup_intent oit = { .it_op = IT_GETATTR };
3308 struct mdc_op_data op_data = { { 0 } };
3310 /* Call getattr by fid, so do not provide name at all. */
3311 ll_prepare_mdc_op_data(&op_data, dentry->d_parent->d_inode,
3312 dentry->d_inode, NULL, 0, 0, NULL);
3313 oit.it_create_mode |= M_CHECK_STALE;
3314 rc = mdc_intent_lock(exp, &op_data, NULL, 0,
3315 /* we are not interested in name
3318 ll_mdc_blocking_ast, 0);
3319 oit.it_create_mode &= ~M_CHECK_STALE;
3321 rc = ll_inode_revalidate_fini(inode, rc);
3325 rc = revalidate_it_finish(req, DLM_REPLY_REC_OFF, &oit, dentry);
3327 ll_intent_release(&oit);
3331 /* Unlinked? Unhash dentry, so it is not picked up later by
3332 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3333 here to preserve get_cwd functionality on 2.6.
3335 if (!dentry->d_inode->i_nlink) {
3336 spin_lock(&ll_lookup_lock);
3337 spin_lock(&dcache_lock);
3338 ll_drop_dentry(dentry);
3339 spin_unlock(&dcache_lock);
3340 spin_unlock(&ll_lookup_lock);
3343 ll_lookup_finish_locks(&oit, dentry);
/* No covering MD lock: fetch attributes with a plain getattr RPC. */
3344 } else if (!ll_have_md_lock(dentry->d_inode, ibits)) {
3345 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3347 obd_valid valid = OBD_MD_FLGETATTR;
/* Regular files: also request the striping EA (sized to the max). */
3350 if (S_ISREG(inode->i_mode)) {
3351 rc = ll_get_max_mdsize(sbi, &ealen);
3354 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3356 ll_inode2fid(&fid, inode);
3357 rc = mdc_getattr(sbi->ll_mdc_exp, &fid, valid, ealen, &req);
3359 rc = ll_inode_revalidate_fini(inode, rc);
3363 rc = ll_prep_inode(sbi->ll_osc_exp, &inode, req, REPLY_REC_OFF,
3368 ptlrpc_req_finished(req);
/*
 * Full revalidation: refresh MDS attributes (UPDATE|LOOKUP bits), then
 * refresh the size.  Files with no data objects yet take their times
 * from the cached lock-value-block; otherwise a size glimpse is sent
 * to the OSTs.
 */
3372 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
3374 struct inode *inode = dentry->d_inode;
3378 rc = __ll_inode_revalidate_it(dentry, it, MDS_INODELOCK_UPDATE |
3379 MDS_INODELOCK_LOOKUP);
3381 /* if object not yet allocated, don't validate size */
3382 if (rc == 0 && ll_i2info(inode)->lli_smd == NULL) {
3383 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
3384 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
3385 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
3389 /* ll_glimpse_size will prefer locally cached writes if they extend
3393 rc = ll_glimpse_size(inode, 0);
/*
 * getattr with an explicit lookup intent: revalidate the inode first,
 * then copy its attributes into the kstat.  Size and block count are
 * read under the inode size lock for a consistent pair.
 */
3398 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
3399 struct lookup_intent *it, struct kstat *stat)
3401 struct inode *inode = de->d_inode;
3404 res = ll_inode_revalidate_it(de, it);
3405 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETATTR, 1);
3410 stat->dev = inode->i_sb->s_dev;
3411 stat->ino = inode->i_ino;
3412 stat->mode = inode->i_mode;
3413 stat->nlink = inode->i_nlink;
3414 stat->uid = inode->i_uid;
3415 stat->gid = inode->i_gid;
3416 stat->rdev = kdev_t_to_nr(inode->i_rdev);
3417 stat->atime = inode->i_atime;
3418 stat->mtime = inode->i_mtime;
3419 stat->ctime = inode->i_ctime;
3420 #ifdef HAVE_INODE_BLKSIZE
3421 stat->blksize = inode->i_blksize;
3423 stat->blksize = 1<<inode->i_blkbits;
3426 ll_inode_size_lock(inode, 0);
3427 stat->size = i_size_read(inode);
3428 stat->blocks = inode->i_blocks;
3429 ll_inode_size_unlock(inode, 0);
/* VFS ->getattr entry point: wrap ll_getattr_it() with a fresh
 * IT_GETATTR intent. */
3433 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3435 struct lookup_intent it = { .it_op = IT_GETATTR };
3437 return ll_getattr_it(mnt, de, &it, stat);
3440 #ifdef HAVE_LINUX_FIEMAP_H
/*
 * VFS ->fiemap handler: translate the kernel fiemap_extent_info into a
 * ll_user_fiemap buffer (sized for fi_extents_max extents), run the
 * common ll_do_fiemap(), and copy flags and mapped extents back.
 */
3441 int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3442 __u64 start, __u64 len)
3446 struct ll_user_fiemap *fiemap;
3447 unsigned int extent_count = fieinfo->fi_extents_max;
3449 num_bytes = sizeof(*fiemap) + (extent_count *
3450 sizeof(struct ll_fiemap_extent));
3451 OBD_VMALLOC(fiemap, num_bytes);
3456 fiemap->fm_flags = fieinfo->fi_flags;
3457 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3458 fiemap->fm_start = start;
3459 fiemap->fm_length = len;
/* Seed with the first extent from a previous (continued) call. */
3460 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3461 sizeof(struct ll_fiemap_extent));
3463 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3465 fieinfo->fi_flags = fiemap->fm_flags;
3466 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3467 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3468 fiemap->fm_mapped_extents * sizeof(struct ll_fiemap_extent));
3470 OBD_VFREE(fiemap, num_bytes);
/*
 * ACL check callback for generic_permission(): evaluate the cached
 * POSIX ACL against 'mask'.  The ACL is duplicated under lli_lock so
 * it can be evaluated without holding the spinlock.  Compiled out
 * when CONFIG_FS_POSIX_ACL is not set.
 */
3477 int lustre_check_acl(struct inode *inode, int mask)
3479 #ifdef CONFIG_FS_POSIX_ACL
3480 struct ll_inode_info *lli = ll_i2info(inode);
3481 struct posix_acl *acl;
3485 spin_lock(&lli->lli_lock);
3486 acl = posix_acl_dup(lli->lli_posix_acl);
3487 spin_unlock(&lli->lli_lock);
3492 rc = posix_acl_permission(inode, acl, mask);
3493 posix_acl_release(acl);
/*
 * VFS ->permission.  Two variants selected by kernel version:
 *  - >= 2.6.10: revalidate the root inode if needed (lookup never
 *    validates it) and delegate to generic_permission() with the
 *    lustre_check_acl callback.
 *  - older kernels: open-coded UNIX permission logic (owner, group,
 *    other, ACLs, then capability overrides), mirroring what the
 *    kernel's generic check would do.
 */
3501 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10))
3502 #ifndef HAVE_INODE_PERMISION_2ARGS
3503 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3505 int ll_inode_permission(struct inode *inode, int mask)
3511 /* as root inode are NOT getting validated in lookup operation,
3512 * need to do it before permission check. */
3514 if (inode == inode->i_sb->s_root->d_inode) {
3515 struct lookup_intent it = { .it_op = IT_LOOKUP };
3517 rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
3518 MDS_INODELOCK_LOOKUP);
3523 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
3524 inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
3526 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
3527 rc = generic_permission(inode, mask, lustre_check_acl);
3532 #ifndef HAVE_INODE_PERMISION_2ARGS
3533 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3535 int ll_inode_permission(struct inode *inode, int mask)
3538 int mode = inode->i_mode;
3541 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
3542 inode->i_ino, inode->i_generation, inode, mask);
3543 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
/* Writes to read-only or immutable inodes are refused outright. */
3545 if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
3546 (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
3548 if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
3550 if (current->fsuid == inode->i_uid) {
3553 if (((mode >> 3) & mask & S_IRWXO) != mask)
3555 rc = lustre_check_acl(inode, mask);
3559 goto check_capabilities;
3563 if (in_group_p(inode->i_gid))
3566 if ((mode & mask & S_IRWXO) == mask)
/* Capability overrides: DAC_OVERRIDE except exec of non-exec files. */
3570 if (!(mask & MAY_EXEC) ||
3571 (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
3572 if (cfs_capable(CFS_CAP_DAC_OVERRIDE))
3575 if (cfs_capable(CFS_CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) ||
3576 (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))))
/* -o localflock - only provides locally consistent flock locks */
/* Default file_operations table: no .flock/.lock entries, so the
 * kernel's local-only locking applies (cluster-wide locks require the
 * _flock table below). */
3584 struct file_operations ll_file_operations = {
3585 .read = ll_file_read,
3586 #ifdef HAVE_FILE_READV
3587 .readv = ll_file_readv,
3589 .aio_read = ll_file_aio_read,
3591 .write = ll_file_write,
3592 #ifdef HAVE_FILE_WRITEV
3593 .writev = ll_file_writev,
3595 .aio_write = ll_file_aio_write,
3597 .ioctl = ll_file_ioctl,
3598 .open = ll_file_open,
3599 .release = ll_file_release,
3600 .mmap = ll_file_mmap,
3601 .llseek = ll_file_seek,
3602 #ifdef HAVE_KERNEL_SPLICE_READ
3603 .splice_read = ll_file_splice_read,
3605 #ifdef HAVE_KERNEL_SENDFILE
3606 .sendfile = ll_file_sendfile,
/* file_operations used with the -o flock mount option: identical to
 * ll_file_operations plus cluster-coherent .flock/.lock handlers
 * backed by ll_file_flock(). */
3612 struct file_operations ll_file_operations_flock = {
3613 .read = ll_file_read,
3614 #ifdef HAVE_FILE_READV
3615 .readv = ll_file_readv,
3617 .aio_read = ll_file_aio_read,
3619 .write = ll_file_write,
3620 #ifdef HAVE_FILE_WRITEV
3621 .writev = ll_file_writev,
3623 .aio_write = ll_file_aio_write,
3625 .ioctl = ll_file_ioctl,
3626 .open = ll_file_open,
3627 .release = ll_file_release,
3628 .mmap = ll_file_mmap,
3629 .llseek = ll_file_seek,
3630 #ifdef HAVE_KERNEL_SPLICE_READ
3631 .splice_read = ll_file_splice_read,
3633 #ifdef HAVE_KERNEL_SENDFILE
3634 .sendfile = ll_file_sendfile,
3638 #ifdef HAVE_F_OP_FLOCK
3639 .flock = ll_file_flock,
3641 .lock = ll_file_flock
/* These are for -o noflock - to return ENOSYS on flock calls */
3645 struct file_operations ll_file_operations_noflock = {
3646 .read = ll_file_read,
3647 #ifdef HAVE_FILE_READV
3648 .readv = ll_file_readv,
3650 .aio_read = ll_file_aio_read,
3652 .write = ll_file_write,
3653 #ifdef HAVE_FILE_WRITEV
3654 .writev = ll_file_writev,
3656 .aio_write = ll_file_aio_write,
3658 .ioctl = ll_file_ioctl,
3659 .open = ll_file_open,
3660 .release = ll_file_release,
3661 .mmap = ll_file_mmap,
3662 .llseek = ll_file_seek,
3663 #ifdef HAVE_KERNEL_SPLICE_READ
3664 .splice_read = ll_file_splice_read,
3666 #ifdef HAVE_KERNEL_SENDFILE
3667 .sendfile = ll_file_sendfile,
/* ll_file_noflock rejects every lock request. */
3671 #ifdef HAVE_F_OP_FLOCK
3672 .flock = ll_file_noflock,
3674 .lock = ll_file_noflock
/* inode_operations for regular files: attribute get/set, permission
 * checking, xattrs, and (when the kernel provides it) fiemap. */
3677 struct inode_operations ll_file_inode_operations = {
3678 #ifdef HAVE_VFS_INTENT_PATCHES
3679 .setattr_raw = ll_setattr_raw,
3681 .setattr = ll_setattr,
3682 .truncate = ll_truncate,
3683 .getattr = ll_getattr,
3684 .permission = ll_inode_permission,
3685 .setxattr = ll_setxattr,
3686 .getxattr = ll_getxattr,
3687 .listxattr = ll_listxattr,
3688 .removexattr = ll_removexattr,
3689 #ifdef HAVE_LINUX_FIEMAP_H
3690 .fiemap = ll_fiemap,
/* dynamic ioctl number support routins */
/* Registry of externally registered ioctl handlers: a list of
 * llioc_data entries protected by a read/write semaphore (readers
 * dispatch, writers register/unregister). */
3695 static struct llioc_ctl_data {
3696 struct rw_semaphore ioc_sem;
3697 struct list_head ioc_head;
3699 __RWSEM_INITIALIZER(llioc.ioc_sem),
3700 CFS_LIST_HEAD_INIT(llioc.ioc_head)
/* One registered handler: its callback plus the iocd_count ioctl
 * command numbers (flexible trailing array) it services. */
3705 struct list_head iocd_list;
3706 unsigned int iocd_size;
3707 llioc_callback_t iocd_cb;
3708 unsigned int iocd_count;
3709 unsigned int iocd_cmd[0];
/*
 * Register a dynamic ioctl handler: callback 'cb' for the 'count'
 * command numbers in 'cmd'.  Returns an opaque cookie (the allocated
 * llioc_data) used later by ll_iocontrol_unregister(), or NULL on bad
 * arguments / allocation failure.
 */
3712 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3715 struct llioc_data *in_data = NULL;
3718 if (cb == NULL || cmd == NULL ||
3719 count > LLIOC_MAX_CMD || count < 0)
/* Allocation covers the header plus the trailing cmd array. */
3722 size = sizeof(*in_data) + count * sizeof(unsigned int);
3723 OBD_ALLOC(in_data, size);
3724 if (in_data == NULL)
3727 memset(in_data, 0, sizeof(*in_data));
3728 in_data->iocd_size = size;
3729 in_data->iocd_cb = cb;
3730 in_data->iocd_count = count;
3731 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3733 down_write(&llioc.ioc_sem);
3734 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3735 up_write(&llioc.ioc_sem);
/*
 * Remove a handler previously registered with ll_iocontrol_register().
 * 'magic' is the cookie that call returned; an unknown cookie is only
 * warned about.  The entry is unlinked under the write lock and freed
 * after the lock is dropped.
 */
3740 void ll_iocontrol_unregister(void *magic)
3742 struct llioc_data *tmp;
3747 down_write(&llioc.ioc_sem);
3748 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3750 unsigned int size = tmp->iocd_size;
3752 list_del(&tmp->iocd_list);
3753 up_write(&llioc.ioc_sem);
3755 OBD_FREE(tmp, size);
3759 up_write(&llioc.ioc_sem);
3761 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
/* Export the dynamic-ioctl registration API to other modules. */
3764 EXPORT_SYMBOL(ll_iocontrol_register);
3765 EXPORT_SYMBOL(ll_iocontrol_unregister);
3767 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
3768 unsigned int cmd, unsigned long arg, int *rcp)
3770 enum llioc_iter ret = LLIOC_CONT;
3771 struct llioc_data *data;
3772 int rc = -EINVAL, i;
3774 down_read(&llioc.ioc_sem);
3775 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3776 for (i = 0; i < data->iocd_count; i++) {
3777 if (cmd != data->iocd_cmd[i])
3780 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3784 if (ret == LLIOC_STOP)
3787 up_read(&llioc.ioc_sem);