1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
6 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 only,
10 * as published by the Free Software Foundation.
12 * This program is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License version 2 for more details (a copy is included
16 * in the LICENSE file that accompanied this code).
18 * You should have received a copy of the GNU General Public License
19 * version 2 along with this program; If not, see
20 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
22 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23 * CA 95054 USA or visit www.sun.com if you need additional information or
29 * Copyright 2008 Sun Microsystems, Inc. All rights reserved
30 * Use is subject to license terms.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include "llite_internal.h"
49 #include <lustre/ll_fiemap.h>
51 /* also used by llite/special.c:ll_special_open() */
/* Allocate a per-open-file ll_file_data from the dedicated slab cache.
 * NOTE(review): intermediate lines are elided in this view; presumably the
 * allocated fd (possibly NULL on allocation failure) is returned -- confirm
 * against the full source. */
52 struct ll_file_data *ll_file_data_get(void)
54 struct ll_file_data *fd;
56 OBD_SLAB_ALLOC_PTR(fd, ll_file_data_slab);
/* Return an ll_file_data obtained from ll_file_data_get() to the slab
 * cache.  NOTE(review): a NULL guard, if any, is elided from this view. */
60 static void ll_file_data_put(struct ll_file_data *fd)
63 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/* Close an open file handle on the MDS: pack the inode's cached attributes
 * into an obdo, issue mdc_close(), destroy orphaned OST objects via
 * ll_objects_destroy(), and clear the open replay data on @och.
 * NOTE(review): several lines (ENTRY/RETURN, obdo allocation, error-path
 * braces) are elided from this view; inline comments below are hedged
 * accordingly. */
66 static int ll_close_inode_openhandle(struct inode *inode,
67 struct obd_client_handle *och)
69 struct ptlrpc_request *req = NULL;
70 struct obd_device *obd;
72 struct mdc_op_data data = { { 0 } };
76 obd = class_exp2obd(ll_i2mdcexp(inode));
/* presumably reached only when obd lookup failed -- confirm guard in full
 * source */
78 CERROR("Invalid MDC connection handle "LPX64"\n",
79 ll_i2mdcexp(inode)->exp_handle.h_cookie);
84 * here we check if this is forced umount. If so this is called on
85 * canceling "open lock" and we do not call mdc_close() in this case, as
86 * it will not be successful, as import is already deactivated.
93 RETURN(-ENOMEM); // XXX We leak openhandle and request here.
/* identify the object and copy cached inode attributes into the close RPC */
95 oa->o_id = inode->i_ino;
96 oa->o_valid = OBD_MD_FLID;
97 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLMODE |
98 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
99 OBD_MD_FLATIME | OBD_MD_FLMTIME |
/* flag uncommitted client writes so the MDS knows size may still change */
101 if (ll_is_inode_dirty(inode)) {
102 oa->o_flags = MDS_BFLAG_UNCOMMITTED_WRITES;
103 oa->o_valid |= OBD_MD_FLFLAGS;
105 ll_inode2fid(&data.fid1, inode);
106 rc = mdc_close(ll_i2mdcexp(inode), &data, oa, och, &req);
108 /* We are the last writer, so the MDS has instructed us to get
109 * the file size and any write cookies, then close again. */
110 ll_queue_done_writing(inode);
113 CERROR("inode %lu mdc close failed: rc = %d\n",
120 rc = ll_objects_destroy(req, inode);
122 CERROR("inode %lu ll_objects destroy: rc = %d\n",
126 ptlrpc_req_finished(req); /* This is close request */
129 mdc_clear_open_replay_data(och);
/* Really close the cached MDS open handle for one open mode (write, exec,
 * or read).  Selects the per-mode handle pointer and use count under
 * lli_och_sem; if the handle is unused, detaches it, closes it on the MDS
 * via ll_close_inode_openhandle(), and frees it.
 * NOTE(review): the lines that swap *och_p out under the semaphore are
 * elided from this view -- confirm ordering in the full source. */
134 int ll_mdc_real_close(struct inode *inode, int flags)
136 struct ll_inode_info *lli = ll_i2info(inode);
138 struct obd_client_handle **och_p;
139 struct obd_client_handle *och;
/* pick which cached MDS open handle this close refers to */
144 if (flags & FMODE_WRITE) {
145 och_p = &lli->lli_mds_write_och;
146 och_usecount = &lli->lli_open_fd_write_count;
147 } else if (flags & FMODE_EXEC) {
148 och_p = &lli->lli_mds_exec_och;
149 och_usecount = &lli->lli_open_fd_exec_count;
151 LASSERT(flags & FMODE_READ);
152 och_p = &lli->lli_mds_read_och;
153 och_usecount = &lli->lli_open_fd_read_count;
156 down(&lli->lli_och_sem);
157 if (*och_usecount) { /* There are still users of this handle, so
159 up(&lli->lli_och_sem);
164 up(&lli->lli_och_sem);
166 if (och) { /* There might be a race and somebody have freed this och
168 rc = ll_close_inode_openhandle(inode, och);
/* poison the cookie so any stale use of the freed handle is detectable */
169 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
170 OBD_FREE(och, sizeof *och);
/* Per-file-descriptor close path: drop any group extent lock held by this
 * descriptor, decrement the per-mode open count under lli_och_sem, and if
 * no cached OPEN DLM lock remains on the inode, perform the real MDS close
 * via ll_mdc_real_close().  Finally detaches and frees the ll_file_data. */
176 int ll_mdc_close(struct obd_export *mdc_exp, struct inode *inode,
179 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
180 struct ll_inode_info *lli = ll_i2info(inode);
184 /* clear group lock, if present */
185 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
186 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
187 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
188 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP,
192 /* Let's see if we have good enough OPEN lock on the file and if
193 we can skip talking to MDS */
194 if (file->f_dentry->d_inode) { /* Can this ever be false? */
196 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
197 struct lustre_handle lockh;
198 struct inode *inode = file->f_dentry->d_inode;
199 struct ldlm_res_id file_res_id;
201 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
202 fid_build_reg_res_name(ll_inode_lu_fid(inode), &file_res_id);
/* drop this descriptor's reference on the matching per-mode open count */
204 down(&lli->lli_och_sem);
205 if (fd->fd_omode & FMODE_WRITE) {
207 LASSERT(lli->lli_open_fd_write_count);
208 lli->lli_open_fd_write_count--;
209 } else if (fd->fd_omode & FMODE_EXEC) {
211 LASSERT(lli->lli_open_fd_exec_count);
212 lli->lli_open_fd_exec_count--;
215 LASSERT(lli->lli_open_fd_read_count);
216 lli->lli_open_fd_read_count--;
218 up(&lli->lli_och_sem);
/* LDLM_FL_TEST_LOCK: probe for a cached OPEN lock without taking a ref;
 * only if no such lock exists do we need the real close RPC */
220 if (!ldlm_lock_match(mdc_exp->exp_obd->obd_namespace, flags,
221 &file_res_id, LDLM_IBITS, &policy,lockmode,
223 rc = ll_mdc_real_close(file->f_dentry->d_inode,
227 CERROR("Releasing a file %p with negative dentry %p. Name %s",
228 file, file->f_dentry, file->f_dentry->d_name.name);
231 LUSTRE_FPRIVATE(file) = NULL;
232 ll_file_data_put(fd);
237 int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm);
239 /* While this returns an error code, fput() the caller does not, so we need
240 * to make every effort to clean up all of our state here. Also, applications
241 * rarely check close errors and even if an error is returned they will not
242 * re-try the close call.
/* VFS ->release() entry point: stop the statahead thread if this descriptor
 * started it, clear any pending async write error recorded on the stripes,
 * and close the MDS open handle via ll_mdc_close(). */
244 int ll_file_release(struct inode *inode, struct file *file)
246 struct ll_file_data *fd;
247 struct ll_sb_info *sbi = ll_i2sbi(inode);
248 struct ll_inode_info *lli = ll_i2info(inode);
249 struct lov_stripe_md *lsm = lli->lli_smd;
253 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
254 inode->i_generation, inode);
/* the mount-root dentry is not counted in per-mount release statistics */
257 if (inode->i_sb->s_root != file->f_dentry)
258 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
259 fd = LUSTRE_FPRIVATE(file);
262 /* The last ref on @file, maybe not the the owner pid of statahead.
263 * Different processes can open the same dir, "ll_opendir_key" means:
264 * it is me that should stop the statahead thread. */
265 if (lli->lli_opendir_key == fd && lli->lli_opendir_pid != 0)
266 ll_stop_statahead(inode, lli->lli_opendir_key);
/* root of the mount has no MDS open handle: just detach and free fd */
268 if (inode->i_sb->s_root == file->f_dentry) {
269 LUSTRE_FPRIVATE(file) = NULL;
270 ll_file_data_put(fd);
275 lov_test_and_clear_async_rc(lsm);
276 lli->lli_async_rc = 0;
278 rc = ll_mdc_close(sbi->ll_mdc_exp, inode, file);
/* Issue an OPEN intent RPC to the MDS for this file.  Requests an OPEN DLM
 * lock unless @lmm/@lmmsize are set (that case is a pure stripe-setting
 * open), then applies the reply to the inode via ll_prep_inode() and
 * attaches the granted lock handle for later matching. */
282 static int ll_intent_file_open(struct file *file, void *lmm,
283 int lmmsize, struct lookup_intent *itp)
285 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
286 struct mdc_op_data data = { { 0 } };
287 struct dentry *parent = file->f_dentry->d_parent;
288 const char *name = file->f_dentry->d_name.name;
289 const int len = file->f_dentry->d_name.len;
290 struct inode *inode = file->f_dentry->d_inode;
291 struct ptlrpc_request *req;
298 ll_prepare_mdc_op_data(&data, parent->d_inode, inode,
299 name, len, O_RDWR, NULL);
301 /* Usually we come here only for NFSD, and we want open lock.
302 But we can also get here with pre 2.6.15 patchless kernels, and in
303 that case that lock is also ok */
304 /* We can also get here if there was cached open handle in revalidate_it
305 * but it disappeared while we were getting from there to ll_file_open.
306 * But this means this file was closed and immediatelly opened which
307 * makes a good candidate for using OPEN lock */
308 /* If lmmsize & lmm are not 0, we are just setting stripe info
309 * parameters. No need for the open lock */
310 if (!lmm && !lmmsize)
311 itp->it_flags |= MDS_OPEN_LOCK;
313 rc = mdc_intent_lock(sbi->ll_mdc_exp, &data, lmm, lmmsize, itp,
314 0 /*unused */, &req, ll_mdc_blocking_ast, 0);
316 /* reason for keep own exit path - don`t flood log
317 * with messages with -ESTALE errors.
/* drop the server-side open handle when the open did not complete usefully */
319 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
320 it_open_error(DISP_OPEN_OPEN, itp))
322 ll_release_openhandle(file->f_dentry, itp);
326 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
327 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
328 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* attach the granted lock (if any) to the inode for later ldlm matching */
332 if (itp->d.lustre.it_lock_mode)
333 mdc_set_lock_data(&itp->d.lustre.it_lock_handle,
336 rc = ll_prep_inode(sbi->ll_osc_exp, &file->f_dentry->d_inode,
337 req, DLM_REPLY_REC_OFF, NULL);
/* cleanup path: drop the request ref and the intent's lock state.
 * NOTE(review): label name for this exit path is elided in this view. */
339 ptlrpc_req_finished(itp->d.lustre.it_data);
340 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
341 ll_intent_drop_lock(itp);
/* Populate an obd_client_handle from the MDS open reply carried in @it:
 * copy the remote file handle, set the handle magic, record the IO epoch
 * on the inode, and register the open for recovery replay. */
347 static void ll_och_fill(struct ll_inode_info *lli, struct lookup_intent *it,
348 struct obd_client_handle *och)
350 struct ptlrpc_request *req = it->d.lustre.it_data;
351 struct mds_body *body;
355 body = lustre_msg_buf(req->rq_repmsg, DLM_REPLY_REC_OFF, sizeof(*body));
356 LASSERT(body != NULL); /* reply already checked out */
357 /* and swabbed in mdc_enqueue */
358 LASSERT(lustre_rep_swabbed(req, DLM_REPLY_REC_OFF));
360 memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
361 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
362 lli->lli_io_epoch = body->io_epoch;
364 mdc_set_open_replay_data(och, it->d.lustre.it_data);
/* Finish the client-local part of an open: fill @och from the intent reply,
 * attach @fd as the file's private data, initialize readahead state, and
 * remember the open mode.  NOTE(review): callers pass och == NULL for the
 * handle-reuse path; the guard around ll_och_fill() appears elided from
 * this view -- confirm in the full source. */
367 int ll_local_open(struct file *file, struct lookup_intent *it,
368 struct ll_file_data *fd, struct obd_client_handle *och)
372 LASSERT(!LUSTRE_FPRIVATE(file));
377 ll_och_fill(ll_i2info(file->f_dentry->d_inode), it, och);
378 LUSTRE_FPRIVATE(file) = fd;
379 ll_readahead_init(file->f_dentry->d_inode, &fd->fd_ras);
380 fd->fd_omode = it->it_flags;
385 /* Open a file, and (for the very first open) create objects on the OSTs at
386 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
387 * creation or open until ll_lov_setstripe() ioctl is called. We grab
388 * lli_open_sem to ensure no other process will create objects, send the
389 * stripe MD to the MDS, or try to destroy the objects if that fails.
391 * If we already have the stripe MD locally then we don't request it in
392 * mdc_open(), by passing a lmm_size = 0.
394 * It is up to the application to ensure no other processes open this file
395 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
396 * used. We might be able to avoid races of that sort by getting lli_open_sem
397 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
398 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/* VFS ->open() entry point.  NOTE(review): many lines are elided in this
 * view (ENTRY/RETURN, braces, several error branches); inline comments
 * below describe only what the visible code establishes. */
400 int ll_file_open(struct inode *inode, struct file *file)
402 struct ll_inode_info *lli = ll_i2info(inode);
403 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
404 .it_flags = file->f_flags };
405 struct lov_stripe_md *lsm;
406 struct ptlrpc_request *req = NULL;
407 struct obd_client_handle **och_p;
409 struct ll_file_data *fd;
410 int rc = 0, opendir_set = 0;
413 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
414 inode->i_generation, inode, file->f_flags);
416 #ifdef HAVE_VFS_INTENT_PATCHES
419 it = file->private_data; /* XXX: compat macro */
420 file->private_data = NULL; /* prevent ll_local_open assertion */
423 fd = ll_file_data_get();
/* directory open: decide whether this descriptor owns statahead */
427 if (S_ISDIR(inode->i_mode)) {
429 spin_lock(&lli->lli_lock);
430 if (lli->lli_opendir_key == NULL && lli->lli_opendir_pid == 0) {
431 LASSERT(lli->lli_sai == NULL);
432 lli->lli_opendir_key = fd;
433 lli->lli_opendir_pid = cfs_curproc_pid();
435 } else if (unlikely(lli->lli_opendir_pid == cfs_curproc_pid() &&
436 lli->lli_opendir_key != NULL)) {
437 /* Two cases for this:
438 * (1) The same process open such directory many times.
439 * (2) The old process opened the directory, and exited
440 * before its children processes. Then new process
441 * with the same pid opens such directory before the
442 * old process's children processes exit.
443 * reset stat ahead for such cases. */
444 spin_unlock(&lli->lli_lock);
445 CDEBUG(D_INFO, "Conflict statahead for %.*s %lu/%u"
446 " reset it.\n", file->f_dentry->d_name.len,
447 file->f_dentry->d_name.name,
448 inode->i_ino, inode->i_generation);
449 ll_stop_statahead(inode, lli->lli_opendir_key);
452 spin_unlock(&lli->lli_lock);
/* the mount root needs no MDS open handle */
455 if (inode->i_sb->s_root == file->f_dentry) {
456 LUSTRE_FPRIVATE(file) = fd;
460 if (!it || !it->d.lustre.it_disposition) {
461 /* Convert f_flags into access mode. We cannot use file->f_mode,
462 * because everything but O_ACCMODE mask was stripped from it */
463 if ((oit.it_flags + 1) & O_ACCMODE)
465 if (file->f_flags & O_TRUNC)
466 oit.it_flags |= FMODE_WRITE;
468 /* kernel only call f_op->open in dentry_open. filp_open calls
469 * dentry_open after call to open_namei that checks permissions.
470 * Only nfsd_open call dentry_open directly without checking
471 * permissions and because of that this code below is safe. */
472 if (oit.it_flags & FMODE_WRITE)
473 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
475 /* We do not want O_EXCL here, presumably we opened the file
476 * already? XXX - NFS implications? */
477 oit.it_flags &= ~O_EXCL;
483 /* Let's see if we have file open on MDS already. */
484 if (it->it_flags & FMODE_WRITE) {
485 och_p = &lli->lli_mds_write_och;
486 och_usecount = &lli->lli_open_fd_write_count;
487 } else if (it->it_flags & FMODE_EXEC) {
488 och_p = &lli->lli_mds_exec_och;
489 och_usecount = &lli->lli_open_fd_exec_count;
491 och_p = &lli->lli_mds_read_och;
492 och_usecount = &lli->lli_open_fd_read_count;
495 LASSERTF(it->it_flags != 0, "it %p dist %d \n", it,
496 it->d.lustre.it_disposition);
498 down(&lli->lli_och_sem);
499 if (*och_p) { /* Open handle is present */
500 if (it_disposition(it, DISP_OPEN_OPEN)) {
501 /* Well, there's extra open request that we do not need,
502 let's close it somehow. This will decref request. */
503 rc = it_open_error(DISP_OPEN_OPEN, it);
505 up(&lli->lli_och_sem);
506 ll_file_data_put(fd);
507 GOTO(out_openerr, rc);
509 ll_release_openhandle(file->f_dentry, it);
510 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats,
/* reuse the cached handle: och == NULL tells ll_local_open to skip fill */
515 rc = ll_local_open(file, it, fd, NULL);
517 LASSERTF(rc == 0, "rc = %d\n", rc);
519 LASSERT(*och_usecount == 0);
520 if (!it->d.lustre.it_disposition) {
521 /* We cannot just request lock handle now, new ELC code
522 means that one of other OPEN locks for this file
523 could be cancelled, and since blocking ast handler
524 would attempt to grab och_sem as well, that would
525 result in a deadlock */
526 up(&lli->lli_och_sem);
527 it->it_flags |= O_CHECK_STALE;
528 rc = ll_intent_file_open(file, NULL, 0, it);
529 it->it_flags &= ~O_CHECK_STALE;
531 ll_file_data_put(fd);
532 GOTO(out_openerr, rc);
535 mdc_set_lock_data(&it->d.lustre.it_lock_handle,
536 file->f_dentry->d_inode);
540 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
542 ll_file_data_put(fd);
543 GOTO(out_och_free, rc = -ENOMEM);
546 req = it->d.lustre.it_data;
548 /* mdc_intent_lock() didn't get a request ref if there was an
549 * open error, so don't do cleanup on the request here
551 /* XXX (green): Should not we bail out on any error here, not
552 * just open error? */
553 rc = it_open_error(DISP_OPEN_OPEN, it);
555 ll_file_data_put(fd);
556 GOTO(out_och_free, rc);
559 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
560 rc = ll_local_open(file, it, fd, *och_p);
561 LASSERTF(rc == 0, "rc = %d\n", rc);
563 up(&lli->lli_och_sem);
565 /* Must do this outside lli_och_sem lock to prevent deadlock where
566 different kind of OPEN lock for this same inode gets cancelled
567 by ldlm_cancel_lru */
568 if (!S_ISREG(inode->i_mode))
573 if (file->f_flags & O_LOV_DELAY_CREATE ||
574 !(file->f_mode & FMODE_WRITE)) {
575 CDEBUG(D_INODE, "object creation was delayed\n");
579 file->f_flags &= ~O_LOV_DELAY_CREATE;
582 ptlrpc_req_finished(req);
584 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
586 ll_open_complete(inode);
/* error path: free the allocated open handle (labels elided in this view) */
590 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
591 *och_p = NULL; /* OBD_FREE writes some magic there */
594 up(&lli->lli_och_sem);
596 if (opendir_set != 0)
597 ll_stop_statahead(inode, lli->lli_opendir_key);
603 /* Fills the obdo with the attributes for the inode defined by lsm */
/* Query the OSTs for the object attributes of @lsm: builds an obdo keyed by
 * the LOV object id/group, fires an async getattr via a ptlrpc set, waits
 * for completion, then masks o_valid down to the fields the caller may
 * trust (blocks, blksize, mtime, ctime, size). */
604 int ll_lsm_getattr(struct obd_export *exp, struct lov_stripe_md *lsm,
607 struct ptlrpc_request_set *set;
608 struct obd_info oinfo = { { { 0 } } };
612 LASSERT(lsm != NULL);
614 memset(oa, 0, sizeof *oa);
617 oa->o_id = lsm->lsm_object_id;
618 oa->o_gr = lsm->lsm_object_gr;
619 oa->o_mode = S_IFREG;
620 oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLSIZE |
621 OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ | OBD_MD_FLMTIME |
622 OBD_MD_FLCTIME | OBD_MD_FLGROUP;
624 set = ptlrpc_prep_set();
628 rc = obd_getattr_async(exp, &oinfo, set);
630 rc = ptlrpc_set_wait(set);
631 ptlrpc_set_destroy(set);
/* only these fields are meaningful to callers after an OST getattr */
636 oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ | OBD_MD_FLMTIME |
637 OBD_MD_FLCTIME | OBD_MD_FLSIZE);
/* Map an extent DLM lock back to the stripe index it covers within the
 * inode's LOV stripe metadata.  Single-stripe files short-circuit to 0;
 * otherwise the LOV is queried via obd_get_info(KEY_LOCK_TO_STRIPE) and
 * the result is sanity-checked against the lock's resource name. */
641 static int ll_lock_to_stripe_offset(struct inode *inode, struct ldlm_lock *lock)
643 struct ll_inode_info *lli = ll_i2info(inode);
644 struct lov_stripe_md *lsm = lli->lli_smd;
645 struct obd_export *exp = ll_i2obdexp(inode);
648 struct ldlm_lock *lock;
649 } key = { .name = KEY_LOCK_TO_STRIPE, .lock = lock };
650 __u32 stripe, vallen = sizeof(stripe);
651 struct lov_oinfo *loinfo;
655 if (lsm->lsm_stripe_count == 1)
656 GOTO(check, stripe = 0);
658 /* get our offset in the lov */
659 rc = obd_get_info(exp, sizeof(key), &key, &vallen, &stripe, lsm);
661 CERROR("obd_get_info: rc = %d\n", rc);
664 LASSERT(stripe < lsm->lsm_stripe_count);
/* verify the lock's resource really names this stripe's object */
667 loinfo = lsm->lsm_oinfo[stripe];
668 if (!osc_res_name_eq(loinfo->loi_id, loinfo->loi_gr,
669 &lock->l_resource->lr_name)) {
670 LDLM_ERROR(lock, "resource doesn't match object "LPU64"/"LPU64,
671 loinfo->loi_id, loinfo->loi_gr);
672 RETURN(-ELDLM_NO_LOCK_DATA);
678 /* Get extra page reference to ensure it is not going away */
/* DLM pin callback: @data is a struct page; take a page-cache reference so
 * the page survives until the matching removal callback runs. */
679 void ll_pin_extent_cb(void *data)
681 struct page *page = data;
683 page_cache_get(page);
687 /* Flush the page from page cache for an extent as its canceled.
688 * Page to remove is delivered as @data.
690 * No one can dirty the extent until we've finished our work and they cannot
691 * enqueue another lock. The DLM protects us from ll_file_read/write here,
692 * but other kernel actors could have pages locked.
694 * If @discard is set, there is no need to write the page if it is dirty.
696 * Called with the DLM lock held. */
697 int ll_page_removal_cb(void *data, int discard)
700 struct page *page = data;
701 struct address_space *mapping;
705 /* We have page reference already from ll_pin_page */
708 /* Already truncated by somebody */
712 mapping = page->mapping;
/* unmap any user mappings covering this page's file range */
714 ll_teardown_mmaps(mapping,
715 (__u64)page->index << PAGE_CACHE_SHIFT,
716 ((__u64)page->index<<PAGE_CACHE_SHIFT)|
718 LL_CDEBUG_PAGE(D_PAGE, page, "removing page\n");
719 if (!discard && PageWriteback(page))
720 wait_on_page_writeback(page);
/* unless discarding, push dirty data out before the lock goes away */
722 if (!discard && clear_page_dirty_for_io(page)) {
723 rc = ll_call_writepage(page->mapping->host, page);
724 /* either waiting for io to complete or reacquiring
725 * the lock that the failed writepage released */
727 wait_on_page_writeback(page);
729 CERROR("writepage inode %lu(%p) of page %p "
730 "failed: %d\n", mapping->host->i_ino,
731 mapping->host, page, rc);
/* record the write failure on the mapping so a later fsync sees it */
733 set_bit(AS_ENOSPC, &mapping->flags);
735 set_bit(AS_EIO, &mapping->flags);
738 if (page->mapping != NULL) {
739 struct ll_async_page *llap = llap_cast_private(page);
740 // checking again to account for writeback's lock_page()
741 LL_CDEBUG_PAGE(D_PAGE, page, "truncating\n");
743 ll_ra_accounting(llap, page->mapping);
744 ll_truncate_complete_page(page);
748 LASSERT(!PageWriteback(page));
/* drop the reference taken in ll_pin_extent_cb */
750 page_cache_release(page);
/* Blocking/cancel AST for a client extent lock: when the lock is being
 * cancelled, shrink the known-minimum-size (KMS) of the affected stripe to
 * exclude the cancelled extent, then let any pending done-writing proceed.
 * NOTE(review): the lookup of @lsm from lli and the use of @flag/@new are
 * partly elided in this view. */
755 int ll_extent_lock_cancel_cb(struct ldlm_lock *lock, struct ldlm_lock_desc *new,
756 void *data, int flag)
759 struct ll_inode_info *lli;
760 struct lov_stripe_md *lsm;
/* small non-NULL values indicate a corrupted opaque pointer, not an inode */
766 if ((unsigned long)data > 0 && (unsigned long)data < 0x1000) {
767 LDLM_ERROR(lock, "cancelling lock with bad data %p", data);
771 inode = ll_inode_from_lock(lock);
774 lli = ll_i2info(inode);
777 if (lli->lli_smd == NULL)
781 stripe = ll_lock_to_stripe_offset(inode, lock);
/* recompute this stripe's KMS with the cancelled lock excluded */
785 lov_stripe_lock(lsm);
786 lock_res_and_lock(lock);
787 kms = ldlm_extent_shift_kms(lock,
788 lsm->lsm_oinfo[stripe]->loi_kms);
790 if (lsm->lsm_oinfo[stripe]->loi_kms != kms)
791 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
792 lsm->lsm_oinfo[stripe]->loi_kms, kms);
793 lsm->lsm_oinfo[stripe]->loi_kms = kms;
794 unlock_res_and_lock(lock);
795 lov_stripe_unlock(lsm);
796 ll_try_done_writing(inode);
/* Completion AST for client-side async extent lock enqueue: on grant,
 * fold the server-supplied LVB size into the stripe's RSS/KMS, wake any
 * waiters on the lock, and drop the enqueue reference.
 * NOTE(review): this code indexes lsm_oinfo[stripe] as a struct (dot
 * access) while other functions in this file dereference it as a pointer
 * (->); one of the two presumably predates an lsm layout change --
 * verify against the full tree. */
805 int ll_async_completion_ast(struct ldlm_lock *lock, int flags, void *data)
807 /* XXX ALLOCATE - 160 bytes */
808 struct inode *inode = ll_inode_from_lock(lock);
809 struct ll_inode_info *lli = ll_i2info(inode);
810 struct lustre_handle lockh = { 0 };
815 if (flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED |
816 LDLM_FL_BLOCK_CONV)) {
817 LBUG(); /* not expecting any blocked async locks yet */
818 LDLM_DEBUG(lock, "client-side async enqueue returned a blocked "
820 ldlm_lock_dump(D_OTHER, lock, 0);
821 ldlm_reprocess_all(lock->l_resource);
825 LDLM_DEBUG(lock, "client-side async enqueue: granted/glimpsed");
827 stripe = ll_lock_to_stripe_offset(inode, lock);
/* merge the size the server returned in the lock value block */
831 if (lock->l_lvb_len) {
832 struct lov_stripe_md *lsm = lli->lli_smd;
834 lvb = lock->l_lvb_data;
835 lsm->lsm_oinfo[stripe].loi_rss = lvb->lvb_size;
837 lock_res_and_lock(lock);
838 ll_inode_size_lock(inode, 1);
839 kms = MAX(lsm->lsm_oinfo[stripe].loi_kms, lvb->lvb_size);
840 kms = ldlm_extent_shift_kms(NULL, kms);
841 if (lsm->lsm_oinfo[stripe].loi_kms != kms)
842 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
843 lsm->lsm_oinfo[stripe].loi_kms, kms);
844 lsm->lsm_oinfo[stripe].loi_kms = kms;
845 ll_inode_size_unlock(inode, 1);
846 unlock_res_and_lock(lock);
851 wake_up(&lock->l_waitq);
/* drop the PR reference taken at enqueue time */
853 ldlm_lock2handle(lock, &lockh);
854 ldlm_lock_decref(&lockh, LCK_PR);
/* Glimpse AST: another client wants our view of the file size.  Pack a
 * lock value block (size from the stripe's KMS plus the inode timestamps)
 * into the reply of @reqp.  -ELDLM_NO_LOCK_DATA races are answered with an
 * empty reply rather than an error to avoid console noise. */
859 static int ll_glimpse_callback(struct ldlm_lock *lock, void *reqp)
861 struct ptlrpc_request *req = reqp;
862 struct inode *inode = ll_inode_from_lock(lock);
863 struct ll_inode_info *lli;
864 struct lov_stripe_md *lsm;
867 int size[2] = { sizeof(struct ptlrpc_body), sizeof(*lvb) };
871 GOTO(out, rc = -ELDLM_NO_LOCK_DATA);
872 lli = ll_i2info(inode);
874 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
877 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
879 /* First, find out which stripe index this lock corresponds to. */
880 stripe = ll_lock_to_stripe_offset(inode, lock);
882 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
884 rc = lustre_pack_reply(req, 2, size, NULL);
/* report this stripe's known minimum size and the cached timestamps */
888 lvb = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, sizeof(*lvb));
889 lvb->lvb_size = lli->lli_smd->lsm_oinfo[stripe]->loi_kms;
890 lvb->lvb_mtime = LTIME_S(inode->i_mtime);
891 lvb->lvb_atime = LTIME_S(inode->i_atime);
892 lvb->lvb_ctime = LTIME_S(inode->i_ctime);
894 LDLM_DEBUG(lock, "i_size: %llu -> stripe number %u -> kms "LPU64
895 " atime "LPU64", mtime "LPU64", ctime "LPU64,
896 i_size_read(inode), stripe, lvb->lvb_size, lvb->lvb_atime,
897 lvb->lvb_mtime, lvb->lvb_ctime);
902 /* These errors are normal races, so we don't want to fill the console
903 * with messages by calling ptlrpc_error() */
904 if (rc == -ELDLM_NO_LOCK_DATA)
905 lustre_pack_reply(req, 1, NULL, NULL);
/* Glimpse a file described only by @lsm (no inode needed) and fill @st
 * with the merged size/blocks/timestamps.  Uses an intent-only (HAS_INTENT)
 * PR extent enqueue over [0, EOF], so no conflicting locks are revoked. */
911 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
914 struct lustre_handle lockh = { 0 };
915 struct ldlm_enqueue_info einfo = { 0 };
916 struct obd_info oinfo = { { { 0 } } };
922 einfo.ei_type = LDLM_EXTENT;
923 einfo.ei_mode = LCK_PR;
924 einfo.ei_cb_bl = osc_extent_blocking_cb;
925 einfo.ei_cb_cp = ldlm_completion_ast;
926 einfo.ei_cb_gl = ll_glimpse_callback;
927 einfo.ei_cbdata = NULL;
929 oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
930 oinfo.oi_lockh = &lockh;
932 oinfo.oi_flags = LDLM_FL_HAS_INTENT;
934 rc = obd_enqueue_rqset(sbi->ll_osc_exp, &oinfo, &einfo);
938 CERROR("obd_enqueue returned rc %d, "
939 "returning -EIO\n", rc);
940 RETURN(rc > 0 ? -EIO : rc);
/* merge the per-stripe LVBs into a single view under the stripe lock */
943 lov_stripe_lock(lsm);
944 memset(&lvb, 0, sizeof(lvb));
945 obd_merge_lvb(sbi->ll_osc_exp, lsm, &lvb, 0);
946 st->st_size = lvb.lvb_size;
947 st->st_blocks = lvb.lvb_blocks;
948 st->st_mtime = lvb.lvb_mtime;
949 st->st_atime = lvb.lvb_atime;
950 st->st_ctime = lvb.lvb_ctime;
951 lov_stripe_unlock(lsm);
956 /* NB: obd_merge_lvb will prefer locally cached writes if they extend the
957 * file (because it prefers KMS over RSS when larger) */
/* Refresh the inode's size/blocks/timestamps by glimpsing all stripes:
 * an intent-only PR extent enqueue makes every lock holder answer with its
 * view via ll_glimpse_callback(), then the per-stripe LVBs are merged with
 * the MDS-provided timestamps under the inode size lock. */
958 int ll_glimpse_size(struct inode *inode, int ast_flags)
960 struct ll_inode_info *lli = ll_i2info(inode);
961 struct ll_sb_info *sbi = ll_i2sbi(inode);
962 struct lustre_handle lockh = { 0 };
963 struct ldlm_enqueue_info einfo = { 0 };
964 struct obd_info oinfo = { { { 0 } } };
969 CDEBUG(D_DLMTRACE, "Glimpsing inode %lu\n", inode->i_ino);
972 CDEBUG(D_DLMTRACE, "No objects for inode %lu\n", inode->i_ino);
976 /* NOTE: this looks like DLM lock request, but it may not be one. Due
977 * to LDLM_FL_HAS_INTENT flag, this is glimpse request, that
978 * won't revoke any conflicting DLM locks held. Instead,
979 * ll_glimpse_callback() will be called on each client
980 * holding a DLM lock against this file, and resulting size
981 * will be returned for each stripe. DLM lock on [0, EOF] is
982 * acquired only if there were no conflicting locks. */
983 einfo.ei_type = LDLM_EXTENT;
984 einfo.ei_mode = LCK_PR;
985 einfo.ei_cb_bl = osc_extent_blocking_cb;
986 einfo.ei_cb_cp = ldlm_completion_ast;
987 einfo.ei_cb_gl = ll_glimpse_callback;
988 einfo.ei_cbdata = inode;
990 oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
991 oinfo.oi_lockh = &lockh;
992 oinfo.oi_md = lli->lli_smd;
993 oinfo.oi_flags = ast_flags | LDLM_FL_HAS_INTENT;
995 rc = obd_enqueue_rqset(sbi->ll_osc_exp, &oinfo, &einfo);
999 CERROR("obd_enqueue returned rc %d, returning -EIO\n", rc);
1000 RETURN(rc > 0 ? -EIO : rc);
1003 ll_inode_size_lock(inode, 1);
1004 inode_init_lvb(inode, &lvb);
1005 /* merge timestamps the most resently obtained from mds with
1006 timestamps obtained from osts */
1007 lvb.lvb_atime = lli->lli_lvb.lvb_atime;
1008 lvb.lvb_mtime = lli->lli_lvb.lvb_mtime;
1009 lvb.lvb_ctime = lli->lli_lvb.lvb_ctime;
1010 rc = obd_merge_lvb(sbi->ll_osc_exp, lli->lli_smd, &lvb, 0);
1011 i_size_write(inode, lvb.lvb_size);
1012 inode->i_blocks = lvb.lvb_blocks;
1013 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1014 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1015 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1016 ll_inode_size_unlock(inode, 1);
1018 CDEBUG(D_DLMTRACE, "glimpse: size: %llu, blocks: %llu\n",
1019 i_size_read(inode), (long long)inode->i_blocks);
/* Take an extent DLM lock on the file range in @policy, then refresh the
 * inode from the merged stripe LVBs.  Lock-free mounts/descriptors
 * (LL_FILE_IGNORE_LOCK / LL_SBI_NOLCK) skip the enqueue entirely; i_size
 * is rewritten only for a full-file [0, EOF] lock (see comment below). */
1024 int ll_extent_lock(struct ll_file_data *fd, struct inode *inode,
1025 struct lov_stripe_md *lsm, int mode,
1026 ldlm_policy_data_t *policy, struct lustre_handle *lockh,
1029 struct ll_sb_info *sbi = ll_i2sbi(inode);
1031 struct ldlm_enqueue_info einfo = { 0 };
1032 struct obd_info oinfo = { { { 0 } } };
1036 LASSERT(!lustre_handle_is_used(lockh));
1037 LASSERT(lsm != NULL);
1039 /* don't drop the mmapped file to LRU */
1040 if (mapping_mapped(inode->i_mapping))
1041 ast_flags |= LDLM_FL_NO_LRU;
1043 /* XXX phil: can we do this? won't it screw the file size up? */
1044 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1045 (sbi->ll_flags & LL_SBI_NOLCK))
1048 CDEBUG(D_DLMTRACE, "Locking inode %lu, start "LPU64" end "LPU64"\n",
1049 inode->i_ino, policy->l_extent.start, policy->l_extent.end);
1051 einfo.ei_type = LDLM_EXTENT;
1052 einfo.ei_mode = mode;
1053 einfo.ei_cb_bl = osc_extent_blocking_cb;
1054 einfo.ei_cb_cp = ldlm_completion_ast;
1055 einfo.ei_cb_gl = ll_glimpse_callback;
1056 einfo.ei_cbdata = inode;
1058 oinfo.oi_policy = *policy;
1059 oinfo.oi_lockh = lockh;
1061 oinfo.oi_flags = ast_flags;
1063 rc = obd_enqueue(sbi->ll_osc_exp, &oinfo, &einfo, NULL);
/* the server may have shrunk the granted extent; report it back */
1064 *policy = oinfo.oi_policy;
1068 ll_inode_size_lock(inode, 1);
1069 inode_init_lvb(inode, &lvb);
1070 obd_merge_lvb(sbi->ll_osc_exp, lsm, &lvb, 1);
1072 if (policy->l_extent.start == 0 &&
1073 policy->l_extent.end == OBD_OBJECT_EOF) {
1074 /* vmtruncate()->ll_truncate() first sets the i_size and then
1075 * the kms under both a DLM lock and the
1076 * ll_inode_size_lock(). If we don't get the
1077 * ll_inode_size_lock() here we can match the DLM lock and
1078 * reset i_size from the kms before the truncating path has
1079 * updated the kms. generic_file_write can then trust the
1080 * stale i_size when doing appending writes and effectively
1081 * cancel the result of the truncate. Getting the
1082 * ll_inode_size_lock() after the enqueue maintains the DLM
1083 * -> ll_inode_size_lock() acquiring order. */
1084 i_size_write(inode, lvb.lvb_size);
1085 CDEBUG(D_INODE, "inode=%lu, updating i_size %llu\n",
1086 inode->i_ino, i_size_read(inode));
/* timestamps are safe to refresh regardless of the locked range */
1090 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1091 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1092 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1094 ll_inode_size_unlock(inode, 1);
/* Release an extent DLM lock previously taken by ll_extent_lock().
 * Mirrors its no-lock short-circuit for IGNORE_LOCK / NOLCK mounts. */
1099 int ll_extent_unlock(struct ll_file_data *fd, struct inode *inode,
1100 struct lov_stripe_md *lsm, int mode,
1101 struct lustre_handle *lockh)
1103 struct ll_sb_info *sbi = ll_i2sbi(inode);
1107 /* XXX phil: can we do this? won't it screw the file size up? */
1108 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1109 (sbi->ll_flags & LL_SBI_NOLCK))
1112 rc = obd_cancel(sbi->ll_osc_exp, lsm, mode, lockh);
/* Mark the file as contended and timestamp the event; ll_is_file_contended()
 * uses the timestamp to age the flag out. */
1117 static void ll_set_file_contended(struct inode *inode)
1119 struct ll_inode_info *lli = ll_i2info(inode);
1121 lli->lli_contention_time = cfs_time_current();
1122 set_bit(LLI_F_CONTENDED, &lli->lli_flags);
/* Clear the contended-file flag set by ll_set_file_contended(). */
1125 void ll_clear_file_contended(struct inode *inode)
1127 struct ll_inode_info *lli = ll_i2info(inode);
1129 clear_bit(LLI_F_CONTENDED, &lli->lli_flags);
/* Decide whether IO on @file should take the lock-free (server-lock) path:
 * requires OBD_CONNECT_SRVLOCK support on the OSC connection, and either
 * the LL_FILE_IGNORE_LOCK fd flag or a still-fresh LLI_F_CONTENDED mark
 * (the mark expires after sbi->ll_contention_time seconds). */
1132 static int ll_is_file_contended(struct file *file)
1134 struct inode *inode = file->f_dentry->d_inode;
1135 struct ll_inode_info *lli = ll_i2info(inode);
1136 struct ll_sb_info *sbi = ll_i2sbi(inode);
1137 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1140 if (!(sbi->ll_lco.lco_flags & OBD_CONNECT_SRVLOCK)) {
1141 CDEBUG(D_INFO, "the server does not support SRVLOCK feature,"
1142 " osc connect flags = 0x"LPX64"\n",
1143 sbi->ll_lco.lco_flags);
1146 if (fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK))
1148 if (test_bit(LLI_F_CONTENDED, &lli->lli_flags)) {
1149 cfs_time_t cur_time = cfs_time_current();
1150 cfs_time_t retry_time;
/* age out the contention mark after the configured grace period */
1152 retry_time = cfs_time_add(
1153 lli->lli_contention_time,
1154 cfs_time_seconds(sbi->ll_contention_time));
1155 if (cfs_time_after(cur_time, retry_time)) {
1156 ll_clear_file_contended(inode);
/* Try to take the client-side lock tree over [start, end] for an iovec IO.
 * Appending writes always lock; otherwise contended files skip locking
 * (server-side locks are used instead).  Returns 1 when the tree lock was
 * taken, 0 when it was skipped, negative on error (via tree_locked).
 * NOTE(review): node error handling and some early returns are elided in
 * this view. */
1164 static int ll_file_get_tree_lock_iov(struct ll_lock_tree *tree,
1165 struct file *file, const struct iovec *iov,
1166 unsigned long nr_segs,
1167 obd_off start, obd_off end, int rw)
1170 int tree_locked = 0;
1172 struct inode * inode = file->f_dentry->d_inode;
1175 append = (rw == OBD_BRW_WRITE) && (file->f_flags & O_APPEND);
1177 if (append || !ll_is_file_contended(file)) {
1178 struct ll_lock_tree_node *node;
/* appends must never be denied on contention; others may fall back */
1181 ast_flags = append ? 0 : LDLM_FL_DENY_ON_CONTENTION;
1182 if (file->f_flags & O_NONBLOCK)
1183 ast_flags |= LDLM_FL_BLOCK_NOWAIT;
1184 node = ll_node_from_inode(inode, start, end,
1185 (rw == OBD_BRW_WRITE) ? LCK_PW : LCK_PR);
1190 tree->lt_fd = LUSTRE_FPRIVATE(file);
1191 rc = ll_tree_lock_iov(tree, node, iov, nr_segs, ast_flags);
/* -EUSERS from the DLM means the extent is contended: remember that */
1194 else if (rc == -EUSERS)
1195 ll_set_file_contended(inode);
1199 RETURN(tree_locked);
1204 /* XXX: exact copy from kernel code (__generic_file_aio_write_nolock from rhel4)
/* Sum the iovec segment lengths, guarding against signed wraparound and
 * inaccessible user buffers; on a bad segment, *nr_segs is clamped so only
 * the good prefix is used.  NOTE(review): the final return and the segment
 * truncation lines are partially elided in this view. */
1206 static size_t ll_file_get_iov_count(const struct iovec *iov,
1207 unsigned long *nr_segs)
1212 for (seg = 0; seg < *nr_segs; seg++) {
1213 const struct iovec *iv = &iov[seg];
1216 * If any segment has a negative length, or the cumulative
1217 * length ever wraps negative then return -EINVAL.
1219 count += iv->iov_len;
1220 if (unlikely((ssize_t)(count|iv->iov_len) < 0))
1222 if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len))
1227 count -= iv->iov_len; /* This segment is no good */
/* Build, in @iov_copy, the sub-vector of *iov_out that covers the next
 * @size bytes of i/o starting @*offset bytes into the current first segment.
 * Used by the chunked read/write paths (ll_max_rw_chunk) to feed each chunk
 * to the generic i/o routines.  *nrsegs_copy is set to the number of
 * segments used; *offset / *iov_out / *nr_segs track the consumed prefix
 * (the update logic for those lives in the elided lines). */
1233 static int iov_copy_update(unsigned long *nr_segs, const struct iovec **iov_out,
1234 unsigned long *nrsegs_copy,
1235 struct iovec *iov_copy, size_t *offset,
1239 const struct iovec *iov = *iov_out;
1240 for (i = 0; i < *nr_segs;
1242 const struct iovec *iv = &iov[i];
1243 struct iovec *ivc = &iov_copy[i];
/* First segment starts mid-way: skip the already-consumed *offset bytes. */
1246 ivc->iov_len -= *offset;
1247 ivc->iov_base += *offset;
/* Last segment of this chunk: clamp its length to the bytes remaining. */
1249 if (ivc->iov_len >= size) {
1250 ivc->iov_len = size;
1257 size -= ivc->iov_len;
1261 *nrsegs_copy = i + 1;
/* Try to re-acquire a cached "short" (fast) extent lock for @page over
 * [start, end].  Relies on the page's ll_async_page cookie; delegates to
 * obd_reget_short_lock().  Non-zero return means the fast lock was obtained
 * (exact return contract is in the elided lines — TODO confirm). */
1266 static int ll_reget_short_lock(struct page *page, int rw,
1267 obd_off start, obd_off end,
1270 struct ll_async_page *llap;
1271 struct obd_export *exp;
1272 struct inode *inode = page->mapping->host;
1276 exp = ll_i2obdexp(inode);
/* Page must already carry llite's async-page private data. */
1280 llap = llap_cast_private(page);
1284 RETURN(obd_reget_short_lock(exp, ll_i2info(inode)->lli_smd,
1285 &llap->llap_cookie, rw, start, end,
/* Release a fast lock previously obtained via ll_reget_short_lock(); the
 * failure is only logged since the caller has no way to recover. */
1289 static void ll_release_short_lock(struct inode *inode, obd_off end,
1290 void *cookie, int rw)
1292 struct obd_export *exp;
1295 exp = ll_i2obdexp(inode);
1299 rc = obd_release_short_lock(exp, ll_i2info(inode)->lli_smd, end,
1302 CERROR("unlock failed (%d)\n", rc);
/* Attempt lock-free-style fast locking for an i/o at [ppos, end].
 *
 * Refused when any destination segment lies in a mapped (mmap'ed) region,
 * since page faults there could deadlock against our own locks.  Otherwise
 * the page at ppos is looked up/locked and a cached short lock re-acquired
 * via ll_reget_short_lock().  Returns non-zero on success with *cookie set
 * for the matching ll_file_put_fast_lock(). */
1305 static inline int ll_file_get_fast_lock(struct file *file,
1306 obd_off ppos, obd_off end,
1307 const struct iovec *iov,
1308 unsigned long nr_segs,
1309 void **cookie, int rw)
1316 /* we would like this read request to be lockfree */
1317 for (seg = 0; seg < nr_segs; seg++) {
1318 const struct iovec *iv = &iov[seg];
1319 if (ll_region_mapped((unsigned long)iv->iov_base, iv->iov_len))
1323 page = find_lock_page(file->f_dentry->d_inode->i_mapping,
1324 ppos >> CFS_PAGE_SHIFT);
1326 if (ll_reget_short_lock(page, rw, ppos, end, cookie))
/* Drop the reference taken by find_lock_page(). */
1330 page_cache_release(page);
/* Counterpart of ll_file_get_fast_lock(): release the short lock named by
 * @cookie. */
1337 static inline void ll_file_put_fast_lock(struct inode *inode, obd_off end,
1338 void *cookie, int rw)
1340 ll_release_short_lock(inode, end, cookie, rw);
/* Which locking scheme an i/o ended up using; returned by ll_file_get_lock()
 * and consumed by ll_file_put_lock(). */
1343 enum ll_lock_style {
1344 LL_LOCK_STYLE_NOLOCK = 0,   /* lockless (server-side lock) i/o */
1345 LL_LOCK_STYLE_FASTLOCK = 1, /* cached short lock (ll_file_get_fast_lock) */
1346 LL_LOCK_STYLE_TREELOCK = 2  /* full DLM extent tree lock */
/* Acquire the best available lock for an i/o over [ppos, end]:
 * fast lock first, then DLM tree lock, else fall back to lockless i/o.
 * Returns one of enum ll_lock_style on success or a negative errno. */
1349 static inline int ll_file_get_lock(struct file *file, obd_off ppos,
1350 obd_off end, const struct iovec *iov,
1351 unsigned long nr_segs, void **cookie,
1352 struct ll_lock_tree *tree, int rw)
1358 if (ll_file_get_fast_lock(file, ppos, end, iov, nr_segs, cookie, rw))
1359 RETURN(LL_LOCK_STYLE_FASTLOCK);
1361 rc = ll_file_get_tree_lock_iov(tree, file, iov, nr_segs,
1363 /* rc: 1 for tree lock, 0 for no lock, <0 for error */
1366 RETURN(LL_LOCK_STYLE_TREELOCK);
1368 RETURN(LL_LOCK_STYLE_NOLOCK);
1371 /* an error happened if we reached this point, rc = -errno here */
/* Release whatever lock ll_file_get_lock() granted, dispatching on the
 * recorded lock style.  (The break between the tree-lock and fast-lock
 * cases is in the elided text — TODO confirm there is no fallthrough.) */
1375 static inline void ll_file_put_lock(struct inode *inode, obd_off end,
1376 enum ll_lock_style lock_style,
1377 void *cookie, struct ll_lock_tree *tree,
1381 switch (lock_style) {
1382 case LL_LOCK_STYLE_TREELOCK:
1383 ll_tree_unlock(tree);
1385 case LL_LOCK_STYLE_FASTLOCK:
1386 ll_file_put_fast_lock(inode, end, cookie, rw);
1389 CERROR("invalid locking style (%d)\n", lock_style);
/* Vectored read entry point for the VFS.  Compiled as ll_file_readv() on
 * kernels with the readv file operation, or as ll_file_aio_read() on AIO
 * kernels (same body via #ifdef).
 *
 * Overview of the path:
 *   - validate/size the iovec (ll_file_get_iov_count);
 *   - files with no OST objects are served as zero-filled data up to i_size
 *     (mknod+truncate / NFS pattern, bug 6243);
 *   - the request is optionally split into per-stripe chunks bounded by
 *     sbi->ll_max_rw_chunk, each chunk locked independently;
 *   - per chunk: take a lock (fast/tree/lockless), consult kms and possibly
 *     glimpse to classify the region against the real file size, then issue
 *     generic_file_{readv,aio_read}() or lockless i/o. */
1393 #ifdef HAVE_FILE_READV
1394 static ssize_t ll_file_readv(struct file *file, const struct iovec *iov,
1395 unsigned long nr_segs, loff_t *ppos)
1398 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1399 unsigned long nr_segs, loff_t pos)
1401 struct file *file = iocb->ki_filp;
1402 loff_t *ppos = &iocb->ki_pos;
1404 struct inode *inode = file->f_dentry->d_inode;
1405 struct ll_inode_info *lli = ll_i2info(inode);
1406 struct lov_stripe_md *lsm = lli->lli_smd;
1407 struct ll_sb_info *sbi = ll_i2sbi(inode);
1408 struct ll_lock_tree tree;
1410 struct ll_ra_read bead;
1413 ssize_t retval, chunk, sum = 0;
1415 struct iovec *iov_copy = NULL;
1416 unsigned long nrsegs_copy, nrsegs_orig = 0;
1417 size_t count, iov_offset = 0;
1422 count = ll_file_get_iov_count(iov, &nr_segs);
1423 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1424 inode->i_ino, inode->i_generation, inode, count, *ppos);
1425 /* "If nbyte is 0, read() will return 0 and have no other results."
1426 * -- Single Unix Spec */
1430 ll_stats_ops_tally(sbi, LPROC_LL_READ_BYTES, count);
1433 /* Read on file with no objects should return zero-filled
1434 * buffers up to file size (we can get non-zero sizes with
1435 * mknod + truncate, then opening file for read. This is a
1436 * common pattern in NFS case, it seems). Bug 6243 */
1438 /* Since there are no objects on OSTs, we have nothing to get
1439 * lock on and so we are forced to access inode->i_size
1442 /* Read beyond end of file */
1443 if (*ppos >= i_size_read(inode))
1446 if (count > i_size_read(inode) - *ppos)
1447 count = i_size_read(inode) - *ppos;
1448 /* Make sure to correctly adjust the file pos pointer for
/* Zero-fill the user's segments with clear_user(); a short clear (EFAULT
 * mid-segment) or exhausted count terminates the walk. */
1450 for (nrsegs_copy = 0; nrsegs_copy < nr_segs; nrsegs_copy++) {
1451 const struct iovec *iv = &iov[nrsegs_copy];
1453 if (count < iv->iov_len)
1456 chunk = iv->iov_len;
1457 notzeroed = clear_user(iv->iov_base, chunk);
1458 sum += (chunk - notzeroed);
1459 count -= (chunk - notzeroed);
1460 if (notzeroed || !count)
/* Chunked path: bound each locked i/o to the current stripe and to
 * ll_max_rw_chunk so wide-striped files don't hold huge extents. */
1470 if (sbi->ll_max_rw_chunk != 0) {
1471 /* first, let's know the end of the current stripe */
1473 obd_extent_calc(sbi->ll_osc_exp, lsm, OBD_CALC_STRIPE_END,&end);
1475 /* correct, the end is beyond the request */
1476 if (end > *ppos + count - 1)
1477 end = *ppos + count - 1;
1479 /* and chunk shouldn't be too large even if striping is wide */
1480 if (end - *ppos > sbi->ll_max_rw_chunk)
1481 end = *ppos + sbi->ll_max_rw_chunk - 1;
1483 chunk = end - *ppos + 1;
/* Whole remaining request fits in one chunk: reuse the caller's iovec
 * directly (freeing any iov_copy allocated by an earlier iteration). */
1484 if ((count == chunk) && (iov_offset == 0)) {
1486 OBD_FREE(iov_copy, sizeof(*iov) * nrsegs_orig);
1488 iov_copy = (struct iovec *)iov;
1489 nrsegs_copy = nr_segs;
1492 nrsegs_orig = nr_segs;
1493 OBD_ALLOC(iov_copy, sizeof(*iov) * nr_segs);
1495 GOTO(out, retval = -ENOMEM);
1498 iov_copy_update(&nr_segs, &iov, &nrsegs_copy, iov_copy,
1499 &iov_offset, chunk);
1502 end = *ppos + count - 1;
1503 iov_copy = (struct iovec *)iov;
1504 nrsegs_copy = nr_segs;
/* Held across the locked read to serialize against truncate (bug 18233). */
1507 down_read(&lli->lli_truncate_rwsem); /* Bug 18233 */
1509 lock_style = ll_file_get_lock(file, (obd_off)(*ppos), end,
1510 iov_copy, nrsegs_copy, &cookie, &tree,
1512 if (lock_style < 0 || lock_style == LL_LOCK_STYLE_NOLOCK)
1513 up_read(&lli->lli_truncate_rwsem);
1515 GOTO(out, retval = lock_style);
1517 ll_inode_size_lock(inode, 1);
1519 * Consistency guarantees: following possibilities exist for the
1520 * relation between region being read and real file size at this
1523 * (A): the region is completely inside of the file;
1525 * (B-x): x bytes of region are inside of the file, the rest is
1528 * (C): the region is completely outside of the file.
1530 * This classification is stable under DLM lock acquired by
1531 * ll_tree_lock() above, because to change class, other client has to
1532 * take DLM lock conflicting with our lock. Also, any updates to
1533 * ->i_size by other threads on this client are serialized by
1534 * ll_inode_size_lock(). This guarantees that short reads are handled
1535 * correctly in the face of concurrent writes and truncates.
1537 inode_init_lvb(inode, &lvb);
1538 obd_merge_lvb(ll_i2sbi(inode)->ll_osc_exp, lsm, &lvb, 1);
1540 if (*ppos + count - 1 > kms) {
1541 /* A glimpse is necessary to determine whether we return a
1542 * short read (B) or some zeroes at the end of the buffer (C) */
1543 ll_inode_size_unlock(inode, 1);
1544 retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
1546 if (lock_style != LL_LOCK_STYLE_NOLOCK) {
1547 ll_file_put_lock(inode, end, lock_style,
1548 cookie, &tree, OBD_BRW_READ);
1549 up_read(&lli->lli_truncate_rwsem);
1554 /* region is within kms and, hence, within real file size (A).
1555 * We need to increase i_size to cover the read region so that
1556 * generic_file_read() will do its job, but that doesn't mean
1557 * the kms size is _correct_, it is only the _minimum_ size.
1558 * If someone does a stat they will get the correct size which
1559 * will always be >= the kms value here. b=11081 */
1560 if (i_size_read(inode) < kms)
1561 i_size_write(inode, kms);
1562 ll_inode_size_unlock(inode, 1);
1565 chunk = end - *ppos + 1;
1566 CDEBUG(D_INODE,"Read ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
1567 inode->i_ino, chunk, *ppos, i_size_read(inode));
1569 /* turn off the kernel's read-ahead */
1570 if (lock_style != LL_LOCK_STYLE_NOLOCK) {
1573 * 1. update inode's atime as long as concurrent stat
1574 * (via ll_glimpse_size) might bring out-of-date ones
1576 * 2. update lsm so that next stat (via
1577 * ll_glimpse_size) could get correct values in lsm */
1578 struct ost_lvb xtimes;
1580 lov_stripe_lock(lsm);
1581 LTIME_S(inode->i_atime) = LTIME_S(CURRENT_TIME);
1582 xtimes.lvb_atime = LTIME_S(inode->i_atime);
1583 obd_update_lvb(sbi->ll_osc_exp, lsm, &xtimes,
1585 lov_stripe_unlock(lsm);
/* Lustre drives its own readahead; disable the kernel's. */
1587 file->f_ra.ra_pages = 0;
1588 /* initialize read-ahead window once per syscall */
1591 bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
1592 bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
1593 ll_ra_read_in(file, &bead);
1597 file_accessed(file);
1598 #ifdef HAVE_FILE_READV
1599 retval = generic_file_readv(file, iov_copy, nrsegs_copy, ppos);
1601 retval = generic_file_aio_read(iocb, iov_copy, nrsegs_copy,
1604 ll_file_put_lock(inode, end, lock_style, cookie,
1605 &tree, OBD_BRW_READ);
1606 up_read(&lli->lli_truncate_rwsem);
1610 * current time will get into request as atime
1611 * (lustre/osc/osc_request.c:osc_build_request())
1613 retval = ll_file_lockless_io(file, iov_copy, nrsegs_copy, ppos,
1616 ll_rw_stats_tally(sbi, current->pid, file, count, 0);
/* Full chunk read with bytes remaining: loop to the next chunk. */
1620 if (retval == chunk && count > 0)
1626 ll_ra_read_ex(file, &bead);
/* Return total bytes read if any; otherwise the last error/retval. */
1627 retval = (sum > 0) ? sum : retval;
1629 if (iov_copy && iov_copy != iov)
1630 OBD_FREE(iov_copy, sizeof(*iov) * nrsegs_orig);
/* Scalar read(2) entry point: wrap the user buffer in a single-segment
 * iovec and forward to the vectored path (readv or a synchronous kiocb
 * around ll_file_aio_read, depending on the kernel). */
1635 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1638 struct iovec local_iov = { .iov_base = (void __user *)buf,
1640 #ifdef HAVE_FILE_READV
1641 return ll_file_readv(file, &local_iov, 1, ppos);
1646 init_sync_kiocb(&kiocb, file);
1647 kiocb.ki_pos = *ppos;
1648 kiocb.ki_left = count;
1650 ret = ll_file_aio_read(&kiocb, &local_iov, 1, kiocb.ki_pos);
/* Propagate the advanced position back to the caller's *ppos. */
1651 *ppos = kiocb.ki_pos;
1657 * Write to a file (through the page cache).
/* Vectored write entry point (writev or AIO flavor by #ifdef), mirroring
 * the chunked/locked structure of the read path:
 *   - O_APPEND locks [start, OBD_OBJECT_EOF] since the final offset is only
 *     known after the lock adjusts i_size;
 *   - otherwise chunks are bounded by the current stripe and
 *     sbi->ll_max_rw_chunk, each tree-locked separately;
 *   - per chunk: enforce maxbytes (SIGXFSZ/-EFBIG), update m/ctime into the
 *     lsm LVB, then generic_file_{writev,aio_write}() or lockless i/o.
 * lli_write_sem serializes writers on this client. */
1659 #ifdef HAVE_FILE_WRITEV
1660 static ssize_t ll_file_writev(struct file *file, const struct iovec *iov,
1661 unsigned long nr_segs, loff_t *ppos)
1663 #else /* AIO stuff */
1664 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1665 unsigned long nr_segs, loff_t pos)
1667 struct file *file = iocb->ki_filp;
1668 loff_t *ppos = &iocb->ki_pos;
1670 struct inode *inode = file->f_dentry->d_inode;
1671 struct ll_sb_info *sbi = ll_i2sbi(inode);
1672 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1673 struct ll_lock_tree tree;
1674 loff_t maxbytes = ll_file_maxbytes(inode);
1675 loff_t lock_start, lock_end, end;
1676 ssize_t retval, chunk, sum = 0;
1678 struct iovec *iov_copy = NULL;
1679 unsigned long nrsegs_copy, nrsegs_orig = 0;
1680 size_t count, iov_offset = 0;
1683 count = ll_file_get_iov_count(iov, &nr_segs);
1685 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1686 inode->i_ino, inode->i_generation, inode, count, *ppos);
1688 SIGNAL_MASK_ASSERT(); /* XXX BUG 1511 */
1690 /* POSIX, but surprised the VFS doesn't check this already */
1694 /* If file was opened for LL_IOC_LOV_SETSTRIPE but the ioctl wasn't
1695 * called on the file, don't fail the below assertion (bug 2388). */
1696 if (file->f_flags & O_LOV_DELAY_CREATE &&
1697 ll_i2info(inode)->lli_smd == NULL)
1700 LASSERT(ll_i2info(inode)->lli_smd != NULL);
1702 down(&ll_i2info(inode)->lli_write_sem);
1705 chunk = 0; /* just to fix gcc's warning */
1706 end = *ppos + count - 1;
1708 if (file->f_flags & O_APPEND) {
/* Appends must lock to EOF; no chunking in that case. */
1710 lock_end = OBD_OBJECT_EOF;
1711 iov_copy = (struct iovec *)iov;
1712 nrsegs_copy = nr_segs;
1713 } else if (sbi->ll_max_rw_chunk != 0) {
1714 /* first, let's know the end of the current stripe */
1716 obd_extent_calc(sbi->ll_osc_exp, lsm, OBD_CALC_STRIPE_END,
1719 /* correct, the end is beyond the request */
1720 if (end > *ppos + count - 1)
1721 end = *ppos + count - 1;
1723 /* and chunk shouldn't be too large even if striping is wide */
1724 if (end - *ppos > sbi->ll_max_rw_chunk)
1725 end = *ppos + sbi->ll_max_rw_chunk - 1;
1728 chunk = end - *ppos + 1;
/* Remaining request fits one chunk: use the caller's iovec as-is. */
1729 if ((count == chunk) && (iov_offset == 0)) {
1731 OBD_FREE(iov_copy, sizeof(*iov) * nrsegs_orig);
1733 iov_copy = (struct iovec *)iov;
1734 nrsegs_copy = nr_segs;
1737 nrsegs_orig = nr_segs;
1738 OBD_ALLOC(iov_copy, sizeof(*iov) * nr_segs);
1740 GOTO(out, retval = -ENOMEM);
1742 iov_copy_update(&nr_segs, &iov, &nrsegs_copy, iov_copy,
1743 &iov_offset, chunk);
1748 iov_copy = (struct iovec *)iov;
1749 nrsegs_copy = nr_segs;
1752 tree_locked = ll_file_get_tree_lock_iov(&tree, file, iov_copy,
1754 (obd_off)lock_start,
1757 if (tree_locked < 0)
1758 GOTO(out, retval = tree_locked);
1760 /* This is ok, g_f_w will overwrite this under i_sem if it races
1761 * with a local truncate, it just makes our maxbyte checking easier.
1762 * The i_size value gets updated in ll_extent_lock() as a consequence
1763 * of the [0,EOF] extent lock we requested above. */
1764 if (file->f_flags & O_APPEND) {
1765 *ppos = i_size_read(inode);
1766 end = *ppos + count - 1;
/* POSIX: writing at/after the file-size limit raises SIGXFSZ / EFBIG. */
1769 if (*ppos >= maxbytes) {
1770 send_sig(SIGXFSZ, current, 0);
1771 GOTO(out_unlock, retval = -EFBIG);
1773 if (end > maxbytes - 1)
1776 /* generic_file_write handles O_APPEND after getting i_mutex */
1777 chunk = end - *ppos + 1;
1778 CDEBUG(D_INFO, "Writing inode %lu, "LPSZ" bytes, offset %Lu\n",
1779 inode->i_ino, chunk, *ppos);
1781 /* write under locks
1783 * 1. update inode's mtime and ctime as long as
1784 * concurrent stat (via ll_glimpse_size) might bring
1787 * 2. update lsm so that next stat (via
1788 * ll_glimpse_size) could get correct values in lsm */
1789 struct ost_lvb xtimes;
1791 lov_stripe_lock(lsm);
1792 LTIME_S(inode->i_mtime) = LTIME_S(CURRENT_TIME);
1793 LTIME_S(inode->i_ctime) = LTIME_S(CURRENT_TIME);
1794 xtimes.lvb_mtime = LTIME_S(inode->i_mtime);
1795 xtimes.lvb_ctime = LTIME_S(inode->i_ctime);
1796 obd_update_lvb(sbi->ll_osc_exp, lsm, &xtimes,
1797 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1798 lov_stripe_unlock(lsm);
1800 #ifdef HAVE_FILE_WRITEV
1801 retval = generic_file_writev(file, iov_copy, nrsegs_copy, ppos);
1803 retval = generic_file_aio_write(iocb, iov_copy, nrsegs_copy,
1809 * current time will get into request as mtime and
1810 * ctime (lustre/osc/osc_request.c:osc_build_request())
1812 retval = ll_file_lockless_io(file, iov_copy, nrsegs_copy,
1813 ppos, WRITE, chunk);
1815 ll_rw_stats_tally(ll_i2sbi(inode), current->pid, file, chunk, 1);
1819 ll_tree_unlock(&tree);
/* Full chunk written with bytes remaining: continue with the next chunk. */
1825 if (retval == chunk && count > 0)
1829 up(&ll_i2info(inode)->lli_write_sem);
1831 if (iov_copy && iov_copy != iov)
1832 OBD_FREE(iov_copy, sizeof(*iov) * nrsegs_orig);
1834 retval = (sum > 0) ? sum : retval;
1835 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1836 retval > 0 ? retval : 0);
/* Scalar write(2) entry point: single-segment iovec forwarded to the
 * vectored path (writev or a synchronous kiocb around ll_file_aio_write). */
1840 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1843 struct iovec local_iov = { .iov_base = (void __user *)buf,
1846 #ifdef HAVE_FILE_WRITEV
1847 return ll_file_writev(file, &local_iov, 1, ppos);
1852 init_sync_kiocb(&kiocb, file);
1853 kiocb.ki_pos = *ppos;
1854 kiocb.ki_left = count;
1856 ret = ll_file_aio_write(&kiocb, &local_iov, 1, kiocb.ki_pos);
/* Propagate the advanced position back to the caller's *ppos. */
1857 *ppos = kiocb.ki_pos;
1863 #ifdef HAVE_KERNEL_SENDFILE
1865 * Send file content (through pagecache) somewhere with helper
/* sendfile path: take a single PR tree lock over the whole region, settle
 * i_size against kms (glimpsing if the region may extend past kms), then let
 * generic_file_sendfile() stream the pages through @actor.  No chunking and
 * no fast/lockless variants here, unlike the read path. */
1867 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,
1868 size_t count, read_actor_t actor, void *target)
1870 struct inode *inode = in_file->f_dentry->d_inode;
1871 struct ll_inode_info *lli = ll_i2info(inode);
1872 struct lov_stripe_md *lsm = lli->lli_smd;
1873 struct ll_lock_tree tree;
1874 struct ll_lock_tree_node *node;
1876 struct ll_ra_read bead;
1881 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1882 inode->i_ino, inode->i_generation, inode, count, *ppos);
1884 /* "If nbyte is 0, read() will return 0 and have no other results."
1885 * -- Single Unix Spec */
1889 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_READ_BYTES, count);
1890 /* turn off the kernel's read-ahead */
1891 in_file->f_ra.ra_pages = 0;
1893 /* File with no objects, nothing to lock */
1895 rc = generic_file_sendfile(in_file, ppos, count, actor, target);
1899 node = ll_node_from_inode(inode, *ppos, *ppos + count - 1, LCK_PR);
1901 RETURN(PTR_ERR(node));
1903 tree.lt_fd = LUSTRE_FPRIVATE(in_file);
1904 rc = ll_tree_lock(&tree, node, NULL, count,
1905 in_file->f_flags & O_NONBLOCK?LDLM_FL_BLOCK_NOWAIT:0);
/* Lock granted: the file is evidently not contended any more. */
1909 ll_clear_file_contended(inode);
1910 ll_inode_size_lock(inode, 1);
1912 * Consistency guarantees: following possibilities exist for the
1913 * relation between region being read and real file size at this
1916 * (A): the region is completely inside of the file;
1918 * (B-x): x bytes of region are inside of the file, the rest is
1921 * (C): the region is completely outside of the file.
1923 * This classification is stable under DLM lock acquired by
1924 * ll_tree_lock() above, because to change class, other client has to
1925 * take DLM lock conflicting with our lock. Also, any updates to
1926 * ->i_size by other threads on this client are serialized by
1927 * ll_inode_size_lock(). This guarantees that short reads are handled
1928 * correctly in the face of concurrent writes and truncates.
1930 inode_init_lvb(inode, &lvb);
1931 obd_merge_lvb(ll_i2sbi(inode)->ll_osc_exp, lsm, &lvb, 1);
1933 if (*ppos + count - 1 > kms) {
1934 /* A glimpse is necessary to determine whether we return a
1935 * short read (B) or some zeroes at the end of the buffer (C) */
1936 ll_inode_size_unlock(inode, 1);
1937 rc = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
1941 /* region is within kms and, hence, within real file size (A) */
1942 i_size_write(inode, kms);
1943 ll_inode_size_unlock(inode, 1);
1946 CDEBUG(D_INFO, "Send ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
1947 inode->i_ino, count, *ppos, i_size_read(inode));
/* Set up Lustre's own readahead window for this transfer. */
1949 bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
1950 bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
1951 ll_ra_read_in(in_file, &bead);
1953 file_accessed(in_file);
1954 rc = generic_file_sendfile(in_file, ppos, count, actor, target);
1955 ll_ra_read_ex(in_file, &bead);
1958 ll_tree_unlock(&tree);
1964 * http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=f0930fffa99e7fe0a0c4b6c7d9a244dc88288c27
1966 #ifdef HAVE_KERNEL_SPLICE_READ
/* splice_read path for kernels where sendfile was replaced by splice (see
 * commit URL above).  Structure is identical to ll_file_sendfile(): one PR
 * tree lock over the region, i_size settled against kms (with a glimpse if
 * needed), then generic_file_splice_read() moves pages into @pipe. */
1967 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1968 struct pipe_inode_info *pipe, size_t count,
1971 struct inode *inode = in_file->f_dentry->d_inode;
1972 struct ll_inode_info *lli = ll_i2info(inode);
1973 struct lov_stripe_md *lsm = lli->lli_smd;
1974 struct ll_lock_tree tree;
1975 struct ll_lock_tree_node *node;
1977 struct ll_ra_read bead;
1982 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1983 inode->i_ino, inode->i_generation, inode, count, *ppos);
1985 /* "If nbyte is 0, read() will return 0 and have no other results."
1986 * -- Single Unix Spec */
1990 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_READ_BYTES, count);
1991 /* turn off the kernel's read-ahead */
1992 in_file->f_ra.ra_pages = 0;
1994 /* File with no objects, nothing to lock */
1996 rc = generic_file_splice_read(in_file, ppos, pipe, count, flags);
2000 node = ll_node_from_inode(inode, *ppos, *ppos + count - 1, LCK_PR);
2002 RETURN(PTR_ERR(node));
2004 tree.lt_fd = LUSTRE_FPRIVATE(in_file);
2005 rc = ll_tree_lock(&tree, node, NULL, count,
2006 in_file->f_flags & O_NONBLOCK?LDLM_FL_BLOCK_NOWAIT:0);
/* Lock granted: the file is evidently not contended any more. */
2010 ll_clear_file_contended(inode);
2011 ll_inode_size_lock(inode, 1);
2013 * Consistency guarantees: following possibilities exist for the
2014 * relation between region being read and real file size at this
2017 * (A): the region is completely inside of the file;
2019 * (B-x): x bytes of region are inside of the file, the rest is
2022 * (C): the region is completely outside of the file.
2024 * This classification is stable under DLM lock acquired by
2025 * ll_tree_lock() above, because to change class, other client has to
2026 * take DLM lock conflicting with our lock. Also, any updates to
2027 * ->i_size by other threads on this client are serialized by
2028 * ll_inode_size_lock(). This guarantees that short reads are handled
2029 * correctly in the face of concurrent writes and truncates.
2031 inode_init_lvb(inode, &lvb);
2032 obd_merge_lvb(ll_i2sbi(inode)->ll_osc_exp, lsm, &lvb, 1);
2034 if (*ppos + count - 1 > kms) {
2035 /* A glimpse is necessary to determine whether we return a
2036 * short read (B) or some zeroes at the end of the buffer (C) */
2037 ll_inode_size_unlock(inode, 1);
2038 rc = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
2042 /* region is within kms and, hence, within real file size (A) */
2043 i_size_write(inode, kms);
2044 ll_inode_size_unlock(inode, 1);
2047 CDEBUG(D_INFO, "Send ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
2048 inode->i_ino, count, *ppos, i_size_read(inode));
/* Set up Lustre's own readahead window for this transfer. */
2050 bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
2051 bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
2052 ll_ra_read_in(in_file, &bead);
2054 file_accessed(in_file);
2055 rc = generic_file_splice_read(in_file, ppos, pipe, count, flags);
2056 ll_ra_read_ex(in_file, &bead);
2059 ll_tree_unlock(&tree);
/* LL_IOC_RECREATE_OBJ ioctl handler: ask the OSC to recreate a lost OST
 * object for this file.  Admin-only (CFS_CAP_SYS_ADMIN).  Copies a
 * ll_recreate_obj request from userspace, clones the file's lsm, stamps the
 * obdo with the target object id / OST index plus OBD_FL_RECREATE_OBJS, and
 * issues obd_create() under lli_size_sem. */
2064 static int ll_lov_recreate_obj(struct inode *inode, struct file *file,
2067 struct ll_inode_info *lli = ll_i2info(inode);
2068 struct obd_export *exp = ll_i2obdexp(inode);
2069 struct ll_recreate_obj ucreatp;
2070 struct obd_trans_info oti = { 0 };
2071 struct obdo *oa = NULL;
2074 struct lov_stripe_md *lsm, *lsm2;
2077 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2080 rc = copy_from_user(&ucreatp, (struct ll_recreate_obj *)arg,
2081 sizeof(struct ll_recreate_obj));
/* Serialize against stripe changes while the lsm is copied and used. */
2089 down(&lli->lli_size_sem);
2092 GOTO(out, rc = -ENOENT);
2093 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
2094 (lsm->lsm_stripe_count));
2096 OBD_ALLOC(lsm2, lsm_size);
2098 GOTO(out, rc = -ENOMEM);
2100 oa->o_id = ucreatp.lrc_id;
/* NOTE(review): o_nlink is reused here to carry the target OST index —
 * presumably a protocol convention for OBD_FL_RECREATE_OBJS; confirm
 * against the OSC/LOV create path. */
2101 oa->o_nlink = ucreatp.lrc_ost_idx;
2102 oa->o_flags |= OBD_FL_RECREATE_OBJS;
2103 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS;
2104 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
2105 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
2107 memcpy(lsm2, lsm, lsm_size);
2108 rc = obd_create(exp, oa, &lsm2, &oti);
2110 OBD_FREE(lsm2, lsm_size);
2113 up(&lli->lli_size_sem);
/* Apply striping information (@lum) to a file by re-opening it on the MDS
 * with an IT_OPEN intent carrying the EA.  Fails fast if the file already
 * has a stripe (striping is immutable once set).  On success the transient
 * open handle is released immediately; lli_size_sem guards the lsm check. */
2118 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
2119 int flags, struct lov_user_md *lum,
2122 struct ll_inode_info *lli = ll_i2info(inode);
2123 struct lov_stripe_md *lsm;
2124 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
2128 down(&lli->lli_size_sem);
2131 up(&lli->lli_size_sem);
2132 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
2137 rc = ll_intent_file_open(file, lum, lum_size, &oit);
2140 if (it_disposition(&oit, DISP_LOOKUP_NEG))
2141 GOTO(out_req_free, rc = -ENOENT);
2142 rc = oit.d.lustre.it_status;
2144 GOTO(out_req_free, rc);
/* Drop the MDS open handle created purely to deliver the EA. */
2146 ll_release_openhandle(file->f_dentry, &oit);
2149 up(&lli->lli_size_sem);
2150 ll_intent_release(&oit);
2153 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/* Fetch the LOV EA (striping descriptor) of @filename under @inode from the
 * MDS and return it in a form suitable for userspace:
 *   - mdc_getattr_name() retrieves the raw lov_mds_md;
 *   - on big-endian hosts the MDS's little-endian EA is byte-swapped
 *     (objects only swabbed for regular files — directories carry no lsm);
 *   - LOV_MAGIC_JOIN EAs are expanded via obd_unpackmd() into a
 *     lov_user_md_join with per-stripe extent ranges and object ids.
 * *lmmp/*lmm_size receive the result; *request must be finished by the
 * caller (it owns the reply buffer the lmm points into). */
2157 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
2158 struct lov_mds_md **lmmp, int *lmm_size,
2159 struct ptlrpc_request **request)
2161 struct ll_sb_info *sbi = ll_i2sbi(inode);
2163 struct mds_body *body;
2164 struct lov_mds_md *lmm = NULL;
2165 struct ptlrpc_request *req = NULL;
2168 ll_inode2fid(&fid, inode);
2170 rc = ll_get_max_mdsize(sbi, &lmmsize);
2174 rc = mdc_getattr_name(sbi->ll_mdc_exp, &fid,
2175 filename, strlen(filename) + 1,
2176 OBD_MD_FLEASIZE | OBD_MD_FLDIREA,
2179 CDEBUG(D_INFO, "mdc_getattr_name failed "
2180 "on %s: rc %d\n", filename, rc);
2184 body = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF,
2186 LASSERT(body != NULL); /* checked by mdc_getattr_name */
2187 /* swabbed by mdc_getattr_name */
2188 LASSERT(lustre_rep_swabbed(req, REPLY_REC_OFF));
2190 lmmsize = body->eadatasize;
/* No striping EA present (or zero-length): nothing to return. */
2192 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
2194 GOTO(out, rc = -ENODATA);
2197 lmm = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF + 1,
2199 LASSERT(lmm != NULL);
2200 LASSERT(lustre_rep_swabbed(req, REPLY_REC_OFF + 1));
2202 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
2203 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3)) &&
2204 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_JOIN))) {
2205 GOTO(out, rc = -EPROTO);
2208 * This is coming from the MDS, so is probably in
2209 * little endian. We convert it to host endian before
2210 * passing it to userspace.
/* True only on big-endian hosts: swab the EA to host order. */
2212 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
2213 /* if function called for directory - we should
2214 * avoid swab not existent lsm objects */
2215 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
2216 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
2217 if (S_ISREG(body->mode))
2218 lustre_swab_lov_user_md_objects(
2219 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
2220 ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
2221 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2222 lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
2223 if (S_ISREG(body->mode))
2224 lustre_swab_lov_user_md_objects(
2225 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
2226 ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
2227 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_JOIN)) {
2228 lustre_swab_lov_user_md_join((struct lov_user_md_join *)lmm);
/* JOIN files: unpack the compact EA and materialize per-stripe extents. */
2232 if (lmm->lmm_magic == LOV_MAGIC_JOIN) {
2233 struct lov_stripe_md *lsm;
2234 struct lov_user_md_join *lmj;
2235 int lmj_size, i, aindex = 0;
2237 rc = obd_unpackmd(sbi->ll_osc_exp, &lsm, lmm, lmmsize);
2239 GOTO(out, rc = -ENOMEM);
2240 rc = obd_checkmd(sbi->ll_osc_exp, sbi->ll_mdc_exp, lsm);
2242 GOTO(out_free_memmd, rc);
2244 lmj_size = sizeof(struct lov_user_md_join) +
2245 lsm->lsm_stripe_count *
2246 sizeof(struct lov_user_ost_data_join);
2247 OBD_ALLOC(lmj, lmj_size);
2249 GOTO(out_free_memmd, rc = -ENOMEM);
2251 memcpy(lmj, lmm, sizeof(struct lov_user_md_join));
2252 for (i = 0; i < lsm->lsm_stripe_count; i++) {
/* aindex walks the extent array in step with stripe index i. */
2253 struct lov_extent *lex =
2254 &lsm->lsm_array->lai_ext_array[aindex];
2256 if (lex->le_loi_idx + lex->le_stripe_count <= i)
2258 CDEBUG(D_INFO, "aindex %d i %d l_extent_start "
2259 LPU64" len %d\n", aindex, i,
2260 lex->le_start, (int)lex->le_len);
2261 lmj->lmm_objects[i].l_extent_start =
/* le_len == -1 denotes an extent running to EOF. */
2264 if ((int)lex->le_len == -1)
2265 lmj->lmm_objects[i].l_extent_end = -1;
2267 lmj->lmm_objects[i].l_extent_end =
2268 lex->le_start + lex->le_len;
2269 lmj->lmm_objects[i].l_object_id =
2270 lsm->lsm_oinfo[i]->loi_id;
2271 lmj->lmm_objects[i].l_object_gr =
2272 lsm->lsm_oinfo[i]->loi_gr;
2273 lmj->lmm_objects[i].l_ost_gen =
2274 lsm->lsm_oinfo[i]->loi_ost_gen;
2275 lmj->lmm_objects[i].l_ost_idx =
2276 lsm->lsm_oinfo[i]->loi_ost_idx;
2278 lmm = (struct lov_mds_md *)lmj;
2281 obd_free_memmd(sbi->ll_osc_exp, &lsm);
2285 *lmm_size = lmmsize;
/* LL_IOC_LOV_SETEA handler: admin-only variant of setstripe that supplies a
 * full EA (including one lov_user_ost_data) from userspace and applies it
 * via ll_lov_setstripe_ea_info() with MDS_OPEN_HAS_OBJS. */
2289 static int ll_lov_setea(struct inode *inode, struct file *file,
2292 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2293 struct lov_user_md *lump;
2294 int lum_size = sizeof(struct lov_user_md) +
2295 sizeof(struct lov_user_ost_data);
2299 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2302 OBD_ALLOC(lump, lum_size);
2306 rc = copy_from_user(lump, (struct lov_user_md *)arg, lum_size);
2308 OBD_FREE(lump, lum_size);
2312 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
2314 OBD_FREE(lump, lum_size);
/* LL_IOC_LOV_SETSTRIPE handler: copy a lov_user_md from userspace (first as
 * the smaller v1 layout, upgrading to v3 if the magic says so) and apply it.
 * On success the in-kernel stripe count is reflected back to the user's
 * lmm_stripe_count via the GETSTRIPE iocontrol path. */
2318 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2321 struct lov_user_md_v3 lumv3;
2322 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
2323 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
2324 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
2327 int flags = FMODE_WRITE;
2330 /* first try with v1 which is smaller than v3 */
2331 lum_size = sizeof(struct lov_user_md_v1);
2332 rc = copy_from_user(lumv1, lumv1p, lum_size);
2336 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
/* v3 request: re-copy the full v3 structure over the v1 prefix. */
2337 lum_size = sizeof(struct lov_user_md_v3);
2338 rc = copy_from_user(&lumv3, lumv3p, lum_size);
2343 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
2345 put_user(0, &lumv1p->lmm_stripe_count);
2346 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2obdexp(inode),
2347 0, ll_i2info(inode)->lli_smd,
/* LL_IOC_LOV_GETSTRIPE handler: hand the file's lsm to the LOV iocontrol,
 * which packs it into the user buffer at @arg. */
2353 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
2355 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
2360 return obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2obdexp(inode), 0, lsm,
/* LL_IOC_GROUP_LOCK handler: take a whole-file LCK_GROUP extent lock with
 * group id @arg.  On success the fd is marked LL_FILE_GROUP_LOCKED (and
 * LL_FILE_IGNORE_LOCK so ordinary i/o skips DLM locking) and the lock
 * handle is stashed in fd->fd_cwlockh for ll_put_grouplock(). */
2364 static int ll_get_grouplock(struct inode *inode, struct file *file,
2367 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2368 ldlm_policy_data_t policy = { .l_extent = { .start = 0,
2369 .end = OBD_OBJECT_EOF}};
2370 struct lustre_handle lockh = { 0 };
2371 struct ll_inode_info *lli = ll_i2info(inode);
2372 struct lov_stripe_md *lsm = lli->lli_smd;
/* Only one group lock per file descriptor. */
2376 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2380 policy.l_extent.gid = arg;
2381 if (file->f_flags & O_NONBLOCK)
2382 flags = LDLM_FL_BLOCK_NOWAIT;
2384 rc = ll_extent_lock(fd, inode, lsm, LCK_GROUP, &policy, &lockh, flags);
2388 fd->fd_flags |= LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK;
2390 memcpy(&fd->fd_cwlockh, &lockh, sizeof(lockh));
/* LL_IOC_GROUP_UNLOCK handler: drop the group lock taken by
 * ll_get_grouplock(), validating that a lock is held and that the caller's
 * group id matches, then clearing the fd flags and stored handle. */
2395 static int ll_put_grouplock(struct inode *inode, struct file *file,
2398 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2399 struct ll_inode_info *lli = ll_i2info(inode);
2400 struct lov_stripe_md *lsm = lli->lli_smd;
2404 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2405 /* Ugh, it's already unlocked. */
2409 if (fd->fd_gid != arg) /* Ugh? Unlocking with different gid? */
2412 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
2414 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP, &fd->fd_cwlockh);
2419 memset(&fd->fd_cwlockh, 0, sizeof(fd->fd_cwlockh));
2424 #if LUSTRE_FIX >= 50
/* Validate a file-join request: the server must advertise join support,
 * both inodes must be regular files and distinct, and the head file's size
 * must be a multiple of JOIN_FILE_ALIGN (64K). */
2425 static int join_sanity_check(struct inode *head, struct inode *tail)
2428 if ((ll_i2sbi(head)->ll_flags & LL_SBI_JOIN) == 0) {
2429 CERROR("server do not support join \n");
2432 if (!S_ISREG(tail->i_mode) || !S_ISREG(head->i_mode)) {
2433 CERROR("tail ino %lu and ino head %lu must be regular\n",
2434 head->i_ino, tail->i_ino);
2437 if (head->i_ino == tail->i_ino) {
2438 CERROR("file %lu can not be joined to itself \n", head->i_ino);
2441 if (i_size_read(head) % JOIN_FILE_ALIGN) {
2442 CERROR("hsize %llu must be times of 64K\n", i_size_read(head));
/* Perform the MDS side of joining @tail_filp onto @head_inode: enqueue an
 * IT_OPEN intent with O_JOIN_FILE, passing the head's current size as the
 * join offset in the mdc op data.  Any granted intent lock is dropped
 * immediately and the transient open handle released. */
2448 static int join_file(struct inode *head_inode, struct file *head_filp,
2449 struct file *tail_filp)
2451 struct dentry *tail_dentry = tail_filp->f_dentry;
2452 struct lookup_intent oit = {.it_op = IT_OPEN,
2453 .it_flags = head_filp->f_flags|O_JOIN_FILE};
2454 struct ldlm_enqueue_info einfo = { LDLM_IBITS, LCK_PW,
2455 ll_mdc_blocking_ast, ldlm_completion_ast, NULL, NULL };
2457 struct lustre_handle lockh;
2458 struct mdc_op_data *op_data;
2463 tail_dentry = tail_filp->f_dentry;
2465 OBD_ALLOC_PTR(op_data);
2466 if (op_data == NULL) {
/* The head's size is the offset at which the tail will be appended. */
2470 data = i_size_read(head_inode);
2471 ll_prepare_mdc_op_data(op_data, head_inode,
2472 tail_dentry->d_parent->d_inode,
2473 tail_dentry->d_name.name,
2474 tail_dentry->d_name.len, 0, &data);
2475 rc = mdc_enqueue(ll_i2mdcexp(head_inode), &einfo, &oit,
2476 op_data, &lockh, NULL, 0, 0);
2481 rc = oit.d.lustre.it_status;
2483 if (rc < 0 || it_open_error(DISP_OPEN_OPEN, &oit)) {
2484 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, &oit);
2485 ptlrpc_req_finished((struct ptlrpc_request *)
2486 oit.d.lustre.it_data);
2490 if (oit.d.lustre.it_lock_mode) { /* If we got lock - release it right
2492 ldlm_lock_decref(&lockh, oit.d.lustre.it_lock_mode);
2493 oit.d.lustre.it_lock_mode = 0;
2495 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
2496 it_clear_disposition(&oit, DISP_ENQ_COMPLETE);
2497 ll_release_openhandle(head_filp->f_dentry, &oit);
2500 OBD_FREE_PTR(op_data);
2501 ll_intent_release(&oit);
/* Join the file named by @filename_tail onto the open file @head/@filp.
 * Opens the tail, takes EX extent locks on both files in ascending
 * inode-number order (to avoid ABA deadlock with a concurrent join in
 * the opposite direction), sanity-checks the pair, then asks the MDS to
 * perform the join.  Cleanup is staged via cleanup_phase fall-through:
 * each phase undoes one acquisition (second lock, first lock, tail
 * open), and on success the head's cached stripe MD is freed so it is
 * re-fetched with the new (joined) layout. */
2505 static int ll_file_join(struct inode *head, struct file *filp,
2506 char *filename_tail)
2508 struct inode *tail = NULL, *first = NULL, *second = NULL;
2509 struct dentry *tail_dentry;
2510 struct file *tail_filp, *first_filp, *second_filp;
2511 struct ll_lock_tree first_tree, second_tree;
2512 struct ll_lock_tree_node *first_node, *second_node;
2513 struct ll_inode_info *hlli = ll_i2info(head), *tlli;
2514 int rc = 0, cleanup_phase = 0;
2517 CDEBUG(D_VFSTRACE, "VFS Op:head=%lu/%u(%p) tail %s\n",
2518 head->i_ino, head->i_generation, head, filename_tail);
2520 tail_filp = filp_open(filename_tail, O_WRONLY, 0644);
2521 if (IS_ERR(tail_filp)) {
2522 CERROR("Can not open tail file %s", filename_tail);
2523 rc = PTR_ERR(tail_filp);
2526 tail = igrab(tail_filp->f_dentry->d_inode);
2528 tlli = ll_i2info(tail);
2529 tail_dentry = tail_filp->f_dentry;
2530 LASSERT(tail_dentry);
/* Lock ordering: always lock the higher-numbered inode first so two
 * concurrent joins of the same pair cannot deadlock. */
2533 /*reorder the inode for lock sequence*/
2534 first = head->i_ino > tail->i_ino ? head : tail;
2535 second = head->i_ino > tail->i_ino ? tail : head;
2536 first_filp = head->i_ino > tail->i_ino ? filp : tail_filp;
2537 second_filp = head->i_ino > tail->i_ino ? tail_filp : filp;
2539 CDEBUG(D_INFO, "reorder object from %lu:%lu to %lu:%lu \n",
2540 head->i_ino, tail->i_ino, first->i_ino, second->i_ino);
/* EX lock over the whole extent [0, EOF] of each file. */
2541 first_node = ll_node_from_inode(first, 0, OBD_OBJECT_EOF, LCK_EX);
2542 if (IS_ERR(first_node)){
2543 rc = PTR_ERR(first_node);
2546 first_tree.lt_fd = first_filp->private_data;
2547 rc = ll_tree_lock(&first_tree, first_node, NULL, 0, 0);
2552 second_node = ll_node_from_inode(second, 0, OBD_OBJECT_EOF, LCK_EX);
2553 if (IS_ERR(second_node)){
2554 rc = PTR_ERR(second_node);
2557 second_tree.lt_fd = second_filp->private_data;
2558 rc = ll_tree_lock(&second_tree, second_node, NULL, 0, 0);
2563 rc = join_sanity_check(head, tail);
2567 rc = join_file(head, filp, tail_filp);
/* Staged cleanup: higher phases fall through to lower ones.
 * NOTE(review): the case labels between phases are elided from this
 * listing -- the switch relies on fall-through; confirm in full
 * source. */
2571 switch (cleanup_phase) {
2573 ll_tree_unlock(&second_tree);
2574 obd_cancel_unused(ll_i2obdexp(second),
2575 ll_i2info(second)->lli_smd, 0, NULL);
2577 ll_tree_unlock(&first_tree);
2578 obd_cancel_unused(ll_i2obdexp(first),
2579 ll_i2info(first)->lli_smd, 0, NULL);
2581 filp_close(tail_filp, 0);
/* On success drop the stale stripe metadata of the head so the new
 * joined layout is fetched from the MDS on next access. */
2584 if (head && rc == 0) {
2585 obd_free_memmd(ll_i2sbi(head)->ll_osc_exp,
2587 hlli->lli_smd = NULL;
2592 CERROR("invalid cleanup_phase %d\n", cleanup_phase);
2597 #endif /* LUSTRE_FIX >= 50 */
2600 * Close inode open handle
2602 * \param dentry [in] dentry which contains the inode
2603 * \param it [in,out] intent which contains open info and result
2606 * \retval <0 failure
/* Close the MDS open handle carried inside a lookup intent.
 * Used when an intent-open produced an open handle that the caller does
 * not want to keep (e.g. after a join, or an aborted open).  Does
 * nothing for the filesystem root or when the intent carries no
 * DISP_OPEN_OPEN disposition.  Drops the intent's request reference
 * (in place of what ll_file_open would normally consume). */
2608 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2610 struct inode *inode = dentry->d_inode;
2611 struct obd_client_handle *och;
2617 /* Root ? Do nothing. */
2618 if (dentry->d_inode->i_sb->s_root == dentry)
2621 /* No open handle to close? Move away */
2622 if (!it_disposition(it, DISP_OPEN_OPEN))
2625 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2627 OBD_ALLOC(och, sizeof(*och));
2629 GOTO(out, rc = -ENOMEM);
/* Fill a transient client handle from the intent, then close it on
 * the MDS; the handle is only needed for the duration of the close. */
2631 ll_och_fill(ll_i2info(inode), it, och);
2633 rc = ll_close_inode_openhandle(inode, och);
2635 OBD_FREE(och, sizeof(*och));
2637 /* this one is in place of ll_file_open */
2638 if (it_disposition(it, DISP_ENQ_OPEN_REF))
2639 ptlrpc_req_finished(it->d.lustre.it_data);
2640 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* Fill in @fiemap (a buffer of @num_bytes total) with the extent
 * mapping of @inode by querying the OSC/LOV layer via obd_get_info
 * with a KEY_FIEMAP key.  Striped files (stripe_count > 1) require the
 * caller to understand FIEMAP_FLAG_DEVICE_ORDER, since extents come
 * back grouped per OST device rather than in logical-offset order.
 * NOTE(review): the error-return lines for the rejection paths are
 * elided from this listing -- presumably -EOPNOTSUPP or similar. */
2644 int ll_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
2647 struct obd_export *exp = ll_i2obdexp(inode);
2648 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
2649 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
2650 int vallen = num_bytes;
2654 /* If the stripe_count > 1 and the application does not understand
2655 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
2657 if (lsm->lsm_stripe_count > 1 &&
2658 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
2661 fm_key.oa.o_id = lsm->lsm_object_id;
2662 fm_key.oa.o_valid = OBD_MD_FLID;
2664 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLFID | OBD_MD_FLSIZE);
2666 /* If filesize is 0, then there would be no objects for mapping */
2667 if (fm_key.oa.o_size == 0) {
2668 fiemap->fm_mapped_extents = 0;
2672 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
2674 rc = obd_get_info(exp, sizeof(fm_key), &fm_key, &vallen, fiemap, lsm);
2676 CERROR("obd_get_info failed: rc = %d\n", rc);
/* ioctl entry point for regular Lustre files.  Dispatches Lustre
 * private ioctls (flags, striping, group locks, join, statfs, device
 * name), the ext3-compatible FIEMAP/flags/version ioctls, dynamically
 * registered ioctls (ll_iocontrol_call), and finally falls through to
 * obd_iocontrol for anything network-encoded.  TTY ioctls are rejected
 * up front since they can never apply to a Lustre file. */
2681 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
2684 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2688 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2689 inode->i_generation, inode, cmd);
2690 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2692 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2693 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2697 case LL_IOC_GETFLAGS:
2698 /* Get the current value of the file flags */
2699 return put_user(fd->fd_flags, (int *)arg);
2700 case LL_IOC_SETFLAGS:
2701 case LL_IOC_CLRFLAGS:
2702 /* Set or clear specific file flags */
2703 /* XXX This probably needs checks to ensure the flags are
2704 * not abused, and to handle any flag side effects.
2706 if (get_user(flags, (int *) arg))
/* LL_FILE_IGNORE_LOCK is only safe with O_DIRECT: cached (page
 * cache) I/O without DLM locking would corrupt data. */
2709 if (cmd == LL_IOC_SETFLAGS) {
2710 if ((flags & LL_FILE_IGNORE_LOCK) &&
2711 !(file->f_flags & O_DIRECT)) {
2712 CERROR("%s: unable to disable locking on "
2713 "non-O_DIRECT file\n", current->comm);
2717 fd->fd_flags |= flags;
2719 fd->fd_flags &= ~flags;
2722 case LL_IOC_LOV_SETSTRIPE:
2723 RETURN(ll_lov_setstripe(inode, file, arg));
2724 case LL_IOC_LOV_SETEA:
2725 RETURN(ll_lov_setea(inode, file, arg));
2726 case LL_IOC_LOV_GETSTRIPE:
2727 RETURN(ll_lov_getstripe(inode, arg));
2728 case LL_IOC_RECREATE_OBJ:
2729 RETURN(ll_lov_recreate_obj(inode, file, arg));
2730 case EXT3_IOC_FIEMAP: {
2731 struct ll_user_fiemap *fiemap_s;
2732 size_t num_bytes, ret_bytes;
2733 unsigned int extent_count;
2736 /* Get the extent count so we can calculate the size of
2737 * required fiemap buffer */
2738 if (get_user(extent_count,
2739 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
/* NOTE(review): extent_count is user-controlled; the multiplication
 * below could overflow on 32-bit -- overflow check not visible in
 * this elided listing, confirm in full source. */
2741 num_bytes = sizeof(*fiemap_s) + (extent_count *
2742 sizeof(struct ll_fiemap_extent));
2743 OBD_VMALLOC(fiemap_s, num_bytes);
2744 if (fiemap_s == NULL)
2747 if (copy_from_user(fiemap_s,(struct ll_user_fiemap __user *)arg,
2749 GOTO(error, rc = -EFAULT);
/* Unknown flags: echo back the unsupported set (per fiemap ABI) and
 * fail with -EBADR so the caller can detect what is unsupported. */
2751 if (fiemap_s->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2752 fiemap_s->fm_flags = fiemap_s->fm_flags &
2753 ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2754 if (copy_to_user((char *)arg, fiemap_s,
2756 GOTO(error, rc = -EFAULT);
2758 GOTO(error, rc = -EBADR);
2761 /* If fm_extent_count is non-zero, read the first extent since
2762 * it is used to calculate end_offset and device from previous
2765 if (copy_from_user(&fiemap_s->fm_extents[0],
2766 (char __user *)arg + sizeof(*fiemap_s),
2767 sizeof(struct ll_fiemap_extent)))
2768 GOTO(error, rc = -EFAULT);
/* FIEMAP_FLAG_SYNC: flush dirty pages first so the mapping reflects
 * on-disk state. */
2771 if (fiemap_s->fm_flags & FIEMAP_FLAG_SYNC) {
2774 rc = filemap_fdatawrite(inode->i_mapping);
2779 rc = ll_fiemap(inode, fiemap_s, num_bytes);
2783 ret_bytes = sizeof(struct ll_user_fiemap);
2785 if (extent_count != 0)
2786 ret_bytes += (fiemap_s->fm_mapped_extents *
2787 sizeof(struct ll_fiemap_extent));
2789 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
2793 OBD_VFREE(fiemap_s, num_bytes);
2796 case EXT3_IOC_GETFLAGS:
2797 case EXT3_IOC_SETFLAGS:
2798 RETURN(ll_iocontrol(inode, file, cmd, arg));
2799 case EXT3_IOC_GETVERSION_OLD:
2800 case EXT3_IOC_GETVERSION:
2801 RETURN(put_user(inode->i_generation, (int *)arg));
2803 #if LUSTRE_FIX >= 50
2804 /* Allow file join in beta builds to allow debugging */
2808 ftail = getname((const char *)arg);
2810 RETURN(PTR_ERR(ftail));
2811 rc = ll_file_join(inode, file, ftail);
2815 CWARN("file join is not supported in this version of Lustre\n");
2819 case LL_IOC_GROUP_LOCK:
2820 RETURN(ll_get_grouplock(inode, file, arg));
2821 case LL_IOC_GROUP_UNLOCK:
2822 RETURN(ll_put_grouplock(inode, file, arg));
2823 case IOC_OBD_STATFS:
2824 RETURN(ll_obd_statfs(inode, (void *)arg));
2825 case OBD_IOC_GETNAME_OLD:
2826 case OBD_IOC_GETNAME: {
2827 struct obd_device *obd =
2828 class_exp2obd(ll_i2sbi(inode)->ll_osc_exp);
2831 if (copy_to_user((void *)arg, obd->obd_name,
2832 strlen(obd->obd_name) + 1))
2837 /* We need to special case any other ioctls we want to handle,
2838 * to send them to the MDS/OST as appropriate and to properly
2839 * network encode the arg field.
2840 case EXT3_IOC_SETVERSION_OLD:
2841 case EXT3_IOC_SETVERSION:
/* First give dynamically registered handlers a chance; otherwise
 * pass the command down to the OBD layer. */
2847 ll_iocontrol_call(inode, file, cmd, arg, &err))
2850 RETURN(obd_iocontrol(cmd, ll_i2obdexp(inode), 0, NULL,
/* llseek for Lustre files.  SEEK_END must first glimpse the file size
 * from the OSTs (cheap DLM glimpse; LDLM_FL_BLOCK_NOWAIT when the file
 * was opened O_NONBLOCK), then reads i_size under the inode size lock.
 * The final offset is validated against the client's max file size
 * before updating f_pos. */
2856 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2858 struct inode *inode = file->f_dentry->d_inode;
2859 struct ll_inode_info *lli = ll_i2info(inode);
2860 struct lov_stripe_md *lsm = lli->lli_smd;
/* Tentative target, computed only for the trace message below;
 * the authoritative offset is recomputed under lock further down. */
2863 retval = offset + ((origin == 2) ? i_size_read(inode) :
2864 (origin == 1) ? file->f_pos : 0);
2865 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%Lu=%#Lx(%s)\n",
2866 inode->i_ino, inode->i_generation, inode, retval, retval,
2867 origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR" : "SEEK_SET");
2868 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2870 if (origin == 2) { /* SEEK_END */
2871 int nonblock = 0, rc;
2873 if (file->f_flags & O_NONBLOCK)
2874 nonblock = LDLM_FL_BLOCK_NOWAIT;
/* Refresh i_size from the OSTs before using it. */
2877 rc = ll_glimpse_size(inode, nonblock);
2882 ll_inode_size_lock(inode, 0);
2883 offset += i_size_read(inode);
2884 ll_inode_size_unlock(inode, 0);
2885 } else if (origin == 1) { /* SEEK_CUR */
2886 offset += file->f_pos;
2890 if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
2891 if (offset != file->f_pos) {
2892 file->f_pos = offset;
/* Invalidate cached readdir/read state tied to the old position. */
2893 file->f_version = 0;
/* fsync for Lustre files.  The VFS has already issued fdatawrite; this
 * waits for that I/O, harvests any asynchronous writeback errors
 * recorded on the inode and the stripe MD, syncs the metadata on the
 * MDS (mdc_sync), and finally syncs the data objects on the OSTs
 * (obd_sync_rqset) when the file has a stripe layout. */
2901 int ll_fsync(struct file *file, struct dentry *dentry, int data)
2903 struct inode *inode = dentry->d_inode;
2904 struct ll_inode_info *lli = ll_i2info(inode);
2905 struct lov_stripe_md *lsm = lli->lli_smd;
2907 struct ptlrpc_request *req;
2910 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2911 inode->i_generation, inode);
2912 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2914 /* fsync's caller has already called _fdata{sync,write}, we want
2915 * that IO to finish before calling the osc and mdc sync methods */
2916 rc = filemap_fdatawait(inode->i_mapping);
2918 /* catch async errors that were recorded back when async writeback
2919 * failed for pages in this mapping. */
2920 err = lli->lli_async_rc;
2921 lli->lli_async_rc = 0;
2925 err = lov_test_and_clear_async_rc(lsm);
/* Sync metadata on the MDS. */
2930 ll_inode2fid(&fid, inode);
2931 err = mdc_sync(ll_i2sbi(inode)->ll_mdc_exp, &fid, &req);
2935 ptlrpc_req_finished(req);
/* Sync data objects on the OSTs (only when a layout exists). */
2938 struct obd_info *oinfo;
2940 OBD_ALLOC_PTR(oinfo);
/* Preserve an earlier error over -ENOMEM if one already occurred. */
2942 RETURN(rc ? rc : -ENOMEM);
2943 OBDO_ALLOC(oinfo->oi_oa);
2944 if (!oinfo->oi_oa) {
2945 OBD_FREE_PTR(oinfo);
2946 RETURN(rc ? rc : -ENOMEM);
2948 oinfo->oi_oa->o_id = lsm->lsm_object_id;
2949 oinfo->oi_oa->o_gr = lsm->lsm_object_gr;
2950 oinfo->oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2951 obdo_from_inode(oinfo->oi_oa, inode,
2952 OBD_MD_FLTYPE | OBD_MD_FLATIME |
2953 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
2955 err = obd_sync_rqset(ll_i2sbi(inode)->ll_osc_exp, oinfo,
2959 OBDO_FREE(oinfo->oi_oa);
2960 OBD_FREE_PTR(oinfo);
/* POSIX fcntl locks and BSD flock() for Lustre, implemented with
 * server-side LDLM_FLOCK locks so locking is cluster-coherent.
 * F_RDLCK maps to LCK_PR, F_WRLCK to LCK_PW, and F_UNLCK is expressed
 * as an LCK_NL enqueue (see long comment below) rather than a separate
 * cancel RPC.  After the server call the lock is mirrored into the
 * local VFS lock tables so local deadlock detection and /proc listing
 * keep working. */
2966 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2968 struct inode *inode = file->f_dentry->d_inode;
2969 struct ll_sb_info *sbi = ll_i2sbi(inode);
2970 struct lu_fid *fid = ll_inode_lu_fid(inode);
2971 struct ldlm_res_id res_id =
2972 { .name = { fid_seq(fid),
2976 struct ldlm_enqueue_info einfo = { LDLM_FLOCK, 0, NULL,
2977 ldlm_flock_completion_ast, NULL, file_lock };
2978 struct lustre_handle lockh = {0};
2979 ldlm_policy_data_t flock;
2984 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2985 inode->i_ino, file_lock);
2986 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2988 if (fid_is_igif(fid)) {
2989 /* If this is an IGIF inode, we need to keep the 1.6-style
2990 * flock mapping for compatibility. If it is a proper FID
2991 * then we know any other client accessing it must also be
2992 * accessing it as a FID and can use the CMD-style flock. */
2993 res_id.name[2] = LDLM_FLOCK;
2997 if (file_lock->fl_flags & FL_FLOCK) {
2998 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2999 /* set missing params for flock() calls */
3000 file_lock->fl_end = OFFSET_MAX;
3001 file_lock->fl_pid = current->tgid;
3003 flock.l_flock.pid = file_lock->fl_pid;
3004 flock.l_flock.start = file_lock->fl_start;
3005 flock.l_flock.end = file_lock->fl_end;
3007 switch (file_lock->fl_type) {
3009 einfo.ei_mode = LCK_PR;
3012 /* An unlock request may or may not have any relation to
3013 * existing locks so we may not be able to pass a lock handle
3014 * via a normal ldlm_lock_cancel() request. The request may even
3015 * unlock a byte range in the middle of an existing lock. In
3016 * order to process an unlock request we need all of the same
3017 * information that is given with a normal read or write record
3018 * lock request. To avoid creating another ldlm unlock (cancel)
3019 * message we'll treat a LCK_NL flock request as an unlock. */
3020 einfo.ei_mode = LCK_NL;
3023 einfo.ei_mode = LCK_PW;
3026 CERROR("unknown fcntl lock type: %d\n", file_lock->fl_type);
/* NOTE(review): the cmd switch's case labels are elided from this
 * listing; from the flags set, F_SETLK gets BLOCK_NOWAIT and
 * F_GETLK gets TEST_LOCK -- confirm in full source. */
3041 flags = LDLM_FL_BLOCK_NOWAIT;
3047 flags = LDLM_FL_TEST_LOCK;
3048 /* Save the old mode so that if the mode in the lock changes we
3049 * can decrement the appropriate reader or writer refcount. */
3050 file_lock->fl_type = einfo.ei_mode;
3053 CERROR("unknown fcntl lock command: %d\n", cmd);
3057 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
3058 "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
3059 flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
3061 rc = ldlm_cli_enqueue(sbi->ll_mdc_exp, NULL, &einfo, res_id,
3062 &flock, &flags, NULL, 0, NULL, &lockh, 0);
/* Mirror the result into the kernel's local lock bookkeeping. */
3063 if ((file_lock->fl_flags & FL_FLOCK) &&
3064 (rc == 0 || file_lock->fl_type == F_UNLCK))
3065 ll_flock_lock_file_wait(file, file_lock, (cmd == F_SETLKW));
3066 #ifdef HAVE_F_OP_FLOCK
3067 if ((file_lock->fl_flags & FL_POSIX) &&
3068 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3069 !(flags & LDLM_FL_TEST_LOCK))
3070 posix_lock_file_wait(file, file_lock);
/* Lock handler used by the -o noflock mount option: installed in
 * ll_file_operations_noflock so user lock requests are rejected.
 * NOTE(review): the body is elided from this listing -- presumably it
 * returns -ENOSYS (see the table comment below in this file). */
3076 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
/* Test whether this client already holds a cached MDS inodebits lock
 * covering @bits on @inode.  Uses LDLM_FL_TEST_LOCK so no reference is
 * taken, and LDLM_FL_CBPENDING so locks already scheduled for
 * cancellation still count (we only need to know the attributes are
 * currently valid).  Any of CR/CW/PR/PW modes satisfies the check. */
3083 int ll_have_md_lock(struct inode *inode, __u64 bits)
3085 struct lustre_handle lockh;
3086 struct ldlm_res_id res_id;
3087 struct obd_device *obddev;
3088 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
3095 obddev = ll_i2mdcexp(inode)->exp_obd;
3096 fid_build_reg_res_name(ll_inode_lu_fid(inode), &res_id);
3098 CDEBUG(D_INFO, "trying to match res "LPU64":"LPU64":"LPU64"\n",
3103 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
3104 if (ldlm_lock_match(obddev->obd_namespace, flags, &res_id, LDLM_IBITS,
3105 &policy, LCK_CR|LCK_CW|LCK_PR|LCK_PW, &lockh)) {
/* Translate the result of an MDS getattr/revalidate RPC.  -ENOENT
 * means the inode was unlinked while we held it open: treat as success
 * (the nlink update path for non-regular, non-directory inodes is
 * elided from this listing).  Other errors are logged and passed up. */
3112 static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
3113 if (rc == -ENOENT) { /* Already unlinked. Just update nlink
3114 * and return success */
3116 /* This path cannot be hit for regular files unless in
3117 * case of obscure races, so no need to to validate
3119 if (!S_ISREG(inode->i_mode) &&
3120 !S_ISDIR(inode->i_mode))
3125 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
/* Revalidate a dentry/inode's attributes against the MDS before a
 * getattr.  Two paths: servers with OBD_CONNECT_ATTRFID get an
 * intent-getattr by FID (no name lookup); otherwise, if no UPDATE or
 * LOOKUP inodebits lock is cached locally, a plain mdc_getattr RPC
 * refreshes the attributes.  Afterwards the file size is refreshed
 * from the OSTs with a glimpse (or from the cached lvb when no objects
 * are allocated yet). */
3133 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
3135 struct inode *inode = dentry->d_inode;
3136 struct ptlrpc_request *req = NULL;
3137 struct obd_export *exp;
/* NOTE(review): guard condition for this diagnostic is elided --
 * presumably fires when inode is NULL. */
3142 CERROR("REPORT THIS LINE TO PETER\n");
3145 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
3146 inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
3148 exp = ll_i2mdcexp(inode);
3150 if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
3151 struct lookup_intent oit = { .it_op = IT_GETATTR };
3152 struct mdc_op_data op_data = { { 0 } };
3154 /* Call getattr by fid, so do not provide name at all. */
3155 ll_prepare_mdc_op_data(&op_data, dentry->d_parent->d_inode,
3156 dentry->d_inode, NULL, 0, 0, NULL);
3157 oit.it_flags |= O_CHECK_STALE;
3158 rc = mdc_intent_lock(exp, &op_data, NULL, 0,
3159 /* we are not interested in name
3162 ll_mdc_blocking_ast, 0);
3163 oit.it_flags &= ~O_CHECK_STALE;
3165 rc = ll_inode_revalidate_fini(inode, rc);
3169 rc = revalidate_it_finish(req, DLM_REPLY_REC_OFF, &oit, dentry);
3171 ll_intent_release(&oit);
3175 /* Unlinked? Unhash dentry, so it is not picked up later by
3176 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3177 here to preserve get_cwd functionality on 2.6.
3179 if (!dentry->d_inode->i_nlink) {
3180 spin_lock(&ll_lookup_lock);
3181 spin_lock(&dcache_lock);
3182 ll_drop_dentry(dentry);
3183 spin_unlock(&dcache_lock);
3184 spin_unlock(&ll_lookup_lock);
3187 ll_lookup_finish_locks(&oit, dentry);
3188 } else if (!ll_have_md_lock(dentry->d_inode,
3189 MDS_INODELOCK_UPDATE|MDS_INODELOCK_LOOKUP)) {
3190 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3192 obd_valid valid = OBD_MD_FLGETATTR;
/* For regular files also fetch the striping EA so the layout is
 * available for the size glimpse below. */
3195 if (S_ISREG(inode->i_mode)) {
3196 rc = ll_get_max_mdsize(sbi, &ealen);
3199 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3201 ll_inode2fid(&fid, inode);
3202 rc = mdc_getattr(sbi->ll_mdc_exp, &fid, valid, ealen, &req);
3204 rc = ll_inode_revalidate_fini(inode, rc);
3208 rc = ll_prep_inode(sbi->ll_osc_exp, &inode, req, REPLY_REC_OFF,
3214 /* if object not yet allocated, don't validate size */
3215 if (ll_i2info(inode)->lli_smd == NULL) {
3216 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
3217 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
3218 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
3222 /* ll_glimpse_size will prefer locally cached writes if they extend
3224 rc = ll_glimpse_size(inode, 0);
3227 ptlrpc_req_finished(req);
/* Fill @stat from the inode after revalidating it against the MDS/OSTs
 * with the given intent.  Size and blocks are read under the inode
 * size lock since they may be updated concurrently by glimpse
 * callbacks. */
3231 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
3232 struct lookup_intent *it, struct kstat *stat)
3234 struct inode *inode = de->d_inode;
3237 res = ll_inode_revalidate_it(de, it);
3238 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETATTR, 1);
3243 stat->dev = inode->i_sb->s_dev;
3244 stat->ino = inode->i_ino;
3245 stat->mode = inode->i_mode;
3246 stat->nlink = inode->i_nlink;
3247 stat->uid = inode->i_uid;
3248 stat->gid = inode->i_gid;
3249 stat->rdev = kdev_t_to_nr(inode->i_rdev);
3250 stat->atime = inode->i_atime;
3251 stat->mtime = inode->i_mtime;
3252 stat->ctime = inode->i_ctime;
3253 #ifdef HAVE_INODE_BLKSIZE
3254 stat->blksize = inode->i_blksize;
3256 stat->blksize = 1<<inode->i_blkbits;
/* i_size/i_blocks can change under us via DLM callbacks; snapshot
 * them consistently under the size lock. */
3259 ll_inode_size_lock(inode, 0);
3260 stat->size = i_size_read(inode);
3261 stat->blocks = inode->i_blocks;
3262 ll_inode_size_unlock(inode, 0);
/* VFS ->getattr entry point: wraps ll_getattr_it with a plain
 * IT_GETATTR intent. */
3266 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3268 struct lookup_intent it = { .it_op = IT_GETATTR };
3270 return ll_getattr_it(mnt, de, &it, stat);
/* ACL permission helper passed to generic_permission().  Takes a
 * reference on the cached POSIX ACL under lli_lock (the ACL may be
 * replaced by an MDS update at any time), evaluates it, and releases
 * it.  Compiled out (behavior elided here) without
 * CONFIG_FS_POSIX_ACL. */
3274 int lustre_check_acl(struct inode *inode, int mask)
3276 #ifdef CONFIG_FS_POSIX_ACL
3277 struct ll_inode_info *lli = ll_i2info(inode);
3278 struct posix_acl *acl;
/* Duplicate under the spinlock so we can evaluate without holding
 * lli_lock across posix_acl_permission(). */
3282 spin_lock(&lli->lli_lock);
3283 acl = posix_acl_dup(lli->lli_posix_acl);
3284 spin_unlock(&lli->lli_lock);
3289 rc = posix_acl_permission(inode, acl, mask);
3290 posix_acl_release(acl);
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10))
/* ->permission for kernels >= 2.6.10: delegate to generic_permission()
 * with lustre_check_acl as the ACL callback.  The 2-arg vs 3-arg
 * prototype difference between kernel versions is hidden behind
 * HAVE_INODE_PERMISION_2ARGS. */
3299 #ifndef HAVE_INODE_PERMISION_2ARGS
3300 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3302 int ll_inode_permission(struct inode *inode, int mask)
3305 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
3306 inode->i_ino, inode->i_generation, inode, mask);
3308 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
3309 return generic_permission(inode, mask, lustre_check_acl);
/* Pre-2.6.10 fallback: open-coded equivalent of generic_permission()
 * (owner/group/other mode bits, ACL check, then capability overrides).
 * NOTE(review): several branch bodies and returns are elided from this
 * listing; structure mirrors the classic vfs_permission(). */
3312 #ifndef HAVE_INODE_PERMISION_2ARGS
3313 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3315 int ll_inode_permission(struct inode *inode, int mask)
3318 int mode = inode->i_mode;
3321 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
3322 inode->i_ino, inode->i_generation, inode, mask);
3323 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
/* Writes to read-only or immutable inodes are refused outright. */
3325 if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
3326 (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
3328 if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
3330 if (current->fsuid == inode->i_uid) {
3333 if (((mode >> 3) & mask & S_IRWXO) != mask)
3335 rc = lustre_check_acl(inode, mask);
3339 goto check_capabilities;
3343 if (in_group_p(inode->i_gid))
3346 if ((mode & mask & S_IRWXO) == mask)
/* Capability overrides: DAC_OVERRIDE grants everything except
 * execute on non-executable regular files; DAC_READ_SEARCH grants
 * read and directory search. */
3350 if (!(mask & MAY_EXEC) ||
3351 (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
3352 if (cfs_capable(CFS_CAP_DAC_OVERRIDE))
3355 if (cfs_capable(CFS_CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) ||
3356 (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))))
3363 /* -o localflock - only provides locally consistent flock locks */
/* Default file_operations (-o localflock or no flock option): no
 * ->flock/->lock entries, so lock calls get only local, per-node
 * semantics from the VFS.  readv/aio_read and the splice/sendfile
 * entries are selected by kernel-version feature macros. */
3364 struct file_operations ll_file_operations = {
3365 .read = ll_file_read,
3366 #ifdef HAVE_FILE_READV
3367 .readv = ll_file_readv,
3369 .aio_read = ll_file_aio_read,
3371 .write = ll_file_write,
3372 #ifdef HAVE_FILE_WRITEV
3373 .writev = ll_file_writev,
3375 .aio_write = ll_file_aio_write,
3377 .ioctl = ll_file_ioctl,
3378 .open = ll_file_open,
3379 .release = ll_file_release,
3380 .mmap = ll_file_mmap,
3381 .llseek = ll_file_seek,
3382 #ifdef HAVE_KERNEL_SPLICE_READ
3383 .splice_read = ll_file_splice_read,
3385 #ifdef HAVE_KERNEL_SENDFILE
3386 .sendfile = ll_file_sendfile,
/* file_operations used with the -o flock mount option: identical to
 * ll_file_operations but wires ->flock/->lock to ll_file_flock for
 * cluster-coherent, server-mediated locking. */
3391 struct file_operations ll_file_operations_flock = {
3392 .read = ll_file_read,
3393 #ifdef HAVE_FILE_READV
3394 .readv = ll_file_readv,
3396 .aio_read = ll_file_aio_read,
3398 .write = ll_file_write,
3399 #ifdef HAVE_FILE_WRITEV
3400 .writev = ll_file_writev,
3402 .aio_write = ll_file_aio_write,
3404 .ioctl = ll_file_ioctl,
3405 .open = ll_file_open,
3406 .release = ll_file_release,
3407 .mmap = ll_file_mmap,
3408 .llseek = ll_file_seek,
3409 #ifdef HAVE_KERNEL_SPLICE_READ
3410 .splice_read = ll_file_splice_read,
3412 #ifdef HAVE_KERNEL_SENDFILE
3413 .sendfile = ll_file_sendfile,
3416 #ifdef HAVE_F_OP_FLOCK
3417 .flock = ll_file_flock,
3419 .lock = ll_file_flock
3422 /* These are for -o noflock - to return ENOSYS on flock calls */
/* file_operations used with the -o noflock mount option: lock entry
 * points go to ll_file_noflock so user lock requests are rejected
 * (ENOSYS) rather than silently given local-only semantics. */
3423 struct file_operations ll_file_operations_noflock = {
3424 .read = ll_file_read,
3425 #ifdef HAVE_FILE_READV
3426 .readv = ll_file_readv,
3428 .aio_read = ll_file_aio_read,
3430 .write = ll_file_write,
3431 #ifdef HAVE_FILE_WRITEV
3432 .writev = ll_file_writev,
3434 .aio_write = ll_file_aio_write,
3436 .ioctl = ll_file_ioctl,
3437 .open = ll_file_open,
3438 .release = ll_file_release,
3439 .mmap = ll_file_mmap,
3440 .llseek = ll_file_seek,
3441 #ifdef HAVE_KERNEL_SPLICE_READ
3442 .splice_read = ll_file_splice_read,
3444 #ifdef HAVE_KERNEL_SENDFILE
3445 .sendfile = ll_file_sendfile,
3448 #ifdef HAVE_F_OP_FLOCK
3449 .flock = ll_file_noflock,
3451 .lock = ll_file_noflock
/* inode_operations for regular Lustre files: attribute and xattr entry
 * points.  setattr vs setattr_raw is chosen by whether the kernel
 * carries the Lustre VFS intent patches. */
3454 struct inode_operations ll_file_inode_operations = {
3455 #ifdef HAVE_VFS_INTENT_PATCHES
3456 .setattr_raw = ll_setattr_raw,
3458 .setattr = ll_setattr,
3459 .truncate = ll_truncate,
3460 .getattr = ll_getattr,
3461 .permission = ll_inode_permission,
3462 .setxattr = ll_setxattr,
3463 .getxattr = ll_getxattr,
3464 .listxattr = ll_listxattr,
3465 .removexattr = ll_removexattr,
3468 /* dynamic ioctl number support routines */
/* Registry of dynamically registered ioctl handlers: a list of
 * llioc_data entries protected by a reader/writer semaphore (readers:
 * ioctl dispatch; writers: register/unregister). */
3469 static struct llioc_ctl_data {
3470 struct rw_semaphore ioc_sem;
3471 struct list_head ioc_head;
3473 __RWSEM_INITIALIZER(llioc.ioc_sem),
3474 CFS_LIST_HEAD_INIT(llioc.ioc_head)
/* One registered ioctl handler: callback plus the array of command
 * numbers it serves (iocd_cmd is a trailing variable-length array;
 * iocd_size records the full allocation size for freeing). */
3479 struct list_head iocd_list;
3480 unsigned int iocd_size;
3481 llioc_callback_t iocd_cb;
3482 unsigned int iocd_count;
3483 unsigned int iocd_cmd[0];
/* Register callback @cb for @count ioctl command numbers in @cmd.
 * Returns an opaque cookie (the allocation itself) to pass to
 * ll_iocontrol_unregister(), or NULL on bad arguments / allocation
 * failure (return lines elided from this listing). */
3486 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3489 struct llioc_data *in_data = NULL;
3492 if (cb == NULL || cmd == NULL ||
3493 count > LLIOC_MAX_CMD || count < 0)
/* Single allocation holds the header plus the trailing command array. */
3496 size = sizeof(*in_data) + count * sizeof(unsigned int);
3497 OBD_ALLOC(in_data, size);
3498 if (in_data == NULL)
3501 memset(in_data, 0, sizeof(*in_data));
3502 in_data->iocd_size = size;
3503 in_data->iocd_cb = cb;
3504 in_data->iocd_count = count;
3505 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3507 down_write(&llioc.ioc_sem);
3508 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3509 up_write(&llioc.ioc_sem);
/* Remove and free the handler previously registered under @magic
 * (the cookie returned by ll_iocontrol_register).  Warns if the cookie
 * is not found.  The entry-match condition inside the loop is elided
 * from this listing (presumably tmp == magic). */
3514 void ll_iocontrol_unregister(void *magic)
3516 struct llioc_data *tmp;
3521 down_write(&llioc.ioc_sem);
3522 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* iocd_size is saved before unlinking since it is needed for the
 * OBD_FREE after the entry is removed. */
3524 unsigned int size = tmp->iocd_size;
3526 list_del(&tmp->iocd_list);
3527 up_write(&llioc.ioc_sem);
3529 OBD_FREE(tmp, size);
3533 up_write(&llioc.ioc_sem);
3535 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3538 EXPORT_SYMBOL(ll_iocontrol_register);
3539 EXPORT_SYMBOL(ll_iocontrol_unregister);
3541 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
3542 unsigned int cmd, unsigned long arg, int *rcp)
3544 enum llioc_iter ret = LLIOC_CONT;
3545 struct llioc_data *data;
3546 int rc = -EINVAL, i;
3548 down_read(&llioc.ioc_sem);
3549 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3550 for (i = 0; i < data->iocd_count; i++) {
3551 if (cmd != data->iocd_cmd[i])
3554 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3558 if (ret == LLIOC_STOP)
3561 up_read(&llioc.ioc_sem);