*/
ci_async_readahead:1,
/**
+ * Ignore lockless and do normal locking for this io.
+ */
+ ci_ignore_lockless:1,
+ /**
* Set if we've tried all mirrors for this read IO, if it's not set,
* the read IO will check to-be-read OSCs' status, and make fast-switch
* another mirror if some of the OSTs are not healthy.
* called by page WB daemon, or sync write or reading requests. */
oe_urgent:1,
/** Non-delay RPC should be used for this extent. */
- oe_ndelay:1;
+ oe_ndelay:1,
+ /** direct IO pages */
+ oe_dio:1;
/** how many grants allocated for this extent.
* Grant allocated for this extent. There is no grant allocated
* for reading extents and sync write extents. */
GOTO(out_och_free, rc);
mutex_unlock(&lli->lli_och_mutex);
- fd = NULL;
- /* Must do this outside lli_och_mutex lock to prevent deadlock where
- different kind of OPEN lock for this same inode gets cancelled
- by ldlm_cancel_lru */
- if (!S_ISREG(inode->i_mode))
- GOTO(out_och_free, rc);
+ /* lockless for direct IO so that it can do IO in parallel */
+ if (file->f_flags & O_DIRECT)
+ fd->fd_flags |= LL_FILE_LOCKLESS_IO;
+ fd = NULL;
+ /* Must do this outside lli_och_mutex lock to prevent deadlock where
+ different kind of OPEN lock for this same inode gets cancelled
+ by ldlm_cancel_lru */
+ if (!S_ISREG(inode->i_mode))
+ GOTO(out_och_free, rc);
cl_lov_delay_create_clear(&file->f_flags);
GOTO(out_och_free, rc);
int rc = 0;
unsigned retried = 0;
bool restarted = false;
+ unsigned ignore_lockless = 0;
ENTRY;
restart:
io = vvp_env_thread_io(env);
ll_io_init(io, file, iot, args);
+ io->ci_ignore_lockless = ignore_lockless;
io->ci_ndelay_tried = retried;
if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
file->f_path.dentry->d_name.name,
iot, rc, result, io->ci_need_restart);
- if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
+ if ((rc == 0 || rc == -ENODATA || rc == -ENOLCK) &&
+ count > 0 && io->ci_need_restart) {
CDEBUG(D_VFSTRACE,
"%s: restart %s from %lld, count: %zu, ret: %zd, rc: %d\n",
file_dentry(file)->d_name.name,
*ppos, count, result, rc);
/* preserve the tried count for FLR */
retried = io->ci_ndelay_tried;
+ ignore_lockless = io->ci_ignore_lockless;
restarted = true;
goto restart;
}
RETURN(result);
}
+ /**
+ * Direct read can fall back to buffered read, but DIO is done
+ * with lockless i/o, and buffered requires LDLM locking, so in
+ * this case we must restart without lockless.
+ */
+ if (file->f_flags & O_DIRECT &&
+ lcc && lcc->lcc_type == LCC_RW &&
+ !io->ci_ignore_lockless) {
+ unlock_page(vmpage);
+ io->ci_ignore_lockless = 1;
+ io->ci_need_restart = 1;
+ RETURN(-ENOLCK);
+ }
+
LASSERT(io->ci_state == CIS_IO_GOING);
page = cl_page_find(env, clob, vmpage->index, vmpage, CPT_CACHEABLE);
if (!IS_ERR(page)) {
env = lcc->lcc_env;
io = lcc->lcc_io;
- if (file->f_flags & O_DIRECT && io->ci_designated_mirror > 0) {
+ if (file->f_flags & O_DIRECT) {
/* direct IO failed because it couldn't clean up cached pages,
* this causes a problem for mirror write because the cached
* page may belong to another mirror, which will result in
* problem submitting the I/O. */
- GOTO(out, result = -EBUSY);
- }
+ if (io->ci_designated_mirror > 0)
+ GOTO(out, result = -EBUSY);
+ /**
+ * Direct read can fall back to buffered read, but DIO is done
+ * with lockless i/o, and buffered requires LDLM locking, so
+ * in this case we must restart without lockless.
+ */
+ if (!io->ci_ignore_lockless) {
+ io->ci_ignore_lockless = 1;
+ io->ci_need_restart = 1;
+ GOTO(out, result = -ENOLCK);
+ }
+ }
again:
/* To avoid deadlock, try to lock page first. */
vmpage = grab_cache_page_nowait(mapping, index);
ast_flags |= CEF_NONBLOCK;
if (io->ci_lock_no_expand)
ast_flags |= CEF_LOCK_NO_EXPAND;
+ if (vio->vui_fd) {
+ /* Group lock held means no lockless any more */
+ if (vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)
+ io->ci_ignore_lockless = 1;
+
+ if (ll_file_nolock(vio->vui_fd->fd_file) ||
+ (vio->vui_fd->fd_flags & LL_FILE_LOCKLESS_IO &&
+ !io->ci_ignore_lockless))
+ ast_flags |= CEF_NEVER;
+ }
result = vvp_mmap_locks(env, vio, io);
if (result == 0)
},
};
-static int vvp_transient_page_prep(const struct lu_env *env,
- const struct cl_page_slice *slice,
- struct cl_io *unused)
-{
- ENTRY;
- /* transient page should always be sent. */
- RETURN(0);
-}
-
-static void vvp_transient_page_verify(const struct cl_page *page)
-{
-}
-
-static int vvp_transient_page_own(const struct lu_env *env,
- const struct cl_page_slice *slice,
- struct cl_io *unused, int nonblock)
-{
- vvp_transient_page_verify(slice->cpl_page);
- return 0;
-}
-
-static void vvp_transient_page_assume(const struct lu_env *env,
- const struct cl_page_slice *slice,
- struct cl_io *unused)
-{
- vvp_transient_page_verify(slice->cpl_page);
-}
-
-static void vvp_transient_page_unassume(const struct lu_env *env,
- const struct cl_page_slice *slice,
- struct cl_io *unused)
-{
- vvp_transient_page_verify(slice->cpl_page);
-}
-
-static void vvp_transient_page_disown(const struct lu_env *env,
- const struct cl_page_slice *slice,
- struct cl_io *unused)
-{
- vvp_transient_page_verify(slice->cpl_page);
-}
-
static void vvp_transient_page_discard(const struct lu_env *env,
const struct cl_page_slice *slice,
struct cl_io *unused)
{
struct cl_page *page = slice->cpl_page;
- vvp_transient_page_verify(slice->cpl_page);
-
/*
* For transient pages, remove it from the radix tree.
*/
static int vvp_transient_page_is_vmlocked(const struct lu_env *env,
const struct cl_page_slice *slice)
{
- struct inode *inode = vvp_object_inode(slice->cpl_obj);
- int locked;
-
- locked = !inode_trylock(inode);
- if (!locked)
- inode_unlock(inode);
- return locked ? -EBUSY : -ENODATA;
-}
-
-static void
-vvp_transient_page_completion(const struct lu_env *env,
- const struct cl_page_slice *slice,
- int ioret)
-{
- vvp_transient_page_verify(slice->cpl_page);
+ return -EBUSY;
}
static void vvp_transient_page_fini(const struct lu_env *env,
struct pagevec *pvec)
{
struct vvp_page *vpg = cl2vvp_page(slice);
- struct cl_page *clp = slice->cpl_page;
- struct vvp_object *clobj = cl2vvp(clp->cp_obj);
+ struct vvp_object *clobj = cl2vvp(slice->cpl_obj);
vvp_page_fini_common(vpg, pvec);
atomic_dec(&clobj->vob_transient_pages);
}
static const struct cl_page_operations vvp_transient_page_ops = {
- .cpo_own = vvp_transient_page_own,
- .cpo_assume = vvp_transient_page_assume,
- .cpo_unassume = vvp_transient_page_unassume,
- .cpo_disown = vvp_transient_page_disown,
.cpo_discard = vvp_transient_page_discard,
.cpo_fini = vvp_transient_page_fini,
.cpo_is_vmlocked = vvp_transient_page_is_vmlocked,
.cpo_print = vvp_page_print,
- .io = {
- [CRT_READ] = {
- .cpo_prep = vvp_transient_page_prep,
- .cpo_completion = vvp_transient_page_completion,
- },
- [CRT_WRITE] = {
- .cpo_prep = vvp_transient_page_prep,
- .cpo_completion = vvp_transient_page_completion,
- }
- }
};
int vvp_page_init(const struct lu_env *env, struct cl_object *obj,
RETURN(rc);
}
-static inline int overlapped(struct osc_extent *ex1, struct osc_extent *ex2)
+static inline bool
+overlapped(const struct osc_extent *ex1, const struct osc_extent *ex2)
{
return !(ex1->oe_end < ex2->oe_start || ex2->oe_end < ex1->oe_start);
}
return (ext->oe_end >> ppc_bits) - (ext->oe_start >> ppc_bits) + 1;
}
+static inline bool
+can_merge(const struct osc_extent *ext, const struct osc_extent *in_rpc)
+{
+ if (ext->oe_no_merge || in_rpc->oe_no_merge)
+ return false;
+
+ if (ext->oe_srvlock != in_rpc->oe_srvlock)
+ return false;
+
+ if (ext->oe_ndelay != in_rpc->oe_ndelay)
+ return false;
+
+ if (!ext->oe_grants != !in_rpc->oe_grants)
+ return false;
+
+ if (ext->oe_dio != in_rpc->oe_dio)
+ return false;
+
+ /* It's possible to have overlap on DIO */
+ if (in_rpc->oe_dio && overlapped(ext, in_rpc))
+ return false;
+
+ return true;
+}
+
/**
* Try to add extent to one RPC. We need to think about the following things:
* - # of pages must not be over max_pages_per_rpc
{
struct osc_extent *tmp;
unsigned int chunk_count;
- struct osc_async_page *oap = list_first_entry(&ext->oe_pages,
- struct osc_async_page,
- oap_pending_item);
ENTRY;
EASSERT((ext->oe_state == OES_CACHE || ext->oe_state == OES_LOCK_DONE),
RETURN(0);
list_for_each_entry(tmp, data->erd_rpc_list, oe_link) {
- struct osc_async_page *oap2;
- oap2 = list_first_entry(&tmp->oe_pages, struct osc_async_page,
- oap_pending_item);
EASSERT(tmp->oe_owner == current, tmp);
-#if 0
- if (overlapped(tmp, ext)) {
- OSC_EXTENT_DUMP(D_ERROR, tmp, "overlapped %p.\n", ext);
- EASSERT(0, ext);
- }
-#endif
- if (oap2cl_page(oap)->cp_type != oap2cl_page(oap2)->cp_type) {
- CDEBUG(D_CACHE, "Do not permit different types of IO "
- "in one RPC\n");
- RETURN(0);
- }
- if (tmp->oe_srvlock != ext->oe_srvlock ||
- !tmp->oe_grants != !ext->oe_grants ||
- tmp->oe_ndelay != ext->oe_ndelay ||
- tmp->oe_no_merge || ext->oe_no_merge)
+ if (!can_merge(ext, tmp))
RETURN(0);
/* remove break for strict check */
ext->oe_obj = obj;
ext->oe_srvlock = !!(brw_flags & OBD_BRW_SRVLOCK);
ext->oe_ndelay = !!(brw_flags & OBD_BRW_NDELAY);
+ ext->oe_dio = !!(brw_flags & OBD_BRW_NOCACHE);
ext->oe_nr_pages = page_count;
ext->oe_mppr = mppr;
list_splice_init(list, &ext->oe_pages);
if (crt == CRT_READ && ios->cis_io->ci_ndelay)
brw_flags |= OBD_BRW_NDELAY;
+ page = cl_page_list_first(qin);
+ if (page->cp_type == CPT_TRANSIENT)
+ brw_flags |= OBD_BRW_NOCACHE;
+
/*
* NOTE: here @page is a top-level page. This is done to avoid
* creation of sub-page-list.
}
run_test 319 "lost lease lock on migrate error"
+test_398a() { # LU-4198
+ $LFS setstripe -c 1 -i 0 $DIR/$tfile
+ $LCTL set_param ldlm.namespaces.*.lru_size=clear
+
+ # request a new lock on client
+ dd if=/dev/zero of=$DIR/$tfile bs=1M count=1
+
+ dd if=/dev/zero of=$DIR/$tfile bs=1M count=1 oflag=direct conv=notrunc
+ local lock_count=$($LCTL get_param -n \
+ ldlm.namespaces.*-OST0000-osc-ffff*.lru_size)
+ [[ $lock_count -eq 0 ]] || error "lock should be cancelled by direct IO"
+
+ $LCTL set_param ldlm.namespaces.*-OST0000-osc-ffff*.lru_size=clear
+
+ # no lock cached, should use lockless IO and not enqueue new lock
+ dd if=/dev/zero of=$DIR/$tfile bs=1M count=1 oflag=direct conv=notrunc
+ lock_count=$($LCTL get_param -n \
+ ldlm.namespaces.*-OST0000-osc-ffff*.lru_size)
+ [[ $lock_count -eq 0 ]] || error "no lock should be held by direct IO"
+}
+run_test 398a "direct IO should cancel lock otherwise lockless"
+
+test_398b() { # LU-4198
+ which fio || skip_env "no fio installed"
+ $LFS setstripe -c -1 $DIR/$tfile
+
+ local size=12
+ dd if=/dev/zero of=$DIR/$tfile bs=1M count=$size
+
+ local njobs=4
+ echo "mix direct rw ${size}M to OST0 by fio with $njobs jobs..."
+ fio --name=rand-rw --rw=randrw --bs=$PAGE_SIZE --direct=1 \
+ --numjobs=$njobs --fallocate=none \
+ --iodepth=16 --allow_file_create=0 --size=$((size/njobs))M \
+ --filename=$DIR/$tfile &
+ bg_pid=$!
+
+ echo "mix buffer rw ${size}M to OST0 by fio with $njobs jobs..."
+ fio --name=rand-rw --rw=randrw --bs=$PAGE_SIZE \
+ --numjobs=$njobs --fallocate=none \
+ --iodepth=16 --allow_file_create=0 --size=$((size/njobs))M \
+ --filename=$DIR/$tfile || true
+ wait $bg_pid
+
+ rm -rf $DIR/$tfile
+}
+run_test 398b "DIO and buffer IO race"
+
test_fake_rw() {
local read_write=$1
if [ "$read_write" = "write" ]; then
}
run_test 16c "verify data consistency on ldiskfs with cache disabled (b=17397)"
+test_16d() {
+ local file1=$DIR1/$tfile
+ local file2=$DIR2/$tfile
+ local file3=$DIR1/file
+ local stripe_size=$(do_facet $SINGLEMDS \
+ "$LCTL get_param -n lod.$(facet_svc $SINGLEMDS)*.stripesize")
+
+ # to allocate grant because it may run out due to test_15.
+ $LFS setstripe -c -1 $file1
+ dd if=/dev/zero of=$file1 bs=$stripe_size count=$OSTCOUNT oflag=sync
+ dd if=/dev/zero of=$file2 bs=$stripe_size count=$OSTCOUNT oflag=sync
+ rm -f $file1
+
+ local tmpfile=`mktemp`
+ $LFS setstripe -c -1 $file1 # b=10919
+ $LCTL set_param ldlm.namespaces.*.lru_size=clear
+
+ # direct write on one client and direct read from another
+ dd if=/dev/urandom of=$file1 bs=1M count=100 oflag=direct
+ dd if=$file2 of=$tmpfile iflag=direct bs=1M
+ diff $file1 $tmpfile || error "file different(1)"
+ rm -f $file1
+
+ # buffer write on one client, but direct read from another
+ dd if=$tmpfile of=$file1 bs=1M count=100
+ dd if=$file2 of=$file3 bs=1M iflag=direct count=100
+ diff $file3 $tmpfile || error "file different(2)"
+
+ rm -f $file3 $file2 $file1
+ # direct write on one client
+ dd if=$tmpfile of=$file1 bs=1M count=100 oflag=direct
+ # buffer read from another client
+ dd if=$file2 of=$file3 bs=1M count=100
+ diff $file3 $tmpfile || error "file different(3)"
+
+ rm -f $file1 $file2 $file3 $tmpfile
+
+}
+run_test 16d "Verify DIO and buffer IO with two clients"
+
test_17() { # bug 3513, 3667
remote_ost_nodsh && skip "remote OST with nodsh" && return