/** Layers are free to decide between local and global locking. */
CILR_MAYBE,
/** Never lock: there is no cache (e.g., liblustre). */
- CILR_NEVER
+ CILR_NEVER,
+ /** Peek lock: use existing locks, don't queue new ones. */
+ CILR_PEEK
};
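To make the new mode concrete, here is a minimal userspace sketch of the intended CILR_PEEK semantics, assuming a toy lock cache; take_lock(), find_cached_lock() and enqueue_lock() are hypothetical stand-ins, not the CLIO API. The real consumer is the cl_lock_peek()/cl_lock_request() branch later in this patch: a peek may only reuse an already-cached lock and never enqueues a new one.

    #include <stdio.h>
    #include <stdbool.h>

    enum cilr { CILR_MANDATORY, CILR_MAYBE, CILR_NEVER, CILR_PEEK };

    /* hypothetical stand-ins for the CLIO lock machinery */
    static bool cached[2];                        /* toy per-extent lock cache */
    static bool find_cached_lock(int ext) { return cached[ext]; }
    static bool enqueue_lock(int ext) { cached[ext] = true; return true; }

    /* returns true if the i/o may proceed under a lock */
    static bool take_lock(enum cilr mode, int ext)
    {
            switch (mode) {
            case CILR_NEVER:
                    return true;                  /* no cache, nothing to lock */
            case CILR_PEEK:
                    return find_cached_lock(ext); /* reuse only; never enqueue */
            default:
                    return find_cached_lock(ext) || enqueue_lock(ext);
            }
    }

    int main(void)
    {
            cached[1] = true;
            printf("peek miss: %d\n", take_lock(CILR_PEEK, 0)); /* 0: -ENODATA path */
            printf("peek hit:  %d\n", take_lock(CILR_PEEK, 1)); /* 1: reuse lock */
            return 0;
    }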
struct cl_io_rw_common {
pgoff_t ft_index;
/** number of valid bytes on a faulted page. */
int ft_nob;
- /** writable page? */
+ /** writable page? for nopage() only */
int ft_writable;
/** page of an executable? */
int ft_executable;
+ /** page_mkwrite() */
+ int ft_mkwrite;
/** resulting page */
struct cl_page *ft_page;
} ci_fault;
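To show how the descriptor distinguishes the two entry points: a plain read/write fault fills ft_writable from the VMA flags and leaves ft_mkwrite clear, while page_mkwrite() sets both (see ll_page_mkwrite0() below). The following is a trimmed userspace mirror of the descriptor for illustration only; the struct and helpers are not the kernel definitions.

    #include <stdbool.h>

    /* trimmed userspace mirror of ci_fault (illustrative only) */
    struct fault_desc {
            unsigned long ft_index; /* index of the faulting page */
            int ft_writable;        /* write fault? */
            int ft_mkwrite;         /* entered via page_mkwrite()? */
    };

    static void init_read_fault(struct fault_desc *f, unsigned long idx,
                                bool shared_writable)
    {
            f->ft_index = idx;
            f->ft_writable = shared_writable; /* (vm_flags & (VM_SHARED|VM_WRITE)) */
            f->ft_mkwrite = 0;
    }

    static void init_mkwrite(struct fault_desc *f, unsigned long idx)
    {
            f->ft_index = idx;
            f->ft_writable = 1;               /* mkwrite is always a write */
            f->ft_mkwrite = 1;
    }

    int main(void)
    {
            struct fault_desc rf, mw;

            init_read_fault(&rf, 7, true);
            init_mkwrite(&mw, 7);
            return rf.ft_mkwrite == 0 && mw.ft_mkwrite == 1 ? 0 : 1;
    }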
OBD_FL_CKSUM_RSVD2 = 0x00008000, /* for future cksum types */
OBD_FL_CKSUM_RSVD3 = 0x00010000, /* for future cksum types */
OBD_FL_SHRINK_GRANT = 0x00020000, /* object shrink the grant */
- OBD_FL_MMAP = 0x00040000, /* object is mmapped on the client */
+ OBD_FL_MMAP = 0x00040000, /* object is mmapped on the client.
+                                          * XXX: obsolete - kept only for
+                                          * clients older than 2.2 */
OBD_FL_RECOV_RESEND = 0x00080000, /* recoverable resent */
OBD_FL_NOSPC_BLK = 0x00100000, /* no more block space on OST */
}
obdo_from_inode(oa, inode, &cl_i2info(inode)->lli_fid,
valid_flags & flags);
-#ifdef __KERNEL__
- /* Bug11742 - set the OBD_FL_MMAP flag for memory mapped files */
- if (cfs_atomic_read(&(cl_inode2ccc(inode)->cob_mmap_cnt)) != 0) {
- if (!(oa->o_valid & OBD_MD_FLFLAGS)) {
- oa->o_valid |= OBD_MD_FLFLAGS;
- oa->o_flags = OBD_FL_MMAP;
- } else {
- oa->o_flags |= OBD_FL_MMAP;
- }
- }
-#endif
}
const struct cl_req_operations ccc_req_ops = {
struct vm_area_struct *vma, unsigned long addr, size_t count);
struct vm_area_struct *our_vma(unsigned long addr, size_t count);
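+/* Unmap @vmpage from every VMA that maps it, then drop it from the
+ * page cache. The caller must hold the page lock. */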
+static inline void ll_invalidate_page(struct page *vmpage)
+{
+ struct address_space *mapping = vmpage->mapping;
+       loff_t offset = (loff_t)vmpage->index << PAGE_CACHE_SHIFT;
+
+ LASSERT(PageLocked(vmpage));
+ if (mapping == NULL)
+ return;
+
+ ll_teardown_mmaps(mapping, offset, offset + CFS_PAGE_SIZE);
+ truncate_complete_page(mapping, vmpage);
+}
+
#define ll_s2sbi(sb) (s2lsi(sb)->lsi_llsbi)
/* don't need an addref as the sb_info should be holding one */
#include "llite_internal.h"
#include <linux/lustre_compat25.h>
-#define VMA_DEBUG(vma, fmt, arg...) \
- CDEBUG(D_MMAP, "vma(%p) start(%ld) end(%ld) pgoff(%ld) inode(%p) " \
- "ino(%lu) iname(%s): " fmt, vma, vma->vm_start, vma->vm_end, \
- vma->vm_pgoff, vma->vm_file->f_dentry->d_inode, \
- vma->vm_file->f_dentry->d_inode->i_ino, \
- vma->vm_file->f_dentry->d_iname, ## arg); \
-
struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address,
int *type);
{
struct file *file = vma->vm_file;
struct inode *inode = file->f_dentry->d_inode;
- const unsigned long writable = VM_SHARED|VM_WRITE;
struct cl_io *io;
struct cl_fault_io *fio;
struct lu_env *env;
fio = &io->u.ci_fault;
fio->ft_index = index;
- fio->ft_writable = (vma->vm_flags&writable) == writable;
fio->ft_executable = vma->vm_flags&VM_EXEC;
/*
* the kernel will not read other pages not covered by ldlm in
* filemap_nopage. we do our readahead in ll_readpage.
*/
- *ra_flags = vma->vm_flags & (VM_RAND_READ|VM_SEQ_READ);
+ if (ra_flags != NULL)
+ *ra_flags = vma->vm_flags & (VM_RAND_READ|VM_SEQ_READ);
vma->vm_flags &= ~VM_SEQ_READ;
vma->vm_flags |= VM_RAND_READ;
- CDEBUG(D_INFO, "vm_flags: %lx (%lu %d %d)\n", vma->vm_flags,
- fio->ft_index, fio->ft_writable, fio->ft_executable);
+ CDEBUG(D_MMAP, "vm_flags: %lx (%lu %d)\n", vma->vm_flags,
+ fio->ft_index, fio->ft_executable);
if (cl_io_init(env, io, CIT_FAULT, io->ci_obj) == 0) {
struct ccc_io *cio = ccc_env_io(env);
return io;
}
+/* Shared page_mkwrite() code for the rhel5 and rhel6 paths.
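+ *
+ * Returns 0 with the vmpage locked on success; -ENODATA when the page
+ * was truncated or no cached lock could be found (the kernel retries
+ * the fault); -EAGAIN with *retry set when the page was cleaned under
+ * us; or another negative errno from the cl_io machinery. */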
+static int ll_page_mkwrite0(struct vm_area_struct *vma, struct page *vmpage,
+ bool *retry)
+{
+ struct lu_env *env;
+ struct cl_io *io;
+ struct vvp_io *vio;
+ struct cl_env_nest nest;
+ int result;
+ ENTRY;
+
+ LASSERT(vmpage != NULL);
+
+ io = ll_fault_io_init(vma, &env, &nest, vmpage->index, NULL);
+ if (IS_ERR(io))
+ GOTO(out, result = PTR_ERR(io));
+
+ result = io->ci_result;
+ if (result < 0)
+ GOTO(out, result);
+
+       /* Don't enqueue new locks for page_mkwrite().
+        * If the lock has been cancelled, the page must have been
+        * truncated; in that case the kernel will handle it.
+        */
+ io->ci_lockreq = CILR_PEEK;
+ io->u.ci_fault.ft_mkwrite = 1;
+ io->u.ci_fault.ft_writable = 1;
+
+ vio = vvp_env_io(env);
+ vio->u.fault.ft_vma = vma;
+ vio->u.fault.ft_vmpage = vmpage;
+
+ result = cl_io_loop(env, io);
+
+ if (result == -ENODATA) /* peek failed, no lock caching. */
+ CDEBUG(D_MMAP, "race on page_mkwrite: %lx (%lu %p)\n",
+ vma->vm_flags, io->u.ci_fault.ft_index, vmpage);
+
+ if (result == 0 || result == -ENODATA) {
+ lock_page(vmpage);
+ if (vmpage->mapping == NULL) {
+ unlock_page(vmpage);
+
+                       /* The page was truncated and the lock was
+                        * cancelled; return -ENODATA so that
+                        * VM_FAULT_NOPAGE is returned to
+                        * handle_mm_fault(). */
+ if (result == 0)
+ result = -ENODATA;
+ } else if (result == -ENODATA) {
+                       /* Invalidate the page if the cl_lock is being
+                        * revoked. This is needed at least on RHEL5;
+                        * otherwise SIGBUS would be wrongly returned
+                        * to applications. */
+ ll_invalidate_page(vmpage);
+ LASSERT(vmpage->mapping == NULL);
+ unlock_page(vmpage);
+ } else if (!PageDirty(vmpage)) {
+                       /* Race: the page was cleaned by ptlrpcd after
+                        * it was unlocked, so it has to be added to the
+                        * dirty cache again; otherwise this
+                        * soon-to-be-dirty page won't consume any grant,
+                        * and worse, if the page is being transferred,
+                        * the RPC checksum will break.
+                        */
+ unlock_page(vmpage);
+
+ CDEBUG(D_MMAP, "Race on page_mkwrite %p/%lu, page has "
+ "been written out, retry.\n",
+ vmpage, vmpage->index);
+
+ *retry = true;
+ result = -EAGAIN;
+ }
+ }
+ EXIT;
+
+out:
+ cl_io_fini(env, io);
+ cl_env_nested_put(&nest, env);
+
+ CDEBUG(D_MMAP, "%s mkwrite with %d\n", cfs_current()->comm, result);
+
+ LASSERT(ergo(result == 0, PageLocked(vmpage)));
+       return result;
+}
+
#ifndef HAVE_VM_OP_FAULT
/**
* Lustre implementation of a vm_operations_struct::nopage() method, called by
unsigned long ra_flags;
pgoff_t pg_offset;
int result;
+ const unsigned long writable = VM_SHARED|VM_WRITE;
ENTRY;
pg_offset = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
if (result < 0)
goto out_err;
+ io->u.ci_fault.ft_writable = (vma->vm_flags&writable) == writable;
+
vio = vvp_env_io(env);
vio->u.fault.ft_vma = vma;
vio->u.fault.nopage.ft_address = address;
vio->u.fault.nopage.ft_type = type;
+ vio->u.fault.ft_vmpage = NULL;
result = cl_io_loop(env, io);
+ page = vio->u.fault.ft_vmpage;
+ if (result != 0 && page != NULL)
+ page_cache_release(page);
out_err:
- if (result == 0)
- page = vio->u.fault.ft_vmpage;
- else if (result == -ENOMEM)
+ if (result == -ENOMEM)
page = NOPAGE_OOM;
vma->vm_flags &= ~VM_RAND_READ;
RETURN(page);
}
+
+static int ll_page_mkwrite(struct vm_area_struct *vma, struct page *vmpage)
+{
+ int count = 0;
+ bool printed = false;
+ bool retry;
+ int result;
+
+ do {
+ retry = false;
+ result = ll_page_mkwrite0(vma, vmpage, &retry);
+
+ if (!printed && ++count > 16) {
+ CWARN("app(%s): the page %lu of file %lu is under heavy"
+ " contention.\n",
+ current->comm, page_index(vmpage),
+ vma->vm_file->f_dentry->d_inode->i_ino);
+ printed = true;
+ }
+ } while (retry);
+
+ if (result == 0)
+ unlock_page(vmpage);
+ else if (result == -ENODATA)
+               result = 0; /* the kernel will detect that the file was
+                            * truncated and retry the fault */
+
+ return result;
+}
+
#else
/**
* Lustre implementation of a vm_operations_struct::fault() method, called by
* \retval VM_FAULT_ERROR on general error
* \retval NOPAGE_OOM not enough memory to allocate a new page
*/
-int ll_fault0(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int ll_fault0(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct lu_env *env;
struct cl_io *io;
struct vvp_io *vio = NULL;
+ struct page *vmpage;
unsigned long ra_flags;
struct cl_env_nest nest;
int result;
vio->u.fault.fault.ft_vmf = vmf;
result = cl_io_loop(env, io);
+
+ vmpage = vio->u.fault.ft_vmpage;
+ if (result != 0 && vmpage != NULL) {
+ page_cache_release(vmpage);
+ vmf->page = NULL;
+ }
+
fault_ret = vio->u.fault.fault.ft_flags;
out_err:
- if ((result != 0) && !(fault_ret & VM_FAULT_RETRY))
- fault_ret |= VM_FAULT_ERROR;
+ if (result != 0 && fault_ret == 0)
+ fault_ret = VM_FAULT_ERROR;
vma->vm_flags |= ra_flags;
cl_io_fini(env, io);
cl_env_nested_put(&nest, env);
+ CDEBUG(D_MMAP, "%s fault %d/%d\n",
+ cfs_current()->comm, fault_ret, result);
RETURN(fault_ret);
}
-int ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
int count = 0;
bool printed = false;
}
return result;
}
+
+static int ll_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ int count = 0;
+ bool printed = false;
+ bool retry;
+ int result;
+
+ do {
+ retry = false;
+ result = ll_page_mkwrite0(vma, vmf->page, &retry);
+
+ if (!printed && ++count > 16) {
+ CWARN("app(%s): the page %lu of file %lu is under heavy"
+ " contention.\n",
+ current->comm, vmf->pgoff,
+ vma->vm_file->f_dentry->d_inode->i_ino);
+ printed = true;
+ }
+ } while (retry);
+
+       switch (result) {
+ case 0:
+ LASSERT(PageLocked(vmf->page));
+ result = VM_FAULT_LOCKED;
+ break;
+ case -ENODATA:
+ case -EFAULT:
+ result = VM_FAULT_NOPAGE;
+ break;
+ case -ENOMEM:
+ result = VM_FAULT_OOM;
+ break;
+ case -EAGAIN:
+ result = VM_FAULT_RETRY;
+ break;
+ default:
+ result = VM_FAULT_SIGBUS;
+ break;
+ }
+
+ return result;
+}
#endif
/**
#else
.fault = ll_fault,
#endif
+ .page_mkwrite = ll_page_mkwrite,
.open = ll_vm_open,
.close = ll_vm_close,
};
cfio->nopage.ft_address, (long)cfio->nopage.ft_type);
cfio->ft_vmpage = vmpage;
+ lock_page(vmpage);
return 0;
}
#else
static int vvp_io_kernel_fault(struct vvp_fault_io *cfio)
{
- cfio->fault.ft_flags = filemap_fault(cfio->ft_vma, cfio->fault.ft_vmf);
-
- if (cfio->fault.ft_vmf->page) {
- LL_CDEBUG_PAGE(D_PAGE, cfio->fault.ft_vmf->page,
- "got addr %p type NOPAGE\n",
- cfio->fault.ft_vmf->virtual_address);
- /*XXX workaround to bug in CLIO - he deadlocked with
- lock cancel if page locked */
- if (likely(cfio->fault.ft_flags & VM_FAULT_LOCKED)) {
- unlock_page(cfio->fault.ft_vmf->page);
- cfio->fault.ft_flags &= ~VM_FAULT_LOCKED;
+ struct vm_fault *vmf = cfio->fault.ft_vmf;
+
+ cfio->fault.ft_flags = filemap_fault(cfio->ft_vma, vmf);
+
+ if (vmf->page) {
+ LL_CDEBUG_PAGE(D_PAGE, vmf->page, "got addr %p type NOPAGE\n",
+ vmf->virtual_address);
+ if (unlikely(!(cfio->fault.ft_flags & VM_FAULT_LOCKED))) {
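+                       /* lower layers expect a locked page: take the
+                        * page lock and record it in ft_flags */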
+ lock_page(vmf->page);
+                       cfio->fault.ft_flags |= VM_FAULT_LOCKED;
}
- cfio->ft_vmpage = cfio->fault.ft_vmf->page;
+ cfio->ft_vmpage = vmf->page;
return 0;
}
- if (unlikely (cfio->fault.ft_flags & VM_FAULT_ERROR)) {
- CDEBUG(D_PAGE, "got addr %p - SIGBUS\n",
- cfio->fault.ft_vmf->virtual_address);
+ if (cfio->fault.ft_flags & VM_FAULT_SIGBUS) {
+ CDEBUG(D_PAGE, "got addr %p - SIGBUS\n", vmf->virtual_address);
return -EFAULT;
}
- if (unlikely (cfio->fault.ft_flags & VM_FAULT_NOPAGE)) {
- CDEBUG(D_PAGE, "got addr %p - OOM\n",
- cfio->fault.ft_vmf->virtual_address);
+ if (cfio->fault.ft_flags & VM_FAULT_OOM) {
+ CDEBUG(D_PAGE, "got addr %p - OOM\n", vmf->virtual_address);
return -ENOMEM;
}
- if (unlikely(cfio->fault.ft_flags & VM_FAULT_RETRY))
+ if (cfio->fault.ft_flags & VM_FAULT_RETRY)
return -EAGAIN;
- CERROR("unknow error in page fault!\n");
+ CERROR("unknow error in page fault %d!\n", cfio->fault.ft_flags);
return -EINVAL;
}
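As a quick sanity check of the translation above, the userspace mock below exercises the same branch order; the VM_FAULT_* values here are illustrative constants, not the kernel's, and the page-present path handled first in the real function is omitted.

    #include <stdio.h>
    #include <errno.h>

    /* illustrative stand-ins; the real values live in the kernel headers */
    #define VM_FAULT_SIGBUS 0x01
    #define VM_FAULT_OOM    0x02
    #define VM_FAULT_RETRY  0x04

    static int fault_flags_to_errno(unsigned int flags)
    {
            if (flags & VM_FAULT_SIGBUS)
                    return -EFAULT;  /* surfaced to the app as SIGBUS */
            if (flags & VM_FAULT_OOM)
                    return -ENOMEM;
            if (flags & VM_FAULT_RETRY)
                    return -EAGAIN;
            return -EINVAL;          /* "unknown error in page fault" */
    }

    int main(void)
    {
            printf("%d %d %d %d\n",
                   fault_flags_to_errno(VM_FAULT_SIGBUS),
                   fault_flags_to_errno(VM_FAULT_OOM),
                   fault_flags_to_errno(VM_FAULT_RETRY),
                   fault_flags_to_errno(0));
            return 0;
    }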
struct cl_fault_io *fio = &io->u.ci_fault;
struct vvp_fault_io *cfio = &vio->u.fault;
loff_t offset;
- int kernel_result = 0;
int result = 0;
+ cfs_page_t *vmpage = NULL;
struct cl_page *page;
loff_t size;
pgoff_t last; /* last page in a file data region */
if (result != 0)
return result;
- /* must return unlocked page */
- kernel_result = vvp_io_kernel_fault(cfio);
- if (kernel_result != 0)
- return kernel_result;
-
- if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_FAULT_TRUNC_RACE)) {
- truncate_inode_pages_range(inode->i_mapping,
- cl_offset(obj, fio->ft_index), offset);
+ /* must return locked page */
+ if (fio->ft_mkwrite) {
+ LASSERT(cfio->ft_vmpage != NULL);
+ lock_page(cfio->ft_vmpage);
+ } else {
+ result = vvp_io_kernel_fault(cfio);
+ if (result != 0)
+ return result;
}
- /* Temporarily lock vmpage to keep cl_page_find() happy. */
- lock_page(cfio->ft_vmpage);
+ vmpage = cfio->ft_vmpage;
+ LASSERT(PageLocked(vmpage));
+
+ if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_FAULT_TRUNC_RACE))
+ ll_invalidate_page(vmpage);
        /* Though we have already held a cl_lock upon this page, it
         * can still be truncated locally. */
- if (unlikely(cfio->ft_vmpage->mapping == NULL)) {
- unlock_page(cfio->ft_vmpage);
-
+ if (unlikely(vmpage->mapping == NULL)) {
CDEBUG(D_PAGE, "llite: fault and truncate race happened!\n");
/* return +1 to stop cl_io_loop() and ll_fault() will catch
* and retry. */
- return +1;
+ GOTO(out, result = +1);
}
- page = cl_page_find(env, obj, fio->ft_index, cfio->ft_vmpage,
- CPT_CACHEABLE);
- unlock_page(cfio->ft_vmpage);
- if (IS_ERR(page)) {
- page_cache_release(cfio->ft_vmpage);
- cfio->ft_vmpage = NULL;
- return PTR_ERR(page);
+ page = cl_page_find(env, obj, fio->ft_index, vmpage, CPT_CACHEABLE);
+ if (IS_ERR(page))
+ GOTO(out, result = PTR_ERR(page));
+
+       /* if the page is going to be written, we should add it into
+        * the cache early */
+ if (fio->ft_mkwrite) {
+ wait_on_page_writeback(vmpage);
+ if (set_page_dirty(vmpage)) {
+ struct ccc_page *cp;
+
+ /* vvp_page_assume() calls wait_on_page_writeback(). */
+ cl_page_assume(env, io, page);
+
+ cp = cl2ccc_page(cl_page_at(page, &vvp_device_type));
+ vvp_write_pending(cl2ccc(obj), cp);
+
+                       /* Do not set the Dirty bit here, so that if IO
+                        * starts before the page is really made dirty we
+                        * still have a chance to detect it. */
+ result = cl_page_cache_add(env, io, page, CRT_WRITE);
+ if (result < 0) {
+ cl_page_unassume(env, io, page);
+ cl_page_put(env, page);
+
+ /* we're in big trouble, what can we do now? */
+ if (result == -EDQUOT)
+ result = -ENOSPC;
+ GOTO(out, result);
+ }
+ }
}
size = i_size_read(inode);
last = cl_index(obj, size - 1);
+ LASSERT(fio->ft_index <= last);
if (fio->ft_index == last)
/*
* Last page is mapped partially.
*/
fio->ft_nob = size - cl_offset(obj, fio->ft_index);
- else
+ else
fio->ft_nob = cl_page_size(obj);
- lu_ref_add(&page->cp_reference, "fault", io);
- fio->ft_page = page;
- /*
- * Certain 2.6 kernels return not-NULL from
- * filemap_nopage() when page is beyond the file size,
- * on the grounds that "An external ptracer can access
- * pages that normally aren't accessible.." Don't
- * propagate such page fault to the lower layers to
- * avoid side-effects like KMS updates.
- */
- if (fio->ft_index > last)
- result = +1;
+ lu_ref_add(&page->cp_reference, "fault", io);
+ fio->ft_page = page;
+ EXIT;
+out:
+ /* return unlocked vmpage to avoid deadlocking */
+ unlock_page(vmpage);
+#ifdef HAVE_VM_OP_FAULT
+ cfio->fault.ft_flags &= ~VM_FAULT_LOCKED;
+#endif
return result;
}
ENTRY;
- lock = cl_lock_request(env, io, &link->cill_descr, "io", io);
+ if (io->ci_lockreq == CILR_PEEK) {
+ lock = cl_lock_peek(env, io, &link->cill_descr, "io", io);
+ if (lock == NULL)
+ lock = ERR_PTR(-ENODATA);
+       } else {
+               lock = cl_lock_request(env, io, &link->cill_descr, "io", io);
+       }
+
if (!IS_ERR(lock)) {
link->cill_lock = lock;
cfs_list_move(&link->cill_linkage, &set->cls_curr);
return 0;
}
- /* If this is mmaped file - it can be changed at any time */
- if (oa->o_valid & OBD_MD_FLFLAGS && oa->o_flags & OBD_FL_MMAP)
- return 1;
-
cksum_type = cksum_type_unpack(oa->o_valid & OBD_MD_FLFLAGS ?
oa->o_flags : 0);
new_cksum = osc_checksum_bulk(nob, page_count, pga, OST_WRITE,
rc = osc_brw_fini_request(req, rc);
CDEBUG(D_INODE, "request %p aa %p rc %d\n", req, aa, rc);
if (osc_recoverable_error(rc)) {
- /* Only retry once for mmaped files since the mmaped page
- * might be modified at anytime. We have to retry at least
- * once in case there WAS really a corruption of the page
- * on the network, that was not caused by mmap() modifying
- * the page. Bug11742 */
- if ((rc == -EAGAIN) && (aa->aa_resends > 0) &&
- aa->aa_oa->o_valid & OBD_MD_FLFLAGS &&
- aa->aa_oa->o_flags & OBD_FL_MMAP) {
- rc = 0;
- } else {
- rc = osc_brw_redo_request(req, aa);
- if (rc == 0)
- RETURN(0);
- }
+ rc = osc_brw_redo_request(req, aa);
+ if (rc == 0)
+ RETURN(0);
}
if (aa->aa_ocapa) {
rc = errno;
goto out;
}
- buf = mmap(NULL, page_size,
+ buf = mmap(NULL, page_size * 2,
PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (buf == MAP_FAILED) {
perror("mmap");
return rc;
}
+static int mmap_tst8(char *mnt)
+{
+ char fname[256];
+ char *buf = MAP_FAILED;
+ int fd = -1;
+ int rc = 0;
+ pid_t pid;
+ char xyz[page_size * 2];
+
+ if (snprintf(fname, 256, "%s/mmap_tst8", mnt) >= 256) {
+ fprintf(stderr, "dir name too long\n");
+ rc = ENAMETOOLONG;
+ goto out;
+ }
+ fd = open(fname, O_RDWR | O_CREAT, 0644);
+ if (fd == -1) {
+ perror("open");
+ rc = errno;
+ goto out;
+ }
+ if (ftruncate(fd, page_size) == -1) {
+ perror("truncate");
+ rc = errno;
+ goto out;
+ }
+ buf = mmap(NULL, page_size * 2,
+ PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+ if (buf == MAP_FAILED) {
+ perror("mmap");
+ rc = errno;
+ goto out;
+ }
+
+ pid = fork();
+ if (pid == 0) { /* child */
+ memcpy(xyz, buf, page_size * 2);
+ /* shouldn't reach here. */
+ exit(0);
+ } else if (pid > 0) { /* parent */
+ int status = 0;
+ pid = waitpid(pid, &status, 0);
+ if (pid < 0) {
+ perror("wait");
+ rc = errno;
+ goto out;
+ }
+
+ rc = EFAULT;
+ if (WIFSIGNALED(status) && SIGBUS == WTERMSIG(status))
+ rc = 0;
+ } else {
+ perror("fork");
+ rc = errno;
+ }
+
+out:
+ if (buf != MAP_FAILED)
+               munmap(buf, page_size * 2);
+ if (fd != -1)
+ close(fd);
+ return rc;
+}
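A side note on what mmap_tst8 asserts: on Linux, touching a page of a file mapping that lies wholly beyond EOF raises SIGBUS, not SIGSEGV. Here is a standalone sketch of the same behaviour, independent of this test harness (the /tmp path is an arbitrary choice for the demo):

    #include <fcntl.h>
    #include <signal.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <unistd.h>

    static void on_bus(int sig) { (void)sig; _exit(0); } /* SIGBUS == expected */

    int main(void)
    {
            long ps = sysconf(_SC_PAGESIZE);
            int fd = open("/tmp/sigbus_demo", O_RDWR | O_CREAT | O_TRUNC, 0644);
            char *p;

            if (fd < 0 || ftruncate(fd, ps) < 0)
                    return 1;
            /* map two pages over a one-page file */
            p = mmap(NULL, ps * 2, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
            if (p == MAP_FAILED)
                    return 1;
            signal(SIGBUS, on_bus);
            p[ps] = 1;      /* second page has no backing store -> SIGBUS */
            fprintf(stderr, "no SIGBUS delivered\n");
            return 1;
    }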
+
static int remote_tst(int tc, char *mnt)
{
int rc = 0;
}
return rc;
}
-
+
struct test_case {
int tc; /* test case number */
char *desc; /* test description */
{ 6, "mmap test6: check mmap write/read content on two nodes",
mmap_tst6, 2 },
{ 7, "mmap test7: file i/o with an unmapped buffer", mmap_tst7, 1},
+ { 8, "mmap test8: SIGBUS for beyond file size", mmap_tst8, 1},
{ 0, NULL, 0, 0 }
};