From e46545c5af0b582e292b658cf741c47fdde343e9 Mon Sep 17 00:00:00 2001 From: jxiong Date: Tue, 20 Jan 2009 04:30:21 +0000 Subject: [PATCH] b=5498 r=nikita,adilger Porting lloop driver to HEAD, and add a new test to verify the basic function of lloop driver in sanity --- lustre/llite/llite_internal.h | 21 +++ lustre/llite/lloop.c | 384 ++++++++++++++++++++++++++---------------- lustre/llite/rw26.c | 39 ++++- lustre/llite/vvp_page.c | 7 +- lustre/obdclass/cl_page.c | 7 +- lustre/tests/sanity.sh | 38 ++++- lustre/utils/obd.c | 7 +- 7 files changed, 336 insertions(+), 167 deletions(-) diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h index a03e1bf..f34c5d3 100644 --- a/lustre/llite/llite_internal.h +++ b/lustre/llite/llite_internal.h @@ -1216,4 +1216,25 @@ static inline int cl_merge_lvb(struct inode *inode) struct obd_capa *cl_capa_lookup(struct inode *inode, enum cl_req_type crt); +/** direct write pages */ +struct ll_dio_pages { + /** page array to be written. we don't support + * partial pages except the last one. */ + struct page **ldp_pages; + /* offset of each page */ + loff_t *ldp_offsets; + /** if ldp_offsets is NULL, it means a sequential + * pages to be written, then this is the file offset + * of the * first page. */ + loff_t ldp_start_offset; + /** how many bytes are to be written. */ + size_t ldp_size; + /** # of pages in the array. */ + int ldp_nr; +}; + +extern ssize_t ll_direct_rw_pages(const struct lu_env *env, struct cl_io *io, + int rw, struct inode *inode, + struct ll_dio_pages *pv); + #endif /* LLITE_INTERNAL_H */ diff --git a/lustre/llite/lloop.c b/lustre/llite/lloop.c index 05026f1..0a1b98e 100644 --- a/lustre/llite/lloop.c +++ b/lustre/llite/lloop.c @@ -42,9 +42,6 @@ * Copyright 1993 by Theodore Ts'o. Redistribution of this file is * permitted under the GNU General Public License. * - * DES encryption plus some minor changes by Werner Almesberger, 30-MAY-1993 - * more DES encryption plus IDEA encryption by Nicholas J. 
Leon, June 20, 1996 - * * Modularized and updated for 1.1.16 kernel - Mitch Dsouza 28th May 1994 * Adapted for 1.3.59 kernel - Andries Brouwer, 1 Feb 1996 * @@ -56,10 +53,6 @@ * * Loadable modules and other fixes by AK, 1998 * - * Make real block number available to downstream transfer functions, enables - * CBC (and relatives) mode encryption requiring unique IVs per data block. - * Reed H. Petty, rhp@draper.net - * * Maximum number of loop devices now dynamic via max_loop module parameter. * Russell Kroll 19990701 * @@ -129,37 +122,40 @@ enum { }; struct lloop_device { - int lo_number; - int lo_refcnt; - loff_t lo_offset; - loff_t lo_sizelimit; - int lo_flags; + int lo_number; + int lo_refcnt; + loff_t lo_offset; + loff_t lo_sizelimit; + int lo_flags; int (*ioctl)(struct lloop_device *, int cmd, - unsigned long arg); + unsigned long arg); - struct file * lo_backing_file; + struct file *lo_backing_file; struct block_device *lo_device; - unsigned lo_blocksize; + unsigned lo_blocksize; - int old_gfp_mask; + int old_gfp_mask; - spinlock_t lo_lock; - struct bio *lo_bio; - struct bio *lo_biotail; - int lo_state; - struct semaphore lo_sem; - struct semaphore lo_ctl_mutex; - struct semaphore lo_bh_mutex; - atomic_t lo_pending; + spinlock_t lo_lock; + struct bio *lo_bio; + struct bio *lo_biotail; + int lo_state; + struct semaphore lo_sem; + struct semaphore lo_ctl_mutex; + atomic_t lo_pending; + wait_queue_head_t lo_bh_wait; - request_queue_t *lo_queue; + request_queue_t *lo_queue; + + const struct lu_env *lo_env; + struct cl_io lo_io; + struct ll_dio_pages lo_pvec; /* data to handle bio for lustre. 
*/ struct lo_request_data { - struct brw_page lrd_pages[LLOOP_MAX_SEGMENTS]; - struct obdo lrd_oa; + struct page *lrd_pages[LLOOP_MAX_SEGMENTS]; + loff_t lrd_offsets[LLOOP_MAX_SEGMENTS]; } lo_requests[1]; - }; /* @@ -170,7 +166,8 @@ enum { }; static int lloop_major; -static int max_loop = 8; +#define MAX_LOOP_DEFAULT 16 +static int max_loop = MAX_LOOP_DEFAULT; static struct lloop_device *loop_dev; static struct gendisk **disks; static struct semaphore lloop_mutex; @@ -194,63 +191,88 @@ static loff_t get_loop_size(struct lloop_device *lo, struct file *file) return loopsize >> 9; } -static int do_bio_filebacked(struct lloop_device *lo, struct bio *bio) +static int do_bio_lustrebacked(struct lloop_device *lo, struct bio *head) { - struct inode *inode = lo->lo_backing_file->f_dentry->d_inode; - struct ll_inode_info *lli = ll_i2info(inode); - struct lov_stripe_md *lsm = lli->lli_smd; - struct obd_info oinfo = {{{ 0 }}}; - struct brw_page *pg = lo->lo_requests[0].lrd_pages; - struct obdo *oa = &lo->lo_requests[0].lrd_oa; - pgoff_t offset; - int ret, cmd, i, opc; - struct bio_vec *bvec; - - BUG_ON(bio->bi_hw_segments > LLOOP_MAX_SEGMENTS); - - offset = (pgoff_t)(bio->bi_sector << 9) + lo->lo_offset; - bio_for_each_segment(bvec, bio, i) { - BUG_ON(bvec->bv_offset != 0); - BUG_ON(bvec->bv_len != CFS_PAGE_SIZE); - - pg->pg = bvec->bv_page; - pg->off = offset; - pg->count = bvec->bv_len; - pg->flag = OBD_BRW_SRVLOCK; - - pg++; - offset += bvec->bv_len; + const struct lu_env *env = lo->lo_env; + struct cl_io *io = &lo->lo_io; + struct inode *inode = lo->lo_backing_file->f_dentry->d_inode; + struct cl_object *obj = ll_i2info(inode)->lli_clob; + pgoff_t offset; + int ret; + int i; + int rw; + obd_count page_count = 0; + struct bio_vec *bvec; + struct bio *bio; + ssize_t bytes; + + struct ll_dio_pages *pvec = &lo->lo_pvec; + struct page **pages = pvec->ldp_pages; + loff_t *offsets = pvec->ldp_offsets; + + truncate_inode_pages(inode->i_mapping, 0); + + /* initialize the IO */ + 
memset(io, 0, sizeof(*io)); + io->ci_obj = obj; + ret = cl_io_init(env, io, CIT_MISC, obj); + if (ret) + return io->ci_result; + io->ci_lockreq = CILR_NEVER; + + LASSERT(head != NULL); + rw = head->bi_rw; + for (bio = head; bio != NULL; bio = bio->bi_next) { + LASSERT(rw == bio->bi_rw); + + offset = (pgoff_t)(bio->bi_sector << 9) + lo->lo_offset; + bio_for_each_segment(bvec, bio, i) { + BUG_ON(bvec->bv_offset != 0); + BUG_ON(bvec->bv_len != CFS_PAGE_SIZE); + + pages[page_count] = bvec->bv_page; + offsets[page_count] = offset; + page_count++; + offset += bvec->bv_len; + } + LASSERT(page_count <= LLOOP_MAX_SEGMENTS); } - oa->o_mode = inode->i_mode; - oa->o_id = lsm->lsm_object_id; - oa->o_gr = lsm->lsm_object_gr; - oa->o_valid = OBD_MD_FLID | OBD_MD_FLMODE | - OBD_MD_FLTYPE |OBD_MD_FLGROUP; - obdo_from_inode(oa, inode, OBD_MD_FLFID | OBD_MD_FLGENER); - - cmd = OBD_BRW_READ; - if (bio_rw(bio) == WRITE) - cmd = OBD_BRW_WRITE; - - if (cmd == OBD_BRW_WRITE) - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_BRW_WRITE, bio->bi_size); - else - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_BRW_READ, bio->bi_size); - oinfo.oi_oa = oa; - oinfo.oi_md = lsm; - opc = cmd & OBD_BRW_WRITE ? CAPA_OPC_OSS_WRITE : CAPA_OPC_OSS_RW; - oinfo.oi_capa = ll_osscapa_get(inode, opc); - ret = obd_brw(cmd, ll_i2dtexp(inode), &oinfo, - (obd_count)(i - bio->bi_idx), - lo->lo_requests[0].lrd_pages, NULL); - capa_put(oinfo.oi_capa); - if (ret == 0) - obdo_to_inode(inode, oa, OBD_MD_FLBLOCKS); - return ret; + ll_stats_ops_tally(ll_i2sbi(inode), + (rw == WRITE) ? LPROC_LL_BRW_WRITE : LPROC_LL_BRW_READ, + page_count << PAGE_CACHE_SHIFT); + + pvec->ldp_size = page_count << PAGE_CACHE_SHIFT; + pvec->ldp_nr = page_count; + + /* FIXME: in ll_direct_rw_pages, it has to allocate many cl_page{}s to + * write those pages into OST. Even worse case is that more pages + * would be asked to write out to swap space, and then finally get here + * again. + * Unfortunately this is NOT easy to fix. 
+ * Thoughts on solution: + * 0. Define a reserved pool for cl_pages, which could be a list of + * pre-allocated cl_pages from cl_page_kmem; + * 1. Define a new operation in cl_object_operations{}, says clo_depth, + * which measures how many layers for this lustre object. Generally + * speaking, the depth would be 2, one for llite, and one for lovsub. + * However, for SNS, there will be more since we need additional page + * to store parity; + * 2. Reserve the # of (page_count * depth) cl_pages from the reserved + * pool. Afterwards, the clio would allocate the pages from reserved + * pool, this guarantees we needn't allocate the cl_pages from + * generic cl_page slab cache. + * Of course, if there is NOT enough pages in the pool, we might + * be asked to write less pages once, this purely depends on + * implementation. Anyway, we should be careful to avoid deadlocking. + */ + LOCK_INODE_MUTEX(inode); + bytes = ll_direct_rw_pages(env, io, rw, inode, pvec); + UNLOCK_INODE_MUTEX(inode); + cl_io_fini(env, io); + return (bytes == pvec->ldp_size) ? 
0 : (int)bytes; } - /* * Add bio to back of pending list */ @@ -266,41 +288,77 @@ static void loop_add_bio(struct lloop_device *lo, struct bio *bio) lo->lo_bio = lo->lo_biotail = bio; spin_unlock_irqrestore(&lo->lo_lock, flags); - up(&lo->lo_bh_mutex); + atomic_inc(&lo->lo_pending); + if (waitqueue_active(&lo->lo_bh_wait)) + wake_up(&lo->lo_bh_wait); } /* * Grab first pending buffer */ -static struct bio *loop_get_bio(struct lloop_device *lo) +static unsigned int loop_get_bio(struct lloop_device *lo, struct bio **req) { - struct bio *bio; + struct bio *first; + struct bio **bio; + unsigned int count = 0; + unsigned int page_count = 0; + int rw; spin_lock_irq(&lo->lo_lock); - if ((bio = lo->lo_bio)) { - if (bio == lo->lo_biotail) - lo->lo_biotail = NULL; - lo->lo_bio = bio->bi_next; - bio->bi_next = NULL; + first = lo->lo_bio; + if (unlikely(first == NULL)) { + spin_unlock_irq(&lo->lo_lock); + return 0; } - spin_unlock_irq(&lo->lo_lock); - return bio; + /* TODO: need to split the bio, too bad. */ + LASSERT(first->bi_vcnt <= LLOOP_MAX_SEGMENTS); + + rw = first->bi_rw; + bio = &lo->lo_bio; + while (*bio && (*bio)->bi_rw == rw) { + CDEBUG(D_INFO, "bio sector %llu size %u count %u vcnt%u \n", + (unsigned long long)(*bio)->bi_sector, (*bio)->bi_size, + page_count, (*bio)->bi_vcnt); + if (page_count + (*bio)->bi_vcnt > LLOOP_MAX_SEGMENTS) + break; + + + page_count += (*bio)->bi_vcnt; + count++; + bio = &(*bio)->bi_next; + } + if (*bio) { + /* Some of bios can't be mergable. 
*/ + lo->lo_bio = *bio; + *bio = NULL; + } else { + /* Hit the end of queue */ + lo->lo_biotail = NULL; + lo->lo_bio = NULL; + } + *req = first; + spin_unlock_irq(&lo->lo_lock); + return count; } static int loop_make_request(request_queue_t *q, struct bio *old_bio) { struct lloop_device *lo = q->queuedata; int rw = bio_rw(old_bio); + int inactive; if (!lo) - goto out; + goto err; + + CDEBUG(D_INFO, "submit bio sector %llu size %u\n", + (unsigned long long)old_bio->bi_sector, old_bio->bi_size); spin_lock_irq(&lo->lo_lock); - if (lo->lo_state != LLOOP_BOUND) - goto inactive; - atomic_inc(&lo->lo_pending); + inactive = (lo->lo_state != LLOOP_BOUND); spin_unlock_irq(&lo->lo_lock); + if (inactive) + goto err; if (rw == WRITE) { if (lo->lo_flags & LO_FLAGS_READ_ONLY) @@ -314,14 +372,8 @@ static int loop_make_request(request_queue_t *q, struct bio *old_bio) loop_add_bio(lo, old_bio); return 0; err: - if (atomic_dec_and_test(&lo->lo_pending)) - up(&lo->lo_bh_mutex); -out: bio_io_error(old_bio, old_bio->bi_size); return 0; -inactive: - spin_unlock_irq(&lo->lo_lock); - goto out; } /* @@ -338,27 +390,50 @@ static void loop_unplug(request_queue_t *q) static inline void loop_handle_bio(struct lloop_device *lo, struct bio *bio) { int ret; - ret = do_bio_filebacked(lo, bio); - bio_endio(bio, bio->bi_size, ret); + ret = do_bio_lustrebacked(lo, bio); + while (bio) { + struct bio *tmp = bio->bi_next; + bio->bi_next = NULL; + bio_endio(bio, bio->bi_size, ret); + bio = tmp; + } +} + +static inline int loop_active(struct lloop_device *lo) +{ + return atomic_read(&lo->lo_pending) || (lo->lo_state == LLOOP_RUNDOWN); } /* * worker thread that handles reads/writes to file backed loop devices, - * to avoid blocking in our make_request_fn. it also does loop decrypting - * on reads for block backed loop, as that is too heavy to do from - * b_end_io context where irqs may be disabled. + * to avoid blocking in our make_request_fn. 
*/ static int loop_thread(void *data) { struct lloop_device *lo = data; struct bio *bio; + unsigned int count; + unsigned long times = 0; + unsigned long total_count = 0; + + struct lu_env *env; + int refcheck; + int ret = 0; daemonize("lloop%d", lo->lo_number); set_user_nice(current, -20); lo->lo_state = LLOOP_BOUND; - atomic_inc(&lo->lo_pending); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + GOTO(out, ret = PTR_ERR(env)); + + lo->lo_env = env; + memset(&lo->lo_pvec, 0, sizeof(lo->lo_pvec)); + lo->lo_pvec.ldp_pages = lo->lo_requests[0].lrd_pages; + lo->lo_pvec.ldp_offsets = lo->lo_requests[0].lrd_offsets; /* * up sem, we are running @@ -366,40 +441,54 @@ static int loop_thread(void *data) up(&lo->lo_sem); for (;;) { - down_interruptible(&lo->lo_bh_mutex); - /* - * could be upped because of tear-down, not because of - * pending work - */ - if (!atomic_read(&lo->lo_pending)) - break; + wait_event(lo->lo_bh_wait, loop_active(lo)); + if (!atomic_read(&lo->lo_pending)) { + int exiting = 0; + spin_lock_irq(&lo->lo_lock); + exiting = (lo->lo_state == LLOOP_RUNDOWN); + spin_unlock_irq(&lo->lo_lock); + if (exiting) + break; + } - bio = loop_get_bio(lo); - if (!bio) { + bio = NULL; + count = loop_get_bio(lo, &bio); + if (!count) { CWARN("lloop(minor: %d): missing bio\n", lo->lo_number); continue; } - loop_handle_bio(lo, bio); - /* - * upped both for pending work and tear-down, lo_pending - * will hit zero then - */ - if (atomic_dec_and_test(&lo->lo_pending)) - break; + total_count += count; + if (total_count < count) { /* overflow */ + total_count = count; + times = 1; + } else { + times++; + } + if ((times & 127) == 0) { + CDEBUG(D_INFO, "total: %lu, count: %lu, avg: %lu\n", + total_count, times, total_count / times); + } + + LASSERT(bio != NULL); + LASSERT(count <= atomic_read(&lo->lo_pending)); + loop_handle_bio(lo, bio); + atomic_sub(count, &lo->lo_pending); } + cl_env_put(env, &refcheck); +out: up(&lo->lo_sem); - return 0; + return ret; } static int 
loop_set_fd(struct lloop_device *lo, struct file *unused, struct block_device *bdev, struct file *file) { - struct inode *inode; + struct inode *inode; struct address_space *mapping; - int lo_flags = 0; - int error; + int lo_flags = 0; + int error; loff_t size; if (!try_module_get(THIS_MODULE)) @@ -452,8 +541,10 @@ static int loop_set_fd(struct lloop_device *lo, struct file *unused, /* queue parameters */ blk_queue_hardsect_size(lo->lo_queue, CFS_PAGE_SIZE); - blk_queue_max_sectors(lo->lo_queue, LLOOP_MAX_SEGMENTS); + blk_queue_max_sectors(lo->lo_queue, + LLOOP_MAX_SEGMENTS << (CFS_PAGE_SHIFT - 9)); blk_queue_max_phys_segments(lo->lo_queue, LLOOP_MAX_SEGMENTS); + blk_queue_max_hw_segments(lo->lo_queue, LLOOP_MAX_SEGMENTS); set_capacity(disks[lo->lo_number], size); bd_set_size(bdev, size << 9); @@ -487,9 +578,8 @@ static int loop_clr_fd(struct lloop_device *lo, struct block_device *bdev, spin_lock_irq(&lo->lo_lock); lo->lo_state = LLOOP_RUNDOWN; - if (atomic_dec_and_test(&lo->lo_pending)) - up(&lo->lo_bh_mutex); spin_unlock_irq(&lo->lo_lock); + wake_up(&lo->lo_bh_wait); down(&lo->lo_sem); lo->lo_backing_file = NULL; @@ -533,7 +623,7 @@ static int lo_release(struct inode *inode, struct file *file) /* lloop device node's ioctl function. */ static int lo_ioctl(struct inode *inode, struct file *unused, - unsigned int cmd, unsigned long arg) + unsigned int cmd, unsigned long arg) { struct lloop_device *lo = inode->i_bdev->bd_disk->private_data; struct block_device *bdev = inode->i_bdev; @@ -578,12 +668,13 @@ static struct block_device_operations lo_fops = { /* dynamic iocontrol callback. * This callback is registered in lloop_init and will be called by * ll_iocontrol_call. + * * This is a llite regular file ioctl function. It takes the responsibility - * of attaching a file, and detaching a file by a lloop's device numner. + * of attaching or detaching a file by a lloop's device number. 
*/ static enum llioc_iter lloop_ioctl(struct inode *unused, struct file *file, - unsigned int cmd, unsigned long arg, - void *magic, int *rcp) + unsigned int cmd, unsigned long arg, + void *magic, int *rcp) { struct lloop_device *lo = NULL; struct block_device *bdev = NULL; @@ -684,25 +775,27 @@ static int __init lloop_init(void) }; if (max_loop < 1 || max_loop > 256) { + max_loop = MAX_LOOP_DEFAULT; CWARN("lloop: invalid max_loop (must be between" - " 1 and 256), using default (8)\n"); - max_loop = 8; + " 1 and 256), using default (%u)\n", max_loop); } lloop_major = register_blkdev(0, "lloop"); if (lloop_major < 0) return -EIO; + CDEBUG(D_CONFIG, "registered lloop major %d with %u minors\n", + lloop_major, max_loop); + ll_iocontrol_magic = ll_iocontrol_register(lloop_ioctl, 2, cmdlist); if (ll_iocontrol_magic == NULL) goto out_mem1; - loop_dev = kmalloc(max_loop * sizeof(struct lloop_device), GFP_KERNEL); + OBD_ALLOC_WAIT(loop_dev, max_loop * sizeof(*loop_dev)); if (!loop_dev) goto out_mem1; - memset(loop_dev, 0, max_loop * sizeof(struct lloop_device)); - disks = kmalloc(max_loop * sizeof(struct gendisk *), GFP_KERNEL); + OBD_ALLOC_WAIT(disks, max_loop * sizeof(*disks)); if (!disks) goto out_mem2; @@ -718,14 +811,13 @@ static int __init lloop_init(void) struct lloop_device *lo = &loop_dev[i]; struct gendisk *disk = disks[i]; - memset(lo, 0, sizeof(*lo)); lo->lo_queue = blk_alloc_queue(GFP_KERNEL); if (!lo->lo_queue) goto out_mem4; init_MUTEX(&lo->lo_ctl_mutex); init_MUTEX_LOCKED(&lo->lo_sem); - init_MUTEX_LOCKED(&lo->lo_bh_mutex); + init_waitqueue_head(&lo->lo_bh_wait); lo->lo_number = i; spin_lock_init(&lo->lo_lock); disk->major = lloop_major; @@ -748,9 +840,9 @@ out_mem4: out_mem3: while (i--) put_disk(disks[i]); - kfree(disks); + OBD_FREE(disks, max_loop * sizeof(*disks)); out_mem2: - kfree(loop_dev); + OBD_FREE(loop_dev, max_loop * sizeof(*loop_dev)); out_mem1: unregister_blkdev(lloop_major, "lloop"); ll_iocontrol_unregister(ll_iocontrol_magic); @@ -770,9 
+862,11 @@ static void lloop_exit(void) } if (ll_unregister_blkdev(lloop_major, "lloop")) CWARN("lloop: cannot unregister blkdev\n"); + else + CDEBUG(D_CONFIG, "unregistered lloop major %d\n", lloop_major); - kfree(disks); - kfree(loop_dev); + OBD_FREE(disks, max_loop * sizeof(*disks)); + OBD_FREE(loop_dev, max_loop * sizeof(*loop_dev)); } module_init(lloop_init); diff --git a/lustre/llite/rw26.c b/lustre/llite/rw26.c index 031b1ab..fac56d7 100644 --- a/lustre/llite/rw26.c +++ b/lustre/llite/rw26.c @@ -216,11 +216,9 @@ static void ll_free_user_pages(struct page **pages, int npages, int do_dirty) OBD_FREE(pages, npages * sizeof(*pages)); } -static ssize_t ll_direct_IO_26_seg(const struct lu_env *env, struct cl_io *io, - int rw, struct inode *inode, - struct address_space *mapping, - size_t size, loff_t file_offset, - struct page **pages, int page_count) +ssize_t ll_direct_rw_pages(const struct lu_env *env, struct cl_io *io, + int rw, struct inode *inode, + struct ll_dio_pages *pv) { struct cl_page *clp; struct ccc_page *clup; @@ -229,8 +227,11 @@ static ssize_t ll_direct_IO_26_seg(const struct lu_env *env, struct cl_io *io, struct cl_sync_io *anchor = &ccc_env_info(env)->cti_sync_io; int i; ssize_t rc = 0; - ssize_t size_orig = size; - size_t page_size = cl_page_size(obj); + loff_t file_offset = pv->ldp_start_offset; + size_t size = pv->ldp_size; + int page_count = pv->ldp_nr; + struct page **pages = pv->ldp_pages; + size_t page_size = cl_page_size(obj); ENTRY; cl_sync_io_init(anchor, page_count); @@ -238,8 +239,11 @@ static ssize_t ll_direct_IO_26_seg(const struct lu_env *env, struct cl_io *io, queue = &io->ci_queue; cl_2queue_init(queue); for (i = 0; i < page_count; i++) { + if (pv->ldp_offsets) + file_offset = pv->ldp_offsets[i]; + LASSERT(!(file_offset & (page_size - 1))); clp = cl_page_find(env, obj, cl_index(obj, file_offset), - pages[i], CPT_TRANSIENT); + pv->ldp_pages[i], CPT_TRANSIENT); if (IS_ERR(clp)) { rc = PTR_ERR(clp); break; } @@ -319,7 +323,7 @@ static 
ssize_t ll_direct_IO_26_seg(const struct lu_env *env, struct cl_io *io, cl_sync_io_note(anchor, +1); /* wait for the IO to be finished. */ rc = cl_sync_io_wait(env, io, &queue->c2_qout, - anchor) ?: size_orig; + anchor) ?: pv->ldp_size; } } @@ -328,6 +332,23 @@ static ssize_t ll_direct_IO_26_seg(const struct lu_env *env, struct cl_io *io, cl_2queue_fini(env, queue); RETURN(rc); } +EXPORT_SYMBOL(ll_direct_rw_pages); + +static ssize_t ll_direct_IO_26_seg(const struct lu_env *env, struct cl_io *io, + int rw, struct inode *inode, + struct address_space *mapping, + size_t size, loff_t file_offset, + struct page **pages, int page_count) +{ + struct ll_dio_pages pvec = { .ldp_pages = pages, + .ldp_nr = page_count, + .ldp_size = size, + .ldp_offsets = NULL, + .ldp_start_offset = file_offset + }; + + return ll_direct_rw_pages(env, io, rw, inode, &pvec); +} /* This is the maximum size of a single O_DIRECT request, based on a 128kB * kmalloc limit. We need to fit all of the brw_page structs, each one diff --git a/lustre/llite/vvp_page.c b/lustre/llite/vvp_page.c index d199ad6..b698f52 100644 --- a/lustre/llite/vvp_page.c +++ b/lustre/llite/vvp_page.c @@ -243,7 +243,12 @@ static void vvp_page_completion_common(const struct lu_env *env, struct cl_sync_io *anchor = cp->cpg_sync_io; LINVRNT(cl_page_is_vmlocked(env, clp)); - KLASSERT(!PageWriteback(vmpage)); + + /* Don't assert the page writeback bit here because the lustre file + * may be as a backend of swap space. in this case, the page writeback + * is set by VM, and obvious we shouldn't clear it at all. Fortunately + * this type of pages are all TRANSIENT pages. 
*/ + KLASSERT(ergo(clp->cp_type == CPT_CACHEABLE, !PageWriteback(vmpage))); vvp_vmpage_error(inode, vmpage, ioret); diff --git a/lustre/obdclass/cl_page.c b/lustre/obdclass/cl_page.c index feac1ff..e88427b 100644 --- a/lustre/obdclass/cl_page.c +++ b/lustre/obdclass/cl_page.c @@ -1259,7 +1259,12 @@ void cl_page_completion(const struct lu_env *env, (const struct lu_env *, const struct cl_page_slice *, int), ioret); - KLASSERT(!PageWriteback(cl_page_vmpage(env, pg))); + /* Don't assert the page writeback bit here because the lustre file + * may be as a backend of swap space. in this case, the page writeback + * is set by VM, and obvious we shouldn't clear it at all. Fortunately + * this type of pages are all TRANSIENT pages. */ + KLASSERT(ergo(pg->cp_type == CPT_CACHEABLE, + !PageWriteback(cl_page_vmpage(env, pg)))); EXIT; } EXPORT_SYMBOL(cl_page_completion); diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index fd20f97..c199621 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -8,7 +8,7 @@ set -e ONLY=${ONLY:-"$*"} # bug number for skipped test: 13297 2108 9789 3637 9789 3561 12622 12653 12653 5188 10764 16260 -ALWAYS_EXCEPT=" 27u 42a 42b 42c 42d 45 51d 65a 65e 68 75 119d $SANITY_EXCEPT" +ALWAYS_EXCEPT=" 27u 42a 42b 42c 42d 45 51d 65a 65e 68b 75 119d $SANITY_EXCEPT" # bug number for skipped test: 2108 9789 3637 9789 3561 5188/5749 1443 #ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"27m 42a 42b 42c 42d 45 68 76"} # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT! @@ -3075,12 +3075,15 @@ LLOOP= cleanup_68() { trap 0 if [ ! 
-z "$LLOOP" ]; then - swapoff $LLOOP || error "swapoff failed" + if swapon -s | grep -q $LLOOP; then + swapoff $LLOOP || error "swapoff failed" + fi + $LCTL blockdev_detach $LLOOP || error "detach failed" rm -f $LLOOP unset LLOOP fi - rm -f $DIR/f68 + rm -f $DIR/f68* } meminfo() { @@ -3091,10 +3094,29 @@ swap_used() { swapon -s | awk '($1 == "'$1'") { print $4 }' } +# test case for lloop driver, basic function +test_68a() { + [ "$UID" != 0 ] && skip "must run as root" && return + + grep -q llite_lloop /proc/modules + [ $? -ne 0 ] && skip "can't find module llite_lloop" && return + + LLOOP=$TMP/lloop.`date +%s`.`date +%N` + dd if=/dev/zero of=$DIR/f68a bs=4k count=1024 + $LCTL blockdev_attach $DIR/f68a $LLOOP || error "attach failed" + + trap cleanup_68 EXIT + + directio rdwr $LLOOP 0 1024 4096 || error "direct write failed" + directio rdwr $LLOOP 0 1025 4096 && error "direct write should fail" + + cleanup_68 +} +run_test 68a "lloop driver - basic test ========================" # excercise swapping to lustre by adding a high priority swapfile entry # and then consuming memory until it is used. -test_68() { +test_68b() { # was test_68 [ "$UID" != 0 ] && skip "must run as root" && return lctl get_param -n devices | grep -q obdfilter && \ skip "local OST" && return @@ -3110,10 +3132,10 @@ test_68() { [[ $NR_BLOCKS -le 2048 ]] && NR_BLOCKS=2048 LLOOP=$TMP/lloop.`date +%s`.`date +%N` - dd if=/dev/zero of=$DIR/f68 bs=64k seek=$NR_BLOCKS count=1 - mkswap $DIR/f68 + dd if=/dev/zero of=$DIR/f68b bs=64k seek=$NR_BLOCKS count=1 + mkswap $DIR/f68b - $LCTL blockdev_attach $DIR/f68 $LLOOP || error "attach failed" + $LCTL blockdev_attach $DIR/f68b $LLOOP || error "attach failed" trap cleanup_68 EXIT @@ -3128,7 +3150,7 @@ test_68() { [ $SWAPUSED -eq 0 ] && echo "no swap used???" 
|| true } -run_test 68 "support swapping to Lustre ========================" +run_test 68b "support swapping to Lustre ========================" # bug5265, obdfilter oa2dentry return -ENOENT # #define OBD_FAIL_OST_ENOENT 0x217 diff --git a/lustre/utils/obd.c b/lustre/utils/obd.c index a408a9d..63b2757 100644 --- a/lustre/utils/obd.c +++ b/lustre/utils/obd.c @@ -2142,14 +2142,15 @@ static int jt_blockdev_find_module(const char *module) { FILE *fp; int found = 0; - char modname[256]; + char buf[1024]; fp = fopen("/proc/modules", "r"); if (fp == NULL) return -1; - while (fscanf(fp, "%s %*s %*s %*s %*s %*s", modname) == 1) { - if (strcmp(module, modname) == 0) { + while (fgets(buf, 1024, fp) != NULL) { + *strchr(buf, ' ') = 0; + if (strcmp(module, buf) == 0) { found = 1; break; } -- 1.8.3.1