Whamcloud - gitweb
LU-1994 kernel: fix reference counting with l_dentry_open
[fs/lustre-release.git] / lustre / obdclass / llog_lvfs.c
index 6e0309b..9373474 100644 (file)
@@ -1,40 +1,50 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
+/*
+ * GPL HEADER START
  *
- *  Copyright (C) 2001-2003 Cluster File Systems, Inc.
- *   Author: Andreas Dilger <adilger@clusterfs.com>
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
  *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
  *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
  *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
  *
- * OST<->MDS recovery logging infrastructure.
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
  *
+ * lustre/obdclass/llog_lvfs.c
+ *
+ * OST<->MDS recovery logging infrastructure.
  * Invariants in implementation:
  * - we do not share logs among different OST<->MDS connections, so that
  *   if an OST or MDS fails it need only look at log(s) relevant to itself
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
  */
 
 #define DEBUG_SUBSYSTEM S_LOG
 
-#ifndef EXPORT_SYMTAB
-#define EXPORT_SYMTAB
-#endif
-
 #ifndef __KERNEL__
 #include <liblustre.h>
 #endif
@@ -94,7 +104,7 @@ static int llog_lvfs_write_blob(struct obd_device *obd, struct l_file *file,
 
         file->f_pos = off;
 
-        if (buflen == 0) 
+        if (buflen == 0)
                 CWARN("0-length record\n");
 
         if (!buf) {
@@ -151,7 +161,8 @@ static int llog_lvfs_read_blob(struct obd_device *obd, struct l_file *file,
         RETURN(0);
 }
 
-static int llog_lvfs_read_header(struct llog_handle *handle)
+static int llog_lvfs_read_header(const struct lu_env *env,
+                                struct llog_handle *handle)
 {
         struct obd_device *obd;
         int rc;
@@ -161,7 +172,7 @@ static int llog_lvfs_read_header(struct llog_handle *handle)
 
         obd = handle->lgh_ctxt->loc_exp->exp_obd;
 
-        if (handle->lgh_file->f_dentry->d_inode->i_size == 0) {
+        if (i_size_read(handle->lgh_file->f_dentry->d_inode) == 0) {
                 CDEBUG(D_HA, "not reading header from 0-byte log\n");
                 RETURN(LLOG_EEMPTY);
         }
@@ -196,17 +207,18 @@ static int llog_lvfs_read_header(struct llog_handle *handle)
         }
 
         handle->lgh_last_idx = handle->lgh_hdr->llh_tail.lrt_index;
-        handle->lgh_file->f_pos = handle->lgh_file->f_dentry->d_inode->i_size;
+        handle->lgh_file->f_pos = i_size_read(handle->lgh_file->f_dentry->d_inode);
 
         RETURN(rc);
 }
 
 /* returns negative in on error; 0 if success && reccookie == 0; 1 otherwise */
 /* appends if idx == -1, otherwise overwrites record idx. */
-static int llog_lvfs_write_rec(struct llog_handle *loghandle,
-                               struct llog_rec_hdr *rec,
-                               struct llog_cookie *reccookie, int cookiecount,
-                               void *buf, int idx)
+static int llog_lvfs_write_rec(const struct lu_env *env,
+                              struct llog_handle *loghandle,
+                              struct llog_rec_hdr *rec,
+                              struct llog_cookie *reccookie, int cookiecount,
+                              void *buf, int idx, struct thandle *th)
 {
         struct llog_log_hdr *llh;
         int reclen = rec->lrh_len, index, rc;
@@ -230,15 +242,15 @@ static int llog_lvfs_write_rec(struct llog_handle *loghandle,
                 RETURN(rc);
 
         if (buf)
-                /* write_blob adds header and tail to lrh_len. */ 
-                reclen = sizeof(*rec) + rec->lrh_len + 
+                /* write_blob adds header and tail to lrh_len. */
+                reclen = sizeof(*rec) + rec->lrh_len +
                          sizeof(struct llog_rec_tail);
 
         if (idx != -1) {
                 loff_t saved_offset;
 
                 /* no header: only allowed to insert record 1 */
-                if (idx != 1 && !file->f_dentry->d_inode->i_size) {
+                if (idx != 1 && !i_size_read(file->f_dentry->d_inode)) {
                         CERROR("idx != -1 in empty log\n");
                         LBUG();
                 }
@@ -246,7 +258,7 @@ static int llog_lvfs_write_rec(struct llog_handle *loghandle,
                 if (idx && llh->llh_size && llh->llh_size != rec->lrh_len)
                         RETURN(-EINVAL);
 
-                if (!ext2_test_bit(idx, llh->llh_bitmap)) 
+                if (!ext2_test_bit(idx, llh->llh_bitmap))
                         CERROR("Modify unset record %u\n", idx);
                 if (idx != rec->lrh_index)
                         CERROR("Index mismatch %d %u\n", idx, rec->lrh_index);
@@ -256,12 +268,7 @@ static int llog_lvfs_write_rec(struct llog_handle *loghandle,
                 if (rc || idx == 0)
                         RETURN(rc);
 
-                /* Assumes constant lrh_len */
-                saved_offset = sizeof(*llh) + (idx - 1) * reclen;
-
                 if (buf) {
-                        struct llog_rec_hdr check;
-
                         /* We assume that caller has set lgh_cur_* */
                         saved_offset = loghandle->lgh_cur_offset;
                         CDEBUG(D_OTHER,
@@ -275,19 +282,10 @@ static int llog_lvfs_write_rec(struct llog_handle *loghandle,
                                        idx, loghandle->lgh_cur_idx);
                                 RETURN(-EFAULT);
                         }
-#if 1  /* FIXME remove this safety check at some point */
-                        /* Verify that the record we're modifying is the 
-                           right one. */
-                        rc = llog_lvfs_read_blob(obd, file, &check,
-                                                 sizeof(check), saved_offset);
-                        if (check.lrh_index != idx || check.lrh_len != reclen) {
-                                CERROR("Bad modify idx %u/%u size %u/%u (%d)\n",
-                                       idx, check.lrh_index, reclen, 
-                                       check.lrh_len, rc);
-                                RETURN(-EFAULT);
-                        }
-#endif
-                }
+               } else {
+                       /* Assumes constant lrh_len */
+                       saved_offset = sizeof(*llh) + (idx - 1) * reclen;
+               }
 
                 rc = llog_lvfs_write_blob(obd, file, rec, buf, saved_offset);
                 if (rc == 0 && reccookie) {
@@ -310,15 +308,15 @@ static int llog_lvfs_write_rec(struct llog_handle *loghandle,
         /* NOTE: padding is a record, but no bit is set */
         if (left != 0 && left != reclen &&
             left < (reclen + LLOG_MIN_REC_SIZE)) {
-                loghandle->lgh_last_idx++;
-                rc = llog_lvfs_pad(obd, file, left, loghandle->lgh_last_idx);
-                if (rc)
-                        RETURN(rc);
-                /* if it's the last idx in log file, then return -ENOSPC */
-                if (loghandle->lgh_last_idx == LLOG_BITMAP_SIZE(llh) - 1)
-                        RETURN(-ENOSPC);
-        }
-
+                 index = loghandle->lgh_last_idx + 1;
+                 rc = llog_lvfs_pad(obd, file, left, index);
+                 if (rc)
+                         RETURN(rc);
+                 loghandle->lgh_last_idx++; /*for pad rec*/
+         }
+         /* if it's the last idx in log file, then return -ENOSPC */
+         if (loghandle->lgh_last_idx >= LLOG_BITMAP_SIZE(llh) - 1)
+                 RETURN(-ENOSPC);
         loghandle->lgh_last_idx++;
         index = loghandle->lgh_last_idx;
         LASSERT(index < LLOG_BITMAP_SIZE(llh));
@@ -329,11 +327,17 @@ static int llog_lvfs_write_rec(struct llog_handle *loghandle,
                 lrt->lrt_len = rec->lrh_len;
                 lrt->lrt_index = rec->lrh_index;
         }
-        if (ext2_set_bit(index, llh->llh_bitmap)) {
-                CERROR("argh, index %u already set in log bitmap?\n", index);
-                LBUG(); /* should never happen */
-        }
-        llh->llh_count++;
+        /*The caller should make sure only 1 process access the lgh_last_idx,
+         *Otherwise it might hit the assert.*/
+        LASSERT(index < LLOG_BITMAP_SIZE(llh));
+       spin_lock(&loghandle->lgh_hdr_lock);
+       if (ext2_set_bit(index, llh->llh_bitmap)) {
+               CERROR("argh, index %u already set in log bitmap?\n", index);
+               spin_unlock(&loghandle->lgh_hdr_lock);
+               LBUG(); /* should never happen */
+       }
+       llh->llh_count++;
+       spin_unlock(&loghandle->lgh_hdr_lock);
         llh->llh_tail.lrt_index = index;
 
         rc = llog_lvfs_write_blob(obd, file, &llh->llh_hdr, NULL, 0);
@@ -344,18 +348,16 @@ static int llog_lvfs_write_rec(struct llog_handle *loghandle,
         if (rc)
                 RETURN(rc);
 
-        CDEBUG(D_HA, "added record "LPX64": idx: %u, %u bytes\n",
+        CDEBUG(D_RPCTRACE, "added record "LPX64": idx: %u, %u \n",
                loghandle->lgh_id.lgl_oid, index, rec->lrh_len);
         if (rc == 0 && reccookie) {
                 reccookie->lgc_lgl = loghandle->lgh_id;
                 reccookie->lgc_index = index;
-                if ((rec->lrh_type == MDS_UNLINK_REC) || 
-                                (rec->lrh_type == MDS_SETATTR_REC))
+                if ((rec->lrh_type == MDS_UNLINK_REC) ||
+                    (rec->lrh_type == MDS_SETATTR64_REC))
                         reccookie->lgc_subsys = LLOG_MDS_OST_ORIG_CTXT;
                 else if (rec->lrh_type == OST_SZ_REC)
                         reccookie->lgc_subsys = LLOG_SIZE_ORIG_CTXT;
-                else if (rec->lrh_type == OST_RAID1_REC)
-                        reccookie->lgc_subsys = LLOG_RD1_ORIG_CTXT;
                 else
                         reccookie->lgc_subsys = -1;
                 rc = 1;
@@ -386,9 +388,10 @@ static void llog_skip_over(__u64 *off, int curr, int goal)
  *  - cur_idx to the log index preceeding cur_offset
  * returns -EIO/-EINVAL on error
  */
-static int llog_lvfs_next_block(struct llog_handle *loghandle, int *cur_idx,
-                                int next_idx, __u64 *cur_offset, void *buf,
-                                int len)
+static int llog_lvfs_next_block(const struct lu_env *env,
+                               struct llog_handle *loghandle, int *cur_idx,
+                               int next_idx, __u64 *cur_offset, void *buf,
+                               int len)
 {
         int rc;
         ENTRY;
@@ -399,18 +402,21 @@ static int llog_lvfs_next_block(struct llog_handle *loghandle, int *cur_idx,
         CDEBUG(D_OTHER, "looking for log index %u (cur idx %u off "LPU64")\n",
                next_idx, *cur_idx, *cur_offset);
 
-        while (*cur_offset < loghandle->lgh_file->f_dentry->d_inode->i_size) {
-                struct llog_rec_hdr *rec;
-                struct llog_rec_tail *tail;
-                loff_t ppos;
-
-                llog_skip_over(cur_offset, *cur_idx, next_idx);
-
-                ppos = *cur_offset;
-                rc = fsfilt_read_record(loghandle->lgh_ctxt->loc_exp->exp_obd,
-                                        loghandle->lgh_file, buf, len,
-                                        &ppos);
-                if (rc) {
+        while (*cur_offset < i_size_read(loghandle->lgh_file->f_dentry->d_inode)) {
+               struct llog_rec_hdr *rec, *last_rec;
+               struct llog_rec_tail *tail;
+               loff_t ppos;
+               int llen;
+
+               llog_skip_over(cur_offset, *cur_idx, next_idx);
+
+               /* read up to next LLOG_CHUNK_SIZE block */
+               ppos = *cur_offset;
+               llen = LLOG_CHUNK_SIZE - (*cur_offset & (LLOG_CHUNK_SIZE - 1));
+               rc = fsfilt_read_record(loghandle->lgh_ctxt->loc_exp->exp_obd,
+                                       loghandle->lgh_file, buf, llen,
+                                       cur_offset);
+               if (rc < 0) {
                         CERROR("Cant read llog block at log id "LPU64
                                "/%u offset "LPU64"\n",
                                loghandle->lgh_id.lgl_oid,
@@ -420,9 +426,7 @@ static int llog_lvfs_next_block(struct llog_handle *loghandle, int *cur_idx,
                 }
 
                 /* put number of bytes read into rc to make code simpler */
-                rc = ppos - *cur_offset;
-                *cur_offset = ppos;
-
+               rc = *cur_offset - ppos;
                 if (rc < len) {
                         /* signal the end of the valid buffer to llog_process */
                         memset(buf + rc, 0, len - rc);
@@ -439,12 +443,19 @@ static int llog_lvfs_next_block(struct llog_handle *loghandle, int *cur_idx,
                 }
 
                 rec = buf;
-                tail = (struct llog_rec_tail *)((char *)buf + rc -
-                                                sizeof(struct llog_rec_tail));
+               if (LLOG_REC_HDR_NEEDS_SWABBING(rec))
+                       lustre_swab_llog_rec(rec);
 
-                if (LLOG_REC_HDR_NEEDS_SWABBING(rec)) {
-                        lustre_swab_llog_rec(rec, tail);
-                }
+               tail = (struct llog_rec_tail *)(buf + rc -
+                                               sizeof(struct llog_rec_tail));
+
+               /* get the last record in block */
+               last_rec = (struct llog_rec_hdr *)(buf + rc -
+                                                  le32_to_cpu(tail->lrt_len));
+
+               if (LLOG_REC_HDR_NEEDS_SWABBING(last_rec))
+                       lustre_swab_llog_rec(last_rec);
+               LASSERT(last_rec->lrh_index == tail->lrt_index);
 
                 *cur_idx = tail->lrt_index;
 
@@ -470,8 +481,9 @@ static int llog_lvfs_next_block(struct llog_handle *loghandle, int *cur_idx,
         RETURN(-EIO);
 }
 
-static int llog_lvfs_prev_block(struct llog_handle *loghandle,
-                                int prev_idx, void *buf, int len)
+static int llog_lvfs_prev_block(const struct lu_env *env,
+                               struct llog_handle *loghandle,
+                               int prev_idx, void *buf, int len)
 {
         __u64 cur_offset;
         int rc;
@@ -480,22 +492,20 @@ static int llog_lvfs_prev_block(struct llog_handle *loghandle,
         if (len == 0 || len & (LLOG_CHUNK_SIZE - 1))
                 RETURN(-EINVAL);
 
-        CDEBUG(D_OTHER, "looking for log index %u n", prev_idx);
+        CDEBUG(D_OTHER, "looking for log index %u\n", prev_idx);
 
         cur_offset = LLOG_CHUNK_SIZE;
         llog_skip_over(&cur_offset, 0, prev_idx);
 
-        while (cur_offset < loghandle->lgh_file->f_dentry->d_inode->i_size) {
-                struct llog_rec_hdr *rec;
-                struct llog_rec_tail *tail;
-                loff_t ppos;
+        while (cur_offset < i_size_read(loghandle->lgh_file->f_dentry->d_inode)) {
+               struct llog_rec_hdr *rec, *last_rec;
+               struct llog_rec_tail *tail;
+               loff_t ppos = cur_offset;
 
-                ppos = cur_offset;
-
-                rc = fsfilt_read_record(loghandle->lgh_ctxt->loc_exp->exp_obd,
-                                        loghandle->lgh_file, buf, len,
-                                        &ppos);
-                if (rc) {
+               rc = fsfilt_read_record(loghandle->lgh_ctxt->loc_exp->exp_obd,
+                                       loghandle->lgh_file, buf, len,
+                                       &cur_offset);
+               if (rc < 0) {
                         CERROR("Cant read llog block at log id "LPU64
                                "/%u offset "LPU64"\n",
                                loghandle->lgh_id.lgl_oid,
@@ -505,8 +515,7 @@ static int llog_lvfs_prev_block(struct llog_handle *loghandle,
                 }
 
                 /* put number of bytes read into rc to make code simpler */
-                rc = ppos - cur_offset;
-                cur_offset = ppos;
+               rc = cur_offset - ppos;
 
                 if (rc == 0) /* end of file, nothing to do */
                         RETURN(0);
@@ -518,7 +527,20 @@ static int llog_lvfs_prev_block(struct llog_handle *loghandle,
                         RETURN(-EINVAL);
                 }
 
-                tail = buf + rc - sizeof(struct llog_rec_tail);
+               rec = buf;
+               if (LLOG_REC_HDR_NEEDS_SWABBING(rec))
+                       lustre_swab_llog_rec(rec);
+
+               tail = (struct llog_rec_tail *)(buf + rc -
+                                               sizeof(struct llog_rec_tail));
+
+               /* get the last record in block */
+               last_rec = (struct llog_rec_hdr *)(buf + rc -
+                                                  le32_to_cpu(tail->lrt_len));
+
+               if (LLOG_REC_HDR_NEEDS_SWABBING(last_rec))
+                       lustre_swab_llog_rec(last_rec);
+               LASSERT(last_rec->lrh_index == tail->lrt_index);
 
                 /* this shouldn't happen */
                 if (tail->lrt_index == 0) {
@@ -527,15 +549,14 @@ static int llog_lvfs_prev_block(struct llog_handle *loghandle,
                                loghandle->lgh_id.lgl_ogen, cur_offset);
                         RETURN(-EINVAL);
                 }
-                if (le32_to_cpu(tail->lrt_index) < prev_idx)
+               if (tail->lrt_index < prev_idx)
                         continue;
 
                 /* sanity check that the start of the new buffer is no farther
                  * than the record that we wanted.  This shouldn't happen. */
-                rec = buf;
-                if (le32_to_cpu(rec->lrh_index) > prev_idx) {
-                        CERROR("missed desired record? %u > %u\n",
-                               le32_to_cpu(rec->lrh_index), prev_idx);
+               if (rec->lrh_index > prev_idx) {
+                       CERROR("missed desired record? %u > %u\n",
+                              rec->lrh_index, prev_idx);
                         RETURN(-ENOENT);
                 }
                 RETURN(0);
@@ -558,7 +579,7 @@ static struct file *llog_filp_open(char *dir, char *name, int flags, int mode)
                 filp = ERR_PTR(-ENAMETOOLONG);
         } else {
                 filp = l_filp_open(logname, flags, mode);
-                if (IS_ERR(filp))
+               if (IS_ERR(filp) && PTR_ERR(filp) != -ENOENT)
                         CERROR("logfile creation %s: %ld\n", logname,
                                PTR_ERR(filp));
         }
@@ -566,162 +587,221 @@ static struct file *llog_filp_open(char *dir, char *name, int flags, int mode)
         return filp;
 }
 
-/* This is a callback from the llog_* functions.
- * Assumes caller has already pushed us into the kernel context. */
-static int llog_lvfs_create(struct llog_ctxt *ctxt, struct llog_handle **res,
-                            struct llog_logid *logid, char *name)
+/**
+ * Attach an existing llog file to \a handle, or note that one must be created.
+ *
+ * Three cases, chosen by the arguments:
+ * - \a logid != NULL: look the object up with obd_lvfs_fid2dentry(), open it
+ *   O_RDWR | O_LARGEFILE and copy *logid into handle->lgh_id.
+ * - \a name != NULL: open MOUNT_CONFIGS_DIR/name without O_CREAT.  If it is
+ *   missing and \a open_param is LLOG_OPEN_NEW, a copy of the name is kept in
+ *   handle->lgh_name (used by llog_lvfs_create()) and lgh_file stays NULL.
+ * - neither: only valid for LLOG_OPEN_NEW (asserted); lgh_file is set to NULL.
+ *
+ * \a env is not used by this function.
+ *
+ * \retval 0        success; handle->lgh_file may be NULL for LLOG_OPEN_NEW
+ * \retval -ENOENT  an existing llog was required but none was found
+ * \retval -ENOMEM  the name copy could not be allocated
+ * \retval other    error code from the lookup/open helpers
+ */
+static int llog_lvfs_open(const struct lu_env *env, struct llog_handle *handle,
+                         struct llog_logid *logid, char *name,
+                         enum llog_open_param open_param)
 {
-        struct llog_handle *handle;
-        struct obd_device *obd;
-        struct l_dentry *dchild = NULL;
-        struct obdo *oa = NULL;
-        int rc = 0, cleanup_phase = 1;
-        int open_flags = O_RDWR | O_CREAT | O_LARGEFILE;
-        ENTRY;
-
-        handle = llog_alloc_handle();
-        if (handle == NULL)
-                RETURN(-ENOMEM);
-        *res = handle;
-
-        LASSERT(ctxt);
-        LASSERT(ctxt->loc_exp);
-        obd = ctxt->loc_exp->exp_obd;
-
-        if (logid != NULL) {
-                dchild = obd_lvfs_fid2dentry(ctxt->loc_exp, logid->lgl_oid,
-                                             logid->lgl_ogen, logid->lgl_ogr);
-
-                if (IS_ERR(dchild)) {
-                        rc = PTR_ERR(dchild);
-                        CERROR("error looking up logfile "LPX64":0x%x: rc %d\n",
-                               logid->lgl_oid, logid->lgl_ogen, rc);
-                        GOTO(cleanup, rc);
-                }
-
-                cleanup_phase = 2;
-                if (dchild->d_inode == NULL) {
-                        rc = -ENOENT;
-                        CERROR("nonexistent log file "LPX64":"LPX64": rc %d\n",
-                               logid->lgl_oid, logid->lgl_ogr, rc);
-                        GOTO(cleanup, rc);
-                }
-
-                handle->lgh_file = l_dentry_open(&obd->obd_lvfs_ctxt, dchild,
-                                                    O_RDWR | O_LARGEFILE);
-                if (IS_ERR(handle->lgh_file)) {
-                        rc = PTR_ERR(handle->lgh_file);
-                        CERROR("error opening logfile "LPX64"0x%x: rc %d\n",
-                               logid->lgl_oid, logid->lgl_ogen, rc);
-                        GOTO(cleanup, rc);
-                }
-
-                /* assign the value of lgh_id for handle directly */
-                handle->lgh_id = *logid;
-
-        } else if (name) {
-                /* COMPAT_146 */
-                if (strcmp(obd->obd_type->typ_name, LUSTRE_MDS_NAME) == 0) {
-                        handle->lgh_file = llog_filp_open(MDT_LOGS_DIR, name, 
-                                                          open_flags, 0644);
-                } else {
-                        /* end COMPAT_146 */
-                        handle->lgh_file = llog_filp_open(MOUNT_CONFIGS_DIR,
-                                                          name, open_flags, 
-                                                          0644);
-                }
-                if (IS_ERR(handle->lgh_file))
-                        GOTO(cleanup, rc = PTR_ERR(handle->lgh_file));
-
-                handle->lgh_id.lgl_ogr = 1;
-                handle->lgh_id.lgl_oid =
-                        handle->lgh_file->f_dentry->d_inode->i_ino;
-                handle->lgh_id.lgl_ogen =
-                        handle->lgh_file->f_dentry->d_inode->i_generation;
-        } else {
-                OBDO_ALLOC(oa);
-                if (oa == NULL)
-                        GOTO(cleanup, rc = -ENOMEM);
-
-                oa->o_gr = FILTER_GROUP_LLOG;
-                oa->o_valid = OBD_MD_FLGENER | OBD_MD_FLGROUP;
-
-                rc = obd_create(ctxt->loc_exp, oa, NULL, NULL);
-                if (rc)
-                        GOTO(cleanup, rc);
-
-                dchild = obd_lvfs_fid2dentry(ctxt->loc_exp, oa->o_id,
-                                             oa->o_generation, oa->o_gr);
-
-                if (IS_ERR(dchild))
-                        GOTO(cleanup, rc = PTR_ERR(dchild));
-                cleanup_phase = 2;
-                handle->lgh_file = l_dentry_open(&obd->obd_lvfs_ctxt, dchild,
-                                                 open_flags);
-                if (IS_ERR(handle->lgh_file))
-                        GOTO(cleanup, rc = PTR_ERR(handle->lgh_file));
-
-                handle->lgh_id.lgl_ogr = oa->o_gr;
-                handle->lgh_id.lgl_oid = oa->o_id;
-                handle->lgh_id.lgl_ogen = oa->o_generation;
-        }
+       struct llog_ctxt        *ctxt;
+       struct l_dentry         *dchild = NULL;
+       struct obd_device       *obd;
+       int                      rc = 0;
+
+       ENTRY;
+
+       /* Check the handle before reading its context, not afterwards. */
+       LASSERT(handle);
+       ctxt = handle->lgh_ctxt;
+       LASSERT(ctxt);
+       LASSERT(ctxt->loc_exp);
+       LASSERT(ctxt->loc_exp->exp_obd);
+       obd = ctxt->loc_exp->exp_obd;
+
+       if (logid != NULL) {
+               dchild = obd_lvfs_fid2dentry(ctxt->loc_exp, logid->lgl_oid,
+                                            logid->lgl_ogen, logid->lgl_oseq);
+               if (IS_ERR(dchild)) {
+                       rc = PTR_ERR(dchild);
+                       CERROR("%s: error looking up logfile #"LPX64"#"
+                              LPX64"#%08x: rc = %d\n",
+                              ctxt->loc_obd->obd_name, logid->lgl_oid,
+                              logid->lgl_oseq, logid->lgl_ogen, rc);
+                       GOTO(out, rc);
+               }
+               if (dchild->d_inode == NULL) {
+                       l_dput(dchild);
+                       rc = -ENOENT;
+                       CERROR("%s: nonexistent llog #"LPX64"#"LPX64"#%08x: "
+                              "rc = %d\n", ctxt->loc_obd->obd_name,
+                              logid->lgl_oid, logid->lgl_oseq,
+                              logid->lgl_ogen, rc);
+                       GOTO(out, rc);
+               }
+               handle->lgh_file = l_dentry_open(&obd->obd_lvfs_ctxt, dchild,
+                                                O_RDWR | O_LARGEFILE);
+               /* The lookup reference is dropped whether or not the open
+                * succeeded. */
+               l_dput(dchild);
+               if (IS_ERR(handle->lgh_file)) {
+                       rc = PTR_ERR(handle->lgh_file);
+                       /* Never leave an error pointer in the handle. */
+                       handle->lgh_file = NULL;
+                       CERROR("%s: error opening llog #"LPX64"#"LPX64"#%08x: "
+                              "rc = %d\n", ctxt->loc_obd->obd_name,
+                              logid->lgl_oid, logid->lgl_oseq,
+                              logid->lgl_ogen, rc);
+                       GOTO(out, rc);
+               }
+               handle->lgh_id = *logid;
+       } else if (name) {
+               /* No O_CREAT here: a missing file is reported as -ENOENT. */
+               handle->lgh_file = llog_filp_open(MOUNT_CONFIGS_DIR, name,
+                                                 O_RDWR | O_LARGEFILE, 0644);
+               if (IS_ERR(handle->lgh_file)) {
+                       rc = PTR_ERR(handle->lgh_file);
+                       handle->lgh_file = NULL;
+                       if (rc == -ENOENT && open_param == LLOG_OPEN_NEW) {
+                               /* Keep the name so the llog can be created
+                                * later. */
+                               OBD_ALLOC(handle->lgh_name, strlen(name) + 1);
+                               if (handle->lgh_name)
+                                       strcpy(handle->lgh_name, name);
+                               else
+                                       GOTO(out, rc = -ENOMEM);
+                               rc = 0;
+                       } else {
+                               GOTO(out, rc);
+                       }
+               } else {
+                       handle->lgh_id.lgl_oseq = FID_SEQ_LLOG;
+                       handle->lgh_id.lgl_oid =
+                               handle->lgh_file->f_dentry->d_inode->i_ino;
+                       handle->lgh_id.lgl_ogen =
+                               handle->lgh_file->f_dentry->d_inode->i_generation;
+               }
+       } else {
+               LASSERTF(open_param == LLOG_OPEN_NEW, "%#x\n", open_param);
+               handle->lgh_file = NULL;
+       }
+
+       /* An existing llog was required (not LLOG_OPEN_NEW) but none was
+        * found. */
+       if (open_param != LLOG_OPEN_NEW && handle->lgh_file == NULL)
+               GOTO(out_name, rc = -ENOENT);
+
+       RETURN(0);
+out_name:
+       /* Size the free from the stored string itself (as llog_lvfs_close()
+        * does) rather than from 'name', and do not leave a stale pointer. */
+       if (handle->lgh_name != NULL) {
+               OBD_FREE(handle->lgh_name, strlen(handle->lgh_name) + 1);
+               handle->lgh_name = NULL;
+       }
+out:
+       RETURN(rc);
+}
 
-        handle->lgh_ctxt = ctxt;
- finish:
-        if (oa)
-                OBDO_FREE(oa);
-        RETURN(rc);
-cleanup:
-        switch (cleanup_phase) {
-        case 2:
-                l_dput(dchild);
-        case 1:
-                llog_free_handle(handle);
-        }
-        goto finish;
+/* Tell whether \a handle has a backing file attached: 1 if so, 0 if not. */
+static int llog_lvfs_exist(struct llog_handle *handle)
+{
+       if (handle->lgh_file == NULL)
+               return 0;
+
+       return 1;
 }
 
-static int llog_lvfs_close(struct llog_handle *handle)
+/* This is a callback from the llog_* functions.
+ * Assumes caller has already pushed us into the kernel context.
+ *
+ * Creates the backing file for a handle that has none yet (asserted via
+ * lgh_file == NULL) and fills in handle->lgh_id and handle->lgh_file.
+ * If handle->lgh_name is set, the file is opened with O_CREAT under
+ * MOUNT_CONFIGS_DIR and the id is taken from its inode number and
+ * generation; otherwise an object is created with obd_create() in
+ * FID_SEQ_LLOG and opened through its dentry.
+ *
+ * \a env and \a th are not used by this function.
+ *
+ * Returns 0 on success, -ENOMEM if the obdo cannot be allocated, or the
+ * error reported by llog_filp_open(), obd_create(), obd_lvfs_fid2dentry()
+ * or l_dentry_open(). */
+static int llog_lvfs_create(const struct lu_env *env,
+                           struct llog_handle *handle,
+                           struct thandle *th)
 {
-        int rc;
-        ENTRY;
+       struct llog_ctxt        *ctxt = handle->lgh_ctxt;
+       struct obd_device       *obd;
+       struct l_dentry         *dchild = NULL;
+       struct file             *file;
+       struct obdo             *oa = NULL;
+       int                      rc = 0;
+       int                      open_flags = O_RDWR | O_CREAT | O_LARGEFILE;
+
+       ENTRY;
+
+       LASSERT(ctxt);
+       LASSERT(ctxt->loc_exp);
+       obd = ctxt->loc_exp->exp_obd;
+       LASSERT(handle->lgh_file == NULL);
+
+       if (handle->lgh_name) { /* named llog: create it by path */
+               file = llog_filp_open(MOUNT_CONFIGS_DIR, handle->lgh_name,
+                                     open_flags, 0644);
+               if (IS_ERR(file))
+                       RETURN(PTR_ERR(file));
+
+               handle->lgh_id.lgl_oseq = FID_SEQ_LLOG;
+               handle->lgh_id.lgl_oid = file->f_dentry->d_inode->i_ino;
+               handle->lgh_id.lgl_ogen =
+                               file->f_dentry->d_inode->i_generation;
+               handle->lgh_file = file;
+       } else { /* unnamed llog: create an object and open it by id */
+               OBDO_ALLOC(oa);
+               if (oa == NULL)
+                       RETURN(-ENOMEM);
+
+               oa->o_seq = FID_SEQ_LLOG;
+               oa->o_valid = OBD_MD_FLGENER | OBD_MD_FLGROUP;
+
+               rc = obd_create(NULL, ctxt->loc_exp, oa, NULL, NULL);
+               if (rc)
+                       GOTO(out, rc);
+
+               /* FIXME: rationalize the misuse of o_generation in
+                *        this API along with mds_obd_{create,destroy}.
+                *        Hopefully it is only an internal API issue.
+                *
+                * NOTE: the alias below is not #undef'd inside this
+                *       function, so it stays in effect for the code that
+                *       follows it in the file. */
+#define o_generation o_parent_oid
+               dchild = obd_lvfs_fid2dentry(ctxt->loc_exp, oa->o_id,
+                                            oa->o_generation, oa->o_seq);
+               if (IS_ERR(dchild))
+                       GOTO(out, rc = PTR_ERR(dchild));
+
+               file = l_dentry_open(&obd->obd_lvfs_ctxt, dchild, open_flags);
+               l_dput(dchild); /* dropped whether or not the open succeeded */
+               if (IS_ERR(file))
+                       GOTO(out, rc = PTR_ERR(file));
+               handle->lgh_id.lgl_oseq = oa->o_seq;
+               handle->lgh_id.lgl_oid = oa->o_id;
+               handle->lgh_id.lgl_ogen = oa->o_generation;
+               handle->lgh_file = file;
+out: /* reached on success and on failure alike; oa is freed either way */
+               OBDO_FREE(oa);
+       }
+       RETURN(rc);
+}
 
-        rc = filp_close(handle->lgh_file, 0);
-        if (rc)
-                CERROR("error closing log: rc %d\n", rc);
-        RETURN(rc);
+/*
+ * Close the file behind \a handle, if there is one, and release the llog
+ * name kept in handle->lgh_name.
+ *
+ * llog_lvfs_open() stores the name when the llog file does not exist yet
+ * (lgh_file == NULL), so the name is released here even when there is no
+ * file to close; otherwise a handle that was opened for creation but never
+ * created would leak it.
+ *
+ * \a env is not used by this function.
+ *
+ * Returns 0 if there was nothing to close, otherwise the result of
+ * filp_close().  The handle's file and name pointers are NULL on return.
+ */
+static int llog_lvfs_close(const struct lu_env *env,
+                          struct llog_handle *handle)
+{
+       int rc = 0;
+
+       ENTRY;
+
+       if (handle->lgh_file != NULL) {
+               rc = filp_close(handle->lgh_file, 0);
+               if (rc)
+                       CERROR("%s: error closing llog #"LPX64"#"LPX64"#%08x: "
+                              "rc = %d\n", handle->lgh_ctxt->loc_obd->obd_name,
+                              handle->lgh_id.lgl_oid, handle->lgh_id.lgl_oseq,
+                              handle->lgh_id.lgl_ogen, rc);
+               /* Cleared even if the close reported an error. */
+               handle->lgh_file = NULL;
+       }
+
+       if (handle->lgh_name) {
+               OBD_FREE(handle->lgh_name, strlen(handle->lgh_name) + 1);
+               handle->lgh_name = NULL;
+       }
+       RETURN(rc);
 }
 
-static int llog_lvfs_destroy(struct llog_handle *handle)
+static int llog_lvfs_destroy(const struct lu_env *env,
+                            struct llog_handle *handle)
 {
         struct dentry *fdentry;
         struct obdo *oa;
         struct obd_device *obd = handle->lgh_ctxt->loc_exp->exp_obd;
         char *dir;
-        int rc;
+        void *th;
+        struct inode *inode;
+        int rc, rc1;
         ENTRY;
 
-        /* COMPAT_146 */
-        if (strcmp(obd->obd_type->typ_name, LUSTRE_MDS_NAME) == 0)
-                dir = MDT_LOGS_DIR;
-        else
-                /* end COMPAT_146 */
-                dir = MOUNT_CONFIGS_DIR;
+        dir = MOUNT_CONFIGS_DIR;
 
+       LASSERT(handle->lgh_file);
         fdentry = handle->lgh_file->f_dentry;
+        inode = fdentry->d_parent->d_inode;
         if (strcmp(fdentry->d_parent->d_name.name, dir) == 0) {
-                struct inode *inode = fdentry->d_parent->d_inode;
                 struct lvfs_run_ctxt saved;
+                struct vfsmount *mnt = mntget(handle->lgh_file->f_vfsmnt);
 
                 push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
                 dget(fdentry);
-                rc = llog_lvfs_close(handle);
-
-                if (rc == 0) {
-                        LOCK_INODE_MUTEX(inode);
-                        rc = vfs_unlink(inode, fdentry);
-                        UNLOCK_INODE_MUTEX(inode);
-                }
+               rc = llog_lvfs_close(env, handle);
+               if (rc == 0) {
+                       mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
+                       rc = ll_vfs_unlink(inode, fdentry, mnt);
+                       mutex_unlock(&inode->i_mutex);
+               }
+               mntput(mnt);
 
                 dput(fdentry);
                 pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
@@ -733,35 +813,47 @@ static int llog_lvfs_destroy(struct llog_handle *handle)
                 RETURN(-ENOMEM);
 
         oa->o_id = handle->lgh_id.lgl_oid;
-        oa->o_gr = handle->lgh_id.lgl_ogr;
+        oa->o_seq = handle->lgh_id.lgl_oseq;
         oa->o_generation = handle->lgh_id.lgl_ogen;
+#undef o_generation
         oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP | OBD_MD_FLGENER;
 
-        rc = llog_lvfs_close(handle);
+       rc = llog_lvfs_close(env, handle);
         if (rc)
                 GOTO(out, rc);
 
-        rc = obd_destroy(handle->lgh_ctxt->loc_exp, oa, NULL, NULL, NULL);
+        th = fsfilt_start_log(obd, inode, FSFILT_OP_UNLINK, NULL, 1);
+        if (IS_ERR(th)) {
+                CERROR("fsfilt_start failed: %ld\n", PTR_ERR(th));
+                GOTO(out, rc = PTR_ERR(th));
+        }
+
+        rc = obd_destroy(NULL, handle->lgh_ctxt->loc_exp, oa,
+                         NULL, NULL, NULL, NULL);
+
+        rc1 = fsfilt_commit(obd, inode, th, 0);
+        if (rc == 0 && rc1 != 0)
+                rc = rc1;
  out:
         OBDO_FREE(oa);
         RETURN(rc);
 }
 
 /* reads the catalog list */
-int llog_get_cat_list(struct obd_device *obd, struct obd_device *disk_obd,
-                      char *name, int count, struct llog_catid *idarray)
+int llog_get_cat_list(struct obd_device *disk_obd,
+                      char *name, int idx, int count, struct llog_catid *idarray)
 {
         struct lvfs_run_ctxt saved;
         struct l_file *file;
-        int rc;
+        int rc, rc1 = 0;
         int size = sizeof(*idarray) * count;
-        loff_t off = 0;
+        loff_t off = idx *  sizeof(*idarray);
         ENTRY;
 
-        if (!count) 
+        if (!count)
                 RETURN(0);
 
-        push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+        push_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
         file = filp_open(name, O_RDWR | O_CREAT | O_LARGEFILE, 0700);
         if (!file || IS_ERR(file)) {
                 rc = PTR_ERR(file);
@@ -769,15 +861,20 @@ int llog_get_cat_list(struct obd_device *obd, struct obd_device *disk_obd,
                        name, rc);
                 GOTO(out, rc);
         }
-        
+
         if (!S_ISREG(file->f_dentry->d_inode->i_mode)) {
                 CERROR("%s is not a regular file!: mode = %o\n", name,
                        file->f_dentry->d_inode->i_mode);
                 GOTO(out, rc = -ENOENT);
         }
 
-        CDEBUG(D_CONFIG, "cat list: disk size=%d, read=%d\n", 
-               (int)file->f_dentry->d_inode->i_size, size);
+        CDEBUG(D_CONFIG, "cat list: disk size=%d, read=%d\n",
+               (int)i_size_read(file->f_dentry->d_inode), size);
+
+        /* read for new ost index or for empty file */
+        memset(idarray, 0, size);
+        if (i_size_read(file->f_dentry->d_inode) < off)
+                GOTO(out, rc = 0);
 
         rc = fsfilt_read_record(disk_obd, file, idarray, size, &off);
         if (rc) {
@@ -787,27 +884,29 @@ int llog_get_cat_list(struct obd_device *obd, struct obd_device *disk_obd,
 
         EXIT;
  out:
-        pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+        pop_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
         if (file && !IS_ERR(file))
-                rc = filp_close(file, 0);
+                rc1 = filp_close(file, 0);
+        if (rc == 0)
+                rc = rc1;
         return rc;
 }
 EXPORT_SYMBOL(llog_get_cat_list);
 
 /* writes the cat list */
-int llog_put_cat_list(struct obd_device *obd, struct obd_device *disk_obd,
-                      char *name, int count, struct llog_catid *idarray)
+int llog_put_cat_list(struct obd_device *disk_obd,
+                      char *name, int idx, int count, struct llog_catid *idarray)
 {
         struct lvfs_run_ctxt saved;
         struct l_file *file;
-        int rc;
+        int rc, rc1 = 0;
         int size = sizeof(*idarray) * count;
-        loff_t off = 0;
+        loff_t off = idx * sizeof(*idarray);
 
-        if (!count) 
-                return (0);
+        if (!count)
+                GOTO(out1, rc = 0);
 
-        push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+        push_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
         file = filp_open(name, O_RDWR | O_CREAT | O_LARGEFILE, 0700);
         if (!file || IS_ERR(file)) {
                 rc = PTR_ERR(file);
@@ -824,104 +923,68 @@ int llog_put_cat_list(struct obd_device *obd, struct obd_device *disk_obd,
 
         rc = fsfilt_write_record(disk_obd, file, idarray, size, &off, 1);
         if (rc) {
-                CDEBUG(D_INODE,"OBD filter: error reading %s: rc %d\n",
+                CDEBUG(D_INODE,"OBD filter: error writeing %s: rc %d\n",
                        name, rc);
                 GOTO(out, rc);
         }
 
- out:
-        pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+out:
+        pop_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
         if (file && !IS_ERR(file))
-                rc = filp_close(file, 0);
-        RETURN(rc);
-}
+                rc1 = filp_close(file, 0);
 
-struct llog_operations llog_lvfs_ops = {
-        lop_write_rec:   llog_lvfs_write_rec,
-        lop_next_block:  llog_lvfs_next_block,
-        lop_prev_block:  llog_lvfs_prev_block,
-        lop_read_header: llog_lvfs_read_header,
-        lop_create:      llog_lvfs_create,
-        lop_destroy:     llog_lvfs_destroy,
-        lop_close:       llog_lvfs_close,
-        //        lop_cancel: llog_lvfs_cancel,
-};
-
-EXPORT_SYMBOL(llog_lvfs_ops);
-
-#else /* !__KERNEL__ */
-
-static int llog_lvfs_read_header(struct llog_handle *handle)
-{
-        LBUG();
-        return 0;
-}
-
-static int llog_lvfs_write_rec(struct llog_handle *loghandle,
-                               struct llog_rec_hdr *rec,
-                               struct llog_cookie *reccookie, int cookiecount,
-                               void *buf, int idx)
-{
-        LBUG();
-        return 0;
-}
-
-static int llog_lvfs_next_block(struct llog_handle *loghandle, int *cur_idx,
-                                int next_idx, __u64 *cur_offset, void *buf,
-                                int len)
-{
-        LBUG();
-        return 0;
-}
-
-static int llog_lvfs_prev_block(struct llog_handle *loghandle,
-                                int prev_idx, void *buf, int len)
-{
-        LBUG();
-        return 0;
-}
-
-static int llog_lvfs_create(struct llog_ctxt *ctxt, struct llog_handle **res,
-                            struct llog_logid *logid, char *name)
-{
-        LBUG();
-        return 0;
+        if (rc == 0)
+                rc = rc1;
+out1:
+        RETURN(rc);
 }
+EXPORT_SYMBOL(llog_put_cat_list);
 
-static int llog_lvfs_close(struct llog_handle *handle)
+static int llog_lvfs_declare_create(const struct lu_env *env, /* no-op handler installed as .lop_declare_create in llog_lvfs_ops */
+                                   struct llog_handle *res,
+                                   struct thandle *th) /* env, res and th are accepted but never read */
 {
-        LBUG();
-        return 0;
+       return 0; /* performs no work; unconditionally returns 0 */
 }
 
-static int llog_lvfs_destroy(struct llog_handle *handle)
+static int llog_lvfs_declare_write_rec(const struct lu_env *env, /* no-op handler installed as .lop_declare_write_rec in llog_lvfs_ops */
+                                      struct llog_handle *loghandle,
+                                      struct llog_rec_hdr *rec,
+                                      int idx, struct thandle *th) /* none of the five arguments is read */
 {
-        LBUG();
-        return 0;
+       return 0; /* performs no work; unconditionally returns 0 */
 }
 
-int llog_get_cat_list(struct obd_device *obd, struct obd_device *disk_obd,
-                      char *name, int count, struct llog_catid *idarray)
+struct llog_operations llog_lvfs_ops = { /* operation table: each lop_* slot is bound to the llog_lvfs_* function of the same name */
+       .lop_write_rec          = llog_lvfs_write_rec,
+       .lop_next_block         = llog_lvfs_next_block,
+       .lop_prev_block         = llog_lvfs_prev_block,
+       .lop_read_header        = llog_lvfs_read_header,
+       .lop_create             = llog_lvfs_create,
+       .lop_destroy            = llog_lvfs_destroy,
+       .lop_close              = llog_lvfs_close,
+       .lop_open               = llog_lvfs_open,
+       .lop_exist              = llog_lvfs_exist,
+       .lop_declare_create     = llog_lvfs_declare_create, /* handler that only returns 0 */
+       .lop_declare_write_rec  = llog_lvfs_declare_write_rec, /* handler that only returns 0 */
+}; /* members not named in this initializer are left zero-initialized */
+EXPORT_SYMBOL(llog_lvfs_ops);
+#else /* !__KERNEL__ */
+int llog_get_cat_list(struct obd_device *disk_obd, /* stub in the #else (!__KERNEL__) branch; same signature as the kernel version */
+                     char *name, int idx, int count,
+                     struct llog_catid *idarray) /* no argument is read by this stub */
 {
-        LBUG();
-        return 0;
+       LBUG(); /* the only action taken: no catalog data is read in this build */
+       return 0; /* value returned if execution continues past LBUG() */
 }
 
-int llog_put_cat_list(struct obd_device *obd, struct obd_device *disk_obd,
-                      char *name, int count, struct llog_catid *idarray)
+int llog_put_cat_list(struct obd_device *disk_obd, /* stub in the #else (!__KERNEL__) branch; same signature as the kernel version */
+                     char *name, int idx, int count,
+                     struct llog_catid *idarray) /* no argument is read by this stub */
 {
-        LBUG();
-        return 0;
+       LBUG(); /* the only action taken: no catalog data is written in this build */
+       return 0; /* value returned if execution continues past LBUG() */
 }
 
-struct llog_operations llog_lvfs_ops = {
-        lop_write_rec:   llog_lvfs_write_rec,
-        lop_next_block:  llog_lvfs_next_block,
-        lop_prev_block:  llog_lvfs_prev_block,
-        lop_read_header: llog_lvfs_read_header,
-        lop_create:      llog_lvfs_create,
-        lop_destroy:     llog_lvfs_destroy,
-        lop_close:       llog_lvfs_close,
-//        lop_cancel:      llog_lvfs_cancel,
-};
+struct llog_operations llog_lvfs_ops = {}; /* !__KERNEL__ branch: table defined with no handlers assigned, so every lop_* member is zero */
 #endif