Whamcloud - gitweb
Let's even be specific about the fact that it's the script library that we
[fs/lustre-release.git] / lustre / lvfs / llog_lvfs.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  *  Copyright (C) 2001-2003 Cluster File Systems, Inc.
5  *   Author: Andreas Dilger <adilger@clusterfs.com>
6  *
7  *   This file is part of Lustre, http://www.lustre.org.
8  *
9  *   Lustre is free software; you can redistribute it and/or
10  *   modify it under the terms of version 2 of the GNU General Public
11  *   License as published by the Free Software Foundation.
12  *
13  *   Lustre is distributed in the hope that it will be useful,
14  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
15  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  *   GNU General Public License for more details.
17  *
18  *   You should have received a copy of the GNU General Public License
19  *   along with Lustre; if not, write to the Free Software
20  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21  *
22  * OST<->MDS recovery logging infrastructure.
23  *
24  * Invariants in implementation:
25  * - we do not share logs among different OST<->MDS connections, so that
26  *   if an OST or MDS fails it need only look at log(s) relevant to itself
27  */
28
29 #define DEBUG_SUBSYSTEM S_LOG
30
31 #ifndef EXPORT_SYMTAB
32 #define EXPORT_SYMTAB
33 #endif
34
35 #ifdef __KERNEL__
36 #include <linux/fs.h>
37 #else
38 #include <liblustre.h>
39 #endif
40
41 #include <linux/lvfs.h>
42 #include <linux/lustre_fsfilt.h>
43 #include <linux/lustre_log.h>
44
45 #ifdef __KERNEL__
46
47 static int llog_lvfs_pad(struct llog_ctxt *ctxt, struct l_file *file,
48                          int len, int index)
49 {
50         struct llog_rec_hdr rec;
51         struct llog_rec_tail tail;
52         int rc;
53         ENTRY;
54
55         LASSERT(len >= LLOG_MIN_REC_SIZE && (len & 0x7) == 0);
56
57         tail.lrt_len = rec.lrh_len = cpu_to_le32(len);
58         tail.lrt_index = rec.lrh_index = cpu_to_le32(index);
59         rec.lrh_type = 0;
60
61         rc = llog_fsfilt_write_record(ctxt, file, &rec, sizeof(rec),
62                                       &file->f_pos, 0);
63         if (rc) {
64                 CERROR("error writing padding record: rc %d\n", rc);
65                 goto out;
66         }
67
68         file->f_pos += len - sizeof(rec) - sizeof(tail);
69         rc = llog_fsfilt_write_record(ctxt, file, &tail, sizeof(tail),
70                                       &file->f_pos, 0);
71         if (rc) {
72                 CERROR("error writing padding record: rc %d\n", rc);
73                 goto out;
74         }
75
76  out:
77         RETURN(rc);
78 }
79
80 static int llog_lvfs_write_blob(struct llog_ctxt *ctxt, struct l_file *file,
81                                 struct llog_rec_hdr *rec, void *buf, loff_t off)
82 {
83         int rc;
84         struct llog_rec_tail end;
85         loff_t saved_off = file->f_pos;
86         int buflen = le32_to_cpu(rec->lrh_len);
87
88         ENTRY;
89         file->f_pos = off;
90
91         if (!buf) {
92                 rc = llog_fsfilt_write_record(ctxt, file, rec, buflen,
93                                               &file->f_pos, 0);
94                 if (rc) {
95                         CERROR("error writing log record: rc %d\n", rc);
96                         goto out;
97                 }
98                 GOTO(out, rc = 0);
99         }
100
101         /* the buf case */
102         rec->lrh_len = cpu_to_le32(sizeof(*rec) + buflen + sizeof(end));
103         rc = llog_fsfilt_write_record(ctxt, file, rec, sizeof(*rec),
104                                       &file->f_pos, 0);
105         if (rc) {
106                 CERROR("error writing log hdr: rc %d\n", rc);
107                 goto out;
108         }
109
110         rc = llog_fsfilt_write_record(ctxt, file, buf, buflen,
111                                       &file->f_pos, 0);
112         if (rc) {
113                 CERROR("error writing log buffer: rc %d\n", rc);
114                 goto out;
115         }
116
117         end.lrt_len = rec->lrh_len;
118         end.lrt_index = rec->lrh_index;
119         rc = llog_fsfilt_write_record(ctxt, file, &end, sizeof(end),
120                                       &file->f_pos, 0);
121         if (rc) {
122                 CERROR("error writing log tail: rc %d\n", rc);
123                 goto out;
124         }
125
126         rc = 0;
127  out:
128         if (saved_off > file->f_pos)
129                 file->f_pos = saved_off;
130         LASSERT(rc <= 0);
131         RETURN(rc);
132 }
133
134 static int llog_lvfs_read_blob(struct llog_ctxt *ctxt, struct l_file *file,
135                                void *buf, int size, loff_t off)
136 {
137         loff_t offset = off;
138         int rc;
139         ENTRY;
140
141         rc = llog_fsfilt_read_record(ctxt, file, buf, size, &offset);
142         if (rc) {
143                 CERROR("error reading log record: rc %d\n", rc);
144                 RETURN(rc);
145         }
146         RETURN(0);
147 }
148
149 static int llog_lvfs_read_header(struct llog_handle *handle)
150 {
151         struct llog_ctxt *ctxt = handle->lgh_ctxt;
152         int rc;
153         ENTRY;
154
155         LASSERT(sizeof(*handle->lgh_hdr) == LLOG_CHUNK_SIZE);
156         LASSERT(ctxt != NULL);
157
158         if (handle->lgh_file->f_dentry->d_inode->i_size == 0) {
159                 CDEBUG(D_HA, "not reading header from 0-byte log\n");
160                 RETURN(LLOG_EEMPTY);
161         }
162
163         rc = llog_lvfs_read_blob(ctxt, handle->lgh_file, handle->lgh_hdr,
164                                  LLOG_CHUNK_SIZE, 0);
165         if (rc)
166                 CERROR("error reading log header\n");
167
168         handle->lgh_last_idx = le32_to_cpu(handle->lgh_hdr->llh_tail.lrt_index);
169         handle->lgh_file->f_pos = handle->lgh_file->f_dentry->d_inode->i_size;
170
171         RETURN(rc);
172 }
173
174 /* returns negative in on error; 0 if success && reccookie == 0; 1 otherwise */
175 /* appends if idx == -1, otherwise overwrites record idx. */
176 static int llog_lvfs_write_rec(struct llog_handle *loghandle,
177                                struct llog_rec_hdr *rec,
178                                struct llog_cookie *reccookie,
179                                int cookiecount,
180                                void *buf, int idx)
181 {
182         struct llog_log_hdr *llh;
183         int reclen = le32_to_cpu(rec->lrh_len), index, rc;
184         struct llog_rec_tail *lrt;
185         struct llog_ctxt *ctxt = loghandle->lgh_ctxt;
186         struct file *file;
187         loff_t offset;
188         size_t left;
189         ENTRY;
190
191         llh = loghandle->lgh_hdr;
192         file = loghandle->lgh_file;
193
194         /* record length should not bigger than LLOG_CHUNK_SIZE */
195         if (buf)
196                 rc = (reclen > LLOG_CHUNK_SIZE - sizeof(struct llog_rec_hdr)
197                       - sizeof(struct llog_rec_tail)) ? -E2BIG : 0;
198         else
199                 rc = (reclen > LLOG_CHUNK_SIZE) ? -E2BIG : 0;
200         if (rc)
201                 RETURN(rc);
202
203         if (idx != -1) {
204                 loff_t saved_offset;
205
206                 /* no header: only allowed to insert record 1 */
207                 if (idx > 1 && !file->f_dentry->d_inode->i_size) {
208                         CERROR("idx != -1 in empty log\n");
209                         LBUG();
210                 }
211
212                 if (idx && llh->llh_size && llh->llh_size != reclen)
213                         RETURN(-EINVAL);
214
215                 rc = llog_lvfs_write_blob(ctxt, file, &llh->llh_hdr, NULL, 0);
216                 /* we are done if we only write the header or on error */
217                 if (rc || idx == 0)
218                         RETURN(rc);
219
220                 saved_offset = sizeof(*llh) + (idx-1)*le32_to_cpu(rec->lrh_len);
221                 rc = llog_lvfs_write_blob(ctxt, file, rec, buf, saved_offset);
222                 if (rc == 0 && reccookie) {
223                         reccookie->lgc_lgl = loghandle->lgh_id;
224                         reccookie->lgc_index = idx;
225                         rc = 1;
226                 }
227                 RETURN(rc);
228         }
229
230         /* Make sure that records don't cross a chunk boundary, so we can
231          * process them page-at-a-time if needed.  If it will cross a chunk
232          * boundary, write in a fake (but referenced) entry to pad the chunk.
233          *
234          * We know that llog_current_log() will return a loghandle that is
235          * big enough to hold reclen, so all we care about is padding here.
236          */
237         left = LLOG_CHUNK_SIZE - (file->f_pos & (LLOG_CHUNK_SIZE - 1));
238         if (buf)
239                 reclen = sizeof(*rec) + le32_to_cpu(rec->lrh_len) +
240                          sizeof(struct llog_rec_tail);
241
242         /* NOTE: padding is a record, but no bit is set */
243         if (left != 0 && left != reclen &&
244             left < (reclen + LLOG_MIN_REC_SIZE)) {
245                 loghandle->lgh_last_idx++;
246                 rc = llog_lvfs_pad(ctxt, file, left, loghandle->lgh_last_idx);
247                 if (rc)
248                         RETURN(rc);
249                 /* if it's the last idx in log file, then return -ENOSPC */
250                 if (loghandle->lgh_last_idx == LLOG_BITMAP_SIZE(llh) - 1)
251                         RETURN(-ENOSPC);
252         }
253
254         loghandle->lgh_last_idx++;
255         index = loghandle->lgh_last_idx;
256         LASSERT(index < LLOG_BITMAP_SIZE(llh));
257         rec->lrh_index = cpu_to_le32(index);
258         if (buf == NULL) {
259                 lrt = (void *)rec + le32_to_cpu(rec->lrh_len) - sizeof(*lrt);
260                 lrt->lrt_len = rec->lrh_len;
261                 lrt->lrt_index = rec->lrh_index;
262         }
263         if (ext2_set_bit(index, llh->llh_bitmap)) {
264                 CERROR("argh, index %u already set in log bitmap?\n", index);
265                 LBUG(); /* should never happen */
266         }
267         llh->llh_count = cpu_to_le32(le32_to_cpu(llh->llh_count) + 1);
268         llh->llh_tail.lrt_index = cpu_to_le32(index);
269
270         offset = 0;
271         rc = llog_lvfs_write_blob(ctxt, file, &llh->llh_hdr, NULL, 0);
272         if (rc)
273                 RETURN(rc);
274
275         CDEBUG(D_HA, "adding record "LPX64": idx: %u, %u bytes off: %lld\n",
276                loghandle->lgh_id.lgl_oid, index, le32_to_cpu(rec->lrh_len),
277                file->f_pos);
278
279         rc = llog_lvfs_write_blob(ctxt, file, rec, buf, file->f_pos);
280         if (rc)
281                 RETURN(rc);
282
283         if (rc == 0 && reccookie) {
284                 if (llog_cookie_get_flags(reccookie) & LLOG_COOKIE_REPLAY) {
285                         LASSERTF(EQ_LOGID(reccookie->lgc_lgl,loghandle->lgh_id),
286                                  "lgc_lgl.oid/gr "LPU64"/"LPU64" lgh_id.oid/gr"
287                                  LPU64"/"LPU64"\n",
288                                  reccookie->lgc_lgl.lgl_oid,
289                                  reccookie->lgc_lgl.lgl_ogr,
290                                  loghandle->lgh_id.lgl_oid,
291                                  loghandle->lgh_id.lgl_oid);
292                         LASSERTF(reccookie->lgc_index == index,
293                                  "lgc_index %u != index %u\n",
294                                  reccookie->lgc_index, index);
295                 } else {
296                         reccookie->lgc_lgl = loghandle->lgh_id;
297                         reccookie->lgc_index = index;
298                         llog_cookie_add_flags(reccookie, LLOG_COOKIE_REPLAY);
299                 }
300
301                 if (le32_to_cpu(rec->lrh_type) == MDS_UNLINK_REC)
302                         reccookie->lgc_subsys = LLOG_UNLINK_ORIG_CTXT;
303                 else if (le32_to_cpu(rec->lrh_type) == OST_SZ_REC)
304                         reccookie->lgc_subsys = LLOG_SIZE_ORIG_CTXT;
305                 else if (le32_to_cpu(rec->lrh_type) == OST_RAID1_REC)
306                         reccookie->lgc_subsys = LLOG_RD1_ORIG_CTXT;
307                 else
308                         reccookie->lgc_subsys = -1;
309                 rc = 1;
310         }
311         if (rc == 0 && (le32_to_cpu(rec->lrh_type) == LLOG_GEN_REC ||
312             le32_to_cpu(rec->lrh_type) == SMFS_UPDATE_REC))
313                 rc = 1;
314
315         RETURN(rc);
316 }
317
318 /* We can skip reading at least as many log blocks as the number of
319 * minimum sized log records we are skipping.  If it turns out
320 * that we are not far enough along the log (because the
321 * actual records are larger than minimum size) we just skip
322 * some more records. */
323
324 static void llog_skip_over(__u64 *off, int curr, int goal)
325 {
326         if (goal <= curr)
327                 return;
328         *off = (*off + (goal-curr-1) * LLOG_MIN_REC_SIZE) &
329                 ~(LLOG_CHUNK_SIZE - 1);
330 }
331
332 /* sets:
333  *  - curr_offset to the furthest point read in the log file
334  *  - curr_idx to the log index preceeding curr_offset
335  * returns -EIO/-EINVAL on error
336  */
337 static int llog_lvfs_next_block(struct llog_handle *loghandle, int *curr_idx,
338                                 int next_idx, __u64 *curr_offset, void *buf,
339                                 int len)
340 {
341         struct llog_ctxt *ctxt = loghandle->lgh_ctxt;
342         ENTRY;
343
344         if (len == 0 || len & (LLOG_CHUNK_SIZE - 1))
345                 RETURN(-EINVAL);
346
347         CDEBUG(D_OTHER, "looking for log index %u (cur idx %u off "LPU64")\n",
348                next_idx, *curr_idx, *curr_offset);
349
350         while (*curr_offset < loghandle->lgh_file->f_dentry->d_inode->i_size) {
351                 struct llog_rec_hdr *rec;
352                 struct llog_rec_tail *tail;
353                 loff_t ppos;
354                 int nbytes, rc;
355
356                 llog_skip_over(curr_offset, *curr_idx, next_idx);
357
358                 ppos = *curr_offset;
359                 rc = llog_fsfilt_read_record(ctxt, loghandle->lgh_file,
360                                              buf, len, &ppos);
361
362                 if (rc) {
363                         CERROR("Cant read llog block at log id "LPU64
364                                "/%u offset "LPU64"\n",
365                                loghandle->lgh_id.lgl_oid,
366                                loghandle->lgh_id.lgl_ogen,
367                                *curr_offset);
368                         RETURN(rc);
369                 }
370
371                 nbytes = ppos - *curr_offset;
372                 *curr_offset = ppos;
373
374                 if (nbytes == 0) /* end of file, nothing to do */
375                         RETURN(0);
376
377                 if (nbytes < sizeof(*tail)) {
378                         CERROR("Invalid llog block at log id "LPU64"/%u offset "
379                                LPU64"\n", loghandle->lgh_id.lgl_oid,
380                                loghandle->lgh_id.lgl_ogen, *curr_offset);
381                         RETURN(-EINVAL);
382                 }
383
384                 tail = buf + nbytes - sizeof(struct llog_rec_tail);
385                 *curr_idx = le32_to_cpu(tail->lrt_index);
386
387                 /* this shouldn't happen */
388                 if (tail->lrt_index == 0) {
389                         CERROR("Invalid llog tail at log id "LPU64"/%u offset "
390                                LPU64"\n", loghandle->lgh_id.lgl_oid,
391                                loghandle->lgh_id.lgl_ogen, *curr_offset);
392                         RETURN(-EINVAL);
393                 }
394                 if (le32_to_cpu(tail->lrt_index) < next_idx) {
395                         memset(buf, 0, len);
396                         continue;
397                 }
398
399                 /* sanity check that the start of the new buffer is no farther
400                  * than the record that we wanted.  This shouldn't happen. */
401                 rec = buf;
402                 if (le32_to_cpu(rec->lrh_index) > next_idx) {
403                         CERROR("missed desired record? %u > %u\n",
404                                le32_to_cpu(rec->lrh_index), next_idx);
405                         RETURN(-ENOENT);
406                 }
407                 RETURN(0);
408         }
409         RETURN(-EIO);
410 }
411
412 static int llog_lvfs_prev_block(struct llog_handle *loghandle,
413                                 int prev_idx, void *buf, int len)
414 {
415         struct llog_ctxt *ctxt = loghandle->lgh_ctxt;
416         __u64 curr_offset;
417         int rc;
418         ENTRY;
419
420         if (len == 0 || len & (LLOG_CHUNK_SIZE - 1))
421                 RETURN(-EINVAL);
422
423         CDEBUG(D_OTHER, "looking for log index %u \n", prev_idx);
424
425         curr_offset = LLOG_CHUNK_SIZE;
426         llog_skip_over(&curr_offset, 0, prev_idx);
427
428         while (curr_offset < loghandle->lgh_file->f_dentry->d_inode->i_size) {
429                 struct llog_rec_hdr *rec;
430                 struct llog_rec_tail *tail;
431                 loff_t ppos;
432
433                 ppos = curr_offset;
434                 rc = llog_fsfilt_read_record(ctxt, loghandle->lgh_file,
435                                              buf, len, &ppos);
436
437                 if (rc) {
438                         CERROR("Cant read llog block at log id "LPU64
439                                "/%u offset "LPU64"\n",
440                                loghandle->lgh_id.lgl_oid,
441                                loghandle->lgh_id.lgl_ogen,
442                                curr_offset);
443                         RETURN(rc);
444                 }
445
446                 /* put number of bytes read into rc to make code simpler */
447                 rc = ppos - curr_offset;
448                 curr_offset = ppos;
449
450                 if (rc == 0) /* end of file, nothing to do */
451                         RETURN(0);
452
453                 if (rc < sizeof(*tail)) {
454                         CERROR("Invalid llog block at log id "LPU64"/%u offset "
455                                LPU64"\n", loghandle->lgh_id.lgl_oid,
456                                loghandle->lgh_id.lgl_ogen, curr_offset);
457                         RETURN(-EINVAL);
458                 }
459
460                 tail = buf + rc - sizeof(struct llog_rec_tail);
461
462                 /* this shouldn't happen */
463                 if (tail->lrt_index == 0) {
464                         CERROR("Invalid llog tail at log id "LPU64"/%u offset "
465                                LPU64"\n", loghandle->lgh_id.lgl_oid,
466                                loghandle->lgh_id.lgl_ogen, curr_offset);
467                         RETURN(-EINVAL);
468                 }
469                 if (le32_to_cpu(tail->lrt_index) < prev_idx)
470                         continue;
471
472                 /* sanity check that the start of the new buffer is no farther
473                  * than the record that we wanted.  This shouldn't happen. */
474                 rec = buf;
475                 if (le32_to_cpu(rec->lrh_index) > prev_idx) {
476                         CERROR("missed desired record? %u > %u\n",
477                                le32_to_cpu(rec->lrh_index), prev_idx);
478                         RETURN(-ENOENT);
479                 }
480                 RETURN(0);
481         }
482         RETURN(-EIO);
483 }
484
485 static struct file *llog_filp_open(char *name, int flags, int mode)
486 {
487         char *logname;
488         struct file *filp;
489         int len;
490
491         OBD_ALLOC(logname, PATH_MAX);
492         if (logname == NULL)
493                 return ERR_PTR(-ENOMEM);
494
495         len = snprintf(logname, PATH_MAX, "LOGS/%s", name);
496         if (len >= PATH_MAX - 1) {
497                 filp = ERR_PTR(-ENAMETOOLONG);
498         } else {
499                 filp = l_filp_open(logname, flags, mode);
500                 if (IS_ERR(filp)) {
501                         CERROR("logfile %s(%s): %ld\n",
502                                flags & O_CREAT ? "create" : "open", logname,
503                                PTR_ERR(filp));
504                 }
505         }
506
507         OBD_FREE(logname, PATH_MAX);
508         return filp;
509 }
510
511 /* creates object for the case when we have no obd (smfs). */
512 static struct file *
513 llog_object_create_alone(struct llog_ctxt *ctxt, struct llog_logid *lgh_id)
514 {
515         struct file *filp;
516         int rc = 0;
517         ENTRY;
518
519         LASSERT(lgh_id != NULL);
520         if (lgh_id->lgl_oid) {
521                 struct dentry *dchild;
522                 char id_name[LL_ID_NAMELEN];
523                 int id_len = 0;
524
525                 down(&ctxt->loc_objects_dir->d_inode->i_sem);
526                 id_len = ll_id2str(id_name, lgh_id->lgl_oid, 
527                                        lgh_id->lgl_ogen);
528                 
529                 dchild = lookup_one_len(id_name, ctxt->loc_objects_dir, 
530                                         id_len);
531                 if (IS_ERR(dchild)) {
532                         up(&ctxt->loc_objects_dir->d_inode->i_sem);
533                         RETURN((struct file *)dchild);
534                 }
535                 if (dchild->d_inode == NULL) {
536                         struct dentry_params dp;
537                         struct inode *inode;
538
539                         dchild->d_fsdata = (void *) &dp;
540                         dp.p_ptr = NULL;
541                         dp.p_inum = lgh_id->lgl_oid;
542                         rc = ll_vfs_create(ctxt->loc_objects_dir->d_inode,
543                                            dchild, S_IFREG, NULL);
544                         if (dchild->d_fsdata == (void *)(unsigned long)lgh_id->lgl_oid)
545                                 dchild->d_fsdata = NULL;
546                         if (rc) {
547                                 CDEBUG(D_INODE, "err during create: %d\n", rc);
548                                 dput(dchild);
549                                 up(&ctxt->loc_objects_dir->d_inode->i_sem);
550                                 RETURN(ERR_PTR(rc));
551                         }
552                         inode = dchild->d_inode;
553                         LASSERT(inode->i_ino == lgh_id->lgl_oid);
554                         inode->i_generation = lgh_id->lgl_ogen;
555                         CDEBUG(D_HA, "recreated ino %lu with gen %u\n",
556                                inode->i_ino, inode->i_generation);
557                         mark_inode_dirty(inode);
558                 }
559
560                 mntget(ctxt->loc_lvfs_ctxt->pwdmnt);
561                 filp = dentry_open(dchild, ctxt->loc_lvfs_ctxt->pwdmnt,
562                                     O_RDWR | O_LARGEFILE);
563                 if (IS_ERR(filp)) {
564                         dput(dchild);
565                         up(&ctxt->loc_objects_dir->d_inode->i_sem);
566                         RETURN(filp);
567                 }
568                 if (!S_ISREG(filp->f_dentry->d_inode->i_mode)) {
569                         CERROR("%s is not a regular file!: mode = %o\n", 
570                                id_name, filp->f_dentry->d_inode->i_mode);
571                         filp_close(filp, 0);
572                         up(&ctxt->loc_objects_dir->d_inode->i_sem);
573                         RETURN(ERR_PTR(-ENOENT));
574                 }
575
576                 up(&ctxt->loc_objects_dir->d_inode->i_sem);
577                 RETURN(filp);
578
579         } else {
580                 unsigned int tmpname = ll_insecure_random_int();
581                 char id_name[LL_ID_NAMELEN];
582                 struct dentry *new_child, *parent;
583                 int err, id_len;
584                 void *handle;
585
586                 sprintf(id_name, "OBJECTS/%u", tmpname);
587                 filp = filp_open(id_name, O_CREAT | O_EXCL, 0644);
588                 if (IS_ERR(filp)) {
589                         rc = PTR_ERR(filp);
590                         if (rc == -EEXIST) {
591                                 CERROR("impossible object name collision %u\n",
592                                         tmpname);
593                                 LBUG();
594                         }
595                         CERROR("error creating tmp object %u: rc %d\n", tmpname, rc);
596                         RETURN(filp);
597                 }
598
599                 id_len = ll_id2str(id_name, filp->f_dentry->d_inode->i_ino,
600                                        filp->f_dentry->d_inode->i_generation);
601                 parent = filp->f_dentry->d_parent;
602                 down(&parent->d_inode->i_sem);
603                 new_child = lookup_one_len(id_name, parent, id_len);
604                 if (IS_ERR(new_child)) {
605                         CERROR("getting neg dentry for obj rename: %d\n", rc);
606                         GOTO(out_close, rc = PTR_ERR(new_child));
607                 }
608                 if (new_child->d_inode != NULL) {
609                         CERROR("impossible non-negative obj dentry %lu:%u!\n",
610                                 filp->f_dentry->d_inode->i_ino,
611                                 filp->f_dentry->d_inode->i_generation);
612                         LBUG();
613                 }
614
615                 handle = llog_fsfilt_start(ctxt, parent->d_inode, FSFILT_OP_RENAME, NULL);
616                 if (IS_ERR(handle))
617                         GOTO(out_dput, rc = PTR_ERR(handle));
618
619                 lock_kernel();
620                 rc = vfs_rename(parent->d_inode, filp->f_dentry,
621                                 parent->d_inode, new_child);
622                 unlock_kernel();
623                 if (rc)
624                         CERROR("error renaming new object %lu:%u: rc %d\n",
625                                 filp->f_dentry->d_inode->i_ino,
626                                 filp->f_dentry->d_inode->i_generation, rc);
627
628                 err = llog_fsfilt_commit(ctxt, parent->d_inode, handle, 0);
629                 if (!rc)
630                         rc = err;
631
632         out_dput:
633                 dput(new_child);
634         out_close:
635                 up(&parent->d_inode->i_sem);
636                 if (rc) {
637                         filp_close(filp, 0);
638                         filp = ERR_PTR(rc);
639                 } else {
640                         /* FIXME: is this group 1 is correct? */
641                         lgh_id->lgl_ogr = 1;
642                         lgh_id->lgl_oid = filp->f_dentry->d_inode->i_ino;
643                         lgh_id->lgl_ogen = filp->f_dentry->d_inode->i_generation;
644                 }
645                 RETURN(filp);
646         }
647 }
648
649 /* creates object for generic case (obd exists) */
650 static struct file *
651 llog_object_create_generic(struct llog_ctxt *ctxt, struct llog_logid *lgh_id)
652 {
653         struct file *filp = NULL;
654         struct dentry *dchild;
655         struct obd_device *obd;
656         struct obdo *oa = NULL;
657         int open_flags = O_RDWR | O_LARGEFILE;
658         int rc = 0;
659         ENTRY;
660
661         obd = ctxt->loc_exp->exp_obd;
662         LASSERT(obd != NULL);
663
664         if (lgh_id->lgl_oid) {
665                 dchild = obd_lvfs_id2dentry(ctxt->loc_exp, lgh_id->lgl_oid,
666                                             lgh_id->lgl_ogen, lgh_id->lgl_ogr);
667                 if (IS_ERR(dchild) == -ENOENT) {
668                         OBD_ALLOC(oa, sizeof(*oa));
669                         if (!oa)
670                                 RETURN(ERR_PTR(-ENOMEM));
671
672                         oa->o_id = lgh_id->lgl_oid;
673                         oa->o_generation = lgh_id->lgl_ogen;
674                         oa->o_gr = lgh_id->lgl_ogr;
675                         oa->o_valid = OBD_MD_FLGENER | OBD_MD_FLGROUP;
676                         rc = obd_create(ctxt->loc_exp, oa, NULL, 0, NULL, NULL);
677                         if (rc) {
678                                 CDEBUG(D_INODE, "err during create: %d\n", rc);
679                                 GOTO(out_free_oa, rc);
680                         }
681                         CDEBUG(D_HA, "re-create log object "LPX64":0x%x:"LPX64"\n",
682                                lgh_id->lgl_oid, lgh_id->lgl_ogen, lgh_id->lgl_ogr);
683
684                         dchild = obd_lvfs_id2dentry(ctxt->loc_exp, lgh_id->lgl_oid,
685                                                     lgh_id->lgl_ogen, lgh_id->lgl_ogr);
686                 } else if (IS_ERR(dchild)) {
687                         CERROR("error looking up logfile "LPX64":0x%x: rc %d\n",
688                                lgh_id->lgl_oid, lgh_id->lgl_ogen, rc);
689                         RETURN((struct file *)dchild);
690                 }
691
692                 filp = l_dentry_open(&obd->obd_lvfs_ctxt, dchild, open_flags);
693                 if (IS_ERR(filp)) {
694                         l_dput(dchild);
695                         rc = PTR_ERR(filp);
696                         CERROR("error opening logfile "LPX64"0x%x: rc %d\n",
697                                lgh_id->lgl_oid, lgh_id->lgl_ogen, rc);
698                 }
699                 GOTO(out_free_oa, rc);
700         } else {
701                 /* this is important to work here over obd_create() as it manages 
702                   groups and we need it. Yet another reason is that mds_obd_create()
703                  is fully the same as old version of this function and this helps
704                  us to avoid code duplicating and layering violating. */
705                 OBD_ALLOC(oa, sizeof(*oa));
706                 if (!oa)
707                         RETURN(ERR_PTR(-ENOMEM));
708
709                 oa->o_gr = FILTER_GROUP_LLOG;
710                 oa->o_valid = OBD_MD_FLGENER | OBD_MD_FLGROUP;
711                 rc = obd_create(ctxt->loc_exp, oa, NULL, 0, NULL, NULL);
712                 if (rc)
713                         GOTO(out_free_oa, rc);
714
715                 dchild = obd_lvfs_id2dentry(ctxt->loc_exp, oa->o_id,
716                                             oa->o_generation, oa->o_gr);
717                 if (IS_ERR(dchild))
718                         GOTO(out_free_oa, rc = PTR_ERR(dchild));
719
720                 filp = l_dentry_open(&obd->obd_lvfs_ctxt, dchild,
721                                      open_flags);
722                 if (IS_ERR(filp)) {
723                         l_dput(dchild);
724                         GOTO(out_free_oa, rc = PTR_ERR(filp));
725                 }
726
727                 /* group 1 is not longer valid, we use the group which is set 
728                 by obd_create()->mds_obd_create(). */
729                 lgh_id->lgl_ogr = oa->o_gr;
730                 lgh_id->lgl_oid = oa->o_id;
731                 lgh_id->lgl_ogen = oa->o_generation;
732         }
733
734 out_free_oa:
735         if (rc)
736                 filp = ERR_PTR(rc);
737         if (oa)
738                 OBD_FREE(oa, sizeof(*oa));
739         RETURN(filp);
740 }
741
742 static struct file *
743 llog_object_create(struct llog_ctxt *ctxt, struct llog_logid *lgh_id)
744 {
745         if (ctxt->loc_alone)
746                 return llog_object_create_alone(ctxt, lgh_id);
747         else
748                 return llog_object_create_generic(ctxt, lgh_id);
749 }
750
751 static int llog_add_link_object(struct llog_ctxt *ctxt, struct llog_logid logid,
752                                 struct dentry *dentry)
753 {
754         struct dentry *new_child;
755         char id_name[LL_ID_NAMELEN];
756         void *handle;
757         int id_len, rc = 0, err;
758         ENTRY;
759         
760         id_len = ll_id2str(id_name, logid.lgl_oid, logid.lgl_ogen);
761         down(&ctxt->loc_objects_dir->d_inode->i_sem);
762         new_child = lookup_one_len(id_name, ctxt->loc_objects_dir, id_len);
763         if (IS_ERR(new_child)) {
764                 CERROR("getting neg dentry for obj rename: %d\n", rc);
765                 GOTO(out, rc = PTR_ERR(new_child));
766         }
767         if (new_child->d_inode == dentry->d_inode)
768                 GOTO(out_dput, rc);
769         if (new_child->d_inode != NULL) {
770                 CERROR("impossible non-negative obj dentry "LPX64":%u!\n",
771                        logid.lgl_oid, logid.lgl_ogen);
772                 LBUG();
773         }
774         handle = llog_fsfilt_start(ctxt, ctxt->loc_objects_dir->d_inode,
775                                    FSFILT_OP_LINK, NULL);
776         if (IS_ERR(handle))
777                 GOTO(out_dput, rc = PTR_ERR(handle));
778         
779         lock_kernel();
780         rc = vfs_link(dentry, ctxt->loc_objects_dir->d_inode, new_child);
781         unlock_kernel();
782         if (rc) {
783                 CERROR("error link new object "LPX64":%08x: rc %d\n",
784                        logid.lgl_oid, logid.lgl_ogen, rc);
785                 /* it doesn't make much sense to get -EEXIST here */
786                 LASSERTF(rc != -EEXIST, "bug 3490: dentry: %p "
787                          "dir->d_ionode %p new_child: %p  \n",
788                          dentry, ctxt->loc_objects_dir->d_inode, new_child);
789         }
790         err = llog_fsfilt_commit(ctxt, ctxt->loc_objects_dir->d_inode, handle, 0);
791 out_dput:
792         l_dput(new_child);
793 out:
794         up(&ctxt->loc_objects_dir->d_inode->i_sem);
795         RETURN(rc);
796 }
797
798 static int llog_lvfs_open(struct llog_ctxt *ctxt, struct llog_handle **res,
799                           struct llog_logid *logid, char *name, int flags)
800 {
801         struct llog_handle *handle;
802         struct lvfs_run_ctxt saved;
803         int rc = 0;
804         int open_flags = O_RDWR | O_LARGEFILE;
805         ENTRY;
806
807         if (flags & OBD_LLOG_FL_CREATE)
808                 open_flags |= O_CREAT;
809
810         handle = llog_alloc_handle();
811         if (handle == NULL)
812                 RETURN(-ENOMEM);
813         *res = handle;
814         
815         LASSERT(ctxt);
816         if (ctxt->loc_lvfs_ctxt)
817                 push_ctxt(&saved, ctxt->loc_lvfs_ctxt, NULL);
818         
819         if (logid != NULL) {
820                 handle->lgh_file = llog_object_create(ctxt, logid);
821                 if (IS_ERR(handle->lgh_file)) {
822                         CERROR("cannot create/open llog object "LPX64":%x "
823                                "error = %ld", logid->lgl_oid, logid->lgl_ogen,
824                                PTR_ERR(handle->lgh_file));
825                         GOTO(cleanup, rc = PTR_ERR(handle->lgh_file));
826                 }
827                 handle->lgh_id = *logid;
828
829         } else if (name) {
830                 handle->lgh_file = llog_filp_open(name, open_flags, 0644);
831                 if (IS_ERR(handle->lgh_file)) {
832                         CERROR("cannot open %s file, error = %ld\n", 
833                                name, PTR_ERR(handle->lgh_file));
834                         GOTO(cleanup, rc = PTR_ERR(handle->lgh_file));
835                 }
836                 LASSERT(handle->lgh_file->f_dentry->d_parent == ctxt->loc_logs_dir);
837                 
838                 handle->lgh_id.lgl_ogr = 1;
839                 handle->lgh_id.lgl_oid = handle->lgh_file->f_dentry->d_inode->i_ino;
840                 handle->lgh_id.lgl_ogen = handle->lgh_file->f_dentry->d_inode->i_generation;
841                 rc = llog_add_link_object(ctxt, handle->lgh_id, handle->lgh_file->f_dentry);
842                 if (rc)
843                         GOTO(cleanup, rc);
844
845         } else {
846                 handle->lgh_file = llog_object_create(ctxt, &handle->lgh_id);
847                 if (IS_ERR(handle->lgh_file)) {
848                         CERROR("cannot create llog object, error = %ld\n", 
849                                PTR_ERR(handle->lgh_file));
850                         GOTO(cleanup, rc = PTR_ERR(handle->lgh_file));
851                 }
852         }
853
854         handle->lgh_ctxt = ctxt;
855 finish:
856         if (ctxt->loc_lvfs_ctxt)
857                 pop_ctxt(&saved, ctxt->loc_lvfs_ctxt, NULL);
858         RETURN(rc);
859 cleanup:
860         llog_free_handle(handle);
861         goto finish;
862 }
863
864 static int llog_lvfs_close(struct llog_handle *handle)
865 {
866         int rc;
867         ENTRY;
868
869         rc = filp_close(handle->lgh_file, 0);
870         if (rc)
871                 CERROR("error closing log: rc %d\n", rc);
872         RETURN(rc);
873 }
874
875 static int llog_lvfs_destroy(struct llog_handle *loghandle)
876 {
877         struct llog_ctxt *ctxt = loghandle->lgh_ctxt;
878         struct lvfs_run_ctxt saved;
879         struct dentry *fdentry;
880         struct inode *parent_inode;
881         char id_name[LL_ID_NAMELEN];
882         void *handle;
883         int rc = -EINVAL, err, id_len;
884         ENTRY;
885         
886         if (ctxt->loc_lvfs_ctxt)
887                 push_ctxt(&saved, ctxt->loc_lvfs_ctxt, NULL);
888         
889         fdentry = loghandle->lgh_file->f_dentry;
890         parent_inode = fdentry->d_parent->d_inode;
891         
892         if (!strcmp((char *)fdentry->d_parent->d_name.name, "LOGS")) {
893                 LASSERT(parent_inode == ctxt->loc_logs_dir->d_inode);
894                 
895                 id_len = ll_id2str(id_name, fdentry->d_inode->i_ino,
896                                    fdentry->d_inode->i_generation);
897                 dget(fdentry);
898                 rc = llog_lvfs_close(loghandle);
899                 if (rc) {
900                         dput(fdentry);
901                         GOTO(out, rc);
902                 }
903                 
904                 handle = llog_fsfilt_start(ctxt, parent_inode,
905                                            FSFILT_OP_UNLINK, NULL);
906                 if (IS_ERR(handle)) {
907                         dput(fdentry);
908                         GOTO(out, rc = PTR_ERR(handle));
909                 }
910                 
911                 down(&parent_inode->i_sem);
912                 rc = vfs_unlink(parent_inode, fdentry);
913                 up(&parent_inode->i_sem);
914                 dput(fdentry);
915                 
916                 if (!rc) {
917                         down(&ctxt->loc_objects_dir->d_inode->i_sem);
918                         fdentry = lookup_one_len(id_name, ctxt->loc_objects_dir,
919                                                  id_len);
920                         if (IS_ERR(fdentry) || fdentry->d_inode == NULL) {
921                                 CERROR("destroy non_existent object %s\n", 
922                                        id_name);
923                                 GOTO(out_err, rc = IS_ERR(fdentry) ?
924                                      PTR_ERR(fdentry) : -ENOENT);
925                         }
926                         rc = vfs_unlink(ctxt->loc_objects_dir->d_inode, fdentry);
927                         l_dput(fdentry);
928 out_err:
929                         up(&ctxt->loc_objects_dir->d_inode->i_sem);
930                 }
931                 err = llog_fsfilt_commit(ctxt, parent_inode, handle, 0);
932                 if (err && !rc)
933                         err = rc;
934                 
935                 GOTO(out, rc);
936         }
937         if (ctxt->loc_alone) {
938                 if (!strcmp((char *)fdentry->d_parent->d_name.name, "OBJECTS")) {
939                         LASSERT(parent_inode == ctxt->loc_objects_dir->d_inode);
940                         
941                         dget(fdentry);
942                         rc = llog_lvfs_close(loghandle);
943                         if (rc == 0) {
944                                 down(&parent_inode->i_sem);
945                                 rc = vfs_unlink(parent_inode, fdentry);
946                                 up(&parent_inode->i_sem);
947                         }
948                         dput(fdentry);
949                 }
950         } else {
951                 struct obdo *oa = NULL;
952  
953                 OBD_ALLOC(oa, sizeof(*oa));
954                 if (!oa)
955                         GOTO(out, rc = -ENOMEM);
956                 
957                 oa->o_id = loghandle->lgh_id.lgl_oid;
958                 oa->o_gr = loghandle->lgh_id.lgl_ogr;
959                 oa->o_generation = loghandle->lgh_id.lgl_ogen;
960                 oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP | OBD_MD_FLGENER;
961                 
962                 rc = llog_lvfs_close(loghandle);
963                 if (rc)
964                         GOTO(out_free_oa, rc);
965                 
966                 rc = obd_destroy(loghandle->lgh_ctxt->loc_exp, oa, NULL, NULL);
967 out_free_oa:
968                 OBD_FREE(oa, sizeof(*oa));
969         }
970 out:
971         if (ctxt->loc_lvfs_ctxt)
972                 pop_ctxt(&saved, ctxt->loc_lvfs_ctxt, NULL);
973         RETURN(rc);
974 }
975
976 /* reads the catalog list */
977 int llog_get_cat_list(struct lvfs_run_ctxt *ctxt,
978                       struct fsfilt_operations *fsops, const char *name,
979                       int count, struct llog_catid *idarray)
980 {
981         struct lvfs_run_ctxt saved;
982         struct l_file *file;
983         int size = sizeof(*idarray) * count;
984         loff_t off = 0;
985         int rc;
986
987         LASSERT(count);
988
989         if (ctxt)
990                 push_ctxt(&saved, ctxt, NULL);
991         file = l_filp_open(name, O_RDWR | O_CREAT | O_LARGEFILE, 0700);
992         if (!file || IS_ERR(file)) {
993                 rc = PTR_ERR(file);
994                 CERROR("OBD filter: cannot open/create %s: rc = %d\n",
995                        name, rc);
996                 GOTO(out, rc);
997         }
998
999         if (!S_ISREG(file->f_dentry->d_inode->i_mode)) {
1000                 CERROR("%s is not a regular file!: mode = %o\n", name,
1001                        file->f_dentry->d_inode->i_mode);
1002                 GOTO(out, rc = -ENOENT);
1003         }
1004
1005         rc = fsops->fs_read_record(file, idarray, size, &off);
1006         if (rc) {
1007                 CDEBUG(D_INODE,"OBD filter: error reading %s: rc %d\n",
1008                        name, rc);
1009                 GOTO(out, rc);
1010         }
1011
1012  out:
1013         if (file && !IS_ERR(file))
1014                 rc = filp_close(file, 0);
1015         if (ctxt)
1016                 pop_ctxt(&saved, ctxt, NULL);
1017         RETURN(rc);
1018 }
1019 EXPORT_SYMBOL(llog_get_cat_list);
1020
1021 /* writes the cat list */
1022 int llog_put_cat_list(struct lvfs_run_ctxt *ctxt,
1023                       struct fsfilt_operations *fsops, const char *name,
1024                       int count, struct llog_catid *idarray)
1025 {
1026         struct lvfs_run_ctxt saved;
1027         struct l_file *file;
1028         int size = sizeof(*idarray) * count;
1029         loff_t off = 0;
1030         int rc;
1031
1032         LASSERT(count);
1033
1034         if (ctxt)
1035                 push_ctxt(&saved, ctxt, NULL);
1036         file = filp_open(name, O_RDWR | O_CREAT | O_LARGEFILE, 0700);
1037         if (!file || IS_ERR(file)) {
1038                 rc = PTR_ERR(file);
1039                 CERROR("OBD filter: cannot open/create %s: rc = %d\n",
1040                        name, rc);
1041                 GOTO(out, rc);
1042         }
1043
1044         if (!S_ISREG(file->f_dentry->d_inode->i_mode)) {
1045                 CERROR("%s is not a regular file!: mode = %o\n", name,
1046                        file->f_dentry->d_inode->i_mode);
1047                 GOTO(out, rc = -ENOENT);
1048         }
1049
1050         rc = fsops->fs_write_record(file, idarray, size, &off, 1);
1051         if (rc) {
1052                 CDEBUG(D_INODE,"OBD filter: error reading %s: rc %d\n",
1053                        name, rc);
1054                 GOTO(out, rc);
1055         }
1056
1057  out:
1058         if (file && !IS_ERR(file))
1059                 rc = filp_close(file, 0);
1060         if (ctxt)
1061                 pop_ctxt(&saved, ctxt, NULL);
1062         RETURN(rc);
1063 }
1064 EXPORT_SYMBOL(llog_put_cat_list);
1065
1066 struct llog_operations llog_lvfs_ops = {
1067         lop_open:        llog_lvfs_open,
1068         lop_destroy:     llog_lvfs_destroy,
1069         lop_close:       llog_lvfs_close,
1070         lop_read_header: llog_lvfs_read_header,
1071         lop_write_rec:   llog_lvfs_write_rec,
1072         lop_next_block:  llog_lvfs_next_block,
1073         lop_prev_block:  llog_lvfs_prev_block,
1074 };
1075 EXPORT_SYMBOL(llog_lvfs_ops);
1076
1077 #else /* !__KERNEL__ */
1078
/* !__KERNEL__ placeholder: header reading has no implementation in this
 * build, so every call hits LBUG().  The return only satisfies the
 * prototype. */
static int
llog_lvfs_read_header(struct llog_handle *handle)
{
        LBUG();

        return 0;
}
1084
/* !__KERNEL__ placeholder: record writing has no implementation in this
 * build, so every call hits LBUG().  All arguments are ignored. */
static int
llog_lvfs_write_rec(struct llog_handle *loghandle, struct llog_rec_hdr *rec,
                    struct llog_cookie *reccookie, int cookiecount,
                    void *buf, int idx)
{
        LBUG();

        return 0;
}
1093
/* !__KERNEL__ placeholder: opening a log has no implementation in this
 * build, so every call hits LBUG().  *res is not written. */
static int
llog_lvfs_open(struct llog_ctxt *ctxt, struct llog_handle **res,
               struct llog_logid *logid, char *name, int flags)
{
        LBUG();

        return 0;
}
1100
/* !__KERNEL__ placeholder: closing a log has no implementation in this
 * build, so every call hits LBUG(). */
static int
llog_lvfs_close(struct llog_handle *handle)
{
        LBUG();

        return 0;
}
1106
/* !__KERNEL__ placeholder: destroying a log has no implementation in this
 * build, so every call hits LBUG(). */
static int
llog_lvfs_destroy(struct llog_handle *handle)
{
        LBUG();

        return 0;
}
1112
/* !__KERNEL__ placeholder for the catalog-list reader: not implemented in
 * this build, so every call hits LBUG().  @idarray is left untouched. */
int
llog_get_cat_list(struct lvfs_run_ctxt *ctxt, struct fsfilt_operations *fsops,
                  const char *name, int count, struct llog_catid *idarray)
{
        LBUG();

        return 0;
}
1120
/* !__KERNEL__ placeholder for the catalog-list writer: not implemented in
 * this build, so every call hits LBUG().  Nothing is written. */
int
llog_put_cat_list(struct lvfs_run_ctxt *ctxt, struct fsfilt_operations *fsops,
                  const char *name, int count, struct llog_catid *idarray)
{
        LBUG();

        return 0;
}
1128
/* !__KERNEL__ placeholder: reading the previous log block has no
 * implementation in this build, so every call hits LBUG().  @buf is left
 * untouched. */
int
llog_lvfs_prev_block(struct llog_handle *loghandle, int prev_idx,
                     void *buf, int len)
{
        LBUG();

        return 0;
}
1135
1136 int llog_lvfs_next_block(struct llog_handle *loghandle, int *curr_idx,
1137                          int next_idx, __u64 *offset, void *buf, int len)
1138 {
1139         LBUG();
1140         return 0;
1141 }
1142
1143 struct llog_operations llog_lvfs_ops = {
1144         lop_open:        llog_lvfs_open,
1145         lop_destroy:     llog_lvfs_destroy,
1146         lop_close:       llog_lvfs_close,
1147         lop_read_header: llog_lvfs_read_header,
1148         lop_write_rec:   llog_lvfs_write_rec,
1149         lop_next_block:  llog_lvfs_next_block,
1150         lop_prev_block:  llog_lvfs_prev_block,
1151 };
1152 #endif