1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (C) 2001-2003 Cluster File Systems, Inc.
5 * Author: Andreas Dilger <adilger@clusterfs.com>
7 * This file is part of Lustre, http://www.lustre.org.
9 * Lustre is free software; you can redistribute it and/or
10 * modify it under the terms of version 2 of the GNU General Public
11 * License as published by the Free Software Foundation.
13 * Lustre is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with Lustre; if not, write to the Free Software
20 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22 * OST<->MDS recovery logging infrastructure.
24 * Invariants in implementation:
25 * - we do not share logs among different OST<->MDS connections, so that
26 * if an OST or MDS fails it need only look at log(s) relevant to itself
29 #define DEBUG_SUBSYSTEM S_LOG
38 #include <liblustre.h>
41 #include <linux/obd.h>
42 #include <linux/obd_class.h>
43 #include <linux/lustre_log.h>
44 #include <linux/obd_ost.h>
45 #include <libcfs/list.h>
46 #include <linux/lvfs.h>
47 #include <linux/lustre_fsfilt.h>
48 #include "llog_internal.h"
/*
 * Write a padding record of len bytes into the log file.
 *
 * A record header of type LLOG_PAD_MAGIC and a record tail are written,
 * both carrying len and index.  The space between them is not written;
 * f_pos is simply moved forward over it.  len must be at least
 * LLOG_MIN_REC_SIZE and a multiple of 8 (asserted below).  Write
 * failures are reported through CERROR with the rc returned by
 * fsfilt_write_record().
 */
52 static int llog_lvfs_pad(struct obd_device *obd, struct l_file *file,
55 struct llog_rec_hdr rec = { 0 };
56 struct llog_rec_tail tail;
60 LASSERT(len >= LLOG_MIN_REC_SIZE && (len & 0x7) == 0);
/* Header and tail describe the same record: same length, same index. */
62 tail.lrt_len = rec.lrh_len = len;
63 tail.lrt_index = rec.lrh_index = index;
64 rec.lrh_type = LLOG_PAD_MAGIC;
66 rc = fsfilt_write_record(obd, file, &rec, sizeof(rec), &file->f_pos, 0);
68 CERROR("error writing padding record: rc %d\n", rc);
/* Skip the unused body of the pad record so the tail lands at its end.
 * Assumes the header write above advanced f_pos by sizeof(rec) --
 * TODO confirm against fsfilt_write_record(). */
72 file->f_pos += len - sizeof(rec) - sizeof(tail);
73 rc = fsfilt_write_record(obd, file, &tail, sizeof(tail),&file->f_pos,0);
75 CERROR("error writing padding record: rc %d\n", rc);
/*
 * Write one log record to the file through fsfilt_write_record(),
 * using file->f_pos as the running write position.
 *
 * Two write shapes appear below:
 *  - rec is written as one complete buffer of rec->lrh_len bytes
 *    (presumably the buf == NULL case; callers in this file pass a NULL
 *    buf together with the log header -- TODO confirm);
 *  - header, payload (buf) and tail are written as three pieces.  In
 *    this shape the incoming rec->lrh_len is taken as the payload size
 *    and rec->lrh_len is rewritten to header + payload + tail.
 *
 * off is presumably the file offset to write at -- TODO confirm.
 * Each failing write is reported through CERROR with its rc.
 */
83 static int llog_lvfs_write_blob(struct obd_device *obd, struct l_file *file,
84 struct llog_rec_hdr *rec, void *buf, loff_t off)
87 struct llog_rec_tail end;
/* Remember the position on entry so it can be restored at the end. */
88 loff_t saved_off = file->f_pos;
89 int buflen = rec->lrh_len;
/* Whole-record write: rec already holds the complete buflen bytes. */
95 rc = fsfilt_write_record(obd, file, rec, buflen,&file->f_pos,0);
97 CERROR("error writing log record: rc %d\n", rc);
/* Three-piece write: the stored length must cover header, payload
 * and tail. */
104 rec->lrh_len = sizeof(*rec) + buflen + sizeof(end);
105 rc = fsfilt_write_record(obd, file, rec, sizeof(*rec), &file->f_pos, 0);
107 CERROR("error writing log hdr: rc %d\n", rc);
111 rc = fsfilt_write_record(obd, file, buf, buflen, &file->f_pos, 0);
113 CERROR("error writing log buffer: rc %d\n", rc);
/* The tail repeats the length and index stored in the header. */
117 end.lrt_len = rec->lrh_len;
118 end.lrt_index = rec->lrh_index;
119 rc = fsfilt_write_record(obd, file, &end, sizeof(end), &file->f_pos, 0);
121 CERROR("error writing log tail: rc %d\n", rc);
/* Never leave f_pos before the position it had on entry. */
127 if (saved_off > file->f_pos)
128 file->f_pos = saved_off;
/*
 * Read size bytes from the log file into buf through
 * fsfilt_read_record().  The read position is passed via a local
 * offset variable (presumably initialised from off -- TODO confirm),
 * so file->f_pos is not handed to the read call.  A failure is
 * reported through CERROR with the rc of the read.
 */
133 static int llog_lvfs_read_blob(struct obd_device *obd, struct l_file *file,
134 void *buf, int size, loff_t off)
140 rc = fsfilt_read_record(obd, file, buf, size, &offset);
142 CERROR("error reading log record: rc %d\n", rc);
/*
 * Load and validate the on-disk log header into handle->lgh_hdr.
 *
 * The header is read with llog_lvfs_read_blob(), byte-swapped when
 * LLOG_REC_HDR_NEEDS_SWABBING() says so, and then checked for the
 * LLOG_HDR_MAGIC type and a length of LLOG_CHUNK_SIZE.  A zero-length
 * log file is not read at all.  The function also sets
 * handle->lgh_last_idx from the header's tail index and moves the
 * file position to the current end of file.
 */
148 static int llog_lvfs_read_header(struct llog_handle *handle)
150 struct obd_device *obd;
/* The in-memory header structure must be exactly one chunk in size. */
154 LASSERT(sizeof(*handle->lgh_hdr) == LLOG_CHUNK_SIZE);
156 obd = handle->lgh_ctxt->loc_exp->exp_obd;
158 if (handle->lgh_file->f_dentry->d_inode->i_size == 0) {
159 CDEBUG(D_HA, "not reading header from 0-byte log\n");
163 rc = llog_lvfs_read_blob(obd, handle->lgh_file, handle->lgh_hdr,
166 CERROR("error reading log header from %.*s\n",
167 handle->lgh_file->f_dentry->d_name.len,
168 handle->lgh_file->f_dentry->d_name.name);
170 struct llog_rec_hdr *llh_hdr = &handle->lgh_hdr->llh_hdr;
/* Swab first so the magic and length checks see host byte order. */
172 if (LLOG_REC_HDR_NEEDS_SWABBING(llh_hdr))
173 lustre_swab_llog_hdr(handle->lgh_hdr);
175 if (llh_hdr->lrh_type != LLOG_HDR_MAGIC) {
176 CERROR("bad log %.*s header magic: %#x (expected %#x)\n",
177 handle->lgh_file->f_dentry->d_name.len,
178 handle->lgh_file->f_dentry->d_name.name,
179 llh_hdr->lrh_type, LLOG_HDR_MAGIC);
181 } else if (llh_hdr->lrh_len != LLOG_CHUNK_SIZE) {
182 CERROR("incorrectly sized log %.*s header: %#x "
184 handle->lgh_file->f_dentry->d_name.len,
185 handle->lgh_file->f_dentry->d_name.name,
186 llh_hdr->lrh_len, LLOG_CHUNK_SIZE);
187 CERROR("you may need to re-run lconf --write_conf.\n");
/* The header tail records the last index used; position the file at
 * its current end. */
192 handle->lgh_last_idx = handle->lgh_hdr->llh_tail.lrt_index;
193 handle->lgh_file->f_pos = handle->lgh_file->f_dentry->d_inode->i_size;
/*
 * Write a record into the log behind loghandle, either overwriting an
 * existing slot or appending at the current file position.
 *
 * When appending, the record gets the next log index, its bit is set in
 * the header bitmap, the header is rewritten, and then the record
 * itself is written.  A padding record is inserted first when the new
 * record would otherwise straddle an LLOG_CHUNK_SIZE boundary.  If
 * reccookie is given and the write succeeded, it is filled with the log
 * id, the record index and (on the append path) a subsystem derived
 * from the record type.
 */
198 /* returns negative on error; 0 if success && reccookie == 0; 1 otherwise */
199 /* appends if idx == -1, otherwise overwrites record idx. */
200 static int llog_lvfs_write_rec(struct llog_handle *loghandle,
201 struct llog_rec_hdr *rec,
202 struct llog_cookie *reccookie, int cookiecount,
205 struct llog_log_hdr *llh;
206 int reclen = rec->lrh_len, index, rc;
207 struct llog_rec_tail *lrt;
208 struct obd_device *obd;
213 llh = loghandle->lgh_hdr;
214 file = loghandle->lgh_file;
215 obd = loghandle->lgh_ctxt->loc_exp->exp_obd;
217 /* record length should not be bigger than LLOG_CHUNK_SIZE */
219 rc = (reclen > LLOG_CHUNK_SIZE - sizeof(struct llog_rec_hdr) -
220 sizeof(struct llog_rec_tail)) ? -E2BIG : 0;
222 rc = (reclen > LLOG_CHUNK_SIZE) ? -E2BIG : 0;
229 /* no header: only allowed to insert record 1 */
/* NOTE(review): the test below is idx != 1, but the message says
 * "idx != -1" -- confirm which one is intended. */
230 if (idx != 1 && !file->f_dentry->d_inode->i_size) {
231 CERROR("idx != -1 in empty log\n");
235 if (idx && llh->llh_size && llh->llh_size != reclen)
238 rc = llog_lvfs_write_blob(obd, file, &llh->llh_hdr, NULL, 0);
239 /* we are done if we only write the header or on error */
/* Overwrite in place: slot idx is located after the log header,
 * computed as if every record had length rec->lrh_len. */
243 saved_offset = sizeof(*llh) + (idx-1)*rec->lrh_len;
244 rc = llog_lvfs_write_blob(obd, file, rec, buf, saved_offset);
245 if (rc == 0 && reccookie) {
246 reccookie->lgc_lgl = loghandle->lgh_id;
247 reccookie->lgc_index = idx;
253 /* Make sure that records don't cross a chunk boundary, so we can
254 * process them page-at-a-time if needed. If it will cross a chunk
255 * boundary, write in a fake (but referenced) entry to pad the chunk.
257 * We know that llog_current_log() will return a loghandle that is
258 * big enough to hold reclen, so all we care about is padding here.
260 left = LLOG_CHUNK_SIZE - (file->f_pos & (LLOG_CHUNK_SIZE - 1));
262 reclen = sizeof(*rec) + rec->lrh_len +
263 sizeof(struct llog_rec_tail);
265 /* NOTE: padding is a record, but no bit is set */
266 if (left != 0 && left != reclen &&
267 left < (reclen + LLOG_MIN_REC_SIZE)) {
268 loghandle->lgh_last_idx++;
269 rc = llog_lvfs_pad(obd, file, left, loghandle->lgh_last_idx);
272 /* if it's the last idx in log file, then return -ENOSPC */
273 if (loghandle->lgh_last_idx == LLOG_BITMAP_SIZE(llh) - 1)
/* Claim the next index for the record being appended. */
277 loghandle->lgh_last_idx++;
278 index = loghandle->lgh_last_idx;
279 LASSERT(index < LLOG_BITMAP_SIZE(llh));
280 rec->lrh_index = index;
/* Fill in the tail located in the last sizeof(*lrt) bytes of rec. */
282 lrt = (struct llog_rec_tail *)
283 ((char *)rec + rec->lrh_len - sizeof(*lrt));
284 lrt->lrt_len = rec->lrh_len;
285 lrt->lrt_index = rec->lrh_index;
/* A non-zero result from ext2_set_bit() is treated as "bit was already
 * set", which is considered fatal here. */
287 if (ext2_set_bit(index, llh->llh_bitmap)) {
288 CERROR("argh, index %u already set in log bitmap?\n", index);
289 LBUG(); /* should never happen */
292 llh->llh_tail.lrt_index = index;
/* Write the updated log header (bitmap and tail index), then the
 * record itself. */
294 rc = llog_lvfs_write_blob(obd, file, &llh->llh_hdr, NULL, 0);
298 rc = llog_lvfs_write_blob(obd, file, rec, buf, file->f_pos);
302 CDEBUG(D_HA, "added record "LPX64": idx: %u, %u bytes\n",
303 loghandle->lgh_id.lgl_oid, index, rec->lrh_len);
/* Hand the caller a cookie naming this record; the subsystem is chosen
 * from the record type, with -1 for any other type. */
304 if (rc == 0 && reccookie) {
305 reccookie->lgc_lgl = loghandle->lgh_id;
306 reccookie->lgc_index = index;
307 if (rec->lrh_type == MDS_UNLINK_REC)
308 reccookie->lgc_subsys = LLOG_UNLINK_ORIG_CTXT;
309 else if (rec->lrh_type == OST_SZ_REC)
310 reccookie->lgc_subsys = LLOG_SIZE_ORIG_CTXT;
311 else if (rec->lrh_type == OST_RAID1_REC)
312 reccookie->lgc_subsys = LLOG_RD1_ORIG_CTXT;
314 reccookie->lgc_subsys = -1;
317 if (rc == 0 && rec->lrh_type == LLOG_GEN_REC)
323 /* We can skip reading at least as many log blocks as the number of
324 * minimum sized log records we are skipping. If it turns out
325 * that we are not far enough along the log (because the
326 * actual records are larger than minimum size) we just skip
327 * some more records. */
/* Advances *off by (goal - curr - 1) minimum-sized records and rounds
 * the result down to a multiple of LLOG_CHUNK_SIZE.  The mask form
 * assumes LLOG_CHUNK_SIZE is a power of two. */
329 static void llog_skip_over(__u64 *off, int curr, int goal)
333 *off = (*off + (goal-curr-1) * LLOG_MIN_REC_SIZE) &
334 ~(LLOG_CHUNK_SIZE - 1);
339 * - cur_offset to the furthest point read in the log file
340 * - cur_idx to the log index preceding cur_offset
341 * returns -EIO/-EINVAL on error
/*
 * Read log blocks into buf until one is found whose last record has an
 * index of at least next_idx.
 *
 * len must be a non-zero multiple of LLOG_CHUNK_SIZE.  On each pass the
 * offset is first advanced with llog_skip_over(), then up to len bytes
 * are read at *cur_offset.  *cur_idx is updated to the index stored in
 * the tail of the last record read.  Reading stops being attempted once
 * *cur_offset reaches the file size.
 */
343 static int llog_lvfs_next_block(struct llog_handle *loghandle, int *cur_idx,
344 int next_idx, __u64 *cur_offset, void *buf,
/* Only whole, non-empty multiples of the chunk size are accepted. */
350 if (len == 0 || len & (LLOG_CHUNK_SIZE - 1))
353 CDEBUG(D_OTHER, "looking for log index %u (cur idx %u off "LPU64")\n",
354 next_idx, *cur_idx, *cur_offset);
356 while (*cur_offset < loghandle->lgh_file->f_dentry->d_inode->i_size) {
357 struct llog_rec_hdr *rec;
358 struct llog_rec_tail *tail;
/* Jump ahead based on how many records separate *cur_idx from
 * next_idx. */
361 llog_skip_over(cur_offset, *cur_idx, next_idx);
364 rc = fsfilt_read_record(loghandle->lgh_ctxt->loc_exp->exp_obd,
365 loghandle->lgh_file, buf, len,
369 CERROR("Cant read llog block at log id "LPU64
370 "/%u offset "LPU64"\n",
371 loghandle->lgh_id.lgl_oid,
372 loghandle->lgh_id.lgl_ogen,
377 /* put number of bytes read into rc to make code simpler */
378 rc = ppos - *cur_offset;
381 if (rc == 0) /* end of file, nothing to do */
/* Anything shorter than a record tail cannot be a valid block. */
384 if (rc < sizeof(*tail)) {
385 CERROR("Invalid llog block at log id "LPU64"/%u offset "
386 LPU64"\n", loghandle->lgh_id.lgl_oid,
387 loghandle->lgh_id.lgl_ogen, *cur_offset);
/* The tail of the last record sits in the final bytes that were read. */
392 tail = (struct llog_rec_tail *)((char *)buf + rc - sizeof(struct llog_rec_tail));
394 if (LLOG_REC_HDR_NEEDS_SWABBING(rec)) {
395 lustre_swab_llog_rec(rec, tail);
398 *cur_idx = tail->lrt_index;
400 /* this shouldn't happen */
401 if (tail->lrt_index == 0) {
402 CERROR("Invalid llog tail at log id "LPU64"/%u offset "
403 LPU64"\n", loghandle->lgh_id.lgl_oid,
404 loghandle->lgh_id.lgl_ogen, *cur_offset);
/* This block ends before the wanted record: keep reading. */
407 if (tail->lrt_index < next_idx)
410 /* sanity check that the start of the new buffer is no farther
411 * than the record that we wanted. This shouldn't happen. */
412 if (rec->lrh_index > next_idx) {
413 CERROR("missed desired record? %u > %u\n",
414 rec->lrh_index, next_idx);
/*
 * Open the log file "LOGS/<name>" with the given flags and mode.
 *
 * The path is built in a temporary PATH_MAX buffer that is freed
 * before returning.  Yields ERR_PTR(-ENOMEM) when the buffer cannot be
 * allocated, ERR_PTR(-ENAMETOOLONG) when the formatted path reaches
 * PATH_MAX - 1 characters or more, and otherwise the result of
 * l_filp_open().
 */
422 static struct file *llog_filp_open(char *name, int flags, int mode)
428 OBD_ALLOC(logname, PATH_MAX);
430 return ERR_PTR(-ENOMEM);
/* snprintf() reports the length it wanted to write, so an overlong
 * name is detected by comparing that against the buffer size. */
432 len = snprintf(logname, PATH_MAX, "LOGS/%s", name);
433 if (len >= PATH_MAX - 1) {
434 filp = ERR_PTR(-ENAMETOOLONG);
436 filp = l_filp_open(logname, flags, mode);
438 CERROR("logfile creation %s: %ld\n", logname,
442 OBD_FREE(logname, PATH_MAX);
446 /* This is a callback from the llog_* functions.
447 * Assumes caller has already pushed us into the kernel context. */
/*
 * Allocates a log handle and attaches an open log file to it.  Three
 * ways of obtaining the file appear below (presumably selected by
 * which of logid / name the caller supplies -- TODO confirm):
 *  - by logid: look the object up with obd_lvfs_fid2dentry() and open
 *    the existing file; a dentry without an inode is an error;
 *  - by name: open or create "LOGS/<name>" via llog_filp_open() and
 *    derive the log id from the inode number and generation (group 1);
 *  - otherwise: create a new object with obd_create(), open it, and
 *    take the log id from the created object's attributes.
 * On success the handle is bound to ctxt.  The error path frees the
 * handle with llog_free_handle().
 */
448 static int llog_lvfs_create(struct llog_ctxt *ctxt, struct llog_handle **res,
449 struct llog_logid *logid, char *name)
451 struct llog_handle *handle;
452 struct obd_device *obd;
453 struct l_dentry *dchild = NULL;
454 struct obdo *oa = NULL;
455 int rc = 0, cleanup_phase = 1;
456 int open_flags = O_RDWR | O_CREAT | O_LARGEFILE;
459 handle = llog_alloc_handle();
465 LASSERT(ctxt->loc_exp);
466 obd = ctxt->loc_exp->exp_obd;
/* Open by log id: the object must already exist. */
469 dchild = obd_lvfs_fid2dentry(ctxt->loc_exp, logid->lgl_oid,
470 logid->lgl_ogen, logid->lgl_ogr);
472 if (IS_ERR(dchild)) {
473 rc = PTR_ERR(dchild);
474 CERROR("error looking up logfile "LPX64":0x%x: rc %d\n",
475 logid->lgl_oid, logid->lgl_ogen, rc);
480 if (dchild->d_inode == NULL) {
482 CERROR("nonexistent log file "LPX64":"LPX64": rc %d\n",
483 logid->lgl_oid, logid->lgl_ogr, rc);
/* No O_CREAT here, unlike open_flags used for the by-name path. */
487 handle->lgh_file = l_dentry_open(&obd->obd_ctxt, dchild,
488 O_RDWR | O_LARGEFILE);
489 if (IS_ERR(handle->lgh_file)) {
490 rc = PTR_ERR(handle->lgh_file);
491 CERROR("error opening logfile "LPX64"0x%x: rc %d\n",
492 logid->lgl_oid, logid->lgl_ogen, rc);
496 /* assign the value of lgh_id for handle directly */
497 handle->lgh_id = *logid;
/* Open (or create) by name under LOGS/. */
500 handle->lgh_file = llog_filp_open(name, open_flags, 0644);
501 if (IS_ERR(handle->lgh_file))
502 GOTO(cleanup, rc = PTR_ERR(handle->lgh_file));
504 handle->lgh_id.lgl_ogr = 1;
505 handle->lgh_id.lgl_oid =
506 handle->lgh_file->f_dentry->d_inode->i_ino;
507 handle->lgh_id.lgl_ogen =
508 handle->lgh_file->f_dentry->d_inode->i_generation;
/* Neither id nor name: create a fresh backing object. */
512 GOTO(cleanup, rc = -ENOMEM);
513 /* XXX get some filter group constants */
515 oa->o_valid = OBD_MD_FLGENER | OBD_MD_FLGROUP;
516 rc = obd_create(ctxt->loc_exp, oa, NULL, NULL);
520 dchild = obd_lvfs_fid2dentry(ctxt->loc_exp, oa->o_id,
521 oa->o_generation, oa->o_gr);
524 GOTO(cleanup, rc = PTR_ERR(dchild));
526 handle->lgh_file = l_dentry_open(&obd->obd_ctxt, dchild,
528 if (IS_ERR(handle->lgh_file))
529 GOTO(cleanup, rc = PTR_ERR(handle->lgh_file));
531 handle->lgh_id.lgl_ogr = oa->o_gr;
532 handle->lgh_id.lgl_oid = oa->o_id;
533 handle->lgh_id.lgl_ogen = oa->o_generation;
536 handle->lgh_ctxt = ctxt;
542 switch (cleanup_phase) {
546 llog_free_handle(handle);
/*
 * Close the file attached to the log handle with filp_close() and
 * report a failure through CERROR.  Nothing in the visible code frees
 * the handle itself.
 */
551 static int llog_lvfs_close(struct llog_handle *handle)
556 rc = filp_close(handle->lgh_file, 0);
558 CERROR("error closing log: rc %d\n", rc);
/*
 * Close the log and remove its backing storage.
 *
 * A log whose parent directory is named "LOGS" is a named log file: it
 * is closed and then unlinked from that directory with vfs_unlink(),
 * all inside a pushed obd run context.  Any other log is treated as an
 * object: its id, group and generation are copied into an obdo, the log
 * is closed, and the object is removed with obd_destroy().
 */
562 static int llog_lvfs_destroy(struct llog_handle *handle)
564 struct dentry *fdentry;
569 fdentry = handle->lgh_file->f_dentry;
570 if (!strcmp(fdentry->d_parent->d_name.name, "LOGS")) {
571 struct obd_device *obd = handle->lgh_ctxt->loc_exp->exp_obd;
/* inode is the parent directory, as needed by vfs_unlink(). */
572 struct inode *inode = fdentry->d_parent->d_inode;
573 struct obd_run_ctxt saved;
575 push_ctxt(&saved, &obd->obd_ctxt, NULL);
577 rc = llog_lvfs_close(handle);
581 rc = vfs_unlink(inode, fdentry);
586 pop_ctxt(&saved, &obd->obd_ctxt, NULL);
/* Object-backed log: identify the object by the handle's log id. */
594 oa->o_id = handle->lgh_id.lgl_oid;
595 oa->o_gr = handle->lgh_id.lgl_ogr;
596 oa->o_generation = handle->lgh_id.lgl_ogen;
597 oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP | OBD_MD_FLGENER;
599 rc = llog_lvfs_close(handle);
603 rc = obd_destroy(handle->lgh_ctxt->loc_exp, oa, NULL, NULL);
609 /* reads the catalog list */
/*
 * Opens the file called name inside obd's run context, creating it if
 * missing (O_CREAT), and reads count struct llog_catid entries into
 * idarray through fsfilt_read_record() on disk_obd.  A file that is
 * not a regular file is rejected with -ENOENT.
 */
610 int llog_get_cat_list(struct obd_device *obd, struct obd_device *disk_obd,
611 char *name, int count, struct llog_catid *idarray)
613 struct obd_run_ctxt saved;
/* Total number of bytes to transfer. */
616 int size = sizeof(*idarray) * count;
621 push_ctxt(&saved, &obd->obd_ctxt, NULL);
622 file = filp_open(name, O_RDWR | O_CREAT | O_LARGEFILE, 0700);
623 if (!file || IS_ERR(file)) {
625 CERROR("OBD filter: cannot open/create %s: rc = %d\n",
630 if (!S_ISREG(file->f_dentry->d_inode->i_mode)) {
631 CERROR("%s is not a regular file!: mode = %o\n", name,
632 file->f_dentry->d_inode->i_mode);
633 GOTO(out, rc = -ENOENT);
636 rc = fsfilt_read_record(disk_obd, file, idarray, size, &off);
638 CDEBUG(D_INODE,"OBD filter: error reading %s: rc %d\n",
644 pop_ctxt(&saved, &obd->obd_ctxt, NULL);
/* NOTE(review): rc is overwritten with the filp_close() result here, so
 * an earlier error code held in rc may be lost -- confirm intended. */
645 if (file && !IS_ERR(file))
646 rc = filp_close(file, 0);
649 EXPORT_SYMBOL(llog_get_cat_list);
651 /* writes the cat list */
/*
 * Opens the file called name inside obd's run context, creating it if
 * missing (O_CREAT), and writes count struct llog_catid entries from
 * idarray through fsfilt_write_record() on disk_obd.  A file that is
 * not a regular file is rejected with -ENOENT.
 */
652 int llog_put_cat_list(struct obd_device *obd, struct obd_device *disk_obd,
653 char *name, int count, struct llog_catid *idarray)
655 struct obd_run_ctxt saved;
/* Total number of bytes to transfer. */
658 int size = sizeof(*idarray) * count;
663 push_ctxt(&saved, &obd->obd_ctxt, NULL);
664 file = filp_open(name, O_RDWR | O_CREAT | O_LARGEFILE, 0700);
665 if (!file || IS_ERR(file)) {
667 CERROR("OBD filter: cannot open/create %s: rc = %d\n",
672 if (!S_ISREG(file->f_dentry->d_inode->i_mode)) {
673 CERROR("%s is not a regular file!: mode = %o\n", name,
674 file->f_dentry->d_inode->i_mode);
675 GOTO(out, rc = -ENOENT);
/* The last argument is 1 here, while the log record writes elsewhere in
 * this file pass 0; presumably a force-sync flag -- TODO confirm. */
678 rc = fsfilt_write_record(disk_obd, file, idarray, size, &off, 1);
/* NOTE(review): this is the write path, yet the message below says
 * "error reading" -- likely copied from llog_get_cat_list(). */
680 CDEBUG(D_INODE,"OBD filter: error reading %s: rc %d\n",
686 pop_ctxt(&saved, &obd->obd_ctxt, NULL);
/* NOTE(review): rc is overwritten with the filp_close() result here, so
 * an earlier error code held in rc may be lost -- confirm intended. */
687 if (file && !IS_ERR(file))
688 rc = filp_close(file, 0);
/*
 * Operation table binding the generic llog operations to the LVFS
 * implementations in this file.  It is exported for use by other
 * modules.  No cancel operation is installed: that entry is commented
 * out.
 */
692 struct llog_operations llog_lvfs_ops = {
693 lop_write_rec: llog_lvfs_write_rec,
694 lop_next_block: llog_lvfs_next_block,
695 lop_read_header: llog_lvfs_read_header,
696 lop_create: llog_lvfs_create,
697 lop_destroy: llog_lvfs_destroy,
698 lop_close: llog_lvfs_close,
699 // lop_cancel: llog_lvfs_cancel,
702 EXPORT_SYMBOL(llog_lvfs_ops);
704 #else /* !__KERNEL__ */
/* !__KERNEL__ build variant of the lop_read_header operation. */
706 static int llog_lvfs_read_header(struct llog_handle *handle)
/* !__KERNEL__ build variant of the lop_write_rec operation. */
712 static int llog_lvfs_write_rec(struct llog_handle *loghandle,
713 struct llog_rec_hdr *rec,
714 struct llog_cookie *reccookie, int cookiecount,
/* !__KERNEL__ build variant of the lop_next_block operation. */
721 static int llog_lvfs_next_block(struct llog_handle *loghandle, int *cur_idx,
722 int next_idx, __u64 *cur_offset, void *buf,
/* !__KERNEL__ build variant of the lop_create operation. */
729 static int llog_lvfs_create(struct llog_ctxt *ctxt, struct llog_handle **res,
730 struct llog_logid *logid, char *name)
/* !__KERNEL__ build variant of the lop_close operation. */
736 static int llog_lvfs_close(struct llog_handle *handle)
/* !__KERNEL__ build variant of the lop_destroy operation. */
742 static int llog_lvfs_destroy(struct llog_handle *handle)
/* !__KERNEL__ build variant of llog_get_cat_list(); same signature as
 * the kernel version. */
748 int llog_get_cat_list(struct obd_device *obd, struct obd_device *disk_obd,
749 char *name, int count, struct llog_catid *idarray)
/* !__KERNEL__ build variant of llog_put_cat_list(); same signature as
 * the kernel version. */
755 int llog_put_cat_list(struct obd_device *obd, struct obd_device *disk_obd,
756 char *name, int count, struct llog_catid *idarray)
762 struct llog_operations llog_lvfs_ops = {
763 lop_write_rec: llog_lvfs_write_rec,
764 lop_next_block: llog_lvfs_next_block,
765 lop_read_header: llog_lvfs_read_header,
766 lop_create: llog_lvfs_create,
767 lop_destroy: llog_lvfs_destroy,
768 lop_close: llog_lvfs_close,
769 // lop_cancel: llog_lvfs_cancel,