4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2012, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Alexey Zhuravlev <bzzz@whamcloud.com>
39 * Author: Mikhail Pershin <tappro@whamcloud.com>
42 #define DEBUG_SUBSYSTEM S_FILTER
44 #include "ofd_internal.h"
/*
 * Write \a buf to object \a dt at offset \a off inside its own transaction
 * on ofd->ofd_osd.  The visible sequence is: create the transaction handle,
 * declare the record write (buf->lb_len bytes at *off), start the
 * transaction locally, perform the write, then stop the transaction.
 * NOTE(review): the return-code checks between these steps and the final
 * return are not visible in this listing; confirm against the full source.
 */
46 int ofd_record_write(const struct lu_env *env, struct ofd_device *ofd,
47 struct dt_object *dt, struct lu_buf *buf, loff_t *off)
56 th = dt_trans_create(env, ofd->ofd_osd);
60 rc = dt_declare_record_write(env, dt, buf->lb_len, *off, th);
62 rc = dt_trans_start_local(env, ofd->ofd_osd, th);
64 rc = dt_record_write(env, dt, buf, off, th);
66 dt_trans_stop(env, ofd->ofd_osd, th);
/*
 * Compute count as the smaller of \a batch and ofd->ofd_precreate_batch.
 * The device field is sampled while holding ofd_batch_lock.
 * NOTE(review): the return statement is not visible here; presumably
 * count is returned.
 */
71 int ofd_precreate_batch(struct ofd_device *ofd, int batch)
75 spin_lock(&ofd->ofd_batch_lock);
76 count = min(ofd->ofd_precreate_batch, batch);
77 spin_unlock(&ofd->ofd_batch_lock);
/*
 * Search ofd->ofd_seq_list for the entry whose os_seq equals \a seq.
 * The list is walked under the read side of ofd_seq_list_lock; on a match
 * os_refc is incremented before the lock is released.
 * NOTE(review): the return statements are not visible here; presumably the
 * matching entry is returned, and NULL when nothing matches.
 */
82 struct ofd_seq *ofd_seq_get(struct ofd_device *ofd, obd_seq seq)
86 read_lock(&ofd->ofd_seq_list_lock);
87 cfs_list_for_each_entry(oseq, &ofd->ofd_seq_list, os_list) {
88 if (oseq->os_seq == seq) {
89 cfs_atomic_inc(&oseq->os_refc);
90 read_unlock(&ofd->ofd_seq_list_lock);
94 read_unlock(&ofd->ofd_seq_list_lock);
/*
 * Tear down \a oseq.  Asserts that the entry is no longer linked on any
 * list and that it owns a LAST_ID object, then releases the reference on
 * that object with lu_object_put().
 * NOTE(review): freeing of oseq itself is not visible in this listing.
 */
98 static void ofd_seq_destroy(const struct lu_env *env,
101 LASSERT(cfs_list_empty(&oseq->os_list));
102 LASSERT(oseq->os_lastid_obj != NULL);
103 lu_object_put(env, &oseq->os_lastid_obj->do_lu);
/*
 * Drop one reference on \a oseq (os_refc).  When cfs_atomic_dec_and_test()
 * reports true the sequence is destroyed via ofd_seq_destroy().
 */
107 void ofd_seq_put(const struct lu_env *env, struct ofd_seq *oseq)
109 if (cfs_atomic_dec_and_test(&oseq->os_refc))
110 ofd_seq_destroy(env, oseq);
/*
 * Unlink \a oseq from whatever list it is on (leaving os_list
 * re-initialised) and drop one reference through ofd_seq_put().
 * NOTE(review): no lock is taken in the visible lines; the caller
 * presumably guarantees exclusive access to the list, confirm.
 */
113 static void ofd_seq_delete(const struct lu_env *env, struct ofd_seq *oseq)
115 cfs_list_del_init(&oseq->os_list);
116 ofd_seq_put(env, oseq);
120 * Add a new sequence to the OFD device.
122 * \param ofd OFD device
123 * \param new_seq new sequence to be added
125 * \retval the seq to be added or the existing seq
127 static struct ofd_seq *ofd_seq_add(const struct lu_env *env,
128 struct ofd_device *ofd,
129 struct ofd_seq *new_seq)
131 struct ofd_seq *os = NULL;
/* the list is both searched and modified under the write lock */
133 write_lock(&ofd->ofd_seq_list_lock);
134 cfs_list_for_each_entry(os, &ofd->ofd_seq_list, os_list) {
135 if (os->os_seq == new_seq->os_seq) {
/* same sequence number already listed: take a reference on the
 * existing entry, release the lock, then drop the reference on the
 * redundant new_seq */
136 cfs_atomic_inc(&os->os_refc);
137 write_unlock(&ofd->ofd_seq_list_lock);
138 /* The seq has not been added to the list */
139 ofd_seq_put(env, new_seq);
/* no match found: take an additional reference on new_seq (presumably
 * the one owned by the list, confirm), append it at the tail and
 * account for it in ofd_seq_count */
143 cfs_atomic_inc(&new_seq->os_refc);
144 cfs_list_add_tail(&new_seq->os_list, &ofd->ofd_seq_list);
145 ofd->ofd_seq_count++;
146 write_unlock(&ofd->ofd_seq_list_lock);
/*
 * Read oseq->os_last_oid into the local id while holding
 * os_last_oid_lock.
 * NOTE(review): the return statement is not visible here; presumably id
 * is returned.
 */
150 obd_id ofd_seq_last_oid(struct ofd_seq *oseq)
154 spin_lock(&oseq->os_last_oid_lock);
155 id = oseq->os_last_oid;
156 spin_unlock(&oseq->os_last_oid_lock);
/*
 * Raise oseq->os_last_oid to \a id.  The update happens under
 * os_last_oid_lock and only when \a id is strictly greater than the
 * stored value, so this function never lowers os_last_oid.
 */
161 void ofd_seq_last_oid_set(struct ofd_seq *oseq, obd_id id)
163 spin_lock(&oseq->os_last_oid_lock);
164 if (likely(oseq->os_last_oid < id))
165 oseq->os_last_oid = id;
166 spin_unlock(&oseq->os_last_oid_lock);
/*
 * Write the current last object id of \a oseq to its LAST_ID object
 * (oseq->os_lastid_obj) through ofd_record_write().  The value is stored
 * in little-endian form in the local tmp, which the per-thread buffer
 * info->fti_buf is pointed at.
 * NOTE(review): the write offset and the return statement are not visible
 * in this listing; confirm against the full source.
 */
169 int ofd_seq_last_oid_write(const struct lu_env *env, struct ofd_device *ofd,
170 struct ofd_seq *oseq)
172 struct ofd_thread_info *info = ofd_info(env);
/* describe tmp as the buffer to be written */
178 info->fti_buf.lb_buf = &tmp;
179 info->fti_buf.lb_len = sizeof(tmp);
182 CDEBUG(D_INODE, "%s: write last_objid for seq "LPX64" : "LPX64"\n",
183 ofd_name(ofd), oseq->os_seq, ofd_seq_last_oid(oseq));
/* ofd_seq_last_oid() is called a second time here, so the value written
 * is sampled separately from the one logged above */
185 tmp = cpu_to_le64(ofd_seq_last_oid(oseq));
187 rc = ofd_record_write(env, ofd, oseq->os_lastid_obj, &info->fti_buf,
/*
 * Undo the LWP item registrations held by the sequence site of \a ofd.
 * For the client sequence and for the server FLD, when the corresponding
 * pointer is set, the export slot is deregistered with
 * lustre_deregister_lwp_item() and then cleared to NULL.
 */
192 static void ofd_deregister_seq_exp(struct ofd_device *ofd)
194 struct seq_server_site *ss = &ofd->ofd_seq_site;
196 if (ss->ss_client_seq != NULL) {
197 lustre_deregister_lwp_item(&ss->ss_client_seq->lcs_exp);
198 ss->ss_client_seq->lcs_exp = NULL;
201 if (ss->ss_server_fld != NULL) {
202 lustre_deregister_lwp_item(&ss->ss_server_fld->lsf_control_exp);
203 ss->ss_server_fld->lsf_control_exp = NULL;
/*
 * Shut down and free the server FLD of \a ofd, if one is set, and reset
 * ss_server_fld to NULL.
 * NOTE(review): the return statement is not visible in this listing.
 */
207 static int ofd_fld_fini(const struct lu_env *env,
208 struct ofd_device *ofd)
210 struct seq_server_site *ss = &ofd->ofd_seq_site;
/* ss is the address of a member embedded in ofd, so the first half of
 * this test cannot be false; only ss_server_fld decides */
213 if (ss && ss->ss_server_fld) {
214 fld_server_fini(env, ss->ss_server_fld);
215 OBD_FREE_PTR(ss->ss_server_fld);
216 ss->ss_server_fld = NULL;
/*
 * Release the sequence-related state of \a ofd: deregister the LWP
 * exports, finalise FID and FLD (failures are reported with CERROR; the
 * conditions guarding those messages are not visible in this listing),
 * then dispose of every entry on ofd_seq_list.
 */
222 void ofd_seqs_fini(const struct lu_env *env, struct ofd_device *ofd)
224 struct ofd_seq *oseq;
229 ofd_deregister_seq_exp(ofd);
231 rc = ofd_fid_fini(env, ofd);
233 CERROR("%s: fid fini error: rc = %d\n", ofd_name(ofd), rc);
235 rc = ofd_fld_fini(env, ofd);
237 CERROR("%s: fld fini error: rc = %d\n", ofd_name(ofd), rc);
/* move all sequences onto the local dispose list while holding the
 * write lock, so that they can be deleted after the lock is dropped */
239 CFS_INIT_LIST_HEAD(&dispose);
240 write_lock(&ofd->ofd_seq_list_lock);
241 cfs_list_for_each_entry_safe(oseq, tmp, &ofd->ofd_seq_list, os_list) {
242 cfs_list_move(&oseq->os_list, &dispose);
244 write_unlock(&ofd->ofd_seq_list_lock);
/* ofd_seq_delete() unlinks the entry, so the loop ends once dispose
 * has been drained */
246 while (!cfs_list_empty(&dispose)) {
247 oseq = container_of0(dispose.next, struct ofd_seq, os_list);
248 ofd_seq_delete(env, oseq);
251 LASSERT(cfs_list_empty(&ofd->ofd_seq_list));
257 * \retval the seq with seq number or errno (never NULL)
/*
 * Obtain the struct ofd_seq for sequence number seq.  ofd_seq_get() is
 * tried first; the visible code then sets up a new entry: it finds or
 * creates the per-sequence LAST_ID object (FID from lu_last_id_fid()),
 * initialises the list head, mutex, spinlock and reference count, loads
 * os_last_oid from the object and finally inserts the entry with
 * ofd_seq_add().
 * NOTE(review): several lines (allocation of oseq, error checks, the
 * cleanup label and final return) are not visible in this listing;
 * confirm against the full source.
 */
259 struct ofd_seq *ofd_seq_load(const struct lu_env *env, struct ofd_device *ofd,
262 struct ofd_thread_info *info = ofd_info(env);
263 struct ofd_seq *oseq = NULL;
264 struct dt_object *dob;
270 /* if seq is already initialized */
271 oseq = ofd_seq_get(ofd, seq);
277 RETURN(ERR_PTR(-ENOMEM));
/* describe the LAST_ID object: a regular file, mode S_IRUGO | S_IWUSR */
279 lu_last_id_fid(&info->fti_fid, seq);
280 memset(&info->fti_attr, 0, sizeof(info->fti_attr));
281 info->fti_attr.la_valid = LA_MODE;
282 info->fti_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR;
283 info->fti_dof.dof_type = dt_mode_to_dft(S_IFREG);
285 /* create object tracking per-seq last created
286 * id to be used by orphan recovery mechanism */
287 dob = dt_find_or_create(env, ofd->ofd_osd, &info->fti_fid,
288 &info->fti_dof, &info->fti_attr);
294 oseq->os_lastid_obj = dob;
296 CFS_INIT_LIST_HEAD(&oseq->os_list);
297 mutex_init(&oseq->os_create_lock);
298 spin_lock_init(&oseq->os_last_oid_lock);
/* the new entry starts with a single reference */
301 cfs_atomic_set(&oseq->os_refc, 1);
/* the object size read here selects one of three cases below:
 * 0 (fresh object), sizeof(lastid) (valid), anything else (corrupted) */
303 rc = dt_attr_get(env, dob, &info->fti_attr, BYPASS_CAPA);
307 if (info->fti_attr.la_size == 0) {
308 /* object is just created, initialize last id */
309 oseq->os_last_oid = OFD_INIT_OBJID;
/* NOTE(review): the return value of this write is not captured, so a
 * failure to persist the initial id goes unnoticed here */
310 ofd_seq_last_oid_write(env, ofd, oseq);
311 } else if (info->fti_attr.la_size == sizeof(lastid)) {
313 info->fti_buf.lb_buf = &lastid;
314 info->fti_buf.lb_len = sizeof(lastid);
316 rc = dt_record_read(env, dob, &info->fti_buf, &info->fti_off);
318 CERROR("%s: can't read last_id: rc = %d\n",
/* the on-disk value is little-endian */
322 oseq->os_last_oid = le64_to_cpu(lastid);
324 CERROR("%s: corrupted size "LPU64" LAST_ID of seq "LPX64"\n",
325 ofd_name(ofd), (__u64)info->fti_attr.la_size, seq);
326 GOTO(cleanup, rc = -EINVAL);
/* ofd_seq_add() hands back either this entry or an already listed one
 * with the same sequence number */
329 oseq = ofd_seq_add(env, ofd, oseq);
330 RETURN((oseq != NULL) ? oseq : ERR_PTR(-ENOENT));
332 ofd_seq_put(env, oseq);
/*
 * Allocate and initialise the server FLD of the sequence site of \a ofd.
 * Returns -ENOMEM when the allocation fails.  fld_server_init() is called
 * with the OSD device, \a uuid, ss_node_id and LU_SEQ_RANGE_OST.
 * NOTE(review): the condition guarding the free below and the final
 * return are not visible; presumably the free runs when fld_server_init()
 * fails.
 */
336 static int ofd_fld_init(const struct lu_env *env, const char *uuid,
337 struct ofd_device *ofd)
339 struct seq_server_site *ss = &ofd->ofd_seq_site;
343 OBD_ALLOC_PTR(ss->ss_server_fld);
344 if (ss->ss_server_fld == NULL)
345 RETURN(rc = -ENOMEM);
347 rc = fld_server_init(env, ss->ss_server_fld, ofd->ofd_osd, uuid,
348 ss->ss_node_id, LU_SEQ_RANGE_OST);
350 OBD_FREE_PTR(ss->ss_server_fld);
351 ss->ss_server_fld = NULL;
/*
 * Register the client-sequence export slot and the FLD control export
 * slot of \a ofd as LWP items.  The LWP name is derived from the device
 * name by tgt_name2lwpname() into a temporary buffer of MAX_OBD_NAME
 * bytes, which is freed before returning.
 * NOTE(review): return-code checks, the remaining arguments of the
 * registration calls and the labels are not visible in this listing.
 */
357 static int ofd_register_seq_exp(struct ofd_device *ofd)
359 struct seq_server_site *ss = &ofd->ofd_seq_site;
360 char *lwp_name = NULL;
363 OBD_ALLOC(lwp_name, MAX_OBD_NAME);
364 if (lwp_name == NULL)
365 GOTO(out_free, rc = -ENOMEM);
367 rc = tgt_name2lwpname(ofd_name(ofd), lwp_name);
371 rc = lustre_register_lwp_item(lwp_name, &ss->ss_client_seq->lcs_exp,
376 rc = lustre_register_lwp_item(lwp_name,
377 &ss->ss_server_fld->lsf_control_exp,
/* roll back the first registration; presumably reached only when the
 * second registration fails, confirm */
380 lustre_deregister_lwp_item(&ss->ss_client_seq->lcs_exp);
381 ss->ss_client_seq->lcs_exp = NULL;
/* lwp_name is still NULL when the allocation above failed */
385 if (lwp_name != NULL)
386 OBD_FREE(lwp_name, MAX_OBD_NAME);
391 /* object sequence management */
/*
 * Set up sequence handling for \a ofd: initialise FID and FLD, register
 * the sequence exports, then initialise the sequence list, its rwlock and
 * its counter.  Each failing step is reported with CERROR.
 * NOTE(review): the error branches and the return are not visible in this
 * listing, so whether earlier steps are undone on a later failure cannot
 * be told from here.
 */
392 int ofd_seqs_init(const struct lu_env *env, struct ofd_device *ofd)
396 rc = ofd_fid_init(env, ofd);
398 CERROR("%s: fid init error: rc = %d\n", ofd_name(ofd), rc);
402 rc = ofd_fld_init(env, ofd_name(ofd), ofd);
404 CERROR("%s: Can't init fld, rc %d\n", ofd_name(ofd), rc);
408 rc = ofd_register_seq_exp(ofd);
410 CERROR("%s: Can't init seq exp, rc %d\n", ofd_name(ofd), rc);
/* the sequence list starts out empty */
414 rwlock_init(&ofd->ofd_seq_list_lock);
415 CFS_INIT_LIST_HEAD(&ofd->ofd_seq_list);
416 ofd->ofd_seq_count = 0;
/*
 * Walk the per-client records of the last_rcvd data, starting at
 * lsd_client_start and continuing while the offset is below fsize.  For
 * every record with a non-empty uuid an export is created with
 * class_new_export(), the record is copied into the export target data,
 * the client slot is registered with tgt_client_add() and
 * obd_max_recoverable_clients is incremented.  lsd_last_transno is raised
 * to the largest lcd_last_transno encountered.
 * NOTE(review): allocation and release of lcd, several branch conditions
 * and the return path are not visible in this listing; confirm against
 * the full source.
 */
420 int ofd_clients_data_init(const struct lu_env *env, struct ofd_device *ofd,
423 struct obd_device *obd = ofd_obd(ofd);
424 struct lr_server_data *lsd = &ofd->ofd_lut.lut_lsd;
425 struct lsd_client_data *lcd = NULL;
426 struct filter_export_data *fed;
429 loff_t off = lsd->lsd_client_start;
/* compile-time check that the client record layout ends exactly at
 * LR_CLIENT_SIZE */
431 CLASSERT(offsetof(struct lsd_client_data, lcd_padding) +
432 sizeof(lcd->lcd_padding) == LR_CLIENT_SIZE);
438 for (cl_idx = 0; off < fsize; cl_idx++) {
439 struct obd_export *exp;
442 /* Don't assume off is incremented properly by
443 * fsfilt_read_record(), in case sizeof(*lcd)
444 * isn't the same as fsd->lsd_client_size. */
445 off = lsd->lsd_client_start + cl_idx * lsd->lsd_client_size;
446 rc = tgt_client_data_read(env, &ofd->ofd_lut, lcd, &off, cl_idx);
448 CERROR("%s: error reading FILT %s idx %d off %llu: "
449 "rc = %d\n", ofd_name(ofd), LAST_RCVD, cl_idx,
452 break; /* read error shouldn't cause startup to fail */
455 if (lcd->lcd_uuid[0] == '\0') {
456 CDEBUG(D_INFO, "skipping zeroed client at offset %d\n",
461 last_rcvd = lcd->lcd_last_transno;
463 /* These exports are cleaned up by ofd_disconnect(), so they
464 * need to be set up like real exports as ofd_connect() does.
466 exp = class_new_export(obd, (struct obd_uuid *)lcd->lcd_uuid);
468 CDEBUG(D_HA, "RCVRNG CLIENT uuid: %s idx: %d lr: "LPU64
469 " srv lr: "LPU64"\n", lcd->lcd_uuid, cl_idx,
470 last_rcvd, lsd->lsd_last_transno);
473 if (PTR_ERR(exp) == -EALREADY) {
474 /* export already exists, zero out this one */
475 CERROR("%s: Duplicate export %s!\n",
476 ofd_name(ofd), lcd->lcd_uuid);
479 GOTO(err_out, rc = PTR_ERR(exp));
/* copy the on-disk client record into the export target data */
482 fed = &exp->exp_filter_data;
483 *fed->fed_ted.ted_lcd = *lcd;
485 rc = tgt_client_add(env, exp, cl_idx);
486 LASSERTF(rc == 0, "rc = %d\n", rc); /* can't fail existing */
487 /* VBR: set export last committed version */
488 exp->exp_last_committed = last_rcvd;
489 spin_lock(&exp->exp_lock);
490 exp->exp_connecting = 0;
491 exp->exp_in_recovery = 0;
492 spin_unlock(&exp->exp_lock);
493 obd->obd_max_recoverable_clients++;
494 class_export_put(exp);
496 /* Need to check last_rcvd even for duplicated exports. */
497 CDEBUG(D_OTHER, "client at idx %d has last_rcvd = "LPU64"\n",
/* keep the server-wide last transno at the maximum seen so far; the
 * update is done under lut_translock */
500 spin_lock(&ofd->ofd_lut.lut_translock);
501 if (last_rcvd > lsd->lsd_last_transno)
502 lsd->lsd_last_transno = last_rcvd;
503 spin_unlock(&ofd->ofd_lut.lut_translock);
/*
 * Initialise the server data (struct lr_server_data) of \a ofd from the
 * last_rcvd object.  When the object size is 0 the in-memory data is
 * filled with defaults for a new disk; otherwise it is read with
 * tgt_server_data_read() and checked against the device uuid and the
 * index parsed from the device name.  Afterwards the mount count is
 * bumped, feature flags are validated, the per-client records are loaded
 * and the server data is written back with tgt_server_data_update().
 * NOTE(review): several branch lines, labels and the return path are not
 * visible in this listing; confirm against the full source.
 */
511 int ofd_server_data_init(const struct lu_env *env, struct ofd_device *ofd)
513 struct ofd_thread_info *info = ofd_info(env);
514 struct lr_server_data *lsd = &ofd->ofd_lut.lut_lsd;
515 struct obd_device *obd = ofd_obd(ofd);
516 unsigned long last_rcvd_size;
520 rc = dt_attr_get(env, ofd->ofd_lut.lut_last_rcvd, &info->fti_attr,
525 last_rcvd_size = (unsigned long)info->fti_attr.la_size;
527 /* ensure padding in the struct is the correct size */
528 CLASSERT (offsetof(struct lr_server_data, lsd_padding) +
529 sizeof(lsd->lsd_padding) == LR_SERVER_SIZE);
531 rc = server_name2index(obd->obd_name, &index, NULL);
533 CERROR("%s: Can not get index from obd_name: rc = %d\n",
/* an empty last_rcvd object is treated as a new disk and the server
 * data is populated with defaults */
538 if (last_rcvd_size == 0) {
539 LCONSOLE_WARN("%s: new disk, initializing\n", obd->obd_name);
541 memcpy(lsd->lsd_uuid, obd->obd_uuid.uuid,
542 sizeof(lsd->lsd_uuid));
543 lsd->lsd_last_transno = 0;
544 lsd->lsd_mount_count = 0;
545 lsd->lsd_server_size = LR_SERVER_SIZE;
546 lsd->lsd_client_start = LR_CLIENT_START;
547 lsd->lsd_client_size = LR_CLIENT_SIZE;
548 lsd->lsd_subdir_count = FILTER_SUBDIR_COUNT;
549 lsd->lsd_feature_incompat = OBD_INCOMPAT_OST;
550 lsd->lsd_osd_index = index;
552 rc = tgt_server_data_read(env, &ofd->ofd_lut);
554 CDEBUG(D_INODE,"OBD ofd: error reading %s: rc %d\n",
/* the uuid stored on disk must match the uuid of this device */
558 if (strcmp((char *)lsd->lsd_uuid,
559 (char *)obd->obd_uuid.uuid)) {
560 LCONSOLE_ERROR("Trying to start OBD %s using the wrong"
561 " disk %s. Were the /dev/ assignments "
563 obd->obd_uuid.uuid, lsd->lsd_uuid);
564 GOTO(err_fsd, rc = -EINVAL);
/* a stored index of 0 is overwritten with the index parsed from the
 * device name; any other stored value has to match it */
567 if (lsd->lsd_osd_index == 0) {
568 lsd->lsd_osd_index = index;
569 } else if (lsd->lsd_osd_index != index) {
570 LCONSOLE_ERROR("%s: index %d in last rcvd is different"
571 " with the index %d in config log."
572 " It might be disk corruption!\n",
573 obd->obd_name, lsd->lsd_osd_index,
575 GOTO(err_fsd, rc = -EINVAL);
/* obt_instance is the mount count truncated to 32 bits */
579 lsd->lsd_mount_count++;
580 obd->u.obt.obt_mount_count = lsd->lsd_mount_count;
581 obd->u.obt.obt_instance = (__u32)obd->u.obt.obt_mount_count;
582 ofd->ofd_subdir_count = lsd->lsd_subdir_count;
/* any feature bit outside the supported masks is rejected with -EINVAL */
584 if (lsd->lsd_feature_incompat & ~OFD_INCOMPAT_SUPP) {
585 CERROR("%s: unsupported incompat filesystem feature(s) %x\n",
587 lsd->lsd_feature_incompat & ~OFD_INCOMPAT_SUPP);
588 GOTO(err_fsd, rc = -EINVAL);
590 if (lsd->lsd_feature_rocompat & ~OFD_ROCOMPAT_SUPP) {
591 CERROR("%s: unsupported read-only filesystem feature(s) %x\n",
593 lsd->lsd_feature_rocompat & ~OFD_ROCOMPAT_SUPP);
594 /* Do something like remount filesystem read-only */
595 GOTO(err_fsd, rc = -EINVAL);
598 CDEBUG(D_INODE, "%s: server last_transno : "LPU64"\n",
599 obd->obd_name, lsd->lsd_last_transno);
600 CDEBUG(D_INODE, "%s: server mount_count: "LPU64"\n",
601 obd->obd_name, lsd->lsd_mount_count);
602 CDEBUG(D_INODE, "%s: server data size: %u\n",
603 obd->obd_name, lsd->lsd_server_size);
604 CDEBUG(D_INODE, "%s: per-client data start: %u\n",
605 obd->obd_name, lsd->lsd_client_start);
606 CDEBUG(D_INODE, "%s: per-client data size: %u\n",
607 obd->obd_name, lsd->lsd_client_size);
608 CDEBUG(D_INODE, "%s: server subdir_count: %u\n",
609 obd->obd_name, lsd->lsd_subdir_count);
610 CDEBUG(D_INODE, "%s: last_rcvd clients: %lu\n", obd->obd_name,
611 last_rcvd_size <= lsd->lsd_client_start ? 0 :
612 (last_rcvd_size - lsd->lsd_client_start) /
613 lsd->lsd_client_size);
615 if (!obd->obd_replayable)
616 CWARN("%s: recovery support OFF\n", obd->obd_name);
/* NOTE(review): rc assigned here appears to be overwritten by the
 * tgt_server_data_update() call below without a visible check; confirm
 * that ignoring it is intended */
618 rc = ofd_clients_data_init(env, ofd, last_rcvd_size);
/* publish the last transno gathered from disk, under lut_translock */
620 spin_lock(&ofd->ofd_lut.lut_translock);
621 obd->obd_last_committed = lsd->lsd_last_transno;
622 ofd->ofd_lut.lut_last_transno = lsd->lsd_last_transno;
623 spin_unlock(&ofd->ofd_lut.lut_translock);
625 /* save it, so mount count and last_transno is current */
626 rc = tgt_server_data_update(env, &ofd->ofd_lut, 0);
/* presumably the err_fsd error path, confirm */
633 class_disconnect_exports(obd);
/*
 * Prepare the on-disk state of \a ofd: register the transaction callbacks
 * with the OSD device, load the server data, find or create the health
 * check object and initialise sequence handling.  The last visible lines
 * release the health check object and remove the transaction callback.
 * NOTE(review): the return-code checks, labels and returns are not
 * visible in this listing; the last two calls are presumably the error
 * unwinding path, confirm.
 */
637 int ofd_fs_setup(const struct lu_env *env, struct ofd_device *ofd,
638 struct obd_device *obd)
640 struct ofd_thread_info *info = ofd_info(env);
641 struct dt_object *fo;
/* fault-injection hook; the action taken when it fires is not visible */
646 if (OBD_FAIL_CHECK(OBD_FAIL_MDS_FS_SETUP))
649 /* prepare transactions callbacks */
/* only the stop callback is installed; start and commit stay NULL */
650 ofd->ofd_txn_cb.dtc_txn_start = NULL;
651 ofd->ofd_txn_cb.dtc_txn_stop = ofd_txn_stop_cb;
652 ofd->ofd_txn_cb.dtc_txn_commit = NULL;
653 ofd->ofd_txn_cb.dtc_cookie = ofd;
654 ofd->ofd_txn_cb.dtc_tag = LCT_DT_THREAD;
655 CFS_INIT_LIST_HEAD(&ofd->ofd_txn_cb.dtc_linkage);
657 dt_txn_callback_add(ofd->ofd_osd, &ofd->ofd_txn_cb);
659 rc = ofd_server_data_init(env, ofd);
/* describe the health check object: a regular file with mode
 * S_IRUGO | S_IWUSR, identified by OFD_HEALTH_CHECK_OID */
663 lu_local_obj_fid(&info->fti_fid, OFD_HEALTH_CHECK_OID);
664 memset(&info->fti_attr, 0, sizeof(info->fti_attr));
665 info->fti_attr.la_valid = LA_MODE;
666 info->fti_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR;
667 info->fti_dof.dof_type = dt_mode_to_dft(S_IFREG);
669 fo = dt_find_or_create(env, ofd->ofd_osd, &info->fti_fid,
670 &info->fti_dof, &info->fti_attr);
672 GOTO(out, rc = PTR_ERR(fo));
674 ofd->ofd_health_check_file = fo;
676 rc = ofd_seqs_init(env, ofd);
682 lu_object_put(env, &ofd->ofd_health_check_file->do_lu);
684 dt_txn_callback_del(ofd->ofd_osd, &ofd->ofd_txn_cb);
/*
 * Undo the setup of the on-disk state of \a ofd: finalise sequence
 * handling, sync the OSD device, remove the transaction callback and
 * release the health check object if one is held.
 * NOTE(review): the condition guarding the sync error message is not
 * visible in this listing.
 */
688 void ofd_fs_cleanup(const struct lu_env *env, struct ofd_device *ofd)
694 ofd_info_init(env, NULL);
696 ofd_seqs_fini(env, ofd);
/* i holds the result of dt_sync() and is reported in the message below */
698 i = dt_sync(env, ofd->ofd_osd);
700 CERROR("can't sync: %d\n", i);
702 /* Remove transaction callback */
703 dt_txn_callback_del(ofd->ofd_osd, &ofd->ofd_txn_cb);
/* clear the pointer after dropping the reference */
705 if (ofd->ofd_health_check_file) {
706 lu_object_put(env, &ofd->ofd_health_check_file->do_lu);
707 ofd->ofd_health_check_file = NULL;