4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2012, Whamcloud, Inc.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Alexey Zhuravlev <bzzz@whamcloud.com>
39 * Author: Mikhail Pershin <tappro@whamcloud.com>
42 #define DEBUG_SUBSYSTEM S_FILTER
44 #include "ofd_internal.h"
/**
 * Write one record to a dt object inside its own transaction.
 *
 * The visible sequence is: create a transaction handle on ofd->ofd_osd,
 * declare a write of buf->lb_len bytes at *off, start the transaction
 * locally, write the record, then stop the transaction.
 *
 * \param env  execution environment passed through to every dt_* call
 * \param ofd  OFD device; its ofd_osd hosts the transaction
 * \param dt   object the record is written to
 * \param buf  data to write; buf->lb_len is the declared write size
 * \param off  offset, passed by pointer through to dt_record_write()
 */
46 int ofd_record_write(const struct lu_env *env, struct ofd_device *ofd,
47 struct dt_object *dt, struct lu_buf *buf, loff_t *off)
56 th = dt_trans_create(env, ofd->ofd_osd);
/* declare the write first, then start the transaction and perform it */
60 rc = dt_declare_record_write(env, dt, buf->lb_len, *off, th);
62 rc = dt_trans_start_local(env, ofd->ofd_osd, th);
64 rc = dt_record_write(env, dt, buf, off, th);
/* NOTE(review): the result of dt_trans_stop() is not assigned to rc here */
66 dt_trans_stop(env, ofd->ofd_osd, th);
/**
 * Limit a requested precreate batch to the device setting.
 *
 * Computes min(ofd->ofd_precreate_batch, batch) while holding
 * ofd->ofd_objid_lock. The result is stored in count, which is presumably
 * the return value -- TODO confirm against the full function body.
 *
 * \param ofd    OFD device holding ofd_precreate_batch
 * \param batch  batch size requested by the caller
 */
71 int ofd_precreate_batch(struct ofd_device *ofd, int batch)
75 spin_lock(&ofd->ofd_objid_lock);
76 count = min(ofd->ofd_precreate_batch, batch);
77 spin_unlock(&ofd->ofd_objid_lock);
/**
 * Read the in-memory last object id recorded for a group.
 *
 * Asserts that group does not exceed ofd->ofd_max_group, then reads
 * ofd->ofd_last_objids[group] under ofd->ofd_objid_lock.
 *
 * \param ofd    OFD device
 * \param group  index into ofd_last_objids[]
 */
82 obd_id ofd_last_id(struct ofd_device *ofd, obd_seq group)
86 LASSERT(group <= ofd->ofd_max_group);
88 spin_lock(&ofd->ofd_objid_lock);
89 id = ofd->ofd_last_objids[group];
90 spin_unlock(&ofd->ofd_objid_lock);
/**
 * Raise the in-memory last object id of a group.
 *
 * The stored value is only replaced when the new id is larger, so this
 * function never lowers ofd->ofd_last_objids[group]. The compare and the
 * store both happen under ofd->ofd_objid_lock.
 *
 * \param ofd    OFD device
 * \param id     candidate last object id
 * \param group  index into ofd_last_objids[]; asserted <= ofd_max_group
 */
95 void ofd_last_id_set(struct ofd_device *ofd, obd_id id, obd_seq group)
97 LASSERT(group <= ofd->ofd_max_group);
98 spin_lock(&ofd->ofd_objid_lock);
/* update only when the id grows */
99 if (ofd->ofd_last_objids[group] < id)
100 ofd->ofd_last_objids[group] = id;
101 spin_unlock(&ofd->ofd_objid_lock);
/**
 * Store the current last object id of a group in its last-id object.
 *
 * The id returned by ofd_last_id() is converted with cpu_to_le64() into
 * a local variable, and the per-thread buffer info->fti_buf is pointed at
 * that variable. The record is then written to ofd->ofd_lastid_obj[group]
 * through ofd_record_write() at offset info->fti_off.
 *
 * NOTE(review): the initialisation of info->fti_off is not visible here;
 * presumably it is reset to 0 before the write -- TODO confirm.
 *
 * \param env  execution environment; also supplies the thread info
 * \param ofd  OFD device
 */
104 int ofd_last_id_write(const struct lu_env *env, struct ofd_device *ofd,
107 struct ofd_thread_info *info = ofd_info(env);
/* the buffer references the local tmp, which is filled in below */
113 info->fti_buf.lb_buf = &tmp;
114 info->fti_buf.lb_len = sizeof(tmp);
117 CDEBUG(D_INODE, "%s: write last_objid for group "LPU64": "LPU64"\n",
118 ofd_obd(ofd)->obd_name, group, ofd_last_id(ofd, group));
/* the value is stored in little-endian form */
120 tmp = cpu_to_le64(ofd_last_id(ofd, group));
122 rc = ofd_record_write(env, ofd, ofd->ofd_lastid_obj[group],
123 &info->fti_buf, &info->fti_off);
/**
 * Store ofd->ofd_max_group in the last-group file.
 *
 * The value is converted with cpu_to_le32() into a local variable that
 * info->fti_buf points at, then written to ofd->ofd_last_group_file
 * through ofd_record_write() at offset info->fti_off.
 *
 * NOTE(review): the initialisation of info->fti_off is not visible here;
 * presumably it is reset to 0 before the write -- TODO confirm.
 *
 * \param env  execution environment; also supplies the thread info
 * \param ofd  OFD device
 */
127 int ofd_last_group_write(const struct lu_env *env, struct ofd_device *ofd)
129 struct ofd_thread_info *info = ofd_info(env);
/* the buffer references the local tmp, which is filled in below */
135 info->fti_buf.lb_buf = &tmp;
136 info->fti_buf.lb_len = sizeof(tmp);
/* the value is stored in little-endian form */
139 tmp = cpu_to_le32(ofd->ofd_max_group);
141 rc = ofd_record_write(env, ofd, ofd->ofd_last_group_file,
142 &info->fti_buf, &info->fti_off);
/**
 * Release the last-id object of one group.
 *
 * Asserts that ofd->ofd_lastid_obj[group] is set, drops it with
 * lu_object_put() and clears the slot to NULL. ofd_group_load() treats a
 * non-NULL slot as an already initialised group.
 *
 * \param env  execution environment
 * \param ofd  OFD device
 */
147 void ofd_group_fini(const struct lu_env *env, struct ofd_device *ofd,
150 LASSERT(ofd->ofd_lastid_obj[group]);
151 lu_object_put(env, &ofd->ofd_lastid_obj[group]->do_lu);
152 ofd->ofd_lastid_obj[group] = NULL;
/**
 * Set up the last-id tracking object of one group.
 *
 * Finds or creates the local object with FID OFD_GROUP0_LAST_OID + group,
 * records it in ofd->ofd_lastid_obj[group] and initialises the matching
 * entry of ofd->ofd_create_locks[]. The object size then selects one of
 * three cases:
 *  - size 0: the in-memory id is set to OFD_INIT_OBJID and both the last
 *    id and the last group are written out;
 *  - size == sizeof(lastid): the id is read from the object and converted
 *    with le64_to_cpu();
 *  - any other size: reported as a corrupted LAST_ID.
 * ofd_group_fini() is called near the end of the function, presumably on
 * the error path only -- TODO confirm.
 *
 * \param env    execution environment; also supplies the thread info
 * \param ofd    OFD device
 * \param group  group index to load
 */
155 int ofd_group_load(const struct lu_env *env, struct ofd_device *ofd, int group)
157 struct ofd_thread_info *info = ofd_info(env);
158 struct dt_object *dob;
164 /* if group is already initialized */
165 if (ofd->ofd_lastid_obj[group])
/* describe a regular file with mode rw-r--r-- for find-or-create */
168 lu_local_obj_fid(&info->fti_fid, OFD_GROUP0_LAST_OID + group);
169 memset(&info->fti_attr, 0, sizeof(info->fti_attr));
170 info->fti_attr.la_valid = LA_MODE;
171 info->fti_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR;
172 info->fti_dof.dof_type = dt_mode_to_dft(S_IFREG);
174 /* create object tracking per-group last created
175 * id to be used by orphan recovery mechanism */
176 dob = dt_find_or_create(env, ofd->ofd_osd, &info->fti_fid,
177 &info->fti_dof, &info->fti_attr);
179 RETURN(PTR_ERR(dob));
181 ofd->ofd_lastid_obj[group] = dob;
182 mutex_init(&ofd->ofd_create_locks[group]);
/* fetch attributes; la_size decides between new, valid and corrupted */
184 rc = dt_attr_get(env, dob, &info->fti_attr, BYPASS_CAPA);
188 if (info->fti_attr.la_size == 0) {
189 /* object is just created, initialize last id */
190 ofd->ofd_last_objids[group] = OFD_INIT_OBJID;
191 ofd_last_id_set(ofd, OFD_INIT_OBJID, group);
/* NOTE(review): the results of the two writes below are not checked */
192 ofd_last_id_write(env, ofd, group);
193 ofd_last_group_write(env, ofd);
194 } else if (info->fti_attr.la_size == sizeof(lastid)) {
196 info->fti_buf.lb_buf = &lastid;
197 info->fti_buf.lb_len = sizeof(lastid);
199 rc = dt_record_read(env, dob, &info->fti_buf, &info->fti_off);
201 CERROR("can't read last_id: %d\n", rc);
/* the on-disk value is little-endian */
204 ofd->ofd_last_objids[group] = le64_to_cpu(lastid);
206 CERROR("corrupted size %Lu LAST_ID of group %u\n",
207 (unsigned long long)info->fti_attr.la_size, group);
/* drop the object reference and clear the slot taken above */
213 ofd_group_fini(env, ofd, group);
217 /* OFD group management */
/**
 * Initialise group bookkeeping from the last-group file.
 *
 * Initialises ofd->ofd_objid_lock, sets the default precreate batch and
 * reads the size of ofd->ofd_last_group_file. The size selects one of
 * three cases:
 *  - size == sizeof(last_group): the value is read, converted with
 *    le32_to_cpu() and stored in ofd->ofd_max_group;
 *  - size 0: ofd_max_group is set to 0;
 *  - any other size: reported as corrupted and fails with -EIO.
 * Every group from 0 to ofd_max_group inclusive is then loaded with
 * ofd_group_load(). When a load fails, groups that were already loaded
 * are released again with ofd_group_fini().
 *
 * \param env  execution environment; also supplies the thread info
 * \param ofd  OFD device
 */
218 int ofd_groups_init(const struct lu_env *env, struct ofd_device *ofd)
220 struct ofd_thread_info *info = ofd_info(env);
221 unsigned long groups_size;
228 spin_lock_init(&ofd->ofd_objid_lock);
230 rc = dt_attr_get(env, ofd->ofd_last_group_file,
231 &info->fti_attr, BYPASS_CAPA);
235 ofd->ofd_precreate_batch = OFD_PRECREATE_BATCH_DEFAULT;
236 groups_size = (unsigned long)info->fti_attr.la_size;
238 if (groups_size == sizeof(last_group)) {
240 info->fti_buf.lb_buf = &last_group;
241 info->fti_buf.lb_len = sizeof(last_group);
243 rc = dt_record_read(env, ofd->ofd_last_group_file,
244 &info->fti_buf, &info->fti_off);
246 CERROR("can't read LAST_GROUP: %d\n", rc);
/* NOTE(review): a value read from disk is checked with LASSERT, not with
 * an error return; also confirm that <= matches the array sizes */
250 ofd->ofd_max_group = le32_to_cpu(last_group);
251 LASSERT(ofd->ofd_max_group <= OFD_MAX_GROUPS);
252 } else if (groups_size == 0) {
253 ofd->ofd_max_group = 0;
255 CERROR("groups file is corrupted? size = %lu\n", groups_size);
256 GOTO(cleanup, rc = -EIO);
/* the bound is inclusive: groups 0 .. ofd_max_group are loaded */
259 for (i = 0; i <= ofd->ofd_max_group; i++) {
260 rc = ofd_group_load(env, ofd, i);
262 CERROR("can't load group %d: %d\n", i, rc);
263 /* Clean all previously set groups */
265 ofd_group_fini(env, ofd, --i);
270 CDEBUG(D_OTHER, "%s: %u groups initialized\n",
271 ofd_obd(ofd)->obd_name, ofd->ofd_max_group + 1);
/**
 * Recreate exports from the per-client records stored in last_rcvd.
 *
 * Walks the client slots starting at lsd->lsd_client_start, stepping by
 * lsd->lsd_client_size, while the slot offset is below fsize. For each
 * slot with a non-empty lcd_uuid an export is created with
 * class_new_export(), the client data is copied into it, the slot is
 * registered with tgt_client_add() and the export is counted in
 * obd->obd_max_recoverable_clients. The largest lcd_last_transno seen is
 * folded into lsd->lsd_last_transno under lut_translock.
 *
 * NOTE(review): fsize is presumably the size of last_rcvd supplied by the
 * caller -- TODO confirm against the full signature.
 *
 * \param env  execution environment
 * \param ofd  OFD device
 */
276 int ofd_clients_data_init(const struct lu_env *env, struct ofd_device *ofd,
279 struct obd_device *obd = ofd_obd(ofd);
280 struct lr_server_data *lsd = &ofd->ofd_lut.lut_lsd;
281 struct lsd_client_data *lcd = NULL;
282 struct filter_export_data *fed;
285 loff_t off = lsd->lsd_client_start;
/* compile-time check that the client record ends exactly at LR_CLIENT_SIZE */
287 CLASSERT(offsetof(struct lsd_client_data, lcd_padding) +
288 sizeof(lcd->lcd_padding) == LR_CLIENT_SIZE);
294 for (cl_idx = 0; off < fsize; cl_idx++) {
295 struct obd_export *exp;
298 /* Don't assume off is incremented properly by
299 * tgt_client_data_read(), in case sizeof(*lcd)
300 * isn't the same as lsd->lsd_client_size. */
301 off = lsd->lsd_client_start + cl_idx * lsd->lsd_client_size;
302 rc = tgt_client_data_read(env, &ofd->ofd_lut, lcd, &off, cl_idx);
/* NOTE(review): off is a loff_t but is printed with %llu below -- confirm
 * the format matches the type */
304 CERROR("error reading FILT %s idx %d off %llu: rc %d\n",
305 LAST_RCVD, cl_idx, off, rc);
307 break; /* read error shouldn't cause startup to fail */
/* an empty uuid marks an unused slot */
310 if (lcd->lcd_uuid[0] == '\0') {
311 CDEBUG(D_INFO, "skipping zeroed client at offset %d\n",
316 last_rcvd = lcd->lcd_last_transno;
318 /* These exports are cleaned up by ofd_disconnect(), so they
319 * need to be set up like real exports as ofd_connect() does.
321 exp = class_new_export(obd, (struct obd_uuid *)lcd->lcd_uuid);
323 CDEBUG(D_HA, "RCVRNG CLIENT uuid: %s idx: %d lr: "LPU64
324 " srv lr: "LPU64"\n", lcd->lcd_uuid, cl_idx,
325 last_rcvd, lsd->lsd_last_transno);
328 if (PTR_ERR(exp) == -EALREADY) {
329 /* export already exists, zero out this one */
330 CERROR("Duplicate export %s!\n", lcd->lcd_uuid);
333 GOTO(err_out, rc = PTR_ERR(exp));
/* copy the on-disk client record into the export target data */
336 fed = &exp->exp_filter_data;
337 *fed->fed_ted.ted_lcd = *lcd;
339 rc = tgt_client_add(env, exp, cl_idx);
340 LASSERTF(rc == 0, "rc = %d\n", rc); /* can't fail existing */
341 /* VBR: set export last committed version */
342 exp->exp_last_committed = last_rcvd;
/* clear the connecting and in-recovery flags under the export lock */
343 spin_lock(&exp->exp_lock);
344 exp->exp_connecting = 0;
345 exp->exp_in_recovery = 0;
346 spin_unlock(&exp->exp_lock);
347 obd->obd_max_recoverable_clients++;
348 class_export_put(exp);
350 /* Need to check last_rcvd even for duplicated exports. */
351 CDEBUG(D_OTHER, "client at idx %d has last_rcvd = "LPU64"\n",
/* keep the largest transno seen across all client slots */
354 spin_lock(&ofd->ofd_lut.lut_translock);
355 if (last_rcvd > lsd->lsd_last_transno)
356 lsd->lsd_last_transno = last_rcvd;
357 spin_unlock(&ofd->ofd_lut.lut_translock);
/**
 * Load or create the server record kept in last_rcvd.
 *
 * Reads the size of ofd->ofd_lut.lut_last_rcvd. A size of 0 is treated
 * as a new disk and the in-memory lr_server_data is filled with initial
 * values. Otherwise the record is read with tgt_server_data_read() and
 * its uuid is compared against obd->obd_uuid. The function then bumps
 * the mount count, rejects unsupported incompat and rocompat feature
 * bits with -EINVAL, logs the record, loads per-client data with
 * ofd_clients_data_init(), publishes lsd_last_transno under
 * lut_translock and saves the record with tgt_server_data_update().
 * class_disconnect_exports() is called near the end of the function,
 * presumably on the error path only -- TODO confirm.
 *
 * \param env  execution environment; also supplies the thread info
 * \param ofd  OFD device
 */
365 int ofd_server_data_init(const struct lu_env *env, struct ofd_device *ofd)
367 struct ofd_thread_info *info = ofd_info(env);
368 struct lr_server_data *lsd = &ofd->ofd_lut.lut_lsd;
369 struct obd_device *obd = ofd_obd(ofd);
370 unsigned long last_rcvd_size;
373 rc = dt_attr_get(env, ofd->ofd_lut.lut_last_rcvd, &info->fti_attr,
378 last_rcvd_size = (unsigned long)info->fti_attr.la_size;
380 /* ensure padding in the struct is the correct size */
381 CLASSERT (offsetof(struct lr_server_data, lsd_padding) +
382 sizeof(lsd->lsd_padding) == LR_SERVER_SIZE);
/* an empty last_rcvd means a fresh disk: fill in initial values */
384 if (last_rcvd_size == 0) {
385 LCONSOLE_WARN("%s: new disk, initializing\n", obd->obd_name);
387 memcpy(lsd->lsd_uuid, obd->obd_uuid.uuid,
388 sizeof(lsd->lsd_uuid));
389 lsd->lsd_last_transno = 0;
390 lsd->lsd_mount_count = 0;
391 lsd->lsd_server_size = LR_SERVER_SIZE;
392 lsd->lsd_client_start = LR_CLIENT_START;
393 lsd->lsd_client_size = LR_CLIENT_SIZE;
394 lsd->lsd_subdir_count = FILTER_SUBDIR_COUNT;
395 lsd->lsd_feature_incompat = OBD_INCOMPAT_OST;
/* presumably the non-empty case: read the existing record -- TODO confirm */
397 rc = tgt_server_data_read(env, &ofd->ofd_lut);
399 CDEBUG(D_INODE,"OBD ofd: error reading %s: rc %d\n",
/* NOTE(review): strcmp assumes lsd_uuid is NUL-terminated -- confirm */
403 if (strcmp((char *)lsd->lsd_uuid,
404 (char *)obd->obd_uuid.uuid)) {
405 LCONSOLE_ERROR("Trying to start OBD %s using the wrong"
406 " disk %s. Were the /dev/ assignments "
408 obd->obd_uuid.uuid, lsd->lsd_uuid);
409 GOTO(err_fsd, rc = -EINVAL);
/* count this mount and copy the counters into the obd and ofd */
413 lsd->lsd_mount_count++;
414 obd->u.obt.obt_mount_count = lsd->lsd_mount_count;
415 obd->u.obt.obt_instance = (__u32)obd->u.obt.obt_mount_count;
416 ofd->ofd_subdir_count = lsd->lsd_subdir_count;
/* any feature bit outside the supported masks fails with -EINVAL */
418 if (lsd->lsd_feature_incompat & ~OFD_INCOMPAT_SUPP) {
419 CERROR("%s: unsupported incompat filesystem feature(s) %x\n",
421 lsd->lsd_feature_incompat & ~OFD_INCOMPAT_SUPP);
422 GOTO(err_fsd, rc = -EINVAL);
424 if (lsd->lsd_feature_rocompat & ~OFD_ROCOMPAT_SUPP) {
425 CERROR("%s: unsupported read-only filesystem feature(s) %x\n",
427 lsd->lsd_feature_rocompat & ~OFD_ROCOMPAT_SUPP);
428 /* Do something like remount filesystem read-only */
429 GOTO(err_fsd, rc = -EINVAL);
432 CDEBUG(D_INODE, "%s: server last_transno : "LPU64"\n",
433 obd->obd_name, lsd->lsd_last_transno);
434 CDEBUG(D_INODE, "%s: server mount_count: "LPU64"\n",
435 obd->obd_name, lsd->lsd_mount_count);
436 CDEBUG(D_INODE, "%s: server data size: %u\n",
437 obd->obd_name, lsd->lsd_server_size);
438 CDEBUG(D_INODE, "%s: per-client data start: %u\n",
439 obd->obd_name, lsd->lsd_client_start);
440 CDEBUG(D_INODE, "%s: per-client data size: %u\n",
441 obd->obd_name, lsd->lsd_client_size);
442 CDEBUG(D_INODE, "%s: server subdir_count: %u\n",
443 obd->obd_name, lsd->lsd_subdir_count);
/* client count: bytes past lsd_client_start divided by the slot size */
444 CDEBUG(D_INODE, "%s: last_rcvd clients: %lu\n", obd->obd_name,
445 last_rcvd_size <= lsd->lsd_client_start ? 0 :
446 (last_rcvd_size - lsd->lsd_client_start) /
447 lsd->lsd_client_size);
449 if (!obd->obd_replayable)
450 CWARN("%s: recovery support OFF\n", obd->obd_name);
452 rc = ofd_clients_data_init(env, ofd, last_rcvd_size);
/* publish the last transno gathered from the server and client records */
454 spin_lock(&ofd->ofd_lut.lut_translock);
455 obd->obd_last_committed = lsd->lsd_last_transno;
456 ofd->ofd_lut.lut_last_transno = lsd->lsd_last_transno;
457 spin_unlock(&ofd->ofd_lut.lut_translock);
459 /* save it, so mount count and last_transno is current */
460 rc = tgt_server_data_update(env, &ofd->ofd_lut, 0);
467 class_disconnect_exports(obd);
/**
 * Prepare the on-disk state an OFD device needs at start-up.
 *
 * Registers the transaction callback ofd->ofd_txn_cb on ofd->ofd_osd,
 * initialises the server data with ofd_server_data_init(), finds or
 * creates the health-check object and the last-group object, and
 * initialises the groups with ofd_groups_init(). The trailing calls undo
 * these steps in reverse order: they put the last-group object, put the
 * health-check object and remove the transaction callback. They are
 * presumably reached through error labels -- TODO confirm.
 *
 * \param env  execution environment; also supplies the thread info
 * \param ofd  OFD device being set up
 * \param obd  obd device (not referenced in the visible lines)
 */
471 int ofd_fs_setup(const struct lu_env *env, struct ofd_device *ofd,
472 struct obd_device *obd)
474 struct ofd_thread_info *info = ofd_info(env);
475 struct dt_object *fo;
/* NOTE(review): an MDS-named fail-location constant is used here */
480 if (OBD_FAIL_CHECK(OBD_FAIL_MDS_FS_SETUP))
483 /* prepare transactions callbacks */
484 ofd->ofd_txn_cb.dtc_txn_start = NULL;
485 ofd->ofd_txn_cb.dtc_txn_stop = ofd_txn_stop_cb;
486 ofd->ofd_txn_cb.dtc_txn_commit = NULL;
487 ofd->ofd_txn_cb.dtc_cookie = ofd;
488 ofd->ofd_txn_cb.dtc_tag = LCT_DT_THREAD;
489 CFS_INIT_LIST_HEAD(&ofd->ofd_txn_cb.dtc_linkage);
491 dt_txn_callback_add(ofd->ofd_osd, &ofd->ofd_txn_cb);
493 rc = ofd_server_data_init(env, ofd);
/* health-check object: regular file, mode rw-r--r-- */
497 lu_local_obj_fid(&info->fti_fid, OFD_HEALTH_CHECK_OID);
498 memset(&info->fti_attr, 0, sizeof(info->fti_attr));
499 info->fti_attr.la_valid = LA_MODE;
500 info->fti_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR;
501 info->fti_dof.dof_type = dt_mode_to_dft(S_IFREG);
503 fo = dt_find_or_create(env, ofd->ofd_osd, &info->fti_fid,
504 &info->fti_dof, &info->fti_attr);
506 GOTO(out, rc = PTR_ERR(fo));
508 ofd->ofd_health_check_file = fo;
/* last-group object: same attributes, different local FID */
510 lu_local_obj_fid(&info->fti_fid, OFD_LAST_GROUP_OID);
511 memset(&info->fti_attr, 0, sizeof(info->fti_attr));
512 info->fti_attr.la_valid = LA_MODE;
513 info->fti_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR;
514 info->fti_dof.dof_type = dt_mode_to_dft(S_IFREG);
516 fo = dt_find_or_create(env, ofd->ofd_osd, &info->fti_fid,
517 &info->fti_dof, &info->fti_attr);
519 GOTO(out_hc, rc = PTR_ERR(fo));
521 ofd->ofd_last_group_file = fo;
523 rc = ofd_groups_init(env, ofd);
/* release in reverse order of acquisition */
529 lu_object_put(env, &ofd->ofd_last_group_file->do_lu);
531 lu_object_put(env, &ofd->ofd_health_check_file->do_lu);
533 dt_txn_callback_del(ofd->ofd_osd, &ofd->ofd_txn_cb);
/**
 * Flush and release the on-disk state held by an OFD device.
 *
 * For every group from 0 to ofd_max_group inclusive that has a last-id
 * object, the current last id is written out and the object is released.
 * The OSD is then synced with dt_sync(), the transaction callback is
 * removed, and the last-group and health-check objects are put and
 * their pointers cleared to NULL.
 *
 * \param env  execution environment
 * \param ofd  OFD device being cleaned up
 */
537 void ofd_fs_cleanup(const struct lu_env *env, struct ofd_device *ofd)
543 ofd_info_init(env, NULL);
545 for (i = 0; i <= ofd->ofd_max_group; i++) {
546 if (ofd->ofd_lastid_obj[i]) {
/* NOTE(review): the result of ofd_last_id_write() is not checked here */
547 ofd_last_id_write(env, ofd, i);
548 ofd_group_fini(env, ofd, i);
/* the loop index is reused to hold the dt_sync() result */
552 i = dt_sync(env, ofd->ofd_osd);
554 CERROR("can't sync: %d\n", i);
556 /* Remove transaction callback */
557 dt_txn_callback_del(ofd->ofd_osd, &ofd->ofd_txn_cb);
559 if (ofd->ofd_last_group_file) {
560 lu_object_put(env, &ofd->ofd_last_group_file->do_lu);
561 ofd->ofd_last_group_file = NULL;
564 if (ofd->ofd_health_check_file) {
565 lu_object_put(env, &ofd->ofd_health_check_file->do_lu);
566 ofd->ofd_health_check_file = NULL;