Whamcloud - gitweb
31bca72a5f3491ca5769c6be389e689f00a6de3a
[fs/lustre-release.git] / lustre / ofd / ofd_fs.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License version 2 for more details (a copy is included
14  * in the LICENSE file that accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License
17  * version 2 along with this program; If not, see
18  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
19  *
20  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21  * CA 95054 USA or visit www.sun.com if you need additional information or
22  * have any questions.
23  *
24  * GPL HEADER END
25  */
26 /*
27  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
28  * Use is subject to license terms.
29  *
30  * Copyright (c) 2011, 2012, Whamcloud, Inc.
31  */
32 /*
33  * This file is part of Lustre, http://www.lustre.org/
34  * Lustre is a trademark of Sun Microsystems, Inc.
35  *
36  * lustre/ofd/ofd_fs.c
37  *
38  * Author: Alexey Zhuravlev <bzzz@whamcloud.com>
39  * Author: Mikhail Pershin <tappro@whamcloud.com>
40  */
41
42 #define DEBUG_SUBSYSTEM S_FILTER
43
44 #include "ofd_internal.h"
45
46 int ofd_record_write(const struct lu_env *env, struct ofd_device *ofd,
47                      struct dt_object *dt, struct lu_buf *buf, loff_t *off)
48 {
49         struct thandle  *th;
50         int              rc;
51
52         ENTRY;
53
54         LASSERT(dt);
55
56         th = dt_trans_create(env, ofd->ofd_osd);
57         if (IS_ERR(th))
58                 RETURN(PTR_ERR(th));
59
60         rc = dt_declare_record_write(env, dt, buf->lb_len, *off, th);
61         if (rc == 0) {
62                 rc = dt_trans_start_local(env, ofd->ofd_osd, th);
63                 if (rc == 0)
64                         rc = dt_record_write(env, dt, buf, off, th);
65         }
66         dt_trans_stop(env, ofd->ofd_osd, th);
67
68         RETURN(rc);
69 }
70
71 obd_id ofd_last_id(struct ofd_device *ofd, obd_seq group)
72 {
73         obd_id id;
74
75         LASSERT(group <= ofd->ofd_max_group);
76
77         cfs_spin_lock(&ofd->ofd_objid_lock);
78         id = ofd->ofd_last_objids[group];
79         cfs_spin_unlock(&ofd->ofd_objid_lock);
80
81         return id;
82 }
83
84 void ofd_last_id_set(struct ofd_device *ofd, obd_id id, obd_seq group)
85 {
86         LASSERT(group <= ofd->ofd_max_group);
87         cfs_spin_lock(&ofd->ofd_objid_lock);
88         if (ofd->ofd_last_objids[group] < id)
89                 ofd->ofd_last_objids[group] = id;
90         cfs_spin_unlock(&ofd->ofd_objid_lock);
91 }
92
93 int ofd_last_id_write(const struct lu_env *env, struct ofd_device *ofd,
94                       obd_seq group)
95 {
96         struct ofd_thread_info  *info = ofd_info(env);
97         obd_id                   tmp;
98         int                      rc;
99
100         ENTRY;
101
102         info->fti_buf.lb_buf = &tmp;
103         info->fti_buf.lb_len = sizeof(tmp);
104         info->fti_off = 0;
105
106         CDEBUG(D_INODE, "%s: write last_objid for group "LPU64": "LPU64"\n",
107                ofd_obd(ofd)->obd_name, group, ofd_last_id(ofd, group));
108
109         tmp = cpu_to_le64(ofd_last_id(ofd, group));
110
111         rc = ofd_record_write(env, ofd, ofd->ofd_lastid_obj[group],
112                               &info->fti_buf, &info->fti_off);
113         RETURN(rc);
114 }
115
116 int ofd_last_group_write(const struct lu_env *env, struct ofd_device *ofd)
117 {
118         struct ofd_thread_info  *info = ofd_info(env);
119         obd_seq                  tmp;
120         int                      rc;
121
122         ENTRY;
123
124         info->fti_buf.lb_buf = &tmp;
125         info->fti_buf.lb_len = sizeof(tmp);
126         info->fti_off = 0;
127
128         tmp = cpu_to_le32(ofd->ofd_max_group);
129
130         rc = ofd_record_write(env, ofd, ofd->ofd_last_group_file,
131                               &info->fti_buf, &info->fti_off);
132
133         RETURN(rc);
134 }
135
136 void ofd_group_fini(const struct lu_env *env, struct ofd_device *ofd,
137                     int group)
138 {
139         LASSERT(ofd->ofd_lastid_obj[group]);
140         lu_object_put(env, &ofd->ofd_lastid_obj[group]->do_lu);
141         ofd->ofd_lastid_obj[group] = NULL;
142 }
143
144 int ofd_group_load(const struct lu_env *env, struct ofd_device *ofd, int group)
145 {
146         struct ofd_thread_info  *info = ofd_info(env);
147         struct dt_object        *dob;
148         obd_id                   lastid;
149         int                      rc;
150
151         ENTRY;
152
153         /* if group is already initialized */
154         if (ofd->ofd_lastid_obj[group])
155                 RETURN(0);
156
157         lu_local_obj_fid(&info->fti_fid, OFD_GROUP0_LAST_OID + group);
158         memset(&info->fti_attr, 0, sizeof(info->fti_attr));
159         info->fti_attr.la_valid = LA_MODE;
160         info->fti_attr.la_mode = S_IFREG |  S_IRUGO | S_IWUSR;
161         info->fti_dof.dof_type = dt_mode_to_dft(S_IFREG);
162
163         /* create object tracking per-group last created
164          * id to be used by orphan recovery mechanism */
165         dob = dt_find_or_create(env, ofd->ofd_osd, &info->fti_fid,
166                                 &info->fti_dof, &info->fti_attr);
167         if (IS_ERR(dob))
168                 RETURN(PTR_ERR(dob));
169
170         ofd->ofd_lastid_obj[group] = dob;
171         cfs_mutex_init(&ofd->ofd_create_locks[group]);
172
173         rc = dt_attr_get(env, dob, &info->fti_attr, BYPASS_CAPA);
174         if (rc)
175                 GOTO(cleanup, rc);
176
177         if (info->fti_attr.la_size == 0) {
178                 /* object is just created, initialize last id */
179                 ofd->ofd_last_objids[group] = OFD_INIT_OBJID;
180                 ofd_last_id_set(ofd, OFD_INIT_OBJID, group);
181                 ofd_last_id_write(env, ofd, group);
182                 ofd_last_group_write(env, ofd);
183         } else if (info->fti_attr.la_size == sizeof(lastid)) {
184                 info->fti_off = 0;
185                 info->fti_buf.lb_buf = &lastid;
186                 info->fti_buf.lb_len = sizeof(lastid);
187
188                 rc = dt_record_read(env, dob, &info->fti_buf, &info->fti_off);
189                 if (rc) {
190                         CERROR("can't read last_id: %d\n", rc);
191                         GOTO(cleanup, rc);
192                 }
193                 ofd->ofd_last_objids[group] = le64_to_cpu(lastid);
194         } else {
195                 CERROR("corrupted size %Lu LAST_ID of group %u\n",
196                        (unsigned long long)info->fti_attr.la_size, group);
197                 rc = -EINVAL;
198         }
199
200         RETURN(0);
201 cleanup:
202         ofd_group_fini(env, ofd, group);
203         RETURN(rc);
204 }
205
206 /* ofd groups managements */
207 int ofd_groups_init(const struct lu_env *env, struct ofd_device *ofd)
208 {
209         struct ofd_thread_info  *info = ofd_info(env);
210         unsigned long            groups_size;
211         obd_seq                  last_group;
212         int                      rc = 0;
213         int                      i;
214
215         ENTRY;
216
217         cfs_spin_lock_init(&ofd->ofd_objid_lock);
218
219         rc = dt_attr_get(env, ofd->ofd_last_group_file,
220                          &info->fti_attr, BYPASS_CAPA);
221         if (rc)
222                 GOTO(cleanup, rc);
223
224         groups_size = (unsigned long)info->fti_attr.la_size;
225
226         if (groups_size == sizeof(last_group)) {
227                 info->fti_off = 0;
228                 info->fti_buf.lb_buf = &last_group;
229                 info->fti_buf.lb_len = sizeof(last_group);
230
231                 rc = dt_record_read(env, ofd->ofd_last_group_file,
232                                     &info->fti_buf, &info->fti_off);
233                 if (rc) {
234                         CERROR("can't read LAST_GROUP: %d\n", rc);
235                         GOTO(cleanup, rc);
236                 }
237
238                 ofd->ofd_max_group = le32_to_cpu(last_group);
239                 LASSERT(ofd->ofd_max_group <= OFD_MAX_GROUPS);
240         } else if (groups_size == 0) {
241                 ofd->ofd_max_group = 0;
242         } else {
243                 CERROR("groups file is corrupted? size = %lu\n", groups_size);
244                 GOTO(cleanup, rc = -EIO);
245         }
246
247         for (i = 0; i <= ofd->ofd_max_group; i++) {
248                 rc = ofd_group_load(env, ofd, i);
249                 if (rc) {
250                         CERROR("can't load group %d: %d\n", i, rc);
251                         /* Clean all previously set groups */
252                         while (i > 0)
253                                 ofd_group_fini(env, ofd, --i);
254                         GOTO(cleanup, rc);
255                 }
256         }
257
258         CWARN("%s: %u groups initialized\n",
259               ofd_obd(ofd)->obd_name, ofd->ofd_max_group + 1);
260 cleanup:
261         RETURN(rc);
262 }
263
264 int ofd_clients_data_init(const struct lu_env *env, struct ofd_device *ofd,
265                           unsigned long fsize)
266 {
267         struct obd_device               *obd = ofd_obd(ofd);
268         struct lr_server_data           *lsd = &ofd->ofd_lut.lut_lsd;
269         struct lsd_client_data          *lcd = NULL;
270         struct filter_export_data       *fed;
271         int                              cl_idx;
272         int                              rc = 0;
273         loff_t                           off = lsd->lsd_client_start;
274
275         CLASSERT(offsetof(struct lsd_client_data, lcd_padding) +
276                  sizeof(lcd->lcd_padding) == LR_CLIENT_SIZE);
277
278         OBD_ALLOC_PTR(lcd);
279         if (lcd == NULL)
280                 RETURN(-ENOMEM);
281
282         for (cl_idx = 0; off < fsize; cl_idx++) {
283                 struct obd_export       *exp;
284                 __u64                    last_rcvd;
285
286                 /* Don't assume off is incremented properly by
287                  * fsfilt_read_record(), in case sizeof(*lcd)
288                  * isn't the same as fsd->lsd_client_size.  */
289                 off = lsd->lsd_client_start + cl_idx * lsd->lsd_client_size;
290                 rc = lut_client_data_read(env, &ofd->ofd_lut, lcd, &off, cl_idx);
291                 if (rc) {
292                         CERROR("error reading FILT %s idx %d off %llu: rc %d\n",
293                                LAST_RCVD, cl_idx, off, rc);
294                         rc = 0;
295                         break; /* read error shouldn't cause startup to fail */
296                 }
297
298                 if (lcd->lcd_uuid[0] == '\0') {
299                         CDEBUG(D_INFO, "skipping zeroed client at offset %d\n",
300                                cl_idx);
301                         continue;
302                 }
303
304                 last_rcvd = lcd->lcd_last_transno;
305
306                 /* These exports are cleaned up by ofd_disconnect(), so they
307                  * need to be set up like real exports as ofd_connect() does.
308                  */
309                 exp = class_new_export(obd, (struct obd_uuid *)lcd->lcd_uuid);
310
311                 CDEBUG(D_HA, "RCVRNG CLIENT uuid: %s idx: %d lr: "LPU64
312                        " srv lr: "LPU64"\n", lcd->lcd_uuid, cl_idx,
313                        last_rcvd, lsd->lsd_last_transno);
314
315                 if (IS_ERR(exp)) {
316                         if (PTR_ERR(exp) == -EALREADY) {
317                                 /* export already exists, zero out this one */
318                                 CERROR("Duplicate export %s!\n", lcd->lcd_uuid);
319                                 continue;
320                         }
321                         GOTO(err_out, rc = PTR_ERR(exp));
322                 }
323
324                 fed = &exp->exp_filter_data;
325                 *fed->fed_ted.ted_lcd = *lcd;
326
327                 ofd_export_stats_init(ofd, exp, NULL);
328                 rc = lut_client_add(env, exp, cl_idx);
329                 LASSERTF(rc == 0, "rc = %d\n", rc); /* can't fail existing */
330                 /* VBR: set export last committed version */
331                 exp->exp_last_committed = last_rcvd;
332                 cfs_spin_lock(&exp->exp_lock);
333                 exp->exp_connecting = 0;
334                 exp->exp_in_recovery = 0;
335                 cfs_spin_unlock(&exp->exp_lock);
336                 obd->obd_max_recoverable_clients++;
337                 class_export_put(exp);
338
339                 /* Need to check last_rcvd even for duplicated exports. */
340                 CDEBUG(D_OTHER, "client at idx %d has last_rcvd = "LPU64"\n",
341                        cl_idx, last_rcvd);
342
343                 cfs_spin_lock(&ofd->ofd_lut.lut_translock);
344                 if (last_rcvd > lsd->lsd_last_transno)
345                         lsd->lsd_last_transno = last_rcvd;
346                 cfs_spin_unlock(&ofd->ofd_lut.lut_translock);
347         }
348
349 err_out:
350         OBD_FREE_PTR(lcd);
351         RETURN(rc);
352 }
353
354 int ofd_server_data_init(const struct lu_env *env, struct ofd_device *ofd)
355 {
356         struct ofd_thread_info  *info = ofd_info(env);
357         struct lr_server_data   *lsd = &ofd->ofd_lut.lut_lsd;
358         struct obd_device       *obd = ofd_obd(ofd);
359         unsigned long            last_rcvd_size;
360         int                      rc;
361
362         rc = dt_attr_get(env, ofd->ofd_lut.lut_last_rcvd, &info->fti_attr,
363                          BYPASS_CAPA);
364         if (rc)
365                 RETURN(rc);
366
367         last_rcvd_size = (unsigned long)info->fti_attr.la_size;
368
369         /* ensure padding in the struct is the correct size */
370         CLASSERT (offsetof(struct lr_server_data, lsd_padding) +
371                   sizeof(lsd->lsd_padding) == LR_SERVER_SIZE);
372
373         if (last_rcvd_size == 0) {
374                 LCONSOLE_WARN("%s: new disk, initializing\n", obd->obd_name);
375
376                 memcpy(lsd->lsd_uuid, obd->obd_uuid.uuid,
377                        sizeof(lsd->lsd_uuid));
378                 lsd->lsd_last_transno = 0;
379                 lsd->lsd_mount_count = 0;
380                 lsd->lsd_server_size = LR_SERVER_SIZE;
381                 lsd->lsd_client_start = LR_CLIENT_START;
382                 lsd->lsd_client_size = LR_CLIENT_SIZE;
383                 lsd->lsd_subdir_count = FILTER_SUBDIR_COUNT;
384                 lsd->lsd_feature_incompat = OBD_INCOMPAT_OST;
385         } else {
386                 rc = lut_server_data_read(env, &ofd->ofd_lut);
387                 if (rc) {
388                         CDEBUG(D_INODE,"OBD ofd: error reading %s: rc %d\n",
389                                LAST_RCVD, rc);
390                         GOTO(err_fsd, rc);
391                 }
392                 if (strcmp((char *)lsd->lsd_uuid,
393                            (char *)obd->obd_uuid.uuid)) {
394                         LCONSOLE_ERROR("Trying to start OBD %s using the wrong"
395                                        " disk %s. Were the /dev/ assignments "
396                                        "rearranged?\n",
397                                        obd->obd_uuid.uuid, lsd->lsd_uuid);
398                         GOTO(err_fsd, rc = -EINVAL);
399                 }
400         }
401
402         lsd->lsd_mount_count++;
403         obd->u.obt.obt_mount_count = lsd->lsd_mount_count;
404         obd->u.obt.obt_instance = (__u32)obd->u.obt.obt_mount_count;
405         ofd->ofd_subdir_count = lsd->lsd_subdir_count;
406
407         if (lsd->lsd_feature_incompat & ~OFD_INCOMPAT_SUPP) {
408                 CERROR("%s: unsupported incompat filesystem feature(s) %x\n",
409                        obd->obd_name,
410                        lsd->lsd_feature_incompat & ~OFD_INCOMPAT_SUPP);
411                 GOTO(err_fsd, rc = -EINVAL);
412         }
413         if (lsd->lsd_feature_rocompat & ~OFD_ROCOMPAT_SUPP) {
414                 CERROR("%s: unsupported read-only filesystem feature(s) %x\n",
415                        obd->obd_name,
416                        lsd->lsd_feature_rocompat & ~OFD_ROCOMPAT_SUPP);
417                 /* Do something like remount filesystem read-only */
418                 GOTO(err_fsd, rc = -EINVAL);
419         }
420
421         CDEBUG(D_INODE, "%s: server last_transno : "LPU64"\n",
422                obd->obd_name, lsd->lsd_last_transno);
423         CDEBUG(D_INODE, "%s: server mount_count: "LPU64"\n",
424                obd->obd_name, lsd->lsd_mount_count);
425         CDEBUG(D_INODE, "%s: server data size: %u\n",
426                obd->obd_name, lsd->lsd_server_size);
427         CDEBUG(D_INODE, "%s: per-client data start: %u\n",
428                obd->obd_name, lsd->lsd_client_start);
429         CDEBUG(D_INODE, "%s: per-client data size: %u\n",
430                obd->obd_name, lsd->lsd_client_size);
431         CDEBUG(D_INODE, "%s: server subdir_count: %u\n",
432                obd->obd_name, lsd->lsd_subdir_count);
433         CDEBUG(D_INODE, "%s: last_rcvd clients: %lu\n", obd->obd_name,
434                last_rcvd_size <= lsd->lsd_client_start ? 0 :
435                (last_rcvd_size - lsd->lsd_client_start) /
436                lsd->lsd_client_size);
437
438         if (!obd->obd_replayable)
439                 CWARN("%s: recovery support OFF\n", obd->obd_name);
440
441         rc = ofd_clients_data_init(env, ofd, last_rcvd_size);
442
443         cfs_spin_lock(&ofd->ofd_lut.lut_translock);
444         obd->obd_last_committed = lsd->lsd_last_transno;
445         cfs_spin_unlock(&ofd->ofd_lut.lut_translock);
446
447         /* save it, so mount count and last_transno is current */
448         rc = lut_server_data_update(env, &ofd->ofd_lut, 0);
449         if (rc)
450                 GOTO(err_fsd, rc);
451
452         RETURN(0);
453
454 err_fsd:
455         class_disconnect_exports(obd);
456         RETURN(rc);
457 }
458
459 int ofd_fs_setup(const struct lu_env *env, struct ofd_device *ofd,
460                  struct obd_device *obd)
461 {
462         struct ofd_thread_info  *info = ofd_info(env);
463         struct dt_object        *fo;
464         int                      rc = 0;
465
466         ENTRY;
467
468         if (OBD_FAIL_CHECK(OBD_FAIL_MDS_FS_SETUP))
469                 RETURN (-ENOENT);
470
471         rc = ofd_server_data_init(env, ofd);
472         if (rc)
473                 GOTO(out, rc);
474
475         lu_local_obj_fid(&info->fti_fid, OFD_HEALTH_CHECK_OID);
476         memset(&info->fti_attr, 0, sizeof(info->fti_attr));
477         info->fti_attr.la_valid = LA_MODE;
478         info->fti_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR;
479         info->fti_dof.dof_type = dt_mode_to_dft(S_IFREG);
480
481         fo = dt_find_or_create(env, ofd->ofd_osd, &info->fti_fid,
482                                &info->fti_dof, &info->fti_attr);
483         if (IS_ERR(fo))
484                 GOTO(out, rc = PTR_ERR(fo));
485
486         ofd->ofd_health_check_file = fo;
487
488         lu_local_obj_fid(&info->fti_fid, OFD_LAST_GROUP_OID);
489         memset(&info->fti_attr, 0, sizeof(info->fti_attr));
490         info->fti_attr.la_valid = LA_MODE;
491         info->fti_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR;
492         info->fti_dof.dof_type = dt_mode_to_dft(S_IFREG);
493
494         fo = dt_find_or_create(env, ofd->ofd_osd, &info->fti_fid,
495                                &info->fti_dof, &info->fti_attr);
496         if (IS_ERR(fo))
497                 GOTO(out_hc, rc = PTR_ERR(fo));
498
499         ofd->ofd_last_group_file = fo;
500
501         rc = ofd_groups_init(env, ofd);
502         if (rc)
503                 GOTO(out_lg, rc);
504
505         RETURN(0);
506 out_lg:
507         lu_object_put(env, &ofd->ofd_last_group_file->do_lu);
508 out_hc:
509         lu_object_put(env, &ofd->ofd_health_check_file->do_lu);
510 out:
511         return rc;
512 }
513
514 void ofd_fs_cleanup(const struct lu_env *env, struct ofd_device *ofd)
515 {
516         int i;
517
518         ENTRY;
519
520         ofd_info_init(env, NULL);
521
522         for (i = 0; i <= ofd->ofd_max_group; i++) {
523                 if (ofd->ofd_lastid_obj[i]) {
524                         ofd_last_id_write(env, ofd, i);
525                         ofd_group_fini(env, ofd, i);
526                 }
527         }
528
529         i = dt_sync(env, ofd->ofd_osd);
530         if (i)
531                 CERROR("can't sync: %d\n", i);
532
533         if (ofd->ofd_last_group_file) {
534                 lu_object_put(env, &ofd->ofd_last_group_file->do_lu);
535                 ofd->ofd_last_group_file = NULL;
536         }
537
538         if (ofd->ofd_health_check_file) {
539                 lu_object_put(env, &ofd->ofd_health_check_file->do_lu);
540                 ofd->ofd_health_check_file = NULL;
541         }
542
543         EXIT;
544 }
545