Whamcloud - gitweb
0d881dc714fffa572d91d07372fb5f298ce3651f
[fs/lustre-release.git] / lustre / ofd / ofd_fs.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License version 2 for more details (a copy is included
14  * in the LICENSE file that accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License
17  * version 2 along with this program; If not, see
18  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
19  *
20  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21  * CA 95054 USA or visit www.sun.com if you need additional information or
22  * have any questions.
23  *
24  * GPL HEADER END
25  */
26 /*
27  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
28  * Use is subject to license terms.
29  *
30  * Copyright (c) 2011, 2012, Whamcloud, Inc.
31  */
32 /*
33  * This file is part of Lustre, http://www.lustre.org/
34  * Lustre is a trademark of Sun Microsystems, Inc.
35  *
36  * lustre/ofd/ofd_fs.c
37  *
38  * Author: Alexey Zhuravlev <bzzz@whamcloud.com>
39  * Author: Mikhail Pershin <tappro@whamcloud.com>
40  */
41
42 #define DEBUG_SUBSYSTEM S_FILTER
43
44 #include "ofd_internal.h"
45
46 int ofd_record_write(const struct lu_env *env, struct ofd_device *ofd,
47                      struct dt_object *dt, struct lu_buf *buf, loff_t *off)
48 {
49         struct thandle  *th;
50         int              rc;
51
52         ENTRY;
53
54         LASSERT(dt);
55
56         th = dt_trans_create(env, ofd->ofd_osd);
57         if (IS_ERR(th))
58                 RETURN(PTR_ERR(th));
59
60         rc = dt_declare_record_write(env, dt, buf->lb_len, *off, th);
61         if (rc == 0) {
62                 rc = dt_trans_start_local(env, ofd->ofd_osd, th);
63                 if (rc == 0)
64                         rc = dt_record_write(env, dt, buf, off, th);
65         }
66         dt_trans_stop(env, ofd->ofd_osd, th);
67
68         RETURN(rc);
69 }
70
71 int ofd_precreate_batch(struct ofd_device *ofd, int batch)
72 {
73         int count;
74
75         spin_lock(&ofd->ofd_objid_lock);
76         count = min(ofd->ofd_precreate_batch, batch);
77         spin_unlock(&ofd->ofd_objid_lock);
78
79         return count;
80 }
81
82 obd_id ofd_last_id(struct ofd_device *ofd, obd_seq group)
83 {
84         obd_id id;
85
86         LASSERT(group <= ofd->ofd_max_group);
87
88         spin_lock(&ofd->ofd_objid_lock);
89         id = ofd->ofd_last_objids[group];
90         spin_unlock(&ofd->ofd_objid_lock);
91
92         return id;
93 }
94
95 void ofd_last_id_set(struct ofd_device *ofd, obd_id id, obd_seq group)
96 {
97         LASSERT(group <= ofd->ofd_max_group);
98         spin_lock(&ofd->ofd_objid_lock);
99         if (ofd->ofd_last_objids[group] < id)
100                 ofd->ofd_last_objids[group] = id;
101         spin_unlock(&ofd->ofd_objid_lock);
102 }
103
104 int ofd_last_id_write(const struct lu_env *env, struct ofd_device *ofd,
105                       obd_seq group)
106 {
107         struct ofd_thread_info  *info = ofd_info(env);
108         obd_id                   tmp;
109         int                      rc;
110
111         ENTRY;
112
113         info->fti_buf.lb_buf = &tmp;
114         info->fti_buf.lb_len = sizeof(tmp);
115         info->fti_off = 0;
116
117         CDEBUG(D_INODE, "%s: write last_objid for group "LPU64": "LPU64"\n",
118                ofd_obd(ofd)->obd_name, group, ofd_last_id(ofd, group));
119
120         tmp = cpu_to_le64(ofd_last_id(ofd, group));
121
122         rc = ofd_record_write(env, ofd, ofd->ofd_lastid_obj[group],
123                               &info->fti_buf, &info->fti_off);
124         RETURN(rc);
125 }
126
127 int ofd_last_group_write(const struct lu_env *env, struct ofd_device *ofd)
128 {
129         struct ofd_thread_info  *info = ofd_info(env);
130         obd_seq                  tmp;
131         int                      rc;
132
133         ENTRY;
134
135         info->fti_buf.lb_buf = &tmp;
136         info->fti_buf.lb_len = sizeof(tmp);
137         info->fti_off = 0;
138
139         tmp = cpu_to_le32(ofd->ofd_max_group);
140
141         rc = ofd_record_write(env, ofd, ofd->ofd_last_group_file,
142                               &info->fti_buf, &info->fti_off);
143
144         RETURN(rc);
145 }
146
147 void ofd_group_fini(const struct lu_env *env, struct ofd_device *ofd,
148                     int group)
149 {
150         LASSERT(ofd->ofd_lastid_obj[group]);
151         lu_object_put(env, &ofd->ofd_lastid_obj[group]->do_lu);
152         ofd->ofd_lastid_obj[group] = NULL;
153 }
154
155 int ofd_group_load(const struct lu_env *env, struct ofd_device *ofd, int group)
156 {
157         struct ofd_thread_info  *info = ofd_info(env);
158         struct dt_object        *dob;
159         obd_id                   lastid;
160         int                      rc;
161
162         ENTRY;
163
164         /* if group is already initialized */
165         if (ofd->ofd_lastid_obj[group])
166                 RETURN(0);
167
168         lu_local_obj_fid(&info->fti_fid, OFD_GROUP0_LAST_OID + group);
169         memset(&info->fti_attr, 0, sizeof(info->fti_attr));
170         info->fti_attr.la_valid = LA_MODE;
171         info->fti_attr.la_mode = S_IFREG |  S_IRUGO | S_IWUSR;
172         info->fti_dof.dof_type = dt_mode_to_dft(S_IFREG);
173
174         /* create object tracking per-group last created
175          * id to be used by orphan recovery mechanism */
176         dob = dt_find_or_create(env, ofd->ofd_osd, &info->fti_fid,
177                                 &info->fti_dof, &info->fti_attr);
178         if (IS_ERR(dob))
179                 RETURN(PTR_ERR(dob));
180
181         ofd->ofd_lastid_obj[group] = dob;
182         mutex_init(&ofd->ofd_create_locks[group]);
183
184         rc = dt_attr_get(env, dob, &info->fti_attr, BYPASS_CAPA);
185         if (rc)
186                 GOTO(cleanup, rc);
187
188         if (info->fti_attr.la_size == 0) {
189                 /* object is just created, initialize last id */
190                 ofd->ofd_last_objids[group] = OFD_INIT_OBJID;
191                 ofd_last_id_set(ofd, OFD_INIT_OBJID, group);
192                 ofd_last_id_write(env, ofd, group);
193                 ofd_last_group_write(env, ofd);
194         } else if (info->fti_attr.la_size == sizeof(lastid)) {
195                 info->fti_off = 0;
196                 info->fti_buf.lb_buf = &lastid;
197                 info->fti_buf.lb_len = sizeof(lastid);
198
199                 rc = dt_record_read(env, dob, &info->fti_buf, &info->fti_off);
200                 if (rc) {
201                         CERROR("can't read last_id: %d\n", rc);
202                         GOTO(cleanup, rc);
203                 }
204                 ofd->ofd_last_objids[group] = le64_to_cpu(lastid);
205         } else {
206                 CERROR("corrupted size %Lu LAST_ID of group %u\n",
207                        (unsigned long long)info->fti_attr.la_size, group);
208                 rc = -EINVAL;
209         }
210
211         RETURN(0);
212 cleanup:
213         ofd_group_fini(env, ofd, group);
214         RETURN(rc);
215 }
216
217 /* ofd groups managements */
218 int ofd_groups_init(const struct lu_env *env, struct ofd_device *ofd)
219 {
220         struct ofd_thread_info  *info = ofd_info(env);
221         unsigned long            groups_size;
222         obd_seq                  last_group;
223         int                      rc = 0;
224         int                      i;
225
226         ENTRY;
227
228         spin_lock_init(&ofd->ofd_objid_lock);
229
230         rc = dt_attr_get(env, ofd->ofd_last_group_file,
231                          &info->fti_attr, BYPASS_CAPA);
232         if (rc)
233                 GOTO(cleanup, rc);
234
235         ofd->ofd_precreate_batch = OFD_PRECREATE_BATCH_DEFAULT;
236         groups_size = (unsigned long)info->fti_attr.la_size;
237
238         if (groups_size == sizeof(last_group)) {
239                 info->fti_off = 0;
240                 info->fti_buf.lb_buf = &last_group;
241                 info->fti_buf.lb_len = sizeof(last_group);
242
243                 rc = dt_record_read(env, ofd->ofd_last_group_file,
244                                     &info->fti_buf, &info->fti_off);
245                 if (rc) {
246                         CERROR("can't read LAST_GROUP: %d\n", rc);
247                         GOTO(cleanup, rc);
248                 }
249
250                 ofd->ofd_max_group = le32_to_cpu(last_group);
251                 LASSERT(ofd->ofd_max_group <= OFD_MAX_GROUPS);
252         } else if (groups_size == 0) {
253                 ofd->ofd_max_group = 0;
254         } else {
255                 CERROR("groups file is corrupted? size = %lu\n", groups_size);
256                 GOTO(cleanup, rc = -EIO);
257         }
258
259         for (i = 0; i <= ofd->ofd_max_group; i++) {
260                 rc = ofd_group_load(env, ofd, i);
261                 if (rc) {
262                         CERROR("can't load group %d: %d\n", i, rc);
263                         /* Clean all previously set groups */
264                         while (i > 0)
265                                 ofd_group_fini(env, ofd, --i);
266                         GOTO(cleanup, rc);
267                 }
268         }
269
270         CDEBUG(D_OTHER, "%s: %u groups initialized\n",
271               ofd_obd(ofd)->obd_name, ofd->ofd_max_group + 1);
272 cleanup:
273         RETURN(rc);
274 }
275
276 int ofd_clients_data_init(const struct lu_env *env, struct ofd_device *ofd,
277                           unsigned long fsize)
278 {
279         struct obd_device               *obd = ofd_obd(ofd);
280         struct lr_server_data           *lsd = &ofd->ofd_lut.lut_lsd;
281         struct lsd_client_data          *lcd = NULL;
282         struct filter_export_data       *fed;
283         int                              cl_idx;
284         int                              rc = 0;
285         loff_t                           off = lsd->lsd_client_start;
286
287         CLASSERT(offsetof(struct lsd_client_data, lcd_padding) +
288                  sizeof(lcd->lcd_padding) == LR_CLIENT_SIZE);
289
290         OBD_ALLOC_PTR(lcd);
291         if (lcd == NULL)
292                 RETURN(-ENOMEM);
293
294         for (cl_idx = 0; off < fsize; cl_idx++) {
295                 struct obd_export       *exp;
296                 __u64                    last_rcvd;
297
298                 /* Don't assume off is incremented properly by
299                  * fsfilt_read_record(), in case sizeof(*lcd)
300                  * isn't the same as fsd->lsd_client_size.  */
301                 off = lsd->lsd_client_start + cl_idx * lsd->lsd_client_size;
302                 rc = tgt_client_data_read(env, &ofd->ofd_lut, lcd, &off, cl_idx);
303                 if (rc) {
304                         CERROR("error reading FILT %s idx %d off %llu: rc %d\n",
305                                LAST_RCVD, cl_idx, off, rc);
306                         rc = 0;
307                         break; /* read error shouldn't cause startup to fail */
308                 }
309
310                 if (lcd->lcd_uuid[0] == '\0') {
311                         CDEBUG(D_INFO, "skipping zeroed client at offset %d\n",
312                                cl_idx);
313                         continue;
314                 }
315
316                 last_rcvd = lcd->lcd_last_transno;
317
318                 /* These exports are cleaned up by ofd_disconnect(), so they
319                  * need to be set up like real exports as ofd_connect() does.
320                  */
321                 exp = class_new_export(obd, (struct obd_uuid *)lcd->lcd_uuid);
322
323                 CDEBUG(D_HA, "RCVRNG CLIENT uuid: %s idx: %d lr: "LPU64
324                        " srv lr: "LPU64"\n", lcd->lcd_uuid, cl_idx,
325                        last_rcvd, lsd->lsd_last_transno);
326
327                 if (IS_ERR(exp)) {
328                         if (PTR_ERR(exp) == -EALREADY) {
329                                 /* export already exists, zero out this one */
330                                 CERROR("Duplicate export %s!\n", lcd->lcd_uuid);
331                                 continue;
332                         }
333                         GOTO(err_out, rc = PTR_ERR(exp));
334                 }
335
336                 fed = &exp->exp_filter_data;
337                 *fed->fed_ted.ted_lcd = *lcd;
338
339                 rc = tgt_client_add(env, exp, cl_idx);
340                 LASSERTF(rc == 0, "rc = %d\n", rc); /* can't fail existing */
341                 /* VBR: set export last committed version */
342                 exp->exp_last_committed = last_rcvd;
343                 spin_lock(&exp->exp_lock);
344                 exp->exp_connecting = 0;
345                 exp->exp_in_recovery = 0;
346                 spin_unlock(&exp->exp_lock);
347                 obd->obd_max_recoverable_clients++;
348                 class_export_put(exp);
349
350                 /* Need to check last_rcvd even for duplicated exports. */
351                 CDEBUG(D_OTHER, "client at idx %d has last_rcvd = "LPU64"\n",
352                        cl_idx, last_rcvd);
353
354                 spin_lock(&ofd->ofd_lut.lut_translock);
355                 if (last_rcvd > lsd->lsd_last_transno)
356                         lsd->lsd_last_transno = last_rcvd;
357                 spin_unlock(&ofd->ofd_lut.lut_translock);
358         }
359
360 err_out:
361         OBD_FREE_PTR(lcd);
362         RETURN(rc);
363 }
364
365 int ofd_server_data_init(const struct lu_env *env, struct ofd_device *ofd)
366 {
367         struct ofd_thread_info  *info = ofd_info(env);
368         struct lr_server_data   *lsd = &ofd->ofd_lut.lut_lsd;
369         struct obd_device       *obd = ofd_obd(ofd);
370         unsigned long            last_rcvd_size;
371         int                      rc;
372
373         rc = dt_attr_get(env, ofd->ofd_lut.lut_last_rcvd, &info->fti_attr,
374                          BYPASS_CAPA);
375         if (rc)
376                 RETURN(rc);
377
378         last_rcvd_size = (unsigned long)info->fti_attr.la_size;
379
380         /* ensure padding in the struct is the correct size */
381         CLASSERT (offsetof(struct lr_server_data, lsd_padding) +
382                   sizeof(lsd->lsd_padding) == LR_SERVER_SIZE);
383
384         if (last_rcvd_size == 0) {
385                 LCONSOLE_WARN("%s: new disk, initializing\n", obd->obd_name);
386
387                 memcpy(lsd->lsd_uuid, obd->obd_uuid.uuid,
388                        sizeof(lsd->lsd_uuid));
389                 lsd->lsd_last_transno = 0;
390                 lsd->lsd_mount_count = 0;
391                 lsd->lsd_server_size = LR_SERVER_SIZE;
392                 lsd->lsd_client_start = LR_CLIENT_START;
393                 lsd->lsd_client_size = LR_CLIENT_SIZE;
394                 lsd->lsd_subdir_count = FILTER_SUBDIR_COUNT;
395                 lsd->lsd_feature_incompat = OBD_INCOMPAT_OST;
396         } else {
397                 rc = tgt_server_data_read(env, &ofd->ofd_lut);
398                 if (rc) {
399                         CDEBUG(D_INODE,"OBD ofd: error reading %s: rc %d\n",
400                                LAST_RCVD, rc);
401                         GOTO(err_fsd, rc);
402                 }
403                 if (strcmp((char *)lsd->lsd_uuid,
404                            (char *)obd->obd_uuid.uuid)) {
405                         LCONSOLE_ERROR("Trying to start OBD %s using the wrong"
406                                        " disk %s. Were the /dev/ assignments "
407                                        "rearranged?\n",
408                                        obd->obd_uuid.uuid, lsd->lsd_uuid);
409                         GOTO(err_fsd, rc = -EINVAL);
410                 }
411         }
412
413         lsd->lsd_mount_count++;
414         obd->u.obt.obt_mount_count = lsd->lsd_mount_count;
415         obd->u.obt.obt_instance = (__u32)obd->u.obt.obt_mount_count;
416         ofd->ofd_subdir_count = lsd->lsd_subdir_count;
417
418         if (lsd->lsd_feature_incompat & ~OFD_INCOMPAT_SUPP) {
419                 CERROR("%s: unsupported incompat filesystem feature(s) %x\n",
420                        obd->obd_name,
421                        lsd->lsd_feature_incompat & ~OFD_INCOMPAT_SUPP);
422                 GOTO(err_fsd, rc = -EINVAL);
423         }
424         if (lsd->lsd_feature_rocompat & ~OFD_ROCOMPAT_SUPP) {
425                 CERROR("%s: unsupported read-only filesystem feature(s) %x\n",
426                        obd->obd_name,
427                        lsd->lsd_feature_rocompat & ~OFD_ROCOMPAT_SUPP);
428                 /* Do something like remount filesystem read-only */
429                 GOTO(err_fsd, rc = -EINVAL);
430         }
431
432         CDEBUG(D_INODE, "%s: server last_transno : "LPU64"\n",
433                obd->obd_name, lsd->lsd_last_transno);
434         CDEBUG(D_INODE, "%s: server mount_count: "LPU64"\n",
435                obd->obd_name, lsd->lsd_mount_count);
436         CDEBUG(D_INODE, "%s: server data size: %u\n",
437                obd->obd_name, lsd->lsd_server_size);
438         CDEBUG(D_INODE, "%s: per-client data start: %u\n",
439                obd->obd_name, lsd->lsd_client_start);
440         CDEBUG(D_INODE, "%s: per-client data size: %u\n",
441                obd->obd_name, lsd->lsd_client_size);
442         CDEBUG(D_INODE, "%s: server subdir_count: %u\n",
443                obd->obd_name, lsd->lsd_subdir_count);
444         CDEBUG(D_INODE, "%s: last_rcvd clients: %lu\n", obd->obd_name,
445                last_rcvd_size <= lsd->lsd_client_start ? 0 :
446                (last_rcvd_size - lsd->lsd_client_start) /
447                lsd->lsd_client_size);
448
449         if (!obd->obd_replayable)
450                 CWARN("%s: recovery support OFF\n", obd->obd_name);
451
452         rc = ofd_clients_data_init(env, ofd, last_rcvd_size);
453
454         spin_lock(&ofd->ofd_lut.lut_translock);
455         obd->obd_last_committed = lsd->lsd_last_transno;
456         ofd->ofd_lut.lut_last_transno = lsd->lsd_last_transno;
457         spin_unlock(&ofd->ofd_lut.lut_translock);
458
459         /* save it, so mount count and last_transno is current */
460         rc = tgt_server_data_update(env, &ofd->ofd_lut, 0);
461         if (rc)
462                 GOTO(err_fsd, rc);
463
464         RETURN(0);
465
466 err_fsd:
467         class_disconnect_exports(obd);
468         RETURN(rc);
469 }
470
471 int ofd_fs_setup(const struct lu_env *env, struct ofd_device *ofd,
472                  struct obd_device *obd)
473 {
474         struct ofd_thread_info  *info = ofd_info(env);
475         struct dt_object        *fo;
476         int                      rc = 0;
477
478         ENTRY;
479
480         if (OBD_FAIL_CHECK(OBD_FAIL_MDS_FS_SETUP))
481                 RETURN (-ENOENT);
482
483         /* prepare transactions callbacks */
484         ofd->ofd_txn_cb.dtc_txn_start = NULL;
485         ofd->ofd_txn_cb.dtc_txn_stop = ofd_txn_stop_cb;
486         ofd->ofd_txn_cb.dtc_txn_commit = NULL;
487         ofd->ofd_txn_cb.dtc_cookie = ofd;
488         ofd->ofd_txn_cb.dtc_tag = LCT_DT_THREAD;
489         CFS_INIT_LIST_HEAD(&ofd->ofd_txn_cb.dtc_linkage);
490
491         dt_txn_callback_add(ofd->ofd_osd, &ofd->ofd_txn_cb);
492
493         rc = ofd_server_data_init(env, ofd);
494         if (rc)
495                 GOTO(out, rc);
496
497         lu_local_obj_fid(&info->fti_fid, OFD_HEALTH_CHECK_OID);
498         memset(&info->fti_attr, 0, sizeof(info->fti_attr));
499         info->fti_attr.la_valid = LA_MODE;
500         info->fti_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR;
501         info->fti_dof.dof_type = dt_mode_to_dft(S_IFREG);
502
503         fo = dt_find_or_create(env, ofd->ofd_osd, &info->fti_fid,
504                                &info->fti_dof, &info->fti_attr);
505         if (IS_ERR(fo))
506                 GOTO(out, rc = PTR_ERR(fo));
507
508         ofd->ofd_health_check_file = fo;
509
510         lu_local_obj_fid(&info->fti_fid, OFD_LAST_GROUP_OID);
511         memset(&info->fti_attr, 0, sizeof(info->fti_attr));
512         info->fti_attr.la_valid = LA_MODE;
513         info->fti_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR;
514         info->fti_dof.dof_type = dt_mode_to_dft(S_IFREG);
515
516         fo = dt_find_or_create(env, ofd->ofd_osd, &info->fti_fid,
517                                &info->fti_dof, &info->fti_attr);
518         if (IS_ERR(fo))
519                 GOTO(out_hc, rc = PTR_ERR(fo));
520
521         ofd->ofd_last_group_file = fo;
522
523         rc = ofd_groups_init(env, ofd);
524         if (rc)
525                 GOTO(out_lg, rc);
526
527         RETURN(0);
528 out_lg:
529         lu_object_put(env, &ofd->ofd_last_group_file->do_lu);
530 out_hc:
531         lu_object_put(env, &ofd->ofd_health_check_file->do_lu);
532 out:
533         dt_txn_callback_del(ofd->ofd_osd, &ofd->ofd_txn_cb);
534         return rc;
535 }
536
537 void ofd_fs_cleanup(const struct lu_env *env, struct ofd_device *ofd)
538 {
539         int i;
540
541         ENTRY;
542
543         ofd_info_init(env, NULL);
544
545         for (i = 0; i <= ofd->ofd_max_group; i++) {
546                 if (ofd->ofd_lastid_obj[i]) {
547                         ofd_last_id_write(env, ofd, i);
548                         ofd_group_fini(env, ofd, i);
549                 }
550         }
551
552         i = dt_sync(env, ofd->ofd_osd);
553         if (i)
554                 CERROR("can't sync: %d\n", i);
555
556         /* Remove transaction callback */
557         dt_txn_callback_del(ofd->ofd_osd, &ofd->ofd_txn_cb);
558
559         if (ofd->ofd_last_group_file) {
560                 lu_object_put(env, &ofd->ofd_last_group_file->do_lu);
561                 ofd->ofd_last_group_file = NULL;
562         }
563
564         if (ofd->ofd_health_check_file) {
565                 lu_object_put(env, &ofd->ofd_health_check_file->do_lu);
566                 ofd->ofd_health_check_file = NULL;
567         }
568
569         EXIT;
570 }
571