Whamcloud - gitweb
182875858a28eacdc9177797babac8dce58b2a9a
[fs/lustre-release.git] / lustre / ofd / ofd_fs.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License version 2 for more details (a copy is included
14  * in the LICENSE file that accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License
17  * version 2 along with this program; If not, see
18  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
19  *
20  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21  * CA 95054 USA or visit www.sun.com if you need additional information or
22  * have any questions.
23  *
24  * GPL HEADER END
25  */
26 /*
27  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
28  * Use is subject to license terms.
29  *
30  * Copyright (c) 2011, 2012, Whamcloud, Inc.
31  */
32 /*
33  * This file is part of Lustre, http://www.lustre.org/
34  * Lustre is a trademark of Sun Microsystems, Inc.
35  *
36  * lustre/ofd/ofd_fs.c
37  *
38  * Author: Alexey Zhuravlev <bzzz@whamcloud.com>
39  * Author: Mikhail Pershin <tappro@whamcloud.com>
40  */
41
42 #define DEBUG_SUBSYSTEM S_FILTER
43
44 #include "ofd_internal.h"
45
46 int ofd_record_write(const struct lu_env *env, struct ofd_device *ofd,
47                      struct dt_object *dt, struct lu_buf *buf, loff_t *off)
48 {
49         struct thandle  *th;
50         int              rc;
51
52         ENTRY;
53
54         LASSERT(dt);
55
56         th = dt_trans_create(env, ofd->ofd_osd);
57         if (IS_ERR(th))
58                 RETURN(PTR_ERR(th));
59
60         rc = dt_declare_record_write(env, dt, buf->lb_len, *off, th);
61         if (rc == 0) {
62                 rc = dt_trans_start_local(env, ofd->ofd_osd, th);
63                 if (rc == 0)
64                         rc = dt_record_write(env, dt, buf, off, th);
65         }
66         dt_trans_stop(env, ofd->ofd_osd, th);
67
68         RETURN(rc);
69 }
70
71 obd_id ofd_last_id(struct ofd_device *ofd, obd_seq group)
72 {
73         obd_id id;
74
75         LASSERT(group <= ofd->ofd_max_group);
76
77         cfs_spin_lock(&ofd->ofd_objid_lock);
78         id = ofd->ofd_last_objids[group];
79         cfs_spin_unlock(&ofd->ofd_objid_lock);
80
81         return id;
82 }
83
84 void ofd_last_id_set(struct ofd_device *ofd, obd_id id, obd_seq group)
85 {
86         LASSERT(group <= ofd->ofd_max_group);
87         cfs_spin_lock(&ofd->ofd_objid_lock);
88         if (ofd->ofd_last_objids[group] < id)
89                 ofd->ofd_last_objids[group] = id;
90         cfs_spin_unlock(&ofd->ofd_objid_lock);
91 }
92
93 int ofd_last_id_write(const struct lu_env *env, struct ofd_device *ofd,
94                       obd_seq group)
95 {
96         struct ofd_thread_info  *info = ofd_info(env);
97         obd_id                   tmp;
98         int                      rc;
99
100         ENTRY;
101
102         info->fti_buf.lb_buf = &tmp;
103         info->fti_buf.lb_len = sizeof(tmp);
104         info->fti_off = 0;
105
106         CDEBUG(D_INODE, "%s: write last_objid for group "LPU64": "LPU64"\n",
107                ofd_obd(ofd)->obd_name, group, ofd_last_id(ofd, group));
108
109         tmp = cpu_to_le64(ofd_last_id(ofd, group));
110
111         rc = ofd_record_write(env, ofd, ofd->ofd_lastid_obj[group],
112                               &info->fti_buf, &info->fti_off);
113         RETURN(rc);
114 }
115
116 int ofd_last_group_write(const struct lu_env *env, struct ofd_device *ofd)
117 {
118         struct ofd_thread_info  *info = ofd_info(env);
119         obd_seq                  tmp;
120         int                      rc;
121
122         ENTRY;
123
124         info->fti_buf.lb_buf = &tmp;
125         info->fti_buf.lb_len = sizeof(tmp);
126         info->fti_off = 0;
127
128         tmp = cpu_to_le32(ofd->ofd_max_group);
129
130         rc = ofd_record_write(env, ofd, ofd->ofd_last_group_file,
131                               &info->fti_buf, &info->fti_off);
132
133         RETURN(rc);
134 }
135
136 void ofd_group_fini(const struct lu_env *env, struct ofd_device *ofd,
137                     int group)
138 {
139         LASSERT(ofd->ofd_lastid_obj[group]);
140         lu_object_put(env, &ofd->ofd_lastid_obj[group]->do_lu);
141         ofd->ofd_lastid_obj[group] = NULL;
142 }
143
144 int ofd_group_load(const struct lu_env *env, struct ofd_device *ofd, int group)
145 {
146         struct ofd_thread_info  *info = ofd_info(env);
147         struct dt_object        *dob;
148         obd_id                   lastid;
149         int                      rc;
150
151         ENTRY;
152
153         /* if group is already initialized */
154         if (ofd->ofd_lastid_obj[group])
155                 RETURN(0);
156
157         lu_local_obj_fid(&info->fti_fid, OFD_GROUP0_LAST_OID + group);
158         memset(&info->fti_attr, 0, sizeof(info->fti_attr));
159         info->fti_attr.la_valid = LA_MODE;
160         info->fti_attr.la_mode = S_IFREG |  S_IRUGO | S_IWUSR;
161         info->fti_dof.dof_type = dt_mode_to_dft(S_IFREG);
162
163         /* create object tracking per-group last created
164          * id to be used by orphan recovery mechanism */
165         dob = dt_find_or_create(env, ofd->ofd_osd, &info->fti_fid,
166                                 &info->fti_dof, &info->fti_attr);
167         if (IS_ERR(dob))
168                 RETURN(PTR_ERR(dob));
169
170         ofd->ofd_lastid_obj[group] = dob;
171         cfs_mutex_init(&ofd->ofd_create_locks[group]);
172
173         rc = dt_attr_get(env, dob, &info->fti_attr, BYPASS_CAPA);
174         if (rc)
175                 GOTO(cleanup, rc);
176
177         if (info->fti_attr.la_size == 0) {
178                 /* object is just created, initialize last id */
179                 ofd->ofd_last_objids[group] = OFD_INIT_OBJID;
180                 ofd_last_id_set(ofd, OFD_INIT_OBJID, group);
181                 ofd_last_id_write(env, ofd, group);
182                 ofd_last_group_write(env, ofd);
183         } else if (info->fti_attr.la_size == sizeof(lastid)) {
184                 info->fti_off = 0;
185                 info->fti_buf.lb_buf = &lastid;
186                 info->fti_buf.lb_len = sizeof(lastid);
187
188                 rc = dt_record_read(env, dob, &info->fti_buf, &info->fti_off);
189                 if (rc) {
190                         CERROR("can't read last_id: %d\n", rc);
191                         GOTO(cleanup, rc);
192                 }
193                 ofd->ofd_last_objids[group] = le64_to_cpu(lastid);
194         } else {
195                 CERROR("corrupted size %Lu LAST_ID of group %u\n",
196                        (unsigned long long)info->fti_attr.la_size, group);
197                 rc = -EINVAL;
198         }
199
200         RETURN(0);
201 cleanup:
202         ofd_group_fini(env, ofd, group);
203         RETURN(rc);
204 }
205
206 /* ofd groups managements */
207 int ofd_groups_init(const struct lu_env *env, struct ofd_device *ofd)
208 {
209         struct ofd_thread_info  *info = ofd_info(env);
210         unsigned long            groups_size;
211         obd_seq                  last_group;
212         int                      rc = 0;
213         int                      i;
214
215         ENTRY;
216
217         cfs_spin_lock_init(&ofd->ofd_objid_lock);
218
219         rc = dt_attr_get(env, ofd->ofd_last_group_file,
220                          &info->fti_attr, BYPASS_CAPA);
221         if (rc)
222                 GOTO(cleanup, rc);
223
224         groups_size = (unsigned long)info->fti_attr.la_size;
225
226         if (groups_size == sizeof(last_group)) {
227                 info->fti_off = 0;
228                 info->fti_buf.lb_buf = &last_group;
229                 info->fti_buf.lb_len = sizeof(last_group);
230
231                 rc = dt_record_read(env, ofd->ofd_last_group_file,
232                                     &info->fti_buf, &info->fti_off);
233                 if (rc) {
234                         CERROR("can't read LAST_GROUP: %d\n", rc);
235                         GOTO(cleanup, rc);
236                 }
237
238                 ofd->ofd_max_group = le32_to_cpu(last_group);
239                 LASSERT(ofd->ofd_max_group <= OFD_MAX_GROUPS);
240         } else if (groups_size == 0) {
241                 ofd->ofd_max_group = 0;
242         } else {
243                 CERROR("groups file is corrupted? size = %lu\n", groups_size);
244                 GOTO(cleanup, rc = -EIO);
245         }
246
247         for (i = 0; i <= ofd->ofd_max_group; i++) {
248                 rc = ofd_group_load(env, ofd, i);
249                 if (rc) {
250                         CERROR("can't load group %d: %d\n", i, rc);
251                         /* Clean all previously set groups */
252                         while (i > 0)
253                                 ofd_group_fini(env, ofd, --i);
254                         GOTO(cleanup, rc);
255                 }
256         }
257
258         CWARN("%s: %u groups initialized\n",
259               ofd_obd(ofd)->obd_name, ofd->ofd_max_group + 1);
260 cleanup:
261         RETURN(rc);
262 }
263
264 int ofd_clients_data_init(const struct lu_env *env, struct ofd_device *ofd,
265                           unsigned long fsize)
266 {
267         struct obd_device               *obd = ofd_obd(ofd);
268         struct lr_server_data           *lsd = &ofd->ofd_lut.lut_lsd;
269         struct lsd_client_data          *lcd = NULL;
270         struct filter_export_data       *fed;
271         int                              cl_idx;
272         int                              rc = 0;
273         loff_t                           off = lsd->lsd_client_start;
274
275         CLASSERT(offsetof(struct lsd_client_data, lcd_padding) +
276                  sizeof(lcd->lcd_padding) == LR_CLIENT_SIZE);
277
278         OBD_ALLOC_PTR(lcd);
279         if (lcd == NULL)
280                 RETURN(-ENOMEM);
281
282         for (cl_idx = 0; off < fsize; cl_idx++) {
283                 struct obd_export       *exp;
284                 __u64                    last_rcvd;
285
286                 /* Don't assume off is incremented properly by
287                  * fsfilt_read_record(), in case sizeof(*lcd)
288                  * isn't the same as fsd->lsd_client_size.  */
289                 off = lsd->lsd_client_start + cl_idx * lsd->lsd_client_size;
290                 rc = lut_client_data_read(env, &ofd->ofd_lut, lcd, &off, cl_idx);
291                 if (rc) {
292                         CERROR("error reading FILT %s idx %d off %llu: rc %d\n",
293                                LAST_RCVD, cl_idx, off, rc);
294                         rc = 0;
295                         break; /* read error shouldn't cause startup to fail */
296                 }
297
298                 if (lcd->lcd_uuid[0] == '\0') {
299                         CDEBUG(D_INFO, "skipping zeroed client at offset %d\n",
300                                cl_idx);
301                         continue;
302                 }
303
304                 last_rcvd = lcd->lcd_last_transno;
305
306                 /* These exports are cleaned up by ofd_disconnect(), so they
307                  * need to be set up like real exports as ofd_connect() does.
308                  */
309                 exp = class_new_export(obd, (struct obd_uuid *)lcd->lcd_uuid);
310
311                 CDEBUG(D_HA, "RCVRNG CLIENT uuid: %s idx: %d lr: "LPU64
312                        " srv lr: "LPU64"\n", lcd->lcd_uuid, cl_idx,
313                        last_rcvd, lsd->lsd_last_transno);
314
315                 if (IS_ERR(exp)) {
316                         if (PTR_ERR(exp) == -EALREADY) {
317                                 /* export already exists, zero out this one */
318                                 CERROR("Duplicate export %s!\n", lcd->lcd_uuid);
319                                 continue;
320                         }
321                         GOTO(err_out, rc = PTR_ERR(exp));
322                 }
323
324                 fed = &exp->exp_filter_data;
325                 *fed->fed_ted.ted_lcd = *lcd;
326
327                 rc = lut_client_add(env, exp, cl_idx);
328                 LASSERTF(rc == 0, "rc = %d\n", rc); /* can't fail existing */
329                 /* VBR: set export last committed version */
330                 exp->exp_last_committed = last_rcvd;
331                 cfs_spin_lock(&exp->exp_lock);
332                 exp->exp_connecting = 0;
333                 exp->exp_in_recovery = 0;
334                 cfs_spin_unlock(&exp->exp_lock);
335                 obd->obd_max_recoverable_clients++;
336                 class_export_put(exp);
337
338                 /* Need to check last_rcvd even for duplicated exports. */
339                 CDEBUG(D_OTHER, "client at idx %d has last_rcvd = "LPU64"\n",
340                        cl_idx, last_rcvd);
341
342                 cfs_spin_lock(&ofd->ofd_lut.lut_translock);
343                 if (last_rcvd > lsd->lsd_last_transno)
344                         lsd->lsd_last_transno = last_rcvd;
345                 cfs_spin_unlock(&ofd->ofd_lut.lut_translock);
346         }
347
348 err_out:
349         OBD_FREE_PTR(lcd);
350         RETURN(rc);
351 }
352
353 int ofd_server_data_init(const struct lu_env *env, struct ofd_device *ofd)
354 {
355         struct ofd_thread_info  *info = ofd_info(env);
356         struct lr_server_data   *lsd = &ofd->ofd_lut.lut_lsd;
357         struct obd_device       *obd = ofd_obd(ofd);
358         unsigned long            last_rcvd_size;
359         int                      rc;
360
361         rc = dt_attr_get(env, ofd->ofd_lut.lut_last_rcvd, &info->fti_attr,
362                          BYPASS_CAPA);
363         if (rc)
364                 RETURN(rc);
365
366         last_rcvd_size = (unsigned long)info->fti_attr.la_size;
367
368         /* ensure padding in the struct is the correct size */
369         CLASSERT (offsetof(struct lr_server_data, lsd_padding) +
370                   sizeof(lsd->lsd_padding) == LR_SERVER_SIZE);
371
372         if (last_rcvd_size == 0) {
373                 LCONSOLE_WARN("%s: new disk, initializing\n", obd->obd_name);
374
375                 memcpy(lsd->lsd_uuid, obd->obd_uuid.uuid,
376                        sizeof(lsd->lsd_uuid));
377                 lsd->lsd_last_transno = 0;
378                 lsd->lsd_mount_count = 0;
379                 lsd->lsd_server_size = LR_SERVER_SIZE;
380                 lsd->lsd_client_start = LR_CLIENT_START;
381                 lsd->lsd_client_size = LR_CLIENT_SIZE;
382                 lsd->lsd_subdir_count = FILTER_SUBDIR_COUNT;
383                 lsd->lsd_feature_incompat = OBD_INCOMPAT_OST;
384         } else {
385                 rc = lut_server_data_read(env, &ofd->ofd_lut);
386                 if (rc) {
387                         CDEBUG(D_INODE,"OBD ofd: error reading %s: rc %d\n",
388                                LAST_RCVD, rc);
389                         GOTO(err_fsd, rc);
390                 }
391                 if (strcmp((char *)lsd->lsd_uuid,
392                            (char *)obd->obd_uuid.uuid)) {
393                         LCONSOLE_ERROR("Trying to start OBD %s using the wrong"
394                                        " disk %s. Were the /dev/ assignments "
395                                        "rearranged?\n",
396                                        obd->obd_uuid.uuid, lsd->lsd_uuid);
397                         GOTO(err_fsd, rc = -EINVAL);
398                 }
399         }
400
401         lsd->lsd_mount_count++;
402         obd->u.obt.obt_mount_count = lsd->lsd_mount_count;
403         obd->u.obt.obt_instance = (__u32)obd->u.obt.obt_mount_count;
404         ofd->ofd_subdir_count = lsd->lsd_subdir_count;
405
406         if (lsd->lsd_feature_incompat & ~OFD_INCOMPAT_SUPP) {
407                 CERROR("%s: unsupported incompat filesystem feature(s) %x\n",
408                        obd->obd_name,
409                        lsd->lsd_feature_incompat & ~OFD_INCOMPAT_SUPP);
410                 GOTO(err_fsd, rc = -EINVAL);
411         }
412         if (lsd->lsd_feature_rocompat & ~OFD_ROCOMPAT_SUPP) {
413                 CERROR("%s: unsupported read-only filesystem feature(s) %x\n",
414                        obd->obd_name,
415                        lsd->lsd_feature_rocompat & ~OFD_ROCOMPAT_SUPP);
416                 /* Do something like remount filesystem read-only */
417                 GOTO(err_fsd, rc = -EINVAL);
418         }
419
420         CDEBUG(D_INODE, "%s: server last_transno : "LPU64"\n",
421                obd->obd_name, lsd->lsd_last_transno);
422         CDEBUG(D_INODE, "%s: server mount_count: "LPU64"\n",
423                obd->obd_name, lsd->lsd_mount_count);
424         CDEBUG(D_INODE, "%s: server data size: %u\n",
425                obd->obd_name, lsd->lsd_server_size);
426         CDEBUG(D_INODE, "%s: per-client data start: %u\n",
427                obd->obd_name, lsd->lsd_client_start);
428         CDEBUG(D_INODE, "%s: per-client data size: %u\n",
429                obd->obd_name, lsd->lsd_client_size);
430         CDEBUG(D_INODE, "%s: server subdir_count: %u\n",
431                obd->obd_name, lsd->lsd_subdir_count);
432         CDEBUG(D_INODE, "%s: last_rcvd clients: %lu\n", obd->obd_name,
433                last_rcvd_size <= lsd->lsd_client_start ? 0 :
434                (last_rcvd_size - lsd->lsd_client_start) /
435                lsd->lsd_client_size);
436
437         if (!obd->obd_replayable)
438                 CWARN("%s: recovery support OFF\n", obd->obd_name);
439
440         rc = ofd_clients_data_init(env, ofd, last_rcvd_size);
441
442         cfs_spin_lock(&ofd->ofd_lut.lut_translock);
443         obd->obd_last_committed = lsd->lsd_last_transno;
444         cfs_spin_unlock(&ofd->ofd_lut.lut_translock);
445
446         /* save it, so mount count and last_transno is current */
447         rc = lut_server_data_update(env, &ofd->ofd_lut, 0);
448         if (rc)
449                 GOTO(err_fsd, rc);
450
451         RETURN(0);
452
453 err_fsd:
454         class_disconnect_exports(obd);
455         RETURN(rc);
456 }
457
458 int ofd_fs_setup(const struct lu_env *env, struct ofd_device *ofd,
459                  struct obd_device *obd)
460 {
461         struct ofd_thread_info  *info = ofd_info(env);
462         struct dt_object        *fo;
463         int                      rc = 0;
464
465         ENTRY;
466
467         if (OBD_FAIL_CHECK(OBD_FAIL_MDS_FS_SETUP))
468                 RETURN (-ENOENT);
469
470         /* prepare transactions callbacks */
471         ofd->ofd_txn_cb.dtc_txn_start = NULL;
472         ofd->ofd_txn_cb.dtc_txn_stop = ofd_txn_stop_cb;
473         ofd->ofd_txn_cb.dtc_txn_commit = NULL;
474         ofd->ofd_txn_cb.dtc_cookie = ofd;
475         ofd->ofd_txn_cb.dtc_tag = LCT_DT_THREAD;
476         CFS_INIT_LIST_HEAD(&ofd->ofd_txn_cb.dtc_linkage);
477
478         dt_txn_callback_add(ofd->ofd_osd, &ofd->ofd_txn_cb);
479
480         rc = ofd_server_data_init(env, ofd);
481         if (rc)
482                 GOTO(out, rc);
483
484         lu_local_obj_fid(&info->fti_fid, OFD_HEALTH_CHECK_OID);
485         memset(&info->fti_attr, 0, sizeof(info->fti_attr));
486         info->fti_attr.la_valid = LA_MODE;
487         info->fti_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR;
488         info->fti_dof.dof_type = dt_mode_to_dft(S_IFREG);
489
490         fo = dt_find_or_create(env, ofd->ofd_osd, &info->fti_fid,
491                                &info->fti_dof, &info->fti_attr);
492         if (IS_ERR(fo))
493                 GOTO(out, rc = PTR_ERR(fo));
494
495         ofd->ofd_health_check_file = fo;
496
497         lu_local_obj_fid(&info->fti_fid, OFD_LAST_GROUP_OID);
498         memset(&info->fti_attr, 0, sizeof(info->fti_attr));
499         info->fti_attr.la_valid = LA_MODE;
500         info->fti_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR;
501         info->fti_dof.dof_type = dt_mode_to_dft(S_IFREG);
502
503         fo = dt_find_or_create(env, ofd->ofd_osd, &info->fti_fid,
504                                &info->fti_dof, &info->fti_attr);
505         if (IS_ERR(fo))
506                 GOTO(out_hc, rc = PTR_ERR(fo));
507
508         ofd->ofd_last_group_file = fo;
509
510         rc = ofd_groups_init(env, ofd);
511         if (rc)
512                 GOTO(out_lg, rc);
513
514         RETURN(0);
515 out_lg:
516         lu_object_put(env, &ofd->ofd_last_group_file->do_lu);
517 out_hc:
518         lu_object_put(env, &ofd->ofd_health_check_file->do_lu);
519 out:
520         dt_txn_callback_del(ofd->ofd_osd, &ofd->ofd_txn_cb);
521         return rc;
522 }
523
524 void ofd_fs_cleanup(const struct lu_env *env, struct ofd_device *ofd)
525 {
526         int i;
527
528         ENTRY;
529
530         ofd_info_init(env, NULL);
531
532         for (i = 0; i <= ofd->ofd_max_group; i++) {
533                 if (ofd->ofd_lastid_obj[i]) {
534                         ofd_last_id_write(env, ofd, i);
535                         ofd_group_fini(env, ofd, i);
536                 }
537         }
538
539         i = dt_sync(env, ofd->ofd_osd);
540         if (i)
541                 CERROR("can't sync: %d\n", i);
542
543         /* Remove transaction callback */
544         dt_txn_callback_del(ofd->ofd_osd, &ofd->ofd_txn_cb);
545
546         if (ofd->ofd_last_group_file) {
547                 lu_object_put(env, &ofd->ofd_last_group_file->do_lu);
548                 ofd->ofd_last_group_file = NULL;
549         }
550
551         if (ofd->ofd_health_check_file) {
552                 lu_object_put(env, &ofd->ofd_health_check_file->do_lu);
553                 ofd->ofd_health_check_file = NULL;
554         }
555
556         EXIT;
557 }
558