Whamcloud - gitweb
LU-2100 ofd: small batched precreation on a small system
[fs/lustre-release.git] / lustre / ofd / ofd_fs.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License version 2 for more details (a copy is included
14  * in the LICENSE file that accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License
17  * version 2 along with this program; If not, see
18  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
19  *
20  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21  * CA 95054 USA or visit www.sun.com if you need additional information or
22  * have any questions.
23  *
24  * GPL HEADER END
25  */
26 /*
27  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
28  * Use is subject to license terms.
29  *
30  * Copyright (c) 2011, 2012, Whamcloud, Inc.
31  */
32 /*
33  * This file is part of Lustre, http://www.lustre.org/
34  * Lustre is a trademark of Sun Microsystems, Inc.
35  *
36  * lustre/ofd/ofd_fs.c
37  *
38  * Author: Alexey Zhuravlev <bzzz@whamcloud.com>
39  * Author: Mikhail Pershin <tappro@whamcloud.com>
40  */
41
42 #define DEBUG_SUBSYSTEM S_FILTER
43
44 #include "ofd_internal.h"
45
46 int ofd_record_write(const struct lu_env *env, struct ofd_device *ofd,
47                      struct dt_object *dt, struct lu_buf *buf, loff_t *off)
48 {
49         struct thandle  *th;
50         int              rc;
51
52         ENTRY;
53
54         LASSERT(dt);
55
56         th = dt_trans_create(env, ofd->ofd_osd);
57         if (IS_ERR(th))
58                 RETURN(PTR_ERR(th));
59
60         rc = dt_declare_record_write(env, dt, buf->lb_len, *off, th);
61         if (rc == 0) {
62                 rc = dt_trans_start_local(env, ofd->ofd_osd, th);
63                 if (rc == 0)
64                         rc = dt_record_write(env, dt, buf, off, th);
65         }
66         dt_trans_stop(env, ofd->ofd_osd, th);
67
68         RETURN(rc);
69 }
70
71 int ofd_precreate_batch(struct ofd_device *ofd, int batch)
72 {
73         int count;
74
75         spin_lock(&ofd->ofd_objid_lock);
76         count = min(ofd->ofd_precreate_batch, batch);
77         spin_unlock(&ofd->ofd_objid_lock);
78
79         return count;
80 }
81
82 obd_id ofd_last_id(struct ofd_device *ofd, obd_seq group)
83 {
84         obd_id id;
85
86         LASSERT(group <= ofd->ofd_max_group);
87
88         spin_lock(&ofd->ofd_objid_lock);
89         id = ofd->ofd_last_objids[group];
90         spin_unlock(&ofd->ofd_objid_lock);
91
92         return id;
93 }
94
95 void ofd_last_id_set(struct ofd_device *ofd, obd_id id, obd_seq group)
96 {
97         LASSERT(group <= ofd->ofd_max_group);
98         spin_lock(&ofd->ofd_objid_lock);
99         if (ofd->ofd_last_objids[group] < id)
100                 ofd->ofd_last_objids[group] = id;
101         spin_unlock(&ofd->ofd_objid_lock);
102 }
103
104 int ofd_last_id_write(const struct lu_env *env, struct ofd_device *ofd,
105                       obd_seq group)
106 {
107         struct ofd_thread_info  *info = ofd_info(env);
108         obd_id                   tmp;
109         int                      rc;
110
111         ENTRY;
112
113         info->fti_buf.lb_buf = &tmp;
114         info->fti_buf.lb_len = sizeof(tmp);
115         info->fti_off = 0;
116
117         CDEBUG(D_INODE, "%s: write last_objid for group "LPU64": "LPU64"\n",
118                ofd_obd(ofd)->obd_name, group, ofd_last_id(ofd, group));
119
120         tmp = cpu_to_le64(ofd_last_id(ofd, group));
121
122         rc = ofd_record_write(env, ofd, ofd->ofd_lastid_obj[group],
123                               &info->fti_buf, &info->fti_off);
124         RETURN(rc);
125 }
126
127 int ofd_last_group_write(const struct lu_env *env, struct ofd_device *ofd)
128 {
129         struct ofd_thread_info  *info = ofd_info(env);
130         obd_seq                  tmp;
131         int                      rc;
132
133         ENTRY;
134
135         info->fti_buf.lb_buf = &tmp;
136         info->fti_buf.lb_len = sizeof(tmp);
137         info->fti_off = 0;
138
139         tmp = cpu_to_le32(ofd->ofd_max_group);
140
141         rc = ofd_record_write(env, ofd, ofd->ofd_last_group_file,
142                               &info->fti_buf, &info->fti_off);
143
144         RETURN(rc);
145 }
146
147 void ofd_group_fini(const struct lu_env *env, struct ofd_device *ofd,
148                     int group)
149 {
150         LASSERT(ofd->ofd_lastid_obj[group]);
151         lu_object_put(env, &ofd->ofd_lastid_obj[group]->do_lu);
152         ofd->ofd_lastid_obj[group] = NULL;
153 }
154
155 int ofd_group_load(const struct lu_env *env, struct ofd_device *ofd, int group)
156 {
157         struct ofd_thread_info  *info = ofd_info(env);
158         struct dt_object        *dob;
159         obd_id                   lastid;
160         int                      rc;
161
162         ENTRY;
163
164         /* if group is already initialized */
165         if (ofd->ofd_lastid_obj[group])
166                 RETURN(0);
167
168         lu_local_obj_fid(&info->fti_fid, OFD_GROUP0_LAST_OID + group);
169         memset(&info->fti_attr, 0, sizeof(info->fti_attr));
170         info->fti_attr.la_valid = LA_MODE;
171         info->fti_attr.la_mode = S_IFREG |  S_IRUGO | S_IWUSR;
172         info->fti_dof.dof_type = dt_mode_to_dft(S_IFREG);
173
174         /* create object tracking per-group last created
175          * id to be used by orphan recovery mechanism */
176         dob = dt_find_or_create(env, ofd->ofd_osd, &info->fti_fid,
177                                 &info->fti_dof, &info->fti_attr);
178         if (IS_ERR(dob))
179                 RETURN(PTR_ERR(dob));
180
181         ofd->ofd_lastid_obj[group] = dob;
182         mutex_init(&ofd->ofd_create_locks[group]);
183
184         rc = dt_attr_get(env, dob, &info->fti_attr, BYPASS_CAPA);
185         if (rc)
186                 GOTO(cleanup, rc);
187
188         if (info->fti_attr.la_size == 0) {
189                 /* object is just created, initialize last id */
190                 ofd->ofd_last_objids[group] = OFD_INIT_OBJID;
191                 ofd_last_id_set(ofd, OFD_INIT_OBJID, group);
192                 ofd_last_id_write(env, ofd, group);
193                 ofd_last_group_write(env, ofd);
194         } else if (info->fti_attr.la_size == sizeof(lastid)) {
195                 info->fti_off = 0;
196                 info->fti_buf.lb_buf = &lastid;
197                 info->fti_buf.lb_len = sizeof(lastid);
198
199                 rc = dt_record_read(env, dob, &info->fti_buf, &info->fti_off);
200                 if (rc) {
201                         CERROR("can't read last_id: %d\n", rc);
202                         GOTO(cleanup, rc);
203                 }
204                 ofd->ofd_last_objids[group] = le64_to_cpu(lastid);
205         } else {
206                 CERROR("corrupted size %Lu LAST_ID of group %u\n",
207                        (unsigned long long)info->fti_attr.la_size, group);
208                 rc = -EINVAL;
209         }
210
211         RETURN(0);
212 cleanup:
213         ofd_group_fini(env, ofd, group);
214         RETURN(rc);
215 }
216
217 /* ofd groups managements */
218 int ofd_groups_init(const struct lu_env *env, struct ofd_device *ofd)
219 {
220         struct ofd_thread_info  *info = ofd_info(env);
221         unsigned long            groups_size;
222         obd_seq                  last_group;
223         int                      rc = 0;
224         int                      i;
225
226         ENTRY;
227
228         spin_lock_init(&ofd->ofd_objid_lock);
229
230         rc = dt_attr_get(env, ofd->ofd_last_group_file,
231                          &info->fti_attr, BYPASS_CAPA);
232         if (rc)
233                 GOTO(cleanup, rc);
234
235         groups_size = (unsigned long)info->fti_attr.la_size;
236
237         if (groups_size == sizeof(last_group)) {
238                 info->fti_off = 0;
239                 info->fti_buf.lb_buf = &last_group;
240                 info->fti_buf.lb_len = sizeof(last_group);
241
242                 rc = dt_record_read(env, ofd->ofd_last_group_file,
243                                     &info->fti_buf, &info->fti_off);
244                 if (rc) {
245                         CERROR("can't read LAST_GROUP: %d\n", rc);
246                         GOTO(cleanup, rc);
247                 }
248
249                 ofd->ofd_max_group = le32_to_cpu(last_group);
250                 LASSERT(ofd->ofd_max_group <= OFD_MAX_GROUPS);
251         } else if (groups_size == 0) {
252                 ofd->ofd_max_group = 0;
253         } else {
254                 CERROR("groups file is corrupted? size = %lu\n", groups_size);
255                 GOTO(cleanup, rc = -EIO);
256         }
257
258         for (i = 0; i <= ofd->ofd_max_group; i++) {
259                 rc = ofd_group_load(env, ofd, i);
260                 if (rc) {
261                         CERROR("can't load group %d: %d\n", i, rc);
262                         /* Clean all previously set groups */
263                         while (i > 0)
264                                 ofd_group_fini(env, ofd, --i);
265                         GOTO(cleanup, rc);
266                 }
267         }
268
269         CDEBUG(D_OTHER, "%s: %u groups initialized\n",
270               ofd_obd(ofd)->obd_name, ofd->ofd_max_group + 1);
271 cleanup:
272         RETURN(rc);
273 }
274
275 int ofd_clients_data_init(const struct lu_env *env, struct ofd_device *ofd,
276                           unsigned long fsize)
277 {
278         struct obd_device               *obd = ofd_obd(ofd);
279         struct lr_server_data           *lsd = &ofd->ofd_lut.lut_lsd;
280         struct lsd_client_data          *lcd = NULL;
281         struct filter_export_data       *fed;
282         int                              cl_idx;
283         int                              rc = 0;
284         loff_t                           off = lsd->lsd_client_start;
285
286         CLASSERT(offsetof(struct lsd_client_data, lcd_padding) +
287                  sizeof(lcd->lcd_padding) == LR_CLIENT_SIZE);
288
289         OBD_ALLOC_PTR(lcd);
290         if (lcd == NULL)
291                 RETURN(-ENOMEM);
292
293         for (cl_idx = 0; off < fsize; cl_idx++) {
294                 struct obd_export       *exp;
295                 __u64                    last_rcvd;
296
297                 /* Don't assume off is incremented properly by
298                  * fsfilt_read_record(), in case sizeof(*lcd)
299                  * isn't the same as fsd->lsd_client_size.  */
300                 off = lsd->lsd_client_start + cl_idx * lsd->lsd_client_size;
301                 rc = tgt_client_data_read(env, &ofd->ofd_lut, lcd, &off, cl_idx);
302                 if (rc) {
303                         CERROR("error reading FILT %s idx %d off %llu: rc %d\n",
304                                LAST_RCVD, cl_idx, off, rc);
305                         rc = 0;
306                         break; /* read error shouldn't cause startup to fail */
307                 }
308
309                 if (lcd->lcd_uuid[0] == '\0') {
310                         CDEBUG(D_INFO, "skipping zeroed client at offset %d\n",
311                                cl_idx);
312                         continue;
313                 }
314
315                 last_rcvd = lcd->lcd_last_transno;
316
317                 /* These exports are cleaned up by ofd_disconnect(), so they
318                  * need to be set up like real exports as ofd_connect() does.
319                  */
320                 exp = class_new_export(obd, (struct obd_uuid *)lcd->lcd_uuid);
321
322                 CDEBUG(D_HA, "RCVRNG CLIENT uuid: %s idx: %d lr: "LPU64
323                        " srv lr: "LPU64"\n", lcd->lcd_uuid, cl_idx,
324                        last_rcvd, lsd->lsd_last_transno);
325
326                 if (IS_ERR(exp)) {
327                         if (PTR_ERR(exp) == -EALREADY) {
328                                 /* export already exists, zero out this one */
329                                 CERROR("Duplicate export %s!\n", lcd->lcd_uuid);
330                                 continue;
331                         }
332                         GOTO(err_out, rc = PTR_ERR(exp));
333                 }
334
335                 fed = &exp->exp_filter_data;
336                 *fed->fed_ted.ted_lcd = *lcd;
337
338                 rc = tgt_client_add(env, exp, cl_idx);
339                 LASSERTF(rc == 0, "rc = %d\n", rc); /* can't fail existing */
340                 /* VBR: set export last committed version */
341                 exp->exp_last_committed = last_rcvd;
342                 spin_lock(&exp->exp_lock);
343                 exp->exp_connecting = 0;
344                 exp->exp_in_recovery = 0;
345                 spin_unlock(&exp->exp_lock);
346                 obd->obd_max_recoverable_clients++;
347                 class_export_put(exp);
348
349                 /* Need to check last_rcvd even for duplicated exports. */
350                 CDEBUG(D_OTHER, "client at idx %d has last_rcvd = "LPU64"\n",
351                        cl_idx, last_rcvd);
352
353                 spin_lock(&ofd->ofd_lut.lut_translock);
354                 if (last_rcvd > lsd->lsd_last_transno)
355                         lsd->lsd_last_transno = last_rcvd;
356                 spin_unlock(&ofd->ofd_lut.lut_translock);
357         }
358
359 err_out:
360         OBD_FREE_PTR(lcd);
361         RETURN(rc);
362 }
363
364 int ofd_server_data_init(const struct lu_env *env, struct ofd_device *ofd)
365 {
366         struct ofd_thread_info  *info = ofd_info(env);
367         struct lr_server_data   *lsd = &ofd->ofd_lut.lut_lsd;
368         struct obd_device       *obd = ofd_obd(ofd);
369         unsigned long            last_rcvd_size;
370         int                      rc;
371
372         rc = dt_attr_get(env, ofd->ofd_lut.lut_last_rcvd, &info->fti_attr,
373                          BYPASS_CAPA);
374         if (rc)
375                 RETURN(rc);
376
377         last_rcvd_size = (unsigned long)info->fti_attr.la_size;
378
379         /* ensure padding in the struct is the correct size */
380         CLASSERT (offsetof(struct lr_server_data, lsd_padding) +
381                   sizeof(lsd->lsd_padding) == LR_SERVER_SIZE);
382
383         if (last_rcvd_size == 0) {
384                 LCONSOLE_WARN("%s: new disk, initializing\n", obd->obd_name);
385
386                 memcpy(lsd->lsd_uuid, obd->obd_uuid.uuid,
387                        sizeof(lsd->lsd_uuid));
388                 lsd->lsd_last_transno = 0;
389                 lsd->lsd_mount_count = 0;
390                 lsd->lsd_server_size = LR_SERVER_SIZE;
391                 lsd->lsd_client_start = LR_CLIENT_START;
392                 lsd->lsd_client_size = LR_CLIENT_SIZE;
393                 lsd->lsd_subdir_count = FILTER_SUBDIR_COUNT;
394                 lsd->lsd_feature_incompat = OBD_INCOMPAT_OST;
395         } else {
396                 rc = tgt_server_data_read(env, &ofd->ofd_lut);
397                 if (rc) {
398                         CDEBUG(D_INODE,"OBD ofd: error reading %s: rc %d\n",
399                                LAST_RCVD, rc);
400                         GOTO(err_fsd, rc);
401                 }
402                 if (strcmp((char *)lsd->lsd_uuid,
403                            (char *)obd->obd_uuid.uuid)) {
404                         LCONSOLE_ERROR("Trying to start OBD %s using the wrong"
405                                        " disk %s. Were the /dev/ assignments "
406                                        "rearranged?\n",
407                                        obd->obd_uuid.uuid, lsd->lsd_uuid);
408                         GOTO(err_fsd, rc = -EINVAL);
409                 }
410         }
411
412         lsd->lsd_mount_count++;
413         obd->u.obt.obt_mount_count = lsd->lsd_mount_count;
414         obd->u.obt.obt_instance = (__u32)obd->u.obt.obt_mount_count;
415         ofd->ofd_subdir_count = lsd->lsd_subdir_count;
416
417         if (lsd->lsd_feature_incompat & ~OFD_INCOMPAT_SUPP) {
418                 CERROR("%s: unsupported incompat filesystem feature(s) %x\n",
419                        obd->obd_name,
420                        lsd->lsd_feature_incompat & ~OFD_INCOMPAT_SUPP);
421                 GOTO(err_fsd, rc = -EINVAL);
422         }
423         if (lsd->lsd_feature_rocompat & ~OFD_ROCOMPAT_SUPP) {
424                 CERROR("%s: unsupported read-only filesystem feature(s) %x\n",
425                        obd->obd_name,
426                        lsd->lsd_feature_rocompat & ~OFD_ROCOMPAT_SUPP);
427                 /* Do something like remount filesystem read-only */
428                 GOTO(err_fsd, rc = -EINVAL);
429         }
430
431         CDEBUG(D_INODE, "%s: server last_transno : "LPU64"\n",
432                obd->obd_name, lsd->lsd_last_transno);
433         CDEBUG(D_INODE, "%s: server mount_count: "LPU64"\n",
434                obd->obd_name, lsd->lsd_mount_count);
435         CDEBUG(D_INODE, "%s: server data size: %u\n",
436                obd->obd_name, lsd->lsd_server_size);
437         CDEBUG(D_INODE, "%s: per-client data start: %u\n",
438                obd->obd_name, lsd->lsd_client_start);
439         CDEBUG(D_INODE, "%s: per-client data size: %u\n",
440                obd->obd_name, lsd->lsd_client_size);
441         CDEBUG(D_INODE, "%s: server subdir_count: %u\n",
442                obd->obd_name, lsd->lsd_subdir_count);
443         CDEBUG(D_INODE, "%s: last_rcvd clients: %lu\n", obd->obd_name,
444                last_rcvd_size <= lsd->lsd_client_start ? 0 :
445                (last_rcvd_size - lsd->lsd_client_start) /
446                lsd->lsd_client_size);
447
448         if (!obd->obd_replayable)
449                 CWARN("%s: recovery support OFF\n", obd->obd_name);
450
451         rc = ofd_clients_data_init(env, ofd, last_rcvd_size);
452
453         spin_lock(&ofd->ofd_lut.lut_translock);
454         obd->obd_last_committed = lsd->lsd_last_transno;
455         ofd->ofd_lut.lut_last_transno = lsd->lsd_last_transno;
456         spin_unlock(&ofd->ofd_lut.lut_translock);
457
458         /* save it, so mount count and last_transno is current */
459         rc = tgt_server_data_update(env, &ofd->ofd_lut, 0);
460         if (rc)
461                 GOTO(err_fsd, rc);
462
463         RETURN(0);
464
465 err_fsd:
466         class_disconnect_exports(obd);
467         RETURN(rc);
468 }
469
470 int ofd_fs_setup(const struct lu_env *env, struct ofd_device *ofd,
471                  struct obd_device *obd)
472 {
473         struct ofd_thread_info  *info = ofd_info(env);
474         struct dt_object        *fo;
475         int                      rc = 0;
476
477         ENTRY;
478
479         if (OBD_FAIL_CHECK(OBD_FAIL_MDS_FS_SETUP))
480                 RETURN (-ENOENT);
481
482         /* prepare transactions callbacks */
483         ofd->ofd_txn_cb.dtc_txn_start = NULL;
484         ofd->ofd_txn_cb.dtc_txn_stop = ofd_txn_stop_cb;
485         ofd->ofd_txn_cb.dtc_txn_commit = NULL;
486         ofd->ofd_txn_cb.dtc_cookie = ofd;
487         ofd->ofd_txn_cb.dtc_tag = LCT_DT_THREAD;
488         CFS_INIT_LIST_HEAD(&ofd->ofd_txn_cb.dtc_linkage);
489
490         dt_txn_callback_add(ofd->ofd_osd, &ofd->ofd_txn_cb);
491
492         rc = ofd_server_data_init(env, ofd);
493         if (rc)
494                 GOTO(out, rc);
495
496         lu_local_obj_fid(&info->fti_fid, OFD_HEALTH_CHECK_OID);
497         memset(&info->fti_attr, 0, sizeof(info->fti_attr));
498         info->fti_attr.la_valid = LA_MODE;
499         info->fti_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR;
500         info->fti_dof.dof_type = dt_mode_to_dft(S_IFREG);
501
502         fo = dt_find_or_create(env, ofd->ofd_osd, &info->fti_fid,
503                                &info->fti_dof, &info->fti_attr);
504         if (IS_ERR(fo))
505                 GOTO(out, rc = PTR_ERR(fo));
506
507         ofd->ofd_health_check_file = fo;
508
509         lu_local_obj_fid(&info->fti_fid, OFD_LAST_GROUP_OID);
510         memset(&info->fti_attr, 0, sizeof(info->fti_attr));
511         info->fti_attr.la_valid = LA_MODE;
512         info->fti_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR;
513         info->fti_dof.dof_type = dt_mode_to_dft(S_IFREG);
514
515         fo = dt_find_or_create(env, ofd->ofd_osd, &info->fti_fid,
516                                &info->fti_dof, &info->fti_attr);
517         if (IS_ERR(fo))
518                 GOTO(out_hc, rc = PTR_ERR(fo));
519
520         ofd->ofd_last_group_file = fo;
521
522         rc = ofd_groups_init(env, ofd);
523         if (rc)
524                 GOTO(out_lg, rc);
525
526         RETURN(0);
527 out_lg:
528         lu_object_put(env, &ofd->ofd_last_group_file->do_lu);
529 out_hc:
530         lu_object_put(env, &ofd->ofd_health_check_file->do_lu);
531 out:
532         dt_txn_callback_del(ofd->ofd_osd, &ofd->ofd_txn_cb);
533         return rc;
534 }
535
536 void ofd_fs_cleanup(const struct lu_env *env, struct ofd_device *ofd)
537 {
538         int i;
539
540         ENTRY;
541
542         ofd_info_init(env, NULL);
543
544         for (i = 0; i <= ofd->ofd_max_group; i++) {
545                 if (ofd->ofd_lastid_obj[i]) {
546                         ofd_last_id_write(env, ofd, i);
547                         ofd_group_fini(env, ofd, i);
548                 }
549         }
550
551         i = dt_sync(env, ofd->ofd_osd);
552         if (i)
553                 CERROR("can't sync: %d\n", i);
554
555         /* Remove transaction callback */
556         dt_txn_callback_del(ofd->ofd_osd, &ofd->ofd_txn_cb);
557
558         if (ofd->ofd_last_group_file) {
559                 lu_object_put(env, &ofd->ofd_last_group_file->do_lu);
560                 ofd->ofd_last_group_file = NULL;
561         }
562
563         if (ofd->ofd_health_check_file) {
564                 lu_object_put(env, &ofd->ofd_health_check_file->do_lu);
565                 ofd->ofd_health_check_file = NULL;
566         }
567
568         EXIT;
569 }
570