Whamcloud - gitweb
dadea414f07f40fa2fece8d95a4995866e2a978f
[fs/lustre-release.git] / lustre / lmv / lmv_obd.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  * Copyright (C) 2002, 2003 Cluster File Systems, Inc.
5  *
6  *   This file is part of Lustre, http://www.lustre.org.
7  *
8  *   Lustre is free software; you can redistribute it and/or
9  *   modify it under the terms of version 2 of the GNU General Public
10  *   License as published by the Free Software Foundation.
11  *
12  *   Lustre is distributed in the hope that it will be useful,
13  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  *   GNU General Public License for more details.
16  *
17  *   You should have received a copy of the GNU General Public License
18  *   along with Lustre; if not, write to the Free Software
19  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20  */
21
22 #ifndef EXPORT_SYMTAB
23 # define EXPORT_SYMTAB
24 #endif
25 #define DEBUG_SUBSYSTEM S_LMV
26 #ifdef __KERNEL__
27 #include <linux/slab.h>
28 #include <linux/module.h>
29 #include <linux/init.h>
30 #include <linux/slab.h>
31 #include <linux/pagemap.h>
32 #include <asm/div64.h>
33 #include <linux/seq_file.h>
34 #else
35 #include <liblustre.h>
36 #endif
37 #include <linux/ext2_fs.h>
38
39 #include <linux/obd_support.h>
40 #include <linux/lustre_lib.h>
41 #include <linux/lustre_net.h>
42 #include <linux/lustre_idl.h>
43 #include <linux/lustre_dlm.h>
44 #include <linux/lustre_mds.h>
45 #include <linux/obd_class.h>
46 #include <linux/obd_ost.h>
47 #include <linux/lprocfs_status.h>
48 #include <linux/lustre_fsfilt.h>
49 #include <linux/obd_lmv.h>
50 #include "lmv_internal.h"
51
52 /* Error codes:
53  *
54  *  -EINVAL  : UUID can't be found in the LMV's target list
55  *  -ENOTCONN: The UUID is found, but the target connection is bad (!)
56  *  -EBADF   : The UUID is found, but the OBD of the wrong type (!)
57  */
58 static int lmv_set_mdc_active(struct lmv_obd *lmv, struct obd_uuid *uuid,
59                               int activate)
60 {
61         struct obd_device *obd;
62         struct lmv_tgt_desc *tgt;
63         int i, rc = 0;
64         ENTRY;
65
66         CDEBUG(D_INFO, "Searching in lmv %p for uuid %s (activate=%d)\n",
67                lmv, uuid->uuid, activate);
68
69         spin_lock(&lmv->lmv_lock);
70         for (i = 0, tgt = lmv->tgts; i < lmv->desc.ld_tgt_count; i++, tgt++) {
71                 if (tgt->ltd_exp == NULL)
72                         continue;
73
74                 CDEBUG(D_INFO, "lmv idx %d is %s conn "LPX64"\n",
75                        i, tgt->uuid.uuid, tgt->ltd_exp->exp_handle.h_cookie);
76                 if (strncmp(uuid->uuid, tgt->uuid.uuid, sizeof uuid->uuid) == 0)
77                         break;
78         }
79
80         if (i == lmv->desc.ld_tgt_count)
81                 GOTO(out, rc = -EINVAL);
82
83         obd = class_exp2obd(tgt->ltd_exp);
84         if (obd == NULL)
85                 GOTO(out, rc = -ENOTCONN);
86
87         CDEBUG(D_INFO, "Found OBD %s=%s device %d (%p) type %s at LMV idx %d\n",
88                obd->obd_name, obd->obd_uuid.uuid, obd->obd_minor, obd,
89                obd->obd_type->typ_name, i);
90         LASSERT(strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) == 0);
91
92         if (tgt->active == activate) {
93                 CDEBUG(D_INFO, "OBD %p already %sactive!\n", obd,
94                        activate ? "" : "in");
95                 GOTO(out, rc);
96         }
97
98         CDEBUG(D_INFO, "Marking OBD %p %sactive\n", obd, activate ? "" : "in");
99
100         tgt->active = activate;
101         if (activate)
102                 lmv->desc.ld_active_tgt_count++;
103         else
104                 lmv->desc.ld_active_tgt_count--;
105
106         EXIT;
107  out:
108         spin_unlock(&lmv->lmv_lock);
109         return rc;
110 }
111
112 static int lmv_notify(struct obd_device *obd, struct obd_device *watched,
113                       int active)
114 {
115         int rc;
116         struct obd_uuid *uuid;
117
118         if (strcmp(watched->obd_type->typ_name, LUSTRE_MDC_NAME)) {
119                 CERROR("unexpected notification of %s %s!\n",
120                        watched->obd_type->typ_name,
121                        watched->obd_name);
122                 return -EINVAL;
123         }
124         uuid = &watched->u.cli.cl_import->imp_target_uuid;
125
126         /* Set MDC as active before notifying the observer, so the
127          * observer can use the MDC normally.  
128          */
129         rc = lmv_set_mdc_active(&obd->u.lmv, uuid, active);
130         if (rc) {
131                 CERROR("%sactivation of %s failed: %d\n",
132                        active ? "" : "de", uuid->uuid, rc);
133                 RETURN(rc);
134         }
135
136         if (obd->obd_observer)
137                 /* Pass the notification up the chain. */
138                 rc = obd_notify(obd->obd_observer, watched, active);
139
140         RETURN(rc);
141 }
142
143 int lmv_attach(struct obd_device *dev, obd_count len, void *data)
144 {
145         struct lprocfs_static_vars lvars;
146         int rc;
147         ENTRY;
148
149         lprocfs_init_vars(lmv, &lvars);
150         rc = lprocfs_obd_attach(dev, lvars.obd_vars);
151         if (rc == 0) {
152 #ifdef __KERNEL__
153                 struct proc_dir_entry *entry;
154                 
155                 entry = create_proc_entry("target_obd", 0444, dev->obd_proc_entry);
156                 if (entry == NULL)
157                         RETURN(-ENOMEM);
158                 /* entry->proc_fops = &lmv_proc_target_fops; */
159                 entry->data = dev;
160 #endif
161        }
162         RETURN (rc);
163 }
164
/* Detach hook: hand @dev to lprocfs_obd_detach() and return its result. */
int lmv_detach(struct obd_device *dev)
{
        int rc = lprocfs_obd_detach(dev);

        return rc;
}
169
170 /* This is fake connect function. Its purpose is to initialize lmv and 
171  * say caller that everything is okay. Real connection will be performed
172  * later. */
173 static int lmv_connect(struct lustre_handle *conn, struct obd_device *obd,
174                        struct obd_uuid *cluuid)
175 {
176         struct lmv_obd *lmv = &obd->u.lmv;
177         struct obd_export *exp;
178         int rc;
179         ENTRY;
180
181         rc = class_connect(conn, obd, cluuid);
182         if (rc) {
183                 CERROR("class_connection() returned %d\n", rc);
184                 RETURN(rc);
185         }
186
187         exp = class_conn2export(conn);
188         /* We don't want to actually do the underlying connections more than
189          * once, so keep track. */
190         lmv->refcount++;
191         if (lmv->refcount > 1) {
192                 class_export_put(exp);
193                 RETURN(0);
194         }
195
196         lmv->cluuid = *cluuid;
197         lmv->connected = 0;
198         lmv->exp = exp;
199
200         RETURN(0);
201 }
202
203 void lmv_set_timeouts(struct obd_device *obd)
204 {
205         struct lmv_tgt_desc *tgts;
206         struct lmv_obd *lmv;
207         int i;
208
209         lmv = &obd->u.lmv;
210         if (lmv->server_timeout == 0)
211                 return;
212
213         if (lmv->connected == 0)
214                 return;
215
216         for (i = 0, tgts = lmv->tgts; i < lmv->desc.ld_tgt_count; i++, tgts++) {
217                 if (tgts->ltd_exp == NULL)
218                         continue;
219                 obd_set_info(tgts->ltd_exp, strlen("inter_mds"),
220                              "inter_mds", 0, NULL);
221         }
222 }
223
224 /* Performs a check if passed obd is connected. If no - connect it. */
225 int lmv_check_connect(struct obd_device *obd) {
226         struct lmv_obd *lmv = &obd->u.lmv;
227         struct obd_uuid *cluuid;
228         struct lmv_tgt_desc *tgts;
229         struct obd_export *exp;
230         int rc, rc2, i;
231
232         if (lmv->connected)
233                 return 0;
234       
235         lmv->connected = 1;
236         cluuid = &lmv->cluuid;
237         exp = lmv->exp;
238         
239         CDEBUG(D_OTHER, "time to connect %s to %s\n",
240                cluuid->uuid, obd->obd_name);
241
242         for (i = 0, tgts = lmv->tgts; i < lmv->desc.ld_tgt_count; i++, tgts++) {
243                 struct obd_device *tgt_obd;
244                 struct obd_uuid lmv_osc_uuid = { "LMV_OSC_UUID" };
245                 struct lustre_handle conn = {0, };
246
247                 LASSERT(tgts != NULL);
248
249                 tgt_obd = class_find_client_obd(&tgts->uuid, LUSTRE_MDC_NAME, 
250                                                 &obd->obd_uuid);
251                 if (!tgt_obd) {
252                         CERROR("Target %s not attached\n", tgts->uuid.uuid);
253                         GOTO(out_disc, rc = -EINVAL);
254                 }
255
256                 /* for MDS: don't connect to yourself */
257                 if (obd_uuid_equals(&tgts->uuid, cluuid)) {
258                         CDEBUG(D_OTHER, "don't connect back to %s\n",
259                                cluuid->uuid);
260                         tgts->ltd_exp = NULL;
261                         continue;
262                 }
263
264                 CDEBUG(D_OTHER, "connect to %s(%s) - %s, %s FOR %s\n",
265                         tgt_obd->obd_name, tgt_obd->obd_uuid.uuid,
266                         tgts->uuid.uuid, obd->obd_uuid.uuid,
267                         cluuid->uuid);
268
269                 if (!tgt_obd->obd_set_up) {
270                         CERROR("Target %s not set up\n", tgts->uuid.uuid);
271                         GOTO(out_disc, rc = -EINVAL);
272                 }
273                 
274                 rc = obd_connect(&conn, tgt_obd, &lmv_osc_uuid);
275                 if (rc) {
276                         CERROR("Target %s connect error %d\n",
277                                 tgts->uuid.uuid, rc);
278                         GOTO(out_disc, rc);
279                 }
280                 tgts->ltd_exp = class_conn2export(&conn);
281
282                 obd_init_ea_size(tgts->ltd_exp, lmv->max_easize,
283                                  lmv->max_cookiesize);
284                 
285                 rc = obd_register_observer(tgt_obd, obd);
286                 if (rc) {
287                         CERROR("Target %s register_observer error %d\n",
288                                tgts->uuid.uuid, rc);
289                         obd_disconnect(tgts->ltd_exp, 0);
290                         GOTO(out_disc, rc);
291                 }
292
293                 lmv->desc.ld_active_tgt_count++;
294                 tgts->active = 1;
295                 
296                 CDEBUG(D_OTHER, "connected to %s(%s) successfully (%d)\n",
297                         tgt_obd->obd_name, tgt_obd->obd_uuid.uuid,
298                         atomic_read(&obd->obd_refcount));
299         }
300
301         lmv_set_timeouts(obd);
302         class_export_put(exp);
303         return 0;
304
305  out_disc:
306         while (i-- > 0) {
307                 struct obd_uuid uuid;
308                 --tgts;
309                 --lmv->desc.ld_active_tgt_count;
310                 tgts->active = 0;
311                 /* save for CERROR below; (we know it's terminated) */
312                 uuid = tgts->uuid;
313                 rc2 = obd_disconnect(tgts->ltd_exp, 0);
314                 if (rc2)
315                         CERROR("error: LMV target %s disconnect on MDT idx %d: "
316                                "error %d\n", uuid.uuid, i, rc2);
317         }
318         class_disconnect(exp, 0);
319         RETURN (rc);
320 }
321
322 static int lmv_disconnect(struct obd_export *exp, int flags)
323 {
324         struct obd_device *obd = class_exp2obd(exp);
325         struct lmv_obd *lmv = &obd->u.lmv;
326         int rc, i;
327         ENTRY;
328
329         if (!lmv->tgts)
330                 goto out_local;
331
332         /* Only disconnect the underlying layers on the final disconnect. */
333         lmv->refcount--;
334         if (lmv->refcount != 0)
335                 goto out_local;
336
337         for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
338                 if (lmv->tgts[i].ltd_exp == NULL)
339                         continue;
340
341                 if (obd->obd_no_recov) {
342                         /* Pass it on to our clients.
343                          * XXX This should be an argument to disconnect,
344                          * XXX not a back-door flag on the OBD.  Ah well.
345                          */
346                         struct obd_device *mdc_obd;
347                         mdc_obd = class_exp2obd(lmv->tgts[i].ltd_exp);
348                         if (mdc_obd)
349                                 mdc_obd->obd_no_recov = 1;
350                 }
351
352                 CDEBUG(D_OTHER, "disconnected from %s(%s) successfully\n",
353                         lmv->tgts[i].ltd_exp->exp_obd->obd_name,
354                         lmv->tgts[i].ltd_exp->exp_obd->obd_uuid.uuid);
355
356                 obd_register_observer(lmv->tgts[i].ltd_exp->exp_obd, NULL);
357
358                 rc = obd_disconnect(lmv->tgts[i].ltd_exp, flags);
359                 if (rc) {
360                         if (lmv->tgts[i].active) {
361                                 CERROR("Target %s disconnect error %d\n",
362                                        lmv->tgts[i].uuid.uuid, rc);
363                         }
364                         rc = 0;
365                 }
366                 if (lmv->tgts[i].active) {
367                         lmv->desc.ld_active_tgt_count--;
368                         lmv->tgts[i].active = 0;
369                 }
370                 lmv->tgts[i].ltd_exp = NULL;
371         }
372
373 out_local:
374         /* this is the case when no real connection is established by
375          * lmv_check_connect(). */
376         if (!lmv->connected)
377                 class_export_put(exp);
378         rc = class_disconnect(exp, 0);
379         if (lmv->refcount == 0)
380                 lmv->connected = 0;
381         RETURN(rc);
382 }
383
384 static int lmv_iocontrol(unsigned int cmd, struct obd_export *exp,
385                          int len, void *karg, void *uarg)
386 {
387         struct obd_device *obddev = class_exp2obd(exp);
388         struct lmv_obd *lmv = &obddev->u.lmv;
389         int i, rc = 0, set = 0;
390
391         ENTRY;
392
393         if (lmv->desc.ld_tgt_count == 0)
394                 RETURN(-ENOTTY);
395         
396         for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
397                 int err;
398
399                 if (lmv->tgts[i].ltd_exp == NULL) {
400                         CWARN("%s: NULL export for %d\n", obddev->obd_name, i);
401                         continue;
402                 }
403
404                 err = obd_iocontrol(cmd, lmv->tgts[i].ltd_exp, len, karg, uarg);
405                 if (err) {
406                         if (lmv->tgts[i].active) {
407                                 CERROR("error: iocontrol MDC %s on MDT"
408                                        "idx %d: err = %d\n",
409                                        lmv->tgts[i].uuid.uuid, i, err);
410                                 if (!rc)
411                                         rc = err;
412                         }
413                 } else
414                         set = 1;
415         }
416         if (!set && !rc)
417                 rc = -EIO;
418
419         RETURN(rc);
420 }
421
422 static int lmv_setup(struct obd_device *obd, obd_count len, void *buf)
423 {
424         int i, rc = 0;
425         struct lmv_desc *desc;
426         struct obd_uuid *uuids;
427         struct lmv_tgt_desc *tgts;
428         struct lustre_cfg *lcfg = buf;
429         struct lmv_obd *lmv = &obd->u.lmv;
430         ENTRY;
431
432         if (lcfg->lcfg_inllen1 < 1) {
433                 CERROR("LMV setup requires a descriptor\n");
434                 RETURN(-EINVAL);
435         }
436
437         if (lcfg->lcfg_inllen2 < 1) {
438                 CERROR("LMV setup requires an OST UUID list\n");
439                 RETURN(-EINVAL);
440         }
441
442         desc = (struct lmv_desc *)lcfg->lcfg_inlbuf1;
443         if (sizeof(*desc) > lcfg->lcfg_inllen1) {
444                 CERROR("descriptor size wrong: %d > %d\n",
445                        (int)sizeof(*desc), lcfg->lcfg_inllen1);
446                 RETURN(-EINVAL);
447         }
448
449         uuids = (struct obd_uuid *)lcfg->lcfg_inlbuf2;
450         if (sizeof(*uuids) * desc->ld_tgt_count != lcfg->lcfg_inllen2) {
451                 CERROR("UUID array size wrong: %u * %u != %u\n",
452                        sizeof(*uuids), desc->ld_tgt_count, lcfg->lcfg_inllen2);
453                 RETURN(-EINVAL);
454         }
455
456         lmv->bufsize = sizeof(struct lmv_tgt_desc) * desc->ld_tgt_count;
457         OBD_ALLOC(lmv->tgts, lmv->bufsize);
458         if (lmv->tgts == NULL) {
459                 CERROR("Out of memory\n");
460                 RETURN(-ENOMEM);
461         }
462
463         lmv->desc = *desc;
464         spin_lock_init(&lmv->lmv_lock);
465         
466         for (i = 0, tgts = lmv->tgts; i < desc->ld_tgt_count; i++, tgts++)
467                 tgts->uuid = uuids[i];
468         
469         lmv->max_cookiesize = 0;
470
471         lmv->max_easize = sizeof(struct ll_fid) *
472                 desc->ld_tgt_count + sizeof(struct mea);
473         
474         rc = lmv_setup_mgr(obd);
475         if (rc) {
476                 CERROR("Can't setup LMV object manager, "
477                        "error %d.\n", rc);
478                 OBD_FREE(lmv->tgts, lmv->bufsize);
479         }
480
481         RETURN(rc);
482 }
483
484 static int lmv_statfs(struct obd_device *obd, struct obd_statfs *osfs,
485                       unsigned long max_age)
486 {
487         struct lmv_obd *lmv = &obd->u.lmv;
488         struct obd_statfs temp;
489         int rc = 0, i;
490         ENTRY;
491         
492         rc = lmv_check_connect(obd);
493         if (rc)
494                 RETURN(rc);
495                 
496         for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
497                 if (lmv->tgts[i].ltd_exp == NULL) {
498                         CWARN("%s: NULL export for %d\n", obd->obd_name, i);
499                         continue;
500                 }
501
502                 rc = obd_statfs(lmv->tgts[i].ltd_exp->exp_obd, &temp, max_age);
503                 if (rc) {
504                         CERROR("can't stat MDS #%d (%s)\n", i,
505                                lmv->tgts[i].ltd_exp->exp_obd->obd_name);
506                         RETURN(rc);
507                 }
508                 if (i == 0) {
509                         memcpy(osfs, &temp, sizeof(temp));
510                 } else {
511                         osfs->os_bavail += temp.os_bavail;
512                         osfs->os_blocks += temp.os_blocks;
513                         osfs->os_ffree += temp.os_ffree;
514                         osfs->os_files += temp.os_files;
515                 }
516         }
517         RETURN(rc);
518 }
519
520 static int lmv_cleanup(struct obd_device *obd, int flags) 
521 {
522         struct lmv_obd *lmv = &obd->u.lmv;
523         ENTRY;
524         lmv_cleanup_mgr(obd);
525         OBD_FREE(lmv->tgts, lmv->bufsize);
526         RETURN(0);
527 }
528
529 static int lmv_getstatus(struct obd_export *exp, struct ll_fid *fid)
530 {
531         struct obd_device *obd = exp->exp_obd;
532         struct lmv_obd *lmv = &obd->u.lmv;
533         int rc;
534         ENTRY;
535         rc = lmv_check_connect(obd);
536         if (rc)
537                 RETURN(rc);
538         rc = md_getstatus(lmv->tgts[0].ltd_exp, fid);
539         fid->mds = 0;
540         RETURN(rc);
541 }
542
543 static int lmv_getattr(struct obd_export *exp, struct ll_fid *fid,
544                        unsigned long valid, unsigned int ea_size,
545                        struct ptlrpc_request **request)
546 {
547         struct obd_device *obd = exp->exp_obd;
548         struct lmv_obd *lmv = &obd->u.lmv;
549         int rc, i = fid->mds;
550         struct lmv_obj *obj;
551         ENTRY;
552
553         rc = lmv_check_connect(obd);
554         if (rc)
555                 RETURN(rc);
556
557         LASSERT(i < lmv->desc.ld_tgt_count);
558
559         rc = md_getattr(lmv->tgts[i].ltd_exp, fid, valid,
560                         ea_size, request);
561         if (rc)
562                 RETURN(rc);
563         
564         obj = lmv_grab_obj(obd, fid);
565         
566         CDEBUG(D_OTHER, "GETATTR for %lu/%lu/%lu %s\n",
567                (unsigned long)fid->mds, (unsigned long)fid->id,
568                (unsigned long)fid->generation, obj ? "(splitted)" : "");
569
570         /* if object is splitted, then we loop over all the slaves and gather
571          * size attribute. In ideal world we would have to gather also mds field
572          * from all slaves, as object is spread over the cluster and this is
573          * definitely interesting information and it is not good to loss it,
574          * but...*/
575         if (obj) {
576                 struct mds_body *body;
577
578                 if (*request == NULL) {
579                         lmv_put_obj(obj);
580                         RETURN(rc);
581                 }
582                         
583                 body = lustre_msg_buf((*request)->rq_repmsg, 0,
584                                       sizeof(*body));
585                 LASSERT(body != NULL);
586
587                 lmv_lock_obj(obj);
588         
589                 for (i = 0; i < obj->objcount; i++) {
590
591                         if (lmv->tgts[i].ltd_exp == NULL) {
592                                 CWARN("%s: NULL export for %d\n",
593                                       obd->obd_name, i);
594                                 continue;
595                         }
596
597                         /* skip master obj. */
598                         if (fid_equal(&obj->fid, &obj->objs[i].fid))
599                                 continue;
600                         
601                         body->size += obj->objs[i].size;
602                 }
603
604                 lmv_unlock_obj(obj);
605                 lmv_put_obj(obj);
606         }
607         
608         RETURN(rc);
609 }
610
611 static int lmv_change_cbdata(struct obd_export *exp, struct ll_fid *fid, 
612                              ldlm_iterator_t it, void *data)
613 {
614         struct obd_device *obd = exp->exp_obd;
615         struct lmv_obd *lmv = &obd->u.lmv;
616         int rc = 0;
617         ENTRY;
618         
619         rc = lmv_check_connect(obd);
620         if (rc)
621                 RETURN(rc);
622         
623         CDEBUG(D_OTHER, "CBDATA for %lu/%lu/%lu\n", (unsigned long)fid->mds,
624                (unsigned long)fid->id, (unsigned long)fid->generation);
625         
626         LASSERT(fid->mds < lmv->desc.ld_tgt_count);
627
628         rc = md_change_cbdata(lmv->tgts[fid->mds].ltd_exp,
629                               fid, it, data);
630         
631         RETURN(rc);
632 }
633
634 static int lmv_change_cbdata_name(struct obd_export *exp, struct ll_fid *pfid,
635                                   char *name, int len, struct ll_fid *cfid,
636                                   ldlm_iterator_t it, void *data)
637 {
638         struct obd_device *obd = exp->exp_obd;
639         struct lmv_obd *lmv = &obd->u.lmv;
640         struct lmv_obj *obj;
641         int rc = 0, mds;
642         ENTRY;
643
644         rc = lmv_check_connect(obd);
645         if (rc)
646                 RETURN(rc);
647
648         LASSERT(pfid->mds < lmv->desc.ld_tgt_count);
649         LASSERT(cfid->mds < lmv->desc.ld_tgt_count);
650         
651         CDEBUG(D_OTHER, "CBDATA for %lu/%lu/%lu:%*s -> %lu/%lu/%lu\n",
652                (unsigned long)pfid->mds, (unsigned long)pfid->id,
653                (unsigned long)pfid->generation, len, name,
654                (unsigned long)cfid->mds, (unsigned long)cfid->id,
655                (unsigned long)cfid->generation);
656
657         /* this is default mds for directory name belongs to. */
658         mds = pfid->mds;
659         obj = lmv_grab_obj(obd, pfid);
660         if (obj) {
661                 /* directory is splitted. look for right mds for this name. */
662                 mds = raw_name2idx(obj->hashtype, obj->objcount, name, len);
663                 mds = obj->objs[mds].fid.mds;
664                 lmv_put_obj(obj);
665         }
666         rc = md_change_cbdata(lmv->tgts[mds].ltd_exp, cfid, it, data);
667         RETURN(rc);
668 }
669
670 static int lmv_valid_attrs(struct obd_export *exp, struct ll_fid *fid) 
671 {
672         struct obd_device *obd = exp->exp_obd;
673         struct lmv_obd *lmv = &obd->u.lmv;
674         int rc = 0;
675         ENTRY;
676         rc = lmv_check_connect(obd);
677         if (rc)
678                 RETURN(rc);
679         CDEBUG(D_OTHER, "validate %lu/%lu/%lu\n", (unsigned long) fid->mds,
680                (unsigned long) fid->id, (unsigned long) fid->generation);
681         LASSERT(fid->mds < lmv->desc.ld_tgt_count);
682         rc = md_valid_attrs(lmv->tgts[fid->mds].ltd_exp, fid);
683         RETURN(rc);
684 }
685
686 int lmv_close(struct obd_export *exp, struct obdo *obdo,
687                   struct obd_client_handle *och,
688                   struct ptlrpc_request **request)
689 {
690         struct obd_device *obd = exp->exp_obd;
691         struct lmv_obd *lmv = &obd->u.lmv;
692         int rc, i = obdo->o_mds;
693         ENTRY;
694         rc = lmv_check_connect(obd);
695         if (rc)
696                 RETURN(rc);
697         LASSERT(i < lmv->desc.ld_tgt_count);
698         CDEBUG(D_OTHER, "CLOSE %lu/%lu/%lu\n", (unsigned long) obdo->o_mds,
699                (unsigned long) obdo->o_id, (unsigned long) obdo->o_generation);
700         rc = md_close(lmv->tgts[i].ltd_exp, obdo, och, request);
701         RETURN(rc);
702 }
703
704 int lmv_get_mea_and_update_object(struct obd_export *exp, struct ll_fid *fid)
705 {
706         struct obd_device *obd = exp->exp_obd;
707         struct lmv_obd *lmv = &obd->u.lmv;
708         struct ptlrpc_request *req = NULL;
709         struct lmv_obj *obj;
710         struct lustre_md md;
711         unsigned long valid;
712         int mealen, rc;
713
714         md.mea = NULL;
715         mealen = MEA_SIZE_LMV(lmv);
716         
717         valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
718
719         /* time to update mea of parent fid */
720         rc = md_getattr(lmv->tgts[fid->mds].ltd_exp, fid,
721                         valid, mealen, &req);
722         if (rc) {
723                 CERROR("md_getattr() failed, error %d\n", rc);
724                 GOTO(cleanup, rc);
725         }
726
727         rc = mdc_req2lustre_md(exp, req, 0, NULL, &md);
728         if (rc) {
729                 CERROR("mdc_req2lustre_md() failed, error %d\n", rc);
730                 GOTO(cleanup, rc);
731         }
732
733         if (md.mea == NULL)
734                 GOTO(cleanup, rc = -ENODATA);
735
736         obj = lmv_create_obj(exp, fid, md.mea);
737         if (IS_ERR(obj))
738                 rc = PTR_ERR(obj);
739         
740         lmv_put_obj(obj);
741         obd_free_memmd(exp, (struct lov_stripe_md **)&md.mea);
742
743 cleanup:
744         if (req)
745                 ptlrpc_req_finished(req);
746         RETURN(rc);
747 }
748
749 int lmv_create(struct obd_export *exp, struct mdc_op_data *op_data,
750                const void *data, int datalen, int mode, __u32 uid,
751                __u32 gid, __u64 rdev, struct ptlrpc_request **request)
752 {
753         struct obd_device *obd = exp->exp_obd;
754         struct lmv_obd *lmv = &obd->u.lmv;
755         struct mds_body *body;
756         struct lmv_obj *obj;
757         int rc, mds, loop = 0;
758         ENTRY;
759
760         rc = lmv_check_connect(obd);
761         if (rc)
762                 RETURN(rc);
763
764         if (!lmv->desc.ld_active_tgt_count)
765                 RETURN(-EIO);
766 repeat:
767         LASSERT(++loop <= 2);
768         obj = lmv_grab_obj(obd, &op_data->fid1);
769         if (obj) {
770                 mds = raw_name2idx(obj->hashtype, obj->objcount, op_data->name,
771                                    op_data->namelen);
772                 op_data->fid1 = obj->objs[mds].fid;
773                 lmv_put_obj(obj);
774         }
775
776         CDEBUG(D_OTHER, "CREATE '%*s' on %lu/%lu/%lu\n", op_data->namelen,
777                op_data->name, (unsigned long)op_data->fid1.mds,
778                (unsigned long)op_data->fid1.id,
779                (unsigned long)op_data->fid1.generation);
780         
781         rc = md_create(lmv->tgts[op_data->fid1.mds].ltd_exp, op_data, data,
782                        datalen, mode, uid, gid, rdev, request);
783         if (rc == 0) {
784                 if (*request == NULL)
785                         RETURN(rc);
786
787                 body = lustre_msg_buf((*request)->rq_repmsg, 0,
788                                       sizeof(*body));
789                 LASSERT(body != NULL);
790                 
791                 CDEBUG(D_OTHER, "created. id = %lu, generation = %lu, "
792                        "mds = %d\n", (unsigned long)body->fid1.id,
793                        (unsigned long)body->fid1.generation, op_data->fid1.mds);
794                 
795                 LASSERT(body->valid & OBD_MD_MDS ||
796                         body->mds == op_data->fid1.mds);
797         } else if (rc == -ERESTART) {
798                 /* directory got splitted. time to update local object and
799                  * repeat the request with proper MDS */
800                 rc = lmv_get_mea_and_update_object(exp, &op_data->fid1);
801                 if (rc == 0) {
802                         ptlrpc_req_finished(*request);
803                         goto repeat;
804                 }
805         }
806         RETURN(rc);
807 }
808
809 int lmv_done_writing(struct obd_export *exp, struct obdo *obdo)
810 {
811         struct obd_device *obd = exp->exp_obd;
812         struct lmv_obd *lmv = &obd->u.lmv;
813         int rc;
814         ENTRY;
815         rc = lmv_check_connect(obd);
816         if (rc)
817                 RETURN(rc);
818
819         /* FIXME: choose right MDC here */
820         CWARN("this method isn't implemented yet\n");
821         rc = md_done_writing(lmv->tgts[0].ltd_exp, obdo);
822         RETURN(rc);
823 }
824
825 int lmv_enqueue_slaves(struct obd_export *exp, int locktype,
826                        struct lookup_intent *it, int lockmode,
827                        struct mdc_op_data *data, struct lustre_handle *lockh,
828                        void *lmm, int lmmsize, ldlm_completion_callback cb_completion,
829                        ldlm_blocking_callback cb_blocking, void *cb_data)
830 {
831         struct obd_device *obd = exp->exp_obd;
832         struct lmv_obd *lmv = &obd->u.lmv;
833         struct mea *mea = data->mea1;
834         struct mdc_op_data data2;
835         int i, rc, mds;
836         ENTRY;
837
838         LASSERT(mea != NULL);
839         for (i = 0; i < mea->mea_count; i++) {
840                 memset(&data2, 0, sizeof(data2));
841                 data2.fid1 = mea->mea_fids[i];
842                 mds = data2.fid1.mds;
843                 
844                 if (lmv->tgts[mds].ltd_exp == NULL)
845                         continue;
846
847                 rc = md_enqueue(lmv->tgts[mds].ltd_exp, locktype, it, lockmode,
848                                 &data2, lockh + i, lmm, lmmsize, cb_completion,
849                                 cb_blocking, cb_data);
850                 
851                 CDEBUG(D_OTHER, "take lock on slave %lu/%lu/%lu -> %d/%d\n",
852                        (unsigned long)mea->mea_fids[i].mds,
853                        (unsigned long)mea->mea_fids[i].id,
854                        (unsigned long)mea->mea_fids[i].generation,
855                        rc, it->d.lustre.it_status);
856                 if (rc)
857                         GOTO(cleanup, rc);
858                 if (it->d.lustre.it_data) {
859                         struct ptlrpc_request *req;
860                         req = (struct ptlrpc_request *) it->d.lustre.it_data;
861                         ptlrpc_req_finished(req);
862                 }
863                 
864                 if (it->d.lustre.it_status)
865                         GOTO(cleanup, rc = it->d.lustre.it_status);
866         }
867         RETURN(0);
868         
869 cleanup:
870         /* drop all taken locks */
871         while (--i >= 0) {
872                 if (lockh[i].cookie)
873                         ldlm_lock_decref(lockh + i, lockmode);
874                 lockh[i].cookie = 0;
875         }
876         RETURN(rc);
877 }
878
879 int lmv_enqueue(struct obd_export *exp, int lock_type,
880                 struct lookup_intent *it, int lock_mode,
881                 struct mdc_op_data *data, struct lustre_handle *lockh,
882                 void *lmm, int lmmsize, ldlm_completion_callback cb_completion,
883                 ldlm_blocking_callback cb_blocking, void *cb_data)
884 {
885         struct obd_device *obd = exp->exp_obd;
886         struct lmv_obd *lmv = &obd->u.lmv;
887         struct lmv_obj *obj;
888         int rc, mds;
889         ENTRY;
890
891         rc = lmv_check_connect(obd);
892         if (rc)
893                 RETURN(rc);
894
895         if (it->it_op == IT_UNLINK) {
896                 rc = lmv_enqueue_slaves(exp, lock_type, it, lock_mode,
897                                         data, lockh, lmm, lmmsize,
898                                         cb_completion, cb_blocking, cb_data);
899                 RETURN(rc);
900         }
901
902         if (data->namelen) {
903                 obj = lmv_grab_obj(obd, &data->fid1);
904                 if (obj) {
905                         /* directory is splitted. look for right mds for this
906                          * name */
907                         mds = raw_name2idx(obj->hashtype, obj->objcount,
908                                            (char *)data->name, data->namelen);
909                         data->fid1 = obj->objs[mds].fid;
910                         lmv_put_obj(obj);
911                 }
912         }
913         CDEBUG(D_OTHER, "ENQUEUE '%s' on %lu/%lu\n", LL_IT2STR(it),
914                (unsigned long)data->fid1.id, (unsigned long)data->fid1.generation);
915         
916         rc = md_enqueue(lmv->tgts[data->fid1.mds].ltd_exp, lock_type, it,
917                         lock_mode, data, lockh, lmm, lmmsize, cb_completion,
918                         cb_blocking, cb_data);
919
920         RETURN(rc);
921 }
922
923 int lmv_getattr_name(struct obd_export *exp, struct ll_fid *fid,
924                      char *filename, int namelen, unsigned long valid,
925                      unsigned int ea_size, struct ptlrpc_request **request)
926 {
927         struct obd_device *obd = exp->exp_obd;
928         struct lmv_obd *lmv = &obd->u.lmv;
929         struct ll_fid rfid = *fid;
930         int rc, mds = fid->mds, loop = 0;
931         struct mds_body *body;
932         struct lmv_obj *obj;
933         ENTRY;
934         rc = lmv_check_connect(obd);
935         if (rc)
936                 RETURN(rc);
937 repeat:
938         LASSERT(++loop <= 2);
939         obj = lmv_grab_obj(obd, fid);
940         if (obj) {
941                 /* directory is splitted. look for right mds for this name */
942                 mds = raw_name2idx(obj->hashtype, obj->objcount, filename, namelen - 1);
943                 rfid = obj->objs[mds].fid;
944                 lmv_put_obj(obj);
945         }
946         
947         CDEBUG(D_OTHER, "getattr_name for %*s on %lu/%lu/%lu -> %lu/%lu/%lu\n",
948                namelen, filename, (unsigned long)fid->mds,
949                (unsigned long)fid->id, (unsigned long)fid->generation,
950                (unsigned long)rfid.mds, (unsigned long)rfid.id,
951                (unsigned long)rfid.generation);
952
953         rc = md_getattr_name(lmv->tgts[rfid.mds].ltd_exp, &rfid, filename,
954                              namelen, valid, ea_size, request);
955         if (rc == 0) {
956                 /* this could be cross-node reference. in this case all we have
957                  * right now is mds/ino/generation triple. we'd like to find
958                  * other attributes */
959                 body = lustre_msg_buf((*request)->rq_repmsg, 0, sizeof(*body));
960                 LASSERT(body != NULL);
961                 if (body->valid & OBD_MD_MDS) {
962                         struct ptlrpc_request *req = NULL;
963                         rfid = body->fid1;
964                         CDEBUG(D_OTHER, "request attrs for %lu/%lu/%lu\n",
965                                (unsigned long) rfid.mds,
966                                (unsigned long) rfid.id,
967                                (unsigned long) rfid.generation);
968                         rc = md_getattr_name(lmv->tgts[rfid.mds].ltd_exp, &rfid,
969                                              NULL, 1, valid, ea_size, &req);
970                         ptlrpc_req_finished(*request);
971                         *request = req;
972                 }
973         } else if (rc == -ERESTART) {
974                 /* directory got splitted. time to update local object and
975                  * repeat the request with proper MDS */
976                 rc = lmv_get_mea_and_update_object(exp, &rfid);
977                 if (rc == 0) {
978                         ptlrpc_req_finished(*request);
979                         goto repeat;
980                 }
981         }
982         RETURN(rc);
983 }
984
985
986 /*
987  * llite passes fid of an target inode in data->fid1 and fid of directory in
988  * data->fid2
989  */
990 int lmv_link(struct obd_export *exp, struct mdc_op_data *data,
991              struct ptlrpc_request **request)
992 {
993         struct obd_device *obd = exp->exp_obd;
994         struct lmv_obd *lmv = &obd->u.lmv;
995         struct lmv_obj *obj;
996         int rc;
997         ENTRY;
998         
999         rc = lmv_check_connect(obd);
1000         if (rc)
1001                 RETURN(rc);
1002
1003         if (data->namelen != 0) {
1004                 /* usual link request */
1005                 obj = lmv_grab_obj(obd, &data->fid1);
1006                 if (obj) {
1007                         rc = raw_name2idx(obj->hashtype, obj->objcount, data->name,
1008                                           data->namelen);
1009                         data->fid1 = obj->objs[rc].fid;
1010                         lmv_put_obj(obj);
1011                 }
1012                 
1013                 CDEBUG(D_OTHER,"link %lu/%lu/%lu:%*s to %lu/%lu/%lu mds %lu\n",
1014                        (unsigned long)data->fid2.mds,
1015                        (unsigned long)data->fid2.id,
1016                        (unsigned long)data->fid2.generation,
1017                        data->namelen, data->name,
1018                        (unsigned long)data->fid1.mds,
1019                        (unsigned long)data->fid1.id,
1020                        (unsigned long)data->fid1.generation,
1021                        (unsigned long)data->fid1.mds);
1022         } else {
1023                 /* request from MDS to acquire i_links for inode by fid1 */
1024                 CDEBUG(D_OTHER, "inc i_nlinks for %lu/%lu/%lu\n",
1025                        (unsigned long)data->fid1.mds,
1026                        (unsigned long)data->fid1.id,
1027                        (unsigned long)data->fid1.generation);
1028         }
1029                         
1030         rc = md_link(lmv->tgts[data->fid1.mds].ltd_exp, data, request);
1031         RETURN(rc);
1032 }
1033
1034 int lmv_rename(struct obd_export *exp, struct mdc_op_data *data,
1035                const char *old, int oldlen, const char *new, int newlen,
1036                struct ptlrpc_request **request)
1037 {
1038         struct obd_device *obd = exp->exp_obd;
1039         struct lmv_obd *lmv = &obd->u.lmv;
1040         struct lmv_obj *obj;
1041         int rc, mds;
1042         ENTRY;
1043
1044         CDEBUG(D_OTHER, "rename %*s in %lu/%lu/%lu to %*s in %lu/%lu/%lu\n",
1045                oldlen, old, (unsigned long)data->fid1.mds,
1046                (unsigned long)data->fid1.id,
1047                (unsigned long)data->fid1.generation,
1048                newlen, new, (unsigned long) data->fid2.mds,
1049                (unsigned long) data->fid2.id,
1050                (unsigned long) data->fid2.generation);
1051         
1052         if (!fid_equal(&data->fid1, &data->fid2))
1053                 CWARN("cross-node rename %lu/%lu/%lu:%*s to %lu/%lu/%lu:%*s\n",
1054                       (unsigned long)data->fid1.mds,
1055                       (unsigned long)data->fid1.id,
1056                       (unsigned long)data->fid1.generation, oldlen, old,
1057                       (unsigned long)data->fid2.mds,
1058                       (unsigned long)data->fid2.id,
1059                       (unsigned long)data->fid2.generation, newlen, new);
1060
1061         rc = lmv_check_connect(obd);
1062         if (rc)
1063                 RETURN(rc);
1064
1065         if (oldlen == 0) {
1066                 /* MDS with old dir entry is asking another MDS to create name
1067                  * there */
1068                 CDEBUG(D_OTHER,
1069                        "create %*s(%d/%d) in %lu/%lu/%lu pointing to %lu/%lu/%lu\n",
1070                        newlen, new, oldlen, newlen,
1071                        (unsigned long)data->fid2.mds,
1072                        (unsigned long)data->fid2.id,
1073                        (unsigned long)data->fid2.generation,
1074                        (unsigned long)data->fid1.mds,
1075                        (unsigned long)data->fid1.id,
1076                        (unsigned long)data->fid1.generation);
1077                 mds = data->fid2.mds;
1078                 goto request;
1079         }
1080
1081         obj = lmv_grab_obj(obd, &data->fid1);
1082         if (obj) {
1083                 /* directory is already splitted, so we have to forward request
1084                  * to the right MDS */
1085                 mds = raw_name2idx(obj->hashtype, obj->objcount, (char *)old, oldlen);
1086                 data->fid1 = obj->objs[mds].fid;
1087                 CDEBUG(D_OTHER, "forward to MDS #%u (%lu/%lu/%lu)\n", mds,
1088                        (unsigned long)obj->objs[mds].fid.mds,
1089                        (unsigned long)obj->objs[mds].fid.id,
1090                        (unsigned long)obj->objs[mds].fid.generation);
1091                 lmv_put_obj(obj);
1092         }
1093
1094         obj = lmv_grab_obj(obd, &data->fid2);
1095         if (obj) {
1096                 /* directory is already splitted, so we have to forward request
1097                  * to the right MDS */
1098                 mds = raw_name2idx(obj->hashtype, obj->objcount, (char *)new, newlen);
1099                 data->fid2 = obj->objs[mds].fid;
1100                 CDEBUG(D_OTHER, "forward to MDS #%u (%lu/%lu/%lu)\n", mds,
1101                        (unsigned long)obj->objs[mds].fid.mds,
1102                        (unsigned long)obj->objs[mds].fid.id,
1103                        (unsigned long)obj->objs[mds].fid.generation);
1104                 lmv_put_obj(obj);
1105         }
1106         
1107         mds = data->fid1.mds;
1108
1109 request:
1110         rc = md_rename(lmv->tgts[mds].ltd_exp, data, old, oldlen,
1111                        new, newlen, request); 
1112         RETURN(rc);
1113 }
1114
1115 int lmv_setattr(struct obd_export *exp, struct mdc_op_data *data,
1116                 struct iattr *iattr, void *ea, int ealen, void *ea2,
1117                 int ea2len, struct ptlrpc_request **request)
1118 {
1119         struct obd_device *obd = exp->exp_obd;
1120         struct lmv_obd *lmv = &obd->u.lmv;
1121         struct ptlrpc_request *req;
1122         struct mds_body *body;
1123         struct lmv_obj *obj;
1124         int rc = 0, i;
1125         ENTRY;
1126
1127         rc = lmv_check_connect(obd);
1128         if (rc)
1129                 RETURN(rc);
1130
1131         obj = lmv_grab_obj(obd, &data->fid1);
1132         
1133         CDEBUG(D_OTHER, "SETATTR for %lu/%lu/%lu, valid 0x%x%s\n",
1134                (unsigned long)data->fid1.mds, (unsigned long)data->fid1.id,
1135                (unsigned long)data->fid1.generation, iattr->ia_valid,
1136                obj ? ", splitted" : "");
1137         
1138         if (obj) {
1139                 for (i = 0; i < obj->objcount; i++) {
1140                         data->fid1 = obj->objs[i].fid;
1141                         
1142                         rc = md_setattr(lmv->tgts[data->fid1.mds].ltd_exp, data,
1143                                         iattr, ea, ealen, ea2, ea2len, &req);
1144
1145                         if (fid_equal(&obj->fid, &obj->objs[i].fid)) {
1146                                 /* this is master object and this request should
1147                                  * be returned back to llite */
1148                                 *request = req;
1149                         } else {
1150                                 ptlrpc_req_finished(req);
1151                         }
1152
1153                         if (rc)
1154                                 break;
1155                 }
1156                 lmv_put_obj(obj);
1157         } else {
1158                 LASSERT(data->fid1.mds < lmv->desc.ld_tgt_count);
1159                 rc = md_setattr(lmv->tgts[data->fid1.mds].ltd_exp, data,
1160                                 iattr, ea, ealen, ea2, ea2len, request); 
1161                 if (rc == 0) {
1162                         body = lustre_msg_buf((*request)->rq_repmsg, 0,
1163                                               sizeof(*body));
1164                         LASSERT(body != NULL);
1165                         LASSERT(body->mds == data->fid1.mds);
1166                 }
1167         }
1168         RETURN(rc);
1169 }
1170
1171 int lmv_sync(struct obd_export *exp, struct ll_fid *fid,
1172              struct ptlrpc_request **request)
1173 {
1174         struct obd_device *obd = exp->exp_obd;
1175         struct lmv_obd *lmv = &obd->u.lmv;
1176         int rc;
1177         ENTRY;
1178
1179         rc = lmv_check_connect(obd);
1180         if (rc)
1181                 RETURN(rc);
1182
1183         rc = md_sync(lmv->tgts[fid->mds].ltd_exp, fid, request); 
1184         RETURN(rc);
1185 }
1186
1187 int lmv_dirobj_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
1188                             void *data, int flag)
1189 {
1190         struct lustre_handle lockh;
1191         struct lmv_obj *obj;
1192         int rc;
1193         ENTRY;
1194
1195         switch (flag) {
1196         case LDLM_CB_BLOCKING:
1197                 ldlm_lock2handle(lock, &lockh);
1198                 rc = ldlm_cli_cancel(&lockh);
1199                 if (rc < 0) {
1200                         CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
1201                         RETURN(rc);
1202                 }
1203                 break;
1204         case LDLM_CB_CANCELING:
1205                 /* time to drop cached attrs for dirobj */
1206                 obj = lock->l_ast_data;
1207                 if (obj) {
1208                         CDEBUG(D_OTHER, "cancel %s on %lu/%lu, master %lu/%lu/%lu\n",
1209                                lock->l_resource->lr_name.name[3] == 1 ? "LOOKUP" : "UPDATE",
1210                                (unsigned long)lock->l_resource->lr_name.name[0],
1211                                (unsigned long)lock->l_resource->lr_name.name[1],
1212                                (unsigned long)obj->fid.mds, (unsigned long)obj->fid.id,
1213                                (unsigned long)obj->fid.generation);
1214                         lmv_put_obj(obj);
1215                 }
1216                 break;
1217         default:
1218                 LBUG();
1219         }
1220         RETURN(0);
1221 }
1222
1223 void lmv_remove_dots(struct page *page)
1224 {
1225         char *kaddr = page_address(page);
1226         unsigned limit = PAGE_CACHE_SIZE;
1227         unsigned offs, rec_len;
1228         struct ext2_dir_entry_2 *p;
1229
1230         for (offs = 0; offs <= limit - EXT2_DIR_REC_LEN(1); offs += rec_len) {
1231                 p = (struct ext2_dir_entry_2 *)(kaddr + offs);
1232                 rec_len = le16_to_cpu(p->rec_len);
1233
1234                 if ((p->name_len == 1 && p->name[0] == '.') ||
1235                     (p->name_len == 2 && p->name[0] == '.' && p->name[1] == '.'))
1236                         p->inode = 0;
1237         }
1238 }
1239
1240 int lmv_readpage(struct obd_export *exp, struct ll_fid *mdc_fid,
1241                  __u64 offset, struct page *page,
1242                  struct ptlrpc_request **request)
1243 {
1244         struct obd_device *obd = exp->exp_obd;
1245         struct lmv_obd *lmv = &obd->u.lmv;
1246         struct ll_fid rfid = *mdc_fid;
1247         struct lmv_obj *obj;
1248         int rc, i;
1249         ENTRY;
1250
1251         rc = lmv_check_connect(obd);
1252         if (rc)
1253                 RETURN(rc);
1254
1255         LASSERT(mdc_fid->mds < lmv->desc.ld_tgt_count);
1256         CDEBUG(D_OTHER, "READPAGE at %llu from %lu/%lu/%lu\n",
1257                offset, (unsigned long) rfid.mds,
1258                (unsigned long) rfid.id,
1259                (unsigned long) rfid.generation);
1260
1261         obj = lmv_grab_obj(obd, mdc_fid);
1262         if (obj) {
1263                 lmv_lock_obj(obj);
1264
1265                 /* find dirobj containing page with requested offset. */
1266                 for (i = 0; i < obj->objcount; i++) {
1267                         if (offset < obj->objs[i].size)
1268                                 break;
1269                         offset -= obj->objs[i].size;
1270                 }
1271                 rfid = obj->objs[i].fid;
1272                 
1273                 lmv_unlock_obj(obj);
1274                 lmv_put_obj(obj);
1275                 
1276                 CDEBUG(D_OTHER, "forward to %lu/%lu/%lu with offset %lu\n",
1277                        (unsigned long)rfid.mds, (unsigned long)rfid.id,
1278                        (unsigned long)rfid.generation, (unsigned long)offset);
1279         }
1280         rc = md_readpage(lmv->tgts[rfid.mds].ltd_exp, &rfid, offset,
1281                          page, request);
1282         
1283         if (rc == 0 && !fid_equal(&rfid, mdc_fid))
1284                 /* this page isn't from master object. To avoid "." and ".." 
1285                  * duplication in directory, we have to remove them from all
1286                  * slave objects */
1287                 lmv_remove_dots(page);
1288         
1289         RETURN(rc);
1290 }
1291
1292 int lmv_unlink_slaves(struct obd_export *exp, struct mdc_op_data *data,
1293                       struct ptlrpc_request **req)
1294 {
1295         struct obd_device *obd = exp->exp_obd;
1296         struct lmv_obd *lmv = &obd->u.lmv;
1297         struct mea *mea = data->mea1;
1298         struct mdc_op_data data2;
1299         int i, rc = 0, mds;
1300         ENTRY;
1301
1302         LASSERT(mea != NULL);
1303         for (i = 0; i < mea->mea_count; i++) {
1304                 memset(&data2, 0, sizeof(data2));
1305                 data2.fid1 = mea->mea_fids[i];
1306                 data2.create_mode = MDS_MODE_DONT_LOCK | S_IFDIR;
1307                 mds = data2.fid1.mds;
1308
1309                 if (lmv->tgts[mds].ltd_exp == NULL)
1310                         continue;
1311
1312                 rc = md_unlink(lmv->tgts[mds].ltd_exp, &data2, req);
1313                 CDEBUG(D_OTHER, "unlink slave %lu/%lu/%lu -> %d\n",
1314                        (unsigned long) mea->mea_fids[i].mds,
1315                        (unsigned long) mea->mea_fids[i].id,
1316                        (unsigned long) mea->mea_fids[i].generation, rc);
1317                 if (*req) {
1318                         ptlrpc_req_finished(*req);
1319                         *req = NULL;
1320                 }
1321                 if (rc)
1322                         break;
1323         }
1324         RETURN(rc);
1325 }
1326
1327 int lmv_delete_object(struct obd_export *exp, struct ll_fid *fid)
1328 {
1329         ENTRY;
1330
1331         if (!lmv_delete_obj(exp, fid)) {
1332                 CDEBUG(D_OTHER, "Object %lu/%lu/%lu is not found.\n",
1333                        (unsigned long)fid->mds, (unsigned long)fid->id,
1334                        (unsigned long)fid->generation);
1335         }
1336         
1337         RETURN(0);
1338 }
1339
1340 int lmv_unlink(struct obd_export *exp, struct mdc_op_data *data,
1341                struct ptlrpc_request **request)
1342 {
1343         struct obd_device *obd = exp->exp_obd;
1344         struct lmv_obd *lmv = &obd->u.lmv;
1345         int rc, i = 0;
1346         ENTRY;
1347         
1348         rc = lmv_check_connect(obd);
1349         if (rc)
1350                 RETURN(rc);
1351
1352         if (data->namelen == 0 && data->mea1 != NULL) {
1353                 /* mds asks to remove slave objects */
1354                 rc = lmv_unlink_slaves(exp, data, request);
1355                 RETURN(rc);
1356         } else if (data->namelen != 0) {
1357                 struct lmv_obj *obj;
1358                 
1359                 obj = lmv_grab_obj(obd, &data->fid1);
1360                 if (obj) {
1361                         i = raw_name2idx(obj->hashtype, obj->objcount, data->name,
1362                                          data->namelen);
1363                         data->fid1 = obj->objs[i].fid;
1364                         lmv_put_obj(obj);
1365                 }
1366                 CDEBUG(D_OTHER, "unlink '%*s' in %lu/%lu/%lu -> %u\n",
1367                        data->namelen, data->name,
1368                        (unsigned long) data->fid1.mds,
1369                        (unsigned long) data->fid1.id,
1370                        (unsigned long) data->fid1.generation, i);
1371         } else {
1372                 CDEBUG(D_OTHER, "drop i_nlink on %lu/%lu/%lu\n",
1373                        (unsigned long) data->fid1.mds,
1374                        (unsigned long) data->fid1.id,
1375                        (unsigned long) data->fid1.generation);
1376         }
1377         rc = md_unlink(lmv->tgts[data->fid1.mds].ltd_exp, data, request); 
1378         RETURN(rc);
1379 }
1380
1381 struct obd_device *lmv_get_real_obd(struct obd_export *exp,
1382                                     char *name, int len)
1383 {
1384         struct obd_device *obd = exp->exp_obd;
1385         struct lmv_obd *lmv = &obd->u.lmv;
1386         int rc;
1387         ENTRY;
1388
1389         rc = lmv_check_connect(obd);
1390         if (rc)
1391                 RETURN(ERR_PTR(rc));
1392 #warning "we need well-desgined readdir() implementation to remove this mess"
1393         obd = lmv->tgts[0].ltd_exp->exp_obd;
1394         EXIT;
1395         return obd;
1396 }
1397
1398 int lmv_init_ea_size(struct obd_export *exp, int easize, int cookiesize)
1399 {
1400         struct obd_device *obd = exp->exp_obd;
1401         struct lmv_obd *lmv = &obd->u.lmv;
1402         int i, rc = 0, change = 0;
1403         ENTRY;
1404
1405         if (lmv->max_easize < easize) {
1406                 lmv->max_easize = easize;
1407                 change = 1;
1408         }
1409         if (lmv->max_cookiesize < cookiesize) {
1410                 lmv->max_cookiesize = cookiesize;
1411                 change = 1;
1412         }
1413         if (change == 0)
1414                 RETURN(0);
1415         
1416         if (lmv->connected == 0)
1417                 RETURN(0);
1418
1419         for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
1420                 if (lmv->tgts[i].ltd_exp == NULL) {
1421                         CWARN("%s: NULL export for %d\n", obd->obd_name, i);
1422                         continue;
1423                 }
1424
1425                 rc = obd_init_ea_size(lmv->tgts[i].ltd_exp, easize, cookiesize);
1426                 if (rc) {
1427                         CERROR("obd_init_ea_size() failed on MDT target %d, "
1428                                "error %d.\n", i, rc);
1429                         break;
1430                 }
1431         }
1432         RETURN(rc);
1433 }
1434
1435 int lmv_obd_create_single(struct obd_export *exp, struct obdo *oa,
1436                           struct lov_stripe_md **ea, struct obd_trans_info *oti)
1437 {
1438         struct obd_device *obd = exp->exp_obd;
1439         struct lmv_obd *lmv = &obd->u.lmv;
1440         struct lov_stripe_md obj_md;
1441         struct lov_stripe_md *obj_mdp = &obj_md;
1442         int rc = 0;
1443         ENTRY;
1444
1445         rc = lmv_check_connect(obd);
1446         if (rc)
1447                 RETURN(rc);
1448
1449         LASSERT(ea == NULL);
1450         LASSERT(oa->o_mds < lmv->desc.ld_tgt_count);
1451
1452         rc = obd_create(lmv->tgts[oa->o_mds].ltd_exp, oa, &obj_mdp, oti);
1453
1454         RETURN(rc);
1455 }
1456
1457 /*
1458  * to be called from MDS only
1459  */
1460 int lmv_obd_create(struct obd_export *exp, struct obdo *oa,
1461                    struct lov_stripe_md **ea, struct obd_trans_info *oti)
1462 {
1463         struct obd_device *obd = exp->exp_obd;
1464         struct lmv_obd *lmv = &obd->u.lmv;
1465         int i, c, rc = 0;
1466         struct mea *mea;
1467         struct ll_fid mfid;
1468         int lcount;
1469         ENTRY;
1470
1471         rc = lmv_check_connect(obd);
1472         if (rc)
1473                 RETURN(rc);
1474
1475         LASSERT(oa != NULL);
1476         
1477         if (ea == NULL) {
1478                 rc = lmv_obd_create_single(exp, oa, NULL, oti);
1479                 RETURN(rc);
1480         }
1481
1482         if (*ea == NULL) {
1483                 rc = obd_alloc_diskmd(exp, (struct lov_mds_md **)ea);
1484                 if (rc < 0) {
1485                         CERROR("obd_alloc_diskmd() failed, error %d\n",
1486                                rc);
1487                         RETURN(rc);
1488                 }
1489                 
1490                 if (*ea == NULL)
1491                         RETURN(-EINVAL);
1492         }
1493
1494         rc = 0;
1495         mfid.id = oa->o_id;
1496         mfid.generation = oa->o_generation;
1497         
1498         mea = (struct mea *)*ea;
1499         if (!mea->mea_count || mea->mea_count > lmv->desc.ld_tgt_count)
1500                 mea->mea_count = lmv->desc.ld_tgt_count;
1501         mea->mea_magic = MEA_MAGIC_ALL_CHARS;
1502
1503         mea->mea_master = -1;
1504         lcount = lmv->desc.ld_tgt_count;
1505         for (i = 0, c = 0; c < mea->mea_count && i < lcount; i++) {
1506                 struct lov_stripe_md obj_md;
1507                 struct lov_stripe_md *obj_mdp = &obj_md;
1508                
1509                 if (lmv->tgts[i].ltd_exp == NULL) {
1510                         /* this is master MDS */
1511                         mea->mea_fids[c].id = mfid.id;
1512                         mea->mea_fids[c].generation = mfid.generation;
1513                         mea->mea_fids[c].mds = i;
1514                         mea->mea_master = i;
1515                         c++;
1516                         continue;
1517                 }
1518
1519                 /* "master" MDS should always be part of stripped dir, so scan
1520                    for it. */
1521                 if (mea->mea_master == -1 && c == mea->mea_count - 1)
1522                         continue;
1523
1524                 oa->o_valid = OBD_MD_FLGENER | OBD_MD_FLTYPE | OBD_MD_FLMODE
1525                         | OBD_MD_FLUID | OBD_MD_FLGID | OBD_MD_FLID;
1526
1527                 rc = obd_create(lmv->tgts[c].ltd_exp, oa, &obj_mdp, oti);
1528                 if (rc) {
1529                         CERROR("obd_create() failed on MDT target %d, "
1530                                "error %d\n", c, rc);
1531                         RETURN(rc);
1532                 }
1533
1534                 mea->mea_fids[c].id = oa->o_id;
1535                 mea->mea_fids[c].generation = oa->o_generation;
1536                 mea->mea_fids[c].mds = i;
1537                 c++;
1538                 CDEBUG(D_OTHER, "dirobj at mds %d: "LPU64"/%u\n",
1539                        i, oa->o_id, oa->o_generation);
1540         }
1541         LASSERT(c == mea->mea_count);
1542         CDEBUG(D_OTHER, "%d dirobjects created\n", (int) mea->mea_count);
1543
1544         RETURN(rc);
1545 }
1546
1547 static int lmv_get_info(struct obd_export *exp, __u32 keylen,
1548                         void *key, __u32 *vallen, void *val)
1549 {
1550         struct obd_device *obd;
1551         struct lmv_obd *lmv;
1552         ENTRY;
1553
1554         obd = class_exp2obd(exp);
1555         if (obd == NULL) {
1556                 CDEBUG(D_IOCTL, "invalid client cookie "LPX64"\n",
1557                        exp->exp_handle.h_cookie);
1558                 RETURN(-EINVAL);
1559         }
1560
1561         lmv = &obd->u.lmv;
1562         if (keylen == 6 && memcmp(key, "mdsize", 6) == 0) {
1563                 __u32 *mdsize = val;
1564                 *vallen = sizeof(__u32);
1565                 *mdsize = sizeof(struct ll_fid) * lmv->desc.ld_tgt_count
1566                                 + sizeof(struct mea);
1567                 RETURN(0);
1568         } else if (keylen == 6 && memcmp(key, "mdsnum", 6) == 0) {
1569                 struct obd_uuid *cluuid = &lmv->cluuid;
1570                 struct lmv_tgt_desc *tgts;
1571                 __u32 *mdsnum = val;
1572                 int i;
1573
1574                 for (i = 0, tgts = lmv->tgts; i < lmv->desc.ld_tgt_count; i++, tgts++) {
1575                         if (obd_uuid_equals(&tgts->uuid, cluuid)) {
1576                                 *vallen = sizeof(__u32);
1577                                 *mdsnum = i;
1578                                 RETURN(0);
1579                         }
1580                 }
1581                 LASSERT(0);
1582         }
1583
1584         CDEBUG(D_IOCTL, "invalid key\n");
1585         RETURN(-EINVAL);
1586 }
1587
1588 int lmv_set_info(struct obd_export *exp, obd_count keylen,
1589                  void *key, obd_count vallen, void *val)
1590 {
1591         struct obd_device *obd;
1592         struct lmv_obd *lmv;
1593         ENTRY;
1594
1595         obd = class_exp2obd(exp);
1596         if (obd == NULL) {
1597                 CDEBUG(D_IOCTL, "invalid client cookie "LPX64"\n",
1598                        exp->exp_handle.h_cookie);
1599                 RETURN(-EINVAL);
1600         }
1601         lmv = &obd->u.lmv;
1602
1603         if (keylen >= strlen("client") && strcmp(key, "client") == 0) {
1604                 struct lmv_tgt_desc *tgts;
1605                 int i, rc;
1606
1607                 rc = lmv_check_connect(obd);
1608                 if (rc)
1609                         RETURN(rc);
1610
1611                 for (i = 0, tgts = lmv->tgts; 
1612                         i < lmv->desc.ld_tgt_count; i++, tgts++) {
1613                         rc = obd_set_info(tgts->ltd_exp, keylen, key, vallen, val);
1614                         if (rc)
1615                                 RETURN(rc);
1616                 }
1617                 RETURN(0);
1618         } else if (keylen >= strlen("inter_mds") && strcmp(key, "inter_mds") == 0) {
1619                 lmv->server_timeout = 1;
1620                 lmv_set_timeouts(obd);
1621                 RETURN(0);
1622         }
1623         
1624         RETURN(-EINVAL);
1625 }
1626
1627 int lmv_packmd(struct obd_export *exp, struct lov_mds_md **lmmp,
1628                struct lov_stripe_md *lsm)
1629 {
1630         struct obd_device *obd = class_exp2obd(exp);
1631         struct lmv_obd *lmv = &obd->u.lmv;
1632         int mea_size;
1633         ENTRY;
1634
1635         mea_size = sizeof(struct ll_fid) * 
1636                 lmv->desc.ld_tgt_count + sizeof(struct mea);
1637         if (!lmmp)
1638                 RETURN(mea_size);
1639
1640         if (*lmmp && !lsm) {
1641                 OBD_FREE(*lmmp, mea_size);
1642                 *lmmp = NULL;
1643                 RETURN(0);
1644         }
1645
1646         if (*lmmp == NULL) {
1647                 OBD_ALLOC(*lmmp, mea_size);
1648                 if (*lmmp == NULL)
1649                         RETURN(-ENOMEM);
1650         }
1651
1652         if (!lsm)
1653                 RETURN(mea_size);
1654
1655 #warning "MEA packing/convertation must be here! -bzzz"
1656         memcpy(*lmmp, lsm, mea_size);
1657         RETURN(mea_size);
1658 }
1659
1660 int lmv_unpackmd(struct obd_export *exp, struct lov_stripe_md **mem_tgt,
1661                  struct lov_mds_md *disk_src, int mdsize)
1662 {
1663         struct obd_device *obd = class_exp2obd(exp);
1664         struct lmv_obd *lmv = &obd->u.lmv;
1665         struct mea **tmea = (struct mea **) mem_tgt;
1666         struct mea *mea = (void *) disk_src;
1667         int mea_size;
1668         ENTRY;
1669
1670         mea_size = sizeof(struct ll_fid) * 
1671                 lmv->desc.ld_tgt_count + sizeof(struct mea);
1672         if (mem_tgt == NULL)
1673                 return mea_size;
1674
1675         if (*mem_tgt != NULL && disk_src == NULL) {
1676                 OBD_FREE(*tmea, mea_size);
1677                 RETURN(0);
1678         }
1679
1680         LASSERT(mea_size == mdsize);
1681
1682         OBD_ALLOC(*tmea, mea_size);
1683         if (*tmea == NULL)
1684                 RETURN(-ENOMEM);
1685
1686         if (!disk_src)
1687                 RETURN(mea_size);
1688
1689 #warning "MEA unpacking/convertation must be here! -bzzz"
1690         memcpy(*tmea, mea, mdsize);
1691         RETURN(mea_size);
1692 }
1693
1694 int lmv_brw(int rw, struct obd_export *exp, struct obdo *oa,
1695             struct lov_stripe_md *ea, obd_count oa_bufs,
1696             struct brw_page *pgarr, struct obd_trans_info *oti)
1697 {
1698         struct obd_device *obd = exp->exp_obd;
1699         struct lmv_obd *lmv = &obd->u.lmv;
1700         struct mea *mea = (struct mea *) ea;
1701         int err;
1702       
1703         LASSERT(oa != NULL);
1704         LASSERT(ea != NULL);
1705         LASSERT(pgarr != NULL);
1706         LASSERT(oa->o_mds < lmv->desc.ld_tgt_count);
1707
1708         oa->o_gr = mea->mea_fids[oa->o_mds].generation;
1709         oa->o_id = mea->mea_fids[oa->o_mds].id;
1710         oa->o_valid =  OBD_MD_FLID | OBD_MD_FLGROUP;
1711         err = obd_brw(rw, lmv->tgts[oa->o_mds].ltd_exp, oa,
1712                       NULL, oa_bufs, pgarr, oti);
1713         RETURN(err);
1714 }
1715
1716 struct obd_ops lmv_obd_ops = {
1717         .o_owner                = THIS_MODULE,
1718         .o_attach               = lmv_attach,
1719         .o_detach               = lmv_detach,
1720         .o_setup                = lmv_setup,
1721         .o_cleanup              = lmv_cleanup,
1722         .o_connect              = lmv_connect,
1723         .o_disconnect           = lmv_disconnect,
1724         .o_statfs               = lmv_statfs,
1725         .o_get_info             = lmv_get_info,
1726         .o_set_info             = lmv_set_info,
1727         .o_create               = lmv_obd_create,
1728         .o_packmd               = lmv_packmd,
1729         .o_unpackmd             = lmv_unpackmd,
1730         .o_brw                  = lmv_brw,
1731         .o_init_ea_size         = lmv_init_ea_size,
1732         .o_notify               = lmv_notify,
1733         .o_iocontrol            = lmv_iocontrol,
1734 };
1735
1736 struct md_ops lmv_md_ops = {
1737         .m_getstatus            = lmv_getstatus,
1738         .m_getattr              = lmv_getattr,
1739         .m_change_cbdata        = lmv_change_cbdata,
1740         .m_change_cbdata_name   = lmv_change_cbdata_name,
1741         .m_close                = lmv_close,
1742         .m_create               = lmv_create,
1743         .m_done_writing         = lmv_done_writing,
1744         .m_enqueue              = lmv_enqueue,
1745         .m_getattr_name         = lmv_getattr_name,
1746         .m_intent_lock          = lmv_intent_lock,
1747         .m_link                 = lmv_link,
1748         .m_rename               = lmv_rename,
1749         .m_setattr              = lmv_setattr,
1750         .m_sync                 = lmv_sync,
1751         .m_readpage             = lmv_readpage,
1752         .m_unlink               = lmv_unlink,
1753         .m_get_real_obd         = lmv_get_real_obd,
1754         .m_valid_attrs          = lmv_valid_attrs,
1755         .m_delete_object        = lmv_delete_object,
1756 };
1757
1758 int __init lmv_init(void)
1759 {
1760         struct lprocfs_static_vars lvars;
1761         int rc;
1762
1763         lprocfs_init_vars(lmv, &lvars);
1764         rc = class_register_type(&lmv_obd_ops, &lmv_md_ops,
1765                                  lvars.module_vars, OBD_LMV_DEVICENAME);
1766         RETURN(rc);
1767 }
1768
#ifdef __KERNEL__
/*
 * Module teardown: unregister the device type that lmv_init() registered
 * under OBD_LMV_DEVICENAME.  Only built for kernel builds.
 */
static void lmv_exit(void)
{
        class_unregister_type(OBD_LMV_DEVICENAME);
}

/* Kernel module metadata. */
MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
MODULE_DESCRIPTION("Lustre Logical Metadata Volume OBD driver");
MODULE_LICENSE("GPL");

/* Register lmv_init()/lmv_exit() as the module load/unload entry points. */
module_init(lmv_init);
module_exit(lmv_exit);
#endif