Whamcloud - gitweb
Commit OST AMD support to HEAD so we can begin running with a common code base.
[fs/lustre-release.git] / lustre / lmv / lmv_obd.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  * Copyright (C) 2002, 2003 Cluster File Systems, Inc.
5  *
6  *   This file is part of Lustre, http://www.lustre.org.
7  *
8  *   Lustre is free software; you can redistribute it and/or
9  *   modify it under the terms of version 2 of the GNU General Public
10  *   License as published by the Free Software Foundation.
11  *
12  *   Lustre is distributed in the hope that it will be useful,
13  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  *   GNU General Public License for more details.
16  *
17  *   You should have received a copy of the GNU General Public License
18  *   along with Lustre; if not, write to the Free Software
19  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20  */
21
22 #ifndef EXPORT_SYMTAB
23 # define EXPORT_SYMTAB
24 #endif
25 #define DEBUG_SUBSYSTEM S_LMV
26 #ifdef __KERNEL__
27 #include <linux/slab.h>
28 #include <linux/module.h>
29 #include <linux/init.h>
30 #include <linux/slab.h>
31 #include <linux/pagemap.h>
32 #include <asm/div64.h>
33 #include <linux/seq_file.h>
34 #else
35 #include <liblustre.h>
36 #endif
37 #include <linux/ext2_fs.h>
38
39 #include <linux/obd_support.h>
40 #include <linux/lustre_lib.h>
41 #include <linux/lustre_net.h>
42 #include <linux/lustre_idl.h>
43 #include <linux/lustre_dlm.h>
44 #include <linux/lustre_mds.h>
45 #include <linux/obd_class.h>
46 #include <linux/obd_ost.h>
47 #include <linux/lprocfs_status.h>
48 #include <linux/lustre_fsfilt.h>
49 #include <linux/obd_lmv.h>
50 #include "lmv_internal.h"
51
52 /* Error codes:
53  *
54  *  -EINVAL  : UUID can't be found in the LMV's target list
55  *  -ENOTCONN: The UUID is found, but the target connection is bad (!)
56  *  -EBADF   : The UUID is found, but the OBD of the wrong type (!)
57  */
58 static int lmv_set_mdc_active(struct lmv_obd *lmv, struct obd_uuid *uuid,
59                               int activate)
60 {
61         struct obd_device *obd;
62         struct lmv_tgt_desc *tgt;
63         int i, rc = 0;
64         ENTRY;
65
66         CDEBUG(D_INFO, "Searching in lmv %p for uuid %s (activate=%d)\n",
67                lmv, uuid->uuid, activate);
68
69         spin_lock(&lmv->lmv_lock);
70         for (i = 0, tgt = lmv->tgts; i < lmv->desc.ld_tgt_count; i++, tgt++) {
71                 if (tgt->ltd_exp == NULL)
72                         continue;
73
74                 CDEBUG(D_INFO, "lmv idx %d is %s conn "LPX64"\n",
75                        i, tgt->uuid.uuid, tgt->ltd_exp->exp_handle.h_cookie);
76                 if (strncmp(uuid->uuid, tgt->uuid.uuid, sizeof uuid->uuid) == 0)
77                         break;
78         }
79
80         if (i == lmv->desc.ld_tgt_count)
81                 GOTO(out, rc = -EINVAL);
82
83         obd = class_exp2obd(tgt->ltd_exp);
84         if (obd == NULL)
85                 GOTO(out, rc = -ENOTCONN);
86
87         CDEBUG(D_INFO, "Found OBD %s=%s device %d (%p) type %s at LMV idx %d\n",
88                obd->obd_name, obd->obd_uuid.uuid, obd->obd_minor, obd,
89                obd->obd_type->typ_name, i);
90         LASSERT(strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) == 0);
91
92         if (tgt->active == activate) {
93                 CDEBUG(D_INFO, "OBD %p already %sactive!\n", obd,
94                        activate ? "" : "in");
95                 GOTO(out, rc);
96         }
97
98         CDEBUG(D_INFO, "Marking OBD %p %sactive\n", obd, activate ? "" : "in");
99
100         tgt->active = activate;
101         if (activate)
102                 lmv->desc.ld_active_tgt_count++;
103         else
104                 lmv->desc.ld_active_tgt_count--;
105
106         EXIT;
107  out:
108         spin_unlock(&lmv->lmv_lock);
109         return rc;
110 }
111
112 static int lmv_notify(struct obd_device *obd, struct obd_device *watched,
113                       int active, void *data)
114 {
115         int rc;
116         struct obd_uuid *uuid;
117
118         if (strcmp(watched->obd_type->typ_name, LUSTRE_MDC_NAME)) {
119                 CERROR("unexpected notification of %s %s!\n",
120                        watched->obd_type->typ_name,
121                        watched->obd_name);
122                 return -EINVAL;
123         }
124         uuid = &watched->u.cli.cl_import->imp_target_uuid;
125
126         /* Set MDC as active before notifying the observer, so the
127          * observer can use the MDC normally.  
128          */
129         rc = lmv_set_mdc_active(&obd->u.lmv, uuid, active);
130         if (rc) {
131                 CERROR("%sactivation of %s failed: %d\n",
132                        active ? "" : "de", uuid->uuid, rc);
133                 RETURN(rc);
134         }
135
136         if (obd->obd_observer)
137                 /* Pass the notification up the chain. */
138                 rc = obd_notify(obd->obd_observer, watched, active, data);
139
140         RETURN(rc);
141 }
142
143 int lmv_attach(struct obd_device *dev, obd_count len, void *data)
144 {
145         struct lprocfs_static_vars lvars;
146         int rc;
147         ENTRY;
148
149         lprocfs_init_vars(lmv, &lvars);
150         rc = lprocfs_obd_attach(dev, lvars.obd_vars);
151         if (rc == 0) {
152 #ifdef __KERNEL__
153                 struct proc_dir_entry *entry;
154                 
155                 entry = create_proc_entry("target_obd", 0444, dev->obd_proc_entry);
156                 if (entry == NULL)
157                         RETURN(-ENOMEM);
158                 /* entry->proc_fops = &lmv_proc_target_fops; */
159                 entry->data = dev;
160 #endif
161        }
162         RETURN (rc);
163 }
164
/* Detach hook: hand @dev to lprocfs_obd_detach() and report its result. */
int lmv_detach(struct obd_device *dev)
{
        int rc = lprocfs_obd_detach(dev);

        return rc;
}
169
170 /* This is fake connect function. Its purpose is to initialize lmv and 
171  * say caller that everything is okay. Real connection will be performed
172  * later. */
173 static int lmv_connect(struct lustre_handle *conn, struct obd_device *obd,
174                        struct obd_uuid *cluuid, unsigned long connect_flags)
175 {
176         struct lmv_obd *lmv = &obd->u.lmv;
177         struct obd_export *exp;
178         int rc;
179         ENTRY;
180
181         rc = class_connect(conn, obd, cluuid);
182         if (rc) {
183                 CERROR("class_connection() returned %d\n", rc);
184                 RETURN(rc);
185         }
186
187         exp = class_conn2export(conn);
188         /* We don't want to actually do the underlying connections more than
189          * once, so keep track. */
190         lmv->refcount++;
191         if (lmv->refcount > 1) {
192                 class_export_put(exp);
193                 RETURN(0);
194         }
195
196         lmv->cluuid = *cluuid;
197         lmv->connect_flags = connect_flags;
198         lmv->connected = 0;
199         lmv->exp = exp;
200         sema_init(&lmv->init_sem, 1);
201
202         RETURN(0);
203 }
204
205 void lmv_set_timeouts(struct obd_device *obd)
206 {
207         struct lmv_tgt_desc *tgts;
208         struct lmv_obd *lmv;
209         int i;
210
211         lmv = &obd->u.lmv;
212         if (lmv->server_timeout == 0)
213                 return;
214
215         if (lmv->connected == 0)
216                 return;
217
218         for (i = 0, tgts = lmv->tgts; i < lmv->desc.ld_tgt_count; i++, tgts++) {
219                 if (tgts->ltd_exp == NULL)
220                         continue;
221                 obd_set_info(tgts->ltd_exp, strlen("inter_mds"),
222                              "inter_mds", 0, NULL);
223         }
224 }
225
226 /* Performs a check if passed obd is connected. If no - connect it. */
227 int lmv_check_connect(struct obd_device *obd)
228 {
229         struct lmv_obd *lmv = &obd->u.lmv;
230         struct obd_uuid *cluuid;
231         struct lmv_tgt_desc *tgts;
232         struct obd_export *exp;
233         int rc, rc2, i;
234
235         if (lmv->connected)
236                 return 0;
237
238         down(&lmv->init_sem);
239         if (lmv->connected) {
240                 up(&lmv->init_sem);
241                 return 0;
242         }
243
244         lmv->connected = 1;
245         cluuid = &lmv->cluuid;
246         exp = lmv->exp;
247         
248         CDEBUG(D_OTHER, "time to connect %s to %s\n",
249                cluuid->uuid, obd->obd_name);
250
251         for (i = 0, tgts = lmv->tgts; i < lmv->desc.ld_tgt_count; i++, tgts++) {
252                 struct obd_device *tgt_obd;
253                 struct obd_uuid lmv_osc_uuid = { "LMV_OSC_UUID" };
254                 struct lustre_handle conn = {0, };
255
256                 LASSERT(tgts != NULL);
257
258                 tgt_obd = class_find_client_obd(&tgts->uuid, LUSTRE_MDC_NAME, 
259                                                 &obd->obd_uuid);
260                 if (!tgt_obd) {
261                         CERROR("Target %s not attached\n", tgts->uuid.uuid);
262                         GOTO(out_disc, rc = -EINVAL);
263                 }
264
265                 /* for MDS: don't connect to yourself */
266                 if (obd_uuid_equals(&tgts->uuid, cluuid)) {
267                         CDEBUG(D_OTHER, "don't connect back to %s\n",
268                                cluuid->uuid);
269                         tgts->ltd_exp = NULL;
270                         continue;
271                 }
272
273                 CDEBUG(D_OTHER, "connect to %s(%s) - %s, %s FOR %s\n",
274                         tgt_obd->obd_name, tgt_obd->obd_uuid.uuid,
275                         tgts->uuid.uuid, obd->obd_uuid.uuid,
276                         cluuid->uuid);
277
278                 if (!tgt_obd->obd_set_up) {
279                         CERROR("Target %s not set up\n", tgts->uuid.uuid);
280                         GOTO(out_disc, rc = -EINVAL);
281                 }
282                 
283                 rc = obd_connect(&conn, tgt_obd, &lmv_osc_uuid, lmv->connect_flags);
284                 if (rc) {
285                         CERROR("Target %s connect error %d\n",
286                                 tgts->uuid.uuid, rc);
287                         GOTO(out_disc, rc);
288                 }
289                 tgts->ltd_exp = class_conn2export(&conn);
290
291                 obd_init_ea_size(tgts->ltd_exp, lmv->max_easize,
292                                  lmv->max_cookiesize);
293                 
294                 rc = obd_register_observer(tgt_obd, obd);
295                 if (rc) {
296                         CERROR("Target %s register_observer error %d\n",
297                                tgts->uuid.uuid, rc);
298                         obd_disconnect(tgts->ltd_exp, 0);
299                         GOTO(out_disc, rc);
300                 }
301
302                 lmv->desc.ld_active_tgt_count++;
303                 tgts->active = 1;
304                 
305                 CDEBUG(D_OTHER, "connected to %s(%s) successfully (%d)\n",
306                         tgt_obd->obd_name, tgt_obd->obd_uuid.uuid,
307                         atomic_read(&obd->obd_refcount));
308         }
309
310         lmv_set_timeouts(obd);
311         class_export_put(exp);
312         up(&lmv->init_sem);
313         return 0;
314
315  out_disc:
316         while (i-- > 0) {
317                 struct obd_uuid uuid;
318                 --tgts;
319                 --lmv->desc.ld_active_tgt_count;
320                 tgts->active = 0;
321                 /* save for CERROR below; (we know it's terminated) */
322                 uuid = tgts->uuid;
323                 rc2 = obd_disconnect(tgts->ltd_exp, 0);
324                 if (rc2)
325                         CERROR("error: LMV target %s disconnect on MDT idx %d: "
326                                "error %d\n", uuid.uuid, i, rc2);
327         }
328         class_disconnect(exp, 0);
329         up(&lmv->init_sem);
330         RETURN (rc);
331 }
332
333 static int lmv_disconnect(struct obd_export *exp, int flags)
334 {
335         struct obd_device *obd = class_exp2obd(exp);
336         struct lmv_obd *lmv = &obd->u.lmv;
337         int rc, i;
338         ENTRY;
339
340         if (!lmv->tgts)
341                 goto out_local;
342
343         /* Only disconnect the underlying layers on the final disconnect. */
344         lmv->refcount--;
345         if (lmv->refcount != 0)
346                 goto out_local;
347
348         for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
349                 if (lmv->tgts[i].ltd_exp == NULL)
350                         continue;
351
352                 if (obd->obd_no_recov) {
353                         /* Pass it on to our clients.
354                          * XXX This should be an argument to disconnect,
355                          * XXX not a back-door flag on the OBD.  Ah well.
356                          */
357                         struct obd_device *mdc_obd;
358                         mdc_obd = class_exp2obd(lmv->tgts[i].ltd_exp);
359                         if (mdc_obd)
360                                 mdc_obd->obd_no_recov = 1;
361                 }
362
363                 CDEBUG(D_OTHER, "disconnected from %s(%s) successfully\n",
364                         lmv->tgts[i].ltd_exp->exp_obd->obd_name,
365                         lmv->tgts[i].ltd_exp->exp_obd->obd_uuid.uuid);
366
367                 obd_register_observer(lmv->tgts[i].ltd_exp->exp_obd, NULL);
368
369                 rc = obd_disconnect(lmv->tgts[i].ltd_exp, flags);
370                 if (rc) {
371                         if (lmv->tgts[i].active) {
372                                 CERROR("Target %s disconnect error %d\n",
373                                        lmv->tgts[i].uuid.uuid, rc);
374                         }
375                         rc = 0;
376                 }
377                 if (lmv->tgts[i].active) {
378                         lmv->desc.ld_active_tgt_count--;
379                         lmv->tgts[i].active = 0;
380                 }
381                 lmv->tgts[i].ltd_exp = NULL;
382         }
383
384 out_local:
385         /* this is the case when no real connection is established by
386          * lmv_check_connect(). */
387         if (!lmv->connected)
388                 class_export_put(exp);
389         rc = class_disconnect(exp, 0);
390         if (lmv->refcount == 0)
391                 lmv->connected = 0;
392         RETURN(rc);
393 }
394
395 static int lmv_iocontrol(unsigned int cmd, struct obd_export *exp,
396                          int len, void *karg, void *uarg)
397 {
398         struct obd_device *obddev = class_exp2obd(exp);
399         struct lmv_obd *lmv = &obddev->u.lmv;
400         int i, rc = 0, set = 0;
401
402         ENTRY;
403
404         if (lmv->desc.ld_tgt_count == 0)
405                 RETURN(-ENOTTY);
406         
407         for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
408                 int err;
409
410                 if (lmv->tgts[i].ltd_exp == NULL) {
411                         CWARN("%s: NULL export for %d\n", obddev->obd_name, i);
412                         continue;
413                 }
414
415                 err = obd_iocontrol(cmd, lmv->tgts[i].ltd_exp, len, karg, uarg);
416                 if (err) {
417                         if (lmv->tgts[i].active) {
418                                 CERROR("error: iocontrol MDC %s on MDT"
419                                        "idx %d: err = %d\n",
420                                        lmv->tgts[i].uuid.uuid, i, err);
421                                 if (!rc)
422                                         rc = err;
423                         }
424                 } else
425                         set = 1;
426         }
427         if (!set && !rc)
428                 rc = -EIO;
429
430         RETURN(rc);
431 }
432
433 static int lmv_setup(struct obd_device *obd, obd_count len, void *buf)
434 {
435         int i, rc = 0;
436         struct lmv_desc *desc;
437         struct obd_uuid *uuids;
438         struct lmv_tgt_desc *tgts;
439         struct obd_device *tgt_obd;
440         struct lustre_cfg *lcfg = buf;
441         struct lmv_obd *lmv = &obd->u.lmv;
442         ENTRY;
443
444         if (lcfg->lcfg_inllen1 < 1) {
445                 CERROR("LMV setup requires a descriptor\n");
446                 RETURN(-EINVAL);
447         }
448
449         if (lcfg->lcfg_inllen2 < 1) {
450                 CERROR("LMV setup requires an OST UUID list\n");
451                 RETURN(-EINVAL);
452         }
453
454         desc = (struct lmv_desc *)lcfg->lcfg_inlbuf1;
455         if (sizeof(*desc) > lcfg->lcfg_inllen1) {
456                 CERROR("descriptor size wrong: %d > %d\n",
457                        (int)sizeof(*desc), lcfg->lcfg_inllen1);
458                 RETURN(-EINVAL);
459         }
460
461         uuids = (struct obd_uuid *)lcfg->lcfg_inlbuf2;
462         if (sizeof(*uuids) * desc->ld_tgt_count != lcfg->lcfg_inllen2) {
463                 CERROR("UUID array size wrong: %u * %u != %u\n",
464                        sizeof(*uuids), desc->ld_tgt_count, lcfg->lcfg_inllen2);
465                 RETURN(-EINVAL);
466         }
467
468         lmv->bufsize = sizeof(struct lmv_tgt_desc) * desc->ld_tgt_count;
469         OBD_ALLOC(lmv->tgts, lmv->bufsize);
470         if (lmv->tgts == NULL) {
471                 CERROR("Out of memory\n");
472                 RETURN(-ENOMEM);
473         }
474
475         lmv->desc = *desc;
476         spin_lock_init(&lmv->lmv_lock);
477         
478         for (i = 0, tgts = lmv->tgts; i < desc->ld_tgt_count; i++, tgts++)
479                 tgts->uuid = uuids[i];
480         
481         lmv->max_cookiesize = 0;
482
483         lmv->max_easize = sizeof(struct ll_fid) *
484                 desc->ld_tgt_count + sizeof(struct mea);
485         
486         rc = lmv_setup_mgr(obd);
487         if (rc) {
488                 CERROR("Can't setup LMV object manager, "
489                        "error %d.\n", rc);
490                 OBD_FREE(lmv->tgts, lmv->bufsize);
491         }
492
493         tgt_obd = class_find_client_obd(&lmv->tgts->uuid, LUSTRE_MDC_NAME, 
494                                         &obd->obd_uuid);
495         if (!tgt_obd) {
496                 CERROR("Target %s not attached\n", lmv->tgts->uuid.uuid);
497                 RETURN(-EINVAL);
498         }
499
500         rc = obd_llog_init(obd, &obd->obd_llogs, tgt_obd, 0, NULL);
501         if (rc) {
502                 CERROR("failed to setup llogging subsystems\n");
503         }
504
505         RETURN(rc);
506 }
507
508 static int lmv_statfs(struct obd_device *obd, struct obd_statfs *osfs,
509                       unsigned long max_age)
510 {
511         struct lmv_obd *lmv = &obd->u.lmv;
512         struct obd_statfs temp;
513         int rc = 0, i;
514         ENTRY;
515         
516         rc = lmv_check_connect(obd);
517         if (rc)
518                 RETURN(rc);
519                 
520         for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
521                 if (lmv->tgts[i].ltd_exp == NULL) {
522                         CWARN("%s: NULL export for %d\n", obd->obd_name, i);
523                         continue;
524                 }
525
526                 rc = obd_statfs(lmv->tgts[i].ltd_exp->exp_obd, &temp, max_age);
527                 if (rc) {
528                         CERROR("can't stat MDS #%d (%s)\n", i,
529                                lmv->tgts[i].ltd_exp->exp_obd->obd_name);
530                         RETURN(rc);
531                 }
532                 if (i == 0) {
533                         memcpy(osfs, &temp, sizeof(temp));
534                 } else {
535                         osfs->os_bavail += temp.os_bavail;
536                         osfs->os_blocks += temp.os_blocks;
537                         osfs->os_ffree += temp.os_ffree;
538                         osfs->os_files += temp.os_files;
539                 }
540         }
541         RETURN(rc);
542 }
543
544 static int lmv_cleanup(struct obd_device *obd, int flags) 
545 {
546         struct lmv_obd *lmv = &obd->u.lmv;
547         ENTRY;
548         lmv_cleanup_mgr(obd);
549         OBD_FREE(lmv->tgts, lmv->bufsize);
550         RETURN(0);
551 }
552
553 static int lmv_getstatus(struct obd_export *exp, struct ll_fid *fid)
554 {
555         struct obd_device *obd = exp->exp_obd;
556         struct lmv_obd *lmv = &obd->u.lmv;
557         int rc;
558         ENTRY;
559         rc = lmv_check_connect(obd);
560         if (rc)
561                 RETURN(rc);
562         rc = md_getstatus(lmv->tgts[0].ltd_exp, fid);
563         fid->mds = 0;
564         RETURN(rc);
565 }
566
567 static int lmv_getattr(struct obd_export *exp, struct ll_fid *fid,
568                        unsigned long valid, unsigned int ea_size,
569                        struct ptlrpc_request **request)
570 {
571         struct obd_device *obd = exp->exp_obd;
572         struct lmv_obd *lmv = &obd->u.lmv;
573         int rc, i = fid->mds;
574         struct lmv_obj *obj;
575         ENTRY;
576
577         rc = lmv_check_connect(obd);
578         if (rc)
579                 RETURN(rc);
580
581         LASSERT(i < lmv->desc.ld_tgt_count);
582
583         rc = md_getattr(lmv->tgts[i].ltd_exp, fid, valid,
584                         ea_size, request);
585         if (rc)
586                 RETURN(rc);
587         
588         obj = lmv_grab_obj(obd, fid);
589         
590         CDEBUG(D_OTHER, "GETATTR for %lu/%lu/%lu %s\n",
591                (unsigned long)fid->mds, (unsigned long)fid->id,
592                (unsigned long)fid->generation, obj ? "(splitted)" : "");
593
594         /* if object is splitted, then we loop over all the slaves and gather
595          * size attribute. In ideal world we would have to gather also mds field
596          * from all slaves, as object is spread over the cluster and this is
597          * definitely interesting information and it is not good to loss it,
598          * but...*/
599         if (obj) {
600                 struct mds_body *body;
601
602                 if (*request == NULL) {
603                         lmv_put_obj(obj);
604                         RETURN(rc);
605                 }
606                         
607                 body = lustre_msg_buf((*request)->rq_repmsg, 0,
608                                       sizeof(*body));
609                 LASSERT(body != NULL);
610
611                 lmv_lock_obj(obj);
612         
613                 for (i = 0; i < obj->objcount; i++) {
614
615                         if (lmv->tgts[i].ltd_exp == NULL) {
616                                 CWARN("%s: NULL export for %d\n",
617                                       obd->obd_name, i);
618                                 continue;
619                         }
620
621                         /* skip master obj. */
622                         if (fid_equal(&obj->fid, &obj->objs[i].fid))
623                                 continue;
624                         
625                         body->size += obj->objs[i].size;
626                 }
627
628                 lmv_unlock_obj(obj);
629                 lmv_put_obj(obj);
630         }
631         
632         RETURN(rc);
633 }
634
635 static int lmv_change_cbdata(struct obd_export *exp, struct ll_fid *fid, 
636                              ldlm_iterator_t it, void *data)
637 {
638         struct obd_device *obd = exp->exp_obd;
639         struct lmv_obd *lmv = &obd->u.lmv;
640         int rc = 0;
641         ENTRY;
642         
643         rc = lmv_check_connect(obd);
644         if (rc)
645                 RETURN(rc);
646         
647         CDEBUG(D_OTHER, "CBDATA for %lu/%lu/%lu\n", (unsigned long)fid->mds,
648                (unsigned long)fid->id, (unsigned long)fid->generation);
649         
650         LASSERT(fid->mds < lmv->desc.ld_tgt_count);
651
652         rc = md_change_cbdata(lmv->tgts[fid->mds].ltd_exp,
653                               fid, it, data);
654         
655         RETURN(rc);
656 }
657
658 static int lmv_change_cbdata_name(struct obd_export *exp, struct ll_fid *pfid,
659                                   char *name, int len, struct ll_fid *cfid,
660                                   ldlm_iterator_t it, void *data)
661 {
662         struct obd_device *obd = exp->exp_obd;
663         struct lmv_obd *lmv = &obd->u.lmv;
664         struct lmv_obj *obj;
665         int rc = 0, mds;
666         ENTRY;
667
668         rc = lmv_check_connect(obd);
669         if (rc)
670                 RETURN(rc);
671
672         LASSERT(pfid->mds < lmv->desc.ld_tgt_count);
673         LASSERT(cfid->mds < lmv->desc.ld_tgt_count);
674         
675         CDEBUG(D_OTHER, "CBDATA for %lu/%lu/%lu:%*s -> %lu/%lu/%lu\n",
676                (unsigned long)pfid->mds, (unsigned long)pfid->id,
677                (unsigned long)pfid->generation, len, name,
678                (unsigned long)cfid->mds, (unsigned long)cfid->id,
679                (unsigned long)cfid->generation);
680
681         /* this is default mds for directory name belongs to. */
682         mds = pfid->mds;
683         obj = lmv_grab_obj(obd, pfid);
684         if (obj) {
685                 /* directory is splitted. look for right mds for this name. */
686                 mds = raw_name2idx(obj->hashtype, obj->objcount, name, len);
687                 mds = obj->objs[mds].fid.mds;
688                 lmv_put_obj(obj);
689         }
690         rc = md_change_cbdata(lmv->tgts[mds].ltd_exp, cfid, it, data);
691         RETURN(rc);
692 }
693
694 static int lmv_valid_attrs(struct obd_export *exp, struct ll_fid *fid) 
695 {
696         struct obd_device *obd = exp->exp_obd;
697         struct lmv_obd *lmv = &obd->u.lmv;
698         int rc = 0;
699         ENTRY;
700         rc = lmv_check_connect(obd);
701         if (rc)
702                 RETURN(rc);
703         CDEBUG(D_OTHER, "validate %lu/%lu/%lu\n", (unsigned long) fid->mds,
704                (unsigned long) fid->id, (unsigned long) fid->generation);
705         LASSERT(fid->mds < lmv->desc.ld_tgt_count);
706         rc = md_valid_attrs(lmv->tgts[fid->mds].ltd_exp, fid);
707         RETURN(rc);
708 }
709
710 int lmv_close(struct obd_export *exp, struct obdo *obdo,
711                   struct obd_client_handle *och,
712                   struct ptlrpc_request **request)
713 {
714         struct obd_device *obd = exp->exp_obd;
715         struct lmv_obd *lmv = &obd->u.lmv;
716         int rc, i = obdo->o_mds;
717         ENTRY;
718         rc = lmv_check_connect(obd);
719         if (rc)
720                 RETURN(rc);
721         LASSERT(i < lmv->desc.ld_tgt_count);
722         CDEBUG(D_OTHER, "CLOSE %lu/%lu/%lu\n", (unsigned long) obdo->o_mds,
723                (unsigned long) obdo->o_id, (unsigned long) obdo->o_generation);
724         rc = md_close(lmv->tgts[i].ltd_exp, obdo, och, request);
725         RETURN(rc);
726 }
727
728 int lmv_get_mea_and_update_object(struct obd_export *exp, struct ll_fid *fid)
729 {
730         struct obd_device *obd = exp->exp_obd;
731         struct lmv_obd *lmv = &obd->u.lmv;
732         struct ptlrpc_request *req = NULL;
733         struct lmv_obj *obj;
734         struct lustre_md md;
735         unsigned long valid;
736         int mealen, rc;
737
738         md.mea = NULL;
739         mealen = MEA_SIZE_LMV(lmv);
740         
741         valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
742
743         /* time to update mea of parent fid */
744         rc = md_getattr(lmv->tgts[fid->mds].ltd_exp, fid,
745                         valid, mealen, &req);
746         if (rc) {
747                 CERROR("md_getattr() failed, error %d\n", rc);
748                 GOTO(cleanup, rc);
749         }
750
751         rc = mdc_req2lustre_md(exp, req, 0, NULL, &md);
752         if (rc) {
753                 CERROR("mdc_req2lustre_md() failed, error %d\n", rc);
754                 GOTO(cleanup, rc);
755         }
756
757         if (md.mea == NULL)
758                 GOTO(cleanup, rc = -ENODATA);
759
760         obj = lmv_create_obj(exp, fid, md.mea);
761         if (IS_ERR(obj))
762                 rc = PTR_ERR(obj);
763         
764         lmv_put_obj(obj);
765         obd_free_memmd(exp, (struct lov_stripe_md **)&md.mea);
766
767 cleanup:
768         if (req)
769                 ptlrpc_req_finished(req);
770         RETURN(rc);
771 }
772
773 int lmv_create(struct obd_export *exp, struct mdc_op_data *op_data,
774                const void *data, int datalen, int mode, __u32 uid,
775                __u32 gid, __u64 rdev, struct ptlrpc_request **request)
776 {
777         struct obd_device *obd = exp->exp_obd;
778         struct lmv_obd *lmv = &obd->u.lmv;
779         struct mds_body *body;
780         struct lmv_obj *obj;
781         int rc, mds, loop = 0;
782         ENTRY;
783
784         rc = lmv_check_connect(obd);
785         if (rc)
786                 RETURN(rc);
787
788         if (!lmv->desc.ld_active_tgt_count)
789                 RETURN(-EIO);
790 repeat:
791         LASSERT(++loop <= 2);
792         obj = lmv_grab_obj(obd, &op_data->fid1);
793         if (obj) {
794                 mds = raw_name2idx(obj->hashtype, obj->objcount, op_data->name,
795                                    op_data->namelen);
796                 op_data->fid1 = obj->objs[mds].fid;
797                 lmv_put_obj(obj);
798         }
799
800         CDEBUG(D_OTHER, "CREATE '%*s' on %lu/%lu/%lu\n", op_data->namelen,
801                op_data->name, (unsigned long)op_data->fid1.mds,
802                (unsigned long)op_data->fid1.id,
803                (unsigned long)op_data->fid1.generation);
804         
805         rc = md_create(lmv->tgts[op_data->fid1.mds].ltd_exp, op_data, data,
806                        datalen, mode, uid, gid, rdev, request);
807         if (rc == 0) {
808                 if (*request == NULL)
809                         RETURN(rc);
810
811                 body = lustre_msg_buf((*request)->rq_repmsg, 0,
812                                       sizeof(*body));
813                 LASSERT(body != NULL);
814                 
815                 CDEBUG(D_OTHER, "created. id = %lu, generation = %lu, "
816                        "mds = %d\n", (unsigned long)body->fid1.id,
817                        (unsigned long)body->fid1.generation, op_data->fid1.mds);
818                 
819                 LASSERT(body->valid & OBD_MD_MDS ||
820                         body->mds == op_data->fid1.mds);
821         } else if (rc == -ERESTART) {
822                 /* directory got splitted. time to update local object and
823                  * repeat the request with proper MDS */
824                 rc = lmv_get_mea_and_update_object(exp, &op_data->fid1);
825                 if (rc == 0) {
826                         ptlrpc_req_finished(*request);
827                         goto repeat;
828                 }
829         }
830         RETURN(rc);
831 }
832
833 int lmv_done_writing(struct obd_export *exp, struct obdo *obdo)
834 {
835         struct obd_device *obd = exp->exp_obd;
836         struct lmv_obd *lmv = &obd->u.lmv;
837         int rc;
838         ENTRY;
839         rc = lmv_check_connect(obd);
840         if (rc)
841                 RETURN(rc);
842
843         /* FIXME: choose right MDC here */
844         CWARN("this method isn't implemented yet\n");
845         rc = md_done_writing(lmv->tgts[0].ltd_exp, obdo);
846         RETURN(rc);
847 }
848
849 int lmv_enqueue_slaves(struct obd_export *exp, int locktype,
850                        struct lookup_intent *it, int lockmode,
851                        struct mdc_op_data *data, struct lustre_handle *lockh,
852                        void *lmm, int lmmsize, ldlm_completion_callback cb_completion,
853                        ldlm_blocking_callback cb_blocking, void *cb_data)
854 {
855         struct obd_device *obd = exp->exp_obd;
856         struct lmv_obd *lmv = &obd->u.lmv;
857         struct mea *mea = data->mea1;
858         struct mdc_op_data data2;
859         int i, rc, mds;
860         ENTRY;
861
862         LASSERT(mea != NULL);
863         for (i = 0; i < mea->mea_count; i++) {
864                 memset(&data2, 0, sizeof(data2));
865                 data2.fid1 = mea->mea_fids[i];
866                 mds = data2.fid1.mds;
867                 
868                 if (lmv->tgts[mds].ltd_exp == NULL)
869                         continue;
870
871                 rc = md_enqueue(lmv->tgts[mds].ltd_exp, locktype, it, lockmode,
872                                 &data2, lockh + i, lmm, lmmsize, cb_completion,
873                                 cb_blocking, cb_data);
874                 
875                 CDEBUG(D_OTHER, "take lock on slave %lu/%lu/%lu -> %d/%d\n",
876                        (unsigned long)mea->mea_fids[i].mds,
877                        (unsigned long)mea->mea_fids[i].id,
878                        (unsigned long)mea->mea_fids[i].generation,
879                        rc, it->d.lustre.it_status);
880                 if (rc)
881                         GOTO(cleanup, rc);
882                 if (it->d.lustre.it_data) {
883                         struct ptlrpc_request *req;
884                         req = (struct ptlrpc_request *) it->d.lustre.it_data;
885                         ptlrpc_req_finished(req);
886                 }
887                 
888                 if (it->d.lustre.it_status)
889                         GOTO(cleanup, rc = it->d.lustre.it_status);
890         }
891         RETURN(0);
892         
893 cleanup:
894         /* drop all taken locks */
895         while (--i >= 0) {
896                 if (lockh[i].cookie)
897                         ldlm_lock_decref(lockh + i, lockmode);
898                 lockh[i].cookie = 0;
899         }
900         RETURN(rc);
901 }
902
903 int lmv_enqueue(struct obd_export *exp, int lock_type,
904                 struct lookup_intent *it, int lock_mode,
905                 struct mdc_op_data *data, struct lustre_handle *lockh,
906                 void *lmm, int lmmsize, ldlm_completion_callback cb_completion,
907                 ldlm_blocking_callback cb_blocking, void *cb_data)
908 {
909         struct obd_device *obd = exp->exp_obd;
910         struct lmv_obd *lmv = &obd->u.lmv;
911         struct lmv_obj *obj;
912         int rc, mds;
913         ENTRY;
914
915         rc = lmv_check_connect(obd);
916         if (rc)
917                 RETURN(rc);
918
919         if (it->it_op == IT_UNLINK) {
920                 rc = lmv_enqueue_slaves(exp, lock_type, it, lock_mode,
921                                         data, lockh, lmm, lmmsize,
922                                         cb_completion, cb_blocking, cb_data);
923                 RETURN(rc);
924         }
925
926         if (data->namelen) {
927                 obj = lmv_grab_obj(obd, &data->fid1);
928                 if (obj) {
929                         /* directory is splitted. look for right mds for this
930                          * name */
931                         mds = raw_name2idx(obj->hashtype, obj->objcount,
932                                            (char *)data->name, data->namelen);
933                         data->fid1 = obj->objs[mds].fid;
934                         lmv_put_obj(obj);
935                 }
936         }
937         CDEBUG(D_OTHER, "ENQUEUE '%s' on %lu/%lu\n", LL_IT2STR(it),
938                (unsigned long)data->fid1.id, (unsigned long)data->fid1.generation);
939         
940         rc = md_enqueue(lmv->tgts[data->fid1.mds].ltd_exp, lock_type, it,
941                         lock_mode, data, lockh, lmm, lmmsize, cb_completion,
942                         cb_blocking, cb_data);
943
944         RETURN(rc);
945 }
946
947 int lmv_getattr_name(struct obd_export *exp, struct ll_fid *fid,
948                      char *filename, int namelen, unsigned long valid,
949                      unsigned int ea_size, struct ptlrpc_request **request)
950 {
951         struct obd_device *obd = exp->exp_obd;
952         struct lmv_obd *lmv = &obd->u.lmv;
953         struct ll_fid rfid = *fid;
954         int rc, mds = fid->mds, loop = 0;
955         struct mds_body *body;
956         struct lmv_obj *obj;
957         ENTRY;
958         rc = lmv_check_connect(obd);
959         if (rc)
960                 RETURN(rc);
961 repeat:
962         LASSERT(++loop <= 2);
963         obj = lmv_grab_obj(obd, fid);
964         if (obj) {
965                 /* directory is splitted. look for right mds for this name */
966                 mds = raw_name2idx(obj->hashtype, obj->objcount, filename, namelen - 1);
967                 rfid = obj->objs[mds].fid;
968                 lmv_put_obj(obj);
969         }
970         
971         CDEBUG(D_OTHER, "getattr_name for %*s on %lu/%lu/%lu -> %lu/%lu/%lu\n",
972                namelen, filename, (unsigned long)fid->mds,
973                (unsigned long)fid->id, (unsigned long)fid->generation,
974                (unsigned long)rfid.mds, (unsigned long)rfid.id,
975                (unsigned long)rfid.generation);
976
977         rc = md_getattr_name(lmv->tgts[rfid.mds].ltd_exp, &rfid, filename,
978                              namelen, valid, ea_size, request);
979         if (rc == 0) {
980                 /* this could be cross-node reference. in this case all we have
981                  * right now is mds/ino/generation triple. we'd like to find
982                  * other attributes */
983                 body = lustre_msg_buf((*request)->rq_repmsg, 0, sizeof(*body));
984                 LASSERT(body != NULL);
985                 if (body->valid & OBD_MD_MDS) {
986                         struct ptlrpc_request *req = NULL;
987                         rfid = body->fid1;
988                         CDEBUG(D_OTHER, "request attrs for %lu/%lu/%lu\n",
989                                (unsigned long) rfid.mds,
990                                (unsigned long) rfid.id,
991                                (unsigned long) rfid.generation);
992                         rc = md_getattr_name(lmv->tgts[rfid.mds].ltd_exp, &rfid,
993                                              NULL, 1, valid, ea_size, &req);
994                         ptlrpc_req_finished(*request);
995                         *request = req;
996                 }
997         } else if (rc == -ERESTART) {
998                 /* directory got splitted. time to update local object and
999                  * repeat the request with proper MDS */
1000                 rc = lmv_get_mea_and_update_object(exp, &rfid);
1001                 if (rc == 0) {
1002                         ptlrpc_req_finished(*request);
1003                         goto repeat;
1004                 }
1005         }
1006         RETURN(rc);
1007 }
1008
1009
1010 /*
1011  * llite passes fid of an target inode in data->fid1 and fid of directory in
1012  * data->fid2
1013  */
1014 int lmv_link(struct obd_export *exp, struct mdc_op_data *data,
1015              struct ptlrpc_request **request)
1016 {
1017         struct obd_device *obd = exp->exp_obd;
1018         struct lmv_obd *lmv = &obd->u.lmv;
1019         struct lmv_obj *obj;
1020         int rc;
1021         ENTRY;
1022         
1023         rc = lmv_check_connect(obd);
1024         if (rc)
1025                 RETURN(rc);
1026
1027         if (data->namelen != 0) {
1028                 /* usual link request */
1029                 obj = lmv_grab_obj(obd, &data->fid1);
1030                 if (obj) {
1031                         rc = raw_name2idx(obj->hashtype, obj->objcount, data->name,
1032                                           data->namelen);
1033                         data->fid1 = obj->objs[rc].fid;
1034                         lmv_put_obj(obj);
1035                 }
1036                 
1037                 CDEBUG(D_OTHER,"link %lu/%lu/%lu:%*s to %lu/%lu/%lu mds %lu\n",
1038                        (unsigned long)data->fid2.mds,
1039                        (unsigned long)data->fid2.id,
1040                        (unsigned long)data->fid2.generation,
1041                        data->namelen, data->name,
1042                        (unsigned long)data->fid1.mds,
1043                        (unsigned long)data->fid1.id,
1044                        (unsigned long)data->fid1.generation,
1045                        (unsigned long)data->fid1.mds);
1046         } else {
1047                 /* request from MDS to acquire i_links for inode by fid1 */
1048                 CDEBUG(D_OTHER, "inc i_nlinks for %lu/%lu/%lu\n",
1049                        (unsigned long)data->fid1.mds,
1050                        (unsigned long)data->fid1.id,
1051                        (unsigned long)data->fid1.generation);
1052         }
1053                         
1054         rc = md_link(lmv->tgts[data->fid1.mds].ltd_exp, data, request);
1055         RETURN(rc);
1056 }
1057
1058 int lmv_rename(struct obd_export *exp, struct mdc_op_data *data,
1059                const char *old, int oldlen, const char *new, int newlen,
1060                struct ptlrpc_request **request)
1061 {
1062         struct obd_device *obd = exp->exp_obd;
1063         struct lmv_obd *lmv = &obd->u.lmv;
1064         struct lmv_obj *obj;
1065         int rc, mds;
1066         ENTRY;
1067
1068         CDEBUG(D_OTHER, "rename %*s in %lu/%lu/%lu to %*s in %lu/%lu/%lu\n",
1069                oldlen, old, (unsigned long)data->fid1.mds,
1070                (unsigned long)data->fid1.id,
1071                (unsigned long)data->fid1.generation,
1072                newlen, new, (unsigned long) data->fid2.mds,
1073                (unsigned long) data->fid2.id,
1074                (unsigned long) data->fid2.generation);
1075         
1076         if (!fid_equal(&data->fid1, &data->fid2))
1077                 CWARN("cross-node rename %lu/%lu/%lu:%*s to %lu/%lu/%lu:%*s\n",
1078                       (unsigned long)data->fid1.mds,
1079                       (unsigned long)data->fid1.id,
1080                       (unsigned long)data->fid1.generation, oldlen, old,
1081                       (unsigned long)data->fid2.mds,
1082                       (unsigned long)data->fid2.id,
1083                       (unsigned long)data->fid2.generation, newlen, new);
1084
1085         rc = lmv_check_connect(obd);
1086         if (rc)
1087                 RETURN(rc);
1088
1089         if (oldlen == 0) {
1090                 /* MDS with old dir entry is asking another MDS to create name
1091                  * there */
1092                 CDEBUG(D_OTHER,
1093                        "create %*s(%d/%d) in %lu/%lu/%lu pointing to %lu/%lu/%lu\n",
1094                        newlen, new, oldlen, newlen,
1095                        (unsigned long)data->fid2.mds,
1096                        (unsigned long)data->fid2.id,
1097                        (unsigned long)data->fid2.generation,
1098                        (unsigned long)data->fid1.mds,
1099                        (unsigned long)data->fid1.id,
1100                        (unsigned long)data->fid1.generation);
1101                 mds = data->fid2.mds;
1102                 goto request;
1103         }
1104
1105         obj = lmv_grab_obj(obd, &data->fid1);
1106         if (obj) {
1107                 /* directory is already splitted, so we have to forward request
1108                  * to the right MDS */
1109                 mds = raw_name2idx(obj->hashtype, obj->objcount, (char *)old, oldlen);
1110                 data->fid1 = obj->objs[mds].fid;
1111                 CDEBUG(D_OTHER, "forward to MDS #%u (%lu/%lu/%lu)\n", mds,
1112                        (unsigned long)obj->objs[mds].fid.mds,
1113                        (unsigned long)obj->objs[mds].fid.id,
1114                        (unsigned long)obj->objs[mds].fid.generation);
1115                 lmv_put_obj(obj);
1116         }
1117
1118         obj = lmv_grab_obj(obd, &data->fid2);
1119         if (obj) {
1120                 /* directory is already splitted, so we have to forward request
1121                  * to the right MDS */
1122                 mds = raw_name2idx(obj->hashtype, obj->objcount, (char *)new, newlen);
1123                 data->fid2 = obj->objs[mds].fid;
1124                 CDEBUG(D_OTHER, "forward to MDS #%u (%lu/%lu/%lu)\n", mds,
1125                        (unsigned long)obj->objs[mds].fid.mds,
1126                        (unsigned long)obj->objs[mds].fid.id,
1127                        (unsigned long)obj->objs[mds].fid.generation);
1128                 lmv_put_obj(obj);
1129         }
1130         
1131         mds = data->fid1.mds;
1132
1133 request:
1134         rc = md_rename(lmv->tgts[mds].ltd_exp, data, old, oldlen,
1135                        new, newlen, request); 
1136         RETURN(rc);
1137 }
1138
1139 int lmv_setattr(struct obd_export *exp, struct mdc_op_data *data,
1140                 struct iattr *iattr, void *ea, int ealen, void *ea2,
1141                 int ea2len, struct ptlrpc_request **request)
1142 {
1143         struct obd_device *obd = exp->exp_obd;
1144         struct lmv_obd *lmv = &obd->u.lmv;
1145         struct ptlrpc_request *req;
1146         struct mds_body *body;
1147         struct lmv_obj *obj;
1148         int rc = 0, i;
1149         ENTRY;
1150
1151         rc = lmv_check_connect(obd);
1152         if (rc)
1153                 RETURN(rc);
1154
1155         obj = lmv_grab_obj(obd, &data->fid1);
1156         
1157         CDEBUG(D_OTHER, "SETATTR for %lu/%lu/%lu, valid 0x%x%s\n",
1158                (unsigned long)data->fid1.mds, (unsigned long)data->fid1.id,
1159                (unsigned long)data->fid1.generation, iattr->ia_valid,
1160                obj ? ", splitted" : "");
1161         
1162         if (obj) {
1163                 for (i = 0; i < obj->objcount; i++) {
1164                         data->fid1 = obj->objs[i].fid;
1165                         
1166                         rc = md_setattr(lmv->tgts[data->fid1.mds].ltd_exp, data,
1167                                         iattr, ea, ealen, ea2, ea2len, &req);
1168
1169                         if (fid_equal(&obj->fid, &obj->objs[i].fid)) {
1170                                 /* this is master object and this request should
1171                                  * be returned back to llite */
1172                                 *request = req;
1173                         } else {
1174                                 ptlrpc_req_finished(req);
1175                         }
1176
1177                         if (rc)
1178                                 break;
1179                 }
1180                 lmv_put_obj(obj);
1181         } else {
1182                 LASSERT(data->fid1.mds < lmv->desc.ld_tgt_count);
1183                 rc = md_setattr(lmv->tgts[data->fid1.mds].ltd_exp, data,
1184                                 iattr, ea, ealen, ea2, ea2len, request); 
1185                 if (rc == 0) {
1186                         body = lustre_msg_buf((*request)->rq_repmsg, 0,
1187                                               sizeof(*body));
1188                         LASSERT(body != NULL);
1189                         LASSERT(body->mds == data->fid1.mds);
1190                 }
1191         }
1192         RETURN(rc);
1193 }
1194
1195 int lmv_sync(struct obd_export *exp, struct ll_fid *fid,
1196              struct ptlrpc_request **request)
1197 {
1198         struct obd_device *obd = exp->exp_obd;
1199         struct lmv_obd *lmv = &obd->u.lmv;
1200         int rc;
1201         ENTRY;
1202
1203         rc = lmv_check_connect(obd);
1204         if (rc)
1205                 RETURN(rc);
1206
1207         rc = md_sync(lmv->tgts[fid->mds].ltd_exp, fid, request); 
1208         RETURN(rc);
1209 }
1210
1211 int lmv_dirobj_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
1212                             void *data, int flag)
1213 {
1214         struct lustre_handle lockh;
1215         struct lmv_obj *obj;
1216         int rc;
1217         ENTRY;
1218
1219         switch (flag) {
1220         case LDLM_CB_BLOCKING:
1221                 ldlm_lock2handle(lock, &lockh);
1222                 rc = ldlm_cli_cancel(&lockh);
1223                 if (rc < 0) {
1224                         CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
1225                         RETURN(rc);
1226                 }
1227                 break;
1228         case LDLM_CB_CANCELING:
1229                 /* time to drop cached attrs for dirobj */
1230                 obj = lock->l_ast_data;
1231                 if (obj) {
1232                         CDEBUG(D_OTHER, "cancel %s on %lu/%lu, master %lu/%lu/%lu\n",
1233                                lock->l_resource->lr_name.name[3] == 1 ? "LOOKUP" : "UPDATE",
1234                                (unsigned long)lock->l_resource->lr_name.name[0],
1235                                (unsigned long)lock->l_resource->lr_name.name[1],
1236                                (unsigned long)obj->fid.mds, (unsigned long)obj->fid.id,
1237                                (unsigned long)obj->fid.generation);
1238                         lmv_put_obj(obj);
1239                 }
1240                 break;
1241         default:
1242                 LBUG();
1243         }
1244         RETURN(0);
1245 }
1246
1247 void lmv_remove_dots(struct page *page)
1248 {
1249         char *kaddr = page_address(page);
1250         unsigned limit = PAGE_CACHE_SIZE;
1251         unsigned offs, rec_len;
1252         struct ext2_dir_entry_2 *p;
1253
1254         for (offs = 0; offs <= limit - EXT2_DIR_REC_LEN(1); offs += rec_len) {
1255                 p = (struct ext2_dir_entry_2 *)(kaddr + offs);
1256                 rec_len = le16_to_cpu(p->rec_len);
1257
1258                 if ((p->name_len == 1 && p->name[0] == '.') ||
1259                     (p->name_len == 2 && p->name[0] == '.' && p->name[1] == '.'))
1260                         p->inode = 0;
1261         }
1262 }
1263
1264 int lmv_readpage(struct obd_export *exp, struct ll_fid *mdc_fid,
1265                  __u64 offset, struct page *page,
1266                  struct ptlrpc_request **request)
1267 {
1268         struct obd_device *obd = exp->exp_obd;
1269         struct lmv_obd *lmv = &obd->u.lmv;
1270         struct ll_fid rfid = *mdc_fid;
1271         struct lmv_obj *obj;
1272         int rc, i;
1273         ENTRY;
1274
1275         rc = lmv_check_connect(obd);
1276         if (rc)
1277                 RETURN(rc);
1278
1279         LASSERT(mdc_fid->mds < lmv->desc.ld_tgt_count);
1280         CDEBUG(D_OTHER, "READPAGE at %llu from %lu/%lu/%lu\n",
1281                offset, (unsigned long) rfid.mds,
1282                (unsigned long) rfid.id,
1283                (unsigned long) rfid.generation);
1284
1285         obj = lmv_grab_obj(obd, mdc_fid);
1286         if (obj) {
1287                 lmv_lock_obj(obj);
1288
1289                 /* find dirobj containing page with requested offset. */
1290                 for (i = 0; i < obj->objcount; i++) {
1291                         if (offset < obj->objs[i].size)
1292                                 break;
1293                         offset -= obj->objs[i].size;
1294                 }
1295                 rfid = obj->objs[i].fid;
1296                 
1297                 lmv_unlock_obj(obj);
1298                 lmv_put_obj(obj);
1299                 
1300                 CDEBUG(D_OTHER, "forward to %lu/%lu/%lu with offset %lu\n",
1301                        (unsigned long)rfid.mds, (unsigned long)rfid.id,
1302                        (unsigned long)rfid.generation, (unsigned long)offset);
1303         }
1304         rc = md_readpage(lmv->tgts[rfid.mds].ltd_exp, &rfid, offset,
1305                          page, request);
1306         
1307         if (rc == 0 && !fid_equal(&rfid, mdc_fid))
1308                 /* this page isn't from master object. To avoid "." and ".." 
1309                  * duplication in directory, we have to remove them from all
1310                  * slave objects */
1311                 lmv_remove_dots(page);
1312         
1313         RETURN(rc);
1314 }
1315
1316 int lmv_unlink_slaves(struct obd_export *exp, struct mdc_op_data *data,
1317                       struct ptlrpc_request **req)
1318 {
1319         struct obd_device *obd = exp->exp_obd;
1320         struct lmv_obd *lmv = &obd->u.lmv;
1321         struct mea *mea = data->mea1;
1322         struct mdc_op_data data2;
1323         int i, rc = 0, mds;
1324         ENTRY;
1325
1326         LASSERT(mea != NULL);
1327         for (i = 0; i < mea->mea_count; i++) {
1328                 memset(&data2, 0, sizeof(data2));
1329                 data2.fid1 = mea->mea_fids[i];
1330                 data2.create_mode = MDS_MODE_DONT_LOCK | S_IFDIR;
1331                 mds = data2.fid1.mds;
1332
1333                 if (lmv->tgts[mds].ltd_exp == NULL)
1334                         continue;
1335
1336                 rc = md_unlink(lmv->tgts[mds].ltd_exp, &data2, req);
1337                 CDEBUG(D_OTHER, "unlink slave %lu/%lu/%lu -> %d\n",
1338                        (unsigned long) mea->mea_fids[i].mds,
1339                        (unsigned long) mea->mea_fids[i].id,
1340                        (unsigned long) mea->mea_fids[i].generation, rc);
1341                 if (*req) {
1342                         ptlrpc_req_finished(*req);
1343                         *req = NULL;
1344                 }
1345                 if (rc)
1346                         break;
1347         }
1348         RETURN(rc);
1349 }
1350
1351 int lmv_delete_object(struct obd_export *exp, struct ll_fid *fid)
1352 {
1353         ENTRY;
1354
1355         if (!lmv_delete_obj(exp, fid)) {
1356                 CDEBUG(D_OTHER, "Object %lu/%lu/%lu is not found.\n",
1357                        (unsigned long)fid->mds, (unsigned long)fid->id,
1358                        (unsigned long)fid->generation);
1359         }
1360         
1361         RETURN(0);
1362 }
1363
1364 int lmv_unlink(struct obd_export *exp, struct mdc_op_data *data,
1365                struct ptlrpc_request **request)
1366 {
1367         struct obd_device *obd = exp->exp_obd;
1368         struct lmv_obd *lmv = &obd->u.lmv;
1369         int rc, i = 0;
1370         ENTRY;
1371         
1372         rc = lmv_check_connect(obd);
1373         if (rc)
1374                 RETURN(rc);
1375
1376         if (data->namelen == 0 && data->mea1 != NULL) {
1377                 /* mds asks to remove slave objects */
1378                 rc = lmv_unlink_slaves(exp, data, request);
1379                 RETURN(rc);
1380         } else if (data->namelen != 0) {
1381                 struct lmv_obj *obj;
1382                 
1383                 obj = lmv_grab_obj(obd, &data->fid1);
1384                 if (obj) {
1385                         i = raw_name2idx(obj->hashtype, obj->objcount, data->name,
1386                                          data->namelen);
1387                         data->fid1 = obj->objs[i].fid;
1388                         lmv_put_obj(obj);
1389                 }
1390                 CDEBUG(D_OTHER, "unlink '%*s' in %lu/%lu/%lu -> %u\n",
1391                        data->namelen, data->name,
1392                        (unsigned long) data->fid1.mds,
1393                        (unsigned long) data->fid1.id,
1394                        (unsigned long) data->fid1.generation, i);
1395         } else {
1396                 CDEBUG(D_OTHER, "drop i_nlink on %lu/%lu/%lu\n",
1397                        (unsigned long) data->fid1.mds,
1398                        (unsigned long) data->fid1.id,
1399                        (unsigned long) data->fid1.generation);
1400         }
1401         rc = md_unlink(lmv->tgts[data->fid1.mds].ltd_exp, data, request); 
1402         RETURN(rc);
1403 }
1404
1405 struct obd_device *lmv_get_real_obd(struct obd_export *exp,
1406                                     char *name, int len)
1407 {
1408         struct obd_device *obd = exp->exp_obd;
1409         struct lmv_obd *lmv = &obd->u.lmv;
1410         int rc;
1411         ENTRY;
1412
1413         rc = lmv_check_connect(obd);
1414         if (rc)
1415                 RETURN(ERR_PTR(rc));
1416 #warning "we need well-desgined readdir() implementation to remove this mess"
1417         obd = lmv->tgts[0].ltd_exp->exp_obd;
1418         EXIT;
1419         return obd;
1420 }
1421
1422 int lmv_init_ea_size(struct obd_export *exp, int easize, int cookiesize)
1423 {
1424         struct obd_device *obd = exp->exp_obd;
1425         struct lmv_obd *lmv = &obd->u.lmv;
1426         int i, rc = 0, change = 0;
1427         ENTRY;
1428
1429         if (lmv->max_easize < easize) {
1430                 lmv->max_easize = easize;
1431                 change = 1;
1432         }
1433         if (lmv->max_cookiesize < cookiesize) {
1434                 lmv->max_cookiesize = cookiesize;
1435                 change = 1;
1436         }
1437         if (change == 0)
1438                 RETURN(0);
1439         
1440         if (lmv->connected == 0)
1441                 RETURN(0);
1442
1443         for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
1444                 if (lmv->tgts[i].ltd_exp == NULL) {
1445                         CWARN("%s: NULL export for %d\n", obd->obd_name, i);
1446                         continue;
1447                 }
1448
1449                 rc = obd_init_ea_size(lmv->tgts[i].ltd_exp, easize, cookiesize);
1450                 if (rc) {
1451                         CERROR("obd_init_ea_size() failed on MDT target %d, "
1452                                "error %d.\n", i, rc);
1453                         break;
1454                 }
1455         }
1456         RETURN(rc);
1457 }
1458
1459 int lmv_obd_create_single(struct obd_export *exp, struct obdo *oa,
1460                           struct lov_stripe_md **ea, struct obd_trans_info *oti)
1461 {
1462         struct obd_device *obd = exp->exp_obd;
1463         struct lmv_obd *lmv = &obd->u.lmv;
1464         struct lov_stripe_md obj_md;
1465         struct lov_stripe_md *obj_mdp = &obj_md;
1466         int rc = 0;
1467         ENTRY;
1468
1469         rc = lmv_check_connect(obd);
1470         if (rc)
1471                 RETURN(rc);
1472
1473         LASSERT(ea == NULL);
1474         LASSERT(oa->o_mds < lmv->desc.ld_tgt_count);
1475
1476         rc = obd_create(lmv->tgts[oa->o_mds].ltd_exp, oa, &obj_mdp, oti);
1477
1478         RETURN(rc);
1479 }
1480
1481 /*
1482  * to be called from MDS only
1483  */
1484 int lmv_obd_create(struct obd_export *exp, struct obdo *oa,
1485                    struct lov_stripe_md **ea, struct obd_trans_info *oti)
1486 {
1487         struct obd_device *obd = exp->exp_obd;
1488         struct lmv_obd *lmv = &obd->u.lmv;
1489         int i, c, rc = 0;
1490         struct mea *mea;
1491         struct ll_fid mfid;
1492         int lcount;
1493         ENTRY;
1494
1495         rc = lmv_check_connect(obd);
1496         if (rc)
1497                 RETURN(rc);
1498
1499         LASSERT(oa != NULL);
1500         
1501         if (ea == NULL) {
1502                 rc = lmv_obd_create_single(exp, oa, NULL, oti);
1503                 RETURN(rc);
1504         }
1505
1506         if (*ea == NULL) {
1507                 rc = obd_alloc_diskmd(exp, (struct lov_mds_md **)ea);
1508                 if (rc < 0) {
1509                         CERROR("obd_alloc_diskmd() failed, error %d\n",
1510                                rc);
1511                         RETURN(rc);
1512                 }
1513                 
1514                 if (*ea == NULL)
1515                         RETURN(-EINVAL);
1516         }
1517
1518         rc = 0;
1519         mfid.id = oa->o_id;
1520         mfid.generation = oa->o_generation;
1521         
1522         mea = (struct mea *)*ea;
1523         if (!mea->mea_count || mea->mea_count > lmv->desc.ld_tgt_count)
1524                 mea->mea_count = lmv->desc.ld_tgt_count;
1525         mea->mea_magic = MEA_MAGIC_ALL_CHARS;
1526
1527         mea->mea_master = -1;
1528         lcount = lmv->desc.ld_tgt_count;
1529         for (i = 0, c = 0; c < mea->mea_count && i < lcount; i++) {
1530                 struct lov_stripe_md obj_md;
1531                 struct lov_stripe_md *obj_mdp = &obj_md;
1532                
1533                 if (lmv->tgts[i].ltd_exp == NULL) {
1534                         /* this is master MDS */
1535                         mea->mea_fids[c].id = mfid.id;
1536                         mea->mea_fids[c].generation = mfid.generation;
1537                         mea->mea_fids[c].mds = i;
1538                         mea->mea_master = i;
1539                         c++;
1540                         continue;
1541                 }
1542
1543                 /* "master" MDS should always be part of stripped dir, so scan
1544                    for it. */
1545                 if (mea->mea_master == -1 && c == mea->mea_count - 1)
1546                         continue;
1547
1548                 oa->o_valid = OBD_MD_FLGENER | OBD_MD_FLTYPE | OBD_MD_FLMODE
1549                         | OBD_MD_FLUID | OBD_MD_FLGID | OBD_MD_FLID;
1550
1551                 rc = obd_create(lmv->tgts[c].ltd_exp, oa, &obj_mdp, oti);
1552                 if (rc) {
1553                         CERROR("obd_create() failed on MDT target %d, "
1554                                "error %d\n", c, rc);
1555                         RETURN(rc);
1556                 }
1557
1558                 mea->mea_fids[c].id = oa->o_id;
1559                 mea->mea_fids[c].generation = oa->o_generation;
1560                 mea->mea_fids[c].mds = i;
1561                 c++;
1562                 CDEBUG(D_OTHER, "dirobj at mds %d: "LPU64"/%u\n",
1563                        i, oa->o_id, oa->o_generation);
1564         }
1565         LASSERT(c == mea->mea_count);
1566         CDEBUG(D_OTHER, "%d dirobjects created\n", (int) mea->mea_count);
1567
1568         RETURN(rc);
1569 }
1570
1571 static int lmv_llog_init(struct obd_device *obd, struct obd_llogs *llogs, 
1572                          struct obd_device *tgt, int count,
1573                          struct llog_catid *logid)
1574 {
1575         struct llog_ctxt *ctxt;
1576         int rc;
1577         ENTRY;
1578
1579         rc = obd_llog_setup(obd, llogs, LLOG_CONFIG_REPL_CTXT, tgt, 0, NULL,
1580                             &llog_client_ops);
1581         if (rc == 0) {
1582                 ctxt = llog_get_context(llogs, LLOG_CONFIG_REPL_CTXT);
1583                 ctxt->loc_imp = tgt->u.cli.cl_import;
1584         }
1585
1586         RETURN(rc);
1587 }
1588
1589 static int lmv_llog_finish(struct obd_device *obd,
1590                            struct obd_llogs *llogs, int count)
1591 {
1592         int rc;
1593         ENTRY;
1594
1595         rc = obd_llog_cleanup(llog_get_context(llogs, LLOG_CONFIG_REPL_CTXT));
1596         RETURN(rc);
1597 }
1598
1599 static int lmv_get_info(struct obd_export *exp, __u32 keylen,
1600                         void *key, __u32 *vallen, void *val)
1601 {
1602         struct obd_device *obd;
1603         struct lmv_obd *lmv;
1604         ENTRY;
1605
1606         obd = class_exp2obd(exp);
1607         if (obd == NULL) {
1608                 CDEBUG(D_IOCTL, "invalid client cookie "LPX64"\n",
1609                        exp->exp_handle.h_cookie);
1610                 RETURN(-EINVAL);
1611         }
1612
1613         lmv = &obd->u.lmv;
1614         if (keylen == 6 && memcmp(key, "mdsize", 6) == 0) {
1615                 __u32 *mdsize = val;
1616                 *vallen = sizeof(__u32);
1617                 *mdsize = sizeof(struct ll_fid) * lmv->desc.ld_tgt_count
1618                                 + sizeof(struct mea);
1619                 RETURN(0);
1620         } else if (keylen == 6 && memcmp(key, "mdsnum", 6) == 0) {
1621                 struct obd_uuid *cluuid = &lmv->cluuid;
1622                 struct lmv_tgt_desc *tgts;
1623                 __u32 *mdsnum = val;
1624                 int i;
1625
1626                 for (i = 0, tgts = lmv->tgts; i < lmv->desc.ld_tgt_count; i++, tgts++) {
1627                         if (obd_uuid_equals(&tgts->uuid, cluuid)) {
1628                                 *vallen = sizeof(__u32);
1629                                 *mdsnum = i;
1630                                 RETURN(0);
1631                         }
1632                 }
1633                 LASSERT(0);
1634         }
1635
1636         CDEBUG(D_IOCTL, "invalid key\n");
1637         RETURN(-EINVAL);
1638 }
1639
1640 int lmv_set_info(struct obd_export *exp, obd_count keylen,
1641                  void *key, obd_count vallen, void *val)
1642 {
1643         struct obd_device *obd;
1644         struct lmv_obd *lmv;
1645         ENTRY;
1646
1647         obd = class_exp2obd(exp);
1648         if (obd == NULL) {
1649                 CDEBUG(D_IOCTL, "invalid client cookie "LPX64"\n",
1650                        exp->exp_handle.h_cookie);
1651                 RETURN(-EINVAL);
1652         }
1653         lmv = &obd->u.lmv;
1654
1655         if (keylen >= strlen("inter_mds") && strcmp(key, "inter_mds") == 0) {
1656                 lmv->server_timeout = 1;
1657                 lmv_set_timeouts(obd);
1658                 RETURN(0);
1659         }
1660         
1661         RETURN(-EINVAL);
1662 }
1663
1664 int lmv_packmd(struct obd_export *exp, struct lov_mds_md **lmmp,
1665                struct lov_stripe_md *lsm)
1666 {
1667         struct obd_device *obd = class_exp2obd(exp);
1668         struct lmv_obd *lmv = &obd->u.lmv;
1669         int mea_size;
1670         ENTRY;
1671
1672         mea_size = sizeof(struct ll_fid) * 
1673                 lmv->desc.ld_tgt_count + sizeof(struct mea);
1674         if (!lmmp)
1675                 RETURN(mea_size);
1676
1677         if (*lmmp && !lsm) {
1678                 OBD_FREE(*lmmp, mea_size);
1679                 *lmmp = NULL;
1680                 RETURN(0);
1681         }
1682
1683         if (*lmmp == NULL) {
1684                 OBD_ALLOC(*lmmp, mea_size);
1685                 if (*lmmp == NULL)
1686                         RETURN(-ENOMEM);
1687         }
1688
1689         if (!lsm)
1690                 RETURN(mea_size);
1691
1692 #warning "MEA packing/convertation must be here! -bzzz"
1693         memcpy(*lmmp, lsm, mea_size);
1694         RETURN(mea_size);
1695 }
1696
1697 int lmv_unpackmd(struct obd_export *exp, struct lov_stripe_md **mem_tgt,
1698                  struct lov_mds_md *disk_src, int mdsize)
1699 {
1700         struct obd_device *obd = class_exp2obd(exp);
1701         struct lmv_obd *lmv = &obd->u.lmv;
1702         struct mea **tmea = (struct mea **) mem_tgt;
1703         struct mea *mea = (void *) disk_src;
1704         int mea_size;
1705         ENTRY;
1706
1707         mea_size = sizeof(struct ll_fid) * 
1708                 lmv->desc.ld_tgt_count + sizeof(struct mea);
1709         if (mem_tgt == NULL)
1710                 return mea_size;
1711
1712         if (*mem_tgt != NULL && disk_src == NULL) {
1713                 OBD_FREE(*tmea, mea_size);
1714                 RETURN(0);
1715         }
1716
1717         LASSERT(mea_size == mdsize);
1718
1719         OBD_ALLOC(*tmea, mea_size);
1720         if (*tmea == NULL)
1721                 RETURN(-ENOMEM);
1722
1723         if (!disk_src)
1724                 RETURN(mea_size);
1725
1726 #warning "MEA unpacking/convertation must be here! -bzzz"
1727         memcpy(*tmea, mea, mdsize);
1728         RETURN(mea_size);
1729 }
1730
1731 int lmv_brw(int rw, struct obd_export *exp, struct obdo *oa,
1732             struct lov_stripe_md *ea, obd_count oa_bufs,
1733             struct brw_page *pgarr, struct obd_trans_info *oti)
1734 {
1735         struct obd_device *obd = exp->exp_obd;
1736         struct lmv_obd *lmv = &obd->u.lmv;
1737         struct mea *mea = (struct mea *) ea;
1738         int err;
1739       
1740         LASSERT(oa != NULL);
1741         LASSERT(ea != NULL);
1742         LASSERT(pgarr != NULL);
1743         LASSERT(oa->o_mds < lmv->desc.ld_tgt_count);
1744
1745         oa->o_gr = mea->mea_fids[oa->o_mds].generation;
1746         oa->o_id = mea->mea_fids[oa->o_mds].id;
1747         oa->o_valid =  OBD_MD_FLID | OBD_MD_FLGROUP;
1748         err = obd_brw(rw, lmv->tgts[oa->o_mds].ltd_exp, oa,
1749                       NULL, oa_bufs, pgarr, oti);
1750         RETURN(err);
1751 }
1752
1753 struct obd_ops lmv_obd_ops = {
1754         .o_owner                = THIS_MODULE,
1755         .o_attach               = lmv_attach,
1756         .o_detach               = lmv_detach,
1757         .o_setup                = lmv_setup,
1758         .o_cleanup              = lmv_cleanup,
1759         .o_connect              = lmv_connect,
1760         .o_disconnect           = lmv_disconnect,
1761         .o_statfs               = lmv_statfs,
1762         .o_llog_init            = lmv_llog_init,
1763         .o_llog_finish          = lmv_llog_finish,
1764         .o_get_info             = lmv_get_info,
1765         .o_set_info             = lmv_set_info,
1766         .o_create               = lmv_obd_create,
1767         .o_packmd               = lmv_packmd,
1768         .o_unpackmd             = lmv_unpackmd,
1769         .o_brw                  = lmv_brw,
1770         .o_init_ea_size         = lmv_init_ea_size,
1771         .o_notify               = lmv_notify,
1772         .o_iocontrol            = lmv_iocontrol,
1773 };
1774
1775 struct md_ops lmv_md_ops = {
1776         .m_getstatus            = lmv_getstatus,
1777         .m_getattr              = lmv_getattr,
1778         .m_change_cbdata        = lmv_change_cbdata,
1779         .m_change_cbdata_name   = lmv_change_cbdata_name,
1780         .m_close                = lmv_close,
1781         .m_create               = lmv_create,
1782         .m_done_writing         = lmv_done_writing,
1783         .m_enqueue              = lmv_enqueue,
1784         .m_getattr_name         = lmv_getattr_name,
1785         .m_intent_lock          = lmv_intent_lock,
1786         .m_link                 = lmv_link,
1787         .m_rename               = lmv_rename,
1788         .m_setattr              = lmv_setattr,
1789         .m_sync                 = lmv_sync,
1790         .m_readpage             = lmv_readpage,
1791         .m_unlink               = lmv_unlink,
1792         .m_get_real_obd         = lmv_get_real_obd,
1793         .m_valid_attrs          = lmv_valid_attrs,
1794         .m_delete_object        = lmv_delete_object,
1795 };
1796
1797 int __init lmv_init(void)
1798 {
1799         struct lprocfs_static_vars lvars;
1800         int rc;
1801
1802         lprocfs_init_vars(lmv, &lvars);
1803         rc = class_register_type(&lmv_obd_ops, &lmv_md_ops,
1804                                  lvars.module_vars, OBD_LMV_DEVICENAME);
1805         RETURN(rc);
1806 }
1807
#ifdef __KERNEL__
/* Module unload hook: unregister the device type added by lmv_init(). */
static void lmv_exit(void)
{
        class_unregister_type(OBD_LMV_DEVICENAME);
}

/* Kernel module entry/exit points. */
module_init(lmv_init);
module_exit(lmv_exit);

/* Kernel module metadata. */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
MODULE_DESCRIPTION("Lustre Logical Metadata Volume OBD driver");
#endif /* __KERNEL__ */