Whamcloud - gitweb
- fixes in llite with using mds_body vs. mdt_body
[fs/lustre-release.git] / lustre / llite / llite_lib.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  * Lustre Light Super operations
5  *
6  *  Copyright (c) 2002-2005 Cluster File Systems, Inc.
7  *
8  *   This file is part of Lustre, http://www.lustre.org.
9  *
10  *   Lustre is free software; you can redistribute it and/or
11  *   modify it under the terms of version 2 of the GNU General Public
12  *   License as published by the Free Software Foundation.
13  *
14  *   Lustre is distributed in the hope that it will be useful,
15  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
16  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  *   GNU General Public License for more details.
18  *
19  *   You should have received a copy of the GNU General Public License
20  *   along with Lustre; if not, write to the Free Software
21  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22  */
23
24 #define DEBUG_SUBSYSTEM S_LLITE
25
26 #include <linux/module.h>
27 #include <linux/types.h>
28 #include <linux/random.h>
29 #include <linux/version.h>
30
31 #include <linux/lustre_idl.h>
32 #include <linux/lustre_lite.h>
33 #include <linux/lustre_ha.h>
34 #include <linux/lustre_ver.h>
35 #include <linux/lustre_dlm.h>
36 #include <linux/lprocfs_status.h>
37 #include <linux/lustre_disk.h>
38 #include "llite_internal.h"
39
40 kmem_cache_t *ll_file_data_slab;
41
42 LIST_HEAD(ll_super_blocks);
43 spinlock_t ll_sb_lock = SPIN_LOCK_UNLOCKED;
44
45 extern struct address_space_operations ll_aops;
46 extern struct address_space_operations ll_dir_aops;
47
48 #ifndef log2
49 #define log2(n) ffz(~(n))
50 #endif
51
52
53 struct ll_sb_info *ll_init_sbi(void)
54 {
55         struct ll_sb_info *sbi = NULL;
56         class_uuid_t uuid;
57         ENTRY;
58
59         OBD_ALLOC(sbi, sizeof(*sbi));
60         if (!sbi)
61                 RETURN(NULL);
62
63         spin_lock_init(&sbi->ll_lock);
64         spin_lock_init(&sbi->ll_lco.lco_lock);
65         INIT_LIST_HEAD(&sbi->ll_pglist);
66         sbi->ll_pglist_gen = 0;
67         if (num_physpages >> (20 - PAGE_SHIFT) < 512)
68                 sbi->ll_async_page_max = num_physpages / 2;
69         else
70                 sbi->ll_async_page_max = (num_physpages / 4) * 3;
71         sbi->ll_ra_info.ra_max_pages = min(num_physpages / 8,
72                                            SBI_DEFAULT_READAHEAD_MAX);
73
74         INIT_LIST_HEAD(&sbi->ll_conn_chain);
75         INIT_HLIST_HEAD(&sbi->ll_orphan_dentry_list);
76
77         class_generate_random_uuid(uuid);
78         class_uuid_unparse(uuid, &sbi->ll_sb_uuid);
79         CDEBUG(D_HA, "generated uuid: %s\n", sbi->ll_sb_uuid.uuid);
80
81         spin_lock(&ll_sb_lock);
82         list_add_tail(&sbi->ll_list, &ll_super_blocks);
83         spin_unlock(&ll_sb_lock);
84
85         INIT_LIST_HEAD(&sbi->ll_deathrow);
86         spin_lock_init(&sbi->ll_deathrow_lock);
87         RETURN(sbi);
88 }
89
90 void ll_free_sbi(struct super_block *sb)
91 {
92         struct ll_sb_info *sbi = ll_s2sbi(sb);
93         ENTRY;
94
95         if (sbi != NULL) {
96                 spin_lock(&ll_sb_lock);
97                 list_del(&sbi->ll_list);
98                 spin_unlock(&ll_sb_lock);
99                 OBD_FREE(sbi, sizeof(*sbi));
100         }
101         EXIT;
102 }
103
104 static struct dentry_operations ll_d_root_ops = {
105         .d_compare = ll_dcompare,
106 };
107
108 int client_common_fill_super(struct super_block *sb, char *mdc, char *osc)
109 {
110         struct inode *root = 0;
111         struct ll_sb_info *sbi = ll_s2sbi(sb);
112         struct obd_device *obd;
113         struct lu_fid rootfid;
114         struct obd_statfs osfs;
115         struct ptlrpc_request *request = NULL;
116         struct lustre_handle osc_conn = {0, };
117         struct lustre_handle mdc_conn = {0, };
118         struct lustre_md md;
119         struct obd_connect_data *data = NULL;
120         int err;
121         ENTRY;
122
123         obd = class_name2obd(mdc);
124         if (!obd) {
125                 CERROR("MDC %s: not setup or attached\n", mdc);
126                 RETURN(-EINVAL);
127         }
128
129         OBD_ALLOC(data, sizeof(*data));
130         if (data == NULL)
131                 RETURN(-ENOMEM);
132
133         if (proc_lustre_fs_root) {
134                 err = lprocfs_register_mountpoint(proc_lustre_fs_root, sb,
135                                                   osc, mdc);
136                 if (err < 0)
137                         CERROR("could not register mount in /proc/lustre");
138         }
139
140         /* indicate that inodebits locking is supported by this client */
141         data->ocd_connect_flags |= OBD_CONNECT_IBITS;
142         data->ocd_ibits_known = MDS_INODELOCK_FULL;
143
144         if (sb->s_flags & MS_RDONLY)
145                 data->ocd_connect_flags |= OBD_CONNECT_RDONLY;
146         if (sbi->ll_flags & LL_SBI_USER_XATTR)
147                 data->ocd_connect_flags |= OBD_CONNECT_XATTR;
148         data->ocd_connect_flags |= OBD_CONNECT_ACL | OBD_CONNECT_JOIN;
149
150         if (sbi->ll_flags & LL_SBI_FLOCK) {
151                 sbi->ll_fop = &ll_file_operations_flock;
152         } else {
153                 sbi->ll_fop = &ll_file_operations;
154         }
155
156         data->ocd_connect_flags |= OBD_CONNECT_VERSION;
157         data->ocd_version = LUSTRE_VERSION_CODE;
158
159         err = obd_connect(&mdc_conn, obd, &sbi->ll_sb_uuid, data);
160         if (err == -EBUSY) {
161                 CERROR("An MDT (mdc %s) is performing recovery, of which this"
162                        " client is not a part.  Please wait for recovery to "
163                        "complete, abort, or time out.\n", mdc);
164                 GOTO(out, err);
165         } else if (err) {
166                 CERROR("cannot connect to %s: rc = %d\n", mdc, err);
167                 GOTO(out, err);
168         }
169         sbi->ll_mdc_exp = class_conn2export(&mdc_conn);
170
171         err = obd_statfs(obd, &osfs, jiffies - HZ);
172         if (err)
173                 GOTO(out_mdc, err);
174
175         /* async connect is surely finished by now */
176         *data = class_exp2cliimp(sbi->ll_mdc_exp)->imp_connect_data;
177
178         LASSERT(osfs.os_bsize);
179         sb->s_blocksize = osfs.os_bsize;
180         sb->s_blocksize_bits = log2(osfs.os_bsize);
181         sb->s_magic = LL_SUPER_MAGIC;
182         sb->s_maxbytes = PAGE_CACHE_MAXBYTES;
183         sbi->ll_namelen = osfs.os_namelen;
184
185         if ((sbi->ll_flags & LL_SBI_USER_XATTR) &&
186             !(data->ocd_connect_flags & OBD_CONNECT_XATTR)) {
187                 LCONSOLE_INFO("Disabling user_xattr feature because "
188                               "it is not supported on the server\n"); 
189                 sbi->ll_flags &= ~LL_SBI_USER_XATTR;
190         }
191
192         if (data->ocd_connect_flags & OBD_CONNECT_ACL) {
193 #ifdef MS_POSIXACL
194                 sb->s_flags |= MS_POSIXACL;
195 #endif
196                 sbi->ll_flags |= LL_SBI_ACL;
197         } else
198                 sbi->ll_flags &= ~LL_SBI_ACL;
199
200         if (data->ocd_connect_flags & OBD_CONNECT_JOIN)
201                 sbi->ll_flags |= LL_SBI_JOIN;
202
203 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0))
204         /* We set sb->s_dev equal on all lustre clients in order to support
205          * NFS export clustering.  NFSD requires that the FSID be the same
206          * on all clients. */
207         /* s_dev is also used in lt_compare() to compare two fs, but that is
208          * only a node-local comparison. */
209         sb->s_dev = get_uuid2int(sbi2mdc(sbi)->cl_import->imp_target_uuid.uuid,
210                          strlen(sbi2mdc(sbi)->cl_import->imp_target_uuid.uuid));
211 #endif
212
213         obd = class_name2obd(osc);
214         if (!obd) {
215                 CERROR("OSC %s: not setup or attached\n", osc);
216                 GOTO(out_mdc, err);
217         }
218
219         data->ocd_connect_flags =
220                 OBD_CONNECT_GRANT|OBD_CONNECT_VERSION|OBD_CONNECT_REQPORTAL;
221
222         CDEBUG(D_RPCTRACE, "ocd_connect_flags: "LPX64" ocd_version: %d "
223                "ocd_grant: %d\n", data->ocd_connect_flags,
224                data->ocd_version, data->ocd_grant);
225
226         obd->obd_upcall.onu_owner = &sbi->ll_lco;
227         obd->obd_upcall.onu_upcall = ll_ocd_update;
228
229         err = obd_connect(&osc_conn, obd, &sbi->ll_sb_uuid, data);
230         if (err == -EBUSY) {
231                 CERROR("An OST (osc %s) is performing recovery, of which this"
232                        " client is not a part.  Please wait for recovery to "
233                        "complete, abort, or time out.\n", osc);
234                 GOTO(out, err);
235         } else if (err) {
236                 CERROR("cannot connect to %s: rc = %d\n", osc, err);
237                 GOTO(out_mdc, err);
238         }
239         sbi->ll_osc_exp = class_conn2export(&osc_conn);
240         spin_lock(&sbi->ll_lco.lco_lock);
241         sbi->ll_lco.lco_flags = data->ocd_connect_flags;
242         spin_unlock(&sbi->ll_lco.lco_lock);
243
244         mdc_init_ea_size(sbi->ll_mdc_exp, sbi->ll_osc_exp);
245
246         err = obd_prep_async_page(sbi->ll_osc_exp, NULL, NULL, NULL,
247                                   0, NULL, NULL, NULL);
248         if (err < 0) {
249                 LCONSOLE_ERROR("There are no OST's in this filesystem. "
250                                "There must be at least one active OST for "
251                                "a client to start.\n");
252                 GOTO(out_osc, err);
253         }
254
255         if (!ll_async_page_slab) {
256                 ll_async_page_slab_size =
257                         size_round(sizeof(struct ll_async_page)) + err;
258                 ll_async_page_slab = kmem_cache_create("ll_async_page",
259                                                        ll_async_page_slab_size,
260                                                        0, 0, NULL, NULL);
261                 if (!ll_async_page_slab)
262                         GOTO(out_osc, -ENOMEM);
263         }
264
265         err = mdc_getstatus(sbi->ll_mdc_exp, &rootfid);
266         if (err) {
267                 CERROR("cannot mds_connect: rc = %d\n", err);
268                 GOTO(out_osc, err);
269         }
270         CDEBUG(D_SUPER, "rootfid "DFID3"\n", PFID3(&rootfid));
271         sbi->ll_root_fid = rootfid;
272
273         sb->s_op = &lustre_super_operations;
274
275         /* make root inode
276          * XXX: move this to after cbd setup? */
277         err = mdc_getattr(sbi->ll_mdc_exp, &rootfid,
278                           OBD_MD_FLGETATTR | OBD_MD_FLBLOCKS |
279                           (sbi->ll_flags & LL_SBI_ACL ? OBD_MD_FLACL : 0),
280                           0, &request);
281         if (err) {
282                 CERROR("mdc_getattr failed for root: rc = %d\n", err);
283                 GOTO(out_osc, err);
284         }
285
286         err = mdc_req2lustre_md(request, 0, sbi->ll_osc_exp, &md);
287         if (err) {
288                 CERROR("failed to understand root inode md: rc = %d\n", err);
289                 ptlrpc_req_finished (request);
290                 GOTO(out_osc, err);
291         }
292
293         LASSERT(fid_oid(&sbi->ll_root_fid) != 0);
294         root = ll_iget(sb, ll_fid2ino(sbi, &sbi->ll_root_fid), &md);
295         ll_i2info(root)->lli_fid = sbi->ll_root_fid;
296         ptlrpc_req_finished(request);
297
298         if (root == NULL || is_bad_inode(root)) {
299                 mdc_free_lustre_md(sbi->ll_osc_exp, &md);
300                 CERROR("lustre_lite: bad iget4 for root\n");
301                 GOTO(out_root, err = -EBADF);
302         }
303
304         err = ll_close_thread_start(&sbi->ll_lcq);
305         if (err) {
306                 CERROR("cannot start close thread: rc %d\n", err);
307                 GOTO(out_root, err);
308         }
309
310         /* making vm readahead 0 for 2.4.x. In the case of 2.6.x,
311            backing dev info assigned to inode mapping is used for
312            determining maximal readahead. */
313 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)) && \
314     !defined(KERNEL_HAS_AS_MAX_READAHEAD)
315         /* bug 2805 - set VM readahead to zero */
316         vm_max_readahead = vm_min_readahead = 0;
317 #endif
318
319         sb->s_root = d_alloc_root(root);
320         if (data != NULL)
321                 OBD_FREE(data, sizeof(*data));
322         sb->s_root->d_op = &ll_d_root_ops;
323         RETURN(err);
324
325 out_root:
326         if (root)
327                 iput(root);
328 out_osc:
329         obd_disconnect(sbi->ll_osc_exp);
330 out_mdc:
331         obd_disconnect(sbi->ll_mdc_exp);
332 out:
333         if (data != NULL)
334                 OBD_FREE(data, sizeof(*data));
335         lprocfs_unregister_mountpoint(sbi);
336         RETURN(err);
337 }
338
339 int ll_get_max_mdsize(struct ll_sb_info *sbi, int *lmmsize)
340 {
341         int size, rc;
342
343         *lmmsize = obd_size_diskmd(sbi->ll_osc_exp, NULL);
344         size = sizeof(int);
345         rc = obd_get_info(sbi->ll_mdc_exp, strlen("max_easize"), "max_easize", 
346                           &size, lmmsize);
347         if (rc) 
348                 CERROR("Get max mdsize error rc %d \n", rc);
349         
350         RETURN(rc);
351 }
352
353 void ll_dump_inode(struct inode *inode)
354 {
355         struct list_head *tmp;
356         int dentry_count = 0;
357
358         LASSERT(inode != NULL);
359
360         list_for_each(tmp, &inode->i_dentry)
361                 dentry_count++;
362
363         CERROR("inode %p dump: dev=%s ino=%lu mode=%o count=%u, %d dentries\n",
364                inode, ll_i2mdcexp(inode)->exp_obd->obd_name, inode->i_ino,
365                inode->i_mode, atomic_read(&inode->i_count), dentry_count);
366 }
367
368 void lustre_dump_dentry(struct dentry *dentry, int recur)
369 {
370         struct list_head *tmp;
371         int subdirs = 0;
372
373         LASSERT(dentry != NULL);
374
375         list_for_each(tmp, &dentry->d_subdirs)
376                 subdirs++;
377
378         CERROR("dentry %p dump: name=%.*s parent=%.*s (%p), inode=%p, count=%u,"
379                " flags=0x%x, fsdata=%p, %d subdirs\n", dentry,
380                dentry->d_name.len, dentry->d_name.name,
381                dentry->d_parent->d_name.len, dentry->d_parent->d_name.name,
382                dentry->d_parent, dentry->d_inode, atomic_read(&dentry->d_count),
383                dentry->d_flags, dentry->d_fsdata, subdirs);
384         if (dentry->d_inode != NULL)
385                 ll_dump_inode(dentry->d_inode);
386
387         if (recur == 0)
388                 return;
389
390         list_for_each(tmp, &dentry->d_subdirs) {
391                 struct dentry *d = list_entry(tmp, struct dentry, d_child);
392                 lustre_dump_dentry(d, recur - 1);
393         }
394 }
395
396 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
397 void lustre_throw_orphan_dentries(struct super_block *sb)
398 {
399         struct hlist_node *tmp, *next;
400         struct ll_sb_info *sbi = ll_s2sbi(sb);
401
402         /* Do this to get rid of orphaned dentries. That is not really trw. */
403         hlist_for_each_safe(tmp, next, &sbi->ll_orphan_dentry_list) {
404                 struct dentry *dentry = hlist_entry(tmp, struct dentry, d_hash);
405                 CWARN("found orphan dentry %.*s (%p->%p) at unmount, dumping "
406                       "before and after shrink_dcache_parent\n",
407                       dentry->d_name.len, dentry->d_name.name, dentry, next);
408                 lustre_dump_dentry(dentry, 1);
409                 shrink_dcache_parent(dentry);
410                 lustre_dump_dentry(dentry, 1);
411         }
412 }
413 #else
414 #define lustre_throw_orphan_dentries(sb)
415 #endif
416
417 static void prune_deathrow(struct ll_sb_info *sbi, int try)
418 {
419         LIST_HEAD(throw_away);
420         int locked = 0;
421         ENTRY;
422
423         if (try) {
424                 locked = spin_trylock(&sbi->ll_deathrow_lock);
425         } else {
426                 spin_lock(&sbi->ll_deathrow_lock);
427                 locked = 1;
428         }
429
430         if (!locked) {
431                 EXIT;
432                 return;
433         }
434
435         list_splice_init(&sbi->ll_deathrow, &throw_away);
436         spin_unlock(&sbi->ll_deathrow_lock);
437
438         while (!list_empty(&throw_away)) {
439                 struct ll_inode_info *lli;
440                 struct inode *inode;
441
442                 lli = list_entry(throw_away.next, struct ll_inode_info,
443                                  lli_dead_list);
444                 list_del_init(&lli->lli_dead_list);
445
446                 inode = ll_info2i(lli);
447                 d_prune_aliases(inode);
448
449                 CDEBUG(D_INODE, "prune duplicate inode %p inum %lu count %u\n",
450                        inode, inode->i_ino, atomic_read(&inode->i_count));
451                 iput(inode);
452         }
453         EXIT;
454 }
455
456 void client_common_put_super(struct super_block *sb)
457 {
458         struct ll_sb_info *sbi = ll_s2sbi(sb);
459         ENTRY;
460
461         ll_close_thread_shutdown(sbi->ll_lcq);
462
463         /* destroy inodes in deathrow */
464         prune_deathrow(sbi, 0);
465
466         list_del(&sbi->ll_conn_chain);
467         obd_disconnect(sbi->ll_osc_exp);
468
469         lprocfs_unregister_mountpoint(sbi);
470         if (sbi->ll_proc_root) {
471                 lprocfs_remove(sbi->ll_proc_root);
472                 sbi->ll_proc_root = NULL;
473         }
474
475         obd_disconnect(sbi->ll_mdc_exp);
476
477         lustre_throw_orphan_dentries(sb);
478         EXIT;
479 }
480
481 char *ll_read_opt(const char *opt, char *data)
482 {
483         char *value;
484         char *retval;
485         ENTRY;
486
487         CDEBUG(D_SUPER, "option: %s, data %s\n", opt, data);
488         if (strncmp(opt, data, strlen(opt)))
489                 RETURN(NULL);
490         if ((value = strchr(data, '=')) == NULL)
491                 RETURN(NULL);
492
493         value++;
494         OBD_ALLOC(retval, strlen(value) + 1);
495         if (!retval) {
496                 CERROR("out of memory!\n");
497                 RETURN(NULL);
498         }
499
500         memcpy(retval, value, strlen(value)+1);
501         CDEBUG(D_SUPER, "Assigned option: %s, value %s\n", opt, retval);
502         RETURN(retval);
503 }
504
/*
 * Prefix match of option name @opt against @data.
 * Returns @fl when @data starts with @opt, 0 otherwise.
 */
static inline int ll_set_opt(const char *opt, char *data, int fl)
{
        return strncmp(opt, data, strlen(opt)) == 0 ? fl : 0;
}
512
513 /* non-client-specific mount options are parsed in lmd_parse */
514 void ll_options(char *options, int *flags)
515 {
516         int tmp;
517         char *s1 = options, *s2;
518         ENTRY;
519
520         if (!options) {
521                 EXIT;
522                 return;
523         }
524
525         CDEBUG(D_CONFIG, "Parsing opts %s\n", options);
526
527         while (*s1) {
528                 CDEBUG(D_SUPER, "next opt=%s\n", s1);
529                 tmp = ll_set_opt("nolock", s1, LL_SBI_NOLCK);
530                 if (tmp) {
531                         *flags |= tmp;
532                         goto next;
533                 }
534                 tmp = ll_set_opt("flock", s1, LL_SBI_FLOCK);
535                 if (tmp) {
536                         *flags |= tmp;
537                         goto next;
538                 }
539                 tmp = ll_set_opt("noflock", s1, LL_SBI_FLOCK);
540                 if (tmp) {
541                         *flags &= ~tmp;
542                         goto next;
543                 }
544                 tmp = ll_set_opt("user_xattr", s1, LL_SBI_USER_XATTR);
545                 if (tmp) {
546                         *flags |= tmp;
547                         goto next;
548                 }
549                 tmp = ll_set_opt("nouser_xattr", s1, LL_SBI_USER_XATTR);
550                 if (tmp) {
551                         *flags &= ~tmp;
552                         goto next;
553                 }
554                 tmp = ll_set_opt("acl", s1, LL_SBI_ACL);
555                 if (tmp) {
556                         /* Ignore deprecated mount option.  The client will
557                          * always try to mount with ACL support, whether this
558                          * is used depends on whether server supports it. */
559                         goto next;
560                 }
561                 tmp = ll_set_opt("noacl", s1, LL_SBI_ACL);
562                 if (tmp) {
563                         goto next;
564                 }
565
566 next:
567                 /* Find next opt */
568                 s2 = strchr(s1, ',');
569                 if (s2 == NULL) 
570                         break;
571                 s1 = s2 + 1;
572         }
573         EXIT;
574 }
575                 
576 void ll_lli_init(struct ll_inode_info *lli)
577 {
578         sema_init(&lli->lli_open_sem, 1);
579         sema_init(&lli->lli_size_sem, 1);
580         lli->lli_flags = 0;
581         lli->lli_maxbytes = PAGE_CACHE_MAXBYTES;
582         spin_lock_init(&lli->lli_lock);
583         INIT_LIST_HEAD(&lli->lli_pending_write_llaps);
584         lli->lli_inode_magic = LLI_INODE_MAGIC;
585         INIT_LIST_HEAD(&lli->lli_dead_list);
586 }
587
588 int ll_fill_super(struct super_block *sb)
589 {
590         struct lustre_profile *lprof;
591         struct lustre_sb_info *lsi = s2lsi(sb);
592         struct ll_sb_info *sbi;
593         char  *osc = NULL;
594         char  *mdc = NULL;
595         char  *profilenm = get_profile_name(sb);
596         struct config_llog_instance cfg;
597         char   ll_instance[sizeof(sb) * 2 + 1];
598         int    err;
599         ENTRY;
600                                                                                  
601         CDEBUG(D_VFSTRACE, "VFS Op: sb %p\n", sb);
602
603         /* client additional sb info */
604         lsi->lsi_llsbi = sbi = ll_init_sbi();
605         if (!sbi) 
606                 RETURN(-ENOMEM);
607
608         ll_options(lsi->lsi_lmd->lmd_opts, &sbi->ll_flags);
609         
610         /* Generate a string unique to this super, in case some joker tries
611            to mount the same fs at two mount points. 
612            Use the address of the super itself.*/
613         sprintf(ll_instance, "%p", sb);
614         cfg.cfg_instance = ll_instance;
615         cfg.cfg_uuid = lsi->lsi_llsbi->ll_sb_uuid;
616         cfg.cfg_last_idx = 0;
617
618         /* set up client obds */
619         err = lustre_process_log(sb, profilenm, &cfg);
620         if (err < 0) {
621                 CERROR("Unable to process log: %d\n", err);
622                 GOTO(out_free, err);
623         }
624
625         lprof = class_get_profile(profilenm);
626         if (lprof == NULL) {
627                 CERROR("No profile found: %s\n", profilenm);
628                 GOTO(out_free, err = -EINVAL);
629         }
630         CDEBUG(D_CONFIG, "Found profile %s: mdc=%s osc=%s\n", profilenm, 
631                lprof->lp_mdc, lprof->lp_osc);
632
633         OBD_ALLOC(osc, strlen(lprof->lp_osc) +
634                   strlen(ll_instance) + 2);
635         if (!osc) 
636                 GOTO(out_free, err = -ENOMEM);
637         sprintf(osc, "%s-%s", lprof->lp_osc, ll_instance);
638
639         OBD_ALLOC(mdc, strlen(lprof->lp_mdc) +
640                   strlen(ll_instance) + 2);
641         if (!mdc) 
642                 GOTO(out_free, err = -ENOMEM);
643         sprintf(mdc, "%s-%s", lprof->lp_mdc, ll_instance);
644   
645         /* connections, registrations, sb setup */
646         err = client_common_fill_super(sb, mdc, osc);
647   
648 out_free:
649         if (mdc)
650                 OBD_FREE(mdc, strlen(mdc) + 1);
651         if (osc)
652                 OBD_FREE(osc, strlen(osc) + 1);
653         if (err) {
654                 struct obd_device *obd;
655                 int next = 0;
656                 /* like client_put_super below */
657                 while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, &next)) 
658                        != NULL) {
659                         class_manual_cleanup(obd);
660                 }                       
661                 class_del_profile(profilenm);
662                 ll_free_sbi(sb);
663                 lsi->lsi_llsbi = NULL;
664         }
665         RETURN(err);
666 } /* ll_fill_super */
667
668
669 void ll_put_super(struct super_block *sb)
670 {
671         struct config_llog_instance cfg;
672         char   ll_instance[sizeof(sb) * 2 + 1];
673         struct obd_device *obd;
674         struct lustre_sb_info *lsi = s2lsi(sb);
675         struct ll_sb_info *sbi = ll_s2sbi(sb);
676         char *profilenm = get_profile_name(sb);
677         int next = 0;
678         ENTRY;
679
680         CDEBUG(D_VFSTRACE, "VFS Op: sb %p - %s\n", sb, profilenm);
681         
682         sprintf(ll_instance, "%p", sb);
683         cfg.cfg_instance = ll_instance;
684         lustre_end_log(sb, NULL, &cfg);
685         
686         obd = class_exp2obd(sbi->ll_mdc_exp);
687         if (obd) {
688                 int next = 0;
689                 int force = obd->obd_no_recov;
690                 /* We need to set force before the lov_disconnect in 
691                 lustre_common_put_super, since l_d cleans up osc's as well. */
692                 while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, &next)) 
693                        != NULL) {
694                         obd->obd_force = force;
695                 }                       
696         }
697
698         client_common_put_super(sb);
699                 
700         while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, &next)) !=NULL) {
701                 class_manual_cleanup(obd);
702         }                       
703         
704         if (profilenm) 
705                 class_del_profile(profilenm);
706
707         ll_free_sbi(sb);
708         lsi->lsi_llsbi = NULL;
709
710         lustre_common_put_super(sb);
711
712         CDEBUG(D_WARNING, "client umount done\n");
713         EXIT;
714 } /* client_put_super */
715
#ifdef HAVE_REGISTER_CACHE
#include <linux/cache_def.h>

/*
 * Cache shrink callback: forwards @priority to llap_shrink_cache() for every
 * mounted client on ll_super_blocks and sums the results.  The sum is only
 * returned when the kernel's callback type returns int.  @gfp_mask is not
 * used.
 */
#ifdef HAVE_CACHE_RETURN_INT
static int
#else
static void
#endif
ll_shrink_cache(int priority, unsigned int gfp_mask)
{
        struct ll_sb_info *sbi;
        int total = 0;

        list_for_each_entry(sbi, &ll_super_blocks, ll_list) {
                total += llap_shrink_cache(sbi, priority);
        }

#ifdef HAVE_CACHE_RETURN_INT
        return total;
#endif
}

/* Descriptor under which the shrink callback above is made available. */
struct cache_definition ll_cache_definition = {
        .name   = "llap_cache",
        .shrink = ll_shrink_cache
};
#endif /* HAVE_REGISTER_CACHE */
741
742 struct inode *ll_inode_from_lock(struct ldlm_lock *lock)
743 {
744         struct inode *inode = NULL;
745         l_lock(&lock->l_resource->lr_namespace->ns_lock);
746         if (lock->l_ast_data) {
747                 struct ll_inode_info *lli = ll_i2info(lock->l_ast_data);
748                 if (lli->lli_inode_magic == LLI_INODE_MAGIC) {
749                         inode = igrab(lock->l_ast_data);
750                 } else {
751                         inode = lock->l_ast_data;
752                         __LDLM_DEBUG(inode->i_state & I_FREEING ?
753                                      D_INFO : D_WARNING, lock,
754                                      "l_ast_data %p is bogus: magic %08x",
755                                      lock->l_ast_data, lli->lli_inode_magic);
756                         inode = NULL;
757                 }
758         }
759         l_unlock(&lock->l_resource->lr_namespace->ns_lock);
760         return inode;
761 }
762
763 static int null_if_equal(struct ldlm_lock *lock, void *data)
764 {
765         if (data == lock->l_ast_data) {
766                 lock->l_ast_data = NULL;
767
768                 if (lock->l_req_mode != lock->l_granted_mode)
769                         LDLM_ERROR(lock,"clearing inode with ungranted lock");
770         }
771
772         return LDLM_ITER_CONTINUE;
773 }
774
775 void ll_clear_inode(struct inode *inode)
776 {
777         struct lu_fid fid;
778         struct ll_inode_info *lli = ll_i2info(inode);
779         struct ll_sb_info *sbi = ll_i2sbi(inode);
780         ENTRY;
781
782         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
783                inode->i_generation, inode);
784
785         ll_inode2fid(&fid, inode);
786         clear_bit(LLI_F_HAVE_MDS_SIZE_LOCK, &(ll_i2info(inode)->lli_flags));
787         mdc_change_cbdata(sbi->ll_mdc_exp, &fid, null_if_equal, inode);
788
789         if (lli->lli_smd) {
790                 obd_change_cbdata(sbi->ll_osc_exp, lli->lli_smd,
791                                   null_if_equal, inode);
792
793                 obd_free_memmd(sbi->ll_osc_exp, &lli->lli_smd);
794                 lli->lli_smd = NULL;
795         }
796
797         if (lli->lli_symlink_name) {
798                 OBD_FREE(lli->lli_symlink_name,
799                          strlen(lli->lli_symlink_name) + 1);
800                 lli->lli_symlink_name = NULL;
801         }
802
803 #ifdef CONFIG_FS_POSIX_ACL
804         if (lli->lli_posix_acl) {
805                 LASSERT(atomic_read(&lli->lli_posix_acl->a_refcount) == 1);
806                 posix_acl_release(lli->lli_posix_acl);
807                 lli->lli_posix_acl = NULL;
808         }
809 #endif
810
811         lli->lli_inode_magic = LLI_INODE_DEAD;
812
813         spin_lock(&sbi->ll_deathrow_lock);
814         list_del_init(&lli->lli_dead_list);
815         spin_unlock(&sbi->ll_deathrow_lock);
816
817         EXIT;
818 }
819
820 /* If this inode has objects allocated to it (lsm != NULL), then the OST
821  * object(s) determine the file size and mtime.  Otherwise, the MDS will
822  * keep these values until such a time that objects are allocated for it.
823  * We do the MDS operations first, as it is checking permissions for us.
824  * We don't do the MDS RPC if there is nothing that we want to store there,
825  * otherwise there is no harm in updating mtime/atime on the MDS if we are
826  * going to do an RPC anyways.
827  *
828  * If we are doing a truncate, we will send the mtime and ctime updates
829  * to the OST with the punch RPC, otherwise we do an explicit setattr RPC.
830  * I don't believe it is possible to get e.g. ATTR_MTIME_SET and ATTR_SIZE
831  * at the same time.
832  */
833 int ll_setattr_raw(struct inode *inode, struct iattr *attr)
834 {
835         struct ll_inode_info *lli = ll_i2info(inode);
836         struct lov_stripe_md *lsm = lli->lli_smd;
837         struct ll_sb_info *sbi = ll_i2sbi(inode);
838         struct ptlrpc_request *request = NULL;
839         struct mdc_op_data op_data;
840         int ia_valid = attr->ia_valid;
841         int rc = 0;
842         ENTRY;
843
844         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu valid %x\n", inode->i_ino,
845                attr->ia_valid);
846         lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_SETATTR);
847
848         if (ia_valid & ATTR_SIZE) {
849                 if (attr->ia_size > ll_file_maxbytes(inode)) {
850                         CDEBUG(D_INODE, "file too large %llu > "LPU64"\n",
851                                attr->ia_size, ll_file_maxbytes(inode));
852                         RETURN(-EFBIG);
853                 }
854
855                 attr->ia_valid |= ATTR_MTIME | ATTR_CTIME;
856         }
857
858         /* POSIX: check before ATTR_*TIME_SET set (from inode_change_ok) */
859         if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET)) {
860                 if (current->fsuid != inode->i_uid && !capable(CAP_FOWNER))
861                         RETURN(-EPERM);
862         }
863
864         /* We mark all of the fields "set" so MDS/OST does not re-set them */
865         if (attr->ia_valid & ATTR_CTIME) {
866                 attr->ia_ctime = CURRENT_TIME;
867                 attr->ia_valid |= ATTR_CTIME_SET;
868         }
869         if (!(ia_valid & ATTR_ATIME_SET) && (attr->ia_valid & ATTR_ATIME)) {
870                 attr->ia_atime = CURRENT_TIME;
871                 attr->ia_valid |= ATTR_ATIME_SET;
872         }
873         if (!(ia_valid & ATTR_MTIME_SET) && (attr->ia_valid & ATTR_MTIME)) {
874                 attr->ia_mtime = CURRENT_TIME;
875                 attr->ia_valid |= ATTR_MTIME_SET;
876         }
877
878         if (attr->ia_valid & (ATTR_MTIME | ATTR_CTIME))
879                 CDEBUG(D_INODE, "setting mtime %lu, ctime %lu, now = %lu\n",
880                        LTIME_S(attr->ia_mtime), LTIME_S(attr->ia_ctime),
881                        CURRENT_SECONDS);
882
883
884         /* NB: ATTR_SIZE will only be set after this point if the size
885          * resides on the MDS, ie, this file has no objects. */
886         if (lsm)
887                 attr->ia_valid &= ~ATTR_SIZE;
888
889         /* If only OST attributes being set on objects, don't do MDS RPC.
890          * In that case, we need to check permissions and update the local
891          * inode ourselves so we can call obdo_from_inode() always. */
892         if (ia_valid & (lsm ? ~(ATTR_SIZE | ATTR_FROM_OPEN | ATTR_RAW) : ~0)) {
893                 struct lustre_md md;
894                 ll_prepare_mdc_op_data(&op_data, inode, NULL, NULL, 0, 0);
895
896                 rc = mdc_setattr(sbi->ll_mdc_exp, &op_data,
897                                  attr, NULL, 0, NULL, 0, &request);
898
899                 if (rc) {
900                         ptlrpc_req_finished(request);
901                         if (rc != -EPERM && rc != -EACCES)
902                                 CERROR("mdc_setattr fails: rc = %d\n", rc);
903                         RETURN(rc);
904                 }
905
906                 rc = mdc_req2lustre_md(request, 0, sbi->ll_osc_exp, &md);
907                 if (rc) {
908                         ptlrpc_req_finished(request);
909                         RETURN(rc);
910                 }
911
912                 /* We call inode_setattr to adjust timestamps.
913                  * If there is at least some data in file, we cleared ATTR_SIZE
914                  * above to avoid invoking vmtruncate, otherwise it is important
915                  * to call vmtruncate in inode_setattr to update inode->i_size
916                  * (bug 6196) */
917                 rc = inode_setattr(inode, attr);
918
919                 ll_update_inode(inode, &md);
920                 ptlrpc_req_finished(request);
921
922                 if (!lsm || !S_ISREG(inode->i_mode)) {
923                         CDEBUG(D_INODE, "no lsm: not setting attrs on OST\n");
924                         RETURN(rc);
925                 }
926         } else {
927                 /* The OST doesn't check permissions, but the alternative is
928                  * a gratuitous RPC to the MDS.  We already rely on the client
929                  * to do read/write/truncate permission checks, so is mtime OK?
930                  */
931                 if (ia_valid & (ATTR_MTIME | ATTR_ATIME)) {
932                         /* from sys_utime() */
933                         if (!(ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET))) {
934                                 if (current->fsuid != inode->i_uid &&
935                                     (rc=ll_permission(inode,MAY_WRITE,NULL))!=0)
936                                         RETURN(rc);
937                         } else {
938                                 /* from inode_change_ok() */
939                                 if (current->fsuid != inode->i_uid &&
940                                     !capable(CAP_FOWNER))
941                                         RETURN(-EPERM);
942                         }
943                 }
944
945                 /* Won't invoke vmtruncate, as we already cleared ATTR_SIZE */
946                 rc = inode_setattr(inode, attr);
947         }
948
949         /* We really need to get our PW lock before we change inode->i_size.
950          * If we don't we can race with other i_size updaters on our node, like
951          * ll_file_read.  We can also race with i_size propogation to other
952          * nodes through dirtying and writeback of final cached pages.  This
953          * last one is especially bad for racing o_append users on other
954          * nodes. */
955         if (ia_valid & ATTR_SIZE) {
956                 ldlm_policy_data_t policy = { .l_extent = {attr->ia_size,
957                                                            OBD_OBJECT_EOF } };
958                 struct lustre_handle lockh = { 0 };
959                 int err, ast_flags = 0;
960                 /* XXX when we fix the AST intents to pass the discard-range
961                  * XXX extent, make ast_flags always LDLM_AST_DISCARD_DATA
962                  * XXX here. */
963                 if (attr->ia_size == 0)
964                         ast_flags = LDLM_AST_DISCARD_DATA;
965
966                 up(&inode->i_sem);
967                 UP_WRITE_I_ALLOC_SEM(inode);
968                 rc = ll_extent_lock(NULL, inode, lsm, LCK_PW, &policy, &lockh,
969                                     ast_flags);
970 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
971                 DOWN_WRITE_I_ALLOC_SEM(inode);
972                 down(&inode->i_sem);
973 #else
974                 down(&inode->i_sem);
975                 DOWN_WRITE_I_ALLOC_SEM(inode);
976 #endif
977                 if (rc != 0)
978                         RETURN(rc);
979
980                 /* Only ll_inode_size_lock is taken at this level.
981                  * lov_stripe_lock() is grabbed by ll_truncate() only over
982                  * call to obd_adjust_kms().  If vmtruncate returns 0, then
983                  * ll_truncate dropped ll_inode_size_lock() */
984                 ll_inode_size_lock(inode, 0);
985                 rc = vmtruncate(inode, attr->ia_size);
986                 if (rc != 0) {
987                         LASSERT(atomic_read(&lli->lli_size_sem.count) <= 0);
988                         ll_inode_size_unlock(inode, 0);
989                 }
990
991                 err = ll_extent_unlock(NULL, inode, lsm, LCK_PW, &lockh);
992                 if (err) {
993                         CERROR("ll_extent_unlock failed: %d\n", err);
994                         if (!rc)
995                                 rc = err;
996                 }
997         } else if (ia_valid & (ATTR_MTIME | ATTR_MTIME_SET)) {
998                 obd_flag flags;
999                 struct obdo oa;
1000
1001                 CDEBUG(D_INODE, "set mtime on OST inode %lu to %lu\n",
1002                        inode->i_ino, LTIME_S(attr->ia_mtime));
1003                 
1004                 oa.o_id = lsm->lsm_object_id;
1005                 oa.o_valid = OBD_MD_FLID;
1006
1007                 flags = OBD_MD_FLTYPE | OBD_MD_FLATIME |
1008                         OBD_MD_FLMTIME | OBD_MD_FLCTIME |
1009                         OBD_MD_FLFID | OBD_MD_FLGENER;
1010                 
1011                 obdo_from_inode(&oa, inode, flags);
1012                 rc = obd_setattr(sbi->ll_osc_exp, &oa, lsm, NULL);
1013                 if (rc)
1014                         CERROR("obd_setattr fails: rc=%d\n", rc);
1015         }
1016         RETURN(rc);
1017 }
1018
1019 int ll_setattr(struct dentry *de, struct iattr *attr)
1020 {
1021         LBUG(); /* code is unused, but leave this in case of VFS changes */
1022         RETURN(-ENOSYS);
1023 }
1024
1025 int ll_statfs_internal(struct super_block *sb, struct obd_statfs *osfs,
1026                        unsigned long max_age)
1027 {
1028         struct ll_sb_info *sbi = ll_s2sbi(sb);
1029         struct obd_statfs obd_osfs;
1030         int rc;
1031         ENTRY;
1032
1033         rc = obd_statfs(class_exp2obd(sbi->ll_mdc_exp), osfs, max_age);
1034         if (rc) {
1035                 CERROR("mdc_statfs fails: rc = %d\n", rc);
1036                 RETURN(rc);
1037         }
1038
1039         osfs->os_type = sb->s_magic;
1040
1041         CDEBUG(D_SUPER, "MDC blocks "LPU64"/"LPU64" objects "LPU64"/"LPU64"\n",
1042                osfs->os_bavail, osfs->os_blocks, osfs->os_ffree,osfs->os_files);
1043
1044         rc = obd_statfs(class_exp2obd(sbi->ll_osc_exp), &obd_osfs, max_age);
1045         if (rc) {
1046                 CERROR("obd_statfs fails: rc = %d\n", rc);
1047                 RETURN(rc);
1048         }
1049
1050         CDEBUG(D_SUPER, "OSC blocks "LPU64"/"LPU64" objects "LPU64"/"LPU64"\n",
1051                obd_osfs.os_bavail, obd_osfs.os_blocks, obd_osfs.os_ffree,
1052                obd_osfs.os_files);
1053
1054         osfs->os_blocks = obd_osfs.os_blocks;
1055         osfs->os_bfree = obd_osfs.os_bfree;
1056         osfs->os_bavail = obd_osfs.os_bavail;
1057
1058         /* If we don't have as many objects free on the OST as inodes
1059          * on the MDS, we reduce the total number of inodes to
1060          * compensate, so that the "inodes in use" number is correct.
1061          */
1062         if (obd_osfs.os_ffree < osfs->os_ffree) {
1063                 osfs->os_files = (osfs->os_files - osfs->os_ffree) +
1064                         obd_osfs.os_ffree;
1065                 osfs->os_ffree = obd_osfs.os_ffree;
1066         }
1067
1068         RETURN(rc);
1069 }
1070
1071 int ll_statfs(struct super_block *sb, struct kstatfs *sfs)
1072 {
1073         struct obd_statfs osfs;
1074         int rc;
1075
1076         CDEBUG(D_VFSTRACE, "VFS Op:\n");
1077         lprocfs_counter_incr(ll_s2sbi(sb)->ll_stats, LPROC_LL_STAFS);
1078
1079         /* For now we will always get up-to-date statfs values, but in the
1080          * future we may allow some amount of caching on the client (e.g.
1081          * from QOS or lprocfs updates). */
1082         rc = ll_statfs_internal(sb, &osfs, jiffies - 1);
1083         if (rc)
1084                 return rc;
1085
1086         statfs_unpack(sfs, &osfs);
1087
1088         if (sizeof(sfs->f_blocks) == 4) {
1089                 while (osfs.os_blocks > ~0UL) {
1090                         sfs->f_bsize <<= 1;
1091
1092                         osfs.os_blocks >>= 1;
1093                         osfs.os_bfree >>= 1;
1094                         osfs.os_bavail >>= 1;
1095                 }
1096         }
1097
1098         sfs->f_blocks = osfs.os_blocks;
1099         sfs->f_bfree = osfs.os_bfree;
1100         sfs->f_bavail = osfs.os_bavail;
1101
1102         return 0;
1103 }
1104
1105 void ll_inode_size_lock(struct inode *inode, int lock_lsm)
1106 {
1107         struct ll_inode_info *lli;
1108         struct lov_stripe_md *lsm;
1109
1110         lli = ll_i2info(inode);
1111         LASSERT(lli->lli_size_sem_owner != current);
1112         down(&lli->lli_size_sem);
1113         LASSERT(lli->lli_size_sem_owner == NULL);
1114         lli->lli_size_sem_owner = current;
1115         lsm = lli->lli_smd;
1116         LASSERTF(lsm != NULL || lock_lsm == 0, "lsm %p, lock_lsm %d\n",
1117                  lsm, lock_lsm);
1118         if (lock_lsm)
1119                 lov_stripe_lock(lsm);
1120 }
1121
1122 void ll_inode_size_unlock(struct inode *inode, int unlock_lsm)
1123 {
1124         struct ll_inode_info *lli;
1125         struct lov_stripe_md *lsm;
1126
1127         lli = ll_i2info(inode);
1128         lsm = lli->lli_smd;
1129         LASSERTF(lsm != NULL || unlock_lsm == 0, "lsm %p, lock_lsm %d\n",
1130                  lsm, unlock_lsm);
1131         if (unlock_lsm)
1132                 lov_stripe_unlock(lsm);
1133         LASSERT(lli->lli_size_sem_owner == current);
1134         lli->lli_size_sem_owner = NULL;
1135         up(&lli->lli_size_sem);
1136 }
1137
1138 static void ll_replace_lsm(struct inode *inode, struct lov_stripe_md *lsm)
1139 {
1140         struct ll_inode_info *lli = ll_i2info(inode);
1141  
1142         dump_lsm(D_INODE, lsm);
1143         dump_lsm(D_INODE, lli->lli_smd); 
1144         LASSERTF(lsm->lsm_magic == LOV_MAGIC_JOIN, 
1145                  "lsm must be joined lsm %p\n", lsm);
1146         obd_free_memmd(ll_i2obdexp(inode), &lli->lli_smd);
1147         CDEBUG(D_INODE, "replace lsm %p to lli_smd %p for inode %lu%u(%p)\n",
1148                lsm, lli->lli_smd, inode->i_ino, inode->i_generation, inode);
1149         lli->lli_smd = lsm;
1150         lli->lli_maxbytes = lsm->lsm_maxbytes;
1151         if (lli->lli_maxbytes > PAGE_CACHE_MAXBYTES)
1152                 lli->lli_maxbytes = PAGE_CACHE_MAXBYTES;
1153 }
1154
1155 void ll_update_inode(struct inode *inode, struct lustre_md *md)
1156 {
1157         struct ll_inode_info *lli = ll_i2info(inode);
1158         struct mdt_body *body = md->body;
1159         struct lov_stripe_md *lsm = md->lsm;
1160
1161         LASSERT ((lsm != NULL) == ((body->valid & OBD_MD_FLEASIZE) != 0));
1162         if (lsm != NULL) {
1163                 if (lli->lli_smd == NULL) {
1164                         if (lsm->lsm_magic != LOV_MAGIC && 
1165                             lsm->lsm_magic != LOV_MAGIC_JOIN) {
1166                                 dump_lsm(D_ERROR, lsm);
1167                                 LBUG();
1168                         }
1169                         CDEBUG(D_INODE, "adding lsm %p to inode %lu/%u(%p)\n",
1170                                lsm, inode->i_ino, inode->i_generation, inode);
1171                         /* ll_inode_size_lock() requires it is only called
1172                          * with lli_smd != NULL or lock_lsm == 0 or we can
1173                          * race between lock/unlock.  bug 9547 */
1174                         lli->lli_smd = lsm;
1175                         lli->lli_maxbytes = lsm->lsm_maxbytes;
1176                         if (lli->lli_maxbytes > PAGE_CACHE_MAXBYTES)
1177                                 lli->lli_maxbytes = PAGE_CACHE_MAXBYTES;
1178                 } else {
1179                         if (lli->lli_smd->lsm_magic == lsm->lsm_magic &&
1180                              lli->lli_smd->lsm_stripe_count == 
1181                                         lsm->lsm_stripe_count) {
1182                                 if (lov_stripe_md_cmp(lli->lli_smd, lsm)) {
1183                                         CERROR("lsm mismatch for inode %ld\n",
1184                                                 inode->i_ino);
1185                                         CERROR("lli_smd:\n");
1186                                         dump_lsm(D_ERROR, lli->lli_smd);
1187                                         CERROR("lsm:\n");
1188                                         dump_lsm(D_ERROR, lsm);
1189                                         LBUG();
1190                                 }
1191                         } else 
1192                                 ll_replace_lsm(inode, lsm);
1193                 }
1194                 /* bug 2844 - limit i_blksize for broken user-space apps */
1195                 LASSERTF(lsm->lsm_xfersize != 0, "%lu\n", lsm->lsm_xfersize);
1196                 inode->i_blksize = min(lsm->lsm_xfersize, LL_MAX_BLKSIZE);
1197                 if (lli->lli_smd != lsm)
1198                         obd_free_memmd(ll_i2obdexp(inode), &lsm);
1199         } else {
1200                 inode->i_blksize = max(inode->i_blksize,
1201                                        inode->i_sb->s_blocksize);
1202         }
1203
1204 #ifdef CONFIG_FS_POSIX_ACL
1205         LASSERT(!md->posix_acl || (body->valid & OBD_MD_FLACL));
1206         if (body->valid & OBD_MD_FLACL) {
1207                 spin_lock(&lli->lli_lock);
1208                 if (lli->lli_posix_acl)
1209                         posix_acl_release(lli->lli_posix_acl);
1210                 lli->lli_posix_acl = md->posix_acl;
1211                 spin_unlock(&lli->lli_lock);
1212         }
1213 #endif
1214
1215         if (body->valid & OBD_MD_FLID)
1216                 inode->i_ino = body->ino;
1217         if (body->valid & OBD_MD_FLATIME &&
1218             body->atime > LTIME_S(inode->i_atime))
1219                 LTIME_S(inode->i_atime) = body->atime;
1220         if (body->valid & OBD_MD_FLMTIME &&
1221             body->mtime > LTIME_S(inode->i_mtime)) {
1222                 CDEBUG(D_INODE, "setting ino %lu mtime from %lu to "LPU64"\n",
1223                        inode->i_ino, LTIME_S(inode->i_mtime), body->mtime);
1224                 LTIME_S(inode->i_mtime) = body->mtime;
1225         }
1226         if (body->valid & OBD_MD_FLCTIME &&
1227             body->ctime > LTIME_S(inode->i_ctime))
1228                 LTIME_S(inode->i_ctime) = body->ctime;
1229         if (body->valid & OBD_MD_FLMODE)
1230                 inode->i_mode = (inode->i_mode & S_IFMT)|(body->mode & ~S_IFMT);
1231         if (body->valid & OBD_MD_FLTYPE)
1232                 inode->i_mode = (inode->i_mode & ~S_IFMT)|(body->mode & S_IFMT);
1233         if (body->valid & OBD_MD_FLUID)
1234                 inode->i_uid = body->uid;
1235         if (body->valid & OBD_MD_FLGID)
1236                 inode->i_gid = body->gid;
1237         if (body->valid & OBD_MD_FLFLAGS)
1238                 inode->i_flags = body->flags;
1239         if (body->valid & OBD_MD_FLNLINK)
1240                 inode->i_nlink = body->nlink;
1241         if (body->valid & OBD_MD_FLGENER)
1242                 inode->i_generation = body->generation;
1243         if (body->valid & OBD_MD_FLRDEV)
1244 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
1245                 inode->i_rdev = body->rdev;
1246 #else
1247                 inode->i_rdev = old_decode_dev(body->rdev);
1248 #endif
1249         if (body->valid & OBD_MD_FLSIZE)
1250                 inode->i_size = body->size;
1251         if (body->valid & OBD_MD_FLBLOCKS)
1252                 inode->i_blocks = body->blocks;
1253
1254         if (body->valid & OBD_MD_FLSIZE)
1255                 set_bit(LLI_F_HAVE_MDS_SIZE_LOCK, &lli->lli_flags);
1256 }
1257
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
/* Backing device description assigned to the mapping of special inodes in
 * ll_read_inode2().  Only built for 2.5+ kernels; the field used to express
 * "counts as dirty memory" differs before and after 2.6.12. */
static struct backing_dev_info ll_backing_dev_info = {
        .ra_pages       = 0,    /* No readahead */
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,12))
        .capabilities   = 0,    /* Does contribute to dirty memory */
#else
        .memory_backed  = 0,    /* Does contribute to dirty memory */
#endif
};
#endif
1268
1269 void ll_read_inode2(struct inode *inode, void *opaque)
1270 {
1271         struct lustre_md *md = opaque;
1272         struct ll_inode_info *lli = ll_i2info(inode);
1273         ENTRY;
1274
1275         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
1276                inode->i_generation, inode);
1277
1278         ll_lli_init(lli);
1279
1280         LASSERT(!lli->lli_smd);
1281
1282         /* Core attributes from the MDS first.  This is a new inode, and
1283          * the VFS doesn't zero times in the core inode so we have to do
1284          * it ourselves.  They will be overwritten by either MDS or OST
1285          * attributes - we just need to make sure they aren't newer. */
1286         LTIME_S(inode->i_mtime) = 0;
1287         LTIME_S(inode->i_atime) = 0;
1288         LTIME_S(inode->i_ctime) = 0;
1289         inode->i_rdev = 0;
1290         ll_update_inode(inode, md);
1291
1292         /* OIDEBUG(inode); */
1293
1294         if (S_ISREG(inode->i_mode)) {
1295                 struct ll_sb_info *sbi = ll_i2sbi(inode);
1296                 inode->i_op = &ll_file_inode_operations;
1297                 inode->i_fop = sbi->ll_fop;
1298                 inode->i_mapping->a_ops = &ll_aops;
1299                 EXIT;
1300         } else if (S_ISDIR(inode->i_mode)) {
1301                 inode->i_op = &ll_dir_inode_operations;
1302                 inode->i_fop = &ll_dir_operations;
1303                 inode->i_mapping->a_ops = &ll_dir_aops;
1304                 EXIT;
1305         } else if (S_ISLNK(inode->i_mode)) {
1306                 inode->i_op = &ll_fast_symlink_inode_operations;
1307                 EXIT;
1308         } else {
1309                 inode->i_op = &ll_special_inode_operations;
1310
1311 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
1312                 init_special_inode(inode, inode->i_mode,
1313                                    kdev_t_to_nr(inode->i_rdev));
1314
1315                 /* initializing backing dev info. */
1316                 inode->i_mapping->backing_dev_info = &ll_backing_dev_info;
1317 #else
1318                 init_special_inode(inode, inode->i_mode, inode->i_rdev);
1319 #endif
1320                 lli->ll_save_ifop = inode->i_fop;
1321
1322                 if (S_ISCHR(inode->i_mode))
1323                         inode->i_fop = &ll_special_chr_inode_fops;
1324                 else if (S_ISBLK(inode->i_mode))
1325                         inode->i_fop = &ll_special_blk_inode_fops;
1326                 else if (S_ISFIFO(inode->i_mode))
1327                         inode->i_fop = &ll_special_fifo_inode_fops;
1328                 else if (S_ISSOCK(inode->i_mode))
1329                         inode->i_fop = &ll_special_sock_inode_fops;
1330                 EXIT;
1331         }
1332 }
1333
1334 int ll_iocontrol(struct inode *inode, struct file *file,
1335                  unsigned int cmd, unsigned long arg)
1336 {
1337         struct ll_sb_info *sbi = ll_i2sbi(inode);
1338         struct ptlrpc_request *req = NULL;
1339         int rc, flags = 0;
1340         ENTRY;
1341
1342         switch(cmd) {
1343         case EXT3_IOC_GETFLAGS: {
1344                 struct lu_fid fid;
1345                 struct mdt_body *body;
1346
1347                 ll_inode2fid(&fid, inode);
1348                 rc = mdc_getattr(sbi->ll_mdc_exp, &fid, OBD_MD_FLFLAGS,0,&req);
1349                 if (rc) {
1350                         CERROR("failure %d inode %lu\n", rc, inode->i_ino);
1351                         RETURN(-abs(rc));
1352                 }
1353
1354                 body = lustre_msg_buf(req->rq_repmsg, 0, sizeof(*body));
1355
1356                 if (body->flags & S_APPEND)
1357                         flags |= EXT3_APPEND_FL;
1358                 if (body->flags & S_IMMUTABLE)
1359                         flags |= EXT3_IMMUTABLE_FL;
1360                 if (body->flags & S_NOATIME)
1361                         flags |= EXT3_NOATIME_FL;
1362
1363                 ptlrpc_req_finished (req);
1364
1365                 RETURN(put_user(flags, (int *)arg));
1366         }
1367         case EXT3_IOC_SETFLAGS: {
1368                 struct mdc_op_data op_data;
1369                 struct iattr attr;
1370                 struct obdo *oa;
1371                 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1372
1373                 if (get_user(flags, (int *)arg))
1374                         RETURN(-EFAULT);
1375
1376                 oa = obdo_alloc();
1377                 if (!oa)
1378                         RETURN(-ENOMEM);
1379
1380                 ll_prepare_mdc_op_data(&op_data, inode, NULL, NULL, 0, 0);
1381
1382                 memset(&attr, 0x0, sizeof(attr));
1383                 attr.ia_attr_flags = flags;
1384                 attr.ia_valid |= ATTR_ATTR_FLAG;
1385
1386                 rc = mdc_setattr(sbi->ll_mdc_exp, &op_data,
1387                                  &attr, NULL, 0, NULL, 0, &req);
1388                 if (rc) {
1389                         ptlrpc_req_finished(req);
1390                         if (rc != -EPERM && rc != -EACCES)
1391                                 CERROR("mdc_setattr fails: rc = %d\n", rc);
1392                         obdo_free(oa);
1393                         RETURN(rc);
1394                 }
1395                 ptlrpc_req_finished(req);
1396
1397                 oa->o_id = lsm->lsm_object_id;
1398                 oa->o_flags = flags;
1399                 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS;
1400
1401                 obdo_from_inode(oa, inode, OBD_MD_FLFID | OBD_MD_FLGENER);
1402                 rc = obd_setattr(sbi->ll_osc_exp, oa, lsm, NULL);
1403                 obdo_free(oa);
1404                 if (rc) {
1405                         if (rc != -EPERM && rc != -EACCES)
1406                                 CERROR("mdc_setattr fails: rc = %d\n", rc);
1407                         RETURN(rc);
1408                 }
1409
1410                 if (flags & EXT3_APPEND_FL)
1411                         inode->i_flags |= S_APPEND;
1412                 else
1413                         inode->i_flags &= ~S_APPEND;
1414                 if (flags & EXT3_IMMUTABLE_FL)
1415                         inode->i_flags |= S_IMMUTABLE;
1416                 else
1417                         inode->i_flags &= ~S_IMMUTABLE;
1418                 if (flags & EXT3_NOATIME_FL)
1419                         inode->i_flags |= S_NOATIME;
1420                 else
1421                         inode->i_flags &= ~S_NOATIME;
1422
1423                 RETURN(0);
1424         }
1425         default:
1426                 RETURN(-ENOSYS);
1427         }
1428
1429         RETURN(0);
1430 }
1431
1432 /* umount -f client means force down, don't save state */
1433 void ll_umount_begin(struct super_block *sb)
1434 {
1435         struct lustre_sb_info *lsi = s2lsi(sb);
1436         struct ll_sb_info *sbi = ll_s2sbi(sb);
1437         struct obd_device *obd;
1438         struct obd_ioctl_data ioc_data = { 0 };
1439         ENTRY;
1440
1441         /* Tell the MGC we got umount -f */
1442         lsi->lsi_flags |= LSI_UMOUNT_FORCE;
1443
1444         CDEBUG(D_VFSTRACE, "VFS Op: superblock %p count %d active %d\n", sb,
1445                sb->s_count, atomic_read(&sb->s_active));
1446
1447         obd = class_exp2obd(sbi->ll_mdc_exp);
1448         if (obd == NULL) {
1449                 CERROR("Invalid MDC connection handle "LPX64"\n",
1450                        sbi->ll_mdc_exp->exp_handle.h_cookie);
1451                 EXIT;
1452                 return;
1453         }
1454         obd->obd_no_recov = 1;
1455         obd_iocontrol(IOC_OSC_SET_ACTIVE, sbi->ll_mdc_exp, sizeof ioc_data,
1456                       &ioc_data, NULL);
1457
1458         obd = class_exp2obd(sbi->ll_osc_exp);
1459         if (obd == NULL) {
1460                 CERROR("Invalid LOV connection handle "LPX64"\n",
1461                        sbi->ll_osc_exp->exp_handle.h_cookie);
1462                 EXIT;
1463                 return;
1464         }
1465         obd->obd_no_recov = 1;
1466         obd_iocontrol(IOC_OSC_SET_ACTIVE, sbi->ll_osc_exp, sizeof ioc_data,
1467                       &ioc_data, NULL);
1468
1469         /* Really, we'd like to wait until there are no requests outstanding,
1470          * and then continue.  For now, we just invalidate the requests,
1471          * schedule, and hope.
1472          */
1473         schedule();
1474
1475         EXIT;
1476 }
1477
1478 int ll_remount_fs(struct super_block *sb, int *flags, char *data)
1479 {
1480         struct ll_sb_info *sbi = ll_s2sbi(sb);
1481         int err;
1482         __u32 read_only;
1483  
1484         if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
1485                 read_only = *flags & MS_RDONLY;
1486                 err = obd_set_info(sbi->ll_mdc_exp, strlen("read-only"),
1487                                    "read-only", sizeof(read_only), &read_only);
1488                 if (err) {
1489                         CERROR("Failed to change the read-only flag during "
1490                                "remount: %d\n", err);
1491                         return err;
1492                 }
1493  
1494                 if (read_only)
1495                         sb->s_flags |= MS_RDONLY;
1496                 else
1497                         sb->s_flags &= ~MS_RDONLY;
1498         }
1499         return 0;
1500 }
1501
1502 int ll_prep_inode(struct obd_export *exp, struct inode **inode,
1503                   struct ptlrpc_request *req, int offset,
1504                   struct super_block *sb)
1505 {
1506         struct ll_sb_info *sbi = NULL;
1507         struct lustre_md md;
1508         int rc = 0;
1509         ENTRY;
1510
1511         LASSERT(*inode || sb);
1512         sbi = sb ? ll_s2sbi(sb) : ll_i2sbi(*inode);
1513         prune_deathrow(sbi, 1);
1514
1515         rc = mdc_req2lustre_md(req, offset, exp, &md);
1516         if (rc)
1517                 RETURN(rc);
1518
1519         if (*inode) {
1520                 ll_update_inode(*inode, &md);
1521         } else {
1522                 struct lu_fid fid;
1523                 
1524                 LASSERT(sb != NULL);
1525                 
1526                 rc = ll_fid_alloc(sbi, &fid);
1527                 if (rc) {
1528                         CERROR("cannot allocate new fid, rc %d\n", 
1529                                rc);
1530                         mdc_free_lustre_md(exp, &md);
1531                         GOTO(out, rc);
1532                 }
1533
1534                 *inode = ll_iget(sb, ll_fid2ino(sbi, &fid), &md);
1535                 if (*inode == NULL || is_bad_inode(*inode)) {
1536                         mdc_free_lustre_md(exp, &md);
1537                         rc = -ENOMEM;
1538                         CERROR("new_inode -fatal: rc %d\n", rc);
1539                         GOTO(out, rc);
1540                 }
1541                 ll_i2info(*inode)->lli_fid = fid;
1542         }
1543
1544         rc = obd_checkmd(exp, ll_i2mdcexp(*inode),
1545                          ll_i2info(*inode)->lli_smd);
1546 out:
1547         RETURN(rc);
1548 }
1549
/* Two-character tags, indexed by LLAP_ORIGIN_* value, naming where an
 * ll_async_page came from (presumably used in debug/proc output; verify
 * against the users of this table). */
char *llap_origins[] = {
        [LLAP_ORIGIN_UNKNOWN] = "--",
        [LLAP_ORIGIN_READPAGE] = "rp",
        [LLAP_ORIGIN_READAHEAD] = "ra",
        [LLAP_ORIGIN_COMMIT_WRITE] = "cw",
        [LLAP_ORIGIN_WRITEPAGE] = "wp",
};
1557
1558 struct ll_async_page *llite_pglist_next_llap(struct ll_sb_info *sbi,
1559                                              struct list_head *list)
1560 {
1561         struct ll_async_page *llap;
1562         struct list_head *pos;
1563
1564         list_for_each(pos, list) {
1565                 if (pos == &sbi->ll_pglist)
1566                         return NULL;
1567                 llap = list_entry(pos, struct ll_async_page, llap_pglist_item);
1568                 if (llap->llap_page == NULL)
1569                         continue;
1570                 return llap;
1571         }
1572         LBUG();
1573         return NULL;
1574 }
1575
1576 int ll_obd_statfs(struct inode *inode, void *arg)
1577 {
1578         struct ll_sb_info *sbi = NULL;
1579         struct obd_device *client_obd = NULL, *lov_obd = NULL;
1580         struct lov_obd *lov = NULL;
1581         struct obd_import *client_imp = NULL;
1582         struct obd_statfs stat_buf = {0};
1583         char *buf = NULL;
1584         struct obd_ioctl_data *data = NULL;
1585         __u32 type, index;
1586         int len, rc;
1587
1588         if (!inode || !(sbi = ll_i2sbi(inode)))
1589                 GOTO(out_statfs, rc = -EINVAL);
1590
1591         rc = obd_ioctl_getdata(&buf, &len, arg);
1592         if (rc)
1593                 GOTO(out_statfs, rc);
1594
1595         data = (void*)buf;
1596         if (!data->ioc_inlbuf1 || !data->ioc_inlbuf2 ||
1597             !data->ioc_pbuf1 || !data->ioc_pbuf2)
1598                 GOTO(out_statfs, rc = -EINVAL);
1599
1600         memcpy(&type, data->ioc_inlbuf1, sizeof(__u32));
1601         memcpy(&index, data->ioc_inlbuf2, sizeof(__u32));
1602
1603         if (type == LL_STATFS_MDC) {
1604                 if (index > 0)
1605                         GOTO(out_statfs, rc = -ENODEV);
1606                 client_obd = class_exp2obd(sbi->ll_mdc_exp);
1607                 client_imp = class_exp2cliimp(sbi->ll_mdc_exp);
1608         } else if (type == LL_STATFS_LOV) {
1609                 lov_obd = class_exp2obd(sbi->ll_osc_exp);
1610                 lov = &lov_obd->u.lov;
1611
1612                 if (index >= lov->desc.ld_tgt_count)
1613                         GOTO(out_statfs, rc = -ENODEV);
1614
1615                 client_obd = class_exp2obd(lov->tgts[index].ltd_exp);
1616                 client_imp = class_exp2cliimp(lov->tgts[index].ltd_exp);
1617                 if (!lov->tgts[index].active)
1618                         GOTO(out_uuid, rc = -ENODATA);
1619         }
1620
1621         if (!client_obd || !client_imp)
1622                 GOTO(out_statfs, rc = -EINVAL);
1623
1624         rc = obd_statfs(client_obd, &stat_buf, jiffies - 1);
1625         if (rc)
1626                 GOTO(out_statfs, rc);
1627
1628         if (copy_to_user(data->ioc_pbuf1, &stat_buf, data->ioc_plen1))
1629                 GOTO(out_statfs, rc = -EFAULT);
1630
1631 out_uuid:
1632         if (copy_to_user(data->ioc_pbuf2, &client_imp->imp_target_uuid,
1633                          data->ioc_plen2))
1634                 rc = -EFAULT;
1635
1636 out_statfs:
1637         if (buf)
1638                 obd_ioctl_freedata(buf, len);
1639         return rc;
1640 }
1641
/* Superblock entry points exported for use by other kernel modules. */
EXPORT_SYMBOL(ll_fill_super);
EXPORT_SYMBOL(ll_put_super);
EXPORT_SYMBOL(ll_remount_fs);
EXPORT_SYMBOL(ll_umount_begin);
1646