Whamcloud - gitweb
kernel-patches: add dynlocks to -fc{3,5} series.
[fs/lustre-release.git] / lustre / llite / llite_capa.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  * Copyright (C) 2005 Cluster File Systems, Inc.
5  *
6  * Author: Lai Siyao <lsy@clusterfs.com>
7  *
8  *   This file is part of Lustre, http://www.lustre.org.
9  *
10  *   Lustre is free software; you can redistribute it and/or
11  *   modify it under the terms of version 2 of the GNU General Public
12  *   License as published by the Free Software Foundation.
13  *
14  *   Lustre is distributed in the hope that it will be useful,
15  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
16  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  *   GNU General Public License for more details.
18  *
19  *   You should have received a copy of the GNU General Public License
20  *   along with Lustre; if not, write to the Free Software
21  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22  */
23
24 #define DEBUG_SUBSYSTEM S_LLITE
25
26 #include <linux/fs.h>
27 #include <linux/version.h>
28 #include <asm/uaccess.h>
29 #include <linux/file.h>
30 #include <linux/kmod.h>
31
32 #include <lustre_lite.h>
33 #include "llite_internal.h"
34
35 /* for obd_capa.c_list, client capa might stay in three places:
36  * 1. ll_capa_list.
37  * 2. ll_idle_capas.
38  * 3. stand alone: just allocated.
39  */
40
41 /* capas for oss writeback and those failed to renew */
42 static LIST_HEAD(ll_idle_capas);
43 static struct ptlrpc_thread ll_capa_thread;
44 static struct list_head *ll_capa_list = &capa_list[CAPA_SITE_CLIENT];
45
46 /* llite capa renewal timer */
47 cfs_timer_t ll_capa_timer;
48 /* for debug: indicate whether capa on llite is enabled or not */
49 static atomic_t ll_capa_debug = ATOMIC_INIT(0);
50
51 static inline void update_capa_timer(struct obd_capa *ocapa, cfs_time_t expiry)
52 {
53         if (cfs_time_before(expiry, cfs_timer_deadline(&ll_capa_timer)) ||
54             !cfs_timer_is_armed(&ll_capa_timer)) {
55                 cfs_timer_arm(&ll_capa_timer, expiry);
56                 DEBUG_CAPA(D_SEC, &ocapa->c_capa,
57                            "ll_capa_timer update: %lu/%lu by",
58                            expiry, jiffies);
59         }
60 }
61
62 static inline int have_expired_capa(void)
63 {
64         struct obd_capa *ocapa = NULL;
65         int expired = 0;
66
67         /* if ll_capa_list has client capa to expire or ll_idle_capas has
68          * expired capa, return 1.
69          */
70         spin_lock(&capa_lock);
71         if (!list_empty(ll_capa_list)) {
72                 ocapa = list_entry(ll_capa_list->next, struct obd_capa, c_list);
73                 expired = capa_is_to_expire(ocapa);
74                 if (!expired)
75                         update_capa_timer(ocapa, capa_renewal_time(ocapa));
76         } else if (!list_empty(&ll_idle_capas)) {
77                 ocapa = list_entry(ll_idle_capas.next, struct obd_capa, c_list);
78                 expired = capa_is_expired(ocapa);
79                 if (!expired)
80                         update_capa_timer(ocapa, ocapa->c_expiry);
81         }
82         spin_unlock(&capa_lock);
83
84         if (expired)
85                 DEBUG_CAPA(D_SEC, &ocapa->c_capa, "expired");
86         return expired;
87 }
88
89 static inline int ll_capa_check_stop(void)
90 {
91         return (ll_capa_thread.t_flags & SVC_STOPPING) ? 1: 0;
92 }
93
94 static void sort_add_capa(struct obd_capa *ocapa, struct list_head *head)
95 {
96         struct obd_capa *tmp;
97         struct list_head *before = NULL;
98
99         /* TODO: client capa is sorted by expiry, this could be optimized */
100         list_for_each_entry_reverse(tmp, head, c_list) {
101                 if (cfs_time_after(ocapa->c_expiry, tmp->c_expiry)) {
102                         before = &tmp->c_list;
103                         break;
104                 }
105         }
106
107         LASSERT(&ocapa->c_list != before);
108         list_add(&ocapa->c_list, before ?: head);
109 }
110
111 static inline int obd_capa_open_count(struct obd_capa *oc)
112 {
113         struct ll_inode_info *lli = ll_i2info(oc->u.cli.inode);
114         return atomic_read(&lli->lli_open_count);
115 }
116
117 static void ll_delete_capa(struct obd_capa *ocapa)
118 {
119         struct ll_inode_info *lli = ll_i2info(ocapa->u.cli.inode);
120
121         if (capa_for_mds(&ocapa->c_capa)) {
122                 LASSERT(lli->lli_mds_capa == ocapa);
123                 lli->lli_mds_capa = NULL;
124         } else if (capa_for_oss(&ocapa->c_capa)) {
125                 list_del_init(&ocapa->u.cli.lli_list);
126         }
127
128         DEBUG_CAPA(D_SEC, &ocapa->c_capa, "free client");
129         list_del(&ocapa->c_list);
130         free_capa(ocapa);
131 }
132
133 /* three places where client capa is deleted:
134  * 1. capa_thread_main(), main place to delete expired capa.
135  * 2. ll_clear_inode_capas() in ll_clear_inode().
136  * 3. ll_truncate_free_capa() delete truncate capa explicitly in ll_truncate().
137  */
138 static int capa_thread_main(void *unused)
139 {
140         struct obd_capa *ocapa, *tmp, *next;
141         struct inode *inode = NULL;
142         struct l_wait_info lwi = { 0 };
143         int rc;
144         ENTRY;
145
146         cfs_daemonize("ll_capa");
147
148         ll_capa_thread.t_flags = SVC_RUNNING;
149         wake_up(&ll_capa_thread.t_ctl_waitq);
150
151         while (1) {
152                 l_wait_event(ll_capa_thread.t_ctl_waitq,
153                              (ll_capa_check_stop() || have_expired_capa()),
154                              &lwi);
155
156                 if (ll_capa_check_stop())
157                         break;
158
159                 spin_lock(&capa_lock);
160                 next = NULL;
161                 list_for_each_entry_safe(ocapa, tmp, ll_capa_list, c_list) {
162                         LASSERT(ocapa->c_capa.lc_opc != CAPA_OPC_OSS_TRUNC);
163
164                         if (!capa_is_to_expire(ocapa)) {
165                                 next = ocapa;
166                                 break;
167                         }
168
169                         if (capa_for_mds(&ocapa->c_capa) &&
170                             !S_ISDIR(ocapa->u.cli.inode->i_mode) &&
171                             obd_capa_open_count(ocapa) == 0 &&
172                             !obd_capa_is_root(ocapa) &&
173                             !ll_have_md_lock(ocapa->u.cli.inode,
174                                              MDS_INODELOCK_LOOKUP)) {
175                                 /* MDS capa without LOOKUP lock, and the related
176                                  * inode is not opened, it won't renew,
177                                  * move to idle list (except root fid) */
178                                 DEBUG_CAPA(D_SEC, &ocapa->c_capa,
179                                            "skip renewal for");
180                                 list_del_init(&ocapa->c_list);
181                                 sort_add_capa(ocapa, &ll_idle_capas);
182                                 continue;
183                         }
184
185                         if (capa_for_oss(&ocapa->c_capa) &&
186                             obd_capa_open_count(ocapa) == 0) {
187                                 /* oss capa with open count == 0 won't renew,
188                                  * move to idle list */
189                                 list_del_init(&ocapa->c_list);
190                                 sort_add_capa(ocapa, &ll_idle_capas);
191                                 continue;
192                         }
193
194                         /* NB iput() is in ll_update_capa() */
195                         inode = igrab(ocapa->u.cli.inode);
196                         if (inode == NULL) {
197                                 DEBUG_CAPA(D_SEC, &ocapa->c_capa,
198                                            "igrab failed for");
199                                 ll_delete_capa(ocapa);
200                                 continue;
201                         }
202
203                         list_del_init(&ocapa->c_list);
204                         capa_get(ocapa);
205                         spin_unlock(&capa_lock);
206
207                         rc = md_renew_capa(ll_i2mdexp(inode), ocapa,
208                                            ll_update_capa);
209                         spin_lock(&capa_lock);
210                         if (rc) {
211                                 DEBUG_CAPA(D_ERROR, &ocapa->c_capa,
212                                            "renew failed: %d", rc);
213                                 sort_add_capa(ocapa, &ll_idle_capas);
214                         }
215                 }
216
217                 if (next)
218                         update_capa_timer(next, capa_renewal_time(next));
219
220                 list_for_each_entry_safe(ocapa, tmp, &ll_idle_capas, c_list) {
221                         if (!capa_is_expired(ocapa)) {
222                                 if (!next)
223                                         update_capa_timer(ocapa, ocapa->c_expiry);
224                                 break;
225                         }
226
227                         if (atomic_read(&ocapa->c_refc)) {
228                                 DEBUG_CAPA(D_SEC, &ocapa->c_capa,
229                                            "expired(c_refc %d), don't release",
230                                            atomic_read(&ocapa->c_refc));
231                                 obd_capa_set_expired(ocapa);
232                                 /* don't try to renew any more */
233                                 list_del_init(&ocapa->c_list);
234                                 continue;
235                         }
236
237                         /* expired capa is released. */
238                         DEBUG_CAPA(D_SEC, &ocapa->c_capa, "release expired");
239                         ll_delete_capa(ocapa);
240                 }
241
242                 spin_unlock(&capa_lock);
243         }
244
245         ll_capa_thread.t_flags = SVC_STOPPED;
246         wake_up(&ll_capa_thread.t_ctl_waitq);
247         RETURN(0);
248 }
249
250 void ll_capa_timer_callback(unsigned long unused)
251 {
252         wake_up(&ll_capa_thread.t_ctl_waitq);
253 }
254
255 int ll_capa_thread_start(void)
256 {
257         int rc;
258         ENTRY;
259
260         init_waitqueue_head(&ll_capa_thread.t_ctl_waitq);
261
262         rc = kernel_thread(capa_thread_main, NULL, 0);
263         if (rc < 0) {
264                 CERROR("cannot start expired capa thread: rc %d\n", rc);
265                 RETURN(rc);
266         }
267         wait_event(ll_capa_thread.t_ctl_waitq,
268                    ll_capa_thread.t_flags & SVC_RUNNING);
269
270         RETURN(0);
271 }
272
273 void ll_capa_thread_stop(void)
274 {
275         ll_capa_thread.t_flags = SVC_STOPPING;
276         wake_up(&ll_capa_thread.t_ctl_waitq);
277         wait_event(ll_capa_thread.t_ctl_waitq,
278                    ll_capa_thread.t_flags & SVC_STOPPED);
279 }
280
281 static struct obd_capa *do_lookup_oss_capa(struct inode *inode, int opc)
282 {
283         struct ll_inode_info *lli = ll_i2info(inode);
284         struct obd_capa *ocapa;
285
286         /* inside capa_lock */
287         list_for_each_entry(ocapa, &lli->lli_oss_capas, u.cli.lli_list) {
288                 if (!obd_capa_is_valid(ocapa))
289                         continue;
290                 if ((capa_opc(&ocapa->c_capa) & opc) != opc)
291                         continue;
292
293                 LASSERT(lu_fid_eq(capa_fid(&ocapa->c_capa),
294                                   ll_inode2fid(inode)));
295                 LASSERT(ocapa->c_site == CAPA_SITE_CLIENT);
296
297                 DEBUG_CAPA(D_SEC, &ocapa->c_capa, "found client");
298                 return ocapa;
299         }
300
301         return NULL;
302 }
303
304 struct obd_capa *ll_osscapa_get(struct inode *inode, __u64 opc)
305 {
306         struct ll_inode_info *lli = ll_i2info(inode);
307         struct obd_capa *ocapa;
308         int found = 0;
309
310         if ((ll_i2sbi(inode)->ll_flags & LL_SBI_OSS_CAPA) == 0)
311                 return NULL;
312         ENTRY;
313
314         LASSERT(opc == CAPA_OPC_OSS_WRITE ||
315                 opc == (CAPA_OPC_OSS_WRITE | CAPA_OPC_OSS_READ) ||
316                 opc == CAPA_OPC_OSS_TRUNC);
317
318         spin_lock(&capa_lock);
319         list_for_each_entry(ocapa, &lli->lli_oss_capas, u.cli.lli_list) {
320                 if (!obd_capa_is_valid(ocapa))
321                         continue;
322                 if ((opc & CAPA_OPC_OSS_WRITE) &&
323                     capa_opc_supported(&ocapa->c_capa, CAPA_OPC_OSS_WRITE)) {
324                         found = 1; break;
325                 } else if ((opc & CAPA_OPC_OSS_READ) &&
326                            capa_opc_supported(&ocapa->c_capa,
327                                               CAPA_OPC_OSS_READ)) {
328                         found = 1; break;
329                 } else if ((opc & CAPA_OPC_OSS_TRUNC) &&
330                            capa_opc_supported(&ocapa->c_capa, opc)) {
331                         found = 1; break;
332                 }
333         }
334
335         if (found) {
336                 LASSERT(lu_fid_eq(capa_fid(&ocapa->c_capa),
337                                   ll_inode2fid(inode)));
338                 LASSERT(ocapa->c_site == CAPA_SITE_CLIENT);
339
340                 capa_get(ocapa);
341
342                 DEBUG_CAPA(D_SEC, &ocapa->c_capa, "found client");
343         } else {
344                 ocapa = NULL;
345
346                 if (atomic_read(&ll_capa_debug)) {
347                         CERROR("no capability for "DFID" opc "LPX64"\n",
348                                PFID(&lli->lli_fid), opc);
349                         atomic_set(&ll_capa_debug, 0);
350                 }
351         }
352         spin_unlock(&capa_lock);
353
354         RETURN(ocapa);
355 }
356
357 struct obd_capa *ll_mdscapa_get(struct inode *inode)
358 {
359         struct ll_inode_info *lli = ll_i2info(inode);
360         struct obd_capa *ocapa;
361         ENTRY;
362
363         LASSERT(inode != NULL);
364         
365         if ((ll_i2sbi(inode)->ll_flags & LL_SBI_MDS_CAPA) == 0)
366                 RETURN(NULL);
367
368         spin_lock(&capa_lock);
369         ocapa = capa_get(lli->lli_mds_capa);
370         spin_unlock(&capa_lock);
371         
372         if (ocapa && !obd_capa_is_valid(ocapa)) {
373                 DEBUG_CAPA(D_ERROR, &ocapa->c_capa, "invalid (flags %d)",
374                            ocapa->c_flags);
375                 capa_put(ocapa);
376                 ocapa = NULL;
377         }
378
379         if (!ocapa && atomic_read(&ll_capa_debug)) {
380 #if 0
381                 LASSERT(!S_ISDIR(inode->i_mode));
382                 LASSERT(!obd_capa_open_count(ocapa));
383                 LASSERT(!ll_have_md_lock(ocapa->u.cli.inode,
384                                          MDS_INODELOCK_LOOKUP));
385 #endif
386                 atomic_set(&ll_capa_debug, 0);
387         }
388
389         RETURN(ocapa);
390 }
391
392 static inline int do_add_mds_capa(struct inode *inode, struct obd_capa **pcapa)
393 {
394         struct ll_inode_info *lli = ll_i2info(inode);
395         struct obd_capa *old = lli->lli_mds_capa;
396         struct obd_capa *ocapa = *pcapa;
397         int rc = 0;
398
399         if (!old) {
400                 ocapa->u.cli.inode = inode;
401                 lli->lli_mds_capa = ocapa;
402                 obd_capa_clear_new(ocapa);
403                 obd_capa_set_valid(ocapa);
404
405                 DEBUG_CAPA(D_SEC, &ocapa->c_capa, "add MDS");
406         } else {
407                 if (!memcmp(&old->c_capa, &ocapa->c_capa, sizeof(old->c_capa)))
408                 {
409                         rc = -EEXIST;
410                 } else {
411                         spin_lock(&old->c_lock);
412                         old->c_capa = ocapa->c_capa;
413                         obd_capa_set_valid(old);
414                         spin_unlock(&old->c_lock);
415
416                         DEBUG_CAPA(D_SEC, &old->c_capa, "update MDS");
417                 }
418
419                 free_capa(ocapa);
420                 *pcapa = old;
421         }
422
423         return rc;
424 }
425
426 static inline void inode_add_oss_capa(struct inode *inode,
427                                       struct obd_capa *ocapa)
428 {
429         struct ll_inode_info *lli = ll_i2info(inode);
430         struct obd_capa *tmp;
431         struct list_head *next = NULL;
432
433         /* capa is sorted in lli_oss_capas so lookup can always find the
434          * latest one */
435         list_for_each_entry(tmp, &lli->lli_oss_capas, u.cli.lli_list) {
436                 if (cfs_time_after(ocapa->c_expiry, tmp->c_expiry)) {
437                         next = &tmp->u.cli.lli_list;
438                         break;
439                 }
440         }
441         list_move_tail(&ocapa->u.cli.lli_list, next ?: &lli->lli_oss_capas);
442 }
443
444 static inline int do_add_oss_capa(struct inode *inode, struct obd_capa **pcapa)
445 {
446         struct obd_capa *old, *ocapa = *pcapa;
447         struct lustre_capa *capa = &ocapa->c_capa;
448         int rc = 0;
449
450         LASSERTF(S_ISREG(inode->i_mode),
451                  "inode has oss capa, but not regular file, mode: %d\n",
452                  inode->i_mode);
453
454         /* FIXME: can't replace it so easily with fine-grained opc */
455         old = do_lookup_oss_capa(inode, capa->lc_opc & CAPA_OPC_OSS_ONLY);
456         if (!old) {
457                 ocapa->u.cli.inode = inode;
458                 INIT_LIST_HEAD(&ocapa->u.cli.lli_list);
459                 obd_capa_set_valid(ocapa);
460
461                 DEBUG_CAPA(D_SEC, capa, "add OSS");
462         } else {
463                 if (old->c_capa.lc_expiry == capa->lc_expiry) {
464                         rc = -EEXIST;
465                 } else {
466                         spin_lock(&old->c_lock);
467                         old->c_capa = *capa;
468                         obd_capa_set_valid(old);
469                         spin_unlock(&old->c_lock);
470
471                         DEBUG_CAPA(D_SEC, capa, "update OSS");
472                 }
473
474                 free_capa(ocapa);
475                 *pcapa = old;
476         }
477
478         if (!rc)
479                 inode_add_oss_capa(inode, *pcapa);
480         return rc;
481 }
482
483 struct obd_capa *ll_add_capa(struct inode *inode, struct obd_capa *ocapa)
484 {
485         struct obd_capa **pcapa = &ocapa;
486         int rc;
487
488         spin_lock(&capa_lock);
489         rc = capa_for_mds(&ocapa->c_capa) ? do_add_mds_capa(inode, pcapa) :
490                                             do_add_oss_capa(inode, pcapa);
491
492         ocapa = *pcapa;
493         /* truncate capa won't renew, or no existed capa changed, don't update
494          * capa timer. */
495         if (!rc && ocapa->c_capa.lc_opc != CAPA_OPC_OSS_TRUNC) {
496                 spin_lock(&ocapa->c_lock);
497                 set_capa_expiry(ocapa);
498                 spin_unlock(&ocapa->c_lock);
499
500                 list_del_init(&ocapa->c_list);
501                 sort_add_capa(ocapa, ll_capa_list);
502
503                 update_capa_timer(ocapa, capa_renewal_time(ocapa));
504         }
505
506         atomic_set(&ll_capa_debug, 1);
507         spin_unlock(&capa_lock);
508
509         return ocapa;
510 }
511
512
513 int ll_update_capa(struct obd_capa *ocapa, struct lustre_capa *capa)
514 {
515         struct inode *inode = ocapa->u.cli.inode;
516         cfs_time_t expiry;
517         int rc = 0;
518
519         LASSERT(ocapa);
520
521         if (IS_ERR(capa)) {
522                 /* set error code */
523                 rc = PTR_ERR(capa);
524                 /* failed capa won't be renewed any longer, but if -EIO, client
525                  * might be doing recovery, retry in 1 min. */
526                 spin_lock(&capa_lock);
527                 if (rc == -EIO && !capa_is_expired(ocapa)) {
528                         expiry = jiffies + 60 * HZ;
529                         DEBUG_CAPA(D_SEC, &ocapa->c_capa,
530                                    "renewal failed: -EIO, retry in 1 min");
531                         goto retry;
532                 } else {
533                         if (rc == -ENOENT && !capa_is_to_expire(ocapa)) {
534                                 /* NB: in period of renewal, inode might be 
535                                  * deleted and then created, so actually ocapa
536                                  * is a completely new one! */
537                                 LASSERT(!list_empty(&ocapa->c_list));
538                         } else {
539                                 LASSERT(list_empty(&ocapa->c_list));
540                                 sort_add_capa(ocapa, &ll_idle_capas);
541                         }
542                 }
543                 spin_unlock(&capa_lock);
544
545                 DEBUG_CAPA(rc == -ENOENT ? D_SEC : D_ERROR, &ocapa->c_capa,
546                            "renewal failed(rc: %d) for", rc);
547                 goto out;
548         }
549
550         LASSERT(!memcmp(&ocapa->c_capa, capa,
551                         offsetof(struct lustre_capa, lc_flags)));
552
553         spin_lock(&ocapa->c_lock);
554         ocapa->c_capa = *capa;
555         set_capa_expiry(ocapa);
556         spin_unlock(&ocapa->c_lock);
557
558         spin_lock(&capa_lock);
559         if (capa->lc_opc & (CAPA_OPC_OSS_READ | CAPA_OPC_OSS_WRITE))
560                 inode_add_oss_capa(inode, ocapa);
561         DEBUG_CAPA(D_SEC, capa, "renew");
562
563         expiry = capa_renewal_time(ocapa);
564 retry:
565         sort_add_capa(ocapa, ll_capa_list);
566         update_capa_timer(ocapa, expiry);
567         spin_unlock(&capa_lock);
568
569 out:
570         capa_put(ocapa);
571         iput(inode);
572         return rc;
573 }
574
575 void ll_capa_open(struct inode *inode)
576 {
577         struct ll_inode_info *lli = ll_i2info(inode);
578
579         if ((ll_i2sbi(inode)->ll_flags & (LL_SBI_MDS_CAPA | LL_SBI_OSS_CAPA))
580             == 0)
581                 return;
582
583         if (!S_ISREG(inode->i_mode))
584                 return;
585
586         atomic_inc(&lli->lli_open_count);
587 }
588
589 void ll_capa_close(struct inode *inode)
590 {
591         struct ll_inode_info *lli = ll_i2info(inode);
592
593         if ((ll_i2sbi(inode)->ll_flags & (LL_SBI_MDS_CAPA | LL_SBI_OSS_CAPA))
594             == 0)
595                 return;
596
597         if (!S_ISREG(inode->i_mode))
598                 return;
599
600         atomic_dec(&lli->lli_open_count);
601 }
602
603 /* delete CAPA_OPC_OSS_TRUNC only */
604 void ll_truncate_free_capa(struct obd_capa *ocapa)
605 {
606         struct inode *inode;
607
608         if (!ocapa)
609                 return;
610
611         LASSERT(ocapa->c_capa.lc_opc & CAPA_OPC_OSS_TRUNC);
612         DEBUG_CAPA(D_SEC, &ocapa->c_capa, "release truncate");
613
614         inode = ocapa->u.cli.inode;
615
616         spin_lock(&capa_lock);
617         capa_put(ocapa);
618         ll_delete_capa(ocapa);
619         spin_unlock(&capa_lock);
620 }
621
622 void ll_clear_inode_capas(struct inode *inode)
623 {
624         struct ll_inode_info *lli = ll_i2info(inode);
625         struct obd_capa *ocapa, *tmp;
626
627         spin_lock(&capa_lock);
628         ocapa = lli->lli_mds_capa;
629         if (ocapa)
630                 ll_delete_capa(ocapa);
631                 
632         list_for_each_entry_safe(ocapa, tmp, &lli->lli_oss_capas,
633                                  u.cli.lli_list)
634                 ll_delete_capa(ocapa);
635         spin_unlock(&capa_lock);
636 }