Whamcloud - gitweb
c109778854d40f0de08e15d850a04cf08cb9f7bd
[fs/lustre-release.git] / lustre / llite / llite_capa.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  * Copyright (C) 2005 Cluster File Systems, Inc.
5  *
6  * Author: Lai Siyao <lsy@clusterfs.com>
7  *
8  *   This file is part of Lustre, http://www.lustre.org.
9  *
10  *   Lustre is free software; you can redistribute it and/or
11  *   modify it under the terms of version 2 of the GNU General Public
12  *   License as published by the Free Software Foundation.
13  *
14  *   Lustre is distributed in the hope that it will be useful,
15  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
16  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  *   GNU General Public License for more details.
18  *
19  *   You should have received a copy of the GNU General Public License
20  *   along with Lustre; if not, write to the Free Software
21  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22  */
23
24 #define DEBUG_SUBSYSTEM S_LLITE
25
26 #include <linux/fs.h>
27 #include <linux/version.h>
28 #include <asm/uaccess.h>
29 #include <linux/file.h>
30 #include <linux/kmod.h>
31
32 #include <lustre_lite.h>
33 #include "llite_internal.h"
34
35 /* for obd_capa.c_list, client capa might stay in three places:
36  * 1. ll_capa_list.
37  * 2. ll_idle_capas.
38  * 3. stand alone: just allocated.
39  */
40
41 /* capas for oss writeback and those failed to renew */
42 static LIST_HEAD(ll_idle_capas);
43 static struct ptlrpc_thread ll_capa_thread;
44 static struct list_head *ll_capa_list = &capa_list[CAPA_SITE_CLIENT];
45
46 /* llite capa renewal timer */
47 struct timer_list ll_capa_timer;
48 /* for debug: indicate whether capa on llite is enabled or not */
49 static atomic_t ll_capa_debug = ATOMIC_INIT(0);
50 static unsigned long long ll_capa_renewed = 0;
51 static unsigned long long ll_capa_renewal_noent = 0;
52 static unsigned long long ll_capa_renewal_failed = 0;
53 static unsigned long long ll_capa_renewal_retries = 0;
54
55 static inline void update_capa_timer(struct obd_capa *ocapa, cfs_time_t expiry)
56 {
57         if (time_before(expiry, ll_capa_timer.expires) ||
58             !timer_pending(&ll_capa_timer)) {
59                 mod_timer(&ll_capa_timer, expiry);
60                 DEBUG_CAPA(D_SEC, &ocapa->c_capa,
61                            "ll_capa_timer update: %lu/%lu by", expiry, jiffies);
62         }
63 }
64
65 static inline cfs_time_t capa_renewal_time(struct obd_capa *ocapa)
66 {
67         return cfs_time_sub(ocapa->c_expiry,
68                             cfs_time_seconds(ocapa->c_capa.lc_timeout) / 2);
69 }
70
71 static inline int capa_is_to_expire(struct obd_capa *ocapa)
72 {
73         return cfs_time_beforeq(capa_renewal_time(ocapa), cfs_time_current());
74 }
75
76 static inline int have_expired_capa(void)
77 {
78         struct obd_capa *ocapa = NULL;
79         int expired = 0;
80
81         /* if ll_capa_list has client capa to expire or ll_idle_capas has
82          * expired capa, return 1.
83          */
84         spin_lock(&capa_lock);
85         if (!list_empty(ll_capa_list)) {
86                 ocapa = list_entry(ll_capa_list->next, struct obd_capa, c_list);
87                 expired = capa_is_to_expire(ocapa);
88                 if (!expired)
89                         update_capa_timer(ocapa, capa_renewal_time(ocapa));
90         } else if (!list_empty(&ll_idle_capas)) {
91                 ocapa = list_entry(ll_idle_capas.next, struct obd_capa, c_list);
92                 expired = capa_is_expired(ocapa);
93                 if (!expired)
94                         update_capa_timer(ocapa, ocapa->c_expiry);
95         }
96         spin_unlock(&capa_lock);
97
98         if (expired)
99                 DEBUG_CAPA(D_SEC, &ocapa->c_capa, "expired");
100         return expired;
101 }
102
103 static inline int ll_capa_check_stop(void)
104 {
105         return (ll_capa_thread.t_flags & SVC_STOPPING) ? 1: 0;
106 }
107
108 static void sort_add_capa(struct obd_capa *ocapa, struct list_head *head)
109 {
110         struct obd_capa *tmp;
111         struct list_head *before = NULL;
112
113         /* TODO: client capa is sorted by expiry, this could be optimized */
114         list_for_each_entry_reverse(tmp, head, c_list) {
115                 if (cfs_time_aftereq(ocapa->c_expiry, tmp->c_expiry)) {
116                         before = &tmp->c_list;
117                         break;
118                 }
119         }
120
121         LASSERT(&ocapa->c_list != before);
122         list_add(&ocapa->c_list, before ?: head);
123 }
124
125 static inline int obd_capa_open_count(struct obd_capa *oc)
126 {
127         struct ll_inode_info *lli = ll_i2info(oc->u.cli.inode);
128         return atomic_read(&lli->lli_open_count);
129 }
130
131 static void ll_delete_capa(struct obd_capa *ocapa)
132 {
133         struct ll_inode_info *lli = ll_i2info(ocapa->u.cli.inode);
134
135         if (capa_for_mds(&ocapa->c_capa)) {
136                 LASSERT(lli->lli_mds_capa == ocapa);
137                 lli->lli_mds_capa = NULL;
138         } else if (capa_for_oss(&ocapa->c_capa)) {
139                 list_del_init(&ocapa->u.cli.lli_list);
140         }
141
142         DEBUG_CAPA(D_SEC, &ocapa->c_capa, "free client");
143         list_del(&ocapa->c_list);
144         capa_count[CAPA_SITE_CLIENT]--;
145         free_capa(ocapa);
146 }
147
148 /* three places where client capa is deleted:
149  * 1. capa_thread_main(), main place to delete expired capa.
150  * 2. ll_clear_inode_capas() in ll_clear_inode().
151  * 3. ll_truncate_free_capa() delete truncate capa explicitly in ll_truncate().
152  */
153 static int capa_thread_main(void *unused)
154 {
155         struct obd_capa *ocapa, *tmp, *next;
156         struct inode *inode = NULL;
157         struct l_wait_info lwi = { 0 };
158         int rc;
159         ENTRY;
160
161         cfs_daemonize("ll_capa");
162
163         ll_capa_thread.t_flags = SVC_RUNNING;
164         wake_up(&ll_capa_thread.t_ctl_waitq);
165
166         while (1) {
167                 l_wait_event(ll_capa_thread.t_ctl_waitq,
168                              (ll_capa_check_stop() || have_expired_capa()),
169                              &lwi);
170
171                 if (ll_capa_check_stop())
172                         break;
173
174                 next = NULL;
175
176                 spin_lock(&capa_lock);
177                 list_for_each_entry_safe(ocapa, tmp, ll_capa_list, c_list) {
178                         LASSERT(ocapa->c_capa.lc_opc != CAPA_OPC_OSS_TRUNC);
179
180                         if (!capa_is_to_expire(ocapa)) {
181                                 next = ocapa;
182                                 break;
183                         }
184
185                         list_del_init(&ocapa->c_list);
186
187                         /* for MDS capability, only renew those which belong to
188                          * dir, or its inode is opened, or client holds LOOKUP
189                          * lock.
190                          */
191                         if (capa_for_mds(&ocapa->c_capa) &&
192                             !S_ISDIR(ocapa->u.cli.inode->i_mode) &&
193                             obd_capa_open_count(ocapa) == 0 &&
194                             !ll_have_md_lock(ocapa->u.cli.inode,
195                                              MDS_INODELOCK_LOOKUP)) {
196                                 DEBUG_CAPA(D_SEC, &ocapa->c_capa,
197                                            "skip renewal for");
198                                 sort_add_capa(ocapa, &ll_idle_capas);
199                                 continue;
200                         }
201
202                         /* for OSS capability, only renew those whose inode is
203                          * opened.
204                          */
205                         if (capa_for_oss(&ocapa->c_capa) &&
206                             obd_capa_open_count(ocapa) == 0) {
207                                 /* oss capa with open count == 0 won't renew,
208                                  * move to idle list */
209                                 sort_add_capa(ocapa, &ll_idle_capas);
210                                 continue;
211                         }
212
213                         /* NB iput() is in ll_update_capa() */
214                         inode = igrab(ocapa->u.cli.inode);
215                         if (inode == NULL) {
216                                 DEBUG_CAPA(D_ERROR, &ocapa->c_capa,
217                                            "igrab failed for");
218                                 continue;
219                         }
220
221                         capa_get(ocapa);
222                         ll_capa_renewed++;
223                         spin_unlock(&capa_lock);
224
225                         rc = md_renew_capa(ll_i2mdexp(inode), ocapa,
226                                            ll_update_capa);
227                         spin_lock(&capa_lock);
228                         if (rc) {
229                                 DEBUG_CAPA(D_ERROR, &ocapa->c_capa,
230                                            "renew failed: %d", rc);
231                                 ll_capa_renewal_failed++;
232                         }
233                 }
234
235                 if (next)
236                         update_capa_timer(next, capa_renewal_time(next));
237
238                 list_for_each_entry_safe(ocapa, tmp, &ll_idle_capas, c_list) {
239                         if (!capa_is_expired(ocapa)) {
240                                 if (!next)
241                                         update_capa_timer(ocapa, ocapa->c_expiry);
242                                 break;
243                         }
244
245                         if (atomic_read(&ocapa->c_refc)) {
246                                 DEBUG_CAPA(D_SEC, &ocapa->c_capa,
247                                            "expired(c_refc %d), don't release",
248                                            atomic_read(&ocapa->c_refc));
249                                 /* don't try to renew any more */
250                                 list_del_init(&ocapa->c_list);
251                                 continue;
252                         }
253
254                         /* expired capa is released. */
255                         DEBUG_CAPA(D_SEC, &ocapa->c_capa, "release expired");
256                         ll_delete_capa(ocapa);
257                 }
258
259                 spin_unlock(&capa_lock);
260         }
261
262         ll_capa_thread.t_flags = SVC_STOPPED;
263         wake_up(&ll_capa_thread.t_ctl_waitq);
264         RETURN(0);
265 }
266
267 void ll_capa_timer_callback(unsigned long unused)
268 {
269         wake_up(&ll_capa_thread.t_ctl_waitq);
270 }
271
272 int ll_capa_thread_start(void)
273 {
274         int rc;
275         ENTRY;
276
277         init_waitqueue_head(&ll_capa_thread.t_ctl_waitq);
278
279         rc = kernel_thread(capa_thread_main, NULL, 0);
280         if (rc < 0) {
281                 CERROR("cannot start expired capa thread: rc %d\n", rc);
282                 RETURN(rc);
283         }
284         wait_event(ll_capa_thread.t_ctl_waitq,
285                    ll_capa_thread.t_flags & SVC_RUNNING);
286
287         RETURN(0);
288 }
289
290 void ll_capa_thread_stop(void)
291 {
292         ll_capa_thread.t_flags = SVC_STOPPING;
293         wake_up(&ll_capa_thread.t_ctl_waitq);
294         wait_event(ll_capa_thread.t_ctl_waitq,
295                    ll_capa_thread.t_flags & SVC_STOPPED);
296 }
297
298 static struct obd_capa *do_lookup_oss_capa(struct inode *inode, int opc)
299 {
300         struct ll_inode_info *lli = ll_i2info(inode);
301         struct obd_capa *ocapa;
302
303         /* inside capa_lock */
304         list_for_each_entry(ocapa, &lli->lli_oss_capas, u.cli.lli_list) {
305                 if ((capa_opc(&ocapa->c_capa) & opc) != opc)
306                         continue;
307
308                 LASSERT(lu_fid_eq(capa_fid(&ocapa->c_capa),
309                                   ll_inode2fid(inode)));
310                 LASSERT(ocapa->c_site == CAPA_SITE_CLIENT);
311
312                 DEBUG_CAPA(D_SEC, &ocapa->c_capa, "found client");
313                 return ocapa;
314         }
315
316         return NULL;
317 }
318
319 struct obd_capa *ll_osscapa_get(struct inode *inode, __u64 opc)
320 {
321         struct ll_inode_info *lli = ll_i2info(inode);
322         struct obd_capa *ocapa;
323         int found = 0;
324
325         if ((ll_i2sbi(inode)->ll_flags & LL_SBI_OSS_CAPA) == 0)
326                 return NULL;
327         ENTRY;
328
329         LASSERT(opc == CAPA_OPC_OSS_WRITE || opc == CAPA_OPC_OSS_RW ||
330                 opc == CAPA_OPC_OSS_TRUNC);
331
332         spin_lock(&capa_lock);
333         list_for_each_entry(ocapa, &lli->lli_oss_capas, u.cli.lli_list) {
334                 if (capa_is_expired(ocapa))
335                         continue;
336                 if ((opc & CAPA_OPC_OSS_WRITE) &&
337                     capa_opc_supported(&ocapa->c_capa, CAPA_OPC_OSS_WRITE)) {
338                         found = 1; break;
339                 } else if ((opc & CAPA_OPC_OSS_READ) &&
340                            capa_opc_supported(&ocapa->c_capa,
341                                               CAPA_OPC_OSS_READ)) {
342                         found = 1; break;
343                 } else if ((opc & CAPA_OPC_OSS_TRUNC) &&
344                            capa_opc_supported(&ocapa->c_capa, opc)) {
345                         found = 1; break;
346                 }
347         }
348
349         if (found) {
350                 LASSERT(lu_fid_eq(capa_fid(&ocapa->c_capa),
351                                   ll_inode2fid(inode)));
352                 LASSERT(ocapa->c_site == CAPA_SITE_CLIENT);
353
354                 capa_get(ocapa);
355
356                 DEBUG_CAPA(D_SEC, &ocapa->c_capa, "found client");
357         } else {
358                 ocapa = NULL;
359
360                 if (atomic_read(&ll_capa_debug)) {
361                         CERROR("no capability for "DFID" opc "LPX64"\n",
362                                PFID(&lli->lli_fid), opc);
363                         atomic_set(&ll_capa_debug, 0);
364                 }
365         }
366         spin_unlock(&capa_lock);
367
368         RETURN(ocapa);
369 }
370
371 struct obd_capa *ll_mdscapa_get(struct inode *inode)
372 {
373         struct ll_inode_info *lli = ll_i2info(inode);
374         struct obd_capa *ocapa;
375         ENTRY;
376
377         LASSERT(inode != NULL);
378         
379         if ((ll_i2sbi(inode)->ll_flags & LL_SBI_MDS_CAPA) == 0)
380                 RETURN(NULL);
381
382         spin_lock(&capa_lock);
383         ocapa = capa_get(lli->lli_mds_capa);
384         spin_unlock(&capa_lock);
385         if (!ocapa && atomic_read(&ll_capa_debug)) {
386                 CERROR("no mds capability for "DFID"\n", PFID(&lli->lli_fid));
387                 atomic_set(&ll_capa_debug, 0);
388         }
389
390         RETURN(ocapa);
391 }
392
393 static struct obd_capa *do_add_mds_capa(struct inode *inode,
394                                         struct obd_capa *ocapa)
395 {
396         struct ll_inode_info *lli = ll_i2info(inode);
397         struct obd_capa *old = lli->lli_mds_capa;
398         struct lustre_capa *capa = &ocapa->c_capa;
399
400         if (!old) {
401                 ocapa->u.cli.inode = inode;
402                 lli->lli_mds_capa = ocapa;
403                 capa_count[CAPA_SITE_CLIENT]++;
404
405                 DEBUG_CAPA(D_SEC, capa, "add MDS");
406         } else {
407                 spin_lock(&old->c_lock);
408                 old->c_capa = *capa;
409                 spin_unlock(&old->c_lock);
410
411                 DEBUG_CAPA(D_SEC, capa, "update MDS");
412
413                 free_capa(ocapa);
414                 ocapa = old;
415         }
416         return ocapa;
417 }
418
419 static inline void inode_add_oss_capa(struct inode *inode,
420                                       struct obd_capa *ocapa)
421 {
422         struct ll_inode_info *lli = ll_i2info(inode);
423         struct obd_capa *tmp;
424         struct list_head *next = NULL;
425
426         /* capa is sorted in lli_oss_capas so lookup can always find the
427          * latest one */
428         list_for_each_entry(tmp, &lli->lli_oss_capas, u.cli.lli_list) {
429                 if (cfs_time_after(ocapa->c_expiry, tmp->c_expiry)) {
430                         next = &tmp->u.cli.lli_list;
431                         break;
432                 }
433         }
434         LASSERT(&ocapa->u.cli.lli_list != next);
435         list_move_tail(&ocapa->u.cli.lli_list, next ?: &lli->lli_oss_capas);
436 }
437
438 static struct obd_capa *do_add_oss_capa(struct inode *inode,
439                                         struct obd_capa *ocapa)
440 {
441         struct obd_capa *old;
442         struct lustre_capa *capa = &ocapa->c_capa;
443
444         LASSERTF(S_ISREG(inode->i_mode),
445                  "inode has oss capa, but not regular file, mode: %d\n",
446                  inode->i_mode);
447
448         /* FIXME: can't replace it so easily with fine-grained opc */
449         old = do_lookup_oss_capa(inode, capa_opc(capa) & CAPA_OPC_OSS_ONLY);
450         if (!old) {
451                 ocapa->u.cli.inode = inode;
452                 INIT_LIST_HEAD(&ocapa->u.cli.lli_list);
453                 capa_count[CAPA_SITE_CLIENT]++;
454
455                 DEBUG_CAPA(D_SEC, capa, "add OSS");
456         } else {
457                 spin_lock(&old->c_lock);
458                 old->c_capa = *capa;
459                 spin_unlock(&old->c_lock);
460
461                 DEBUG_CAPA(D_SEC, capa, "update OSS");
462
463                 free_capa(ocapa);
464                 ocapa = old;
465         }
466
467         inode_add_oss_capa(inode, ocapa);
468         return ocapa;
469 }
470
471 struct obd_capa *ll_add_capa(struct inode *inode, struct obd_capa *ocapa)
472 {
473         spin_lock(&capa_lock);
474         ocapa = capa_for_mds(&ocapa->c_capa) ? do_add_mds_capa(inode, ocapa) :
475                                                do_add_oss_capa(inode, ocapa);
476
477         /* truncate capa won't renew */
478         if (ocapa->c_capa.lc_opc != CAPA_OPC_OSS_TRUNC) {
479                 set_capa_expiry(ocapa);
480                 list_del(&ocapa->c_list);
481                 sort_add_capa(ocapa, ll_capa_list);
482
483                 update_capa_timer(ocapa, capa_renewal_time(ocapa));
484         }
485
486         spin_unlock(&capa_lock);
487
488         atomic_set(&ll_capa_debug, 1);
489         return ocapa;
490 }
491
492 static inline void delay_capa_renew(struct obd_capa *oc, cfs_time_t delay)
493 {
494         /* NB: set a fake expiry for this capa to prevent it renew too soon */
495         oc->c_expiry = cfs_time_add(oc->c_expiry, cfs_time_seconds(delay));
496 }
497
498 int ll_update_capa(struct obd_capa *ocapa, struct lustre_capa *capa)
499 {
500         struct inode *inode = ocapa->u.cli.inode;
501         int rc = 0;
502         ENTRY;
503
504         LASSERT(ocapa);
505
506         if (IS_ERR(capa)) {
507                 /* set error code */
508                 rc = PTR_ERR(capa);
509                 spin_lock(&capa_lock);
510                 if (rc == -ENOENT) {
511                         DEBUG_CAPA(D_SEC, &ocapa->c_capa,
512                                    "renewal canceled because object removed");
513                         ll_capa_renewal_noent++;
514                 } else {
515                         ll_capa_renewal_failed++;
516
517                         /* failed capa won't be renewed any longer, but if -EIO,
518                          * client might be doing recovery, retry in 2 min. */
519                         if (rc == -EIO && !capa_is_expired(ocapa)) {
520                                 delay_capa_renew(ocapa, 120);
521                                 DEBUG_CAPA(D_ERROR, &ocapa->c_capa,
522                                            "renewal failed: -EIO, retry in 2 mins");
523                                 ll_capa_renewal_retries++;
524                                 GOTO(retry, rc);
525                         } else {
526                                 DEBUG_CAPA(D_ERROR, &ocapa->c_capa,
527                                            "renewal failed(rc: %d) for", rc);
528                         }
529                 }
530
531                 list_del(&ocapa->c_list);
532                 sort_add_capa(ocapa, &ll_idle_capas);
533                 spin_unlock(&capa_lock);
534
535                 capa_put(ocapa);
536                 iput(inode);
537                 return rc;
538         }
539
540         spin_lock(&ocapa->c_lock);
541         LASSERT(!memcmp(&ocapa->c_capa, capa,
542                         offsetof(struct lustre_capa, lc_flags)));
543         ocapa->c_capa = *capa;
544         set_capa_expiry(ocapa);
545         spin_unlock(&ocapa->c_lock);
546
547         spin_lock(&capa_lock);
548         if (capa_for_oss(capa))
549                 inode_add_oss_capa(inode, ocapa);
550         DEBUG_CAPA(D_SEC, capa, "renew");
551         EXIT;
552 retry:
553         list_del_init(&ocapa->c_list);
554         sort_add_capa(ocapa, ll_capa_list);
555         update_capa_timer(ocapa, capa_renewal_time(ocapa));
556         spin_unlock(&capa_lock);
557
558         capa_put(ocapa);
559         iput(inode);
560         return rc;
561 }
562
563 void ll_capa_open(struct inode *inode)
564 {
565         struct ll_inode_info *lli = ll_i2info(inode);
566
567         if ((ll_i2sbi(inode)->ll_flags & (LL_SBI_MDS_CAPA | LL_SBI_OSS_CAPA))
568             == 0)
569                 return;
570
571         if (!S_ISREG(inode->i_mode))
572                 return;
573
574         atomic_inc(&lli->lli_open_count);
575 }
576
577 void ll_capa_close(struct inode *inode)
578 {
579         struct ll_inode_info *lli = ll_i2info(inode);
580
581         if ((ll_i2sbi(inode)->ll_flags & (LL_SBI_MDS_CAPA | LL_SBI_OSS_CAPA))
582             == 0)
583                 return;
584
585         if (!S_ISREG(inode->i_mode))
586                 return;
587
588         atomic_dec(&lli->lli_open_count);
589 }
590
591 /* delete CAPA_OPC_OSS_TRUNC only */
592 void ll_truncate_free_capa(struct obd_capa *ocapa)
593 {
594         if (!ocapa)
595                 return;
596
597         LASSERT(ocapa->c_capa.lc_opc & CAPA_OPC_OSS_TRUNC);
598         DEBUG_CAPA(D_SEC, &ocapa->c_capa, "free truncate");
599
600         capa_put(ocapa);
601         spin_lock(&capa_lock);
602         ll_delete_capa(ocapa);
603         spin_unlock(&capa_lock);
604 }
605
606 void ll_clear_inode_capas(struct inode *inode)
607 {
608         struct ll_inode_info *lli = ll_i2info(inode);
609         struct obd_capa *ocapa, *tmp;
610
611         spin_lock(&capa_lock);
612         ocapa = lli->lli_mds_capa;
613         if (ocapa)
614                 ll_delete_capa(ocapa);
615                 
616         list_for_each_entry_safe(ocapa, tmp, &lli->lli_oss_capas,
617                                  u.cli.lli_list)
618                 ll_delete_capa(ocapa);
619         spin_unlock(&capa_lock);
620 }
621
622 void ll_print_capa_stat(struct ll_sb_info *sbi)
623 {
624         if (sbi->ll_flags & (LL_SBI_MDS_CAPA | LL_SBI_OSS_CAPA))
625                 LCONSOLE_INFO("Fid capabilities renewed: %llu\n"
626                               "Fid capabilities renewal ENOENT: %llu\n"
627                               "Fid capabilities failed to renew: %llu\n"
628                               "Fid capabilities renewal retries: %llu\n",
629                               ll_capa_renewed, ll_capa_renewal_noent,
630                               ll_capa_renewal_failed, ll_capa_renewal_retries);
631 }