Whamcloud - gitweb
libext2fs: zero blocks via FALLOC_FL_ZERO_RANGE in ext2fs_zero_blocks
[tools/e2fsprogs.git] / lib / ext2fs / unix_io.c
1 /*
2  * unix_io.c --- This is the Unix (well, really POSIX) implementation
3  *      of the I/O manager.
4  *
5  * Implements a small block cache (CACHE_SIZE entries), optionally write-through.
6  *
7  * Includes support for Windows NT under Cygwin.
8  *
9  * Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
10  *      2002 by Theodore Ts'o.
11  *
12  * %Begin-Header%
13  * This file may be redistributed under the terms of the GNU Library
14  * General Public License, version 2.
15  * %End-Header%
16  */
17
18 #define _XOPEN_SOURCE 600
19 #define _DARWIN_C_SOURCE
20 #define _FILE_OFFSET_BITS 64
21 #define _LARGEFILE_SOURCE
22 #define _LARGEFILE64_SOURCE
23 #ifndef _GNU_SOURCE
24 #define _GNU_SOURCE
25 #endif
26
27 #include "config.h"
28 #include <stdio.h>
29 #include <string.h>
30 #if HAVE_UNISTD_H
31 #include <unistd.h>
32 #endif
33 #if HAVE_ERRNO_H
34 #include <errno.h>
35 #endif
36 #include <fcntl.h>
37 #include <time.h>
38 #ifdef __linux__
39 #include <sys/utsname.h>
40 #endif
41 #if HAVE_SYS_TYPES_H
42 #include <sys/types.h>
43 #endif
44 #ifdef HAVE_SYS_IOCTL_H
45 #include <sys/ioctl.h>
46 #endif
47 #ifdef HAVE_SYS_MOUNT_H
48 #include <sys/mount.h>
49 #endif
50 #if HAVE_SYS_STAT_H
51 #include <sys/stat.h>
52 #endif
53 #if HAVE_SYS_RESOURCE_H
54 #include <sys/resource.h>
55 #endif
56 #if HAVE_LINUX_FALLOC_H
57 #include <linux/falloc.h>
58 #endif
59
60 #if defined(__linux__) && defined(_IO) && !defined(BLKROGET)
61 #define BLKROGET   _IO(0x12, 94) /* Get read-only status (0 = read_write).  */
62 #endif
63
64 #undef ALIGN_DEBUG
65
66 #include "ext2_fs.h"
67 #include "ext2fs.h"
68
/*
 * For checking structure magic numbers...
 *
 * Returns `code` from the enclosing function when (struct)->magic does
 * not equal `code`.  The body is wrapped in do { } while (0) so that the
 * macro expands to a single statement and cannot capture an `else` that
 * follows it.
 */

#define EXT2_CHECK_MAGIC(struct, code) \
          do { \
                  if ((struct)->magic != (code)) \
                          return (code); \
          } while (0)
75
/*
 * One slot of the per-channel block cache.
 */
struct unix_cache {
        char                    *buf;           /* block contents; allocated in alloc_cache() */
        unsigned long long      block;          /* block number whose data is held in buf */
        int                     access_time;    /* channel access counter at last use; lowest value is recycled first */
        unsigned                dirty:1;        /* buf still has to be written to the device */
        unsigned                in_use:1;       /* slot currently holds a block */
};
83
/* Number of slots in struct unix_private_data's cache[] array. */
#define CACHE_SIZE 8
/* Writes of more than this many blocks bypass the cache (see unix_write_blk64). */
#define WRITE_DIRECT_SIZE 4     /* Must be smaller than CACHE_SIZE */
/* Sizes the reuse[] array in unix_read_blk64(). */
#define READ_DIRECT_SIZE 4      /* Should be smaller than CACHE_SIZE */
87
/*
 * Per-channel state of the Unix I/O manager; stored in the io_channel's
 * private_data pointer by unix_open().
 */
struct unix_private_data {
        int     magic;          /* EXT2_ET_MAGIC_UNIX_IO_CHANNEL; verified by EXT2_CHECK_MAGIC */
        int     dev;            /* file descriptor returned by ext2fs_open_file(); -1 until opened */
        int     flags;          /* IO_FLAG_* value passed to unix_open() */
        int     align;          /* NOTE(review): not referenced in the code visible here; channel->align is used instead */
        int     access_time;    /* counter incremented on every cache hit or slot reuse */
        ext2_loff_t offset;     /* byte offset added to every I/O location; set via the "offset" option */
        struct unix_cache cache[CACHE_SIZE];    /* the block cache slots */
        void    *bounce;        /* one-block buffer for unaligned I/O; only allocated when channel->align is set */
        struct struct_io_stats io_stats;        /* byte counters handed out by unix_get_stats() */
};
99
/*
 * True when n (a pointer or an integer) is a multiple of align, which is
 * expected to be a power of two.  Both arguments are parenthesized so
 * that an expression such as `ptr + 1` is fully evaluated before the
 * cast is applied.
 */
#define IS_ALIGNED(n, align) ((((unsigned long) (n)) & \
                               ((unsigned long) ((align)-1))) == 0)
102
103 static errcode_t unix_get_stats(io_channel channel, io_stats *stats)
104 {
105         errcode_t       retval = 0;
106
107         struct unix_private_data *data;
108
109         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
110         data = (struct unix_private_data *) channel->private_data;
111         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
112
113         if (stats)
114                 *stats = &data->io_stats;
115
116         return retval;
117 }
118
119 /*
120  * Here are the raw I/O functions
121  */
122 static errcode_t raw_read_blk(io_channel channel,
123                               struct unix_private_data *data,
124                               unsigned long long block,
125                               int count, void *bufv)
126 {
127         errcode_t       retval;
128         ssize_t         size;
129         ext2_loff_t     location;
130         int             actual = 0;
131         unsigned char   *buf = bufv;
132
133         size = (count < 0) ? -count : count * channel->block_size;
134         data->io_stats.bytes_read += size;
135         location = ((ext2_loff_t) block * channel->block_size) + data->offset;
136
137 #ifdef HAVE_PREAD64
138         /* Try an aligned pread */
139         if ((channel->align == 0) ||
140             (IS_ALIGNED(buf, channel->align) &&
141              IS_ALIGNED(size, channel->align))) {
142                 actual = pread64(data->dev, buf, size, location);
143                 if (actual == size)
144                         return 0;
145         }
146 #elif HAVE_PREAD
147         /* Try an aligned pread */
148         if ((sizeof(off_t) >= sizeof(ext2_loff_t)) &&
149             ((channel->align == 0) ||
150              (IS_ALIGNED(buf, channel->align) &&
151               IS_ALIGNED(size, channel->align)))) {
152                 actual = pread(data->dev, buf, size, location);
153                 if (actual == size)
154                         return 0;
155         }
156 #endif /* HAVE_PREAD */
157
158         if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
159                 retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
160                 goto error_out;
161         }
162         if ((channel->align == 0) ||
163             (IS_ALIGNED(buf, channel->align) &&
164              IS_ALIGNED(size, channel->align))) {
165                 actual = read(data->dev, buf, size);
166                 if (actual != size) {
167                 short_read:
168                         if (actual < 0)
169                                 actual = 0;
170                         retval = EXT2_ET_SHORT_READ;
171                         goto error_out;
172                 }
173                 return 0;
174         }
175
176 #ifdef ALIGN_DEBUG
177         printf("raw_read_blk: O_DIRECT fallback: %p %lu\n", buf,
178                (unsigned long) size);
179 #endif
180
181         /*
182          * The buffer or size which we're trying to read isn't aligned
183          * to the O_DIRECT rules, so we need to do this the hard way...
184          */
185         while (size > 0) {
186                 actual = read(data->dev, data->bounce, channel->block_size);
187                 if (actual != channel->block_size)
188                         goto short_read;
189                 actual = size;
190                 if (size > channel->block_size)
191                         actual = channel->block_size;
192                 memcpy(buf, data->bounce, actual);
193                 size -= actual;
194                 buf += actual;
195         }
196         return 0;
197
198 error_out:
199         memset((char *) buf+actual, 0, size-actual);
200         if (channel->read_error)
201                 retval = (channel->read_error)(channel, block, count, buf,
202                                                size, actual, retval);
203         return retval;
204 }
205
206 static errcode_t raw_write_blk(io_channel channel,
207                                struct unix_private_data *data,
208                                unsigned long long block,
209                                int count, const void *bufv)
210 {
211         ssize_t         size;
212         ext2_loff_t     location;
213         int             actual = 0;
214         errcode_t       retval;
215         const unsigned char *buf = bufv;
216
217         if (count == 1)
218                 size = channel->block_size;
219         else {
220                 if (count < 0)
221                         size = -count;
222                 else
223                         size = count * channel->block_size;
224         }
225         data->io_stats.bytes_written += size;
226
227         location = ((ext2_loff_t) block * channel->block_size) + data->offset;
228
229 #ifdef HAVE_PWRITE64
230         /* Try an aligned pwrite */
231         if ((channel->align == 0) ||
232             (IS_ALIGNED(buf, channel->align) &&
233              IS_ALIGNED(size, channel->align))) {
234                 actual = pwrite64(data->dev, buf, size, location);
235                 if (actual == size)
236                         return 0;
237         }
238 #elif HAVE_PWRITE
239         /* Try an aligned pwrite */
240         if ((sizeof(off_t) >= sizeof(ext2_loff_t)) &&
241             ((channel->align == 0) ||
242              (IS_ALIGNED(buf, channel->align) &&
243               IS_ALIGNED(size, channel->align)))) {
244                 actual = pwrite(data->dev, buf, size, location);
245                 if (actual == size)
246                         return 0;
247         }
248 #endif /* HAVE_PWRITE */
249
250         if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
251                 retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
252                 goto error_out;
253         }
254
255         if ((channel->align == 0) ||
256             (IS_ALIGNED(buf, channel->align) &&
257              IS_ALIGNED(size, channel->align))) {
258                 actual = write(data->dev, buf, size);
259                 if (actual != size) {
260                 short_write:
261                         retval = EXT2_ET_SHORT_WRITE;
262                         goto error_out;
263                 }
264                 return 0;
265         }
266
267 #ifdef ALIGN_DEBUG
268         printf("raw_write_blk: O_DIRECT fallback: %p %lu\n", buf,
269                (unsigned long) size);
270 #endif
271         /*
272          * The buffer or size which we're trying to write isn't aligned
273          * to the O_DIRECT rules, so we need to do this the hard way...
274          */
275         while (size > 0) {
276                 if (size < channel->block_size) {
277                         actual = read(data->dev, data->bounce,
278                                       channel->block_size);
279                         if (actual != channel->block_size) {
280                                 retval = EXT2_ET_SHORT_READ;
281                                 goto error_out;
282                         }
283                 }
284                 actual = size;
285                 if (size > channel->block_size)
286                         actual = channel->block_size;
287                 memcpy(data->bounce, buf, actual);
288                 actual = write(data->dev, data->bounce, channel->block_size);
289                 if (actual != channel->block_size)
290                         goto short_write;
291                 size -= actual;
292                 buf += actual;
293         }
294         return 0;
295
296 error_out:
297         if (channel->write_error)
298                 retval = (channel->write_error)(channel, block, count, buf,
299                                                 size, actual, retval);
300         return retval;
301 }
302
303
304 /*
305  * Here we implement the cache functions
306  */
307
308 /* Allocate the cache buffers */
309 static errcode_t alloc_cache(io_channel channel,
310                              struct unix_private_data *data)
311 {
312         errcode_t               retval;
313         struct unix_cache       *cache;
314         int                     i;
315
316         data->access_time = 0;
317         for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
318                 cache->block = 0;
319                 cache->access_time = 0;
320                 cache->dirty = 0;
321                 cache->in_use = 0;
322                 if (cache->buf)
323                         ext2fs_free_mem(&cache->buf);
324                 retval = io_channel_alloc_buf(channel, 0, &cache->buf);
325                 if (retval)
326                         return retval;
327         }
328         if (channel->align) {
329                 if (data->bounce)
330                         ext2fs_free_mem(&data->bounce);
331                 retval = io_channel_alloc_buf(channel, 0, &data->bounce);
332         }
333         return retval;
334 }
335
336 /* Free the cache buffers */
337 static void free_cache(struct unix_private_data *data)
338 {
339         struct unix_cache       *cache;
340         int                     i;
341
342         data->access_time = 0;
343         for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
344                 cache->block = 0;
345                 cache->access_time = 0;
346                 cache->dirty = 0;
347                 cache->in_use = 0;
348                 if (cache->buf)
349                         ext2fs_free_mem(&cache->buf);
350         }
351         if (data->bounce)
352                 ext2fs_free_mem(&data->bounce);
353 }
354
355 #ifndef NO_IO_CACHE
356 /*
357  * Try to find a block in the cache.  If the block is not found, and
358  * eldest is a non-zero pointer, then fill in eldest with the cache
359  * entry to that should be reused.
360  */
361 static struct unix_cache *find_cached_block(struct unix_private_data *data,
362                                             unsigned long long block,
363                                             struct unix_cache **eldest)
364 {
365         struct unix_cache       *cache, *unused_cache, *oldest_cache;
366         int                     i;
367
368         unused_cache = oldest_cache = 0;
369         for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
370                 if (!cache->in_use) {
371                         if (!unused_cache)
372                                 unused_cache = cache;
373                         continue;
374                 }
375                 if (cache->block == block) {
376                         cache->access_time = ++data->access_time;
377                         return cache;
378                 }
379                 if (!oldest_cache ||
380                     (cache->access_time < oldest_cache->access_time))
381                         oldest_cache = cache;
382         }
383         if (eldest)
384                 *eldest = (unused_cache) ? unused_cache : oldest_cache;
385         return 0;
386 }
387
388 /*
389  * Reuse a particular cache entry for another block.
390  */
391 static void reuse_cache(io_channel channel, struct unix_private_data *data,
392                  struct unix_cache *cache, unsigned long long block)
393 {
394         if (cache->dirty && cache->in_use)
395                 raw_write_blk(channel, data, cache->block, 1, cache->buf);
396
397         cache->in_use = 1;
398         cache->dirty = 0;
399         cache->block = block;
400         cache->access_time = ++data->access_time;
401 }
402
403 /*
404  * Flush all of the blocks in the cache
405  */
406 static errcode_t flush_cached_blocks(io_channel channel,
407                                      struct unix_private_data *data,
408                                      int invalidate)
409
410 {
411         struct unix_cache       *cache;
412         errcode_t               retval, retval2;
413         int                     i;
414
415         retval2 = 0;
416         for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
417                 if (!cache->in_use)
418                         continue;
419
420                 if (invalidate)
421                         cache->in_use = 0;
422
423                 if (!cache->dirty)
424                         continue;
425
426                 retval = raw_write_blk(channel, data,
427                                        cache->block, 1, cache->buf);
428                 if (retval)
429                         retval2 = retval;
430                 else
431                         cache->dirty = 0;
432         }
433         return retval2;
434 }
435 #endif /* NO_IO_CACHE */
436
437 #ifdef __linux__
438 #ifndef BLKDISCARDZEROES
439 #define BLKDISCARDZEROES _IO(0x12,124)
440 #endif
441 #endif
442
/*
 * Open pathname with the large-file variant of open() when one is
 * available.  The mode argument is only forwarded when it is non-zero.
 * Returns the file descriptor, or a negative value as returned by
 * open()/open64() on failure.
 */
int ext2fs_open_file(const char *pathname, int flags, mode_t mode)
{
#if defined(HAVE_OPEN64) && !defined(__OSX_AVAILABLE_BUT_DEPRECATED)
        return mode ? open64(pathname, flags, mode) :
                      open64(pathname, flags);
#else
        return mode ? open(pathname, flags, mode) :
                      open(pathname, flags);
#endif
}
456
457 int ext2fs_stat(const char *path, ext2fs_struct_stat *buf)
458 {
459 #if defined(HAVE_FSTAT64) && !defined(__OSX_AVAILABLE_BUT_DEPRECATED)
460         return stat64(path, buf);
461 #else
462         return stat(path, buf);
463 #endif
464 }
465
466 int ext2fs_fstat(int fd, ext2fs_struct_stat *buf)
467 {
468 #if defined(HAVE_FSTAT64) && !defined(__OSX_AVAILABLE_BUT_DEPRECATED)
469         return fstat64(fd, buf);
470 #else
471         return fstat(fd, buf);
472 #endif
473 }
474
475 static errcode_t unix_open(const char *name, int flags, io_channel *channel)
476 {
477         io_channel      io = NULL;
478         struct unix_private_data *data = NULL;
479         errcode_t       retval;
480         int             open_flags;
481         int             f_nocache = 0;
482         ext2fs_struct_stat st;
483 #ifdef __linux__
484         struct          utsname ut;
485 #endif
486
487         if (name == 0)
488                 return EXT2_ET_BAD_DEVICE_NAME;
489         retval = ext2fs_get_mem(sizeof(struct struct_io_channel), &io);
490         if (retval)
491                 goto cleanup;
492         memset(io, 0, sizeof(struct struct_io_channel));
493         io->magic = EXT2_ET_MAGIC_IO_CHANNEL;
494         retval = ext2fs_get_mem(sizeof(struct unix_private_data), &data);
495         if (retval)
496                 goto cleanup;
497
498         io->manager = unix_io_manager;
499         retval = ext2fs_get_mem(strlen(name)+1, &io->name);
500         if (retval)
501                 goto cleanup;
502
503         strcpy(io->name, name);
504         io->private_data = data;
505         io->block_size = 1024;
506         io->read_error = 0;
507         io->write_error = 0;
508         io->refcount = 1;
509
510         memset(data, 0, sizeof(struct unix_private_data));
511         data->magic = EXT2_ET_MAGIC_UNIX_IO_CHANNEL;
512         data->io_stats.num_fields = 2;
513         data->dev = -1;
514
515         open_flags = (flags & IO_FLAG_RW) ? O_RDWR : O_RDONLY;
516         if (flags & IO_FLAG_EXCLUSIVE)
517                 open_flags |= O_EXCL;
518 #if defined(O_DIRECT)
519         if (flags & IO_FLAG_DIRECT_IO) {
520                 open_flags |= O_DIRECT;
521                 io->align = ext2fs_get_dio_alignment(data->dev);
522         }
523 #elif defined(F_NOCACHE)
524         if (flags & IO_FLAG_DIRECT_IO) {
525                 f_nocache = F_NOCACHE;
526                 io->align = 4096;
527         }
528 #endif
529         data->flags = flags;
530
531         data->dev = ext2fs_open_file(io->name, open_flags, 0);
532         if (data->dev < 0) {
533                 retval = errno;
534                 goto cleanup;
535         }
536         if (f_nocache) {
537                 if (fcntl(data->dev, f_nocache, 1) < 0) {
538                         retval = errno;
539                         goto cleanup;
540                 }
541         }
542
543         /*
544          * If the device is really a block device, then set the
545          * appropriate flag, otherwise we can set DISCARD_ZEROES flag
546          * because we are going to use punch hole instead of discard
547          * and if it succeed, subsequent read from sparse area returns
548          * zero.
549          */
550         if (ext2fs_stat(io->name, &st) == 0) {
551                 if (S_ISBLK(st.st_mode))
552                         io->flags |= CHANNEL_FLAGS_BLOCK_DEVICE;
553                 else
554                         io->flags |= CHANNEL_FLAGS_DISCARD_ZEROES;
555         }
556
557 #ifdef BLKDISCARDZEROES
558         {
559                 int zeroes = 0;
560                 if (ioctl(data->dev, BLKDISCARDZEROES, &zeroes) == 0 &&
561                     zeroes)
562                         io->flags |= CHANNEL_FLAGS_DISCARD_ZEROES;
563         }
564 #endif
565
566 #if defined(__CYGWIN__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
567         /*
568          * Some operating systems require that the buffers be aligned,
569          * regardless of O_DIRECT
570          */
571         if (!io->align)
572                 io->align = 512;
573 #endif
574
575
576         if ((retval = alloc_cache(io, data)))
577                 goto cleanup;
578
579 #ifdef BLKROGET
580         if (flags & IO_FLAG_RW) {
581                 int error;
582                 int readonly = 0;
583
584                 /* Is the block device actually writable? */
585                 error = ioctl(data->dev, BLKROGET, &readonly);
586                 if (!error && readonly) {
587                         retval = EPERM;
588                         goto cleanup;
589                 }
590         }
591 #endif
592
593 #ifdef __linux__
594 #undef RLIM_INFINITY
595 #if (defined(__alpha__) || ((defined(__sparc__) || defined(__mips__)) && (SIZEOF_LONG == 4)))
596 #define RLIM_INFINITY   ((unsigned long)(~0UL>>1))
597 #else
598 #define RLIM_INFINITY  (~0UL)
599 #endif
600         /*
601          * Work around a bug in 2.4.10-2.4.18 kernels where writes to
602          * block devices are wrongly getting hit by the filesize
603          * limit.  This workaround isn't perfect, since it won't work
604          * if glibc wasn't built against 2.2 header files.  (Sigh.)
605          *
606          */
607         if ((flags & IO_FLAG_RW) &&
608             (uname(&ut) == 0) &&
609             ((ut.release[0] == '2') && (ut.release[1] == '.') &&
610              (ut.release[2] == '4') && (ut.release[3] == '.') &&
611              (ut.release[4] == '1') && (ut.release[5] >= '0') &&
612              (ut.release[5] < '8')) &&
613             (ext2fs_stat(io->name, &st) == 0) &&
614             (S_ISBLK(st.st_mode))) {
615                 struct rlimit   rlim;
616
617                 rlim.rlim_cur = rlim.rlim_max = (unsigned long) RLIM_INFINITY;
618                 setrlimit(RLIMIT_FSIZE, &rlim);
619                 getrlimit(RLIMIT_FSIZE, &rlim);
620                 if (((unsigned long) rlim.rlim_cur) <
621                     ((unsigned long) rlim.rlim_max)) {
622                         rlim.rlim_cur = rlim.rlim_max;
623                         setrlimit(RLIMIT_FSIZE, &rlim);
624                 }
625         }
626 #endif
627         *channel = io;
628         return 0;
629
630 cleanup:
631         if (data) {
632                 if (data->dev >= 0)
633                         close(data->dev);
634                 free_cache(data);
635                 ext2fs_free_mem(&data);
636         }
637         if (io) {
638                 if (io->name) {
639                         ext2fs_free_mem(&io->name);
640                 }
641                 ext2fs_free_mem(&io);
642         }
643         return retval;
644 }
645
646 static errcode_t unix_close(io_channel channel)
647 {
648         struct unix_private_data *data;
649         errcode_t       retval = 0;
650
651         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
652         data = (struct unix_private_data *) channel->private_data;
653         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
654
655         if (--channel->refcount > 0)
656                 return 0;
657
658 #ifndef NO_IO_CACHE
659         retval = flush_cached_blocks(channel, data, 0);
660 #endif
661
662         if (close(data->dev) < 0)
663                 retval = errno;
664         free_cache(data);
665
666         ext2fs_free_mem(&channel->private_data);
667         if (channel->name)
668                 ext2fs_free_mem(&channel->name);
669         ext2fs_free_mem(&channel);
670         return retval;
671 }
672
673 static errcode_t unix_set_blksize(io_channel channel, int blksize)
674 {
675         struct unix_private_data *data;
676         errcode_t               retval;
677
678         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
679         data = (struct unix_private_data *) channel->private_data;
680         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
681
682         if (channel->block_size != blksize) {
683 #ifndef NO_IO_CACHE
684                 if ((retval = flush_cached_blocks(channel, data, 0)))
685                         return retval;
686 #endif
687
688                 channel->block_size = blksize;
689                 free_cache(data);
690                 if ((retval = alloc_cache(channel, data)))
691                         return retval;
692         }
693         return 0;
694 }
695
696
697 static errcode_t unix_read_blk64(io_channel channel, unsigned long long block,
698                                int count, void *buf)
699 {
700         struct unix_private_data *data;
701         struct unix_cache *cache, *reuse[READ_DIRECT_SIZE];
702         errcode_t       retval;
703         char            *cp;
704         int             i, j;
705
706         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
707         data = (struct unix_private_data *) channel->private_data;
708         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
709
710 #ifdef NO_IO_CACHE
711         return raw_read_blk(channel, data, block, count, buf);
712 #else
713         /*
714          * If we're doing an odd-sized read or a very large read,
715          * flush out the cache and then do a direct read.
716          */
717         if (count < 0 || count > WRITE_DIRECT_SIZE) {
718                 if ((retval = flush_cached_blocks(channel, data, 0)))
719                         return retval;
720                 return raw_read_blk(channel, data, block, count, buf);
721         }
722
723         cp = buf;
724         while (count > 0) {
725                 /* If it's in the cache, use it! */
726                 if ((cache = find_cached_block(data, block, &reuse[0]))) {
727 #ifdef DEBUG
728                         printf("Using cached block %lu\n", block);
729 #endif
730                         memcpy(cp, cache->buf, channel->block_size);
731                         count--;
732                         block++;
733                         cp += channel->block_size;
734                         continue;
735                 }
736                 if (count == 1) {
737                         /*
738                          * Special case where we read directly into the
739                          * cache buffer; important in the O_DIRECT case
740                          */
741                         cache = reuse[0];
742                         reuse_cache(channel, data, cache, block);
743                         if ((retval = raw_read_blk(channel, data, block, 1,
744                                                    cache->buf))) {
745                                 cache->in_use = 0;
746                                 return retval;
747                         }
748                         memcpy(cp, cache->buf, channel->block_size);
749                         return 0;
750                 }
751
752                 /*
753                  * Find the number of uncached blocks so we can do a
754                  * single read request
755                  */
756                 for (i=1; i < count; i++)
757                         if (find_cached_block(data, block+i, &reuse[i]))
758                                 break;
759 #ifdef DEBUG
760                 printf("Reading %d blocks starting at %lu\n", i, block);
761 #endif
762                 if ((retval = raw_read_blk(channel, data, block, i, cp)))
763                         return retval;
764
765                 /* Save the results in the cache */
766                 for (j=0; j < i; j++) {
767                         count--;
768                         cache = reuse[j];
769                         reuse_cache(channel, data, cache, block++);
770                         memcpy(cache->buf, cp, channel->block_size);
771                         cp += channel->block_size;
772                 }
773         }
774         return 0;
775 #endif /* NO_IO_CACHE */
776 }
777
778 static errcode_t unix_read_blk(io_channel channel, unsigned long block,
779                                int count, void *buf)
780 {
781         return unix_read_blk64(channel, block, count, buf);
782 }
783
784 static errcode_t unix_write_blk64(io_channel channel, unsigned long long block,
785                                 int count, const void *buf)
786 {
787         struct unix_private_data *data;
788         struct unix_cache *cache, *reuse;
789         errcode_t       retval = 0;
790         const char      *cp;
791         int             writethrough;
792
793         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
794         data = (struct unix_private_data *) channel->private_data;
795         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
796
797 #ifdef NO_IO_CACHE
798         return raw_write_blk(channel, data, block, count, buf);
799 #else
800         /*
801          * If we're doing an odd-sized write or a very large write,
802          * flush out the cache completely and then do a direct write.
803          */
804         if (count < 0 || count > WRITE_DIRECT_SIZE) {
805                 if ((retval = flush_cached_blocks(channel, data, 1)))
806                         return retval;
807                 return raw_write_blk(channel, data, block, count, buf);
808         }
809
810         /*
811          * For a moderate-sized multi-block write, first force a write
812          * if we're in write-through cache mode, and then fill the
813          * cache with the blocks.
814          */
815         writethrough = channel->flags & CHANNEL_FLAGS_WRITETHROUGH;
816         if (writethrough)
817                 retval = raw_write_blk(channel, data, block, count, buf);
818
819         cp = buf;
820         while (count > 0) {
821                 cache = find_cached_block(data, block, &reuse);
822                 if (!cache) {
823                         cache = reuse;
824                         reuse_cache(channel, data, cache, block);
825                 }
826                 if (cache->buf != cp)
827                         memcpy(cache->buf, cp, channel->block_size);
828                 cache->dirty = !writethrough;
829                 count--;
830                 block++;
831                 cp += channel->block_size;
832         }
833         return retval;
834 #endif /* NO_IO_CACHE */
835 }
836
837 static errcode_t unix_cache_readahead(io_channel channel,
838                                       unsigned long long block,
839                                       unsigned long long count)
840 {
841 #ifdef POSIX_FADV_WILLNEED
842         struct unix_private_data *data;
843
844         data = (struct unix_private_data *)channel->private_data;
845         return posix_fadvise(data->dev,
846                              (ext2_loff_t)block * channel->block_size,
847                              (ext2_loff_t)count * channel->block_size,
848                              POSIX_FADV_WILLNEED);
849 #else
850         return EXT2_ET_OP_NOT_SUPPORTED;
851 #endif
852 }
853
854 static errcode_t unix_write_blk(io_channel channel, unsigned long block,
855                                 int count, const void *buf)
856 {
857         return unix_write_blk64(channel, block, count, buf);
858 }
859
860 static errcode_t unix_write_byte(io_channel channel, unsigned long offset,
861                                  int size, const void *buf)
862 {
863         struct unix_private_data *data;
864         errcode_t       retval = 0;
865         ssize_t         actual;
866
867         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
868         data = (struct unix_private_data *) channel->private_data;
869         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
870
871         if (channel->align != 0) {
872 #ifdef ALIGN_DEBUG
873                 printf("unix_write_byte: O_DIRECT fallback\n");
874 #endif
875                 return EXT2_ET_UNIMPLEMENTED;
876         }
877
878 #ifndef NO_IO_CACHE
879         /*
880          * Flush out the cache completely
881          */
882         if ((retval = flush_cached_blocks(channel, data, 1)))
883                 return retval;
884 #endif
885
886         if (lseek(data->dev, offset + data->offset, SEEK_SET) < 0)
887                 return errno;
888
889         actual = write(data->dev, buf, size);
890         if (actual != size)
891                 return EXT2_ET_SHORT_WRITE;
892
893         return 0;
894 }
895
896 /*
897  * Flush data buffers to disk.
898  */
899 static errcode_t unix_flush(io_channel channel)
900 {
901         struct unix_private_data *data;
902         errcode_t retval = 0;
903
904         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
905         data = (struct unix_private_data *) channel->private_data;
906         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
907
908 #ifndef NO_IO_CACHE
909         retval = flush_cached_blocks(channel, data, 0);
910 #endif
911         fsync(data->dev);
912         return retval;
913 }
914
915 static errcode_t unix_set_option(io_channel channel, const char *option,
916                                  const char *arg)
917 {
918         struct unix_private_data *data;
919         unsigned long long tmp;
920         char *end;
921
922         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
923         data = (struct unix_private_data *) channel->private_data;
924         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
925
926         if (!strcmp(option, "offset")) {
927                 if (!arg)
928                         return EXT2_ET_INVALID_ARGUMENT;
929
930                 tmp = strtoull(arg, &end, 0);
931                 if (*end)
932                         return EXT2_ET_INVALID_ARGUMENT;
933                 data->offset = tmp;
934                 if (data->offset < 0)
935                         return EXT2_ET_INVALID_ARGUMENT;
936                 return 0;
937         }
938         return EXT2_ET_INVALID_ARGUMENT;
939 }
940
941 #if defined(__linux__) && !defined(BLKDISCARD)
942 #define BLKDISCARD              _IO(0x12,119)
943 #endif
944
945 static errcode_t unix_discard(io_channel channel, unsigned long long block,
946                               unsigned long long count)
947 {
948         struct unix_private_data *data;
949         int             ret;
950
951         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
952         data = (struct unix_private_data *) channel->private_data;
953         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
954
955         if (channel->flags & CHANNEL_FLAGS_BLOCK_DEVICE) {
956 #ifdef BLKDISCARD
957                 __u64 range[2];
958
959                 range[0] = (__u64)(block) * channel->block_size;
960                 range[1] = (__u64)(count) * channel->block_size;
961
962                 ret = ioctl(data->dev, BLKDISCARD, &range);
963 #else
964                 goto unimplemented;
965 #endif
966         } else {
967 #if defined(HAVE_FALLOCATE) && defined(FALLOC_FL_PUNCH_HOLE)
968                 /*
969                  * If we are not on block device, try to use punch hole
970                  * to reclaim free space.
971                  */
972                 ret = fallocate(data->dev,
973                                 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
974                                 (off_t)(block) * channel->block_size,
975                                 (off_t)(count) * channel->block_size);
976 #else
977                 goto unimplemented;
978 #endif
979         }
980         if (ret < 0) {
981                 if (errno == EOPNOTSUPP)
982                         goto unimplemented;
983                 return errno;
984         }
985         return 0;
986 unimplemented:
987         return EXT2_ET_UNIMPLEMENTED;
988 }
989
990 static errcode_t unix_zeroout(io_channel channel, unsigned long long block,
991                               unsigned long long count)
992 {
993         struct unix_private_data *data;
994         int             ret;
995
996         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
997         data = (struct unix_private_data *) channel->private_data;
998         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
999
1000         if (getenv("UNIX_IO_NOZEROOUT"))
1001                 goto unimplemented;
1002
1003         if (channel->flags & CHANNEL_FLAGS_BLOCK_DEVICE) {
1004                 /* Not implemented until the BLKZEROOUT mess is fixed */
1005                 goto unimplemented;
1006         } else {
1007                 /* Regular file, try to use truncate/punch/zero. */
1008 #if defined(HAVE_FALLOCATE) && (defined(FALLOC_FL_ZERO_RANGE) || \
1009         (defined(FALLOC_FL_PUNCH_HOLE) && defined(FALLOC_FL_KEEP_SIZE)))
1010                 struct stat statbuf;
1011
1012                 if (count == 0)
1013                         return 0;
1014                 /*
1015                  * If we're trying to zero a range past the end of the file,
1016                  * extend the file size, then punch (or zero_range) everything.
1017                  */
1018                 ret = fstat(data->dev, &statbuf);
1019                 if (ret)
1020                         goto err;
1021                 if (statbuf.st_size < (block + count) * channel->block_size) {
1022                         ret = ftruncate(data->dev,
1023                                         (block + count) * channel->block_size);
1024                         if (ret)
1025                                 goto err;
1026                 }
1027 #if defined(FALLOC_FL_PUNCH_HOLE) && defined(FALLOC_FL_KEEP_SIZE)
1028                 ret = fallocate(data->dev,
1029                                 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1030                                 (off_t)(block) * channel->block_size,
1031                                 (off_t)(count) * channel->block_size);
1032                 if (ret == 0)
1033                         goto err;
1034 #endif
1035 #ifdef FALLOC_FL_ZERO_RANGE
1036                 ret = fallocate(data->dev,
1037                                 FALLOC_FL_ZERO_RANGE,
1038                                 (off_t)(block) * channel->block_size,
1039                                 (off_t)(count) * channel->block_size);
1040 #endif
1041 #else
1042                 goto unimplemented;
1043 #endif /* HAVE_FALLOCATE && (ZERO_RANGE || (PUNCH_HOLE && KEEP_SIZE)) */
1044         }
1045 err:
1046         if (ret < 0) {
1047                 if (errno == EOPNOTSUPP)
1048                         goto unimplemented;
1049                 return errno;
1050         }
1051         return 0;
1052 unimplemented:
1053         return EXT2_ET_UNIMPLEMENTED;
1054 }
1055
1056 static struct struct_io_manager struct_unix_manager = {
1057         .magic          = EXT2_ET_MAGIC_IO_MANAGER,
1058         .name           = "Unix I/O Manager",
1059         .open           = unix_open,
1060         .close          = unix_close,
1061         .set_blksize    = unix_set_blksize,
1062         .read_blk       = unix_read_blk,
1063         .write_blk      = unix_write_blk,
1064         .flush          = unix_flush,
1065         .write_byte     = unix_write_byte,
1066         .set_option     = unix_set_option,
1067         .get_stats      = unix_get_stats,
1068         .read_blk64     = unix_read_blk64,
1069         .write_blk64    = unix_write_blk64,
1070         .discard        = unix_discard,
1071         .cache_readahead        = unix_cache_readahead,
1072         .zeroout        = unix_zeroout,
1073 };
1074
1075 io_manager unix_io_manager = &struct_unix_manager;