Whamcloud - gitweb
libext2fs: fix offset support in unix_io.c
[tools/e2fsprogs.git] / lib / ext2fs / unix_io.c
1 /*
2  * unix_io.c --- This is the Unix (well, really POSIX) implementation
3  *      of the I/O manager.
4  *
5  * Implements a small block cache (write-through when CHANNEL_FLAGS_WRITETHROUGH is set).
6  *
7  * Includes support for Windows NT under Cygwin.
8  *
9  * Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
10  *      2002 by Theodore Ts'o.
11  *
12  * %Begin-Header%
13  * This file may be redistributed under the terms of the GNU Library
14  * General Public License, version 2.
15  * %End-Header%
16  */
17
18 #if !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
19 #define _XOPEN_SOURCE 600
20 #define _DARWIN_C_SOURCE
21 #define _FILE_OFFSET_BITS 64
22 #ifndef _LARGEFILE_SOURCE
23 #define _LARGEFILE_SOURCE
24 #endif
25 #ifndef _LARGEFILE64_SOURCE
26 #define _LARGEFILE64_SOURCE
27 #endif
28 #ifndef _GNU_SOURCE
29 #define _GNU_SOURCE
30 #endif
31 #endif
32
33 #include "config.h"
34 #include <stdio.h>
35 #include <string.h>
36 #if HAVE_UNISTD_H
37 #include <unistd.h>
38 #endif
39 #if HAVE_ERRNO_H
40 #include <errno.h>
41 #endif
42 #include <fcntl.h>
43 #include <time.h>
44 #ifdef __linux__
45 #include <sys/utsname.h>
46 #endif
47 #if HAVE_SYS_TYPES_H
48 #include <sys/types.h>
49 #endif
50 #ifdef HAVE_SYS_IOCTL_H
51 #include <sys/ioctl.h>
52 #endif
53 #ifdef HAVE_SYS_MOUNT_H
54 #include <sys/mount.h>
55 #endif
56 #if HAVE_SYS_STAT_H
57 #include <sys/stat.h>
58 #endif
59 #if HAVE_SYS_RESOURCE_H
60 #include <sys/resource.h>
61 #endif
62 #if HAVE_LINUX_FALLOC_H
63 #include <linux/falloc.h>
64 #endif
65
66 #if defined(__linux__) && defined(_IO) && !defined(BLKROGET)
67 #define BLKROGET   _IO(0x12, 94) /* Get read-only status (0 = read_write).  */
68 #endif
69
70 #undef ALIGN_DEBUG
71
72 #include "ext2_fs.h"
73 #include "ext2fs.h"
74
/*
 * For checking structure magic numbers: return `code` from the calling
 * function when the structure's magic field does not match it.
 *
 * Wrapped in do { } while (0) so the macro behaves as a single statement
 * and cannot capture a following `else`.
 */

#define EXT2_CHECK_MAGIC(struct, code) \
	do { \
		if ((struct)->magic != (code)) \
			return (code); \
	} while (0)
81
/* One slot of the per-channel block cache. */
82 struct unix_cache {
83         char                    *buf;           /* buffer from io_channel_alloc_buf() holding the block's data */
84         unsigned long long      block;          /* block number currently held in buf */
85         int                     access_time;    /* copy of the channel's access counter at last use; lowest = oldest */
86         unsigned                dirty:1;        /* buf has changes not yet written to the device */
87         unsigned                in_use:1;       /* slot holds a valid block */
88 };
89
90 #define CACHE_SIZE 8
91 #define WRITE_DIRECT_SIZE 4     /* Must be smaller than CACHE_SIZE */
92 #define READ_DIRECT_SIZE 4      /* Should be smaller than CACHE_SIZE */
93
/* Per-channel private state of the Unix I/O manager. */
94 struct unix_private_data {
95         int     magic;          /* EXT2_ET_MAGIC_UNIX_IO_CHANNEL, verified by every entry point */
96         int     dev;            /* file descriptor returned by ext2fs_open_file(); -1 until opened */
97         int     flags;          /* IO_FLAG_* value passed to unix_open() */
98         int     align;          /* NOTE(review): not referenced in the code visible here; alignment checks read channel->align */
99         int     access_time;    /* counter bumped on every cache hit or slot reuse */
100         ext2_loff_t offset;     /* byte offset added to every I/O location; set by the "offset" option */
101         struct unix_cache cache[CACHE_SIZE];    /* the block cache slots */
102         void    *bounce;        /* buffer used when the caller's buffer or size is not aligned */
103         struct struct_io_stats io_stats;        /* byte counters handed out by unix_get_stats() */
104 };
105
/*
 * True when n (a pointer or an integer size) is a multiple of align.
 * align must be a power of two, because the test masks with (align - 1).
 * n is parenthesized so that expressions such as `ptr + 1` are evaluated
 * before the cast rather than after it.
 */
#define IS_ALIGNED(n, align) ((((unsigned long) (n)) & \
			       ((unsigned long) ((align)-1))) == 0)
108
109 static errcode_t unix_get_stats(io_channel channel, io_stats *stats)
110 {
111         errcode_t       retval = 0;
112
113         struct unix_private_data *data;
114
115         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
116         data = (struct unix_private_data *) channel->private_data;
117         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
118
119         if (stats)
120                 *stats = &data->io_stats;
121
122         return retval;
123 }
124
125 /*
126  * Here are the raw I/O functions
127  */
128 static errcode_t raw_read_blk(io_channel channel,
129                               struct unix_private_data *data,
130                               unsigned long long block,
131                               int count, void *bufv)
132 {
133         errcode_t       retval;
134         ssize_t         size;
135         ext2_loff_t     location;
136         int             actual = 0;
137         unsigned char   *buf = bufv;
138
139         size = (count < 0) ? -count : count * channel->block_size;
140         data->io_stats.bytes_read += size;
141         location = ((ext2_loff_t) block * channel->block_size) + data->offset;
142
143 #ifdef HAVE_PREAD64
144         /* Try an aligned pread */
145         if ((channel->align == 0) ||
146             (IS_ALIGNED(buf, channel->align) &&
147              IS_ALIGNED(size, channel->align))) {
148                 actual = pread64(data->dev, buf, size, location);
149                 if (actual == size)
150                         return 0;
151         }
152 #elif HAVE_PREAD
153         /* Try an aligned pread */
154         if ((sizeof(off_t) >= sizeof(ext2_loff_t)) &&
155             ((channel->align == 0) ||
156              (IS_ALIGNED(buf, channel->align) &&
157               IS_ALIGNED(size, channel->align)))) {
158                 actual = pread(data->dev, buf, size, location);
159                 if (actual == size)
160                         return 0;
161         }
162 #endif /* HAVE_PREAD */
163
164         if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
165                 retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
166                 goto error_out;
167         }
168         if ((channel->align == 0) ||
169             (IS_ALIGNED(buf, channel->align) &&
170              IS_ALIGNED(size, channel->align))) {
171                 actual = read(data->dev, buf, size);
172                 if (actual != size) {
173                 short_read:
174                         if (actual < 0)
175                                 actual = 0;
176                         retval = EXT2_ET_SHORT_READ;
177                         goto error_out;
178                 }
179                 return 0;
180         }
181
182 #ifdef ALIGN_DEBUG
183         printf("raw_read_blk: O_DIRECT fallback: %p %lu\n", buf,
184                (unsigned long) size);
185 #endif
186
187         /*
188          * The buffer or size which we're trying to read isn't aligned
189          * to the O_DIRECT rules, so we need to do this the hard way...
190          */
191         while (size > 0) {
192                 actual = read(data->dev, data->bounce, channel->block_size);
193                 if (actual != channel->block_size)
194                         goto short_read;
195                 actual = size;
196                 if (size > channel->block_size)
197                         actual = channel->block_size;
198                 memcpy(buf, data->bounce, actual);
199                 size -= actual;
200                 buf += actual;
201         }
202         return 0;
203
204 error_out:
205         memset((char *) buf+actual, 0, size-actual);
206         if (channel->read_error)
207                 retval = (channel->read_error)(channel, block, count, buf,
208                                                size, actual, retval);
209         return retval;
210 }
211
212 static errcode_t raw_write_blk(io_channel channel,
213                                struct unix_private_data *data,
214                                unsigned long long block,
215                                int count, const void *bufv)
216 {
217         ssize_t         size;
218         ext2_loff_t     location;
219         int             actual = 0;
220         errcode_t       retval;
221         const unsigned char *buf = bufv;
222
223         if (count == 1)
224                 size = channel->block_size;
225         else {
226                 if (count < 0)
227                         size = -count;
228                 else
229                         size = count * channel->block_size;
230         }
231         data->io_stats.bytes_written += size;
232
233         location = ((ext2_loff_t) block * channel->block_size) + data->offset;
234
235 #ifdef HAVE_PWRITE64
236         /* Try an aligned pwrite */
237         if ((channel->align == 0) ||
238             (IS_ALIGNED(buf, channel->align) &&
239              IS_ALIGNED(size, channel->align))) {
240                 actual = pwrite64(data->dev, buf, size, location);
241                 if (actual == size)
242                         return 0;
243         }
244 #elif HAVE_PWRITE
245         /* Try an aligned pwrite */
246         if ((sizeof(off_t) >= sizeof(ext2_loff_t)) &&
247             ((channel->align == 0) ||
248              (IS_ALIGNED(buf, channel->align) &&
249               IS_ALIGNED(size, channel->align)))) {
250                 actual = pwrite(data->dev, buf, size, location);
251                 if (actual == size)
252                         return 0;
253         }
254 #endif /* HAVE_PWRITE */
255
256         if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
257                 retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
258                 goto error_out;
259         }
260
261         if ((channel->align == 0) ||
262             (IS_ALIGNED(buf, channel->align) &&
263              IS_ALIGNED(size, channel->align))) {
264                 actual = write(data->dev, buf, size);
265                 if (actual != size) {
266                 short_write:
267                         retval = EXT2_ET_SHORT_WRITE;
268                         goto error_out;
269                 }
270                 return 0;
271         }
272
273 #ifdef ALIGN_DEBUG
274         printf("raw_write_blk: O_DIRECT fallback: %p %lu\n", buf,
275                (unsigned long) size);
276 #endif
277         /*
278          * The buffer or size which we're trying to write isn't aligned
279          * to the O_DIRECT rules, so we need to do this the hard way...
280          */
281         while (size > 0) {
282                 if (size < channel->block_size) {
283                         actual = read(data->dev, data->bounce,
284                                       channel->block_size);
285                         if (actual != channel->block_size) {
286                                 retval = EXT2_ET_SHORT_READ;
287                                 goto error_out;
288                         }
289                 }
290                 actual = size;
291                 if (size > channel->block_size)
292                         actual = channel->block_size;
293                 memcpy(data->bounce, buf, actual);
294                 actual = write(data->dev, data->bounce, channel->block_size);
295                 if (actual != channel->block_size)
296                         goto short_write;
297                 size -= actual;
298                 buf += actual;
299         }
300         return 0;
301
302 error_out:
303         if (channel->write_error)
304                 retval = (channel->write_error)(channel, block, count, buf,
305                                                 size, actual, retval);
306         return retval;
307 }
308
309
310 /*
311  * Here we implement the cache functions
312  */
313
314 /* Allocate the cache buffers */
315 static errcode_t alloc_cache(io_channel channel,
316                              struct unix_private_data *data)
317 {
318         errcode_t               retval;
319         struct unix_cache       *cache;
320         int                     i;
321
322         data->access_time = 0;
323         for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
324                 cache->block = 0;
325                 cache->access_time = 0;
326                 cache->dirty = 0;
327                 cache->in_use = 0;
328                 if (cache->buf)
329                         ext2fs_free_mem(&cache->buf);
330                 retval = io_channel_alloc_buf(channel, 0, &cache->buf);
331                 if (retval)
332                         return retval;
333         }
334         if (channel->align) {
335                 if (data->bounce)
336                         ext2fs_free_mem(&data->bounce);
337                 retval = io_channel_alloc_buf(channel, 0, &data->bounce);
338         }
339         return retval;
340 }
341
342 /* Free the cache buffers */
343 static void free_cache(struct unix_private_data *data)
344 {
345         struct unix_cache       *cache;
346         int                     i;
347
348         data->access_time = 0;
349         for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
350                 cache->block = 0;
351                 cache->access_time = 0;
352                 cache->dirty = 0;
353                 cache->in_use = 0;
354                 if (cache->buf)
355                         ext2fs_free_mem(&cache->buf);
356         }
357         if (data->bounce)
358                 ext2fs_free_mem(&data->bounce);
359 }
360
361 #ifndef NO_IO_CACHE
362 /*
363  * Try to find a block in the cache.  If the block is not found, and
364  * eldest is a non-zero pointer, then fill in eldest with the cache
365  * entry to that should be reused.
366  */
367 static struct unix_cache *find_cached_block(struct unix_private_data *data,
368                                             unsigned long long block,
369                                             struct unix_cache **eldest)
370 {
371         struct unix_cache       *cache, *unused_cache, *oldest_cache;
372         int                     i;
373
374         unused_cache = oldest_cache = 0;
375         for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
376                 if (!cache->in_use) {
377                         if (!unused_cache)
378                                 unused_cache = cache;
379                         continue;
380                 }
381                 if (cache->block == block) {
382                         cache->access_time = ++data->access_time;
383                         return cache;
384                 }
385                 if (!oldest_cache ||
386                     (cache->access_time < oldest_cache->access_time))
387                         oldest_cache = cache;
388         }
389         if (eldest)
390                 *eldest = (unused_cache) ? unused_cache : oldest_cache;
391         return 0;
392 }
393
394 /*
395  * Reuse a particular cache entry for another block.
396  */
397 static void reuse_cache(io_channel channel, struct unix_private_data *data,
398                  struct unix_cache *cache, unsigned long long block)
399 {
400         if (cache->dirty && cache->in_use)
401                 raw_write_blk(channel, data, cache->block, 1, cache->buf);
402
403         cache->in_use = 1;
404         cache->dirty = 0;
405         cache->block = block;
406         cache->access_time = ++data->access_time;
407 }
408
409 /*
410  * Flush all of the blocks in the cache
411  */
412 static errcode_t flush_cached_blocks(io_channel channel,
413                                      struct unix_private_data *data,
414                                      int invalidate)
415
416 {
417         struct unix_cache       *cache;
418         errcode_t               retval, retval2;
419         int                     i;
420
421         retval2 = 0;
422         for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
423                 if (!cache->in_use)
424                         continue;
425
426                 if (invalidate)
427                         cache->in_use = 0;
428
429                 if (!cache->dirty)
430                         continue;
431
432                 retval = raw_write_blk(channel, data,
433                                        cache->block, 1, cache->buf);
434                 if (retval)
435                         retval2 = retval;
436                 else
437                         cache->dirty = 0;
438         }
439         return retval2;
440 }
441 #endif /* NO_IO_CACHE */
442
443 #ifdef __linux__
444 #ifndef BLKDISCARDZEROES
445 #define BLKDISCARDZEROES _IO(0x12,124)
446 #endif
447 #endif
448
/*
 * Open pathname, using open64() when it is available.
 *
 * mode is always forwarded.  open() only consults it when a file may be
 * created, so passing it unconditionally is harmless, and it avoids
 * calling the variadic open() without a mode argument when flags contain
 * O_CREAT and mode happens to be 0.
 *
 * Returns the new file descriptor, or -1 with errno set.
 */
int ext2fs_open_file(const char *pathname, int flags, mode_t mode)
{
#if defined(HAVE_OPEN64) && !defined(__OSX_AVAILABLE_BUT_DEPRECATED)
	return open64(pathname, flags, mode);
#else
	return open(pathname, flags, mode);
#endif
}
462
463 int ext2fs_stat(const char *path, ext2fs_struct_stat *buf)
464 {
465 #if defined(HAVE_FSTAT64) && !defined(__OSX_AVAILABLE_BUT_DEPRECATED)
466         return stat64(path, buf);
467 #else
468         return stat(path, buf);
469 #endif
470 }
471
472 int ext2fs_fstat(int fd, ext2fs_struct_stat *buf)
473 {
474 #if defined(HAVE_FSTAT64) && !defined(__OSX_AVAILABLE_BUT_DEPRECATED)
475         return fstat64(fd, buf);
476 #else
477         return fstat(fd, buf);
478 #endif
479 }
480
481 static errcode_t unix_open(const char *name, int flags, io_channel *channel)
482 {
483         io_channel      io = NULL;
484         struct unix_private_data *data = NULL;
485         errcode_t       retval;
486         int             open_flags;
487         int             f_nocache = 0;
488         ext2fs_struct_stat st;
489 #ifdef __linux__
490         struct          utsname ut;
491 #endif
492
493         if (name == 0)
494                 return EXT2_ET_BAD_DEVICE_NAME;
495         retval = ext2fs_get_mem(sizeof(struct struct_io_channel), &io);
496         if (retval)
497                 goto cleanup;
498         memset(io, 0, sizeof(struct struct_io_channel));
499         io->magic = EXT2_ET_MAGIC_IO_CHANNEL;
500         retval = ext2fs_get_mem(sizeof(struct unix_private_data), &data);
501         if (retval)
502                 goto cleanup;
503
504         io->manager = unix_io_manager;
505         retval = ext2fs_get_mem(strlen(name)+1, &io->name);
506         if (retval)
507                 goto cleanup;
508
509         strcpy(io->name, name);
510         io->private_data = data;
511         io->block_size = 1024;
512         io->read_error = 0;
513         io->write_error = 0;
514         io->refcount = 1;
515
516         memset(data, 0, sizeof(struct unix_private_data));
517         data->magic = EXT2_ET_MAGIC_UNIX_IO_CHANNEL;
518         data->io_stats.num_fields = 2;
519         data->dev = -1;
520
521         open_flags = (flags & IO_FLAG_RW) ? O_RDWR : O_RDONLY;
522         if (flags & IO_FLAG_EXCLUSIVE)
523                 open_flags |= O_EXCL;
524 #if defined(O_DIRECT)
525         if (flags & IO_FLAG_DIRECT_IO) {
526                 open_flags |= O_DIRECT;
527                 io->align = ext2fs_get_dio_alignment(data->dev);
528         }
529 #elif defined(F_NOCACHE)
530         if (flags & IO_FLAG_DIRECT_IO) {
531                 f_nocache = F_NOCACHE;
532                 io->align = 4096;
533         }
534 #endif
535         data->flags = flags;
536
537         data->dev = ext2fs_open_file(io->name, open_flags, 0);
538         if (data->dev < 0) {
539                 retval = errno;
540                 goto cleanup;
541         }
542         if (f_nocache) {
543                 if (fcntl(data->dev, f_nocache, 1) < 0) {
544                         retval = errno;
545                         goto cleanup;
546                 }
547         }
548
549         /*
550          * If the device is really a block device, then set the
551          * appropriate flag, otherwise we can set DISCARD_ZEROES flag
552          * because we are going to use punch hole instead of discard
553          * and if it succeed, subsequent read from sparse area returns
554          * zero.
555          */
556         if (ext2fs_stat(io->name, &st) == 0) {
557                 if (S_ISBLK(st.st_mode))
558                         io->flags |= CHANNEL_FLAGS_BLOCK_DEVICE;
559                 else
560                         io->flags |= CHANNEL_FLAGS_DISCARD_ZEROES;
561         }
562
563 #ifdef BLKDISCARDZEROES
564         {
565                 int zeroes = 0;
566                 if (ioctl(data->dev, BLKDISCARDZEROES, &zeroes) == 0 &&
567                     zeroes)
568                         io->flags |= CHANNEL_FLAGS_DISCARD_ZEROES;
569         }
570 #endif
571
572 #if defined(__CYGWIN__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
573         /*
574          * Some operating systems require that the buffers be aligned,
575          * regardless of O_DIRECT
576          */
577         if (!io->align)
578                 io->align = 512;
579 #endif
580
581
582         if ((retval = alloc_cache(io, data)))
583                 goto cleanup;
584
585 #ifdef BLKROGET
586         if (flags & IO_FLAG_RW) {
587                 int error;
588                 int readonly = 0;
589
590                 /* Is the block device actually writable? */
591                 error = ioctl(data->dev, BLKROGET, &readonly);
592                 if (!error && readonly) {
593                         retval = EPERM;
594                         goto cleanup;
595                 }
596         }
597 #endif
598
599 #ifdef __linux__
600 #undef RLIM_INFINITY
601 #if (defined(__alpha__) || ((defined(__sparc__) || defined(__mips__)) && (SIZEOF_LONG == 4)))
602 #define RLIM_INFINITY   ((unsigned long)(~0UL>>1))
603 #else
604 #define RLIM_INFINITY  (~0UL)
605 #endif
606         /*
607          * Work around a bug in 2.4.10-2.4.18 kernels where writes to
608          * block devices are wrongly getting hit by the filesize
609          * limit.  This workaround isn't perfect, since it won't work
610          * if glibc wasn't built against 2.2 header files.  (Sigh.)
611          *
612          */
613         if ((flags & IO_FLAG_RW) &&
614             (uname(&ut) == 0) &&
615             ((ut.release[0] == '2') && (ut.release[1] == '.') &&
616              (ut.release[2] == '4') && (ut.release[3] == '.') &&
617              (ut.release[4] == '1') && (ut.release[5] >= '0') &&
618              (ut.release[5] < '8')) &&
619             (ext2fs_stat(io->name, &st) == 0) &&
620             (S_ISBLK(st.st_mode))) {
621                 struct rlimit   rlim;
622
623                 rlim.rlim_cur = rlim.rlim_max = (unsigned long) RLIM_INFINITY;
624                 setrlimit(RLIMIT_FSIZE, &rlim);
625                 getrlimit(RLIMIT_FSIZE, &rlim);
626                 if (((unsigned long) rlim.rlim_cur) <
627                     ((unsigned long) rlim.rlim_max)) {
628                         rlim.rlim_cur = rlim.rlim_max;
629                         setrlimit(RLIMIT_FSIZE, &rlim);
630                 }
631         }
632 #endif
633         *channel = io;
634         return 0;
635
636 cleanup:
637         if (data) {
638                 if (data->dev >= 0)
639                         close(data->dev);
640                 free_cache(data);
641                 ext2fs_free_mem(&data);
642         }
643         if (io) {
644                 if (io->name) {
645                         ext2fs_free_mem(&io->name);
646                 }
647                 ext2fs_free_mem(&io);
648         }
649         return retval;
650 }
651
652 static errcode_t unix_close(io_channel channel)
653 {
654         struct unix_private_data *data;
655         errcode_t       retval = 0;
656
657         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
658         data = (struct unix_private_data *) channel->private_data;
659         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
660
661         if (--channel->refcount > 0)
662                 return 0;
663
664 #ifndef NO_IO_CACHE
665         retval = flush_cached_blocks(channel, data, 0);
666 #endif
667
668         if (close(data->dev) < 0)
669                 retval = errno;
670         free_cache(data);
671
672         ext2fs_free_mem(&channel->private_data);
673         if (channel->name)
674                 ext2fs_free_mem(&channel->name);
675         ext2fs_free_mem(&channel);
676         return retval;
677 }
678
679 static errcode_t unix_set_blksize(io_channel channel, int blksize)
680 {
681         struct unix_private_data *data;
682         errcode_t               retval;
683
684         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
685         data = (struct unix_private_data *) channel->private_data;
686         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
687
688         if (channel->block_size != blksize) {
689 #ifndef NO_IO_CACHE
690                 if ((retval = flush_cached_blocks(channel, data, 0)))
691                         return retval;
692 #endif
693
694                 channel->block_size = blksize;
695                 free_cache(data);
696                 if ((retval = alloc_cache(channel, data)))
697                         return retval;
698         }
699         return 0;
700 }
701
702
703 static errcode_t unix_read_blk64(io_channel channel, unsigned long long block,
704                                int count, void *buf)
705 {
706         struct unix_private_data *data;
707         struct unix_cache *cache, *reuse[READ_DIRECT_SIZE];
708         errcode_t       retval;
709         char            *cp;
710         int             i, j;
711
712         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
713         data = (struct unix_private_data *) channel->private_data;
714         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
715
716 #ifdef NO_IO_CACHE
717         return raw_read_blk(channel, data, block, count, buf);
718 #else
719         /*
720          * If we're doing an odd-sized read or a very large read,
721          * flush out the cache and then do a direct read.
722          */
723         if (count < 0 || count > WRITE_DIRECT_SIZE) {
724                 if ((retval = flush_cached_blocks(channel, data, 0)))
725                         return retval;
726                 return raw_read_blk(channel, data, block, count, buf);
727         }
728
729         cp = buf;
730         while (count > 0) {
731                 /* If it's in the cache, use it! */
732                 if ((cache = find_cached_block(data, block, &reuse[0]))) {
733 #ifdef DEBUG
734                         printf("Using cached block %lu\n", block);
735 #endif
736                         memcpy(cp, cache->buf, channel->block_size);
737                         count--;
738                         block++;
739                         cp += channel->block_size;
740                         continue;
741                 }
742                 if (count == 1) {
743                         /*
744                          * Special case where we read directly into the
745                          * cache buffer; important in the O_DIRECT case
746                          */
747                         cache = reuse[0];
748                         reuse_cache(channel, data, cache, block);
749                         if ((retval = raw_read_blk(channel, data, block, 1,
750                                                    cache->buf))) {
751                                 cache->in_use = 0;
752                                 return retval;
753                         }
754                         memcpy(cp, cache->buf, channel->block_size);
755                         return 0;
756                 }
757
758                 /*
759                  * Find the number of uncached blocks so we can do a
760                  * single read request
761                  */
762                 for (i=1; i < count; i++)
763                         if (find_cached_block(data, block+i, &reuse[i]))
764                                 break;
765 #ifdef DEBUG
766                 printf("Reading %d blocks starting at %lu\n", i, block);
767 #endif
768                 if ((retval = raw_read_blk(channel, data, block, i, cp)))
769                         return retval;
770
771                 /* Save the results in the cache */
772                 for (j=0; j < i; j++) {
773                         count--;
774                         cache = reuse[j];
775                         reuse_cache(channel, data, cache, block++);
776                         memcpy(cache->buf, cp, channel->block_size);
777                         cp += channel->block_size;
778                 }
779         }
780         return 0;
781 #endif /* NO_IO_CACHE */
782 }
783
784 static errcode_t unix_read_blk(io_channel channel, unsigned long block,
785                                int count, void *buf)
786 {
787         return unix_read_blk64(channel, block, count, buf);
788 }
789
790 static errcode_t unix_write_blk64(io_channel channel, unsigned long long block,
791                                 int count, const void *buf)
792 {
793         struct unix_private_data *data;
794         struct unix_cache *cache, *reuse;
795         errcode_t       retval = 0;
796         const char      *cp;
797         int             writethrough;
798
799         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
800         data = (struct unix_private_data *) channel->private_data;
801         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
802
803 #ifdef NO_IO_CACHE
804         return raw_write_blk(channel, data, block, count, buf);
805 #else
806         /*
807          * If we're doing an odd-sized write or a very large write,
808          * flush out the cache completely and then do a direct write.
809          */
810         if (count < 0 || count > WRITE_DIRECT_SIZE) {
811                 if ((retval = flush_cached_blocks(channel, data, 1)))
812                         return retval;
813                 return raw_write_blk(channel, data, block, count, buf);
814         }
815
816         /*
817          * For a moderate-sized multi-block write, first force a write
818          * if we're in write-through cache mode, and then fill the
819          * cache with the blocks.
820          */
821         writethrough = channel->flags & CHANNEL_FLAGS_WRITETHROUGH;
822         if (writethrough)
823                 retval = raw_write_blk(channel, data, block, count, buf);
824
825         cp = buf;
826         while (count > 0) {
827                 cache = find_cached_block(data, block, &reuse);
828                 if (!cache) {
829                         cache = reuse;
830                         reuse_cache(channel, data, cache, block);
831                 }
832                 if (cache->buf != cp)
833                         memcpy(cache->buf, cp, channel->block_size);
834                 cache->dirty = !writethrough;
835                 count--;
836                 block++;
837                 cp += channel->block_size;
838         }
839         return retval;
840 #endif /* NO_IO_CACHE */
841 }
842
843 static errcode_t unix_cache_readahead(io_channel channel,
844                                       unsigned long long block,
845                                       unsigned long long count)
846 {
847 #ifdef POSIX_FADV_WILLNEED
848         struct unix_private_data *data;
849
850         data = (struct unix_private_data *)channel->private_data;
851         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
852         return posix_fadvise(data->dev,
853                              (ext2_loff_t)block * channel->block_size + data->offset,
854                              (ext2_loff_t)count * channel->block_size,
855                              POSIX_FADV_WILLNEED);
856 #else
857         return EXT2_ET_OP_NOT_SUPPORTED;
858 #endif
859 }
860
861 static errcode_t unix_write_blk(io_channel channel, unsigned long block,
862                                 int count, const void *buf)
863 {
864         return unix_write_blk64(channel, block, count, buf);
865 }
866
867 static errcode_t unix_write_byte(io_channel channel, unsigned long offset,
868                                  int size, const void *buf)
869 {
870         struct unix_private_data *data;
871         errcode_t       retval = 0;
872         ssize_t         actual;
873
874         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
875         data = (struct unix_private_data *) channel->private_data;
876         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
877
878         if (channel->align != 0) {
879 #ifdef ALIGN_DEBUG
880                 printf("unix_write_byte: O_DIRECT fallback\n");
881 #endif
882                 return EXT2_ET_UNIMPLEMENTED;
883         }
884
885 #ifndef NO_IO_CACHE
886         /*
887          * Flush out the cache completely
888          */
889         if ((retval = flush_cached_blocks(channel, data, 1)))
890                 return retval;
891 #endif
892
893         if (lseek(data->dev, offset + data->offset, SEEK_SET) < 0)
894                 return errno;
895
896         actual = write(data->dev, buf, size);
897         if (actual != size)
898                 return EXT2_ET_SHORT_WRITE;
899
900         return 0;
901 }
902
903 /*
904  * Flush data buffers to disk.
905  */
906 static errcode_t unix_flush(io_channel channel)
907 {
908         struct unix_private_data *data;
909         errcode_t retval = 0;
910
911         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
912         data = (struct unix_private_data *) channel->private_data;
913         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
914
915 #ifndef NO_IO_CACHE
916         retval = flush_cached_blocks(channel, data, 0);
917 #endif
918         fsync(data->dev);
919         return retval;
920 }
921
922 static errcode_t unix_set_option(io_channel channel, const char *option,
923                                  const char *arg)
924 {
925         struct unix_private_data *data;
926         unsigned long long tmp;
927         char *end;
928
929         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
930         data = (struct unix_private_data *) channel->private_data;
931         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
932
933         if (!strcmp(option, "offset")) {
934                 if (!arg)
935                         return EXT2_ET_INVALID_ARGUMENT;
936
937                 tmp = strtoull(arg, &end, 0);
938                 if (*end)
939                         return EXT2_ET_INVALID_ARGUMENT;
940                 data->offset = tmp;
941                 if (data->offset < 0)
942                         return EXT2_ET_INVALID_ARGUMENT;
943                 return 0;
944         }
945         return EXT2_ET_INVALID_ARGUMENT;
946 }
947
948 #if defined(__linux__) && !defined(BLKDISCARD)
949 #define BLKDISCARD              _IO(0x12,119)
950 #endif
951
952 static errcode_t unix_discard(io_channel channel, unsigned long long block,
953                               unsigned long long count)
954 {
955         struct unix_private_data *data;
956         int             ret;
957
958         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
959         data = (struct unix_private_data *) channel->private_data;
960         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
961
962         if (channel->flags & CHANNEL_FLAGS_BLOCK_DEVICE) {
963 #ifdef BLKDISCARD
964                 __u64 range[2];
965
966                 range[0] = (__u64)(block) * channel->block_size + data->offset;
967                 range[1] = (__u64)(count) * channel->block_size;
968
969                 ret = ioctl(data->dev, BLKDISCARD, &range);
970 #else
971                 goto unimplemented;
972 #endif
973         } else {
974 #if defined(HAVE_FALLOCATE) && defined(FALLOC_FL_PUNCH_HOLE)
975                 /*
976                  * If we are not on block device, try to use punch hole
977                  * to reclaim free space.
978                  */
979                 ret = fallocate(data->dev,
980                                 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
981                                 (off_t)(block) * channel->block_size + data->offset,
982                                 (off_t)(count) * channel->block_size);
983 #else
984                 goto unimplemented;
985 #endif
986         }
987         if (ret < 0) {
988                 if (errno == EOPNOTSUPP)
989                         goto unimplemented;
990                 return errno;
991         }
992         return 0;
993 unimplemented:
994         return EXT2_ET_UNIMPLEMENTED;
995 }
996
997 /* parameters might not be used if OS doesn't support zeroout */
998 #pragma GCC diagnostic push
999 #pragma GCC diagnostic ignored "-Wunused-parameter"
1000 static errcode_t unix_zeroout(io_channel channel, unsigned long long block,
1001                               unsigned long long count)
1002 {
1003         struct unix_private_data *data;
1004         int             ret;
1005
1006         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
1007         data = (struct unix_private_data *) channel->private_data;
1008         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
1009
1010         if (getenv("UNIX_IO_NOZEROOUT"))
1011                 goto unimplemented;
1012
1013         if (channel->flags & CHANNEL_FLAGS_BLOCK_DEVICE) {
1014                 /* Not implemented until the BLKZEROOUT mess is fixed */
1015                 goto unimplemented;
1016         } else {
1017                 /* Regular file, try to use truncate/punch/zero. */
1018                 struct stat statbuf;
1019
1020                 if (count == 0)
1021                         return 0;
1022                 /*
1023                  * If we're trying to zero a range past the end of the file,
1024                  * extend the file size, then truncate everything.
1025                  */
1026                 ret = fstat(data->dev, &statbuf);
1027                 if (ret)
1028                         goto err;
1029                 if ((unsigned long long) statbuf.st_size <
1030                         (block + count) * channel->block_size + data->offset) {
1031                         ret = ftruncate(data->dev,
1032                                         (block + count) * channel->block_size + data->offset);
1033                         if (ret)
1034                                 goto err;
1035                 }
1036 #if defined(HAVE_FALLOCATE) && (defined(FALLOC_FL_ZERO_RANGE) || \
1037         (defined(FALLOC_FL_PUNCH_HOLE) && defined(FALLOC_FL_KEEP_SIZE)))
1038 #if defined(FALLOC_FL_PUNCH_HOLE) && defined(FALLOC_FL_KEEP_SIZE)
1039                 ret = fallocate(data->dev,
1040                                 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1041                                 (off_t)(block) * channel->block_size + data->offset,
1042                                 (off_t)(count) * channel->block_size);
1043                 if (ret == 0)
1044                         goto err;
1045 #endif
1046 #ifdef FALLOC_FL_ZERO_RANGE
1047                 ret = fallocate(data->dev,
1048                                 FALLOC_FL_ZERO_RANGE,
1049                                 (off_t)(block) * channel->block_size + data->offset,
1050                                 (off_t)(count) * channel->block_size);
1051 #endif
1052 #else
1053                 goto unimplemented;
1054 #endif /* HAVE_FALLOCATE && (ZERO_RANGE || (PUNCH_HOLE && KEEP_SIZE)) */
1055         }
1056 err:
1057         if (ret < 0) {
1058                 if (errno == EOPNOTSUPP)
1059                         goto unimplemented;
1060                 return errno;
1061         }
1062         return 0;
1063 unimplemented:
1064         return EXT2_ET_UNIMPLEMENTED;
1065 }
1066 #pragma GCC diagnostic pop
1067
1068 static struct struct_io_manager struct_unix_manager = {
1069         .magic          = EXT2_ET_MAGIC_IO_MANAGER,
1070         .name           = "Unix I/O Manager",
1071         .open           = unix_open,
1072         .close          = unix_close,
1073         .set_blksize    = unix_set_blksize,
1074         .read_blk       = unix_read_blk,
1075         .write_blk      = unix_write_blk,
1076         .flush          = unix_flush,
1077         .write_byte     = unix_write_byte,
1078         .set_option     = unix_set_option,
1079         .get_stats      = unix_get_stats,
1080         .read_blk64     = unix_read_blk64,
1081         .write_blk64    = unix_write_blk64,
1082         .discard        = unix_discard,
1083         .cache_readahead        = unix_cache_readahead,
1084         .zeroout        = unix_zeroout,
1085 };
1086
1087 io_manager unix_io_manager = &struct_unix_manager;