/* * This file provides an implementation for block I/O functions as expected by * libfsdriver for root file systems. In particular, the lmfs_driver function * can be used to implement fdr_driver, the lmfs_bio function can be used to * implement the fdr_bread, fdr_bwrite, and fdr_bpeek hooks, and the the * lmfs_bflush function can be used to implement the fdr_bflush hook. At the * very least, a file system that makes use of the provided functionality * must adhere to the following rules: * * o it must initialize this library in order to set up a buffer pool for * use by these functions, using the lmfs_buf_pool function; the * recommended number of blocks for *non*-disk-backed file systems is * LMFS_MAX_PREFETCH buffers (disk-backed file systems typically use many * more); * o it must enable VM caching in order to support memory mapping of block * devices, using the lmfs_may_use_vmcache function; * o it must either use lmfs_flushall as implementation for the fdr_sync * hook, or call lmfs_flushall as part of its own fdr_sync implementation. * * In addition, a disk-backed file system (as opposed to e.g. a networked file * system that intends to be able to serve as a root file system) should * consider the following points: * * o it may restrict calls to fdr_bwrite on the mounted partition, for * example to the partition's first 1024 bytes; it should generally not * prevent that area from being written even if the file system is mounted * read-only; * o it is free to set its own block size, although the default block size * works fine for raw block I/O as well. */ #include #include #include #include #include #include #include #include "inc.h" /* * Set the driver label of the device identified by 'dev' to 'label'. While * 'dev' is a full device number, only its major device number is to be used. * This is a very thin wrapper right now, but eventually we will want to hide * all of libbdev from file systems that use this library, so it is a start. 
 */
void lmfs_driver(dev_t dev, char *label)
{

	/* Thin pass-through to libbdev; see the comment above. */
	bdev_driver(dev, label);
}

/*
 * Prefetch up to "nblocks" blocks on "dev" starting from block number
 * "block".  The size to be used for the last block in the range is given as
 * "last_size".  Stop early when either the I/O request fills up or when a
 * block is already found to be in the cache.  The latter is likely to happen
 * often, since this function is called before getting each block for
 * reading.  Prefetching is a strictly best-effort operation, and may fail
 * silently.
 * TODO: limit according to the number of available buffers.
 */
static void block_prefetch(dev_t dev, block64_t block, unsigned int nblocks,
	size_t block_size, size_t last_size)
{
	struct buf *bp;
	unsigned int count, limit;
	int r;

	/* Never prefetch more than the cache's readahead limit allows. */
	limit = lmfs_readahead_limit();
	assert(limit >= 1 && limit <= LMFS_MAX_PREFETCH);

	if (nblocks > limit) {
		nblocks = limit;

		/*
		 * The (possibly partial) last block of the range is no longer
		 * part of what we prefetch, so all blocks are full-sized.
		 */
		last_size = block_size;
	}

	/*
	 * Probe (PEEK) each block in the range; a PEEK succeeds only when the
	 * block is already in the cache, in which case we stop early.
	 */
	for (count = 0; count < nblocks; count++) {
		if (count == nblocks - 1 && last_size < block_size)
			r = lmfs_get_partial_block(&bp, dev, block + count,
			    PEEK, last_size);
		else
			r = lmfs_get_block(&bp, dev, block + count, PEEK);

		if (r == OK) {
			/*
			 * Already cached: release it again, and since the
			 * (possibly partial) last block was not reached, the
			 * readahead range now ends on a full-sized block.
			 */
			lmfs_put_block(bp);

			last_size = block_size;

			break;
		}
	}

	/* Issue one readahead request for the uncached prefix, if any. */
	if (count > 0)
		lmfs_readahead(dev, block, count, last_size);
}

/*
 * Perform block I/O, on "dev", starting from offset "pos", for a total of
 * "bytes" bytes.  Reading, writing, and peeking are highly similar, and
 * thus, this function implements all of them.  The "call" parameter
 * indicates the call type (one of FSC_READ, FSC_WRITE, FSC_PEEK).  For read
 * and write calls, "data" will identify the user buffer to use; for peek
 * calls, "data" is set to NULL.  In all cases, this function returns the
 * number of bytes successfully transferred, 0 on end-of-file conditions, and
 * a negative error code if no bytes could be transferred due to an error.
 * Dirty data is not flushed immediately, and thus, a successful write only
 * indicates that the data have been taken in by the cache (for immediate
 * I/O, a character device would have to be used, but MINIX3 no longer
 * supports this), which may be followed later by silent failures.
 * End-of-file conditions are always reported immediately, though.
 */
ssize_t lmfs_bio(dev_t dev, struct fsdriver_data * data, size_t bytes,
	off_t pos, int call)
{
	block64_t block;
	struct part_geom part;
	size_t block_size, off, block_off, last_size, size, chunk;
	unsigned int blocks_left;
	struct buf *bp;
	int r, do_write, how;

	if (dev == NO_DEV)
		return EINVAL;

	block_size = lmfs_fs_block_size();
	do_write = (call == FSC_WRITE);

	assert(block_size > 0);

	if (bytes == 0)
		return 0; /* just in case */

	/* Reject requests whose byte range cannot be represented. */
	if (pos < 0 || bytes > SSIZE_MAX || pos > INT64_MAX - bytes + 1)
		return EINVAL;

	/*
	 * Get the partition size, so that we can handle EOF ourselves.
	 * Unfortunately, we cannot cache the results between calls, since we
	 * do not get to see DIOCSETP ioctls--see also repartition(8).
	 */
	if ((r = bdev_ioctl(dev, DIOCGETP, &part, NONE /*user_endpt*/)) != OK)
		return r;

	if ((uint64_t)pos >= part.size)
		return 0; /* EOF */

	/* Clamp the request so that it does not run past the partition end. */
	if ((uint64_t)pos > part.size - bytes)
		bytes = part.size - pos;

	off = 0;
	block = pos / block_size;		/* first block in the range */
	block_off = (size_t)(pos % block_size);	/* offset into that block */
	blocks_left = howmany(block_off + bytes, block_size);
	assert(blocks_left > 0);

	/*
	 * If the last block we need is also the last block of the device,
	 * see how many bytes we should actually transfer for that block.
	 */
	if (block + blocks_left - 1 == part.size / block_size)
		last_size = part.size % block_size;
	else
		last_size = block_size;

	r = OK;

	/* Transfer one block-sized (or smaller) chunk per iteration. */
	for (off = 0; off < bytes && blocks_left > 0; off += chunk) {
		/* The last block of the range may be a partial block. */
		size = (blocks_left == 1) ? last_size : block_size;

		chunk = size - block_off;
		if (chunk > bytes - off)
			chunk = bytes - off;

		assert(chunk > 0 && chunk <= size);

		/*
		 * For read requests, help the block driver form larger I/O
		 * requests.
		 */
		if (!do_write)
			block_prefetch(dev, block, blocks_left, block_size,
			    last_size);

		/*
		 * Do not read the block from disk if we will end up
		 * overwriting all of its contents.
		 */
		how = (do_write && chunk == size) ? NO_READ : NORMAL;

		if (size < block_size)
			r = lmfs_get_partial_block(&bp, dev, block, how,
			    size);
		else
			r = lmfs_get_block(&bp, dev, block, how);

		if (r != OK) {
			printf("libminixfs: error getting block <%"PRIx64","
			    "%"PRIu64"> for device I/O (%d)\n", dev, block,
			    r);

			break;
		}

		/* Perform the actual copy. */
		if (r == OK && data != NULL) {
			if (do_write) {
				r = fsdriver_copyin(data, off,
				    (char *)bp->data + block_off, chunk);

				/*
				 * Mark the block as dirty even if the copy
				 * failed, since the copy may in fact have
				 * succeeded partially.  This is an interface
				 * issue that should be resolved at some
				 * point, but for now we do not want the cache
				 * to be desynchronized from the disk
				 * contents.
				 */
				lmfs_markdirty(bp);
			} else
				r = fsdriver_copyout(data, off,
				    (char *)bp->data + block_off, chunk);
		}

		lmfs_put_block(bp);

		if (r != OK)
			break;

		/* Advance to the next block; only the first has an offset. */
		block++;
		block_off = 0;
		blocks_left--;
	}

	/*
	 * If we were not able to do any I/O, return the error.  Otherwise,
	 * return how many bytes we did manage to transfer.
	 */
	if (r != OK && off == 0)
		return r;

	return off;
}

/*
 * Perform a flush request on a block device, flushing and invalidating all
 * blocks associated with this device, both in the local cache and in VM.
 * This operation is called after a block device is closed and must prevent
 * that stale copies of blocks remain in any cache.
 */
void lmfs_bflush(dev_t dev)
{

	/* First flush any dirty blocks on this device to disk. */
	lmfs_flushdev(dev);

	/* Then purge any blocks associated with the device. */
	lmfs_invalidate(dev);
}