libminixfs: add support for memory-mapped holes

When VM asks a file system to provide a block to satisfy a page fault on a file memory mapping, the file system previously had no way to inform VM that the block is a hole, since there is no corresponding block on the underlying device. To work around this, MFS and ext2 would actually allocate a block for the hole when asked by VM, which not only defeats the point of holes in the first place, but also does not work on read-only file systems. With this patch, a new libminixfs call allows the file system to inform VM about holes. This issue does raise the question as to whether the VM cache is using the right data structures, since there are now two places where we have to fake a device offset. This will have to be revisited in the future. The patch changes file systems accordingly, and adds a test to test74. Change-Id: Ib537d56b3f30a8eb05bc1f63c92b5c7428d18f4c
2015-08-13 12:23:06 +00:00 · 2015-08-13 12:23:06 +00:00 · d75faf18d9
commit d75faf18d9
parent e94f856b38
5 changed files with 183 additions and 6 deletions
--- a/minix/fs/ext2/read.c
+++ b/minix/fs/ext2/read.c
@ -148,8 +148,12 @@ int *completed;                 /* number of bytes copied */
 			printf("ext2fs: fsdriver_zero failed\n");
 		}
 		return r;
+	} else if (call == FSC_PEEK) {
+		/* Peeking a nonexistent block. Report to VM. */
+		lmfs_zero_block_ino(dev, ino, ino_off);
+		return OK;
 	} else {
-               /* Writing to or peeking a nonexistent block.
+               /* Writing to a nonexistent block.
                * Create and enter in inode.
                */
 		if ((bp = new_block(rip, (off_t) ex64lo(position))) == NULL)
--- a/minix/fs/mfs/read.c
+++ b/minix/fs/mfs/read.c
@ -159,8 +159,12 @@ int *completed;			/* number of bytes copied */
 			printf("MFS: fsdriver_zero failed\n");
 		}
 		return r;
+	} else if (call == FSC_PEEK) {
+		/* Peeking a nonexistent block. Report to VM. */
+		lmfs_zero_block_ino(dev, ino, ino_off);
+		return OK;
 	} else {
-		/* Writing to or peeking a nonexistent block.
+		/* Writing to a nonexistent block.
 		 * Create and enter in inode.
 		 */
 		if ((bp = new_block(rip, (off_t) ex64lo(position))) == NULL)
--- a/minix/include/minix/libminixfs.h
+++ b/minix/include/minix/libminixfs.h
@ -47,6 +47,7 @@ struct buf *lmfs_get_block_ino(dev_t dev, block64_t block,int only_search,
 	ino_t ino, u64_t off);
 void lmfs_put_block(struct buf *bp, int block_type);
 void lmfs_free_block(dev_t dev, block64_t block);
+void lmfs_zero_block_ino(dev_t dev, ino_t ino, u64_t off);
 void lmfs_invalidate(dev_t device);
 void lmfs_rw_scattered(dev_t, struct buf **, int, int);
 void lmfs_setquiet(int q);
--- a/minix/lib/libminixfs/cache.c
+++ b/minix/lib/libminixfs/cache.c
@ -445,7 +445,7 @@ void lmfs_put_block(
 */
  dev_t dev;
  uint64_t dev_off;
-  int r;
+  int r, setflags;

  if (bp == NULL) return;	/* it is easier to check here than in caller */

@ -487,9 +487,10 @@ void lmfs_put_block(

  /* block has sensible content - if necesary, identify it to VM */
  if(vmcache && bp->lmfs_needsetcache && dev != NO_DEV) {
-  	if((r=vm_set_cacheblock(bp->data, dev, dev_off,
-	bp->lmfs_inode, bp->lmfs_inode_offset,
-	&bp->lmfs_flags, fs_block_size, 0)) != OK) {
+	setflags = (block_type & ONE_SHOT) ? VMSF_ONCE : 0;
+	if ((r = vm_set_cacheblock(bp->data, dev, dev_off, bp->lmfs_inode,
+	    bp->lmfs_inode_offset, &bp->lmfs_flags, fs_block_size,
+	    setflags)) != OK) {
 		if(r == ENOSYS) {
 			printf("libminixfs: ENOSYS, disabling VM calls\n");
 			vmcache = 0;
@ -500,6 +501,14 @@ void lmfs_put_block(
 	}
  }
  bp->lmfs_needsetcache = 0;
+
+  /* Now that we (may) have given the block to VM, invalidate the block if it
+   * is a one-shot block.  Otherwise, it may still be reobtained immediately
+   * after, which could be a problem if VM already forgot the block and we are
+   * expected to pass it to VM again, which then wouldn't happen.
+   */
+  if (block_type & ONE_SHOT)
+	bp->lmfs_dev = NO_DEV;
 }

 /*===========================================================================*
@ -544,6 +553,62 @@ void lmfs_free_block(dev_t dev, block64_t block)
   */
 }

+/*===========================================================================*
+ *				lmfs_zero_block_ino			     *
+ *===========================================================================*/
+void lmfs_zero_block_ino(dev_t dev, ino_t ino, u64_t ino_off)
+{
+/* Tell VM about the hole at offset 'ino_off' in inode 'ino' on device 'dev'.
+ * To applications, a hole is just a file region that reads as zeroes. To the
+ * file system, it may be a region without blocks allocated on disk, and thus
+ * without a block number, so lmfs_get_block_ino() cannot simply be used. For
+ * reads that is fine: the file system zeroes the application buffer instead.
+ * For mapped pages it is not: the VM cache needs to be told about a block,
+ * and VM does not accept blocks without a device offset. This function thus
+ * gives VM a block under a fake device offset, picked such that the VM cache
+ * sees the block when the hole in the file is memory-mapped, but not when the
+ * block device is memory-mapped. It does nothing when 'vmcache' is not set.
+ */
+  static block64_t cur_fake = 0;	/* fake block number used most recently */
+  uint64_t fake_first, fake_limit;
+  struct buf *hole_bp;
+
+  if (!vmcache)
+	return;
+
+  assert(fs_block_size > 0);
+
+  /* Fake block numbers are taken from above the threshold of what mmap'ing
+   * the device could ever map in: off_t is signed, and 8-exabyte devices are
+   * still a while off. Each call moves on to the next number, wrapping back
+   * to the threshold, to avoid possible concurrency issues.
+   * FIXME: it does not seem like VM actually verifies mmap offsets though..
+   */
+  fake_first = ((uint64_t)INT64_MAX + 1) / fs_block_size;
+  fake_limit = UINT64_MAX / fs_block_size;
+  if (cur_fake == 0)
+	cur_fake = fake_first;
+  else if (++cur_fake >= fake_limit)
+	cur_fake = fake_first;
+
+  /* Obtain a block for the fake block number and the given inode position. */
+  hole_bp = lmfs_get_block_ino(dev, cur_fake, NO_READ, ino, ino_off);
+  assert(hole_bp != NULL);
+  assert(hole_bp->lmfs_dev != NO_DEV);
+
+  /* No memset is needed: the block was just allocated with mmap, so it is
+   * already zeroed. File systems do not depend on that yet; should VM ever
+   * stop clearing blocks we allocate (e.g., by recycling VM cache pages for
+   * the same process, which would be safe), a memset must be added here.
+   * Release the block as one-shot (VMSF_ONCE towards VM). It should never be
+   * accessed again, and if it lingered in the VM cache, it might later be
+   * mapped in erroneously beyond the end of the file.
+   * TODO: tell VM that it is an all-zeroes block, so that VM can deduplicate
+   * all such pages in its cache.
+   */
+  lmfs_put_block(hole_bp, ONE_SHOT);
+}
+
 void lmfs_cache_reevaluate(dev_t dev)
 {
  if(bufs_in_use == 0 && dev != NO_DEV) {
--- a/minix/tests/test74.c
+++ b/minix/tests/test74.c
@ -690,6 +690,107 @@ corruption_regression(void)
 	free(buf);
 }

+/*
+ * Test mmap on file holes.  Holes are a tricky case with the current VM
+ * implementation.  There are two main issues.  First, whenever a file data
+ * block is freed, VM has to know about this, or it will later blindly map in
+ * the old data.  Thus, file systems explicitly tell VM (through libminixfs)
+ * whenever a block is freed, upon which VM cache forgets the block.  Second,
+ * blocks are accessed primarily by a <dev,dev_off> pair and only additionally
+ * by a <ino,ino_off> pair.  Holes have no meaningful value for the first pair,
+ * but do need to be registered in VM with the second pair, or accessing them
+ * will generate a segmentation fault.  Thus, file systems explicitly tell VM
+ * (through libminixfs) when a hole is being peeked; libminixfs currently fakes
+ * a device offset to make this work.
+ */
+static void
+hole_regression(void)
+{
+	struct statvfs st;
+	size_t block_size;
+	char *buf;
+	int fd;
+
+	if (statvfs(".", &st) < 0) e(1);
+
+	block_size = st.f_bsize;
+
+	if ((buf = malloc(block_size)) == NULL) e(2);
+
+	if ((fd = open("testfile", O_CREAT | O_TRUNC | O_RDWR, 0600)) < 0) e(3);
+
+	if (unlink("testfile") != 0) e(4);
+
+	/*
+	 * We perform the test twice, in a not-so-perfect attempt to test the
+	 * two aspects independently.  The first part immediately creates a
+	 * hole, and is supposed to fail only if reporting holes to VM does not
+	 * work.  However, it may also fail if a page for a previous file with
+	 * the same inode number as "testfile" is still in the VM cache.
+	 */
+	memset(buf, 12, block_size);
+
+	if (write(fd, buf, block_size) != block_size) e(5);
+
+	if (lseek(fd, block_size * 2, SEEK_CUR) != block_size * 3) e(6);
+
+	memset(buf, 78, block_size);
+
+	if (write(fd, buf, block_size) != block_size) e(7);
+
+	free(buf);
+
+	if ((buf = mmap(NULL, 4 * block_size, PROT_READ, MAP_SHARED | MAP_FILE,
+	    fd, 0)) == MAP_FAILED) e(8);
+
+	if (buf[0 * block_size] != 12 || buf[1 * block_size - 1] != 12) e(9);
+	if (buf[1 * block_size] !=  0 || buf[2 * block_size - 1] !=  0) e(10);
+	if (buf[2 * block_size] !=  0 || buf[3 * block_size - 1] !=  0) e(11);
+	if (buf[3 * block_size] != 78 || buf[4 * block_size - 1] != 78) e(12);
+
+	if (munmap(buf, 4 * block_size) != 0) e(13);
+
+	/*
+	 * The second part first creates file content and only turns part of it
+	 * into a file hole, thus ensuring that VM has previously cached pages
+	 * for the blocks that are freed.  The test will fail if VM keeps the
+	 * pages around in its cache.
+	 */
+	if ((buf = malloc(block_size)) == NULL) e(14);
+
+	if (lseek(fd, block_size, SEEK_SET) != block_size) e(15);
+
+	memset(buf, 34, block_size);
+
+	if (write(fd, buf, block_size) != block_size) e(16);
+
+	memset(buf, 56, block_size);
+
+	if (write(fd, buf, block_size) != block_size) e(17);
+
+	if (ftruncate(fd, block_size) != 0) e(18);
+
+	if (lseek(fd, block_size * 3, SEEK_SET) != block_size * 3) e(19);
+
+	memset(buf, 78, block_size);
+
+	if (write(fd, buf, block_size) != block_size) e(20);
+
+	free(buf);
+
+	if ((buf = mmap(NULL, 4 * block_size, PROT_READ, MAP_SHARED | MAP_FILE,
+	    fd, 0)) == MAP_FAILED) e(21);
+
+	if (buf[0 * block_size] != 12 || buf[1 * block_size - 1] != 12) e(22);
+	if (buf[1 * block_size] !=  0 || buf[2 * block_size - 1] !=  0) e(23);
+	if (buf[2 * block_size] !=  0 || buf[3 * block_size - 1] !=  0) e(24);
+	if (buf[3 * block_size] != 78 || buf[4 * block_size - 1] != 78) e(25);
+
+	if (munmap(buf, 4 * block_size) != 0) e(26);
+
+	close(fd);
+}
+
 int
 main(int argc, char *argv[])
 {
@ -709,6 +810,8 @@ main(int argc, char *argv[])
 	for (i = 0; i < 10; i++)
 		corruption_regression();

+	hole_regression();
+
 	test_memory_types_vs_operations();

 	makefiles(MAXFILES);