libfsdriver: support mmap on FSes with no device

This patch adds (very limited) support for memory-mapping pages on
file systems that are mounted on the special "none" device and that
do not implement PEEK support by themselves.  This includes hgfs,
vbfs, and procfs.

The solution is implemented in libfsdriver, and consists of allocating
pages, filling them with content by calling the file system's READ
functionality, passing the pages to VM, and freeing them again.  A new
VM flag is used to indicate that these pages should be mapped in only
once, and thus not cached beyond their single use.  This prevents
stale data from getting mapped in without the involvement of the file
system, which would be problematic on file systems where file contents
may become outdated at any time.  No VM caching means no sharing and
poor performance, but mmap no longer fails on these file systems.

Compared to a libc-based approach, this patch retains the on-demand
nature of mmap.  Especially tail(1) is known to map in a large file
area only to use a small portion of it.

All file systems now need to be given permission for the SETCACHEPAGE
and CLEARCACHE calls to VM.

A very basic regression test is added to test74.

Change-Id: I17afc4cb97315b515cad1542521b98f293b6b559
This commit is contained in:
David van Moolenbroek 2014-11-15 10:14:00 +00:00
parent 289b04677a
commit e321f65582
13 changed files with 200 additions and 21 deletions

View file

@ -385,6 +385,8 @@ service procfs
;
vm
INFO
SETCACHEPAGE
CLEARCACHE
;
uid 0;
};
@ -402,6 +404,10 @@ service hgfs
ipc
SYSTEM pm vfs rs vm
;
vm
SETCACHEPAGE
CLEARCACHE
;
};
service vbfs
@ -409,6 +415,10 @@ service vbfs
ipc
SYSTEM pm vfs rs ds vm vbox
;
vm
SETCACHEPAGE
CLEARCACHE
;
};
service printer
@ -554,6 +564,10 @@ service hello
service devman
{
uid 0;
vm
SETCACHEPAGE
CLEARCACHE
;
};
service mmc
@ -589,6 +603,10 @@ service gpio
IRQCTL # 19
PADCONF # 57
;
vm
SETCACHEPAGE
CLEARCACHE
;
irq
29 # GPIO module 1 (dm37xx)
30 # GPIO module 2 (dm37xx)

View file

@ -71,7 +71,8 @@ int vm_procctl_clear(endpoint_t ep);
int vm_procctl_handlemem(endpoint_t ep, vir_bytes m1, vir_bytes m2, int wr);
int vm_set_cacheblock(void *block, dev_t dev, off_t dev_offset,
ino_t ino, off_t ino_offset, u32_t *flags, int blocksize);
ino_t ino, off_t ino_offset, u32_t *flags, int blocksize,
int setflags);
void *vm_map_cacheblock(dev_t dev, off_t dev_offset,
ino_t ino, off_t ino_offset, u32_t *flags, int blocksize);
@ -87,5 +88,8 @@ int vm_clear_cache(dev_t dev);
/* special inode number for vm cache functions */
#define VMC_NO_INODE 0 /* to reference a disk block, no associated file */
/* setflags for vm_set_cacheblock, also used internally in VM */
#define VMSF_ONCE 0x01 /* discard block after one-time use */
#endif /* _MINIX_VM_H */

View file

@ -1,6 +1,7 @@
#include "fsdriver.h"
#include <minix/ds.h>
#include <sys/mman.h>
/*
* Process a READSUPER request from VFS.
@ -43,7 +44,8 @@ fsdriver_readsuper(const struct fsdriver * __restrict fdp,
if (r == OK) {
/* This one we can set on the file system's behalf. */
if (fdp->fdr_peek != NULL && fdp->fdr_bpeek != NULL)
if ((fdp->fdr_peek != NULL && fdp->fdr_bpeek != NULL) ||
major(dev) == NONE_MAJOR)
res_flags |= RES_HASPEEK;
m_out->m_fs_vfs_readsuper.inode = root_node.fn_ino_nr;
@ -74,6 +76,10 @@ fsdriver_unmount(const struct fsdriver * __restrict fdp,
if (fdp->fdr_unmount != NULL)
fdp->fdr_unmount();
/* If we used mmap emulation, clear any cached blocks from VM. */
if (fdp->fdr_peek == NULL && major(fsdriver_device) == NONE_MAJOR)
vm_clear_cache(fsdriver_device);
/* Update library-local state. */
fsdriver_mounted = FALSE;
@ -206,6 +212,61 @@ fsdriver_write(const struct fsdriver * __restrict fdp,
return read_write(fdp, m_in, m_out, FSC_WRITE);
}
/*
 * A read-based peek implementation.  This allows file systems that do not have
 * a buffer cache and do not implement peek, to support a limited form of mmap.
 * We map in a block, fill it by calling the file system's read function, tell
 * VM about the page, and then unmap the block again.  We tell VM not to cache
 * the block beyond its immediate use for the mmap request, so as to prevent
 * potentially stale data from being cached--at the cost of performance.
 *
 * Returns the number of bytes peeked on success, or an error code otherwise.
 */
static ssize_t
builtin_peek(const struct fsdriver * __restrict fdp, ino_t ino_nr,
	size_t nbytes, off_t pos)
{
	static u32_t flags = 0; /* storage for the VMMC_ flags of all blocks */
	static off_t dev_off = 0; /* fake device offset, see below */
	struct fsdriver_data data;
	char *buf;
	ssize_t r;

	/* Allocate a temporary anonymous buffer to be filled and handed to VM. */
	if ((buf = mmap(NULL, nbytes, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE, -1, 0)) == MAP_FAILED)
		return ENOMEM;

	/*
	 * NOTE(review): with a SELF endpoint, the grant field presumably
	 * carries a local buffer address rather than a real grant ID, making
	 * the read call copy into our own buffer -- confirm against the
	 * library's data-copying helpers.
	 */
	data.endpt = SELF;
	data.grant = (cp_grant_id_t)buf;
	data.size = nbytes;

	r = fdp->fdr_read(ino_nr, &data, nbytes, pos, FSC_READ);

	if (r >= 0) {
		/* Zero-fill the remainder of the page on a short read. */
		if ((size_t)r < nbytes)
			memset(&buf[r], 0, nbytes - r);

		/*
		 * VM uses serialized communication to VFS.  Since the page is
		 * to be used only once, VM will use and then discard it
		 * before sending a new peek request.  Thus, it should be safe
		 * to reuse the same device offset all the time.  However,
		 * relying on assumptions in protocols elsewhere is a bit
		 * dangerous, so we use an ever-increasing device offset just
		 * to be safe.
		 */
		r = vm_set_cacheblock(buf, fsdriver_device, dev_off, ino_nr,
		    pos, &flags, nbytes, VMSF_ONCE);
		if (r == OK) {
			dev_off += nbytes;
			r = nbytes;
		}
	}

	/* VM now has its own reference to the page; release our mapping. */
	munmap(buf, nbytes);

	return r;
}
/*
* Process a PEEK request from VFS.
*/
@ -222,13 +283,22 @@ fsdriver_peek(const struct fsdriver * __restrict fdp,
pos = m_in->m_vfs_fs_readwrite.seek_pos;
nbytes = m_in->m_vfs_fs_readwrite.nbytes;
if (fdp->fdr_peek == NULL)
return ENOSYS;
if (pos < 0 || nbytes > SSIZE_MAX)
return EINVAL;
r = fdp->fdr_peek(ino_nr, NULL /*data*/, nbytes, pos, FSC_PEEK);
if (fdp->fdr_peek == NULL) {
if (major(fsdriver_device) != NONE_MAJOR)
return ENOSYS;
/*
* For file systems that have no backing device, emulate peek
* support by reading into temporary buffers and passing these
* to VM.
*/
r = builtin_peek(fdp, ino_nr, nbytes, pos);
} else
r = fdp->fdr_peek(ino_nr, NULL /*data*/, nbytes, pos,
FSC_PEEK);
/* Do not return a new position. */
if (r >= 0) {

View file

@ -469,7 +469,7 @@ void lmfs_put_block(
if(vmcache && bp->lmfs_needsetcache && dev != NO_DEV) {
if((r=vm_set_cacheblock(bp->data, dev, dev_off,
bp->lmfs_inode, bp->lmfs_inode_offset,
&bp->lmfs_flags, fs_block_size)) != OK) {
&bp->lmfs_flags, fs_block_size, 0)) != OK) {
if(r == ENOSYS) {
printf("libminixfs: ENOSYS, disabling VM calls\n");
vmcache = 0;

View file

@ -14,7 +14,7 @@
static int vm_cachecall(message *m, int call, void *addr, dev_t dev,
off_t dev_offset, ino_t ino, off_t ino_offset, u32_t *flags,
int blocksize)
int blocksize, int setflags)
{
if(blocksize % PAGE_SIZE)
panic("blocksize %d should be a multiple of pagesize %d\n",
@ -39,7 +39,7 @@ static int vm_cachecall(message *m, int call, void *addr, dev_t dev,
m->m_vmmcp.flags_ptr = flags;
m->m_vmmcp.dev = dev;
m->m_vmmcp.pages = blocksize / PAGE_SIZE;
m->m_vmmcp.flags = 0;
m->m_vmmcp.flags = setflags;
return _taskcall(VM_PROC_NR, call, m);
}
@ -50,19 +50,19 @@ void *vm_map_cacheblock(dev_t dev, off_t dev_offset,
message m;
if(vm_cachecall(&m, VM_MAPCACHEPAGE, NULL, dev, dev_offset,
ino, ino_offset, flags, blocksize) != OK)
ino, ino_offset, flags, blocksize, 0) != OK)
return MAP_FAILED;
return m.m_vmmcp_reply.addr;
}
int vm_set_cacheblock(void *block, dev_t dev, off_t dev_offset,
ino_t ino, off_t ino_offset, u32_t *flags, int blocksize)
ino_t ino, off_t ino_offset, u32_t *flags, int blocksize, int setflags)
{
message m;
return vm_cachecall(&m, VM_SETCACHEPAGE, block, dev, dev_offset,
ino, ino_offset, flags, blocksize);
ino, ino_offset, flags, blocksize, setflags);
}
int

View file

@ -213,7 +213,8 @@ struct cached_page *find_cached_page_byino(dev_t dev, ino_t ino, u64_t ino_off,
return NULL;
}
int addcache(dev_t dev, u64_t dev_off, ino_t ino, u64_t ino_off, struct phys_block *pb)
int addcache(dev_t dev, u64_t dev_off, ino_t ino, u64_t ino_off, int flags,
struct phys_block *pb)
{
int hv_dev;
struct cached_page *hb;
@ -237,6 +238,7 @@ int addcache(dev_t dev, u64_t dev_off, ino_t ino, u64_t ino_off, struct phys_blo
hb->dev_offset = dev_off;
hb->ino = ino;
hb->ino_offset = ino_off;
hb->flags = flags & VMSF_ONCE;
hb->page = pb;
hb->page->refcount++; /* block also referenced by cache now */
hb->page->flags |= PBF_INCACHE;

View file

@ -12,6 +12,7 @@ struct cached_page {
ino_t ino; /* which ino is it about */
u64_t ino_offset; /* offset within ino */
int flags; /* currently only VMSF_ONCE or 0 */
struct phys_block *page; /* page ptr */
struct cached_page *older; /* older in lru chain */
struct cached_page *newer; /* newer in lru chain */

View file

@ -175,6 +175,7 @@ do_setcache(message *msg)
dev_t dev = msg->m_vmmcp.dev;
off_t dev_off = msg->m_vmmcp.dev_offset;
off_t ino_off = msg->m_vmmcp.ino_offset;
int flags = msg->m_vmmcp.flags;
int n;
struct vmproc *caller;
phys_bytes offset;
@ -209,7 +210,8 @@ do_setcache(message *msg)
if((hb=find_cached_page_bydev(dev, dev_off + offset,
msg->m_vmmcp.ino, ino_off + offset, 1))) {
/* block inode info updated */
if(hb->page != phys_region->ph) {
if(hb->page != phys_region->ph ||
(hb->flags & VMSF_ONCE)) {
/* previous cache entry has become
* obsolete; make a new one. rmcache
* removes it from the cache and frees
@ -236,8 +238,8 @@ do_setcache(message *msg)
phys_region->memtype = &mem_type_cache;
if((r=addcache(dev, dev_off + offset,
msg->m_vmmcp.ino, ino_off + offset, phys_region->ph)) != OK) {
if((r=addcache(dev, dev_off + offset, msg->m_vmmcp.ino,
ino_off + offset, flags, phys_region->ph)) != OK) {
printf("VM: addcache failed\n");
return r;
}

View file

@ -107,7 +107,17 @@ static int mappedfile_pagefault(struct vmproc *vmp, struct vir_region *region,
cp = find_cached_page_byino(region->param.file.fdref->dev,
region->param.file.fdref->ino, referenced_offset, 1);
}
if(cp) {
/*
* Normally, a cache hit saves a round-trip to the file system
* to load the page. However, if the page in the VM cache is
* marked for one-time use, then force a round-trip through the
file system anyway, so that the FS can update the page by
re-adding it to the cache. Thus, for one-time use pages,
* no caching is performed. This approach is correct even in
* the light of concurrent requests and disappearing processes
* but relies on VM requests to VFS being fully serialized.
*/
if(cp && (!cb || !(cp->flags & VMSF_ONCE))) {
int result = OK;
pb_unreferenced(region, ph, 0);
pb_link(ph, cp->page, ph->offset, region);
@ -120,6 +130,10 @@ static int mappedfile_pagefault(struct vmproc *vmp, struct vir_region *region,
result = cow_block(vmp, region, ph, 0);
}
/* Discard one-use pages after mapping them in. */
if (result == OK && (cp->flags & VMSF_ONCE))
rmcache(cp);
return result;
}
@ -210,7 +224,14 @@ int mappedfile_setfile(struct vmproc *owner,
cp = find_cached_page_byino(dev, ino,
referenced_offset, 1);
}
if(!cp) continue;
/*
* If we get a hit for a page that is to be used only once,
* then either we found a stale page (due to a process dying
* before a requested once-page could be mapped in) or this is
* a rare case of concurrent requests for the same page. In
* both cases, force the page to be obtained from its FS later.
*/
if(!cp || (cp->flags & VMSF_ONCE)) continue;
if(!(pr = pb_reference(cp->page, vaddr, region,
&mem_type_mappedfile))) {
printf("mappedfile_setfile: pb_reference failed\n");

View file

@ -227,7 +227,8 @@ int do_clearcache(message *m);
struct cached_page *find_cached_page_bydev(dev_t dev, u64_t dev_off,
ino_t ino, u64_t ino_off, int touchlru);
struct cached_page *find_cached_page_byino(dev_t dev, ino_t ino, u64_t ino_off, int touchlru);
int addcache(dev_t dev, u64_t def_off, ino_t ino, u64_t ino_off, struct phys_block *pb);
int addcache(dev_t dev, u64_t def_off, ino_t ino, u64_t ino_off, int flags,
struct phys_block *pb);
void cache_sanitycheck_internal(void);
int cache_freepages(int pages);
void get_stats_info(struct vm_stats_info *vsi);

View file

@ -230,7 +230,7 @@ u32_t sqrt_approx(u32_t v)
}
int vm_set_cacheblock(void *block, dev_t dev, off_t dev_offset,
ino_t ino, off_t ino_offset, u32_t *flags, int blocksize)
ino_t ino, off_t ino_offset, u32_t *flags, int blocksize, int setflags)
{
return ENOSYS;
}

View file

@ -36,6 +36,7 @@
#include <sys/ioctl.h>
#include <sys/ioc_memory.h>
#include <sys/param.h>
#include <minix/paths.h>
#include <stdio.h>
#include <assert.h>
#include <string.h>
@ -461,6 +462,63 @@ static void basic_regression(void)
}
/*
 * Test mmap on none-dev file systems - file systems that do not have a buffer
 * cache and therefore have to fake mmap support.  We use procfs as target.
 * The idea is that while we succeed in mapping in /proc/uptime, we also get
 * a new uptime value every time we map in the page -- VM must not cache it.
 */
static void
nonedev_regression(void)
{
	/* Size of every mapping in this test; kept in one place. */
	const size_t pagesz = 4096;
	int fd;
	char *buf;
	unsigned long uptime1, uptime2, uptime3;

	subtest++;

	if ((fd = open(_PATH_PROC "uptime", O_RDONLY)) < 0) e(1);

	/* Private read-only mapping; the tail of the page must be zeroed. */
	buf = mmap(NULL, pagesz, PROT_READ, MAP_PRIVATE | MAP_FILE, fd, 0);
	if (buf == MAP_FAILED) e(2);
	if (buf[pagesz - 1] != 0) e(3);
	if ((uptime1 = atoi(buf)) == 0) e(4);
	if (munmap(buf, pagesz) != 0) e(5);

	sleep(2);

	/* Remapping after a delay must yield a fresh, uncached uptime. */
	buf = mmap(NULL, pagesz, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_FILE,
	    fd, 0);
	if (buf == MAP_FAILED) e(6);
	if (buf[pagesz - 1] != 0) e(7);
	if ((uptime2 = atoi(buf)) == 0) e(8);
	if (uptime1 == uptime2) e(9);
	if (munmap(buf, pagesz) != 0) e(10);

	sleep(2);

	/* A shared mapping must also bypass any VM-cached copy of the page. */
	buf = mmap(NULL, pagesz, PROT_READ, MAP_SHARED | MAP_FILE, fd, 0);
	if (buf == MAP_FAILED) e(11);
	if (buf[pagesz - 1] != 0) e(12);
	if ((uptime3 = atoi(buf)) == 0) e(13);
	if (uptime1 == uptime3) e(14);
	if (uptime2 == uptime3) e(15);
	if (munmap(buf, pagesz) != 0) e(16);

	close(fd);
}
int
main(int argc, char *argv[])
{
@ -470,6 +528,8 @@ main(int argc, char *argv[])
basic_regression();
nonedev_regression();
test_memory_types_vs_operations();
makefiles(MAXFILES);

View file

@ -44,7 +44,7 @@ int dowriteblock(int b, int blocksize, u32_t seed, char *block)
memcpy(bdata, block, blocksize);
if(mustset && (r=vm_set_cacheblock(bdata, MYDEV, dev_off,
VMC_NO_INODE, 0, NULL, blocksize)) != OK) {
VMC_NO_INODE, 0, NULL, blocksize, 0)) != OK) {
printf("dowriteblock: vm_set_cacheblock failed %d\n", r);
exit(1);
}