2014-02-28 16:56:40 +01:00
|
|
|
/* Test 74 - mmap functionality & regression test.
|
|
|
|
*
|
|
|
|
* This test tests some basic functionality of mmap, and also some
|
|
|
|
* cases that are quite complex for the system to handle.
|
|
|
|
*
|
|
|
|
* Memory pages are generally made available on demand. Memory copying
|
|
|
|
* is done by the kernel. As the kernel may encounter pagefaults in
|
|
|
|
* legitimate memory ranges (e.g. pages that aren't mapped; pages that
|
|
|
|
* are mapped RO as they are COW), it cooperates with VM to make the
|
|
|
|
* mappings and let the copy succeed transparently.
|
|
|
|
*
|
|
|
|
* With file-mapped ranges this can result in a deadlock, if care is
|
|
|
|
 * not taken, as the copy might be requested by VFS or an FS. This test
|
|
|
|
* triggers as many of these states as possible to ensure they are
|
|
|
|
* successful or (where appropriate) fail gracefully, i.e. without
|
|
|
|
* deadlock.
|
|
|
|
*
|
|
|
|
* To do this, system calls are done with source or target buffers with
|
|
|
|
* missing or readonly mappings, both anonymous and file-mapped. The
|
|
|
|
* cache is flushed before mmap() so that we know the mappings should
|
|
|
|
* not be present on mmap() time. Then e.g. a read() or write() is
|
|
|
|
* executed with that buffer as target. This triggers a FS copying
|
|
|
|
* to or from a missing range that it itself is needed to map in first.
|
|
|
|
* VFS detects this, requests VM to map in the pages, which does so with
|
|
|
|
* the help of another VFS thread and the FS, and then re-issues the
|
|
|
|
* request to the FS.
|
|
|
|
*
|
|
|
|
 * Another case is when the VFS itself does such a copy. This is actually
|
|
|
|
* unusual as filenames are already faulted in by the requesting process
|
|
|
|
* in libc by strlen(). select() allows such a case, however, so this
|
|
|
|
* is tested too. We are satisfied if the call completes.
|
2013-05-07 14:36:09 +02:00
|
|
|
*/
|
|
|
|
|
|
|
|
#include <sys/types.h>
|
|
|
|
#include <sys/mman.h>
|
2014-02-28 16:56:40 +01:00
|
|
|
#include <sys/ioctl.h>
|
2013-05-07 14:36:09 +02:00
|
|
|
#include <sys/ioc_memory.h>
|
2014-02-28 16:56:40 +01:00
|
|
|
#include <sys/param.h>
|
libfsdriver: support mmap on FSes with no device
This patch adds (very limited) support for memory-mapping pages on
file systems that are mounted on the special "none" device and that
do not implement PEEK support by themselves. This includes hgfs,
vbfs, and procfs.
The solution is implemented in libvtreefs, and consists of allocating
pages, filling them with content by calling the file system's READ
functionality, passing the pages to VM, and freeing them again. A new
VM flag is used to indicate that these pages should be mapped in only
once, and thus not cached beyond their single use. This prevents
stale data from getting mapped in without the involvement of the file
system, which would be problematic on file systems where file contents
may become outdated at any time. No VM caching means no sharing and
poor performance, but mmap no longer fails on these file systems.
Compared to a libc-based approach, this patch retains the on-demand
nature of mmap. Especially tail(1) is known to map in a large file
area only to use a small portion of it.
All file systems now need to be given permission for the SETCACHEPAGE
and CLEARCACHE calls to VM.
A very basic regression test is added to test74.
Change-Id: I17afc4cb97315b515cad1542521b98f293b6b559
2014-11-15 11:14:00 +01:00
|
|
|
#include <minix/paths.h>
|
2013-05-07 14:36:09 +02:00
|
|
|
#include <stdio.h>
|
|
|
|
#include <assert.h>
|
|
|
|
#include <string.h>
|
|
|
|
#include <stdlib.h>
|
|
|
|
#include <unistd.h>
|
|
|
|
#include <fcntl.h>
|
2014-02-28 16:56:40 +01:00
|
|
|
#include <dirent.h>
|
2013-05-07 14:36:09 +02:00
|
|
|
|
|
|
|
#include "common.h"
|
|
|
|
#include "testcache.h"
|
|
|
|
|
2014-02-28 16:56:40 +01:00
|
|
|
/* Error tolerance for common.c's e(): zero makes every e() call fatal. */
int max_error = 0;	/* make all e()'s fatal */
|
|
|
|
|
2013-05-07 14:36:09 +02:00
|
|
|
int
|
|
|
|
dowriteblock(int b, int blocksize, u32_t seed, char *data)
|
|
|
|
{
|
|
|
|
u64_t offset;
|
|
|
|
int fd;
|
|
|
|
|
|
|
|
get_fd_offset(b, blocksize, &offset, &fd);
|
|
|
|
|
|
|
|
if(pwrite(fd, data, blocksize, offset) < blocksize) {
|
|
|
|
perror("pwrite");
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
return blocksize;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * testcache callback: read one test block back both with pread(2) and
 * through a read-only private file mapping, and verify that the two
 * sources agree.  Whether the pread happens before or after the mmap is
 * randomized, so both orderings of FS access vs. page-fault handling get
 * exercised.  Returns the block size on success, -1 on any failure.
 */
int
readblock(int b, int blocksize, u32_t seed, char *data)
{
	u64_t offset;
	int fd;
	char *mmapdata;
	int pread_first = random() % 2;	/* randomize pread/mmap ordering */

	get_fd_offset(b, blocksize, &offset, &fd);

	if(pread_first) {
		if(pread(fd, data, blocksize, offset) < blocksize) {
			perror("pread");
			return -1;
		}
	}

	/* map the same file range read-only; its contents must match pread's */
	if((mmapdata = mmap(NULL, blocksize, PROT_READ, MAP_PRIVATE | MAP_FILE,
		fd, offset)) == MAP_FAILED) {
		perror("mmap");
		return -1;
	}

	if(!pread_first) {
		if(pread(fd, data, blocksize, offset) < blocksize) {
			perror("pread");
			return -1;
		}
	}

	/* mapped data and pread data must be identical */
	if(memcmp(mmapdata, data, blocksize)) {
		fprintf(stderr, "readblock: mmap, pread mismatch\n");
		return -1;
	}

	if(munmap(mmapdata, blocksize) < 0) {
		perror("munmap");
		return -1;
	}

	return blocksize;
}
|
|
|
|
|
|
|
|
/* testcache hook: no per-run cleanup is needed for this test. */
void testend(void) { }
|
|
|
|
|
2014-02-28 16:56:40 +01:00
|
|
|
static void do_read(void *buf, int fd, int writable)
|
|
|
|
{
|
|
|
|
ssize_t ret;
|
|
|
|
size_t n = PAGE_SIZE;
|
|
|
|
struct stat sb;
|
|
|
|
if(fstat(fd, &sb) < 0) e(1);
|
|
|
|
if(S_ISDIR(sb.st_mode)) return;
|
|
|
|
ret = read(fd, buf, n);
|
|
|
|
|
|
|
|
/* if the buffer is writable, it should succeed */
|
|
|
|
if(writable) { if(ret != n) e(3); return; }
|
|
|
|
|
|
|
|
/* if the buffer is not writable, it should fail with EFAULT */
|
|
|
|
if(ret >= 0) e(4);
|
|
|
|
if(errno != EFAULT) e(5);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void do_write(void *buf, int fd, int writable)
|
|
|
|
{
|
|
|
|
size_t n = PAGE_SIZE;
|
|
|
|
struct stat sb;
|
|
|
|
if(fstat(fd, &sb) < 0) e(1);
|
|
|
|
if(S_ISDIR(sb.st_mode)) return;
|
|
|
|
if(write(fd, buf, n) != n) e(3);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Experiment: use 'buf' as the output buffer for fstat(2).  Writable
 * buffers must make the call succeed; read-only buffers must make it
 * fail gracefully with EFAULT.
 */
static void do_stat(void *buf, int fd, int writable)
{
	int res = fstat(fd, (struct stat *) buf);

	if(writable) {
		if(res < 0) e(3);
		return;
	}

	/* read-only target buffer: expect a graceful EFAULT failure */
	if(res >= 0) e(4);
	if(errno != EFAULT) e(5);
}
|
|
|
|
|
|
|
|
static void do_getdents(void *buf, int fd, int writable)
|
|
|
|
{
|
|
|
|
struct stat sb;
|
|
|
|
int r;
|
|
|
|
if(fstat(fd, &sb) < 0) e(1);
|
|
|
|
if(!S_ISDIR(sb.st_mode)) return; /* OK */
|
|
|
|
r = getdents(fd, buf, PAGE_SIZE);
|
|
|
|
if(writable) { if(r < 0) e(3); return; }
|
|
|
|
|
|
|
|
/* should fail with EFAULT if buf is not */
|
|
|
|
if(r >= 0) e(4);
|
|
|
|
if(errno != EFAULT) e(5);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Experiment: use 'buf' as the *path* argument to readlink(2).  The call
 * merely has to fail gracefully (i.e. without deadlocking the system);
 * its return value is deliberately ignored.
 */
static void do_readlink1(void *buf, int fd, int writable)
{
	char linktarget[200];

	(void)readlink(buf, linktarget, sizeof(linktarget));
}
|
|
|
|
|
|
|
|
#define NODENAME "a"
|
|
|
|
#define TARGETNAME "b"
|
|
|
|
|
|
|
|
static void do_readlink2(void *buf, int fd, int writable)
|
|
|
|
{
|
|
|
|
ssize_t rl;
|
|
|
|
unlink(NODENAME);
|
|
|
|
if(symlink(TARGETNAME, NODENAME) < 0) e(1);
|
|
|
|
rl=readlink(NODENAME, buf, sizeof(buf));
|
|
|
|
|
|
|
|
/* if buf is writable, it should succeed, with a certain result */
|
|
|
|
if(writable) {
|
|
|
|
if(rl < 0) e(2);
|
|
|
|
((char *) buf)[rl] = '\0';
|
|
|
|
if(strcmp(buf, TARGETNAME)) {
|
|
|
|
fprintf(stderr, "readlink: expected %s, got %s\n",
|
2015-09-21 19:14:39 +02:00
|
|
|
TARGETNAME, (char *)buf);
|
2014-02-28 16:56:40 +01:00
|
|
|
e(3);
|
|
|
|
}
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* if buf is not writable, it should fail with EFAULT */
|
|
|
|
if(rl >= 0) e(4);
|
|
|
|
|
|
|
|
if(errno != EFAULT) e(5);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void do_symlink1(void *buf, int fd, int writable)
|
|
|
|
{
|
|
|
|
int r;
|
|
|
|
/* the system call just has to fail gracefully */
|
|
|
|
r = symlink(buf, NODENAME);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void do_symlink2(void *buf, int fd, int writable)
|
|
|
|
{
|
|
|
|
int r;
|
|
|
|
/* the system call just has to fail gracefully */
|
|
|
|
r = symlink(NODENAME, buf);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Experiment: use 'buf' as the path argument to open(2).  The call just
 * has to fail gracefully; if it happens to succeed, close the returned
 * descriptor again.
 */
static void do_open(void *buf, int fd, int writable)
{
	int newfd;

	newfd = open(buf, O_RDONLY);
	if(newfd >= 0)
		close(newfd);
}
|
|
|
|
|
|
|
|
static void do_select1(void *buf, int fd, int writable)
|
|
|
|
{
|
|
|
|
int r;
|
|
|
|
struct timeval timeout = { 0, 200000 }; /* 0.2 sec */
|
|
|
|
/* the system call just has to fail gracefully */
|
|
|
|
r = select(1, buf, NULL, NULL, &timeout);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void do_select2(void *buf, int fd, int writable)
|
|
|
|
{
|
|
|
|
int r;
|
|
|
|
struct timeval timeout = { 0, 200000 }; /* 1 sec */
|
|
|
|
/* the system call just has to fail gracefully */
|
|
|
|
r = select(1, NULL, buf, NULL, &timeout);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void do_select3(void *buf, int fd, int writable)
|
|
|
|
{
|
|
|
|
int r;
|
|
|
|
struct timeval timeout = { 0, 200000 }; /* 1 sec */
|
|
|
|
/* the system call just has to fail gracefully */
|
|
|
|
r = select(1, NULL, NULL, buf, &timeout);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void fillfile(int fd, int size)
|
|
|
|
{
|
|
|
|
char *buf = malloc(size);
|
|
|
|
|
|
|
|
if(size < 1 || size % PAGE_SIZE || !buf) { e(1); }
|
|
|
|
memset(buf, 'A', size);
|
|
|
|
buf[50] = '\0'; /* so it can be used as a filename arg */
|
|
|
|
buf[size-1] = '\0';
|
|
|
|
if(write(fd, buf, size) != size) { e(2); }
|
|
|
|
if(lseek(fd, SEEK_SET, 0) < 0) { e(3); }
|
|
|
|
free(buf);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Create the three buffer types used by the experiments, each 'size'
 * bytes: a writable private file mapping (*filebuf_rw), a read-only
 * private file mapping (*filebuf_ro), and a writable anonymous mapping
 * (*anonbuf).  The backing descriptors are returned via ret_fd_rw and
 * ret_fd_ro; the temporary files are unlinked before returning, so the
 * open fd's and mappings are what keep them alive.  The FS cache is
 * flushed before mapping so the mapped pages are known not to be
 * resident at mmap() time.
 */
static void make_buffers(int size,
	int *ret_fd_rw, int *ret_fd_ro,
	void **filebuf_rw, void **filebuf_ro, void **anonbuf)
{
	char fn_rw[] = "testfile_rw.XXXXXX", fn_ro[] = "testfile_ro.XXXXXX";
	*ret_fd_rw = mkstemp(fn_rw);
	*ret_fd_ro = mkstemp(fn_ro);

	/* size must be a positive whole number of pages */
	if(size < 1 || size % PAGE_SIZE) { e(2); }
	if(*ret_fd_rw < 0) { e(1); }
	if(*ret_fd_ro < 0) { e(1); }
	fillfile(*ret_fd_rw, size);
	fillfile(*ret_fd_ro, size);
	/* flush the cache so mmap() will not find the pages present */
	if(fcntl(*ret_fd_rw, F_FLUSH_FS_CACHE) < 0) { e(4); }
	if(fcntl(*ret_fd_ro, F_FLUSH_FS_CACHE) < 0) { e(4); }

	if((*filebuf_rw = mmap(0, size, PROT_READ | PROT_WRITE,
		MAP_PRIVATE | MAP_FILE, *ret_fd_rw, 0)) == MAP_FAILED) {
		e(5);
		quit();
	}

	if((*filebuf_ro = mmap(0, size, PROT_READ,
		MAP_PRIVATE | MAP_FILE, *ret_fd_ro, 0)) == MAP_FAILED) {
		e(5);
		quit();
	}

	if((*anonbuf = mmap(0, size, PROT_READ | PROT_WRITE,
		MAP_PRIVATE | MAP_ANON, -1, 0)) == MAP_FAILED) {
		e(6);
		quit();
	}

	/* unlink the files; the open fd's and mappings keep them alive */
	if(unlink(fn_rw) < 0) { e(12); }
	if(unlink(fn_ro) < 0) { e(12); }
}
|
|
|
|
|
|
|
|
static void forget_buffers(void *buf1, void *buf2, void *buf3, int fd1, int fd2, int size)
|
|
|
|
{
|
|
|
|
if(munmap(buf1, size) < 0) { e(1); }
|
|
|
|
if(munmap(buf2, size) < 0) { e(2); }
|
|
|
|
if(munmap(buf3, size) < 0) { e(2); }
|
|
|
|
if(fcntl(fd1, F_FLUSH_FS_CACHE) < 0) { e(3); }
|
|
|
|
if(fcntl(fd2, F_FLUSH_FS_CACHE) < 0) { e(3); }
|
|
|
|
if(close(fd1) < 0) { e(4); }
|
|
|
|
if(close(fd2) < 0) { e(4); }
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * The table of system call experiments.  Each operation is handed a
 * buffer (anonymous or file-mapped, writable or read-only), an open file
 * descriptor, and a 'writable' flag telling it whether the buffer can be
 * written to, so it can decide whether the call must succeed or fail
 * with EFAULT.
 */
#define NEXPERIMENTS 12
struct {
	void (*do_operation)(void * buf, int fd, int writable);
} experiments[NEXPERIMENTS] = {
	{ do_read },
	{ do_write },
	{ do_stat },
	{ do_getdents },
	{ do_readlink1 },
	{ do_readlink2 },
	{ do_symlink1 },
	{ do_symlink2 },
	{ do_open, },
	{ do_select1 },
	{ do_select2 },
	{ do_select3 },
};
|
|
|
|
|
2014-09-23 13:49:24 +02:00
|
|
|
static void test_memory_types_vs_operations(void)
|
2014-02-28 16:56:40 +01:00
|
|
|
{
|
|
|
|
#define NFDS 4
|
|
|
|
#define BUFSIZE (10 * PAGE_SIZE)
|
|
|
|
int exp, fds[NFDS];
|
|
|
|
int f = 0, size = BUFSIZE;
|
|
|
|
|
|
|
|
/* open some test fd's */
|
|
|
|
#define OPEN(fn, mode) { assert(f >= 0 && f < NFDS); \
|
|
|
|
fds[f] = open(fn, mode); if(fds[f] < 0) { e(2); } f++; }
|
|
|
|
OPEN("regular", O_RDWR | O_CREAT);
|
|
|
|
OPEN(".", O_RDONLY);
|
|
|
|
OPEN("/dev/ram", O_RDWR);
|
|
|
|
OPEN("/dev/zero", O_RDWR);
|
|
|
|
|
|
|
|
/* make sure the regular file has plenty of size to play with */
|
|
|
|
fillfile(fds[0], BUFSIZE);
|
|
|
|
|
|
|
|
/* and the ramdisk too */
|
|
|
|
if(ioctl(fds[2], MIOCRAMSIZE, &size) < 0) { e(3); }
|
|
|
|
|
|
|
|
for(exp = 0; exp < NEXPERIMENTS; exp++) {
|
|
|
|
for(f = 0; f < NFDS; f++) {
|
|
|
|
void *anonmem, *filemem_rw, *filemem_ro;
|
|
|
|
int buffd_rw, buffd_ro;
|
|
|
|
|
|
|
|
make_buffers(BUFSIZE, &buffd_rw, &buffd_ro,
|
|
|
|
&filemem_rw, &filemem_ro, &anonmem);
|
|
|
|
|
|
|
|
if(lseek(fds[f], 0, SEEK_SET) != 0) { e(10); }
|
|
|
|
experiments[exp].do_operation(anonmem, fds[f], 1);
|
|
|
|
|
|
|
|
if(lseek(fds[f], 0, SEEK_SET) != 0) { e(11); }
|
|
|
|
experiments[exp].do_operation(filemem_rw, fds[f], 1);
|
|
|
|
|
|
|
|
if(lseek(fds[f], 0, SEEK_SET) != 0) { e(12); }
|
|
|
|
experiments[exp].do_operation(filemem_ro, fds[f], 0);
|
|
|
|
|
|
|
|
forget_buffers(filemem_rw, filemem_ro, anonmem, buffd_rw, buffd_ro, BUFSIZE);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-09-23 13:49:24 +02:00
|
|
|
static void basic_regression(void)
|
2014-01-08 17:43:19 +01:00
|
|
|
{
|
2014-02-28 16:56:40 +01:00
|
|
|
int fd, fd1, fd2;
|
|
|
|
ssize_t rb, wr;
|
|
|
|
char buf[PAGE_SIZE*2];
|
|
|
|
void *block, *block1, *block2;
|
2014-01-08 17:43:19 +01:00
|
|
|
#define BLOCKSIZE (PAGE_SIZE*10)
|
2013-11-22 16:38:29 +01:00
|
|
|
block = mmap(0, BLOCKSIZE, PROT_READ | PROT_WRITE,
|
2014-01-08 17:43:19 +01:00
|
|
|
MAP_PRIVATE | MAP_ANON, -1, 0);
|
|
|
|
|
2014-02-28 16:56:40 +01:00
|
|
|
if(block == MAP_FAILED) { e(1); }
|
2014-01-08 17:43:19 +01:00
|
|
|
|
|
|
|
memset(block, 0, BLOCKSIZE);
|
|
|
|
|
|
|
|
/* shrink from bottom */
|
2013-11-22 16:38:29 +01:00
|
|
|
munmap(block, PAGE_SIZE);
|
2014-02-28 16:56:40 +01:00
|
|
|
|
|
|
|
/* Next test: use a system call write() to access a block of
|
|
|
|
* unavailable file-mapped memory.
|
|
|
|
*
|
|
|
|
* This is a thorny corner case to make succeed transparently
|
|
|
|
* because
|
|
|
|
* (1) it is a filesystem that is doing the memory access
|
|
|
|
* (copy from the constblock1 range in this process to the
|
|
|
|
* FS) but is also the FS needed to satisfy the range if it
|
|
|
|
* isn't in the cache.
|
|
|
|
* (2) there are two separate memory regions involved, requiring
|
|
|
|
* separate VFS requests from VM to properly satisfy, requiring
|
|
|
|
* some complex state to be kept.
|
|
|
|
*/
|
|
|
|
|
|
|
|
fd1 = open("../testsh1", O_RDONLY);
|
|
|
|
fd2 = open("../testsh2", O_RDONLY);
|
|
|
|
if(fd1 < 0 || fd2 < 0) { e(2); }
|
|
|
|
|
2014-03-17 15:53:28 +01:00
|
|
|
/* just check that we can't mmap() a file writable */
|
|
|
|
if(mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FILE, fd1, 0) != MAP_FAILED) {
|
|
|
|
e(1);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* check that we can mmap() a file MAP_SHARED readonly */
|
|
|
|
if(mmap(NULL, PAGE_SIZE, PROT_READ, MAP_SHARED | MAP_FILE, fd1, 0) == MAP_FAILED) {
|
|
|
|
e(1);
|
|
|
|
}
|
|
|
|
|
2014-02-28 16:56:40 +01:00
|
|
|
/* clear cache of files before mmap so pages won't be present already */
|
|
|
|
if(fcntl(fd1, F_FLUSH_FS_CACHE) < 0) { e(1); }
|
|
|
|
if(fcntl(fd2, F_FLUSH_FS_CACHE) < 0) { e(1); }
|
|
|
|
|
|
|
|
#define LOCATION1 (void *) 0x90000000
|
2014-09-23 13:49:24 +02:00
|
|
|
#define LOCATION2 ((void *)((char *)LOCATION1 + PAGE_SIZE))
|
2014-02-28 16:56:40 +01:00
|
|
|
block1 = mmap(LOCATION1, PAGE_SIZE, PROT_READ, MAP_PRIVATE | MAP_FILE, fd1, 0);
|
|
|
|
if(block1 == MAP_FAILED) { e(4); }
|
|
|
|
if(block1 != LOCATION1) { e(5); }
|
|
|
|
|
|
|
|
block2 = mmap(LOCATION2, PAGE_SIZE, PROT_READ, MAP_PRIVATE | MAP_FILE, fd2, 0);
|
|
|
|
if(block2 == MAP_FAILED) { e(10); }
|
|
|
|
if(block2 != LOCATION2) { e(11); }
|
|
|
|
|
|
|
|
unlink("testfile");
|
|
|
|
fd = open("testfile", O_CREAT | O_RDWR);
|
|
|
|
if(fd < 0) { e(15); }
|
|
|
|
|
|
|
|
/* write() using the mmap()ped memory as buffer */
|
|
|
|
|
|
|
|
if((wr=write(fd, LOCATION1, sizeof(buf))) != sizeof(buf)) {
|
|
|
|
fprintf(stderr, "wrote %zd bytes instead of %zd\n",
|
|
|
|
wr, sizeof(buf));
|
|
|
|
e(20);
|
|
|
|
quit();
|
|
|
|
}
|
|
|
|
|
|
|
|
/* verify written contents */
|
|
|
|
|
|
|
|
if((rb=pread(fd, buf, sizeof(buf), 0)) != sizeof(buf)) {
|
|
|
|
if(rb < 0) perror("pread");
|
|
|
|
fprintf(stderr, "wrote %zd bytes\n", wr);
|
|
|
|
fprintf(stderr, "read %zd bytes instead of %zd\n",
|
|
|
|
rb, sizeof(buf));
|
|
|
|
e(21);
|
|
|
|
quit();
|
|
|
|
}
|
|
|
|
|
|
|
|
if(memcmp(buf, LOCATION1, sizeof(buf))) {
|
|
|
|
e(22);
|
|
|
|
quit();
|
|
|
|
}
|
|
|
|
|
|
|
|
close(fd);
|
|
|
|
close(fd1);
|
|
|
|
close(fd2);
|
|
|
|
|
2014-01-08 17:43:19 +01:00
|
|
|
}
|
|
|
|
|
libfsdriver: support mmap on FSes with no device
This patch adds (very limited) support for memory-mapping pages on
file systems that are mounted on the special "none" device and that
do not implement PEEK support by themselves. This includes hgfs,
vbfs, and procfs.
The solution is implemented in libvtreefs, and consists of allocating
pages, filling them with content by calling the file system's READ
functionality, passing the pages to VM, and freeing them again. A new
VM flag is used to indicate that these pages should be mapped in only
once, and thus not cached beyond their single use. This prevents
stale data from getting mapped in without the involvement of the file
system, which would be problematic on file systems where file contents
may become outdated at any time. No VM caching means no sharing and
poor performance, but mmap no longer fails on these file systems.
Compared to a libc-based approach, this patch retains the on-demand
nature of mmap. Especially tail(1) is known to map in a large file
area only to use a small portion of it.
All file systems now need to be given permission for the SETCACHEPAGE
and CLEARCACHE calls to VM.
A very basic regression test is added to test74.
Change-Id: I17afc4cb97315b515cad1542521b98f293b6b559
2014-11-15 11:14:00 +01:00
|
|
|
/*
|
|
|
|
* Test mmap on none-dev file systems - file systems that do not have a buffer
|
|
|
|
* cache and therefore have to fake mmap support. We use procfs as target.
|
|
|
|
* The idea is that while we succeed in mapping in /proc/uptime, we also get
|
|
|
|
* a new uptime value every time we map in the page -- VM must not cache it.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
nonedev_regression(void)
|
|
|
|
{
|
2014-12-22 18:20:40 +01:00
|
|
|
int fd, fd2;
|
libfsdriver: support mmap on FSes with no device
This patch adds (very limited) support for memory-mapping pages on
file systems that are mounted on the special "none" device and that
do not implement PEEK support by themselves. This includes hgfs,
vbfs, and procfs.
The solution is implemented in libvtreefs, and consists of allocating
pages, filling them with content by calling the file system's READ
functionality, passing the pages to VM, and freeing them again. A new
VM flag is used to indicate that these pages should be mapped in only
once, and thus not cached beyond their single use. This prevents
stale data from getting mapped in without the involvement of the file
system, which would be problematic on file systems where file contents
may become outdated at any time. No VM caching means no sharing and
poor performance, but mmap no longer fails on these file systems.
Compared to a libc-based approach, this patch retains the on-demand
nature of mmap. Especially tail(1) is known to map in a large file
area only to use a small portion of it.
All file systems now need to be given permission for the SETCACHEPAGE
and CLEARCACHE calls to VM.
A very basic regression test is added to test74.
Change-Id: I17afc4cb97315b515cad1542521b98f293b6b559
2014-11-15 11:14:00 +01:00
|
|
|
char *buf;
|
|
|
|
unsigned long uptime1, uptime2, uptime3;
|
|
|
|
|
|
|
|
subtest++;
|
|
|
|
|
|
|
|
if ((fd = open(_PATH_PROC "uptime", O_RDONLY)) < 0) e(1);
|
|
|
|
|
|
|
|
buf = mmap(NULL, 4096, PROT_READ, MAP_PRIVATE | MAP_FILE, fd, 0);
|
|
|
|
if (buf == MAP_FAILED) e(2);
|
|
|
|
|
|
|
|
if (buf[4095] != 0) e(3);
|
|
|
|
|
|
|
|
if ((uptime1 = atoi(buf)) == 0) e(4);
|
|
|
|
|
|
|
|
if (munmap(buf, 4096) != 0) e(5);
|
|
|
|
|
|
|
|
sleep(2);
|
|
|
|
|
|
|
|
buf = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_FILE,
|
|
|
|
fd, 0);
|
|
|
|
if (buf == MAP_FAILED) e(6);
|
|
|
|
|
|
|
|
if (buf[4095] != 0) e(7);
|
|
|
|
|
|
|
|
if ((uptime2 = atoi(buf)) == 0) e(8);
|
|
|
|
|
|
|
|
if (uptime1 == uptime2) e(9);
|
|
|
|
|
|
|
|
if (munmap(buf, 4096) != 0) e(10);
|
|
|
|
|
|
|
|
sleep(2);
|
|
|
|
|
|
|
|
buf = mmap(NULL, 4096, PROT_READ, MAP_SHARED | MAP_FILE, fd, 0);
|
|
|
|
if (buf == MAP_FAILED) e(11);
|
|
|
|
|
|
|
|
if (buf[4095] != 0) e(12);
|
|
|
|
|
|
|
|
if ((uptime3 = atoi(buf)) == 0) e(13);
|
|
|
|
|
|
|
|
if (uptime1 == uptime3) e(14);
|
|
|
|
if (uptime2 == uptime3) e(15);
|
|
|
|
|
|
|
|
if (munmap(buf, 4096) != 0) e(16);
|
|
|
|
|
2014-12-22 18:20:40 +01:00
|
|
|
/* Also test page faults not incurred by the process itself. */
|
|
|
|
if ((fd2 = open("testfile", O_CREAT | O_TRUNC | O_WRONLY)) < 0) e(17);
|
|
|
|
|
|
|
|
if (unlink("testfile") != 0) e(18);
|
|
|
|
|
|
|
|
buf = mmap(NULL, 4096, PROT_READ, MAP_SHARED | MAP_FILE, fd, 0);
|
|
|
|
if (buf == MAP_FAILED) e(19);
|
|
|
|
|
|
|
|
if (write(fd2, buf, 10) != 10) e(20);
|
|
|
|
|
|
|
|
if (munmap(buf, 4096) != 0) e(21);
|
|
|
|
|
|
|
|
close(fd2);
|
libfsdriver: support mmap on FSes with no device
This patch adds (very limited) support for memory-mapping pages on
file systems that are mounted on the special "none" device and that
do not implement PEEK support by themselves. This includes hgfs,
vbfs, and procfs.
The solution is implemented in libvtreefs, and consists of allocating
pages, filling them with content by calling the file system's READ
functionality, passing the pages to VM, and freeing them again. A new
VM flag is used to indicate that these pages should be mapped in only
once, and thus not cached beyond their single use. This prevents
stale data from getting mapped in without the involvement of the file
system, which would be problematic on file systems where file contents
may become outdated at any time. No VM caching means no sharing and
poor performance, but mmap no longer fails on these file systems.
Compared to a libc-based approach, this patch retains the on-demand
nature of mmap. Especially tail(1) is known to map in a large file
area only to use a small portion of it.
All file systems now need to be given permission for the SETCACHEPAGE
and CLEARCACHE calls to VM.
A very basic regression test is added to test74.
Change-Id: I17afc4cb97315b515cad1542521b98f293b6b559
2014-11-15 11:14:00 +01:00
|
|
|
close(fd);
|
|
|
|
}
|
|
|
|
|
libminixfs/VM: fix memory-mapped file corruption
This patch employs one solution to resolve two independent but related
issues. Both issues are the result of one fundamental aspect of the
way VM's memory mapping works: VM uses its cache to map in blocks for
memory-mapped file regions, and for blocks already in the VM cache, VM
does not go to the file system before mapping them in. To preserve
consistency between the FS and VM caches, VM relies on being informed
about all updates to file contents through the block cache. The two
issues are both the result of VM not being properly informed about
such updates:
1. Once a file system provides libminixfs with an inode association
(inode number + inode offset) for a disk block, this association
is not broken until a new inode association is provided for it.
If a block is freed and reallocated as a metadata (non-inode)
block, its old association is maintained, and may be supplied to
VM's secondary cache. Due to reuse of inodes, it is possible
that the same inode association becomes valid for an actual file
block again. In that case, when that new file is memory-mapped,
under certain circumstances, VM may end up using the metadata
block to satisfy a page fault on the file, due to the stale inode
association. The result is a corrupted memory mapping, with the
application seeing data other than the current file contents
mapped in at the file block.
2. When a hole is created in a file, the underlying block is freed
from the device, but VM is not informed of this update, and thus,
if VM's cache contains the block with its previous inode
association, this block will remain there. As a result, if an
application subsequently memory-maps the file, VM will map in the
old block at the position of the hole, rather than an all-zeroes
block. Thus, again, the result is a corrupted memory mapping.
This patch resolves both issues by making the file system inform the
minixfs library about blocks being freed, so that libminixfs can
break the inode association for that block, both in its own cache and
in the VM cache. Since libminixfs does not know whether VM has the
block in its cache or not, it makes a call to VM for each block being
freed. Thus, this change introduces more calls to VM, but it solves
the correctness issues at hand; optimizations may be introduced
later. On the upside, all freed blocks are now marked as clean,
which should result in fewer blocks being written back to the device,
and the blocks are removed from the caches entirely, which should
result in slightly better cache usage.
This patch is necessary but not sufficient to resolve the situation
with respect to memory mapping of file holes in general. Therefore,
this patch extends test 74 with a (rather particular but effective)
test for the first issue, but not yet with a test for the second one.
This fixes #90.
Change-Id: Iad8b134d2f88a884f15d3fc303e463280749c467
2015-08-13 13:29:33 +02:00
|
|
|
/*
|
|
|
|
* Regression test for a nasty memory-mapped file corruption bug, which is not
|
|
|
|
* easy to reproduce but, before being solved, did occur in practice every once
|
|
|
|
* in a while. The executive summary is that through stale inode associations,
|
|
|
|
* VM could end up using an old block to satisfy a memory mapping.
|
|
|
|
*
|
|
|
|
* This subtest relies on a number of assumptions regarding allocation and
|
|
|
|
* reuse of inode numbers and blocks. These assumptions hold for MFS but
|
|
|
|
* possibly no other file system. However, if the subtest's assumptions are
|
|
|
|
* not met, it will simply succeed.
|
|
|
|
*/
|
|
|
|
/*
 * See the comment block above this function for the full scenario: a
 * stale <inode,offset> association in the VM cache could cause VM to map
 * in an old (metadata) block for a new file's data.  The step order below
 * is essential to reproducing the bug; do not reorder.
 */
static void
corruption_regression(void)
{
	char *ptr, *buf;
	struct statvfs sf;
	struct stat st;
	size_t block_size;
	off_t size;
	int fd, fd2;

	subtest = 1;

	if (statvfs(".", &sf) != 0) e(0);
	block_size = sf.f_bsize;

	if ((buf = malloc(block_size * 2)) == NULL) e(0);

	/*
	 * We first need a file that is just large enough that it requires the
	 * allocation of a metadata block - an indirect block - when more data
	 * is written to it.  This is fileA.  We keep it open throughout the
	 * test so we can unlink it immediately.
	 */
	if ((fd = open("fileA", O_CREAT | O_TRUNC | O_WRONLY, 0600)) == -1)
		e(0);
	if (unlink("fileA") != 0) e(0);

	/*
	 * Write to fileA until its next block requires the allocation of an
	 * additional metadata block - an indirect block.
	 */
	size = 0;
	memset(buf, 'A', block_size);
	do {
		/*
		 * Repeatedly write an extra block, until the file consists of
		 * more blocks than just the file data.
		 */
		if (write(fd, buf, block_size) != block_size) e(0);
		size += block_size;
		if (size >= block_size * 64) {
			/*
			 * It doesn't look like this is going to work.
			 * Skip this subtest altogether.
			 */
			if (close(fd) != 0) e(0);
			free(buf);

			return;
		}
		if (fstat(fd, &st) != 0) e(0);
		/* st_blocks is in 512-byte units, hence the "* 512" below */
	} while (st.st_blocks * 512 == size);

	/* Once we get there, go one step back by truncating by one block. */
	size -= block_size; /* for MFS, size will end up being 7*block_size */
	if (ftruncate(fd, size) != 0) e(0);

	/*
	 * Create a first file, fileB, and write two blocks to it.  FileB's
	 * blocks are going to end up in the secondary VM cache, associated to
	 * fileB's inode number (and two different offsets within the file).
	 * The block cache does not know about files getting deleted, so we can
	 * unlink fileB immediately after creating it.  So far so good.
	 */
	if ((fd2 = open("fileB", O_CREAT | O_TRUNC | O_WRONLY, 0600)) == -1)
		e(0);
	if (unlink("fileB") != 0) e(0);
	memset(buf, 'B', block_size * 2);
	if (write(fd2, buf, block_size * 2) != block_size * 2) e(0);
	if (close(fd2) != 0) e(0);

	/*
	 * Write one extra block to fileA, hoping that this causes allocation
	 * of a metadata block as well.  This is why we tried to get fileA to
	 * the point that one more block would also require the allocation of a
	 * metadata block.  Our intent is to recycle the blocks that we just
	 * allocated and freed for fileB.  As of writing, for the metadata
	 * block, this will *not* break the association with fileB's inode,
	 * which by itself is not a problem, yet crucial to reproducing
	 * the actual problem a bit later.  Note that the test does not rely on
	 * whether the file system allocates the data block or the metadata
	 * block first, although it does need reverse deallocation (see below).
	 */
	memset(buf, 'A', block_size);
	if (write(fd, buf, block_size) != block_size) e(0);

	/*
	 * Create a new file, fileC, which recycles the inode number of fileB,
	 * but uses two new blocks to store its data.  These new blocks will
	 * get associated to the fileB inode number, and one of them will
	 * thereby eclipse (but not remove) the association of fileA's metadata
	 * block to the inode of fileB.
	 */
	if ((fd2 = open("fileC", O_CREAT | O_TRUNC | O_WRONLY, 0600)) == -1)
		e(0);
	if (unlink("fileC") != 0) e(0);
	memset(buf, 'C', block_size * 2);
	if (write(fd2, buf, block_size * 2) != block_size * 2) e(0);
	if (close(fd2) != 0) e(0);

	/*
	 * Free up the extra fileA blocks for reallocation, in particular
	 * including the metadata block.  Again, this will not affect the
	 * contents of the VM cache in any way.  FileA's metadata block remains
	 * cached in VM, with the inode association for fileB's block.
	 */
	if (ftruncate(fd, size) != 0) e(0);

	/*
	 * Now create yet one more file, fileD, which also recycles the inode
	 * number of fileB and fileC.  Write two blocks to it; these blocks
	 * should recycle the blocks we just freed.  One of these is fileA's
	 * just-freed metadata block, for which the new inode association will
	 * be equal to the inode association it had already (as long as blocks
	 * are freed in reverse order of their allocation, which happens to be
	 * the case for MFS).  As a result, the block is not updated in the VM
	 * cache, and VM will therefore continue to see the inode association
	 * for the corresponding block of fileC which is still in the VM cache.
	 */
	if ((fd2 = open("fileD", O_CREAT | O_TRUNC | O_RDWR, 0600)) == -1)
		e(0);
	memset(buf, 'D', block_size * 2);
	if (write(fd2, buf, block_size * 2) != block_size * 2) e(0);

	ptr = mmap(NULL, block_size * 2, PROT_READ, MAP_FILE, fd2, 0);
	if (ptr == MAP_FAILED) e(0);

	/*
	 * Finally, we can test the issue.  Since fileC's block is still the
	 * block for which VM has the corresponding inode association, VM will
	 * now find and map in fileC's block, instead of fileD's block.  The
	 * result is that we get a memory-mapped area with stale contents,
	 * different from those of the underlying file.
	 */
	if (memcmp(buf, ptr, block_size * 2)) e(0);

	/* Clean up. */
	if (munmap(ptr, block_size * 2) != 0) e(0);

	if (close(fd2) != 0) e(0);
	if (unlink("fileD") != 0) e(0);

	if (close(fd) != 0) e(0);

	free(buf);
}
|
|
|
|
|
2015-08-13 14:23:06 +02:00
|
|
|
/*
|
|
|
|
* Test mmap on file holes. Holes are a tricky case with the current VM
|
|
|
|
* implementation. There are two main issues. First, whenever a file data
|
|
|
|
* block is freed, VM has to know about this, or it will later blindly map in
|
|
|
|
 * the old data. Thus, file systems explicitly tell VM (through libminixfs)
|
|
|
|
* whenever a block is freed, upon which VM cache forgets the block. Second,
|
|
|
|
* blocks are accessed primarily by a <dev,dev_off> pair and only additionally
|
|
|
|
* by a <ino,ino_off> pair. Holes have no meaningful value for the first pair,
|
|
|
|
* but do need to be registered in VM with the second pair, or accessing them
|
|
|
|
* will generate a segmentation fault. Thus, file systems explicitly tell VM
|
|
|
|
* (through libminixfs) when a hole is being peeked; libminixfs currently fakes
|
|
|
|
* a device offset to make this work.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
hole_regression(void)
|
|
|
|
{
|
|
|
|
struct statvfs st;
|
|
|
|
size_t block_size;
|
|
|
|
char *buf;
|
|
|
|
int fd;
|
|
|
|
|
|
|
|
if (statvfs(".", &st) < 0) e(1);
|
|
|
|
|
|
|
|
block_size = st.f_bsize;
|
|
|
|
|
|
|
|
if ((buf = malloc(block_size)) == NULL) e(2);
|
|
|
|
|
|
|
|
if ((fd = open("testfile", O_CREAT | O_TRUNC | O_RDWR)) < 0) e(3);
|
|
|
|
|
|
|
|
if (unlink("testfile") != 0) e(4);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We perform the test twice, in a not-so-perfect attempt to test the
|
|
|
|
* two aspects independently. The first part immediately creates a
|
|
|
|
* hole, and is supposed to fail only if reporting holes to VM does not
|
|
|
|
* work. However, it may also fail if a page for a previous file with
|
|
|
|
* the same inode number as "testfile" is still in the VM cache.
|
|
|
|
*/
|
|
|
|
memset(buf, 12, block_size);
|
|
|
|
|
|
|
|
if (write(fd, buf, block_size) != block_size) e(5);
|
|
|
|
|
|
|
|
if (lseek(fd, block_size * 2, SEEK_CUR) != block_size * 3) e(6);
|
|
|
|
|
|
|
|
memset(buf, 78, block_size);
|
|
|
|
|
|
|
|
if (write(fd, buf, block_size) != block_size) e(7);
|
|
|
|
|
|
|
|
free(buf);
|
|
|
|
|
|
|
|
if ((buf = mmap(NULL, 4 * block_size, PROT_READ, MAP_SHARED | MAP_FILE,
|
|
|
|
fd, 0)) == MAP_FAILED) e(8);
|
|
|
|
|
|
|
|
if (buf[0 * block_size] != 12 || buf[1 * block_size - 1] != 12) e(9);
|
|
|
|
if (buf[1 * block_size] != 0 || buf[2 * block_size - 1] != 0) e(10);
|
|
|
|
if (buf[2 * block_size] != 0 || buf[3 * block_size - 1] != 0) e(11);
|
|
|
|
if (buf[3 * block_size] != 78 || buf[4 * block_size - 1] != 78) e(12);
|
|
|
|
|
|
|
|
if (munmap(buf, 4 * block_size) != 0) e(13);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The second part first creates file content and only turns part of it
|
|
|
|
* into a file hole, thus ensuring that VM has previously cached pages
|
|
|
|
* for the blocks that are freed. The test will fail if VM keeps the
|
|
|
|
* pages around in its cache.
|
|
|
|
*/
|
|
|
|
if ((buf = malloc(block_size)) == NULL) e(14);
|
|
|
|
|
|
|
|
if (lseek(fd, block_size, SEEK_SET) != block_size) e(15);
|
|
|
|
|
|
|
|
memset(buf, 34, block_size);
|
|
|
|
|
|
|
|
if (write(fd, buf, block_size) != block_size) e(16);
|
|
|
|
|
|
|
|
memset(buf, 56, block_size);
|
|
|
|
|
|
|
|
if (write(fd, buf, block_size) != block_size) e(17);
|
|
|
|
|
|
|
|
if (ftruncate(fd, block_size) != 0) e(18);
|
|
|
|
|
|
|
|
if (lseek(fd, block_size * 3, SEEK_SET) != block_size * 3) e(19);
|
|
|
|
|
|
|
|
memset(buf, 78, block_size);
|
|
|
|
|
|
|
|
if (write(fd, buf, block_size) != block_size) e(20);
|
|
|
|
|
|
|
|
free(buf);
|
|
|
|
|
|
|
|
if ((buf = mmap(NULL, 4 * block_size, PROT_READ, MAP_SHARED | MAP_FILE,
|
|
|
|
fd, 0)) == MAP_FAILED) e(21);
|
|
|
|
|
|
|
|
if (buf[0 * block_size] != 12 || buf[1 * block_size - 1] != 12) e(22);
|
|
|
|
if (buf[1 * block_size] != 0 || buf[2 * block_size - 1] != 0) e(23);
|
|
|
|
if (buf[2 * block_size] != 0 || buf[3 * block_size - 1] != 0) e(24);
|
|
|
|
if (buf[3 * block_size] != 78 || buf[4 * block_size - 1] != 78) e(25);
|
|
|
|
|
|
|
|
if (munmap(buf, 4 * block_size) != 0) e(26);
|
|
|
|
|
|
|
|
close(fd);
|
|
|
|
}
|
|
|
|
|
2013-05-07 14:36:09 +02:00
|
|
|
int
|
|
|
|
main(int argc, char *argv[])
|
|
|
|
{
|
libminixfs/VM: fix memory-mapped file corruption
This patch employs one solution to resolve two independent but related
issues. Both issues are the result of one fundamental aspect of the
way VM's memory mapping works: VM uses its cache to map in blocks for
memory-mapped file regions, and for blocks already in the VM cache, VM
does not go to the file system before mapping them in. To preserve
consistency between the FS and VM caches, VM relies on being informed
about all updates to file contents through the block cache. The two
issues are both the result of VM not being properly informed about
such updates:
1. Once a file system provides libminixfs with an inode association
(inode number + inode offset) for a disk block, this association
is not broken until a new inode association is provided for it.
If a block is freed and reallocated as a metadata (non-inode)
block, its old association is maintained, and may be supplied to
VM's secondary cache. Due to reuse of inodes, it is possible
that the same inode association becomes valid for an actual file
block again. In that case, when that new file is memory-mapped,
under certain circumstances, VM may end up using the metadata
block to satisfy a page fault on the file, due to the stale inode
association. The result is a corrupted memory mapping, with the
application seeing data other than the current file contents
mapped in at the file block.
2. When a hole is created in a file, the underlying block is freed
from the device, but VM is not informed of this update, and thus,
if VM's cache contains the block with its previous inode
association, this block will remain there. As a result, if an
application subsequently memory-maps the file, VM will map in the
old block at the position of the hole, rather than an all-zeroes
block. Thus, again, the result is a corrupted memory mapping.
This patch resolves both issues by making the file system inform the
minixfs library about blocks being freed, so that libminixfs can
break the inode association for that block, both in its own cache and
in the VM cache. Since libminixfs does not know whether VM has the
block in its cache or not, it makes a call to VM for each block being
freed. Thus, this change introduces more calls to VM, but it solves
the correctness issues at hand; optimizations may be introduced
later. On the upside, all freed blocks are now marked as clean,
which should result in fewer blocks being written back to the device,
and the blocks are removed from the caches entirely, which should
result in slightly better cache usage.
This patch is necessary but not sufficient to resolve the situation
with respect to memory mapping of file holes in general. Therefore,
this patch extends test 74 with a (rather particular but effective)
test for the first issue, but not yet with a test for the second one.
This fixes #90.
Change-Id: Iad8b134d2f88a884f15d3fc303e463280749c467
2015-08-13 13:29:33 +02:00
|
|
|
int i, iter = 2;
|
2013-05-07 14:36:09 +02:00
|
|
|
|
|
|
|
start(74);
|
|
|
|
|
2014-01-08 17:43:19 +01:00
|
|
|
basic_regression();
|
|
|
|
|
libfsdriver: support mmap on FSes with no device
This patch adds (very limited) support for memory-mapping pages on
file systems that are mounted on the special "none" device and that
do not implement PEEK support by themselves. This includes hgfs,
vbfs, and procfs.
The solution is implemented in libvtreefs, and consists of allocating
pages, filling them with content by calling the file system's READ
functionality, passing the pages to VM, and freeing them again. A new
VM flag is used to indicate that these pages should be mapped in only
once, and thus not cached beyond their single use. This prevents
stale data from getting mapped in without the involvement of the file
system, which would be problematic on file systems where file contents
may become outdated at any time. No VM caching means no sharing and
poor performance, but mmap no longer fails on these file systems.
Compared to a libc-based approach, this patch retains the on-demand
nature of mmap. Especially tail(1) is known to map in a large file
area only to use a small portion of it.
All file systems now need to be given permission for the SETCACHEPAGE
and CLEARCACHE calls to VM.
A very basic regression test is added to test74.
Change-Id: I17afc4cb97315b515cad1542521b98f293b6b559
2014-11-15 11:14:00 +01:00
|
|
|
nonedev_regression();
|
|
|
|
|
libminixfs/VM: fix memory-mapped file corruption
This patch employs one solution to resolve two independent but related
issues. Both issues are the result of one fundamental aspect of the
way VM's memory mapping works: VM uses its cache to map in blocks for
memory-mapped file regions, and for blocks already in the VM cache, VM
does not go to the file system before mapping them in. To preserve
consistency between the FS and VM caches, VM relies on being informed
about all updates to file contents through the block cache. The two
issues are both the result of VM not being properly informed about
such updates:
1. Once a file system provides libminixfs with an inode association
(inode number + inode offset) for a disk block, this association
is not broken until a new inode association is provided for it.
If a block is freed and reallocated as a metadata (non-inode)
block, its old association is maintained, and may be supplied to
VM's secondary cache. Due to reuse of inodes, it is possible
that the same inode association becomes valid for an actual file
block again. In that case, when that new file is memory-mapped,
under certain circumstances, VM may end up using the metadata
block to satisfy a page fault on the file, due to the stale inode
association. The result is a corrupted memory mapping, with the
application seeing data other than the current file contents
mapped in at the file block.
2. When a hole is created in a file, the underlying block is freed
from the device, but VM is not informed of this update, and thus,
if VM's cache contains the block with its previous inode
association, this block will remain there. As a result, if an
application subsequently memory-maps the file, VM will map in the
old block at the position of the hole, rather than an all-zeroes
block. Thus, again, the result is a corrupted memory mapping.
This patch resolves both issues by making the file system inform the
minixfs library about blocks being freed, so that libminixfs can
break the inode association for that block, both in its own cache and
in the VM cache. Since libminixfs does not know whether VM has the
block in its cache or not, it makes a call to VM for each block being
freed. Thus, this change introduces more calls to VM, but it solves
the correctness issues at hand; optimizations may be introduced
later. On the upside, all freed blocks are now marked as clean,
which should result in fewer blocks being written back to the device,
and the blocks are removed from the caches entirely, which should
result in slightly better cache usage.
This patch is necessary but not sufficient to resolve the situation
with respect to memory mapping of file holes in general. Therefore,
this patch extends test 74 with a (rather particular but effective)
test for the first issue, but not yet with a test for the second one.
This fixes #90.
Change-Id: Iad8b134d2f88a884f15d3fc303e463280749c467
2015-08-13 13:29:33 +02:00
|
|
|
/*
|
|
|
|
* Any inode or block allocation happening concurrently with this
|
|
|
|
* subtest will make the subtest succeed without testing the actual
|
|
|
|
* issue. Thus, repeat the subtest a fair number of times.
|
|
|
|
*/
|
|
|
|
for (i = 0; i < 10; i++)
|
|
|
|
corruption_regression();
|
|
|
|
|
2015-08-13 14:23:06 +02:00
|
|
|
hole_regression();
|
|
|
|
|
2014-02-28 16:56:40 +01:00
|
|
|
test_memory_types_vs_operations();
|
|
|
|
|
2013-05-07 14:36:09 +02:00
|
|
|
makefiles(MAXFILES);
|
|
|
|
|
|
|
|
cachequiet(!bigflag);
|
|
|
|
if(bigflag) iter = 3;
|
|
|
|
|
|
|
|
/* Try various combinations working set sizes
|
|
|
|
* and block sizes in order to specifically
|
|
|
|
* target the primary cache, then primary+secondary
|
|
|
|
* cache, then primary+secondary cache+secondary
|
|
|
|
* cache eviction.
|
|
|
|
*/
|
|
|
|
|
|
|
|
if(dotest(PAGE_SIZE, 100, iter)) e(5);
|
|
|
|
if(dotest(PAGE_SIZE*2, 100, iter)) e(2);
|
|
|
|
if(dotest(PAGE_SIZE*3, 100, iter)) e(3);
|
|
|
|
if(dotest(PAGE_SIZE, 20000, iter)) e(5);
|
|
|
|
|
|
|
|
if(bigflag) {
|
|
|
|
u32_t totalmem, freemem, cachedmem;
|
|
|
|
if(dotest(PAGE_SIZE, 150000, iter)) e(5);
|
|
|
|
getmem(&totalmem, &freemem, &cachedmem);
|
|
|
|
if(dotest(PAGE_SIZE, totalmem*1.5, iter)) e(6);
|
|
|
|
}
|
|
|
|
|
|
|
|
quit();
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|