minix/servers/vfs/read.c

377 lines
10 KiB
C
Raw Normal View History

/* This file contains the heart of the mechanism used to read (and write)
* files. Read and write requests are split up into chunks that do not cross
* block boundaries. Each chunk is then processed in turn. Reads on special
* files are also detected and handled.
*
* The entry points into this file are
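* do_read: perform the READ system call by calling do_read_write_peek
* do_read_write_peek: run a read, write or peek on behalf of the process in fp
* actual_read_write_peek: look up and lock the filp, then call read_write
* do_getdents: read entries from a directory (GETDENTS)
* read_write: actually do the work of READ, WRITE and PEEK
* rw_pipe: perform a read or write on a pipe
* lock_bsf, unlock_bsf, check_bsf_lock: block special file lock helpers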
*
*/
#include "fs.h"
#include <minix/callnr.h>
#include <minix/com.h>
#include <minix/u64.h>
#include <minix/vfsif.h>
#include <assert.h>
#include <sys/dirent.h>
#include <fcntl.h>
#include <unistd.h>
#include "file.h"
#include "scratchpad.h"
#include "vnode.h"
#include "vmnt.h"
/*===========================================================================*
* do_read *
*===========================================================================*/
int do_read(void)
{
/* Perform the READ system call. The file descriptor, buffer address and
 * byte count are taken from the request message (job_m_in) and passed on to
 * do_read_write_peek() with the READING flag; its result is returned as is.
 */
  int fd = job_m_in.VFS_READWRITE_FD;
  char *buf = job_m_in.VFS_READWRITE_BUF;
  size_t len = (size_t) job_m_in.VFS_READWRITE_LEN;

  return(do_read_write_peek(READING, fd, buf, len));
}
/*===========================================================================*
2012-02-13 16:28:04 +01:00
* lock_bsf *
*===========================================================================*/
2012-03-25 20:25:53 +02:00
void lock_bsf(void)
2012-02-13 16:28:04 +01:00
{
struct worker_thread *org_self;
if (mutex_trylock(&bsf_lock) == 0)
return;
VFS: worker thread model overhaul The main purpose of this patch is to fix handling of unpause calls from PM while another call is ongoing. The solution to this problem sparked a full revision of the threading model, consisting of a large number of related changes: - all active worker threads are now always associated with a process, and every process has at most one active thread working for it; - the process lock is always held by a process's worker thread; - a process can now have both normal work and postponed PM work associated to it; - timer expiry and non-postponed PM work is done from the main thread; - filp garbage collection is done from a thread associated with VFS; - reboot calls from PM are now done from a thread associated with PM; - the DS events handler is protected from starting multiple threads; - support for a system worker thread has been removed; - the deadlock recovery thread has been replaced by a parameter to the worker_start() function; the number of worker threads has consequently been increased by one; - saving and restoring of global but per-thread variables is now centralized in worker_suspend() and worker_resume(); err_code is now saved and restored in all cases; - the concept of jobs has been removed, and job_m_in now points to a message stored in the worker thread structure instead; - the PM lock has been removed; - the separate exec lock has been replaced by a lock on the VM process, which was already being locked for exec calls anyway; - PM_UNPAUSE is now processed as a postponed PM request, from a thread associated with the target process; - the FP_DROP_WORK flag has been removed, since it is no longer more than just an optimization and only applied to processes operating on a pipe when getting killed; - assignment to "fp" now takes place only when obtaining new work in the main thread or a worker thread, when resuming execution of a thread, and in the special case of exiting processes during reboot; - there are no longer special 
cases where the yield() call is used to force a thread to run. Change-Id: I7a97b9b95c2450454a9b5318dfa0e6150d4e6858
2013-08-30 14:00:50 +02:00
org_self = worker_suspend();
2012-02-13 16:28:04 +01:00
if (mutex_lock(&bsf_lock) != 0)
panic("unable to lock block special file lock");
VFS: worker thread model overhaul The main purpose of this patch is to fix handling of unpause calls from PM while another call is ongoing. The solution to this problem sparked a full revision of the threading model, consisting of a large number of related changes: - all active worker threads are now always associated with a process, and every process has at most one active thread working for it; - the process lock is always held by a process's worker thread; - a process can now have both normal work and postponed PM work associated to it; - timer expiry and non-postponed PM work is done from the main thread; - filp garbage collection is done from a thread associated with VFS; - reboot calls from PM are now done from a thread associated with PM; - the DS events handler is protected from starting multiple threads; - support for a system worker thread has been removed; - the deadlock recovery thread has been replaced by a parameter to the worker_start() function; the number of worker threads has consequently been increased by one; - saving and restoring of global but per-thread variables is now centralized in worker_suspend() and worker_resume(); err_code is now saved and restored in all cases; - the concept of jobs has been removed, and job_m_in now points to a message stored in the worker thread structure instead; - the PM lock has been removed; - the separate exec lock has been replaced by a lock on the VM process, which was already being locked for exec calls anyway; - PM_UNPAUSE is now processed as a postponed PM request, from a thread associated with the target process; - the FP_DROP_WORK flag has been removed, since it is no longer more than just an optimization and only applied to processes operating on a pipe when getting killed; - assignment to "fp" now takes place only when obtaining new work in the main thread or a worker thread, when resuming execution of a thread, and in the special case of exiting processes during reboot; - there are no longer special 
cases where the yield() call is used to force a thread to run. Change-Id: I7a97b9b95c2450454a9b5318dfa0e6150d4e6858
2013-08-30 14:00:50 +02:00
worker_resume(org_self);
2012-02-13 16:28:04 +01:00
}
/*===========================================================================*
* unlock_bsf *
*===========================================================================*/
2012-03-25 20:25:53 +02:00
void unlock_bsf(void)
2012-02-13 16:28:04 +01:00
{
if (mutex_unlock(&bsf_lock) != 0)
panic("failed to unlock block special file lock");
}
/*===========================================================================*
 *				check_bsf_lock				     *
 *===========================================================================*/
void check_bsf_lock(void)
{
/* Sanity check on the block special file lock: try to take bsf_lock without
 * blocking. A result of -EBUSY or any other nonzero value causes a panic.
 * When the attempt succeeds, the lock is released again right away.
 */
  int status = mutex_trylock(&bsf_lock);

  if (status != 0) {
	if (status == -EBUSY)
		panic("bsf_lock locked");
	else
		panic("bsf_lock weird state");
  }

  /* status == 0: we took the lock ourselves, so give it back. */
  unlock_bsf();
}
/*===========================================================================*
* actual_read_write_peek *
2012-02-13 16:28:04 +01:00
*===========================================================================*/
int actual_read_write_peek(struct fproc *rfp, int rw_flag, int io_fd,
char *io_buf, size_t io_nbytes)
{
/* Perform read(fd, buffer, nbytes) or write(fd, buffer, nbytes) call. */
2012-02-13 16:28:04 +01:00
struct filp *f;
tll_access_t locktype;
int r;
int ro = 1;
2012-02-13 16:28:04 +01:00
if(rw_flag == WRITING) ro = 0;
scratch(rfp).file.fd_nr = io_fd;
scratch(rfp).io.io_buffer = io_buf;
scratch(rfp).io.io_nbytes = io_nbytes;
locktype = rw_flag == WRITING ? VNODE_WRITE : VNODE_READ;
if ((f = get_filp2(rfp, scratch(rfp).file.fd_nr, locktype)) == NULL)
2012-02-13 16:28:04 +01:00
return(err_code);
assert(f->filp_count > 0);
if (((f->filp_mode) & (ro ? R_BIT : W_BIT)) == 0) {
2012-02-13 16:28:04 +01:00
unlock_filp(f);
return(EBADF);
}
if (scratch(rfp).io.io_nbytes == 0) {
2012-02-13 16:28:04 +01:00
unlock_filp(f);
return(0); /* so char special files need not check for 0*/
2012-02-13 16:28:04 +01:00
}
r = read_write(rfp, rw_flag, f, scratch(rfp).io.io_buffer,
scratch(rfp).io.io_nbytes, who_e);
2012-02-13 16:28:04 +01:00
unlock_filp(f);
return(r);
}
/*===========================================================================*
 *				do_read_write_peek			     *
 *===========================================================================*/
int do_read_write_peek(int rw_flag, int io_fd, char *io_buf, size_t io_nbytes)
{
/* Run actual_read_write_peek() on behalf of the process in the global 'fp'
 * and return its result unchanged.
 */
  int result;

  result = actual_read_write_peek(fp, rw_flag, io_fd, io_buf, io_nbytes);
  return result;
}
/*===========================================================================*
 *				read_write				     *
 *===========================================================================*/
int read_write(struct fproc *rfp, int rw_flag, struct filp *f,
	char *buf, size_t size, endpoint_t for_e)
{
/* Carry out a single READING, WRITING or PEEKING request on filp 'f'.
 *
 *   rfp     process whose fp_cum_io_partial is checked and which receives
 *           SIGPIPE on a write that fails with EPIPE
 *   rw_flag READING, WRITING or PEEKING
 *   f       the open file; its filp_pos is updated with the new position
 *   buf     buffer address handed to the pipe/device/file system request
 *   size    number of bytes requested; more than SSIZE_MAX yields EINVAL
 *   for_e   endpoint passed along with the I/O request
 *
 * The request is dispatched on the type of the vnode: pipe, character
 * special file, block special file, or anything else (treated as a regular
 * file). PEEKING on a pipe or character device is rejected with EINVAL.
 *
 * Returns the number of bytes transferred when the result code is OK, and
 * the result code itself otherwise. For pipes this is whatever rw_pipe()
 * returned.
 */
  struct vnode *vp = f->filp_vno;
  off_t pos = f->filp_pos;
  off_t res_pos;
  unsigned int total = 0, fs_incr, res_cum_io;
  int r = OK;
  int op;
  dev_t dev;

  assert(rw_flag == READING || rw_flag == WRITING || rw_flag == PEEKING);

  if (size > SSIZE_MAX) return(EINVAL);

  if (S_ISFIFO(vp->v_mode)) {		/* Pipes */
	if (rfp->fp_cum_io_partial != 0)
		panic("VFS: read_write: fp_cum_io_partial not clear");

	if (rw_flag == PEEKING) {
		printf("read_write: peek on pipe makes no sense\n");
		return EINVAL;
	}

	r = rw_pipe(rw_flag, for_e, f, buf, size);
  } else if (S_ISCHR(vp->v_mode)) {	/* Character special files. */
	if (rw_flag == PEEKING) {
		printf("read_write: peek on char device makes no sense\n");
		return EINVAL;
	}

	if (vp->v_sdev == NO_DEV)
		panic("VFS: read_write tries to access char dev NO_DEV");

	/* Only character devices need a device-level opcode. */
	op = (rw_flag == READING) ? CDEV_READ : CDEV_WRITE;
	dev = (dev_t) vp->v_sdev;

	r = cdev_io(op, dev, for_e, buf, pos, size, f->filp_flags);

	if (r >= 0) {
		/* This should no longer happen: all calls are asynchronous. */
		printf("VFS: I/O to device %x succeeded immediately!?\n", dev);
		total = r;
		pos += r;
		r = OK;
	} else if (r == SUSPEND) {
		/* FIXME: multiple read/write operations on a single filp
		 * should be serialized. They currently aren't; in order to
		 * achieve a similar effect, we optimistically advance the file
		 * position here. This works under the following assumptions:
		 * - character drivers that use the seek position at all,
		 *   expose a view of a statically-sized range of bytes, i.e.,
		 *   they are basically byte-granular block devices;
		 * - if short I/O or an error is returned, all subsequent calls
		 *   will return (respectively) EOF and an error;
		 * - the application never checks its own file seek position,
		 *   or does not care that it may end up having seeked beyond
		 *   the number of bytes it has actually read;
		 * - communication to the character driver is FIFO (this one
		 *   is actually true! whew).
		 * Many improvements are possible here, but in the end,
		 * anything short of queuing concurrent operations will be
		 * suboptimal - so we settle for this hack for now.
		 */
		pos += size;
	}
  } else if (S_ISBLK(vp->v_mode)) {	/* Block special files. */
	if (vp->v_sdev == NO_DEV)
		panic("VFS: read_write tries to access block dev NO_DEV");

	/* The request is issued while holding the block special file lock. */
	lock_bsf();

	if (rw_flag == PEEKING) {
		r = req_bpeek(vp->v_bfs_e, vp->v_sdev, pos, size);
	} else {
		r = req_breadwrite(vp->v_bfs_e, for_e, vp->v_sdev, pos,
			size, (vir_bytes) buf, rw_flag, &res_pos, &res_cum_io);
		if (r == OK) {
			pos = res_pos;
			total += res_cum_io;
		}
	}

	unlock_bsf();
  } else {				/* Regular files */
	/* With O_APPEND, every write starts at the current end of file. */
	if (rw_flag == WRITING && (f->filp_flags & O_APPEND))
		pos = vp->v_size;

	/* Issue request */
	if (rw_flag == PEEKING) {
		r = req_peek(vp->v_fs_e, vp->v_inode_nr, pos, size);
	} else {
		off_t fs_pos;

		r = req_readwrite(vp->v_fs_e, vp->v_inode_nr, pos,
			rw_flag, for_e, (vir_bytes) buf, size, &fs_pos,
			&fs_incr);
		if (r >= 0) {
			pos = fs_pos;
			total += fs_incr;
		}
	}
  }

  /* On write, grow the cached size of regular files and directories when the
   * new position lies beyond it.
   */
  if (rw_flag == WRITING &&
      (S_ISREG(vp->v_mode) || S_ISDIR(vp->v_mode)) &&
      pos > vp->v_size) {
	vp->v_size = pos;
  }

  f->filp_pos = pos;

  /* Process is writing, but there is no reader. Tell the kernel to generate
   * a SIGPIPE signal, unless the filp has O_NOSIGPIPE set.
   */
  if (r == EPIPE && rw_flag == WRITING && !(f->filp_flags & O_NOSIGPIPE))
	sys_kill(rfp->fp_endpoint, SIGPIPE);

  if (r != OK)
	return(r);

  return(total);
}
/*===========================================================================*
* do_getdents *
*===========================================================================*/
int do_getdents(void)
{
/* Perform the getdents(fd, buf, size) system call. */
int r = OK;
off_t new_pos;
register struct filp *rfilp;
scratch(fp).file.fd_nr = job_m_in.VFS_READWRITE_FD;
scratch(fp).io.io_buffer = job_m_in.VFS_READWRITE_BUF;
scratch(fp).io.io_nbytes = (size_t) job_m_in.VFS_READWRITE_LEN;
/* Is the file descriptor valid? */
if ( (rfilp = get_filp(scratch(fp).file.fd_nr, VNODE_READ)) == NULL)
return(err_code);
2012-02-13 16:28:04 +01:00
if (!(rfilp->filp_mode & R_BIT))
2012-02-13 16:28:04 +01:00
r = EBADF;
2012-04-25 14:44:42 +02:00
else if (!S_ISDIR(rfilp->filp_vno->v_mode))
2012-02-13 16:28:04 +01:00
r = EBADF;
2012-02-13 16:28:04 +01:00
if (r == OK) {
r = req_getdents(rfilp->filp_vno->v_fs_e, rfilp->filp_vno->v_inode_nr,
rfilp->filp_pos, scratch(fp).io.io_buffer,
scratch(fp).io.io_nbytes, &new_pos, 0);
2012-02-13 16:28:04 +01:00
if (r > 0) rfilp->filp_pos = new_pos;
}
2012-02-13 16:28:04 +01:00
unlock_filp(rfilp);
return(r);
}
2007-08-07 14:52:47 +02:00
/*===========================================================================*
 *				rw_pipe					     *
 *===========================================================================*/
int rw_pipe(int rw_flag, endpoint_t usr_e, struct filp *f, char *buf,
	size_t req_size)
{
/* Read from or write to the pipe behind filp 'f'.
 *
 *   rw_flag   READING or WRITING
 *   usr_e     endpoint passed along with the file system request
 *   f         the pipe's filp; both the filp and its vnode must be locked
 *   buf       buffer address for the transfer
 *   req_size  number of bytes requested
 *
 * pipe_check() decides how many bytes may be transferred right now. A
 * result <= 0 is returned to the caller as is, after suspending the call
 * when that result is SUSPEND. Otherwise the transfer is issued to the
 * file system that the pipe is mapped on, and the vnode size is set to
 * the position it reports.
 *
 * Returns the cumulative number of bytes transferred, SUSPEND when a partial
 * blocking transfer has been suspended, or the error from req_readwrite().
 *
 * NOTE(review): the partial I/O count lives in the global 'fp', whereas
 * read_write() checks rfp->fp_cum_io_partial - presumably these are the same
 * process; confirm against the callers.
 */
  int r, oflags, partial_pipe = 0;
  /* NOTE(review): cum_io_incr is a size_t here, but read_write() passes an
   * unsigned int for the same req_readwrite() argument - confirm against the
   * prototype.
   */
  size_t size, cum_io, cum_io_incr;
  struct vnode *vp;
  off_t position, new_pos;

  /* Must make sure we're operating on locked filp and vnode */
  assert(tll_locked_by_me(&f->filp_vno->v_lock));
  assert(mutex_trylock(&f->filp_lock) == -EDEADLK);

  oflags = f->filp_flags;
  vp = f->filp_vno;
  position = 0;	/* Not actually used */

  assert(rw_flag == READING || rw_flag == WRITING);

  /* fp->fp_cum_io_partial is only nonzero when doing partial writes */
  cum_io = fp->fp_cum_io_partial;

  r = pipe_check(f, rw_flag, oflags, req_size, 0);
  if (r <= 0) {
	if (r == SUSPEND) pipe_suspend(f, buf, req_size);
	return(r);
  }

  size = r;
  if (size < req_size) partial_pipe = 1;

  /* Truncate read request at size. */
  if (rw_flag == READING && size > vp->v_size) {
	size = vp->v_size;
  }

  if (vp->v_mapfs_e == 0)
	panic("unmapped pipe");

  r = req_readwrite(vp->v_mapfs_e, vp->v_mapinode_nr, position, rw_flag, usr_e,
	(vir_bytes) buf, size, &new_pos, &cum_io_incr);
  if (r != OK) {
	return(r);
  }

  /* Account for what was transferred and step past it in the request. */
  cum_io += cum_io_incr;
  buf += cum_io_incr;
  req_size -= cum_io_incr;

  vp->v_size = new_pos;

  if (partial_pipe) {
	/* Only part of the request could be handled. With O_NONBLOCK the
	 * count so far is simply returned below. Without it, remember the
	 * progress made and suspend for the remainder.
	 */
	if (!(oflags & O_NONBLOCK)) {
		/* partial write on pipe with req_size > PIPE_SIZE,
		 * non-atomic
		 */
		fp->fp_cum_io_partial = cum_io;
		pipe_suspend(f, buf, req_size);
		return(SUSPEND);
	}
  }

  fp->fp_cum_io_partial = 0;

  return(cum_io);
}