minix/kernel/system/do_safecopy.c
Ben Gras 50e2064049 No more intel/minix segments.
This commit removes all traces of Minix segments (the text/data/stack
memory map abstraction in the kernel) and significance of Intel segments
(hardware segments like CS, DS that add offsets to all addressing before
page table translation). This ultimately simplifies the memory layout
and addressing and makes the same layout possible on non-Intel
architectures.

There are only two types of addresses in the world now: virtual
and physical; even the kernel and processes have the same virtual
address space. Kernel and user processes can be distinguished at a
glance as processes won't use 0xF0000000 and above.

No static pre-allocated memory sizes exist any more.

Changes to booting:
        . The pre_init.c leaves the kernel and modules exactly as
          they were left by the bootloader in physical memory
        . The kernel starts running using physical addressing,
          loaded at a fixed location given in its linker script by the
          bootloader.  All code and data in this phase are linked to
          this fixed low location.
        . It makes a bootstrap pagetable to map itself to a
          fixed high location (also in linker script) and jumps to
          the high address. All code and data then use this high addressing.
        . All code/data symbols linked at the low addresses is prefixed by
          an objcopy step with __k_unpaged_*, so that that code cannot
          reference highly-linked symbols (which aren't valid yet) or vice
          versa (symbols that aren't valid any more).
        . The two addressing modes are separated in the linker script by
          collecting the unpaged_*.o objects and linking them with low
          addresses, and linking the rest high. Some objects are linked
          twice, once low and once high.
        . The bootstrap phase passes a lot of information (e.g. free memory
          list, physical location of the modules, etc.) using the kinfo
          struct.
        . After this bootstrap the low-linked part is freed.
        . The kernel maps in VM into the bootstrap page table so that VM can
          begin executing. Its first job is to make page tables for all other
          boot processes. So VM runs before RS, and RS gets a fully dynamic,
          VM-managed address space. VM gets its privilege info from RS as usual
          but that happens after RS starts running.
        . Both the kernel loading VM and VM organizing boot processes happen
	  using the libexec logic. This removes the last reason for VM to
	  still know much about exec() and vm/exec.c is gone.

Further Implementation:
        . All segments are based at 0 and have a 4 GB limit.
        . The kernel is mapped in at the top of the virtual address
          space so as not to constrain the user processes.
        . Processes do not use segments from the LDT at all; there are
          no segments in the LDT any more, so no LLDT is needed.
        . The Minix segments T/D/S are gone and so none of the
          user-space or in-kernel copy functions use them. The copy
          functions use a process endpoint of NONE to realize it's
          a physical address, virtual otherwise.
        . The umap call only makes sense to translate a virtual address
          to a physical address now.
        . Segments-related calls like newmap and alloc_segments are gone.
        . All segments-related translation in VM is gone (vir2map etc).
        . Initialization in VM is simpler as no moving around is necessary.
        . VM and all other boot processes can be linked wherever they wish
          and will be mapped in at the right location by the kernel and VM
          respectively.

Other changes:
        . The multiboot code is less special: it does not use mb_print
          for its diagnostics any more but uses printf() as normal, saving
          the output into the diagnostics buffer, only printing to the
          screen using the direct print functions if a panic() occurs.
        . The multiboot code uses the flexible 'free memory map list'
          style to receive the list of free memory if available.
        . The kernel determines the memory layout of the processes to
          a degree: it tells VM where the kernel starts and ends and
          where the kernel wants the top of the process to be. VM then
          uses this entire range, i.e. the stack is right at the top,
          and mmap()ped bits of memory are placed below that downwards,
          and the break grows upwards.

Other Consequences:
        . Every process gets its own page table as address spaces
          can't be separated any more by segments.
        . As all segments are 0-based, there is no distinction between
          virtual and linear addresses, nor between userspace and
          kernel addresses.
        . Less work is done when context switching, leading to a net
          performance increase. (8% faster on my machine for 'make servers'.)
	. The layout and configuration of the GDT makes sysenter and syscall
	  possible.
2012-07-15 22:30:15 +02:00

421 lines
12 KiB
C

/* The kernel call implemented in this file:
* m_type: SYS_SAFECOPYFROM or SYS_SAFECOPYTO or SYS_VSAFECOPY
*
* The parameters for this kernel call are:
* SCP_FROM_TO other endpoint
* SCP_GID grant id
* SCP_OFFSET offset within granted space
* SCP_ADDRESS address in own address space
* SCP_BYTES bytes to be copied
*
* For the vectored variant (do_vsafecopy):
* VSCP_VEC_ADDR address of vector
* VSCP_VEC_SIZE number of significant elements in vector
*/
#include <assert.h>
#include <minix/type.h>
#include <minix/safecopies.h>
#include "kernel/system.h"
#define MAX_INDIRECT_DEPTH 5 /* up to how many indirect grants to follow? */
#define MEM_TOP 0xFFFFFFFFUL
static int safecopy(struct proc *, endpoint_t, endpoint_t,
cp_grant_id_t, size_t, vir_bytes, vir_bytes, int);
#define HASGRANTTABLE(gr) \
(priv(gr) && priv(gr)->s_grant_table)
/*===========================================================================*
* verify_grant *
*===========================================================================*/
int verify_grant(granter, grantee, grant, bytes, access,
offset_in, offset_result, e_granter)
endpoint_t granter, grantee; /* copyee, copyer */
cp_grant_id_t grant; /* grant id */
vir_bytes bytes; /* copy size */
int access; /* direction (read/write) */
vir_bytes offset_in; /* copy offset within grant */
vir_bytes *offset_result; /* copy offset within virtual address space */
endpoint_t *e_granter; /* new granter (magic grants) */
{
static cp_grant_t g;
static int proc_nr;
static const struct proc *granter_proc;
int depth = 0;
do {
/* Get granter process slot (if valid), and check range of
* grant id.
*/
if(!isokendpt(granter, &proc_nr) ) {
printf(
"grant verify failed: invalid granter %d\n", (int) granter);
return(EINVAL);
}
if(!GRANT_VALID(grant)) {
printf(
"grant verify failed: invalid grant %d\n", (int) grant);
return(EINVAL);
}
granter_proc = proc_addr(proc_nr);
/* If there is no priv. structure, or no grant table in the
* priv. structure, or the grant table in the priv. structure
* is too small for the grant, return EPERM.
*/
if(!HASGRANTTABLE(granter_proc)) {
printf(
"grant verify failed: granter %d has no grant table\n",
granter);
return(EPERM);
}
if(priv(granter_proc)->s_grant_entries <= grant) {
printf(
"verify_grant: grant verify failed in ep %d "
"proc %d: grant %d out of range "
"for table size %d\n",
granter, proc_nr, grant,
priv(granter_proc)->s_grant_entries);
return(EPERM);
}
/* Copy the grant entry corresponding to this id to see what it
* looks like. If it fails, hide the fact that granter has
* (presumably) set an invalid grant table entry by returning
* EPERM, just like with an invalid grant id.
*/
if(data_copy(granter,
priv(granter_proc)->s_grant_table + sizeof(g)*grant,
KERNEL, (vir_bytes) &g, sizeof(g)) != OK) {
printf(
"verify_grant: grant verify: data_copy failed\n");
return EPERM;
}
/* Check validity. */
if((g.cp_flags & (CPF_USED | CPF_VALID)) !=
(CPF_USED | CPF_VALID)) {
printf(
"verify_grant: grant failed: invalid (%d flags 0x%lx)\n",
grant, g.cp_flags);
return EPERM;
}
/* The given grant may be an indirect grant, that is, a grant
* that provides permission to use a grant given to the
* granter (i.e., for which it is the grantee). This can lead
* to a chain of indirect grants which must be followed back.
*/
if((g.cp_flags & CPF_INDIRECT)) {
/* Stop after a few iterations. There may be a loop. */
if (depth == MAX_INDIRECT_DEPTH) {
printf(
"verify grant: indirect grant verify "
"failed: exceeded maximum depth\n");
return ELOOP;
}
depth++;
/* Verify actual grantee. */
if(g.cp_u.cp_indirect.cp_who_to != grantee &&
grantee != ANY &&
g.cp_u.cp_indirect.cp_who_to != ANY) {
printf(
"verify_grant: indirect grant verify "
"failed: bad grantee\n");
return EPERM;
}
/* Start over with new granter, grant, and grantee. */
grantee = granter;
granter = g.cp_u.cp_indirect.cp_who_from;
grant = g.cp_u.cp_indirect.cp_grant;
}
} while(g.cp_flags & CPF_INDIRECT);
/* Check access of grant. */
if(((g.cp_flags & access) != access)) {
printf(
"verify_grant: grant verify failed: access invalid; want 0x%x, have 0x%x\n",
access, g.cp_flags);
return EPERM;
}
if((g.cp_flags & CPF_DIRECT)) {
/* Don't fiddle around with grants that wrap, arithmetic
* below may be confused.
*/
if(MEM_TOP - g.cp_u.cp_direct.cp_len + 1 <
g.cp_u.cp_direct.cp_start) {
printf(
"verify_grant: direct grant verify failed: len too long\n");
return EPERM;
}
/* Verify actual grantee. */
if(g.cp_u.cp_direct.cp_who_to != grantee && grantee != ANY
&& g.cp_u.cp_direct.cp_who_to != ANY) {
printf(
"verify_grant: direct grant verify failed: bad grantee\n");
return EPERM;
}
/* Verify actual copy range. */
if((offset_in+bytes < offset_in) ||
offset_in+bytes > g.cp_u.cp_direct.cp_len) {
printf(
"verify_grant: direct grant verify failed: bad size or range. "
"granted %d bytes @ 0x%lx; wanted %d bytes @ 0x%lx\n",
g.cp_u.cp_direct.cp_len,
g.cp_u.cp_direct.cp_start,
bytes, offset_in);
return EPERM;
}
/* Verify successful - tell caller what address it is. */
*offset_result = g.cp_u.cp_direct.cp_start + offset_in;
*e_granter = granter;
} else if(g.cp_flags & CPF_MAGIC) {
/* Currently, it is hardcoded that only FS may do
* magic grants.
*/
if(granter != VFS_PROC_NR) {
printf(
"verify_grant: magic grant verify failed: granter (%d) "
"is not FS (%d)\n", granter, VFS_PROC_NR);
return EPERM;
}
/* Verify actual grantee. */
if(g.cp_u.cp_magic.cp_who_to != grantee && grantee != ANY
&& g.cp_u.cp_direct.cp_who_to != ANY) {
printf(
"verify_grant: magic grant verify failed: bad grantee\n");
return EPERM;
}
/* Verify actual copy range. */
if((offset_in+bytes < offset_in) ||
offset_in+bytes > g.cp_u.cp_magic.cp_len) {
printf(
"verify_grant: magic grant verify failed: bad size or range. "
"granted %d bytes @ 0x%lx; wanted %d bytes @ 0x%lx\n",
g.cp_u.cp_magic.cp_len,
g.cp_u.cp_magic.cp_start,
bytes, offset_in);
return EPERM;
}
/* Verify successful - tell caller what address it is. */
*offset_result = g.cp_u.cp_magic.cp_start + offset_in;
*e_granter = g.cp_u.cp_magic.cp_who_from;
} else {
printf(
"verify_grant: grant verify failed: unknown grant type\n");
return EPERM;
}
return OK;
}
/*===========================================================================*
* safecopy *
*===========================================================================*/
static int safecopy(caller, granter, grantee, grantid, bytes,
g_offset, addr, access)
struct proc * caller;
endpoint_t granter, grantee;
cp_grant_id_t grantid;
size_t bytes;
vir_bytes g_offset, addr;
int access; /* CPF_READ for a copy from granter to grantee, CPF_WRITE
* for a copy from grantee to granter.
*/
{
static struct vir_addr v_src, v_dst;
static vir_bytes v_offset;
endpoint_t new_granter, *src, *dst;
struct proc *granter_p;
int r;
#if PERF_USE_COW_SAFECOPY
vir_bytes size;
#endif
if(granter == NONE || grantee == NONE) {
printf("safecopy: nonsense processes\n");
return EFAULT;
}
/* See if there is a reasonable grant table. */
if(!(granter_p = endpoint_lookup(granter))) return EINVAL;
if(!HASGRANTTABLE(granter_p)) {
printf(
"safecopy failed: granter %d has no grant table\n", granter);
return(EPERM);
}
/* Decide who is src and who is dst. */
if(access & CPF_READ) {
src = &granter;
dst = &grantee;
} else {
src = &grantee;
dst = &granter;
}
/* Verify permission exists. */
if((r=verify_grant(granter, grantee, grantid, bytes, access,
g_offset, &v_offset, &new_granter)) != OK) {
printf(
"grant %d verify to copy %d->%d by %d failed: err %d\n",
grantid, *src, *dst, grantee, r);
return r;
}
/* verify_grant() can redirect the grantee to someone else,
* meaning the source or destination changes.
*/
granter = new_granter;
/* Now it's a regular copy. */
v_src.proc_nr_e = *src;
v_dst.proc_nr_e = *dst;
/* Now the offset in virtual addressing is known in 'offset'.
* Depending on the access, this is the source or destination
* address.
*/
if(access & CPF_READ) {
v_src.offset = v_offset;
v_dst.offset = (vir_bytes) addr;
} else {
v_src.offset = (vir_bytes) addr;
v_dst.offset = v_offset;
}
/* Do the regular copy. */
#if PERF_USE_COW_SAFECOPY
if(v_offset % CLICK_SIZE != addr % CLICK_SIZE || bytes < CLICK_SIZE) {
/* Give up on COW immediately when offsets are not aligned
* or we are copying less than a page.
*/
return virtual_copy_vmcheck(caller, &v_src, &v_dst, bytes);
}
if((size = v_offset % CLICK_SIZE) != 0) {
/* Normal copy for everything before the first page boundary. */
size = CLICK_SIZE - size;
r = virtual_copy_vmcheck(caller, &v_src, &v_dst, size);
if(r != OK)
return r;
v_src.offset += size;
v_dst.offset += size;
bytes -= size;
}
if((size = bytes / CLICK_SIZE) != 0) {
/* Use COW optimization when copying entire pages. */
size *= CLICK_SIZE;
r = map_invoke_vm(VMPTYPE_COWMAP,
v_dst.proc_nr_e, v_dst.segment, v_dst.offset,
v_src.proc_nr_e, v_src.segment, v_src.offset,
size, 0);
if(r != OK)
return r;
v_src.offset += size;
v_dst.offset += size;
bytes -= size;
}
if(bytes != 0) {
/* Normal copy for everything after the last page boundary. */
r = virtual_copy_vmcheck(caller, &v_src, &v_dst, bytes);
if(r != OK)
return r;
}
return OK;
#else
return virtual_copy_vmcheck(caller, &v_src, &v_dst, bytes);
#endif
}
/*===========================================================================*
* do_safecopy_to *
*===========================================================================*/
int do_safecopy_to(struct proc * caller, message * m_ptr)
{
return safecopy(caller, m_ptr->SCP_FROM_TO, caller->p_endpoint,
(cp_grant_id_t) m_ptr->SCP_GID,
m_ptr->SCP_BYTES, m_ptr->SCP_OFFSET,
(vir_bytes) m_ptr->SCP_ADDRESS, CPF_WRITE);
}
/*===========================================================================*
* do_safecopy_from *
*===========================================================================*/
int do_safecopy_from(struct proc * caller, message * m_ptr)
{
return safecopy(caller, m_ptr->SCP_FROM_TO, caller->p_endpoint,
(cp_grant_id_t) m_ptr->SCP_GID,
m_ptr->SCP_BYTES, m_ptr->SCP_OFFSET,
(vir_bytes) m_ptr->SCP_ADDRESS, CPF_READ);
}
/*===========================================================================*
* do_vsafecopy *
*===========================================================================*/
int do_vsafecopy(struct proc * caller, message * m_ptr)
{
static struct vscp_vec vec[SCPVEC_NR];
static struct vir_addr src, dst;
int r, i, els;
size_t bytes;
/* Set vector copy parameters. */
src.proc_nr_e = caller->p_endpoint;
assert(src.proc_nr_e != NONE);
src.offset = (vir_bytes) m_ptr->VSCP_VEC_ADDR;
dst.proc_nr_e = KERNEL;
dst.offset = (vir_bytes) vec;
/* No. of vector elements. */
els = m_ptr->VSCP_VEC_SIZE;
bytes = els * sizeof(struct vscp_vec);
/* Obtain vector of copies. */
if((r=virtual_copy_vmcheck(caller, &src, &dst, bytes)) != OK)
return r;
/* Perform safecopies. */
for(i = 0; i < els; i++) {
int access;
endpoint_t granter;
if(vec[i].v_from == SELF) {
access = CPF_WRITE;
granter = vec[i].v_to;
} else if(vec[i].v_to == SELF) {
access = CPF_READ;
granter = vec[i].v_from;
} else {
printf("vsafecopy: %d: element %d/%d: no SELF found\n",
caller->p_endpoint, i, els);
return EINVAL;
}
/* Do safecopy for this element. */
if((r=safecopy(caller, granter, caller->p_endpoint,
vec[i].v_gid,
vec[i].v_bytes, vec[i].v_offset,
vec[i].v_addr, access)) != OK) {
return r;
}
}
return OK;
}