This commit removes all traces of Minix segments (the text/data/stack memory map abstraction in the kernel) and the significance of Intel segments (hardware segments like CS, DS that add offsets to all addressing before page table translation). This ultimately simplifies the memory layout and addressing and makes the same layout possible on non-Intel architectures. There are only two types of addresses in the world now: virtual and physical; even the kernel and processes have the same virtual address space. Kernel and user processes can be distinguished at a glance as processes won't use 0xF0000000 and above. No static pre-allocated memory sizes exist any more. Changes to booting: . The pre_init.c leaves the kernel and modules exactly as they were left by the bootloader in physical memory . The kernel starts running using physical addressing, loaded at a fixed location given in its linker script by the bootloader. All code and data in this phase are linked to this fixed low location. . It makes a bootstrap pagetable to map itself to a fixed high location (also in linker script) and jumps to the high address. All code and data then use this high addressing. . All code/data symbols linked at the low addresses are prefixed by an objcopy step with __k_unpaged_*, so that code in that phase cannot reference highly-linked symbols (which aren't valid yet) or vice versa (symbols that aren't valid any more). . The two addressing modes are separated in the linker script by collecting the unpaged_*.o objects and linking them with low addresses, and linking the rest high. Some objects are linked twice, once low and once high. . The bootstrap phase passes a lot of information (e.g. free memory list, physical location of the modules, etc.) using the kinfo struct. . After this bootstrap the low-linked part is freed. . The kernel maps VM into the bootstrap page table so that VM can begin executing. Its first job is to make page tables for all other boot processes. 
So VM runs before RS, and RS gets a fully dynamic, VM-managed address space. VM gets its privilege info from RS as usual but that happens after RS starts running. . Both the kernel loading VM and VM organizing boot processes happen using the libexec logic. This removes the last reason for VM to still know much about exec() and vm/exec.c is gone. Further Implementation: . All segments are based at 0 and have a 4 GB limit. . The kernel is mapped in at the top of the virtual address space so as not to constrain the user processes. . Processes do not use segments from the LDT at all; there are no segments in the LDT any more, so no LLDT is needed. . The Minix segments T/D/S are gone and so none of the user-space or in-kernel copy functions use them. The copy functions use a process endpoint of NONE to realize it's a physical address, virtual otherwise. . The umap call only makes sense to translate a virtual address to a physical address now. . Segments-related calls like newmap and alloc_segments are gone. . All segments-related translation in VM is gone (vir2map etc). . Initialization in VM is simpler as no moving around is necessary. . VM and all other boot processes can be linked wherever they wish and will be mapped in at the right location by the kernel and VM respectively. Other changes: . The multiboot code is less special: it does not use mb_print for its diagnostics any more but uses printf() as normal, saving the output into the diagnostics buffer, only printing to the screen using the direct print functions if a panic() occurs. . The multiboot code uses the flexible 'free memory map list' style to receive the list of free memory if available. . The kernel determines the memory layout of the processes to a degree: it tells VM where the kernel starts and ends and where the kernel wants the top of the process to be. VM then uses this entire range, i.e. 
the stack is right at the top, and mmap()ped bits of memory are placed below that downwards, and the break grows upwards. Other Consequences: . Every process gets its own page table as address spaces can't be separated any more by segments. . As all segments are 0-based, there is no distinction between virtual and linear addresses, nor between userspace and kernel addresses. . Less work is done when context switching, leading to a net performance increase. (8% faster on my machine for 'make servers'.) . The layout and configuration of the GDT makes sysenter and syscall possible.
421 lines
12 KiB
C
421 lines
12 KiB
C
/* The kernel call implemented in this file:
|
|
* m_type: SYS_SAFECOPYFROM or SYS_SAFECOPYTO or SYS_VSAFECOPY
|
|
*
|
|
* The parameters for this kernel call are:
|
|
* SCP_FROM_TO other endpoint
|
|
* SCP_GID grant id
|
|
* SCP_OFFSET offset within granted space
|
|
* SCP_ADDRESS address in own address space
|
|
* SCP_BYTES bytes to be copied
|
|
*
|
|
* For the vectored variant (do_vsafecopy):
|
|
* VSCP_VEC_ADDR address of vector
|
|
* VSCP_VEC_SIZE number of significant elements in vector
|
|
*/
|
|
|
|
#include <assert.h>
|
|
#include <minix/type.h>
|
|
#include <minix/safecopies.h>
|
|
|
|
#include "kernel/system.h"
|
|
|
|
#define MAX_INDIRECT_DEPTH 5 /* up to how many indirect grants to follow? */
|
|
|
|
#define MEM_TOP 0xFFFFFFFFUL
|
|
|
|
static int safecopy(struct proc *, endpoint_t, endpoint_t,
|
|
cp_grant_id_t, size_t, vir_bytes, vir_bytes, int);
|
|
|
|
#define HASGRANTTABLE(gr) \
|
|
(priv(gr) && priv(gr)->s_grant_table)
|
|
|
|
/*===========================================================================*
|
|
* verify_grant *
|
|
*===========================================================================*/
|
|
int verify_grant(granter, grantee, grant, bytes, access,
|
|
offset_in, offset_result, e_granter)
|
|
endpoint_t granter, grantee; /* copyee, copyer */
|
|
cp_grant_id_t grant; /* grant id */
|
|
vir_bytes bytes; /* copy size */
|
|
int access; /* direction (read/write) */
|
|
vir_bytes offset_in; /* copy offset within grant */
|
|
vir_bytes *offset_result; /* copy offset within virtual address space */
|
|
endpoint_t *e_granter; /* new granter (magic grants) */
|
|
{
|
|
static cp_grant_t g;
|
|
static int proc_nr;
|
|
static const struct proc *granter_proc;
|
|
int depth = 0;
|
|
|
|
do {
|
|
/* Get granter process slot (if valid), and check range of
|
|
* grant id.
|
|
*/
|
|
if(!isokendpt(granter, &proc_nr) ) {
|
|
printf(
|
|
"grant verify failed: invalid granter %d\n", (int) granter);
|
|
return(EINVAL);
|
|
}
|
|
if(!GRANT_VALID(grant)) {
|
|
printf(
|
|
"grant verify failed: invalid grant %d\n", (int) grant);
|
|
return(EINVAL);
|
|
}
|
|
granter_proc = proc_addr(proc_nr);
|
|
|
|
/* If there is no priv. structure, or no grant table in the
|
|
* priv. structure, or the grant table in the priv. structure
|
|
* is too small for the grant, return EPERM.
|
|
*/
|
|
if(!HASGRANTTABLE(granter_proc)) {
|
|
printf(
|
|
"grant verify failed: granter %d has no grant table\n",
|
|
granter);
|
|
return(EPERM);
|
|
}
|
|
|
|
if(priv(granter_proc)->s_grant_entries <= grant) {
|
|
printf(
|
|
"verify_grant: grant verify failed in ep %d "
|
|
"proc %d: grant %d out of range "
|
|
"for table size %d\n",
|
|
granter, proc_nr, grant,
|
|
priv(granter_proc)->s_grant_entries);
|
|
return(EPERM);
|
|
}
|
|
|
|
/* Copy the grant entry corresponding to this id to see what it
|
|
* looks like. If it fails, hide the fact that granter has
|
|
* (presumably) set an invalid grant table entry by returning
|
|
* EPERM, just like with an invalid grant id.
|
|
*/
|
|
if(data_copy(granter,
|
|
priv(granter_proc)->s_grant_table + sizeof(g)*grant,
|
|
KERNEL, (vir_bytes) &g, sizeof(g)) != OK) {
|
|
printf(
|
|
"verify_grant: grant verify: data_copy failed\n");
|
|
return EPERM;
|
|
}
|
|
|
|
/* Check validity. */
|
|
if((g.cp_flags & (CPF_USED | CPF_VALID)) !=
|
|
(CPF_USED | CPF_VALID)) {
|
|
printf(
|
|
"verify_grant: grant failed: invalid (%d flags 0x%lx)\n",
|
|
grant, g.cp_flags);
|
|
return EPERM;
|
|
}
|
|
|
|
/* The given grant may be an indirect grant, that is, a grant
|
|
* that provides permission to use a grant given to the
|
|
* granter (i.e., for which it is the grantee). This can lead
|
|
* to a chain of indirect grants which must be followed back.
|
|
*/
|
|
if((g.cp_flags & CPF_INDIRECT)) {
|
|
/* Stop after a few iterations. There may be a loop. */
|
|
if (depth == MAX_INDIRECT_DEPTH) {
|
|
printf(
|
|
"verify grant: indirect grant verify "
|
|
"failed: exceeded maximum depth\n");
|
|
return ELOOP;
|
|
}
|
|
depth++;
|
|
|
|
/* Verify actual grantee. */
|
|
if(g.cp_u.cp_indirect.cp_who_to != grantee &&
|
|
grantee != ANY &&
|
|
g.cp_u.cp_indirect.cp_who_to != ANY) {
|
|
printf(
|
|
"verify_grant: indirect grant verify "
|
|
"failed: bad grantee\n");
|
|
return EPERM;
|
|
}
|
|
|
|
/* Start over with new granter, grant, and grantee. */
|
|
grantee = granter;
|
|
granter = g.cp_u.cp_indirect.cp_who_from;
|
|
grant = g.cp_u.cp_indirect.cp_grant;
|
|
}
|
|
} while(g.cp_flags & CPF_INDIRECT);
|
|
|
|
/* Check access of grant. */
|
|
if(((g.cp_flags & access) != access)) {
|
|
printf(
|
|
"verify_grant: grant verify failed: access invalid; want 0x%x, have 0x%x\n",
|
|
access, g.cp_flags);
|
|
return EPERM;
|
|
}
|
|
|
|
if((g.cp_flags & CPF_DIRECT)) {
|
|
/* Don't fiddle around with grants that wrap, arithmetic
|
|
* below may be confused.
|
|
*/
|
|
if(MEM_TOP - g.cp_u.cp_direct.cp_len + 1 <
|
|
g.cp_u.cp_direct.cp_start) {
|
|
printf(
|
|
"verify_grant: direct grant verify failed: len too long\n");
|
|
return EPERM;
|
|
}
|
|
|
|
/* Verify actual grantee. */
|
|
if(g.cp_u.cp_direct.cp_who_to != grantee && grantee != ANY
|
|
&& g.cp_u.cp_direct.cp_who_to != ANY) {
|
|
printf(
|
|
"verify_grant: direct grant verify failed: bad grantee\n");
|
|
return EPERM;
|
|
}
|
|
|
|
/* Verify actual copy range. */
|
|
if((offset_in+bytes < offset_in) ||
|
|
offset_in+bytes > g.cp_u.cp_direct.cp_len) {
|
|
printf(
|
|
"verify_grant: direct grant verify failed: bad size or range. "
|
|
"granted %d bytes @ 0x%lx; wanted %d bytes @ 0x%lx\n",
|
|
g.cp_u.cp_direct.cp_len,
|
|
g.cp_u.cp_direct.cp_start,
|
|
bytes, offset_in);
|
|
return EPERM;
|
|
}
|
|
|
|
/* Verify successful - tell caller what address it is. */
|
|
*offset_result = g.cp_u.cp_direct.cp_start + offset_in;
|
|
*e_granter = granter;
|
|
} else if(g.cp_flags & CPF_MAGIC) {
|
|
/* Currently, it is hardcoded that only FS may do
|
|
* magic grants.
|
|
*/
|
|
if(granter != VFS_PROC_NR) {
|
|
printf(
|
|
"verify_grant: magic grant verify failed: granter (%d) "
|
|
"is not FS (%d)\n", granter, VFS_PROC_NR);
|
|
return EPERM;
|
|
}
|
|
|
|
/* Verify actual grantee. */
|
|
if(g.cp_u.cp_magic.cp_who_to != grantee && grantee != ANY
|
|
&& g.cp_u.cp_direct.cp_who_to != ANY) {
|
|
printf(
|
|
"verify_grant: magic grant verify failed: bad grantee\n");
|
|
return EPERM;
|
|
}
|
|
|
|
/* Verify actual copy range. */
|
|
if((offset_in+bytes < offset_in) ||
|
|
offset_in+bytes > g.cp_u.cp_magic.cp_len) {
|
|
printf(
|
|
"verify_grant: magic grant verify failed: bad size or range. "
|
|
"granted %d bytes @ 0x%lx; wanted %d bytes @ 0x%lx\n",
|
|
g.cp_u.cp_magic.cp_len,
|
|
g.cp_u.cp_magic.cp_start,
|
|
bytes, offset_in);
|
|
return EPERM;
|
|
}
|
|
|
|
/* Verify successful - tell caller what address it is. */
|
|
*offset_result = g.cp_u.cp_magic.cp_start + offset_in;
|
|
*e_granter = g.cp_u.cp_magic.cp_who_from;
|
|
} else {
|
|
printf(
|
|
"verify_grant: grant verify failed: unknown grant type\n");
|
|
return EPERM;
|
|
}
|
|
|
|
return OK;
|
|
}
|
|
|
|
/*===========================================================================*
 *				safecopy				     *
 *===========================================================================*/
/* Perform one grant-checked copy on behalf of 'caller': verify the grant
 * 'grantid' issued by 'granter' to 'grantee' for 'access'
 * (CPF_READ: granter -> grantee, CPF_WRITE: grantee -> granter), then copy
 * 'bytes' bytes between offset 'g_offset' in the granted area and address
 * 'addr' in the grantee's own address space.  Returns OK or a negative
 * errno-style code (EFAULT, EINVAL, EPERM, or whatever verify_grant /
 * virtual_copy_vmcheck report).
 */
static int safecopy(caller, granter, grantee, grantid, bytes,
	g_offset, addr, access)
struct proc * caller;
endpoint_t granter, grantee;
cp_grant_id_t grantid;
size_t bytes;
vir_bytes g_offset, addr;
int access;			/* CPF_READ for a copy from granter to grantee, CPF_WRITE
				 * for a copy from grantee to granter.
				 */
{
	/* NOTE(review): v_src/v_dst/v_offset are 'static' — presumably a
	 * stack-saving measure; they hold no state across calls.  Confirm
	 * this is still wanted if kernel calls can ever run concurrently.
	 */
	static struct vir_addr v_src, v_dst;
	static vir_bytes v_offset;
	endpoint_t new_granter, *src, *dst;
	struct proc *granter_p;
	int r;
#if PERF_USE_COW_SAFECOPY
	vir_bytes size;
#endif

	/* Physical-address copies (endpoint NONE) make no sense here. */
	if(granter == NONE || grantee == NONE) {
		printf("safecopy: nonsense processes\n");
		return EFAULT;
	}

	/* See if there is a reasonable grant table. */
	if(!(granter_p = endpoint_lookup(granter))) return EINVAL;
	if(!HASGRANTTABLE(granter_p)) {
		printf(
		"safecopy failed: granter %d has no grant table\n", granter);
		return(EPERM);
	}

	/* Decide who is src and who is dst: on CPF_READ data flows from the
	 * granter to the grantee, on CPF_WRITE the other way around.
	 */
	if(access & CPF_READ) {
		src = &granter;
		dst = &grantee;
	} else {
		src = &grantee;
		dst = &granter;
	}

	/* Verify permission exists. */
	if((r=verify_grant(granter, grantee, grantid, bytes, access,
	    g_offset, &v_offset, &new_granter)) != OK) {
		printf(
		"grant %d verify to copy %d->%d by %d failed: err %d\n",
			grantid, *src, *dst, grantee, r);
		return r;
	}

	/* verify_grant() can redirect the grantee to someone else,
	 * meaning the source or destination changes.
	 */
	granter = new_granter;

	/* Now it's a regular copy.  Note that *src/*dst still point at the
	 * (possibly updated) 'granter' variable.
	 */
	v_src.proc_nr_e = *src;
	v_dst.proc_nr_e = *dst;

	/* Now the offset in virtual addressing is known in 'offset'.
	 * Depending on the access, this is the source or destination
	 * address.
	 */
	if(access & CPF_READ) {
		v_src.offset = v_offset;
		v_dst.offset = (vir_bytes) addr;
	} else {
		v_src.offset = (vir_bytes) addr;
		v_dst.offset = v_offset;
	}

	/* Do the regular copy. */
#if PERF_USE_COW_SAFECOPY
	/* NOTE(review): this COW path still references v_src.segment /
	 * v_dst.segment; verify those fields still exist if this branch is
	 * ever compiled in.
	 */
	if(v_offset % CLICK_SIZE != addr % CLICK_SIZE || bytes < CLICK_SIZE) {
		/* Give up on COW immediately when offsets are not aligned
		 * or we are copying less than a page.
		 */
		return virtual_copy_vmcheck(caller, &v_src, &v_dst, bytes);
	}

	if((size = v_offset % CLICK_SIZE) != 0) {
		/* Normal copy for everything before the first page boundary. */
		size = CLICK_SIZE - size;
		r = virtual_copy_vmcheck(caller, &v_src, &v_dst, size);
		if(r != OK)
			return r;
		v_src.offset += size;
		v_dst.offset += size;
		bytes -= size;
	}
	if((size = bytes / CLICK_SIZE) != 0) {
		/* Use COW optimization when copying entire pages. */
		size *= CLICK_SIZE;
		r = map_invoke_vm(VMPTYPE_COWMAP,
			v_dst.proc_nr_e, v_dst.segment, v_dst.offset,
			v_src.proc_nr_e, v_src.segment, v_src.offset,
			size, 0);
		if(r != OK)
			return r;
		v_src.offset += size;
		v_dst.offset += size;
		bytes -= size;
	}
	if(bytes != 0) {
		/* Normal copy for everything after the last page boundary. */
		r = virtual_copy_vmcheck(caller, &v_src, &v_dst, bytes);
		if(r != OK)
			return r;
	}

	return OK;
#else
	return virtual_copy_vmcheck(caller, &v_src, &v_dst, bytes);
#endif
}
|
|
|
|
/*===========================================================================*
|
|
* do_safecopy_to *
|
|
*===========================================================================*/
|
|
int do_safecopy_to(struct proc * caller, message * m_ptr)
|
|
{
|
|
return safecopy(caller, m_ptr->SCP_FROM_TO, caller->p_endpoint,
|
|
(cp_grant_id_t) m_ptr->SCP_GID,
|
|
m_ptr->SCP_BYTES, m_ptr->SCP_OFFSET,
|
|
(vir_bytes) m_ptr->SCP_ADDRESS, CPF_WRITE);
|
|
}
|
|
|
|
/*===========================================================================*
|
|
* do_safecopy_from *
|
|
*===========================================================================*/
|
|
int do_safecopy_from(struct proc * caller, message * m_ptr)
|
|
{
|
|
return safecopy(caller, m_ptr->SCP_FROM_TO, caller->p_endpoint,
|
|
(cp_grant_id_t) m_ptr->SCP_GID,
|
|
m_ptr->SCP_BYTES, m_ptr->SCP_OFFSET,
|
|
(vir_bytes) m_ptr->SCP_ADDRESS, CPF_READ);
|
|
}
|
|
|
|
/*===========================================================================*
|
|
* do_vsafecopy *
|
|
*===========================================================================*/
|
|
int do_vsafecopy(struct proc * caller, message * m_ptr)
|
|
{
|
|
static struct vscp_vec vec[SCPVEC_NR];
|
|
static struct vir_addr src, dst;
|
|
int r, i, els;
|
|
size_t bytes;
|
|
|
|
/* Set vector copy parameters. */
|
|
src.proc_nr_e = caller->p_endpoint;
|
|
assert(src.proc_nr_e != NONE);
|
|
src.offset = (vir_bytes) m_ptr->VSCP_VEC_ADDR;
|
|
dst.proc_nr_e = KERNEL;
|
|
dst.offset = (vir_bytes) vec;
|
|
|
|
/* No. of vector elements. */
|
|
els = m_ptr->VSCP_VEC_SIZE;
|
|
bytes = els * sizeof(struct vscp_vec);
|
|
|
|
/* Obtain vector of copies. */
|
|
if((r=virtual_copy_vmcheck(caller, &src, &dst, bytes)) != OK)
|
|
return r;
|
|
|
|
/* Perform safecopies. */
|
|
for(i = 0; i < els; i++) {
|
|
int access;
|
|
endpoint_t granter;
|
|
if(vec[i].v_from == SELF) {
|
|
access = CPF_WRITE;
|
|
granter = vec[i].v_to;
|
|
} else if(vec[i].v_to == SELF) {
|
|
access = CPF_READ;
|
|
granter = vec[i].v_from;
|
|
} else {
|
|
printf("vsafecopy: %d: element %d/%d: no SELF found\n",
|
|
caller->p_endpoint, i, els);
|
|
return EINVAL;
|
|
}
|
|
|
|
/* Do safecopy for this element. */
|
|
if((r=safecopy(caller, granter, caller->p_endpoint,
|
|
vec[i].v_gid,
|
|
vec[i].v_bytes, vec[i].v_offset,
|
|
vec[i].v_addr, access)) != OK) {
|
|
return r;
|
|
}
|
|
}
|
|
|
|
return OK;
|
|
}
|
|
|