--- linux-2.4.20/fs/proc/array.c.fourfour 2003-06-04 14:21:58.000000000 -0400 +++ linux-2.4.20/fs/proc/array.c 2003-06-04 14:21:58.000000000 -0400 @@ -472,11 +472,11 @@ static inline void statm_pmd_range(pgd_t static void statm_pgd_range(pgd_t * pgd, unsigned long address, unsigned long end, int * pages, int * shared, int * dirty, int * total) { - while (address < end) { + do { statm_pmd_range(pgd, address, end - address, pages, shared, dirty, total); address = (address + PGDIR_SIZE) & PGDIR_MASK; pgd++; - } + } while (address && (address < end)); } int proc_pid_statm(struct task_struct *task, char * buffer) --- linux-2.4.20/kernel/ksyms.c.fourfour 2003-06-04 14:21:58.000000000 -0400 +++ linux-2.4.20/kernel/ksyms.c 2003-06-04 14:21:58.000000000 -0400 @@ -144,7 +144,6 @@ EXPORT_SYMBOL(kmap_high); EXPORT_SYMBOL(kunmap_high); EXPORT_SYMBOL(highmem_start_page); EXPORT_SYMBOL(create_bounce); -EXPORT_SYMBOL(kmap_prot); EXPORT_SYMBOL(kmap_pte); EXPORT_SYMBOL(blk_queue_bounce); #endif --- linux-2.4.20/mm/memory.c.fourfour 2003-06-04 14:21:58.000000000 -0400 +++ linux-2.4.20/mm/memory.c 2003-06-04 14:21:58.000000000 -0400 @@ -111,7 +111,7 @@ static inline void free_one_pmd(pmd_t * pte_free(pte); } -static inline void free_one_pgd(pgd_t * dir) +static inline void free_one_pgd(pgd_t * dir, int pgd_idx) { int j; pmd_t * pmd; @@ -126,6 +126,11 @@ static inline void free_one_pgd(pgd_t * pmd = pmd_offset(dir, 0); pgd_clear(dir); for (j = 0; j < PTRS_PER_PMD ; j++) { + /* Can't just #define to PAGE_OFFSET since it's 0 on some platforms */ +#ifdef PAGE_OFFSET_USER + if (pgd_idx * PGDIR_SIZE + j * PMD_SIZE >= PAGE_OFFSET_USER) + break; +#endif prefetchw(pmd+j+(PREFETCH_STRIDE/16)); free_one_pmd(pmd+j); } @@ -207,12 +212,14 @@ out: void clear_page_tables(struct mm_struct *mm, unsigned long first, int nr) { pgd_t * page_dir = mm->pgd; + int idx = first; spin_lock(&mm->page_table_lock); page_dir += first; do { - free_one_pgd(page_dir); + free_one_pgd(page_dir, idx); page_dir++; + 
idx++; } while (--nr); spin_unlock(&mm->page_table_lock); @@ -437,7 +444,7 @@ static inline int zap_pte_range(mmu_gath static inline int zap_pmd_range(mmu_gather_t *tlb, pgd_t * dir, unsigned long address, unsigned long size) { pmd_t * pmd; - unsigned long end; + unsigned long end, pgd_boundary; int freed; if (pgd_none(*dir)) @@ -449,8 +456,9 @@ static inline int zap_pmd_range(mmu_gath } pmd = pmd_offset(dir, address); end = address + size; - if (end > ((address + PGDIR_SIZE) & PGDIR_MASK)) - end = ((address + PGDIR_SIZE) & PGDIR_MASK); + pgd_boundary = ((address + PGDIR_SIZE) & PGDIR_MASK); + if (pgd_boundary && (end > pgd_boundary)) + end = pgd_boundary; freed = 0; do { freed += zap_pte_range(tlb, pmd, address, end - address); @@ -521,6 +529,11 @@ void zap_page_range(struct mm_struct *mm } } +/* 4G/4G split x86 specific macro */ +#ifndef pte_user +#define pte_user(x) 1 +#endif + /* * Do a quick page-table lookup for a single page. */ @@ -544,7 +557,7 @@ struct page * follow_page(struct mm_stru pte = *ptep; pte_unmap(ptep); - if (pte_present(pte)) { + if (pte_present(pte) && pte_user(pte)) { struct page *page = pte_page(pte); prefetch(page); if (!write || --- /dev/null 2003-01-30 05:24:37.000000000 -0500 +++ linux-2.4.20/mm/usercopy.c 2003-06-04 14:21:58.000000000 -0400 @@ -0,0 +1,262 @@ +/* + * linux/mm/usercopy.c + * + * (C) Copyright 2003 Ingo Molnar + * + * Generic implementation of all the user-VM access functions, without + * relying on being able to access the VM directly. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/* + * Get kernel address of the user page and pin it. + */ +static inline struct page *pin_page(unsigned long addr, int write) +{ + struct mm_struct *mm = current->mm ? : &init_mm; + struct page *page; + + spin_lock(&mm->page_table_lock); + /* + * Do a quick atomic lookup first - this is the fastpath. 
+ */ + page = follow_page(mm, addr, write); + if (likely(page != NULL)) { + if (!PageReserved(page)) + get_page(page); + spin_unlock(&mm->page_table_lock); + return page; + } + + /* + * No luck - bad address or need to fault in the page: + */ + spin_unlock(&mm->page_table_lock); + + down_read(&mm->mmap_sem); + get_user_pages(current, mm, addr, 1, write, 0, &page, NULL); + up_read(&mm->mmap_sem); + + return page; +} + +static inline void unpin_page(struct page *page) +{ + put_page(page); +} + +/* + * Access the address space of current->mm (init_mm if there is none). + * Source/target buffer must be kernel space, + * Do not walk the page table directly, use get_user_pages + */ +static int rw_vm(unsigned long addr, void *buf, int len, int write) +{ + if (!len) + return 0; + + /* ignore errors, just check how much was successfully transferred */ + while (len) { + struct page *page = NULL; + int bytes, offset; + void *maddr; + + page = pin_page(addr, write); + if (!page) + break; + + bytes = len; + offset = addr & (PAGE_SIZE-1); + if (bytes > PAGE_SIZE-offset) + bytes = PAGE_SIZE-offset; + + maddr = kmap_atomic(page, KM_USER_COPY); + +#define HANDLE_TYPE(type) \ + case sizeof(type): *(type *)(maddr+offset) = *(type *)(buf); break; + + if (write) { + switch (bytes) { + HANDLE_TYPE(char); + HANDLE_TYPE(int); + HANDLE_TYPE(long long); + default: + memcpy(maddr + offset, buf, bytes); + } + } else { +#undef HANDLE_TYPE +#define HANDLE_TYPE(type) \ + case sizeof(type): *(type *)(buf) = *(type *)(maddr+offset); break; + switch (bytes) { + HANDLE_TYPE(char); + HANDLE_TYPE(int); + HANDLE_TYPE(long long); + default: + memcpy(buf, maddr + offset, bytes); + } +#undef HANDLE_TYPE + } + kunmap_atomic(maddr, KM_USER_COPY); + unpin_page(page); + len -= bytes; + buf += bytes; + addr += bytes; + } + + return len; +} + +static int str_vm(unsigned long addr, void *buf0, int len, int copy) +{ + struct mm_struct *mm = current->mm ? 
: &init_mm; + struct page *page; + void *buf = buf0; + + if (!len) + return len; + + down_read(&mm->mmap_sem); + /* a get_user_pages() failure returns -EFAULT; otherwise track how much was successfully transferred */ + while (len) { + int bytes, ret, offset, left, copied; + char *maddr; + + ret = get_user_pages(current, mm, addr, 1, copy == 2, 0, &page, NULL); + if (ret <= 0) { + up_read(&mm->mmap_sem); + return -EFAULT; + } + + bytes = len; + offset = addr & (PAGE_SIZE-1); + if (bytes > PAGE_SIZE-offset) + bytes = PAGE_SIZE-offset; + + maddr = kmap_atomic(page, KM_USER_COPY); + if (copy == 2) { + memset(maddr + offset, 0, bytes); + copied = bytes; + left = 0; + } else if (copy == 1) { + left = strncpy_count(buf, maddr + offset, bytes); + copied = bytes - left; + } else { + copied = strnlen(maddr + offset, bytes); + left = bytes - copied; + } + BUG_ON(bytes < 0 || copied < 0); + kunmap_atomic(maddr, KM_USER_COPY); + page_cache_release(page); + len -= copied; + buf += copied; + addr += copied; + if (left) + break; + } + up_read(&mm->mmap_sem); + + return len; +} + +/* + * Copies memory from userspace (ptr) into kernelspace (val). + * + * returns # of bytes not copied. + */ +int get_user_size(unsigned int size, void *val, const void *ptr) +{ + int ret; + + if (unlikely(segment_eq(get_fs(), KERNEL_DS))) { + memcpy(val, ptr, size); + return 0; + } + ret = rw_vm((unsigned long)ptr, val, size, 0); + if (ret) + /* + * Zero the rest: + */ + memset(val + size - ret, 0, ret); + return ret; +} + +/* + * Copies memory from kernelspace (val) into userspace (ptr). + * + * returns # of bytes not copied. 
+ */ +int put_user_size(unsigned int size, const void *val, void *ptr) +{ + if (unlikely(segment_eq(get_fs(), KERNEL_DS))) { + memcpy(ptr, val, size); + return 0; + } + return rw_vm((unsigned long)ptr, (void *)val, size, 1); +} + +int copy_str_fromuser_size(unsigned int size, void *val, const void *ptr) +{ + int copied, left; + + if (unlikely(segment_eq(get_fs(), KERNEL_DS))) { + left = strncpy_count(val, ptr, size); + copied = size - left; + BUG_ON(copied < 0); + + return copied; + } + left = str_vm((unsigned long)ptr, val, size, 1); + if (left < 0) + return left; + copied = size - left; + BUG_ON(copied < 0); + + return copied; +} +EXPORT_SYMBOL(copy_str_fromuser_size); + +int strlen_fromuser_size(unsigned int size, const void *ptr) +{ + int copied, left; + + if (unlikely(segment_eq(get_fs(), KERNEL_DS))) { + copied = strnlen(ptr, size) + 1; + BUG_ON(copied < 0); + + return copied; + } + left = str_vm((unsigned long)ptr, NULL, size, 0); + if (left < 0) + return 0; + copied = size - left + 1; + BUG_ON(copied < 0); + + return copied; +} + +int zero_user_size(unsigned int size, void *ptr) +{ + int left; + + if (unlikely(segment_eq(get_fs(), KERNEL_DS))) { + memset(ptr, 0, size); + return 0; + } + left = str_vm((unsigned long)ptr, NULL, size, 2); + if (left < 0) + return size; + return left; +} + --- linux-2.4.20/mm/Makefile.fourfour 2003-06-04 14:21:58.000000000 -0400 +++ linux-2.4.20/mm/Makefile 2003-06-04 14:21:58.000000000 -0400 @@ -9,7 +9,7 @@ O_TARGET := mm.o -export-objs := shmem.o filemap.o memory.o page_alloc.o mempool.o +export-objs := shmem.o filemap.o memory.o page_alloc.o mempool.o usercopy.o obj-y := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \ vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \ @@ -17,6 +17,7 @@ obj-y := memory.o mmap.o filemap.o mpro shmem.o mempool.o vcache.o rmap.o obj-$(CONFIG_HIGHMEM) += highmem.o +obj-$(CONFIG_X86_UACCESS_INDIRECT) += usercopy.o obj-y += wtd.o include $(TOPDIR)/Rules.make --- 
linux-2.4.20/include/linux/sched.h.fourfour 2003-06-04 14:21:58.000000000 -0400 +++ linux-2.4.20/include/linux/sched.h 2003-06-04 14:21:58.000000000 -0400 @@ -398,9 +398,9 @@ extern struct user_struct root_user; typedef struct prio_array prio_array_t; struct task_struct { - /* - * offsets of these are hardcoded elsewhere - touch with care - */ + + /* --- start of hardcoded fields - touch with care --- */ + volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ unsigned long flags; /* per process flags, defined below */ int sigpending; @@ -418,6 +418,10 @@ struct task_struct { * offset 32 begins here on 32-bit platforms. */ unsigned int cpu; + void *real_stack, *virtual_stack, *user_pgd; + + /* ------- end of hardcoded fields ---------------- */ + int prio, static_prio; struct list_head run_list; prio_array_t *array; @@ -689,6 +693,7 @@ extern struct exec_domain default_exec_d alloc_lock: SPIN_LOCK_UNLOCKED, \ switch_lock: SPIN_LOCK_UNLOCKED, \ journal_info: NULL, \ + real_stack: &tsk, \ } --- linux-2.4.20/include/asm-i386/mmu_context.h.fourfour 2003-06-04 14:21:54.000000000 -0400 +++ linux-2.4.20/include/asm-i386/mmu_context.h 2003-06-04 14:21:58.000000000 -0400 @@ -29,6 +29,10 @@ static inline void enter_lazy_tlb(struct static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk, unsigned cpu) { +#if CONFIG_X86_SWITCH_PAGETABLES + if (tsk->mm) + tsk->user_pgd = (void *)__pa(tsk->mm->pgd); +#endif if (likely(prev != next)) { /* stop flush ipis for the previous mm */ clear_bit(cpu, &prev->cpu_vm_mask); @@ -38,7 +42,9 @@ static inline void switch_mm(struct mm_s #endif set_bit(cpu, &next->cpu_vm_mask); /* Re-load page tables */ +#if !CONFIG_X86_SWITCH_PAGETABLES load_cr3(next->pgd); +#endif /* load_LDT, if either the previous or next thread * has a non-default LDT. */ @@ -54,7 +60,9 @@ static inline void switch_mm(struct mm_s /* We were in lazy tlb mode and leave_mm disabled * tlb flush IPI delivery. 
We must reload %cr3. */ +#if !CONFIG_X86_SWITCH_PAGETABLES load_cr3(next->pgd); +#endif load_LDT(&next->context); } } @@ -62,6 +70,6 @@ static inline void switch_mm(struct mm_s } #define activate_mm(prev, next) \ - switch_mm((prev),(next),NULL,smp_processor_id()) + switch_mm((prev),(next),current,smp_processor_id()) #endif --- linux-2.4.20/include/asm-i386/pgtable.h.fourfour 2003-06-04 14:21:58.000000000 -0400 +++ linux-2.4.20/include/asm-i386/pgtable.h 2003-06-04 14:21:58.000000000 -0400 @@ -112,6 +112,28 @@ extern unsigned long empty_zero_page[102 * newer 3-level PAE-mode page tables. */ #ifndef __ASSEMBLY__ + +extern void set_system_gate(unsigned int n, void *addr); +extern void init_entry_mappings(void); +extern void entry_trampoline_setup(void); + +#if CONFIG_X86_HIGH_ENTRY + +extern char entry_tramp_start, entry_tramp_end; + +/* + * Fix up symbol addresses in the trampoline fixmapped just below 4G. + */ +#define __ENTRY_TRAMP_ADDR(symb) ((void *)(symb) - (void *) &entry_tramp_start + (void *) __fix_to_virt(FIX_ENTRY_TRAMPOLINE)) + +#define ENTRY_TRAMP_ADDR(symb) \ + ({ void *__ret; if ((void *)(symb) >= (void *)&entry_tramp_start && (void *)(symb) < (void *)&entry_tramp_end) __ret = __ENTRY_TRAMP_ADDR(symb); else __ret = (symb); __ret; }) + +#else +# define ENTRY_TRAMP_ADDR(symb) (symb) +# define __ENTRY_TRAMP_ADDR(symb) (symb) +#endif + #if CONFIG_X86_PAE # include @@ -138,7 +160,12 @@ extern void pgtable_cache_init(void); #define PGDIR_SIZE (1UL << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE-1)) -#define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) +#if defined(CONFIG_X86_PAE) && defined(CONFIG_X86_4G_VM_LAYOUT) +# define USER_PTRS_PER_PGD 4 +#else +# define USER_PTRS_PER_PGD ((TASK_SIZE/PGDIR_SIZE) + ((TASK_SIZE % PGDIR_SIZE) + PGDIR_SIZE-1)/PGDIR_SIZE) +#endif + #define FIRST_USER_PGD_NR 0 #define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT) @@ -227,9 +254,22 @@ extern void pgtable_cache_init(void); }) #endif -#define PAGE_KERNEL MAKE_GLOBAL(__PAGE_KERNEL) 
-#define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO) -#define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE) +#if CONFIG_X86_SWITCH_PAGETABLES +# define PAGE_KERNEL __pgprot(__PAGE_KERNEL) +# define PAGE_KERNEL_GLOBAL MAKE_GLOBAL(__PAGE_KERNEL) +# define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO) +# define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE) +# define PAGE_KERNEL_GLOBAL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE) +# define PAGE_KERNEL_PSE __pgprot(_KERNPG_TABLE|_PAGE_PSE) +#else +# define PAGE_KERNEL MAKE_GLOBAL(__PAGE_KERNEL) +# define PAGE_KERNEL_GLOBAL MAKE_GLOBAL(__PAGE_KERNEL) +# define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO) +# define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE) +# define PAGE_KERNEL_GLOBAL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE) +# define PAGE_KERNEL_PSE MAKE_GLOBAL(_KERNPG_TABLE|_PAGE_PSE) +#endif + /* * The i386 can't do page protection for execute, and considers that @@ -265,6 +305,7 @@ extern void pgtable_cache_init(void); extern unsigned long pg0[1024]; #define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE)) +#define pte_user(x) ((x).pte_low & _PAGE_USER) #define pte_clear(xp) do { set_pte(xp, __pte(0)); } while (0) #define pmd_none(x) (!pmd_val(x)) --- linux-2.4.20/include/asm-i386/uaccess.h.fourfour 2003-06-04 14:21:53.000000000 -0400 +++ linux-2.4.20/include/asm-i386/uaccess.h 2003-06-04 14:21:58.000000000 -0400 @@ -24,7 +24,7 @@ #define KERNEL_DS MAKE_MM_SEG(0xFFFFFFFF) -#define USER_DS MAKE_MM_SEG(PAGE_OFFSET) +#define USER_DS MAKE_MM_SEG(PAGE_OFFSET_USER) #define get_ds() (KERNEL_DS) #define get_fs() (current->addr_limit) @@ -128,6 +128,41 @@ struct exception_table_entry /* Returns 0 if exception not found and fixup otherwise. 
*/ extern unsigned long search_exception_table(unsigned long); +extern int get_user_size(unsigned int size, void *val, const void *ptr); +extern int put_user_size(unsigned int size, const void *val, void *ptr); + +# define indirect_get_user(x,ptr) \ +({ int __ret_gu,__val_gu; \ + __typeof__(ptr) __ptr_gu = (ptr); \ + __ret_gu = get_user_size(sizeof(*__ptr_gu), &__val_gu,__ptr_gu) ? -EFAULT : 0;\ + (x) = (__typeof__(*__ptr_gu))__val_gu; \ + __ret_gu; \ +}) +#define indirect_put_user(x,ptr) \ +({ \ + __typeof__(*(ptr)) *__ptr_pu = (ptr), __x_pu = (x); \ + put_user_size(sizeof(*__ptr_pu), &__x_pu, __ptr_pu) ? -EFAULT : 0; \ +}) +#define __indirect_put_user indirect_put_user +#define __indirect_get_user indirect_get_user + +#define indirect_copy_from_user(to,from,n) get_user_size(n,to,from) +#define indirect_copy_to_user(to,from,n) put_user_size(n,from,to) + +#define __indirect_copy_from_user indirect_copy_from_user +#define __indirect_copy_to_user indirect_copy_to_user + +#define indirect_strncpy_from_user(dst, src, count) \ + copy_str_fromuser_size(count, dst, src) + +extern int strlen_fromuser_size(unsigned int size, const void *ptr); +#define indirect_strnlen_user(str, n) strlen_fromuser_size(n, str) +#define indirect_strlen_user(str) indirect_strnlen_user(str, ~0UL >> 1) + +extern int zero_user_size(unsigned int size, void *ptr); + +#define indirect_clear_user(mem, len) zero_user_size(len, mem) +#define __indirect_clear_user clear_user /* * These are the main single-value transfer routines. They automatically @@ -172,7 +207,7 @@ extern void __get_user_4(void); * Returns zero on success, or -EFAULT on error. * On error, the variable @x is set to zero. */ -#define get_user(x,ptr) \ +#define direct_get_user(x,ptr) \ ({ int __ret_gu,__val_gu; \ switch(sizeof (*(ptr))) { \ case 1: __get_user_x(1,__ret_gu,__val_gu,ptr); break; \ @@ -208,7 +243,7 @@ extern void __put_user_bad(void); * * Returns zero on success, or -EFAULT on error. 
*/ -#define put_user(x,ptr) \ +#define direct_put_user(x,ptr) \ __put_user_check((__typeof__(*(ptr)))(x),(ptr),sizeof(*(ptr))) @@ -232,7 +267,7 @@ extern void __put_user_bad(void); * Returns zero on success, or -EFAULT on error. * On error, the variable @x is set to zero. */ -#define __get_user(x,ptr) \ +#define __direct_get_user(x,ptr) \ __get_user_nocheck((x),(ptr),sizeof(*(ptr))) @@ -255,7 +290,7 @@ extern void __put_user_bad(void); * * Returns zero on success, or -EFAULT on error. */ -#define __put_user(x,ptr) \ +#define __direct_put_user(x,ptr) \ __put_user_nocheck((__typeof__(*(ptr)))(x),(ptr),sizeof(*(ptr))) #define __put_user_nocheck(x,ptr,size) \ @@ -708,7 +743,7 @@ __constant_copy_from_user_nocheck(void * * Returns number of bytes that could not be copied. * On success, this will be zero. */ -#define copy_to_user(to,from,n) \ +#define direct_copy_to_user(to,from,n) \ (__builtin_constant_p(n) ? \ __constant_copy_to_user((to),(from),(n)) : \ __generic_copy_to_user((to),(from),(n))) @@ -729,7 +764,7 @@ __constant_copy_from_user_nocheck(void * * If some data could not be copied, this function will pad the copied * data to the requested size using zero bytes. */ -#define copy_from_user(to,from,n) \ +#define direct_copy_from_user(to,from,n) \ (__builtin_constant_p(n) ? \ __constant_copy_from_user((to),(from),(n)) : \ __generic_copy_from_user((to),(from),(n))) @@ -748,7 +783,7 @@ __constant_copy_from_user_nocheck(void * * Returns number of bytes that could not be copied. * On success, this will be zero. */ -#define __copy_to_user(to,from,n) \ +#define __direct_copy_to_user(to,from,n) \ (__builtin_constant_p(n) ? \ __constant_copy_to_user_nocheck((to),(from),(n)) : \ __generic_copy_to_user_nocheck((to),(from),(n))) @@ -770,14 +805,11 @@ __constant_copy_from_user_nocheck(void * * If some data could not be copied, this function will pad the copied * data to the requested size using zero bytes. 
*/ -#define __copy_from_user(to,from,n) \ +#define __direct_copy_from_user(to,from,n) \ (__builtin_constant_p(n) ? \ __constant_copy_from_user_nocheck((to),(from),(n)) : \ __generic_copy_from_user_nocheck((to),(from),(n))) -long strncpy_from_user(char *dst, const char *src, long count); -long __strncpy_from_user(char *dst, const char *src, long count); - /** * strlen_user: - Get the size of a string in user space. * @str: The string to measure. @@ -792,10 +824,68 @@ long __strncpy_from_user(char *dst, cons * If there is a limit on the length of a valid string, you may wish to * consider using strnlen_user() instead. */ -#define strlen_user(str) strnlen_user(str, ~0UL >> 1) +long direct_strncpy_from_user(char *dst, const char *src, long count); +long __direct_strncpy_from_user(char *dst, const char *src, long count); +#define direct_strlen_user(str) direct_strnlen_user(str, ~0UL >> 1) +long direct_strnlen_user(const char *str, long n); +unsigned long direct_clear_user(void *mem, unsigned long len); +unsigned long __direct_clear_user(void *mem, unsigned long len); -long strnlen_user(const char *str, long n); -unsigned long clear_user(void *mem, unsigned long len); -unsigned long __clear_user(void *mem, unsigned long len); +extern int indirect_uaccess; + +#if CONFIG_X86_UACCESS_INDIRECT + +/* + * Return code and zeroing semantics: + + __clear_user 0 <-> bytes not done + clear_user 0 <-> bytes not done + __copy_to_user 0 <-> bytes not done + copy_to_user 0 <-> bytes not done + __copy_from_user 0 <-> bytes not done, zero rest + copy_from_user 0 <-> bytes not done, zero rest + __get_user 0 <-> -EFAULT + get_user 0 <-> -EFAULT + __put_user 0 <-> -EFAULT + put_user 0 <-> -EFAULT + strlen_user strlen + 1 <-> 0 + strnlen_user strlen + 1 (or n+1) <-> 0 + strncpy_from_user strlen (or n) <-> -EFAULT + + */ + +#define __clear_user(mem,len) __indirect_clear_user(mem,len) +#define clear_user(mem,len) indirect_clear_user(mem,len) +#define __copy_to_user(to,from,n) 
__indirect_copy_to_user(to,from,n) +#define copy_to_user(to,from,n) indirect_copy_to_user(to,from,n) +#define __copy_from_user(to,from,n) __indirect_copy_from_user(to,from,n) +#define copy_from_user(to,from,n) indirect_copy_from_user(to,from,n) +#define __get_user(val,ptr) __indirect_get_user(val,ptr) +#define get_user(val,ptr) indirect_get_user(val,ptr) +#define __put_user(val,ptr) __indirect_put_user(val,ptr) +#define put_user(val,ptr) indirect_put_user(val,ptr) +#define strlen_user(str) indirect_strlen_user(str) +#define strnlen_user(src,count) indirect_strnlen_user(src,count) +#define strncpy_from_user(dst,src,count) \ + indirect_strncpy_from_user(dst,src,count) + +#else + +#define __clear_user __direct_clear_user +#define clear_user direct_clear_user +#define __copy_to_user __direct_copy_to_user +#define copy_to_user direct_copy_to_user +#define __copy_from_user __direct_copy_from_user +#define copy_from_user direct_copy_from_user +#define __get_user __direct_get_user +#define get_user direct_get_user +#define __put_user __direct_put_user +#define put_user direct_put_user +#define strlen_user direct_strlen_user +#define strnlen_user direct_strnlen_user +#define strncpy_from_user direct_strncpy_from_user + +#endif /* CONFIG_X86_UACCESS_INDIRECT */ #endif /* __i386_UACCESS_H */ + --- linux-2.4.20/include/asm-i386/string.h.fourfour 2001-11-22 14:46:18.000000000 -0500 +++ linux-2.4.20/include/asm-i386/string.h 2003-06-04 14:21:58.000000000 -0400 @@ -62,6 +62,29 @@ __asm__ __volatile__( return dest; } +/* + * This is a more generic variant of strncpy() suitable for + * implementing string-access routines with all sorts of return + * code semantics. It's used by mm/usercopy.c. 
+ */ +static inline size_t strncpy_count(char * dest,const char *src,size_t count) +{ + __asm__ __volatile__( + + "1:\tdecl %0\n\t" + "js 2f\n\t" + "lodsb\n\t" + "stosb\n\t" + "testb %%al,%%al\n\t" + "jne 1b\n\t" + "2:" + "incl %0" + : "=c" (count) + :"S" (src),"D" (dest),"0" (count) : "memory"); + + return count; +} + #define __HAVE_ARCH_STRCAT static inline char * strcat(char * dest,const char * src) { --- linux-2.4.20/include/asm-i386/checksum.h.fourfour 2002-08-02 20:39:45.000000000 -0400 +++ linux-2.4.20/include/asm-i386/checksum.h 2003-06-04 14:21:58.000000000 -0400 @@ -24,7 +24,7 @@ asmlinkage unsigned int csum_partial(con * better 64-bit) boundary */ -asmlinkage unsigned int csum_partial_copy_generic( const char *src, char *dst, int len, int sum, +asmlinkage unsigned int direct_csum_partial_copy_generic( const char *src, char *dst, int len, int sum, int *src_err_ptr, int *dst_err_ptr); /* @@ -38,14 +38,19 @@ static __inline__ unsigned int csum_partial_copy_nocheck ( const char *src, char *dst, int len, int sum) { - return csum_partial_copy_generic ( src, dst, len, sum, NULL, NULL); + /* + * The direct function is OK for kernel-space => kernel-space copies: + */ + return direct_csum_partial_copy_generic ( src, dst, len, sum, NULL, NULL); } static __inline__ unsigned int csum_partial_copy_from_user ( const char *src, char *dst, int len, int sum, int *err_ptr) { - return csum_partial_copy_generic ( src, dst, len, sum, err_ptr, NULL); + if (copy_from_user(dst, src, len)) + *err_ptr = -EFAULT; + return csum_partial(dst, len, sum); } /* @@ -180,11 +185,26 @@ static __inline__ unsigned short int csu * Copy and checksum to user */ #define HAVE_CSUM_COPY_USER -static __inline__ unsigned int csum_and_copy_to_user(const char *src, char *dst, +static __inline__ unsigned int direct_csum_and_copy_to_user(const char *src, char *dst, int len, int sum, int *err_ptr) { if (access_ok(VERIFY_WRITE, dst, len)) - return csum_partial_copy_generic(src, dst, len, sum, NULL, 
err_ptr); + return direct_csum_partial_copy_generic(src, dst, len, sum, NULL, err_ptr); + + if (len) + *err_ptr = -EFAULT; + + return -1; /* invalid checksum */ +} + +static __inline__ unsigned int csum_and_copy_to_user(const char *src, char *dst, + int len, int sum, int *err_ptr) +{ + if (access_ok(VERIFY_WRITE, dst, len)) { + if (copy_to_user(dst, src, len)) + *err_ptr = -EFAULT; + return csum_partial(src, len, sum); + } if (len) *err_ptr = -EFAULT; --- linux-2.4.20/include/asm-i386/hw_irq.h.fourfour 2003-06-04 14:21:54.000000000 -0400 +++ linux-2.4.20/include/asm-i386/hw_irq.h 2003-06-04 14:21:58.000000000 -0400 @@ -98,7 +98,7 @@ extern char _stext, _etext; #define __STR(x) #x #define STR(x) __STR(x) -#define SAVE_ALL \ +#define __SAVE_ALL \ "cld\n\t" \ "pushl %es\n\t" \ "pushl %ds\n\t" \ @@ -113,6 +113,36 @@ extern char _stext, _etext; "movl %edx,%ds\n\t" \ "movl %edx,%es\n\t" +#if CONFIG_X86_SWITCH_PAGETABLES +# define __SWITCH_TO_KERNEL_PGD \ + "movl %cr3, %edx; \ + cmpl $swapper_pg_dir-" STR(__PAGE_OFFSET) ", %edx; \ + jz 1f; \ + movl $swapper_pg_dir-" STR(__PAGE_OFFSET) ", %edx; \ + movl %edx, %cr3; \ +1: " +#else +# define __SWITCH_TO_KERNEL_PGD +#endif + +/* NOTE: we rely on real_stack having offset 36 below: */ + +#define SAVE_ALL_SWITCH \ + __SAVE_ALL \ + \ + __SWITCH_TO_KERNEL_PGD \ + \ + " \ + /* load the real stack - keep the offset */ \ + \ + movl $-8192, %ebx; \ + andl %esp, %ebx; \ + movl 36(%ebx), %edx; \ + movl %esp, %ebx; \ + andl $0x1fff, %ebx; \ + orl %ebx, %edx; \ + movl %edx, %esp;" + #define IRQ_NAME2(nr) nr##_interrupt(void) #define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr) @@ -131,39 +161,39 @@ extern char _stext, _etext; asmlinkage void x(void); \ asmlinkage void call_##x(void); \ __asm__( \ -"\n"__ALIGN_STR"\n" \ +".section .entry.text,\"ax\"\n"__ALIGN_STR"\n" \ SYMBOL_NAME_STR(x) ":\n\t" \ "pushl $"#v"-256\n\t" \ - SAVE_ALL \ + SAVE_ALL_SWITCH \ SYMBOL_NAME_STR(call_##x)":\n\t" \ - "call "SYMBOL_NAME_STR(smp_##x)"\n\t" \ - "jmp 
ret_from_intr\n"); + "movl $"SYMBOL_NAME_STR(smp_##x)", %ebp; call *%ebp\n\t" \ + "jmp ret_from_intr; .previous\n"); #define BUILD_SMP_TIMER_INTERRUPT(x,v) XBUILD_SMP_TIMER_INTERRUPT(x,v) #define XBUILD_SMP_TIMER_INTERRUPT(x,v) \ asmlinkage void x(struct pt_regs * regs); \ asmlinkage void call_##x(void); \ __asm__( \ -"\n"__ALIGN_STR"\n" \ +".section .entry.text,\"ax\"\n"__ALIGN_STR"\n" \ SYMBOL_NAME_STR(x) ":\n\t" \ "pushl $"#v"-256\n\t" \ - SAVE_ALL \ + SAVE_ALL_SWITCH \ "movl %esp,%eax\n\t" \ "pushl %eax\n\t" \ SYMBOL_NAME_STR(call_##x)":\n\t" \ - "call "SYMBOL_NAME_STR(smp_##x)"\n\t" \ + "movl $"SYMBOL_NAME_STR(smp_##x)", %ebp; call *%ebp\n\t" \ "addl $4,%esp\n\t" \ - "jmp ret_from_intr\n"); + "jmp ret_from_intr; .previous\n"); #define BUILD_COMMON_IRQ() \ asmlinkage void call_do_IRQ(void); \ __asm__( \ - "\n" __ALIGN_STR"\n" \ + ".section .entry.text,\"ax\"\n" __ALIGN_STR"\n" \ "common_interrupt:\n\t" \ - SAVE_ALL \ + SAVE_ALL_SWITCH \ SYMBOL_NAME_STR(call_do_IRQ)":\n\t" \ - "call " SYMBOL_NAME_STR(do_IRQ) "\n\t" \ - "jmp ret_from_intr\n"); + "movl $"SYMBOL_NAME_STR(do_IRQ)", %ebp; call *%ebp\n\t" \ + "jmp ret_from_intr; .previous\n"); /* * subtle. orig_eax is used by the signal code to distinct between @@ -178,10 +208,10 @@ __asm__( \ #define BUILD_IRQ(nr) \ asmlinkage void IRQ_NAME(nr); \ __asm__( \ -"\n"__ALIGN_STR"\n" \ +".section .entry.text,\"ax\"\n"__ALIGN_STR"\n" \ SYMBOL_NAME_STR(IRQ) #nr "_interrupt:\n\t" \ "pushl $"#nr"-256\n\t" \ - "jmp common_interrupt"); + "jmp common_interrupt; .previous"); /* * The profiling function is SMP safe. 
(nothing can mess --- linux-2.4.20/include/asm-i386/processor.h.fourfour 2003-06-04 14:21:54.000000000 -0400 +++ linux-2.4.20/include/asm-i386/processor.h 2003-06-04 14:21:58.000000000 -0400 @@ -82,6 +82,7 @@ struct cpuinfo_x86 { extern struct cpuinfo_x86 boot_cpu_data; extern struct tss_struct init_tss[NR_CPUS]; +extern struct tss_struct doublefault_tss; #ifdef CONFIG_SMP extern struct cpuinfo_x86 cpu_data[]; @@ -279,7 +280,7 @@ extern unsigned int mca_pentium_flag; /* * User space process size: 3GB (default). */ -#define TASK_SIZE (PAGE_OFFSET) +#define TASK_SIZE (PAGE_OFFSET_USER) /* This decides where the kernel will search for a free chunk of vm * space during mmap's. @@ -379,6 +380,7 @@ struct tss_struct { struct thread_struct { /* cached TLS descriptors. */ struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES]; + void *stack_page0, *stack_page1; unsigned long esp0; unsigned long eip; unsigned long esp; @@ -402,6 +404,7 @@ struct thread_struct { #define INIT_THREAD { \ { { 0, 0 } , }, \ + 0, 0, \ 0, \ 0, 0, 0, 0, \ { [0 ... 7] = 0 }, /* debugging registers */ \ @@ -457,6 +460,12 @@ extern int arch_kernel_thread(int (*fn)( static inline void copy_segments(struct task_struct *p, struct mm_struct * mm) { } static inline void release_segments(struct mm_struct * mm) { } +#if CONFIG_X86_HIGH_ENTRY +#define virtual_esp0(tsk) \ + ((unsigned long)(tsk)->virtual_stack + ((tsk)->thread.esp0 - (unsigned long)(tsk)->real_stack)) +#else +# define virtual_esp0(tsk) ((tsk)->thread.esp0) +#endif /* * Return saved PC of a blocked thread. 
*/ --- linux-2.4.20/include/asm-i386/page.h.fourfour 2003-06-04 14:21:58.000000000 -0400 +++ linux-2.4.20/include/asm-i386/page.h 2003-06-04 14:21:58.000000000 -0400 @@ -7,10 +7,11 @@ #define PAGE_MASK (~(PAGE_SIZE-1)) #ifdef __KERNEL__ -#ifndef __ASSEMBLY__ #include +#ifndef __ASSEMBLY__ + #ifdef CONFIG_X86_USE_3DNOW #include @@ -76,9 +77,18 @@ typedef struct { unsigned long pgprot; } * * If you want more physical memory than this then see the CONFIG_HIGHMEM4G * and CONFIG_HIGHMEM64G options in the kernel configuration. + * + * Note: on PAE the kernel must never go below 32 MB, we use the + * first 8 entries of the 2-level boot pgd for PAE magic. */ -#define __PAGE_OFFSET (0xC0000000) +#if CONFIG_X86_4G_VM_LAYOUT +# define __PAGE_OFFSET (0x02000000) +# define __PAGE_OFFSET_USER (0xff000000) +#else +# define __PAGE_OFFSET (0xc0000000) +# define __PAGE_OFFSET_USER (0xc0000000) +#endif /* * This much address space is reserved for vmalloc() and iomap() @@ -126,6 +136,7 @@ static __inline__ int get_order(unsigned #endif /* __ASSEMBLY__ */ #define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET) +#define PAGE_OFFSET_USER ((unsigned long)__PAGE_OFFSET_USER) #define VMALLOC_RESERVE ((unsigned long)__VMALLOC_RESERVE) #define __MAXMEM (-__PAGE_OFFSET-__VMALLOC_RESERVE) #define MAXMEM ((unsigned long)(-PAGE_OFFSET-VMALLOC_RESERVE)) --- linux-2.4.20/include/asm-i386/fixmap.h.fourfour 2003-06-04 14:21:58.000000000 -0400 +++ linux-2.4.20/include/asm-i386/fixmap.h 2003-06-04 14:21:58.000000000 -0400 @@ -17,17 +17,15 @@ #include #include #include -#ifdef CONFIG_HIGHMEM #include #include -#endif /* * Here we define all the compile-time 'special' virtual * addresses. The point is to have a constant address at * compile time, but to set the physical address only * in the boot process. We allocate these special addresses - * from the end of virtual memory (0xfffff000) backwards. + * from the end of virtual memory (0xffffe000) backwards. 
* Also this lets us do fail-safe vmalloc(), we * can guarantee that these special addresses and * vmalloc()-ed addresses never overlap. @@ -51,6 +49,8 @@ enum fixed_addresses { FIX_HOLE, #ifdef CONFIG_X86_LOCAL_APIC FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */ +#else + FIX_VSTACK_HOLE_1, #endif #ifdef CONFIG_X86_IO_APIC FIX_IO_APIC_BASE_0, @@ -62,16 +62,18 @@ enum fixed_addresses { FIX_LI_PCIA, /* Lithium PCI Bridge A */ FIX_LI_PCIB, /* Lithium PCI Bridge B */ #endif -#ifndef CONFIG_X86_F00F_WORKS_OK - FIX_F00F, -#endif + FIX_IDT, + FIX_GDT_1, + FIX_GDT_0, + FIX_TSS_1, + FIX_TSS_0, + FIX_ENTRY_TRAMPOLINE, #ifdef CONFIG_X86_SUMMIT FIX_CYCLONE_TIMER, /*cyclone timer register*/ + FIX_VSTACK_HOLE_3, #endif -#ifdef CONFIG_HIGHMEM FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, -#endif __end_of_permanent_fixed_addresses, /* temporary boot-time mappings, used before ioremap() is functional */ #define NR_FIX_BTMAPS 16 @@ -84,19 +86,22 @@ extern void __set_fixmap (enum fixed_add unsigned long phys, pgprot_t flags); #define set_fixmap(idx, phys) \ - __set_fixmap(idx, phys, PAGE_KERNEL) + __set_fixmap(idx, phys, PAGE_KERNEL_GLOBAL) /* * Some hardware wants to get fixmapped without caching. */ #define set_fixmap_nocache(idx, phys) \ - __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE) + __set_fixmap(idx, phys, PAGE_KERNEL_GLOBAL_NOCACHE) /* - * used by vmalloc.c. + * used by vmalloc.c and various other places. * * Leave one empty page between vmalloc'ed areas and * the start of the fixmap. + * + * IMPORTANT: dont change FIXADDR_TOP without adjusting KM_VSTACK0 + * and KM_VSTACK1 so that the virtual stack is 8K aligned. 
*/ -#define FIXADDR_TOP (0xfffff000UL) +#define FIXADDR_TOP (0xffffe000UL) #define __FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT) #define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE) --- linux-2.4.20/include/asm-i386/kmap_types.h.fourfour 2003-06-04 14:21:58.000000000 -0400 +++ linux-2.4.20/include/asm-i386/kmap_types.h 2003-06-04 14:21:58.000000000 -0400 @@ -2,7 +2,18 @@ #define _ASM_KMAP_TYPES_H enum km_type { + /* + * IMPORTANT: don't move these 3 entries, the virtual stack + * must be 8K aligned. + */ KM_BOUNCE_READ, + KM_VSTACK1, + KM_VSTACK0, + + KM_LDT_PAGE15, + KM_LDT_PAGE0 = KM_LDT_PAGE15 + 16-1, + KM_USER_COPY, + KM_VSTACK_HOLE, KM_SKB_SUNRPC_DATA, KM_SKB_DATA_SOFTIRQ, KM_USER0, --- linux-2.4.20/include/asm-i386/segment.h.fourfour 2003-06-04 14:21:54.000000000 -0400 +++ linux-2.4.20/include/asm-i386/segment.h 2003-06-04 14:21:58.000000000 -0400 @@ -62,10 +62,12 @@ #define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 4) #define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 9) +#define GDT_ENTRY_DOUBLEFAULT_TSS 31 + /* - * The GDT has 21 entries but we pad it to cacheline boundary: + * The GDT has 32 entries: */ -#define GDT_ENTRIES 24 +#define GDT_ENTRIES 32 #define GDT_SIZE (GDT_ENTRIES * 8) --- linux-2.4.20/include/asm-i386/highmem.h.fourfour 2003-06-04 14:21:58.000000000 -0400 +++ linux-2.4.20/include/asm-i386/highmem.h 2003-06-04 14:21:58.000000000 -0400 @@ -26,17 +26,11 @@ #include #include -#ifdef CONFIG_DEBUG_HIGHMEM -#define HIGHMEM_DEBUG 1 -#else -#define HIGHMEM_DEBUG 0 -#endif +#include /* declarations for highmem.c */ extern unsigned long highstart_pfn, highend_pfn; -extern pte_t *kmap_pte; -extern pgprot_t kmap_prot; extern pte_t *pkmap_page_table; extern void kmap_init(void) __init; @@ -46,7 +40,7 @@ extern void kmap_init(void) __init; * easily, subsequent pte tables have to be allocated in one physical * chunk of RAM.
*/ -#define PKMAP_BASE (0xff800000UL) +#define PKMAP_BASE (0xff000000UL) #ifdef CONFIG_X86_PAE #define LAST_PKMAP 512 #else @@ -80,50 +74,21 @@ static inline void kunmap(struct page *p kunmap_high(page); } -/* - * The use of kmap_atomic/kunmap_atomic is discouraged - kmap/kunmap - * gives a more generic (and caching) interface. But kmap_atomic can - * be used in IRQ contexts, so in some (very limited) cases we need - * it. - */ static inline void *kmap_atomic(struct page *page, enum km_type type) { - enum fixed_addresses idx; - unsigned long vaddr; - if (page < highmem_start_page) return page_address(page); - - idx = type + KM_TYPE_NR*smp_processor_id(); - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); -#if HIGHMEM_DEBUG - if (!pte_none(*(kmap_pte-idx))) - out_of_line_bug(); -#endif - set_pte(kmap_pte-idx, mk_pte(page, kmap_prot)); - __flush_tlb_one(vaddr); - - return (void*) vaddr; + return __kmap_atomic(page, type); } static inline void kunmap_atomic(void *kvaddr, enum km_type type) { #if HIGHMEM_DEBUG unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); if (vaddr < FIXADDR_START) // FIXME return; - - if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx)) - out_of_line_bug(); - - /* - * force other mappings to Oops if they'll try to access - * this pte without first remap it - */ - pte_clear(kmap_pte-idx); - __flush_tlb_one(vaddr); + __kunmap_atomic(kvaddr, type); #endif } @@ -140,7 +105,6 @@ static inline struct page *kmap_atomic_t return pte_page(*pte); } - #endif /* __KERNEL__ */ #endif /* _ASM_HIGHMEM_H */ --- linux-2.4.20/include/asm-i386/desc.h.fourfour 2003-06-04 14:21:54.000000000 -0400 +++ linux-2.4.20/include/asm-i386/desc.h 2003-06-04 14:21:58.000000000 -0400 @@ -17,6 +17,9 @@ struct Xgt_desc_struct { extern struct Xgt_desc_struct idt_descr, cpu_gdt_descr[NR_CPUS]; +extern void trap_init_virtual_IDT(void); +extern void trap_init_virtual_GDT(void); + #define load_TR_desc() __asm__ 
__volatile__("ltr %%ax"::"a" (GDT_ENTRY_TSS*8)) #define load_LDT_desc() __asm__ __volatile__("lldt %%ax"::"a" (GDT_ENTRY_LDT*8)) @@ -26,6 +29,7 @@ extern struct Xgt_desc_struct idt_descr, */ extern struct desc_struct default_ldt[]; extern void set_intr_gate(unsigned int irq, void * addr); +extern void set_trap_gate(unsigned int n, void *addr); #define _set_tssldt_desc(n,addr,limit,type) \ __asm__ __volatile__ ("movw %w3,0(%2)\n\t" \ @@ -38,11 +42,14 @@ __asm__ __volatile__ ("movw %w3,0(%2)\n\ "rorl $16,%%eax" \ : "=m"(*(n)) : "a" (addr), "r"(n), "ir"(limit), "i"(type)) -static inline void set_tss_desc(unsigned int cpu, void *addr) + +static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, void *addr) { - _set_tssldt_desc(&cpu_gdt_table[cpu][GDT_ENTRY_TSS], (int)addr, 235, 0x89); + _set_tssldt_desc(&cpu_gdt_table[cpu][entry], (int)addr, 235, 0x89); } +#define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr) + static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int size) { _set_tssldt_desc(&cpu_gdt_table[cpu][GDT_ENTRY_LDT], (int)addr, ((size << 3)-1), 0x82); @@ -84,29 +91,8 @@ static inline void load_TLS(struct threa #undef C } -static inline void clear_LDT(void) -{ - set_ldt_desc(smp_processor_id(), &default_ldt[0], 5); - load_LDT_desc(); -} - -/* - * load one particular LDT into the current CPU - */ -static inline void load_LDT (mm_context_t *pc) -{ - void *segments = pc->ldt; - int count = pc->size; - - if (!count) { - segments = &default_ldt[0]; - count = 5; - } - - set_ldt_desc(smp_processor_id(), segments, count); - load_LDT_desc(); -} +extern struct page *default_ldt_page; +extern void load_LDT(mm_context_t *pc); #endif /* !__ASSEMBLY__ */ - #endif --- linux-2.4.20/include/asm-i386/mmu.h.fourfour 2003-06-04 14:21:53.000000000 -0400 +++ linux-2.4.20/include/asm-i386/mmu.h 2003-06-04 14:21:58.000000000 -0400 @@ -7,10 +7,13 @@ * * cpu_vm_mask is used to optimize ldt flushing. 
*/ + +#define MAX_LDT_PAGES 16 + typedef struct { int size; struct semaphore sem; - void * ldt; + struct page *ldt_pages[MAX_LDT_PAGES]; } mm_context_t; #endif --- /dev/null 2003-01-30 05:24:37.000000000 -0500 +++ linux-2.4.20/include/asm-i386/atomic_kmap.h 2003-06-04 14:21:58.000000000 -0400 @@ -0,0 +1,94 @@ +/* + * atomic_kmap.h: temporary virtual kernel memory mappings + * + * Copyright (C) 2003 Ingo Molnar + */ + +#ifndef _ASM_ATOMIC_KMAP_H +#define _ASM_ATOMIC_KMAP_H + +#ifdef __KERNEL__ + +#include + +#ifdef CONFIG_DEBUG_HIGHMEM +#define HIGHMEM_DEBUG 1 +#else +#define HIGHMEM_DEBUG 0 +#endif + +extern pte_t *kmap_pte; +#define kmap_prot PAGE_KERNEL_GLOBAL + +static inline unsigned long __kmap_atomic_vaddr(enum km_type type) +{ + enum fixed_addresses idx; + + idx = type + KM_TYPE_NR*smp_processor_id(); + return __fix_to_virt(FIX_KMAP_BEGIN + idx); +} + +static inline void *__kmap_atomic_noflush(struct page *page, enum km_type type) +{ + enum fixed_addresses idx; + unsigned long vaddr; + + idx = type + KM_TYPE_NR*smp_processor_id(); + vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + /* + * NOTE: entries that rely on some secondary TLB-flush + * effect must not be global: + */ + set_pte(kmap_pte-idx, mk_pte(page, PAGE_KERNEL)); + + return (void*) vaddr; +} + +static inline void *__kmap_atomic(struct page *page, enum km_type type) +{ + enum fixed_addresses idx; + unsigned long vaddr; + + idx = type + KM_TYPE_NR*smp_processor_id(); + vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); +#if HIGHMEM_DEBUG + if (!pte_none(*(kmap_pte-idx))) + out_of_line_bug(); +#else + /* + * Performance optimization - do not flush if the new + * pte is the same as the old one: + */ + if (pte_val(*(kmap_pte-idx)) == pte_val(mk_pte(page, kmap_prot))) + return (void *) vaddr; +#endif + set_pte(kmap_pte-idx, mk_pte(page, kmap_prot)); + __flush_tlb_one(vaddr); + + return (void*) vaddr; +} + +static inline void __kunmap_atomic(void *kvaddr, enum km_type type) +{ +#if HIGHMEM_DEBUG + unsigned long 
vaddr = (unsigned long) kvaddr & PAGE_MASK; + enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); + + if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx)) + out_of_line_bug(); + + /* + * force other mappings to Oops if they'll try to access + * this pte without first remap it + */ + pte_clear(kmap_pte-idx); + __flush_tlb_one(vaddr); +#endif +} + +#define __kunmap_atomic_type(type) \ + __kunmap_atomic((void *)__kmap_atomic_vaddr(type), (type)) + +#endif /* __KERNEL__ */ + +#endif /* _ASM_ATOMIC_KMAP_H */ --- linux-2.4.20/arch/i386/mm/init.c.fourfour 2003-06-04 14:21:58.000000000 -0400 +++ linux-2.4.20/arch/i386/mm/init.c 2003-06-04 14:21:58.000000000 -0400 @@ -37,6 +37,7 @@ #include #include #include +#include mmu_gather_t mmu_gathers[NR_CPUS]; unsigned long highstart_pfn, highend_pfn; @@ -74,9 +75,7 @@ int do_check_pgt_cache(int low, int high * around without checking the pgd every time. */ -#if CONFIG_HIGHMEM pte_t *kmap_pte; -pgprot_t kmap_prot; #define kmap_get_fixmap_pte(vaddr) \ pte_offset_kernel(pmd_offset(pgd_offset_k(vaddr), (vaddr)), (vaddr)) @@ -88,198 +87,191 @@ void __init kmap_init(void) /* cache the first kmap pte */ kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); kmap_pte = kmap_get_fixmap_pte(kmap_vstart); - - kmap_prot = PAGE_KERNEL; } -#endif /* CONFIG_HIGHMEM */ /* References to section boundaries */ extern char _text, _etext, _edata, __bss_start, _end; extern char __init_begin, __init_end; -static inline void set_pte_phys (unsigned long vaddr, - unsigned long phys, pgprot_t flags) +static __init void prepare_pagetables(pgd_t *pgd_base, unsigned long address) { pgd_t *pgd; pmd_t *pmd; pte_t *pte; - pgd = swapper_pg_dir + __pgd_offset(vaddr); - if (pgd_none(*pgd)) { - printk("PAE BUG #00!\n"); - return; - } - pmd = pmd_offset(pgd, vaddr); - if (pmd_none(*pmd)) { - printk("PAE BUG #01!\n"); - return; + pgd = pgd_base + __pgd_offset(address); + pmd = pmd_offset(pgd, address); + if (!pmd_present(*pmd)) { + pte = (pte_t *) 
alloc_bootmem_low_pages(PAGE_SIZE); + set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte))); } - pte = pte_offset_kernel(pmd, vaddr); - /* stored as-is, to permit clearing entries */ - set_pte(pte, mk_pte_phys(phys, flags)); - - /* - * It's enough to flush this one mapping. - * (PGE mappings get flushed as well) - */ - __flush_tlb_one(vaddr); } + static void __init fixrange_init (unsigned long start, unsigned long end, pgd_t *pgd_base) { - pgd_t *pgd; - pmd_t *pmd; - pte_t *pte; - int i, j; unsigned long vaddr; - vaddr = start; - i = __pgd_offset(vaddr); - j = __pmd_offset(vaddr); - pgd = pgd_base + i; - - for ( ; (i < PTRS_PER_PGD) && (vaddr != end); pgd++, i++) { -#if CONFIG_X86_PAE - if (pgd_none(*pgd)) { - pmd = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); - set_pgd(pgd, __pgd(__pa(pmd) + 0x1)); - if (pmd != pmd_offset(pgd, 0)) - printk("PAE BUG #02!\n"); - } - pmd = pmd_offset(pgd, vaddr); -#else - pmd = (pmd_t *)pgd; -#endif - for (; (j < PTRS_PER_PMD) && (vaddr != end); pmd++, j++) { - if (pmd_none(*pmd)) { - pte = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); - set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte))); - if (pte != pte_offset_kernel(pmd, 0)) - BUG(); - } - vaddr += PMD_SIZE; - } - j = 0; - } + for (vaddr = start; vaddr != end; vaddr += PAGE_SIZE) + prepare_pagetables(pgd_base, vaddr); } -static void __init pagetable_init (void) +static void setup_identity_mappings(pgd_t *pgd_base, unsigned long start, unsigned long end) { - unsigned long vaddr, end; - pgd_t *pgd, *pgd_base; + unsigned long vaddr; + pgd_t *pgd; int i, j, k; pmd_t *pmd; pte_t *pte, *pte_base; - /* - * This can be zero as well - no problem, in that case we exit - * the loops anyway due to the PTRS_PER_* conditions. 
- */ - end = (unsigned long)__va(max_low_pfn*PAGE_SIZE); + pgd = pgd_base; - pgd_base = swapper_pg_dir; -#if CONFIG_X86_PAE - for (i = 0; i < PTRS_PER_PGD; i++) - set_pgd(pgd_base + i, __pgd(1 + __pa(empty_zero_page))); -#endif - i = __pgd_offset(PAGE_OFFSET); - pgd = pgd_base + i; - - for (; i < PTRS_PER_PGD; pgd++, i++) { + for (i = 0; i < PTRS_PER_PGD; pgd++, i++) { vaddr = i*PGDIR_SIZE; if (end && (vaddr >= end)) break; -#if CONFIG_X86_PAE - pmd = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); - set_pgd(pgd, __pgd(__pa(pmd) + 0x1)); -#else - pmd = (pmd_t *)pgd; -#endif - if (pmd != pmd_offset(pgd, 0)) - BUG(); + pmd = pmd_offset(pgd, 0); for (j = 0; j < PTRS_PER_PMD; pmd++, j++) { vaddr = i*PGDIR_SIZE + j*PMD_SIZE; if (end && (vaddr >= end)) break; + if (vaddr < start) + continue; if (cpu_has_pse) { unsigned long __pe; set_in_cr4(X86_CR4_PSE); boot_cpu_data.wp_works_ok = 1; - __pe = _KERNPG_TABLE + _PAGE_PSE + __pa(vaddr); + __pe = _KERNPG_TABLE + _PAGE_PSE + vaddr - start; /* Make it "global" too if supported */ if (cpu_has_pge) { set_in_cr4(X86_CR4_PGE); +#if !CONFIG_X86_SWITCH_PAGETABLES __pe += _PAGE_GLOBAL; +#endif } set_pmd(pmd, __pmd(__pe)); continue; } pte_base = pte = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); - for (k = 0; k < PTRS_PER_PTE; pte++, k++) { vaddr = i*PGDIR_SIZE + j*PMD_SIZE + k*PAGE_SIZE; if (end && (vaddr >= end)) break; - *pte = mk_pte_phys(__pa(vaddr), PAGE_KERNEL); + if (vaddr < start) + continue; + *pte = mk_pte_phys(vaddr-start, PAGE_KERNEL); } set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(pte_base))); - if (pte_base != pte_offset_kernel(pmd, 0)) - BUG(); + } + } +} + +/* + * Clear kernel pagetables in a PMD_SIZE-aligned range. 
+ */ +static void clear_mappings(pgd_t *pgd_base, unsigned long start, unsigned long end) +{ + unsigned long vaddr; + pgd_t *pgd; + pmd_t *pmd; + int i, j; + + pgd = pgd_base; + for (i = 0; i < PTRS_PER_PGD; pgd++, i++) { + vaddr = i*PGDIR_SIZE; + if (end && (vaddr >= end)) + break; + pmd = pmd_offset(pgd, 0); + for (j = 0; j < PTRS_PER_PMD; pmd++, j++) { + vaddr = i*PGDIR_SIZE + j*PMD_SIZE; + if (end && (vaddr >= end)) + break; + if (vaddr < start) + continue; + pmd_clear(pmd); } } + flush_tlb_all(); +} + +static void __init pagetable_init (void) +{ + unsigned long vaddr, end; + pgd_t *pgd_base; + int i; /* - * Fixed mappings, only the page table structure has to be - * created - mappings will be set by set_fixmap(): + * This can be zero as well - no problem, in that case we exit + * the loops anyway due to the PTRS_PER_* conditions. */ - vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; - fixrange_init(vaddr, 0, pgd_base); + end = (unsigned long)__va(max_low_pfn*PAGE_SIZE); -#if CONFIG_HIGHMEM + pgd_base = swapper_pg_dir; +#if CONFIG_X86_PAE /* - * Permanent kmaps: + * It causes too many problems if there's no proper pmd set up + * for all 4 entries of the PGD - so we allocate all of them. + * PAE systems will not miss this extra 4-8K anyway ... */ - vaddr = PKMAP_BASE; - fixrange_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); - - pgd = swapper_pg_dir + __pgd_offset(vaddr); - pmd = pmd_offset(pgd, vaddr); - pte = pte_offset_kernel(pmd, vaddr); - pkmap_page_table = pte; + for (i = 0; i < PTRS_PER_PGD; i++) { + pmd_t *pmd = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); + set_pgd(pgd_base + i, __pgd(__pa(pmd) + 0x1)); + } #endif + /* + * Set up lowmem-sized identity mappings at PAGE_OFFSET: + */ + setup_identity_mappings(pgd_base, PAGE_OFFSET, end); -#if CONFIG_X86_PAE /* - * Add low memory identity-mappings - SMP needs it when - * starting up on an AP from real-mode. In the non-PAE - * case we already have these mappings through head.S. 
+ * Add flat-mode identity-mappings - SMP needs it when + * starting up on an AP from real-mode. (In the non-PAE + * case we already have these mappings through head.S.) * All user-space mappings are explicitly cleared after * SMP startup. */ - pgd_base[0] = pgd_base[USER_PTRS_PER_PGD]; +#if CONFIG_SMP && CONFIG_X86_PAE + setup_identity_mappings(pgd_base, 0, 16*1024*1024); +#endif + + /* + * Fixed mappings, only the page table structure has to be + * created - mappings will be set by set_fixmap(): + */ + vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; + fixrange_init(vaddr, 0, pgd_base); + +#if CONFIG_HIGHMEM + { + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + + /* + * Permanent kmaps: + */ + vaddr = PKMAP_BASE; + fixrange_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); + + pgd = swapper_pg_dir + __pgd_offset(vaddr); + pmd = pmd_offset(pgd, vaddr); + pte = pte_offset_kernel(pmd, vaddr); + pkmap_page_table = pte; + } #endif } -void __init zap_low_mappings (void) +void __init zap_low_mappings(void) { - int i; + printk("zapping low mappings.\n"); /* * Zap initial low-memory mappings. - * - * Note that "pgd_clear()" doesn't do it for - * us, because pgd_clear() is a no-op on i386. 
*/ - for (i = 0; i < USER_PTRS_PER_PGD; i++) -#if CONFIG_X86_PAE - set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page))); -#else - set_pgd(swapper_pg_dir+i, __pgd(0)); -#endif - flush_tlb_all(); + clear_mappings(swapper_pg_dir, 0, 16*1024*1024); } static void __init zone_sizes_init(void) @@ -327,9 +319,7 @@ void __init paging_init(void) __flush_tlb_all(); -#ifdef CONFIG_HIGHMEM kmap_init(); -#endif zone_sizes_init(); } @@ -446,7 +436,6 @@ static int __init free_pages_init(void) int bad_ppro, reservedpages, pfn; bad_ppro = ppro_with_ram_bug(); - /* this will put all low memory onto the freelists */ totalram_pages += free_all_bootmem(); @@ -466,10 +455,13 @@ static int __init free_pages_init(void) return reservedpages; } +extern void fixup_sort_exception_table(void); + void __init mem_init(void) { int codesize, reservedpages, datasize, initsize; + fixup_sort_exception_table(); if (!mem_map) BUG(); #ifdef CONFIG_HIGHMEM @@ -520,7 +512,9 @@ void __init mem_init(void) #ifndef CONFIG_SMP zap_low_mappings(); #endif - + entry_trampoline_setup(); + default_ldt_page = virt_to_page(default_ldt); + load_LDT(&init_mm.context); } /* Put this after the callers, so that it cannot be inlined */ --- linux-2.4.20/arch/i386/mm/extable.c.fourfour 2001-09-17 16:16:30.000000000 -0400 +++ linux-2.4.20/arch/i386/mm/extable.c 2003-06-04 14:21:58.000000000 -0400 @@ -6,9 +6,52 @@ #include #include #include +#include -extern const struct exception_table_entry __start___ex_table[]; -extern const struct exception_table_entry __stop___ex_table[]; +extern struct exception_table_entry __start___ex_table[]; +extern struct exception_table_entry __stop___ex_table[]; + +/* + * The exception table needs to be sorted because we use the macros + * which put things into the exception table in a variety of sections + * as well as the init section and the main kernel text section. 
+ */ +static inline void +sort_ex_table(struct exception_table_entry *start, + struct exception_table_entry *finish) +{ + struct exception_table_entry el, *p, *q; + + /* insertion sort */ + for (p = start + 1; p < finish; ++p) { + /* start .. p-1 is sorted */ + if (p[0].insn < p[-1].insn) { + /* move element p down to its right place */ + el = *p; + q = p; + do { + /* el comes before q[-1], move q[-1] up one */ + q[0] = q[-1]; + --q; + } while (q > start && el.insn < q[-1].insn); + *q = el; + } + } +} + +void fixup_sort_exception_table(void) +{ + struct exception_table_entry *p; + + /* + * Fix up the trampoline exception addresses: + */ + for (p = __start___ex_table; p < __stop___ex_table; p++) { + p->insn = (unsigned long)ENTRY_TRAMP_ADDR((void *)p->insn); + p->fixup = (unsigned long)ENTRY_TRAMP_ADDR((void *)p->fixup); + } + sort_ex_table(__start___ex_table, __stop___ex_table); +} static inline unsigned long search_one_table(const struct exception_table_entry *first, @@ -17,13 +60,15 @@ search_one_table(const struct exception_ { while (first <= last) { const struct exception_table_entry *mid; - long diff; mid = (last - first) / 2 + first; - diff = mid->insn - value; - if (diff == 0) + /* + * careful, the distance between entries can be + * larger than 2GB: + */ + if (mid->insn == value) return mid->fixup; - else if (diff < 0) + else if (mid->insn < value) first = mid+1; else last = mid-1; --- linux-2.4.20/arch/i386/mm/pageattr.c.fourfour 2003-06-04 14:21:58.000000000 -0400 +++ linux-2.4.20/arch/i386/mm/pageattr.c 2003-06-04 14:21:58.000000000 -0400 @@ -87,8 +87,7 @@ static inline void revert_page(struct pa pte_t *linear = (pte_t *) pmd_offset(pgd_offset(&init_mm, address), address); set_pmd_pte(linear, address, - mk_pte_phys(__pa(address & LARGE_PAGE_MASK), - MAKE_GLOBAL(_KERNPG_TABLE|_PAGE_PSE))); + mk_pte_phys(__pa(address & LARGE_PAGE_MASK), PAGE_KERNEL_PSE)); } /* --- linux-2.4.20/arch/i386/mm/pgtable.c.fourfour 2003-06-04 14:21:58.000000000 -0400 +++ 
linux-2.4.20/arch/i386/mm/pgtable.c 2003-06-04 14:21:58.000000000 -0400 @@ -174,16 +174,33 @@ pgd_t *pgd_alloc(struct mm_struct *mm) pgd_t *pgd = kmem_cache_alloc(pae_pgd_cachep, GFP_KERNEL); if (pgd) { + unsigned long pmd; +#if CONFIG_X86_4G_VM_LAYOUT + pmd_t *pmd0, *kernel_pmd0; +#endif + for (i = 0; i < USER_PTRS_PER_PGD; i++) { - unsigned long pmd = __get_free_page(GFP_KERNEL); + pmd = __get_free_page(GFP_KERNEL); if (!pmd) goto out_oom; clear_page(pmd); set_pgd(pgd + i, __pgd(1 + __pa(pmd))); } +#if CONFIG_X86_4G_VM_LAYOUT + /* + * In the 4G userspace case alias the last 4MB virtual + * memory range into the user mappings as well (these + * include the trampoline and CPU data structures). + */ + pmd0 = (pmd_t *)pmd; + kernel_pmd0 = (pmd_t *)pgd_page(swapper_pg_dir[PTRS_PER_PGD-1]); + pmd0[PTRS_PER_PMD-2] = kernel_pmd0[PTRS_PER_PMD-2]; + pmd0[PTRS_PER_PMD-1] = kernel_pmd0[PTRS_PER_PMD-1]; +#else memcpy(pgd + USER_PTRS_PER_PGD, swapper_pg_dir + USER_PTRS_PER_PGD, (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); +#endif } return pgd; out_oom: @@ -209,10 +226,16 @@ pgd_t *pgd_alloc(struct mm_struct *mm) pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL); if (pgd) { - memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t)); - memcpy(pgd + USER_PTRS_PER_PGD, - swapper_pg_dir + USER_PTRS_PER_PGD, - (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); +#if CONFIG_X86_4G_VM_LAYOUT + memset(pgd, 0, (PTRS_PER_PGD-2) * sizeof(pgd_t)); + pgd[PTRS_PER_PGD-2] = swapper_pg_dir[PTRS_PER_PGD-2]; + pgd[PTRS_PER_PGD-1] = swapper_pg_dir[PTRS_PER_PGD-1]; +#else + memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t)); + memcpy(pgd + USER_PTRS_PER_PGD, + swapper_pg_dir + USER_PTRS_PER_PGD, + (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); +#endif } return pgd; } --- linux-2.4.20/arch/i386/mm/fault.c.fourfour 2003-06-04 14:21:58.000000000 -0400 +++ linux-2.4.20/arch/i386/mm/fault.c 2003-06-04 14:22:58.000000000 -0400 @@ -150,6 +150,14 @@ asmlinkage void do_page_fault(struct pt_ /* get 
the address */ __asm__("movl %%cr2,%0":"=r" (address)); +#if CONFIG_X86_SWITCH_PAGETABLES + if (!user_mode(regs)) { + console_verbose(); + printk("invalid kernel-mode pagefault %d! [addr:%08lx, eip:%08lx]\n", error_code, address, regs->eip); + show_regs(regs); + BUG(); + } +#endif /* It's safe to allow irq's after cr2 has been saved */ if (regs->eflags & X86_EFLAGS_IF) --- linux-2.4.20/arch/i386/lib/usercopy.c.fourfour 2003-06-04 14:21:44.000000000 -0400 +++ linux-2.4.20/arch/i386/lib/usercopy.c 2003-06-04 14:21:58.000000000 -0400 @@ -116,7 +116,7 @@ do { \ * and returns @count. */ long -__strncpy_from_user(char *dst, const char *src, long count) +__direct_strncpy_from_user(char *dst, const char *src, long count) { long res; __do_strncpy_from_user(dst, src, count, res); @@ -142,7 +142,7 @@ __strncpy_from_user(char *dst, const cha * and returns @count. */ long -strncpy_from_user(char *dst, const char *src, long count) +direct_strncpy_from_user(char *dst, const char *src, long count) { long res = -EFAULT; if (access_ok(VERIFY_READ, src, 1)) @@ -187,7 +187,7 @@ do { \ * On success, this will be zero. */ unsigned long -clear_user(void *to, unsigned long n) +direct_clear_user(void *to, unsigned long n) { if (access_ok(VERIFY_WRITE, to, n)) __do_clear_user(to, n); @@ -206,7 +206,7 @@ clear_user(void *to, unsigned long n) * On success, this will be zero. */ unsigned long -__clear_user(void *to, unsigned long n) +__direct_clear_user(void *to, unsigned long n) { __do_clear_user(to, n); return n; @@ -223,7 +223,7 @@ __clear_user(void *to, unsigned long n) * On exception, returns 0. * If the string is too long, returns a value greater than @n. 
*/ -long strnlen_user(const char *s, long n) +long direct_strnlen_user(const char *s, long n) { unsigned long mask = -__addr_ok(s); unsigned long res, tmp; @@ -252,3 +252,4 @@ long strnlen_user(const char *s, long n) :"cc"); return res & mask; } + --- linux-2.4.20/arch/i386/lib/checksum.S.fourfour 2002-11-28 18:53:09.000000000 -0500 +++ linux-2.4.20/arch/i386/lib/checksum.S 2003-06-04 14:21:58.000000000 -0400 @@ -280,14 +280,14 @@ unsigned int csum_partial_copy_generic ( .previous .align 4 -.globl csum_partial_copy_generic +.globl direct_csum_partial_copy_generic #ifndef CONFIG_X86_USE_PPRO_CHECKSUM #define ARGBASE 16 #define FP 12 -csum_partial_copy_generic: +direct_csum_partial_copy_generic: subl $4,%esp pushl %edi pushl %esi @@ -422,7 +422,7 @@ DST( movb %cl, (%edi) ) #define ARGBASE 12 -csum_partial_copy_generic: +direct_csum_partial_copy_generic: pushl %ebx pushl %edi pushl %esi --- linux-2.4.20/arch/i386/kernel/setup.c.fourfour 2003-06-04 14:21:54.000000000 -0400 +++ linux-2.4.20/arch/i386/kernel/setup.c 2003-06-04 14:21:58.000000000 -0400 @@ -119,6 +119,7 @@ #include #include #include +#include /* * Machine setup.. 
*/ @@ -2220,8 +2221,6 @@ static void __init init_rise(struct cpui } -extern void trap_init_f00f_bug(void); - #define LVL_1_INST 1 #define LVL_1_DATA 2 #define LVL_2 3 @@ -2287,7 +2286,7 @@ static void __init init_intel(struct cpu if (c->x86 == 5) { c->f00f_bug = 1; if (!f00f_workaround_enabled) { - trap_init_f00f_bug(); + trap_init_virtual_IDT(); printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n"); f00f_workaround_enabled = 1; } @@ -3128,6 +3127,19 @@ void __init cpu_init (void) __asm__ __volatile__("lgdt %0": "=m" (cpu_gdt_descr[cpu])); __asm__ __volatile__("lidt %0": "=m" (idt_descr)); + t->esp0 = thread->esp0; + set_tss_desc(cpu, t); + cpu_gdt_table[cpu][GDT_ENTRY_TSS].b &= 0xfffffdff; + load_TR_desc(); + if (cpu) + load_LDT(&init_mm.context); + + /* Set up doublefault TSS pointer in the GDT */ + __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); + cpu_gdt_table[cpu][GDT_ENTRY_DOUBLEFAULT_TSS].b &= 0xfffffdff; + + if (cpu) + trap_init_virtual_GDT(); /* * Delete NT */ @@ -3141,11 +3153,6 @@ void __init cpu_init (void) if(current->mm) BUG(); enter_lazy_tlb(&init_mm, current, cpu); - t->esp0 = thread->esp0; - set_tss_desc(cpu, t); - cpu_gdt_table[cpu][GDT_ENTRY_TSS].b &= 0xfffffdff; - load_TR_desc(); - load_LDT(&init_mm.context); /* Clear %fs and %gs. 
*/ asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs"); --- linux-2.4.20/arch/i386/kernel/traps.c.fourfour 2003-06-04 14:21:57.000000000 -0400 +++ linux-2.4.20/arch/i386/kernel/traps.c 2003-06-04 14:21:58.000000000 -0400 @@ -56,8 +56,8 @@ asmlinkage int system_call(void); asmlinkage void lcall7(void); asmlinkage void lcall27(void); -struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 }, - { 0, 0 }, { 0, 0 } }; +struct desc_struct default_ldt[] __attribute__((__section__(".data.default_ldt"))) = { { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 } }; +struct page *default_ldt_page; /* * The IDT has to be page-aligned to simplify the Pentium @@ -135,7 +135,9 @@ static inline int kernel_text_address(un void show_trace(unsigned long * stack) { +#if !CONFIG_FRAME_POINTER int i; +#endif unsigned long addr; /* static to not take up stackspace; if we race here too bad */ static char buffer[512]; @@ -144,6 +146,22 @@ void show_trace(unsigned long * stack) stack = (unsigned long*)&stack; printk("Call Trace: "); + /* + * If we have frame pointers then use them to get + * a 100% exact backtrace, up until the entry frame: + */ +#if CONFIG_FRAME_POINTER +#define DO(n) \ + addr = (int)__builtin_return_address(n); \ + if (!kernel_text_address(addr)) \ + goto out; \ + lookup_symbol(addr, buffer, 512); \ + printk("[<%08lx>] %s\n", addr, buffer); + + DO(0); DO(1); DO(2); DO(3); DO(4); DO(5); DO(7); DO(8); DO(9); + DO(10); DO(11); DO(12); DO(13); DO(14); DO(15); DO(17); DO(18); DO(19); +out: +#else i = 1; while (((long) stack & (THREAD_SIZE-1)) != 0) { addr = *stack++; @@ -153,6 +171,7 @@ void show_trace(unsigned long * stack) i++; } } +#endif printk("\n"); } @@ -824,22 +843,53 @@ asmlinkage void math_emulate(long arg) #endif /* CONFIG_MATH_EMULATION */ -#ifndef CONFIG_X86_F00F_WORKS_OK -void __init trap_init_f00f_bug(void) +void __init trap_init_virtual_IDT(void) { /* * "idt" is magic - it overlaps the idt_descr * variable so that updating idt will 
automatically * update the idt descriptor.. */ - __set_fixmap(FIX_F00F, __pa(&idt_table), PAGE_KERNEL_RO); - idt_descr.address = __fix_to_virt(FIX_F00F); + __set_fixmap(FIX_IDT, __pa(&idt_table), PAGE_KERNEL_RO); + idt_descr.address = __fix_to_virt(FIX_IDT); __asm__ __volatile__("lidt %0": "=m" (idt_descr)); } + +void __init trap_init_virtual_GDT(void) +{ + int cpu = smp_processor_id(); + struct Xgt_desc_struct *gdt_desc = cpu_gdt_descr + cpu; + struct Xgt_desc_struct tmp_desc = {0, 0}; + struct tss_struct * t; + + __asm__ __volatile__("sgdt %0": "=m" (tmp_desc): :"memory"); + +#if CONFIG_X86_HIGH_ENTRY + if (!cpu) { + __set_fixmap(FIX_GDT_0, __pa(cpu_gdt_table), PAGE_KERNEL); + __set_fixmap(FIX_GDT_1, __pa(cpu_gdt_table) + PAGE_SIZE, PAGE_KERNEL); + __set_fixmap(FIX_TSS_0, __pa(init_tss), PAGE_KERNEL); + __set_fixmap(FIX_TSS_1, __pa(init_tss) + PAGE_SIZE, PAGE_KERNEL); + } + + gdt_desc->address = __fix_to_virt(FIX_GDT_0) + sizeof(cpu_gdt_table[0]) * cpu; +#else + gdt_desc->address = (unsigned long)cpu_gdt_table[cpu]; #endif + __asm__ __volatile__("lgdt %0": "=m" (*gdt_desc)); -#define _set_gate(gate_addr,type,dpl,addr) \ +#if CONFIG_X86_HIGH_ENTRY + t = (struct tss_struct *) __fix_to_virt(FIX_TSS_0) + cpu; +#else + t = init_tss + cpu; +#endif + set_tss_desc(cpu, t); + cpu_gdt_table[cpu][GDT_ENTRY_TSS].b &= 0xfffffdff; + load_TR_desc(); +} + +#define _set_gate(gate_addr,type,dpl,addr,seg) \ do { \ int __d0, __d1; \ __asm__ __volatile__ ("movw %%dx,%%ax\n\t" \ @@ -849,10 +899,9 @@ do { \ :"=m" (*((long *) (gate_addr))), \ "=m" (*(1+(long *) (gate_addr))), "=&a" (__d0), "=&d" (__d1) \ :"i" ((short) (0x8000+(dpl<<13)+(type<<8))), \ - "3" ((char *) (addr)),"2" (__KERNEL_CS << 16)); \ + "3" ((char *) (addr)),"2" ((seg) << 16)); \ } while (0) - /* * This needs to use 'idt_table' rather than 'idt', and * thus use the _nonmapped_ version of the IDT, as the @@ -861,25 +910,34 @@ do { \ */ void set_intr_gate(unsigned int n, void *addr) { - _set_gate(idt_table+n,14,0,addr); + 
addr = ENTRY_TRAMP_ADDR(addr); + _set_gate(idt_table+n,14,0,addr,__KERNEL_CS); } -static void __init set_trap_gate(unsigned int n, void *addr) +void __init set_trap_gate(unsigned int n, void *addr) { - _set_gate(idt_table+n,15,0,addr); + addr = ENTRY_TRAMP_ADDR(addr); + _set_gate(idt_table+n,15,0,addr,__KERNEL_CS); } -static void __init set_system_gate(unsigned int n, void *addr) +void __init set_system_gate(unsigned int n, void *addr) { - _set_gate(idt_table+n,15,3,addr); + addr = ENTRY_TRAMP_ADDR(addr); + _set_gate(idt_table+n,15,3,addr,__KERNEL_CS); } #if 0 static void __init set_call_gate(void *a, void *addr) { + addr = ENTRY_TRAMP_ADDR(addr); _set_gate(a,12,3,addr); } #endif + +static void __init set_task_gate(unsigned int n, unsigned int gdt_entry) +{ + _set_gate(idt_table+n,5,0,0,(gdt_entry<<3)); +} #ifdef CONFIG_X86_VISWS_APIC @@ -981,6 +1039,7 @@ void __init trap_init(void) #ifdef CONFIG_X86_LOCAL_APIC init_apic_mappings(); #endif + init_entry_mappings(); set_trap_gate(0,&divide_error); set_trap_gate(1,&debug); @@ -990,7 +1049,6 @@ void __init trap_init(void) set_system_gate(5,&bounds); set_trap_gate(6,&invalid_op); set_trap_gate(7,&device_not_available); - set_trap_gate(8,&double_fault); set_trap_gate(9,&coprocessor_segment_overrun); set_trap_gate(10,&invalid_TSS); set_trap_gate(11,&segment_not_present); @@ -1004,15 +1062,7 @@ void __init trap_init(void) set_trap_gate(19,&simd_coprocessor_error); set_system_gate(SYSCALL_VECTOR,&system_call); - - /* - * default LDT is a single-entry callgate to lcall7 for iBCS - * and a callgate to lcall27 for Solaris/x86 binaries - */ -#if 0 - set_call_gate(&default_ldt[0],lcall7); - set_call_gate(&default_ldt[4],lcall27); -#endif + set_task_gate(8,GDT_ENTRY_DOUBLEFAULT_TSS); /* * Should be a barrier for any external CPU state.
--- linux-2.4.20/arch/i386/kernel/i387.c.fourfour 2003-06-04 14:21:54.000000000 -0400 +++ linux-2.4.20/arch/i386/kernel/i387.c 2003-06-04 14:21:58.000000000 -0400 @@ -251,6 +251,7 @@ void set_fpu_mxcsr( struct task_struct * static int convert_fxsr_to_user( struct _fpstate *buf, struct i387_fxsave_struct *fxsave ) { + struct _fpreg tmp[8]; /* 80 bytes scratch area */ unsigned long env[7]; struct _fpreg *to; struct _fpxreg *from; @@ -267,23 +268,25 @@ static int convert_fxsr_to_user( struct if ( __copy_to_user( buf, env, 7 * sizeof(unsigned long) ) ) return 1; - to = &buf->_st[0]; + to = tmp; from = (struct _fpxreg *) &fxsave->st_space[0]; for ( i = 0 ; i < 8 ; i++, to++, from++ ) { unsigned long *t = (unsigned long *)to; unsigned long *f = (unsigned long *)from; - if (__put_user(*f, t) || - __put_user(*(f + 1), t + 1) || - __put_user(from->exponent, &to->exponent)) - return 1; + *t = *f; + *(t + 1) = *(f+1); + to->exponent = from->exponent; } + if (copy_to_user(buf->_st, tmp, sizeof(struct _fpreg [8]))) + return 1; return 0; } static int convert_fxsr_from_user( struct i387_fxsave_struct *fxsave, struct _fpstate *buf ) { + struct _fpreg tmp[8]; /* 80 bytes scratch area */ unsigned long env[7]; struct _fpxreg *to; struct _fpreg *from; @@ -291,6 +294,8 @@ static int convert_fxsr_from_user( struc if ( __copy_from_user( env, buf, 7 * sizeof(long) ) ) return 1; + if (copy_from_user(tmp, buf->_st, sizeof(struct _fpreg [8]))) + return 1; fxsave->cwd = (unsigned short)(env[0] & 0xffff); fxsave->swd = (unsigned short)(env[1] & 0xffff); @@ -302,15 +307,14 @@ static int convert_fxsr_from_user( struc fxsave->fos = env[6]; to = (struct _fpxreg *) &fxsave->st_space[0]; - from = &buf->_st[0]; + from = tmp; for ( i = 0 ; i < 8 ; i++, to++, from++ ) { unsigned long *t = (unsigned long *)to; unsigned long *f = (unsigned long *)from; - if (__get_user(*t, f) || - __get_user(*(t + 1), f + 1) || - __get_user(to->exponent, &from->exponent)) - return 1; + *t = *f; + *(t + 1) = *(f + 1); + 
to->exponent = from->exponent; } return 0; } --- linux-2.4.20/arch/i386/kernel/entry.S.fourfour 2003-06-04 14:21:58.000000000 -0400 +++ linux-2.4.20/arch/i386/kernel/entry.S 2003-06-04 14:22:48.000000000 -0400 @@ -44,6 +44,7 @@ #include #include #include +#include #include #include @@ -81,9 +82,165 @@ exec_domain = 16 need_resched = 20 tsk_ptrace = 24 cpu = 32 +real_stack = 36 +virtual_stack = 40 +user_pgd = 44 ENOSYS = 38 +#define GET_CURRENT(reg) \ + movl $-8192, reg; \ + andl %esp, reg + +#if CONFIG_X86_HIGH_ENTRY + +#define call_SYMBOL_NAME_ABS(X) movl $X, %ebp; call *%ebp + +#define __SAVE_ALL \ + cld; \ + pushl %es; \ + pushl %ds; \ + pushl %eax; \ + pushl %ebp; \ + pushl %edi; \ + pushl %esi; \ + pushl %edx; \ + pushl %ecx; \ + pushl %ebx; \ + movl $(__KERNEL_DS),%edx; \ + movl %edx,%ds; \ + movl %edx,%es; + +#define __RESTORE_ALL \ + popl %ebx; \ + popl %ecx; \ + popl %edx; \ + popl %esi; \ + popl %edi; \ + popl %ebp; \ + popl %eax; \ +111: popl %ds; \ +222: popl %es; \ + addl $4,%esp; \ +333: iret; \ + \ +444: movl $0,(%esp); \ + jmp 111b; \ +555: movl $0,(%esp); \ + jmp 222b; \ +.section .fixup,"ax"; \ +666: pushl %ss; \ + popl %ds; \ + pushl %ss; \ + popl %es; \ + pushl $11; \ + call_SYMBOL_NAME_ABS(do_exit); \ +.previous; \ +.section __ex_table,"a"; \ + .align 4; \ + .long 111b,444b; \ + .long 222b,555b; \ + .long 333b,666b; \ +.previous + +/* clobbers edx, ebx */ + +#if CONFIG_X86_SWITCH_PAGETABLES + +#define __SWITCH_TO_KERNEL_PGD \ + movl %cr3, %edx; \ + cmpl $swapper_pg_dir-__PAGE_OFFSET, %edx; \ + jz 1f; \ + movl $swapper_pg_dir-__PAGE_OFFSET, %edx; \ + movl %edx, %cr3; \ +1: + +#else +#define __SWITCH_TO_KERNEL_PGD \ + movl %cr3, %edx; \ + cmpl $swapper_pg_dir-__PAGE_OFFSET, %edx; \ + jz 1f; \ + movl $swapper_pg_dir-__PAGE_OFFSET, %edx; \ + /* movl %edx, %cr3; */ \ +1: + +#endif + + +#if CONFIG_X86_SWITCH_PAGETABLES + +#define __SWITCH_TO_USER_PGD \ + movl %ecx, %cr3; + +#else + +#define __SWITCH_TO_USER_PGD \ + /* movl %ecx, %cr3; */ +#endif + 
+#define __SWITCH \ + \ + __SWITCH_TO_KERNEL_PGD \ + \ + /* load the real stack - keep the offset */ \ + \ + GET_CURRENT(%ebx); \ + movl real_stack(%ebx), %edx; \ + movl %esp, %ebx; \ + andl $0x1fff, %ebx; \ + orl %ebx, %edx; \ + movl %edx, %esp; + +#define SAVE_ALL \ + __SAVE_ALL; \ + __SWITCH + +#define RESTORE_ALL \ + /* interrupted the user return path? */ \ + movl return_path_start, %eax; \ + cmpl %eax, EIP(%esp); \ + jb 33f; /* nope - continue with user check */ \ + movl return_path_end, %eax; \ + cmpl %eax, EIP(%esp); \ + jb 22f; /* yes - switch to virtual stack */ \ +33: \ + /* return to userspace? */ \ + \ + movl EFLAGS(%esp),%ecx; \ + movb CS(%esp),%cl; \ + testl $(VM_MASK | 3),%ecx; \ + jz 2f; \ +22: \ + /* switch to the virtual stack, then switch */ \ + /* userspace pagetable. */ \ + \ + GET_CURRENT(%esi); \ + movl virtual_stack(%esi), %edx; \ + movl user_pgd(%esi), %ecx; \ + nop; \ +.globl return_path_start_marker; \ +return_path_start_marker: \ + nop; \ + movl %esp, %ebx; \ + andl $0x1fff, %ebx; \ + orl %ebx, %edx; \ + movl %esp, %eax; \ + movl %edx, %esp; \ + \ + __SWITCH_TO_USER_PGD \ + __RESTORE_ALL; \ + nop; \ +.globl return_path_end_marker; \ +return_path_end_marker: \ + nop; \ +2: \ + __RESTORE_ALL; + +#else /* !CONFIG_X86_HIGH_ENTRY */ + +#define call_SYMBOL_NAME_ABS(X) call SYMBOL_NAME(X) + +#define __SWITCH #define SAVE_ALL \ cld; \ @@ -131,67 +288,7 @@ ENOSYS = 38 .long 3b,6b; \ .previous -#define GET_CURRENT(reg) \ - movl $-8192, reg; \ - andl %esp, reg - -ENTRY(lcall7) - pushfl # We get a different stack layout with call gates, - pushl %eax # which has to be cleaned up later.. - SAVE_ALL - movl EIP(%esp),%eax # due to call gates, this is eflags, not eip.. - movl CS(%esp),%edx # this is eip.. - movl EFLAGS(%esp),%ecx # and this is cs.. 
- movl %eax,EFLAGS(%esp) # - andl $~(NT_MASK|TF_MASK|DF_MASK), %eax - pushl %eax - popfl - movl %edx,EIP(%esp) # Now we move them to their "normal" places - movl %ecx,CS(%esp) # - movl %esp,%ebx - pushl %ebx - andl $-8192,%ebx # GET_CURRENT - movl exec_domain(%ebx),%edx # Get the execution domain - movl 4(%edx),%edx # Get the lcall7 handler for the domain - pushl $0x7 - call *%edx - addl $4, %esp - popl %eax - jmp ret_from_sys_call - -ENTRY(lcall27) - pushfl # We get a different stack layout with call gates, - pushl %eax # which has to be cleaned up later.. - SAVE_ALL - movl EIP(%esp),%eax # due to call gates, this is eflags, not eip.. - movl CS(%esp),%edx # this is eip.. - movl EFLAGS(%esp),%ecx # and this is cs.. - movl %eax,EFLAGS(%esp) # - andl $~(NT_MASK|TF_MASK|DF_MASK), %eax - pushl %eax - popfl - movl %edx,EIP(%esp) # Now we move them to their "normal" places - movl %ecx,CS(%esp) # - movl %esp,%ebx - pushl %ebx - andl $-8192,%ebx # GET_CURRENT - movl exec_domain(%ebx),%edx # Get the execution domain - movl 4(%edx),%edx # Get the lcall7 handler for the domain - pushl $0x27 - call *%edx - addl $4, %esp - popl %eax - jmp ret_from_sys_call - - -ENTRY(ret_from_fork) - pushl %eax - call SYMBOL_NAME(schedule_tail) - addl $4, %esp - GET_CURRENT(%ebx) - testb $0x02,tsk_ptrace(%ebx) # PT_TRACESYS - jne tracesys_exit - jmp ret_from_sys_call +#endif /* * Return to user mode is not as complex as all this looks, @@ -200,6 +297,8 @@ ENTRY(ret_from_fork) * less clear than it otherwise should be. 
*/ +.section .entry.text,"ax" + ENTRY(system_call) pushl %eax # save orig_eax SAVE_ALL @@ -208,6 +307,7 @@ ENTRY(system_call) jne tracesys cmpl $(NR_syscalls),%eax jae badsys + call *SYMBOL_NAME(sys_call_table)(,%eax,4) movl %eax,EAX(%esp) # save the return value ENTRY(ret_from_sys_call) @@ -226,28 +326,28 @@ signal_return: movl %esp,%eax jne v86_signal_return xorl %edx,%edx - call SYMBOL_NAME(do_signal) + call_SYMBOL_NAME_ABS(do_signal) jmp restore_all ALIGN v86_signal_return: - call SYMBOL_NAME(save_v86_state) + call_SYMBOL_NAME_ABS(save_v86_state) movl %eax,%esp xorl %edx,%edx - call SYMBOL_NAME(do_signal) + call_SYMBOL_NAME_ABS(do_signal) jmp restore_all ALIGN tracesys: movl $-ENOSYS,EAX(%esp) - call SYMBOL_NAME(syscall_trace) + call_SYMBOL_NAME_ABS(syscall_trace) movl ORIG_EAX(%esp),%eax cmpl $(NR_syscalls),%eax jae tracesys_exit call *SYMBOL_NAME(sys_call_table)(,%eax,4) movl %eax,EAX(%esp) # save the return value tracesys_exit: - call SYMBOL_NAME(syscall_trace) + call_SYMBOL_NAME_ABS(syscall_trace) jmp ret_from_sys_call badsys: movl $-ENOSYS,EAX(%esp) @@ -265,9 +365,18 @@ ret_from_exception: ALIGN reschedule: - call SYMBOL_NAME(schedule) # test + call_SYMBOL_NAME_ABS(schedule) # test jmp ret_from_sys_call +ENTRY(ret_from_fork) + pushl %eax + call_SYMBOL_NAME_ABS(schedule_tail) + addl $4, %esp + GET_CURRENT(%ebx) + testb $0x02,tsk_ptrace(%ebx) # PT_TRACESYS + jne tracesys_exit + jmp ret_from_sys_call + ENTRY(divide_error) pushl $0 # no error code pushl $ SYMBOL_NAME(do_divide_error) @@ -289,13 +398,20 @@ error_code: movl ES(%esp), %edi # get the function address movl %eax, ORIG_EAX(%esp) movl %ecx, ES(%esp) - movl %esp,%edx pushl %esi # push the error code - pushl %edx # push the pt_regs pointer movl $(__KERNEL_DS),%edx movl %edx,%ds movl %edx,%es + +/* clobbers edx, ebx */ + __SWITCH + + leal 4(%esp), %edx # prepare pt_regs + GET_CURRENT(%ebx) + + pushl %edx # push pt_regs + call *%edi addl $8,%esp jmp ret_from_exception @@ -317,11 +433,11 @@ 
ENTRY(device_not_available) movl %cr0,%eax testl $0x4,%eax # EM (math emulation bit) jne device_not_available_emulate - call SYMBOL_NAME(math_state_restore) + call_SYMBOL_NAME_ABS(math_state_restore) jmp ret_from_exception device_not_available_emulate: pushl $0 # temporary storage for ORIG_EIP - call SYMBOL_NAME(math_emulate) + call_SYMBOL_NAME_ABS(math_emulate) addl $4,%esp jmp ret_from_exception @@ -336,9 +452,9 @@ ENTRY(nmi) movl %esp,%edx pushl $0 pushl %edx - call SYMBOL_NAME(do_nmi) + call_SYMBOL_NAME_ABS(do_nmi) addl $8,%esp - RESTORE_ALL + jmp restore_all ENTRY(int3) pushl $0 @@ -365,10 +481,6 @@ ENTRY(coprocessor_segment_overrun) pushl $ SYMBOL_NAME(do_coprocessor_segment_overrun) jmp error_code -ENTRY(double_fault) - pushl $ SYMBOL_NAME(do_double_fault) - jmp error_code - ENTRY(invalid_TSS) pushl $ SYMBOL_NAME(do_invalid_TSS) jmp error_code @@ -403,6 +515,8 @@ ENTRY(spurious_interrupt_bug) pushl $ SYMBOL_NAME(do_spurious_interrupt_bug) jmp error_code +.previous + .data ENTRY(sys_call_table) .long SYMBOL_NAME(sys_ni_syscall) /* 0 - old "setup()" system call*/ --- linux-2.4.20/arch/i386/kernel/head.S.fourfour 2003-06-04 14:21:54.000000000 -0400 +++ linux-2.4.20/arch/i386/kernel/head.S 2003-06-04 14:21:58.000000000 -0400 @@ -368,23 +368,27 @@ SYMBOL_NAME(cpu_gdt_descr): /* - * This is initialized to create an identity-mapping at 0-8M (for bootup - * purposes) and another mapping of the 0-8M area at virtual address + * This is initialized to create an identity-mapping at 0-16M (for bootup + * purposes) and another mapping of the 0-16M area at virtual address * PAGE_OFFSET. 
*/ .org 0x1000 ENTRY(swapper_pg_dir) .long 0x00102007 .long 0x00103007 - .fill BOOT_USER_PGD_PTRS-2,4,0 + .long 0x00104007 + .long 0x00105007 + .fill BOOT_USER_PGD_PTRS-4,4,0 /* default: 766 entries */ .long 0x00102007 .long 0x00103007 + .long 0x00104007 + .long 0x00105007 /* default: 254 entries */ - .fill BOOT_KERNEL_PGD_PTRS-2,4,0 + .fill BOOT_KERNEL_PGD_PTRS-4,4,0 /* - * The page tables are initialized to only 8MB here - the final page + * The page tables are initialized to only 16MB here - the final page * tables are set up later depending on memory size. */ .org 0x2000 @@ -393,15 +397,21 @@ ENTRY(pg0) .org 0x3000 ENTRY(pg1) +.org 0x4000 +ENTRY(pg2) + +.org 0x5000 +ENTRY(pg3) + /* * empty_zero_page must immediately follow the page tables ! (The * initialization loop counts until empty_zero_page) */ -.org 0x4000 +.org 0x6000 ENTRY(empty_zero_page) -.org 0x5000 +.org 0x7000 /* * Real beginning of normal "text" segment @@ -409,14 +419,10 @@ ENTRY(empty_zero_page) ENTRY(stext) ENTRY(_stext) -/* - * This starts the data section. Note that the above is all - * in the text section because it has alignment requirements - * that we cannot fulfill any other way. - */ .data -ALIGN +.align 4096 + /* * The Global Descriptor Table contains 28 quadwords, per-CPU. 
*/ @@ -453,6 +459,13 @@ ENTRY(cpu_gdt_table) .quad 0x00009a0000000000 /* 0xb0 APM CS 16 code (16 bit) */ .quad 0x0040920000000000 /* 0xb8 APM DS data */ + .quad 0x0000000000000000 /* 0xd0 - unused */ + .quad 0x0000000000000000 /* 0xd8 - unused */ + .quad 0x0000000000000000 /* 0xe0 - unused */ + .quad 0x0000000000000000 /* 0xe8 - unused */ + .quad 0x0000000000000000 /* 0xf0 - unused */ + .quad 0x0000000000000000 /* 0xf8 - GDT entry 31: double-fault TSS */ + #if CONFIG_SMP .fill (NR_CPUS-1)*GDT_ENTRIES,8,0 /* other CPU's GDT */ #endif --- /dev/null 2003-01-30 05:24:37.000000000 -0500 +++ linux-2.4.20/arch/i386/kernel/entry_trampoline.c 2003-06-04 14:21:58.000000000 -0400 @@ -0,0 +1,77 @@ +/* + * linux/arch/i386/kernel/entry_trampoline.c + * + * (C) Copyright 2003 Ingo Molnar + * + * This file contains the needed support code for 4GB userspace + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +void *return_path_start, *return_path_end; + +extern char return_path_start_marker, return_path_end_marker; + +void __init init_entry_mappings(void) +{ +#if CONFIG_X86_HIGH_ENTRY + + void *tramp; + + /* + * We need a high IDT and GDT for the 4G/4G split: + */ + trap_init_virtual_IDT(); + + __set_fixmap(FIX_ENTRY_TRAMPOLINE, __pa((unsigned long)&entry_tramp_start), PAGE_KERNEL); + tramp = (void *)fix_to_virt(FIX_ENTRY_TRAMPOLINE); + + printk("mapped 4G/4G trampoline to %p.\n", tramp); + /* + * Virtual kernel stack: + */ + BUG_ON(__kmap_atomic_vaddr(KM_VSTACK0) & 8191); + BUG_ON(sizeof(struct desc_struct)*NR_CPUS*GDT_ENTRIES > 2*PAGE_SIZE); + BUG_ON((unsigned int)&entry_tramp_end - (unsigned int)&entry_tramp_start > PAGE_SIZE); + + /* + * set up the initial thread's virtual stack related + * fields: + */ + current->thread.stack_page0 = virt_to_page((char *)current); + current->thread.stack_page1 = virt_to_page((char *)current + PAGE_SIZE); + current->virtual_stack = (void *)__kmap_atomic_vaddr(KM_VSTACK0); + + 
__kunmap_atomic_type(KM_VSTACK0); + __kunmap_atomic_type(KM_VSTACK1); + __kmap_atomic(current->thread.stack_page0, KM_VSTACK0); + __kmap_atomic(current->thread.stack_page1, KM_VSTACK1); + + return_path_start = ENTRY_TRAMP_ADDR(&return_path_start_marker); + return_path_end = ENTRY_TRAMP_ADDR(&return_path_end_marker); +#endif + current->real_stack = (void *)current; + current->user_pgd = NULL; + current->thread.esp0 = (unsigned long)current->real_stack + THREAD_SIZE; + +} + + + +void __init entry_trampoline_setup(void) +{ + /* + * old IRQ entries set up by the boot code will still hang + * around - they are a sign of hw trouble anyway, now they'll + * produce a double fault message. + */ + trap_init_virtual_GDT(); +} --- linux-2.4.20/arch/i386/kernel/Makefile.fourfour 2003-06-04 14:21:54.000000000 -0400 +++ linux-2.4.20/arch/i386/kernel/Makefile 2003-06-04 14:21:58.000000000 -0400 @@ -18,7 +18,8 @@ export-objs := mca.o mtrr.o msr.o cp obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o vm86.o \ ptrace.o i8259.o ioport.o ldt.o setup.o time.o sys_i386.o \ - pci-dma.o i386_ksyms.o i387.o bluesmoke.o dmi_scan.o + pci-dma.o i386_ksyms.o i387.o bluesmoke.o dmi_scan.o \ + entry_trampoline.o doublefault.o ifdef CONFIG_PCI --- linux-2.4.20/arch/i386/kernel/vm86.c.fourfour 2003-06-04 14:21:58.000000000 -0400 +++ linux-2.4.20/arch/i386/kernel/vm86.c 2003-06-04 14:21:58.000000000 -0400 @@ -112,7 +112,8 @@ struct pt_regs * save_v86_state(struct k do_exit(SIGSEGV); } tss = init_tss + smp_processor_id(); - tss->esp0 = current->thread.esp0 = current->thread.saved_esp0; + current->thread.esp0 = current->thread.saved_esp0; + tss->esp0 = virtual_esp0(current); current->thread.saved_esp0 = 0; loadsegment(fs, current->thread.saved_fs); loadsegment(gs, current->thread.saved_gs); @@ -238,6 +239,8 @@ out: } +asmlinkage void ret_from_sys_call(void); + static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk) { struct tss_struct *tss; @@ -285,7 +288,8 @@ 
static void do_sys_vm86(struct kernel_vm asm volatile("movl %%gs,%0":"=m" (tsk->thread.saved_gs)); tss = init_tss + smp_processor_id(); - tss->esp0 = tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0; + tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0; + tss->esp0 = virtual_esp0(tsk); tsk->thread.screen_bitmap = info->screen_bitmap; if (info->flags & VM86_SCREEN_BITMAP) @@ -293,9 +297,9 @@ static void do_sys_vm86(struct kernel_vm __asm__ __volatile__( "xorl %%eax,%%eax; movl %%eax,%%fs; movl %%eax,%%gs\n\t" "movl %0,%%esp\n\t" - "jmp ret_from_sys_call" + "pushl %2; ret;" : /* no outputs */ - :"r" (&info->regs), "b" (tsk) : "ax"); + :"r" (&info->regs), "b" (tsk), "g" (ENTRY_TRAMP_ADDR(ret_from_sys_call)) : "ax"); /* we never return here */ } @@ -306,8 +310,8 @@ static inline void return_to_32bit(struc regs32 = save_v86_state(regs16); regs32->eax = retval; __asm__ __volatile__("movl %0,%%esp\n\t" - "jmp ret_from_sys_call" - : : "r" (regs32), "b" (current)); + "pushl %2; ret;" + : : "r" (regs32), "b" (current), "g" (ENTRY_TRAMP_ADDR(ret_from_sys_call))); } static inline void set_IF(struct kernel_vm86_regs * regs) --- linux-2.4.20/arch/i386/kernel/process.c.fourfour 2003-06-04 14:21:57.000000000 -0400 +++ linux-2.4.20/arch/i386/kernel/process.c 2003-06-04 14:21:58.000000000 -0400 @@ -47,6 +47,7 @@ #include #include #include +#include #ifdef CONFIG_MATH_EMULATION #include #endif @@ -312,6 +313,12 @@ void machine_real_restart(unsigned char from the kernel segment. This assumes the kernel segment starts at virtual address PAGE_OFFSET. */ + /* + * NOTE: this is a wrong 4G/4G PAE assumption. But it will triple + * fault the CPU (ie. reboot it) in a guaranteed way so we dont + * lose anything but the ability to warm-reboot. (which doesnt + * work on those big boxes using 4G/4G PAE anyway.) 
+ */ memcpy (swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, sizeof (swapper_pg_dir [0]) * KERNEL_PGD_PTRS); @@ -461,8 +468,7 @@ void show_regs(struct pt_regs * regs) printk("Pid/TGid: %d/%d, comm: %20s\n", current->pid, current->tgid, current->comm); printk("EIP: %04x:[<%08lx>] CPU: %d",0xffff & regs->xcs,regs->eip, smp_processor_id()); printk("\nEIP is at %s (" UTS_RELEASE ")\n",buffer); - if (regs->xcs & 3) - printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp); + printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp); printk(" EFLAGS: %08lx %s\n",regs->eflags, print_tainted()); printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", regs->eax,regs->ebx,regs->ecx,regs->edx); @@ -550,9 +556,8 @@ void release_thread(struct task_struct * if (dead_task->mm) { // temporary debugging check if (dead_task->mm->context.size) { - printk("WARNING: dead process %8s still has LDT? <%p/%d>\n", + printk("WARNING: dead process %8s still has LDT? <%d>\n", dead_task->comm, - dead_task->mm->context.ldt, dead_task->mm->context.size); BUG(); } @@ -581,7 +586,17 @@ int copy_thread(int nr, unsigned long cl p->thread.esp = (unsigned long) childregs; p->thread.esp0 = (unsigned long) (childregs+1); - p->thread.eip = (unsigned long) ret_from_fork; + /* + * get the two stack pages, for the virtual stack. + * + * IMPORTANT: this code relies on the fact that the task + * structure is an 8K aligned piece of physical memory. + */ + p->thread.stack_page0 = virt_to_page((unsigned long)p); + p->thread.stack_page1 = virt_to_page((unsigned long)p + PAGE_SIZE); + + p->thread.eip = (unsigned long) __ENTRY_TRAMP_ADDR(ret_from_fork); + p->real_stack = p; savesegment(fs,p->thread.fs); savesegment(gs,p->thread.gs); @@ -717,12 +732,28 @@ struct task_struct * __switch_to(struct unlazy_fpu(prev_p); + +#if CONFIG_X86_HIGH_ENTRY /* - * Reload esp0, LDT and the page table pointer: + * Set the ptes of the virtual stack. 
(NOTE: a TLB flush is + * needed because otherwise NMIs could interrupt the + * user-return code with a virtual stack and stale TLBs.) */ - tss->esp0 = next->esp0; + __kunmap_atomic_type(KM_VSTACK0); + __kunmap_atomic_type(KM_VSTACK1); + __kmap_atomic(next->stack_page0, KM_VSTACK0); + __kmap_atomic(next->stack_page1, KM_VSTACK1); /* + * Reload esp0: + */ + /* + * NOTE: here we rely on the task being the stack as well + */ + next_p->virtual_stack = (void *)__kmap_atomic_vaddr(KM_VSTACK0); +#endif + tss->esp0 = virtual_esp0(next_p); + /* * Load the per-thread Thread-Local Storage descriptor. */ load_TLS(next, cpu); --- /dev/null 2003-01-30 05:24:37.000000000 -0500 +++ linux-2.4.20/arch/i386/kernel/doublefault.c 2003-06-04 14:21:58.000000000 -0400 @@ -0,0 +1,59 @@ +#include +#include +#include +#include + +#include +#include +#include + +#define DOUBLEFAULT_STACKSIZE (1024) +static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE]; +#define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE) + +static void doublefault_fn(void) +{ + struct Xgt_desc_struct gdt_desc = {0, 0}; + unsigned long gdt, tss; + struct tss_struct *t; + + __asm__ __volatile__("sgdt %0": "=m" (gdt_desc): :"memory"); + gdt = gdt_desc.address; + + printk("double fault, gdt at %08lx [%d bytes]\n", gdt, gdt_desc.size); + + gdt += GDT_ENTRY_TSS << 3; + tss = *(u16 *)(gdt+2); + tss += *(u8 *)(gdt+4) << 16; + tss += *(u8 *)(gdt+7) << 24; + printk("double fault, tss at %08lx\n", tss); + + t = (struct tss_struct *)tss; + + printk("eip = %08lx, esp = %08lx, esp0 = %08lx\n", + t->eip, t->esp, t->esp0); + printk("cr3 = %08lx\n", t->__cr3); + printk("eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n", + t->eax, t->ebx, t->ecx, t->edx); + printk("esi = %08lx, edi = %08lx\n", t->esi, t->edi); + for (;;) /* nothing */; +} + +struct tss_struct doublefault_tss __cacheline_aligned = { + .esp0 = STACK_START, + .ss0 = __KERNEL_DS, + .ldt = 0, + .bitmap = INVALID_IO_BITMAP_OFFSET, + .io_bitmap = 
{ [0 ... IO_BITMAP_SIZE ] = ~0 }, + + .eip = (unsigned long) doublefault_fn, + .eflags = 0x00000082, + .esp = STACK_START, + .es = __USER_DS, + .cs = __KERNEL_CS, + .ss = __KERNEL_DS, + .ds = __USER_DS, + + .__cr3 = __pa(swapper_pg_dir) +}; + --- linux-2.4.20/arch/i386/kernel/ldt.c.fourfour 2003-06-04 14:21:54.000000000 -0400 +++ linux-2.4.20/arch/i386/kernel/ldt.c 2003-06-04 14:21:58.000000000 -0400 @@ -2,7 +2,7 @@ * linux/kernel/ldt.c * * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds - * Copyright (C) 1999 Ingo Molnar + * Copyright (C) 1999, 2003 Ingo Molnar */ #include @@ -18,6 +18,7 @@ #include #include #include +#include #ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ static void flush_ldt(void *mm) @@ -29,29 +30,26 @@ static void flush_ldt(void *mm) static int alloc_ldt(mm_context_t *pc, int mincount, int reload) { - void *oldldt; - void *newldt; - int oldsize; + int oldsize, newsize, i; if (mincount <= pc->size) return 0; + /* + * LDT got larger - reallocate if necessary. 
+ */ oldsize = pc->size; mincount = (mincount+511)&(~511); - if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) - newldt = vmalloc(mincount*LDT_ENTRY_SIZE); - else - newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); - - if (!newldt) - return -ENOMEM; - - if (oldsize) - memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE); - - oldldt = pc->ldt; - memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE); - wmb(); - pc->ldt = newldt; + newsize = mincount*LDT_ENTRY_SIZE; + for (i = 0; i < newsize; i += PAGE_SIZE) { + int nr = i/PAGE_SIZE; + BUG_ON(i >= 64*1024); + if (!pc->ldt_pages[nr]) { + pc->ldt_pages[nr] = alloc_page(GFP_HIGHUSER); + if (!pc->ldt_pages[nr]) + return -ENOMEM; + clear_highpage(pc->ldt_pages[nr]); + } + } pc->size = mincount; if (reload) { load_LDT(pc); @@ -60,25 +58,20 @@ static int alloc_ldt(mm_context_t *pc, i smp_call_function(flush_ldt, 0, 1, 1); #endif } - wmb(); - if (oldsize) { - if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(oldldt); - else - kfree(oldldt); - } return 0; } static inline int copy_ldt(mm_context_t *new, mm_context_t *old) { - int err = alloc_ldt(new, old->size, 0); + int i, err, size = old->size, nr_pages = (size*LDT_ENTRY_SIZE + PAGE_SIZE-1)/PAGE_SIZE; + + err = alloc_ldt(new, size, 0); if (err < 0) { - printk(KERN_WARNING "ldt allocation failed\n"); new->size = 0; return err; } - memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE); + for (i = 0; i < nr_pages; i++) + copy_user_highpage(new->ldt_pages[i], old->ldt_pages[i], 0); return 0; } @@ -93,6 +86,7 @@ int init_new_context(struct task_struct init_MUTEX(&mm->context.sem); mm->context.size = 0; + memset(mm->context.ldt_pages, 0, sizeof(struct page *) * MAX_LDT_PAGES); old_mm = current->mm; if (old_mm && old_mm->context.size > 0) { down(&old_mm->context.sem); @@ -109,18 +103,16 @@ int init_new_context(struct task_struct */ void destroy_context(struct mm_struct *mm) { - if (mm->context.size) { - if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE) - 
vfree(mm->context.ldt); - else - kfree(mm->context.ldt); - mm->context.size = 0; - } + int i, nr_pages = (mm->context.size*LDT_ENTRY_SIZE + PAGE_SIZE-1) / PAGE_SIZE; + + for (i = 0; i < nr_pages; i++) + __free_page(mm->context.ldt_pages[i]); + mm->context.size = 0; } static int read_ldt(void * ptr, unsigned long bytecount) { - int err; + int err, i; unsigned long size; struct mm_struct * mm = current->mm; @@ -135,8 +127,25 @@ static int read_ldt(void * ptr, unsigned size = bytecount; err = 0; - if (copy_to_user(ptr, mm->context.ldt, size)) - err = -EFAULT; + /* + * This is necessary just in case we got here straight from a + * context-switch where the ptes were set but no tlb flush + * was done yet. We rather avoid doing a TLB flush in the + * context-switch path and do it here instead. + */ + __flush_tlb_global(); + + for (i = 0; i < size; i += PAGE_SIZE) { + int nr = i / PAGE_SIZE, bytes; + char *kaddr = kmap(mm->context.ldt_pages[nr]); + + bytes = size - i; + if (bytes > PAGE_SIZE) + bytes = PAGE_SIZE; /* each kmap() above maps only this one LDT page */ + if (copy_to_user(ptr + i, kaddr, bytes)) + err = -EFAULT; + kunmap(mm->context.ldt_pages[nr]); + } up(&mm->context.sem); if (err < 0) return err; @@ -155,7 +164,7 @@ static int read_default_ldt(void * ptr, err = 0; address = &default_ldt[0]; - size = 5*sizeof(struct desc_struct); + size = 5*LDT_ENTRY_SIZE; if (size > bytecount) size = bytecount; @@ -197,7 +206,14 @@ static int write_ldt(void * ptr, unsigne goto out_unlock; } - lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt); + /* + * No rescheduling allowed from this point to the install. + * + * We do a TLB flush for the same reason as in the read_ldt() path. + */ + __flush_tlb_global(); + lp = (__u32 *) ((ldt_info.entry_number << 3) + + (char *) __kmap_atomic_vaddr(KM_LDT_PAGE0)); /* Allow LDTs to be cleared by the user.
*/ if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { @@ -245,3 +261,27 @@ asmlinkage int sys_modify_ldt(int func, } return ret; } + +/* + * load one particular LDT into the current CPU + */ +void load_LDT(mm_context_t *pc) +{ + struct page **pages = pc->ldt_pages; + int count = pc->size; + int nr_pages, i; + + if (!count) { + pages = &default_ldt_page; + count = 5; + } + nr_pages = (count*LDT_ENTRY_SIZE + PAGE_SIZE-1) / PAGE_SIZE; + + for (i = 0; i < nr_pages; i++) { + __kunmap_atomic_type(KM_LDT_PAGE0 - i); + __kmap_atomic(pages[i], KM_LDT_PAGE0 - i); + } + set_ldt_desc(smp_processor_id(), + (void *)__kmap_atomic_vaddr(KM_LDT_PAGE0), count); + load_LDT_desc(); +} --- linux-2.4.20/arch/i386/kernel/init_task.c.fourfour 2003-06-04 14:21:54.000000000 -0400 +++ linux-2.4.20/arch/i386/kernel/init_task.c 2003-06-04 14:21:58.000000000 -0400 @@ -30,5 +30,5 @@ union task_union init_task_union * section. Since TSS's are completely CPU-local, we want them * on exact cacheline boundaries, to eliminate cacheline ping-pong. */ -struct tss_struct init_tss[NR_CPUS] __cacheline_aligned = { [0 ... NR_CPUS-1] = INIT_TSS }; +struct tss_struct init_tss[NR_CPUS] __attribute__((__section__(".data.tss"))) = { [0 ... NR_CPUS-1] = INIT_TSS }; --- linux-2.4.20/arch/i386/kernel/mpparse.c.fourfour 2003-06-04 14:21:54.000000000 -0400 +++ linux-2.4.20/arch/i386/kernel/mpparse.c 2003-06-04 14:21:58.000000000 -0400 @@ -799,7 +799,7 @@ void __init get_smp_config (void) * Read the physical hardware table. Anything here will * override the defaults. */ - if (!smp_read_mpc((void *)mpf->mpf_physptr)) { + if (!smp_read_mpc((void *)phys_to_virt(mpf->mpf_physptr))) { smp_found_config = 0; printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"); printk(KERN_ERR "... disabling SMP support. 
(tell your hw vendor)\n"); --- linux-2.4.20/arch/i386/kernel/signal.c.fourfour 2003-06-04 14:21:54.000000000 -0400 +++ linux-2.4.20/arch/i386/kernel/signal.c 2003-06-04 14:21:58.000000000 -0400 @@ -186,25 +186,25 @@ struct rt_sigframe }; static int -restore_sigcontext(struct pt_regs *regs, struct sigcontext *sc, int *peax) +restore_sigcontext(struct pt_regs *regs, struct sigcontext *__sc, int *peax) { - unsigned int err = 0; + struct sigcontext scratch; /* 88 bytes of scratch area */ -#define COPY(x) err |= __get_user(regs->x, &sc->x) + if (copy_from_user(&scratch, __sc, sizeof(scratch))) + return -EFAULT; + +#define COPY(x) regs->x = scratch.x #define COPY_SEG(seg) \ - { unsigned short tmp; \ - err |= __get_user(tmp, &sc->seg); \ + { unsigned short tmp = scratch.seg; \ regs->x##seg = tmp; } #define COPY_SEG_STRICT(seg) \ - { unsigned short tmp; \ - err |= __get_user(tmp, &sc->seg); \ + { unsigned short tmp = scratch.seg; \ regs->x##seg = tmp|3; } #define GET_SEG(seg) \ - { unsigned short tmp; \ - err |= __get_user(tmp, &sc->seg); \ + { unsigned short tmp = scratch.seg; \ loadsegment(seg,tmp); } GET_SEG(gs); @@ -223,27 +223,23 @@ restore_sigcontext(struct pt_regs *regs, COPY_SEG_STRICT(ss); { - unsigned int tmpflags; - err |= __get_user(tmpflags, &sc->eflags); + unsigned int tmpflags = scratch.eflags; regs->eflags = (regs->eflags & ~0x40DD5) | (tmpflags & 0x40DD5); regs->orig_eax = -1; /* disable syscall checks */ } { - struct _fpstate * buf; - err |= __get_user(buf, &sc->fpstate); + struct _fpstate * buf = scratch.fpstate; if (buf) { if (verify_area(VERIFY_READ, buf, sizeof(*buf))) - goto badframe; - err |= restore_i387(buf); + return -EFAULT; + if (restore_i387(buf)) + return -EFAULT; } } - err |= __get_user(*peax, &sc->eax); - return err; - -badframe: - return 1; + *peax = scratch.eax; + return 0; } asmlinkage int sys_sigreturn(unsigned long __unused) @@ -316,46 +312,47 @@ badframe: */ static int -setup_sigcontext(struct sigcontext *sc, struct _fpstate *fpstate, 
+setup_sigcontext(struct sigcontext *__sc, struct _fpstate *fpstate, struct pt_regs *regs, unsigned long mask) { - int tmp, err = 0; + struct sigcontext sc; /* 88 bytes of scratch area */ + int tmp; tmp = 0; __asm__("movl %%gs,%0" : "=r"(tmp): "0"(tmp)); - err |= __put_user(tmp, (unsigned int *)&sc->gs); + *(unsigned int *)&sc.gs = tmp; __asm__("movl %%fs,%0" : "=r"(tmp): "0"(tmp)); - err |= __put_user(tmp, (unsigned int *)&sc->fs); - - err |= __put_user(regs->xes, (unsigned int *)&sc->es); - err |= __put_user(regs->xds, (unsigned int *)&sc->ds); - err |= __put_user(regs->edi, &sc->edi); - err |= __put_user(regs->esi, &sc->esi); - err |= __put_user(regs->ebp, &sc->ebp); - err |= __put_user(regs->esp, &sc->esp); - err |= __put_user(regs->ebx, &sc->ebx); - err |= __put_user(regs->edx, &sc->edx); - err |= __put_user(regs->ecx, &sc->ecx); - err |= __put_user(regs->eax, &sc->eax); - err |= __put_user(current->thread.trap_no, &sc->trapno); - err |= __put_user(current->thread.error_code, &sc->err); - err |= __put_user(regs->eip, &sc->eip); - err |= __put_user(regs->xcs, (unsigned int *)&sc->cs); - err |= __put_user(regs->eflags, &sc->eflags); - err |= __put_user(regs->esp, &sc->esp_at_signal); - err |= __put_user(regs->xss, (unsigned int *)&sc->ss); + *(unsigned int *)&sc.fs = tmp; + *(unsigned int *)&sc.es = regs->xes; + *(unsigned int *)&sc.ds = regs->xds; + sc.edi = regs->edi; + sc.esi = regs->esi; + sc.ebp = regs->ebp; + sc.esp = regs->esp; + sc.ebx = regs->ebx; + sc.edx = regs->edx; + sc.ecx = regs->ecx; + sc.eax = regs->eax; + sc.trapno = current->thread.trap_no; + sc.err = current->thread.error_code; + sc.eip = regs->eip; + *(unsigned int *)&sc.cs = regs->xcs; + sc.eflags = regs->eflags; + sc.esp_at_signal = regs->esp; + *(unsigned int *)&sc.ss = regs->xss; tmp = save_i387(fpstate); if (tmp < 0) - err = 1; - else - err |= __put_user(tmp ? fpstate : NULL, &sc->fpstate); + return 1; + sc.fpstate = tmp ? fpstate : NULL; /* non-iBCS2 extensions.. 
*/ - err |= __put_user(mask, &sc->oldmask); - err |= __put_user(current->thread.cr2, &sc->cr2); + sc.oldmask = mask; + sc.cr2 = current->thread.cr2; - return err; + if (copy_to_user(__sc, &sc, sizeof(sc))) + return 1; + return 0; } /* --- linux-2.4.20/arch/i386/kernel/i386_ksyms.c.fourfour 2003-06-04 14:21:54.000000000 -0400 +++ linux-2.4.20/arch/i386/kernel/i386_ksyms.c 2003-06-04 14:21:58.000000000 -0400 @@ -87,7 +87,6 @@ EXPORT_SYMBOL_NOVERS(__down_failed_inter EXPORT_SYMBOL_NOVERS(__down_failed_trylock); EXPORT_SYMBOL_NOVERS(__up_wakeup); /* Networking helper routines. */ -EXPORT_SYMBOL(csum_partial_copy_generic); /* Delay loops */ EXPORT_SYMBOL(__ndelay); EXPORT_SYMBOL(__udelay); @@ -102,6 +101,7 @@ EXPORT_SYMBOL(strtok); EXPORT_SYMBOL(strpbrk); EXPORT_SYMBOL(strstr); +#if !CONFIG_X86_UACCESS_INDIRECT EXPORT_SYMBOL(strncpy_from_user); EXPORT_SYMBOL(__strncpy_from_user); EXPORT_SYMBOL(clear_user); @@ -109,6 +109,13 @@ EXPORT_SYMBOL(__clear_user); EXPORT_SYMBOL(__generic_copy_from_user); EXPORT_SYMBOL(__generic_copy_to_user); EXPORT_SYMBOL(strnlen_user); +#else /* CONFIG_X86_UACCESS_INDIRECT */ +EXPORT_SYMBOL(direct_csum_partial_copy_generic); +#endif +EXPORT_SYMBOL(get_user_size); +EXPORT_SYMBOL(put_user_size); +EXPORT_SYMBOL(strlen_fromuser_size); +EXPORT_SYMBOL(zero_user_size); EXPORT_SYMBOL(pci_alloc_consistent); EXPORT_SYMBOL(pci_free_consistent); --- linux-2.4.20/arch/i386/Makefile.fourfour 2003-06-04 14:21:57.000000000 -0400 +++ linux-2.4.20/arch/i386/Makefile 2003-06-04 14:21:58.000000000 -0400 @@ -125,7 +125,7 @@ MAKEBOOT = $(MAKE) -C arch/$(ARCH)/boot vmlinux: arch/i386/vmlinux.lds -FORCE: ; +FORCE: arch/i386/vmlinux.lds .PHONY: zImage bzImage compressed zlilo bzlilo zdisk bzdisk install \ clean archclean archmrproper archdep @@ -162,3 +162,6 @@ archmrproper: archdep: @$(MAKEBOOT) dep + +arch/i386/vmlinux.lds: dummy + $(CPP) $(CPPFLAGS) -xc -P arch/i386/vmlinux.lds.in -o arch/i386/vmlinux.lds --- linux-2.4.20/arch/i386/config.in.fourfour 2003-06-04 
14:21:58.000000000 -0400 +++ linux-2.4.20/arch/i386/config.in 2003-06-04 14:21:58.000000000 -0400 @@ -240,6 +240,15 @@ if [ "$CONFIG_HIGHMEM" = "y" ]; then bool 'HIGHMEM I/O support' CONFIG_HIGHIO fi +bool '4 GB kernel-space and 4 GB user-space virtual memory support' CONFIG_X86_4G + +if [ "$CONFIG_X86_4G" = "y" ]; then + define_bool CONFIG_X86_SWITCH_PAGETABLES y + define_bool CONFIG_X86_4G_VM_LAYOUT y + define_bool CONFIG_X86_UACCESS_INDIRECT y + define_bool CONFIG_X86_HIGH_ENTRY y +fi + bool 'Math emulation' CONFIG_MATH_EMULATION bool 'MTRR (Memory Type Range Register) support' CONFIG_MTRR bool 'Symmetric multi-processing support' CONFIG_SMP --- /dev/null 2003-01-30 05:24:37.000000000 -0500 +++ linux-2.4.20/arch/i386/vmlinux.lds.in 2003-06-04 14:21:58.000000000 -0400 @@ -0,0 +1,106 @@ + +#define __ASSEMBLY__ +#include + +/* ld script to make i386 Linux kernel + * Written by Martin Mares ; + */ +OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") +ENTRY(_start) +SECTIONS +{ +#if CONFIG_X86_4G_VM_LAYOUT + . = 0x02000000 + 0x100000; +#else + . = 0xc0000000 + 0x100000; +#endif + _text = .; /* Text and read-only data */ + .text : { + *(.text) + *(.fixup) + *(.gnu.warning) + } = 0x9090 + + _etext = .; /* End of text section */ + + .rodata : { *(.rodata) *(.rodata.*) } + .kstrtab : { *(.kstrtab) } + + . = ALIGN(16); /* Exception table */ + __start___ex_table = .; + __ex_table : { *(__ex_table) } + __stop___ex_table = .; + + __start___ksymtab = .; /* Kernel symbol table */ + __ksymtab : { *(__ksymtab) } + __stop___ksymtab = .; + __start___kallsyms = .; /* All kernel symbols */ + __kallsyms : { *(__kallsyms) } + __stop___kallsyms = .; + + .data : { /* Data */ + *(.data) + CONSTRUCTORS + } + + _edata = .; /* End of data section */ + + . = ALIGN(8192); /* init_task */ + .data.init_task : { *(.data.init_task) } + + entry_tramp_start = .; + . = ALIGN(4096); /* kernel entry code */ + .entry.text : { *(.entry.text) } + entry_tramp_end = .; + + . 
= ALIGN(4096); /* Init code and data */ + __init_begin = .; + .text.init : { *(.text.init) } + .data.init : { *(.data.init) } + . = ALIGN(16); + __setup_start = .; + .setup.init : { *(.setup.init) } + __setup_end = .; + __initcall_start = .; + .initcall.init : { *(.initcall.init) } + __initcall_end = .; + . = ALIGN(4096); + __init_end = .; + + . = ALIGN(4096); + .data.page_aligned_tss : { *(.data.tss) } + + . = ALIGN(4096); + .data.page_aligned_default_ldt : { *(.data.default_ldt) } + + . = ALIGN(4096); + .data.page_aligned_idt : { *(.data.idt) } + + . = ALIGN(4096); + .data.page_aligned_gdt : { *(.data.gdt) } + + . = ALIGN(32); + .data.cacheline_aligned : { *(.data.cacheline_aligned) } + + __bss_start = .; /* BSS */ + .bss : { + *(.bss) + } + _end = . ; + + /* Sections to be discarded */ + /DISCARD/ : { + *(.text.exit) + *(.data.exit) + *(.exitcall.exit) + } + + /* Stabs debugging sections. */ + .stab 0 : { *(.stab) } + .stabstr 0 : { *(.stabstr) } + .stab.excl 0 : { *(.stab.excl) } + .stab.exclstr 0 : { *(.stab.exclstr) } + .stab.index 0 : { *(.stab.index) } + .stab.indexstr 0 : { *(.stab.indexstr) } + .comment 0 : { *(.comment) } +} --- linux-2.4.20/Documentation/Configure.help.fourfour 2003-06-04 14:21:57.000000000 -0400 +++ linux-2.4.20/Documentation/Configure.help 2003-06-04 14:21:58.000000000 -0400 @@ -298,6 +298,41 @@ CONFIG_X86_UP_APIC If you have a system with several CPUs, you do not need to say Y here: the local APIC will be used automatically. +4 GB kernel-space and 4 GB user-space virtual memory support +CONFIG_X86_4G + This option is only useful for systems that have more than 1 GB + of RAM. + + The default kernel VM layout leaves 1 GB of virtual memory for + kernel-space mappings, and 3 GB of VM for user-space applications. + This option ups both the kernel-space VM and the user-space VM to + 4 GB. + + The cost of this option is additional TLB flushes done at + system-entry points that transition from user-mode into kernel-mode. + I.e. 
system calls and page faults, and IRQs that interrupt user-mode + code. There's also additional overhead to kernel operations that copy + memory to/from user-space. The overhead from this is hard to predict and + depends on the workload - it can be anything from no visible overhead + to 20-30% overhead. A good rule of thumb is to assume a runtime + overhead of 20%. + + The upside is the much increased kernel-space VM, which more than + quadruples the maximum amount of RAM supported. Kernels compiled with + this option boot on 64 GB of RAM and still have more than 3.1 GB of + 'lowmem' left. Another bonus is that highmem IO bouncing decreases, + if used with drivers that still use bounce-buffers. + + There's also a 33% increase in user-space VM size - database + applications might see a boost from this. + + But the cost of the TLB flushes and the runtime overhead has to be + weighed against the bonuses offered by the larger VM spaces. The + dividing line depends on the actual workload - there might be 4 GB + systems that benefit from this option. Systems with less than 4 GB + of RAM will rarely see a benefit from this option - but it's not + out of the question; the exact circumstances have to be considered. + Kernel math emulation CONFIG_MATH_EMULATION Linux can emulate a math coprocessor (used for floating point