Index: linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/MAINTAINERS =================================================================== --- linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891.orig/MAINTAINERS 2004-11-18 20:59:11.000000000 -0500 +++ linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/MAINTAINERS 2004-11-18 23:25:15.000000000 -0500 @@ -1199,6 +1199,17 @@ W: http://www.cse.unsw.edu.au/~neilb/patches/linux-devel/ S: Maintained +KEXEC +P: Eric Biederman +P: Randy Dunlap +M: ebiederm@xmission.com +M: rddunlap@osdl.org +W: http://www.xmission.com/~ebiederm/files/kexec/ +W: http://developer.osdl.org/rddunlap/kexec/ +L: linux-kernel@vger.kernel.org +L: fastboot@osdl.org +S: Maintained + LANMEDIA WAN CARD DRIVER P: Andrew Stanley-Jones M: asj@lanmedia.com Index: linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/arch/x86_64/Kconfig =================================================================== --- linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891.orig/arch/x86_64/Kconfig 2004-11-18 20:59:11.000000000 -0500 +++ linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/arch/x86_64/Kconfig 2004-11-18 23:25:15.000000000 -0500 @@ -411,6 +411,23 @@ depends on IA32_EMULATION default y +config KEXEC + bool "kexec system call (EXPERIMENTAL)" + depends on EXPERIMENTAL + help + kexec is a system call that implements the ability to shutdown your + current kernel, and to start another kernel. It is like a reboot + but it is indepedent of the system firmware. And like a reboot + you can start any kernel with it, not just Linux. + + The name comes from the similiarity to the exec system call. + + It is an ongoing process to be certain the hardware in a machine + is properly shutdown, so do not be surprised if this code does not + initially work for you. It may help to enable device hotplugging + support. As of this writing the exact hardware interface is + strongly in flux, so no good recommendation can be made. + endmenu source drivers/Kconfig Index: linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/arch/x86_64/kernel/Makefile =================================================================== --- linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891.orig/arch/x86_64/kernel/Makefile 2004-11-11 10:28:46.000000000 -0500 +++ linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/arch/x86_64/kernel/Makefile 2004-11-18 23:26:29.000000000 -0500 @@ -19,6 +19,7 @@ obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o mpparse.o \ genapic.o genapic_cluster.o genapic_flat.o +obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o obj-$(CONFIG_PM) += suspend.o obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend_asm.o obj-$(CONFIG_CPU_FREQ) += cpufreq/ Index: linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/arch/x86_64/kernel/apic.c =================================================================== --- linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891.orig/arch/x86_64/kernel/apic.c 2004-11-11 10:28:46.000000000 -0500 +++ linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/arch/x86_64/kernel/apic.c 2004-11-18 23:25:15.000000000 -0500 @@ -143,6 +143,36 @@ outb(0x70, 0x22); outb(0x00, 0x23); } + else { + /* Go back to Virtual Wire compatibility mode */ + unsigned long value; + + /* For the spurious interrupt use vector F, and enable it */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + value |= APIC_SPIV_APIC_ENABLED; + value |= 0xf; + apic_write_around(APIC_SPIV, value); + + /* For LVT0 make it edge triggered, active high, external and enabled */ + value = apic_read(APIC_LVT0); + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXINT); + apic_write_around(APIC_LVT0, value); + + /* For LVT1 make it edge triggered, active high, nmi and enabled */ + value = apic_read(APIC_LVT1); + value &= ~( + APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); + apic_write_around(APIC_LVT1, value); + } } void disable_local_APIC(void) Index: linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/arch/x86_64/kernel/e820.c =================================================================== --- linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891.orig/arch/x86_64/kernel/e820.c 2004-04-03 22:36:53.000000000 -0500 +++ linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/arch/x86_64/kernel/e820.c 2004-11-18 23:25:15.000000000 -0500 @@ -185,8 +185,6 @@ int i; for (i = 0; i < e820.nr_map; i++) { struct resource *res; - if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL) - continue; res = alloc_bootmem_low(sizeof(struct resource)); switch (e820.map[i].type) { case E820_RAM: res->name = "System RAM"; break; Index: linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/arch/x86_64/kernel/i8259.c =================================================================== --- linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891.orig/arch/x86_64/kernel/i8259.c 2004-11-18 20:59:11.000000000 -0500 +++ linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/arch/x86_64/kernel/i8259.c 2004-11-18 23:25:15.000000000 -0500 @@ -318,6 +318,44 @@ } } +static int i8259A_resume(struct sys_device *dev) +{ + init_8259A(0); + return 0; +} + +static int i8259A_shutdown(struct sys_device *dev) +{ + /* Put the i8259A into a quiescent state that + * the kernel initialization code can get it + * out of. + */ + outb(0xff, 0x21); /* mask all of 8259A-1 */ + outb(0xff, 0xA1); /* mask all of 8259A-1 */ + return 0; +} + +static struct sysdev_class i8259_sysdev_class = { + set_kset_name("i8259"), + .resume = i8259A_resume, + .shutdown = i8259A_shutdown, +}; + +static struct sys_device device_i8259A = { + .id = 0, + .cls = &i8259_sysdev_class, +}; + +static int __init i8259A_init_sysfs(void) +{ + int error = sysdev_class_register(&i8259_sysdev_class); + if (!error) + error = sysdev_register(&device_i8259A); + return error; +} + +device_initcall(i8259A_init_sysfs); + void __init init_8259A(int auto_eoi) { unsigned long flags; Index: linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/arch/x86_64/kernel/io_apic.c =================================================================== --- linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891.orig/arch/x86_64/kernel/io_apic.c 2004-11-11 10:28:46.000000000 -0500 +++ linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/arch/x86_64/kernel/io_apic.c 2004-11-18 23:25:15.000000000 -0500 @@ -328,7 +328,7 @@ /* * Find the pin to which IRQ[irq] (ISA) is connected */ -static int __init find_isa_irq_pin(int irq, int type) +static int find_isa_irq_pin(int irq, int type) { int i; @@ -1112,11 +1112,43 @@ */ void disable_IO_APIC(void) { + int pin; /* * Clear the IO-APIC before rebooting: */ clear_IO_APIC(); + /* + * If the i82559 is routed through an IOAPIC + * Put that IOAPIC in virtual wire mode + * so legacy interrups can be delivered. + */ + pin = find_isa_irq_pin(0, mp_ExtINT); + if (pin != -1) { + struct IO_APIC_route_entry entry; + unsigned long flags; + + memset(&entry, 0, sizeof(entry)); + entry.mask = 0; /* Enabled */ + entry.trigger = 0; /* Edge */ + entry.irr = 0; + entry.polarity = 0; /* High */ + entry.delivery_status = 0; + entry.dest_mode = 0; /* Physical */ + entry.delivery_mode = 7; /* ExtInt */ + entry.vector = 0; + entry.dest.physical.physical_dest = 0; + + + /* + * Add it to the IO-APIC irq-routing table: + */ + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1)); + io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0)); + spin_unlock_irqrestore(&ioapic_lock, flags); + } + disconnect_bsp_APIC(); } Index: linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/arch/x86_64/kernel/machine_kexec.c =================================================================== --- linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891.orig/arch/x86_64/kernel/machine_kexec.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/arch/x86_64/kernel/machine_kexec.c 2004-11-18 23:25:15.000000000 -0500 @@ -0,0 +1,246 @@ +/* + * machine_kexec.c - handle transition of Linux booting another kernel + * Copyright (C) 2002-2004 Eric Biederman + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define LEVEL0_SIZE (1UL << 12UL) +#define LEVEL1_SIZE (1UL << 21UL) +#define LEVEL2_SIZE (1UL << 30UL) +#define LEVEL3_SIZE (1UL << 39UL) +#define LEVEL4_SIZE (1UL << 48UL) + +#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) +#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE) +#define L2_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) +#define L3_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) + +static void init_level2_page( + uint64_t *level2p, unsigned long addr) +{ + unsigned long end_addr; + addr &= PAGE_MASK; + end_addr = addr + LEVEL2_SIZE; + while(addr < end_addr) { + *(level2p++) = addr | L1_ATTR; + addr += LEVEL1_SIZE; + } +} + +static int init_level3_page(struct kimage *image, + uint64_t *level3p, unsigned long addr, unsigned long last_addr) +{ + unsigned long end_addr; + int result; + result = 0; + addr &= PAGE_MASK; + end_addr = addr + LEVEL3_SIZE; + while((addr < last_addr) && (addr < end_addr)) { + struct page *page; + uint64_t *level2p; + page = kimage_alloc_control_pages(image, 0); + if (!page) { + result = -ENOMEM; + goto out; + } + level2p = (uint64_t *)page_address(page); + init_level2_page(level2p, addr); + *(level3p++) = __pa(level2p) | L2_ATTR; + addr += LEVEL2_SIZE; + } + /* clear the unused entries */ + while(addr < end_addr) { + *(level3p++) = 0; + addr += LEVEL2_SIZE; + } +out: + return result; +} + + +static int init_level4_page(struct kimage *image, + uint64_t *level4p, unsigned long addr, unsigned long last_addr) +{ + unsigned long end_addr; + int result; + result = 0; + addr &= PAGE_MASK; + end_addr = addr + LEVEL4_SIZE; + while((addr < last_addr) && (addr < end_addr)) { + struct page *page; + uint64_t *level3p; + page = kimage_alloc_control_pages(image, 0); + if (!page) { + result = -ENOMEM; + goto out; + } + level3p = (uint64_t *)page_address(page); + result = init_level3_page(image, level3p, addr, last_addr); + if (result) { + goto out; + } + *(level4p++) = __pa(level3p) | L3_ATTR; + addr += LEVEL3_SIZE; + } + /* clear the unused entries */ + while(addr < end_addr) { + *(level4p++) = 0; + addr += LEVEL3_SIZE; + } + out: + return result; +} + + +static int init_pgtable(struct kimage *image, unsigned long start_pgtable) +{ + uint64_t *level4p; + level4p = (uint64_t *)__va(start_pgtable); + return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT); +} + +static void set_idt(void *newidt, __u16 limit) +{ + unsigned char curidt[10]; + + /* x86-64 supports unaliged loads & stores */ + (*(__u16 *)(curidt)) = limit; + (*(__u64 *)(curidt +2)) = (unsigned long)(newidt); + + __asm__ __volatile__ ( + "lidt %0\n" + : "=m" (curidt) + ); +}; + + +static void set_gdt(void *newgdt, __u16 limit) +{ + unsigned char curgdt[10]; + + /* x86-64 supports unaligned loads & stores */ + (*(__u16 *)(curgdt)) = limit; + (*(__u64 *)(curgdt +2)) = (unsigned long)(newgdt); + + __asm__ __volatile__ ( + "lgdt %0\n" + : "=m" (curgdt) + ); +}; + +static void load_segments(void) +{ + __asm__ __volatile__ ( + "\tmovl $"STR(__KERNEL_DS)",%eax\n" + "\tmovl %eax,%ds\n" + "\tmovl %eax,%es\n" + "\tmovl %eax,%ss\n" + "\tmovl %eax,%fs\n" + "\tmovl %eax,%gs\n" + ); +#undef STR +#undef __STR +} + +typedef void (*relocate_new_kernel_t)( + unsigned long indirection_page, unsigned long control_code_buffer, + unsigned long start_address, unsigned long pgtable); + +const extern unsigned char relocate_new_kernel[]; +extern void relocate_new_kernel_end(void); +const extern unsigned long relocate_new_kernel_size; + +int machine_kexec_prepare(struct kimage *image) +{ + unsigned long start_pgtable, control_code_buffer; + int result; + + /* Calculate the offsets */ + start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; + control_code_buffer = start_pgtable + 4096UL; + + /* Setup the identity mapped 64bit page table */ + result = init_pgtable(image, start_pgtable); + if (result) { + return result; + } + + /* Place the code in the reboot code buffer */ + memcpy(__va(control_code_buffer), relocate_new_kernel, relocate_new_kernel_size); + + return 0; +} + +void machine_kexec_cleanup(struct kimage *image) +{ + return; +} + +/* + * Do not allocate memory (or fail in any way) in machine_kexec(). + * We are past the point of no return, committed to rebooting now. + */ +void machine_kexec(struct kimage *image) +{ + unsigned long indirection_page; + unsigned long control_code_buffer; + unsigned long start_pgtable; + relocate_new_kernel_t rnk; + + /* Interrupts aren't acceptable while we reboot */ + local_irq_disable(); + + /* Calculate the offsets */ + indirection_page = image->head & PAGE_MASK; + start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; + control_code_buffer = start_pgtable + 4096UL; + + /* Set the low half of the page table to my identity mapped + * page table for kexec. Leave the high half pointing at the + * kernel pages. Don't bother to flush the global pages + * as that will happen when I fully switch to my identity mapped + * page table anyway. + */ + memcpy((void *)read_pda(level4_pgt), __va(start_pgtable), PAGE_SIZE/2); + __flush_tlb(); + + + /* The segment registers are funny things, they are + * automatically loaded from a table, in memory wherever you + * set them to a specific selector, but this table is never + * accessed again unless you set the segment to a different selector. + * + * The more common model are caches where the behide + * the scenes work is done, but is also dropped at arbitrary + * times. + * + * I take advantage of this here by force loading the + * segments, before I zap the gdt with an invalid value. + */ + load_segments(); + /* The gdt & idt are now invalid. + * If you want to load them you must set up your own idt & gdt. + */ + set_gdt(phys_to_virt(0),0); + set_idt(phys_to_virt(0),0); + /* now call it */ + rnk = (relocate_new_kernel_t) control_code_buffer; + (*rnk)(indirection_page, control_code_buffer, image->start, start_pgtable); +} Index: linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/arch/x86_64/kernel/reboot.c =================================================================== --- linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891.orig/arch/x86_64/kernel/reboot.c 2004-04-03 22:37:59.000000000 -0500 +++ linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/arch/x86_64/kernel/reboot.c 2004-11-18 23:25:15.000000000 -0500 @@ -91,31 +91,6 @@ [target] "b" (WARMBOOT_TRAMP)); } -#ifdef CONFIG_SMP -static void smp_halt(void) -{ - int cpuid = safe_smp_processor_id(); - static int first_entry = 1; - - if (first_entry) { - first_entry = 0; - smp_call_function((void *)machine_restart, NULL, 1, 0); - } - - smp_stop_cpu(); - - /* AP calling this. Just halt */ - if (cpuid != boot_cpu_id) { - for (;;) - asm("hlt"); - } - - /* Wait for all other CPUs to have run smp_stop_cpu */ - while (!cpus_empty(cpu_online_map)) - rep_nop(); -} -#endif - static inline void kb_wait(void) { int i; @@ -125,23 +100,45 @@ break; } -void machine_restart(char * __unused) +void machine_shutdown(void) { - int i; - + /* Stop the cpus and apics */ #ifdef CONFIG_SMP - smp_halt(); -#endif + int reboot_cpu_id; + + /* The boot cpu is always logical cpu 0 */ + reboot_cpu_id = 0; + + /* Make certain the cpu I'm about to reboot on is online */ + if (!cpu_isset(reboot_cpu_id, cpu_online_map)) { + reboot_cpu_id = smp_processor_id(); + } + + /* Make certain I only run on the appropriate processor */ + set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id)); + /* O.K Now that I'm on the appropriate processor, + * stop all of the others. + */ + smp_send_stop(); +#endif + local_irq_disable(); - + #ifndef CONFIG_SMP disable_local_APIC(); #endif disable_IO_APIC(); - + local_irq_enable(); +} + +void machine_restart(char * __unused) +{ + int i; + + machine_shutdown(); /* Tell the BIOS if we want cold or warm reboot */ *((unsigned short *)__va(0x472)) = reboot_mode; Index: linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/arch/x86_64/kernel/relocate_kernel.S =================================================================== --- linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891.orig/arch/x86_64/kernel/relocate_kernel.S 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/arch/x86_64/kernel/relocate_kernel.S 2004-11-18 23:25:15.000000000 -0500 @@ -0,0 +1,141 @@ +/* + * relocate_kernel.S - put the kernel image in place to boot + * Copyright (C) 2002-2004 Eric Biederman + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include + + /* + * Must be relocatable PIC code callable as a C function, that once + * it starts can not use the previous processes stack. + */ + .globl relocate_new_kernel + .code64 +relocate_new_kernel: + /* %rdi indirection_page + * %rsi reboot_code_buffer + * %rdx start address + * %rcx page_table + * %r8 arg5 + * %r9 arg6 + */ + + /* zero out flags, and disable interrupts */ + pushq $0 + popfq + + /* set a new stack at the bottom of our page... */ + lea 4096(%rsi), %rsp + + /* store the parameters back on the stack */ + pushq %rdx /* store the start address */ + + /* Set cr0 to a known state: + * 31 1 == Paging enabled + * 18 0 == Alignment check disabled + * 16 0 == Write protect disabled + * 3 0 == No task switch + * 2 0 == Don't do FP software emulation. + * 0 1 == Proctected mode enabled + */ + movq %cr0, %rax + andq $~((1<<18)|(1<<16)|(1<<3)|(1<<2)), %rax + orl $((1<<31)|(1<<0)), %eax + movq %rax, %cr0 + + /* Set cr4 to a known state: + * 10 0 == xmm exceptions disabled + * 9 0 == xmm registers instructions disabled + * 8 0 == performance monitoring counter disabled + * 7 0 == page global disabled + * 6 0 == machine check exceptions disabled + * 5 1 == physical address extension enabled + * 4 0 == page size extensions disabled + * 3 0 == Debug extensions disabled + * 2 0 == Time stamp disable (disabled) + * 1 0 == Protected mode virtual interrupts disabled + * 0 0 == VME disabled + */ + + movq $((1<<5)), %rax + movq %rax, %cr4 + + jmp 1f +1: + + /* Switch to the identity mapped page tables, + * and flush the TLB. + */ + movq %rcx, %cr3 + + /* Do the copies */ + movq %rdi, %rbx /* Put the indirection page in %rbx */ + xorq %rdi, %rdi + xorq %rsi, %rsi + +0: /* top, read another word for the indirection page */ + + movq (%rbx), %rcx + addq $8, %rbx + testq $0x1, %rcx /* is it a destination page? */ + jz 1f + movq %rcx, %rdi + andq $0xfffffffffffff000, %rdi + jmp 0b +1: + testq $0x2, %rcx /* is it an indirection page? */ + jz 1f + movq %rcx, %rbx + andq $0xfffffffffffff000, %rbx + jmp 0b +1: + testq $0x4, %rcx /* is it the done indicator? */ + jz 1f + jmp 2f +1: + testq $0x8, %rcx /* is it the source indicator? */ + jz 0b /* Ignore it otherwise */ + movq %rcx, %rsi /* For ever source page do a copy */ + andq $0xfffffffffffff000, %rsi + + movq $512, %rcx + rep ; movsq + jmp 0b +2: + + /* To be certain of avoiding problems with self-modifying code + * I need to execute a serializing instruction here. + * So I flush the TLB by reloading %cr3 here, it's handy, + * and not processor dependent. + */ + movq %cr3, %rax + movq %rax, %cr3 + + /* set all of the registers to known values */ + /* leave %rsp alone */ + + xorq %rax, %rax + xorq %rbx, %rbx + xorq %rcx, %rcx + xorq %rdx, %rdx + xorq %rsi, %rsi + xorq %rdi, %rdi + xorq %rbp, %rbp + xorq %r8, %r8 + xorq %r9, %r9 + xorq %r10, %r9 + xorq %r11, %r11 + xorq %r12, %r12 + xorq %r13, %r13 + xorq %r14, %r14 + xorq %r15, %r15 + + ret +relocate_new_kernel_end: + + .globl relocate_new_kernel_size +relocate_new_kernel_size: + .quad relocate_new_kernel_end - relocate_new_kernel Index: linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/include/asm-x86_64/apicdef.h =================================================================== --- linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891.orig/include/asm-x86_64/apicdef.h 2004-11-11 10:28:46.000000000 -0500 +++ linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/include/asm-x86_64/apicdef.h 2004-11-18 23:26:05.000000000 -0500 @@ -32,8 +32,8 @@ #define SET_APIC_LOGICAL_ID(x) (((x)<<24)) #define APIC_ALL_CPUS 0xFFu #define APIC_DFR 0xE0 -#define APIC_DFR_CLUSTER 0x0FFFFFFFu -#define APIC_DFR_FLAT 0xFFFFFFFFu +#define APIC_DFR_CLUSTER 0x0FFFFFFFul +#define APIC_DFR_FLAT 0xFFFFFFFFul #define APIC_SPIV 0xF0 #define APIC_SPIV_FOCUS_DISABLED (1<<9) #define APIC_SPIV_APIC_ENABLED (1<<8) @@ -89,6 +89,7 @@ #define APIC_LVT_REMOTE_IRR (1<<14) #define APIC_INPUT_POLARITY (1<<13) #define APIC_SEND_PENDING (1<<12) +#define APIC_MODE_MASK 0x700 #define GET_APIC_DELIVERY_MODE(x) (((x)>>8)&0x7) #define SET_APIC_DELIVERY_MODE(x,y) (((x)&~0x700)|((y)<<8)) #define APIC_MODE_FIXED 0x0 Index: linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/include/asm-x86_64/kexec.h =================================================================== --- linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891.orig/include/asm-x86_64/kexec.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/include/asm-x86_64/kexec.h 2004-11-18 23:25:15.000000000 -0500 @@ -0,0 +1,25 @@ +#ifndef _X86_64_KEXEC_H +#define _X86_64_KEXEC_H + +#include +#include + +/* + * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return. + * I.e. Maximum page that is mapped directly into kernel memory, + * and kmap is not required. + * + * So far x86_64 is limited to 40 physical address bits. + */ + +/* Maximum physical address we can use pages from */ +#define KEXEC_SOURCE_MEMORY_LIMIT (0xFFFFFFFFFFUL) +/* Maximum address we can reach in physical address mode */ +#define KEXEC_DESTINATION_MEMORY_LIMIT (0xFFFFFFFFFFUL) +/* Maximum address we can use for the control pages */ +#define KEXEC_CONTROL_MEMORY_LIMIT (0xFFFFFFFFFFUL) + +/* Allocate one page for the pdp and the second for the code */ +#define KEXEC_CONTROL_CODE_SIZE (4096UL + 4096UL) + +#endif /* _X86_64_KEXEC_H */ Index: linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/include/asm-x86_64/unistd.h =================================================================== --- linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891.orig/include/asm-x86_64/unistd.h 2004-11-11 10:28:49.000000000 -0500 +++ linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/include/asm-x86_64/unistd.h 2004-11-18 23:27:18.000000000 -0500 @@ -551,7 +551,22 @@ #define __NR_mq_getsetattr 245 __SYSCALL(__NR_mq_getsetattr, sys_mq_getsetattr) -#define __NR_syscall_max __NR_mq_getsetattr +#define __NR_mq_open 240 +__SYSCALL(__NR_mq_open, sys_ni_syscall) +#define __NR_mq_unlink 241 +__SYSCALL(__NR_mq_unlink, sys_ni_syscall) +#define __NR_mq_timedsend 242 +__SYSCALL(__NR_mq_timedsend, sys_ni_syscall) +#define __NR_mq_timedreceive 243 +__SYSCALL(__NR_mq_timedreceive, sys_ni_syscall) +#define __NR_mq_notify 244 +__SYSCALL(__NR_mq_notify, sys_ni_syscall) +#define __NR_mq_getsetattr 245 +__SYSCALL(__NR_mq_getsetattr, sys_ni_syscall) +#define __NR_kexec_load 246 +__SYSCALL(__NR_kexec_load, sys_kexec_load) + +#define __NR_syscall_max __NR_kexec_load #ifndef __NO_STUBS /* user-visible error numbers are in the range -1 - -4095 */ Index: linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/include/linux/kexec.h =================================================================== --- linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891.orig/include/linux/kexec.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/include/linux/kexec.h 2004-11-18 23:25:15.000000000 -0500 @@ -0,0 +1,56 @@ +#ifndef LINUX_KEXEC_H +#define LINUX_KEXEC_H + +#if CONFIG_KEXEC +#include +#include +#include + +/* + * This structure is used to hold the arguments that are used when loading + * kernel binaries. + */ + +typedef unsigned long kimage_entry_t; +#define IND_DESTINATION 0x1 +#define IND_INDIRECTION 0x2 +#define IND_DONE 0x4 +#define IND_SOURCE 0x8 + +#define KEXEC_SEGMENT_MAX 8 +struct kexec_segment { + void *buf; + size_t bufsz; + void *mem; + size_t memsz; +}; + +struct kimage { + kimage_entry_t head; + kimage_entry_t *entry; + kimage_entry_t *last_entry; + + unsigned long destination; + + unsigned long start; + struct page *control_code_page; + + unsigned long nr_segments; + struct kexec_segment segment[KEXEC_SEGMENT_MAX]; + + struct list_head control_pages; + struct list_head dest_pages; + struct list_head unuseable_pages; +}; + + +/* kexec interface functions */ +extern void machine_kexec(struct kimage *image); +extern int machine_kexec_prepare(struct kimage *image); +extern void machine_kexec_cleanup(struct kimage *image); +extern asmlinkage long sys_kexec(unsigned long entry, long nr_segments, + struct kexec_segment *segments); +extern struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order); +extern struct kimage *kexec_image; +#endif +#endif /* LINUX_KEXEC_H */ Index: linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/include/linux/reboot.h =================================================================== --- linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891.orig/include/linux/reboot.h 2004-04-03 22:38:27.000000000 -0500 +++ linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/include/linux/reboot.h 2004-11-18 23:25:15.000000000 -0500 @@ -22,6 +22,7 @@ * POWER_OFF Stop OS and remove all power from system, if possible. * RESTART2 Restart system using given command string. * SW_SUSPEND Suspend system using software suspend if compiled in. + * KEXEC Restart system using a previously loaded Linux kernel */ #define LINUX_REBOOT_CMD_RESTART 0x01234567 @@ -31,6 +32,7 @@ #define LINUX_REBOOT_CMD_POWER_OFF 0x4321FEDC #define LINUX_REBOOT_CMD_RESTART2 0xA1B2C3D4 #define LINUX_REBOOT_CMD_SW_SUSPEND 0xD000FCE2 +#define LINUX_REBOOT_CMD_KEXEC 0x45584543 #ifdef __KERNEL__ @@ -49,6 +51,8 @@ extern void machine_halt(void); extern void machine_power_off(void); +extern void machine_shutdown(void); + #endif #endif /* _LINUX_REBOOT_H */ Index: linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/include/linux/syscalls.h =================================================================== --- linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891.orig/include/linux/syscalls.h 2004-11-11 10:28:49.000000000 -0500 +++ linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/include/linux/syscalls.h 2004-11-18 23:25:15.000000000 -0500 @@ -19,6 +19,7 @@ struct iovec; struct itimerspec; struct itimerval; +struct kexec_segment; struct linux_dirent; struct linux_dirent64; struct list_head; @@ -154,6 +155,8 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user *arg); asmlinkage long sys_restart_syscall(void); +asmlinkage long sys_kexec_load(void *entry, unsigned long nr_segments, + struct kexec_segment *segments, unsigned long flags); asmlinkage long sys_exit(int error_code); asmlinkage void sys_exit_group(int error_code); Index: linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/kernel/Makefile =================================================================== --- linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891.orig/kernel/Makefile 2004-11-11 10:28:43.000000000 -0500 +++ linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/kernel/Makefile 2004-11-18 23:25:15.000000000 -0500 @@ -17,6 +17,7 @@ obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_PM) += power/ obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o +obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_PAGG) += pagg.o obj-$(CONFIG_IKCONFIG) += configs.o Index: linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/kernel/kexec.c =================================================================== --- linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891.orig/kernel/kexec.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/kernel/kexec.c 2004-11-18 23:25:15.000000000 -0500 @@ -0,0 +1,640 @@ +/* + * kexec.c - kexec system call + * Copyright (C) 2002-2004 Eric Biederman + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * When kexec transitions to the new kernel there is a one-to-one + * mapping between physical and virtual addresses. On processors + * where you can disable the MMU this is trivial, and easy. For + * others it is still a simple predictable page table to setup. + * + * In that environment kexec copies the new kernel to its final + * resting place. This means I can only support memory whose + * physical address can fit in an unsigned long. In particular + * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled. + * If the assembly stub has more restrictive requirements + * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be + * defined more restrictively in . + * + * The code for the transition from the current kernel to the + * the new kernel is placed in the control_code_buffer, whose size + * is given by KEXEC_CONTROL_CODE_SIZE. In the best case only a single + * page of memory is necessary, but some architectures require more. + * Because this memory must be identity mapped in the transition from + * virtual to physical addresses it must live in the range + * 0 - TASK_SIZE, as only the user space mappings are arbitrarily + * modifiable. + * + * The assembly stub in the control code buffer is passed a linked list + * of descriptor pages detailing the source pages of the new kernel, + * and the destination addresses of those source pages. As this data + * structure is not used in the context of the current OS, it must + * be self-contained. + * + * The code has been made to work with highmem pages and will use a + * destination page in its final resting place (if it happens + * to allocate it). The end product of this is that most of the + * physical address space, and most of RAM can be used. + * + * Future directions include: + * - allocating a page table with the control code buffer identity + * mapped, to simplify machine_kexec and make kexec_on_panic more + * reliable. + */ + +/* + * KIMAGE_NO_DEST is an impossible destination address..., for + * allocating pages whose destination address we do not care about. + */ +#define KIMAGE_NO_DEST (-1UL) + +static int kimage_is_destination_range( + struct kimage *image, unsigned long start, unsigned long end); +static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long dest); + + +static int kimage_alloc(struct kimage **rimage, + unsigned long nr_segments, struct kexec_segment *segments) +{ + int result; + struct kimage *image; + size_t segment_bytes; + unsigned long i; + + /* Allocate a controlling structure */ + result = -ENOMEM; + image = kmalloc(sizeof(*image), GFP_KERNEL); + if (!image) { + goto out; + } + memset(image, 0, sizeof(*image)); + image->head = 0; + image->entry = &image->head; + image->last_entry = &image->head; + + /* Initialize the list of control pages */ + INIT_LIST_HEAD(&image->control_pages); + + /* Initialize the list of destination pages */ + INIT_LIST_HEAD(&image->dest_pages); + + /* Initialize the list of unuseable pages */ + INIT_LIST_HEAD(&image->unuseable_pages); + + /* Read in the segments */ + image->nr_segments = nr_segments; + segment_bytes = nr_segments * sizeof*segments; + result = copy_from_user(image->segment, segments, segment_bytes); + if (result) + goto out; + + /* + * Verify we have good destination addresses. The caller is + * responsible for making certain we don't attempt to load + * the new image into invalid or reserved areas of RAM. This + * just verifies it is an address we can use. + */ + result = -EADDRNOTAVAIL; + for (i = 0; i < nr_segments; i++) { + unsigned long mend; + mend = ((unsigned long)(image->segment[i].mem)) + + image->segment[i].memsz; + if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) + goto out; + } + + /* + * Find a location for the control code buffer, and add it + * the vector of segments so that it's pages will also be + * counted as destination pages. + */ + result = -ENOMEM; + image->control_code_page = kimage_alloc_control_pages(image, + get_order(KEXEC_CONTROL_CODE_SIZE)); + if (!image->control_code_page) { + printk(KERN_ERR "Could not allocate control_code_buffer\n"); + goto out; + } + + result = 0; + out: + if (result == 0) { + *rimage = image; + } else { + kfree(image); + } + return result; +} + +static int kimage_is_destination_range( + struct kimage *image, unsigned long start, unsigned long end) +{ + unsigned long i; + + for (i = 0; i < image->nr_segments; i++) { + unsigned long mstart, mend; + mstart = (unsigned long)image->segment[i].mem; + mend = mstart + image->segment[i].memsz; + if ((end > mstart) && (start < mend)) { + return 1; + } + } + return 0; +} + +static struct page *kimage_alloc_pages(unsigned int gfp_mask, unsigned int order) +{ + struct page *pages; + pages = alloc_pages(gfp_mask, order); + if (pages) { + unsigned int count, i; + pages->mapping = NULL; + pages->private = order; + count = 1 << order; + for(i = 0; i < count; i++) { + SetPageReserved(pages + i); + } + } + return pages; +} + +static void kimage_free_pages(struct page *page) +{ + unsigned int order, count, i; + order = page->private; + count = 1 << order; + for(i = 0; i < count; i++) { + ClearPageReserved(page + i); + } + __free_pages(page, order); +} + +static void kimage_free_page_list(struct list_head *list) +{ + struct list_head *pos, *next; + list_for_each_safe(pos, next, list) { + struct page *page; + + page = list_entry(pos, struct page, lru); + list_del(&page->lru); + + kimage_free_pages(page); + } +} + +struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order) +{ + /* Control pages are special, they are the intermediaries + * that are needed while we copy the rest of the pages + * to their final resting place. As such they must + * not conflict with either the destination addresses + * or memory the kernel is already using. + * + * The only case where we really need more than one of + * these are for architectures where we cannot disable + * the MMU and must instead generate an identity mapped + * page table for all of the memory. + * + * At worst this runs in O(N) of the image size. + */ + struct list_head extra_pages; + struct page *pages; + unsigned int count; + + count = 1 << order; + INIT_LIST_HEAD(&extra_pages); + + /* Loop while I can allocate a page and the page allocated + * is a destination page. + */ + do { + unsigned long pfn, epfn, addr, eaddr; + pages = kimage_alloc_pages(GFP_KERNEL, order); + if (!pages) + break; + pfn = page_to_pfn(pages); + epfn = pfn + count; + addr = pfn << PAGE_SHIFT; + eaddr = epfn << PAGE_SHIFT; + if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) || + kimage_is_destination_range(image, addr, eaddr)) + { + list_add(&pages->lru, &extra_pages); + pages = NULL; + } + } while(!pages); + if (pages) { + /* Remember the allocated page... */ + list_add(&pages->lru, &image->control_pages); + + /* Because the page is already in it's destination + * location we will never allocate another page at + * that address. Therefore kimage_alloc_pages + * will not return it (again) and we don't need + * to give it an entry in image->segment[]. + */ + } + /* Deal with the destination pages I have inadvertently allocated. + * + * Ideally I would convert multi-page allocations into single + * page allocations, and add everyting to image->dest_pages. + * + * For now it is simpler to just free the pages. + */ + kimage_free_page_list(&extra_pages); + return pages; + +} + +static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) +{ + if (*image->entry != 0) { + image->entry++; + } + if (image->entry == image->last_entry) { + kimage_entry_t *ind_page; + struct page *page; + page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST); + if (!page) { + return -ENOMEM; + } + ind_page = page_address(page); + *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION; + image->entry = ind_page; + image->last_entry = + ind_page + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1); + } + *image->entry = entry; + image->entry++; + *image->entry = 0; + return 0; +} + +static int kimage_set_destination( + struct kimage *image, unsigned long destination) +{ + int result; + + destination &= PAGE_MASK; + result = kimage_add_entry(image, destination | IND_DESTINATION); + if (result == 0) { + image->destination = destination; + } + return result; +} + + +static int kimage_add_page(struct kimage *image, unsigned long page) +{ + int result; + + page &= PAGE_MASK; + result = kimage_add_entry(image, page | IND_SOURCE); + if (result == 0) { + image->destination += PAGE_SIZE; + } + return result; +} + + +static void kimage_free_extra_pages(struct kimage *image) +{ + /* Walk through and free any extra destination pages I may have */ + kimage_free_page_list(&image->dest_pages); + + /* Walk through and free any unuseable pages I have cached */ + kimage_free_page_list(&image->unuseable_pages); + +} +static int kimage_terminate(struct kimage *image) +{ + int result; + + result = kimage_add_entry(image, IND_DONE); + if (result == 0) { + /* Point at the terminating element */ + image->entry--; + kimage_free_extra_pages(image); + } + return result; +} + +#define for_each_kimage_entry(image, ptr, entry) \ + for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ + ptr = (entry & IND_INDIRECTION)? \ + phys_to_virt((entry & PAGE_MASK)): ptr +1) + +static void kimage_free_entry(kimage_entry_t entry) +{ + struct page *page; + + page = pfn_to_page(entry >> PAGE_SHIFT); + kimage_free_pages(page); +} + +static void kimage_free(struct kimage *image) +{ + kimage_entry_t *ptr, entry; + kimage_entry_t ind = 0; + + if (!image) + return; + kimage_free_extra_pages(image); + for_each_kimage_entry(image, ptr, entry) { + if (entry & IND_INDIRECTION) { + /* Free the previous indirection page */ + if (ind & IND_INDIRECTION) { + kimage_free_entry(ind); + } + /* Save this indirection page until we are + * done with it. + */ + ind = entry; + } + else if (entry & IND_SOURCE) { + kimage_free_entry(entry); + } + } + /* Free the final indirection page */ + if (ind & IND_INDIRECTION) { + kimage_free_entry(ind); + } + + /* Handle any machine specific cleanup */ + machine_kexec_cleanup(image); + + /* Free the kexec control pages... */ + kimage_free_page_list(&image->control_pages); + kfree(image); +} + +static kimage_entry_t *kimage_dst_used(struct kimage *image, unsigned long page) +{ + kimage_entry_t *ptr, entry; + unsigned long destination = 0; + + for_each_kimage_entry(image, ptr, entry) { + if (entry & IND_DESTINATION) { + destination = entry & PAGE_MASK; + } + else if (entry & IND_SOURCE) { + if (page == destination) { + return ptr; + } + destination += PAGE_SIZE; + } + } + return 0; +} + +static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long destination) +{ + /* + * Here we implement safeguards to ensure that a source page + * is not copied to its destination page before the data on + * the destination page is no longer useful. + * + * To do this we maintain the invariant that a source page is + * either its own destination page, or it is not a + * destination page at all. + * + * That is slightly stronger than required, but the proof + * that no problems will not occur is trivial, and the + * implementation is simply to verify. + * + * When allocating all pages normally this algorithm will run + * in O(N) time, but in the worst case it will run in O(N^2) + * time. If the runtime is a problem the data structures can + * be fixed. + */ + struct page *page; + unsigned long addr; + + /* + * Walk through the list of destination pages, and see if I + * have a match. + */ + list_for_each_entry(page, &image->dest_pages, lru) { + addr = page_to_pfn(page) << PAGE_SHIFT; + if (addr == destination) { + list_del(&page->lru); + return page; + } + } + page = NULL; + while (1) { + kimage_entry_t *old; + + /* Allocate a page, if we run out of memory give up */ + page = kimage_alloc_pages(gfp_mask, 0); + if (!page) { + return 0; + } + /* If the page cannot be used file it away */ + if (page_to_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { + list_add(&page->lru, &image->unuseable_pages); + continue; + } + addr = page_to_pfn(page) << PAGE_SHIFT; + + /* If it is the destination page we want use it */ + if (addr == destination) + break; + + /* If the page is not a destination page use it */ + if (!kimage_is_destination_range(image, addr, addr + PAGE_SIZE)) + break; + + /* + * I know that the page is someones destination page. + * See if there is already a source page for this + * destination page. And if so swap the source pages. + */ + old = kimage_dst_used(image, addr); + if (old) { + /* If so move it */ + unsigned long old_addr; + struct page *old_page; + + old_addr = *old & PAGE_MASK; + old_page = pfn_to_page(old_addr >> PAGE_SHIFT); + copy_highpage(page, old_page); + *old = addr | (*old & ~PAGE_MASK); + + /* The old page I have found cannot be a + * destination page, so return it. + */ + addr = old_addr; + page = old_page; + break; + } + else { + /* Place the page on the destination list I + * will use it later. + */ + list_add(&page->lru, &image->dest_pages); + } + } + return page; +} + +static int kimage_load_segment(struct kimage *image, + struct kexec_segment *segment) +{ + unsigned long mstart; + int result; + unsigned long offset; + unsigned long offset_end; + unsigned char *buf; + + result = 0; + buf = segment->buf; + mstart = (unsigned long)segment->mem; + + offset_end = segment->memsz; + + result = kimage_set_destination(image, mstart); + if (result < 0) { + goto out; + } + for (offset = 0; offset < segment->memsz; offset += PAGE_SIZE) { + struct page *page; + char *ptr; + size_t size, leader; + page = kimage_alloc_page(image, GFP_HIGHUSER, mstart + offset); + if (page == 0) { + result = -ENOMEM; + goto out; + } + result = kimage_add_page(image, page_to_pfn(page) << PAGE_SHIFT); + if (result < 0) { + goto out; + } + ptr = kmap(page); + if (segment->bufsz < offset) { + /* We are past the end zero the whole page */ + memset(ptr, 0, PAGE_SIZE); + kunmap(page); + continue; + } + size = PAGE_SIZE; + leader = 0; + if ((offset == 0)) { + leader = mstart & ~PAGE_MASK; + } + if (leader) { + /* We are on the first page zero the unused portion */ + memset(ptr, 0, leader); + size -= leader; + ptr += leader; + } + if (size > (segment->bufsz - offset)) { + size = segment->bufsz - offset; + } + if (size < (PAGE_SIZE - leader)) { + /* zero the trailing part of the page */ + memset(ptr + size, 0, (PAGE_SIZE - leader) - size); + } + result = copy_from_user(ptr, buf + offset, size); + kunmap(page); + if (result) { + result = (result < 0) ? result : -EIO; + goto out; + } + } + out: + return result; +} + +/* + * Exec Kernel system call: for obvious reasons only root may call it. + * + * This call breaks up into three pieces. + * - A generic part which loads the new kernel from the current + * address space, and very carefully places the data in the + * allocated pages. + * + * - A generic part that interacts with the kernel and tells all of + * the devices to shut down. Preventing on-going dmas, and placing + * the devices in a consistent state so a later kernel can + * reinitialize them. + * + * - A machine specific part that includes the syscall number + * and the copies the image to it's final destination. And + * jumps into the image at entry. + * + * kexec does not sync, or unmount filesystems so if you need + * that to happen you need to do that yourself. + */ +struct kimage *kexec_image = NULL; + +asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, + struct kexec_segment *segments, unsigned long flags) +{ + struct kimage *image; + int result; + + /* We only trust the superuser with rebooting the system. */ + if (!capable(CAP_SYS_BOOT)) + return -EPERM; + + /* + * In case we need just a little bit of special behavior for + * reboot on panic. + */ + if (flags != 0) + return -EINVAL; + + if (nr_segments > KEXEC_SEGMENT_MAX) + return -EINVAL; + + image = NULL; + result = 0; + + if (nr_segments > 0) { + unsigned long i; + result = kimage_alloc(&image, nr_segments, segments); + if (result) { + goto out; + } + result = machine_kexec_prepare(image); + if (result) { + goto out; + } + image->start = entry; + for (i = 0; i < nr_segments; i++) { + result = kimage_load_segment(image, &image->segment[i]); + if (result) { + goto out; + } + } + result = kimage_terminate(image); + if (result) { + goto out; + } + } + + image = xchg(&kexec_image, image); + + out: + kimage_free(image); + return result; +} Index: linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/kernel/sys.c =================================================================== --- linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891.orig/kernel/sys.c 2004-11-11 10:28:49.000000000 -0500 +++ linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/kernel/sys.c 2004-11-18 23:25:15.000000000 -0500 @@ -17,6 +17,8 @@ #include #include #include +#include +#include #include #include #include @@ -226,6 +228,7 @@ cond_syscall(sys_lookup_dcookie) cond_syscall(sys_swapon) cond_syscall(sys_swapoff) +cond_syscall(sys_kexec_load) cond_syscall(sys_init_module) cond_syscall(sys_delete_module) cond_syscall(sys_socketpair) @@ -505,6 +508,24 @@ machine_restart(buffer); break; +#ifdef CONFIG_KEXEC + case LINUX_REBOOT_CMD_KEXEC: + { + struct kimage *image; + image = xchg(&kexec_image, 0); + if (!image) { + unlock_kernel(); + return -EINVAL; + } + notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL); + system_state = SYSTEM_BOOTING; + device_shutdown(); + printk(KERN_EMERG "Starting new kernel\n"); + machine_shutdown(); + machine_kexec(image); + break; + } +#endif #ifdef CONFIG_SOFTWARE_SUSPEND case LINUX_REBOOT_CMD_SW_SUSPEND: {