1 # This is a BitKeeper generated patch for the following project:
2 # Project Name: Linux kernel tree
3 # This patch format is intended for GNU patch command version 2.5 or higher.
4 # This patch includes the following deltas:
5 # ChangeSet 1.1376 -> 1.1380
6 # arch/i386/kernel/smp.c 1.32 -> 1.33
7 # kernel/sys.c 1.47 -> 1.48
8 # arch/i386/Kconfig 1.62 -> 1.63
9 # arch/i386/kernel/Makefile 1.44 -> 1.45
10 # kernel/Makefile 1.28 -> 1.29
11 # arch/i386/kernel/entry.S 1.64 -> 1.65
12 # arch/i386/kernel/reboot.c 1.8 -> 1.9
13 # arch/i386/kernel/io_apic.c 1.71 -> 1.72
14 # arch/i386/kernel/dmi_scan.c 1.36 -> 1.37
15 # fs/aio.c 1.32 -> 1.33
16 # include/asm-i386/apicdef.h 1.8 -> 1.9
17 # MAINTAINERS 1.149 -> 1.150
18 # include/asm-i386/unistd.h 1.26 -> 1.27
19 # arch/i386/defconfig 1.96 -> 1.97
20 # arch/i386/kernel/i8259.c 1.25 -> 1.26
21 # include/asm-i386/apic.h 1.13 -> 1.14
22 # arch/i386/kernel/apic.c 1.42 -> 1.43
23 # include/linux/reboot.h 1.4 -> 1.5
24 # (new) -> 1.1 include/linux/kexec.h
25 # (new) -> 1.1 include/asm-i386/kexec.h
26 # (new) -> 1.1 kernel/kexec.c
27 # (new) -> 1.1 arch/i386/kernel/relocate_kernel.S
28 # (new) -> 1.1 arch/i386/kernel/machine_kexec.c
30 # The following is the BitKeeper ChangeSet Log
31 # --------------------------------------------
32 # 03/06/23 andyp@andyp.pdx.osdl.net 1.1377
33 # kexec2-2.5.73-common.patch
34 # --------------------------------------------
35 # 03/06/23 andyp@andyp.pdx.osdl.net 1.1378
36 # kexec2-2.5.73-x86.patch
37 # --------------------------------------------
38 # 03/06/23 andyp@andyp.pdx.osdl.net 1.1379
39 # kexec2-2.5.73-syscall.patch
40 # --------------------------------------------
41 # 03/06/23 andyp@andyp.pdx.osdl.net 1.1380
42 # kexec2-2.5.73-defconfig.patch
43 # --------------------------------------------
45 diff -Nru a/MAINTAINERS b/MAINTAINERS
46 --- a/MAINTAINERS Mon Jun 23 12:22:26 2003
47 +++ b/MAINTAINERS Mon Jun 23 12:22:26 2003
48 @@ -1067,6 +1067,17 @@
49 W: http://www.cse.unsw.edu.au/~neilb/patches/linux-devel/
54 +M: ebiederm@xmission.com
55 +M: ebiederman@lnxi.com
56 +W: http://www.xmission.com/~ebiederm/files/kexec/
59 +W: http://www.osdl.org/archive/andyp/bloom/Code/Linux/Kexec/
60 +L: linux-kernel@vger.kernel.org
63 LANMEDIA WAN CARD DRIVER
64 P: Andrew Stanley-Jones
66 diff -Nru a/arch/i386/Kconfig b/arch/i386/Kconfig
67 --- a/arch/i386/Kconfig Mon Jun 23 12:22:26 2003
68 +++ b/arch/i386/Kconfig Mon Jun 23 12:22:26 2003
70 depends on ((X86_SUMMIT || X86_GENERICARCH) && NUMA)
74 + bool "kexec system call (EXPERIMENTAL)"
75 + depends on EXPERIMENTAL
77 + kexec is a system call that implements the ability to shutdown your
78 + current kernel, and to start another kernel. It is like a reboot
79 + but it is independent of the system firmware. And like a reboot
80 + you can start any kernel with it not just Linux.
82 + The name comes from the similarity to the exec system call.
84 + It is an ongoing process to be certain the hardware in a machine
85 + is properly shut down, so do not be surprised if this code does not
86 + initially work for you. It may help to enable device hotplugging
87 + support. As of this writing the exact hardware interface is
88 + strongly in flux, so no good recommendation can be made.
93 diff -Nru a/arch/i386/defconfig b/arch/i386/defconfig
94 --- a/arch/i386/defconfig Mon Jun 23 12:22:26 2003
95 +++ b/arch/i386/defconfig Mon Jun 23 12:22:26 2003
97 CONFIG_X86_LOCAL_APIC=y
102 # CONFIG_X86_MCE_NONFATAL is not set
103 CONFIG_X86_MCE_P4THERMAL=y
104 diff -Nru a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile
105 --- a/arch/i386/kernel/Makefile Mon Jun 23 12:22:26 2003
106 +++ b/arch/i386/kernel/Makefile Mon Jun 23 12:22:26 2003
108 obj-$(CONFIG_X86_MPPARSE) += mpparse.o
109 obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
110 obj-$(CONFIG_X86_IO_APIC) += io_apic.o
111 +obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o
112 obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend.o suspend_asm.o
113 obj-$(CONFIG_X86_NUMAQ) += numaq.o
114 obj-$(CONFIG_X86_SUMMIT) += summit.o
115 diff -Nru a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c
116 --- a/arch/i386/kernel/apic.c Mon Jun 23 12:22:26 2003
117 +++ b/arch/i386/kernel/apic.c Mon Jun 23 12:22:26 2003
119 #include <linux/mc146818rtc.h>
120 #include <linux/kernel_stat.h>
121 #include <linux/sysdev.h>
122 +#include <linux/reboot.h>
124 #include <asm/atomic.h>
132 + /* Go back to Virtual Wire compatibility mode */
133 + unsigned long value;
135 + /* For the spurious interrupt use vector F, and enable it */
136 + value = apic_read(APIC_SPIV);
137 + value &= ~APIC_VECTOR_MASK;
138 + value |= APIC_SPIV_APIC_ENABLED;
140 + apic_write_around(APIC_SPIV, value);
142 + /* For LVT0 make it edge triggered, active high, external and enabled */
143 + value = apic_read(APIC_LVT0);
144 + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
145 + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
146 + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED );
147 + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
148 + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXINT);
149 + apic_write_around(APIC_LVT0, value);
151 + /* For LVT1 make it edge triggered, active high, nmi and enabled */
152 + value = apic_read(APIC_LVT1);
154 + APIC_MODE_MASK | APIC_SEND_PENDING |
155 + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
156 + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
157 + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
158 + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
159 + apic_write_around(APIC_LVT1, value);
161 +#endif /* CONFIG_KEXEC */
165 void disable_local_APIC(void)
166 @@ -1113,6 +1147,26 @@
167 printk (KERN_INFO "APIC error on CPU%d: %02lx(%02lx)\n",
168 smp_processor_id(), v , v1);
172 +void stop_apics(void)
174 + /* By resetting the APIC's we disable the nmi watchdog */
177 + * Stop all CPUs and turn off local APICs and the IO-APIC, so
178 + * other OSs see a clean IRQ state.
182 + disable_local_APIC();
184 +#if defined(CONFIG_X86_IO_APIC)
185 + if (smp_found_config) {
189 + disconnect_bsp_APIC();
193 diff -Nru a/arch/i386/kernel/dmi_scan.c b/arch/i386/kernel/dmi_scan.c
194 --- a/arch/i386/kernel/dmi_scan.c Mon Jun 23 12:22:26 2003
195 +++ b/arch/i386/kernel/dmi_scan.c Mon Jun 23 12:22:26 2003
201 - * Some machines require the "reboot=s" commandline option, this quirk makes that automatic.
203 -static __init int set_smp_reboot(struct dmi_blacklist *d)
206 - extern int reboot_smp;
207 - if (reboot_smp == 0)
210 - printk(KERN_INFO "%s series board detected. Selecting SMP-method for reboots.\n", d->ident);
217 - * Some machines require the "reboot=b,s" commandline option, this quirk makes that automatic.
219 -static __init int set_smp_bios_reboot(struct dmi_blacklist *d)
222 - set_bios_reboot(d);
227 * Some bioses have a broken protected mode poweroff and need to use realmode
229 MATCH(DMI_BIOS_VERSION, "4.60 PGMA"),
230 MATCH(DMI_BIOS_DATE, "134526184"), NO_MATCH
232 - { set_smp_bios_reboot, "Dell PowerEdge 1300", { /* Handle problems with rebooting on Dell 1300's */
233 + { set_bios_reboot, "Dell PowerEdge 1300", { /* Handle problems with rebooting on Dell 1300's */
234 MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
235 MATCH(DMI_PRODUCT_NAME, "PowerEdge 1300/"),
237 diff -Nru a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
238 --- a/arch/i386/kernel/entry.S Mon Jun 23 12:22:26 2003
239 +++ b/arch/i386/kernel/entry.S Mon Jun 23 12:22:26 2003
241 .long sys_clock_nanosleep
244 + .long sys_kexec_load /* 270 */
246 nr_syscalls=(.-sys_call_table)/4
247 diff -Nru a/arch/i386/kernel/i8259.c b/arch/i386/kernel/i8259.c
248 --- a/arch/i386/kernel/i8259.c Mon Jun 23 12:22:26 2003
249 +++ b/arch/i386/kernel/i8259.c Mon Jun 23 12:22:26 2003
254 +static int i8259A_shutdown(struct sys_device *dev)
256 + /* Put the i8259A into a quiescent state that
257 + * the kernel initialization code can get it
260 + outb(0xff, 0x21); /* mask all of 8259A-1 */
261 + outb(0xff, 0xA1); /* mask all of 8259A-2 */
265 static struct sysdev_class i8259_sysdev_class = {
266 set_kset_name("i8259"),
267 .resume = i8259A_resume,
268 + .shutdown = i8259A_shutdown,
271 static struct sys_device device_i8259A = {
272 diff -Nru a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
273 --- a/arch/i386/kernel/io_apic.c Mon Jun 23 12:22:26 2003
274 +++ b/arch/i386/kernel/io_apic.c Mon Jun 23 12:22:26 2003
275 @@ -1562,8 +1562,6 @@
276 * Clear the IO-APIC before rebooting:
280 - disconnect_bsp_APIC();
284 diff -Nru a/arch/i386/kernel/machine_kexec.c b/arch/i386/kernel/machine_kexec.c
285 --- /dev/null Wed Dec 31 16:00:00 1969
286 +++ b/arch/i386/kernel/machine_kexec.c Mon Jun 23 12:22:26 2003
288 +#include <linux/config.h>
289 +#include <linux/mm.h>
290 +#include <linux/kexec.h>
291 +#include <linux/delay.h>
292 +#include <asm/pgtable.h>
293 +#include <asm/pgalloc.h>
294 +#include <asm/tlbflush.h>
295 +#include <asm/mmu_context.h>
297 +#include <asm/apic.h>
302 + * =======================
306 +static void set_idt(void *newidt, __u16 limit)
308 + unsigned char curidt[6];
310 + /* ia32 supports unaligned loads & stores */
311 + (*(__u16 *)(curidt)) = limit;
312 + (*(__u32 *)(curidt +2)) = (unsigned long)(newidt);
314 + __asm__ __volatile__ (
321 +static void set_gdt(void *newgdt, __u16 limit)
323 + unsigned char curgdt[6];
325 + /* ia32 supports unaligned loads & stores */
326 + (*(__u16 *)(curgdt)) = limit;
327 + (*(__u32 *)(curgdt +2)) = (unsigned long)(newgdt);
329 + __asm__ __volatile__ (
335 +static void load_segments(void)
338 +#define STR(X) __STR(X)
340 + __asm__ __volatile__ (
341 + "\tljmp $"STR(__KERNEL_CS)",$1f\n"
343 + "\tmovl $"STR(__KERNEL_DS)",%eax\n"
344 + "\tmovl %eax,%ds\n"
345 + "\tmovl %eax,%es\n"
346 + "\tmovl %eax,%fs\n"
347 + "\tmovl %eax,%gs\n"
348 + "\tmovl %eax,%ss\n"
354 +typedef void (*relocate_new_kernel_t)(
355 + unsigned long indirection_page, unsigned long reboot_code_buffer,
356 + unsigned long start_address);
358 +const extern unsigned char relocate_new_kernel[];
359 +extern void relocate_new_kernel_end(void);
360 +const extern unsigned int relocate_new_kernel_size;
361 +extern void use_mm(struct mm_struct *mm);
363 +void machine_kexec(struct kimage *image)
365 + unsigned long indirection_page;
366 + unsigned long reboot_code_buffer;
367 + relocate_new_kernel_t rnk;
369 + /* switch to an mm where the reboot_code_buffer is identity mapped */
373 + /* Interrupts aren't acceptable while we reboot */
374 + local_irq_disable();
375 + reboot_code_buffer = page_to_pfn(image->reboot_code_pages) << PAGE_SHIFT;
376 + indirection_page = image->head & PAGE_MASK;
379 + memcpy((void *)reboot_code_buffer, relocate_new_kernel, relocate_new_kernel_size);
381 + /* The segment registers are funny things, they are
382 + * automatically loaded from a table, in memory wherever you
383 + * set them to a specific selector, but this table is never
384 + * accessed again unless you set the segment to a different selector.
386 + * The more common model is a cache where the behind-
387 + * the-scenes work is done, but is also dropped at arbitrary
390 + * I take advantage of this here by force loading the
391 + * segments, before I zap the gdt with an invalid value.
394 + /* The gdt & idt are now invalid.
395 + * If you want to load them you must set up your own idt & gdt.
397 + set_gdt(phys_to_virt(0),0);
398 + set_idt(phys_to_virt(0),0);
401 + rnk = (relocate_new_kernel_t) reboot_code_buffer;
402 + (*rnk)(indirection_page, reboot_code_buffer, image->start);
404 diff -Nru a/arch/i386/kernel/reboot.c b/arch/i386/kernel/reboot.c
405 --- a/arch/i386/kernel/reboot.c Mon Jun 23 12:22:26 2003
406 +++ b/arch/i386/kernel/reboot.c Mon Jun 23 12:22:26 2003
408 #include <linux/interrupt.h>
409 #include <linux/mc146818rtc.h>
410 #include <asm/uaccess.h>
411 +#include <asm/apic.h>
412 #include "mach_reboot.h"
416 int reboot_thru_bios;
420 -static int reboot_cpu = -1;
421 +int reboot_cpu = -1; /* specifies the internal linux cpu id, not the apicid */
422 /* shamelessly grabbed from lib/vsprintf.c for readability */
423 #define is_digit(c) ((c) >= '0' && (c) <= '9')
428 case 's': /* "smp" reboot by executing reset on BSP or other CPU*/
430 if (is_digit(*(str+1))) {
431 reboot_cpu = (int) (*(str+1) - '0');
432 if (is_digit(*(str+2)))
435 void machine_restart(char * __unused)
440 - cpuid = GET_APIC_ID(apic_read(APIC_ID));
444 - /* check to see if reboot_cpu is valid
445 - if its not, default to the BSP */
446 - if ((reboot_cpu == -1) ||
447 - (reboot_cpu > (NR_CPUS -1)) ||
448 - !(phys_cpu_present_map & (1<<cpuid)))
449 - reboot_cpu = boot_cpu_physical_apicid;
451 - reboot_smp = 0; /* use this as a flag to only go through this once*/
452 - /* re-run this function on the other CPUs
453 - it will fall though this section since we have
454 - cleared reboot_smp, and do the reboot if it is the
455 - correct CPU, otherwise it halts. */
456 - if (reboot_cpu != cpuid)
457 - smp_call_function((void *)machine_restart , NULL, 1, 0);
460 - /* if reboot_cpu is still -1, then we want a tradional reboot,
461 - and if we are not running on the reboot_cpu,, halt */
462 - if ((reboot_cpu != -1) && (cpuid != reboot_cpu)) {
464 - __asm__ __volatile__ ("hlt");
467 - * Stop all CPUs and turn off local APICs and the IO-APIC, so
468 - * other OSs see a clean IRQ state.
475 if(!reboot_thru_bios) {
476 /* rebooting needs to touch the page at absolute addr 0 */
477 @@ -268,10 +232,12 @@
479 void machine_halt(void)
484 void machine_power_off(void)
490 diff -Nru a/arch/i386/kernel/relocate_kernel.S b/arch/i386/kernel/relocate_kernel.S
491 --- /dev/null Wed Dec 31 16:00:00 1969
492 +++ b/arch/i386/kernel/relocate_kernel.S Mon Jun 23 12:22:26 2003
494 +#include <linux/config.h>
495 +#include <linux/linkage.h>
497 + /* Must be relocatable PIC code callable as a C function, that once
498 + * it starts can not use the previous processes stack.
501 + .globl relocate_new_kernel
502 +relocate_new_kernel:
503 + /* read the arguments and say goodbye to the stack */
504 + movl 4(%esp), %ebx /* indirection_page */
505 + movl 8(%esp), %ebp /* reboot_code_buffer */
506 + movl 12(%esp), %edx /* start address */
508 + /* zero out flags, and disable interrupts */
512 + /* set a new stack at the bottom of our page... */
513 + lea 4096(%ebp), %esp
515 + /* store the parameters back on the stack */
516 + pushl %edx /* store the start address */
518 + /* Set cr0 to a known state:
519 + * 31 0 == Paging disabled
520 + * 18 0 == Alignment check disabled
521 + * 16 0 == Write protect disabled
522 + * 3 0 == No task switch
523 + * 2 0 == Don't do FP software emulation.
524 + * 0 1 == Protected mode enabled
527 + andl $~((1<<31)|(1<<18)|(1<<16)|(1<<3)|(1<<2)), %eax
531 + /* Set cr4 to a known state:
532 + * Setting everything to zero seems safe.
541 + /* Flush the TLB (needed?) */
545 + /* Do the copies */
547 +0: /* top, read another word for the indirection page */
551 + testl $0x1, %ecx /* is it a destination page */
554 + andl $0xfffff000, %edi
557 + testl $0x2, %ecx /* is it an indirection page */
560 + andl $0xfffff000, %ebx
563 + testl $0x4, %ecx /* is it the done indicator */
567 + testl $0x8, %ecx /* is it the source indicator */
568 + jz 0b /* Ignore it otherwise */
569 + movl %ecx, %esi /* For every source page do a copy */
570 + andl $0xfffff000, %esi
578 + /* To be certain of avoiding problems with self modifying code
579 + * I need to execute a serializing instruction here.
580 + * So I flush the TLB, it's handy, and not processor dependent.
585 + /* set all of the registers to known values */
586 + /* leave %esp alone */
596 +relocate_new_kernel_end:
598 + .globl relocate_new_kernel_size
599 +relocate_new_kernel_size:
600 + .long relocate_new_kernel_end - relocate_new_kernel
601 diff -Nru a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c
602 --- a/arch/i386/kernel/smp.c Mon Jun 23 12:22:26 2003
603 +++ b/arch/i386/kernel/smp.c Mon Jun 23 12:22:26 2003
606 void smp_send_stop(void)
608 + extern int reboot_cpu;
611 + /* The boot cpu is always logical cpu 0 */
614 + /* See if a command line override has been given.
616 + if ((reboot_cpu != -1) && !(reboot_cpu >= NR_CPUS) &&
617 + test_bit(reboot_cpu, &cpu_online_map)) {
618 + reboot_cpu_id = reboot_cpu;
621 + /* Make certain that the cpu I'm rebooting on is online */
622 + if (!test_bit(reboot_cpu_id, &cpu_online_map)) {
623 + reboot_cpu_id = smp_processor_id();
626 + /* Make certain I only run on the appropriate processor */
627 + set_cpus_allowed(current, 1 << reboot_cpu_id);
629 + /* O.k. Now that I'm on the appropriate processor stop
630 + * all of the others.
632 smp_call_function(stop_this_cpu, NULL, 1, 0);
635 diff -Nru a/fs/aio.c b/fs/aio.c
636 --- a/fs/aio.c Mon Jun 23 12:22:26 2003
637 +++ b/fs/aio.c Mon Jun 23 12:22:26 2003
642 -static void use_mm(struct mm_struct *mm)
643 +void use_mm(struct mm_struct *mm)
645 struct mm_struct *active_mm = current->active_mm;
646 atomic_inc(&mm->mm_count);
647 diff -Nru a/include/asm-i386/apic.h b/include/asm-i386/apic.h
648 --- a/include/asm-i386/apic.h Mon Jun 23 12:22:26 2003
649 +++ b/include/asm-i386/apic.h Mon Jun 23 12:22:26 2003
651 #define NMI_LOCAL_APIC 2
652 #define NMI_INVALID 3
654 +extern void stop_apics(void);
656 +static inline void stop_apics(void) { }
657 #endif /* CONFIG_X86_LOCAL_APIC */
659 #endif /* __ASM_APIC_H */
660 diff -Nru a/include/asm-i386/apicdef.h b/include/asm-i386/apicdef.h
661 --- a/include/asm-i386/apicdef.h Mon Jun 23 12:22:26 2003
662 +++ b/include/asm-i386/apicdef.h Mon Jun 23 12:22:26 2003
664 #define APIC_LVT_REMOTE_IRR (1<<14)
665 #define APIC_INPUT_POLARITY (1<<13)
666 #define APIC_SEND_PENDING (1<<12)
667 +#define APIC_MODE_MASK 0x700
668 #define GET_APIC_DELIVERY_MODE(x) (((x)>>8)&0x7)
669 #define SET_APIC_DELIVERY_MODE(x,y) (((x)&~0x700)|((y)<<8))
670 #define APIC_MODE_FIXED 0x0
671 diff -Nru a/include/asm-i386/kexec.h b/include/asm-i386/kexec.h
672 --- /dev/null Wed Dec 31 16:00:00 1969
673 +++ b/include/asm-i386/kexec.h Mon Jun 23 12:22:26 2003
675 +#ifndef _I386_KEXEC_H
676 +#define _I386_KEXEC_H
678 +#include <asm/fixmap.h>
681 + * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return.
682 + * I.e. Maximum page that is mapped directly into kernel memory,
683 + * and kmap is not required.
685 + * Someone correct me if FIXADDR_START - PAGEOFFSET is not the correct
686 + * calculation for the amount of memory directly mappable into the
687 + * kernel memory space.
690 +/* Maximum physical address we can use pages from */
691 +#define KEXEC_SOURCE_MEMORY_LIMIT (-1UL)
692 +/* Maximum address we can reach in physical address mode */
693 +#define KEXEC_DESTINATION_MEMORY_LIMIT (-1UL)
695 +#define KEXEC_REBOOT_CODE_SIZE 4096
697 +#endif /* _I386_KEXEC_H */
698 diff -Nru a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
699 --- a/include/asm-i386/unistd.h Mon Jun 23 12:22:26 2003
700 +++ b/include/asm-i386/unistd.h Mon Jun 23 12:22:26 2003
702 #define __NR_clock_nanosleep (__NR_timer_create+8)
703 #define __NR_statfs64 268
704 #define __NR_fstatfs64 269
705 +#define __NR_sys_kexec_load 270
707 -#define NR_syscalls 270
708 +#define NR_syscalls 271
710 /* user-visible error numbers are in the range -1 - -124: see <asm-i386/errno.h> */
712 diff -Nru a/include/linux/kexec.h b/include/linux/kexec.h
713 --- /dev/null Wed Dec 31 16:00:00 1969
714 +++ b/include/linux/kexec.h Mon Jun 23 12:22:26 2003
716 +#ifndef LINUX_KEXEC_H
717 +#define LINUX_KEXEC_H
720 +#include <linux/types.h>
721 +#include <linux/list.h>
722 +#include <asm/kexec.h>
725 + * This structure is used to hold the arguments that are used when loading
729 +typedef unsigned long kimage_entry_t;
730 +#define IND_DESTINATION 0x1
731 +#define IND_INDIRECTION 0x2
732 +#define IND_DONE 0x4
733 +#define IND_SOURCE 0x8
735 +#define KEXEC_SEGMENT_MAX 8
736 +struct kexec_segment {
744 + kimage_entry_t head;
745 + kimage_entry_t *entry;
746 + kimage_entry_t *last_entry;
748 + unsigned long destination;
749 + unsigned long offset;
751 + unsigned long start;
752 + struct page *reboot_code_pages;
754 + unsigned long nr_segments;
755 + struct kexec_segment segment[KEXEC_SEGMENT_MAX+1];
757 + struct list_head dest_pages;
758 + struct list_head unuseable_pages;
762 +/* kexec interface functions */
763 +extern void machine_kexec(struct kimage *image);
764 +extern asmlinkage long sys_kexec(unsigned long entry, long nr_segments,
765 + struct kexec_segment *segments);
766 +extern struct kimage *kexec_image;
768 +#endif /* LINUX_KEXEC_H */
770 diff -Nru a/include/linux/reboot.h b/include/linux/reboot.h
771 --- a/include/linux/reboot.h Mon Jun 23 12:22:26 2003
772 +++ b/include/linux/reboot.h Mon Jun 23 12:22:26 2003
774 * POWER_OFF Stop OS and remove all power from system, if possible.
775 * RESTART2 Restart system using given command string.
776 * SW_SUSPEND Suspend system using Software Suspend if compiled in
777 + * KEXEC Restart the system using a different kernel.
780 #define LINUX_REBOOT_CMD_RESTART 0x01234567
782 #define LINUX_REBOOT_CMD_POWER_OFF 0x4321FEDC
783 #define LINUX_REBOOT_CMD_RESTART2 0xA1B2C3D4
784 #define LINUX_REBOOT_CMD_SW_SUSPEND 0xD000FCE2
785 +#define LINUX_REBOOT_CMD_KEXEC 0x45584543
789 diff -Nru a/kernel/Makefile b/kernel/Makefile
790 --- a/kernel/Makefile Mon Jun 23 12:22:26 2003
791 +++ b/kernel/Makefile Mon Jun 23 12:22:26 2003
793 obj-$(CONFIG_CPU_FREQ) += cpufreq.o
794 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
795 obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend.o
796 +obj-$(CONFIG_KEXEC) += kexec.o
797 obj-$(CONFIG_COMPAT) += compat.o
799 ifneq ($(CONFIG_IA64),y)
800 diff -Nru a/kernel/kexec.c b/kernel/kexec.c
801 --- /dev/null Wed Dec 31 16:00:00 1969
802 +++ b/kernel/kexec.c Mon Jun 23 12:22:26 2003
804 +#include <linux/mm.h>
805 +#include <linux/file.h>
806 +#include <linux/slab.h>
807 +#include <linux/fs.h>
808 +#include <linux/version.h>
809 +#include <linux/compile.h>
810 +#include <linux/kexec.h>
811 +#include <linux/spinlock.h>
812 +#include <linux/list.h>
813 +#include <linux/highmem.h>
814 +#include <net/checksum.h>
815 +#include <asm/page.h>
816 +#include <asm/uaccess.h>
818 +#include <asm/system.h>
820 +/* When kexec transitions to the new kernel there is a one to one
821 + * mapping between physical and virtual addresses. On processors
822 + * where you can disable the MMU this is trivial, and easy. For
823 + * others it is still a simple predictable page table to setup.
825 + * In that environment kexec copies the new kernel to its final
826 + * resting place. This means I can only support memory whose
827 + * physical address can fit in an unsigned long. In particular
828 + * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
829 + * If the assembly stub has more restrictive requirements
830 + * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
831 + * defined more restrictively in <asm/kexec.h>.
833 + * The code for the transition from the current kernel to the
834 + * the new kernel is placed in the reboot_code_buffer, whose size
835 + * is given by KEXEC_REBOOT_CODE_SIZE. In the best case only a single
836 + * page of memory is necessary, but some architectures require more.
837 + * Because this memory must be identity mapped in the transition from
838 + * virtual to physical addresses it must live in the range
839 + * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
842 + * The assembly stub in the reboot code buffer is passed a linked list
843 + * of descriptor pages detailing the source pages of the new kernel,
844 + * and the destination addresses of those source pages. As this data
845 + * structure is not used in the context of the current OS, it must
846 + * be self contained.
848 + * The code has been made to work with highmem pages and will use a
849 + * destination page in its final resting place (if it happens
850 + * to allocate it). The end product of this is that most of the
851 + * physical address space, and most of ram can be used.
853 + * Future directions include:
854 + * - allocating a page table with the reboot code buffer identity
855 + * mapped, to simplify machine_kexec and make kexec_on_panic, more
857 + * - allocating the pages for a page table for machines that cannot
858 + * disable their MMUs. (Hammer, Alpha...)
861 +/* KIMAGE_NO_DEST is an impossible destination address..., for
862 + * allocating pages whose destination address we do not care about.
864 +#define KIMAGE_NO_DEST (-1UL)
866 +static int kimage_is_destination_range(
867 + struct kimage *image, unsigned long start, unsigned long end);
868 +static struct page *kimage_alloc_reboot_code_pages(struct kimage *image);
869 +static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long dest);
872 +static int kimage_alloc(struct kimage **rimage,
873 + unsigned long nr_segments, struct kexec_segment *segments)
876 + struct kimage *image;
877 + size_t segment_bytes;
878 + struct page *reboot_pages;
881 + /* Allocate a controlling structure */
883 + image = kmalloc(sizeof(*image), GFP_KERNEL);
887 + memset(image, 0, sizeof(*image));
889 + image->entry = &image->head;
890 + image->last_entry = &image->head;
892 + /* Initialize the list of destination pages */
893 + INIT_LIST_HEAD(&image->dest_pages);
895 + /* Initialize the list of unuseable pages */
896 + INIT_LIST_HEAD(&image->unuseable_pages);
898 + /* Read in the segments */
899 + image->nr_segments = nr_segments;
900 + segment_bytes = nr_segments * sizeof*segments;
901 + result = copy_from_user(image->segment, segments, segment_bytes);
905 + /* Verify we have good destination addresses. The caller is
906 + * responsible for making certain we don't attempt to load
907 + * the new image into invalid or reserved areas of RAM. This
908 + * just verifies it is an address we can use.
910 + result = -EADDRNOTAVAIL;
911 + for(i = 0; i < nr_segments; i++) {
912 + unsigned long mend;
913 + mend = ((unsigned long)(image->segment[i].mem)) +
914 + image->segment[i].memsz;
915 + if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
919 + /* Find a location for the reboot code buffer, and add it
920 + * the vector of segments so that it's pages will also be
921 + * counted as destination pages.
924 + reboot_pages = kimage_alloc_reboot_code_pages(image);
925 + if (!reboot_pages) {
926 + printk(KERN_ERR "Could not allocate reboot_code_buffer\n");
929 + image->reboot_code_pages = reboot_pages;
930 + image->segment[nr_segments].buf = 0;
931 + image->segment[nr_segments].bufsz = 0;
932 + image->segment[nr_segments].mem = (void *)(page_to_pfn(reboot_pages) << PAGE_SHIFT);
933 + image->segment[nr_segments].memsz = KEXEC_REBOOT_CODE_SIZE;
934 + image->nr_segments++;
946 +static int kimage_is_destination_range(
947 + struct kimage *image, unsigned long start, unsigned long end)
950 + for(i = 0; i < image->nr_segments; i++) {
951 + unsigned long mstart, mend;
952 + mstart = (unsigned long)image->segment[i].mem;
953 + mend = mstart + image->segment[i].memsz;
954 + if ((end > mstart) && (start < mend)) {
962 +static int identity_map_pages(struct page *pages, int order)
964 + struct mm_struct *mm;
965 + struct vm_area_struct *vma;
970 + down_write(&mm->mmap_sem);
972 + vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
977 + memset(vma, 0, sizeof(vma));
979 + vma->vm_start = page_to_pfn(pages) << PAGE_SHIFT;
980 + vma->vm_end = vma->vm_start + (1 << (order + PAGE_SHIFT));
982 + vma->vm_flags = VM_SHARED \
983 + | VM_READ | VM_WRITE | VM_EXEC \
984 + | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC \
985 + | VM_DONTCOPY | VM_RESERVED;
986 + vma->vm_page_prot = protection_map[vma->vm_flags & 0xf];
987 + vma->vm_file = NULL;
988 + vma->vm_private_data = NULL;
989 + INIT_LIST_HEAD(&vma->shared);
990 + insert_vm_struct(mm, vma);
992 + error = remap_page_range(vma, vma->vm_start, vma->vm_start,
993 + vma->vm_end - vma->vm_start, vma->vm_page_prot);
1000 + if (error && vma) {
1001 + kmem_cache_free(vm_area_cachep, vma);
1004 + up_write(&mm->mmap_sem);
1009 +#define identity_map_pages(pages, order) 0
1012 +struct page *kimage_alloc_reboot_code_pages(struct kimage *image)
1014 + /* The reboot code buffer is special. It is the only set of
1015 + * pages that must be allocated in their final resting place,
1016 + * and the only set of pages whose final resting place we can
1019 + * At worst this runs in O(N) of the image size.
1021 + struct list_head extra_pages, *pos, *next;
1022 + struct page *pages;
1023 + unsigned long addr;
1025 + order = get_order(KEXEC_REBOOT_CODE_SIZE);
1026 + count = 1 << order;
1027 + INIT_LIST_HEAD(&extra_pages);
1030 + pages = alloc_pages(GFP_HIGHUSER, order);
1033 + for(i = 0; i < count; i++) {
1034 + SetPageReserved(pages +i);
1036 + addr = page_to_pfn(pages) << PAGE_SHIFT;
1037 + if ((page_to_pfn(pages) >= (TASK_SIZE >> PAGE_SHIFT)) ||
1038 + kimage_is_destination_range(image, addr, addr + KEXEC_REBOOT_CODE_SIZE)) {
1039 + list_add(&pages->list, &extra_pages);
1045 + result = identity_map_pages(pages, order);
1047 + list_add(&pages->list, &extra_pages);
1051 + /* If I could convert a multi page allocation into a bunch of
1052 + * single page allocations I could add these pages to
1053 + * image->dest_pages. For now it is simpler to just free the
1056 + list_for_each_safe(pos, next, &extra_pages) {
1057 + struct page *page;
1059 + page = list_entry(pos, struct page, list);
1060 + for(i = 0; i < count; i++) {
1061 + ClearPageReserved(pages +i);
1063 + list_del(&extra_pages);
1064 + __free_pages(page, order);
1069 +static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
1071 + if (image->offset != 0) {
1074 + if (image->entry == image->last_entry) {
1075 + kimage_entry_t *ind_page;
1076 + struct page *page;
1077 + page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
1081 + ind_page = page_address(page);
1082 + *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
1083 + image->entry = ind_page;
1084 + image->last_entry =
1085 + ind_page + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
1087 + *image->entry = entry;
1089 + image->offset = 0;
1093 +static int kimage_set_destination(
1094 + struct kimage *image, unsigned long destination)
1097 + destination &= PAGE_MASK;
1098 + result = kimage_add_entry(image, destination | IND_DESTINATION);
1099 + if (result == 0) {
1100 + image->destination = destination;
1106 +static int kimage_add_page(struct kimage *image, unsigned long page)
1109 + page &= PAGE_MASK;
1110 + result = kimage_add_entry(image, page | IND_SOURCE);
1111 + if (result == 0) {
1112 + image->destination += PAGE_SIZE;
1118 +static void kimage_free_extra_pages(struct kimage *image)
1120 + /* Walk through and free any extra destination pages I may have */
1121 + struct list_head *pos, *next;
1122 + list_for_each_safe(pos, next, &image->dest_pages) {
1123 + struct page *page;
1124 + page = list_entry(pos, struct page, list);
1125 + list_del(&page->list);
1126 + ClearPageReserved(page);
1127 + __free_page(page);
1129 + /* Walk through and free any unuseable pages I have cached */
1130 + list_for_each_safe(pos, next, &image->unuseable_pages) {
1131 + struct page *page;
1132 + page = list_entry(pos, struct page, list);
1133 + list_del(&page->list);
1134 + ClearPageReserved(page);
1135 + __free_page(page);
1139 +static int kimage_terminate(struct kimage *image)
1142 + result = kimage_add_entry(image, IND_DONE);
1143 + if (result == 0) {
1144 + /* Point at the terminating element */
1146 + kimage_free_extra_pages(image);
1151 +#define for_each_kimage_entry(image, ptr, entry) \
1152 + for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
1153 + ptr = (entry & IND_INDIRECTION)? \
1154 + phys_to_virt((entry & PAGE_MASK)): ptr +1)
1156 +static void kimage_free(struct kimage *image)
1158 + kimage_entry_t *ptr, entry;
1159 + kimage_entry_t ind = 0;
1160 + int i, count, order;
1163 + kimage_free_extra_pages(image);
1164 + for_each_kimage_entry(image, ptr, entry) {
1165 + if (entry & IND_INDIRECTION) {
1166 + /* Free the previous indirection page */
1167 + if (ind & IND_INDIRECTION) {
1168 + free_page((unsigned long)phys_to_virt(ind & PAGE_MASK));
1170 + /* Save this indirection page until we are
1175 + else if (entry & IND_SOURCE) {
1176 + free_page((unsigned long)phys_to_virt(entry & PAGE_MASK));
1179 + order = get_order(KEXEC_REBOOT_CODE_SIZE);
1180 + count = 1 << order;
1181 + do_munmap(&init_mm,
1182 + page_to_pfn(image->reboot_code_pages) << PAGE_SHIFT,
1183 + count << PAGE_SHIFT);
1184 + for(i = 0; i < count; i++) {
1185 + ClearPageReserved(image->reboot_code_pages + i);
1187 + __free_pages(image->reboot_code_pages, order);
1191 +static kimage_entry_t *kimage_dst_used(struct kimage *image, unsigned long page)
1193 + kimage_entry_t *ptr, entry;
1194 + unsigned long destination = 0;
1195 + for_each_kimage_entry(image, ptr, entry) {
1196 + if (entry & IND_DESTINATION) {
1197 + destination = entry & PAGE_MASK;
1199 + else if (entry & IND_SOURCE) {
1200 + if (page == destination) {
1203 + destination += PAGE_SIZE;
1209 +static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long destination)
1211 + /* Here we implement safeguards to ensure that a source page
1212 + * is not copied to its destination page before the data on
1213 + * the destination page is no longer useful.
1215 + * To do this we maintain the invariant that a source page is
1216 + * either its own destination page, or it is not a
1217 + * destination page at all.
1219 + * That is slightly stronger than required, but the proof
1220 + * that no problems will occur is trivial, and the
1221 + * implementation is simple to verify.
1223 + * When allocating all pages normally this algorithm will run
1224 + * in O(N) time, but in the worst case it will run in O(N^2)
1225 + * time. If the runtime is a problem the data structures can
1228 + struct page *page;
1229 + unsigned long addr;
1231 + /* Walk through the list of destination pages, and see if I
1234 + list_for_each_entry(page, &image->dest_pages, list) {
1235 + addr = page_to_pfn(page) << PAGE_SHIFT;
1236 + if (addr == destination) {
1237 + list_del(&page->list);
1243 + kimage_entry_t *old;
1244 + /* Allocate a page, if we run out of memory give up */
1245 + page = alloc_page(gfp_mask);
1249 + SetPageReserved(page);
1250 + /* If the page cannot be used file it away */
1251 + if (page_to_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
1252 + list_add(&page->list, &image->unuseable_pages);
1255 + addr = page_to_pfn(page) << PAGE_SHIFT;
1257 + /* If it is the destination page we want, use it */
1258 + if (addr == destination)
1261 + /* If the page is not a destination page use it */
1262 + if (!kimage_is_destination_range(image, addr, addr + PAGE_SIZE))
1265 + /* I know that the page is someone's destination page.
1266 + * See if there is already a source page for this
1267 + * destination page. And if so swap the source pages.
1269 + old = kimage_dst_used(image, addr);
1271 + /* If so move it */
1272 + unsigned long old_addr;
1273 + struct page *old_page;
1275 + old_addr = *old & PAGE_MASK;
1276 + old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
1277 + copy_highpage(page, old_page);
1278 + *old = addr | (*old & ~PAGE_MASK);
1280 + /* The old page I have found cannot be a
1281 + * destination page, so return it.
1288 + /* Place the page on the destination list I
1289 + * will use it later.
1291 + list_add(&page->list, &image->dest_pages);
1297 +static int kimage_load_segment(struct kimage *image,
1298 + struct kexec_segment *segment)
1300 + unsigned long mstart;
1302 + unsigned long offset;
1303 + unsigned long offset_end;
1304 + unsigned char *buf;
1307 + buf = segment->buf;
1308 + mstart = (unsigned long)segment->mem;
1310 + offset_end = segment->memsz;
1312 + result = kimage_set_destination(image, mstart);
1316 + for(offset = 0; offset < segment->memsz; offset += PAGE_SIZE) {
1317 + struct page *page;
1319 + size_t size, leader;
1320 + page = kimage_alloc_page(image, GFP_HIGHUSER, mstart + offset);
1325 + result = kimage_add_page(image, page_to_pfn(page) << PAGE_SHIFT);
1330 + if (segment->bufsz < offset) {
1331 + /* We are past the end, zero the whole page */
1332 + memset(ptr, 0, PAGE_SIZE);
1338 + if ((offset == 0)) {
1339 + leader = mstart & ~PAGE_MASK;
1342 + /* We are on the first page, zero the unused portion */
1343 + memset(ptr, 0, leader);
1347 + if (size > (segment->bufsz - offset)) {
1348 + size = segment->bufsz - offset;
1350 + if (size < (PAGE_SIZE - leader)) {
1351 + /* zero the trailing part of the page */
1352 + memset(ptr + size, 0, (PAGE_SIZE - leader) - size);
1354 + result = copy_from_user(ptr, buf + offset, size);
1357 + result = (result < 0)?result : -EIO;
1366 + * Exec Kernel system call: for obvious reasons only root may call it.
1368 + * This call breaks up into three pieces.
1369 + * - A generic part which loads the new kernel from the current
1370 + * address space, and very carefully places the data in the
1371 + * allocated pages.
1373 + * - A generic part that interacts with the kernel and tells all of
1374 + * the devices to shut down. Preventing on-going dmas, and placing
1375 + * the devices in a consistent state so a later kernel can
1376 + * reinitialize them.
1378 + * - A machine specific part that includes the syscall number
1379 + * and then copies the image to its final destination. And
1380 + * jumps into the image at entry.
1382 + * kexec does not sync, or unmount filesystems so if you need
1383 + * that to happen you need to do that yourself.
1385 +struct kimage *kexec_image = 0;
1387 +asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
1388 + struct kexec_segment *segments, unsigned long flags)
1390 + struct kimage *image;
1393 + /* We only trust the superuser with rebooting the system. */
1394 + if (!capable(CAP_SYS_ADMIN))
1397 + /* In case we need just a little bit of special behavior for
1403 + if (nr_segments > KEXEC_SEGMENT_MAX)
1408 + if (nr_segments > 0) {
1410 + result = kimage_alloc(&image, nr_segments, segments);
1414 + image->start = entry;
1415 + for(i = 0; i < nr_segments; i++) {
1416 + result = kimage_load_segment(image, &segments[i]);
1421 + result = kimage_terminate(image);
1427 + image = xchg(&kexec_image, image);
1430 + kimage_free(image);
1433 diff -Nru a/kernel/sys.c b/kernel/sys.c
1434 --- a/kernel/sys.c Mon Jun 23 12:22:26 2003
1435 +++ b/kernel/sys.c Mon Jun 23 12:22:26 2003
1437 #include <linux/init.h>
1438 #include <linux/highuid.h>
1439 #include <linux/fs.h>
1440 +#include <linux/kexec.h>
1441 #include <linux/workqueue.h>
1442 #include <linux/device.h>
1443 #include <linux/times.h>
1445 cond_syscall(sys_lookup_dcookie)
1446 cond_syscall(sys_swapon)
1447 cond_syscall(sys_swapoff)
1448 +cond_syscall(sys_kexec_load)
1449 cond_syscall(sys_init_module)
1450 cond_syscall(sys_delete_module)
1451 cond_syscall(sys_socketpair)
1452 @@ -450,6 +452,27 @@
1453 machine_restart(buffer);
1456 +#ifdef CONFIG_KEXEC
1457 + case LINUX_REBOOT_CMD_KEXEC:
1459 + struct kimage *image;
1464 + image = xchg(&kexec_image, 0);
1469 + notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL);
1470 + system_running = 0;
1471 + device_shutdown();
1472 + printk(KERN_EMERG "Starting new kernel\n");
1473 + machine_kexec(image);
1477 #ifdef CONFIG_SOFTWARE_SUSPEND
1478 case LINUX_REBOOT_CMD_SW_SUSPEND:
1479 if (!software_suspend_enabled) {