Whamcloud - gitweb
Branch HEAD
[fs/lustre-release.git] / lustre / kernel_patches / patches / kexec-2.6-suse-lnxi.patch
1 Index: linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/MAINTAINERS
2 ===================================================================
3 --- linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891.orig/MAINTAINERS      2004-11-18 20:59:11.000000000 -0500
4 +++ linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/MAINTAINERS   2004-11-18 23:25:15.000000000 -0500
5 @@ -1199,6 +1199,17 @@
6  W:     http://www.cse.unsw.edu.au/~neilb/patches/linux-devel/
7  S:     Maintained
8  
9 +KEXEC
10 +P:     Eric Biederman
11 +P:     Randy Dunlap
12 +M:     ebiederm@xmission.com
13 +M:     rddunlap@osdl.org
14 +W:     http://www.xmission.com/~ebiederm/files/kexec/
15 +W:     http://developer.osdl.org/rddunlap/kexec/
16 +L:     linux-kernel@vger.kernel.org
17 +L:     fastboot@osdl.org
18 +S:     Maintained
19 +
20  LANMEDIA WAN CARD DRIVER
21  P:     Andrew Stanley-Jones
22  M:     asj@lanmedia.com
23 Index: linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/arch/x86_64/Kconfig
24 ===================================================================
25 --- linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891.orig/arch/x86_64/Kconfig      2004-11-18 20:59:11.000000000 -0500
26 +++ linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/arch/x86_64/Kconfig   2004-11-18 23:25:15.000000000 -0500
27 @@ -411,6 +411,23 @@
28         depends on IA32_EMULATION
29         default y
30  
31 +config KEXEC
32 +       bool "kexec system call (EXPERIMENTAL)"
33 +       depends on EXPERIMENTAL
34 +       help
35 +         kexec is a system call that implements the ability to shutdown your
36 +         current kernel, and to start another kernel.  It is like a reboot
37 +         but it is independent of the system firmware.   And like a reboot
38 +         you can start any kernel with it, not just Linux.
39 +
40 +         The name comes from the similarity to the exec system call.
41 +
42 +         It is an ongoing process to be certain the hardware in a machine
43 +         is properly shutdown, so do not be surprised if this code does not
44 +         initially work for you.  It may help to enable device hotplugging
45 +         support.  As of this writing the exact hardware interface is
46 +         strongly in flux, so no good recommendation can be made.
47 +
48  endmenu
49  
50  source drivers/Kconfig
51 Index: linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/arch/x86_64/kernel/Makefile
52 ===================================================================
53 --- linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891.orig/arch/x86_64/kernel/Makefile      2004-11-11 10:28:46.000000000 -0500
54 +++ linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/arch/x86_64/kernel/Makefile   2004-11-18 23:26:29.000000000 -0500
55 @@ -19,6 +19,7 @@
56  obj-$(CONFIG_X86_LOCAL_APIC)   += apic.o  nmi.o
57  obj-$(CONFIG_X86_IO_APIC)      += io_apic.o mpparse.o \
58                 genapic.o genapic_cluster.o genapic_flat.o
59 +obj-$(CONFIG_KEXEC)            += machine_kexec.o relocate_kernel.o
60  obj-$(CONFIG_PM)               += suspend.o
61  obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend_asm.o
62  obj-$(CONFIG_CPU_FREQ)         += cpufreq/
63 Index: linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/arch/x86_64/kernel/apic.c
64 ===================================================================
65 --- linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891.orig/arch/x86_64/kernel/apic.c        2004-11-11 10:28:46.000000000 -0500
66 +++ linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/arch/x86_64/kernel/apic.c     2004-11-18 23:25:15.000000000 -0500
67 @@ -143,6 +143,36 @@
68                 outb(0x70, 0x22);
69                 outb(0x00, 0x23);
70         }
71 +       else {
72 +               /* Go back to Virtual Wire compatibility mode */
73 +               unsigned long value;
74 +
75 +               /* For the spurious interrupt use vector F, and enable it */
76 +               value = apic_read(APIC_SPIV);
77 +               value &= ~APIC_VECTOR_MASK;
78 +               value |= APIC_SPIV_APIC_ENABLED;
79 +               value |= 0xf;
80 +               apic_write_around(APIC_SPIV, value);
81 +
82 +               /* For LVT0 make it edge triggered, active high, external and enabled */
83 +               value = apic_read(APIC_LVT0);
84 +               value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
85 +                       APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
86 +                       APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED );
87 +               value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
88 +               value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXINT);
89 +               apic_write_around(APIC_LVT0, value);
90 +
91 +               /* For LVT1 make it edge triggered, active high, nmi and enabled */
92 +               value = apic_read(APIC_LVT1);
93 +               value &= ~(
94 +                       APIC_MODE_MASK | APIC_SEND_PENDING |
95 +                       APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
96 +                       APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
97 +               value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
98 +               value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
99 +               apic_write_around(APIC_LVT1, value);
100 +       }
101  }
102  
103  void disable_local_APIC(void)
104 Index: linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/arch/x86_64/kernel/e820.c
105 ===================================================================
106 --- linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891.orig/arch/x86_64/kernel/e820.c        2004-04-03 22:36:53.000000000 -0500
107 +++ linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/arch/x86_64/kernel/e820.c     2004-11-18 23:25:15.000000000 -0500
108 @@ -185,8 +185,6 @@
109         int i;
110         for (i = 0; i < e820.nr_map; i++) {
111                 struct resource *res;
112 -               if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
113 -                       continue;
114                 res = alloc_bootmem_low(sizeof(struct resource));
115                 switch (e820.map[i].type) {
116                 case E820_RAM:  res->name = "System RAM"; break;
117 Index: linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/arch/x86_64/kernel/i8259.c
118 ===================================================================
119 --- linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891.orig/arch/x86_64/kernel/i8259.c       2004-11-18 20:59:11.000000000 -0500
120 +++ linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/arch/x86_64/kernel/i8259.c    2004-11-18 23:25:15.000000000 -0500
121 @@ -318,6 +318,44 @@
122         }
123  }
124  
125 +static int i8259A_resume(struct sys_device *dev)
126 +{
127 +       init_8259A(0);
128 +       return 0;
129 +}
130 +
131 +static int i8259A_shutdown(struct sys_device *dev)
132 +{
133 +       /* Put the i8259A into a quiescent state that
134 +        * the kernel initialization code can get it
135 +        * out of.
136 +        */
137 +       outb(0xff, 0x21);       /* mask all of 8259A-1 */
138 +       outb(0xff, 0xA1);       /* mask all of 8259A-2 */
139 +       return 0;
140 +}
141 +
142 +static struct sysdev_class i8259_sysdev_class = {
143 +       set_kset_name("i8259"),
144 +       .resume = i8259A_resume,
145 +       .shutdown = i8259A_shutdown,
146 +};
147 +
148 +static struct sys_device device_i8259A = {
149 +       .id     = 0,
150 +       .cls    = &i8259_sysdev_class,
151 +};
152 +
153 +static int __init i8259A_init_sysfs(void)
154 +{
155 +       int error = sysdev_class_register(&i8259_sysdev_class);
156 +       if (!error)
157 +               error = sysdev_register(&device_i8259A);
158 +       return error;
159 +}
160 +
161 +device_initcall(i8259A_init_sysfs);
162 +
163  void __init init_8259A(int auto_eoi)
164  {
165         unsigned long flags;
166 Index: linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/arch/x86_64/kernel/io_apic.c
167 ===================================================================
168 --- linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891.orig/arch/x86_64/kernel/io_apic.c     2004-11-11 10:28:46.000000000 -0500
169 +++ linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/arch/x86_64/kernel/io_apic.c  2004-11-18 23:25:15.000000000 -0500
170 @@ -328,7 +328,7 @@
171  /*
172   * Find the pin to which IRQ[irq] (ISA) is connected
173   */
174 -static int __init find_isa_irq_pin(int irq, int type)
175 +static int find_isa_irq_pin(int irq, int type)
176  {
177         int i;
178  
179 @@ -1112,11 +1112,43 @@
180   */
181  void disable_IO_APIC(void)
182  {
183 +       int pin;
184         /*
185          * Clear the IO-APIC before rebooting:
186          */
187         clear_IO_APIC();
188  
189 +       /*
190 +        * If the i8259 is routed through an IOAPIC
191 +        * Put that IOAPIC in virtual wire mode
192 +        * so legacy interrupts can be delivered.
193 +        */
194 +       pin = find_isa_irq_pin(0, mp_ExtINT);
195 +       if (pin != -1) {
196 +               struct IO_APIC_route_entry entry;
197 +               unsigned long flags;
198 +
199 +               memset(&entry, 0, sizeof(entry));
200 +               entry.mask            = 0; /* Enabled */
201 +               entry.trigger         = 0; /* Edge */
202 +               entry.irr             = 0;
203 +               entry.polarity        = 0; /* High */
204 +               entry.delivery_status = 0;
205 +               entry.dest_mode       = 0; /* Physical */
206 +               entry.delivery_mode   = 7; /* ExtInt */
207 +               entry.vector          = 0;
208 +               entry.dest.physical.physical_dest = 0;
209 +
210 +
211 +               /*
212 +                * Add it to the IO-APIC irq-routing table:
213 +                */
214 +               spin_lock_irqsave(&ioapic_lock, flags);
215 +               io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1));
216 +               io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0));
217 +               spin_unlock_irqrestore(&ioapic_lock, flags);
218 +       }
219 +
220         disconnect_bsp_APIC();
221  }
222  
223 Index: linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/arch/x86_64/kernel/machine_kexec.c
224 ===================================================================
225 --- linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891.orig/arch/x86_64/kernel/machine_kexec.c       1969-12-31 19:00:00.000000000 -0500
226 +++ linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/arch/x86_64/kernel/machine_kexec.c    2004-11-18 23:25:15.000000000 -0500
227 @@ -0,0 +1,246 @@
228 +/*
229 + * machine_kexec.c - handle transition of Linux booting another kernel
230 + * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
231 + *
232 + * This source code is licensed under the GNU General Public License,
233 + * Version 2.  See the file COPYING for more details.
234 + */
235 +
236 +#include <linux/mm.h>
237 +#include <linux/kexec.h>
238 +#include <linux/delay.h>
239 +#include <linux/string.h>
240 +#include <linux/reboot.h>
241 +#include <asm/pda.h>
242 +#include <asm/pgtable.h>
243 +#include <asm/pgalloc.h>
244 +#include <asm/tlbflush.h>
245 +#include <asm/mmu_context.h>
246 +#include <asm/io.h>
247 +#include <asm/apic.h>
248 +#include <asm/cpufeature.h>
249 +#include <asm/hw_irq.h>
250 +
251 +#define LEVEL0_SIZE (1UL << 12UL)
252 +#define LEVEL1_SIZE (1UL << 21UL)
253 +#define LEVEL2_SIZE (1UL << 30UL)
254 +#define LEVEL3_SIZE (1UL << 39UL)
255 +#define LEVEL4_SIZE (1UL << 48UL)
256 +
257 +#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
258 +#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE)
259 +#define L2_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
260 +#define L3_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
261 +
262 +static void init_level2_page(
263 +       uint64_t *level2p, unsigned long addr)
264 +{
265 +       unsigned long end_addr;
266 +       addr &= PAGE_MASK;
267 +       end_addr = addr + LEVEL2_SIZE;
268 +       while(addr < end_addr) {
269 +               *(level2p++) = addr | L1_ATTR;
270 +               addr += LEVEL1_SIZE;
271 +       }
272 +}
273 +
274 +static int init_level3_page(struct kimage *image,
275 +       uint64_t *level3p, unsigned long addr, unsigned long last_addr)
276 +{
277 +       unsigned long end_addr;
278 +       int result;
279 +       result = 0;
280 +       addr &= PAGE_MASK;
281 +       end_addr = addr + LEVEL3_SIZE;
282 +       while((addr < last_addr) && (addr < end_addr)) {
283 +               struct page *page;
284 +               uint64_t *level2p;
285 +               page = kimage_alloc_control_pages(image, 0);
286 +               if (!page) {
287 +                       result = -ENOMEM;
288 +                       goto out;
289 +               }
290 +               level2p = (uint64_t *)page_address(page);
291 +               init_level2_page(level2p, addr);
292 +               *(level3p++) = __pa(level2p) | L2_ATTR;
293 +               addr += LEVEL2_SIZE;
294 +       }
295 +       /* clear the unused entries */
296 +       while(addr < end_addr) {
297 +               *(level3p++) = 0;
298 +               addr += LEVEL2_SIZE;
299 +       }
300 +out:
301 +       return result;
302 +}
303 +
304 +
305 +static int init_level4_page(struct kimage *image, 
306 +       uint64_t *level4p, unsigned long addr, unsigned long last_addr)
307 +{
308 +       unsigned long end_addr;
309 +       int result;
310 +       result = 0;
311 +       addr &= PAGE_MASK;
312 +       end_addr = addr + LEVEL4_SIZE;
313 +       while((addr < last_addr) && (addr < end_addr)) {
314 +               struct page *page;
315 +               uint64_t *level3p;
316 +               page = kimage_alloc_control_pages(image, 0);
317 +               if (!page) {
318 +                       result = -ENOMEM;
319 +                       goto out;
320 +               }
321 +               level3p = (uint64_t *)page_address(page);
322 +               result = init_level3_page(image, level3p, addr, last_addr);
323 +               if (result) {
324 +                       goto out;
325 +               }
326 +               *(level4p++) = __pa(level3p) | L3_ATTR;
327 +               addr += LEVEL3_SIZE;
328 +       }
329 +       /* clear the unused entries */
330 +       while(addr < end_addr) {
331 +               *(level4p++) = 0;
332 +               addr += LEVEL3_SIZE;
333 +       }
334 + out:
335 +       return result;
336 +}
337 +
338 +
339 +static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
340 +{
341 +       uint64_t *level4p;
342 +       level4p = (uint64_t *)__va(start_pgtable);
343 +       return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT);
344 +}
345 +
346 +static void set_idt(void *newidt, __u16 limit)
347 +{
348 +       unsigned char curidt[10];
349 +
350 +       /* x86-64 supports unaligned loads & stores */
351 +       (*(__u16 *)(curidt)) = limit;
352 +       (*(__u64 *)(curidt +2)) = (unsigned long)(newidt);
353 +
354 +       __asm__ __volatile__ (
355 +               "lidt %0\n"
356 +               : "=m" (curidt)
357 +               );
358 +};
359 +
360 +
361 +static void set_gdt(void *newgdt, __u16 limit)
362 +{
363 +       unsigned char curgdt[10];
364 +
365 +       /* x86-64 supports unaligned loads & stores */
366 +       (*(__u16 *)(curgdt)) = limit;
367 +       (*(__u64 *)(curgdt +2)) = (unsigned long)(newgdt);
368 +
369 +       __asm__ __volatile__ (
370 +               "lgdt %0\n"
371 +               : "=m" (curgdt)
372 +               );
373 +};
374 +
375 +static void load_segments(void)
376 +{
377 +       __asm__ __volatile__ (
378 +               "\tmovl $"STR(__KERNEL_DS)",%eax\n"
379 +               "\tmovl %eax,%ds\n"
380 +               "\tmovl %eax,%es\n"
381 +               "\tmovl %eax,%ss\n"
382 +               "\tmovl %eax,%fs\n"
383 +               "\tmovl %eax,%gs\n"
384 +               );
385 +#undef STR
386 +#undef __STR
387 +}
388 +
389 +typedef void (*relocate_new_kernel_t)(
390 +       unsigned long indirection_page, unsigned long control_code_buffer,
391 +       unsigned long start_address, unsigned long pgtable);
392 +
393 +const extern unsigned char relocate_new_kernel[];
394 +extern void relocate_new_kernel_end(void);
395 +const extern unsigned long relocate_new_kernel_size;
396 +
397 +int machine_kexec_prepare(struct kimage *image)
398 +{
399 +       unsigned long start_pgtable, control_code_buffer;
400 +       int result;
401 +
402 +       /* Calculate the offsets */
403 +       start_pgtable       = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
404 +       control_code_buffer = start_pgtable + 4096UL;
405 +
406 +       /* Setup the identity mapped 64bit page table */
407 +       result = init_pgtable(image, start_pgtable);
408 +       if (result) {
409 +               return result;
410 +       }
411 +
412 +       /* Place the code in the reboot code buffer */
413 +       memcpy(__va(control_code_buffer), relocate_new_kernel, relocate_new_kernel_size);
414 +
415 +       return 0;
416 +}
417 +
418 +void machine_kexec_cleanup(struct kimage *image)
419 +{
420 +       return;
421 +}
422 +
423 +/*
424 + * Do not allocate memory (or fail in any way) in machine_kexec().
425 + * We are past the point of no return, committed to rebooting now.
426 + */
427 +void machine_kexec(struct kimage *image)
428 +{
429 +       unsigned long indirection_page;
430 +       unsigned long control_code_buffer;
431 +       unsigned long start_pgtable;
432 +       relocate_new_kernel_t rnk;
433 +
434 +       /* Interrupts aren't acceptable while we reboot */
435 +       local_irq_disable();
436 +
437 +       /* Calculate the offsets */
438 +       indirection_page    = image->head & PAGE_MASK;
439 +       start_pgtable       = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
440 +       control_code_buffer = start_pgtable + 4096UL;
441 +
442 +       /* Set the low half of the page table to my identity mapped
443 +        * page table for kexec.  Leave the high half pointing at the
444 +        * kernel pages.   Don't bother to flush the global pages
445 +        * as that will happen when I fully switch to my identity mapped
446 +        * page table anyway.
447 +        */
448 +       memcpy((void *)read_pda(level4_pgt), __va(start_pgtable), PAGE_SIZE/2);
449 +       __flush_tlb();
450 +
451 +
452 +       /* The segment registers are funny things, they are
453 +        * automatically loaded from a table, in memory wherever you
454 +        * set them to a specific selector, but this table is never
455 +        * accessed again unless you set the segment to a different selector.
456 +        *
457 +        * The more common model is caches where the behind
458 +        * the scenes work is done, but is also dropped at arbitrary
459 +        * times.
460 +        *
461 +        * I take advantage of this here by force loading the
462 +        * segments, before I zap the gdt with an invalid value.
463 +        */
464 +       load_segments();
465 +       /* The gdt & idt are now invalid.
466 +        * If you want to load them you must set up your own idt & gdt.
467 +        */
468 +       set_gdt(phys_to_virt(0),0);
469 +       set_idt(phys_to_virt(0),0);
470 +       /* now call it */
471 +       rnk = (relocate_new_kernel_t) control_code_buffer;
472 +       (*rnk)(indirection_page, control_code_buffer, image->start, start_pgtable);
473 +}
474 Index: linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/arch/x86_64/kernel/reboot.c
475 ===================================================================
476 --- linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891.orig/arch/x86_64/kernel/reboot.c      2004-04-03 22:37:59.000000000 -0500
477 +++ linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/arch/x86_64/kernel/reboot.c   2004-11-18 23:25:15.000000000 -0500
478 @@ -91,31 +91,6 @@
479                       [target] "b" (WARMBOOT_TRAMP));
480  }
481  
482 -#ifdef CONFIG_SMP
483 -static void smp_halt(void)
484 -{
485 -       int cpuid = safe_smp_processor_id(); 
486 -               static int first_entry = 1;
487 -
488 -               if (first_entry) { 
489 -                       first_entry = 0;
490 -                       smp_call_function((void *)machine_restart, NULL, 1, 0);
491 -               } 
492 -                       
493 -       smp_stop_cpu(); 
494 -
495 -       /* AP calling this. Just halt */
496 -       if (cpuid != boot_cpu_id) { 
497 -               for (;;) 
498 -                       asm("hlt");
499 -       }
500 -
501 -       /* Wait for all other CPUs to have run smp_stop_cpu */
502 -       while (!cpus_empty(cpu_online_map))
503 -               rep_nop(); 
504 -}
505 -#endif
506 -
507  static inline void kb_wait(void)
508  {
509         int i;
510 @@ -125,23 +100,45 @@
511                         break;
512  }
513  
514 -void machine_restart(char * __unused)
515 +void machine_shutdown(void)
516  {
517 -       int i;
518 -
519 +       /* Stop the cpus and apics */
520  #ifdef CONFIG_SMP
521 -       smp_halt(); 
522 -#endif
523 +       int reboot_cpu_id;
524 +
525 +       /* The boot cpu is always logical cpu 0 */
526 +       reboot_cpu_id = 0;
527 +
528 +       /* Make certain the cpu I'm about to reboot on is online */
529 +       if (!cpu_isset(reboot_cpu_id, cpu_online_map)) {
530 +               reboot_cpu_id = smp_processor_id();
531 +       }
532 +
533 +       /* Make certain I only run on the appropriate processor */
534 +       set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id));
535  
536 +       /* O.K Now that I'm on the appropriate processor, 
537 +        * stop all of the others.
538 +        */
539 +       smp_send_stop();
540 +#endif
541 +       
542         local_irq_disable();
543 -       
544 +
545  #ifndef CONFIG_SMP
546         disable_local_APIC();
547  #endif
548  
549         disable_IO_APIC();
550 -       
551 +
552         local_irq_enable();
553 +}
554 +
555 +void machine_restart(char * __unused)
556 +{
557 +       int i;
558 +
559 +       machine_shutdown();
560         
561         /* Tell the BIOS if we want cold or warm reboot */
562         *((unsigned short *)__va(0x472)) = reboot_mode;
563 Index: linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/arch/x86_64/kernel/relocate_kernel.S
564 ===================================================================
565 --- linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891.orig/arch/x86_64/kernel/relocate_kernel.S     1969-12-31 19:00:00.000000000 -0500
566 +++ linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/arch/x86_64/kernel/relocate_kernel.S  2004-11-18 23:25:15.000000000 -0500
567 @@ -0,0 +1,141 @@
568 +/*
569 + * relocate_kernel.S - put the kernel image in place to boot
570 + * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
571 + *
572 + * This source code is licensed under the GNU General Public License,
573 + * Version 2.  See the file COPYING for more details.
574 + */
575 +
576 +#include <linux/linkage.h>
577 +
578 +       /*
579 +        * Must be relocatable PIC code callable as a C function, that once
580 +        * it starts cannot use the previous process's stack.
581 +        */
582 +       .globl relocate_new_kernel
583 +       .code64
584 +relocate_new_kernel:
585 +       /* %rdi indirection_page
586 +        * %rsi reboot_code_buffer
587 +        * %rdx start address
588 +        * %rcx page_table
589 +        * %r8  arg5
590 +        * %r9  arg6
591 +        */             
592 +
593 +       /* zero out flags, and disable interrupts */
594 +       pushq $0
595 +       popfq
596 +
597 +       /* set a new stack at the bottom of our page... */
598 +       lea   4096(%rsi), %rsp
599 +
600 +       /* store the parameters back on the stack */
601 +       pushq   %rdx /* store the start address */      
602 +
603 +       /* Set cr0 to a known state:
604 +        * 31 1 == Paging enabled
605 +        * 18 0 == Alignment check disabled
606 +        * 16 0 == Write protect disabled
607 +        * 3  0 == No task switch
608 +        * 2  0 == Don't do FP software emulation.
609 +        * 0  1 == Protected mode enabled
610 +        */
611 +       movq    %cr0, %rax
612 +       andq    $~((1<<18)|(1<<16)|(1<<3)|(1<<2)), %rax
613 +       orl     $((1<<31)|(1<<0)), %eax
614 +       movq    %rax, %cr0
615 +
616 +       /* Set cr4 to a known state:
617 +        * 10 0 == xmm exceptions disabled
618 +        * 9  0 == xmm registers instructions disabled
619 +        * 8  0 == performance monitoring counter disabled
620 +        * 7  0 == page global disabled
621 +        * 6  0 == machine check exceptions disabled    
622 +        * 5  1 == physical address extension enabled   
623 +        * 4  0 == page size extensions disabled
624 +        * 3  0 == Debug extensions disabled    
625 +        * 2  0 == Time stamp disable (disabled)        
626 +        * 1  0 == Protected mode virtual interrupts disabled
627 +        * 0  0 == VME disabled
628 +        */
629 +
630 +       movq    $((1<<5)), %rax
631 +       movq    %rax, %cr4
632 +
633 +       jmp 1f
634 +1:
635 +
636 +       /* Switch to the identity mapped page tables,
637 +        * and flush the TLB.   
638 +       */
639 +       movq    %rcx, %cr3
640 +
641 +       /* Do the copies */
642 +       movq    %rdi, %rbx      /* Put the indirection page in %rbx */
643 +       xorq    %rdi, %rdi
644 +       xorq    %rsi, %rsi
645 +       
646 +0:     /* top, read another word for the indirection page */
647 +       
648 +       movq    (%rbx), %rcx
649 +       addq    $8,     %rbx
650 +       testq   $0x1,   %rcx  /* is it a destination page? */
651 +       jz      1f
652 +       movq    %rcx,   %rdi
653 +       andq    $0xfffffffffffff000, %rdi
654 +       jmp     0b
655 +1:
656 +       testq   $0x2,   %rcx  /* is it an indirection page? */
657 +       jz      1f
658 +       movq    %rcx,   %rbx
659 +       andq    $0xfffffffffffff000, %rbx
660 +       jmp     0b
661 +1:
662 +       testq   $0x4,   %rcx  /* is it the done indicator? */
663 +       jz      1f
664 +       jmp     2f
665 +1:
666 +       testq   $0x8,   %rcx  /* is it the source indicator? */
667 +       jz      0b            /* Ignore it otherwise */
668 +       movq    %rcx,   %rsi  /* For every source page do a copy */
669 +       andq    $0xfffffffffffff000, %rsi
670 +
671 +       movq    $512,   %rcx
672 +       rep ; movsq
673 +       jmp     0b
674 +2:
675 +
676 +       /* To be certain of avoiding problems with self-modifying code
677 +        * I need to execute a serializing instruction here.
678 +        * So I flush the TLB by reloading %cr3 here, it's handy, 
679 +        * and not processor dependent.
680 +        */
681 +       movq    %cr3, %rax
682 +       movq    %rax, %cr3
683 +
684 +       /* set all of the registers to known values */
685 +       /* leave %rsp alone */
686 +
687 +       xorq    %rax, %rax
688 +       xorq    %rbx, %rbx
689 +       xorq    %rcx, %rcx
690 +       xorq    %rdx, %rdx
691 +       xorq    %rsi, %rsi
692 +       xorq    %rdi, %rdi
693 +       xorq    %rbp, %rbp
694 +       xorq    %r8,  %r8
695 +       xorq    %r9,  %r9
696 +       xorq    %r10, %r10
697 +       xorq    %r11, %r11
698 +       xorq    %r12, %r12
699 +       xorq    %r13, %r13
700 +       xorq    %r14, %r14
701 +       xorq    %r15, %r15
702 +
703 +       ret
704 +relocate_new_kernel_end:
705 +
706 +       .globl relocate_new_kernel_size
707 +relocate_new_kernel_size:
708 +       .quad relocate_new_kernel_end - relocate_new_kernel
709 Index: linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/include/asm-x86_64/apicdef.h
710 ===================================================================
711 --- linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891.orig/include/asm-x86_64/apicdef.h     2004-11-11 10:28:46.000000000 -0500
712 +++ linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/include/asm-x86_64/apicdef.h  2004-11-18 23:26:05.000000000 -0500
713 @@ -32,8 +32,8 @@
714  #define                        SET_APIC_LOGICAL_ID(x)  (((x)<<24))
715  #define                        APIC_ALL_CPUS           0xFFu
716  #define                APIC_DFR        0xE0
717 -#define                        APIC_DFR_CLUSTER        0x0FFFFFFFu
718 -#define                        APIC_DFR_FLAT           0xFFFFFFFFu
719 +#define                        APIC_DFR_CLUSTER        0x0FFFFFFFul
720 +#define                        APIC_DFR_FLAT           0xFFFFFFFFul
721  #define                APIC_SPIV       0xF0
722  #define                        APIC_SPIV_FOCUS_DISABLED        (1<<9)
723  #define                        APIC_SPIV_APIC_ENABLED          (1<<8)
724 @@ -89,6 +89,7 @@
725  #define                        APIC_LVT_REMOTE_IRR             (1<<14)
726  #define                        APIC_INPUT_POLARITY             (1<<13)
727  #define                        APIC_SEND_PENDING               (1<<12)
728 +#define                        APIC_MODE_MASK                  0x700
729  #define                        GET_APIC_DELIVERY_MODE(x)       (((x)>>8)&0x7)
730  #define                        SET_APIC_DELIVERY_MODE(x,y)     (((x)&~0x700)|((y)<<8))
731  #define                                APIC_MODE_FIXED         0x0
732 Index: linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/include/asm-x86_64/kexec.h
733 ===================================================================
734 --- linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891.orig/include/asm-x86_64/kexec.h       1969-12-31 19:00:00.000000000 -0500
735 +++ linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/include/asm-x86_64/kexec.h    2004-11-18 23:25:15.000000000 -0500
736 @@ -0,0 +1,25 @@
737 +#ifndef _X86_64_KEXEC_H
738 +#define _X86_64_KEXEC_H
739 +
740 +#include <asm/page.h>
741 +#include <asm/proto.h>
742 +
743 +/*
744 + * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return.
745 + * I.e. Maximum page that is mapped directly into kernel memory,
746 + * and kmap is not required.
747 + *
748 + * So far x86_64 is limited to 40 physical address bits.
749 + */
750 +
751 +/* Maximum physical address we can use pages from */
752 +#define KEXEC_SOURCE_MEMORY_LIMIT      (0xFFFFFFFFFFUL)
753 +/* Maximum address we can reach in physical address mode */
754 +#define KEXEC_DESTINATION_MEMORY_LIMIT (0xFFFFFFFFFFUL)
755 +/* Maximum address we can use for the control pages */
756 +#define KEXEC_CONTROL_MEMORY_LIMIT     (0xFFFFFFFFFFUL)
757 +
758 +/* Allocate one page for the pdp and the second for the code */
759 +#define KEXEC_CONTROL_CODE_SIZE  (4096UL + 4096UL)
760 +
761 +#endif /* _X86_64_KEXEC_H */
762 Index: linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/include/asm-x86_64/unistd.h
763 ===================================================================
764 --- linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891.orig/include/asm-x86_64/unistd.h      2004-11-11 10:28:49.000000000 -0500
765 +++ linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/include/asm-x86_64/unistd.h   2004-11-18 23:27:18.000000000 -0500
766 @@ -551,7 +551,22 @@
767  #define __NR_mq_getsetattr     245
768  __SYSCALL(__NR_mq_getsetattr, sys_mq_getsetattr)
769  
770 -#define __NR_syscall_max __NR_mq_getsetattr
771 +#define __NR_mq_open 240
772 +__SYSCALL(__NR_mq_open, sys_ni_syscall)
773 +#define __NR_mq_unlink 241
774 +__SYSCALL(__NR_mq_unlink, sys_ni_syscall)
775 +#define __NR_mq_timedsend 242
776 +__SYSCALL(__NR_mq_timedsend, sys_ni_syscall)
777 +#define __NR_mq_timedreceive 243
778 +__SYSCALL(__NR_mq_timedreceive, sys_ni_syscall)
779 +#define __NR_mq_notify 244
780 +__SYSCALL(__NR_mq_notify, sys_ni_syscall)
781 +#define __NR_mq_getsetattr 245
782 +__SYSCALL(__NR_mq_getsetattr, sys_ni_syscall)
783 +#define __NR_kexec_load 246
784 +__SYSCALL(__NR_kexec_load, sys_kexec_load)
785 +
786 +#define __NR_syscall_max __NR_kexec_load
787  #ifndef __NO_STUBS
788  
789  /* user-visible error numbers are in the range -1 - -4095 */
790 Index: linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/include/linux/kexec.h
791 ===================================================================
792 --- linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891.orig/include/linux/kexec.h    1969-12-31 19:00:00.000000000 -0500
793 +++ linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/include/linux/kexec.h 2004-11-18 23:25:15.000000000 -0500
794 @@ -0,0 +1,56 @@
795 +#ifndef LINUX_KEXEC_H
796 +#define LINUX_KEXEC_H
797 +
798 +#ifdef CONFIG_KEXEC
799 +#include <linux/types.h>
800 +#include <linux/list.h>
801 +#include <asm/kexec.h>
802 +
803 +/*
804 + * This structure is used to hold the arguments that are used when loading
805 + * kernel binaries.
806 + */
807 +
808 +typedef unsigned long kimage_entry_t;
809 +#define IND_DESTINATION  0x1
810 +#define IND_INDIRECTION  0x2
811 +#define IND_DONE         0x4
812 +#define IND_SOURCE       0x8
813 +
814 +#define KEXEC_SEGMENT_MAX 8
815 +struct kexec_segment {
816 +       void *buf;
817 +       size_t bufsz;
818 +       void *mem;
819 +       size_t memsz;
820 +};
821 +
822 +struct kimage {
823 +       kimage_entry_t head;
824 +       kimage_entry_t *entry;
825 +       kimage_entry_t *last_entry;
826 +
827 +       unsigned long destination;
828 +
829 +       unsigned long start;
830 +       struct page *control_code_page;
831 +
832 +       unsigned long nr_segments;
833 +       struct kexec_segment segment[KEXEC_SEGMENT_MAX];
834 +
835 +       struct list_head control_pages;
836 +       struct list_head dest_pages;
837 +       struct list_head unuseable_pages;
838 +};
839 +
840 +
841 +/* kexec interface functions */
842 +extern void machine_kexec(struct kimage *image);
843 +extern int machine_kexec_prepare(struct kimage *image);
844 +extern void machine_kexec_cleanup(struct kimage *image);
845 +extern asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
846 +       struct kexec_segment *segments, unsigned long flags);
847 +extern struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order);
848 +extern struct kimage *kexec_image;
849 +#endif
850 +#endif /* LINUX_KEXEC_H */
851 Index: linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/include/linux/reboot.h
852 ===================================================================
853 --- linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891.orig/include/linux/reboot.h   2004-04-03 22:38:27.000000000 -0500
854 +++ linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/include/linux/reboot.h        2004-11-18 23:25:15.000000000 -0500
855 @@ -22,6 +22,7 @@
856   * POWER_OFF   Stop OS and remove all power from system, if possible.
857   * RESTART2    Restart system using given command string.
858   * SW_SUSPEND  Suspend system using software suspend if compiled in.
859 + * KEXEC       Restart system using a previously loaded Linux kernel
860   */
861  
862  #define        LINUX_REBOOT_CMD_RESTART        0x01234567
863 @@ -31,6 +32,7 @@
864  #define        LINUX_REBOOT_CMD_POWER_OFF      0x4321FEDC
865  #define        LINUX_REBOOT_CMD_RESTART2       0xA1B2C3D4
866  #define        LINUX_REBOOT_CMD_SW_SUSPEND     0xD000FCE2
867 +#define LINUX_REBOOT_CMD_KEXEC          0x45584543
868  
869  
870  #ifdef __KERNEL__
871 @@ -49,6 +51,8 @@
872  extern void machine_halt(void);
873  extern void machine_power_off(void);
874  
875 +extern void machine_shutdown(void);
876 +
877  #endif
878  
879  #endif /* _LINUX_REBOOT_H */
880 Index: linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/include/linux/syscalls.h
881 ===================================================================
882 --- linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891.orig/include/linux/syscalls.h 2004-11-11 10:28:49.000000000 -0500
883 +++ linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/include/linux/syscalls.h      2004-11-18 23:25:15.000000000 -0500
884 @@ -19,6 +19,7 @@
885  struct iovec;
886  struct itimerspec;
887  struct itimerval;
888 +struct kexec_segment;
889  struct linux_dirent;
890  struct linux_dirent64;
891  struct list_head;
892 @@ -154,6 +155,8 @@
893  asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd,
894                                 void __user *arg);
895  asmlinkage long sys_restart_syscall(void);
896 +asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
897 +                             struct kexec_segment *segments, unsigned long flags);
898  
899  asmlinkage long sys_exit(int error_code);
900  asmlinkage void sys_exit_group(int error_code);
901 Index: linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/kernel/Makefile
902 ===================================================================
903 --- linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891.orig/kernel/Makefile  2004-11-11 10:28:43.000000000 -0500
904 +++ linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/kernel/Makefile       2004-11-18 23:25:15.000000000 -0500
905 @@ -17,6 +17,7 @@
906  obj-$(CONFIG_KALLSYMS) += kallsyms.o
907  obj-$(CONFIG_PM) += power/
908  obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
909 +obj-$(CONFIG_KEXEC) += kexec.o
910  obj-$(CONFIG_COMPAT) += compat.o
911  obj-$(CONFIG_PAGG) += pagg.o
912  obj-$(CONFIG_IKCONFIG) += configs.o
913 Index: linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/kernel/kexec.c
914 ===================================================================
915 --- linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891.orig/kernel/kexec.c   1969-12-31 19:00:00.000000000 -0500
916 +++ linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/kernel/kexec.c        2004-11-18 23:25:15.000000000 -0500
917 @@ -0,0 +1,640 @@
918 +/*
919 + * kexec.c - kexec system call
920 + * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
921 + *
922 + * This source code is licensed under the GNU General Public License,
923 + * Version 2.  See the file COPYING for more details.
924 + */
925 +
926 +#include <linux/mm.h>
927 +#include <linux/file.h>
928 +#include <linux/slab.h>
929 +#include <linux/fs.h>
930 +#include <linux/kexec.h>
931 +#include <linux/spinlock.h>
932 +#include <linux/list.h>
933 +#include <linux/highmem.h>
934 +#include <net/checksum.h>
935 +#include <asm/page.h>
936 +#include <asm/uaccess.h>
937 +#include <asm/io.h>
938 +#include <asm/system.h>
939 +
940 +/*
941 + * When kexec transitions to the new kernel there is a one-to-one
942 + * mapping between physical and virtual addresses.  On processors
943 + * where you can disable the MMU this is trivial, and easy.  For
944 + * others it is still a simple predictable page table to setup.
945 + *
946 + * In that environment kexec copies the new kernel to its final
947 + * resting place.  This means I can only support memory whose
948 + * physical address can fit in an unsigned long.  In particular
949 + * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
950 + * If the assembly stub has more restrictive requirements
951 + * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DESTINATION_MEMORY_LIMIT can be
952 + * defined more restrictively in <asm/kexec.h>.
953 + *
954 + * The code for the transition from the current kernel to the
955 + * the new kernel is placed in the control_code_buffer, whose size
956 + * is given by KEXEC_CONTROL_CODE_SIZE.  In the best case only a single
957 + * page of memory is necessary, but some architectures require more.
958 + * Because this memory must be identity mapped in the transition from
959 + * virtual to physical addresses it must live in the range
960 + * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
961 + * modifiable.
962 + *
963 + * The assembly stub in the control code buffer is passed a linked list
964 + * of descriptor pages detailing the source pages of the new kernel,
965 + * and the destination addresses of those source pages.  As this data
966 + * structure is not used in the context of the current OS, it must
967 + * be self-contained.
968 + *
969 + * The code has been made to work with highmem pages and will use a
970 + * destination page in its final resting place (if it happens
971 + * to allocate it).  The end product of this is that most of the
972 + * physical address space, and most of RAM can be used.
973 + *
974 + * Future directions include:
975 + *  - allocating a page table with the control code buffer identity
976 + *    mapped, to simplify machine_kexec and make kexec_on_panic more
977 + *    reliable.
978 + */
979 +
980 +/*
981 + * KIMAGE_NO_DEST is an impossible destination address..., for
982 + * allocating pages whose destination address we do not care about.
983 + */
984 +#define KIMAGE_NO_DEST (-1UL)
985 +
986 +static int kimage_is_destination_range(
987 +       struct kimage *image, unsigned long start, unsigned long end);
988 +static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long dest);
989 +
990 +
991 +static int kimage_alloc(struct kimage **rimage,
992 +       unsigned long nr_segments, struct kexec_segment *segments)
993 +{
994 +       int result;
995 +       struct kimage *image;
996 +       size_t segment_bytes;
997 +       unsigned long i;
998 +
999 +       /* Allocate a controlling structure */
1000 +       result = -ENOMEM;
1001 +       image = kmalloc(sizeof(*image), GFP_KERNEL);
1002 +       if (!image) {
1003 +               goto out;
1004 +       }
1005 +       memset(image, 0, sizeof(*image));
1006 +       image->head = 0;
1007 +       image->entry = &image->head;
1008 +       image->last_entry = &image->head;
1009 +
1010 +       /* Initialize the list of control pages */
1011 +       INIT_LIST_HEAD(&image->control_pages);
1012 +
1013 +       /* Initialize the list of destination pages */
1014 +       INIT_LIST_HEAD(&image->dest_pages);
1015 +
1016 +       /* Initialize the list of unuseable pages */
1017 +       INIT_LIST_HEAD(&image->unuseable_pages);
1018 +
1019 +       /* Read in the segments */
1020 +       image->nr_segments = nr_segments;
1021 +       segment_bytes = nr_segments * sizeof*segments;
1022 +       result = copy_from_user(image->segment, segments, segment_bytes);
1023 +       if (result)
1024 +               goto out;
1025 +
1026 +       /*
1027 +        * Verify we have good destination addresses.  The caller is
1028 +        * responsible for making certain we don't attempt to load
1029 +        * the new image into invalid or reserved areas of RAM.  This
1030 +        * just verifies it is an address we can use.
1031 +        */
1032 +       result = -EADDRNOTAVAIL;
1033 +       for (i = 0; i < nr_segments; i++) {
1034 +               unsigned long mend;
1035 +               mend = ((unsigned long)(image->segment[i].mem)) +
1036 +                       image->segment[i].memsz;
1037 +               if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
1038 +                       goto out;
1039 +       }
1040 +
1041 +       /*
1042 +        * Find a location for the control code buffer, and add it
1043 +        * the vector of segments so that it's pages will also be
1044 +        * counted as destination pages.
1045 +        */
1046 +       result = -ENOMEM;
1047 +       image->control_code_page = kimage_alloc_control_pages(image,
1048 +               get_order(KEXEC_CONTROL_CODE_SIZE));
1049 +       if (!image->control_code_page) {
1050 +               printk(KERN_ERR "Could not allocate control_code_buffer\n");
1051 +               goto out;
1052 +       }
1053 +
1054 +       result = 0;
1055 + out:
1056 +       if (result == 0) {
1057 +               *rimage = image;
1058 +       } else {
1059 +               kfree(image);
1060 +       }
1061 +       return result;
1062 +}
1063 +
1064 +static int kimage_is_destination_range(
1065 +       struct kimage *image, unsigned long start, unsigned long end)
1066 +{
1067 +       unsigned long i;
1068 +
1069 +       for (i = 0; i < image->nr_segments; i++) {
1070 +               unsigned long mstart, mend;
1071 +               mstart = (unsigned long)image->segment[i].mem;
1072 +               mend   = mstart + image->segment[i].memsz;
1073 +               if ((end > mstart) && (start < mend)) {
1074 +                       return 1;
1075 +               }
1076 +       }
1077 +       return 0;
1078 +}
1079 +
1080 +static struct page *kimage_alloc_pages(unsigned int gfp_mask, unsigned int order)
1081 +{
1082 +       struct page *pages;
1083 +       pages = alloc_pages(gfp_mask, order);
1084 +       if (pages) {
1085 +               unsigned int count, i;
1086 +               pages->mapping = NULL;
1087 +               pages->private = order;
1088 +               count = 1 << order;
1089 +               for(i = 0; i < count; i++) {
1090 +                       SetPageReserved(pages + i);
1091 +               }
1092 +       }
1093 +       return pages;
1094 +}
1095 +
1096 +static void kimage_free_pages(struct page *page)
1097 +{
1098 +       unsigned int order, count, i;
1099 +       order = page->private;
1100 +       count = 1 << order;
1101 +       for(i = 0; i < count; i++) {
1102 +               ClearPageReserved(page + i);
1103 +       }
1104 +       __free_pages(page, order);
1105 +}
1106 +
1107 +static void kimage_free_page_list(struct list_head *list)
1108 +{
1109 +       struct list_head *pos, *next;
1110 +       list_for_each_safe(pos, next, list) {
1111 +               struct page *page;
1112 +
1113 +               page = list_entry(pos, struct page, lru);
1114 +               list_del(&page->lru);
1115 +
1116 +               kimage_free_pages(page);
1117 +       }
1118 +}
1119 +               
1120 +struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order)
1121 +{
1122 +       /* Control pages are special, they are the intermediaries
1123 +        * that are needed while we copy the rest of the pages
1124 +        * to their final resting place.  As such they must
1125 +        * not conflict with either the destination addresses
1126 +        * or memory the kernel is already using.
1127 +        *
1128 +        * The only case where we really need more than one of
1129 +        * these are for architectures where we cannot disable
1130 +        * the MMU and must instead generate an identity mapped
1131 +        * page table for all of the memory.
1132 +        *
1133 +        * At worst this runs in O(N) of the image size.
1134 +        */
1135 +       struct list_head extra_pages;
1136 +       struct page *pages;
1137 +       unsigned int count;
1138 +
1139 +       count = 1 << order;
1140 +       INIT_LIST_HEAD(&extra_pages);
1141 +
1142 +       /* Loop while I can allocate a page and the page allocated
1143 +        * is a destination page.
1144 +        */
1145 +       do {
1146 +               unsigned long pfn, epfn, addr, eaddr;
1147 +               pages = kimage_alloc_pages(GFP_KERNEL, order);
1148 +               if (!pages)
1149 +                       break;
1150 +               pfn   = page_to_pfn(pages);
1151 +               epfn  = pfn + count;
1152 +               addr  = pfn << PAGE_SHIFT;
1153 +               eaddr = epfn << PAGE_SHIFT;
1154 +               if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
1155 +                       kimage_is_destination_range(image, addr, eaddr))
1156 +               {
1157 +                       list_add(&pages->lru, &extra_pages);
1158 +                       pages = NULL;
1159 +               }
1160 +       } while(!pages);
1161 +       if (pages) {
1162 +               /* Remember the allocated page... */
1163 +               list_add(&pages->lru, &image->control_pages);
1164 +
1165 +                * Because the page is already in its destination
1166 +                * location we will never allocate another page at
1167 +                * that address.  Therefore kimage_alloc_pages
1168 +                * will not return it (again) and we don't need
1169 +                * to give it an entry in image->segment[].
1170 +                */
1171 +       }
1172 +       /* Deal with the destination pages I have inadvertently allocated.
1173 +        *
1174 +        * Ideally I would convert multi-page allocations into single
1175 +        * page allocations, and add everything to image->dest_pages.
1176 +        * 
1177 +        * For now it is simpler to just free the pages.
1178 +        */
1179 +       kimage_free_page_list(&extra_pages);
1180 +       return pages;
1181 +       
1182 +}
1183 +
1184 +static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
1185 +{
1186 +       if (*image->entry != 0) {
1187 +               image->entry++;
1188 +       }
1189 +       if (image->entry == image->last_entry) {
1190 +               kimage_entry_t *ind_page;
1191 +               struct page *page;
1192 +               page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
1193 +               if (!page) {
1194 +                       return -ENOMEM;
1195 +               }
1196 +               ind_page = page_address(page);
1197 +               *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
1198 +               image->entry = ind_page;
1199 +               image->last_entry =
1200 +                       ind_page + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
1201 +       }
1202 +       *image->entry = entry;
1203 +       image->entry++;
1204 +       *image->entry = 0;
1205 +       return 0;
1206 +}
1207 +
1208 +static int kimage_set_destination(
1209 +       struct kimage *image, unsigned long destination)
1210 +{
1211 +       int result;
1212 +
1213 +       destination &= PAGE_MASK;
1214 +       result = kimage_add_entry(image, destination | IND_DESTINATION);
1215 +       if (result == 0) {
1216 +               image->destination = destination;
1217 +       }
1218 +       return result;
1219 +}
1220 +
1221 +
1222 +static int kimage_add_page(struct kimage *image, unsigned long page)
1223 +{
1224 +       int result;
1225 +
1226 +       page &= PAGE_MASK;
1227 +       result = kimage_add_entry(image, page | IND_SOURCE);
1228 +       if (result == 0) {
1229 +               image->destination += PAGE_SIZE;
1230 +       }
1231 +       return result;
1232 +}
1233 +
1234 +
1235 +static void kimage_free_extra_pages(struct kimage *image)
1236 +{
1237 +       /* Walk through and free any extra destination pages I may have */
1238 +       kimage_free_page_list(&image->dest_pages);
1239 +
1240 +       /* Walk through and free any unuseable pages I have cached */
1241 +       kimage_free_page_list(&image->unuseable_pages);
1242 +
1243 +}
1244 +static int kimage_terminate(struct kimage *image)
1245 +{
1246 +       int result;
1247 +
1248 +       result = kimage_add_entry(image, IND_DONE);
1249 +       if (result == 0) {
1250 +               /* Point at the terminating element */
1251 +               image->entry--;
1252 +               kimage_free_extra_pages(image);
1253 +       }
1254 +       return result;
1255 +}
1256 +
1257 +#define for_each_kimage_entry(image, ptr, entry) \
1258 +       for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
1259 +               ptr = (entry & IND_INDIRECTION)? \
1260 +                       phys_to_virt((entry & PAGE_MASK)): ptr +1)
1261 +
1262 +static void kimage_free_entry(kimage_entry_t entry)
1263 +{
1264 +       struct page *page;
1265 +
1266 +       page = pfn_to_page(entry >> PAGE_SHIFT);
1267 +       kimage_free_pages(page);
1268 +}
1269 +
1270 +static void kimage_free(struct kimage *image)
1271 +{
1272 +       kimage_entry_t *ptr, entry;
1273 +       kimage_entry_t ind = 0;
1274 +
1275 +       if (!image)
1276 +               return;
1277 +       kimage_free_extra_pages(image);
1278 +       for_each_kimage_entry(image, ptr, entry) {
1279 +               if (entry & IND_INDIRECTION) {
1280 +                       /* Free the previous indirection page */
1281 +                       if (ind & IND_INDIRECTION) {
1282 +                               kimage_free_entry(ind);
1283 +                       }
1284 +                       /* Save this indirection page until we are
1285 +                        * done with it.
1286 +                        */
1287 +                       ind = entry;
1288 +               }
1289 +               else if (entry & IND_SOURCE) {
1290 +                       kimage_free_entry(entry);
1291 +               }
1292 +       }
1293 +       /* Free the final indirection page */
1294 +       if (ind & IND_INDIRECTION) {
1295 +               kimage_free_entry(ind);
1296 +       }
1297 +
1298 +       /* Handle any machine specific cleanup */
1299 +       machine_kexec_cleanup(image);
1300 +
1301 +       /* Free the kexec control pages... */
1302 +       kimage_free_page_list(&image->control_pages);
1303 +       kfree(image);
1304 +}
1305 +
1306 +static kimage_entry_t *kimage_dst_used(struct kimage *image, unsigned long page)
1307 +{
1308 +       kimage_entry_t *ptr, entry;
1309 +       unsigned long destination = 0;
1310 +
1311 +       for_each_kimage_entry(image, ptr, entry) {
1312 +               if (entry & IND_DESTINATION) {
1313 +                       destination = entry & PAGE_MASK;
1314 +               }
1315 +               else if (entry & IND_SOURCE) {
1316 +                       if (page == destination) {
1317 +                               return ptr;
1318 +                       }
1319 +                       destination += PAGE_SIZE;
1320 +               }
1321 +       }
1322 +       return 0;
1323 +}
1324 +
1325 +static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long destination)
1326 +{
1327 +       /*
1328 +        * Here we implement safeguards to ensure that a source page
1329 +        * is not copied to its destination page before the data on
1330 +        * the destination page is no longer useful.
1331 +        *
1332 +        * To do this we maintain the invariant that a source page is
1333 +        * either its own destination page, or it is not a
1334 +        * destination page at all.
1335 +        *
1336 +        * That is slightly stronger than required, but the proof
1337 +        * that no problems will occur is trivial, and the
1338 +        * implementation is simple to verify.
1339 +        *
1340 +        * When allocating all pages normally this algorithm will run
1341 +        * in O(N) time, but in the worst case it will run in O(N^2)
1342 +        * time.   If the runtime is a problem the data structures can
1343 +        * be fixed.
1344 +        */
1345 +       struct page *page;
1346 +       unsigned long addr;
1347 +
1348 +       /*
1349 +        * Walk through the list of destination pages, and see if I
1350 +        * have a match.
1351 +        */
1352 +       list_for_each_entry(page, &image->dest_pages, lru) {
1353 +               addr = page_to_pfn(page) << PAGE_SHIFT;
1354 +               if (addr == destination) {
1355 +                       list_del(&page->lru);
1356 +                       return page;
1357 +               }
1358 +       }
1359 +       page = NULL;
1360 +       while (1) {
1361 +               kimage_entry_t *old;
1362 +
1363 +               /* Allocate a page, if we run out of memory give up */
1364 +               page = kimage_alloc_pages(gfp_mask, 0);
1365 +               if (!page) {
1366 +                       return 0;
1367 +               }
1368 +               /* If the page cannot be used file it away */
1369 +               if (page_to_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
1370 +                       list_add(&page->lru, &image->unuseable_pages);
1371 +                       continue;
1372 +               }
1373 +               addr = page_to_pfn(page) << PAGE_SHIFT;
1374 +
1375 +               /* If it is the destination page we want use it */
1376 +               if (addr == destination)
1377 +                       break;
1378 +
1379 +               /* If the page is not a destination page use it */
1380 +               if (!kimage_is_destination_range(image, addr, addr + PAGE_SIZE))
1381 +                       break;
1382 +
1383 +               /*
1384 +                * I know that the page is someone's destination page.
1385 +                * See if there is already a source page for this
1386 +                * destination page.  And if so swap the source pages.
1387 +                */
1388 +               old = kimage_dst_used(image, addr);
1389 +               if (old) {
1390 +                       /* If so move it */
1391 +                       unsigned long old_addr;
1392 +                       struct page *old_page;
1393 +
1394 +                       old_addr = *old & PAGE_MASK;
1395 +                       old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
1396 +                       copy_highpage(page, old_page);
1397 +                       *old = addr | (*old & ~PAGE_MASK);
1398 +
1399 +                       /* The old page I have found cannot be a
1400 +                        * destination page, so return it.
1401 +                        */
1402 +                       addr = old_addr;
1403 +                       page = old_page;
1404 +                       break;
1405 +               }
1406 +               else {
1407 +                       /* Place the page on the destination list I
1408 +                        * will use it later.
1409 +                        */
1410 +                       list_add(&page->lru, &image->dest_pages);
1411 +               }
1412 +       }
1413 +       return page;
1414 +}
1415 +
1416 +static int kimage_load_segment(struct kimage *image,
1417 +       struct kexec_segment *segment)
1418 +{
1419 +       unsigned long mstart;
1420 +       int result;
1421 +       unsigned long offset;
1422 +       unsigned long offset_end;
1423 +       unsigned char *buf;
1424 +
1425 +       result = 0;
1426 +       buf = segment->buf;
1427 +       mstart = (unsigned long)segment->mem;
1428 +
1429 +       offset_end = segment->memsz;
1430 +
1431 +       result = kimage_set_destination(image, mstart);
1432 +       if (result < 0) {
1433 +               goto out;
1434 +       }
1435 +       for (offset = 0;  offset < segment->memsz; offset += PAGE_SIZE) {
1436 +               struct page *page;
1437 +               char *ptr;
1438 +               size_t size, leader;
1439 +               page = kimage_alloc_page(image, GFP_HIGHUSER, mstart + offset);
1440 +               if (page == 0) {
1441 +                       result  = -ENOMEM;
1442 +                       goto out;
1443 +               }
1444 +               result = kimage_add_page(image, page_to_pfn(page) << PAGE_SHIFT);
1445 +               if (result < 0) {
1446 +                       goto out;
1447 +               }
1448 +               ptr = kmap(page);
1449 +               if (segment->bufsz < offset) {
1450 +                       /* We are past the end zero the whole page */
1451 +                       memset(ptr, 0, PAGE_SIZE);
1452 +                       kunmap(page);
1453 +                       continue;
1454 +               }
1455 +               size = PAGE_SIZE;
1456 +               leader = 0;
1457 +               if ((offset == 0)) {
1458 +                       leader = mstart & ~PAGE_MASK;
1459 +               }
1460 +               if (leader) {
1461 +                       /* We are on the first page zero the unused portion */
1462 +                       memset(ptr, 0, leader);
1463 +                       size -= leader;
1464 +                       ptr += leader;
1465 +               }
1466 +               if (size > (segment->bufsz - offset)) {
1467 +                       size = segment->bufsz - offset;
1468 +               }
1469 +               if (size < (PAGE_SIZE - leader)) {
1470 +                       /* zero the trailing part of the page */
1471 +                       memset(ptr + size, 0, (PAGE_SIZE - leader) - size);
1472 +               }
1473 +               result = copy_from_user(ptr, buf + offset, size);
1474 +               kunmap(page);
1475 +               if (result) {
1476 +                       result = (result < 0) ? result : -EIO;
1477 +                       goto out;
1478 +               }
1479 +       }
1480 + out:
1481 +       return result;
1482 +}
1483 +
1484 +/*
1485 + * Exec Kernel system call: for obvious reasons only root may call it.
1486 + *
1487 + * This call breaks up into three pieces.
1488 + * - A generic part which loads the new kernel from the current
1489 + *   address space, and very carefully places the data in the
1490 + *   allocated pages.
1491 + *
1492 + * - A generic part that interacts with the kernel and tells all of
1493 + *   the devices to shut down.  Preventing on-going dmas, and placing
1494 + *   the devices in a consistent state so a later kernel can
1495 + *   reinitialize them.
1496 + *
1497 + * - A machine specific part that includes the syscall number
1498 + *   and then copies the image to its final destination.  And
1499 + *   jumps into the image at entry.
1500 + *
1501 + * kexec does not sync, or unmount filesystems so if you need
1502 + * that to happen you need to do that yourself.
1503 + */
1504 +struct kimage *kexec_image = NULL;
1505 +
1506 +asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
1507 +       struct kexec_segment *segments, unsigned long flags)
1508 +{
1509 +       struct kimage *image;
1510 +       int result;
1511 +
1512 +       /* We only trust the superuser with rebooting the system. */
1513 +       if (!capable(CAP_SYS_BOOT))
1514 +               return -EPERM;
1515 +
1516 +       /*
1517 +        * In case we need just a little bit of special behavior for
1518 +        * reboot on panic.
1519 +        */
1520 +       if (flags != 0)
1521 +               return -EINVAL;
1522 +
1523 +       if (nr_segments > KEXEC_SEGMENT_MAX)
1524 +               return -EINVAL;
1525 +
1526 +       image = NULL;
1527 +       result = 0;
1528 +
1529 +       if (nr_segments > 0) {
1530 +               unsigned long i;
1531 +               result = kimage_alloc(&image, nr_segments, segments);
1532 +               if (result) {
1533 +                       goto out;
1534 +               }
1535 +               result = machine_kexec_prepare(image);
1536 +               if (result) {
1537 +                       goto out;
1538 +               }
1539 +               image->start = entry;
1540 +               for (i = 0; i < nr_segments; i++) {
1541 +                       result = kimage_load_segment(image, &image->segment[i]);
1542 +                       if (result) {
1543 +                               goto out;
1544 +                       }
1545 +               }
1546 +               result = kimage_terminate(image);
1547 +               if (result) {
1548 +                       goto out;
1549 +               }
1550 +       }
1551 +
1552 +       image = xchg(&kexec_image, image);
1553 +
1554 + out:
1555 +       kimage_free(image);
1556 +       return result;
1557 +}
1558 Index: linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/kernel/sys.c
1559 ===================================================================
1560 --- linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891.orig/kernel/sys.c     2004-11-11 10:28:49.000000000 -0500
1561 +++ linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/kernel/sys.c  2004-11-18 23:25:15.000000000 -0500
1562 @@ -17,6 +17,8 @@
1563  #include <linux/init.h>
1564  #include <linux/highuid.h>
1565  #include <linux/fs.h>
1566 +#include <linux/kernel.h>
1567 +#include <linux/kexec.h>
1568  #include <linux/workqueue.h>
1569  #include <linux/device.h>
1570  #include <linux/times.h>
1571 @@ -226,6 +228,7 @@
1572  cond_syscall(sys_lookup_dcookie)
1573  cond_syscall(sys_swapon)
1574  cond_syscall(sys_swapoff)
1575 +cond_syscall(sys_kexec_load)
1576  cond_syscall(sys_init_module)
1577  cond_syscall(sys_delete_module)
1578  cond_syscall(sys_socketpair)
1579 @@ -505,6 +508,24 @@
1580                 machine_restart(buffer);
1581                 break;
1582  
1583 +#ifdef CONFIG_KEXEC
1584 +       case LINUX_REBOOT_CMD_KEXEC:
1585 +       {
1586 +               struct kimage *image;
1587 +               image = xchg(&kexec_image, 0);
1588 +               if (!image) {
1589 +                       unlock_kernel();
1590 +                       return -EINVAL;
1591 +               }
1592 +               notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL);
1593 +               system_state = SYSTEM_BOOTING;
1594 +               device_shutdown();
1595 +               printk(KERN_EMERG "Starting new kernel\n");
1596 +               machine_shutdown();
1597 +               machine_kexec(image);
1598 +               break;
1599 +       }
1600 +#endif
1601  #ifdef CONFIG_SOFTWARE_SUSPEND
1602         case LINUX_REBOOT_CMD_SW_SUSPEND:
1603                 {