2
0
mirror of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git synced 2025-09-04 20:19:47 +08:00
linux/arch/x86/kernel/setup.c
Linus Torvalds 00c010e130 - The 11 patch series "Add folio_mk_pte()" from Matthew Wilcox
simplifies the act of creating a pte which addresses the first page in a
   folio and reduces the amount of plumbing which architecture must
   implement to provide this.
 
 - The 8 patch series "Misc folio patches for 6.16" from Matthew Wilcox
   is a shower of largely unrelated folio infrastructure changes which
   clean things up and better prepare us for future work.
 
 - The 3 patch series "memory,x86,acpi: hotplug memory alignment
   advisement" from Gregory Price adds early-init code to prevent x86 from
   leaving physical memory unused when physical address regions are not
   aligned to memory block size.
 
 - The 2 patch series "mm/compaction: allow more aggressive proactive
   compaction" from Michal Clapinski provides some tuning of the (sadly,
   hard-coded (more sadly, not auto-tuned)) thresholds for our invocation
   of proactive compaction.  In a simple test case, the reduction of a guest
   VM's memory consumption was dramatic.
 
 - The 8 patch series "Minor cleanups and improvements to swap freeing
   code" from Kemeng Shi provides some code cleanups and a small efficiency
   improvement to this part of our swap handling code.
 
 - The 6 patch series "ptrace: introduce PTRACE_SET_SYSCALL_INFO API"
   from Dmitry Levin adds the ability for a ptracer to modify syscalls
   arguments.  At this time we can alter only "system call information that
   are used by strace system call tampering, namely, syscall number,
   syscall arguments, and syscall return value".
 
   This series should have been incorporated into mm.git's "non-MM"
   branch, but I goofed.
 
 - The 3 patch series "fs/proc: extend the PAGEMAP_SCAN ioctl to report
   guard regions" from Andrei Vagin extends the info returned by the
   PAGEMAP_SCAN ioctl against /proc/pid/pagemap.  This permits CRIU to more
   efficiently get at the info about guard regions.
 
 - The 2 patch series "Fix parameter passed to page_mapcount_is_type()"
   from Gavin Shan implements that fix.  No runtime effect is expected
   because validate_page_before_insert() happens to fix up this error.
 
 - The 3 patch series "kernel/events/uprobes: uprobe_write_opcode()
   rewrite" from David Hildenbrand basically brings uprobe text poking into
   the current decade.  Remove a bunch of hand-rolled implementation in
   favor of using more current facilities.
 
 - The 3 patch series "mm/ptdump: Drop assumption that pxd_val() is u64"
   from Anshuman Khandual provides enhancements and generalizations to the
   pte dumping code.  This might be needed when 128-bit Page Table
   Descriptors are enabled for ARM.
 
 - The 12 patch series "Always call constructor for kernel page tables"
   from Kevin Brodsky "ensures that the ctor/dtor is always called for
   kernel pgtables, as it already is for user pgtables".  This permits the
   addition of more functionality such as "insert hooks to protect page
   tables".  This change does result in various architectures performing
   unnecessary work, but this is fixed up where it is anticipated to occur.
 
 - The 9 patch series "Rust support for mm_struct, vm_area_struct, and
   mmap" from Alice Ryhl adds plumbing to permit Rust access to core MM
   structures.
 
 - The 3 patch series "fix incorrectly disallowed anonymous VMA merges"
   from Lorenzo Stoakes takes advantage of some VMA merging opportunities
   which we've been missing for 15 years.
 
 - The 4 patch series "mm/madvise: batch tlb flushes for MADV_DONTNEED
   and MADV_FREE" from SeongJae Park optimizes process_madvise()'s TLB
   flushing.  Instead of flushing each address range in the provided iovec,
   we batch the flushing across all the iovec entries.  The syscall's cost
   was approximately halved with a microbenchmark which was designed to
   load this particular operation.
 
 - The 6 patch series "Track node vacancy to reduce worst case allocation
   counts" from Sidhartha Kumar makes the maple tree smarter about its node
   preallocation.  stress-ng mmap performance increased by single-digit
   percentages and the amount of unnecessarily preallocated memory was
   dramatically reduced.
 
 - The 3 patch series "mm/gup: Minor fix, cleanup and improvements" from
   Baoquan He removes a few unnecessary things which Baoquan noted when
   reading the code.
 
 - The 3 patch series "Enhance sysfs handling for memory hotplug in
   weighted interleave" from Rakie Kim "enhances the weighted interleave
   policy in the memory management subsystem by improving sysfs handling,
   fixing memory leaks, and introducing dynamic sysfs updates for memory
   hotplug support".  Fixes things on error paths which we are unlikely to
   hit.
 
 - The 7 patch series "mm/damon: auto-tune DAMOS for NUMA setups
   including tiered memory" from SeongJae Park introduces new DAMOS quota
   goal metrics which eliminate the manual tuning which is required when
   utilizing DAMON for memory tiering.
 
 - The 5 patch series "mm/vmalloc.c: code cleanup and improvements" from
   Baoquan He provides cleanups and small efficiency improvements which
   Baoquan found via code inspection.
 
 - The 2 patch series "vmscan: enforce mems_effective during demotion"
   from Gregory Price "changes reclaim to respect cpuset.mems_effective
   during demotion when possible", because "presently, reclaim explicitly
   ignores cpuset.mems_effective when demoting, which may cause the cpuset
   settings to be violated." "This is useful for isolating workloads on a
   multi-tenant system from certain classes of memory more consistently."
 
 - The 2 patch series "Clean up split_huge_pmd_locked() and remove
   unnecessary folio pointers" from Gavin Guo provides minor cleanups and
   efficiency gains in the huge page splitting and migrating code.
 
 - The 3 patch series "Use kmem_cache for memcg alloc" from Huan Yang
   creates a slab cache for `struct mem_cgroup', yielding improved memory
   utilization.
 
 - The 4 patch series "add max arg to swappiness in memory.reclaim and
   lru_gen" from Zhongkun He adds a new "max" argument to the "swappiness="
   argument for memory.reclaim and MGLRU's lru_gen.  This directs proactive
   reclaim to reclaim from only anon folios rather than file-backed folios.
 
 - The 17 patch series "kexec: introduce Kexec HandOver (KHO)" from Mike
   Rapoport is the first step on the path to permitting the kernel to
   maintain existing VMs while replacing the host kernel via file-based
   kexec.  At this time only memblock's reserve_mem is preserved.
 
 - The 7 patch series "mm: Introduce for_each_valid_pfn()" from David
   Woodhouse provides and uses a smarter way of looping over a pfn range,
   by skipping ranges of invalid pfns.
 
 - The 2 patch series "sched/numa: Skip VMA scanning on memory pinned to
   one NUMA node via cpuset.mems" from Libo Chen removes a lot of pointless
   VMA scanning when a task is pinned to a single NUMA node.  Dramatic
   performance benefits were seen in some real world cases.
 
 - The 2 patch series "JFS: Implement migrate_folio for
   jfs_metapage_aops" from Shivank Garg addresses a warning which occurs
   during memory compaction when using JFS.
 
 - The 4 patch series "move all VMA allocation, freeing and duplication
   logic to mm" from Lorenzo Stoakes moves some VMA code from kernel/fork.c
   into the more appropriate mm/vma.c.
 
 - The 6 patch series "mm, swap: clean up swap cache mapping helper" from
   Kairui Song provides code consolidation and cleanups related to the
   folio_index() function.
 
 - The 2 patch series "mm/gup: Cleanup memfd_pin_folios()" from Vishal
   Moola does that.
 
 - The 8 patch series "memcg: Fix test_memcg_min/low test failures" from
   Waiman Long addresses some bogus failures which are being reported by
   the test_memcontrol selftest.
 
 - The 3 patch series "eliminate mmap() retry merge, add .mmap_prepare
   hook" from Lorenzo Stoakes commences the deprecation of
   file_operations.mmap() in favor of the new
   file_operations.mmap_prepare().  The latter is more restrictive and
   prevents drivers from messing with things in ways which, amongst other
   problems, may defeat VMA merging.
 
 - The 4 patch series "memcg: decouple memcg and objcg stocks" from
   Shakeel Butt decouples the per-cpu memcg charge cache from the objcg's
   one.  This is a step along the way to making memcg and objcg charging
   NMI-safe, which is a BPF requirement.
 
 - The 6 patch series "mm/damon: minor fixups and improvements for code,
   tests, and documents" from SeongJae Park is "yet another batch of
   miscellaneous DAMON changes.  Fix and improve minor problems in code,
   tests and documents."
 
 - The 7 patch series "memcg: make memcg stats irq safe" from Shakeel
   Butt converts memcg stats to be irq safe.  Another step along the way to
   making memcg charging and stats updates NMI-safe, a BPF requirement.
 
 - The 4 patch series "Let unmap_hugepage_range() and several related
   functions take folio instead of page" from Fan Ni provides folio
   conversions in the hugetlb code.
 -----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQTTMBEPP41GrTpTJgfdBJ7gKXxAjgUCaDt5qgAKCRDdBJ7gKXxA
 ju6XAP9nTiSfRz8Cz1n5LJZpFKEGzLpSihCYyR6P3o1L9oe3mwEAlZ5+XAwk2I5x
 Qqb/UGMEpilyre1PayQqOnct3aSL9Ao=
 =tYYm
 -----END PGP SIGNATURE-----

Merge tag 'mm-stable-2025-05-31-14-50' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Pull MM updates from Andrew Morton:

 - "Add folio_mk_pte()" from Matthew Wilcox simplifies the act of
   creating a pte which addresses the first page in a folio and reduces
   the amount of plumbing which architecture must implement to provide
   this.

 - "Misc folio patches for 6.16" from Matthew Wilcox is a shower of
   largely unrelated folio infrastructure changes which clean things up
   and better prepare us for future work.

 - "memory,x86,acpi: hotplug memory alignment advisement" from Gregory
   Price adds early-init code to prevent x86 from leaving physical
   memory unused when physical address regions are not aligned to memory
   block size.

 - "mm/compaction: allow more aggressive proactive compaction" from
   Michal Clapinski provides some tuning of the (sadly, hard-coded (more
   sadly, not auto-tuned)) thresholds for our invocation of proactive
   compaction. In a simple test case, the reduction of a guest VM's
   memory consumption was dramatic.

 - "Minor cleanups and improvements to swap freeing code" from Kemeng
   Shi provides some code cleanups and a small efficiency improvement to
   this part of our swap handling code.

 - "ptrace: introduce PTRACE_SET_SYSCALL_INFO API" from Dmitry Levin
   adds the ability for a ptracer to modify syscalls arguments. At this
   time we can alter only "system call information that are used by
   strace system call tampering, namely, syscall number, syscall
   arguments, and syscall return value".

   This series should have been incorporated into mm.git's "non-MM"
   branch, but I goofed.

 - "fs/proc: extend the PAGEMAP_SCAN ioctl to report guard regions" from
   Andrei Vagin extends the info returned by the PAGEMAP_SCAN ioctl
   against /proc/pid/pagemap. This permits CRIU to more efficiently get
   at the info about guard regions.

 - "Fix parameter passed to page_mapcount_is_type()" from Gavin Shan
   implements that fix. No runtime effect is expected because
   validate_page_before_insert() happens to fix up this error.

 - "kernel/events/uprobes: uprobe_write_opcode() rewrite" from David
   Hildenbrand basically brings uprobe text poking into the current
   decade. Remove a bunch of hand-rolled implementation in favor of
   using more current facilities.

 - "mm/ptdump: Drop assumption that pxd_val() is u64" from Anshuman
   Khandual provides enhancements and generalizations to the pte dumping
   code. This might be needed when 128-bit Page Table Descriptors are
   enabled for ARM.

 - "Always call constructor for kernel page tables" from Kevin Brodsky
   ensures that the ctor/dtor is always called for kernel pgtables, as
   it already is for user pgtables.

   This permits the addition of more functionality such as "insert hooks
   to protect page tables". This change does result in various
   architectures performing unnecessary work, but this is fixed up where
   it is anticipated to occur.

 - "Rust support for mm_struct, vm_area_struct, and mmap" from Alice
   Ryhl adds plumbing to permit Rust access to core MM structures.

 - "fix incorrectly disallowed anonymous VMA merges" from Lorenzo
   Stoakes takes advantage of some VMA merging opportunities which we've
   been missing for 15 years.

 - "mm/madvise: batch tlb flushes for MADV_DONTNEED and MADV_FREE" from
   SeongJae Park optimizes process_madvise()'s TLB flushing.

   Instead of flushing each address range in the provided iovec, we
   batch the flushing across all the iovec entries. The syscall's cost
   was approximately halved with a microbenchmark which was designed to
   load this particular operation.

 - "Track node vacancy to reduce worst case allocation counts" from
   Sidhartha Kumar makes the maple tree smarter about its node
   preallocation.

   stress-ng mmap performance increased by single-digit percentages and
   the amount of unnecessarily preallocated memory was dramatically
   reduced.

 - "mm/gup: Minor fix, cleanup and improvements" from Baoquan He removes
   a few unnecessary things which Baoquan noted when reading the code.

 - "Enhance sysfs handling for memory hotplug in weighted interleave"
   from Rakie Kim "enhances the weighted interleave policy in the memory
   management subsystem by improving sysfs handling, fixing memory
   leaks, and introducing dynamic sysfs updates for memory hotplug
   support". Fixes things on error paths which we are unlikely to hit.

 - "mm/damon: auto-tune DAMOS for NUMA setups including tiered memory"
   from SeongJae Park introduces new DAMOS quota goal metrics which
   eliminate the manual tuning which is required when utilizing DAMON
   for memory tiering.

 - "mm/vmalloc.c: code cleanup and improvements" from Baoquan He
   provides cleanups and small efficiency improvements which Baoquan
   found via code inspection.

 - "vmscan: enforce mems_effective during demotion" from Gregory Price
   changes reclaim to respect cpuset.mems_effective during demotion when
   possible, because presently, reclaim explicitly ignores
   cpuset.mems_effective when demoting, which may cause the cpuset
   settings to be violated.

   This is useful for isolating workloads on a multi-tenant system from
   certain classes of memory more consistently.

 - "Clean up split_huge_pmd_locked() and remove unnecessary folio
   pointers" from Gavin Guo provides minor cleanups and efficiency gains
   in the huge page splitting and migrating code.

 - "Use kmem_cache for memcg alloc" from Huan Yang creates a slab cache
   for `struct mem_cgroup', yielding improved memory utilization.

 - "add max arg to swappiness in memory.reclaim and lru_gen" from
   Zhongkun He adds a new "max" argument to the "swappiness=" argument
   for memory.reclaim and MGLRU's lru_gen.

   This directs proactive reclaim to reclaim from only anon folios
   rather than file-backed folios.

 - "kexec: introduce Kexec HandOver (KHO)" from Mike Rapoport is the
   first step on the path to permitting the kernel to maintain existing
   VMs while replacing the host kernel via file-based kexec. At this
   time only memblock's reserve_mem is preserved.

 - "mm: Introduce for_each_valid_pfn()" from David Woodhouse provides
   and uses a smarter way of looping over a pfn range, by skipping
   ranges of invalid pfns.

 - "sched/numa: Skip VMA scanning on memory pinned to one NUMA node via
   cpuset.mems" from Libo Chen removes a lot of pointless VMA scanning
   when a task is pinned to a single NUMA node.

   Dramatic performance benefits were seen in some real world cases.

 - "JFS: Implement migrate_folio for jfs_metapage_aops" from Shivank
   Garg addresses a warning which occurs during memory compaction when
   using JFS.

 - "move all VMA allocation, freeing and duplication logic to mm" from
   Lorenzo Stoakes moves some VMA code from kernel/fork.c into the more
   appropriate mm/vma.c.

 - "mm, swap: clean up swap cache mapping helper" from Kairui Song
   provides code consolidation and cleanups related to the folio_index()
   function.

 - "mm/gup: Cleanup memfd_pin_folios()" from Vishal Moola does that.

 - "memcg: Fix test_memcg_min/low test failures" from Waiman Long
   addresses some bogus failures which are being reported by the
   test_memcontrol selftest.

 - "eliminate mmap() retry merge, add .mmap_prepare hook" from Lorenzo
   Stoakes commences the deprecation of file_operations.mmap() in favor
   of the new file_operations.mmap_prepare().

   The latter is more restrictive and prevents drivers from messing with
   things in ways which, amongst other problems, may defeat VMA merging.

 - "memcg: decouple memcg and objcg stocks" from Shakeel Butt decouples
   the per-cpu memcg charge cache from the objcg's one.

   This is a step along the way to making memcg and objcg charging
   NMI-safe, which is a BPF requirement.

 - "mm/damon: minor fixups and improvements for code, tests, and
   documents" from SeongJae Park is yet another batch of miscellaneous
   DAMON changes. Fix and improve minor problems in code, tests and
   documents.

 - "memcg: make memcg stats irq safe" from Shakeel Butt converts memcg
   stats to be irq safe. Another step along the way to making memcg
   charging and stats updates NMI-safe, a BPF requirement.

 - "Let unmap_hugepage_range() and several related functions take folio
   instead of page" from Fan Ni provides folio conversions in the
   hugetlb code.

* tag 'mm-stable-2025-05-31-14-50' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (285 commits)
  mm: pcp: increase pcp->free_count threshold to trigger free_high
  mm/hugetlb: convert use of struct page to folio in __unmap_hugepage_range()
  mm/hugetlb: refactor __unmap_hugepage_range() to take folio instead of page
  mm/hugetlb: refactor unmap_hugepage_range() to take folio instead of page
  mm/hugetlb: pass folio instead of page to unmap_ref_private()
  memcg: objcg stock trylock without irq disabling
  memcg: no stock lock for cpu hot-unplug
  memcg: make __mod_memcg_lruvec_state re-entrant safe against irqs
  memcg: make count_memcg_events re-entrant safe against irqs
  memcg: make mod_memcg_state re-entrant safe against irqs
  memcg: move preempt disable to callers of memcg_rstat_updated
  memcg: memcg_rstat_updated re-entrant safe against irqs
  mm: khugepaged: decouple SHMEM and file folios' collapse
  selftests/eventfd: correct test name and improve messages
  alloc_tag: check mem_profiling_support in alloc_tag_init
  Docs/damon: update titles and brief introductions to explain DAMOS
  selftests/damon/_damon_sysfs: read tried regions directories in order
  mm/damon/tests/core-kunit: add a test for damos_set_filters_default_reject()
  mm/damon/paddr: remove unused variable, folio_list, in damon_pa_stat()
  mm/damon/sysfs-schemes: fix wrong comment on damons_sysfs_quota_goal_metric_strs
  ...
2025-05-31 15:44:16 -07:00

1314 lines
33 KiB
C

// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 1995 Linus Torvalds
*
* This file contains the setup_arch() code, which handles the architecture-dependent
* parts of early kernel initialization.
*/
#include <linux/acpi.h>
#include <linux/console.h>
#include <linux/cpu.h>
#include <linux/crash_dump.h>
#include <linux/dma-map-ops.h>
#include <linux/efi.h>
#include <linux/hugetlb.h>
#include <linux/ima.h>
#include <linux/init_ohci1394_dma.h>
#include <linux/initrd.h>
#include <linux/iscsi_ibft.h>
#include <linux/memblock.h>
#include <linux/panic_notifier.h>
#include <linux/pci.h>
#include <linux/random.h>
#include <linux/root_dev.h>
#include <linux/static_call.h>
#include <linux/swiotlb.h>
#include <linux/tboot.h>
#include <linux/usb/xhci-dbgp.h>
#include <linux/vmalloc.h>
#include <uapi/linux/mount.h>
#include <xen/xen.h>
#include <asm/apic.h>
#include <asm/bios_ebda.h>
#include <asm/bugs.h>
#include <asm/cacheinfo.h>
#include <asm/coco.h>
#include <asm/cpu.h>
#include <asm/efi.h>
#include <asm/gart.h>
#include <asm/hypervisor.h>
#include <asm/io_apic.h>
#include <asm/kasan.h>
#include <asm/kaslr.h>
#include <asm/mce.h>
#include <asm/memtype.h>
#include <asm/mtrr.h>
#include <asm/nmi.h>
#include <asm/numa.h>
#include <asm/olpc_ofw.h>
#include <asm/pci-direct.h>
#include <asm/prom.h>
#include <asm/proto.h>
#include <asm/realmode.h>
#include <asm/thermal.h>
#include <asm/unwind.h>
#include <asm/vsyscall.h>
/*
 * max_low_pfn_mapped: highest directly mapped pfn < 4 GB
 * max_pfn_mapped: highest directly mapped pfn > 4 GB
 *
 * The direct mapping only covers E820_TYPE_RAM regions, so the ranges and gaps are
 * represented by pfn_mapped[].
 *
 * Both values are page frame numbers, not byte addresses: relocate_initrd()
 * converts max_pfn_mapped with PFN_PHYS() before using it as the upper bound
 * of its memblock allocation.
 */
unsigned long max_low_pfn_mapped;
unsigned long max_pfn_mapped;
/*
 * NOTE(review): presumably sets aside 65536 bytes of brk space for the DMI
 * allocator when CONFIG_DMI is enabled - RESERVE_BRK() is defined outside
 * this file, confirm against its definition.
 */
#ifdef CONFIG_DMI
RESERVE_BRK(dmi_alloc, 65536);
#endif
/*
 * Bounds of the early "brk" allocation area.  Both start at __brk_base;
 * extend_brk() advances _brk_end as chunks are handed out, and reserve_brk()
 * sets _brk_start to 0 to mark the area as closed to further allocations.
 */
unsigned long _brk_start = (unsigned long)__brk_base;
unsigned long _brk_end = (unsigned long)__brk_base;
/* Boot parameter block; read throughout this file (hdr, eddbuf, screen_info, ...). */
struct boot_params boot_params;
/*
 * These are the four main kernel memory regions.  We put them into
 * the resource tree so that kdump tools and other debugging tools
 * can recover them.
 *
 * All four are flagged as busy system RAM.  start/end are zero in these
 * initializers; the code that fills them in is not visible in this part
 * of the file.
 */
static struct resource rodata_resource = {
.name = "Kernel rodata",
.start = 0,
.end = 0,
.flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
};
static struct resource data_resource = {
.name = "Kernel data",
.start = 0,
.end = 0,
.flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
};
static struct resource code_resource = {
.name = "Kernel code",
.start = 0,
.end = 0,
.flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
};
static struct resource bss_resource = {
.name = "Kernel bss",
.start = 0,
.end = 0,
.flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
};
/*
 * 32-bit only globals.  apm_info.bios and ist_info are filled in from
 * boot_params by parse_boot_params().
 *
 * ist_info is defined in both arms of the SpeedStep #if below; the only
 * difference is that it is exported to modules when the SpeedStep SMI
 * driver is configured (built-in or modular).
 */
#ifdef CONFIG_X86_32
/* CPU data as detected by the assembly code in head_32.S */
struct cpuinfo_x86 new_cpu_data;
struct apm_info apm_info;
EXPORT_SYMBOL(apm_info);
#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \
defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
struct ist_info ist_info;
EXPORT_SYMBOL(ist_info);
#else
struct ist_info ist_info;
#endif
#endif
/* CPU description of the boot processor; exported to modules. */
struct cpuinfo_x86 boot_cpu_data __read_mostly;
EXPORT_SYMBOL(boot_cpu_data);
SYM_PIC_ALIAS(boot_cpu_data);
/*
 * CR4 feature bits tracked for the MMU.  Starts out as zero, except on
 * 32-bit PAE builds (CONFIG_X86_PAE without CONFIG_X86_64) where
 * X86_CR4_PAE is set from the start.
 */
#if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
__visible unsigned long mmu_cr4_features __ro_after_init;
#else
__visible unsigned long mmu_cr4_features __ro_after_init = X86_CR4_PAE;
#endif
/*
 * Physical address and size of the IMA buffer passed across kexec.
 * Recorded by add_early_ima_buffer(), read by ima_get_kexec_buffer() and
 * cleared by ima_free_kexec_buffer().  A size of 0 means "no buffer".
 */
#ifdef CONFIG_IMA
static phys_addr_t ima_kexec_buffer_phys;
static size_t ima_kexec_buffer_size;
#endif
/*
 * Boot loader ID and version as integers, for the benefit of proc_dointvec.
 * Both are computed in parse_boot_params() and exposed read-only through
 * x86_sysctl_table[].
 */
int bootloader_type, bootloader_version;
/*
 * x86-specific sysctl entries, registered under the "kernel" sysctl
 * directory by init_x86_sysctl().
 *
 * The NMI policy knobs and io_delay_type are writable (mode 0644); the
 * boot loader type/version entries are read-only (mode 0444).  All entries
 * are plain ints handled by proc_dointvec, except acpi_video_flags, which
 * is an unsigned long handled by proc_doulongvec_minmax and only present
 * when CONFIG_ACPI_SLEEP is set.
 */
static const struct ctl_table x86_sysctl_table[] = {
{
.procname = "unknown_nmi_panic",
.data = &unknown_nmi_panic,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "panic_on_unrecovered_nmi",
.data = &panic_on_unrecovered_nmi,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "panic_on_io_nmi",
.data = &panic_on_io_nmi,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "bootloader_type",
.data = &bootloader_type,
.maxlen = sizeof(int),
.mode = 0444,
.proc_handler = proc_dointvec,
},
{
.procname = "bootloader_version",
.data = &bootloader_version,
.maxlen = sizeof(int),
.mode = 0444,
.proc_handler = proc_dointvec,
},
{
.procname = "io_delay_type",
.data = &io_delay_type,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
#if defined(CONFIG_ACPI_SLEEP)
{
.procname = "acpi_video_flags",
.data = &acpi_realmode_flags,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
},
#endif
};
/*
 * Register x86_sysctl_table[] under the "kernel" sysctl directory.
 * Runs as an arch_initcall and always returns 0.
 */
static int __init init_x86_sysctl(void)
{
register_sysctl_init("kernel", x86_sysctl_table);
return 0;
}
arch_initcall(init_x86_sysctl);
/*
 * Setup options
 *
 * screen_info, edid_info and saved_video_mode are copied out of boot_params
 * by parse_boot_params().
 */
struct screen_info screen_info;
EXPORT_SYMBOL(screen_info);
struct edid_info edid_info;
EXPORT_SYMBOL_GPL(edid_info);
extern int root_mountflags;
unsigned long saved_video_mode;
/*
 * Bit fields of boot_params.hdr.ram_size.  parse_boot_params() applies
 * RAMDISK_IMAGE_START_MASK to obtain rd_image_start; the two flag bits are
 * not used in the part of the file visible here.
 */
#define RAMDISK_IMAGE_START_MASK 0x07FF
#define RAMDISK_PROMPT_FLAG 0x8000
#define RAMDISK_LOAD_FLAG 0x4000
/* Init-time scratch copy of the kernel command line. */
static char __initdata command_line[COMMAND_LINE_SIZE];
#ifdef CONFIG_CMDLINE_BOOL
/* Command line compiled into the kernel via CONFIG_CMDLINE. */
char builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE;
bool builtin_cmdline_added __ro_after_init;
#endif
#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
struct edd edd;
#ifdef CONFIG_EDD_MODULE
EXPORT_SYMBOL(edd);
#endif

/**
 * copy_edd() - Snapshot the BIOS EDD data handed over in boot_params.
 *
 * Copies the MBR signature buffer and the EDD info records, together with
 * their entry counts, out of boot_params into the file-scope 'edd' object.
 */
static inline void __init copy_edd(void)
{
	/* Entry counts first, then the two fixed-size arrays. */
	edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
	edd.edd_info_nr = boot_params.eddbuf_entries;

	memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
	memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
	       sizeof(edd.mbr_signature));
}
#else
/* EDD support is compiled out: there is nothing to copy. */
static inline void __init copy_edd(void)
{
}
#endif
/*
 * Carve a zeroed chunk out of the early brk area.
 *
 * @size:  number of bytes to hand out
 * @align: required alignment of the returned chunk; the BUG_ON below
 *         rejects values with more than one bit set
 *
 * Rounds _brk_end up to @align, advances it by @size and returns a pointer
 * to the zero-filled chunk.  BUGs if the brk area has already been locked
 * by reserve_brk() (_brk_start == 0) or if the request would run past
 * __brk_limit.
 */
void * __init extend_brk(size_t size, size_t align)
{
	const size_t align_mask = align - 1;
	void *chunk;

	BUG_ON(!_brk_start);
	BUG_ON(align & align_mask);

	/* Round the current end up to the requested alignment. */
	_brk_end = (_brk_end + align_mask) & ~align_mask;
	BUG_ON((char *)(_brk_end + size) > __brk_limit);

	chunk = (void *)_brk_end;
	_brk_end += size;

	/* memset() hands back its destination, i.e. the new chunk. */
	return memset(chunk, 0, size);
}
#ifdef CONFIG_X86_32
/*
 * Empty stub used on CONFIG_X86_32 builds.
 * NOTE(review): a non-trivial variant for other configurations is presumably
 * defined elsewhere - it is not visible in this part of the file.
 */
static void __init cleanup_highmap(void)
{
}
#endif
/*
 * Reserve whatever part of the brk area has been handed out so far and
 * close the area: once _brk_start is 0, extend_brk() will BUG on any
 * further allocation attempt.
 */
static void __init reserve_brk(void)
{
	const bool brk_in_use = _brk_end > _brk_start;

	if (brk_in_use) {
		memblock_reserve_kern(__pa_symbol(_brk_start),
				      _brk_end - _brk_start);
	}

	/* Lock the brk area down; no new allocations from here on. */
	_brk_start = 0;
}
#ifdef CONFIG_BLK_DEV_INITRD
/*
 * Physical address of the initrd image.
 *
 * Combines the low 32 bits from the setup header with the high 32 bits
 * from boot_params.ext_ramdisk_image; if the result is zero, falls back to
 * phys_initrd_start.
 */
static u64 __init get_ramdisk_image(void)
{
	const u64 low = boot_params.hdr.ramdisk_image;
	const u64 high = boot_params.ext_ramdisk_image;
	const u64 image = (high << 32) | low;

	return image ? image : phys_initrd_start;
}
/*
 * Size of the initrd image in bytes.
 *
 * Combines the low 32 bits from the setup header with the high 32 bits
 * from boot_params.ext_ramdisk_size; if the result is zero, falls back to
 * phys_initrd_size.
 */
static u64 __init get_ramdisk_size(void)
{
	const u64 low = boot_params.hdr.ramdisk_size;
	const u64 high = boot_params.ext_ramdisk_size;
	const u64 size = (high << 32) | low;

	return size ? size : phys_initrd_size;
}
/*
 * Copy the initrd into directly mapped memory.
 *
 * Allocates a page-aligned area below PFN_PHYS(max_pfn_mapped), points
 * initrd_start/initrd_end at its direct-map address and copies the image
 * there with copy_from_early_mem().  Panics if no area can be found or if
 * the copy fails.  The caller (reserve_initrd()) frees the original
 * location afterwards.
 */
static void __init relocate_initrd(void)
{
	/* Assume only end is not page aligned */
	u64 ramdisk_image = get_ramdisk_image();
	u64 ramdisk_size = get_ramdisk_size();
	u64 area_size = PAGE_ALIGN(ramdisk_size);
	int ret;

	/* We need to move the initrd down into directly mapped mem */
	u64 relocated_ramdisk = memblock_phys_alloc_range(area_size, PAGE_SIZE, 0,
							  PFN_PHYS(max_pfn_mapped));
	if (!relocated_ramdisk)
		/* ramdisk_size is an unsigned 64-bit value: print it with %llu. */
		panic("Cannot find place for new RAMDISK of size %llu\n",
		      ramdisk_size);

	initrd_start = relocated_ramdisk + PAGE_OFFSET;
	initrd_end = initrd_start + ramdisk_size;
	printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n",
	       relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1);

	ret = copy_from_early_mem((void *)initrd_start, ramdisk_image, ramdisk_size);
	if (ret)
		panic("Copy RAMDISK failed\n");

	printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to [mem %#010llx-%#010llx]\n",
	       ramdisk_image, ramdisk_image + ramdisk_size - 1,
	       relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1);
}
/*
 * Reserve the memory occupied by the boot-loader-provided initrd.
 *
 * Does nothing when the boot loader type is 0 or when no initrd address or
 * size is available.  Only the end of the image is assumed to be possibly
 * unaligned, so just the end is rounded up to a page boundary.
 */
static void __init early_reserve_initrd(void)
{
	u64 image = get_ramdisk_image();
	u64 size = get_ramdisk_size();
	u64 end;

	if (!boot_params.hdr.type_of_loader)
		return;		/* No initrd provided by bootloader */
	if (!image || !size)
		return;		/* No initrd provided by bootloader */

	end = PAGE_ALIGN(image + size);
	memblock_reserve_kern(image, end - image);
}
/*
 * Make the initrd reachable through the direct mapping.
 *
 * If the whole image is already covered by the direct mapping,
 * initrd_start/initrd_end are simply pointed at it.  Otherwise the image is
 * copied into mapped memory by relocate_initrd() and its original location
 * is released.  Does nothing when no initrd was provided.
 */
static void __init reserve_initrd(void)
{
	/* Assume only end is not page aligned */
	u64 image = get_ramdisk_image();
	u64 size = get_ramdisk_size();
	u64 end = PAGE_ALIGN(image + size);

	if (!boot_params.hdr.type_of_loader || !image || !size)
		return;		/* No initrd provided by bootloader */

	initrd_start = 0;

	printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", image,
	       end - 1);

	if (!pfn_range_is_mapped(PFN_DOWN(image), PFN_DOWN(end))) {
		/* Not fully mapped: move it, then give the old range back. */
		relocate_initrd();
		memblock_phys_free(image, end - image);
		return;
	}

	/* All are mapped, easy case */
	initrd_start = image + PAGE_OFFSET;
	initrd_end = initrd_start + size;
}
#else
/* CONFIG_BLK_DEV_INITRD is off: there is no initrd to reserve. */
static void __init early_reserve_initrd(void)
{
}
/* CONFIG_BLK_DEV_INITRD is off: there is no initrd to map or relocate. */
static void __init reserve_initrd(void)
{
}
#endif /* CONFIG_BLK_DEV_INITRD */
/*
 * Handle a SETUP_IMA setup_data entry located at @phys_addr.
 *
 * With CONFIG_IMA, maps the ima_setup_data payload that follows the
 * setup_data header and, if it describes a non-empty buffer, reserves that
 * buffer and records its address and size for ima_get_kexec_buffer().
 * Without CONFIG_IMA the entry is ignored with a warning.
 */
static void __init add_early_ima_buffer(u64 phys_addr)
{
#ifndef CONFIG_IMA
	pr_warn("Passed IMA kexec data, but CONFIG_IMA not set. Ignoring.\n");
#else
	struct ima_setup_data *ima;

	/* The payload sits right behind the generic setup_data header. */
	ima = early_memremap(phys_addr + sizeof(struct setup_data), sizeof(*ima));
	if (!ima) {
		pr_warn("setup: failed to memremap ima_setup_data entry\n");
		return;
	}

	if (ima->size != 0) {
		memblock_reserve_kern(ima->addr, ima->size);
		ima_kexec_buffer_phys = ima->addr;
		ima_kexec_buffer_size = ima->size;
	}

	early_memunmap(ima, sizeof(*ima));
#endif
}
#if defined(CONFIG_HAVE_IMA_KEXEC) && !defined(CONFIG_OF_FLATTREE)
/*
 * Release the IMA kexec buffer recorded by add_early_ima_buffer().
 *
 * Returns 0 after freeing the buffer and clearing the recorded address and
 * size, or -ENOENT if no buffer is recorded.
 */
int __init ima_free_kexec_buffer(void)
{
	if (ima_kexec_buffer_size == 0)
		return -ENOENT;

	memblock_free_late(ima_kexec_buffer_phys, ima_kexec_buffer_size);

	/* Forget the buffer so that later lookups report -ENOENT. */
	ima_kexec_buffer_size = 0;
	ima_kexec_buffer_phys = 0;

	return 0;
}
/*
 * Look up the IMA kexec buffer recorded by add_early_ima_buffer().
 *
 * @addr: receives the virtual address (__va()) of the buffer
 * @size: receives the buffer size in bytes
 *
 * Returns 0 on success, or -ENOENT (leaving @addr and @size untouched) if
 * no buffer is recorded.
 */
int __init ima_get_kexec_buffer(void **addr, size_t *size)
{
	if (ima_kexec_buffer_size == 0)
		return -ENOENT;

	*size = ima_kexec_buffer_size;
	*addr = __va(ima_kexec_buffer_phys);

	return 0;
}
#endif
/*
 * Handle a SETUP_KEXEC_KHO setup_data entry.
 *
 * @phys_addr: physical address of the setup_data entry (header included)
 * @data_len:  total entry length, header included
 *
 * Maps the payload behind the setup_data header as a struct kho_data and
 * passes its FDT and scratch descriptors to kho_populate().  The entry is
 * ignored with a warning when CONFIG_KEXEC_HANDOVER is off or when the
 * payload cannot be mapped.
 */
static void __init add_kho(u64 phys_addr, u32 data_len)
{
	u64 payload_pa = phys_addr + sizeof(struct setup_data);
	u64 payload_len = data_len - sizeof(struct setup_data);
	struct kho_data *kho;

	if (!IS_ENABLED(CONFIG_KEXEC_HANDOVER)) {
		pr_warn("Passed KHO data, but CONFIG_KEXEC_HANDOVER not set. Ignoring.\n");
		return;
	}

	kho = early_memremap(payload_pa, payload_len);
	if (!kho) {
		pr_warn("setup: failed to memremap kho data (0x%llx, 0x%llx)\n",
			payload_pa, payload_len);
		return;
	}

	kho_populate(kho->fdt_addr, kho->fdt_size,
		     kho->scratch_addr, kho->scratch_size);

	early_memunmap(kho, payload_len);
}
/*
 * Walk the setup_data list that starts at boot_params.hdr.setup_data and
 * dispatch every entry to the handler for its type.
 *
 * Each entry's header is mapped only long enough to read its type, length
 * and next link; handlers map whatever payload they need themselves.
 * Unknown types are skipped.  If a header cannot be mapped the walk stops,
 * because the link to the next entry is unknown.
 */
static void __init parse_setup_data(void)
{
	struct setup_data *data;
	u64 pa_data, pa_next;

	pa_data = boot_params.hdr.setup_data;
	while (pa_data) {
		u32 data_len, data_type;

		data = early_memremap(pa_data, sizeof(*data));
		if (!data) {
			/* early_memremap() can fail; do not dereference NULL. */
			pr_warn("setup: failed to memremap setup_data entry\n");
			return;
		}
		/* data_len covers the header plus the payload. */
		data_len = data->len + sizeof(struct setup_data);
		data_type = data->type;
		pa_next = data->next;
		early_memunmap(data, sizeof(*data));

		switch (data_type) {
		case SETUP_E820_EXT:
			e820__memory_setup_extended(pa_data, data_len);
			break;
		case SETUP_DTB:
			add_dtb(pa_data);
			break;
		case SETUP_EFI:
			parse_efi_setup(pa_data, data_len);
			break;
		case SETUP_IMA:
			add_early_ima_buffer(pa_data);
			break;
		case SETUP_KEXEC_KHO:
			add_kho(pa_data, data_len);
			break;
		case SETUP_RNG_SEED:
			data = early_memremap(pa_data, data_len);
			if (!data) {
				/* Skip the seed but keep walking the list. */
				pr_warn("setup: failed to memremap RNG seed setup_data\n");
				break;
			}
			add_bootloader_randomness(data->data, data->len);
			/* Zero seed for forward secrecy. */
			memzero_explicit(data->data, data->len);
			/* Zero length in case we find ourselves back here by accident. */
			memzero_explicit(&data->len, sizeof(data->len));
			early_memunmap(data, data_len);
			break;
		default:
			break;
		}
		pa_data = pa_next;
	}
}
/*
 * Copy the fields of 'struct boot_params' that the rest of the kernel
 * consumes into their global variables: root device, screen/EDID info,
 * APM/IST info (32-bit only), saved video mode, boot loader type and
 * version, ramdisk image start, EFI boot flags and the root mount flags.
 */
static void __init parse_boot_params(void)
{
	int loader_type = boot_params.hdr.type_of_loader;

	ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
	screen_info = boot_params.screen_info;
	edid_info = boot_params.edid_info;
#ifdef CONFIG_X86_32
	apm_info.bios = boot_params.apm_bios_info;
	ist_info = boot_params.ist_info;
#endif
	saved_video_mode = boot_params.hdr.vid_mode;

	/*
	 * A loader ID of 0xe in the high nibble means the real ID is the
	 * extended loader type plus 0x10; the low nibble is kept as is.
	 */
	if ((loader_type >> 4) == 0xe) {
		loader_type = (loader_type & 0xf) |
			      ((boot_params.hdr.ext_loader_type + 0x10) << 4);
	}
	bootloader_type = loader_type;

	/* Version: low nibble of the type, extended version bits above it. */
	bootloader_version = (loader_type & 0xf) |
			     (boot_params.hdr.ext_loader_ver << 4);

#ifdef CONFIG_BLK_DEV_RAM
	rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
#endif
#ifdef CONFIG_EFI
	{
		const char *loader_sig =
			(char *)&boot_params.efi_info.efi_loader_signature;

		if (!strncmp(loader_sig, EFI32_LOADER_SIGNATURE, 4)) {
			set_bit(EFI_BOOT, &efi.flags);
		} else if (!strncmp(loader_sig, EFI64_LOADER_SIGNATURE, 4)) {
			set_bit(EFI_BOOT, &efi.flags);
			set_bit(EFI_64BIT, &efi.flags);
		}
	}
#endif
	/* root_flags == 0 asks for the root to be mounted without MS_RDONLY. */
	if (boot_params.hdr.root_flags == 0)
		root_mountflags &= ~MS_RDONLY;
}
/*
 * Reserve every entry of the setup_data list (header plus payload) in
 * memblock. For a SETUP_INDIRECT entry the region described by its
 * struct setup_indirect payload is reserved as well, unless that payload
 * is itself marked SETUP_INDIRECT.
 *
 * The walk stops with a warning as soon as a mapping attempt fails.
 */
static void __init memblock_x86_reserve_range_setup_data(void)
{
	struct setup_indirect *ind;
	struct setup_data *entry;
	u64 pa_data, pa_next;
	u32 mapped;

	for (pa_data = boot_params.hdr.setup_data; pa_data; pa_data = pa_next) {
		entry = early_memremap(pa_data, sizeof(*entry));
		if (!entry) {
			pr_warn("setup: failed to memremap setup_data entry\n");
			return;
		}

		/* Track how much is currently mapped so the unmap matches. */
		mapped = sizeof(*entry);
		pa_next = entry->next;

		memblock_reserve_kern(pa_data, sizeof(*entry) + entry->len);

		if (entry->type == SETUP_INDIRECT) {
			/* Remap with the payload included to read the descriptor. */
			mapped += entry->len;
			early_memunmap(entry, sizeof(*entry));
			entry = early_memremap(pa_data, mapped);
			if (!entry) {
				pr_warn("setup: failed to memremap indirect setup_data\n");
				return;
			}

			ind = (struct setup_indirect *)entry->data;
			if (ind->type != SETUP_INDIRECT)
				memblock_reserve_kern(ind->addr, ind->len);
		}

		early_memunmap(entry, mapped);
	}
}
/*
 * Parse the crashkernel settings from the boot command line and, unless
 * running as a Xen PV domain, pass them to the generic crash kernel
 * reservation code. Does nothing when CONFIG_CRASH_RESERVE is disabled or
 * when parse_crashkernel() returns non-zero.
 */
static void __init arch_reserve_crashkernel(void)
{
	unsigned long long base, size, low = 0;
	bool want_high = false;

	if (!IS_ENABLED(CONFIG_CRASH_RESERVE))
		return;

	if (parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
			      &size, &base, &low, &want_high) != 0)
		return;

	if (xen_pv_domain()) {
		pr_info("Ignoring crashkernel for a Xen PV domain\n");
		return;
	}

	reserve_crashkernel_generic(size, base, low, want_high);
}
/*
 * Fixed I/O port ranges that reserve_standard_io_resources() requests from
 * ioport_resource. Every entry is flagged IORESOURCE_BUSY | IORESOURCE_IO.
 */
#define STD_IO_RESOURCE(_name, _first, _last)			\
	{							\
		.name	= (_name),				\
		.start	= (_first),				\
		.end	= (_last),				\
		.flags	= IORESOURCE_BUSY | IORESOURCE_IO	\
	}

static struct resource standard_io_resources[] = {
	STD_IO_RESOURCE("dma1",		0x00, 0x1f),
	STD_IO_RESOURCE("pic1",		0x20, 0x21),
	STD_IO_RESOURCE("timer0",	0x40, 0x43),
	STD_IO_RESOURCE("timer1",	0x50, 0x53),
	STD_IO_RESOURCE("keyboard",	0x60, 0x60),
	STD_IO_RESOURCE("keyboard",	0x64, 0x64),
	STD_IO_RESOURCE("dma page reg",	0x80, 0x8f),
	STD_IO_RESOURCE("pic2",		0xa0, 0xa1),
	STD_IO_RESOURCE("dma2",		0xc0, 0xdf),
	STD_IO_RESOURCE("fpu",		0xf0, 0xff)
};

#undef STD_IO_RESOURCE
/*
 * Request every range listed in standard_io_resources[] from
 * ioport_resource. The result of each request is not checked.
 */
void __init reserve_standard_io_resources(void)
{
	struct resource *res = standard_io_resources;
	struct resource *const end = res + ARRAY_SIZE(standard_io_resources);

	/* request I/O space for devices used on all i[345]86 PCs */
	while (res < end) {
		request_resource(&ioport_resource, res);
		res++;
	}
}
/*
 * Fill in the physical start/end of the kernel code, rodata, data and bss
 * resources from the linker symbols and insert them into iomem_resource.
 * Each end address is inclusive, hence the "- 1".
 */
static void __init setup_kernel_resources(void)
{
	/* Kernel text: _text .. _etext */
	code_resource.start = __pa_symbol(_text);
	code_resource.end = __pa_symbol(_etext) - 1;
	insert_resource(&iomem_resource, &code_resource);

	/* Read-only data: __start_rodata .. __end_rodata */
	rodata_resource.start = __pa_symbol(__start_rodata);
	rodata_resource.end = __pa_symbol(__end_rodata) - 1;
	insert_resource(&iomem_resource, &rodata_resource);

	/* Initialized data: _sdata .. _edata */
	data_resource.start = __pa_symbol(_sdata);
	data_resource.end = __pa_symbol(_edata) - 1;
	insert_resource(&iomem_resource, &data_resource);

	/* BSS: __bss_start .. __bss_stop */
	bss_resource.start = __pa_symbol(__bss_start);
	bss_resource.end = __pa_symbol(__bss_stop) - 1;
	insert_resource(&iomem_resource, &bss_resource);
}
/*
 * Tell whether the Sandy Bridge graphics workaround applies.
 *
 * Returns true only when CONFIG_PCI is enabled, early PCI access is
 * allowed, the PCI config space read at (0, 2, 0) reports vendor ID 0x8086
 * and its device ID is one of snb_ids[]. Returns false otherwise.
 */
static bool __init snb_gfx_workaround_needed(void)
{
#ifdef CONFIG_PCI
	static const __initconst u16 snb_ids[] = {
		0x0102, 0x0112, 0x0122,
		0x0106, 0x0116, 0x0126,
		0x010a,
	};
	unsigned int idx;
	u16 devid;

	/* Assume no if something weird is going on with PCI */
	if (!early_pci_allowed())
		return false;

	if (read_pci_config_16(0, 2, 0, PCI_VENDOR_ID) != 0x8086)
		return false;

	devid = read_pci_config_16(0, 2, 0, PCI_DEVICE_ID);
	for (idx = 0; idx < ARRAY_SIZE(snb_ids); idx++) {
		if (snb_ids[idx] == devid)
			return true;
	}
#endif
	return false;
}
/*
 * Sandy Bridge graphics cannot access certain memory ranges; keep those
 * ranges out of the allocator when the workaround is needed.
 */
static void __init trim_snb_memory(void)
{
	static const __initconst unsigned long bad_pages[] = {
		0x20050000,
		0x20110000,
		0x20130000,
		0x20138000,
		0x40004000,
	};
	unsigned int idx;

	if (!snb_gfx_workaround_needed())
		return;

	printk(KERN_DEBUG "reserving inaccessible SNB gfx pages\n");

	/*
	 * SandyBridge integrated graphics devices have a bug that prevents
	 * them from accessing anything below 1M and the pages listed in
	 * bad_pages[] above.
	 *
	 * Reserve each listed page so that it is never handed out; a failed
	 * reservation is only warned about. Memory below 1 MB is reserved
	 * later during setup_arch() anyway, so it is not handled here.
	 */
	for (idx = 0; idx < ARRAY_SIZE(bad_pages); idx++) {
		const unsigned long page = bad_pages[idx];

		if (memblock_reserve(page, PAGE_SIZE))
			printk(KERN_WARNING "failed to reserve 0x%08lx\n",
			       page);
	}
}
/*
 * Adjust the E820 table for two BIOS-owned low memory areas and then
 * re-process the table.
 */
static void __init trim_bios_range(void)
{
	const u64 bios_start = BIOS_BEGIN;
	const u64 bios_len = BIOS_END - BIOS_BEGIN;

	/*
	 * The first page of memory is a BIOS owned area, not kernel RAM,
	 * but is generally not listed as such in the E820 table. Convert
	 * any RAM in [0, PAGE_SIZE) to reserved.
	 */
	e820__range_update(0, PAGE_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED);

	/*
	 * Some BIOSes report the PC BIOS area (640Kb -> 1Mb) as RAM even
	 * though it is not. Take RAM entries in that window out.
	 */
	e820__range_remove(bios_start, bios_len, E820_TYPE_RAM, 1);

	e820__update_table(e820_table);
}
/*
 * Make sure the kernel image (_text up to _end) is described as RAM in the
 * E820 table.
 *
 * Called before trim_bios_range() to spare an extra sanitize pass.
 */
static void __init e820_add_kernel_range(void)
{
	const u64 kernel_start = __pa_symbol(_text);
	const u64 kernel_size = __pa_symbol(_end) - kernel_start;

	/*
	 * If .text .data and .bss are not fully marked as E820_TYPE_RAM,
	 * complain and attempt to fix it by re-adding the range. We may have
	 * a confused BIOS, or the user may have used memmap=exactmap or
	 * memmap=xxM$yyM to exclude the kernel range. If we really are
	 * running on top of non-RAM, we will crash later anyways.
	 */
	if (!e820__mapped_all(kernel_start, kernel_start + kernel_size,
			      E820_TYPE_RAM)) {
		pr_warn(".text .data .bss are not marked as E820_TYPE_RAM!\n");
		e820__range_remove(kernel_start, kernel_size, E820_TYPE_RAM, 0);
		e820__range_add(kernel_start, kernel_size, E820_TYPE_RAM);
	}
}
/*
 * Perform the memblock reservations that have to be in place early:
 * the kernel image, the first 64K of memory, the initrd, the setup_data
 * list, the BIOS regions and the Sandy Bridge graphics pages.
 */
static void __init early_reserve_memory(void)
{
	unsigned long kernel_bytes;

	/*
	 * The kernel occupies the memory between the _text and
	 * __end_of_kernel_reserve symbols. Any kernel sections placed after
	 * __end_of_kernel_reserve must be explicitly reserved with a
	 * separate memblock_reserve() or they will be discarded.
	 */
	kernel_bytes = (unsigned long)__end_of_kernel_reserve -
		       (unsigned long)_text;
	memblock_reserve_kern(__pa_symbol(_text), kernel_bytes);

	/*
	 * The first 4Kb of memory is a BIOS owned area, but generally it is
	 * not listed as such in the E820 table.
	 *
	 * Some BIOSes are known to corrupt low memory, so the first 64K is
	 * reserved here. After the real mode trampoline is allocated the
	 * rest of the memory below 640k is reserved.
	 *
	 * This also guarantees page 0 is always reserved, which matters on
	 * systems with L1TF where its contents can be leaked to user
	 * processes.
	 */
	memblock_reserve(0, SZ_64K);

	early_reserve_initrd();

	memblock_x86_reserve_range_setup_data();

	reserve_bios_regions();
	trim_snb_memory();
}
/*
 * Notifier callback that prints the kernel offset information, or a note
 * that the offset is disabled when KASLR is not enabled. Wired to the
 * panic notifier list via kernel_offset_notifier.
 *
 * None of the notifier arguments are used. Always returns 0.
 */
static int
dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p)
{
	if (!kaslr_enabled()) {
		pr_emerg("Kernel Offset: disabled\n");
		return 0;
	}

	pr_emerg("Kernel Offset: 0x%lx from 0x%lx (relocation range: 0x%lx-0x%lx)\n",
		 kaslr_offset(), __START_KERNEL, __START_KERNEL_map,
		 MODULES_VADDR-1);

	return 0;
}
/*
 * Set or clear _PAGE_NX in __supported_pte_mask depending on whether the
 * boot CPU has X86_FEATURE_NX.
 */
void x86_configure_nx(void)
{
	if (!boot_cpu_has(X86_FEATURE_NX)) {
		__supported_pte_mask &= ~_PAGE_NX;
		return;
	}

	__supported_pte_mask |= _PAGE_NX;
}
/*
 * Log the state of NX (Execute Disable) protection: missing in the CPU,
 * active, or present but unusable because the kernel is 32-bit non-PAE.
 */
static void __init x86_report_nx(void)
{
	if (!boot_cpu_has(X86_FEATURE_NX)) {
		printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
		       "missing in CPU!\n");
		return;
	}

#if !defined(CONFIG_X86_64) && !defined(CONFIG_X86_PAE)
	/* 32bit non-PAE kernel, NX cannot be used */
	printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
	       "cannot be enabled: non-PAE kernel!\n");
#else
	printk(KERN_INFO "NX (Execute Disable) protection: active\n");
#endif
}
/*
* Determine if we were loaded by an EFI loader. If so, then we have also been
* passed the efi memmap, systab, etc., so we should use these data structures
* for initialization. Note, the efi init code path is determined by the
* global efi_enabled. This allows the same kernel image to be used on existing
* systems (with a traditional BIOS) as well as on EFI systems.
*/
/*
* setup_arch - architecture-specific boot-time initializations
* @cmdline_p: output; set to point at command_line, this function's copy of
*             the final boot_command_line
*
* Runs the x86 early boot sequence in a fixed order; most steps rely on
* state established by the steps before them, as the inline comments note.
*
* Note: On x86_64, fixmaps are ready for use even before this is called.
*/
void __init setup_arch(char **cmdline_p)
{
#ifdef CONFIG_X86_32
memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
/*
* copy kernel address range established so far and switch
* to the proper swapper page table
*/
clone_pgd_range(swapper_pg_dir + KERNEL_PGD_BOUNDARY,
initial_page_table + KERNEL_PGD_BOUNDARY,
KERNEL_PGD_PTRS);
load_cr3(swapper_pg_dir);
/*
* Note: Quark X1000 CPUs advertise PGE incorrectly and require
* a cr3 based tlb flush, so the following __flush_tlb_all()
* will not flush anything because the CPU quirk which clears
* X86_FEATURE_PGE has not been invoked yet. Though due to the
* load_cr3() above the TLB has been flushed already. The
* quirk is invoked before subsequent calls to __flush_tlb_all()
* so proper operation is guaranteed.
*/
__flush_tlb_all();
#else
/* Non-32-bit build: log the command line and set x86_phys_bits to MAX_PHYSMEM_BITS. */
printk(KERN_INFO "Command line: %s\n", boot_command_line);
boot_cpu_data.x86_phys_bits = MAX_PHYSMEM_BITS;
#endif
/*
* Fold the built-in command line into boot_command_line: with
* CONFIG_CMDLINE_OVERRIDE it replaces the boot loader string; otherwise,
* if the built-in string is non-empty, the boot loader string is appended
* to it and the result becomes boot_command_line.
*/
#ifdef CONFIG_CMDLINE_BOOL
#ifdef CONFIG_CMDLINE_OVERRIDE
strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
#else
if (builtin_cmdline[0]) {
/* append boot loader cmdline to builtin */
strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE);
strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE);
strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
}
#endif
builtin_cmdline_added = true;
#endif
/* Hand the caller a separate copy of the final boot_command_line. */
strscpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
*cmdline_p = command_line;
/*
* If we have OLPC OFW, we might end up relocating the fixmap due to
* reserve_top(), so do this before touching the ioremap area.
*/
olpc_ofw_detect();
idt_setup_early_traps();
early_cpu_init();
jump_label_init();
static_call_init();
early_ioremap_init();
setup_olpc_ofw_pgd();
parse_boot_params();
x86_init.oem.arch_setup();
/*
* Do some memory reservations *before* memory is added to memblock, so
* memblock allocations won't overwrite it.
*
* After this point, everything still needed from the boot loader or
* firmware or kernel text should be early reserved or marked not RAM in
* e820. All other memory is free game.
*
* This call needs to happen before e820__memory_setup() which calls the
* xen_memory_setup() on Xen dom0 which relies on the fact that those
* early reservations have happened already.
*/
early_reserve_memory();
/* Highest address representable with x86_phys_bits bits (inclusive end). */
iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1;
e820__memory_setup();
parse_setup_data();
copy_edd();
setup_initial_init_mm(_text, _etext, _edata, (void *)_brk_end);
/*
* x86_configure_nx() is called before parse_early_param() to detect
* whether hardware doesn't support NX (so that the early EHCI debug
* console setup can safely call set_fixmap()).
*/
x86_configure_nx();
parse_early_param();
if (efi_enabled(EFI_BOOT))
efi_memblock_x86_reserve_range();
x86_report_nx();
apic_setup_apic_calls();
/*
* A non-zero acpi_mps_check() result disables the APIC: the flag is set
* when local APIC support is built in, and the CPU capability is cleared.
*/
if (acpi_mps_check()) {
#ifdef CONFIG_X86_LOCAL_APIC
apic_is_disabled = true;
#endif
setup_clear_cpu_cap(X86_FEATURE_APIC);
}
e820__finish_early_params();
if (efi_enabled(EFI_BOOT))
efi_init();
reserve_ibft_region();
x86_init.resources.dmi_setup();
/*
* VMware detection requires dmi to be available, so this
* needs to be done after dmi_setup(), for the boot CPU.
* For some guest types (Xen PV, SEV-SNP, TDX) it is required to be
* called before cache_bp_init() for setting up MTRR state.
*/
init_hypervisor_platform();
tsc_early_init();
x86_init.resources.probe_roms();
/*
* Add resources for kernel text and data to the iomem_resource.
* Do it after parse_early_param, so it can be debugged.
*/
setup_kernel_resources();
e820_add_kernel_range();
trim_bios_range();
#ifdef CONFIG_X86_32
if (ppro_with_ram_bug()) {
e820__range_update(0x70000000ULL, 0x40000ULL, E820_TYPE_RAM,
E820_TYPE_RESERVED);
e820__update_table(e820_table);
printk(KERN_INFO "fixed physical RAM map:\n");
e820__print_table("bad_ppro");
}
#else
early_gart_iommu_check();
#endif
/*
* partially used pages are not usable - thus
* we are rounding upwards:
*/
max_pfn = e820__end_of_ram_pfn();
/* update e820 for memory not covered by WB MTRRs */
cache_bp_init();
/* Recompute max_pfn when mtrr_trim_uncached_memory() returns non-zero. */
if (mtrr_trim_uncached_memory(max_pfn))
max_pfn = e820__end_of_ram_pfn();
max_possible_pfn = max_pfn;
/*
* Define random base addresses for memory sections after max_pfn is
* defined and before each memory section base is used.
*/
kernel_randomize_memory();
#ifdef CONFIG_X86_32
/* max_low_pfn get updated here */
find_low_pfn_range();
#else
check_x2apic();
/* How many end-of-memory variables you have, grandma! */
/* need this before calling reserve_initrd */
if (max_pfn > (1UL<<(32 - PAGE_SHIFT)))
max_low_pfn = e820__end_of_low_ram_pfn();
else
max_low_pfn = max_pfn;
#endif
/* Find and reserve MPTABLE area */
x86_init.mpparse.find_mptable();
early_alloc_pgt_buf();
/*
* Need to conclude brk, before e820__memblock_setup()
* it could use memblock_find_in_range, could overlap with
* brk area.
*/
reserve_brk();
cleanup_highmap();
e820__memblock_setup();
/*
* Needs to run after memblock setup because it needs the physical
* memory size.
*/
mem_encrypt_setup_arch();
cc_random_init();
efi_find_mirror();
efi_esrt_init();
efi_mokvar_table_init();
/*
* The EFI specification says that boot service code won't be
* called after ExitBootServices(). This is, in fact, a lie.
*/
efi_reserve_boot_services();
/* preallocate 4k for mptable mpc */
e820__memblock_alloc_reserved_mpc_new();
#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
setup_bios_corruption_check();
#endif
#ifdef CONFIG_X86_32
printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n",
(max_pfn_mapped<<PAGE_SHIFT) - 1);
#endif
/*
* Find free memory for the real mode trampoline and place it there. If
* there is not enough free memory under 1M, on EFI-enabled systems
* there will be additional attempt to reclaim the memory for the real
* mode trampoline at efi_free_boot_services().
*
* Unconditionally reserve the entire first 1M of RAM because BIOSes
* are known to corrupt low memory and several hundred kilobytes are not
* worth complex detection what memory gets clobbered. Windows does the
* same thing for very similar reasons.
*
* Moreover, on machines with SandyBridge graphics or in setups that use
* crashkernel the entire 1M is reserved anyway.
*
* Note the host kernel TDX also requires the first 1MB being reserved.
*/
x86_platform.realmode_reserve();
init_mem_mapping();
/*
* init_mem_mapping() relies on the early IDT page fault handling.
* Now either enable FRED or install the real page fault handler
* for 64-bit in the IDT.
*/
cpu_init_replace_early_idt();
/*
* Update mmu_cr4_features (and, indirectly, trampoline_cr4_features)
* with the current CR4 value. This may not be necessary, but
* auditing all the early-boot CR4 manipulation would be needed to
* rule it out.
*
* Mask off features that don't work outside long mode (just
* PCIDE for now).
*/
mmu_cr4_features = __read_cr4() & ~X86_CR4_PCIDE;
memblock_set_current_limit(get_max_mapped());
/*
* NOTE: On x86-32, only from this point on, fixmaps are ready for use.
*/
#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
if (init_ohci1394_dma_early)
init_ohci1394_dma_on_all_controllers();
#endif
/* Allocate bigger log buffer */
setup_log_buf(1);
/* On EFI boots, log the secure boot mode recorded in boot_params. */
if (efi_enabled(EFI_BOOT)) {
switch (boot_params.secure_boot) {
case efi_secureboot_mode_disabled:
pr_info("Secure boot disabled\n");
break;
case efi_secureboot_mode_enabled:
pr_info("Secure boot enabled\n");
break;
default:
pr_info("Secure boot could not be determined\n");
break;
}
}
reserve_initrd();
acpi_table_upgrade();
/* Look for ACPI tables and reserve memory occupied by them. */
acpi_boot_table_init();
vsmp_init();
io_delay_init();
early_platform_quirks();
/* Some platforms need the APIC registered for NUMA configuration */
early_acpi_boot_init();
x86_init.mpparse.early_parse_smp_cfg();
x86_flattree_get_config();
initmem_init();
dma_contiguous_reserve(max_pfn_mapped << PAGE_SHIFT);
/*
* The hugetlb CMA reservation and bootmem allocation are only attempted
* when the boot CPU has X86_FEATURE_GBPAGES.
*/
if (boot_cpu_has(X86_FEATURE_GBPAGES)) {
hugetlb_cma_reserve(PUD_SHIFT - PAGE_SHIFT);
hugetlb_bootmem_alloc();
}
/*
* Reserve memory for crash kernel after SRAT is parsed so that it
* won't consume hotpluggable memory.
*/
arch_reserve_crashkernel();
/* Register the early xdbc console only if its hardware setup returned 0. */
if (!early_xdbc_setup_hardware())
early_xdbc_register_console();
x86_init.paging.pagetable_init();
kasan_init();
/*
* Sync back kernel address range.
*
* FIXME: Can the later sync in setup_cpu_entry_areas() replace
* this call?
*/
sync_initial_page_table();
tboot_probe();
map_vsyscall();
x86_32_probe_apic();
early_quirks();
topology_apply_cmdline_limits_early();
/*
* Parse SMP configuration. Try ACPI first and then the platform
* specific parser.
*/
acpi_boot_init();
x86_init.mpparse.parse_smp_cfg();
/* Last opportunity to detect and map the local APIC */
init_apic_mappings();
topology_init_possible_cpus();
init_cpu_to_node();
init_gi_nodes();
io_apic_init_mappings();
x86_init.hyper.guest_late_init();
e820__reserve_resources();
e820__register_nosave_regions(max_pfn);
x86_init.resources.reserve_resources();
e820__setup_pci_gap();
/*
* Register the VGA console unless this is an EFI boot where 0xa0000 is
* described as EFI_CONVENTIONAL_MEMORY.
*/
#ifdef CONFIG_VT
#if defined(CONFIG_VGA_CONSOLE)
if (!efi_enabled(EFI_BOOT) || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
vgacon_register_screen(&screen_info);
#endif
#endif
x86_init.oem.banner();
x86_init.timers.wallclock_init();
/*
* This needs to run before setup_local_APIC() which soft-disables the
* local APIC temporarily and that masks the thermal LVT interrupt,
* leading to softlockups on machines which have configured SMI
* interrupt delivery.
*/
therm_lvt_init();
mcheck_init();
register_refined_jiffies(CLOCK_TICK_RATE);
#ifdef CONFIG_EFI
if (efi_enabled(EFI_BOOT))
efi_apply_memmap_quirks();
#endif
unwind_init();
}
#ifdef CONFIG_X86_32
/*
 * Memory resource covering 0xa0000-0xbffff, requested from iomem_resource
 * by i386_reserve_resources().
 */
static struct resource video_ram_resource = {
	.flags	= IORESOURCE_BUSY | IORESOURCE_MEM,
	.start	= 0xa0000,
	.end	= 0xbffff,
	.name	= "Video RAM area",
};
/*
* Request the "Video RAM area" range from iomem_resource, then the I/O port
* ranges handled by reserve_standard_io_resources(). Only built for
* CONFIG_X86_32. The result of request_resource() is not checked.
*/
void __init i386_reserve_resources(void)
{
request_resource(&iomem_resource, &video_ram_resource);
reserve_standard_io_resources();
}
#endif /* CONFIG_X86_32 */
/*
* Notifier block whose callback is dump_kernel_offset(); it is added to
* panic_notifier_list by register_kernel_offset_dumper().
*/
static struct notifier_block kernel_offset_notifier = {
.notifier_call = dump_kernel_offset
};
/*
* Initcall that registers kernel_offset_notifier on panic_notifier_list, so
* that dump_kernel_offset() is invoked through that notifier chain.
* The registration result is ignored; always returns 0.
*/
static int __init register_kernel_offset_dumper(void)
{
atomic_notifier_chain_register(&panic_notifier_list,
&kernel_offset_notifier);
return 0;
}
__initcall(register_kernel_offset_dumper);
#ifdef CONFIG_HOTPLUG_CPU
/*
 * Report whether @cpu may be hotplugged: true for every CPU number of 1 or
 * above, false for CPU 0 (and for negative values).
 */
bool arch_cpu_is_hotpluggable(int cpu)
{
	return cpu >= 1;
}
#endif /* CONFIG_HOTPLUG_CPU */