mirror of
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-09-04 20:19:47 +08:00

- A large and involved preparatory series to pave the way to add exception handling for relocate_kernel - which will be a debugging facility that has aided in the field to debug an exceptionally hard to debug early boot bug. Plus assorted cleanups and fixes that were discovered along the way, by David Woodhouse: - Clean up and document register use in relocate_kernel_64.S - Use named labels in swap_pages in relocate_kernel_64.S - Only swap pages for ::preserve_context mode - Allocate PGD for x86_64 transition page tables separately - Copy control page into place in machine_kexec_prepare() - Invoke copy of relocate_kernel() instead of the original - Move relocate_kernel to kernel .data section - Add data section to relocate_kernel - Drop page_list argument from relocate_kernel() - Eliminate writes through kernel mapping of relocate_kernel page - Clean up register usage in relocate_kernel() - Mark relocate_kernel page as ROX instead of RWX - Disable global pages before writing to control page - Ensure preserve_context flag is set on return to kernel - Use correct swap page in swap_pages function - Fix stack and handling of re-entry point for ::preserve_context - Mark machine_kexec() with __nocfi - Cope with relocate_kernel() not being at the start of the page - Use typedef for relocate_kernel_fn function prototype - Fix location of relocate_kernel with -ffunction-sections (fix by Nathan Chancellor) - A series to remove the last remaining absolute symbol references from .head.text, and enforce this at build time, by Ard Biesheuvel: - Avoid WARN()s and panic()s in early boot code - Don't hang but terminate on failure to remap SVSM CA - Determine VA/PA offset before entering C code - Avoid intentional absolute symbol references in .head.text - Disable UBSAN in early boot code - Move ENTRY_TEXT to the start of the image - Move .head.text into its own output section - Reject absolute references in .head.text - Which build-time enforcement uncovered a handful of bugs of 
essentially non-working code, and a wrokaround for a toolchain bug, fixed by Ard Biesheuvel as well: - Fix spurious undefined reference when CONFIG_X86_5LEVEL=n, on GCC-12 - Disable UBSAN on SEV code that may execute very early - Disable ftrace branch profiling in SEV startup code - And miscellaneous cleanups: - kexec_core: Add and update comments regarding the KEXEC_JUMP flow (Rafael J. Wysocki) - x86/sysfs: Constify 'struct bin_attribute' (Thomas Weißschuh) Signed-off-by: Ingo Molnar <mingo@kernel.org> -----BEGIN PGP SIGNATURE----- iQJFBAABCgAvFiEEBpT5eoXrXCwVQwEKEnMQ0APhK1gFAmeQDmURHG1pbmdvQGtl cm5lbC5vcmcACgkQEnMQ0APhK1inwRAAjD5QR/Yu7Yiv2nM/ncUwAItsFkv9Jk4Y HPGz9qNJoZxKxuZVj9bfQhWDe3g6VLnlDYgatht9BsyP5b12qZrUe+yp/TOH54Z3 wPD+U/jun4jiSr7oJkJC+bFn+a/tL39pB8Y6m+jblacgVglleO3SH5fBWNE1UbIV e2iiNxi0bfuHy3wquegnKaMyF1e7YLw1p5laGSwwk21g5FjT7cLQOqC0/9u8u9xX Ha+iaod7JOcjiQOqIt/MV57ldWEFCrUhQozRV3tK5Ptf5aoGFpisgQoRoduWUtFz UbHiHhv6zE4DOIUzaAbJjYfR1Z/LCviwON97XJgeOOkJaULF7yFCfhGxKSyQoMIh qZtlBs4VsGl2/dOl+iW6xKwgRiNundTzSQtt5D/xuFz5LnDxe/SrlZnYp8lOPP8R w9V2b/fC0YxmUzEW6EDhBqvfuScKiNWoic47qvYfZPaWyg1ESpvWTIh6AKB5ThUR upgJQdA4HW+y5C57uHW40TSe3xEeqM3+Slk0jxLElP7/yTul5r7jrjq2EkwaAv/j 6/0LsMSr33r9fVFeMP1qLXPUaipcqTWWTpeeTr8NBGUcvOKzw5SltEG4NihzCyhF 3/UMQhcQ6KE3iFMPlRu4hV7ZV4gErZmLoRwh9Uk28f2Xx8T95uoV8KTg1/sRZRTo uQLeRxYnyrw= =vGWS -----END PGP SIGNATURE----- Merge tag 'x86-boot-2025-01-21' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip Pull x86 boot updates from Ingo Molnar: - A large and involved preparatory series to pave the way to add exception handling for relocate_kernel - which will be a debugging facility that has aided in the field to debug an exceptionally hard to debug early boot bug. 
Plus assorted cleanups and fixes that were discovered along the way, by David Woodhouse: - Clean up and document register use in relocate_kernel_64.S - Use named labels in swap_pages in relocate_kernel_64.S - Only swap pages for ::preserve_context mode - Allocate PGD for x86_64 transition page tables separately - Copy control page into place in machine_kexec_prepare() - Invoke copy of relocate_kernel() instead of the original - Move relocate_kernel to kernel .data section - Add data section to relocate_kernel - Drop page_list argument from relocate_kernel() - Eliminate writes through kernel mapping of relocate_kernel page - Clean up register usage in relocate_kernel() - Mark relocate_kernel page as ROX instead of RWX - Disable global pages before writing to control page - Ensure preserve_context flag is set on return to kernel - Use correct swap page in swap_pages function - Fix stack and handling of re-entry point for ::preserve_context - Mark machine_kexec() with __nocfi - Cope with relocate_kernel() not being at the start of the page - Use typedef for relocate_kernel_fn function prototype - Fix location of relocate_kernel with -ffunction-sections (fix by Nathan Chancellor) - A series to remove the last remaining absolute symbol references from .head.text, and enforce this at build time, by Ard Biesheuvel: - Avoid WARN()s and panic()s in early boot code - Don't hang but terminate on failure to remap SVSM CA - Determine VA/PA offset before entering C code - Avoid intentional absolute symbol references in .head.text - Disable UBSAN in early boot code - Move ENTRY_TEXT to the start of the image - Move .head.text into its own output section - Reject absolute references in .head.text - The above build-time enforcement uncovered a handful of bugs of essentially non-working code, and a workaround for a toolchain bug, fixed by Ard Biesheuvel as well: - Fix spurious undefined reference when CONFIG_X86_5LEVEL=n, on GCC-12 - Disable UBSAN on SEV code that may execute very 
early - Disable ftrace branch profiling in SEV startup code - And miscellaneous cleanups: - kexec_core: Add and update comments regarding the KEXEC_JUMP flow (Rafael J. Wysocki) - x86/sysfs: Constify 'struct bin_attribute' (Thomas Weißschuh)" * tag 'x86-boot-2025-01-21' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (33 commits) x86/sev: Disable ftrace branch profiling in SEV startup code x86/kexec: Use typedef for relocate_kernel_fn function prototype x86/kexec: Cope with relocate_kernel() not being at the start of the page kexec_core: Add and update comments regarding the KEXEC_JUMP flow x86/kexec: Mark machine_kexec() with __nocfi x86/kexec: Fix location of relocate_kernel with -ffunction-sections x86/kexec: Fix stack and handling of re-entry point for ::preserve_context x86/kexec: Use correct swap page in swap_pages function x86/kexec: Ensure preserve_context flag is set on return to kernel x86/kexec: Disable global pages before writing to control page x86/sev: Don't hang but terminate on failure to remap SVSM CA x86/sev: Disable UBSAN on SEV code that may execute very early x86/boot/64: Fix spurious undefined reference when CONFIG_X86_5LEVEL=n, on GCC-12 x86/sysfs: Constify 'struct bin_attribute' x86/kexec: Mark relocate_kernel page as ROX instead of RWX x86/kexec: Clean up register usage in relocate_kernel() x86/kexec: Eliminate writes through kernel mapping of relocate_kernel page x86/kexec: Drop page_list argument from relocate_kernel() x86/kexec: Add data section to relocate_kernel x86/kexec: Move relocate_kernel to kernel .data section ...
398 lines
9.1 KiB
C
398 lines
9.1 KiB
C
// SPDX-License-Identifier: GPL-2.0-only
|
|
|
|
/* Message prefix for this file; prdbg() applies it explicitly via pr_fmt(). */
#define pr_fmt(fmt) "callthunks: " fmt
|
|
|
|
#include <linux/debugfs.h>
|
|
#include <linux/kallsyms.h>
|
|
#include <linux/memory.h>
|
|
#include <linux/moduleloader.h>
|
|
#include <linux/static_call.h>
|
|
|
|
#include <asm/alternative.h>
|
|
#include <asm/asm-offsets.h>
|
|
#include <asm/cpu.h>
|
|
#include <asm/ftrace.h>
|
|
#include <asm/insn.h>
|
|
#include <asm/kexec.h>
|
|
#include <asm/nospec-branch.h>
|
|
#include <asm/paravirt.h>
|
|
#include <asm/sections.h>
|
|
#include <asm/switch_to.h>
|
|
#include <asm/sync_core.h>
|
|
#include <asm/text-patching.h>
|
|
#include <asm/xen/hypercall.h>
|
|
|
|
/* Set by the "debug-callthunks" boot parameter; gates prdbg() output. */
static int __initdata_or_module debug_callthunks;
|
|
|
|
/* Size of the on-stack scratch buffers that hold a relocated copy of the thunk template. */
#define MAX_PATCH_LEN (255-1)
|
|
|
|
/* Emit a KERN_DEBUG message, but only while debug_callthunks is set. */
#define prdbg(fmt, ...)							\
do {									\
	if (debug_callthunks)						\
		printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__);		\
} while (0)
|
|
|
|
static int __init debug_thunks(char *str)
|
|
{
|
|
debug_callthunks = 1;
|
|
return 1;
|
|
}
|
|
__setup("debug-callthunks", debug_thunks);
|
|
|
|
#ifdef CONFIG_CALL_THUNKS_DEBUG
/*
 * Per-CPU u64 counters. callthunks_debug_show() prints them as C (call),
 * R (ret), S (stuffs) and X (ctxsw).
 */
DEFINE_PER_CPU(u64, __x86_call_count);
EXPORT_PER_CPU_SYMBOL_GPL(__x86_call_count);
DEFINE_PER_CPU(u64, __x86_ret_count);
DEFINE_PER_CPU(u64, __x86_stuffs_count);
DEFINE_PER_CPU(u64, __x86_ctxsw_count);
EXPORT_PER_CPU_SYMBOL_GPL(__x86_ctxsw_count);
#endif
|
|
|
|
/*
 * Bounds of the builtin call-site table. patch_call_sites() treats each s32
 * entry as an offset relative to the entry's own address.
 */
extern s32 __call_sites[], __call_sites_end[];
|
|
|
|
/* One contiguous text region plus a label for debug output. */
struct core_text {
	/* Half-open address range [base, end) covered by the region. */
	unsigned long base, end;
	/* Label printed by the prdbg() messages in callthunks_setup(). */
	const char *name;
};
|
|
|
|
/* Set to true by callthunks_patch_builtin_calls() once the builtin call sites are patched. */
static bool thunks_initialized __ro_after_init;
|
|
|
|
static const struct core_text builtin_coretext = {
|
|
.base = (unsigned long)_text,
|
|
.end = (unsigned long)_etext,
|
|
.name = "builtin",
|
|
};
|
|
|
|
asm (
|
|
".pushsection .rodata \n"
|
|
".global skl_call_thunk_template \n"
|
|
"skl_call_thunk_template: \n"
|
|
__stringify(INCREMENT_CALL_DEPTH)" \n"
|
|
".global skl_call_thunk_tail \n"
|
|
"skl_call_thunk_tail: \n"
|
|
".popsection \n"
|
|
);
|
|
|
|
extern u8 skl_call_thunk_template[];
|
|
extern u8 skl_call_thunk_tail[];
|
|
|
|
#define SKL_TMPL_SIZE \
|
|
((unsigned int)(skl_call_thunk_tail - skl_call_thunk_template))
|
|
|
|
/* Declared here so that skip_addr() can compare call destinations against them. */
extern void error_entry(void);
extern void paranoid_entry(void);
extern void xen_error_entry(void);
|
|
|
|
static inline bool within_coretext(const struct core_text *ct, void *addr)
|
|
{
|
|
unsigned long p = (unsigned long)addr;
|
|
|
|
return ct->base <= p && p < ct->end;
|
|
}
|
|
|
|
static inline bool within_module_coretext(void *addr)
|
|
{
|
|
bool ret = false;
|
|
|
|
#ifdef CONFIG_MODULES
|
|
struct module *mod;
|
|
|
|
preempt_disable();
|
|
mod = __module_address((unsigned long)addr);
|
|
if (mod && within_module_core((unsigned long)addr, mod))
|
|
ret = true;
|
|
preempt_enable();
|
|
#endif
|
|
return ret;
|
|
}
|
|
|
|
static bool is_coretext(const struct core_text *ct, void *addr)
|
|
{
|
|
if (ct && within_coretext(ct, addr))
|
|
return true;
|
|
if (within_coretext(&builtin_coretext, addr))
|
|
return true;
|
|
return within_module_coretext(addr);
|
|
}
|
|
|
|
static bool skip_addr(void *dest)
|
|
{
|
|
if (dest == error_entry)
|
|
return true;
|
|
if (dest == paranoid_entry)
|
|
return true;
|
|
if (dest == xen_error_entry)
|
|
return true;
|
|
/* Does FILL_RSB... */
|
|
if (dest == __switch_to_asm)
|
|
return true;
|
|
/* Accounts directly */
|
|
if (dest == ret_from_fork)
|
|
return true;
|
|
#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_AMD_MEM_ENCRYPT)
|
|
if (dest == soft_restart_cpu)
|
|
return true;
|
|
#endif
|
|
#ifdef CONFIG_FUNCTION_TRACER
|
|
if (dest == __fentry__)
|
|
return true;
|
|
#endif
|
|
#ifdef CONFIG_KEXEC_CORE
|
|
# ifdef CONFIG_X86_64
|
|
if (dest >= (void *)__relocate_kernel_start &&
|
|
dest < (void *)__relocate_kernel_end)
|
|
return true;
|
|
# else
|
|
if (dest >= (void *)relocate_kernel &&
|
|
dest < (void*)relocate_kernel + KEXEC_CONTROL_CODE_MAX_SIZE)
|
|
return true;
|
|
# endif
|
|
#endif
|
|
return false;
|
|
}
|
|
|
|
static __init_or_module void *call_get_dest(void *addr)
|
|
{
|
|
struct insn insn;
|
|
void *dest;
|
|
int ret;
|
|
|
|
ret = insn_decode_kernel(&insn, addr);
|
|
if (ret)
|
|
return ERR_PTR(ret);
|
|
|
|
/* Patched out call? */
|
|
if (insn.opcode.bytes[0] != CALL_INSN_OPCODE)
|
|
return NULL;
|
|
|
|
dest = addr + insn.length + insn.immediate.value;
|
|
if (skip_addr(dest))
|
|
return NULL;
|
|
return dest;
|
|
}
|
|
|
|
/* 32 bytes of 0x90; patch_dest() compares an unpatched padding area against it. */
static const u8 nops[] = {
	[0 ... 31] = 0x90,
};
|
|
|
|
/*
 * Place a relocated copy of the thunk template into the SKL_TMPL_SIZE bytes
 * immediately in front of @dest.
 *
 * @direct selects a plain memcpy(); otherwise text_poke_copy_locked() is
 * used for the write.
 *
 * Returns the start of the padding area when it holds the thunk (freshly
 * written or already present), or NULL when the area contains neither the
 * thunk nor 0x90 bytes.
 */
static void *patch_dest(void *dest, bool direct)
{
	unsigned int len = SKL_TMPL_SIZE;
	u8 *pad = dest - len;
	u8 thunk[MAX_PATCH_LEN];

	memcpy(thunk, skl_call_thunk_template, len);
	apply_relocation(thunk, pad, len, skl_call_thunk_template, len);

	/* Already patched? */
	if (bcmp(pad, thunk, len) == 0)
		return pad;

	/* Ensure there are nops */
	if (bcmp(pad, nops, len) != 0) {
		pr_warn_once("Invalid padding area for %pS\n", dest);
		return NULL;
	}

	if (!direct)
		text_poke_copy_locked(pad, thunk, len, true);
	else
		memcpy(pad, thunk, len);

	return pad;
}
|
|
|
|
static __init_or_module void patch_call(void *addr, const struct core_text *ct)
|
|
{
|
|
void *pad, *dest;
|
|
u8 bytes[8];
|
|
|
|
if (!within_coretext(ct, addr))
|
|
return;
|
|
|
|
dest = call_get_dest(addr);
|
|
if (!dest || WARN_ON_ONCE(IS_ERR(dest)))
|
|
return;
|
|
|
|
if (!is_coretext(ct, dest))
|
|
return;
|
|
|
|
pad = patch_dest(dest, within_coretext(ct, dest));
|
|
if (!pad)
|
|
return;
|
|
|
|
prdbg("Patch call at: %pS %px to %pS %px -> %px \n", addr, addr,
|
|
dest, dest, pad);
|
|
__text_gen_insn(bytes, CALL_INSN_OPCODE, addr, pad, CALL_INSN_SIZE);
|
|
text_poke_early(addr, bytes, CALL_INSN_SIZE);
|
|
}
|
|
|
|
static __init_or_module void
|
|
patch_call_sites(s32 *start, s32 *end, const struct core_text *ct)
|
|
{
|
|
s32 *s;
|
|
|
|
for (s = start; s < end; s++)
|
|
patch_call((void *)s + *s, ct);
|
|
}
|
|
|
|
static __init_or_module void
|
|
patch_alt_call_sites(struct alt_instr *start, struct alt_instr *end,
|
|
const struct core_text *ct)
|
|
{
|
|
struct alt_instr *a;
|
|
|
|
for (a = start; a < end; a++)
|
|
patch_call((void *)&a->instr_offset + a->instr_offset, ct);
|
|
}
|
|
|
|
static __init_or_module void
|
|
callthunks_setup(struct callthunk_sites *cs, const struct core_text *ct)
|
|
{
|
|
prdbg("Patching call sites %s\n", ct->name);
|
|
patch_call_sites(cs->call_start, cs->call_end, ct);
|
|
patch_alt_call_sites(cs->alt_start, cs->alt_end, ct);
|
|
prdbg("Patching call sites done%s\n", ct->name);
|
|
}
|
|
|
|
/*
 * Patch the call sites of the builtin kernel text and mark the thunks as
 * initialized. Does nothing unless X86_FEATURE_CALL_DEPTH is enabled.
 */
void __init callthunks_patch_builtin_calls(void)
{
	struct callthunk_sites cs = {
		.call_start	= __call_sites,
		.call_end	= __call_sites_end,
		.alt_start	= __alt_instructions,
		.alt_end	= __alt_instructions_end
	};

	if (cpu_feature_enabled(X86_FEATURE_CALL_DEPTH)) {
		pr_info("Setting up call depth tracking\n");

		/* Patching and the flag update both happen under text_mutex. */
		mutex_lock(&text_mutex);
		callthunks_setup(&cs, &builtin_coretext);
		thunks_initialized = true;
		mutex_unlock(&text_mutex);
	}
}
|
|
|
|
void *callthunks_translate_call_dest(void *dest)
|
|
{
|
|
void *target;
|
|
|
|
lockdep_assert_held(&text_mutex);
|
|
|
|
if (!thunks_initialized || skip_addr(dest))
|
|
return dest;
|
|
|
|
if (!is_coretext(NULL, dest))
|
|
return dest;
|
|
|
|
target = patch_dest(dest, false);
|
|
return target ? : dest;
|
|
}
|
|
|
|
#ifdef CONFIG_BPF_JIT
|
|
static bool is_callthunk(void *addr)
|
|
{
|
|
unsigned int tmpl_size = SKL_TMPL_SIZE;
|
|
u8 insn_buff[MAX_PATCH_LEN];
|
|
unsigned long dest;
|
|
u8 *pad;
|
|
|
|
dest = roundup((unsigned long)addr, CONFIG_FUNCTION_ALIGNMENT);
|
|
if (!thunks_initialized || skip_addr((void *)dest))
|
|
return false;
|
|
|
|
pad = (void *)(dest - tmpl_size);
|
|
|
|
memcpy(insn_buff, skl_call_thunk_template, tmpl_size);
|
|
apply_relocation(insn_buff, pad, tmpl_size, skl_call_thunk_template, tmpl_size);
|
|
|
|
return !bcmp(pad, insn_buff, tmpl_size);
|
|
}
|
|
|
|
/*
 * Write a copy of the thunk template, relocated for address @ip, to *@pprog
 * and advance *@pprog past it.
 *
 * Returns the number of bytes emitted, or 0 when nothing was written:
 * either the thunks are not initialized, or @func is non-NULL and
 * is_callthunk() reports a thunk for it.
 */
int x86_call_depth_emit_accounting(u8 **pprog, void *func, void *ip)
{
	unsigned int len = SKL_TMPL_SIZE;
	u8 thunk[MAX_PATCH_LEN];

	/* Is function call target a thunk? */
	if (!thunks_initialized || (func && is_callthunk(func)))
		return 0;

	memcpy(thunk, skl_call_thunk_template, len);
	apply_relocation(thunk, ip, len, skl_call_thunk_template, len);

	memcpy(*pprog, thunk, len);
	*pprog += len;

	return len;
}
|
|
#endif
|
|
|
|
#ifdef CONFIG_MODULES
|
|
/*
 * Patch the call sites @cs of module @mod, using the module's MOD_TEXT
 * memory as the text region. Does nothing before the thunks are
 * initialized. Takes text_mutex around the patching.
 */
void noinline callthunks_patch_module_calls(struct callthunk_sites *cs,
					    struct module *mod)
{
	unsigned long text_base = (unsigned long)mod->mem[MOD_TEXT].base;
	struct core_text ct = {
		.base = text_base,
		.end  = text_base + mod->mem[MOD_TEXT].size,
		.name = mod->name,
	};

	if (!thunks_initialized)
		return;

	mutex_lock(&text_mutex);
	callthunks_setup(cs, &ct);
	mutex_unlock(&text_mutex);
}
|
|
#endif /* CONFIG_MODULES */
|
|
|
|
#if defined(CONFIG_CALL_THUNKS_DEBUG) && defined(CONFIG_DEBUG_FS)
|
|
static int callthunks_debug_show(struct seq_file *m, void *p)
|
|
{
|
|
unsigned long cpu = (unsigned long)m->private;
|
|
|
|
seq_printf(m, "C: %16llu R: %16llu S: %16llu X: %16llu\n,",
|
|
per_cpu(__x86_call_count, cpu),
|
|
per_cpu(__x86_ret_count, cpu),
|
|
per_cpu(__x86_stuffs_count, cpu),
|
|
per_cpu(__x86_ctxsw_count, cpu));
|
|
return 0;
|
|
}
|
|
|
|
static int callthunks_debug_open(struct inode *inode, struct file *file)
|
|
{
|
|
return single_open(file, callthunks_debug_show, inode->i_private);
|
|
}
|
|
|
|
static const struct file_operations dfs_ops = {
|
|
.open = callthunks_debug_open,
|
|
.read = seq_read,
|
|
.llseek = seq_lseek,
|
|
.release = single_release,
|
|
};
|
|
|
|
/*
 * Create the "callthunks" debugfs directory with one "cpuN" file per
 * possible CPU. The CPU number is passed to the file as its private data.
 * Always returns 0.
 */
static int __init callthunks_debugfs_init(void)
{
	struct dentry *dir;
	unsigned long cpu;

	dir = debugfs_create_dir("callthunks", NULL);
	for_each_possible_cpu(cpu) {
		void *arg = (void *)cpu;
		/* "cpu" + up to 20 decimal digits of a 64-bit value + NUL */
		char name[24];

		/* Bounded write: the name can never overrun the buffer. */
		snprintf(name, sizeof(name), "cpu%lu", cpu);
		debugfs_create_file(name, 0644, dir, arg, &dfs_ops);
	}
	return 0;
}
__initcall(callthunks_debugfs_init);
|
|
#endif
|