// SPDX-License-Identifier: MIT
/*
* Copyright © 2021 Intel Corporation
*/
#include "xe_lrc.h"
#include <generated/xe_wa_oob.h>
#include <linux/ascii85.h>
#include <linux/panic.h>
#include "instructions/xe_mi_commands.h"
#include "instructions/xe_gfxpipe_commands.h"
#include "instructions/xe_gfx_state_commands.h"
#include "regs/xe_engine_regs.h"
#include "regs/xe_lrc_layout.h"
#include "xe_bb.h"
#include "xe_bo.h"
#include "xe_configfs.h"
#include "xe_device.h"
#include "xe_drm_client.h"
#include "xe_exec_queue_types.h"
#include "xe_gt.h"
#include "xe_gt_printk.h"
#include "xe_hw_fence.h"
#include "xe_map.h"
#include "xe_memirq.h"
#include "xe_mmio.h"
#include "xe_sriov.h"
#include "xe_trace_lrc.h"
#include "xe_vm.h"
#include "xe_wa.h"
#define LRC_VALID BIT_ULL(0)
#define LRC_PRIVILEGE BIT_ULL(8)
#define LRC_ADDRESSING_MODE GENMASK_ULL(4, 3)
#define LRC_LEGACY_64B_CONTEXT 3
#define LRC_ENGINE_CLASS GENMASK_ULL(63, 61)
#define LRC_ENGINE_INSTANCE GENMASK_ULL(53, 48)
#define LRC_PPHWSP_SIZE SZ_4K
#define LRC_INDIRECT_CTX_BO_SIZE SZ_4K
#define LRC_INDIRECT_RING_STATE_SIZE SZ_4K
#define LRC_PRIORITY GENMASK_ULL(10, 9)
#define LRC_PRIORITY_LOW 0
#define LRC_PRIORITY_NORMAL 1
#define LRC_PRIORITY_HIGH 2
/*
* Layout of the LRC and associated data allocated as
* lrc->bo:
*
*   Region                       Size
* +============================+=================================+ <- __xe_lrc_ring_offset()
* | Ring                       | ring_size, see                  |
* |                            | xe_lrc_init()                   |
* +============================+=================================+ <- __xe_lrc_pphwsp_offset()
* | PPHWSP (includes SW state) | 4K                              |
* +----------------------------+---------------------------------+ <- __xe_lrc_regs_offset()
* | Engine Context Image       | n * 4K, see                     |
* |                            | xe_gt_lrc_size()                |
* +----------------------------+---------------------------------+ <- __xe_lrc_indirect_ring_offset()
* | Indirect Ring State Page   | 0 or 4K, see                    |
* |                            | XE_LRC_FLAG_INDIRECT_RING_STATE |
* +============================+=================================+ <- __xe_lrc_indirect_ctx_offset()
* | Indirect Context Page      | 0 or 4K, see                    |
* |                            | XE_LRC_FLAG_INDIRECT_CTX        |
* +============================+=================================+ <- __xe_lrc_wa_bb_offset()
* | WA BB Per Ctx              | 4K                              |
* +============================+=================================+ <- xe_bo_size(lrc->bo)
*/
static struct xe_device *
lrc_to_xe(struct xe_lrc *lrc)
{
return gt_to_xe(lrc->fence_ctx.gt);
}
static bool
gt_engine_needs_indirect_ctx(struct xe_gt *gt, enum xe_engine_class class)
{
struct xe_device *xe = gt_to_xe(gt);
if (XE_GT_WA(gt, 16010904313) &&
(class == XE_ENGINE_CLASS_RENDER ||
class == XE_ENGINE_CLASS_COMPUTE))
return true;
if (xe_configfs_get_ctx_restore_mid_bb(to_pci_dev(xe->drm.dev),
class, NULL))
return true;
return false;
}
/**
* xe_gt_lrc_hang_replay_size() - Hang replay size
* @gt: The GT
* @class: Hardware engine class
*
* Determine size of GPU hang replay state for a GT and hardware engine class.
*
* Return: Size of the GPU hang replay state, in bytes
*/
size_t xe_gt_lrc_hang_replay_size(struct xe_gt *gt, enum xe_engine_class class)
{
struct xe_device *xe = gt_to_xe(gt);
size_t size = 0;
/* Engine context image */
switch (class) {
case XE_ENGINE_CLASS_RENDER:
if (GRAPHICS_VER(xe) >= 20)
size += 3 * SZ_4K;
else
size += 13 * SZ_4K;
break;
case XE_ENGINE_CLASS_COMPUTE:
if (GRAPHICS_VER(xe) >= 20)
size += 2 * SZ_4K;
else
size += 13 * SZ_4K;
break;
default:
WARN(1, "Unknown engine class: %d", class);
fallthrough;
case XE_ENGINE_CLASS_COPY:
case XE_ENGINE_CLASS_VIDEO_DECODE:
case XE_ENGINE_CLASS_VIDEO_ENHANCE:
case XE_ENGINE_CLASS_OTHER:
size += 1 * SZ_4K;
}
return size;
}
size_t xe_gt_lrc_size(struct xe_gt *gt, enum xe_engine_class class)
{
size_t size = xe_gt_lrc_hang_replay_size(gt, class);
/* Add indirect ring state page */
if (xe_gt_has_indirect_ring_state(gt))
size += LRC_INDIRECT_RING_STATE_SIZE;
return size + LRC_PPHWSP_SIZE;
}
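/*
* For illustration: on a Xe2 (GRAPHICS_VER >= 20) render engine, and assuming
* the GT reports indirect ring state, this works out to 3 * SZ_4K of engine
* context image (the hang replay size) + SZ_4K of indirect ring state +
* SZ_4K of PPHWSP = 20K in total.
*/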
/*
* The per-platform tables are u8-encoded in @data. Decode @data and write the
* commands and register address offsets into @regs. The following encoding is
* used for each byte, in 2 steps: decoding commands and decoding addresses.
*
* Commands:
* [7]: create NOPs - the number of NOPs is set in the lower bits
* [6]: When creating an MI_LOAD_REGISTER_IMM command, allow setting
* MI_LRI_FORCE_POSTED
* [5:0]: Number of NOPs, or number of registers to set values for in the case
* of MI_LOAD_REGISTER_IMM
*
* Addresses: these are decoded after an MI_LOAD_REGISTER_IMM command, one per
* register for "count" registers. They are set by using the REG/REG16 macros:
* the former is used for offsets smaller than 0x200 while the latter is for
* offsets bigger than that. Those macros already set all the bits documented
* below correctly:
*
* [7]: When a register offset needs more than 6 bits, additional bytes follow
* for the lower bits
* [6:0]: Register offset, without considering the engine base.
*
* This function only tweaks the commands and register offsets. Values are not
* filled out.
*/
static void set_offsets(u32 *regs,
const u8 *data,
const struct xe_hw_engine *hwe)
#define NOP(x) (BIT(7) | (x))
#define LRI(count, flags) ((flags) << 6 | (count) | \
BUILD_BUG_ON_ZERO(count >= BIT(6)))
#define POSTED BIT(0)
#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
#define REG16(x) \
(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
(((x) >> 2) & 0x7f)
{
const u32 base = hwe->mmio_base;
while (*data) {
u8 count, flags;
if (*data & BIT(7)) { /* skip */
count = *data++ & ~BIT(7);
regs += count;
continue;
}
count = *data & 0x3f;
flags = *data >> 6;
data++;
*regs = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(count);
if (flags & POSTED)
*regs |= MI_LRI_FORCE_POSTED;
*regs |= MI_LRI_LRM_CS_MMIO;
regs++;
xe_gt_assert(hwe->gt, count);
do {
u32 offset = 0;
u8 v;
do {
v = *data++;
offset <<= 7;
offset |= v & ~BIT(7);
} while (v & BIT(7));
regs[0] = base + (offset << 2);
regs += 2;
} while (--count);
}
*regs = MI_BATCH_BUFFER_END | BIT(0);
}
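/*
* Worked example (illustrative only, not part of any real table): with
* mmio_base = 0x2000, the fragment
*
*   NOP(1), LRI(2, POSTED), REG(0x034), REG16(0x2b4), 0
*
* encodes to the bytes 0x81, 0x42, 0x0d, 0x81 0x2d, 0x00. Decoding: 0x81
* skips one dword of @regs; 0x42 emits MI_LOAD_REGISTER_IMM |
* MI_LRI_NUM_REGS(2) | MI_LRI_FORCE_POSTED | MI_LRI_LRM_CS_MMIO; 0x0d selects
* register 0x2000 + (0x0d << 2) = 0x2034; the pair 0x81 0x2d accumulates
* (0x01 << 7) | 0x2d = 0xad, i.e. register 0x2000 + (0xad << 2) = 0x22b4; the
* terminating 0x00 stops the walk and MI_BATCH_BUFFER_END is written at the
* current @regs position.
*/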
static const u8 gen12_xcs_offsets[] = {
NOP(1),
LRI(13, POSTED),
REG16(0x244),
REG(0x034),
REG(0x030),
REG(0x038),
REG(0x03c),
REG(0x168),
REG(0x140),
REG(0x110),
REG(0x1c0),
REG(0x1c4),
REG(0x1c8),
REG(0x180),
REG16(0x2b4),
NOP(5),
LRI(9, POSTED),
REG16(0x3a8),
REG16(0x28c),
REG16(0x288),
REG16(0x284),
REG16(0x280),
REG16(0x27c),
REG16(0x278),
REG16(0x274),
REG16(0x270),
0
};
static const u8 dg2_xcs_offsets[] = {
NOP(1),
LRI(15, POSTED),
REG16(0x244),
REG(0x034),
REG(0x030),
REG(0x038),
REG(0x03c),
REG(0x168),
REG(0x140),
REG(0x110),
REG(0x1c0),
REG(0x1c4),
REG(0x1c8),
REG(0x180),
REG16(0x2b4),
REG(0x120),
REG(0x124),
NOP(1),
LRI(9, POSTED),
REG16(0x3a8),
REG16(0x28c),
REG16(0x288),
REG16(0x284),
REG16(0x280),
REG16(0x27c),
REG16(0x278),
REG16(0x274),
REG16(0x270),
0
};
static const u8 gen12_rcs_offsets[] = {
NOP(1),
LRI(13, POSTED),
REG16(0x244),
REG(0x034),
REG(0x030),
REG(0x038),
REG(0x03c),
REG(0x168),
REG(0x140),
REG(0x110),
REG(0x1c0),
REG(0x1c4),
REG(0x1c8),
REG(0x180),
REG16(0x2b4),
NOP(5),
LRI(9, POSTED),
REG16(0x3a8),
REG16(0x28c),
REG16(0x288),
REG16(0x284),
REG16(0x280),
REG16(0x27c),
REG16(0x278),
REG16(0x274),
REG16(0x270),
LRI(3, POSTED),
REG(0x1b0),
REG16(0x5a8),
REG16(0x5ac),
NOP(6),
LRI(1, 0),
REG(0x0c8),
NOP(3 + 9 + 1),
LRI(51, POSTED),
REG16(0x588),
REG16(0x588),
REG16(0x588),
REG16(0x588),
REG16(0x588),
REG16(0x588),
REG(0x028),
REG(0x09c),
REG(0x0c0),
REG(0x178),
REG(0x17c),
REG16(0x358),
REG(0x170),
REG(0x150),
REG(0x154),
REG(0x158),
REG16(0x41c),
REG16(0x600),
REG16(0x604),
REG16(0x608),
REG16(0x60c),
REG16(0x610),
REG16(0x614),
REG16(0x618),
REG16(0x61c),
REG16(0x620),
REG16(0x624),
REG16(0x628),
REG16(0x62c),
REG16(0x630),
REG16(0x634),
REG16(0x638),
REG16(0x63c),
REG16(0x640),
REG16(0x644),
REG16(0x648),
REG16(0x64c),
REG16(0x650),
REG16(0x654),
REG16(0x658),
REG16(0x65c),
REG16(0x660),
REG16(0x664),
REG16(0x668),
REG16(0x66c),
REG16(0x670),
REG16(0x674),
REG16(0x678),
REG16(0x67c),
REG(0x068),
REG(0x084),
NOP(1),
0
};
static const u8 xehp_rcs_offsets[] = {
NOP(1),
LRI(13, POSTED),
REG16(0x244),
REG(0x034),
REG(0x030),
REG(0x038),
REG(0x03c),
REG(0x168),
REG(0x140),
REG(0x110),
REG(0x1c0),
REG(0x1c4),
REG(0x1c8),
REG(0x180),
REG16(0x2b4),
NOP(5),
LRI(9, POSTED),
REG16(0x3a8),
REG16(0x28c),
REG16(0x288),
REG16(0x284),
REG16(0x280),
REG16(0x27c),
REG16(0x278),
REG16(0x274),
REG16(0x270),
LRI(3, POSTED),
REG(0x1b0),
REG16(0x5a8),
REG16(0x5ac),
NOP(6),
LRI(1, 0),
REG(0x0c8),
0
};
static const u8 dg2_rcs_offsets[] = {
NOP(1),
LRI(15, POSTED),
REG16(0x244),
REG(0x034),
REG(0x030),
REG(0x038),
REG(0x03c),
REG(0x168),
REG(0x140),
REG(0x110),
REG(0x1c0),
REG(0x1c4),
REG(0x1c8),
REG(0x180),
REG16(0x2b4),
REG(0x120),
REG(0x124),
NOP(1),
LRI(9, POSTED),
REG16(0x3a8),
REG16(0x28c),
REG16(0x288),
REG16(0x284),
REG16(0x280),
REG16(0x27c),
REG16(0x278),
REG16(0x274),
REG16(0x270),
LRI(3, POSTED),
REG(0x1b0),
REG16(0x5a8),
REG16(0x5ac),
NOP(6),
LRI(1, 0),
REG(0x0c8),
0
};
static const u8 mtl_rcs_offsets[] = {
NOP(1),
LRI(15, POSTED),
REG16(0x244),
REG(0x034),
REG(0x030),
REG(0x038),
REG(0x03c),
REG(0x168),
REG(0x140),
REG(0x110),
REG(0x1c0),
REG(0x1c4),
REG(0x1c8),
REG(0x180),
REG16(0x2b4),
REG(0x120),
REG(0x124),
NOP(1),
LRI(9, POSTED),
REG16(0x3a8),
REG16(0x28c),
REG16(0x288),
REG16(0x284),
REG16(0x280),
REG16(0x27c),
REG16(0x278),
REG16(0x274),
REG16(0x270),
NOP(2),
LRI(2, POSTED),
REG16(0x5a8),
REG16(0x5ac),
NOP(6),
LRI(1, 0),
REG(0x0c8),
0
};
#define XE2_CTX_COMMON \
NOP(1), /* [0x00] */ \
LRI(15, POSTED), /* [0x01] */ \
REG16(0x244), /* [0x02] CTXT_SR_CTL */ \
REG(0x034), /* [0x04] RING_BUFFER_HEAD */ \
REG(0x030), /* [0x06] RING_BUFFER_TAIL */ \
REG(0x038), /* [0x08] RING_BUFFER_START */ \
REG(0x03c), /* [0x0a] RING_BUFFER_CONTROL */ \
REG(0x168), /* [0x0c] BB_ADDR_UDW */ \
REG(0x140), /* [0x0e] BB_ADDR */ \
REG(0x110), /* [0x10] BB_STATE */ \
REG(0x1c0), /* [0x12] BB_PER_CTX_PTR */ \
REG(0x1c4), /* [0x14] RCS_INDIRECT_CTX */ \
REG(0x1c8), /* [0x16] RCS_INDIRECT_CTX_OFFSET */ \
REG(0x180), /* [0x18] CCID */ \
REG16(0x2b4), /* [0x1a] SEMAPHORE_TOKEN */ \
REG(0x120), /* [0x1c] PRT_BB_STATE */ \
REG(0x124), /* [0x1e] PRT_BB_STATE_UDW */ \
\
NOP(1), /* [0x20] */ \
LRI(9, POSTED), /* [0x21] */ \
REG16(0x3a8), /* [0x22] CTX_TIMESTAMP */ \
REG16(0x3ac), /* [0x24] CTX_TIMESTAMP_UDW */ \
REG(0x108), /* [0x26] INDIRECT_RING_STATE */ \
REG16(0x284), /* [0x28] dummy reg */ \
REG16(0x280), /* [0x2a] CS_ACC_CTR_THOLD */ \
REG16(0x27c), /* [0x2c] CS_CTX_SYS_PASID */ \
REG16(0x278), /* [0x2e] CS_CTX_ASID */ \
REG16(0x274), /* [0x30] PTBP_UDW */ \
REG16(0x270) /* [0x32] PTBP_LDW */
static const u8 xe2_rcs_offsets[] = {
XE2_CTX_COMMON,
NOP(2), /* [0x34] */
LRI(2, POSTED), /* [0x36] */
REG16(0x5a8), /* [0x37] CONTEXT_SCHEDULING_ATTRIBUTES */
REG16(0x5ac), /* [0x39] PREEMPTION_STATUS */
NOP(6), /* [0x41] */
LRI(1, 0), /* [0x47] */
REG(0x0c8), /* [0x48] R_PWR_CLK_STATE */
0
};
static const u8 xe2_bcs_offsets[] = {
XE2_CTX_COMMON,
NOP(4 + 8 + 1), /* [0x34] */
LRI(2, POSTED), /* [0x41] */
REG16(0x200), /* [0x42] BCS_SWCTRL */
REG16(0x204), /* [0x44] BLIT_CCTL */
0
};
static const u8 xe2_xcs_offsets[] = {
XE2_CTX_COMMON,
0
};
static const u8 xe2_indirect_ring_state_offsets[] = {
NOP(1), /* [0x00] */
LRI(5, POSTED), /* [0x01] */
REG(0x034), /* [0x02] RING_BUFFER_HEAD */
REG(0x030), /* [0x04] RING_BUFFER_TAIL */
REG(0x038), /* [0x06] RING_BUFFER_START */
REG(0x048), /* [0x08] RING_BUFFER_START_UDW */
REG(0x03c), /* [0x0a] RING_BUFFER_CONTROL */
NOP(5), /* [0x0c] */
LRI(9, POSTED), /* [0x11] */
REG(0x168), /* [0x12] BB_ADDR_UDW */
REG(0x140), /* [0x14] BB_ADDR */
REG(0x110), /* [0x16] BB_STATE */
REG16(0x588), /* [0x18] BB_STACK_WRITE_PORT */
REG16(0x588), /* [0x20] BB_STACK_WRITE_PORT */
REG16(0x588), /* [0x22] BB_STACK_WRITE_PORT */
REG16(0x588), /* [0x24] BB_STACK_WRITE_PORT */
REG16(0x588), /* [0x26] BB_STACK_WRITE_PORT */
REG16(0x588), /* [0x28] BB_STACK_WRITE_PORT */
NOP(12), /* [0x00] */
0
};
#undef REG16
#undef REG
#undef LRI
#undef NOP
static const u8 *reg_offsets(struct xe_device *xe, enum xe_engine_class class)
{
if (class == XE_ENGINE_CLASS_RENDER) {
if (GRAPHICS_VER(xe) >= 20)
return xe2_rcs_offsets;
else if (GRAPHICS_VERx100(xe) >= 1270)
return mtl_rcs_offsets;
else if (GRAPHICS_VERx100(xe) >= 1255)
return dg2_rcs_offsets;
else if (GRAPHICS_VERx100(xe) >= 1250)
return xehp_rcs_offsets;
else
return gen12_rcs_offsets;
} else if (class == XE_ENGINE_CLASS_COPY) {
if (GRAPHICS_VER(xe) >= 20)
return xe2_bcs_offsets;
else
return gen12_xcs_offsets;
} else {
if (GRAPHICS_VER(xe) >= 20)
return xe2_xcs_offsets;
else if (GRAPHICS_VERx100(xe) >= 1255)
return dg2_xcs_offsets;
else
return gen12_xcs_offsets;
}
}
static void set_context_control(u32 *regs, struct xe_hw_engine *hwe)
{
regs[CTX_CONTEXT_CONTROL] = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH |
CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
if (xe_gt_has_indirect_ring_state(hwe->gt))
regs[CTX_CONTEXT_CONTROL] |=
_MASKED_BIT_ENABLE(CTX_CTRL_INDIRECT_RING_STATE_ENABLE);
}
static void set_memory_based_intr(u32 *regs, struct xe_hw_engine *hwe)
{
struct xe_memirq *memirq = &gt_to_tile(hwe->gt)->memirq;
struct xe_device *xe = gt_to_xe(hwe->gt);
u8 num_regs;
if (!xe_device_uses_memirq(xe))
return;
regs[CTX_LRM_INT_MASK_ENABLE] = MI_LOAD_REGISTER_MEM |
MI_LRI_LRM_CS_MMIO | MI_LRM_USE_GGTT;
regs[CTX_INT_MASK_ENABLE_REG] = RING_IMR(0).addr;
regs[CTX_INT_MASK_ENABLE_PTR] = xe_memirq_enable_ptr(memirq);
num_regs = xe_device_has_msix(xe) ? 3 : 2;
regs[CTX_LRI_INT_REPORT_PTR] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(num_regs) |
MI_LRI_LRM_CS_MMIO | MI_LRI_FORCE_POSTED;
regs[CTX_INT_STATUS_REPORT_REG] = RING_INT_STATUS_RPT_PTR(0).addr;
regs[CTX_INT_STATUS_REPORT_PTR] = xe_memirq_status_ptr(memirq, hwe);
regs[CTX_INT_SRC_REPORT_REG] = RING_INT_SRC_RPT_PTR(0).addr;
regs[CTX_INT_SRC_REPORT_PTR] = xe_memirq_source_ptr(memirq, hwe);
if (xe_device_has_msix(xe)) {
regs[CTX_CS_INT_VEC_REG] = CS_INT_VEC(0).addr;
/* CTX_CS_INT_VEC_DATA will be set in xe_lrc_init */
}
}
static int lrc_ring_mi_mode(struct xe_hw_engine *hwe)
{
struct xe_device *xe = gt_to_xe(hwe->gt);
if (GRAPHICS_VERx100(xe) >= 1250)
return 0x70;
else
return 0x60;
}
static void reset_stop_ring(u32 *regs, struct xe_hw_engine *hwe)
{
int x;
x = lrc_ring_mi_mode(hwe);
regs[x + 1] &= ~STOP_RING;
regs[x + 1] |= STOP_RING << 16;
}
static inline bool xe_lrc_has_indirect_ring_state(struct xe_lrc *lrc)
{
return lrc->flags & XE_LRC_FLAG_INDIRECT_RING_STATE;
}
static inline u32 __xe_lrc_ring_offset(struct xe_lrc *lrc)
{
return 0;
}
u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc)
{
return lrc->ring.size;
}
/* Make the magic macros work */
#define __xe_lrc_pphwsp_offset xe_lrc_pphwsp_offset
#define __xe_lrc_regs_offset xe_lrc_regs_offset
#define LRC_SEQNO_PPHWSP_OFFSET 512
#define LRC_START_SEQNO_PPHWSP_OFFSET (LRC_SEQNO_PPHWSP_OFFSET + 8)
#define LRC_CTX_JOB_TIMESTAMP_OFFSET (LRC_START_SEQNO_PPHWSP_OFFSET + 8)
#define LRC_ENGINE_ID_PPHWSP_OFFSET 1024
#define LRC_PARALLEL_PPHWSP_OFFSET 2048
u32 xe_lrc_regs_offset(struct xe_lrc *lrc)
{
return xe_lrc_pphwsp_offset(lrc) + LRC_PPHWSP_SIZE;
}
/**
* xe_lrc_reg_size() - Get size of the LRC registers area within queues
* @xe: the &xe_device struct instance
*
* Returns: Size of the LRC registers area for current platform
*/
size_t xe_lrc_reg_size(struct xe_device *xe)
{
if (GRAPHICS_VERx100(xe) >= 1250)
return 96 * sizeof(u32);
else
return 80 * sizeof(u32);
}
size_t xe_lrc_skip_size(struct xe_device *xe)
{
return LRC_PPHWSP_SIZE + xe_lrc_reg_size(xe);
}
static inline u32 __xe_lrc_seqno_offset(struct xe_lrc *lrc)
{
/* The seqno is stored in the driver-defined portion of PPHWSP */
return xe_lrc_pphwsp_offset(lrc) + LRC_SEQNO_PPHWSP_OFFSET;
}
static inline u32 __xe_lrc_start_seqno_offset(struct xe_lrc *lrc)
{
/* The start seqno is stored in the driver-defined portion of PPHWSP */
return xe_lrc_pphwsp_offset(lrc) + LRC_START_SEQNO_PPHWSP_OFFSET;
}
static u32 __xe_lrc_ctx_job_timestamp_offset(struct xe_lrc *lrc)
{
/* This is stored in the driver-defined portion of PPHWSP */
return xe_lrc_pphwsp_offset(lrc) + LRC_CTX_JOB_TIMESTAMP_OFFSET;
}
static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc)
{
/* The parallel is stored in the driver-defined portion of PPHWSP */
return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET;
}
static inline u32 __xe_lrc_engine_id_offset(struct xe_lrc *lrc)
{
return xe_lrc_pphwsp_offset(lrc) + LRC_ENGINE_ID_PPHWSP_OFFSET;
}
static u32 __xe_lrc_ctx_timestamp_offset(struct xe_lrc *lrc)
{
return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP * sizeof(u32);
}
static u32 __xe_lrc_ctx_timestamp_udw_offset(struct xe_lrc *lrc)
{
return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP_UDW * sizeof(u32);
}
static inline u32 __xe_lrc_indirect_ring_offset(struct xe_lrc *lrc)
{
u32 offset = xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE -
LRC_INDIRECT_RING_STATE_SIZE;
if (lrc->flags & XE_LRC_FLAG_INDIRECT_CTX)
offset -= LRC_INDIRECT_CTX_BO_SIZE;
return offset;
}
static inline u32 __xe_lrc_indirect_ctx_offset(struct xe_lrc *lrc)
{
return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE - LRC_INDIRECT_CTX_BO_SIZE;
}
static inline u32 __xe_lrc_wa_bb_offset(struct xe_lrc *lrc)
{
return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE;
}
#define DECL_MAP_ADDR_HELPERS(elem) \
static inline struct iosys_map __xe_lrc_##elem##_map(struct xe_lrc *lrc) \
{ \
struct iosys_map map = lrc->bo->vmap; \
\
xe_assert(lrc_to_xe(lrc), !iosys_map_is_null(&map)); \
iosys_map_incr(&map, __xe_lrc_##elem##_offset(lrc)); \
return map; \
} \
static inline u32 __maybe_unused __xe_lrc_##elem##_ggtt_addr(struct xe_lrc *lrc) \
{ \
return xe_bo_ggtt_addr(lrc->bo) + __xe_lrc_##elem##_offset(lrc); \
}
DECL_MAP_ADDR_HELPERS(ring)
DECL_MAP_ADDR_HELPERS(pphwsp)
DECL_MAP_ADDR_HELPERS(seqno)
DECL_MAP_ADDR_HELPERS(regs)
DECL_MAP_ADDR_HELPERS(start_seqno)
DECL_MAP_ADDR_HELPERS(ctx_job_timestamp)
DECL_MAP_ADDR_HELPERS(ctx_timestamp)
DECL_MAP_ADDR_HELPERS(ctx_timestamp_udw)
DECL_MAP_ADDR_HELPERS(parallel)
DECL_MAP_ADDR_HELPERS(indirect_ring)
DECL_MAP_ADDR_HELPERS(engine_id)
#undef DECL_MAP_ADDR_HELPERS
/**
* xe_lrc_ctx_timestamp_ggtt_addr() - Get ctx timestamp GGTT address
* @lrc: Pointer to the lrc.
*
* Returns: ctx timestamp GGTT address
*/
u32 xe_lrc_ctx_timestamp_ggtt_addr(struct xe_lrc *lrc)
{
return __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
}
/**
* xe_lrc_ctx_timestamp_udw_ggtt_addr() - Get ctx timestamp udw GGTT address
* @lrc: Pointer to the lrc.
*
* Returns: ctx timestamp udw GGTT address
*/
u32 xe_lrc_ctx_timestamp_udw_ggtt_addr(struct xe_lrc *lrc)
{
return __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
}
/**
* xe_lrc_ctx_timestamp() - Read ctx timestamp value
* @lrc: Pointer to the lrc.
*
* Returns: ctx timestamp value
*/
static u64 xe_lrc_ctx_timestamp(struct xe_lrc *lrc)
{
struct xe_device *xe = lrc_to_xe(lrc);
struct iosys_map map;
u32 ldw, udw = 0;
map = __xe_lrc_ctx_timestamp_map(lrc);
ldw = xe_map_read32(xe, &map);
if (xe->info.has_64bit_timestamp) {
map = __xe_lrc_ctx_timestamp_udw_map(lrc);
udw = xe_map_read32(xe, &map);
}
return (u64)udw << 32 | ldw;
}
/**
* xe_lrc_ctx_job_timestamp_ggtt_addr() - Get ctx job timestamp GGTT address
* @lrc: Pointer to the lrc.
*
* Returns: ctx job timestamp GGTT address
*/
u32 xe_lrc_ctx_job_timestamp_ggtt_addr(struct xe_lrc *lrc)
{
return __xe_lrc_ctx_job_timestamp_ggtt_addr(lrc);
}
/**
* xe_lrc_ctx_job_timestamp() - Read ctx job timestamp value
* @lrc: Pointer to the lrc.
*
* Returns: ctx job timestamp value
*/
u32 xe_lrc_ctx_job_timestamp(struct xe_lrc *lrc)
{
struct xe_device *xe = lrc_to_xe(lrc);
struct iosys_map map;
map = __xe_lrc_ctx_job_timestamp_map(lrc);
return xe_map_read32(xe, &map);
}
u32 xe_lrc_ggtt_addr(struct xe_lrc *lrc)
{
return __xe_lrc_pphwsp_ggtt_addr(lrc);
}
u32 xe_lrc_indirect_ring_ggtt_addr(struct xe_lrc *lrc)
{
if (!xe_lrc_has_indirect_ring_state(lrc))
return 0;
return __xe_lrc_indirect_ring_ggtt_addr(lrc);
}
static u32 xe_lrc_read_indirect_ctx_reg(struct xe_lrc *lrc, int reg_nr)
{
struct xe_device *xe = lrc_to_xe(lrc);
struct iosys_map map;
map = __xe_lrc_indirect_ring_map(lrc);
iosys_map_incr(&map, reg_nr * sizeof(u32));
return xe_map_read32(xe, &map);
}
static void xe_lrc_write_indirect_ctx_reg(struct xe_lrc *lrc,
int reg_nr, u32 val)
{
struct xe_device *xe = lrc_to_xe(lrc);
struct iosys_map map;
map = __xe_lrc_indirect_ring_map(lrc);
iosys_map_incr(&map, reg_nr * sizeof(u32));
xe_map_write32(xe, &map, val);
}
u32 xe_lrc_read_ctx_reg(struct xe_lrc *lrc, int reg_nr)
{
struct xe_device *xe = lrc_to_xe(lrc);
struct iosys_map map;
map = __xe_lrc_regs_map(lrc);
iosys_map_incr(&map, reg_nr * sizeof(u32));
return xe_map_read32(xe, &map);
}
void xe_lrc_write_ctx_reg(struct xe_lrc *lrc, int reg_nr, u32 val)
{
struct xe_device *xe = lrc_to_xe(lrc);
struct iosys_map map;
map = __xe_lrc_regs_map(lrc);
iosys_map_incr(&map, reg_nr * sizeof(u32));
xe_map_write32(xe, &map, val);
}
static void *empty_lrc_data(struct xe_hw_engine *hwe)
{
struct xe_gt *gt = hwe->gt;
void *data;
u32 *regs;
data = kzalloc(xe_gt_lrc_size(gt, hwe->class), GFP_KERNEL);
if (!data)
return NULL;
/* 1st page: Per-Process of HW status Page */
regs = data + LRC_PPHWSP_SIZE;
set_offsets(regs, reg_offsets(gt_to_xe(gt), hwe->class), hwe);
set_context_control(regs, hwe);
set_memory_based_intr(regs, hwe);
reset_stop_ring(regs, hwe);
if (xe_gt_has_indirect_ring_state(gt)) {
regs = data + xe_gt_lrc_size(gt, hwe->class) -
LRC_INDIRECT_RING_STATE_SIZE;
set_offsets(regs, xe2_indirect_ring_state_offsets, hwe);
}
return data;
}
/**
* xe_default_lrc_update_memirq_regs_with_address - Re-compute GGTT references in default LRC
* of given engine.
* @hwe: the &xe_hw_engine struct instance
*/
void xe_default_lrc_update_memirq_regs_with_address(struct xe_hw_engine *hwe)
{
struct xe_gt *gt = hwe->gt;
u32 *regs;
if (!gt->default_lrc[hwe->class])
return;
regs = gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE;
set_memory_based_intr(regs, hwe);
}
/**
* xe_lrc_update_memirq_regs_with_address - Re-compute GGTT references in mem interrupt data
* for given LRC.
* @lrc: the &xe_lrc struct instance
* @hwe: the &xe_hw_engine struct instance
* @regs: scratch buffer to be used as temporary storage
*/
void xe_lrc_update_memirq_regs_with_address(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
u32 *regs)
{
struct xe_gt *gt = hwe->gt;
struct iosys_map map;
size_t regs_len;
if (!xe_device_uses_memirq(gt_to_xe(gt)))
return;
map = __xe_lrc_regs_map(lrc);
regs_len = xe_lrc_reg_size(gt_to_xe(gt));
xe_map_memcpy_from(gt_to_xe(gt), regs, &map, 0, regs_len);
set_memory_based_intr(regs, hwe);
xe_map_memcpy_to(gt_to_xe(gt), &map, 0, regs, regs_len);
}
static void xe_lrc_set_ppgtt(struct xe_lrc *lrc, struct xe_vm *vm)
{
u64 desc = xe_vm_pdp4_descriptor(vm, gt_to_tile(lrc->gt));
xe_lrc_write_ctx_reg(lrc, CTX_PDP0_UDW, upper_32_bits(desc));
xe_lrc_write_ctx_reg(lrc, CTX_PDP0_LDW, lower_32_bits(desc));
}
static void xe_lrc_finish(struct xe_lrc *lrc)
{
xe_hw_fence_ctx_finish(&lrc->fence_ctx);
xe_bo_unpin_map_no_vm(lrc->bo);
}
/*
* wa_bb_setup_utilization() - Write commands to wa bb to assist
* in calculating active context run ticks.
*
* Context Timestamp (CTX_TIMESTAMP) in the LRC accumulates the run ticks of the
* context, but only gets updated when the context switches out. In order to
* check how long a context has been active before it switches out, two things
* are required:
*
* (1) Determine if the context is running:
* To do so, we program the WA BB to set an initial value for CTX_TIMESTAMP in
* the LRC. The value chosen is 1 since 0 is the initial value when the LRC is
* initialized. During a query, we just check for this value to determine if the
* context is active. If the context switched out, it would overwrite this
* location with the actual CTX_TIMESTAMP MMIO value. Note that WA BB runs as
* the last part of context restore, so reusing this LRC location will not
* clobber anything.
*
* (2) Calculate the time that the context has been active for:
* The CTX_TIMESTAMP ticks only when the context is active. If a context is
* active, we just use the CTX_TIMESTAMP MMIO as the new value of utilization.
* While doing so, we need to read the CTX_TIMESTAMP MMIO for the specific
* engine instance. Since we do not know which instance the context is running
* on until it is scheduled, we also read the ENGINE_ID MMIO in the WA BB and
* store it in the PPHWSP.
*/
#define CONTEXT_ACTIVE 1ULL
static ssize_t setup_utilization_wa(struct xe_lrc *lrc,
struct xe_hw_engine *hwe,
u32 *batch,
size_t max_len)
{
u32 *cmd = batch;
if (IS_SRIOV_VF(gt_to_xe(lrc->gt)))
return 0;
if (xe_gt_WARN_ON(lrc->gt, max_len < 12))
return -ENOSPC;
*cmd++ = MI_STORE_REGISTER_MEM | MI_SRM_USE_GGTT | MI_SRM_ADD_CS_OFFSET;
*cmd++ = ENGINE_ID(0).addr;
*cmd++ = __xe_lrc_engine_id_ggtt_addr(lrc);
*cmd++ = 0;
*cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
*cmd++ = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
*cmd++ = 0;
*cmd++ = lower_32_bits(CONTEXT_ACTIVE);
if (lrc_to_xe(lrc)->info.has_64bit_timestamp) {
*cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
*cmd++ = __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
*cmd++ = 0;
*cmd++ = upper_32_bits(CONTEXT_ACTIVE);
}
return cmd - batch;
}
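/*
* A sketch of how the values written above are consumed: the WA BB runs at
* the end of context restore, stores the engine id into the PPHWSP and seeds
* the LRC copy of CTX_TIMESTAMP with CONTEXT_ACTIVE (1). A utilization query
* that reads 1 back knows the context is still running and samples the
* per-engine CTX_TIMESTAMP MMIO instead (see xe_lrc_timestamp()); any other
* value means the context switched out and the LRC copy already holds the
* accumulated run ticks.
*/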
static ssize_t setup_timestamp_wa(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
u32 *batch, size_t max_len)
{
const u32 ts_addr = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
u32 *cmd = batch;
if (!XE_GT_WA(lrc->gt, 16010904313) ||
!(hwe->class == XE_ENGINE_CLASS_RENDER ||
hwe->class == XE_ENGINE_CLASS_COMPUTE ||
hwe->class == XE_ENGINE_CLASS_COPY ||
hwe->class == XE_ENGINE_CLASS_VIDEO_DECODE ||
hwe->class == XE_ENGINE_CLASS_VIDEO_ENHANCE))
return 0;
if (xe_gt_WARN_ON(lrc->gt, max_len < 12))
return -ENOSPC;
*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO |
MI_LRM_ASYNC;
*cmd++ = RING_CTX_TIMESTAMP(0).addr;
*cmd++ = ts_addr;
*cmd++ = 0;
*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO |
MI_LRM_ASYNC;
*cmd++ = RING_CTX_TIMESTAMP(0).addr;
*cmd++ = ts_addr;
*cmd++ = 0;
*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO;
*cmd++ = RING_CTX_TIMESTAMP(0).addr;
*cmd++ = ts_addr;
*cmd++ = 0;
return cmd - batch;
}
static ssize_t setup_configfs_post_ctx_restore_bb(struct xe_lrc *lrc,
struct xe_hw_engine *hwe,
u32 *batch, size_t max_len)
{
struct xe_device *xe = gt_to_xe(lrc->gt);
const u32 *user_batch;
u32 *cmd = batch;
u32 count;
count = xe_configfs_get_ctx_restore_post_bb(to_pci_dev(xe->drm.dev),
hwe->class, &user_batch);
if (!count)
return 0;
if (count > max_len)
return -ENOSPC;
/*
* This should be used only for tests and validation. Taint the kernel
* as anything could be submitted directly in context switches
*/
add_taint(TAINT_TEST, LOCKDEP_STILL_OK);
memcpy(cmd, user_batch, count * sizeof(u32));
cmd += count;
return cmd - batch;
}
static ssize_t setup_configfs_mid_ctx_restore_bb(struct xe_lrc *lrc,
struct xe_hw_engine *hwe,
u32 *batch, size_t max_len)
{
struct xe_device *xe = gt_to_xe(lrc->gt);
const u32 *user_batch;
u32 *cmd = batch;
u32 count;
count = xe_configfs_get_ctx_restore_mid_bb(to_pci_dev(xe->drm.dev),
hwe->class, &user_batch);
if (!count)
return 0;
if (count > max_len)
return -ENOSPC;
/*
* This should be used only for tests and validation. Taint the kernel
* as anything could be submitted directly in context switches
*/
add_taint(TAINT_TEST, LOCKDEP_STILL_OK);
memcpy(cmd, user_batch, count * sizeof(u32));
cmd += count;
return cmd - batch;
}
static ssize_t setup_invalidate_state_cache_wa(struct xe_lrc *lrc,
struct xe_hw_engine *hwe,
u32 *batch, size_t max_len)
{
u32 *cmd = batch;
if (!XE_GT_WA(lrc->gt, 18022495364) ||
hwe->class != XE_ENGINE_CLASS_RENDER)
return 0;
if (xe_gt_WARN_ON(lrc->gt, max_len < 3))
return -ENOSPC;
*cmd++ = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(1);
*cmd++ = CS_DEBUG_MODE2(0).addr;
*cmd++ = _MASKED_BIT_ENABLE(INSTRUCTION_STATE_CACHE_INVALIDATE);
return cmd - batch;
}
struct bo_setup {
ssize_t (*setup)(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
u32 *batch, size_t max_size);
};
struct bo_setup_state {
/* Input: */
struct xe_lrc *lrc;
struct xe_hw_engine *hwe;
size_t max_size;
size_t reserve_dw;
unsigned int offset;
const struct bo_setup *funcs;
unsigned int num_funcs;
/* State: */
u32 *buffer;
u32 *ptr;
unsigned int written;
};
static int setup_bo(struct bo_setup_state *state)
{
ssize_t remain;
if (state->lrc->bo->vmap.is_iomem) {
xe_gt_assert(state->hwe->gt, state->buffer);
state->ptr = state->buffer;
} else {
state->ptr = state->lrc->bo->vmap.vaddr + state->offset;
}
remain = state->max_size / sizeof(u32);
for (size_t i = 0; i < state->num_funcs; i++) {
ssize_t len = state->funcs[i].setup(state->lrc, state->hwe,
state->ptr, remain);
remain -= len;
/*
* Caller has asked for at least reserve_dw to remain unused.
*/
if (len < 0 ||
xe_gt_WARN_ON(state->lrc->gt, remain < state->reserve_dw))
goto fail;
state->ptr += len;
state->written += len;
}
return 0;
fail:
return -ENOSPC;
}
static void finish_bo(struct bo_setup_state *state)
{
if (!state->lrc->bo->vmap.is_iomem)
return;
xe_map_memcpy_to(gt_to_xe(state->lrc->gt), &state->lrc->bo->vmap,
state->offset, state->buffer,
state->written * sizeof(u32));
}
/**
* xe_lrc_setup_wa_bb_with_scratch - Execute all wa bb setup callbacks.
* @lrc: the &xe_lrc struct instance
* @hwe: the &xe_hw_engine struct instance
* @scratch: preallocated scratch buffer for temporary storage
* Return: 0 on success, negative error code on failure
*/
int xe_lrc_setup_wa_bb_with_scratch(struct xe_lrc *lrc, struct xe_hw_engine *hwe, u32 *scratch)
{
static const struct bo_setup funcs[] = {
{ .setup = setup_timestamp_wa },
{ .setup = setup_invalidate_state_cache_wa },
{ .setup = setup_utilization_wa },
{ .setup = setup_configfs_post_ctx_restore_bb },
};
struct bo_setup_state state = {
.lrc = lrc,
.hwe = hwe,
.max_size = LRC_WA_BB_SIZE,
.buffer = scratch,
.reserve_dw = 1,
.offset = __xe_lrc_wa_bb_offset(lrc),
.funcs = funcs,
.num_funcs = ARRAY_SIZE(funcs),
};
int ret;
ret = setup_bo(&state);
if (ret)
return ret;
*state.ptr++ = MI_BATCH_BUFFER_END;
state.written++;
finish_bo(&state);
xe_lrc_write_ctx_reg(lrc, CTX_BB_PER_CTX_PTR,
xe_bo_ggtt_addr(lrc->bo) + state.offset + 1);
return 0;
}
static int setup_wa_bb(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
{
u32 *buf = NULL;
int ret;
if (lrc->bo->vmap.is_iomem) {
buf = kmalloc(LRC_WA_BB_SIZE, GFP_KERNEL);
if (!buf)
return -ENOMEM;
}
ret = xe_lrc_setup_wa_bb_with_scratch(lrc, hwe, buf);
kfree(buf);
return ret;
}
static int
setup_indirect_ctx(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
{
static const struct bo_setup rcs_funcs[] = {
{ .setup = setup_timestamp_wa },
{ .setup = setup_configfs_mid_ctx_restore_bb },
};
static const struct bo_setup xcs_funcs[] = {
{ .setup = setup_configfs_mid_ctx_restore_bb },
};
struct bo_setup_state state = {
.lrc = lrc,
.hwe = hwe,
.max_size = (63 * 64) /* max 63 cachelines */,
.buffer = NULL,
.offset = __xe_lrc_indirect_ctx_offset(lrc),
};
int ret;
if (!(lrc->flags & XE_LRC_FLAG_INDIRECT_CTX))
return 0;
if (hwe->class == XE_ENGINE_CLASS_RENDER ||
hwe->class == XE_ENGINE_CLASS_COMPUTE) {
state.funcs = rcs_funcs;
state.num_funcs = ARRAY_SIZE(rcs_funcs);
} else {
state.funcs = xcs_funcs;
state.num_funcs = ARRAY_SIZE(xcs_funcs);
}
if (xe_gt_WARN_ON(lrc->gt, !state.funcs))
return 0;
if (lrc->bo->vmap.is_iomem) {
state.buffer = kmalloc(state.max_size, GFP_KERNEL);
if (!state.buffer)
return -ENOMEM;
}
ret = setup_bo(&state);
if (ret) {
kfree(state.buffer);
return ret;
}
/*
* Align to 64B cacheline so there's no garbage at the end for CS to
* execute: size for indirect ctx must be a multiple of 64.
*/
while (state.written & 0xf) {
*state.ptr++ = MI_NOOP;
state.written++;
}
finish_bo(&state);
kfree(state.buffer);
/*
* Enable INDIRECT_CTX leaving INDIRECT_CTX_OFFSET at its default: it
* varies per engine class, but the default is good enough
*/
xe_lrc_write_ctx_reg(lrc,
CTX_CS_INDIRECT_CTX,
(xe_bo_ggtt_addr(lrc->bo) + state.offset) |
/* Size in CLs. */
(state.written * sizeof(u32) / 64));
return 0;
}
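/*
* Example of the CTX_CS_INDIRECT_CTX encoding above, with illustrative
* values: for a cacheline-aligned GGTT address of 0x10000, offset 0x3000 and
* 32 dwords written, the register value is 0x13000 | (32 * 4 / 64) = 0x13002,
* i.e. the buffer address with its size in 64-byte cachelines packed into the
* low bits.
*/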
static u8 xe_multi_queue_prio_to_lrc(struct xe_lrc *lrc, enum xe_multi_queue_priority priority)
{
struct xe_device *xe = gt_to_xe(lrc->gt);
xe_assert(xe, (priority >= XE_MULTI_QUEUE_PRIORITY_LOW &&
priority <= XE_MULTI_QUEUE_PRIORITY_HIGH));
/* xe_multi_queue_priority is directly mapped to LRC priority values */
return priority;
}
/**
* xe_lrc_set_multi_queue_priority() - Set multi queue priority in LRC
* @lrc: Logical Ring Context
* @priority: Multi queue priority of the exec queue
*
* Convert @priority to LRC multi queue priority and update the @lrc descriptor
*/
void xe_lrc_set_multi_queue_priority(struct xe_lrc *lrc, enum xe_multi_queue_priority priority)
{
lrc->desc &= ~LRC_PRIORITY;
lrc->desc |= FIELD_PREP(LRC_PRIORITY, xe_multi_queue_prio_to_lrc(lrc, priority));
}
static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
struct xe_vm *vm, void *replay_state, u32 ring_size,
u16 msix_vec,
u32 init_flags)
{
struct xe_gt *gt = hwe->gt;
const u32 lrc_size = xe_gt_lrc_size(gt, hwe->class);
u32 bo_size = ring_size + lrc_size + LRC_WA_BB_SIZE;
struct xe_tile *tile = gt_to_tile(gt);
struct xe_device *xe = gt_to_xe(gt);
struct iosys_map map;
u32 arb_enable;
u32 bo_flags;
int err;
kref_init(&lrc->refcount);
lrc->gt = gt;
lrc->replay_size = xe_gt_lrc_hang_replay_size(gt, hwe->class);
lrc->size = lrc_size;
lrc->flags = 0;
lrc->ring.size = ring_size;
lrc->ring.tail = 0;
if (gt_engine_needs_indirect_ctx(gt, hwe->class)) {
lrc->flags |= XE_LRC_FLAG_INDIRECT_CTX;
bo_size += LRC_INDIRECT_CTX_BO_SIZE;
}
if (xe_gt_has_indirect_ring_state(gt))
lrc->flags |= XE_LRC_FLAG_INDIRECT_RING_STATE;
bo_flags = XE_BO_FLAG_VRAM_IF_DGFX(tile) | XE_BO_FLAG_GGTT |
XE_BO_FLAG_GGTT_INVALIDATE;
if ((vm && vm->xef) || init_flags & XE_LRC_CREATE_USER_CTX) /* userspace */
bo_flags |= XE_BO_FLAG_PINNED_LATE_RESTORE | XE_BO_FLAG_FORCE_USER_VRAM;
lrc->bo = xe_bo_create_pin_map_novm(xe, tile,
bo_size,
ttm_bo_type_kernel,
bo_flags, false);
if (IS_ERR(lrc->bo))
return PTR_ERR(lrc->bo);
xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt,
hwe->fence_irq, hwe->name);
/*
* Init Per-Process of HW status Page, LRC / context state to known
* values. If there's already a primed default_lrc, just copy it; otherwise
* this is the early submission used to record the default LRC, so build a
* new empty one from scratch.
*/
map = __xe_lrc_pphwsp_map(lrc);
if (gt->default_lrc[hwe->class] || replay_state) {
xe_map_memset(xe, &map, 0, 0, LRC_PPHWSP_SIZE); /* PPHWSP */
xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE,
lrc_size - LRC_PPHWSP_SIZE);
if (replay_state)
xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
replay_state, lrc->replay_size);
} else {
void *init_data = empty_lrc_data(hwe);
if (!init_data) {
err = -ENOMEM;
goto err_lrc_finish;
}
xe_map_memcpy_to(xe, &map, 0, init_data, lrc_size);
kfree(init_data);
}
if (vm) {
xe_lrc_set_ppgtt(lrc, vm);
if (vm->xef)
xe_drm_client_add_bo(vm->xef->client, lrc->bo);
}
if (xe_device_has_msix(xe)) {
xe_lrc_write_ctx_reg(lrc, CTX_INT_STATUS_REPORT_PTR,
xe_memirq_status_ptr(&tile->memirq, hwe));
xe_lrc_write_ctx_reg(lrc, CTX_INT_SRC_REPORT_PTR,
xe_memirq_source_ptr(&tile->memirq, hwe));
xe_lrc_write_ctx_reg(lrc, CTX_CS_INT_VEC_DATA, msix_vec << 16 | msix_vec);
}
if (xe_gt_has_indirect_ring_state(gt)) {
xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE,
__xe_lrc_indirect_ring_ggtt_addr(lrc));
xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START,
__xe_lrc_ring_ggtt_addr(lrc));
xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START_UDW, 0);
xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, 0);
xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, lrc->ring.tail);
xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_CTL,
RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
} else {
xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, 0);
xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, lrc->ring.tail);
xe_lrc_write_ctx_reg(lrc, CTX_RING_CTL,
RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
}
if (init_flags & XE_LRC_CREATE_RUNALONE)
xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
_MASKED_BIT_ENABLE(CTX_CTRL_RUN_ALONE));
if (init_flags & XE_LRC_CREATE_PXP)
xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
_MASKED_BIT_ENABLE(CTX_CTRL_PXP_ENABLE));
lrc->ctx_timestamp = 0;
xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP, 0);
if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP_UDW, 0);
if (xe->info.has_asid && vm)
xe_lrc_write_ctx_reg(lrc, CTX_ASID, vm->usm.asid);
lrc->desc = LRC_VALID;
lrc->desc |= FIELD_PREP(LRC_ADDRESSING_MODE, LRC_LEGACY_64B_CONTEXT);
/* TODO: Priority */
/* While this appears to have something about privileged batches or
* some such, it really just means PPGTT mode.
*/
if (vm)
lrc->desc |= LRC_PRIVILEGE;
if (GRAPHICS_VERx100(xe) < 1250) {
lrc->desc |= FIELD_PREP(LRC_ENGINE_INSTANCE, hwe->instance);
lrc->desc |= FIELD_PREP(LRC_ENGINE_CLASS, hwe->class);
}
arb_enable = MI_ARB_ON_OFF | MI_ARB_ENABLE;
xe_lrc_write_ring(lrc, &arb_enable, sizeof(arb_enable));
map = __xe_lrc_seqno_map(lrc);
xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
map = __xe_lrc_start_seqno_map(lrc);
xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
err = setup_wa_bb(lrc, hwe);
if (err)
goto err_lrc_finish;
err = setup_indirect_ctx(lrc, hwe);
if (err)
goto err_lrc_finish;
return 0;
err_lrc_finish:
xe_lrc_finish(lrc);
return err;
}
/**
* xe_lrc_create - Create a LRC
* @hwe: Hardware Engine
* @vm: The VM (address space)
* @replay_state: GPU hang replay state
* @ring_size: LRC ring size
* @msix_vec: MSI-X interrupt vector (for platforms that support it)
* @flags: LRC initialization flags
*
* Allocate and initialize the Logical Ring Context (LRC).
*
* Return: Pointer to the created LRC upon success, or an error pointer
* upon failure.
*/
struct xe_lrc *xe_lrc_create(struct xe_hw_engine *hwe, struct xe_vm *vm,
void *replay_state, u32 ring_size, u16 msix_vec, u32 flags)
{
struct xe_lrc *lrc;
int err;
lrc = kzalloc_obj(*lrc);
if (!lrc)
return ERR_PTR(-ENOMEM);
err = xe_lrc_init(lrc, hwe, vm, replay_state, ring_size, msix_vec, flags);
if (err) {
kfree(lrc);
return ERR_PTR(err);
}
return lrc;
}
/**
* xe_lrc_destroy - Destroy the LRC
* @ref: reference to LRC
*
* Called when ref == 0, release resources held by the Logical Ring Context
* (LRC) and free the LRC memory.
*/
void xe_lrc_destroy(struct kref *ref)
{
struct xe_lrc *lrc = container_of(ref, struct xe_lrc, refcount);
xe_lrc_finish(lrc);
kfree(lrc);
}
/**
* xe_lrc_update_hwctx_regs_with_address - Re-compute GGTT references within given LRC.
* @lrc: the &xe_lrc struct instance
*/
void xe_lrc_update_hwctx_regs_with_address(struct xe_lrc *lrc)
{
if (xe_lrc_has_indirect_ring_state(lrc)) {
xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE,
__xe_lrc_indirect_ring_ggtt_addr(lrc));
xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START,
__xe_lrc_ring_ggtt_addr(lrc));
} else {
xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
}
}
void xe_lrc_set_ring_tail(struct xe_lrc *lrc, u32 tail)
{
if (xe_lrc_has_indirect_ring_state(lrc))
xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, tail);
else
xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, tail);
}
u32 xe_lrc_ring_tail(struct xe_lrc *lrc)
{
if (xe_lrc_has_indirect_ring_state(lrc))
return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL) & TAIL_ADDR;
else
return xe_lrc_read_ctx_reg(lrc, CTX_RING_TAIL) & TAIL_ADDR;
}
static u32 xe_lrc_ring_start(struct xe_lrc *lrc)
{
if (xe_lrc_has_indirect_ring_state(lrc))
return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START);
else
return xe_lrc_read_ctx_reg(lrc, CTX_RING_START);
}
void xe_lrc_set_ring_head(struct xe_lrc *lrc, u32 head)
{
if (xe_lrc_has_indirect_ring_state(lrc))
xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, head);
else
xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, head);
}
u32 xe_lrc_ring_head(struct xe_lrc *lrc)
{
if (xe_lrc_has_indirect_ring_state(lrc))
return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD) & HEAD_ADDR;
else
return xe_lrc_read_ctx_reg(lrc, CTX_RING_HEAD) & HEAD_ADDR;
}
u32 xe_lrc_ring_space(struct xe_lrc *lrc)
{
const u32 head = xe_lrc_ring_head(lrc);
const u32 tail = lrc->ring.tail;
const u32 size = lrc->ring.size;
return ((head - tail - 1) & (size - 1)) + 1;
}
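/*
* The expression above is standard power-of-two ring arithmetic. For
* illustration, with size = SZ_4K: head == tail gives
* ((head - tail - 1) & 4095) + 1 = 4096 (an empty ring), while head = 0 and
* tail = 4095 gives 1 byte of space, the minimum that keeps the ring from
* ever looking empty when it is actually full.
*/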
static void __xe_lrc_write_ring(struct xe_lrc *lrc, struct iosys_map ring,
const void *data, size_t size)
{
struct xe_device *xe = lrc_to_xe(lrc);
iosys_map_incr(&ring, lrc->ring.tail);
xe_map_memcpy_to(xe, &ring, 0, data, size);
lrc->ring.tail = (lrc->ring.tail + size) & (lrc->ring.size - 1);
}
void xe_lrc_write_ring(struct xe_lrc *lrc, const void *data, size_t size)
{
struct xe_device *xe = lrc_to_xe(lrc);
struct iosys_map ring;
u32 rhs;
size_t aligned_size;
xe_assert(xe, IS_ALIGNED(size, 4));
aligned_size = ALIGN(size, 8);
ring = __xe_lrc_ring_map(lrc);
xe_assert(xe, lrc->ring.tail < lrc->ring.size);
rhs = lrc->ring.size - lrc->ring.tail;
if (size > rhs) {
__xe_lrc_write_ring(lrc, ring, data, rhs);
__xe_lrc_write_ring(lrc, ring, data + rhs, size - rhs);
} else {
__xe_lrc_write_ring(lrc, ring, data, size);
}
if (aligned_size > size) {
u32 noop = MI_NOOP;
__xe_lrc_write_ring(lrc, ring, &noop, sizeof(noop));
}
}
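/*
* Wrap-around illustration: with size = SZ_4K and tail = 4088, writing 16
* bytes is split into 8 bytes at the end of the ring (rhs) followed by
* 8 bytes at offset 0, leaving tail = 8. A 4-byte payload (dword-aligned but
* not qword-aligned) gets a single MI_NOOP appended so the tail advances in
* 8-byte steps.
*/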
u64 xe_lrc_descriptor(struct xe_lrc *lrc)
{
return lrc->desc | xe_lrc_ggtt_addr(lrc);
}
u32 xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc)
{
return __xe_lrc_seqno_ggtt_addr(lrc);
}
/**
* xe_lrc_alloc_seqno_fence() - Allocate an lrc seqno fence.
*
* Allocate but don't initialize an lrc seqno fence.
*
* Return: Pointer to the allocated fence or
* negative error pointer on error.
*/
struct dma_fence *xe_lrc_alloc_seqno_fence(void)
{
return xe_hw_fence_alloc();
}
/**
* xe_lrc_free_seqno_fence() - Free an lrc seqno fence.
* @fence: Pointer to the fence to free.
*
* Frees an lrc seqno fence that hasn't yet been
* initialized.
*/
void xe_lrc_free_seqno_fence(struct dma_fence *fence)
{
xe_hw_fence_free(fence);
}
/**
* xe_lrc_init_seqno_fence() - Initialize an lrc seqno fence.
* @lrc: Pointer to the lrc.
* @fence: Pointer to the fence to initialize.
*
* Initializes a pre-allocated lrc seqno fence.
* After initialization, the fence is subject to normal
* dma-fence refcounting.
*/
void xe_lrc_init_seqno_fence(struct xe_lrc *lrc, struct dma_fence *fence)
{
xe_hw_fence_init(fence, &lrc->fence_ctx, __xe_lrc_seqno_map(lrc));
}
s32 xe_lrc_seqno(struct xe_lrc *lrc)
{
struct iosys_map map = __xe_lrc_seqno_map(lrc);
return xe_map_read32(lrc_to_xe(lrc), &map);
}
s32 xe_lrc_start_seqno(struct xe_lrc *lrc)
{
struct iosys_map map = __xe_lrc_start_seqno_map(lrc);
return xe_map_read32(lrc_to_xe(lrc), &map);
}
u32 xe_lrc_start_seqno_ggtt_addr(struct xe_lrc *lrc)
{
return __xe_lrc_start_seqno_ggtt_addr(lrc);
}
u32 xe_lrc_parallel_ggtt_addr(struct xe_lrc *lrc)
{
return __xe_lrc_parallel_ggtt_addr(lrc);
}
struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc)
{
return __xe_lrc_parallel_map(lrc);
}
/**
* xe_lrc_engine_id() - Read engine id value
* @lrc: Pointer to the lrc.
*
* Returns: engine id value
*/
static u32 xe_lrc_engine_id(struct xe_lrc *lrc)
{
struct xe_device *xe = lrc_to_xe(lrc);
struct iosys_map map;
map = __xe_lrc_engine_id_map(lrc);
return xe_map_read32(xe, &map);
}
static int instr_dw(u32 cmd_header)
{
/* GFXPIPE "SINGLE_DW" opcodes are a single dword */
if ((cmd_header & (XE_INSTR_CMD_TYPE | GFXPIPE_PIPELINE)) ==
GFXPIPE_SINGLE_DW_CMD(0, 0))
return 1;
/* 3DSTATE_SO_DECL_LIST has a 9-bit dword length rather than 8 */
if ((cmd_header & GFXPIPE_MATCH_MASK) == CMD_3DSTATE_SO_DECL_LIST)
return REG_FIELD_GET(CMD_3DSTATE_SO_DECL_LIST_DW_LEN, cmd_header) + 2;
/* Most instructions have the # of dwords (minus 2) in 7:0 */
return REG_FIELD_GET(XE_INSTR_LEN_MASK, cmd_header) + 2;
}
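/*
* For illustration: a header with 0x02 in bits 7:0 describes a 4-dword
* instruction (the length field stores "total dwords minus 2"), and a
* 3DSTATE_SO_DECL_LIST header whose 9-bit length field holds 0x100 spans
* 0x102 dwords.
*/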
static int dump_mi_command(struct drm_printer *p,
struct xe_gt *gt,
u32 *dw,
int remaining_dw)
{
u32 inst_header = *dw;
u32 numdw = instr_dw(inst_header);
u32 opcode = REG_FIELD_GET(MI_OPCODE, inst_header);
int num_noop;
/* First check for commands that don't have/use a '# DW' field */
switch (inst_header & MI_OPCODE) {
case MI_NOOP:
num_noop = 1;
while (num_noop < remaining_dw &&
(*(++dw) & REG_GENMASK(31, 23)) == MI_NOOP)
num_noop++;
drm_printf(p, "[%#010x] MI_NOOP (%d dwords)\n", inst_header, num_noop);
return num_noop;
case MI_TOPOLOGY_FILTER:
drm_printf(p, "[%#010x] MI_TOPOLOGY_FILTER\n", inst_header);
return 1;
case MI_BATCH_BUFFER_END:
drm_printf(p, "[%#010x] MI_BATCH_BUFFER_END\n", inst_header);
/* Return 'remaining_dw' to consume the rest of the LRC */
return remaining_dw;
}
/*
* Any remaining commands include a # of dwords. We should make sure
* it doesn't exceed the remaining size of the LRC.
*/
if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
numdw = remaining_dw;
switch (inst_header & MI_OPCODE) {
case MI_LOAD_REGISTER_IMM:
drm_printf(p, "[%#010x] MI_LOAD_REGISTER_IMM: %d regs\n",
inst_header, (numdw - 1) / 2);
for (int i = 1; i < numdw; i += 2)
drm_printf(p, " - %#6x = %#010x\n", dw[i], dw[i + 1]);
return numdw;
case MI_LOAD_REGISTER_MEM & MI_OPCODE:
drm_printf(p, "[%#010x] MI_LOAD_REGISTER_MEM: %s%s\n",
inst_header,
dw[0] & MI_LRI_LRM_CS_MMIO ? "CS_MMIO " : "",
dw[0] & MI_LRM_USE_GGTT ? "USE_GGTT " : "");
if (numdw == 4)
drm_printf(p, " - %#6x = %#010llx\n",
dw[1], ((u64)(dw[3]) << 32 | (u64)(dw[2])));
else
drm_printf(p, " - %*ph (%s)\n",
(int)sizeof(u32) * (numdw - 1), dw + 1,
numdw < 4 ? "truncated" : "malformed");
return numdw;
case MI_FORCE_WAKEUP:
drm_printf(p, "[%#010x] MI_FORCE_WAKEUP\n", inst_header);
return numdw;
default:
drm_printf(p, "[%#010x] unknown MI opcode %#x, likely %d dwords\n",
inst_header, opcode, numdw);
return numdw;
}
}
static int dump_gfxpipe_command(struct drm_printer *p,
struct xe_gt *gt,
u32 *dw,
int remaining_dw)
{
u32 numdw = instr_dw(*dw);
u32 pipeline = REG_FIELD_GET(GFXPIPE_PIPELINE, *dw);
u32 opcode = REG_FIELD_GET(GFXPIPE_OPCODE, *dw);
u32 subopcode = REG_FIELD_GET(GFXPIPE_SUBOPCODE, *dw);
/*
* Make sure we haven't mis-parsed a number of dwords that exceeds the
* remaining size of the LRC.
*/
if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
numdw = remaining_dw;
switch (*dw & GFXPIPE_MATCH_MASK) {
#define MATCH(cmd) \
case cmd: \
drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
return numdw
#define MATCH3D(cmd) \
case CMD_##cmd: \
drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
return numdw
MATCH(STATE_BASE_ADDRESS);
MATCH(STATE_SIP);
MATCH(GPGPU_CSR_BASE_ADDRESS);
MATCH(STATE_COMPUTE_MODE);
MATCH3D(3DSTATE_BTD);
MATCH(STATE_SYSTEM_MEM_FENCE_ADDRESS);
MATCH(STATE_CONTEXT_DATA_BASE_ADDRESS);
MATCH3D(3DSTATE_VF_STATISTICS);
MATCH(PIPELINE_SELECT);
MATCH3D(3DSTATE_DRAWING_RECTANGLE_FAST);
MATCH3D(3DSTATE_CLEAR_PARAMS);
MATCH3D(3DSTATE_DEPTH_BUFFER);
MATCH3D(3DSTATE_STENCIL_BUFFER);
MATCH3D(3DSTATE_HIER_DEPTH_BUFFER);
MATCH3D(3DSTATE_VERTEX_BUFFERS);
MATCH3D(3DSTATE_VERTEX_ELEMENTS);
MATCH3D(3DSTATE_INDEX_BUFFER);
MATCH3D(3DSTATE_VF);
MATCH3D(3DSTATE_MULTISAMPLE);
MATCH3D(3DSTATE_CC_STATE_POINTERS);
MATCH3D(3DSTATE_SCISSOR_STATE_POINTERS);
MATCH3D(3DSTATE_VS);
MATCH3D(3DSTATE_GS);
MATCH3D(3DSTATE_CLIP);
MATCH3D(3DSTATE_SF);
MATCH3D(3DSTATE_WM);
MATCH3D(3DSTATE_CONSTANT_VS);
MATCH3D(3DSTATE_CONSTANT_GS);
MATCH3D(3DSTATE_CONSTANT_PS);
MATCH3D(3DSTATE_SAMPLE_MASK);
MATCH3D(3DSTATE_CONSTANT_HS);
MATCH3D(3DSTATE_CONSTANT_DS);
MATCH3D(3DSTATE_HS);
MATCH3D(3DSTATE_TE);
MATCH3D(3DSTATE_DS);
MATCH3D(3DSTATE_STREAMOUT);
MATCH3D(3DSTATE_SBE);
MATCH3D(3DSTATE_PS);
MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP);
MATCH3D(3DSTATE_CPS_POINTERS);
MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_CC);
MATCH3D(3DSTATE_BLEND_STATE_POINTERS);
MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_VS);
MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_HS);
MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_DS);
MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_GS);
MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_PS);
MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_VS);
MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_HS);
MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_DS);
MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_GS);
MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_PS);
MATCH3D(3DSTATE_VF_INSTANCING);
MATCH3D(3DSTATE_VF_SGVS);
MATCH3D(3DSTATE_VF_TOPOLOGY);
MATCH3D(3DSTATE_WM_CHROMAKEY);
MATCH3D(3DSTATE_PS_BLEND);
MATCH3D(3DSTATE_WM_DEPTH_STENCIL);
MATCH3D(3DSTATE_PS_EXTRA);
MATCH3D(3DSTATE_RASTER);
MATCH3D(3DSTATE_SBE_SWIZ);
MATCH3D(3DSTATE_WM_HZ_OP);
MATCH3D(3DSTATE_VF_COMPONENT_PACKING);
MATCH3D(3DSTATE_VF_SGVS_2);
MATCH3D(3DSTATE_VFG);
MATCH3D(3DSTATE_URB_ALLOC_VS);
MATCH3D(3DSTATE_URB_ALLOC_HS);
MATCH3D(3DSTATE_URB_ALLOC_DS);
MATCH3D(3DSTATE_URB_ALLOC_GS);
MATCH3D(3DSTATE_SO_BUFFER_INDEX_0);
MATCH3D(3DSTATE_SO_BUFFER_INDEX_1);
MATCH3D(3DSTATE_SO_BUFFER_INDEX_2);
MATCH3D(3DSTATE_SO_BUFFER_INDEX_3);
MATCH3D(3DSTATE_PRIMITIVE_REPLICATION);
MATCH3D(3DSTATE_TBIMR_TILE_PASS_INFO);
MATCH3D(3DSTATE_AMFS);
MATCH3D(3DSTATE_DEPTH_BOUNDS);
MATCH3D(3DSTATE_AMFS_TEXTURE_POINTERS);
MATCH3D(3DSTATE_CONSTANT_TS_POINTER);
MATCH3D(3DSTATE_MESH_CONTROL);
MATCH3D(3DSTATE_MESH_DISTRIB);
MATCH3D(3DSTATE_TASK_REDISTRIB);
MATCH3D(3DSTATE_MESH_SHADER);
MATCH3D(3DSTATE_MESH_SHADER_DATA);
MATCH3D(3DSTATE_TASK_CONTROL);
MATCH3D(3DSTATE_TASK_SHADER);
MATCH3D(3DSTATE_TASK_SHADER_DATA);
MATCH3D(3DSTATE_URB_ALLOC_MESH);
MATCH3D(3DSTATE_URB_ALLOC_TASK);
MATCH3D(3DSTATE_CLIP_MESH);
MATCH3D(3DSTATE_SBE_MESH);
MATCH3D(3DSTATE_CPSIZE_CONTROL_BUFFER);
MATCH3D(3DSTATE_COARSE_PIXEL);
MATCH3D(3DSTATE_DRAWING_RECTANGLE);
MATCH3D(3DSTATE_CHROMA_KEY);
MATCH3D(3DSTATE_POLY_STIPPLE_OFFSET);
MATCH3D(3DSTATE_POLY_STIPPLE_PATTERN);
MATCH3D(3DSTATE_LINE_STIPPLE);
MATCH3D(3DSTATE_AA_LINE_PARAMETERS);
MATCH3D(3DSTATE_MONOFILTER_SIZE);
MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_VS);
MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_HS);
MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_DS);
MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_GS);
MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_PS);
MATCH3D(3DSTATE_SO_DECL_LIST);
MATCH3D(3DSTATE_SO_BUFFER);
MATCH3D(3DSTATE_BINDING_TABLE_POOL_ALLOC);
MATCH3D(3DSTATE_SAMPLE_PATTERN);
MATCH3D(3DSTATE_3D_MODE);
MATCH3D(3DSTATE_SUBSLICE_HASH_TABLE);
MATCH3D(3DSTATE_SLICE_TABLE_STATE_POINTERS);
MATCH3D(3DSTATE_PTBR_TILE_PASS_INFO);
default:
drm_printf(p, "[%#010x] unknown GFXPIPE command (pipeline=%#x, opcode=%#x, subopcode=%#x), likely %d dwords\n",
*dw, pipeline, opcode, subopcode, numdw);
return numdw;
}
}
static int dump_gfx_state_command(struct drm_printer *p,
struct xe_gt *gt,
u32 *dw,
int remaining_dw)
{
u32 numdw = instr_dw(*dw);
u32 opcode = REG_FIELD_GET(GFX_STATE_OPCODE, *dw);
/*
* Make sure we haven't mis-parsed a number of dwords that exceeds the
* remaining size of the LRC.
*/
if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
numdw = remaining_dw;
switch (*dw & (XE_INSTR_GFX_STATE | GFX_STATE_OPCODE)) {
MATCH(STATE_WRITE_INLINE);
default:
drm_printf(p, "[%#010x] unknown GFX_STATE command (opcode=%#x), likely %d dwords\n",
*dw, opcode, numdw);
return numdw;
}
}
void xe_lrc_dump_default(struct drm_printer *p,
struct xe_gt *gt,
enum xe_engine_class hwe_class)
{
u32 *dw;
int remaining_dw, num_dw;
if (!gt->default_lrc[hwe_class]) {
drm_printf(p, "No default LRC for class %d\n", hwe_class);
return;
}
/*
* Skip the beginning of the LRC since it contains the per-process
* hardware status page.
*/
dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE;
remaining_dw = (xe_gt_lrc_size(gt, hwe_class) - LRC_PPHWSP_SIZE) / 4;
while (remaining_dw > 0) {
if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_MI) {
num_dw = dump_mi_command(p, gt, dw, remaining_dw);
} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE) {
num_dw = dump_gfxpipe_command(p, gt, dw, remaining_dw);
} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFX_STATE) {
num_dw = dump_gfx_state_command(p, gt, dw, remaining_dw);
} else {
num_dw = min(instr_dw(*dw), remaining_dw);
drm_printf(p, "[%#10x] Unknown instruction of type %#x, likely %d dwords\n",
*dw, REG_FIELD_GET(XE_INSTR_CMD_TYPE, *dw),
num_dw);
}
dw += num_dw;
remaining_dw -= num_dw;
}
}
struct instr_state {
u32 instr;
u16 num_dw;
};
static const struct instr_state xe_hpg_svg_state[] = {
{ .instr = CMD_3DSTATE_CONSTANT_VS, .num_dw = 11 },
{ .instr = CMD_3DSTATE_CONSTANT_HS, .num_dw = 11 },
{ .instr = CMD_3DSTATE_CONSTANT_DS, .num_dw = 11 },
{ .instr = CMD_3DSTATE_CONSTANT_GS, .num_dw = 11 },
{ .instr = CMD_3DSTATE_VERTEX_ELEMENTS, .num_dw = 69 },
{ .instr = CMD_3DSTATE_VF_COMPONENT_PACKING, .num_dw = 5 },
{ .instr = CMD_3DSTATE_VF_SGVS, .num_dw = 2 },
{ .instr = CMD_3DSTATE_VF_SGVS_2, .num_dw = 3 },
{ .instr = CMD_3DSTATE_VS, .num_dw = 9 },
{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_VS, .num_dw = 2 },
{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_VS, .num_dw = 2 },
{ .instr = CMD_3DSTATE_URB_ALLOC_VS, .num_dw = 3 },
{ .instr = CMD_3DSTATE_STREAMOUT, .num_dw = 5 },
{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_0, .num_dw = 8 },
{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_1, .num_dw = 8 },
{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_2, .num_dw = 8 },
{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_3, .num_dw = 8 },
{ .instr = CMD_3DSTATE_CLIP, .num_dw = 4 },
{ .instr = CMD_3DSTATE_PRIMITIVE_REPLICATION, .num_dw = 6 },
{ .instr = CMD_3DSTATE_CLIP_MESH, .num_dw = 2 },
{ .instr = CMD_3DSTATE_SF, .num_dw = 4 },
{ .instr = CMD_3DSTATE_SCISSOR_STATE_POINTERS, .num_dw = 2 },
{ .instr = CMD_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP, .num_dw = 2 },
{ .instr = CMD_3DSTATE_RASTER, .num_dw = 5 },
{ .instr = CMD_3DSTATE_TBIMR_TILE_PASS_INFO, .num_dw = 4 },
{ .instr = CMD_3DSTATE_WM_HZ_OP, .num_dw = 6 },
{ .instr = CMD_3DSTATE_MULTISAMPLE, .num_dw = 2 },
{ .instr = CMD_3DSTATE_HS, .num_dw = 9 },
{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_HS, .num_dw = 2 },
{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_HS, .num_dw = 2 },
{ .instr = CMD_3DSTATE_URB_ALLOC_HS, .num_dw = 3 },
{ .instr = CMD_3DSTATE_TASK_CONTROL, .num_dw = 3 },
{ .instr = CMD_3DSTATE_TASK_SHADER, .num_dw = 7 },
{ .instr = CMD_3DSTATE_TASK_SHADER_DATA, .num_dw = 10 },
{ .instr = CMD_3DSTATE_URB_ALLOC_TASK, .num_dw = 3 },
{ .instr = CMD_3DSTATE_TE, .num_dw = 5 },
{ .instr = CMD_3DSTATE_TASK_REDISTRIB, .num_dw = 2 },
{ .instr = CMD_3DSTATE_DS, .num_dw = 11 },
{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_DS, .num_dw = 2 },
{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_DS, .num_dw = 2 },
{ .instr = CMD_3DSTATE_URB_ALLOC_DS, .num_dw = 3 },
{ .instr = CMD_3DSTATE_GS, .num_dw = 10 },
{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_GS, .num_dw = 2 },
{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_GS, .num_dw = 2 },
{ .instr = CMD_3DSTATE_URB_ALLOC_GS, .num_dw = 3 },
{ .instr = CMD_3DSTATE_MESH_CONTROL, .num_dw = 3 },
{ .instr = CMD_3DSTATE_MESH_SHADER_DATA, .num_dw = 10 },
{ .instr = CMD_3DSTATE_URB_ALLOC_MESH, .num_dw = 3 },
{ .instr = CMD_3DSTATE_MESH_SHADER, .num_dw = 8 },
{ .instr = CMD_3DSTATE_DRAWING_RECTANGLE, .num_dw = 4 },
};
u32 *xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, u32 *cs)
{
struct xe_gt *gt = q->hwe->gt;
struct xe_device *xe = gt_to_xe(gt);
const struct instr_state *state_table = NULL;
int state_table_size = 0;
/*
* Wa_14019789679
*
* If the driver doesn't explicitly emit the SVG instructions while
* setting up the default LRC, the context switch will write 0's
* (noops) into the LRC memory rather than the expected instruction
* headers. Application contexts start out as a copy of the default
* LRC, and if they also do not emit specific settings for some SVG
* state, then on context restore they'll unintentionally inherit
* whatever state setting the previous context had programmed into the
* hardware (i.e., the lack of a 3DSTATE_* instruction in the LRC will
* prevent the hardware from resetting that state back to any specific
* value).
*
* The official workaround only requires emitting 3DSTATE_MESH_CONTROL
* since that's a specific state setting that can easily cause GPU
* hangs if unintentionally inherited. However to be safe we'll
* continue to emit all of the SVG state since it's best not to leak
* any of the state between contexts, even if that leakage is harmless.
*/
if (XE_GT_WA(gt, 14019789679) && q->hwe->class == XE_ENGINE_CLASS_RENDER) {
state_table = xe_hpg_svg_state;
state_table_size = ARRAY_SIZE(xe_hpg_svg_state);
}
if (!state_table) {
xe_gt_dbg(gt, "No non-register state to emit on graphics ver %d.%02d\n",
GRAPHICS_VER(xe), GRAPHICS_VERx100(xe) % 100);
return cs;
}
for (int i = 0; i < state_table_size; i++) {
u32 instr = state_table[i].instr;
u16 num_dw = state_table[i].num_dw;
bool is_single_dw = ((instr & GFXPIPE_PIPELINE) == PIPELINE_SINGLE_DW);
xe_gt_assert(gt, (instr & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE);
xe_gt_assert(gt, num_dw != 0);
xe_gt_assert(gt, is_single_dw ^ (num_dw > 1));
/*
* Xe2's SVG context is the same as the one on DG2 / MTL
* except that 3DSTATE_DRAWING_RECTANGLE (non-pipelined) has
* been replaced by 3DSTATE_DRAWING_RECTANGLE_FAST (pipelined).
* Just make the replacement here rather than defining a
* whole separate table for the single trivial change.
*/
if (GRAPHICS_VER(xe) >= 20 &&
instr == CMD_3DSTATE_DRAWING_RECTANGLE)
instr = CMD_3DSTATE_DRAWING_RECTANGLE_FAST;
*cs = instr;
if (!is_single_dw)
*cs |= (num_dw - 2);
cs += num_dw;
}
return cs;
}
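/*
* For illustration: emitting { .instr = CMD_3DSTATE_VS, .num_dw = 9 } from
* the table above writes only the 3DSTATE_VS header, with a length field of
* 9 - 2 = 7, and then advances @cs past the remaining 8 payload dwords,
* whose existing contents are deliberately left untouched.
*/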
struct xe_lrc_snapshot *xe_lrc_snapshot_capture(struct xe_lrc *lrc)
{
struct xe_lrc_snapshot *snapshot = kmalloc_obj(*snapshot, GFP_NOWAIT);
if (!snapshot)
return NULL;
snapshot->context_desc = xe_lrc_ggtt_addr(lrc);
snapshot->ring_addr = __xe_lrc_ring_ggtt_addr(lrc);
snapshot->indirect_context_desc = xe_lrc_indirect_ring_ggtt_addr(lrc);
snapshot->head = xe_lrc_ring_head(lrc);
snapshot->tail.internal = lrc->ring.tail;
snapshot->tail.memory = xe_lrc_ring_tail(lrc);
snapshot->start = xe_lrc_ring_start(lrc);
snapshot->start_seqno = xe_lrc_start_seqno(lrc);
snapshot->seqno = xe_lrc_seqno(lrc);
snapshot->lrc_bo = xe_bo_get(lrc->bo);
snapshot->lrc_offset = xe_lrc_pphwsp_offset(lrc);
snapshot->lrc_size = lrc->size;
snapshot->replay_offset = 0;
snapshot->replay_size = lrc->replay_size;
snapshot->lrc_snapshot = NULL;
snapshot->ctx_timestamp = lower_32_bits(xe_lrc_ctx_timestamp(lrc));
snapshot->ctx_job_timestamp = xe_lrc_ctx_job_timestamp(lrc);
return snapshot;
}
void xe_lrc_snapshot_capture_delayed(struct xe_lrc_snapshot *snapshot)
{
struct xe_bo *bo;
struct iosys_map src;
if (!snapshot)
return;
bo = snapshot->lrc_bo;
snapshot->lrc_bo = NULL;
snapshot->lrc_snapshot = kvmalloc(snapshot->lrc_size, GFP_KERNEL);
if (!snapshot->lrc_snapshot)
goto put_bo;
xe_bo_lock(bo, false);
if (!ttm_bo_vmap(&bo->ttm, &src)) {
xe_map_memcpy_from(xe_bo_device(bo),
snapshot->lrc_snapshot, &src, snapshot->lrc_offset,
snapshot->lrc_size);
ttm_bo_vunmap(&bo->ttm, &src);
} else {
kvfree(snapshot->lrc_snapshot);
snapshot->lrc_snapshot = NULL;
}
xe_bo_unlock(bo);
put_bo:
xe_bo_put(bo);
}
void xe_lrc_snapshot_print(struct xe_lrc_snapshot *snapshot, struct drm_printer *p)
{
unsigned long i;
if (!snapshot)
return;
drm_printf(p, "\tHW Context Desc: 0x%08x\n", snapshot->context_desc);
drm_printf(p, "\tHW Ring address: 0x%08x\n",
snapshot->ring_addr);
drm_printf(p, "\tHW Indirect Ring State: 0x%08x\n",
snapshot->indirect_context_desc);
drm_printf(p, "\tLRC Head: (memory) %u\n", snapshot->head);
drm_printf(p, "\tLRC Tail: (internal) %u, (memory) %u\n",
snapshot->tail.internal, snapshot->tail.memory);
drm_printf(p, "\tRing start: (memory) 0x%08x\n", snapshot->start);
drm_printf(p, "\tStart seqno: (memory) %d\n", snapshot->start_seqno);
drm_printf(p, "\tSeqno: (memory) %d\n", snapshot->seqno);
drm_printf(p, "\tTimestamp: 0x%08x\n", snapshot->ctx_timestamp);
drm_printf(p, "\tJob Timestamp: 0x%08x\n", snapshot->ctx_job_timestamp);
if (!snapshot->lrc_snapshot)
return;
drm_printf(p, "\t[HWSP].length: 0x%x\n", LRC_PPHWSP_SIZE);
drm_puts(p, "\t[HWSP].data: ");
for (i = 0; i < LRC_PPHWSP_SIZE; i += sizeof(u32)) {
u32 *val = snapshot->lrc_snapshot + i;
char dumped[ASCII85_BUFSZ];
drm_puts(p, ascii85_encode(*val, dumped));
}
drm_printf(p, "\n\t[HWCTX].length: 0x%lx\n", snapshot->lrc_size - LRC_PPHWSP_SIZE);
drm_printf(p, "\n\t[HWCTX].replay_offset: 0x%lx\n", snapshot->replay_offset);
drm_printf(p, "\n\t[HWCTX].replay_length: 0x%lx\n", snapshot->replay_size);
drm_puts(p, "\t[HWCTX].data: ");
for (; i < snapshot->lrc_size; i += sizeof(u32)) {
u32 *val = snapshot->lrc_snapshot + i;
char dumped[ASCII85_BUFSZ];
drm_puts(p, ascii85_encode(*val, dumped));
}
drm_puts(p, "\n");
}
void xe_lrc_snapshot_free(struct xe_lrc_snapshot *snapshot)
{
if (!snapshot)
return;
kvfree(snapshot->lrc_snapshot);
if (snapshot->lrc_bo)
xe_bo_put(snapshot->lrc_bo);
kfree(snapshot);
}
static int get_ctx_timestamp(struct xe_lrc *lrc, u32 engine_id, u64 *reg_ctx_ts)
{
u16 class = REG_FIELD_GET(ENGINE_CLASS_ID, engine_id);
u16 instance = REG_FIELD_GET(ENGINE_INSTANCE_ID, engine_id);
struct xe_hw_engine *hwe;
u64 val;
hwe = xe_gt_hw_engine(lrc->gt, class, instance, false);
if (xe_gt_WARN_ONCE(lrc->gt, !hwe || xe_hw_engine_is_reserved(hwe),
"Unexpected engine class:instance %d:%d for context utilization\n",
class, instance))
return -1;
if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
val = xe_mmio_read64_2x32(&hwe->gt->mmio,
RING_CTX_TIMESTAMP(hwe->mmio_base));
else
val = xe_mmio_read32(&hwe->gt->mmio,
RING_CTX_TIMESTAMP(hwe->mmio_base));
*reg_ctx_ts = val;
return 0;
}
/**
* xe_lrc_timestamp() - Current ctx timestamp
* @lrc: Pointer to the lrc.
*
* Return latest ctx timestamp. With support for active contexts, the
* calculation may be slightly racy, so follow a read-again logic to ensure that
* the context is still active before returning the right timestamp.
*
* Returns: New ctx timestamp value
*/
u64 xe_lrc_timestamp(struct xe_lrc *lrc)
{
u64 lrc_ts, reg_ts, new_ts;
u32 engine_id;
lrc_ts = xe_lrc_ctx_timestamp(lrc);
/* CTX_TIMESTAMP mmio read is invalid on VF, so return the LRC value */
if (IS_SRIOV_VF(lrc_to_xe(lrc))) {
new_ts = lrc_ts;
goto done;
}
if (lrc_ts == CONTEXT_ACTIVE) {
engine_id = xe_lrc_engine_id(lrc);
if (!get_ctx_timestamp(lrc, engine_id, &reg_ts))
new_ts = reg_ts;
/* read lrc again to ensure context is still active */
lrc_ts = xe_lrc_ctx_timestamp(lrc);
}
/*
* If context switched out, just use the lrc_ts. Note that this needs to
* be a separate if condition.
*/
if (lrc_ts != CONTEXT_ACTIVE)
new_ts = lrc_ts;
done:
return new_ts;
}
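/*
* Example of the race the re-read above closes: the first read may return
* CONTEXT_ACTIVE just before the context switches out, in which case the
* MMIO sample could belong to whatever runs next on that engine. Re-reading
* the LRC value and preferring it once it is no longer CONTEXT_ACTIVE means
* a switched-out context always reports its own accumulated ticks.
*/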
/**
* xe_lrc_update_timestamp() - Update ctx timestamp
* @lrc: Pointer to the lrc.
* @old_ts: Old timestamp value
*
* Populate @old_ts with the current saved ctx timestamp, read the new ctx
* timestamp and update the saved value.
*
* Returns: New ctx timestamp value
*/
u64 xe_lrc_update_timestamp(struct xe_lrc *lrc, u64 *old_ts)
{
*old_ts = lrc->ctx_timestamp;
lrc->ctx_timestamp = xe_lrc_timestamp(lrc);
trace_xe_lrc_update_timestamp(lrc, *old_ts);
return lrc->ctx_timestamp;
}
/**
* xe_lrc_ring_is_idle() - LRC is idle
* @lrc: Pointer to the lrc.
*
* Compare LRC ring head and tail to determine if idle.
*
* Return: True if the ring is idle, False otherwise
*/
bool xe_lrc_ring_is_idle(struct xe_lrc *lrc)
{
return xe_lrc_ring_head(lrc) == xe_lrc_ring_tail(lrc);
}